diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,52793 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 7520, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00026595744680851064, + "grad_norm": 12.928236961364746, + "learning_rate": 1.0638297872340427e-08, + "loss": 1.5564, + "step": 1 + }, + { + "epoch": 0.0005319148936170213, + "grad_norm": 12.116073608398438, + "learning_rate": 2.1276595744680853e-08, + "loss": 1.5756, + "step": 2 + }, + { + "epoch": 0.0007978723404255319, + "grad_norm": 13.450613975524902, + "learning_rate": 3.191489361702128e-08, + "loss": 1.6078, + "step": 3 + }, + { + "epoch": 0.0010638297872340426, + "grad_norm": 14.591333389282227, + "learning_rate": 4.2553191489361707e-08, + "loss": 1.6333, + "step": 4 + }, + { + "epoch": 0.0013297872340425532, + "grad_norm": 14.167532920837402, + "learning_rate": 5.319148936170213e-08, + "loss": 1.4764, + "step": 5 + }, + { + "epoch": 0.0015957446808510637, + "grad_norm": 11.665863037109375, + "learning_rate": 6.382978723404255e-08, + "loss": 1.5681, + "step": 6 + }, + { + "epoch": 0.0018617021276595746, + "grad_norm": 12.705963134765625, + "learning_rate": 7.446808510638299e-08, + "loss": 1.5249, + "step": 7 + }, + { + "epoch": 0.002127659574468085, + "grad_norm": 13.839447021484375, + "learning_rate": 8.510638297872341e-08, + "loss": 1.6567, + "step": 8 + }, + { + "epoch": 0.0023936170212765957, + "grad_norm": 11.46570110321045, + "learning_rate": 9.574468085106384e-08, + "loss": 1.4166, + "step": 9 + }, + { + "epoch": 0.0026595744680851063, + "grad_norm": 12.468977928161621, + "learning_rate": 1.0638297872340426e-07, + "loss": 1.4788, + "step": 10 + }, + { + "epoch": 0.002925531914893617, + "grad_norm": 10.813947677612305, + "learning_rate": 1.1702127659574468e-07, + "loss": 1.3127, + "step": 11 + }, + { + "epoch": 0.0031914893617021275, + "grad_norm": 12.833952903747559, + "learning_rate": 1.276595744680851e-07, + "loss": 1.5291, + "step": 12 + }, + { + "epoch": 0.003457446808510638, + "grad_norm": 13.475564956665039, + "learning_rate": 1.3829787234042553e-07, + "loss": 1.4629, + "step": 13 + }, + { + "epoch": 0.003723404255319149, + "grad_norm": 11.995802879333496, + "learning_rate": 1.4893617021276598e-07, + "loss": 1.5887, + "step": 14 + }, + { + "epoch": 0.003989361702127659, + "grad_norm": 14.704851150512695, + "learning_rate": 1.5957446808510638e-07, + "loss": 1.4533, + "step": 15 + }, + { + "epoch": 0.00425531914893617, + "grad_norm": 11.153929710388184, + "learning_rate": 1.7021276595744683e-07, + "loss": 1.4027, + "step": 16 + }, + { + "epoch": 0.0045212765957446804, + "grad_norm": 14.091814994812012, + "learning_rate": 1.8085106382978722e-07, + "loss": 1.6199, + "step": 17 + }, + { + "epoch": 0.0047872340425531915, + "grad_norm": 13.533143997192383, + "learning_rate": 1.9148936170212767e-07, + "loss": 1.4809, + "step": 18 + }, + { + "epoch": 0.0050531914893617025, + "grad_norm": 13.076473236083984, + "learning_rate": 2.0212765957446812e-07, + "loss": 1.5374, + "step": 19 + }, + { + "epoch": 0.005319148936170213, + "grad_norm": 13.062971115112305, + "learning_rate": 2.1276595744680852e-07, + "loss": 1.6008, + "step": 20 + }, + { + "epoch": 0.005585106382978724, + "grad_norm": 13.033509254455566, + "learning_rate": 2.2340425531914897e-07, + "loss": 1.4679, + "step": 21 + }, + { + 
"epoch": 0.005851063829787234, + "grad_norm": 11.98855972290039, + "learning_rate": 2.3404255319148937e-07, + "loss": 1.5049, + "step": 22 + }, + { + "epoch": 0.006117021276595745, + "grad_norm": 13.161596298217773, + "learning_rate": 2.446808510638298e-07, + "loss": 1.5114, + "step": 23 + }, + { + "epoch": 0.006382978723404255, + "grad_norm": 12.387269020080566, + "learning_rate": 2.553191489361702e-07, + "loss": 1.3019, + "step": 24 + }, + { + "epoch": 0.006648936170212766, + "grad_norm": 10.667431831359863, + "learning_rate": 2.6595744680851066e-07, + "loss": 1.3113, + "step": 25 + }, + { + "epoch": 0.006914893617021276, + "grad_norm": 11.682806015014648, + "learning_rate": 2.7659574468085106e-07, + "loss": 1.627, + "step": 26 + }, + { + "epoch": 0.007180851063829787, + "grad_norm": 11.338486671447754, + "learning_rate": 2.872340425531915e-07, + "loss": 1.6309, + "step": 27 + }, + { + "epoch": 0.007446808510638298, + "grad_norm": 12.796504020690918, + "learning_rate": 2.9787234042553196e-07, + "loss": 1.4464, + "step": 28 + }, + { + "epoch": 0.007712765957446808, + "grad_norm": 12.2352876663208, + "learning_rate": 3.0851063829787236e-07, + "loss": 1.5748, + "step": 29 + }, + { + "epoch": 0.007978723404255319, + "grad_norm": 10.04947566986084, + "learning_rate": 3.1914893617021275e-07, + "loss": 1.3302, + "step": 30 + }, + { + "epoch": 0.00824468085106383, + "grad_norm": 11.51389217376709, + "learning_rate": 3.297872340425532e-07, + "loss": 1.3543, + "step": 31 + }, + { + "epoch": 0.00851063829787234, + "grad_norm": 9.522992134094238, + "learning_rate": 3.4042553191489365e-07, + "loss": 1.4485, + "step": 32 + }, + { + "epoch": 0.008776595744680852, + "grad_norm": 8.156554222106934, + "learning_rate": 3.510638297872341e-07, + "loss": 1.3791, + "step": 33 + }, + { + "epoch": 0.009042553191489361, + "grad_norm": 10.546247482299805, + "learning_rate": 3.6170212765957445e-07, + "loss": 1.6197, + "step": 34 + }, + { + "epoch": 0.009308510638297872, + "grad_norm": 8.094082832336426, + "learning_rate": 3.723404255319149e-07, + "loss": 1.2722, + "step": 35 + }, + { + "epoch": 0.009574468085106383, + "grad_norm": 7.64621114730835, + "learning_rate": 3.8297872340425535e-07, + "loss": 1.2489, + "step": 36 + }, + { + "epoch": 0.009840425531914894, + "grad_norm": 7.087127208709717, + "learning_rate": 3.936170212765958e-07, + "loss": 1.3383, + "step": 37 + }, + { + "epoch": 0.010106382978723405, + "grad_norm": 7.989037990570068, + "learning_rate": 4.0425531914893625e-07, + "loss": 1.2275, + "step": 38 + }, + { + "epoch": 0.010372340425531914, + "grad_norm": 9.057306289672852, + "learning_rate": 4.148936170212766e-07, + "loss": 1.4094, + "step": 39 + }, + { + "epoch": 0.010638297872340425, + "grad_norm": 7.628477573394775, + "learning_rate": 4.2553191489361704e-07, + "loss": 1.3137, + "step": 40 + }, + { + "epoch": 0.010904255319148936, + "grad_norm": 7.493610858917236, + "learning_rate": 4.361702127659575e-07, + "loss": 1.3603, + "step": 41 + }, + { + "epoch": 0.011170212765957447, + "grad_norm": 6.819916725158691, + "learning_rate": 4.4680851063829794e-07, + "loss": 1.5013, + "step": 42 + }, + { + "epoch": 0.011436170212765957, + "grad_norm": 7.222757339477539, + "learning_rate": 4.574468085106383e-07, + "loss": 1.4389, + "step": 43 + }, + { + "epoch": 0.011702127659574468, + "grad_norm": 6.92927885055542, + "learning_rate": 4.6808510638297873e-07, + "loss": 1.386, + "step": 44 + }, + { + "epoch": 0.011968085106382979, + "grad_norm": 6.100423336029053, + "learning_rate": 4.787234042553192e-07, + 
"loss": 1.3654, + "step": 45 + }, + { + "epoch": 0.01223404255319149, + "grad_norm": 6.047520637512207, + "learning_rate": 4.893617021276596e-07, + "loss": 1.2467, + "step": 46 + }, + { + "epoch": 0.0125, + "grad_norm": 6.429448127746582, + "learning_rate": 5.000000000000001e-07, + "loss": 1.2826, + "step": 47 + }, + { + "epoch": 0.01276595744680851, + "grad_norm": 6.81625509262085, + "learning_rate": 5.106382978723404e-07, + "loss": 1.4576, + "step": 48 + }, + { + "epoch": 0.013031914893617021, + "grad_norm": 5.9020609855651855, + "learning_rate": 5.212765957446809e-07, + "loss": 1.2929, + "step": 49 + }, + { + "epoch": 0.013297872340425532, + "grad_norm": 6.343348979949951, + "learning_rate": 5.319148936170213e-07, + "loss": 1.4692, + "step": 50 + }, + { + "epoch": 0.013563829787234043, + "grad_norm": 6.274758338928223, + "learning_rate": 5.425531914893618e-07, + "loss": 1.3331, + "step": 51 + }, + { + "epoch": 0.013829787234042552, + "grad_norm": 6.188233852386475, + "learning_rate": 5.531914893617021e-07, + "loss": 1.4061, + "step": 52 + }, + { + "epoch": 0.014095744680851063, + "grad_norm": 6.108701705932617, + "learning_rate": 5.638297872340426e-07, + "loss": 1.2786, + "step": 53 + }, + { + "epoch": 0.014361702127659574, + "grad_norm": 6.032108306884766, + "learning_rate": 5.74468085106383e-07, + "loss": 1.3159, + "step": 54 + }, + { + "epoch": 0.014627659574468085, + "grad_norm": 6.019993305206299, + "learning_rate": 5.851063829787235e-07, + "loss": 1.3846, + "step": 55 + }, + { + "epoch": 0.014893617021276596, + "grad_norm": 6.405829906463623, + "learning_rate": 5.957446808510639e-07, + "loss": 1.3691, + "step": 56 + }, + { + "epoch": 0.015159574468085106, + "grad_norm": 6.517266273498535, + "learning_rate": 6.063829787234043e-07, + "loss": 1.416, + "step": 57 + }, + { + "epoch": 0.015425531914893617, + "grad_norm": 5.831709861755371, + "learning_rate": 6.170212765957447e-07, + "loss": 1.3022, + "step": 58 + }, + { + "epoch": 0.015691489361702126, + "grad_norm": 6.413986682891846, + "learning_rate": 6.276595744680851e-07, + "loss": 1.2001, + "step": 59 + }, + { + "epoch": 0.015957446808510637, + "grad_norm": 5.887234687805176, + "learning_rate": 6.382978723404255e-07, + "loss": 1.301, + "step": 60 + }, + { + "epoch": 0.016223404255319148, + "grad_norm": 6.500317573547363, + "learning_rate": 6.48936170212766e-07, + "loss": 1.2389, + "step": 61 + }, + { + "epoch": 0.01648936170212766, + "grad_norm": 5.423646450042725, + "learning_rate": 6.595744680851064e-07, + "loss": 1.1179, + "step": 62 + }, + { + "epoch": 0.01675531914893617, + "grad_norm": 6.422118663787842, + "learning_rate": 6.702127659574469e-07, + "loss": 1.2685, + "step": 63 + }, + { + "epoch": 0.01702127659574468, + "grad_norm": 6.100841999053955, + "learning_rate": 6.808510638297873e-07, + "loss": 1.3432, + "step": 64 + }, + { + "epoch": 0.017287234042553192, + "grad_norm": 6.879647254943848, + "learning_rate": 6.914893617021278e-07, + "loss": 1.4595, + "step": 65 + }, + { + "epoch": 0.017553191489361703, + "grad_norm": 5.739667892456055, + "learning_rate": 7.021276595744682e-07, + "loss": 1.254, + "step": 66 + }, + { + "epoch": 0.017819148936170214, + "grad_norm": 5.58401346206665, + "learning_rate": 7.127659574468087e-07, + "loss": 1.275, + "step": 67 + }, + { + "epoch": 0.018085106382978722, + "grad_norm": 5.75786018371582, + "learning_rate": 7.234042553191489e-07, + "loss": 1.2797, + "step": 68 + }, + { + "epoch": 0.018351063829787233, + "grad_norm": 5.23975133895874, + "learning_rate": 7.340425531914893e-07, + 
"loss": 1.2314, + "step": 69 + }, + { + "epoch": 0.018617021276595744, + "grad_norm": 5.783809661865234, + "learning_rate": 7.446808510638298e-07, + "loss": 1.2621, + "step": 70 + }, + { + "epoch": 0.018882978723404255, + "grad_norm": 6.303256988525391, + "learning_rate": 7.553191489361702e-07, + "loss": 1.2988, + "step": 71 + }, + { + "epoch": 0.019148936170212766, + "grad_norm": 6.035338401794434, + "learning_rate": 7.659574468085107e-07, + "loss": 1.3572, + "step": 72 + }, + { + "epoch": 0.019414893617021277, + "grad_norm": 5.458433628082275, + "learning_rate": 7.765957446808511e-07, + "loss": 1.2515, + "step": 73 + }, + { + "epoch": 0.019680851063829788, + "grad_norm": 5.706748008728027, + "learning_rate": 7.872340425531916e-07, + "loss": 1.2144, + "step": 74 + }, + { + "epoch": 0.0199468085106383, + "grad_norm": 5.4996018409729, + "learning_rate": 7.97872340425532e-07, + "loss": 1.2999, + "step": 75 + }, + { + "epoch": 0.02021276595744681, + "grad_norm": 5.666746139526367, + "learning_rate": 8.085106382978725e-07, + "loss": 1.2947, + "step": 76 + }, + { + "epoch": 0.020478723404255317, + "grad_norm": 5.446689128875732, + "learning_rate": 8.191489361702127e-07, + "loss": 1.4081, + "step": 77 + }, + { + "epoch": 0.02074468085106383, + "grad_norm": 5.886783123016357, + "learning_rate": 8.297872340425532e-07, + "loss": 1.5147, + "step": 78 + }, + { + "epoch": 0.02101063829787234, + "grad_norm": 5.839478969573975, + "learning_rate": 8.404255319148936e-07, + "loss": 1.3047, + "step": 79 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 5.6594767570495605, + "learning_rate": 8.510638297872341e-07, + "loss": 1.3499, + "step": 80 + }, + { + "epoch": 0.02154255319148936, + "grad_norm": 5.712738990783691, + "learning_rate": 8.617021276595745e-07, + "loss": 1.2731, + "step": 81 + }, + { + "epoch": 0.021808510638297873, + "grad_norm": 5.7129316329956055, + "learning_rate": 8.72340425531915e-07, + "loss": 1.2454, + "step": 82 + }, + { + "epoch": 0.022074468085106384, + "grad_norm": 5.676748275756836, + "learning_rate": 8.829787234042554e-07, + "loss": 1.4916, + "step": 83 + }, + { + "epoch": 0.022340425531914895, + "grad_norm": 5.481147289276123, + "learning_rate": 8.936170212765959e-07, + "loss": 1.3493, + "step": 84 + }, + { + "epoch": 0.022606382978723406, + "grad_norm": 5.774475574493408, + "learning_rate": 9.042553191489363e-07, + "loss": 1.2583, + "step": 85 + }, + { + "epoch": 0.022872340425531913, + "grad_norm": 6.059263229370117, + "learning_rate": 9.148936170212766e-07, + "loss": 1.2257, + "step": 86 + }, + { + "epoch": 0.023138297872340424, + "grad_norm": 5.5594258308410645, + "learning_rate": 9.25531914893617e-07, + "loss": 1.3313, + "step": 87 + }, + { + "epoch": 0.023404255319148935, + "grad_norm": 5.335761070251465, + "learning_rate": 9.361702127659575e-07, + "loss": 1.221, + "step": 88 + }, + { + "epoch": 0.023670212765957446, + "grad_norm": 5.275820255279541, + "learning_rate": 9.468085106382979e-07, + "loss": 1.315, + "step": 89 + }, + { + "epoch": 0.023936170212765957, + "grad_norm": 5.96125602722168, + "learning_rate": 9.574468085106384e-07, + "loss": 1.2792, + "step": 90 + }, + { + "epoch": 0.02420212765957447, + "grad_norm": 5.549777984619141, + "learning_rate": 9.680851063829788e-07, + "loss": 1.2194, + "step": 91 + }, + { + "epoch": 0.02446808510638298, + "grad_norm": 5.814997673034668, + "learning_rate": 9.787234042553193e-07, + "loss": 1.2917, + "step": 92 + }, + { + "epoch": 0.02473404255319149, + "grad_norm": 5.332813739776611, + "learning_rate": 
9.893617021276597e-07, + "loss": 1.2458, + "step": 93 + }, + { + "epoch": 0.025, + "grad_norm": 5.473198890686035, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.2752, + "step": 94 + }, + { + "epoch": 0.02526595744680851, + "grad_norm": 5.484592914581299, + "learning_rate": 1.0106382978723404e-06, + "loss": 1.3052, + "step": 95 + }, + { + "epoch": 0.02553191489361702, + "grad_norm": 6.4860453605651855, + "learning_rate": 1.0212765957446809e-06, + "loss": 1.4454, + "step": 96 + }, + { + "epoch": 0.02579787234042553, + "grad_norm": 5.582982540130615, + "learning_rate": 1.0319148936170213e-06, + "loss": 1.2514, + "step": 97 + }, + { + "epoch": 0.026063829787234042, + "grad_norm": 5.618495464324951, + "learning_rate": 1.0425531914893618e-06, + "loss": 1.4123, + "step": 98 + }, + { + "epoch": 0.026329787234042553, + "grad_norm": 5.169803619384766, + "learning_rate": 1.0531914893617022e-06, + "loss": 1.3128, + "step": 99 + }, + { + "epoch": 0.026595744680851064, + "grad_norm": 5.215284824371338, + "learning_rate": 1.0638297872340427e-06, + "loss": 1.4286, + "step": 100 + }, + { + "epoch": 0.026861702127659575, + "grad_norm": 5.888491153717041, + "learning_rate": 1.074468085106383e-06, + "loss": 1.2953, + "step": 101 + }, + { + "epoch": 0.027127659574468086, + "grad_norm": 5.597144603729248, + "learning_rate": 1.0851063829787236e-06, + "loss": 1.2401, + "step": 102 + }, + { + "epoch": 0.027393617021276597, + "grad_norm": 5.215080261230469, + "learning_rate": 1.095744680851064e-06, + "loss": 1.1961, + "step": 103 + }, + { + "epoch": 0.027659574468085105, + "grad_norm": 5.162172794342041, + "learning_rate": 1.1063829787234042e-06, + "loss": 1.2641, + "step": 104 + }, + { + "epoch": 0.027925531914893616, + "grad_norm": 5.490815162658691, + "learning_rate": 1.1170212765957447e-06, + "loss": 1.1788, + "step": 105 + }, + { + "epoch": 0.028191489361702127, + "grad_norm": 5.236513137817383, + "learning_rate": 1.1276595744680851e-06, + "loss": 1.3241, + "step": 106 + }, + { + "epoch": 0.028457446808510638, + "grad_norm": 5.335816860198975, + "learning_rate": 1.1382978723404256e-06, + "loss": 1.299, + "step": 107 + }, + { + "epoch": 0.02872340425531915, + "grad_norm": 5.176724910736084, + "learning_rate": 1.148936170212766e-06, + "loss": 1.3305, + "step": 108 + }, + { + "epoch": 0.02898936170212766, + "grad_norm": 6.114458084106445, + "learning_rate": 1.1595744680851065e-06, + "loss": 1.3005, + "step": 109 + }, + { + "epoch": 0.02925531914893617, + "grad_norm": 5.407876491546631, + "learning_rate": 1.170212765957447e-06, + "loss": 1.2806, + "step": 110 + }, + { + "epoch": 0.029521276595744682, + "grad_norm": 4.949467658996582, + "learning_rate": 1.1808510638297874e-06, + "loss": 1.2961, + "step": 111 + }, + { + "epoch": 0.029787234042553193, + "grad_norm": 6.091759204864502, + "learning_rate": 1.1914893617021278e-06, + "loss": 1.3533, + "step": 112 + }, + { + "epoch": 0.0300531914893617, + "grad_norm": 6.605318069458008, + "learning_rate": 1.202127659574468e-06, + "loss": 1.3292, + "step": 113 + }, + { + "epoch": 0.03031914893617021, + "grad_norm": 5.556684494018555, + "learning_rate": 1.2127659574468085e-06, + "loss": 1.2438, + "step": 114 + }, + { + "epoch": 0.030585106382978722, + "grad_norm": 5.465230941772461, + "learning_rate": 1.223404255319149e-06, + "loss": 1.2679, + "step": 115 + }, + { + "epoch": 0.030851063829787233, + "grad_norm": 5.770520210266113, + "learning_rate": 1.2340425531914894e-06, + "loss": 1.355, + "step": 116 + }, + { + "epoch": 0.031117021276595744, + "grad_norm": 
5.495830535888672, + "learning_rate": 1.2446808510638299e-06, + "loss": 1.2153, + "step": 117 + }, + { + "epoch": 0.03138297872340425, + "grad_norm": 5.549342632293701, + "learning_rate": 1.2553191489361701e-06, + "loss": 1.3283, + "step": 118 + }, + { + "epoch": 0.03164893617021276, + "grad_norm": 5.871270656585693, + "learning_rate": 1.2659574468085106e-06, + "loss": 1.2485, + "step": 119 + }, + { + "epoch": 0.031914893617021274, + "grad_norm": 5.074721813201904, + "learning_rate": 1.276595744680851e-06, + "loss": 1.2725, + "step": 120 + }, + { + "epoch": 0.032180851063829785, + "grad_norm": 5.2500715255737305, + "learning_rate": 1.2872340425531915e-06, + "loss": 1.1767, + "step": 121 + }, + { + "epoch": 0.032446808510638296, + "grad_norm": 5.220420837402344, + "learning_rate": 1.297872340425532e-06, + "loss": 1.2566, + "step": 122 + }, + { + "epoch": 0.03271276595744681, + "grad_norm": 5.691092014312744, + "learning_rate": 1.3085106382978724e-06, + "loss": 1.1828, + "step": 123 + }, + { + "epoch": 0.03297872340425532, + "grad_norm": 5.540714740753174, + "learning_rate": 1.3191489361702128e-06, + "loss": 1.4373, + "step": 124 + }, + { + "epoch": 0.03324468085106383, + "grad_norm": 5.538027286529541, + "learning_rate": 1.3297872340425533e-06, + "loss": 1.2955, + "step": 125 + }, + { + "epoch": 0.03351063829787234, + "grad_norm": 5.601515769958496, + "learning_rate": 1.3404255319148937e-06, + "loss": 1.4246, + "step": 126 + }, + { + "epoch": 0.03377659574468085, + "grad_norm": 5.398896217346191, + "learning_rate": 1.3510638297872342e-06, + "loss": 1.2479, + "step": 127 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 5.281778335571289, + "learning_rate": 1.3617021276595746e-06, + "loss": 1.4188, + "step": 128 + }, + { + "epoch": 0.03430851063829787, + "grad_norm": 5.898463249206543, + "learning_rate": 1.372340425531915e-06, + "loss": 1.2214, + "step": 129 + }, + { + "epoch": 0.034574468085106384, + "grad_norm": 5.390676975250244, + "learning_rate": 1.3829787234042555e-06, + "loss": 1.2872, + "step": 130 + }, + { + "epoch": 0.034840425531914895, + "grad_norm": 5.157502174377441, + "learning_rate": 1.393617021276596e-06, + "loss": 1.2954, + "step": 131 + }, + { + "epoch": 0.035106382978723406, + "grad_norm": 5.678062438964844, + "learning_rate": 1.4042553191489364e-06, + "loss": 1.2732, + "step": 132 + }, + { + "epoch": 0.03537234042553192, + "grad_norm": 5.359380722045898, + "learning_rate": 1.4148936170212769e-06, + "loss": 1.2858, + "step": 133 + }, + { + "epoch": 0.03563829787234043, + "grad_norm": 6.153907775878906, + "learning_rate": 1.4255319148936173e-06, + "loss": 1.3225, + "step": 134 + }, + { + "epoch": 0.03590425531914894, + "grad_norm": 5.03823709487915, + "learning_rate": 1.4361702127659578e-06, + "loss": 1.196, + "step": 135 + }, + { + "epoch": 0.036170212765957444, + "grad_norm": 5.12296199798584, + "learning_rate": 1.4468085106382978e-06, + "loss": 1.1534, + "step": 136 + }, + { + "epoch": 0.036436170212765955, + "grad_norm": 5.526867866516113, + "learning_rate": 1.4574468085106382e-06, + "loss": 1.3099, + "step": 137 + }, + { + "epoch": 0.036702127659574466, + "grad_norm": 5.23512601852417, + "learning_rate": 1.4680851063829787e-06, + "loss": 1.167, + "step": 138 + }, + { + "epoch": 0.03696808510638298, + "grad_norm": 5.28326940536499, + "learning_rate": 1.4787234042553191e-06, + "loss": 1.2882, + "step": 139 + }, + { + "epoch": 0.03723404255319149, + "grad_norm": 6.0062336921691895, + "learning_rate": 1.4893617021276596e-06, + "loss": 1.2937, + "step": 140 + }, + { 
+ "epoch": 0.0375, + "grad_norm": 5.471292495727539, + "learning_rate": 1.5e-06, + "loss": 1.2783, + "step": 141 + }, + { + "epoch": 0.03776595744680851, + "grad_norm": 4.784001350402832, + "learning_rate": 1.5106382978723405e-06, + "loss": 1.1493, + "step": 142 + }, + { + "epoch": 0.03803191489361702, + "grad_norm": 5.167656898498535, + "learning_rate": 1.521276595744681e-06, + "loss": 1.2872, + "step": 143 + }, + { + "epoch": 0.03829787234042553, + "grad_norm": 5.2528276443481445, + "learning_rate": 1.5319148936170214e-06, + "loss": 1.2876, + "step": 144 + }, + { + "epoch": 0.03856382978723404, + "grad_norm": 5.4960784912109375, + "learning_rate": 1.5425531914893618e-06, + "loss": 1.2364, + "step": 145 + }, + { + "epoch": 0.038829787234042554, + "grad_norm": 5.419551372528076, + "learning_rate": 1.5531914893617023e-06, + "loss": 1.3695, + "step": 146 + }, + { + "epoch": 0.039095744680851065, + "grad_norm": 5.1890974044799805, + "learning_rate": 1.5638297872340427e-06, + "loss": 1.2263, + "step": 147 + }, + { + "epoch": 0.039361702127659576, + "grad_norm": 5.578823566436768, + "learning_rate": 1.5744680851063832e-06, + "loss": 1.2531, + "step": 148 + }, + { + "epoch": 0.03962765957446809, + "grad_norm": 5.37275505065918, + "learning_rate": 1.5851063829787236e-06, + "loss": 1.2201, + "step": 149 + }, + { + "epoch": 0.0398936170212766, + "grad_norm": 5.344025135040283, + "learning_rate": 1.595744680851064e-06, + "loss": 1.1419, + "step": 150 + }, + { + "epoch": 0.04015957446808511, + "grad_norm": 5.697562217712402, + "learning_rate": 1.6063829787234045e-06, + "loss": 1.3923, + "step": 151 + }, + { + "epoch": 0.04042553191489362, + "grad_norm": 5.420823097229004, + "learning_rate": 1.617021276595745e-06, + "loss": 1.2936, + "step": 152 + }, + { + "epoch": 0.04069148936170213, + "grad_norm": 5.53727912902832, + "learning_rate": 1.6276595744680854e-06, + "loss": 1.2047, + "step": 153 + }, + { + "epoch": 0.040957446808510635, + "grad_norm": 5.577879428863525, + "learning_rate": 1.6382978723404255e-06, + "loss": 1.2495, + "step": 154 + }, + { + "epoch": 0.041223404255319146, + "grad_norm": 5.115095138549805, + "learning_rate": 1.648936170212766e-06, + "loss": 1.3324, + "step": 155 + }, + { + "epoch": 0.04148936170212766, + "grad_norm": 5.6801862716674805, + "learning_rate": 1.6595744680851064e-06, + "loss": 1.3554, + "step": 156 + }, + { + "epoch": 0.04175531914893617, + "grad_norm": 5.293743133544922, + "learning_rate": 1.6702127659574468e-06, + "loss": 1.2226, + "step": 157 + }, + { + "epoch": 0.04202127659574468, + "grad_norm": 5.129601955413818, + "learning_rate": 1.6808510638297873e-06, + "loss": 1.3393, + "step": 158 + }, + { + "epoch": 0.04228723404255319, + "grad_norm": 5.572645664215088, + "learning_rate": 1.6914893617021277e-06, + "loss": 1.2734, + "step": 159 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 4.944756507873535, + "learning_rate": 1.7021276595744682e-06, + "loss": 1.3417, + "step": 160 + }, + { + "epoch": 0.04281914893617021, + "grad_norm": 4.982651710510254, + "learning_rate": 1.7127659574468086e-06, + "loss": 1.2622, + "step": 161 + }, + { + "epoch": 0.04308510638297872, + "grad_norm": 5.134377479553223, + "learning_rate": 1.723404255319149e-06, + "loss": 1.1741, + "step": 162 + }, + { + "epoch": 0.043351063829787234, + "grad_norm": 4.829857349395752, + "learning_rate": 1.7340425531914895e-06, + "loss": 1.2298, + "step": 163 + }, + { + "epoch": 0.043617021276595745, + "grad_norm": 5.052809715270996, + "learning_rate": 1.74468085106383e-06, + "loss": 1.1607, + 
"step": 164 + }, + { + "epoch": 0.043882978723404256, + "grad_norm": 5.3465776443481445, + "learning_rate": 1.7553191489361704e-06, + "loss": 1.3924, + "step": 165 + }, + { + "epoch": 0.04414893617021277, + "grad_norm": 5.502316951751709, + "learning_rate": 1.7659574468085109e-06, + "loss": 1.1488, + "step": 166 + }, + { + "epoch": 0.04441489361702128, + "grad_norm": 5.253002643585205, + "learning_rate": 1.7765957446808513e-06, + "loss": 1.2004, + "step": 167 + }, + { + "epoch": 0.04468085106382979, + "grad_norm": 5.437882900238037, + "learning_rate": 1.7872340425531918e-06, + "loss": 1.3885, + "step": 168 + }, + { + "epoch": 0.0449468085106383, + "grad_norm": 5.526264190673828, + "learning_rate": 1.7978723404255322e-06, + "loss": 1.2351, + "step": 169 + }, + { + "epoch": 0.04521276595744681, + "grad_norm": 5.078868389129639, + "learning_rate": 1.8085106382978727e-06, + "loss": 1.1479, + "step": 170 + }, + { + "epoch": 0.04547872340425532, + "grad_norm": 5.379688739776611, + "learning_rate": 1.8191489361702131e-06, + "loss": 1.246, + "step": 171 + }, + { + "epoch": 0.045744680851063826, + "grad_norm": 4.756881237030029, + "learning_rate": 1.8297872340425531e-06, + "loss": 1.3602, + "step": 172 + }, + { + "epoch": 0.04601063829787234, + "grad_norm": 5.651166915893555, + "learning_rate": 1.8404255319148936e-06, + "loss": 1.1183, + "step": 173 + }, + { + "epoch": 0.04627659574468085, + "grad_norm": 5.725973129272461, + "learning_rate": 1.851063829787234e-06, + "loss": 1.2474, + "step": 174 + }, + { + "epoch": 0.04654255319148936, + "grad_norm": 4.994713306427002, + "learning_rate": 1.8617021276595745e-06, + "loss": 1.1945, + "step": 175 + }, + { + "epoch": 0.04680851063829787, + "grad_norm": 4.701328277587891, + "learning_rate": 1.872340425531915e-06, + "loss": 1.2735, + "step": 176 + }, + { + "epoch": 0.04707446808510638, + "grad_norm": 5.917819023132324, + "learning_rate": 1.8829787234042554e-06, + "loss": 1.2192, + "step": 177 + }, + { + "epoch": 0.04734042553191489, + "grad_norm": 5.055963039398193, + "learning_rate": 1.8936170212765958e-06, + "loss": 1.4119, + "step": 178 + }, + { + "epoch": 0.047606382978723404, + "grad_norm": 5.516870021820068, + "learning_rate": 1.9042553191489363e-06, + "loss": 1.2739, + "step": 179 + }, + { + "epoch": 0.047872340425531915, + "grad_norm": 5.217896461486816, + "learning_rate": 1.9148936170212767e-06, + "loss": 1.0916, + "step": 180 + }, + { + "epoch": 0.048138297872340426, + "grad_norm": 5.3772807121276855, + "learning_rate": 1.925531914893617e-06, + "loss": 1.2636, + "step": 181 + }, + { + "epoch": 0.04840425531914894, + "grad_norm": 5.261349678039551, + "learning_rate": 1.9361702127659576e-06, + "loss": 1.1872, + "step": 182 + }, + { + "epoch": 0.04867021276595745, + "grad_norm": 5.209681510925293, + "learning_rate": 1.946808510638298e-06, + "loss": 1.1946, + "step": 183 + }, + { + "epoch": 0.04893617021276596, + "grad_norm": 6.393560886383057, + "learning_rate": 1.9574468085106385e-06, + "loss": 1.4354, + "step": 184 + }, + { + "epoch": 0.04920212765957447, + "grad_norm": 5.200966835021973, + "learning_rate": 1.968085106382979e-06, + "loss": 1.264, + "step": 185 + }, + { + "epoch": 0.04946808510638298, + "grad_norm": 4.81060791015625, + "learning_rate": 1.9787234042553194e-06, + "loss": 1.345, + "step": 186 + }, + { + "epoch": 0.04973404255319149, + "grad_norm": 5.786832332611084, + "learning_rate": 1.98936170212766e-06, + "loss": 1.2897, + "step": 187 + }, + { + "epoch": 0.05, + "grad_norm": 5.332983493804932, + "learning_rate": 
2.0000000000000003e-06, + "loss": 1.3621, + "step": 188 + }, + { + "epoch": 0.050265957446808514, + "grad_norm": 5.093095779418945, + "learning_rate": 2.0106382978723408e-06, + "loss": 1.3366, + "step": 189 + }, + { + "epoch": 0.05053191489361702, + "grad_norm": 5.604922771453857, + "learning_rate": 2.021276595744681e-06, + "loss": 1.2009, + "step": 190 + }, + { + "epoch": 0.05079787234042553, + "grad_norm": 5.312707901000977, + "learning_rate": 2.0319148936170213e-06, + "loss": 1.1604, + "step": 191 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 5.330122470855713, + "learning_rate": 2.0425531914893617e-06, + "loss": 1.2102, + "step": 192 + }, + { + "epoch": 0.05132978723404255, + "grad_norm": 5.350152015686035, + "learning_rate": 2.053191489361702e-06, + "loss": 1.3483, + "step": 193 + }, + { + "epoch": 0.05159574468085106, + "grad_norm": 5.540630340576172, + "learning_rate": 2.0638297872340426e-06, + "loss": 1.437, + "step": 194 + }, + { + "epoch": 0.05186170212765957, + "grad_norm": 4.698929309844971, + "learning_rate": 2.074468085106383e-06, + "loss": 1.2083, + "step": 195 + }, + { + "epoch": 0.052127659574468084, + "grad_norm": 5.128317356109619, + "learning_rate": 2.0851063829787235e-06, + "loss": 1.1502, + "step": 196 + }, + { + "epoch": 0.052393617021276595, + "grad_norm": 5.425604343414307, + "learning_rate": 2.095744680851064e-06, + "loss": 1.2919, + "step": 197 + }, + { + "epoch": 0.052659574468085106, + "grad_norm": 5.3685712814331055, + "learning_rate": 2.1063829787234044e-06, + "loss": 1.2305, + "step": 198 + }, + { + "epoch": 0.05292553191489362, + "grad_norm": 6.010136127471924, + "learning_rate": 2.117021276595745e-06, + "loss": 1.0582, + "step": 199 + }, + { + "epoch": 0.05319148936170213, + "grad_norm": 5.427469253540039, + "learning_rate": 2.1276595744680853e-06, + "loss": 1.2515, + "step": 200 + }, + { + "epoch": 0.05345744680851064, + "grad_norm": 5.31635856628418, + "learning_rate": 2.1382978723404258e-06, + "loss": 1.2157, + "step": 201 + }, + { + "epoch": 0.05372340425531915, + "grad_norm": 5.334502220153809, + "learning_rate": 2.148936170212766e-06, + "loss": 1.271, + "step": 202 + }, + { + "epoch": 0.05398936170212766, + "grad_norm": 4.88215970993042, + "learning_rate": 2.1595744680851067e-06, + "loss": 1.2777, + "step": 203 + }, + { + "epoch": 0.05425531914893617, + "grad_norm": 5.919299602508545, + "learning_rate": 2.170212765957447e-06, + "loss": 1.3336, + "step": 204 + }, + { + "epoch": 0.05452127659574468, + "grad_norm": 5.037824630737305, + "learning_rate": 2.1808510638297876e-06, + "loss": 1.316, + "step": 205 + }, + { + "epoch": 0.054787234042553194, + "grad_norm": 5.16343879699707, + "learning_rate": 2.191489361702128e-06, + "loss": 1.2724, + "step": 206 + }, + { + "epoch": 0.055053191489361705, + "grad_norm": 5.36834192276001, + "learning_rate": 2.2021276595744685e-06, + "loss": 1.1693, + "step": 207 + }, + { + "epoch": 0.05531914893617021, + "grad_norm": 4.99350118637085, + "learning_rate": 2.2127659574468085e-06, + "loss": 1.225, + "step": 208 + }, + { + "epoch": 0.05558510638297872, + "grad_norm": 5.564612865447998, + "learning_rate": 2.223404255319149e-06, + "loss": 1.2125, + "step": 209 + }, + { + "epoch": 0.05585106382978723, + "grad_norm": 5.21875, + "learning_rate": 2.2340425531914894e-06, + "loss": 1.3788, + "step": 210 + }, + { + "epoch": 0.05611702127659574, + "grad_norm": 5.006836891174316, + "learning_rate": 2.24468085106383e-06, + "loss": 1.2095, + "step": 211 + }, + { + "epoch": 0.05638297872340425, + "grad_norm": 
5.6003546714782715, + "learning_rate": 2.2553191489361703e-06, + "loss": 1.3872, + "step": 212 + }, + { + "epoch": 0.056648936170212764, + "grad_norm": 4.7773613929748535, + "learning_rate": 2.2659574468085107e-06, + "loss": 1.1979, + "step": 213 + }, + { + "epoch": 0.056914893617021275, + "grad_norm": 4.554566860198975, + "learning_rate": 2.276595744680851e-06, + "loss": 1.1656, + "step": 214 + }, + { + "epoch": 0.057180851063829786, + "grad_norm": 5.66951322555542, + "learning_rate": 2.2872340425531916e-06, + "loss": 1.3728, + "step": 215 + }, + { + "epoch": 0.0574468085106383, + "grad_norm": 5.2931013107299805, + "learning_rate": 2.297872340425532e-06, + "loss": 1.2003, + "step": 216 + }, + { + "epoch": 0.05771276595744681, + "grad_norm": 5.449213981628418, + "learning_rate": 2.3085106382978725e-06, + "loss": 1.2337, + "step": 217 + }, + { + "epoch": 0.05797872340425532, + "grad_norm": 5.684970378875732, + "learning_rate": 2.319148936170213e-06, + "loss": 1.2196, + "step": 218 + }, + { + "epoch": 0.05824468085106383, + "grad_norm": 5.038141250610352, + "learning_rate": 2.3297872340425534e-06, + "loss": 1.0954, + "step": 219 + }, + { + "epoch": 0.05851063829787234, + "grad_norm": 5.255678176879883, + "learning_rate": 2.340425531914894e-06, + "loss": 1.3141, + "step": 220 + }, + { + "epoch": 0.05877659574468085, + "grad_norm": 5.490760326385498, + "learning_rate": 2.3510638297872343e-06, + "loss": 1.1469, + "step": 221 + }, + { + "epoch": 0.059042553191489364, + "grad_norm": 5.482240676879883, + "learning_rate": 2.3617021276595748e-06, + "loss": 1.2831, + "step": 222 + }, + { + "epoch": 0.059308510638297875, + "grad_norm": 6.045271873474121, + "learning_rate": 2.3723404255319152e-06, + "loss": 1.1601, + "step": 223 + }, + { + "epoch": 0.059574468085106386, + "grad_norm": 5.145684719085693, + "learning_rate": 2.3829787234042557e-06, + "loss": 1.1432, + "step": 224 + }, + { + "epoch": 0.0598404255319149, + "grad_norm": 4.948934555053711, + "learning_rate": 2.393617021276596e-06, + "loss": 1.1199, + "step": 225 + }, + { + "epoch": 0.0601063829787234, + "grad_norm": 5.273087978363037, + "learning_rate": 2.404255319148936e-06, + "loss": 1.3225, + "step": 226 + }, + { + "epoch": 0.06037234042553191, + "grad_norm": 5.76677131652832, + "learning_rate": 2.4148936170212766e-06, + "loss": 1.3144, + "step": 227 + }, + { + "epoch": 0.06063829787234042, + "grad_norm": 5.51316499710083, + "learning_rate": 2.425531914893617e-06, + "loss": 1.2931, + "step": 228 + }, + { + "epoch": 0.060904255319148934, + "grad_norm": 5.077220916748047, + "learning_rate": 2.4361702127659575e-06, + "loss": 1.1972, + "step": 229 + }, + { + "epoch": 0.061170212765957445, + "grad_norm": 5.733246803283691, + "learning_rate": 2.446808510638298e-06, + "loss": 1.2773, + "step": 230 + }, + { + "epoch": 0.061436170212765956, + "grad_norm": 4.702721118927002, + "learning_rate": 2.4574468085106384e-06, + "loss": 1.2654, + "step": 231 + }, + { + "epoch": 0.06170212765957447, + "grad_norm": 5.210516452789307, + "learning_rate": 2.468085106382979e-06, + "loss": 1.3222, + "step": 232 + }, + { + "epoch": 0.06196808510638298, + "grad_norm": 5.6721720695495605, + "learning_rate": 2.4787234042553193e-06, + "loss": 1.1756, + "step": 233 + }, + { + "epoch": 0.06223404255319149, + "grad_norm": 4.598169326782227, + "learning_rate": 2.4893617021276598e-06, + "loss": 1.2613, + "step": 234 + }, + { + "epoch": 0.0625, + "grad_norm": 5.069137096405029, + "learning_rate": 2.5e-06, + "loss": 1.2629, + "step": 235 + }, + { + "epoch": 0.0627659574468085, 
+ "grad_norm": 4.875532627105713, + "learning_rate": 2.5106382978723402e-06, + "loss": 1.1515, + "step": 236 + }, + { + "epoch": 0.06303191489361702, + "grad_norm": 5.547458171844482, + "learning_rate": 2.521276595744681e-06, + "loss": 1.4157, + "step": 237 + }, + { + "epoch": 0.06329787234042553, + "grad_norm": 5.377124786376953, + "learning_rate": 2.531914893617021e-06, + "loss": 1.3036, + "step": 238 + }, + { + "epoch": 0.06356382978723404, + "grad_norm": 5.135563850402832, + "learning_rate": 2.542553191489362e-06, + "loss": 1.1638, + "step": 239 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 5.6008172035217285, + "learning_rate": 2.553191489361702e-06, + "loss": 1.2787, + "step": 240 + }, + { + "epoch": 0.06409574468085107, + "grad_norm": 5.453914165496826, + "learning_rate": 2.563829787234043e-06, + "loss": 1.3239, + "step": 241 + }, + { + "epoch": 0.06436170212765957, + "grad_norm": 5.219985485076904, + "learning_rate": 2.574468085106383e-06, + "loss": 1.0942, + "step": 242 + }, + { + "epoch": 0.06462765957446809, + "grad_norm": 5.180700778961182, + "learning_rate": 2.585106382978724e-06, + "loss": 1.1501, + "step": 243 + }, + { + "epoch": 0.06489361702127659, + "grad_norm": 5.2240071296691895, + "learning_rate": 2.595744680851064e-06, + "loss": 1.2269, + "step": 244 + }, + { + "epoch": 0.06515957446808511, + "grad_norm": 6.328047275543213, + "learning_rate": 2.6063829787234047e-06, + "loss": 1.405, + "step": 245 + }, + { + "epoch": 0.06542553191489361, + "grad_norm": 5.10886287689209, + "learning_rate": 2.6170212765957447e-06, + "loss": 1.2698, + "step": 246 + }, + { + "epoch": 0.06569148936170213, + "grad_norm": 5.45538330078125, + "learning_rate": 2.6276595744680856e-06, + "loss": 1.33, + "step": 247 + }, + { + "epoch": 0.06595744680851064, + "grad_norm": 5.294386386871338, + "learning_rate": 2.6382978723404256e-06, + "loss": 1.2895, + "step": 248 + }, + { + "epoch": 0.06622340425531915, + "grad_norm": 4.7668776512146, + "learning_rate": 2.6489361702127665e-06, + "loss": 1.1176, + "step": 249 + }, + { + "epoch": 0.06648936170212766, + "grad_norm": 4.915814399719238, + "learning_rate": 2.6595744680851065e-06, + "loss": 1.2469, + "step": 250 + }, + { + "epoch": 0.06675531914893618, + "grad_norm": 5.320147514343262, + "learning_rate": 2.6702127659574474e-06, + "loss": 1.4904, + "step": 251 + }, + { + "epoch": 0.06702127659574468, + "grad_norm": 5.417577266693115, + "learning_rate": 2.6808510638297874e-06, + "loss": 1.3166, + "step": 252 + }, + { + "epoch": 0.0672872340425532, + "grad_norm": 4.704782485961914, + "learning_rate": 2.6914893617021283e-06, + "loss": 1.2362, + "step": 253 + }, + { + "epoch": 0.0675531914893617, + "grad_norm": 5.100544452667236, + "learning_rate": 2.7021276595744683e-06, + "loss": 1.2969, + "step": 254 + }, + { + "epoch": 0.0678191489361702, + "grad_norm": 6.336488723754883, + "learning_rate": 2.7127659574468084e-06, + "loss": 1.2708, + "step": 255 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 5.281217098236084, + "learning_rate": 2.7234042553191492e-06, + "loss": 1.3103, + "step": 256 + }, + { + "epoch": 0.06835106382978723, + "grad_norm": 5.127480983734131, + "learning_rate": 2.7340425531914893e-06, + "loss": 1.2957, + "step": 257 + }, + { + "epoch": 0.06861702127659575, + "grad_norm": 5.289313316345215, + "learning_rate": 2.74468085106383e-06, + "loss": 1.2658, + "step": 258 + }, + { + "epoch": 0.06888297872340425, + "grad_norm": 5.088155746459961, + "learning_rate": 2.75531914893617e-06, + "loss": 1.1359, + "step": 259 + }, + { + 
"epoch": 0.06914893617021277, + "grad_norm": 5.367323875427246, + "learning_rate": 2.765957446808511e-06, + "loss": 1.2408, + "step": 260 + }, + { + "epoch": 0.06941489361702127, + "grad_norm": 5.337047576904297, + "learning_rate": 2.776595744680851e-06, + "loss": 1.2908, + "step": 261 + }, + { + "epoch": 0.06968085106382979, + "grad_norm": 5.167153358459473, + "learning_rate": 2.787234042553192e-06, + "loss": 1.3217, + "step": 262 + }, + { + "epoch": 0.0699468085106383, + "grad_norm": 5.522439956665039, + "learning_rate": 2.797872340425532e-06, + "loss": 1.2799, + "step": 263 + }, + { + "epoch": 0.07021276595744681, + "grad_norm": 4.691408157348633, + "learning_rate": 2.808510638297873e-06, + "loss": 1.096, + "step": 264 + }, + { + "epoch": 0.07047872340425532, + "grad_norm": 5.208773612976074, + "learning_rate": 2.819148936170213e-06, + "loss": 1.3215, + "step": 265 + }, + { + "epoch": 0.07074468085106383, + "grad_norm": 5.4790496826171875, + "learning_rate": 2.8297872340425537e-06, + "loss": 1.4218, + "step": 266 + }, + { + "epoch": 0.07101063829787234, + "grad_norm": 5.256765842437744, + "learning_rate": 2.8404255319148938e-06, + "loss": 1.4242, + "step": 267 + }, + { + "epoch": 0.07127659574468086, + "grad_norm": 4.874395370483398, + "learning_rate": 2.8510638297872346e-06, + "loss": 1.2518, + "step": 268 + }, + { + "epoch": 0.07154255319148936, + "grad_norm": 5.108527183532715, + "learning_rate": 2.8617021276595747e-06, + "loss": 1.2919, + "step": 269 + }, + { + "epoch": 0.07180851063829788, + "grad_norm": 5.333227157592773, + "learning_rate": 2.8723404255319155e-06, + "loss": 1.459, + "step": 270 + }, + { + "epoch": 0.07207446808510638, + "grad_norm": 5.232532501220703, + "learning_rate": 2.8829787234042556e-06, + "loss": 1.1832, + "step": 271 + }, + { + "epoch": 0.07234042553191489, + "grad_norm": 5.147657871246338, + "learning_rate": 2.8936170212765956e-06, + "loss": 1.3219, + "step": 272 + }, + { + "epoch": 0.0726063829787234, + "grad_norm": 5.002472400665283, + "learning_rate": 2.9042553191489365e-06, + "loss": 1.2989, + "step": 273 + }, + { + "epoch": 0.07287234042553191, + "grad_norm": 4.903095722198486, + "learning_rate": 2.9148936170212765e-06, + "loss": 1.1621, + "step": 274 + }, + { + "epoch": 0.07313829787234043, + "grad_norm": 5.269963264465332, + "learning_rate": 2.9255319148936174e-06, + "loss": 1.2966, + "step": 275 + }, + { + "epoch": 0.07340425531914893, + "grad_norm": 5.356837749481201, + "learning_rate": 2.9361702127659574e-06, + "loss": 1.2455, + "step": 276 + }, + { + "epoch": 0.07367021276595745, + "grad_norm": 5.510587215423584, + "learning_rate": 2.9468085106382983e-06, + "loss": 1.2386, + "step": 277 + }, + { + "epoch": 0.07393617021276595, + "grad_norm": 5.7554755210876465, + "learning_rate": 2.9574468085106383e-06, + "loss": 1.3096, + "step": 278 + }, + { + "epoch": 0.07420212765957447, + "grad_norm": 5.236169815063477, + "learning_rate": 2.968085106382979e-06, + "loss": 1.2496, + "step": 279 + }, + { + "epoch": 0.07446808510638298, + "grad_norm": 4.870725631713867, + "learning_rate": 2.978723404255319e-06, + "loss": 1.083, + "step": 280 + }, + { + "epoch": 0.0747340425531915, + "grad_norm": 5.181726455688477, + "learning_rate": 2.98936170212766e-06, + "loss": 1.223, + "step": 281 + }, + { + "epoch": 0.075, + "grad_norm": 4.924530506134033, + "learning_rate": 3e-06, + "loss": 1.2855, + "step": 282 + }, + { + "epoch": 0.07526595744680852, + "grad_norm": 5.177605628967285, + "learning_rate": 3.010638297872341e-06, + "loss": 1.2215, + "step": 283 + }, + { + 
"epoch": 0.07553191489361702, + "grad_norm": 4.895737648010254, + "learning_rate": 3.021276595744681e-06, + "loss": 1.2451, + "step": 284 + }, + { + "epoch": 0.07579787234042554, + "grad_norm": 5.425995349884033, + "learning_rate": 3.031914893617022e-06, + "loss": 1.6053, + "step": 285 + }, + { + "epoch": 0.07606382978723404, + "grad_norm": 5.228978157043457, + "learning_rate": 3.042553191489362e-06, + "loss": 1.1846, + "step": 286 + }, + { + "epoch": 0.07632978723404256, + "grad_norm": 4.825231552124023, + "learning_rate": 3.0531914893617027e-06, + "loss": 1.1355, + "step": 287 + }, + { + "epoch": 0.07659574468085106, + "grad_norm": 6.309840679168701, + "learning_rate": 3.0638297872340428e-06, + "loss": 1.1388, + "step": 288 + }, + { + "epoch": 0.07686170212765958, + "grad_norm": 5.012725830078125, + "learning_rate": 3.0744680851063836e-06, + "loss": 0.9926, + "step": 289 + }, + { + "epoch": 0.07712765957446809, + "grad_norm": 5.028249263763428, + "learning_rate": 3.0851063829787237e-06, + "loss": 1.2024, + "step": 290 + }, + { + "epoch": 0.07739361702127659, + "grad_norm": 5.77925968170166, + "learning_rate": 3.0957446808510637e-06, + "loss": 1.5436, + "step": 291 + }, + { + "epoch": 0.07765957446808511, + "grad_norm": 5.277095794677734, + "learning_rate": 3.1063829787234046e-06, + "loss": 1.2018, + "step": 292 + }, + { + "epoch": 0.07792553191489361, + "grad_norm": 5.4600958824157715, + "learning_rate": 3.1170212765957446e-06, + "loss": 1.072, + "step": 293 + }, + { + "epoch": 0.07819148936170213, + "grad_norm": 5.168891906738281, + "learning_rate": 3.1276595744680855e-06, + "loss": 1.3841, + "step": 294 + }, + { + "epoch": 0.07845744680851063, + "grad_norm": 4.869060516357422, + "learning_rate": 3.1382978723404255e-06, + "loss": 1.1663, + "step": 295 + }, + { + "epoch": 0.07872340425531915, + "grad_norm": 5.289313316345215, + "learning_rate": 3.1489361702127664e-06, + "loss": 1.0781, + "step": 296 + }, + { + "epoch": 0.07898936170212766, + "grad_norm": 5.145017147064209, + "learning_rate": 3.1595744680851064e-06, + "loss": 1.1087, + "step": 297 + }, + { + "epoch": 0.07925531914893617, + "grad_norm": 5.634250640869141, + "learning_rate": 3.1702127659574473e-06, + "loss": 1.3936, + "step": 298 + }, + { + "epoch": 0.07952127659574468, + "grad_norm": 5.201961040496826, + "learning_rate": 3.1808510638297873e-06, + "loss": 1.3752, + "step": 299 + }, + { + "epoch": 0.0797872340425532, + "grad_norm": 5.372065544128418, + "learning_rate": 3.191489361702128e-06, + "loss": 1.1715, + "step": 300 + }, + { + "epoch": 0.0800531914893617, + "grad_norm": 6.010387420654297, + "learning_rate": 3.202127659574468e-06, + "loss": 1.2187, + "step": 301 + }, + { + "epoch": 0.08031914893617022, + "grad_norm": 5.143375396728516, + "learning_rate": 3.212765957446809e-06, + "loss": 1.2051, + "step": 302 + }, + { + "epoch": 0.08058510638297872, + "grad_norm": 5.376684665679932, + "learning_rate": 3.223404255319149e-06, + "loss": 1.2319, + "step": 303 + }, + { + "epoch": 0.08085106382978724, + "grad_norm": 4.905093193054199, + "learning_rate": 3.23404255319149e-06, + "loss": 1.2187, + "step": 304 + }, + { + "epoch": 0.08111702127659574, + "grad_norm": 5.650513648986816, + "learning_rate": 3.24468085106383e-06, + "loss": 1.1528, + "step": 305 + }, + { + "epoch": 0.08138297872340426, + "grad_norm": 5.2889227867126465, + "learning_rate": 3.255319148936171e-06, + "loss": 1.0795, + "step": 306 + }, + { + "epoch": 0.08164893617021277, + "grad_norm": 5.284914970397949, + "learning_rate": 3.265957446808511e-06, + "loss": 
1.2885, + "step": 307 + }, + { + "epoch": 0.08191489361702127, + "grad_norm": 5.4190449714660645, + "learning_rate": 3.276595744680851e-06, + "loss": 1.4991, + "step": 308 + }, + { + "epoch": 0.08218085106382979, + "grad_norm": 4.965026378631592, + "learning_rate": 3.287234042553192e-06, + "loss": 1.2674, + "step": 309 + }, + { + "epoch": 0.08244680851063829, + "grad_norm": 5.040426254272461, + "learning_rate": 3.297872340425532e-06, + "loss": 1.2347, + "step": 310 + }, + { + "epoch": 0.08271276595744681, + "grad_norm": 5.759904384613037, + "learning_rate": 3.3085106382978727e-06, + "loss": 1.2976, + "step": 311 + }, + { + "epoch": 0.08297872340425531, + "grad_norm": 4.893044471740723, + "learning_rate": 3.3191489361702127e-06, + "loss": 1.213, + "step": 312 + }, + { + "epoch": 0.08324468085106383, + "grad_norm": 4.674813270568848, + "learning_rate": 3.3297872340425536e-06, + "loss": 1.2795, + "step": 313 + }, + { + "epoch": 0.08351063829787234, + "grad_norm": 5.59810209274292, + "learning_rate": 3.3404255319148936e-06, + "loss": 1.2338, + "step": 314 + }, + { + "epoch": 0.08377659574468085, + "grad_norm": 4.63198709487915, + "learning_rate": 3.3510638297872345e-06, + "loss": 1.2026, + "step": 315 + }, + { + "epoch": 0.08404255319148936, + "grad_norm": 5.4756245613098145, + "learning_rate": 3.3617021276595745e-06, + "loss": 1.2838, + "step": 316 + }, + { + "epoch": 0.08430851063829788, + "grad_norm": 5.258046627044678, + "learning_rate": 3.3723404255319154e-06, + "loss": 1.1449, + "step": 317 + }, + { + "epoch": 0.08457446808510638, + "grad_norm": 5.205422878265381, + "learning_rate": 3.3829787234042554e-06, + "loss": 1.223, + "step": 318 + }, + { + "epoch": 0.0848404255319149, + "grad_norm": 5.365026473999023, + "learning_rate": 3.3936170212765963e-06, + "loss": 1.191, + "step": 319 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 5.367187023162842, + "learning_rate": 3.4042553191489363e-06, + "loss": 1.2246, + "step": 320 + }, + { + "epoch": 0.08537234042553192, + "grad_norm": 5.512171745300293, + "learning_rate": 3.414893617021277e-06, + "loss": 1.2601, + "step": 321 + }, + { + "epoch": 0.08563829787234042, + "grad_norm": 5.804540157318115, + "learning_rate": 3.4255319148936172e-06, + "loss": 1.1537, + "step": 322 + }, + { + "epoch": 0.08590425531914894, + "grad_norm": 5.474178791046143, + "learning_rate": 3.436170212765958e-06, + "loss": 1.3175, + "step": 323 + }, + { + "epoch": 0.08617021276595745, + "grad_norm": 5.454108715057373, + "learning_rate": 3.446808510638298e-06, + "loss": 1.1764, + "step": 324 + }, + { + "epoch": 0.08643617021276596, + "grad_norm": 5.368601322174072, + "learning_rate": 3.457446808510639e-06, + "loss": 1.2001, + "step": 325 + }, + { + "epoch": 0.08670212765957447, + "grad_norm": 5.19401741027832, + "learning_rate": 3.468085106382979e-06, + "loss": 1.2673, + "step": 326 + }, + { + "epoch": 0.08696808510638297, + "grad_norm": 4.70231294631958, + "learning_rate": 3.478723404255319e-06, + "loss": 1.1736, + "step": 327 + }, + { + "epoch": 0.08723404255319149, + "grad_norm": 5.607789039611816, + "learning_rate": 3.48936170212766e-06, + "loss": 1.1986, + "step": 328 + }, + { + "epoch": 0.0875, + "grad_norm": 5.1046013832092285, + "learning_rate": 3.5e-06, + "loss": 1.2426, + "step": 329 + }, + { + "epoch": 0.08776595744680851, + "grad_norm": 5.214546203613281, + "learning_rate": 3.510638297872341e-06, + "loss": 1.1211, + "step": 330 + }, + { + "epoch": 0.08803191489361702, + "grad_norm": 4.989225387573242, + "learning_rate": 3.521276595744681e-06, + "loss": 
1.3025, + "step": 331 + }, + { + "epoch": 0.08829787234042553, + "grad_norm": 4.886022567749023, + "learning_rate": 3.5319148936170217e-06, + "loss": 1.2109, + "step": 332 + }, + { + "epoch": 0.08856382978723404, + "grad_norm": 5.30552339553833, + "learning_rate": 3.5425531914893617e-06, + "loss": 1.1811, + "step": 333 + }, + { + "epoch": 0.08882978723404256, + "grad_norm": 4.81152868270874, + "learning_rate": 3.5531914893617026e-06, + "loss": 1.1677, + "step": 334 + }, + { + "epoch": 0.08909574468085106, + "grad_norm": 5.06434440612793, + "learning_rate": 3.5638297872340426e-06, + "loss": 1.2425, + "step": 335 + }, + { + "epoch": 0.08936170212765958, + "grad_norm": 7.036694526672363, + "learning_rate": 3.5744680851063835e-06, + "loss": 1.2682, + "step": 336 + }, + { + "epoch": 0.08962765957446808, + "grad_norm": 5.208419322967529, + "learning_rate": 3.5851063829787235e-06, + "loss": 1.2394, + "step": 337 + }, + { + "epoch": 0.0898936170212766, + "grad_norm": 4.592006206512451, + "learning_rate": 3.5957446808510644e-06, + "loss": 1.2083, + "step": 338 + }, + { + "epoch": 0.0901595744680851, + "grad_norm": 5.002110481262207, + "learning_rate": 3.6063829787234044e-06, + "loss": 1.2284, + "step": 339 + }, + { + "epoch": 0.09042553191489362, + "grad_norm": 4.708452224731445, + "learning_rate": 3.6170212765957453e-06, + "loss": 1.1616, + "step": 340 + }, + { + "epoch": 0.09069148936170213, + "grad_norm": 4.872410297393799, + "learning_rate": 3.6276595744680853e-06, + "loss": 1.181, + "step": 341 + }, + { + "epoch": 0.09095744680851064, + "grad_norm": 5.24644136428833, + "learning_rate": 3.6382978723404262e-06, + "loss": 1.285, + "step": 342 + }, + { + "epoch": 0.09122340425531915, + "grad_norm": 5.019744396209717, + "learning_rate": 3.6489361702127662e-06, + "loss": 1.2677, + "step": 343 + }, + { + "epoch": 0.09148936170212765, + "grad_norm": 6.380999565124512, + "learning_rate": 3.6595744680851063e-06, + "loss": 1.1268, + "step": 344 + }, + { + "epoch": 0.09175531914893617, + "grad_norm": 5.100999355316162, + "learning_rate": 3.670212765957447e-06, + "loss": 1.2023, + "step": 345 + }, + { + "epoch": 0.09202127659574467, + "grad_norm": 5.221463203430176, + "learning_rate": 3.680851063829787e-06, + "loss": 1.2482, + "step": 346 + }, + { + "epoch": 0.09228723404255319, + "grad_norm": 4.895312309265137, + "learning_rate": 3.691489361702128e-06, + "loss": 1.2515, + "step": 347 + }, + { + "epoch": 0.0925531914893617, + "grad_norm": 4.988393306732178, + "learning_rate": 3.702127659574468e-06, + "loss": 1.1969, + "step": 348 + }, + { + "epoch": 0.09281914893617021, + "grad_norm": 5.19982385635376, + "learning_rate": 3.712765957446809e-06, + "loss": 1.2488, + "step": 349 + }, + { + "epoch": 0.09308510638297872, + "grad_norm": 5.010618686676025, + "learning_rate": 3.723404255319149e-06, + "loss": 1.2475, + "step": 350 + }, + { + "epoch": 0.09335106382978724, + "grad_norm": 4.905212879180908, + "learning_rate": 3.73404255319149e-06, + "loss": 1.3921, + "step": 351 + }, + { + "epoch": 0.09361702127659574, + "grad_norm": 5.373055458068848, + "learning_rate": 3.74468085106383e-06, + "loss": 1.4741, + "step": 352 + }, + { + "epoch": 0.09388297872340426, + "grad_norm": 4.804662704467773, + "learning_rate": 3.7553191489361707e-06, + "loss": 1.2208, + "step": 353 + }, + { + "epoch": 0.09414893617021276, + "grad_norm": 5.451242923736572, + "learning_rate": 3.7659574468085108e-06, + "loss": 1.3764, + "step": 354 + }, + { + "epoch": 0.09441489361702128, + "grad_norm": 5.5642409324646, + "learning_rate": 
3.7765957446808516e-06, + "loss": 1.4001, + "step": 355 + }, + { + "epoch": 0.09468085106382979, + "grad_norm": 4.492448806762695, + "learning_rate": 3.7872340425531917e-06, + "loss": 1.1094, + "step": 356 + }, + { + "epoch": 0.0949468085106383, + "grad_norm": 5.439316749572754, + "learning_rate": 3.7978723404255325e-06, + "loss": 1.3348, + "step": 357 + }, + { + "epoch": 0.09521276595744681, + "grad_norm": 4.795385837554932, + "learning_rate": 3.8085106382978726e-06, + "loss": 1.23, + "step": 358 + }, + { + "epoch": 0.09547872340425533, + "grad_norm": 5.010631084442139, + "learning_rate": 3.819148936170213e-06, + "loss": 1.1724, + "step": 359 + }, + { + "epoch": 0.09574468085106383, + "grad_norm": 5.740480422973633, + "learning_rate": 3.8297872340425535e-06, + "loss": 1.3756, + "step": 360 + }, + { + "epoch": 0.09601063829787235, + "grad_norm": 4.986555099487305, + "learning_rate": 3.840425531914894e-06, + "loss": 1.2722, + "step": 361 + }, + { + "epoch": 0.09627659574468085, + "grad_norm": 5.041133880615234, + "learning_rate": 3.851063829787234e-06, + "loss": 1.0448, + "step": 362 + }, + { + "epoch": 0.09654255319148936, + "grad_norm": 5.378165245056152, + "learning_rate": 3.861702127659575e-06, + "loss": 1.2111, + "step": 363 + }, + { + "epoch": 0.09680851063829787, + "grad_norm": 4.8053059577941895, + "learning_rate": 3.872340425531915e-06, + "loss": 1.1344, + "step": 364 + }, + { + "epoch": 0.09707446808510638, + "grad_norm": 5.25260066986084, + "learning_rate": 3.882978723404256e-06, + "loss": 1.1288, + "step": 365 + }, + { + "epoch": 0.0973404255319149, + "grad_norm": 4.839104175567627, + "learning_rate": 3.893617021276596e-06, + "loss": 1.2131, + "step": 366 + }, + { + "epoch": 0.0976063829787234, + "grad_norm": 5.487301826477051, + "learning_rate": 3.904255319148937e-06, + "loss": 1.1969, + "step": 367 + }, + { + "epoch": 0.09787234042553192, + "grad_norm": 4.733921051025391, + "learning_rate": 3.914893617021277e-06, + "loss": 1.097, + "step": 368 + }, + { + "epoch": 0.09813829787234042, + "grad_norm": 5.042628765106201, + "learning_rate": 3.9255319148936175e-06, + "loss": 1.3554, + "step": 369 + }, + { + "epoch": 0.09840425531914894, + "grad_norm": 6.3879876136779785, + "learning_rate": 3.936170212765958e-06, + "loss": 1.1231, + "step": 370 + }, + { + "epoch": 0.09867021276595744, + "grad_norm": 4.907758712768555, + "learning_rate": 3.946808510638298e-06, + "loss": 1.4223, + "step": 371 + }, + { + "epoch": 0.09893617021276596, + "grad_norm": 4.765664577484131, + "learning_rate": 3.957446808510639e-06, + "loss": 1.2346, + "step": 372 + }, + { + "epoch": 0.09920212765957447, + "grad_norm": 4.949317932128906, + "learning_rate": 3.968085106382979e-06, + "loss": 1.1447, + "step": 373 + }, + { + "epoch": 0.09946808510638298, + "grad_norm": 5.256651878356934, + "learning_rate": 3.97872340425532e-06, + "loss": 1.25, + "step": 374 + }, + { + "epoch": 0.09973404255319149, + "grad_norm": 5.307461261749268, + "learning_rate": 3.98936170212766e-06, + "loss": 1.3373, + "step": 375 + }, + { + "epoch": 0.1, + "grad_norm": 5.324861526489258, + "learning_rate": 4.000000000000001e-06, + "loss": 1.1654, + "step": 376 + }, + { + "epoch": 0.10026595744680851, + "grad_norm": 5.055593013763428, + "learning_rate": 4.010638297872341e-06, + "loss": 1.1508, + "step": 377 + }, + { + "epoch": 0.10053191489361703, + "grad_norm": 4.892101287841797, + "learning_rate": 4.0212765957446816e-06, + "loss": 1.2529, + "step": 378 + }, + { + "epoch": 0.10079787234042553, + "grad_norm": 4.846734523773193, + 
"learning_rate": 4.031914893617022e-06, + "loss": 1.1536, + "step": 379 + }, + { + "epoch": 0.10106382978723404, + "grad_norm": 5.4368462562561035, + "learning_rate": 4.042553191489362e-06, + "loss": 1.1512, + "step": 380 + }, + { + "epoch": 0.10132978723404255, + "grad_norm": 5.102158546447754, + "learning_rate": 4.053191489361702e-06, + "loss": 1.2382, + "step": 381 + }, + { + "epoch": 0.10159574468085106, + "grad_norm": 5.7933030128479, + "learning_rate": 4.0638297872340425e-06, + "loss": 1.4996, + "step": 382 + }, + { + "epoch": 0.10186170212765958, + "grad_norm": 4.7221221923828125, + "learning_rate": 4.074468085106383e-06, + "loss": 1.3471, + "step": 383 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 4.660311222076416, + "learning_rate": 4.085106382978723e-06, + "loss": 1.103, + "step": 384 + }, + { + "epoch": 0.1023936170212766, + "grad_norm": 5.399576663970947, + "learning_rate": 4.095744680851064e-06, + "loss": 1.3684, + "step": 385 + }, + { + "epoch": 0.1026595744680851, + "grad_norm": 4.925390720367432, + "learning_rate": 4.106382978723404e-06, + "loss": 1.2596, + "step": 386 + }, + { + "epoch": 0.10292553191489362, + "grad_norm": 5.198457717895508, + "learning_rate": 4.117021276595745e-06, + "loss": 1.2224, + "step": 387 + }, + { + "epoch": 0.10319148936170212, + "grad_norm": 5.053544044494629, + "learning_rate": 4.127659574468085e-06, + "loss": 1.0447, + "step": 388 + }, + { + "epoch": 0.10345744680851064, + "grad_norm": 5.769658088684082, + "learning_rate": 4.138297872340426e-06, + "loss": 1.4491, + "step": 389 + }, + { + "epoch": 0.10372340425531915, + "grad_norm": 4.969061851501465, + "learning_rate": 4.148936170212766e-06, + "loss": 1.2964, + "step": 390 + }, + { + "epoch": 0.10398936170212766, + "grad_norm": 4.825634479522705, + "learning_rate": 4.1595744680851066e-06, + "loss": 1.1521, + "step": 391 + }, + { + "epoch": 0.10425531914893617, + "grad_norm": 5.240276336669922, + "learning_rate": 4.170212765957447e-06, + "loss": 1.27, + "step": 392 + }, + { + "epoch": 0.10452127659574469, + "grad_norm": 4.926823139190674, + "learning_rate": 4.1808510638297875e-06, + "loss": 1.1428, + "step": 393 + }, + { + "epoch": 0.10478723404255319, + "grad_norm": 5.143110275268555, + "learning_rate": 4.191489361702128e-06, + "loss": 1.2502, + "step": 394 + }, + { + "epoch": 0.10505319148936171, + "grad_norm": 5.7517876625061035, + "learning_rate": 4.202127659574468e-06, + "loss": 1.3353, + "step": 395 + }, + { + "epoch": 0.10531914893617021, + "grad_norm": 5.096099853515625, + "learning_rate": 4.212765957446809e-06, + "loss": 1.2383, + "step": 396 + }, + { + "epoch": 0.10558510638297873, + "grad_norm": 5.0476484298706055, + "learning_rate": 4.223404255319149e-06, + "loss": 1.1639, + "step": 397 + }, + { + "epoch": 0.10585106382978723, + "grad_norm": 5.166505813598633, + "learning_rate": 4.23404255319149e-06, + "loss": 1.327, + "step": 398 + }, + { + "epoch": 0.10611702127659574, + "grad_norm": 5.315145969390869, + "learning_rate": 4.24468085106383e-06, + "loss": 1.2239, + "step": 399 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 5.185245990753174, + "learning_rate": 4.255319148936171e-06, + "loss": 1.3102, + "step": 400 + }, + { + "epoch": 0.10664893617021276, + "grad_norm": 5.77607536315918, + "learning_rate": 4.265957446808511e-06, + "loss": 1.3943, + "step": 401 + }, + { + "epoch": 0.10691489361702128, + "grad_norm": 5.244495391845703, + "learning_rate": 4.2765957446808515e-06, + "loss": 1.2495, + "step": 402 + }, + { + "epoch": 0.10718085106382978, + "grad_norm": 
4.943081378936768, + "learning_rate": 4.287234042553192e-06, + "loss": 1.1773, + "step": 403 + }, + { + "epoch": 0.1074468085106383, + "grad_norm": 4.948064804077148, + "learning_rate": 4.297872340425532e-06, + "loss": 1.2758, + "step": 404 + }, + { + "epoch": 0.1077127659574468, + "grad_norm": 5.133402347564697, + "learning_rate": 4.308510638297873e-06, + "loss": 1.28, + "step": 405 + }, + { + "epoch": 0.10797872340425532, + "grad_norm": 5.113506317138672, + "learning_rate": 4.319148936170213e-06, + "loss": 1.3164, + "step": 406 + }, + { + "epoch": 0.10824468085106383, + "grad_norm": 5.551205635070801, + "learning_rate": 4.329787234042554e-06, + "loss": 1.3766, + "step": 407 + }, + { + "epoch": 0.10851063829787234, + "grad_norm": 5.358046531677246, + "learning_rate": 4.340425531914894e-06, + "loss": 1.3146, + "step": 408 + }, + { + "epoch": 0.10877659574468085, + "grad_norm": 4.947327136993408, + "learning_rate": 4.351063829787235e-06, + "loss": 1.2566, + "step": 409 + }, + { + "epoch": 0.10904255319148937, + "grad_norm": 5.421116828918457, + "learning_rate": 4.361702127659575e-06, + "loss": 1.3041, + "step": 410 + }, + { + "epoch": 0.10930851063829787, + "grad_norm": 5.073742866516113, + "learning_rate": 4.3723404255319156e-06, + "loss": 1.2297, + "step": 411 + }, + { + "epoch": 0.10957446808510639, + "grad_norm": 4.688051700592041, + "learning_rate": 4.382978723404256e-06, + "loss": 1.281, + "step": 412 + }, + { + "epoch": 0.10984042553191489, + "grad_norm": 4.957024097442627, + "learning_rate": 4.3936170212765965e-06, + "loss": 1.2235, + "step": 413 + }, + { + "epoch": 0.11010638297872341, + "grad_norm": 4.920490741729736, + "learning_rate": 4.404255319148937e-06, + "loss": 1.3369, + "step": 414 + }, + { + "epoch": 0.11037234042553191, + "grad_norm": 4.797316551208496, + "learning_rate": 4.414893617021277e-06, + "loss": 1.2144, + "step": 415 + }, + { + "epoch": 0.11063829787234042, + "grad_norm": 5.424980640411377, + "learning_rate": 4.425531914893617e-06, + "loss": 1.3891, + "step": 416 + }, + { + "epoch": 0.11090425531914894, + "grad_norm": 6.654335021972656, + "learning_rate": 4.436170212765957e-06, + "loss": 1.2438, + "step": 417 + }, + { + "epoch": 0.11117021276595744, + "grad_norm": 4.950499057769775, + "learning_rate": 4.446808510638298e-06, + "loss": 1.1873, + "step": 418 + }, + { + "epoch": 0.11143617021276596, + "grad_norm": 4.553642272949219, + "learning_rate": 4.457446808510638e-06, + "loss": 1.1059, + "step": 419 + }, + { + "epoch": 0.11170212765957446, + "grad_norm": 5.221842288970947, + "learning_rate": 4.468085106382979e-06, + "loss": 1.2645, + "step": 420 + }, + { + "epoch": 0.11196808510638298, + "grad_norm": 5.45412015914917, + "learning_rate": 4.478723404255319e-06, + "loss": 1.234, + "step": 421 + }, + { + "epoch": 0.11223404255319148, + "grad_norm": 5.6037750244140625, + "learning_rate": 4.48936170212766e-06, + "loss": 1.2393, + "step": 422 + }, + { + "epoch": 0.1125, + "grad_norm": 6.701963901519775, + "learning_rate": 4.5e-06, + "loss": 1.2275, + "step": 423 + }, + { + "epoch": 0.1127659574468085, + "grad_norm": 5.183774471282959, + "learning_rate": 4.5106382978723406e-06, + "loss": 1.345, + "step": 424 + }, + { + "epoch": 0.11303191489361702, + "grad_norm": 5.005707263946533, + "learning_rate": 4.521276595744681e-06, + "loss": 1.2778, + "step": 425 + }, + { + "epoch": 0.11329787234042553, + "grad_norm": 4.887904644012451, + "learning_rate": 4.5319148936170215e-06, + "loss": 1.2156, + "step": 426 + }, + { + "epoch": 0.11356382978723405, + "grad_norm": 
5.077915191650391, + "learning_rate": 4.542553191489362e-06, + "loss": 1.3213, + "step": 427 + }, + { + "epoch": 0.11382978723404255, + "grad_norm": 5.669859409332275, + "learning_rate": 4.553191489361702e-06, + "loss": 1.2028, + "step": 428 + }, + { + "epoch": 0.11409574468085107, + "grad_norm": 4.871664047241211, + "learning_rate": 4.563829787234043e-06, + "loss": 1.2471, + "step": 429 + }, + { + "epoch": 0.11436170212765957, + "grad_norm": 6.208220958709717, + "learning_rate": 4.574468085106383e-06, + "loss": 1.3042, + "step": 430 + }, + { + "epoch": 0.11462765957446809, + "grad_norm": 5.47734260559082, + "learning_rate": 4.585106382978724e-06, + "loss": 1.1327, + "step": 431 + }, + { + "epoch": 0.1148936170212766, + "grad_norm": 4.876042366027832, + "learning_rate": 4.595744680851064e-06, + "loss": 1.2484, + "step": 432 + }, + { + "epoch": 0.11515957446808511, + "grad_norm": 4.497283458709717, + "learning_rate": 4.606382978723405e-06, + "loss": 1.0734, + "step": 433 + }, + { + "epoch": 0.11542553191489362, + "grad_norm": 5.2405314445495605, + "learning_rate": 4.617021276595745e-06, + "loss": 1.3122, + "step": 434 + }, + { + "epoch": 0.11569148936170212, + "grad_norm": 5.948802947998047, + "learning_rate": 4.6276595744680855e-06, + "loss": 1.2006, + "step": 435 + }, + { + "epoch": 0.11595744680851064, + "grad_norm": 5.318106174468994, + "learning_rate": 4.638297872340426e-06, + "loss": 1.2712, + "step": 436 + }, + { + "epoch": 0.11622340425531914, + "grad_norm": 5.686134338378906, + "learning_rate": 4.648936170212766e-06, + "loss": 1.3471, + "step": 437 + }, + { + "epoch": 0.11648936170212766, + "grad_norm": 5.246779441833496, + "learning_rate": 4.659574468085107e-06, + "loss": 1.2967, + "step": 438 + }, + { + "epoch": 0.11675531914893617, + "grad_norm": 4.675699710845947, + "learning_rate": 4.670212765957447e-06, + "loss": 1.2304, + "step": 439 + }, + { + "epoch": 0.11702127659574468, + "grad_norm": 5.018355846405029, + "learning_rate": 4.680851063829788e-06, + "loss": 1.3061, + "step": 440 + }, + { + "epoch": 0.11728723404255319, + "grad_norm": 5.387866497039795, + "learning_rate": 4.691489361702128e-06, + "loss": 1.3658, + "step": 441 + }, + { + "epoch": 0.1175531914893617, + "grad_norm": 4.927948951721191, + "learning_rate": 4.702127659574469e-06, + "loss": 1.3331, + "step": 442 + }, + { + "epoch": 0.11781914893617021, + "grad_norm": 5.1225738525390625, + "learning_rate": 4.712765957446809e-06, + "loss": 1.1334, + "step": 443 + }, + { + "epoch": 0.11808510638297873, + "grad_norm": 4.9314751625061035, + "learning_rate": 4.7234042553191496e-06, + "loss": 1.2384, + "step": 444 + }, + { + "epoch": 0.11835106382978723, + "grad_norm": 5.148207664489746, + "learning_rate": 4.73404255319149e-06, + "loss": 1.2677, + "step": 445 + }, + { + "epoch": 0.11861702127659575, + "grad_norm": 4.629826068878174, + "learning_rate": 4.7446808510638305e-06, + "loss": 1.2096, + "step": 446 + }, + { + "epoch": 0.11888297872340425, + "grad_norm": 4.850092887878418, + "learning_rate": 4.755319148936171e-06, + "loss": 1.2004, + "step": 447 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 5.228341102600098, + "learning_rate": 4.765957446808511e-06, + "loss": 1.1828, + "step": 448 + }, + { + "epoch": 0.11941489361702128, + "grad_norm": 4.738990306854248, + "learning_rate": 4.776595744680852e-06, + "loss": 1.2557, + "step": 449 + }, + { + "epoch": 0.1196808510638298, + "grad_norm": 4.737931251525879, + "learning_rate": 4.787234042553192e-06, + "loss": 1.1705, + "step": 450 + }, + { + "epoch": 
0.1199468085106383, + "grad_norm": 4.852109432220459, + "learning_rate": 4.797872340425533e-06, + "loss": 1.175, + "step": 451 + }, + { + "epoch": 0.1202127659574468, + "grad_norm": 4.808513641357422, + "learning_rate": 4.808510638297872e-06, + "loss": 1.3285, + "step": 452 + }, + { + "epoch": 0.12047872340425532, + "grad_norm": 5.352870464324951, + "learning_rate": 4.819148936170213e-06, + "loss": 1.2471, + "step": 453 + }, + { + "epoch": 0.12074468085106382, + "grad_norm": 4.533960819244385, + "learning_rate": 4.829787234042553e-06, + "loss": 1.2059, + "step": 454 + }, + { + "epoch": 0.12101063829787234, + "grad_norm": 4.770225524902344, + "learning_rate": 4.840425531914894e-06, + "loss": 1.2049, + "step": 455 + }, + { + "epoch": 0.12127659574468085, + "grad_norm": 5.0733418464660645, + "learning_rate": 4.851063829787234e-06, + "loss": 1.2758, + "step": 456 + }, + { + "epoch": 0.12154255319148936, + "grad_norm": 4.347215175628662, + "learning_rate": 4.8617021276595746e-06, + "loss": 1.1401, + "step": 457 + }, + { + "epoch": 0.12180851063829787, + "grad_norm": 5.329954147338867, + "learning_rate": 4.872340425531915e-06, + "loss": 1.276, + "step": 458 + }, + { + "epoch": 0.12207446808510639, + "grad_norm": 5.255573272705078, + "learning_rate": 4.8829787234042555e-06, + "loss": 1.234, + "step": 459 + }, + { + "epoch": 0.12234042553191489, + "grad_norm": 5.189822196960449, + "learning_rate": 4.893617021276596e-06, + "loss": 1.3676, + "step": 460 + }, + { + "epoch": 0.12260638297872341, + "grad_norm": 5.039921283721924, + "learning_rate": 4.904255319148936e-06, + "loss": 1.3342, + "step": 461 + }, + { + "epoch": 0.12287234042553191, + "grad_norm": 4.65778923034668, + "learning_rate": 4.914893617021277e-06, + "loss": 1.1117, + "step": 462 + }, + { + "epoch": 0.12313829787234043, + "grad_norm": 5.006718635559082, + "learning_rate": 4.925531914893617e-06, + "loss": 1.2543, + "step": 463 + }, + { + "epoch": 0.12340425531914893, + "grad_norm": 5.547107219696045, + "learning_rate": 4.936170212765958e-06, + "loss": 1.2113, + "step": 464 + }, + { + "epoch": 0.12367021276595745, + "grad_norm": 6.148080348968506, + "learning_rate": 4.946808510638298e-06, + "loss": 1.1889, + "step": 465 + }, + { + "epoch": 0.12393617021276596, + "grad_norm": 5.120206832885742, + "learning_rate": 4.957446808510639e-06, + "loss": 1.2198, + "step": 466 + }, + { + "epoch": 0.12420212765957447, + "grad_norm": 5.487342834472656, + "learning_rate": 4.968085106382979e-06, + "loss": 1.2786, + "step": 467 + }, + { + "epoch": 0.12446808510638298, + "grad_norm": 8.382891654968262, + "learning_rate": 4.9787234042553195e-06, + "loss": 1.3757, + "step": 468 + }, + { + "epoch": 0.1247340425531915, + "grad_norm": 5.241554260253906, + "learning_rate": 4.98936170212766e-06, + "loss": 1.3302, + "step": 469 + }, + { + "epoch": 0.125, + "grad_norm": 5.201963901519775, + "learning_rate": 5e-06, + "loss": 1.2948, + "step": 470 + }, + { + "epoch": 0.12526595744680852, + "grad_norm": 5.143476486206055, + "learning_rate": 5.010638297872341e-06, + "loss": 1.2364, + "step": 471 + }, + { + "epoch": 0.125531914893617, + "grad_norm": 4.847978115081787, + "learning_rate": 5.0212765957446805e-06, + "loss": 1.1692, + "step": 472 + }, + { + "epoch": 0.12579787234042553, + "grad_norm": 7.869311809539795, + "learning_rate": 5.031914893617022e-06, + "loss": 1.3719, + "step": 473 + }, + { + "epoch": 0.12606382978723404, + "grad_norm": 5.498979091644287, + "learning_rate": 5.042553191489362e-06, + "loss": 1.3422, + "step": 474 + }, + { + "epoch": 
0.12632978723404256, + "grad_norm": 6.362303256988525, + "learning_rate": 5.053191489361703e-06, + "loss": 1.4323, + "step": 475 + }, + { + "epoch": 0.12659574468085105, + "grad_norm": 5.051971435546875, + "learning_rate": 5.063829787234042e-06, + "loss": 1.1821, + "step": 476 + }, + { + "epoch": 0.12686170212765957, + "grad_norm": 4.8123250007629395, + "learning_rate": 5.0744680851063836e-06, + "loss": 1.2988, + "step": 477 + }, + { + "epoch": 0.1271276595744681, + "grad_norm": 5.487412452697754, + "learning_rate": 5.085106382978724e-06, + "loss": 1.3167, + "step": 478 + }, + { + "epoch": 0.1273936170212766, + "grad_norm": 8.315117835998535, + "learning_rate": 5.0957446808510645e-06, + "loss": 1.192, + "step": 479 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 5.151649475097656, + "learning_rate": 5.106382978723404e-06, + "loss": 1.2499, + "step": 480 + }, + { + "epoch": 0.12792553191489361, + "grad_norm": 5.335565567016602, + "learning_rate": 5.117021276595745e-06, + "loss": 1.2643, + "step": 481 + }, + { + "epoch": 0.12819148936170213, + "grad_norm": 4.590991020202637, + "learning_rate": 5.127659574468086e-06, + "loss": 1.218, + "step": 482 + }, + { + "epoch": 0.12845744680851065, + "grad_norm": 4.4650750160217285, + "learning_rate": 5.138297872340426e-06, + "loss": 1.1962, + "step": 483 + }, + { + "epoch": 0.12872340425531914, + "grad_norm": 4.609473705291748, + "learning_rate": 5.148936170212766e-06, + "loss": 1.476, + "step": 484 + }, + { + "epoch": 0.12898936170212766, + "grad_norm": 4.7010087966918945, + "learning_rate": 5.159574468085107e-06, + "loss": 1.1609, + "step": 485 + }, + { + "epoch": 0.12925531914893618, + "grad_norm": 4.8034257888793945, + "learning_rate": 5.170212765957448e-06, + "loss": 1.3393, + "step": 486 + }, + { + "epoch": 0.1295212765957447, + "grad_norm": 5.149427890777588, + "learning_rate": 5.180851063829788e-06, + "loss": 1.2883, + "step": 487 + }, + { + "epoch": 0.12978723404255318, + "grad_norm": 5.017268657684326, + "learning_rate": 5.191489361702128e-06, + "loss": 1.1178, + "step": 488 + }, + { + "epoch": 0.1300531914893617, + "grad_norm": 4.924554347991943, + "learning_rate": 5.202127659574468e-06, + "loss": 1.3381, + "step": 489 + }, + { + "epoch": 0.13031914893617022, + "grad_norm": 4.674248218536377, + "learning_rate": 5.212765957446809e-06, + "loss": 1.0916, + "step": 490 + }, + { + "epoch": 0.1305851063829787, + "grad_norm": 4.853366851806641, + "learning_rate": 5.223404255319149e-06, + "loss": 1.2784, + "step": 491 + }, + { + "epoch": 0.13085106382978723, + "grad_norm": 5.032970428466797, + "learning_rate": 5.2340425531914895e-06, + "loss": 1.2575, + "step": 492 + }, + { + "epoch": 0.13111702127659575, + "grad_norm": 4.911726474761963, + "learning_rate": 5.24468085106383e-06, + "loss": 1.2049, + "step": 493 + }, + { + "epoch": 0.13138297872340426, + "grad_norm": 5.197798252105713, + "learning_rate": 5.255319148936171e-06, + "loss": 1.3461, + "step": 494 + }, + { + "epoch": 0.13164893617021275, + "grad_norm": 4.873477458953857, + "learning_rate": 5.265957446808511e-06, + "loss": 1.2681, + "step": 495 + }, + { + "epoch": 0.13191489361702127, + "grad_norm": 4.855223178863525, + "learning_rate": 5.276595744680851e-06, + "loss": 1.1849, + "step": 496 + }, + { + "epoch": 0.1321808510638298, + "grad_norm": 5.735394477844238, + "learning_rate": 5.287234042553192e-06, + "loss": 1.2821, + "step": 497 + }, + { + "epoch": 0.1324468085106383, + "grad_norm": 4.7265305519104, + "learning_rate": 5.297872340425533e-06, + "loss": 1.1253, + "step": 498 + }, 
+ { + "epoch": 0.1327127659574468, + "grad_norm": 5.138075351715088, + "learning_rate": 5.308510638297873e-06, + "loss": 1.1951, + "step": 499 + }, + { + "epoch": 0.13297872340425532, + "grad_norm": 4.761940002441406, + "learning_rate": 5.319148936170213e-06, + "loss": 1.4573, + "step": 500 + }, + { + "epoch": 0.13297872340425532, + "eval_loss": 1.276181697845459, + "eval_runtime": 12.4372, + "eval_samples_per_second": 32.162, + "eval_steps_per_second": 4.02, + "step": 500 + }, + { + "epoch": 0.13324468085106383, + "grad_norm": 5.0954132080078125, + "learning_rate": 5.3297872340425535e-06, + "loss": 1.43, + "step": 501 + }, + { + "epoch": 0.13351063829787235, + "grad_norm": 5.592034816741943, + "learning_rate": 5.340425531914895e-06, + "loss": 1.3052, + "step": 502 + }, + { + "epoch": 0.13377659574468084, + "grad_norm": 5.18677282333374, + "learning_rate": 5.351063829787234e-06, + "loss": 1.3141, + "step": 503 + }, + { + "epoch": 0.13404255319148936, + "grad_norm": 5.0918707847595215, + "learning_rate": 5.361702127659575e-06, + "loss": 1.3649, + "step": 504 + }, + { + "epoch": 0.13430851063829788, + "grad_norm": 4.749475002288818, + "learning_rate": 5.372340425531915e-06, + "loss": 1.1692, + "step": 505 + }, + { + "epoch": 0.1345744680851064, + "grad_norm": 4.383024215698242, + "learning_rate": 5.382978723404257e-06, + "loss": 1.3438, + "step": 506 + }, + { + "epoch": 0.1348404255319149, + "grad_norm": 4.863028049468994, + "learning_rate": 5.393617021276596e-06, + "loss": 1.3332, + "step": 507 + }, + { + "epoch": 0.1351063829787234, + "grad_norm": 4.633965492248535, + "learning_rate": 5.404255319148937e-06, + "loss": 1.2012, + "step": 508 + }, + { + "epoch": 0.13537234042553192, + "grad_norm": 5.257637023925781, + "learning_rate": 5.414893617021277e-06, + "loss": 1.3595, + "step": 509 + }, + { + "epoch": 0.1356382978723404, + "grad_norm": 4.795042037963867, + "learning_rate": 5.425531914893617e-06, + "loss": 1.3843, + "step": 510 + }, + { + "epoch": 0.13590425531914893, + "grad_norm": 5.261885643005371, + "learning_rate": 5.436170212765958e-06, + "loss": 1.2708, + "step": 511 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 4.95104455947876, + "learning_rate": 5.4468085106382985e-06, + "loss": 1.2268, + "step": 512 + }, + { + "epoch": 0.13643617021276597, + "grad_norm": 5.171029567718506, + "learning_rate": 5.457446808510639e-06, + "loss": 1.38, + "step": 513 + }, + { + "epoch": 0.13670212765957446, + "grad_norm": 4.671914577484131, + "learning_rate": 5.4680851063829785e-06, + "loss": 1.1485, + "step": 514 + }, + { + "epoch": 0.13696808510638298, + "grad_norm": 4.562173843383789, + "learning_rate": 5.47872340425532e-06, + "loss": 1.3282, + "step": 515 + }, + { + "epoch": 0.1372340425531915, + "grad_norm": 4.870545387268066, + "learning_rate": 5.48936170212766e-06, + "loss": 1.1943, + "step": 516 + }, + { + "epoch": 0.1375, + "grad_norm": 5.231775760650635, + "learning_rate": 5.500000000000001e-06, + "loss": 1.2763, + "step": 517 + }, + { + "epoch": 0.1377659574468085, + "grad_norm": 5.05985689163208, + "learning_rate": 5.51063829787234e-06, + "loss": 1.2018, + "step": 518 + }, + { + "epoch": 0.13803191489361702, + "grad_norm": 4.818659782409668, + "learning_rate": 5.521276595744682e-06, + "loss": 1.2307, + "step": 519 + }, + { + "epoch": 0.13829787234042554, + "grad_norm": 4.803600311279297, + "learning_rate": 5.531914893617022e-06, + "loss": 1.3586, + "step": 520 + }, + { + "epoch": 0.13856382978723406, + "grad_norm": 4.65132999420166, + "learning_rate": 5.5425531914893625e-06, + 
"loss": 1.2147, + "step": 521 + }, + { + "epoch": 0.13882978723404255, + "grad_norm": 4.503746032714844, + "learning_rate": 5.553191489361702e-06, + "loss": 1.2307, + "step": 522 + }, + { + "epoch": 0.13909574468085106, + "grad_norm": 4.557102203369141, + "learning_rate": 5.563829787234043e-06, + "loss": 1.1906, + "step": 523 + }, + { + "epoch": 0.13936170212765958, + "grad_norm": 4.347774028778076, + "learning_rate": 5.574468085106384e-06, + "loss": 1.1632, + "step": 524 + }, + { + "epoch": 0.13962765957446807, + "grad_norm": 4.431983947753906, + "learning_rate": 5.5851063829787235e-06, + "loss": 1.2617, + "step": 525 + }, + { + "epoch": 0.1398936170212766, + "grad_norm": 4.971803665161133, + "learning_rate": 5.595744680851064e-06, + "loss": 1.2581, + "step": 526 + }, + { + "epoch": 0.1401595744680851, + "grad_norm": 4.5451979637146, + "learning_rate": 5.606382978723404e-06, + "loss": 1.3048, + "step": 527 + }, + { + "epoch": 0.14042553191489363, + "grad_norm": 4.687234878540039, + "learning_rate": 5.617021276595746e-06, + "loss": 1.2556, + "step": 528 + }, + { + "epoch": 0.14069148936170212, + "grad_norm": 4.7519378662109375, + "learning_rate": 5.627659574468085e-06, + "loss": 1.2017, + "step": 529 + }, + { + "epoch": 0.14095744680851063, + "grad_norm": 5.454826354980469, + "learning_rate": 5.638297872340426e-06, + "loss": 1.137, + "step": 530 + }, + { + "epoch": 0.14122340425531915, + "grad_norm": 5.442596435546875, + "learning_rate": 5.648936170212766e-06, + "loss": 1.3776, + "step": 531 + }, + { + "epoch": 0.14148936170212767, + "grad_norm": 5.057155132293701, + "learning_rate": 5.6595744680851075e-06, + "loss": 1.4229, + "step": 532 + }, + { + "epoch": 0.14175531914893616, + "grad_norm": 4.806349277496338, + "learning_rate": 5.670212765957447e-06, + "loss": 1.2874, + "step": 533 + }, + { + "epoch": 0.14202127659574468, + "grad_norm": 4.934086322784424, + "learning_rate": 5.6808510638297875e-06, + "loss": 1.3149, + "step": 534 + }, + { + "epoch": 0.1422872340425532, + "grad_norm": 4.371129035949707, + "learning_rate": 5.691489361702128e-06, + "loss": 1.2567, + "step": 535 + }, + { + "epoch": 0.1425531914893617, + "grad_norm": 5.498307228088379, + "learning_rate": 5.702127659574469e-06, + "loss": 1.166, + "step": 536 + }, + { + "epoch": 0.1428191489361702, + "grad_norm": 4.467796802520752, + "learning_rate": 5.712765957446809e-06, + "loss": 1.1359, + "step": 537 + }, + { + "epoch": 0.14308510638297872, + "grad_norm": 4.92448091506958, + "learning_rate": 5.723404255319149e-06, + "loss": 1.2873, + "step": 538 + }, + { + "epoch": 0.14335106382978724, + "grad_norm": 4.561826705932617, + "learning_rate": 5.73404255319149e-06, + "loss": 1.0615, + "step": 539 + }, + { + "epoch": 0.14361702127659576, + "grad_norm": 4.773728370666504, + "learning_rate": 5.744680851063831e-06, + "loss": 1.1718, + "step": 540 + }, + { + "epoch": 0.14388297872340425, + "grad_norm": 4.3747639656066895, + "learning_rate": 5.755319148936171e-06, + "loss": 1.165, + "step": 541 + }, + { + "epoch": 0.14414893617021277, + "grad_norm": 5.261002063751221, + "learning_rate": 5.765957446808511e-06, + "loss": 1.3091, + "step": 542 + }, + { + "epoch": 0.14441489361702128, + "grad_norm": 5.58752965927124, + "learning_rate": 5.7765957446808516e-06, + "loss": 1.2045, + "step": 543 + }, + { + "epoch": 0.14468085106382977, + "grad_norm": 4.371783256530762, + "learning_rate": 5.787234042553191e-06, + "loss": 1.1548, + "step": 544 + }, + { + "epoch": 0.1449468085106383, + "grad_norm": 4.958721160888672, + "learning_rate": 
5.7978723404255325e-06, + "loss": 1.4517, + "step": 545 + }, + { + "epoch": 0.1452127659574468, + "grad_norm": 4.846461296081543, + "learning_rate": 5.808510638297873e-06, + "loss": 1.3224, + "step": 546 + }, + { + "epoch": 0.14547872340425533, + "grad_norm": 5.132719039916992, + "learning_rate": 5.819148936170213e-06, + "loss": 1.1865, + "step": 547 + }, + { + "epoch": 0.14574468085106382, + "grad_norm": 4.791563987731934, + "learning_rate": 5.829787234042553e-06, + "loss": 1.2571, + "step": 548 + }, + { + "epoch": 0.14601063829787234, + "grad_norm": 5.137845039367676, + "learning_rate": 5.840425531914894e-06, + "loss": 1.3008, + "step": 549 + }, + { + "epoch": 0.14627659574468085, + "grad_norm": 4.80680513381958, + "learning_rate": 5.851063829787235e-06, + "loss": 1.243, + "step": 550 + }, + { + "epoch": 0.14654255319148937, + "grad_norm": 4.938924312591553, + "learning_rate": 5.861702127659575e-06, + "loss": 1.3482, + "step": 551 + }, + { + "epoch": 0.14680851063829786, + "grad_norm": 5.239283561706543, + "learning_rate": 5.872340425531915e-06, + "loss": 1.1938, + "step": 552 + }, + { + "epoch": 0.14707446808510638, + "grad_norm": 4.885773658752441, + "learning_rate": 5.882978723404256e-06, + "loss": 1.1257, + "step": 553 + }, + { + "epoch": 0.1473404255319149, + "grad_norm": 5.183603763580322, + "learning_rate": 5.8936170212765965e-06, + "loss": 1.3353, + "step": 554 + }, + { + "epoch": 0.14760638297872342, + "grad_norm": 4.765013694763184, + "learning_rate": 5.904255319148937e-06, + "loss": 1.2058, + "step": 555 + }, + { + "epoch": 0.1478723404255319, + "grad_norm": 5.2760419845581055, + "learning_rate": 5.9148936170212766e-06, + "loss": 1.2109, + "step": 556 + }, + { + "epoch": 0.14813829787234042, + "grad_norm": 5.04670524597168, + "learning_rate": 5.925531914893618e-06, + "loss": 1.3347, + "step": 557 + }, + { + "epoch": 0.14840425531914894, + "grad_norm": 4.968268394470215, + "learning_rate": 5.936170212765958e-06, + "loss": 1.3295, + "step": 558 + }, + { + "epoch": 0.14867021276595746, + "grad_norm": 4.791049480438232, + "learning_rate": 5.946808510638299e-06, + "loss": 1.2116, + "step": 559 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 4.980474948883057, + "learning_rate": 5.957446808510638e-06, + "loss": 1.4063, + "step": 560 + }, + { + "epoch": 0.14920212765957447, + "grad_norm": 4.56986141204834, + "learning_rate": 5.968085106382979e-06, + "loss": 1.2442, + "step": 561 + }, + { + "epoch": 0.149468085106383, + "grad_norm": 4.691464424133301, + "learning_rate": 5.97872340425532e-06, + "loss": 1.2784, + "step": 562 + }, + { + "epoch": 0.14973404255319148, + "grad_norm": 5.040019512176514, + "learning_rate": 5.98936170212766e-06, + "loss": 1.2195, + "step": 563 + }, + { + "epoch": 0.15, + "grad_norm": 5.160355091094971, + "learning_rate": 6e-06, + "loss": 1.4814, + "step": 564 + }, + { + "epoch": 0.1502659574468085, + "grad_norm": 4.696538925170898, + "learning_rate": 6.010638297872341e-06, + "loss": 1.2542, + "step": 565 + }, + { + "epoch": 0.15053191489361703, + "grad_norm": 4.901849269866943, + "learning_rate": 6.021276595744682e-06, + "loss": 1.2633, + "step": 566 + }, + { + "epoch": 0.15079787234042552, + "grad_norm": 4.936095237731934, + "learning_rate": 6.0319148936170215e-06, + "loss": 1.2812, + "step": 567 + }, + { + "epoch": 0.15106382978723404, + "grad_norm": 4.6663055419921875, + "learning_rate": 6.042553191489362e-06, + "loss": 1.3449, + "step": 568 + }, + { + "epoch": 0.15132978723404256, + "grad_norm": 4.95345401763916, + "learning_rate": 
6.053191489361702e-06, + "loss": 1.1968, + "step": 569 + }, + { + "epoch": 0.15159574468085107, + "grad_norm": 4.66139030456543, + "learning_rate": 6.063829787234044e-06, + "loss": 1.1773, + "step": 570 + }, + { + "epoch": 0.15186170212765956, + "grad_norm": 5.310500144958496, + "learning_rate": 6.074468085106383e-06, + "loss": 1.2606, + "step": 571 + }, + { + "epoch": 0.15212765957446808, + "grad_norm": 5.423430442810059, + "learning_rate": 6.085106382978724e-06, + "loss": 1.4334, + "step": 572 + }, + { + "epoch": 0.1523936170212766, + "grad_norm": 5.189186096191406, + "learning_rate": 6.095744680851064e-06, + "loss": 1.2955, + "step": 573 + }, + { + "epoch": 0.15265957446808512, + "grad_norm": 5.515524864196777, + "learning_rate": 6.1063829787234055e-06, + "loss": 1.2777, + "step": 574 + }, + { + "epoch": 0.1529255319148936, + "grad_norm": 4.615379810333252, + "learning_rate": 6.117021276595745e-06, + "loss": 1.2492, + "step": 575 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 4.674113750457764, + "learning_rate": 6.1276595744680855e-06, + "loss": 1.2807, + "step": 576 + }, + { + "epoch": 0.15345744680851064, + "grad_norm": 4.907557487487793, + "learning_rate": 6.138297872340426e-06, + "loss": 1.4288, + "step": 577 + }, + { + "epoch": 0.15372340425531916, + "grad_norm": 4.517690658569336, + "learning_rate": 6.148936170212767e-06, + "loss": 1.2274, + "step": 578 + }, + { + "epoch": 0.15398936170212765, + "grad_norm": 4.350996971130371, + "learning_rate": 6.159574468085107e-06, + "loss": 1.284, + "step": 579 + }, + { + "epoch": 0.15425531914893617, + "grad_norm": 4.552090644836426, + "learning_rate": 6.170212765957447e-06, + "loss": 1.193, + "step": 580 + }, + { + "epoch": 0.1545212765957447, + "grad_norm": 5.3864827156066895, + "learning_rate": 6.180851063829788e-06, + "loss": 1.2869, + "step": 581 + }, + { + "epoch": 0.15478723404255318, + "grad_norm": 4.946741104125977, + "learning_rate": 6.191489361702127e-06, + "loss": 1.1894, + "step": 582 + }, + { + "epoch": 0.1550531914893617, + "grad_norm": 4.652212619781494, + "learning_rate": 6.202127659574469e-06, + "loss": 1.3841, + "step": 583 + }, + { + "epoch": 0.15531914893617021, + "grad_norm": 4.876087188720703, + "learning_rate": 6.212765957446809e-06, + "loss": 1.4244, + "step": 584 + }, + { + "epoch": 0.15558510638297873, + "grad_norm": 4.947083473205566, + "learning_rate": 6.22340425531915e-06, + "loss": 1.3616, + "step": 585 + }, + { + "epoch": 0.15585106382978722, + "grad_norm": 4.663647174835205, + "learning_rate": 6.234042553191489e-06, + "loss": 1.2258, + "step": 586 + }, + { + "epoch": 0.15611702127659574, + "grad_norm": 4.758052825927734, + "learning_rate": 6.2446808510638305e-06, + "loss": 1.1514, + "step": 587 + }, + { + "epoch": 0.15638297872340426, + "grad_norm": 4.887540340423584, + "learning_rate": 6.255319148936171e-06, + "loss": 1.1887, + "step": 588 + }, + { + "epoch": 0.15664893617021278, + "grad_norm": 4.9997477531433105, + "learning_rate": 6.265957446808511e-06, + "loss": 1.2235, + "step": 589 + }, + { + "epoch": 0.15691489361702127, + "grad_norm": 5.29210090637207, + "learning_rate": 6.276595744680851e-06, + "loss": 1.3761, + "step": 590 + }, + { + "epoch": 0.15718085106382979, + "grad_norm": 4.92548942565918, + "learning_rate": 6.287234042553192e-06, + "loss": 1.3848, + "step": 591 + }, + { + "epoch": 0.1574468085106383, + "grad_norm": 5.194962978363037, + "learning_rate": 6.297872340425533e-06, + "loss": 1.4225, + "step": 592 + }, + { + "epoch": 0.15771276595744682, + "grad_norm": 4.7201080322265625, + 
"learning_rate": 6.308510638297873e-06, + "loss": 1.142, + "step": 593 + }, + { + "epoch": 0.1579787234042553, + "grad_norm": 4.397183895111084, + "learning_rate": 6.319148936170213e-06, + "loss": 1.0353, + "step": 594 + }, + { + "epoch": 0.15824468085106383, + "grad_norm": 4.910755157470703, + "learning_rate": 6.329787234042554e-06, + "loss": 1.3927, + "step": 595 + }, + { + "epoch": 0.15851063829787235, + "grad_norm": 4.846840858459473, + "learning_rate": 6.3404255319148945e-06, + "loss": 1.3298, + "step": 596 + }, + { + "epoch": 0.15877659574468084, + "grad_norm": 4.725717067718506, + "learning_rate": 6.351063829787234e-06, + "loss": 1.319, + "step": 597 + }, + { + "epoch": 0.15904255319148936, + "grad_norm": 4.561202049255371, + "learning_rate": 6.361702127659575e-06, + "loss": 1.3586, + "step": 598 + }, + { + "epoch": 0.15930851063829787, + "grad_norm": 5.391122817993164, + "learning_rate": 6.372340425531915e-06, + "loss": 1.2876, + "step": 599 + }, + { + "epoch": 0.1595744680851064, + "grad_norm": 4.996328830718994, + "learning_rate": 6.382978723404256e-06, + "loss": 1.5125, + "step": 600 + }, + { + "epoch": 0.15984042553191488, + "grad_norm": 5.271803855895996, + "learning_rate": 6.393617021276596e-06, + "loss": 1.3858, + "step": 601 + }, + { + "epoch": 0.1601063829787234, + "grad_norm": 4.3907318115234375, + "learning_rate": 6.404255319148936e-06, + "loss": 1.1134, + "step": 602 + }, + { + "epoch": 0.16037234042553192, + "grad_norm": 5.224330902099609, + "learning_rate": 6.414893617021277e-06, + "loss": 1.572, + "step": 603 + }, + { + "epoch": 0.16063829787234044, + "grad_norm": 5.044121742248535, + "learning_rate": 6.425531914893618e-06, + "loss": 1.4531, + "step": 604 + }, + { + "epoch": 0.16090425531914893, + "grad_norm": 4.903571128845215, + "learning_rate": 6.436170212765958e-06, + "loss": 1.2779, + "step": 605 + }, + { + "epoch": 0.16117021276595744, + "grad_norm": 4.621399402618408, + "learning_rate": 6.446808510638298e-06, + "loss": 1.1709, + "step": 606 + }, + { + "epoch": 0.16143617021276596, + "grad_norm": 4.697232723236084, + "learning_rate": 6.457446808510639e-06, + "loss": 1.1601, + "step": 607 + }, + { + "epoch": 0.16170212765957448, + "grad_norm": 5.482996940612793, + "learning_rate": 6.46808510638298e-06, + "loss": 1.401, + "step": 608 + }, + { + "epoch": 0.16196808510638297, + "grad_norm": 4.974328994750977, + "learning_rate": 6.4787234042553195e-06, + "loss": 1.288, + "step": 609 + }, + { + "epoch": 0.1622340425531915, + "grad_norm": 4.7073140144348145, + "learning_rate": 6.48936170212766e-06, + "loss": 1.331, + "step": 610 + }, + { + "epoch": 0.1625, + "grad_norm": 4.540210247039795, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.217, + "step": 611 + }, + { + "epoch": 0.16276595744680852, + "grad_norm": 4.792731285095215, + "learning_rate": 6.510638297872342e-06, + "loss": 1.2696, + "step": 612 + }, + { + "epoch": 0.163031914893617, + "grad_norm": 4.365908622741699, + "learning_rate": 6.521276595744681e-06, + "loss": 1.1104, + "step": 613 + }, + { + "epoch": 0.16329787234042553, + "grad_norm": 4.6623101234436035, + "learning_rate": 6.531914893617022e-06, + "loss": 1.0165, + "step": 614 + }, + { + "epoch": 0.16356382978723405, + "grad_norm": 4.874281883239746, + "learning_rate": 6.542553191489362e-06, + "loss": 1.3418, + "step": 615 + }, + { + "epoch": 0.16382978723404254, + "grad_norm": 5.30225133895874, + "learning_rate": 6.553191489361702e-06, + "loss": 1.2965, + "step": 616 + }, + { + "epoch": 0.16409574468085106, + "grad_norm": 5.1621880531311035, + 
"learning_rate": 6.563829787234043e-06, + "loss": 1.411, + "step": 617 + }, + { + "epoch": 0.16436170212765958, + "grad_norm": 5.011656761169434, + "learning_rate": 6.574468085106384e-06, + "loss": 1.2324, + "step": 618 + }, + { + "epoch": 0.1646276595744681, + "grad_norm": 4.633167743682861, + "learning_rate": 6.585106382978724e-06, + "loss": 1.2498, + "step": 619 + }, + { + "epoch": 0.16489361702127658, + "grad_norm": 4.762227535247803, + "learning_rate": 6.595744680851064e-06, + "loss": 1.3774, + "step": 620 + }, + { + "epoch": 0.1651595744680851, + "grad_norm": 4.581019401550293, + "learning_rate": 6.606382978723405e-06, + "loss": 1.2745, + "step": 621 + }, + { + "epoch": 0.16542553191489362, + "grad_norm": 4.845024585723877, + "learning_rate": 6.617021276595745e-06, + "loss": 1.2003, + "step": 622 + }, + { + "epoch": 0.16569148936170214, + "grad_norm": 4.555243015289307, + "learning_rate": 6.627659574468086e-06, + "loss": 1.265, + "step": 623 + }, + { + "epoch": 0.16595744680851063, + "grad_norm": 4.3719987869262695, + "learning_rate": 6.6382978723404254e-06, + "loss": 1.2131, + "step": 624 + }, + { + "epoch": 0.16622340425531915, + "grad_norm": 4.629434108734131, + "learning_rate": 6.648936170212767e-06, + "loss": 1.3491, + "step": 625 + }, + { + "epoch": 0.16648936170212766, + "grad_norm": 5.0472540855407715, + "learning_rate": 6.659574468085107e-06, + "loss": 1.4119, + "step": 626 + }, + { + "epoch": 0.16675531914893618, + "grad_norm": 4.784181594848633, + "learning_rate": 6.670212765957448e-06, + "loss": 1.3079, + "step": 627 + }, + { + "epoch": 0.16702127659574467, + "grad_norm": 5.000133514404297, + "learning_rate": 6.680851063829787e-06, + "loss": 1.2378, + "step": 628 + }, + { + "epoch": 0.1672872340425532, + "grad_norm": 4.911679267883301, + "learning_rate": 6.6914893617021285e-06, + "loss": 1.1824, + "step": 629 + }, + { + "epoch": 0.1675531914893617, + "grad_norm": 4.674395561218262, + "learning_rate": 6.702127659574469e-06, + "loss": 1.1836, + "step": 630 + }, + { + "epoch": 0.16781914893617023, + "grad_norm": 4.964152812957764, + "learning_rate": 6.7127659574468094e-06, + "loss": 1.2419, + "step": 631 + }, + { + "epoch": 0.16808510638297872, + "grad_norm": 4.766603946685791, + "learning_rate": 6.723404255319149e-06, + "loss": 1.2885, + "step": 632 + }, + { + "epoch": 0.16835106382978723, + "grad_norm": 4.679075241088867, + "learning_rate": 6.7340425531914895e-06, + "loss": 1.279, + "step": 633 + }, + { + "epoch": 0.16861702127659575, + "grad_norm": 4.590879440307617, + "learning_rate": 6.744680851063831e-06, + "loss": 1.2808, + "step": 634 + }, + { + "epoch": 0.16888297872340424, + "grad_norm": 4.539956092834473, + "learning_rate": 6.75531914893617e-06, + "loss": 1.3353, + "step": 635 + }, + { + "epoch": 0.16914893617021276, + "grad_norm": 4.546907424926758, + "learning_rate": 6.765957446808511e-06, + "loss": 1.2691, + "step": 636 + }, + { + "epoch": 0.16941489361702128, + "grad_norm": 4.260477066040039, + "learning_rate": 6.776595744680851e-06, + "loss": 1.313, + "step": 637 + }, + { + "epoch": 0.1696808510638298, + "grad_norm": 4.697219371795654, + "learning_rate": 6.787234042553193e-06, + "loss": 1.131, + "step": 638 + }, + { + "epoch": 0.1699468085106383, + "grad_norm": 4.471210479736328, + "learning_rate": 6.797872340425532e-06, + "loss": 1.1466, + "step": 639 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 5.731024742126465, + "learning_rate": 6.808510638297873e-06, + "loss": 1.1923, + "step": 640 + }, + { + "epoch": 0.17047872340425532, + "grad_norm": 
4.853487491607666, + "learning_rate": 6.819148936170213e-06, + "loss": 1.3019, + "step": 641 + }, + { + "epoch": 0.17074468085106384, + "grad_norm": 4.857687950134277, + "learning_rate": 6.829787234042554e-06, + "loss": 1.382, + "step": 642 + }, + { + "epoch": 0.17101063829787233, + "grad_norm": 5.497145652770996, + "learning_rate": 6.840425531914894e-06, + "loss": 1.2611, + "step": 643 + }, + { + "epoch": 0.17127659574468085, + "grad_norm": 4.852382659912109, + "learning_rate": 6.8510638297872344e-06, + "loss": 1.3002, + "step": 644 + }, + { + "epoch": 0.17154255319148937, + "grad_norm": 4.891834259033203, + "learning_rate": 6.861702127659575e-06, + "loss": 1.3009, + "step": 645 + }, + { + "epoch": 0.17180851063829788, + "grad_norm": 5.264189720153809, + "learning_rate": 6.872340425531916e-06, + "loss": 1.2047, + "step": 646 + }, + { + "epoch": 0.17207446808510637, + "grad_norm": 4.408929347991943, + "learning_rate": 6.882978723404256e-06, + "loss": 1.4105, + "step": 647 + }, + { + "epoch": 0.1723404255319149, + "grad_norm": 4.550996780395508, + "learning_rate": 6.893617021276596e-06, + "loss": 1.4495, + "step": 648 + }, + { + "epoch": 0.1726063829787234, + "grad_norm": 4.704092025756836, + "learning_rate": 6.904255319148937e-06, + "loss": 1.2031, + "step": 649 + }, + { + "epoch": 0.17287234042553193, + "grad_norm": 4.802618026733398, + "learning_rate": 6.914893617021278e-06, + "loss": 1.2879, + "step": 650 + }, + { + "epoch": 0.17313829787234042, + "grad_norm": 4.637843608856201, + "learning_rate": 6.925531914893618e-06, + "loss": 1.2621, + "step": 651 + }, + { + "epoch": 0.17340425531914894, + "grad_norm": 4.558661937713623, + "learning_rate": 6.936170212765958e-06, + "loss": 1.1671, + "step": 652 + }, + { + "epoch": 0.17367021276595745, + "grad_norm": 4.981627464294434, + "learning_rate": 6.9468085106382985e-06, + "loss": 1.2137, + "step": 653 + }, + { + "epoch": 0.17393617021276594, + "grad_norm": 4.708109378814697, + "learning_rate": 6.957446808510638e-06, + "loss": 1.1408, + "step": 654 + }, + { + "epoch": 0.17420212765957446, + "grad_norm": 5.328996658325195, + "learning_rate": 6.968085106382979e-06, + "loss": 1.1697, + "step": 655 + }, + { + "epoch": 0.17446808510638298, + "grad_norm": 4.988645553588867, + "learning_rate": 6.97872340425532e-06, + "loss": 1.2962, + "step": 656 + }, + { + "epoch": 0.1747340425531915, + "grad_norm": 5.570682048797607, + "learning_rate": 6.98936170212766e-06, + "loss": 1.4083, + "step": 657 + }, + { + "epoch": 0.175, + "grad_norm": 5.141003608703613, + "learning_rate": 7e-06, + "loss": 1.2558, + "step": 658 + }, + { + "epoch": 0.1752659574468085, + "grad_norm": 4.548361778259277, + "learning_rate": 7.010638297872341e-06, + "loss": 1.2556, + "step": 659 + }, + { + "epoch": 0.17553191489361702, + "grad_norm": 4.381852149963379, + "learning_rate": 7.021276595744682e-06, + "loss": 1.3609, + "step": 660 + }, + { + "epoch": 0.17579787234042554, + "grad_norm": 4.388241767883301, + "learning_rate": 7.031914893617022e-06, + "loss": 1.2165, + "step": 661 + }, + { + "epoch": 0.17606382978723403, + "grad_norm": 4.472124099731445, + "learning_rate": 7.042553191489362e-06, + "loss": 1.3372, + "step": 662 + }, + { + "epoch": 0.17632978723404255, + "grad_norm": 4.284490585327148, + "learning_rate": 7.053191489361703e-06, + "loss": 1.1206, + "step": 663 + }, + { + "epoch": 0.17659574468085107, + "grad_norm": 4.448127269744873, + "learning_rate": 7.0638297872340434e-06, + "loss": 1.3206, + "step": 664 + }, + { + "epoch": 0.1768617021276596, + "grad_norm": 
4.701923847198486, + "learning_rate": 7.074468085106384e-06, + "loss": 1.1289, + "step": 665 + }, + { + "epoch": 0.17712765957446808, + "grad_norm": 4.249335289001465, + "learning_rate": 7.0851063829787235e-06, + "loss": 1.136, + "step": 666 + }, + { + "epoch": 0.1773936170212766, + "grad_norm": 4.292792320251465, + "learning_rate": 7.095744680851065e-06, + "loss": 1.1827, + "step": 667 + }, + { + "epoch": 0.1776595744680851, + "grad_norm": 4.595381736755371, + "learning_rate": 7.106382978723405e-06, + "loss": 1.1449, + "step": 668 + }, + { + "epoch": 0.1779255319148936, + "grad_norm": 4.856510162353516, + "learning_rate": 7.117021276595745e-06, + "loss": 1.2378, + "step": 669 + }, + { + "epoch": 0.17819148936170212, + "grad_norm": 4.735593318939209, + "learning_rate": 7.127659574468085e-06, + "loss": 1.1641, + "step": 670 + }, + { + "epoch": 0.17845744680851064, + "grad_norm": 4.771074295043945, + "learning_rate": 7.138297872340426e-06, + "loss": 1.33, + "step": 671 + }, + { + "epoch": 0.17872340425531916, + "grad_norm": 4.873645782470703, + "learning_rate": 7.148936170212767e-06, + "loss": 1.3388, + "step": 672 + }, + { + "epoch": 0.17898936170212765, + "grad_norm": 4.672497749328613, + "learning_rate": 7.159574468085107e-06, + "loss": 1.3479, + "step": 673 + }, + { + "epoch": 0.17925531914893617, + "grad_norm": 4.454950332641602, + "learning_rate": 7.170212765957447e-06, + "loss": 1.3631, + "step": 674 + }, + { + "epoch": 0.17952127659574468, + "grad_norm": 5.085921764373779, + "learning_rate": 7.1808510638297875e-06, + "loss": 1.4711, + "step": 675 + }, + { + "epoch": 0.1797872340425532, + "grad_norm": 4.528400421142578, + "learning_rate": 7.191489361702129e-06, + "loss": 1.1868, + "step": 676 + }, + { + "epoch": 0.1800531914893617, + "grad_norm": 4.722430229187012, + "learning_rate": 7.2021276595744684e-06, + "loss": 1.3842, + "step": 677 + }, + { + "epoch": 0.1803191489361702, + "grad_norm": 4.894054889678955, + "learning_rate": 7.212765957446809e-06, + "loss": 1.4365, + "step": 678 + }, + { + "epoch": 0.18058510638297873, + "grad_norm": 4.8365559577941895, + "learning_rate": 7.223404255319149e-06, + "loss": 1.4409, + "step": 679 + }, + { + "epoch": 0.18085106382978725, + "grad_norm": 5.0071916580200195, + "learning_rate": 7.234042553191491e-06, + "loss": 1.214, + "step": 680 + }, + { + "epoch": 0.18111702127659574, + "grad_norm": 4.514876365661621, + "learning_rate": 7.24468085106383e-06, + "loss": 1.1646, + "step": 681 + }, + { + "epoch": 0.18138297872340425, + "grad_norm": 4.465925693511963, + "learning_rate": 7.255319148936171e-06, + "loss": 1.2662, + "step": 682 + }, + { + "epoch": 0.18164893617021277, + "grad_norm": 4.698017120361328, + "learning_rate": 7.265957446808511e-06, + "loss": 1.3683, + "step": 683 + }, + { + "epoch": 0.1819148936170213, + "grad_norm": 4.704659461975098, + "learning_rate": 7.2765957446808524e-06, + "loss": 1.2236, + "step": 684 + }, + { + "epoch": 0.18218085106382978, + "grad_norm": 4.9184675216674805, + "learning_rate": 7.287234042553192e-06, + "loss": 1.1904, + "step": 685 + }, + { + "epoch": 0.1824468085106383, + "grad_norm": 4.5409088134765625, + "learning_rate": 7.2978723404255325e-06, + "loss": 1.2257, + "step": 686 + }, + { + "epoch": 0.18271276595744682, + "grad_norm": 4.9037556648254395, + "learning_rate": 7.308510638297873e-06, + "loss": 1.31, + "step": 687 + }, + { + "epoch": 0.1829787234042553, + "grad_norm": 4.719064235687256, + "learning_rate": 7.3191489361702125e-06, + "loss": 1.2651, + "step": 688 + }, + { + "epoch": 
0.18324468085106382, + "grad_norm": 4.5164971351623535, + "learning_rate": 7.329787234042554e-06, + "loss": 1.306, + "step": 689 + }, + { + "epoch": 0.18351063829787234, + "grad_norm": 4.281124591827393, + "learning_rate": 7.340425531914894e-06, + "loss": 1.1963, + "step": 690 + }, + { + "epoch": 0.18377659574468086, + "grad_norm": 4.6168951988220215, + "learning_rate": 7.351063829787235e-06, + "loss": 1.2118, + "step": 691 + }, + { + "epoch": 0.18404255319148935, + "grad_norm": 4.85908842086792, + "learning_rate": 7.361702127659574e-06, + "loss": 1.2587, + "step": 692 + }, + { + "epoch": 0.18430851063829787, + "grad_norm": 4.3025336265563965, + "learning_rate": 7.372340425531916e-06, + "loss": 1.1239, + "step": 693 + }, + { + "epoch": 0.18457446808510639, + "grad_norm": 4.3702311515808105, + "learning_rate": 7.382978723404256e-06, + "loss": 1.0654, + "step": 694 + }, + { + "epoch": 0.1848404255319149, + "grad_norm": 4.243852615356445, + "learning_rate": 7.3936170212765965e-06, + "loss": 1.2725, + "step": 695 + }, + { + "epoch": 0.1851063829787234, + "grad_norm": 4.241601467132568, + "learning_rate": 7.404255319148936e-06, + "loss": 1.1379, + "step": 696 + }, + { + "epoch": 0.1853723404255319, + "grad_norm": 4.863661766052246, + "learning_rate": 7.4148936170212774e-06, + "loss": 1.2644, + "step": 697 + }, + { + "epoch": 0.18563829787234043, + "grad_norm": 4.637073040008545, + "learning_rate": 7.425531914893618e-06, + "loss": 1.3296, + "step": 698 + }, + { + "epoch": 0.18590425531914895, + "grad_norm": 4.703394889831543, + "learning_rate": 7.436170212765958e-06, + "loss": 1.3016, + "step": 699 + }, + { + "epoch": 0.18617021276595744, + "grad_norm": 4.478874206542969, + "learning_rate": 7.446808510638298e-06, + "loss": 1.3163, + "step": 700 + }, + { + "epoch": 0.18643617021276596, + "grad_norm": 4.600717067718506, + "learning_rate": 7.457446808510639e-06, + "loss": 1.3648, + "step": 701 + }, + { + "epoch": 0.18670212765957447, + "grad_norm": 4.729065418243408, + "learning_rate": 7.46808510638298e-06, + "loss": 1.3604, + "step": 702 + }, + { + "epoch": 0.186968085106383, + "grad_norm": 4.127298831939697, + "learning_rate": 7.47872340425532e-06, + "loss": 1.153, + "step": 703 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 4.612214088439941, + "learning_rate": 7.48936170212766e-06, + "loss": 1.2951, + "step": 704 + }, + { + "epoch": 0.1875, + "grad_norm": 5.011428356170654, + "learning_rate": 7.500000000000001e-06, + "loss": 1.4121, + "step": 705 + }, + { + "epoch": 0.18776595744680852, + "grad_norm": 4.605989933013916, + "learning_rate": 7.5106382978723415e-06, + "loss": 1.262, + "step": 706 + }, + { + "epoch": 0.188031914893617, + "grad_norm": 5.028648853302002, + "learning_rate": 7.521276595744681e-06, + "loss": 1.4181, + "step": 707 + }, + { + "epoch": 0.18829787234042553, + "grad_norm": 4.571159839630127, + "learning_rate": 7.5319148936170215e-06, + "loss": 1.2364, + "step": 708 + }, + { + "epoch": 0.18856382978723404, + "grad_norm": 4.608417510986328, + "learning_rate": 7.542553191489362e-06, + "loss": 1.3094, + "step": 709 + }, + { + "epoch": 0.18882978723404256, + "grad_norm": 4.881725311279297, + "learning_rate": 7.553191489361703e-06, + "loss": 1.313, + "step": 710 + }, + { + "epoch": 0.18909574468085105, + "grad_norm": 4.912058353424072, + "learning_rate": 7.563829787234043e-06, + "loss": 1.392, + "step": 711 + }, + { + "epoch": 0.18936170212765957, + "grad_norm": 4.419525623321533, + "learning_rate": 7.574468085106383e-06, + "loss": 1.2366, + "step": 712 + }, + { + "epoch": 
0.1896276595744681, + "grad_norm": 4.507438659667969, + "learning_rate": 7.585106382978724e-06, + "loss": 1.2404, + "step": 713 + }, + { + "epoch": 0.1898936170212766, + "grad_norm": 4.561898708343506, + "learning_rate": 7.595744680851065e-06, + "loss": 1.3596, + "step": 714 + }, + { + "epoch": 0.1901595744680851, + "grad_norm": 4.635844707489014, + "learning_rate": 7.606382978723405e-06, + "loss": 1.2898, + "step": 715 + }, + { + "epoch": 0.19042553191489361, + "grad_norm": 5.374488353729248, + "learning_rate": 7.617021276595745e-06, + "loss": 1.3445, + "step": 716 + }, + { + "epoch": 0.19069148936170213, + "grad_norm": 4.574670314788818, + "learning_rate": 7.627659574468086e-06, + "loss": 1.2414, + "step": 717 + }, + { + "epoch": 0.19095744680851065, + "grad_norm": 4.509703159332275, + "learning_rate": 7.638297872340426e-06, + "loss": 1.1649, + "step": 718 + }, + { + "epoch": 0.19122340425531914, + "grad_norm": 4.2057929039001465, + "learning_rate": 7.648936170212766e-06, + "loss": 1.3734, + "step": 719 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 4.571545124053955, + "learning_rate": 7.659574468085107e-06, + "loss": 1.2722, + "step": 720 + }, + { + "epoch": 0.19175531914893618, + "grad_norm": 4.561543941497803, + "learning_rate": 7.670212765957448e-06, + "loss": 1.4057, + "step": 721 + }, + { + "epoch": 0.1920212765957447, + "grad_norm": 4.365459442138672, + "learning_rate": 7.680851063829788e-06, + "loss": 1.2348, + "step": 722 + }, + { + "epoch": 0.19228723404255318, + "grad_norm": 4.416993141174316, + "learning_rate": 7.691489361702127e-06, + "loss": 1.3065, + "step": 723 + }, + { + "epoch": 0.1925531914893617, + "grad_norm": 4.762002944946289, + "learning_rate": 7.702127659574469e-06, + "loss": 1.3231, + "step": 724 + }, + { + "epoch": 0.19281914893617022, + "grad_norm": 5.0312604904174805, + "learning_rate": 7.71276595744681e-06, + "loss": 1.3851, + "step": 725 + }, + { + "epoch": 0.1930851063829787, + "grad_norm": 4.8303046226501465, + "learning_rate": 7.72340425531915e-06, + "loss": 1.3391, + "step": 726 + }, + { + "epoch": 0.19335106382978723, + "grad_norm": 5.312425136566162, + "learning_rate": 7.73404255319149e-06, + "loss": 1.3422, + "step": 727 + }, + { + "epoch": 0.19361702127659575, + "grad_norm": 4.574582576751709, + "learning_rate": 7.74468085106383e-06, + "loss": 1.2543, + "step": 728 + }, + { + "epoch": 0.19388297872340426, + "grad_norm": 4.735869884490967, + "learning_rate": 7.755319148936172e-06, + "loss": 1.427, + "step": 729 + }, + { + "epoch": 0.19414893617021275, + "grad_norm": 4.317601203918457, + "learning_rate": 7.765957446808511e-06, + "loss": 1.221, + "step": 730 + }, + { + "epoch": 0.19441489361702127, + "grad_norm": 4.69275426864624, + "learning_rate": 7.776595744680851e-06, + "loss": 1.2186, + "step": 731 + }, + { + "epoch": 0.1946808510638298, + "grad_norm": 4.865464210510254, + "learning_rate": 7.787234042553192e-06, + "loss": 1.3243, + "step": 732 + }, + { + "epoch": 0.1949468085106383, + "grad_norm": 4.288273811340332, + "learning_rate": 7.797872340425534e-06, + "loss": 1.2224, + "step": 733 + }, + { + "epoch": 0.1952127659574468, + "grad_norm": 4.230968475341797, + "learning_rate": 7.808510638297873e-06, + "loss": 1.1869, + "step": 734 + }, + { + "epoch": 0.19547872340425532, + "grad_norm": 5.056215286254883, + "learning_rate": 7.819148936170213e-06, + "loss": 1.2755, + "step": 735 + }, + { + "epoch": 0.19574468085106383, + "grad_norm": 4.373525142669678, + "learning_rate": 7.829787234042554e-06, + "loss": 1.2649, + "step": 736 + }, + { + 
"epoch": 0.19601063829787235, + "grad_norm": 4.4216179847717285, + "learning_rate": 7.840425531914895e-06, + "loss": 1.2578, + "step": 737 + }, + { + "epoch": 0.19627659574468084, + "grad_norm": 4.517039775848389, + "learning_rate": 7.851063829787235e-06, + "loss": 1.1759, + "step": 738 + }, + { + "epoch": 0.19654255319148936, + "grad_norm": 4.973018169403076, + "learning_rate": 7.861702127659575e-06, + "loss": 1.2073, + "step": 739 + }, + { + "epoch": 0.19680851063829788, + "grad_norm": 4.714282035827637, + "learning_rate": 7.872340425531916e-06, + "loss": 1.3551, + "step": 740 + }, + { + "epoch": 0.1970744680851064, + "grad_norm": 4.824267387390137, + "learning_rate": 7.882978723404257e-06, + "loss": 1.287, + "step": 741 + }, + { + "epoch": 0.1973404255319149, + "grad_norm": 4.343824863433838, + "learning_rate": 7.893617021276597e-06, + "loss": 1.1736, + "step": 742 + }, + { + "epoch": 0.1976063829787234, + "grad_norm": 5.130711555480957, + "learning_rate": 7.904255319148936e-06, + "loss": 1.3622, + "step": 743 + }, + { + "epoch": 0.19787234042553192, + "grad_norm": 4.943610191345215, + "learning_rate": 7.914893617021278e-06, + "loss": 1.2538, + "step": 744 + }, + { + "epoch": 0.1981382978723404, + "grad_norm": 4.978169918060303, + "learning_rate": 7.925531914893617e-06, + "loss": 1.2547, + "step": 745 + }, + { + "epoch": 0.19840425531914893, + "grad_norm": 4.933815956115723, + "learning_rate": 7.936170212765959e-06, + "loss": 1.3827, + "step": 746 + }, + { + "epoch": 0.19867021276595745, + "grad_norm": 4.288017272949219, + "learning_rate": 7.946808510638298e-06, + "loss": 1.2695, + "step": 747 + }, + { + "epoch": 0.19893617021276597, + "grad_norm": 4.4305267333984375, + "learning_rate": 7.95744680851064e-06, + "loss": 1.1459, + "step": 748 + }, + { + "epoch": 0.19920212765957446, + "grad_norm": 4.959934711456299, + "learning_rate": 7.968085106382979e-06, + "loss": 1.1793, + "step": 749 + }, + { + "epoch": 0.19946808510638298, + "grad_norm": 4.623016834259033, + "learning_rate": 7.97872340425532e-06, + "loss": 1.2508, + "step": 750 + }, + { + "epoch": 0.1997340425531915, + "grad_norm": 4.426565170288086, + "learning_rate": 7.98936170212766e-06, + "loss": 1.2464, + "step": 751 + }, + { + "epoch": 0.2, + "grad_norm": 4.914389610290527, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2941, + "step": 752 + }, + { + "epoch": 0.2002659574468085, + "grad_norm": 4.474592685699463, + "learning_rate": 8.010638297872341e-06, + "loss": 1.2285, + "step": 753 + }, + { + "epoch": 0.20053191489361702, + "grad_norm": 4.237037181854248, + "learning_rate": 8.021276595744682e-06, + "loss": 1.3422, + "step": 754 + }, + { + "epoch": 0.20079787234042554, + "grad_norm": 4.545922756195068, + "learning_rate": 8.031914893617022e-06, + "loss": 1.2456, + "step": 755 + }, + { + "epoch": 0.20106382978723406, + "grad_norm": 4.951487064361572, + "learning_rate": 8.042553191489363e-06, + "loss": 1.3001, + "step": 756 + }, + { + "epoch": 0.20132978723404255, + "grad_norm": 5.056552886962891, + "learning_rate": 8.053191489361703e-06, + "loss": 1.3875, + "step": 757 + }, + { + "epoch": 0.20159574468085106, + "grad_norm": 4.5373101234436035, + "learning_rate": 8.063829787234044e-06, + "loss": 1.2855, + "step": 758 + }, + { + "epoch": 0.20186170212765958, + "grad_norm": 4.698331832885742, + "learning_rate": 8.074468085106384e-06, + "loss": 1.1841, + "step": 759 + }, + { + "epoch": 0.20212765957446807, + "grad_norm": 4.885603904724121, + "learning_rate": 8.085106382978723e-06, + "loss": 1.2843, + "step": 760 + }, + { + 
"epoch": 0.2023936170212766, + "grad_norm": 4.819825172424316, + "learning_rate": 8.095744680851065e-06, + "loss": 1.2908, + "step": 761 + }, + { + "epoch": 0.2026595744680851, + "grad_norm": 4.332822799682617, + "learning_rate": 8.106382978723404e-06, + "loss": 1.1986, + "step": 762 + }, + { + "epoch": 0.20292553191489363, + "grad_norm": 4.102404594421387, + "learning_rate": 8.117021276595745e-06, + "loss": 1.3478, + "step": 763 + }, + { + "epoch": 0.20319148936170212, + "grad_norm": 4.496637344360352, + "learning_rate": 8.127659574468085e-06, + "loss": 1.265, + "step": 764 + }, + { + "epoch": 0.20345744680851063, + "grad_norm": 4.544750690460205, + "learning_rate": 8.138297872340426e-06, + "loss": 1.2299, + "step": 765 + }, + { + "epoch": 0.20372340425531915, + "grad_norm": 4.774095058441162, + "learning_rate": 8.148936170212766e-06, + "loss": 1.3596, + "step": 766 + }, + { + "epoch": 0.20398936170212767, + "grad_norm": 4.508190155029297, + "learning_rate": 8.159574468085107e-06, + "loss": 1.3143, + "step": 767 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 4.832380771636963, + "learning_rate": 8.170212765957447e-06, + "loss": 1.2449, + "step": 768 + }, + { + "epoch": 0.20452127659574468, + "grad_norm": 4.282026290893555, + "learning_rate": 8.180851063829788e-06, + "loss": 1.199, + "step": 769 + }, + { + "epoch": 0.2047872340425532, + "grad_norm": 4.594806671142578, + "learning_rate": 8.191489361702128e-06, + "loss": 1.2466, + "step": 770 + }, + { + "epoch": 0.2050531914893617, + "grad_norm": 4.925674915313721, + "learning_rate": 8.202127659574469e-06, + "loss": 1.2771, + "step": 771 + }, + { + "epoch": 0.2053191489361702, + "grad_norm": 4.634965419769287, + "learning_rate": 8.212765957446809e-06, + "loss": 1.2511, + "step": 772 + }, + { + "epoch": 0.20558510638297872, + "grad_norm": 4.774378776550293, + "learning_rate": 8.22340425531915e-06, + "loss": 1.1902, + "step": 773 + }, + { + "epoch": 0.20585106382978724, + "grad_norm": 4.943484783172607, + "learning_rate": 8.23404255319149e-06, + "loss": 1.454, + "step": 774 + }, + { + "epoch": 0.20611702127659576, + "grad_norm": 4.800187587738037, + "learning_rate": 8.24468085106383e-06, + "loss": 1.3709, + "step": 775 + }, + { + "epoch": 0.20638297872340425, + "grad_norm": 5.566744327545166, + "learning_rate": 8.25531914893617e-06, + "loss": 1.3158, + "step": 776 + }, + { + "epoch": 0.20664893617021277, + "grad_norm": 4.241647720336914, + "learning_rate": 8.265957446808512e-06, + "loss": 1.3173, + "step": 777 + }, + { + "epoch": 0.20691489361702128, + "grad_norm": 4.561349868774414, + "learning_rate": 8.276595744680851e-06, + "loss": 1.1971, + "step": 778 + }, + { + "epoch": 0.20718085106382977, + "grad_norm": 4.4153828620910645, + "learning_rate": 8.287234042553191e-06, + "loss": 1.2479, + "step": 779 + }, + { + "epoch": 0.2074468085106383, + "grad_norm": 4.6610107421875, + "learning_rate": 8.297872340425532e-06, + "loss": 1.5759, + "step": 780 + }, + { + "epoch": 0.2077127659574468, + "grad_norm": 5.142064094543457, + "learning_rate": 8.308510638297874e-06, + "loss": 1.3802, + "step": 781 + }, + { + "epoch": 0.20797872340425533, + "grad_norm": 4.54619026184082, + "learning_rate": 8.319148936170213e-06, + "loss": 1.3185, + "step": 782 + }, + { + "epoch": 0.20824468085106382, + "grad_norm": 4.640912055969238, + "learning_rate": 8.329787234042553e-06, + "loss": 1.2491, + "step": 783 + }, + { + "epoch": 0.20851063829787234, + "grad_norm": 4.866705894470215, + "learning_rate": 8.340425531914894e-06, + "loss": 1.28, + "step": 784 + }, + 
{ + "epoch": 0.20877659574468085, + "grad_norm": 4.362489700317383, + "learning_rate": 8.351063829787235e-06, + "loss": 1.3603, + "step": 785 + }, + { + "epoch": 0.20904255319148937, + "grad_norm": 4.756308078765869, + "learning_rate": 8.361702127659575e-06, + "loss": 1.4108, + "step": 786 + }, + { + "epoch": 0.20930851063829786, + "grad_norm": 4.564047813415527, + "learning_rate": 8.372340425531915e-06, + "loss": 1.3404, + "step": 787 + }, + { + "epoch": 0.20957446808510638, + "grad_norm": 4.4327921867370605, + "learning_rate": 8.382978723404256e-06, + "loss": 1.2675, + "step": 788 + }, + { + "epoch": 0.2098404255319149, + "grad_norm": 4.656761646270752, + "learning_rate": 8.393617021276597e-06, + "loss": 1.2601, + "step": 789 + }, + { + "epoch": 0.21010638297872342, + "grad_norm": 4.353705883026123, + "learning_rate": 8.404255319148937e-06, + "loss": 1.2144, + "step": 790 + }, + { + "epoch": 0.2103723404255319, + "grad_norm": 4.420286655426025, + "learning_rate": 8.414893617021276e-06, + "loss": 1.249, + "step": 791 + }, + { + "epoch": 0.21063829787234042, + "grad_norm": 4.781008243560791, + "learning_rate": 8.425531914893618e-06, + "loss": 1.3132, + "step": 792 + }, + { + "epoch": 0.21090425531914894, + "grad_norm": 5.137455463409424, + "learning_rate": 8.436170212765959e-06, + "loss": 1.2915, + "step": 793 + }, + { + "epoch": 0.21117021276595746, + "grad_norm": 4.893155097961426, + "learning_rate": 8.446808510638299e-06, + "loss": 1.3679, + "step": 794 + }, + { + "epoch": 0.21143617021276595, + "grad_norm": 4.635669708251953, + "learning_rate": 8.457446808510638e-06, + "loss": 1.3222, + "step": 795 + }, + { + "epoch": 0.21170212765957447, + "grad_norm": 4.853140354156494, + "learning_rate": 8.46808510638298e-06, + "loss": 1.2849, + "step": 796 + }, + { + "epoch": 0.211968085106383, + "grad_norm": 4.836693286895752, + "learning_rate": 8.47872340425532e-06, + "loss": 1.395, + "step": 797 + }, + { + "epoch": 0.21223404255319148, + "grad_norm": 4.493725299835205, + "learning_rate": 8.48936170212766e-06, + "loss": 1.3197, + "step": 798 + }, + { + "epoch": 0.2125, + "grad_norm": 5.088167190551758, + "learning_rate": 8.5e-06, + "loss": 1.4093, + "step": 799 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 4.372249603271484, + "learning_rate": 8.510638297872341e-06, + "loss": 1.3612, + "step": 800 + }, + { + "epoch": 0.21303191489361703, + "grad_norm": 4.2862420082092285, + "learning_rate": 8.521276595744683e-06, + "loss": 1.2227, + "step": 801 + }, + { + "epoch": 0.21329787234042552, + "grad_norm": 4.741192817687988, + "learning_rate": 8.531914893617022e-06, + "loss": 1.2799, + "step": 802 + }, + { + "epoch": 0.21356382978723404, + "grad_norm": 5.022809982299805, + "learning_rate": 8.542553191489362e-06, + "loss": 1.407, + "step": 803 + }, + { + "epoch": 0.21382978723404256, + "grad_norm": 4.443842887878418, + "learning_rate": 8.553191489361703e-06, + "loss": 1.3346, + "step": 804 + }, + { + "epoch": 0.21409574468085107, + "grad_norm": 4.133638858795166, + "learning_rate": 8.563829787234044e-06, + "loss": 1.2443, + "step": 805 + }, + { + "epoch": 0.21436170212765956, + "grad_norm": 4.916075706481934, + "learning_rate": 8.574468085106384e-06, + "loss": 1.3503, + "step": 806 + }, + { + "epoch": 0.21462765957446808, + "grad_norm": 4.634794235229492, + "learning_rate": 8.585106382978724e-06, + "loss": 1.4072, + "step": 807 + }, + { + "epoch": 0.2148936170212766, + "grad_norm": 4.912757396697998, + "learning_rate": 8.595744680851065e-06, + "loss": 1.3311, + "step": 808 + }, + { + "epoch": 
0.21515957446808512, + "grad_norm": 5.202310085296631, + "learning_rate": 8.606382978723406e-06, + "loss": 1.3224, + "step": 809 + }, + { + "epoch": 0.2154255319148936, + "grad_norm": 4.477729320526123, + "learning_rate": 8.617021276595746e-06, + "loss": 1.2806, + "step": 810 + }, + { + "epoch": 0.21569148936170213, + "grad_norm": 4.493345260620117, + "learning_rate": 8.627659574468085e-06, + "loss": 1.0227, + "step": 811 + }, + { + "epoch": 0.21595744680851064, + "grad_norm": 5.053197383880615, + "learning_rate": 8.638297872340427e-06, + "loss": 1.2941, + "step": 812 + }, + { + "epoch": 0.21622340425531916, + "grad_norm": 4.492358684539795, + "learning_rate": 8.648936170212768e-06, + "loss": 1.2651, + "step": 813 + }, + { + "epoch": 0.21648936170212765, + "grad_norm": 4.270611763000488, + "learning_rate": 8.659574468085108e-06, + "loss": 1.2417, + "step": 814 + }, + { + "epoch": 0.21675531914893617, + "grad_norm": 4.236185073852539, + "learning_rate": 8.670212765957447e-06, + "loss": 1.1717, + "step": 815 + }, + { + "epoch": 0.2170212765957447, + "grad_norm": 4.765509128570557, + "learning_rate": 8.680851063829788e-06, + "loss": 1.3134, + "step": 816 + }, + { + "epoch": 0.21728723404255318, + "grad_norm": 5.146259784698486, + "learning_rate": 8.691489361702128e-06, + "loss": 1.4561, + "step": 817 + }, + { + "epoch": 0.2175531914893617, + "grad_norm": 4.461063385009766, + "learning_rate": 8.70212765957447e-06, + "loss": 1.2138, + "step": 818 + }, + { + "epoch": 0.21781914893617021, + "grad_norm": 4.676782608032227, + "learning_rate": 8.712765957446809e-06, + "loss": 1.2614, + "step": 819 + }, + { + "epoch": 0.21808510638297873, + "grad_norm": 4.411204814910889, + "learning_rate": 8.72340425531915e-06, + "loss": 1.3142, + "step": 820 + }, + { + "epoch": 0.21835106382978722, + "grad_norm": 4.208769798278809, + "learning_rate": 8.73404255319149e-06, + "loss": 1.4278, + "step": 821 + }, + { + "epoch": 0.21861702127659574, + "grad_norm": 4.132145404815674, + "learning_rate": 8.744680851063831e-06, + "loss": 1.214, + "step": 822 + }, + { + "epoch": 0.21888297872340426, + "grad_norm": 4.246182441711426, + "learning_rate": 8.75531914893617e-06, + "loss": 1.4079, + "step": 823 + }, + { + "epoch": 0.21914893617021278, + "grad_norm": 4.859819412231445, + "learning_rate": 8.765957446808512e-06, + "loss": 1.2343, + "step": 824 + }, + { + "epoch": 0.21941489361702127, + "grad_norm": 4.722071170806885, + "learning_rate": 8.776595744680852e-06, + "loss": 1.276, + "step": 825 + }, + { + "epoch": 0.21968085106382979, + "grad_norm": 4.489323139190674, + "learning_rate": 8.787234042553193e-06, + "loss": 1.2388, + "step": 826 + }, + { + "epoch": 0.2199468085106383, + "grad_norm": 4.459937572479248, + "learning_rate": 8.797872340425533e-06, + "loss": 1.1911, + "step": 827 + }, + { + "epoch": 0.22021276595744682, + "grad_norm": 4.6483988761901855, + "learning_rate": 8.808510638297874e-06, + "loss": 1.5344, + "step": 828 + }, + { + "epoch": 0.2204787234042553, + "grad_norm": 4.822110176086426, + "learning_rate": 8.819148936170213e-06, + "loss": 1.2885, + "step": 829 + }, + { + "epoch": 0.22074468085106383, + "grad_norm": 4.722024917602539, + "learning_rate": 8.829787234042555e-06, + "loss": 1.2496, + "step": 830 + }, + { + "epoch": 0.22101063829787235, + "grad_norm": 5.146275520324707, + "learning_rate": 8.840425531914894e-06, + "loss": 1.3017, + "step": 831 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 4.489665508270264, + "learning_rate": 8.851063829787234e-06, + "loss": 1.1933, + "step": 832 + }, + { 
+ "epoch": 0.22154255319148936, + "grad_norm": 4.318885803222656, + "learning_rate": 8.861702127659575e-06, + "loss": 1.1849, + "step": 833 + }, + { + "epoch": 0.22180851063829787, + "grad_norm": 4.603454113006592, + "learning_rate": 8.872340425531915e-06, + "loss": 1.3538, + "step": 834 + }, + { + "epoch": 0.2220744680851064, + "grad_norm": 4.531906604766846, + "learning_rate": 8.882978723404256e-06, + "loss": 1.3913, + "step": 835 + }, + { + "epoch": 0.22234042553191488, + "grad_norm": 4.391329288482666, + "learning_rate": 8.893617021276596e-06, + "loss": 1.289, + "step": 836 + }, + { + "epoch": 0.2226063829787234, + "grad_norm": 5.546546459197998, + "learning_rate": 8.904255319148937e-06, + "loss": 1.2507, + "step": 837 + }, + { + "epoch": 0.22287234042553192, + "grad_norm": 4.61740779876709, + "learning_rate": 8.914893617021277e-06, + "loss": 1.3726, + "step": 838 + }, + { + "epoch": 0.22313829787234044, + "grad_norm": 4.953794479370117, + "learning_rate": 8.925531914893618e-06, + "loss": 1.2434, + "step": 839 + }, + { + "epoch": 0.22340425531914893, + "grad_norm": 4.278190612792969, + "learning_rate": 8.936170212765958e-06, + "loss": 1.2559, + "step": 840 + }, + { + "epoch": 0.22367021276595744, + "grad_norm": 4.941532135009766, + "learning_rate": 8.946808510638299e-06, + "loss": 1.3278, + "step": 841 + }, + { + "epoch": 0.22393617021276596, + "grad_norm": 4.883002758026123, + "learning_rate": 8.957446808510638e-06, + "loss": 1.2537, + "step": 842 + }, + { + "epoch": 0.22420212765957448, + "grad_norm": 4.7191619873046875, + "learning_rate": 8.96808510638298e-06, + "loss": 1.2726, + "step": 843 + }, + { + "epoch": 0.22446808510638297, + "grad_norm": 4.509050369262695, + "learning_rate": 8.97872340425532e-06, + "loss": 1.2025, + "step": 844 + }, + { + "epoch": 0.2247340425531915, + "grad_norm": 3.9332523345947266, + "learning_rate": 8.98936170212766e-06, + "loss": 1.1207, + "step": 845 + }, + { + "epoch": 0.225, + "grad_norm": 4.3128204345703125, + "learning_rate": 9e-06, + "loss": 1.2433, + "step": 846 + }, + { + "epoch": 0.22526595744680852, + "grad_norm": 4.253404140472412, + "learning_rate": 9.010638297872342e-06, + "loss": 1.2193, + "step": 847 + }, + { + "epoch": 0.225531914893617, + "grad_norm": 4.779951572418213, + "learning_rate": 9.021276595744681e-06, + "loss": 1.2158, + "step": 848 + }, + { + "epoch": 0.22579787234042553, + "grad_norm": 4.481555461883545, + "learning_rate": 9.031914893617022e-06, + "loss": 1.4551, + "step": 849 + }, + { + "epoch": 0.22606382978723405, + "grad_norm": 4.955724239349365, + "learning_rate": 9.042553191489362e-06, + "loss": 1.4291, + "step": 850 + }, + { + "epoch": 0.22632978723404254, + "grad_norm": 4.106208801269531, + "learning_rate": 9.053191489361702e-06, + "loss": 1.3655, + "step": 851 + }, + { + "epoch": 0.22659574468085106, + "grad_norm": 4.6892499923706055, + "learning_rate": 9.063829787234043e-06, + "loss": 1.2516, + "step": 852 + }, + { + "epoch": 0.22686170212765958, + "grad_norm": 4.553836822509766, + "learning_rate": 9.074468085106384e-06, + "loss": 1.2107, + "step": 853 + }, + { + "epoch": 0.2271276595744681, + "grad_norm": 5.072434902191162, + "learning_rate": 9.085106382978724e-06, + "loss": 1.3445, + "step": 854 + }, + { + "epoch": 0.22739361702127658, + "grad_norm": 4.725018501281738, + "learning_rate": 9.095744680851063e-06, + "loss": 1.2701, + "step": 855 + }, + { + "epoch": 0.2276595744680851, + "grad_norm": 4.630471706390381, + "learning_rate": 9.106382978723405e-06, + "loss": 1.3229, + "step": 856 + }, + { + "epoch": 
0.22792553191489362, + "grad_norm": 4.0610880851745605, + "learning_rate": 9.117021276595746e-06, + "loss": 1.0857, + "step": 857 + }, + { + "epoch": 0.22819148936170214, + "grad_norm": 4.523334503173828, + "learning_rate": 9.127659574468086e-06, + "loss": 1.446, + "step": 858 + }, + { + "epoch": 0.22845744680851063, + "grad_norm": 5.042343616485596, + "learning_rate": 9.138297872340425e-06, + "loss": 1.3728, + "step": 859 + }, + { + "epoch": 0.22872340425531915, + "grad_norm": 4.5774664878845215, + "learning_rate": 9.148936170212767e-06, + "loss": 1.3178, + "step": 860 + }, + { + "epoch": 0.22898936170212766, + "grad_norm": 4.425473213195801, + "learning_rate": 9.159574468085108e-06, + "loss": 1.3412, + "step": 861 + }, + { + "epoch": 0.22925531914893618, + "grad_norm": 4.738778114318848, + "learning_rate": 9.170212765957447e-06, + "loss": 1.3676, + "step": 862 + }, + { + "epoch": 0.22952127659574467, + "grad_norm": 4.462982654571533, + "learning_rate": 9.180851063829787e-06, + "loss": 1.2755, + "step": 863 + }, + { + "epoch": 0.2297872340425532, + "grad_norm": 4.682027816772461, + "learning_rate": 9.191489361702128e-06, + "loss": 1.2625, + "step": 864 + }, + { + "epoch": 0.2300531914893617, + "grad_norm": 4.37489652633667, + "learning_rate": 9.20212765957447e-06, + "loss": 1.291, + "step": 865 + }, + { + "epoch": 0.23031914893617023, + "grad_norm": 4.652685642242432, + "learning_rate": 9.21276595744681e-06, + "loss": 1.1782, + "step": 866 + }, + { + "epoch": 0.23058510638297872, + "grad_norm": 4.401131629943848, + "learning_rate": 9.223404255319149e-06, + "loss": 1.2626, + "step": 867 + }, + { + "epoch": 0.23085106382978723, + "grad_norm": 4.712587356567383, + "learning_rate": 9.23404255319149e-06, + "loss": 1.2888, + "step": 868 + }, + { + "epoch": 0.23111702127659575, + "grad_norm": 4.425190448760986, + "learning_rate": 9.244680851063831e-06, + "loss": 1.2566, + "step": 869 + }, + { + "epoch": 0.23138297872340424, + "grad_norm": 5.040404319763184, + "learning_rate": 9.255319148936171e-06, + "loss": 1.1856, + "step": 870 + }, + { + "epoch": 0.23164893617021276, + "grad_norm": 4.372191905975342, + "learning_rate": 9.26595744680851e-06, + "loss": 1.3153, + "step": 871 + }, + { + "epoch": 0.23191489361702128, + "grad_norm": 4.518852233886719, + "learning_rate": 9.276595744680852e-06, + "loss": 1.2652, + "step": 872 + }, + { + "epoch": 0.2321808510638298, + "grad_norm": 5.675739288330078, + "learning_rate": 9.287234042553193e-06, + "loss": 1.2654, + "step": 873 + }, + { + "epoch": 0.2324468085106383, + "grad_norm": 4.503605842590332, + "learning_rate": 9.297872340425533e-06, + "loss": 1.2693, + "step": 874 + }, + { + "epoch": 0.2327127659574468, + "grad_norm": 4.573145866394043, + "learning_rate": 9.308510638297872e-06, + "loss": 1.3126, + "step": 875 + }, + { + "epoch": 0.23297872340425532, + "grad_norm": 4.833911418914795, + "learning_rate": 9.319148936170214e-06, + "loss": 1.3583, + "step": 876 + }, + { + "epoch": 0.23324468085106384, + "grad_norm": 4.768589496612549, + "learning_rate": 9.329787234042555e-06, + "loss": 1.273, + "step": 877 + }, + { + "epoch": 0.23351063829787233, + "grad_norm": 4.1959638595581055, + "learning_rate": 9.340425531914895e-06, + "loss": 1.1774, + "step": 878 + }, + { + "epoch": 0.23377659574468085, + "grad_norm": 4.231587886810303, + "learning_rate": 9.351063829787234e-06, + "loss": 1.3215, + "step": 879 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 4.725379943847656, + "learning_rate": 9.361702127659576e-06, + "loss": 1.3458, + "step": 880 + }, + { 
+ "epoch": 0.23430851063829788, + "grad_norm": 4.831368446350098, + "learning_rate": 9.372340425531917e-06, + "loss": 1.3499, + "step": 881 + }, + { + "epoch": 0.23457446808510637, + "grad_norm": 4.571084499359131, + "learning_rate": 9.382978723404256e-06, + "loss": 1.2071, + "step": 882 + }, + { + "epoch": 0.2348404255319149, + "grad_norm": 4.676523208618164, + "learning_rate": 9.393617021276596e-06, + "loss": 1.3009, + "step": 883 + }, + { + "epoch": 0.2351063829787234, + "grad_norm": 4.406195640563965, + "learning_rate": 9.404255319148937e-06, + "loss": 1.3127, + "step": 884 + }, + { + "epoch": 0.23537234042553193, + "grad_norm": 4.958892822265625, + "learning_rate": 9.414893617021279e-06, + "loss": 1.3724, + "step": 885 + }, + { + "epoch": 0.23563829787234042, + "grad_norm": 4.296865463256836, + "learning_rate": 9.425531914893618e-06, + "loss": 1.2535, + "step": 886 + }, + { + "epoch": 0.23590425531914894, + "grad_norm": 4.650951862335205, + "learning_rate": 9.436170212765958e-06, + "loss": 1.2432, + "step": 887 + }, + { + "epoch": 0.23617021276595745, + "grad_norm": 4.3874831199646, + "learning_rate": 9.446808510638299e-06, + "loss": 1.4075, + "step": 888 + }, + { + "epoch": 0.23643617021276594, + "grad_norm": 4.246219158172607, + "learning_rate": 9.457446808510639e-06, + "loss": 1.2787, + "step": 889 + }, + { + "epoch": 0.23670212765957446, + "grad_norm": 4.379426956176758, + "learning_rate": 9.46808510638298e-06, + "loss": 1.2586, + "step": 890 + }, + { + "epoch": 0.23696808510638298, + "grad_norm": 4.164050102233887, + "learning_rate": 9.47872340425532e-06, + "loss": 1.3071, + "step": 891 + }, + { + "epoch": 0.2372340425531915, + "grad_norm": 4.572608947753906, + "learning_rate": 9.489361702127661e-06, + "loss": 1.3735, + "step": 892 + }, + { + "epoch": 0.2375, + "grad_norm": 4.812750339508057, + "learning_rate": 9.5e-06, + "loss": 1.3627, + "step": 893 + }, + { + "epoch": 0.2377659574468085, + "grad_norm": 4.5463056564331055, + "learning_rate": 9.510638297872342e-06, + "loss": 1.2688, + "step": 894 + }, + { + "epoch": 0.23803191489361702, + "grad_norm": 4.700718402862549, + "learning_rate": 9.521276595744681e-06, + "loss": 1.3242, + "step": 895 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 4.626996040344238, + "learning_rate": 9.531914893617023e-06, + "loss": 1.3346, + "step": 896 + }, + { + "epoch": 0.23856382978723403, + "grad_norm": 4.4340643882751465, + "learning_rate": 9.542553191489362e-06, + "loss": 1.266, + "step": 897 + }, + { + "epoch": 0.23882978723404255, + "grad_norm": 4.288296222686768, + "learning_rate": 9.553191489361704e-06, + "loss": 1.3097, + "step": 898 + }, + { + "epoch": 0.23909574468085107, + "grad_norm": 4.531320571899414, + "learning_rate": 9.563829787234043e-06, + "loss": 1.2607, + "step": 899 + }, + { + "epoch": 0.2393617021276596, + "grad_norm": 4.4416985511779785, + "learning_rate": 9.574468085106385e-06, + "loss": 1.2443, + "step": 900 + }, + { + "epoch": 0.23962765957446808, + "grad_norm": 4.752575397491455, + "learning_rate": 9.585106382978724e-06, + "loss": 1.263, + "step": 901 + }, + { + "epoch": 0.2398936170212766, + "grad_norm": 4.418696403503418, + "learning_rate": 9.595744680851065e-06, + "loss": 1.4263, + "step": 902 + }, + { + "epoch": 0.2401595744680851, + "grad_norm": 4.149245262145996, + "learning_rate": 9.606382978723405e-06, + "loss": 1.2097, + "step": 903 + }, + { + "epoch": 0.2404255319148936, + "grad_norm": 4.261038303375244, + "learning_rate": 9.617021276595745e-06, + "loss": 1.284, + "step": 904 + }, + { + "epoch": 
0.24069148936170212, + "grad_norm": 4.526815414428711, + "learning_rate": 9.627659574468086e-06, + "loss": 1.2036, + "step": 905 + }, + { + "epoch": 0.24095744680851064, + "grad_norm": 4.194947719573975, + "learning_rate": 9.638297872340426e-06, + "loss": 1.3215, + "step": 906 + }, + { + "epoch": 0.24122340425531916, + "grad_norm": 4.903501987457275, + "learning_rate": 9.648936170212767e-06, + "loss": 1.2824, + "step": 907 + }, + { + "epoch": 0.24148936170212765, + "grad_norm": 4.600060939788818, + "learning_rate": 9.659574468085106e-06, + "loss": 1.3283, + "step": 908 + }, + { + "epoch": 0.24175531914893617, + "grad_norm": 4.43640661239624, + "learning_rate": 9.670212765957448e-06, + "loss": 1.2952, + "step": 909 + }, + { + "epoch": 0.24202127659574468, + "grad_norm": 4.518085479736328, + "learning_rate": 9.680851063829787e-06, + "loss": 1.2436, + "step": 910 + }, + { + "epoch": 0.2422872340425532, + "grad_norm": 4.508195877075195, + "learning_rate": 9.691489361702129e-06, + "loss": 1.448, + "step": 911 + }, + { + "epoch": 0.2425531914893617, + "grad_norm": 4.132392406463623, + "learning_rate": 9.702127659574468e-06, + "loss": 1.2467, + "step": 912 + }, + { + "epoch": 0.2428191489361702, + "grad_norm": 4.272422790527344, + "learning_rate": 9.71276595744681e-06, + "loss": 1.1718, + "step": 913 + }, + { + "epoch": 0.24308510638297873, + "grad_norm": 3.7474145889282227, + "learning_rate": 9.723404255319149e-06, + "loss": 1.2312, + "step": 914 + }, + { + "epoch": 0.24335106382978725, + "grad_norm": 4.318002700805664, + "learning_rate": 9.73404255319149e-06, + "loss": 1.2954, + "step": 915 + }, + { + "epoch": 0.24361702127659574, + "grad_norm": 4.300724506378174, + "learning_rate": 9.74468085106383e-06, + "loss": 1.324, + "step": 916 + }, + { + "epoch": 0.24388297872340425, + "grad_norm": 4.362585067749023, + "learning_rate": 9.755319148936171e-06, + "loss": 1.2939, + "step": 917 + }, + { + "epoch": 0.24414893617021277, + "grad_norm": 4.705591678619385, + "learning_rate": 9.765957446808511e-06, + "loss": 1.3472, + "step": 918 + }, + { + "epoch": 0.2444148936170213, + "grad_norm": 4.612809658050537, + "learning_rate": 9.776595744680852e-06, + "loss": 1.323, + "step": 919 + }, + { + "epoch": 0.24468085106382978, + "grad_norm": 4.289991855621338, + "learning_rate": 9.787234042553192e-06, + "loss": 1.3352, + "step": 920 + }, + { + "epoch": 0.2449468085106383, + "grad_norm": 4.43556022644043, + "learning_rate": 9.797872340425533e-06, + "loss": 1.2358, + "step": 921 + }, + { + "epoch": 0.24521276595744682, + "grad_norm": 4.365429878234863, + "learning_rate": 9.808510638297873e-06, + "loss": 1.3711, + "step": 922 + }, + { + "epoch": 0.2454787234042553, + "grad_norm": 4.680497646331787, + "learning_rate": 9.819148936170212e-06, + "loss": 1.3057, + "step": 923 + }, + { + "epoch": 0.24574468085106382, + "grad_norm": 4.54257869720459, + "learning_rate": 9.829787234042554e-06, + "loss": 1.4173, + "step": 924 + }, + { + "epoch": 0.24601063829787234, + "grad_norm": 4.676888465881348, + "learning_rate": 9.840425531914895e-06, + "loss": 1.386, + "step": 925 + }, + { + "epoch": 0.24627659574468086, + "grad_norm": 4.417918682098389, + "learning_rate": 9.851063829787235e-06, + "loss": 1.4044, + "step": 926 + }, + { + "epoch": 0.24654255319148935, + "grad_norm": 4.195037841796875, + "learning_rate": 9.861702127659574e-06, + "loss": 1.2735, + "step": 927 + }, + { + "epoch": 0.24680851063829787, + "grad_norm": 4.587873935699463, + "learning_rate": 9.872340425531915e-06, + "loss": 1.2647, + "step": 928 + }, + { + 
"epoch": 0.24707446808510639, + "grad_norm": 4.467301845550537, + "learning_rate": 9.882978723404257e-06, + "loss": 1.387, + "step": 929 + }, + { + "epoch": 0.2473404255319149, + "grad_norm": 4.606912136077881, + "learning_rate": 9.893617021276596e-06, + "loss": 1.3188, + "step": 930 + }, + { + "epoch": 0.2476063829787234, + "grad_norm": 4.470932483673096, + "learning_rate": 9.904255319148936e-06, + "loss": 1.3166, + "step": 931 + }, + { + "epoch": 0.2478723404255319, + "grad_norm": 4.317614555358887, + "learning_rate": 9.914893617021277e-06, + "loss": 1.3514, + "step": 932 + }, + { + "epoch": 0.24813829787234043, + "grad_norm": 4.443989276885986, + "learning_rate": 9.925531914893619e-06, + "loss": 1.2636, + "step": 933 + }, + { + "epoch": 0.24840425531914895, + "grad_norm": 4.796088218688965, + "learning_rate": 9.936170212765958e-06, + "loss": 1.2652, + "step": 934 + }, + { + "epoch": 0.24867021276595744, + "grad_norm": 4.967231750488281, + "learning_rate": 9.946808510638298e-06, + "loss": 1.4264, + "step": 935 + }, + { + "epoch": 0.24893617021276596, + "grad_norm": 4.075037002563477, + "learning_rate": 9.957446808510639e-06, + "loss": 1.1912, + "step": 936 + }, + { + "epoch": 0.24920212765957447, + "grad_norm": 4.505919933319092, + "learning_rate": 9.96808510638298e-06, + "loss": 1.3069, + "step": 937 + }, + { + "epoch": 0.249468085106383, + "grad_norm": 4.194151878356934, + "learning_rate": 9.97872340425532e-06, + "loss": 1.3177, + "step": 938 + }, + { + "epoch": 0.24973404255319148, + "grad_norm": 4.591639518737793, + "learning_rate": 9.98936170212766e-06, + "loss": 1.3742, + "step": 939 + }, + { + "epoch": 0.25, + "grad_norm": 4.259275913238525, + "learning_rate": 1e-05, + "loss": 1.2802, + "step": 940 + }, + { + "epoch": 0.2502659574468085, + "grad_norm": 5.042564392089844, + "learning_rate": 9.999999922647056e-06, + "loss": 1.3329, + "step": 941 + }, + { + "epoch": 0.25053191489361704, + "grad_norm": 4.728914737701416, + "learning_rate": 9.999999690588228e-06, + "loss": 1.2498, + "step": 942 + }, + { + "epoch": 0.25079787234042555, + "grad_norm": 4.191166877746582, + "learning_rate": 9.999999303823525e-06, + "loss": 1.3322, + "step": 943 + }, + { + "epoch": 0.251063829787234, + "grad_norm": 4.627315044403076, + "learning_rate": 9.999998762352953e-06, + "loss": 1.4223, + "step": 944 + }, + { + "epoch": 0.25132978723404253, + "grad_norm": 4.210728168487549, + "learning_rate": 9.999998066176536e-06, + "loss": 1.2534, + "step": 945 + }, + { + "epoch": 0.25159574468085105, + "grad_norm": 4.210343837738037, + "learning_rate": 9.99999721529429e-06, + "loss": 1.2587, + "step": 946 + }, + { + "epoch": 0.25186170212765957, + "grad_norm": 4.43513298034668, + "learning_rate": 9.999996209706243e-06, + "loss": 1.2222, + "step": 947 + }, + { + "epoch": 0.2521276595744681, + "grad_norm": 4.577609539031982, + "learning_rate": 9.999995049412428e-06, + "loss": 1.3063, + "step": 948 + }, + { + "epoch": 0.2523936170212766, + "grad_norm": 4.520708084106445, + "learning_rate": 9.99999373441288e-06, + "loss": 1.2357, + "step": 949 + }, + { + "epoch": 0.2526595744680851, + "grad_norm": 4.051931858062744, + "learning_rate": 9.999992264707636e-06, + "loss": 1.265, + "step": 950 + }, + { + "epoch": 0.25292553191489364, + "grad_norm": 4.30267333984375, + "learning_rate": 9.999990640296747e-06, + "loss": 1.1791, + "step": 951 + }, + { + "epoch": 0.2531914893617021, + "grad_norm": 4.397022724151611, + "learning_rate": 9.99998886118026e-06, + "loss": 1.2239, + "step": 952 + }, + { + "epoch": 0.2534574468085106, + 
"grad_norm": 4.552164077758789, + "learning_rate": 9.999986927358231e-06, + "loss": 1.3983, + "step": 953 + }, + { + "epoch": 0.25372340425531914, + "grad_norm": 4.569587707519531, + "learning_rate": 9.999984838830721e-06, + "loss": 1.3307, + "step": 954 + }, + { + "epoch": 0.25398936170212766, + "grad_norm": 4.352025985717773, + "learning_rate": 9.999982595597793e-06, + "loss": 1.3996, + "step": 955 + }, + { + "epoch": 0.2542553191489362, + "grad_norm": 4.358248710632324, + "learning_rate": 9.999980197659515e-06, + "loss": 1.4166, + "step": 956 + }, + { + "epoch": 0.2545212765957447, + "grad_norm": 4.449854373931885, + "learning_rate": 9.999977645015963e-06, + "loss": 1.2414, + "step": 957 + }, + { + "epoch": 0.2547872340425532, + "grad_norm": 4.66248083114624, + "learning_rate": 9.999974937667217e-06, + "loss": 1.2852, + "step": 958 + }, + { + "epoch": 0.2550531914893617, + "grad_norm": 4.217624187469482, + "learning_rate": 9.99997207561336e-06, + "loss": 1.2624, + "step": 959 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 4.449913501739502, + "learning_rate": 9.99996905885448e-06, + "loss": 1.2733, + "step": 960 + }, + { + "epoch": 0.2555851063829787, + "grad_norm": 3.9325287342071533, + "learning_rate": 9.99996588739067e-06, + "loss": 1.2253, + "step": 961 + }, + { + "epoch": 0.25585106382978723, + "grad_norm": 4.425497531890869, + "learning_rate": 9.99996256122203e-06, + "loss": 1.1233, + "step": 962 + }, + { + "epoch": 0.25611702127659575, + "grad_norm": 3.946796178817749, + "learning_rate": 9.99995908034866e-06, + "loss": 1.2961, + "step": 963 + }, + { + "epoch": 0.25638297872340426, + "grad_norm": 4.145402431488037, + "learning_rate": 9.999955444770671e-06, + "loss": 1.3856, + "step": 964 + }, + { + "epoch": 0.2566489361702128, + "grad_norm": 4.4032206535339355, + "learning_rate": 9.99995165448817e-06, + "loss": 1.3649, + "step": 965 + }, + { + "epoch": 0.2569148936170213, + "grad_norm": 4.492345333099365, + "learning_rate": 9.999947709501282e-06, + "loss": 1.2992, + "step": 966 + }, + { + "epoch": 0.25718085106382976, + "grad_norm": 4.298032760620117, + "learning_rate": 9.999943609810125e-06, + "loss": 1.3756, + "step": 967 + }, + { + "epoch": 0.2574468085106383, + "grad_norm": 3.9896862506866455, + "learning_rate": 9.999939355414825e-06, + "loss": 1.2034, + "step": 968 + }, + { + "epoch": 0.2577127659574468, + "grad_norm": 4.537227630615234, + "learning_rate": 9.999934946315516e-06, + "loss": 1.2959, + "step": 969 + }, + { + "epoch": 0.2579787234042553, + "grad_norm": 4.087522029876709, + "learning_rate": 9.999930382512331e-06, + "loss": 1.2928, + "step": 970 + }, + { + "epoch": 0.25824468085106383, + "grad_norm": 4.388976573944092, + "learning_rate": 9.999925664005415e-06, + "loss": 1.2452, + "step": 971 + }, + { + "epoch": 0.25851063829787235, + "grad_norm": 4.264836311340332, + "learning_rate": 9.99992079079491e-06, + "loss": 1.3477, + "step": 972 + }, + { + "epoch": 0.25877659574468087, + "grad_norm": 4.548455715179443, + "learning_rate": 9.999915762880971e-06, + "loss": 1.2818, + "step": 973 + }, + { + "epoch": 0.2590425531914894, + "grad_norm": 4.096053600311279, + "learning_rate": 9.99991058026375e-06, + "loss": 1.1407, + "step": 974 + }, + { + "epoch": 0.25930851063829785, + "grad_norm": 4.8142571449279785, + "learning_rate": 9.99990524294341e-06, + "loss": 1.5322, + "step": 975 + }, + { + "epoch": 0.25957446808510637, + "grad_norm": 4.194404602050781, + "learning_rate": 9.999899750920115e-06, + "loss": 1.2874, + "step": 976 + }, + { + "epoch": 0.2598404255319149, 
+ "grad_norm": 3.905287504196167, + "learning_rate": 9.999894104194037e-06, + "loss": 1.1986, + "step": 977 + }, + { + "epoch": 0.2601063829787234, + "grad_norm": 4.401111602783203, + "learning_rate": 9.999888302765347e-06, + "loss": 1.2148, + "step": 978 + }, + { + "epoch": 0.2603723404255319, + "grad_norm": 4.558286666870117, + "learning_rate": 9.999882346634225e-06, + "loss": 1.247, + "step": 979 + }, + { + "epoch": 0.26063829787234044, + "grad_norm": 3.902086019515991, + "learning_rate": 9.999876235800859e-06, + "loss": 1.3395, + "step": 980 + }, + { + "epoch": 0.26090425531914896, + "grad_norm": 4.327469825744629, + "learning_rate": 9.999869970265434e-06, + "loss": 1.301, + "step": 981 + }, + { + "epoch": 0.2611702127659574, + "grad_norm": 4.4269609451293945, + "learning_rate": 9.999863550028147e-06, + "loss": 1.3436, + "step": 982 + }, + { + "epoch": 0.26143617021276594, + "grad_norm": 4.277595520019531, + "learning_rate": 9.999856975089193e-06, + "loss": 1.3487, + "step": 983 + }, + { + "epoch": 0.26170212765957446, + "grad_norm": 5.5637311935424805, + "learning_rate": 9.99985024544878e-06, + "loss": 1.3848, + "step": 984 + }, + { + "epoch": 0.261968085106383, + "grad_norm": 4.938830852508545, + "learning_rate": 9.999843361107111e-06, + "loss": 1.2637, + "step": 985 + }, + { + "epoch": 0.2622340425531915, + "grad_norm": 4.1854376792907715, + "learning_rate": 9.999836322064404e-06, + "loss": 1.2802, + "step": 986 + }, + { + "epoch": 0.2625, + "grad_norm": 4.120711803436279, + "learning_rate": 9.999829128320873e-06, + "loss": 1.2468, + "step": 987 + }, + { + "epoch": 0.26276595744680853, + "grad_norm": 4.207146167755127, + "learning_rate": 9.999821779876744e-06, + "loss": 1.2662, + "step": 988 + }, + { + "epoch": 0.26303191489361705, + "grad_norm": 4.666594505310059, + "learning_rate": 9.999814276732242e-06, + "loss": 1.3755, + "step": 989 + }, + { + "epoch": 0.2632978723404255, + "grad_norm": 4.344621181488037, + "learning_rate": 9.9998066188876e-06, + "loss": 1.3096, + "step": 990 + }, + { + "epoch": 0.263563829787234, + "grad_norm": 4.433095455169678, + "learning_rate": 9.999798806343055e-06, + "loss": 1.3499, + "step": 991 + }, + { + "epoch": 0.26382978723404255, + "grad_norm": 4.92564058303833, + "learning_rate": 9.999790839098847e-06, + "loss": 1.281, + "step": 992 + }, + { + "epoch": 0.26409574468085106, + "grad_norm": 4.6375603675842285, + "learning_rate": 9.999782717155225e-06, + "loss": 1.3261, + "step": 993 + }, + { + "epoch": 0.2643617021276596, + "grad_norm": 4.372560024261475, + "learning_rate": 9.999774440512438e-06, + "loss": 1.186, + "step": 994 + }, + { + "epoch": 0.2646276595744681, + "grad_norm": 4.910377502441406, + "learning_rate": 9.999766009170743e-06, + "loss": 1.4187, + "step": 995 + }, + { + "epoch": 0.2648936170212766, + "grad_norm": 4.599401473999023, + "learning_rate": 9.999757423130402e-06, + "loss": 1.4278, + "step": 996 + }, + { + "epoch": 0.2651595744680851, + "grad_norm": 4.204658508300781, + "learning_rate": 9.999748682391682e-06, + "loss": 1.3376, + "step": 997 + }, + { + "epoch": 0.2654255319148936, + "grad_norm": 4.476613998413086, + "learning_rate": 9.999739786954849e-06, + "loss": 1.1909, + "step": 998 + }, + { + "epoch": 0.2656914893617021, + "grad_norm": 4.173623561859131, + "learning_rate": 9.999730736820182e-06, + "loss": 1.2678, + "step": 999 + }, + { + "epoch": 0.26595744680851063, + "grad_norm": 4.294970989227295, + "learning_rate": 9.999721531987958e-06, + "loss": 1.224, + "step": 1000 + }, + { + "epoch": 0.26595744680851063, + 
"eval_loss": 1.3182601928710938, + "eval_runtime": 12.5838, + "eval_samples_per_second": 31.787, + "eval_steps_per_second": 3.973, + "step": 1000 + }, + { + "epoch": 0.26622340425531915, + "grad_norm": 4.1402411460876465, + "learning_rate": 9.999712172458462e-06, + "loss": 1.1836, + "step": 1001 + }, + { + "epoch": 0.26648936170212767, + "grad_norm": 5.045607566833496, + "learning_rate": 9.999702658231987e-06, + "loss": 1.2545, + "step": 1002 + }, + { + "epoch": 0.2667553191489362, + "grad_norm": 4.2975921630859375, + "learning_rate": 9.999692989308827e-06, + "loss": 1.4903, + "step": 1003 + }, + { + "epoch": 0.2670212765957447, + "grad_norm": 4.366122245788574, + "learning_rate": 9.999683165689277e-06, + "loss": 1.3197, + "step": 1004 + }, + { + "epoch": 0.26728723404255317, + "grad_norm": 4.20319938659668, + "learning_rate": 9.999673187373644e-06, + "loss": 1.5023, + "step": 1005 + }, + { + "epoch": 0.2675531914893617, + "grad_norm": 4.779364109039307, + "learning_rate": 9.999663054362236e-06, + "loss": 1.4043, + "step": 1006 + }, + { + "epoch": 0.2678191489361702, + "grad_norm": 4.18774938583374, + "learning_rate": 9.999652766655367e-06, + "loss": 1.2043, + "step": 1007 + }, + { + "epoch": 0.2680851063829787, + "grad_norm": 4.277698040008545, + "learning_rate": 9.999642324253357e-06, + "loss": 1.3012, + "step": 1008 + }, + { + "epoch": 0.26835106382978724, + "grad_norm": 4.673196315765381, + "learning_rate": 9.999631727156523e-06, + "loss": 1.4028, + "step": 1009 + }, + { + "epoch": 0.26861702127659576, + "grad_norm": 3.9610633850097656, + "learning_rate": 9.9996209753652e-06, + "loss": 1.2564, + "step": 1010 + }, + { + "epoch": 0.2688829787234043, + "grad_norm": 4.724634170532227, + "learning_rate": 9.999610068879717e-06, + "loss": 1.2371, + "step": 1011 + }, + { + "epoch": 0.2691489361702128, + "grad_norm": 4.770898342132568, + "learning_rate": 9.999599007700411e-06, + "loss": 1.3291, + "step": 1012 + }, + { + "epoch": 0.26941489361702126, + "grad_norm": 4.2460551261901855, + "learning_rate": 9.999587791827627e-06, + "loss": 1.321, + "step": 1013 + }, + { + "epoch": 0.2696808510638298, + "grad_norm": 4.29102897644043, + "learning_rate": 9.99957642126171e-06, + "loss": 1.2469, + "step": 1014 + }, + { + "epoch": 0.2699468085106383, + "grad_norm": 4.516227722167969, + "learning_rate": 9.999564896003013e-06, + "loss": 1.2158, + "step": 1015 + }, + { + "epoch": 0.2702127659574468, + "grad_norm": 4.530557632446289, + "learning_rate": 9.999553216051892e-06, + "loss": 1.3454, + "step": 1016 + }, + { + "epoch": 0.27047872340425533, + "grad_norm": 4.2970290184021, + "learning_rate": 9.999541381408706e-06, + "loss": 1.3784, + "step": 1017 + }, + { + "epoch": 0.27074468085106385, + "grad_norm": 4.136434078216553, + "learning_rate": 9.999529392073825e-06, + "loss": 1.2268, + "step": 1018 + }, + { + "epoch": 0.27101063829787236, + "grad_norm": 4.108096122741699, + "learning_rate": 9.999517248047618e-06, + "loss": 1.2798, + "step": 1019 + }, + { + "epoch": 0.2712765957446808, + "grad_norm": 4.367121696472168, + "learning_rate": 9.99950494933046e-06, + "loss": 1.2629, + "step": 1020 + }, + { + "epoch": 0.27154255319148934, + "grad_norm": 4.400355815887451, + "learning_rate": 9.999492495922735e-06, + "loss": 1.3386, + "step": 1021 + }, + { + "epoch": 0.27180851063829786, + "grad_norm": 4.384739875793457, + "learning_rate": 9.999479887824826e-06, + "loss": 1.2904, + "step": 1022 + }, + { + "epoch": 0.2720744680851064, + "grad_norm": 4.273925304412842, + "learning_rate": 9.999467125037121e-06, + "loss": 
1.268, + "step": 1023 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 4.222406387329102, + "learning_rate": 9.999454207560019e-06, + "loss": 1.2875, + "step": 1024 + }, + { + "epoch": 0.2726063829787234, + "grad_norm": 4.79681396484375, + "learning_rate": 9.999441135393917e-06, + "loss": 1.3315, + "step": 1025 + }, + { + "epoch": 0.27287234042553193, + "grad_norm": 4.473938941955566, + "learning_rate": 9.99942790853922e-06, + "loss": 1.4033, + "step": 1026 + }, + { + "epoch": 0.27313829787234045, + "grad_norm": 4.128412246704102, + "learning_rate": 9.999414526996337e-06, + "loss": 1.1818, + "step": 1027 + }, + { + "epoch": 0.2734042553191489, + "grad_norm": 4.2525739669799805, + "learning_rate": 9.999400990765683e-06, + "loss": 1.2004, + "step": 1028 + }, + { + "epoch": 0.27367021276595743, + "grad_norm": 4.565985202789307, + "learning_rate": 9.999387299847677e-06, + "loss": 1.3035, + "step": 1029 + }, + { + "epoch": 0.27393617021276595, + "grad_norm": 4.308706283569336, + "learning_rate": 9.99937345424274e-06, + "loss": 1.2976, + "step": 1030 + }, + { + "epoch": 0.27420212765957447, + "grad_norm": 4.31046724319458, + "learning_rate": 9.999359453951303e-06, + "loss": 1.3213, + "step": 1031 + }, + { + "epoch": 0.274468085106383, + "grad_norm": 4.618355751037598, + "learning_rate": 9.9993452989738e-06, + "loss": 1.3231, + "step": 1032 + }, + { + "epoch": 0.2747340425531915, + "grad_norm": 4.580687999725342, + "learning_rate": 9.999330989310665e-06, + "loss": 1.3654, + "step": 1033 + }, + { + "epoch": 0.275, + "grad_norm": 4.229262351989746, + "learning_rate": 9.999316524962347e-06, + "loss": 1.2944, + "step": 1034 + }, + { + "epoch": 0.2752659574468085, + "grad_norm": 3.708747148513794, + "learning_rate": 9.999301905929286e-06, + "loss": 1.154, + "step": 1035 + }, + { + "epoch": 0.275531914893617, + "grad_norm": 4.275104999542236, + "learning_rate": 9.999287132211938e-06, + "loss": 1.2148, + "step": 1036 + }, + { + "epoch": 0.2757978723404255, + "grad_norm": 4.225863456726074, + "learning_rate": 9.999272203810763e-06, + "loss": 1.4705, + "step": 1037 + }, + { + "epoch": 0.27606382978723404, + "grad_norm": 4.132633209228516, + "learning_rate": 9.999257120726219e-06, + "loss": 1.2538, + "step": 1038 + }, + { + "epoch": 0.27632978723404256, + "grad_norm": 5.643379211425781, + "learning_rate": 9.999241882958772e-06, + "loss": 1.2564, + "step": 1039 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 4.306319713592529, + "learning_rate": 9.999226490508897e-06, + "loss": 1.4085, + "step": 1040 + }, + { + "epoch": 0.2768617021276596, + "grad_norm": 4.2022247314453125, + "learning_rate": 9.99921094337707e-06, + "loss": 1.3632, + "step": 1041 + }, + { + "epoch": 0.2771276595744681, + "grad_norm": 4.866800785064697, + "learning_rate": 9.999195241563768e-06, + "loss": 1.3262, + "step": 1042 + }, + { + "epoch": 0.2773936170212766, + "grad_norm": 4.111828327178955, + "learning_rate": 9.99917938506948e-06, + "loss": 1.3087, + "step": 1043 + }, + { + "epoch": 0.2776595744680851, + "grad_norm": 4.37149715423584, + "learning_rate": 9.999163373894696e-06, + "loss": 1.2089, + "step": 1044 + }, + { + "epoch": 0.2779255319148936, + "grad_norm": 4.524958610534668, + "learning_rate": 9.999147208039912e-06, + "loss": 1.1935, + "step": 1045 + }, + { + "epoch": 0.2781914893617021, + "grad_norm": 4.5271406173706055, + "learning_rate": 9.999130887505627e-06, + "loss": 1.3111, + "step": 1046 + }, + { + "epoch": 0.27845744680851064, + "grad_norm": 4.4966301918029785, + "learning_rate": 9.999114412292347e-06, + 
"loss": 1.3695, + "step": 1047 + }, + { + "epoch": 0.27872340425531916, + "grad_norm": 4.8100714683532715, + "learning_rate": 9.999097782400582e-06, + "loss": 1.3152, + "step": 1048 + }, + { + "epoch": 0.2789893617021277, + "grad_norm": 4.238595962524414, + "learning_rate": 9.999080997830845e-06, + "loss": 1.2533, + "step": 1049 + }, + { + "epoch": 0.27925531914893614, + "grad_norm": 4.036017417907715, + "learning_rate": 9.999064058583657e-06, + "loss": 1.1984, + "step": 1050 + }, + { + "epoch": 0.27952127659574466, + "grad_norm": 4.587932586669922, + "learning_rate": 9.99904696465954e-06, + "loss": 1.2216, + "step": 1051 + }, + { + "epoch": 0.2797872340425532, + "grad_norm": 5.027749538421631, + "learning_rate": 9.999029716059026e-06, + "loss": 1.4618, + "step": 1052 + }, + { + "epoch": 0.2800531914893617, + "grad_norm": 4.331791400909424, + "learning_rate": 9.999012312782645e-06, + "loss": 1.2566, + "step": 1053 + }, + { + "epoch": 0.2803191489361702, + "grad_norm": 4.737422943115234, + "learning_rate": 9.99899475483094e-06, + "loss": 1.2935, + "step": 1054 + }, + { + "epoch": 0.28058510638297873, + "grad_norm": 4.8805832862854, + "learning_rate": 9.998977042204449e-06, + "loss": 1.3277, + "step": 1055 + }, + { + "epoch": 0.28085106382978725, + "grad_norm": 4.296173095703125, + "learning_rate": 9.998959174903725e-06, + "loss": 1.341, + "step": 1056 + }, + { + "epoch": 0.28111702127659577, + "grad_norm": 4.3713788986206055, + "learning_rate": 9.998941152929316e-06, + "loss": 1.308, + "step": 1057 + }, + { + "epoch": 0.28138297872340423, + "grad_norm": 4.576108932495117, + "learning_rate": 9.998922976281785e-06, + "loss": 1.2585, + "step": 1058 + }, + { + "epoch": 0.28164893617021275, + "grad_norm": 4.187806129455566, + "learning_rate": 9.998904644961689e-06, + "loss": 1.393, + "step": 1059 + }, + { + "epoch": 0.28191489361702127, + "grad_norm": 4.360199928283691, + "learning_rate": 9.9988861589696e-06, + "loss": 1.4, + "step": 1060 + }, + { + "epoch": 0.2821808510638298, + "grad_norm": 4.283745288848877, + "learning_rate": 9.998867518306087e-06, + "loss": 1.2823, + "step": 1061 + }, + { + "epoch": 0.2824468085106383, + "grad_norm": 3.8223369121551514, + "learning_rate": 9.998848722971727e-06, + "loss": 1.3144, + "step": 1062 + }, + { + "epoch": 0.2827127659574468, + "grad_norm": 4.405114650726318, + "learning_rate": 9.998829772967103e-06, + "loss": 1.4051, + "step": 1063 + }, + { + "epoch": 0.28297872340425534, + "grad_norm": 4.547544479370117, + "learning_rate": 9.9988106682928e-06, + "loss": 1.2622, + "step": 1064 + }, + { + "epoch": 0.28324468085106386, + "grad_norm": 3.850954055786133, + "learning_rate": 9.998791408949408e-06, + "loss": 1.197, + "step": 1065 + }, + { + "epoch": 0.2835106382978723, + "grad_norm": 3.994758367538452, + "learning_rate": 9.998771994937528e-06, + "loss": 1.1907, + "step": 1066 + }, + { + "epoch": 0.28377659574468084, + "grad_norm": 4.24208927154541, + "learning_rate": 9.998752426257754e-06, + "loss": 1.4078, + "step": 1067 + }, + { + "epoch": 0.28404255319148936, + "grad_norm": 4.435787200927734, + "learning_rate": 9.998732702910697e-06, + "loss": 1.2044, + "step": 1068 + }, + { + "epoch": 0.2843085106382979, + "grad_norm": 4.169311046600342, + "learning_rate": 9.998712824896963e-06, + "loss": 1.2126, + "step": 1069 + }, + { + "epoch": 0.2845744680851064, + "grad_norm": 4.478437900543213, + "learning_rate": 9.99869279221717e-06, + "loss": 1.3164, + "step": 1070 + }, + { + "epoch": 0.2848404255319149, + "grad_norm": 4.775943756103516, + "learning_rate": 
9.998672604871936e-06, + "loss": 1.3169, + "step": 1071 + }, + { + "epoch": 0.2851063829787234, + "grad_norm": 4.637179374694824, + "learning_rate": 9.998652262861888e-06, + "loss": 1.2441, + "step": 1072 + }, + { + "epoch": 0.2853723404255319, + "grad_norm": 4.511475086212158, + "learning_rate": 9.998631766187651e-06, + "loss": 1.3766, + "step": 1073 + }, + { + "epoch": 0.2856382978723404, + "grad_norm": 4.503199100494385, + "learning_rate": 9.998611114849866e-06, + "loss": 1.1787, + "step": 1074 + }, + { + "epoch": 0.2859042553191489, + "grad_norm": 4.549198627471924, + "learning_rate": 9.998590308849164e-06, + "loss": 1.3229, + "step": 1075 + }, + { + "epoch": 0.28617021276595744, + "grad_norm": 4.182891368865967, + "learning_rate": 9.998569348186194e-06, + "loss": 1.2659, + "step": 1076 + }, + { + "epoch": 0.28643617021276596, + "grad_norm": 4.964444160461426, + "learning_rate": 9.998548232861604e-06, + "loss": 1.4196, + "step": 1077 + }, + { + "epoch": 0.2867021276595745, + "grad_norm": 4.905456066131592, + "learning_rate": 9.998526962876047e-06, + "loss": 1.3089, + "step": 1078 + }, + { + "epoch": 0.286968085106383, + "grad_norm": 4.207391738891602, + "learning_rate": 9.998505538230179e-06, + "loss": 1.3231, + "step": 1079 + }, + { + "epoch": 0.2872340425531915, + "grad_norm": 4.414906024932861, + "learning_rate": 9.998483958924666e-06, + "loss": 1.229, + "step": 1080 + }, + { + "epoch": 0.2875, + "grad_norm": 4.2714667320251465, + "learning_rate": 9.998462224960176e-06, + "loss": 1.4204, + "step": 1081 + }, + { + "epoch": 0.2877659574468085, + "grad_norm": 4.423734188079834, + "learning_rate": 9.998440336337376e-06, + "loss": 1.3774, + "step": 1082 + }, + { + "epoch": 0.288031914893617, + "grad_norm": 4.450468063354492, + "learning_rate": 9.998418293056949e-06, + "loss": 1.2639, + "step": 1083 + }, + { + "epoch": 0.28829787234042553, + "grad_norm": 4.328600883483887, + "learning_rate": 9.998396095119575e-06, + "loss": 1.3594, + "step": 1084 + }, + { + "epoch": 0.28856382978723405, + "grad_norm": 4.951174259185791, + "learning_rate": 9.998373742525941e-06, + "loss": 1.4862, + "step": 1085 + }, + { + "epoch": 0.28882978723404257, + "grad_norm": 4.484705924987793, + "learning_rate": 9.998351235276738e-06, + "loss": 1.3577, + "step": 1086 + }, + { + "epoch": 0.2890957446808511, + "grad_norm": 4.428178310394287, + "learning_rate": 9.998328573372664e-06, + "loss": 1.2438, + "step": 1087 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 4.682640552520752, + "learning_rate": 9.998305756814419e-06, + "loss": 1.3493, + "step": 1088 + }, + { + "epoch": 0.28962765957446807, + "grad_norm": 4.30879020690918, + "learning_rate": 9.998282785602709e-06, + "loss": 1.253, + "step": 1089 + }, + { + "epoch": 0.2898936170212766, + "grad_norm": 4.327608108520508, + "learning_rate": 9.998259659738243e-06, + "loss": 1.3574, + "step": 1090 + }, + { + "epoch": 0.2901595744680851, + "grad_norm": 3.996189594268799, + "learning_rate": 9.998236379221742e-06, + "loss": 1.1811, + "step": 1091 + }, + { + "epoch": 0.2904255319148936, + "grad_norm": 4.262546062469482, + "learning_rate": 9.99821294405392e-06, + "loss": 1.1899, + "step": 1092 + }, + { + "epoch": 0.29069148936170214, + "grad_norm": 3.7779383659362793, + "learning_rate": 9.998189354235506e-06, + "loss": 1.3034, + "step": 1093 + }, + { + "epoch": 0.29095744680851066, + "grad_norm": 4.748449325561523, + "learning_rate": 9.998165609767228e-06, + "loss": 1.1943, + "step": 1094 + }, + { + "epoch": 0.2912234042553192, + "grad_norm": 4.325401782989502, + 
"learning_rate": 9.998141710649822e-06, + "loss": 1.2955, + "step": 1095 + }, + { + "epoch": 0.29148936170212764, + "grad_norm": 4.276817321777344, + "learning_rate": 9.998117656884025e-06, + "loss": 1.2853, + "step": 1096 + }, + { + "epoch": 0.29175531914893615, + "grad_norm": 4.66014289855957, + "learning_rate": 9.998093448470585e-06, + "loss": 1.2643, + "step": 1097 + }, + { + "epoch": 0.29202127659574467, + "grad_norm": 3.963014602661133, + "learning_rate": 9.998069085410249e-06, + "loss": 1.2145, + "step": 1098 + }, + { + "epoch": 0.2922872340425532, + "grad_norm": 4.040323734283447, + "learning_rate": 9.99804456770377e-06, + "loss": 1.3845, + "step": 1099 + }, + { + "epoch": 0.2925531914893617, + "grad_norm": 3.8575801849365234, + "learning_rate": 9.99801989535191e-06, + "loss": 1.131, + "step": 1100 + }, + { + "epoch": 0.2928191489361702, + "grad_norm": 4.067200183868408, + "learning_rate": 9.997995068355428e-06, + "loss": 1.352, + "step": 1101 + }, + { + "epoch": 0.29308510638297874, + "grad_norm": 4.207942962646484, + "learning_rate": 9.997970086715096e-06, + "loss": 1.2372, + "step": 1102 + }, + { + "epoch": 0.2933510638297872, + "grad_norm": 4.058019638061523, + "learning_rate": 9.997944950431684e-06, + "loss": 1.203, + "step": 1103 + }, + { + "epoch": 0.2936170212765957, + "grad_norm": 4.622230052947998, + "learning_rate": 9.99791965950597e-06, + "loss": 1.3916, + "step": 1104 + }, + { + "epoch": 0.29388297872340424, + "grad_norm": 4.3508076667785645, + "learning_rate": 9.997894213938738e-06, + "loss": 1.3344, + "step": 1105 + }, + { + "epoch": 0.29414893617021276, + "grad_norm": 3.9889092445373535, + "learning_rate": 9.997868613730775e-06, + "loss": 1.1658, + "step": 1106 + }, + { + "epoch": 0.2944148936170213, + "grad_norm": 4.091287136077881, + "learning_rate": 9.997842858882873e-06, + "loss": 1.3258, + "step": 1107 + }, + { + "epoch": 0.2946808510638298, + "grad_norm": 4.280172824859619, + "learning_rate": 9.997816949395828e-06, + "loss": 1.3231, + "step": 1108 + }, + { + "epoch": 0.2949468085106383, + "grad_norm": 4.268125057220459, + "learning_rate": 9.997790885270444e-06, + "loss": 1.1984, + "step": 1109 + }, + { + "epoch": 0.29521276595744683, + "grad_norm": 4.030393600463867, + "learning_rate": 9.997764666507523e-06, + "loss": 1.3441, + "step": 1110 + }, + { + "epoch": 0.2954787234042553, + "grad_norm": 4.591287136077881, + "learning_rate": 9.997738293107882e-06, + "loss": 1.3059, + "step": 1111 + }, + { + "epoch": 0.2957446808510638, + "grad_norm": 5.225955486297607, + "learning_rate": 9.997711765072333e-06, + "loss": 1.3236, + "step": 1112 + }, + { + "epoch": 0.29601063829787233, + "grad_norm": 4.161701679229736, + "learning_rate": 9.997685082401698e-06, + "loss": 1.2, + "step": 1113 + }, + { + "epoch": 0.29627659574468085, + "grad_norm": 4.316693305969238, + "learning_rate": 9.997658245096802e-06, + "loss": 1.2758, + "step": 1114 + }, + { + "epoch": 0.29654255319148937, + "grad_norm": 4.311786651611328, + "learning_rate": 9.997631253158477e-06, + "loss": 1.1873, + "step": 1115 + }, + { + "epoch": 0.2968085106382979, + "grad_norm": 4.271190643310547, + "learning_rate": 9.997604106587555e-06, + "loss": 1.1661, + "step": 1116 + }, + { + "epoch": 0.2970744680851064, + "grad_norm": 4.620399475097656, + "learning_rate": 9.99757680538488e-06, + "loss": 1.3542, + "step": 1117 + }, + { + "epoch": 0.2973404255319149, + "grad_norm": 4.287705421447754, + "learning_rate": 9.997549349551295e-06, + "loss": 1.3467, + "step": 1118 + }, + { + "epoch": 0.2976063829787234, + 
"grad_norm": 4.158224105834961, + "learning_rate": 9.997521739087647e-06, + "loss": 1.229, + "step": 1119 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 4.308200836181641, + "learning_rate": 9.997493973994793e-06, + "loss": 1.3478, + "step": 1120 + }, + { + "epoch": 0.2981382978723404, + "grad_norm": 4.467398643493652, + "learning_rate": 9.997466054273593e-06, + "loss": 1.2729, + "step": 1121 + }, + { + "epoch": 0.29840425531914894, + "grad_norm": 4.264455318450928, + "learning_rate": 9.997437979924908e-06, + "loss": 1.234, + "step": 1122 + }, + { + "epoch": 0.29867021276595745, + "grad_norm": 4.258848190307617, + "learning_rate": 9.99740975094961e-06, + "loss": 1.1682, + "step": 1123 + }, + { + "epoch": 0.298936170212766, + "grad_norm": 4.3061089515686035, + "learning_rate": 9.99738136734857e-06, + "loss": 1.3241, + "step": 1124 + }, + { + "epoch": 0.2992021276595745, + "grad_norm": 4.324080467224121, + "learning_rate": 9.997352829122667e-06, + "loss": 1.254, + "step": 1125 + }, + { + "epoch": 0.29946808510638295, + "grad_norm": 4.312755584716797, + "learning_rate": 9.997324136272784e-06, + "loss": 1.309, + "step": 1126 + }, + { + "epoch": 0.29973404255319147, + "grad_norm": 4.023726463317871, + "learning_rate": 9.997295288799806e-06, + "loss": 1.238, + "step": 1127 + }, + { + "epoch": 0.3, + "grad_norm": 4.355762004852295, + "learning_rate": 9.99726628670463e-06, + "loss": 1.2271, + "step": 1128 + }, + { + "epoch": 0.3002659574468085, + "grad_norm": 4.85224723815918, + "learning_rate": 9.997237129988154e-06, + "loss": 1.2849, + "step": 1129 + }, + { + "epoch": 0.300531914893617, + "grad_norm": 4.464909553527832, + "learning_rate": 9.997207818651273e-06, + "loss": 1.2992, + "step": 1130 + }, + { + "epoch": 0.30079787234042554, + "grad_norm": 3.7525863647460938, + "learning_rate": 9.997178352694902e-06, + "loss": 1.1764, + "step": 1131 + }, + { + "epoch": 0.30106382978723406, + "grad_norm": 4.892136096954346, + "learning_rate": 9.997148732119947e-06, + "loss": 1.4041, + "step": 1132 + }, + { + "epoch": 0.3013297872340426, + "grad_norm": 3.8774726390838623, + "learning_rate": 9.99711895692733e-06, + "loss": 1.1936, + "step": 1133 + }, + { + "epoch": 0.30159574468085104, + "grad_norm": 4.585043907165527, + "learning_rate": 9.997089027117966e-06, + "loss": 1.2402, + "step": 1134 + }, + { + "epoch": 0.30186170212765956, + "grad_norm": 4.731383800506592, + "learning_rate": 9.997058942692786e-06, + "loss": 1.3886, + "step": 1135 + }, + { + "epoch": 0.3021276595744681, + "grad_norm": 4.4259033203125, + "learning_rate": 9.997028703652718e-06, + "loss": 1.4784, + "step": 1136 + }, + { + "epoch": 0.3023936170212766, + "grad_norm": 4.584959030151367, + "learning_rate": 9.996998309998699e-06, + "loss": 1.1575, + "step": 1137 + }, + { + "epoch": 0.3026595744680851, + "grad_norm": 4.300727844238281, + "learning_rate": 9.996967761731668e-06, + "loss": 1.3999, + "step": 1138 + }, + { + "epoch": 0.30292553191489363, + "grad_norm": 4.30328893661499, + "learning_rate": 9.996937058852575e-06, + "loss": 1.3061, + "step": 1139 + }, + { + "epoch": 0.30319148936170215, + "grad_norm": 4.1981964111328125, + "learning_rate": 9.996906201362361e-06, + "loss": 1.3078, + "step": 1140 + }, + { + "epoch": 0.3034574468085106, + "grad_norm": 4.507598876953125, + "learning_rate": 9.99687518926199e-06, + "loss": 1.3732, + "step": 1141 + }, + { + "epoch": 0.30372340425531913, + "grad_norm": 4.559037685394287, + "learning_rate": 9.996844022552416e-06, + "loss": 1.3447, + "step": 1142 + }, + { + "epoch": 
0.30398936170212765, + "grad_norm": 4.10542106628418, + "learning_rate": 9.996812701234604e-06, + "loss": 1.2118, + "step": 1143 + }, + { + "epoch": 0.30425531914893617, + "grad_norm": 4.441193103790283, + "learning_rate": 9.996781225309526e-06, + "loss": 1.3549, + "step": 1144 + }, + { + "epoch": 0.3045212765957447, + "grad_norm": 4.166191577911377, + "learning_rate": 9.996749594778153e-06, + "loss": 1.3067, + "step": 1145 + }, + { + "epoch": 0.3047872340425532, + "grad_norm": 4.284362316131592, + "learning_rate": 9.996717809641464e-06, + "loss": 1.31, + "step": 1146 + }, + { + "epoch": 0.3050531914893617, + "grad_norm": 4.457339286804199, + "learning_rate": 9.996685869900444e-06, + "loss": 1.2858, + "step": 1147 + }, + { + "epoch": 0.30531914893617024, + "grad_norm": 5.572897434234619, + "learning_rate": 9.99665377555608e-06, + "loss": 1.3094, + "step": 1148 + }, + { + "epoch": 0.3055851063829787, + "grad_norm": 3.9291319847106934, + "learning_rate": 9.996621526609364e-06, + "loss": 1.1499, + "step": 1149 + }, + { + "epoch": 0.3058510638297872, + "grad_norm": 4.23716926574707, + "learning_rate": 9.996589123061297e-06, + "loss": 1.1395, + "step": 1150 + }, + { + "epoch": 0.30611702127659574, + "grad_norm": 4.1819047927856445, + "learning_rate": 9.99655656491288e-06, + "loss": 1.2152, + "step": 1151 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 4.467685222625732, + "learning_rate": 9.99652385216512e-06, + "loss": 1.38, + "step": 1152 + }, + { + "epoch": 0.30664893617021277, + "grad_norm": 3.723454236984253, + "learning_rate": 9.996490984819027e-06, + "loss": 1.1745, + "step": 1153 + }, + { + "epoch": 0.3069148936170213, + "grad_norm": 4.097151756286621, + "learning_rate": 9.996457962875623e-06, + "loss": 1.3743, + "step": 1154 + }, + { + "epoch": 0.3071808510638298, + "grad_norm": 4.7414326667785645, + "learning_rate": 9.996424786335925e-06, + "loss": 1.4252, + "step": 1155 + }, + { + "epoch": 0.3074468085106383, + "grad_norm": 3.7857699394226074, + "learning_rate": 9.996391455200963e-06, + "loss": 1.2984, + "step": 1156 + }, + { + "epoch": 0.3077127659574468, + "grad_norm": 4.953484535217285, + "learning_rate": 9.996357969471767e-06, + "loss": 1.3539, + "step": 1157 + }, + { + "epoch": 0.3079787234042553, + "grad_norm": 4.564802646636963, + "learning_rate": 9.996324329149372e-06, + "loss": 1.2833, + "step": 1158 + }, + { + "epoch": 0.3082446808510638, + "grad_norm": 4.2867045402526855, + "learning_rate": 9.99629053423482e-06, + "loss": 1.2933, + "step": 1159 + }, + { + "epoch": 0.30851063829787234, + "grad_norm": 4.2070817947387695, + "learning_rate": 9.996256584729157e-06, + "loss": 1.163, + "step": 1160 + }, + { + "epoch": 0.30877659574468086, + "grad_norm": 4.603311061859131, + "learning_rate": 9.996222480633433e-06, + "loss": 1.2404, + "step": 1161 + }, + { + "epoch": 0.3090425531914894, + "grad_norm": 4.443660736083984, + "learning_rate": 9.996188221948702e-06, + "loss": 1.3518, + "step": 1162 + }, + { + "epoch": 0.3093085106382979, + "grad_norm": 4.2897443771362305, + "learning_rate": 9.996153808676025e-06, + "loss": 1.2786, + "step": 1163 + }, + { + "epoch": 0.30957446808510636, + "grad_norm": 4.69590425491333, + "learning_rate": 9.996119240816469e-06, + "loss": 1.3259, + "step": 1164 + }, + { + "epoch": 0.3098404255319149, + "grad_norm": 4.064958095550537, + "learning_rate": 9.996084518371101e-06, + "loss": 1.2768, + "step": 1165 + }, + { + "epoch": 0.3101063829787234, + "grad_norm": 4.3534626960754395, + "learning_rate": 9.996049641340994e-06, + "loss": 1.3245, + 
"step": 1166 + }, + { + "epoch": 0.3103723404255319, + "grad_norm": 4.278623580932617, + "learning_rate": 9.996014609727232e-06, + "loss": 1.405, + "step": 1167 + }, + { + "epoch": 0.31063829787234043, + "grad_norm": 4.835923671722412, + "learning_rate": 9.995979423530893e-06, + "loss": 1.2416, + "step": 1168 + }, + { + "epoch": 0.31090425531914895, + "grad_norm": 4.191746711730957, + "learning_rate": 9.99594408275307e-06, + "loss": 1.154, + "step": 1169 + }, + { + "epoch": 0.31117021276595747, + "grad_norm": 3.9082558155059814, + "learning_rate": 9.995908587394854e-06, + "loss": 1.2412, + "step": 1170 + }, + { + "epoch": 0.311436170212766, + "grad_norm": 4.342267036437988, + "learning_rate": 9.995872937457345e-06, + "loss": 1.2312, + "step": 1171 + }, + { + "epoch": 0.31170212765957445, + "grad_norm": 4.569537162780762, + "learning_rate": 9.995837132941646e-06, + "loss": 1.3551, + "step": 1172 + }, + { + "epoch": 0.31196808510638296, + "grad_norm": 4.246980667114258, + "learning_rate": 9.995801173848863e-06, + "loss": 1.2517, + "step": 1173 + }, + { + "epoch": 0.3122340425531915, + "grad_norm": 4.276669025421143, + "learning_rate": 9.995765060180111e-06, + "loss": 1.2417, + "step": 1174 + }, + { + "epoch": 0.3125, + "grad_norm": 4.076509952545166, + "learning_rate": 9.995728791936505e-06, + "loss": 1.2837, + "step": 1175 + }, + { + "epoch": 0.3127659574468085, + "grad_norm": 4.078117370605469, + "learning_rate": 9.99569236911917e-06, + "loss": 1.1589, + "step": 1176 + }, + { + "epoch": 0.31303191489361704, + "grad_norm": 4.253208637237549, + "learning_rate": 9.995655791729231e-06, + "loss": 1.4023, + "step": 1177 + }, + { + "epoch": 0.31329787234042555, + "grad_norm": 4.0782790184021, + "learning_rate": 9.99561905976782e-06, + "loss": 1.2094, + "step": 1178 + }, + { + "epoch": 0.313563829787234, + "grad_norm": 4.714814186096191, + "learning_rate": 9.995582173236073e-06, + "loss": 1.2883, + "step": 1179 + }, + { + "epoch": 0.31382978723404253, + "grad_norm": 4.640500068664551, + "learning_rate": 9.995545132135133e-06, + "loss": 1.3784, + "step": 1180 + }, + { + "epoch": 0.31409574468085105, + "grad_norm": 4.722717761993408, + "learning_rate": 9.995507936466144e-06, + "loss": 1.2644, + "step": 1181 + }, + { + "epoch": 0.31436170212765957, + "grad_norm": 4.296687602996826, + "learning_rate": 9.99547058623026e-06, + "loss": 1.2238, + "step": 1182 + }, + { + "epoch": 0.3146276595744681, + "grad_norm": 4.157870769500732, + "learning_rate": 9.995433081428631e-06, + "loss": 1.2275, + "step": 1183 + }, + { + "epoch": 0.3148936170212766, + "grad_norm": 4.162895202636719, + "learning_rate": 9.995395422062424e-06, + "loss": 1.2697, + "step": 1184 + }, + { + "epoch": 0.3151595744680851, + "grad_norm": 4.142743110656738, + "learning_rate": 9.9953576081328e-06, + "loss": 1.2514, + "step": 1185 + }, + { + "epoch": 0.31542553191489364, + "grad_norm": 4.504545211791992, + "learning_rate": 9.995319639640932e-06, + "loss": 1.1996, + "step": 1186 + }, + { + "epoch": 0.3156914893617021, + "grad_norm": 4.5642523765563965, + "learning_rate": 9.995281516587992e-06, + "loss": 1.4783, + "step": 1187 + }, + { + "epoch": 0.3159574468085106, + "grad_norm": 4.14572286605835, + "learning_rate": 9.99524323897516e-06, + "loss": 1.3261, + "step": 1188 + }, + { + "epoch": 0.31622340425531914, + "grad_norm": 4.159525394439697, + "learning_rate": 9.995204806803622e-06, + "loss": 1.3492, + "step": 1189 + }, + { + "epoch": 0.31648936170212766, + "grad_norm": 3.9404852390289307, + "learning_rate": 9.995166220074566e-06, + 
"loss": 1.2726, + "step": 1190 + }, + { + "epoch": 0.3167553191489362, + "grad_norm": 4.158994197845459, + "learning_rate": 9.995127478789186e-06, + "loss": 1.2472, + "step": 1191 + }, + { + "epoch": 0.3170212765957447, + "grad_norm": 4.277184009552002, + "learning_rate": 9.995088582948682e-06, + "loss": 1.3549, + "step": 1192 + }, + { + "epoch": 0.3172872340425532, + "grad_norm": 4.210202217102051, + "learning_rate": 9.995049532554253e-06, + "loss": 1.313, + "step": 1193 + }, + { + "epoch": 0.3175531914893617, + "grad_norm": 4.146048545837402, + "learning_rate": 9.995010327607113e-06, + "loss": 1.3272, + "step": 1194 + }, + { + "epoch": 0.3178191489361702, + "grad_norm": 4.287917137145996, + "learning_rate": 9.994970968108473e-06, + "loss": 1.4158, + "step": 1195 + }, + { + "epoch": 0.3180851063829787, + "grad_norm": 3.8834691047668457, + "learning_rate": 9.99493145405955e-06, + "loss": 1.1957, + "step": 1196 + }, + { + "epoch": 0.31835106382978723, + "grad_norm": 4.134634494781494, + "learning_rate": 9.994891785461565e-06, + "loss": 1.3806, + "step": 1197 + }, + { + "epoch": 0.31861702127659575, + "grad_norm": 4.137069225311279, + "learning_rate": 9.99485196231575e-06, + "loss": 1.2337, + "step": 1198 + }, + { + "epoch": 0.31888297872340426, + "grad_norm": 3.9084503650665283, + "learning_rate": 9.994811984623332e-06, + "loss": 1.1263, + "step": 1199 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 4.515985012054443, + "learning_rate": 9.994771852385552e-06, + "loss": 1.3851, + "step": 1200 + }, + { + "epoch": 0.3194148936170213, + "grad_norm": 4.150672912597656, + "learning_rate": 9.994731565603651e-06, + "loss": 1.2034, + "step": 1201 + }, + { + "epoch": 0.31968085106382976, + "grad_norm": 4.727832317352295, + "learning_rate": 9.994691124278874e-06, + "loss": 1.3987, + "step": 1202 + }, + { + "epoch": 0.3199468085106383, + "grad_norm": 4.292087554931641, + "learning_rate": 9.994650528412472e-06, + "loss": 1.3757, + "step": 1203 + }, + { + "epoch": 0.3202127659574468, + "grad_norm": 4.135016918182373, + "learning_rate": 9.994609778005704e-06, + "loss": 1.3413, + "step": 1204 + }, + { + "epoch": 0.3204787234042553, + "grad_norm": 4.273712635040283, + "learning_rate": 9.994568873059829e-06, + "loss": 1.2102, + "step": 1205 + }, + { + "epoch": 0.32074468085106383, + "grad_norm": 4.216573715209961, + "learning_rate": 9.994527813576111e-06, + "loss": 1.3998, + "step": 1206 + }, + { + "epoch": 0.32101063829787235, + "grad_norm": 3.847257375717163, + "learning_rate": 9.994486599555823e-06, + "loss": 1.1265, + "step": 1207 + }, + { + "epoch": 0.32127659574468087, + "grad_norm": 4.784033298492432, + "learning_rate": 9.99444523100024e-06, + "loss": 1.3363, + "step": 1208 + }, + { + "epoch": 0.3215425531914894, + "grad_norm": 4.474783897399902, + "learning_rate": 9.994403707910642e-06, + "loss": 1.2317, + "step": 1209 + }, + { + "epoch": 0.32180851063829785, + "grad_norm": 4.004277229309082, + "learning_rate": 9.994362030288312e-06, + "loss": 1.2477, + "step": 1210 + }, + { + "epoch": 0.32207446808510637, + "grad_norm": 3.9819071292877197, + "learning_rate": 9.99432019813454e-06, + "loss": 1.1898, + "step": 1211 + }, + { + "epoch": 0.3223404255319149, + "grad_norm": 3.8308217525482178, + "learning_rate": 9.994278211450622e-06, + "loss": 1.287, + "step": 1212 + }, + { + "epoch": 0.3226063829787234, + "grad_norm": 4.272090435028076, + "learning_rate": 9.994236070237854e-06, + "loss": 1.3905, + "step": 1213 + }, + { + "epoch": 0.3228723404255319, + "grad_norm": 4.1817169189453125, + 
"learning_rate": 9.994193774497544e-06, + "loss": 1.2512, + "step": 1214 + }, + { + "epoch": 0.32313829787234044, + "grad_norm": 3.9769554138183594, + "learning_rate": 9.994151324231e-06, + "loss": 1.2287, + "step": 1215 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 4.290254592895508, + "learning_rate": 9.994108719439533e-06, + "loss": 1.2741, + "step": 1216 + }, + { + "epoch": 0.3236702127659574, + "grad_norm": 4.185919284820557, + "learning_rate": 9.994065960124462e-06, + "loss": 1.3203, + "step": 1217 + }, + { + "epoch": 0.32393617021276594, + "grad_norm": 4.25853967666626, + "learning_rate": 9.994023046287109e-06, + "loss": 1.3062, + "step": 1218 + }, + { + "epoch": 0.32420212765957446, + "grad_norm": 3.9912209510803223, + "learning_rate": 9.993979977928805e-06, + "loss": 1.1988, + "step": 1219 + }, + { + "epoch": 0.324468085106383, + "grad_norm": 3.865492343902588, + "learning_rate": 9.993936755050881e-06, + "loss": 1.1626, + "step": 1220 + }, + { + "epoch": 0.3247340425531915, + "grad_norm": 4.017344951629639, + "learning_rate": 9.993893377654673e-06, + "loss": 1.3626, + "step": 1221 + }, + { + "epoch": 0.325, + "grad_norm": 3.9618587493896484, + "learning_rate": 9.993849845741525e-06, + "loss": 1.361, + "step": 1222 + }, + { + "epoch": 0.32526595744680853, + "grad_norm": 4.2321648597717285, + "learning_rate": 9.993806159312783e-06, + "loss": 1.3773, + "step": 1223 + }, + { + "epoch": 0.32553191489361705, + "grad_norm": 4.570196151733398, + "learning_rate": 9.9937623183698e-06, + "loss": 1.3895, + "step": 1224 + }, + { + "epoch": 0.3257978723404255, + "grad_norm": 3.9867353439331055, + "learning_rate": 9.99371832291393e-06, + "loss": 1.1623, + "step": 1225 + }, + { + "epoch": 0.326063829787234, + "grad_norm": 5.1412200927734375, + "learning_rate": 9.993674172946536e-06, + "loss": 1.3987, + "step": 1226 + }, + { + "epoch": 0.32632978723404255, + "grad_norm": 4.0850605964660645, + "learning_rate": 9.993629868468984e-06, + "loss": 1.2399, + "step": 1227 + }, + { + "epoch": 0.32659574468085106, + "grad_norm": 5.263411521911621, + "learning_rate": 9.993585409482645e-06, + "loss": 1.311, + "step": 1228 + }, + { + "epoch": 0.3268617021276596, + "grad_norm": 3.8653786182403564, + "learning_rate": 9.993540795988895e-06, + "loss": 1.1391, + "step": 1229 + }, + { + "epoch": 0.3271276595744681, + "grad_norm": 4.475793838500977, + "learning_rate": 9.993496027989112e-06, + "loss": 1.2644, + "step": 1230 + }, + { + "epoch": 0.3273936170212766, + "grad_norm": 4.395388603210449, + "learning_rate": 9.993451105484682e-06, + "loss": 1.342, + "step": 1231 + }, + { + "epoch": 0.3276595744680851, + "grad_norm": 4.290927410125732, + "learning_rate": 9.993406028476997e-06, + "loss": 1.3893, + "step": 1232 + }, + { + "epoch": 0.3279255319148936, + "grad_norm": 4.348012924194336, + "learning_rate": 9.993360796967451e-06, + "loss": 1.2903, + "step": 1233 + }, + { + "epoch": 0.3281914893617021, + "grad_norm": 4.174604415893555, + "learning_rate": 9.993315410957442e-06, + "loss": 1.2951, + "step": 1234 + }, + { + "epoch": 0.32845744680851063, + "grad_norm": 4.359421253204346, + "learning_rate": 9.993269870448375e-06, + "loss": 1.4433, + "step": 1235 + }, + { + "epoch": 0.32872340425531915, + "grad_norm": 4.25851583480835, + "learning_rate": 9.99322417544166e-06, + "loss": 1.2445, + "step": 1236 + }, + { + "epoch": 0.32898936170212767, + "grad_norm": 4.110776901245117, + "learning_rate": 9.993178325938711e-06, + "loss": 1.3569, + "step": 1237 + }, + { + "epoch": 0.3292553191489362, + "grad_norm": 
4.008944988250732, + "learning_rate": 9.993132321940947e-06, + "loss": 1.2227, + "step": 1238 + }, + { + "epoch": 0.3295212765957447, + "grad_norm": 4.228448390960693, + "learning_rate": 9.993086163449787e-06, + "loss": 1.2388, + "step": 1239 + }, + { + "epoch": 0.32978723404255317, + "grad_norm": 4.701793193817139, + "learning_rate": 9.993039850466664e-06, + "loss": 1.5212, + "step": 1240 + }, + { + "epoch": 0.3300531914893617, + "grad_norm": 4.4202094078063965, + "learning_rate": 9.99299338299301e-06, + "loss": 1.2413, + "step": 1241 + }, + { + "epoch": 0.3303191489361702, + "grad_norm": 4.218541622161865, + "learning_rate": 9.992946761030261e-06, + "loss": 1.2663, + "step": 1242 + }, + { + "epoch": 0.3305851063829787, + "grad_norm": 4.355581283569336, + "learning_rate": 9.99289998457986e-06, + "loss": 1.3233, + "step": 1243 + }, + { + "epoch": 0.33085106382978724, + "grad_norm": 4.184298992156982, + "learning_rate": 9.992853053643257e-06, + "loss": 1.3291, + "step": 1244 + }, + { + "epoch": 0.33111702127659576, + "grad_norm": 4.030219078063965, + "learning_rate": 9.992805968221902e-06, + "loss": 1.3502, + "step": 1245 + }, + { + "epoch": 0.3313829787234043, + "grad_norm": 4.068756103515625, + "learning_rate": 9.992758728317252e-06, + "loss": 1.1977, + "step": 1246 + }, + { + "epoch": 0.3316489361702128, + "grad_norm": 4.332919120788574, + "learning_rate": 9.99271133393077e-06, + "loss": 1.2899, + "step": 1247 + }, + { + "epoch": 0.33191489361702126, + "grad_norm": 3.9694416522979736, + "learning_rate": 9.992663785063919e-06, + "loss": 1.3366, + "step": 1248 + }, + { + "epoch": 0.3321808510638298, + "grad_norm": 3.924436569213867, + "learning_rate": 9.992616081718171e-06, + "loss": 1.2552, + "step": 1249 + }, + { + "epoch": 0.3324468085106383, + "grad_norm": 4.128008842468262, + "learning_rate": 9.992568223895007e-06, + "loss": 1.2872, + "step": 1250 + }, + { + "epoch": 0.3327127659574468, + "grad_norm": 4.744760036468506, + "learning_rate": 9.992520211595902e-06, + "loss": 1.2885, + "step": 1251 + }, + { + "epoch": 0.33297872340425533, + "grad_norm": 3.722013235092163, + "learning_rate": 9.992472044822344e-06, + "loss": 1.1684, + "step": 1252 + }, + { + "epoch": 0.33324468085106385, + "grad_norm": 4.375733852386475, + "learning_rate": 9.992423723575822e-06, + "loss": 1.4177, + "step": 1253 + }, + { + "epoch": 0.33351063829787236, + "grad_norm": 4.03129243850708, + "learning_rate": 9.992375247857833e-06, + "loss": 1.3669, + "step": 1254 + }, + { + "epoch": 0.3337765957446808, + "grad_norm": 3.828651189804077, + "learning_rate": 9.992326617669876e-06, + "loss": 1.3573, + "step": 1255 + }, + { + "epoch": 0.33404255319148934, + "grad_norm": 4.016900062561035, + "learning_rate": 9.992277833013457e-06, + "loss": 1.2265, + "step": 1256 + }, + { + "epoch": 0.33430851063829786, + "grad_norm": 4.38175630569458, + "learning_rate": 9.992228893890084e-06, + "loss": 1.3774, + "step": 1257 + }, + { + "epoch": 0.3345744680851064, + "grad_norm": 4.081117153167725, + "learning_rate": 9.992179800301269e-06, + "loss": 1.2978, + "step": 1258 + }, + { + "epoch": 0.3348404255319149, + "grad_norm": 4.280460834503174, + "learning_rate": 9.992130552248535e-06, + "loss": 1.1316, + "step": 1259 + }, + { + "epoch": 0.3351063829787234, + "grad_norm": 4.5057268142700195, + "learning_rate": 9.992081149733404e-06, + "loss": 1.3776, + "step": 1260 + }, + { + "epoch": 0.33537234042553193, + "grad_norm": 3.8671257495880127, + "learning_rate": 9.992031592757405e-06, + "loss": 1.3541, + "step": 1261 + }, + { + "epoch": 
0.33563829787234045, + "grad_norm": 4.478667736053467, + "learning_rate": 9.991981881322072e-06, + "loss": 1.3155, + "step": 1262 + }, + { + "epoch": 0.3359042553191489, + "grad_norm": 5.32509183883667, + "learning_rate": 9.991932015428941e-06, + "loss": 1.3662, + "step": 1263 + }, + { + "epoch": 0.33617021276595743, + "grad_norm": 4.138638973236084, + "learning_rate": 9.991881995079558e-06, + "loss": 1.3641, + "step": 1264 + }, + { + "epoch": 0.33643617021276595, + "grad_norm": 4.780951499938965, + "learning_rate": 9.991831820275466e-06, + "loss": 1.4626, + "step": 1265 + }, + { + "epoch": 0.33670212765957447, + "grad_norm": 3.6165192127227783, + "learning_rate": 9.991781491018223e-06, + "loss": 1.2914, + "step": 1266 + }, + { + "epoch": 0.336968085106383, + "grad_norm": 4.3747992515563965, + "learning_rate": 9.991731007309382e-06, + "loss": 1.2756, + "step": 1267 + }, + { + "epoch": 0.3372340425531915, + "grad_norm": 5.0972580909729, + "learning_rate": 9.991680369150507e-06, + "loss": 1.4694, + "step": 1268 + }, + { + "epoch": 0.3375, + "grad_norm": 3.841791868209839, + "learning_rate": 9.991629576543164e-06, + "loss": 1.1905, + "step": 1269 + }, + { + "epoch": 0.3377659574468085, + "grad_norm": 4.1475324630737305, + "learning_rate": 9.991578629488926e-06, + "loss": 1.3379, + "step": 1270 + }, + { + "epoch": 0.338031914893617, + "grad_norm": 4.152446269989014, + "learning_rate": 9.991527527989366e-06, + "loss": 1.1402, + "step": 1271 + }, + { + "epoch": 0.3382978723404255, + "grad_norm": 4.5577006340026855, + "learning_rate": 9.99147627204607e-06, + "loss": 1.3844, + "step": 1272 + }, + { + "epoch": 0.33856382978723404, + "grad_norm": 4.605076313018799, + "learning_rate": 9.991424861660621e-06, + "loss": 1.4557, + "step": 1273 + }, + { + "epoch": 0.33882978723404256, + "grad_norm": 4.045496940612793, + "learning_rate": 9.99137329683461e-06, + "loss": 1.2976, + "step": 1274 + }, + { + "epoch": 0.3390957446808511, + "grad_norm": 4.148492336273193, + "learning_rate": 9.991321577569632e-06, + "loss": 1.4065, + "step": 1275 + }, + { + "epoch": 0.3393617021276596, + "grad_norm": 4.128026485443115, + "learning_rate": 9.991269703867288e-06, + "loss": 1.3056, + "step": 1276 + }, + { + "epoch": 0.3396276595744681, + "grad_norm": 4.140103340148926, + "learning_rate": 9.991217675729184e-06, + "loss": 1.3136, + "step": 1277 + }, + { + "epoch": 0.3398936170212766, + "grad_norm": 4.122238636016846, + "learning_rate": 9.991165493156927e-06, + "loss": 1.2575, + "step": 1278 + }, + { + "epoch": 0.3401595744680851, + "grad_norm": 4.590948104858398, + "learning_rate": 9.991113156152134e-06, + "loss": 1.2896, + "step": 1279 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 4.469196796417236, + "learning_rate": 9.991060664716423e-06, + "loss": 1.4088, + "step": 1280 + }, + { + "epoch": 0.3406914893617021, + "grad_norm": 4.643316268920898, + "learning_rate": 9.99100801885142e-06, + "loss": 1.4124, + "step": 1281 + }, + { + "epoch": 0.34095744680851064, + "grad_norm": 4.106162071228027, + "learning_rate": 9.990955218558751e-06, + "loss": 1.3555, + "step": 1282 + }, + { + "epoch": 0.34122340425531916, + "grad_norm": 4.337850093841553, + "learning_rate": 9.990902263840053e-06, + "loss": 1.1865, + "step": 1283 + }, + { + "epoch": 0.3414893617021277, + "grad_norm": 3.8557538986206055, + "learning_rate": 9.990849154696963e-06, + "loss": 1.2002, + "step": 1284 + }, + { + "epoch": 0.34175531914893614, + "grad_norm": 4.412120342254639, + "learning_rate": 9.990795891131125e-06, + "loss": 1.3584, + "step": 1285 + }, 
+ { + "epoch": 0.34202127659574466, + "grad_norm": 5.199094772338867, + "learning_rate": 9.990742473144184e-06, + "loss": 1.3745, + "step": 1286 + }, + { + "epoch": 0.3422872340425532, + "grad_norm": 3.8888189792633057, + "learning_rate": 9.990688900737795e-06, + "loss": 1.2443, + "step": 1287 + }, + { + "epoch": 0.3425531914893617, + "grad_norm": 3.81540846824646, + "learning_rate": 9.990635173913616e-06, + "loss": 1.347, + "step": 1288 + }, + { + "epoch": 0.3428191489361702, + "grad_norm": 4.090488910675049, + "learning_rate": 9.990581292673309e-06, + "loss": 1.283, + "step": 1289 + }, + { + "epoch": 0.34308510638297873, + "grad_norm": 4.115976333618164, + "learning_rate": 9.990527257018544e-06, + "loss": 1.2893, + "step": 1290 + }, + { + "epoch": 0.34335106382978725, + "grad_norm": 3.9170165061950684, + "learning_rate": 9.990473066950987e-06, + "loss": 1.2133, + "step": 1291 + }, + { + "epoch": 0.34361702127659577, + "grad_norm": 3.8994202613830566, + "learning_rate": 9.990418722472317e-06, + "loss": 1.1986, + "step": 1292 + }, + { + "epoch": 0.34388297872340423, + "grad_norm": 3.8675310611724854, + "learning_rate": 9.990364223584218e-06, + "loss": 1.16, + "step": 1293 + }, + { + "epoch": 0.34414893617021275, + "grad_norm": 4.010871410369873, + "learning_rate": 9.990309570288374e-06, + "loss": 1.2748, + "step": 1294 + }, + { + "epoch": 0.34441489361702127, + "grad_norm": 4.264376163482666, + "learning_rate": 9.990254762586477e-06, + "loss": 1.167, + "step": 1295 + }, + { + "epoch": 0.3446808510638298, + "grad_norm": 4.201075553894043, + "learning_rate": 9.990199800480222e-06, + "loss": 1.2061, + "step": 1296 + }, + { + "epoch": 0.3449468085106383, + "grad_norm": 4.1181535720825195, + "learning_rate": 9.99014468397131e-06, + "loss": 1.188, + "step": 1297 + }, + { + "epoch": 0.3452127659574468, + "grad_norm": 3.747342824935913, + "learning_rate": 9.990089413061445e-06, + "loss": 1.1944, + "step": 1298 + }, + { + "epoch": 0.34547872340425534, + "grad_norm": 4.067655086517334, + "learning_rate": 9.990033987752341e-06, + "loss": 1.1876, + "step": 1299 + }, + { + "epoch": 0.34574468085106386, + "grad_norm": 4.090482234954834, + "learning_rate": 9.989978408045709e-06, + "loss": 1.2122, + "step": 1300 + }, + { + "epoch": 0.3460106382978723, + "grad_norm": 3.879619598388672, + "learning_rate": 9.989922673943271e-06, + "loss": 1.2099, + "step": 1301 + }, + { + "epoch": 0.34627659574468084, + "grad_norm": 4.814892768859863, + "learning_rate": 9.98986678544675e-06, + "loss": 1.3879, + "step": 1302 + }, + { + "epoch": 0.34654255319148936, + "grad_norm": 4.234111309051514, + "learning_rate": 9.989810742557875e-06, + "loss": 1.5134, + "step": 1303 + }, + { + "epoch": 0.3468085106382979, + "grad_norm": 4.2561469078063965, + "learning_rate": 9.989754545278381e-06, + "loss": 1.3591, + "step": 1304 + }, + { + "epoch": 0.3470744680851064, + "grad_norm": 4.519184112548828, + "learning_rate": 9.989698193610007e-06, + "loss": 1.1676, + "step": 1305 + }, + { + "epoch": 0.3473404255319149, + "grad_norm": 4.09921407699585, + "learning_rate": 9.989641687554496e-06, + "loss": 1.238, + "step": 1306 + }, + { + "epoch": 0.3476063829787234, + "grad_norm": 3.9749245643615723, + "learning_rate": 9.989585027113598e-06, + "loss": 1.2444, + "step": 1307 + }, + { + "epoch": 0.3478723404255319, + "grad_norm": 4.225282192230225, + "learning_rate": 9.989528212289064e-06, + "loss": 1.1724, + "step": 1308 + }, + { + "epoch": 0.3481382978723404, + "grad_norm": 4.391535758972168, + "learning_rate": 9.98947124308265e-06, + "loss": 
1.4058, + "step": 1309 + }, + { + "epoch": 0.3484042553191489, + "grad_norm": 3.8815417289733887, + "learning_rate": 9.989414119496126e-06, + "loss": 1.2464, + "step": 1310 + }, + { + "epoch": 0.34867021276595744, + "grad_norm": 4.186168193817139, + "learning_rate": 9.989356841531252e-06, + "loss": 1.2393, + "step": 1311 + }, + { + "epoch": 0.34893617021276596, + "grad_norm": 3.9777474403381348, + "learning_rate": 9.989299409189802e-06, + "loss": 1.1674, + "step": 1312 + }, + { + "epoch": 0.3492021276595745, + "grad_norm": 4.088747978210449, + "learning_rate": 9.989241822473557e-06, + "loss": 1.2024, + "step": 1313 + }, + { + "epoch": 0.349468085106383, + "grad_norm": 4.297309398651123, + "learning_rate": 9.989184081384295e-06, + "loss": 1.384, + "step": 1314 + }, + { + "epoch": 0.3497340425531915, + "grad_norm": 3.6362228393554688, + "learning_rate": 9.989126185923803e-06, + "loss": 1.266, + "step": 1315 + }, + { + "epoch": 0.35, + "grad_norm": 4.015252113342285, + "learning_rate": 9.989068136093873e-06, + "loss": 1.2447, + "step": 1316 + }, + { + "epoch": 0.3502659574468085, + "grad_norm": 3.9256210327148438, + "learning_rate": 9.989009931896302e-06, + "loss": 1.2674, + "step": 1317 + }, + { + "epoch": 0.350531914893617, + "grad_norm": 4.108496189117432, + "learning_rate": 9.988951573332888e-06, + "loss": 1.232, + "step": 1318 + }, + { + "epoch": 0.35079787234042553, + "grad_norm": 4.183421611785889, + "learning_rate": 9.98889306040544e-06, + "loss": 1.2652, + "step": 1319 + }, + { + "epoch": 0.35106382978723405, + "grad_norm": 4.556921482086182, + "learning_rate": 9.988834393115768e-06, + "loss": 1.3536, + "step": 1320 + }, + { + "epoch": 0.35132978723404257, + "grad_norm": 4.081547737121582, + "learning_rate": 9.988775571465684e-06, + "loss": 1.3168, + "step": 1321 + }, + { + "epoch": 0.3515957446808511, + "grad_norm": 4.136814594268799, + "learning_rate": 9.988716595457011e-06, + "loss": 1.3124, + "step": 1322 + }, + { + "epoch": 0.35186170212765955, + "grad_norm": 4.485897064208984, + "learning_rate": 9.988657465091572e-06, + "loss": 1.3164, + "step": 1323 + }, + { + "epoch": 0.35212765957446807, + "grad_norm": 4.273427963256836, + "learning_rate": 9.988598180371198e-06, + "loss": 1.2051, + "step": 1324 + }, + { + "epoch": 0.3523936170212766, + "grad_norm": 3.715895175933838, + "learning_rate": 9.988538741297724e-06, + "loss": 1.0755, + "step": 1325 + }, + { + "epoch": 0.3526595744680851, + "grad_norm": 3.932218551635742, + "learning_rate": 9.98847914787299e-06, + "loss": 1.4028, + "step": 1326 + }, + { + "epoch": 0.3529255319148936, + "grad_norm": 4.555146217346191, + "learning_rate": 9.988419400098834e-06, + "loss": 1.2805, + "step": 1327 + }, + { + "epoch": 0.35319148936170214, + "grad_norm": 4.291238784790039, + "learning_rate": 9.98835949797711e-06, + "loss": 1.3683, + "step": 1328 + }, + { + "epoch": 0.35345744680851066, + "grad_norm": 4.525993824005127, + "learning_rate": 9.98829944150967e-06, + "loss": 1.2788, + "step": 1329 + }, + { + "epoch": 0.3537234042553192, + "grad_norm": 3.771448850631714, + "learning_rate": 9.988239230698373e-06, + "loss": 1.3256, + "step": 1330 + }, + { + "epoch": 0.35398936170212764, + "grad_norm": 4.0126633644104, + "learning_rate": 9.988178865545081e-06, + "loss": 1.2984, + "step": 1331 + }, + { + "epoch": 0.35425531914893615, + "grad_norm": 3.521714210510254, + "learning_rate": 9.988118346051663e-06, + "loss": 1.192, + "step": 1332 + }, + { + "epoch": 0.35452127659574467, + "grad_norm": 4.065241813659668, + "learning_rate": 9.98805767221999e-06, 
+ "loss": 1.383, + "step": 1333 + }, + { + "epoch": 0.3547872340425532, + "grad_norm": 4.3708720207214355, + "learning_rate": 9.987996844051939e-06, + "loss": 1.3586, + "step": 1334 + }, + { + "epoch": 0.3550531914893617, + "grad_norm": 4.104064464569092, + "learning_rate": 9.987935861549393e-06, + "loss": 1.2583, + "step": 1335 + }, + { + "epoch": 0.3553191489361702, + "grad_norm": 4.293087959289551, + "learning_rate": 9.98787472471424e-06, + "loss": 1.3606, + "step": 1336 + }, + { + "epoch": 0.35558510638297874, + "grad_norm": 3.906818151473999, + "learning_rate": 9.98781343354837e-06, + "loss": 1.2305, + "step": 1337 + }, + { + "epoch": 0.3558510638297872, + "grad_norm": 4.049057960510254, + "learning_rate": 9.98775198805368e-06, + "loss": 1.1915, + "step": 1338 + }, + { + "epoch": 0.3561170212765957, + "grad_norm": 4.160476207733154, + "learning_rate": 9.987690388232071e-06, + "loss": 1.3273, + "step": 1339 + }, + { + "epoch": 0.35638297872340424, + "grad_norm": 4.2301344871521, + "learning_rate": 9.98762863408545e-06, + "loss": 1.242, + "step": 1340 + }, + { + "epoch": 0.35664893617021276, + "grad_norm": 4.272438049316406, + "learning_rate": 9.987566725615725e-06, + "loss": 1.3378, + "step": 1341 + }, + { + "epoch": 0.3569148936170213, + "grad_norm": 4.048627853393555, + "learning_rate": 9.987504662824814e-06, + "loss": 1.2938, + "step": 1342 + }, + { + "epoch": 0.3571808510638298, + "grad_norm": 4.272396087646484, + "learning_rate": 9.987442445714637e-06, + "loss": 1.363, + "step": 1343 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 4.04710578918457, + "learning_rate": 9.98738007428712e-06, + "loss": 1.3823, + "step": 1344 + }, + { + "epoch": 0.35771276595744683, + "grad_norm": 4.724300384521484, + "learning_rate": 9.98731754854419e-06, + "loss": 1.4429, + "step": 1345 + }, + { + "epoch": 0.3579787234042553, + "grad_norm": 4.071347713470459, + "learning_rate": 9.987254868487783e-06, + "loss": 1.2203, + "step": 1346 + }, + { + "epoch": 0.3582446808510638, + "grad_norm": 3.8509132862091064, + "learning_rate": 9.987192034119839e-06, + "loss": 1.2774, + "step": 1347 + }, + { + "epoch": 0.35851063829787233, + "grad_norm": 3.7690467834472656, + "learning_rate": 9.987129045442304e-06, + "loss": 1.1786, + "step": 1348 + }, + { + "epoch": 0.35877659574468085, + "grad_norm": 4.102452754974365, + "learning_rate": 9.987065902457122e-06, + "loss": 1.232, + "step": 1349 + }, + { + "epoch": 0.35904255319148937, + "grad_norm": 4.353301048278809, + "learning_rate": 9.98700260516625e-06, + "loss": 1.204, + "step": 1350 + }, + { + "epoch": 0.3593085106382979, + "grad_norm": 4.020050048828125, + "learning_rate": 9.986939153571647e-06, + "loss": 1.2681, + "step": 1351 + }, + { + "epoch": 0.3595744680851064, + "grad_norm": 4.041562080383301, + "learning_rate": 9.986875547675274e-06, + "loss": 1.2093, + "step": 1352 + }, + { + "epoch": 0.3598404255319149, + "grad_norm": 3.9428937435150146, + "learning_rate": 9.9868117874791e-06, + "loss": 1.4088, + "step": 1353 + }, + { + "epoch": 0.3601063829787234, + "grad_norm": 3.8776018619537354, + "learning_rate": 9.986747872985099e-06, + "loss": 1.2944, + "step": 1354 + }, + { + "epoch": 0.3603723404255319, + "grad_norm": 4.4396796226501465, + "learning_rate": 9.986683804195248e-06, + "loss": 1.2328, + "step": 1355 + }, + { + "epoch": 0.3606382978723404, + "grad_norm": 6.8338093757629395, + "learning_rate": 9.986619581111528e-06, + "loss": 1.2865, + "step": 1356 + }, + { + "epoch": 0.36090425531914894, + "grad_norm": 3.8783535957336426, + "learning_rate": 
9.986555203735926e-06, + "loss": 1.2004, + "step": 1357 + }, + { + "epoch": 0.36117021276595745, + "grad_norm": 4.063074111938477, + "learning_rate": 9.986490672070438e-06, + "loss": 1.2033, + "step": 1358 + }, + { + "epoch": 0.361436170212766, + "grad_norm": 5.602739334106445, + "learning_rate": 9.986425986117055e-06, + "loss": 1.2993, + "step": 1359 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 3.687655448913574, + "learning_rate": 9.986361145877783e-06, + "loss": 1.1984, + "step": 1360 + }, + { + "epoch": 0.36196808510638295, + "grad_norm": 4.312001705169678, + "learning_rate": 9.986296151354625e-06, + "loss": 1.2943, + "step": 1361 + }, + { + "epoch": 0.36223404255319147, + "grad_norm": 4.478762149810791, + "learning_rate": 9.986231002549594e-06, + "loss": 1.294, + "step": 1362 + }, + { + "epoch": 0.3625, + "grad_norm": 4.86306095123291, + "learning_rate": 9.986165699464706e-06, + "loss": 1.5325, + "step": 1363 + }, + { + "epoch": 0.3627659574468085, + "grad_norm": 4.426929950714111, + "learning_rate": 9.986100242101982e-06, + "loss": 1.3561, + "step": 1364 + }, + { + "epoch": 0.363031914893617, + "grad_norm": 4.546680450439453, + "learning_rate": 9.986034630463443e-06, + "loss": 1.3143, + "step": 1365 + }, + { + "epoch": 0.36329787234042554, + "grad_norm": 4.5038957595825195, + "learning_rate": 9.985968864551123e-06, + "loss": 1.2948, + "step": 1366 + }, + { + "epoch": 0.36356382978723406, + "grad_norm": 4.967344284057617, + "learning_rate": 9.985902944367058e-06, + "loss": 1.2844, + "step": 1367 + }, + { + "epoch": 0.3638297872340426, + "grad_norm": 3.8887312412261963, + "learning_rate": 9.985836869913283e-06, + "loss": 1.2737, + "step": 1368 + }, + { + "epoch": 0.36409574468085104, + "grad_norm": 4.1144795417785645, + "learning_rate": 9.985770641191847e-06, + "loss": 1.3379, + "step": 1369 + }, + { + "epoch": 0.36436170212765956, + "grad_norm": 4.12211275100708, + "learning_rate": 9.985704258204798e-06, + "loss": 1.3465, + "step": 1370 + }, + { + "epoch": 0.3646276595744681, + "grad_norm": 4.424558162689209, + "learning_rate": 9.985637720954188e-06, + "loss": 1.0785, + "step": 1371 + }, + { + "epoch": 0.3648936170212766, + "grad_norm": 4.308188438415527, + "learning_rate": 9.985571029442078e-06, + "loss": 1.4829, + "step": 1372 + }, + { + "epoch": 0.3651595744680851, + "grad_norm": 3.587887763977051, + "learning_rate": 9.98550418367053e-06, + "loss": 1.2684, + "step": 1373 + }, + { + "epoch": 0.36542553191489363, + "grad_norm": 4.300267696380615, + "learning_rate": 9.985437183641612e-06, + "loss": 1.305, + "step": 1374 + }, + { + "epoch": 0.36569148936170215, + "grad_norm": 4.035099506378174, + "learning_rate": 9.985370029357399e-06, + "loss": 1.2249, + "step": 1375 + }, + { + "epoch": 0.3659574468085106, + "grad_norm": 3.958627939224243, + "learning_rate": 9.985302720819967e-06, + "loss": 1.2176, + "step": 1376 + }, + { + "epoch": 0.36622340425531913, + "grad_norm": 4.257254600524902, + "learning_rate": 9.9852352580314e-06, + "loss": 1.2714, + "step": 1377 + }, + { + "epoch": 0.36648936170212765, + "grad_norm": 4.782037258148193, + "learning_rate": 9.985167640993784e-06, + "loss": 1.4979, + "step": 1378 + }, + { + "epoch": 0.36675531914893617, + "grad_norm": 4.400300025939941, + "learning_rate": 9.985099869709213e-06, + "loss": 1.3505, + "step": 1379 + }, + { + "epoch": 0.3670212765957447, + "grad_norm": 4.289068698883057, + "learning_rate": 9.985031944179781e-06, + "loss": 1.2113, + "step": 1380 + }, + { + "epoch": 0.3672872340425532, + "grad_norm": 4.770625591278076, + 
"learning_rate": 9.984963864407593e-06, + "loss": 1.4373, + "step": 1381 + }, + { + "epoch": 0.3675531914893617, + "grad_norm": 4.392122268676758, + "learning_rate": 9.984895630394755e-06, + "loss": 1.3069, + "step": 1382 + }, + { + "epoch": 0.36781914893617024, + "grad_norm": 3.9814369678497314, + "learning_rate": 9.984827242143376e-06, + "loss": 1.281, + "step": 1383 + }, + { + "epoch": 0.3680851063829787, + "grad_norm": 3.9791054725646973, + "learning_rate": 9.984758699655572e-06, + "loss": 1.1758, + "step": 1384 + }, + { + "epoch": 0.3683510638297872, + "grad_norm": 4.434001922607422, + "learning_rate": 9.984690002933465e-06, + "loss": 1.3586, + "step": 1385 + }, + { + "epoch": 0.36861702127659574, + "grad_norm": 4.445183753967285, + "learning_rate": 9.984621151979183e-06, + "loss": 1.367, + "step": 1386 + }, + { + "epoch": 0.36888297872340425, + "grad_norm": 3.8560211658477783, + "learning_rate": 9.984552146794853e-06, + "loss": 1.2933, + "step": 1387 + }, + { + "epoch": 0.36914893617021277, + "grad_norm": 4.20532751083374, + "learning_rate": 9.984482987382612e-06, + "loss": 1.3036, + "step": 1388 + }, + { + "epoch": 0.3694148936170213, + "grad_norm": 4.1775898933410645, + "learning_rate": 9.984413673744597e-06, + "loss": 1.1862, + "step": 1389 + }, + { + "epoch": 0.3696808510638298, + "grad_norm": 4.668176651000977, + "learning_rate": 9.984344205882954e-06, + "loss": 1.3125, + "step": 1390 + }, + { + "epoch": 0.3699468085106383, + "grad_norm": 4.170348644256592, + "learning_rate": 9.984274583799833e-06, + "loss": 1.1855, + "step": 1391 + }, + { + "epoch": 0.3702127659574468, + "grad_norm": 3.893609046936035, + "learning_rate": 9.98420480749739e-06, + "loss": 1.3567, + "step": 1392 + }, + { + "epoch": 0.3704787234042553, + "grad_norm": 3.791059970855713, + "learning_rate": 9.98413487697778e-06, + "loss": 1.2596, + "step": 1393 + }, + { + "epoch": 0.3707446808510638, + "grad_norm": 3.89493465423584, + "learning_rate": 9.984064792243171e-06, + "loss": 1.1468, + "step": 1394 + }, + { + "epoch": 0.37101063829787234, + "grad_norm": 3.932354211807251, + "learning_rate": 9.983994553295728e-06, + "loss": 1.2274, + "step": 1395 + }, + { + "epoch": 0.37127659574468086, + "grad_norm": 3.772759199142456, + "learning_rate": 9.983924160137627e-06, + "loss": 1.1687, + "step": 1396 + }, + { + "epoch": 0.3715425531914894, + "grad_norm": 4.090175628662109, + "learning_rate": 9.983853612771043e-06, + "loss": 1.1627, + "step": 1397 + }, + { + "epoch": 0.3718085106382979, + "grad_norm": 5.041259288787842, + "learning_rate": 9.983782911198161e-06, + "loss": 1.2878, + "step": 1398 + }, + { + "epoch": 0.37207446808510636, + "grad_norm": 4.565484523773193, + "learning_rate": 9.98371205542117e-06, + "loss": 1.2838, + "step": 1399 + }, + { + "epoch": 0.3723404255319149, + "grad_norm": 3.94577956199646, + "learning_rate": 9.983641045442256e-06, + "loss": 1.3253, + "step": 1400 + }, + { + "epoch": 0.3726063829787234, + "grad_norm": 3.559597969055176, + "learning_rate": 9.983569881263625e-06, + "loss": 1.0896, + "step": 1401 + }, + { + "epoch": 0.3728723404255319, + "grad_norm": 4.101516246795654, + "learning_rate": 9.983498562887471e-06, + "loss": 1.4844, + "step": 1402 + }, + { + "epoch": 0.37313829787234043, + "grad_norm": 4.680913925170898, + "learning_rate": 9.983427090316005e-06, + "loss": 1.3343, + "step": 1403 + }, + { + "epoch": 0.37340425531914895, + "grad_norm": 5.2188286781311035, + "learning_rate": 9.983355463551439e-06, + "loss": 1.3206, + "step": 1404 + }, + { + "epoch": 0.37367021276595747, + 
"grad_norm": 4.363986968994141, + "learning_rate": 9.983283682595986e-06, + "loss": 1.5722, + "step": 1405 + }, + { + "epoch": 0.373936170212766, + "grad_norm": 4.405764579772949, + "learning_rate": 9.98321174745187e-06, + "loss": 1.3106, + "step": 1406 + }, + { + "epoch": 0.37420212765957445, + "grad_norm": 3.671576738357544, + "learning_rate": 9.983139658121316e-06, + "loss": 1.1663, + "step": 1407 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 4.068467140197754, + "learning_rate": 9.983067414606553e-06, + "loss": 1.3443, + "step": 1408 + }, + { + "epoch": 0.3747340425531915, + "grad_norm": 4.050812244415283, + "learning_rate": 9.982995016909817e-06, + "loss": 1.2671, + "step": 1409 + }, + { + "epoch": 0.375, + "grad_norm": 4.016097545623779, + "learning_rate": 9.98292246503335e-06, + "loss": 1.2389, + "step": 1410 + }, + { + "epoch": 0.3752659574468085, + "grad_norm": 4.278280258178711, + "learning_rate": 9.982849758979394e-06, + "loss": 1.3095, + "step": 1411 + }, + { + "epoch": 0.37553191489361704, + "grad_norm": 3.826686143875122, + "learning_rate": 9.9827768987502e-06, + "loss": 1.0923, + "step": 1412 + }, + { + "epoch": 0.37579787234042555, + "grad_norm": 3.954808473587036, + "learning_rate": 9.982703884348023e-06, + "loss": 1.3359, + "step": 1413 + }, + { + "epoch": 0.376063829787234, + "grad_norm": 3.8342320919036865, + "learning_rate": 9.982630715775121e-06, + "loss": 1.287, + "step": 1414 + }, + { + "epoch": 0.37632978723404253, + "grad_norm": 4.190742492675781, + "learning_rate": 9.982557393033758e-06, + "loss": 1.2957, + "step": 1415 + }, + { + "epoch": 0.37659574468085105, + "grad_norm": 4.030623435974121, + "learning_rate": 9.982483916126204e-06, + "loss": 1.2992, + "step": 1416 + }, + { + "epoch": 0.37686170212765957, + "grad_norm": 4.164768695831299, + "learning_rate": 9.98241028505473e-06, + "loss": 1.5608, + "step": 1417 + }, + { + "epoch": 0.3771276595744681, + "grad_norm": 4.243110656738281, + "learning_rate": 9.982336499821617e-06, + "loss": 1.3214, + "step": 1418 + }, + { + "epoch": 0.3773936170212766, + "grad_norm": 3.969595193862915, + "learning_rate": 9.982262560429147e-06, + "loss": 1.3743, + "step": 1419 + }, + { + "epoch": 0.3776595744680851, + "grad_norm": 4.253571033477783, + "learning_rate": 9.982188466879607e-06, + "loss": 1.329, + "step": 1420 + }, + { + "epoch": 0.37792553191489364, + "grad_norm": 4.254541397094727, + "learning_rate": 9.98211421917529e-06, + "loss": 1.3093, + "step": 1421 + }, + { + "epoch": 0.3781914893617021, + "grad_norm": 4.365729808807373, + "learning_rate": 9.982039817318491e-06, + "loss": 1.3744, + "step": 1422 + }, + { + "epoch": 0.3784574468085106, + "grad_norm": 4.0368499755859375, + "learning_rate": 9.981965261311519e-06, + "loss": 1.1517, + "step": 1423 + }, + { + "epoch": 0.37872340425531914, + "grad_norm": 4.165602207183838, + "learning_rate": 9.981890551156673e-06, + "loss": 1.2983, + "step": 1424 + }, + { + "epoch": 0.37898936170212766, + "grad_norm": 4.241005897521973, + "learning_rate": 9.981815686856268e-06, + "loss": 1.2491, + "step": 1425 + }, + { + "epoch": 0.3792553191489362, + "grad_norm": 3.9506289958953857, + "learning_rate": 9.981740668412622e-06, + "loss": 1.175, + "step": 1426 + }, + { + "epoch": 0.3795212765957447, + "grad_norm": 4.209918022155762, + "learning_rate": 9.981665495828053e-06, + "loss": 1.379, + "step": 1427 + }, + { + "epoch": 0.3797872340425532, + "grad_norm": 4.048032283782959, + "learning_rate": 9.981590169104889e-06, + "loss": 1.4339, + "step": 1428 + }, + { + "epoch": 
0.3800531914893617, + "grad_norm": 3.9107158184051514, + "learning_rate": 9.98151468824546e-06, + "loss": 1.4468, + "step": 1429 + }, + { + "epoch": 0.3803191489361702, + "grad_norm": 3.8230321407318115, + "learning_rate": 9.981439053252102e-06, + "loss": 1.2942, + "step": 1430 + }, + { + "epoch": 0.3805851063829787, + "grad_norm": 3.772338390350342, + "learning_rate": 9.981363264127154e-06, + "loss": 1.3236, + "step": 1431 + }, + { + "epoch": 0.38085106382978723, + "grad_norm": 4.234860897064209, + "learning_rate": 9.981287320872962e-06, + "loss": 1.3763, + "step": 1432 + }, + { + "epoch": 0.38111702127659575, + "grad_norm": 3.8890817165374756, + "learning_rate": 9.981211223491876e-06, + "loss": 1.3667, + "step": 1433 + }, + { + "epoch": 0.38138297872340426, + "grad_norm": 3.8217055797576904, + "learning_rate": 9.98113497198625e-06, + "loss": 1.1392, + "step": 1434 + }, + { + "epoch": 0.3816489361702128, + "grad_norm": 3.9971745014190674, + "learning_rate": 9.981058566358443e-06, + "loss": 1.1892, + "step": 1435 + }, + { + "epoch": 0.3819148936170213, + "grad_norm": 4.417277812957764, + "learning_rate": 9.98098200661082e-06, + "loss": 1.3306, + "step": 1436 + }, + { + "epoch": 0.38218085106382976, + "grad_norm": 4.433936595916748, + "learning_rate": 9.980905292745749e-06, + "loss": 1.2253, + "step": 1437 + }, + { + "epoch": 0.3824468085106383, + "grad_norm": 3.668414831161499, + "learning_rate": 9.980828424765603e-06, + "loss": 1.3243, + "step": 1438 + }, + { + "epoch": 0.3827127659574468, + "grad_norm": 4.062864303588867, + "learning_rate": 9.980751402672762e-06, + "loss": 1.2416, + "step": 1439 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 4.28949499130249, + "learning_rate": 9.980674226469608e-06, + "loss": 1.3018, + "step": 1440 + }, + { + "epoch": 0.38324468085106383, + "grad_norm": 3.598482847213745, + "learning_rate": 9.980596896158532e-06, + "loss": 1.1174, + "step": 1441 + }, + { + "epoch": 0.38351063829787235, + "grad_norm": 4.300634384155273, + "learning_rate": 9.980519411741922e-06, + "loss": 1.3079, + "step": 1442 + }, + { + "epoch": 0.38377659574468087, + "grad_norm": 4.2363128662109375, + "learning_rate": 9.980441773222178e-06, + "loss": 1.3546, + "step": 1443 + }, + { + "epoch": 0.3840425531914894, + "grad_norm": 4.521866321563721, + "learning_rate": 9.980363980601702e-06, + "loss": 1.2007, + "step": 1444 + }, + { + "epoch": 0.38430851063829785, + "grad_norm": 3.9129135608673096, + "learning_rate": 9.9802860338829e-06, + "loss": 1.3101, + "step": 1445 + }, + { + "epoch": 0.38457446808510637, + "grad_norm": 4.559953689575195, + "learning_rate": 9.980207933068185e-06, + "loss": 1.3183, + "step": 1446 + }, + { + "epoch": 0.3848404255319149, + "grad_norm": 4.102110385894775, + "learning_rate": 9.980129678159974e-06, + "loss": 1.2549, + "step": 1447 + }, + { + "epoch": 0.3851063829787234, + "grad_norm": 4.215007781982422, + "learning_rate": 9.980051269160686e-06, + "loss": 1.3281, + "step": 1448 + }, + { + "epoch": 0.3853723404255319, + "grad_norm": 4.188117980957031, + "learning_rate": 9.97997270607275e-06, + "loss": 1.267, + "step": 1449 + }, + { + "epoch": 0.38563829787234044, + "grad_norm": 3.9828150272369385, + "learning_rate": 9.979893988898592e-06, + "loss": 1.2967, + "step": 1450 + }, + { + "epoch": 0.38590425531914896, + "grad_norm": 3.9680116176605225, + "learning_rate": 9.979815117640654e-06, + "loss": 1.2711, + "step": 1451 + }, + { + "epoch": 0.3861702127659574, + "grad_norm": 3.9651451110839844, + "learning_rate": 9.979736092301374e-06, + "loss": 1.2298, 
+ "step": 1452 + }, + { + "epoch": 0.38643617021276594, + "grad_norm": 3.7032337188720703, + "learning_rate": 9.979656912883193e-06, + "loss": 1.1644, + "step": 1453 + }, + { + "epoch": 0.38670212765957446, + "grad_norm": 4.174644470214844, + "learning_rate": 9.979577579388566e-06, + "loss": 1.1941, + "step": 1454 + }, + { + "epoch": 0.386968085106383, + "grad_norm": 3.9499082565307617, + "learning_rate": 9.979498091819946e-06, + "loss": 1.2205, + "step": 1455 + }, + { + "epoch": 0.3872340425531915, + "grad_norm": 4.005082130432129, + "learning_rate": 9.979418450179792e-06, + "loss": 1.2983, + "step": 1456 + }, + { + "epoch": 0.3875, + "grad_norm": 4.425258159637451, + "learning_rate": 9.97933865447057e-06, + "loss": 1.3444, + "step": 1457 + }, + { + "epoch": 0.38776595744680853, + "grad_norm": 4.169209003448486, + "learning_rate": 9.979258704694747e-06, + "loss": 1.3914, + "step": 1458 + }, + { + "epoch": 0.38803191489361705, + "grad_norm": 3.7960317134857178, + "learning_rate": 9.979178600854797e-06, + "loss": 1.2186, + "step": 1459 + }, + { + "epoch": 0.3882978723404255, + "grad_norm": 3.9216535091400146, + "learning_rate": 9.979098342953198e-06, + "loss": 1.0839, + "step": 1460 + }, + { + "epoch": 0.388563829787234, + "grad_norm": 4.077401638031006, + "learning_rate": 9.979017930992436e-06, + "loss": 1.225, + "step": 1461 + }, + { + "epoch": 0.38882978723404255, + "grad_norm": 3.871135950088501, + "learning_rate": 9.978937364974996e-06, + "loss": 1.2545, + "step": 1462 + }, + { + "epoch": 0.38909574468085106, + "grad_norm": 4.12876558303833, + "learning_rate": 9.978856644903373e-06, + "loss": 1.3806, + "step": 1463 + }, + { + "epoch": 0.3893617021276596, + "grad_norm": 4.172638416290283, + "learning_rate": 9.978775770780061e-06, + "loss": 1.3444, + "step": 1464 + }, + { + "epoch": 0.3896276595744681, + "grad_norm": 4.253303050994873, + "learning_rate": 9.978694742607566e-06, + "loss": 1.3015, + "step": 1465 + }, + { + "epoch": 0.3898936170212766, + "grad_norm": 3.937948226928711, + "learning_rate": 9.978613560388396e-06, + "loss": 1.4014, + "step": 1466 + }, + { + "epoch": 0.3901595744680851, + "grad_norm": 3.959920644760132, + "learning_rate": 9.978532224125059e-06, + "loss": 1.2797, + "step": 1467 + }, + { + "epoch": 0.3904255319148936, + "grad_norm": 4.240394592285156, + "learning_rate": 9.978450733820073e-06, + "loss": 1.3541, + "step": 1468 + }, + { + "epoch": 0.3906914893617021, + "grad_norm": 4.060705661773682, + "learning_rate": 9.97836908947596e-06, + "loss": 1.2997, + "step": 1469 + }, + { + "epoch": 0.39095744680851063, + "grad_norm": 4.276419162750244, + "learning_rate": 9.978287291095248e-06, + "loss": 1.4451, + "step": 1470 + }, + { + "epoch": 0.39122340425531915, + "grad_norm": 3.961526393890381, + "learning_rate": 9.978205338680465e-06, + "loss": 1.3248, + "step": 1471 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 4.002696514129639, + "learning_rate": 9.978123232234147e-06, + "loss": 1.3274, + "step": 1472 + }, + { + "epoch": 0.3917553191489362, + "grad_norm": 3.857750654220581, + "learning_rate": 9.978040971758836e-06, + "loss": 1.2552, + "step": 1473 + }, + { + "epoch": 0.3920212765957447, + "grad_norm": 3.973501682281494, + "learning_rate": 9.977958557257077e-06, + "loss": 1.3911, + "step": 1474 + }, + { + "epoch": 0.39228723404255317, + "grad_norm": 4.301419258117676, + "learning_rate": 9.977875988731418e-06, + "loss": 1.2423, + "step": 1475 + }, + { + "epoch": 0.3925531914893617, + "grad_norm": 3.7840960025787354, + "learning_rate": 9.977793266184416e-06, 
+ "loss": 1.1739, + "step": 1476 + }, + { + "epoch": 0.3928191489361702, + "grad_norm": 3.6807820796966553, + "learning_rate": 9.977710389618628e-06, + "loss": 1.1685, + "step": 1477 + }, + { + "epoch": 0.3930851063829787, + "grad_norm": 3.942674398422241, + "learning_rate": 9.977627359036624e-06, + "loss": 1.2033, + "step": 1478 + }, + { + "epoch": 0.39335106382978724, + "grad_norm": 4.07774543762207, + "learning_rate": 9.977544174440965e-06, + "loss": 1.2707, + "step": 1479 + }, + { + "epoch": 0.39361702127659576, + "grad_norm": 4.302217483520508, + "learning_rate": 9.977460835834231e-06, + "loss": 1.3944, + "step": 1480 + }, + { + "epoch": 0.3938829787234043, + "grad_norm": 4.006019592285156, + "learning_rate": 9.977377343218998e-06, + "loss": 1.3301, + "step": 1481 + }, + { + "epoch": 0.3941489361702128, + "grad_norm": 4.067336082458496, + "learning_rate": 9.977293696597849e-06, + "loss": 1.3282, + "step": 1482 + }, + { + "epoch": 0.39441489361702126, + "grad_norm": 4.4912004470825195, + "learning_rate": 9.977209895973374e-06, + "loss": 1.374, + "step": 1483 + }, + { + "epoch": 0.3946808510638298, + "grad_norm": 3.933626651763916, + "learning_rate": 9.977125941348165e-06, + "loss": 1.1584, + "step": 1484 + }, + { + "epoch": 0.3949468085106383, + "grad_norm": 4.08411169052124, + "learning_rate": 9.97704183272482e-06, + "loss": 1.3587, + "step": 1485 + }, + { + "epoch": 0.3952127659574468, + "grad_norm": 4.316272735595703, + "learning_rate": 9.976957570105939e-06, + "loss": 1.2544, + "step": 1486 + }, + { + "epoch": 0.39547872340425533, + "grad_norm": 4.05543851852417, + "learning_rate": 9.976873153494132e-06, + "loss": 1.1699, + "step": 1487 + }, + { + "epoch": 0.39574468085106385, + "grad_norm": 4.137149810791016, + "learning_rate": 9.976788582892012e-06, + "loss": 1.3501, + "step": 1488 + }, + { + "epoch": 0.39601063829787236, + "grad_norm": 3.830085515975952, + "learning_rate": 9.976703858302192e-06, + "loss": 1.2818, + "step": 1489 + }, + { + "epoch": 0.3962765957446808, + "grad_norm": 4.138214588165283, + "learning_rate": 9.976618979727295e-06, + "loss": 1.2769, + "step": 1490 + }, + { + "epoch": 0.39654255319148934, + "grad_norm": 4.205438137054443, + "learning_rate": 9.976533947169948e-06, + "loss": 1.4103, + "step": 1491 + }, + { + "epoch": 0.39680851063829786, + "grad_norm": 4.104953289031982, + "learning_rate": 9.976448760632782e-06, + "loss": 1.3701, + "step": 1492 + }, + { + "epoch": 0.3970744680851064, + "grad_norm": 3.725175619125366, + "learning_rate": 9.976363420118432e-06, + "loss": 1.2986, + "step": 1493 + }, + { + "epoch": 0.3973404255319149, + "grad_norm": 4.973143577575684, + "learning_rate": 9.97627792562954e-06, + "loss": 1.3123, + "step": 1494 + }, + { + "epoch": 0.3976063829787234, + "grad_norm": 3.5973260402679443, + "learning_rate": 9.976192277168748e-06, + "loss": 1.1878, + "step": 1495 + }, + { + "epoch": 0.39787234042553193, + "grad_norm": 3.9308860301971436, + "learning_rate": 9.97610647473871e-06, + "loss": 1.3139, + "step": 1496 + }, + { + "epoch": 0.39813829787234045, + "grad_norm": 3.831552028656006, + "learning_rate": 9.976020518342078e-06, + "loss": 1.249, + "step": 1497 + }, + { + "epoch": 0.3984042553191489, + "grad_norm": 3.8937809467315674, + "learning_rate": 9.975934407981512e-06, + "loss": 1.2361, + "step": 1498 + }, + { + "epoch": 0.39867021276595743, + "grad_norm": 4.4092512130737305, + "learning_rate": 9.97584814365968e-06, + "loss": 1.424, + "step": 1499 + }, + { + "epoch": 0.39893617021276595, + "grad_norm": 4.096745491027832, + 
"learning_rate": 9.975761725379243e-06, + "loss": 1.3488, + "step": 1500 + }, + { + "epoch": 0.39893617021276595, + "eval_loss": 1.3084138631820679, + "eval_runtime": 12.5754, + "eval_samples_per_second": 31.808, + "eval_steps_per_second": 3.976, + "step": 1500 + }, + { + "epoch": 0.39920212765957447, + "grad_norm": 5.023965835571289, + "learning_rate": 9.975675153142884e-06, + "loss": 1.3409, + "step": 1501 + }, + { + "epoch": 0.399468085106383, + "grad_norm": 4.182278156280518, + "learning_rate": 9.975588426953276e-06, + "loss": 1.2497, + "step": 1502 + }, + { + "epoch": 0.3997340425531915, + "grad_norm": 3.872786283493042, + "learning_rate": 9.975501546813104e-06, + "loss": 1.29, + "step": 1503 + }, + { + "epoch": 0.4, + "grad_norm": 3.9527881145477295, + "learning_rate": 9.975414512725058e-06, + "loss": 1.3427, + "step": 1504 + }, + { + "epoch": 0.4002659574468085, + "grad_norm": 3.563168525695801, + "learning_rate": 9.975327324691828e-06, + "loss": 1.2509, + "step": 1505 + }, + { + "epoch": 0.400531914893617, + "grad_norm": 3.8460729122161865, + "learning_rate": 9.975239982716113e-06, + "loss": 1.214, + "step": 1506 + }, + { + "epoch": 0.4007978723404255, + "grad_norm": 4.321569442749023, + "learning_rate": 9.975152486800615e-06, + "loss": 1.1959, + "step": 1507 + }, + { + "epoch": 0.40106382978723404, + "grad_norm": 4.102901935577393, + "learning_rate": 9.975064836948041e-06, + "loss": 1.2786, + "step": 1508 + }, + { + "epoch": 0.40132978723404256, + "grad_norm": 3.8385143280029297, + "learning_rate": 9.974977033161103e-06, + "loss": 1.3574, + "step": 1509 + }, + { + "epoch": 0.4015957446808511, + "grad_norm": 3.912363290786743, + "learning_rate": 9.97488907544252e-06, + "loss": 1.388, + "step": 1510 + }, + { + "epoch": 0.4018617021276596, + "grad_norm": 4.346206188201904, + "learning_rate": 9.974800963795012e-06, + "loss": 1.4532, + "step": 1511 + }, + { + "epoch": 0.4021276595744681, + "grad_norm": 4.346587657928467, + "learning_rate": 9.974712698221306e-06, + "loss": 1.2098, + "step": 1512 + }, + { + "epoch": 0.4023936170212766, + "grad_norm": 3.9622318744659424, + "learning_rate": 9.97462427872413e-06, + "loss": 1.1556, + "step": 1513 + }, + { + "epoch": 0.4026595744680851, + "grad_norm": 3.903508186340332, + "learning_rate": 9.974535705306222e-06, + "loss": 1.1644, + "step": 1514 + }, + { + "epoch": 0.4029255319148936, + "grad_norm": 4.4463605880737305, + "learning_rate": 9.974446977970322e-06, + "loss": 1.4892, + "step": 1515 + }, + { + "epoch": 0.4031914893617021, + "grad_norm": 3.8401832580566406, + "learning_rate": 9.974358096719178e-06, + "loss": 1.3681, + "step": 1516 + }, + { + "epoch": 0.40345744680851064, + "grad_norm": 4.009060382843018, + "learning_rate": 9.974269061555537e-06, + "loss": 1.2134, + "step": 1517 + }, + { + "epoch": 0.40372340425531916, + "grad_norm": 3.609969139099121, + "learning_rate": 9.974179872482153e-06, + "loss": 1.34, + "step": 1518 + }, + { + "epoch": 0.4039893617021277, + "grad_norm": 4.289672374725342, + "learning_rate": 9.97409052950179e-06, + "loss": 1.4246, + "step": 1519 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 3.6479434967041016, + "learning_rate": 9.974001032617208e-06, + "loss": 1.2366, + "step": 1520 + }, + { + "epoch": 0.40452127659574466, + "grad_norm": 4.251558780670166, + "learning_rate": 9.973911381831178e-06, + "loss": 1.3208, + "step": 1521 + }, + { + "epoch": 0.4047872340425532, + "grad_norm": 3.7560923099517822, + "learning_rate": 9.973821577146475e-06, + "loss": 1.2298, + "step": 1522 + }, + { + "epoch": 
0.4050531914893617, + "grad_norm": 3.9338622093200684, + "learning_rate": 9.973731618565876e-06, + "loss": 1.34, + "step": 1523 + }, + { + "epoch": 0.4053191489361702, + "grad_norm": 3.8561365604400635, + "learning_rate": 9.973641506092165e-06, + "loss": 1.4198, + "step": 1524 + }, + { + "epoch": 0.40558510638297873, + "grad_norm": 3.7590527534484863, + "learning_rate": 9.973551239728129e-06, + "loss": 1.3644, + "step": 1525 + }, + { + "epoch": 0.40585106382978725, + "grad_norm": 4.470832824707031, + "learning_rate": 9.973460819476562e-06, + "loss": 1.3641, + "step": 1526 + }, + { + "epoch": 0.40611702127659577, + "grad_norm": 3.5494723320007324, + "learning_rate": 9.973370245340264e-06, + "loss": 1.2552, + "step": 1527 + }, + { + "epoch": 0.40638297872340423, + "grad_norm": 4.204685211181641, + "learning_rate": 9.973279517322033e-06, + "loss": 1.3577, + "step": 1528 + }, + { + "epoch": 0.40664893617021275, + "grad_norm": 4.775966167449951, + "learning_rate": 9.97318863542468e-06, + "loss": 1.4342, + "step": 1529 + }, + { + "epoch": 0.40691489361702127, + "grad_norm": 4.2795729637146, + "learning_rate": 9.973097599651013e-06, + "loss": 1.3033, + "step": 1530 + }, + { + "epoch": 0.4071808510638298, + "grad_norm": 4.110699653625488, + "learning_rate": 9.973006410003853e-06, + "loss": 1.3463, + "step": 1531 + }, + { + "epoch": 0.4074468085106383, + "grad_norm": 3.8819406032562256, + "learning_rate": 9.97291506648602e-06, + "loss": 1.1908, + "step": 1532 + }, + { + "epoch": 0.4077127659574468, + "grad_norm": 4.164956092834473, + "learning_rate": 9.972823569100338e-06, + "loss": 1.2573, + "step": 1533 + }, + { + "epoch": 0.40797872340425534, + "grad_norm": 3.9775986671447754, + "learning_rate": 9.97273191784964e-06, + "loss": 1.2141, + "step": 1534 + }, + { + "epoch": 0.40824468085106386, + "grad_norm": 4.500059604644775, + "learning_rate": 9.972640112736764e-06, + "loss": 1.3342, + "step": 1535 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 4.081606864929199, + "learning_rate": 9.972548153764547e-06, + "loss": 1.2027, + "step": 1536 + }, + { + "epoch": 0.40877659574468084, + "grad_norm": 4.272010803222656, + "learning_rate": 9.972456040935838e-06, + "loss": 1.2332, + "step": 1537 + }, + { + "epoch": 0.40904255319148936, + "grad_norm": 4.042487144470215, + "learning_rate": 9.972363774253481e-06, + "loss": 1.1932, + "step": 1538 + }, + { + "epoch": 0.4093085106382979, + "grad_norm": 3.9628350734710693, + "learning_rate": 9.972271353720337e-06, + "loss": 1.2636, + "step": 1539 + }, + { + "epoch": 0.4095744680851064, + "grad_norm": 4.018553256988525, + "learning_rate": 9.972178779339264e-06, + "loss": 1.2822, + "step": 1540 + }, + { + "epoch": 0.4098404255319149, + "grad_norm": 4.054775714874268, + "learning_rate": 9.972086051113123e-06, + "loss": 1.3419, + "step": 1541 + }, + { + "epoch": 0.4101063829787234, + "grad_norm": 4.035485744476318, + "learning_rate": 9.971993169044787e-06, + "loss": 1.2586, + "step": 1542 + }, + { + "epoch": 0.4103723404255319, + "grad_norm": 4.139084815979004, + "learning_rate": 9.971900133137128e-06, + "loss": 1.3533, + "step": 1543 + }, + { + "epoch": 0.4106382978723404, + "grad_norm": 3.9709324836730957, + "learning_rate": 9.971806943393026e-06, + "loss": 1.1807, + "step": 1544 + }, + { + "epoch": 0.4109042553191489, + "grad_norm": 3.836603879928589, + "learning_rate": 9.971713599815364e-06, + "loss": 1.2364, + "step": 1545 + }, + { + "epoch": 0.41117021276595744, + "grad_norm": 3.484250068664551, + "learning_rate": 9.97162010240703e-06, + "loss": 1.2536, + 
"step": 1546 + }, + { + "epoch": 0.41143617021276596, + "grad_norm": 4.203670978546143, + "learning_rate": 9.971526451170914e-06, + "loss": 1.2339, + "step": 1547 + }, + { + "epoch": 0.4117021276595745, + "grad_norm": 3.7969377040863037, + "learning_rate": 9.971432646109919e-06, + "loss": 1.4205, + "step": 1548 + }, + { + "epoch": 0.411968085106383, + "grad_norm": 3.9421546459198, + "learning_rate": 9.971338687226944e-06, + "loss": 1.2441, + "step": 1549 + }, + { + "epoch": 0.4122340425531915, + "grad_norm": 3.8566412925720215, + "learning_rate": 9.971244574524897e-06, + "loss": 1.3148, + "step": 1550 + }, + { + "epoch": 0.4125, + "grad_norm": 3.6699059009552, + "learning_rate": 9.971150308006689e-06, + "loss": 1.1396, + "step": 1551 + }, + { + "epoch": 0.4127659574468085, + "grad_norm": 4.328299522399902, + "learning_rate": 9.971055887675238e-06, + "loss": 1.4105, + "step": 1552 + }, + { + "epoch": 0.413031914893617, + "grad_norm": 3.6258397102355957, + "learning_rate": 9.970961313533465e-06, + "loss": 1.2399, + "step": 1553 + }, + { + "epoch": 0.41329787234042553, + "grad_norm": 4.217952251434326, + "learning_rate": 9.970866585584298e-06, + "loss": 1.2643, + "step": 1554 + }, + { + "epoch": 0.41356382978723405, + "grad_norm": 3.8410286903381348, + "learning_rate": 9.970771703830666e-06, + "loss": 1.3982, + "step": 1555 + }, + { + "epoch": 0.41382978723404257, + "grad_norm": 4.1184234619140625, + "learning_rate": 9.970676668275504e-06, + "loss": 1.3206, + "step": 1556 + }, + { + "epoch": 0.4140957446808511, + "grad_norm": 3.805264472961426, + "learning_rate": 9.970581478921755e-06, + "loss": 1.3301, + "step": 1557 + }, + { + "epoch": 0.41436170212765955, + "grad_norm": 3.7191929817199707, + "learning_rate": 9.970486135772362e-06, + "loss": 1.3443, + "step": 1558 + }, + { + "epoch": 0.41462765957446807, + "grad_norm": 3.7962100505828857, + "learning_rate": 9.970390638830275e-06, + "loss": 1.1145, + "step": 1559 + }, + { + "epoch": 0.4148936170212766, + "grad_norm": 3.8480000495910645, + "learning_rate": 9.970294988098452e-06, + "loss": 1.303, + "step": 1560 + }, + { + "epoch": 0.4151595744680851, + "grad_norm": 4.154008388519287, + "learning_rate": 9.970199183579847e-06, + "loss": 1.2505, + "step": 1561 + }, + { + "epoch": 0.4154255319148936, + "grad_norm": 3.6945624351501465, + "learning_rate": 9.97010322527743e-06, + "loss": 1.2318, + "step": 1562 + }, + { + "epoch": 0.41569148936170214, + "grad_norm": 4.145558834075928, + "learning_rate": 9.970007113194168e-06, + "loss": 1.2855, + "step": 1563 + }, + { + "epoch": 0.41595744680851066, + "grad_norm": 4.037220001220703, + "learning_rate": 9.969910847333032e-06, + "loss": 1.2599, + "step": 1564 + }, + { + "epoch": 0.4162234042553192, + "grad_norm": 4.070208549499512, + "learning_rate": 9.969814427697007e-06, + "loss": 1.3002, + "step": 1565 + }, + { + "epoch": 0.41648936170212764, + "grad_norm": 4.0794548988342285, + "learning_rate": 9.969717854289069e-06, + "loss": 1.3807, + "step": 1566 + }, + { + "epoch": 0.41675531914893615, + "grad_norm": 3.9017162322998047, + "learning_rate": 9.969621127112211e-06, + "loss": 1.1982, + "step": 1567 + }, + { + "epoch": 0.41702127659574467, + "grad_norm": 4.089752674102783, + "learning_rate": 9.969524246169424e-06, + "loss": 1.2734, + "step": 1568 + }, + { + "epoch": 0.4172872340425532, + "grad_norm": 3.7550644874572754, + "learning_rate": 9.969427211463705e-06, + "loss": 1.2207, + "step": 1569 + }, + { + "epoch": 0.4175531914893617, + "grad_norm": 3.9977076053619385, + "learning_rate": 
9.969330022998057e-06, + "loss": 1.3695, + "step": 1570 + }, + { + "epoch": 0.4178191489361702, + "grad_norm": 4.422798156738281, + "learning_rate": 9.969232680775491e-06, + "loss": 1.3292, + "step": 1571 + }, + { + "epoch": 0.41808510638297874, + "grad_norm": 4.122771263122559, + "learning_rate": 9.969135184799013e-06, + "loss": 1.3753, + "step": 1572 + }, + { + "epoch": 0.4183510638297872, + "grad_norm": 3.827120542526245, + "learning_rate": 9.969037535071641e-06, + "loss": 1.2738, + "step": 1573 + }, + { + "epoch": 0.4186170212765957, + "grad_norm": 3.823761463165283, + "learning_rate": 9.968939731596399e-06, + "loss": 1.2201, + "step": 1574 + }, + { + "epoch": 0.41888297872340424, + "grad_norm": 4.0475616455078125, + "learning_rate": 9.96884177437631e-06, + "loss": 1.3511, + "step": 1575 + }, + { + "epoch": 0.41914893617021276, + "grad_norm": 4.167337894439697, + "learning_rate": 9.968743663414408e-06, + "loss": 1.3725, + "step": 1576 + }, + { + "epoch": 0.4194148936170213, + "grad_norm": 4.683474063873291, + "learning_rate": 9.968645398713726e-06, + "loss": 1.3719, + "step": 1577 + }, + { + "epoch": 0.4196808510638298, + "grad_norm": 4.450965881347656, + "learning_rate": 9.968546980277305e-06, + "loss": 1.2847, + "step": 1578 + }, + { + "epoch": 0.4199468085106383, + "grad_norm": 4.25331449508667, + "learning_rate": 9.968448408108191e-06, + "loss": 1.4151, + "step": 1579 + }, + { + "epoch": 0.42021276595744683, + "grad_norm": 4.090495586395264, + "learning_rate": 9.968349682209434e-06, + "loss": 1.2518, + "step": 1580 + }, + { + "epoch": 0.4204787234042553, + "grad_norm": 4.116806507110596, + "learning_rate": 9.96825080258409e-06, + "loss": 1.3986, + "step": 1581 + }, + { + "epoch": 0.4207446808510638, + "grad_norm": 4.016780376434326, + "learning_rate": 9.968151769235216e-06, + "loss": 1.2488, + "step": 1582 + }, + { + "epoch": 0.42101063829787233, + "grad_norm": 4.153627872467041, + "learning_rate": 9.968052582165874e-06, + "loss": 1.3459, + "step": 1583 + }, + { + "epoch": 0.42127659574468085, + "grad_norm": 4.0243048667907715, + "learning_rate": 9.96795324137914e-06, + "loss": 1.2554, + "step": 1584 + }, + { + "epoch": 0.42154255319148937, + "grad_norm": 4.162500381469727, + "learning_rate": 9.96785374687808e-06, + "loss": 1.3597, + "step": 1585 + }, + { + "epoch": 0.4218085106382979, + "grad_norm": 3.8271100521087646, + "learning_rate": 9.967754098665778e-06, + "loss": 1.2375, + "step": 1586 + }, + { + "epoch": 0.4220744680851064, + "grad_norm": 3.73313045501709, + "learning_rate": 9.967654296745317e-06, + "loss": 1.1394, + "step": 1587 + }, + { + "epoch": 0.4223404255319149, + "grad_norm": 4.17546272277832, + "learning_rate": 9.96755434111978e-06, + "loss": 1.3004, + "step": 1588 + }, + { + "epoch": 0.4226063829787234, + "grad_norm": 3.7987289428710938, + "learning_rate": 9.967454231792267e-06, + "loss": 1.2551, + "step": 1589 + }, + { + "epoch": 0.4228723404255319, + "grad_norm": 4.171220779418945, + "learning_rate": 9.967353968765868e-06, + "loss": 1.2722, + "step": 1590 + }, + { + "epoch": 0.4231382978723404, + "grad_norm": 4.090373516082764, + "learning_rate": 9.96725355204369e-06, + "loss": 1.2963, + "step": 1591 + }, + { + "epoch": 0.42340425531914894, + "grad_norm": 4.222188949584961, + "learning_rate": 9.967152981628841e-06, + "loss": 1.1075, + "step": 1592 + }, + { + "epoch": 0.42367021276595745, + "grad_norm": 3.9014172554016113, + "learning_rate": 9.967052257524428e-06, + "loss": 1.251, + "step": 1593 + }, + { + "epoch": 0.423936170212766, + "grad_norm": 
4.0223870277404785, + "learning_rate": 9.966951379733572e-06, + "loss": 1.1924, + "step": 1594 + }, + { + "epoch": 0.4242021276595745, + "grad_norm": 3.724557876586914, + "learning_rate": 9.96685034825939e-06, + "loss": 1.206, + "step": 1595 + }, + { + "epoch": 0.42446808510638295, + "grad_norm": 4.103020191192627, + "learning_rate": 9.966749163105011e-06, + "loss": 1.374, + "step": 1596 + }, + { + "epoch": 0.42473404255319147, + "grad_norm": 3.997119188308716, + "learning_rate": 9.966647824273567e-06, + "loss": 1.2097, + "step": 1597 + }, + { + "epoch": 0.425, + "grad_norm": 4.226285934448242, + "learning_rate": 9.966546331768192e-06, + "loss": 1.3387, + "step": 1598 + }, + { + "epoch": 0.4252659574468085, + "grad_norm": 4.060708999633789, + "learning_rate": 9.966444685592025e-06, + "loss": 1.2762, + "step": 1599 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 4.005706787109375, + "learning_rate": 9.966342885748212e-06, + "loss": 1.2845, + "step": 1600 + }, + { + "epoch": 0.42579787234042554, + "grad_norm": 4.201882839202881, + "learning_rate": 9.966240932239904e-06, + "loss": 1.2953, + "step": 1601 + }, + { + "epoch": 0.42606382978723406, + "grad_norm": 3.7558727264404297, + "learning_rate": 9.966138825070254e-06, + "loss": 1.2806, + "step": 1602 + }, + { + "epoch": 0.4263297872340426, + "grad_norm": 3.9751381874084473, + "learning_rate": 9.96603656424242e-06, + "loss": 1.2354, + "step": 1603 + }, + { + "epoch": 0.42659574468085104, + "grad_norm": 3.775033712387085, + "learning_rate": 9.96593414975957e-06, + "loss": 1.2592, + "step": 1604 + }, + { + "epoch": 0.42686170212765956, + "grad_norm": 4.114045143127441, + "learning_rate": 9.965831581624872e-06, + "loss": 1.1019, + "step": 1605 + }, + { + "epoch": 0.4271276595744681, + "grad_norm": 3.6853203773498535, + "learning_rate": 9.965728859841497e-06, + "loss": 1.356, + "step": 1606 + }, + { + "epoch": 0.4273936170212766, + "grad_norm": 3.8778109550476074, + "learning_rate": 9.965625984412623e-06, + "loss": 1.2266, + "step": 1607 + }, + { + "epoch": 0.4276595744680851, + "grad_norm": 3.860879421234131, + "learning_rate": 9.965522955341437e-06, + "loss": 1.2998, + "step": 1608 + }, + { + "epoch": 0.42792553191489363, + "grad_norm": 3.7324464321136475, + "learning_rate": 9.965419772631125e-06, + "loss": 1.3103, + "step": 1609 + }, + { + "epoch": 0.42819148936170215, + "grad_norm": 3.8030385971069336, + "learning_rate": 9.965316436284877e-06, + "loss": 1.2967, + "step": 1610 + }, + { + "epoch": 0.4284574468085106, + "grad_norm": 4.376537322998047, + "learning_rate": 9.965212946305893e-06, + "loss": 1.4258, + "step": 1611 + }, + { + "epoch": 0.42872340425531913, + "grad_norm": 4.365556716918945, + "learning_rate": 9.965109302697376e-06, + "loss": 1.3794, + "step": 1612 + }, + { + "epoch": 0.42898936170212765, + "grad_norm": 4.431367874145508, + "learning_rate": 9.96500550546253e-06, + "loss": 1.2973, + "step": 1613 + }, + { + "epoch": 0.42925531914893617, + "grad_norm": 4.084920406341553, + "learning_rate": 9.96490155460457e-06, + "loss": 1.2417, + "step": 1614 + }, + { + "epoch": 0.4295212765957447, + "grad_norm": 3.6877284049987793, + "learning_rate": 9.964797450126708e-06, + "loss": 1.2577, + "step": 1615 + }, + { + "epoch": 0.4297872340425532, + "grad_norm": 4.147090911865234, + "learning_rate": 9.964693192032168e-06, + "loss": 1.3127, + "step": 1616 + }, + { + "epoch": 0.4300531914893617, + "grad_norm": 3.9144530296325684, + "learning_rate": 9.964588780324176e-06, + "loss": 1.2333, + "step": 1617 + }, + { + "epoch": 
0.43031914893617024, + "grad_norm": 3.9510538578033447, + "learning_rate": 9.964484215005963e-06, + "loss": 1.2541, + "step": 1618 + }, + { + "epoch": 0.4305851063829787, + "grad_norm": 4.1784892082214355, + "learning_rate": 9.964379496080763e-06, + "loss": 1.3247, + "step": 1619 + }, + { + "epoch": 0.4308510638297872, + "grad_norm": 3.9380571842193604, + "learning_rate": 9.964274623551814e-06, + "loss": 1.3042, + "step": 1620 + }, + { + "epoch": 0.43111702127659574, + "grad_norm": 3.6729469299316406, + "learning_rate": 9.964169597422367e-06, + "loss": 1.2064, + "step": 1621 + }, + { + "epoch": 0.43138297872340425, + "grad_norm": 4.168332576751709, + "learning_rate": 9.964064417695666e-06, + "loss": 1.2936, + "step": 1622 + }, + { + "epoch": 0.43164893617021277, + "grad_norm": 3.7848429679870605, + "learning_rate": 9.963959084374969e-06, + "loss": 1.3055, + "step": 1623 + }, + { + "epoch": 0.4319148936170213, + "grad_norm": 3.760188579559326, + "learning_rate": 9.963853597463533e-06, + "loss": 1.2085, + "step": 1624 + }, + { + "epoch": 0.4321808510638298, + "grad_norm": 3.734712839126587, + "learning_rate": 9.963747956964623e-06, + "loss": 1.1788, + "step": 1625 + }, + { + "epoch": 0.4324468085106383, + "grad_norm": 4.398496627807617, + "learning_rate": 9.963642162881506e-06, + "loss": 1.1853, + "step": 1626 + }, + { + "epoch": 0.4327127659574468, + "grad_norm": 4.267323970794678, + "learning_rate": 9.963536215217457e-06, + "loss": 1.2317, + "step": 1627 + }, + { + "epoch": 0.4329787234042553, + "grad_norm": 4.306065082550049, + "learning_rate": 9.963430113975753e-06, + "loss": 1.5309, + "step": 1628 + }, + { + "epoch": 0.4332446808510638, + "grad_norm": 3.862356424331665, + "learning_rate": 9.963323859159679e-06, + "loss": 1.2449, + "step": 1629 + }, + { + "epoch": 0.43351063829787234, + "grad_norm": 3.6479053497314453, + "learning_rate": 9.96321745077252e-06, + "loss": 1.1502, + "step": 1630 + }, + { + "epoch": 0.43377659574468086, + "grad_norm": 3.702998399734497, + "learning_rate": 9.963110888817569e-06, + "loss": 1.1776, + "step": 1631 + }, + { + "epoch": 0.4340425531914894, + "grad_norm": 4.183767795562744, + "learning_rate": 9.963004173298125e-06, + "loss": 1.2266, + "step": 1632 + }, + { + "epoch": 0.4343085106382979, + "grad_norm": 3.9834625720977783, + "learning_rate": 9.96289730421749e-06, + "loss": 1.222, + "step": 1633 + }, + { + "epoch": 0.43457446808510636, + "grad_norm": 3.971428871154785, + "learning_rate": 9.962790281578966e-06, + "loss": 1.3843, + "step": 1634 + }, + { + "epoch": 0.4348404255319149, + "grad_norm": 3.833468437194824, + "learning_rate": 9.96268310538587e-06, + "loss": 1.3268, + "step": 1635 + }, + { + "epoch": 0.4351063829787234, + "grad_norm": 3.7899720668792725, + "learning_rate": 9.962575775641516e-06, + "loss": 1.2939, + "step": 1636 + }, + { + "epoch": 0.4353723404255319, + "grad_norm": 3.8362271785736084, + "learning_rate": 9.962468292349223e-06, + "loss": 1.2681, + "step": 1637 + }, + { + "epoch": 0.43563829787234043, + "grad_norm": 3.884549140930176, + "learning_rate": 9.96236065551232e-06, + "loss": 1.267, + "step": 1638 + }, + { + "epoch": 0.43590425531914895, + "grad_norm": 3.975801944732666, + "learning_rate": 9.962252865134136e-06, + "loss": 1.3039, + "step": 1639 + }, + { + "epoch": 0.43617021276595747, + "grad_norm": 4.278522491455078, + "learning_rate": 9.962144921218005e-06, + "loss": 1.3885, + "step": 1640 + }, + { + "epoch": 0.436436170212766, + "grad_norm": 3.9850552082061768, + "learning_rate": 9.962036823767269e-06, + "loss": 1.2586, 
+ "step": 1641 + }, + { + "epoch": 0.43670212765957445, + "grad_norm": 4.315723419189453, + "learning_rate": 9.961928572785272e-06, + "loss": 1.3281, + "step": 1642 + }, + { + "epoch": 0.43696808510638296, + "grad_norm": 3.7114546298980713, + "learning_rate": 9.96182016827536e-06, + "loss": 1.1813, + "step": 1643 + }, + { + "epoch": 0.4372340425531915, + "grad_norm": 4.079943656921387, + "learning_rate": 9.961711610240892e-06, + "loss": 1.2878, + "step": 1644 + }, + { + "epoch": 0.4375, + "grad_norm": 3.7427685260772705, + "learning_rate": 9.961602898685225e-06, + "loss": 1.3068, + "step": 1645 + }, + { + "epoch": 0.4377659574468085, + "grad_norm": 4.234682083129883, + "learning_rate": 9.961494033611726e-06, + "loss": 1.4143, + "step": 1646 + }, + { + "epoch": 0.43803191489361704, + "grad_norm": 3.7043113708496094, + "learning_rate": 9.961385015023755e-06, + "loss": 1.356, + "step": 1647 + }, + { + "epoch": 0.43829787234042555, + "grad_norm": 3.9575397968292236, + "learning_rate": 9.961275842924694e-06, + "loss": 1.3257, + "step": 1648 + }, + { + "epoch": 0.438563829787234, + "grad_norm": 4.285686016082764, + "learning_rate": 9.961166517317914e-06, + "loss": 1.2934, + "step": 1649 + }, + { + "epoch": 0.43882978723404253, + "grad_norm": 4.141624927520752, + "learning_rate": 9.961057038206804e-06, + "loss": 1.1941, + "step": 1650 + }, + { + "epoch": 0.43909574468085105, + "grad_norm": 3.7219042778015137, + "learning_rate": 9.960947405594747e-06, + "loss": 1.309, + "step": 1651 + }, + { + "epoch": 0.43936170212765957, + "grad_norm": 4.113218307495117, + "learning_rate": 9.960837619485136e-06, + "loss": 1.2331, + "step": 1652 + }, + { + "epoch": 0.4396276595744681, + "grad_norm": 4.069479465484619, + "learning_rate": 9.96072767988137e-06, + "loss": 1.1383, + "step": 1653 + }, + { + "epoch": 0.4398936170212766, + "grad_norm": 3.974097967147827, + "learning_rate": 9.960617586786847e-06, + "loss": 1.2015, + "step": 1654 + }, + { + "epoch": 0.4401595744680851, + "grad_norm": 3.991530656814575, + "learning_rate": 9.960507340204977e-06, + "loss": 1.254, + "step": 1655 + }, + { + "epoch": 0.44042553191489364, + "grad_norm": 4.121614933013916, + "learning_rate": 9.960396940139169e-06, + "loss": 1.4372, + "step": 1656 + }, + { + "epoch": 0.4406914893617021, + "grad_norm": 4.809171676635742, + "learning_rate": 9.960286386592839e-06, + "loss": 1.1771, + "step": 1657 + }, + { + "epoch": 0.4409574468085106, + "grad_norm": 3.7910423278808594, + "learning_rate": 9.960175679569409e-06, + "loss": 1.4103, + "step": 1658 + }, + { + "epoch": 0.44122340425531914, + "grad_norm": 3.5597236156463623, + "learning_rate": 9.960064819072305e-06, + "loss": 1.2461, + "step": 1659 + }, + { + "epoch": 0.44148936170212766, + "grad_norm": 4.393692493438721, + "learning_rate": 9.959953805104953e-06, + "loss": 1.3746, + "step": 1660 + }, + { + "epoch": 0.4417553191489362, + "grad_norm": 4.309146881103516, + "learning_rate": 9.959842637670791e-06, + "loss": 1.2619, + "step": 1661 + }, + { + "epoch": 0.4420212765957447, + "grad_norm": 4.537207126617432, + "learning_rate": 9.95973131677326e-06, + "loss": 1.2895, + "step": 1662 + }, + { + "epoch": 0.4422872340425532, + "grad_norm": 4.204534530639648, + "learning_rate": 9.959619842415802e-06, + "loss": 1.2458, + "step": 1663 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 3.859935998916626, + "learning_rate": 9.959508214601866e-06, + "loss": 1.2334, + "step": 1664 + }, + { + "epoch": 0.4428191489361702, + "grad_norm": 4.042413711547852, + "learning_rate": 
9.959396433334907e-06, + "loss": 1.451, + "step": 1665 + }, + { + "epoch": 0.4430851063829787, + "grad_norm": 4.226952075958252, + "learning_rate": 9.959284498618385e-06, + "loss": 1.3204, + "step": 1666 + }, + { + "epoch": 0.44335106382978723, + "grad_norm": 4.049594402313232, + "learning_rate": 9.95917241045576e-06, + "loss": 1.3671, + "step": 1667 + }, + { + "epoch": 0.44361702127659575, + "grad_norm": 3.731627941131592, + "learning_rate": 9.959060168850504e-06, + "loss": 1.289, + "step": 1668 + }, + { + "epoch": 0.44388297872340426, + "grad_norm": 4.097120761871338, + "learning_rate": 9.958947773806084e-06, + "loss": 1.2126, + "step": 1669 + }, + { + "epoch": 0.4441489361702128, + "grad_norm": 4.148438930511475, + "learning_rate": 9.958835225325984e-06, + "loss": 1.1967, + "step": 1670 + }, + { + "epoch": 0.4444148936170213, + "grad_norm": 3.9843711853027344, + "learning_rate": 9.958722523413685e-06, + "loss": 1.3463, + "step": 1671 + }, + { + "epoch": 0.44468085106382976, + "grad_norm": 4.3066630363464355, + "learning_rate": 9.958609668072673e-06, + "loss": 1.4344, + "step": 1672 + }, + { + "epoch": 0.4449468085106383, + "grad_norm": 3.673088550567627, + "learning_rate": 9.958496659306436e-06, + "loss": 1.3849, + "step": 1673 + }, + { + "epoch": 0.4452127659574468, + "grad_norm": 4.2683210372924805, + "learning_rate": 9.958383497118478e-06, + "loss": 1.3148, + "step": 1674 + }, + { + "epoch": 0.4454787234042553, + "grad_norm": 3.677374839782715, + "learning_rate": 9.958270181512295e-06, + "loss": 1.1148, + "step": 1675 + }, + { + "epoch": 0.44574468085106383, + "grad_norm": 4.075168132781982, + "learning_rate": 9.958156712491396e-06, + "loss": 1.4016, + "step": 1676 + }, + { + "epoch": 0.44601063829787235, + "grad_norm": 4.137705326080322, + "learning_rate": 9.95804309005929e-06, + "loss": 1.3865, + "step": 1677 + }, + { + "epoch": 0.44627659574468087, + "grad_norm": 3.7367939949035645, + "learning_rate": 9.957929314219494e-06, + "loss": 1.3304, + "step": 1678 + }, + { + "epoch": 0.4465425531914894, + "grad_norm": 3.8000895977020264, + "learning_rate": 9.957815384975528e-06, + "loss": 1.4171, + "step": 1679 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 3.774846315383911, + "learning_rate": 9.957701302330915e-06, + "loss": 1.0019, + "step": 1680 + }, + { + "epoch": 0.44707446808510637, + "grad_norm": 3.7514147758483887, + "learning_rate": 9.957587066289189e-06, + "loss": 1.0711, + "step": 1681 + }, + { + "epoch": 0.4473404255319149, + "grad_norm": 4.298345565795898, + "learning_rate": 9.957472676853882e-06, + "loss": 1.2902, + "step": 1682 + }, + { + "epoch": 0.4476063829787234, + "grad_norm": 3.632465362548828, + "learning_rate": 9.957358134028535e-06, + "loss": 1.1969, + "step": 1683 + }, + { + "epoch": 0.4478723404255319, + "grad_norm": 3.680661201477051, + "learning_rate": 9.957243437816688e-06, + "loss": 1.2266, + "step": 1684 + }, + { + "epoch": 0.44813829787234044, + "grad_norm": 3.757211208343506, + "learning_rate": 9.957128588221895e-06, + "loss": 1.2374, + "step": 1685 + }, + { + "epoch": 0.44840425531914896, + "grad_norm": 3.93074107170105, + "learning_rate": 9.957013585247703e-06, + "loss": 1.2285, + "step": 1686 + }, + { + "epoch": 0.4486702127659574, + "grad_norm": 4.218538284301758, + "learning_rate": 9.95689842889768e-06, + "loss": 1.1887, + "step": 1687 + }, + { + "epoch": 0.44893617021276594, + "grad_norm": 4.04231595993042, + "learning_rate": 9.95678311917538e-06, + "loss": 1.3696, + "step": 1688 + }, + { + "epoch": 0.44920212765957446, + "grad_norm": 
3.7490601539611816, + "learning_rate": 9.956667656084376e-06, + "loss": 1.2857, + "step": 1689 + }, + { + "epoch": 0.449468085106383, + "grad_norm": 3.642409324645996, + "learning_rate": 9.956552039628237e-06, + "loss": 1.1536, + "step": 1690 + }, + { + "epoch": 0.4497340425531915, + "grad_norm": 4.070724964141846, + "learning_rate": 9.956436269810543e-06, + "loss": 1.3129, + "step": 1691 + }, + { + "epoch": 0.45, + "grad_norm": 3.6677682399749756, + "learning_rate": 9.956320346634877e-06, + "loss": 1.2578, + "step": 1692 + }, + { + "epoch": 0.45026595744680853, + "grad_norm": 3.783087730407715, + "learning_rate": 9.956204270104823e-06, + "loss": 1.2943, + "step": 1693 + }, + { + "epoch": 0.45053191489361705, + "grad_norm": 4.206989765167236, + "learning_rate": 9.956088040223975e-06, + "loss": 1.4913, + "step": 1694 + }, + { + "epoch": 0.4507978723404255, + "grad_norm": 4.3370819091796875, + "learning_rate": 9.955971656995927e-06, + "loss": 1.1996, + "step": 1695 + }, + { + "epoch": 0.451063829787234, + "grad_norm": 3.9697062969207764, + "learning_rate": 9.95585512042428e-06, + "loss": 1.253, + "step": 1696 + }, + { + "epoch": 0.45132978723404255, + "grad_norm": 3.6939969062805176, + "learning_rate": 9.95573843051264e-06, + "loss": 1.1627, + "step": 1697 + }, + { + "epoch": 0.45159574468085106, + "grad_norm": 4.0041351318359375, + "learning_rate": 9.955621587264621e-06, + "loss": 1.2185, + "step": 1698 + }, + { + "epoch": 0.4518617021276596, + "grad_norm": 4.0276079177856445, + "learning_rate": 9.955504590683834e-06, + "loss": 1.2071, + "step": 1699 + }, + { + "epoch": 0.4521276595744681, + "grad_norm": 4.058544158935547, + "learning_rate": 9.955387440773902e-06, + "loss": 1.2284, + "step": 1700 + }, + { + "epoch": 0.4523936170212766, + "grad_norm": 3.8239941596984863, + "learning_rate": 9.955270137538446e-06, + "loss": 1.3371, + "step": 1701 + }, + { + "epoch": 0.4526595744680851, + "grad_norm": 4.147292613983154, + "learning_rate": 9.955152680981099e-06, + "loss": 1.3542, + "step": 1702 + }, + { + "epoch": 0.4529255319148936, + "grad_norm": 3.7271342277526855, + "learning_rate": 9.955035071105495e-06, + "loss": 1.0038, + "step": 1703 + }, + { + "epoch": 0.4531914893617021, + "grad_norm": 4.002806663513184, + "learning_rate": 9.954917307915272e-06, + "loss": 1.3361, + "step": 1704 + }, + { + "epoch": 0.45345744680851063, + "grad_norm": 3.8606765270233154, + "learning_rate": 9.954799391414073e-06, + "loss": 1.2703, + "step": 1705 + }, + { + "epoch": 0.45372340425531915, + "grad_norm": 4.117914199829102, + "learning_rate": 9.954681321605546e-06, + "loss": 1.4262, + "step": 1706 + }, + { + "epoch": 0.45398936170212767, + "grad_norm": 3.956178903579712, + "learning_rate": 9.954563098493349e-06, + "loss": 1.2889, + "step": 1707 + }, + { + "epoch": 0.4542553191489362, + "grad_norm": 3.8659157752990723, + "learning_rate": 9.954444722081133e-06, + "loss": 1.2892, + "step": 1708 + }, + { + "epoch": 0.4545212765957447, + "grad_norm": 3.936624765396118, + "learning_rate": 9.954326192372565e-06, + "loss": 1.5031, + "step": 1709 + }, + { + "epoch": 0.45478723404255317, + "grad_norm": 3.8671083450317383, + "learning_rate": 9.954207509371313e-06, + "loss": 1.3221, + "step": 1710 + }, + { + "epoch": 0.4550531914893617, + "grad_norm": 4.292788505554199, + "learning_rate": 9.954088673081048e-06, + "loss": 1.3216, + "step": 1711 + }, + { + "epoch": 0.4553191489361702, + "grad_norm": 3.8020899295806885, + "learning_rate": 9.953969683505444e-06, + "loss": 1.2248, + "step": 1712 + }, + { + "epoch": 
0.4555851063829787, + "grad_norm": 4.227027893066406, + "learning_rate": 9.953850540648189e-06, + "loss": 1.2624, + "step": 1713 + }, + { + "epoch": 0.45585106382978724, + "grad_norm": 4.067933559417725, + "learning_rate": 9.953731244512963e-06, + "loss": 1.2756, + "step": 1714 + }, + { + "epoch": 0.45611702127659576, + "grad_norm": 3.9916749000549316, + "learning_rate": 9.953611795103462e-06, + "loss": 1.2651, + "step": 1715 + }, + { + "epoch": 0.4563829787234043, + "grad_norm": 4.110116004943848, + "learning_rate": 9.953492192423379e-06, + "loss": 1.3669, + "step": 1716 + }, + { + "epoch": 0.4566489361702128, + "grad_norm": 4.194306373596191, + "learning_rate": 9.953372436476414e-06, + "loss": 1.534, + "step": 1717 + }, + { + "epoch": 0.45691489361702126, + "grad_norm": 3.9467716217041016, + "learning_rate": 9.953252527266275e-06, + "loss": 1.2748, + "step": 1718 + }, + { + "epoch": 0.4571808510638298, + "grad_norm": 4.1253886222839355, + "learning_rate": 9.953132464796674e-06, + "loss": 1.2625, + "step": 1719 + }, + { + "epoch": 0.4574468085106383, + "grad_norm": 4.45941162109375, + "learning_rate": 9.95301224907132e-06, + "loss": 1.3565, + "step": 1720 + }, + { + "epoch": 0.4577127659574468, + "grad_norm": 4.033083915710449, + "learning_rate": 9.952891880093935e-06, + "loss": 1.2789, + "step": 1721 + }, + { + "epoch": 0.45797872340425533, + "grad_norm": 4.035634517669678, + "learning_rate": 9.952771357868245e-06, + "loss": 1.2641, + "step": 1722 + }, + { + "epoch": 0.45824468085106385, + "grad_norm": 3.722550630569458, + "learning_rate": 9.952650682397978e-06, + "loss": 1.3316, + "step": 1723 + }, + { + "epoch": 0.45851063829787236, + "grad_norm": 3.8771049976348877, + "learning_rate": 9.952529853686868e-06, + "loss": 1.3889, + "step": 1724 + }, + { + "epoch": 0.4587765957446808, + "grad_norm": 4.175072193145752, + "learning_rate": 9.952408871738652e-06, + "loss": 1.3766, + "step": 1725 + }, + { + "epoch": 0.45904255319148934, + "grad_norm": 3.859618902206421, + "learning_rate": 9.952287736557078e-06, + "loss": 1.1251, + "step": 1726 + }, + { + "epoch": 0.45930851063829786, + "grad_norm": 4.060375213623047, + "learning_rate": 9.952166448145887e-06, + "loss": 1.2308, + "step": 1727 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 3.9827208518981934, + "learning_rate": 9.952045006508839e-06, + "loss": 1.2434, + "step": 1728 + }, + { + "epoch": 0.4598404255319149, + "grad_norm": 3.8347811698913574, + "learning_rate": 9.951923411649686e-06, + "loss": 1.1165, + "step": 1729 + }, + { + "epoch": 0.4601063829787234, + "grad_norm": 3.8551104068756104, + "learning_rate": 9.951801663572194e-06, + "loss": 1.2536, + "step": 1730 + }, + { + "epoch": 0.46037234042553193, + "grad_norm": 4.300414562225342, + "learning_rate": 9.951679762280127e-06, + "loss": 1.3653, + "step": 1731 + }, + { + "epoch": 0.46063829787234045, + "grad_norm": 3.9349825382232666, + "learning_rate": 9.95155770777726e-06, + "loss": 1.1563, + "step": 1732 + }, + { + "epoch": 0.4609042553191489, + "grad_norm": 4.161105632781982, + "learning_rate": 9.951435500067366e-06, + "loss": 1.3807, + "step": 1733 + }, + { + "epoch": 0.46117021276595743, + "grad_norm": 4.0084686279296875, + "learning_rate": 9.95131313915423e-06, + "loss": 1.2486, + "step": 1734 + }, + { + "epoch": 0.46143617021276595, + "grad_norm": 3.6559159755706787, + "learning_rate": 9.951190625041634e-06, + "loss": 1.2063, + "step": 1735 + }, + { + "epoch": 0.46170212765957447, + "grad_norm": 3.99893856048584, + "learning_rate": 9.95106795773337e-06, + "loss": 
1.2945, + "step": 1736 + }, + { + "epoch": 0.461968085106383, + "grad_norm": 4.061460018157959, + "learning_rate": 9.950945137233237e-06, + "loss": 1.3383, + "step": 1737 + }, + { + "epoch": 0.4622340425531915, + "grad_norm": 4.054213047027588, + "learning_rate": 9.950822163545032e-06, + "loss": 1.2836, + "step": 1738 + }, + { + "epoch": 0.4625, + "grad_norm": 3.9057390689849854, + "learning_rate": 9.95069903667256e-06, + "loss": 1.2157, + "step": 1739 + }, + { + "epoch": 0.4627659574468085, + "grad_norm": 3.977504014968872, + "learning_rate": 9.95057575661963e-06, + "loss": 1.322, + "step": 1740 + }, + { + "epoch": 0.463031914893617, + "grad_norm": 3.478853702545166, + "learning_rate": 9.950452323390058e-06, + "loss": 1.1772, + "step": 1741 + }, + { + "epoch": 0.4632978723404255, + "grad_norm": 3.8592848777770996, + "learning_rate": 9.950328736987664e-06, + "loss": 1.3234, + "step": 1742 + }, + { + "epoch": 0.46356382978723404, + "grad_norm": 3.858339309692383, + "learning_rate": 9.95020499741627e-06, + "loss": 1.3079, + "step": 1743 + }, + { + "epoch": 0.46382978723404256, + "grad_norm": 3.797468900680542, + "learning_rate": 9.950081104679704e-06, + "loss": 1.1611, + "step": 1744 + }, + { + "epoch": 0.4640957446808511, + "grad_norm": 3.9753012657165527, + "learning_rate": 9.949957058781802e-06, + "loss": 1.3449, + "step": 1745 + }, + { + "epoch": 0.4643617021276596, + "grad_norm": 4.22615385055542, + "learning_rate": 9.9498328597264e-06, + "loss": 1.1605, + "step": 1746 + }, + { + "epoch": 0.4646276595744681, + "grad_norm": 4.091019153594971, + "learning_rate": 9.949708507517342e-06, + "loss": 1.2877, + "step": 1747 + }, + { + "epoch": 0.4648936170212766, + "grad_norm": 4.121149063110352, + "learning_rate": 9.949584002158474e-06, + "loss": 1.2463, + "step": 1748 + }, + { + "epoch": 0.4651595744680851, + "grad_norm": 4.406885147094727, + "learning_rate": 9.949459343653652e-06, + "loss": 1.3303, + "step": 1749 + }, + { + "epoch": 0.4654255319148936, + "grad_norm": 4.5540666580200195, + "learning_rate": 9.94933453200673e-06, + "loss": 1.3149, + "step": 1750 + }, + { + "epoch": 0.4656914893617021, + "grad_norm": 3.9736440181732178, + "learning_rate": 9.949209567221569e-06, + "loss": 1.4947, + "step": 1751 + }, + { + "epoch": 0.46595744680851064, + "grad_norm": 4.265797138214111, + "learning_rate": 9.949084449302038e-06, + "loss": 1.2727, + "step": 1752 + }, + { + "epoch": 0.46622340425531916, + "grad_norm": 3.906663656234741, + "learning_rate": 9.948959178252007e-06, + "loss": 1.2346, + "step": 1753 + }, + { + "epoch": 0.4664893617021277, + "grad_norm": 3.8884990215301514, + "learning_rate": 9.948833754075351e-06, + "loss": 1.2997, + "step": 1754 + }, + { + "epoch": 0.46675531914893614, + "grad_norm": 3.943458080291748, + "learning_rate": 9.948708176775954e-06, + "loss": 1.2945, + "step": 1755 + }, + { + "epoch": 0.46702127659574466, + "grad_norm": 3.9176204204559326, + "learning_rate": 9.9485824463577e-06, + "loss": 1.2714, + "step": 1756 + }, + { + "epoch": 0.4672872340425532, + "grad_norm": 3.834636926651001, + "learning_rate": 9.948456562824478e-06, + "loss": 1.1341, + "step": 1757 + }, + { + "epoch": 0.4675531914893617, + "grad_norm": 3.8121955394744873, + "learning_rate": 9.948330526180183e-06, + "loss": 1.3064, + "step": 1758 + }, + { + "epoch": 0.4678191489361702, + "grad_norm": 4.121542930603027, + "learning_rate": 9.948204336428717e-06, + "loss": 1.2775, + "step": 1759 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 4.043048858642578, + "learning_rate": 
9.948077993573983e-06, + "loss": 1.2601, + "step": 1760 + }, + { + "epoch": 0.46835106382978725, + "grad_norm": 3.7144079208374023, + "learning_rate": 9.94795149761989e-06, + "loss": 1.1136, + "step": 1761 + }, + { + "epoch": 0.46861702127659577, + "grad_norm": 4.818117141723633, + "learning_rate": 9.947824848570352e-06, + "loss": 1.4366, + "step": 1762 + }, + { + "epoch": 0.46888297872340423, + "grad_norm": 4.190409183502197, + "learning_rate": 9.947698046429287e-06, + "loss": 1.2308, + "step": 1763 + }, + { + "epoch": 0.46914893617021275, + "grad_norm": 4.0341267585754395, + "learning_rate": 9.94757109120062e-06, + "loss": 1.2466, + "step": 1764 + }, + { + "epoch": 0.46941489361702127, + "grad_norm": 3.9223225116729736, + "learning_rate": 9.947443982888279e-06, + "loss": 1.212, + "step": 1765 + }, + { + "epoch": 0.4696808510638298, + "grad_norm": 4.121956825256348, + "learning_rate": 9.947316721496196e-06, + "loss": 1.2635, + "step": 1766 + }, + { + "epoch": 0.4699468085106383, + "grad_norm": 3.9485208988189697, + "learning_rate": 9.947189307028308e-06, + "loss": 1.3579, + "step": 1767 + }, + { + "epoch": 0.4702127659574468, + "grad_norm": 4.009948253631592, + "learning_rate": 9.947061739488559e-06, + "loss": 1.4448, + "step": 1768 + }, + { + "epoch": 0.47047872340425534, + "grad_norm": 4.2954912185668945, + "learning_rate": 9.946934018880896e-06, + "loss": 1.1665, + "step": 1769 + }, + { + "epoch": 0.47074468085106386, + "grad_norm": 3.6225626468658447, + "learning_rate": 9.94680614520927e-06, + "loss": 1.2863, + "step": 1770 + }, + { + "epoch": 0.4710106382978723, + "grad_norm": 3.9409780502319336, + "learning_rate": 9.946678118477635e-06, + "loss": 1.1042, + "step": 1771 + }, + { + "epoch": 0.47127659574468084, + "grad_norm": 3.5868918895721436, + "learning_rate": 9.946549938689958e-06, + "loss": 1.1924, + "step": 1772 + }, + { + "epoch": 0.47154255319148936, + "grad_norm": 3.5596354007720947, + "learning_rate": 9.946421605850201e-06, + "loss": 1.1459, + "step": 1773 + }, + { + "epoch": 0.4718085106382979, + "grad_norm": 3.595719337463379, + "learning_rate": 9.946293119962336e-06, + "loss": 1.2274, + "step": 1774 + }, + { + "epoch": 0.4720744680851064, + "grad_norm": 4.341657638549805, + "learning_rate": 9.946164481030339e-06, + "loss": 1.433, + "step": 1775 + }, + { + "epoch": 0.4723404255319149, + "grad_norm": 4.137777328491211, + "learning_rate": 9.946035689058189e-06, + "loss": 1.3307, + "step": 1776 + }, + { + "epoch": 0.4726063829787234, + "grad_norm": 4.115199565887451, + "learning_rate": 9.94590674404987e-06, + "loss": 1.3575, + "step": 1777 + }, + { + "epoch": 0.4728723404255319, + "grad_norm": 3.9467270374298096, + "learning_rate": 9.945777646009375e-06, + "loss": 1.1772, + "step": 1778 + }, + { + "epoch": 0.4731382978723404, + "grad_norm": 3.986268997192383, + "learning_rate": 9.945648394940697e-06, + "loss": 1.3949, + "step": 1779 + }, + { + "epoch": 0.4734042553191489, + "grad_norm": 4.070546627044678, + "learning_rate": 9.945518990847835e-06, + "loss": 1.3664, + "step": 1780 + }, + { + "epoch": 0.47367021276595744, + "grad_norm": 4.0783233642578125, + "learning_rate": 9.94538943373479e-06, + "loss": 1.3199, + "step": 1781 + }, + { + "epoch": 0.47393617021276596, + "grad_norm": 4.331148147583008, + "learning_rate": 9.945259723605579e-06, + "loss": 1.3809, + "step": 1782 + }, + { + "epoch": 0.4742021276595745, + "grad_norm": 4.163266658782959, + "learning_rate": 9.945129860464205e-06, + "loss": 1.3325, + "step": 1783 + }, + { + "epoch": 0.474468085106383, + "grad_norm": 
4.23274564743042, + "learning_rate": 9.944999844314693e-06, + "loss": 1.3793, + "step": 1784 + }, + { + "epoch": 0.4747340425531915, + "grad_norm": 4.219319820404053, + "learning_rate": 9.944869675161062e-06, + "loss": 1.3631, + "step": 1785 + }, + { + "epoch": 0.475, + "grad_norm": 4.5794830322265625, + "learning_rate": 9.944739353007344e-06, + "loss": 1.3941, + "step": 1786 + }, + { + "epoch": 0.4752659574468085, + "grad_norm": 3.806102752685547, + "learning_rate": 9.944608877857567e-06, + "loss": 1.2896, + "step": 1787 + }, + { + "epoch": 0.475531914893617, + "grad_norm": 3.927706241607666, + "learning_rate": 9.94447824971577e-06, + "loss": 1.4121, + "step": 1788 + }, + { + "epoch": 0.47579787234042553, + "grad_norm": 3.8713526725769043, + "learning_rate": 9.944347468585995e-06, + "loss": 1.3029, + "step": 1789 + }, + { + "epoch": 0.47606382978723405, + "grad_norm": 3.6732828617095947, + "learning_rate": 9.944216534472287e-06, + "loss": 1.2379, + "step": 1790 + }, + { + "epoch": 0.47632978723404257, + "grad_norm": 4.1793084144592285, + "learning_rate": 9.9440854473787e-06, + "loss": 1.391, + "step": 1791 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 4.131939888000488, + "learning_rate": 9.943954207309287e-06, + "loss": 1.2346, + "step": 1792 + }, + { + "epoch": 0.47686170212765955, + "grad_norm": 4.083577632904053, + "learning_rate": 9.94382281426811e-06, + "loss": 1.4478, + "step": 1793 + }, + { + "epoch": 0.47712765957446807, + "grad_norm": 3.640902280807495, + "learning_rate": 9.943691268259234e-06, + "loss": 1.2515, + "step": 1794 + }, + { + "epoch": 0.4773936170212766, + "grad_norm": 4.226308345794678, + "learning_rate": 9.943559569286731e-06, + "loss": 1.3599, + "step": 1795 + }, + { + "epoch": 0.4776595744680851, + "grad_norm": 4.301510810852051, + "learning_rate": 9.943427717354674e-06, + "loss": 1.2623, + "step": 1796 + }, + { + "epoch": 0.4779255319148936, + "grad_norm": 3.6332836151123047, + "learning_rate": 9.943295712467145e-06, + "loss": 1.2776, + "step": 1797 + }, + { + "epoch": 0.47819148936170214, + "grad_norm": 3.6086063385009766, + "learning_rate": 9.943163554628223e-06, + "loss": 1.2306, + "step": 1798 + }, + { + "epoch": 0.47845744680851066, + "grad_norm": 3.787510395050049, + "learning_rate": 9.943031243842004e-06, + "loss": 1.3904, + "step": 1799 + }, + { + "epoch": 0.4787234042553192, + "grad_norm": 4.257116317749023, + "learning_rate": 9.942898780112578e-06, + "loss": 1.2504, + "step": 1800 + }, + { + "epoch": 0.47898936170212764, + "grad_norm": 4.033913612365723, + "learning_rate": 9.942766163444044e-06, + "loss": 1.1252, + "step": 1801 + }, + { + "epoch": 0.47925531914893615, + "grad_norm": 3.9039859771728516, + "learning_rate": 9.942633393840504e-06, + "loss": 1.2183, + "step": 1802 + }, + { + "epoch": 0.47952127659574467, + "grad_norm": 4.116021156311035, + "learning_rate": 9.94250047130607e-06, + "loss": 1.3872, + "step": 1803 + }, + { + "epoch": 0.4797872340425532, + "grad_norm": 4.146193504333496, + "learning_rate": 9.94236739584485e-06, + "loss": 1.2302, + "step": 1804 + }, + { + "epoch": 0.4800531914893617, + "grad_norm": 4.098079681396484, + "learning_rate": 9.942234167460966e-06, + "loss": 1.3785, + "step": 1805 + }, + { + "epoch": 0.4803191489361702, + "grad_norm": 3.643486976623535, + "learning_rate": 9.942100786158537e-06, + "loss": 1.1499, + "step": 1806 + }, + { + "epoch": 0.48058510638297874, + "grad_norm": 4.246469974517822, + "learning_rate": 9.94196725194169e-06, + "loss": 1.3295, + "step": 1807 + }, + { + "epoch": 
0.4808510638297872, + "grad_norm": 3.857382297515869, + "learning_rate": 9.94183356481456e-06, + "loss": 1.325, + "step": 1808 + }, + { + "epoch": 0.4811170212765957, + "grad_norm": 3.5324032306671143, + "learning_rate": 9.94169972478128e-06, + "loss": 1.1482, + "step": 1809 + }, + { + "epoch": 0.48138297872340424, + "grad_norm": 3.7972612380981445, + "learning_rate": 9.941565731845993e-06, + "loss": 1.4476, + "step": 1810 + }, + { + "epoch": 0.48164893617021276, + "grad_norm": 3.770042896270752, + "learning_rate": 9.941431586012844e-06, + "loss": 1.3034, + "step": 1811 + }, + { + "epoch": 0.4819148936170213, + "grad_norm": 3.675645351409912, + "learning_rate": 9.941297287285984e-06, + "loss": 1.2526, + "step": 1812 + }, + { + "epoch": 0.4821808510638298, + "grad_norm": 3.526350975036621, + "learning_rate": 9.941162835669568e-06, + "loss": 1.1573, + "step": 1813 + }, + { + "epoch": 0.4824468085106383, + "grad_norm": 3.4532649517059326, + "learning_rate": 9.941028231167756e-06, + "loss": 1.1735, + "step": 1814 + }, + { + "epoch": 0.48271276595744683, + "grad_norm": 3.9783992767333984, + "learning_rate": 9.940893473784714e-06, + "loss": 1.3828, + "step": 1815 + }, + { + "epoch": 0.4829787234042553, + "grad_norm": 4.059201717376709, + "learning_rate": 9.940758563524611e-06, + "loss": 1.2649, + "step": 1816 + }, + { + "epoch": 0.4832446808510638, + "grad_norm": 4.069849491119385, + "learning_rate": 9.94062350039162e-06, + "loss": 1.2833, + "step": 1817 + }, + { + "epoch": 0.48351063829787233, + "grad_norm": 3.488699197769165, + "learning_rate": 9.940488284389923e-06, + "loss": 1.0884, + "step": 1818 + }, + { + "epoch": 0.48377659574468085, + "grad_norm": 3.721902370452881, + "learning_rate": 9.940352915523699e-06, + "loss": 1.2442, + "step": 1819 + }, + { + "epoch": 0.48404255319148937, + "grad_norm": 4.082354545593262, + "learning_rate": 9.94021739379714e-06, + "loss": 1.3406, + "step": 1820 + }, + { + "epoch": 0.4843085106382979, + "grad_norm": 3.9286141395568848, + "learning_rate": 9.94008171921444e-06, + "loss": 1.2856, + "step": 1821 + }, + { + "epoch": 0.4845744680851064, + "grad_norm": 3.968208074569702, + "learning_rate": 9.939945891779795e-06, + "loss": 1.3172, + "step": 1822 + }, + { + "epoch": 0.4848404255319149, + "grad_norm": 4.114230155944824, + "learning_rate": 9.939809911497407e-06, + "loss": 1.2936, + "step": 1823 + }, + { + "epoch": 0.4851063829787234, + "grad_norm": 3.840162754058838, + "learning_rate": 9.939673778371484e-06, + "loss": 1.3923, + "step": 1824 + }, + { + "epoch": 0.4853723404255319, + "grad_norm": 4.272914886474609, + "learning_rate": 9.939537492406239e-06, + "loss": 1.2932, + "step": 1825 + }, + { + "epoch": 0.4856382978723404, + "grad_norm": 3.7386868000030518, + "learning_rate": 9.939401053605889e-06, + "loss": 1.3849, + "step": 1826 + }, + { + "epoch": 0.48590425531914894, + "grad_norm": 4.278271675109863, + "learning_rate": 9.939264461974654e-06, + "loss": 1.2878, + "step": 1827 + }, + { + "epoch": 0.48617021276595745, + "grad_norm": 3.827216386795044, + "learning_rate": 9.939127717516763e-06, + "loss": 1.2833, + "step": 1828 + }, + { + "epoch": 0.486436170212766, + "grad_norm": 3.888113498687744, + "learning_rate": 9.938990820236445e-06, + "loss": 1.2384, + "step": 1829 + }, + { + "epoch": 0.4867021276595745, + "grad_norm": 3.886965036392212, + "learning_rate": 9.938853770137935e-06, + "loss": 1.3365, + "step": 1830 + }, + { + "epoch": 0.48696808510638295, + "grad_norm": 3.9059507846832275, + "learning_rate": 9.938716567225475e-06, + "loss": 1.3569, + 
"step": 1831 + }, + { + "epoch": 0.48723404255319147, + "grad_norm": 3.922834634780884, + "learning_rate": 9.93857921150331e-06, + "loss": 1.2035, + "step": 1832 + }, + { + "epoch": 0.4875, + "grad_norm": 3.949385643005371, + "learning_rate": 9.938441702975689e-06, + "loss": 1.3485, + "step": 1833 + }, + { + "epoch": 0.4877659574468085, + "grad_norm": 4.1959333419799805, + "learning_rate": 9.938304041646869e-06, + "loss": 1.3079, + "step": 1834 + }, + { + "epoch": 0.488031914893617, + "grad_norm": 3.98871111869812, + "learning_rate": 9.938166227521106e-06, + "loss": 1.3067, + "step": 1835 + }, + { + "epoch": 0.48829787234042554, + "grad_norm": 4.129928112030029, + "learning_rate": 9.938028260602668e-06, + "loss": 1.3053, + "step": 1836 + }, + { + "epoch": 0.48856382978723406, + "grad_norm": 4.131626129150391, + "learning_rate": 9.937890140895819e-06, + "loss": 1.3332, + "step": 1837 + }, + { + "epoch": 0.4888297872340426, + "grad_norm": 3.8896591663360596, + "learning_rate": 9.937751868404838e-06, + "loss": 1.2105, + "step": 1838 + }, + { + "epoch": 0.48909574468085104, + "grad_norm": 3.6959292888641357, + "learning_rate": 9.937613443134e-06, + "loss": 1.1607, + "step": 1839 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 4.914716720581055, + "learning_rate": 9.937474865087588e-06, + "loss": 1.1406, + "step": 1840 + }, + { + "epoch": 0.4896276595744681, + "grad_norm": 3.811239004135132, + "learning_rate": 9.93733613426989e-06, + "loss": 1.2047, + "step": 1841 + }, + { + "epoch": 0.4898936170212766, + "grad_norm": 3.8995115756988525, + "learning_rate": 9.937197250685202e-06, + "loss": 1.1582, + "step": 1842 + }, + { + "epoch": 0.4901595744680851, + "grad_norm": 3.6087286472320557, + "learning_rate": 9.937058214337817e-06, + "loss": 1.1866, + "step": 1843 + }, + { + "epoch": 0.49042553191489363, + "grad_norm": 3.854526996612549, + "learning_rate": 9.936919025232036e-06, + "loss": 1.2744, + "step": 1844 + }, + { + "epoch": 0.49069148936170215, + "grad_norm": 3.870508909225464, + "learning_rate": 9.936779683372169e-06, + "loss": 1.1989, + "step": 1845 + }, + { + "epoch": 0.4909574468085106, + "grad_norm": 4.0505194664001465, + "learning_rate": 9.936640188762527e-06, + "loss": 1.206, + "step": 1846 + }, + { + "epoch": 0.49122340425531913, + "grad_norm": 3.8995118141174316, + "learning_rate": 9.936500541407424e-06, + "loss": 1.1642, + "step": 1847 + }, + { + "epoch": 0.49148936170212765, + "grad_norm": 4.045437812805176, + "learning_rate": 9.936360741311185e-06, + "loss": 1.2949, + "step": 1848 + }, + { + "epoch": 0.49175531914893617, + "grad_norm": 3.954519271850586, + "learning_rate": 9.93622078847813e-06, + "loss": 1.3334, + "step": 1849 + }, + { + "epoch": 0.4920212765957447, + "grad_norm": 3.9482545852661133, + "learning_rate": 9.936080682912594e-06, + "loss": 1.2859, + "step": 1850 + }, + { + "epoch": 0.4922872340425532, + "grad_norm": 3.7565512657165527, + "learning_rate": 9.935940424618908e-06, + "loss": 1.1294, + "step": 1851 + }, + { + "epoch": 0.4925531914893617, + "grad_norm": 4.012822151184082, + "learning_rate": 9.935800013601415e-06, + "loss": 1.4283, + "step": 1852 + }, + { + "epoch": 0.49281914893617024, + "grad_norm": 3.7840845584869385, + "learning_rate": 9.935659449864458e-06, + "loss": 1.332, + "step": 1853 + }, + { + "epoch": 0.4930851063829787, + "grad_norm": 4.097705364227295, + "learning_rate": 9.935518733412387e-06, + "loss": 1.1062, + "step": 1854 + }, + { + "epoch": 0.4933510638297872, + "grad_norm": 4.073275089263916, + "learning_rate": 9.935377864249558e-06, 
+ "loss": 1.4567, + "step": 1855 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 4.020910263061523, + "learning_rate": 9.935236842380325e-06, + "loss": 1.247, + "step": 1856 + }, + { + "epoch": 0.49388297872340425, + "grad_norm": 4.380120277404785, + "learning_rate": 9.935095667809053e-06, + "loss": 1.2439, + "step": 1857 + }, + { + "epoch": 0.49414893617021277, + "grad_norm": 3.8681838512420654, + "learning_rate": 9.934954340540111e-06, + "loss": 1.3522, + "step": 1858 + }, + { + "epoch": 0.4944148936170213, + "grad_norm": 3.7794203758239746, + "learning_rate": 9.934812860577871e-06, + "loss": 1.1068, + "step": 1859 + }, + { + "epoch": 0.4946808510638298, + "grad_norm": 3.9970266819000244, + "learning_rate": 9.934671227926714e-06, + "loss": 1.228, + "step": 1860 + }, + { + "epoch": 0.4949468085106383, + "grad_norm": 4.03349494934082, + "learning_rate": 9.934529442591016e-06, + "loss": 1.5158, + "step": 1861 + }, + { + "epoch": 0.4952127659574468, + "grad_norm": 3.6862449645996094, + "learning_rate": 9.934387504575169e-06, + "loss": 1.3988, + "step": 1862 + }, + { + "epoch": 0.4954787234042553, + "grad_norm": 3.7959797382354736, + "learning_rate": 9.934245413883561e-06, + "loss": 1.2412, + "step": 1863 + }, + { + "epoch": 0.4957446808510638, + "grad_norm": 3.952791929244995, + "learning_rate": 9.934103170520592e-06, + "loss": 1.3866, + "step": 1864 + }, + { + "epoch": 0.49601063829787234, + "grad_norm": 3.7724785804748535, + "learning_rate": 9.933960774490663e-06, + "loss": 1.1724, + "step": 1865 + }, + { + "epoch": 0.49627659574468086, + "grad_norm": 3.9937689304351807, + "learning_rate": 9.933818225798178e-06, + "loss": 1.3353, + "step": 1866 + }, + { + "epoch": 0.4965425531914894, + "grad_norm": 3.818441152572632, + "learning_rate": 9.933675524447549e-06, + "loss": 1.205, + "step": 1867 + }, + { + "epoch": 0.4968085106382979, + "grad_norm": 3.97725772857666, + "learning_rate": 9.933532670443188e-06, + "loss": 1.289, + "step": 1868 + }, + { + "epoch": 0.49707446808510636, + "grad_norm": 3.930464744567871, + "learning_rate": 9.93338966378952e-06, + "loss": 1.5099, + "step": 1869 + }, + { + "epoch": 0.4973404255319149, + "grad_norm": 4.353559494018555, + "learning_rate": 9.933246504490966e-06, + "loss": 1.4003, + "step": 1870 + }, + { + "epoch": 0.4976063829787234, + "grad_norm": 3.9544339179992676, + "learning_rate": 9.933103192551958e-06, + "loss": 1.1387, + "step": 1871 + }, + { + "epoch": 0.4978723404255319, + "grad_norm": 3.9833321571350098, + "learning_rate": 9.932959727976928e-06, + "loss": 1.2584, + "step": 1872 + }, + { + "epoch": 0.49813829787234043, + "grad_norm": 3.862346887588501, + "learning_rate": 9.932816110770317e-06, + "loss": 1.4073, + "step": 1873 + }, + { + "epoch": 0.49840425531914895, + "grad_norm": 3.7747912406921387, + "learning_rate": 9.932672340936568e-06, + "loss": 1.2541, + "step": 1874 + }, + { + "epoch": 0.49867021276595747, + "grad_norm": 4.324585437774658, + "learning_rate": 9.93252841848013e-06, + "loss": 1.4344, + "step": 1875 + }, + { + "epoch": 0.498936170212766, + "grad_norm": 4.572371006011963, + "learning_rate": 9.932384343405452e-06, + "loss": 1.246, + "step": 1876 + }, + { + "epoch": 0.49920212765957445, + "grad_norm": 4.566850662231445, + "learning_rate": 9.932240115716998e-06, + "loss": 1.2813, + "step": 1877 + }, + { + "epoch": 0.49946808510638296, + "grad_norm": 3.940889358520508, + "learning_rate": 9.932095735419228e-06, + "loss": 1.1925, + "step": 1878 + }, + { + "epoch": 0.4997340425531915, + "grad_norm": 3.6935203075408936, + 
"learning_rate": 9.93195120251661e-06, + "loss": 1.2649, + "step": 1879 + }, + { + "epoch": 0.5, + "grad_norm": 4.11472749710083, + "learning_rate": 9.931806517013612e-06, + "loss": 1.3672, + "step": 1880 + }, + { + "epoch": 0.5002659574468085, + "grad_norm": 4.156626224517822, + "learning_rate": 9.931661678914717e-06, + "loss": 1.4258, + "step": 1881 + }, + { + "epoch": 0.500531914893617, + "grad_norm": 4.2577805519104, + "learning_rate": 9.9315166882244e-06, + "loss": 1.3524, + "step": 1882 + }, + { + "epoch": 0.5007978723404255, + "grad_norm": 3.9902119636535645, + "learning_rate": 9.931371544947154e-06, + "loss": 1.2988, + "step": 1883 + }, + { + "epoch": 0.5010638297872341, + "grad_norm": 4.20100736618042, + "learning_rate": 9.931226249087465e-06, + "loss": 1.3102, + "step": 1884 + }, + { + "epoch": 0.5013297872340425, + "grad_norm": 4.172153949737549, + "learning_rate": 9.93108080064983e-06, + "loss": 1.2019, + "step": 1885 + }, + { + "epoch": 0.5015957446808511, + "grad_norm": 4.27764892578125, + "learning_rate": 9.93093519963875e-06, + "loss": 1.2075, + "step": 1886 + }, + { + "epoch": 0.5018617021276596, + "grad_norm": 4.327826023101807, + "learning_rate": 9.930789446058729e-06, + "loss": 1.2459, + "step": 1887 + }, + { + "epoch": 0.502127659574468, + "grad_norm": 4.269448757171631, + "learning_rate": 9.930643539914276e-06, + "loss": 1.4385, + "step": 1888 + }, + { + "epoch": 0.5023936170212766, + "grad_norm": 3.7377564907073975, + "learning_rate": 9.930497481209908e-06, + "loss": 1.2267, + "step": 1889 + }, + { + "epoch": 0.5026595744680851, + "grad_norm": 3.958397388458252, + "learning_rate": 9.930351269950144e-06, + "loss": 1.3289, + "step": 1890 + }, + { + "epoch": 0.5029255319148936, + "grad_norm": 3.992171049118042, + "learning_rate": 9.930204906139506e-06, + "loss": 1.2989, + "step": 1891 + }, + { + "epoch": 0.5031914893617021, + "grad_norm": 3.8019278049468994, + "learning_rate": 9.930058389782523e-06, + "loss": 1.3542, + "step": 1892 + }, + { + "epoch": 0.5034574468085107, + "grad_norm": 3.7610788345336914, + "learning_rate": 9.929911720883729e-06, + "loss": 1.247, + "step": 1893 + }, + { + "epoch": 0.5037234042553191, + "grad_norm": 3.765941619873047, + "learning_rate": 9.929764899447662e-06, + "loss": 1.3651, + "step": 1894 + }, + { + "epoch": 0.5039893617021277, + "grad_norm": 4.16331672668457, + "learning_rate": 9.929617925478868e-06, + "loss": 1.28, + "step": 1895 + }, + { + "epoch": 0.5042553191489362, + "grad_norm": 4.166515827178955, + "learning_rate": 9.929470798981888e-06, + "loss": 1.2401, + "step": 1896 + }, + { + "epoch": 0.5045212765957446, + "grad_norm": 4.0264177322387695, + "learning_rate": 9.929323519961278e-06, + "loss": 1.3036, + "step": 1897 + }, + { + "epoch": 0.5047872340425532, + "grad_norm": 3.85672926902771, + "learning_rate": 9.929176088421596e-06, + "loss": 1.1619, + "step": 1898 + }, + { + "epoch": 0.5050531914893617, + "grad_norm": 4.00507926940918, + "learning_rate": 9.929028504367402e-06, + "loss": 1.2787, + "step": 1899 + }, + { + "epoch": 0.5053191489361702, + "grad_norm": 3.6691126823425293, + "learning_rate": 9.928880767803264e-06, + "loss": 1.3256, + "step": 1900 + }, + { + "epoch": 0.5055851063829787, + "grad_norm": 4.093438625335693, + "learning_rate": 9.92873287873375e-06, + "loss": 1.2623, + "step": 1901 + }, + { + "epoch": 0.5058510638297873, + "grad_norm": 3.689911127090454, + "learning_rate": 9.92858483716344e-06, + "loss": 1.4022, + "step": 1902 + }, + { + "epoch": 0.5061170212765957, + "grad_norm": 4.178584575653076, + 
"learning_rate": 9.928436643096909e-06, + "loss": 1.3588, + "step": 1903 + }, + { + "epoch": 0.5063829787234042, + "grad_norm": 4.098899841308594, + "learning_rate": 9.928288296538749e-06, + "loss": 1.2687, + "step": 1904 + }, + { + "epoch": 0.5066489361702128, + "grad_norm": 4.034060001373291, + "learning_rate": 9.928139797493545e-06, + "loss": 1.2859, + "step": 1905 + }, + { + "epoch": 0.5069148936170212, + "grad_norm": 4.75716495513916, + "learning_rate": 9.927991145965894e-06, + "loss": 1.445, + "step": 1906 + }, + { + "epoch": 0.5071808510638298, + "grad_norm": 3.466297149658203, + "learning_rate": 9.927842341960396e-06, + "loss": 1.0634, + "step": 1907 + }, + { + "epoch": 0.5074468085106383, + "grad_norm": 3.9337103366851807, + "learning_rate": 9.927693385481652e-06, + "loss": 1.4115, + "step": 1908 + }, + { + "epoch": 0.5077127659574469, + "grad_norm": 3.6876132488250732, + "learning_rate": 9.927544276534275e-06, + "loss": 1.2333, + "step": 1909 + }, + { + "epoch": 0.5079787234042553, + "grad_norm": 4.154485702514648, + "learning_rate": 9.927395015122876e-06, + "loss": 1.2432, + "step": 1910 + }, + { + "epoch": 0.5082446808510638, + "grad_norm": 4.0430073738098145, + "learning_rate": 9.927245601252074e-06, + "loss": 1.3562, + "step": 1911 + }, + { + "epoch": 0.5085106382978724, + "grad_norm": 3.6701016426086426, + "learning_rate": 9.927096034926491e-06, + "loss": 1.2138, + "step": 1912 + }, + { + "epoch": 0.5087765957446808, + "grad_norm": 3.7969815731048584, + "learning_rate": 9.926946316150757e-06, + "loss": 1.3166, + "step": 1913 + }, + { + "epoch": 0.5090425531914894, + "grad_norm": 3.662705183029175, + "learning_rate": 9.926796444929502e-06, + "loss": 1.1107, + "step": 1914 + }, + { + "epoch": 0.5093085106382979, + "grad_norm": 3.8880231380462646, + "learning_rate": 9.926646421267366e-06, + "loss": 1.2989, + "step": 1915 + }, + { + "epoch": 0.5095744680851064, + "grad_norm": 3.6114046573638916, + "learning_rate": 9.926496245168989e-06, + "loss": 1.1822, + "step": 1916 + }, + { + "epoch": 0.5098404255319149, + "grad_norm": 3.799083948135376, + "learning_rate": 9.926345916639018e-06, + "loss": 1.1918, + "step": 1917 + }, + { + "epoch": 0.5101063829787233, + "grad_norm": 3.4708175659179688, + "learning_rate": 9.926195435682102e-06, + "loss": 1.1244, + "step": 1918 + }, + { + "epoch": 0.5103723404255319, + "grad_norm": 4.323407173156738, + "learning_rate": 9.926044802302904e-06, + "loss": 1.275, + "step": 1919 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 3.8659491539001465, + "learning_rate": 9.925894016506076e-06, + "loss": 1.2904, + "step": 1920 + }, + { + "epoch": 0.510904255319149, + "grad_norm": 3.7898192405700684, + "learning_rate": 9.925743078296288e-06, + "loss": 1.2569, + "step": 1921 + }, + { + "epoch": 0.5111702127659574, + "grad_norm": 3.559047222137451, + "learning_rate": 9.925591987678212e-06, + "loss": 1.3267, + "step": 1922 + }, + { + "epoch": 0.511436170212766, + "grad_norm": 3.8164639472961426, + "learning_rate": 9.925440744656518e-06, + "loss": 1.2059, + "step": 1923 + }, + { + "epoch": 0.5117021276595745, + "grad_norm": 4.318164825439453, + "learning_rate": 9.925289349235892e-06, + "loss": 1.3528, + "step": 1924 + }, + { + "epoch": 0.511968085106383, + "grad_norm": 3.8021814823150635, + "learning_rate": 9.925137801421011e-06, + "loss": 1.2096, + "step": 1925 + }, + { + "epoch": 0.5122340425531915, + "grad_norm": 3.7836246490478516, + "learning_rate": 9.924986101216569e-06, + "loss": 1.2719, + "step": 1926 + }, + { + "epoch": 0.5125, + "grad_norm": 
4.108916282653809, + "learning_rate": 9.92483424862726e-06, + "loss": 1.4018, + "step": 1927 + }, + { + "epoch": 0.5127659574468085, + "grad_norm": 3.7151575088500977, + "learning_rate": 9.92468224365778e-06, + "loss": 1.3966, + "step": 1928 + }, + { + "epoch": 0.513031914893617, + "grad_norm": 3.5576205253601074, + "learning_rate": 9.924530086312834e-06, + "loss": 1.2066, + "step": 1929 + }, + { + "epoch": 0.5132978723404256, + "grad_norm": 3.6642985343933105, + "learning_rate": 9.924377776597128e-06, + "loss": 1.3887, + "step": 1930 + }, + { + "epoch": 0.513563829787234, + "grad_norm": 4.360495567321777, + "learning_rate": 9.924225314515375e-06, + "loss": 1.6151, + "step": 1931 + }, + { + "epoch": 0.5138297872340426, + "grad_norm": 3.934380292892456, + "learning_rate": 9.924072700072296e-06, + "loss": 1.2027, + "step": 1932 + }, + { + "epoch": 0.5140957446808511, + "grad_norm": 3.95251727104187, + "learning_rate": 9.923919933272608e-06, + "loss": 1.4496, + "step": 1933 + }, + { + "epoch": 0.5143617021276595, + "grad_norm": 3.660336494445801, + "learning_rate": 9.923767014121042e-06, + "loss": 1.2549, + "step": 1934 + }, + { + "epoch": 0.5146276595744681, + "grad_norm": 3.936469316482544, + "learning_rate": 9.923613942622326e-06, + "loss": 1.3851, + "step": 1935 + }, + { + "epoch": 0.5148936170212766, + "grad_norm": 3.912565231323242, + "learning_rate": 9.923460718781198e-06, + "loss": 1.303, + "step": 1936 + }, + { + "epoch": 0.5151595744680851, + "grad_norm": 3.9063549041748047, + "learning_rate": 9.923307342602399e-06, + "loss": 1.315, + "step": 1937 + }, + { + "epoch": 0.5154255319148936, + "grad_norm": 3.749720335006714, + "learning_rate": 9.923153814090675e-06, + "loss": 1.2961, + "step": 1938 + }, + { + "epoch": 0.5156914893617022, + "grad_norm": 3.978954315185547, + "learning_rate": 9.923000133250776e-06, + "loss": 1.4325, + "step": 1939 + }, + { + "epoch": 0.5159574468085106, + "grad_norm": 4.081971645355225, + "learning_rate": 9.922846300087454e-06, + "loss": 1.2811, + "step": 1940 + }, + { + "epoch": 0.5162234042553191, + "grad_norm": 3.9421591758728027, + "learning_rate": 9.922692314605472e-06, + "loss": 1.3513, + "step": 1941 + }, + { + "epoch": 0.5164893617021277, + "grad_norm": 3.6500041484832764, + "learning_rate": 9.922538176809597e-06, + "loss": 1.2927, + "step": 1942 + }, + { + "epoch": 0.5167553191489361, + "grad_norm": 3.858421564102173, + "learning_rate": 9.922383886704594e-06, + "loss": 1.1699, + "step": 1943 + }, + { + "epoch": 0.5170212765957447, + "grad_norm": 4.286783695220947, + "learning_rate": 9.922229444295238e-06, + "loss": 1.4037, + "step": 1944 + }, + { + "epoch": 0.5172872340425532, + "grad_norm": 4.163476943969727, + "learning_rate": 9.922074849586308e-06, + "loss": 1.1268, + "step": 1945 + }, + { + "epoch": 0.5175531914893617, + "grad_norm": 3.8577239513397217, + "learning_rate": 9.921920102582587e-06, + "loss": 1.2154, + "step": 1946 + }, + { + "epoch": 0.5178191489361702, + "grad_norm": 4.213263988494873, + "learning_rate": 9.921765203288862e-06, + "loss": 1.3188, + "step": 1947 + }, + { + "epoch": 0.5180851063829788, + "grad_norm": 3.817172050476074, + "learning_rate": 9.921610151709929e-06, + "loss": 1.2897, + "step": 1948 + }, + { + "epoch": 0.5183510638297872, + "grad_norm": 3.954479694366455, + "learning_rate": 9.921454947850582e-06, + "loss": 1.1568, + "step": 1949 + }, + { + "epoch": 0.5186170212765957, + "grad_norm": 4.054901123046875, + "learning_rate": 9.921299591715624e-06, + "loss": 1.1991, + "step": 1950 + }, + { + "epoch": 
0.5188829787234043, + "grad_norm": 3.9514553546905518, + "learning_rate": 9.921144083309864e-06, + "loss": 1.2588, + "step": 1951 + }, + { + "epoch": 0.5191489361702127, + "grad_norm": 4.228671550750732, + "learning_rate": 9.920988422638112e-06, + "loss": 1.3348, + "step": 1952 + }, + { + "epoch": 0.5194148936170213, + "grad_norm": 3.997422695159912, + "learning_rate": 9.920832609705184e-06, + "loss": 1.2402, + "step": 1953 + }, + { + "epoch": 0.5196808510638298, + "grad_norm": 3.8394384384155273, + "learning_rate": 9.920676644515902e-06, + "loss": 1.222, + "step": 1954 + }, + { + "epoch": 0.5199468085106383, + "grad_norm": 3.654381036758423, + "learning_rate": 9.92052052707509e-06, + "loss": 1.4059, + "step": 1955 + }, + { + "epoch": 0.5202127659574468, + "grad_norm": 3.881578207015991, + "learning_rate": 9.92036425738758e-06, + "loss": 1.3507, + "step": 1956 + }, + { + "epoch": 0.5204787234042553, + "grad_norm": 3.819066286087036, + "learning_rate": 9.920207835458208e-06, + "loss": 1.3433, + "step": 1957 + }, + { + "epoch": 0.5207446808510638, + "grad_norm": 3.2657382488250732, + "learning_rate": 9.920051261291812e-06, + "loss": 1.0601, + "step": 1958 + }, + { + "epoch": 0.5210106382978723, + "grad_norm": 3.789560556411743, + "learning_rate": 9.919894534893237e-06, + "loss": 1.2395, + "step": 1959 + }, + { + "epoch": 0.5212765957446809, + "grad_norm": 3.620661973953247, + "learning_rate": 9.919737656267335e-06, + "loss": 1.1793, + "step": 1960 + }, + { + "epoch": 0.5215425531914893, + "grad_norm": 4.208719253540039, + "learning_rate": 9.919580625418955e-06, + "loss": 1.5431, + "step": 1961 + }, + { + "epoch": 0.5218085106382979, + "grad_norm": 4.2255024909973145, + "learning_rate": 9.919423442352958e-06, + "loss": 1.3665, + "step": 1962 + }, + { + "epoch": 0.5220744680851064, + "grad_norm": 4.246603965759277, + "learning_rate": 9.91926610707421e-06, + "loss": 1.2552, + "step": 1963 + }, + { + "epoch": 0.5223404255319148, + "grad_norm": 4.042827606201172, + "learning_rate": 9.919108619587575e-06, + "loss": 1.2171, + "step": 1964 + }, + { + "epoch": 0.5226063829787234, + "grad_norm": 4.006556510925293, + "learning_rate": 9.918950979897928e-06, + "loss": 1.2559, + "step": 1965 + }, + { + "epoch": 0.5228723404255319, + "grad_norm": 3.7249419689178467, + "learning_rate": 9.918793188010147e-06, + "loss": 1.0816, + "step": 1966 + }, + { + "epoch": 0.5231382978723405, + "grad_norm": 4.087320804595947, + "learning_rate": 9.918635243929115e-06, + "loss": 1.2607, + "step": 1967 + }, + { + "epoch": 0.5234042553191489, + "grad_norm": 4.031649589538574, + "learning_rate": 9.918477147659715e-06, + "loss": 1.2983, + "step": 1968 + }, + { + "epoch": 0.5236702127659575, + "grad_norm": 4.055499076843262, + "learning_rate": 9.918318899206842e-06, + "loss": 1.2686, + "step": 1969 + }, + { + "epoch": 0.523936170212766, + "grad_norm": 4.922122955322266, + "learning_rate": 9.918160498575394e-06, + "loss": 1.2761, + "step": 1970 + }, + { + "epoch": 0.5242021276595744, + "grad_norm": 4.155685901641846, + "learning_rate": 9.918001945770267e-06, + "loss": 1.3004, + "step": 1971 + }, + { + "epoch": 0.524468085106383, + "grad_norm": 4.165022373199463, + "learning_rate": 9.91784324079637e-06, + "loss": 1.4643, + "step": 1972 + }, + { + "epoch": 0.5247340425531914, + "grad_norm": 3.9013566970825195, + "learning_rate": 9.917684383658614e-06, + "loss": 1.2264, + "step": 1973 + }, + { + "epoch": 0.525, + "grad_norm": 4.016994953155518, + "learning_rate": 9.917525374361913e-06, + "loss": 1.2748, + "step": 1974 + }, + { + 
"epoch": 0.5252659574468085, + "grad_norm": 4.0600996017456055, + "learning_rate": 9.917366212911187e-06, + "loss": 1.2, + "step": 1975 + }, + { + "epoch": 0.5255319148936171, + "grad_norm": 4.1870903968811035, + "learning_rate": 9.91720689931136e-06, + "loss": 1.2307, + "step": 1976 + }, + { + "epoch": 0.5257978723404255, + "grad_norm": 3.7501108646392822, + "learning_rate": 9.917047433567364e-06, + "loss": 1.2853, + "step": 1977 + }, + { + "epoch": 0.5260638297872341, + "grad_norm": 3.8789479732513428, + "learning_rate": 9.91688781568413e-06, + "loss": 1.3571, + "step": 1978 + }, + { + "epoch": 0.5263297872340426, + "grad_norm": 3.641453981399536, + "learning_rate": 9.9167280456666e-06, + "loss": 1.1975, + "step": 1979 + }, + { + "epoch": 0.526595744680851, + "grad_norm": 4.097661972045898, + "learning_rate": 9.916568123519713e-06, + "loss": 1.2415, + "step": 1980 + }, + { + "epoch": 0.5268617021276596, + "grad_norm": 3.447585105895996, + "learning_rate": 9.91640804924842e-06, + "loss": 1.1599, + "step": 1981 + }, + { + "epoch": 0.527127659574468, + "grad_norm": 3.906158208847046, + "learning_rate": 9.916247822857675e-06, + "loss": 1.2141, + "step": 1982 + }, + { + "epoch": 0.5273936170212766, + "grad_norm": 4.226005554199219, + "learning_rate": 9.916087444352433e-06, + "loss": 1.3575, + "step": 1983 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 3.955073118209839, + "learning_rate": 9.91592691373766e-06, + "loss": 1.159, + "step": 1984 + }, + { + "epoch": 0.5279255319148937, + "grad_norm": 3.770538568496704, + "learning_rate": 9.915766231018317e-06, + "loss": 1.2722, + "step": 1985 + }, + { + "epoch": 0.5281914893617021, + "grad_norm": 4.1326422691345215, + "learning_rate": 9.91560539619938e-06, + "loss": 1.4044, + "step": 1986 + }, + { + "epoch": 0.5284574468085106, + "grad_norm": 3.933978319168091, + "learning_rate": 9.915444409285827e-06, + "loss": 1.1495, + "step": 1987 + }, + { + "epoch": 0.5287234042553192, + "grad_norm": 3.8940069675445557, + "learning_rate": 9.915283270282637e-06, + "loss": 1.2658, + "step": 1988 + }, + { + "epoch": 0.5289893617021276, + "grad_norm": 3.8015975952148438, + "learning_rate": 9.915121979194793e-06, + "loss": 1.2155, + "step": 1989 + }, + { + "epoch": 0.5292553191489362, + "grad_norm": 4.204024791717529, + "learning_rate": 9.914960536027289e-06, + "loss": 1.3081, + "step": 1990 + }, + { + "epoch": 0.5295212765957447, + "grad_norm": 3.80530047416687, + "learning_rate": 9.91479894078512e-06, + "loss": 1.2827, + "step": 1991 + }, + { + "epoch": 0.5297872340425532, + "grad_norm": 4.011538505554199, + "learning_rate": 9.914637193473284e-06, + "loss": 1.2801, + "step": 1992 + }, + { + "epoch": 0.5300531914893617, + "grad_norm": 3.848898410797119, + "learning_rate": 9.914475294096788e-06, + "loss": 1.2904, + "step": 1993 + }, + { + "epoch": 0.5303191489361702, + "grad_norm": 3.7076499462127686, + "learning_rate": 9.91431324266064e-06, + "loss": 1.3455, + "step": 1994 + }, + { + "epoch": 0.5305851063829787, + "grad_norm": 4.372555255889893, + "learning_rate": 9.914151039169855e-06, + "loss": 1.3233, + "step": 1995 + }, + { + "epoch": 0.5308510638297872, + "grad_norm": 4.168186664581299, + "learning_rate": 9.913988683629449e-06, + "loss": 1.3303, + "step": 1996 + }, + { + "epoch": 0.5311170212765958, + "grad_norm": 3.4844412803649902, + "learning_rate": 9.91382617604445e-06, + "loss": 1.28, + "step": 1997 + }, + { + "epoch": 0.5313829787234042, + "grad_norm": 3.981612205505371, + "learning_rate": 9.913663516419883e-06, + "loss": 1.4133, + "step": 1998 
+ }, + { + "epoch": 0.5316489361702128, + "grad_norm": 3.6310243606567383, + "learning_rate": 9.913500704760781e-06, + "loss": 1.2546, + "step": 1999 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 3.6045448780059814, + "learning_rate": 9.913337741072183e-06, + "loss": 1.1445, + "step": 2000 + }, + { + "epoch": 0.5319148936170213, + "eval_loss": 1.2938566207885742, + "eval_runtime": 12.2817, + "eval_samples_per_second": 32.569, + "eval_steps_per_second": 4.071, + "step": 2000 + }, + { + "epoch": 0.5321808510638298, + "grad_norm": 4.040936470031738, + "learning_rate": 9.913174625359132e-06, + "loss": 1.2325, + "step": 2001 + }, + { + "epoch": 0.5324468085106383, + "grad_norm": 3.7908430099487305, + "learning_rate": 9.913011357626672e-06, + "loss": 1.3091, + "step": 2002 + }, + { + "epoch": 0.5327127659574468, + "grad_norm": 3.7691242694854736, + "learning_rate": 9.912847937879855e-06, + "loss": 1.2236, + "step": 2003 + }, + { + "epoch": 0.5329787234042553, + "grad_norm": 4.643370628356934, + "learning_rate": 9.91268436612374e-06, + "loss": 1.3033, + "step": 2004 + }, + { + "epoch": 0.5332446808510638, + "grad_norm": 3.5233020782470703, + "learning_rate": 9.912520642363387e-06, + "loss": 1.1542, + "step": 2005 + }, + { + "epoch": 0.5335106382978724, + "grad_norm": 4.1154022216796875, + "learning_rate": 9.912356766603862e-06, + "loss": 1.4088, + "step": 2006 + }, + { + "epoch": 0.5337765957446808, + "grad_norm": 5.4873247146606445, + "learning_rate": 9.912192738850234e-06, + "loss": 1.3057, + "step": 2007 + }, + { + "epoch": 0.5340425531914894, + "grad_norm": 3.9308226108551025, + "learning_rate": 9.912028559107577e-06, + "loss": 1.2788, + "step": 2008 + }, + { + "epoch": 0.5343085106382979, + "grad_norm": 3.6488893032073975, + "learning_rate": 9.91186422738098e-06, + "loss": 1.1555, + "step": 2009 + }, + { + "epoch": 0.5345744680851063, + "grad_norm": 3.553065061569214, + "learning_rate": 9.911699743675513e-06, + "loss": 1.2228, + "step": 2010 + }, + { + "epoch": 0.5348404255319149, + "grad_norm": 3.8336079120635986, + "learning_rate": 9.911535107996278e-06, + "loss": 1.2563, + "step": 2011 + }, + { + "epoch": 0.5351063829787234, + "grad_norm": 4.1601715087890625, + "learning_rate": 9.911370320348363e-06, + "loss": 1.2525, + "step": 2012 + }, + { + "epoch": 0.535372340425532, + "grad_norm": 3.4441726207733154, + "learning_rate": 9.911205380736868e-06, + "loss": 1.2293, + "step": 2013 + }, + { + "epoch": 0.5356382978723404, + "grad_norm": 4.281271457672119, + "learning_rate": 9.911040289166896e-06, + "loss": 1.5168, + "step": 2014 + }, + { + "epoch": 0.535904255319149, + "grad_norm": 3.982959508895874, + "learning_rate": 9.910875045643555e-06, + "loss": 1.2864, + "step": 2015 + }, + { + "epoch": 0.5361702127659574, + "grad_norm": 3.9199705123901367, + "learning_rate": 9.91070965017196e-06, + "loss": 1.2906, + "step": 2016 + }, + { + "epoch": 0.5364361702127659, + "grad_norm": 4.073878288269043, + "learning_rate": 9.910544102757224e-06, + "loss": 1.2435, + "step": 2017 + }, + { + "epoch": 0.5367021276595745, + "grad_norm": 4.169588088989258, + "learning_rate": 9.910378403404473e-06, + "loss": 1.3231, + "step": 2018 + }, + { + "epoch": 0.5369680851063829, + "grad_norm": 3.7797560691833496, + "learning_rate": 9.910212552118835e-06, + "loss": 1.2632, + "step": 2019 + }, + { + "epoch": 0.5372340425531915, + "grad_norm": 4.002804756164551, + "learning_rate": 9.910046548905437e-06, + "loss": 1.3988, + "step": 2020 + }, + { + "epoch": 0.5375, + "grad_norm": 3.8956003189086914, + 
"learning_rate": 9.90988039376942e-06, + "loss": 1.2534, + "step": 2021 + }, + { + "epoch": 0.5377659574468086, + "grad_norm": 3.6937549114227295, + "learning_rate": 9.90971408671592e-06, + "loss": 1.2312, + "step": 2022 + }, + { + "epoch": 0.538031914893617, + "grad_norm": 3.7216007709503174, + "learning_rate": 9.909547627750089e-06, + "loss": 1.2408, + "step": 2023 + }, + { + "epoch": 0.5382978723404256, + "grad_norm": 3.827702760696411, + "learning_rate": 9.909381016877074e-06, + "loss": 1.2551, + "step": 2024 + }, + { + "epoch": 0.538563829787234, + "grad_norm": 3.5307586193084717, + "learning_rate": 9.909214254102027e-06, + "loss": 1.2352, + "step": 2025 + }, + { + "epoch": 0.5388297872340425, + "grad_norm": 3.7490625381469727, + "learning_rate": 9.909047339430113e-06, + "loss": 1.2867, + "step": 2026 + }, + { + "epoch": 0.5390957446808511, + "grad_norm": 4.107030391693115, + "learning_rate": 9.908880272866495e-06, + "loss": 1.3459, + "step": 2027 + }, + { + "epoch": 0.5393617021276595, + "grad_norm": 3.855973482131958, + "learning_rate": 9.908713054416342e-06, + "loss": 1.224, + "step": 2028 + }, + { + "epoch": 0.5396276595744681, + "grad_norm": 4.167142391204834, + "learning_rate": 9.908545684084826e-06, + "loss": 1.4258, + "step": 2029 + }, + { + "epoch": 0.5398936170212766, + "grad_norm": 3.899373769760132, + "learning_rate": 9.90837816187713e-06, + "loss": 1.2853, + "step": 2030 + }, + { + "epoch": 0.5401595744680852, + "grad_norm": 3.8360328674316406, + "learning_rate": 9.908210487798433e-06, + "loss": 1.3503, + "step": 2031 + }, + { + "epoch": 0.5404255319148936, + "grad_norm": 3.633971929550171, + "learning_rate": 9.908042661853926e-06, + "loss": 1.0622, + "step": 2032 + }, + { + "epoch": 0.5406914893617021, + "grad_norm": 4.1685991287231445, + "learning_rate": 9.9078746840488e-06, + "loss": 1.3733, + "step": 2033 + }, + { + "epoch": 0.5409574468085107, + "grad_norm": 3.9930756092071533, + "learning_rate": 9.907706554388253e-06, + "loss": 1.4306, + "step": 2034 + }, + { + "epoch": 0.5412234042553191, + "grad_norm": 3.9129087924957275, + "learning_rate": 9.907538272877487e-06, + "loss": 1.1834, + "step": 2035 + }, + { + "epoch": 0.5414893617021277, + "grad_norm": 3.658611536026001, + "learning_rate": 9.90736983952171e-06, + "loss": 1.1908, + "step": 2036 + }, + { + "epoch": 0.5417553191489362, + "grad_norm": 3.9367542266845703, + "learning_rate": 9.907201254326132e-06, + "loss": 1.2853, + "step": 2037 + }, + { + "epoch": 0.5420212765957447, + "grad_norm": 3.9035940170288086, + "learning_rate": 9.907032517295966e-06, + "loss": 1.2867, + "step": 2038 + }, + { + "epoch": 0.5422872340425532, + "grad_norm": 3.702096939086914, + "learning_rate": 9.906863628436441e-06, + "loss": 1.2614, + "step": 2039 + }, + { + "epoch": 0.5425531914893617, + "grad_norm": 4.073267459869385, + "learning_rate": 9.906694587752777e-06, + "loss": 1.3793, + "step": 2040 + }, + { + "epoch": 0.5428191489361702, + "grad_norm": 3.864699363708496, + "learning_rate": 9.906525395250206e-06, + "loss": 1.1233, + "step": 2041 + }, + { + "epoch": 0.5430851063829787, + "grad_norm": 3.8738772869110107, + "learning_rate": 9.906356050933962e-06, + "loss": 1.1704, + "step": 2042 + }, + { + "epoch": 0.5433510638297873, + "grad_norm": 3.837299108505249, + "learning_rate": 9.906186554809284e-06, + "loss": 1.1802, + "step": 2043 + }, + { + "epoch": 0.5436170212765957, + "grad_norm": 4.00624942779541, + "learning_rate": 9.906016906881419e-06, + "loss": 1.2934, + "step": 2044 + }, + { + "epoch": 0.5438829787234043, + 
"grad_norm": 3.6519479751586914, + "learning_rate": 9.905847107155615e-06, + "loss": 1.2313, + "step": 2045 + }, + { + "epoch": 0.5441489361702128, + "grad_norm": 4.127234935760498, + "learning_rate": 9.905677155637126e-06, + "loss": 1.476, + "step": 2046 + }, + { + "epoch": 0.5444148936170212, + "grad_norm": 3.580862283706665, + "learning_rate": 9.90550705233121e-06, + "loss": 1.1991, + "step": 2047 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 4.004328727722168, + "learning_rate": 9.90533679724313e-06, + "loss": 1.2811, + "step": 2048 + }, + { + "epoch": 0.5449468085106383, + "grad_norm": 3.6748900413513184, + "learning_rate": 9.905166390378154e-06, + "loss": 1.3381, + "step": 2049 + }, + { + "epoch": 0.5452127659574468, + "grad_norm": 3.5765295028686523, + "learning_rate": 9.904995831741553e-06, + "loss": 1.2265, + "step": 2050 + }, + { + "epoch": 0.5454787234042553, + "grad_norm": 3.910905361175537, + "learning_rate": 9.904825121338609e-06, + "loss": 1.2516, + "step": 2051 + }, + { + "epoch": 0.5457446808510639, + "grad_norm": 3.8337693214416504, + "learning_rate": 9.9046542591746e-06, + "loss": 1.2997, + "step": 2052 + }, + { + "epoch": 0.5460106382978723, + "grad_norm": 3.837082862854004, + "learning_rate": 9.904483245254812e-06, + "loss": 1.3341, + "step": 2053 + }, + { + "epoch": 0.5462765957446809, + "grad_norm": 4.098066806793213, + "learning_rate": 9.90431207958454e-06, + "loss": 1.2182, + "step": 2054 + }, + { + "epoch": 0.5465425531914894, + "grad_norm": 4.022514343261719, + "learning_rate": 9.904140762169079e-06, + "loss": 1.4144, + "step": 2055 + }, + { + "epoch": 0.5468085106382978, + "grad_norm": 3.779283046722412, + "learning_rate": 9.903969293013727e-06, + "loss": 1.2291, + "step": 2056 + }, + { + "epoch": 0.5470744680851064, + "grad_norm": 4.28890323638916, + "learning_rate": 9.903797672123791e-06, + "loss": 1.3899, + "step": 2057 + }, + { + "epoch": 0.5473404255319149, + "grad_norm": 3.720780372619629, + "learning_rate": 9.903625899504583e-06, + "loss": 1.1992, + "step": 2058 + }, + { + "epoch": 0.5476063829787234, + "grad_norm": 3.80373215675354, + "learning_rate": 9.903453975161416e-06, + "loss": 1.322, + "step": 2059 + }, + { + "epoch": 0.5478723404255319, + "grad_norm": 4.012282371520996, + "learning_rate": 9.90328189909961e-06, + "loss": 1.1998, + "step": 2060 + }, + { + "epoch": 0.5481382978723405, + "grad_norm": 4.059588432312012, + "learning_rate": 9.903109671324488e-06, + "loss": 1.286, + "step": 2061 + }, + { + "epoch": 0.5484042553191489, + "grad_norm": 3.9015207290649414, + "learning_rate": 9.902937291841383e-06, + "loss": 1.3525, + "step": 2062 + }, + { + "epoch": 0.5486702127659574, + "grad_norm": 4.0359954833984375, + "learning_rate": 9.902764760655623e-06, + "loss": 1.3094, + "step": 2063 + }, + { + "epoch": 0.548936170212766, + "grad_norm": 3.487372875213623, + "learning_rate": 9.90259207777255e-06, + "loss": 1.2127, + "step": 2064 + }, + { + "epoch": 0.5492021276595744, + "grad_norm": 3.607064723968506, + "learning_rate": 9.902419243197505e-06, + "loss": 1.2091, + "step": 2065 + }, + { + "epoch": 0.549468085106383, + "grad_norm": 3.9896395206451416, + "learning_rate": 9.902246256935837e-06, + "loss": 1.3059, + "step": 2066 + }, + { + "epoch": 0.5497340425531915, + "grad_norm": 4.376030445098877, + "learning_rate": 9.9020731189929e-06, + "loss": 1.3092, + "step": 2067 + }, + { + "epoch": 0.55, + "grad_norm": 3.3590362071990967, + "learning_rate": 9.901899829374048e-06, + "loss": 1.201, + "step": 2068 + }, + { + "epoch": 0.5502659574468085, + 
"grad_norm": 3.7063753604888916, + "learning_rate": 9.901726388084643e-06, + "loss": 1.182, + "step": 2069 + }, + { + "epoch": 0.550531914893617, + "grad_norm": 3.709569215774536, + "learning_rate": 9.901552795130054e-06, + "loss": 1.1766, + "step": 2070 + }, + { + "epoch": 0.5507978723404255, + "grad_norm": 4.3449249267578125, + "learning_rate": 9.90137905051565e-06, + "loss": 1.3167, + "step": 2071 + }, + { + "epoch": 0.551063829787234, + "grad_norm": 3.8162055015563965, + "learning_rate": 9.901205154246807e-06, + "loss": 1.2192, + "step": 2072 + }, + { + "epoch": 0.5513297872340426, + "grad_norm": 3.792880058288574, + "learning_rate": 9.901031106328907e-06, + "loss": 1.2957, + "step": 2073 + }, + { + "epoch": 0.551595744680851, + "grad_norm": 3.6657822132110596, + "learning_rate": 9.900856906767334e-06, + "loss": 1.3045, + "step": 2074 + }, + { + "epoch": 0.5518617021276596, + "grad_norm": 3.327601194381714, + "learning_rate": 9.900682555567478e-06, + "loss": 1.1348, + "step": 2075 + }, + { + "epoch": 0.5521276595744681, + "grad_norm": 3.9993128776550293, + "learning_rate": 9.900508052734734e-06, + "loss": 1.2678, + "step": 2076 + }, + { + "epoch": 0.5523936170212767, + "grad_norm": 3.922495126724243, + "learning_rate": 9.900333398274501e-06, + "loss": 1.1644, + "step": 2077 + }, + { + "epoch": 0.5526595744680851, + "grad_norm": 3.6909377574920654, + "learning_rate": 9.900158592192184e-06, + "loss": 1.208, + "step": 2078 + }, + { + "epoch": 0.5529255319148936, + "grad_norm": 4.378490924835205, + "learning_rate": 9.89998363449319e-06, + "loss": 1.2866, + "step": 2079 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 3.6202850341796875, + "learning_rate": 9.899808525182935e-06, + "loss": 1.238, + "step": 2080 + }, + { + "epoch": 0.5534574468085106, + "grad_norm": 3.9422550201416016, + "learning_rate": 9.899633264266835e-06, + "loss": 1.2932, + "step": 2081 + }, + { + "epoch": 0.5537234042553192, + "grad_norm": 4.002807140350342, + "learning_rate": 9.899457851750312e-06, + "loss": 1.301, + "step": 2082 + }, + { + "epoch": 0.5539893617021276, + "grad_norm": 4.242476940155029, + "learning_rate": 9.899282287638795e-06, + "loss": 1.2967, + "step": 2083 + }, + { + "epoch": 0.5542553191489362, + "grad_norm": 4.148952007293701, + "learning_rate": 9.899106571937716e-06, + "loss": 1.2863, + "step": 2084 + }, + { + "epoch": 0.5545212765957447, + "grad_norm": 3.8258893489837646, + "learning_rate": 9.898930704652512e-06, + "loss": 1.2253, + "step": 2085 + }, + { + "epoch": 0.5547872340425531, + "grad_norm": 4.117706298828125, + "learning_rate": 9.898754685788623e-06, + "loss": 1.3706, + "step": 2086 + }, + { + "epoch": 0.5550531914893617, + "grad_norm": 3.989381790161133, + "learning_rate": 9.898578515351498e-06, + "loss": 1.2585, + "step": 2087 + }, + { + "epoch": 0.5553191489361702, + "grad_norm": 3.8721275329589844, + "learning_rate": 9.898402193346585e-06, + "loss": 1.1284, + "step": 2088 + }, + { + "epoch": 0.5555851063829788, + "grad_norm": 4.169785499572754, + "learning_rate": 9.898225719779342e-06, + "loss": 1.2176, + "step": 2089 + }, + { + "epoch": 0.5558510638297872, + "grad_norm": 3.8007307052612305, + "learning_rate": 9.898049094655229e-06, + "loss": 1.1421, + "step": 2090 + }, + { + "epoch": 0.5561170212765958, + "grad_norm": 3.48579740524292, + "learning_rate": 9.897872317979708e-06, + "loss": 1.1123, + "step": 2091 + }, + { + "epoch": 0.5563829787234043, + "grad_norm": 3.6224656105041504, + "learning_rate": 9.897695389758253e-06, + "loss": 1.2452, + "step": 2092 + }, + { + 
"epoch": 0.5566489361702127, + "grad_norm": 4.0066752433776855, + "learning_rate": 9.897518309996336e-06, + "loss": 1.3127, + "step": 2093 + }, + { + "epoch": 0.5569148936170213, + "grad_norm": 3.5834217071533203, + "learning_rate": 9.897341078699437e-06, + "loss": 1.1945, + "step": 2094 + }, + { + "epoch": 0.5571808510638298, + "grad_norm": 3.616166830062866, + "learning_rate": 9.897163695873036e-06, + "loss": 1.2113, + "step": 2095 + }, + { + "epoch": 0.5574468085106383, + "grad_norm": 4.5236945152282715, + "learning_rate": 9.896986161522627e-06, + "loss": 1.556, + "step": 2096 + }, + { + "epoch": 0.5577127659574468, + "grad_norm": 4.006591320037842, + "learning_rate": 9.896808475653701e-06, + "loss": 1.3505, + "step": 2097 + }, + { + "epoch": 0.5579787234042554, + "grad_norm": 4.137003421783447, + "learning_rate": 9.896630638271755e-06, + "loss": 1.2105, + "step": 2098 + }, + { + "epoch": 0.5582446808510638, + "grad_norm": 4.136394500732422, + "learning_rate": 9.896452649382291e-06, + "loss": 1.4277, + "step": 2099 + }, + { + "epoch": 0.5585106382978723, + "grad_norm": 3.8342485427856445, + "learning_rate": 9.896274508990818e-06, + "loss": 1.2839, + "step": 2100 + }, + { + "epoch": 0.5587765957446809, + "grad_norm": 3.687845230102539, + "learning_rate": 9.896096217102848e-06, + "loss": 1.1659, + "step": 2101 + }, + { + "epoch": 0.5590425531914893, + "grad_norm": 3.971306562423706, + "learning_rate": 9.895917773723895e-06, + "loss": 1.4681, + "step": 2102 + }, + { + "epoch": 0.5593085106382979, + "grad_norm": 3.5636236667633057, + "learning_rate": 9.895739178859483e-06, + "loss": 1.2463, + "step": 2103 + }, + { + "epoch": 0.5595744680851064, + "grad_norm": 4.580478191375732, + "learning_rate": 9.895560432515136e-06, + "loss": 1.488, + "step": 2104 + }, + { + "epoch": 0.5598404255319149, + "grad_norm": 3.5549540519714355, + "learning_rate": 9.895381534696385e-06, + "loss": 1.1869, + "step": 2105 + }, + { + "epoch": 0.5601063829787234, + "grad_norm": 3.6891443729400635, + "learning_rate": 9.895202485408766e-06, + "loss": 1.2356, + "step": 2106 + }, + { + "epoch": 0.560372340425532, + "grad_norm": 4.139247894287109, + "learning_rate": 9.895023284657821e-06, + "loss": 1.2941, + "step": 2107 + }, + { + "epoch": 0.5606382978723404, + "grad_norm": 3.616758346557617, + "learning_rate": 9.89484393244909e-06, + "loss": 1.2292, + "step": 2108 + }, + { + "epoch": 0.5609042553191489, + "grad_norm": 3.634755849838257, + "learning_rate": 9.894664428788126e-06, + "loss": 1.2215, + "step": 2109 + }, + { + "epoch": 0.5611702127659575, + "grad_norm": 3.9066550731658936, + "learning_rate": 9.89448477368048e-06, + "loss": 1.3777, + "step": 2110 + }, + { + "epoch": 0.5614361702127659, + "grad_norm": 3.8861474990844727, + "learning_rate": 9.894304967131713e-06, + "loss": 1.2666, + "step": 2111 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 3.3856041431427, + "learning_rate": 9.894125009147389e-06, + "loss": 1.3001, + "step": 2112 + }, + { + "epoch": 0.561968085106383, + "grad_norm": 3.5979838371276855, + "learning_rate": 9.893944899733076e-06, + "loss": 1.2005, + "step": 2113 + }, + { + "epoch": 0.5622340425531915, + "grad_norm": 3.851020336151123, + "learning_rate": 9.893764638894345e-06, + "loss": 1.3479, + "step": 2114 + }, + { + "epoch": 0.5625, + "grad_norm": 4.208298206329346, + "learning_rate": 9.893584226636773e-06, + "loss": 1.3329, + "step": 2115 + }, + { + "epoch": 0.5627659574468085, + "grad_norm": 3.6734988689422607, + "learning_rate": 9.893403662965944e-06, + "loss": 1.3678, + "step": 
2116 + }, + { + "epoch": 0.563031914893617, + "grad_norm": 3.708069324493408, + "learning_rate": 9.893222947887446e-06, + "loss": 1.3176, + "step": 2117 + }, + { + "epoch": 0.5632978723404255, + "grad_norm": 4.194994926452637, + "learning_rate": 9.893042081406868e-06, + "loss": 1.381, + "step": 2118 + }, + { + "epoch": 0.5635638297872341, + "grad_norm": 3.740922689437866, + "learning_rate": 9.892861063529807e-06, + "loss": 1.1555, + "step": 2119 + }, + { + "epoch": 0.5638297872340425, + "grad_norm": 3.744663715362549, + "learning_rate": 9.892679894261865e-06, + "loss": 1.132, + "step": 2120 + }, + { + "epoch": 0.5640957446808511, + "grad_norm": 4.050332546234131, + "learning_rate": 9.892498573608645e-06, + "loss": 1.3709, + "step": 2121 + }, + { + "epoch": 0.5643617021276596, + "grad_norm": 3.9612951278686523, + "learning_rate": 9.89231710157576e-06, + "loss": 1.2954, + "step": 2122 + }, + { + "epoch": 0.564627659574468, + "grad_norm": 3.165841817855835, + "learning_rate": 9.892135478168824e-06, + "loss": 1.1757, + "step": 2123 + }, + { + "epoch": 0.5648936170212766, + "grad_norm": 3.6281683444976807, + "learning_rate": 9.891953703393455e-06, + "loss": 1.0733, + "step": 2124 + }, + { + "epoch": 0.5651595744680851, + "grad_norm": 3.7431442737579346, + "learning_rate": 9.89177177725528e-06, + "loss": 1.3628, + "step": 2125 + }, + { + "epoch": 0.5654255319148936, + "grad_norm": 3.704817295074463, + "learning_rate": 9.891589699759929e-06, + "loss": 1.284, + "step": 2126 + }, + { + "epoch": 0.5656914893617021, + "grad_norm": 3.5511844158172607, + "learning_rate": 9.89140747091303e-06, + "loss": 1.1152, + "step": 2127 + }, + { + "epoch": 0.5659574468085107, + "grad_norm": 3.450695753097534, + "learning_rate": 9.891225090720227e-06, + "loss": 1.2245, + "step": 2128 + }, + { + "epoch": 0.5662234042553191, + "grad_norm": 3.8009350299835205, + "learning_rate": 9.891042559187161e-06, + "loss": 1.319, + "step": 2129 + }, + { + "epoch": 0.5664893617021277, + "grad_norm": 4.276994228363037, + "learning_rate": 9.890859876319479e-06, + "loss": 1.3191, + "step": 2130 + }, + { + "epoch": 0.5667553191489362, + "grad_norm": 4.0986738204956055, + "learning_rate": 9.890677042122834e-06, + "loss": 1.2553, + "step": 2131 + }, + { + "epoch": 0.5670212765957446, + "grad_norm": 3.861093044281006, + "learning_rate": 9.890494056602883e-06, + "loss": 1.1618, + "step": 2132 + }, + { + "epoch": 0.5672872340425532, + "grad_norm": 3.8807971477508545, + "learning_rate": 9.89031091976529e-06, + "loss": 1.3676, + "step": 2133 + }, + { + "epoch": 0.5675531914893617, + "grad_norm": 3.5750906467437744, + "learning_rate": 9.890127631615719e-06, + "loss": 1.3009, + "step": 2134 + }, + { + "epoch": 0.5678191489361702, + "grad_norm": 3.740861654281616, + "learning_rate": 9.88994419215984e-06, + "loss": 1.3059, + "step": 2135 + }, + { + "epoch": 0.5680851063829787, + "grad_norm": 3.945333480834961, + "learning_rate": 9.88976060140333e-06, + "loss": 1.3027, + "step": 2136 + }, + { + "epoch": 0.5683510638297873, + "grad_norm": 3.9484307765960693, + "learning_rate": 9.889576859351873e-06, + "loss": 1.4177, + "step": 2137 + }, + { + "epoch": 0.5686170212765957, + "grad_norm": 3.9661643505096436, + "learning_rate": 9.88939296601115e-06, + "loss": 1.3607, + "step": 2138 + }, + { + "epoch": 0.5688829787234042, + "grad_norm": 3.4872074127197266, + "learning_rate": 9.88920892138685e-06, + "loss": 1.1658, + "step": 2139 + }, + { + "epoch": 0.5691489361702128, + "grad_norm": 3.545102119445801, + "learning_rate": 9.889024725484672e-06, + "loss": 
1.1813, + "step": 2140 + }, + { + "epoch": 0.5694148936170212, + "grad_norm": 3.738452434539795, + "learning_rate": 9.888840378310312e-06, + "loss": 1.2977, + "step": 2141 + }, + { + "epoch": 0.5696808510638298, + "grad_norm": 3.6037521362304688, + "learning_rate": 9.888655879869475e-06, + "loss": 1.2053, + "step": 2142 + }, + { + "epoch": 0.5699468085106383, + "grad_norm": 4.002810955047607, + "learning_rate": 9.888471230167869e-06, + "loss": 1.1678, + "step": 2143 + }, + { + "epoch": 0.5702127659574469, + "grad_norm": 3.659442186355591, + "learning_rate": 9.88828642921121e-06, + "loss": 1.3656, + "step": 2144 + }, + { + "epoch": 0.5704787234042553, + "grad_norm": 3.817089557647705, + "learning_rate": 9.88810147700521e-06, + "loss": 1.3597, + "step": 2145 + }, + { + "epoch": 0.5707446808510638, + "grad_norm": 3.5655431747436523, + "learning_rate": 9.887916373555597e-06, + "loss": 1.2276, + "step": 2146 + }, + { + "epoch": 0.5710106382978724, + "grad_norm": 3.873889923095703, + "learning_rate": 9.887731118868098e-06, + "loss": 1.3873, + "step": 2147 + }, + { + "epoch": 0.5712765957446808, + "grad_norm": 4.273273468017578, + "learning_rate": 9.887545712948441e-06, + "loss": 1.366, + "step": 2148 + }, + { + "epoch": 0.5715425531914894, + "grad_norm": 3.5899455547332764, + "learning_rate": 9.887360155802366e-06, + "loss": 1.1787, + "step": 2149 + }, + { + "epoch": 0.5718085106382979, + "grad_norm": 3.615471124649048, + "learning_rate": 9.887174447435615e-06, + "loss": 1.1561, + "step": 2150 + }, + { + "epoch": 0.5720744680851064, + "grad_norm": 3.8445990085601807, + "learning_rate": 9.886988587853933e-06, + "loss": 1.315, + "step": 2151 + }, + { + "epoch": 0.5723404255319149, + "grad_norm": 3.989668846130371, + "learning_rate": 9.886802577063068e-06, + "loss": 1.3116, + "step": 2152 + }, + { + "epoch": 0.5726063829787233, + "grad_norm": 4.619128227233887, + "learning_rate": 9.886616415068779e-06, + "loss": 1.3862, + "step": 2153 + }, + { + "epoch": 0.5728723404255319, + "grad_norm": 3.6989963054656982, + "learning_rate": 9.886430101876825e-06, + "loss": 1.2221, + "step": 2154 + }, + { + "epoch": 0.5731382978723404, + "grad_norm": 4.153132915496826, + "learning_rate": 9.886243637492969e-06, + "loss": 1.2128, + "step": 2155 + }, + { + "epoch": 0.573404255319149, + "grad_norm": 3.970520257949829, + "learning_rate": 9.886057021922984e-06, + "loss": 1.2802, + "step": 2156 + }, + { + "epoch": 0.5736702127659574, + "grad_norm": 3.751838207244873, + "learning_rate": 9.885870255172642e-06, + "loss": 1.1967, + "step": 2157 + }, + { + "epoch": 0.573936170212766, + "grad_norm": 3.6611552238464355, + "learning_rate": 9.88568333724772e-06, + "loss": 1.2956, + "step": 2158 + }, + { + "epoch": 0.5742021276595745, + "grad_norm": 4.170332908630371, + "learning_rate": 9.885496268154005e-06, + "loss": 1.2867, + "step": 2159 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 3.5777552127838135, + "learning_rate": 9.885309047897285e-06, + "loss": 1.1703, + "step": 2160 + }, + { + "epoch": 0.5747340425531915, + "grad_norm": 3.9369912147521973, + "learning_rate": 9.88512167648335e-06, + "loss": 1.3682, + "step": 2161 + }, + { + "epoch": 0.575, + "grad_norm": 4.30880069732666, + "learning_rate": 9.884934153917998e-06, + "loss": 1.2892, + "step": 2162 + }, + { + "epoch": 0.5752659574468085, + "grad_norm": 4.251465797424316, + "learning_rate": 9.884746480207031e-06, + "loss": 1.3043, + "step": 2163 + }, + { + "epoch": 0.575531914893617, + "grad_norm": 3.4858951568603516, + "learning_rate": 9.88455865535626e-06, + 
"loss": 1.3418, + "step": 2164 + }, + { + "epoch": 0.5757978723404256, + "grad_norm": 3.715372085571289, + "learning_rate": 9.88437067937149e-06, + "loss": 1.274, + "step": 2165 + }, + { + "epoch": 0.576063829787234, + "grad_norm": 3.5083811283111572, + "learning_rate": 9.884182552258543e-06, + "loss": 1.1127, + "step": 2166 + }, + { + "epoch": 0.5763297872340426, + "grad_norm": 4.5049004554748535, + "learning_rate": 9.883994274023237e-06, + "loss": 1.3182, + "step": 2167 + }, + { + "epoch": 0.5765957446808511, + "grad_norm": 4.002771377563477, + "learning_rate": 9.883805844671396e-06, + "loss": 1.4289, + "step": 2168 + }, + { + "epoch": 0.5768617021276595, + "grad_norm": 3.691743850708008, + "learning_rate": 9.883617264208854e-06, + "loss": 1.3677, + "step": 2169 + }, + { + "epoch": 0.5771276595744681, + "grad_norm": 4.031147003173828, + "learning_rate": 9.883428532641445e-06, + "loss": 1.1805, + "step": 2170 + }, + { + "epoch": 0.5773936170212766, + "grad_norm": 4.453026294708252, + "learning_rate": 9.883239649975007e-06, + "loss": 1.4034, + "step": 2171 + }, + { + "epoch": 0.5776595744680851, + "grad_norm": 3.6685361862182617, + "learning_rate": 9.883050616215383e-06, + "loss": 1.3169, + "step": 2172 + }, + { + "epoch": 0.5779255319148936, + "grad_norm": 3.6789016723632812, + "learning_rate": 9.882861431368425e-06, + "loss": 1.3912, + "step": 2173 + }, + { + "epoch": 0.5781914893617022, + "grad_norm": 3.6971778869628906, + "learning_rate": 9.882672095439987e-06, + "loss": 1.1346, + "step": 2174 + }, + { + "epoch": 0.5784574468085106, + "grad_norm": 3.8128819465637207, + "learning_rate": 9.882482608435924e-06, + "loss": 1.3105, + "step": 2175 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 4.369806289672852, + "learning_rate": 9.882292970362101e-06, + "loss": 1.3673, + "step": 2176 + }, + { + "epoch": 0.5789893617021277, + "grad_norm": 3.403639316558838, + "learning_rate": 9.882103181224386e-06, + "loss": 1.2435, + "step": 2177 + }, + { + "epoch": 0.5792553191489361, + "grad_norm": 3.7755768299102783, + "learning_rate": 9.88191324102865e-06, + "loss": 1.3237, + "step": 2178 + }, + { + "epoch": 0.5795212765957447, + "grad_norm": 3.4330899715423584, + "learning_rate": 9.88172314978077e-06, + "loss": 1.249, + "step": 2179 + }, + { + "epoch": 0.5797872340425532, + "grad_norm": 3.9291467666625977, + "learning_rate": 9.88153290748663e-06, + "loss": 1.4475, + "step": 2180 + }, + { + "epoch": 0.5800531914893617, + "grad_norm": 3.731370210647583, + "learning_rate": 9.881342514152114e-06, + "loss": 1.2166, + "step": 2181 + }, + { + "epoch": 0.5803191489361702, + "grad_norm": 3.7620556354522705, + "learning_rate": 9.881151969783113e-06, + "loss": 1.2329, + "step": 2182 + }, + { + "epoch": 0.5805851063829788, + "grad_norm": 3.822985887527466, + "learning_rate": 9.880961274385523e-06, + "loss": 1.2219, + "step": 2183 + }, + { + "epoch": 0.5808510638297872, + "grad_norm": 3.2141547203063965, + "learning_rate": 9.880770427965245e-06, + "loss": 1.0712, + "step": 2184 + }, + { + "epoch": 0.5811170212765957, + "grad_norm": 3.733004331588745, + "learning_rate": 9.880579430528183e-06, + "loss": 1.203, + "step": 2185 + }, + { + "epoch": 0.5813829787234043, + "grad_norm": 3.6706783771514893, + "learning_rate": 9.880388282080247e-06, + "loss": 1.1757, + "step": 2186 + }, + { + "epoch": 0.5816489361702127, + "grad_norm": 3.7189342975616455, + "learning_rate": 9.880196982627352e-06, + "loss": 1.2265, + "step": 2187 + }, + { + "epoch": 0.5819148936170213, + "grad_norm": 3.8598103523254395, + 
"learning_rate": 9.88000553217542e-06, + "loss": 1.2892, + "step": 2188 + }, + { + "epoch": 0.5821808510638298, + "grad_norm": 3.854811191558838, + "learning_rate": 9.879813930730367e-06, + "loss": 1.1292, + "step": 2189 + }, + { + "epoch": 0.5824468085106383, + "grad_norm": 4.142318248748779, + "learning_rate": 9.879622178298128e-06, + "loss": 1.1795, + "step": 2190 + }, + { + "epoch": 0.5827127659574468, + "grad_norm": 3.688462257385254, + "learning_rate": 9.879430274884632e-06, + "loss": 1.2044, + "step": 2191 + }, + { + "epoch": 0.5829787234042553, + "grad_norm": 3.4742586612701416, + "learning_rate": 9.879238220495818e-06, + "loss": 1.1547, + "step": 2192 + }, + { + "epoch": 0.5832446808510638, + "grad_norm": 3.9008736610412598, + "learning_rate": 9.87904601513763e-06, + "loss": 1.2293, + "step": 2193 + }, + { + "epoch": 0.5835106382978723, + "grad_norm": 3.70694899559021, + "learning_rate": 9.878853658816015e-06, + "loss": 1.2758, + "step": 2194 + }, + { + "epoch": 0.5837765957446809, + "grad_norm": 4.015002727508545, + "learning_rate": 9.878661151536923e-06, + "loss": 1.3352, + "step": 2195 + }, + { + "epoch": 0.5840425531914893, + "grad_norm": 3.423016309738159, + "learning_rate": 9.87846849330631e-06, + "loss": 1.1313, + "step": 2196 + }, + { + "epoch": 0.5843085106382979, + "grad_norm": 3.549492120742798, + "learning_rate": 9.87827568413014e-06, + "loss": 1.3162, + "step": 2197 + }, + { + "epoch": 0.5845744680851064, + "grad_norm": 4.05422306060791, + "learning_rate": 9.878082724014375e-06, + "loss": 1.2593, + "step": 2198 + }, + { + "epoch": 0.5848404255319148, + "grad_norm": 3.875730514526367, + "learning_rate": 9.877889612964988e-06, + "loss": 1.1837, + "step": 2199 + }, + { + "epoch": 0.5851063829787234, + "grad_norm": 3.4176459312438965, + "learning_rate": 9.877696350987954e-06, + "loss": 1.1748, + "step": 2200 + }, + { + "epoch": 0.5853723404255319, + "grad_norm": 4.281347751617432, + "learning_rate": 9.87750293808925e-06, + "loss": 1.272, + "step": 2201 + }, + { + "epoch": 0.5856382978723405, + "grad_norm": 4.0162577629089355, + "learning_rate": 9.877309374274865e-06, + "loss": 1.2567, + "step": 2202 + }, + { + "epoch": 0.5859042553191489, + "grad_norm": 4.051181793212891, + "learning_rate": 9.877115659550785e-06, + "loss": 1.2305, + "step": 2203 + }, + { + "epoch": 0.5861702127659575, + "grad_norm": 3.711719512939453, + "learning_rate": 9.876921793923005e-06, + "loss": 1.1956, + "step": 2204 + }, + { + "epoch": 0.586436170212766, + "grad_norm": 3.402353048324585, + "learning_rate": 9.876727777397522e-06, + "loss": 1.1938, + "step": 2205 + }, + { + "epoch": 0.5867021276595744, + "grad_norm": 3.7966136932373047, + "learning_rate": 9.87653360998034e-06, + "loss": 1.2964, + "step": 2206 + }, + { + "epoch": 0.586968085106383, + "grad_norm": 3.816732406616211, + "learning_rate": 9.876339291677466e-06, + "loss": 1.2739, + "step": 2207 + }, + { + "epoch": 0.5872340425531914, + "grad_norm": 3.801443576812744, + "learning_rate": 9.876144822494913e-06, + "loss": 1.2832, + "step": 2208 + }, + { + "epoch": 0.5875, + "grad_norm": 3.7559401988983154, + "learning_rate": 9.8759502024387e-06, + "loss": 1.2176, + "step": 2209 + }, + { + "epoch": 0.5877659574468085, + "grad_norm": 3.9138758182525635, + "learning_rate": 9.875755431514846e-06, + "loss": 1.3423, + "step": 2210 + }, + { + "epoch": 0.5880319148936171, + "grad_norm": 4.0434041023254395, + "learning_rate": 9.875560509729379e-06, + "loss": 1.3064, + "step": 2211 + }, + { + "epoch": 0.5882978723404255, + "grad_norm": 
3.7799887657165527, + "learning_rate": 9.87536543708833e-06, + "loss": 1.2518, + "step": 2212 + }, + { + "epoch": 0.5885638297872341, + "grad_norm": 3.8034684658050537, + "learning_rate": 9.875170213597731e-06, + "loss": 1.2485, + "step": 2213 + }, + { + "epoch": 0.5888297872340426, + "grad_norm": 4.390495300292969, + "learning_rate": 9.874974839263629e-06, + "loss": 1.263, + "step": 2214 + }, + { + "epoch": 0.589095744680851, + "grad_norm": 4.027488708496094, + "learning_rate": 9.874779314092065e-06, + "loss": 1.2718, + "step": 2215 + }, + { + "epoch": 0.5893617021276596, + "grad_norm": 3.8035428524017334, + "learning_rate": 9.87458363808909e-06, + "loss": 1.2636, + "step": 2216 + }, + { + "epoch": 0.589627659574468, + "grad_norm": 3.5652413368225098, + "learning_rate": 9.874387811260756e-06, + "loss": 1.241, + "step": 2217 + }, + { + "epoch": 0.5898936170212766, + "grad_norm": 4.2285614013671875, + "learning_rate": 9.874191833613128e-06, + "loss": 1.1943, + "step": 2218 + }, + { + "epoch": 0.5901595744680851, + "grad_norm": 4.229702472686768, + "learning_rate": 9.873995705152264e-06, + "loss": 1.382, + "step": 2219 + }, + { + "epoch": 0.5904255319148937, + "grad_norm": 4.092412948608398, + "learning_rate": 9.873799425884235e-06, + "loss": 1.132, + "step": 2220 + }, + { + "epoch": 0.5906914893617021, + "grad_norm": 3.6512703895568848, + "learning_rate": 9.873602995815113e-06, + "loss": 1.2022, + "step": 2221 + }, + { + "epoch": 0.5909574468085106, + "grad_norm": 3.634768009185791, + "learning_rate": 9.873406414950977e-06, + "loss": 1.2932, + "step": 2222 + }, + { + "epoch": 0.5912234042553192, + "grad_norm": 3.6227974891662598, + "learning_rate": 9.873209683297908e-06, + "loss": 1.2947, + "step": 2223 + }, + { + "epoch": 0.5914893617021276, + "grad_norm": 3.5124943256378174, + "learning_rate": 9.873012800861996e-06, + "loss": 1.1896, + "step": 2224 + }, + { + "epoch": 0.5917553191489362, + "grad_norm": 3.759474992752075, + "learning_rate": 9.872815767649329e-06, + "loss": 1.2116, + "step": 2225 + }, + { + "epoch": 0.5920212765957447, + "grad_norm": 3.7036375999450684, + "learning_rate": 9.872618583666005e-06, + "loss": 1.2293, + "step": 2226 + }, + { + "epoch": 0.5922872340425532, + "grad_norm": 3.61789608001709, + "learning_rate": 9.872421248918124e-06, + "loss": 1.2121, + "step": 2227 + }, + { + "epoch": 0.5925531914893617, + "grad_norm": 4.019472122192383, + "learning_rate": 9.872223763411794e-06, + "loss": 1.1467, + "step": 2228 + }, + { + "epoch": 0.5928191489361702, + "grad_norm": 3.774531364440918, + "learning_rate": 9.872026127153126e-06, + "loss": 1.3685, + "step": 2229 + }, + { + "epoch": 0.5930851063829787, + "grad_norm": 3.9165661334991455, + "learning_rate": 9.871828340148232e-06, + "loss": 1.1668, + "step": 2230 + }, + { + "epoch": 0.5933510638297872, + "grad_norm": 3.762282133102417, + "learning_rate": 9.871630402403235e-06, + "loss": 1.2315, + "step": 2231 + }, + { + "epoch": 0.5936170212765958, + "grad_norm": 3.96540904045105, + "learning_rate": 9.871432313924255e-06, + "loss": 1.3042, + "step": 2232 + }, + { + "epoch": 0.5938829787234042, + "grad_norm": 4.1440229415893555, + "learning_rate": 9.871234074717424e-06, + "loss": 1.3715, + "step": 2233 + }, + { + "epoch": 0.5941489361702128, + "grad_norm": 3.7638661861419678, + "learning_rate": 9.871035684788878e-06, + "loss": 1.2619, + "step": 2234 + }, + { + "epoch": 0.5944148936170213, + "grad_norm": 3.5591323375701904, + "learning_rate": 9.870837144144752e-06, + "loss": 1.1941, + "step": 2235 + }, + { + "epoch": 
0.5946808510638298, + "grad_norm": 4.143522262573242, + "learning_rate": 9.87063845279119e-06, + "loss": 1.1687, + "step": 2236 + }, + { + "epoch": 0.5949468085106383, + "grad_norm": 4.148569583892822, + "learning_rate": 9.87043961073434e-06, + "loss": 1.4218, + "step": 2237 + }, + { + "epoch": 0.5952127659574468, + "grad_norm": 3.687147378921509, + "learning_rate": 9.870240617980353e-06, + "loss": 1.1311, + "step": 2238 + }, + { + "epoch": 0.5954787234042553, + "grad_norm": 3.5179238319396973, + "learning_rate": 9.870041474535388e-06, + "loss": 1.1823, + "step": 2239 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 3.844238519668579, + "learning_rate": 9.869842180405607e-06, + "loss": 1.3256, + "step": 2240 + }, + { + "epoch": 0.5960106382978724, + "grad_norm": 3.9333431720733643, + "learning_rate": 9.869642735597174e-06, + "loss": 1.3545, + "step": 2241 + }, + { + "epoch": 0.5962765957446808, + "grad_norm": 3.531179666519165, + "learning_rate": 9.869443140116261e-06, + "loss": 1.3254, + "step": 2242 + }, + { + "epoch": 0.5965425531914894, + "grad_norm": 3.795381546020508, + "learning_rate": 9.869243393969045e-06, + "loss": 1.2744, + "step": 2243 + }, + { + "epoch": 0.5968085106382979, + "grad_norm": 4.001238822937012, + "learning_rate": 9.869043497161707e-06, + "loss": 1.3585, + "step": 2244 + }, + { + "epoch": 0.5970744680851063, + "grad_norm": 4.289900302886963, + "learning_rate": 9.868843449700429e-06, + "loss": 1.3628, + "step": 2245 + }, + { + "epoch": 0.5973404255319149, + "grad_norm": 3.581144332885742, + "learning_rate": 9.868643251591403e-06, + "loss": 1.3021, + "step": 2246 + }, + { + "epoch": 0.5976063829787234, + "grad_norm": 3.504152536392212, + "learning_rate": 9.868442902840823e-06, + "loss": 1.2073, + "step": 2247 + }, + { + "epoch": 0.597872340425532, + "grad_norm": 3.648141622543335, + "learning_rate": 9.868242403454886e-06, + "loss": 1.3169, + "step": 2248 + }, + { + "epoch": 0.5981382978723404, + "grad_norm": 3.544408082962036, + "learning_rate": 9.8680417534398e-06, + "loss": 1.1334, + "step": 2249 + }, + { + "epoch": 0.598404255319149, + "grad_norm": 3.6868479251861572, + "learning_rate": 9.867840952801768e-06, + "loss": 1.209, + "step": 2250 + }, + { + "epoch": 0.5986702127659574, + "grad_norm": 3.6805198192596436, + "learning_rate": 9.867640001547007e-06, + "loss": 1.3011, + "step": 2251 + }, + { + "epoch": 0.5989361702127659, + "grad_norm": 3.646977186203003, + "learning_rate": 9.867438899681734e-06, + "loss": 1.2178, + "step": 2252 + }, + { + "epoch": 0.5992021276595745, + "grad_norm": 3.4612386226654053, + "learning_rate": 9.867237647212168e-06, + "loss": 1.1646, + "step": 2253 + }, + { + "epoch": 0.5994680851063829, + "grad_norm": 3.663968324661255, + "learning_rate": 9.867036244144544e-06, + "loss": 1.2337, + "step": 2254 + }, + { + "epoch": 0.5997340425531915, + "grad_norm": 3.724919080734253, + "learning_rate": 9.866834690485083e-06, + "loss": 1.3467, + "step": 2255 + }, + { + "epoch": 0.6, + "grad_norm": 3.6140668392181396, + "learning_rate": 9.86663298624003e-06, + "loss": 1.2684, + "step": 2256 + }, + { + "epoch": 0.6002659574468086, + "grad_norm": 3.805572271347046, + "learning_rate": 9.866431131415621e-06, + "loss": 1.3172, + "step": 2257 + }, + { + "epoch": 0.600531914893617, + "grad_norm": 3.921037435531616, + "learning_rate": 9.866229126018104e-06, + "loss": 1.1632, + "step": 2258 + }, + { + "epoch": 0.6007978723404256, + "grad_norm": 4.814824104309082, + "learning_rate": 9.866026970053728e-06, + "loss": 1.371, + "step": 2259 + }, + { + 
"epoch": 0.601063829787234, + "grad_norm": 3.8934485912323, + "learning_rate": 9.86582466352875e-06, + "loss": 1.2192, + "step": 2260 + }, + { + "epoch": 0.6013297872340425, + "grad_norm": 4.167794704437256, + "learning_rate": 9.865622206449428e-06, + "loss": 1.3167, + "step": 2261 + }, + { + "epoch": 0.6015957446808511, + "grad_norm": 3.916013479232788, + "learning_rate": 9.865419598822025e-06, + "loss": 1.2492, + "step": 2262 + }, + { + "epoch": 0.6018617021276595, + "grad_norm": 3.5649423599243164, + "learning_rate": 9.865216840652811e-06, + "loss": 1.1833, + "step": 2263 + }, + { + "epoch": 0.6021276595744681, + "grad_norm": 3.508890151977539, + "learning_rate": 9.865013931948061e-06, + "loss": 1.2527, + "step": 2264 + }, + { + "epoch": 0.6023936170212766, + "grad_norm": 3.513054132461548, + "learning_rate": 9.864810872714053e-06, + "loss": 1.2032, + "step": 2265 + }, + { + "epoch": 0.6026595744680852, + "grad_norm": 3.777679443359375, + "learning_rate": 9.864607662957066e-06, + "loss": 1.3355, + "step": 2266 + }, + { + "epoch": 0.6029255319148936, + "grad_norm": 3.778639316558838, + "learning_rate": 9.864404302683393e-06, + "loss": 1.3697, + "step": 2267 + }, + { + "epoch": 0.6031914893617021, + "grad_norm": 3.5880136489868164, + "learning_rate": 9.864200791899323e-06, + "loss": 1.2124, + "step": 2268 + }, + { + "epoch": 0.6034574468085107, + "grad_norm": 3.5101895332336426, + "learning_rate": 9.863997130611153e-06, + "loss": 1.1641, + "step": 2269 + }, + { + "epoch": 0.6037234042553191, + "grad_norm": 3.5391786098480225, + "learning_rate": 9.863793318825186e-06, + "loss": 1.2167, + "step": 2270 + }, + { + "epoch": 0.6039893617021277, + "grad_norm": 3.74766206741333, + "learning_rate": 9.863589356547728e-06, + "loss": 1.3565, + "step": 2271 + }, + { + "epoch": 0.6042553191489362, + "grad_norm": 3.966728925704956, + "learning_rate": 9.863385243785088e-06, + "loss": 1.3416, + "step": 2272 + }, + { + "epoch": 0.6045212765957447, + "grad_norm": 3.2839200496673584, + "learning_rate": 9.863180980543582e-06, + "loss": 1.1073, + "step": 2273 + }, + { + "epoch": 0.6047872340425532, + "grad_norm": 3.958099603652954, + "learning_rate": 9.862976566829532e-06, + "loss": 1.356, + "step": 2274 + }, + { + "epoch": 0.6050531914893617, + "grad_norm": 3.6041507720947266, + "learning_rate": 9.862772002649261e-06, + "loss": 1.4091, + "step": 2275 + }, + { + "epoch": 0.6053191489361702, + "grad_norm": 3.320826530456543, + "learning_rate": 9.862567288009099e-06, + "loss": 1.196, + "step": 2276 + }, + { + "epoch": 0.6055851063829787, + "grad_norm": 3.375542163848877, + "learning_rate": 9.862362422915382e-06, + "loss": 1.161, + "step": 2277 + }, + { + "epoch": 0.6058510638297873, + "grad_norm": 3.680457353591919, + "learning_rate": 9.862157407374446e-06, + "loss": 1.129, + "step": 2278 + }, + { + "epoch": 0.6061170212765957, + "grad_norm": 3.8363595008850098, + "learning_rate": 9.861952241392633e-06, + "loss": 1.309, + "step": 2279 + }, + { + "epoch": 0.6063829787234043, + "grad_norm": 3.7582051753997803, + "learning_rate": 9.861746924976297e-06, + "loss": 1.2328, + "step": 2280 + }, + { + "epoch": 0.6066489361702128, + "grad_norm": 3.5171892642974854, + "learning_rate": 9.861541458131785e-06, + "loss": 1.2098, + "step": 2281 + }, + { + "epoch": 0.6069148936170212, + "grad_norm": 3.905834197998047, + "learning_rate": 9.861335840865455e-06, + "loss": 1.2909, + "step": 2282 + }, + { + "epoch": 0.6071808510638298, + "grad_norm": 3.9347522258758545, + "learning_rate": 9.861130073183674e-06, + "loss": 1.265, + 
"step": 2283 + }, + { + "epoch": 0.6074468085106383, + "grad_norm": 3.6212542057037354, + "learning_rate": 9.860924155092803e-06, + "loss": 1.3044, + "step": 2284 + }, + { + "epoch": 0.6077127659574468, + "grad_norm": 3.9703807830810547, + "learning_rate": 9.860718086599217e-06, + "loss": 1.3497, + "step": 2285 + }, + { + "epoch": 0.6079787234042553, + "grad_norm": 3.94783091545105, + "learning_rate": 9.860511867709289e-06, + "loss": 1.248, + "step": 2286 + }, + { + "epoch": 0.6082446808510639, + "grad_norm": 4.237410545349121, + "learning_rate": 9.860305498429404e-06, + "loss": 1.3791, + "step": 2287 + }, + { + "epoch": 0.6085106382978723, + "grad_norm": 3.7259433269500732, + "learning_rate": 9.860098978765942e-06, + "loss": 1.3233, + "step": 2288 + }, + { + "epoch": 0.6087765957446809, + "grad_norm": 3.8508055210113525, + "learning_rate": 9.859892308725296e-06, + "loss": 1.2324, + "step": 2289 + }, + { + "epoch": 0.6090425531914894, + "grad_norm": 3.8663196563720703, + "learning_rate": 9.859685488313861e-06, + "loss": 1.2425, + "step": 2290 + }, + { + "epoch": 0.6093085106382978, + "grad_norm": 4.03026008605957, + "learning_rate": 9.859478517538035e-06, + "loss": 1.2932, + "step": 2291 + }, + { + "epoch": 0.6095744680851064, + "grad_norm": 3.517122745513916, + "learning_rate": 9.859271396404223e-06, + "loss": 1.1597, + "step": 2292 + }, + { + "epoch": 0.6098404255319149, + "grad_norm": 3.6704776287078857, + "learning_rate": 9.85906412491883e-06, + "loss": 1.1834, + "step": 2293 + }, + { + "epoch": 0.6101063829787234, + "grad_norm": 4.267923831939697, + "learning_rate": 9.858856703088276e-06, + "loss": 1.1888, + "step": 2294 + }, + { + "epoch": 0.6103723404255319, + "grad_norm": 4.178102493286133, + "learning_rate": 9.85864913091897e-06, + "loss": 1.3685, + "step": 2295 + }, + { + "epoch": 0.6106382978723405, + "grad_norm": 4.176131725311279, + "learning_rate": 9.858441408417345e-06, + "loss": 1.231, + "step": 2296 + }, + { + "epoch": 0.6109042553191489, + "grad_norm": 3.4884450435638428, + "learning_rate": 9.85823353558982e-06, + "loss": 1.2206, + "step": 2297 + }, + { + "epoch": 0.6111702127659574, + "grad_norm": 3.8766729831695557, + "learning_rate": 9.85802551244283e-06, + "loss": 1.3035, + "step": 2298 + }, + { + "epoch": 0.611436170212766, + "grad_norm": 3.5301473140716553, + "learning_rate": 9.857817338982811e-06, + "loss": 1.1712, + "step": 2299 + }, + { + "epoch": 0.6117021276595744, + "grad_norm": 3.7902379035949707, + "learning_rate": 9.857609015216205e-06, + "loss": 1.1324, + "step": 2300 + }, + { + "epoch": 0.611968085106383, + "grad_norm": 4.028817176818848, + "learning_rate": 9.857400541149455e-06, + "loss": 1.3142, + "step": 2301 + }, + { + "epoch": 0.6122340425531915, + "grad_norm": 3.6242549419403076, + "learning_rate": 9.857191916789016e-06, + "loss": 1.2368, + "step": 2302 + }, + { + "epoch": 0.6125, + "grad_norm": 3.6776719093322754, + "learning_rate": 9.856983142141338e-06, + "loss": 1.3289, + "step": 2303 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 3.8104121685028076, + "learning_rate": 9.856774217212886e-06, + "loss": 1.3076, + "step": 2304 + }, + { + "epoch": 0.613031914893617, + "grad_norm": 3.668893337249756, + "learning_rate": 9.85656514201012e-06, + "loss": 1.2935, + "step": 2305 + }, + { + "epoch": 0.6132978723404255, + "grad_norm": 3.5787241458892822, + "learning_rate": 9.85635591653951e-06, + "loss": 1.1477, + "step": 2306 + }, + { + "epoch": 0.613563829787234, + "grad_norm": 3.9113807678222656, + "learning_rate": 9.856146540807531e-06, + "loss": 
1.3338, + "step": 2307 + }, + { + "epoch": 0.6138297872340426, + "grad_norm": 3.6910572052001953, + "learning_rate": 9.85593701482066e-06, + "loss": 1.1302, + "step": 2308 + }, + { + "epoch": 0.614095744680851, + "grad_norm": 4.1038689613342285, + "learning_rate": 9.855727338585381e-06, + "loss": 1.4519, + "step": 2309 + }, + { + "epoch": 0.6143617021276596, + "grad_norm": 3.5061099529266357, + "learning_rate": 9.855517512108182e-06, + "loss": 1.2243, + "step": 2310 + }, + { + "epoch": 0.6146276595744681, + "grad_norm": 3.5231192111968994, + "learning_rate": 9.855307535395553e-06, + "loss": 1.2158, + "step": 2311 + }, + { + "epoch": 0.6148936170212767, + "grad_norm": 3.8572421073913574, + "learning_rate": 9.855097408453993e-06, + "loss": 1.2392, + "step": 2312 + }, + { + "epoch": 0.6151595744680851, + "grad_norm": 3.7707557678222656, + "learning_rate": 9.854887131290002e-06, + "loss": 1.2316, + "step": 2313 + }, + { + "epoch": 0.6154255319148936, + "grad_norm": 3.860130548477173, + "learning_rate": 9.854676703910092e-06, + "loss": 1.2118, + "step": 2314 + }, + { + "epoch": 0.6156914893617021, + "grad_norm": 3.404811382293701, + "learning_rate": 9.854466126320763e-06, + "loss": 1.1942, + "step": 2315 + }, + { + "epoch": 0.6159574468085106, + "grad_norm": 3.659116268157959, + "learning_rate": 9.854255398528541e-06, + "loss": 1.2822, + "step": 2316 + }, + { + "epoch": 0.6162234042553192, + "grad_norm": 3.97190260887146, + "learning_rate": 9.85404452053994e-06, + "loss": 1.3892, + "step": 2317 + }, + { + "epoch": 0.6164893617021276, + "grad_norm": 3.99293851852417, + "learning_rate": 9.853833492361486e-06, + "loss": 1.2248, + "step": 2318 + }, + { + "epoch": 0.6167553191489362, + "grad_norm": 3.846611499786377, + "learning_rate": 9.85362231399971e-06, + "loss": 1.3553, + "step": 2319 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 3.922665596008301, + "learning_rate": 9.853410985461145e-06, + "loss": 1.2831, + "step": 2320 + }, + { + "epoch": 0.6172872340425531, + "grad_norm": 3.788879871368408, + "learning_rate": 9.85319950675233e-06, + "loss": 1.3213, + "step": 2321 + }, + { + "epoch": 0.6175531914893617, + "grad_norm": 3.7415027618408203, + "learning_rate": 9.852987877879807e-06, + "loss": 1.1951, + "step": 2322 + }, + { + "epoch": 0.6178191489361702, + "grad_norm": 4.016115665435791, + "learning_rate": 9.852776098850128e-06, + "loss": 1.2595, + "step": 2323 + }, + { + "epoch": 0.6180851063829788, + "grad_norm": 3.5927200317382812, + "learning_rate": 9.85256416966984e-06, + "loss": 1.2103, + "step": 2324 + }, + { + "epoch": 0.6183510638297872, + "grad_norm": 3.9768147468566895, + "learning_rate": 9.852352090345504e-06, + "loss": 1.3389, + "step": 2325 + }, + { + "epoch": 0.6186170212765958, + "grad_norm": 3.378852605819702, + "learning_rate": 9.852139860883684e-06, + "loss": 1.1266, + "step": 2326 + }, + { + "epoch": 0.6188829787234043, + "grad_norm": 4.071725368499756, + "learning_rate": 9.851927481290943e-06, + "loss": 1.4006, + "step": 2327 + }, + { + "epoch": 0.6191489361702127, + "grad_norm": 3.721118688583374, + "learning_rate": 9.851714951573853e-06, + "loss": 1.2344, + "step": 2328 + }, + { + "epoch": 0.6194148936170213, + "grad_norm": 3.551180839538574, + "learning_rate": 9.851502271738989e-06, + "loss": 1.3175, + "step": 2329 + }, + { + "epoch": 0.6196808510638298, + "grad_norm": 3.6764516830444336, + "learning_rate": 9.851289441792934e-06, + "loss": 1.2169, + "step": 2330 + }, + { + "epoch": 0.6199468085106383, + "grad_norm": 3.8505606651306152, + "learning_rate": 
9.851076461742272e-06, + "loss": 1.3586, + "step": 2331 + }, + { + "epoch": 0.6202127659574468, + "grad_norm": 3.9605445861816406, + "learning_rate": 9.850863331593591e-06, + "loss": 1.2454, + "step": 2332 + }, + { + "epoch": 0.6204787234042554, + "grad_norm": 4.140010833740234, + "learning_rate": 9.85065005135349e-06, + "loss": 1.4014, + "step": 2333 + }, + { + "epoch": 0.6207446808510638, + "grad_norm": 4.118074417114258, + "learning_rate": 9.850436621028565e-06, + "loss": 1.2367, + "step": 2334 + }, + { + "epoch": 0.6210106382978723, + "grad_norm": 3.6424777507781982, + "learning_rate": 9.85022304062542e-06, + "loss": 1.129, + "step": 2335 + }, + { + "epoch": 0.6212765957446809, + "grad_norm": 3.643145799636841, + "learning_rate": 9.850009310150662e-06, + "loss": 1.3767, + "step": 2336 + }, + { + "epoch": 0.6215425531914893, + "grad_norm": 3.913959503173828, + "learning_rate": 9.849795429610908e-06, + "loss": 1.1977, + "step": 2337 + }, + { + "epoch": 0.6218085106382979, + "grad_norm": 3.91186261177063, + "learning_rate": 9.849581399012772e-06, + "loss": 1.2842, + "step": 2338 + }, + { + "epoch": 0.6220744680851064, + "grad_norm": 3.7167961597442627, + "learning_rate": 9.849367218362879e-06, + "loss": 1.2802, + "step": 2339 + }, + { + "epoch": 0.6223404255319149, + "grad_norm": 3.5471532344818115, + "learning_rate": 9.849152887667855e-06, + "loss": 1.2785, + "step": 2340 + }, + { + "epoch": 0.6226063829787234, + "grad_norm": 4.358826637268066, + "learning_rate": 9.84893840693433e-06, + "loss": 1.1696, + "step": 2341 + }, + { + "epoch": 0.622872340425532, + "grad_norm": 3.869590997695923, + "learning_rate": 9.848723776168942e-06, + "loss": 1.3316, + "step": 2342 + }, + { + "epoch": 0.6231382978723404, + "grad_norm": 4.493122577667236, + "learning_rate": 9.848508995378333e-06, + "loss": 1.2928, + "step": 2343 + }, + { + "epoch": 0.6234042553191489, + "grad_norm": 3.808885335922241, + "learning_rate": 9.848294064569146e-06, + "loss": 1.331, + "step": 2344 + }, + { + "epoch": 0.6236702127659575, + "grad_norm": 3.6614105701446533, + "learning_rate": 9.848078983748032e-06, + "loss": 1.3549, + "step": 2345 + }, + { + "epoch": 0.6239361702127659, + "grad_norm": 3.5685722827911377, + "learning_rate": 9.847863752921649e-06, + "loss": 1.1914, + "step": 2346 + }, + { + "epoch": 0.6242021276595745, + "grad_norm": 4.203314781188965, + "learning_rate": 9.847648372096652e-06, + "loss": 1.3369, + "step": 2347 + }, + { + "epoch": 0.624468085106383, + "grad_norm": 3.762103796005249, + "learning_rate": 9.847432841279707e-06, + "loss": 1.261, + "step": 2348 + }, + { + "epoch": 0.6247340425531915, + "grad_norm": 4.371121883392334, + "learning_rate": 9.847217160477483e-06, + "loss": 1.3071, + "step": 2349 + }, + { + "epoch": 0.625, + "grad_norm": 3.928662061691284, + "learning_rate": 9.847001329696653e-06, + "loss": 1.2321, + "step": 2350 + }, + { + "epoch": 0.6252659574468085, + "grad_norm": 3.7375707626342773, + "learning_rate": 9.846785348943896e-06, + "loss": 1.3022, + "step": 2351 + }, + { + "epoch": 0.625531914893617, + "grad_norm": 3.684936046600342, + "learning_rate": 9.846569218225892e-06, + "loss": 1.2365, + "step": 2352 + }, + { + "epoch": 0.6257978723404255, + "grad_norm": 3.5079708099365234, + "learning_rate": 9.846352937549332e-06, + "loss": 1.2328, + "step": 2353 + }, + { + "epoch": 0.6260638297872341, + "grad_norm": 3.814976692199707, + "learning_rate": 9.846136506920907e-06, + "loss": 1.1824, + "step": 2354 + }, + { + "epoch": 0.6263297872340425, + "grad_norm": 3.3843934535980225, + 
"learning_rate": 9.84591992634731e-06, + "loss": 1.0477, + "step": 2355 + }, + { + "epoch": 0.6265957446808511, + "grad_norm": 3.712428569793701, + "learning_rate": 9.845703195835248e-06, + "loss": 1.2826, + "step": 2356 + }, + { + "epoch": 0.6268617021276596, + "grad_norm": 3.617882251739502, + "learning_rate": 9.845486315391421e-06, + "loss": 1.2472, + "step": 2357 + }, + { + "epoch": 0.627127659574468, + "grad_norm": 4.057145595550537, + "learning_rate": 9.845269285022545e-06, + "loss": 1.4144, + "step": 2358 + }, + { + "epoch": 0.6273936170212766, + "grad_norm": 4.23139762878418, + "learning_rate": 9.845052104735331e-06, + "loss": 1.4445, + "step": 2359 + }, + { + "epoch": 0.6276595744680851, + "grad_norm": 3.8976731300354004, + "learning_rate": 9.844834774536503e-06, + "loss": 1.2646, + "step": 2360 + }, + { + "epoch": 0.6279255319148936, + "grad_norm": 3.6036627292633057, + "learning_rate": 9.844617294432781e-06, + "loss": 1.251, + "step": 2361 + }, + { + "epoch": 0.6281914893617021, + "grad_norm": 3.4059393405914307, + "learning_rate": 9.844399664430896e-06, + "loss": 1.1432, + "step": 2362 + }, + { + "epoch": 0.6284574468085107, + "grad_norm": 3.6594855785369873, + "learning_rate": 9.844181884537583e-06, + "loss": 1.3047, + "step": 2363 + }, + { + "epoch": 0.6287234042553191, + "grad_norm": 4.183903217315674, + "learning_rate": 9.843963954759578e-06, + "loss": 1.2951, + "step": 2364 + }, + { + "epoch": 0.6289893617021277, + "grad_norm": 3.496905565261841, + "learning_rate": 9.843745875103628e-06, + "loss": 1.3087, + "step": 2365 + }, + { + "epoch": 0.6292553191489362, + "grad_norm": 3.5995302200317383, + "learning_rate": 9.843527645576475e-06, + "loss": 1.2998, + "step": 2366 + }, + { + "epoch": 0.6295212765957446, + "grad_norm": 3.597393035888672, + "learning_rate": 9.843309266184875e-06, + "loss": 1.2151, + "step": 2367 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 3.922405481338501, + "learning_rate": 9.843090736935583e-06, + "loss": 1.4409, + "step": 2368 + }, + { + "epoch": 0.6300531914893617, + "grad_norm": 3.7593741416931152, + "learning_rate": 9.842872057835363e-06, + "loss": 1.0905, + "step": 2369 + }, + { + "epoch": 0.6303191489361702, + "grad_norm": 3.570892572402954, + "learning_rate": 9.842653228890979e-06, + "loss": 1.2337, + "step": 2370 + }, + { + "epoch": 0.6305851063829787, + "grad_norm": 3.2270023822784424, + "learning_rate": 9.842434250109202e-06, + "loss": 0.9824, + "step": 2371 + }, + { + "epoch": 0.6308510638297873, + "grad_norm": 3.9054601192474365, + "learning_rate": 9.84221512149681e-06, + "loss": 1.3091, + "step": 2372 + }, + { + "epoch": 0.6311170212765957, + "grad_norm": 3.7820627689361572, + "learning_rate": 9.84199584306058e-06, + "loss": 1.2331, + "step": 2373 + }, + { + "epoch": 0.6313829787234042, + "grad_norm": 3.407257080078125, + "learning_rate": 9.841776414807297e-06, + "loss": 1.1868, + "step": 2374 + }, + { + "epoch": 0.6316489361702128, + "grad_norm": 3.471640110015869, + "learning_rate": 9.841556836743752e-06, + "loss": 1.2025, + "step": 2375 + }, + { + "epoch": 0.6319148936170212, + "grad_norm": 3.824422597885132, + "learning_rate": 9.841337108876739e-06, + "loss": 1.1932, + "step": 2376 + }, + { + "epoch": 0.6321808510638298, + "grad_norm": 3.6980538368225098, + "learning_rate": 9.841117231213055e-06, + "loss": 1.2374, + "step": 2377 + }, + { + "epoch": 0.6324468085106383, + "grad_norm": 3.9002277851104736, + "learning_rate": 9.840897203759502e-06, + "loss": 1.3205, + "step": 2378 + }, + { + "epoch": 0.6327127659574469, + 
"grad_norm": 3.993248462677002, + "learning_rate": 9.840677026522893e-06, + "loss": 1.1262, + "step": 2379 + }, + { + "epoch": 0.6329787234042553, + "grad_norm": 3.8742499351501465, + "learning_rate": 9.840456699510038e-06, + "loss": 1.1456, + "step": 2380 + }, + { + "epoch": 0.6332446808510638, + "grad_norm": 3.772584915161133, + "learning_rate": 9.840236222727752e-06, + "loss": 1.1367, + "step": 2381 + }, + { + "epoch": 0.6335106382978724, + "grad_norm": 3.7653708457946777, + "learning_rate": 9.840015596182861e-06, + "loss": 1.24, + "step": 2382 + }, + { + "epoch": 0.6337765957446808, + "grad_norm": 3.4554617404937744, + "learning_rate": 9.839794819882188e-06, + "loss": 1.2708, + "step": 2383 + }, + { + "epoch": 0.6340425531914894, + "grad_norm": 3.808807611465454, + "learning_rate": 9.839573893832564e-06, + "loss": 1.3985, + "step": 2384 + }, + { + "epoch": 0.6343085106382979, + "grad_norm": 3.6254007816314697, + "learning_rate": 9.839352818040825e-06, + "loss": 1.3145, + "step": 2385 + }, + { + "epoch": 0.6345744680851064, + "grad_norm": 3.83559513092041, + "learning_rate": 9.839131592513814e-06, + "loss": 1.2868, + "step": 2386 + }, + { + "epoch": 0.6348404255319149, + "grad_norm": 3.465432643890381, + "learning_rate": 9.838910217258375e-06, + "loss": 1.213, + "step": 2387 + }, + { + "epoch": 0.6351063829787233, + "grad_norm": 3.762899160385132, + "learning_rate": 9.838688692281356e-06, + "loss": 1.3678, + "step": 2388 + }, + { + "epoch": 0.6353723404255319, + "grad_norm": 3.573856830596924, + "learning_rate": 9.83846701758961e-06, + "loss": 1.3181, + "step": 2389 + }, + { + "epoch": 0.6356382978723404, + "grad_norm": 3.873749256134033, + "learning_rate": 9.838245193189999e-06, + "loss": 1.252, + "step": 2390 + }, + { + "epoch": 0.635904255319149, + "grad_norm": 3.5495100021362305, + "learning_rate": 9.838023219089386e-06, + "loss": 1.352, + "step": 2391 + }, + { + "epoch": 0.6361702127659574, + "grad_norm": 3.6257059574127197, + "learning_rate": 9.837801095294639e-06, + "loss": 1.2099, + "step": 2392 + }, + { + "epoch": 0.636436170212766, + "grad_norm": 3.658745288848877, + "learning_rate": 9.83757882181263e-06, + "loss": 1.2089, + "step": 2393 + }, + { + "epoch": 0.6367021276595745, + "grad_norm": 3.6948094367980957, + "learning_rate": 9.837356398650235e-06, + "loss": 1.3032, + "step": 2394 + }, + { + "epoch": 0.636968085106383, + "grad_norm": 3.677865743637085, + "learning_rate": 9.83713382581434e-06, + "loss": 1.2295, + "step": 2395 + }, + { + "epoch": 0.6372340425531915, + "grad_norm": 3.758213758468628, + "learning_rate": 9.836911103311828e-06, + "loss": 1.2542, + "step": 2396 + }, + { + "epoch": 0.6375, + "grad_norm": 3.710860252380371, + "learning_rate": 9.836688231149593e-06, + "loss": 1.3331, + "step": 2397 + }, + { + "epoch": 0.6377659574468085, + "grad_norm": 3.436738967895508, + "learning_rate": 9.836465209334529e-06, + "loss": 1.1318, + "step": 2398 + }, + { + "epoch": 0.638031914893617, + "grad_norm": 4.398902416229248, + "learning_rate": 9.836242037873536e-06, + "loss": 1.3268, + "step": 2399 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 3.483926773071289, + "learning_rate": 9.836018716773522e-06, + "loss": 1.1744, + "step": 2400 + }, + { + "epoch": 0.638563829787234, + "grad_norm": 3.766038417816162, + "learning_rate": 9.835795246041395e-06, + "loss": 1.1829, + "step": 2401 + }, + { + "epoch": 0.6388297872340426, + "grad_norm": 3.7989938259124756, + "learning_rate": 9.835571625684068e-06, + "loss": 1.2691, + "step": 2402 + }, + { + "epoch": 
0.6390957446808511, + "grad_norm": 3.6767778396606445, + "learning_rate": 9.835347855708464e-06, + "loss": 1.1456, + "step": 2403 + }, + { + "epoch": 0.6393617021276595, + "grad_norm": 3.689368963241577, + "learning_rate": 9.835123936121504e-06, + "loss": 1.2714, + "step": 2404 + }, + { + "epoch": 0.6396276595744681, + "grad_norm": 3.6774284839630127, + "learning_rate": 9.834899866930116e-06, + "loss": 1.1968, + "step": 2405 + }, + { + "epoch": 0.6398936170212766, + "grad_norm": 3.734713077545166, + "learning_rate": 9.834675648141235e-06, + "loss": 1.4036, + "step": 2406 + }, + { + "epoch": 0.6401595744680851, + "grad_norm": 3.4915902614593506, + "learning_rate": 9.834451279761796e-06, + "loss": 1.0733, + "step": 2407 + }, + { + "epoch": 0.6404255319148936, + "grad_norm": 3.5466091632843018, + "learning_rate": 9.834226761798742e-06, + "loss": 1.2197, + "step": 2408 + }, + { + "epoch": 0.6406914893617022, + "grad_norm": 3.5611202716827393, + "learning_rate": 9.83400209425902e-06, + "loss": 1.092, + "step": 2409 + }, + { + "epoch": 0.6409574468085106, + "grad_norm": 3.35369610786438, + "learning_rate": 9.833777277149585e-06, + "loss": 1.2385, + "step": 2410 + }, + { + "epoch": 0.6412234042553191, + "grad_norm": 3.7679550647735596, + "learning_rate": 9.833552310477388e-06, + "loss": 1.0647, + "step": 2411 + }, + { + "epoch": 0.6414893617021277, + "grad_norm": 3.6990325450897217, + "learning_rate": 9.833327194249392e-06, + "loss": 1.1853, + "step": 2412 + }, + { + "epoch": 0.6417553191489361, + "grad_norm": 3.6745262145996094, + "learning_rate": 9.833101928472562e-06, + "loss": 1.2038, + "step": 2413 + }, + { + "epoch": 0.6420212765957447, + "grad_norm": 3.357508897781372, + "learning_rate": 9.832876513153867e-06, + "loss": 1.0274, + "step": 2414 + }, + { + "epoch": 0.6422872340425532, + "grad_norm": 3.786376953125, + "learning_rate": 9.832650948300284e-06, + "loss": 1.288, + "step": 2415 + }, + { + "epoch": 0.6425531914893617, + "grad_norm": 3.253251314163208, + "learning_rate": 9.83242523391879e-06, + "loss": 1.0876, + "step": 2416 + }, + { + "epoch": 0.6428191489361702, + "grad_norm": 3.3168015480041504, + "learning_rate": 9.832199370016371e-06, + "loss": 1.1551, + "step": 2417 + }, + { + "epoch": 0.6430851063829788, + "grad_norm": 3.8747761249542236, + "learning_rate": 9.831973356600013e-06, + "loss": 1.2343, + "step": 2418 + }, + { + "epoch": 0.6433510638297872, + "grad_norm": 3.9137704372406006, + "learning_rate": 9.83174719367671e-06, + "loss": 1.1782, + "step": 2419 + }, + { + "epoch": 0.6436170212765957, + "grad_norm": 3.64943528175354, + "learning_rate": 9.831520881253462e-06, + "loss": 1.0506, + "step": 2420 + }, + { + "epoch": 0.6438829787234043, + "grad_norm": 3.5648887157440186, + "learning_rate": 9.83129441933727e-06, + "loss": 1.0195, + "step": 2421 + }, + { + "epoch": 0.6441489361702127, + "grad_norm": 3.6668763160705566, + "learning_rate": 9.83106780793514e-06, + "loss": 1.349, + "step": 2422 + }, + { + "epoch": 0.6444148936170213, + "grad_norm": 3.6365723609924316, + "learning_rate": 9.830841047054083e-06, + "loss": 1.2105, + "step": 2423 + }, + { + "epoch": 0.6446808510638298, + "grad_norm": 3.657466411590576, + "learning_rate": 9.830614136701116e-06, + "loss": 1.2453, + "step": 2424 + }, + { + "epoch": 0.6449468085106383, + "grad_norm": 3.7750251293182373, + "learning_rate": 9.83038707688326e-06, + "loss": 1.2753, + "step": 2425 + }, + { + "epoch": 0.6452127659574468, + "grad_norm": 3.4032111167907715, + "learning_rate": 9.830159867607543e-06, + "loss": 1.2054, + "step": 
2426 + }, + { + "epoch": 0.6454787234042553, + "grad_norm": 3.546877861022949, + "learning_rate": 9.82993250888099e-06, + "loss": 1.35, + "step": 2427 + }, + { + "epoch": 0.6457446808510638, + "grad_norm": 3.5076162815093994, + "learning_rate": 9.829705000710642e-06, + "loss": 1.1382, + "step": 2428 + }, + { + "epoch": 0.6460106382978723, + "grad_norm": 3.955322742462158, + "learning_rate": 9.829477343103533e-06, + "loss": 1.3948, + "step": 2429 + }, + { + "epoch": 0.6462765957446809, + "grad_norm": 3.5918376445770264, + "learning_rate": 9.82924953606671e-06, + "loss": 1.2271, + "step": 2430 + }, + { + "epoch": 0.6465425531914893, + "grad_norm": 3.8371551036834717, + "learning_rate": 9.82902157960722e-06, + "loss": 1.2004, + "step": 2431 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 3.573141098022461, + "learning_rate": 9.828793473732116e-06, + "loss": 1.2059, + "step": 2432 + }, + { + "epoch": 0.6470744680851064, + "grad_norm": 3.8021459579467773, + "learning_rate": 9.828565218448457e-06, + "loss": 1.1852, + "step": 2433 + }, + { + "epoch": 0.6473404255319148, + "grad_norm": 4.022589206695557, + "learning_rate": 9.828336813763308e-06, + "loss": 1.2385, + "step": 2434 + }, + { + "epoch": 0.6476063829787234, + "grad_norm": 3.364841938018799, + "learning_rate": 9.82810825968373e-06, + "loss": 1.1976, + "step": 2435 + }, + { + "epoch": 0.6478723404255319, + "grad_norm": 4.046548843383789, + "learning_rate": 9.8278795562168e-06, + "loss": 1.3522, + "step": 2436 + }, + { + "epoch": 0.6481382978723405, + "grad_norm": 3.795485019683838, + "learning_rate": 9.82765070336959e-06, + "loss": 1.2166, + "step": 2437 + }, + { + "epoch": 0.6484042553191489, + "grad_norm": 3.8107662200927734, + "learning_rate": 9.827421701149187e-06, + "loss": 1.3138, + "step": 2438 + }, + { + "epoch": 0.6486702127659575, + "grad_norm": 3.618577241897583, + "learning_rate": 9.82719254956267e-06, + "loss": 1.1677, + "step": 2439 + }, + { + "epoch": 0.648936170212766, + "grad_norm": 3.680255651473999, + "learning_rate": 9.826963248617133e-06, + "loss": 1.2319, + "step": 2440 + }, + { + "epoch": 0.6492021276595744, + "grad_norm": 3.6145694255828857, + "learning_rate": 9.82673379831967e-06, + "loss": 1.2276, + "step": 2441 + }, + { + "epoch": 0.649468085106383, + "grad_norm": 3.643686532974243, + "learning_rate": 9.82650419867738e-06, + "loss": 1.2989, + "step": 2442 + }, + { + "epoch": 0.6497340425531914, + "grad_norm": 3.774909019470215, + "learning_rate": 9.82627444969737e-06, + "loss": 1.2749, + "step": 2443 + }, + { + "epoch": 0.65, + "grad_norm": 3.7553470134735107, + "learning_rate": 9.826044551386743e-06, + "loss": 1.0902, + "step": 2444 + }, + { + "epoch": 0.6502659574468085, + "grad_norm": 3.453191041946411, + "learning_rate": 9.825814503752618e-06, + "loss": 1.2609, + "step": 2445 + }, + { + "epoch": 0.6505319148936171, + "grad_norm": 3.889417886734009, + "learning_rate": 9.825584306802109e-06, + "loss": 1.2514, + "step": 2446 + }, + { + "epoch": 0.6507978723404255, + "grad_norm": 3.5073375701904297, + "learning_rate": 9.825353960542342e-06, + "loss": 1.2466, + "step": 2447 + }, + { + "epoch": 0.6510638297872341, + "grad_norm": 3.4606523513793945, + "learning_rate": 9.825123464980442e-06, + "loss": 1.1156, + "step": 2448 + }, + { + "epoch": 0.6513297872340426, + "grad_norm": 3.831897497177124, + "learning_rate": 9.82489282012354e-06, + "loss": 1.1323, + "step": 2449 + }, + { + "epoch": 0.651595744680851, + "grad_norm": 4.391724109649658, + "learning_rate": 9.824662025978774e-06, + "loss": 1.2543, + "step": 
2450 + }, + { + "epoch": 0.6518617021276596, + "grad_norm": 3.8090097904205322, + "learning_rate": 9.824431082553285e-06, + "loss": 1.3592, + "step": 2451 + }, + { + "epoch": 0.652127659574468, + "grad_norm": 3.706662893295288, + "learning_rate": 9.824199989854217e-06, + "loss": 1.2753, + "step": 2452 + }, + { + "epoch": 0.6523936170212766, + "grad_norm": 4.826519966125488, + "learning_rate": 9.823968747888722e-06, + "loss": 1.501, + "step": 2453 + }, + { + "epoch": 0.6526595744680851, + "grad_norm": 3.7181127071380615, + "learning_rate": 9.823737356663956e-06, + "loss": 1.283, + "step": 2454 + }, + { + "epoch": 0.6529255319148937, + "grad_norm": 3.6020474433898926, + "learning_rate": 9.823505816187076e-06, + "loss": 1.195, + "step": 2455 + }, + { + "epoch": 0.6531914893617021, + "grad_norm": 3.7805116176605225, + "learning_rate": 9.823274126465245e-06, + "loss": 1.3032, + "step": 2456 + }, + { + "epoch": 0.6534574468085106, + "grad_norm": 3.6897008419036865, + "learning_rate": 9.823042287505636e-06, + "loss": 1.33, + "step": 2457 + }, + { + "epoch": 0.6537234042553192, + "grad_norm": 3.6036691665649414, + "learning_rate": 9.82281029931542e-06, + "loss": 1.2454, + "step": 2458 + }, + { + "epoch": 0.6539893617021276, + "grad_norm": 3.8645083904266357, + "learning_rate": 9.822578161901774e-06, + "loss": 1.4082, + "step": 2459 + }, + { + "epoch": 0.6542553191489362, + "grad_norm": 3.982588052749634, + "learning_rate": 9.822345875271884e-06, + "loss": 1.2635, + "step": 2460 + }, + { + "epoch": 0.6545212765957447, + "grad_norm": 3.576320171356201, + "learning_rate": 9.822113439432933e-06, + "loss": 1.3524, + "step": 2461 + }, + { + "epoch": 0.6547872340425532, + "grad_norm": 3.387544870376587, + "learning_rate": 9.821880854392115e-06, + "loss": 1.2344, + "step": 2462 + }, + { + "epoch": 0.6550531914893617, + "grad_norm": 3.385258436203003, + "learning_rate": 9.821648120156628e-06, + "loss": 1.2054, + "step": 2463 + }, + { + "epoch": 0.6553191489361702, + "grad_norm": 3.952305316925049, + "learning_rate": 9.82141523673367e-06, + "loss": 1.153, + "step": 2464 + }, + { + "epoch": 0.6555851063829787, + "grad_norm": 3.8070571422576904, + "learning_rate": 9.821182204130448e-06, + "loss": 1.3405, + "step": 2465 + }, + { + "epoch": 0.6558510638297872, + "grad_norm": 3.9651296138763428, + "learning_rate": 9.820949022354174e-06, + "loss": 1.3205, + "step": 2466 + }, + { + "epoch": 0.6561170212765958, + "grad_norm": 3.980510950088501, + "learning_rate": 9.82071569141206e-06, + "loss": 1.401, + "step": 2467 + }, + { + "epoch": 0.6563829787234042, + "grad_norm": 4.441346168518066, + "learning_rate": 9.820482211311326e-06, + "loss": 1.3839, + "step": 2468 + }, + { + "epoch": 0.6566489361702128, + "grad_norm": 3.4150032997131348, + "learning_rate": 9.820248582059197e-06, + "loss": 1.0058, + "step": 2469 + }, + { + "epoch": 0.6569148936170213, + "grad_norm": 3.4013893604278564, + "learning_rate": 9.820014803662905e-06, + "loss": 1.1612, + "step": 2470 + }, + { + "epoch": 0.6571808510638298, + "grad_norm": 4.017107009887695, + "learning_rate": 9.819780876129677e-06, + "loss": 1.2295, + "step": 2471 + }, + { + "epoch": 0.6574468085106383, + "grad_norm": 3.500370979309082, + "learning_rate": 9.819546799466756e-06, + "loss": 1.2573, + "step": 2472 + }, + { + "epoch": 0.6577127659574468, + "grad_norm": 3.7119557857513428, + "learning_rate": 9.81931257368138e-06, + "loss": 1.1827, + "step": 2473 + }, + { + "epoch": 0.6579787234042553, + "grad_norm": 4.006588935852051, + "learning_rate": 9.8190781987808e-06, + 
"loss": 1.3236, + "step": 2474 + }, + { + "epoch": 0.6582446808510638, + "grad_norm": 3.6574013233184814, + "learning_rate": 9.818843674772268e-06, + "loss": 1.2783, + "step": 2475 + }, + { + "epoch": 0.6585106382978724, + "grad_norm": 3.4724280834198, + "learning_rate": 9.818609001663038e-06, + "loss": 1.3469, + "step": 2476 + }, + { + "epoch": 0.6587765957446808, + "grad_norm": 3.3943772315979004, + "learning_rate": 9.818374179460372e-06, + "loss": 1.1934, + "step": 2477 + }, + { + "epoch": 0.6590425531914894, + "grad_norm": 3.6822094917297363, + "learning_rate": 9.818139208171537e-06, + "loss": 1.3505, + "step": 2478 + }, + { + "epoch": 0.6593085106382979, + "grad_norm": 3.474010467529297, + "learning_rate": 9.817904087803802e-06, + "loss": 1.1487, + "step": 2479 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 3.4429280757904053, + "learning_rate": 9.817668818364441e-06, + "loss": 1.1786, + "step": 2480 + }, + { + "epoch": 0.6598404255319149, + "grad_norm": 4.096560955047607, + "learning_rate": 9.817433399860736e-06, + "loss": 1.3167, + "step": 2481 + }, + { + "epoch": 0.6601063829787234, + "grad_norm": 3.4501636028289795, + "learning_rate": 9.817197832299971e-06, + "loss": 1.0416, + "step": 2482 + }, + { + "epoch": 0.660372340425532, + "grad_norm": 3.7687666416168213, + "learning_rate": 9.816962115689432e-06, + "loss": 1.1121, + "step": 2483 + }, + { + "epoch": 0.6606382978723404, + "grad_norm": 3.6816604137420654, + "learning_rate": 9.816726250036413e-06, + "loss": 1.2019, + "step": 2484 + }, + { + "epoch": 0.660904255319149, + "grad_norm": 4.033024787902832, + "learning_rate": 9.816490235348215e-06, + "loss": 1.3078, + "step": 2485 + }, + { + "epoch": 0.6611702127659574, + "grad_norm": 3.7372167110443115, + "learning_rate": 9.816254071632137e-06, + "loss": 1.4434, + "step": 2486 + }, + { + "epoch": 0.6614361702127659, + "grad_norm": 3.694561004638672, + "learning_rate": 9.816017758895488e-06, + "loss": 1.2969, + "step": 2487 + }, + { + "epoch": 0.6617021276595745, + "grad_norm": 4.178577423095703, + "learning_rate": 9.815781297145578e-06, + "loss": 1.3661, + "step": 2488 + }, + { + "epoch": 0.6619680851063829, + "grad_norm": 3.647728681564331, + "learning_rate": 9.815544686389727e-06, + "loss": 1.1693, + "step": 2489 + }, + { + "epoch": 0.6622340425531915, + "grad_norm": 3.6795883178710938, + "learning_rate": 9.815307926635252e-06, + "loss": 1.2308, + "step": 2490 + }, + { + "epoch": 0.6625, + "grad_norm": 3.8441531658172607, + "learning_rate": 9.81507101788948e-06, + "loss": 1.2011, + "step": 2491 + }, + { + "epoch": 0.6627659574468086, + "grad_norm": 3.512495994567871, + "learning_rate": 9.814833960159744e-06, + "loss": 1.1509, + "step": 2492 + }, + { + "epoch": 0.663031914893617, + "grad_norm": 3.631899356842041, + "learning_rate": 9.814596753453376e-06, + "loss": 1.0989, + "step": 2493 + }, + { + "epoch": 0.6632978723404256, + "grad_norm": 3.5272533893585205, + "learning_rate": 9.814359397777716e-06, + "loss": 1.3053, + "step": 2494 + }, + { + "epoch": 0.663563829787234, + "grad_norm": 3.492922306060791, + "learning_rate": 9.814121893140105e-06, + "loss": 1.2977, + "step": 2495 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 3.5858078002929688, + "learning_rate": 9.813884239547898e-06, + "loss": 1.1963, + "step": 2496 + }, + { + "epoch": 0.6640957446808511, + "grad_norm": 3.4466118812561035, + "learning_rate": 9.813646437008444e-06, + "loss": 1.266, + "step": 2497 + }, + { + "epoch": 0.6643617021276595, + "grad_norm": 3.682159900665283, + "learning_rate": 
9.813408485529103e-06, + "loss": 1.1549, + "step": 2498 + }, + { + "epoch": 0.6646276595744681, + "grad_norm": 4.358649253845215, + "learning_rate": 9.813170385117235e-06, + "loss": 1.3577, + "step": 2499 + }, + { + "epoch": 0.6648936170212766, + "grad_norm": 4.059812068939209, + "learning_rate": 9.81293213578021e-06, + "loss": 1.3728, + "step": 2500 + }, + { + "epoch": 0.6648936170212766, + "eval_loss": 1.2857128381729126, + "eval_runtime": 12.6822, + "eval_samples_per_second": 31.54, + "eval_steps_per_second": 3.943, + "step": 2500 + }, + { + "epoch": 0.6651595744680852, + "grad_norm": 3.519260883331299, + "learning_rate": 9.812693737525396e-06, + "loss": 1.1743, + "step": 2501 + }, + { + "epoch": 0.6654255319148936, + "grad_norm": 4.004322052001953, + "learning_rate": 9.812455190360172e-06, + "loss": 1.2847, + "step": 2502 + }, + { + "epoch": 0.6656914893617021, + "grad_norm": 3.699012517929077, + "learning_rate": 9.81221649429192e-06, + "loss": 1.3645, + "step": 2503 + }, + { + "epoch": 0.6659574468085107, + "grad_norm": 3.5919108390808105, + "learning_rate": 9.811977649328021e-06, + "loss": 1.1794, + "step": 2504 + }, + { + "epoch": 0.6662234042553191, + "grad_norm": 3.382624626159668, + "learning_rate": 9.81173865547587e-06, + "loss": 1.2909, + "step": 2505 + }, + { + "epoch": 0.6664893617021277, + "grad_norm": 3.7188732624053955, + "learning_rate": 9.811499512742861e-06, + "loss": 1.2731, + "step": 2506 + }, + { + "epoch": 0.6667553191489362, + "grad_norm": 3.5745997428894043, + "learning_rate": 9.811260221136392e-06, + "loss": 1.1994, + "step": 2507 + }, + { + "epoch": 0.6670212765957447, + "grad_norm": 3.6393473148345947, + "learning_rate": 9.811020780663865e-06, + "loss": 1.2335, + "step": 2508 + }, + { + "epoch": 0.6672872340425532, + "grad_norm": 3.4967026710510254, + "learning_rate": 9.810781191332692e-06, + "loss": 1.2272, + "step": 2509 + }, + { + "epoch": 0.6675531914893617, + "grad_norm": 3.826430559158325, + "learning_rate": 9.810541453150286e-06, + "loss": 1.3689, + "step": 2510 + }, + { + "epoch": 0.6678191489361702, + "grad_norm": 4.058473110198975, + "learning_rate": 9.810301566124063e-06, + "loss": 1.1942, + "step": 2511 + }, + { + "epoch": 0.6680851063829787, + "grad_norm": 3.5520458221435547, + "learning_rate": 9.810061530261446e-06, + "loss": 1.1599, + "step": 2512 + }, + { + "epoch": 0.6683510638297873, + "grad_norm": 3.7619452476501465, + "learning_rate": 9.80982134556986e-06, + "loss": 1.2391, + "step": 2513 + }, + { + "epoch": 0.6686170212765957, + "grad_norm": 3.9400548934936523, + "learning_rate": 9.809581012056743e-06, + "loss": 1.2792, + "step": 2514 + }, + { + "epoch": 0.6688829787234043, + "grad_norm": 3.3986830711364746, + "learning_rate": 9.809340529729523e-06, + "loss": 1.2333, + "step": 2515 + }, + { + "epoch": 0.6691489361702128, + "grad_norm": 3.8278701305389404, + "learning_rate": 9.809099898595647e-06, + "loss": 1.2988, + "step": 2516 + }, + { + "epoch": 0.6694148936170212, + "grad_norm": 3.8813681602478027, + "learning_rate": 9.808859118662558e-06, + "loss": 1.1505, + "step": 2517 + }, + { + "epoch": 0.6696808510638298, + "grad_norm": 3.5952844619750977, + "learning_rate": 9.808618189937706e-06, + "loss": 1.3804, + "step": 2518 + }, + { + "epoch": 0.6699468085106383, + "grad_norm": 3.642479181289673, + "learning_rate": 9.808377112428546e-06, + "loss": 1.2918, + "step": 2519 + }, + { + "epoch": 0.6702127659574468, + "grad_norm": 3.810826301574707, + "learning_rate": 9.808135886142536e-06, + "loss": 1.3684, + "step": 2520 + }, + { + "epoch": 
0.6704787234042553, + "grad_norm": 3.843879222869873, + "learning_rate": 9.807894511087141e-06, + "loss": 1.2815, + "step": 2521 + }, + { + "epoch": 0.6707446808510639, + "grad_norm": 3.68229341506958, + "learning_rate": 9.807652987269829e-06, + "loss": 1.1894, + "step": 2522 + }, + { + "epoch": 0.6710106382978723, + "grad_norm": 3.585465669631958, + "learning_rate": 9.807411314698075e-06, + "loss": 1.3078, + "step": 2523 + }, + { + "epoch": 0.6712765957446809, + "grad_norm": 3.825195074081421, + "learning_rate": 9.807169493379353e-06, + "loss": 1.2117, + "step": 2524 + }, + { + "epoch": 0.6715425531914894, + "grad_norm": 3.376753091812134, + "learning_rate": 9.806927523321148e-06, + "loss": 1.1575, + "step": 2525 + }, + { + "epoch": 0.6718085106382978, + "grad_norm": 3.877986431121826, + "learning_rate": 9.806685404530946e-06, + "loss": 1.3773, + "step": 2526 + }, + { + "epoch": 0.6720744680851064, + "grad_norm": 3.9964683055877686, + "learning_rate": 9.806443137016237e-06, + "loss": 1.2466, + "step": 2527 + }, + { + "epoch": 0.6723404255319149, + "grad_norm": 3.6897804737091064, + "learning_rate": 9.80620072078452e-06, + "loss": 1.2107, + "step": 2528 + }, + { + "epoch": 0.6726063829787234, + "grad_norm": 3.921840190887451, + "learning_rate": 9.805958155843294e-06, + "loss": 1.226, + "step": 2529 + }, + { + "epoch": 0.6728723404255319, + "grad_norm": 3.4277050495147705, + "learning_rate": 9.805715442200065e-06, + "loss": 1.2126, + "step": 2530 + }, + { + "epoch": 0.6731382978723405, + "grad_norm": 3.841946601867676, + "learning_rate": 9.805472579862342e-06, + "loss": 1.323, + "step": 2531 + }, + { + "epoch": 0.6734042553191489, + "grad_norm": 3.7039599418640137, + "learning_rate": 9.805229568837637e-06, + "loss": 1.2843, + "step": 2532 + }, + { + "epoch": 0.6736702127659574, + "grad_norm": 3.5301520824432373, + "learning_rate": 9.804986409133475e-06, + "loss": 1.0612, + "step": 2533 + }, + { + "epoch": 0.673936170212766, + "grad_norm": 4.042654037475586, + "learning_rate": 9.804743100757375e-06, + "loss": 1.215, + "step": 2534 + }, + { + "epoch": 0.6742021276595744, + "grad_norm": 3.895273447036743, + "learning_rate": 9.804499643716866e-06, + "loss": 1.4006, + "step": 2535 + }, + { + "epoch": 0.674468085106383, + "grad_norm": 3.5299017429351807, + "learning_rate": 9.804256038019482e-06, + "loss": 1.3813, + "step": 2536 + }, + { + "epoch": 0.6747340425531915, + "grad_norm": 3.8434762954711914, + "learning_rate": 9.80401228367276e-06, + "loss": 1.4165, + "step": 2537 + }, + { + "epoch": 0.675, + "grad_norm": 4.0280256271362305, + "learning_rate": 9.803768380684242e-06, + "loss": 1.3851, + "step": 2538 + }, + { + "epoch": 0.6752659574468085, + "grad_norm": 3.663043260574341, + "learning_rate": 9.803524329061474e-06, + "loss": 1.3044, + "step": 2539 + }, + { + "epoch": 0.675531914893617, + "grad_norm": 3.575730562210083, + "learning_rate": 9.803280128812009e-06, + "loss": 1.2849, + "step": 2540 + }, + { + "epoch": 0.6757978723404255, + "grad_norm": 3.7937097549438477, + "learning_rate": 9.8030357799434e-06, + "loss": 1.2569, + "step": 2541 + }, + { + "epoch": 0.676063829787234, + "grad_norm": 3.982719898223877, + "learning_rate": 9.80279128246321e-06, + "loss": 1.411, + "step": 2542 + }, + { + "epoch": 0.6763297872340426, + "grad_norm": 3.825068950653076, + "learning_rate": 9.802546636379001e-06, + "loss": 1.295, + "step": 2543 + }, + { + "epoch": 0.676595744680851, + "grad_norm": 3.8499345779418945, + "learning_rate": 9.80230184169835e-06, + "loss": 1.282, + "step": 2544 + }, + { + "epoch": 
0.6768617021276596, + "grad_norm": 3.4873030185699463, + "learning_rate": 9.802056898428823e-06, + "loss": 1.2803, + "step": 2545 + }, + { + "epoch": 0.6771276595744681, + "grad_norm": 3.9438254833221436, + "learning_rate": 9.801811806578001e-06, + "loss": 1.2881, + "step": 2546 + }, + { + "epoch": 0.6773936170212767, + "grad_norm": 3.392169237136841, + "learning_rate": 9.80156656615347e-06, + "loss": 1.2485, + "step": 2547 + }, + { + "epoch": 0.6776595744680851, + "grad_norm": 3.8698456287384033, + "learning_rate": 9.801321177162814e-06, + "loss": 1.281, + "step": 2548 + }, + { + "epoch": 0.6779255319148936, + "grad_norm": 3.8232076168060303, + "learning_rate": 9.801075639613628e-06, + "loss": 1.3045, + "step": 2549 + }, + { + "epoch": 0.6781914893617021, + "grad_norm": 3.8453428745269775, + "learning_rate": 9.80082995351351e-06, + "loss": 1.2239, + "step": 2550 + }, + { + "epoch": 0.6784574468085106, + "grad_norm": 3.7375547885894775, + "learning_rate": 9.800584118870063e-06, + "loss": 1.195, + "step": 2551 + }, + { + "epoch": 0.6787234042553192, + "grad_norm": 3.84708571434021, + "learning_rate": 9.800338135690889e-06, + "loss": 1.1614, + "step": 2552 + }, + { + "epoch": 0.6789893617021276, + "grad_norm": 3.612217664718628, + "learning_rate": 9.800092003983602e-06, + "loss": 1.2499, + "step": 2553 + }, + { + "epoch": 0.6792553191489362, + "grad_norm": 3.217289447784424, + "learning_rate": 9.799845723755818e-06, + "loss": 1.1648, + "step": 2554 + }, + { + "epoch": 0.6795212765957447, + "grad_norm": 4.510238170623779, + "learning_rate": 9.799599295015154e-06, + "loss": 1.2728, + "step": 2555 + }, + { + "epoch": 0.6797872340425531, + "grad_norm": 4.0085129737854, + "learning_rate": 9.79935271776924e-06, + "loss": 1.3524, + "step": 2556 + }, + { + "epoch": 0.6800531914893617, + "grad_norm": 3.8481833934783936, + "learning_rate": 9.799105992025699e-06, + "loss": 1.2783, + "step": 2557 + }, + { + "epoch": 0.6803191489361702, + "grad_norm": 3.901775598526001, + "learning_rate": 9.79885911779217e-06, + "loss": 1.1736, + "step": 2558 + }, + { + "epoch": 0.6805851063829788, + "grad_norm": 3.864826202392578, + "learning_rate": 9.798612095076291e-06, + "loss": 1.3108, + "step": 2559 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 3.7867627143859863, + "learning_rate": 9.798364923885703e-06, + "loss": 1.1626, + "step": 2560 + }, + { + "epoch": 0.6811170212765958, + "grad_norm": 3.8203864097595215, + "learning_rate": 9.798117604228054e-06, + "loss": 1.2232, + "step": 2561 + }, + { + "epoch": 0.6813829787234043, + "grad_norm": 3.5479917526245117, + "learning_rate": 9.797870136110998e-06, + "loss": 1.1571, + "step": 2562 + }, + { + "epoch": 0.6816489361702127, + "grad_norm": 3.782655715942383, + "learning_rate": 9.797622519542193e-06, + "loss": 1.3004, + "step": 2563 + }, + { + "epoch": 0.6819148936170213, + "grad_norm": 3.477875232696533, + "learning_rate": 9.797374754529297e-06, + "loss": 1.0335, + "step": 2564 + }, + { + "epoch": 0.6821808510638298, + "grad_norm": 3.8241772651672363, + "learning_rate": 9.797126841079979e-06, + "loss": 1.4163, + "step": 2565 + }, + { + "epoch": 0.6824468085106383, + "grad_norm": 3.764817476272583, + "learning_rate": 9.796878779201906e-06, + "loss": 1.2243, + "step": 2566 + }, + { + "epoch": 0.6827127659574468, + "grad_norm": 3.784823417663574, + "learning_rate": 9.796630568902758e-06, + "loss": 1.4082, + "step": 2567 + }, + { + "epoch": 0.6829787234042554, + "grad_norm": 3.3941454887390137, + "learning_rate": 9.796382210190212e-06, + "loss": 1.0939, + "step": 
2568 + }, + { + "epoch": 0.6832446808510638, + "grad_norm": 3.484823226928711, + "learning_rate": 9.796133703071956e-06, + "loss": 1.2322, + "step": 2569 + }, + { + "epoch": 0.6835106382978723, + "grad_norm": 3.6055960655212402, + "learning_rate": 9.795885047555673e-06, + "loss": 1.3383, + "step": 2570 + }, + { + "epoch": 0.6837765957446809, + "grad_norm": 3.7031943798065186, + "learning_rate": 9.795636243649061e-06, + "loss": 1.2987, + "step": 2571 + }, + { + "epoch": 0.6840425531914893, + "grad_norm": 3.5490245819091797, + "learning_rate": 9.795387291359819e-06, + "loss": 1.291, + "step": 2572 + }, + { + "epoch": 0.6843085106382979, + "grad_norm": 3.611907958984375, + "learning_rate": 9.795138190695647e-06, + "loss": 1.2693, + "step": 2573 + }, + { + "epoch": 0.6845744680851064, + "grad_norm": 3.580634832382202, + "learning_rate": 9.794888941664253e-06, + "loss": 1.3336, + "step": 2574 + }, + { + "epoch": 0.6848404255319149, + "grad_norm": 3.957103967666626, + "learning_rate": 9.794639544273352e-06, + "loss": 1.2077, + "step": 2575 + }, + { + "epoch": 0.6851063829787234, + "grad_norm": 3.5140933990478516, + "learning_rate": 9.794389998530659e-06, + "loss": 1.2885, + "step": 2576 + }, + { + "epoch": 0.685372340425532, + "grad_norm": 3.6171066761016846, + "learning_rate": 9.794140304443891e-06, + "loss": 1.2211, + "step": 2577 + }, + { + "epoch": 0.6856382978723404, + "grad_norm": 3.641486167907715, + "learning_rate": 9.793890462020781e-06, + "loss": 1.0571, + "step": 2578 + }, + { + "epoch": 0.6859042553191489, + "grad_norm": 3.605208396911621, + "learning_rate": 9.793640471269055e-06, + "loss": 1.1932, + "step": 2579 + }, + { + "epoch": 0.6861702127659575, + "grad_norm": 3.67253041267395, + "learning_rate": 9.793390332196448e-06, + "loss": 1.1474, + "step": 2580 + }, + { + "epoch": 0.6864361702127659, + "grad_norm": 4.190906524658203, + "learning_rate": 9.793140044810701e-06, + "loss": 1.2488, + "step": 2581 + }, + { + "epoch": 0.6867021276595745, + "grad_norm": 4.1439104080200195, + "learning_rate": 9.792889609119558e-06, + "loss": 1.2747, + "step": 2582 + }, + { + "epoch": 0.686968085106383, + "grad_norm": 3.9002907276153564, + "learning_rate": 9.79263902513077e-06, + "loss": 1.2291, + "step": 2583 + }, + { + "epoch": 0.6872340425531915, + "grad_norm": 3.6862435340881348, + "learning_rate": 9.792388292852084e-06, + "loss": 1.1637, + "step": 2584 + }, + { + "epoch": 0.6875, + "grad_norm": 3.789638042449951, + "learning_rate": 9.792137412291265e-06, + "loss": 1.1779, + "step": 2585 + }, + { + "epoch": 0.6877659574468085, + "grad_norm": 3.5384011268615723, + "learning_rate": 9.791886383456071e-06, + "loss": 1.2701, + "step": 2586 + }, + { + "epoch": 0.688031914893617, + "grad_norm": 3.6008050441741943, + "learning_rate": 9.79163520635427e-06, + "loss": 1.2479, + "step": 2587 + }, + { + "epoch": 0.6882978723404255, + "grad_norm": 3.71974515914917, + "learning_rate": 9.791383880993635e-06, + "loss": 1.267, + "step": 2588 + }, + { + "epoch": 0.6885638297872341, + "grad_norm": 3.5324504375457764, + "learning_rate": 9.791132407381942e-06, + "loss": 1.2725, + "step": 2589 + }, + { + "epoch": 0.6888297872340425, + "grad_norm": 3.602149724960327, + "learning_rate": 9.790880785526971e-06, + "loss": 1.1551, + "step": 2590 + }, + { + "epoch": 0.6890957446808511, + "grad_norm": 3.761108160018921, + "learning_rate": 9.790629015436508e-06, + "loss": 1.2654, + "step": 2591 + }, + { + "epoch": 0.6893617021276596, + "grad_norm": 3.6845576763153076, + "learning_rate": 9.790377097118342e-06, + "loss": 
1.1352, + "step": 2592 + }, + { + "epoch": 0.689627659574468, + "grad_norm": 3.4206063747406006, + "learning_rate": 9.79012503058027e-06, + "loss": 1.1649, + "step": 2593 + }, + { + "epoch": 0.6898936170212766, + "grad_norm": 3.91064190864563, + "learning_rate": 9.789872815830089e-06, + "loss": 1.2736, + "step": 2594 + }, + { + "epoch": 0.6901595744680851, + "grad_norm": 3.3683114051818848, + "learning_rate": 9.789620452875605e-06, + "loss": 1.1734, + "step": 2595 + }, + { + "epoch": 0.6904255319148936, + "grad_norm": 3.797476053237915, + "learning_rate": 9.789367941724623e-06, + "loss": 1.239, + "step": 2596 + }, + { + "epoch": 0.6906914893617021, + "grad_norm": 3.623358964920044, + "learning_rate": 9.78911528238496e-06, + "loss": 1.2941, + "step": 2597 + }, + { + "epoch": 0.6909574468085107, + "grad_norm": 4.187454700469971, + "learning_rate": 9.78886247486443e-06, + "loss": 1.3176, + "step": 2598 + }, + { + "epoch": 0.6912234042553191, + "grad_norm": 4.131342887878418, + "learning_rate": 9.78860951917086e-06, + "loss": 1.3183, + "step": 2599 + }, + { + "epoch": 0.6914893617021277, + "grad_norm": 3.6273796558380127, + "learning_rate": 9.78835641531207e-06, + "loss": 1.1836, + "step": 2600 + }, + { + "epoch": 0.6917553191489362, + "grad_norm": 3.8663980960845947, + "learning_rate": 9.788103163295897e-06, + "loss": 1.4566, + "step": 2601 + }, + { + "epoch": 0.6920212765957446, + "grad_norm": 3.8288991451263428, + "learning_rate": 9.787849763130174e-06, + "loss": 1.2238, + "step": 2602 + }, + { + "epoch": 0.6922872340425532, + "grad_norm": 4.178062438964844, + "learning_rate": 9.787596214822743e-06, + "loss": 1.399, + "step": 2603 + }, + { + "epoch": 0.6925531914893617, + "grad_norm": 3.824878215789795, + "learning_rate": 9.787342518381447e-06, + "loss": 1.2654, + "step": 2604 + }, + { + "epoch": 0.6928191489361702, + "grad_norm": 3.742422103881836, + "learning_rate": 9.787088673814137e-06, + "loss": 1.3921, + "step": 2605 + }, + { + "epoch": 0.6930851063829787, + "grad_norm": 4.080827713012695, + "learning_rate": 9.78683468112867e-06, + "loss": 1.2525, + "step": 2606 + }, + { + "epoch": 0.6933510638297873, + "grad_norm": 3.393066883087158, + "learning_rate": 9.7865805403329e-06, + "loss": 1.0471, + "step": 2607 + }, + { + "epoch": 0.6936170212765957, + "grad_norm": 3.3034181594848633, + "learning_rate": 9.786326251434694e-06, + "loss": 1.1627, + "step": 2608 + }, + { + "epoch": 0.6938829787234042, + "grad_norm": 3.8288989067077637, + "learning_rate": 9.786071814441918e-06, + "loss": 1.2483, + "step": 2609 + }, + { + "epoch": 0.6941489361702128, + "grad_norm": 3.4944722652435303, + "learning_rate": 9.785817229362445e-06, + "loss": 1.2921, + "step": 2610 + }, + { + "epoch": 0.6944148936170212, + "grad_norm": 3.653322219848633, + "learning_rate": 9.785562496204151e-06, + "loss": 1.2367, + "step": 2611 + }, + { + "epoch": 0.6946808510638298, + "grad_norm": 3.3792853355407715, + "learning_rate": 9.785307614974922e-06, + "loss": 1.1746, + "step": 2612 + }, + { + "epoch": 0.6949468085106383, + "grad_norm": 3.608031988143921, + "learning_rate": 9.78505258568264e-06, + "loss": 1.2059, + "step": 2613 + }, + { + "epoch": 0.6952127659574469, + "grad_norm": 4.2280402183532715, + "learning_rate": 9.784797408335195e-06, + "loss": 1.294, + "step": 2614 + }, + { + "epoch": 0.6954787234042553, + "grad_norm": 3.8257791996002197, + "learning_rate": 9.784542082940488e-06, + "loss": 1.3261, + "step": 2615 + }, + { + "epoch": 0.6957446808510638, + "grad_norm": 3.9494855403900146, + "learning_rate": 
9.784286609506415e-06, + "loss": 1.3776, + "step": 2616 + }, + { + "epoch": 0.6960106382978724, + "grad_norm": 3.8635013103485107, + "learning_rate": 9.78403098804088e-06, + "loss": 1.3371, + "step": 2617 + }, + { + "epoch": 0.6962765957446808, + "grad_norm": 3.8114707469940186, + "learning_rate": 9.783775218551796e-06, + "loss": 1.3064, + "step": 2618 + }, + { + "epoch": 0.6965425531914894, + "grad_norm": 3.8006489276885986, + "learning_rate": 9.783519301047072e-06, + "loss": 1.3864, + "step": 2619 + }, + { + "epoch": 0.6968085106382979, + "grad_norm": 3.504070997238159, + "learning_rate": 9.783263235534632e-06, + "loss": 1.2172, + "step": 2620 + }, + { + "epoch": 0.6970744680851064, + "grad_norm": 3.741771936416626, + "learning_rate": 9.783007022022394e-06, + "loss": 1.2375, + "step": 2621 + }, + { + "epoch": 0.6973404255319149, + "grad_norm": 3.5260889530181885, + "learning_rate": 9.782750660518288e-06, + "loss": 1.4035, + "step": 2622 + }, + { + "epoch": 0.6976063829787233, + "grad_norm": 3.832963466644287, + "learning_rate": 9.782494151030245e-06, + "loss": 1.2979, + "step": 2623 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 3.5783939361572266, + "learning_rate": 9.782237493566202e-06, + "loss": 1.1859, + "step": 2624 + }, + { + "epoch": 0.6981382978723404, + "grad_norm": 3.677419900894165, + "learning_rate": 9.781980688134102e-06, + "loss": 1.2306, + "step": 2625 + }, + { + "epoch": 0.698404255319149, + "grad_norm": 3.812321901321411, + "learning_rate": 9.781723734741889e-06, + "loss": 1.3585, + "step": 2626 + }, + { + "epoch": 0.6986702127659574, + "grad_norm": 3.3270645141601562, + "learning_rate": 9.781466633397512e-06, + "loss": 1.0776, + "step": 2627 + }, + { + "epoch": 0.698936170212766, + "grad_norm": 3.6559667587280273, + "learning_rate": 9.78120938410893e-06, + "loss": 1.3296, + "step": 2628 + }, + { + "epoch": 0.6992021276595745, + "grad_norm": 3.707422971725464, + "learning_rate": 9.7809519868841e-06, + "loss": 1.2396, + "step": 2629 + }, + { + "epoch": 0.699468085106383, + "grad_norm": 3.875147581100464, + "learning_rate": 9.780694441730987e-06, + "loss": 1.4079, + "step": 2630 + }, + { + "epoch": 0.6997340425531915, + "grad_norm": 4.308002471923828, + "learning_rate": 9.780436748657559e-06, + "loss": 1.3675, + "step": 2631 + }, + { + "epoch": 0.7, + "grad_norm": 3.6063718795776367, + "learning_rate": 9.780178907671788e-06, + "loss": 1.1953, + "step": 2632 + }, + { + "epoch": 0.7002659574468085, + "grad_norm": 3.582390308380127, + "learning_rate": 9.779920918781656e-06, + "loss": 1.2841, + "step": 2633 + }, + { + "epoch": 0.700531914893617, + "grad_norm": 3.8668954372406006, + "learning_rate": 9.779662781995144e-06, + "loss": 1.3806, + "step": 2634 + }, + { + "epoch": 0.7007978723404256, + "grad_norm": 3.4479143619537354, + "learning_rate": 9.779404497320236e-06, + "loss": 1.3201, + "step": 2635 + }, + { + "epoch": 0.701063829787234, + "grad_norm": 4.041039943695068, + "learning_rate": 9.779146064764925e-06, + "loss": 1.1912, + "step": 2636 + }, + { + "epoch": 0.7013297872340426, + "grad_norm": 3.944117307662964, + "learning_rate": 9.77888748433721e-06, + "loss": 1.1603, + "step": 2637 + }, + { + "epoch": 0.7015957446808511, + "grad_norm": 4.008464336395264, + "learning_rate": 9.77862875604509e-06, + "loss": 1.3612, + "step": 2638 + }, + { + "epoch": 0.7018617021276595, + "grad_norm": 3.5746493339538574, + "learning_rate": 9.778369879896568e-06, + "loss": 1.3117, + "step": 2639 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 4.120686054229736, + 
"learning_rate": 9.778110855899659e-06, + "loss": 1.2801, + "step": 2640 + }, + { + "epoch": 0.7023936170212766, + "grad_norm": 3.7582547664642334, + "learning_rate": 9.777851684062371e-06, + "loss": 1.291, + "step": 2641 + }, + { + "epoch": 0.7026595744680851, + "grad_norm": 3.8033053874969482, + "learning_rate": 9.77759236439273e-06, + "loss": 1.3342, + "step": 2642 + }, + { + "epoch": 0.7029255319148936, + "grad_norm": 3.712113618850708, + "learning_rate": 9.777332896898754e-06, + "loss": 1.1921, + "step": 2643 + }, + { + "epoch": 0.7031914893617022, + "grad_norm": 3.1552655696868896, + "learning_rate": 9.777073281588476e-06, + "loss": 1.1407, + "step": 2644 + }, + { + "epoch": 0.7034574468085106, + "grad_norm": 4.050416946411133, + "learning_rate": 9.776813518469924e-06, + "loss": 1.3787, + "step": 2645 + }, + { + "epoch": 0.7037234042553191, + "grad_norm": 3.63802170753479, + "learning_rate": 9.77655360755114e-06, + "loss": 1.3203, + "step": 2646 + }, + { + "epoch": 0.7039893617021277, + "grad_norm": 4.1890482902526855, + "learning_rate": 9.77629354884016e-06, + "loss": 1.3532, + "step": 2647 + }, + { + "epoch": 0.7042553191489361, + "grad_norm": 4.1286444664001465, + "learning_rate": 9.776033342345038e-06, + "loss": 1.2704, + "step": 2648 + }, + { + "epoch": 0.7045212765957447, + "grad_norm": 3.4052047729492188, + "learning_rate": 9.77577298807382e-06, + "loss": 1.2537, + "step": 2649 + }, + { + "epoch": 0.7047872340425532, + "grad_norm": 4.194342136383057, + "learning_rate": 9.775512486034564e-06, + "loss": 1.449, + "step": 2650 + }, + { + "epoch": 0.7050531914893617, + "grad_norm": 3.945206880569458, + "learning_rate": 9.775251836235327e-06, + "loss": 1.357, + "step": 2651 + }, + { + "epoch": 0.7053191489361702, + "grad_norm": 3.5744996070861816, + "learning_rate": 9.774991038684177e-06, + "loss": 1.2701, + "step": 2652 + }, + { + "epoch": 0.7055851063829788, + "grad_norm": 3.9091970920562744, + "learning_rate": 9.774730093389182e-06, + "loss": 1.3401, + "step": 2653 + }, + { + "epoch": 0.7058510638297872, + "grad_norm": 3.7527072429656982, + "learning_rate": 9.774469000358418e-06, + "loss": 1.2886, + "step": 2654 + }, + { + "epoch": 0.7061170212765957, + "grad_norm": 3.5021281242370605, + "learning_rate": 9.774207759599961e-06, + "loss": 1.2253, + "step": 2655 + }, + { + "epoch": 0.7063829787234043, + "grad_norm": 3.725334405899048, + "learning_rate": 9.773946371121894e-06, + "loss": 1.3451, + "step": 2656 + }, + { + "epoch": 0.7066489361702127, + "grad_norm": 3.3787760734558105, + "learning_rate": 9.773684834932306e-06, + "loss": 1.183, + "step": 2657 + }, + { + "epoch": 0.7069148936170213, + "grad_norm": 3.956935167312622, + "learning_rate": 9.77342315103929e-06, + "loss": 1.3828, + "step": 2658 + }, + { + "epoch": 0.7071808510638298, + "grad_norm": 3.7493388652801514, + "learning_rate": 9.77316131945094e-06, + "loss": 1.2192, + "step": 2659 + }, + { + "epoch": 0.7074468085106383, + "grad_norm": 4.022577285766602, + "learning_rate": 9.772899340175362e-06, + "loss": 1.2509, + "step": 2660 + }, + { + "epoch": 0.7077127659574468, + "grad_norm": 3.9888761043548584, + "learning_rate": 9.772637213220658e-06, + "loss": 1.3076, + "step": 2661 + }, + { + "epoch": 0.7079787234042553, + "grad_norm": 3.502845048904419, + "learning_rate": 9.772374938594937e-06, + "loss": 1.4205, + "step": 2662 + }, + { + "epoch": 0.7082446808510638, + "grad_norm": 3.611692190170288, + "learning_rate": 9.772112516306318e-06, + "loss": 1.2036, + "step": 2663 + }, + { + "epoch": 0.7085106382978723, + 
"grad_norm": 3.3075003623962402, + "learning_rate": 9.77184994636292e-06, + "loss": 1.1399, + "step": 2664 + }, + { + "epoch": 0.7087765957446809, + "grad_norm": 3.6357240676879883, + "learning_rate": 9.771587228772866e-06, + "loss": 1.2438, + "step": 2665 + }, + { + "epoch": 0.7090425531914893, + "grad_norm": 3.798506259918213, + "learning_rate": 9.771324363544286e-06, + "loss": 1.2793, + "step": 2666 + }, + { + "epoch": 0.7093085106382979, + "grad_norm": 3.3980555534362793, + "learning_rate": 9.771061350685312e-06, + "loss": 1.2446, + "step": 2667 + }, + { + "epoch": 0.7095744680851064, + "grad_norm": 3.5380852222442627, + "learning_rate": 9.770798190204083e-06, + "loss": 1.1996, + "step": 2668 + }, + { + "epoch": 0.7098404255319148, + "grad_norm": 3.93696665763855, + "learning_rate": 9.77053488210874e-06, + "loss": 1.2549, + "step": 2669 + }, + { + "epoch": 0.7101063829787234, + "grad_norm": 4.042500019073486, + "learning_rate": 9.770271426407432e-06, + "loss": 1.455, + "step": 2670 + }, + { + "epoch": 0.7103723404255319, + "grad_norm": 3.6526906490325928, + "learning_rate": 9.770007823108309e-06, + "loss": 1.3447, + "step": 2671 + }, + { + "epoch": 0.7106382978723405, + "grad_norm": 3.8958542346954346, + "learning_rate": 9.76974407221953e-06, + "loss": 1.2542, + "step": 2672 + }, + { + "epoch": 0.7109042553191489, + "grad_norm": 3.5408430099487305, + "learning_rate": 9.769480173749252e-06, + "loss": 1.3333, + "step": 2673 + }, + { + "epoch": 0.7111702127659575, + "grad_norm": 3.586918592453003, + "learning_rate": 9.769216127705643e-06, + "loss": 1.2469, + "step": 2674 + }, + { + "epoch": 0.711436170212766, + "grad_norm": 3.6321678161621094, + "learning_rate": 9.76895193409687e-06, + "loss": 1.3352, + "step": 2675 + }, + { + "epoch": 0.7117021276595744, + "grad_norm": 3.4352383613586426, + "learning_rate": 9.768687592931111e-06, + "loss": 1.228, + "step": 2676 + }, + { + "epoch": 0.711968085106383, + "grad_norm": 3.756770610809326, + "learning_rate": 9.768423104216544e-06, + "loss": 1.1776, + "step": 2677 + }, + { + "epoch": 0.7122340425531914, + "grad_norm": 4.270863056182861, + "learning_rate": 9.76815846796135e-06, + "loss": 1.2372, + "step": 2678 + }, + { + "epoch": 0.7125, + "grad_norm": 4.0467848777771, + "learning_rate": 9.767893684173722e-06, + "loss": 1.33, + "step": 2679 + }, + { + "epoch": 0.7127659574468085, + "grad_norm": 3.9330484867095947, + "learning_rate": 9.767628752861848e-06, + "loss": 1.2019, + "step": 2680 + }, + { + "epoch": 0.7130319148936171, + "grad_norm": 4.011680603027344, + "learning_rate": 9.767363674033928e-06, + "loss": 1.1982, + "step": 2681 + }, + { + "epoch": 0.7132978723404255, + "grad_norm": 3.5905420780181885, + "learning_rate": 9.767098447698163e-06, + "loss": 1.2441, + "step": 2682 + }, + { + "epoch": 0.7135638297872341, + "grad_norm": 3.8876521587371826, + "learning_rate": 9.766833073862758e-06, + "loss": 1.3112, + "step": 2683 + }, + { + "epoch": 0.7138297872340426, + "grad_norm": 3.6759207248687744, + "learning_rate": 9.766567552535928e-06, + "loss": 1.2974, + "step": 2684 + }, + { + "epoch": 0.714095744680851, + "grad_norm": 3.6160476207733154, + "learning_rate": 9.766301883725884e-06, + "loss": 1.3107, + "step": 2685 + }, + { + "epoch": 0.7143617021276596, + "grad_norm": 3.9795331954956055, + "learning_rate": 9.766036067440849e-06, + "loss": 1.4063, + "step": 2686 + }, + { + "epoch": 0.714627659574468, + "grad_norm": 3.899998188018799, + "learning_rate": 9.765770103689045e-06, + "loss": 1.3517, + "step": 2687 + }, + { + "epoch": 
0.7148936170212766, + "grad_norm": 3.501302719116211, + "learning_rate": 9.765503992478704e-06, + "loss": 1.078, + "step": 2688 + }, + { + "epoch": 0.7151595744680851, + "grad_norm": 3.4490084648132324, + "learning_rate": 9.76523773381806e-06, + "loss": 1.2363, + "step": 2689 + }, + { + "epoch": 0.7154255319148937, + "grad_norm": 3.773393154144287, + "learning_rate": 9.76497132771535e-06, + "loss": 1.2677, + "step": 2690 + }, + { + "epoch": 0.7156914893617021, + "grad_norm": 3.2833402156829834, + "learning_rate": 9.764704774178816e-06, + "loss": 1.2409, + "step": 2691 + }, + { + "epoch": 0.7159574468085106, + "grad_norm": 3.798407793045044, + "learning_rate": 9.764438073216706e-06, + "loss": 1.2375, + "step": 2692 + }, + { + "epoch": 0.7162234042553192, + "grad_norm": 3.383553981781006, + "learning_rate": 9.764171224837274e-06, + "loss": 1.223, + "step": 2693 + }, + { + "epoch": 0.7164893617021276, + "grad_norm": 3.781569242477417, + "learning_rate": 9.763904229048775e-06, + "loss": 1.1822, + "step": 2694 + }, + { + "epoch": 0.7167553191489362, + "grad_norm": 3.862577438354492, + "learning_rate": 9.76363708585947e-06, + "loss": 1.2266, + "step": 2695 + }, + { + "epoch": 0.7170212765957447, + "grad_norm": 3.4044363498687744, + "learning_rate": 9.763369795277627e-06, + "loss": 1.1887, + "step": 2696 + }, + { + "epoch": 0.7172872340425532, + "grad_norm": 3.930368185043335, + "learning_rate": 9.763102357311511e-06, + "loss": 1.2911, + "step": 2697 + }, + { + "epoch": 0.7175531914893617, + "grad_norm": 3.72084379196167, + "learning_rate": 9.762834771969403e-06, + "loss": 1.2693, + "step": 2698 + }, + { + "epoch": 0.7178191489361702, + "grad_norm": 3.3735997676849365, + "learning_rate": 9.762567039259577e-06, + "loss": 1.2202, + "step": 2699 + }, + { + "epoch": 0.7180851063829787, + "grad_norm": 3.3215930461883545, + "learning_rate": 9.762299159190322e-06, + "loss": 1.311, + "step": 2700 + }, + { + "epoch": 0.7183510638297872, + "grad_norm": 3.2667737007141113, + "learning_rate": 9.762031131769923e-06, + "loss": 1.1621, + "step": 2701 + }, + { + "epoch": 0.7186170212765958, + "grad_norm": 3.8327572345733643, + "learning_rate": 9.761762957006673e-06, + "loss": 1.2764, + "step": 2702 + }, + { + "epoch": 0.7188829787234042, + "grad_norm": 3.693328857421875, + "learning_rate": 9.761494634908872e-06, + "loss": 1.168, + "step": 2703 + }, + { + "epoch": 0.7191489361702128, + "grad_norm": 3.7882509231567383, + "learning_rate": 9.761226165484822e-06, + "loss": 1.3076, + "step": 2704 + }, + { + "epoch": 0.7194148936170213, + "grad_norm": 3.366978645324707, + "learning_rate": 9.760957548742828e-06, + "loss": 1.3628, + "step": 2705 + }, + { + "epoch": 0.7196808510638298, + "grad_norm": 3.4671497344970703, + "learning_rate": 9.7606887846912e-06, + "loss": 1.2197, + "step": 2706 + }, + { + "epoch": 0.7199468085106383, + "grad_norm": 4.486639022827148, + "learning_rate": 9.760419873338261e-06, + "loss": 1.1786, + "step": 2707 + }, + { + "epoch": 0.7202127659574468, + "grad_norm": 3.5285980701446533, + "learning_rate": 9.760150814692321e-06, + "loss": 1.0701, + "step": 2708 + }, + { + "epoch": 0.7204787234042553, + "grad_norm": 3.4500350952148438, + "learning_rate": 9.759881608761714e-06, + "loss": 1.1768, + "step": 2709 + }, + { + "epoch": 0.7207446808510638, + "grad_norm": 3.219653606414795, + "learning_rate": 9.759612255554765e-06, + "loss": 1.1413, + "step": 2710 + }, + { + "epoch": 0.7210106382978724, + "grad_norm": 3.7905290126800537, + "learning_rate": 9.75934275507981e-06, + "loss": 1.3632, + "step": 
2711 + }, + { + "epoch": 0.7212765957446808, + "grad_norm": 3.765892744064331, + "learning_rate": 9.759073107345186e-06, + "loss": 1.3237, + "step": 2712 + }, + { + "epoch": 0.7215425531914894, + "grad_norm": 3.8589115142822266, + "learning_rate": 9.758803312359236e-06, + "loss": 1.3028, + "step": 2713 + }, + { + "epoch": 0.7218085106382979, + "grad_norm": 3.688624143600464, + "learning_rate": 9.758533370130308e-06, + "loss": 1.2325, + "step": 2714 + }, + { + "epoch": 0.7220744680851063, + "grad_norm": 3.397474765777588, + "learning_rate": 9.758263280666757e-06, + "loss": 1.3173, + "step": 2715 + }, + { + "epoch": 0.7223404255319149, + "grad_norm": 3.9396157264709473, + "learning_rate": 9.757993043976937e-06, + "loss": 1.4517, + "step": 2716 + }, + { + "epoch": 0.7226063829787234, + "grad_norm": 3.5887930393218994, + "learning_rate": 9.757722660069211e-06, + "loss": 1.1431, + "step": 2717 + }, + { + "epoch": 0.722872340425532, + "grad_norm": 3.520183563232422, + "learning_rate": 9.757452128951945e-06, + "loss": 1.3442, + "step": 2718 + }, + { + "epoch": 0.7231382978723404, + "grad_norm": 3.704939365386963, + "learning_rate": 9.757181450633507e-06, + "loss": 1.2257, + "step": 2719 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 4.201409816741943, + "learning_rate": 9.756910625122276e-06, + "loss": 1.234, + "step": 2720 + }, + { + "epoch": 0.7236702127659574, + "grad_norm": 3.571162700653076, + "learning_rate": 9.756639652426627e-06, + "loss": 1.195, + "step": 2721 + }, + { + "epoch": 0.7239361702127659, + "grad_norm": 3.463414192199707, + "learning_rate": 9.75636853255495e-06, + "loss": 1.2494, + "step": 2722 + }, + { + "epoch": 0.7242021276595745, + "grad_norm": 3.4496824741363525, + "learning_rate": 9.75609726551563e-06, + "loss": 1.1707, + "step": 2723 + }, + { + "epoch": 0.7244680851063829, + "grad_norm": 3.9885363578796387, + "learning_rate": 9.75582585131706e-06, + "loss": 1.2613, + "step": 2724 + }, + { + "epoch": 0.7247340425531915, + "grad_norm": 4.085259437561035, + "learning_rate": 9.755554289967638e-06, + "loss": 1.2527, + "step": 2725 + }, + { + "epoch": 0.725, + "grad_norm": 4.417264938354492, + "learning_rate": 9.755282581475769e-06, + "loss": 1.466, + "step": 2726 + }, + { + "epoch": 0.7252659574468086, + "grad_norm": 3.954056739807129, + "learning_rate": 9.755010725849857e-06, + "loss": 1.2379, + "step": 2727 + }, + { + "epoch": 0.725531914893617, + "grad_norm": 3.838103771209717, + "learning_rate": 9.754738723098316e-06, + "loss": 1.1999, + "step": 2728 + }, + { + "epoch": 0.7257978723404256, + "grad_norm": 4.1355695724487305, + "learning_rate": 9.75446657322956e-06, + "loss": 1.2805, + "step": 2729 + }, + { + "epoch": 0.726063829787234, + "grad_norm": 4.266016483306885, + "learning_rate": 9.75419427625201e-06, + "loss": 1.274, + "step": 2730 + }, + { + "epoch": 0.7263297872340425, + "grad_norm": 3.8930816650390625, + "learning_rate": 9.753921832174094e-06, + "loss": 1.3094, + "step": 2731 + }, + { + "epoch": 0.7265957446808511, + "grad_norm": 3.7425036430358887, + "learning_rate": 9.753649241004238e-06, + "loss": 1.2826, + "step": 2732 + }, + { + "epoch": 0.7268617021276595, + "grad_norm": 4.708345890045166, + "learning_rate": 9.753376502750878e-06, + "loss": 1.4243, + "step": 2733 + }, + { + "epoch": 0.7271276595744681, + "grad_norm": 3.6511597633361816, + "learning_rate": 9.753103617422452e-06, + "loss": 1.1892, + "step": 2734 + }, + { + "epoch": 0.7273936170212766, + "grad_norm": 3.807124376296997, + "learning_rate": 9.752830585027406e-06, + "loss": 1.2767, + 
"step": 2735 + }, + { + "epoch": 0.7276595744680852, + "grad_norm": 3.596545457839966, + "learning_rate": 9.752557405574184e-06, + "loss": 1.1901, + "step": 2736 + }, + { + "epoch": 0.7279255319148936, + "grad_norm": 3.6757147312164307, + "learning_rate": 9.752284079071242e-06, + "loss": 1.4032, + "step": 2737 + }, + { + "epoch": 0.7281914893617021, + "grad_norm": 3.862985372543335, + "learning_rate": 9.752010605527033e-06, + "loss": 1.1524, + "step": 2738 + }, + { + "epoch": 0.7284574468085107, + "grad_norm": 3.685128927230835, + "learning_rate": 9.751736984950023e-06, + "loss": 1.1703, + "step": 2739 + }, + { + "epoch": 0.7287234042553191, + "grad_norm": 3.4319050312042236, + "learning_rate": 9.751463217348675e-06, + "loss": 1.1965, + "step": 2740 + }, + { + "epoch": 0.7289893617021277, + "grad_norm": 3.4726648330688477, + "learning_rate": 9.751189302731463e-06, + "loss": 1.24, + "step": 2741 + }, + { + "epoch": 0.7292553191489362, + "grad_norm": 3.4759905338287354, + "learning_rate": 9.750915241106857e-06, + "loss": 1.1663, + "step": 2742 + }, + { + "epoch": 0.7295212765957447, + "grad_norm": 3.5179250240325928, + "learning_rate": 9.750641032483344e-06, + "loss": 1.1964, + "step": 2743 + }, + { + "epoch": 0.7297872340425532, + "grad_norm": 3.397850751876831, + "learning_rate": 9.750366676869401e-06, + "loss": 1.159, + "step": 2744 + }, + { + "epoch": 0.7300531914893617, + "grad_norm": 3.505492687225342, + "learning_rate": 9.75009217427352e-06, + "loss": 1.4271, + "step": 2745 + }, + { + "epoch": 0.7303191489361702, + "grad_norm": 3.516559362411499, + "learning_rate": 9.749817524704198e-06, + "loss": 1.2119, + "step": 2746 + }, + { + "epoch": 0.7305851063829787, + "grad_norm": 3.5949020385742188, + "learning_rate": 9.749542728169925e-06, + "loss": 1.1291, + "step": 2747 + }, + { + "epoch": 0.7308510638297873, + "grad_norm": 3.3480985164642334, + "learning_rate": 9.749267784679211e-06, + "loss": 1.1421, + "step": 2748 + }, + { + "epoch": 0.7311170212765957, + "grad_norm": 3.4003922939300537, + "learning_rate": 9.74899269424056e-06, + "loss": 1.3106, + "step": 2749 + }, + { + "epoch": 0.7313829787234043, + "grad_norm": 3.5191762447357178, + "learning_rate": 9.748717456862484e-06, + "loss": 1.1878, + "step": 2750 + }, + { + "epoch": 0.7316489361702128, + "grad_norm": 3.5664145946502686, + "learning_rate": 9.748442072553496e-06, + "loss": 1.2272, + "step": 2751 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 3.928241491317749, + "learning_rate": 9.748166541322124e-06, + "loss": 1.2986, + "step": 2752 + }, + { + "epoch": 0.7321808510638298, + "grad_norm": 3.8403828144073486, + "learning_rate": 9.747890863176887e-06, + "loss": 1.3132, + "step": 2753 + }, + { + "epoch": 0.7324468085106383, + "grad_norm": 3.4996137619018555, + "learning_rate": 9.747615038126317e-06, + "loss": 1.3824, + "step": 2754 + }, + { + "epoch": 0.7327127659574468, + "grad_norm": 3.5281126499176025, + "learning_rate": 9.747339066178947e-06, + "loss": 1.3015, + "step": 2755 + }, + { + "epoch": 0.7329787234042553, + "grad_norm": 3.466567277908325, + "learning_rate": 9.747062947343318e-06, + "loss": 1.2638, + "step": 2756 + }, + { + "epoch": 0.7332446808510639, + "grad_norm": 3.8412346839904785, + "learning_rate": 9.746786681627971e-06, + "loss": 1.1944, + "step": 2757 + }, + { + "epoch": 0.7335106382978723, + "grad_norm": 3.3403968811035156, + "learning_rate": 9.746510269041459e-06, + "loss": 1.215, + "step": 2758 + }, + { + "epoch": 0.7337765957446809, + "grad_norm": 3.735173225402832, + "learning_rate": 
9.746233709592328e-06, + "loss": 1.393, + "step": 2759 + }, + { + "epoch": 0.7340425531914894, + "grad_norm": 4.095008373260498, + "learning_rate": 9.745957003289138e-06, + "loss": 1.2848, + "step": 2760 + }, + { + "epoch": 0.7343085106382978, + "grad_norm": 3.8568758964538574, + "learning_rate": 9.745680150140452e-06, + "loss": 1.3195, + "step": 2761 + }, + { + "epoch": 0.7345744680851064, + "grad_norm": 3.512941360473633, + "learning_rate": 9.745403150154833e-06, + "loss": 1.0682, + "step": 2762 + }, + { + "epoch": 0.7348404255319149, + "grad_norm": 4.007373332977295, + "learning_rate": 9.745126003340854e-06, + "loss": 1.2665, + "step": 2763 + }, + { + "epoch": 0.7351063829787234, + "grad_norm": 3.8637166023254395, + "learning_rate": 9.74484870970709e-06, + "loss": 1.4367, + "step": 2764 + }, + { + "epoch": 0.7353723404255319, + "grad_norm": 3.6544454097747803, + "learning_rate": 9.744571269262122e-06, + "loss": 1.157, + "step": 2765 + }, + { + "epoch": 0.7356382978723405, + "grad_norm": 3.5814568996429443, + "learning_rate": 9.744293682014532e-06, + "loss": 1.2989, + "step": 2766 + }, + { + "epoch": 0.7359042553191489, + "grad_norm": 3.59860897064209, + "learning_rate": 9.74401594797291e-06, + "loss": 1.1852, + "step": 2767 + }, + { + "epoch": 0.7361702127659574, + "grad_norm": 3.694519519805908, + "learning_rate": 9.743738067145849e-06, + "loss": 1.3947, + "step": 2768 + }, + { + "epoch": 0.736436170212766, + "grad_norm": 3.570734977722168, + "learning_rate": 9.743460039541947e-06, + "loss": 1.3176, + "step": 2769 + }, + { + "epoch": 0.7367021276595744, + "grad_norm": 3.448857545852661, + "learning_rate": 9.743181865169806e-06, + "loss": 1.2162, + "step": 2770 + }, + { + "epoch": 0.736968085106383, + "grad_norm": 3.7955188751220703, + "learning_rate": 9.742903544038033e-06, + "loss": 1.2489, + "step": 2771 + }, + { + "epoch": 0.7372340425531915, + "grad_norm": 3.520260810852051, + "learning_rate": 9.742625076155244e-06, + "loss": 1.2545, + "step": 2772 + }, + { + "epoch": 0.7375, + "grad_norm": 3.3301799297332764, + "learning_rate": 9.742346461530048e-06, + "loss": 1.0909, + "step": 2773 + }, + { + "epoch": 0.7377659574468085, + "grad_norm": 3.57509708404541, + "learning_rate": 9.742067700171069e-06, + "loss": 1.2049, + "step": 2774 + }, + { + "epoch": 0.738031914893617, + "grad_norm": 3.4712679386138916, + "learning_rate": 9.741788792086934e-06, + "loss": 1.1797, + "step": 2775 + }, + { + "epoch": 0.7382978723404255, + "grad_norm": 3.4553110599517822, + "learning_rate": 9.74150973728627e-06, + "loss": 1.1082, + "step": 2776 + }, + { + "epoch": 0.738563829787234, + "grad_norm": 3.6550087928771973, + "learning_rate": 9.741230535777712e-06, + "loss": 1.281, + "step": 2777 + }, + { + "epoch": 0.7388297872340426, + "grad_norm": 3.3699588775634766, + "learning_rate": 9.7409511875699e-06, + "loss": 1.2331, + "step": 2778 + }, + { + "epoch": 0.739095744680851, + "grad_norm": 3.393129825592041, + "learning_rate": 9.740671692671478e-06, + "loss": 1.1614, + "step": 2779 + }, + { + "epoch": 0.7393617021276596, + "grad_norm": 3.888546943664551, + "learning_rate": 9.74039205109109e-06, + "loss": 1.3773, + "step": 2780 + }, + { + "epoch": 0.7396276595744681, + "grad_norm": 3.5572216510772705, + "learning_rate": 9.740112262837391e-06, + "loss": 1.2269, + "step": 2781 + }, + { + "epoch": 0.7398936170212767, + "grad_norm": 3.7788665294647217, + "learning_rate": 9.73983232791904e-06, + "loss": 1.2385, + "step": 2782 + }, + { + "epoch": 0.7401595744680851, + "grad_norm": 4.092897891998291, + 
"learning_rate": 9.739552246344692e-06, + "loss": 1.3396, + "step": 2783 + }, + { + "epoch": 0.7404255319148936, + "grad_norm": 3.679199457168579, + "learning_rate": 9.73927201812302e-06, + "loss": 1.2957, + "step": 2784 + }, + { + "epoch": 0.7406914893617021, + "grad_norm": 3.590893030166626, + "learning_rate": 9.738991643262693e-06, + "loss": 1.3364, + "step": 2785 + }, + { + "epoch": 0.7409574468085106, + "grad_norm": 3.5082991123199463, + "learning_rate": 9.738711121772384e-06, + "loss": 1.1921, + "step": 2786 + }, + { + "epoch": 0.7412234042553192, + "grad_norm": 3.556530475616455, + "learning_rate": 9.738430453660774e-06, + "loss": 1.2388, + "step": 2787 + }, + { + "epoch": 0.7414893617021276, + "grad_norm": 4.152648448944092, + "learning_rate": 9.738149638936547e-06, + "loss": 1.3962, + "step": 2788 + }, + { + "epoch": 0.7417553191489362, + "grad_norm": 3.8726470470428467, + "learning_rate": 9.73786867760839e-06, + "loss": 1.368, + "step": 2789 + }, + { + "epoch": 0.7420212765957447, + "grad_norm": 3.4200189113616943, + "learning_rate": 9.737587569685e-06, + "loss": 1.3165, + "step": 2790 + }, + { + "epoch": 0.7422872340425531, + "grad_norm": 3.8217222690582275, + "learning_rate": 9.737306315175072e-06, + "loss": 1.07, + "step": 2791 + }, + { + "epoch": 0.7425531914893617, + "grad_norm": 4.083987236022949, + "learning_rate": 9.73702491408731e-06, + "loss": 1.2129, + "step": 2792 + }, + { + "epoch": 0.7428191489361702, + "grad_norm": 3.396623373031616, + "learning_rate": 9.73674336643042e-06, + "loss": 1.1692, + "step": 2793 + }, + { + "epoch": 0.7430851063829788, + "grad_norm": 3.545069456100464, + "learning_rate": 9.736461672213112e-06, + "loss": 1.2257, + "step": 2794 + }, + { + "epoch": 0.7433510638297872, + "grad_norm": 3.856208324432373, + "learning_rate": 9.736179831444103e-06, + "loss": 1.4061, + "step": 2795 + }, + { + "epoch": 0.7436170212765958, + "grad_norm": 3.6652262210845947, + "learning_rate": 9.735897844132116e-06, + "loss": 1.1792, + "step": 2796 + }, + { + "epoch": 0.7438829787234043, + "grad_norm": 3.402409791946411, + "learning_rate": 9.735615710285873e-06, + "loss": 1.1954, + "step": 2797 + }, + { + "epoch": 0.7441489361702127, + "grad_norm": 4.120236396789551, + "learning_rate": 9.735333429914103e-06, + "loss": 1.3625, + "step": 2798 + }, + { + "epoch": 0.7444148936170213, + "grad_norm": 3.873011350631714, + "learning_rate": 9.735051003025543e-06, + "loss": 1.1915, + "step": 2799 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 3.4933876991271973, + "learning_rate": 9.73476842962893e-06, + "loss": 1.1695, + "step": 2800 + }, + { + "epoch": 0.7449468085106383, + "grad_norm": 3.8242671489715576, + "learning_rate": 9.734485709733007e-06, + "loss": 1.2618, + "step": 2801 + }, + { + "epoch": 0.7452127659574468, + "grad_norm": 3.512907028198242, + "learning_rate": 9.734202843346522e-06, + "loss": 1.1924, + "step": 2802 + }, + { + "epoch": 0.7454787234042554, + "grad_norm": 4.221972465515137, + "learning_rate": 9.733919830478227e-06, + "loss": 1.2335, + "step": 2803 + }, + { + "epoch": 0.7457446808510638, + "grad_norm": 3.864529609680176, + "learning_rate": 9.73363667113688e-06, + "loss": 1.3128, + "step": 2804 + }, + { + "epoch": 0.7460106382978723, + "grad_norm": 4.328346252441406, + "learning_rate": 9.73335336533124e-06, + "loss": 1.3956, + "step": 2805 + }, + { + "epoch": 0.7462765957446809, + "grad_norm": 3.605314254760742, + "learning_rate": 9.733069913070074e-06, + "loss": 1.1795, + "step": 2806 + }, + { + "epoch": 0.7465425531914893, + "grad_norm": 
4.531727313995361, + "learning_rate": 9.732786314362154e-06, + "loss": 1.3895, + "step": 2807 + }, + { + "epoch": 0.7468085106382979, + "grad_norm": 3.587550163269043, + "learning_rate": 9.732502569216252e-06, + "loss": 1.289, + "step": 2808 + }, + { + "epoch": 0.7470744680851064, + "grad_norm": 3.99782133102417, + "learning_rate": 9.73221867764115e-06, + "loss": 1.3014, + "step": 2809 + }, + { + "epoch": 0.7473404255319149, + "grad_norm": 3.9140994548797607, + "learning_rate": 9.731934639645628e-06, + "loss": 1.2428, + "step": 2810 + }, + { + "epoch": 0.7476063829787234, + "grad_norm": 3.7804577350616455, + "learning_rate": 9.73165045523848e-06, + "loss": 1.2315, + "step": 2811 + }, + { + "epoch": 0.747872340425532, + "grad_norm": 4.103899002075195, + "learning_rate": 9.731366124428495e-06, + "loss": 1.4515, + "step": 2812 + }, + { + "epoch": 0.7481382978723404, + "grad_norm": 4.170511245727539, + "learning_rate": 9.73108164722447e-06, + "loss": 1.3773, + "step": 2813 + }, + { + "epoch": 0.7484042553191489, + "grad_norm": 3.4937591552734375, + "learning_rate": 9.73079702363521e-06, + "loss": 1.1113, + "step": 2814 + }, + { + "epoch": 0.7486702127659575, + "grad_norm": 3.6979286670684814, + "learning_rate": 9.730512253669523e-06, + "loss": 1.2525, + "step": 2815 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 3.6911709308624268, + "learning_rate": 9.730227337336214e-06, + "loss": 1.2443, + "step": 2816 + }, + { + "epoch": 0.7492021276595745, + "grad_norm": 3.462308883666992, + "learning_rate": 9.729942274644102e-06, + "loss": 1.1075, + "step": 2817 + }, + { + "epoch": 0.749468085106383, + "grad_norm": 4.0079240798950195, + "learning_rate": 9.729657065602007e-06, + "loss": 1.2715, + "step": 2818 + }, + { + "epoch": 0.7497340425531915, + "grad_norm": 3.6619253158569336, + "learning_rate": 9.729371710218755e-06, + "loss": 1.135, + "step": 2819 + }, + { + "epoch": 0.75, + "grad_norm": 3.3799519538879395, + "learning_rate": 9.729086208503174e-06, + "loss": 1.2331, + "step": 2820 + }, + { + "epoch": 0.7502659574468085, + "grad_norm": 3.828418493270874, + "learning_rate": 9.728800560464097e-06, + "loss": 1.3006, + "step": 2821 + }, + { + "epoch": 0.750531914893617, + "grad_norm": 4.1295928955078125, + "learning_rate": 9.728514766110366e-06, + "loss": 1.2404, + "step": 2822 + }, + { + "epoch": 0.7507978723404255, + "grad_norm": 3.73343825340271, + "learning_rate": 9.728228825450818e-06, + "loss": 1.3261, + "step": 2823 + }, + { + "epoch": 0.7510638297872341, + "grad_norm": 3.336246967315674, + "learning_rate": 9.727942738494305e-06, + "loss": 1.0928, + "step": 2824 + }, + { + "epoch": 0.7513297872340425, + "grad_norm": 3.4438130855560303, + "learning_rate": 9.727656505249676e-06, + "loss": 1.2058, + "step": 2825 + }, + { + "epoch": 0.7515957446808511, + "grad_norm": 3.7546231746673584, + "learning_rate": 9.72737012572579e-06, + "loss": 1.1447, + "step": 2826 + }, + { + "epoch": 0.7518617021276596, + "grad_norm": 4.008635520935059, + "learning_rate": 9.727083599931506e-06, + "loss": 1.3526, + "step": 2827 + }, + { + "epoch": 0.752127659574468, + "grad_norm": 4.192075729370117, + "learning_rate": 9.726796927875688e-06, + "loss": 1.3889, + "step": 2828 + }, + { + "epoch": 0.7523936170212766, + "grad_norm": 3.805386543273926, + "learning_rate": 9.726510109567211e-06, + "loss": 1.3894, + "step": 2829 + }, + { + "epoch": 0.7526595744680851, + "grad_norm": 3.9009950160980225, + "learning_rate": 9.726223145014946e-06, + "loss": 1.2844, + "step": 2830 + }, + { + "epoch": 0.7529255319148936, + 
"grad_norm": 3.870450735092163, + "learning_rate": 9.725936034227771e-06, + "loss": 1.2328, + "step": 2831 + }, + { + "epoch": 0.7531914893617021, + "grad_norm": 3.5746779441833496, + "learning_rate": 9.725648777214571e-06, + "loss": 1.2661, + "step": 2832 + }, + { + "epoch": 0.7534574468085107, + "grad_norm": 4.304332733154297, + "learning_rate": 9.725361373984235e-06, + "loss": 1.2722, + "step": 2833 + }, + { + "epoch": 0.7537234042553191, + "grad_norm": 3.693098783493042, + "learning_rate": 9.725073824545655e-06, + "loss": 1.3476, + "step": 2834 + }, + { + "epoch": 0.7539893617021277, + "grad_norm": 3.3664565086364746, + "learning_rate": 9.724786128907726e-06, + "loss": 1.2575, + "step": 2835 + }, + { + "epoch": 0.7542553191489362, + "grad_norm": 3.585892915725708, + "learning_rate": 9.724498287079353e-06, + "loss": 1.3478, + "step": 2836 + }, + { + "epoch": 0.7545212765957446, + "grad_norm": 3.768718957901001, + "learning_rate": 9.72421029906944e-06, + "loss": 1.2749, + "step": 2837 + }, + { + "epoch": 0.7547872340425532, + "grad_norm": 3.891233205795288, + "learning_rate": 9.723922164886898e-06, + "loss": 1.3033, + "step": 2838 + }, + { + "epoch": 0.7550531914893617, + "grad_norm": 3.5751054286956787, + "learning_rate": 9.723633884540643e-06, + "loss": 1.1453, + "step": 2839 + }, + { + "epoch": 0.7553191489361702, + "grad_norm": 3.516754150390625, + "learning_rate": 9.723345458039595e-06, + "loss": 1.2553, + "step": 2840 + }, + { + "epoch": 0.7555851063829787, + "grad_norm": 3.76668643951416, + "learning_rate": 9.723056885392677e-06, + "loss": 1.3444, + "step": 2841 + }, + { + "epoch": 0.7558510638297873, + "grad_norm": 3.9877772331237793, + "learning_rate": 9.722768166608818e-06, + "loss": 1.2582, + "step": 2842 + }, + { + "epoch": 0.7561170212765957, + "grad_norm": 3.631065607070923, + "learning_rate": 9.72247930169695e-06, + "loss": 1.3652, + "step": 2843 + }, + { + "epoch": 0.7563829787234042, + "grad_norm": 3.124361515045166, + "learning_rate": 9.722190290666014e-06, + "loss": 0.9727, + "step": 2844 + }, + { + "epoch": 0.7566489361702128, + "grad_norm": 3.7869699001312256, + "learning_rate": 9.721901133524951e-06, + "loss": 1.3348, + "step": 2845 + }, + { + "epoch": 0.7569148936170212, + "grad_norm": 3.49450421333313, + "learning_rate": 9.721611830282707e-06, + "loss": 1.2607, + "step": 2846 + }, + { + "epoch": 0.7571808510638298, + "grad_norm": 4.137457370758057, + "learning_rate": 9.721322380948235e-06, + "loss": 1.2993, + "step": 2847 + }, + { + "epoch": 0.7574468085106383, + "grad_norm": 3.492685317993164, + "learning_rate": 9.721032785530488e-06, + "loss": 1.3636, + "step": 2848 + }, + { + "epoch": 0.7577127659574469, + "grad_norm": 3.78635835647583, + "learning_rate": 9.72074304403843e-06, + "loss": 1.3039, + "step": 2849 + }, + { + "epoch": 0.7579787234042553, + "grad_norm": 3.5052456855773926, + "learning_rate": 9.720453156481023e-06, + "loss": 1.1737, + "step": 2850 + }, + { + "epoch": 0.7582446808510638, + "grad_norm": 3.5687224864959717, + "learning_rate": 9.72016312286724e-06, + "loss": 1.3378, + "step": 2851 + }, + { + "epoch": 0.7585106382978724, + "grad_norm": 3.2821710109710693, + "learning_rate": 9.71987294320605e-06, + "loss": 1.0614, + "step": 2852 + }, + { + "epoch": 0.7587765957446808, + "grad_norm": 3.9896838665008545, + "learning_rate": 9.719582617506434e-06, + "loss": 1.4842, + "step": 2853 + }, + { + "epoch": 0.7590425531914894, + "grad_norm": 3.674095392227173, + "learning_rate": 9.719292145777377e-06, + "loss": 1.2268, + "step": 2854 + }, + { + "epoch": 
0.7593085106382979, + "grad_norm": 3.586404800415039, + "learning_rate": 9.719001528027863e-06, + "loss": 1.3219, + "step": 2855 + }, + { + "epoch": 0.7595744680851064, + "grad_norm": 3.734853744506836, + "learning_rate": 9.718710764266888e-06, + "loss": 1.2469, + "step": 2856 + }, + { + "epoch": 0.7598404255319149, + "grad_norm": 3.4392611980438232, + "learning_rate": 9.718419854503444e-06, + "loss": 1.1928, + "step": 2857 + }, + { + "epoch": 0.7601063829787233, + "grad_norm": 3.7639527320861816, + "learning_rate": 9.718128798746537e-06, + "loss": 1.2995, + "step": 2858 + }, + { + "epoch": 0.7603723404255319, + "grad_norm": 3.564790964126587, + "learning_rate": 9.717837597005169e-06, + "loss": 1.2086, + "step": 2859 + }, + { + "epoch": 0.7606382978723404, + "grad_norm": 3.9883244037628174, + "learning_rate": 9.71754624928835e-06, + "loss": 1.2138, + "step": 2860 + }, + { + "epoch": 0.760904255319149, + "grad_norm": 3.823289632797241, + "learning_rate": 9.717254755605097e-06, + "loss": 1.2225, + "step": 2861 + }, + { + "epoch": 0.7611702127659574, + "grad_norm": 3.4945852756500244, + "learning_rate": 9.716963115964427e-06, + "loss": 1.26, + "step": 2862 + }, + { + "epoch": 0.761436170212766, + "grad_norm": 3.7626545429229736, + "learning_rate": 9.716671330375366e-06, + "loss": 1.2424, + "step": 2863 + }, + { + "epoch": 0.7617021276595745, + "grad_norm": 3.789428949356079, + "learning_rate": 9.71637939884694e-06, + "loss": 1.3538, + "step": 2864 + }, + { + "epoch": 0.761968085106383, + "grad_norm": 3.781531810760498, + "learning_rate": 9.716087321388184e-06, + "loss": 1.2693, + "step": 2865 + }, + { + "epoch": 0.7622340425531915, + "grad_norm": 3.184601306915283, + "learning_rate": 9.715795098008132e-06, + "loss": 1.0477, + "step": 2866 + }, + { + "epoch": 0.7625, + "grad_norm": 3.636810302734375, + "learning_rate": 9.715502728715827e-06, + "loss": 1.2691, + "step": 2867 + }, + { + "epoch": 0.7627659574468085, + "grad_norm": 4.0694122314453125, + "learning_rate": 9.715210213520317e-06, + "loss": 1.3419, + "step": 2868 + }, + { + "epoch": 0.763031914893617, + "grad_norm": 3.9551241397857666, + "learning_rate": 9.714917552430652e-06, + "loss": 1.2398, + "step": 2869 + }, + { + "epoch": 0.7632978723404256, + "grad_norm": 3.7696473598480225, + "learning_rate": 9.714624745455885e-06, + "loss": 1.2691, + "step": 2870 + }, + { + "epoch": 0.763563829787234, + "grad_norm": 3.726793050765991, + "learning_rate": 9.71433179260508e-06, + "loss": 1.2308, + "step": 2871 + }, + { + "epoch": 0.7638297872340426, + "grad_norm": 3.6226067543029785, + "learning_rate": 9.714038693887298e-06, + "loss": 1.3653, + "step": 2872 + }, + { + "epoch": 0.7640957446808511, + "grad_norm": 3.4948949813842773, + "learning_rate": 9.713745449311606e-06, + "loss": 1.2048, + "step": 2873 + }, + { + "epoch": 0.7643617021276595, + "grad_norm": 3.3849282264709473, + "learning_rate": 9.713452058887084e-06, + "loss": 1.1664, + "step": 2874 + }, + { + "epoch": 0.7646276595744681, + "grad_norm": 3.9506824016571045, + "learning_rate": 9.713158522622804e-06, + "loss": 1.4175, + "step": 2875 + }, + { + "epoch": 0.7648936170212766, + "grad_norm": 3.5069642066955566, + "learning_rate": 9.71286484052785e-06, + "loss": 1.2298, + "step": 2876 + }, + { + "epoch": 0.7651595744680851, + "grad_norm": 3.5655500888824463, + "learning_rate": 9.71257101261131e-06, + "loss": 1.1717, + "step": 2877 + }, + { + "epoch": 0.7654255319148936, + "grad_norm": 3.450375556945801, + "learning_rate": 9.712277038882274e-06, + "loss": 1.1573, + "step": 2878 + }, + { 
+ "epoch": 0.7656914893617022, + "grad_norm": 3.849936008453369, + "learning_rate": 9.711982919349839e-06, + "loss": 1.1671, + "step": 2879 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 3.557499647140503, + "learning_rate": 9.711688654023105e-06, + "loss": 1.2369, + "step": 2880 + }, + { + "epoch": 0.7662234042553191, + "grad_norm": 4.1276326179504395, + "learning_rate": 9.711394242911177e-06, + "loss": 1.2304, + "step": 2881 + }, + { + "epoch": 0.7664893617021277, + "grad_norm": 3.553694725036621, + "learning_rate": 9.711099686023161e-06, + "loss": 1.285, + "step": 2882 + }, + { + "epoch": 0.7667553191489361, + "grad_norm": 3.484138250350952, + "learning_rate": 9.710804983368177e-06, + "loss": 1.2578, + "step": 2883 + }, + { + "epoch": 0.7670212765957447, + "grad_norm": 3.855220317840576, + "learning_rate": 9.71051013495534e-06, + "loss": 1.2213, + "step": 2884 + }, + { + "epoch": 0.7672872340425532, + "grad_norm": 3.9998855590820312, + "learning_rate": 9.710215140793774e-06, + "loss": 1.231, + "step": 2885 + }, + { + "epoch": 0.7675531914893617, + "grad_norm": 3.568758487701416, + "learning_rate": 9.709920000892605e-06, + "loss": 1.1779, + "step": 2886 + }, + { + "epoch": 0.7678191489361702, + "grad_norm": 3.5209362506866455, + "learning_rate": 9.709624715260965e-06, + "loss": 1.0908, + "step": 2887 + }, + { + "epoch": 0.7680851063829788, + "grad_norm": 3.783108949661255, + "learning_rate": 9.709329283907993e-06, + "loss": 1.3374, + "step": 2888 + }, + { + "epoch": 0.7683510638297872, + "grad_norm": 3.672305107116699, + "learning_rate": 9.70903370684283e-06, + "loss": 1.2719, + "step": 2889 + }, + { + "epoch": 0.7686170212765957, + "grad_norm": 3.9783568382263184, + "learning_rate": 9.708737984074616e-06, + "loss": 1.2343, + "step": 2890 + }, + { + "epoch": 0.7688829787234043, + "grad_norm": 3.6471900939941406, + "learning_rate": 9.708442115612508e-06, + "loss": 1.1384, + "step": 2891 + }, + { + "epoch": 0.7691489361702127, + "grad_norm": 3.8330166339874268, + "learning_rate": 9.708146101465657e-06, + "loss": 1.3178, + "step": 2892 + }, + { + "epoch": 0.7694148936170213, + "grad_norm": 3.224055290222168, + "learning_rate": 9.707849941643222e-06, + "loss": 1.087, + "step": 2893 + }, + { + "epoch": 0.7696808510638298, + "grad_norm": 4.061996936798096, + "learning_rate": 9.707553636154366e-06, + "loss": 1.4389, + "step": 2894 + }, + { + "epoch": 0.7699468085106383, + "grad_norm": 3.7000250816345215, + "learning_rate": 9.707257185008259e-06, + "loss": 1.2383, + "step": 2895 + }, + { + "epoch": 0.7702127659574468, + "grad_norm": 3.3188624382019043, + "learning_rate": 9.706960588214072e-06, + "loss": 1.1835, + "step": 2896 + }, + { + "epoch": 0.7704787234042553, + "grad_norm": 3.68198299407959, + "learning_rate": 9.706663845780984e-06, + "loss": 1.2511, + "step": 2897 + }, + { + "epoch": 0.7707446808510638, + "grad_norm": 3.831139326095581, + "learning_rate": 9.706366957718174e-06, + "loss": 1.3409, + "step": 2898 + }, + { + "epoch": 0.7710106382978723, + "grad_norm": 3.3753414154052734, + "learning_rate": 9.70606992403483e-06, + "loss": 1.1988, + "step": 2899 + }, + { + "epoch": 0.7712765957446809, + "grad_norm": 3.3466532230377197, + "learning_rate": 9.705772744740142e-06, + "loss": 1.1079, + "step": 2900 + }, + { + "epoch": 0.7715425531914893, + "grad_norm": 3.39589524269104, + "learning_rate": 9.705475419843304e-06, + "loss": 1.2094, + "step": 2901 + }, + { + "epoch": 0.7718085106382979, + "grad_norm": 3.5272488594055176, + "learning_rate": 9.705177949353516e-06, + "loss": 1.2466, 
+ "step": 2902 + }, + { + "epoch": 0.7720744680851064, + "grad_norm": 3.9202656745910645, + "learning_rate": 9.704880333279985e-06, + "loss": 1.2347, + "step": 2903 + }, + { + "epoch": 0.7723404255319148, + "grad_norm": 3.421706199645996, + "learning_rate": 9.704582571631915e-06, + "loss": 1.1643, + "step": 2904 + }, + { + "epoch": 0.7726063829787234, + "grad_norm": 3.8939504623413086, + "learning_rate": 9.704284664418521e-06, + "loss": 1.4996, + "step": 2905 + }, + { + "epoch": 0.7728723404255319, + "grad_norm": 3.362236976623535, + "learning_rate": 9.703986611649024e-06, + "loss": 1.2661, + "step": 2906 + }, + { + "epoch": 0.7731382978723405, + "grad_norm": 3.2896718978881836, + "learning_rate": 9.70368841333264e-06, + "loss": 1.0865, + "step": 2907 + }, + { + "epoch": 0.7734042553191489, + "grad_norm": 3.662534475326538, + "learning_rate": 9.7033900694786e-06, + "loss": 1.223, + "step": 2908 + }, + { + "epoch": 0.7736702127659575, + "grad_norm": 3.7135627269744873, + "learning_rate": 9.703091580096132e-06, + "loss": 1.4123, + "step": 2909 + }, + { + "epoch": 0.773936170212766, + "grad_norm": 3.431130886077881, + "learning_rate": 9.702792945194475e-06, + "loss": 1.139, + "step": 2910 + }, + { + "epoch": 0.7742021276595744, + "grad_norm": 4.038398742675781, + "learning_rate": 9.702494164782866e-06, + "loss": 1.3352, + "step": 2911 + }, + { + "epoch": 0.774468085106383, + "grad_norm": 3.5457537174224854, + "learning_rate": 9.702195238870552e-06, + "loss": 1.2472, + "step": 2912 + }, + { + "epoch": 0.7747340425531914, + "grad_norm": 3.9684653282165527, + "learning_rate": 9.70189616746678e-06, + "loss": 1.2834, + "step": 2913 + }, + { + "epoch": 0.775, + "grad_norm": 3.520798683166504, + "learning_rate": 9.701596950580807e-06, + "loss": 1.1989, + "step": 2914 + }, + { + "epoch": 0.7752659574468085, + "grad_norm": 3.4203343391418457, + "learning_rate": 9.701297588221888e-06, + "loss": 1.2368, + "step": 2915 + }, + { + "epoch": 0.7755319148936171, + "grad_norm": 3.5501503944396973, + "learning_rate": 9.700998080399287e-06, + "loss": 1.2317, + "step": 2916 + }, + { + "epoch": 0.7757978723404255, + "grad_norm": 3.5603249073028564, + "learning_rate": 9.700698427122269e-06, + "loss": 1.2071, + "step": 2917 + }, + { + "epoch": 0.7760638297872341, + "grad_norm": 3.5951790809631348, + "learning_rate": 9.700398628400109e-06, + "loss": 1.1681, + "step": 2918 + }, + { + "epoch": 0.7763297872340426, + "grad_norm": 3.6561312675476074, + "learning_rate": 9.700098684242082e-06, + "loss": 1.3097, + "step": 2919 + }, + { + "epoch": 0.776595744680851, + "grad_norm": 3.628885269165039, + "learning_rate": 9.699798594657464e-06, + "loss": 1.2199, + "step": 2920 + }, + { + "epoch": 0.7768617021276596, + "grad_norm": 3.6864166259765625, + "learning_rate": 9.699498359655548e-06, + "loss": 1.2123, + "step": 2921 + }, + { + "epoch": 0.777127659574468, + "grad_norm": 4.034405708312988, + "learning_rate": 9.699197979245617e-06, + "loss": 1.3019, + "step": 2922 + }, + { + "epoch": 0.7773936170212766, + "grad_norm": 3.9352498054504395, + "learning_rate": 9.69889745343697e-06, + "loss": 1.4196, + "step": 2923 + }, + { + "epoch": 0.7776595744680851, + "grad_norm": 3.983980894088745, + "learning_rate": 9.698596782238904e-06, + "loss": 1.1829, + "step": 2924 + }, + { + "epoch": 0.7779255319148937, + "grad_norm": 3.4715261459350586, + "learning_rate": 9.698295965660721e-06, + "loss": 1.144, + "step": 2925 + }, + { + "epoch": 0.7781914893617021, + "grad_norm": 3.7768967151641846, + "learning_rate": 9.69799500371173e-06, + 
"loss": 1.2891, + "step": 2926 + }, + { + "epoch": 0.7784574468085106, + "grad_norm": 3.628307580947876, + "learning_rate": 9.697693896401239e-06, + "loss": 1.2956, + "step": 2927 + }, + { + "epoch": 0.7787234042553192, + "grad_norm": 3.601635456085205, + "learning_rate": 9.697392643738571e-06, + "loss": 1.2924, + "step": 2928 + }, + { + "epoch": 0.7789893617021276, + "grad_norm": 3.6882519721984863, + "learning_rate": 9.697091245733043e-06, + "loss": 1.2887, + "step": 2929 + }, + { + "epoch": 0.7792553191489362, + "grad_norm": 3.7858314514160156, + "learning_rate": 9.696789702393982e-06, + "loss": 1.3439, + "step": 2930 + }, + { + "epoch": 0.7795212765957447, + "grad_norm": 3.6974260807037354, + "learning_rate": 9.696488013730717e-06, + "loss": 1.2487, + "step": 2931 + }, + { + "epoch": 0.7797872340425532, + "grad_norm": 3.5106611251831055, + "learning_rate": 9.696186179752587e-06, + "loss": 1.1533, + "step": 2932 + }, + { + "epoch": 0.7800531914893617, + "grad_norm": 3.440690279006958, + "learning_rate": 9.695884200468923e-06, + "loss": 1.1004, + "step": 2933 + }, + { + "epoch": 0.7803191489361702, + "grad_norm": 3.43935227394104, + "learning_rate": 9.695582075889077e-06, + "loss": 1.192, + "step": 2934 + }, + { + "epoch": 0.7805851063829787, + "grad_norm": 3.6551554203033447, + "learning_rate": 9.695279806022391e-06, + "loss": 1.2693, + "step": 2935 + }, + { + "epoch": 0.7808510638297872, + "grad_norm": 3.6879799365997314, + "learning_rate": 9.694977390878219e-06, + "loss": 1.3101, + "step": 2936 + }, + { + "epoch": 0.7811170212765958, + "grad_norm": 3.6642568111419678, + "learning_rate": 9.69467483046592e-06, + "loss": 1.3313, + "step": 2937 + }, + { + "epoch": 0.7813829787234042, + "grad_norm": 3.6739001274108887, + "learning_rate": 9.694372124794855e-06, + "loss": 1.175, + "step": 2938 + }, + { + "epoch": 0.7816489361702128, + "grad_norm": 3.346895933151245, + "learning_rate": 9.69406927387439e-06, + "loss": 1.135, + "step": 2939 + }, + { + "epoch": 0.7819148936170213, + "grad_norm": 3.605050563812256, + "learning_rate": 9.693766277713893e-06, + "loss": 1.2365, + "step": 2940 + }, + { + "epoch": 0.7821808510638298, + "grad_norm": 3.56868839263916, + "learning_rate": 9.693463136322743e-06, + "loss": 1.2756, + "step": 2941 + }, + { + "epoch": 0.7824468085106383, + "grad_norm": 3.4643678665161133, + "learning_rate": 9.693159849710317e-06, + "loss": 1.1344, + "step": 2942 + }, + { + "epoch": 0.7827127659574468, + "grad_norm": 3.7843425273895264, + "learning_rate": 9.692856417885998e-06, + "loss": 1.2301, + "step": 2943 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 3.7226831912994385, + "learning_rate": 9.69255284085918e-06, + "loss": 1.2124, + "step": 2944 + }, + { + "epoch": 0.7832446808510638, + "grad_norm": 3.5860259532928467, + "learning_rate": 9.69224911863925e-06, + "loss": 1.2237, + "step": 2945 + }, + { + "epoch": 0.7835106382978724, + "grad_norm": 3.68369722366333, + "learning_rate": 9.691945251235608e-06, + "loss": 1.3566, + "step": 2946 + }, + { + "epoch": 0.7837765957446808, + "grad_norm": 3.778324842453003, + "learning_rate": 9.691641238657655e-06, + "loss": 1.2369, + "step": 2947 + }, + { + "epoch": 0.7840425531914894, + "grad_norm": 3.4326350688934326, + "learning_rate": 9.6913370809148e-06, + "loss": 1.0766, + "step": 2948 + }, + { + "epoch": 0.7843085106382979, + "grad_norm": 3.609269380569458, + "learning_rate": 9.691032778016452e-06, + "loss": 1.228, + "step": 2949 + }, + { + "epoch": 0.7845744680851063, + "grad_norm": 3.3350110054016113, + "learning_rate": 
9.690728329972025e-06, + "loss": 1.1658, + "step": 2950 + }, + { + "epoch": 0.7848404255319149, + "grad_norm": 3.53971004486084, + "learning_rate": 9.690423736790944e-06, + "loss": 1.2674, + "step": 2951 + }, + { + "epoch": 0.7851063829787234, + "grad_norm": 3.3145904541015625, + "learning_rate": 9.690118998482628e-06, + "loss": 1.2601, + "step": 2952 + }, + { + "epoch": 0.785372340425532, + "grad_norm": 3.7415387630462646, + "learning_rate": 9.689814115056509e-06, + "loss": 1.3693, + "step": 2953 + }, + { + "epoch": 0.7856382978723404, + "grad_norm": 3.2443130016326904, + "learning_rate": 9.689509086522019e-06, + "loss": 1.1516, + "step": 2954 + }, + { + "epoch": 0.785904255319149, + "grad_norm": 3.4239816665649414, + "learning_rate": 9.689203912888597e-06, + "loss": 1.2722, + "step": 2955 + }, + { + "epoch": 0.7861702127659574, + "grad_norm": 3.5822324752807617, + "learning_rate": 9.688898594165685e-06, + "loss": 1.2253, + "step": 2956 + }, + { + "epoch": 0.7864361702127659, + "grad_norm": 3.2302675247192383, + "learning_rate": 9.688593130362731e-06, + "loss": 1.1031, + "step": 2957 + }, + { + "epoch": 0.7867021276595745, + "grad_norm": 3.6517271995544434, + "learning_rate": 9.688287521489184e-06, + "loss": 1.2459, + "step": 2958 + }, + { + "epoch": 0.7869680851063829, + "grad_norm": 3.772766351699829, + "learning_rate": 9.687981767554502e-06, + "loss": 1.2623, + "step": 2959 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 3.646852731704712, + "learning_rate": 9.687675868568145e-06, + "loss": 1.2951, + "step": 2960 + }, + { + "epoch": 0.7875, + "grad_norm": 3.738582134246826, + "learning_rate": 9.687369824539577e-06, + "loss": 1.3321, + "step": 2961 + }, + { + "epoch": 0.7877659574468086, + "grad_norm": 3.6618778705596924, + "learning_rate": 9.687063635478269e-06, + "loss": 1.3527, + "step": 2962 + }, + { + "epoch": 0.788031914893617, + "grad_norm": 3.6133735179901123, + "learning_rate": 9.686757301393693e-06, + "loss": 1.2852, + "step": 2963 + }, + { + "epoch": 0.7882978723404256, + "grad_norm": 3.7590041160583496, + "learning_rate": 9.686450822295327e-06, + "loss": 1.2057, + "step": 2964 + }, + { + "epoch": 0.788563829787234, + "grad_norm": 3.4455080032348633, + "learning_rate": 9.686144198192658e-06, + "loss": 1.2478, + "step": 2965 + }, + { + "epoch": 0.7888297872340425, + "grad_norm": 3.4166572093963623, + "learning_rate": 9.685837429095169e-06, + "loss": 1.2585, + "step": 2966 + }, + { + "epoch": 0.7890957446808511, + "grad_norm": 3.322124719619751, + "learning_rate": 9.685530515012352e-06, + "loss": 1.2452, + "step": 2967 + }, + { + "epoch": 0.7893617021276595, + "grad_norm": 3.493075132369995, + "learning_rate": 9.685223455953703e-06, + "loss": 1.1951, + "step": 2968 + }, + { + "epoch": 0.7896276595744681, + "grad_norm": 3.7366654872894287, + "learning_rate": 9.684916251928727e-06, + "loss": 1.4098, + "step": 2969 + }, + { + "epoch": 0.7898936170212766, + "grad_norm": 3.846484899520874, + "learning_rate": 9.684608902946926e-06, + "loss": 1.2726, + "step": 2970 + }, + { + "epoch": 0.7901595744680852, + "grad_norm": 3.382856607437134, + "learning_rate": 9.684301409017808e-06, + "loss": 1.2072, + "step": 2971 + }, + { + "epoch": 0.7904255319148936, + "grad_norm": 3.600064277648926, + "learning_rate": 9.68399377015089e-06, + "loss": 1.2991, + "step": 2972 + }, + { + "epoch": 0.7906914893617021, + "grad_norm": 3.4890823364257812, + "learning_rate": 9.683685986355692e-06, + "loss": 1.303, + "step": 2973 + }, + { + "epoch": 0.7909574468085107, + "grad_norm": 3.2720248699188232, + 
"learning_rate": 9.683378057641735e-06, + "loss": 1.305, + "step": 2974 + }, + { + "epoch": 0.7912234042553191, + "grad_norm": 3.3121964931488037, + "learning_rate": 9.683069984018545e-06, + "loss": 1.228, + "step": 2975 + }, + { + "epoch": 0.7914893617021277, + "grad_norm": 3.5907375812530518, + "learning_rate": 9.682761765495657e-06, + "loss": 1.3374, + "step": 2976 + }, + { + "epoch": 0.7917553191489362, + "grad_norm": 3.518444538116455, + "learning_rate": 9.682453402082607e-06, + "loss": 1.0759, + "step": 2977 + }, + { + "epoch": 0.7920212765957447, + "grad_norm": 3.7533528804779053, + "learning_rate": 9.682144893788934e-06, + "loss": 1.2666, + "step": 2978 + }, + { + "epoch": 0.7922872340425532, + "grad_norm": 3.877476453781128, + "learning_rate": 9.681836240624187e-06, + "loss": 1.2371, + "step": 2979 + }, + { + "epoch": 0.7925531914893617, + "grad_norm": 3.945760488510132, + "learning_rate": 9.681527442597916e-06, + "loss": 1.282, + "step": 2980 + }, + { + "epoch": 0.7928191489361702, + "grad_norm": 3.585514783859253, + "learning_rate": 9.681218499719673e-06, + "loss": 1.3038, + "step": 2981 + }, + { + "epoch": 0.7930851063829787, + "grad_norm": 4.198021411895752, + "learning_rate": 9.680909411999018e-06, + "loss": 1.4758, + "step": 2982 + }, + { + "epoch": 0.7933510638297873, + "grad_norm": 3.670048713684082, + "learning_rate": 9.680600179445514e-06, + "loss": 1.2579, + "step": 2983 + }, + { + "epoch": 0.7936170212765957, + "grad_norm": 3.6147031784057617, + "learning_rate": 9.68029080206873e-06, + "loss": 1.2565, + "step": 2984 + }, + { + "epoch": 0.7938829787234043, + "grad_norm": 3.589110851287842, + "learning_rate": 9.67998127987824e-06, + "loss": 1.2516, + "step": 2985 + }, + { + "epoch": 0.7941489361702128, + "grad_norm": 3.5315637588500977, + "learning_rate": 9.679671612883615e-06, + "loss": 1.2206, + "step": 2986 + }, + { + "epoch": 0.7944148936170212, + "grad_norm": 3.6465420722961426, + "learning_rate": 9.679361801094445e-06, + "loss": 1.2784, + "step": 2987 + }, + { + "epoch": 0.7946808510638298, + "grad_norm": 3.6671435832977295, + "learning_rate": 9.679051844520308e-06, + "loss": 1.4118, + "step": 2988 + }, + { + "epoch": 0.7949468085106383, + "grad_norm": 3.479151725769043, + "learning_rate": 9.6787417431708e-06, + "loss": 1.303, + "step": 2989 + }, + { + "epoch": 0.7952127659574468, + "grad_norm": 3.694517135620117, + "learning_rate": 9.678431497055515e-06, + "loss": 1.1658, + "step": 2990 + }, + { + "epoch": 0.7954787234042553, + "grad_norm": 3.453770637512207, + "learning_rate": 9.67812110618405e-06, + "loss": 1.2784, + "step": 2991 + }, + { + "epoch": 0.7957446808510639, + "grad_norm": 3.926161527633667, + "learning_rate": 9.677810570566011e-06, + "loss": 1.2926, + "step": 2992 + }, + { + "epoch": 0.7960106382978723, + "grad_norm": 3.6100566387176514, + "learning_rate": 9.677499890211005e-06, + "loss": 1.2504, + "step": 2993 + }, + { + "epoch": 0.7962765957446809, + "grad_norm": 3.496819019317627, + "learning_rate": 9.677189065128646e-06, + "loss": 1.1922, + "step": 2994 + }, + { + "epoch": 0.7965425531914894, + "grad_norm": 3.4073357582092285, + "learning_rate": 9.676878095328547e-06, + "loss": 1.1934, + "step": 2995 + }, + { + "epoch": 0.7968085106382978, + "grad_norm": 3.5559115409851074, + "learning_rate": 9.676566980820338e-06, + "loss": 1.3128, + "step": 2996 + }, + { + "epoch": 0.7970744680851064, + "grad_norm": 3.844743013381958, + "learning_rate": 9.676255721613639e-06, + "loss": 1.2881, + "step": 2997 + }, + { + "epoch": 0.7973404255319149, + 
"grad_norm": 3.2858474254608154, + "learning_rate": 9.675944317718083e-06, + "loss": 1.2103, + "step": 2998 + }, + { + "epoch": 0.7976063829787234, + "grad_norm": 3.7412915229797363, + "learning_rate": 9.675632769143303e-06, + "loss": 1.2254, + "step": 2999 + }, + { + "epoch": 0.7978723404255319, + "grad_norm": 4.140746116638184, + "learning_rate": 9.67532107589894e-06, + "loss": 1.2933, + "step": 3000 + }, + { + "epoch": 0.7978723404255319, + "eval_loss": 1.2683638334274292, + "eval_runtime": 12.6307, + "eval_samples_per_second": 31.669, + "eval_steps_per_second": 3.959, + "step": 3000 + }, + { + "epoch": 0.7981382978723405, + "grad_norm": 3.8456828594207764, + "learning_rate": 9.67500923799464e-06, + "loss": 1.3237, + "step": 3001 + }, + { + "epoch": 0.7984042553191489, + "grad_norm": 3.4592676162719727, + "learning_rate": 9.67469725544005e-06, + "loss": 1.0598, + "step": 3002 + }, + { + "epoch": 0.7986702127659574, + "grad_norm": 3.729926586151123, + "learning_rate": 9.674385128244823e-06, + "loss": 1.2681, + "step": 3003 + }, + { + "epoch": 0.798936170212766, + "grad_norm": 3.4208433628082275, + "learning_rate": 9.674072856418616e-06, + "loss": 1.3245, + "step": 3004 + }, + { + "epoch": 0.7992021276595744, + "grad_norm": 3.511957883834839, + "learning_rate": 9.673760439971091e-06, + "loss": 1.1623, + "step": 3005 + }, + { + "epoch": 0.799468085106383, + "grad_norm": 3.794137477874756, + "learning_rate": 9.673447878911916e-06, + "loss": 1.1303, + "step": 3006 + }, + { + "epoch": 0.7997340425531915, + "grad_norm": 3.826404571533203, + "learning_rate": 9.673135173250763e-06, + "loss": 1.3698, + "step": 3007 + }, + { + "epoch": 0.8, + "grad_norm": 3.5505003929138184, + "learning_rate": 9.672822322997305e-06, + "loss": 1.257, + "step": 3008 + }, + { + "epoch": 0.8002659574468085, + "grad_norm": 3.616678237915039, + "learning_rate": 9.672509328161222e-06, + "loss": 1.263, + "step": 3009 + }, + { + "epoch": 0.800531914893617, + "grad_norm": 3.5338237285614014, + "learning_rate": 9.672196188752201e-06, + "loss": 1.2328, + "step": 3010 + }, + { + "epoch": 0.8007978723404255, + "grad_norm": 3.4037692546844482, + "learning_rate": 9.671882904779927e-06, + "loss": 1.1843, + "step": 3011 + }, + { + "epoch": 0.801063829787234, + "grad_norm": 3.918245315551758, + "learning_rate": 9.671569476254096e-06, + "loss": 1.3486, + "step": 3012 + }, + { + "epoch": 0.8013297872340426, + "grad_norm": 3.5351336002349854, + "learning_rate": 9.671255903184405e-06, + "loss": 1.3272, + "step": 3013 + }, + { + "epoch": 0.801595744680851, + "grad_norm": 3.9071462154388428, + "learning_rate": 9.670942185580557e-06, + "loss": 1.1649, + "step": 3014 + }, + { + "epoch": 0.8018617021276596, + "grad_norm": 3.493410110473633, + "learning_rate": 9.670628323452259e-06, + "loss": 1.1651, + "step": 3015 + }, + { + "epoch": 0.8021276595744681, + "grad_norm": 3.2986040115356445, + "learning_rate": 9.670314316809222e-06, + "loss": 1.2718, + "step": 3016 + }, + { + "epoch": 0.8023936170212767, + "grad_norm": 3.4360411167144775, + "learning_rate": 9.67000016566116e-06, + "loss": 1.1393, + "step": 3017 + }, + { + "epoch": 0.8026595744680851, + "grad_norm": 3.690444231033325, + "learning_rate": 9.669685870017795e-06, + "loss": 1.1887, + "step": 3018 + }, + { + "epoch": 0.8029255319148936, + "grad_norm": 3.58248233795166, + "learning_rate": 9.669371429888852e-06, + "loss": 1.3714, + "step": 3019 + }, + { + "epoch": 0.8031914893617021, + "grad_norm": 3.723407745361328, + "learning_rate": 9.66905684528406e-06, + "loss": 1.2999, + "step": 
3020 + }, + { + "epoch": 0.8034574468085106, + "grad_norm": 3.7996089458465576, + "learning_rate": 9.66874211621315e-06, + "loss": 1.3091, + "step": 3021 + }, + { + "epoch": 0.8037234042553192, + "grad_norm": 3.741523265838623, + "learning_rate": 9.668427242685864e-06, + "loss": 1.261, + "step": 3022 + }, + { + "epoch": 0.8039893617021276, + "grad_norm": 3.6952426433563232, + "learning_rate": 9.668112224711941e-06, + "loss": 1.3148, + "step": 3023 + }, + { + "epoch": 0.8042553191489362, + "grad_norm": 3.728320837020874, + "learning_rate": 9.667797062301133e-06, + "loss": 1.2188, + "step": 3024 + }, + { + "epoch": 0.8045212765957447, + "grad_norm": 3.7836687564849854, + "learning_rate": 9.667481755463183e-06, + "loss": 1.3981, + "step": 3025 + }, + { + "epoch": 0.8047872340425531, + "grad_norm": 3.308515787124634, + "learning_rate": 9.667166304207856e-06, + "loss": 1.2107, + "step": 3026 + }, + { + "epoch": 0.8050531914893617, + "grad_norm": 3.5682644844055176, + "learning_rate": 9.666850708544907e-06, + "loss": 1.2288, + "step": 3027 + }, + { + "epoch": 0.8053191489361702, + "grad_norm": 3.817530632019043, + "learning_rate": 9.666534968484105e-06, + "loss": 1.2821, + "step": 3028 + }, + { + "epoch": 0.8055851063829788, + "grad_norm": 3.1704676151275635, + "learning_rate": 9.666219084035215e-06, + "loss": 1.1683, + "step": 3029 + }, + { + "epoch": 0.8058510638297872, + "grad_norm": 3.884427547454834, + "learning_rate": 9.665903055208013e-06, + "loss": 1.3448, + "step": 3030 + }, + { + "epoch": 0.8061170212765958, + "grad_norm": 3.8523178100585938, + "learning_rate": 9.665586882012278e-06, + "loss": 1.1827, + "step": 3031 + }, + { + "epoch": 0.8063829787234043, + "grad_norm": 3.217390298843384, + "learning_rate": 9.66527056445779e-06, + "loss": 1.1782, + "step": 3032 + }, + { + "epoch": 0.8066489361702127, + "grad_norm": 3.484069585800171, + "learning_rate": 9.66495410255434e-06, + "loss": 1.2279, + "step": 3033 + }, + { + "epoch": 0.8069148936170213, + "grad_norm": 3.62542724609375, + "learning_rate": 9.664637496311717e-06, + "loss": 1.232, + "step": 3034 + }, + { + "epoch": 0.8071808510638298, + "grad_norm": 3.6373066902160645, + "learning_rate": 9.664320745739717e-06, + "loss": 1.2463, + "step": 3035 + }, + { + "epoch": 0.8074468085106383, + "grad_norm": 3.3646364212036133, + "learning_rate": 9.664003850848142e-06, + "loss": 1.1543, + "step": 3036 + }, + { + "epoch": 0.8077127659574468, + "grad_norm": 3.772383689880371, + "learning_rate": 9.663686811646798e-06, + "loss": 1.3646, + "step": 3037 + }, + { + "epoch": 0.8079787234042554, + "grad_norm": 3.8896496295928955, + "learning_rate": 9.663369628145493e-06, + "loss": 1.2321, + "step": 3038 + }, + { + "epoch": 0.8082446808510638, + "grad_norm": 4.038544654846191, + "learning_rate": 9.66305230035404e-06, + "loss": 1.2345, + "step": 3039 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 3.7592129707336426, + "learning_rate": 9.662734828282258e-06, + "loss": 1.2879, + "step": 3040 + }, + { + "epoch": 0.8087765957446809, + "grad_norm": 3.3927769660949707, + "learning_rate": 9.662417211939974e-06, + "loss": 1.2495, + "step": 3041 + }, + { + "epoch": 0.8090425531914893, + "grad_norm": 3.7398223876953125, + "learning_rate": 9.662099451337009e-06, + "loss": 1.2328, + "step": 3042 + }, + { + "epoch": 0.8093085106382979, + "grad_norm": 3.697510004043579, + "learning_rate": 9.6617815464832e-06, + "loss": 1.2306, + "step": 3043 + }, + { + "epoch": 0.8095744680851064, + "grad_norm": 3.362252712249756, + "learning_rate": 9.66146349738838e-06, + 
"loss": 1.2598, + "step": 3044 + }, + { + "epoch": 0.8098404255319149, + "grad_norm": 3.629018783569336, + "learning_rate": 9.661145304062391e-06, + "loss": 1.2364, + "step": 3045 + }, + { + "epoch": 0.8101063829787234, + "grad_norm": 3.6889262199401855, + "learning_rate": 9.66082696651508e-06, + "loss": 1.2122, + "step": 3046 + }, + { + "epoch": 0.810372340425532, + "grad_norm": 3.6210176944732666, + "learning_rate": 9.660508484756295e-06, + "loss": 1.2425, + "step": 3047 + }, + { + "epoch": 0.8106382978723404, + "grad_norm": 3.52443528175354, + "learning_rate": 9.66018985879589e-06, + "loss": 1.1755, + "step": 3048 + }, + { + "epoch": 0.8109042553191489, + "grad_norm": 3.6943182945251465, + "learning_rate": 9.659871088643724e-06, + "loss": 1.2033, + "step": 3049 + }, + { + "epoch": 0.8111702127659575, + "grad_norm": 3.6708784103393555, + "learning_rate": 9.65955217430966e-06, + "loss": 1.2418, + "step": 3050 + }, + { + "epoch": 0.8114361702127659, + "grad_norm": 3.3263115882873535, + "learning_rate": 9.659233115803565e-06, + "loss": 1.133, + "step": 3051 + }, + { + "epoch": 0.8117021276595745, + "grad_norm": 3.9797048568725586, + "learning_rate": 9.658913913135314e-06, + "loss": 1.2549, + "step": 3052 + }, + { + "epoch": 0.811968085106383, + "grad_norm": 3.505920648574829, + "learning_rate": 9.658594566314781e-06, + "loss": 1.3769, + "step": 3053 + }, + { + "epoch": 0.8122340425531915, + "grad_norm": 3.466444492340088, + "learning_rate": 9.658275075351846e-06, + "loss": 1.2394, + "step": 3054 + }, + { + "epoch": 0.8125, + "grad_norm": 3.4919936656951904, + "learning_rate": 9.657955440256396e-06, + "loss": 1.1807, + "step": 3055 + }, + { + "epoch": 0.8127659574468085, + "grad_norm": 3.8641278743743896, + "learning_rate": 9.65763566103832e-06, + "loss": 1.2532, + "step": 3056 + }, + { + "epoch": 0.813031914893617, + "grad_norm": 3.5937435626983643, + "learning_rate": 9.657315737707514e-06, + "loss": 1.2234, + "step": 3057 + }, + { + "epoch": 0.8132978723404255, + "grad_norm": 3.8876571655273438, + "learning_rate": 9.656995670273877e-06, + "loss": 1.2057, + "step": 3058 + }, + { + "epoch": 0.8135638297872341, + "grad_norm": 3.532804012298584, + "learning_rate": 9.656675458747308e-06, + "loss": 1.2109, + "step": 3059 + }, + { + "epoch": 0.8138297872340425, + "grad_norm": 3.421060800552368, + "learning_rate": 9.65635510313772e-06, + "loss": 1.2677, + "step": 3060 + }, + { + "epoch": 0.8140957446808511, + "grad_norm": 3.599653720855713, + "learning_rate": 9.656034603455022e-06, + "loss": 1.2561, + "step": 3061 + }, + { + "epoch": 0.8143617021276596, + "grad_norm": 3.297154664993286, + "learning_rate": 9.655713959709133e-06, + "loss": 1.1693, + "step": 3062 + }, + { + "epoch": 0.814627659574468, + "grad_norm": 3.678478240966797, + "learning_rate": 9.65539317190997e-06, + "loss": 1.2403, + "step": 3063 + }, + { + "epoch": 0.8148936170212766, + "grad_norm": 3.6876394748687744, + "learning_rate": 9.655072240067464e-06, + "loss": 1.2774, + "step": 3064 + }, + { + "epoch": 0.8151595744680851, + "grad_norm": 3.6876394748687744, + "learning_rate": 9.65475116419154e-06, + "loss": 1.1866, + "step": 3065 + }, + { + "epoch": 0.8154255319148936, + "grad_norm": 4.459439277648926, + "learning_rate": 9.654429944292136e-06, + "loss": 1.255, + "step": 3066 + }, + { + "epoch": 0.8156914893617021, + "grad_norm": 3.636715888977051, + "learning_rate": 9.65410858037919e-06, + "loss": 1.4368, + "step": 3067 + }, + { + "epoch": 0.8159574468085107, + "grad_norm": 3.7368946075439453, + "learning_rate": 
9.653787072462644e-06, + "loss": 1.3039, + "step": 3068 + }, + { + "epoch": 0.8162234042553191, + "grad_norm": 3.32794451713562, + "learning_rate": 9.653465420552445e-06, + "loss": 1.1366, + "step": 3069 + }, + { + "epoch": 0.8164893617021277, + "grad_norm": 3.3161087036132812, + "learning_rate": 9.65314362465855e-06, + "loss": 1.0602, + "step": 3070 + }, + { + "epoch": 0.8167553191489362, + "grad_norm": 3.6150729656219482, + "learning_rate": 9.652821684790912e-06, + "loss": 1.3939, + "step": 3071 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 3.7740049362182617, + "learning_rate": 9.652499600959493e-06, + "loss": 1.3626, + "step": 3072 + }, + { + "epoch": 0.8172872340425532, + "grad_norm": 3.8331871032714844, + "learning_rate": 9.65217737317426e-06, + "loss": 1.3151, + "step": 3073 + }, + { + "epoch": 0.8175531914893617, + "grad_norm": 3.3269927501678467, + "learning_rate": 9.65185500144518e-06, + "loss": 1.1879, + "step": 3074 + }, + { + "epoch": 0.8178191489361702, + "grad_norm": 3.318422555923462, + "learning_rate": 9.651532485782231e-06, + "loss": 1.2128, + "step": 3075 + }, + { + "epoch": 0.8180851063829787, + "grad_norm": 3.8798575401306152, + "learning_rate": 9.65120982619539e-06, + "loss": 1.2097, + "step": 3076 + }, + { + "epoch": 0.8183510638297873, + "grad_norm": 3.538886785507202, + "learning_rate": 9.650887022694639e-06, + "loss": 1.2558, + "step": 3077 + }, + { + "epoch": 0.8186170212765957, + "grad_norm": 3.8403117656707764, + "learning_rate": 9.65056407528997e-06, + "loss": 1.4618, + "step": 3078 + }, + { + "epoch": 0.8188829787234042, + "grad_norm": 3.731025218963623, + "learning_rate": 9.650240983991372e-06, + "loss": 1.2627, + "step": 3079 + }, + { + "epoch": 0.8191489361702128, + "grad_norm": 3.7986326217651367, + "learning_rate": 9.649917748808844e-06, + "loss": 1.2213, + "step": 3080 + }, + { + "epoch": 0.8194148936170212, + "grad_norm": 3.556394577026367, + "learning_rate": 9.649594369752384e-06, + "loss": 1.2093, + "step": 3081 + }, + { + "epoch": 0.8196808510638298, + "grad_norm": 3.989525318145752, + "learning_rate": 9.649270846832001e-06, + "loss": 1.4164, + "step": 3082 + }, + { + "epoch": 0.8199468085106383, + "grad_norm": 3.6029410362243652, + "learning_rate": 9.648947180057705e-06, + "loss": 1.315, + "step": 3083 + }, + { + "epoch": 0.8202127659574469, + "grad_norm": 3.677532196044922, + "learning_rate": 9.648623369439509e-06, + "loss": 1.3006, + "step": 3084 + }, + { + "epoch": 0.8204787234042553, + "grad_norm": 3.241009473800659, + "learning_rate": 9.648299414987434e-06, + "loss": 1.1637, + "step": 3085 + }, + { + "epoch": 0.8207446808510638, + "grad_norm": 3.470125198364258, + "learning_rate": 9.647975316711502e-06, + "loss": 1.1894, + "step": 3086 + }, + { + "epoch": 0.8210106382978724, + "grad_norm": 3.6613218784332275, + "learning_rate": 9.647651074621741e-06, + "loss": 1.2222, + "step": 3087 + }, + { + "epoch": 0.8212765957446808, + "grad_norm": 3.4483370780944824, + "learning_rate": 9.647326688728184e-06, + "loss": 1.1142, + "step": 3088 + }, + { + "epoch": 0.8215425531914894, + "grad_norm": 3.830843687057495, + "learning_rate": 9.647002159040868e-06, + "loss": 1.2923, + "step": 3089 + }, + { + "epoch": 0.8218085106382979, + "grad_norm": 3.445209264755249, + "learning_rate": 9.646677485569834e-06, + "loss": 1.2042, + "step": 3090 + }, + { + "epoch": 0.8220744680851064, + "grad_norm": 3.818505048751831, + "learning_rate": 9.646352668325128e-06, + "loss": 1.3102, + "step": 3091 + }, + { + "epoch": 0.8223404255319149, + "grad_norm": 
3.4437718391418457, + "learning_rate": 9.646027707316798e-06, + "loss": 1.1836, + "step": 3092 + }, + { + "epoch": 0.8226063829787233, + "grad_norm": 3.690908670425415, + "learning_rate": 9.645702602554902e-06, + "loss": 1.1375, + "step": 3093 + }, + { + "epoch": 0.8228723404255319, + "grad_norm": 4.1998209953308105, + "learning_rate": 9.645377354049499e-06, + "loss": 1.3336, + "step": 3094 + }, + { + "epoch": 0.8231382978723404, + "grad_norm": 3.559067487716675, + "learning_rate": 9.64505196181065e-06, + "loss": 1.1967, + "step": 3095 + }, + { + "epoch": 0.823404255319149, + "grad_norm": 3.657874584197998, + "learning_rate": 9.644726425848425e-06, + "loss": 1.2603, + "step": 3096 + }, + { + "epoch": 0.8236702127659574, + "grad_norm": 3.2679355144500732, + "learning_rate": 9.644400746172896e-06, + "loss": 1.177, + "step": 3097 + }, + { + "epoch": 0.823936170212766, + "grad_norm": 3.9587206840515137, + "learning_rate": 9.644074922794139e-06, + "loss": 1.2768, + "step": 3098 + }, + { + "epoch": 0.8242021276595745, + "grad_norm": 3.2773869037628174, + "learning_rate": 9.643748955722238e-06, + "loss": 1.2397, + "step": 3099 + }, + { + "epoch": 0.824468085106383, + "grad_norm": 3.796388864517212, + "learning_rate": 9.643422844967274e-06, + "loss": 1.3281, + "step": 3100 + }, + { + "epoch": 0.8247340425531915, + "grad_norm": 3.6081080436706543, + "learning_rate": 9.643096590539343e-06, + "loss": 1.1514, + "step": 3101 + }, + { + "epoch": 0.825, + "grad_norm": 3.6461782455444336, + "learning_rate": 9.642770192448537e-06, + "loss": 1.3713, + "step": 3102 + }, + { + "epoch": 0.8252659574468085, + "grad_norm": 3.731442451477051, + "learning_rate": 9.642443650704954e-06, + "loss": 1.3621, + "step": 3103 + }, + { + "epoch": 0.825531914893617, + "grad_norm": 3.8544721603393555, + "learning_rate": 9.642116965318697e-06, + "loss": 1.2699, + "step": 3104 + }, + { + "epoch": 0.8257978723404256, + "grad_norm": 3.6057963371276855, + "learning_rate": 9.641790136299877e-06, + "loss": 1.1425, + "step": 3105 + }, + { + "epoch": 0.826063829787234, + "grad_norm": 3.618706226348877, + "learning_rate": 9.641463163658606e-06, + "loss": 1.309, + "step": 3106 + }, + { + "epoch": 0.8263297872340426, + "grad_norm": 3.2677018642425537, + "learning_rate": 9.641136047405e-06, + "loss": 1.221, + "step": 3107 + }, + { + "epoch": 0.8265957446808511, + "grad_norm": 3.311882734298706, + "learning_rate": 9.64080878754918e-06, + "loss": 1.2231, + "step": 3108 + }, + { + "epoch": 0.8268617021276595, + "grad_norm": 3.435105562210083, + "learning_rate": 9.640481384101273e-06, + "loss": 1.3697, + "step": 3109 + }, + { + "epoch": 0.8271276595744681, + "grad_norm": 3.77473783493042, + "learning_rate": 9.640153837071407e-06, + "loss": 1.4063, + "step": 3110 + }, + { + "epoch": 0.8273936170212766, + "grad_norm": 3.6035094261169434, + "learning_rate": 9.63982614646972e-06, + "loss": 1.3273, + "step": 3111 + }, + { + "epoch": 0.8276595744680851, + "grad_norm": 3.4138381481170654, + "learning_rate": 9.639498312306348e-06, + "loss": 1.1646, + "step": 3112 + }, + { + "epoch": 0.8279255319148936, + "grad_norm": 3.638125419616699, + "learning_rate": 9.639170334591437e-06, + "loss": 1.3288, + "step": 3113 + }, + { + "epoch": 0.8281914893617022, + "grad_norm": 3.917206287384033, + "learning_rate": 9.638842213335132e-06, + "loss": 1.3541, + "step": 3114 + }, + { + "epoch": 0.8284574468085106, + "grad_norm": 4.120351314544678, + "learning_rate": 9.63851394854759e-06, + "loss": 1.3473, + "step": 3115 + }, + { + "epoch": 0.8287234042553191, + 
"grad_norm": 3.6400179862976074, + "learning_rate": 9.638185540238963e-06, + "loss": 1.3199, + "step": 3116 + }, + { + "epoch": 0.8289893617021277, + "grad_norm": 3.4678385257720947, + "learning_rate": 9.637856988419413e-06, + "loss": 1.3348, + "step": 3117 + }, + { + "epoch": 0.8292553191489361, + "grad_norm": 3.490227460861206, + "learning_rate": 9.637528293099111e-06, + "loss": 1.2041, + "step": 3118 + }, + { + "epoch": 0.8295212765957447, + "grad_norm": 3.3085920810699463, + "learning_rate": 9.637199454288222e-06, + "loss": 1.2509, + "step": 3119 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 3.5364296436309814, + "learning_rate": 9.636870471996923e-06, + "loss": 1.3302, + "step": 3120 + }, + { + "epoch": 0.8300531914893617, + "grad_norm": 3.952470302581787, + "learning_rate": 9.636541346235392e-06, + "loss": 1.3387, + "step": 3121 + }, + { + "epoch": 0.8303191489361702, + "grad_norm": 3.678920269012451, + "learning_rate": 9.636212077013812e-06, + "loss": 1.2225, + "step": 3122 + }, + { + "epoch": 0.8305851063829788, + "grad_norm": 3.4960269927978516, + "learning_rate": 9.635882664342373e-06, + "loss": 1.1883, + "step": 3123 + }, + { + "epoch": 0.8308510638297872, + "grad_norm": 3.1453335285186768, + "learning_rate": 9.635553108231266e-06, + "loss": 1.0471, + "step": 3124 + }, + { + "epoch": 0.8311170212765957, + "grad_norm": 3.6323747634887695, + "learning_rate": 9.635223408690688e-06, + "loss": 1.1595, + "step": 3125 + }, + { + "epoch": 0.8313829787234043, + "grad_norm": 3.2408368587493896, + "learning_rate": 9.634893565730841e-06, + "loss": 1.2454, + "step": 3126 + }, + { + "epoch": 0.8316489361702127, + "grad_norm": 3.628117322921753, + "learning_rate": 9.63456357936193e-06, + "loss": 1.3161, + "step": 3127 + }, + { + "epoch": 0.8319148936170213, + "grad_norm": 3.896415948867798, + "learning_rate": 9.634233449594165e-06, + "loss": 1.29, + "step": 3128 + }, + { + "epoch": 0.8321808510638298, + "grad_norm": 3.3425135612487793, + "learning_rate": 9.63390317643776e-06, + "loss": 1.0845, + "step": 3129 + }, + { + "epoch": 0.8324468085106383, + "grad_norm": 3.593471050262451, + "learning_rate": 9.633572759902936e-06, + "loss": 1.1751, + "step": 3130 + }, + { + "epoch": 0.8327127659574468, + "grad_norm": 3.8105530738830566, + "learning_rate": 9.633242199999916e-06, + "loss": 1.2935, + "step": 3131 + }, + { + "epoch": 0.8329787234042553, + "grad_norm": 3.5633177757263184, + "learning_rate": 9.632911496738927e-06, + "loss": 1.2376, + "step": 3132 + }, + { + "epoch": 0.8332446808510638, + "grad_norm": 3.5305428504943848, + "learning_rate": 9.632580650130201e-06, + "loss": 1.2905, + "step": 3133 + }, + { + "epoch": 0.8335106382978723, + "grad_norm": 3.328059196472168, + "learning_rate": 9.632249660183977e-06, + "loss": 1.2773, + "step": 3134 + }, + { + "epoch": 0.8337765957446809, + "grad_norm": 3.8208043575286865, + "learning_rate": 9.631918526910493e-06, + "loss": 1.2472, + "step": 3135 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 3.6366043090820312, + "learning_rate": 9.631587250319998e-06, + "loss": 1.1361, + "step": 3136 + }, + { + "epoch": 0.8343085106382979, + "grad_norm": 3.3834152221679688, + "learning_rate": 9.631255830422739e-06, + "loss": 1.2766, + "step": 3137 + }, + { + "epoch": 0.8345744680851064, + "grad_norm": 3.6326873302459717, + "learning_rate": 9.630924267228973e-06, + "loss": 1.2792, + "step": 3138 + }, + { + "epoch": 0.8348404255319148, + "grad_norm": 3.720566749572754, + "learning_rate": 9.630592560748957e-06, + "loss": 1.113, + "step": 3139 + }, + { 
+ "epoch": 0.8351063829787234, + "grad_norm": 3.732006549835205, + "learning_rate": 9.630260710992956e-06, + "loss": 1.1235, + "step": 3140 + }, + { + "epoch": 0.8353723404255319, + "grad_norm": 3.3565263748168945, + "learning_rate": 9.629928717971237e-06, + "loss": 1.1881, + "step": 3141 + }, + { + "epoch": 0.8356382978723405, + "grad_norm": 3.7368946075439453, + "learning_rate": 9.629596581694072e-06, + "loss": 1.2955, + "step": 3142 + }, + { + "epoch": 0.8359042553191489, + "grad_norm": 3.77895188331604, + "learning_rate": 9.629264302171739e-06, + "loss": 1.2691, + "step": 3143 + }, + { + "epoch": 0.8361702127659575, + "grad_norm": 3.6195473670959473, + "learning_rate": 9.628931879414519e-06, + "loss": 1.125, + "step": 3144 + }, + { + "epoch": 0.836436170212766, + "grad_norm": 3.4380621910095215, + "learning_rate": 9.628599313432694e-06, + "loss": 1.2379, + "step": 3145 + }, + { + "epoch": 0.8367021276595744, + "grad_norm": 3.972651958465576, + "learning_rate": 9.628266604236558e-06, + "loss": 1.2316, + "step": 3146 + }, + { + "epoch": 0.836968085106383, + "grad_norm": 3.770378351211548, + "learning_rate": 9.627933751836405e-06, + "loss": 1.4091, + "step": 3147 + }, + { + "epoch": 0.8372340425531914, + "grad_norm": 3.359567165374756, + "learning_rate": 9.627600756242532e-06, + "loss": 1.076, + "step": 3148 + }, + { + "epoch": 0.8375, + "grad_norm": 3.5449929237365723, + "learning_rate": 9.627267617465243e-06, + "loss": 1.1785, + "step": 3149 + }, + { + "epoch": 0.8377659574468085, + "grad_norm": 3.8262412548065186, + "learning_rate": 9.626934335514847e-06, + "loss": 1.1613, + "step": 3150 + }, + { + "epoch": 0.8380319148936171, + "grad_norm": 3.5842607021331787, + "learning_rate": 9.626600910401656e-06, + "loss": 1.4153, + "step": 3151 + }, + { + "epoch": 0.8382978723404255, + "grad_norm": 3.2474827766418457, + "learning_rate": 9.626267342135983e-06, + "loss": 1.1652, + "step": 3152 + }, + { + "epoch": 0.8385638297872341, + "grad_norm": 3.3414809703826904, + "learning_rate": 9.625933630728153e-06, + "loss": 1.062, + "step": 3153 + }, + { + "epoch": 0.8388297872340426, + "grad_norm": 3.496842384338379, + "learning_rate": 9.62559977618849e-06, + "loss": 1.255, + "step": 3154 + }, + { + "epoch": 0.839095744680851, + "grad_norm": 3.2567241191864014, + "learning_rate": 9.625265778527325e-06, + "loss": 1.1378, + "step": 3155 + }, + { + "epoch": 0.8393617021276596, + "grad_norm": 3.720892906188965, + "learning_rate": 9.62493163775499e-06, + "loss": 1.4717, + "step": 3156 + }, + { + "epoch": 0.839627659574468, + "grad_norm": 3.342963695526123, + "learning_rate": 9.624597353881827e-06, + "loss": 1.2974, + "step": 3157 + }, + { + "epoch": 0.8398936170212766, + "grad_norm": 3.3030459880828857, + "learning_rate": 9.624262926918174e-06, + "loss": 1.1823, + "step": 3158 + }, + { + "epoch": 0.8401595744680851, + "grad_norm": 3.4827306270599365, + "learning_rate": 9.623928356874384e-06, + "loss": 1.2282, + "step": 3159 + }, + { + "epoch": 0.8404255319148937, + "grad_norm": 3.247631311416626, + "learning_rate": 9.623593643760805e-06, + "loss": 1.2173, + "step": 3160 + }, + { + "epoch": 0.8406914893617021, + "grad_norm": 3.571974515914917, + "learning_rate": 9.623258787587795e-06, + "loss": 1.2277, + "step": 3161 + }, + { + "epoch": 0.8409574468085106, + "grad_norm": 3.5363829135894775, + "learning_rate": 9.622923788365716e-06, + "loss": 1.2212, + "step": 3162 + }, + { + "epoch": 0.8412234042553192, + "grad_norm": 3.816324234008789, + "learning_rate": 9.622588646104934e-06, + "loss": 1.3759, + "step": 
3163 + }, + { + "epoch": 0.8414893617021276, + "grad_norm": 3.8033061027526855, + "learning_rate": 9.622253360815814e-06, + "loss": 1.1493, + "step": 3164 + }, + { + "epoch": 0.8417553191489362, + "grad_norm": 3.7425754070281982, + "learning_rate": 9.621917932508733e-06, + "loss": 1.1964, + "step": 3165 + }, + { + "epoch": 0.8420212765957447, + "grad_norm": 3.4991588592529297, + "learning_rate": 9.62158236119407e-06, + "loss": 1.2337, + "step": 3166 + }, + { + "epoch": 0.8422872340425532, + "grad_norm": 3.450436592102051, + "learning_rate": 9.621246646882209e-06, + "loss": 1.1413, + "step": 3167 + }, + { + "epoch": 0.8425531914893617, + "grad_norm": 3.449032783508301, + "learning_rate": 9.620910789583534e-06, + "loss": 1.269, + "step": 3168 + }, + { + "epoch": 0.8428191489361702, + "grad_norm": 3.609985589981079, + "learning_rate": 9.62057478930844e-06, + "loss": 1.2008, + "step": 3169 + }, + { + "epoch": 0.8430851063829787, + "grad_norm": 3.5072379112243652, + "learning_rate": 9.620238646067322e-06, + "loss": 1.2176, + "step": 3170 + }, + { + "epoch": 0.8433510638297872, + "grad_norm": 3.481480836868286, + "learning_rate": 9.619902359870579e-06, + "loss": 1.2152, + "step": 3171 + }, + { + "epoch": 0.8436170212765958, + "grad_norm": 3.640972852706909, + "learning_rate": 9.619565930728618e-06, + "loss": 1.4143, + "step": 3172 + }, + { + "epoch": 0.8438829787234042, + "grad_norm": 3.5323524475097656, + "learning_rate": 9.61922935865185e-06, + "loss": 1.1856, + "step": 3173 + }, + { + "epoch": 0.8441489361702128, + "grad_norm": 3.837163209915161, + "learning_rate": 9.618892643650686e-06, + "loss": 1.243, + "step": 3174 + }, + { + "epoch": 0.8444148936170213, + "grad_norm": 3.702387809753418, + "learning_rate": 9.618555785735546e-06, + "loss": 1.1177, + "step": 3175 + }, + { + "epoch": 0.8446808510638298, + "grad_norm": 3.696453094482422, + "learning_rate": 9.618218784916851e-06, + "loss": 1.2794, + "step": 3176 + }, + { + "epoch": 0.8449468085106383, + "grad_norm": 3.467315435409546, + "learning_rate": 9.617881641205032e-06, + "loss": 1.1261, + "step": 3177 + }, + { + "epoch": 0.8452127659574468, + "grad_norm": 3.392866849899292, + "learning_rate": 9.617544354610516e-06, + "loss": 1.3169, + "step": 3178 + }, + { + "epoch": 0.8454787234042553, + "grad_norm": 3.4695167541503906, + "learning_rate": 9.617206925143742e-06, + "loss": 1.3706, + "step": 3179 + }, + { + "epoch": 0.8457446808510638, + "grad_norm": 3.658966064453125, + "learning_rate": 9.61686935281515e-06, + "loss": 1.289, + "step": 3180 + }, + { + "epoch": 0.8460106382978724, + "grad_norm": 3.779771327972412, + "learning_rate": 9.616531637635183e-06, + "loss": 1.2999, + "step": 3181 + }, + { + "epoch": 0.8462765957446808, + "grad_norm": 3.8787152767181396, + "learning_rate": 9.616193779614294e-06, + "loss": 1.2876, + "step": 3182 + }, + { + "epoch": 0.8465425531914894, + "grad_norm": 3.5529751777648926, + "learning_rate": 9.615855778762933e-06, + "loss": 1.2511, + "step": 3183 + }, + { + "epoch": 0.8468085106382979, + "grad_norm": 4.681981563568115, + "learning_rate": 9.61551763509156e-06, + "loss": 1.3139, + "step": 3184 + }, + { + "epoch": 0.8470744680851063, + "grad_norm": 3.130150556564331, + "learning_rate": 9.615179348610638e-06, + "loss": 1.1744, + "step": 3185 + }, + { + "epoch": 0.8473404255319149, + "grad_norm": 3.374901056289673, + "learning_rate": 9.614840919330632e-06, + "loss": 1.0669, + "step": 3186 + }, + { + "epoch": 0.8476063829787234, + "grad_norm": 3.805163621902466, + "learning_rate": 9.614502347262015e-06, + 
"loss": 1.3958, + "step": 3187 + }, + { + "epoch": 0.847872340425532, + "grad_norm": 3.173311948776245, + "learning_rate": 9.614163632415265e-06, + "loss": 1.2402, + "step": 3188 + }, + { + "epoch": 0.8481382978723404, + "grad_norm": 3.7105321884155273, + "learning_rate": 9.613824774800857e-06, + "loss": 1.2364, + "step": 3189 + }, + { + "epoch": 0.848404255319149, + "grad_norm": 3.5191519260406494, + "learning_rate": 9.613485774429279e-06, + "loss": 1.3238, + "step": 3190 + }, + { + "epoch": 0.8486702127659574, + "grad_norm": 3.2969210147857666, + "learning_rate": 9.613146631311018e-06, + "loss": 1.2284, + "step": 3191 + }, + { + "epoch": 0.8489361702127659, + "grad_norm": 3.6637449264526367, + "learning_rate": 9.612807345456571e-06, + "loss": 1.1128, + "step": 3192 + }, + { + "epoch": 0.8492021276595745, + "grad_norm": 3.9408974647521973, + "learning_rate": 9.612467916876434e-06, + "loss": 1.171, + "step": 3193 + }, + { + "epoch": 0.8494680851063829, + "grad_norm": 3.3598899841308594, + "learning_rate": 9.612128345581108e-06, + "loss": 1.1941, + "step": 3194 + }, + { + "epoch": 0.8497340425531915, + "grad_norm": 3.5474600791931152, + "learning_rate": 9.6117886315811e-06, + "loss": 1.1679, + "step": 3195 + }, + { + "epoch": 0.85, + "grad_norm": 3.9404945373535156, + "learning_rate": 9.611448774886925e-06, + "loss": 1.3117, + "step": 3196 + }, + { + "epoch": 0.8502659574468086, + "grad_norm": 3.389488935470581, + "learning_rate": 9.611108775509093e-06, + "loss": 1.1708, + "step": 3197 + }, + { + "epoch": 0.850531914893617, + "grad_norm": 3.5706136226654053, + "learning_rate": 9.610768633458127e-06, + "loss": 1.249, + "step": 3198 + }, + { + "epoch": 0.8507978723404256, + "grad_norm": 3.899035930633545, + "learning_rate": 9.610428348744552e-06, + "loss": 1.2828, + "step": 3199 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 3.648972511291504, + "learning_rate": 9.610087921378895e-06, + "loss": 1.2152, + "step": 3200 + }, + { + "epoch": 0.8513297872340425, + "grad_norm": 3.762350559234619, + "learning_rate": 9.60974735137169e-06, + "loss": 1.3663, + "step": 3201 + }, + { + "epoch": 0.8515957446808511, + "grad_norm": 3.8155291080474854, + "learning_rate": 9.609406638733474e-06, + "loss": 1.1777, + "step": 3202 + }, + { + "epoch": 0.8518617021276595, + "grad_norm": 3.5268514156341553, + "learning_rate": 9.609065783474792e-06, + "loss": 1.2634, + "step": 3203 + }, + { + "epoch": 0.8521276595744681, + "grad_norm": 3.3057730197906494, + "learning_rate": 9.608724785606186e-06, + "loss": 1.2208, + "step": 3204 + }, + { + "epoch": 0.8523936170212766, + "grad_norm": 3.9648935794830322, + "learning_rate": 9.60838364513821e-06, + "loss": 1.2936, + "step": 3205 + }, + { + "epoch": 0.8526595744680852, + "grad_norm": 3.8742856979370117, + "learning_rate": 9.608042362081418e-06, + "loss": 1.298, + "step": 3206 + }, + { + "epoch": 0.8529255319148936, + "grad_norm": 3.845383644104004, + "learning_rate": 9.60770093644637e-06, + "loss": 1.2274, + "step": 3207 + }, + { + "epoch": 0.8531914893617021, + "grad_norm": 3.532756805419922, + "learning_rate": 9.60735936824363e-06, + "loss": 1.339, + "step": 3208 + }, + { + "epoch": 0.8534574468085107, + "grad_norm": 3.7821319103240967, + "learning_rate": 9.607017657483768e-06, + "loss": 1.3414, + "step": 3209 + }, + { + "epoch": 0.8537234042553191, + "grad_norm": 3.5962960720062256, + "learning_rate": 9.606675804177355e-06, + "loss": 1.1815, + "step": 3210 + }, + { + "epoch": 0.8539893617021277, + "grad_norm": 3.8669700622558594, + "learning_rate": 
9.606333808334966e-06, + "loss": 1.2821, + "step": 3211 + }, + { + "epoch": 0.8542553191489362, + "grad_norm": 3.288717269897461, + "learning_rate": 9.605991669967189e-06, + "loss": 1.1532, + "step": 3212 + }, + { + "epoch": 0.8545212765957447, + "grad_norm": 3.445049285888672, + "learning_rate": 9.605649389084605e-06, + "loss": 1.2534, + "step": 3213 + }, + { + "epoch": 0.8547872340425532, + "grad_norm": 3.075615644454956, + "learning_rate": 9.605306965697809e-06, + "loss": 1.0243, + "step": 3214 + }, + { + "epoch": 0.8550531914893617, + "grad_norm": 3.6676225662231445, + "learning_rate": 9.604964399817392e-06, + "loss": 1.2927, + "step": 3215 + }, + { + "epoch": 0.8553191489361702, + "grad_norm": 3.4644627571105957, + "learning_rate": 9.604621691453954e-06, + "loss": 1.2167, + "step": 3216 + }, + { + "epoch": 0.8555851063829787, + "grad_norm": 3.3108158111572266, + "learning_rate": 9.6042788406181e-06, + "loss": 1.2437, + "step": 3217 + }, + { + "epoch": 0.8558510638297873, + "grad_norm": 3.634568929672241, + "learning_rate": 9.603935847320437e-06, + "loss": 1.2587, + "step": 3218 + }, + { + "epoch": 0.8561170212765957, + "grad_norm": 3.472355365753174, + "learning_rate": 9.603592711571581e-06, + "loss": 1.1544, + "step": 3219 + }, + { + "epoch": 0.8563829787234043, + "grad_norm": 3.7467241287231445, + "learning_rate": 9.603249433382145e-06, + "loss": 1.1884, + "step": 3220 + }, + { + "epoch": 0.8566489361702128, + "grad_norm": 4.016312599182129, + "learning_rate": 9.60290601276275e-06, + "loss": 1.2884, + "step": 3221 + }, + { + "epoch": 0.8569148936170212, + "grad_norm": 3.432687282562256, + "learning_rate": 9.602562449724027e-06, + "loss": 1.2495, + "step": 3222 + }, + { + "epoch": 0.8571808510638298, + "grad_norm": 3.466148614883423, + "learning_rate": 9.6022187442766e-06, + "loss": 1.0967, + "step": 3223 + }, + { + "epoch": 0.8574468085106383, + "grad_norm": 3.7120723724365234, + "learning_rate": 9.60187489643111e-06, + "loss": 1.1666, + "step": 3224 + }, + { + "epoch": 0.8577127659574468, + "grad_norm": 3.6994261741638184, + "learning_rate": 9.60153090619819e-06, + "loss": 1.3106, + "step": 3225 + }, + { + "epoch": 0.8579787234042553, + "grad_norm": 3.481760025024414, + "learning_rate": 9.601186773588486e-06, + "loss": 1.2581, + "step": 3226 + }, + { + "epoch": 0.8582446808510639, + "grad_norm": 3.5702121257781982, + "learning_rate": 9.600842498612647e-06, + "loss": 1.3228, + "step": 3227 + }, + { + "epoch": 0.8585106382978723, + "grad_norm": 4.04725980758667, + "learning_rate": 9.600498081281324e-06, + "loss": 1.2431, + "step": 3228 + }, + { + "epoch": 0.8587765957446809, + "grad_norm": 3.632622480392456, + "learning_rate": 9.600153521605176e-06, + "loss": 1.1693, + "step": 3229 + }, + { + "epoch": 0.8590425531914894, + "grad_norm": 3.6271767616271973, + "learning_rate": 9.59980881959486e-06, + "loss": 1.2398, + "step": 3230 + }, + { + "epoch": 0.8593085106382978, + "grad_norm": 3.3347911834716797, + "learning_rate": 9.599463975261042e-06, + "loss": 1.1603, + "step": 3231 + }, + { + "epoch": 0.8595744680851064, + "grad_norm": 3.6934587955474854, + "learning_rate": 9.599118988614396e-06, + "loss": 1.305, + "step": 3232 + }, + { + "epoch": 0.8598404255319149, + "grad_norm": 3.461353063583374, + "learning_rate": 9.598773859665593e-06, + "loss": 1.2013, + "step": 3233 + }, + { + "epoch": 0.8601063829787234, + "grad_norm": 3.2839810848236084, + "learning_rate": 9.598428588425312e-06, + "loss": 1.1208, + "step": 3234 + }, + { + "epoch": 0.8603723404255319, + "grad_norm": 
3.599320650100708, + "learning_rate": 9.598083174904235e-06, + "loss": 1.4372, + "step": 3235 + }, + { + "epoch": 0.8606382978723405, + "grad_norm": 3.540738105773926, + "learning_rate": 9.597737619113055e-06, + "loss": 1.0961, + "step": 3236 + }, + { + "epoch": 0.8609042553191489, + "grad_norm": 3.327744722366333, + "learning_rate": 9.597391921062457e-06, + "loss": 1.2087, + "step": 3237 + }, + { + "epoch": 0.8611702127659574, + "grad_norm": 3.619152545928955, + "learning_rate": 9.59704608076314e-06, + "loss": 1.3197, + "step": 3238 + }, + { + "epoch": 0.861436170212766, + "grad_norm": 3.381136178970337, + "learning_rate": 9.596700098225806e-06, + "loss": 1.258, + "step": 3239 + }, + { + "epoch": 0.8617021276595744, + "grad_norm": 3.6447596549987793, + "learning_rate": 9.59635397346116e-06, + "loss": 1.1877, + "step": 3240 + }, + { + "epoch": 0.861968085106383, + "grad_norm": 4.12053918838501, + "learning_rate": 9.596007706479908e-06, + "loss": 1.3712, + "step": 3241 + }, + { + "epoch": 0.8622340425531915, + "grad_norm": 3.1644914150238037, + "learning_rate": 9.595661297292768e-06, + "loss": 1.079, + "step": 3242 + }, + { + "epoch": 0.8625, + "grad_norm": 4.086709022521973, + "learning_rate": 9.595314745910455e-06, + "loss": 1.2766, + "step": 3243 + }, + { + "epoch": 0.8627659574468085, + "grad_norm": 4.086410999298096, + "learning_rate": 9.594968052343697e-06, + "loss": 1.2103, + "step": 3244 + }, + { + "epoch": 0.863031914893617, + "grad_norm": 3.550549030303955, + "learning_rate": 9.594621216603215e-06, + "loss": 1.3625, + "step": 3245 + }, + { + "epoch": 0.8632978723404255, + "grad_norm": 3.555739402770996, + "learning_rate": 9.594274238699744e-06, + "loss": 1.2163, + "step": 3246 + }, + { + "epoch": 0.863563829787234, + "grad_norm": 3.2902424335479736, + "learning_rate": 9.593927118644017e-06, + "loss": 0.9849, + "step": 3247 + }, + { + "epoch": 0.8638297872340426, + "grad_norm": 3.554675579071045, + "learning_rate": 9.593579856446778e-06, + "loss": 1.1437, + "step": 3248 + }, + { + "epoch": 0.864095744680851, + "grad_norm": 3.3788020610809326, + "learning_rate": 9.59323245211877e-06, + "loss": 1.2336, + "step": 3249 + }, + { + "epoch": 0.8643617021276596, + "grad_norm": 3.4318618774414062, + "learning_rate": 9.592884905670742e-06, + "loss": 1.2021, + "step": 3250 + }, + { + "epoch": 0.8646276595744681, + "grad_norm": 3.5366907119750977, + "learning_rate": 9.592537217113446e-06, + "loss": 1.3365, + "step": 3251 + }, + { + "epoch": 0.8648936170212767, + "grad_norm": 3.7782368659973145, + "learning_rate": 9.592189386457645e-06, + "loss": 1.3855, + "step": 3252 + }, + { + "epoch": 0.8651595744680851, + "grad_norm": 3.480111837387085, + "learning_rate": 9.591841413714094e-06, + "loss": 1.2029, + "step": 3253 + }, + { + "epoch": 0.8654255319148936, + "grad_norm": 3.305756092071533, + "learning_rate": 9.591493298893567e-06, + "loss": 1.1172, + "step": 3254 + }, + { + "epoch": 0.8656914893617021, + "grad_norm": 3.342085361480713, + "learning_rate": 9.591145042006829e-06, + "loss": 1.0662, + "step": 3255 + }, + { + "epoch": 0.8659574468085106, + "grad_norm": 3.6532325744628906, + "learning_rate": 9.590796643064658e-06, + "loss": 1.2083, + "step": 3256 + }, + { + "epoch": 0.8662234042553192, + "grad_norm": 3.8469889163970947, + "learning_rate": 9.590448102077835e-06, + "loss": 1.1185, + "step": 3257 + }, + { + "epoch": 0.8664893617021276, + "grad_norm": 3.6516644954681396, + "learning_rate": 9.590099419057142e-06, + "loss": 1.314, + "step": 3258 + }, + { + "epoch": 0.8667553191489362, + 
"grad_norm": 3.6090152263641357, + "learning_rate": 9.58975059401337e-06, + "loss": 1.2411, + "step": 3259 + }, + { + "epoch": 0.8670212765957447, + "grad_norm": 3.436042308807373, + "learning_rate": 9.589401626957309e-06, + "loss": 1.3095, + "step": 3260 + }, + { + "epoch": 0.8672872340425531, + "grad_norm": 3.2654285430908203, + "learning_rate": 9.589052517899759e-06, + "loss": 1.1265, + "step": 3261 + }, + { + "epoch": 0.8675531914893617, + "grad_norm": 3.6885263919830322, + "learning_rate": 9.588703266851523e-06, + "loss": 1.2568, + "step": 3262 + }, + { + "epoch": 0.8678191489361702, + "grad_norm": 3.9233293533325195, + "learning_rate": 9.588353873823404e-06, + "loss": 1.2273, + "step": 3263 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 3.254892349243164, + "learning_rate": 9.588004338826213e-06, + "loss": 1.0894, + "step": 3264 + }, + { + "epoch": 0.8683510638297872, + "grad_norm": 3.3320047855377197, + "learning_rate": 9.58765466187077e-06, + "loss": 1.3296, + "step": 3265 + }, + { + "epoch": 0.8686170212765958, + "grad_norm": 3.730386972427368, + "learning_rate": 9.587304842967887e-06, + "loss": 1.3909, + "step": 3266 + }, + { + "epoch": 0.8688829787234043, + "grad_norm": 3.557739734649658, + "learning_rate": 9.586954882128391e-06, + "loss": 1.2858, + "step": 3267 + }, + { + "epoch": 0.8691489361702127, + "grad_norm": 3.292858362197876, + "learning_rate": 9.58660477936311e-06, + "loss": 1.2351, + "step": 3268 + }, + { + "epoch": 0.8694148936170213, + "grad_norm": 3.87530255317688, + "learning_rate": 9.58625453468288e-06, + "loss": 1.1993, + "step": 3269 + }, + { + "epoch": 0.8696808510638298, + "grad_norm": 3.5502493381500244, + "learning_rate": 9.585904148098532e-06, + "loss": 1.2225, + "step": 3270 + }, + { + "epoch": 0.8699468085106383, + "grad_norm": 3.9256691932678223, + "learning_rate": 9.585553619620913e-06, + "loss": 1.4114, + "step": 3271 + }, + { + "epoch": 0.8702127659574468, + "grad_norm": 3.4120373725891113, + "learning_rate": 9.585202949260866e-06, + "loss": 1.1049, + "step": 3272 + }, + { + "epoch": 0.8704787234042554, + "grad_norm": 3.6664795875549316, + "learning_rate": 9.58485213702924e-06, + "loss": 1.1906, + "step": 3273 + }, + { + "epoch": 0.8707446808510638, + "grad_norm": 3.315964460372925, + "learning_rate": 9.584501182936891e-06, + "loss": 1.1104, + "step": 3274 + }, + { + "epoch": 0.8710106382978723, + "grad_norm": 3.3911890983581543, + "learning_rate": 9.584150086994678e-06, + "loss": 1.1979, + "step": 3275 + }, + { + "epoch": 0.8712765957446809, + "grad_norm": 3.3415443897247314, + "learning_rate": 9.583798849213467e-06, + "loss": 1.2044, + "step": 3276 + }, + { + "epoch": 0.8715425531914893, + "grad_norm": 3.4745638370513916, + "learning_rate": 9.58344746960412e-06, + "loss": 1.2126, + "step": 3277 + }, + { + "epoch": 0.8718085106382979, + "grad_norm": 3.358224868774414, + "learning_rate": 9.58309594817751e-06, + "loss": 1.2591, + "step": 3278 + }, + { + "epoch": 0.8720744680851064, + "grad_norm": 3.607102155685425, + "learning_rate": 9.582744284944519e-06, + "loss": 1.2529, + "step": 3279 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 3.4642441272735596, + "learning_rate": 9.582392479916023e-06, + "loss": 1.1749, + "step": 3280 + }, + { + "epoch": 0.8726063829787234, + "grad_norm": 3.5729122161865234, + "learning_rate": 9.582040533102908e-06, + "loss": 1.3488, + "step": 3281 + }, + { + "epoch": 0.872872340425532, + "grad_norm": 3.499811887741089, + "learning_rate": 9.581688444516064e-06, + "loss": 1.1714, + "step": 3282 + }, + { + 
"epoch": 0.8731382978723404, + "grad_norm": 3.7235212326049805, + "learning_rate": 9.581336214166386e-06, + "loss": 1.2336, + "step": 3283 + }, + { + "epoch": 0.8734042553191489, + "grad_norm": 3.3966002464294434, + "learning_rate": 9.580983842064772e-06, + "loss": 1.2197, + "step": 3284 + }, + { + "epoch": 0.8736702127659575, + "grad_norm": 3.7711052894592285, + "learning_rate": 9.580631328222124e-06, + "loss": 1.3275, + "step": 3285 + }, + { + "epoch": 0.8739361702127659, + "grad_norm": 3.6308035850524902, + "learning_rate": 9.58027867264935e-06, + "loss": 1.1036, + "step": 3286 + }, + { + "epoch": 0.8742021276595745, + "grad_norm": 3.5871105194091797, + "learning_rate": 9.579925875357361e-06, + "loss": 1.2099, + "step": 3287 + }, + { + "epoch": 0.874468085106383, + "grad_norm": 3.3607616424560547, + "learning_rate": 9.579572936357073e-06, + "loss": 1.3576, + "step": 3288 + }, + { + "epoch": 0.8747340425531915, + "grad_norm": 3.5098683834075928, + "learning_rate": 9.579219855659407e-06, + "loss": 1.1218, + "step": 3289 + }, + { + "epoch": 0.875, + "grad_norm": 3.2693376541137695, + "learning_rate": 9.578866633275289e-06, + "loss": 1.2022, + "step": 3290 + }, + { + "epoch": 0.8752659574468085, + "grad_norm": 3.9929087162017822, + "learning_rate": 9.578513269215643e-06, + "loss": 1.2267, + "step": 3291 + }, + { + "epoch": 0.875531914893617, + "grad_norm": 3.7925865650177, + "learning_rate": 9.578159763491408e-06, + "loss": 1.3087, + "step": 3292 + }, + { + "epoch": 0.8757978723404255, + "grad_norm": 3.5196733474731445, + "learning_rate": 9.577806116113519e-06, + "loss": 1.2655, + "step": 3293 + }, + { + "epoch": 0.8760638297872341, + "grad_norm": 3.529148578643799, + "learning_rate": 9.57745232709292e-06, + "loss": 1.1591, + "step": 3294 + }, + { + "epoch": 0.8763297872340425, + "grad_norm": 3.423691987991333, + "learning_rate": 9.577098396440557e-06, + "loss": 1.2312, + "step": 3295 + }, + { + "epoch": 0.8765957446808511, + "grad_norm": 3.6896872520446777, + "learning_rate": 9.57674432416738e-06, + "loss": 1.3319, + "step": 3296 + }, + { + "epoch": 0.8768617021276596, + "grad_norm": 3.2412073612213135, + "learning_rate": 9.576390110284343e-06, + "loss": 1.1944, + "step": 3297 + }, + { + "epoch": 0.877127659574468, + "grad_norm": 3.716688871383667, + "learning_rate": 9.576035754802411e-06, + "loss": 1.1713, + "step": 3298 + }, + { + "epoch": 0.8773936170212766, + "grad_norm": 3.721823215484619, + "learning_rate": 9.575681257732546e-06, + "loss": 1.2639, + "step": 3299 + }, + { + "epoch": 0.8776595744680851, + "grad_norm": 3.4668095111846924, + "learning_rate": 9.575326619085713e-06, + "loss": 1.2198, + "step": 3300 + }, + { + "epoch": 0.8779255319148936, + "grad_norm": 3.647254467010498, + "learning_rate": 9.574971838872889e-06, + "loss": 1.2587, + "step": 3301 + }, + { + "epoch": 0.8781914893617021, + "grad_norm": 3.563108205795288, + "learning_rate": 9.574616917105049e-06, + "loss": 1.2173, + "step": 3302 + }, + { + "epoch": 0.8784574468085107, + "grad_norm": 5.121861457824707, + "learning_rate": 9.574261853793176e-06, + "loss": 1.2889, + "step": 3303 + }, + { + "epoch": 0.8787234042553191, + "grad_norm": 3.9446914196014404, + "learning_rate": 9.573906648948256e-06, + "loss": 1.4498, + "step": 3304 + }, + { + "epoch": 0.8789893617021277, + "grad_norm": 3.368877649307251, + "learning_rate": 9.573551302581279e-06, + "loss": 1.1592, + "step": 3305 + }, + { + "epoch": 0.8792553191489362, + "grad_norm": 3.4360673427581787, + "learning_rate": 9.57319581470324e-06, + "loss": 1.2784, + "step": 
3306 + }, + { + "epoch": 0.8795212765957446, + "grad_norm": 3.9499571323394775, + "learning_rate": 9.572840185325139e-06, + "loss": 1.2127, + "step": 3307 + }, + { + "epoch": 0.8797872340425532, + "grad_norm": 3.3917598724365234, + "learning_rate": 9.572484414457976e-06, + "loss": 1.1193, + "step": 3308 + }, + { + "epoch": 0.8800531914893617, + "grad_norm": 3.3946712017059326, + "learning_rate": 9.572128502112765e-06, + "loss": 1.2026, + "step": 3309 + }, + { + "epoch": 0.8803191489361702, + "grad_norm": 3.7101964950561523, + "learning_rate": 9.571772448300514e-06, + "loss": 1.2095, + "step": 3310 + }, + { + "epoch": 0.8805851063829787, + "grad_norm": 3.727922201156616, + "learning_rate": 9.571416253032241e-06, + "loss": 1.4194, + "step": 3311 + }, + { + "epoch": 0.8808510638297873, + "grad_norm": 3.457578182220459, + "learning_rate": 9.571059916318967e-06, + "loss": 1.26, + "step": 3312 + }, + { + "epoch": 0.8811170212765957, + "grad_norm": 3.6214683055877686, + "learning_rate": 9.570703438171717e-06, + "loss": 1.3319, + "step": 3313 + }, + { + "epoch": 0.8813829787234042, + "grad_norm": 3.4604907035827637, + "learning_rate": 9.570346818601522e-06, + "loss": 1.1988, + "step": 3314 + }, + { + "epoch": 0.8816489361702128, + "grad_norm": 3.6304855346679688, + "learning_rate": 9.569990057619414e-06, + "loss": 1.3127, + "step": 3315 + }, + { + "epoch": 0.8819148936170212, + "grad_norm": 3.6774277687072754, + "learning_rate": 9.569633155236436e-06, + "loss": 1.1874, + "step": 3316 + }, + { + "epoch": 0.8821808510638298, + "grad_norm": 3.3065695762634277, + "learning_rate": 9.569276111463626e-06, + "loss": 1.2098, + "step": 3317 + }, + { + "epoch": 0.8824468085106383, + "grad_norm": 3.712066650390625, + "learning_rate": 9.568918926312033e-06, + "loss": 1.2148, + "step": 3318 + }, + { + "epoch": 0.8827127659574469, + "grad_norm": 3.215933084487915, + "learning_rate": 9.568561599792709e-06, + "loss": 1.2424, + "step": 3319 + }, + { + "epoch": 0.8829787234042553, + "grad_norm": 3.317523717880249, + "learning_rate": 9.568204131916712e-06, + "loss": 1.1701, + "step": 3320 + }, + { + "epoch": 0.8832446808510638, + "grad_norm": 4.0422749519348145, + "learning_rate": 9.5678465226951e-06, + "loss": 1.3527, + "step": 3321 + }, + { + "epoch": 0.8835106382978724, + "grad_norm": 3.700969934463501, + "learning_rate": 9.56748877213894e-06, + "loss": 1.243, + "step": 3322 + }, + { + "epoch": 0.8837765957446808, + "grad_norm": 3.6172409057617188, + "learning_rate": 9.567130880259296e-06, + "loss": 1.3409, + "step": 3323 + }, + { + "epoch": 0.8840425531914894, + "grad_norm": 3.587956190109253, + "learning_rate": 9.56677284706725e-06, + "loss": 1.327, + "step": 3324 + }, + { + "epoch": 0.8843085106382979, + "grad_norm": 3.8839058876037598, + "learning_rate": 9.566414672573873e-06, + "loss": 1.2556, + "step": 3325 + }, + { + "epoch": 0.8845744680851064, + "grad_norm": 3.610464572906494, + "learning_rate": 9.56605635679025e-06, + "loss": 1.2233, + "step": 3326 + }, + { + "epoch": 0.8848404255319149, + "grad_norm": 3.350374221801758, + "learning_rate": 9.565697899727466e-06, + "loss": 1.1454, + "step": 3327 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 3.175729513168335, + "learning_rate": 9.565339301396616e-06, + "loss": 1.1474, + "step": 3328 + }, + { + "epoch": 0.8853723404255319, + "grad_norm": 3.39150333404541, + "learning_rate": 9.564980561808793e-06, + "loss": 1.1578, + "step": 3329 + }, + { + "epoch": 0.8856382978723404, + "grad_norm": 4.003450393676758, + "learning_rate": 9.564621680975095e-06, + 
"loss": 1.3537, + "step": 3330 + }, + { + "epoch": 0.885904255319149, + "grad_norm": 3.366062879562378, + "learning_rate": 9.564262658906628e-06, + "loss": 1.2119, + "step": 3331 + }, + { + "epoch": 0.8861702127659574, + "grad_norm": 4.014388084411621, + "learning_rate": 9.563903495614503e-06, + "loss": 1.3046, + "step": 3332 + }, + { + "epoch": 0.886436170212766, + "grad_norm": 3.3641979694366455, + "learning_rate": 9.563544191109828e-06, + "loss": 1.1204, + "step": 3333 + }, + { + "epoch": 0.8867021276595745, + "grad_norm": 3.584113836288452, + "learning_rate": 9.563184745403725e-06, + "loss": 1.1223, + "step": 3334 + }, + { + "epoch": 0.886968085106383, + "grad_norm": 3.905111312866211, + "learning_rate": 9.562825158507311e-06, + "loss": 1.2031, + "step": 3335 + }, + { + "epoch": 0.8872340425531915, + "grad_norm": 3.787869453430176, + "learning_rate": 9.562465430431716e-06, + "loss": 1.1798, + "step": 3336 + }, + { + "epoch": 0.8875, + "grad_norm": 3.336646795272827, + "learning_rate": 9.562105561188069e-06, + "loss": 1.0405, + "step": 3337 + }, + { + "epoch": 0.8877659574468085, + "grad_norm": 3.7780652046203613, + "learning_rate": 9.561745550787504e-06, + "loss": 1.1147, + "step": 3338 + }, + { + "epoch": 0.888031914893617, + "grad_norm": 3.8940999507904053, + "learning_rate": 9.561385399241164e-06, + "loss": 1.371, + "step": 3339 + }, + { + "epoch": 0.8882978723404256, + "grad_norm": 3.7703256607055664, + "learning_rate": 9.561025106560184e-06, + "loss": 1.2073, + "step": 3340 + }, + { + "epoch": 0.888563829787234, + "grad_norm": 3.8208539485931396, + "learning_rate": 9.560664672755721e-06, + "loss": 1.3914, + "step": 3341 + }, + { + "epoch": 0.8888297872340426, + "grad_norm": 3.8787341117858887, + "learning_rate": 9.560304097838922e-06, + "loss": 1.2999, + "step": 3342 + }, + { + "epoch": 0.8890957446808511, + "grad_norm": 3.4178457260131836, + "learning_rate": 9.559943381820947e-06, + "loss": 1.2978, + "step": 3343 + }, + { + "epoch": 0.8893617021276595, + "grad_norm": 3.7168829441070557, + "learning_rate": 9.559582524712953e-06, + "loss": 1.2428, + "step": 3344 + }, + { + "epoch": 0.8896276595744681, + "grad_norm": 3.8447728157043457, + "learning_rate": 9.55922152652611e-06, + "loss": 1.3121, + "step": 3345 + }, + { + "epoch": 0.8898936170212766, + "grad_norm": 3.5572218894958496, + "learning_rate": 9.558860387271583e-06, + "loss": 1.3853, + "step": 3346 + }, + { + "epoch": 0.8901595744680851, + "grad_norm": 3.461214780807495, + "learning_rate": 9.558499106960548e-06, + "loss": 1.2634, + "step": 3347 + }, + { + "epoch": 0.8904255319148936, + "grad_norm": 3.4366822242736816, + "learning_rate": 9.558137685604184e-06, + "loss": 1.322, + "step": 3348 + }, + { + "epoch": 0.8906914893617022, + "grad_norm": 3.7072808742523193, + "learning_rate": 9.557776123213673e-06, + "loss": 1.2393, + "step": 3349 + }, + { + "epoch": 0.8909574468085106, + "grad_norm": 3.6192643642425537, + "learning_rate": 9.557414419800204e-06, + "loss": 1.2106, + "step": 3350 + }, + { + "epoch": 0.8912234042553191, + "grad_norm": 3.3502161502838135, + "learning_rate": 9.557052575374967e-06, + "loss": 1.1333, + "step": 3351 + }, + { + "epoch": 0.8914893617021277, + "grad_norm": 3.4909167289733887, + "learning_rate": 9.556690589949158e-06, + "loss": 1.2107, + "step": 3352 + }, + { + "epoch": 0.8917553191489361, + "grad_norm": 3.3816614151000977, + "learning_rate": 9.556328463533976e-06, + "loss": 1.217, + "step": 3353 + }, + { + "epoch": 0.8920212765957447, + "grad_norm": 3.6492433547973633, + "learning_rate": 
9.55596619614063e-06, + "loss": 1.1954, + "step": 3354 + }, + { + "epoch": 0.8922872340425532, + "grad_norm": 3.4829185009002686, + "learning_rate": 9.555603787780321e-06, + "loss": 1.1374, + "step": 3355 + }, + { + "epoch": 0.8925531914893617, + "grad_norm": 3.2989566326141357, + "learning_rate": 9.555241238464271e-06, + "loss": 1.2678, + "step": 3356 + }, + { + "epoch": 0.8928191489361702, + "grad_norm": 3.325765609741211, + "learning_rate": 9.554878548203695e-06, + "loss": 1.1352, + "step": 3357 + }, + { + "epoch": 0.8930851063829788, + "grad_norm": 3.680143356323242, + "learning_rate": 9.55451571700981e-06, + "loss": 1.1376, + "step": 3358 + }, + { + "epoch": 0.8933510638297872, + "grad_norm": 3.4539363384246826, + "learning_rate": 9.554152744893848e-06, + "loss": 1.2099, + "step": 3359 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 3.541053295135498, + "learning_rate": 9.553789631867039e-06, + "loss": 1.2115, + "step": 3360 + }, + { + "epoch": 0.8938829787234043, + "grad_norm": 3.2321863174438477, + "learning_rate": 9.553426377940618e-06, + "loss": 1.2008, + "step": 3361 + }, + { + "epoch": 0.8941489361702127, + "grad_norm": 4.26365852355957, + "learning_rate": 9.553062983125822e-06, + "loss": 1.3757, + "step": 3362 + }, + { + "epoch": 0.8944148936170213, + "grad_norm": 3.7996468544006348, + "learning_rate": 9.552699447433899e-06, + "loss": 1.3071, + "step": 3363 + }, + { + "epoch": 0.8946808510638298, + "grad_norm": 3.2904140949249268, + "learning_rate": 9.552335770876094e-06, + "loss": 1.0914, + "step": 3364 + }, + { + "epoch": 0.8949468085106383, + "grad_norm": 3.48201584815979, + "learning_rate": 9.551971953463659e-06, + "loss": 1.1438, + "step": 3365 + }, + { + "epoch": 0.8952127659574468, + "grad_norm": 3.721348285675049, + "learning_rate": 9.551607995207854e-06, + "loss": 1.1116, + "step": 3366 + }, + { + "epoch": 0.8954787234042553, + "grad_norm": 3.6480965614318848, + "learning_rate": 9.551243896119938e-06, + "loss": 1.1571, + "step": 3367 + }, + { + "epoch": 0.8957446808510638, + "grad_norm": 3.7615323066711426, + "learning_rate": 9.550879656211179e-06, + "loss": 1.4653, + "step": 3368 + }, + { + "epoch": 0.8960106382978723, + "grad_norm": 3.1234636306762695, + "learning_rate": 9.550515275492843e-06, + "loss": 1.1518, + "step": 3369 + }, + { + "epoch": 0.8962765957446809, + "grad_norm": 3.5595285892486572, + "learning_rate": 9.550150753976209e-06, + "loss": 1.213, + "step": 3370 + }, + { + "epoch": 0.8965425531914893, + "grad_norm": 3.4824399948120117, + "learning_rate": 9.549786091672553e-06, + "loss": 1.1228, + "step": 3371 + }, + { + "epoch": 0.8968085106382979, + "grad_norm": 3.6110517978668213, + "learning_rate": 9.549421288593157e-06, + "loss": 1.3169, + "step": 3372 + }, + { + "epoch": 0.8970744680851064, + "grad_norm": 4.197827339172363, + "learning_rate": 9.549056344749312e-06, + "loss": 1.4542, + "step": 3373 + }, + { + "epoch": 0.8973404255319148, + "grad_norm": 3.3921542167663574, + "learning_rate": 9.548691260152308e-06, + "loss": 1.236, + "step": 3374 + }, + { + "epoch": 0.8976063829787234, + "grad_norm": 3.5142951011657715, + "learning_rate": 9.54832603481344e-06, + "loss": 1.2546, + "step": 3375 + }, + { + "epoch": 0.8978723404255319, + "grad_norm": 3.390557050704956, + "learning_rate": 9.547960668744009e-06, + "loss": 1.2041, + "step": 3376 + }, + { + "epoch": 0.8981382978723405, + "grad_norm": 3.5497653484344482, + "learning_rate": 9.547595161955321e-06, + "loss": 1.2139, + "step": 3377 + }, + { + "epoch": 0.8984042553191489, + "grad_norm": 
3.379268169403076, + "learning_rate": 9.547229514458684e-06, + "loss": 1.1503, + "step": 3378 + }, + { + "epoch": 0.8986702127659575, + "grad_norm": 3.826500177383423, + "learning_rate": 9.546863726265414e-06, + "loss": 1.2808, + "step": 3379 + }, + { + "epoch": 0.898936170212766, + "grad_norm": 3.121777296066284, + "learning_rate": 9.546497797386824e-06, + "loss": 1.1966, + "step": 3380 + }, + { + "epoch": 0.8992021276595744, + "grad_norm": 3.6707565784454346, + "learning_rate": 9.546131727834242e-06, + "loss": 1.33, + "step": 3381 + }, + { + "epoch": 0.899468085106383, + "grad_norm": 3.555612325668335, + "learning_rate": 9.545765517618992e-06, + "loss": 1.1858, + "step": 3382 + }, + { + "epoch": 0.8997340425531914, + "grad_norm": 3.481360912322998, + "learning_rate": 9.545399166752402e-06, + "loss": 1.4109, + "step": 3383 + }, + { + "epoch": 0.9, + "grad_norm": 3.1930184364318848, + "learning_rate": 9.545032675245814e-06, + "loss": 1.1161, + "step": 3384 + }, + { + "epoch": 0.9002659574468085, + "grad_norm": 3.5262556076049805, + "learning_rate": 9.544666043110562e-06, + "loss": 1.2255, + "step": 3385 + }, + { + "epoch": 0.9005319148936171, + "grad_norm": 3.4826877117156982, + "learning_rate": 9.544299270357992e-06, + "loss": 1.2001, + "step": 3386 + }, + { + "epoch": 0.9007978723404255, + "grad_norm": 3.602201223373413, + "learning_rate": 9.543932356999452e-06, + "loss": 1.2133, + "step": 3387 + }, + { + "epoch": 0.9010638297872341, + "grad_norm": 3.6607158184051514, + "learning_rate": 9.543565303046297e-06, + "loss": 1.1962, + "step": 3388 + }, + { + "epoch": 0.9013297872340426, + "grad_norm": 3.664412260055542, + "learning_rate": 9.543198108509879e-06, + "loss": 1.2857, + "step": 3389 + }, + { + "epoch": 0.901595744680851, + "grad_norm": 3.5442616939544678, + "learning_rate": 9.542830773401564e-06, + "loss": 1.2096, + "step": 3390 + }, + { + "epoch": 0.9018617021276596, + "grad_norm": 4.058464527130127, + "learning_rate": 9.542463297732716e-06, + "loss": 1.4371, + "step": 3391 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 3.6064326763153076, + "learning_rate": 9.542095681514708e-06, + "loss": 1.2809, + "step": 3392 + }, + { + "epoch": 0.9023936170212766, + "grad_norm": 3.585545301437378, + "learning_rate": 9.541727924758907e-06, + "loss": 1.3174, + "step": 3393 + }, + { + "epoch": 0.9026595744680851, + "grad_norm": 3.465228319168091, + "learning_rate": 9.5413600274767e-06, + "loss": 1.2042, + "step": 3394 + }, + { + "epoch": 0.9029255319148937, + "grad_norm": 3.581475019454956, + "learning_rate": 9.540991989679468e-06, + "loss": 1.3837, + "step": 3395 + }, + { + "epoch": 0.9031914893617021, + "grad_norm": 3.4275171756744385, + "learning_rate": 9.540623811378597e-06, + "loss": 1.209, + "step": 3396 + }, + { + "epoch": 0.9034574468085106, + "grad_norm": 3.159125328063965, + "learning_rate": 9.540255492585478e-06, + "loss": 1.2519, + "step": 3397 + }, + { + "epoch": 0.9037234042553192, + "grad_norm": 3.7644615173339844, + "learning_rate": 9.53988703331151e-06, + "loss": 1.2965, + "step": 3398 + }, + { + "epoch": 0.9039893617021276, + "grad_norm": 3.519270896911621, + "learning_rate": 9.53951843356809e-06, + "loss": 1.2125, + "step": 3399 + }, + { + "epoch": 0.9042553191489362, + "grad_norm": 3.7408711910247803, + "learning_rate": 9.539149693366628e-06, + "loss": 1.3432, + "step": 3400 + }, + { + "epoch": 0.9045212765957447, + "grad_norm": 3.343994617462158, + "learning_rate": 9.538780812718527e-06, + "loss": 1.2149, + "step": 3401 + }, + { + "epoch": 0.9047872340425532, + 
"grad_norm": 3.3215134143829346, + "learning_rate": 9.538411791635205e-06, + "loss": 1.2844, + "step": 3402 + }, + { + "epoch": 0.9050531914893617, + "grad_norm": 3.9590845108032227, + "learning_rate": 9.53804263012808e-06, + "loss": 1.289, + "step": 3403 + }, + { + "epoch": 0.9053191489361702, + "grad_norm": 3.299415349960327, + "learning_rate": 9.537673328208572e-06, + "loss": 1.0875, + "step": 3404 + }, + { + "epoch": 0.9055851063829787, + "grad_norm": 3.5640780925750732, + "learning_rate": 9.53730388588811e-06, + "loss": 1.2735, + "step": 3405 + }, + { + "epoch": 0.9058510638297872, + "grad_norm": 3.2300360202789307, + "learning_rate": 9.536934303178123e-06, + "loss": 1.3574, + "step": 3406 + }, + { + "epoch": 0.9061170212765958, + "grad_norm": 3.6983630657196045, + "learning_rate": 9.536564580090046e-06, + "loss": 1.2751, + "step": 3407 + }, + { + "epoch": 0.9063829787234042, + "grad_norm": 3.740288257598877, + "learning_rate": 9.536194716635322e-06, + "loss": 1.25, + "step": 3408 + }, + { + "epoch": 0.9066489361702128, + "grad_norm": 3.6063649654388428, + "learning_rate": 9.535824712825393e-06, + "loss": 1.1656, + "step": 3409 + }, + { + "epoch": 0.9069148936170213, + "grad_norm": 3.738442897796631, + "learning_rate": 9.535454568671705e-06, + "loss": 1.3204, + "step": 3410 + }, + { + "epoch": 0.9071808510638298, + "grad_norm": 3.7406976222991943, + "learning_rate": 9.535084284185714e-06, + "loss": 1.2681, + "step": 3411 + }, + { + "epoch": 0.9074468085106383, + "grad_norm": 3.7773613929748535, + "learning_rate": 9.534713859378875e-06, + "loss": 1.2303, + "step": 3412 + }, + { + "epoch": 0.9077127659574468, + "grad_norm": 3.531691312789917, + "learning_rate": 9.53434329426265e-06, + "loss": 1.1495, + "step": 3413 + }, + { + "epoch": 0.9079787234042553, + "grad_norm": 3.730365514755249, + "learning_rate": 9.533972588848507e-06, + "loss": 1.1998, + "step": 3414 + }, + { + "epoch": 0.9082446808510638, + "grad_norm": 4.04153299331665, + "learning_rate": 9.533601743147911e-06, + "loss": 1.2527, + "step": 3415 + }, + { + "epoch": 0.9085106382978724, + "grad_norm": 3.547910451889038, + "learning_rate": 9.53323075717234e-06, + "loss": 1.3033, + "step": 3416 + }, + { + "epoch": 0.9087765957446808, + "grad_norm": 3.444802761077881, + "learning_rate": 9.532859630933276e-06, + "loss": 1.2513, + "step": 3417 + }, + { + "epoch": 0.9090425531914894, + "grad_norm": 3.7553112506866455, + "learning_rate": 9.532488364442195e-06, + "loss": 1.1689, + "step": 3418 + }, + { + "epoch": 0.9093085106382979, + "grad_norm": 3.748389959335327, + "learning_rate": 9.532116957710587e-06, + "loss": 1.2341, + "step": 3419 + }, + { + "epoch": 0.9095744680851063, + "grad_norm": 3.5497937202453613, + "learning_rate": 9.531745410749946e-06, + "loss": 1.198, + "step": 3420 + }, + { + "epoch": 0.9098404255319149, + "grad_norm": 3.540468692779541, + "learning_rate": 9.531373723571765e-06, + "loss": 1.3774, + "step": 3421 + }, + { + "epoch": 0.9101063829787234, + "grad_norm": 3.332838535308838, + "learning_rate": 9.531001896187548e-06, + "loss": 1.3205, + "step": 3422 + }, + { + "epoch": 0.910372340425532, + "grad_norm": 3.7700576782226562, + "learning_rate": 9.530629928608797e-06, + "loss": 1.0956, + "step": 3423 + }, + { + "epoch": 0.9106382978723404, + "grad_norm": 3.387652635574341, + "learning_rate": 9.530257820847022e-06, + "loss": 1.1835, + "step": 3424 + }, + { + "epoch": 0.910904255319149, + "grad_norm": 3.9318602085113525, + "learning_rate": 9.529885572913735e-06, + "loss": 1.3197, + "step": 3425 + }, + { + "epoch": 
0.9111702127659574, + "grad_norm": 3.158997058868408, + "learning_rate": 9.529513184820458e-06, + "loss": 1.2074, + "step": 3426 + }, + { + "epoch": 0.9114361702127659, + "grad_norm": 3.5039327144622803, + "learning_rate": 9.529140656578707e-06, + "loss": 1.3652, + "step": 3427 + }, + { + "epoch": 0.9117021276595745, + "grad_norm": 3.682145118713379, + "learning_rate": 9.528767988200015e-06, + "loss": 1.1703, + "step": 3428 + }, + { + "epoch": 0.9119680851063829, + "grad_norm": 3.6255364418029785, + "learning_rate": 9.528395179695907e-06, + "loss": 1.269, + "step": 3429 + }, + { + "epoch": 0.9122340425531915, + "grad_norm": 3.666750907897949, + "learning_rate": 9.528022231077921e-06, + "loss": 1.4003, + "step": 3430 + }, + { + "epoch": 0.9125, + "grad_norm": 3.167771816253662, + "learning_rate": 9.527649142357596e-06, + "loss": 1.1409, + "step": 3431 + }, + { + "epoch": 0.9127659574468086, + "grad_norm": 3.6556570529937744, + "learning_rate": 9.527275913546475e-06, + "loss": 1.3847, + "step": 3432 + }, + { + "epoch": 0.913031914893617, + "grad_norm": 3.794574737548828, + "learning_rate": 9.526902544656108e-06, + "loss": 1.3673, + "step": 3433 + }, + { + "epoch": 0.9132978723404256, + "grad_norm": 3.597594976425171, + "learning_rate": 9.526529035698046e-06, + "loss": 1.068, + "step": 3434 + }, + { + "epoch": 0.913563829787234, + "grad_norm": 3.1316208839416504, + "learning_rate": 9.526155386683848e-06, + "loss": 1.1379, + "step": 3435 + }, + { + "epoch": 0.9138297872340425, + "grad_norm": 3.3742425441741943, + "learning_rate": 9.525781597625073e-06, + "loss": 1.2233, + "step": 3436 + }, + { + "epoch": 0.9140957446808511, + "grad_norm": 3.6747100353240967, + "learning_rate": 9.525407668533286e-06, + "loss": 1.3035, + "step": 3437 + }, + { + "epoch": 0.9143617021276595, + "grad_norm": 3.4809205532073975, + "learning_rate": 9.525033599420058e-06, + "loss": 1.1033, + "step": 3438 + }, + { + "epoch": 0.9146276595744681, + "grad_norm": 3.575571298599243, + "learning_rate": 9.524659390296961e-06, + "loss": 1.222, + "step": 3439 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 3.502336263656616, + "learning_rate": 9.524285041175578e-06, + "loss": 1.1575, + "step": 3440 + }, + { + "epoch": 0.9151595744680852, + "grad_norm": 3.6172244548797607, + "learning_rate": 9.523910552067489e-06, + "loss": 1.1852, + "step": 3441 + }, + { + "epoch": 0.9154255319148936, + "grad_norm": 3.6247096061706543, + "learning_rate": 9.523535922984281e-06, + "loss": 1.4405, + "step": 3442 + }, + { + "epoch": 0.9156914893617021, + "grad_norm": 3.5026776790618896, + "learning_rate": 9.523161153937546e-06, + "loss": 1.2206, + "step": 3443 + }, + { + "epoch": 0.9159574468085107, + "grad_norm": 3.7139501571655273, + "learning_rate": 9.522786244938877e-06, + "loss": 1.3555, + "step": 3444 + }, + { + "epoch": 0.9162234042553191, + "grad_norm": 3.3043665885925293, + "learning_rate": 9.522411195999879e-06, + "loss": 1.0747, + "step": 3445 + }, + { + "epoch": 0.9164893617021277, + "grad_norm": 3.3844451904296875, + "learning_rate": 9.522036007132154e-06, + "loss": 1.2419, + "step": 3446 + }, + { + "epoch": 0.9167553191489362, + "grad_norm": 3.499330520629883, + "learning_rate": 9.521660678347311e-06, + "loss": 1.2287, + "step": 3447 + }, + { + "epoch": 0.9170212765957447, + "grad_norm": 3.4153192043304443, + "learning_rate": 9.521285209656964e-06, + "loss": 1.2425, + "step": 3448 + }, + { + "epoch": 0.9172872340425532, + "grad_norm": 3.838230848312378, + "learning_rate": 9.520909601072726e-06, + "loss": 1.2476, + "step": 3449 + 
}, + { + "epoch": 0.9175531914893617, + "grad_norm": 3.879303455352783, + "learning_rate": 9.520533852606226e-06, + "loss": 1.2743, + "step": 3450 + }, + { + "epoch": 0.9178191489361702, + "grad_norm": 3.2687835693359375, + "learning_rate": 9.520157964269083e-06, + "loss": 1.0722, + "step": 3451 + }, + { + "epoch": 0.9180851063829787, + "grad_norm": 3.6070616245269775, + "learning_rate": 9.519781936072933e-06, + "loss": 1.2863, + "step": 3452 + }, + { + "epoch": 0.9183510638297873, + "grad_norm": 3.410642623901367, + "learning_rate": 9.519405768029408e-06, + "loss": 1.2184, + "step": 3453 + }, + { + "epoch": 0.9186170212765957, + "grad_norm": 3.642425775527954, + "learning_rate": 9.519029460150148e-06, + "loss": 1.2836, + "step": 3454 + }, + { + "epoch": 0.9188829787234043, + "grad_norm": 3.6479597091674805, + "learning_rate": 9.518653012446794e-06, + "loss": 1.3349, + "step": 3455 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 3.2941248416900635, + "learning_rate": 9.518276424931e-06, + "loss": 1.1445, + "step": 3456 + }, + { + "epoch": 0.9194148936170212, + "grad_norm": 3.3414933681488037, + "learning_rate": 9.51789969761441e-06, + "loss": 1.3321, + "step": 3457 + }, + { + "epoch": 0.9196808510638298, + "grad_norm": 3.39167857170105, + "learning_rate": 9.517522830508685e-06, + "loss": 1.222, + "step": 3458 + }, + { + "epoch": 0.9199468085106383, + "grad_norm": 3.520202875137329, + "learning_rate": 9.517145823625485e-06, + "loss": 1.2299, + "step": 3459 + }, + { + "epoch": 0.9202127659574468, + "grad_norm": 3.953166961669922, + "learning_rate": 9.516768676976476e-06, + "loss": 1.3692, + "step": 3460 + }, + { + "epoch": 0.9204787234042553, + "grad_norm": 3.654834032058716, + "learning_rate": 9.516391390573326e-06, + "loss": 1.1788, + "step": 3461 + }, + { + "epoch": 0.9207446808510639, + "grad_norm": 4.268529415130615, + "learning_rate": 9.516013964427708e-06, + "loss": 1.3661, + "step": 3462 + }, + { + "epoch": 0.9210106382978723, + "grad_norm": 3.7426726818084717, + "learning_rate": 9.515636398551302e-06, + "loss": 1.3322, + "step": 3463 + }, + { + "epoch": 0.9212765957446809, + "grad_norm": 3.7757678031921387, + "learning_rate": 9.515258692955788e-06, + "loss": 1.2663, + "step": 3464 + }, + { + "epoch": 0.9215425531914894, + "grad_norm": 3.2425293922424316, + "learning_rate": 9.514880847652855e-06, + "loss": 1.1537, + "step": 3465 + }, + { + "epoch": 0.9218085106382978, + "grad_norm": 3.891484498977661, + "learning_rate": 9.514502862654192e-06, + "loss": 1.3394, + "step": 3466 + }, + { + "epoch": 0.9220744680851064, + "grad_norm": 3.499422788619995, + "learning_rate": 9.514124737971495e-06, + "loss": 1.3386, + "step": 3467 + }, + { + "epoch": 0.9223404255319149, + "grad_norm": 3.8201444149017334, + "learning_rate": 9.513746473616466e-06, + "loss": 1.2374, + "step": 3468 + }, + { + "epoch": 0.9226063829787234, + "grad_norm": 3.488330841064453, + "learning_rate": 9.513368069600806e-06, + "loss": 1.1239, + "step": 3469 + }, + { + "epoch": 0.9228723404255319, + "grad_norm": 3.2124156951904297, + "learning_rate": 9.512989525936223e-06, + "loss": 1.2058, + "step": 3470 + }, + { + "epoch": 0.9231382978723405, + "grad_norm": 3.4447717666625977, + "learning_rate": 9.512610842634432e-06, + "loss": 1.1785, + "step": 3471 + }, + { + "epoch": 0.9234042553191489, + "grad_norm": 3.3703794479370117, + "learning_rate": 9.512232019707148e-06, + "loss": 1.3696, + "step": 3472 + }, + { + "epoch": 0.9236702127659574, + "grad_norm": 3.2821013927459717, + "learning_rate": 9.511853057166094e-06, + 
"loss": 1.181, + "step": 3473 + }, + { + "epoch": 0.923936170212766, + "grad_norm": 3.2314436435699463, + "learning_rate": 9.511473955022992e-06, + "loss": 1.2571, + "step": 3474 + }, + { + "epoch": 0.9242021276595744, + "grad_norm": 3.635651111602783, + "learning_rate": 9.511094713289575e-06, + "loss": 1.2779, + "step": 3475 + }, + { + "epoch": 0.924468085106383, + "grad_norm": 3.7356226444244385, + "learning_rate": 9.510715331977579e-06, + "loss": 1.3406, + "step": 3476 + }, + { + "epoch": 0.9247340425531915, + "grad_norm": 3.5567257404327393, + "learning_rate": 9.510335811098737e-06, + "loss": 1.2792, + "step": 3477 + }, + { + "epoch": 0.925, + "grad_norm": 3.603287696838379, + "learning_rate": 9.509956150664796e-06, + "loss": 1.1966, + "step": 3478 + }, + { + "epoch": 0.9252659574468085, + "grad_norm": 3.915576219558716, + "learning_rate": 9.509576350687502e-06, + "loss": 1.2955, + "step": 3479 + }, + { + "epoch": 0.925531914893617, + "grad_norm": 3.7345378398895264, + "learning_rate": 9.509196411178605e-06, + "loss": 1.1994, + "step": 3480 + }, + { + "epoch": 0.9257978723404255, + "grad_norm": 3.4640583992004395, + "learning_rate": 9.508816332149862e-06, + "loss": 1.1937, + "step": 3481 + }, + { + "epoch": 0.926063829787234, + "grad_norm": 3.5885074138641357, + "learning_rate": 9.508436113613036e-06, + "loss": 1.2895, + "step": 3482 + }, + { + "epoch": 0.9263297872340426, + "grad_norm": 3.241925001144409, + "learning_rate": 9.508055755579886e-06, + "loss": 1.1693, + "step": 3483 + }, + { + "epoch": 0.926595744680851, + "grad_norm": 3.664020538330078, + "learning_rate": 9.507675258062183e-06, + "loss": 1.2333, + "step": 3484 + }, + { + "epoch": 0.9268617021276596, + "grad_norm": 3.365907669067383, + "learning_rate": 9.507294621071702e-06, + "loss": 1.1572, + "step": 3485 + }, + { + "epoch": 0.9271276595744681, + "grad_norm": 3.634084939956665, + "learning_rate": 9.506913844620217e-06, + "loss": 1.1676, + "step": 3486 + }, + { + "epoch": 0.9273936170212767, + "grad_norm": 3.2822062969207764, + "learning_rate": 9.506532928719514e-06, + "loss": 1.2271, + "step": 3487 + }, + { + "epoch": 0.9276595744680851, + "grad_norm": 3.920335292816162, + "learning_rate": 9.506151873381376e-06, + "loss": 1.3218, + "step": 3488 + }, + { + "epoch": 0.9279255319148936, + "grad_norm": 3.8373231887817383, + "learning_rate": 9.505770678617592e-06, + "loss": 1.2391, + "step": 3489 + }, + { + "epoch": 0.9281914893617021, + "grad_norm": 3.5426108837127686, + "learning_rate": 9.50538934443996e-06, + "loss": 1.2676, + "step": 3490 + }, + { + "epoch": 0.9284574468085106, + "grad_norm": 3.550251007080078, + "learning_rate": 9.505007870860276e-06, + "loss": 1.2651, + "step": 3491 + }, + { + "epoch": 0.9287234042553192, + "grad_norm": 3.3801169395446777, + "learning_rate": 9.504626257890345e-06, + "loss": 1.1764, + "step": 3492 + }, + { + "epoch": 0.9289893617021276, + "grad_norm": 4.002630233764648, + "learning_rate": 9.504244505541974e-06, + "loss": 1.2602, + "step": 3493 + }, + { + "epoch": 0.9292553191489362, + "grad_norm": 3.6300952434539795, + "learning_rate": 9.503862613826976e-06, + "loss": 1.1864, + "step": 3494 + }, + { + "epoch": 0.9295212765957447, + "grad_norm": 3.574536085128784, + "learning_rate": 9.503480582757163e-06, + "loss": 1.3364, + "step": 3495 + }, + { + "epoch": 0.9297872340425531, + "grad_norm": 3.6244354248046875, + "learning_rate": 9.50309841234436e-06, + "loss": 1.1998, + "step": 3496 + }, + { + "epoch": 0.9300531914893617, + "grad_norm": 3.826706886291504, + "learning_rate": 
9.502716102600393e-06, + "loss": 1.1791, + "step": 3497 + }, + { + "epoch": 0.9303191489361702, + "grad_norm": 3.3346476554870605, + "learning_rate": 9.502333653537085e-06, + "loss": 1.1943, + "step": 3498 + }, + { + "epoch": 0.9305851063829788, + "grad_norm": 3.4599905014038086, + "learning_rate": 9.501951065166276e-06, + "loss": 1.2966, + "step": 3499 + }, + { + "epoch": 0.9308510638297872, + "grad_norm": 3.6470425128936768, + "learning_rate": 9.501568337499798e-06, + "loss": 1.2633, + "step": 3500 + }, + { + "epoch": 0.9308510638297872, + "eval_loss": 1.2690000534057617, + "eval_runtime": 12.8787, + "eval_samples_per_second": 31.059, + "eval_steps_per_second": 3.882, + "step": 3500 + }, + { + "epoch": 0.9311170212765958, + "grad_norm": 3.7849044799804688, + "learning_rate": 9.501185470549496e-06, + "loss": 1.2158, + "step": 3501 + }, + { + "epoch": 0.9313829787234043, + "grad_norm": 3.3262534141540527, + "learning_rate": 9.500802464327217e-06, + "loss": 1.2429, + "step": 3502 + }, + { + "epoch": 0.9316489361702127, + "grad_norm": 3.458172559738159, + "learning_rate": 9.500419318844811e-06, + "loss": 1.2177, + "step": 3503 + }, + { + "epoch": 0.9319148936170213, + "grad_norm": 3.7243428230285645, + "learning_rate": 9.500036034114132e-06, + "loss": 1.2877, + "step": 3504 + }, + { + "epoch": 0.9321808510638298, + "grad_norm": 3.6194655895233154, + "learning_rate": 9.49965261014704e-06, + "loss": 1.3507, + "step": 3505 + }, + { + "epoch": 0.9324468085106383, + "grad_norm": 3.4799468517303467, + "learning_rate": 9.499269046955398e-06, + "loss": 1.2658, + "step": 3506 + }, + { + "epoch": 0.9327127659574468, + "grad_norm": 3.6711440086364746, + "learning_rate": 9.498885344551077e-06, + "loss": 1.1922, + "step": 3507 + }, + { + "epoch": 0.9329787234042554, + "grad_norm": 3.7202506065368652, + "learning_rate": 9.498501502945943e-06, + "loss": 1.1922, + "step": 3508 + }, + { + "epoch": 0.9332446808510638, + "grad_norm": 3.440639019012451, + "learning_rate": 9.498117522151878e-06, + "loss": 1.1795, + "step": 3509 + }, + { + "epoch": 0.9335106382978723, + "grad_norm": 3.513429880142212, + "learning_rate": 9.497733402180761e-06, + "loss": 1.2098, + "step": 3510 + }, + { + "epoch": 0.9337765957446809, + "grad_norm": 3.599651575088501, + "learning_rate": 9.497349143044478e-06, + "loss": 1.2052, + "step": 3511 + }, + { + "epoch": 0.9340425531914893, + "grad_norm": 4.015235900878906, + "learning_rate": 9.496964744754915e-06, + "loss": 1.233, + "step": 3512 + }, + { + "epoch": 0.9343085106382979, + "grad_norm": 3.3815979957580566, + "learning_rate": 9.49658020732397e-06, + "loss": 1.1291, + "step": 3513 + }, + { + "epoch": 0.9345744680851064, + "grad_norm": 3.3032724857330322, + "learning_rate": 9.49619553076354e-06, + "loss": 1.2174, + "step": 3514 + }, + { + "epoch": 0.9348404255319149, + "grad_norm": 3.571817398071289, + "learning_rate": 9.495810715085526e-06, + "loss": 1.3212, + "step": 3515 + }, + { + "epoch": 0.9351063829787234, + "grad_norm": 3.5486996173858643, + "learning_rate": 9.495425760301836e-06, + "loss": 1.1428, + "step": 3516 + }, + { + "epoch": 0.935372340425532, + "grad_norm": 3.3801069259643555, + "learning_rate": 9.495040666424378e-06, + "loss": 1.1673, + "step": 3517 + }, + { + "epoch": 0.9356382978723404, + "grad_norm": 3.6057615280151367, + "learning_rate": 9.494655433465071e-06, + "loss": 1.1342, + "step": 3518 + }, + { + "epoch": 0.9359042553191489, + "grad_norm": 3.6146769523620605, + "learning_rate": 9.494270061435834e-06, + "loss": 1.4436, + "step": 3519 + }, + { + "epoch": 
0.9361702127659575, + "grad_norm": 3.200052499771118, + "learning_rate": 9.493884550348589e-06, + "loss": 1.1598, + "step": 3520 + }, + { + "epoch": 0.9364361702127659, + "grad_norm": 3.6785783767700195, + "learning_rate": 9.493498900215265e-06, + "loss": 1.2838, + "step": 3521 + }, + { + "epoch": 0.9367021276595745, + "grad_norm": 3.905540943145752, + "learning_rate": 9.493113111047794e-06, + "loss": 1.2665, + "step": 3522 + }, + { + "epoch": 0.936968085106383, + "grad_norm": 3.300579786300659, + "learning_rate": 9.492727182858115e-06, + "loss": 1.2111, + "step": 3523 + }, + { + "epoch": 0.9372340425531915, + "grad_norm": 3.8752784729003906, + "learning_rate": 9.492341115658167e-06, + "loss": 1.2444, + "step": 3524 + }, + { + "epoch": 0.9375, + "grad_norm": 3.561800241470337, + "learning_rate": 9.491954909459895e-06, + "loss": 1.2224, + "step": 3525 + }, + { + "epoch": 0.9377659574468085, + "grad_norm": 3.434983730316162, + "learning_rate": 9.491568564275252e-06, + "loss": 1.2249, + "step": 3526 + }, + { + "epoch": 0.938031914893617, + "grad_norm": 3.5711958408355713, + "learning_rate": 9.491182080116185e-06, + "loss": 1.3134, + "step": 3527 + }, + { + "epoch": 0.9382978723404255, + "grad_norm": 3.2614593505859375, + "learning_rate": 9.490795456994658e-06, + "loss": 1.1418, + "step": 3528 + }, + { + "epoch": 0.9385638297872341, + "grad_norm": 3.7001163959503174, + "learning_rate": 9.490408694922635e-06, + "loss": 1.2611, + "step": 3529 + }, + { + "epoch": 0.9388297872340425, + "grad_norm": 3.287165880203247, + "learning_rate": 9.490021793912079e-06, + "loss": 1.1458, + "step": 3530 + }, + { + "epoch": 0.9390957446808511, + "grad_norm": 3.9669268131256104, + "learning_rate": 9.489634753974961e-06, + "loss": 1.1978, + "step": 3531 + }, + { + "epoch": 0.9393617021276596, + "grad_norm": 3.8696441650390625, + "learning_rate": 9.48924757512326e-06, + "loss": 1.3488, + "step": 3532 + }, + { + "epoch": 0.939627659574468, + "grad_norm": 3.8109893798828125, + "learning_rate": 9.48886025736895e-06, + "loss": 1.2341, + "step": 3533 + }, + { + "epoch": 0.9398936170212766, + "grad_norm": 3.3541629314422607, + "learning_rate": 9.488472800724022e-06, + "loss": 1.1629, + "step": 3534 + }, + { + "epoch": 0.9401595744680851, + "grad_norm": 3.4784152507781982, + "learning_rate": 9.48808520520046e-06, + "loss": 1.3021, + "step": 3535 + }, + { + "epoch": 0.9404255319148936, + "grad_norm": 3.4299418926239014, + "learning_rate": 9.487697470810257e-06, + "loss": 1.1674, + "step": 3536 + }, + { + "epoch": 0.9406914893617021, + "grad_norm": 3.467414617538452, + "learning_rate": 9.487309597565413e-06, + "loss": 1.1953, + "step": 3537 + }, + { + "epoch": 0.9409574468085107, + "grad_norm": 3.263312816619873, + "learning_rate": 9.486921585477924e-06, + "loss": 1.1662, + "step": 3538 + }, + { + "epoch": 0.9412234042553191, + "grad_norm": 3.3032853603363037, + "learning_rate": 9.486533434559801e-06, + "loss": 1.2386, + "step": 3539 + }, + { + "epoch": 0.9414893617021277, + "grad_norm": 3.641338348388672, + "learning_rate": 9.48614514482305e-06, + "loss": 1.25, + "step": 3540 + }, + { + "epoch": 0.9417553191489362, + "grad_norm": 3.5189712047576904, + "learning_rate": 9.485756716279686e-06, + "loss": 1.2763, + "step": 3541 + }, + { + "epoch": 0.9420212765957446, + "grad_norm": 3.464155912399292, + "learning_rate": 9.485368148941728e-06, + "loss": 1.278, + "step": 3542 + }, + { + "epoch": 0.9422872340425532, + "grad_norm": 3.5938682556152344, + "learning_rate": 9.484979442821199e-06, + "loss": 1.1817, + "step": 3543 + }, + 
{ + "epoch": 0.9425531914893617, + "grad_norm": 3.399099588394165, + "learning_rate": 9.484590597930125e-06, + "loss": 1.3007, + "step": 3544 + }, + { + "epoch": 0.9428191489361702, + "grad_norm": 3.681652545928955, + "learning_rate": 9.484201614280539e-06, + "loss": 1.1233, + "step": 3545 + }, + { + "epoch": 0.9430851063829787, + "grad_norm": 3.4110119342803955, + "learning_rate": 9.483812491884475e-06, + "loss": 1.3159, + "step": 3546 + }, + { + "epoch": 0.9433510638297873, + "grad_norm": 3.347201347351074, + "learning_rate": 9.483423230753975e-06, + "loss": 1.2668, + "step": 3547 + }, + { + "epoch": 0.9436170212765957, + "grad_norm": 3.551835775375366, + "learning_rate": 9.48303383090108e-06, + "loss": 1.2695, + "step": 3548 + }, + { + "epoch": 0.9438829787234042, + "grad_norm": 7.742011547088623, + "learning_rate": 9.48264429233784e-06, + "loss": 1.3468, + "step": 3549 + }, + { + "epoch": 0.9441489361702128, + "grad_norm": 3.5810296535491943, + "learning_rate": 9.482254615076307e-06, + "loss": 1.2088, + "step": 3550 + }, + { + "epoch": 0.9444148936170212, + "grad_norm": 3.6081788539886475, + "learning_rate": 9.481864799128541e-06, + "loss": 1.199, + "step": 3551 + }, + { + "epoch": 0.9446808510638298, + "grad_norm": 3.4480881690979004, + "learning_rate": 9.481474844506602e-06, + "loss": 1.2016, + "step": 3552 + }, + { + "epoch": 0.9449468085106383, + "grad_norm": 3.4126522541046143, + "learning_rate": 9.481084751222553e-06, + "loss": 1.0633, + "step": 3553 + }, + { + "epoch": 0.9452127659574469, + "grad_norm": 3.731552839279175, + "learning_rate": 9.480694519288467e-06, + "loss": 1.3171, + "step": 3554 + }, + { + "epoch": 0.9454787234042553, + "grad_norm": 3.7800607681274414, + "learning_rate": 9.480304148716418e-06, + "loss": 1.4008, + "step": 3555 + }, + { + "epoch": 0.9457446808510638, + "grad_norm": 3.509230375289917, + "learning_rate": 9.47991363951848e-06, + "loss": 1.2949, + "step": 3556 + }, + { + "epoch": 0.9460106382978724, + "grad_norm": 3.7124991416931152, + "learning_rate": 9.479522991706744e-06, + "loss": 1.1951, + "step": 3557 + }, + { + "epoch": 0.9462765957446808, + "grad_norm": 3.6707465648651123, + "learning_rate": 9.479132205293291e-06, + "loss": 1.1625, + "step": 3558 + }, + { + "epoch": 0.9465425531914894, + "grad_norm": 3.456841468811035, + "learning_rate": 9.478741280290214e-06, + "loss": 1.1969, + "step": 3559 + }, + { + "epoch": 0.9468085106382979, + "grad_norm": 4.189627170562744, + "learning_rate": 9.478350216709609e-06, + "loss": 1.4571, + "step": 3560 + }, + { + "epoch": 0.9470744680851064, + "grad_norm": 3.5188887119293213, + "learning_rate": 9.477959014563575e-06, + "loss": 1.2589, + "step": 3561 + }, + { + "epoch": 0.9473404255319149, + "grad_norm": 3.594780206680298, + "learning_rate": 9.477567673864217e-06, + "loss": 1.2652, + "step": 3562 + }, + { + "epoch": 0.9476063829787233, + "grad_norm": 3.3485286235809326, + "learning_rate": 9.477176194623644e-06, + "loss": 1.2256, + "step": 3563 + }, + { + "epoch": 0.9478723404255319, + "grad_norm": 3.549306631088257, + "learning_rate": 9.476784576853967e-06, + "loss": 1.2868, + "step": 3564 + }, + { + "epoch": 0.9481382978723404, + "grad_norm": 3.50877046585083, + "learning_rate": 9.476392820567306e-06, + "loss": 1.0912, + "step": 3565 + }, + { + "epoch": 0.948404255319149, + "grad_norm": 3.3570492267608643, + "learning_rate": 9.476000925775782e-06, + "loss": 1.2827, + "step": 3566 + }, + { + "epoch": 0.9486702127659574, + "grad_norm": 3.3039703369140625, + "learning_rate": 9.475608892491516e-06, + "loss": 
1.1552, + "step": 3567 + }, + { + "epoch": 0.948936170212766, + "grad_norm": 3.559574604034424, + "learning_rate": 9.475216720726644e-06, + "loss": 1.1988, + "step": 3568 + }, + { + "epoch": 0.9492021276595745, + "grad_norm": 3.8060848712921143, + "learning_rate": 9.474824410493298e-06, + "loss": 1.3264, + "step": 3569 + }, + { + "epoch": 0.949468085106383, + "grad_norm": 3.3232123851776123, + "learning_rate": 9.474431961803615e-06, + "loss": 1.1884, + "step": 3570 + }, + { + "epoch": 0.9497340425531915, + "grad_norm": 3.821077346801758, + "learning_rate": 9.47403937466974e-06, + "loss": 1.3414, + "step": 3571 + }, + { + "epoch": 0.95, + "grad_norm": 3.464698076248169, + "learning_rate": 9.473646649103819e-06, + "loss": 1.1284, + "step": 3572 + }, + { + "epoch": 0.9502659574468085, + "grad_norm": 3.464268922805786, + "learning_rate": 9.473253785118003e-06, + "loss": 1.3262, + "step": 3573 + }, + { + "epoch": 0.950531914893617, + "grad_norm": 3.7841787338256836, + "learning_rate": 9.472860782724448e-06, + "loss": 1.1169, + "step": 3574 + }, + { + "epoch": 0.9507978723404256, + "grad_norm": 3.278888463973999, + "learning_rate": 9.472467641935314e-06, + "loss": 1.1413, + "step": 3575 + }, + { + "epoch": 0.951063829787234, + "grad_norm": 3.321603536605835, + "learning_rate": 9.472074362762767e-06, + "loss": 1.0513, + "step": 3576 + }, + { + "epoch": 0.9513297872340426, + "grad_norm": 3.8839926719665527, + "learning_rate": 9.471680945218973e-06, + "loss": 1.2412, + "step": 3577 + }, + { + "epoch": 0.9515957446808511, + "grad_norm": 3.5885181427001953, + "learning_rate": 9.471287389316107e-06, + "loss": 1.1092, + "step": 3578 + }, + { + "epoch": 0.9518617021276595, + "grad_norm": 3.592010498046875, + "learning_rate": 9.470893695066345e-06, + "loss": 1.275, + "step": 3579 + }, + { + "epoch": 0.9521276595744681, + "grad_norm": 3.785581111907959, + "learning_rate": 9.470499862481867e-06, + "loss": 1.3256, + "step": 3580 + }, + { + "epoch": 0.9523936170212766, + "grad_norm": 3.41489315032959, + "learning_rate": 9.47010589157486e-06, + "loss": 1.2419, + "step": 3581 + }, + { + "epoch": 0.9526595744680851, + "grad_norm": 3.4412648677825928, + "learning_rate": 9.469711782357513e-06, + "loss": 1.3029, + "step": 3582 + }, + { + "epoch": 0.9529255319148936, + "grad_norm": 3.6879758834838867, + "learning_rate": 9.469317534842025e-06, + "loss": 1.217, + "step": 3583 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 3.8642208576202393, + "learning_rate": 9.468923149040587e-06, + "loss": 1.3035, + "step": 3584 + }, + { + "epoch": 0.9534574468085106, + "grad_norm": 3.9491965770721436, + "learning_rate": 9.468528624965406e-06, + "loss": 1.3494, + "step": 3585 + }, + { + "epoch": 0.9537234042553191, + "grad_norm": 3.6963748931884766, + "learning_rate": 9.468133962628688e-06, + "loss": 1.1793, + "step": 3586 + }, + { + "epoch": 0.9539893617021277, + "grad_norm": 3.4110567569732666, + "learning_rate": 9.467739162042643e-06, + "loss": 1.1798, + "step": 3587 + }, + { + "epoch": 0.9542553191489361, + "grad_norm": 3.718494176864624, + "learning_rate": 9.46734422321949e-06, + "loss": 1.3528, + "step": 3588 + }, + { + "epoch": 0.9545212765957447, + "grad_norm": 3.9455974102020264, + "learning_rate": 9.466949146171449e-06, + "loss": 1.341, + "step": 3589 + }, + { + "epoch": 0.9547872340425532, + "grad_norm": 3.668195962905884, + "learning_rate": 9.46655393091074e-06, + "loss": 1.1503, + "step": 3590 + }, + { + "epoch": 0.9550531914893617, + "grad_norm": 3.662208080291748, + "learning_rate": 9.466158577449593e-06, + 
"loss": 1.3243, + "step": 3591 + }, + { + "epoch": 0.9553191489361702, + "grad_norm": 3.463543176651001, + "learning_rate": 9.465763085800244e-06, + "loss": 1.187, + "step": 3592 + }, + { + "epoch": 0.9555851063829788, + "grad_norm": 3.6207196712493896, + "learning_rate": 9.465367455974926e-06, + "loss": 1.2523, + "step": 3593 + }, + { + "epoch": 0.9558510638297872, + "grad_norm": 3.3348443508148193, + "learning_rate": 9.46497168798588e-06, + "loss": 1.2145, + "step": 3594 + }, + { + "epoch": 0.9561170212765957, + "grad_norm": 4.174299240112305, + "learning_rate": 9.464575781845355e-06, + "loss": 1.4818, + "step": 3595 + }, + { + "epoch": 0.9563829787234043, + "grad_norm": 3.3657476902008057, + "learning_rate": 9.464179737565598e-06, + "loss": 1.2587, + "step": 3596 + }, + { + "epoch": 0.9566489361702127, + "grad_norm": 3.697920560836792, + "learning_rate": 9.463783555158866e-06, + "loss": 1.36, + "step": 3597 + }, + { + "epoch": 0.9569148936170213, + "grad_norm": 3.825244903564453, + "learning_rate": 9.463387234637413e-06, + "loss": 1.2879, + "step": 3598 + }, + { + "epoch": 0.9571808510638298, + "grad_norm": 3.5759551525115967, + "learning_rate": 9.462990776013504e-06, + "loss": 1.4189, + "step": 3599 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 3.6317455768585205, + "learning_rate": 9.462594179299408e-06, + "loss": 1.3723, + "step": 3600 + }, + { + "epoch": 0.9577127659574468, + "grad_norm": 3.254585027694702, + "learning_rate": 9.46219744450739e-06, + "loss": 1.1231, + "step": 3601 + }, + { + "epoch": 0.9579787234042553, + "grad_norm": 3.0535624027252197, + "learning_rate": 9.461800571649734e-06, + "loss": 1.0536, + "step": 3602 + }, + { + "epoch": 0.9582446808510638, + "grad_norm": 3.603959798812866, + "learning_rate": 9.461403560738713e-06, + "loss": 1.254, + "step": 3603 + }, + { + "epoch": 0.9585106382978723, + "grad_norm": 3.4408342838287354, + "learning_rate": 9.461006411786613e-06, + "loss": 1.2253, + "step": 3604 + }, + { + "epoch": 0.9587765957446809, + "grad_norm": 3.6801369190216064, + "learning_rate": 9.460609124805724e-06, + "loss": 1.2253, + "step": 3605 + }, + { + "epoch": 0.9590425531914893, + "grad_norm": 3.968122959136963, + "learning_rate": 9.460211699808334e-06, + "loss": 1.2456, + "step": 3606 + }, + { + "epoch": 0.9593085106382979, + "grad_norm": 3.602989912033081, + "learning_rate": 9.459814136806746e-06, + "loss": 1.2261, + "step": 3607 + }, + { + "epoch": 0.9595744680851064, + "grad_norm": 3.5720174312591553, + "learning_rate": 9.459416435813258e-06, + "loss": 1.1869, + "step": 3608 + }, + { + "epoch": 0.9598404255319148, + "grad_norm": 3.626312732696533, + "learning_rate": 9.459018596840173e-06, + "loss": 1.3385, + "step": 3609 + }, + { + "epoch": 0.9601063829787234, + "grad_norm": 3.5388100147247314, + "learning_rate": 9.458620619899803e-06, + "loss": 1.2523, + "step": 3610 + }, + { + "epoch": 0.9603723404255319, + "grad_norm": 3.8266894817352295, + "learning_rate": 9.458222505004462e-06, + "loss": 1.4002, + "step": 3611 + }, + { + "epoch": 0.9606382978723405, + "grad_norm": 3.576223373413086, + "learning_rate": 9.457824252166467e-06, + "loss": 1.2669, + "step": 3612 + }, + { + "epoch": 0.9609042553191489, + "grad_norm": 3.5163745880126953, + "learning_rate": 9.457425861398144e-06, + "loss": 1.1806, + "step": 3613 + }, + { + "epoch": 0.9611702127659575, + "grad_norm": 3.586691379547119, + "learning_rate": 9.457027332711814e-06, + "loss": 1.3403, + "step": 3614 + }, + { + "epoch": 0.961436170212766, + "grad_norm": 3.5483405590057373, + 
"learning_rate": 9.456628666119812e-06, + "loss": 1.2426, + "step": 3615 + }, + { + "epoch": 0.9617021276595744, + "grad_norm": 3.600684881210327, + "learning_rate": 9.456229861634471e-06, + "loss": 1.2333, + "step": 3616 + }, + { + "epoch": 0.961968085106383, + "grad_norm": 3.446035385131836, + "learning_rate": 9.455830919268134e-06, + "loss": 1.161, + "step": 3617 + }, + { + "epoch": 0.9622340425531914, + "grad_norm": 3.329267978668213, + "learning_rate": 9.45543183903314e-06, + "loss": 1.1162, + "step": 3618 + }, + { + "epoch": 0.9625, + "grad_norm": 3.4342401027679443, + "learning_rate": 9.45503262094184e-06, + "loss": 1.3068, + "step": 3619 + }, + { + "epoch": 0.9627659574468085, + "grad_norm": 3.230329751968384, + "learning_rate": 9.454633265006585e-06, + "loss": 1.1398, + "step": 3620 + }, + { + "epoch": 0.9630319148936171, + "grad_norm": 3.3767967224121094, + "learning_rate": 9.454233771239733e-06, + "loss": 1.3104, + "step": 3621 + }, + { + "epoch": 0.9632978723404255, + "grad_norm": 3.2001163959503174, + "learning_rate": 9.453834139653643e-06, + "loss": 1.1632, + "step": 3622 + }, + { + "epoch": 0.9635638297872341, + "grad_norm": 3.9331612586975098, + "learning_rate": 9.453434370260683e-06, + "loss": 1.3891, + "step": 3623 + }, + { + "epoch": 0.9638297872340426, + "grad_norm": 4.0084052085876465, + "learning_rate": 9.453034463073218e-06, + "loss": 1.4323, + "step": 3624 + }, + { + "epoch": 0.964095744680851, + "grad_norm": 3.2673776149749756, + "learning_rate": 9.452634418103626e-06, + "loss": 1.0984, + "step": 3625 + }, + { + "epoch": 0.9643617021276596, + "grad_norm": 3.2544898986816406, + "learning_rate": 9.45223423536428e-06, + "loss": 1.2681, + "step": 3626 + }, + { + "epoch": 0.964627659574468, + "grad_norm": 3.625535488128662, + "learning_rate": 9.451833914867567e-06, + "loss": 1.258, + "step": 3627 + }, + { + "epoch": 0.9648936170212766, + "grad_norm": 3.048551082611084, + "learning_rate": 9.451433456625871e-06, + "loss": 1.207, + "step": 3628 + }, + { + "epoch": 0.9651595744680851, + "grad_norm": 3.567139148712158, + "learning_rate": 9.451032860651583e-06, + "loss": 1.2771, + "step": 3629 + }, + { + "epoch": 0.9654255319148937, + "grad_norm": 3.618807077407837, + "learning_rate": 9.450632126957098e-06, + "loss": 1.2666, + "step": 3630 + }, + { + "epoch": 0.9656914893617021, + "grad_norm": 3.4883675575256348, + "learning_rate": 9.450231255554814e-06, + "loss": 1.1142, + "step": 3631 + }, + { + "epoch": 0.9659574468085106, + "grad_norm": 3.687424898147583, + "learning_rate": 9.449830246457136e-06, + "loss": 1.1745, + "step": 3632 + }, + { + "epoch": 0.9662234042553192, + "grad_norm": 3.457051992416382, + "learning_rate": 9.44942909967647e-06, + "loss": 1.1846, + "step": 3633 + }, + { + "epoch": 0.9664893617021276, + "grad_norm": 3.5090994834899902, + "learning_rate": 9.449027815225231e-06, + "loss": 1.3255, + "step": 3634 + }, + { + "epoch": 0.9667553191489362, + "grad_norm": 3.2658236026763916, + "learning_rate": 9.448626393115833e-06, + "loss": 1.0964, + "step": 3635 + }, + { + "epoch": 0.9670212765957447, + "grad_norm": 3.7192766666412354, + "learning_rate": 9.448224833360695e-06, + "loss": 1.3171, + "step": 3636 + }, + { + "epoch": 0.9672872340425532, + "grad_norm": 3.891343355178833, + "learning_rate": 9.447823135972247e-06, + "loss": 1.206, + "step": 3637 + }, + { + "epoch": 0.9675531914893617, + "grad_norm": 3.7228803634643555, + "learning_rate": 9.447421300962911e-06, + "loss": 1.2032, + "step": 3638 + }, + { + "epoch": 0.9678191489361702, + "grad_norm": 
3.348090171813965, + "learning_rate": 9.447019328345125e-06, + "loss": 1.2437, + "step": 3639 + }, + { + "epoch": 0.9680851063829787, + "grad_norm": 3.3824315071105957, + "learning_rate": 9.446617218131326e-06, + "loss": 1.1005, + "step": 3640 + }, + { + "epoch": 0.9683510638297872, + "grad_norm": 4.107891082763672, + "learning_rate": 9.446214970333954e-06, + "loss": 1.3365, + "step": 3641 + }, + { + "epoch": 0.9686170212765958, + "grad_norm": 3.609551191329956, + "learning_rate": 9.445812584965458e-06, + "loss": 1.2756, + "step": 3642 + }, + { + "epoch": 0.9688829787234042, + "grad_norm": 3.625800371170044, + "learning_rate": 9.445410062038284e-06, + "loss": 1.2114, + "step": 3643 + }, + { + "epoch": 0.9691489361702128, + "grad_norm": 3.605753183364868, + "learning_rate": 9.445007401564889e-06, + "loss": 1.3025, + "step": 3644 + }, + { + "epoch": 0.9694148936170213, + "grad_norm": 3.2446835041046143, + "learning_rate": 9.444604603557733e-06, + "loss": 1.2037, + "step": 3645 + }, + { + "epoch": 0.9696808510638298, + "grad_norm": 3.478797674179077, + "learning_rate": 9.444201668029278e-06, + "loss": 1.2862, + "step": 3646 + }, + { + "epoch": 0.9699468085106383, + "grad_norm": 3.33634352684021, + "learning_rate": 9.443798594991989e-06, + "loss": 1.1298, + "step": 3647 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 3.82041597366333, + "learning_rate": 9.44339538445834e-06, + "loss": 1.2301, + "step": 3648 + }, + { + "epoch": 0.9704787234042553, + "grad_norm": 3.5176687240600586, + "learning_rate": 9.442992036440808e-06, + "loss": 1.1489, + "step": 3649 + }, + { + "epoch": 0.9707446808510638, + "grad_norm": 3.265772819519043, + "learning_rate": 9.44258855095187e-06, + "loss": 1.1147, + "step": 3650 + }, + { + "epoch": 0.9710106382978724, + "grad_norm": 3.5735883712768555, + "learning_rate": 9.442184928004012e-06, + "loss": 1.2768, + "step": 3651 + }, + { + "epoch": 0.9712765957446808, + "grad_norm": 3.6002047061920166, + "learning_rate": 9.441781167609722e-06, + "loss": 1.3395, + "step": 3652 + }, + { + "epoch": 0.9715425531914894, + "grad_norm": 3.8888189792633057, + "learning_rate": 9.441377269781496e-06, + "loss": 1.2223, + "step": 3653 + }, + { + "epoch": 0.9718085106382979, + "grad_norm": 3.6971378326416016, + "learning_rate": 9.440973234531825e-06, + "loss": 1.1813, + "step": 3654 + }, + { + "epoch": 0.9720744680851063, + "grad_norm": 3.6079912185668945, + "learning_rate": 9.440569061873213e-06, + "loss": 1.1156, + "step": 3655 + }, + { + "epoch": 0.9723404255319149, + "grad_norm": 3.839540481567383, + "learning_rate": 9.440164751818168e-06, + "loss": 1.4711, + "step": 3656 + }, + { + "epoch": 0.9726063829787234, + "grad_norm": 3.7191896438598633, + "learning_rate": 9.439760304379197e-06, + "loss": 1.2351, + "step": 3657 + }, + { + "epoch": 0.972872340425532, + "grad_norm": 3.902529001235962, + "learning_rate": 9.439355719568817e-06, + "loss": 1.3487, + "step": 3658 + }, + { + "epoch": 0.9731382978723404, + "grad_norm": 3.389925241470337, + "learning_rate": 9.438950997399543e-06, + "loss": 1.1905, + "step": 3659 + }, + { + "epoch": 0.973404255319149, + "grad_norm": 3.6134610176086426, + "learning_rate": 9.438546137883898e-06, + "loss": 1.2323, + "step": 3660 + }, + { + "epoch": 0.9736702127659574, + "grad_norm": 4.062784671783447, + "learning_rate": 9.438141141034409e-06, + "loss": 1.2437, + "step": 3661 + }, + { + "epoch": 0.9739361702127659, + "grad_norm": 3.6207644939422607, + "learning_rate": 9.437736006863611e-06, + "loss": 1.2922, + "step": 3662 + }, + { + "epoch": 
0.9742021276595745, + "grad_norm": 3.2939248085021973, + "learning_rate": 9.437330735384034e-06, + "loss": 1.2348, + "step": 3663 + }, + { + "epoch": 0.9744680851063829, + "grad_norm": 3.6209723949432373, + "learning_rate": 9.43692532660822e-06, + "loss": 1.2698, + "step": 3664 + }, + { + "epoch": 0.9747340425531915, + "grad_norm": 3.766961097717285, + "learning_rate": 9.436519780548712e-06, + "loss": 1.3306, + "step": 3665 + }, + { + "epoch": 0.975, + "grad_norm": 3.1702146530151367, + "learning_rate": 9.43611409721806e-06, + "loss": 1.2877, + "step": 3666 + }, + { + "epoch": 0.9752659574468086, + "grad_norm": 3.411604642868042, + "learning_rate": 9.435708276628814e-06, + "loss": 1.1874, + "step": 3667 + }, + { + "epoch": 0.975531914893617, + "grad_norm": 3.3507773876190186, + "learning_rate": 9.435302318793533e-06, + "loss": 1.1614, + "step": 3668 + }, + { + "epoch": 0.9757978723404256, + "grad_norm": 3.42853045463562, + "learning_rate": 9.434896223724774e-06, + "loss": 1.128, + "step": 3669 + }, + { + "epoch": 0.976063829787234, + "grad_norm": 3.5911173820495605, + "learning_rate": 9.434489991435106e-06, + "loss": 1.2216, + "step": 3670 + }, + { + "epoch": 0.9763297872340425, + "grad_norm": 3.4679529666900635, + "learning_rate": 9.434083621937096e-06, + "loss": 1.1932, + "step": 3671 + }, + { + "epoch": 0.9765957446808511, + "grad_norm": 3.4107143878936768, + "learning_rate": 9.433677115243318e-06, + "loss": 1.1279, + "step": 3672 + }, + { + "epoch": 0.9768617021276595, + "grad_norm": 3.5593109130859375, + "learning_rate": 9.433270471366352e-06, + "loss": 1.1996, + "step": 3673 + }, + { + "epoch": 0.9771276595744681, + "grad_norm": 3.193164110183716, + "learning_rate": 9.432863690318777e-06, + "loss": 1.103, + "step": 3674 + }, + { + "epoch": 0.9773936170212766, + "grad_norm": 3.5351223945617676, + "learning_rate": 9.432456772113179e-06, + "loss": 1.2212, + "step": 3675 + }, + { + "epoch": 0.9776595744680852, + "grad_norm": 3.4629955291748047, + "learning_rate": 9.432049716762151e-06, + "loss": 1.2055, + "step": 3676 + }, + { + "epoch": 0.9779255319148936, + "grad_norm": 3.661907196044922, + "learning_rate": 9.431642524278286e-06, + "loss": 1.3389, + "step": 3677 + }, + { + "epoch": 0.9781914893617021, + "grad_norm": 3.140364408493042, + "learning_rate": 9.431235194674185e-06, + "loss": 1.2099, + "step": 3678 + }, + { + "epoch": 0.9784574468085107, + "grad_norm": 3.7145817279815674, + "learning_rate": 9.43082772796245e-06, + "loss": 1.49, + "step": 3679 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 3.3982760906219482, + "learning_rate": 9.430420124155687e-06, + "loss": 1.2001, + "step": 3680 + }, + { + "epoch": 0.9789893617021277, + "grad_norm": 3.7518324851989746, + "learning_rate": 9.43001238326651e-06, + "loss": 1.4143, + "step": 3681 + }, + { + "epoch": 0.9792553191489362, + "grad_norm": 3.708822250366211, + "learning_rate": 9.429604505307535e-06, + "loss": 1.2038, + "step": 3682 + }, + { + "epoch": 0.9795212765957447, + "grad_norm": 3.5261037349700928, + "learning_rate": 9.42919649029138e-06, + "loss": 1.2233, + "step": 3683 + }, + { + "epoch": 0.9797872340425532, + "grad_norm": 3.842564582824707, + "learning_rate": 9.428788338230672e-06, + "loss": 1.3385, + "step": 3684 + }, + { + "epoch": 0.9800531914893617, + "grad_norm": 3.688267230987549, + "learning_rate": 9.428380049138038e-06, + "loss": 1.2034, + "step": 3685 + }, + { + "epoch": 0.9803191489361702, + "grad_norm": 3.877396583557129, + "learning_rate": 9.42797162302611e-06, + "loss": 1.2775, + "step": 3686 + }, + { + 
"epoch": 0.9805851063829787, + "grad_norm": 3.4748518466949463, + "learning_rate": 9.427563059907528e-06, + "loss": 1.4141, + "step": 3687 + }, + { + "epoch": 0.9808510638297873, + "grad_norm": 3.0281589031219482, + "learning_rate": 9.427154359794931e-06, + "loss": 1.2591, + "step": 3688 + }, + { + "epoch": 0.9811170212765957, + "grad_norm": 3.5246212482452393, + "learning_rate": 9.42674552270097e-06, + "loss": 1.1775, + "step": 3689 + }, + { + "epoch": 0.9813829787234043, + "grad_norm": 3.599862813949585, + "learning_rate": 9.426336548638287e-06, + "loss": 1.187, + "step": 3690 + }, + { + "epoch": 0.9816489361702128, + "grad_norm": 3.8031740188598633, + "learning_rate": 9.42592743761954e-06, + "loss": 1.3704, + "step": 3691 + }, + { + "epoch": 0.9819148936170212, + "grad_norm": 3.708652973175049, + "learning_rate": 9.425518189657388e-06, + "loss": 1.2567, + "step": 3692 + }, + { + "epoch": 0.9821808510638298, + "grad_norm": 3.341240882873535, + "learning_rate": 9.425108804764493e-06, + "loss": 1.4062, + "step": 3693 + }, + { + "epoch": 0.9824468085106383, + "grad_norm": 3.5106687545776367, + "learning_rate": 9.42469928295352e-06, + "loss": 1.1759, + "step": 3694 + }, + { + "epoch": 0.9827127659574468, + "grad_norm": 3.153082847595215, + "learning_rate": 9.424289624237143e-06, + "loss": 1.1955, + "step": 3695 + }, + { + "epoch": 0.9829787234042553, + "grad_norm": 3.4173176288604736, + "learning_rate": 9.423879828628038e-06, + "loss": 1.3188, + "step": 3696 + }, + { + "epoch": 0.9832446808510639, + "grad_norm": 3.5854523181915283, + "learning_rate": 9.42346989613888e-06, + "loss": 1.2425, + "step": 3697 + }, + { + "epoch": 0.9835106382978723, + "grad_norm": 3.536123752593994, + "learning_rate": 9.423059826782355e-06, + "loss": 1.2088, + "step": 3698 + }, + { + "epoch": 0.9837765957446809, + "grad_norm": 3.5280613899230957, + "learning_rate": 9.422649620571155e-06, + "loss": 1.4956, + "step": 3699 + }, + { + "epoch": 0.9840425531914894, + "grad_norm": 3.896684169769287, + "learning_rate": 9.422239277517964e-06, + "loss": 1.3236, + "step": 3700 + }, + { + "epoch": 0.9843085106382978, + "grad_norm": 3.417961597442627, + "learning_rate": 9.421828797635487e-06, + "loss": 1.2044, + "step": 3701 + }, + { + "epoch": 0.9845744680851064, + "grad_norm": 3.4376044273376465, + "learning_rate": 9.421418180936419e-06, + "loss": 1.2657, + "step": 3702 + }, + { + "epoch": 0.9848404255319149, + "grad_norm": 3.8742475509643555, + "learning_rate": 9.421007427433467e-06, + "loss": 1.2526, + "step": 3703 + }, + { + "epoch": 0.9851063829787234, + "grad_norm": 4.002706527709961, + "learning_rate": 9.42059653713934e-06, + "loss": 1.446, + "step": 3704 + }, + { + "epoch": 0.9853723404255319, + "grad_norm": 3.462308883666992, + "learning_rate": 9.420185510066753e-06, + "loss": 1.2338, + "step": 3705 + }, + { + "epoch": 0.9856382978723405, + "grad_norm": 3.684730291366577, + "learning_rate": 9.41977434622842e-06, + "loss": 1.2417, + "step": 3706 + }, + { + "epoch": 0.9859042553191489, + "grad_norm": 3.5235018730163574, + "learning_rate": 9.419363045637067e-06, + "loss": 1.3775, + "step": 3707 + }, + { + "epoch": 0.9861702127659574, + "grad_norm": 3.2986860275268555, + "learning_rate": 9.418951608305417e-06, + "loss": 1.1967, + "step": 3708 + }, + { + "epoch": 0.986436170212766, + "grad_norm": 3.2341742515563965, + "learning_rate": 9.418540034246202e-06, + "loss": 1.1223, + "step": 3709 + }, + { + "epoch": 0.9867021276595744, + "grad_norm": 3.5601837635040283, + "learning_rate": 9.418128323472157e-06, + "loss": 1.2934, 
+ "step": 3710 + }, + { + "epoch": 0.986968085106383, + "grad_norm": 4.002072811126709, + "learning_rate": 9.41771647599602e-06, + "loss": 1.2226, + "step": 3711 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 3.6095480918884277, + "learning_rate": 9.417304491830533e-06, + "loss": 1.2332, + "step": 3712 + }, + { + "epoch": 0.9875, + "grad_norm": 3.7682595252990723, + "learning_rate": 9.416892370988445e-06, + "loss": 1.1929, + "step": 3713 + }, + { + "epoch": 0.9877659574468085, + "grad_norm": 3.4983551502227783, + "learning_rate": 9.416480113482505e-06, + "loss": 1.2426, + "step": 3714 + }, + { + "epoch": 0.988031914893617, + "grad_norm": 3.490725040435791, + "learning_rate": 9.416067719325472e-06, + "loss": 1.2009, + "step": 3715 + }, + { + "epoch": 0.9882978723404255, + "grad_norm": 3.564605474472046, + "learning_rate": 9.415655188530104e-06, + "loss": 1.2105, + "step": 3716 + }, + { + "epoch": 0.988563829787234, + "grad_norm": 3.5361475944519043, + "learning_rate": 9.415242521109166e-06, + "loss": 1.3389, + "step": 3717 + }, + { + "epoch": 0.9888297872340426, + "grad_norm": 3.3671114444732666, + "learning_rate": 9.414829717075426e-06, + "loss": 1.3157, + "step": 3718 + }, + { + "epoch": 0.989095744680851, + "grad_norm": 3.7442715167999268, + "learning_rate": 9.414416776441656e-06, + "loss": 1.1551, + "step": 3719 + }, + { + "epoch": 0.9893617021276596, + "grad_norm": 3.4414875507354736, + "learning_rate": 9.414003699220636e-06, + "loss": 1.2135, + "step": 3720 + }, + { + "epoch": 0.9896276595744681, + "grad_norm": 4.052205562591553, + "learning_rate": 9.413590485425143e-06, + "loss": 1.3299, + "step": 3721 + }, + { + "epoch": 0.9898936170212767, + "grad_norm": 3.0953876972198486, + "learning_rate": 9.413177135067964e-06, + "loss": 1.1183, + "step": 3722 + }, + { + "epoch": 0.9901595744680851, + "grad_norm": 3.767108678817749, + "learning_rate": 9.41276364816189e-06, + "loss": 1.325, + "step": 3723 + }, + { + "epoch": 0.9904255319148936, + "grad_norm": 3.3017489910125732, + "learning_rate": 9.412350024719713e-06, + "loss": 1.2328, + "step": 3724 + }, + { + "epoch": 0.9906914893617021, + "grad_norm": 3.5287554264068604, + "learning_rate": 9.41193626475423e-06, + "loss": 1.2442, + "step": 3725 + }, + { + "epoch": 0.9909574468085106, + "grad_norm": 3.6898324489593506, + "learning_rate": 9.411522368278243e-06, + "loss": 1.2682, + "step": 3726 + }, + { + "epoch": 0.9912234042553192, + "grad_norm": 3.9228873252868652, + "learning_rate": 9.411108335304562e-06, + "loss": 1.3415, + "step": 3727 + }, + { + "epoch": 0.9914893617021276, + "grad_norm": 3.9011435508728027, + "learning_rate": 9.410694165845996e-06, + "loss": 1.2388, + "step": 3728 + }, + { + "epoch": 0.9917553191489362, + "grad_norm": 3.714230537414551, + "learning_rate": 9.41027985991536e-06, + "loss": 1.2085, + "step": 3729 + }, + { + "epoch": 0.9920212765957447, + "grad_norm": 3.627887010574341, + "learning_rate": 9.409865417525473e-06, + "loss": 1.2682, + "step": 3730 + }, + { + "epoch": 0.9922872340425531, + "grad_norm": 3.4126439094543457, + "learning_rate": 9.409450838689156e-06, + "loss": 1.2089, + "step": 3731 + }, + { + "epoch": 0.9925531914893617, + "grad_norm": 3.5555756092071533, + "learning_rate": 9.409036123419239e-06, + "loss": 1.2066, + "step": 3732 + }, + { + "epoch": 0.9928191489361702, + "grad_norm": 3.5292632579803467, + "learning_rate": 9.408621271728555e-06, + "loss": 1.1913, + "step": 3733 + }, + { + "epoch": 0.9930851063829788, + "grad_norm": 3.5443150997161865, + "learning_rate": 9.408206283629937e-06, 
+ "loss": 1.2293, + "step": 3734 + }, + { + "epoch": 0.9933510638297872, + "grad_norm": 3.8415119647979736, + "learning_rate": 9.407791159136226e-06, + "loss": 1.496, + "step": 3735 + }, + { + "epoch": 0.9936170212765958, + "grad_norm": 3.647085189819336, + "learning_rate": 9.407375898260267e-06, + "loss": 1.1983, + "step": 3736 + }, + { + "epoch": 0.9938829787234043, + "grad_norm": 3.2950799465179443, + "learning_rate": 9.40696050101491e-06, + "loss": 1.1298, + "step": 3737 + }, + { + "epoch": 0.9941489361702127, + "grad_norm": 3.837249517440796, + "learning_rate": 9.406544967413008e-06, + "loss": 1.2763, + "step": 3738 + }, + { + "epoch": 0.9944148936170213, + "grad_norm": 3.437069892883301, + "learning_rate": 9.406129297467414e-06, + "loss": 1.1689, + "step": 3739 + }, + { + "epoch": 0.9946808510638298, + "grad_norm": 3.7600064277648926, + "learning_rate": 9.405713491190992e-06, + "loss": 1.4092, + "step": 3740 + }, + { + "epoch": 0.9949468085106383, + "grad_norm": 3.547830104827881, + "learning_rate": 9.405297548596607e-06, + "loss": 1.3794, + "step": 3741 + }, + { + "epoch": 0.9952127659574468, + "grad_norm": 3.673377752304077, + "learning_rate": 9.404881469697132e-06, + "loss": 1.1934, + "step": 3742 + }, + { + "epoch": 0.9954787234042554, + "grad_norm": 3.6018290519714355, + "learning_rate": 9.404465254505435e-06, + "loss": 1.2228, + "step": 3743 + }, + { + "epoch": 0.9957446808510638, + "grad_norm": 3.5014569759368896, + "learning_rate": 9.4040489030344e-06, + "loss": 1.1731, + "step": 3744 + }, + { + "epoch": 0.9960106382978723, + "grad_norm": 3.6044108867645264, + "learning_rate": 9.403632415296907e-06, + "loss": 1.2917, + "step": 3745 + }, + { + "epoch": 0.9962765957446809, + "grad_norm": 3.626147985458374, + "learning_rate": 9.40321579130584e-06, + "loss": 1.2297, + "step": 3746 + }, + { + "epoch": 0.9965425531914893, + "grad_norm": 3.5548157691955566, + "learning_rate": 9.402799031074095e-06, + "loss": 1.2096, + "step": 3747 + }, + { + "epoch": 0.9968085106382979, + "grad_norm": 4.016201019287109, + "learning_rate": 9.402382134614563e-06, + "loss": 1.2461, + "step": 3748 + }, + { + "epoch": 0.9970744680851064, + "grad_norm": 3.2637929916381836, + "learning_rate": 9.401965101940144e-06, + "loss": 1.1531, + "step": 3749 + }, + { + "epoch": 0.9973404255319149, + "grad_norm": 3.330240249633789, + "learning_rate": 9.40154793306374e-06, + "loss": 1.1598, + "step": 3750 + }, + { + "epoch": 0.9976063829787234, + "grad_norm": 3.522907257080078, + "learning_rate": 9.401130627998265e-06, + "loss": 1.1563, + "step": 3751 + }, + { + "epoch": 0.997872340425532, + "grad_norm": 3.462400197982788, + "learning_rate": 9.400713186756625e-06, + "loss": 1.0948, + "step": 3752 + }, + { + "epoch": 0.9981382978723404, + "grad_norm": 3.6393964290618896, + "learning_rate": 9.400295609351738e-06, + "loss": 1.2499, + "step": 3753 + }, + { + "epoch": 0.9984042553191489, + "grad_norm": 3.4382801055908203, + "learning_rate": 9.399877895796526e-06, + "loss": 1.2587, + "step": 3754 + }, + { + "epoch": 0.9986702127659575, + "grad_norm": 3.769301414489746, + "learning_rate": 9.399460046103908e-06, + "loss": 1.283, + "step": 3755 + }, + { + "epoch": 0.9989361702127659, + "grad_norm": 3.3904542922973633, + "learning_rate": 9.399042060286819e-06, + "loss": 1.3667, + "step": 3756 + }, + { + "epoch": 0.9992021276595745, + "grad_norm": 3.413027763366699, + "learning_rate": 9.398623938358188e-06, + "loss": 1.1575, + "step": 3757 + }, + { + "epoch": 0.999468085106383, + "grad_norm": 3.8313398361206055, + 
"learning_rate": 9.398205680330954e-06, + "loss": 1.1665, + "step": 3758 + }, + { + "epoch": 0.9997340425531915, + "grad_norm": 3.5040853023529053, + "learning_rate": 9.397787286218058e-06, + "loss": 1.3182, + "step": 3759 + }, + { + "epoch": 1.0, + "grad_norm": 3.6746809482574463, + "learning_rate": 9.397368756032445e-06, + "loss": 1.2287, + "step": 3760 + }, + { + "epoch": 1.0002659574468085, + "grad_norm": 3.308379650115967, + "learning_rate": 9.396950089787066e-06, + "loss": 0.8299, + "step": 3761 + }, + { + "epoch": 1.000531914893617, + "grad_norm": 3.8195013999938965, + "learning_rate": 9.396531287494877e-06, + "loss": 0.8431, + "step": 3762 + }, + { + "epoch": 1.0007978723404256, + "grad_norm": 3.317417621612549, + "learning_rate": 9.396112349168832e-06, + "loss": 0.9087, + "step": 3763 + }, + { + "epoch": 1.001063829787234, + "grad_norm": 3.6359126567840576, + "learning_rate": 9.395693274821893e-06, + "loss": 0.8605, + "step": 3764 + }, + { + "epoch": 1.0013297872340425, + "grad_norm": 3.3946707248687744, + "learning_rate": 9.39527406446703e-06, + "loss": 0.9424, + "step": 3765 + }, + { + "epoch": 1.001595744680851, + "grad_norm": 3.7910523414611816, + "learning_rate": 9.394854718117214e-06, + "loss": 0.7635, + "step": 3766 + }, + { + "epoch": 1.0018617021276597, + "grad_norm": 3.847181558609009, + "learning_rate": 9.394435235785417e-06, + "loss": 0.8419, + "step": 3767 + }, + { + "epoch": 1.0021276595744681, + "grad_norm": 3.5999948978424072, + "learning_rate": 9.394015617484621e-06, + "loss": 0.7906, + "step": 3768 + }, + { + "epoch": 1.0023936170212766, + "grad_norm": 3.53528094291687, + "learning_rate": 9.393595863227808e-06, + "loss": 0.7652, + "step": 3769 + }, + { + "epoch": 1.002659574468085, + "grad_norm": 4.102449417114258, + "learning_rate": 9.393175973027967e-06, + "loss": 0.837, + "step": 3770 + }, + { + "epoch": 1.0029255319148935, + "grad_norm": 4.625784397125244, + "learning_rate": 9.392755946898087e-06, + "loss": 0.8694, + "step": 3771 + }, + { + "epoch": 1.0031914893617022, + "grad_norm": 3.7955758571624756, + "learning_rate": 9.392335784851168e-06, + "loss": 0.7127, + "step": 3772 + }, + { + "epoch": 1.0034574468085107, + "grad_norm": 4.6287970542907715, + "learning_rate": 9.39191548690021e-06, + "loss": 0.6634, + "step": 3773 + }, + { + "epoch": 1.0037234042553191, + "grad_norm": 4.188403129577637, + "learning_rate": 9.391495053058213e-06, + "loss": 0.7676, + "step": 3774 + }, + { + "epoch": 1.0039893617021276, + "grad_norm": 4.061558723449707, + "learning_rate": 9.39107448333819e-06, + "loss": 0.6863, + "step": 3775 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 3.9614672660827637, + "learning_rate": 9.390653777753151e-06, + "loss": 0.8902, + "step": 3776 + }, + { + "epoch": 1.0045212765957447, + "grad_norm": 3.7978405952453613, + "learning_rate": 9.390232936316116e-06, + "loss": 0.8576, + "step": 3777 + }, + { + "epoch": 1.0047872340425532, + "grad_norm": 4.081401348114014, + "learning_rate": 9.389811959040106e-06, + "loss": 0.9293, + "step": 3778 + }, + { + "epoch": 1.0050531914893617, + "grad_norm": 4.4708123207092285, + "learning_rate": 9.389390845938147e-06, + "loss": 0.7971, + "step": 3779 + }, + { + "epoch": 1.0053191489361701, + "grad_norm": 3.670398235321045, + "learning_rate": 9.388969597023265e-06, + "loss": 0.7746, + "step": 3780 + }, + { + "epoch": 1.0055851063829788, + "grad_norm": 3.678659200668335, + "learning_rate": 9.388548212308496e-06, + "loss": 0.7505, + "step": 3781 + }, + { + "epoch": 1.0058510638297873, + "grad_norm": 
3.943781614303589, + "learning_rate": 9.388126691806879e-06, + "loss": 0.7205, + "step": 3782 + }, + { + "epoch": 1.0061170212765957, + "grad_norm": 3.976630926132202, + "learning_rate": 9.387705035531455e-06, + "loss": 0.8597, + "step": 3783 + }, + { + "epoch": 1.0063829787234042, + "grad_norm": 3.6376004219055176, + "learning_rate": 9.387283243495273e-06, + "loss": 0.7911, + "step": 3784 + }, + { + "epoch": 1.0066489361702127, + "grad_norm": 3.698863983154297, + "learning_rate": 9.386861315711382e-06, + "loss": 0.7718, + "step": 3785 + }, + { + "epoch": 1.0069148936170214, + "grad_norm": 3.553309679031372, + "learning_rate": 9.386439252192836e-06, + "loss": 0.8233, + "step": 3786 + }, + { + "epoch": 1.0071808510638298, + "grad_norm": 3.588423252105713, + "learning_rate": 9.386017052952694e-06, + "loss": 0.782, + "step": 3787 + }, + { + "epoch": 1.0074468085106383, + "grad_norm": 3.5977461338043213, + "learning_rate": 9.385594718004023e-06, + "loss": 0.8548, + "step": 3788 + }, + { + "epoch": 1.0077127659574467, + "grad_norm": 4.447713375091553, + "learning_rate": 9.385172247359887e-06, + "loss": 0.833, + "step": 3789 + }, + { + "epoch": 1.0079787234042554, + "grad_norm": 3.6044774055480957, + "learning_rate": 9.384749641033358e-06, + "loss": 0.8453, + "step": 3790 + }, + { + "epoch": 1.008244680851064, + "grad_norm": 3.4909749031066895, + "learning_rate": 9.384326899037515e-06, + "loss": 0.7723, + "step": 3791 + }, + { + "epoch": 1.0085106382978724, + "grad_norm": 3.8825156688690186, + "learning_rate": 9.383904021385433e-06, + "loss": 0.7219, + "step": 3792 + }, + { + "epoch": 1.0087765957446808, + "grad_norm": 4.605208396911621, + "learning_rate": 9.3834810080902e-06, + "loss": 0.8625, + "step": 3793 + }, + { + "epoch": 1.0090425531914893, + "grad_norm": 3.8827695846557617, + "learning_rate": 9.383057859164904e-06, + "loss": 0.7579, + "step": 3794 + }, + { + "epoch": 1.009308510638298, + "grad_norm": 3.8152899742126465, + "learning_rate": 9.382634574622637e-06, + "loss": 0.7785, + "step": 3795 + }, + { + "epoch": 1.0095744680851064, + "grad_norm": 3.9749300479888916, + "learning_rate": 9.382211154476497e-06, + "loss": 0.7768, + "step": 3796 + }, + { + "epoch": 1.0098404255319149, + "grad_norm": 3.9352428913116455, + "learning_rate": 9.381787598739586e-06, + "loss": 0.9265, + "step": 3797 + }, + { + "epoch": 1.0101063829787233, + "grad_norm": 3.8235480785369873, + "learning_rate": 9.381363907425006e-06, + "loss": 0.7915, + "step": 3798 + }, + { + "epoch": 1.0103723404255318, + "grad_norm": 4.1063103675842285, + "learning_rate": 9.380940080545869e-06, + "loss": 0.8271, + "step": 3799 + }, + { + "epoch": 1.0106382978723405, + "grad_norm": 3.7685892581939697, + "learning_rate": 9.380516118115287e-06, + "loss": 0.7611, + "step": 3800 + }, + { + "epoch": 1.010904255319149, + "grad_norm": 3.679269790649414, + "learning_rate": 9.380092020146379e-06, + "loss": 0.7943, + "step": 3801 + }, + { + "epoch": 1.0111702127659574, + "grad_norm": 3.7096617221832275, + "learning_rate": 9.379667786652267e-06, + "loss": 0.8254, + "step": 3802 + }, + { + "epoch": 1.0114361702127659, + "grad_norm": 3.4425570964813232, + "learning_rate": 9.379243417646077e-06, + "loss": 0.7538, + "step": 3803 + }, + { + "epoch": 1.0117021276595746, + "grad_norm": 3.324869155883789, + "learning_rate": 9.378818913140941e-06, + "loss": 0.6687, + "step": 3804 + }, + { + "epoch": 1.011968085106383, + "grad_norm": 3.6117424964904785, + "learning_rate": 9.378394273149992e-06, + "loss": 0.8059, + "step": 3805 + }, + { + "epoch": 
1.0122340425531915, + "grad_norm": 3.843747615814209, + "learning_rate": 9.377969497686369e-06, + "loss": 0.7257, + "step": 3806 + }, + { + "epoch": 1.0125, + "grad_norm": 3.997349977493286, + "learning_rate": 9.377544586763216e-06, + "loss": 0.837, + "step": 3807 + }, + { + "epoch": 1.0127659574468084, + "grad_norm": 3.5746796131134033, + "learning_rate": 9.377119540393677e-06, + "loss": 0.7891, + "step": 3808 + }, + { + "epoch": 1.013031914893617, + "grad_norm": 3.7787206172943115, + "learning_rate": 9.37669435859091e-06, + "loss": 0.7984, + "step": 3809 + }, + { + "epoch": 1.0132978723404256, + "grad_norm": 4.2211174964904785, + "learning_rate": 9.376269041368063e-06, + "loss": 0.7274, + "step": 3810 + }, + { + "epoch": 1.013563829787234, + "grad_norm": 3.591057300567627, + "learning_rate": 9.375843588738302e-06, + "loss": 0.807, + "step": 3811 + }, + { + "epoch": 1.0138297872340425, + "grad_norm": 3.5017266273498535, + "learning_rate": 9.375418000714787e-06, + "loss": 0.7173, + "step": 3812 + }, + { + "epoch": 1.014095744680851, + "grad_norm": 4.4692487716674805, + "learning_rate": 9.374992277310688e-06, + "loss": 0.7584, + "step": 3813 + }, + { + "epoch": 1.0143617021276596, + "grad_norm": 4.453067302703857, + "learning_rate": 9.374566418539178e-06, + "loss": 0.8444, + "step": 3814 + }, + { + "epoch": 1.014627659574468, + "grad_norm": 4.007133483886719, + "learning_rate": 9.37414042441343e-06, + "loss": 0.7163, + "step": 3815 + }, + { + "epoch": 1.0148936170212766, + "grad_norm": 3.714021682739258, + "learning_rate": 9.37371429494663e-06, + "loss": 0.7979, + "step": 3816 + }, + { + "epoch": 1.015159574468085, + "grad_norm": 4.196898460388184, + "learning_rate": 9.37328803015196e-06, + "loss": 0.8057, + "step": 3817 + }, + { + "epoch": 1.0154255319148937, + "grad_norm": 3.6794686317443848, + "learning_rate": 9.37286163004261e-06, + "loss": 0.8608, + "step": 3818 + }, + { + "epoch": 1.0156914893617022, + "grad_norm": 4.034078121185303, + "learning_rate": 9.37243509463177e-06, + "loss": 0.8794, + "step": 3819 + }, + { + "epoch": 1.0159574468085106, + "grad_norm": 3.671816110610962, + "learning_rate": 9.37200842393264e-06, + "loss": 0.755, + "step": 3820 + }, + { + "epoch": 1.016223404255319, + "grad_norm": 3.6856508255004883, + "learning_rate": 9.371581617958424e-06, + "loss": 0.7839, + "step": 3821 + }, + { + "epoch": 1.0164893617021276, + "grad_norm": 4.332293510437012, + "learning_rate": 9.371154676722326e-06, + "loss": 0.8305, + "step": 3822 + }, + { + "epoch": 1.0167553191489362, + "grad_norm": 4.032402038574219, + "learning_rate": 9.370727600237557e-06, + "loss": 0.8552, + "step": 3823 + }, + { + "epoch": 1.0170212765957447, + "grad_norm": 4.2808756828308105, + "learning_rate": 9.370300388517329e-06, + "loss": 0.8609, + "step": 3824 + }, + { + "epoch": 1.0172872340425532, + "grad_norm": 3.675684690475464, + "learning_rate": 9.36987304157486e-06, + "loss": 0.7307, + "step": 3825 + }, + { + "epoch": 1.0175531914893616, + "grad_norm": 3.6821727752685547, + "learning_rate": 9.369445559423376e-06, + "loss": 0.8393, + "step": 3826 + }, + { + "epoch": 1.0178191489361703, + "grad_norm": 4.112141132354736, + "learning_rate": 9.369017942076101e-06, + "loss": 0.8027, + "step": 3827 + }, + { + "epoch": 1.0180851063829788, + "grad_norm": 3.8829188346862793, + "learning_rate": 9.368590189546268e-06, + "loss": 0.8558, + "step": 3828 + }, + { + "epoch": 1.0183510638297872, + "grad_norm": 4.182821750640869, + "learning_rate": 9.368162301847112e-06, + "loss": 0.9872, + "step": 3829 + }, + { + 
"epoch": 1.0186170212765957, + "grad_norm": 4.043810844421387, + "learning_rate": 9.36773427899187e-06, + "loss": 0.731, + "step": 3830 + }, + { + "epoch": 1.0188829787234042, + "grad_norm": 3.6814448833465576, + "learning_rate": 9.367306120993787e-06, + "loss": 0.7434, + "step": 3831 + }, + { + "epoch": 1.0191489361702128, + "grad_norm": 3.823333978652954, + "learning_rate": 9.366877827866112e-06, + "loss": 0.7962, + "step": 3832 + }, + { + "epoch": 1.0194148936170213, + "grad_norm": 4.10197639465332, + "learning_rate": 9.366449399622092e-06, + "loss": 0.8655, + "step": 3833 + }, + { + "epoch": 1.0196808510638298, + "grad_norm": 3.4033734798431396, + "learning_rate": 9.366020836274991e-06, + "loss": 0.6871, + "step": 3834 + }, + { + "epoch": 1.0199468085106382, + "grad_norm": 3.9210493564605713, + "learning_rate": 9.365592137838063e-06, + "loss": 0.8913, + "step": 3835 + }, + { + "epoch": 1.0202127659574467, + "grad_norm": 3.972930431365967, + "learning_rate": 9.365163304324576e-06, + "loss": 0.7394, + "step": 3836 + }, + { + "epoch": 1.0204787234042554, + "grad_norm": 3.603489875793457, + "learning_rate": 9.364734335747795e-06, + "loss": 0.6501, + "step": 3837 + }, + { + "epoch": 1.0207446808510638, + "grad_norm": 3.678868532180786, + "learning_rate": 9.364305232120997e-06, + "loss": 0.7685, + "step": 3838 + }, + { + "epoch": 1.0210106382978723, + "grad_norm": 4.074692726135254, + "learning_rate": 9.363875993457454e-06, + "loss": 0.8085, + "step": 3839 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 3.683279514312744, + "learning_rate": 9.363446619770452e-06, + "loss": 0.7703, + "step": 3840 + }, + { + "epoch": 1.0215425531914895, + "grad_norm": 3.837007999420166, + "learning_rate": 9.363017111073273e-06, + "loss": 0.8403, + "step": 3841 + }, + { + "epoch": 1.021808510638298, + "grad_norm": 4.0264973640441895, + "learning_rate": 9.362587467379208e-06, + "loss": 0.8001, + "step": 3842 + }, + { + "epoch": 1.0220744680851064, + "grad_norm": 3.9169387817382812, + "learning_rate": 9.362157688701551e-06, + "loss": 0.7603, + "step": 3843 + }, + { + "epoch": 1.0223404255319148, + "grad_norm": 3.4985976219177246, + "learning_rate": 9.3617277750536e-06, + "loss": 0.6856, + "step": 3844 + }, + { + "epoch": 1.0226063829787233, + "grad_norm": 3.9737682342529297, + "learning_rate": 9.361297726448656e-06, + "loss": 0.8021, + "step": 3845 + }, + { + "epoch": 1.022872340425532, + "grad_norm": 4.206306457519531, + "learning_rate": 9.360867542900023e-06, + "loss": 0.7726, + "step": 3846 + }, + { + "epoch": 1.0231382978723405, + "grad_norm": 3.5013468265533447, + "learning_rate": 9.360437224421017e-06, + "loss": 0.7046, + "step": 3847 + }, + { + "epoch": 1.023404255319149, + "grad_norm": 4.186954021453857, + "learning_rate": 9.360006771024947e-06, + "loss": 0.8574, + "step": 3848 + }, + { + "epoch": 1.0236702127659574, + "grad_norm": 3.8380942344665527, + "learning_rate": 9.359576182725136e-06, + "loss": 0.8463, + "step": 3849 + }, + { + "epoch": 1.023936170212766, + "grad_norm": 4.439043998718262, + "learning_rate": 9.359145459534906e-06, + "loss": 0.868, + "step": 3850 + }, + { + "epoch": 1.0242021276595745, + "grad_norm": 3.555283546447754, + "learning_rate": 9.358714601467581e-06, + "loss": 0.7842, + "step": 3851 + }, + { + "epoch": 1.024468085106383, + "grad_norm": 3.4938576221466064, + "learning_rate": 9.358283608536498e-06, + "loss": 0.8562, + "step": 3852 + }, + { + "epoch": 1.0247340425531914, + "grad_norm": 3.709388256072998, + "learning_rate": 9.357852480754985e-06, + "loss": 0.7753, + 
"step": 3853 + }, + { + "epoch": 1.025, + "grad_norm": 3.594524621963501, + "learning_rate": 9.357421218136387e-06, + "loss": 0.9016, + "step": 3854 + }, + { + "epoch": 1.0252659574468086, + "grad_norm": 3.8423714637756348, + "learning_rate": 9.356989820694046e-06, + "loss": 0.918, + "step": 3855 + }, + { + "epoch": 1.025531914893617, + "grad_norm": 4.120334625244141, + "learning_rate": 9.356558288441312e-06, + "loss": 0.8276, + "step": 3856 + }, + { + "epoch": 1.0257978723404255, + "grad_norm": 3.7441205978393555, + "learning_rate": 9.356126621391532e-06, + "loss": 0.6485, + "step": 3857 + }, + { + "epoch": 1.026063829787234, + "grad_norm": 3.652815341949463, + "learning_rate": 9.35569481955807e-06, + "loss": 0.8443, + "step": 3858 + }, + { + "epoch": 1.0263297872340424, + "grad_norm": 3.8127315044403076, + "learning_rate": 9.355262882954277e-06, + "loss": 0.8928, + "step": 3859 + }, + { + "epoch": 1.0265957446808511, + "grad_norm": 4.254662036895752, + "learning_rate": 9.354830811593527e-06, + "loss": 0.7228, + "step": 3860 + }, + { + "epoch": 1.0268617021276596, + "grad_norm": 3.737208366394043, + "learning_rate": 9.354398605489182e-06, + "loss": 0.7144, + "step": 3861 + }, + { + "epoch": 1.027127659574468, + "grad_norm": 4.630359172821045, + "learning_rate": 9.353966264654619e-06, + "loss": 1.0136, + "step": 3862 + }, + { + "epoch": 1.0273936170212765, + "grad_norm": 4.139670372009277, + "learning_rate": 9.353533789103213e-06, + "loss": 0.7467, + "step": 3863 + }, + { + "epoch": 1.0276595744680852, + "grad_norm": 3.5735762119293213, + "learning_rate": 9.353101178848345e-06, + "loss": 0.6863, + "step": 3864 + }, + { + "epoch": 1.0279255319148937, + "grad_norm": 4.091590881347656, + "learning_rate": 9.352668433903402e-06, + "loss": 0.9083, + "step": 3865 + }, + { + "epoch": 1.0281914893617021, + "grad_norm": 4.462408065795898, + "learning_rate": 9.352235554281775e-06, + "loss": 0.8134, + "step": 3866 + }, + { + "epoch": 1.0284574468085106, + "grad_norm": 4.514068603515625, + "learning_rate": 9.351802539996853e-06, + "loss": 0.8516, + "step": 3867 + }, + { + "epoch": 1.028723404255319, + "grad_norm": 4.771678447723389, + "learning_rate": 9.351369391062037e-06, + "loss": 0.8317, + "step": 3868 + }, + { + "epoch": 1.0289893617021277, + "grad_norm": 3.9608962535858154, + "learning_rate": 9.350936107490731e-06, + "loss": 0.7668, + "step": 3869 + }, + { + "epoch": 1.0292553191489362, + "grad_norm": 3.6606082916259766, + "learning_rate": 9.350502689296337e-06, + "loss": 0.8021, + "step": 3870 + }, + { + "epoch": 1.0295212765957447, + "grad_norm": 3.395991563796997, + "learning_rate": 9.35006913649227e-06, + "loss": 0.7561, + "step": 3871 + }, + { + "epoch": 1.0297872340425531, + "grad_norm": 3.9416377544403076, + "learning_rate": 9.34963544909194e-06, + "loss": 0.6551, + "step": 3872 + }, + { + "epoch": 1.0300531914893618, + "grad_norm": 3.8515100479125977, + "learning_rate": 9.34920162710877e-06, + "loss": 0.9596, + "step": 3873 + }, + { + "epoch": 1.0303191489361703, + "grad_norm": 3.532066583633423, + "learning_rate": 9.34876767055618e-06, + "loss": 0.7312, + "step": 3874 + }, + { + "epoch": 1.0305851063829787, + "grad_norm": 3.523547887802124, + "learning_rate": 9.3483335794476e-06, + "loss": 0.9029, + "step": 3875 + }, + { + "epoch": 1.0308510638297872, + "grad_norm": 3.8942482471466064, + "learning_rate": 9.347899353796456e-06, + "loss": 0.852, + "step": 3876 + }, + { + "epoch": 1.0311170212765957, + "grad_norm": 3.8025577068328857, + "learning_rate": 9.347464993616191e-06, + "loss": 
0.7704, + "step": 3877 + }, + { + "epoch": 1.0313829787234043, + "grad_norm": 3.5986201763153076, + "learning_rate": 9.347030498920239e-06, + "loss": 0.8289, + "step": 3878 + }, + { + "epoch": 1.0316489361702128, + "grad_norm": 4.27517032623291, + "learning_rate": 9.346595869722044e-06, + "loss": 0.9252, + "step": 3879 + }, + { + "epoch": 1.0319148936170213, + "grad_norm": 3.845385789871216, + "learning_rate": 9.346161106035056e-06, + "loss": 0.7372, + "step": 3880 + }, + { + "epoch": 1.0321808510638297, + "grad_norm": 3.875645875930786, + "learning_rate": 9.345726207872728e-06, + "loss": 0.9036, + "step": 3881 + }, + { + "epoch": 1.0324468085106382, + "grad_norm": 4.004083156585693, + "learning_rate": 9.345291175248514e-06, + "loss": 0.8, + "step": 3882 + }, + { + "epoch": 1.0327127659574469, + "grad_norm": 4.025826930999756, + "learning_rate": 9.344856008175874e-06, + "loss": 0.8063, + "step": 3883 + }, + { + "epoch": 1.0329787234042553, + "grad_norm": 4.168485641479492, + "learning_rate": 9.344420706668274e-06, + "loss": 0.8712, + "step": 3884 + }, + { + "epoch": 1.0332446808510638, + "grad_norm": 3.7525241374969482, + "learning_rate": 9.343985270739184e-06, + "loss": 0.8075, + "step": 3885 + }, + { + "epoch": 1.0335106382978723, + "grad_norm": 4.079540729522705, + "learning_rate": 9.343549700402073e-06, + "loss": 0.7574, + "step": 3886 + }, + { + "epoch": 1.033776595744681, + "grad_norm": 3.5480105876922607, + "learning_rate": 9.34311399567042e-06, + "loss": 0.8544, + "step": 3887 + }, + { + "epoch": 1.0340425531914894, + "grad_norm": 3.6420836448669434, + "learning_rate": 9.342678156557709e-06, + "loss": 0.8279, + "step": 3888 + }, + { + "epoch": 1.0343085106382979, + "grad_norm": 3.8541533946990967, + "learning_rate": 9.342242183077422e-06, + "loss": 0.8794, + "step": 3889 + }, + { + "epoch": 1.0345744680851063, + "grad_norm": 3.5861008167266846, + "learning_rate": 9.341806075243049e-06, + "loss": 0.7949, + "step": 3890 + }, + { + "epoch": 1.0348404255319148, + "grad_norm": 4.284236431121826, + "learning_rate": 9.341369833068086e-06, + "loss": 0.7882, + "step": 3891 + }, + { + "epoch": 1.0351063829787235, + "grad_norm": 4.239330768585205, + "learning_rate": 9.340933456566028e-06, + "loss": 0.8299, + "step": 3892 + }, + { + "epoch": 1.035372340425532, + "grad_norm": 4.633347988128662, + "learning_rate": 9.340496945750377e-06, + "loss": 0.9297, + "step": 3893 + }, + { + "epoch": 1.0356382978723404, + "grad_norm": 4.2658538818359375, + "learning_rate": 9.340060300634642e-06, + "loss": 0.7928, + "step": 3894 + }, + { + "epoch": 1.0359042553191489, + "grad_norm": 3.876652717590332, + "learning_rate": 9.33962352123233e-06, + "loss": 0.7742, + "step": 3895 + }, + { + "epoch": 1.0361702127659576, + "grad_norm": 3.939422130584717, + "learning_rate": 9.339186607556959e-06, + "loss": 0.7676, + "step": 3896 + }, + { + "epoch": 1.036436170212766, + "grad_norm": 3.9666736125946045, + "learning_rate": 9.338749559622042e-06, + "loss": 0.8759, + "step": 3897 + }, + { + "epoch": 1.0367021276595745, + "grad_norm": 3.6032910346984863, + "learning_rate": 9.338312377441108e-06, + "loss": 0.6806, + "step": 3898 + }, + { + "epoch": 1.036968085106383, + "grad_norm": 3.6236395835876465, + "learning_rate": 9.337875061027681e-06, + "loss": 0.8275, + "step": 3899 + }, + { + "epoch": 1.0372340425531914, + "grad_norm": 4.132247447967529, + "learning_rate": 9.337437610395292e-06, + "loss": 0.8429, + "step": 3900 + }, + { + "epoch": 1.0375, + "grad_norm": 3.7111639976501465, + "learning_rate": 9.337000025557477e-06, 
+ "loss": 0.9638, + "step": 3901 + }, + { + "epoch": 1.0377659574468086, + "grad_norm": 3.9870896339416504, + "learning_rate": 9.336562306527775e-06, + "loss": 0.7931, + "step": 3902 + }, + { + "epoch": 1.038031914893617, + "grad_norm": 3.9265518188476562, + "learning_rate": 9.336124453319729e-06, + "loss": 0.7928, + "step": 3903 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 3.5974245071411133, + "learning_rate": 9.335686465946888e-06, + "loss": 0.7127, + "step": 3904 + }, + { + "epoch": 1.038563829787234, + "grad_norm": 3.6213388442993164, + "learning_rate": 9.335248344422803e-06, + "loss": 0.7669, + "step": 3905 + }, + { + "epoch": 1.0388297872340426, + "grad_norm": 4.555843830108643, + "learning_rate": 9.33481008876103e-06, + "loss": 0.8885, + "step": 3906 + }, + { + "epoch": 1.039095744680851, + "grad_norm": 4.553684234619141, + "learning_rate": 9.33437169897513e-06, + "loss": 0.9339, + "step": 3907 + }, + { + "epoch": 1.0393617021276595, + "grad_norm": 4.390134811401367, + "learning_rate": 9.333933175078665e-06, + "loss": 0.887, + "step": 3908 + }, + { + "epoch": 1.039627659574468, + "grad_norm": 4.3838677406311035, + "learning_rate": 9.333494517085205e-06, + "loss": 0.8234, + "step": 3909 + }, + { + "epoch": 1.0398936170212767, + "grad_norm": 4.019488334655762, + "learning_rate": 9.333055725008323e-06, + "loss": 0.9096, + "step": 3910 + }, + { + "epoch": 1.0401595744680852, + "grad_norm": 3.4591004848480225, + "learning_rate": 9.332616798861596e-06, + "loss": 0.7404, + "step": 3911 + }, + { + "epoch": 1.0404255319148936, + "grad_norm": 4.587208271026611, + "learning_rate": 9.332177738658603e-06, + "loss": 0.8192, + "step": 3912 + }, + { + "epoch": 1.040691489361702, + "grad_norm": 3.734438180923462, + "learning_rate": 9.331738544412932e-06, + "loss": 0.8286, + "step": 3913 + }, + { + "epoch": 1.0409574468085105, + "grad_norm": 3.7644083499908447, + "learning_rate": 9.33129921613817e-06, + "loss": 0.8243, + "step": 3914 + }, + { + "epoch": 1.0412234042553192, + "grad_norm": 3.412766456604004, + "learning_rate": 9.33085975384791e-06, + "loss": 0.8141, + "step": 3915 + }, + { + "epoch": 1.0414893617021277, + "grad_norm": 3.1695566177368164, + "learning_rate": 9.33042015755575e-06, + "loss": 0.6531, + "step": 3916 + }, + { + "epoch": 1.0417553191489362, + "grad_norm": 4.0986151695251465, + "learning_rate": 9.329980427275293e-06, + "loss": 0.8253, + "step": 3917 + }, + { + "epoch": 1.0420212765957446, + "grad_norm": 3.9123079776763916, + "learning_rate": 9.329540563020143e-06, + "loss": 0.8211, + "step": 3918 + }, + { + "epoch": 1.0422872340425533, + "grad_norm": 3.860915184020996, + "learning_rate": 9.32910056480391e-06, + "loss": 0.7886, + "step": 3919 + }, + { + "epoch": 1.0425531914893618, + "grad_norm": 3.6465773582458496, + "learning_rate": 9.328660432640211e-06, + "loss": 0.7254, + "step": 3920 + }, + { + "epoch": 1.0428191489361702, + "grad_norm": 4.174450874328613, + "learning_rate": 9.328220166542659e-06, + "loss": 0.8686, + "step": 3921 + }, + { + "epoch": 1.0430851063829787, + "grad_norm": 3.563661575317383, + "learning_rate": 9.32777976652488e-06, + "loss": 0.8862, + "step": 3922 + }, + { + "epoch": 1.0433510638297872, + "grad_norm": 3.976609468460083, + "learning_rate": 9.3273392326005e-06, + "loss": 0.9412, + "step": 3923 + }, + { + "epoch": 1.0436170212765958, + "grad_norm": 3.979386568069458, + "learning_rate": 9.32689856478315e-06, + "loss": 0.767, + "step": 3924 + }, + { + "epoch": 1.0438829787234043, + "grad_norm": 3.6504030227661133, + "learning_rate": 
9.326457763086463e-06, + "loss": 0.7288, + "step": 3925 + }, + { + "epoch": 1.0441489361702128, + "grad_norm": 3.5788464546203613, + "learning_rate": 9.32601682752408e-06, + "loss": 0.7756, + "step": 3926 + }, + { + "epoch": 1.0444148936170212, + "grad_norm": 4.129055976867676, + "learning_rate": 9.325575758109642e-06, + "loss": 0.8129, + "step": 3927 + }, + { + "epoch": 1.0446808510638297, + "grad_norm": 4.022395133972168, + "learning_rate": 9.325134554856799e-06, + "loss": 0.8346, + "step": 3928 + }, + { + "epoch": 1.0449468085106384, + "grad_norm": 3.9106342792510986, + "learning_rate": 9.3246932177792e-06, + "loss": 0.7345, + "step": 3929 + }, + { + "epoch": 1.0452127659574468, + "grad_norm": 5.765318870544434, + "learning_rate": 9.324251746890501e-06, + "loss": 1.0247, + "step": 3930 + }, + { + "epoch": 1.0454787234042553, + "grad_norm": 3.858736276626587, + "learning_rate": 9.323810142204361e-06, + "loss": 0.8736, + "step": 3931 + }, + { + "epoch": 1.0457446808510638, + "grad_norm": 3.313824415206909, + "learning_rate": 9.323368403734445e-06, + "loss": 0.8105, + "step": 3932 + }, + { + "epoch": 1.0460106382978724, + "grad_norm": 3.7220394611358643, + "learning_rate": 9.32292653149442e-06, + "loss": 0.7904, + "step": 3933 + }, + { + "epoch": 1.046276595744681, + "grad_norm": 3.852928638458252, + "learning_rate": 9.32248452549796e-06, + "loss": 0.7263, + "step": 3934 + }, + { + "epoch": 1.0465425531914894, + "grad_norm": 3.9275519847869873, + "learning_rate": 9.322042385758738e-06, + "loss": 0.8318, + "step": 3935 + }, + { + "epoch": 1.0468085106382978, + "grad_norm": 4.239774227142334, + "learning_rate": 9.321600112290439e-06, + "loss": 0.7238, + "step": 3936 + }, + { + "epoch": 1.0470744680851063, + "grad_norm": 3.672391891479492, + "learning_rate": 9.321157705106741e-06, + "loss": 0.87, + "step": 3937 + }, + { + "epoch": 1.047340425531915, + "grad_norm": 3.510413646697998, + "learning_rate": 9.320715164221338e-06, + "loss": 0.7332, + "step": 3938 + }, + { + "epoch": 1.0476063829787234, + "grad_norm": 3.9943974018096924, + "learning_rate": 9.32027248964792e-06, + "loss": 0.7492, + "step": 3939 + }, + { + "epoch": 1.047872340425532, + "grad_norm": 3.3832719326019287, + "learning_rate": 9.319829681400185e-06, + "loss": 0.7657, + "step": 3940 + }, + { + "epoch": 1.0481382978723404, + "grad_norm": 3.761160135269165, + "learning_rate": 9.319386739491834e-06, + "loss": 0.7968, + "step": 3941 + }, + { + "epoch": 1.048404255319149, + "grad_norm": 3.9942009449005127, + "learning_rate": 9.31894366393657e-06, + "loss": 0.8027, + "step": 3942 + }, + { + "epoch": 1.0486702127659575, + "grad_norm": 3.8257179260253906, + "learning_rate": 9.318500454748105e-06, + "loss": 0.8245, + "step": 3943 + }, + { + "epoch": 1.048936170212766, + "grad_norm": 4.181244850158691, + "learning_rate": 9.318057111940153e-06, + "loss": 0.7048, + "step": 3944 + }, + { + "epoch": 1.0492021276595744, + "grad_norm": 4.021924018859863, + "learning_rate": 9.317613635526431e-06, + "loss": 0.8669, + "step": 3945 + }, + { + "epoch": 1.049468085106383, + "grad_norm": 4.112471580505371, + "learning_rate": 9.317170025520656e-06, + "loss": 0.7719, + "step": 3946 + }, + { + "epoch": 1.0497340425531916, + "grad_norm": 4.079671859741211, + "learning_rate": 9.31672628193656e-06, + "loss": 0.9156, + "step": 3947 + }, + { + "epoch": 1.05, + "grad_norm": 3.6803247928619385, + "learning_rate": 9.31628240478787e-06, + "loss": 0.741, + "step": 3948 + }, + { + "epoch": 1.0502659574468085, + "grad_norm": 3.8785572052001953, + "learning_rate": 
9.315838394088322e-06, + "loss": 0.7652, + "step": 3949 + }, + { + "epoch": 1.050531914893617, + "grad_norm": 3.9115874767303467, + "learning_rate": 9.31539424985165e-06, + "loss": 0.8373, + "step": 3950 + }, + { + "epoch": 1.0507978723404254, + "grad_norm": 4.03147029876709, + "learning_rate": 9.3149499720916e-06, + "loss": 0.7918, + "step": 3951 + }, + { + "epoch": 1.0510638297872341, + "grad_norm": 3.7957963943481445, + "learning_rate": 9.31450556082192e-06, + "loss": 0.8583, + "step": 3952 + }, + { + "epoch": 1.0513297872340426, + "grad_norm": 3.83341646194458, + "learning_rate": 9.314061016056354e-06, + "loss": 0.8166, + "step": 3953 + }, + { + "epoch": 1.051595744680851, + "grad_norm": 3.7149436473846436, + "learning_rate": 9.313616337808664e-06, + "loss": 0.7958, + "step": 3954 + }, + { + "epoch": 1.0518617021276595, + "grad_norm": 3.941300392150879, + "learning_rate": 9.313171526092606e-06, + "loss": 0.8765, + "step": 3955 + }, + { + "epoch": 1.0521276595744682, + "grad_norm": 3.688690423965454, + "learning_rate": 9.312726580921942e-06, + "loss": 0.7011, + "step": 3956 + }, + { + "epoch": 1.0523936170212767, + "grad_norm": 3.683009147644043, + "learning_rate": 9.31228150231044e-06, + "loss": 0.7307, + "step": 3957 + }, + { + "epoch": 1.0526595744680851, + "grad_norm": 3.816660165786743, + "learning_rate": 9.311836290271872e-06, + "loss": 0.8001, + "step": 3958 + }, + { + "epoch": 1.0529255319148936, + "grad_norm": 3.8870654106140137, + "learning_rate": 9.311390944820012e-06, + "loss": 0.7563, + "step": 3959 + }, + { + "epoch": 1.053191489361702, + "grad_norm": 4.011544704437256, + "learning_rate": 9.31094546596864e-06, + "loss": 0.946, + "step": 3960 + }, + { + "epoch": 1.0534574468085107, + "grad_norm": 4.572283744812012, + "learning_rate": 9.31049985373154e-06, + "loss": 0.8803, + "step": 3961 + }, + { + "epoch": 1.0537234042553192, + "grad_norm": 3.7621991634368896, + "learning_rate": 9.310054108122499e-06, + "loss": 0.8607, + "step": 3962 + }, + { + "epoch": 1.0539893617021276, + "grad_norm": 3.4957644939422607, + "learning_rate": 9.309608229155311e-06, + "loss": 0.7627, + "step": 3963 + }, + { + "epoch": 1.054255319148936, + "grad_norm": 4.007942199707031, + "learning_rate": 9.30916221684377e-06, + "loss": 0.7599, + "step": 3964 + }, + { + "epoch": 1.0545212765957448, + "grad_norm": 3.790900945663452, + "learning_rate": 9.308716071201676e-06, + "loss": 0.6845, + "step": 3965 + }, + { + "epoch": 1.0547872340425533, + "grad_norm": 4.06134557723999, + "learning_rate": 9.308269792242833e-06, + "loss": 0.8446, + "step": 3966 + }, + { + "epoch": 1.0550531914893617, + "grad_norm": 3.927212715148926, + "learning_rate": 9.30782337998105e-06, + "loss": 0.8009, + "step": 3967 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 3.9333722591400146, + "learning_rate": 9.307376834430142e-06, + "loss": 0.8184, + "step": 3968 + }, + { + "epoch": 1.0555851063829786, + "grad_norm": 4.4977288246154785, + "learning_rate": 9.306930155603923e-06, + "loss": 0.841, + "step": 3969 + }, + { + "epoch": 1.0558510638297873, + "grad_norm": 3.587890386581421, + "learning_rate": 9.306483343516212e-06, + "loss": 0.6937, + "step": 3970 + }, + { + "epoch": 1.0561170212765958, + "grad_norm": 4.001445293426514, + "learning_rate": 9.30603639818084e-06, + "loss": 0.8711, + "step": 3971 + }, + { + "epoch": 1.0563829787234043, + "grad_norm": 3.6268887519836426, + "learning_rate": 9.30558931961163e-06, + "loss": 0.7053, + "step": 3972 + }, + { + "epoch": 1.0566489361702127, + "grad_norm": 3.929903030395508, + 
"learning_rate": 9.305142107822415e-06, + "loss": 0.8549, + "step": 3973 + }, + { + "epoch": 1.0569148936170212, + "grad_norm": 3.7672524452209473, + "learning_rate": 9.304694762827038e-06, + "loss": 0.6872, + "step": 3974 + }, + { + "epoch": 1.0571808510638299, + "grad_norm": 4.7689738273620605, + "learning_rate": 9.304247284639335e-06, + "loss": 0.8544, + "step": 3975 + }, + { + "epoch": 1.0574468085106383, + "grad_norm": 3.8088295459747314, + "learning_rate": 9.303799673273153e-06, + "loss": 0.7047, + "step": 3976 + }, + { + "epoch": 1.0577127659574468, + "grad_norm": 4.246236324310303, + "learning_rate": 9.303351928742344e-06, + "loss": 0.7887, + "step": 3977 + }, + { + "epoch": 1.0579787234042553, + "grad_norm": 3.864558696746826, + "learning_rate": 9.302904051060758e-06, + "loss": 0.828, + "step": 3978 + }, + { + "epoch": 1.058244680851064, + "grad_norm": 4.24592399597168, + "learning_rate": 9.302456040242257e-06, + "loss": 0.7851, + "step": 3979 + }, + { + "epoch": 1.0585106382978724, + "grad_norm": 4.1537909507751465, + "learning_rate": 9.302007896300697e-06, + "loss": 0.8281, + "step": 3980 + }, + { + "epoch": 1.0587765957446809, + "grad_norm": 4.180373668670654, + "learning_rate": 9.30155961924995e-06, + "loss": 0.8334, + "step": 3981 + }, + { + "epoch": 1.0590425531914893, + "grad_norm": 3.3669097423553467, + "learning_rate": 9.301111209103883e-06, + "loss": 0.745, + "step": 3982 + }, + { + "epoch": 1.0593085106382978, + "grad_norm": 3.8249645233154297, + "learning_rate": 9.300662665876373e-06, + "loss": 0.8035, + "step": 3983 + }, + { + "epoch": 1.0595744680851065, + "grad_norm": 3.8265540599823, + "learning_rate": 9.300213989581294e-06, + "loss": 0.708, + "step": 3984 + }, + { + "epoch": 1.059840425531915, + "grad_norm": 4.226235866546631, + "learning_rate": 9.299765180232534e-06, + "loss": 0.8594, + "step": 3985 + }, + { + "epoch": 1.0601063829787234, + "grad_norm": 4.107953071594238, + "learning_rate": 9.299316237843976e-06, + "loss": 0.8162, + "step": 3986 + }, + { + "epoch": 1.0603723404255319, + "grad_norm": 3.8606715202331543, + "learning_rate": 9.298867162429511e-06, + "loss": 0.7562, + "step": 3987 + }, + { + "epoch": 1.0606382978723403, + "grad_norm": 3.6489405632019043, + "learning_rate": 9.298417954003036e-06, + "loss": 0.7331, + "step": 3988 + }, + { + "epoch": 1.060904255319149, + "grad_norm": 4.5174150466918945, + "learning_rate": 9.297968612578448e-06, + "loss": 0.8392, + "step": 3989 + }, + { + "epoch": 1.0611702127659575, + "grad_norm": 3.8880250453948975, + "learning_rate": 9.29751913816965e-06, + "loss": 0.8565, + "step": 3990 + }, + { + "epoch": 1.061436170212766, + "grad_norm": 3.8482306003570557, + "learning_rate": 9.297069530790552e-06, + "loss": 0.6222, + "step": 3991 + }, + { + "epoch": 1.0617021276595744, + "grad_norm": 3.9345664978027344, + "learning_rate": 9.296619790455062e-06, + "loss": 0.7166, + "step": 3992 + }, + { + "epoch": 1.061968085106383, + "grad_norm": 4.360013961791992, + "learning_rate": 9.296169917177099e-06, + "loss": 0.7584, + "step": 3993 + }, + { + "epoch": 1.0622340425531915, + "grad_norm": 3.7796449661254883, + "learning_rate": 9.295719910970577e-06, + "loss": 0.8688, + "step": 3994 + }, + { + "epoch": 1.0625, + "grad_norm": 3.968502998352051, + "learning_rate": 9.295269771849426e-06, + "loss": 0.7795, + "step": 3995 + }, + { + "epoch": 1.0627659574468085, + "grad_norm": 4.514654636383057, + "learning_rate": 9.294819499827572e-06, + "loss": 0.8955, + "step": 3996 + }, + { + "epoch": 1.063031914893617, + "grad_norm": 
3.8706483840942383, + "learning_rate": 9.294369094918945e-06, + "loss": 0.7875, + "step": 3997 + }, + { + "epoch": 1.0632978723404256, + "grad_norm": 3.6928679943084717, + "learning_rate": 9.293918557137483e-06, + "loss": 0.7198, + "step": 3998 + }, + { + "epoch": 1.063563829787234, + "grad_norm": 3.9840540885925293, + "learning_rate": 9.293467886497123e-06, + "loss": 0.8831, + "step": 3999 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 4.153161525726318, + "learning_rate": 9.293017083011814e-06, + "loss": 0.8204, + "step": 4000 + }, + { + "epoch": 1.0638297872340425, + "eval_loss": 1.3173630237579346, + "eval_runtime": 13.912, + "eval_samples_per_second": 28.752, + "eval_steps_per_second": 3.594, + "step": 4000 + }, + { + "epoch": 1.064095744680851, + "grad_norm": 3.50370717048645, + "learning_rate": 9.2925661466955e-06, + "loss": 0.6799, + "step": 4001 + }, + { + "epoch": 1.0643617021276595, + "grad_norm": 3.481992244720459, + "learning_rate": 9.292115077562138e-06, + "loss": 0.6651, + "step": 4002 + }, + { + "epoch": 1.0646276595744681, + "grad_norm": 3.986703634262085, + "learning_rate": 9.291663875625681e-06, + "loss": 0.713, + "step": 4003 + }, + { + "epoch": 1.0648936170212766, + "grad_norm": 3.7703604698181152, + "learning_rate": 9.291212540900091e-06, + "loss": 0.8728, + "step": 4004 + }, + { + "epoch": 1.065159574468085, + "grad_norm": 3.9758448600769043, + "learning_rate": 9.290761073399333e-06, + "loss": 0.8273, + "step": 4005 + }, + { + "epoch": 1.0654255319148935, + "grad_norm": 3.999802350997925, + "learning_rate": 9.290309473137376e-06, + "loss": 0.8826, + "step": 4006 + }, + { + "epoch": 1.0656914893617022, + "grad_norm": 4.072256088256836, + "learning_rate": 9.289857740128192e-06, + "loss": 0.8037, + "step": 4007 + }, + { + "epoch": 1.0659574468085107, + "grad_norm": 3.619701623916626, + "learning_rate": 9.289405874385759e-06, + "loss": 0.6833, + "step": 4008 + }, + { + "epoch": 1.0662234042553191, + "grad_norm": 4.227363586425781, + "learning_rate": 9.288953875924057e-06, + "loss": 0.8688, + "step": 4009 + }, + { + "epoch": 1.0664893617021276, + "grad_norm": 3.589017629623413, + "learning_rate": 9.288501744757073e-06, + "loss": 0.6888, + "step": 4010 + }, + { + "epoch": 1.0667553191489363, + "grad_norm": 3.9024956226348877, + "learning_rate": 9.288049480898797e-06, + "loss": 0.8349, + "step": 4011 + }, + { + "epoch": 1.0670212765957447, + "grad_norm": 3.854668617248535, + "learning_rate": 9.287597084363222e-06, + "loss": 0.8158, + "step": 4012 + }, + { + "epoch": 1.0672872340425532, + "grad_norm": 3.511909008026123, + "learning_rate": 9.287144555164343e-06, + "loss": 0.8076, + "step": 4013 + }, + { + "epoch": 1.0675531914893617, + "grad_norm": 4.2021098136901855, + "learning_rate": 9.286691893316165e-06, + "loss": 0.8434, + "step": 4014 + }, + { + "epoch": 1.0678191489361701, + "grad_norm": 3.823734760284424, + "learning_rate": 9.286239098832693e-06, + "loss": 0.8124, + "step": 4015 + }, + { + "epoch": 1.0680851063829788, + "grad_norm": 3.6504952907562256, + "learning_rate": 9.285786171727938e-06, + "loss": 0.7402, + "step": 4016 + }, + { + "epoch": 1.0683510638297873, + "grad_norm": 3.7579758167266846, + "learning_rate": 9.28533311201591e-06, + "loss": 0.8335, + "step": 4017 + }, + { + "epoch": 1.0686170212765957, + "grad_norm": 3.902036428451538, + "learning_rate": 9.284879919710631e-06, + "loss": 0.8564, + "step": 4018 + }, + { + "epoch": 1.0688829787234042, + "grad_norm": 3.6956422328948975, + "learning_rate": 9.284426594826124e-06, + "loss": 0.7766, + 
"step": 4019 + }, + { + "epoch": 1.0691489361702127, + "grad_norm": 3.866909980773926, + "learning_rate": 9.283973137376414e-06, + "loss": 0.8988, + "step": 4020 + }, + { + "epoch": 1.0694148936170214, + "grad_norm": 4.163184642791748, + "learning_rate": 9.28351954737553e-06, + "loss": 0.9235, + "step": 4021 + }, + { + "epoch": 1.0696808510638298, + "grad_norm": 4.208329200744629, + "learning_rate": 9.28306582483751e-06, + "loss": 0.7734, + "step": 4022 + }, + { + "epoch": 1.0699468085106383, + "grad_norm": 4.030316352844238, + "learning_rate": 9.28261196977639e-06, + "loss": 0.8427, + "step": 4023 + }, + { + "epoch": 1.0702127659574467, + "grad_norm": 3.842853307723999, + "learning_rate": 9.282157982206212e-06, + "loss": 0.8647, + "step": 4024 + }, + { + "epoch": 1.0704787234042552, + "grad_norm": 4.306194305419922, + "learning_rate": 9.281703862141024e-06, + "loss": 0.7107, + "step": 4025 + }, + { + "epoch": 1.070744680851064, + "grad_norm": 4.034607887268066, + "learning_rate": 9.28124960959488e-06, + "loss": 0.76, + "step": 4026 + }, + { + "epoch": 1.0710106382978724, + "grad_norm": 4.018486022949219, + "learning_rate": 9.280795224581832e-06, + "loss": 0.8058, + "step": 4027 + }, + { + "epoch": 1.0712765957446808, + "grad_norm": 4.060681343078613, + "learning_rate": 9.280340707115938e-06, + "loss": 0.772, + "step": 4028 + }, + { + "epoch": 1.0715425531914893, + "grad_norm": 3.8870697021484375, + "learning_rate": 9.279886057211264e-06, + "loss": 0.8036, + "step": 4029 + }, + { + "epoch": 1.071808510638298, + "grad_norm": 3.455979585647583, + "learning_rate": 9.279431274881876e-06, + "loss": 0.6292, + "step": 4030 + }, + { + "epoch": 1.0720744680851064, + "grad_norm": 3.5263242721557617, + "learning_rate": 9.278976360141848e-06, + "loss": 0.7937, + "step": 4031 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 4.214826583862305, + "learning_rate": 9.27852131300525e-06, + "loss": 0.8888, + "step": 4032 + }, + { + "epoch": 1.0726063829787233, + "grad_norm": 3.6315364837646484, + "learning_rate": 9.278066133486167e-06, + "loss": 0.7101, + "step": 4033 + }, + { + "epoch": 1.0728723404255318, + "grad_norm": 4.311771869659424, + "learning_rate": 9.277610821598682e-06, + "loss": 0.8687, + "step": 4034 + }, + { + "epoch": 1.0731382978723405, + "grad_norm": 3.720752716064453, + "learning_rate": 9.277155377356881e-06, + "loss": 0.709, + "step": 4035 + }, + { + "epoch": 1.073404255319149, + "grad_norm": 3.8687169551849365, + "learning_rate": 9.276699800774858e-06, + "loss": 0.7483, + "step": 4036 + }, + { + "epoch": 1.0736702127659574, + "grad_norm": 4.010682582855225, + "learning_rate": 9.276244091866706e-06, + "loss": 0.7954, + "step": 4037 + }, + { + "epoch": 1.0739361702127659, + "grad_norm": 3.9716639518737793, + "learning_rate": 9.27578825064653e-06, + "loss": 0.8228, + "step": 4038 + }, + { + "epoch": 1.0742021276595746, + "grad_norm": 3.6064131259918213, + "learning_rate": 9.275332277128428e-06, + "loss": 0.8019, + "step": 4039 + }, + { + "epoch": 1.074468085106383, + "grad_norm": 3.986684560775757, + "learning_rate": 9.274876171326514e-06, + "loss": 0.7684, + "step": 4040 + }, + { + "epoch": 1.0747340425531915, + "grad_norm": 3.6139955520629883, + "learning_rate": 9.274419933254897e-06, + "loss": 0.7885, + "step": 4041 + }, + { + "epoch": 1.075, + "grad_norm": 4.203228950500488, + "learning_rate": 9.273963562927695e-06, + "loss": 0.8082, + "step": 4042 + }, + { + "epoch": 1.0752659574468084, + "grad_norm": 4.109843730926514, + "learning_rate": 9.27350706035903e-06, + "loss": 0.6948, + 
"step": 4043 + }, + { + "epoch": 1.075531914893617, + "grad_norm": 3.8464603424072266, + "learning_rate": 9.273050425563023e-06, + "loss": 0.8871, + "step": 4044 + }, + { + "epoch": 1.0757978723404256, + "grad_norm": 3.8080790042877197, + "learning_rate": 9.272593658553806e-06, + "loss": 0.7375, + "step": 4045 + }, + { + "epoch": 1.076063829787234, + "grad_norm": 3.829904556274414, + "learning_rate": 9.272136759345512e-06, + "loss": 0.7572, + "step": 4046 + }, + { + "epoch": 1.0763297872340425, + "grad_norm": 4.1604390144348145, + "learning_rate": 9.271679727952274e-06, + "loss": 0.7503, + "step": 4047 + }, + { + "epoch": 1.076595744680851, + "grad_norm": 3.538896322250366, + "learning_rate": 9.271222564388238e-06, + "loss": 0.7042, + "step": 4048 + }, + { + "epoch": 1.0768617021276596, + "grad_norm": 3.960331439971924, + "learning_rate": 9.270765268667547e-06, + "loss": 0.8119, + "step": 4049 + }, + { + "epoch": 1.077127659574468, + "grad_norm": 4.355499267578125, + "learning_rate": 9.270307840804349e-06, + "loss": 0.8219, + "step": 4050 + }, + { + "epoch": 1.0773936170212766, + "grad_norm": 4.223673343658447, + "learning_rate": 9.2698502808128e-06, + "loss": 0.782, + "step": 4051 + }, + { + "epoch": 1.077659574468085, + "grad_norm": 3.8911452293395996, + "learning_rate": 9.269392588707056e-06, + "loss": 0.8562, + "step": 4052 + }, + { + "epoch": 1.0779255319148937, + "grad_norm": 3.9379541873931885, + "learning_rate": 9.268934764501279e-06, + "loss": 0.8103, + "step": 4053 + }, + { + "epoch": 1.0781914893617022, + "grad_norm": 4.371243000030518, + "learning_rate": 9.268476808209635e-06, + "loss": 0.7773, + "step": 4054 + }, + { + "epoch": 1.0784574468085106, + "grad_norm": 3.5743019580841064, + "learning_rate": 9.26801871984629e-06, + "loss": 0.8976, + "step": 4055 + }, + { + "epoch": 1.078723404255319, + "grad_norm": 3.959336280822754, + "learning_rate": 9.267560499425425e-06, + "loss": 0.8294, + "step": 4056 + }, + { + "epoch": 1.0789893617021276, + "grad_norm": 3.2908687591552734, + "learning_rate": 9.267102146961211e-06, + "loss": 0.7021, + "step": 4057 + }, + { + "epoch": 1.0792553191489362, + "grad_norm": 3.952495574951172, + "learning_rate": 9.266643662467834e-06, + "loss": 0.8368, + "step": 4058 + }, + { + "epoch": 1.0795212765957447, + "grad_norm": 3.691890239715576, + "learning_rate": 9.266185045959478e-06, + "loss": 0.7606, + "step": 4059 + }, + { + "epoch": 1.0797872340425532, + "grad_norm": 4.092920780181885, + "learning_rate": 9.265726297450332e-06, + "loss": 0.7791, + "step": 4060 + }, + { + "epoch": 1.0800531914893616, + "grad_norm": 4.004536151885986, + "learning_rate": 9.265267416954595e-06, + "loss": 0.7055, + "step": 4061 + }, + { + "epoch": 1.0803191489361703, + "grad_norm": 3.7672064304351807, + "learning_rate": 9.26480840448646e-06, + "loss": 0.7552, + "step": 4062 + }, + { + "epoch": 1.0805851063829788, + "grad_norm": 3.8815436363220215, + "learning_rate": 9.264349260060134e-06, + "loss": 0.7602, + "step": 4063 + }, + { + "epoch": 1.0808510638297872, + "grad_norm": 4.021637916564941, + "learning_rate": 9.26388998368982e-06, + "loss": 0.7595, + "step": 4064 + }, + { + "epoch": 1.0811170212765957, + "grad_norm": 3.9159035682678223, + "learning_rate": 9.26343057538973e-06, + "loss": 0.7554, + "step": 4065 + }, + { + "epoch": 1.0813829787234042, + "grad_norm": 3.9444377422332764, + "learning_rate": 9.26297103517408e-06, + "loss": 0.6694, + "step": 4066 + }, + { + "epoch": 1.0816489361702128, + "grad_norm": 3.8889427185058594, + "learning_rate": 9.262511363057085e-06, 
+ "loss": 0.7356, + "step": 4067 + }, + { + "epoch": 1.0819148936170213, + "grad_norm": 4.03524923324585, + "learning_rate": 9.262051559052972e-06, + "loss": 0.6715, + "step": 4068 + }, + { + "epoch": 1.0821808510638298, + "grad_norm": 4.430936336517334, + "learning_rate": 9.261591623175965e-06, + "loss": 0.9173, + "step": 4069 + }, + { + "epoch": 1.0824468085106382, + "grad_norm": 3.784855604171753, + "learning_rate": 9.261131555440295e-06, + "loss": 0.8472, + "step": 4070 + }, + { + "epoch": 1.0827127659574467, + "grad_norm": 3.9647388458251953, + "learning_rate": 9.260671355860196e-06, + "loss": 0.6908, + "step": 4071 + }, + { + "epoch": 1.0829787234042554, + "grad_norm": 4.330158710479736, + "learning_rate": 9.260211024449913e-06, + "loss": 0.7744, + "step": 4072 + }, + { + "epoch": 1.0832446808510638, + "grad_norm": 3.934960126876831, + "learning_rate": 9.259750561223682e-06, + "loss": 0.7585, + "step": 4073 + }, + { + "epoch": 1.0835106382978723, + "grad_norm": 4.234976291656494, + "learning_rate": 9.259289966195754e-06, + "loss": 0.7642, + "step": 4074 + }, + { + "epoch": 1.0837765957446808, + "grad_norm": 4.297840118408203, + "learning_rate": 9.25882923938038e-06, + "loss": 0.8493, + "step": 4075 + }, + { + "epoch": 1.0840425531914895, + "grad_norm": 3.9343340396881104, + "learning_rate": 9.258368380791818e-06, + "loss": 0.8649, + "step": 4076 + }, + { + "epoch": 1.084308510638298, + "grad_norm": 4.02085018157959, + "learning_rate": 9.257907390444322e-06, + "loss": 0.7595, + "step": 4077 + }, + { + "epoch": 1.0845744680851064, + "grad_norm": 4.010712146759033, + "learning_rate": 9.257446268352158e-06, + "loss": 0.9151, + "step": 4078 + }, + { + "epoch": 1.0848404255319148, + "grad_norm": 3.8062400817871094, + "learning_rate": 9.256985014529595e-06, + "loss": 0.8318, + "step": 4079 + }, + { + "epoch": 1.0851063829787233, + "grad_norm": 4.219789505004883, + "learning_rate": 9.256523628990903e-06, + "loss": 0.7924, + "step": 4080 + }, + { + "epoch": 1.085372340425532, + "grad_norm": 3.7686777114868164, + "learning_rate": 9.25606211175036e-06, + "loss": 0.8027, + "step": 4081 + }, + { + "epoch": 1.0856382978723405, + "grad_norm": 3.6773087978363037, + "learning_rate": 9.255600462822241e-06, + "loss": 0.7568, + "step": 4082 + }, + { + "epoch": 1.085904255319149, + "grad_norm": 3.480522394180298, + "learning_rate": 9.255138682220837e-06, + "loss": 0.7156, + "step": 4083 + }, + { + "epoch": 1.0861702127659574, + "grad_norm": 3.8398611545562744, + "learning_rate": 9.254676769960429e-06, + "loss": 0.7162, + "step": 4084 + }, + { + "epoch": 1.086436170212766, + "grad_norm": 3.8505029678344727, + "learning_rate": 9.254214726055314e-06, + "loss": 0.8488, + "step": 4085 + }, + { + "epoch": 1.0867021276595745, + "grad_norm": 4.238323211669922, + "learning_rate": 9.253752550519787e-06, + "loss": 0.8742, + "step": 4086 + }, + { + "epoch": 1.086968085106383, + "grad_norm": 3.7396814823150635, + "learning_rate": 9.253290243368149e-06, + "loss": 0.8127, + "step": 4087 + }, + { + "epoch": 1.0872340425531914, + "grad_norm": 4.44807767868042, + "learning_rate": 9.2528278046147e-06, + "loss": 0.8144, + "step": 4088 + }, + { + "epoch": 1.0875, + "grad_norm": 3.88287091255188, + "learning_rate": 9.252365234273754e-06, + "loss": 0.691, + "step": 4089 + }, + { + "epoch": 1.0877659574468086, + "grad_norm": 3.7738873958587646, + "learning_rate": 9.251902532359622e-06, + "loss": 0.7662, + "step": 4090 + }, + { + "epoch": 1.088031914893617, + "grad_norm": 3.789278745651245, + "learning_rate": 
9.251439698886618e-06, + "loss": 0.7773, + "step": 4091 + }, + { + "epoch": 1.0882978723404255, + "grad_norm": 3.8501172065734863, + "learning_rate": 9.250976733869065e-06, + "loss": 0.795, + "step": 4092 + }, + { + "epoch": 1.088563829787234, + "grad_norm": 4.324002265930176, + "learning_rate": 9.250513637321287e-06, + "loss": 0.7957, + "step": 4093 + }, + { + "epoch": 1.0888297872340424, + "grad_norm": 3.598450183868408, + "learning_rate": 9.250050409257612e-06, + "loss": 0.8029, + "step": 4094 + }, + { + "epoch": 1.0890957446808511, + "grad_norm": 3.749985694885254, + "learning_rate": 9.249587049692375e-06, + "loss": 0.7377, + "step": 4095 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 3.7555527687072754, + "learning_rate": 9.24912355863991e-06, + "loss": 0.7276, + "step": 4096 + }, + { + "epoch": 1.089627659574468, + "grad_norm": 3.826099395751953, + "learning_rate": 9.248659936114558e-06, + "loss": 0.9592, + "step": 4097 + }, + { + "epoch": 1.0898936170212765, + "grad_norm": 4.4053263664245605, + "learning_rate": 9.248196182130669e-06, + "loss": 0.846, + "step": 4098 + }, + { + "epoch": 1.0901595744680852, + "grad_norm": 3.7693631649017334, + "learning_rate": 9.247732296702586e-06, + "loss": 0.8702, + "step": 4099 + }, + { + "epoch": 1.0904255319148937, + "grad_norm": 3.8193347454071045, + "learning_rate": 9.247268279844666e-06, + "loss": 0.8124, + "step": 4100 + }, + { + "epoch": 1.0906914893617021, + "grad_norm": 3.5872762203216553, + "learning_rate": 9.246804131571263e-06, + "loss": 0.8409, + "step": 4101 + }, + { + "epoch": 1.0909574468085106, + "grad_norm": 3.6679608821868896, + "learning_rate": 9.246339851896742e-06, + "loss": 0.8331, + "step": 4102 + }, + { + "epoch": 1.091223404255319, + "grad_norm": 3.838644027709961, + "learning_rate": 9.245875440835466e-06, + "loss": 0.8683, + "step": 4103 + }, + { + "epoch": 1.0914893617021277, + "grad_norm": 4.146610736846924, + "learning_rate": 9.245410898401806e-06, + "loss": 0.7721, + "step": 4104 + }, + { + "epoch": 1.0917553191489362, + "grad_norm": 3.685303211212158, + "learning_rate": 9.244946224610132e-06, + "loss": 0.6993, + "step": 4105 + }, + { + "epoch": 1.0920212765957447, + "grad_norm": 3.9541261196136475, + "learning_rate": 9.244481419474824e-06, + "loss": 0.7942, + "step": 4106 + }, + { + "epoch": 1.0922872340425531, + "grad_norm": 4.122397422790527, + "learning_rate": 9.244016483010266e-06, + "loss": 0.7709, + "step": 4107 + }, + { + "epoch": 1.0925531914893618, + "grad_norm": 4.400294303894043, + "learning_rate": 9.24355141523084e-06, + "loss": 0.8702, + "step": 4108 + }, + { + "epoch": 1.0928191489361703, + "grad_norm": 4.555760383605957, + "learning_rate": 9.243086216150938e-06, + "loss": 0.8594, + "step": 4109 + }, + { + "epoch": 1.0930851063829787, + "grad_norm": 4.033708095550537, + "learning_rate": 9.242620885784952e-06, + "loss": 0.9066, + "step": 4110 + }, + { + "epoch": 1.0933510638297872, + "grad_norm": 3.908421754837036, + "learning_rate": 9.24215542414728e-06, + "loss": 0.7454, + "step": 4111 + }, + { + "epoch": 1.0936170212765957, + "grad_norm": 3.8368232250213623, + "learning_rate": 9.241689831252327e-06, + "loss": 0.6895, + "step": 4112 + }, + { + "epoch": 1.0938829787234043, + "grad_norm": 3.6774628162384033, + "learning_rate": 9.241224107114495e-06, + "loss": 0.8634, + "step": 4113 + }, + { + "epoch": 1.0941489361702128, + "grad_norm": 4.185787677764893, + "learning_rate": 9.240758251748195e-06, + "loss": 0.8685, + "step": 4114 + }, + { + "epoch": 1.0944148936170213, + "grad_norm": 
3.8751626014709473, + "learning_rate": 9.240292265167843e-06, + "loss": 0.86, + "step": 4115 + }, + { + "epoch": 1.0946808510638297, + "grad_norm": 4.215353965759277, + "learning_rate": 9.239826147387857e-06, + "loss": 0.8188, + "step": 4116 + }, + { + "epoch": 1.0949468085106382, + "grad_norm": 3.7287204265594482, + "learning_rate": 9.239359898422656e-06, + "loss": 0.71, + "step": 4117 + }, + { + "epoch": 1.0952127659574469, + "grad_norm": 3.8123693466186523, + "learning_rate": 9.238893518286668e-06, + "loss": 0.7727, + "step": 4118 + }, + { + "epoch": 1.0954787234042553, + "grad_norm": 3.990419626235962, + "learning_rate": 9.238427006994325e-06, + "loss": 0.7953, + "step": 4119 + }, + { + "epoch": 1.0957446808510638, + "grad_norm": 3.976417303085327, + "learning_rate": 9.237960364560063e-06, + "loss": 0.8596, + "step": 4120 + }, + { + "epoch": 1.0960106382978723, + "grad_norm": 4.219186305999756, + "learning_rate": 9.237493590998315e-06, + "loss": 0.809, + "step": 4121 + }, + { + "epoch": 1.096276595744681, + "grad_norm": 3.693594455718994, + "learning_rate": 9.237026686323527e-06, + "loss": 0.8066, + "step": 4122 + }, + { + "epoch": 1.0965425531914894, + "grad_norm": 3.7492263317108154, + "learning_rate": 9.236559650550143e-06, + "loss": 0.7525, + "step": 4123 + }, + { + "epoch": 1.0968085106382979, + "grad_norm": 4.333737850189209, + "learning_rate": 9.236092483692617e-06, + "loss": 0.8718, + "step": 4124 + }, + { + "epoch": 1.0970744680851063, + "grad_norm": 3.505357503890991, + "learning_rate": 9.235625185765403e-06, + "loss": 0.8482, + "step": 4125 + }, + { + "epoch": 1.0973404255319148, + "grad_norm": 4.302443027496338, + "learning_rate": 9.235157756782957e-06, + "loss": 1.0046, + "step": 4126 + }, + { + "epoch": 1.0976063829787235, + "grad_norm": 3.8847270011901855, + "learning_rate": 9.234690196759746e-06, + "loss": 0.8921, + "step": 4127 + }, + { + "epoch": 1.097872340425532, + "grad_norm": 3.976154327392578, + "learning_rate": 9.234222505710232e-06, + "loss": 0.7338, + "step": 4128 + }, + { + "epoch": 1.0981382978723404, + "grad_norm": 3.829082489013672, + "learning_rate": 9.233754683648891e-06, + "loss": 0.7554, + "step": 4129 + }, + { + "epoch": 1.0984042553191489, + "grad_norm": 3.693549633026123, + "learning_rate": 9.233286730590195e-06, + "loss": 0.7555, + "step": 4130 + }, + { + "epoch": 1.0986702127659576, + "grad_norm": 3.9820609092712402, + "learning_rate": 9.232818646548622e-06, + "loss": 0.8567, + "step": 4131 + }, + { + "epoch": 1.098936170212766, + "grad_norm": 3.9395439624786377, + "learning_rate": 9.232350431538656e-06, + "loss": 0.7728, + "step": 4132 + }, + { + "epoch": 1.0992021276595745, + "grad_norm": 4.385442733764648, + "learning_rate": 9.231882085574788e-06, + "loss": 0.7803, + "step": 4133 + }, + { + "epoch": 1.099468085106383, + "grad_norm": 4.260448932647705, + "learning_rate": 9.231413608671504e-06, + "loss": 0.8111, + "step": 4134 + }, + { + "epoch": 1.0997340425531914, + "grad_norm": 3.9470431804656982, + "learning_rate": 9.2309450008433e-06, + "loss": 0.718, + "step": 4135 + }, + { + "epoch": 1.1, + "grad_norm": 3.897451877593994, + "learning_rate": 9.230476262104678e-06, + "loss": 0.7257, + "step": 4136 + }, + { + "epoch": 1.1002659574468086, + "grad_norm": 4.178949356079102, + "learning_rate": 9.23000739247014e-06, + "loss": 0.8704, + "step": 4137 + }, + { + "epoch": 1.100531914893617, + "grad_norm": 3.9306554794311523, + "learning_rate": 9.22953839195419e-06, + "loss": 0.8856, + "step": 4138 + }, + { + "epoch": 1.1007978723404255, + "grad_norm": 
3.2699522972106934, + "learning_rate": 9.229069260571346e-06, + "loss": 0.7263, + "step": 4139 + }, + { + "epoch": 1.101063829787234, + "grad_norm": 3.980687141418457, + "learning_rate": 9.228599998336119e-06, + "loss": 0.8805, + "step": 4140 + }, + { + "epoch": 1.1013297872340426, + "grad_norm": 4.091682434082031, + "learning_rate": 9.228130605263028e-06, + "loss": 0.8572, + "step": 4141 + }, + { + "epoch": 1.101595744680851, + "grad_norm": 3.8642654418945312, + "learning_rate": 9.2276610813666e-06, + "loss": 0.7285, + "step": 4142 + }, + { + "epoch": 1.1018617021276595, + "grad_norm": 3.6476948261260986, + "learning_rate": 9.227191426661359e-06, + "loss": 0.7736, + "step": 4143 + }, + { + "epoch": 1.102127659574468, + "grad_norm": 3.8674888610839844, + "learning_rate": 9.22672164116184e-06, + "loss": 0.6885, + "step": 4144 + }, + { + "epoch": 1.1023936170212767, + "grad_norm": 3.6890833377838135, + "learning_rate": 9.226251724882576e-06, + "loss": 0.9683, + "step": 4145 + }, + { + "epoch": 1.1026595744680852, + "grad_norm": 3.688188314437866, + "learning_rate": 9.225781677838108e-06, + "loss": 0.8236, + "step": 4146 + }, + { + "epoch": 1.1029255319148936, + "grad_norm": 4.241778373718262, + "learning_rate": 9.22531150004298e-06, + "loss": 0.7666, + "step": 4147 + }, + { + "epoch": 1.103191489361702, + "grad_norm": 3.8804636001586914, + "learning_rate": 9.22484119151174e-06, + "loss": 0.7547, + "step": 4148 + }, + { + "epoch": 1.1034574468085105, + "grad_norm": 3.8728346824645996, + "learning_rate": 9.224370752258938e-06, + "loss": 0.7856, + "step": 4149 + }, + { + "epoch": 1.1037234042553192, + "grad_norm": 3.4745118618011475, + "learning_rate": 9.223900182299132e-06, + "loss": 0.8213, + "step": 4150 + }, + { + "epoch": 1.1039893617021277, + "grad_norm": 3.9133832454681396, + "learning_rate": 9.223429481646881e-06, + "loss": 0.8894, + "step": 4151 + }, + { + "epoch": 1.1042553191489362, + "grad_norm": 3.5466485023498535, + "learning_rate": 9.22295865031675e-06, + "loss": 0.7024, + "step": 4152 + }, + { + "epoch": 1.1045212765957446, + "grad_norm": 4.195438385009766, + "learning_rate": 9.222487688323306e-06, + "loss": 0.9108, + "step": 4153 + }, + { + "epoch": 1.1047872340425533, + "grad_norm": 4.125967025756836, + "learning_rate": 9.222016595681122e-06, + "loss": 0.7909, + "step": 4154 + }, + { + "epoch": 1.1050531914893618, + "grad_norm": 3.8983302116394043, + "learning_rate": 9.221545372404774e-06, + "loss": 0.8179, + "step": 4155 + }, + { + "epoch": 1.1053191489361702, + "grad_norm": 4.264431953430176, + "learning_rate": 9.22107401850884e-06, + "loss": 0.8438, + "step": 4156 + }, + { + "epoch": 1.1055851063829787, + "grad_norm": 3.9519243240356445, + "learning_rate": 9.220602534007908e-06, + "loss": 0.7254, + "step": 4157 + }, + { + "epoch": 1.1058510638297872, + "grad_norm": 4.435789585113525, + "learning_rate": 9.220130918916563e-06, + "loss": 0.8453, + "step": 4158 + }, + { + "epoch": 1.1061170212765958, + "grad_norm": 4.175622463226318, + "learning_rate": 9.2196591732494e-06, + "loss": 0.8253, + "step": 4159 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 3.691840410232544, + "learning_rate": 9.219187297021015e-06, + "loss": 0.7372, + "step": 4160 + }, + { + "epoch": 1.1066489361702128, + "grad_norm": 3.997159957885742, + "learning_rate": 9.218715290246007e-06, + "loss": 0.9002, + "step": 4161 + }, + { + "epoch": 1.1069148936170212, + "grad_norm": 3.8894736766815186, + "learning_rate": 9.21824315293898e-06, + "loss": 0.8466, + "step": 4162 + }, + { + "epoch": 
1.1071808510638297, + "grad_norm": 4.081361293792725, + "learning_rate": 9.217770885114544e-06, + "loss": 0.8159, + "step": 4163 + }, + { + "epoch": 1.1074468085106384, + "grad_norm": 3.6552507877349854, + "learning_rate": 9.21729848678731e-06, + "loss": 0.7608, + "step": 4164 + }, + { + "epoch": 1.1077127659574468, + "grad_norm": 3.844689130783081, + "learning_rate": 9.216825957971898e-06, + "loss": 0.8599, + "step": 4165 + }, + { + "epoch": 1.1079787234042553, + "grad_norm": 3.742281198501587, + "learning_rate": 9.216353298682925e-06, + "loss": 0.8188, + "step": 4166 + }, + { + "epoch": 1.1082446808510638, + "grad_norm": 4.145520210266113, + "learning_rate": 9.215880508935016e-06, + "loss": 0.8485, + "step": 4167 + }, + { + "epoch": 1.1085106382978724, + "grad_norm": 4.048991680145264, + "learning_rate": 9.2154075887428e-06, + "loss": 0.8058, + "step": 4168 + }, + { + "epoch": 1.108776595744681, + "grad_norm": 3.9312491416931152, + "learning_rate": 9.214934538120912e-06, + "loss": 0.8728, + "step": 4169 + }, + { + "epoch": 1.1090425531914894, + "grad_norm": 4.000396251678467, + "learning_rate": 9.214461357083986e-06, + "loss": 0.8695, + "step": 4170 + }, + { + "epoch": 1.1093085106382978, + "grad_norm": 4.0020904541015625, + "learning_rate": 9.213988045646664e-06, + "loss": 0.7386, + "step": 4171 + }, + { + "epoch": 1.1095744680851063, + "grad_norm": 3.527221441268921, + "learning_rate": 9.21351460382359e-06, + "loss": 0.8856, + "step": 4172 + }, + { + "epoch": 1.109840425531915, + "grad_norm": 3.984145164489746, + "learning_rate": 9.213041031629413e-06, + "loss": 0.7518, + "step": 4173 + }, + { + "epoch": 1.1101063829787234, + "grad_norm": 3.6558425426483154, + "learning_rate": 9.212567329078787e-06, + "loss": 0.7465, + "step": 4174 + }, + { + "epoch": 1.110372340425532, + "grad_norm": 4.261702060699463, + "learning_rate": 9.21209349618637e-06, + "loss": 0.8813, + "step": 4175 + }, + { + "epoch": 1.1106382978723404, + "grad_norm": 3.556643486022949, + "learning_rate": 9.211619532966817e-06, + "loss": 0.8007, + "step": 4176 + }, + { + "epoch": 1.110904255319149, + "grad_norm": 3.8246734142303467, + "learning_rate": 9.211145439434801e-06, + "loss": 0.7599, + "step": 4177 + }, + { + "epoch": 1.1111702127659575, + "grad_norm": 3.6221678256988525, + "learning_rate": 9.210671215604985e-06, + "loss": 0.8526, + "step": 4178 + }, + { + "epoch": 1.111436170212766, + "grad_norm": 3.6839540004730225, + "learning_rate": 9.210196861492045e-06, + "loss": 0.88, + "step": 4179 + }, + { + "epoch": 1.1117021276595744, + "grad_norm": 3.7845680713653564, + "learning_rate": 9.209722377110657e-06, + "loss": 0.7316, + "step": 4180 + }, + { + "epoch": 1.111968085106383, + "grad_norm": 3.9798831939697266, + "learning_rate": 9.209247762475502e-06, + "loss": 0.7928, + "step": 4181 + }, + { + "epoch": 1.1122340425531916, + "grad_norm": 3.394745349884033, + "learning_rate": 9.208773017601265e-06, + "loss": 0.7692, + "step": 4182 + }, + { + "epoch": 1.1125, + "grad_norm": 3.9630630016326904, + "learning_rate": 9.208298142502637e-06, + "loss": 0.8699, + "step": 4183 + }, + { + "epoch": 1.1127659574468085, + "grad_norm": 4.089821815490723, + "learning_rate": 9.207823137194307e-06, + "loss": 0.8295, + "step": 4184 + }, + { + "epoch": 1.113031914893617, + "grad_norm": 3.949355125427246, + "learning_rate": 9.20734800169098e-06, + "loss": 0.8049, + "step": 4185 + }, + { + "epoch": 1.1132978723404254, + "grad_norm": 3.588606119155884, + "learning_rate": 9.206872736007348e-06, + "loss": 0.7184, + "step": 4186 + }, + { + 
"epoch": 1.1135638297872341, + "grad_norm": 4.689065933227539, + "learning_rate": 9.206397340158122e-06, + "loss": 0.8687, + "step": 4187 + }, + { + "epoch": 1.1138297872340426, + "grad_norm": 3.685701847076416, + "learning_rate": 9.20592181415801e-06, + "loss": 0.7918, + "step": 4188 + }, + { + "epoch": 1.114095744680851, + "grad_norm": 4.084209442138672, + "learning_rate": 9.205446158021725e-06, + "loss": 0.888, + "step": 4189 + }, + { + "epoch": 1.1143617021276595, + "grad_norm": 3.9949495792388916, + "learning_rate": 9.204970371763984e-06, + "loss": 0.7975, + "step": 4190 + }, + { + "epoch": 1.1146276595744682, + "grad_norm": 4.016841888427734, + "learning_rate": 9.204494455399509e-06, + "loss": 0.8413, + "step": 4191 + }, + { + "epoch": 1.1148936170212767, + "grad_norm": 4.1810712814331055, + "learning_rate": 9.204018408943026e-06, + "loss": 0.7981, + "step": 4192 + }, + { + "epoch": 1.1151595744680851, + "grad_norm": 3.305906295776367, + "learning_rate": 9.203542232409263e-06, + "loss": 0.6931, + "step": 4193 + }, + { + "epoch": 1.1154255319148936, + "grad_norm": 4.138253688812256, + "learning_rate": 9.203065925812955e-06, + "loss": 0.7971, + "step": 4194 + }, + { + "epoch": 1.115691489361702, + "grad_norm": 4.11892557144165, + "learning_rate": 9.20258948916884e-06, + "loss": 0.7175, + "step": 4195 + }, + { + "epoch": 1.1159574468085107, + "grad_norm": 3.4274680614471436, + "learning_rate": 9.202112922491657e-06, + "loss": 0.7685, + "step": 4196 + }, + { + "epoch": 1.1162234042553192, + "grad_norm": 3.894113540649414, + "learning_rate": 9.201636225796151e-06, + "loss": 0.6782, + "step": 4197 + }, + { + "epoch": 1.1164893617021276, + "grad_norm": 4.417131423950195, + "learning_rate": 9.201159399097077e-06, + "loss": 0.7756, + "step": 4198 + }, + { + "epoch": 1.116755319148936, + "grad_norm": 4.476882457733154, + "learning_rate": 9.200682442409183e-06, + "loss": 0.8896, + "step": 4199 + }, + { + "epoch": 1.1170212765957448, + "grad_norm": 3.9255595207214355, + "learning_rate": 9.200205355747228e-06, + "loss": 0.669, + "step": 4200 + }, + { + "epoch": 1.1172872340425533, + "grad_norm": 3.3451404571533203, + "learning_rate": 9.199728139125976e-06, + "loss": 0.6271, + "step": 4201 + }, + { + "epoch": 1.1175531914893617, + "grad_norm": 4.113248825073242, + "learning_rate": 9.199250792560187e-06, + "loss": 0.8501, + "step": 4202 + }, + { + "epoch": 1.1178191489361702, + "grad_norm": 3.8352253437042236, + "learning_rate": 9.198773316064639e-06, + "loss": 0.6881, + "step": 4203 + }, + { + "epoch": 1.1180851063829786, + "grad_norm": 3.8396568298339844, + "learning_rate": 9.1982957096541e-06, + "loss": 0.695, + "step": 4204 + }, + { + "epoch": 1.1183510638297873, + "grad_norm": 4.240661144256592, + "learning_rate": 9.197817973343347e-06, + "loss": 0.8287, + "step": 4205 + }, + { + "epoch": 1.1186170212765958, + "grad_norm": 3.553846836090088, + "learning_rate": 9.197340107147166e-06, + "loss": 0.7441, + "step": 4206 + }, + { + "epoch": 1.1188829787234043, + "grad_norm": 4.087765693664551, + "learning_rate": 9.196862111080339e-06, + "loss": 0.6896, + "step": 4207 + }, + { + "epoch": 1.1191489361702127, + "grad_norm": 4.254801273345947, + "learning_rate": 9.196383985157657e-06, + "loss": 0.794, + "step": 4208 + }, + { + "epoch": 1.1194148936170212, + "grad_norm": 3.8654487133026123, + "learning_rate": 9.195905729393913e-06, + "loss": 0.7891, + "step": 4209 + }, + { + "epoch": 1.1196808510638299, + "grad_norm": 4.078755855560303, + "learning_rate": 9.195427343803906e-06, + "loss": 0.9686, + 
"step": 4210 + }, + { + "epoch": 1.1199468085106383, + "grad_norm": 3.3730618953704834, + "learning_rate": 9.19494882840244e-06, + "loss": 0.7186, + "step": 4211 + }, + { + "epoch": 1.1202127659574468, + "grad_norm": 3.944267511367798, + "learning_rate": 9.194470183204315e-06, + "loss": 0.7949, + "step": 4212 + }, + { + "epoch": 1.1204787234042553, + "grad_norm": 3.8274521827697754, + "learning_rate": 9.193991408224347e-06, + "loss": 0.8237, + "step": 4213 + }, + { + "epoch": 1.1207446808510637, + "grad_norm": 3.8445777893066406, + "learning_rate": 9.193512503477345e-06, + "loss": 0.7119, + "step": 4214 + }, + { + "epoch": 1.1210106382978724, + "grad_norm": 4.098488807678223, + "learning_rate": 9.19303346897813e-06, + "loss": 0.9102, + "step": 4215 + }, + { + "epoch": 1.1212765957446809, + "grad_norm": 4.096566200256348, + "learning_rate": 9.192554304741522e-06, + "loss": 0.8465, + "step": 4216 + }, + { + "epoch": 1.1215425531914893, + "grad_norm": 3.770343065261841, + "learning_rate": 9.192075010782348e-06, + "loss": 0.8278, + "step": 4217 + }, + { + "epoch": 1.1218085106382978, + "grad_norm": 3.843766689300537, + "learning_rate": 9.191595587115439e-06, + "loss": 0.8402, + "step": 4218 + }, + { + "epoch": 1.1220744680851065, + "grad_norm": 4.594594478607178, + "learning_rate": 9.191116033755625e-06, + "loss": 0.8473, + "step": 4219 + }, + { + "epoch": 1.122340425531915, + "grad_norm": 4.192259311676025, + "learning_rate": 9.190636350717747e-06, + "loss": 0.8356, + "step": 4220 + }, + { + "epoch": 1.1226063829787234, + "grad_norm": 3.919210195541382, + "learning_rate": 9.190156538016648e-06, + "loss": 0.8494, + "step": 4221 + }, + { + "epoch": 1.1228723404255319, + "grad_norm": 4.091637134552002, + "learning_rate": 9.189676595667172e-06, + "loss": 0.7264, + "step": 4222 + }, + { + "epoch": 1.1231382978723405, + "grad_norm": 4.496889114379883, + "learning_rate": 9.189196523684168e-06, + "loss": 0.876, + "step": 4223 + }, + { + "epoch": 1.123404255319149, + "grad_norm": 3.492234230041504, + "learning_rate": 9.188716322082494e-06, + "loss": 0.7568, + "step": 4224 + }, + { + "epoch": 1.1236702127659575, + "grad_norm": 3.6598973274230957, + "learning_rate": 9.188235990877004e-06, + "loss": 0.683, + "step": 4225 + }, + { + "epoch": 1.123936170212766, + "grad_norm": 4.073709964752197, + "learning_rate": 9.18775553008256e-06, + "loss": 0.7798, + "step": 4226 + }, + { + "epoch": 1.1242021276595744, + "grad_norm": 4.100635528564453, + "learning_rate": 9.18727493971403e-06, + "loss": 0.8356, + "step": 4227 + }, + { + "epoch": 1.124468085106383, + "grad_norm": 4.231848239898682, + "learning_rate": 9.186794219786285e-06, + "loss": 0.8528, + "step": 4228 + }, + { + "epoch": 1.1247340425531915, + "grad_norm": 3.7461369037628174, + "learning_rate": 9.186313370314196e-06, + "loss": 0.7103, + "step": 4229 + }, + { + "epoch": 1.125, + "grad_norm": 3.610039234161377, + "learning_rate": 9.185832391312644e-06, + "loss": 0.7271, + "step": 4230 + }, + { + "epoch": 1.1252659574468085, + "grad_norm": 3.5538463592529297, + "learning_rate": 9.18535128279651e-06, + "loss": 0.82, + "step": 4231 + }, + { + "epoch": 1.125531914893617, + "grad_norm": 3.878833293914795, + "learning_rate": 9.184870044780677e-06, + "loss": 0.8418, + "step": 4232 + }, + { + "epoch": 1.1257978723404256, + "grad_norm": 4.012277126312256, + "learning_rate": 9.184388677280038e-06, + "loss": 0.8024, + "step": 4233 + }, + { + "epoch": 1.126063829787234, + "grad_norm": 3.702630043029785, + "learning_rate": 9.183907180309489e-06, + "loss": 0.7978, + 
"step": 4234 + }, + { + "epoch": 1.1263297872340425, + "grad_norm": 4.186684608459473, + "learning_rate": 9.183425553883925e-06, + "loss": 0.8459, + "step": 4235 + }, + { + "epoch": 1.126595744680851, + "grad_norm": 4.011842727661133, + "learning_rate": 9.18294379801825e-06, + "loss": 0.7931, + "step": 4236 + }, + { + "epoch": 1.1268617021276595, + "grad_norm": 4.870151042938232, + "learning_rate": 9.182461912727368e-06, + "loss": 0.9028, + "step": 4237 + }, + { + "epoch": 1.1271276595744681, + "grad_norm": 3.5846457481384277, + "learning_rate": 9.18197989802619e-06, + "loss": 0.783, + "step": 4238 + }, + { + "epoch": 1.1273936170212766, + "grad_norm": 3.910689115524292, + "learning_rate": 9.181497753929629e-06, + "loss": 0.8441, + "step": 4239 + }, + { + "epoch": 1.127659574468085, + "grad_norm": 3.768601894378662, + "learning_rate": 9.181015480452607e-06, + "loss": 0.8207, + "step": 4240 + }, + { + "epoch": 1.1279255319148935, + "grad_norm": 4.229056358337402, + "learning_rate": 9.18053307761004e-06, + "loss": 0.8025, + "step": 4241 + }, + { + "epoch": 1.1281914893617022, + "grad_norm": 4.3545050621032715, + "learning_rate": 9.180050545416861e-06, + "loss": 0.8154, + "step": 4242 + }, + { + "epoch": 1.1284574468085107, + "grad_norm": 4.138397693634033, + "learning_rate": 9.179567883887997e-06, + "loss": 0.8033, + "step": 4243 + }, + { + "epoch": 1.1287234042553191, + "grad_norm": 3.9504189491271973, + "learning_rate": 9.17908509303838e-06, + "loss": 0.85, + "step": 4244 + }, + { + "epoch": 1.1289893617021276, + "grad_norm": 3.9662301540374756, + "learning_rate": 9.178602172882951e-06, + "loss": 0.8327, + "step": 4245 + }, + { + "epoch": 1.1292553191489363, + "grad_norm": 4.157631874084473, + "learning_rate": 9.178119123436651e-06, + "loss": 0.8558, + "step": 4246 + }, + { + "epoch": 1.1295212765957447, + "grad_norm": 3.9172611236572266, + "learning_rate": 9.177635944714424e-06, + "loss": 0.9087, + "step": 4247 + }, + { + "epoch": 1.1297872340425532, + "grad_norm": 3.9250762462615967, + "learning_rate": 9.177152636731225e-06, + "loss": 0.7709, + "step": 4248 + }, + { + "epoch": 1.1300531914893617, + "grad_norm": 3.6299500465393066, + "learning_rate": 9.176669199502004e-06, + "loss": 0.717, + "step": 4249 + }, + { + "epoch": 1.1303191489361701, + "grad_norm": 4.225446701049805, + "learning_rate": 9.17618563304172e-06, + "loss": 0.8766, + "step": 4250 + }, + { + "epoch": 1.1305851063829788, + "grad_norm": 3.9178264141082764, + "learning_rate": 9.175701937365337e-06, + "loss": 0.7634, + "step": 4251 + }, + { + "epoch": 1.1308510638297873, + "grad_norm": 3.905505657196045, + "learning_rate": 9.175218112487821e-06, + "loss": 0.7784, + "step": 4252 + }, + { + "epoch": 1.1311170212765957, + "grad_norm": 4.228585243225098, + "learning_rate": 9.174734158424138e-06, + "loss": 0.8445, + "step": 4253 + }, + { + "epoch": 1.1313829787234042, + "grad_norm": 3.9836041927337646, + "learning_rate": 9.174250075189268e-06, + "loss": 0.8252, + "step": 4254 + }, + { + "epoch": 1.1316489361702127, + "grad_norm": 4.349749565124512, + "learning_rate": 9.173765862798185e-06, + "loss": 0.8154, + "step": 4255 + }, + { + "epoch": 1.1319148936170214, + "grad_norm": 3.7815349102020264, + "learning_rate": 9.17328152126587e-06, + "loss": 0.7356, + "step": 4256 + }, + { + "epoch": 1.1321808510638298, + "grad_norm": 3.9180119037628174, + "learning_rate": 9.172797050607313e-06, + "loss": 0.8098, + "step": 4257 + }, + { + "epoch": 1.1324468085106383, + "grad_norm": 3.720789670944214, + "learning_rate": 9.172312450837504e-06, 
+ "loss": 0.815, + "step": 4258 + }, + { + "epoch": 1.1327127659574467, + "grad_norm": 4.155251502990723, + "learning_rate": 9.171827721971434e-06, + "loss": 0.8976, + "step": 4259 + }, + { + "epoch": 1.1329787234042552, + "grad_norm": 4.600409030914307, + "learning_rate": 9.171342864024103e-06, + "loss": 0.8868, + "step": 4260 + }, + { + "epoch": 1.133244680851064, + "grad_norm": 3.8379268646240234, + "learning_rate": 9.170857877010512e-06, + "loss": 0.7867, + "step": 4261 + }, + { + "epoch": 1.1335106382978724, + "grad_norm": 4.109460830688477, + "learning_rate": 9.170372760945668e-06, + "loss": 0.7826, + "step": 4262 + }, + { + "epoch": 1.1337765957446808, + "grad_norm": 3.895494222640991, + "learning_rate": 9.16988751584458e-06, + "loss": 0.854, + "step": 4263 + }, + { + "epoch": 1.1340425531914893, + "grad_norm": 3.7237160205841064, + "learning_rate": 9.169402141722264e-06, + "loss": 0.7098, + "step": 4264 + }, + { + "epoch": 1.134308510638298, + "grad_norm": 4.19631814956665, + "learning_rate": 9.168916638593736e-06, + "loss": 0.9218, + "step": 4265 + }, + { + "epoch": 1.1345744680851064, + "grad_norm": 4.052074909210205, + "learning_rate": 9.168431006474018e-06, + "loss": 0.8367, + "step": 4266 + }, + { + "epoch": 1.1348404255319149, + "grad_norm": 4.097432613372803, + "learning_rate": 9.167945245378139e-06, + "loss": 0.8705, + "step": 4267 + }, + { + "epoch": 1.1351063829787233, + "grad_norm": 3.81488037109375, + "learning_rate": 9.167459355321127e-06, + "loss": 0.6803, + "step": 4268 + }, + { + "epoch": 1.135372340425532, + "grad_norm": 4.266942501068115, + "learning_rate": 9.166973336318015e-06, + "loss": 0.8108, + "step": 4269 + }, + { + "epoch": 1.1356382978723405, + "grad_norm": 3.9824750423431396, + "learning_rate": 9.166487188383841e-06, + "loss": 0.811, + "step": 4270 + }, + { + "epoch": 1.135904255319149, + "grad_norm": 3.8896446228027344, + "learning_rate": 9.16600091153365e-06, + "loss": 0.8925, + "step": 4271 + }, + { + "epoch": 1.1361702127659574, + "grad_norm": 4.690064907073975, + "learning_rate": 9.165514505782484e-06, + "loss": 1.1356, + "step": 4272 + }, + { + "epoch": 1.1364361702127659, + "grad_norm": 4.304286479949951, + "learning_rate": 9.165027971145397e-06, + "loss": 0.8041, + "step": 4273 + }, + { + "epoch": 1.1367021276595746, + "grad_norm": 4.315762519836426, + "learning_rate": 9.16454130763744e-06, + "loss": 0.7519, + "step": 4274 + }, + { + "epoch": 1.136968085106383, + "grad_norm": 4.10341739654541, + "learning_rate": 9.16405451527367e-06, + "loss": 0.919, + "step": 4275 + }, + { + "epoch": 1.1372340425531915, + "grad_norm": 3.7802481651306152, + "learning_rate": 9.163567594069154e-06, + "loss": 0.8271, + "step": 4276 + }, + { + "epoch": 1.1375, + "grad_norm": 4.523904323577881, + "learning_rate": 9.163080544038953e-06, + "loss": 0.7865, + "step": 4277 + }, + { + "epoch": 1.1377659574468084, + "grad_norm": 3.958662509918213, + "learning_rate": 9.162593365198138e-06, + "loss": 0.8165, + "step": 4278 + }, + { + "epoch": 1.138031914893617, + "grad_norm": 3.8943662643432617, + "learning_rate": 9.162106057561784e-06, + "loss": 0.7951, + "step": 4279 + }, + { + "epoch": 1.1382978723404256, + "grad_norm": 3.9076874256134033, + "learning_rate": 9.161618621144967e-06, + "loss": 0.8135, + "step": 4280 + }, + { + "epoch": 1.138563829787234, + "grad_norm": 3.5434067249298096, + "learning_rate": 9.161131055962773e-06, + "loss": 0.7228, + "step": 4281 + }, + { + "epoch": 1.1388297872340425, + "grad_norm": 4.137996673583984, + "learning_rate": 9.160643362030284e-06, 
+ "loss": 0.7711, + "step": 4282 + }, + { + "epoch": 1.139095744680851, + "grad_norm": 3.783001661300659, + "learning_rate": 9.160155539362589e-06, + "loss": 0.8494, + "step": 4283 + }, + { + "epoch": 1.1393617021276596, + "grad_norm": 3.8411149978637695, + "learning_rate": 9.159667587974786e-06, + "loss": 0.7447, + "step": 4284 + }, + { + "epoch": 1.139627659574468, + "grad_norm": 3.6387648582458496, + "learning_rate": 9.15917950788197e-06, + "loss": 0.8385, + "step": 4285 + }, + { + "epoch": 1.1398936170212766, + "grad_norm": 4.564189910888672, + "learning_rate": 9.158691299099241e-06, + "loss": 0.7572, + "step": 4286 + }, + { + "epoch": 1.140159574468085, + "grad_norm": 4.022932529449463, + "learning_rate": 9.15820296164171e-06, + "loss": 0.7129, + "step": 4287 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 4.345612525939941, + "learning_rate": 9.157714495524481e-06, + "loss": 0.8371, + "step": 4288 + }, + { + "epoch": 1.1406914893617022, + "grad_norm": 4.161421298980713, + "learning_rate": 9.157225900762672e-06, + "loss": 0.7528, + "step": 4289 + }, + { + "epoch": 1.1409574468085106, + "grad_norm": 4.042864799499512, + "learning_rate": 9.156737177371399e-06, + "loss": 0.8491, + "step": 4290 + }, + { + "epoch": 1.141223404255319, + "grad_norm": 3.8026928901672363, + "learning_rate": 9.156248325365782e-06, + "loss": 0.8444, + "step": 4291 + }, + { + "epoch": 1.1414893617021278, + "grad_norm": 4.251069068908691, + "learning_rate": 9.15575934476095e-06, + "loss": 0.7857, + "step": 4292 + }, + { + "epoch": 1.1417553191489362, + "grad_norm": 3.8531103134155273, + "learning_rate": 9.155270235572031e-06, + "loss": 0.867, + "step": 4293 + }, + { + "epoch": 1.1420212765957447, + "grad_norm": 3.975175142288208, + "learning_rate": 9.15478099781416e-06, + "loss": 0.808, + "step": 4294 + }, + { + "epoch": 1.1422872340425532, + "grad_norm": 3.695078134536743, + "learning_rate": 9.154291631502471e-06, + "loss": 0.7942, + "step": 4295 + }, + { + "epoch": 1.1425531914893616, + "grad_norm": 3.8435237407684326, + "learning_rate": 9.15380213665211e-06, + "loss": 0.8701, + "step": 4296 + }, + { + "epoch": 1.1428191489361703, + "grad_norm": 3.642451047897339, + "learning_rate": 9.153312513278219e-06, + "loss": 0.7479, + "step": 4297 + }, + { + "epoch": 1.1430851063829788, + "grad_norm": 3.8612117767333984, + "learning_rate": 9.15282276139595e-06, + "loss": 0.8394, + "step": 4298 + }, + { + "epoch": 1.1433510638297872, + "grad_norm": 3.818319082260132, + "learning_rate": 9.152332881020454e-06, + "loss": 0.789, + "step": 4299 + }, + { + "epoch": 1.1436170212765957, + "grad_norm": 3.6774802207946777, + "learning_rate": 9.15184287216689e-06, + "loss": 0.7991, + "step": 4300 + }, + { + "epoch": 1.1438829787234042, + "grad_norm": 4.338614463806152, + "learning_rate": 9.15135273485042e-06, + "loss": 0.8602, + "step": 4301 + }, + { + "epoch": 1.1441489361702128, + "grad_norm": 3.9688498973846436, + "learning_rate": 9.15086246908621e-06, + "loss": 0.7759, + "step": 4302 + }, + { + "epoch": 1.1444148936170213, + "grad_norm": 3.848708152770996, + "learning_rate": 9.150372074889427e-06, + "loss": 0.7635, + "step": 4303 + }, + { + "epoch": 1.1446808510638298, + "grad_norm": 4.042501926422119, + "learning_rate": 9.149881552275244e-06, + "loss": 0.8029, + "step": 4304 + }, + { + "epoch": 1.1449468085106382, + "grad_norm": 4.199094772338867, + "learning_rate": 9.149390901258841e-06, + "loss": 0.8343, + "step": 4305 + }, + { + "epoch": 1.1452127659574467, + "grad_norm": 4.045470714569092, + "learning_rate": 
9.1489001218554e-06, + "loss": 0.831, + "step": 4306 + }, + { + "epoch": 1.1454787234042554, + "grad_norm": 3.7915914058685303, + "learning_rate": 9.148409214080103e-06, + "loss": 0.8476, + "step": 4307 + }, + { + "epoch": 1.1457446808510638, + "grad_norm": 3.7452378273010254, + "learning_rate": 9.14791817794814e-06, + "loss": 0.776, + "step": 4308 + }, + { + "epoch": 1.1460106382978723, + "grad_norm": 3.521505355834961, + "learning_rate": 9.147427013474706e-06, + "loss": 0.6753, + "step": 4309 + }, + { + "epoch": 1.1462765957446808, + "grad_norm": 3.906930923461914, + "learning_rate": 9.146935720674996e-06, + "loss": 0.6909, + "step": 4310 + }, + { + "epoch": 1.1465425531914895, + "grad_norm": 4.262080192565918, + "learning_rate": 9.146444299564215e-06, + "loss": 0.8444, + "step": 4311 + }, + { + "epoch": 1.146808510638298, + "grad_norm": 4.085954666137695, + "learning_rate": 9.145952750157563e-06, + "loss": 0.7587, + "step": 4312 + }, + { + "epoch": 1.1470744680851064, + "grad_norm": 3.9519617557525635, + "learning_rate": 9.145461072470253e-06, + "loss": 0.8757, + "step": 4313 + }, + { + "epoch": 1.1473404255319148, + "grad_norm": 4.349664211273193, + "learning_rate": 9.144969266517495e-06, + "loss": 0.7766, + "step": 4314 + }, + { + "epoch": 1.1476063829787235, + "grad_norm": 5.140100955963135, + "learning_rate": 9.144477332314509e-06, + "loss": 0.9414, + "step": 4315 + }, + { + "epoch": 1.147872340425532, + "grad_norm": 3.641763210296631, + "learning_rate": 9.143985269876516e-06, + "loss": 0.7562, + "step": 4316 + }, + { + "epoch": 1.1481382978723405, + "grad_norm": 3.641606092453003, + "learning_rate": 9.143493079218738e-06, + "loss": 0.7992, + "step": 4317 + }, + { + "epoch": 1.148404255319149, + "grad_norm": 4.611671447753906, + "learning_rate": 9.143000760356407e-06, + "loss": 0.8306, + "step": 4318 + }, + { + "epoch": 1.1486702127659574, + "grad_norm": 3.4973011016845703, + "learning_rate": 9.142508313304754e-06, + "loss": 0.7915, + "step": 4319 + }, + { + "epoch": 1.148936170212766, + "grad_norm": 3.9405927658081055, + "learning_rate": 9.142015738079017e-06, + "loss": 0.8279, + "step": 4320 + }, + { + "epoch": 1.1492021276595745, + "grad_norm": 4.37050199508667, + "learning_rate": 9.141523034694436e-06, + "loss": 0.8506, + "step": 4321 + }, + { + "epoch": 1.149468085106383, + "grad_norm": 4.181821346282959, + "learning_rate": 9.141030203166256e-06, + "loss": 0.8439, + "step": 4322 + }, + { + "epoch": 1.1497340425531914, + "grad_norm": 3.8523123264312744, + "learning_rate": 9.140537243509729e-06, + "loss": 0.7565, + "step": 4323 + }, + { + "epoch": 1.15, + "grad_norm": 3.5637168884277344, + "learning_rate": 9.140044155740102e-06, + "loss": 0.7406, + "step": 4324 + }, + { + "epoch": 1.1502659574468086, + "grad_norm": 3.8401317596435547, + "learning_rate": 9.139550939872635e-06, + "loss": 0.8231, + "step": 4325 + }, + { + "epoch": 1.150531914893617, + "grad_norm": 4.033459186553955, + "learning_rate": 9.139057595922587e-06, + "loss": 0.7585, + "step": 4326 + }, + { + "epoch": 1.1507978723404255, + "grad_norm": 4.144162654876709, + "learning_rate": 9.138564123905225e-06, + "loss": 0.8237, + "step": 4327 + }, + { + "epoch": 1.151063829787234, + "grad_norm": 4.219383716583252, + "learning_rate": 9.138070523835816e-06, + "loss": 0.793, + "step": 4328 + }, + { + "epoch": 1.1513297872340424, + "grad_norm": 4.144248962402344, + "learning_rate": 9.137576795729635e-06, + "loss": 0.743, + "step": 4329 + }, + { + "epoch": 1.1515957446808511, + "grad_norm": 3.836845636367798, + "learning_rate": 
9.137082939601953e-06, + "loss": 0.7829, + "step": 4330 + }, + { + "epoch": 1.1518617021276596, + "grad_norm": 3.8342814445495605, + "learning_rate": 9.136588955468057e-06, + "loss": 0.7298, + "step": 4331 + }, + { + "epoch": 1.152127659574468, + "grad_norm": 3.852695941925049, + "learning_rate": 9.136094843343228e-06, + "loss": 0.8051, + "step": 4332 + }, + { + "epoch": 1.1523936170212765, + "grad_norm": 3.9740166664123535, + "learning_rate": 9.135600603242753e-06, + "loss": 0.8096, + "step": 4333 + }, + { + "epoch": 1.1526595744680852, + "grad_norm": 4.557644367218018, + "learning_rate": 9.13510623518193e-06, + "loss": 0.8826, + "step": 4334 + }, + { + "epoch": 1.1529255319148937, + "grad_norm": 4.095839500427246, + "learning_rate": 9.13461173917605e-06, + "loss": 0.7624, + "step": 4335 + }, + { + "epoch": 1.1531914893617021, + "grad_norm": 3.6598823070526123, + "learning_rate": 9.134117115240412e-06, + "loss": 0.6786, + "step": 4336 + }, + { + "epoch": 1.1534574468085106, + "grad_norm": 4.052873611450195, + "learning_rate": 9.133622363390326e-06, + "loss": 0.7476, + "step": 4337 + }, + { + "epoch": 1.1537234042553193, + "grad_norm": 3.892709255218506, + "learning_rate": 9.133127483641096e-06, + "loss": 0.7902, + "step": 4338 + }, + { + "epoch": 1.1539893617021277, + "grad_norm": 4.127117156982422, + "learning_rate": 9.132632476008036e-06, + "loss": 0.8427, + "step": 4339 + }, + { + "epoch": 1.1542553191489362, + "grad_norm": 3.911402463912964, + "learning_rate": 9.132137340506464e-06, + "loss": 0.744, + "step": 4340 + }, + { + "epoch": 1.1545212765957447, + "grad_norm": 4.6202826499938965, + "learning_rate": 9.131642077151695e-06, + "loss": 0.816, + "step": 4341 + }, + { + "epoch": 1.1547872340425531, + "grad_norm": 3.967888593673706, + "learning_rate": 9.131146685959055e-06, + "loss": 0.8608, + "step": 4342 + }, + { + "epoch": 1.1550531914893618, + "grad_norm": 3.7461965084075928, + "learning_rate": 9.130651166943875e-06, + "loss": 0.8002, + "step": 4343 + }, + { + "epoch": 1.1553191489361703, + "grad_norm": 3.893925666809082, + "learning_rate": 9.130155520121484e-06, + "loss": 0.7651, + "step": 4344 + }, + { + "epoch": 1.1555851063829787, + "grad_norm": 4.108353614807129, + "learning_rate": 9.129659745507219e-06, + "loss": 0.847, + "step": 4345 + }, + { + "epoch": 1.1558510638297872, + "grad_norm": 3.766580104827881, + "learning_rate": 9.129163843116417e-06, + "loss": 0.7361, + "step": 4346 + }, + { + "epoch": 1.1561170212765957, + "grad_norm": 4.005224227905273, + "learning_rate": 9.128667812964428e-06, + "loss": 0.846, + "step": 4347 + }, + { + "epoch": 1.1563829787234043, + "grad_norm": 4.085299491882324, + "learning_rate": 9.128171655066592e-06, + "loss": 0.7435, + "step": 4348 + }, + { + "epoch": 1.1566489361702128, + "grad_norm": 3.649341583251953, + "learning_rate": 9.127675369438267e-06, + "loss": 0.7848, + "step": 4349 + }, + { + "epoch": 1.1569148936170213, + "grad_norm": 4.286210536956787, + "learning_rate": 9.127178956094805e-06, + "loss": 0.8657, + "step": 4350 + }, + { + "epoch": 1.1571808510638297, + "grad_norm": 3.8484995365142822, + "learning_rate": 9.12668241505157e-06, + "loss": 0.7356, + "step": 4351 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 3.80110239982605, + "learning_rate": 9.12618574632392e-06, + "loss": 0.8581, + "step": 4352 + }, + { + "epoch": 1.1577127659574469, + "grad_norm": 4.16612434387207, + "learning_rate": 9.125688949927223e-06, + "loss": 0.9135, + "step": 4353 + }, + { + "epoch": 1.1579787234042553, + "grad_norm": 4.107837677001953, + 
"learning_rate": 9.125192025876855e-06, + "loss": 0.8993, + "step": 4354 + }, + { + "epoch": 1.1582446808510638, + "grad_norm": 3.7631843090057373, + "learning_rate": 9.124694974188188e-06, + "loss": 0.7997, + "step": 4355 + }, + { + "epoch": 1.1585106382978723, + "grad_norm": 4.244007587432861, + "learning_rate": 9.124197794876604e-06, + "loss": 0.806, + "step": 4356 + }, + { + "epoch": 1.1587765957446807, + "grad_norm": 3.4537291526794434, + "learning_rate": 9.123700487957484e-06, + "loss": 0.7259, + "step": 4357 + }, + { + "epoch": 1.1590425531914894, + "grad_norm": 4.083813667297363, + "learning_rate": 9.123203053446215e-06, + "loss": 0.7935, + "step": 4358 + }, + { + "epoch": 1.1593085106382979, + "grad_norm": 3.842515707015991, + "learning_rate": 9.12270549135819e-06, + "loss": 0.8403, + "step": 4359 + }, + { + "epoch": 1.1595744680851063, + "grad_norm": 3.8198819160461426, + "learning_rate": 9.122207801708802e-06, + "loss": 0.8035, + "step": 4360 + }, + { + "epoch": 1.1598404255319148, + "grad_norm": 4.05394172668457, + "learning_rate": 9.121709984513453e-06, + "loss": 0.6678, + "step": 4361 + }, + { + "epoch": 1.1601063829787235, + "grad_norm": 3.8895061016082764, + "learning_rate": 9.121212039787543e-06, + "loss": 0.7822, + "step": 4362 + }, + { + "epoch": 1.160372340425532, + "grad_norm": 4.040393829345703, + "learning_rate": 9.12071396754648e-06, + "loss": 0.8669, + "step": 4363 + }, + { + "epoch": 1.1606382978723404, + "grad_norm": 3.8143858909606934, + "learning_rate": 9.120215767805677e-06, + "loss": 0.9251, + "step": 4364 + }, + { + "epoch": 1.1609042553191489, + "grad_norm": 3.8011443614959717, + "learning_rate": 9.119717440580547e-06, + "loss": 0.7142, + "step": 4365 + }, + { + "epoch": 1.1611702127659576, + "grad_norm": 4.147587776184082, + "learning_rate": 9.119218985886506e-06, + "loss": 0.8196, + "step": 4366 + }, + { + "epoch": 1.161436170212766, + "grad_norm": 4.035295009613037, + "learning_rate": 9.118720403738984e-06, + "loss": 0.9006, + "step": 4367 + }, + { + "epoch": 1.1617021276595745, + "grad_norm": 4.253767967224121, + "learning_rate": 9.118221694153401e-06, + "loss": 0.9149, + "step": 4368 + }, + { + "epoch": 1.161968085106383, + "grad_norm": 3.7400970458984375, + "learning_rate": 9.11772285714519e-06, + "loss": 0.847, + "step": 4369 + }, + { + "epoch": 1.1622340425531914, + "grad_norm": 4.12266731262207, + "learning_rate": 9.117223892729788e-06, + "loss": 0.8159, + "step": 4370 + }, + { + "epoch": 1.1625, + "grad_norm": 3.939617395401001, + "learning_rate": 9.11672480092263e-06, + "loss": 0.8515, + "step": 4371 + }, + { + "epoch": 1.1627659574468086, + "grad_norm": 3.597660541534424, + "learning_rate": 9.11622558173916e-06, + "loss": 0.7139, + "step": 4372 + }, + { + "epoch": 1.163031914893617, + "grad_norm": 3.8929126262664795, + "learning_rate": 9.115726235194825e-06, + "loss": 0.755, + "step": 4373 + }, + { + "epoch": 1.1632978723404255, + "grad_norm": 3.9748990535736084, + "learning_rate": 9.115226761305071e-06, + "loss": 0.9779, + "step": 4374 + }, + { + "epoch": 1.163563829787234, + "grad_norm": 3.6702117919921875, + "learning_rate": 9.11472716008536e-06, + "loss": 0.7913, + "step": 4375 + }, + { + "epoch": 1.1638297872340426, + "grad_norm": 3.5676674842834473, + "learning_rate": 9.114227431551144e-06, + "loss": 0.8714, + "step": 4376 + }, + { + "epoch": 1.164095744680851, + "grad_norm": 3.871457576751709, + "learning_rate": 9.113727575717887e-06, + "loss": 0.7551, + "step": 4377 + }, + { + "epoch": 1.1643617021276595, + "grad_norm": 
3.709536552429199, + "learning_rate": 9.113227592601057e-06, + "loss": 0.7476, + "step": 4378 + }, + { + "epoch": 1.164627659574468, + "grad_norm": 4.048936367034912, + "learning_rate": 9.112727482216123e-06, + "loss": 0.822, + "step": 4379 + }, + { + "epoch": 1.1648936170212765, + "grad_norm": 4.941551685333252, + "learning_rate": 9.112227244578557e-06, + "loss": 0.942, + "step": 4380 + }, + { + "epoch": 1.1651595744680852, + "grad_norm": 3.971956491470337, + "learning_rate": 9.111726879703839e-06, + "loss": 0.898, + "step": 4381 + }, + { + "epoch": 1.1654255319148936, + "grad_norm": 4.139491558074951, + "learning_rate": 9.111226387607452e-06, + "loss": 0.9185, + "step": 4382 + }, + { + "epoch": 1.165691489361702, + "grad_norm": 3.8217787742614746, + "learning_rate": 9.110725768304878e-06, + "loss": 0.8598, + "step": 4383 + }, + { + "epoch": 1.1659574468085105, + "grad_norm": 3.656966209411621, + "learning_rate": 9.11022502181161e-06, + "loss": 0.7433, + "step": 4384 + }, + { + "epoch": 1.1662234042553192, + "grad_norm": 4.29415225982666, + "learning_rate": 9.10972414814314e-06, + "loss": 0.7777, + "step": 4385 + }, + { + "epoch": 1.1664893617021277, + "grad_norm": 3.9143810272216797, + "learning_rate": 9.109223147314968e-06, + "loss": 0.678, + "step": 4386 + }, + { + "epoch": 1.1667553191489362, + "grad_norm": 4.056838512420654, + "learning_rate": 9.108722019342592e-06, + "loss": 0.6778, + "step": 4387 + }, + { + "epoch": 1.1670212765957446, + "grad_norm": 3.9018867015838623, + "learning_rate": 9.10822076424152e-06, + "loss": 0.8195, + "step": 4388 + }, + { + "epoch": 1.1672872340425533, + "grad_norm": 4.0093994140625, + "learning_rate": 9.10771938202726e-06, + "loss": 0.9474, + "step": 4389 + }, + { + "epoch": 1.1675531914893618, + "grad_norm": 4.224606037139893, + "learning_rate": 9.107217872715326e-06, + "loss": 0.7376, + "step": 4390 + }, + { + "epoch": 1.1678191489361702, + "grad_norm": 3.831489086151123, + "learning_rate": 9.106716236321236e-06, + "loss": 0.731, + "step": 4391 + }, + { + "epoch": 1.1680851063829787, + "grad_norm": 3.8180394172668457, + "learning_rate": 9.106214472860511e-06, + "loss": 0.7458, + "step": 4392 + }, + { + "epoch": 1.1683510638297872, + "grad_norm": 3.393148899078369, + "learning_rate": 9.105712582348676e-06, + "loss": 0.7216, + "step": 4393 + }, + { + "epoch": 1.1686170212765958, + "grad_norm": 4.6142964363098145, + "learning_rate": 9.105210564801259e-06, + "loss": 0.7643, + "step": 4394 + }, + { + "epoch": 1.1688829787234043, + "grad_norm": 4.428558826446533, + "learning_rate": 9.104708420233794e-06, + "loss": 0.8364, + "step": 4395 + }, + { + "epoch": 1.1691489361702128, + "grad_norm": 4.209799766540527, + "learning_rate": 9.104206148661819e-06, + "loss": 0.7965, + "step": 4396 + }, + { + "epoch": 1.1694148936170212, + "grad_norm": 4.0707831382751465, + "learning_rate": 9.10370375010087e-06, + "loss": 0.7676, + "step": 4397 + }, + { + "epoch": 1.1696808510638297, + "grad_norm": 3.684016227722168, + "learning_rate": 9.103201224566499e-06, + "loss": 0.8018, + "step": 4398 + }, + { + "epoch": 1.1699468085106384, + "grad_norm": 4.157726287841797, + "learning_rate": 9.10269857207425e-06, + "loss": 0.8431, + "step": 4399 + }, + { + "epoch": 1.1702127659574468, + "grad_norm": 3.866776704788208, + "learning_rate": 9.102195792639677e-06, + "loss": 0.9013, + "step": 4400 + }, + { + "epoch": 1.1704787234042553, + "grad_norm": 3.8174455165863037, + "learning_rate": 9.101692886278336e-06, + "loss": 0.8174, + "step": 4401 + }, + { + "epoch": 1.1707446808510638, + 
"grad_norm": 4.051540851593018, + "learning_rate": 9.101189853005788e-06, + "loss": 0.8006, + "step": 4402 + }, + { + "epoch": 1.1710106382978722, + "grad_norm": 4.115768909454346, + "learning_rate": 9.100686692837598e-06, + "loss": 0.8905, + "step": 4403 + }, + { + "epoch": 1.171276595744681, + "grad_norm": 3.989694595336914, + "learning_rate": 9.100183405789334e-06, + "loss": 0.8763, + "step": 4404 + }, + { + "epoch": 1.1715425531914894, + "grad_norm": 3.5945072174072266, + "learning_rate": 9.099679991876567e-06, + "loss": 0.7173, + "step": 4405 + }, + { + "epoch": 1.1718085106382978, + "grad_norm": 3.627795934677124, + "learning_rate": 9.099176451114876e-06, + "loss": 0.7708, + "step": 4406 + }, + { + "epoch": 1.1720744680851063, + "grad_norm": 4.366139888763428, + "learning_rate": 9.098672783519837e-06, + "loss": 0.7882, + "step": 4407 + }, + { + "epoch": 1.172340425531915, + "grad_norm": 4.13855504989624, + "learning_rate": 9.098168989107038e-06, + "loss": 0.7776, + "step": 4408 + }, + { + "epoch": 1.1726063829787234, + "grad_norm": 3.8078205585479736, + "learning_rate": 9.097665067892066e-06, + "loss": 0.7194, + "step": 4409 + }, + { + "epoch": 1.172872340425532, + "grad_norm": 3.676452398300171, + "learning_rate": 9.09716101989051e-06, + "loss": 0.7386, + "step": 4410 + }, + { + "epoch": 1.1731382978723404, + "grad_norm": 4.525330066680908, + "learning_rate": 9.09665684511797e-06, + "loss": 0.8734, + "step": 4411 + }, + { + "epoch": 1.173404255319149, + "grad_norm": 4.38550329208374, + "learning_rate": 9.096152543590045e-06, + "loss": 0.8248, + "step": 4412 + }, + { + "epoch": 1.1736702127659575, + "grad_norm": 4.337765693664551, + "learning_rate": 9.095648115322336e-06, + "loss": 0.8992, + "step": 4413 + }, + { + "epoch": 1.173936170212766, + "grad_norm": 4.145912170410156, + "learning_rate": 9.095143560330453e-06, + "loss": 0.8119, + "step": 4414 + }, + { + "epoch": 1.1742021276595744, + "grad_norm": 3.5085721015930176, + "learning_rate": 9.094638878630007e-06, + "loss": 0.744, + "step": 4415 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 4.225882053375244, + "learning_rate": 9.094134070236614e-06, + "loss": 0.8368, + "step": 4416 + }, + { + "epoch": 1.1747340425531916, + "grad_norm": 4.2498273849487305, + "learning_rate": 9.09362913516589e-06, + "loss": 0.7281, + "step": 4417 + }, + { + "epoch": 1.175, + "grad_norm": 3.8343684673309326, + "learning_rate": 9.093124073433464e-06, + "loss": 0.8521, + "step": 4418 + }, + { + "epoch": 1.1752659574468085, + "grad_norm": 4.265048503875732, + "learning_rate": 9.092618885054958e-06, + "loss": 0.8624, + "step": 4419 + }, + { + "epoch": 1.175531914893617, + "grad_norm": 4.251501560211182, + "learning_rate": 9.092113570046005e-06, + "loss": 0.7163, + "step": 4420 + }, + { + "epoch": 1.1757978723404254, + "grad_norm": 3.9519202709198, + "learning_rate": 9.091608128422243e-06, + "loss": 0.8139, + "step": 4421 + }, + { + "epoch": 1.1760638297872341, + "grad_norm": 3.785550832748413, + "learning_rate": 9.091102560199306e-06, + "loss": 0.7897, + "step": 4422 + }, + { + "epoch": 1.1763297872340426, + "grad_norm": 4.2011260986328125, + "learning_rate": 9.090596865392838e-06, + "loss": 0.8119, + "step": 4423 + }, + { + "epoch": 1.176595744680851, + "grad_norm": 3.7419655323028564, + "learning_rate": 9.090091044018488e-06, + "loss": 0.64, + "step": 4424 + }, + { + "epoch": 1.1768617021276595, + "grad_norm": 3.561340093612671, + "learning_rate": 9.089585096091906e-06, + "loss": 0.7546, + "step": 4425 + }, + { + "epoch": 1.177127659574468, + 
"grad_norm": 3.971997022628784, + "learning_rate": 9.089079021628746e-06, + "loss": 0.8783, + "step": 4426 + }, + { + "epoch": 1.1773936170212767, + "grad_norm": 4.214608669281006, + "learning_rate": 9.088572820644667e-06, + "loss": 0.9312, + "step": 4427 + }, + { + "epoch": 1.1776595744680851, + "grad_norm": 3.867511749267578, + "learning_rate": 9.088066493155332e-06, + "loss": 0.9171, + "step": 4428 + }, + { + "epoch": 1.1779255319148936, + "grad_norm": 3.8267605304718018, + "learning_rate": 9.087560039176407e-06, + "loss": 0.7369, + "step": 4429 + }, + { + "epoch": 1.178191489361702, + "grad_norm": 3.9210994243621826, + "learning_rate": 9.08705345872356e-06, + "loss": 0.7975, + "step": 4430 + }, + { + "epoch": 1.1784574468085107, + "grad_norm": 3.820697069168091, + "learning_rate": 9.086546751812467e-06, + "loss": 0.7579, + "step": 4431 + }, + { + "epoch": 1.1787234042553192, + "grad_norm": 4.319027423858643, + "learning_rate": 9.086039918458806e-06, + "loss": 0.7671, + "step": 4432 + }, + { + "epoch": 1.1789893617021276, + "grad_norm": 3.768254280090332, + "learning_rate": 9.085532958678262e-06, + "loss": 0.7075, + "step": 4433 + }, + { + "epoch": 1.179255319148936, + "grad_norm": 3.8115556240081787, + "learning_rate": 9.085025872486516e-06, + "loss": 0.6844, + "step": 4434 + }, + { + "epoch": 1.1795212765957448, + "grad_norm": 3.6113126277923584, + "learning_rate": 9.08451865989926e-06, + "loss": 0.7161, + "step": 4435 + }, + { + "epoch": 1.1797872340425533, + "grad_norm": 4.16688871383667, + "learning_rate": 9.08401132093219e-06, + "loss": 0.8756, + "step": 4436 + }, + { + "epoch": 1.1800531914893617, + "grad_norm": 4.136419773101807, + "learning_rate": 9.083503855600997e-06, + "loss": 0.8072, + "step": 4437 + }, + { + "epoch": 1.1803191489361702, + "grad_norm": 4.0323357582092285, + "learning_rate": 9.08299626392139e-06, + "loss": 0.7889, + "step": 4438 + }, + { + "epoch": 1.1805851063829786, + "grad_norm": 3.848400354385376, + "learning_rate": 9.082488545909072e-06, + "loss": 0.8467, + "step": 4439 + }, + { + "epoch": 1.1808510638297873, + "grad_norm": 3.8820831775665283, + "learning_rate": 9.08198070157975e-06, + "loss": 0.7926, + "step": 4440 + }, + { + "epoch": 1.1811170212765958, + "grad_norm": 3.9585654735565186, + "learning_rate": 9.08147273094914e-06, + "loss": 0.8671, + "step": 4441 + }, + { + "epoch": 1.1813829787234043, + "grad_norm": 4.736848831176758, + "learning_rate": 9.080964634032958e-06, + "loss": 0.8953, + "step": 4442 + }, + { + "epoch": 1.1816489361702127, + "grad_norm": 4.1310343742370605, + "learning_rate": 9.080456410846926e-06, + "loss": 0.7878, + "step": 4443 + }, + { + "epoch": 1.1819148936170212, + "grad_norm": 3.701655149459839, + "learning_rate": 9.079948061406769e-06, + "loss": 0.7205, + "step": 4444 + }, + { + "epoch": 1.1821808510638299, + "grad_norm": 4.258152008056641, + "learning_rate": 9.079439585728214e-06, + "loss": 0.8573, + "step": 4445 + }, + { + "epoch": 1.1824468085106383, + "grad_norm": 4.08727502822876, + "learning_rate": 9.078930983826997e-06, + "loss": 0.8661, + "step": 4446 + }, + { + "epoch": 1.1827127659574468, + "grad_norm": 4.263191223144531, + "learning_rate": 9.078422255718852e-06, + "loss": 0.9975, + "step": 4447 + }, + { + "epoch": 1.1829787234042553, + "grad_norm": 3.8881144523620605, + "learning_rate": 9.07791340141952e-06, + "loss": 0.8825, + "step": 4448 + }, + { + "epoch": 1.1832446808510637, + "grad_norm": 4.034143924713135, + "learning_rate": 9.077404420944746e-06, + "loss": 0.7645, + "step": 4449 + }, + { + "epoch": 
1.1835106382978724, + "grad_norm": 3.6815900802612305, + "learning_rate": 9.076895314310282e-06, + "loss": 0.845, + "step": 4450 + }, + { + "epoch": 1.1837765957446809, + "grad_norm": 4.061761379241943, + "learning_rate": 9.076386081531873e-06, + "loss": 0.715, + "step": 4451 + }, + { + "epoch": 1.1840425531914893, + "grad_norm": 3.675588846206665, + "learning_rate": 9.075876722625281e-06, + "loss": 0.6865, + "step": 4452 + }, + { + "epoch": 1.1843085106382978, + "grad_norm": 3.922511577606201, + "learning_rate": 9.075367237606265e-06, + "loss": 0.8139, + "step": 4453 + }, + { + "epoch": 1.1845744680851065, + "grad_norm": 4.45919132232666, + "learning_rate": 9.074857626490587e-06, + "loss": 0.8832, + "step": 4454 + }, + { + "epoch": 1.184840425531915, + "grad_norm": 3.8306045532226562, + "learning_rate": 9.074347889294017e-06, + "loss": 0.775, + "step": 4455 + }, + { + "epoch": 1.1851063829787234, + "grad_norm": 4.380180358886719, + "learning_rate": 9.073838026032328e-06, + "loss": 0.8028, + "step": 4456 + }, + { + "epoch": 1.1853723404255319, + "grad_norm": 3.6403377056121826, + "learning_rate": 9.073328036721292e-06, + "loss": 0.7365, + "step": 4457 + }, + { + "epoch": 1.1856382978723405, + "grad_norm": 4.642416477203369, + "learning_rate": 9.072817921376692e-06, + "loss": 1.0456, + "step": 4458 + }, + { + "epoch": 1.185904255319149, + "grad_norm": 4.2514753341674805, + "learning_rate": 9.07230768001431e-06, + "loss": 0.8752, + "step": 4459 + }, + { + "epoch": 1.1861702127659575, + "grad_norm": 4.097993850708008, + "learning_rate": 9.071797312649934e-06, + "loss": 0.8805, + "step": 4460 + }, + { + "epoch": 1.186436170212766, + "grad_norm": 3.6704015731811523, + "learning_rate": 9.071286819299355e-06, + "loss": 0.7362, + "step": 4461 + }, + { + "epoch": 1.1867021276595744, + "grad_norm": 3.5198822021484375, + "learning_rate": 9.070776199978369e-06, + "loss": 0.6528, + "step": 4462 + }, + { + "epoch": 1.186968085106383, + "grad_norm": 4.044826507568359, + "learning_rate": 9.070265454702774e-06, + "loss": 0.785, + "step": 4463 + }, + { + "epoch": 1.1872340425531915, + "grad_norm": 3.775392770767212, + "learning_rate": 9.069754583488375e-06, + "loss": 0.7664, + "step": 4464 + }, + { + "epoch": 1.1875, + "grad_norm": 3.9251670837402344, + "learning_rate": 9.069243586350976e-06, + "loss": 0.7694, + "step": 4465 + }, + { + "epoch": 1.1877659574468085, + "grad_norm": 4.138858318328857, + "learning_rate": 9.06873246330639e-06, + "loss": 0.8734, + "step": 4466 + }, + { + "epoch": 1.188031914893617, + "grad_norm": 3.8749899864196777, + "learning_rate": 9.06822121437043e-06, + "loss": 0.7114, + "step": 4467 + }, + { + "epoch": 1.1882978723404256, + "grad_norm": 4.107519626617432, + "learning_rate": 9.067709839558917e-06, + "loss": 0.7998, + "step": 4468 + }, + { + "epoch": 1.188563829787234, + "grad_norm": 3.6962497234344482, + "learning_rate": 9.067198338887673e-06, + "loss": 0.8317, + "step": 4469 + }, + { + "epoch": 1.1888297872340425, + "grad_norm": 4.575094223022461, + "learning_rate": 9.066686712372524e-06, + "loss": 0.8399, + "step": 4470 + }, + { + "epoch": 1.189095744680851, + "grad_norm": 4.391597747802734, + "learning_rate": 9.0661749600293e-06, + "loss": 0.8801, + "step": 4471 + }, + { + "epoch": 1.1893617021276595, + "grad_norm": 3.650452136993408, + "learning_rate": 9.065663081873834e-06, + "loss": 0.7738, + "step": 4472 + }, + { + "epoch": 1.1896276595744681, + "grad_norm": 4.12108039855957, + "learning_rate": 9.065151077921968e-06, + "loss": 0.8333, + "step": 4473 + }, + { + "epoch": 
1.1898936170212766, + "grad_norm": 4.204649925231934, + "learning_rate": 9.064638948189539e-06, + "loss": 0.8531, + "step": 4474 + }, + { + "epoch": 1.190159574468085, + "grad_norm": 4.241077423095703, + "learning_rate": 9.064126692692397e-06, + "loss": 0.8215, + "step": 4475 + }, + { + "epoch": 1.1904255319148935, + "grad_norm": 4.215181350708008, + "learning_rate": 9.06361431144639e-06, + "loss": 0.7595, + "step": 4476 + }, + { + "epoch": 1.1906914893617022, + "grad_norm": 3.597543239593506, + "learning_rate": 9.06310180446737e-06, + "loss": 0.7967, + "step": 4477 + }, + { + "epoch": 1.1909574468085107, + "grad_norm": 4.075351238250732, + "learning_rate": 9.0625891717712e-06, + "loss": 0.8158, + "step": 4478 + }, + { + "epoch": 1.1912234042553191, + "grad_norm": 3.5748724937438965, + "learning_rate": 9.062076413373735e-06, + "loss": 0.733, + "step": 4479 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 3.9107751846313477, + "learning_rate": 9.061563529290845e-06, + "loss": 0.8057, + "step": 4480 + }, + { + "epoch": 1.1917553191489363, + "grad_norm": 4.108970642089844, + "learning_rate": 9.061050519538397e-06, + "loss": 0.9214, + "step": 4481 + }, + { + "epoch": 1.1920212765957447, + "grad_norm": 3.9196219444274902, + "learning_rate": 9.060537384132264e-06, + "loss": 0.8046, + "step": 4482 + }, + { + "epoch": 1.1922872340425532, + "grad_norm": 3.312999963760376, + "learning_rate": 9.060024123088324e-06, + "loss": 0.6791, + "step": 4483 + }, + { + "epoch": 1.1925531914893617, + "grad_norm": 4.010212421417236, + "learning_rate": 9.05951073642246e-06, + "loss": 0.8244, + "step": 4484 + }, + { + "epoch": 1.1928191489361701, + "grad_norm": 3.9299821853637695, + "learning_rate": 9.05899722415055e-06, + "loss": 0.7054, + "step": 4485 + }, + { + "epoch": 1.1930851063829788, + "grad_norm": 4.205704212188721, + "learning_rate": 9.05848358628849e-06, + "loss": 0.9058, + "step": 4486 + }, + { + "epoch": 1.1933510638297873, + "grad_norm": 4.133444309234619, + "learning_rate": 9.057969822852168e-06, + "loss": 0.8414, + "step": 4487 + }, + { + "epoch": 1.1936170212765957, + "grad_norm": 3.7199227809906006, + "learning_rate": 9.057455933857483e-06, + "loss": 0.7884, + "step": 4488 + }, + { + "epoch": 1.1938829787234042, + "grad_norm": 4.377199172973633, + "learning_rate": 9.056941919320335e-06, + "loss": 0.7732, + "step": 4489 + }, + { + "epoch": 1.1941489361702127, + "grad_norm": 4.171092987060547, + "learning_rate": 9.056427779256624e-06, + "loss": 0.8652, + "step": 4490 + }, + { + "epoch": 1.1944148936170214, + "grad_norm": 3.7670929431915283, + "learning_rate": 9.055913513682267e-06, + "loss": 0.7825, + "step": 4491 + }, + { + "epoch": 1.1946808510638298, + "grad_norm": 3.9210784435272217, + "learning_rate": 9.055399122613166e-06, + "loss": 0.8515, + "step": 4492 + }, + { + "epoch": 1.1949468085106383, + "grad_norm": 3.543363094329834, + "learning_rate": 9.054884606065243e-06, + "loss": 0.6883, + "step": 4493 + }, + { + "epoch": 1.1952127659574467, + "grad_norm": 3.9357686042785645, + "learning_rate": 9.054369964054418e-06, + "loss": 0.7847, + "step": 4494 + }, + { + "epoch": 1.1954787234042552, + "grad_norm": 3.5497348308563232, + "learning_rate": 9.05385519659661e-06, + "loss": 0.8664, + "step": 4495 + }, + { + "epoch": 1.195744680851064, + "grad_norm": 4.09616756439209, + "learning_rate": 9.053340303707752e-06, + "loss": 0.7928, + "step": 4496 + }, + { + "epoch": 1.1960106382978724, + "grad_norm": 4.135888576507568, + "learning_rate": 9.052825285403771e-06, + "loss": 0.8372, + "step": 4497 + 
}, + { + "epoch": 1.1962765957446808, + "grad_norm": 4.014375686645508, + "learning_rate": 9.052310141700605e-06, + "loss": 0.7838, + "step": 4498 + }, + { + "epoch": 1.1965425531914893, + "grad_norm": 4.164703369140625, + "learning_rate": 9.051794872614193e-06, + "loss": 0.7346, + "step": 4499 + }, + { + "epoch": 1.196808510638298, + "grad_norm": 3.9445199966430664, + "learning_rate": 9.051279478160475e-06, + "loss": 0.7969, + "step": 4500 + }, + { + "epoch": 1.196808510638298, + "eval_loss": 1.3114004135131836, + "eval_runtime": 13.8708, + "eval_samples_per_second": 28.838, + "eval_steps_per_second": 3.605, + "step": 4500 + }, + { + "epoch": 1.1970744680851064, + "grad_norm": 4.145724773406982, + "learning_rate": 9.050763958355401e-06, + "loss": 0.864, + "step": 4501 + }, + { + "epoch": 1.1973404255319149, + "grad_norm": 3.9395062923431396, + "learning_rate": 9.050248313214921e-06, + "loss": 0.8854, + "step": 4502 + }, + { + "epoch": 1.1976063829787233, + "grad_norm": 3.7419703006744385, + "learning_rate": 9.04973254275499e-06, + "loss": 0.778, + "step": 4503 + }, + { + "epoch": 1.197872340425532, + "grad_norm": 3.620009422302246, + "learning_rate": 9.049216646991568e-06, + "loss": 0.6522, + "step": 4504 + }, + { + "epoch": 1.1981382978723405, + "grad_norm": 4.093226909637451, + "learning_rate": 9.048700625940613e-06, + "loss": 0.7909, + "step": 4505 + }, + { + "epoch": 1.198404255319149, + "grad_norm": 4.31190824508667, + "learning_rate": 9.048184479618094e-06, + "loss": 0.87, + "step": 4506 + }, + { + "epoch": 1.1986702127659574, + "grad_norm": 3.5274550914764404, + "learning_rate": 9.047668208039981e-06, + "loss": 0.7015, + "step": 4507 + }, + { + "epoch": 1.1989361702127659, + "grad_norm": 4.295877933502197, + "learning_rate": 9.04715181122225e-06, + "loss": 0.8673, + "step": 4508 + }, + { + "epoch": 1.1992021276595746, + "grad_norm": 4.239846706390381, + "learning_rate": 9.046635289180875e-06, + "loss": 0.7815, + "step": 4509 + }, + { + "epoch": 1.199468085106383, + "grad_norm": 4.294873237609863, + "learning_rate": 9.046118641931841e-06, + "loss": 0.8275, + "step": 4510 + }, + { + "epoch": 1.1997340425531915, + "grad_norm": 4.2128586769104, + "learning_rate": 9.045601869491131e-06, + "loss": 0.885, + "step": 4511 + }, + { + "epoch": 1.2, + "grad_norm": 4.04133415222168, + "learning_rate": 9.045084971874738e-06, + "loss": 0.6479, + "step": 4512 + }, + { + "epoch": 1.2002659574468084, + "grad_norm": 4.300421714782715, + "learning_rate": 9.044567949098653e-06, + "loss": 0.7596, + "step": 4513 + }, + { + "epoch": 1.200531914893617, + "grad_norm": 4.0186896324157715, + "learning_rate": 9.044050801178873e-06, + "loss": 0.9244, + "step": 4514 + }, + { + "epoch": 1.2007978723404256, + "grad_norm": 3.989703416824341, + "learning_rate": 9.043533528131401e-06, + "loss": 0.8296, + "step": 4515 + }, + { + "epoch": 1.201063829787234, + "grad_norm": 3.6627588272094727, + "learning_rate": 9.043016129972239e-06, + "loss": 0.6557, + "step": 4516 + }, + { + "epoch": 1.2013297872340425, + "grad_norm": 4.000990867614746, + "learning_rate": 9.042498606717401e-06, + "loss": 0.8114, + "step": 4517 + }, + { + "epoch": 1.201595744680851, + "grad_norm": 4.12056827545166, + "learning_rate": 9.041980958382895e-06, + "loss": 0.7866, + "step": 4518 + }, + { + "epoch": 1.2018617021276596, + "grad_norm": 4.345433712005615, + "learning_rate": 9.041463184984739e-06, + "loss": 0.9222, + "step": 4519 + }, + { + "epoch": 1.202127659574468, + "grad_norm": 3.629518747329712, + "learning_rate": 9.040945286538954e-06, + 
"loss": 0.6739, + "step": 4520 + }, + { + "epoch": 1.2023936170212766, + "grad_norm": 4.012117862701416, + "learning_rate": 9.040427263061563e-06, + "loss": 0.8168, + "step": 4521 + }, + { + "epoch": 1.202659574468085, + "grad_norm": 3.6947031021118164, + "learning_rate": 9.039909114568597e-06, + "loss": 0.7811, + "step": 4522 + }, + { + "epoch": 1.2029255319148937, + "grad_norm": 4.276979446411133, + "learning_rate": 9.039390841076086e-06, + "loss": 0.9514, + "step": 4523 + }, + { + "epoch": 1.2031914893617022, + "grad_norm": 3.970949411392212, + "learning_rate": 9.038872442600066e-06, + "loss": 0.832, + "step": 4524 + }, + { + "epoch": 1.2034574468085106, + "grad_norm": 4.2050323486328125, + "learning_rate": 9.038353919156579e-06, + "loss": 0.838, + "step": 4525 + }, + { + "epoch": 1.203723404255319, + "grad_norm": 3.872286319732666, + "learning_rate": 9.037835270761667e-06, + "loss": 0.8424, + "step": 4526 + }, + { + "epoch": 1.2039893617021278, + "grad_norm": 4.053325653076172, + "learning_rate": 9.037316497431377e-06, + "loss": 0.8673, + "step": 4527 + }, + { + "epoch": 1.2042553191489362, + "grad_norm": 3.982133388519287, + "learning_rate": 9.036797599181762e-06, + "loss": 0.7101, + "step": 4528 + }, + { + "epoch": 1.2045212765957447, + "grad_norm": 4.298680782318115, + "learning_rate": 9.036278576028876e-06, + "loss": 0.8027, + "step": 4529 + }, + { + "epoch": 1.2047872340425532, + "grad_norm": 3.7166576385498047, + "learning_rate": 9.035759427988779e-06, + "loss": 0.8048, + "step": 4530 + }, + { + "epoch": 1.2050531914893616, + "grad_norm": 4.02637243270874, + "learning_rate": 9.035240155077532e-06, + "loss": 0.8519, + "step": 4531 + }, + { + "epoch": 1.2053191489361703, + "grad_norm": 4.048903942108154, + "learning_rate": 9.034720757311206e-06, + "loss": 0.8076, + "step": 4532 + }, + { + "epoch": 1.2055851063829788, + "grad_norm": 3.8102221488952637, + "learning_rate": 9.034201234705869e-06, + "loss": 0.8361, + "step": 4533 + }, + { + "epoch": 1.2058510638297872, + "grad_norm": 4.269223213195801, + "learning_rate": 9.033681587277596e-06, + "loss": 0.9528, + "step": 4534 + }, + { + "epoch": 1.2061170212765957, + "grad_norm": 4.001543998718262, + "learning_rate": 9.033161815042465e-06, + "loss": 0.8678, + "step": 4535 + }, + { + "epoch": 1.2063829787234042, + "grad_norm": 4.034337997436523, + "learning_rate": 9.032641918016559e-06, + "loss": 0.7533, + "step": 4536 + }, + { + "epoch": 1.2066489361702128, + "grad_norm": 3.7186598777770996, + "learning_rate": 9.032121896215965e-06, + "loss": 0.8469, + "step": 4537 + }, + { + "epoch": 1.2069148936170213, + "grad_norm": 3.8396542072296143, + "learning_rate": 9.03160174965677e-06, + "loss": 0.7419, + "step": 4538 + }, + { + "epoch": 1.2071808510638298, + "grad_norm": 3.971125602722168, + "learning_rate": 9.031081478355074e-06, + "loss": 0.7997, + "step": 4539 + }, + { + "epoch": 1.2074468085106382, + "grad_norm": 3.9450175762176514, + "learning_rate": 9.03056108232697e-06, + "loss": 0.9049, + "step": 4540 + }, + { + "epoch": 1.2077127659574467, + "grad_norm": 3.878206729888916, + "learning_rate": 9.03004056158856e-06, + "loss": 0.7389, + "step": 4541 + }, + { + "epoch": 1.2079787234042554, + "grad_norm": 4.157868385314941, + "learning_rate": 9.02951991615595e-06, + "loss": 0.8474, + "step": 4542 + }, + { + "epoch": 1.2082446808510638, + "grad_norm": 4.203000068664551, + "learning_rate": 9.02899914604525e-06, + "loss": 0.7146, + "step": 4543 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 4.336871147155762, + "learning_rate": 
9.028478251272573e-06, + "loss": 0.7901, + "step": 4544 + }, + { + "epoch": 1.2087765957446808, + "grad_norm": 4.467360973358154, + "learning_rate": 9.027957231854034e-06, + "loss": 0.6987, + "step": 4545 + }, + { + "epoch": 1.2090425531914895, + "grad_norm": 4.293298721313477, + "learning_rate": 9.027436087805759e-06, + "loss": 0.8706, + "step": 4546 + }, + { + "epoch": 1.209308510638298, + "grad_norm": 4.344003200531006, + "learning_rate": 9.026914819143867e-06, + "loss": 0.8803, + "step": 4547 + }, + { + "epoch": 1.2095744680851064, + "grad_norm": 3.9396615028381348, + "learning_rate": 9.026393425884491e-06, + "loss": 0.8195, + "step": 4548 + }, + { + "epoch": 1.2098404255319148, + "grad_norm": 4.163116931915283, + "learning_rate": 9.025871908043762e-06, + "loss": 0.8396, + "step": 4549 + }, + { + "epoch": 1.2101063829787235, + "grad_norm": 3.790417194366455, + "learning_rate": 9.025350265637816e-06, + "loss": 0.9279, + "step": 4550 + }, + { + "epoch": 1.210372340425532, + "grad_norm": 3.6482441425323486, + "learning_rate": 9.024828498682793e-06, + "loss": 0.8154, + "step": 4551 + }, + { + "epoch": 1.2106382978723405, + "grad_norm": 4.012534141540527, + "learning_rate": 9.024306607194839e-06, + "loss": 0.777, + "step": 4552 + }, + { + "epoch": 1.210904255319149, + "grad_norm": 3.850843906402588, + "learning_rate": 9.0237845911901e-06, + "loss": 0.6989, + "step": 4553 + }, + { + "epoch": 1.2111702127659574, + "grad_norm": 3.810297966003418, + "learning_rate": 9.023262450684727e-06, + "loss": 0.8284, + "step": 4554 + }, + { + "epoch": 1.211436170212766, + "grad_norm": 3.643862247467041, + "learning_rate": 9.022740185694877e-06, + "loss": 0.9392, + "step": 4555 + }, + { + "epoch": 1.2117021276595745, + "grad_norm": 3.707839012145996, + "learning_rate": 9.022217796236711e-06, + "loss": 0.794, + "step": 4556 + }, + { + "epoch": 1.211968085106383, + "grad_norm": 4.23673152923584, + "learning_rate": 9.02169528232639e-06, + "loss": 0.7546, + "step": 4557 + }, + { + "epoch": 1.2122340425531914, + "grad_norm": 4.236415386199951, + "learning_rate": 9.021172643980082e-06, + "loss": 0.9645, + "step": 4558 + }, + { + "epoch": 1.2125, + "grad_norm": 3.956615686416626, + "learning_rate": 9.02064988121396e-06, + "loss": 0.9095, + "step": 4559 + }, + { + "epoch": 1.2127659574468086, + "grad_norm": 4.126330852508545, + "learning_rate": 9.020126994044194e-06, + "loss": 0.7762, + "step": 4560 + }, + { + "epoch": 1.213031914893617, + "grad_norm": 4.501354694366455, + "learning_rate": 9.019603982486967e-06, + "loss": 0.873, + "step": 4561 + }, + { + "epoch": 1.2132978723404255, + "grad_norm": 4.185324192047119, + "learning_rate": 9.01908084655846e-06, + "loss": 0.8071, + "step": 4562 + }, + { + "epoch": 1.213563829787234, + "grad_norm": 4.112594127655029, + "learning_rate": 9.018557586274858e-06, + "loss": 0.7762, + "step": 4563 + }, + { + "epoch": 1.2138297872340424, + "grad_norm": 3.841365098953247, + "learning_rate": 9.018034201652357e-06, + "loss": 0.8042, + "step": 4564 + }, + { + "epoch": 1.2140957446808511, + "grad_norm": 3.9603569507598877, + "learning_rate": 9.017510692707144e-06, + "loss": 0.6254, + "step": 4565 + }, + { + "epoch": 1.2143617021276596, + "grad_norm": 3.6832830905914307, + "learning_rate": 9.016987059455422e-06, + "loss": 0.7013, + "step": 4566 + }, + { + "epoch": 1.214627659574468, + "grad_norm": 4.155395030975342, + "learning_rate": 9.01646330191339e-06, + "loss": 0.8052, + "step": 4567 + }, + { + "epoch": 1.2148936170212765, + "grad_norm": 3.9648375511169434, + "learning_rate": 
9.015939420097255e-06, + "loss": 0.778, + "step": 4568 + }, + { + "epoch": 1.2151595744680852, + "grad_norm": 3.8621366024017334, + "learning_rate": 9.015415414023226e-06, + "loss": 0.7851, + "step": 4569 + }, + { + "epoch": 1.2154255319148937, + "grad_norm": 4.207528114318848, + "learning_rate": 9.014891283707517e-06, + "loss": 0.9192, + "step": 4570 + }, + { + "epoch": 1.2156914893617021, + "grad_norm": 4.204238414764404, + "learning_rate": 9.014367029166344e-06, + "loss": 0.8175, + "step": 4571 + }, + { + "epoch": 1.2159574468085106, + "grad_norm": 4.0870537757873535, + "learning_rate": 9.013842650415927e-06, + "loss": 0.8294, + "step": 4572 + }, + { + "epoch": 1.2162234042553193, + "grad_norm": 4.164912700653076, + "learning_rate": 9.013318147472497e-06, + "loss": 0.8457, + "step": 4573 + }, + { + "epoch": 1.2164893617021277, + "grad_norm": 4.122684478759766, + "learning_rate": 9.012793520352276e-06, + "loss": 0.7565, + "step": 4574 + }, + { + "epoch": 1.2167553191489362, + "grad_norm": 4.155274391174316, + "learning_rate": 9.012268769071499e-06, + "loss": 0.7522, + "step": 4575 + }, + { + "epoch": 1.2170212765957447, + "grad_norm": 4.182219505310059, + "learning_rate": 9.011743893646402e-06, + "loss": 0.842, + "step": 4576 + }, + { + "epoch": 1.2172872340425531, + "grad_norm": 3.9600305557250977, + "learning_rate": 9.011218894093226e-06, + "loss": 0.7938, + "step": 4577 + }, + { + "epoch": 1.2175531914893618, + "grad_norm": 3.977374792098999, + "learning_rate": 9.010693770428217e-06, + "loss": 0.7021, + "step": 4578 + }, + { + "epoch": 1.2178191489361703, + "grad_norm": 4.227469444274902, + "learning_rate": 9.010168522667617e-06, + "loss": 0.8016, + "step": 4579 + }, + { + "epoch": 1.2180851063829787, + "grad_norm": 3.7802317142486572, + "learning_rate": 9.009643150827683e-06, + "loss": 0.7565, + "step": 4580 + }, + { + "epoch": 1.2183510638297872, + "grad_norm": 3.9615867137908936, + "learning_rate": 9.00911765492467e-06, + "loss": 0.8134, + "step": 4581 + }, + { + "epoch": 1.2186170212765957, + "grad_norm": 3.852104902267456, + "learning_rate": 9.008592034974836e-06, + "loss": 0.7654, + "step": 4582 + }, + { + "epoch": 1.2188829787234043, + "grad_norm": 3.5889623165130615, + "learning_rate": 9.008066290994443e-06, + "loss": 0.816, + "step": 4583 + }, + { + "epoch": 1.2191489361702128, + "grad_norm": 3.7613863945007324, + "learning_rate": 9.007540422999762e-06, + "loss": 0.7356, + "step": 4584 + }, + { + "epoch": 1.2194148936170213, + "grad_norm": 4.141067981719971, + "learning_rate": 9.007014431007064e-06, + "loss": 0.8445, + "step": 4585 + }, + { + "epoch": 1.2196808510638297, + "grad_norm": 3.842954635620117, + "learning_rate": 9.00648831503262e-06, + "loss": 0.7844, + "step": 4586 + }, + { + "epoch": 1.2199468085106382, + "grad_norm": 3.799661159515381, + "learning_rate": 9.00596207509271e-06, + "loss": 0.8777, + "step": 4587 + }, + { + "epoch": 1.2202127659574469, + "grad_norm": 4.335452079772949, + "learning_rate": 9.005435711203619e-06, + "loss": 0.936, + "step": 4588 + }, + { + "epoch": 1.2204787234042553, + "grad_norm": 3.905426025390625, + "learning_rate": 9.004909223381628e-06, + "loss": 0.7583, + "step": 4589 + }, + { + "epoch": 1.2207446808510638, + "grad_norm": 3.950054168701172, + "learning_rate": 9.004382611643032e-06, + "loss": 0.8512, + "step": 4590 + }, + { + "epoch": 1.2210106382978723, + "grad_norm": 4.1044135093688965, + "learning_rate": 9.003855876004124e-06, + "loss": 0.7941, + "step": 4591 + }, + { + "epoch": 1.2212765957446807, + "grad_norm": 
3.908524751663208, + "learning_rate": 9.003329016481201e-06, + "loss": 0.7502, + "step": 4592 + }, + { + "epoch": 1.2215425531914894, + "grad_norm": 3.6956968307495117, + "learning_rate": 9.002802033090564e-06, + "loss": 0.7847, + "step": 4593 + }, + { + "epoch": 1.2218085106382979, + "grad_norm": 4.292162895202637, + "learning_rate": 9.00227492584852e-06, + "loss": 0.7966, + "step": 4594 + }, + { + "epoch": 1.2220744680851063, + "grad_norm": 4.15654993057251, + "learning_rate": 9.001747694771378e-06, + "loss": 0.7523, + "step": 4595 + }, + { + "epoch": 1.2223404255319148, + "grad_norm": 3.5688204765319824, + "learning_rate": 9.00122033987545e-06, + "loss": 0.6891, + "step": 4596 + }, + { + "epoch": 1.2226063829787235, + "grad_norm": 3.962028980255127, + "learning_rate": 9.000692861177056e-06, + "loss": 0.7285, + "step": 4597 + }, + { + "epoch": 1.222872340425532, + "grad_norm": 4.2762651443481445, + "learning_rate": 9.000165258692512e-06, + "loss": 0.8359, + "step": 4598 + }, + { + "epoch": 1.2231382978723404, + "grad_norm": 4.260420799255371, + "learning_rate": 8.999637532438145e-06, + "loss": 0.9171, + "step": 4599 + }, + { + "epoch": 1.2234042553191489, + "grad_norm": 4.032958507537842, + "learning_rate": 8.999109682430288e-06, + "loss": 0.8082, + "step": 4600 + }, + { + "epoch": 1.2236702127659576, + "grad_norm": 3.772594690322876, + "learning_rate": 8.998581708685264e-06, + "loss": 0.8029, + "step": 4601 + }, + { + "epoch": 1.223936170212766, + "grad_norm": 4.074283123016357, + "learning_rate": 8.998053611219418e-06, + "loss": 0.729, + "step": 4602 + }, + { + "epoch": 1.2242021276595745, + "grad_norm": 3.5871801376342773, + "learning_rate": 8.997525390049084e-06, + "loss": 0.8645, + "step": 4603 + }, + { + "epoch": 1.224468085106383, + "grad_norm": 3.789030075073242, + "learning_rate": 8.996997045190608e-06, + "loss": 0.7226, + "step": 4604 + }, + { + "epoch": 1.2247340425531914, + "grad_norm": 3.840949296951294, + "learning_rate": 8.996468576660337e-06, + "loss": 0.8817, + "step": 4605 + }, + { + "epoch": 1.225, + "grad_norm": 4.251964569091797, + "learning_rate": 8.995939984474624e-06, + "loss": 0.7567, + "step": 4606 + }, + { + "epoch": 1.2252659574468086, + "grad_norm": 3.7050812244415283, + "learning_rate": 8.995411268649823e-06, + "loss": 0.8609, + "step": 4607 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 4.209064483642578, + "learning_rate": 8.994882429202294e-06, + "loss": 0.8653, + "step": 4608 + }, + { + "epoch": 1.2257978723404255, + "grad_norm": 4.214296340942383, + "learning_rate": 8.994353466148399e-06, + "loss": 0.8262, + "step": 4609 + }, + { + "epoch": 1.226063829787234, + "grad_norm": 3.9574646949768066, + "learning_rate": 8.993824379504505e-06, + "loss": 0.7383, + "step": 4610 + }, + { + "epoch": 1.2263297872340426, + "grad_norm": 4.194293975830078, + "learning_rate": 8.993295169286982e-06, + "loss": 0.7483, + "step": 4611 + }, + { + "epoch": 1.226595744680851, + "grad_norm": 3.9258837699890137, + "learning_rate": 8.992765835512205e-06, + "loss": 0.7151, + "step": 4612 + }, + { + "epoch": 1.2268617021276595, + "grad_norm": 3.662429094314575, + "learning_rate": 8.992236378196552e-06, + "loss": 0.8595, + "step": 4613 + }, + { + "epoch": 1.227127659574468, + "grad_norm": 3.745591640472412, + "learning_rate": 8.991706797356407e-06, + "loss": 0.8065, + "step": 4614 + }, + { + "epoch": 1.2273936170212765, + "grad_norm": 3.8420639038085938, + "learning_rate": 8.991177093008153e-06, + "loss": 0.7613, + "step": 4615 + }, + { + "epoch": 1.2276595744680852, + 
"grad_norm": 3.994805097579956, + "learning_rate": 8.990647265168179e-06, + "loss": 0.7919, + "step": 4616 + }, + { + "epoch": 1.2279255319148936, + "grad_norm": 4.0484514236450195, + "learning_rate": 8.990117313852882e-06, + "loss": 0.9, + "step": 4617 + }, + { + "epoch": 1.228191489361702, + "grad_norm": 3.999068260192871, + "learning_rate": 8.989587239078658e-06, + "loss": 0.7472, + "step": 4618 + }, + { + "epoch": 1.2284574468085105, + "grad_norm": 3.9625680446624756, + "learning_rate": 8.989057040861905e-06, + "loss": 1.0265, + "step": 4619 + }, + { + "epoch": 1.2287234042553192, + "grad_norm": 4.0248284339904785, + "learning_rate": 8.988526719219035e-06, + "loss": 0.7525, + "step": 4620 + }, + { + "epoch": 1.2289893617021277, + "grad_norm": 3.985003709793091, + "learning_rate": 8.987996274166449e-06, + "loss": 0.8491, + "step": 4621 + }, + { + "epoch": 1.2292553191489362, + "grad_norm": 3.5832836627960205, + "learning_rate": 8.987465705720565e-06, + "loss": 0.6647, + "step": 4622 + }, + { + "epoch": 1.2295212765957446, + "grad_norm": 3.5431840419769287, + "learning_rate": 8.986935013897796e-06, + "loss": 0.7142, + "step": 4623 + }, + { + "epoch": 1.2297872340425533, + "grad_norm": 3.745082139968872, + "learning_rate": 8.986404198714561e-06, + "loss": 0.6538, + "step": 4624 + }, + { + "epoch": 1.2300531914893618, + "grad_norm": 3.653146982192993, + "learning_rate": 8.98587326018729e-06, + "loss": 0.7833, + "step": 4625 + }, + { + "epoch": 1.2303191489361702, + "grad_norm": 3.9238173961639404, + "learning_rate": 8.985342198332407e-06, + "loss": 0.8265, + "step": 4626 + }, + { + "epoch": 1.2305851063829787, + "grad_norm": 4.6217265129089355, + "learning_rate": 8.984811013166345e-06, + "loss": 0.9442, + "step": 4627 + }, + { + "epoch": 1.2308510638297872, + "grad_norm": 3.7040395736694336, + "learning_rate": 8.98427970470554e-06, + "loss": 0.8234, + "step": 4628 + }, + { + "epoch": 1.2311170212765958, + "grad_norm": 3.8721320629119873, + "learning_rate": 8.983748272966426e-06, + "loss": 0.8997, + "step": 4629 + }, + { + "epoch": 1.2313829787234043, + "grad_norm": 3.5621466636657715, + "learning_rate": 8.983216717965453e-06, + "loss": 0.8186, + "step": 4630 + }, + { + "epoch": 1.2316489361702128, + "grad_norm": 3.854879379272461, + "learning_rate": 8.982685039719064e-06, + "loss": 0.773, + "step": 4631 + }, + { + "epoch": 1.2319148936170212, + "grad_norm": 3.9702491760253906, + "learning_rate": 8.982153238243712e-06, + "loss": 0.8645, + "step": 4632 + }, + { + "epoch": 1.2321808510638297, + "grad_norm": 4.122603416442871, + "learning_rate": 8.981621313555849e-06, + "loss": 0.7651, + "step": 4633 + }, + { + "epoch": 1.2324468085106384, + "grad_norm": 4.362513065338135, + "learning_rate": 8.981089265671936e-06, + "loss": 0.8279, + "step": 4634 + }, + { + "epoch": 1.2327127659574468, + "grad_norm": 4.333089351654053, + "learning_rate": 8.980557094608433e-06, + "loss": 0.8613, + "step": 4635 + }, + { + "epoch": 1.2329787234042553, + "grad_norm": 3.9214844703674316, + "learning_rate": 8.980024800381807e-06, + "loss": 0.8316, + "step": 4636 + }, + { + "epoch": 1.2332446808510638, + "grad_norm": 3.9786224365234375, + "learning_rate": 8.979492383008528e-06, + "loss": 0.8405, + "step": 4637 + }, + { + "epoch": 1.2335106382978722, + "grad_norm": 4.105279445648193, + "learning_rate": 8.978959842505071e-06, + "loss": 0.8187, + "step": 4638 + }, + { + "epoch": 1.233776595744681, + "grad_norm": 4.662153244018555, + "learning_rate": 8.97842717888791e-06, + "loss": 0.8309, + "step": 4639 + }, + { + 
"epoch": 1.2340425531914894, + "grad_norm": 4.0390400886535645, + "learning_rate": 8.977894392173527e-06, + "loss": 0.823, + "step": 4640 + }, + { + "epoch": 1.2343085106382978, + "grad_norm": 3.574883222579956, + "learning_rate": 8.97736148237841e-06, + "loss": 0.899, + "step": 4641 + }, + { + "epoch": 1.2345744680851063, + "grad_norm": 3.9242796897888184, + "learning_rate": 8.976828449519047e-06, + "loss": 0.9994, + "step": 4642 + }, + { + "epoch": 1.234840425531915, + "grad_norm": 3.9096062183380127, + "learning_rate": 8.976295293611927e-06, + "loss": 0.907, + "step": 4643 + }, + { + "epoch": 1.2351063829787234, + "grad_norm": 4.211862087249756, + "learning_rate": 8.97576201467355e-06, + "loss": 0.807, + "step": 4644 + }, + { + "epoch": 1.235372340425532, + "grad_norm": 3.7779862880706787, + "learning_rate": 8.975228612720415e-06, + "loss": 0.7325, + "step": 4645 + }, + { + "epoch": 1.2356382978723404, + "grad_norm": 4.162439823150635, + "learning_rate": 8.974695087769027e-06, + "loss": 0.9018, + "step": 4646 + }, + { + "epoch": 1.235904255319149, + "grad_norm": 3.9376440048217773, + "learning_rate": 8.974161439835894e-06, + "loss": 0.7467, + "step": 4647 + }, + { + "epoch": 1.2361702127659575, + "grad_norm": 3.728128433227539, + "learning_rate": 8.973627668937528e-06, + "loss": 0.6471, + "step": 4648 + }, + { + "epoch": 1.236436170212766, + "grad_norm": 4.1924967765808105, + "learning_rate": 8.97309377509044e-06, + "loss": 0.8827, + "step": 4649 + }, + { + "epoch": 1.2367021276595744, + "grad_norm": 3.9644808769226074, + "learning_rate": 8.972559758311156e-06, + "loss": 0.737, + "step": 4650 + }, + { + "epoch": 1.236968085106383, + "grad_norm": 4.276489734649658, + "learning_rate": 8.972025618616195e-06, + "loss": 0.7805, + "step": 4651 + }, + { + "epoch": 1.2372340425531916, + "grad_norm": 4.115257263183594, + "learning_rate": 8.971491356022086e-06, + "loss": 0.8479, + "step": 4652 + }, + { + "epoch": 1.2375, + "grad_norm": 4.143589019775391, + "learning_rate": 8.970956970545356e-06, + "loss": 0.7716, + "step": 4653 + }, + { + "epoch": 1.2377659574468085, + "grad_norm": 3.872377634048462, + "learning_rate": 8.970422462202543e-06, + "loss": 0.7949, + "step": 4654 + }, + { + "epoch": 1.238031914893617, + "grad_norm": 3.9074594974517822, + "learning_rate": 8.969887831010185e-06, + "loss": 0.818, + "step": 4655 + }, + { + "epoch": 1.2382978723404254, + "grad_norm": 3.7083117961883545, + "learning_rate": 8.969353076984823e-06, + "loss": 0.823, + "step": 4656 + }, + { + "epoch": 1.2385638297872341, + "grad_norm": 3.952829122543335, + "learning_rate": 8.968818200143005e-06, + "loss": 0.7928, + "step": 4657 + }, + { + "epoch": 1.2388297872340426, + "grad_norm": 4.015969276428223, + "learning_rate": 8.96828320050128e-06, + "loss": 0.8713, + "step": 4658 + }, + { + "epoch": 1.239095744680851, + "grad_norm": 4.456661701202393, + "learning_rate": 8.967748078076197e-06, + "loss": 0.8482, + "step": 4659 + }, + { + "epoch": 1.2393617021276595, + "grad_norm": 3.8664846420288086, + "learning_rate": 8.96721283288432e-06, + "loss": 0.7526, + "step": 4660 + }, + { + "epoch": 1.239627659574468, + "grad_norm": 4.358894348144531, + "learning_rate": 8.966677464942206e-06, + "loss": 0.7756, + "step": 4661 + }, + { + "epoch": 1.2398936170212767, + "grad_norm": 3.8991811275482178, + "learning_rate": 8.96614197426642e-06, + "loss": 0.7629, + "step": 4662 + }, + { + "epoch": 1.2401595744680851, + "grad_norm": 3.752913236618042, + "learning_rate": 8.965606360873533e-06, + "loss": 0.7598, + "step": 4663 + }, + { + 
"epoch": 1.2404255319148936, + "grad_norm": 4.097616672515869, + "learning_rate": 8.965070624780117e-06, + "loss": 0.7635, + "step": 4664 + }, + { + "epoch": 1.240691489361702, + "grad_norm": 3.855180025100708, + "learning_rate": 8.964534766002747e-06, + "loss": 0.8571, + "step": 4665 + }, + { + "epoch": 1.2409574468085107, + "grad_norm": 4.117387771606445, + "learning_rate": 8.963998784558001e-06, + "loss": 0.8517, + "step": 4666 + }, + { + "epoch": 1.2412234042553192, + "grad_norm": 4.247325897216797, + "learning_rate": 8.963462680462469e-06, + "loss": 0.7862, + "step": 4667 + }, + { + "epoch": 1.2414893617021276, + "grad_norm": 4.604616165161133, + "learning_rate": 8.962926453732734e-06, + "loss": 0.8325, + "step": 4668 + }, + { + "epoch": 1.241755319148936, + "grad_norm": 4.283206462860107, + "learning_rate": 8.96239010438539e-06, + "loss": 0.7897, + "step": 4669 + }, + { + "epoch": 1.2420212765957448, + "grad_norm": 4.039552688598633, + "learning_rate": 8.96185363243703e-06, + "loss": 0.8889, + "step": 4670 + }, + { + "epoch": 1.2422872340425533, + "grad_norm": 3.6952388286590576, + "learning_rate": 8.961317037904253e-06, + "loss": 0.7318, + "step": 4671 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 4.330514907836914, + "learning_rate": 8.960780320803665e-06, + "loss": 0.8473, + "step": 4672 + }, + { + "epoch": 1.2428191489361702, + "grad_norm": 3.8652656078338623, + "learning_rate": 8.960243481151869e-06, + "loss": 0.7744, + "step": 4673 + }, + { + "epoch": 1.2430851063829786, + "grad_norm": 4.232844352722168, + "learning_rate": 8.959706518965479e-06, + "loss": 0.7232, + "step": 4674 + }, + { + "epoch": 1.2433510638297873, + "grad_norm": 3.9439735412597656, + "learning_rate": 8.959169434261106e-06, + "loss": 0.7025, + "step": 4675 + }, + { + "epoch": 1.2436170212765958, + "grad_norm": 3.876521587371826, + "learning_rate": 8.958632227055369e-06, + "loss": 0.6779, + "step": 4676 + }, + { + "epoch": 1.2438829787234043, + "grad_norm": 3.7715842723846436, + "learning_rate": 8.95809489736489e-06, + "loss": 0.7331, + "step": 4677 + }, + { + "epoch": 1.2441489361702127, + "grad_norm": 4.344306945800781, + "learning_rate": 8.957557445206297e-06, + "loss": 0.797, + "step": 4678 + }, + { + "epoch": 1.2444148936170212, + "grad_norm": 3.924248218536377, + "learning_rate": 8.957019870596216e-06, + "loss": 0.9321, + "step": 4679 + }, + { + "epoch": 1.2446808510638299, + "grad_norm": 3.8048911094665527, + "learning_rate": 8.956482173551281e-06, + "loss": 0.7405, + "step": 4680 + }, + { + "epoch": 1.2449468085106383, + "grad_norm": 4.218112468719482, + "learning_rate": 8.95594435408813e-06, + "loss": 0.8395, + "step": 4681 + }, + { + "epoch": 1.2452127659574468, + "grad_norm": 3.683992385864258, + "learning_rate": 8.955406412223402e-06, + "loss": 0.7261, + "step": 4682 + }, + { + "epoch": 1.2454787234042553, + "grad_norm": 4.05771541595459, + "learning_rate": 8.954868347973742e-06, + "loss": 0.85, + "step": 4683 + }, + { + "epoch": 1.2457446808510637, + "grad_norm": 4.423064708709717, + "learning_rate": 8.954330161355803e-06, + "loss": 0.8632, + "step": 4684 + }, + { + "epoch": 1.2460106382978724, + "grad_norm": 4.039585113525391, + "learning_rate": 8.953791852386229e-06, + "loss": 0.8078, + "step": 4685 + }, + { + "epoch": 1.2462765957446809, + "grad_norm": 4.336376190185547, + "learning_rate": 8.953253421081682e-06, + "loss": 0.807, + "step": 4686 + }, + { + "epoch": 1.2465425531914893, + "grad_norm": 4.025651454925537, + "learning_rate": 8.95271486745882e-06, + "loss": 0.8651, + "step": 
4687 + }, + { + "epoch": 1.2468085106382978, + "grad_norm": 3.839545488357544, + "learning_rate": 8.952176191534305e-06, + "loss": 0.7696, + "step": 4688 + }, + { + "epoch": 1.2470744680851065, + "grad_norm": 3.4037442207336426, + "learning_rate": 8.951637393324806e-06, + "loss": 0.7827, + "step": 4689 + }, + { + "epoch": 1.247340425531915, + "grad_norm": 4.202190399169922, + "learning_rate": 8.951098472846994e-06, + "loss": 0.6717, + "step": 4690 + }, + { + "epoch": 1.2476063829787234, + "grad_norm": 4.145596027374268, + "learning_rate": 8.950559430117542e-06, + "loss": 0.8201, + "step": 4691 + }, + { + "epoch": 1.2478723404255319, + "grad_norm": 4.066543102264404, + "learning_rate": 8.950020265153133e-06, + "loss": 0.7651, + "step": 4692 + }, + { + "epoch": 1.2481382978723405, + "grad_norm": 3.9612643718719482, + "learning_rate": 8.949480977970444e-06, + "loss": 0.7625, + "step": 4693 + }, + { + "epoch": 1.248404255319149, + "grad_norm": 3.6797444820404053, + "learning_rate": 8.948941568586165e-06, + "loss": 0.7396, + "step": 4694 + }, + { + "epoch": 1.2486702127659575, + "grad_norm": 4.5470662117004395, + "learning_rate": 8.948402037016984e-06, + "loss": 0.831, + "step": 4695 + }, + { + "epoch": 1.248936170212766, + "grad_norm": 3.3565194606781006, + "learning_rate": 8.947862383279594e-06, + "loss": 0.6773, + "step": 4696 + }, + { + "epoch": 1.2492021276595744, + "grad_norm": 4.042359352111816, + "learning_rate": 8.947322607390694e-06, + "loss": 0.8052, + "step": 4697 + }, + { + "epoch": 1.249468085106383, + "grad_norm": 3.909513235092163, + "learning_rate": 8.946782709366988e-06, + "loss": 0.8849, + "step": 4698 + }, + { + "epoch": 1.2497340425531915, + "grad_norm": 4.553561687469482, + "learning_rate": 8.946242689225175e-06, + "loss": 0.9048, + "step": 4699 + }, + { + "epoch": 1.25, + "grad_norm": 4.289936542510986, + "learning_rate": 8.94570254698197e-06, + "loss": 0.8465, + "step": 4700 + }, + { + "epoch": 1.2502659574468085, + "grad_norm": 3.7364187240600586, + "learning_rate": 8.94516228265408e-06, + "loss": 0.9081, + "step": 4701 + }, + { + "epoch": 1.250531914893617, + "grad_norm": 3.8869049549102783, + "learning_rate": 8.944621896258226e-06, + "loss": 0.7625, + "step": 4702 + }, + { + "epoch": 1.2507978723404256, + "grad_norm": 4.203104019165039, + "learning_rate": 8.944081387811126e-06, + "loss": 0.7822, + "step": 4703 + }, + { + "epoch": 1.251063829787234, + "grad_norm": 3.810011148452759, + "learning_rate": 8.943540757329503e-06, + "loss": 0.9403, + "step": 4704 + }, + { + "epoch": 1.2513297872340425, + "grad_norm": 3.795477867126465, + "learning_rate": 8.943000004830087e-06, + "loss": 0.7856, + "step": 4705 + }, + { + "epoch": 1.251595744680851, + "grad_norm": 4.174344062805176, + "learning_rate": 8.942459130329608e-06, + "loss": 0.8522, + "step": 4706 + }, + { + "epoch": 1.2518617021276595, + "grad_norm": 3.6374874114990234, + "learning_rate": 8.941918133844803e-06, + "loss": 0.8471, + "step": 4707 + }, + { + "epoch": 1.2521276595744681, + "grad_norm": 3.645719528198242, + "learning_rate": 8.941377015392407e-06, + "loss": 0.7564, + "step": 4708 + }, + { + "epoch": 1.2523936170212766, + "grad_norm": 4.238284587860107, + "learning_rate": 8.94083577498917e-06, + "loss": 0.9556, + "step": 4709 + }, + { + "epoch": 1.252659574468085, + "grad_norm": 4.101098537445068, + "learning_rate": 8.940294412651831e-06, + "loss": 0.9095, + "step": 4710 + }, + { + "epoch": 1.2529255319148938, + "grad_norm": 3.56626296043396, + "learning_rate": 8.939752928397146e-06, + "loss": 0.7358, + 
"step": 4711 + }, + { + "epoch": 1.253191489361702, + "grad_norm": 3.680903434753418, + "learning_rate": 8.939211322241866e-06, + "loss": 0.7556, + "step": 4712 + }, + { + "epoch": 1.2534574468085107, + "grad_norm": 4.173125267028809, + "learning_rate": 8.938669594202748e-06, + "loss": 0.7488, + "step": 4713 + }, + { + "epoch": 1.2537234042553191, + "grad_norm": 4.197647571563721, + "learning_rate": 8.938127744296559e-06, + "loss": 0.8367, + "step": 4714 + }, + { + "epoch": 1.2539893617021276, + "grad_norm": 3.5184898376464844, + "learning_rate": 8.937585772540058e-06, + "loss": 0.7586, + "step": 4715 + }, + { + "epoch": 1.2542553191489363, + "grad_norm": 4.331880569458008, + "learning_rate": 8.93704367895002e-06, + "loss": 0.9277, + "step": 4716 + }, + { + "epoch": 1.2545212765957447, + "grad_norm": 4.3062238693237305, + "learning_rate": 8.936501463543213e-06, + "loss": 0.7798, + "step": 4717 + }, + { + "epoch": 1.2547872340425532, + "grad_norm": 4.3987956047058105, + "learning_rate": 8.935959126336418e-06, + "loss": 0.8121, + "step": 4718 + }, + { + "epoch": 1.2550531914893617, + "grad_norm": 3.8964762687683105, + "learning_rate": 8.935416667346412e-06, + "loss": 0.8318, + "step": 4719 + }, + { + "epoch": 1.2553191489361701, + "grad_norm": 4.110397815704346, + "learning_rate": 8.934874086589981e-06, + "loss": 0.7502, + "step": 4720 + }, + { + "epoch": 1.2555851063829788, + "grad_norm": 3.531947135925293, + "learning_rate": 8.934331384083914e-06, + "loss": 0.7613, + "step": 4721 + }, + { + "epoch": 1.2558510638297873, + "grad_norm": 3.8877408504486084, + "learning_rate": 8.933788559845001e-06, + "loss": 0.7568, + "step": 4722 + }, + { + "epoch": 1.2561170212765957, + "grad_norm": 3.653062582015991, + "learning_rate": 8.93324561389004e-06, + "loss": 0.7156, + "step": 4723 + }, + { + "epoch": 1.2563829787234042, + "grad_norm": 3.9823882579803467, + "learning_rate": 8.932702546235827e-06, + "loss": 0.8349, + "step": 4724 + }, + { + "epoch": 1.2566489361702127, + "grad_norm": 3.867664337158203, + "learning_rate": 8.932159356899169e-06, + "loss": 0.7605, + "step": 4725 + }, + { + "epoch": 1.2569148936170214, + "grad_norm": 3.945042371749878, + "learning_rate": 8.93161604589687e-06, + "loss": 0.698, + "step": 4726 + }, + { + "epoch": 1.2571808510638298, + "grad_norm": 4.207972049713135, + "learning_rate": 8.93107261324574e-06, + "loss": 0.9514, + "step": 4727 + }, + { + "epoch": 1.2574468085106383, + "grad_norm": 3.8403220176696777, + "learning_rate": 8.930529058962597e-06, + "loss": 0.7912, + "step": 4728 + }, + { + "epoch": 1.2577127659574467, + "grad_norm": 3.9817752838134766, + "learning_rate": 8.929985383064257e-06, + "loss": 0.752, + "step": 4729 + }, + { + "epoch": 1.2579787234042552, + "grad_norm": 3.786790132522583, + "learning_rate": 8.929441585567543e-06, + "loss": 0.7753, + "step": 4730 + }, + { + "epoch": 1.258244680851064, + "grad_norm": 3.5705316066741943, + "learning_rate": 8.928897666489278e-06, + "loss": 0.6983, + "step": 4731 + }, + { + "epoch": 1.2585106382978724, + "grad_norm": 3.8111605644226074, + "learning_rate": 8.928353625846294e-06, + "loss": 0.9261, + "step": 4732 + }, + { + "epoch": 1.2587765957446808, + "grad_norm": 3.8016891479492188, + "learning_rate": 8.927809463655424e-06, + "loss": 0.9297, + "step": 4733 + }, + { + "epoch": 1.2590425531914895, + "grad_norm": 3.998060941696167, + "learning_rate": 8.927265179933506e-06, + "loss": 0.8105, + "step": 4734 + }, + { + "epoch": 1.2593085106382977, + "grad_norm": 3.4611032009124756, + "learning_rate": 
8.926720774697379e-06, + "loss": 0.7404, + "step": 4735 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 4.086428165435791, + "learning_rate": 8.926176247963886e-06, + "loss": 0.7905, + "step": 4736 + }, + { + "epoch": 1.2598404255319149, + "grad_norm": 4.124720573425293, + "learning_rate": 8.92563159974988e-06, + "loss": 0.9439, + "step": 4737 + }, + { + "epoch": 1.2601063829787233, + "grad_norm": 3.536327600479126, + "learning_rate": 8.92508683007221e-06, + "loss": 0.7992, + "step": 4738 + }, + { + "epoch": 1.260372340425532, + "grad_norm": 3.884551763534546, + "learning_rate": 8.924541938947731e-06, + "loss": 0.8708, + "step": 4739 + }, + { + "epoch": 1.2606382978723405, + "grad_norm": 4.106461048126221, + "learning_rate": 8.923996926393306e-06, + "loss": 0.8013, + "step": 4740 + }, + { + "epoch": 1.260904255319149, + "grad_norm": 3.6707823276519775, + "learning_rate": 8.923451792425795e-06, + "loss": 0.7818, + "step": 4741 + }, + { + "epoch": 1.2611702127659574, + "grad_norm": 4.26462984085083, + "learning_rate": 8.922906537062066e-06, + "loss": 0.9622, + "step": 4742 + }, + { + "epoch": 1.2614361702127659, + "grad_norm": 4.356677055358887, + "learning_rate": 8.92236116031899e-06, + "loss": 0.9918, + "step": 4743 + }, + { + "epoch": 1.2617021276595746, + "grad_norm": 3.735673427581787, + "learning_rate": 8.921815662213442e-06, + "loss": 0.6767, + "step": 4744 + }, + { + "epoch": 1.261968085106383, + "grad_norm": 3.9601590633392334, + "learning_rate": 8.9212700427623e-06, + "loss": 0.8667, + "step": 4745 + }, + { + "epoch": 1.2622340425531915, + "grad_norm": 3.9646952152252197, + "learning_rate": 8.920724301982446e-06, + "loss": 0.7383, + "step": 4746 + }, + { + "epoch": 1.2625, + "grad_norm": 3.402167320251465, + "learning_rate": 8.920178439890765e-06, + "loss": 0.7373, + "step": 4747 + }, + { + "epoch": 1.2627659574468084, + "grad_norm": 4.096093654632568, + "learning_rate": 8.91963245650415e-06, + "loss": 0.7765, + "step": 4748 + }, + { + "epoch": 1.263031914893617, + "grad_norm": 3.612751007080078, + "learning_rate": 8.91908635183949e-06, + "loss": 0.8401, + "step": 4749 + }, + { + "epoch": 1.2632978723404256, + "grad_norm": 4.043914318084717, + "learning_rate": 8.918540125913686e-06, + "loss": 0.7371, + "step": 4750 + }, + { + "epoch": 1.263563829787234, + "grad_norm": 3.865091562271118, + "learning_rate": 8.917993778743636e-06, + "loss": 0.6962, + "step": 4751 + }, + { + "epoch": 1.2638297872340425, + "grad_norm": 4.154531478881836, + "learning_rate": 8.917447310346245e-06, + "loss": 0.8158, + "step": 4752 + }, + { + "epoch": 1.264095744680851, + "grad_norm": 3.6052658557891846, + "learning_rate": 8.916900720738423e-06, + "loss": 0.7131, + "step": 4753 + }, + { + "epoch": 1.2643617021276596, + "grad_norm": 4.163410186767578, + "learning_rate": 8.916354009937081e-06, + "loss": 0.8955, + "step": 4754 + }, + { + "epoch": 1.264627659574468, + "grad_norm": 3.979421377182007, + "learning_rate": 8.915807177959133e-06, + "loss": 0.8712, + "step": 4755 + }, + { + "epoch": 1.2648936170212766, + "grad_norm": 3.4931585788726807, + "learning_rate": 8.915260224821504e-06, + "loss": 0.8079, + "step": 4756 + }, + { + "epoch": 1.265159574468085, + "grad_norm": 3.8094661235809326, + "learning_rate": 8.914713150541113e-06, + "loss": 0.8143, + "step": 4757 + }, + { + "epoch": 1.2654255319148935, + "grad_norm": 4.149999618530273, + "learning_rate": 8.914165955134886e-06, + "loss": 0.789, + "step": 4758 + }, + { + "epoch": 1.2656914893617022, + "grad_norm": 3.9979913234710693, + "learning_rate": 
8.913618638619757e-06, + "loss": 0.8312, + "step": 4759 + }, + { + "epoch": 1.2659574468085106, + "grad_norm": 4.05308723449707, + "learning_rate": 8.91307120101266e-06, + "loss": 0.8029, + "step": 4760 + }, + { + "epoch": 1.266223404255319, + "grad_norm": 4.013595104217529, + "learning_rate": 8.912523642330533e-06, + "loss": 0.8625, + "step": 4761 + }, + { + "epoch": 1.2664893617021278, + "grad_norm": 3.932847023010254, + "learning_rate": 8.911975962590319e-06, + "loss": 0.8532, + "step": 4762 + }, + { + "epoch": 1.2667553191489362, + "grad_norm": 4.163691520690918, + "learning_rate": 8.911428161808962e-06, + "loss": 0.9048, + "step": 4763 + }, + { + "epoch": 1.2670212765957447, + "grad_norm": 4.368598461151123, + "learning_rate": 8.910880240003413e-06, + "loss": 0.7907, + "step": 4764 + }, + { + "epoch": 1.2672872340425532, + "grad_norm": 4.071594715118408, + "learning_rate": 8.910332197190623e-06, + "loss": 0.8764, + "step": 4765 + }, + { + "epoch": 1.2675531914893616, + "grad_norm": 3.6952078342437744, + "learning_rate": 8.909784033387552e-06, + "loss": 0.8343, + "step": 4766 + }, + { + "epoch": 1.2678191489361703, + "grad_norm": 3.967707872390747, + "learning_rate": 8.909235748611161e-06, + "loss": 0.7465, + "step": 4767 + }, + { + "epoch": 1.2680851063829788, + "grad_norm": 4.079662799835205, + "learning_rate": 8.908687342878413e-06, + "loss": 0.8126, + "step": 4768 + }, + { + "epoch": 1.2683510638297872, + "grad_norm": 3.95373272895813, + "learning_rate": 8.908138816206275e-06, + "loss": 0.7309, + "step": 4769 + }, + { + "epoch": 1.2686170212765957, + "grad_norm": 3.959603786468506, + "learning_rate": 8.907590168611724e-06, + "loss": 0.7635, + "step": 4770 + }, + { + "epoch": 1.2688829787234042, + "grad_norm": 3.9669322967529297, + "learning_rate": 8.90704140011173e-06, + "loss": 0.9031, + "step": 4771 + }, + { + "epoch": 1.2691489361702128, + "grad_norm": 4.063694477081299, + "learning_rate": 8.906492510723276e-06, + "loss": 0.8292, + "step": 4772 + }, + { + "epoch": 1.2694148936170213, + "grad_norm": 3.9221720695495605, + "learning_rate": 8.905943500463344e-06, + "loss": 0.7683, + "step": 4773 + }, + { + "epoch": 1.2696808510638298, + "grad_norm": 3.9919097423553467, + "learning_rate": 8.905394369348921e-06, + "loss": 0.7647, + "step": 4774 + }, + { + "epoch": 1.2699468085106382, + "grad_norm": 3.8253092765808105, + "learning_rate": 8.904845117397e-06, + "loss": 0.7056, + "step": 4775 + }, + { + "epoch": 1.2702127659574467, + "grad_norm": 3.5580105781555176, + "learning_rate": 8.904295744624572e-06, + "loss": 0.7939, + "step": 4776 + }, + { + "epoch": 1.2704787234042554, + "grad_norm": 3.987231492996216, + "learning_rate": 8.903746251048638e-06, + "loss": 0.8708, + "step": 4777 + }, + { + "epoch": 1.2707446808510638, + "grad_norm": 3.8669490814208984, + "learning_rate": 8.903196636686198e-06, + "loss": 0.776, + "step": 4778 + }, + { + "epoch": 1.2710106382978723, + "grad_norm": 3.940711259841919, + "learning_rate": 8.902646901554258e-06, + "loss": 0.7831, + "step": 4779 + }, + { + "epoch": 1.2712765957446808, + "grad_norm": 4.304079055786133, + "learning_rate": 8.90209704566983e-06, + "loss": 0.8243, + "step": 4780 + }, + { + "epoch": 1.2715425531914892, + "grad_norm": 4.165473937988281, + "learning_rate": 8.901547069049924e-06, + "loss": 0.8804, + "step": 4781 + }, + { + "epoch": 1.271808510638298, + "grad_norm": 3.84690260887146, + "learning_rate": 8.900996971711558e-06, + "loss": 0.8067, + "step": 4782 + }, + { + "epoch": 1.2720744680851064, + "grad_norm": 3.9118542671203613, + 
"learning_rate": 8.900446753671754e-06, + "loss": 0.8676, + "step": 4783 + }, + { + "epoch": 1.2723404255319148, + "grad_norm": 4.110815525054932, + "learning_rate": 8.899896414947534e-06, + "loss": 0.6605, + "step": 4784 + }, + { + "epoch": 1.2726063829787235, + "grad_norm": 3.7008938789367676, + "learning_rate": 8.899345955555928e-06, + "loss": 0.7201, + "step": 4785 + }, + { + "epoch": 1.272872340425532, + "grad_norm": 4.3613691329956055, + "learning_rate": 8.898795375513966e-06, + "loss": 0.806, + "step": 4786 + }, + { + "epoch": 1.2731382978723405, + "grad_norm": 4.315506458282471, + "learning_rate": 8.898244674838687e-06, + "loss": 0.8599, + "step": 4787 + }, + { + "epoch": 1.273404255319149, + "grad_norm": 3.8863260746002197, + "learning_rate": 8.897693853547127e-06, + "loss": 0.7735, + "step": 4788 + }, + { + "epoch": 1.2736702127659574, + "grad_norm": 4.221061706542969, + "learning_rate": 8.89714291165633e-06, + "loss": 0.9449, + "step": 4789 + }, + { + "epoch": 1.273936170212766, + "grad_norm": 3.727510929107666, + "learning_rate": 8.896591849183343e-06, + "loss": 0.8311, + "step": 4790 + }, + { + "epoch": 1.2742021276595745, + "grad_norm": 3.9543018341064453, + "learning_rate": 8.896040666145218e-06, + "loss": 0.6876, + "step": 4791 + }, + { + "epoch": 1.274468085106383, + "grad_norm": 3.7465333938598633, + "learning_rate": 8.895489362559007e-06, + "loss": 0.7677, + "step": 4792 + }, + { + "epoch": 1.2747340425531914, + "grad_norm": 4.069217205047607, + "learning_rate": 8.894937938441768e-06, + "loss": 0.8168, + "step": 4793 + }, + { + "epoch": 1.275, + "grad_norm": 4.367965221405029, + "learning_rate": 8.894386393810563e-06, + "loss": 0.7627, + "step": 4794 + }, + { + "epoch": 1.2752659574468086, + "grad_norm": 3.4115452766418457, + "learning_rate": 8.893834728682459e-06, + "loss": 0.6498, + "step": 4795 + }, + { + "epoch": 1.275531914893617, + "grad_norm": 3.94594669342041, + "learning_rate": 8.893282943074524e-06, + "loss": 0.7735, + "step": 4796 + }, + { + "epoch": 1.2757978723404255, + "grad_norm": 3.6856279373168945, + "learning_rate": 8.89273103700383e-06, + "loss": 0.8616, + "step": 4797 + }, + { + "epoch": 1.276063829787234, + "grad_norm": 3.8516628742218018, + "learning_rate": 8.892179010487456e-06, + "loss": 0.8549, + "step": 4798 + }, + { + "epoch": 1.2763297872340424, + "grad_norm": 4.085914611816406, + "learning_rate": 8.891626863542479e-06, + "loss": 0.7623, + "step": 4799 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 3.8456547260284424, + "learning_rate": 8.891074596185987e-06, + "loss": 0.8117, + "step": 4800 + }, + { + "epoch": 1.2768617021276596, + "grad_norm": 4.302917003631592, + "learning_rate": 8.890522208435067e-06, + "loss": 0.8329, + "step": 4801 + }, + { + "epoch": 1.277127659574468, + "grad_norm": 4.0489912033081055, + "learning_rate": 8.889969700306807e-06, + "loss": 0.8957, + "step": 4802 + }, + { + "epoch": 1.2773936170212765, + "grad_norm": 4.2099199295043945, + "learning_rate": 8.889417071818306e-06, + "loss": 0.7582, + "step": 4803 + }, + { + "epoch": 1.277659574468085, + "grad_norm": 3.925480842590332, + "learning_rate": 8.888864322986658e-06, + "loss": 0.814, + "step": 4804 + }, + { + "epoch": 1.2779255319148937, + "grad_norm": 3.9066643714904785, + "learning_rate": 8.888311453828973e-06, + "loss": 0.798, + "step": 4805 + }, + { + "epoch": 1.2781914893617021, + "grad_norm": 3.6610445976257324, + "learning_rate": 8.887758464362352e-06, + "loss": 0.708, + "step": 4806 + }, + { + "epoch": 1.2784574468085106, + "grad_norm": 
3.639225482940674, + "learning_rate": 8.887205354603908e-06, + "loss": 0.9377, + "step": 4807 + }, + { + "epoch": 1.2787234042553193, + "grad_norm": 4.213227272033691, + "learning_rate": 8.886652124570753e-06, + "loss": 0.8664, + "step": 4808 + }, + { + "epoch": 1.2789893617021277, + "grad_norm": 3.916071653366089, + "learning_rate": 8.886098774280006e-06, + "loss": 0.8438, + "step": 4809 + }, + { + "epoch": 1.2792553191489362, + "grad_norm": 3.6656155586242676, + "learning_rate": 8.885545303748786e-06, + "loss": 0.8395, + "step": 4810 + }, + { + "epoch": 1.2795212765957447, + "grad_norm": 3.8457565307617188, + "learning_rate": 8.884991712994223e-06, + "loss": 0.7528, + "step": 4811 + }, + { + "epoch": 1.2797872340425531, + "grad_norm": 4.223479270935059, + "learning_rate": 8.88443800203344e-06, + "loss": 0.8702, + "step": 4812 + }, + { + "epoch": 1.2800531914893618, + "grad_norm": 3.9296419620513916, + "learning_rate": 8.88388417088357e-06, + "loss": 0.8804, + "step": 4813 + }, + { + "epoch": 1.2803191489361703, + "grad_norm": 4.048618316650391, + "learning_rate": 8.883330219561754e-06, + "loss": 0.8696, + "step": 4814 + }, + { + "epoch": 1.2805851063829787, + "grad_norm": 3.960580825805664, + "learning_rate": 8.882776148085129e-06, + "loss": 0.7783, + "step": 4815 + }, + { + "epoch": 1.2808510638297872, + "grad_norm": 4.032505035400391, + "learning_rate": 8.882221956470838e-06, + "loss": 0.8208, + "step": 4816 + }, + { + "epoch": 1.2811170212765957, + "grad_norm": 4.192906379699707, + "learning_rate": 8.881667644736028e-06, + "loss": 0.8411, + "step": 4817 + }, + { + "epoch": 1.2813829787234043, + "grad_norm": 3.9931344985961914, + "learning_rate": 8.881113212897851e-06, + "loss": 0.8844, + "step": 4818 + }, + { + "epoch": 1.2816489361702128, + "grad_norm": 4.1028923988342285, + "learning_rate": 8.880558660973462e-06, + "loss": 0.7664, + "step": 4819 + }, + { + "epoch": 1.2819148936170213, + "grad_norm": 4.039322376251221, + "learning_rate": 8.880003988980019e-06, + "loss": 0.8436, + "step": 4820 + }, + { + "epoch": 1.2821808510638297, + "grad_norm": 4.0381388664245605, + "learning_rate": 8.879449196934687e-06, + "loss": 0.749, + "step": 4821 + }, + { + "epoch": 1.2824468085106382, + "grad_norm": 4.3847222328186035, + "learning_rate": 8.878894284854626e-06, + "loss": 0.8086, + "step": 4822 + }, + { + "epoch": 1.2827127659574469, + "grad_norm": 4.213246822357178, + "learning_rate": 8.878339252757011e-06, + "loss": 0.9063, + "step": 4823 + }, + { + "epoch": 1.2829787234042553, + "grad_norm": 4.628039360046387, + "learning_rate": 8.877784100659013e-06, + "loss": 0.9035, + "step": 4824 + }, + { + "epoch": 1.2832446808510638, + "grad_norm": 3.940800905227661, + "learning_rate": 8.877228828577809e-06, + "loss": 0.8975, + "step": 4825 + }, + { + "epoch": 1.2835106382978723, + "grad_norm": 3.82865571975708, + "learning_rate": 8.87667343653058e-06, + "loss": 0.7283, + "step": 4826 + }, + { + "epoch": 1.2837765957446807, + "grad_norm": 4.173588752746582, + "learning_rate": 8.876117924534511e-06, + "loss": 0.8323, + "step": 4827 + }, + { + "epoch": 1.2840425531914894, + "grad_norm": 3.6624155044555664, + "learning_rate": 8.87556229260679e-06, + "loss": 0.8799, + "step": 4828 + }, + { + "epoch": 1.2843085106382979, + "grad_norm": 3.8801040649414062, + "learning_rate": 8.875006540764607e-06, + "loss": 0.7246, + "step": 4829 + }, + { + "epoch": 1.2845744680851063, + "grad_norm": 3.9223177433013916, + "learning_rate": 8.874450669025161e-06, + "loss": 0.8083, + "step": 4830 + }, + { + "epoch": 
1.284840425531915, + "grad_norm": 3.640429735183716, + "learning_rate": 8.87389467740565e-06, + "loss": 0.8996, + "step": 4831 + }, + { + "epoch": 1.2851063829787235, + "grad_norm": 3.7746853828430176, + "learning_rate": 8.873338565923275e-06, + "loss": 0.6899, + "step": 4832 + }, + { + "epoch": 1.285372340425532, + "grad_norm": 4.439557075500488, + "learning_rate": 8.872782334595246e-06, + "loss": 0.9741, + "step": 4833 + }, + { + "epoch": 1.2856382978723404, + "grad_norm": 4.051036834716797, + "learning_rate": 8.872225983438774e-06, + "loss": 0.8935, + "step": 4834 + }, + { + "epoch": 1.2859042553191489, + "grad_norm": 4.3584370613098145, + "learning_rate": 8.871669512471068e-06, + "loss": 0.8499, + "step": 4835 + }, + { + "epoch": 1.2861702127659576, + "grad_norm": 3.96370792388916, + "learning_rate": 8.87111292170935e-06, + "loss": 0.8756, + "step": 4836 + }, + { + "epoch": 1.286436170212766, + "grad_norm": 3.8416450023651123, + "learning_rate": 8.87055621117084e-06, + "loss": 0.7347, + "step": 4837 + }, + { + "epoch": 1.2867021276595745, + "grad_norm": 3.84533429145813, + "learning_rate": 8.869999380872765e-06, + "loss": 0.7894, + "step": 4838 + }, + { + "epoch": 1.286968085106383, + "grad_norm": 4.616893768310547, + "learning_rate": 8.869442430832351e-06, + "loss": 0.8618, + "step": 4839 + }, + { + "epoch": 1.2872340425531914, + "grad_norm": 3.9372458457946777, + "learning_rate": 8.868885361066835e-06, + "loss": 0.785, + "step": 4840 + }, + { + "epoch": 1.2875, + "grad_norm": 3.895632743835449, + "learning_rate": 8.868328171593448e-06, + "loss": 0.7812, + "step": 4841 + }, + { + "epoch": 1.2877659574468086, + "grad_norm": 4.029928684234619, + "learning_rate": 8.867770862429434e-06, + "loss": 0.8724, + "step": 4842 + }, + { + "epoch": 1.288031914893617, + "grad_norm": 3.8094303607940674, + "learning_rate": 8.867213433592037e-06, + "loss": 0.791, + "step": 4843 + }, + { + "epoch": 1.2882978723404255, + "grad_norm": 3.862415313720703, + "learning_rate": 8.866655885098502e-06, + "loss": 0.8223, + "step": 4844 + }, + { + "epoch": 1.288563829787234, + "grad_norm": 4.023502826690674, + "learning_rate": 8.866098216966081e-06, + "loss": 0.8339, + "step": 4845 + }, + { + "epoch": 1.2888297872340426, + "grad_norm": 3.7530012130737305, + "learning_rate": 8.865540429212031e-06, + "loss": 0.7766, + "step": 4846 + }, + { + "epoch": 1.289095744680851, + "grad_norm": 3.7417378425598145, + "learning_rate": 8.864982521853609e-06, + "loss": 0.9348, + "step": 4847 + }, + { + "epoch": 1.2893617021276595, + "grad_norm": 4.337246417999268, + "learning_rate": 8.864424494908076e-06, + "loss": 0.8423, + "step": 4848 + }, + { + "epoch": 1.289627659574468, + "grad_norm": 4.149337291717529, + "learning_rate": 8.8638663483927e-06, + "loss": 0.9212, + "step": 4849 + }, + { + "epoch": 1.2898936170212765, + "grad_norm": 4.155276298522949, + "learning_rate": 8.86330808232475e-06, + "loss": 0.9331, + "step": 4850 + }, + { + "epoch": 1.2901595744680852, + "grad_norm": 3.66481876373291, + "learning_rate": 8.8627496967215e-06, + "loss": 0.7795, + "step": 4851 + }, + { + "epoch": 1.2904255319148936, + "grad_norm": 4.018246650695801, + "learning_rate": 8.862191191600227e-06, + "loss": 0.8021, + "step": 4852 + }, + { + "epoch": 1.290691489361702, + "grad_norm": 4.123905658721924, + "learning_rate": 8.86163256697821e-06, + "loss": 0.8106, + "step": 4853 + }, + { + "epoch": 1.2909574468085108, + "grad_norm": 4.097765922546387, + "learning_rate": 8.861073822872735e-06, + "loss": 0.8006, + "step": 4854 + }, + { + "epoch": 
1.2912234042553192, + "grad_norm": 4.317656517028809, + "learning_rate": 8.86051495930109e-06, + "loss": 0.8026, + "step": 4855 + }, + { + "epoch": 1.2914893617021277, + "grad_norm": 3.8379859924316406, + "learning_rate": 8.859955976280568e-06, + "loss": 0.813, + "step": 4856 + }, + { + "epoch": 1.2917553191489362, + "grad_norm": 4.173714637756348, + "learning_rate": 8.859396873828461e-06, + "loss": 0.8064, + "step": 4857 + }, + { + "epoch": 1.2920212765957446, + "grad_norm": 4.439601898193359, + "learning_rate": 8.858837651962073e-06, + "loss": 0.8187, + "step": 4858 + }, + { + "epoch": 1.2922872340425533, + "grad_norm": 3.970308542251587, + "learning_rate": 8.858278310698705e-06, + "loss": 0.7977, + "step": 4859 + }, + { + "epoch": 1.2925531914893618, + "grad_norm": 3.7830026149749756, + "learning_rate": 8.857718850055663e-06, + "loss": 0.7371, + "step": 4860 + }, + { + "epoch": 1.2928191489361702, + "grad_norm": 3.9715933799743652, + "learning_rate": 8.857159270050258e-06, + "loss": 0.9022, + "step": 4861 + }, + { + "epoch": 1.2930851063829787, + "grad_norm": 3.824910879135132, + "learning_rate": 8.856599570699805e-06, + "loss": 0.7895, + "step": 4862 + }, + { + "epoch": 1.2933510638297872, + "grad_norm": 4.079301357269287, + "learning_rate": 8.856039752021619e-06, + "loss": 0.8215, + "step": 4863 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 3.722262382507324, + "learning_rate": 8.855479814033024e-06, + "loss": 0.7611, + "step": 4864 + }, + { + "epoch": 1.2938829787234043, + "grad_norm": 3.853123664855957, + "learning_rate": 8.854919756751343e-06, + "loss": 0.7494, + "step": 4865 + }, + { + "epoch": 1.2941489361702128, + "grad_norm": 3.9518027305603027, + "learning_rate": 8.854359580193907e-06, + "loss": 0.7751, + "step": 4866 + }, + { + "epoch": 1.2944148936170212, + "grad_norm": 4.295631408691406, + "learning_rate": 8.853799284378048e-06, + "loss": 0.8227, + "step": 4867 + }, + { + "epoch": 1.2946808510638297, + "grad_norm": 3.7936043739318848, + "learning_rate": 8.853238869321104e-06, + "loss": 0.7634, + "step": 4868 + }, + { + "epoch": 1.2949468085106384, + "grad_norm": 4.017428874969482, + "learning_rate": 8.85267833504041e-06, + "loss": 0.732, + "step": 4869 + }, + { + "epoch": 1.2952127659574468, + "grad_norm": 4.081499099731445, + "learning_rate": 8.852117681553312e-06, + "loss": 0.8568, + "step": 4870 + }, + { + "epoch": 1.2954787234042553, + "grad_norm": 4.4456281661987305, + "learning_rate": 8.851556908877159e-06, + "loss": 0.8038, + "step": 4871 + }, + { + "epoch": 1.2957446808510638, + "grad_norm": 4.371933460235596, + "learning_rate": 8.8509960170293e-06, + "loss": 0.7515, + "step": 4872 + }, + { + "epoch": 1.2960106382978722, + "grad_norm": 3.5804035663604736, + "learning_rate": 8.85043500602709e-06, + "loss": 0.7818, + "step": 4873 + }, + { + "epoch": 1.296276595744681, + "grad_norm": 4.176633834838867, + "learning_rate": 8.849873875887888e-06, + "loss": 0.8217, + "step": 4874 + }, + { + "epoch": 1.2965425531914894, + "grad_norm": 3.9609858989715576, + "learning_rate": 8.849312626629055e-06, + "loss": 0.8517, + "step": 4875 + }, + { + "epoch": 1.2968085106382978, + "grad_norm": 4.5829291343688965, + "learning_rate": 8.848751258267959e-06, + "loss": 1.0122, + "step": 4876 + }, + { + "epoch": 1.2970744680851065, + "grad_norm": 3.677952766418457, + "learning_rate": 8.848189770821965e-06, + "loss": 0.8094, + "step": 4877 + }, + { + "epoch": 1.297340425531915, + "grad_norm": 4.067968368530273, + "learning_rate": 8.84762816430845e-06, + "loss": 0.8764, + "step": 4878 
+ }, + { + "epoch": 1.2976063829787234, + "grad_norm": 3.8500382900238037, + "learning_rate": 8.847066438744792e-06, + "loss": 0.8741, + "step": 4879 + }, + { + "epoch": 1.297872340425532, + "grad_norm": 3.8818368911743164, + "learning_rate": 8.846504594148366e-06, + "loss": 0.8485, + "step": 4880 + }, + { + "epoch": 1.2981382978723404, + "grad_norm": 3.9118518829345703, + "learning_rate": 8.84594263053656e-06, + "loss": 0.9005, + "step": 4881 + }, + { + "epoch": 1.298404255319149, + "grad_norm": 3.889709711074829, + "learning_rate": 8.84538054792676e-06, + "loss": 0.9367, + "step": 4882 + }, + { + "epoch": 1.2986702127659575, + "grad_norm": 3.9546077251434326, + "learning_rate": 8.844818346336361e-06, + "loss": 0.8102, + "step": 4883 + }, + { + "epoch": 1.298936170212766, + "grad_norm": 4.036288738250732, + "learning_rate": 8.844256025782754e-06, + "loss": 0.9124, + "step": 4884 + }, + { + "epoch": 1.2992021276595744, + "grad_norm": 3.9991087913513184, + "learning_rate": 8.84369358628334e-06, + "loss": 0.7885, + "step": 4885 + }, + { + "epoch": 1.299468085106383, + "grad_norm": 3.767066478729248, + "learning_rate": 8.84313102785552e-06, + "loss": 0.8147, + "step": 4886 + }, + { + "epoch": 1.2997340425531916, + "grad_norm": 3.645434617996216, + "learning_rate": 8.842568350516702e-06, + "loss": 0.7238, + "step": 4887 + }, + { + "epoch": 1.3, + "grad_norm": 3.777766466140747, + "learning_rate": 8.842005554284296e-06, + "loss": 0.816, + "step": 4888 + }, + { + "epoch": 1.3002659574468085, + "grad_norm": 3.8868510723114014, + "learning_rate": 8.841442639175714e-06, + "loss": 0.8835, + "step": 4889 + }, + { + "epoch": 1.300531914893617, + "grad_norm": 4.271452903747559, + "learning_rate": 8.840879605208374e-06, + "loss": 0.8119, + "step": 4890 + }, + { + "epoch": 1.3007978723404254, + "grad_norm": 3.4486215114593506, + "learning_rate": 8.840316452399697e-06, + "loss": 0.7602, + "step": 4891 + }, + { + "epoch": 1.3010638297872341, + "grad_norm": 3.726085901260376, + "learning_rate": 8.839753180767108e-06, + "loss": 0.7252, + "step": 4892 + }, + { + "epoch": 1.3013297872340426, + "grad_norm": 4.51430082321167, + "learning_rate": 8.839189790328033e-06, + "loss": 0.8133, + "step": 4893 + }, + { + "epoch": 1.301595744680851, + "grad_norm": 4.0574469566345215, + "learning_rate": 8.838626281099908e-06, + "loss": 0.8436, + "step": 4894 + }, + { + "epoch": 1.3018617021276595, + "grad_norm": 4.096327304840088, + "learning_rate": 8.838062653100165e-06, + "loss": 0.8056, + "step": 4895 + }, + { + "epoch": 1.302127659574468, + "grad_norm": 4.048945903778076, + "learning_rate": 8.837498906346247e-06, + "loss": 0.8764, + "step": 4896 + }, + { + "epoch": 1.3023936170212767, + "grad_norm": 3.9284706115722656, + "learning_rate": 8.836935040855591e-06, + "loss": 0.7626, + "step": 4897 + }, + { + "epoch": 1.3026595744680851, + "grad_norm": 3.914583444595337, + "learning_rate": 8.83637105664565e-06, + "loss": 0.7855, + "step": 4898 + }, + { + "epoch": 1.3029255319148936, + "grad_norm": 4.442378520965576, + "learning_rate": 8.835806953733871e-06, + "loss": 0.8103, + "step": 4899 + }, + { + "epoch": 1.3031914893617023, + "grad_norm": 3.8343191146850586, + "learning_rate": 8.83524273213771e-06, + "loss": 0.8425, + "step": 4900 + }, + { + "epoch": 1.3034574468085105, + "grad_norm": 4.154768943786621, + "learning_rate": 8.834678391874623e-06, + "loss": 0.7792, + "step": 4901 + }, + { + "epoch": 1.3037234042553192, + "grad_norm": 4.136390209197998, + "learning_rate": 8.834113932962071e-06, + "loss": 0.8578, + "step": 
4902 + }, + { + "epoch": 1.3039893617021276, + "grad_norm": 4.139702320098877, + "learning_rate": 8.833549355417518e-06, + "loss": 0.724, + "step": 4903 + }, + { + "epoch": 1.304255319148936, + "grad_norm": 4.213815689086914, + "learning_rate": 8.83298465925844e-06, + "loss": 0.7892, + "step": 4904 + }, + { + "epoch": 1.3045212765957448, + "grad_norm": 4.048974990844727, + "learning_rate": 8.832419844502298e-06, + "loss": 0.829, + "step": 4905 + }, + { + "epoch": 1.3047872340425533, + "grad_norm": 4.729825496673584, + "learning_rate": 8.831854911166577e-06, + "loss": 0.9176, + "step": 4906 + }, + { + "epoch": 1.3050531914893617, + "grad_norm": 3.5801501274108887, + "learning_rate": 8.831289859268753e-06, + "loss": 0.724, + "step": 4907 + }, + { + "epoch": 1.3053191489361702, + "grad_norm": 4.097287654876709, + "learning_rate": 8.83072468882631e-06, + "loss": 0.8299, + "step": 4908 + }, + { + "epoch": 1.3055851063829786, + "grad_norm": 4.027351379394531, + "learning_rate": 8.830159399856734e-06, + "loss": 0.9384, + "step": 4909 + }, + { + "epoch": 1.3058510638297873, + "grad_norm": 4.275338649749756, + "learning_rate": 8.829593992377518e-06, + "loss": 0.7921, + "step": 4910 + }, + { + "epoch": 1.3061170212765958, + "grad_norm": 4.1409220695495605, + "learning_rate": 8.829028466406156e-06, + "loss": 0.8888, + "step": 4911 + }, + { + "epoch": 1.3063829787234043, + "grad_norm": 3.6458733081817627, + "learning_rate": 8.828462821960143e-06, + "loss": 0.7371, + "step": 4912 + }, + { + "epoch": 1.3066489361702127, + "grad_norm": 3.8695321083068848, + "learning_rate": 8.827897059056983e-06, + "loss": 0.8467, + "step": 4913 + }, + { + "epoch": 1.3069148936170212, + "grad_norm": 3.693190336227417, + "learning_rate": 8.827331177714183e-06, + "loss": 0.8182, + "step": 4914 + }, + { + "epoch": 1.3071808510638299, + "grad_norm": 3.806725263595581, + "learning_rate": 8.826765177949248e-06, + "loss": 0.8669, + "step": 4915 + }, + { + "epoch": 1.3074468085106383, + "grad_norm": 3.970451593399048, + "learning_rate": 8.826199059779695e-06, + "loss": 0.9024, + "step": 4916 + }, + { + "epoch": 1.3077127659574468, + "grad_norm": 3.7471280097961426, + "learning_rate": 8.825632823223037e-06, + "loss": 0.7707, + "step": 4917 + }, + { + "epoch": 1.3079787234042553, + "grad_norm": 4.0794267654418945, + "learning_rate": 8.825066468296796e-06, + "loss": 0.8489, + "step": 4918 + }, + { + "epoch": 1.3082446808510637, + "grad_norm": 3.681044578552246, + "learning_rate": 8.824499995018494e-06, + "loss": 0.7854, + "step": 4919 + }, + { + "epoch": 1.3085106382978724, + "grad_norm": 3.9300031661987305, + "learning_rate": 8.82393340340566e-06, + "loss": 0.8076, + "step": 4920 + }, + { + "epoch": 1.3087765957446809, + "grad_norm": 3.5358026027679443, + "learning_rate": 8.823366693475826e-06, + "loss": 0.7239, + "step": 4921 + }, + { + "epoch": 1.3090425531914893, + "grad_norm": 3.7831380367279053, + "learning_rate": 8.822799865246522e-06, + "loss": 0.8004, + "step": 4922 + }, + { + "epoch": 1.309308510638298, + "grad_norm": 3.6898906230926514, + "learning_rate": 8.822232918735292e-06, + "loss": 0.765, + "step": 4923 + }, + { + "epoch": 1.3095744680851062, + "grad_norm": 3.685541868209839, + "learning_rate": 8.821665853959673e-06, + "loss": 0.9544, + "step": 4924 + }, + { + "epoch": 1.309840425531915, + "grad_norm": 4.169592380523682, + "learning_rate": 8.821098670937215e-06, + "loss": 0.9082, + "step": 4925 + }, + { + "epoch": 1.3101063829787234, + "grad_norm": 3.870544910430908, + "learning_rate": 8.820531369685464e-06, + 
"loss": 0.7508, + "step": 4926 + }, + { + "epoch": 1.3103723404255319, + "grad_norm": 3.920816659927368, + "learning_rate": 8.819963950221976e-06, + "loss": 0.849, + "step": 4927 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 3.8789918422698975, + "learning_rate": 8.819396412564305e-06, + "loss": 0.7916, + "step": 4928 + }, + { + "epoch": 1.310904255319149, + "grad_norm": 3.8481719493865967, + "learning_rate": 8.818828756730012e-06, + "loss": 0.7985, + "step": 4929 + }, + { + "epoch": 1.3111702127659575, + "grad_norm": 4.481472015380859, + "learning_rate": 8.818260982736662e-06, + "loss": 0.7636, + "step": 4930 + }, + { + "epoch": 1.311436170212766, + "grad_norm": 3.4751243591308594, + "learning_rate": 8.81769309060182e-06, + "loss": 0.7336, + "step": 4931 + }, + { + "epoch": 1.3117021276595744, + "grad_norm": 4.149890899658203, + "learning_rate": 8.81712508034306e-06, + "loss": 0.8473, + "step": 4932 + }, + { + "epoch": 1.311968085106383, + "grad_norm": 3.9108872413635254, + "learning_rate": 8.816556951977955e-06, + "loss": 0.7656, + "step": 4933 + }, + { + "epoch": 1.3122340425531915, + "grad_norm": 3.8704488277435303, + "learning_rate": 8.815988705524086e-06, + "loss": 0.8214, + "step": 4934 + }, + { + "epoch": 1.3125, + "grad_norm": 4.183962821960449, + "learning_rate": 8.815420340999034e-06, + "loss": 0.8411, + "step": 4935 + }, + { + "epoch": 1.3127659574468085, + "grad_norm": 3.7032434940338135, + "learning_rate": 8.814851858420384e-06, + "loss": 0.8455, + "step": 4936 + }, + { + "epoch": 1.313031914893617, + "grad_norm": 3.5762336254119873, + "learning_rate": 8.814283257805724e-06, + "loss": 0.7208, + "step": 4937 + }, + { + "epoch": 1.3132978723404256, + "grad_norm": 4.197664260864258, + "learning_rate": 8.813714539172653e-06, + "loss": 0.8642, + "step": 4938 + }, + { + "epoch": 1.313563829787234, + "grad_norm": 3.5386626720428467, + "learning_rate": 8.81314570253876e-06, + "loss": 0.6846, + "step": 4939 + }, + { + "epoch": 1.3138297872340425, + "grad_norm": 4.332328796386719, + "learning_rate": 8.812576747921653e-06, + "loss": 0.7862, + "step": 4940 + }, + { + "epoch": 1.314095744680851, + "grad_norm": 3.6495919227600098, + "learning_rate": 8.81200767533893e-06, + "loss": 0.676, + "step": 4941 + }, + { + "epoch": 1.3143617021276595, + "grad_norm": 3.717625617980957, + "learning_rate": 8.811438484808204e-06, + "loss": 0.8879, + "step": 4942 + }, + { + "epoch": 1.3146276595744681, + "grad_norm": 4.201274394989014, + "learning_rate": 8.810869176347082e-06, + "loss": 0.9174, + "step": 4943 + }, + { + "epoch": 1.3148936170212766, + "grad_norm": 3.3899879455566406, + "learning_rate": 8.810299749973182e-06, + "loss": 0.7209, + "step": 4944 + }, + { + "epoch": 1.315159574468085, + "grad_norm": 3.821558713912964, + "learning_rate": 8.80973020570412e-06, + "loss": 0.647, + "step": 4945 + }, + { + "epoch": 1.3154255319148938, + "grad_norm": 4.011831760406494, + "learning_rate": 8.809160543557523e-06, + "loss": 0.8387, + "step": 4946 + }, + { + "epoch": 1.315691489361702, + "grad_norm": 4.121433258056641, + "learning_rate": 8.80859076355101e-06, + "loss": 0.7835, + "step": 4947 + }, + { + "epoch": 1.3159574468085107, + "grad_norm": 4.066422462463379, + "learning_rate": 8.808020865702218e-06, + "loss": 0.7569, + "step": 4948 + }, + { + "epoch": 1.3162234042553191, + "grad_norm": 3.7616024017333984, + "learning_rate": 8.807450850028776e-06, + "loss": 0.7514, + "step": 4949 + }, + { + "epoch": 1.3164893617021276, + "grad_norm": 3.809521198272705, + "learning_rate": 
8.806880716548322e-06, + "loss": 0.8212, + "step": 4950 + }, + { + "epoch": 1.3167553191489363, + "grad_norm": 3.664140224456787, + "learning_rate": 8.806310465278496e-06, + "loss": 0.8303, + "step": 4951 + }, + { + "epoch": 1.3170212765957447, + "grad_norm": 3.978876829147339, + "learning_rate": 8.805740096236943e-06, + "loss": 0.8149, + "step": 4952 + }, + { + "epoch": 1.3172872340425532, + "grad_norm": 4.436275959014893, + "learning_rate": 8.805169609441312e-06, + "loss": 0.9033, + "step": 4953 + }, + { + "epoch": 1.3175531914893617, + "grad_norm": 3.9355101585388184, + "learning_rate": 8.804599004909251e-06, + "loss": 0.8599, + "step": 4954 + }, + { + "epoch": 1.3178191489361701, + "grad_norm": 3.6748297214508057, + "learning_rate": 8.80402828265842e-06, + "loss": 0.6637, + "step": 4955 + }, + { + "epoch": 1.3180851063829788, + "grad_norm": 3.953321695327759, + "learning_rate": 8.803457442706473e-06, + "loss": 0.7684, + "step": 4956 + }, + { + "epoch": 1.3183510638297873, + "grad_norm": 3.9680938720703125, + "learning_rate": 8.802886485071078e-06, + "loss": 0.8377, + "step": 4957 + }, + { + "epoch": 1.3186170212765957, + "grad_norm": 3.608375072479248, + "learning_rate": 8.802315409769894e-06, + "loss": 0.7671, + "step": 4958 + }, + { + "epoch": 1.3188829787234042, + "grad_norm": 3.7180373668670654, + "learning_rate": 8.801744216820596e-06, + "loss": 0.794, + "step": 4959 + }, + { + "epoch": 1.3191489361702127, + "grad_norm": 3.490082263946533, + "learning_rate": 8.801172906240857e-06, + "loss": 0.7993, + "step": 4960 + }, + { + "epoch": 1.3194148936170214, + "grad_norm": 3.9783389568328857, + "learning_rate": 8.800601478048351e-06, + "loss": 0.7455, + "step": 4961 + }, + { + "epoch": 1.3196808510638298, + "grad_norm": 4.333663463592529, + "learning_rate": 8.800029932260764e-06, + "loss": 0.8772, + "step": 4962 + }, + { + "epoch": 1.3199468085106383, + "grad_norm": 3.9584553241729736, + "learning_rate": 8.799458268895774e-06, + "loss": 0.8622, + "step": 4963 + }, + { + "epoch": 1.3202127659574467, + "grad_norm": 4.271299362182617, + "learning_rate": 8.798886487971073e-06, + "loss": 0.7591, + "step": 4964 + }, + { + "epoch": 1.3204787234042552, + "grad_norm": 4.128324508666992, + "learning_rate": 8.798314589504348e-06, + "loss": 0.7294, + "step": 4965 + }, + { + "epoch": 1.320744680851064, + "grad_norm": 3.613626718521118, + "learning_rate": 8.797742573513302e-06, + "loss": 0.8173, + "step": 4966 + }, + { + "epoch": 1.3210106382978724, + "grad_norm": 3.665271043777466, + "learning_rate": 8.797170440015627e-06, + "loss": 0.7592, + "step": 4967 + }, + { + "epoch": 1.3212765957446808, + "grad_norm": 4.036754608154297, + "learning_rate": 8.79659818902903e-06, + "loss": 0.7705, + "step": 4968 + }, + { + "epoch": 1.3215425531914895, + "grad_norm": 4.09188175201416, + "learning_rate": 8.796025820571213e-06, + "loss": 0.9028, + "step": 4969 + }, + { + "epoch": 1.3218085106382977, + "grad_norm": 3.8270485401153564, + "learning_rate": 8.795453334659889e-06, + "loss": 0.7337, + "step": 4970 + }, + { + "epoch": 1.3220744680851064, + "grad_norm": 4.005841255187988, + "learning_rate": 8.794880731312771e-06, + "loss": 0.8789, + "step": 4971 + }, + { + "epoch": 1.3223404255319149, + "grad_norm": 3.894681930541992, + "learning_rate": 8.794308010547574e-06, + "loss": 0.7452, + "step": 4972 + }, + { + "epoch": 1.3226063829787233, + "grad_norm": 3.7697856426239014, + "learning_rate": 8.79373517238202e-06, + "loss": 0.7111, + "step": 4973 + }, + { + "epoch": 1.322872340425532, + "grad_norm": 
4.162429332733154, + "learning_rate": 8.793162216833835e-06, + "loss": 0.8352, + "step": 4974 + }, + { + "epoch": 1.3231382978723405, + "grad_norm": 4.8362298011779785, + "learning_rate": 8.792589143920743e-06, + "loss": 0.8807, + "step": 4975 + }, + { + "epoch": 1.323404255319149, + "grad_norm": 4.283027172088623, + "learning_rate": 8.792015953660478e-06, + "loss": 0.9241, + "step": 4976 + }, + { + "epoch": 1.3236702127659574, + "grad_norm": 3.7246296405792236, + "learning_rate": 8.791442646070776e-06, + "loss": 0.8158, + "step": 4977 + }, + { + "epoch": 1.3239361702127659, + "grad_norm": 3.9116530418395996, + "learning_rate": 8.790869221169374e-06, + "loss": 0.7603, + "step": 4978 + }, + { + "epoch": 1.3242021276595746, + "grad_norm": 4.164322853088379, + "learning_rate": 8.790295678974015e-06, + "loss": 0.7518, + "step": 4979 + }, + { + "epoch": 1.324468085106383, + "grad_norm": 3.459543228149414, + "learning_rate": 8.789722019502444e-06, + "loss": 0.8216, + "step": 4980 + }, + { + "epoch": 1.3247340425531915, + "grad_norm": 3.4385783672332764, + "learning_rate": 8.789148242772414e-06, + "loss": 0.5722, + "step": 4981 + }, + { + "epoch": 1.325, + "grad_norm": 3.881467580795288, + "learning_rate": 8.788574348801676e-06, + "loss": 0.7652, + "step": 4982 + }, + { + "epoch": 1.3252659574468084, + "grad_norm": 3.8028674125671387, + "learning_rate": 8.788000337607984e-06, + "loss": 0.7125, + "step": 4983 + }, + { + "epoch": 1.325531914893617, + "grad_norm": 3.595238447189331, + "learning_rate": 8.787426209209104e-06, + "loss": 0.6849, + "step": 4984 + }, + { + "epoch": 1.3257978723404256, + "grad_norm": 4.597902774810791, + "learning_rate": 8.786851963622799e-06, + "loss": 0.8314, + "step": 4985 + }, + { + "epoch": 1.326063829787234, + "grad_norm": 4.151714324951172, + "learning_rate": 8.786277600866834e-06, + "loss": 0.8624, + "step": 4986 + }, + { + "epoch": 1.3263297872340425, + "grad_norm": 3.7185237407684326, + "learning_rate": 8.785703120958984e-06, + "loss": 0.7547, + "step": 4987 + }, + { + "epoch": 1.326595744680851, + "grad_norm": 3.964048385620117, + "learning_rate": 8.785128523917022e-06, + "loss": 0.8626, + "step": 4988 + }, + { + "epoch": 1.3268617021276596, + "grad_norm": 3.9490604400634766, + "learning_rate": 8.784553809758724e-06, + "loss": 0.7927, + "step": 4989 + }, + { + "epoch": 1.327127659574468, + "grad_norm": 3.736051321029663, + "learning_rate": 8.783978978501879e-06, + "loss": 0.7581, + "step": 4990 + }, + { + "epoch": 1.3273936170212766, + "grad_norm": 4.048060417175293, + "learning_rate": 8.783404030164269e-06, + "loss": 0.8141, + "step": 4991 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 3.542971134185791, + "learning_rate": 8.782828964763683e-06, + "loss": 0.8244, + "step": 4992 + }, + { + "epoch": 1.3279255319148935, + "grad_norm": 4.4042439460754395, + "learning_rate": 8.782253782317914e-06, + "loss": 0.7623, + "step": 4993 + }, + { + "epoch": 1.3281914893617022, + "grad_norm": 4.011150360107422, + "learning_rate": 8.781678482844763e-06, + "loss": 0.7879, + "step": 4994 + }, + { + "epoch": 1.3284574468085106, + "grad_norm": 3.9396347999572754, + "learning_rate": 8.781103066362024e-06, + "loss": 0.8731, + "step": 4995 + }, + { + "epoch": 1.328723404255319, + "grad_norm": 4.063819408416748, + "learning_rate": 8.780527532887506e-06, + "loss": 0.7255, + "step": 4996 + }, + { + "epoch": 1.3289893617021278, + "grad_norm": 3.684864044189453, + "learning_rate": 8.779951882439016e-06, + "loss": 0.7447, + "step": 4997 + }, + { + "epoch": 1.3292553191489362, + 
"grad_norm": 4.3980207443237305, + "learning_rate": 8.77937611503436e-06, + "loss": 0.8104, + "step": 4998 + }, + { + "epoch": 1.3295212765957447, + "grad_norm": 4.019001483917236, + "learning_rate": 8.778800230691363e-06, + "loss": 0.7426, + "step": 4999 + }, + { + "epoch": 1.3297872340425532, + "grad_norm": 4.1492486000061035, + "learning_rate": 8.778224229427836e-06, + "loss": 0.7929, + "step": 5000 + }, + { + "epoch": 1.3297872340425532, + "eval_loss": 1.2957489490509033, + "eval_runtime": 14.7283, + "eval_samples_per_second": 27.159, + "eval_steps_per_second": 3.395, + "step": 5000 + }, + { + "epoch": 1.3300531914893616, + "grad_norm": 3.742830753326416, + "learning_rate": 8.777648111261601e-06, + "loss": 0.6807, + "step": 5001 + }, + { + "epoch": 1.3303191489361703, + "grad_norm": 4.3522114753723145, + "learning_rate": 8.77707187621049e-06, + "loss": 0.8048, + "step": 5002 + }, + { + "epoch": 1.3305851063829788, + "grad_norm": 3.7916550636291504, + "learning_rate": 8.776495524292325e-06, + "loss": 0.8209, + "step": 5003 + }, + { + "epoch": 1.3308510638297872, + "grad_norm": 3.642531156539917, + "learning_rate": 8.775919055524941e-06, + "loss": 0.7274, + "step": 5004 + }, + { + "epoch": 1.3311170212765957, + "grad_norm": 3.885079860687256, + "learning_rate": 8.775342469926178e-06, + "loss": 0.8305, + "step": 5005 + }, + { + "epoch": 1.3313829787234042, + "grad_norm": 3.816824436187744, + "learning_rate": 8.774765767513876e-06, + "loss": 0.7605, + "step": 5006 + }, + { + "epoch": 1.3316489361702128, + "grad_norm": 4.696832656860352, + "learning_rate": 8.774188948305874e-06, + "loss": 0.8907, + "step": 5007 + }, + { + "epoch": 1.3319148936170213, + "grad_norm": 4.030970096588135, + "learning_rate": 8.773612012320023e-06, + "loss": 0.9613, + "step": 5008 + }, + { + "epoch": 1.3321808510638298, + "grad_norm": 4.046240329742432, + "learning_rate": 8.773034959574173e-06, + "loss": 0.7066, + "step": 5009 + }, + { + "epoch": 1.3324468085106382, + "grad_norm": 3.916098117828369, + "learning_rate": 8.77245779008618e-06, + "loss": 0.7762, + "step": 5010 + }, + { + "epoch": 1.3327127659574467, + "grad_norm": 4.096320629119873, + "learning_rate": 8.771880503873902e-06, + "loss": 0.7222, + "step": 5011 + }, + { + "epoch": 1.3329787234042554, + "grad_norm": 4.3136467933654785, + "learning_rate": 8.771303100955199e-06, + "loss": 0.8265, + "step": 5012 + }, + { + "epoch": 1.3332446808510638, + "grad_norm": 3.972031593322754, + "learning_rate": 8.770725581347938e-06, + "loss": 0.7263, + "step": 5013 + }, + { + "epoch": 1.3335106382978723, + "grad_norm": 4.295060634613037, + "learning_rate": 8.770147945069988e-06, + "loss": 0.8489, + "step": 5014 + }, + { + "epoch": 1.3337765957446808, + "grad_norm": 3.8986477851867676, + "learning_rate": 8.769570192139224e-06, + "loss": 0.7101, + "step": 5015 + }, + { + "epoch": 1.3340425531914892, + "grad_norm": 3.8135452270507812, + "learning_rate": 8.768992322573518e-06, + "loss": 0.7885, + "step": 5016 + }, + { + "epoch": 1.334308510638298, + "grad_norm": 3.727550983428955, + "learning_rate": 8.768414336390752e-06, + "loss": 0.8622, + "step": 5017 + }, + { + "epoch": 1.3345744680851064, + "grad_norm": 4.012676239013672, + "learning_rate": 8.76783623360881e-06, + "loss": 0.8938, + "step": 5018 + }, + { + "epoch": 1.3348404255319148, + "grad_norm": 4.344918727874756, + "learning_rate": 8.767258014245578e-06, + "loss": 0.8228, + "step": 5019 + }, + { + "epoch": 1.3351063829787235, + "grad_norm": 3.9926249980926514, + "learning_rate": 8.76667967831895e-06, + "loss": 
0.6513, + "step": 5020 + }, + { + "epoch": 1.335372340425532, + "grad_norm": 4.119525909423828, + "learning_rate": 8.766101225846816e-06, + "loss": 0.7887, + "step": 5021 + }, + { + "epoch": 1.3356382978723405, + "grad_norm": 4.538883686065674, + "learning_rate": 8.765522656847077e-06, + "loss": 0.796, + "step": 5022 + }, + { + "epoch": 1.335904255319149, + "grad_norm": 3.7550501823425293, + "learning_rate": 8.764943971337633e-06, + "loss": 0.7695, + "step": 5023 + }, + { + "epoch": 1.3361702127659574, + "grad_norm": 3.611605405807495, + "learning_rate": 8.76436516933639e-06, + "loss": 0.7483, + "step": 5024 + }, + { + "epoch": 1.336436170212766, + "grad_norm": 4.187867164611816, + "learning_rate": 8.763786250861258e-06, + "loss": 0.8277, + "step": 5025 + }, + { + "epoch": 1.3367021276595745, + "grad_norm": 3.9223055839538574, + "learning_rate": 8.763207215930147e-06, + "loss": 0.7724, + "step": 5026 + }, + { + "epoch": 1.336968085106383, + "grad_norm": 4.048906326293945, + "learning_rate": 8.762628064560975e-06, + "loss": 0.7923, + "step": 5027 + }, + { + "epoch": 1.3372340425531914, + "grad_norm": 4.241153240203857, + "learning_rate": 8.762048796771659e-06, + "loss": 0.8776, + "step": 5028 + }, + { + "epoch": 1.3375, + "grad_norm": 3.759209632873535, + "learning_rate": 8.761469412580126e-06, + "loss": 0.7554, + "step": 5029 + }, + { + "epoch": 1.3377659574468086, + "grad_norm": 3.8906912803649902, + "learning_rate": 8.760889912004297e-06, + "loss": 0.6977, + "step": 5030 + }, + { + "epoch": 1.338031914893617, + "grad_norm": 3.9501161575317383, + "learning_rate": 8.760310295062112e-06, + "loss": 0.9481, + "step": 5031 + }, + { + "epoch": 1.3382978723404255, + "grad_norm": 3.918553590774536, + "learning_rate": 8.759730561771494e-06, + "loss": 0.7882, + "step": 5032 + }, + { + "epoch": 1.338563829787234, + "grad_norm": 4.063170909881592, + "learning_rate": 8.759150712150388e-06, + "loss": 0.8415, + "step": 5033 + }, + { + "epoch": 1.3388297872340424, + "grad_norm": 3.863600015640259, + "learning_rate": 8.758570746216732e-06, + "loss": 0.807, + "step": 5034 + }, + { + "epoch": 1.3390957446808511, + "grad_norm": 3.9519717693328857, + "learning_rate": 8.757990663988474e-06, + "loss": 0.8594, + "step": 5035 + }, + { + "epoch": 1.3393617021276596, + "grad_norm": 4.245703220367432, + "learning_rate": 8.75741046548356e-06, + "loss": 0.7987, + "step": 5036 + }, + { + "epoch": 1.339627659574468, + "grad_norm": 4.1299729347229, + "learning_rate": 8.75683015071994e-06, + "loss": 0.9377, + "step": 5037 + }, + { + "epoch": 1.3398936170212765, + "grad_norm": 3.744929552078247, + "learning_rate": 8.756249719715576e-06, + "loss": 0.6875, + "step": 5038 + }, + { + "epoch": 1.340159574468085, + "grad_norm": 3.7629339694976807, + "learning_rate": 8.75566917248842e-06, + "loss": 0.7619, + "step": 5039 + }, + { + "epoch": 1.3404255319148937, + "grad_norm": 4.09276819229126, + "learning_rate": 8.75508850905644e-06, + "loss": 0.7618, + "step": 5040 + }, + { + "epoch": 1.3406914893617021, + "grad_norm": 4.220356464385986, + "learning_rate": 8.7545077294376e-06, + "loss": 0.9246, + "step": 5041 + }, + { + "epoch": 1.3409574468085106, + "grad_norm": 3.9419326782226562, + "learning_rate": 8.753926833649871e-06, + "loss": 0.7463, + "step": 5042 + }, + { + "epoch": 1.3412234042553193, + "grad_norm": 4.060051918029785, + "learning_rate": 8.753345821711224e-06, + "loss": 0.9061, + "step": 5043 + }, + { + "epoch": 1.3414893617021277, + "grad_norm": 3.7086057662963867, + "learning_rate": 8.75276469363964e-06, + "loss": 
0.8177, + "step": 5044 + }, + { + "epoch": 1.3417553191489362, + "grad_norm": 4.173861503601074, + "learning_rate": 8.752183449453098e-06, + "loss": 0.8117, + "step": 5045 + }, + { + "epoch": 1.3420212765957447, + "grad_norm": 4.282475471496582, + "learning_rate": 8.75160208916958e-06, + "loss": 0.8352, + "step": 5046 + }, + { + "epoch": 1.3422872340425531, + "grad_norm": 3.9250497817993164, + "learning_rate": 8.75102061280708e-06, + "loss": 0.8292, + "step": 5047 + }, + { + "epoch": 1.3425531914893618, + "grad_norm": 4.28936767578125, + "learning_rate": 8.750439020383584e-06, + "loss": 0.8269, + "step": 5048 + }, + { + "epoch": 1.3428191489361703, + "grad_norm": 4.007338523864746, + "learning_rate": 8.749857311917089e-06, + "loss": 0.8376, + "step": 5049 + }, + { + "epoch": 1.3430851063829787, + "grad_norm": 3.741140842437744, + "learning_rate": 8.749275487425595e-06, + "loss": 0.7936, + "step": 5050 + }, + { + "epoch": 1.3433510638297872, + "grad_norm": 3.8448450565338135, + "learning_rate": 8.748693546927101e-06, + "loss": 0.8088, + "step": 5051 + }, + { + "epoch": 1.3436170212765957, + "grad_norm": 4.5769782066345215, + "learning_rate": 8.748111490439617e-06, + "loss": 0.8315, + "step": 5052 + }, + { + "epoch": 1.3438829787234043, + "grad_norm": 4.1284871101379395, + "learning_rate": 8.74752931798115e-06, + "loss": 0.8866, + "step": 5053 + }, + { + "epoch": 1.3441489361702128, + "grad_norm": 3.9224517345428467, + "learning_rate": 8.746947029569715e-06, + "loss": 0.6403, + "step": 5054 + }, + { + "epoch": 1.3444148936170213, + "grad_norm": 4.114837169647217, + "learning_rate": 8.746364625223326e-06, + "loss": 0.7303, + "step": 5055 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 3.9492406845092773, + "learning_rate": 8.745782104960006e-06, + "loss": 0.7462, + "step": 5056 + }, + { + "epoch": 1.3449468085106382, + "grad_norm": 3.5633533000946045, + "learning_rate": 8.745199468797775e-06, + "loss": 0.8241, + "step": 5057 + }, + { + "epoch": 1.3452127659574469, + "grad_norm": 3.9602227210998535, + "learning_rate": 8.744616716754665e-06, + "loss": 0.8142, + "step": 5058 + }, + { + "epoch": 1.3454787234042553, + "grad_norm": 3.6486499309539795, + "learning_rate": 8.744033848848705e-06, + "loss": 0.7932, + "step": 5059 + }, + { + "epoch": 1.3457446808510638, + "grad_norm": 3.9516966342926025, + "learning_rate": 8.743450865097929e-06, + "loss": 0.7334, + "step": 5060 + }, + { + "epoch": 1.3460106382978723, + "grad_norm": 4.261397361755371, + "learning_rate": 8.742867765520377e-06, + "loss": 0.7549, + "step": 5061 + }, + { + "epoch": 1.3462765957446807, + "grad_norm": 4.082563877105713, + "learning_rate": 8.742284550134088e-06, + "loss": 0.8306, + "step": 5062 + }, + { + "epoch": 1.3465425531914894, + "grad_norm": 3.9603230953216553, + "learning_rate": 8.74170121895711e-06, + "loss": 0.832, + "step": 5063 + }, + { + "epoch": 1.3468085106382979, + "grad_norm": 4.0057692527771, + "learning_rate": 8.741117772007492e-06, + "loss": 0.783, + "step": 5064 + }, + { + "epoch": 1.3470744680851063, + "grad_norm": 4.130981922149658, + "learning_rate": 8.740534209303285e-06, + "loss": 0.6476, + "step": 5065 + }, + { + "epoch": 1.347340425531915, + "grad_norm": 3.641900062561035, + "learning_rate": 8.739950530862544e-06, + "loss": 0.9809, + "step": 5066 + }, + { + "epoch": 1.3476063829787235, + "grad_norm": 3.607656955718994, + "learning_rate": 8.739366736703331e-06, + "loss": 0.7784, + "step": 5067 + }, + { + "epoch": 1.347872340425532, + "grad_norm": 4.068065166473389, + "learning_rate": 
8.73878282684371e-06, + "loss": 0.9063, + "step": 5068 + }, + { + "epoch": 1.3481382978723404, + "grad_norm": 3.952601671218872, + "learning_rate": 8.738198801301745e-06, + "loss": 0.9279, + "step": 5069 + }, + { + "epoch": 1.3484042553191489, + "grad_norm": 4.016735553741455, + "learning_rate": 8.737614660095507e-06, + "loss": 0.7658, + "step": 5070 + }, + { + "epoch": 1.3486702127659576, + "grad_norm": 3.669020891189575, + "learning_rate": 8.737030403243074e-06, + "loss": 0.6806, + "step": 5071 + }, + { + "epoch": 1.348936170212766, + "grad_norm": 3.659308910369873, + "learning_rate": 8.736446030762518e-06, + "loss": 0.7539, + "step": 5072 + }, + { + "epoch": 1.3492021276595745, + "grad_norm": 3.9839887619018555, + "learning_rate": 8.735861542671924e-06, + "loss": 0.7342, + "step": 5073 + }, + { + "epoch": 1.349468085106383, + "grad_norm": 3.9134328365325928, + "learning_rate": 8.735276938989375e-06, + "loss": 0.8636, + "step": 5074 + }, + { + "epoch": 1.3497340425531914, + "grad_norm": 3.841643810272217, + "learning_rate": 8.73469221973296e-06, + "loss": 0.7273, + "step": 5075 + }, + { + "epoch": 1.35, + "grad_norm": 3.903296947479248, + "learning_rate": 8.734107384920771e-06, + "loss": 0.8596, + "step": 5076 + }, + { + "epoch": 1.3502659574468086, + "grad_norm": 4.10729455947876, + "learning_rate": 8.733522434570901e-06, + "loss": 0.8268, + "step": 5077 + }, + { + "epoch": 1.350531914893617, + "grad_norm": 3.913231611251831, + "learning_rate": 8.732937368701453e-06, + "loss": 0.8017, + "step": 5078 + }, + { + "epoch": 1.3507978723404255, + "grad_norm": 3.795318365097046, + "learning_rate": 8.732352187330528e-06, + "loss": 0.6833, + "step": 5079 + }, + { + "epoch": 1.351063829787234, + "grad_norm": 3.991790294647217, + "learning_rate": 8.731766890476232e-06, + "loss": 0.7068, + "step": 5080 + }, + { + "epoch": 1.3513297872340426, + "grad_norm": 4.177598476409912, + "learning_rate": 8.731181478156673e-06, + "loss": 0.806, + "step": 5081 + }, + { + "epoch": 1.351595744680851, + "grad_norm": 3.855368137359619, + "learning_rate": 8.730595950389968e-06, + "loss": 0.7752, + "step": 5082 + }, + { + "epoch": 1.3518617021276595, + "grad_norm": 4.333880424499512, + "learning_rate": 8.730010307194232e-06, + "loss": 0.771, + "step": 5083 + }, + { + "epoch": 1.352127659574468, + "grad_norm": 3.9861552715301514, + "learning_rate": 8.729424548587585e-06, + "loss": 0.873, + "step": 5084 + }, + { + "epoch": 1.3523936170212765, + "grad_norm": 4.271336078643799, + "learning_rate": 8.728838674588151e-06, + "loss": 0.8345, + "step": 5085 + }, + { + "epoch": 1.3526595744680852, + "grad_norm": 4.418639659881592, + "learning_rate": 8.72825268521406e-06, + "loss": 0.9593, + "step": 5086 + }, + { + "epoch": 1.3529255319148936, + "grad_norm": 4.122128963470459, + "learning_rate": 8.72766658048344e-06, + "loss": 0.6917, + "step": 5087 + }, + { + "epoch": 1.353191489361702, + "grad_norm": 3.9738972187042236, + "learning_rate": 8.727080360414428e-06, + "loss": 0.7446, + "step": 5088 + }, + { + "epoch": 1.3534574468085108, + "grad_norm": 4.067488670349121, + "learning_rate": 8.726494025025162e-06, + "loss": 0.6886, + "step": 5089 + }, + { + "epoch": 1.3537234042553192, + "grad_norm": 3.782886028289795, + "learning_rate": 8.725907574333783e-06, + "loss": 0.8159, + "step": 5090 + }, + { + "epoch": 1.3539893617021277, + "grad_norm": 3.9360549449920654, + "learning_rate": 8.725321008358436e-06, + "loss": 0.8189, + "step": 5091 + }, + { + "epoch": 1.3542553191489362, + "grad_norm": 4.132941246032715, + "learning_rate": 
8.724734327117273e-06, + "loss": 0.9677, + "step": 5092 + }, + { + "epoch": 1.3545212765957446, + "grad_norm": 4.25277042388916, + "learning_rate": 8.724147530628442e-06, + "loss": 0.8653, + "step": 5093 + }, + { + "epoch": 1.3547872340425533, + "grad_norm": 3.962684392929077, + "learning_rate": 8.723560618910103e-06, + "loss": 0.6903, + "step": 5094 + }, + { + "epoch": 1.3550531914893618, + "grad_norm": 3.9663078784942627, + "learning_rate": 8.722973591980414e-06, + "loss": 0.7572, + "step": 5095 + }, + { + "epoch": 1.3553191489361702, + "grad_norm": 4.48624849319458, + "learning_rate": 8.722386449857541e-06, + "loss": 0.9056, + "step": 5096 + }, + { + "epoch": 1.3555851063829787, + "grad_norm": 3.8394525051116943, + "learning_rate": 8.721799192559646e-06, + "loss": 0.7721, + "step": 5097 + }, + { + "epoch": 1.3558510638297872, + "grad_norm": 4.599715232849121, + "learning_rate": 8.721211820104903e-06, + "loss": 1.0118, + "step": 5098 + }, + { + "epoch": 1.3561170212765958, + "grad_norm": 4.1499528884887695, + "learning_rate": 8.720624332511484e-06, + "loss": 0.8979, + "step": 5099 + }, + { + "epoch": 1.3563829787234043, + "grad_norm": 3.8984806537628174, + "learning_rate": 8.72003672979757e-06, + "loss": 0.8824, + "step": 5100 + }, + { + "epoch": 1.3566489361702128, + "grad_norm": 3.709800958633423, + "learning_rate": 8.71944901198134e-06, + "loss": 0.8053, + "step": 5101 + }, + { + "epoch": 1.3569148936170212, + "grad_norm": 3.4785032272338867, + "learning_rate": 8.718861179080975e-06, + "loss": 0.6898, + "step": 5102 + }, + { + "epoch": 1.3571808510638297, + "grad_norm": 3.8457705974578857, + "learning_rate": 8.71827323111467e-06, + "loss": 0.75, + "step": 5103 + }, + { + "epoch": 1.3574468085106384, + "grad_norm": 3.66109299659729, + "learning_rate": 8.71768516810061e-06, + "loss": 0.7255, + "step": 5104 + }, + { + "epoch": 1.3577127659574468, + "grad_norm": 3.6998486518859863, + "learning_rate": 8.717096990056999e-06, + "loss": 0.8202, + "step": 5105 + }, + { + "epoch": 1.3579787234042553, + "grad_norm": 4.291678428649902, + "learning_rate": 8.716508697002027e-06, + "loss": 0.9424, + "step": 5106 + }, + { + "epoch": 1.3582446808510638, + "grad_norm": 3.870074987411499, + "learning_rate": 8.715920288953901e-06, + "loss": 0.8821, + "step": 5107 + }, + { + "epoch": 1.3585106382978722, + "grad_norm": 3.469759702682495, + "learning_rate": 8.715331765930828e-06, + "loss": 0.745, + "step": 5108 + }, + { + "epoch": 1.358776595744681, + "grad_norm": 4.048684597015381, + "learning_rate": 8.714743127951014e-06, + "loss": 0.9526, + "step": 5109 + }, + { + "epoch": 1.3590425531914894, + "grad_norm": 4.060766696929932, + "learning_rate": 8.714154375032675e-06, + "loss": 0.7971, + "step": 5110 + }, + { + "epoch": 1.3593085106382978, + "grad_norm": 4.004628658294678, + "learning_rate": 8.713565507194027e-06, + "loss": 0.8302, + "step": 5111 + }, + { + "epoch": 1.3595744680851065, + "grad_norm": 4.034252166748047, + "learning_rate": 8.712976524453289e-06, + "loss": 0.8873, + "step": 5112 + }, + { + "epoch": 1.359840425531915, + "grad_norm": 3.9113869667053223, + "learning_rate": 8.712387426828685e-06, + "loss": 0.7514, + "step": 5113 + }, + { + "epoch": 1.3601063829787234, + "grad_norm": 3.977827787399292, + "learning_rate": 8.711798214338445e-06, + "loss": 0.8099, + "step": 5114 + }, + { + "epoch": 1.360372340425532, + "grad_norm": 4.005003929138184, + "learning_rate": 8.711208887000797e-06, + "loss": 0.8888, + "step": 5115 + }, + { + "epoch": 1.3606382978723404, + "grad_norm": 3.7809715270996094, + 
"learning_rate": 8.710619444833977e-06, + "loss": 0.8131, + "step": 5116 + }, + { + "epoch": 1.360904255319149, + "grad_norm": 3.8309693336486816, + "learning_rate": 8.710029887856224e-06, + "loss": 0.6836, + "step": 5117 + }, + { + "epoch": 1.3611702127659575, + "grad_norm": 3.7106757164001465, + "learning_rate": 8.709440216085777e-06, + "loss": 0.8079, + "step": 5118 + }, + { + "epoch": 1.361436170212766, + "grad_norm": 4.386137962341309, + "learning_rate": 8.708850429540882e-06, + "loss": 0.8484, + "step": 5119 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 4.305933952331543, + "learning_rate": 8.708260528239788e-06, + "loss": 0.9357, + "step": 5120 + }, + { + "epoch": 1.361968085106383, + "grad_norm": 4.107351303100586, + "learning_rate": 8.70767051220075e-06, + "loss": 0.8932, + "step": 5121 + }, + { + "epoch": 1.3622340425531916, + "grad_norm": 3.7665624618530273, + "learning_rate": 8.707080381442016e-06, + "loss": 0.7792, + "step": 5122 + }, + { + "epoch": 1.3625, + "grad_norm": 4.177657604217529, + "learning_rate": 8.706490135981856e-06, + "loss": 0.8046, + "step": 5123 + }, + { + "epoch": 1.3627659574468085, + "grad_norm": 4.132664203643799, + "learning_rate": 8.705899775838525e-06, + "loss": 0.8516, + "step": 5124 + }, + { + "epoch": 1.363031914893617, + "grad_norm": 4.0525288581848145, + "learning_rate": 8.70530930103029e-06, + "loss": 0.8747, + "step": 5125 + }, + { + "epoch": 1.3632978723404254, + "grad_norm": 4.088098526000977, + "learning_rate": 8.704718711575424e-06, + "loss": 0.6531, + "step": 5126 + }, + { + "epoch": 1.3635638297872341, + "grad_norm": 3.944594144821167, + "learning_rate": 8.704128007492201e-06, + "loss": 0.8084, + "step": 5127 + }, + { + "epoch": 1.3638297872340426, + "grad_norm": 4.340763092041016, + "learning_rate": 8.703537188798894e-06, + "loss": 0.8186, + "step": 5128 + }, + { + "epoch": 1.364095744680851, + "grad_norm": 3.9249961376190186, + "learning_rate": 8.702946255513787e-06, + "loss": 0.8166, + "step": 5129 + }, + { + "epoch": 1.3643617021276595, + "grad_norm": 3.667654275894165, + "learning_rate": 8.702355207655164e-06, + "loss": 0.8432, + "step": 5130 + }, + { + "epoch": 1.364627659574468, + "grad_norm": 3.6376404762268066, + "learning_rate": 8.70176404524131e-06, + "loss": 0.7878, + "step": 5131 + }, + { + "epoch": 1.3648936170212767, + "grad_norm": 3.9054555892944336, + "learning_rate": 8.70117276829052e-06, + "loss": 0.7763, + "step": 5132 + }, + { + "epoch": 1.3651595744680851, + "grad_norm": 4.0739288330078125, + "learning_rate": 8.700581376821086e-06, + "loss": 0.728, + "step": 5133 + }, + { + "epoch": 1.3654255319148936, + "grad_norm": 3.8359971046447754, + "learning_rate": 8.699989870851308e-06, + "loss": 0.8314, + "step": 5134 + }, + { + "epoch": 1.3656914893617023, + "grad_norm": 3.708594799041748, + "learning_rate": 8.699398250399486e-06, + "loss": 0.7632, + "step": 5135 + }, + { + "epoch": 1.3659574468085105, + "grad_norm": 3.9665486812591553, + "learning_rate": 8.698806515483928e-06, + "loss": 0.8794, + "step": 5136 + }, + { + "epoch": 1.3662234042553192, + "grad_norm": 4.699567794799805, + "learning_rate": 8.698214666122941e-06, + "loss": 1.0106, + "step": 5137 + }, + { + "epoch": 1.3664893617021276, + "grad_norm": 3.8563220500946045, + "learning_rate": 8.697622702334839e-06, + "loss": 0.7451, + "step": 5138 + }, + { + "epoch": 1.366755319148936, + "grad_norm": 4.188748359680176, + "learning_rate": 8.697030624137937e-06, + "loss": 0.7481, + "step": 5139 + }, + { + "epoch": 1.3670212765957448, + "grad_norm": 
3.891820192337036, + "learning_rate": 8.696438431550553e-06, + "loss": 0.8304, + "step": 5140 + }, + { + "epoch": 1.3672872340425533, + "grad_norm": 4.065185546875, + "learning_rate": 8.695846124591015e-06, + "loss": 0.8912, + "step": 5141 + }, + { + "epoch": 1.3675531914893617, + "grad_norm": 3.466252326965332, + "learning_rate": 8.695253703277644e-06, + "loss": 0.7941, + "step": 5142 + }, + { + "epoch": 1.3678191489361702, + "grad_norm": 3.7102415561676025, + "learning_rate": 8.694661167628772e-06, + "loss": 0.6821, + "step": 5143 + }, + { + "epoch": 1.3680851063829786, + "grad_norm": 4.1319260597229, + "learning_rate": 8.694068517662735e-06, + "loss": 0.9666, + "step": 5144 + }, + { + "epoch": 1.3683510638297873, + "grad_norm": 3.870607852935791, + "learning_rate": 8.693475753397869e-06, + "loss": 0.8806, + "step": 5145 + }, + { + "epoch": 1.3686170212765958, + "grad_norm": 3.9953293800354004, + "learning_rate": 8.692882874852515e-06, + "loss": 0.8558, + "step": 5146 + }, + { + "epoch": 1.3688829787234043, + "grad_norm": 4.429169178009033, + "learning_rate": 8.692289882045015e-06, + "loss": 0.7949, + "step": 5147 + }, + { + "epoch": 1.3691489361702127, + "grad_norm": 3.895005464553833, + "learning_rate": 8.691696774993721e-06, + "loss": 0.7547, + "step": 5148 + }, + { + "epoch": 1.3694148936170212, + "grad_norm": 4.446406841278076, + "learning_rate": 8.691103553716981e-06, + "loss": 0.8757, + "step": 5149 + }, + { + "epoch": 1.3696808510638299, + "grad_norm": 4.012157440185547, + "learning_rate": 8.690510218233153e-06, + "loss": 0.9106, + "step": 5150 + }, + { + "epoch": 1.3699468085106383, + "grad_norm": 3.966068983078003, + "learning_rate": 8.689916768560593e-06, + "loss": 0.7194, + "step": 5151 + }, + { + "epoch": 1.3702127659574468, + "grad_norm": 3.9841232299804688, + "learning_rate": 8.689323204717663e-06, + "loss": 0.8174, + "step": 5152 + }, + { + "epoch": 1.3704787234042553, + "grad_norm": 4.248937129974365, + "learning_rate": 8.688729526722732e-06, + "loss": 0.8107, + "step": 5153 + }, + { + "epoch": 1.3707446808510637, + "grad_norm": 3.6485583782196045, + "learning_rate": 8.688135734594165e-06, + "loss": 0.8828, + "step": 5154 + }, + { + "epoch": 1.3710106382978724, + "grad_norm": 4.1670966148376465, + "learning_rate": 8.687541828350334e-06, + "loss": 0.8604, + "step": 5155 + }, + { + "epoch": 1.3712765957446809, + "grad_norm": 4.121282577514648, + "learning_rate": 8.686947808009621e-06, + "loss": 0.8228, + "step": 5156 + }, + { + "epoch": 1.3715425531914893, + "grad_norm": 3.781928539276123, + "learning_rate": 8.6863536735904e-06, + "loss": 0.7416, + "step": 5157 + }, + { + "epoch": 1.371808510638298, + "grad_norm": 3.688425064086914, + "learning_rate": 8.685759425111056e-06, + "loss": 0.7902, + "step": 5158 + }, + { + "epoch": 1.3720744680851062, + "grad_norm": 3.922410488128662, + "learning_rate": 8.685165062589975e-06, + "loss": 0.8117, + "step": 5159 + }, + { + "epoch": 1.372340425531915, + "grad_norm": 4.217987060546875, + "learning_rate": 8.68457058604555e-06, + "loss": 0.9173, + "step": 5160 + }, + { + "epoch": 1.3726063829787234, + "grad_norm": 4.135257244110107, + "learning_rate": 8.683975995496173e-06, + "loss": 0.7474, + "step": 5161 + }, + { + "epoch": 1.3728723404255319, + "grad_norm": 3.7882463932037354, + "learning_rate": 8.68338129096024e-06, + "loss": 0.8153, + "step": 5162 + }, + { + "epoch": 1.3731382978723405, + "grad_norm": 3.6793859004974365, + "learning_rate": 8.682786472456155e-06, + "loss": 0.6914, + "step": 5163 + }, + { + "epoch": 
1.373404255319149, + "grad_norm": 4.030581951141357, + "learning_rate": 8.682191540002318e-06, + "loss": 0.778, + "step": 5164 + }, + { + "epoch": 1.3736702127659575, + "grad_norm": 3.8380470275878906, + "learning_rate": 8.681596493617141e-06, + "loss": 0.7522, + "step": 5165 + }, + { + "epoch": 1.373936170212766, + "grad_norm": 4.138343334197998, + "learning_rate": 8.681001333319035e-06, + "loss": 0.843, + "step": 5166 + }, + { + "epoch": 1.3742021276595744, + "grad_norm": 3.723407030105591, + "learning_rate": 8.680406059126412e-06, + "loss": 0.7799, + "step": 5167 + }, + { + "epoch": 1.374468085106383, + "grad_norm": 3.8985822200775146, + "learning_rate": 8.679810671057695e-06, + "loss": 0.7446, + "step": 5168 + }, + { + "epoch": 1.3747340425531915, + "grad_norm": 4.534223556518555, + "learning_rate": 8.679215169131301e-06, + "loss": 0.8734, + "step": 5169 + }, + { + "epoch": 1.375, + "grad_norm": 3.75278639793396, + "learning_rate": 8.67861955336566e-06, + "loss": 0.8435, + "step": 5170 + }, + { + "epoch": 1.3752659574468085, + "grad_norm": 4.094736099243164, + "learning_rate": 8.678023823779196e-06, + "loss": 0.7671, + "step": 5171 + }, + { + "epoch": 1.375531914893617, + "grad_norm": 3.920642137527466, + "learning_rate": 8.677427980390348e-06, + "loss": 0.7937, + "step": 5172 + }, + { + "epoch": 1.3757978723404256, + "grad_norm": 3.5799460411071777, + "learning_rate": 8.676832023217545e-06, + "loss": 0.8206, + "step": 5173 + }, + { + "epoch": 1.376063829787234, + "grad_norm": 3.8929152488708496, + "learning_rate": 8.676235952279233e-06, + "loss": 0.837, + "step": 5174 + }, + { + "epoch": 1.3763297872340425, + "grad_norm": 3.7762844562530518, + "learning_rate": 8.675639767593851e-06, + "loss": 0.8191, + "step": 5175 + }, + { + "epoch": 1.376595744680851, + "grad_norm": 4.34854793548584, + "learning_rate": 8.675043469179849e-06, + "loss": 0.9724, + "step": 5176 + }, + { + "epoch": 1.3768617021276595, + "grad_norm": 4.143275260925293, + "learning_rate": 8.674447057055673e-06, + "loss": 0.7607, + "step": 5177 + }, + { + "epoch": 1.3771276595744681, + "grad_norm": 3.8602356910705566, + "learning_rate": 8.673850531239781e-06, + "loss": 0.8241, + "step": 5178 + }, + { + "epoch": 1.3773936170212766, + "grad_norm": 4.238362789154053, + "learning_rate": 8.673253891750626e-06, + "loss": 0.75, + "step": 5179 + }, + { + "epoch": 1.377659574468085, + "grad_norm": 4.423724174499512, + "learning_rate": 8.672657138606672e-06, + "loss": 0.8929, + "step": 5180 + }, + { + "epoch": 1.3779255319148938, + "grad_norm": 3.5237340927124023, + "learning_rate": 8.672060271826381e-06, + "loss": 0.6877, + "step": 5181 + }, + { + "epoch": 1.378191489361702, + "grad_norm": 3.615936756134033, + "learning_rate": 8.671463291428223e-06, + "loss": 0.7091, + "step": 5182 + }, + { + "epoch": 1.3784574468085107, + "grad_norm": 3.587336778640747, + "learning_rate": 8.67086619743067e-06, + "loss": 0.8266, + "step": 5183 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 4.141132831573486, + "learning_rate": 8.670268989852192e-06, + "loss": 0.7199, + "step": 5184 + }, + { + "epoch": 1.3789893617021276, + "grad_norm": 4.076261520385742, + "learning_rate": 8.669671668711272e-06, + "loss": 0.7788, + "step": 5185 + }, + { + "epoch": 1.3792553191489363, + "grad_norm": 4.020741939544678, + "learning_rate": 8.66907423402639e-06, + "loss": 0.8652, + "step": 5186 + }, + { + "epoch": 1.3795212765957447, + "grad_norm": 3.8059983253479004, + "learning_rate": 8.668476685816029e-06, + "loss": 0.8151, + "step": 5187 + }, + { + "epoch": 
1.3797872340425532, + "grad_norm": 4.055500030517578, + "learning_rate": 8.667879024098682e-06, + "loss": 0.7985, + "step": 5188 + }, + { + "epoch": 1.3800531914893617, + "grad_norm": 3.8605387210845947, + "learning_rate": 8.66728124889284e-06, + "loss": 0.8602, + "step": 5189 + }, + { + "epoch": 1.3803191489361701, + "grad_norm": 3.781041383743286, + "learning_rate": 8.666683360216998e-06, + "loss": 0.815, + "step": 5190 + }, + { + "epoch": 1.3805851063829788, + "grad_norm": 4.160099029541016, + "learning_rate": 8.666085358089655e-06, + "loss": 0.8366, + "step": 5191 + }, + { + "epoch": 1.3808510638297873, + "grad_norm": 4.079177379608154, + "learning_rate": 8.665487242529316e-06, + "loss": 0.9131, + "step": 5192 + }, + { + "epoch": 1.3811170212765957, + "grad_norm": 4.033502578735352, + "learning_rate": 8.664889013554484e-06, + "loss": 0.7588, + "step": 5193 + }, + { + "epoch": 1.3813829787234042, + "grad_norm": 3.969634771347046, + "learning_rate": 8.664290671183675e-06, + "loss": 0.9422, + "step": 5194 + }, + { + "epoch": 1.3816489361702127, + "grad_norm": 3.9259159564971924, + "learning_rate": 8.663692215435396e-06, + "loss": 0.7046, + "step": 5195 + }, + { + "epoch": 1.3819148936170214, + "grad_norm": 4.086988925933838, + "learning_rate": 8.663093646328166e-06, + "loss": 0.8629, + "step": 5196 + }, + { + "epoch": 1.3821808510638298, + "grad_norm": 4.083224773406982, + "learning_rate": 8.662494963880508e-06, + "loss": 0.8992, + "step": 5197 + }, + { + "epoch": 1.3824468085106383, + "grad_norm": 4.1260881423950195, + "learning_rate": 8.66189616811094e-06, + "loss": 0.8958, + "step": 5198 + }, + { + "epoch": 1.3827127659574467, + "grad_norm": 3.9255919456481934, + "learning_rate": 8.661297259037998e-06, + "loss": 0.8155, + "step": 5199 + }, + { + "epoch": 1.3829787234042552, + "grad_norm": 4.030576705932617, + "learning_rate": 8.660698236680205e-06, + "loss": 0.901, + "step": 5200 + }, + { + "epoch": 1.383244680851064, + "grad_norm": 4.204456329345703, + "learning_rate": 8.660099101056098e-06, + "loss": 0.8021, + "step": 5201 + }, + { + "epoch": 1.3835106382978724, + "grad_norm": 3.743723154067993, + "learning_rate": 8.659499852184218e-06, + "loss": 0.8411, + "step": 5202 + }, + { + "epoch": 1.3837765957446808, + "grad_norm": 3.8044793605804443, + "learning_rate": 8.658900490083102e-06, + "loss": 0.6985, + "step": 5203 + }, + { + "epoch": 1.3840425531914895, + "grad_norm": 3.762624740600586, + "learning_rate": 8.658301014771298e-06, + "loss": 0.7873, + "step": 5204 + }, + { + "epoch": 1.3843085106382977, + "grad_norm": 3.8245599269866943, + "learning_rate": 8.657701426267355e-06, + "loss": 0.7773, + "step": 5205 + }, + { + "epoch": 1.3845744680851064, + "grad_norm": 3.875678062438965, + "learning_rate": 8.65710172458982e-06, + "loss": 0.9493, + "step": 5206 + }, + { + "epoch": 1.3848404255319149, + "grad_norm": 4.034217834472656, + "learning_rate": 8.656501909757255e-06, + "loss": 0.8742, + "step": 5207 + }, + { + "epoch": 1.3851063829787233, + "grad_norm": 3.7253971099853516, + "learning_rate": 8.655901981788216e-06, + "loss": 0.7408, + "step": 5208 + }, + { + "epoch": 1.385372340425532, + "grad_norm": 4.211146354675293, + "learning_rate": 8.655301940701262e-06, + "loss": 0.8107, + "step": 5209 + }, + { + "epoch": 1.3856382978723405, + "grad_norm": 4.0121378898620605, + "learning_rate": 8.654701786514965e-06, + "loss": 0.8808, + "step": 5210 + }, + { + "epoch": 1.385904255319149, + "grad_norm": 4.111256122589111, + "learning_rate": 8.654101519247892e-06, + "loss": 0.8339, + "step": 
5211 + }, + { + "epoch": 1.3861702127659574, + "grad_norm": 3.683849811553955, + "learning_rate": 8.653501138918615e-06, + "loss": 0.8046, + "step": 5212 + }, + { + "epoch": 1.3864361702127659, + "grad_norm": 4.3086957931518555, + "learning_rate": 8.652900645545711e-06, + "loss": 0.8217, + "step": 5213 + }, + { + "epoch": 1.3867021276595746, + "grad_norm": 4.064043998718262, + "learning_rate": 8.65230003914776e-06, + "loss": 0.9811, + "step": 5214 + }, + { + "epoch": 1.386968085106383, + "grad_norm": 3.8175463676452637, + "learning_rate": 8.651699319743348e-06, + "loss": 0.879, + "step": 5215 + }, + { + "epoch": 1.3872340425531915, + "grad_norm": 4.500128269195557, + "learning_rate": 8.651098487351057e-06, + "loss": 0.6979, + "step": 5216 + }, + { + "epoch": 1.3875, + "grad_norm": 4.019436836242676, + "learning_rate": 8.650497541989483e-06, + "loss": 0.8766, + "step": 5217 + }, + { + "epoch": 1.3877659574468084, + "grad_norm": 3.5277206897735596, + "learning_rate": 8.649896483677213e-06, + "loss": 0.8292, + "step": 5218 + }, + { + "epoch": 1.388031914893617, + "grad_norm": 3.918307065963745, + "learning_rate": 8.649295312432853e-06, + "loss": 0.7684, + "step": 5219 + }, + { + "epoch": 1.3882978723404256, + "grad_norm": 3.9739909172058105, + "learning_rate": 8.648694028274998e-06, + "loss": 0.743, + "step": 5220 + }, + { + "epoch": 1.388563829787234, + "grad_norm": 3.6508398056030273, + "learning_rate": 8.648092631222253e-06, + "loss": 0.7689, + "step": 5221 + }, + { + "epoch": 1.3888297872340425, + "grad_norm": 3.846869468688965, + "learning_rate": 8.647491121293228e-06, + "loss": 0.741, + "step": 5222 + }, + { + "epoch": 1.389095744680851, + "grad_norm": 3.8481643199920654, + "learning_rate": 8.646889498506532e-06, + "loss": 0.8665, + "step": 5223 + }, + { + "epoch": 1.3893617021276596, + "grad_norm": 4.380584239959717, + "learning_rate": 8.646287762880783e-06, + "loss": 0.8029, + "step": 5224 + }, + { + "epoch": 1.389627659574468, + "grad_norm": 3.8931496143341064, + "learning_rate": 8.645685914434596e-06, + "loss": 0.8964, + "step": 5225 + }, + { + "epoch": 1.3898936170212766, + "grad_norm": 3.976508378982544, + "learning_rate": 8.645083953186596e-06, + "loss": 0.8707, + "step": 5226 + }, + { + "epoch": 1.390159574468085, + "grad_norm": 3.606631278991699, + "learning_rate": 8.644481879155406e-06, + "loss": 0.7476, + "step": 5227 + }, + { + "epoch": 1.3904255319148935, + "grad_norm": 4.043211936950684, + "learning_rate": 8.643879692359655e-06, + "loss": 0.7478, + "step": 5228 + }, + { + "epoch": 1.3906914893617022, + "grad_norm": 3.9135618209838867, + "learning_rate": 8.643277392817976e-06, + "loss": 0.7469, + "step": 5229 + }, + { + "epoch": 1.3909574468085106, + "grad_norm": 3.747793674468994, + "learning_rate": 8.642674980549008e-06, + "loss": 0.8092, + "step": 5230 + }, + { + "epoch": 1.391223404255319, + "grad_norm": 4.33275032043457, + "learning_rate": 8.642072455571383e-06, + "loss": 0.7867, + "step": 5231 + }, + { + "epoch": 1.3914893617021278, + "grad_norm": 4.364730358123779, + "learning_rate": 8.641469817903752e-06, + "loss": 0.8545, + "step": 5232 + }, + { + "epoch": 1.3917553191489362, + "grad_norm": 3.848296880722046, + "learning_rate": 8.640867067564757e-06, + "loss": 0.8735, + "step": 5233 + }, + { + "epoch": 1.3920212765957447, + "grad_norm": 3.8391952514648438, + "learning_rate": 8.640264204573049e-06, + "loss": 0.8439, + "step": 5234 + }, + { + "epoch": 1.3922872340425532, + "grad_norm": 4.061415672302246, + "learning_rate": 8.639661228947278e-06, + "loss": 0.7702, + 
"step": 5235 + }, + { + "epoch": 1.3925531914893616, + "grad_norm": 4.175765037536621, + "learning_rate": 8.639058140706105e-06, + "loss": 0.8053, + "step": 5236 + }, + { + "epoch": 1.3928191489361703, + "grad_norm": 3.840773105621338, + "learning_rate": 8.638454939868188e-06, + "loss": 0.7192, + "step": 5237 + }, + { + "epoch": 1.3930851063829788, + "grad_norm": 3.76470947265625, + "learning_rate": 8.637851626452191e-06, + "loss": 0.7634, + "step": 5238 + }, + { + "epoch": 1.3933510638297872, + "grad_norm": 3.903261184692383, + "learning_rate": 8.637248200476783e-06, + "loss": 0.7672, + "step": 5239 + }, + { + "epoch": 1.3936170212765957, + "grad_norm": 4.356569290161133, + "learning_rate": 8.636644661960634e-06, + "loss": 0.8834, + "step": 5240 + }, + { + "epoch": 1.3938829787234042, + "grad_norm": 4.116570949554443, + "learning_rate": 8.636041010922416e-06, + "loss": 0.7715, + "step": 5241 + }, + { + "epoch": 1.3941489361702128, + "grad_norm": 3.9501302242279053, + "learning_rate": 8.635437247380809e-06, + "loss": 0.7663, + "step": 5242 + }, + { + "epoch": 1.3944148936170213, + "grad_norm": 4.226482391357422, + "learning_rate": 8.634833371354492e-06, + "loss": 0.8156, + "step": 5243 + }, + { + "epoch": 1.3946808510638298, + "grad_norm": 4.047403335571289, + "learning_rate": 8.634229382862152e-06, + "loss": 0.8982, + "step": 5244 + }, + { + "epoch": 1.3949468085106382, + "grad_norm": 4.245815753936768, + "learning_rate": 8.633625281922477e-06, + "loss": 0.8558, + "step": 5245 + }, + { + "epoch": 1.3952127659574467, + "grad_norm": 3.9995036125183105, + "learning_rate": 8.633021068554155e-06, + "loss": 0.8246, + "step": 5246 + }, + { + "epoch": 1.3954787234042554, + "grad_norm": 4.213914394378662, + "learning_rate": 8.632416742775886e-06, + "loss": 0.7979, + "step": 5247 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 4.043915748596191, + "learning_rate": 8.631812304606367e-06, + "loss": 0.8903, + "step": 5248 + }, + { + "epoch": 1.3960106382978723, + "grad_norm": 3.995999336242676, + "learning_rate": 8.631207754064299e-06, + "loss": 0.7445, + "step": 5249 + }, + { + "epoch": 1.3962765957446808, + "grad_norm": 3.6424171924591064, + "learning_rate": 8.630603091168385e-06, + "loss": 0.6922, + "step": 5250 + }, + { + "epoch": 1.3965425531914892, + "grad_norm": 4.3226118087768555, + "learning_rate": 8.62999831593734e-06, + "loss": 0.8686, + "step": 5251 + }, + { + "epoch": 1.396808510638298, + "grad_norm": 3.89966082572937, + "learning_rate": 8.629393428389873e-06, + "loss": 0.7592, + "step": 5252 + }, + { + "epoch": 1.3970744680851064, + "grad_norm": 4.409592151641846, + "learning_rate": 8.628788428544698e-06, + "loss": 0.952, + "step": 5253 + }, + { + "epoch": 1.3973404255319148, + "grad_norm": 3.884060859680176, + "learning_rate": 8.62818331642054e-06, + "loss": 0.83, + "step": 5254 + }, + { + "epoch": 1.3976063829787235, + "grad_norm": 3.480745792388916, + "learning_rate": 8.627578092036117e-06, + "loss": 0.7324, + "step": 5255 + }, + { + "epoch": 1.397872340425532, + "grad_norm": 3.862119436264038, + "learning_rate": 8.626972755410156e-06, + "loss": 0.7555, + "step": 5256 + }, + { + "epoch": 1.3981382978723405, + "grad_norm": 4.149264335632324, + "learning_rate": 8.626367306561387e-06, + "loss": 0.7649, + "step": 5257 + }, + { + "epoch": 1.398404255319149, + "grad_norm": 3.6122639179229736, + "learning_rate": 8.625761745508547e-06, + "loss": 0.7959, + "step": 5258 + }, + { + "epoch": 1.3986702127659574, + "grad_norm": 3.611455202102661, + "learning_rate": 8.625156072270367e-06, + 
"loss": 0.8546, + "step": 5259 + }, + { + "epoch": 1.398936170212766, + "grad_norm": 4.0274858474731445, + "learning_rate": 8.624550286865592e-06, + "loss": 0.818, + "step": 5260 + }, + { + "epoch": 1.3992021276595745, + "grad_norm": 4.080778121948242, + "learning_rate": 8.623944389312962e-06, + "loss": 0.8599, + "step": 5261 + }, + { + "epoch": 1.399468085106383, + "grad_norm": 4.097471237182617, + "learning_rate": 8.623338379631227e-06, + "loss": 0.8178, + "step": 5262 + }, + { + "epoch": 1.3997340425531914, + "grad_norm": 3.6200075149536133, + "learning_rate": 8.622732257839137e-06, + "loss": 0.8381, + "step": 5263 + }, + { + "epoch": 1.4, + "grad_norm": 4.054747581481934, + "learning_rate": 8.622126023955446e-06, + "loss": 0.9865, + "step": 5264 + }, + { + "epoch": 1.4002659574468086, + "grad_norm": 4.653242111206055, + "learning_rate": 8.62151967799891e-06, + "loss": 0.8813, + "step": 5265 + }, + { + "epoch": 1.400531914893617, + "grad_norm": 4.182617664337158, + "learning_rate": 8.620913219988291e-06, + "loss": 0.7061, + "step": 5266 + }, + { + "epoch": 1.4007978723404255, + "grad_norm": 3.594130277633667, + "learning_rate": 8.620306649942356e-06, + "loss": 0.7468, + "step": 5267 + }, + { + "epoch": 1.401063829787234, + "grad_norm": 4.210184574127197, + "learning_rate": 8.619699967879868e-06, + "loss": 0.9574, + "step": 5268 + }, + { + "epoch": 1.4013297872340424, + "grad_norm": 4.212064743041992, + "learning_rate": 8.619093173819603e-06, + "loss": 0.8027, + "step": 5269 + }, + { + "epoch": 1.4015957446808511, + "grad_norm": 4.000636100769043, + "learning_rate": 8.618486267780334e-06, + "loss": 0.8482, + "step": 5270 + }, + { + "epoch": 1.4018617021276596, + "grad_norm": 4.396604537963867, + "learning_rate": 8.617879249780841e-06, + "loss": 0.8989, + "step": 5271 + }, + { + "epoch": 1.402127659574468, + "grad_norm": 3.6377105712890625, + "learning_rate": 8.617272119839903e-06, + "loss": 0.7686, + "step": 5272 + }, + { + "epoch": 1.4023936170212765, + "grad_norm": 3.8942556381225586, + "learning_rate": 8.616664877976308e-06, + "loss": 0.8185, + "step": 5273 + }, + { + "epoch": 1.402659574468085, + "grad_norm": 3.9607818126678467, + "learning_rate": 8.616057524208843e-06, + "loss": 0.6682, + "step": 5274 + }, + { + "epoch": 1.4029255319148937, + "grad_norm": 4.523376941680908, + "learning_rate": 8.615450058556301e-06, + "loss": 0.8093, + "step": 5275 + }, + { + "epoch": 1.4031914893617021, + "grad_norm": 4.111645221710205, + "learning_rate": 8.614842481037476e-06, + "loss": 0.8694, + "step": 5276 + }, + { + "epoch": 1.4034574468085106, + "grad_norm": 3.7978808879852295, + "learning_rate": 8.61423479167117e-06, + "loss": 0.7477, + "step": 5277 + }, + { + "epoch": 1.4037234042553193, + "grad_norm": 3.669728994369507, + "learning_rate": 8.613626990476186e-06, + "loss": 0.7951, + "step": 5278 + }, + { + "epoch": 1.4039893617021277, + "grad_norm": 4.3240251541137695, + "learning_rate": 8.613019077471325e-06, + "loss": 0.8721, + "step": 5279 + }, + { + "epoch": 1.4042553191489362, + "grad_norm": 3.702890157699585, + "learning_rate": 8.6124110526754e-06, + "loss": 0.6856, + "step": 5280 + }, + { + "epoch": 1.4045212765957447, + "grad_norm": 4.085876941680908, + "learning_rate": 8.611802916107225e-06, + "loss": 0.7458, + "step": 5281 + }, + { + "epoch": 1.4047872340425531, + "grad_norm": 4.095217704772949, + "learning_rate": 8.611194667785615e-06, + "loss": 0.821, + "step": 5282 + }, + { + "epoch": 1.4050531914893618, + "grad_norm": 3.8958888053894043, + "learning_rate": 8.610586307729393e-06, 
+ "loss": 0.7271, + "step": 5283 + }, + { + "epoch": 1.4053191489361703, + "grad_norm": 3.696851968765259, + "learning_rate": 8.609977835957378e-06, + "loss": 0.7236, + "step": 5284 + }, + { + "epoch": 1.4055851063829787, + "grad_norm": 4.185340404510498, + "learning_rate": 8.609369252488398e-06, + "loss": 0.9089, + "step": 5285 + }, + { + "epoch": 1.4058510638297872, + "grad_norm": 4.072790622711182, + "learning_rate": 8.608760557341284e-06, + "loss": 0.761, + "step": 5286 + }, + { + "epoch": 1.4061170212765957, + "grad_norm": 3.8811473846435547, + "learning_rate": 8.60815175053487e-06, + "loss": 0.8021, + "step": 5287 + }, + { + "epoch": 1.4063829787234043, + "grad_norm": 4.050495624542236, + "learning_rate": 8.607542832087993e-06, + "loss": 0.7736, + "step": 5288 + }, + { + "epoch": 1.4066489361702128, + "grad_norm": 3.903702735900879, + "learning_rate": 8.606933802019493e-06, + "loss": 0.8525, + "step": 5289 + }, + { + "epoch": 1.4069148936170213, + "grad_norm": 3.618151903152466, + "learning_rate": 8.606324660348214e-06, + "loss": 0.7992, + "step": 5290 + }, + { + "epoch": 1.4071808510638297, + "grad_norm": 3.910585641860962, + "learning_rate": 8.605715407093005e-06, + "loss": 0.8235, + "step": 5291 + }, + { + "epoch": 1.4074468085106382, + "grad_norm": 4.317497253417969, + "learning_rate": 8.605106042272715e-06, + "loss": 0.8737, + "step": 5292 + }, + { + "epoch": 1.4077127659574469, + "grad_norm": 4.357272624969482, + "learning_rate": 8.6044965659062e-06, + "loss": 0.787, + "step": 5293 + }, + { + "epoch": 1.4079787234042553, + "grad_norm": 4.051640033721924, + "learning_rate": 8.603886978012317e-06, + "loss": 0.8513, + "step": 5294 + }, + { + "epoch": 1.4082446808510638, + "grad_norm": 4.226726055145264, + "learning_rate": 8.60327727860993e-06, + "loss": 0.717, + "step": 5295 + }, + { + "epoch": 1.4085106382978723, + "grad_norm": 3.7265825271606445, + "learning_rate": 8.6026674677179e-06, + "loss": 0.7177, + "step": 5296 + }, + { + "epoch": 1.4087765957446807, + "grad_norm": 3.866156816482544, + "learning_rate": 8.602057545355096e-06, + "loss": 0.78, + "step": 5297 + }, + { + "epoch": 1.4090425531914894, + "grad_norm": 3.843125820159912, + "learning_rate": 8.601447511540392e-06, + "loss": 0.8847, + "step": 5298 + }, + { + "epoch": 1.4093085106382979, + "grad_norm": 3.813894033432007, + "learning_rate": 8.600837366292663e-06, + "loss": 0.7, + "step": 5299 + }, + { + "epoch": 1.4095744680851063, + "grad_norm": 4.289909362792969, + "learning_rate": 8.600227109630785e-06, + "loss": 0.7832, + "step": 5300 + }, + { + "epoch": 1.409840425531915, + "grad_norm": 4.330870151519775, + "learning_rate": 8.599616741573642e-06, + "loss": 0.9482, + "step": 5301 + }, + { + "epoch": 1.4101063829787235, + "grad_norm": 3.625694990158081, + "learning_rate": 8.599006262140117e-06, + "loss": 0.6515, + "step": 5302 + }, + { + "epoch": 1.410372340425532, + "grad_norm": 4.081284999847412, + "learning_rate": 8.598395671349104e-06, + "loss": 0.9656, + "step": 5303 + }, + { + "epoch": 1.4106382978723404, + "grad_norm": 4.240716457366943, + "learning_rate": 8.59778496921949e-06, + "loss": 0.8328, + "step": 5304 + }, + { + "epoch": 1.4109042553191489, + "grad_norm": 3.9750494956970215, + "learning_rate": 8.597174155770174e-06, + "loss": 0.7686, + "step": 5305 + }, + { + "epoch": 1.4111702127659576, + "grad_norm": 3.6305007934570312, + "learning_rate": 8.596563231020054e-06, + "loss": 0.7059, + "step": 5306 + }, + { + "epoch": 1.411436170212766, + "grad_norm": 3.9132840633392334, + "learning_rate": 
8.595952194988034e-06, + "loss": 0.8509, + "step": 5307 + }, + { + "epoch": 1.4117021276595745, + "grad_norm": 4.162221431732178, + "learning_rate": 8.59534104769302e-06, + "loss": 0.82, + "step": 5308 + }, + { + "epoch": 1.411968085106383, + "grad_norm": 4.090907096862793, + "learning_rate": 8.594729789153919e-06, + "loss": 0.9025, + "step": 5309 + }, + { + "epoch": 1.4122340425531914, + "grad_norm": 4.178388595581055, + "learning_rate": 8.594118419389648e-06, + "loss": 0.8537, + "step": 5310 + }, + { + "epoch": 1.4125, + "grad_norm": 3.5532939434051514, + "learning_rate": 8.59350693841912e-06, + "loss": 0.684, + "step": 5311 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 3.9625163078308105, + "learning_rate": 8.592895346261258e-06, + "loss": 0.7501, + "step": 5312 + }, + { + "epoch": 1.413031914893617, + "grad_norm": 3.4592795372009277, + "learning_rate": 8.592283642934983e-06, + "loss": 0.8845, + "step": 5313 + }, + { + "epoch": 1.4132978723404255, + "grad_norm": 4.265946865081787, + "learning_rate": 8.591671828459222e-06, + "loss": 0.8354, + "step": 5314 + }, + { + "epoch": 1.413563829787234, + "grad_norm": 4.301452159881592, + "learning_rate": 8.591059902852907e-06, + "loss": 0.9654, + "step": 5315 + }, + { + "epoch": 1.4138297872340426, + "grad_norm": 3.953643560409546, + "learning_rate": 8.59044786613497e-06, + "loss": 0.8592, + "step": 5316 + }, + { + "epoch": 1.414095744680851, + "grad_norm": 3.8107998371124268, + "learning_rate": 8.589835718324349e-06, + "loss": 0.7486, + "step": 5317 + }, + { + "epoch": 1.4143617021276595, + "grad_norm": 4.148920059204102, + "learning_rate": 8.589223459439987e-06, + "loss": 0.8111, + "step": 5318 + }, + { + "epoch": 1.414627659574468, + "grad_norm": 3.7461628913879395, + "learning_rate": 8.588611089500821e-06, + "loss": 0.7551, + "step": 5319 + }, + { + "epoch": 1.4148936170212765, + "grad_norm": 4.387768268585205, + "learning_rate": 8.587998608525806e-06, + "loss": 0.933, + "step": 5320 + }, + { + "epoch": 1.4151595744680852, + "grad_norm": 3.419297933578491, + "learning_rate": 8.587386016533887e-06, + "loss": 0.7643, + "step": 5321 + }, + { + "epoch": 1.4154255319148936, + "grad_norm": 3.7075390815734863, + "learning_rate": 8.586773313544023e-06, + "loss": 0.7818, + "step": 5322 + }, + { + "epoch": 1.415691489361702, + "grad_norm": 4.141719341278076, + "learning_rate": 8.586160499575168e-06, + "loss": 0.912, + "step": 5323 + }, + { + "epoch": 1.4159574468085108, + "grad_norm": 4.2602386474609375, + "learning_rate": 8.585547574646287e-06, + "loss": 0.834, + "step": 5324 + }, + { + "epoch": 1.4162234042553192, + "grad_norm": 4.043152332305908, + "learning_rate": 8.584934538776342e-06, + "loss": 0.6793, + "step": 5325 + }, + { + "epoch": 1.4164893617021277, + "grad_norm": 4.062325954437256, + "learning_rate": 8.584321391984301e-06, + "loss": 0.8172, + "step": 5326 + }, + { + "epoch": 1.4167553191489362, + "grad_norm": 3.731950044631958, + "learning_rate": 8.583708134289138e-06, + "loss": 0.6754, + "step": 5327 + }, + { + "epoch": 1.4170212765957446, + "grad_norm": 4.3393940925598145, + "learning_rate": 8.583094765709823e-06, + "loss": 0.8304, + "step": 5328 + }, + { + "epoch": 1.4172872340425533, + "grad_norm": 4.178645610809326, + "learning_rate": 8.582481286265341e-06, + "loss": 0.9168, + "step": 5329 + }, + { + "epoch": 1.4175531914893618, + "grad_norm": 3.5687899589538574, + "learning_rate": 8.581867695974667e-06, + "loss": 0.6632, + "step": 5330 + }, + { + "epoch": 1.4178191489361702, + "grad_norm": 3.7236688137054443, + 
"learning_rate": 8.58125399485679e-06, + "loss": 0.6788, + "step": 5331 + }, + { + "epoch": 1.4180851063829787, + "grad_norm": 3.8592636585235596, + "learning_rate": 8.5806401829307e-06, + "loss": 0.8632, + "step": 5332 + }, + { + "epoch": 1.4183510638297872, + "grad_norm": 3.7756807804107666, + "learning_rate": 8.580026260215384e-06, + "loss": 0.6994, + "step": 5333 + }, + { + "epoch": 1.4186170212765958, + "grad_norm": 3.481576919555664, + "learning_rate": 8.579412226729843e-06, + "loss": 0.8748, + "step": 5334 + }, + { + "epoch": 1.4188829787234043, + "grad_norm": 3.908369779586792, + "learning_rate": 8.578798082493074e-06, + "loss": 0.7567, + "step": 5335 + }, + { + "epoch": 1.4191489361702128, + "grad_norm": 4.084057807922363, + "learning_rate": 8.578183827524076e-06, + "loss": 0.9174, + "step": 5336 + }, + { + "epoch": 1.4194148936170212, + "grad_norm": 4.469969749450684, + "learning_rate": 8.57756946184186e-06, + "loss": 0.9547, + "step": 5337 + }, + { + "epoch": 1.4196808510638297, + "grad_norm": 3.8578479290008545, + "learning_rate": 8.576954985465431e-06, + "loss": 0.8135, + "step": 5338 + }, + { + "epoch": 1.4199468085106384, + "grad_norm": 3.7595484256744385, + "learning_rate": 8.576340398413804e-06, + "loss": 0.7724, + "step": 5339 + }, + { + "epoch": 1.4202127659574468, + "grad_norm": 4.005858898162842, + "learning_rate": 8.575725700705995e-06, + "loss": 0.8386, + "step": 5340 + }, + { + "epoch": 1.4204787234042553, + "grad_norm": 4.103984355926514, + "learning_rate": 8.575110892361022e-06, + "loss": 0.9413, + "step": 5341 + }, + { + "epoch": 1.4207446808510638, + "grad_norm": 3.5380845069885254, + "learning_rate": 8.57449597339791e-06, + "loss": 0.8393, + "step": 5342 + }, + { + "epoch": 1.4210106382978722, + "grad_norm": 3.589729070663452, + "learning_rate": 8.573880943835684e-06, + "loss": 0.7789, + "step": 5343 + }, + { + "epoch": 1.421276595744681, + "grad_norm": 4.016366004943848, + "learning_rate": 8.573265803693374e-06, + "loss": 0.7377, + "step": 5344 + }, + { + "epoch": 1.4215425531914894, + "grad_norm": 3.708329439163208, + "learning_rate": 8.572650552990012e-06, + "loss": 0.8608, + "step": 5345 + }, + { + "epoch": 1.4218085106382978, + "grad_norm": 4.192487716674805, + "learning_rate": 8.572035191744637e-06, + "loss": 0.7963, + "step": 5346 + }, + { + "epoch": 1.4220744680851065, + "grad_norm": 3.561629056930542, + "learning_rate": 8.571419719976287e-06, + "loss": 0.8004, + "step": 5347 + }, + { + "epoch": 1.422340425531915, + "grad_norm": 3.7709176540374756, + "learning_rate": 8.570804137704005e-06, + "loss": 0.7012, + "step": 5348 + }, + { + "epoch": 1.4226063829787234, + "grad_norm": 3.842339515686035, + "learning_rate": 8.57018844494684e-06, + "loss": 0.8063, + "step": 5349 + }, + { + "epoch": 1.422872340425532, + "grad_norm": 4.014485836029053, + "learning_rate": 8.56957264172384e-06, + "loss": 0.681, + "step": 5350 + }, + { + "epoch": 1.4231382978723404, + "grad_norm": 3.9877431392669678, + "learning_rate": 8.568956728054061e-06, + "loss": 0.9011, + "step": 5351 + }, + { + "epoch": 1.423404255319149, + "grad_norm": 3.9741530418395996, + "learning_rate": 8.568340703956558e-06, + "loss": 0.8245, + "step": 5352 + }, + { + "epoch": 1.4236702127659575, + "grad_norm": 4.008678436279297, + "learning_rate": 8.567724569450393e-06, + "loss": 0.8588, + "step": 5353 + }, + { + "epoch": 1.423936170212766, + "grad_norm": 4.2688679695129395, + "learning_rate": 8.56710832455463e-06, + "loss": 0.8026, + "step": 5354 + }, + { + "epoch": 1.4242021276595744, + "grad_norm": 
4.144524097442627, + "learning_rate": 8.566491969288333e-06, + "loss": 0.7977, + "step": 5355 + }, + { + "epoch": 1.424468085106383, + "grad_norm": 4.431448459625244, + "learning_rate": 8.565875503670578e-06, + "loss": 0.9466, + "step": 5356 + }, + { + "epoch": 1.4247340425531916, + "grad_norm": 3.9344115257263184, + "learning_rate": 8.565258927720436e-06, + "loss": 0.7571, + "step": 5357 + }, + { + "epoch": 1.425, + "grad_norm": 4.618174076080322, + "learning_rate": 8.564642241456986e-06, + "loss": 0.92, + "step": 5358 + }, + { + "epoch": 1.4252659574468085, + "grad_norm": 4.515613079071045, + "learning_rate": 8.564025444899308e-06, + "loss": 0.8339, + "step": 5359 + }, + { + "epoch": 1.425531914893617, + "grad_norm": 3.8892219066619873, + "learning_rate": 8.563408538066486e-06, + "loss": 0.6946, + "step": 5360 + }, + { + "epoch": 1.4257978723404254, + "grad_norm": 3.8335928916931152, + "learning_rate": 8.562791520977608e-06, + "loss": 0.7894, + "step": 5361 + }, + { + "epoch": 1.4260638297872341, + "grad_norm": 3.8898446559906006, + "learning_rate": 8.562174393651767e-06, + "loss": 0.6504, + "step": 5362 + }, + { + "epoch": 1.4263297872340426, + "grad_norm": 3.916454553604126, + "learning_rate": 8.561557156108055e-06, + "loss": 0.8178, + "step": 5363 + }, + { + "epoch": 1.426595744680851, + "grad_norm": 4.594573020935059, + "learning_rate": 8.560939808365571e-06, + "loss": 0.8554, + "step": 5364 + }, + { + "epoch": 1.4268617021276595, + "grad_norm": 3.920474052429199, + "learning_rate": 8.56032235044342e-06, + "loss": 0.9173, + "step": 5365 + }, + { + "epoch": 1.427127659574468, + "grad_norm": 3.8437423706054688, + "learning_rate": 8.5597047823607e-06, + "loss": 0.7551, + "step": 5366 + }, + { + "epoch": 1.4273936170212767, + "grad_norm": 3.631983518600464, + "learning_rate": 8.559087104136525e-06, + "loss": 0.8889, + "step": 5367 + }, + { + "epoch": 1.4276595744680851, + "grad_norm": 3.7418458461761475, + "learning_rate": 8.558469315790005e-06, + "loss": 0.7964, + "step": 5368 + }, + { + "epoch": 1.4279255319148936, + "grad_norm": 4.14785099029541, + "learning_rate": 8.557851417340252e-06, + "loss": 0.8312, + "step": 5369 + }, + { + "epoch": 1.4281914893617023, + "grad_norm": 4.0224103927612305, + "learning_rate": 8.55723340880639e-06, + "loss": 0.9175, + "step": 5370 + }, + { + "epoch": 1.4284574468085105, + "grad_norm": 3.899369478225708, + "learning_rate": 8.556615290207538e-06, + "loss": 0.776, + "step": 5371 + }, + { + "epoch": 1.4287234042553192, + "grad_norm": 3.869248628616333, + "learning_rate": 8.555997061562821e-06, + "loss": 0.7417, + "step": 5372 + }, + { + "epoch": 1.4289893617021276, + "grad_norm": 3.8381667137145996, + "learning_rate": 8.555378722891367e-06, + "loss": 0.7887, + "step": 5373 + }, + { + "epoch": 1.429255319148936, + "grad_norm": 4.0374674797058105, + "learning_rate": 8.55476027421231e-06, + "loss": 0.7039, + "step": 5374 + }, + { + "epoch": 1.4295212765957448, + "grad_norm": 4.473758220672607, + "learning_rate": 8.554141715544788e-06, + "loss": 0.8829, + "step": 5375 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 3.995429277420044, + "learning_rate": 8.553523046907934e-06, + "loss": 0.8441, + "step": 5376 + }, + { + "epoch": 1.4300531914893617, + "grad_norm": 3.942129373550415, + "learning_rate": 8.552904268320895e-06, + "loss": 0.8657, + "step": 5377 + }, + { + "epoch": 1.4303191489361702, + "grad_norm": 4.163167953491211, + "learning_rate": 8.552285379802811e-06, + "loss": 0.7497, + "step": 5378 + }, + { + "epoch": 1.4305851063829786, + 
"grad_norm": 3.926020860671997, + "learning_rate": 8.551666381372839e-06, + "loss": 0.8265, + "step": 5379 + }, + { + "epoch": 1.4308510638297873, + "grad_norm": 3.686615228652954, + "learning_rate": 8.551047273050126e-06, + "loss": 0.694, + "step": 5380 + }, + { + "epoch": 1.4311170212765958, + "grad_norm": 4.436965465545654, + "learning_rate": 8.55042805485383e-06, + "loss": 0.929, + "step": 5381 + }, + { + "epoch": 1.4313829787234043, + "grad_norm": 4.103221416473389, + "learning_rate": 8.549808726803108e-06, + "loss": 0.7724, + "step": 5382 + }, + { + "epoch": 1.4316489361702127, + "grad_norm": 3.994560718536377, + "learning_rate": 8.549189288917127e-06, + "loss": 0.6845, + "step": 5383 + }, + { + "epoch": 1.4319148936170212, + "grad_norm": 4.3197712898254395, + "learning_rate": 8.548569741215049e-06, + "loss": 0.8348, + "step": 5384 + }, + { + "epoch": 1.4321808510638299, + "grad_norm": 4.51045560836792, + "learning_rate": 8.547950083716047e-06, + "loss": 0.8659, + "step": 5385 + }, + { + "epoch": 1.4324468085106383, + "grad_norm": 4.250168323516846, + "learning_rate": 8.54733031643929e-06, + "loss": 0.9424, + "step": 5386 + }, + { + "epoch": 1.4327127659574468, + "grad_norm": 3.6297523975372314, + "learning_rate": 8.54671043940396e-06, + "loss": 0.8464, + "step": 5387 + }, + { + "epoch": 1.4329787234042553, + "grad_norm": 3.914750099182129, + "learning_rate": 8.54609045262923e-06, + "loss": 0.9345, + "step": 5388 + }, + { + "epoch": 1.4332446808510637, + "grad_norm": 4.086660385131836, + "learning_rate": 8.545470356134289e-06, + "loss": 0.8161, + "step": 5389 + }, + { + "epoch": 1.4335106382978724, + "grad_norm": 3.657174825668335, + "learning_rate": 8.54485014993832e-06, + "loss": 0.8184, + "step": 5390 + }, + { + "epoch": 1.4337765957446809, + "grad_norm": 4.197863578796387, + "learning_rate": 8.544229834060512e-06, + "loss": 0.8937, + "step": 5391 + }, + { + "epoch": 1.4340425531914893, + "grad_norm": 4.215087413787842, + "learning_rate": 8.543609408520062e-06, + "loss": 0.8149, + "step": 5392 + }, + { + "epoch": 1.434308510638298, + "grad_norm": 4.2908101081848145, + "learning_rate": 8.542988873336164e-06, + "loss": 0.7731, + "step": 5393 + }, + { + "epoch": 1.4345744680851062, + "grad_norm": 3.921720266342163, + "learning_rate": 8.54236822852802e-06, + "loss": 0.7697, + "step": 5394 + }, + { + "epoch": 1.434840425531915, + "grad_norm": 4.464201927185059, + "learning_rate": 8.54174747411483e-06, + "loss": 0.8365, + "step": 5395 + }, + { + "epoch": 1.4351063829787234, + "grad_norm": 3.9795491695404053, + "learning_rate": 8.541126610115806e-06, + "loss": 0.8086, + "step": 5396 + }, + { + "epoch": 1.4353723404255319, + "grad_norm": 4.0533766746521, + "learning_rate": 8.540505636550153e-06, + "loss": 0.7996, + "step": 5397 + }, + { + "epoch": 1.4356382978723405, + "grad_norm": 4.261003494262695, + "learning_rate": 8.53988455343709e-06, + "loss": 0.7748, + "step": 5398 + }, + { + "epoch": 1.435904255319149, + "grad_norm": 4.159748077392578, + "learning_rate": 8.53926336079583e-06, + "loss": 0.8867, + "step": 5399 + }, + { + "epoch": 1.4361702127659575, + "grad_norm": 3.9314358234405518, + "learning_rate": 8.538642058645595e-06, + "loss": 0.8713, + "step": 5400 + }, + { + "epoch": 1.436436170212766, + "grad_norm": 3.8043625354766846, + "learning_rate": 8.538020647005607e-06, + "loss": 0.7276, + "step": 5401 + }, + { + "epoch": 1.4367021276595744, + "grad_norm": 4.576129913330078, + "learning_rate": 8.537399125895096e-06, + "loss": 0.7822, + "step": 5402 + }, + { + "epoch": 
1.436968085106383, + "grad_norm": 3.801168918609619, + "learning_rate": 8.53677749533329e-06, + "loss": 0.8445, + "step": 5403 + }, + { + "epoch": 1.4372340425531915, + "grad_norm": 3.763317108154297, + "learning_rate": 8.536155755339427e-06, + "loss": 0.7572, + "step": 5404 + }, + { + "epoch": 1.4375, + "grad_norm": 4.1881256103515625, + "learning_rate": 8.535533905932739e-06, + "loss": 0.8398, + "step": 5405 + }, + { + "epoch": 1.4377659574468085, + "grad_norm": 3.61997127532959, + "learning_rate": 8.534911947132469e-06, + "loss": 0.674, + "step": 5406 + }, + { + "epoch": 1.438031914893617, + "grad_norm": 3.6583242416381836, + "learning_rate": 8.534289878957863e-06, + "loss": 0.6655, + "step": 5407 + }, + { + "epoch": 1.4382978723404256, + "grad_norm": 3.9012091159820557, + "learning_rate": 8.533667701428167e-06, + "loss": 0.6869, + "step": 5408 + }, + { + "epoch": 1.438563829787234, + "grad_norm": 3.890615463256836, + "learning_rate": 8.53304541456263e-06, + "loss": 0.8431, + "step": 5409 + }, + { + "epoch": 1.4388297872340425, + "grad_norm": 3.8987715244293213, + "learning_rate": 8.532423018380511e-06, + "loss": 0.8705, + "step": 5410 + }, + { + "epoch": 1.439095744680851, + "grad_norm": 4.005768775939941, + "learning_rate": 8.531800512901066e-06, + "loss": 0.8555, + "step": 5411 + }, + { + "epoch": 1.4393617021276595, + "grad_norm": 3.9035804271698, + "learning_rate": 8.531177898143552e-06, + "loss": 0.7811, + "step": 5412 + }, + { + "epoch": 1.4396276595744681, + "grad_norm": 4.260951995849609, + "learning_rate": 8.530555174127236e-06, + "loss": 0.9168, + "step": 5413 + }, + { + "epoch": 1.4398936170212766, + "grad_norm": 4.07423210144043, + "learning_rate": 8.529932340871388e-06, + "loss": 0.7437, + "step": 5414 + }, + { + "epoch": 1.440159574468085, + "grad_norm": 3.9797050952911377, + "learning_rate": 8.529309398395275e-06, + "loss": 0.707, + "step": 5415 + }, + { + "epoch": 1.4404255319148938, + "grad_norm": 3.7319893836975098, + "learning_rate": 8.528686346718177e-06, + "loss": 0.7089, + "step": 5416 + }, + { + "epoch": 1.440691489361702, + "grad_norm": 4.224223613739014, + "learning_rate": 8.528063185859367e-06, + "loss": 0.786, + "step": 5417 + }, + { + "epoch": 1.4409574468085107, + "grad_norm": 4.449718952178955, + "learning_rate": 8.527439915838129e-06, + "loss": 0.8129, + "step": 5418 + }, + { + "epoch": 1.4412234042553191, + "grad_norm": 3.991421937942505, + "learning_rate": 8.526816536673748e-06, + "loss": 0.9446, + "step": 5419 + }, + { + "epoch": 1.4414893617021276, + "grad_norm": 3.5149245262145996, + "learning_rate": 8.52619304838551e-06, + "loss": 0.738, + "step": 5420 + }, + { + "epoch": 1.4417553191489363, + "grad_norm": 4.034007549285889, + "learning_rate": 8.525569450992707e-06, + "loss": 0.8011, + "step": 5421 + }, + { + "epoch": 1.4420212765957447, + "grad_norm": 4.191031455993652, + "learning_rate": 8.524945744514634e-06, + "loss": 0.9352, + "step": 5422 + }, + { + "epoch": 1.4422872340425532, + "grad_norm": 3.4210205078125, + "learning_rate": 8.524321928970591e-06, + "loss": 0.7345, + "step": 5423 + }, + { + "epoch": 1.4425531914893617, + "grad_norm": 3.573930263519287, + "learning_rate": 8.523698004379878e-06, + "loss": 0.6936, + "step": 5424 + }, + { + "epoch": 1.4428191489361701, + "grad_norm": 3.847769260406494, + "learning_rate": 8.523073970761799e-06, + "loss": 0.7465, + "step": 5425 + }, + { + "epoch": 1.4430851063829788, + "grad_norm": 3.526007652282715, + "learning_rate": 8.522449828135663e-06, + "loss": 0.8042, + "step": 5426 + }, + { + "epoch": 
1.4433510638297873, + "grad_norm": 3.3529438972473145, + "learning_rate": 8.521825576520784e-06, + "loss": 0.6523, + "step": 5427 + }, + { + "epoch": 1.4436170212765957, + "grad_norm": 3.608856678009033, + "learning_rate": 8.521201215936474e-06, + "loss": 0.753, + "step": 5428 + }, + { + "epoch": 1.4438829787234042, + "grad_norm": 3.78037691116333, + "learning_rate": 8.520576746402052e-06, + "loss": 0.9188, + "step": 5429 + }, + { + "epoch": 1.4441489361702127, + "grad_norm": 3.6370112895965576, + "learning_rate": 8.519952167936842e-06, + "loss": 0.7606, + "step": 5430 + }, + { + "epoch": 1.4444148936170214, + "grad_norm": 4.091804504394531, + "learning_rate": 8.519327480560169e-06, + "loss": 0.8833, + "step": 5431 + }, + { + "epoch": 1.4446808510638298, + "grad_norm": 4.076303482055664, + "learning_rate": 8.518702684291358e-06, + "loss": 0.7852, + "step": 5432 + }, + { + "epoch": 1.4449468085106383, + "grad_norm": 3.845811605453491, + "learning_rate": 8.518077779149744e-06, + "loss": 0.7455, + "step": 5433 + }, + { + "epoch": 1.4452127659574467, + "grad_norm": 4.302513599395752, + "learning_rate": 8.517452765154661e-06, + "loss": 0.7273, + "step": 5434 + }, + { + "epoch": 1.4454787234042552, + "grad_norm": 3.78494930267334, + "learning_rate": 8.516827642325447e-06, + "loss": 0.7468, + "step": 5435 + }, + { + "epoch": 1.445744680851064, + "grad_norm": 3.9590561389923096, + "learning_rate": 8.516202410681446e-06, + "loss": 0.9023, + "step": 5436 + }, + { + "epoch": 1.4460106382978724, + "grad_norm": 4.2443766593933105, + "learning_rate": 8.515577070242005e-06, + "loss": 0.9363, + "step": 5437 + }, + { + "epoch": 1.4462765957446808, + "grad_norm": 3.511875867843628, + "learning_rate": 8.514951621026468e-06, + "loss": 0.7257, + "step": 5438 + }, + { + "epoch": 1.4465425531914895, + "grad_norm": 3.931488513946533, + "learning_rate": 8.51432606305419e-06, + "loss": 0.794, + "step": 5439 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 4.520570755004883, + "learning_rate": 8.513700396344527e-06, + "loss": 0.9367, + "step": 5440 + }, + { + "epoch": 1.4470744680851064, + "grad_norm": 4.023960113525391, + "learning_rate": 8.513074620916835e-06, + "loss": 0.8083, + "step": 5441 + }, + { + "epoch": 1.4473404255319149, + "grad_norm": 3.8863484859466553, + "learning_rate": 8.512448736790479e-06, + "loss": 0.7789, + "step": 5442 + }, + { + "epoch": 1.4476063829787233, + "grad_norm": 3.4847662448883057, + "learning_rate": 8.511822743984824e-06, + "loss": 0.6853, + "step": 5443 + }, + { + "epoch": 1.447872340425532, + "grad_norm": 3.668828010559082, + "learning_rate": 8.511196642519237e-06, + "loss": 0.8037, + "step": 5444 + }, + { + "epoch": 1.4481382978723405, + "grad_norm": 3.801157236099243, + "learning_rate": 8.510570432413095e-06, + "loss": 0.8393, + "step": 5445 + }, + { + "epoch": 1.448404255319149, + "grad_norm": 4.479011535644531, + "learning_rate": 8.509944113685769e-06, + "loss": 0.9082, + "step": 5446 + }, + { + "epoch": 1.4486702127659574, + "grad_norm": 4.385382652282715, + "learning_rate": 8.509317686356638e-06, + "loss": 0.9118, + "step": 5447 + }, + { + "epoch": 1.4489361702127659, + "grad_norm": 4.001799583435059, + "learning_rate": 8.50869115044509e-06, + "loss": 0.7022, + "step": 5448 + }, + { + "epoch": 1.4492021276595746, + "grad_norm": 4.2879228591918945, + "learning_rate": 8.508064505970503e-06, + "loss": 0.8253, + "step": 5449 + }, + { + "epoch": 1.449468085106383, + "grad_norm": 3.933523654937744, + "learning_rate": 8.507437752952271e-06, + "loss": 0.8163, + "step": 5450 + 
}, + { + "epoch": 1.4497340425531915, + "grad_norm": 4.011867046356201, + "learning_rate": 8.506810891409786e-06, + "loss": 0.8196, + "step": 5451 + }, + { + "epoch": 1.45, + "grad_norm": 4.269194602966309, + "learning_rate": 8.506183921362443e-06, + "loss": 0.7912, + "step": 5452 + }, + { + "epoch": 1.4502659574468084, + "grad_norm": 4.043778896331787, + "learning_rate": 8.505556842829643e-06, + "loss": 0.7842, + "step": 5453 + }, + { + "epoch": 1.450531914893617, + "grad_norm": 4.532417297363281, + "learning_rate": 8.504929655830785e-06, + "loss": 0.9794, + "step": 5454 + }, + { + "epoch": 1.4507978723404256, + "grad_norm": 3.571371555328369, + "learning_rate": 8.504302360385276e-06, + "loss": 0.8234, + "step": 5455 + }, + { + "epoch": 1.451063829787234, + "grad_norm": 3.6812736988067627, + "learning_rate": 8.50367495651253e-06, + "loss": 0.8207, + "step": 5456 + }, + { + "epoch": 1.4513297872340425, + "grad_norm": 3.88917875289917, + "learning_rate": 8.503047444231954e-06, + "loss": 0.8452, + "step": 5457 + }, + { + "epoch": 1.451595744680851, + "grad_norm": 3.7152698040008545, + "learning_rate": 8.502419823562964e-06, + "loss": 0.7018, + "step": 5458 + }, + { + "epoch": 1.4518617021276596, + "grad_norm": 3.9872684478759766, + "learning_rate": 8.501792094524983e-06, + "loss": 0.9355, + "step": 5459 + }, + { + "epoch": 1.452127659574468, + "grad_norm": 3.8965933322906494, + "learning_rate": 8.501164257137431e-06, + "loss": 0.7547, + "step": 5460 + }, + { + "epoch": 1.4523936170212766, + "grad_norm": 4.248835563659668, + "learning_rate": 8.500536311419735e-06, + "loss": 0.8456, + "step": 5461 + }, + { + "epoch": 1.452659574468085, + "grad_norm": 4.09518575668335, + "learning_rate": 8.499908257391324e-06, + "loss": 0.8698, + "step": 5462 + }, + { + "epoch": 1.4529255319148935, + "grad_norm": 4.262086391448975, + "learning_rate": 8.49928009507163e-06, + "loss": 0.761, + "step": 5463 + }, + { + "epoch": 1.4531914893617022, + "grad_norm": 3.634997606277466, + "learning_rate": 8.49865182448009e-06, + "loss": 0.7712, + "step": 5464 + }, + { + "epoch": 1.4534574468085106, + "grad_norm": 4.407344818115234, + "learning_rate": 8.498023445636145e-06, + "loss": 0.8103, + "step": 5465 + }, + { + "epoch": 1.453723404255319, + "grad_norm": 3.926379680633545, + "learning_rate": 8.497394958559236e-06, + "loss": 0.7233, + "step": 5466 + }, + { + "epoch": 1.4539893617021278, + "grad_norm": 4.115360736846924, + "learning_rate": 8.496766363268809e-06, + "loss": 0.9513, + "step": 5467 + }, + { + "epoch": 1.4542553191489362, + "grad_norm": 4.249356269836426, + "learning_rate": 8.496137659784313e-06, + "loss": 0.7799, + "step": 5468 + }, + { + "epoch": 1.4545212765957447, + "grad_norm": 3.9418179988861084, + "learning_rate": 8.495508848125202e-06, + "loss": 0.7216, + "step": 5469 + }, + { + "epoch": 1.4547872340425532, + "grad_norm": 4.33933687210083, + "learning_rate": 8.494879928310934e-06, + "loss": 0.8312, + "step": 5470 + }, + { + "epoch": 1.4550531914893616, + "grad_norm": 4.497339248657227, + "learning_rate": 8.494250900360963e-06, + "loss": 0.6842, + "step": 5471 + }, + { + "epoch": 1.4553191489361703, + "grad_norm": 4.439492225646973, + "learning_rate": 8.493621764294757e-06, + "loss": 0.8134, + "step": 5472 + }, + { + "epoch": 1.4555851063829788, + "grad_norm": 4.622555255889893, + "learning_rate": 8.49299252013178e-06, + "loss": 0.878, + "step": 5473 + }, + { + "epoch": 1.4558510638297872, + "grad_norm": 4.369466781616211, + "learning_rate": 8.492363167891502e-06, + "loss": 0.7228, + "step": 5474 + }, 
+ { + "epoch": 1.4561170212765957, + "grad_norm": 4.223091125488281, + "learning_rate": 8.491733707593395e-06, + "loss": 0.8303, + "step": 5475 + }, + { + "epoch": 1.4563829787234042, + "grad_norm": 4.063412189483643, + "learning_rate": 8.491104139256936e-06, + "loss": 0.8504, + "step": 5476 + }, + { + "epoch": 1.4566489361702128, + "grad_norm": 4.342689514160156, + "learning_rate": 8.490474462901605e-06, + "loss": 0.841, + "step": 5477 + }, + { + "epoch": 1.4569148936170213, + "grad_norm": 4.090299129486084, + "learning_rate": 8.489844678546886e-06, + "loss": 0.8391, + "step": 5478 + }, + { + "epoch": 1.4571808510638298, + "grad_norm": 3.786254644393921, + "learning_rate": 8.489214786212263e-06, + "loss": 0.8498, + "step": 5479 + }, + { + "epoch": 1.4574468085106382, + "grad_norm": 4.191230297088623, + "learning_rate": 8.488584785917226e-06, + "loss": 0.7906, + "step": 5480 + }, + { + "epoch": 1.4577127659574467, + "grad_norm": 3.928368330001831, + "learning_rate": 8.487954677681269e-06, + "loss": 0.8001, + "step": 5481 + }, + { + "epoch": 1.4579787234042554, + "grad_norm": 3.579162836074829, + "learning_rate": 8.487324461523887e-06, + "loss": 0.8023, + "step": 5482 + }, + { + "epoch": 1.4582446808510638, + "grad_norm": 3.6825640201568604, + "learning_rate": 8.486694137464582e-06, + "loss": 0.7853, + "step": 5483 + }, + { + "epoch": 1.4585106382978723, + "grad_norm": 4.125916004180908, + "learning_rate": 8.486063705522853e-06, + "loss": 0.7216, + "step": 5484 + }, + { + "epoch": 1.4587765957446808, + "grad_norm": 4.086201190948486, + "learning_rate": 8.48543316571821e-06, + "loss": 0.7723, + "step": 5485 + }, + { + "epoch": 1.4590425531914892, + "grad_norm": 3.6054461002349854, + "learning_rate": 8.484802518070161e-06, + "loss": 0.7561, + "step": 5486 + }, + { + "epoch": 1.459308510638298, + "grad_norm": 3.9755938053131104, + "learning_rate": 8.48417176259822e-06, + "loss": 0.7914, + "step": 5487 + }, + { + "epoch": 1.4595744680851064, + "grad_norm": 3.4087741374969482, + "learning_rate": 8.483540899321901e-06, + "loss": 0.8288, + "step": 5488 + }, + { + "epoch": 1.4598404255319148, + "grad_norm": 4.220149517059326, + "learning_rate": 8.482909928260726e-06, + "loss": 0.9088, + "step": 5489 + }, + { + "epoch": 1.4601063829787235, + "grad_norm": 4.157181262969971, + "learning_rate": 8.482278849434218e-06, + "loss": 0.8727, + "step": 5490 + }, + { + "epoch": 1.460372340425532, + "grad_norm": 4.077250003814697, + "learning_rate": 8.481647662861901e-06, + "loss": 0.7891, + "step": 5491 + }, + { + "epoch": 1.4606382978723405, + "grad_norm": 3.9751412868499756, + "learning_rate": 8.481016368563308e-06, + "loss": 0.8363, + "step": 5492 + }, + { + "epoch": 1.460904255319149, + "grad_norm": 4.07692813873291, + "learning_rate": 8.480384966557969e-06, + "loss": 1.0291, + "step": 5493 + }, + { + "epoch": 1.4611702127659574, + "grad_norm": 3.963118553161621, + "learning_rate": 8.479753456865422e-06, + "loss": 0.778, + "step": 5494 + }, + { + "epoch": 1.461436170212766, + "grad_norm": 4.359419822692871, + "learning_rate": 8.479121839505205e-06, + "loss": 0.8413, + "step": 5495 + }, + { + "epoch": 1.4617021276595745, + "grad_norm": 4.071464538574219, + "learning_rate": 8.478490114496862e-06, + "loss": 0.802, + "step": 5496 + }, + { + "epoch": 1.461968085106383, + "grad_norm": 4.090579509735107, + "learning_rate": 8.477858281859941e-06, + "loss": 0.8182, + "step": 5497 + }, + { + "epoch": 1.4622340425531914, + "grad_norm": 4.3386006355285645, + "learning_rate": 8.47722634161399e-06, + "loss": 0.7349, + 
"step": 5498 + }, + { + "epoch": 1.4625, + "grad_norm": 3.489248275756836, + "learning_rate": 8.476594293778561e-06, + "loss": 0.7918, + "step": 5499 + }, + { + "epoch": 1.4627659574468086, + "grad_norm": 3.849106788635254, + "learning_rate": 8.475962138373212e-06, + "loss": 0.7986, + "step": 5500 + }, + { + "epoch": 1.4627659574468086, + "eval_loss": 1.2964370250701904, + "eval_runtime": 13.6602, + "eval_samples_per_second": 29.282, + "eval_steps_per_second": 3.66, + "step": 5500 + }, + { + "epoch": 1.463031914893617, + "grad_norm": 3.9225049018859863, + "learning_rate": 8.475329875417502e-06, + "loss": 0.7197, + "step": 5501 + }, + { + "epoch": 1.4632978723404255, + "grad_norm": 3.952686071395874, + "learning_rate": 8.474697504930994e-06, + "loss": 0.8378, + "step": 5502 + }, + { + "epoch": 1.463563829787234, + "grad_norm": 3.452550172805786, + "learning_rate": 8.474065026933254e-06, + "loss": 0.8279, + "step": 5503 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 3.6807174682617188, + "learning_rate": 8.473432441443852e-06, + "loss": 0.8527, + "step": 5504 + }, + { + "epoch": 1.4640957446808511, + "grad_norm": 3.6200850009918213, + "learning_rate": 8.472799748482361e-06, + "loss": 0.7749, + "step": 5505 + }, + { + "epoch": 1.4643617021276596, + "grad_norm": 4.591206073760986, + "learning_rate": 8.472166948068357e-06, + "loss": 0.8827, + "step": 5506 + }, + { + "epoch": 1.464627659574468, + "grad_norm": 3.7772765159606934, + "learning_rate": 8.471534040221419e-06, + "loss": 0.8578, + "step": 5507 + }, + { + "epoch": 1.4648936170212765, + "grad_norm": 3.75657057762146, + "learning_rate": 8.47090102496113e-06, + "loss": 0.8552, + "step": 5508 + }, + { + "epoch": 1.465159574468085, + "grad_norm": 3.635420322418213, + "learning_rate": 8.470267902307079e-06, + "loss": 0.7732, + "step": 5509 + }, + { + "epoch": 1.4654255319148937, + "grad_norm": 4.403695583343506, + "learning_rate": 8.469634672278853e-06, + "loss": 0.9379, + "step": 5510 + }, + { + "epoch": 1.4656914893617021, + "grad_norm": 3.849709987640381, + "learning_rate": 8.469001334896044e-06, + "loss": 0.7691, + "step": 5511 + }, + { + "epoch": 1.4659574468085106, + "grad_norm": 3.580702066421509, + "learning_rate": 8.46836789017825e-06, + "loss": 0.7887, + "step": 5512 + }, + { + "epoch": 1.4662234042553193, + "grad_norm": 4.184311866760254, + "learning_rate": 8.46773433814507e-06, + "loss": 0.9119, + "step": 5513 + }, + { + "epoch": 1.4664893617021277, + "grad_norm": 4.308862686157227, + "learning_rate": 8.467100678816108e-06, + "loss": 0.8483, + "step": 5514 + }, + { + "epoch": 1.4667553191489362, + "grad_norm": 3.799316883087158, + "learning_rate": 8.466466912210967e-06, + "loss": 0.8143, + "step": 5515 + }, + { + "epoch": 1.4670212765957447, + "grad_norm": 3.673563003540039, + "learning_rate": 8.465833038349259e-06, + "loss": 0.7485, + "step": 5516 + }, + { + "epoch": 1.4672872340425531, + "grad_norm": 4.07314395904541, + "learning_rate": 8.465199057250597e-06, + "loss": 0.8663, + "step": 5517 + }, + { + "epoch": 1.4675531914893618, + "grad_norm": 3.6095144748687744, + "learning_rate": 8.464564968934595e-06, + "loss": 0.6752, + "step": 5518 + }, + { + "epoch": 1.4678191489361703, + "grad_norm": 3.661813735961914, + "learning_rate": 8.463930773420874e-06, + "loss": 0.8518, + "step": 5519 + }, + { + "epoch": 1.4680851063829787, + "grad_norm": 4.36665153503418, + "learning_rate": 8.463296470729058e-06, + "loss": 0.7581, + "step": 5520 + }, + { + "epoch": 1.4683510638297872, + "grad_norm": 4.145575046539307, + "learning_rate": 
8.462662060878772e-06, + "loss": 0.8582, + "step": 5521 + }, + { + "epoch": 1.4686170212765957, + "grad_norm": 3.805684804916382, + "learning_rate": 8.462027543889644e-06, + "loss": 0.718, + "step": 5522 + }, + { + "epoch": 1.4688829787234043, + "grad_norm": 3.7820284366607666, + "learning_rate": 8.461392919781309e-06, + "loss": 0.7179, + "step": 5523 + }, + { + "epoch": 1.4691489361702128, + "grad_norm": 4.097955226898193, + "learning_rate": 8.460758188573399e-06, + "loss": 0.7764, + "step": 5524 + }, + { + "epoch": 1.4694148936170213, + "grad_norm": 4.177279472351074, + "learning_rate": 8.46012335028556e-06, + "loss": 0.8168, + "step": 5525 + }, + { + "epoch": 1.4696808510638297, + "grad_norm": 4.4050679206848145, + "learning_rate": 8.459488404937426e-06, + "loss": 0.8876, + "step": 5526 + }, + { + "epoch": 1.4699468085106382, + "grad_norm": 3.7400434017181396, + "learning_rate": 8.458853352548651e-06, + "loss": 0.8693, + "step": 5527 + }, + { + "epoch": 1.4702127659574469, + "grad_norm": 3.909196138381958, + "learning_rate": 8.458218193138881e-06, + "loss": 0.8237, + "step": 5528 + }, + { + "epoch": 1.4704787234042553, + "grad_norm": 3.941265344619751, + "learning_rate": 8.457582926727768e-06, + "loss": 0.9123, + "step": 5529 + }, + { + "epoch": 1.4707446808510638, + "grad_norm": 3.8149471282958984, + "learning_rate": 8.456947553334966e-06, + "loss": 0.6899, + "step": 5530 + }, + { + "epoch": 1.4710106382978723, + "grad_norm": 3.6952855587005615, + "learning_rate": 8.45631207298014e-06, + "loss": 0.7128, + "step": 5531 + }, + { + "epoch": 1.4712765957446807, + "grad_norm": 3.9754221439361572, + "learning_rate": 8.45567648568295e-06, + "loss": 0.9245, + "step": 5532 + }, + { + "epoch": 1.4715425531914894, + "grad_norm": 4.337751388549805, + "learning_rate": 8.455040791463057e-06, + "loss": 0.8776, + "step": 5533 + }, + { + "epoch": 1.4718085106382979, + "grad_norm": 3.7709763050079346, + "learning_rate": 8.454404990340137e-06, + "loss": 0.6869, + "step": 5534 + }, + { + "epoch": 1.4720744680851063, + "grad_norm": 4.196871280670166, + "learning_rate": 8.453769082333858e-06, + "loss": 0.8704, + "step": 5535 + }, + { + "epoch": 1.472340425531915, + "grad_norm": 3.957577705383301, + "learning_rate": 8.453133067463898e-06, + "loss": 0.7857, + "step": 5536 + }, + { + "epoch": 1.4726063829787235, + "grad_norm": 3.942445993423462, + "learning_rate": 8.452496945749934e-06, + "loss": 0.875, + "step": 5537 + }, + { + "epoch": 1.472872340425532, + "grad_norm": 4.122093200683594, + "learning_rate": 8.451860717211653e-06, + "loss": 0.8047, + "step": 5538 + }, + { + "epoch": 1.4731382978723404, + "grad_norm": 3.8919665813446045, + "learning_rate": 8.451224381868735e-06, + "loss": 0.9631, + "step": 5539 + }, + { + "epoch": 1.4734042553191489, + "grad_norm": 4.186689376831055, + "learning_rate": 8.45058793974087e-06, + "loss": 0.8028, + "step": 5540 + }, + { + "epoch": 1.4736702127659576, + "grad_norm": 4.130399703979492, + "learning_rate": 8.449951390847754e-06, + "loss": 0.7659, + "step": 5541 + }, + { + "epoch": 1.473936170212766, + "grad_norm": 3.8741462230682373, + "learning_rate": 8.44931473520908e-06, + "loss": 0.74, + "step": 5542 + }, + { + "epoch": 1.4742021276595745, + "grad_norm": 4.210333824157715, + "learning_rate": 8.448677972844546e-06, + "loss": 0.7675, + "step": 5543 + }, + { + "epoch": 1.474468085106383, + "grad_norm": 3.959024429321289, + "learning_rate": 8.448041103773857e-06, + "loss": 0.8771, + "step": 5544 + }, + { + "epoch": 1.4747340425531914, + "grad_norm": 3.9098892211914062, 
+ "learning_rate": 8.447404128016715e-06, + "loss": 0.8756, + "step": 5545 + }, + { + "epoch": 1.475, + "grad_norm": 3.9612808227539062, + "learning_rate": 8.446767045592829e-06, + "loss": 0.7888, + "step": 5546 + }, + { + "epoch": 1.4752659574468086, + "grad_norm": 3.754507303237915, + "learning_rate": 8.446129856521917e-06, + "loss": 0.8611, + "step": 5547 + }, + { + "epoch": 1.475531914893617, + "grad_norm": 3.97927188873291, + "learning_rate": 8.445492560823686e-06, + "loss": 0.7937, + "step": 5548 + }, + { + "epoch": 1.4757978723404255, + "grad_norm": 3.8864712715148926, + "learning_rate": 8.44485515851786e-06, + "loss": 0.7687, + "step": 5549 + }, + { + "epoch": 1.476063829787234, + "grad_norm": 3.407346487045288, + "learning_rate": 8.44421764962416e-06, + "loss": 0.8368, + "step": 5550 + }, + { + "epoch": 1.4763297872340426, + "grad_norm": 4.162166118621826, + "learning_rate": 8.44358003416231e-06, + "loss": 0.7305, + "step": 5551 + }, + { + "epoch": 1.476595744680851, + "grad_norm": 4.198580741882324, + "learning_rate": 8.44294231215204e-06, + "loss": 0.9471, + "step": 5552 + }, + { + "epoch": 1.4768617021276595, + "grad_norm": 3.6172430515289307, + "learning_rate": 8.44230448361308e-06, + "loss": 0.84, + "step": 5553 + }, + { + "epoch": 1.477127659574468, + "grad_norm": 3.573073387145996, + "learning_rate": 8.441666548565169e-06, + "loss": 0.8333, + "step": 5554 + }, + { + "epoch": 1.4773936170212765, + "grad_norm": 3.864596128463745, + "learning_rate": 8.441028507028041e-06, + "loss": 0.7169, + "step": 5555 + }, + { + "epoch": 1.4776595744680852, + "grad_norm": 3.62256121635437, + "learning_rate": 8.44039035902144e-06, + "loss": 0.8163, + "step": 5556 + }, + { + "epoch": 1.4779255319148936, + "grad_norm": 3.8395614624023438, + "learning_rate": 8.43975210456511e-06, + "loss": 0.7796, + "step": 5557 + }, + { + "epoch": 1.478191489361702, + "grad_norm": 3.980595111846924, + "learning_rate": 8.439113743678801e-06, + "loss": 0.9652, + "step": 5558 + }, + { + "epoch": 1.4784574468085108, + "grad_norm": 3.7857303619384766, + "learning_rate": 8.438475276382264e-06, + "loss": 0.9076, + "step": 5559 + }, + { + "epoch": 1.4787234042553192, + "grad_norm": 3.4477193355560303, + "learning_rate": 8.437836702695253e-06, + "loss": 0.727, + "step": 5560 + }, + { + "epoch": 1.4789893617021277, + "grad_norm": 3.9439425468444824, + "learning_rate": 8.437198022637527e-06, + "loss": 0.7404, + "step": 5561 + }, + { + "epoch": 1.4792553191489362, + "grad_norm": 3.8489301204681396, + "learning_rate": 8.436559236228849e-06, + "loss": 0.7598, + "step": 5562 + }, + { + "epoch": 1.4795212765957446, + "grad_norm": 3.9537103176116943, + "learning_rate": 8.435920343488978e-06, + "loss": 0.81, + "step": 5563 + }, + { + "epoch": 1.4797872340425533, + "grad_norm": 4.361562252044678, + "learning_rate": 8.435281344437691e-06, + "loss": 0.9021, + "step": 5564 + }, + { + "epoch": 1.4800531914893618, + "grad_norm": 4.177056789398193, + "learning_rate": 8.434642239094752e-06, + "loss": 0.7916, + "step": 5565 + }, + { + "epoch": 1.4803191489361702, + "grad_norm": 4.249316215515137, + "learning_rate": 8.43400302747994e-06, + "loss": 0.8578, + "step": 5566 + }, + { + "epoch": 1.4805851063829787, + "grad_norm": 4.1586198806762695, + "learning_rate": 8.43336370961303e-06, + "loss": 0.7918, + "step": 5567 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 3.8984861373901367, + "learning_rate": 8.432724285513804e-06, + "loss": 0.8302, + "step": 5568 + }, + { + "epoch": 1.4811170212765958, + "grad_norm": 4.403296947479248, + 
"learning_rate": 8.43208475520205e-06, + "loss": 0.9246, + "step": 5569 + }, + { + "epoch": 1.4813829787234043, + "grad_norm": 4.00664758682251, + "learning_rate": 8.43144511869755e-06, + "loss": 0.7915, + "step": 5570 + }, + { + "epoch": 1.4816489361702128, + "grad_norm": 4.43447732925415, + "learning_rate": 8.4308053760201e-06, + "loss": 0.811, + "step": 5571 + }, + { + "epoch": 1.4819148936170212, + "grad_norm": 4.107089519500732, + "learning_rate": 8.43016552718949e-06, + "loss": 0.9385, + "step": 5572 + }, + { + "epoch": 1.4821808510638297, + "grad_norm": 4.0541229248046875, + "learning_rate": 8.429525572225521e-06, + "loss": 0.7683, + "step": 5573 + }, + { + "epoch": 1.4824468085106384, + "grad_norm": 3.8049004077911377, + "learning_rate": 8.428885511147994e-06, + "loss": 0.8483, + "step": 5574 + }, + { + "epoch": 1.4827127659574468, + "grad_norm": 4.220947265625, + "learning_rate": 8.42824534397671e-06, + "loss": 0.8209, + "step": 5575 + }, + { + "epoch": 1.4829787234042553, + "grad_norm": 3.299015998840332, + "learning_rate": 8.427605070731482e-06, + "loss": 0.6946, + "step": 5576 + }, + { + "epoch": 1.4832446808510638, + "grad_norm": 4.028343677520752, + "learning_rate": 8.426964691432116e-06, + "loss": 0.7912, + "step": 5577 + }, + { + "epoch": 1.4835106382978722, + "grad_norm": 3.6714823246002197, + "learning_rate": 8.426324206098429e-06, + "loss": 0.7487, + "step": 5578 + }, + { + "epoch": 1.483776595744681, + "grad_norm": 3.8498239517211914, + "learning_rate": 8.425683614750235e-06, + "loss": 0.7929, + "step": 5579 + }, + { + "epoch": 1.4840425531914894, + "grad_norm": 3.6556410789489746, + "learning_rate": 8.425042917407358e-06, + "loss": 0.7774, + "step": 5580 + }, + { + "epoch": 1.4843085106382978, + "grad_norm": 3.908780336380005, + "learning_rate": 8.424402114089618e-06, + "loss": 0.7533, + "step": 5581 + }, + { + "epoch": 1.4845744680851065, + "grad_norm": 4.054098129272461, + "learning_rate": 8.42376120481685e-06, + "loss": 0.8575, + "step": 5582 + }, + { + "epoch": 1.484840425531915, + "grad_norm": 4.667778968811035, + "learning_rate": 8.423120189608876e-06, + "loss": 0.8906, + "step": 5583 + }, + { + "epoch": 1.4851063829787234, + "grad_norm": 3.960300922393799, + "learning_rate": 8.422479068485531e-06, + "loss": 0.7737, + "step": 5584 + }, + { + "epoch": 1.485372340425532, + "grad_norm": 4.355529308319092, + "learning_rate": 8.421837841466657e-06, + "loss": 0.8904, + "step": 5585 + }, + { + "epoch": 1.4856382978723404, + "grad_norm": 4.450819969177246, + "learning_rate": 8.42119650857209e-06, + "loss": 0.8558, + "step": 5586 + }, + { + "epoch": 1.485904255319149, + "grad_norm": 3.8777942657470703, + "learning_rate": 8.420555069821679e-06, + "loss": 0.8021, + "step": 5587 + }, + { + "epoch": 1.4861702127659575, + "grad_norm": 3.9618871212005615, + "learning_rate": 8.419913525235264e-06, + "loss": 0.8717, + "step": 5588 + }, + { + "epoch": 1.486436170212766, + "grad_norm": 3.7627811431884766, + "learning_rate": 8.419271874832697e-06, + "loss": 0.7337, + "step": 5589 + }, + { + "epoch": 1.4867021276595744, + "grad_norm": 3.9509243965148926, + "learning_rate": 8.418630118633835e-06, + "loss": 0.8209, + "step": 5590 + }, + { + "epoch": 1.486968085106383, + "grad_norm": 3.8642148971557617, + "learning_rate": 8.417988256658532e-06, + "loss": 0.7907, + "step": 5591 + }, + { + "epoch": 1.4872340425531916, + "grad_norm": 3.917509078979492, + "learning_rate": 8.417346288926646e-06, + "loss": 0.8037, + "step": 5592 + }, + { + "epoch": 1.4875, + "grad_norm": 3.5143251419067383, + 
"learning_rate": 8.416704215458042e-06, + "loss": 0.8127, + "step": 5593 + }, + { + "epoch": 1.4877659574468085, + "grad_norm": 4.229488372802734, + "learning_rate": 8.41606203627259e-06, + "loss": 0.8681, + "step": 5594 + }, + { + "epoch": 1.488031914893617, + "grad_norm": 3.636591911315918, + "learning_rate": 8.415419751390155e-06, + "loss": 0.8858, + "step": 5595 + }, + { + "epoch": 1.4882978723404254, + "grad_norm": 3.9129700660705566, + "learning_rate": 8.414777360830611e-06, + "loss": 0.8607, + "step": 5596 + }, + { + "epoch": 1.4885638297872341, + "grad_norm": 4.00184965133667, + "learning_rate": 8.414134864613837e-06, + "loss": 0.7551, + "step": 5597 + }, + { + "epoch": 1.4888297872340426, + "grad_norm": 3.9038429260253906, + "learning_rate": 8.413492262759708e-06, + "loss": 0.7195, + "step": 5598 + }, + { + "epoch": 1.489095744680851, + "grad_norm": 3.802076816558838, + "learning_rate": 8.412849555288111e-06, + "loss": 0.8092, + "step": 5599 + }, + { + "epoch": 1.4893617021276595, + "grad_norm": 4.020835876464844, + "learning_rate": 8.41220674221893e-06, + "loss": 0.8439, + "step": 5600 + }, + { + "epoch": 1.489627659574468, + "grad_norm": 4.310454845428467, + "learning_rate": 8.411563823572057e-06, + "loss": 0.959, + "step": 5601 + }, + { + "epoch": 1.4898936170212767, + "grad_norm": 4.212212085723877, + "learning_rate": 8.410920799367382e-06, + "loss": 0.784, + "step": 5602 + }, + { + "epoch": 1.4901595744680851, + "grad_norm": 3.9010252952575684, + "learning_rate": 8.4102776696248e-06, + "loss": 0.7156, + "step": 5603 + }, + { + "epoch": 1.4904255319148936, + "grad_norm": 4.061422348022461, + "learning_rate": 8.409634434364214e-06, + "loss": 0.8524, + "step": 5604 + }, + { + "epoch": 1.4906914893617023, + "grad_norm": 4.281171798706055, + "learning_rate": 8.408991093605524e-06, + "loss": 0.8344, + "step": 5605 + }, + { + "epoch": 1.4909574468085105, + "grad_norm": 4.274752616882324, + "learning_rate": 8.408347647368634e-06, + "loss": 0.8106, + "step": 5606 + }, + { + "epoch": 1.4912234042553192, + "grad_norm": 3.9846606254577637, + "learning_rate": 8.407704095673454e-06, + "loss": 0.7059, + "step": 5607 + }, + { + "epoch": 1.4914893617021276, + "grad_norm": 4.1280436515808105, + "learning_rate": 8.4070604385399e-06, + "loss": 0.8267, + "step": 5608 + }, + { + "epoch": 1.491755319148936, + "grad_norm": 3.7875635623931885, + "learning_rate": 8.406416675987884e-06, + "loss": 0.8078, + "step": 5609 + }, + { + "epoch": 1.4920212765957448, + "grad_norm": 4.4207444190979, + "learning_rate": 8.405772808037326e-06, + "loss": 0.8452, + "step": 5610 + }, + { + "epoch": 1.4922872340425533, + "grad_norm": 3.9423201084136963, + "learning_rate": 8.405128834708147e-06, + "loss": 0.7491, + "step": 5611 + }, + { + "epoch": 1.4925531914893617, + "grad_norm": 3.669431686401367, + "learning_rate": 8.404484756020272e-06, + "loss": 0.7232, + "step": 5612 + }, + { + "epoch": 1.4928191489361702, + "grad_norm": 4.371226787567139, + "learning_rate": 8.403840571993631e-06, + "loss": 0.7899, + "step": 5613 + }, + { + "epoch": 1.4930851063829786, + "grad_norm": 4.185215950012207, + "learning_rate": 8.403196282648156e-06, + "loss": 0.9727, + "step": 5614 + }, + { + "epoch": 1.4933510638297873, + "grad_norm": 3.5517239570617676, + "learning_rate": 8.402551888003781e-06, + "loss": 0.805, + "step": 5615 + }, + { + "epoch": 1.4936170212765958, + "grad_norm": 3.4188995361328125, + "learning_rate": 8.401907388080443e-06, + "loss": 0.7345, + "step": 5616 + }, + { + "epoch": 1.4938829787234043, + "grad_norm": 
3.7187201976776123, + "learning_rate": 8.401262782898087e-06, + "loss": 0.7147, + "step": 5617 + }, + { + "epoch": 1.4941489361702127, + "grad_norm": 4.5645976066589355, + "learning_rate": 8.400618072476655e-06, + "loss": 0.8707, + "step": 5618 + }, + { + "epoch": 1.4944148936170212, + "grad_norm": 3.7568912506103516, + "learning_rate": 8.399973256836097e-06, + "loss": 0.8637, + "step": 5619 + }, + { + "epoch": 1.4946808510638299, + "grad_norm": 4.120610237121582, + "learning_rate": 8.399328335996362e-06, + "loss": 0.8749, + "step": 5620 + }, + { + "epoch": 1.4949468085106383, + "grad_norm": 3.780111312866211, + "learning_rate": 8.398683309977407e-06, + "loss": 0.739, + "step": 5621 + }, + { + "epoch": 1.4952127659574468, + "grad_norm": 4.050705909729004, + "learning_rate": 8.39803817879919e-06, + "loss": 0.869, + "step": 5622 + }, + { + "epoch": 1.4954787234042553, + "grad_norm": 3.941727876663208, + "learning_rate": 8.39739294248167e-06, + "loss": 0.8147, + "step": 5623 + }, + { + "epoch": 1.4957446808510637, + "grad_norm": 4.117156505584717, + "learning_rate": 8.396747601044812e-06, + "loss": 0.843, + "step": 5624 + }, + { + "epoch": 1.4960106382978724, + "grad_norm": 3.813788890838623, + "learning_rate": 8.396102154508584e-06, + "loss": 0.7214, + "step": 5625 + }, + { + "epoch": 1.4962765957446809, + "grad_norm": 4.435267448425293, + "learning_rate": 8.395456602892957e-06, + "loss": 0.9548, + "step": 5626 + }, + { + "epoch": 1.4965425531914893, + "grad_norm": 4.178934097290039, + "learning_rate": 8.394810946217905e-06, + "loss": 0.797, + "step": 5627 + }, + { + "epoch": 1.496808510638298, + "grad_norm": 4.201347827911377, + "learning_rate": 8.394165184503406e-06, + "loss": 0.8086, + "step": 5628 + }, + { + "epoch": 1.4970744680851062, + "grad_norm": 4.090775489807129, + "learning_rate": 8.39351931776944e-06, + "loss": 0.8206, + "step": 5629 + }, + { + "epoch": 1.497340425531915, + "grad_norm": 3.81706166267395, + "learning_rate": 8.392873346035992e-06, + "loss": 0.7876, + "step": 5630 + }, + { + "epoch": 1.4976063829787234, + "grad_norm": 4.212119102478027, + "learning_rate": 8.392227269323046e-06, + "loss": 0.8634, + "step": 5631 + }, + { + "epoch": 1.4978723404255319, + "grad_norm": 4.333573818206787, + "learning_rate": 8.391581087650596e-06, + "loss": 0.8157, + "step": 5632 + }, + { + "epoch": 1.4981382978723405, + "grad_norm": 4.08198356628418, + "learning_rate": 8.390934801038632e-06, + "loss": 0.8804, + "step": 5633 + }, + { + "epoch": 1.498404255319149, + "grad_norm": 3.6360666751861572, + "learning_rate": 8.390288409507156e-06, + "loss": 0.6327, + "step": 5634 + }, + { + "epoch": 1.4986702127659575, + "grad_norm": 4.428205490112305, + "learning_rate": 8.389641913076163e-06, + "loss": 0.8857, + "step": 5635 + }, + { + "epoch": 1.498936170212766, + "grad_norm": 4.506261825561523, + "learning_rate": 8.388995311765657e-06, + "loss": 0.8376, + "step": 5636 + }, + { + "epoch": 1.4992021276595744, + "grad_norm": 3.7618744373321533, + "learning_rate": 8.388348605595649e-06, + "loss": 0.8656, + "step": 5637 + }, + { + "epoch": 1.499468085106383, + "grad_norm": 3.843425750732422, + "learning_rate": 8.387701794586145e-06, + "loss": 0.7474, + "step": 5638 + }, + { + "epoch": 1.4997340425531915, + "grad_norm": 3.933223009109497, + "learning_rate": 8.387054878757157e-06, + "loss": 0.9316, + "step": 5639 + }, + { + "epoch": 1.5, + "grad_norm": 3.8141305446624756, + "learning_rate": 8.386407858128707e-06, + "loss": 0.7359, + "step": 5640 + }, + { + "epoch": 1.5002659574468085, + "grad_norm": 
4.184633731842041, + "learning_rate": 8.385760732720809e-06, + "loss": 0.8206, + "step": 5641 + }, + { + "epoch": 1.500531914893617, + "grad_norm": 3.9276089668273926, + "learning_rate": 8.385113502553487e-06, + "loss": 0.8148, + "step": 5642 + }, + { + "epoch": 1.5007978723404256, + "grad_norm": 4.084725856781006, + "learning_rate": 8.384466167646768e-06, + "loss": 0.8435, + "step": 5643 + }, + { + "epoch": 1.501063829787234, + "grad_norm": 4.092894077301025, + "learning_rate": 8.383818728020681e-06, + "loss": 0.7876, + "step": 5644 + }, + { + "epoch": 1.5013297872340425, + "grad_norm": 3.6473567485809326, + "learning_rate": 8.383171183695258e-06, + "loss": 0.7427, + "step": 5645 + }, + { + "epoch": 1.5015957446808512, + "grad_norm": 4.224092483520508, + "learning_rate": 8.382523534690537e-06, + "loss": 0.8959, + "step": 5646 + }, + { + "epoch": 1.5018617021276595, + "grad_norm": 4.414750576019287, + "learning_rate": 8.381875781026553e-06, + "loss": 0.746, + "step": 5647 + }, + { + "epoch": 1.5021276595744681, + "grad_norm": 4.199521064758301, + "learning_rate": 8.381227922723353e-06, + "loss": 0.8083, + "step": 5648 + }, + { + "epoch": 1.5023936170212766, + "grad_norm": 3.8716115951538086, + "learning_rate": 8.380579959800981e-06, + "loss": 0.7007, + "step": 5649 + }, + { + "epoch": 1.502659574468085, + "grad_norm": 4.189701080322266, + "learning_rate": 8.379931892279483e-06, + "loss": 0.7694, + "step": 5650 + }, + { + "epoch": 1.5029255319148938, + "grad_norm": 3.577147960662842, + "learning_rate": 8.379283720178913e-06, + "loss": 0.7776, + "step": 5651 + }, + { + "epoch": 1.503191489361702, + "grad_norm": 4.009932994842529, + "learning_rate": 8.378635443519327e-06, + "loss": 0.7633, + "step": 5652 + }, + { + "epoch": 1.5034574468085107, + "grad_norm": 4.129024505615234, + "learning_rate": 8.377987062320782e-06, + "loss": 0.7067, + "step": 5653 + }, + { + "epoch": 1.5037234042553191, + "grad_norm": 3.6017751693725586, + "learning_rate": 8.37733857660334e-06, + "loss": 0.7983, + "step": 5654 + }, + { + "epoch": 1.5039893617021276, + "grad_norm": 3.799006223678589, + "learning_rate": 8.376689986387066e-06, + "loss": 0.8479, + "step": 5655 + }, + { + "epoch": 1.5042553191489363, + "grad_norm": 4.5062575340271, + "learning_rate": 8.376041291692028e-06, + "loss": 0.8298, + "step": 5656 + }, + { + "epoch": 1.5045212765957445, + "grad_norm": 3.729353666305542, + "learning_rate": 8.3753924925383e-06, + "loss": 0.7688, + "step": 5657 + }, + { + "epoch": 1.5047872340425532, + "grad_norm": 4.237773418426514, + "learning_rate": 8.374743588945951e-06, + "loss": 0.9623, + "step": 5658 + }, + { + "epoch": 1.5050531914893617, + "grad_norm": 3.5734505653381348, + "learning_rate": 8.374094580935064e-06, + "loss": 0.6333, + "step": 5659 + }, + { + "epoch": 1.5053191489361701, + "grad_norm": 3.711700677871704, + "learning_rate": 8.373445468525719e-06, + "loss": 0.8401, + "step": 5660 + }, + { + "epoch": 1.5055851063829788, + "grad_norm": 3.8051505088806152, + "learning_rate": 8.372796251737995e-06, + "loss": 0.7845, + "step": 5661 + }, + { + "epoch": 1.5058510638297873, + "grad_norm": 3.983067750930786, + "learning_rate": 8.372146930591988e-06, + "loss": 0.8886, + "step": 5662 + }, + { + "epoch": 1.5061170212765957, + "grad_norm": 3.872107744216919, + "learning_rate": 8.371497505107784e-06, + "loss": 0.8892, + "step": 5663 + }, + { + "epoch": 1.5063829787234042, + "grad_norm": 4.311370849609375, + "learning_rate": 8.370847975305479e-06, + "loss": 0.8369, + "step": 5664 + }, + { + "epoch": 
1.5066489361702127, + "grad_norm": 3.470078706741333, + "learning_rate": 8.370198341205167e-06, + "loss": 0.7035, + "step": 5665 + }, + { + "epoch": 1.5069148936170214, + "grad_norm": 3.7826905250549316, + "learning_rate": 8.369548602826951e-06, + "loss": 0.8478, + "step": 5666 + }, + { + "epoch": 1.5071808510638298, + "grad_norm": 4.1136603355407715, + "learning_rate": 8.368898760190933e-06, + "loss": 0.7812, + "step": 5667 + }, + { + "epoch": 1.5074468085106383, + "grad_norm": 3.856652021408081, + "learning_rate": 8.368248813317221e-06, + "loss": 0.7926, + "step": 5668 + }, + { + "epoch": 1.507712765957447, + "grad_norm": 4.0616865158081055, + "learning_rate": 8.367598762225929e-06, + "loss": 0.7884, + "step": 5669 + }, + { + "epoch": 1.5079787234042552, + "grad_norm": 4.08623743057251, + "learning_rate": 8.366948606937161e-06, + "loss": 0.8499, + "step": 5670 + }, + { + "epoch": 1.508244680851064, + "grad_norm": 4.225100517272949, + "learning_rate": 8.366298347471043e-06, + "loss": 0.8145, + "step": 5671 + }, + { + "epoch": 1.5085106382978724, + "grad_norm": 4.046361923217773, + "learning_rate": 8.36564798384769e-06, + "loss": 0.6879, + "step": 5672 + }, + { + "epoch": 1.5087765957446808, + "grad_norm": 4.1829833984375, + "learning_rate": 8.364997516087224e-06, + "loss": 0.7828, + "step": 5673 + }, + { + "epoch": 1.5090425531914895, + "grad_norm": 3.750427484512329, + "learning_rate": 8.364346944209774e-06, + "loss": 0.7639, + "step": 5674 + }, + { + "epoch": 1.5093085106382977, + "grad_norm": 4.194416522979736, + "learning_rate": 8.36369626823547e-06, + "loss": 0.8308, + "step": 5675 + }, + { + "epoch": 1.5095744680851064, + "grad_norm": 4.148036003112793, + "learning_rate": 8.363045488184443e-06, + "loss": 0.7443, + "step": 5676 + }, + { + "epoch": 1.5098404255319149, + "grad_norm": 3.7398674488067627, + "learning_rate": 8.362394604076827e-06, + "loss": 0.8633, + "step": 5677 + }, + { + "epoch": 1.5101063829787233, + "grad_norm": 3.8514955043792725, + "learning_rate": 8.361743615932765e-06, + "loss": 0.797, + "step": 5678 + }, + { + "epoch": 1.510372340425532, + "grad_norm": 4.254388809204102, + "learning_rate": 8.361092523772396e-06, + "loss": 0.8425, + "step": 5679 + }, + { + "epoch": 1.5106382978723403, + "grad_norm": 4.257145881652832, + "learning_rate": 8.360441327615868e-06, + "loss": 0.7964, + "step": 5680 + }, + { + "epoch": 1.510904255319149, + "grad_norm": 3.9065487384796143, + "learning_rate": 8.35979002748333e-06, + "loss": 0.837, + "step": 5681 + }, + { + "epoch": 1.5111702127659574, + "grad_norm": 4.575162410736084, + "learning_rate": 8.359138623394931e-06, + "loss": 0.9485, + "step": 5682 + }, + { + "epoch": 1.5114361702127659, + "grad_norm": 4.180033206939697, + "learning_rate": 8.35848711537083e-06, + "loss": 0.8287, + "step": 5683 + }, + { + "epoch": 1.5117021276595746, + "grad_norm": 4.284930229187012, + "learning_rate": 8.357835503431182e-06, + "loss": 0.8548, + "step": 5684 + }, + { + "epoch": 1.511968085106383, + "grad_norm": 3.8655450344085693, + "learning_rate": 8.357183787596151e-06, + "loss": 0.7792, + "step": 5685 + }, + { + "epoch": 1.5122340425531915, + "grad_norm": 3.840792655944824, + "learning_rate": 8.356531967885899e-06, + "loss": 0.7953, + "step": 5686 + }, + { + "epoch": 1.5125, + "grad_norm": 3.675896406173706, + "learning_rate": 8.355880044320599e-06, + "loss": 0.7667, + "step": 5687 + }, + { + "epoch": 1.5127659574468084, + "grad_norm": 3.6345510482788086, + "learning_rate": 8.355228016920417e-06, + "loss": 0.8588, + "step": 5688 + }, + { + 
"epoch": 1.513031914893617, + "grad_norm": 3.8645408153533936, + "learning_rate": 8.354575885705532e-06, + "loss": 0.862, + "step": 5689 + }, + { + "epoch": 1.5132978723404256, + "grad_norm": 4.727093696594238, + "learning_rate": 8.353923650696119e-06, + "loss": 0.8419, + "step": 5690 + }, + { + "epoch": 1.513563829787234, + "grad_norm": 4.074021816253662, + "learning_rate": 8.353271311912357e-06, + "loss": 0.7486, + "step": 5691 + }, + { + "epoch": 1.5138297872340427, + "grad_norm": 3.9446327686309814, + "learning_rate": 8.352618869374435e-06, + "loss": 0.7721, + "step": 5692 + }, + { + "epoch": 1.514095744680851, + "grad_norm": 3.839276075363159, + "learning_rate": 8.351966323102538e-06, + "loss": 0.7744, + "step": 5693 + }, + { + "epoch": 1.5143617021276596, + "grad_norm": 4.190333366394043, + "learning_rate": 8.351313673116856e-06, + "loss": 0.8085, + "step": 5694 + }, + { + "epoch": 1.514627659574468, + "grad_norm": 3.8334741592407227, + "learning_rate": 8.350660919437585e-06, + "loss": 0.933, + "step": 5695 + }, + { + "epoch": 1.5148936170212766, + "grad_norm": 3.766174793243408, + "learning_rate": 8.350008062084918e-06, + "loss": 0.7537, + "step": 5696 + }, + { + "epoch": 1.5151595744680852, + "grad_norm": 4.281386852264404, + "learning_rate": 8.349355101079058e-06, + "loss": 0.8714, + "step": 5697 + }, + { + "epoch": 1.5154255319148935, + "grad_norm": 3.8533146381378174, + "learning_rate": 8.348702036440209e-06, + "loss": 0.8423, + "step": 5698 + }, + { + "epoch": 1.5156914893617022, + "grad_norm": 4.271562099456787, + "learning_rate": 8.348048868188574e-06, + "loss": 0.9832, + "step": 5699 + }, + { + "epoch": 1.5159574468085106, + "grad_norm": 4.475942611694336, + "learning_rate": 8.347395596344365e-06, + "loss": 0.9984, + "step": 5700 + }, + { + "epoch": 1.516223404255319, + "grad_norm": 4.308716773986816, + "learning_rate": 8.346742220927798e-06, + "loss": 0.8947, + "step": 5701 + }, + { + "epoch": 1.5164893617021278, + "grad_norm": 4.1707587242126465, + "learning_rate": 8.346088741959085e-06, + "loss": 0.9077, + "step": 5702 + }, + { + "epoch": 1.516755319148936, + "grad_norm": 4.016225337982178, + "learning_rate": 8.345435159458445e-06, + "loss": 0.9186, + "step": 5703 + }, + { + "epoch": 1.5170212765957447, + "grad_norm": 4.131173133850098, + "learning_rate": 8.344781473446106e-06, + "loss": 0.708, + "step": 5704 + }, + { + "epoch": 1.5172872340425532, + "grad_norm": 4.118223667144775, + "learning_rate": 8.344127683942289e-06, + "loss": 0.815, + "step": 5705 + }, + { + "epoch": 1.5175531914893616, + "grad_norm": 4.08048677444458, + "learning_rate": 8.343473790967223e-06, + "loss": 0.7402, + "step": 5706 + }, + { + "epoch": 1.5178191489361703, + "grad_norm": 4.256683826446533, + "learning_rate": 8.342819794541143e-06, + "loss": 0.9272, + "step": 5707 + }, + { + "epoch": 1.5180851063829788, + "grad_norm": 3.6859428882598877, + "learning_rate": 8.34216569468428e-06, + "loss": 0.8052, + "step": 5708 + }, + { + "epoch": 1.5183510638297872, + "grad_norm": 4.601988315582275, + "learning_rate": 8.341511491416877e-06, + "loss": 0.7638, + "step": 5709 + }, + { + "epoch": 1.5186170212765957, + "grad_norm": 3.8631575107574463, + "learning_rate": 8.340857184759178e-06, + "loss": 0.8282, + "step": 5710 + }, + { + "epoch": 1.5188829787234042, + "grad_norm": 4.184502124786377, + "learning_rate": 8.34020277473142e-06, + "loss": 0.8513, + "step": 5711 + }, + { + "epoch": 1.5191489361702128, + "grad_norm": 3.9446780681610107, + "learning_rate": 8.339548261353856e-06, + "loss": 0.6634, + "step": 
5712 + }, + { + "epoch": 1.5194148936170213, + "grad_norm": 3.9360363483428955, + "learning_rate": 8.338893644646739e-06, + "loss": 0.7769, + "step": 5713 + }, + { + "epoch": 1.5196808510638298, + "grad_norm": 3.235274314880371, + "learning_rate": 8.33823892463032e-06, + "loss": 0.7531, + "step": 5714 + }, + { + "epoch": 1.5199468085106385, + "grad_norm": 3.941875696182251, + "learning_rate": 8.337584101324859e-06, + "loss": 0.7937, + "step": 5715 + }, + { + "epoch": 1.5202127659574467, + "grad_norm": 3.7710206508636475, + "learning_rate": 8.336929174750616e-06, + "loss": 0.8403, + "step": 5716 + }, + { + "epoch": 1.5204787234042554, + "grad_norm": 4.109030246734619, + "learning_rate": 8.336274144927855e-06, + "loss": 0.6704, + "step": 5717 + }, + { + "epoch": 1.5207446808510638, + "grad_norm": 3.7918636798858643, + "learning_rate": 8.335619011876846e-06, + "loss": 0.7756, + "step": 5718 + }, + { + "epoch": 1.5210106382978723, + "grad_norm": 3.633254051208496, + "learning_rate": 8.334963775617854e-06, + "loss": 0.7325, + "step": 5719 + }, + { + "epoch": 1.521276595744681, + "grad_norm": 3.994147539138794, + "learning_rate": 8.334308436171159e-06, + "loss": 0.8936, + "step": 5720 + }, + { + "epoch": 1.5215425531914892, + "grad_norm": 3.5977087020874023, + "learning_rate": 8.333652993557035e-06, + "loss": 0.8429, + "step": 5721 + }, + { + "epoch": 1.521808510638298, + "grad_norm": 3.7515316009521484, + "learning_rate": 8.332997447795763e-06, + "loss": 0.8329, + "step": 5722 + }, + { + "epoch": 1.5220744680851064, + "grad_norm": 3.969116449356079, + "learning_rate": 8.332341798907624e-06, + "loss": 0.804, + "step": 5723 + }, + { + "epoch": 1.5223404255319148, + "grad_norm": 3.915306329727173, + "learning_rate": 8.331686046912908e-06, + "loss": 0.9369, + "step": 5724 + }, + { + "epoch": 1.5226063829787235, + "grad_norm": 3.7423787117004395, + "learning_rate": 8.331030191831904e-06, + "loss": 0.8416, + "step": 5725 + }, + { + "epoch": 1.5228723404255318, + "grad_norm": 3.554068088531494, + "learning_rate": 8.3303742336849e-06, + "loss": 0.7121, + "step": 5726 + }, + { + "epoch": 1.5231382978723405, + "grad_norm": 4.019564628601074, + "learning_rate": 8.3297181724922e-06, + "loss": 0.7882, + "step": 5727 + }, + { + "epoch": 1.523404255319149, + "grad_norm": 4.351405143737793, + "learning_rate": 8.3290620082741e-06, + "loss": 0.8769, + "step": 5728 + }, + { + "epoch": 1.5236702127659574, + "grad_norm": 3.942936658859253, + "learning_rate": 8.328405741050901e-06, + "loss": 0.924, + "step": 5729 + }, + { + "epoch": 1.523936170212766, + "grad_norm": 4.362167835235596, + "learning_rate": 8.327749370842909e-06, + "loss": 0.8015, + "step": 5730 + }, + { + "epoch": 1.5242021276595743, + "grad_norm": 3.7932353019714355, + "learning_rate": 8.327092897670432e-06, + "loss": 0.7993, + "step": 5731 + }, + { + "epoch": 1.524468085106383, + "grad_norm": 3.8214194774627686, + "learning_rate": 8.326436321553785e-06, + "loss": 0.7971, + "step": 5732 + }, + { + "epoch": 1.5247340425531914, + "grad_norm": 4.244415760040283, + "learning_rate": 8.325779642513283e-06, + "loss": 0.7253, + "step": 5733 + }, + { + "epoch": 1.525, + "grad_norm": 4.184083938598633, + "learning_rate": 8.325122860569241e-06, + "loss": 0.7849, + "step": 5734 + }, + { + "epoch": 1.5252659574468086, + "grad_norm": 4.359492301940918, + "learning_rate": 8.324465975741986e-06, + "loss": 0.8228, + "step": 5735 + }, + { + "epoch": 1.525531914893617, + "grad_norm": 3.8751020431518555, + "learning_rate": 8.323808988051837e-06, + "loss": 0.7288, + 
"step": 5736 + }, + { + "epoch": 1.5257978723404255, + "grad_norm": 4.366562843322754, + "learning_rate": 8.323151897519126e-06, + "loss": 0.8452, + "step": 5737 + }, + { + "epoch": 1.5260638297872342, + "grad_norm": 4.116846561431885, + "learning_rate": 8.322494704164182e-06, + "loss": 0.9376, + "step": 5738 + }, + { + "epoch": 1.5263297872340424, + "grad_norm": 4.062334060668945, + "learning_rate": 8.321837408007341e-06, + "loss": 0.855, + "step": 5739 + }, + { + "epoch": 1.5265957446808511, + "grad_norm": 4.4059014320373535, + "learning_rate": 8.321180009068937e-06, + "loss": 0.8832, + "step": 5740 + }, + { + "epoch": 1.5268617021276596, + "grad_norm": 4.124050140380859, + "learning_rate": 8.320522507369315e-06, + "loss": 0.7446, + "step": 5741 + }, + { + "epoch": 1.527127659574468, + "grad_norm": 3.721942901611328, + "learning_rate": 8.319864902928819e-06, + "loss": 0.8547, + "step": 5742 + }, + { + "epoch": 1.5273936170212767, + "grad_norm": 3.816612720489502, + "learning_rate": 8.31920719576779e-06, + "loss": 0.8478, + "step": 5743 + }, + { + "epoch": 1.527659574468085, + "grad_norm": 4.217785835266113, + "learning_rate": 8.318549385906587e-06, + "loss": 0.8573, + "step": 5744 + }, + { + "epoch": 1.5279255319148937, + "grad_norm": 4.105627536773682, + "learning_rate": 8.317891473365558e-06, + "loss": 0.8891, + "step": 5745 + }, + { + "epoch": 1.5281914893617021, + "grad_norm": 4.537158966064453, + "learning_rate": 8.317233458165059e-06, + "loss": 0.9119, + "step": 5746 + }, + { + "epoch": 1.5284574468085106, + "grad_norm": 4.287096977233887, + "learning_rate": 8.31657534032545e-06, + "loss": 0.8465, + "step": 5747 + }, + { + "epoch": 1.5287234042553193, + "grad_norm": 4.125601291656494, + "learning_rate": 8.315917119867098e-06, + "loss": 0.7537, + "step": 5748 + }, + { + "epoch": 1.5289893617021275, + "grad_norm": 4.014163017272949, + "learning_rate": 8.315258796810366e-06, + "loss": 0.7572, + "step": 5749 + }, + { + "epoch": 1.5292553191489362, + "grad_norm": 3.912703514099121, + "learning_rate": 8.314600371175623e-06, + "loss": 0.7825, + "step": 5750 + }, + { + "epoch": 1.5295212765957447, + "grad_norm": 3.731410264968872, + "learning_rate": 8.313941842983243e-06, + "loss": 0.9015, + "step": 5751 + }, + { + "epoch": 1.5297872340425531, + "grad_norm": 4.122485160827637, + "learning_rate": 8.313283212253598e-06, + "loss": 0.8381, + "step": 5752 + }, + { + "epoch": 1.5300531914893618, + "grad_norm": 4.2268757820129395, + "learning_rate": 8.312624479007072e-06, + "loss": 0.788, + "step": 5753 + }, + { + "epoch": 1.53031914893617, + "grad_norm": 4.129693508148193, + "learning_rate": 8.311965643264042e-06, + "loss": 0.6951, + "step": 5754 + }, + { + "epoch": 1.5305851063829787, + "grad_norm": 4.038047790527344, + "learning_rate": 8.311306705044898e-06, + "loss": 0.834, + "step": 5755 + }, + { + "epoch": 1.5308510638297872, + "grad_norm": 3.85589599609375, + "learning_rate": 8.310647664370026e-06, + "loss": 0.8583, + "step": 5756 + }, + { + "epoch": 1.5311170212765957, + "grad_norm": 3.889176845550537, + "learning_rate": 8.309988521259816e-06, + "loss": 0.8361, + "step": 5757 + }, + { + "epoch": 1.5313829787234043, + "grad_norm": 4.0538458824157715, + "learning_rate": 8.309329275734664e-06, + "loss": 0.6951, + "step": 5758 + }, + { + "epoch": 1.5316489361702128, + "grad_norm": 4.010767936706543, + "learning_rate": 8.30866992781497e-06, + "loss": 0.8313, + "step": 5759 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 3.897259473800659, + "learning_rate": 8.30801047752113e-06, + 
"loss": 0.7736, + "step": 5760 + }, + { + "epoch": 1.53218085106383, + "grad_norm": 4.07016134262085, + "learning_rate": 8.307350924873553e-06, + "loss": 0.8231, + "step": 5761 + }, + { + "epoch": 1.5324468085106382, + "grad_norm": 3.886470317840576, + "learning_rate": 8.306691269892646e-06, + "loss": 0.8535, + "step": 5762 + }, + { + "epoch": 1.5327127659574469, + "grad_norm": 3.458498477935791, + "learning_rate": 8.306031512598815e-06, + "loss": 0.7291, + "step": 5763 + }, + { + "epoch": 1.5329787234042553, + "grad_norm": 3.6657865047454834, + "learning_rate": 8.305371653012479e-06, + "loss": 0.8239, + "step": 5764 + }, + { + "epoch": 1.5332446808510638, + "grad_norm": 4.054435729980469, + "learning_rate": 8.304711691154052e-06, + "loss": 0.7947, + "step": 5765 + }, + { + "epoch": 1.5335106382978725, + "grad_norm": 4.395258903503418, + "learning_rate": 8.304051627043952e-06, + "loss": 0.8615, + "step": 5766 + }, + { + "epoch": 1.5337765957446807, + "grad_norm": 4.212094306945801, + "learning_rate": 8.303391460702607e-06, + "loss": 0.7645, + "step": 5767 + }, + { + "epoch": 1.5340425531914894, + "grad_norm": 4.2090044021606445, + "learning_rate": 8.302731192150441e-06, + "loss": 0.8463, + "step": 5768 + }, + { + "epoch": 1.5343085106382979, + "grad_norm": 3.734283685684204, + "learning_rate": 8.302070821407882e-06, + "loss": 0.7986, + "step": 5769 + }, + { + "epoch": 1.5345744680851063, + "grad_norm": 4.0931291580200195, + "learning_rate": 8.301410348495366e-06, + "loss": 0.7541, + "step": 5770 + }, + { + "epoch": 1.534840425531915, + "grad_norm": 3.604841470718384, + "learning_rate": 8.300749773433325e-06, + "loss": 0.8511, + "step": 5771 + }, + { + "epoch": 1.5351063829787233, + "grad_norm": 3.881558895111084, + "learning_rate": 8.300089096242201e-06, + "loss": 0.7382, + "step": 5772 + }, + { + "epoch": 1.535372340425532, + "grad_norm": 3.472681760787964, + "learning_rate": 8.299428316942435e-06, + "loss": 0.7106, + "step": 5773 + }, + { + "epoch": 1.5356382978723404, + "grad_norm": 3.5763661861419678, + "learning_rate": 8.298767435554473e-06, + "loss": 0.6924, + "step": 5774 + }, + { + "epoch": 1.5359042553191489, + "grad_norm": 3.965982437133789, + "learning_rate": 8.298106452098761e-06, + "loss": 0.8163, + "step": 5775 + }, + { + "epoch": 1.5361702127659576, + "grad_norm": 3.9243502616882324, + "learning_rate": 8.297445366595754e-06, + "loss": 0.8372, + "step": 5776 + }, + { + "epoch": 1.5364361702127658, + "grad_norm": 3.8713953495025635, + "learning_rate": 8.296784179065904e-06, + "loss": 0.7919, + "step": 5777 + }, + { + "epoch": 1.5367021276595745, + "grad_norm": 3.7591898441314697, + "learning_rate": 8.29612288952967e-06, + "loss": 0.8597, + "step": 5778 + }, + { + "epoch": 1.536968085106383, + "grad_norm": 4.25253438949585, + "learning_rate": 8.295461498007513e-06, + "loss": 1.0482, + "step": 5779 + }, + { + "epoch": 1.5372340425531914, + "grad_norm": 3.846035957336426, + "learning_rate": 8.294800004519895e-06, + "loss": 0.8348, + "step": 5780 + }, + { + "epoch": 1.5375, + "grad_norm": 3.652987003326416, + "learning_rate": 8.29413840908729e-06, + "loss": 0.7409, + "step": 5781 + }, + { + "epoch": 1.5377659574468086, + "grad_norm": 4.131805419921875, + "learning_rate": 8.293476711730163e-06, + "loss": 0.8703, + "step": 5782 + }, + { + "epoch": 1.538031914893617, + "grad_norm": 4.142578125, + "learning_rate": 8.292814912468988e-06, + "loss": 0.881, + "step": 5783 + }, + { + "epoch": 1.5382978723404257, + "grad_norm": 3.5386013984680176, + "learning_rate": 8.292153011324242e-06, + 
"loss": 0.7984, + "step": 5784 + }, + { + "epoch": 1.538563829787234, + "grad_norm": 4.26931619644165, + "learning_rate": 8.291491008316409e-06, + "loss": 0.8968, + "step": 5785 + }, + { + "epoch": 1.5388297872340426, + "grad_norm": 4.214763164520264, + "learning_rate": 8.290828903465965e-06, + "loss": 0.7912, + "step": 5786 + }, + { + "epoch": 1.539095744680851, + "grad_norm": 4.008779525756836, + "learning_rate": 8.290166696793405e-06, + "loss": 0.8708, + "step": 5787 + }, + { + "epoch": 1.5393617021276595, + "grad_norm": 3.722784996032715, + "learning_rate": 8.28950438831921e-06, + "loss": 0.8047, + "step": 5788 + }, + { + "epoch": 1.5396276595744682, + "grad_norm": 3.9850144386291504, + "learning_rate": 8.288841978063877e-06, + "loss": 0.8583, + "step": 5789 + }, + { + "epoch": 1.5398936170212765, + "grad_norm": 3.7640953063964844, + "learning_rate": 8.288179466047903e-06, + "loss": 0.899, + "step": 5790 + }, + { + "epoch": 1.5401595744680852, + "grad_norm": 3.9535369873046875, + "learning_rate": 8.287516852291784e-06, + "loss": 0.671, + "step": 5791 + }, + { + "epoch": 1.5404255319148936, + "grad_norm": 3.784611940383911, + "learning_rate": 8.28685413681602e-06, + "loss": 0.955, + "step": 5792 + }, + { + "epoch": 1.540691489361702, + "grad_norm": 4.205324172973633, + "learning_rate": 8.286191319641123e-06, + "loss": 0.8411, + "step": 5793 + }, + { + "epoch": 1.5409574468085108, + "grad_norm": 4.253503322601318, + "learning_rate": 8.285528400787597e-06, + "loss": 0.7707, + "step": 5794 + }, + { + "epoch": 1.541223404255319, + "grad_norm": 3.7679977416992188, + "learning_rate": 8.284865380275953e-06, + "loss": 0.9103, + "step": 5795 + }, + { + "epoch": 1.5414893617021277, + "grad_norm": 4.094081878662109, + "learning_rate": 8.284202258126706e-06, + "loss": 0.9798, + "step": 5796 + }, + { + "epoch": 1.5417553191489362, + "grad_norm": 4.189050674438477, + "learning_rate": 8.283539034360376e-06, + "loss": 0.8641, + "step": 5797 + }, + { + "epoch": 1.5420212765957446, + "grad_norm": 4.017099857330322, + "learning_rate": 8.282875708997482e-06, + "loss": 0.8214, + "step": 5798 + }, + { + "epoch": 1.5422872340425533, + "grad_norm": 3.6189417839050293, + "learning_rate": 8.282212282058549e-06, + "loss": 0.7486, + "step": 5799 + }, + { + "epoch": 1.5425531914893615, + "grad_norm": 4.480672359466553, + "learning_rate": 8.281548753564101e-06, + "loss": 0.9041, + "step": 5800 + }, + { + "epoch": 1.5428191489361702, + "grad_norm": 4.047300338745117, + "learning_rate": 8.280885123534673e-06, + "loss": 0.9519, + "step": 5801 + }, + { + "epoch": 1.5430851063829787, + "grad_norm": 4.379581928253174, + "learning_rate": 8.280221391990797e-06, + "loss": 0.9203, + "step": 5802 + }, + { + "epoch": 1.5433510638297872, + "grad_norm": 4.053439140319824, + "learning_rate": 8.279557558953009e-06, + "loss": 0.7759, + "step": 5803 + }, + { + "epoch": 1.5436170212765958, + "grad_norm": 3.927568197250366, + "learning_rate": 8.278893624441849e-06, + "loss": 0.7132, + "step": 5804 + }, + { + "epoch": 1.5438829787234043, + "grad_norm": 4.322382926940918, + "learning_rate": 8.278229588477857e-06, + "loss": 0.8272, + "step": 5805 + }, + { + "epoch": 1.5441489361702128, + "grad_norm": 3.6044352054595947, + "learning_rate": 8.277565451081587e-06, + "loss": 0.7487, + "step": 5806 + }, + { + "epoch": 1.5444148936170212, + "grad_norm": 3.7423501014709473, + "learning_rate": 8.27690121227358e-06, + "loss": 0.7342, + "step": 5807 + }, + { + "epoch": 1.5446808510638297, + "grad_norm": 3.7679383754730225, + "learning_rate": 
8.27623687207439e-06, + "loss": 0.7897, + "step": 5808 + }, + { + "epoch": 1.5449468085106384, + "grad_norm": 3.7263903617858887, + "learning_rate": 8.275572430504578e-06, + "loss": 0.8311, + "step": 5809 + }, + { + "epoch": 1.5452127659574468, + "grad_norm": 3.551025390625, + "learning_rate": 8.274907887584695e-06, + "loss": 0.6916, + "step": 5810 + }, + { + "epoch": 1.5454787234042553, + "grad_norm": 3.8874595165252686, + "learning_rate": 8.274243243335307e-06, + "loss": 0.8246, + "step": 5811 + }, + { + "epoch": 1.545744680851064, + "grad_norm": 3.7710976600646973, + "learning_rate": 8.27357849777698e-06, + "loss": 0.8668, + "step": 5812 + }, + { + "epoch": 1.5460106382978722, + "grad_norm": 4.312849044799805, + "learning_rate": 8.272913650930277e-06, + "loss": 0.9206, + "step": 5813 + }, + { + "epoch": 1.546276595744681, + "grad_norm": 4.059734344482422, + "learning_rate": 8.272248702815776e-06, + "loss": 0.77, + "step": 5814 + }, + { + "epoch": 1.5465425531914894, + "grad_norm": 3.781832456588745, + "learning_rate": 8.271583653454046e-06, + "loss": 0.7643, + "step": 5815 + }, + { + "epoch": 1.5468085106382978, + "grad_norm": 3.607161045074463, + "learning_rate": 8.270918502865663e-06, + "loss": 0.7721, + "step": 5816 + }, + { + "epoch": 1.5470744680851065, + "grad_norm": 3.986572504043579, + "learning_rate": 8.270253251071214e-06, + "loss": 0.6967, + "step": 5817 + }, + { + "epoch": 1.5473404255319148, + "grad_norm": 3.9674570560455322, + "learning_rate": 8.269587898091277e-06, + "loss": 0.7986, + "step": 5818 + }, + { + "epoch": 1.5476063829787234, + "grad_norm": 3.794405698776245, + "learning_rate": 8.268922443946444e-06, + "loss": 0.7897, + "step": 5819 + }, + { + "epoch": 1.547872340425532, + "grad_norm": 3.5226500034332275, + "learning_rate": 8.2682568886573e-06, + "loss": 0.7474, + "step": 5820 + }, + { + "epoch": 1.5481382978723404, + "grad_norm": 3.692884922027588, + "learning_rate": 8.267591232244439e-06, + "loss": 0.9286, + "step": 5821 + }, + { + "epoch": 1.548404255319149, + "grad_norm": 4.193415641784668, + "learning_rate": 8.266925474728459e-06, + "loss": 0.7917, + "step": 5822 + }, + { + "epoch": 1.5486702127659573, + "grad_norm": 3.877485752105713, + "learning_rate": 8.266259616129959e-06, + "loss": 0.8366, + "step": 5823 + }, + { + "epoch": 1.548936170212766, + "grad_norm": 3.8126795291900635, + "learning_rate": 8.26559365646954e-06, + "loss": 0.7591, + "step": 5824 + }, + { + "epoch": 1.5492021276595744, + "grad_norm": 4.233253479003906, + "learning_rate": 8.264927595767808e-06, + "loss": 0.8596, + "step": 5825 + }, + { + "epoch": 1.549468085106383, + "grad_norm": 4.092543601989746, + "learning_rate": 8.264261434045374e-06, + "loss": 0.7732, + "step": 5826 + }, + { + "epoch": 1.5497340425531916, + "grad_norm": 4.047788619995117, + "learning_rate": 8.263595171322847e-06, + "loss": 0.8763, + "step": 5827 + }, + { + "epoch": 1.55, + "grad_norm": 3.990832805633545, + "learning_rate": 8.262928807620843e-06, + "loss": 0.8313, + "step": 5828 + }, + { + "epoch": 1.5502659574468085, + "grad_norm": 3.948673725128174, + "learning_rate": 8.262262342959981e-06, + "loss": 0.8937, + "step": 5829 + }, + { + "epoch": 1.550531914893617, + "grad_norm": 4.302928924560547, + "learning_rate": 8.261595777360881e-06, + "loss": 0.7945, + "step": 5830 + }, + { + "epoch": 1.5507978723404254, + "grad_norm": 3.8130292892456055, + "learning_rate": 8.260929110844166e-06, + "loss": 0.7971, + "step": 5831 + }, + { + "epoch": 1.5510638297872341, + "grad_norm": 3.7944552898406982, + "learning_rate": 
8.260262343430468e-06, + "loss": 0.7268, + "step": 5832 + }, + { + "epoch": 1.5513297872340426, + "grad_norm": 3.765657424926758, + "learning_rate": 8.259595475140412e-06, + "loss": 0.7289, + "step": 5833 + }, + { + "epoch": 1.551595744680851, + "grad_norm": 4.215806484222412, + "learning_rate": 8.258928505994635e-06, + "loss": 0.8254, + "step": 5834 + }, + { + "epoch": 1.5518617021276597, + "grad_norm": 3.7282323837280273, + "learning_rate": 8.258261436013774e-06, + "loss": 0.8426, + "step": 5835 + }, + { + "epoch": 1.552127659574468, + "grad_norm": 4.05489444732666, + "learning_rate": 8.257594265218468e-06, + "loss": 0.832, + "step": 5836 + }, + { + "epoch": 1.5523936170212767, + "grad_norm": 4.3416666984558105, + "learning_rate": 8.256926993629358e-06, + "loss": 0.844, + "step": 5837 + }, + { + "epoch": 1.5526595744680851, + "grad_norm": 4.158813953399658, + "learning_rate": 8.256259621267095e-06, + "loss": 0.7328, + "step": 5838 + }, + { + "epoch": 1.5529255319148936, + "grad_norm": 4.071340560913086, + "learning_rate": 8.255592148152325e-06, + "loss": 0.7983, + "step": 5839 + }, + { + "epoch": 1.5531914893617023, + "grad_norm": 3.988938093185425, + "learning_rate": 8.254924574305698e-06, + "loss": 0.7863, + "step": 5840 + }, + { + "epoch": 1.5534574468085105, + "grad_norm": 3.8350539207458496, + "learning_rate": 8.254256899747876e-06, + "loss": 0.8347, + "step": 5841 + }, + { + "epoch": 1.5537234042553192, + "grad_norm": 3.7759451866149902, + "learning_rate": 8.253589124499513e-06, + "loss": 0.7486, + "step": 5842 + }, + { + "epoch": 1.5539893617021276, + "grad_norm": 4.114711284637451, + "learning_rate": 8.252921248581272e-06, + "loss": 0.8939, + "step": 5843 + }, + { + "epoch": 1.554255319148936, + "grad_norm": 4.071899890899658, + "learning_rate": 8.252253272013816e-06, + "loss": 0.7912, + "step": 5844 + }, + { + "epoch": 1.5545212765957448, + "grad_norm": 3.5732295513153076, + "learning_rate": 8.251585194817816e-06, + "loss": 0.7897, + "step": 5845 + }, + { + "epoch": 1.554787234042553, + "grad_norm": 3.884356737136841, + "learning_rate": 8.250917017013943e-06, + "loss": 0.8328, + "step": 5846 + }, + { + "epoch": 1.5550531914893617, + "grad_norm": 4.147099018096924, + "learning_rate": 8.250248738622868e-06, + "loss": 0.8425, + "step": 5847 + }, + { + "epoch": 1.5553191489361702, + "grad_norm": 4.285495758056641, + "learning_rate": 8.249580359665272e-06, + "loss": 0.9088, + "step": 5848 + }, + { + "epoch": 1.5555851063829786, + "grad_norm": 3.903362512588501, + "learning_rate": 8.248911880161832e-06, + "loss": 0.8711, + "step": 5849 + }, + { + "epoch": 1.5558510638297873, + "grad_norm": 3.910297155380249, + "learning_rate": 8.248243300133236e-06, + "loss": 0.8571, + "step": 5850 + }, + { + "epoch": 1.5561170212765958, + "grad_norm": 3.7283291816711426, + "learning_rate": 8.247574619600165e-06, + "loss": 0.8114, + "step": 5851 + }, + { + "epoch": 1.5563829787234043, + "grad_norm": 4.2508864402771, + "learning_rate": 8.246905838583315e-06, + "loss": 0.8498, + "step": 5852 + }, + { + "epoch": 1.5566489361702127, + "grad_norm": 3.5398671627044678, + "learning_rate": 8.246236957103374e-06, + "loss": 0.7013, + "step": 5853 + }, + { + "epoch": 1.5569148936170212, + "grad_norm": 3.609945297241211, + "learning_rate": 8.245567975181037e-06, + "loss": 0.7113, + "step": 5854 + }, + { + "epoch": 1.5571808510638299, + "grad_norm": 3.550767660140991, + "learning_rate": 8.244898892837009e-06, + "loss": 0.753, + "step": 5855 + }, + { + "epoch": 1.5574468085106383, + "grad_norm": 4.197300434112549, 
+ "learning_rate": 8.244229710091986e-06, + "loss": 0.7006, + "step": 5856 + }, + { + "epoch": 1.5577127659574468, + "grad_norm": 3.916386842727661, + "learning_rate": 8.243560426966678e-06, + "loss": 0.7071, + "step": 5857 + }, + { + "epoch": 1.5579787234042555, + "grad_norm": 4.1130218505859375, + "learning_rate": 8.242891043481793e-06, + "loss": 0.8622, + "step": 5858 + }, + { + "epoch": 1.5582446808510637, + "grad_norm": 3.9336955547332764, + "learning_rate": 8.242221559658039e-06, + "loss": 0.7626, + "step": 5859 + }, + { + "epoch": 1.5585106382978724, + "grad_norm": 4.237149715423584, + "learning_rate": 8.241551975516133e-06, + "loss": 0.8566, + "step": 5860 + }, + { + "epoch": 1.5587765957446809, + "grad_norm": 4.12725305557251, + "learning_rate": 8.240882291076794e-06, + "loss": 0.7879, + "step": 5861 + }, + { + "epoch": 1.5590425531914893, + "grad_norm": 4.043492794036865, + "learning_rate": 8.240212506360738e-06, + "loss": 0.8772, + "step": 5862 + }, + { + "epoch": 1.559308510638298, + "grad_norm": 3.8735363483428955, + "learning_rate": 8.239542621388696e-06, + "loss": 0.9265, + "step": 5863 + }, + { + "epoch": 1.5595744680851062, + "grad_norm": 4.195898056030273, + "learning_rate": 8.23887263618139e-06, + "loss": 0.9022, + "step": 5864 + }, + { + "epoch": 1.559840425531915, + "grad_norm": 3.4813778400421143, + "learning_rate": 8.23820255075955e-06, + "loss": 0.7605, + "step": 5865 + }, + { + "epoch": 1.5601063829787234, + "grad_norm": 3.5564541816711426, + "learning_rate": 8.237532365143909e-06, + "loss": 0.7148, + "step": 5866 + }, + { + "epoch": 1.5603723404255319, + "grad_norm": 4.291294097900391, + "learning_rate": 8.236862079355208e-06, + "loss": 1.022, + "step": 5867 + }, + { + "epoch": 1.5606382978723405, + "grad_norm": 3.761632204055786, + "learning_rate": 8.236191693414184e-06, + "loss": 0.8673, + "step": 5868 + }, + { + "epoch": 1.5609042553191488, + "grad_norm": 3.8336169719696045, + "learning_rate": 8.235521207341577e-06, + "loss": 0.7979, + "step": 5869 + }, + { + "epoch": 1.5611702127659575, + "grad_norm": 3.8964157104492188, + "learning_rate": 8.234850621158135e-06, + "loss": 0.7466, + "step": 5870 + }, + { + "epoch": 1.561436170212766, + "grad_norm": 3.8827109336853027, + "learning_rate": 8.234179934884605e-06, + "loss": 0.953, + "step": 5871 + }, + { + "epoch": 1.5617021276595744, + "grad_norm": 4.318760395050049, + "learning_rate": 8.23350914854174e-06, + "loss": 0.8975, + "step": 5872 + }, + { + "epoch": 1.561968085106383, + "grad_norm": 3.927676200866699, + "learning_rate": 8.232838262150298e-06, + "loss": 0.8148, + "step": 5873 + }, + { + "epoch": 1.5622340425531915, + "grad_norm": 4.160933017730713, + "learning_rate": 8.23216727573103e-06, + "loss": 0.7736, + "step": 5874 + }, + { + "epoch": 1.5625, + "grad_norm": 4.034573078155518, + "learning_rate": 8.231496189304704e-06, + "loss": 0.7754, + "step": 5875 + }, + { + "epoch": 1.5627659574468085, + "grad_norm": 4.033196926116943, + "learning_rate": 8.230825002892081e-06, + "loss": 0.8588, + "step": 5876 + }, + { + "epoch": 1.563031914893617, + "grad_norm": 3.949902057647705, + "learning_rate": 8.23015371651393e-06, + "loss": 0.8279, + "step": 5877 + }, + { + "epoch": 1.5632978723404256, + "grad_norm": 3.8417794704437256, + "learning_rate": 8.229482330191016e-06, + "loss": 0.7201, + "step": 5878 + }, + { + "epoch": 1.563563829787234, + "grad_norm": 3.836516857147217, + "learning_rate": 8.22881084394412e-06, + "loss": 0.9244, + "step": 5879 + }, + { + "epoch": 1.5638297872340425, + "grad_norm": 
3.882302761077881, + "learning_rate": 8.228139257794012e-06, + "loss": 0.7944, + "step": 5880 + }, + { + "epoch": 1.5640957446808512, + "grad_norm": 4.163621425628662, + "learning_rate": 8.227467571761478e-06, + "loss": 0.7916, + "step": 5881 + }, + { + "epoch": 1.5643617021276595, + "grad_norm": 3.8937926292419434, + "learning_rate": 8.226795785867294e-06, + "loss": 0.7165, + "step": 5882 + }, + { + "epoch": 1.5646276595744681, + "grad_norm": 4.019950866699219, + "learning_rate": 8.226123900132252e-06, + "loss": 0.8444, + "step": 5883 + }, + { + "epoch": 1.5648936170212766, + "grad_norm": 3.9146535396575928, + "learning_rate": 8.225451914577137e-06, + "loss": 0.7472, + "step": 5884 + }, + { + "epoch": 1.565159574468085, + "grad_norm": 4.430140018463135, + "learning_rate": 8.224779829222742e-06, + "loss": 0.8139, + "step": 5885 + }, + { + "epoch": 1.5654255319148938, + "grad_norm": 3.8101890087127686, + "learning_rate": 8.224107644089863e-06, + "loss": 0.8198, + "step": 5886 + }, + { + "epoch": 1.565691489361702, + "grad_norm": 3.603240966796875, + "learning_rate": 8.223435359199297e-06, + "loss": 0.7507, + "step": 5887 + }, + { + "epoch": 1.5659574468085107, + "grad_norm": 3.993999719619751, + "learning_rate": 8.222762974571848e-06, + "loss": 0.6875, + "step": 5888 + }, + { + "epoch": 1.5662234042553191, + "grad_norm": 4.127441883087158, + "learning_rate": 8.222090490228316e-06, + "loss": 0.7653, + "step": 5889 + }, + { + "epoch": 1.5664893617021276, + "grad_norm": 4.082408428192139, + "learning_rate": 8.22141790618951e-06, + "loss": 0.8506, + "step": 5890 + }, + { + "epoch": 1.5667553191489363, + "grad_norm": 4.1307806968688965, + "learning_rate": 8.220745222476243e-06, + "loss": 0.7614, + "step": 5891 + }, + { + "epoch": 1.5670212765957445, + "grad_norm": 3.9022128582000732, + "learning_rate": 8.220072439109326e-06, + "loss": 0.8563, + "step": 5892 + }, + { + "epoch": 1.5672872340425532, + "grad_norm": 3.8020009994506836, + "learning_rate": 8.219399556109578e-06, + "loss": 0.8016, + "step": 5893 + }, + { + "epoch": 1.5675531914893617, + "grad_norm": 4.383156776428223, + "learning_rate": 8.218726573497817e-06, + "loss": 0.7956, + "step": 5894 + }, + { + "epoch": 1.5678191489361701, + "grad_norm": 4.414666175842285, + "learning_rate": 8.218053491294864e-06, + "loss": 0.8215, + "step": 5895 + }, + { + "epoch": 1.5680851063829788, + "grad_norm": 4.223287105560303, + "learning_rate": 8.21738030952155e-06, + "loss": 0.8466, + "step": 5896 + }, + { + "epoch": 1.5683510638297873, + "grad_norm": 4.012655735015869, + "learning_rate": 8.216707028198699e-06, + "loss": 0.7384, + "step": 5897 + }, + { + "epoch": 1.5686170212765957, + "grad_norm": 4.301409721374512, + "learning_rate": 8.216033647347145e-06, + "loss": 0.7748, + "step": 5898 + }, + { + "epoch": 1.5688829787234042, + "grad_norm": 4.148224353790283, + "learning_rate": 8.215360166987728e-06, + "loss": 0.8227, + "step": 5899 + }, + { + "epoch": 1.5691489361702127, + "grad_norm": 4.055191993713379, + "learning_rate": 8.214686587141277e-06, + "loss": 0.7811, + "step": 5900 + }, + { + "epoch": 1.5694148936170214, + "grad_norm": 3.9274792671203613, + "learning_rate": 8.21401290782864e-06, + "loss": 0.7934, + "step": 5901 + }, + { + "epoch": 1.5696808510638298, + "grad_norm": 3.762334108352661, + "learning_rate": 8.213339129070658e-06, + "loss": 0.7967, + "step": 5902 + }, + { + "epoch": 1.5699468085106383, + "grad_norm": 4.094070911407471, + "learning_rate": 8.212665250888184e-06, + "loss": 0.8637, + "step": 5903 + }, + { + "epoch": 
1.570212765957447, + "grad_norm": 3.871859550476074, + "learning_rate": 8.21199127330206e-06, + "loss": 0.7181, + "step": 5904 + }, + { + "epoch": 1.5704787234042552, + "grad_norm": 4.029532432556152, + "learning_rate": 8.211317196333149e-06, + "loss": 0.756, + "step": 5905 + }, + { + "epoch": 1.570744680851064, + "grad_norm": 3.982078790664673, + "learning_rate": 8.2106430200023e-06, + "loss": 0.7437, + "step": 5906 + }, + { + "epoch": 1.5710106382978724, + "grad_norm": 4.319076061248779, + "learning_rate": 8.209968744330375e-06, + "loss": 0.8517, + "step": 5907 + }, + { + "epoch": 1.5712765957446808, + "grad_norm": 3.5704493522644043, + "learning_rate": 8.20929436933824e-06, + "loss": 0.7369, + "step": 5908 + }, + { + "epoch": 1.5715425531914895, + "grad_norm": 3.825941562652588, + "learning_rate": 8.208619895046759e-06, + "loss": 0.7644, + "step": 5909 + }, + { + "epoch": 1.5718085106382977, + "grad_norm": 3.535365581512451, + "learning_rate": 8.2079453214768e-06, + "loss": 0.8191, + "step": 5910 + }, + { + "epoch": 1.5720744680851064, + "grad_norm": 4.012056827545166, + "learning_rate": 8.207270648649235e-06, + "loss": 0.805, + "step": 5911 + }, + { + "epoch": 1.5723404255319149, + "grad_norm": 3.670342206954956, + "learning_rate": 8.20659587658494e-06, + "loss": 0.7253, + "step": 5912 + }, + { + "epoch": 1.5726063829787233, + "grad_norm": 3.5404562950134277, + "learning_rate": 8.205921005304796e-06, + "loss": 0.7078, + "step": 5913 + }, + { + "epoch": 1.572872340425532, + "grad_norm": 4.304678916931152, + "learning_rate": 8.20524603482968e-06, + "loss": 0.8129, + "step": 5914 + }, + { + "epoch": 1.5731382978723403, + "grad_norm": 3.6795125007629395, + "learning_rate": 8.204570965180476e-06, + "loss": 0.7669, + "step": 5915 + }, + { + "epoch": 1.573404255319149, + "grad_norm": 3.8298754692077637, + "learning_rate": 8.203895796378076e-06, + "loss": 0.7803, + "step": 5916 + }, + { + "epoch": 1.5736702127659574, + "grad_norm": 4.399144649505615, + "learning_rate": 8.203220528443367e-06, + "loss": 0.9503, + "step": 5917 + }, + { + "epoch": 1.5739361702127659, + "grad_norm": 4.104849815368652, + "learning_rate": 8.202545161397242e-06, + "loss": 0.8586, + "step": 5918 + }, + { + "epoch": 1.5742021276595746, + "grad_norm": 4.923317909240723, + "learning_rate": 8.201869695260603e-06, + "loss": 0.815, + "step": 5919 + }, + { + "epoch": 1.574468085106383, + "grad_norm": 3.845151424407959, + "learning_rate": 8.201194130054342e-06, + "loss": 0.8449, + "step": 5920 + }, + { + "epoch": 1.5747340425531915, + "grad_norm": 4.074094295501709, + "learning_rate": 8.200518465799367e-06, + "loss": 0.7569, + "step": 5921 + }, + { + "epoch": 1.575, + "grad_norm": 4.062026023864746, + "learning_rate": 8.199842702516584e-06, + "loss": 0.8712, + "step": 5922 + }, + { + "epoch": 1.5752659574468084, + "grad_norm": 4.046767711639404, + "learning_rate": 8.199166840226898e-06, + "loss": 0.8318, + "step": 5923 + }, + { + "epoch": 1.575531914893617, + "grad_norm": 3.813408851623535, + "learning_rate": 8.198490878951224e-06, + "loss": 0.7493, + "step": 5924 + }, + { + "epoch": 1.5757978723404256, + "grad_norm": 4.108468055725098, + "learning_rate": 8.19781481871048e-06, + "loss": 0.7867, + "step": 5925 + }, + { + "epoch": 1.576063829787234, + "grad_norm": 3.9004015922546387, + "learning_rate": 8.197138659525576e-06, + "loss": 0.7384, + "step": 5926 + }, + { + "epoch": 1.5763297872340427, + "grad_norm": 4.14080286026001, + "learning_rate": 8.19646240141744e-06, + "loss": 0.7755, + "step": 5927 + }, + { + "epoch": 
1.576595744680851, + "grad_norm": 3.8850128650665283, + "learning_rate": 8.195786044406992e-06, + "loss": 0.7689, + "step": 5928 + }, + { + "epoch": 1.5768617021276596, + "grad_norm": 3.973543882369995, + "learning_rate": 8.195109588515163e-06, + "loss": 0.7336, + "step": 5929 + }, + { + "epoch": 1.577127659574468, + "grad_norm": 3.7367260456085205, + "learning_rate": 8.194433033762882e-06, + "loss": 0.8511, + "step": 5930 + }, + { + "epoch": 1.5773936170212766, + "grad_norm": 3.7051467895507812, + "learning_rate": 8.193756380171081e-06, + "loss": 0.7696, + "step": 5931 + }, + { + "epoch": 1.5776595744680852, + "grad_norm": 3.612755298614502, + "learning_rate": 8.193079627760697e-06, + "loss": 0.7733, + "step": 5932 + }, + { + "epoch": 1.5779255319148935, + "grad_norm": 4.524839401245117, + "learning_rate": 8.19240277655267e-06, + "loss": 0.8047, + "step": 5933 + }, + { + "epoch": 1.5781914893617022, + "grad_norm": 4.2709059715271, + "learning_rate": 8.191725826567943e-06, + "loss": 0.9173, + "step": 5934 + }, + { + "epoch": 1.5784574468085106, + "grad_norm": 4.062780857086182, + "learning_rate": 8.191048777827462e-06, + "loss": 0.755, + "step": 5935 + }, + { + "epoch": 1.578723404255319, + "grad_norm": 4.253462314605713, + "learning_rate": 8.190371630352174e-06, + "loss": 0.9102, + "step": 5936 + }, + { + "epoch": 1.5789893617021278, + "grad_norm": 3.578122854232788, + "learning_rate": 8.189694384163032e-06, + "loss": 0.6755, + "step": 5937 + }, + { + "epoch": 1.579255319148936, + "grad_norm": 3.9935173988342285, + "learning_rate": 8.189017039280989e-06, + "loss": 0.8196, + "step": 5938 + }, + { + "epoch": 1.5795212765957447, + "grad_norm": 3.9614062309265137, + "learning_rate": 8.188339595727004e-06, + "loss": 0.7896, + "step": 5939 + }, + { + "epoch": 1.5797872340425532, + "grad_norm": 3.7698519229888916, + "learning_rate": 8.187662053522039e-06, + "loss": 0.785, + "step": 5940 + }, + { + "epoch": 1.5800531914893616, + "grad_norm": 4.328986167907715, + "learning_rate": 8.186984412687058e-06, + "loss": 0.87, + "step": 5941 + }, + { + "epoch": 1.5803191489361703, + "grad_norm": 4.169852256774902, + "learning_rate": 8.186306673243025e-06, + "loss": 0.8594, + "step": 5942 + }, + { + "epoch": 1.5805851063829788, + "grad_norm": 4.010345458984375, + "learning_rate": 8.185628835210915e-06, + "loss": 0.913, + "step": 5943 + }, + { + "epoch": 1.5808510638297872, + "grad_norm": 3.9177587032318115, + "learning_rate": 8.184950898611696e-06, + "loss": 0.9157, + "step": 5944 + }, + { + "epoch": 1.5811170212765957, + "grad_norm": 4.508220672607422, + "learning_rate": 8.184272863466348e-06, + "loss": 0.8951, + "step": 5945 + }, + { + "epoch": 1.5813829787234042, + "grad_norm": 3.5971477031707764, + "learning_rate": 8.183594729795848e-06, + "loss": 0.7883, + "step": 5946 + }, + { + "epoch": 1.5816489361702128, + "grad_norm": 4.1539998054504395, + "learning_rate": 8.182916497621177e-06, + "loss": 0.8599, + "step": 5947 + }, + { + "epoch": 1.5819148936170213, + "grad_norm": 3.9577205181121826, + "learning_rate": 8.182238166963325e-06, + "loss": 0.8107, + "step": 5948 + }, + { + "epoch": 1.5821808510638298, + "grad_norm": 3.921849250793457, + "learning_rate": 8.181559737843274e-06, + "loss": 0.8452, + "step": 5949 + }, + { + "epoch": 1.5824468085106385, + "grad_norm": 3.6595895290374756, + "learning_rate": 8.18088121028202e-06, + "loss": 0.8332, + "step": 5950 + }, + { + "epoch": 1.5827127659574467, + "grad_norm": 4.248002052307129, + "learning_rate": 8.18020258430056e-06, + "loss": 0.928, + "step": 5951 + 
}, + { + "epoch": 1.5829787234042554, + "grad_norm": 3.584662437438965, + "learning_rate": 8.179523859919884e-06, + "loss": 0.7684, + "step": 5952 + }, + { + "epoch": 1.5832446808510638, + "grad_norm": 3.5269956588745117, + "learning_rate": 8.178845037160997e-06, + "loss": 0.7553, + "step": 5953 + }, + { + "epoch": 1.5835106382978723, + "grad_norm": 4.2691731452941895, + "learning_rate": 8.178166116044904e-06, + "loss": 0.8211, + "step": 5954 + }, + { + "epoch": 1.583776595744681, + "grad_norm": 4.050920009613037, + "learning_rate": 8.177487096592607e-06, + "loss": 0.9221, + "step": 5955 + }, + { + "epoch": 1.5840425531914892, + "grad_norm": 4.290426731109619, + "learning_rate": 8.17680797882512e-06, + "loss": 0.7909, + "step": 5956 + }, + { + "epoch": 1.584308510638298, + "grad_norm": 3.8692431449890137, + "learning_rate": 8.176128762763451e-06, + "loss": 0.7887, + "step": 5957 + }, + { + "epoch": 1.5845744680851064, + "grad_norm": 4.173573017120361, + "learning_rate": 8.175449448428621e-06, + "loss": 0.7535, + "step": 5958 + }, + { + "epoch": 1.5848404255319148, + "grad_norm": 4.186033248901367, + "learning_rate": 8.174770035841647e-06, + "loss": 0.8673, + "step": 5959 + }, + { + "epoch": 1.5851063829787235, + "grad_norm": 4.015555381774902, + "learning_rate": 8.17409052502355e-06, + "loss": 0.8815, + "step": 5960 + }, + { + "epoch": 1.5853723404255318, + "grad_norm": 3.864473342895508, + "learning_rate": 8.173410915995354e-06, + "loss": 0.8684, + "step": 5961 + }, + { + "epoch": 1.5856382978723405, + "grad_norm": 3.6198973655700684, + "learning_rate": 8.172731208778089e-06, + "loss": 0.7445, + "step": 5962 + }, + { + "epoch": 1.585904255319149, + "grad_norm": 3.7900218963623047, + "learning_rate": 8.172051403392784e-06, + "loss": 0.7331, + "step": 5963 + }, + { + "epoch": 1.5861702127659574, + "grad_norm": 4.163589954376221, + "learning_rate": 8.171371499860475e-06, + "loss": 0.8528, + "step": 5964 + }, + { + "epoch": 1.586436170212766, + "grad_norm": 4.275415420532227, + "learning_rate": 8.170691498202196e-06, + "loss": 0.8435, + "step": 5965 + }, + { + "epoch": 1.5867021276595743, + "grad_norm": 3.969174861907959, + "learning_rate": 8.170011398438992e-06, + "loss": 0.8812, + "step": 5966 + }, + { + "epoch": 1.586968085106383, + "grad_norm": 4.086930751800537, + "learning_rate": 8.169331200591901e-06, + "loss": 0.8988, + "step": 5967 + }, + { + "epoch": 1.5872340425531914, + "grad_norm": 4.444678783416748, + "learning_rate": 8.168650904681973e-06, + "loss": 0.9295, + "step": 5968 + }, + { + "epoch": 1.5875, + "grad_norm": 3.7711548805236816, + "learning_rate": 8.167970510730254e-06, + "loss": 0.7715, + "step": 5969 + }, + { + "epoch": 1.5877659574468086, + "grad_norm": 3.800588369369507, + "learning_rate": 8.167290018757797e-06, + "loss": 0.8273, + "step": 5970 + }, + { + "epoch": 1.588031914893617, + "grad_norm": 4.506065845489502, + "learning_rate": 8.16660942878566e-06, + "loss": 0.7786, + "step": 5971 + }, + { + "epoch": 1.5882978723404255, + "grad_norm": 3.8182950019836426, + "learning_rate": 8.165928740834896e-06, + "loss": 0.6682, + "step": 5972 + }, + { + "epoch": 1.5885638297872342, + "grad_norm": 4.040492534637451, + "learning_rate": 8.165247954926572e-06, + "loss": 0.7333, + "step": 5973 + }, + { + "epoch": 1.5888297872340424, + "grad_norm": 4.233337879180908, + "learning_rate": 8.164567071081747e-06, + "loss": 0.7931, + "step": 5974 + }, + { + "epoch": 1.5890957446808511, + "grad_norm": 4.0191969871521, + "learning_rate": 8.163886089321493e-06, + "loss": 0.8279, + "step": 
5975 + }, + { + "epoch": 1.5893617021276596, + "grad_norm": 3.9428741931915283, + "learning_rate": 8.163205009666879e-06, + "loss": 0.7945, + "step": 5976 + }, + { + "epoch": 1.589627659574468, + "grad_norm": 4.383618354797363, + "learning_rate": 8.162523832138977e-06, + "loss": 0.8961, + "step": 5977 + }, + { + "epoch": 1.5898936170212767, + "grad_norm": 4.313653945922852, + "learning_rate": 8.161842556758863e-06, + "loss": 0.927, + "step": 5978 + }, + { + "epoch": 1.590159574468085, + "grad_norm": 4.137526988983154, + "learning_rate": 8.161161183547619e-06, + "loss": 0.833, + "step": 5979 + }, + { + "epoch": 1.5904255319148937, + "grad_norm": 3.9024994373321533, + "learning_rate": 8.160479712526326e-06, + "loss": 0.8324, + "step": 5980 + }, + { + "epoch": 1.5906914893617021, + "grad_norm": 3.745685577392578, + "learning_rate": 8.159798143716069e-06, + "loss": 0.7946, + "step": 5981 + }, + { + "epoch": 1.5909574468085106, + "grad_norm": 4.142686367034912, + "learning_rate": 8.159116477137938e-06, + "loss": 0.8469, + "step": 5982 + }, + { + "epoch": 1.5912234042553193, + "grad_norm": 4.332526683807373, + "learning_rate": 8.158434712813024e-06, + "loss": 0.8398, + "step": 5983 + }, + { + "epoch": 1.5914893617021275, + "grad_norm": 4.1822028160095215, + "learning_rate": 8.157752850762422e-06, + "loss": 0.8182, + "step": 5984 + }, + { + "epoch": 1.5917553191489362, + "grad_norm": 3.797029972076416, + "learning_rate": 8.157070891007227e-06, + "loss": 0.8219, + "step": 5985 + }, + { + "epoch": 1.5920212765957447, + "grad_norm": 3.6281862258911133, + "learning_rate": 8.156388833568543e-06, + "loss": 0.7788, + "step": 5986 + }, + { + "epoch": 1.5922872340425531, + "grad_norm": 3.963622570037842, + "learning_rate": 8.155706678467472e-06, + "loss": 0.8121, + "step": 5987 + }, + { + "epoch": 1.5925531914893618, + "grad_norm": 3.965254068374634, + "learning_rate": 8.15502442572512e-06, + "loss": 0.9758, + "step": 5988 + }, + { + "epoch": 1.59281914893617, + "grad_norm": 3.7290945053100586, + "learning_rate": 8.1543420753626e-06, + "loss": 0.7913, + "step": 5989 + }, + { + "epoch": 1.5930851063829787, + "grad_norm": 3.5423686504364014, + "learning_rate": 8.15365962740102e-06, + "loss": 0.6702, + "step": 5990 + }, + { + "epoch": 1.5933510638297872, + "grad_norm": 4.0960540771484375, + "learning_rate": 8.1529770818615e-06, + "loss": 0.976, + "step": 5991 + }, + { + "epoch": 1.5936170212765957, + "grad_norm": 3.9374215602874756, + "learning_rate": 8.152294438765157e-06, + "loss": 0.7726, + "step": 5992 + }, + { + "epoch": 1.5938829787234043, + "grad_norm": 4.123393535614014, + "learning_rate": 8.15161169813311e-06, + "loss": 0.7414, + "step": 5993 + }, + { + "epoch": 1.5941489361702128, + "grad_norm": 3.7125062942504883, + "learning_rate": 8.150928859986488e-06, + "loss": 0.8094, + "step": 5994 + }, + { + "epoch": 1.5944148936170213, + "grad_norm": 3.6186742782592773, + "learning_rate": 8.15024592434642e-06, + "loss": 0.8291, + "step": 5995 + }, + { + "epoch": 1.59468085106383, + "grad_norm": 3.9349913597106934, + "learning_rate": 8.14956289123403e-06, + "loss": 0.8469, + "step": 5996 + }, + { + "epoch": 1.5949468085106382, + "grad_norm": 4.224155426025391, + "learning_rate": 8.148879760670459e-06, + "loss": 0.8178, + "step": 5997 + }, + { + "epoch": 1.5952127659574469, + "grad_norm": 4.03489351272583, + "learning_rate": 8.14819653267684e-06, + "loss": 1.0682, + "step": 5998 + }, + { + "epoch": 1.5954787234042553, + "grad_norm": 3.757615566253662, + "learning_rate": 8.147513207274314e-06, + "loss": 
0.9454, + "step": 5999 + }, + { + "epoch": 1.5957446808510638, + "grad_norm": 3.69804048538208, + "learning_rate": 8.146829784484024e-06, + "loss": 0.6988, + "step": 6000 + }, + { + "epoch": 1.5957446808510638, + "eval_loss": 1.2842473983764648, + "eval_runtime": 13.4375, + "eval_samples_per_second": 29.767, + "eval_steps_per_second": 3.721, + "step": 6000 + }, + { + "epoch": 1.5960106382978725, + "grad_norm": 3.8672168254852295, + "learning_rate": 8.146146264327113e-06, + "loss": 0.8893, + "step": 6001 + }, + { + "epoch": 1.5962765957446807, + "grad_norm": 3.7445380687713623, + "learning_rate": 8.145462646824734e-06, + "loss": 0.8237, + "step": 6002 + }, + { + "epoch": 1.5965425531914894, + "grad_norm": 3.7135863304138184, + "learning_rate": 8.144778931998038e-06, + "loss": 0.6954, + "step": 6003 + }, + { + "epoch": 1.5968085106382979, + "grad_norm": 3.946181058883667, + "learning_rate": 8.144095119868178e-06, + "loss": 0.8022, + "step": 6004 + }, + { + "epoch": 1.5970744680851063, + "grad_norm": 3.866457223892212, + "learning_rate": 8.143411210456314e-06, + "loss": 0.7848, + "step": 6005 + }, + { + "epoch": 1.597340425531915, + "grad_norm": 3.9514496326446533, + "learning_rate": 8.142727203783608e-06, + "loss": 0.8287, + "step": 6006 + }, + { + "epoch": 1.5976063829787233, + "grad_norm": 3.780092239379883, + "learning_rate": 8.142043099871219e-06, + "loss": 0.731, + "step": 6007 + }, + { + "epoch": 1.597872340425532, + "grad_norm": 3.832037925720215, + "learning_rate": 8.141358898740319e-06, + "loss": 0.8207, + "step": 6008 + }, + { + "epoch": 1.5981382978723404, + "grad_norm": 3.7208633422851562, + "learning_rate": 8.140674600412076e-06, + "loss": 0.7905, + "step": 6009 + }, + { + "epoch": 1.5984042553191489, + "grad_norm": 3.5873775482177734, + "learning_rate": 8.139990204907662e-06, + "loss": 0.7042, + "step": 6010 + }, + { + "epoch": 1.5986702127659576, + "grad_norm": 4.138782024383545, + "learning_rate": 8.139305712248256e-06, + "loss": 0.8231, + "step": 6011 + }, + { + "epoch": 1.5989361702127658, + "grad_norm": 4.014845371246338, + "learning_rate": 8.138621122455034e-06, + "loss": 0.7606, + "step": 6012 + }, + { + "epoch": 1.5992021276595745, + "grad_norm": 3.997772693634033, + "learning_rate": 8.13793643554918e-06, + "loss": 0.8122, + "step": 6013 + }, + { + "epoch": 1.599468085106383, + "grad_norm": 3.3885183334350586, + "learning_rate": 8.137251651551878e-06, + "loss": 0.7245, + "step": 6014 + }, + { + "epoch": 1.5997340425531914, + "grad_norm": 3.9096522331237793, + "learning_rate": 8.136566770484316e-06, + "loss": 0.7919, + "step": 6015 + }, + { + "epoch": 1.6, + "grad_norm": 4.008962154388428, + "learning_rate": 8.135881792367686e-06, + "loss": 0.8683, + "step": 6016 + }, + { + "epoch": 1.6002659574468086, + "grad_norm": 3.9772658348083496, + "learning_rate": 8.13519671722318e-06, + "loss": 0.7775, + "step": 6017 + }, + { + "epoch": 1.600531914893617, + "grad_norm": 4.593280792236328, + "learning_rate": 8.134511545071998e-06, + "loss": 0.8959, + "step": 6018 + }, + { + "epoch": 1.6007978723404257, + "grad_norm": 3.9730031490325928, + "learning_rate": 8.133826275935337e-06, + "loss": 0.8394, + "step": 6019 + }, + { + "epoch": 1.601063829787234, + "grad_norm": 4.224338531494141, + "learning_rate": 8.133140909834402e-06, + "loss": 0.7961, + "step": 6020 + }, + { + "epoch": 1.6013297872340426, + "grad_norm": 3.759888172149658, + "learning_rate": 8.132455446790399e-06, + "loss": 0.8531, + "step": 6021 + }, + { + "epoch": 1.601595744680851, + "grad_norm": 3.5629312992095947, + 
"learning_rate": 8.131769886824535e-06, + "loss": 0.8102, + "step": 6022 + }, + { + "epoch": 1.6018617021276595, + "grad_norm": 3.5515568256378174, + "learning_rate": 8.131084229958024e-06, + "loss": 0.7867, + "step": 6023 + }, + { + "epoch": 1.6021276595744682, + "grad_norm": 4.148061275482178, + "learning_rate": 8.130398476212081e-06, + "loss": 0.8708, + "step": 6024 + }, + { + "epoch": 1.6023936170212765, + "grad_norm": 4.018913745880127, + "learning_rate": 8.129712625607924e-06, + "loss": 0.771, + "step": 6025 + }, + { + "epoch": 1.6026595744680852, + "grad_norm": 4.379147052764893, + "learning_rate": 8.129026678166772e-06, + "loss": 0.8199, + "step": 6026 + }, + { + "epoch": 1.6029255319148936, + "grad_norm": 3.568890333175659, + "learning_rate": 8.128340633909852e-06, + "loss": 0.705, + "step": 6027 + }, + { + "epoch": 1.603191489361702, + "grad_norm": 3.6377384662628174, + "learning_rate": 8.127654492858388e-06, + "loss": 0.6958, + "step": 6028 + }, + { + "epoch": 1.6034574468085108, + "grad_norm": 4.233497142791748, + "learning_rate": 8.126968255033614e-06, + "loss": 0.8446, + "step": 6029 + }, + { + "epoch": 1.603723404255319, + "grad_norm": 4.239995956420898, + "learning_rate": 8.126281920456758e-06, + "loss": 0.813, + "step": 6030 + }, + { + "epoch": 1.6039893617021277, + "grad_norm": 3.8521575927734375, + "learning_rate": 8.12559548914906e-06, + "loss": 0.7906, + "step": 6031 + }, + { + "epoch": 1.6042553191489362, + "grad_norm": 3.567471742630005, + "learning_rate": 8.124908961131759e-06, + "loss": 0.6709, + "step": 6032 + }, + { + "epoch": 1.6045212765957446, + "grad_norm": 3.527024030685425, + "learning_rate": 8.124222336426094e-06, + "loss": 0.7508, + "step": 6033 + }, + { + "epoch": 1.6047872340425533, + "grad_norm": 4.134167671203613, + "learning_rate": 8.123535615053312e-06, + "loss": 0.8233, + "step": 6034 + }, + { + "epoch": 1.6050531914893615, + "grad_norm": 3.62556791305542, + "learning_rate": 8.12284879703466e-06, + "loss": 0.7347, + "step": 6035 + }, + { + "epoch": 1.6053191489361702, + "grad_norm": 4.534690856933594, + "learning_rate": 8.12216188239139e-06, + "loss": 0.9258, + "step": 6036 + }, + { + "epoch": 1.6055851063829787, + "grad_norm": 3.8855905532836914, + "learning_rate": 8.121474871144757e-06, + "loss": 0.7215, + "step": 6037 + }, + { + "epoch": 1.6058510638297872, + "grad_norm": 3.889317274093628, + "learning_rate": 8.120787763316014e-06, + "loss": 0.7557, + "step": 6038 + }, + { + "epoch": 1.6061170212765958, + "grad_norm": 4.091339588165283, + "learning_rate": 8.120100558926425e-06, + "loss": 0.8053, + "step": 6039 + }, + { + "epoch": 1.6063829787234043, + "grad_norm": 4.249019622802734, + "learning_rate": 8.11941325799725e-06, + "loss": 0.837, + "step": 6040 + }, + { + "epoch": 1.6066489361702128, + "grad_norm": 4.165124416351318, + "learning_rate": 8.118725860549756e-06, + "loss": 0.8762, + "step": 6041 + }, + { + "epoch": 1.6069148936170212, + "grad_norm": 4.028770923614502, + "learning_rate": 8.118038366605212e-06, + "loss": 0.8456, + "step": 6042 + }, + { + "epoch": 1.6071808510638297, + "grad_norm": 3.60648250579834, + "learning_rate": 8.117350776184892e-06, + "loss": 0.688, + "step": 6043 + }, + { + "epoch": 1.6074468085106384, + "grad_norm": 3.6444270610809326, + "learning_rate": 8.116663089310067e-06, + "loss": 0.8199, + "step": 6044 + }, + { + "epoch": 1.6077127659574468, + "grad_norm": 4.073156833648682, + "learning_rate": 8.115975306002018e-06, + "loss": 0.9758, + "step": 6045 + }, + { + "epoch": 1.6079787234042553, + "grad_norm": 
4.100760459899902, + "learning_rate": 8.115287426282022e-06, + "loss": 0.9357, + "step": 6046 + }, + { + "epoch": 1.608244680851064, + "grad_norm": 4.134888648986816, + "learning_rate": 8.114599450171366e-06, + "loss": 0.7536, + "step": 6047 + }, + { + "epoch": 1.6085106382978722, + "grad_norm": 3.8742432594299316, + "learning_rate": 8.113911377691338e-06, + "loss": 0.7832, + "step": 6048 + }, + { + "epoch": 1.608776595744681, + "grad_norm": 4.110736846923828, + "learning_rate": 8.113223208863224e-06, + "loss": 0.7098, + "step": 6049 + }, + { + "epoch": 1.6090425531914894, + "grad_norm": 3.972907304763794, + "learning_rate": 8.11253494370832e-06, + "loss": 0.8414, + "step": 6050 + }, + { + "epoch": 1.6093085106382978, + "grad_norm": 3.984872817993164, + "learning_rate": 8.111846582247917e-06, + "loss": 0.9063, + "step": 6051 + }, + { + "epoch": 1.6095744680851065, + "grad_norm": 4.114076614379883, + "learning_rate": 8.11115812450332e-06, + "loss": 0.8774, + "step": 6052 + }, + { + "epoch": 1.6098404255319148, + "grad_norm": 3.8898861408233643, + "learning_rate": 8.110469570495828e-06, + "loss": 0.6855, + "step": 6053 + }, + { + "epoch": 1.6101063829787234, + "grad_norm": 3.620485544204712, + "learning_rate": 8.109780920246743e-06, + "loss": 0.8566, + "step": 6054 + }, + { + "epoch": 1.610372340425532, + "grad_norm": 4.412075519561768, + "learning_rate": 8.109092173777376e-06, + "loss": 0.8386, + "step": 6055 + }, + { + "epoch": 1.6106382978723404, + "grad_norm": 4.396791934967041, + "learning_rate": 8.108403331109038e-06, + "loss": 0.7074, + "step": 6056 + }, + { + "epoch": 1.610904255319149, + "grad_norm": 4.347930431365967, + "learning_rate": 8.10771439226304e-06, + "loss": 0.8188, + "step": 6057 + }, + { + "epoch": 1.6111702127659573, + "grad_norm": 3.751016855239868, + "learning_rate": 8.1070253572607e-06, + "loss": 0.7469, + "step": 6058 + }, + { + "epoch": 1.611436170212766, + "grad_norm": 4.112164497375488, + "learning_rate": 8.106336226123339e-06, + "loss": 0.8259, + "step": 6059 + }, + { + "epoch": 1.6117021276595744, + "grad_norm": 4.112537860870361, + "learning_rate": 8.105646998872275e-06, + "loss": 0.8493, + "step": 6060 + }, + { + "epoch": 1.611968085106383, + "grad_norm": 4.171288967132568, + "learning_rate": 8.104957675528837e-06, + "loss": 0.9249, + "step": 6061 + }, + { + "epoch": 1.6122340425531916, + "grad_norm": 4.331489086151123, + "learning_rate": 8.104268256114354e-06, + "loss": 0.9123, + "step": 6062 + }, + { + "epoch": 1.6125, + "grad_norm": 4.148106575012207, + "learning_rate": 8.103578740650157e-06, + "loss": 0.7654, + "step": 6063 + }, + { + "epoch": 1.6127659574468085, + "grad_norm": 3.72057843208313, + "learning_rate": 8.102889129157578e-06, + "loss": 0.8049, + "step": 6064 + }, + { + "epoch": 1.613031914893617, + "grad_norm": 3.9282565116882324, + "learning_rate": 8.102199421657957e-06, + "loss": 0.7639, + "step": 6065 + }, + { + "epoch": 1.6132978723404254, + "grad_norm": 3.8103582859039307, + "learning_rate": 8.101509618172634e-06, + "loss": 0.8689, + "step": 6066 + }, + { + "epoch": 1.6135638297872341, + "grad_norm": 4.2297539710998535, + "learning_rate": 8.10081971872295e-06, + "loss": 0.9582, + "step": 6067 + }, + { + "epoch": 1.6138297872340426, + "grad_norm": 4.653298854827881, + "learning_rate": 8.100129723330255e-06, + "loss": 0.9946, + "step": 6068 + }, + { + "epoch": 1.614095744680851, + "grad_norm": 3.7969958782196045, + "learning_rate": 8.099439632015896e-06, + "loss": 0.7852, + "step": 6069 + }, + { + "epoch": 1.6143617021276597, + "grad_norm": 
4.072946071624756, + "learning_rate": 8.098749444801226e-06, + "loss": 0.79, + "step": 6070 + }, + { + "epoch": 1.614627659574468, + "grad_norm": 3.9592959880828857, + "learning_rate": 8.0980591617076e-06, + "loss": 0.7815, + "step": 6071 + }, + { + "epoch": 1.6148936170212767, + "grad_norm": 4.4633588790893555, + "learning_rate": 8.097368782756374e-06, + "loss": 0.7754, + "step": 6072 + }, + { + "epoch": 1.6151595744680851, + "grad_norm": 4.381833553314209, + "learning_rate": 8.096678307968913e-06, + "loss": 0.9649, + "step": 6073 + }, + { + "epoch": 1.6154255319148936, + "grad_norm": 4.433225154876709, + "learning_rate": 8.095987737366578e-06, + "loss": 0.9376, + "step": 6074 + }, + { + "epoch": 1.6156914893617023, + "grad_norm": 3.7621006965637207, + "learning_rate": 8.095297070970738e-06, + "loss": 0.7577, + "step": 6075 + }, + { + "epoch": 1.6159574468085105, + "grad_norm": 3.4518826007843018, + "learning_rate": 8.094606308802764e-06, + "loss": 0.816, + "step": 6076 + }, + { + "epoch": 1.6162234042553192, + "grad_norm": 4.059780120849609, + "learning_rate": 8.093915450884025e-06, + "loss": 0.8319, + "step": 6077 + }, + { + "epoch": 1.6164893617021276, + "grad_norm": 3.8527324199676514, + "learning_rate": 8.093224497235899e-06, + "loss": 0.8826, + "step": 6078 + }, + { + "epoch": 1.616755319148936, + "grad_norm": 3.3895418643951416, + "learning_rate": 8.092533447879766e-06, + "loss": 0.73, + "step": 6079 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 3.9259166717529297, + "learning_rate": 8.091842302837009e-06, + "loss": 0.8569, + "step": 6080 + }, + { + "epoch": 1.617287234042553, + "grad_norm": 3.5704541206359863, + "learning_rate": 8.091151062129008e-06, + "loss": 0.8113, + "step": 6081 + }, + { + "epoch": 1.6175531914893617, + "grad_norm": 3.8313138484954834, + "learning_rate": 8.090459725777156e-06, + "loss": 0.7352, + "step": 6082 + }, + { + "epoch": 1.6178191489361702, + "grad_norm": 4.403858184814453, + "learning_rate": 8.089768293802842e-06, + "loss": 0.7757, + "step": 6083 + }, + { + "epoch": 1.6180851063829786, + "grad_norm": 4.078790664672852, + "learning_rate": 8.089076766227457e-06, + "loss": 0.8444, + "step": 6084 + }, + { + "epoch": 1.6183510638297873, + "grad_norm": 4.103868007659912, + "learning_rate": 8.088385143072402e-06, + "loss": 0.7451, + "step": 6085 + }, + { + "epoch": 1.6186170212765958, + "grad_norm": 3.906527042388916, + "learning_rate": 8.087693424359073e-06, + "loss": 0.7095, + "step": 6086 + }, + { + "epoch": 1.6188829787234043, + "grad_norm": 4.909295082092285, + "learning_rate": 8.087001610108874e-06, + "loss": 0.8277, + "step": 6087 + }, + { + "epoch": 1.6191489361702127, + "grad_norm": 5.194472312927246, + "learning_rate": 8.086309700343211e-06, + "loss": 0.8959, + "step": 6088 + }, + { + "epoch": 1.6194148936170212, + "grad_norm": 3.6174070835113525, + "learning_rate": 8.085617695083493e-06, + "loss": 0.7838, + "step": 6089 + }, + { + "epoch": 1.6196808510638299, + "grad_norm": 3.5253570079803467, + "learning_rate": 8.08492559435113e-06, + "loss": 0.7633, + "step": 6090 + }, + { + "epoch": 1.6199468085106383, + "grad_norm": 4.330216884613037, + "learning_rate": 8.084233398167537e-06, + "loss": 0.8669, + "step": 6091 + }, + { + "epoch": 1.6202127659574468, + "grad_norm": 3.792811393737793, + "learning_rate": 8.083541106554131e-06, + "loss": 0.8782, + "step": 6092 + }, + { + "epoch": 1.6204787234042555, + "grad_norm": 3.888946533203125, + "learning_rate": 8.082848719532335e-06, + "loss": 0.8816, + "step": 6093 + }, + { + "epoch": 
1.6207446808510637, + "grad_norm": 3.9346768856048584, + "learning_rate": 8.082156237123567e-06, + "loss": 0.6887, + "step": 6094 + }, + { + "epoch": 1.6210106382978724, + "grad_norm": 3.7470414638519287, + "learning_rate": 8.081463659349258e-06, + "loss": 0.7622, + "step": 6095 + }, + { + "epoch": 1.6212765957446809, + "grad_norm": 3.9194772243499756, + "learning_rate": 8.080770986230835e-06, + "loss": 0.768, + "step": 6096 + }, + { + "epoch": 1.6215425531914893, + "grad_norm": 3.7921671867370605, + "learning_rate": 8.08007821778973e-06, + "loss": 0.8936, + "step": 6097 + }, + { + "epoch": 1.621808510638298, + "grad_norm": 3.8893918991088867, + "learning_rate": 8.07938535404738e-06, + "loss": 0.835, + "step": 6098 + }, + { + "epoch": 1.6220744680851062, + "grad_norm": 3.7834744453430176, + "learning_rate": 8.07869239502522e-06, + "loss": 0.7374, + "step": 6099 + }, + { + "epoch": 1.622340425531915, + "grad_norm": 3.867154598236084, + "learning_rate": 8.077999340744694e-06, + "loss": 0.7935, + "step": 6100 + }, + { + "epoch": 1.6226063829787234, + "grad_norm": 4.853170394897461, + "learning_rate": 8.077306191227244e-06, + "loss": 0.7786, + "step": 6101 + }, + { + "epoch": 1.6228723404255319, + "grad_norm": 4.339568614959717, + "learning_rate": 8.076612946494317e-06, + "loss": 0.6722, + "step": 6102 + }, + { + "epoch": 1.6231382978723405, + "grad_norm": 3.6707983016967773, + "learning_rate": 8.075919606567363e-06, + "loss": 0.8792, + "step": 6103 + }, + { + "epoch": 1.6234042553191488, + "grad_norm": 3.867652177810669, + "learning_rate": 8.075226171467835e-06, + "loss": 0.7879, + "step": 6104 + }, + { + "epoch": 1.6236702127659575, + "grad_norm": 3.5733299255371094, + "learning_rate": 8.07453264121719e-06, + "loss": 0.7921, + "step": 6105 + }, + { + "epoch": 1.623936170212766, + "grad_norm": 3.7665045261383057, + "learning_rate": 8.073839015836884e-06, + "loss": 0.9738, + "step": 6106 + }, + { + "epoch": 1.6242021276595744, + "grad_norm": 4.237964153289795, + "learning_rate": 8.07314529534838e-06, + "loss": 0.869, + "step": 6107 + }, + { + "epoch": 1.624468085106383, + "grad_norm": 3.797464370727539, + "learning_rate": 8.072451479773143e-06, + "loss": 0.8445, + "step": 6108 + }, + { + "epoch": 1.6247340425531915, + "grad_norm": 3.9559130668640137, + "learning_rate": 8.071757569132639e-06, + "loss": 0.848, + "step": 6109 + }, + { + "epoch": 1.625, + "grad_norm": 3.7033722400665283, + "learning_rate": 8.071063563448341e-06, + "loss": 0.8571, + "step": 6110 + }, + { + "epoch": 1.6252659574468085, + "grad_norm": 3.696049451828003, + "learning_rate": 8.070369462741719e-06, + "loss": 0.8649, + "step": 6111 + }, + { + "epoch": 1.625531914893617, + "grad_norm": 3.495377540588379, + "learning_rate": 8.06967526703425e-06, + "loss": 0.7691, + "step": 6112 + }, + { + "epoch": 1.6257978723404256, + "grad_norm": 3.9298911094665527, + "learning_rate": 8.068980976347416e-06, + "loss": 0.7793, + "step": 6113 + }, + { + "epoch": 1.626063829787234, + "grad_norm": 3.756425380706787, + "learning_rate": 8.068286590702697e-06, + "loss": 0.8161, + "step": 6114 + }, + { + "epoch": 1.6263297872340425, + "grad_norm": 4.13591194152832, + "learning_rate": 8.067592110121576e-06, + "loss": 0.8543, + "step": 6115 + }, + { + "epoch": 1.6265957446808512, + "grad_norm": 4.203410625457764, + "learning_rate": 8.066897534625547e-06, + "loss": 0.7607, + "step": 6116 + }, + { + "epoch": 1.6268617021276595, + "grad_norm": 4.2013983726501465, + "learning_rate": 8.066202864236096e-06, + "loss": 0.8248, + "step": 6117 + }, + { + 
"epoch": 1.6271276595744681, + "grad_norm": 4.034732341766357, + "learning_rate": 8.065508098974719e-06, + "loss": 0.804, + "step": 6118 + }, + { + "epoch": 1.6273936170212766, + "grad_norm": 4.180783271789551, + "learning_rate": 8.06481323886291e-06, + "loss": 0.8354, + "step": 6119 + }, + { + "epoch": 1.627659574468085, + "grad_norm": 3.9474117755889893, + "learning_rate": 8.064118283922173e-06, + "loss": 0.8622, + "step": 6120 + }, + { + "epoch": 1.6279255319148938, + "grad_norm": 3.8866050243377686, + "learning_rate": 8.063423234174008e-06, + "loss": 0.7197, + "step": 6121 + }, + { + "epoch": 1.628191489361702, + "grad_norm": 4.463206768035889, + "learning_rate": 8.062728089639921e-06, + "loss": 0.9226, + "step": 6122 + }, + { + "epoch": 1.6284574468085107, + "grad_norm": 3.982656717300415, + "learning_rate": 8.062032850341423e-06, + "loss": 0.7225, + "step": 6123 + }, + { + "epoch": 1.6287234042553191, + "grad_norm": 3.9853739738464355, + "learning_rate": 8.061337516300024e-06, + "loss": 0.6711, + "step": 6124 + }, + { + "epoch": 1.6289893617021276, + "grad_norm": 3.823125123977661, + "learning_rate": 8.060642087537233e-06, + "loss": 0.8944, + "step": 6125 + }, + { + "epoch": 1.6292553191489363, + "grad_norm": 4.082576274871826, + "learning_rate": 8.059946564074577e-06, + "loss": 0.8235, + "step": 6126 + }, + { + "epoch": 1.6295212765957445, + "grad_norm": 4.3164472579956055, + "learning_rate": 8.05925094593357e-06, + "loss": 0.8086, + "step": 6127 + }, + { + "epoch": 1.6297872340425532, + "grad_norm": 3.8943753242492676, + "learning_rate": 8.058555233135737e-06, + "loss": 0.7088, + "step": 6128 + }, + { + "epoch": 1.6300531914893617, + "grad_norm": 4.248415470123291, + "learning_rate": 8.057859425702605e-06, + "loss": 0.8011, + "step": 6129 + }, + { + "epoch": 1.6303191489361701, + "grad_norm": 3.8152194023132324, + "learning_rate": 8.057163523655702e-06, + "loss": 0.7437, + "step": 6130 + }, + { + "epoch": 1.6305851063829788, + "grad_norm": 4.243065357208252, + "learning_rate": 8.056467527016559e-06, + "loss": 0.8156, + "step": 6131 + }, + { + "epoch": 1.6308510638297873, + "grad_norm": 4.148963928222656, + "learning_rate": 8.055771435806714e-06, + "loss": 0.8538, + "step": 6132 + }, + { + "epoch": 1.6311170212765957, + "grad_norm": 3.848583698272705, + "learning_rate": 8.0550752500477e-06, + "loss": 0.7818, + "step": 6133 + }, + { + "epoch": 1.6313829787234042, + "grad_norm": 4.185320854187012, + "learning_rate": 8.054378969761062e-06, + "loss": 0.85, + "step": 6134 + }, + { + "epoch": 1.6316489361702127, + "grad_norm": 4.244765758514404, + "learning_rate": 8.053682594968346e-06, + "loss": 0.8856, + "step": 6135 + }, + { + "epoch": 1.6319148936170214, + "grad_norm": 3.8420188426971436, + "learning_rate": 8.052986125691091e-06, + "loss": 0.7745, + "step": 6136 + }, + { + "epoch": 1.6321808510638298, + "grad_norm": 4.029837131500244, + "learning_rate": 8.052289561950852e-06, + "loss": 0.8724, + "step": 6137 + }, + { + "epoch": 1.6324468085106383, + "grad_norm": 3.9027750492095947, + "learning_rate": 8.051592903769182e-06, + "loss": 0.7405, + "step": 6138 + }, + { + "epoch": 1.632712765957447, + "grad_norm": 4.00022554397583, + "learning_rate": 8.050896151167632e-06, + "loss": 0.7677, + "step": 6139 + }, + { + "epoch": 1.6329787234042552, + "grad_norm": 4.150446891784668, + "learning_rate": 8.050199304167766e-06, + "loss": 0.7348, + "step": 6140 + }, + { + "epoch": 1.633244680851064, + "grad_norm": 4.308548927307129, + "learning_rate": 8.04950236279114e-06, + "loss": 0.8106, + "step": 
6141 + }, + { + "epoch": 1.6335106382978724, + "grad_norm": 3.9967095851898193, + "learning_rate": 8.048805327059321e-06, + "loss": 0.7345, + "step": 6142 + }, + { + "epoch": 1.6337765957446808, + "grad_norm": 3.783818244934082, + "learning_rate": 8.048108196993879e-06, + "loss": 0.716, + "step": 6143 + }, + { + "epoch": 1.6340425531914895, + "grad_norm": 3.8823726177215576, + "learning_rate": 8.047410972616376e-06, + "loss": 0.778, + "step": 6144 + }, + { + "epoch": 1.6343085106382977, + "grad_norm": 4.007701873779297, + "learning_rate": 8.046713653948393e-06, + "loss": 0.9691, + "step": 6145 + }, + { + "epoch": 1.6345744680851064, + "grad_norm": 4.14747428894043, + "learning_rate": 8.0460162410115e-06, + "loss": 0.8201, + "step": 6146 + }, + { + "epoch": 1.6348404255319149, + "grad_norm": 4.101099967956543, + "learning_rate": 8.045318733827278e-06, + "loss": 0.8864, + "step": 6147 + }, + { + "epoch": 1.6351063829787233, + "grad_norm": 3.709555149078369, + "learning_rate": 8.044621132417311e-06, + "loss": 0.7185, + "step": 6148 + }, + { + "epoch": 1.635372340425532, + "grad_norm": 4.0000481605529785, + "learning_rate": 8.043923436803182e-06, + "loss": 0.8816, + "step": 6149 + }, + { + "epoch": 1.6356382978723403, + "grad_norm": 4.075678825378418, + "learning_rate": 8.043225647006475e-06, + "loss": 0.8192, + "step": 6150 + }, + { + "epoch": 1.635904255319149, + "grad_norm": 4.004273891448975, + "learning_rate": 8.042527763048787e-06, + "loss": 0.9374, + "step": 6151 + }, + { + "epoch": 1.6361702127659574, + "grad_norm": 3.904745101928711, + "learning_rate": 8.041829784951706e-06, + "loss": 0.7701, + "step": 6152 + }, + { + "epoch": 1.6364361702127659, + "grad_norm": 3.7361650466918945, + "learning_rate": 8.04113171273683e-06, + "loss": 0.6875, + "step": 6153 + }, + { + "epoch": 1.6367021276595746, + "grad_norm": 3.9355521202087402, + "learning_rate": 8.040433546425759e-06, + "loss": 0.828, + "step": 6154 + }, + { + "epoch": 1.636968085106383, + "grad_norm": 3.615612745285034, + "learning_rate": 8.039735286040095e-06, + "loss": 0.8136, + "step": 6155 + }, + { + "epoch": 1.6372340425531915, + "grad_norm": 3.900493621826172, + "learning_rate": 8.03903693160144e-06, + "loss": 0.7782, + "step": 6156 + }, + { + "epoch": 1.6375, + "grad_norm": 4.175507068634033, + "learning_rate": 8.038338483131408e-06, + "loss": 0.8486, + "step": 6157 + }, + { + "epoch": 1.6377659574468084, + "grad_norm": 4.02733039855957, + "learning_rate": 8.037639940651603e-06, + "loss": 0.7591, + "step": 6158 + }, + { + "epoch": 1.638031914893617, + "grad_norm": 4.006030559539795, + "learning_rate": 8.036941304183643e-06, + "loss": 0.8453, + "step": 6159 + }, + { + "epoch": 1.6382978723404256, + "grad_norm": 3.9777238368988037, + "learning_rate": 8.036242573749142e-06, + "loss": 0.7623, + "step": 6160 + }, + { + "epoch": 1.638563829787234, + "grad_norm": 3.7054030895233154, + "learning_rate": 8.035543749369724e-06, + "loss": 0.7552, + "step": 6161 + }, + { + "epoch": 1.6388297872340427, + "grad_norm": 4.149451732635498, + "learning_rate": 8.034844831067006e-06, + "loss": 0.6954, + "step": 6162 + }, + { + "epoch": 1.639095744680851, + "grad_norm": 4.144680500030518, + "learning_rate": 8.034145818862618e-06, + "loss": 0.8583, + "step": 6163 + }, + { + "epoch": 1.6393617021276596, + "grad_norm": 3.732167959213257, + "learning_rate": 8.033446712778184e-06, + "loss": 0.7437, + "step": 6164 + }, + { + "epoch": 1.639627659574468, + "grad_norm": 4.176260471343994, + "learning_rate": 8.032747512835338e-06, + "loss": 0.9089, + "step": 
6165 + }, + { + "epoch": 1.6398936170212766, + "grad_norm": 3.9875879287719727, + "learning_rate": 8.032048219055712e-06, + "loss": 0.7776, + "step": 6166 + }, + { + "epoch": 1.6401595744680852, + "grad_norm": 3.942016839981079, + "learning_rate": 8.031348831460948e-06, + "loss": 0.752, + "step": 6167 + }, + { + "epoch": 1.6404255319148935, + "grad_norm": 4.088458061218262, + "learning_rate": 8.030649350072679e-06, + "loss": 0.8339, + "step": 6168 + }, + { + "epoch": 1.6406914893617022, + "grad_norm": 4.712299346923828, + "learning_rate": 8.029949774912552e-06, + "loss": 0.942, + "step": 6169 + }, + { + "epoch": 1.6409574468085106, + "grad_norm": 3.5929760932922363, + "learning_rate": 8.029250106002212e-06, + "loss": 0.7309, + "step": 6170 + }, + { + "epoch": 1.641223404255319, + "grad_norm": 4.059690475463867, + "learning_rate": 8.028550343363306e-06, + "loss": 0.8479, + "step": 6171 + }, + { + "epoch": 1.6414893617021278, + "grad_norm": 4.054781436920166, + "learning_rate": 8.027850487017488e-06, + "loss": 0.9293, + "step": 6172 + }, + { + "epoch": 1.641755319148936, + "grad_norm": 3.754241466522217, + "learning_rate": 8.027150536986411e-06, + "loss": 0.7714, + "step": 6173 + }, + { + "epoch": 1.6420212765957447, + "grad_norm": 3.6258599758148193, + "learning_rate": 8.026450493291731e-06, + "loss": 0.725, + "step": 6174 + }, + { + "epoch": 1.6422872340425532, + "grad_norm": 4.247791290283203, + "learning_rate": 8.025750355955112e-06, + "loss": 0.7394, + "step": 6175 + }, + { + "epoch": 1.6425531914893616, + "grad_norm": 3.7767536640167236, + "learning_rate": 8.025050124998213e-06, + "loss": 0.757, + "step": 6176 + }, + { + "epoch": 1.6428191489361703, + "grad_norm": 3.970726490020752, + "learning_rate": 8.0243498004427e-06, + "loss": 0.7449, + "step": 6177 + }, + { + "epoch": 1.6430851063829788, + "grad_norm": 4.161791801452637, + "learning_rate": 8.023649382310246e-06, + "loss": 0.8939, + "step": 6178 + }, + { + "epoch": 1.6433510638297872, + "grad_norm": 3.9791698455810547, + "learning_rate": 8.02294887062252e-06, + "loss": 0.7553, + "step": 6179 + }, + { + "epoch": 1.6436170212765957, + "grad_norm": 3.881882905960083, + "learning_rate": 8.022248265401196e-06, + "loss": 0.7806, + "step": 6180 + }, + { + "epoch": 1.6438829787234042, + "grad_norm": 4.165888786315918, + "learning_rate": 8.021547566667952e-06, + "loss": 0.7756, + "step": 6181 + }, + { + "epoch": 1.6441489361702128, + "grad_norm": 4.053508281707764, + "learning_rate": 8.02084677444447e-06, + "loss": 0.7472, + "step": 6182 + }, + { + "epoch": 1.6444148936170213, + "grad_norm": 4.370820045471191, + "learning_rate": 8.020145888752431e-06, + "loss": 0.858, + "step": 6183 + }, + { + "epoch": 1.6446808510638298, + "grad_norm": 4.108578205108643, + "learning_rate": 8.019444909613524e-06, + "loss": 0.8644, + "step": 6184 + }, + { + "epoch": 1.6449468085106385, + "grad_norm": 3.9922139644622803, + "learning_rate": 8.018743837049433e-06, + "loss": 0.7846, + "step": 6185 + }, + { + "epoch": 1.6452127659574467, + "grad_norm": 3.711470127105713, + "learning_rate": 8.018042671081858e-06, + "loss": 0.685, + "step": 6186 + }, + { + "epoch": 1.6454787234042554, + "grad_norm": 3.7997970581054688, + "learning_rate": 8.01734141173249e-06, + "loss": 0.7726, + "step": 6187 + }, + { + "epoch": 1.6457446808510638, + "grad_norm": 4.349726676940918, + "learning_rate": 8.016640059023023e-06, + "loss": 0.9296, + "step": 6188 + }, + { + "epoch": 1.6460106382978723, + "grad_norm": 3.8738739490509033, + "learning_rate": 8.01593861297516e-06, + "loss": 
0.9472, + "step": 6189 + }, + { + "epoch": 1.646276595744681, + "grad_norm": 4.002452850341797, + "learning_rate": 8.015237073610607e-06, + "loss": 0.7488, + "step": 6190 + }, + { + "epoch": 1.6465425531914892, + "grad_norm": 4.017054557800293, + "learning_rate": 8.01453544095107e-06, + "loss": 0.9446, + "step": 6191 + }, + { + "epoch": 1.646808510638298, + "grad_norm": 4.123724460601807, + "learning_rate": 8.013833715018256e-06, + "loss": 0.9052, + "step": 6192 + }, + { + "epoch": 1.6470744680851064, + "grad_norm": 3.664494752883911, + "learning_rate": 8.013131895833879e-06, + "loss": 0.7421, + "step": 6193 + }, + { + "epoch": 1.6473404255319148, + "grad_norm": 3.7503373622894287, + "learning_rate": 8.012429983419654e-06, + "loss": 0.7293, + "step": 6194 + }, + { + "epoch": 1.6476063829787235, + "grad_norm": 4.248551845550537, + "learning_rate": 8.0117279777973e-06, + "loss": 0.664, + "step": 6195 + }, + { + "epoch": 1.6478723404255318, + "grad_norm": 4.146711349487305, + "learning_rate": 8.011025878988534e-06, + "loss": 0.8164, + "step": 6196 + }, + { + "epoch": 1.6481382978723405, + "grad_norm": 3.8372318744659424, + "learning_rate": 8.010323687015083e-06, + "loss": 0.7173, + "step": 6197 + }, + { + "epoch": 1.648404255319149, + "grad_norm": 4.206233501434326, + "learning_rate": 8.009621401898671e-06, + "loss": 0.8324, + "step": 6198 + }, + { + "epoch": 1.6486702127659574, + "grad_norm": 3.9302217960357666, + "learning_rate": 8.008919023661033e-06, + "loss": 0.8095, + "step": 6199 + }, + { + "epoch": 1.648936170212766, + "grad_norm": 3.8333635330200195, + "learning_rate": 8.008216552323896e-06, + "loss": 0.6761, + "step": 6200 + }, + { + "epoch": 1.6492021276595743, + "grad_norm": 4.308274269104004, + "learning_rate": 8.007513987908997e-06, + "loss": 0.9286, + "step": 6201 + }, + { + "epoch": 1.649468085106383, + "grad_norm": 3.9875328540802, + "learning_rate": 8.006811330438076e-06, + "loss": 0.8439, + "step": 6202 + }, + { + "epoch": 1.6497340425531914, + "grad_norm": 3.9723567962646484, + "learning_rate": 8.006108579932869e-06, + "loss": 0.743, + "step": 6203 + }, + { + "epoch": 1.65, + "grad_norm": 3.6594903469085693, + "learning_rate": 8.005405736415127e-06, + "loss": 0.8403, + "step": 6204 + }, + { + "epoch": 1.6502659574468086, + "grad_norm": 3.7459709644317627, + "learning_rate": 8.00470279990659e-06, + "loss": 0.7611, + "step": 6205 + }, + { + "epoch": 1.650531914893617, + "grad_norm": 4.077069282531738, + "learning_rate": 8.003999770429013e-06, + "loss": 0.8415, + "step": 6206 + }, + { + "epoch": 1.6507978723404255, + "grad_norm": 4.072371482849121, + "learning_rate": 8.003296648004146e-06, + "loss": 0.8709, + "step": 6207 + }, + { + "epoch": 1.6510638297872342, + "grad_norm": 4.159237861633301, + "learning_rate": 8.002593432653743e-06, + "loss": 0.802, + "step": 6208 + }, + { + "epoch": 1.6513297872340424, + "grad_norm": 4.047359943389893, + "learning_rate": 8.001890124399565e-06, + "loss": 0.7666, + "step": 6209 + }, + { + "epoch": 1.6515957446808511, + "grad_norm": 3.548340320587158, + "learning_rate": 8.001186723263374e-06, + "loss": 0.8141, + "step": 6210 + }, + { + "epoch": 1.6518617021276596, + "grad_norm": 4.3510050773620605, + "learning_rate": 8.00048322926693e-06, + "loss": 0.7908, + "step": 6211 + }, + { + "epoch": 1.652127659574468, + "grad_norm": 3.642498254776001, + "learning_rate": 7.999779642432003e-06, + "loss": 0.8594, + "step": 6212 + }, + { + "epoch": 1.6523936170212767, + "grad_norm": 3.804325819015503, + "learning_rate": 7.999075962780363e-06, + "loss": 
0.7736, + "step": 6213 + }, + { + "epoch": 1.652659574468085, + "grad_norm": 4.080993175506592, + "learning_rate": 7.998372190333781e-06, + "loss": 0.8834, + "step": 6214 + }, + { + "epoch": 1.6529255319148937, + "grad_norm": 4.291904449462891, + "learning_rate": 7.997668325114033e-06, + "loss": 0.8433, + "step": 6215 + }, + { + "epoch": 1.6531914893617021, + "grad_norm": 3.4936020374298096, + "learning_rate": 7.996964367142899e-06, + "loss": 0.7045, + "step": 6216 + }, + { + "epoch": 1.6534574468085106, + "grad_norm": 4.251427173614502, + "learning_rate": 7.996260316442157e-06, + "loss": 0.8487, + "step": 6217 + }, + { + "epoch": 1.6537234042553193, + "grad_norm": 3.810161828994751, + "learning_rate": 7.995556173033594e-06, + "loss": 0.7715, + "step": 6218 + }, + { + "epoch": 1.6539893617021275, + "grad_norm": 3.8157644271850586, + "learning_rate": 7.994851936938996e-06, + "loss": 0.8408, + "step": 6219 + }, + { + "epoch": 1.6542553191489362, + "grad_norm": 3.614837646484375, + "learning_rate": 7.994147608180153e-06, + "loss": 0.7829, + "step": 6220 + }, + { + "epoch": 1.6545212765957447, + "grad_norm": 4.262511253356934, + "learning_rate": 7.99344318677886e-06, + "loss": 0.8728, + "step": 6221 + }, + { + "epoch": 1.6547872340425531, + "grad_norm": 4.14133358001709, + "learning_rate": 7.992738672756909e-06, + "loss": 0.8611, + "step": 6222 + }, + { + "epoch": 1.6550531914893618, + "grad_norm": 4.4198737144470215, + "learning_rate": 7.992034066136099e-06, + "loss": 0.8825, + "step": 6223 + }, + { + "epoch": 1.65531914893617, + "grad_norm": 4.433263778686523, + "learning_rate": 7.991329366938232e-06, + "loss": 0.9547, + "step": 6224 + }, + { + "epoch": 1.6555851063829787, + "grad_norm": 4.354765892028809, + "learning_rate": 7.990624575185116e-06, + "loss": 0.9415, + "step": 6225 + }, + { + "epoch": 1.6558510638297872, + "grad_norm": 4.149988174438477, + "learning_rate": 7.98991969089855e-06, + "loss": 0.7804, + "step": 6226 + }, + { + "epoch": 1.6561170212765957, + "grad_norm": 3.833970546722412, + "learning_rate": 7.98921471410035e-06, + "loss": 0.7944, + "step": 6227 + }, + { + "epoch": 1.6563829787234043, + "grad_norm": 3.816167116165161, + "learning_rate": 7.98850964481233e-06, + "loss": 0.8054, + "step": 6228 + }, + { + "epoch": 1.6566489361702128, + "grad_norm": 3.758295774459839, + "learning_rate": 7.987804483056301e-06, + "loss": 0.7724, + "step": 6229 + }, + { + "epoch": 1.6569148936170213, + "grad_norm": 4.2231669425964355, + "learning_rate": 7.987099228854083e-06, + "loss": 0.8713, + "step": 6230 + }, + { + "epoch": 1.65718085106383, + "grad_norm": 4.497824192047119, + "learning_rate": 7.9863938822275e-06, + "loss": 0.9629, + "step": 6231 + }, + { + "epoch": 1.6574468085106382, + "grad_norm": 3.9088895320892334, + "learning_rate": 7.985688443198371e-06, + "loss": 0.7597, + "step": 6232 + }, + { + "epoch": 1.6577127659574469, + "grad_norm": 3.699256658554077, + "learning_rate": 7.984982911788528e-06, + "loss": 0.8468, + "step": 6233 + }, + { + "epoch": 1.6579787234042553, + "grad_norm": 3.8971588611602783, + "learning_rate": 7.9842772880198e-06, + "loss": 0.8377, + "step": 6234 + }, + { + "epoch": 1.6582446808510638, + "grad_norm": 3.8062503337860107, + "learning_rate": 7.98357157191402e-06, + "loss": 0.6739, + "step": 6235 + }, + { + "epoch": 1.6585106382978725, + "grad_norm": 3.7170534133911133, + "learning_rate": 7.982865763493022e-06, + "loss": 0.7505, + "step": 6236 + }, + { + "epoch": 1.6587765957446807, + "grad_norm": 3.678074598312378, + "learning_rate": 
7.982159862778645e-06, + "loss": 0.7589, + "step": 6237 + }, + { + "epoch": 1.6590425531914894, + "grad_norm": 3.895219326019287, + "learning_rate": 7.98145386979273e-06, + "loss": 0.6712, + "step": 6238 + }, + { + "epoch": 1.6593085106382979, + "grad_norm": 4.339925765991211, + "learning_rate": 7.980747784557123e-06, + "loss": 0.9584, + "step": 6239 + }, + { + "epoch": 1.6595744680851063, + "grad_norm": 3.8446319103240967, + "learning_rate": 7.98004160709367e-06, + "loss": 0.7287, + "step": 6240 + }, + { + "epoch": 1.659840425531915, + "grad_norm": 3.852252960205078, + "learning_rate": 7.979335337424222e-06, + "loss": 0.9698, + "step": 6241 + }, + { + "epoch": 1.6601063829787233, + "grad_norm": 3.7780802249908447, + "learning_rate": 7.97862897557063e-06, + "loss": 0.8085, + "step": 6242 + }, + { + "epoch": 1.660372340425532, + "grad_norm": 3.954035758972168, + "learning_rate": 7.97792252155475e-06, + "loss": 0.8768, + "step": 6243 + }, + { + "epoch": 1.6606382978723404, + "grad_norm": 3.267712116241455, + "learning_rate": 7.977215975398442e-06, + "loss": 0.6974, + "step": 6244 + }, + { + "epoch": 1.6609042553191489, + "grad_norm": 3.534168243408203, + "learning_rate": 7.976509337123567e-06, + "loss": 0.8029, + "step": 6245 + }, + { + "epoch": 1.6611702127659576, + "grad_norm": 3.9597525596618652, + "learning_rate": 7.975802606751989e-06, + "loss": 0.7754, + "step": 6246 + }, + { + "epoch": 1.6614361702127658, + "grad_norm": 4.123916149139404, + "learning_rate": 7.975095784305572e-06, + "loss": 0.8451, + "step": 6247 + }, + { + "epoch": 1.6617021276595745, + "grad_norm": 3.989689588546753, + "learning_rate": 7.97438886980619e-06, + "loss": 0.7707, + "step": 6248 + }, + { + "epoch": 1.661968085106383, + "grad_norm": 4.045599937438965, + "learning_rate": 7.973681863275715e-06, + "loss": 0.7474, + "step": 6249 + }, + { + "epoch": 1.6622340425531914, + "grad_norm": 4.4239420890808105, + "learning_rate": 7.972974764736023e-06, + "loss": 0.7858, + "step": 6250 + }, + { + "epoch": 1.6625, + "grad_norm": 3.499119520187378, + "learning_rate": 7.972267574208991e-06, + "loss": 0.7021, + "step": 6251 + }, + { + "epoch": 1.6627659574468086, + "grad_norm": 4.45729923248291, + "learning_rate": 7.971560291716501e-06, + "loss": 0.9094, + "step": 6252 + }, + { + "epoch": 1.663031914893617, + "grad_norm": 4.242092609405518, + "learning_rate": 7.970852917280434e-06, + "loss": 0.8807, + "step": 6253 + }, + { + "epoch": 1.6632978723404257, + "grad_norm": 3.947512149810791, + "learning_rate": 7.970145450922684e-06, + "loss": 0.8778, + "step": 6254 + }, + { + "epoch": 1.663563829787234, + "grad_norm": 5.4790167808532715, + "learning_rate": 7.969437892665134e-06, + "loss": 0.8196, + "step": 6255 + }, + { + "epoch": 1.6638297872340426, + "grad_norm": 3.856820583343506, + "learning_rate": 7.968730242529681e-06, + "loss": 0.7653, + "step": 6256 + }, + { + "epoch": 1.664095744680851, + "grad_norm": 4.446346759796143, + "learning_rate": 7.968022500538219e-06, + "loss": 0.9374, + "step": 6257 + }, + { + "epoch": 1.6643617021276595, + "grad_norm": 4.079642295837402, + "learning_rate": 7.967314666712647e-06, + "loss": 0.8123, + "step": 6258 + }, + { + "epoch": 1.6646276595744682, + "grad_norm": 4.338622570037842, + "learning_rate": 7.966606741074864e-06, + "loss": 0.7508, + "step": 6259 + }, + { + "epoch": 1.6648936170212765, + "grad_norm": 3.974862813949585, + "learning_rate": 7.965898723646777e-06, + "loss": 0.8222, + "step": 6260 + }, + { + "epoch": 1.6651595744680852, + "grad_norm": 4.263228416442871, + 
"learning_rate": 7.96519061445029e-06, + "loss": 0.9591, + "step": 6261 + }, + { + "epoch": 1.6654255319148936, + "grad_norm": 3.6377105712890625, + "learning_rate": 7.964482413507316e-06, + "loss": 0.7791, + "step": 6262 + }, + { + "epoch": 1.665691489361702, + "grad_norm": 3.3404452800750732, + "learning_rate": 7.963774120839767e-06, + "loss": 0.7668, + "step": 6263 + }, + { + "epoch": 1.6659574468085108, + "grad_norm": 3.6252615451812744, + "learning_rate": 7.963065736469555e-06, + "loss": 0.7628, + "step": 6264 + }, + { + "epoch": 1.666223404255319, + "grad_norm": 4.053292751312256, + "learning_rate": 7.9623572604186e-06, + "loss": 0.9255, + "step": 6265 + }, + { + "epoch": 1.6664893617021277, + "grad_norm": 3.612187385559082, + "learning_rate": 7.961648692708826e-06, + "loss": 0.7864, + "step": 6266 + }, + { + "epoch": 1.6667553191489362, + "grad_norm": 4.19817590713501, + "learning_rate": 7.960940033362152e-06, + "loss": 0.8414, + "step": 6267 + }, + { + "epoch": 1.6670212765957446, + "grad_norm": 3.919515371322632, + "learning_rate": 7.960231282400509e-06, + "loss": 0.7358, + "step": 6268 + }, + { + "epoch": 1.6672872340425533, + "grad_norm": 4.0831732749938965, + "learning_rate": 7.959522439845825e-06, + "loss": 0.7613, + "step": 6269 + }, + { + "epoch": 1.6675531914893615, + "grad_norm": 4.200259685516357, + "learning_rate": 7.958813505720031e-06, + "loss": 0.9464, + "step": 6270 + }, + { + "epoch": 1.6678191489361702, + "grad_norm": 4.281257152557373, + "learning_rate": 7.958104480045066e-06, + "loss": 0.8795, + "step": 6271 + }, + { + "epoch": 1.6680851063829787, + "grad_norm": 3.907784938812256, + "learning_rate": 7.957395362842864e-06, + "loss": 0.6676, + "step": 6272 + }, + { + "epoch": 1.6683510638297872, + "grad_norm": 4.122792720794678, + "learning_rate": 7.956686154135368e-06, + "loss": 0.7808, + "step": 6273 + }, + { + "epoch": 1.6686170212765958, + "grad_norm": 4.015087127685547, + "learning_rate": 7.95597685394452e-06, + "loss": 0.8536, + "step": 6274 + }, + { + "epoch": 1.6688829787234043, + "grad_norm": 3.8058676719665527, + "learning_rate": 7.95526746229227e-06, + "loss": 0.8526, + "step": 6275 + }, + { + "epoch": 1.6691489361702128, + "grad_norm": 4.022008895874023, + "learning_rate": 7.954557979200562e-06, + "loss": 0.7642, + "step": 6276 + }, + { + "epoch": 1.6694148936170212, + "grad_norm": 3.820610284805298, + "learning_rate": 7.953848404691354e-06, + "loss": 0.8786, + "step": 6277 + }, + { + "epoch": 1.6696808510638297, + "grad_norm": 3.6477434635162354, + "learning_rate": 7.9531387387866e-06, + "loss": 0.8277, + "step": 6278 + }, + { + "epoch": 1.6699468085106384, + "grad_norm": 4.075412273406982, + "learning_rate": 7.952428981508254e-06, + "loss": 0.8095, + "step": 6279 + }, + { + "epoch": 1.6702127659574468, + "grad_norm": 4.030799388885498, + "learning_rate": 7.951719132878279e-06, + "loss": 0.7007, + "step": 6280 + }, + { + "epoch": 1.6704787234042553, + "grad_norm": 4.039961338043213, + "learning_rate": 7.95100919291864e-06, + "loss": 0.8829, + "step": 6281 + }, + { + "epoch": 1.670744680851064, + "grad_norm": 3.8483259677886963, + "learning_rate": 7.950299161651303e-06, + "loss": 0.7494, + "step": 6282 + }, + { + "epoch": 1.6710106382978722, + "grad_norm": 3.8535609245300293, + "learning_rate": 7.949589039098235e-06, + "loss": 0.7572, + "step": 6283 + }, + { + "epoch": 1.671276595744681, + "grad_norm": 4.3112311363220215, + "learning_rate": 7.94887882528141e-06, + "loss": 0.9061, + "step": 6284 + }, + { + "epoch": 1.6715425531914894, + "grad_norm": 
3.8851253986358643, + "learning_rate": 7.948168520222802e-06, + "loss": 0.9334, + "step": 6285 + }, + { + "epoch": 1.6718085106382978, + "grad_norm": 4.051077842712402, + "learning_rate": 7.94745812394439e-06, + "loss": 0.8568, + "step": 6286 + }, + { + "epoch": 1.6720744680851065, + "grad_norm": 3.8714540004730225, + "learning_rate": 7.946747636468153e-06, + "loss": 0.8496, + "step": 6287 + }, + { + "epoch": 1.6723404255319148, + "grad_norm": 3.9510905742645264, + "learning_rate": 7.946037057816075e-06, + "loss": 0.8367, + "step": 6288 + }, + { + "epoch": 1.6726063829787234, + "grad_norm": 4.504206657409668, + "learning_rate": 7.945326388010141e-06, + "loss": 0.8716, + "step": 6289 + }, + { + "epoch": 1.672872340425532, + "grad_norm": 4.116037845611572, + "learning_rate": 7.944615627072341e-06, + "loss": 0.8481, + "step": 6290 + }, + { + "epoch": 1.6731382978723404, + "grad_norm": 3.539327383041382, + "learning_rate": 7.943904775024667e-06, + "loss": 0.6687, + "step": 6291 + }, + { + "epoch": 1.673404255319149, + "grad_norm": 4.1150898933410645, + "learning_rate": 7.943193831889112e-06, + "loss": 0.9299, + "step": 6292 + }, + { + "epoch": 1.6736702127659573, + "grad_norm": 4.379646301269531, + "learning_rate": 7.942482797687675e-06, + "loss": 0.8867, + "step": 6293 + }, + { + "epoch": 1.673936170212766, + "grad_norm": 3.6255533695220947, + "learning_rate": 7.941771672442358e-06, + "loss": 0.6831, + "step": 6294 + }, + { + "epoch": 1.6742021276595744, + "grad_norm": 4.358723163604736, + "learning_rate": 7.94106045617516e-06, + "loss": 0.6923, + "step": 6295 + }, + { + "epoch": 1.674468085106383, + "grad_norm": 3.967379093170166, + "learning_rate": 7.94034914890809e-06, + "loss": 0.8413, + "step": 6296 + }, + { + "epoch": 1.6747340425531916, + "grad_norm": 4.233070373535156, + "learning_rate": 7.939637750663153e-06, + "loss": 0.9755, + "step": 6297 + }, + { + "epoch": 1.675, + "grad_norm": 3.4149739742279053, + "learning_rate": 7.938926261462366e-06, + "loss": 0.6741, + "step": 6298 + }, + { + "epoch": 1.6752659574468085, + "grad_norm": 4.045546054840088, + "learning_rate": 7.938214681327739e-06, + "loss": 0.8484, + "step": 6299 + }, + { + "epoch": 1.675531914893617, + "grad_norm": 4.123802185058594, + "learning_rate": 7.93750301028129e-06, + "loss": 0.8398, + "step": 6300 + }, + { + "epoch": 1.6757978723404254, + "grad_norm": 3.7821900844573975, + "learning_rate": 7.936791248345041e-06, + "loss": 0.7785, + "step": 6301 + }, + { + "epoch": 1.6760638297872341, + "grad_norm": 3.6713192462921143, + "learning_rate": 7.936079395541013e-06, + "loss": 0.7191, + "step": 6302 + }, + { + "epoch": 1.6763297872340426, + "grad_norm": 4.085387706756592, + "learning_rate": 7.935367451891232e-06, + "loss": 0.684, + "step": 6303 + }, + { + "epoch": 1.676595744680851, + "grad_norm": 3.6555123329162598, + "learning_rate": 7.934655417417724e-06, + "loss": 0.7526, + "step": 6304 + }, + { + "epoch": 1.6768617021276597, + "grad_norm": 3.9464025497436523, + "learning_rate": 7.933943292142524e-06, + "loss": 0.7544, + "step": 6305 + }, + { + "epoch": 1.677127659574468, + "grad_norm": 3.74369215965271, + "learning_rate": 7.933231076087662e-06, + "loss": 0.7524, + "step": 6306 + }, + { + "epoch": 1.6773936170212767, + "grad_norm": 4.703025817871094, + "learning_rate": 7.932518769275179e-06, + "loss": 0.8955, + "step": 6307 + }, + { + "epoch": 1.6776595744680851, + "grad_norm": 4.241019248962402, + "learning_rate": 7.931806371727111e-06, + "loss": 0.7727, + "step": 6308 + }, + { + "epoch": 1.6779255319148936, + 
"grad_norm": 4.029513359069824, + "learning_rate": 7.931093883465503e-06, + "loss": 0.7951, + "step": 6309 + }, + { + "epoch": 1.6781914893617023, + "grad_norm": 3.7332520484924316, + "learning_rate": 7.930381304512401e-06, + "loss": 0.7148, + "step": 6310 + }, + { + "epoch": 1.6784574468085105, + "grad_norm": 3.734999179840088, + "learning_rate": 7.92966863488985e-06, + "loss": 0.7856, + "step": 6311 + }, + { + "epoch": 1.6787234042553192, + "grad_norm": 4.164159774780273, + "learning_rate": 7.928955874619902e-06, + "loss": 0.8163, + "step": 6312 + }, + { + "epoch": 1.6789893617021276, + "grad_norm": 4.043959617614746, + "learning_rate": 7.928243023724611e-06, + "loss": 0.8262, + "step": 6313 + }, + { + "epoch": 1.679255319148936, + "grad_norm": 3.5217018127441406, + "learning_rate": 7.927530082226034e-06, + "loss": 0.7066, + "step": 6314 + }, + { + "epoch": 1.6795212765957448, + "grad_norm": 4.035088539123535, + "learning_rate": 7.926817050146227e-06, + "loss": 0.9041, + "step": 6315 + }, + { + "epoch": 1.679787234042553, + "grad_norm": 3.8981032371520996, + "learning_rate": 7.926103927507257e-06, + "loss": 0.8896, + "step": 6316 + }, + { + "epoch": 1.6800531914893617, + "grad_norm": 3.613386392593384, + "learning_rate": 7.925390714331185e-06, + "loss": 0.8692, + "step": 6317 + }, + { + "epoch": 1.6803191489361702, + "grad_norm": 4.042194843292236, + "learning_rate": 7.924677410640081e-06, + "loss": 0.8251, + "step": 6318 + }, + { + "epoch": 1.6805851063829786, + "grad_norm": 3.749028444290161, + "learning_rate": 7.923964016456014e-06, + "loss": 0.8519, + "step": 6319 + }, + { + "epoch": 1.6808510638297873, + "grad_norm": 3.482661008834839, + "learning_rate": 7.92325053180106e-06, + "loss": 0.6798, + "step": 6320 + }, + { + "epoch": 1.6811170212765958, + "grad_norm": 3.876594066619873, + "learning_rate": 7.92253695669729e-06, + "loss": 0.8437, + "step": 6321 + }, + { + "epoch": 1.6813829787234043, + "grad_norm": 3.941342830657959, + "learning_rate": 7.921823291166785e-06, + "loss": 0.7915, + "step": 6322 + }, + { + "epoch": 1.6816489361702127, + "grad_norm": 4.015593528747559, + "learning_rate": 7.92110953523163e-06, + "loss": 0.8184, + "step": 6323 + }, + { + "epoch": 1.6819148936170212, + "grad_norm": 4.370626449584961, + "learning_rate": 7.920395688913906e-06, + "loss": 0.962, + "step": 6324 + }, + { + "epoch": 1.6821808510638299, + "grad_norm": 3.7897567749023438, + "learning_rate": 7.919681752235701e-06, + "loss": 0.9113, + "step": 6325 + }, + { + "epoch": 1.6824468085106383, + "grad_norm": 3.8005380630493164, + "learning_rate": 7.918967725219104e-06, + "loss": 0.869, + "step": 6326 + }, + { + "epoch": 1.6827127659574468, + "grad_norm": 4.056982040405273, + "learning_rate": 7.918253607886212e-06, + "loss": 0.8451, + "step": 6327 + }, + { + "epoch": 1.6829787234042555, + "grad_norm": 3.5084946155548096, + "learning_rate": 7.917539400259116e-06, + "loss": 0.7714, + "step": 6328 + }, + { + "epoch": 1.6832446808510637, + "grad_norm": 3.9143457412719727, + "learning_rate": 7.916825102359914e-06, + "loss": 0.8663, + "step": 6329 + }, + { + "epoch": 1.6835106382978724, + "grad_norm": 3.867074966430664, + "learning_rate": 7.916110714210711e-06, + "loss": 0.8741, + "step": 6330 + }, + { + "epoch": 1.6837765957446809, + "grad_norm": 3.8426260948181152, + "learning_rate": 7.91539623583361e-06, + "loss": 0.8347, + "step": 6331 + }, + { + "epoch": 1.6840425531914893, + "grad_norm": 3.8092234134674072, + "learning_rate": 7.914681667250714e-06, + "loss": 0.8565, + "step": 6332 + }, + { + "epoch": 
1.684308510638298, + "grad_norm": 3.754821538925171, + "learning_rate": 7.913967008484138e-06, + "loss": 0.6845, + "step": 6333 + }, + { + "epoch": 1.6845744680851062, + "grad_norm": 4.067741394042969, + "learning_rate": 7.913252259555992e-06, + "loss": 0.7716, + "step": 6334 + }, + { + "epoch": 1.684840425531915, + "grad_norm": 4.096173286437988, + "learning_rate": 7.91253742048839e-06, + "loss": 0.8299, + "step": 6335 + }, + { + "epoch": 1.6851063829787234, + "grad_norm": 4.119457721710205, + "learning_rate": 7.911822491303453e-06, + "loss": 0.8621, + "step": 6336 + }, + { + "epoch": 1.6853723404255319, + "grad_norm": 4.278772354125977, + "learning_rate": 7.911107472023298e-06, + "loss": 0.8446, + "step": 6337 + }, + { + "epoch": 1.6856382978723405, + "grad_norm": 3.7795321941375732, + "learning_rate": 7.910392362670051e-06, + "loss": 0.6943, + "step": 6338 + }, + { + "epoch": 1.6859042553191488, + "grad_norm": 3.9733240604400635, + "learning_rate": 7.909677163265838e-06, + "loss": 0.6562, + "step": 6339 + }, + { + "epoch": 1.6861702127659575, + "grad_norm": 4.160102844238281, + "learning_rate": 7.908961873832788e-06, + "loss": 0.7915, + "step": 6340 + }, + { + "epoch": 1.686436170212766, + "grad_norm": 4.3431525230407715, + "learning_rate": 7.908246494393032e-06, + "loss": 0.8474, + "step": 6341 + }, + { + "epoch": 1.6867021276595744, + "grad_norm": 4.230860233306885, + "learning_rate": 7.907531024968705e-06, + "loss": 0.7098, + "step": 6342 + }, + { + "epoch": 1.686968085106383, + "grad_norm": 4.223114967346191, + "learning_rate": 7.906815465581945e-06, + "loss": 0.7278, + "step": 6343 + }, + { + "epoch": 1.6872340425531915, + "grad_norm": 4.246336460113525, + "learning_rate": 7.906099816254895e-06, + "loss": 0.825, + "step": 6344 + }, + { + "epoch": 1.6875, + "grad_norm": 3.5722670555114746, + "learning_rate": 7.905384077009693e-06, + "loss": 0.8907, + "step": 6345 + }, + { + "epoch": 1.6877659574468085, + "grad_norm": 4.00727653503418, + "learning_rate": 7.904668247868486e-06, + "loss": 0.7821, + "step": 6346 + }, + { + "epoch": 1.688031914893617, + "grad_norm": 3.889538049697876, + "learning_rate": 7.903952328853426e-06, + "loss": 0.7967, + "step": 6347 + }, + { + "epoch": 1.6882978723404256, + "grad_norm": 3.923154830932617, + "learning_rate": 7.90323631998666e-06, + "loss": 0.8152, + "step": 6348 + }, + { + "epoch": 1.688563829787234, + "grad_norm": 4.059485912322998, + "learning_rate": 7.902520221290345e-06, + "loss": 0.7824, + "step": 6349 + }, + { + "epoch": 1.6888297872340425, + "grad_norm": 4.1757378578186035, + "learning_rate": 7.901804032786637e-06, + "loss": 0.8839, + "step": 6350 + }, + { + "epoch": 1.6890957446808512, + "grad_norm": 3.6736671924591064, + "learning_rate": 7.901087754497694e-06, + "loss": 0.684, + "step": 6351 + }, + { + "epoch": 1.6893617021276595, + "grad_norm": 4.116995811462402, + "learning_rate": 7.900371386445682e-06, + "loss": 0.9625, + "step": 6352 + }, + { + "epoch": 1.6896276595744681, + "grad_norm": 3.686619758605957, + "learning_rate": 7.899654928652765e-06, + "loss": 0.8667, + "step": 6353 + }, + { + "epoch": 1.6898936170212766, + "grad_norm": 4.151339054107666, + "learning_rate": 7.89893838114111e-06, + "loss": 0.8102, + "step": 6354 + }, + { + "epoch": 1.690159574468085, + "grad_norm": 3.7917020320892334, + "learning_rate": 7.898221743932887e-06, + "loss": 0.934, + "step": 6355 + }, + { + "epoch": 1.6904255319148938, + "grad_norm": 3.5394623279571533, + "learning_rate": 7.897505017050272e-06, + "loss": 0.7577, + "step": 6356 + }, + { + 
"epoch": 1.690691489361702, + "grad_norm": 4.058946132659912, + "learning_rate": 7.896788200515442e-06, + "loss": 0.7536, + "step": 6357 + }, + { + "epoch": 1.6909574468085107, + "grad_norm": 3.8410744667053223, + "learning_rate": 7.896071294350574e-06, + "loss": 0.8212, + "step": 6358 + }, + { + "epoch": 1.6912234042553191, + "grad_norm": 3.915674924850464, + "learning_rate": 7.89535429857785e-06, + "loss": 0.8288, + "step": 6359 + }, + { + "epoch": 1.6914893617021276, + "grad_norm": 3.954108715057373, + "learning_rate": 7.894637213219454e-06, + "loss": 0.7738, + "step": 6360 + }, + { + "epoch": 1.6917553191489363, + "grad_norm": 4.220264434814453, + "learning_rate": 7.893920038297575e-06, + "loss": 0.7686, + "step": 6361 + }, + { + "epoch": 1.6920212765957445, + "grad_norm": 4.50542688369751, + "learning_rate": 7.893202773834404e-06, + "loss": 0.825, + "step": 6362 + }, + { + "epoch": 1.6922872340425532, + "grad_norm": 4.274563312530518, + "learning_rate": 7.892485419852131e-06, + "loss": 0.8119, + "step": 6363 + }, + { + "epoch": 1.6925531914893617, + "grad_norm": 3.8938279151916504, + "learning_rate": 7.891767976372957e-06, + "loss": 0.9073, + "step": 6364 + }, + { + "epoch": 1.6928191489361701, + "grad_norm": 3.949944257736206, + "learning_rate": 7.891050443419074e-06, + "loss": 0.757, + "step": 6365 + }, + { + "epoch": 1.6930851063829788, + "grad_norm": 4.313665866851807, + "learning_rate": 7.890332821012687e-06, + "loss": 0.8997, + "step": 6366 + }, + { + "epoch": 1.6933510638297873, + "grad_norm": 4.165764331817627, + "learning_rate": 7.889615109176e-06, + "loss": 0.8262, + "step": 6367 + }, + { + "epoch": 1.6936170212765957, + "grad_norm": 3.462186336517334, + "learning_rate": 7.88889730793122e-06, + "loss": 0.6989, + "step": 6368 + }, + { + "epoch": 1.6938829787234042, + "grad_norm": 4.610195159912109, + "learning_rate": 7.888179417300556e-06, + "loss": 0.924, + "step": 6369 + }, + { + "epoch": 1.6941489361702127, + "grad_norm": 3.8986306190490723, + "learning_rate": 7.887461437306221e-06, + "loss": 0.8204, + "step": 6370 + }, + { + "epoch": 1.6944148936170214, + "grad_norm": 3.9623425006866455, + "learning_rate": 7.886743367970428e-06, + "loss": 0.8856, + "step": 6371 + }, + { + "epoch": 1.6946808510638298, + "grad_norm": 3.7937700748443604, + "learning_rate": 7.886025209315396e-06, + "loss": 0.905, + "step": 6372 + }, + { + "epoch": 1.6949468085106383, + "grad_norm": 3.6256890296936035, + "learning_rate": 7.885306961363347e-06, + "loss": 0.7097, + "step": 6373 + }, + { + "epoch": 1.695212765957447, + "grad_norm": 4.079528331756592, + "learning_rate": 7.884588624136505e-06, + "loss": 0.8255, + "step": 6374 + }, + { + "epoch": 1.6954787234042552, + "grad_norm": 3.7182741165161133, + "learning_rate": 7.883870197657094e-06, + "loss": 0.671, + "step": 6375 + }, + { + "epoch": 1.695744680851064, + "grad_norm": 3.2320377826690674, + "learning_rate": 7.883151681947343e-06, + "loss": 0.6876, + "step": 6376 + }, + { + "epoch": 1.6960106382978724, + "grad_norm": 3.610546588897705, + "learning_rate": 7.882433077029484e-06, + "loss": 0.7904, + "step": 6377 + }, + { + "epoch": 1.6962765957446808, + "grad_norm": 3.8851020336151123, + "learning_rate": 7.881714382925753e-06, + "loss": 0.7701, + "step": 6378 + }, + { + "epoch": 1.6965425531914895, + "grad_norm": 3.727907657623291, + "learning_rate": 7.880995599658387e-06, + "loss": 0.8374, + "step": 6379 + }, + { + "epoch": 1.6968085106382977, + "grad_norm": 3.564770221710205, + "learning_rate": 7.880276727249623e-06, + "loss": 0.6483, + "step": 
6380 + }, + { + "epoch": 1.6970744680851064, + "grad_norm": 4.088687419891357, + "learning_rate": 7.879557765721707e-06, + "loss": 0.7902, + "step": 6381 + }, + { + "epoch": 1.6973404255319149, + "grad_norm": 4.087176322937012, + "learning_rate": 7.878838715096883e-06, + "loss": 0.8723, + "step": 6382 + }, + { + "epoch": 1.6976063829787233, + "grad_norm": 3.7613840103149414, + "learning_rate": 7.878119575397401e-06, + "loss": 0.7559, + "step": 6383 + }, + { + "epoch": 1.697872340425532, + "grad_norm": 4.426526069641113, + "learning_rate": 7.87740034664551e-06, + "loss": 1.1472, + "step": 6384 + }, + { + "epoch": 1.6981382978723403, + "grad_norm": 3.5922887325286865, + "learning_rate": 7.876681028863464e-06, + "loss": 0.8193, + "step": 6385 + }, + { + "epoch": 1.698404255319149, + "grad_norm": 4.141395092010498, + "learning_rate": 7.875961622073523e-06, + "loss": 0.8629, + "step": 6386 + }, + { + "epoch": 1.6986702127659574, + "grad_norm": 3.894594669342041, + "learning_rate": 7.875242126297939e-06, + "loss": 0.8301, + "step": 6387 + }, + { + "epoch": 1.6989361702127659, + "grad_norm": 3.929243564605713, + "learning_rate": 7.87452254155898e-06, + "loss": 0.8301, + "step": 6388 + }, + { + "epoch": 1.6992021276595746, + "grad_norm": 3.575058698654175, + "learning_rate": 7.87380286787891e-06, + "loss": 0.7595, + "step": 6389 + }, + { + "epoch": 1.699468085106383, + "grad_norm": 3.9643123149871826, + "learning_rate": 7.873083105279996e-06, + "loss": 0.8527, + "step": 6390 + }, + { + "epoch": 1.6997340425531915, + "grad_norm": 3.8817079067230225, + "learning_rate": 7.872363253784508e-06, + "loss": 0.6764, + "step": 6391 + }, + { + "epoch": 1.7, + "grad_norm": 4.209853649139404, + "learning_rate": 7.871643313414718e-06, + "loss": 0.8082, + "step": 6392 + }, + { + "epoch": 1.7002659574468084, + "grad_norm": 3.9260003566741943, + "learning_rate": 7.870923284192904e-06, + "loss": 0.7839, + "step": 6393 + }, + { + "epoch": 1.700531914893617, + "grad_norm": 3.726177453994751, + "learning_rate": 7.870203166141343e-06, + "loss": 0.721, + "step": 6394 + }, + { + "epoch": 1.7007978723404256, + "grad_norm": 4.2059326171875, + "learning_rate": 7.869482959282318e-06, + "loss": 0.7346, + "step": 6395 + }, + { + "epoch": 1.701063829787234, + "grad_norm": 4.017068862915039, + "learning_rate": 7.868762663638111e-06, + "loss": 0.6286, + "step": 6396 + }, + { + "epoch": 1.7013297872340427, + "grad_norm": 3.6799540519714355, + "learning_rate": 7.86804227923101e-06, + "loss": 0.7389, + "step": 6397 + }, + { + "epoch": 1.701595744680851, + "grad_norm": 3.797459602355957, + "learning_rate": 7.867321806083303e-06, + "loss": 0.7271, + "step": 6398 + }, + { + "epoch": 1.7018617021276596, + "grad_norm": 3.9897758960723877, + "learning_rate": 7.866601244217284e-06, + "loss": 0.8449, + "step": 6399 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 4.305942058563232, + "learning_rate": 7.86588059365525e-06, + "loss": 0.8108, + "step": 6400 + }, + { + "epoch": 1.7023936170212766, + "grad_norm": 3.727057456970215, + "learning_rate": 7.865159854419493e-06, + "loss": 0.801, + "step": 6401 + }, + { + "epoch": 1.7026595744680852, + "grad_norm": 3.9825263023376465, + "learning_rate": 7.864439026532318e-06, + "loss": 0.8026, + "step": 6402 + }, + { + "epoch": 1.7029255319148935, + "grad_norm": 3.602372884750366, + "learning_rate": 7.863718110016025e-06, + "loss": 0.6829, + "step": 6403 + }, + { + "epoch": 1.7031914893617022, + "grad_norm": 4.175540447235107, + "learning_rate": 7.862997104892924e-06, + "loss": 0.7491, + "step": 
6404 + }, + { + "epoch": 1.7034574468085106, + "grad_norm": 3.7469863891601562, + "learning_rate": 7.862276011185323e-06, + "loss": 0.6495, + "step": 6405 + }, + { + "epoch": 1.703723404255319, + "grad_norm": 3.860929012298584, + "learning_rate": 7.861554828915531e-06, + "loss": 0.8538, + "step": 6406 + }, + { + "epoch": 1.7039893617021278, + "grad_norm": 3.6298773288726807, + "learning_rate": 7.860833558105863e-06, + "loss": 0.7653, + "step": 6407 + }, + { + "epoch": 1.704255319148936, + "grad_norm": 3.6208910942077637, + "learning_rate": 7.860112198778638e-06, + "loss": 0.8272, + "step": 6408 + }, + { + "epoch": 1.7045212765957447, + "grad_norm": 3.9331130981445312, + "learning_rate": 7.859390750956172e-06, + "loss": 0.802, + "step": 6409 + }, + { + "epoch": 1.7047872340425532, + "grad_norm": 3.843306303024292, + "learning_rate": 7.858669214660792e-06, + "loss": 0.8426, + "step": 6410 + }, + { + "epoch": 1.7050531914893616, + "grad_norm": 3.844093084335327, + "learning_rate": 7.857947589914819e-06, + "loss": 0.7836, + "step": 6411 + }, + { + "epoch": 1.7053191489361703, + "grad_norm": 3.7956225872039795, + "learning_rate": 7.857225876740585e-06, + "loss": 0.7151, + "step": 6412 + }, + { + "epoch": 1.7055851063829788, + "grad_norm": 3.568847417831421, + "learning_rate": 7.856504075160416e-06, + "loss": 0.8406, + "step": 6413 + }, + { + "epoch": 1.7058510638297872, + "grad_norm": 5.6517462730407715, + "learning_rate": 7.855782185196648e-06, + "loss": 0.8804, + "step": 6414 + }, + { + "epoch": 1.7061170212765957, + "grad_norm": 3.6728999614715576, + "learning_rate": 7.855060206871618e-06, + "loss": 0.7445, + "step": 6415 + }, + { + "epoch": 1.7063829787234042, + "grad_norm": 4.358402729034424, + "learning_rate": 7.854338140207662e-06, + "loss": 0.7949, + "step": 6416 + }, + { + "epoch": 1.7066489361702128, + "grad_norm": 4.032132625579834, + "learning_rate": 7.853615985227126e-06, + "loss": 0.8492, + "step": 6417 + }, + { + "epoch": 1.7069148936170213, + "grad_norm": 4.185794353485107, + "learning_rate": 7.85289374195235e-06, + "loss": 0.9054, + "step": 6418 + }, + { + "epoch": 1.7071808510638298, + "grad_norm": 4.639225006103516, + "learning_rate": 7.852171410405684e-06, + "loss": 0.9118, + "step": 6419 + }, + { + "epoch": 1.7074468085106385, + "grad_norm": 3.67490816116333, + "learning_rate": 7.851448990609476e-06, + "loss": 0.8046, + "step": 6420 + }, + { + "epoch": 1.7077127659574467, + "grad_norm": 3.879056692123413, + "learning_rate": 7.850726482586078e-06, + "loss": 0.6831, + "step": 6421 + }, + { + "epoch": 1.7079787234042554, + "grad_norm": 3.963789463043213, + "learning_rate": 7.850003886357847e-06, + "loss": 0.7881, + "step": 6422 + }, + { + "epoch": 1.7082446808510638, + "grad_norm": 4.229506015777588, + "learning_rate": 7.849281201947142e-06, + "loss": 0.8157, + "step": 6423 + }, + { + "epoch": 1.7085106382978723, + "grad_norm": 4.29874849319458, + "learning_rate": 7.84855842937632e-06, + "loss": 0.9049, + "step": 6424 + }, + { + "epoch": 1.708776595744681, + "grad_norm": 3.8917417526245117, + "learning_rate": 7.847835568667746e-06, + "loss": 0.7922, + "step": 6425 + }, + { + "epoch": 1.7090425531914892, + "grad_norm": 3.8562116622924805, + "learning_rate": 7.847112619843789e-06, + "loss": 0.7363, + "step": 6426 + }, + { + "epoch": 1.709308510638298, + "grad_norm": 4.495066165924072, + "learning_rate": 7.846389582926814e-06, + "loss": 0.977, + "step": 6427 + }, + { + "epoch": 1.7095744680851064, + "grad_norm": 3.899489164352417, + "learning_rate": 7.845666457939193e-06, + 
"loss": 0.7289, + "step": 6428 + }, + { + "epoch": 1.7098404255319148, + "grad_norm": 3.9472427368164062, + "learning_rate": 7.844943244903303e-06, + "loss": 0.8273, + "step": 6429 + }, + { + "epoch": 1.7101063829787235, + "grad_norm": 4.187959671020508, + "learning_rate": 7.84421994384152e-06, + "loss": 0.8658, + "step": 6430 + }, + { + "epoch": 1.7103723404255318, + "grad_norm": 4.103062152862549, + "learning_rate": 7.843496554776222e-06, + "loss": 0.8097, + "step": 6431 + }, + { + "epoch": 1.7106382978723405, + "grad_norm": 3.977741241455078, + "learning_rate": 7.842773077729793e-06, + "loss": 0.799, + "step": 6432 + }, + { + "epoch": 1.710904255319149, + "grad_norm": 3.8812167644500732, + "learning_rate": 7.842049512724618e-06, + "loss": 0.6743, + "step": 6433 + }, + { + "epoch": 1.7111702127659574, + "grad_norm": 4.060866832733154, + "learning_rate": 7.841325859783086e-06, + "loss": 0.7479, + "step": 6434 + }, + { + "epoch": 1.711436170212766, + "grad_norm": 4.428943634033203, + "learning_rate": 7.840602118927584e-06, + "loss": 0.9101, + "step": 6435 + }, + { + "epoch": 1.7117021276595743, + "grad_norm": 3.989323139190674, + "learning_rate": 7.83987829018051e-06, + "loss": 0.8308, + "step": 6436 + }, + { + "epoch": 1.711968085106383, + "grad_norm": 4.173738479614258, + "learning_rate": 7.83915437356426e-06, + "loss": 0.8025, + "step": 6437 + }, + { + "epoch": 1.7122340425531914, + "grad_norm": 3.7683372497558594, + "learning_rate": 7.838430369101227e-06, + "loss": 0.8168, + "step": 6438 + }, + { + "epoch": 1.7125, + "grad_norm": 3.9382693767547607, + "learning_rate": 7.837706276813819e-06, + "loss": 0.8469, + "step": 6439 + }, + { + "epoch": 1.7127659574468086, + "grad_norm": 4.1283278465271, + "learning_rate": 7.836982096724438e-06, + "loss": 0.7938, + "step": 6440 + }, + { + "epoch": 1.713031914893617, + "grad_norm": 4.033618927001953, + "learning_rate": 7.836257828855489e-06, + "loss": 0.8479, + "step": 6441 + }, + { + "epoch": 1.7132978723404255, + "grad_norm": 4.25187349319458, + "learning_rate": 7.835533473229385e-06, + "loss": 0.8507, + "step": 6442 + }, + { + "epoch": 1.7135638297872342, + "grad_norm": 4.031279563903809, + "learning_rate": 7.834809029868538e-06, + "loss": 0.8444, + "step": 6443 + }, + { + "epoch": 1.7138297872340424, + "grad_norm": 3.5434410572052, + "learning_rate": 7.834084498795361e-06, + "loss": 0.6862, + "step": 6444 + }, + { + "epoch": 1.7140957446808511, + "grad_norm": 4.158623218536377, + "learning_rate": 7.833359880032272e-06, + "loss": 0.8362, + "step": 6445 + }, + { + "epoch": 1.7143617021276596, + "grad_norm": 4.039031982421875, + "learning_rate": 7.832635173601692e-06, + "loss": 0.8806, + "step": 6446 + }, + { + "epoch": 1.714627659574468, + "grad_norm": 4.09163236618042, + "learning_rate": 7.831910379526047e-06, + "loss": 0.9957, + "step": 6447 + }, + { + "epoch": 1.7148936170212767, + "grad_norm": 3.4675064086914062, + "learning_rate": 7.831185497827758e-06, + "loss": 0.7451, + "step": 6448 + }, + { + "epoch": 1.715159574468085, + "grad_norm": 3.6473426818847656, + "learning_rate": 7.830460528529258e-06, + "loss": 0.7436, + "step": 6449 + }, + { + "epoch": 1.7154255319148937, + "grad_norm": 3.779623508453369, + "learning_rate": 7.829735471652978e-06, + "loss": 0.7522, + "step": 6450 + }, + { + "epoch": 1.7156914893617021, + "grad_norm": 3.759127616882324, + "learning_rate": 7.829010327221348e-06, + "loss": 0.8186, + "step": 6451 + }, + { + "epoch": 1.7159574468085106, + "grad_norm": 3.606985330581665, + "learning_rate": 7.828285095256808e-06, + 
"loss": 0.8916, + "step": 6452 + }, + { + "epoch": 1.7162234042553193, + "grad_norm": 3.6981024742126465, + "learning_rate": 7.8275597757818e-06, + "loss": 0.7967, + "step": 6453 + }, + { + "epoch": 1.7164893617021275, + "grad_norm": 3.8665547370910645, + "learning_rate": 7.826834368818761e-06, + "loss": 0.731, + "step": 6454 + }, + { + "epoch": 1.7167553191489362, + "grad_norm": 3.547314167022705, + "learning_rate": 7.826108874390141e-06, + "loss": 0.7793, + "step": 6455 + }, + { + "epoch": 1.7170212765957447, + "grad_norm": 3.823787212371826, + "learning_rate": 7.825383292518383e-06, + "loss": 0.7854, + "step": 6456 + }, + { + "epoch": 1.7172872340425531, + "grad_norm": 4.252329349517822, + "learning_rate": 7.82465762322594e-06, + "loss": 0.9033, + "step": 6457 + }, + { + "epoch": 1.7175531914893618, + "grad_norm": 3.9819960594177246, + "learning_rate": 7.823931866535264e-06, + "loss": 0.9616, + "step": 6458 + }, + { + "epoch": 1.71781914893617, + "grad_norm": 4.099963665008545, + "learning_rate": 7.823206022468812e-06, + "loss": 0.8145, + "step": 6459 + }, + { + "epoch": 1.7180851063829787, + "grad_norm": 4.146093368530273, + "learning_rate": 7.82248009104904e-06, + "loss": 0.7693, + "step": 6460 + }, + { + "epoch": 1.7183510638297872, + "grad_norm": 3.9053497314453125, + "learning_rate": 7.821754072298414e-06, + "loss": 0.8287, + "step": 6461 + }, + { + "epoch": 1.7186170212765957, + "grad_norm": 4.186066150665283, + "learning_rate": 7.821027966239393e-06, + "loss": 0.7655, + "step": 6462 + }, + { + "epoch": 1.7188829787234043, + "grad_norm": 4.364232540130615, + "learning_rate": 7.820301772894445e-06, + "loss": 0.7746, + "step": 6463 + }, + { + "epoch": 1.7191489361702128, + "grad_norm": 3.838639736175537, + "learning_rate": 7.81957549228604e-06, + "loss": 0.8342, + "step": 6464 + }, + { + "epoch": 1.7194148936170213, + "grad_norm": 4.181699752807617, + "learning_rate": 7.818849124436651e-06, + "loss": 0.8181, + "step": 6465 + }, + { + "epoch": 1.71968085106383, + "grad_norm": 4.069806098937988, + "learning_rate": 7.818122669368751e-06, + "loss": 0.7486, + "step": 6466 + }, + { + "epoch": 1.7199468085106382, + "grad_norm": 3.9210989475250244, + "learning_rate": 7.817396127104815e-06, + "loss": 0.8064, + "step": 6467 + }, + { + "epoch": 1.7202127659574469, + "grad_norm": 3.3825418949127197, + "learning_rate": 7.816669497667328e-06, + "loss": 0.7276, + "step": 6468 + }, + { + "epoch": 1.7204787234042553, + "grad_norm": 4.07489013671875, + "learning_rate": 7.815942781078772e-06, + "loss": 0.7628, + "step": 6469 + }, + { + "epoch": 1.7207446808510638, + "grad_norm": 4.20849084854126, + "learning_rate": 7.815215977361628e-06, + "loss": 0.822, + "step": 6470 + }, + { + "epoch": 1.7210106382978725, + "grad_norm": 4.13023567199707, + "learning_rate": 7.814489086538388e-06, + "loss": 0.8117, + "step": 6471 + }, + { + "epoch": 1.7212765957446807, + "grad_norm": 4.143436431884766, + "learning_rate": 7.813762108631544e-06, + "loss": 0.8769, + "step": 6472 + }, + { + "epoch": 1.7215425531914894, + "grad_norm": 3.954219102859497, + "learning_rate": 7.813035043663585e-06, + "loss": 0.7836, + "step": 6473 + }, + { + "epoch": 1.7218085106382979, + "grad_norm": 3.688133478164673, + "learning_rate": 7.81230789165701e-06, + "loss": 0.8905, + "step": 6474 + }, + { + "epoch": 1.7220744680851063, + "grad_norm": 4.443986892700195, + "learning_rate": 7.811580652634319e-06, + "loss": 0.8933, + "step": 6475 + }, + { + "epoch": 1.722340425531915, + "grad_norm": 3.791365146636963, + "learning_rate": 
7.810853326618012e-06, + "loss": 0.8278, + "step": 6476 + }, + { + "epoch": 1.7226063829787233, + "grad_norm": 4.167088031768799, + "learning_rate": 7.810125913630593e-06, + "loss": 0.7669, + "step": 6477 + }, + { + "epoch": 1.722872340425532, + "grad_norm": 3.4958133697509766, + "learning_rate": 7.80939841369457e-06, + "loss": 0.7095, + "step": 6478 + }, + { + "epoch": 1.7231382978723404, + "grad_norm": 4.2002339363098145, + "learning_rate": 7.808670826832455e-06, + "loss": 0.7463, + "step": 6479 + }, + { + "epoch": 1.7234042553191489, + "grad_norm": 3.795557737350464, + "learning_rate": 7.807943153066754e-06, + "loss": 0.6731, + "step": 6480 + }, + { + "epoch": 1.7236702127659576, + "grad_norm": 3.272183895111084, + "learning_rate": 7.807215392419988e-06, + "loss": 0.6116, + "step": 6481 + }, + { + "epoch": 1.7239361702127658, + "grad_norm": 4.027061462402344, + "learning_rate": 7.806487544914672e-06, + "loss": 0.8122, + "step": 6482 + }, + { + "epoch": 1.7242021276595745, + "grad_norm": 3.5909063816070557, + "learning_rate": 7.805759610573327e-06, + "loss": 0.7915, + "step": 6483 + }, + { + "epoch": 1.724468085106383, + "grad_norm": 4.0041961669921875, + "learning_rate": 7.805031589418477e-06, + "loss": 0.6859, + "step": 6484 + }, + { + "epoch": 1.7247340425531914, + "grad_norm": 3.9270341396331787, + "learning_rate": 7.804303481472645e-06, + "loss": 0.7585, + "step": 6485 + }, + { + "epoch": 1.725, + "grad_norm": 4.444969654083252, + "learning_rate": 7.803575286758365e-06, + "loss": 0.8409, + "step": 6486 + }, + { + "epoch": 1.7252659574468086, + "grad_norm": 4.4063262939453125, + "learning_rate": 7.802847005298162e-06, + "loss": 1.0173, + "step": 6487 + }, + { + "epoch": 1.725531914893617, + "grad_norm": 4.078791618347168, + "learning_rate": 7.802118637114575e-06, + "loss": 0.8106, + "step": 6488 + }, + { + "epoch": 1.7257978723404257, + "grad_norm": 3.8760604858398438, + "learning_rate": 7.801390182230137e-06, + "loss": 0.7751, + "step": 6489 + }, + { + "epoch": 1.726063829787234, + "grad_norm": 4.180771350860596, + "learning_rate": 7.800661640667388e-06, + "loss": 0.8671, + "step": 6490 + }, + { + "epoch": 1.7263297872340426, + "grad_norm": 3.921558380126953, + "learning_rate": 7.799933012448872e-06, + "loss": 0.8414, + "step": 6491 + }, + { + "epoch": 1.726595744680851, + "grad_norm": 3.8960835933685303, + "learning_rate": 7.799204297597129e-06, + "loss": 0.7135, + "step": 6492 + }, + { + "epoch": 1.7268617021276595, + "grad_norm": 3.834841251373291, + "learning_rate": 7.798475496134714e-06, + "loss": 0.7374, + "step": 6493 + }, + { + "epoch": 1.7271276595744682, + "grad_norm": 3.5948872566223145, + "learning_rate": 7.79774660808417e-06, + "loss": 0.7354, + "step": 6494 + }, + { + "epoch": 1.7273936170212765, + "grad_norm": 3.763976573944092, + "learning_rate": 7.797017633468052e-06, + "loss": 0.9162, + "step": 6495 + }, + { + "epoch": 1.7276595744680852, + "grad_norm": 3.8534562587738037, + "learning_rate": 7.796288572308914e-06, + "loss": 0.8713, + "step": 6496 + }, + { + "epoch": 1.7279255319148936, + "grad_norm": 4.049807071685791, + "learning_rate": 7.795559424629317e-06, + "loss": 0.8404, + "step": 6497 + }, + { + "epoch": 1.728191489361702, + "grad_norm": 3.8596930503845215, + "learning_rate": 7.79483019045182e-06, + "loss": 0.7868, + "step": 6498 + }, + { + "epoch": 1.7284574468085108, + "grad_norm": 4.452897071838379, + "learning_rate": 7.794100869798986e-06, + "loss": 0.9168, + "step": 6499 + }, + { + "epoch": 1.728723404255319, + "grad_norm": 3.7102370262145996, + 
"learning_rate": 7.79337146269338e-06, + "loss": 0.9201, + "step": 6500 + }, + { + "epoch": 1.728723404255319, + "eval_loss": 1.2800854444503784, + "eval_runtime": 13.8491, + "eval_samples_per_second": 28.883, + "eval_steps_per_second": 3.61, + "step": 6500 + }, + { + "epoch": 1.7289893617021277, + "grad_norm": 4.088536262512207, + "learning_rate": 7.792641969157574e-06, + "loss": 0.8304, + "step": 6501 + }, + { + "epoch": 1.7292553191489362, + "grad_norm": 3.8640379905700684, + "learning_rate": 7.791912389214138e-06, + "loss": 0.77, + "step": 6502 + }, + { + "epoch": 1.7295212765957446, + "grad_norm": 3.927625894546509, + "learning_rate": 7.791182722885644e-06, + "loss": 0.7303, + "step": 6503 + }, + { + "epoch": 1.7297872340425533, + "grad_norm": 3.960904598236084, + "learning_rate": 7.790452970194673e-06, + "loss": 0.8346, + "step": 6504 + }, + { + "epoch": 1.7300531914893615, + "grad_norm": 3.953512191772461, + "learning_rate": 7.7897231311638e-06, + "loss": 0.6958, + "step": 6505 + }, + { + "epoch": 1.7303191489361702, + "grad_norm": 3.7672922611236572, + "learning_rate": 7.788993205815606e-06, + "loss": 0.7887, + "step": 6506 + }, + { + "epoch": 1.7305851063829787, + "grad_norm": 4.269046783447266, + "learning_rate": 7.788263194172684e-06, + "loss": 0.9836, + "step": 6507 + }, + { + "epoch": 1.7308510638297872, + "grad_norm": 3.96058988571167, + "learning_rate": 7.787533096257613e-06, + "loss": 0.9103, + "step": 6508 + }, + { + "epoch": 1.7311170212765958, + "grad_norm": 3.9208950996398926, + "learning_rate": 7.786802912092986e-06, + "loss": 0.819, + "step": 6509 + }, + { + "epoch": 1.7313829787234043, + "grad_norm": 3.600135326385498, + "learning_rate": 7.786072641701397e-06, + "loss": 0.8122, + "step": 6510 + }, + { + "epoch": 1.7316489361702128, + "grad_norm": 3.9716193675994873, + "learning_rate": 7.78534228510544e-06, + "loss": 0.7281, + "step": 6511 + }, + { + "epoch": 1.7319148936170212, + "grad_norm": 4.222037315368652, + "learning_rate": 7.784611842327711e-06, + "loss": 0.8926, + "step": 6512 + }, + { + "epoch": 1.7321808510638297, + "grad_norm": 3.3642852306365967, + "learning_rate": 7.783881313390816e-06, + "loss": 0.7014, + "step": 6513 + }, + { + "epoch": 1.7324468085106384, + "grad_norm": 4.051825046539307, + "learning_rate": 7.783150698317354e-06, + "loss": 0.7602, + "step": 6514 + }, + { + "epoch": 1.7327127659574468, + "grad_norm": 4.036343574523926, + "learning_rate": 7.782419997129934e-06, + "loss": 0.8381, + "step": 6515 + }, + { + "epoch": 1.7329787234042553, + "grad_norm": 3.722576856613159, + "learning_rate": 7.781689209851163e-06, + "loss": 0.8737, + "step": 6516 + }, + { + "epoch": 1.733244680851064, + "grad_norm": 4.037721157073975, + "learning_rate": 7.780958336503653e-06, + "loss": 0.8382, + "step": 6517 + }, + { + "epoch": 1.7335106382978722, + "grad_norm": 4.075493812561035, + "learning_rate": 7.780227377110016e-06, + "loss": 0.8215, + "step": 6518 + }, + { + "epoch": 1.733776595744681, + "grad_norm": 3.9683899879455566, + "learning_rate": 7.779496331692872e-06, + "loss": 0.8797, + "step": 6519 + }, + { + "epoch": 1.7340425531914894, + "grad_norm": 3.871469259262085, + "learning_rate": 7.77876520027484e-06, + "loss": 0.7388, + "step": 6520 + }, + { + "epoch": 1.7343085106382978, + "grad_norm": 3.950624465942383, + "learning_rate": 7.778033982878539e-06, + "loss": 0.7502, + "step": 6521 + }, + { + "epoch": 1.7345744680851065, + "grad_norm": 4.015387058258057, + "learning_rate": 7.777302679526596e-06, + "loss": 0.9874, + "step": 6522 + }, + { + "epoch": 
1.7348404255319148, + "grad_norm": 4.03596830368042, + "learning_rate": 7.776571290241642e-06, + "loss": 0.7633, + "step": 6523 + }, + { + "epoch": 1.7351063829787234, + "grad_norm": 4.029125213623047, + "learning_rate": 7.775839815046299e-06, + "loss": 0.7994, + "step": 6524 + }, + { + "epoch": 1.735372340425532, + "grad_norm": 4.058604717254639, + "learning_rate": 7.775108253963207e-06, + "loss": 0.7391, + "step": 6525 + }, + { + "epoch": 1.7356382978723404, + "grad_norm": 3.862391948699951, + "learning_rate": 7.774376607014995e-06, + "loss": 0.9032, + "step": 6526 + }, + { + "epoch": 1.735904255319149, + "grad_norm": 3.903395414352417, + "learning_rate": 7.773644874224306e-06, + "loss": 0.8429, + "step": 6527 + }, + { + "epoch": 1.7361702127659573, + "grad_norm": 3.8711469173431396, + "learning_rate": 7.77291305561378e-06, + "loss": 0.807, + "step": 6528 + }, + { + "epoch": 1.736436170212766, + "grad_norm": 3.977463483810425, + "learning_rate": 7.77218115120606e-06, + "loss": 0.7929, + "step": 6529 + }, + { + "epoch": 1.7367021276595744, + "grad_norm": 3.7397544384002686, + "learning_rate": 7.77144916102379e-06, + "loss": 0.8478, + "step": 6530 + }, + { + "epoch": 1.736968085106383, + "grad_norm": 3.6703922748565674, + "learning_rate": 7.770717085089618e-06, + "loss": 0.6432, + "step": 6531 + }, + { + "epoch": 1.7372340425531916, + "grad_norm": 4.170365333557129, + "learning_rate": 7.7699849234262e-06, + "loss": 0.7565, + "step": 6532 + }, + { + "epoch": 1.7375, + "grad_norm": 3.6264007091522217, + "learning_rate": 7.769252676056186e-06, + "loss": 0.7635, + "step": 6533 + }, + { + "epoch": 1.7377659574468085, + "grad_norm": 3.9042675495147705, + "learning_rate": 7.768520343002235e-06, + "loss": 0.9037, + "step": 6534 + }, + { + "epoch": 1.738031914893617, + "grad_norm": 4.19412899017334, + "learning_rate": 7.767787924287005e-06, + "loss": 0.8516, + "step": 6535 + }, + { + "epoch": 1.7382978723404254, + "grad_norm": 3.869814157485962, + "learning_rate": 7.767055419933157e-06, + "loss": 0.7815, + "step": 6536 + }, + { + "epoch": 1.7385638297872341, + "grad_norm": 3.712411642074585, + "learning_rate": 7.766322829963357e-06, + "loss": 0.6676, + "step": 6537 + }, + { + "epoch": 1.7388297872340426, + "grad_norm": 4.046865463256836, + "learning_rate": 7.76559015440027e-06, + "loss": 0.8799, + "step": 6538 + }, + { + "epoch": 1.739095744680851, + "grad_norm": 3.908235549926758, + "learning_rate": 7.76485739326657e-06, + "loss": 0.7999, + "step": 6539 + }, + { + "epoch": 1.7393617021276597, + "grad_norm": 4.396571159362793, + "learning_rate": 7.764124546584926e-06, + "loss": 0.8813, + "step": 6540 + }, + { + "epoch": 1.739627659574468, + "grad_norm": 3.7259883880615234, + "learning_rate": 7.763391614378014e-06, + "loss": 0.8519, + "step": 6541 + }, + { + "epoch": 1.7398936170212767, + "grad_norm": 3.7457261085510254, + "learning_rate": 7.762658596668514e-06, + "loss": 0.7913, + "step": 6542 + }, + { + "epoch": 1.7401595744680851, + "grad_norm": 3.66605544090271, + "learning_rate": 7.7619254934791e-06, + "loss": 0.8122, + "step": 6543 + }, + { + "epoch": 1.7404255319148936, + "grad_norm": 3.8894519805908203, + "learning_rate": 7.761192304832463e-06, + "loss": 0.6829, + "step": 6544 + }, + { + "epoch": 1.7406914893617023, + "grad_norm": 3.4376041889190674, + "learning_rate": 7.760459030751285e-06, + "loss": 0.6903, + "step": 6545 + }, + { + "epoch": 1.7409574468085105, + "grad_norm": 4.00453519821167, + "learning_rate": 7.759725671258254e-06, + "loss": 0.8714, + "step": 6546 + }, + { + "epoch": 
1.7412234042553192, + "grad_norm": 3.9484405517578125, + "learning_rate": 7.758992226376062e-06, + "loss": 0.9567, + "step": 6547 + }, + { + "epoch": 1.7414893617021276, + "grad_norm": 3.885755777359009, + "learning_rate": 7.7582586961274e-06, + "loss": 0.7928, + "step": 6548 + }, + { + "epoch": 1.741755319148936, + "grad_norm": 3.8768088817596436, + "learning_rate": 7.757525080534968e-06, + "loss": 0.7554, + "step": 6549 + }, + { + "epoch": 1.7420212765957448, + "grad_norm": 3.7053639888763428, + "learning_rate": 7.756791379621461e-06, + "loss": 0.8122, + "step": 6550 + }, + { + "epoch": 1.742287234042553, + "grad_norm": 3.9800238609313965, + "learning_rate": 7.756057593409588e-06, + "loss": 0.8505, + "step": 6551 + }, + { + "epoch": 1.7425531914893617, + "grad_norm": 3.586451768875122, + "learning_rate": 7.755323721922045e-06, + "loss": 0.7435, + "step": 6552 + }, + { + "epoch": 1.7428191489361702, + "grad_norm": 4.315957069396973, + "learning_rate": 7.754589765181543e-06, + "loss": 0.8308, + "step": 6553 + }, + { + "epoch": 1.7430851063829786, + "grad_norm": 3.764915704727173, + "learning_rate": 7.75385572321079e-06, + "loss": 0.7939, + "step": 6554 + }, + { + "epoch": 1.7433510638297873, + "grad_norm": 3.9177279472351074, + "learning_rate": 7.7531215960325e-06, + "loss": 0.8557, + "step": 6555 + }, + { + "epoch": 1.7436170212765958, + "grad_norm": 3.802114248275757, + "learning_rate": 7.752387383669384e-06, + "loss": 0.7933, + "step": 6556 + }, + { + "epoch": 1.7438829787234043, + "grad_norm": 4.129657745361328, + "learning_rate": 7.751653086144164e-06, + "loss": 0.8744, + "step": 6557 + }, + { + "epoch": 1.7441489361702127, + "grad_norm": 4.201019763946533, + "learning_rate": 7.750918703479558e-06, + "loss": 0.7875, + "step": 6558 + }, + { + "epoch": 1.7444148936170212, + "grad_norm": 4.305670261383057, + "learning_rate": 7.750184235698285e-06, + "loss": 0.8137, + "step": 6559 + }, + { + "epoch": 1.7446808510638299, + "grad_norm": 3.571631908416748, + "learning_rate": 7.749449682823077e-06, + "loss": 0.7308, + "step": 6560 + }, + { + "epoch": 1.7449468085106383, + "grad_norm": 4.124020576477051, + "learning_rate": 7.74871504487666e-06, + "loss": 0.9546, + "step": 6561 + }, + { + "epoch": 1.7452127659574468, + "grad_norm": 4.1722588539123535, + "learning_rate": 7.74798032188176e-06, + "loss": 0.787, + "step": 6562 + }, + { + "epoch": 1.7454787234042555, + "grad_norm": 4.017617225646973, + "learning_rate": 7.747245513861115e-06, + "loss": 0.8655, + "step": 6563 + }, + { + "epoch": 1.7457446808510637, + "grad_norm": 4.122082233428955, + "learning_rate": 7.74651062083746e-06, + "loss": 0.9471, + "step": 6564 + }, + { + "epoch": 1.7460106382978724, + "grad_norm": 4.254493713378906, + "learning_rate": 7.745775642833532e-06, + "loss": 0.8313, + "step": 6565 + }, + { + "epoch": 1.7462765957446809, + "grad_norm": 3.856379985809326, + "learning_rate": 7.745040579872073e-06, + "loss": 0.9207, + "step": 6566 + }, + { + "epoch": 1.7465425531914893, + "grad_norm": 4.020528316497803, + "learning_rate": 7.744305431975827e-06, + "loss": 0.7029, + "step": 6567 + }, + { + "epoch": 1.746808510638298, + "grad_norm": 4.091069221496582, + "learning_rate": 7.743570199167539e-06, + "loss": 0.8682, + "step": 6568 + }, + { + "epoch": 1.7470744680851062, + "grad_norm": 3.8805131912231445, + "learning_rate": 7.742834881469959e-06, + "loss": 0.8366, + "step": 6569 + }, + { + "epoch": 1.747340425531915, + "grad_norm": 3.5972797870635986, + "learning_rate": 7.742099478905837e-06, + "loss": 0.784, + "step": 6570 + }, 
+ { + "epoch": 1.7476063829787234, + "grad_norm": 3.655684232711792, + "learning_rate": 7.741363991497932e-06, + "loss": 0.7849, + "step": 6571 + }, + { + "epoch": 1.7478723404255319, + "grad_norm": 3.854562520980835, + "learning_rate": 7.740628419268996e-06, + "loss": 0.7961, + "step": 6572 + }, + { + "epoch": 1.7481382978723405, + "grad_norm": 3.5972256660461426, + "learning_rate": 7.73989276224179e-06, + "loss": 0.8045, + "step": 6573 + }, + { + "epoch": 1.7484042553191488, + "grad_norm": 4.087411880493164, + "learning_rate": 7.739157020439077e-06, + "loss": 0.8889, + "step": 6574 + }, + { + "epoch": 1.7486702127659575, + "grad_norm": 4.145167350769043, + "learning_rate": 7.738421193883618e-06, + "loss": 0.8542, + "step": 6575 + }, + { + "epoch": 1.748936170212766, + "grad_norm": 4.064332008361816, + "learning_rate": 7.737685282598187e-06, + "loss": 0.8523, + "step": 6576 + }, + { + "epoch": 1.7492021276595744, + "grad_norm": 4.075108051300049, + "learning_rate": 7.736949286605549e-06, + "loss": 0.8839, + "step": 6577 + }, + { + "epoch": 1.749468085106383, + "grad_norm": 4.157843112945557, + "learning_rate": 7.736213205928476e-06, + "loss": 0.9253, + "step": 6578 + }, + { + "epoch": 1.7497340425531915, + "grad_norm": 3.978928327560425, + "learning_rate": 7.735477040589745e-06, + "loss": 0.8454, + "step": 6579 + }, + { + "epoch": 1.75, + "grad_norm": 3.7294394969940186, + "learning_rate": 7.734740790612137e-06, + "loss": 0.7877, + "step": 6580 + }, + { + "epoch": 1.7502659574468085, + "grad_norm": 4.367574214935303, + "learning_rate": 7.734004456018424e-06, + "loss": 0.7477, + "step": 6581 + }, + { + "epoch": 1.750531914893617, + "grad_norm": 3.952146291732788, + "learning_rate": 7.733268036831398e-06, + "loss": 0.7725, + "step": 6582 + }, + { + "epoch": 1.7507978723404256, + "grad_norm": 4.400146961212158, + "learning_rate": 7.73253153307384e-06, + "loss": 0.8059, + "step": 6583 + }, + { + "epoch": 1.751063829787234, + "grad_norm": 4.003587245941162, + "learning_rate": 7.73179494476854e-06, + "loss": 0.8549, + "step": 6584 + }, + { + "epoch": 1.7513297872340425, + "grad_norm": 3.898470640182495, + "learning_rate": 7.731058271938286e-06, + "loss": 0.7925, + "step": 6585 + }, + { + "epoch": 1.7515957446808512, + "grad_norm": 3.6899170875549316, + "learning_rate": 7.730321514605877e-06, + "loss": 0.7535, + "step": 6586 + }, + { + "epoch": 1.7518617021276595, + "grad_norm": 3.996615171432495, + "learning_rate": 7.729584672794102e-06, + "loss": 0.8278, + "step": 6587 + }, + { + "epoch": 1.7521276595744681, + "grad_norm": 4.020608901977539, + "learning_rate": 7.728847746525764e-06, + "loss": 0.7233, + "step": 6588 + }, + { + "epoch": 1.7523936170212766, + "grad_norm": 4.504430294036865, + "learning_rate": 7.728110735823666e-06, + "loss": 0.8254, + "step": 6589 + }, + { + "epoch": 1.752659574468085, + "grad_norm": 3.7418766021728516, + "learning_rate": 7.72737364071061e-06, + "loss": 0.8151, + "step": 6590 + }, + { + "epoch": 1.7529255319148938, + "grad_norm": 4.577789783477783, + "learning_rate": 7.7266364612094e-06, + "loss": 0.9276, + "step": 6591 + }, + { + "epoch": 1.753191489361702, + "grad_norm": 4.067131042480469, + "learning_rate": 7.72589919734285e-06, + "loss": 0.8282, + "step": 6592 + }, + { + "epoch": 1.7534574468085107, + "grad_norm": 4.11132287979126, + "learning_rate": 7.725161849133769e-06, + "loss": 0.8663, + "step": 6593 + }, + { + "epoch": 1.7537234042553191, + "grad_norm": 3.8996002674102783, + "learning_rate": 7.724424416604972e-06, + "loss": 0.9631, + "step": 6594 + }, + 
{ + "epoch": 1.7539893617021276, + "grad_norm": 3.911623954772949, + "learning_rate": 7.723686899779277e-06, + "loss": 0.8082, + "step": 6595 + }, + { + "epoch": 1.7542553191489363, + "grad_norm": 4.957215785980225, + "learning_rate": 7.7229492986795e-06, + "loss": 0.8758, + "step": 6596 + }, + { + "epoch": 1.7545212765957445, + "grad_norm": 4.114643573760986, + "learning_rate": 7.722211613328467e-06, + "loss": 0.7665, + "step": 6597 + }, + { + "epoch": 1.7547872340425532, + "grad_norm": 3.4866108894348145, + "learning_rate": 7.721473843749e-06, + "loss": 0.7636, + "step": 6598 + }, + { + "epoch": 1.7550531914893617, + "grad_norm": 3.798917055130005, + "learning_rate": 7.72073598996393e-06, + "loss": 0.7645, + "step": 6599 + }, + { + "epoch": 1.7553191489361701, + "grad_norm": 4.327617168426514, + "learning_rate": 7.719998051996087e-06, + "loss": 0.8174, + "step": 6600 + }, + { + "epoch": 1.7555851063829788, + "grad_norm": 3.7455971240997314, + "learning_rate": 7.719260029868299e-06, + "loss": 0.7484, + "step": 6601 + }, + { + "epoch": 1.7558510638297873, + "grad_norm": 3.4463014602661133, + "learning_rate": 7.718521923603404e-06, + "loss": 0.692, + "step": 6602 + }, + { + "epoch": 1.7561170212765957, + "grad_norm": 3.920140027999878, + "learning_rate": 7.717783733224243e-06, + "loss": 0.9122, + "step": 6603 + }, + { + "epoch": 1.7563829787234042, + "grad_norm": 4.227574825286865, + "learning_rate": 7.717045458753651e-06, + "loss": 0.7812, + "step": 6604 + }, + { + "epoch": 1.7566489361702127, + "grad_norm": 4.23086404800415, + "learning_rate": 7.716307100214472e-06, + "loss": 0.829, + "step": 6605 + }, + { + "epoch": 1.7569148936170214, + "grad_norm": 3.5714340209960938, + "learning_rate": 7.715568657629557e-06, + "loss": 0.8676, + "step": 6606 + }, + { + "epoch": 1.7571808510638298, + "grad_norm": 4.220118045806885, + "learning_rate": 7.71483013102175e-06, + "loss": 0.7351, + "step": 6607 + }, + { + "epoch": 1.7574468085106383, + "grad_norm": 3.8862133026123047, + "learning_rate": 7.7140915204139e-06, + "loss": 0.7836, + "step": 6608 + }, + { + "epoch": 1.757712765957447, + "grad_norm": 3.9056966304779053, + "learning_rate": 7.713352825828865e-06, + "loss": 0.7439, + "step": 6609 + }, + { + "epoch": 1.7579787234042552, + "grad_norm": 4.519630432128906, + "learning_rate": 7.712614047289498e-06, + "loss": 0.9618, + "step": 6610 + }, + { + "epoch": 1.758244680851064, + "grad_norm": 3.756225109100342, + "learning_rate": 7.711875184818659e-06, + "loss": 0.7612, + "step": 6611 + }, + { + "epoch": 1.7585106382978724, + "grad_norm": 4.109426498413086, + "learning_rate": 7.71113623843921e-06, + "loss": 0.8828, + "step": 6612 + }, + { + "epoch": 1.7587765957446808, + "grad_norm": 4.274012565612793, + "learning_rate": 7.710397208174012e-06, + "loss": 0.8212, + "step": 6613 + }, + { + "epoch": 1.7590425531914895, + "grad_norm": 4.489198207855225, + "learning_rate": 7.709658094045933e-06, + "loss": 0.9358, + "step": 6614 + }, + { + "epoch": 1.7593085106382977, + "grad_norm": 3.796844005584717, + "learning_rate": 7.708918896077843e-06, + "loss": 0.8092, + "step": 6615 + }, + { + "epoch": 1.7595744680851064, + "grad_norm": 4.139426231384277, + "learning_rate": 7.708179614292614e-06, + "loss": 0.7859, + "step": 6616 + }, + { + "epoch": 1.7598404255319149, + "grad_norm": 4.109641075134277, + "learning_rate": 7.707440248713118e-06, + "loss": 0.7763, + "step": 6617 + }, + { + "epoch": 1.7601063829787233, + "grad_norm": 4.1055521965026855, + "learning_rate": 7.706700799362235e-06, + "loss": 0.7225, + 
"step": 6618 + }, + { + "epoch": 1.760372340425532, + "grad_norm": 4.071004390716553, + "learning_rate": 7.70596126626284e-06, + "loss": 0.7714, + "step": 6619 + }, + { + "epoch": 1.7606382978723403, + "grad_norm": 4.117389678955078, + "learning_rate": 7.705221649437819e-06, + "loss": 0.8, + "step": 6620 + }, + { + "epoch": 1.760904255319149, + "grad_norm": 3.617248058319092, + "learning_rate": 7.704481948910057e-06, + "loss": 0.8286, + "step": 6621 + }, + { + "epoch": 1.7611702127659574, + "grad_norm": 3.6249337196350098, + "learning_rate": 7.703742164702436e-06, + "loss": 0.732, + "step": 6622 + }, + { + "epoch": 1.7614361702127659, + "grad_norm": 3.584951400756836, + "learning_rate": 7.703002296837849e-06, + "loss": 0.859, + "step": 6623 + }, + { + "epoch": 1.7617021276595746, + "grad_norm": 3.908857822418213, + "learning_rate": 7.70226234533919e-06, + "loss": 0.8112, + "step": 6624 + }, + { + "epoch": 1.761968085106383, + "grad_norm": 4.350627422332764, + "learning_rate": 7.701522310229353e-06, + "loss": 0.9676, + "step": 6625 + }, + { + "epoch": 1.7622340425531915, + "grad_norm": 3.7733817100524902, + "learning_rate": 7.700782191531236e-06, + "loss": 0.7312, + "step": 6626 + }, + { + "epoch": 1.7625, + "grad_norm": 3.822552442550659, + "learning_rate": 7.700041989267738e-06, + "loss": 0.6901, + "step": 6627 + }, + { + "epoch": 1.7627659574468084, + "grad_norm": 3.9083547592163086, + "learning_rate": 7.69930170346176e-06, + "loss": 0.7498, + "step": 6628 + }, + { + "epoch": 1.763031914893617, + "grad_norm": 4.126950263977051, + "learning_rate": 7.69856133413621e-06, + "loss": 0.7975, + "step": 6629 + }, + { + "epoch": 1.7632978723404256, + "grad_norm": 4.27503776550293, + "learning_rate": 7.697820881313994e-06, + "loss": 0.7927, + "step": 6630 + }, + { + "epoch": 1.763563829787234, + "grad_norm": 4.2161407470703125, + "learning_rate": 7.697080345018024e-06, + "loss": 0.8779, + "step": 6631 + }, + { + "epoch": 1.7638297872340427, + "grad_norm": 4.142273426055908, + "learning_rate": 7.696339725271215e-06, + "loss": 0.8069, + "step": 6632 + }, + { + "epoch": 1.764095744680851, + "grad_norm": 4.17659330368042, + "learning_rate": 7.695599022096478e-06, + "loss": 0.7439, + "step": 6633 + }, + { + "epoch": 1.7643617021276596, + "grad_norm": 4.072018623352051, + "learning_rate": 7.694858235516735e-06, + "loss": 0.8364, + "step": 6634 + }, + { + "epoch": 1.764627659574468, + "grad_norm": 3.6811084747314453, + "learning_rate": 7.694117365554905e-06, + "loss": 0.8986, + "step": 6635 + }, + { + "epoch": 1.7648936170212766, + "grad_norm": 3.924104928970337, + "learning_rate": 7.693376412233913e-06, + "loss": 0.7906, + "step": 6636 + }, + { + "epoch": 1.7651595744680852, + "grad_norm": 4.180627822875977, + "learning_rate": 7.69263537557668e-06, + "loss": 0.814, + "step": 6637 + }, + { + "epoch": 1.7654255319148935, + "grad_norm": 3.74808931350708, + "learning_rate": 7.691894255606143e-06, + "loss": 0.8623, + "step": 6638 + }, + { + "epoch": 1.7656914893617022, + "grad_norm": 3.8845086097717285, + "learning_rate": 7.691153052345227e-06, + "loss": 0.8279, + "step": 6639 + }, + { + "epoch": 1.7659574468085106, + "grad_norm": 3.6786465644836426, + "learning_rate": 7.690411765816864e-06, + "loss": 0.8579, + "step": 6640 + }, + { + "epoch": 1.766223404255319, + "grad_norm": 4.260414123535156, + "learning_rate": 7.689670396043997e-06, + "loss": 0.8473, + "step": 6641 + }, + { + "epoch": 1.7664893617021278, + "grad_norm": 3.757199287414551, + "learning_rate": 7.688928943049558e-06, + "loss": 0.8065, + 
"step": 6642 + }, + { + "epoch": 1.766755319148936, + "grad_norm": 4.010439872741699, + "learning_rate": 7.688187406856494e-06, + "loss": 0.8412, + "step": 6643 + }, + { + "epoch": 1.7670212765957447, + "grad_norm": 4.193131923675537, + "learning_rate": 7.687445787487746e-06, + "loss": 0.7638, + "step": 6644 + }, + { + "epoch": 1.7672872340425532, + "grad_norm": 3.7920022010803223, + "learning_rate": 7.686704084966263e-06, + "loss": 0.7628, + "step": 6645 + }, + { + "epoch": 1.7675531914893616, + "grad_norm": 3.6464099884033203, + "learning_rate": 7.68596229931499e-06, + "loss": 0.7547, + "step": 6646 + }, + { + "epoch": 1.7678191489361703, + "grad_norm": 3.7222912311553955, + "learning_rate": 7.685220430556883e-06, + "loss": 0.6741, + "step": 6647 + }, + { + "epoch": 1.7680851063829788, + "grad_norm": 3.48502254486084, + "learning_rate": 7.684478478714892e-06, + "loss": 0.6893, + "step": 6648 + }, + { + "epoch": 1.7683510638297872, + "grad_norm": 4.072755813598633, + "learning_rate": 7.683736443811978e-06, + "loss": 0.8487, + "step": 6649 + }, + { + "epoch": 1.7686170212765957, + "grad_norm": 3.5753612518310547, + "learning_rate": 7.682994325871098e-06, + "loss": 0.8314, + "step": 6650 + }, + { + "epoch": 1.7688829787234042, + "grad_norm": 4.951267242431641, + "learning_rate": 7.682252124915216e-06, + "loss": 0.9956, + "step": 6651 + }, + { + "epoch": 1.7691489361702128, + "grad_norm": 4.200650691986084, + "learning_rate": 7.681509840967294e-06, + "loss": 0.7119, + "step": 6652 + }, + { + "epoch": 1.7694148936170213, + "grad_norm": 3.4650633335113525, + "learning_rate": 7.6807674740503e-06, + "loss": 0.843, + "step": 6653 + }, + { + "epoch": 1.7696808510638298, + "grad_norm": 4.049907207489014, + "learning_rate": 7.680025024187206e-06, + "loss": 0.7776, + "step": 6654 + }, + { + "epoch": 1.7699468085106385, + "grad_norm": 3.934799909591675, + "learning_rate": 7.67928249140098e-06, + "loss": 0.7957, + "step": 6655 + }, + { + "epoch": 1.7702127659574467, + "grad_norm": 4.14153528213501, + "learning_rate": 7.678539875714604e-06, + "loss": 0.7445, + "step": 6656 + }, + { + "epoch": 1.7704787234042554, + "grad_norm": 3.816898822784424, + "learning_rate": 7.677797177151047e-06, + "loss": 0.8869, + "step": 6657 + }, + { + "epoch": 1.7707446808510638, + "grad_norm": 4.405877113342285, + "learning_rate": 7.677054395733292e-06, + "loss": 0.9004, + "step": 6658 + }, + { + "epoch": 1.7710106382978723, + "grad_norm": 4.069585800170898, + "learning_rate": 7.676311531484324e-06, + "loss": 0.7907, + "step": 6659 + }, + { + "epoch": 1.771276595744681, + "grad_norm": 3.9655072689056396, + "learning_rate": 7.675568584427125e-06, + "loss": 0.8069, + "step": 6660 + }, + { + "epoch": 1.7715425531914892, + "grad_norm": 3.8515357971191406, + "learning_rate": 7.674825554584686e-06, + "loss": 0.8013, + "step": 6661 + }, + { + "epoch": 1.771808510638298, + "grad_norm": 4.2742438316345215, + "learning_rate": 7.674082441979993e-06, + "loss": 0.9655, + "step": 6662 + }, + { + "epoch": 1.7720744680851064, + "grad_norm": 4.425269603729248, + "learning_rate": 7.67333924663604e-06, + "loss": 0.872, + "step": 6663 + }, + { + "epoch": 1.7723404255319148, + "grad_norm": 4.043865203857422, + "learning_rate": 7.672595968575827e-06, + "loss": 0.8425, + "step": 6664 + }, + { + "epoch": 1.7726063829787235, + "grad_norm": 3.77255916595459, + "learning_rate": 7.671852607822346e-06, + "loss": 0.6711, + "step": 6665 + }, + { + "epoch": 1.7728723404255318, + "grad_norm": 3.8917951583862305, + "learning_rate": 7.671109164398598e-06, + 
"loss": 0.7429, + "step": 6666 + }, + { + "epoch": 1.7731382978723405, + "grad_norm": 4.034469127655029, + "learning_rate": 7.67036563832759e-06, + "loss": 0.884, + "step": 6667 + }, + { + "epoch": 1.773404255319149, + "grad_norm": 4.177572727203369, + "learning_rate": 7.669622029632323e-06, + "loss": 0.7823, + "step": 6668 + }, + { + "epoch": 1.7736702127659574, + "grad_norm": 3.816012382507324, + "learning_rate": 7.668878338335808e-06, + "loss": 0.8012, + "step": 6669 + }, + { + "epoch": 1.773936170212766, + "grad_norm": 3.6478235721588135, + "learning_rate": 7.668134564461057e-06, + "loss": 0.8071, + "step": 6670 + }, + { + "epoch": 1.7742021276595743, + "grad_norm": 4.1651177406311035, + "learning_rate": 7.66739070803108e-06, + "loss": 0.882, + "step": 6671 + }, + { + "epoch": 1.774468085106383, + "grad_norm": 4.032572269439697, + "learning_rate": 7.666646769068894e-06, + "loss": 0.7804, + "step": 6672 + }, + { + "epoch": 1.7747340425531914, + "grad_norm": 4.481500148773193, + "learning_rate": 7.665902747597516e-06, + "loss": 0.8824, + "step": 6673 + }, + { + "epoch": 1.775, + "grad_norm": 3.6887848377227783, + "learning_rate": 7.66515864363997e-06, + "loss": 0.8179, + "step": 6674 + }, + { + "epoch": 1.7752659574468086, + "grad_norm": 3.5154476165771484, + "learning_rate": 7.664414457219277e-06, + "loss": 0.8015, + "step": 6675 + }, + { + "epoch": 1.775531914893617, + "grad_norm": 3.9713804721832275, + "learning_rate": 7.663670188358464e-06, + "loss": 0.8426, + "step": 6676 + }, + { + "epoch": 1.7757978723404255, + "grad_norm": 4.082159996032715, + "learning_rate": 7.66292583708056e-06, + "loss": 0.81, + "step": 6677 + }, + { + "epoch": 1.7760638297872342, + "grad_norm": 3.8582613468170166, + "learning_rate": 7.662181403408593e-06, + "loss": 0.7965, + "step": 6678 + }, + { + "epoch": 1.7763297872340424, + "grad_norm": 4.068000793457031, + "learning_rate": 7.661436887365603e-06, + "loss": 0.8332, + "step": 6679 + }, + { + "epoch": 1.7765957446808511, + "grad_norm": 4.067226409912109, + "learning_rate": 7.660692288974618e-06, + "loss": 0.8399, + "step": 6680 + }, + { + "epoch": 1.7768617021276596, + "grad_norm": 3.885331392288208, + "learning_rate": 7.659947608258684e-06, + "loss": 0.8701, + "step": 6681 + }, + { + "epoch": 1.777127659574468, + "grad_norm": 3.792872905731201, + "learning_rate": 7.659202845240839e-06, + "loss": 0.8379, + "step": 6682 + }, + { + "epoch": 1.7773936170212767, + "grad_norm": 3.553959369659424, + "learning_rate": 7.658457999944124e-06, + "loss": 0.6874, + "step": 6683 + }, + { + "epoch": 1.777659574468085, + "grad_norm": 4.169983386993408, + "learning_rate": 7.657713072391591e-06, + "loss": 0.7569, + "step": 6684 + }, + { + "epoch": 1.7779255319148937, + "grad_norm": 4.05847692489624, + "learning_rate": 7.656968062606288e-06, + "loss": 0.8497, + "step": 6685 + }, + { + "epoch": 1.7781914893617021, + "grad_norm": 4.117887496948242, + "learning_rate": 7.656222970611263e-06, + "loss": 0.708, + "step": 6686 + }, + { + "epoch": 1.7784574468085106, + "grad_norm": 3.683126211166382, + "learning_rate": 7.655477796429571e-06, + "loss": 0.7568, + "step": 6687 + }, + { + "epoch": 1.7787234042553193, + "grad_norm": 3.6990060806274414, + "learning_rate": 7.654732540084273e-06, + "loss": 0.7721, + "step": 6688 + }, + { + "epoch": 1.7789893617021275, + "grad_norm": 3.917276620864868, + "learning_rate": 7.653987201598422e-06, + "loss": 0.8214, + "step": 6689 + }, + { + "epoch": 1.7792553191489362, + "grad_norm": 4.091401100158691, + "learning_rate": 7.653241780995083e-06, + 
"loss": 0.7312, + "step": 6690 + }, + { + "epoch": 1.7795212765957447, + "grad_norm": 4.167940139770508, + "learning_rate": 7.652496278297319e-06, + "loss": 0.9115, + "step": 6691 + }, + { + "epoch": 1.7797872340425531, + "grad_norm": 3.9726510047912598, + "learning_rate": 7.651750693528197e-06, + "loss": 0.7857, + "step": 6692 + }, + { + "epoch": 1.7800531914893618, + "grad_norm": 3.7973427772521973, + "learning_rate": 7.651005026710786e-06, + "loss": 0.8594, + "step": 6693 + }, + { + "epoch": 1.78031914893617, + "grad_norm": 3.932386875152588, + "learning_rate": 7.65025927786816e-06, + "loss": 0.7873, + "step": 6694 + }, + { + "epoch": 1.7805851063829787, + "grad_norm": 3.6921486854553223, + "learning_rate": 7.64951344702339e-06, + "loss": 0.7569, + "step": 6695 + }, + { + "epoch": 1.7808510638297872, + "grad_norm": 4.060511589050293, + "learning_rate": 7.648767534199556e-06, + "loss": 0.7533, + "step": 6696 + }, + { + "epoch": 1.7811170212765957, + "grad_norm": 4.142321586608887, + "learning_rate": 7.648021539419737e-06, + "loss": 0.7836, + "step": 6697 + }, + { + "epoch": 1.7813829787234043, + "grad_norm": 4.071194648742676, + "learning_rate": 7.647275462707011e-06, + "loss": 0.7489, + "step": 6698 + }, + { + "epoch": 1.7816489361702128, + "grad_norm": 4.006459712982178, + "learning_rate": 7.646529304084469e-06, + "loss": 0.812, + "step": 6699 + }, + { + "epoch": 1.7819148936170213, + "grad_norm": 3.6437671184539795, + "learning_rate": 7.64578306357519e-06, + "loss": 0.7105, + "step": 6700 + }, + { + "epoch": 1.78218085106383, + "grad_norm": 4.094074249267578, + "learning_rate": 7.645036741202271e-06, + "loss": 0.9633, + "step": 6701 + }, + { + "epoch": 1.7824468085106382, + "grad_norm": 4.029351711273193, + "learning_rate": 7.6442903369888e-06, + "loss": 0.8999, + "step": 6702 + }, + { + "epoch": 1.7827127659574469, + "grad_norm": 3.8068792819976807, + "learning_rate": 7.643543850957872e-06, + "loss": 0.7305, + "step": 6703 + }, + { + "epoch": 1.7829787234042553, + "grad_norm": 4.074723243713379, + "learning_rate": 7.642797283132586e-06, + "loss": 0.8502, + "step": 6704 + }, + { + "epoch": 1.7832446808510638, + "grad_norm": 3.3582799434661865, + "learning_rate": 7.642050633536042e-06, + "loss": 0.7219, + "step": 6705 + }, + { + "epoch": 1.7835106382978725, + "grad_norm": 3.6337673664093018, + "learning_rate": 7.641303902191339e-06, + "loss": 0.7843, + "step": 6706 + }, + { + "epoch": 1.7837765957446807, + "grad_norm": 4.376511573791504, + "learning_rate": 7.640557089121583e-06, + "loss": 0.9737, + "step": 6707 + }, + { + "epoch": 1.7840425531914894, + "grad_norm": 3.6106109619140625, + "learning_rate": 7.639810194349884e-06, + "loss": 0.7549, + "step": 6708 + }, + { + "epoch": 1.7843085106382979, + "grad_norm": 3.9676499366760254, + "learning_rate": 7.639063217899348e-06, + "loss": 0.8951, + "step": 6709 + }, + { + "epoch": 1.7845744680851063, + "grad_norm": 3.7763378620147705, + "learning_rate": 7.638316159793089e-06, + "loss": 0.8431, + "step": 6710 + }, + { + "epoch": 1.784840425531915, + "grad_norm": 3.744365930557251, + "learning_rate": 7.637569020054221e-06, + "loss": 0.8697, + "step": 6711 + }, + { + "epoch": 1.7851063829787233, + "grad_norm": 3.4194390773773193, + "learning_rate": 7.636821798705864e-06, + "loss": 0.8979, + "step": 6712 + }, + { + "epoch": 1.785372340425532, + "grad_norm": 3.804483413696289, + "learning_rate": 7.636074495771134e-06, + "loss": 0.8484, + "step": 6713 + }, + { + "epoch": 1.7856382978723404, + "grad_norm": 4.089145660400391, + "learning_rate": 
7.635327111273158e-06, + "loss": 0.892, + "step": 6714 + }, + { + "epoch": 1.7859042553191489, + "grad_norm": 4.051761150360107, + "learning_rate": 7.634579645235056e-06, + "loss": 0.8972, + "step": 6715 + }, + { + "epoch": 1.7861702127659576, + "grad_norm": 4.0280961990356445, + "learning_rate": 7.633832097679959e-06, + "loss": 0.8125, + "step": 6716 + }, + { + "epoch": 1.7864361702127658, + "grad_norm": 4.206244468688965, + "learning_rate": 7.633084468630996e-06, + "loss": 0.7675, + "step": 6717 + }, + { + "epoch": 1.7867021276595745, + "grad_norm": 3.4746177196502686, + "learning_rate": 7.6323367581113e-06, + "loss": 0.7079, + "step": 6718 + }, + { + "epoch": 1.786968085106383, + "grad_norm": 3.8518667221069336, + "learning_rate": 7.631588966144003e-06, + "loss": 0.965, + "step": 6719 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 3.605275869369507, + "learning_rate": 7.630841092752248e-06, + "loss": 0.7733, + "step": 6720 + }, + { + "epoch": 1.7875, + "grad_norm": 4.255527019500732, + "learning_rate": 7.63009313795917e-06, + "loss": 0.8645, + "step": 6721 + }, + { + "epoch": 1.7877659574468086, + "grad_norm": 3.93906307220459, + "learning_rate": 7.629345101787917e-06, + "loss": 0.8449, + "step": 6722 + }, + { + "epoch": 1.788031914893617, + "grad_norm": 4.351909160614014, + "learning_rate": 7.628596984261629e-06, + "loss": 0.8644, + "step": 6723 + }, + { + "epoch": 1.7882978723404257, + "grad_norm": 3.7165818214416504, + "learning_rate": 7.627848785403456e-06, + "loss": 0.7284, + "step": 6724 + }, + { + "epoch": 1.788563829787234, + "grad_norm": 3.9665300846099854, + "learning_rate": 7.6271005052365465e-06, + "loss": 0.8396, + "step": 6725 + }, + { + "epoch": 1.7888297872340426, + "grad_norm": 3.951260566711426, + "learning_rate": 7.6263521437840544e-06, + "loss": 0.9464, + "step": 6726 + }, + { + "epoch": 1.789095744680851, + "grad_norm": 4.499269008636475, + "learning_rate": 7.625603701069135e-06, + "loss": 0.9031, + "step": 6727 + }, + { + "epoch": 1.7893617021276595, + "grad_norm": 3.931673526763916, + "learning_rate": 7.6248551771149474e-06, + "loss": 0.823, + "step": 6728 + }, + { + "epoch": 1.7896276595744682, + "grad_norm": 4.128811836242676, + "learning_rate": 7.624106571944648e-06, + "loss": 0.7497, + "step": 6729 + }, + { + "epoch": 1.7898936170212765, + "grad_norm": 3.873683452606201, + "learning_rate": 7.623357885581403e-06, + "loss": 0.8247, + "step": 6730 + }, + { + "epoch": 1.7901595744680852, + "grad_norm": 3.7852728366851807, + "learning_rate": 7.6226091180483765e-06, + "loss": 0.8774, + "step": 6731 + }, + { + "epoch": 1.7904255319148936, + "grad_norm": 3.885965585708618, + "learning_rate": 7.621860269368735e-06, + "loss": 0.7561, + "step": 6732 + }, + { + "epoch": 1.790691489361702, + "grad_norm": 4.435214519500732, + "learning_rate": 7.6211113395656515e-06, + "loss": 0.9338, + "step": 6733 + }, + { + "epoch": 1.7909574468085108, + "grad_norm": 4.548224449157715, + "learning_rate": 7.6203623286622955e-06, + "loss": 0.8323, + "step": 6734 + }, + { + "epoch": 1.791223404255319, + "grad_norm": 3.8655712604522705, + "learning_rate": 7.619613236681845e-06, + "loss": 0.8654, + "step": 6735 + }, + { + "epoch": 1.7914893617021277, + "grad_norm": 3.7102363109588623, + "learning_rate": 7.618864063647477e-06, + "loss": 0.8015, + "step": 6736 + }, + { + "epoch": 1.7917553191489362, + "grad_norm": 4.260025978088379, + "learning_rate": 7.6181148095823705e-06, + "loss": 0.7977, + "step": 6737 + }, + { + "epoch": 1.7920212765957446, + "grad_norm": 4.112497806549072, + 
"learning_rate": 7.6173654745097106e-06, + "loss": 0.7763, + "step": 6738 + }, + { + "epoch": 1.7922872340425533, + "grad_norm": 3.998528003692627, + "learning_rate": 7.6166160584526795e-06, + "loss": 0.8215, + "step": 6739 + }, + { + "epoch": 1.7925531914893615, + "grad_norm": 3.6492180824279785, + "learning_rate": 7.615866561434468e-06, + "loss": 0.7239, + "step": 6740 + }, + { + "epoch": 1.7928191489361702, + "grad_norm": 3.8486714363098145, + "learning_rate": 7.615116983478266e-06, + "loss": 0.8435, + "step": 6741 + }, + { + "epoch": 1.7930851063829787, + "grad_norm": 3.863814353942871, + "learning_rate": 7.614367324607263e-06, + "loss": 0.8033, + "step": 6742 + }, + { + "epoch": 1.7933510638297872, + "grad_norm": 3.88749098777771, + "learning_rate": 7.613617584844662e-06, + "loss": 0.8072, + "step": 6743 + }, + { + "epoch": 1.7936170212765958, + "grad_norm": 3.9917871952056885, + "learning_rate": 7.612867764213651e-06, + "loss": 0.8138, + "step": 6744 + }, + { + "epoch": 1.7938829787234043, + "grad_norm": 4.009222507476807, + "learning_rate": 7.612117862737437e-06, + "loss": 0.7131, + "step": 6745 + }, + { + "epoch": 1.7941489361702128, + "grad_norm": 4.001763343811035, + "learning_rate": 7.611367880439221e-06, + "loss": 0.9487, + "step": 6746 + }, + { + "epoch": 1.7944148936170212, + "grad_norm": 4.2233805656433105, + "learning_rate": 7.610617817342207e-06, + "loss": 0.7244, + "step": 6747 + }, + { + "epoch": 1.7946808510638297, + "grad_norm": 3.7131550312042236, + "learning_rate": 7.609867673469607e-06, + "loss": 0.8303, + "step": 6748 + }, + { + "epoch": 1.7949468085106384, + "grad_norm": 4.046380519866943, + "learning_rate": 7.609117448844626e-06, + "loss": 0.8372, + "step": 6749 + }, + { + "epoch": 1.7952127659574468, + "grad_norm": 4.070696830749512, + "learning_rate": 7.60836714349048e-06, + "loss": 0.8259, + "step": 6750 + }, + { + "epoch": 1.7954787234042553, + "grad_norm": 3.893247604370117, + "learning_rate": 7.607616757430383e-06, + "loss": 0.8598, + "step": 6751 + }, + { + "epoch": 1.795744680851064, + "grad_norm": 3.7077648639678955, + "learning_rate": 7.606866290687555e-06, + "loss": 0.8036, + "step": 6752 + }, + { + "epoch": 1.7960106382978722, + "grad_norm": 4.3204450607299805, + "learning_rate": 7.606115743285213e-06, + "loss": 0.8424, + "step": 6753 + }, + { + "epoch": 1.796276595744681, + "grad_norm": 3.3555731773376465, + "learning_rate": 7.605365115246581e-06, + "loss": 0.8369, + "step": 6754 + }, + { + "epoch": 1.7965425531914894, + "grad_norm": 3.561962842941284, + "learning_rate": 7.604614406594888e-06, + "loss": 0.7841, + "step": 6755 + }, + { + "epoch": 1.7968085106382978, + "grad_norm": 4.0263166427612305, + "learning_rate": 7.6038636173533565e-06, + "loss": 0.7135, + "step": 6756 + }, + { + "epoch": 1.7970744680851065, + "grad_norm": 3.8524928092956543, + "learning_rate": 7.603112747545218e-06, + "loss": 0.8327, + "step": 6757 + }, + { + "epoch": 1.7973404255319148, + "grad_norm": 3.5046606063842773, + "learning_rate": 7.602361797193709e-06, + "loss": 0.8162, + "step": 6758 + }, + { + "epoch": 1.7976063829787234, + "grad_norm": 4.547070503234863, + "learning_rate": 7.60161076632206e-06, + "loss": 0.8014, + "step": 6759 + }, + { + "epoch": 1.797872340425532, + "grad_norm": 4.453802585601807, + "learning_rate": 7.600859654953513e-06, + "loss": 0.9287, + "step": 6760 + }, + { + "epoch": 1.7981382978723404, + "grad_norm": 4.324093818664551, + "learning_rate": 7.6001084631113046e-06, + "loss": 0.848, + "step": 6761 + }, + { + "epoch": 1.798404255319149, + 
"grad_norm": 4.146725177764893, + "learning_rate": 7.599357190818679e-06, + "loss": 0.8875, + "step": 6762 + }, + { + "epoch": 1.7986702127659573, + "grad_norm": 4.132041931152344, + "learning_rate": 7.598605838098882e-06, + "loss": 0.8413, + "step": 6763 + }, + { + "epoch": 1.798936170212766, + "grad_norm": 3.829908847808838, + "learning_rate": 7.59785440497516e-06, + "loss": 0.843, + "step": 6764 + }, + { + "epoch": 1.7992021276595744, + "grad_norm": 4.308759689331055, + "learning_rate": 7.597102891470766e-06, + "loss": 0.7839, + "step": 6765 + }, + { + "epoch": 1.799468085106383, + "grad_norm": 3.6383216381073, + "learning_rate": 7.59635129760895e-06, + "loss": 0.608, + "step": 6766 + }, + { + "epoch": 1.7997340425531916, + "grad_norm": 3.6101510524749756, + "learning_rate": 7.595599623412968e-06, + "loss": 0.7246, + "step": 6767 + }, + { + "epoch": 1.8, + "grad_norm": 3.51635479927063, + "learning_rate": 7.594847868906076e-06, + "loss": 0.798, + "step": 6768 + }, + { + "epoch": 1.8002659574468085, + "grad_norm": 3.927917718887329, + "learning_rate": 7.594096034111538e-06, + "loss": 0.8229, + "step": 6769 + }, + { + "epoch": 1.800531914893617, + "grad_norm": 4.29150390625, + "learning_rate": 7.5933441190526146e-06, + "loss": 0.922, + "step": 6770 + }, + { + "epoch": 1.8007978723404254, + "grad_norm": 3.8685336112976074, + "learning_rate": 7.592592123752569e-06, + "loss": 0.7242, + "step": 6771 + }, + { + "epoch": 1.8010638297872341, + "grad_norm": 3.9335358142852783, + "learning_rate": 7.591840048234673e-06, + "loss": 0.8717, + "step": 6772 + }, + { + "epoch": 1.8013297872340426, + "grad_norm": 4.033020496368408, + "learning_rate": 7.591087892522193e-06, + "loss": 0.8129, + "step": 6773 + }, + { + "epoch": 1.801595744680851, + "grad_norm": 4.348812580108643, + "learning_rate": 7.590335656638403e-06, + "loss": 0.8352, + "step": 6774 + }, + { + "epoch": 1.8018617021276597, + "grad_norm": 3.683743476867676, + "learning_rate": 7.589583340606579e-06, + "loss": 0.8427, + "step": 6775 + }, + { + "epoch": 1.802127659574468, + "grad_norm": 3.782118797302246, + "learning_rate": 7.588830944449996e-06, + "loss": 0.8659, + "step": 6776 + }, + { + "epoch": 1.8023936170212767, + "grad_norm": 4.097870826721191, + "learning_rate": 7.5880784681919365e-06, + "loss": 0.7472, + "step": 6777 + }, + { + "epoch": 1.8026595744680851, + "grad_norm": 3.921733856201172, + "learning_rate": 7.587325911855681e-06, + "loss": 0.8388, + "step": 6778 + }, + { + "epoch": 1.8029255319148936, + "grad_norm": 4.305613994598389, + "learning_rate": 7.586573275464517e-06, + "loss": 1.0133, + "step": 6779 + }, + { + "epoch": 1.8031914893617023, + "grad_norm": 4.13943338394165, + "learning_rate": 7.58582055904173e-06, + "loss": 0.7861, + "step": 6780 + }, + { + "epoch": 1.8034574468085105, + "grad_norm": 4.047939777374268, + "learning_rate": 7.585067762610612e-06, + "loss": 0.8422, + "step": 6781 + }, + { + "epoch": 1.8037234042553192, + "grad_norm": 3.8695991039276123, + "learning_rate": 7.584314886194451e-06, + "loss": 0.8365, + "step": 6782 + }, + { + "epoch": 1.8039893617021276, + "grad_norm": 3.7691190242767334, + "learning_rate": 7.583561929816547e-06, + "loss": 0.8293, + "step": 6783 + }, + { + "epoch": 1.804255319148936, + "grad_norm": 4.062473773956299, + "learning_rate": 7.5828088935001954e-06, + "loss": 0.8118, + "step": 6784 + }, + { + "epoch": 1.8045212765957448, + "grad_norm": 4.588931560516357, + "learning_rate": 7.582055777268693e-06, + "loss": 0.8835, + "step": 6785 + }, + { + "epoch": 1.804787234042553, + 
"grad_norm": 3.1973307132720947, + "learning_rate": 7.581302581145346e-06, + "loss": 0.6728, + "step": 6786 + }, + { + "epoch": 1.8050531914893617, + "grad_norm": 4.123830318450928, + "learning_rate": 7.5805493051534605e-06, + "loss": 0.9315, + "step": 6787 + }, + { + "epoch": 1.8053191489361702, + "grad_norm": 3.992337942123413, + "learning_rate": 7.57979594931634e-06, + "loss": 0.7951, + "step": 6788 + }, + { + "epoch": 1.8055851063829786, + "grad_norm": 3.456594467163086, + "learning_rate": 7.579042513657294e-06, + "loss": 0.8114, + "step": 6789 + }, + { + "epoch": 1.8058510638297873, + "grad_norm": 4.029353618621826, + "learning_rate": 7.578288998199638e-06, + "loss": 0.895, + "step": 6790 + }, + { + "epoch": 1.8061170212765958, + "grad_norm": 4.027595520019531, + "learning_rate": 7.577535402966683e-06, + "loss": 0.8416, + "step": 6791 + }, + { + "epoch": 1.8063829787234043, + "grad_norm": 3.8989861011505127, + "learning_rate": 7.5767817279817505e-06, + "loss": 0.8275, + "step": 6792 + }, + { + "epoch": 1.8066489361702127, + "grad_norm": 4.1814961433410645, + "learning_rate": 7.576027973268155e-06, + "loss": 0.7388, + "step": 6793 + }, + { + "epoch": 1.8069148936170212, + "grad_norm": 3.8830153942108154, + "learning_rate": 7.575274138849223e-06, + "loss": 0.7622, + "step": 6794 + }, + { + "epoch": 1.8071808510638299, + "grad_norm": 3.6945488452911377, + "learning_rate": 7.574520224748276e-06, + "loss": 0.6767, + "step": 6795 + }, + { + "epoch": 1.8074468085106383, + "grad_norm": 3.8499093055725098, + "learning_rate": 7.5737662309886415e-06, + "loss": 0.8128, + "step": 6796 + }, + { + "epoch": 1.8077127659574468, + "grad_norm": 4.120965480804443, + "learning_rate": 7.573012157593651e-06, + "loss": 0.8356, + "step": 6797 + }, + { + "epoch": 1.8079787234042555, + "grad_norm": 3.9702072143554688, + "learning_rate": 7.572258004586635e-06, + "loss": 0.773, + "step": 6798 + }, + { + "epoch": 1.8082446808510637, + "grad_norm": 3.910039186477661, + "learning_rate": 7.5715037719909266e-06, + "loss": 0.7577, + "step": 6799 + }, + { + "epoch": 1.8085106382978724, + "grad_norm": 3.9392266273498535, + "learning_rate": 7.570749459829865e-06, + "loss": 0.9043, + "step": 6800 + }, + { + "epoch": 1.8087765957446809, + "grad_norm": 3.9405999183654785, + "learning_rate": 7.56999506812679e-06, + "loss": 0.8526, + "step": 6801 + }, + { + "epoch": 1.8090425531914893, + "grad_norm": 3.701950788497925, + "learning_rate": 7.569240596905038e-06, + "loss": 0.7136, + "step": 6802 + }, + { + "epoch": 1.809308510638298, + "grad_norm": 3.7333173751831055, + "learning_rate": 7.568486046187959e-06, + "loss": 0.8191, + "step": 6803 + }, + { + "epoch": 1.8095744680851062, + "grad_norm": 3.9274251461029053, + "learning_rate": 7.567731415998898e-06, + "loss": 0.8371, + "step": 6804 + }, + { + "epoch": 1.809840425531915, + "grad_norm": 4.320472240447998, + "learning_rate": 7.566976706361204e-06, + "loss": 0.8743, + "step": 6805 + }, + { + "epoch": 1.8101063829787234, + "grad_norm": 4.124827861785889, + "learning_rate": 7.566221917298228e-06, + "loss": 0.8599, + "step": 6806 + }, + { + "epoch": 1.8103723404255319, + "grad_norm": 4.09792947769165, + "learning_rate": 7.565467048833325e-06, + "loss": 0.782, + "step": 6807 + }, + { + "epoch": 1.8106382978723405, + "grad_norm": 4.003774166107178, + "learning_rate": 7.56471210098985e-06, + "loss": 0.7946, + "step": 6808 + }, + { + "epoch": 1.8109042553191488, + "grad_norm": 4.259424686431885, + "learning_rate": 7.563957073791164e-06, + "loss": 0.8328, + "step": 6809 + }, + { + 
"epoch": 1.8111702127659575, + "grad_norm": 3.9565248489379883, + "learning_rate": 7.563201967260627e-06, + "loss": 0.8544, + "step": 6810 + }, + { + "epoch": 1.811436170212766, + "grad_norm": 3.88087797164917, + "learning_rate": 7.562446781421604e-06, + "loss": 0.7987, + "step": 6811 + }, + { + "epoch": 1.8117021276595744, + "grad_norm": 3.9190945625305176, + "learning_rate": 7.5616915162974594e-06, + "loss": 0.8162, + "step": 6812 + }, + { + "epoch": 1.811968085106383, + "grad_norm": 3.700688600540161, + "learning_rate": 7.560936171911564e-06, + "loss": 0.7738, + "step": 6813 + }, + { + "epoch": 1.8122340425531915, + "grad_norm": 4.023971080780029, + "learning_rate": 7.560180748287289e-06, + "loss": 0.8266, + "step": 6814 + }, + { + "epoch": 1.8125, + "grad_norm": 4.754519462585449, + "learning_rate": 7.559425245448006e-06, + "loss": 1.0779, + "step": 6815 + }, + { + "epoch": 1.8127659574468085, + "grad_norm": 4.043941497802734, + "learning_rate": 7.558669663417093e-06, + "loss": 0.7789, + "step": 6816 + }, + { + "epoch": 1.813031914893617, + "grad_norm": 4.064941883087158, + "learning_rate": 7.557914002217929e-06, + "loss": 0.8235, + "step": 6817 + }, + { + "epoch": 1.8132978723404256, + "grad_norm": 4.2770562171936035, + "learning_rate": 7.5571582618738936e-06, + "loss": 0.8647, + "step": 6818 + }, + { + "epoch": 1.813563829787234, + "grad_norm": 3.758079767227173, + "learning_rate": 7.55640244240837e-06, + "loss": 0.765, + "step": 6819 + }, + { + "epoch": 1.8138297872340425, + "grad_norm": 4.024742603302002, + "learning_rate": 7.555646543844747e-06, + "loss": 0.9143, + "step": 6820 + }, + { + "epoch": 1.8140957446808512, + "grad_norm": 4.142058372497559, + "learning_rate": 7.55489056620641e-06, + "loss": 0.8872, + "step": 6821 + }, + { + "epoch": 1.8143617021276595, + "grad_norm": 4.0311455726623535, + "learning_rate": 7.554134509516751e-06, + "loss": 0.7628, + "step": 6822 + }, + { + "epoch": 1.8146276595744681, + "grad_norm": 3.73848032951355, + "learning_rate": 7.553378373799163e-06, + "loss": 0.807, + "step": 6823 + }, + { + "epoch": 1.8148936170212766, + "grad_norm": 3.553116798400879, + "learning_rate": 7.552622159077041e-06, + "loss": 0.8166, + "step": 6824 + }, + { + "epoch": 1.815159574468085, + "grad_norm": 3.678316116333008, + "learning_rate": 7.5518658653737844e-06, + "loss": 0.8462, + "step": 6825 + }, + { + "epoch": 1.8154255319148938, + "grad_norm": 4.440575122833252, + "learning_rate": 7.551109492712795e-06, + "loss": 0.8861, + "step": 6826 + }, + { + "epoch": 1.815691489361702, + "grad_norm": 4.359316825866699, + "learning_rate": 7.550353041117473e-06, + "loss": 0.8025, + "step": 6827 + }, + { + "epoch": 1.8159574468085107, + "grad_norm": 3.976832389831543, + "learning_rate": 7.549596510611226e-06, + "loss": 0.8486, + "step": 6828 + }, + { + "epoch": 1.8162234042553191, + "grad_norm": 3.64974308013916, + "learning_rate": 7.54883990121746e-06, + "loss": 0.6982, + "step": 6829 + }, + { + "epoch": 1.8164893617021276, + "grad_norm": 4.051089286804199, + "learning_rate": 7.548083212959588e-06, + "loss": 0.8417, + "step": 6830 + }, + { + "epoch": 1.8167553191489363, + "grad_norm": 3.949113130569458, + "learning_rate": 7.547326445861021e-06, + "loss": 0.7382, + "step": 6831 + }, + { + "epoch": 1.8170212765957445, + "grad_norm": 3.896155834197998, + "learning_rate": 7.546569599945174e-06, + "loss": 0.9312, + "step": 6832 + }, + { + "epoch": 1.8172872340425532, + "grad_norm": 4.127990245819092, + "learning_rate": 7.545812675235467e-06, + "loss": 0.9422, + "step": 6833 + }, + { 
+ "epoch": 1.8175531914893617, + "grad_norm": 3.8345584869384766, + "learning_rate": 7.545055671755316e-06, + "loss": 0.8672, + "step": 6834 + }, + { + "epoch": 1.8178191489361701, + "grad_norm": 3.544022560119629, + "learning_rate": 7.544298589528148e-06, + "loss": 0.8378, + "step": 6835 + }, + { + "epoch": 1.8180851063829788, + "grad_norm": 3.773446798324585, + "learning_rate": 7.543541428577386e-06, + "loss": 0.7617, + "step": 6836 + }, + { + "epoch": 1.8183510638297873, + "grad_norm": 4.245392322540283, + "learning_rate": 7.542784188926456e-06, + "loss": 0.7689, + "step": 6837 + }, + { + "epoch": 1.8186170212765957, + "grad_norm": 4.0154924392700195, + "learning_rate": 7.542026870598791e-06, + "loss": 0.7467, + "step": 6838 + }, + { + "epoch": 1.8188829787234042, + "grad_norm": 4.492767810821533, + "learning_rate": 7.5412694736178206e-06, + "loss": 0.9573, + "step": 6839 + }, + { + "epoch": 1.8191489361702127, + "grad_norm": 3.7740705013275146, + "learning_rate": 7.540511998006982e-06, + "loss": 0.6853, + "step": 6840 + }, + { + "epoch": 1.8194148936170214, + "grad_norm": 4.6515655517578125, + "learning_rate": 7.539754443789709e-06, + "loss": 0.9875, + "step": 6841 + }, + { + "epoch": 1.8196808510638298, + "grad_norm": 4.019815921783447, + "learning_rate": 7.5389968109894465e-06, + "loss": 0.7956, + "step": 6842 + }, + { + "epoch": 1.8199468085106383, + "grad_norm": 3.8876473903656006, + "learning_rate": 7.5382390996296315e-06, + "loss": 0.8368, + "step": 6843 + }, + { + "epoch": 1.820212765957447, + "grad_norm": 4.036003112792969, + "learning_rate": 7.537481309733709e-06, + "loss": 0.7615, + "step": 6844 + }, + { + "epoch": 1.8204787234042552, + "grad_norm": 3.9731733798980713, + "learning_rate": 7.53672344132513e-06, + "loss": 0.8408, + "step": 6845 + }, + { + "epoch": 1.820744680851064, + "grad_norm": 4.149892807006836, + "learning_rate": 7.53596549442734e-06, + "loss": 0.7553, + "step": 6846 + }, + { + "epoch": 1.8210106382978724, + "grad_norm": 3.9756197929382324, + "learning_rate": 7.535207469063791e-06, + "loss": 0.8429, + "step": 6847 + }, + { + "epoch": 1.8212765957446808, + "grad_norm": 4.044477939605713, + "learning_rate": 7.53444936525794e-06, + "loss": 0.7761, + "step": 6848 + }, + { + "epoch": 1.8215425531914895, + "grad_norm": 3.613596200942993, + "learning_rate": 7.53369118303324e-06, + "loss": 0.808, + "step": 6849 + }, + { + "epoch": 1.8218085106382977, + "grad_norm": 4.789092540740967, + "learning_rate": 7.532932922413152e-06, + "loss": 0.8992, + "step": 6850 + }, + { + "epoch": 1.8220744680851064, + "grad_norm": 3.8128976821899414, + "learning_rate": 7.532174583421138e-06, + "loss": 0.7259, + "step": 6851 + }, + { + "epoch": 1.8223404255319149, + "grad_norm": 3.685126781463623, + "learning_rate": 7.53141616608066e-06, + "loss": 0.7971, + "step": 6852 + }, + { + "epoch": 1.8226063829787233, + "grad_norm": 3.8787617683410645, + "learning_rate": 7.5306576704151865e-06, + "loss": 0.7447, + "step": 6853 + }, + { + "epoch": 1.822872340425532, + "grad_norm": 4.506245136260986, + "learning_rate": 7.529899096448185e-06, + "loss": 0.8898, + "step": 6854 + }, + { + "epoch": 1.8231382978723403, + "grad_norm": 4.238636016845703, + "learning_rate": 7.529140444203127e-06, + "loss": 0.8057, + "step": 6855 + }, + { + "epoch": 1.823404255319149, + "grad_norm": 4.039521217346191, + "learning_rate": 7.528381713703485e-06, + "loss": 0.772, + "step": 6856 + }, + { + "epoch": 1.8236702127659574, + "grad_norm": 3.6089868545532227, + "learning_rate": 7.5276229049727375e-06, + "loss": 
0.8194, + "step": 6857 + }, + { + "epoch": 1.8239361702127659, + "grad_norm": 3.4110054969787598, + "learning_rate": 7.52686401803436e-06, + "loss": 0.6902, + "step": 6858 + }, + { + "epoch": 1.8242021276595746, + "grad_norm": 3.6139302253723145, + "learning_rate": 7.526105052911836e-06, + "loss": 0.8318, + "step": 6859 + }, + { + "epoch": 1.824468085106383, + "grad_norm": 4.215152740478516, + "learning_rate": 7.525346009628647e-06, + "loss": 0.8303, + "step": 6860 + }, + { + "epoch": 1.8247340425531915, + "grad_norm": 3.8578953742980957, + "learning_rate": 7.524586888208278e-06, + "loss": 0.8625, + "step": 6861 + }, + { + "epoch": 1.825, + "grad_norm": 3.8874824047088623, + "learning_rate": 7.52382768867422e-06, + "loss": 0.7106, + "step": 6862 + }, + { + "epoch": 1.8252659574468084, + "grad_norm": 3.746168851852417, + "learning_rate": 7.5230684110499604e-06, + "loss": 0.8753, + "step": 6863 + }, + { + "epoch": 1.825531914893617, + "grad_norm": 3.70993971824646, + "learning_rate": 7.522309055358995e-06, + "loss": 0.7393, + "step": 6864 + }, + { + "epoch": 1.8257978723404256, + "grad_norm": 3.599679470062256, + "learning_rate": 7.5215496216248175e-06, + "loss": 0.893, + "step": 6865 + }, + { + "epoch": 1.826063829787234, + "grad_norm": 3.7604589462280273, + "learning_rate": 7.520790109870926e-06, + "loss": 0.7966, + "step": 6866 + }, + { + "epoch": 1.8263297872340427, + "grad_norm": 3.9113166332244873, + "learning_rate": 7.5200305201208205e-06, + "loss": 0.8071, + "step": 6867 + }, + { + "epoch": 1.826595744680851, + "grad_norm": 4.262864112854004, + "learning_rate": 7.519270852398002e-06, + "loss": 0.7942, + "step": 6868 + }, + { + "epoch": 1.8268617021276596, + "grad_norm": 4.096951007843018, + "learning_rate": 7.5185111067259804e-06, + "loss": 0.717, + "step": 6869 + }, + { + "epoch": 1.827127659574468, + "grad_norm": 4.112506866455078, + "learning_rate": 7.517751283128258e-06, + "loss": 0.8871, + "step": 6870 + }, + { + "epoch": 1.8273936170212766, + "grad_norm": 3.5203890800476074, + "learning_rate": 7.516991381628347e-06, + "loss": 0.796, + "step": 6871 + }, + { + "epoch": 1.8276595744680852, + "grad_norm": 3.556929588317871, + "learning_rate": 7.516231402249758e-06, + "loss": 0.8346, + "step": 6872 + }, + { + "epoch": 1.8279255319148935, + "grad_norm": 3.3509085178375244, + "learning_rate": 7.51547134501601e-06, + "loss": 0.7763, + "step": 6873 + }, + { + "epoch": 1.8281914893617022, + "grad_norm": 4.3177103996276855, + "learning_rate": 7.514711209950615e-06, + "loss": 0.7943, + "step": 6874 + }, + { + "epoch": 1.8284574468085106, + "grad_norm": 3.8919661045074463, + "learning_rate": 7.513950997077094e-06, + "loss": 0.7541, + "step": 6875 + }, + { + "epoch": 1.828723404255319, + "grad_norm": 3.506849765777588, + "learning_rate": 7.513190706418969e-06, + "loss": 0.8451, + "step": 6876 + }, + { + "epoch": 1.8289893617021278, + "grad_norm": 4.711544513702393, + "learning_rate": 7.512430337999768e-06, + "loss": 0.9569, + "step": 6877 + }, + { + "epoch": 1.829255319148936, + "grad_norm": 4.111194610595703, + "learning_rate": 7.511669891843011e-06, + "loss": 0.9289, + "step": 6878 + }, + { + "epoch": 1.8295212765957447, + "grad_norm": 3.4928982257843018, + "learning_rate": 7.510909367972231e-06, + "loss": 0.7627, + "step": 6879 + }, + { + "epoch": 1.8297872340425532, + "grad_norm": 3.737337827682495, + "learning_rate": 7.5101487664109605e-06, + "loss": 0.7463, + "step": 6880 + }, + { + "epoch": 1.8300531914893616, + "grad_norm": 3.4611358642578125, + "learning_rate": 7.50938808718273e-06, 
+ "loss": 0.7764, + "step": 6881 + }, + { + "epoch": 1.8303191489361703, + "grad_norm": 3.901796817779541, + "learning_rate": 7.508627330311078e-06, + "loss": 0.9079, + "step": 6882 + }, + { + "epoch": 1.8305851063829788, + "grad_norm": 3.8375611305236816, + "learning_rate": 7.507866495819543e-06, + "loss": 0.7861, + "step": 6883 + }, + { + "epoch": 1.8308510638297872, + "grad_norm": 3.7982888221740723, + "learning_rate": 7.507105583731666e-06, + "loss": 0.8905, + "step": 6884 + }, + { + "epoch": 1.8311170212765957, + "grad_norm": 3.70542573928833, + "learning_rate": 7.506344594070991e-06, + "loss": 0.7173, + "step": 6885 + }, + { + "epoch": 1.8313829787234042, + "grad_norm": 3.7828474044799805, + "learning_rate": 7.505583526861064e-06, + "loss": 0.8687, + "step": 6886 + }, + { + "epoch": 1.8316489361702128, + "grad_norm": 4.376963138580322, + "learning_rate": 7.504822382125432e-06, + "loss": 0.982, + "step": 6887 + }, + { + "epoch": 1.8319148936170213, + "grad_norm": 3.9631431102752686, + "learning_rate": 7.504061159887646e-06, + "loss": 0.8186, + "step": 6888 + }, + { + "epoch": 1.8321808510638298, + "grad_norm": 4.296795845031738, + "learning_rate": 7.5032998601712605e-06, + "loss": 0.8346, + "step": 6889 + }, + { + "epoch": 1.8324468085106385, + "grad_norm": 3.889289617538452, + "learning_rate": 7.502538482999829e-06, + "loss": 0.8344, + "step": 6890 + }, + { + "epoch": 1.8327127659574467, + "grad_norm": 4.060772895812988, + "learning_rate": 7.50177702839691e-06, + "loss": 0.7625, + "step": 6891 + }, + { + "epoch": 1.8329787234042554, + "grad_norm": 3.6209208965301514, + "learning_rate": 7.501015496386066e-06, + "loss": 0.779, + "step": 6892 + }, + { + "epoch": 1.8332446808510638, + "grad_norm": 3.7519564628601074, + "learning_rate": 7.5002538869908556e-06, + "loss": 0.7245, + "step": 6893 + }, + { + "epoch": 1.8335106382978723, + "grad_norm": 3.842135190963745, + "learning_rate": 7.499492200234849e-06, + "loss": 0.7977, + "step": 6894 + }, + { + "epoch": 1.833776595744681, + "grad_norm": 4.067161560058594, + "learning_rate": 7.498730436141609e-06, + "loss": 0.8287, + "step": 6895 + }, + { + "epoch": 1.8340425531914892, + "grad_norm": 3.8573522567749023, + "learning_rate": 7.497968594734708e-06, + "loss": 0.7012, + "step": 6896 + }, + { + "epoch": 1.834308510638298, + "grad_norm": 3.792734146118164, + "learning_rate": 7.4972066760377184e-06, + "loss": 0.7986, + "step": 6897 + }, + { + "epoch": 1.8345744680851064, + "grad_norm": 4.287036418914795, + "learning_rate": 7.496444680074213e-06, + "loss": 0.8091, + "step": 6898 + }, + { + "epoch": 1.8348404255319148, + "grad_norm": 3.9161949157714844, + "learning_rate": 7.49568260686777e-06, + "loss": 0.8796, + "step": 6899 + }, + { + "epoch": 1.8351063829787235, + "grad_norm": 3.8841638565063477, + "learning_rate": 7.49492045644197e-06, + "loss": 0.8827, + "step": 6900 + }, + { + "epoch": 1.8353723404255318, + "grad_norm": 3.770533323287964, + "learning_rate": 7.494158228820393e-06, + "loss": 0.7671, + "step": 6901 + }, + { + "epoch": 1.8356382978723405, + "grad_norm": 4.155034065246582, + "learning_rate": 7.493395924026623e-06, + "loss": 0.8533, + "step": 6902 + }, + { + "epoch": 1.835904255319149, + "grad_norm": 3.911745071411133, + "learning_rate": 7.492633542084249e-06, + "loss": 0.82, + "step": 6903 + }, + { + "epoch": 1.8361702127659574, + "grad_norm": 3.444728136062622, + "learning_rate": 7.491871083016858e-06, + "loss": 0.7717, + "step": 6904 + }, + { + "epoch": 1.836436170212766, + "grad_norm": 4.003023147583008, + "learning_rate": 
7.491108546848041e-06, + "loss": 0.7351, + "step": 6905 + }, + { + "epoch": 1.8367021276595743, + "grad_norm": 3.9087607860565186, + "learning_rate": 7.490345933601395e-06, + "loss": 0.8509, + "step": 6906 + }, + { + "epoch": 1.836968085106383, + "grad_norm": 4.098905086517334, + "learning_rate": 7.489583243300511e-06, + "loss": 0.9289, + "step": 6907 + }, + { + "epoch": 1.8372340425531914, + "grad_norm": 4.120253562927246, + "learning_rate": 7.488820475968992e-06, + "loss": 0.8707, + "step": 6908 + }, + { + "epoch": 1.8375, + "grad_norm": 4.324950218200684, + "learning_rate": 7.488057631630438e-06, + "loss": 0.7811, + "step": 6909 + }, + { + "epoch": 1.8377659574468086, + "grad_norm": 4.5706634521484375, + "learning_rate": 7.4872947103084495e-06, + "loss": 0.8641, + "step": 6910 + }, + { + "epoch": 1.838031914893617, + "grad_norm": 4.22561502456665, + "learning_rate": 7.486531712026634e-06, + "loss": 0.794, + "step": 6911 + }, + { + "epoch": 1.8382978723404255, + "grad_norm": 4.015974521636963, + "learning_rate": 7.485768636808603e-06, + "loss": 0.8757, + "step": 6912 + }, + { + "epoch": 1.8385638297872342, + "grad_norm": 3.7457127571105957, + "learning_rate": 7.48500548467796e-06, + "loss": 0.8682, + "step": 6913 + }, + { + "epoch": 1.8388297872340424, + "grad_norm": 3.964571714401245, + "learning_rate": 7.484242255658322e-06, + "loss": 0.7431, + "step": 6914 + }, + { + "epoch": 1.8390957446808511, + "grad_norm": 3.838426351547241, + "learning_rate": 7.4834789497733065e-06, + "loss": 0.7413, + "step": 6915 + }, + { + "epoch": 1.8393617021276596, + "grad_norm": 3.7367520332336426, + "learning_rate": 7.4827155670465264e-06, + "loss": 0.8366, + "step": 6916 + }, + { + "epoch": 1.839627659574468, + "grad_norm": 3.9056553840637207, + "learning_rate": 7.481952107501604e-06, + "loss": 0.7134, + "step": 6917 + }, + { + "epoch": 1.8398936170212767, + "grad_norm": 4.098144054412842, + "learning_rate": 7.481188571162161e-06, + "loss": 0.7744, + "step": 6918 + }, + { + "epoch": 1.840159574468085, + "grad_norm": 4.067973613739014, + "learning_rate": 7.480424958051823e-06, + "loss": 0.8143, + "step": 6919 + }, + { + "epoch": 1.8404255319148937, + "grad_norm": 3.9194462299346924, + "learning_rate": 7.479661268194217e-06, + "loss": 0.8335, + "step": 6920 + }, + { + "epoch": 1.8406914893617021, + "grad_norm": 4.130805492401123, + "learning_rate": 7.4788975016129704e-06, + "loss": 0.769, + "step": 6921 + }, + { + "epoch": 1.8409574468085106, + "grad_norm": 3.580792188644409, + "learning_rate": 7.478133658331716e-06, + "loss": 0.7743, + "step": 6922 + }, + { + "epoch": 1.8412234042553193, + "grad_norm": 3.78035569190979, + "learning_rate": 7.477369738374092e-06, + "loss": 0.8619, + "step": 6923 + }, + { + "epoch": 1.8414893617021275, + "grad_norm": 3.8400089740753174, + "learning_rate": 7.476605741763729e-06, + "loss": 0.8161, + "step": 6924 + }, + { + "epoch": 1.8417553191489362, + "grad_norm": 3.7448103427886963, + "learning_rate": 7.475841668524268e-06, + "loss": 0.8305, + "step": 6925 + }, + { + "epoch": 1.8420212765957447, + "grad_norm": 3.828014850616455, + "learning_rate": 7.475077518679352e-06, + "loss": 0.8424, + "step": 6926 + }, + { + "epoch": 1.8422872340425531, + "grad_norm": 3.776527166366577, + "learning_rate": 7.474313292252624e-06, + "loss": 0.9811, + "step": 6927 + }, + { + "epoch": 1.8425531914893618, + "grad_norm": 4.294341564178467, + "learning_rate": 7.473548989267728e-06, + "loss": 0.8375, + "step": 6928 + }, + { + "epoch": 1.84281914893617, + "grad_norm": 4.230419158935547, + 
"learning_rate": 7.472784609748316e-06, + "loss": 0.7886, + "step": 6929 + }, + { + "epoch": 1.8430851063829787, + "grad_norm": 4.243613243103027, + "learning_rate": 7.472020153718036e-06, + "loss": 0.8787, + "step": 6930 + }, + { + "epoch": 1.8433510638297872, + "grad_norm": 4.046195983886719, + "learning_rate": 7.471255621200541e-06, + "loss": 0.7344, + "step": 6931 + }, + { + "epoch": 1.8436170212765957, + "grad_norm": 3.4666972160339355, + "learning_rate": 7.470491012219488e-06, + "loss": 0.8123, + "step": 6932 + }, + { + "epoch": 1.8438829787234043, + "grad_norm": 4.226772785186768, + "learning_rate": 7.469726326798535e-06, + "loss": 0.7765, + "step": 6933 + }, + { + "epoch": 1.8441489361702128, + "grad_norm": 4.348804950714111, + "learning_rate": 7.468961564961341e-06, + "loss": 0.8481, + "step": 6934 + }, + { + "epoch": 1.8444148936170213, + "grad_norm": 3.7085683345794678, + "learning_rate": 7.4681967267315715e-06, + "loss": 0.7717, + "step": 6935 + }, + { + "epoch": 1.84468085106383, + "grad_norm": 3.670295238494873, + "learning_rate": 7.4674318121328856e-06, + "loss": 0.7074, + "step": 6936 + }, + { + "epoch": 1.8449468085106382, + "grad_norm": 4.235050678253174, + "learning_rate": 7.466666821188957e-06, + "loss": 0.9085, + "step": 6937 + }, + { + "epoch": 1.8452127659574469, + "grad_norm": 4.282822132110596, + "learning_rate": 7.465901753923452e-06, + "loss": 0.8641, + "step": 6938 + }, + { + "epoch": 1.8454787234042553, + "grad_norm": 3.9703402519226074, + "learning_rate": 7.465136610360044e-06, + "loss": 0.7331, + "step": 6939 + }, + { + "epoch": 1.8457446808510638, + "grad_norm": 3.793503522872925, + "learning_rate": 7.4643713905224065e-06, + "loss": 0.8122, + "step": 6940 + }, + { + "epoch": 1.8460106382978725, + "grad_norm": 4.120753288269043, + "learning_rate": 7.463606094434218e-06, + "loss": 0.8822, + "step": 6941 + }, + { + "epoch": 1.8462765957446807, + "grad_norm": 4.266670227050781, + "learning_rate": 7.462840722119155e-06, + "loss": 0.8363, + "step": 6942 + }, + { + "epoch": 1.8465425531914894, + "grad_norm": 3.998488664627075, + "learning_rate": 7.462075273600901e-06, + "loss": 0.895, + "step": 6943 + }, + { + "epoch": 1.8468085106382979, + "grad_norm": 3.923610210418701, + "learning_rate": 7.461309748903138e-06, + "loss": 0.8406, + "step": 6944 + }, + { + "epoch": 1.8470744680851063, + "grad_norm": 4.076598644256592, + "learning_rate": 7.460544148049555e-06, + "loss": 0.7919, + "step": 6945 + }, + { + "epoch": 1.847340425531915, + "grad_norm": 4.171792507171631, + "learning_rate": 7.459778471063839e-06, + "loss": 0.9616, + "step": 6946 + }, + { + "epoch": 1.8476063829787233, + "grad_norm": 4.327701091766357, + "learning_rate": 7.45901271796968e-06, + "loss": 0.8918, + "step": 6947 + }, + { + "epoch": 1.847872340425532, + "grad_norm": 4.035894393920898, + "learning_rate": 7.4582468887907746e-06, + "loss": 0.7007, + "step": 6948 + }, + { + "epoch": 1.8481382978723404, + "grad_norm": 3.9794068336486816, + "learning_rate": 7.457480983550813e-06, + "loss": 0.8622, + "step": 6949 + }, + { + "epoch": 1.8484042553191489, + "grad_norm": 3.988560914993286, + "learning_rate": 7.4567150022735e-06, + "loss": 0.7892, + "step": 6950 + }, + { + "epoch": 1.8486702127659576, + "grad_norm": 3.761817216873169, + "learning_rate": 7.455948944982529e-06, + "loss": 0.7549, + "step": 6951 + }, + { + "epoch": 1.8489361702127658, + "grad_norm": 3.962528944015503, + "learning_rate": 7.455182811701609e-06, + "loss": 0.7874, + "step": 6952 + }, + { + "epoch": 1.8492021276595745, + "grad_norm": 
4.180268287658691, + "learning_rate": 7.454416602454441e-06, + "loss": 0.8401, + "step": 6953 + }, + { + "epoch": 1.849468085106383, + "grad_norm": 3.7611262798309326, + "learning_rate": 7.453650317264734e-06, + "loss": 0.8463, + "step": 6954 + }, + { + "epoch": 1.8497340425531914, + "grad_norm": 3.7269387245178223, + "learning_rate": 7.452883956156197e-06, + "loss": 0.7884, + "step": 6955 + }, + { + "epoch": 1.85, + "grad_norm": 4.998419284820557, + "learning_rate": 7.452117519152542e-06, + "loss": 0.861, + "step": 6956 + }, + { + "epoch": 1.8502659574468086, + "grad_norm": 4.210315704345703, + "learning_rate": 7.4513510062774845e-06, + "loss": 0.8083, + "step": 6957 + }, + { + "epoch": 1.850531914893617, + "grad_norm": 4.184957027435303, + "learning_rate": 7.4505844175547405e-06, + "loss": 0.7648, + "step": 6958 + }, + { + "epoch": 1.8507978723404257, + "grad_norm": 3.883157730102539, + "learning_rate": 7.44981775300803e-06, + "loss": 0.789, + "step": 6959 + }, + { + "epoch": 1.851063829787234, + "grad_norm": 3.930384397506714, + "learning_rate": 7.449051012661073e-06, + "loss": 0.7467, + "step": 6960 + }, + { + "epoch": 1.8513297872340426, + "grad_norm": 4.148220062255859, + "learning_rate": 7.448284196537594e-06, + "loss": 0.8692, + "step": 6961 + }, + { + "epoch": 1.851595744680851, + "grad_norm": 4.141353607177734, + "learning_rate": 7.4475173046613205e-06, + "loss": 0.8553, + "step": 6962 + }, + { + "epoch": 1.8518617021276595, + "grad_norm": 3.8646962642669678, + "learning_rate": 7.4467503370559806e-06, + "loss": 0.7953, + "step": 6963 + }, + { + "epoch": 1.8521276595744682, + "grad_norm": 3.765763759613037, + "learning_rate": 7.445983293745302e-06, + "loss": 0.7173, + "step": 6964 + }, + { + "epoch": 1.8523936170212765, + "grad_norm": 3.5731546878814697, + "learning_rate": 7.445216174753022e-06, + "loss": 0.7643, + "step": 6965 + }, + { + "epoch": 1.8526595744680852, + "grad_norm": 3.3962113857269287, + "learning_rate": 7.444448980102875e-06, + "loss": 0.7694, + "step": 6966 + }, + { + "epoch": 1.8529255319148936, + "grad_norm": 4.201429843902588, + "learning_rate": 7.4436817098186e-06, + "loss": 0.9388, + "step": 6967 + }, + { + "epoch": 1.853191489361702, + "grad_norm": 4.063852787017822, + "learning_rate": 7.442914363923933e-06, + "loss": 0.8472, + "step": 6968 + }, + { + "epoch": 1.8534574468085108, + "grad_norm": 4.6696696281433105, + "learning_rate": 7.442146942442621e-06, + "loss": 0.8739, + "step": 6969 + }, + { + "epoch": 1.853723404255319, + "grad_norm": 3.5337836742401123, + "learning_rate": 7.4413794453984065e-06, + "loss": 0.7506, + "step": 6970 + }, + { + "epoch": 1.8539893617021277, + "grad_norm": 4.372726917266846, + "learning_rate": 7.440611872815038e-06, + "loss": 0.824, + "step": 6971 + }, + { + "epoch": 1.8542553191489362, + "grad_norm": 4.04209566116333, + "learning_rate": 7.439844224716265e-06, + "loss": 0.8098, + "step": 6972 + }, + { + "epoch": 1.8545212765957446, + "grad_norm": 3.8578147888183594, + "learning_rate": 7.439076501125839e-06, + "loss": 0.7585, + "step": 6973 + }, + { + "epoch": 1.8547872340425533, + "grad_norm": 4.210418701171875, + "learning_rate": 7.4383087020675145e-06, + "loss": 0.7915, + "step": 6974 + }, + { + "epoch": 1.8550531914893615, + "grad_norm": 3.4614603519439697, + "learning_rate": 7.4375408275650475e-06, + "loss": 0.7506, + "step": 6975 + }, + { + "epoch": 1.8553191489361702, + "grad_norm": 4.220035076141357, + "learning_rate": 7.436772877642199e-06, + "loss": 0.8875, + "step": 6976 + }, + { + "epoch": 1.8555851063829787, + 
"grad_norm": 4.095662593841553, + "learning_rate": 7.436004852322727e-06, + "loss": 0.8973, + "step": 6977 + }, + { + "epoch": 1.8558510638297872, + "grad_norm": 4.23422908782959, + "learning_rate": 7.435236751630397e-06, + "loss": 0.699, + "step": 6978 + }, + { + "epoch": 1.8561170212765958, + "grad_norm": 3.976768970489502, + "learning_rate": 7.434468575588976e-06, + "loss": 0.781, + "step": 6979 + }, + { + "epoch": 1.8563829787234043, + "grad_norm": 4.405401229858398, + "learning_rate": 7.43370032422223e-06, + "loss": 0.7388, + "step": 6980 + }, + { + "epoch": 1.8566489361702128, + "grad_norm": 4.096654891967773, + "learning_rate": 7.432931997553929e-06, + "loss": 0.8305, + "step": 6981 + }, + { + "epoch": 1.8569148936170212, + "grad_norm": 3.9386327266693115, + "learning_rate": 7.432163595607851e-06, + "loss": 0.775, + "step": 6982 + }, + { + "epoch": 1.8571808510638297, + "grad_norm": 4.111544609069824, + "learning_rate": 7.431395118407766e-06, + "loss": 0.9179, + "step": 6983 + }, + { + "epoch": 1.8574468085106384, + "grad_norm": 3.3650224208831787, + "learning_rate": 7.4306265659774525e-06, + "loss": 0.8286, + "step": 6984 + }, + { + "epoch": 1.8577127659574468, + "grad_norm": 4.099471569061279, + "learning_rate": 7.429857938340693e-06, + "loss": 0.8789, + "step": 6985 + }, + { + "epoch": 1.8579787234042553, + "grad_norm": 4.082056999206543, + "learning_rate": 7.429089235521267e-06, + "loss": 0.8938, + "step": 6986 + }, + { + "epoch": 1.858244680851064, + "grad_norm": 4.1304545402526855, + "learning_rate": 7.428320457542962e-06, + "loss": 0.8639, + "step": 6987 + }, + { + "epoch": 1.8585106382978722, + "grad_norm": 3.941922426223755, + "learning_rate": 7.427551604429562e-06, + "loss": 0.7966, + "step": 6988 + }, + { + "epoch": 1.858776595744681, + "grad_norm": 3.8861730098724365, + "learning_rate": 7.426782676204857e-06, + "loss": 0.8282, + "step": 6989 + }, + { + "epoch": 1.8590425531914894, + "grad_norm": 3.8917558193206787, + "learning_rate": 7.426013672892639e-06, + "loss": 0.7213, + "step": 6990 + }, + { + "epoch": 1.8593085106382978, + "grad_norm": 4.324743747711182, + "learning_rate": 7.4252445945167005e-06, + "loss": 0.9627, + "step": 6991 + }, + { + "epoch": 1.8595744680851065, + "grad_norm": 3.6545021533966064, + "learning_rate": 7.42447544110084e-06, + "loss": 0.742, + "step": 6992 + }, + { + "epoch": 1.8598404255319148, + "grad_norm": 4.201162338256836, + "learning_rate": 7.423706212668855e-06, + "loss": 0.8343, + "step": 6993 + }, + { + "epoch": 1.8601063829787234, + "grad_norm": 3.67588472366333, + "learning_rate": 7.4229369092445465e-06, + "loss": 0.7863, + "step": 6994 + }, + { + "epoch": 1.860372340425532, + "grad_norm": 3.3527588844299316, + "learning_rate": 7.422167530851716e-06, + "loss": 0.7513, + "step": 6995 + }, + { + "epoch": 1.8606382978723404, + "grad_norm": 3.977691888809204, + "learning_rate": 7.421398077514172e-06, + "loss": 0.7507, + "step": 6996 + }, + { + "epoch": 1.860904255319149, + "grad_norm": 4.172175407409668, + "learning_rate": 7.420628549255719e-06, + "loss": 0.8395, + "step": 6997 + }, + { + "epoch": 1.8611702127659573, + "grad_norm": 3.738621473312378, + "learning_rate": 7.41985894610017e-06, + "loss": 0.8366, + "step": 6998 + }, + { + "epoch": 1.861436170212766, + "grad_norm": 4.003189563751221, + "learning_rate": 7.4190892680713366e-06, + "loss": 0.9032, + "step": 6999 + }, + { + "epoch": 1.8617021276595744, + "grad_norm": 3.872437000274658, + "learning_rate": 7.418319515193032e-06, + "loss": 0.8052, + "step": 7000 + }, + { + "epoch": 
1.8617021276595744, + "eval_loss": 1.269985556602478, + "eval_runtime": 14.1914, + "eval_samples_per_second": 28.186, + "eval_steps_per_second": 3.523, + "step": 7000 + }, + { + "epoch": 1.861968085106383, + "grad_norm": 4.005687713623047, + "learning_rate": 7.417549687489074e-06, + "loss": 0.7515, + "step": 7001 + }, + { + "epoch": 1.8622340425531916, + "grad_norm": 3.833047866821289, + "learning_rate": 7.416779784983284e-06, + "loss": 0.8487, + "step": 7002 + }, + { + "epoch": 1.8625, + "grad_norm": 3.902536392211914, + "learning_rate": 7.416009807699481e-06, + "loss": 0.7448, + "step": 7003 + }, + { + "epoch": 1.8627659574468085, + "grad_norm": 4.018909931182861, + "learning_rate": 7.41523975566149e-06, + "loss": 0.8619, + "step": 7004 + }, + { + "epoch": 1.863031914893617, + "grad_norm": 3.7916078567504883, + "learning_rate": 7.414469628893137e-06, + "loss": 0.7254, + "step": 7005 + }, + { + "epoch": 1.8632978723404254, + "grad_norm": 3.662709951400757, + "learning_rate": 7.413699427418253e-06, + "loss": 0.8801, + "step": 7006 + }, + { + "epoch": 1.8635638297872341, + "grad_norm": 3.8417561054229736, + "learning_rate": 7.412929151260665e-06, + "loss": 0.9611, + "step": 7007 + }, + { + "epoch": 1.8638297872340426, + "grad_norm": 3.8474161624908447, + "learning_rate": 7.412158800444208e-06, + "loss": 0.7215, + "step": 7008 + }, + { + "epoch": 1.864095744680851, + "grad_norm": 3.4360055923461914, + "learning_rate": 7.411388374992719e-06, + "loss": 0.7885, + "step": 7009 + }, + { + "epoch": 1.8643617021276597, + "grad_norm": 3.902475357055664, + "learning_rate": 7.410617874930034e-06, + "loss": 0.8199, + "step": 7010 + }, + { + "epoch": 1.864627659574468, + "grad_norm": 4.08276891708374, + "learning_rate": 7.409847300279993e-06, + "loss": 0.793, + "step": 7011 + }, + { + "epoch": 1.8648936170212767, + "grad_norm": 4.242387294769287, + "learning_rate": 7.4090766510664405e-06, + "loss": 0.9345, + "step": 7012 + }, + { + "epoch": 1.8651595744680851, + "grad_norm": 3.8312370777130127, + "learning_rate": 7.40830592731322e-06, + "loss": 0.8151, + "step": 7013 + }, + { + "epoch": 1.8654255319148936, + "grad_norm": 4.087930679321289, + "learning_rate": 7.407535129044179e-06, + "loss": 0.936, + "step": 7014 + }, + { + "epoch": 1.8656914893617023, + "grad_norm": 4.200309753417969, + "learning_rate": 7.4067642562831656e-06, + "loss": 0.8345, + "step": 7015 + }, + { + "epoch": 1.8659574468085105, + "grad_norm": 3.7283883094787598, + "learning_rate": 7.4059933090540315e-06, + "loss": 0.7398, + "step": 7016 + }, + { + "epoch": 1.8662234042553192, + "grad_norm": 4.288913249969482, + "learning_rate": 7.4052222873806345e-06, + "loss": 0.9314, + "step": 7017 + }, + { + "epoch": 1.8664893617021276, + "grad_norm": 4.077908515930176, + "learning_rate": 7.404451191286825e-06, + "loss": 0.8331, + "step": 7018 + }, + { + "epoch": 1.866755319148936, + "grad_norm": 4.040445804595947, + "learning_rate": 7.403680020796468e-06, + "loss": 0.8054, + "step": 7019 + }, + { + "epoch": 1.8670212765957448, + "grad_norm": 4.416097164154053, + "learning_rate": 7.402908775933419e-06, + "loss": 0.7164, + "step": 7020 + }, + { + "epoch": 1.867287234042553, + "grad_norm": 3.8552403450012207, + "learning_rate": 7.402137456721544e-06, + "loss": 0.8274, + "step": 7021 + }, + { + "epoch": 1.8675531914893617, + "grad_norm": 4.477870941162109, + "learning_rate": 7.401366063184709e-06, + "loss": 0.9087, + "step": 7022 + }, + { + "epoch": 1.8678191489361702, + "grad_norm": 4.315149784088135, + "learning_rate": 7.4005945953467794e-06, + 
"loss": 0.8275, + "step": 7023 + }, + { + "epoch": 1.8680851063829786, + "grad_norm": 4.013988971710205, + "learning_rate": 7.3998230532316275e-06, + "loss": 0.7935, + "step": 7024 + }, + { + "epoch": 1.8683510638297873, + "grad_norm": 4.538480281829834, + "learning_rate": 7.399051436863125e-06, + "loss": 0.7913, + "step": 7025 + }, + { + "epoch": 1.8686170212765958, + "grad_norm": 3.814431667327881, + "learning_rate": 7.398279746265144e-06, + "loss": 0.8819, + "step": 7026 + }, + { + "epoch": 1.8688829787234043, + "grad_norm": 4.128929615020752, + "learning_rate": 7.397507981461567e-06, + "loss": 0.7733, + "step": 7027 + }, + { + "epoch": 1.8691489361702127, + "grad_norm": 4.266568660736084, + "learning_rate": 7.3967361424762696e-06, + "loss": 0.8756, + "step": 7028 + }, + { + "epoch": 1.8694148936170212, + "grad_norm": 3.817857265472412, + "learning_rate": 7.3959642293331336e-06, + "loss": 0.8247, + "step": 7029 + }, + { + "epoch": 1.8696808510638299, + "grad_norm": 4.07396125793457, + "learning_rate": 7.395192242056044e-06, + "loss": 0.7925, + "step": 7030 + }, + { + "epoch": 1.8699468085106383, + "grad_norm": 3.3347582817077637, + "learning_rate": 7.3944201806688865e-06, + "loss": 0.647, + "step": 7031 + }, + { + "epoch": 1.8702127659574468, + "grad_norm": 3.7496252059936523, + "learning_rate": 7.393648045195548e-06, + "loss": 0.884, + "step": 7032 + }, + { + "epoch": 1.8704787234042555, + "grad_norm": 3.871969223022461, + "learning_rate": 7.392875835659923e-06, + "loss": 0.7962, + "step": 7033 + }, + { + "epoch": 1.8707446808510637, + "grad_norm": 4.357855796813965, + "learning_rate": 7.392103552085901e-06, + "loss": 0.8063, + "step": 7034 + }, + { + "epoch": 1.8710106382978724, + "grad_norm": 3.7552926540374756, + "learning_rate": 7.391331194497379e-06, + "loss": 0.7611, + "step": 7035 + }, + { + "epoch": 1.8712765957446809, + "grad_norm": 4.20325231552124, + "learning_rate": 7.390558762918254e-06, + "loss": 0.8825, + "step": 7036 + }, + { + "epoch": 1.8715425531914893, + "grad_norm": 3.433969020843506, + "learning_rate": 7.389786257372428e-06, + "loss": 0.6822, + "step": 7037 + }, + { + "epoch": 1.871808510638298, + "grad_norm": 3.9316911697387695, + "learning_rate": 7.3890136778837995e-06, + "loss": 0.8302, + "step": 7038 + }, + { + "epoch": 1.8720744680851062, + "grad_norm": 3.7068655490875244, + "learning_rate": 7.388241024476276e-06, + "loss": 0.8207, + "step": 7039 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 3.7558844089508057, + "learning_rate": 7.387468297173764e-06, + "loss": 0.8916, + "step": 7040 + }, + { + "epoch": 1.8726063829787234, + "grad_norm": 3.663325786590576, + "learning_rate": 7.386695496000172e-06, + "loss": 0.8461, + "step": 7041 + }, + { + "epoch": 1.8728723404255319, + "grad_norm": 3.7792584896087646, + "learning_rate": 7.38592262097941e-06, + "loss": 0.775, + "step": 7042 + }, + { + "epoch": 1.8731382978723405, + "grad_norm": 3.6168766021728516, + "learning_rate": 7.385149672135394e-06, + "loss": 0.7552, + "step": 7043 + }, + { + "epoch": 1.8734042553191488, + "grad_norm": 3.5428271293640137, + "learning_rate": 7.384376649492039e-06, + "loss": 0.8633, + "step": 7044 + }, + { + "epoch": 1.8736702127659575, + "grad_norm": 4.00286340713501, + "learning_rate": 7.383603553073262e-06, + "loss": 0.7895, + "step": 7045 + }, + { + "epoch": 1.873936170212766, + "grad_norm": 4.0529890060424805, + "learning_rate": 7.382830382902986e-06, + "loss": 0.7161, + "step": 7046 + }, + { + "epoch": 1.8742021276595744, + "grad_norm": 4.5928425788879395, + 
"learning_rate": 7.382057139005132e-06, + "loss": 0.8454, + "step": 7047 + }, + { + "epoch": 1.874468085106383, + "grad_norm": 3.7979865074157715, + "learning_rate": 7.381283821403626e-06, + "loss": 0.8475, + "step": 7048 + }, + { + "epoch": 1.8747340425531915, + "grad_norm": 3.9232993125915527, + "learning_rate": 7.380510430122396e-06, + "loss": 0.8079, + "step": 7049 + }, + { + "epoch": 1.875, + "grad_norm": 4.084567546844482, + "learning_rate": 7.379736965185369e-06, + "loss": 0.8926, + "step": 7050 + }, + { + "epoch": 1.8752659574468085, + "grad_norm": 3.967013359069824, + "learning_rate": 7.378963426616479e-06, + "loss": 0.8136, + "step": 7051 + }, + { + "epoch": 1.875531914893617, + "grad_norm": 4.18993616104126, + "learning_rate": 7.378189814439659e-06, + "loss": 0.663, + "step": 7052 + }, + { + "epoch": 1.8757978723404256, + "grad_norm": 3.4214327335357666, + "learning_rate": 7.377416128678847e-06, + "loss": 0.7142, + "step": 7053 + }, + { + "epoch": 1.876063829787234, + "grad_norm": 4.111138343811035, + "learning_rate": 7.37664236935798e-06, + "loss": 0.8517, + "step": 7054 + }, + { + "epoch": 1.8763297872340425, + "grad_norm": 4.020641326904297, + "learning_rate": 7.375868536501001e-06, + "loss": 0.7649, + "step": 7055 + }, + { + "epoch": 1.8765957446808512, + "grad_norm": 3.6159451007843018, + "learning_rate": 7.375094630131852e-06, + "loss": 0.7219, + "step": 7056 + }, + { + "epoch": 1.8768617021276595, + "grad_norm": 4.138524532318115, + "learning_rate": 7.374320650274479e-06, + "loss": 0.7374, + "step": 7057 + }, + { + "epoch": 1.8771276595744681, + "grad_norm": 4.114788055419922, + "learning_rate": 7.373546596952829e-06, + "loss": 0.9118, + "step": 7058 + }, + { + "epoch": 1.8773936170212766, + "grad_norm": 3.8229057788848877, + "learning_rate": 7.372772470190852e-06, + "loss": 0.7109, + "step": 7059 + }, + { + "epoch": 1.877659574468085, + "grad_norm": 3.9543075561523438, + "learning_rate": 7.371998270012504e-06, + "loss": 0.7616, + "step": 7060 + }, + { + "epoch": 1.8779255319148938, + "grad_norm": 3.862529754638672, + "learning_rate": 7.3712239964417345e-06, + "loss": 0.8719, + "step": 7061 + }, + { + "epoch": 1.878191489361702, + "grad_norm": 3.855138063430786, + "learning_rate": 7.370449649502504e-06, + "loss": 0.7093, + "step": 7062 + }, + { + "epoch": 1.8784574468085107, + "grad_norm": 4.169119358062744, + "learning_rate": 7.36967522921877e-06, + "loss": 0.8817, + "step": 7063 + }, + { + "epoch": 1.8787234042553191, + "grad_norm": 3.8987720012664795, + "learning_rate": 7.368900735614494e-06, + "loss": 0.7522, + "step": 7064 + }, + { + "epoch": 1.8789893617021276, + "grad_norm": 3.938058853149414, + "learning_rate": 7.36812616871364e-06, + "loss": 0.7694, + "step": 7065 + }, + { + "epoch": 1.8792553191489363, + "grad_norm": 3.7450876235961914, + "learning_rate": 7.367351528540176e-06, + "loss": 0.7283, + "step": 7066 + }, + { + "epoch": 1.8795212765957445, + "grad_norm": 3.9045193195343018, + "learning_rate": 7.366576815118067e-06, + "loss": 0.735, + "step": 7067 + }, + { + "epoch": 1.8797872340425532, + "grad_norm": 3.4928138256073, + "learning_rate": 7.365802028471285e-06, + "loss": 0.7537, + "step": 7068 + }, + { + "epoch": 1.8800531914893617, + "grad_norm": 3.8254666328430176, + "learning_rate": 7.365027168623804e-06, + "loss": 0.8252, + "step": 7069 + }, + { + "epoch": 1.8803191489361701, + "grad_norm": 4.039599418640137, + "learning_rate": 7.364252235599596e-06, + "loss": 0.78, + "step": 7070 + }, + { + "epoch": 1.8805851063829788, + "grad_norm": 
4.29962158203125, + "learning_rate": 7.363477229422642e-06, + "loss": 0.8651, + "step": 7071 + }, + { + "epoch": 1.8808510638297873, + "grad_norm": 3.891298294067383, + "learning_rate": 7.3627021501169196e-06, + "loss": 0.7887, + "step": 7072 + }, + { + "epoch": 1.8811170212765957, + "grad_norm": 3.8227875232696533, + "learning_rate": 7.36192699770641e-06, + "loss": 0.8563, + "step": 7073 + }, + { + "epoch": 1.8813829787234042, + "grad_norm": 3.881826639175415, + "learning_rate": 7.3611517722151e-06, + "loss": 0.7518, + "step": 7074 + }, + { + "epoch": 1.8816489361702127, + "grad_norm": 3.529783248901367, + "learning_rate": 7.360376473666973e-06, + "loss": 0.7086, + "step": 7075 + }, + { + "epoch": 1.8819148936170214, + "grad_norm": 3.710423231124878, + "learning_rate": 7.359601102086018e-06, + "loss": 0.8141, + "step": 7076 + }, + { + "epoch": 1.8821808510638298, + "grad_norm": 4.26459264755249, + "learning_rate": 7.358825657496228e-06, + "loss": 0.8523, + "step": 7077 + }, + { + "epoch": 1.8824468085106383, + "grad_norm": 3.9186158180236816, + "learning_rate": 7.358050139921595e-06, + "loss": 0.806, + "step": 7078 + }, + { + "epoch": 1.882712765957447, + "grad_norm": 3.5147833824157715, + "learning_rate": 7.3572745493861155e-06, + "loss": 0.742, + "step": 7079 + }, + { + "epoch": 1.8829787234042552, + "grad_norm": 3.834606885910034, + "learning_rate": 7.356498885913784e-06, + "loss": 0.9308, + "step": 7080 + }, + { + "epoch": 1.883244680851064, + "grad_norm": 3.989713191986084, + "learning_rate": 7.355723149528604e-06, + "loss": 0.8085, + "step": 7081 + }, + { + "epoch": 1.8835106382978724, + "grad_norm": 4.148540019989014, + "learning_rate": 7.354947340254576e-06, + "loss": 0.7697, + "step": 7082 + }, + { + "epoch": 1.8837765957446808, + "grad_norm": 3.6128063201904297, + "learning_rate": 7.354171458115704e-06, + "loss": 0.7755, + "step": 7083 + }, + { + "epoch": 1.8840425531914895, + "grad_norm": 4.31196928024292, + "learning_rate": 7.353395503135996e-06, + "loss": 0.7754, + "step": 7084 + }, + { + "epoch": 1.8843085106382977, + "grad_norm": 3.750534772872925, + "learning_rate": 7.35261947533946e-06, + "loss": 0.8237, + "step": 7085 + }, + { + "epoch": 1.8845744680851064, + "grad_norm": 3.8344967365264893, + "learning_rate": 7.351843374750108e-06, + "loss": 0.832, + "step": 7086 + }, + { + "epoch": 1.8848404255319149, + "grad_norm": 3.5898144245147705, + "learning_rate": 7.351067201391952e-06, + "loss": 0.737, + "step": 7087 + }, + { + "epoch": 1.8851063829787233, + "grad_norm": 3.8664729595184326, + "learning_rate": 7.35029095528901e-06, + "loss": 0.8636, + "step": 7088 + }, + { + "epoch": 1.885372340425532, + "grad_norm": 4.382975101470947, + "learning_rate": 7.349514636465298e-06, + "loss": 0.8923, + "step": 7089 + }, + { + "epoch": 1.8856382978723403, + "grad_norm": 4.070766448974609, + "learning_rate": 7.348738244944837e-06, + "loss": 0.8651, + "step": 7090 + }, + { + "epoch": 1.885904255319149, + "grad_norm": 4.187519073486328, + "learning_rate": 7.347961780751649e-06, + "loss": 0.8492, + "step": 7091 + }, + { + "epoch": 1.8861702127659574, + "grad_norm": 3.7398457527160645, + "learning_rate": 7.347185243909761e-06, + "loss": 0.7936, + "step": 7092 + }, + { + "epoch": 1.8864361702127659, + "grad_norm": 3.758314609527588, + "learning_rate": 7.346408634443196e-06, + "loss": 0.9086, + "step": 7093 + }, + { + "epoch": 1.8867021276595746, + "grad_norm": 3.800701856613159, + "learning_rate": 7.345631952375986e-06, + "loss": 0.8418, + "step": 7094 + }, + { + "epoch": 1.886968085106383, 
+ "grad_norm": 4.155978202819824, + "learning_rate": 7.3448551977321615e-06, + "loss": 0.9388, + "step": 7095 + }, + { + "epoch": 1.8872340425531915, + "grad_norm": 3.9163780212402344, + "learning_rate": 7.344078370535757e-06, + "loss": 0.7108, + "step": 7096 + }, + { + "epoch": 1.8875, + "grad_norm": 3.312629222869873, + "learning_rate": 7.343301470810809e-06, + "loss": 0.6591, + "step": 7097 + }, + { + "epoch": 1.8877659574468084, + "grad_norm": 4.259210586547852, + "learning_rate": 7.342524498581352e-06, + "loss": 0.9209, + "step": 7098 + }, + { + "epoch": 1.888031914893617, + "grad_norm": 4.158624649047852, + "learning_rate": 7.34174745387143e-06, + "loss": 0.8084, + "step": 7099 + }, + { + "epoch": 1.8882978723404256, + "grad_norm": 4.25371789932251, + "learning_rate": 7.340970336705084e-06, + "loss": 0.8624, + "step": 7100 + }, + { + "epoch": 1.888563829787234, + "grad_norm": 3.780513286590576, + "learning_rate": 7.340193147106362e-06, + "loss": 0.7879, + "step": 7101 + }, + { + "epoch": 1.8888297872340427, + "grad_norm": 4.191688537597656, + "learning_rate": 7.339415885099307e-06, + "loss": 0.7785, + "step": 7102 + }, + { + "epoch": 1.889095744680851, + "grad_norm": 4.398171901702881, + "learning_rate": 7.33863855070797e-06, + "loss": 0.8883, + "step": 7103 + }, + { + "epoch": 1.8893617021276596, + "grad_norm": 3.6488990783691406, + "learning_rate": 7.337861143956404e-06, + "loss": 0.8097, + "step": 7104 + }, + { + "epoch": 1.889627659574468, + "grad_norm": 4.0780487060546875, + "learning_rate": 7.3370836648686616e-06, + "loss": 0.7897, + "step": 7105 + }, + { + "epoch": 1.8898936170212766, + "grad_norm": 4.089003562927246, + "learning_rate": 7.336306113468799e-06, + "loss": 0.9653, + "step": 7106 + }, + { + "epoch": 1.8901595744680852, + "grad_norm": 4.446435928344727, + "learning_rate": 7.335528489780874e-06, + "loss": 0.8947, + "step": 7107 + }, + { + "epoch": 1.8904255319148935, + "grad_norm": 3.880557060241699, + "learning_rate": 7.334750793828947e-06, + "loss": 0.9184, + "step": 7108 + }, + { + "epoch": 1.8906914893617022, + "grad_norm": 4.0276899337768555, + "learning_rate": 7.3339730256370834e-06, + "loss": 0.7444, + "step": 7109 + }, + { + "epoch": 1.8909574468085106, + "grad_norm": 4.381673336029053, + "learning_rate": 7.333195185229346e-06, + "loss": 0.7789, + "step": 7110 + }, + { + "epoch": 1.891223404255319, + "grad_norm": 4.908472537994385, + "learning_rate": 7.3324172726298015e-06, + "loss": 0.8258, + "step": 7111 + }, + { + "epoch": 1.8914893617021278, + "grad_norm": 4.257655143737793, + "learning_rate": 7.331639287862522e-06, + "loss": 0.8343, + "step": 7112 + }, + { + "epoch": 1.891755319148936, + "grad_norm": 3.902233600616455, + "learning_rate": 7.330861230951577e-06, + "loss": 0.7672, + "step": 7113 + }, + { + "epoch": 1.8920212765957447, + "grad_norm": 4.111093044281006, + "learning_rate": 7.3300831019210415e-06, + "loss": 0.9128, + "step": 7114 + }, + { + "epoch": 1.8922872340425532, + "grad_norm": 4.477164268493652, + "learning_rate": 7.329304900794991e-06, + "loss": 0.9389, + "step": 7115 + }, + { + "epoch": 1.8925531914893616, + "grad_norm": 4.585188388824463, + "learning_rate": 7.328526627597505e-06, + "loss": 0.8127, + "step": 7116 + }, + { + "epoch": 1.8928191489361703, + "grad_norm": 3.906665086746216, + "learning_rate": 7.327748282352664e-06, + "loss": 0.7996, + "step": 7117 + }, + { + "epoch": 1.8930851063829788, + "grad_norm": 4.213885307312012, + "learning_rate": 7.32696986508455e-06, + "loss": 0.8334, + "step": 7118 + }, + { + "epoch": 
1.8933510638297872, + "grad_norm": 4.066798686981201, + "learning_rate": 7.326191375817249e-06, + "loss": 0.8217, + "step": 7119 + }, + { + "epoch": 1.8936170212765957, + "grad_norm": 3.510889768600464, + "learning_rate": 7.325412814574847e-06, + "loss": 0.7864, + "step": 7120 + }, + { + "epoch": 1.8938829787234042, + "grad_norm": 3.888808250427246, + "learning_rate": 7.324634181381436e-06, + "loss": 0.7519, + "step": 7121 + }, + { + "epoch": 1.8941489361702128, + "grad_norm": 3.9174201488494873, + "learning_rate": 7.323855476261106e-06, + "loss": 0.6913, + "step": 7122 + }, + { + "epoch": 1.8944148936170213, + "grad_norm": 4.041181564331055, + "learning_rate": 7.323076699237951e-06, + "loss": 0.6076, + "step": 7123 + }, + { + "epoch": 1.8946808510638298, + "grad_norm": 3.841498851776123, + "learning_rate": 7.322297850336069e-06, + "loss": 0.8645, + "step": 7124 + }, + { + "epoch": 1.8949468085106385, + "grad_norm": 3.5201406478881836, + "learning_rate": 7.3215189295795565e-06, + "loss": 0.7253, + "step": 7125 + }, + { + "epoch": 1.8952127659574467, + "grad_norm": 3.9525210857391357, + "learning_rate": 7.320739936992514e-06, + "loss": 0.8073, + "step": 7126 + }, + { + "epoch": 1.8954787234042554, + "grad_norm": 3.8624043464660645, + "learning_rate": 7.319960872599048e-06, + "loss": 0.8157, + "step": 7127 + }, + { + "epoch": 1.8957446808510638, + "grad_norm": 4.123876571655273, + "learning_rate": 7.31918173642326e-06, + "loss": 0.8038, + "step": 7128 + }, + { + "epoch": 1.8960106382978723, + "grad_norm": 3.812316417694092, + "learning_rate": 7.318402528489258e-06, + "loss": 0.7421, + "step": 7129 + }, + { + "epoch": 1.896276595744681, + "grad_norm": 4.009311199188232, + "learning_rate": 7.317623248821153e-06, + "loss": 0.835, + "step": 7130 + }, + { + "epoch": 1.8965425531914892, + "grad_norm": 4.297110557556152, + "learning_rate": 7.316843897443055e-06, + "loss": 0.7093, + "step": 7131 + }, + { + "epoch": 1.896808510638298, + "grad_norm": 4.034492015838623, + "learning_rate": 7.316064474379081e-06, + "loss": 0.7682, + "step": 7132 + }, + { + "epoch": 1.8970744680851064, + "grad_norm": 4.544641494750977, + "learning_rate": 7.315284979653344e-06, + "loss": 0.8832, + "step": 7133 + }, + { + "epoch": 1.8973404255319148, + "grad_norm": 4.383004188537598, + "learning_rate": 7.314505413289964e-06, + "loss": 0.892, + "step": 7134 + }, + { + "epoch": 1.8976063829787235, + "grad_norm": 3.52055025100708, + "learning_rate": 7.313725775313061e-06, + "loss": 0.7965, + "step": 7135 + }, + { + "epoch": 1.8978723404255318, + "grad_norm": 3.933687925338745, + "learning_rate": 7.31294606574676e-06, + "loss": 0.7829, + "step": 7136 + }, + { + "epoch": 1.8981382978723405, + "grad_norm": 4.500588417053223, + "learning_rate": 7.312166284615183e-06, + "loss": 0.8802, + "step": 7137 + }, + { + "epoch": 1.898404255319149, + "grad_norm": 3.9210360050201416, + "learning_rate": 7.31138643194246e-06, + "loss": 0.7418, + "step": 7138 + }, + { + "epoch": 1.8986702127659574, + "grad_norm": 4.024209022521973, + "learning_rate": 7.3106065077527175e-06, + "loss": 0.8769, + "step": 7139 + }, + { + "epoch": 1.898936170212766, + "grad_norm": 4.242138862609863, + "learning_rate": 7.3098265120700915e-06, + "loss": 0.8789, + "step": 7140 + }, + { + "epoch": 1.8992021276595743, + "grad_norm": 3.6798341274261475, + "learning_rate": 7.309046444918712e-06, + "loss": 0.7971, + "step": 7141 + }, + { + "epoch": 1.899468085106383, + "grad_norm": 4.092346668243408, + "learning_rate": 7.308266306322719e-06, + "loss": 0.7864, + "step": 7142 
+ }, + { + "epoch": 1.8997340425531914, + "grad_norm": 4.132681846618652, + "learning_rate": 7.307486096306247e-06, + "loss": 0.8868, + "step": 7143 + }, + { + "epoch": 1.9, + "grad_norm": 3.893075942993164, + "learning_rate": 7.30670581489344e-06, + "loss": 0.9096, + "step": 7144 + }, + { + "epoch": 1.9002659574468086, + "grad_norm": 3.807593822479248, + "learning_rate": 7.305925462108439e-06, + "loss": 0.7444, + "step": 7145 + }, + { + "epoch": 1.900531914893617, + "grad_norm": 3.6460392475128174, + "learning_rate": 7.305145037975388e-06, + "loss": 0.74, + "step": 7146 + }, + { + "epoch": 1.9007978723404255, + "grad_norm": 3.5041310787200928, + "learning_rate": 7.304364542518435e-06, + "loss": 0.8561, + "step": 7147 + }, + { + "epoch": 1.9010638297872342, + "grad_norm": 4.359119892120361, + "learning_rate": 7.303583975761732e-06, + "loss": 0.735, + "step": 7148 + }, + { + "epoch": 1.9013297872340424, + "grad_norm": 4.176085948944092, + "learning_rate": 7.302803337729429e-06, + "loss": 0.8723, + "step": 7149 + }, + { + "epoch": 1.9015957446808511, + "grad_norm": 3.764272689819336, + "learning_rate": 7.302022628445678e-06, + "loss": 0.8359, + "step": 7150 + }, + { + "epoch": 1.9018617021276596, + "grad_norm": 3.8661603927612305, + "learning_rate": 7.301241847934637e-06, + "loss": 0.9286, + "step": 7151 + }, + { + "epoch": 1.902127659574468, + "grad_norm": 3.493070363998413, + "learning_rate": 7.300460996220464e-06, + "loss": 0.7439, + "step": 7152 + }, + { + "epoch": 1.9023936170212767, + "grad_norm": 3.425701379776001, + "learning_rate": 7.2996800733273196e-06, + "loss": 0.8468, + "step": 7153 + }, + { + "epoch": 1.902659574468085, + "grad_norm": 3.9553513526916504, + "learning_rate": 7.298899079279365e-06, + "loss": 0.8075, + "step": 7154 + }, + { + "epoch": 1.9029255319148937, + "grad_norm": 3.900907516479492, + "learning_rate": 7.298118014100766e-06, + "loss": 0.8969, + "step": 7155 + }, + { + "epoch": 1.9031914893617021, + "grad_norm": 3.8822121620178223, + "learning_rate": 7.297336877815693e-06, + "loss": 0.8685, + "step": 7156 + }, + { + "epoch": 1.9034574468085106, + "grad_norm": 3.847317695617676, + "learning_rate": 7.29655567044831e-06, + "loss": 0.7251, + "step": 7157 + }, + { + "epoch": 1.9037234042553193, + "grad_norm": 3.5498738288879395, + "learning_rate": 7.295774392022791e-06, + "loss": 0.7035, + "step": 7158 + }, + { + "epoch": 1.9039893617021275, + "grad_norm": 3.658343553543091, + "learning_rate": 7.2949930425633095e-06, + "loss": 0.7414, + "step": 7159 + }, + { + "epoch": 1.9042553191489362, + "grad_norm": 3.804388999938965, + "learning_rate": 7.2942116220940406e-06, + "loss": 0.8057, + "step": 7160 + }, + { + "epoch": 1.9045212765957447, + "grad_norm": 3.876521348953247, + "learning_rate": 7.293430130639163e-06, + "loss": 0.886, + "step": 7161 + }, + { + "epoch": 1.9047872340425531, + "grad_norm": 3.969161033630371, + "learning_rate": 7.292648568222859e-06, + "loss": 0.9049, + "step": 7162 + }, + { + "epoch": 1.9050531914893618, + "grad_norm": 4.049928188323975, + "learning_rate": 7.2918669348693075e-06, + "loss": 0.8954, + "step": 7163 + }, + { + "epoch": 1.90531914893617, + "grad_norm": 3.997854232788086, + "learning_rate": 7.291085230602694e-06, + "loss": 0.9063, + "step": 7164 + }, + { + "epoch": 1.9055851063829787, + "grad_norm": 4.090554237365723, + "learning_rate": 7.290303455447208e-06, + "loss": 0.8132, + "step": 7165 + }, + { + "epoch": 1.9058510638297872, + "grad_norm": 3.8804330825805664, + "learning_rate": 7.289521609427035e-06, + "loss": 0.8245, + 
"step": 7166 + }, + { + "epoch": 1.9061170212765957, + "grad_norm": 3.7036948204040527, + "learning_rate": 7.288739692566367e-06, + "loss": 0.891, + "step": 7167 + }, + { + "epoch": 1.9063829787234043, + "grad_norm": 3.8350512981414795, + "learning_rate": 7.2879577048894e-06, + "loss": 0.7912, + "step": 7168 + }, + { + "epoch": 1.9066489361702128, + "grad_norm": 3.3897817134857178, + "learning_rate": 7.287175646420327e-06, + "loss": 0.8327, + "step": 7169 + }, + { + "epoch": 1.9069148936170213, + "grad_norm": 4.037939548492432, + "learning_rate": 7.2863935171833465e-06, + "loss": 0.8793, + "step": 7170 + }, + { + "epoch": 1.90718085106383, + "grad_norm": 3.7813265323638916, + "learning_rate": 7.285611317202661e-06, + "loss": 0.8551, + "step": 7171 + }, + { + "epoch": 1.9074468085106382, + "grad_norm": 3.916761636734009, + "learning_rate": 7.284829046502467e-06, + "loss": 0.7564, + "step": 7172 + }, + { + "epoch": 1.9077127659574469, + "grad_norm": 3.843834400177002, + "learning_rate": 7.284046705106974e-06, + "loss": 0.8456, + "step": 7173 + }, + { + "epoch": 1.9079787234042553, + "grad_norm": 3.752497434616089, + "learning_rate": 7.2832642930403876e-06, + "loss": 0.8221, + "step": 7174 + }, + { + "epoch": 1.9082446808510638, + "grad_norm": 4.00820779800415, + "learning_rate": 7.282481810326915e-06, + "loss": 0.9672, + "step": 7175 + }, + { + "epoch": 1.9085106382978725, + "grad_norm": 4.226334571838379, + "learning_rate": 7.281699256990766e-06, + "loss": 0.8973, + "step": 7176 + }, + { + "epoch": 1.9087765957446807, + "grad_norm": 3.871880531311035, + "learning_rate": 7.280916633056159e-06, + "loss": 0.8204, + "step": 7177 + }, + { + "epoch": 1.9090425531914894, + "grad_norm": 4.339875221252441, + "learning_rate": 7.280133938547304e-06, + "loss": 0.8958, + "step": 7178 + }, + { + "epoch": 1.9093085106382979, + "grad_norm": 3.7419753074645996, + "learning_rate": 7.27935117348842e-06, + "loss": 0.789, + "step": 7179 + }, + { + "epoch": 1.9095744680851063, + "grad_norm": 4.0317888259887695, + "learning_rate": 7.278568337903729e-06, + "loss": 0.7995, + "step": 7180 + }, + { + "epoch": 1.909840425531915, + "grad_norm": 3.9452288150787354, + "learning_rate": 7.277785431817449e-06, + "loss": 0.8576, + "step": 7181 + }, + { + "epoch": 1.9101063829787233, + "grad_norm": 3.957437753677368, + "learning_rate": 7.277002455253807e-06, + "loss": 0.8532, + "step": 7182 + }, + { + "epoch": 1.910372340425532, + "grad_norm": 3.9327943325042725, + "learning_rate": 7.276219408237029e-06, + "loss": 0.8155, + "step": 7183 + }, + { + "epoch": 1.9106382978723404, + "grad_norm": 4.20408296585083, + "learning_rate": 7.27543629079134e-06, + "loss": 0.8285, + "step": 7184 + }, + { + "epoch": 1.9109042553191489, + "grad_norm": 4.2042341232299805, + "learning_rate": 7.274653102940974e-06, + "loss": 0.8624, + "step": 7185 + }, + { + "epoch": 1.9111702127659576, + "grad_norm": 4.000115871429443, + "learning_rate": 7.2738698447101645e-06, + "loss": 0.8343, + "step": 7186 + }, + { + "epoch": 1.9114361702127658, + "grad_norm": 4.323785305023193, + "learning_rate": 7.273086516123145e-06, + "loss": 0.7525, + "step": 7187 + }, + { + "epoch": 1.9117021276595745, + "grad_norm": 3.9202396869659424, + "learning_rate": 7.27230311720415e-06, + "loss": 0.9014, + "step": 7188 + }, + { + "epoch": 1.911968085106383, + "grad_norm": 3.924821615219116, + "learning_rate": 7.271519647977422e-06, + "loss": 0.8206, + "step": 7189 + }, + { + "epoch": 1.9122340425531914, + "grad_norm": 3.9752979278564453, + "learning_rate": 
7.270736108467202e-06, + "loss": 0.9627, + "step": 7190 + }, + { + "epoch": 1.9125, + "grad_norm": 3.7932825088500977, + "learning_rate": 7.269952498697734e-06, + "loss": 0.8227, + "step": 7191 + }, + { + "epoch": 1.9127659574468086, + "grad_norm": 4.589715480804443, + "learning_rate": 7.2691688186932626e-06, + "loss": 0.9176, + "step": 7192 + }, + { + "epoch": 1.913031914893617, + "grad_norm": 4.00385856628418, + "learning_rate": 7.268385068478037e-06, + "loss": 0.7602, + "step": 7193 + }, + { + "epoch": 1.9132978723404257, + "grad_norm": 4.291144847869873, + "learning_rate": 7.267601248076307e-06, + "loss": 1.0254, + "step": 7194 + }, + { + "epoch": 1.913563829787234, + "grad_norm": 3.699037790298462, + "learning_rate": 7.2668173575123234e-06, + "loss": 0.8528, + "step": 7195 + }, + { + "epoch": 1.9138297872340426, + "grad_norm": 3.936768054962158, + "learning_rate": 7.266033396810343e-06, + "loss": 0.7172, + "step": 7196 + }, + { + "epoch": 1.914095744680851, + "grad_norm": 3.23809814453125, + "learning_rate": 7.265249365994621e-06, + "loss": 0.6519, + "step": 7197 + }, + { + "epoch": 1.9143617021276595, + "grad_norm": 4.3691020011901855, + "learning_rate": 7.2644652650894155e-06, + "loss": 0.8097, + "step": 7198 + }, + { + "epoch": 1.9146276595744682, + "grad_norm": 4.070173263549805, + "learning_rate": 7.263681094118989e-06, + "loss": 1.0137, + "step": 7199 + }, + { + "epoch": 1.9148936170212765, + "grad_norm": 3.9889721870422363, + "learning_rate": 7.262896853107606e-06, + "loss": 0.8935, + "step": 7200 + }, + { + "epoch": 1.9151595744680852, + "grad_norm": 3.6993491649627686, + "learning_rate": 7.262112542079529e-06, + "loss": 0.7445, + "step": 7201 + }, + { + "epoch": 1.9154255319148936, + "grad_norm": 4.081962585449219, + "learning_rate": 7.261328161059026e-06, + "loss": 1.0239, + "step": 7202 + }, + { + "epoch": 1.915691489361702, + "grad_norm": 4.065913677215576, + "learning_rate": 7.260543710070369e-06, + "loss": 0.9063, + "step": 7203 + }, + { + "epoch": 1.9159574468085108, + "grad_norm": 3.7012364864349365, + "learning_rate": 7.259759189137827e-06, + "loss": 0.9102, + "step": 7204 + }, + { + "epoch": 1.916223404255319, + "grad_norm": 4.341013431549072, + "learning_rate": 7.258974598285674e-06, + "loss": 0.8309, + "step": 7205 + }, + { + "epoch": 1.9164893617021277, + "grad_norm": 3.8948628902435303, + "learning_rate": 7.258189937538189e-06, + "loss": 0.786, + "step": 7206 + }, + { + "epoch": 1.9167553191489362, + "grad_norm": 4.040065288543701, + "learning_rate": 7.257405206919649e-06, + "loss": 0.7283, + "step": 7207 + }, + { + "epoch": 1.9170212765957446, + "grad_norm": 3.775395631790161, + "learning_rate": 7.256620406454333e-06, + "loss": 0.7441, + "step": 7208 + }, + { + "epoch": 1.9172872340425533, + "grad_norm": 4.277199745178223, + "learning_rate": 7.255835536166525e-06, + "loss": 0.8784, + "step": 7209 + }, + { + "epoch": 1.9175531914893615, + "grad_norm": 4.311332702636719, + "learning_rate": 7.25505059608051e-06, + "loss": 0.911, + "step": 7210 + }, + { + "epoch": 1.9178191489361702, + "grad_norm": 3.843778371810913, + "learning_rate": 7.254265586220574e-06, + "loss": 0.7906, + "step": 7211 + }, + { + "epoch": 1.9180851063829787, + "grad_norm": 4.064030647277832, + "learning_rate": 7.253480506611008e-06, + "loss": 0.8904, + "step": 7212 + }, + { + "epoch": 1.9183510638297872, + "grad_norm": 3.85115385055542, + "learning_rate": 7.252695357276101e-06, + "loss": 0.7148, + "step": 7213 + }, + { + "epoch": 1.9186170212765958, + "grad_norm": 3.716801643371582, + 
"learning_rate": 7.251910138240147e-06, + "loss": 0.7956, + "step": 7214 + }, + { + "epoch": 1.9188829787234043, + "grad_norm": 3.7296745777130127, + "learning_rate": 7.251124849527442e-06, + "loss": 0.8143, + "step": 7215 + }, + { + "epoch": 1.9191489361702128, + "grad_norm": 3.9987385272979736, + "learning_rate": 7.250339491162284e-06, + "loss": 0.8333, + "step": 7216 + }, + { + "epoch": 1.9194148936170212, + "grad_norm": 3.8190033435821533, + "learning_rate": 7.2495540631689745e-06, + "loss": 0.8476, + "step": 7217 + }, + { + "epoch": 1.9196808510638297, + "grad_norm": 4.055121898651123, + "learning_rate": 7.248768565571811e-06, + "loss": 0.8605, + "step": 7218 + }, + { + "epoch": 1.9199468085106384, + "grad_norm": 4.3670525550842285, + "learning_rate": 7.247982998395102e-06, + "loss": 0.8381, + "step": 7219 + }, + { + "epoch": 1.9202127659574468, + "grad_norm": 4.680405139923096, + "learning_rate": 7.247197361663152e-06, + "loss": 0.9635, + "step": 7220 + }, + { + "epoch": 1.9204787234042553, + "grad_norm": 4.1340460777282715, + "learning_rate": 7.24641165540027e-06, + "loss": 0.8125, + "step": 7221 + }, + { + "epoch": 1.920744680851064, + "grad_norm": 4.003271102905273, + "learning_rate": 7.245625879630767e-06, + "loss": 0.8934, + "step": 7222 + }, + { + "epoch": 1.9210106382978722, + "grad_norm": 4.222568035125732, + "learning_rate": 7.244840034378955e-06, + "loss": 1.0299, + "step": 7223 + }, + { + "epoch": 1.921276595744681, + "grad_norm": 3.762643337249756, + "learning_rate": 7.244054119669148e-06, + "loss": 0.6798, + "step": 7224 + }, + { + "epoch": 1.9215425531914894, + "grad_norm": 4.137721538543701, + "learning_rate": 7.243268135525666e-06, + "loss": 0.8147, + "step": 7225 + }, + { + "epoch": 1.9218085106382978, + "grad_norm": 4.0250139236450195, + "learning_rate": 7.242482081972827e-06, + "loss": 0.8394, + "step": 7226 + }, + { + "epoch": 1.9220744680851065, + "grad_norm": 3.7539706230163574, + "learning_rate": 7.241695959034951e-06, + "loss": 0.8293, + "step": 7227 + }, + { + "epoch": 1.9223404255319148, + "grad_norm": 4.054415225982666, + "learning_rate": 7.2409097667363635e-06, + "loss": 0.9107, + "step": 7228 + }, + { + "epoch": 1.9226063829787234, + "grad_norm": 4.380495548248291, + "learning_rate": 7.2401235051013885e-06, + "loss": 0.8641, + "step": 7229 + }, + { + "epoch": 1.922872340425532, + "grad_norm": 4.061448097229004, + "learning_rate": 7.239337174154357e-06, + "loss": 0.8332, + "step": 7230 + }, + { + "epoch": 1.9231382978723404, + "grad_norm": 4.095539093017578, + "learning_rate": 7.2385507739195945e-06, + "loss": 0.828, + "step": 7231 + }, + { + "epoch": 1.923404255319149, + "grad_norm": 4.271059513092041, + "learning_rate": 7.2377643044214375e-06, + "loss": 0.8365, + "step": 7232 + }, + { + "epoch": 1.9236702127659573, + "grad_norm": 3.9962894916534424, + "learning_rate": 7.236977765684216e-06, + "loss": 0.6932, + "step": 7233 + }, + { + "epoch": 1.923936170212766, + "grad_norm": 4.267841339111328, + "learning_rate": 7.236191157732272e-06, + "loss": 0.8561, + "step": 7234 + }, + { + "epoch": 1.9242021276595744, + "grad_norm": 4.299777030944824, + "learning_rate": 7.2354044805899385e-06, + "loss": 0.864, + "step": 7235 + }, + { + "epoch": 1.924468085106383, + "grad_norm": 4.053724765777588, + "learning_rate": 7.234617734281558e-06, + "loss": 0.8643, + "step": 7236 + }, + { + "epoch": 1.9247340425531916, + "grad_norm": 4.541396141052246, + "learning_rate": 7.2338309188314745e-06, + "loss": 0.793, + "step": 7237 + }, + { + "epoch": 1.925, + "grad_norm": 
4.2436676025390625, + "learning_rate": 7.233044034264034e-06, + "loss": 0.7894, + "step": 7238 + }, + { + "epoch": 1.9252659574468085, + "grad_norm": 4.764181613922119, + "learning_rate": 7.23225708060358e-06, + "loss": 0.7979, + "step": 7239 + }, + { + "epoch": 1.925531914893617, + "grad_norm": 4.301015377044678, + "learning_rate": 7.2314700578744635e-06, + "loss": 0.8022, + "step": 7240 + }, + { + "epoch": 1.9257978723404254, + "grad_norm": 3.9735851287841797, + "learning_rate": 7.230682966101038e-06, + "loss": 0.7377, + "step": 7241 + }, + { + "epoch": 1.9260638297872341, + "grad_norm": 4.120856285095215, + "learning_rate": 7.229895805307654e-06, + "loss": 0.7386, + "step": 7242 + }, + { + "epoch": 1.9263297872340426, + "grad_norm": 4.618571758270264, + "learning_rate": 7.229108575518668e-06, + "loss": 0.8771, + "step": 7243 + }, + { + "epoch": 1.926595744680851, + "grad_norm": 3.679917573928833, + "learning_rate": 7.22832127675844e-06, + "loss": 0.8137, + "step": 7244 + }, + { + "epoch": 1.9268617021276597, + "grad_norm": 4.480624198913574, + "learning_rate": 7.227533909051327e-06, + "loss": 0.8955, + "step": 7245 + }, + { + "epoch": 1.927127659574468, + "grad_norm": 3.715806722640991, + "learning_rate": 7.226746472421692e-06, + "loss": 0.8023, + "step": 7246 + }, + { + "epoch": 1.9273936170212767, + "grad_norm": 4.008445739746094, + "learning_rate": 7.2259589668939005e-06, + "loss": 0.8584, + "step": 7247 + }, + { + "epoch": 1.9276595744680851, + "grad_norm": 4.211793899536133, + "learning_rate": 7.225171392492316e-06, + "loss": 0.8412, + "step": 7248 + }, + { + "epoch": 1.9279255319148936, + "grad_norm": 4.422094821929932, + "learning_rate": 7.224383749241311e-06, + "loss": 0.811, + "step": 7249 + }, + { + "epoch": 1.9281914893617023, + "grad_norm": 3.894848108291626, + "learning_rate": 7.223596037165252e-06, + "loss": 0.9126, + "step": 7250 + }, + { + "epoch": 1.9284574468085105, + "grad_norm": 3.9139139652252197, + "learning_rate": 7.222808256288515e-06, + "loss": 0.7837, + "step": 7251 + }, + { + "epoch": 1.9287234042553192, + "grad_norm": 4.1469197273254395, + "learning_rate": 7.222020406635474e-06, + "loss": 0.7134, + "step": 7252 + }, + { + "epoch": 1.9289893617021276, + "grad_norm": 3.5331952571868896, + "learning_rate": 7.2212324882305045e-06, + "loss": 0.7372, + "step": 7253 + }, + { + "epoch": 1.929255319148936, + "grad_norm": 3.312333822250366, + "learning_rate": 7.220444501097986e-06, + "loss": 0.7583, + "step": 7254 + }, + { + "epoch": 1.9295212765957448, + "grad_norm": 4.264598846435547, + "learning_rate": 7.2196564452623015e-06, + "loss": 0.8354, + "step": 7255 + }, + { + "epoch": 1.929787234042553, + "grad_norm": 4.467483997344971, + "learning_rate": 7.2188683207478326e-06, + "loss": 0.8728, + "step": 7256 + }, + { + "epoch": 1.9300531914893617, + "grad_norm": 3.850327730178833, + "learning_rate": 7.218080127578966e-06, + "loss": 0.8222, + "step": 7257 + }, + { + "epoch": 1.9303191489361702, + "grad_norm": 3.970350980758667, + "learning_rate": 7.217291865780089e-06, + "loss": 0.8979, + "step": 7258 + }, + { + "epoch": 1.9305851063829786, + "grad_norm": 3.9415476322174072, + "learning_rate": 7.21650353537559e-06, + "loss": 0.7552, + "step": 7259 + }, + { + "epoch": 1.9308510638297873, + "grad_norm": 3.566114664077759, + "learning_rate": 7.215715136389862e-06, + "loss": 0.8683, + "step": 7260 + }, + { + "epoch": 1.9311170212765958, + "grad_norm": 3.991467237472534, + "learning_rate": 7.2149266688473005e-06, + "loss": 0.7815, + "step": 7261 + }, + { + "epoch": 
1.9313829787234043, + "grad_norm": 4.0647406578063965, + "learning_rate": 7.214138132772299e-06, + "loss": 0.7483, + "step": 7262 + }, + { + "epoch": 1.9316489361702127, + "grad_norm": 4.495807647705078, + "learning_rate": 7.213349528189258e-06, + "loss": 0.9067, + "step": 7263 + }, + { + "epoch": 1.9319148936170212, + "grad_norm": 4.034248352050781, + "learning_rate": 7.212560855122576e-06, + "loss": 0.7541, + "step": 7264 + }, + { + "epoch": 1.9321808510638299, + "grad_norm": 3.8755152225494385, + "learning_rate": 7.211772113596656e-06, + "loss": 0.8805, + "step": 7265 + }, + { + "epoch": 1.9324468085106383, + "grad_norm": 3.655921220779419, + "learning_rate": 7.210983303635901e-06, + "loss": 0.7864, + "step": 7266 + }, + { + "epoch": 1.9327127659574468, + "grad_norm": 4.281502723693848, + "learning_rate": 7.210194425264723e-06, + "loss": 0.9595, + "step": 7267 + }, + { + "epoch": 1.9329787234042555, + "grad_norm": 3.8239359855651855, + "learning_rate": 7.209405478507525e-06, + "loss": 0.7896, + "step": 7268 + }, + { + "epoch": 1.9332446808510637, + "grad_norm": 3.9340760707855225, + "learning_rate": 7.20861646338872e-06, + "loss": 0.855, + "step": 7269 + }, + { + "epoch": 1.9335106382978724, + "grad_norm": 3.6993649005889893, + "learning_rate": 7.207827379932724e-06, + "loss": 0.774, + "step": 7270 + }, + { + "epoch": 1.9337765957446809, + "grad_norm": 4.12832498550415, + "learning_rate": 7.2070382281639466e-06, + "loss": 0.8031, + "step": 7271 + }, + { + "epoch": 1.9340425531914893, + "grad_norm": 3.675234079360962, + "learning_rate": 7.206249008106808e-06, + "loss": 0.7203, + "step": 7272 + }, + { + "epoch": 1.934308510638298, + "grad_norm": 4.341015338897705, + "learning_rate": 7.20545971978573e-06, + "loss": 0.7099, + "step": 7273 + }, + { + "epoch": 1.9345744680851062, + "grad_norm": 4.289004802703857, + "learning_rate": 7.2046703632251295e-06, + "loss": 0.8558, + "step": 7274 + }, + { + "epoch": 1.934840425531915, + "grad_norm": 3.8868236541748047, + "learning_rate": 7.203880938449432e-06, + "loss": 0.8851, + "step": 7275 + }, + { + "epoch": 1.9351063829787234, + "grad_norm": 4.085642337799072, + "learning_rate": 7.2030914454830645e-06, + "loss": 0.7872, + "step": 7276 + }, + { + "epoch": 1.9353723404255319, + "grad_norm": 3.6767923831939697, + "learning_rate": 7.202301884350454e-06, + "loss": 0.712, + "step": 7277 + }, + { + "epoch": 1.9356382978723405, + "grad_norm": 4.32539176940918, + "learning_rate": 7.201512255076031e-06, + "loss": 0.9707, + "step": 7278 + }, + { + "epoch": 1.9359042553191488, + "grad_norm": 3.729510545730591, + "learning_rate": 7.2007225576842255e-06, + "loss": 0.8447, + "step": 7279 + }, + { + "epoch": 1.9361702127659575, + "grad_norm": 4.127895832061768, + "learning_rate": 7.1999327921994735e-06, + "loss": 0.8129, + "step": 7280 + }, + { + "epoch": 1.936436170212766, + "grad_norm": 3.7349631786346436, + "learning_rate": 7.199142958646211e-06, + "loss": 0.6886, + "step": 7281 + }, + { + "epoch": 1.9367021276595744, + "grad_norm": 3.900869369506836, + "learning_rate": 7.198353057048876e-06, + "loss": 0.7183, + "step": 7282 + }, + { + "epoch": 1.936968085106383, + "grad_norm": 4.21663761138916, + "learning_rate": 7.197563087431909e-06, + "loss": 0.9005, + "step": 7283 + }, + { + "epoch": 1.9372340425531915, + "grad_norm": 3.992421865463257, + "learning_rate": 7.196773049819753e-06, + "loss": 0.8604, + "step": 7284 + }, + { + "epoch": 1.9375, + "grad_norm": 4.140373229980469, + "learning_rate": 7.195982944236853e-06, + "loss": 0.9231, + "step": 7285 + }, + { 
+ "epoch": 1.9377659574468085, + "grad_norm": 3.9591143131256104, + "learning_rate": 7.1951927707076545e-06, + "loss": 0.9934, + "step": 7286 + }, + { + "epoch": 1.938031914893617, + "grad_norm": 4.134740352630615, + "learning_rate": 7.194402529256608e-06, + "loss": 0.8869, + "step": 7287 + }, + { + "epoch": 1.9382978723404256, + "grad_norm": 3.9935176372528076, + "learning_rate": 7.193612219908161e-06, + "loss": 0.7377, + "step": 7288 + }, + { + "epoch": 1.938563829787234, + "grad_norm": 4.432157039642334, + "learning_rate": 7.192821842686772e-06, + "loss": 0.864, + "step": 7289 + }, + { + "epoch": 1.9388297872340425, + "grad_norm": 4.096209526062012, + "learning_rate": 7.1920313976168935e-06, + "loss": 0.8539, + "step": 7290 + }, + { + "epoch": 1.9390957446808512, + "grad_norm": 3.792664051055908, + "learning_rate": 7.191240884722982e-06, + "loss": 0.8195, + "step": 7291 + }, + { + "epoch": 1.9393617021276595, + "grad_norm": 3.759690046310425, + "learning_rate": 7.190450304029497e-06, + "loss": 0.7395, + "step": 7292 + }, + { + "epoch": 1.9396276595744681, + "grad_norm": 3.7826247215270996, + "learning_rate": 7.1896596555609025e-06, + "loss": 0.7206, + "step": 7293 + }, + { + "epoch": 1.9398936170212766, + "grad_norm": 3.8327670097351074, + "learning_rate": 7.1888689393416575e-06, + "loss": 0.9116, + "step": 7294 + }, + { + "epoch": 1.940159574468085, + "grad_norm": 3.965418815612793, + "learning_rate": 7.188078155396232e-06, + "loss": 0.8134, + "step": 7295 + }, + { + "epoch": 1.9404255319148938, + "grad_norm": 3.9271137714385986, + "learning_rate": 7.187287303749093e-06, + "loss": 0.705, + "step": 7296 + }, + { + "epoch": 1.940691489361702, + "grad_norm": 4.100310325622559, + "learning_rate": 7.186496384424708e-06, + "loss": 0.8471, + "step": 7297 + }, + { + "epoch": 1.9409574468085107, + "grad_norm": 3.9107069969177246, + "learning_rate": 7.185705397447552e-06, + "loss": 0.8495, + "step": 7298 + }, + { + "epoch": 1.9412234042553191, + "grad_norm": 4.238333225250244, + "learning_rate": 7.1849143428420975e-06, + "loss": 0.7926, + "step": 7299 + }, + { + "epoch": 1.9414893617021276, + "grad_norm": 4.412265777587891, + "learning_rate": 7.18412322063282e-06, + "loss": 0.947, + "step": 7300 + }, + { + "epoch": 1.9417553191489363, + "grad_norm": 3.686246156692505, + "learning_rate": 7.183332030844199e-06, + "loss": 0.7733, + "step": 7301 + }, + { + "epoch": 1.9420212765957445, + "grad_norm": 3.924842596054077, + "learning_rate": 7.182540773500715e-06, + "loss": 0.9132, + "step": 7302 + }, + { + "epoch": 1.9422872340425532, + "grad_norm": 3.5468335151672363, + "learning_rate": 7.181749448626849e-06, + "loss": 0.8032, + "step": 7303 + }, + { + "epoch": 1.9425531914893617, + "grad_norm": 3.618908166885376, + "learning_rate": 7.180958056247087e-06, + "loss": 0.8473, + "step": 7304 + }, + { + "epoch": 1.9428191489361701, + "grad_norm": 3.575326919555664, + "learning_rate": 7.180166596385915e-06, + "loss": 0.7703, + "step": 7305 + }, + { + "epoch": 1.9430851063829788, + "grad_norm": 4.315759658813477, + "learning_rate": 7.179375069067821e-06, + "loss": 0.823, + "step": 7306 + }, + { + "epoch": 1.9433510638297873, + "grad_norm": 3.9836225509643555, + "learning_rate": 7.178583474317295e-06, + "loss": 0.6672, + "step": 7307 + }, + { + "epoch": 1.9436170212765957, + "grad_norm": 4.030239105224609, + "learning_rate": 7.177791812158835e-06, + "loss": 0.806, + "step": 7308 + }, + { + "epoch": 1.9438829787234042, + "grad_norm": 3.8376708030700684, + "learning_rate": 7.17700008261693e-06, + "loss": 0.7224, 
+ "step": 7309 + }, + { + "epoch": 1.9441489361702127, + "grad_norm": 4.117557048797607, + "learning_rate": 7.176208285716079e-06, + "loss": 0.8359, + "step": 7310 + }, + { + "epoch": 1.9444148936170214, + "grad_norm": 4.3215012550354, + "learning_rate": 7.175416421480783e-06, + "loss": 0.7143, + "step": 7311 + }, + { + "epoch": 1.9446808510638298, + "grad_norm": 3.8996849060058594, + "learning_rate": 7.174624489935541e-06, + "loss": 0.806, + "step": 7312 + }, + { + "epoch": 1.9449468085106383, + "grad_norm": 3.478804588317871, + "learning_rate": 7.173832491104858e-06, + "loss": 0.7916, + "step": 7313 + }, + { + "epoch": 1.945212765957447, + "grad_norm": 3.8935012817382812, + "learning_rate": 7.173040425013236e-06, + "loss": 0.719, + "step": 7314 + }, + { + "epoch": 1.9454787234042552, + "grad_norm": 3.9126412868499756, + "learning_rate": 7.172248291685187e-06, + "loss": 0.6975, + "step": 7315 + }, + { + "epoch": 1.945744680851064, + "grad_norm": 3.790658712387085, + "learning_rate": 7.171456091145217e-06, + "loss": 0.8119, + "step": 7316 + }, + { + "epoch": 1.9460106382978724, + "grad_norm": 4.477363109588623, + "learning_rate": 7.170663823417839e-06, + "loss": 0.8697, + "step": 7317 + }, + { + "epoch": 1.9462765957446808, + "grad_norm": 4.502041816711426, + "learning_rate": 7.1698714885275665e-06, + "loss": 0.9479, + "step": 7318 + }, + { + "epoch": 1.9465425531914895, + "grad_norm": 3.928950071334839, + "learning_rate": 7.169079086498915e-06, + "loss": 0.7123, + "step": 7319 + }, + { + "epoch": 1.9468085106382977, + "grad_norm": 3.781550168991089, + "learning_rate": 7.168286617356406e-06, + "loss": 0.7275, + "step": 7320 + }, + { + "epoch": 1.9470744680851064, + "grad_norm": 4.246979236602783, + "learning_rate": 7.167494081124553e-06, + "loss": 0.885, + "step": 7321 + }, + { + "epoch": 1.9473404255319149, + "grad_norm": 4.124865531921387, + "learning_rate": 7.166701477827882e-06, + "loss": 0.8088, + "step": 7322 + }, + { + "epoch": 1.9476063829787233, + "grad_norm": 4.21986198425293, + "learning_rate": 7.165908807490916e-06, + "loss": 0.9175, + "step": 7323 + }, + { + "epoch": 1.947872340425532, + "grad_norm": 4.153756618499756, + "learning_rate": 7.165116070138183e-06, + "loss": 0.8633, + "step": 7324 + }, + { + "epoch": 1.9481382978723403, + "grad_norm": 3.5365302562713623, + "learning_rate": 7.164323265794209e-06, + "loss": 0.8274, + "step": 7325 + }, + { + "epoch": 1.948404255319149, + "grad_norm": 4.312306880950928, + "learning_rate": 7.1635303944835246e-06, + "loss": 0.847, + "step": 7326 + }, + { + "epoch": 1.9486702127659574, + "grad_norm": 4.010374069213867, + "learning_rate": 7.162737456230662e-06, + "loss": 0.82, + "step": 7327 + }, + { + "epoch": 1.9489361702127659, + "grad_norm": 5.155407905578613, + "learning_rate": 7.161944451060157e-06, + "loss": 0.9241, + "step": 7328 + }, + { + "epoch": 1.9492021276595746, + "grad_norm": 3.665374279022217, + "learning_rate": 7.161151378996545e-06, + "loss": 0.8255, + "step": 7329 + }, + { + "epoch": 1.949468085106383, + "grad_norm": 3.6932079792022705, + "learning_rate": 7.1603582400643646e-06, + "loss": 0.8187, + "step": 7330 + }, + { + "epoch": 1.9497340425531915, + "grad_norm": 3.555961847305298, + "learning_rate": 7.159565034288157e-06, + "loss": 0.7523, + "step": 7331 + }, + { + "epoch": 1.95, + "grad_norm": 4.505660533905029, + "learning_rate": 7.158771761692464e-06, + "loss": 0.7903, + "step": 7332 + }, + { + "epoch": 1.9502659574468084, + "grad_norm": 3.616476058959961, + "learning_rate": 7.157978422301832e-06, + "loss": 
0.8853, + "step": 7333 + }, + { + "epoch": 1.950531914893617, + "grad_norm": 4.25620698928833, + "learning_rate": 7.157185016140809e-06, + "loss": 0.8566, + "step": 7334 + }, + { + "epoch": 1.9507978723404256, + "grad_norm": 3.9593820571899414, + "learning_rate": 7.156391543233938e-06, + "loss": 0.7797, + "step": 7335 + }, + { + "epoch": 1.951063829787234, + "grad_norm": 4.379816055297852, + "learning_rate": 7.155598003605776e-06, + "loss": 0.9148, + "step": 7336 + }, + { + "epoch": 1.9513297872340427, + "grad_norm": 3.731823205947876, + "learning_rate": 7.154804397280873e-06, + "loss": 0.7223, + "step": 7337 + }, + { + "epoch": 1.951595744680851, + "grad_norm": 3.8849217891693115, + "learning_rate": 7.154010724283786e-06, + "loss": 0.8446, + "step": 7338 + }, + { + "epoch": 1.9518617021276596, + "grad_norm": 3.7477874755859375, + "learning_rate": 7.15321698463907e-06, + "loss": 0.6922, + "step": 7339 + }, + { + "epoch": 1.952127659574468, + "grad_norm": 4.323108673095703, + "learning_rate": 7.152423178371286e-06, + "loss": 0.8153, + "step": 7340 + }, + { + "epoch": 1.9523936170212766, + "grad_norm": 4.16124153137207, + "learning_rate": 7.1516293055049944e-06, + "loss": 0.8003, + "step": 7341 + }, + { + "epoch": 1.9526595744680852, + "grad_norm": 4.236426830291748, + "learning_rate": 7.150835366064759e-06, + "loss": 0.7843, + "step": 7342 + }, + { + "epoch": 1.9529255319148935, + "grad_norm": 3.637660026550293, + "learning_rate": 7.1500413600751465e-06, + "loss": 0.7665, + "step": 7343 + }, + { + "epoch": 1.9531914893617022, + "grad_norm": 3.838202476501465, + "learning_rate": 7.14924728756072e-06, + "loss": 0.7723, + "step": 7344 + }, + { + "epoch": 1.9534574468085106, + "grad_norm": 4.209107875823975, + "learning_rate": 7.148453148546055e-06, + "loss": 0.8646, + "step": 7345 + }, + { + "epoch": 1.953723404255319, + "grad_norm": 3.9335439205169678, + "learning_rate": 7.147658943055718e-06, + "loss": 0.6881, + "step": 7346 + }, + { + "epoch": 1.9539893617021278, + "grad_norm": 3.6025755405426025, + "learning_rate": 7.1468646711142855e-06, + "loss": 0.6567, + "step": 7347 + }, + { + "epoch": 1.954255319148936, + "grad_norm": 3.8079092502593994, + "learning_rate": 7.146070332746332e-06, + "loss": 0.7122, + "step": 7348 + }, + { + "epoch": 1.9545212765957447, + "grad_norm": 4.033806800842285, + "learning_rate": 7.145275927976436e-06, + "loss": 0.7522, + "step": 7349 + }, + { + "epoch": 1.9547872340425532, + "grad_norm": 4.1563310623168945, + "learning_rate": 7.144481456829178e-06, + "loss": 0.7998, + "step": 7350 + }, + { + "epoch": 1.9550531914893616, + "grad_norm": 4.061034202575684, + "learning_rate": 7.143686919329138e-06, + "loss": 0.9232, + "step": 7351 + }, + { + "epoch": 1.9553191489361703, + "grad_norm": 4.174419403076172, + "learning_rate": 7.1428923155009e-06, + "loss": 0.6807, + "step": 7352 + }, + { + "epoch": 1.9555851063829788, + "grad_norm": 3.6197104454040527, + "learning_rate": 7.142097645369052e-06, + "loss": 0.8129, + "step": 7353 + }, + { + "epoch": 1.9558510638297872, + "grad_norm": 4.288638591766357, + "learning_rate": 7.141302908958181e-06, + "loss": 0.9342, + "step": 7354 + }, + { + "epoch": 1.9561170212765957, + "grad_norm": 3.9184861183166504, + "learning_rate": 7.140508106292876e-06, + "loss": 0.7052, + "step": 7355 + }, + { + "epoch": 1.9563829787234042, + "grad_norm": 4.214428901672363, + "learning_rate": 7.1397132373977295e-06, + "loss": 0.8679, + "step": 7356 + }, + { + "epoch": 1.9566489361702128, + "grad_norm": 4.283886909484863, + "learning_rate": 
7.138918302297338e-06, + "loss": 0.8816, + "step": 7357 + }, + { + "epoch": 1.9569148936170213, + "grad_norm": 3.77843976020813, + "learning_rate": 7.138123301016295e-06, + "loss": 0.7901, + "step": 7358 + }, + { + "epoch": 1.9571808510638298, + "grad_norm": 3.9347009658813477, + "learning_rate": 7.137328233579201e-06, + "loss": 0.7385, + "step": 7359 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 3.9841034412384033, + "learning_rate": 7.136533100010654e-06, + "loss": 0.7738, + "step": 7360 + }, + { + "epoch": 1.9577127659574467, + "grad_norm": 3.536179780960083, + "learning_rate": 7.1357379003352565e-06, + "loss": 0.8311, + "step": 7361 + }, + { + "epoch": 1.9579787234042554, + "grad_norm": 4.386892318725586, + "learning_rate": 7.134942634577615e-06, + "loss": 0.9451, + "step": 7362 + }, + { + "epoch": 1.9582446808510638, + "grad_norm": 3.738041877746582, + "learning_rate": 7.1341473027623355e-06, + "loss": 0.6454, + "step": 7363 + }, + { + "epoch": 1.9585106382978723, + "grad_norm": 3.718473434448242, + "learning_rate": 7.133351904914024e-06, + "loss": 0.8613, + "step": 7364 + }, + { + "epoch": 1.958776595744681, + "grad_norm": 4.3047661781311035, + "learning_rate": 7.132556441057294e-06, + "loss": 0.7499, + "step": 7365 + }, + { + "epoch": 1.9590425531914892, + "grad_norm": 3.821338415145874, + "learning_rate": 7.131760911216756e-06, + "loss": 0.737, + "step": 7366 + }, + { + "epoch": 1.959308510638298, + "grad_norm": 3.7964980602264404, + "learning_rate": 7.130965315417027e-06, + "loss": 0.8637, + "step": 7367 + }, + { + "epoch": 1.9595744680851064, + "grad_norm": 3.9412569999694824, + "learning_rate": 7.130169653682721e-06, + "loss": 0.6788, + "step": 7368 + }, + { + "epoch": 1.9598404255319148, + "grad_norm": 4.125255584716797, + "learning_rate": 7.129373926038459e-06, + "loss": 0.86, + "step": 7369 + }, + { + "epoch": 1.9601063829787235, + "grad_norm": 3.7982115745544434, + "learning_rate": 7.128578132508859e-06, + "loss": 0.9386, + "step": 7370 + }, + { + "epoch": 1.9603723404255318, + "grad_norm": 3.9143412113189697, + "learning_rate": 7.1277822731185475e-06, + "loss": 0.911, + "step": 7371 + }, + { + "epoch": 1.9606382978723405, + "grad_norm": 4.226142883300781, + "learning_rate": 7.126986347892146e-06, + "loss": 0.7375, + "step": 7372 + }, + { + "epoch": 1.960904255319149, + "grad_norm": 3.8393430709838867, + "learning_rate": 7.126190356854283e-06, + "loss": 0.8341, + "step": 7373 + }, + { + "epoch": 1.9611702127659574, + "grad_norm": 4.1616926193237305, + "learning_rate": 7.1253943000295865e-06, + "loss": 0.8532, + "step": 7374 + }, + { + "epoch": 1.961436170212766, + "grad_norm": 3.9134316444396973, + "learning_rate": 7.12459817744269e-06, + "loss": 0.7566, + "step": 7375 + }, + { + "epoch": 1.9617021276595743, + "grad_norm": 3.930948495864868, + "learning_rate": 7.123801989118223e-06, + "loss": 0.7781, + "step": 7376 + }, + { + "epoch": 1.961968085106383, + "grad_norm": 3.913886785507202, + "learning_rate": 7.1230057350808234e-06, + "loss": 0.8081, + "step": 7377 + }, + { + "epoch": 1.9622340425531914, + "grad_norm": 4.381828308105469, + "learning_rate": 7.122209415355125e-06, + "loss": 0.9048, + "step": 7378 + }, + { + "epoch": 1.9625, + "grad_norm": 3.839282512664795, + "learning_rate": 7.121413029965769e-06, + "loss": 0.7002, + "step": 7379 + }, + { + "epoch": 1.9627659574468086, + "grad_norm": 4.018161773681641, + "learning_rate": 7.120616578937397e-06, + "loss": 0.803, + "step": 7380 + }, + { + "epoch": 1.963031914893617, + "grad_norm": 4.220311164855957, + 
"learning_rate": 7.1198200622946516e-06, + "loss": 0.8337, + "step": 7381 + }, + { + "epoch": 1.9632978723404255, + "grad_norm": 3.790156841278076, + "learning_rate": 7.119023480062176e-06, + "loss": 0.7224, + "step": 7382 + }, + { + "epoch": 1.9635638297872342, + "grad_norm": 4.560417652130127, + "learning_rate": 7.1182268322646205e-06, + "loss": 0.8584, + "step": 7383 + }, + { + "epoch": 1.9638297872340424, + "grad_norm": 4.3043999671936035, + "learning_rate": 7.117430118926633e-06, + "loss": 0.8294, + "step": 7384 + }, + { + "epoch": 1.9640957446808511, + "grad_norm": 3.781405210494995, + "learning_rate": 7.116633340072863e-06, + "loss": 0.7876, + "step": 7385 + }, + { + "epoch": 1.9643617021276596, + "grad_norm": 3.986027956008911, + "learning_rate": 7.115836495727968e-06, + "loss": 0.7581, + "step": 7386 + }, + { + "epoch": 1.964627659574468, + "grad_norm": 3.9813320636749268, + "learning_rate": 7.1150395859165985e-06, + "loss": 0.9021, + "step": 7387 + }, + { + "epoch": 1.9648936170212767, + "grad_norm": 4.043676376342773, + "learning_rate": 7.114242610663415e-06, + "loss": 0.791, + "step": 7388 + }, + { + "epoch": 1.965159574468085, + "grad_norm": 4.014968395233154, + "learning_rate": 7.113445569993076e-06, + "loss": 0.7437, + "step": 7389 + }, + { + "epoch": 1.9654255319148937, + "grad_norm": 3.8244807720184326, + "learning_rate": 7.1126484639302425e-06, + "loss": 0.7376, + "step": 7390 + }, + { + "epoch": 1.9656914893617021, + "grad_norm": 3.804473400115967, + "learning_rate": 7.111851292499579e-06, + "loss": 0.8358, + "step": 7391 + }, + { + "epoch": 1.9659574468085106, + "grad_norm": 3.598792552947998, + "learning_rate": 7.111054055725749e-06, + "loss": 0.7728, + "step": 7392 + }, + { + "epoch": 1.9662234042553193, + "grad_norm": 4.2588677406311035, + "learning_rate": 7.110256753633421e-06, + "loss": 0.884, + "step": 7393 + }, + { + "epoch": 1.9664893617021275, + "grad_norm": 3.7859714031219482, + "learning_rate": 7.109459386247265e-06, + "loss": 0.6813, + "step": 7394 + }, + { + "epoch": 1.9667553191489362, + "grad_norm": 4.303823471069336, + "learning_rate": 7.108661953591953e-06, + "loss": 0.9044, + "step": 7395 + }, + { + "epoch": 1.9670212765957447, + "grad_norm": 3.953003406524658, + "learning_rate": 7.107864455692156e-06, + "loss": 0.7632, + "step": 7396 + }, + { + "epoch": 1.9672872340425531, + "grad_norm": 4.125672817230225, + "learning_rate": 7.107066892572552e-06, + "loss": 0.7153, + "step": 7397 + }, + { + "epoch": 1.9675531914893618, + "grad_norm": 4.01138973236084, + "learning_rate": 7.106269264257817e-06, + "loss": 0.8052, + "step": 7398 + }, + { + "epoch": 1.96781914893617, + "grad_norm": 3.7055439949035645, + "learning_rate": 7.10547157077263e-06, + "loss": 0.7684, + "step": 7399 + }, + { + "epoch": 1.9680851063829787, + "grad_norm": 4.636490821838379, + "learning_rate": 7.104673812141676e-06, + "loss": 0.7504, + "step": 7400 + }, + { + "epoch": 1.9683510638297872, + "grad_norm": 3.961894989013672, + "learning_rate": 7.103875988389636e-06, + "loss": 0.9316, + "step": 7401 + }, + { + "epoch": 1.9686170212765957, + "grad_norm": 3.978306770324707, + "learning_rate": 7.103078099541194e-06, + "loss": 0.8276, + "step": 7402 + }, + { + "epoch": 1.9688829787234043, + "grad_norm": 3.9166336059570312, + "learning_rate": 7.102280145621041e-06, + "loss": 0.7308, + "step": 7403 + }, + { + "epoch": 1.9691489361702128, + "grad_norm": 3.680129289627075, + "learning_rate": 7.101482126653865e-06, + "loss": 0.8355, + "step": 7404 + }, + { + "epoch": 1.9694148936170213, + 
"grad_norm": 4.1183857917785645, + "learning_rate": 7.1006840426643576e-06, + "loss": 0.7782, + "step": 7405 + }, + { + "epoch": 1.96968085106383, + "grad_norm": 4.286891460418701, + "learning_rate": 7.099885893677213e-06, + "loss": 0.8094, + "step": 7406 + }, + { + "epoch": 1.9699468085106382, + "grad_norm": 4.037398338317871, + "learning_rate": 7.099087679717127e-06, + "loss": 0.8141, + "step": 7407 + }, + { + "epoch": 1.9702127659574469, + "grad_norm": 3.8752505779266357, + "learning_rate": 7.098289400808795e-06, + "loss": 0.7824, + "step": 7408 + }, + { + "epoch": 1.9704787234042553, + "grad_norm": 3.7574338912963867, + "learning_rate": 7.0974910569769195e-06, + "loss": 0.6398, + "step": 7409 + }, + { + "epoch": 1.9707446808510638, + "grad_norm": 3.918271064758301, + "learning_rate": 7.096692648246203e-06, + "loss": 0.7949, + "step": 7410 + }, + { + "epoch": 1.9710106382978725, + "grad_norm": 4.124891757965088, + "learning_rate": 7.095894174641345e-06, + "loss": 0.9578, + "step": 7411 + }, + { + "epoch": 1.9712765957446807, + "grad_norm": 3.764817953109741, + "learning_rate": 7.0950956361870536e-06, + "loss": 0.8013, + "step": 7412 + }, + { + "epoch": 1.9715425531914894, + "grad_norm": 4.22829008102417, + "learning_rate": 7.094297032908037e-06, + "loss": 0.7897, + "step": 7413 + }, + { + "epoch": 1.9718085106382979, + "grad_norm": 4.174428462982178, + "learning_rate": 7.093498364829006e-06, + "loss": 0.8182, + "step": 7414 + }, + { + "epoch": 1.9720744680851063, + "grad_norm": 4.265493392944336, + "learning_rate": 7.09269963197467e-06, + "loss": 0.7067, + "step": 7415 + }, + { + "epoch": 1.972340425531915, + "grad_norm": 3.417632579803467, + "learning_rate": 7.091900834369743e-06, + "loss": 0.6767, + "step": 7416 + }, + { + "epoch": 1.9726063829787233, + "grad_norm": 3.931145429611206, + "learning_rate": 7.09110197203894e-06, + "loss": 0.7581, + "step": 7417 + }, + { + "epoch": 1.972872340425532, + "grad_norm": 3.808061361312866, + "learning_rate": 7.090303045006983e-06, + "loss": 0.88, + "step": 7418 + }, + { + "epoch": 1.9731382978723404, + "grad_norm": 4.074621677398682, + "learning_rate": 7.089504053298587e-06, + "loss": 0.8391, + "step": 7419 + }, + { + "epoch": 1.9734042553191489, + "grad_norm": 3.7446646690368652, + "learning_rate": 7.0887049969384756e-06, + "loss": 0.778, + "step": 7420 + }, + { + "epoch": 1.9736702127659576, + "grad_norm": 4.311694622039795, + "learning_rate": 7.087905875951373e-06, + "loss": 0.6362, + "step": 7421 + }, + { + "epoch": 1.9739361702127658, + "grad_norm": 3.7492148876190186, + "learning_rate": 7.087106690362003e-06, + "loss": 0.85, + "step": 7422 + }, + { + "epoch": 1.9742021276595745, + "grad_norm": 3.8154044151306152, + "learning_rate": 7.086307440195096e-06, + "loss": 0.8229, + "step": 7423 + }, + { + "epoch": 1.974468085106383, + "grad_norm": 3.8786826133728027, + "learning_rate": 7.085508125475381e-06, + "loss": 0.8001, + "step": 7424 + }, + { + "epoch": 1.9747340425531914, + "grad_norm": 3.972696304321289, + "learning_rate": 7.084708746227589e-06, + "loss": 0.9101, + "step": 7425 + }, + { + "epoch": 1.975, + "grad_norm": 4.224587440490723, + "learning_rate": 7.083909302476453e-06, + "loss": 0.7869, + "step": 7426 + }, + { + "epoch": 1.9752659574468086, + "grad_norm": 3.700507164001465, + "learning_rate": 7.08310979424671e-06, + "loss": 0.7123, + "step": 7427 + }, + { + "epoch": 1.975531914893617, + "grad_norm": 3.8128812313079834, + "learning_rate": 7.082310221563098e-06, + "loss": 0.7205, + "step": 7428 + }, + { + "epoch": 
1.9757978723404257, + "grad_norm": 4.028718948364258, + "learning_rate": 7.081510584450355e-06, + "loss": 0.9249, + "step": 7429 + }, + { + "epoch": 1.976063829787234, + "grad_norm": 3.798619270324707, + "learning_rate": 7.080710882933225e-06, + "loss": 0.7412, + "step": 7430 + }, + { + "epoch": 1.9763297872340426, + "grad_norm": 4.599943161010742, + "learning_rate": 7.07991111703645e-06, + "loss": 0.8713, + "step": 7431 + }, + { + "epoch": 1.976595744680851, + "grad_norm": 4.6581854820251465, + "learning_rate": 7.079111286784775e-06, + "loss": 0.8165, + "step": 7432 + }, + { + "epoch": 1.9768617021276595, + "grad_norm": 3.9097495079040527, + "learning_rate": 7.078311392202951e-06, + "loss": 0.7803, + "step": 7433 + }, + { + "epoch": 1.9771276595744682, + "grad_norm": 4.4464802742004395, + "learning_rate": 7.077511433315725e-06, + "loss": 0.9244, + "step": 7434 + }, + { + "epoch": 1.9773936170212765, + "grad_norm": 4.222725868225098, + "learning_rate": 7.076711410147849e-06, + "loss": 0.9159, + "step": 7435 + }, + { + "epoch": 1.9776595744680852, + "grad_norm": 3.8437206745147705, + "learning_rate": 7.075911322724077e-06, + "loss": 0.7657, + "step": 7436 + }, + { + "epoch": 1.9779255319148936, + "grad_norm": 3.891757011413574, + "learning_rate": 7.075111171069165e-06, + "loss": 0.574, + "step": 7437 + }, + { + "epoch": 1.978191489361702, + "grad_norm": 3.8077917098999023, + "learning_rate": 7.074310955207869e-06, + "loss": 0.713, + "step": 7438 + }, + { + "epoch": 1.9784574468085108, + "grad_norm": 3.8292224407196045, + "learning_rate": 7.073510675164952e-06, + "loss": 0.8645, + "step": 7439 + }, + { + "epoch": 1.978723404255319, + "grad_norm": 3.931783437728882, + "learning_rate": 7.072710330965171e-06, + "loss": 0.7588, + "step": 7440 + }, + { + "epoch": 1.9789893617021277, + "grad_norm": 3.6988885402679443, + "learning_rate": 7.071909922633293e-06, + "loss": 0.8146, + "step": 7441 + }, + { + "epoch": 1.9792553191489362, + "grad_norm": 3.7726998329162598, + "learning_rate": 7.071109450194085e-06, + "loss": 0.8082, + "step": 7442 + }, + { + "epoch": 1.9795212765957446, + "grad_norm": 4.304258346557617, + "learning_rate": 7.070308913672309e-06, + "loss": 0.8142, + "step": 7443 + }, + { + "epoch": 1.9797872340425533, + "grad_norm": 3.6615335941314697, + "learning_rate": 7.069508313092739e-06, + "loss": 0.7409, + "step": 7444 + }, + { + "epoch": 1.9800531914893615, + "grad_norm": 4.02711296081543, + "learning_rate": 7.068707648480145e-06, + "loss": 0.8662, + "step": 7445 + }, + { + "epoch": 1.9803191489361702, + "grad_norm": 3.48976993560791, + "learning_rate": 7.067906919859301e-06, + "loss": 0.7655, + "step": 7446 + }, + { + "epoch": 1.9805851063829787, + "grad_norm": 4.168039321899414, + "learning_rate": 7.067106127254983e-06, + "loss": 0.8516, + "step": 7447 + }, + { + "epoch": 1.9808510638297872, + "grad_norm": 3.757882833480835, + "learning_rate": 7.066305270691965e-06, + "loss": 0.7557, + "step": 7448 + }, + { + "epoch": 1.9811170212765958, + "grad_norm": 4.09896183013916, + "learning_rate": 7.065504350195031e-06, + "loss": 0.7227, + "step": 7449 + }, + { + "epoch": 1.9813829787234043, + "grad_norm": 3.6728386878967285, + "learning_rate": 7.064703365788961e-06, + "loss": 0.8711, + "step": 7450 + }, + { + "epoch": 1.9816489361702128, + "grad_norm": 4.336848735809326, + "learning_rate": 7.063902317498537e-06, + "loss": 0.8427, + "step": 7451 + }, + { + "epoch": 1.9819148936170212, + "grad_norm": 3.715324640274048, + "learning_rate": 7.063101205348546e-06, + "loss": 0.8392, + "step": 
7452 + }, + { + "epoch": 1.9821808510638297, + "grad_norm": 3.8472211360931396, + "learning_rate": 7.062300029363775e-06, + "loss": 0.8386, + "step": 7453 + }, + { + "epoch": 1.9824468085106384, + "grad_norm": 4.4139533042907715, + "learning_rate": 7.061498789569012e-06, + "loss": 0.7736, + "step": 7454 + }, + { + "epoch": 1.9827127659574468, + "grad_norm": 4.422085285186768, + "learning_rate": 7.06069748598905e-06, + "loss": 0.8175, + "step": 7455 + }, + { + "epoch": 1.9829787234042553, + "grad_norm": 4.3708696365356445, + "learning_rate": 7.059896118648681e-06, + "loss": 0.8802, + "step": 7456 + }, + { + "epoch": 1.983244680851064, + "grad_norm": 3.6612091064453125, + "learning_rate": 7.059094687572701e-06, + "loss": 0.73, + "step": 7457 + }, + { + "epoch": 1.9835106382978722, + "grad_norm": 4.2330780029296875, + "learning_rate": 7.058293192785907e-06, + "loss": 0.7638, + "step": 7458 + }, + { + "epoch": 1.983776595744681, + "grad_norm": 4.289926528930664, + "learning_rate": 7.0574916343130995e-06, + "loss": 0.7821, + "step": 7459 + }, + { + "epoch": 1.9840425531914894, + "grad_norm": 4.122095108032227, + "learning_rate": 7.0566900121790775e-06, + "loss": 0.9189, + "step": 7460 + }, + { + "epoch": 1.9843085106382978, + "grad_norm": 3.974686861038208, + "learning_rate": 7.055888326408645e-06, + "loss": 0.7231, + "step": 7461 + }, + { + "epoch": 1.9845744680851065, + "grad_norm": 3.515641450881958, + "learning_rate": 7.055086577026608e-06, + "loss": 0.8235, + "step": 7462 + }, + { + "epoch": 1.9848404255319148, + "grad_norm": 4.1052565574646, + "learning_rate": 7.0542847640577725e-06, + "loss": 0.7862, + "step": 7463 + }, + { + "epoch": 1.9851063829787234, + "grad_norm": 3.889636516571045, + "learning_rate": 7.0534828875269466e-06, + "loss": 0.7854, + "step": 7464 + }, + { + "epoch": 1.985372340425532, + "grad_norm": 4.208193778991699, + "learning_rate": 7.052680947458944e-06, + "loss": 0.7854, + "step": 7465 + }, + { + "epoch": 1.9856382978723404, + "grad_norm": 4.233124732971191, + "learning_rate": 7.051878943878575e-06, + "loss": 0.7895, + "step": 7466 + }, + { + "epoch": 1.985904255319149, + "grad_norm": 4.030735969543457, + "learning_rate": 7.051076876810656e-06, + "loss": 0.8551, + "step": 7467 + }, + { + "epoch": 1.9861702127659573, + "grad_norm": 3.666236639022827, + "learning_rate": 7.050274746280005e-06, + "loss": 0.7758, + "step": 7468 + }, + { + "epoch": 1.986436170212766, + "grad_norm": 3.7510082721710205, + "learning_rate": 7.0494725523114375e-06, + "loss": 0.9323, + "step": 7469 + }, + { + "epoch": 1.9867021276595744, + "grad_norm": 3.9435558319091797, + "learning_rate": 7.048670294929777e-06, + "loss": 0.9059, + "step": 7470 + }, + { + "epoch": 1.986968085106383, + "grad_norm": 3.691020965576172, + "learning_rate": 7.047867974159845e-06, + "loss": 0.7602, + "step": 7471 + }, + { + "epoch": 1.9872340425531916, + "grad_norm": 3.697643518447876, + "learning_rate": 7.047065590026467e-06, + "loss": 0.7624, + "step": 7472 + }, + { + "epoch": 1.9875, + "grad_norm": 3.759286880493164, + "learning_rate": 7.04626314255447e-06, + "loss": 0.8639, + "step": 7473 + }, + { + "epoch": 1.9877659574468085, + "grad_norm": 4.054465293884277, + "learning_rate": 7.045460631768684e-06, + "loss": 0.7268, + "step": 7474 + }, + { + "epoch": 1.988031914893617, + "grad_norm": 4.61219596862793, + "learning_rate": 7.0446580576939346e-06, + "loss": 0.9591, + "step": 7475 + }, + { + "epoch": 1.9882978723404254, + "grad_norm": 4.135398864746094, + "learning_rate": 7.04385542035506e-06, + "loss": 0.9273, + 
"step": 7476 + }, + { + "epoch": 1.9885638297872341, + "grad_norm": 3.8725779056549072, + "learning_rate": 7.043052719776891e-06, + "loss": 0.803, + "step": 7477 + }, + { + "epoch": 1.9888297872340426, + "grad_norm": 3.9959404468536377, + "learning_rate": 7.042249955984265e-06, + "loss": 0.8572, + "step": 7478 + }, + { + "epoch": 1.989095744680851, + "grad_norm": 3.542355537414551, + "learning_rate": 7.041447129002023e-06, + "loss": 0.8041, + "step": 7479 + }, + { + "epoch": 1.9893617021276597, + "grad_norm": 4.780427932739258, + "learning_rate": 7.0406442388550016e-06, + "loss": 0.88, + "step": 7480 + }, + { + "epoch": 1.989627659574468, + "grad_norm": 3.5344386100769043, + "learning_rate": 7.039841285568045e-06, + "loss": 0.7503, + "step": 7481 + }, + { + "epoch": 1.9898936170212767, + "grad_norm": 3.8678970336914062, + "learning_rate": 7.039038269165999e-06, + "loss": 0.74, + "step": 7482 + }, + { + "epoch": 1.9901595744680851, + "grad_norm": 3.366485834121704, + "learning_rate": 7.038235189673706e-06, + "loss": 0.7804, + "step": 7483 + }, + { + "epoch": 1.9904255319148936, + "grad_norm": 3.5538713932037354, + "learning_rate": 7.037432047116018e-06, + "loss": 0.7362, + "step": 7484 + }, + { + "epoch": 1.9906914893617023, + "grad_norm": 4.539484977722168, + "learning_rate": 7.036628841517783e-06, + "loss": 0.8812, + "step": 7485 + }, + { + "epoch": 1.9909574468085105, + "grad_norm": 3.830280065536499, + "learning_rate": 7.035825572903854e-06, + "loss": 0.809, + "step": 7486 + }, + { + "epoch": 1.9912234042553192, + "grad_norm": 4.038280963897705, + "learning_rate": 7.035022241299083e-06, + "loss": 0.7987, + "step": 7487 + }, + { + "epoch": 1.9914893617021276, + "grad_norm": 4.29449462890625, + "learning_rate": 7.034218846728331e-06, + "loss": 0.8703, + "step": 7488 + }, + { + "epoch": 1.991755319148936, + "grad_norm": 4.56672477722168, + "learning_rate": 7.033415389216452e-06, + "loss": 0.9195, + "step": 7489 + }, + { + "epoch": 1.9920212765957448, + "grad_norm": 4.10626745223999, + "learning_rate": 7.032611868788306e-06, + "loss": 0.7476, + "step": 7490 + }, + { + "epoch": 1.992287234042553, + "grad_norm": 3.6163523197174072, + "learning_rate": 7.031808285468756e-06, + "loss": 0.8082, + "step": 7491 + }, + { + "epoch": 1.9925531914893617, + "grad_norm": 4.114681243896484, + "learning_rate": 7.031004639282666e-06, + "loss": 0.9355, + "step": 7492 + }, + { + "epoch": 1.9928191489361702, + "grad_norm": 3.9397499561309814, + "learning_rate": 7.0302009302549e-06, + "loss": 0.7364, + "step": 7493 + }, + { + "epoch": 1.9930851063829786, + "grad_norm": 3.4797003269195557, + "learning_rate": 7.029397158410329e-06, + "loss": 0.8413, + "step": 7494 + }, + { + "epoch": 1.9933510638297873, + "grad_norm": 4.215932369232178, + "learning_rate": 7.028593323773819e-06, + "loss": 0.8095, + "step": 7495 + }, + { + "epoch": 1.9936170212765958, + "grad_norm": 3.694060802459717, + "learning_rate": 7.027789426370244e-06, + "loss": 0.8051, + "step": 7496 + }, + { + "epoch": 1.9938829787234043, + "grad_norm": 4.0490875244140625, + "learning_rate": 7.026985466224477e-06, + "loss": 0.874, + "step": 7497 + }, + { + "epoch": 1.9941489361702127, + "grad_norm": 4.0154194831848145, + "learning_rate": 7.026181443361392e-06, + "loss": 0.807, + "step": 7498 + }, + { + "epoch": 1.9944148936170212, + "grad_norm": 3.8070061206817627, + "learning_rate": 7.025377357805867e-06, + "loss": 0.8078, + "step": 7499 + }, + { + "epoch": 1.9946808510638299, + "grad_norm": 4.185990810394287, + "learning_rate": 7.024573209582783e-06, + 
"loss": 0.7529, + "step": 7500 + }, + { + "epoch": 1.9946808510638299, + "eval_loss": 1.260877251625061, + "eval_runtime": 13.905, + "eval_samples_per_second": 28.767, + "eval_steps_per_second": 3.596, + "step": 7500 + }, + { + "epoch": 1.9949468085106383, + "grad_norm": 3.18033504486084, + "learning_rate": 7.023768998717022e-06, + "loss": 0.7159, + "step": 7501 + }, + { + "epoch": 1.9952127659574468, + "grad_norm": 3.839970111846924, + "learning_rate": 7.022964725233463e-06, + "loss": 0.7902, + "step": 7502 + }, + { + "epoch": 1.9954787234042555, + "grad_norm": 4.011384963989258, + "learning_rate": 7.022160389156995e-06, + "loss": 0.7596, + "step": 7503 + }, + { + "epoch": 1.9957446808510637, + "grad_norm": 3.67543888092041, + "learning_rate": 7.0213559905125016e-06, + "loss": 0.7987, + "step": 7504 + }, + { + "epoch": 1.9960106382978724, + "grad_norm": 4.240528583526611, + "learning_rate": 7.020551529324877e-06, + "loss": 0.8651, + "step": 7505 + }, + { + "epoch": 1.9962765957446809, + "grad_norm": 3.9020180702209473, + "learning_rate": 7.0197470056190075e-06, + "loss": 0.9205, + "step": 7506 + }, + { + "epoch": 1.9965425531914893, + "grad_norm": 4.0633368492126465, + "learning_rate": 7.0189424194197875e-06, + "loss": 0.8294, + "step": 7507 + }, + { + "epoch": 1.996808510638298, + "grad_norm": 3.88988995552063, + "learning_rate": 7.018137770752114e-06, + "loss": 0.861, + "step": 7508 + }, + { + "epoch": 1.9970744680851062, + "grad_norm": 3.5177197456359863, + "learning_rate": 7.01733305964088e-06, + "loss": 0.772, + "step": 7509 + }, + { + "epoch": 1.997340425531915, + "grad_norm": 3.661116123199463, + "learning_rate": 7.016528286110986e-06, + "loss": 0.7985, + "step": 7510 + }, + { + "epoch": 1.9976063829787234, + "grad_norm": 4.28385591506958, + "learning_rate": 7.015723450187334e-06, + "loss": 0.9045, + "step": 7511 + }, + { + "epoch": 1.9978723404255319, + "grad_norm": 3.899296522140503, + "learning_rate": 7.014918551894824e-06, + "loss": 0.7558, + "step": 7512 + }, + { + "epoch": 1.9981382978723405, + "grad_norm": 3.9070241451263428, + "learning_rate": 7.014113591258361e-06, + "loss": 0.8287, + "step": 7513 + }, + { + "epoch": 1.9984042553191488, + "grad_norm": 3.7345831394195557, + "learning_rate": 7.013308568302855e-06, + "loss": 0.781, + "step": 7514 + }, + { + "epoch": 1.9986702127659575, + "grad_norm": 3.6665847301483154, + "learning_rate": 7.012503483053209e-06, + "loss": 0.9715, + "step": 7515 + }, + { + "epoch": 1.998936170212766, + "grad_norm": 3.48984956741333, + "learning_rate": 7.011698335534336e-06, + "loss": 0.6823, + "step": 7516 + }, + { + "epoch": 1.9992021276595744, + "grad_norm": 3.7711336612701416, + "learning_rate": 7.01089312577115e-06, + "loss": 0.8192, + "step": 7517 + }, + { + "epoch": 1.999468085106383, + "grad_norm": 4.02569580078125, + "learning_rate": 7.0100878537885605e-06, + "loss": 0.856, + "step": 7518 + }, + { + "epoch": 1.9997340425531915, + "grad_norm": 4.044494152069092, + "learning_rate": 7.009282519611488e-06, + "loss": 0.8349, + "step": 7519 + }, + { + "epoch": 2.0, + "grad_norm": 3.897979259490967, + "learning_rate": 7.008477123264849e-06, + "loss": 0.6436, + "step": 7520 + } + ], + "logging_steps": 1.0, + "max_steps": 18800, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500.0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 
2.4371418007171236e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}
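
The state above follows the standard Hugging Face Trainer layout: each entry in log_history records epoch, step, loss (or eval_loss for evaluation records), learning_rate, and grad_norm, with run-level fields such as max_steps and num_train_epochs at the end. A minimal sketch for inspecting it, assuming the file is saved as trainer_state.json in the checkpoint directory (the path and the printed summary are illustrative, not part of the log itself):

    import json

    # Load the serialized Trainer state (assumed path; adjust to your checkpoint dir).
    with open("trainer_state.json") as f:
        state = json.load(f)

    # log_history mixes per-step training records ("loss", "learning_rate", "grad_norm")
    # with periodic evaluation records ("eval_loss", "eval_runtime", ...).
    train_logs = [e for e in state["log_history"] if "loss" in e]
    eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

    last = train_logs[-1]
    print(f"epoch {last['epoch']:.4f}  step {last['step']}  "
          f"loss {last['loss']:.4f}  lr {last['learning_rate']:.3e}")

    if eval_logs:
        ev = eval_logs[-1]
        print(f"latest eval_loss {ev['eval_loss']:.4f} at step {ev['step']}")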