diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,34734 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998014165618587, + "eval_steps": 500, + "global_step": 4956, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00020173555620699328, + "grad_norm": 2022.7607421875, + "learning_rate": 6.711409395973155e-08, + "loss": 1.7879, + "step": 1 + }, + { + "epoch": 0.00040347111241398657, + "grad_norm": 22654.603515625, + "learning_rate": 1.342281879194631e-07, + "loss": 1.7023, + "step": 2 + }, + { + "epoch": 0.0006052066686209798, + "grad_norm": 5139.21240234375, + "learning_rate": 2.0134228187919465e-07, + "loss": 2.1801, + "step": 3 + }, + { + "epoch": 0.0008069422248279731, + "grad_norm": 2654.375, + "learning_rate": 2.684563758389262e-07, + "loss": 2.2953, + "step": 4 + }, + { + "epoch": 0.0010086777810349664, + "grad_norm": 4316.03662109375, + "learning_rate": 3.3557046979865777e-07, + "loss": 1.7812, + "step": 5 + }, + { + "epoch": 0.0012104133372419596, + "grad_norm": 7220.66845703125, + "learning_rate": 4.026845637583893e-07, + "loss": 1.8566, + "step": 6 + }, + { + "epoch": 0.001412148893448953, + "grad_norm": 24308.83203125, + "learning_rate": 4.6979865771812087e-07, + "loss": 2.2305, + "step": 7 + }, + { + "epoch": 0.0016138844496559463, + "grad_norm": 11270.6220703125, + "learning_rate": 5.369127516778524e-07, + "loss": 2.5174, + "step": 8 + }, + { + "epoch": 0.0018156200058629397, + "grad_norm": 3041.62646484375, + "learning_rate": 6.04026845637584e-07, + "loss": 1.8983, + "step": 9 + }, + { + "epoch": 0.002017355562069933, + "grad_norm": 4904.11328125, + "learning_rate": 6.711409395973155e-07, + "loss": 1.7648, + "step": 10 + }, + { + "epoch": 0.0022190911182769263, + "grad_norm": 10130.5146484375, + "learning_rate": 7.382550335570471e-07, + "loss": 1.8282, + "step": 11 + }, + { + "epoch": 0.0024208266744839193, + "grad_norm": 4326.14892578125, + "learning_rate": 8.053691275167786e-07, + "loss": 2.0227, + "step": 12 + }, + { + "epoch": 0.0026225622306909127, + "grad_norm": 3840.538818359375, + "learning_rate": 8.724832214765102e-07, + "loss": 1.68, + "step": 13 + }, + { + "epoch": 0.002824297786897906, + "grad_norm": 5577.01171875, + "learning_rate": 9.395973154362417e-07, + "loss": 1.7269, + "step": 14 + }, + { + "epoch": 0.0030260333431048995, + "grad_norm": 6080.58642578125, + "learning_rate": 1.006711409395973e-06, + "loss": 1.6123, + "step": 15 + }, + { + "epoch": 0.0032277688993118925, + "grad_norm": 2213.759521484375, + "learning_rate": 1.0738255033557048e-06, + "loss": 2.2998, + "step": 16 + }, + { + "epoch": 0.003429504455518886, + "grad_norm": 5135.35693359375, + "learning_rate": 1.1409395973154363e-06, + "loss": 2.0855, + "step": 17 + }, + { + "epoch": 0.0036312400117258794, + "grad_norm": 4677.1923828125, + "learning_rate": 1.208053691275168e-06, + "loss": 2.3382, + "step": 18 + }, + { + "epoch": 0.0038329755679328724, + "grad_norm": 1373.9124755859375, + "learning_rate": 1.2751677852348996e-06, + "loss": 1.5623, + "step": 19 + }, + { + "epoch": 0.004034711124139866, + "grad_norm": 1056.8350830078125, + "learning_rate": 1.342281879194631e-06, + "loss": 1.7456, + "step": 20 + }, + { + "epoch": 0.004236446680346859, + "grad_norm": 25182.080078125, + "learning_rate": 1.4093959731543623e-06, + "loss": 1.5903, + "step": 21 + }, + { + "epoch": 0.004438182236553853, + "grad_norm": 10851.70703125, + "learning_rate": 1.4765100671140942e-06, + "loss": 2.3867, + "step": 22 + }, + { + "epoch": 0.004639917792760846, + "grad_norm": 6197.96728515625, + "learning_rate": 1.5436241610738257e-06, + "loss": 2.0905, + "step": 23 + }, + { + "epoch": 0.004841653348967839, + "grad_norm": 1165.3834228515625, + "learning_rate": 1.6107382550335572e-06, + "loss": 1.5915, + "step": 24 + }, + { + "epoch": 0.005043388905174832, + "grad_norm": 7833.49169921875, + "learning_rate": 1.6778523489932889e-06, + "loss": 1.593, + "step": 25 + }, + { + "epoch": 0.005245124461381825, + "grad_norm": 10110.6279296875, + "learning_rate": 1.7449664429530203e-06, + "loss": 1.9986, + "step": 26 + }, + { + "epoch": 0.005446860017588819, + "grad_norm": 6629.67138671875, + "learning_rate": 1.8120805369127518e-06, + "loss": 2.5279, + "step": 27 + }, + { + "epoch": 0.005648595573795812, + "grad_norm": 4542.9150390625, + "learning_rate": 1.8791946308724835e-06, + "loss": 1.6785, + "step": 28 + }, + { + "epoch": 0.005850331130002806, + "grad_norm": 2227.125, + "learning_rate": 1.9463087248322147e-06, + "loss": 1.4933, + "step": 29 + }, + { + "epoch": 0.006052066686209799, + "grad_norm": 788.425048828125, + "learning_rate": 2.013422818791946e-06, + "loss": 1.4813, + "step": 30 + }, + { + "epoch": 0.006253802242416792, + "grad_norm": 3233.39111328125, + "learning_rate": 2.080536912751678e-06, + "loss": 1.4689, + "step": 31 + }, + { + "epoch": 0.006455537798623785, + "grad_norm": 1077.7388916015625, + "learning_rate": 2.1476510067114096e-06, + "loss": 1.4551, + "step": 32 + }, + { + "epoch": 0.0066572733548307785, + "grad_norm": 800.4613647460938, + "learning_rate": 2.2147651006711415e-06, + "loss": 1.4164, + "step": 33 + }, + { + "epoch": 0.006859008911037772, + "grad_norm": 329.1565856933594, + "learning_rate": 2.2818791946308725e-06, + "loss": 1.4257, + "step": 34 + }, + { + "epoch": 0.007060744467244765, + "grad_norm": 480.46246337890625, + "learning_rate": 2.348993288590604e-06, + "loss": 1.5043, + "step": 35 + }, + { + "epoch": 0.007262480023451759, + "grad_norm": 620.2860717773438, + "learning_rate": 2.416107382550336e-06, + "loss": 1.5757, + "step": 36 + }, + { + "epoch": 0.007464215579658751, + "grad_norm": 472.3393859863281, + "learning_rate": 2.4832214765100673e-06, + "loss": 1.3904, + "step": 37 + }, + { + "epoch": 0.007665951135865745, + "grad_norm": 845.4601440429688, + "learning_rate": 2.5503355704697992e-06, + "loss": 1.6462, + "step": 38 + }, + { + "epoch": 0.007867686692072738, + "grad_norm": 1025.87548828125, + "learning_rate": 2.6174496644295307e-06, + "loss": 1.8661, + "step": 39 + }, + { + "epoch": 0.008069422248279732, + "grad_norm": 219.7495880126953, + "learning_rate": 2.684563758389262e-06, + "loss": 1.4307, + "step": 40 + }, + { + "epoch": 0.008271157804486725, + "grad_norm": 102.17412567138672, + "learning_rate": 2.7516778523489936e-06, + "loss": 1.2734, + "step": 41 + }, + { + "epoch": 0.008472893360693718, + "grad_norm": 1972.4371337890625, + "learning_rate": 2.8187919463087247e-06, + "loss": 1.2536, + "step": 42 + }, + { + "epoch": 0.008674628916900712, + "grad_norm": 7863.28759765625, + "learning_rate": 2.885906040268457e-06, + "loss": 1.5733, + "step": 43 + }, + { + "epoch": 0.008876364473107705, + "grad_norm": 493.8641357421875, + "learning_rate": 2.9530201342281885e-06, + "loss": 1.3364, + "step": 44 + }, + { + "epoch": 0.009078100029314699, + "grad_norm": 128.04525756835938, + "learning_rate": 3.02013422818792e-06, + "loss": 1.3062, + "step": 45 + }, + { + "epoch": 0.009279835585521692, + "grad_norm": 1128.474609375, + "learning_rate": 3.0872483221476514e-06, + "loss": 1.3676, + "step": 46 + }, + { + "epoch": 0.009481571141728684, + "grad_norm": 47.335845947265625, + "learning_rate": 3.154362416107383e-06, + "loss": 1.2489, + "step": 47 + }, + { + "epoch": 0.009683306697935677, + "grad_norm": 38.27725601196289, + "learning_rate": 3.2214765100671143e-06, + "loss": 1.271, + "step": 48 + }, + { + "epoch": 0.00988504225414267, + "grad_norm": 83.20748901367188, + "learning_rate": 3.2885906040268462e-06, + "loss": 1.8054, + "step": 49 + }, + { + "epoch": 0.010086777810349664, + "grad_norm": 232.52198791503906, + "learning_rate": 3.3557046979865777e-06, + "loss": 1.3684, + "step": 50 + }, + { + "epoch": 0.010288513366556657, + "grad_norm": 46.04096221923828, + "learning_rate": 3.422818791946309e-06, + "loss": 1.2638, + "step": 51 + }, + { + "epoch": 0.01049024892276365, + "grad_norm": 91.04130554199219, + "learning_rate": 3.4899328859060407e-06, + "loss": 1.1821, + "step": 52 + }, + { + "epoch": 0.010691984478970644, + "grad_norm": 75.88416290283203, + "learning_rate": 3.557046979865772e-06, + "loss": 1.4274, + "step": 53 + }, + { + "epoch": 0.010893720035177638, + "grad_norm": 702.1119995117188, + "learning_rate": 3.6241610738255036e-06, + "loss": 1.1664, + "step": 54 + }, + { + "epoch": 0.011095455591384631, + "grad_norm": 21.94797706604004, + "learning_rate": 3.6912751677852355e-06, + "loss": 1.4295, + "step": 55 + }, + { + "epoch": 0.011297191147591625, + "grad_norm": 25.085304260253906, + "learning_rate": 3.758389261744967e-06, + "loss": 1.2236, + "step": 56 + }, + { + "epoch": 0.011498926703798618, + "grad_norm": 9.833930969238281, + "learning_rate": 3.825503355704698e-06, + "loss": 1.1847, + "step": 57 + }, + { + "epoch": 0.011700662260005611, + "grad_norm": 12.627878189086914, + "learning_rate": 3.8926174496644295e-06, + "loss": 1.0892, + "step": 58 + }, + { + "epoch": 0.011902397816212605, + "grad_norm": 17.127168655395508, + "learning_rate": 3.959731543624161e-06, + "loss": 1.264, + "step": 59 + }, + { + "epoch": 0.012104133372419598, + "grad_norm": 9.815498352050781, + "learning_rate": 4.026845637583892e-06, + "loss": 1.0999, + "step": 60 + }, + { + "epoch": 0.01230586892862659, + "grad_norm": 23.42744255065918, + "learning_rate": 4.093959731543625e-06, + "loss": 1.0759, + "step": 61 + }, + { + "epoch": 0.012507604484833583, + "grad_norm": 32.74321746826172, + "learning_rate": 4.161073825503356e-06, + "loss": 1.3767, + "step": 62 + }, + { + "epoch": 0.012709340041040577, + "grad_norm": 10.060574531555176, + "learning_rate": 4.228187919463088e-06, + "loss": 1.137, + "step": 63 + }, + { + "epoch": 0.01291107559724757, + "grad_norm": 25.0057373046875, + "learning_rate": 4.295302013422819e-06, + "loss": 1.3044, + "step": 64 + }, + { + "epoch": 0.013112811153454564, + "grad_norm": 20.90718650817871, + "learning_rate": 4.362416107382551e-06, + "loss": 1.0647, + "step": 65 + }, + { + "epoch": 0.013314546709661557, + "grad_norm": 11.432819366455078, + "learning_rate": 4.429530201342283e-06, + "loss": 1.1671, + "step": 66 + }, + { + "epoch": 0.01351628226586855, + "grad_norm": 35.13310241699219, + "learning_rate": 4.4966442953020135e-06, + "loss": 1.0784, + "step": 67 + }, + { + "epoch": 0.013718017822075544, + "grad_norm": 18.177671432495117, + "learning_rate": 4.563758389261745e-06, + "loss": 1.0769, + "step": 68 + }, + { + "epoch": 0.013919753378282537, + "grad_norm": 239.5286102294922, + "learning_rate": 4.6308724832214765e-06, + "loss": 1.2211, + "step": 69 + }, + { + "epoch": 0.01412148893448953, + "grad_norm": 6.744454383850098, + "learning_rate": 4.697986577181208e-06, + "loss": 1.1171, + "step": 70 + }, + { + "epoch": 0.014323224490696524, + "grad_norm": 5.52532434463501, + "learning_rate": 4.765100671140939e-06, + "loss": 1.0494, + "step": 71 + }, + { + "epoch": 0.014524960046903517, + "grad_norm": 4.606289863586426, + "learning_rate": 4.832214765100672e-06, + "loss": 1.0316, + "step": 72 + }, + { + "epoch": 0.014726695603110511, + "grad_norm": 60.82453536987305, + "learning_rate": 4.899328859060403e-06, + "loss": 1.0452, + "step": 73 + }, + { + "epoch": 0.014928431159317503, + "grad_norm": 6.075560569763184, + "learning_rate": 4.966442953020135e-06, + "loss": 1.1055, + "step": 74 + }, + { + "epoch": 0.015130166715524496, + "grad_norm": 28.668581008911133, + "learning_rate": 5.033557046979867e-06, + "loss": 1.3083, + "step": 75 + }, + { + "epoch": 0.01533190227173149, + "grad_norm": 5.29589319229126, + "learning_rate": 5.1006711409395985e-06, + "loss": 1.3318, + "step": 76 + }, + { + "epoch": 0.015533637827938483, + "grad_norm": 11.307202339172363, + "learning_rate": 5.16778523489933e-06, + "loss": 1.0679, + "step": 77 + }, + { + "epoch": 0.015735373384145476, + "grad_norm": 12.506169319152832, + "learning_rate": 5.234899328859061e-06, + "loss": 1.0901, + "step": 78 + }, + { + "epoch": 0.01593710894035247, + "grad_norm": 3.876575469970703, + "learning_rate": 5.302013422818793e-06, + "loss": 1.2399, + "step": 79 + }, + { + "epoch": 0.016138844496559463, + "grad_norm": 5.603764533996582, + "learning_rate": 5.369127516778524e-06, + "loss": 0.9899, + "step": 80 + }, + { + "epoch": 0.016340580052766455, + "grad_norm": 3.92093563079834, + "learning_rate": 5.436241610738256e-06, + "loss": 1.0783, + "step": 81 + }, + { + "epoch": 0.01654231560897345, + "grad_norm": 4.067335605621338, + "learning_rate": 5.503355704697987e-06, + "loss": 1.2255, + "step": 82 + }, + { + "epoch": 0.01674405116518044, + "grad_norm": 23.241966247558594, + "learning_rate": 5.570469798657718e-06, + "loss": 1.0219, + "step": 83 + }, + { + "epoch": 0.016945786721387437, + "grad_norm": 18.93113136291504, + "learning_rate": 5.637583892617449e-06, + "loss": 1.2462, + "step": 84 + }, + { + "epoch": 0.01714752227759443, + "grad_norm": 3.556022882461548, + "learning_rate": 5.704697986577181e-06, + "loss": 1.0287, + "step": 85 + }, + { + "epoch": 0.017349257833801424, + "grad_norm": 6.718225479125977, + "learning_rate": 5.771812080536914e-06, + "loss": 1.0052, + "step": 86 + }, + { + "epoch": 0.017550993390008415, + "grad_norm": 16.812232971191406, + "learning_rate": 5.8389261744966455e-06, + "loss": 1.0208, + "step": 87 + }, + { + "epoch": 0.01775272894621541, + "grad_norm": 10.673800468444824, + "learning_rate": 5.906040268456377e-06, + "loss": 1.0182, + "step": 88 + }, + { + "epoch": 0.017954464502422402, + "grad_norm": 4.149665832519531, + "learning_rate": 5.973154362416108e-06, + "loss": 0.9561, + "step": 89 + }, + { + "epoch": 0.018156200058629397, + "grad_norm": 412.8095703125, + "learning_rate": 6.04026845637584e-06, + "loss": 1.1994, + "step": 90 + }, + { + "epoch": 0.01835793561483639, + "grad_norm": 10.464238166809082, + "learning_rate": 6.107382550335571e-06, + "loss": 0.9532, + "step": 91 + }, + { + "epoch": 0.018559671171043384, + "grad_norm": 5.531954765319824, + "learning_rate": 6.174496644295303e-06, + "loss": 1.0273, + "step": 92 + }, + { + "epoch": 0.018761406727250376, + "grad_norm": 13.847434997558594, + "learning_rate": 6.241610738255034e-06, + "loss": 1.2019, + "step": 93 + }, + { + "epoch": 0.018963142283457367, + "grad_norm": 12.25539779663086, + "learning_rate": 6.308724832214766e-06, + "loss": 0.9668, + "step": 94 + }, + { + "epoch": 0.019164877839664363, + "grad_norm": 7.894690036773682, + "learning_rate": 6.375838926174497e-06, + "loss": 0.9212, + "step": 95 + }, + { + "epoch": 0.019366613395871354, + "grad_norm": 14.030834197998047, + "learning_rate": 6.442953020134229e-06, + "loss": 0.9981, + "step": 96 + }, + { + "epoch": 0.01956834895207835, + "grad_norm": 3.4250216484069824, + "learning_rate": 6.51006711409396e-06, + "loss": 0.995, + "step": 97 + }, + { + "epoch": 0.01977008450828534, + "grad_norm": 6.011563301086426, + "learning_rate": 6.5771812080536925e-06, + "loss": 1.0305, + "step": 98 + }, + { + "epoch": 0.019971820064492336, + "grad_norm": 15.089917182922363, + "learning_rate": 6.644295302013424e-06, + "loss": 1.0044, + "step": 99 + }, + { + "epoch": 0.020173555620699328, + "grad_norm": 9.317388534545898, + "learning_rate": 6.711409395973155e-06, + "loss": 0.9269, + "step": 100 + }, + { + "epoch": 0.020375291176906323, + "grad_norm": 17.396928787231445, + "learning_rate": 6.778523489932887e-06, + "loss": 1.1154, + "step": 101 + }, + { + "epoch": 0.020577026733113315, + "grad_norm": 5.767102241516113, + "learning_rate": 6.845637583892618e-06, + "loss": 1.0193, + "step": 102 + }, + { + "epoch": 0.02077876228932031, + "grad_norm": 38.032161712646484, + "learning_rate": 6.91275167785235e-06, + "loss": 0.9445, + "step": 103 + }, + { + "epoch": 0.0209804978455273, + "grad_norm": 23.336257934570312, + "learning_rate": 6.979865771812081e-06, + "loss": 1.149, + "step": 104 + }, + { + "epoch": 0.021182233401734297, + "grad_norm": 84.36126708984375, + "learning_rate": 7.046979865771813e-06, + "loss": 0.9812, + "step": 105 + }, + { + "epoch": 0.02138396895794129, + "grad_norm": 245.55101013183594, + "learning_rate": 7.114093959731544e-06, + "loss": 0.9889, + "step": 106 + }, + { + "epoch": 0.021585704514148284, + "grad_norm": 17.846338272094727, + "learning_rate": 7.181208053691276e-06, + "loss": 1.1272, + "step": 107 + }, + { + "epoch": 0.021787440070355275, + "grad_norm": 36.46464157104492, + "learning_rate": 7.248322147651007e-06, + "loss": 0.9686, + "step": 108 + }, + { + "epoch": 0.021989175626562267, + "grad_norm": 3.497917413711548, + "learning_rate": 7.3154362416107395e-06, + "loss": 0.9897, + "step": 109 + }, + { + "epoch": 0.022190911182769262, + "grad_norm": 15.46224594116211, + "learning_rate": 7.382550335570471e-06, + "loss": 1.3466, + "step": 110 + }, + { + "epoch": 0.022392646738976254, + "grad_norm": 18.964536666870117, + "learning_rate": 7.4496644295302024e-06, + "loss": 1.3584, + "step": 111 + }, + { + "epoch": 0.02259438229518325, + "grad_norm": 7.033268928527832, + "learning_rate": 7.516778523489934e-06, + "loss": 0.9985, + "step": 112 + }, + { + "epoch": 0.02279611785139024, + "grad_norm": 186.49122619628906, + "learning_rate": 7.583892617449665e-06, + "loss": 0.9712, + "step": 113 + }, + { + "epoch": 0.022997853407597236, + "grad_norm": 188.2796630859375, + "learning_rate": 7.651006711409396e-06, + "loss": 1.1078, + "step": 114 + }, + { + "epoch": 0.023199588963804228, + "grad_norm": 17.890708923339844, + "learning_rate": 7.718120805369127e-06, + "loss": 0.9729, + "step": 115 + }, + { + "epoch": 0.023401324520011223, + "grad_norm": 15.275798797607422, + "learning_rate": 7.785234899328859e-06, + "loss": 1.2821, + "step": 116 + }, + { + "epoch": 0.023603060076218214, + "grad_norm": 20.20365333557129, + "learning_rate": 7.85234899328859e-06, + "loss": 1.0977, + "step": 117 + }, + { + "epoch": 0.02380479563242521, + "grad_norm": 5.463934421539307, + "learning_rate": 7.919463087248322e-06, + "loss": 0.9282, + "step": 118 + }, + { + "epoch": 0.0240065311886322, + "grad_norm": 4.599616050720215, + "learning_rate": 7.986577181208053e-06, + "loss": 0.9563, + "step": 119 + }, + { + "epoch": 0.024208266744839196, + "grad_norm": 10.358293533325195, + "learning_rate": 8.053691275167785e-06, + "loss": 1.1196, + "step": 120 + }, + { + "epoch": 0.024410002301046188, + "grad_norm": 10.99390697479248, + "learning_rate": 8.120805369127518e-06, + "loss": 0.9119, + "step": 121 + }, + { + "epoch": 0.02461173785725318, + "grad_norm": 226.3212432861328, + "learning_rate": 8.18791946308725e-06, + "loss": 0.9652, + "step": 122 + }, + { + "epoch": 0.024813473413460175, + "grad_norm": 188.92813110351562, + "learning_rate": 8.255033557046981e-06, + "loss": 0.9699, + "step": 123 + }, + { + "epoch": 0.025015208969667167, + "grad_norm": 2.4782161712646484, + "learning_rate": 8.322147651006712e-06, + "loss": 0.9455, + "step": 124 + }, + { + "epoch": 0.025216944525874162, + "grad_norm": 27.041120529174805, + "learning_rate": 8.389261744966444e-06, + "loss": 1.1747, + "step": 125 + }, + { + "epoch": 0.025418680082081153, + "grad_norm": 8.053828239440918, + "learning_rate": 8.456375838926175e-06, + "loss": 1.1885, + "step": 126 + }, + { + "epoch": 0.02562041563828815, + "grad_norm": 3.6767969131469727, + "learning_rate": 8.523489932885907e-06, + "loss": 1.0123, + "step": 127 + }, + { + "epoch": 0.02582215119449514, + "grad_norm": 7.078328609466553, + "learning_rate": 8.590604026845638e-06, + "loss": 0.9259, + "step": 128 + }, + { + "epoch": 0.026023886750702135, + "grad_norm": 15.417545318603516, + "learning_rate": 8.65771812080537e-06, + "loss": 0.9523, + "step": 129 + }, + { + "epoch": 0.026225622306909127, + "grad_norm": 12.301921844482422, + "learning_rate": 8.724832214765101e-06, + "loss": 0.9276, + "step": 130 + }, + { + "epoch": 0.026427357863116122, + "grad_norm": 13.825284957885742, + "learning_rate": 8.791946308724833e-06, + "loss": 0.8871, + "step": 131 + }, + { + "epoch": 0.026629093419323114, + "grad_norm": 4.011205196380615, + "learning_rate": 8.859060402684566e-06, + "loss": 0.959, + "step": 132 + }, + { + "epoch": 0.02683082897553011, + "grad_norm": 6.832649230957031, + "learning_rate": 8.926174496644297e-06, + "loss": 0.9345, + "step": 133 + }, + { + "epoch": 0.0270325645317371, + "grad_norm": 10.89799976348877, + "learning_rate": 8.993288590604027e-06, + "loss": 0.9571, + "step": 134 + }, + { + "epoch": 0.027234300087944092, + "grad_norm": 15.070175170898438, + "learning_rate": 9.060402684563759e-06, + "loss": 0.9965, + "step": 135 + }, + { + "epoch": 0.027436035644151088, + "grad_norm": 9.60452938079834, + "learning_rate": 9.12751677852349e-06, + "loss": 0.9057, + "step": 136 + }, + { + "epoch": 0.02763777120035808, + "grad_norm": 4.11983060836792, + "learning_rate": 9.194630872483221e-06, + "loss": 0.9351, + "step": 137 + }, + { + "epoch": 0.027839506756565074, + "grad_norm": 8.37743091583252, + "learning_rate": 9.261744966442953e-06, + "loss": 1.1355, + "step": 138 + }, + { + "epoch": 0.028041242312772066, + "grad_norm": 4.7272820472717285, + "learning_rate": 9.328859060402684e-06, + "loss": 1.0328, + "step": 139 + }, + { + "epoch": 0.02824297786897906, + "grad_norm": 11.339620590209961, + "learning_rate": 9.395973154362416e-06, + "loss": 1.3605, + "step": 140 + }, + { + "epoch": 0.028444713425186053, + "grad_norm": 295.88519287109375, + "learning_rate": 9.463087248322147e-06, + "loss": 0.9109, + "step": 141 + }, + { + "epoch": 0.028646448981393048, + "grad_norm": 70.61186218261719, + "learning_rate": 9.530201342281879e-06, + "loss": 0.9349, + "step": 142 + }, + { + "epoch": 0.02884818453760004, + "grad_norm": 14.863121032714844, + "learning_rate": 9.59731543624161e-06, + "loss": 0.9364, + "step": 143 + }, + { + "epoch": 0.029049920093807035, + "grad_norm": 9.727048873901367, + "learning_rate": 9.664429530201343e-06, + "loss": 0.8834, + "step": 144 + }, + { + "epoch": 0.029251655650014027, + "grad_norm": 2.5803744792938232, + "learning_rate": 9.731543624161075e-06, + "loss": 0.9208, + "step": 145 + }, + { + "epoch": 0.029453391206221022, + "grad_norm": 9.345537185668945, + "learning_rate": 9.798657718120806e-06, + "loss": 0.9386, + "step": 146 + }, + { + "epoch": 0.029655126762428013, + "grad_norm": 5.340958118438721, + "learning_rate": 9.865771812080538e-06, + "loss": 1.2306, + "step": 147 + }, + { + "epoch": 0.029856862318635005, + "grad_norm": 11.54121208190918, + "learning_rate": 9.93288590604027e-06, + "loss": 0.9641, + "step": 148 + }, + { + "epoch": 0.030058597874842, + "grad_norm": 16.07155418395996, + "learning_rate": 1e-05, + "loss": 0.8883, + "step": 149 + }, + { + "epoch": 0.030260333431048992, + "grad_norm": 2.79647159576416, + "learning_rate": 9.999998932196122e-06, + "loss": 0.9043, + "step": 150 + }, + { + "epoch": 0.030462068987255987, + "grad_norm": 14.367652893066406, + "learning_rate": 9.99999572878494e-06, + "loss": 0.9166, + "step": 151 + }, + { + "epoch": 0.03066380454346298, + "grad_norm": 2.824852466583252, + "learning_rate": 9.999990389767822e-06, + "loss": 0.9134, + "step": 152 + }, + { + "epoch": 0.030865540099669974, + "grad_norm": 3.328451156616211, + "learning_rate": 9.999982915147052e-06, + "loss": 1.3344, + "step": 153 + }, + { + "epoch": 0.031067275655876966, + "grad_norm": 7.940481185913086, + "learning_rate": 9.99997330492582e-06, + "loss": 1.0262, + "step": 154 + }, + { + "epoch": 0.03126901121208396, + "grad_norm": 5.5620856285095215, + "learning_rate": 9.999961559108231e-06, + "loss": 0.9052, + "step": 155 + }, + { + "epoch": 0.03147074676829095, + "grad_norm": 6.570486068725586, + "learning_rate": 9.999947677699302e-06, + "loss": 0.8743, + "step": 156 + }, + { + "epoch": 0.031672482324497944, + "grad_norm": 2.9782333374023438, + "learning_rate": 9.999931660704962e-06, + "loss": 0.9057, + "step": 157 + }, + { + "epoch": 0.03187421788070494, + "grad_norm": 6.992729663848877, + "learning_rate": 9.999913508132052e-06, + "loss": 1.0055, + "step": 158 + }, + { + "epoch": 0.032075953436911935, + "grad_norm": 3.99275803565979, + "learning_rate": 9.999893219988329e-06, + "loss": 0.9025, + "step": 159 + }, + { + "epoch": 0.032277688993118926, + "grad_norm": 10.511117935180664, + "learning_rate": 9.999870796282452e-06, + "loss": 0.8839, + "step": 160 + }, + { + "epoch": 0.03247942454932592, + "grad_norm": 3.7621190547943115, + "learning_rate": 9.999846237024003e-06, + "loss": 0.8776, + "step": 161 + }, + { + "epoch": 0.03268116010553291, + "grad_norm": 2.77579402923584, + "learning_rate": 9.99981954222347e-06, + "loss": 0.8678, + "step": 162 + }, + { + "epoch": 0.03288289566173991, + "grad_norm": 5.247135162353516, + "learning_rate": 9.999790711892255e-06, + "loss": 0.9416, + "step": 163 + }, + { + "epoch": 0.0330846312179469, + "grad_norm": 3.735067844390869, + "learning_rate": 9.999759746042674e-06, + "loss": 0.8518, + "step": 164 + }, + { + "epoch": 0.03328636677415389, + "grad_norm": 3.955429792404175, + "learning_rate": 9.999726644687952e-06, + "loss": 0.8846, + "step": 165 + }, + { + "epoch": 0.03348810233036088, + "grad_norm": 7.231546401977539, + "learning_rate": 9.999691407842228e-06, + "loss": 0.8946, + "step": 166 + }, + { + "epoch": 0.03368983788656788, + "grad_norm": 22.583908081054688, + "learning_rate": 9.999654035520548e-06, + "loss": 1.0744, + "step": 167 + }, + { + "epoch": 0.033891573442774874, + "grad_norm": 31.7758846282959, + "learning_rate": 9.999614527738882e-06, + "loss": 0.8924, + "step": 168 + }, + { + "epoch": 0.034093308998981865, + "grad_norm": 3.467855453491211, + "learning_rate": 9.999572884514098e-06, + "loss": 0.8897, + "step": 169 + }, + { + "epoch": 0.03429504455518886, + "grad_norm": 3.608447551727295, + "learning_rate": 9.999529105863986e-06, + "loss": 0.9032, + "step": 170 + }, + { + "epoch": 0.034496780111395856, + "grad_norm": 3.091294050216675, + "learning_rate": 9.999483191807245e-06, + "loss": 0.863, + "step": 171 + }, + { + "epoch": 0.03469851566760285, + "grad_norm": 6.513212203979492, + "learning_rate": 9.999435142363484e-06, + "loss": 0.8868, + "step": 172 + }, + { + "epoch": 0.03490025122380984, + "grad_norm": 1.8056596517562866, + "learning_rate": 9.999384957553228e-06, + "loss": 0.8478, + "step": 173 + }, + { + "epoch": 0.03510198678001683, + "grad_norm": 3.0379676818847656, + "learning_rate": 9.99933263739791e-06, + "loss": 1.0226, + "step": 174 + }, + { + "epoch": 0.03530372233622382, + "grad_norm": 2.665609121322632, + "learning_rate": 9.99927818191988e-06, + "loss": 0.9337, + "step": 175 + }, + { + "epoch": 0.03550545789243082, + "grad_norm": 8.217555046081543, + "learning_rate": 9.999221591142395e-06, + "loss": 0.8608, + "step": 176 + }, + { + "epoch": 0.03570719344863781, + "grad_norm": 2.4761505126953125, + "learning_rate": 9.999162865089625e-06, + "loss": 0.858, + "step": 177 + }, + { + "epoch": 0.035908929004844804, + "grad_norm": 4.345452308654785, + "learning_rate": 9.999102003786655e-06, + "loss": 1.0062, + "step": 178 + }, + { + "epoch": 0.036110664561051796, + "grad_norm": 3.3440897464752197, + "learning_rate": 9.99903900725948e-06, + "loss": 0.8589, + "step": 179 + }, + { + "epoch": 0.036312400117258795, + "grad_norm": 4.387753009796143, + "learning_rate": 9.998973875535006e-06, + "loss": 0.8124, + "step": 180 + }, + { + "epoch": 0.036514135673465786, + "grad_norm": 2.263735055923462, + "learning_rate": 9.998906608641055e-06, + "loss": 0.8761, + "step": 181 + }, + { + "epoch": 0.03671587122967278, + "grad_norm": 4.443448066711426, + "learning_rate": 9.998837206606355e-06, + "loss": 0.808, + "step": 182 + }, + { + "epoch": 0.03691760678587977, + "grad_norm": 2.865161895751953, + "learning_rate": 9.998765669460551e-06, + "loss": 0.9949, + "step": 183 + }, + { + "epoch": 0.03711934234208677, + "grad_norm": 1.972935676574707, + "learning_rate": 9.998691997234196e-06, + "loss": 0.8248, + "step": 184 + }, + { + "epoch": 0.03732107789829376, + "grad_norm": 2.0238354206085205, + "learning_rate": 9.998616189958758e-06, + "loss": 0.8415, + "step": 185 + }, + { + "epoch": 0.03752281345450075, + "grad_norm": 2.7127320766448975, + "learning_rate": 9.998538247666618e-06, + "loss": 0.7998, + "step": 186 + }, + { + "epoch": 0.03772454901070774, + "grad_norm": 2.8492398262023926, + "learning_rate": 9.998458170391065e-06, + "loss": 0.8178, + "step": 187 + }, + { + "epoch": 0.037926284566914735, + "grad_norm": 2.2515549659729004, + "learning_rate": 9.998375958166301e-06, + "loss": 0.9846, + "step": 188 + }, + { + "epoch": 0.038128020123121734, + "grad_norm": 2.6660799980163574, + "learning_rate": 9.998291611027441e-06, + "loss": 0.8962, + "step": 189 + }, + { + "epoch": 0.038329755679328725, + "grad_norm": 2.132474660873413, + "learning_rate": 9.998205129010515e-06, + "loss": 0.8482, + "step": 190 + }, + { + "epoch": 0.03853149123553572, + "grad_norm": 3.912343740463257, + "learning_rate": 9.998116512152456e-06, + "loss": 0.867, + "step": 191 + }, + { + "epoch": 0.03873322679174271, + "grad_norm": 1.7469815015792847, + "learning_rate": 9.998025760491117e-06, + "loss": 1.018, + "step": 192 + }, + { + "epoch": 0.03893496234794971, + "grad_norm": 1.599955439567566, + "learning_rate": 9.997932874065259e-06, + "loss": 0.9208, + "step": 193 + }, + { + "epoch": 0.0391366979041567, + "grad_norm": 3.3400845527648926, + "learning_rate": 9.997837852914557e-06, + "loss": 0.8257, + "step": 194 + }, + { + "epoch": 0.03933843346036369, + "grad_norm": 5.241814136505127, + "learning_rate": 9.997740697079595e-06, + "loss": 0.8121, + "step": 195 + }, + { + "epoch": 0.03954016901657068, + "grad_norm": 3.9843716621398926, + "learning_rate": 9.99764140660187e-06, + "loss": 0.8926, + "step": 196 + }, + { + "epoch": 0.03974190457277768, + "grad_norm": 2.6611931324005127, + "learning_rate": 9.997539981523794e-06, + "loss": 0.8292, + "step": 197 + }, + { + "epoch": 0.03994364012898467, + "grad_norm": 0.9562042355537415, + "learning_rate": 9.997436421888685e-06, + "loss": 0.8107, + "step": 198 + }, + { + "epoch": 0.040145375685191664, + "grad_norm": 3.046618700027466, + "learning_rate": 9.997330727740778e-06, + "loss": 0.9869, + "step": 199 + }, + { + "epoch": 0.040347111241398656, + "grad_norm": 3.274746894836426, + "learning_rate": 9.997222899125214e-06, + "loss": 0.7945, + "step": 200 + }, + { + "epoch": 0.04054884679760565, + "grad_norm": 2.0076584815979004, + "learning_rate": 9.997112936088052e-06, + "loss": 0.9723, + "step": 201 + }, + { + "epoch": 0.040750582353812646, + "grad_norm": 2.4189558029174805, + "learning_rate": 9.997000838676258e-06, + "loss": 0.8047, + "step": 202 + }, + { + "epoch": 0.04095231791001964, + "grad_norm": 1.8270951509475708, + "learning_rate": 9.996886606937712e-06, + "loss": 0.8327, + "step": 203 + }, + { + "epoch": 0.04115405346622663, + "grad_norm": 1.598482370376587, + "learning_rate": 9.996770240921205e-06, + "loss": 0.8465, + "step": 204 + }, + { + "epoch": 0.04135578902243362, + "grad_norm": 3.2114081382751465, + "learning_rate": 9.996651740676439e-06, + "loss": 0.7961, + "step": 205 + }, + { + "epoch": 0.04155752457864062, + "grad_norm": 1.7791095972061157, + "learning_rate": 9.996531106254027e-06, + "loss": 0.788, + "step": 206 + }, + { + "epoch": 0.04175926013484761, + "grad_norm": 4.112730503082275, + "learning_rate": 9.996408337705497e-06, + "loss": 0.7666, + "step": 207 + }, + { + "epoch": 0.0419609956910546, + "grad_norm": 4.838507175445557, + "learning_rate": 9.996283435083282e-06, + "loss": 0.8164, + "step": 208 + }, + { + "epoch": 0.042162731247261595, + "grad_norm": 7.356931209564209, + "learning_rate": 9.996156398440735e-06, + "loss": 0.8305, + "step": 209 + }, + { + "epoch": 0.042364466803468594, + "grad_norm": 21.874197006225586, + "learning_rate": 9.996027227832114e-06, + "loss": 0.806, + "step": 210 + }, + { + "epoch": 0.042566202359675585, + "grad_norm": 9.663326263427734, + "learning_rate": 9.99589592331259e-06, + "loss": 0.7798, + "step": 211 + }, + { + "epoch": 0.04276793791588258, + "grad_norm": 3.5580976009368896, + "learning_rate": 9.995762484938247e-06, + "loss": 0.8254, + "step": 212 + }, + { + "epoch": 0.04296967347208957, + "grad_norm": 5.485879898071289, + "learning_rate": 9.995626912766081e-06, + "loss": 0.7719, + "step": 213 + }, + { + "epoch": 0.04317140902829657, + "grad_norm": 1.792300820350647, + "learning_rate": 9.995489206853995e-06, + "loss": 0.8067, + "step": 214 + }, + { + "epoch": 0.04337314458450356, + "grad_norm": 2.1983845233917236, + "learning_rate": 9.995349367260807e-06, + "loss": 0.762, + "step": 215 + }, + { + "epoch": 0.04357488014071055, + "grad_norm": 7.275660991668701, + "learning_rate": 9.995207394046245e-06, + "loss": 0.7808, + "step": 216 + }, + { + "epoch": 0.04377661569691754, + "grad_norm": 1.4585134983062744, + "learning_rate": 9.99506328727095e-06, + "loss": 0.7906, + "step": 217 + }, + { + "epoch": 0.043978351253124534, + "grad_norm": 1.4152764081954956, + "learning_rate": 9.994917046996472e-06, + "loss": 0.8777, + "step": 218 + }, + { + "epoch": 0.04418008680933153, + "grad_norm": 1.9685487747192383, + "learning_rate": 9.994768673285275e-06, + "loss": 0.9062, + "step": 219 + }, + { + "epoch": 0.044381822365538524, + "grad_norm": 1.1828324794769287, + "learning_rate": 9.99461816620073e-06, + "loss": 1.0103, + "step": 220 + }, + { + "epoch": 0.044583557921745516, + "grad_norm": 1.5477691888809204, + "learning_rate": 9.994465525807125e-06, + "loss": 0.8139, + "step": 221 + }, + { + "epoch": 0.04478529347795251, + "grad_norm": 1.9244577884674072, + "learning_rate": 9.994310752169654e-06, + "loss": 0.7512, + "step": 222 + }, + { + "epoch": 0.044987029034159506, + "grad_norm": 1.0615133047103882, + "learning_rate": 9.994153845354426e-06, + "loss": 0.7983, + "step": 223 + }, + { + "epoch": 0.0451887645903665, + "grad_norm": 2.2280080318450928, + "learning_rate": 9.993994805428456e-06, + "loss": 1.1871, + "step": 224 + }, + { + "epoch": 0.04539050014657349, + "grad_norm": 3.0566983222961426, + "learning_rate": 9.993833632459675e-06, + "loss": 0.7883, + "step": 225 + }, + { + "epoch": 0.04559223570278048, + "grad_norm": 1.9731240272521973, + "learning_rate": 9.993670326516924e-06, + "loss": 0.8278, + "step": 226 + }, + { + "epoch": 0.04579397125898748, + "grad_norm": 1.77347731590271, + "learning_rate": 9.993504887669955e-06, + "loss": 0.7994, + "step": 227 + }, + { + "epoch": 0.04599570681519447, + "grad_norm": 12.29296588897705, + "learning_rate": 9.993337315989428e-06, + "loss": 0.7465, + "step": 228 + }, + { + "epoch": 0.04619744237140146, + "grad_norm": 5.521291732788086, + "learning_rate": 9.99316761154692e-06, + "loss": 0.8043, + "step": 229 + }, + { + "epoch": 0.046399177927608455, + "grad_norm": 2.6495323181152344, + "learning_rate": 9.992995774414912e-06, + "loss": 0.8121, + "step": 230 + }, + { + "epoch": 0.04660091348381545, + "grad_norm": 1.0269204378128052, + "learning_rate": 9.992821804666803e-06, + "loss": 0.8266, + "step": 231 + }, + { + "epoch": 0.046802649040022445, + "grad_norm": 7.434131622314453, + "learning_rate": 9.992645702376896e-06, + "loss": 0.7602, + "step": 232 + }, + { + "epoch": 0.04700438459622944, + "grad_norm": 36.971431732177734, + "learning_rate": 9.992467467620408e-06, + "loss": 0.8083, + "step": 233 + }, + { + "epoch": 0.04720612015243643, + "grad_norm": 57.72653579711914, + "learning_rate": 9.99228710047347e-06, + "loss": 0.8107, + "step": 234 + }, + { + "epoch": 0.04740785570864342, + "grad_norm": 15.51785659790039, + "learning_rate": 9.992104601013117e-06, + "loss": 0.7832, + "step": 235 + }, + { + "epoch": 0.04760959126485042, + "grad_norm": 1.4890369176864624, + "learning_rate": 9.9919199693173e-06, + "loss": 0.7787, + "step": 236 + }, + { + "epoch": 0.04781132682105741, + "grad_norm": 1.1738063097000122, + "learning_rate": 9.991733205464882e-06, + "loss": 0.7906, + "step": 237 + }, + { + "epoch": 0.0480130623772644, + "grad_norm": 2.434535026550293, + "learning_rate": 9.99154430953563e-06, + "loss": 0.7879, + "step": 238 + }, + { + "epoch": 0.048214797933471394, + "grad_norm": 1.8027598857879639, + "learning_rate": 9.991353281610227e-06, + "loss": 0.7958, + "step": 239 + }, + { + "epoch": 0.04841653348967839, + "grad_norm": 1.322300672531128, + "learning_rate": 9.991160121770265e-06, + "loss": 0.9135, + "step": 240 + }, + { + "epoch": 0.048618269045885384, + "grad_norm": 3.259050130844116, + "learning_rate": 9.990964830098246e-06, + "loss": 0.9424, + "step": 241 + }, + { + "epoch": 0.048820004602092376, + "grad_norm": 1.3933802843093872, + "learning_rate": 9.990767406677585e-06, + "loss": 0.7914, + "step": 242 + }, + { + "epoch": 0.04902174015829937, + "grad_norm": 1.3619595766067505, + "learning_rate": 9.990567851592604e-06, + "loss": 0.7864, + "step": 243 + }, + { + "epoch": 0.04922347571450636, + "grad_norm": 2.9409220218658447, + "learning_rate": 9.990366164928538e-06, + "loss": 0.8433, + "step": 244 + }, + { + "epoch": 0.04942521127071336, + "grad_norm": 1.1837221384048462, + "learning_rate": 9.990162346771532e-06, + "loss": 0.7411, + "step": 245 + }, + { + "epoch": 0.04962694682692035, + "grad_norm": 0.7388036847114563, + "learning_rate": 9.98995639720864e-06, + "loss": 0.7743, + "step": 246 + }, + { + "epoch": 0.04982868238312734, + "grad_norm": 1.7144968509674072, + "learning_rate": 9.98974831632783e-06, + "loss": 0.7733, + "step": 247 + }, + { + "epoch": 0.05003041793933433, + "grad_norm": 0.677221417427063, + "learning_rate": 9.989538104217975e-06, + "loss": 0.7485, + "step": 248 + }, + { + "epoch": 0.05023215349554133, + "grad_norm": 1.5204025506973267, + "learning_rate": 9.989325760968865e-06, + "loss": 0.8537, + "step": 249 + }, + { + "epoch": 0.050433889051748323, + "grad_norm": 1.2902796268463135, + "learning_rate": 9.98911128667119e-06, + "loss": 0.7397, + "step": 250 + }, + { + "epoch": 0.050635624607955315, + "grad_norm": 1.303731083869934, + "learning_rate": 9.988894681416561e-06, + "loss": 1.0365, + "step": 251 + }, + { + "epoch": 0.05083736016416231, + "grad_norm": 3.029825448989868, + "learning_rate": 9.988675945297497e-06, + "loss": 0.9559, + "step": 252 + }, + { + "epoch": 0.051039095720369305, + "grad_norm": 1.6813995838165283, + "learning_rate": 9.98845507840742e-06, + "loss": 0.8865, + "step": 253 + }, + { + "epoch": 0.0512408312765763, + "grad_norm": 3.1005282402038574, + "learning_rate": 9.988232080840668e-06, + "loss": 0.7663, + "step": 254 + }, + { + "epoch": 0.05144256683278329, + "grad_norm": 1.6702642440795898, + "learning_rate": 9.98800695269249e-06, + "loss": 0.7728, + "step": 255 + }, + { + "epoch": 0.05164430238899028, + "grad_norm": 3.3101143836975098, + "learning_rate": 9.987779694059043e-06, + "loss": 0.8304, + "step": 256 + }, + { + "epoch": 0.05184603794519727, + "grad_norm": 1.2295477390289307, + "learning_rate": 9.987550305037392e-06, + "loss": 0.7588, + "step": 257 + }, + { + "epoch": 0.05204777350140427, + "grad_norm": 1.4161425828933716, + "learning_rate": 9.987318785725517e-06, + "loss": 0.8055, + "step": 258 + }, + { + "epoch": 0.05224950905761126, + "grad_norm": 1.438620924949646, + "learning_rate": 9.987085136222302e-06, + "loss": 1.0795, + "step": 259 + }, + { + "epoch": 0.052451244613818254, + "grad_norm": 1.5853941440582275, + "learning_rate": 9.986849356627545e-06, + "loss": 0.7861, + "step": 260 + }, + { + "epoch": 0.052652980170025246, + "grad_norm": 1.0955591201782227, + "learning_rate": 9.986611447041952e-06, + "loss": 0.9781, + "step": 261 + }, + { + "epoch": 0.052854715726232245, + "grad_norm": 9.280670166015625, + "learning_rate": 9.98637140756714e-06, + "loss": 0.7771, + "step": 262 + }, + { + "epoch": 0.053056451282439236, + "grad_norm": 53.89901351928711, + "learning_rate": 9.986129238305635e-06, + "loss": 0.7909, + "step": 263 + }, + { + "epoch": 0.05325818683864623, + "grad_norm": 0.7823047637939453, + "learning_rate": 9.985884939360873e-06, + "loss": 0.8604, + "step": 264 + }, + { + "epoch": 0.05345992239485322, + "grad_norm": 14.755097389221191, + "learning_rate": 9.985638510837197e-06, + "loss": 0.861, + "step": 265 + }, + { + "epoch": 0.05366165795106022, + "grad_norm": 0.7898758053779602, + "learning_rate": 9.985389952839864e-06, + "loss": 0.7715, + "step": 266 + }, + { + "epoch": 0.05386339350726721, + "grad_norm": 1.4765191078186035, + "learning_rate": 9.985139265475039e-06, + "loss": 0.9422, + "step": 267 + }, + { + "epoch": 0.0540651290634742, + "grad_norm": 0.8627459406852722, + "learning_rate": 9.984886448849796e-06, + "loss": 0.7241, + "step": 268 + }, + { + "epoch": 0.05426686461968119, + "grad_norm": 1.3317475318908691, + "learning_rate": 9.984631503072116e-06, + "loss": 0.7417, + "step": 269 + }, + { + "epoch": 0.054468600175888185, + "grad_norm": 0.6790952086448669, + "learning_rate": 9.984374428250894e-06, + "loss": 0.7364, + "step": 270 + }, + { + "epoch": 0.054670335732095184, + "grad_norm": 6.166963577270508, + "learning_rate": 9.984115224495933e-06, + "loss": 0.7579, + "step": 271 + }, + { + "epoch": 0.054872071288302175, + "grad_norm": 1.9955143928527832, + "learning_rate": 9.983853891917942e-06, + "loss": 0.884, + "step": 272 + }, + { + "epoch": 0.05507380684450917, + "grad_norm": 4.1368889808654785, + "learning_rate": 9.983590430628543e-06, + "loss": 0.7424, + "step": 273 + }, + { + "epoch": 0.05527554240071616, + "grad_norm": 3.234487295150757, + "learning_rate": 9.983324840740265e-06, + "loss": 0.8108, + "step": 274 + }, + { + "epoch": 0.05547727795692316, + "grad_norm": 12.349787712097168, + "learning_rate": 9.983057122366549e-06, + "loss": 0.7564, + "step": 275 + }, + { + "epoch": 0.05567901351313015, + "grad_norm": 2.3753366470336914, + "learning_rate": 9.982787275621743e-06, + "loss": 0.7861, + "step": 276 + }, + { + "epoch": 0.05588074906933714, + "grad_norm": 4.729350566864014, + "learning_rate": 9.982515300621103e-06, + "loss": 0.7651, + "step": 277 + }, + { + "epoch": 0.05608248462554413, + "grad_norm": 0.8417067527770996, + "learning_rate": 9.982241197480795e-06, + "loss": 0.7681, + "step": 278 + }, + { + "epoch": 0.05628422018175113, + "grad_norm": 0.8154447674751282, + "learning_rate": 9.981964966317897e-06, + "loss": 0.7743, + "step": 279 + }, + { + "epoch": 0.05648595573795812, + "grad_norm": 0.5765023231506348, + "learning_rate": 9.981686607250391e-06, + "loss": 0.958, + "step": 280 + }, + { + "epoch": 0.056687691294165114, + "grad_norm": 1.1746493577957153, + "learning_rate": 9.981406120397172e-06, + "loss": 0.7329, + "step": 281 + }, + { + "epoch": 0.056889426850372106, + "grad_norm": 1.1388757228851318, + "learning_rate": 9.98112350587804e-06, + "loss": 0.7618, + "step": 282 + }, + { + "epoch": 0.0570911624065791, + "grad_norm": 0.5775352716445923, + "learning_rate": 9.980838763813707e-06, + "loss": 0.7496, + "step": 283 + }, + { + "epoch": 0.057292897962786096, + "grad_norm": 1.036942481994629, + "learning_rate": 9.980551894325793e-06, + "loss": 0.857, + "step": 284 + }, + { + "epoch": 0.05749463351899309, + "grad_norm": 1.5410163402557373, + "learning_rate": 9.980262897536824e-06, + "loss": 0.7326, + "step": 285 + }, + { + "epoch": 0.05769636907520008, + "grad_norm": 0.9250368475914001, + "learning_rate": 9.979971773570239e-06, + "loss": 0.8455, + "step": 286 + }, + { + "epoch": 0.05789810463140707, + "grad_norm": 0.8032963275909424, + "learning_rate": 9.979678522550382e-06, + "loss": 0.706, + "step": 287 + }, + { + "epoch": 0.05809984018761407, + "grad_norm": 6.008584976196289, + "learning_rate": 9.979383144602505e-06, + "loss": 0.7733, + "step": 288 + }, + { + "epoch": 0.05830157574382106, + "grad_norm": 1.6467902660369873, + "learning_rate": 9.979085639852776e-06, + "loss": 0.8366, + "step": 289 + }, + { + "epoch": 0.05850331130002805, + "grad_norm": 5.2584404945373535, + "learning_rate": 9.97878600842826e-06, + "loss": 0.9207, + "step": 290 + }, + { + "epoch": 0.058705046856235045, + "grad_norm": 1.9451853036880493, + "learning_rate": 9.978484250456938e-06, + "loss": 0.7263, + "step": 291 + }, + { + "epoch": 0.058906782412442044, + "grad_norm": 0.9677664637565613, + "learning_rate": 9.9781803660677e-06, + "loss": 0.7551, + "step": 292 + }, + { + "epoch": 0.059108517968649035, + "grad_norm": 2.3871355056762695, + "learning_rate": 9.977874355390337e-06, + "loss": 0.7409, + "step": 293 + }, + { + "epoch": 0.05931025352485603, + "grad_norm": 2.6086199283599854, + "learning_rate": 9.977566218555554e-06, + "loss": 0.7638, + "step": 294 + }, + { + "epoch": 0.05951198908106302, + "grad_norm": 2.170511484146118, + "learning_rate": 9.977255955694967e-06, + "loss": 0.7545, + "step": 295 + }, + { + "epoch": 0.05971372463727001, + "grad_norm": 2.165513277053833, + "learning_rate": 9.97694356694109e-06, + "loss": 0.7556, + "step": 296 + }, + { + "epoch": 0.05991546019347701, + "grad_norm": 1.170595407485962, + "learning_rate": 9.976629052427353e-06, + "loss": 0.7372, + "step": 297 + }, + { + "epoch": 0.060117195749684, + "grad_norm": 1.5325140953063965, + "learning_rate": 9.976312412288096e-06, + "loss": 0.9574, + "step": 298 + }, + { + "epoch": 0.06031893130589099, + "grad_norm": 1.3879196643829346, + "learning_rate": 9.975993646658555e-06, + "loss": 0.7547, + "step": 299 + }, + { + "epoch": 0.060520666862097984, + "grad_norm": 0.6840426921844482, + "learning_rate": 9.97567275567489e-06, + "loss": 0.7828, + "step": 300 + }, + { + "epoch": 0.06072240241830498, + "grad_norm": 0.6344276070594788, + "learning_rate": 9.975349739474156e-06, + "loss": 0.747, + "step": 301 + }, + { + "epoch": 0.060924137974511974, + "grad_norm": 1.4871931076049805, + "learning_rate": 9.975024598194318e-06, + "loss": 1.0252, + "step": 302 + }, + { + "epoch": 0.061125873530718966, + "grad_norm": 0.8229524493217468, + "learning_rate": 9.974697331974255e-06, + "loss": 0.8158, + "step": 303 + }, + { + "epoch": 0.06132760908692596, + "grad_norm": 0.6888187527656555, + "learning_rate": 9.974367940953748e-06, + "loss": 0.9094, + "step": 304 + }, + { + "epoch": 0.061529344643132956, + "grad_norm": 1.9582043886184692, + "learning_rate": 9.974036425273487e-06, + "loss": 0.7483, + "step": 305 + }, + { + "epoch": 0.06173108019933995, + "grad_norm": 0.950177013874054, + "learning_rate": 9.973702785075072e-06, + "loss": 0.7646, + "step": 306 + }, + { + "epoch": 0.06193281575554694, + "grad_norm": 1.6053050756454468, + "learning_rate": 9.973367020501003e-06, + "loss": 0.9114, + "step": 307 + }, + { + "epoch": 0.06213455131175393, + "grad_norm": 0.6130600571632385, + "learning_rate": 9.973029131694694e-06, + "loss": 0.7913, + "step": 308 + }, + { + "epoch": 0.06233628686796092, + "grad_norm": 1.4359782934188843, + "learning_rate": 9.972689118800467e-06, + "loss": 0.8401, + "step": 309 + }, + { + "epoch": 0.06253802242416792, + "grad_norm": 0.8085805773735046, + "learning_rate": 9.972346981963546e-06, + "loss": 0.7986, + "step": 310 + }, + { + "epoch": 0.06273975798037491, + "grad_norm": 2.3081631660461426, + "learning_rate": 9.972002721330067e-06, + "loss": 0.758, + "step": 311 + }, + { + "epoch": 0.0629414935365819, + "grad_norm": 6.955173015594482, + "learning_rate": 9.97165633704707e-06, + "loss": 0.8835, + "step": 312 + }, + { + "epoch": 0.0631432290927889, + "grad_norm": 42.62002182006836, + "learning_rate": 9.971307829262504e-06, + "loss": 0.7345, + "step": 313 + }, + { + "epoch": 0.06334496464899589, + "grad_norm": 33.94930648803711, + "learning_rate": 9.970957198125224e-06, + "loss": 0.725, + "step": 314 + }, + { + "epoch": 0.06354670020520288, + "grad_norm": 1.6801073551177979, + "learning_rate": 9.97060444378499e-06, + "loss": 0.7774, + "step": 315 + }, + { + "epoch": 0.06374843576140989, + "grad_norm": 0.945922315120697, + "learning_rate": 9.970249566392474e-06, + "loss": 0.7666, + "step": 316 + }, + { + "epoch": 0.06395017131761688, + "grad_norm": 0.5739346146583557, + "learning_rate": 9.96989256609925e-06, + "loss": 0.9993, + "step": 317 + }, + { + "epoch": 0.06415190687382387, + "grad_norm": 2.0904643535614014, + "learning_rate": 9.969533443057802e-06, + "loss": 0.782, + "step": 318 + }, + { + "epoch": 0.06435364243003086, + "grad_norm": 1.087174415588379, + "learning_rate": 9.969172197421518e-06, + "loss": 0.7468, + "step": 319 + }, + { + "epoch": 0.06455537798623785, + "grad_norm": 1.0355284214019775, + "learning_rate": 9.968808829344692e-06, + "loss": 0.7677, + "step": 320 + }, + { + "epoch": 0.06475711354244484, + "grad_norm": 1.3207528591156006, + "learning_rate": 9.968443338982532e-06, + "loss": 0.7234, + "step": 321 + }, + { + "epoch": 0.06495884909865184, + "grad_norm": 4.863753795623779, + "learning_rate": 9.96807572649114e-06, + "loss": 0.8654, + "step": 322 + }, + { + "epoch": 0.06516058465485883, + "grad_norm": 0.6006829738616943, + "learning_rate": 9.967705992027537e-06, + "loss": 0.8232, + "step": 323 + }, + { + "epoch": 0.06536232021106582, + "grad_norm": 1.9106441736221313, + "learning_rate": 9.96733413574964e-06, + "loss": 0.7413, + "step": 324 + }, + { + "epoch": 0.06556405576727282, + "grad_norm": 48.14797592163086, + "learning_rate": 9.966960157816279e-06, + "loss": 0.7532, + "step": 325 + }, + { + "epoch": 0.06576579132347982, + "grad_norm": 47.85212707519531, + "learning_rate": 9.96658405838719e-06, + "loss": 0.7283, + "step": 326 + }, + { + "epoch": 0.06596752687968681, + "grad_norm": 29.23980140686035, + "learning_rate": 9.966205837623009e-06, + "loss": 0.7057, + "step": 327 + }, + { + "epoch": 0.0661692624358938, + "grad_norm": 3.5036604404449463, + "learning_rate": 9.965825495685284e-06, + "loss": 0.6846, + "step": 328 + }, + { + "epoch": 0.06637099799210079, + "grad_norm": 0.6103178858757019, + "learning_rate": 9.965443032736469e-06, + "loss": 0.9649, + "step": 329 + }, + { + "epoch": 0.06657273354830778, + "grad_norm": 1.7372721433639526, + "learning_rate": 9.965058448939919e-06, + "loss": 0.8735, + "step": 330 + }, + { + "epoch": 0.06677446910451477, + "grad_norm": 0.6306778192520142, + "learning_rate": 9.964671744459902e-06, + "loss": 1.0855, + "step": 331 + }, + { + "epoch": 0.06697620466072177, + "grad_norm": 1.587566614151001, + "learning_rate": 9.964282919461584e-06, + "loss": 0.7498, + "step": 332 + }, + { + "epoch": 0.06717794021692877, + "grad_norm": 1.041245460510254, + "learning_rate": 9.963891974111042e-06, + "loss": 0.7863, + "step": 333 + }, + { + "epoch": 0.06737967577313576, + "grad_norm": 0.8831943869590759, + "learning_rate": 9.963498908575258e-06, + "loss": 0.7402, + "step": 334 + }, + { + "epoch": 0.06758141132934276, + "grad_norm": 1.5780138969421387, + "learning_rate": 9.963103723022117e-06, + "loss": 0.9024, + "step": 335 + }, + { + "epoch": 0.06778314688554975, + "grad_norm": 0.7395191788673401, + "learning_rate": 9.962706417620413e-06, + "loss": 0.7374, + "step": 336 + }, + { + "epoch": 0.06798488244175674, + "grad_norm": 1.0590225458145142, + "learning_rate": 9.962306992539842e-06, + "loss": 0.7621, + "step": 337 + }, + { + "epoch": 0.06818661799796373, + "grad_norm": 0.5914890170097351, + "learning_rate": 9.96190544795101e-06, + "loss": 0.7071, + "step": 338 + }, + { + "epoch": 0.06838835355417072, + "grad_norm": 1.8004124164581299, + "learning_rate": 9.961501784025423e-06, + "loss": 0.7526, + "step": 339 + }, + { + "epoch": 0.06859008911037771, + "grad_norm": 1.0831773281097412, + "learning_rate": 9.961096000935493e-06, + "loss": 0.6934, + "step": 340 + }, + { + "epoch": 0.0687918246665847, + "grad_norm": 0.832261323928833, + "learning_rate": 9.960688098854542e-06, + "loss": 0.7978, + "step": 341 + }, + { + "epoch": 0.06899356022279171, + "grad_norm": 0.72775799036026, + "learning_rate": 9.960278077956792e-06, + "loss": 0.8441, + "step": 342 + }, + { + "epoch": 0.0691952957789987, + "grad_norm": 2.9648144245147705, + "learning_rate": 9.959865938417372e-06, + "loss": 0.751, + "step": 343 + }, + { + "epoch": 0.0693970313352057, + "grad_norm": 1.3026782274246216, + "learning_rate": 9.959451680412316e-06, + "loss": 0.7462, + "step": 344 + }, + { + "epoch": 0.06959876689141269, + "grad_norm": 2.5709228515625, + "learning_rate": 9.959035304118563e-06, + "loss": 0.7466, + "step": 345 + }, + { + "epoch": 0.06980050244761968, + "grad_norm": 2.0137813091278076, + "learning_rate": 9.958616809713955e-06, + "loss": 0.7713, + "step": 346 + }, + { + "epoch": 0.07000223800382667, + "grad_norm": 1.1602307558059692, + "learning_rate": 9.958196197377242e-06, + "loss": 0.9285, + "step": 347 + }, + { + "epoch": 0.07020397356003366, + "grad_norm": 0.5965341925621033, + "learning_rate": 9.957773467288074e-06, + "loss": 0.7451, + "step": 348 + }, + { + "epoch": 0.07040570911624065, + "grad_norm": 0.5603690147399902, + "learning_rate": 9.95734861962701e-06, + "loss": 0.7483, + "step": 349 + }, + { + "epoch": 0.07060744467244764, + "grad_norm": 1.2296565771102905, + "learning_rate": 9.95692165457551e-06, + "loss": 0.7167, + "step": 350 + }, + { + "epoch": 0.07080918022865465, + "grad_norm": 1.3939180374145508, + "learning_rate": 9.95649257231594e-06, + "loss": 0.7404, + "step": 351 + }, + { + "epoch": 0.07101091578486164, + "grad_norm": 0.8631527423858643, + "learning_rate": 9.956061373031573e-06, + "loss": 0.807, + "step": 352 + }, + { + "epoch": 0.07121265134106863, + "grad_norm": 0.6659342050552368, + "learning_rate": 9.955628056906584e-06, + "loss": 0.6811, + "step": 353 + }, + { + "epoch": 0.07141438689727563, + "grad_norm": 0.7338282465934753, + "learning_rate": 9.955192624126045e-06, + "loss": 0.721, + "step": 354 + }, + { + "epoch": 0.07161612245348262, + "grad_norm": 0.7555668950080872, + "learning_rate": 9.954755074875946e-06, + "loss": 0.8442, + "step": 355 + }, + { + "epoch": 0.07181785800968961, + "grad_norm": 1.4037517309188843, + "learning_rate": 9.95431540934317e-06, + "loss": 0.7371, + "step": 356 + }, + { + "epoch": 0.0720195935658966, + "grad_norm": 1.6213260889053345, + "learning_rate": 9.953873627715506e-06, + "loss": 0.7541, + "step": 357 + }, + { + "epoch": 0.07222132912210359, + "grad_norm": 2.8973515033721924, + "learning_rate": 9.953429730181653e-06, + "loss": 0.7394, + "step": 358 + }, + { + "epoch": 0.0724230646783106, + "grad_norm": 1.1352715492248535, + "learning_rate": 9.952983716931209e-06, + "loss": 0.6834, + "step": 359 + }, + { + "epoch": 0.07262480023451759, + "grad_norm": 0.7093841433525085, + "learning_rate": 9.952535588154673e-06, + "loss": 0.75, + "step": 360 + }, + { + "epoch": 0.07282653579072458, + "grad_norm": 1.4455276727676392, + "learning_rate": 9.95208534404345e-06, + "loss": 0.7095, + "step": 361 + }, + { + "epoch": 0.07302827134693157, + "grad_norm": 0.5514054894447327, + "learning_rate": 9.951632984789851e-06, + "loss": 0.7518, + "step": 362 + }, + { + "epoch": 0.07323000690313856, + "grad_norm": 2.37100887298584, + "learning_rate": 9.951178510587087e-06, + "loss": 0.7373, + "step": 363 + }, + { + "epoch": 0.07343174245934556, + "grad_norm": 0.6046085953712463, + "learning_rate": 9.950721921629276e-06, + "loss": 0.7272, + "step": 364 + }, + { + "epoch": 0.07363347801555255, + "grad_norm": 0.8679599165916443, + "learning_rate": 9.950263218111435e-06, + "loss": 0.6825, + "step": 365 + }, + { + "epoch": 0.07383521357175954, + "grad_norm": 1.1433424949645996, + "learning_rate": 9.949802400229486e-06, + "loss": 0.7595, + "step": 366 + }, + { + "epoch": 0.07403694912796653, + "grad_norm": 0.6897748708724976, + "learning_rate": 9.949339468180256e-06, + "loss": 0.8082, + "step": 367 + }, + { + "epoch": 0.07423868468417354, + "grad_norm": 0.674286425113678, + "learning_rate": 9.948874422161473e-06, + "loss": 0.8014, + "step": 368 + }, + { + "epoch": 0.07444042024038053, + "grad_norm": 0.6043882966041565, + "learning_rate": 9.948407262371764e-06, + "loss": 0.7644, + "step": 369 + }, + { + "epoch": 0.07464215579658752, + "grad_norm": 1.9822863340377808, + "learning_rate": 9.947937989010668e-06, + "loss": 0.7242, + "step": 370 + }, + { + "epoch": 0.07484389135279451, + "grad_norm": 0.5006752610206604, + "learning_rate": 9.947466602278621e-06, + "loss": 0.7257, + "step": 371 + }, + { + "epoch": 0.0750456269090015, + "grad_norm": 1.2581019401550293, + "learning_rate": 9.946993102376961e-06, + "loss": 1.0325, + "step": 372 + }, + { + "epoch": 0.0752473624652085, + "grad_norm": 0.8816624879837036, + "learning_rate": 9.94651748950793e-06, + "loss": 0.9296, + "step": 373 + }, + { + "epoch": 0.07544909802141549, + "grad_norm": 0.9441580772399902, + "learning_rate": 9.946039763874674e-06, + "loss": 0.7625, + "step": 374 + }, + { + "epoch": 0.07565083357762248, + "grad_norm": 0.8706744909286499, + "learning_rate": 9.945559925681238e-06, + "loss": 0.7524, + "step": 375 + }, + { + "epoch": 0.07585256913382947, + "grad_norm": 0.7602512836456299, + "learning_rate": 9.945077975132573e-06, + "loss": 0.7468, + "step": 376 + }, + { + "epoch": 0.07605430469003648, + "grad_norm": 0.7464005947113037, + "learning_rate": 9.94459391243453e-06, + "loss": 0.7798, + "step": 377 + }, + { + "epoch": 0.07625604024624347, + "grad_norm": 5.947686195373535, + "learning_rate": 9.944107737793862e-06, + "loss": 0.7422, + "step": 378 + }, + { + "epoch": 0.07645777580245046, + "grad_norm": 11.784186363220215, + "learning_rate": 9.943619451418225e-06, + "loss": 0.7596, + "step": 379 + }, + { + "epoch": 0.07665951135865745, + "grad_norm": 2.201178550720215, + "learning_rate": 9.943129053516176e-06, + "loss": 0.7318, + "step": 380 + }, + { + "epoch": 0.07686124691486444, + "grad_norm": 1.0005850791931152, + "learning_rate": 9.942636544297175e-06, + "loss": 0.7558, + "step": 381 + }, + { + "epoch": 0.07706298247107143, + "grad_norm": 0.6544851064682007, + "learning_rate": 9.942141923971584e-06, + "loss": 0.7184, + "step": 382 + }, + { + "epoch": 0.07726471802727843, + "grad_norm": 1.1898512840270996, + "learning_rate": 9.941645192750665e-06, + "loss": 0.7013, + "step": 383 + }, + { + "epoch": 0.07746645358348542, + "grad_norm": 0.9880014061927795, + "learning_rate": 9.941146350846583e-06, + "loss": 0.7444, + "step": 384 + }, + { + "epoch": 0.07766818913969242, + "grad_norm": 0.6566327214241028, + "learning_rate": 9.940645398472405e-06, + "loss": 0.748, + "step": 385 + }, + { + "epoch": 0.07786992469589941, + "grad_norm": 3.9069302082061768, + "learning_rate": 9.940142335842097e-06, + "loss": 0.7627, + "step": 386 + }, + { + "epoch": 0.0780716602521064, + "grad_norm": 0.8036094307899475, + "learning_rate": 9.939637163170528e-06, + "loss": 0.9697, + "step": 387 + }, + { + "epoch": 0.0782733958083134, + "grad_norm": 0.5197206139564514, + "learning_rate": 9.939129880673471e-06, + "loss": 0.7418, + "step": 388 + }, + { + "epoch": 0.07847513136452039, + "grad_norm": 0.5764813423156738, + "learning_rate": 9.938620488567592e-06, + "loss": 0.8098, + "step": 389 + }, + { + "epoch": 0.07867686692072738, + "grad_norm": 4.1930832862854, + "learning_rate": 9.938108987070467e-06, + "loss": 0.796, + "step": 390 + }, + { + "epoch": 0.07887860247693437, + "grad_norm": 1.410750389099121, + "learning_rate": 9.93759537640057e-06, + "loss": 0.7219, + "step": 391 + }, + { + "epoch": 0.07908033803314136, + "grad_norm": 0.6416834592819214, + "learning_rate": 9.937079656777275e-06, + "loss": 0.7999, + "step": 392 + }, + { + "epoch": 0.07928207358934836, + "grad_norm": 0.6255632042884827, + "learning_rate": 9.936561828420854e-06, + "loss": 0.7706, + "step": 393 + }, + { + "epoch": 0.07948380914555536, + "grad_norm": 2.998563051223755, + "learning_rate": 9.936041891552484e-06, + "loss": 0.7554, + "step": 394 + }, + { + "epoch": 0.07968554470176235, + "grad_norm": 1.068771481513977, + "learning_rate": 9.935519846394242e-06, + "loss": 0.7692, + "step": 395 + }, + { + "epoch": 0.07988728025796935, + "grad_norm": 1.0073810815811157, + "learning_rate": 9.934995693169104e-06, + "loss": 0.7764, + "step": 396 + }, + { + "epoch": 0.08008901581417634, + "grad_norm": 2.54864501953125, + "learning_rate": 9.93446943210095e-06, + "loss": 0.7399, + "step": 397 + }, + { + "epoch": 0.08029075137038333, + "grad_norm": 1.4615826606750488, + "learning_rate": 9.933941063414553e-06, + "loss": 0.749, + "step": 398 + }, + { + "epoch": 0.08049248692659032, + "grad_norm": 2.118624448776245, + "learning_rate": 9.933410587335594e-06, + "loss": 0.6639, + "step": 399 + }, + { + "epoch": 0.08069422248279731, + "grad_norm": 1.0449012517929077, + "learning_rate": 9.93287800409065e-06, + "loss": 0.7129, + "step": 400 + }, + { + "epoch": 0.0808959580390043, + "grad_norm": 0.46440356969833374, + "learning_rate": 9.932343313907196e-06, + "loss": 0.7212, + "step": 401 + }, + { + "epoch": 0.0810976935952113, + "grad_norm": 1.939047932624817, + "learning_rate": 9.931806517013612e-06, + "loss": 0.7696, + "step": 402 + }, + { + "epoch": 0.0812994291514183, + "grad_norm": 0.7793079018592834, + "learning_rate": 9.931267613639177e-06, + "loss": 0.7481, + "step": 403 + }, + { + "epoch": 0.08150116470762529, + "grad_norm": 0.884232223033905, + "learning_rate": 9.930726604014066e-06, + "loss": 0.8048, + "step": 404 + }, + { + "epoch": 0.08170290026383228, + "grad_norm": 1.6912051439285278, + "learning_rate": 9.930183488369357e-06, + "loss": 0.8251, + "step": 405 + }, + { + "epoch": 0.08190463582003928, + "grad_norm": 0.6897804737091064, + "learning_rate": 9.929638266937025e-06, + "loss": 0.7105, + "step": 406 + }, + { + "epoch": 0.08210637137624627, + "grad_norm": 1.0087461471557617, + "learning_rate": 9.929090939949948e-06, + "loss": 0.7334, + "step": 407 + }, + { + "epoch": 0.08230810693245326, + "grad_norm": 1.2530308961868286, + "learning_rate": 9.9285415076419e-06, + "loss": 0.7001, + "step": 408 + }, + { + "epoch": 0.08250984248866025, + "grad_norm": 0.5179945230484009, + "learning_rate": 9.927989970247554e-06, + "loss": 0.749, + "step": 409 + }, + { + "epoch": 0.08271157804486724, + "grad_norm": 0.757422685623169, + "learning_rate": 9.927436328002487e-06, + "loss": 0.685, + "step": 410 + }, + { + "epoch": 0.08291331360107425, + "grad_norm": 1.2727844715118408, + "learning_rate": 9.926880581143168e-06, + "loss": 0.8697, + "step": 411 + }, + { + "epoch": 0.08311504915728124, + "grad_norm": 0.7067129611968994, + "learning_rate": 9.926322729906968e-06, + "loss": 0.7317, + "step": 412 + }, + { + "epoch": 0.08331678471348823, + "grad_norm": 7.245047569274902, + "learning_rate": 9.925762774532162e-06, + "loss": 0.7562, + "step": 413 + }, + { + "epoch": 0.08351852026969522, + "grad_norm": 1.140660047531128, + "learning_rate": 9.925200715257915e-06, + "loss": 0.7611, + "step": 414 + }, + { + "epoch": 0.08372025582590222, + "grad_norm": 0.8195030093193054, + "learning_rate": 9.924636552324296e-06, + "loss": 0.7434, + "step": 415 + }, + { + "epoch": 0.0839219913821092, + "grad_norm": 0.5836001634597778, + "learning_rate": 9.92407028597227e-06, + "loss": 0.7571, + "step": 416 + }, + { + "epoch": 0.0841237269383162, + "grad_norm": 0.7573882341384888, + "learning_rate": 9.923501916443704e-06, + "loss": 0.7345, + "step": 417 + }, + { + "epoch": 0.08432546249452319, + "grad_norm": 0.625736653804779, + "learning_rate": 9.922931443981358e-06, + "loss": 0.8219, + "step": 418 + }, + { + "epoch": 0.08452719805073018, + "grad_norm": 0.9961444139480591, + "learning_rate": 9.922358868828896e-06, + "loss": 0.7045, + "step": 419 + }, + { + "epoch": 0.08472893360693719, + "grad_norm": 0.8629128932952881, + "learning_rate": 9.921784191230874e-06, + "loss": 0.7361, + "step": 420 + }, + { + "epoch": 0.08493066916314418, + "grad_norm": 0.8115153908729553, + "learning_rate": 9.921207411432752e-06, + "loss": 0.7315, + "step": 421 + }, + { + "epoch": 0.08513240471935117, + "grad_norm": 1.0216450691223145, + "learning_rate": 9.920628529680882e-06, + "loss": 0.7691, + "step": 422 + }, + { + "epoch": 0.08533414027555816, + "grad_norm": 0.8689268827438354, + "learning_rate": 9.920047546222522e-06, + "loss": 0.8783, + "step": 423 + }, + { + "epoch": 0.08553587583176515, + "grad_norm": 1.2929258346557617, + "learning_rate": 9.919464461305817e-06, + "loss": 0.7367, + "step": 424 + }, + { + "epoch": 0.08573761138797215, + "grad_norm": 0.7794530987739563, + "learning_rate": 9.918879275179819e-06, + "loss": 0.7366, + "step": 425 + }, + { + "epoch": 0.08593934694417914, + "grad_norm": 2.4730234146118164, + "learning_rate": 9.91829198809447e-06, + "loss": 0.8005, + "step": 426 + }, + { + "epoch": 0.08614108250038613, + "grad_norm": 2.03058123588562, + "learning_rate": 9.917702600300615e-06, + "loss": 0.7365, + "step": 427 + }, + { + "epoch": 0.08634281805659313, + "grad_norm": 0.6619600653648376, + "learning_rate": 9.917111112049996e-06, + "loss": 0.718, + "step": 428 + }, + { + "epoch": 0.08654455361280013, + "grad_norm": 1.198303461074829, + "learning_rate": 9.916517523595248e-06, + "loss": 0.9309, + "step": 429 + }, + { + "epoch": 0.08674628916900712, + "grad_norm": 0.6509268879890442, + "learning_rate": 9.915921835189906e-06, + "loss": 0.7377, + "step": 430 + }, + { + "epoch": 0.08694802472521411, + "grad_norm": 0.9966859817504883, + "learning_rate": 9.915324047088402e-06, + "loss": 0.716, + "step": 431 + }, + { + "epoch": 0.0871497602814211, + "grad_norm": 2.054065227508545, + "learning_rate": 9.914724159546063e-06, + "loss": 0.7177, + "step": 432 + }, + { + "epoch": 0.0873514958376281, + "grad_norm": 1.0520039796829224, + "learning_rate": 9.914122172819113e-06, + "loss": 0.6613, + "step": 433 + }, + { + "epoch": 0.08755323139383508, + "grad_norm": 0.4894033670425415, + "learning_rate": 9.913518087164678e-06, + "loss": 0.7384, + "step": 434 + }, + { + "epoch": 0.08775496695004208, + "grad_norm": 0.550117552280426, + "learning_rate": 9.912911902840771e-06, + "loss": 0.7757, + "step": 435 + }, + { + "epoch": 0.08795670250624907, + "grad_norm": 0.8945670127868652, + "learning_rate": 9.91230362010631e-06, + "loss": 0.6971, + "step": 436 + }, + { + "epoch": 0.08815843806245607, + "grad_norm": 0.4656491279602051, + "learning_rate": 9.911693239221101e-06, + "loss": 0.7452, + "step": 437 + }, + { + "epoch": 0.08836017361866307, + "grad_norm": 0.7141634225845337, + "learning_rate": 9.911080760445857e-06, + "loss": 0.8922, + "step": 438 + }, + { + "epoch": 0.08856190917487006, + "grad_norm": 3.367650032043457, + "learning_rate": 9.910466184042177e-06, + "loss": 0.7144, + "step": 439 + }, + { + "epoch": 0.08876364473107705, + "grad_norm": 0.4969181716442108, + "learning_rate": 9.90984951027256e-06, + "loss": 0.741, + "step": 440 + }, + { + "epoch": 0.08896538028728404, + "grad_norm": 0.9121612906455994, + "learning_rate": 9.909230739400402e-06, + "loss": 0.8275, + "step": 441 + }, + { + "epoch": 0.08916711584349103, + "grad_norm": 0.5296851992607117, + "learning_rate": 9.908609871689992e-06, + "loss": 0.6998, + "step": 442 + }, + { + "epoch": 0.08936885139969802, + "grad_norm": 0.6200650334358215, + "learning_rate": 9.907986907406517e-06, + "loss": 0.6965, + "step": 443 + }, + { + "epoch": 0.08957058695590502, + "grad_norm": 0.9386738538742065, + "learning_rate": 9.907361846816057e-06, + "loss": 0.8557, + "step": 444 + }, + { + "epoch": 0.08977232251211201, + "grad_norm": 0.6921373009681702, + "learning_rate": 9.90673469018559e-06, + "loss": 0.7537, + "step": 445 + }, + { + "epoch": 0.08997405806831901, + "grad_norm": 1.0132421255111694, + "learning_rate": 9.90610543778299e-06, + "loss": 0.7246, + "step": 446 + }, + { + "epoch": 0.090175793624526, + "grad_norm": 3.3522820472717285, + "learning_rate": 9.90547408987702e-06, + "loss": 0.7415, + "step": 447 + }, + { + "epoch": 0.090377529180733, + "grad_norm": 4.449716091156006, + "learning_rate": 9.904840646737346e-06, + "loss": 0.8019, + "step": 448 + }, + { + "epoch": 0.09057926473693999, + "grad_norm": 3.593144416809082, + "learning_rate": 9.904205108634525e-06, + "loss": 0.6708, + "step": 449 + }, + { + "epoch": 0.09078100029314698, + "grad_norm": 9.274864196777344, + "learning_rate": 9.903567475840005e-06, + "loss": 0.7104, + "step": 450 + }, + { + "epoch": 0.09098273584935397, + "grad_norm": 3.4164490699768066, + "learning_rate": 9.902927748626139e-06, + "loss": 0.8612, + "step": 451 + }, + { + "epoch": 0.09118447140556096, + "grad_norm": 1.1139180660247803, + "learning_rate": 9.902285927266162e-06, + "loss": 0.7185, + "step": 452 + }, + { + "epoch": 0.09138620696176795, + "grad_norm": 1.6076403856277466, + "learning_rate": 9.901642012034214e-06, + "loss": 0.7423, + "step": 453 + }, + { + "epoch": 0.09158794251797496, + "grad_norm": 0.48474082350730896, + "learning_rate": 9.900996003205323e-06, + "loss": 0.6703, + "step": 454 + }, + { + "epoch": 0.09178967807418195, + "grad_norm": 17.713228225708008, + "learning_rate": 9.900347901055414e-06, + "loss": 0.8088, + "step": 455 + }, + { + "epoch": 0.09199141363038894, + "grad_norm": 21.032386779785156, + "learning_rate": 9.899697705861304e-06, + "loss": 0.7579, + "step": 456 + }, + { + "epoch": 0.09219314918659594, + "grad_norm": 2.4759697914123535, + "learning_rate": 9.899045417900709e-06, + "loss": 0.8178, + "step": 457 + }, + { + "epoch": 0.09239488474280293, + "grad_norm": 0.45519858598709106, + "learning_rate": 9.898391037452231e-06, + "loss": 0.7225, + "step": 458 + }, + { + "epoch": 0.09259662029900992, + "grad_norm": 1.385945200920105, + "learning_rate": 9.897734564795374e-06, + "loss": 0.8623, + "step": 459 + }, + { + "epoch": 0.09279835585521691, + "grad_norm": 0.9438928961753845, + "learning_rate": 9.897076000210528e-06, + "loss": 0.7484, + "step": 460 + }, + { + "epoch": 0.0930000914114239, + "grad_norm": 0.42515015602111816, + "learning_rate": 9.896415343978982e-06, + "loss": 0.7168, + "step": 461 + }, + { + "epoch": 0.0932018269676309, + "grad_norm": 0.8145204782485962, + "learning_rate": 9.895752596382916e-06, + "loss": 0.7254, + "step": 462 + }, + { + "epoch": 0.0934035625238379, + "grad_norm": 0.9704925417900085, + "learning_rate": 9.895087757705406e-06, + "loss": 0.741, + "step": 463 + }, + { + "epoch": 0.09360529808004489, + "grad_norm": 0.5286685824394226, + "learning_rate": 9.894420828230416e-06, + "loss": 0.745, + "step": 464 + }, + { + "epoch": 0.09380703363625188, + "grad_norm": 1.0488917827606201, + "learning_rate": 9.893751808242805e-06, + "loss": 0.702, + "step": 465 + }, + { + "epoch": 0.09400876919245887, + "grad_norm": 0.8315796256065369, + "learning_rate": 9.89308069802833e-06, + "loss": 0.9462, + "step": 466 + }, + { + "epoch": 0.09421050474866587, + "grad_norm": 0.5982228517532349, + "learning_rate": 9.892407497873633e-06, + "loss": 0.7748, + "step": 467 + }, + { + "epoch": 0.09441224030487286, + "grad_norm": 11.72299575805664, + "learning_rate": 9.891732208066254e-06, + "loss": 0.722, + "step": 468 + }, + { + "epoch": 0.09461397586107985, + "grad_norm": 0.6634166836738586, + "learning_rate": 9.891054828894624e-06, + "loss": 0.6943, + "step": 469 + }, + { + "epoch": 0.09481571141728684, + "grad_norm": 0.5095616579055786, + "learning_rate": 9.890375360648065e-06, + "loss": 0.6962, + "step": 470 + }, + { + "epoch": 0.09501744697349383, + "grad_norm": 0.49047261476516724, + "learning_rate": 9.889693803616793e-06, + "loss": 0.969, + "step": 471 + }, + { + "epoch": 0.09521918252970084, + "grad_norm": 1.026039958000183, + "learning_rate": 9.889010158091917e-06, + "loss": 0.7256, + "step": 472 + }, + { + "epoch": 0.09542091808590783, + "grad_norm": 0.9997348189353943, + "learning_rate": 9.888324424365435e-06, + "loss": 0.7145, + "step": 473 + }, + { + "epoch": 0.09562265364211482, + "grad_norm": 2.0080642700195312, + "learning_rate": 9.88763660273024e-06, + "loss": 1.0393, + "step": 474 + }, + { + "epoch": 0.09582438919832181, + "grad_norm": 0.9626255631446838, + "learning_rate": 9.886946693480114e-06, + "loss": 0.749, + "step": 475 + }, + { + "epoch": 0.0960261247545288, + "grad_norm": 0.7559981346130371, + "learning_rate": 9.886254696909733e-06, + "loss": 0.7095, + "step": 476 + }, + { + "epoch": 0.0962278603107358, + "grad_norm": 0.5458871722221375, + "learning_rate": 9.885560613314664e-06, + "loss": 0.7313, + "step": 477 + }, + { + "epoch": 0.09642959586694279, + "grad_norm": 0.4658026397228241, + "learning_rate": 9.884864442991364e-06, + "loss": 0.7166, + "step": 478 + }, + { + "epoch": 0.09663133142314978, + "grad_norm": 0.5284778475761414, + "learning_rate": 9.884166186237185e-06, + "loss": 0.821, + "step": 479 + }, + { + "epoch": 0.09683306697935679, + "grad_norm": 1.8951740264892578, + "learning_rate": 9.883465843350364e-06, + "loss": 0.6991, + "step": 480 + }, + { + "epoch": 0.09703480253556378, + "grad_norm": 0.44958412647247314, + "learning_rate": 9.882763414630033e-06, + "loss": 0.7198, + "step": 481 + }, + { + "epoch": 0.09723653809177077, + "grad_norm": 0.8649196624755859, + "learning_rate": 9.882058900376218e-06, + "loss": 0.7282, + "step": 482 + }, + { + "epoch": 0.09743827364797776, + "grad_norm": 1.0441064834594727, + "learning_rate": 9.881352300889825e-06, + "loss": 0.8577, + "step": 483 + }, + { + "epoch": 0.09764000920418475, + "grad_norm": 0.6799788475036621, + "learning_rate": 9.880643616472667e-06, + "loss": 0.715, + "step": 484 + }, + { + "epoch": 0.09784174476039174, + "grad_norm": 0.4560524523258209, + "learning_rate": 9.879932847427432e-06, + "loss": 0.7117, + "step": 485 + }, + { + "epoch": 0.09804348031659874, + "grad_norm": 0.7565090656280518, + "learning_rate": 9.879219994057706e-06, + "loss": 0.7229, + "step": 486 + }, + { + "epoch": 0.09824521587280573, + "grad_norm": 0.5884823203086853, + "learning_rate": 9.878505056667967e-06, + "loss": 0.7408, + "step": 487 + }, + { + "epoch": 0.09844695142901272, + "grad_norm": 0.7502923011779785, + "learning_rate": 9.877788035563577e-06, + "loss": 0.7072, + "step": 488 + }, + { + "epoch": 0.09864868698521972, + "grad_norm": 0.611727774143219, + "learning_rate": 9.877068931050792e-06, + "loss": 0.9156, + "step": 489 + }, + { + "epoch": 0.09885042254142672, + "grad_norm": 0.5064416527748108, + "learning_rate": 9.876347743436758e-06, + "loss": 0.7025, + "step": 490 + }, + { + "epoch": 0.09905215809763371, + "grad_norm": 0.6777665615081787, + "learning_rate": 9.875624473029508e-06, + "loss": 0.6648, + "step": 491 + }, + { + "epoch": 0.0992538936538407, + "grad_norm": 2.0562150478363037, + "learning_rate": 9.874899120137968e-06, + "loss": 0.7049, + "step": 492 + }, + { + "epoch": 0.09945562921004769, + "grad_norm": 3.049912214279175, + "learning_rate": 9.874171685071949e-06, + "loss": 0.8232, + "step": 493 + }, + { + "epoch": 0.09965736476625468, + "grad_norm": 1.2710695266723633, + "learning_rate": 9.873442168142158e-06, + "loss": 0.8825, + "step": 494 + }, + { + "epoch": 0.09985910032246167, + "grad_norm": 0.7639725208282471, + "learning_rate": 9.872710569660186e-06, + "loss": 0.7314, + "step": 495 + }, + { + "epoch": 0.10006083587866867, + "grad_norm": 0.9314205646514893, + "learning_rate": 9.871976889938514e-06, + "loss": 0.7111, + "step": 496 + }, + { + "epoch": 0.10026257143487566, + "grad_norm": 0.8991636037826538, + "learning_rate": 9.871241129290511e-06, + "loss": 0.7147, + "step": 497 + }, + { + "epoch": 0.10046430699108266, + "grad_norm": 1.5229334831237793, + "learning_rate": 9.870503288030441e-06, + "loss": 0.7507, + "step": 498 + }, + { + "epoch": 0.10066604254728966, + "grad_norm": 1.4551448822021484, + "learning_rate": 9.869763366473447e-06, + "loss": 0.7372, + "step": 499 + }, + { + "epoch": 0.10086777810349665, + "grad_norm": 2.448296070098877, + "learning_rate": 9.869021364935567e-06, + "loss": 0.6818, + "step": 500 + }, + { + "epoch": 0.10106951365970364, + "grad_norm": 0.8766042590141296, + "learning_rate": 9.868277283733725e-06, + "loss": 0.7068, + "step": 501 + }, + { + "epoch": 0.10127124921591063, + "grad_norm": 0.6733140349388123, + "learning_rate": 9.867531123185738e-06, + "loss": 0.7211, + "step": 502 + }, + { + "epoch": 0.10147298477211762, + "grad_norm": 0.904449999332428, + "learning_rate": 9.866782883610302e-06, + "loss": 0.8696, + "step": 503 + }, + { + "epoch": 0.10167472032832461, + "grad_norm": 0.6029015779495239, + "learning_rate": 9.86603256532701e-06, + "loss": 0.7392, + "step": 504 + }, + { + "epoch": 0.1018764558845316, + "grad_norm": 0.4247641861438751, + "learning_rate": 9.865280168656337e-06, + "loss": 0.683, + "step": 505 + }, + { + "epoch": 0.10207819144073861, + "grad_norm": 0.6710717678070068, + "learning_rate": 9.864525693919648e-06, + "loss": 0.7417, + "step": 506 + }, + { + "epoch": 0.1022799269969456, + "grad_norm": 0.6687069535255432, + "learning_rate": 9.863769141439199e-06, + "loss": 0.8465, + "step": 507 + }, + { + "epoch": 0.1024816625531526, + "grad_norm": 0.9954400062561035, + "learning_rate": 9.863010511538124e-06, + "loss": 0.6569, + "step": 508 + }, + { + "epoch": 0.10268339810935959, + "grad_norm": 0.5977325439453125, + "learning_rate": 9.862249804540453e-06, + "loss": 0.7459, + "step": 509 + }, + { + "epoch": 0.10288513366556658, + "grad_norm": 0.7987101674079895, + "learning_rate": 9.861487020771103e-06, + "loss": 0.7721, + "step": 510 + }, + { + "epoch": 0.10308686922177357, + "grad_norm": 1.4089206457138062, + "learning_rate": 9.860722160555872e-06, + "loss": 0.7118, + "step": 511 + }, + { + "epoch": 0.10328860477798056, + "grad_norm": 0.8175768852233887, + "learning_rate": 9.859955224221446e-06, + "loss": 0.8256, + "step": 512 + }, + { + "epoch": 0.10349034033418755, + "grad_norm": 0.778608500957489, + "learning_rate": 9.859186212095405e-06, + "loss": 0.6965, + "step": 513 + }, + { + "epoch": 0.10369207589039454, + "grad_norm": 0.4597846269607544, + "learning_rate": 9.858415124506211e-06, + "loss": 0.7308, + "step": 514 + }, + { + "epoch": 0.10389381144660155, + "grad_norm": 0.8675187826156616, + "learning_rate": 9.857641961783207e-06, + "loss": 0.6938, + "step": 515 + }, + { + "epoch": 0.10409554700280854, + "grad_norm": 1.7470319271087646, + "learning_rate": 9.856866724256634e-06, + "loss": 0.6957, + "step": 516 + }, + { + "epoch": 0.10429728255901553, + "grad_norm": 0.8632996082305908, + "learning_rate": 9.856089412257605e-06, + "loss": 0.8682, + "step": 517 + }, + { + "epoch": 0.10449901811522253, + "grad_norm": 1.298071026802063, + "learning_rate": 9.855310026118132e-06, + "loss": 0.9162, + "step": 518 + }, + { + "epoch": 0.10470075367142952, + "grad_norm": 0.5649451613426208, + "learning_rate": 9.854528566171106e-06, + "loss": 0.7407, + "step": 519 + }, + { + "epoch": 0.10490248922763651, + "grad_norm": 3.4036266803741455, + "learning_rate": 9.853745032750309e-06, + "loss": 0.7086, + "step": 520 + }, + { + "epoch": 0.1051042247838435, + "grad_norm": 0.46299856901168823, + "learning_rate": 9.852959426190399e-06, + "loss": 0.7751, + "step": 521 + }, + { + "epoch": 0.10530596034005049, + "grad_norm": 1.2342002391815186, + "learning_rate": 9.852171746826928e-06, + "loss": 0.76, + "step": 522 + }, + { + "epoch": 0.10550769589625748, + "grad_norm": 2.244950771331787, + "learning_rate": 9.85138199499633e-06, + "loss": 0.6863, + "step": 523 + }, + { + "epoch": 0.10570943145246449, + "grad_norm": 2.6959030628204346, + "learning_rate": 9.850590171035928e-06, + "loss": 0.7939, + "step": 524 + }, + { + "epoch": 0.10591116700867148, + "grad_norm": 2.4526679515838623, + "learning_rate": 9.849796275283925e-06, + "loss": 1.0101, + "step": 525 + }, + { + "epoch": 0.10611290256487847, + "grad_norm": 0.7242956161499023, + "learning_rate": 9.849000308079412e-06, + "loss": 0.7311, + "step": 526 + }, + { + "epoch": 0.10631463812108546, + "grad_norm": 0.6420136094093323, + "learning_rate": 9.84820226976236e-06, + "loss": 0.7165, + "step": 527 + }, + { + "epoch": 0.10651637367729246, + "grad_norm": 0.5252360105514526, + "learning_rate": 9.847402160673634e-06, + "loss": 0.702, + "step": 528 + }, + { + "epoch": 0.10671810923349945, + "grad_norm": 2.2276649475097656, + "learning_rate": 9.846599981154975e-06, + "loss": 0.8515, + "step": 529 + }, + { + "epoch": 0.10691984478970644, + "grad_norm": 1.01405668258667, + "learning_rate": 9.84579573154901e-06, + "loss": 0.7093, + "step": 530 + }, + { + "epoch": 0.10712158034591343, + "grad_norm": 0.48206350207328796, + "learning_rate": 9.844989412199254e-06, + "loss": 0.7038, + "step": 531 + }, + { + "epoch": 0.10732331590212044, + "grad_norm": 0.7908836007118225, + "learning_rate": 9.844181023450101e-06, + "loss": 0.7945, + "step": 532 + }, + { + "epoch": 0.10752505145832743, + "grad_norm": 0.42727962136268616, + "learning_rate": 9.843370565646833e-06, + "loss": 0.6819, + "step": 533 + }, + { + "epoch": 0.10772678701453442, + "grad_norm": 0.5153601765632629, + "learning_rate": 9.842558039135612e-06, + "loss": 0.7405, + "step": 534 + }, + { + "epoch": 0.10792852257074141, + "grad_norm": 0.7418642640113831, + "learning_rate": 9.841743444263489e-06, + "loss": 0.7064, + "step": 535 + }, + { + "epoch": 0.1081302581269484, + "grad_norm": 0.6575508117675781, + "learning_rate": 9.84092678137839e-06, + "loss": 0.9104, + "step": 536 + }, + { + "epoch": 0.1083319936831554, + "grad_norm": 1.157667875289917, + "learning_rate": 9.840108050829135e-06, + "loss": 0.7568, + "step": 537 + }, + { + "epoch": 0.10853372923936239, + "grad_norm": 0.6192544102668762, + "learning_rate": 9.839287252965418e-06, + "loss": 0.8135, + "step": 538 + }, + { + "epoch": 0.10873546479556938, + "grad_norm": 0.5880550742149353, + "learning_rate": 9.838464388137819e-06, + "loss": 0.7347, + "step": 539 + }, + { + "epoch": 0.10893720035177637, + "grad_norm": 0.7463414669036865, + "learning_rate": 9.837639456697802e-06, + "loss": 0.8709, + "step": 540 + }, + { + "epoch": 0.10913893590798338, + "grad_norm": 0.644129753112793, + "learning_rate": 9.836812458997715e-06, + "loss": 0.7348, + "step": 541 + }, + { + "epoch": 0.10934067146419037, + "grad_norm": 1.0127805471420288, + "learning_rate": 9.835983395390784e-06, + "loss": 0.8324, + "step": 542 + }, + { + "epoch": 0.10954240702039736, + "grad_norm": 0.7811554074287415, + "learning_rate": 9.835152266231121e-06, + "loss": 0.8713, + "step": 543 + }, + { + "epoch": 0.10974414257660435, + "grad_norm": 1.3340731859207153, + "learning_rate": 9.834319071873719e-06, + "loss": 0.8607, + "step": 544 + }, + { + "epoch": 0.10994587813281134, + "grad_norm": 0.8164252042770386, + "learning_rate": 9.833483812674453e-06, + "loss": 0.7775, + "step": 545 + }, + { + "epoch": 0.11014761368901833, + "grad_norm": 0.535208523273468, + "learning_rate": 9.832646488990081e-06, + "loss": 0.7571, + "step": 546 + }, + { + "epoch": 0.11034934924522533, + "grad_norm": 0.4087328016757965, + "learning_rate": 9.831807101178242e-06, + "loss": 0.7241, + "step": 547 + }, + { + "epoch": 0.11055108480143232, + "grad_norm": 0.5368490219116211, + "learning_rate": 9.830965649597455e-06, + "loss": 0.8102, + "step": 548 + }, + { + "epoch": 0.11075282035763931, + "grad_norm": 1.0589547157287598, + "learning_rate": 9.830122134607125e-06, + "loss": 0.7413, + "step": 549 + }, + { + "epoch": 0.11095455591384631, + "grad_norm": 0.6054285168647766, + "learning_rate": 9.82927655656753e-06, + "loss": 0.7119, + "step": 550 + }, + { + "epoch": 0.1111562914700533, + "grad_norm": 0.45449957251548767, + "learning_rate": 9.828428915839843e-06, + "loss": 0.8459, + "step": 551 + }, + { + "epoch": 0.1113580270262603, + "grad_norm": 0.6329957246780396, + "learning_rate": 9.827579212786103e-06, + "loss": 1.019, + "step": 552 + }, + { + "epoch": 0.11155976258246729, + "grad_norm": 0.724990963935852, + "learning_rate": 9.826727447769237e-06, + "loss": 0.6923, + "step": 553 + }, + { + "epoch": 0.11176149813867428, + "grad_norm": 0.9215730428695679, + "learning_rate": 9.825873621153055e-06, + "loss": 0.7808, + "step": 554 + }, + { + "epoch": 0.11196323369488127, + "grad_norm": 0.5018446445465088, + "learning_rate": 9.825017733302241e-06, + "loss": 0.7438, + "step": 555 + }, + { + "epoch": 0.11216496925108826, + "grad_norm": 0.8885820508003235, + "learning_rate": 9.82415978458237e-06, + "loss": 0.7674, + "step": 556 + }, + { + "epoch": 0.11236670480729526, + "grad_norm": 0.8014993071556091, + "learning_rate": 9.823299775359882e-06, + "loss": 0.7115, + "step": 557 + }, + { + "epoch": 0.11256844036350226, + "grad_norm": 0.4157255291938782, + "learning_rate": 9.82243770600211e-06, + "loss": 0.7158, + "step": 558 + }, + { + "epoch": 0.11277017591970925, + "grad_norm": 0.8874197006225586, + "learning_rate": 9.821573576877264e-06, + "loss": 0.7286, + "step": 559 + }, + { + "epoch": 0.11297191147591625, + "grad_norm": 0.4824160933494568, + "learning_rate": 9.820707388354428e-06, + "loss": 0.6904, + "step": 560 + }, + { + "epoch": 0.11317364703212324, + "grad_norm": 0.44073861837387085, + "learning_rate": 9.819839140803571e-06, + "loss": 0.7417, + "step": 561 + }, + { + "epoch": 0.11337538258833023, + "grad_norm": 0.7926369905471802, + "learning_rate": 9.818968834595544e-06, + "loss": 0.7159, + "step": 562 + }, + { + "epoch": 0.11357711814453722, + "grad_norm": 0.503059446811676, + "learning_rate": 9.818096470102067e-06, + "loss": 0.7917, + "step": 563 + }, + { + "epoch": 0.11377885370074421, + "grad_norm": 0.7098116874694824, + "learning_rate": 9.817222047695751e-06, + "loss": 0.7545, + "step": 564 + }, + { + "epoch": 0.1139805892569512, + "grad_norm": 1.5219429731369019, + "learning_rate": 9.816345567750078e-06, + "loss": 0.8778, + "step": 565 + }, + { + "epoch": 0.1141823248131582, + "grad_norm": 0.6335718631744385, + "learning_rate": 9.815467030639414e-06, + "loss": 0.8104, + "step": 566 + }, + { + "epoch": 0.1143840603693652, + "grad_norm": 1.10847806930542, + "learning_rate": 9.814586436738998e-06, + "loss": 0.7334, + "step": 567 + }, + { + "epoch": 0.11458579592557219, + "grad_norm": 0.5866941809654236, + "learning_rate": 9.81370378642495e-06, + "loss": 0.7219, + "step": 568 + }, + { + "epoch": 0.11478753148177918, + "grad_norm": 0.7550913691520691, + "learning_rate": 9.812819080074274e-06, + "loss": 0.8763, + "step": 569 + }, + { + "epoch": 0.11498926703798618, + "grad_norm": 0.46390995383262634, + "learning_rate": 9.811932318064843e-06, + "loss": 0.8953, + "step": 570 + }, + { + "epoch": 0.11519100259419317, + "grad_norm": 0.983810544013977, + "learning_rate": 9.811043500775415e-06, + "loss": 0.8389, + "step": 571 + }, + { + "epoch": 0.11539273815040016, + "grad_norm": 0.5378410220146179, + "learning_rate": 9.81015262858562e-06, + "loss": 0.8677, + "step": 572 + }, + { + "epoch": 0.11559447370660715, + "grad_norm": 0.7762982845306396, + "learning_rate": 9.80925970187597e-06, + "loss": 0.8762, + "step": 573 + }, + { + "epoch": 0.11579620926281414, + "grad_norm": 0.5548637509346008, + "learning_rate": 9.808364721027854e-06, + "loss": 0.6611, + "step": 574 + }, + { + "epoch": 0.11599794481902115, + "grad_norm": 0.6933954954147339, + "learning_rate": 9.807467686423536e-06, + "loss": 0.7337, + "step": 575 + }, + { + "epoch": 0.11619968037522814, + "grad_norm": 1.0687283277511597, + "learning_rate": 9.80656859844616e-06, + "loss": 0.8725, + "step": 576 + }, + { + "epoch": 0.11640141593143513, + "grad_norm": 1.4584949016571045, + "learning_rate": 9.805667457479747e-06, + "loss": 0.7366, + "step": 577 + }, + { + "epoch": 0.11660315148764212, + "grad_norm": 1.2576607465744019, + "learning_rate": 9.80476426390919e-06, + "loss": 0.7085, + "step": 578 + }, + { + "epoch": 0.11680488704384911, + "grad_norm": 0.730647087097168, + "learning_rate": 9.803859018120265e-06, + "loss": 0.7253, + "step": 579 + }, + { + "epoch": 0.1170066226000561, + "grad_norm": 2.792001724243164, + "learning_rate": 9.802951720499623e-06, + "loss": 0.7478, + "step": 580 + }, + { + "epoch": 0.1172083581562631, + "grad_norm": 1.430155634880066, + "learning_rate": 9.80204237143479e-06, + "loss": 0.7854, + "step": 581 + }, + { + "epoch": 0.11741009371247009, + "grad_norm": 2.819394826889038, + "learning_rate": 9.801130971314165e-06, + "loss": 0.8178, + "step": 582 + }, + { + "epoch": 0.11761182926867708, + "grad_norm": 0.820543646812439, + "learning_rate": 9.800217520527031e-06, + "loss": 0.8311, + "step": 583 + }, + { + "epoch": 0.11781356482488409, + "grad_norm": 1.3293335437774658, + "learning_rate": 9.799302019463541e-06, + "loss": 0.6847, + "step": 584 + }, + { + "epoch": 0.11801530038109108, + "grad_norm": 0.6411197185516357, + "learning_rate": 9.798384468514725e-06, + "loss": 0.872, + "step": 585 + }, + { + "epoch": 0.11821703593729807, + "grad_norm": 0.6462484002113342, + "learning_rate": 9.797464868072489e-06, + "loss": 0.8846, + "step": 586 + }, + { + "epoch": 0.11841877149350506, + "grad_norm": 0.8242557048797607, + "learning_rate": 9.796543218529612e-06, + "loss": 0.8605, + "step": 587 + }, + { + "epoch": 0.11862050704971205, + "grad_norm": 0.7354720830917358, + "learning_rate": 9.795619520279754e-06, + "loss": 0.6761, + "step": 588 + }, + { + "epoch": 0.11882224260591905, + "grad_norm": 1.7302488088607788, + "learning_rate": 9.794693773717445e-06, + "loss": 0.9048, + "step": 589 + }, + { + "epoch": 0.11902397816212604, + "grad_norm": 0.757748544216156, + "learning_rate": 9.79376597923809e-06, + "loss": 0.7343, + "step": 590 + }, + { + "epoch": 0.11922571371833303, + "grad_norm": 0.8567695021629333, + "learning_rate": 9.792836137237973e-06, + "loss": 0.7886, + "step": 591 + }, + { + "epoch": 0.11942744927454002, + "grad_norm": 0.5145772695541382, + "learning_rate": 9.791904248114247e-06, + "loss": 0.6968, + "step": 592 + }, + { + "epoch": 0.11962918483074703, + "grad_norm": 0.6528315544128418, + "learning_rate": 9.790970312264943e-06, + "loss": 0.6785, + "step": 593 + }, + { + "epoch": 0.11983092038695402, + "grad_norm": 0.6215816140174866, + "learning_rate": 9.790034330088964e-06, + "loss": 0.7053, + "step": 594 + }, + { + "epoch": 0.12003265594316101, + "grad_norm": 0.8241238594055176, + "learning_rate": 9.78909630198609e-06, + "loss": 0.6551, + "step": 595 + }, + { + "epoch": 0.120234391499368, + "grad_norm": 0.5994774103164673, + "learning_rate": 9.788156228356969e-06, + "loss": 0.7186, + "step": 596 + }, + { + "epoch": 0.12043612705557499, + "grad_norm": 1.1751370429992676, + "learning_rate": 9.787214109603134e-06, + "loss": 0.7103, + "step": 597 + }, + { + "epoch": 0.12063786261178198, + "grad_norm": 1.848848581314087, + "learning_rate": 9.786269946126976e-06, + "loss": 0.8232, + "step": 598 + }, + { + "epoch": 0.12083959816798898, + "grad_norm": 2.087062120437622, + "learning_rate": 9.785323738331773e-06, + "loss": 0.7161, + "step": 599 + }, + { + "epoch": 0.12104133372419597, + "grad_norm": 2.3562679290771484, + "learning_rate": 9.78437548662167e-06, + "loss": 0.7386, + "step": 600 + }, + { + "epoch": 0.12124306928040297, + "grad_norm": 2.2806203365325928, + "learning_rate": 9.783425191401686e-06, + "loss": 0.8851, + "step": 601 + }, + { + "epoch": 0.12144480483660997, + "grad_norm": 1.4156206846237183, + "learning_rate": 9.78247285307771e-06, + "loss": 0.7962, + "step": 602 + }, + { + "epoch": 0.12164654039281696, + "grad_norm": 0.6872971653938293, + "learning_rate": 9.781518472056507e-06, + "loss": 0.7202, + "step": 603 + }, + { + "epoch": 0.12184827594902395, + "grad_norm": 0.5104350447654724, + "learning_rate": 9.780562048745715e-06, + "loss": 0.9973, + "step": 604 + }, + { + "epoch": 0.12205001150523094, + "grad_norm": 0.7299901247024536, + "learning_rate": 9.779603583553842e-06, + "loss": 0.7373, + "step": 605 + }, + { + "epoch": 0.12225174706143793, + "grad_norm": 0.38602492213249207, + "learning_rate": 9.77864307689027e-06, + "loss": 0.7382, + "step": 606 + }, + { + "epoch": 0.12245348261764492, + "grad_norm": 0.5797430276870728, + "learning_rate": 9.777680529165251e-06, + "loss": 0.7304, + "step": 607 + }, + { + "epoch": 0.12265521817385192, + "grad_norm": 0.6225494742393494, + "learning_rate": 9.776715940789911e-06, + "loss": 0.7016, + "step": 608 + }, + { + "epoch": 0.12285695373005891, + "grad_norm": 0.610600471496582, + "learning_rate": 9.775749312176249e-06, + "loss": 0.6823, + "step": 609 + }, + { + "epoch": 0.12305868928626591, + "grad_norm": 0.5466820597648621, + "learning_rate": 9.774780643737126e-06, + "loss": 0.7635, + "step": 610 + }, + { + "epoch": 0.1232604248424729, + "grad_norm": 0.6284288167953491, + "learning_rate": 9.773809935886287e-06, + "loss": 0.706, + "step": 611 + }, + { + "epoch": 0.1234621603986799, + "grad_norm": 0.5985273122787476, + "learning_rate": 9.77283718903834e-06, + "loss": 0.7338, + "step": 612 + }, + { + "epoch": 0.12366389595488689, + "grad_norm": 0.6078040599822998, + "learning_rate": 9.771862403608765e-06, + "loss": 0.7114, + "step": 613 + }, + { + "epoch": 0.12386563151109388, + "grad_norm": 0.5801209807395935, + "learning_rate": 9.770885580013917e-06, + "loss": 0.7514, + "step": 614 + }, + { + "epoch": 0.12406736706730087, + "grad_norm": 0.5198795199394226, + "learning_rate": 9.769906718671017e-06, + "loss": 0.6857, + "step": 615 + }, + { + "epoch": 0.12426910262350786, + "grad_norm": 0.8949345946311951, + "learning_rate": 9.768925819998157e-06, + "loss": 0.7517, + "step": 616 + }, + { + "epoch": 0.12447083817971485, + "grad_norm": 0.8734506964683533, + "learning_rate": 9.7679428844143e-06, + "loss": 0.8156, + "step": 617 + }, + { + "epoch": 0.12467257373592185, + "grad_norm": 0.8583992123603821, + "learning_rate": 9.766957912339281e-06, + "loss": 0.751, + "step": 618 + }, + { + "epoch": 0.12487430929212885, + "grad_norm": 0.5585662126541138, + "learning_rate": 9.7659709041938e-06, + "loss": 0.6754, + "step": 619 + }, + { + "epoch": 0.12507604484833584, + "grad_norm": 0.5230299830436707, + "learning_rate": 9.764981860399432e-06, + "loss": 0.6779, + "step": 620 + }, + { + "epoch": 0.12527778040454282, + "grad_norm": 0.48600155115127563, + "learning_rate": 9.763990781378616e-06, + "loss": 0.8704, + "step": 621 + }, + { + "epoch": 0.12547951596074983, + "grad_norm": 4.001948833465576, + "learning_rate": 9.762997667554666e-06, + "loss": 0.8584, + "step": 622 + }, + { + "epoch": 0.12568125151695683, + "grad_norm": 0.4935590922832489, + "learning_rate": 9.762002519351761e-06, + "loss": 0.9628, + "step": 623 + }, + { + "epoch": 0.1258829870731638, + "grad_norm": 5.8011274337768555, + "learning_rate": 9.76100533719495e-06, + "loss": 0.7344, + "step": 624 + }, + { + "epoch": 0.12608472262937082, + "grad_norm": 1.531157374382019, + "learning_rate": 9.760006121510152e-06, + "loss": 1.3136, + "step": 625 + }, + { + "epoch": 0.1262864581855778, + "grad_norm": 0.5714470744132996, + "learning_rate": 9.759004872724153e-06, + "loss": 0.7915, + "step": 626 + }, + { + "epoch": 0.1264881937417848, + "grad_norm": 1.6371656656265259, + "learning_rate": 9.758001591264608e-06, + "loss": 0.7265, + "step": 627 + }, + { + "epoch": 0.12668992929799178, + "grad_norm": 1.5811271667480469, + "learning_rate": 9.75699627756004e-06, + "loss": 0.6997, + "step": 628 + }, + { + "epoch": 0.12689166485419878, + "grad_norm": 2.01772403717041, + "learning_rate": 9.755988932039842e-06, + "loss": 0.7678, + "step": 629 + }, + { + "epoch": 0.12709340041040576, + "grad_norm": 0.5429158210754395, + "learning_rate": 9.754979555134267e-06, + "loss": 0.7126, + "step": 630 + }, + { + "epoch": 0.12729513596661277, + "grad_norm": 0.8591412901878357, + "learning_rate": 9.753968147274448e-06, + "loss": 0.7246, + "step": 631 + }, + { + "epoch": 0.12749687152281977, + "grad_norm": 2.1179087162017822, + "learning_rate": 9.752954708892379e-06, + "loss": 0.7497, + "step": 632 + }, + { + "epoch": 0.12769860707902675, + "grad_norm": 0.6513729095458984, + "learning_rate": 9.751939240420916e-06, + "loss": 0.7581, + "step": 633 + }, + { + "epoch": 0.12790034263523375, + "grad_norm": 0.41945937275886536, + "learning_rate": 9.750921742293794e-06, + "loss": 0.9612, + "step": 634 + }, + { + "epoch": 0.12810207819144073, + "grad_norm": 0.6694250106811523, + "learning_rate": 9.749902214945602e-06, + "loss": 0.7349, + "step": 635 + }, + { + "epoch": 0.12830381374764774, + "grad_norm": 0.6524680256843567, + "learning_rate": 9.748880658811806e-06, + "loss": 0.6788, + "step": 636 + }, + { + "epoch": 0.12850554930385472, + "grad_norm": 0.48333224654197693, + "learning_rate": 9.747857074328735e-06, + "loss": 0.6621, + "step": 637 + }, + { + "epoch": 0.12870728486006172, + "grad_norm": 0.4791509211063385, + "learning_rate": 9.746831461933581e-06, + "loss": 0.7244, + "step": 638 + }, + { + "epoch": 0.1289090204162687, + "grad_norm": 1.133988618850708, + "learning_rate": 9.745803822064409e-06, + "loss": 0.7146, + "step": 639 + }, + { + "epoch": 0.1291107559724757, + "grad_norm": 0.5031291246414185, + "learning_rate": 9.744774155160143e-06, + "loss": 0.9058, + "step": 640 + }, + { + "epoch": 0.1293124915286827, + "grad_norm": 0.5303350687026978, + "learning_rate": 9.743742461660577e-06, + "loss": 0.7332, + "step": 641 + }, + { + "epoch": 0.1295142270848897, + "grad_norm": 1.1249873638153076, + "learning_rate": 9.74270874200637e-06, + "loss": 0.9229, + "step": 642 + }, + { + "epoch": 0.1297159626410967, + "grad_norm": 2.115898609161377, + "learning_rate": 9.741672996639046e-06, + "loss": 0.6615, + "step": 643 + }, + { + "epoch": 0.12991769819730367, + "grad_norm": 0.8712553381919861, + "learning_rate": 9.740635226000994e-06, + "loss": 0.6996, + "step": 644 + }, + { + "epoch": 0.13011943375351068, + "grad_norm": 1.0483100414276123, + "learning_rate": 9.739595430535467e-06, + "loss": 0.7213, + "step": 645 + }, + { + "epoch": 0.13032116930971765, + "grad_norm": 0.5651170015335083, + "learning_rate": 9.738553610686586e-06, + "loss": 0.6623, + "step": 646 + }, + { + "epoch": 0.13052290486592466, + "grad_norm": 0.599432110786438, + "learning_rate": 9.737509766899333e-06, + "loss": 0.8433, + "step": 647 + }, + { + "epoch": 0.13072464042213164, + "grad_norm": 2.7517247200012207, + "learning_rate": 9.736463899619557e-06, + "loss": 0.6449, + "step": 648 + }, + { + "epoch": 0.13092637597833864, + "grad_norm": 1.9037710428237915, + "learning_rate": 9.73541600929397e-06, + "loss": 0.7139, + "step": 649 + }, + { + "epoch": 0.13112811153454565, + "grad_norm": 5.7445220947265625, + "learning_rate": 9.734366096370148e-06, + "loss": 0.6772, + "step": 650 + }, + { + "epoch": 0.13132984709075263, + "grad_norm": 1.8100695610046387, + "learning_rate": 9.733314161296534e-06, + "loss": 0.704, + "step": 651 + }, + { + "epoch": 0.13153158264695963, + "grad_norm": 0.4549188017845154, + "learning_rate": 9.73226020452243e-06, + "loss": 0.8671, + "step": 652 + }, + { + "epoch": 0.1317333182031666, + "grad_norm": 0.832423210144043, + "learning_rate": 9.731204226498006e-06, + "loss": 1.0424, + "step": 653 + }, + { + "epoch": 0.13193505375937362, + "grad_norm": 0.6530823707580566, + "learning_rate": 9.730146227674289e-06, + "loss": 0.7243, + "step": 654 + }, + { + "epoch": 0.1321367893155806, + "grad_norm": 0.6458786129951477, + "learning_rate": 9.729086208503174e-06, + "loss": 0.6997, + "step": 655 + }, + { + "epoch": 0.1323385248717876, + "grad_norm": 1.432695746421814, + "learning_rate": 9.72802416943742e-06, + "loss": 0.9292, + "step": 656 + }, + { + "epoch": 0.13254026042799458, + "grad_norm": 0.6455374360084534, + "learning_rate": 9.726960110930648e-06, + "loss": 0.7323, + "step": 657 + }, + { + "epoch": 0.13274199598420158, + "grad_norm": 0.4740164279937744, + "learning_rate": 9.725894033437335e-06, + "loss": 0.721, + "step": 658 + }, + { + "epoch": 0.1329437315404086, + "grad_norm": 0.73670494556427, + "learning_rate": 9.724825937412832e-06, + "loss": 0.676, + "step": 659 + }, + { + "epoch": 0.13314546709661557, + "grad_norm": 0.5248075723648071, + "learning_rate": 9.723755823313342e-06, + "loss": 0.694, + "step": 660 + }, + { + "epoch": 0.13334720265282257, + "grad_norm": 0.4952867329120636, + "learning_rate": 9.722683691595933e-06, + "loss": 0.7452, + "step": 661 + }, + { + "epoch": 0.13354893820902955, + "grad_norm": 0.5512588024139404, + "learning_rate": 9.72160954271854e-06, + "loss": 0.666, + "step": 662 + }, + { + "epoch": 0.13375067376523656, + "grad_norm": 1.7721054553985596, + "learning_rate": 9.720533377139949e-06, + "loss": 0.8351, + "step": 663 + }, + { + "epoch": 0.13395240932144353, + "grad_norm": 0.647897481918335, + "learning_rate": 9.719455195319819e-06, + "loss": 0.7144, + "step": 664 + }, + { + "epoch": 0.13415414487765054, + "grad_norm": 0.5909448862075806, + "learning_rate": 9.718374997718662e-06, + "loss": 0.6794, + "step": 665 + }, + { + "epoch": 0.13435588043385754, + "grad_norm": 0.6904434561729431, + "learning_rate": 9.717292784797854e-06, + "loss": 0.7908, + "step": 666 + }, + { + "epoch": 0.13455761599006452, + "grad_norm": 1.919708013534546, + "learning_rate": 9.716208557019632e-06, + "loss": 0.7146, + "step": 667 + }, + { + "epoch": 0.13475935154627153, + "grad_norm": 0.973901093006134, + "learning_rate": 9.715122314847093e-06, + "loss": 0.7182, + "step": 668 + }, + { + "epoch": 0.1349610871024785, + "grad_norm": 0.5195472836494446, + "learning_rate": 9.714034058744193e-06, + "loss": 0.8965, + "step": 669 + }, + { + "epoch": 0.1351628226586855, + "grad_norm": 1.6600466966629028, + "learning_rate": 9.712943789175753e-06, + "loss": 0.7199, + "step": 670 + }, + { + "epoch": 0.1353645582148925, + "grad_norm": 0.6698784828186035, + "learning_rate": 9.711851506607446e-06, + "loss": 0.7798, + "step": 671 + }, + { + "epoch": 0.1355662937710995, + "grad_norm": 0.6091451644897461, + "learning_rate": 9.710757211505812e-06, + "loss": 0.6942, + "step": 672 + }, + { + "epoch": 0.13576802932730647, + "grad_norm": 0.5566756129264832, + "learning_rate": 9.70966090433825e-06, + "loss": 0.7163, + "step": 673 + }, + { + "epoch": 0.13596976488351348, + "grad_norm": 0.8096559047698975, + "learning_rate": 9.708562585573013e-06, + "loss": 0.7149, + "step": 674 + }, + { + "epoch": 0.13617150043972048, + "grad_norm": 0.6540467739105225, + "learning_rate": 9.707462255679217e-06, + "loss": 0.7077, + "step": 675 + }, + { + "epoch": 0.13637323599592746, + "grad_norm": 0.3687657415866852, + "learning_rate": 9.706359915126838e-06, + "loss": 0.6766, + "step": 676 + }, + { + "epoch": 0.13657497155213447, + "grad_norm": 0.8468163013458252, + "learning_rate": 9.70525556438671e-06, + "loss": 0.7108, + "step": 677 + }, + { + "epoch": 0.13677670710834144, + "grad_norm": 0.6615922451019287, + "learning_rate": 9.704149203930522e-06, + "loss": 0.7482, + "step": 678 + }, + { + "epoch": 0.13697844266454845, + "grad_norm": 0.4638476073741913, + "learning_rate": 9.703040834230828e-06, + "loss": 0.686, + "step": 679 + }, + { + "epoch": 0.13718017822075543, + "grad_norm": 0.7426740527153015, + "learning_rate": 9.701930455761036e-06, + "loss": 0.9151, + "step": 680 + }, + { + "epoch": 0.13738191377696243, + "grad_norm": 0.6113342046737671, + "learning_rate": 9.700818068995407e-06, + "loss": 1.0156, + "step": 681 + }, + { + "epoch": 0.1375836493331694, + "grad_norm": 0.7739665508270264, + "learning_rate": 9.699703674409074e-06, + "loss": 0.6889, + "step": 682 + }, + { + "epoch": 0.13778538488937642, + "grad_norm": 0.6738470196723938, + "learning_rate": 9.698587272478012e-06, + "loss": 0.7311, + "step": 683 + }, + { + "epoch": 0.13798712044558342, + "grad_norm": 0.7146970629692078, + "learning_rate": 9.697468863679065e-06, + "loss": 0.685, + "step": 684 + }, + { + "epoch": 0.1381888560017904, + "grad_norm": 0.555719792842865, + "learning_rate": 9.696348448489927e-06, + "loss": 1.0687, + "step": 685 + }, + { + "epoch": 0.1383905915579974, + "grad_norm": 0.5952572226524353, + "learning_rate": 9.695226027389154e-06, + "loss": 0.7047, + "step": 686 + }, + { + "epoch": 0.13859232711420438, + "grad_norm": 1.26414954662323, + "learning_rate": 9.69410160085615e-06, + "loss": 0.7015, + "step": 687 + }, + { + "epoch": 0.1387940626704114, + "grad_norm": 0.5881784558296204, + "learning_rate": 9.692975169371189e-06, + "loss": 0.7379, + "step": 688 + }, + { + "epoch": 0.13899579822661837, + "grad_norm": 0.9572243690490723, + "learning_rate": 9.69184673341539e-06, + "loss": 0.6954, + "step": 689 + }, + { + "epoch": 0.13919753378282537, + "grad_norm": 1.2255734205245972, + "learning_rate": 9.690716293470735e-06, + "loss": 0.6955, + "step": 690 + }, + { + "epoch": 0.13939926933903235, + "grad_norm": 0.5322736501693726, + "learning_rate": 9.689583850020058e-06, + "loss": 0.8187, + "step": 691 + }, + { + "epoch": 0.13960100489523936, + "grad_norm": 0.6318933367729187, + "learning_rate": 9.68844940354705e-06, + "loss": 0.6888, + "step": 692 + }, + { + "epoch": 0.13980274045144636, + "grad_norm": 1.0587413311004639, + "learning_rate": 9.687312954536255e-06, + "loss": 0.7381, + "step": 693 + }, + { + "epoch": 0.14000447600765334, + "grad_norm": 1.1156810522079468, + "learning_rate": 9.68617450347308e-06, + "loss": 0.6845, + "step": 694 + }, + { + "epoch": 0.14020621156386034, + "grad_norm": 1.1569520235061646, + "learning_rate": 9.685034050843779e-06, + "loss": 0.7295, + "step": 695 + }, + { + "epoch": 0.14040794712006732, + "grad_norm": 1.456122875213623, + "learning_rate": 9.683891597135462e-06, + "loss": 0.6988, + "step": 696 + }, + { + "epoch": 0.14060968267627433, + "grad_norm": 0.5524277687072754, + "learning_rate": 9.6827471428361e-06, + "loss": 0.7191, + "step": 697 + }, + { + "epoch": 0.1408114182324813, + "grad_norm": 0.667767345905304, + "learning_rate": 9.681600688434509e-06, + "loss": 0.6836, + "step": 698 + }, + { + "epoch": 0.1410131537886883, + "grad_norm": 0.445433109998703, + "learning_rate": 9.68045223442037e-06, + "loss": 0.815, + "step": 699 + }, + { + "epoch": 0.1412148893448953, + "grad_norm": 0.8379462957382202, + "learning_rate": 9.679301781284209e-06, + "loss": 0.9189, + "step": 700 + }, + { + "epoch": 0.1414166249011023, + "grad_norm": 0.7621182799339294, + "learning_rate": 9.67814932951741e-06, + "loss": 0.7113, + "step": 701 + }, + { + "epoch": 0.1416183604573093, + "grad_norm": 0.6904314160346985, + "learning_rate": 9.676994879612209e-06, + "loss": 0.722, + "step": 702 + }, + { + "epoch": 0.14182009601351628, + "grad_norm": 1.0283129215240479, + "learning_rate": 9.675838432061698e-06, + "loss": 0.6987, + "step": 703 + }, + { + "epoch": 0.14202183156972328, + "grad_norm": 1.1526720523834229, + "learning_rate": 9.674679987359822e-06, + "loss": 0.8583, + "step": 704 + }, + { + "epoch": 0.14222356712593026, + "grad_norm": 0.6819185018539429, + "learning_rate": 9.673519546001373e-06, + "loss": 0.9021, + "step": 705 + }, + { + "epoch": 0.14242530268213727, + "grad_norm": 1.3759137392044067, + "learning_rate": 9.672357108482005e-06, + "loss": 0.7268, + "step": 706 + }, + { + "epoch": 0.14262703823834424, + "grad_norm": 0.46242034435272217, + "learning_rate": 9.671192675298218e-06, + "loss": 0.6623, + "step": 707 + }, + { + "epoch": 0.14282877379455125, + "grad_norm": 22.558115005493164, + "learning_rate": 9.670026246947367e-06, + "loss": 0.6944, + "step": 708 + }, + { + "epoch": 0.14303050935075823, + "grad_norm": 1.0445126295089722, + "learning_rate": 9.668857823927658e-06, + "loss": 0.7426, + "step": 709 + }, + { + "epoch": 0.14323224490696523, + "grad_norm": 0.49592289328575134, + "learning_rate": 9.66768740673815e-06, + "loss": 0.7744, + "step": 710 + }, + { + "epoch": 0.14343398046317224, + "grad_norm": 0.9152998924255371, + "learning_rate": 9.666514995878755e-06, + "loss": 0.8277, + "step": 711 + }, + { + "epoch": 0.14363571601937922, + "grad_norm": 0.7208297848701477, + "learning_rate": 9.665340591850235e-06, + "loss": 0.8091, + "step": 712 + }, + { + "epoch": 0.14383745157558622, + "grad_norm": 0.45589447021484375, + "learning_rate": 9.664164195154199e-06, + "loss": 0.709, + "step": 713 + }, + { + "epoch": 0.1440391871317932, + "grad_norm": 0.45049798488616943, + "learning_rate": 9.662985806293115e-06, + "loss": 0.7353, + "step": 714 + }, + { + "epoch": 0.1442409226880002, + "grad_norm": 0.7140663862228394, + "learning_rate": 9.661805425770298e-06, + "loss": 0.7304, + "step": 715 + }, + { + "epoch": 0.14444265824420718, + "grad_norm": 1.5145859718322754, + "learning_rate": 9.660623054089913e-06, + "loss": 0.7908, + "step": 716 + }, + { + "epoch": 0.1446443938004142, + "grad_norm": 0.4583665430545807, + "learning_rate": 9.659438691756976e-06, + "loss": 0.7136, + "step": 717 + }, + { + "epoch": 0.1448461293566212, + "grad_norm": 0.868816077709198, + "learning_rate": 9.658252339277359e-06, + "loss": 0.7388, + "step": 718 + }, + { + "epoch": 0.14504786491282817, + "grad_norm": 0.7502593994140625, + "learning_rate": 9.65706399715777e-06, + "loss": 0.7955, + "step": 719 + }, + { + "epoch": 0.14524960046903518, + "grad_norm": 0.4568173885345459, + "learning_rate": 9.655873665905781e-06, + "loss": 0.66, + "step": 720 + }, + { + "epoch": 0.14545133602524216, + "grad_norm": 1.3116257190704346, + "learning_rate": 9.654681346029809e-06, + "loss": 0.7388, + "step": 721 + }, + { + "epoch": 0.14565307158144916, + "grad_norm": 0.8519835472106934, + "learning_rate": 9.653487038039116e-06, + "loss": 1.0086, + "step": 722 + }, + { + "epoch": 0.14585480713765614, + "grad_norm": 0.7730309367179871, + "learning_rate": 9.652290742443818e-06, + "loss": 0.7059, + "step": 723 + }, + { + "epoch": 0.14605654269386315, + "grad_norm": 1.275344967842102, + "learning_rate": 9.651092459754879e-06, + "loss": 0.7021, + "step": 724 + }, + { + "epoch": 0.14625827825007012, + "grad_norm": 0.962645411491394, + "learning_rate": 9.64989219048411e-06, + "loss": 0.799, + "step": 725 + }, + { + "epoch": 0.14646001380627713, + "grad_norm": 0.774117648601532, + "learning_rate": 9.648689935144175e-06, + "loss": 0.7324, + "step": 726 + }, + { + "epoch": 0.14666174936248413, + "grad_norm": 0.6590328812599182, + "learning_rate": 9.647485694248579e-06, + "loss": 0.7218, + "step": 727 + }, + { + "epoch": 0.1468634849186911, + "grad_norm": 0.5773506164550781, + "learning_rate": 9.646279468311684e-06, + "loss": 0.7326, + "step": 728 + }, + { + "epoch": 0.14706522047489812, + "grad_norm": 0.5804173350334167, + "learning_rate": 9.645071257848692e-06, + "loss": 0.7098, + "step": 729 + }, + { + "epoch": 0.1472669560311051, + "grad_norm": 1.1419330835342407, + "learning_rate": 9.643861063375657e-06, + "loss": 0.8657, + "step": 730 + }, + { + "epoch": 0.1474686915873121, + "grad_norm": 2.3604445457458496, + "learning_rate": 9.642648885409475e-06, + "loss": 0.6524, + "step": 731 + }, + { + "epoch": 0.14767042714351908, + "grad_norm": 1.2862043380737305, + "learning_rate": 9.6414347244679e-06, + "loss": 0.9618, + "step": 732 + }, + { + "epoch": 0.14787216269972608, + "grad_norm": 0.5891053080558777, + "learning_rate": 9.640218581069522e-06, + "loss": 0.6627, + "step": 733 + }, + { + "epoch": 0.14807389825593306, + "grad_norm": 0.5970614552497864, + "learning_rate": 9.639000455733784e-06, + "loss": 0.6595, + "step": 734 + }, + { + "epoch": 0.14827563381214007, + "grad_norm": 1.3071032762527466, + "learning_rate": 9.637780348980972e-06, + "loss": 0.689, + "step": 735 + }, + { + "epoch": 0.14847736936834707, + "grad_norm": 0.3851945400238037, + "learning_rate": 9.636558261332221e-06, + "loss": 0.8366, + "step": 736 + }, + { + "epoch": 0.14867910492455405, + "grad_norm": 0.7860169410705566, + "learning_rate": 9.63533419330951e-06, + "loss": 0.7495, + "step": 737 + }, + { + "epoch": 0.14888084048076106, + "grad_norm": 0.6074318885803223, + "learning_rate": 9.634108145435665e-06, + "loss": 0.8666, + "step": 738 + }, + { + "epoch": 0.14908257603696803, + "grad_norm": 0.9100054502487183, + "learning_rate": 9.63288011823436e-06, + "loss": 0.7006, + "step": 739 + }, + { + "epoch": 0.14928431159317504, + "grad_norm": 0.504601776599884, + "learning_rate": 9.631650112230108e-06, + "loss": 0.6408, + "step": 740 + }, + { + "epoch": 0.14948604714938202, + "grad_norm": 0.7778279185295105, + "learning_rate": 9.630418127948273e-06, + "loss": 0.7248, + "step": 741 + }, + { + "epoch": 0.14968778270558902, + "grad_norm": 0.5700371265411377, + "learning_rate": 9.629184165915063e-06, + "loss": 0.6877, + "step": 742 + }, + { + "epoch": 0.149889518261796, + "grad_norm": 0.4914577305316925, + "learning_rate": 9.627948226657527e-06, + "loss": 0.8579, + "step": 743 + }, + { + "epoch": 0.150091253818003, + "grad_norm": 0.7705051302909851, + "learning_rate": 9.626710310703565e-06, + "loss": 0.6808, + "step": 744 + }, + { + "epoch": 0.15029298937421, + "grad_norm": 0.5976186394691467, + "learning_rate": 9.625470418581913e-06, + "loss": 0.7094, + "step": 745 + }, + { + "epoch": 0.150494724930417, + "grad_norm": 0.5062031745910645, + "learning_rate": 9.62422855082216e-06, + "loss": 0.8747, + "step": 746 + }, + { + "epoch": 0.150696460486624, + "grad_norm": 1.6156225204467773, + "learning_rate": 9.622984707954732e-06, + "loss": 0.7054, + "step": 747 + }, + { + "epoch": 0.15089819604283097, + "grad_norm": 0.4361424446105957, + "learning_rate": 9.621738890510901e-06, + "loss": 0.7104, + "step": 748 + }, + { + "epoch": 0.15109993159903798, + "grad_norm": 0.5863596796989441, + "learning_rate": 9.620491099022786e-06, + "loss": 0.6611, + "step": 749 + }, + { + "epoch": 0.15130166715524496, + "grad_norm": 0.8612884879112244, + "learning_rate": 9.61924133402334e-06, + "loss": 0.7151, + "step": 750 + }, + { + "epoch": 0.15150340271145196, + "grad_norm": 1.4638561010360718, + "learning_rate": 9.617989596046368e-06, + "loss": 0.7716, + "step": 751 + }, + { + "epoch": 0.15170513826765894, + "grad_norm": 0.8269960284233093, + "learning_rate": 9.616735885626516e-06, + "loss": 0.7143, + "step": 752 + }, + { + "epoch": 0.15190687382386595, + "grad_norm": 0.9362808465957642, + "learning_rate": 9.615480203299266e-06, + "loss": 0.7181, + "step": 753 + }, + { + "epoch": 0.15210860938007295, + "grad_norm": 0.7409095168113708, + "learning_rate": 9.61422254960095e-06, + "loss": 0.6982, + "step": 754 + }, + { + "epoch": 0.15231034493627993, + "grad_norm": 0.42784583568573, + "learning_rate": 9.612962925068738e-06, + "loss": 0.8009, + "step": 755 + }, + { + "epoch": 0.15251208049248693, + "grad_norm": 0.9697193503379822, + "learning_rate": 9.611701330240644e-06, + "loss": 0.803, + "step": 756 + }, + { + "epoch": 0.1527138160486939, + "grad_norm": 0.6597254276275635, + "learning_rate": 9.610437765655522e-06, + "loss": 0.7358, + "step": 757 + }, + { + "epoch": 0.15291555160490092, + "grad_norm": 5.7137675285339355, + "learning_rate": 9.609172231853066e-06, + "loss": 0.7157, + "step": 758 + }, + { + "epoch": 0.1531172871611079, + "grad_norm": 1.3956544399261475, + "learning_rate": 9.607904729373816e-06, + "loss": 0.7055, + "step": 759 + }, + { + "epoch": 0.1533190227173149, + "grad_norm": 0.8854261636734009, + "learning_rate": 9.606635258759146e-06, + "loss": 0.6771, + "step": 760 + }, + { + "epoch": 0.1535207582735219, + "grad_norm": 0.5108949542045593, + "learning_rate": 9.605363820551277e-06, + "loss": 0.6921, + "step": 761 + }, + { + "epoch": 0.15372249382972888, + "grad_norm": 0.5805178284645081, + "learning_rate": 9.604090415293265e-06, + "loss": 0.6944, + "step": 762 + }, + { + "epoch": 0.1539242293859359, + "grad_norm": 5.874364852905273, + "learning_rate": 9.60281504352901e-06, + "loss": 0.7408, + "step": 763 + }, + { + "epoch": 0.15412596494214287, + "grad_norm": 0.6608618497848511, + "learning_rate": 9.601537705803253e-06, + "loss": 0.7101, + "step": 764 + }, + { + "epoch": 0.15432770049834987, + "grad_norm": 1.541603446006775, + "learning_rate": 9.60025840266157e-06, + "loss": 0.7053, + "step": 765 + }, + { + "epoch": 0.15452943605455685, + "grad_norm": 0.4954194724559784, + "learning_rate": 9.598977134650381e-06, + "loss": 0.6553, + "step": 766 + }, + { + "epoch": 0.15473117161076386, + "grad_norm": 1.2848337888717651, + "learning_rate": 9.597693902316938e-06, + "loss": 0.6651, + "step": 767 + }, + { + "epoch": 0.15493290716697083, + "grad_norm": 0.6397077441215515, + "learning_rate": 9.596408706209344e-06, + "loss": 0.7129, + "step": 768 + }, + { + "epoch": 0.15513464272317784, + "grad_norm": 0.5690326690673828, + "learning_rate": 9.595121546876529e-06, + "loss": 0.8142, + "step": 769 + }, + { + "epoch": 0.15533637827938485, + "grad_norm": 0.42553994059562683, + "learning_rate": 9.593832424868271e-06, + "loss": 0.6814, + "step": 770 + }, + { + "epoch": 0.15553811383559182, + "grad_norm": 0.9635345339775085, + "learning_rate": 9.592541340735177e-06, + "loss": 0.7081, + "step": 771 + }, + { + "epoch": 0.15573984939179883, + "grad_norm": 0.41006505489349365, + "learning_rate": 9.5912482950287e-06, + "loss": 0.7294, + "step": 772 + }, + { + "epoch": 0.1559415849480058, + "grad_norm": 0.5492969155311584, + "learning_rate": 9.589953288301126e-06, + "loss": 0.6791, + "step": 773 + }, + { + "epoch": 0.1561433205042128, + "grad_norm": 0.5428392887115479, + "learning_rate": 9.58865632110558e-06, + "loss": 0.7024, + "step": 774 + }, + { + "epoch": 0.1563450560604198, + "grad_norm": 1.0308176279067993, + "learning_rate": 9.587357393996027e-06, + "loss": 0.6896, + "step": 775 + }, + { + "epoch": 0.1565467916166268, + "grad_norm": 0.4440838694572449, + "learning_rate": 9.586056507527266e-06, + "loss": 0.6864, + "step": 776 + }, + { + "epoch": 0.15674852717283377, + "grad_norm": 0.4978935122489929, + "learning_rate": 9.584753662254932e-06, + "loss": 0.719, + "step": 777 + }, + { + "epoch": 0.15695026272904078, + "grad_norm": 0.8401952385902405, + "learning_rate": 9.5834488587355e-06, + "loss": 0.7286, + "step": 778 + }, + { + "epoch": 0.15715199828524778, + "grad_norm": 0.5550390481948853, + "learning_rate": 9.582142097526278e-06, + "loss": 0.9584, + "step": 779 + }, + { + "epoch": 0.15735373384145476, + "grad_norm": 3.024136543273926, + "learning_rate": 9.580833379185415e-06, + "loss": 0.7976, + "step": 780 + }, + { + "epoch": 0.15755546939766177, + "grad_norm": 0.5451089143753052, + "learning_rate": 9.579522704271889e-06, + "loss": 0.6971, + "step": 781 + }, + { + "epoch": 0.15775720495386875, + "grad_norm": 0.6897755861282349, + "learning_rate": 9.57821007334552e-06, + "loss": 0.8471, + "step": 782 + }, + { + "epoch": 0.15795894051007575, + "grad_norm": 0.5179625153541565, + "learning_rate": 9.576895486966959e-06, + "loss": 0.6998, + "step": 783 + }, + { + "epoch": 0.15816067606628273, + "grad_norm": 0.465382844209671, + "learning_rate": 9.575578945697696e-06, + "loss": 0.843, + "step": 784 + }, + { + "epoch": 0.15836241162248973, + "grad_norm": 6.361021518707275, + "learning_rate": 9.574260450100054e-06, + "loss": 0.7791, + "step": 785 + }, + { + "epoch": 0.1585641471786967, + "grad_norm": 0.4968310594558716, + "learning_rate": 9.57294000073719e-06, + "loss": 0.741, + "step": 786 + }, + { + "epoch": 0.15876588273490372, + "grad_norm": 0.4837093949317932, + "learning_rate": 9.571617598173097e-06, + "loss": 0.7069, + "step": 787 + }, + { + "epoch": 0.15896761829111072, + "grad_norm": 0.5080758333206177, + "learning_rate": 9.5702932429726e-06, + "loss": 0.7032, + "step": 788 + }, + { + "epoch": 0.1591693538473177, + "grad_norm": 0.3934210538864136, + "learning_rate": 9.568966935701362e-06, + "loss": 0.6943, + "step": 789 + }, + { + "epoch": 0.1593710894035247, + "grad_norm": 1.3253793716430664, + "learning_rate": 9.567638676925877e-06, + "loss": 0.6914, + "step": 790 + }, + { + "epoch": 0.15957282495973169, + "grad_norm": 0.5028374195098877, + "learning_rate": 9.566308467213472e-06, + "loss": 0.7356, + "step": 791 + }, + { + "epoch": 0.1597745605159387, + "grad_norm": 0.7472164630889893, + "learning_rate": 9.56497630713231e-06, + "loss": 0.6967, + "step": 792 + }, + { + "epoch": 0.15997629607214567, + "grad_norm": 0.5399154424667358, + "learning_rate": 9.563642197251382e-06, + "loss": 0.7105, + "step": 793 + }, + { + "epoch": 0.16017803162835267, + "grad_norm": 0.6258538365364075, + "learning_rate": 9.562306138140518e-06, + "loss": 0.7155, + "step": 794 + }, + { + "epoch": 0.16037976718455965, + "grad_norm": 0.44273310899734497, + "learning_rate": 9.560968130370376e-06, + "loss": 0.6848, + "step": 795 + }, + { + "epoch": 0.16058150274076666, + "grad_norm": 0.9403547644615173, + "learning_rate": 9.55962817451245e-06, + "loss": 0.7942, + "step": 796 + }, + { + "epoch": 0.16078323829697366, + "grad_norm": 0.5547764301300049, + "learning_rate": 9.558286271139061e-06, + "loss": 0.7035, + "step": 797 + }, + { + "epoch": 0.16098497385318064, + "grad_norm": 0.4399620592594147, + "learning_rate": 9.556942420823368e-06, + "loss": 0.6614, + "step": 798 + }, + { + "epoch": 0.16118670940938765, + "grad_norm": 0.5260964632034302, + "learning_rate": 9.555596624139356e-06, + "loss": 0.744, + "step": 799 + }, + { + "epoch": 0.16138844496559462, + "grad_norm": 0.691392183303833, + "learning_rate": 9.554248881661845e-06, + "loss": 0.7364, + "step": 800 + }, + { + "epoch": 0.16159018052180163, + "grad_norm": 0.7063742280006409, + "learning_rate": 9.552899193966484e-06, + "loss": 0.9067, + "step": 801 + }, + { + "epoch": 0.1617919160780086, + "grad_norm": 0.5435588955879211, + "learning_rate": 9.551547561629755e-06, + "loss": 0.7611, + "step": 802 + }, + { + "epoch": 0.1619936516342156, + "grad_norm": 0.5623730421066284, + "learning_rate": 9.550193985228968e-06, + "loss": 0.7608, + "step": 803 + }, + { + "epoch": 0.1621953871904226, + "grad_norm": 1.5376967191696167, + "learning_rate": 9.548838465342266e-06, + "loss": 0.7438, + "step": 804 + }, + { + "epoch": 0.1623971227466296, + "grad_norm": 0.5251892805099487, + "learning_rate": 9.54748100254862e-06, + "loss": 0.916, + "step": 805 + }, + { + "epoch": 0.1625988583028366, + "grad_norm": 0.7586867213249207, + "learning_rate": 9.54612159742783e-06, + "loss": 0.84, + "step": 806 + }, + { + "epoch": 0.16280059385904358, + "grad_norm": 0.3578733205795288, + "learning_rate": 9.544760250560531e-06, + "loss": 0.713, + "step": 807 + }, + { + "epoch": 0.16300232941525059, + "grad_norm": 1.0173057317733765, + "learning_rate": 9.54339696252818e-06, + "loss": 0.7829, + "step": 808 + }, + { + "epoch": 0.16320406497145756, + "grad_norm": 0.5583421587944031, + "learning_rate": 9.542031733913069e-06, + "loss": 0.7975, + "step": 809 + }, + { + "epoch": 0.16340580052766457, + "grad_norm": 0.409379780292511, + "learning_rate": 9.540664565298315e-06, + "loss": 0.6718, + "step": 810 + }, + { + "epoch": 0.16360753608387155, + "grad_norm": 0.44500353932380676, + "learning_rate": 9.539295457267865e-06, + "loss": 0.7731, + "step": 811 + }, + { + "epoch": 0.16380927164007855, + "grad_norm": 0.49359723925590515, + "learning_rate": 9.537924410406495e-06, + "loss": 0.6798, + "step": 812 + }, + { + "epoch": 0.16401100719628556, + "grad_norm": 0.5281481742858887, + "learning_rate": 9.536551425299812e-06, + "loss": 0.71, + "step": 813 + }, + { + "epoch": 0.16421274275249254, + "grad_norm": 0.4787105321884155, + "learning_rate": 9.535176502534242e-06, + "loss": 0.7343, + "step": 814 + }, + { + "epoch": 0.16441447830869954, + "grad_norm": 0.5769368410110474, + "learning_rate": 9.533799642697047e-06, + "loss": 0.6915, + "step": 815 + }, + { + "epoch": 0.16461621386490652, + "grad_norm": 0.44727253913879395, + "learning_rate": 9.532420846376316e-06, + "loss": 0.86, + "step": 816 + }, + { + "epoch": 0.16481794942111352, + "grad_norm": 0.5646665692329407, + "learning_rate": 9.531040114160958e-06, + "loss": 0.8393, + "step": 817 + }, + { + "epoch": 0.1650196849773205, + "grad_norm": 1.3017197847366333, + "learning_rate": 9.529657446640714e-06, + "loss": 0.7177, + "step": 818 + }, + { + "epoch": 0.1652214205335275, + "grad_norm": 1.5386371612548828, + "learning_rate": 9.528272844406154e-06, + "loss": 0.8101, + "step": 819 + }, + { + "epoch": 0.16542315608973449, + "grad_norm": 0.4335991144180298, + "learning_rate": 9.52688630804867e-06, + "loss": 0.6833, + "step": 820 + }, + { + "epoch": 0.1656248916459415, + "grad_norm": 0.8145807981491089, + "learning_rate": 9.52549783816048e-06, + "loss": 0.8993, + "step": 821 + }, + { + "epoch": 0.1658266272021485, + "grad_norm": 0.40622222423553467, + "learning_rate": 9.524107435334633e-06, + "loss": 0.8246, + "step": 822 + }, + { + "epoch": 0.16602836275835547, + "grad_norm": 2.768005609512329, + "learning_rate": 9.522715100164996e-06, + "loss": 0.9468, + "step": 823 + }, + { + "epoch": 0.16623009831456248, + "grad_norm": 0.8487063050270081, + "learning_rate": 9.521320833246268e-06, + "loss": 1.1553, + "step": 824 + }, + { + "epoch": 0.16643183387076946, + "grad_norm": 0.8210890889167786, + "learning_rate": 9.51992463517397e-06, + "loss": 0.7387, + "step": 825 + }, + { + "epoch": 0.16663356942697646, + "grad_norm": 0.40662023425102234, + "learning_rate": 9.518526506544447e-06, + "loss": 1.0524, + "step": 826 + }, + { + "epoch": 0.16683530498318344, + "grad_norm": 0.932871401309967, + "learning_rate": 9.517126447954872e-06, + "loss": 0.7819, + "step": 827 + }, + { + "epoch": 0.16703704053939045, + "grad_norm": 0.5390327572822571, + "learning_rate": 9.515724460003238e-06, + "loss": 0.7195, + "step": 828 + }, + { + "epoch": 0.16723877609559742, + "grad_norm": 0.38334664702415466, + "learning_rate": 9.514320543288367e-06, + "loss": 0.6943, + "step": 829 + }, + { + "epoch": 0.16744051165180443, + "grad_norm": 1.7166111469268799, + "learning_rate": 9.512914698409898e-06, + "loss": 0.9641, + "step": 830 + }, + { + "epoch": 0.16764224720801144, + "grad_norm": 0.5833882093429565, + "learning_rate": 9.511506925968302e-06, + "loss": 0.6696, + "step": 831 + }, + { + "epoch": 0.1678439827642184, + "grad_norm": 0.7301717400550842, + "learning_rate": 9.510097226564866e-06, + "loss": 0.8498, + "step": 832 + }, + { + "epoch": 0.16804571832042542, + "grad_norm": 0.4588780701160431, + "learning_rate": 9.508685600801704e-06, + "loss": 0.6698, + "step": 833 + }, + { + "epoch": 0.1682474538766324, + "grad_norm": 0.4560970366001129, + "learning_rate": 9.507272049281752e-06, + "loss": 0.7969, + "step": 834 + }, + { + "epoch": 0.1684491894328394, + "grad_norm": 0.5290451645851135, + "learning_rate": 9.50585657260877e-06, + "loss": 0.7079, + "step": 835 + }, + { + "epoch": 0.16865092498904638, + "grad_norm": 0.49037396907806396, + "learning_rate": 9.504439171387334e-06, + "loss": 1.0519, + "step": 836 + }, + { + "epoch": 0.16885266054525339, + "grad_norm": 0.5754956603050232, + "learning_rate": 9.503019846222849e-06, + "loss": 0.745, + "step": 837 + }, + { + "epoch": 0.16905439610146036, + "grad_norm": 0.7032858729362488, + "learning_rate": 9.501598597721542e-06, + "loss": 0.7016, + "step": 838 + }, + { + "epoch": 0.16925613165766737, + "grad_norm": 0.5736189484596252, + "learning_rate": 9.500175426490455e-06, + "loss": 1.0468, + "step": 839 + }, + { + "epoch": 0.16945786721387437, + "grad_norm": 0.48139849305152893, + "learning_rate": 9.498750333137456e-06, + "loss": 0.8038, + "step": 840 + }, + { + "epoch": 0.16965960277008135, + "grad_norm": 0.5781999230384827, + "learning_rate": 9.497323318271237e-06, + "loss": 0.6912, + "step": 841 + }, + { + "epoch": 0.16986133832628836, + "grad_norm": 0.41525569558143616, + "learning_rate": 9.4958943825013e-06, + "loss": 0.8743, + "step": 842 + }, + { + "epoch": 0.17006307388249534, + "grad_norm": 0.4945438504219055, + "learning_rate": 9.494463526437979e-06, + "loss": 0.7092, + "step": 843 + }, + { + "epoch": 0.17026480943870234, + "grad_norm": 0.7058941125869751, + "learning_rate": 9.493030750692422e-06, + "loss": 0.7403, + "step": 844 + }, + { + "epoch": 0.17046654499490932, + "grad_norm": 2.3633134365081787, + "learning_rate": 9.4915960558766e-06, + "loss": 0.6802, + "step": 845 + }, + { + "epoch": 0.17066828055111632, + "grad_norm": 0.5798237323760986, + "learning_rate": 9.4901594426033e-06, + "loss": 0.7799, + "step": 846 + }, + { + "epoch": 0.1708700161073233, + "grad_norm": 1.2589781284332275, + "learning_rate": 9.488720911486131e-06, + "loss": 0.6914, + "step": 847 + }, + { + "epoch": 0.1710717516635303, + "grad_norm": 0.611457109451294, + "learning_rate": 9.487280463139521e-06, + "loss": 0.6979, + "step": 848 + }, + { + "epoch": 0.1712734872197373, + "grad_norm": 0.846007227897644, + "learning_rate": 9.485838098178715e-06, + "loss": 0.9793, + "step": 849 + }, + { + "epoch": 0.1714752227759443, + "grad_norm": 0.4944300949573517, + "learning_rate": 9.48439381721978e-06, + "loss": 0.6902, + "step": 850 + }, + { + "epoch": 0.1716769583321513, + "grad_norm": 0.3780556321144104, + "learning_rate": 9.482947620879601e-06, + "loss": 0.8286, + "step": 851 + }, + { + "epoch": 0.17187869388835827, + "grad_norm": 0.8853325247764587, + "learning_rate": 9.481499509775878e-06, + "loss": 0.7205, + "step": 852 + }, + { + "epoch": 0.17208042944456528, + "grad_norm": 0.7118326425552368, + "learning_rate": 9.480049484527127e-06, + "loss": 0.7851, + "step": 853 + }, + { + "epoch": 0.17228216500077226, + "grad_norm": 0.4678933620452881, + "learning_rate": 9.47859754575269e-06, + "loss": 0.6764, + "step": 854 + }, + { + "epoch": 0.17248390055697926, + "grad_norm": 0.9232149124145508, + "learning_rate": 9.477143694072721e-06, + "loss": 0.7186, + "step": 855 + }, + { + "epoch": 0.17268563611318627, + "grad_norm": 0.6610773801803589, + "learning_rate": 9.475687930108188e-06, + "loss": 0.6943, + "step": 856 + }, + { + "epoch": 0.17288737166939325, + "grad_norm": 0.5761659741401672, + "learning_rate": 9.47423025448088e-06, + "loss": 0.6806, + "step": 857 + }, + { + "epoch": 0.17308910722560025, + "grad_norm": 0.448404461145401, + "learning_rate": 9.472770667813406e-06, + "loss": 0.7138, + "step": 858 + }, + { + "epoch": 0.17329084278180723, + "grad_norm": 0.4232058823108673, + "learning_rate": 9.471309170729182e-06, + "loss": 0.6921, + "step": 859 + }, + { + "epoch": 0.17349257833801424, + "grad_norm": 0.4294980466365814, + "learning_rate": 9.469845763852447e-06, + "loss": 0.8345, + "step": 860 + }, + { + "epoch": 0.17369431389422121, + "grad_norm": 0.40628236532211304, + "learning_rate": 9.468380447808251e-06, + "loss": 0.7114, + "step": 861 + }, + { + "epoch": 0.17389604945042822, + "grad_norm": 0.42344218492507935, + "learning_rate": 9.466913223222467e-06, + "loss": 0.851, + "step": 862 + }, + { + "epoch": 0.1740977850066352, + "grad_norm": 1.0432575941085815, + "learning_rate": 9.465444090721775e-06, + "loss": 0.8181, + "step": 863 + }, + { + "epoch": 0.1742995205628422, + "grad_norm": 0.848474383354187, + "learning_rate": 9.463973050933674e-06, + "loss": 0.7649, + "step": 864 + }, + { + "epoch": 0.1745012561190492, + "grad_norm": 0.513373851776123, + "learning_rate": 9.462500104486476e-06, + "loss": 0.6926, + "step": 865 + }, + { + "epoch": 0.1747029916752562, + "grad_norm": 1.0103217363357544, + "learning_rate": 9.461025252009308e-06, + "loss": 0.7174, + "step": 866 + }, + { + "epoch": 0.1749047272314632, + "grad_norm": 0.5421655178070068, + "learning_rate": 9.45954849413211e-06, + "loss": 0.7147, + "step": 867 + }, + { + "epoch": 0.17510646278767017, + "grad_norm": 1.8900377750396729, + "learning_rate": 9.458069831485643e-06, + "loss": 0.6938, + "step": 868 + }, + { + "epoch": 0.17530819834387718, + "grad_norm": 0.7132319808006287, + "learning_rate": 9.45658926470147e-06, + "loss": 0.6805, + "step": 869 + }, + { + "epoch": 0.17550993390008415, + "grad_norm": 1.216712474822998, + "learning_rate": 9.455106794411974e-06, + "loss": 0.7156, + "step": 870 + }, + { + "epoch": 0.17571166945629116, + "grad_norm": 1.3393114805221558, + "learning_rate": 9.453622421250353e-06, + "loss": 0.6761, + "step": 871 + }, + { + "epoch": 0.17591340501249814, + "grad_norm": 0.6455486416816711, + "learning_rate": 9.45213614585061e-06, + "loss": 0.6961, + "step": 872 + }, + { + "epoch": 0.17611514056870514, + "grad_norm": 0.6910165548324585, + "learning_rate": 9.45064796884757e-06, + "loss": 0.8834, + "step": 873 + }, + { + "epoch": 0.17631687612491215, + "grad_norm": 1.8813116550445557, + "learning_rate": 9.449157890876862e-06, + "loss": 0.6928, + "step": 874 + }, + { + "epoch": 0.17651861168111913, + "grad_norm": 2.003828763961792, + "learning_rate": 9.44766591257493e-06, + "loss": 0.6792, + "step": 875 + }, + { + "epoch": 0.17672034723732613, + "grad_norm": 2.5982606410980225, + "learning_rate": 9.446172034579034e-06, + "loss": 0.9014, + "step": 876 + }, + { + "epoch": 0.1769220827935331, + "grad_norm": 0.8602038621902466, + "learning_rate": 9.44467625752724e-06, + "loss": 0.8226, + "step": 877 + }, + { + "epoch": 0.17712381834974011, + "grad_norm": 0.7308774590492249, + "learning_rate": 9.443178582058423e-06, + "loss": 0.8395, + "step": 878 + }, + { + "epoch": 0.1773255539059471, + "grad_norm": 0.41064178943634033, + "learning_rate": 9.441679008812277e-06, + "loss": 0.9711, + "step": 879 + }, + { + "epoch": 0.1775272894621541, + "grad_norm": 0.43151190876960754, + "learning_rate": 9.440177538429299e-06, + "loss": 0.7408, + "step": 880 + }, + { + "epoch": 0.17772902501836108, + "grad_norm": 0.5189063549041748, + "learning_rate": 9.438674171550801e-06, + "loss": 0.7126, + "step": 881 + }, + { + "epoch": 0.17793076057456808, + "grad_norm": 0.5402753949165344, + "learning_rate": 9.437168908818904e-06, + "loss": 0.6815, + "step": 882 + }, + { + "epoch": 0.1781324961307751, + "grad_norm": 0.7131788730621338, + "learning_rate": 9.435661750876537e-06, + "loss": 0.7269, + "step": 883 + }, + { + "epoch": 0.17833423168698206, + "grad_norm": 0.8747779726982117, + "learning_rate": 9.43415269836744e-06, + "loss": 0.7123, + "step": 884 + }, + { + "epoch": 0.17853596724318907, + "grad_norm": 0.8151087760925293, + "learning_rate": 9.432641751936162e-06, + "loss": 0.7792, + "step": 885 + }, + { + "epoch": 0.17873770279939605, + "grad_norm": 0.45538297295570374, + "learning_rate": 9.43112891222806e-06, + "loss": 0.6891, + "step": 886 + }, + { + "epoch": 0.17893943835560305, + "grad_norm": 0.8422892689704895, + "learning_rate": 9.429614179889302e-06, + "loss": 0.7761, + "step": 887 + }, + { + "epoch": 0.17914117391181003, + "grad_norm": 0.9492774605751038, + "learning_rate": 9.428097555566859e-06, + "loss": 0.6858, + "step": 888 + }, + { + "epoch": 0.17934290946801704, + "grad_norm": 0.5029323697090149, + "learning_rate": 9.42657903990852e-06, + "loss": 0.7098, + "step": 889 + }, + { + "epoch": 0.17954464502422401, + "grad_norm": 0.755977213382721, + "learning_rate": 9.42505863356287e-06, + "loss": 0.83, + "step": 890 + }, + { + "epoch": 0.17974638058043102, + "grad_norm": 0.5003045201301575, + "learning_rate": 9.42353633717931e-06, + "loss": 0.8131, + "step": 891 + }, + { + "epoch": 0.17994811613663803, + "grad_norm": 0.6524782180786133, + "learning_rate": 9.422012151408046e-06, + "loss": 0.7094, + "step": 892 + }, + { + "epoch": 0.180149851692845, + "grad_norm": 0.8290368318557739, + "learning_rate": 9.42048607690009e-06, + "loss": 0.6777, + "step": 893 + }, + { + "epoch": 0.180351587249052, + "grad_norm": 0.718972384929657, + "learning_rate": 9.418958114307263e-06, + "loss": 0.7676, + "step": 894 + }, + { + "epoch": 0.180553322805259, + "grad_norm": 0.4021669030189514, + "learning_rate": 9.417428264282186e-06, + "loss": 0.6786, + "step": 895 + }, + { + "epoch": 0.180755058361466, + "grad_norm": 0.5108888149261475, + "learning_rate": 9.415896527478297e-06, + "loss": 0.8632, + "step": 896 + }, + { + "epoch": 0.18095679391767297, + "grad_norm": 0.4423260986804962, + "learning_rate": 9.414362904549829e-06, + "loss": 0.7235, + "step": 897 + }, + { + "epoch": 0.18115852947387998, + "grad_norm": 1.0673054456710815, + "learning_rate": 9.412827396151827e-06, + "loss": 0.7061, + "step": 898 + }, + { + "epoch": 0.18136026503008695, + "grad_norm": 0.6691890954971313, + "learning_rate": 9.411290002940141e-06, + "loss": 0.7062, + "step": 899 + }, + { + "epoch": 0.18156200058629396, + "grad_norm": 0.49868547916412354, + "learning_rate": 9.409750725571422e-06, + "loss": 0.76, + "step": 900 + }, + { + "epoch": 0.18176373614250096, + "grad_norm": 1.40470552444458, + "learning_rate": 9.408209564703133e-06, + "loss": 0.6882, + "step": 901 + }, + { + "epoch": 0.18196547169870794, + "grad_norm": 1.4107697010040283, + "learning_rate": 9.40666652099353e-06, + "loss": 0.8416, + "step": 902 + }, + { + "epoch": 0.18216720725491495, + "grad_norm": 0.4921844005584717, + "learning_rate": 9.405121595101688e-06, + "loss": 0.8499, + "step": 903 + }, + { + "epoch": 0.18236894281112193, + "grad_norm": 1.1987202167510986, + "learning_rate": 9.403574787687474e-06, + "loss": 0.6613, + "step": 904 + }, + { + "epoch": 0.18257067836732893, + "grad_norm": 2.4473557472229004, + "learning_rate": 9.402026099411563e-06, + "loss": 0.9773, + "step": 905 + }, + { + "epoch": 0.1827724139235359, + "grad_norm": 0.5587297677993774, + "learning_rate": 9.400475530935433e-06, + "loss": 0.7263, + "step": 906 + }, + { + "epoch": 0.18297414947974291, + "grad_norm": 1.1013250350952148, + "learning_rate": 9.398923082921366e-06, + "loss": 0.8863, + "step": 907 + }, + { + "epoch": 0.18317588503594992, + "grad_norm": 1.5462638139724731, + "learning_rate": 9.397368756032445e-06, + "loss": 0.6677, + "step": 908 + }, + { + "epoch": 0.1833776205921569, + "grad_norm": 0.9664633870124817, + "learning_rate": 9.395812550932559e-06, + "loss": 0.6855, + "step": 909 + }, + { + "epoch": 0.1835793561483639, + "grad_norm": 0.5382829904556274, + "learning_rate": 9.394254468286395e-06, + "loss": 0.7645, + "step": 910 + }, + { + "epoch": 0.18378109170457088, + "grad_norm": 1.6417592763900757, + "learning_rate": 9.392694508759443e-06, + "loss": 0.7324, + "step": 911 + }, + { + "epoch": 0.1839828272607779, + "grad_norm": 1.6140865087509155, + "learning_rate": 9.391132673017995e-06, + "loss": 0.6887, + "step": 912 + }, + { + "epoch": 0.18418456281698486, + "grad_norm": 1031.276123046875, + "learning_rate": 9.389568961729148e-06, + "loss": 0.8069, + "step": 913 + }, + { + "epoch": 0.18438629837319187, + "grad_norm": 16818.4453125, + "learning_rate": 9.388003375560792e-06, + "loss": 0.6865, + "step": 914 + }, + { + "epoch": 0.18458803392939885, + "grad_norm": 1720.62548828125, + "learning_rate": 9.386435915181626e-06, + "loss": 0.7457, + "step": 915 + }, + { + "epoch": 0.18478976948560585, + "grad_norm": 680.1848754882812, + "learning_rate": 9.384866581261145e-06, + "loss": 0.7248, + "step": 916 + }, + { + "epoch": 0.18499150504181286, + "grad_norm": 2.356326103210449, + "learning_rate": 9.383295374469646e-06, + "loss": 0.759, + "step": 917 + }, + { + "epoch": 0.18519324059801984, + "grad_norm": 1.6549099683761597, + "learning_rate": 9.381722295478227e-06, + "loss": 0.7201, + "step": 918 + }, + { + "epoch": 0.18539497615422684, + "grad_norm": 0.6107184886932373, + "learning_rate": 9.380147344958778e-06, + "loss": 0.7285, + "step": 919 + }, + { + "epoch": 0.18559671171043382, + "grad_norm": 0.675512433052063, + "learning_rate": 9.378570523583999e-06, + "loss": 0.7225, + "step": 920 + }, + { + "epoch": 0.18579844726664083, + "grad_norm": 0.6857936382293701, + "learning_rate": 9.376991832027385e-06, + "loss": 0.8173, + "step": 921 + }, + { + "epoch": 0.1860001828228478, + "grad_norm": 0.451214998960495, + "learning_rate": 9.375411270963226e-06, + "loss": 0.6989, + "step": 922 + }, + { + "epoch": 0.1862019183790548, + "grad_norm": 1.1534514427185059, + "learning_rate": 9.373828841066616e-06, + "loss": 1.0878, + "step": 923 + }, + { + "epoch": 0.1864036539352618, + "grad_norm": 0.38888052105903625, + "learning_rate": 9.372244543013444e-06, + "loss": 0.6825, + "step": 924 + }, + { + "epoch": 0.1866053894914688, + "grad_norm": 0.766791820526123, + "learning_rate": 9.370658377480399e-06, + "loss": 0.6974, + "step": 925 + }, + { + "epoch": 0.1868071250476758, + "grad_norm": 0.7451943159103394, + "learning_rate": 9.369070345144966e-06, + "loss": 0.6389, + "step": 926 + }, + { + "epoch": 0.18700886060388278, + "grad_norm": 0.5548701286315918, + "learning_rate": 9.367480446685427e-06, + "loss": 0.8277, + "step": 927 + }, + { + "epoch": 0.18721059616008978, + "grad_norm": 0.42171385884284973, + "learning_rate": 9.365888682780862e-06, + "loss": 0.8162, + "step": 928 + }, + { + "epoch": 0.18741233171629676, + "grad_norm": 0.8420100212097168, + "learning_rate": 9.364295054111147e-06, + "loss": 0.7352, + "step": 929 + }, + { + "epoch": 0.18761406727250377, + "grad_norm": 0.5614784955978394, + "learning_rate": 9.362699561356957e-06, + "loss": 0.676, + "step": 930 + }, + { + "epoch": 0.18781580282871074, + "grad_norm": 1.4516627788543701, + "learning_rate": 9.361102205199762e-06, + "loss": 0.7014, + "step": 931 + }, + { + "epoch": 0.18801753838491775, + "grad_norm": 1.1581453084945679, + "learning_rate": 9.359502986321823e-06, + "loss": 0.6991, + "step": 932 + }, + { + "epoch": 0.18821927394112473, + "grad_norm": 0.8086721897125244, + "learning_rate": 9.357901905406204e-06, + "loss": 0.7994, + "step": 933 + }, + { + "epoch": 0.18842100949733173, + "grad_norm": 1.4955883026123047, + "learning_rate": 9.356298963136763e-06, + "loss": 0.7284, + "step": 934 + }, + { + "epoch": 0.18862274505353874, + "grad_norm": 0.9167599081993103, + "learning_rate": 9.354694160198146e-06, + "loss": 0.7033, + "step": 935 + }, + { + "epoch": 0.18882448060974572, + "grad_norm": 1.0095226764678955, + "learning_rate": 9.353087497275804e-06, + "loss": 0.6968, + "step": 936 + }, + { + "epoch": 0.18902621616595272, + "grad_norm": 0.897120475769043, + "learning_rate": 9.351478975055973e-06, + "loss": 0.7284, + "step": 937 + }, + { + "epoch": 0.1892279517221597, + "grad_norm": 1.160873293876648, + "learning_rate": 9.349868594225692e-06, + "loss": 1.2516, + "step": 938 + }, + { + "epoch": 0.1894296872783667, + "grad_norm": 0.9946362376213074, + "learning_rate": 9.348256355472787e-06, + "loss": 0.7964, + "step": 939 + }, + { + "epoch": 0.18963142283457368, + "grad_norm": 0.6938387155532837, + "learning_rate": 9.34664225948588e-06, + "loss": 0.6529, + "step": 940 + }, + { + "epoch": 0.1898331583907807, + "grad_norm": 0.7048420310020447, + "learning_rate": 9.345026306954385e-06, + "loss": 0.6904, + "step": 941 + }, + { + "epoch": 0.19003489394698767, + "grad_norm": 0.8328052163124084, + "learning_rate": 9.343408498568512e-06, + "loss": 0.7811, + "step": 942 + }, + { + "epoch": 0.19023662950319467, + "grad_norm": 0.4666134715080261, + "learning_rate": 9.34178883501926e-06, + "loss": 0.7828, + "step": 943 + }, + { + "epoch": 0.19043836505940168, + "grad_norm": 0.39586102962493896, + "learning_rate": 9.340167316998425e-06, + "loss": 0.7562, + "step": 944 + }, + { + "epoch": 0.19064010061560865, + "grad_norm": 0.4975466728210449, + "learning_rate": 9.33854394519859e-06, + "loss": 0.787, + "step": 945 + }, + { + "epoch": 0.19084183617181566, + "grad_norm": 0.43210187554359436, + "learning_rate": 9.336918720313133e-06, + "loss": 0.8803, + "step": 946 + }, + { + "epoch": 0.19104357172802264, + "grad_norm": 0.6917931437492371, + "learning_rate": 9.335291643036221e-06, + "loss": 0.6514, + "step": 947 + }, + { + "epoch": 0.19124530728422964, + "grad_norm": 0.5619732737541199, + "learning_rate": 9.333662714062818e-06, + "loss": 0.6951, + "step": 948 + }, + { + "epoch": 0.19144704284043662, + "grad_norm": 0.9777224063873291, + "learning_rate": 9.33203193408867e-06, + "loss": 0.6511, + "step": 949 + }, + { + "epoch": 0.19164877839664363, + "grad_norm": 0.5258626341819763, + "learning_rate": 9.33039930381032e-06, + "loss": 0.7898, + "step": 950 + }, + { + "epoch": 0.1918505139528506, + "grad_norm": 0.48357635736465454, + "learning_rate": 9.3287648239251e-06, + "loss": 0.7166, + "step": 951 + }, + { + "epoch": 0.1920522495090576, + "grad_norm": 0.48291975259780884, + "learning_rate": 9.32712849513113e-06, + "loss": 0.6984, + "step": 952 + }, + { + "epoch": 0.19225398506526462, + "grad_norm": 0.9965933561325073, + "learning_rate": 9.325490318127323e-06, + "loss": 0.7334, + "step": 953 + }, + { + "epoch": 0.1924557206214716, + "grad_norm": 0.562293291091919, + "learning_rate": 9.32385029361338e-06, + "loss": 0.6767, + "step": 954 + }, + { + "epoch": 0.1926574561776786, + "grad_norm": 1.084022879600525, + "learning_rate": 9.32220842228979e-06, + "loss": 0.6759, + "step": 955 + }, + { + "epoch": 0.19285919173388558, + "grad_norm": 0.804706335067749, + "learning_rate": 9.32056470485783e-06, + "loss": 0.7241, + "step": 956 + }, + { + "epoch": 0.19306092729009258, + "grad_norm": 0.3961019814014435, + "learning_rate": 9.318919142019572e-06, + "loss": 0.7219, + "step": 957 + }, + { + "epoch": 0.19326266284629956, + "grad_norm": 0.5734902620315552, + "learning_rate": 9.317271734477865e-06, + "loss": 0.6651, + "step": 958 + }, + { + "epoch": 0.19346439840250657, + "grad_norm": 0.4710521101951599, + "learning_rate": 9.315622482936356e-06, + "loss": 0.6818, + "step": 959 + }, + { + "epoch": 0.19366613395871357, + "grad_norm": 0.881702721118927, + "learning_rate": 9.313971388099476e-06, + "loss": 0.7027, + "step": 960 + }, + { + "epoch": 0.19386786951492055, + "grad_norm": 0.4663563668727875, + "learning_rate": 9.312318450672441e-06, + "loss": 0.6915, + "step": 961 + }, + { + "epoch": 0.19406960507112755, + "grad_norm": 0.8489170670509338, + "learning_rate": 9.31066367136126e-06, + "loss": 0.6745, + "step": 962 + }, + { + "epoch": 0.19427134062733453, + "grad_norm": 0.6792709827423096, + "learning_rate": 9.309007050872722e-06, + "loss": 0.8499, + "step": 963 + }, + { + "epoch": 0.19447307618354154, + "grad_norm": 0.7607674598693848, + "learning_rate": 9.307348589914405e-06, + "loss": 0.6581, + "step": 964 + }, + { + "epoch": 0.19467481173974852, + "grad_norm": 0.74134761095047, + "learning_rate": 9.305688289194673e-06, + "loss": 0.6741, + "step": 965 + }, + { + "epoch": 0.19487654729595552, + "grad_norm": 0.8747991323471069, + "learning_rate": 9.30402614942268e-06, + "loss": 0.6974, + "step": 966 + }, + { + "epoch": 0.1950782828521625, + "grad_norm": 0.7645976543426514, + "learning_rate": 9.302362171308358e-06, + "loss": 0.6842, + "step": 967 + }, + { + "epoch": 0.1952800184083695, + "grad_norm": 1.2800146341323853, + "learning_rate": 9.30069635556243e-06, + "loss": 0.6564, + "step": 968 + }, + { + "epoch": 0.1954817539645765, + "grad_norm": 0.6098384857177734, + "learning_rate": 9.299028702896402e-06, + "loss": 0.8256, + "step": 969 + }, + { + "epoch": 0.1956834895207835, + "grad_norm": 0.6422080993652344, + "learning_rate": 9.29735921402256e-06, + "loss": 0.6772, + "step": 970 + }, + { + "epoch": 0.1958852250769905, + "grad_norm": 0.42048829793930054, + "learning_rate": 9.295687889653986e-06, + "loss": 0.6898, + "step": 971 + }, + { + "epoch": 0.19608696063319747, + "grad_norm": 0.3918842673301697, + "learning_rate": 9.294014730504532e-06, + "loss": 0.7526, + "step": 972 + }, + { + "epoch": 0.19628869618940448, + "grad_norm": 0.3806288242340088, + "learning_rate": 9.292339737288844e-06, + "loss": 0.7355, + "step": 973 + }, + { + "epoch": 0.19649043174561145, + "grad_norm": 0.6645257472991943, + "learning_rate": 9.290662910722346e-06, + "loss": 0.6992, + "step": 974 + }, + { + "epoch": 0.19669216730181846, + "grad_norm": 1.5524142980575562, + "learning_rate": 9.288984251521246e-06, + "loss": 0.7014, + "step": 975 + }, + { + "epoch": 0.19689390285802544, + "grad_norm": 1.6902189254760742, + "learning_rate": 9.28730376040254e-06, + "loss": 0.6856, + "step": 976 + }, + { + "epoch": 0.19709563841423244, + "grad_norm": 0.6375247836112976, + "learning_rate": 9.285621438083997e-06, + "loss": 0.6822, + "step": 977 + }, + { + "epoch": 0.19729737397043945, + "grad_norm": 0.44163259863853455, + "learning_rate": 9.283937285284177e-06, + "loss": 0.7284, + "step": 978 + }, + { + "epoch": 0.19749910952664643, + "grad_norm": 0.6588442325592041, + "learning_rate": 9.282251302722416e-06, + "loss": 0.7268, + "step": 979 + }, + { + "epoch": 0.19770084508285343, + "grad_norm": 1.0238844156265259, + "learning_rate": 9.280563491118833e-06, + "loss": 0.6698, + "step": 980 + }, + { + "epoch": 0.1979025806390604, + "grad_norm": 0.7650852203369141, + "learning_rate": 9.278873851194328e-06, + "loss": 1.0946, + "step": 981 + }, + { + "epoch": 0.19810431619526742, + "grad_norm": 0.6054670214653015, + "learning_rate": 9.277182383670584e-06, + "loss": 0.6911, + "step": 982 + }, + { + "epoch": 0.1983060517514744, + "grad_norm": 0.33824777603149414, + "learning_rate": 9.275489089270064e-06, + "loss": 0.6725, + "step": 983 + }, + { + "epoch": 0.1985077873076814, + "grad_norm": 0.6010570526123047, + "learning_rate": 9.27379396871601e-06, + "loss": 0.6839, + "step": 984 + }, + { + "epoch": 0.19870952286388838, + "grad_norm": 0.6706582903862, + "learning_rate": 9.272097022732444e-06, + "loss": 0.6889, + "step": 985 + }, + { + "epoch": 0.19891125842009538, + "grad_norm": 0.4594559669494629, + "learning_rate": 9.270398252044169e-06, + "loss": 0.7069, + "step": 986 + }, + { + "epoch": 0.1991129939763024, + "grad_norm": 0.5065109133720398, + "learning_rate": 9.268697657376765e-06, + "loss": 0.7619, + "step": 987 + }, + { + "epoch": 0.19931472953250937, + "grad_norm": 0.5241512060165405, + "learning_rate": 9.266995239456593e-06, + "loss": 0.8801, + "step": 988 + }, + { + "epoch": 0.19951646508871637, + "grad_norm": 0.4009891152381897, + "learning_rate": 9.265290999010794e-06, + "loss": 0.9087, + "step": 989 + }, + { + "epoch": 0.19971820064492335, + "grad_norm": 0.8747932314872742, + "learning_rate": 9.263584936767282e-06, + "loss": 0.8327, + "step": 990 + }, + { + "epoch": 0.19991993620113035, + "grad_norm": 1.358296513557434, + "learning_rate": 9.26187705345476e-06, + "loss": 0.7112, + "step": 991 + }, + { + "epoch": 0.20012167175733733, + "grad_norm": 0.3803250789642334, + "learning_rate": 9.260167349802696e-06, + "loss": 0.7441, + "step": 992 + }, + { + "epoch": 0.20032340731354434, + "grad_norm": 0.6250929236412048, + "learning_rate": 9.258455826541341e-06, + "loss": 0.7045, + "step": 993 + }, + { + "epoch": 0.20052514286975132, + "grad_norm": 0.6147077679634094, + "learning_rate": 9.256742484401728e-06, + "loss": 0.7959, + "step": 994 + }, + { + "epoch": 0.20072687842595832, + "grad_norm": 0.6384791731834412, + "learning_rate": 9.255027324115657e-06, + "loss": 0.7937, + "step": 995 + }, + { + "epoch": 0.20092861398216533, + "grad_norm": 3.0452754497528076, + "learning_rate": 9.253310346415714e-06, + "loss": 0.6903, + "step": 996 + }, + { + "epoch": 0.2011303495383723, + "grad_norm": 3.768707036972046, + "learning_rate": 9.251591552035255e-06, + "loss": 0.7988, + "step": 997 + }, + { + "epoch": 0.2013320850945793, + "grad_norm": 0.8928516507148743, + "learning_rate": 9.249870941708416e-06, + "loss": 0.6586, + "step": 998 + }, + { + "epoch": 0.2015338206507863, + "grad_norm": 0.6847289204597473, + "learning_rate": 9.248148516170106e-06, + "loss": 0.6983, + "step": 999 + }, + { + "epoch": 0.2017355562069933, + "grad_norm": 1.5991216897964478, + "learning_rate": 9.246424276156008e-06, + "loss": 0.7384, + "step": 1000 + }, + { + "epoch": 0.20193729176320027, + "grad_norm": 1.0704641342163086, + "learning_rate": 9.244698222402584e-06, + "loss": 0.7639, + "step": 1001 + }, + { + "epoch": 0.20213902731940728, + "grad_norm": 0.6123541593551636, + "learning_rate": 9.24297035564707e-06, + "loss": 0.8269, + "step": 1002 + }, + { + "epoch": 0.20234076287561428, + "grad_norm": 0.8308938145637512, + "learning_rate": 9.241240676627472e-06, + "loss": 0.7285, + "step": 1003 + }, + { + "epoch": 0.20254249843182126, + "grad_norm": 1.161763072013855, + "learning_rate": 9.239509186082574e-06, + "loss": 0.6906, + "step": 1004 + }, + { + "epoch": 0.20274423398802827, + "grad_norm": 0.8944821357727051, + "learning_rate": 9.237775884751936e-06, + "loss": 0.83, + "step": 1005 + }, + { + "epoch": 0.20294596954423524, + "grad_norm": 0.6827241778373718, + "learning_rate": 9.236040773375884e-06, + "loss": 0.8459, + "step": 1006 + }, + { + "epoch": 0.20314770510044225, + "grad_norm": 0.60840904712677, + "learning_rate": 9.234303852695526e-06, + "loss": 0.8371, + "step": 1007 + }, + { + "epoch": 0.20334944065664923, + "grad_norm": 0.3988000452518463, + "learning_rate": 9.232565123452734e-06, + "loss": 0.6888, + "step": 1008 + }, + { + "epoch": 0.20355117621285623, + "grad_norm": 0.7547544836997986, + "learning_rate": 9.23082458639016e-06, + "loss": 0.6639, + "step": 1009 + }, + { + "epoch": 0.2037529117690632, + "grad_norm": 0.7112361192703247, + "learning_rate": 9.229082242251222e-06, + "loss": 0.7103, + "step": 1010 + }, + { + "epoch": 0.20395464732527022, + "grad_norm": 0.4319564700126648, + "learning_rate": 9.227338091780116e-06, + "loss": 0.6841, + "step": 1011 + }, + { + "epoch": 0.20415638288147722, + "grad_norm": 0.4466502070426941, + "learning_rate": 9.225592135721802e-06, + "loss": 0.8465, + "step": 1012 + }, + { + "epoch": 0.2043581184376842, + "grad_norm": 0.7582955956459045, + "learning_rate": 9.22384437482202e-06, + "loss": 0.8686, + "step": 1013 + }, + { + "epoch": 0.2045598539938912, + "grad_norm": 0.9325213432312012, + "learning_rate": 9.222094809827272e-06, + "loss": 0.7334, + "step": 1014 + }, + { + "epoch": 0.20476158955009818, + "grad_norm": 0.841929018497467, + "learning_rate": 9.220343441484837e-06, + "loss": 0.6743, + "step": 1015 + }, + { + "epoch": 0.2049633251063052, + "grad_norm": 0.3721573054790497, + "learning_rate": 9.218590270542765e-06, + "loss": 1.0676, + "step": 1016 + }, + { + "epoch": 0.20516506066251217, + "grad_norm": 0.509548008441925, + "learning_rate": 9.216835297749869e-06, + "loss": 0.706, + "step": 1017 + }, + { + "epoch": 0.20536679621871917, + "grad_norm": 0.5040112137794495, + "learning_rate": 9.215078523855736e-06, + "loss": 0.7995, + "step": 1018 + }, + { + "epoch": 0.20556853177492615, + "grad_norm": 0.570552408695221, + "learning_rate": 9.213319949610727e-06, + "loss": 0.6729, + "step": 1019 + }, + { + "epoch": 0.20577026733113316, + "grad_norm": 0.6146701574325562, + "learning_rate": 9.211559575765958e-06, + "loss": 0.6371, + "step": 1020 + }, + { + "epoch": 0.20597200288734016, + "grad_norm": 0.4954344630241394, + "learning_rate": 9.209797403073331e-06, + "loss": 0.7039, + "step": 1021 + }, + { + "epoch": 0.20617373844354714, + "grad_norm": 0.4436202049255371, + "learning_rate": 9.208033432285503e-06, + "loss": 0.7906, + "step": 1022 + }, + { + "epoch": 0.20637547399975414, + "grad_norm": 0.4960552453994751, + "learning_rate": 9.206267664155906e-06, + "loss": 0.6918, + "step": 1023 + }, + { + "epoch": 0.20657720955596112, + "grad_norm": 0.47839146852493286, + "learning_rate": 9.204500099438739e-06, + "loss": 0.819, + "step": 1024 + }, + { + "epoch": 0.20677894511216813, + "grad_norm": 0.5067861676216125, + "learning_rate": 9.202730738888962e-06, + "loss": 0.6683, + "step": 1025 + }, + { + "epoch": 0.2069806806683751, + "grad_norm": 0.4200279414653778, + "learning_rate": 9.200959583262312e-06, + "loss": 0.6968, + "step": 1026 + }, + { + "epoch": 0.2071824162245821, + "grad_norm": 0.49858543276786804, + "learning_rate": 9.199186633315286e-06, + "loss": 0.898, + "step": 1027 + }, + { + "epoch": 0.2073841517807891, + "grad_norm": 1.0465232133865356, + "learning_rate": 9.197411889805148e-06, + "loss": 0.7353, + "step": 1028 + }, + { + "epoch": 0.2075858873369961, + "grad_norm": 0.8756152391433716, + "learning_rate": 9.195635353489932e-06, + "loss": 0.698, + "step": 1029 + }, + { + "epoch": 0.2077876228932031, + "grad_norm": 1.9849929809570312, + "learning_rate": 9.193857025128431e-06, + "loss": 0.713, + "step": 1030 + }, + { + "epoch": 0.20798935844941008, + "grad_norm": 1.5864392518997192, + "learning_rate": 9.19207690548021e-06, + "loss": 0.7083, + "step": 1031 + }, + { + "epoch": 0.20819109400561708, + "grad_norm": 1.4301296472549438, + "learning_rate": 9.190294995305598e-06, + "loss": 0.708, + "step": 1032 + }, + { + "epoch": 0.20839282956182406, + "grad_norm": 0.43994635343551636, + "learning_rate": 9.188511295365683e-06, + "loss": 0.6809, + "step": 1033 + }, + { + "epoch": 0.20859456511803107, + "grad_norm": 0.9496873021125793, + "learning_rate": 9.186725806422325e-06, + "loss": 0.6739, + "step": 1034 + }, + { + "epoch": 0.20879630067423804, + "grad_norm": 0.5276889801025391, + "learning_rate": 9.184938529238144e-06, + "loss": 0.7, + "step": 1035 + }, + { + "epoch": 0.20899803623044505, + "grad_norm": 0.45405182242393494, + "learning_rate": 9.183149464576524e-06, + "loss": 0.687, + "step": 1036 + }, + { + "epoch": 0.20919977178665203, + "grad_norm": 0.5376430749893188, + "learning_rate": 9.181358613201613e-06, + "loss": 0.6767, + "step": 1037 + }, + { + "epoch": 0.20940150734285903, + "grad_norm": 0.6030016541481018, + "learning_rate": 9.179565975878324e-06, + "loss": 0.8204, + "step": 1038 + }, + { + "epoch": 0.20960324289906604, + "grad_norm": 0.5121897459030151, + "learning_rate": 9.177771553372328e-06, + "loss": 0.7511, + "step": 1039 + }, + { + "epoch": 0.20980497845527302, + "grad_norm": 0.5720887184143066, + "learning_rate": 9.175975346450063e-06, + "loss": 0.8343, + "step": 1040 + }, + { + "epoch": 0.21000671401148002, + "grad_norm": 0.7431820034980774, + "learning_rate": 9.174177355878731e-06, + "loss": 0.7304, + "step": 1041 + }, + { + "epoch": 0.210208449567687, + "grad_norm": 0.49612024426460266, + "learning_rate": 9.172377582426286e-06, + "loss": 0.6773, + "step": 1042 + }, + { + "epoch": 0.210410185123894, + "grad_norm": 0.45197567343711853, + "learning_rate": 9.170576026861455e-06, + "loss": 0.6998, + "step": 1043 + }, + { + "epoch": 0.21061192068010098, + "grad_norm": 0.8435342311859131, + "learning_rate": 9.16877268995372e-06, + "loss": 0.7114, + "step": 1044 + }, + { + "epoch": 0.210813656236308, + "grad_norm": 0.961377739906311, + "learning_rate": 9.166967572473325e-06, + "loss": 0.7085, + "step": 1045 + }, + { + "epoch": 0.21101539179251497, + "grad_norm": 0.6751661896705627, + "learning_rate": 9.165160675191272e-06, + "loss": 0.8048, + "step": 1046 + }, + { + "epoch": 0.21121712734872197, + "grad_norm": 0.9755600690841675, + "learning_rate": 9.163351998879331e-06, + "loss": 0.7424, + "step": 1047 + }, + { + "epoch": 0.21141886290492898, + "grad_norm": 0.9335680603981018, + "learning_rate": 9.161541544310022e-06, + "loss": 0.6775, + "step": 1048 + }, + { + "epoch": 0.21162059846113596, + "grad_norm": 0.6090373396873474, + "learning_rate": 9.159729312256632e-06, + "loss": 0.6716, + "step": 1049 + }, + { + "epoch": 0.21182233401734296, + "grad_norm": 0.7214455604553223, + "learning_rate": 9.157915303493201e-06, + "loss": 0.7575, + "step": 1050 + }, + { + "epoch": 0.21202406957354994, + "grad_norm": 0.7865515947341919, + "learning_rate": 9.156099518794535e-06, + "loss": 0.6964, + "step": 1051 + }, + { + "epoch": 0.21222580512975694, + "grad_norm": 1.348393440246582, + "learning_rate": 9.154281958936194e-06, + "loss": 0.7956, + "step": 1052 + }, + { + "epoch": 0.21242754068596392, + "grad_norm": 1.2331461906433105, + "learning_rate": 9.152462624694495e-06, + "loss": 0.8917, + "step": 1053 + }, + { + "epoch": 0.21262927624217093, + "grad_norm": 0.6964088678359985, + "learning_rate": 9.150641516846517e-06, + "loss": 0.7055, + "step": 1054 + }, + { + "epoch": 0.21283101179837793, + "grad_norm": 0.4619287848472595, + "learning_rate": 9.148818636170092e-06, + "loss": 0.699, + "step": 1055 + }, + { + "epoch": 0.2130327473545849, + "grad_norm": 1.3972951173782349, + "learning_rate": 9.146993983443815e-06, + "loss": 0.681, + "step": 1056 + }, + { + "epoch": 0.21323448291079192, + "grad_norm": 0.4713950455188751, + "learning_rate": 9.145167559447032e-06, + "loss": 0.6583, + "step": 1057 + }, + { + "epoch": 0.2134362184669989, + "grad_norm": 0.5854702591896057, + "learning_rate": 9.143339364959849e-06, + "loss": 0.8043, + "step": 1058 + }, + { + "epoch": 0.2136379540232059, + "grad_norm": 0.4331769347190857, + "learning_rate": 9.141509400763127e-06, + "loss": 0.7089, + "step": 1059 + }, + { + "epoch": 0.21383968957941288, + "grad_norm": 0.4095942974090576, + "learning_rate": 9.139677667638481e-06, + "loss": 0.6899, + "step": 1060 + }, + { + "epoch": 0.21404142513561988, + "grad_norm": 0.7834404706954956, + "learning_rate": 9.137844166368289e-06, + "loss": 0.6861, + "step": 1061 + }, + { + "epoch": 0.21424316069182686, + "grad_norm": 4.768156051635742, + "learning_rate": 9.136008897735673e-06, + "loss": 0.7185, + "step": 1062 + }, + { + "epoch": 0.21444489624803387, + "grad_norm": 3.4436988830566406, + "learning_rate": 9.13417186252452e-06, + "loss": 0.7006, + "step": 1063 + }, + { + "epoch": 0.21464663180424087, + "grad_norm": 0.5718343257904053, + "learning_rate": 9.132333061519465e-06, + "loss": 0.7174, + "step": 1064 + }, + { + "epoch": 0.21484836736044785, + "grad_norm": 0.7323482632637024, + "learning_rate": 9.130492495505902e-06, + "loss": 0.6881, + "step": 1065 + }, + { + "epoch": 0.21505010291665486, + "grad_norm": 0.48052531480789185, + "learning_rate": 9.128650165269973e-06, + "loss": 0.7082, + "step": 1066 + }, + { + "epoch": 0.21525183847286183, + "grad_norm": 0.4008042514324188, + "learning_rate": 9.126806071598579e-06, + "loss": 0.7354, + "step": 1067 + }, + { + "epoch": 0.21545357402906884, + "grad_norm": 1.4962018728256226, + "learning_rate": 9.124960215279372e-06, + "loss": 0.8422, + "step": 1068 + }, + { + "epoch": 0.21565530958527582, + "grad_norm": 0.5466567277908325, + "learning_rate": 9.123112597100759e-06, + "loss": 0.7146, + "step": 1069 + }, + { + "epoch": 0.21585704514148282, + "grad_norm": 0.8619051575660706, + "learning_rate": 9.121263217851892e-06, + "loss": 0.7536, + "step": 1070 + }, + { + "epoch": 0.2160587806976898, + "grad_norm": 0.5741105675697327, + "learning_rate": 9.119412078322688e-06, + "loss": 0.6986, + "step": 1071 + }, + { + "epoch": 0.2162605162538968, + "grad_norm": 0.5464684963226318, + "learning_rate": 9.1175591793038e-06, + "loss": 0.7119, + "step": 1072 + }, + { + "epoch": 0.2164622518101038, + "grad_norm": 0.5449472665786743, + "learning_rate": 9.11570452158665e-06, + "loss": 0.7213, + "step": 1073 + }, + { + "epoch": 0.2166639873663108, + "grad_norm": 0.830302357673645, + "learning_rate": 9.113848105963397e-06, + "loss": 0.7419, + "step": 1074 + }, + { + "epoch": 0.2168657229225178, + "grad_norm": 0.5640235543251038, + "learning_rate": 9.111989933226957e-06, + "loss": 0.638, + "step": 1075 + }, + { + "epoch": 0.21706745847872477, + "grad_norm": 0.990414559841156, + "learning_rate": 9.110130004170995e-06, + "loss": 0.7019, + "step": 1076 + }, + { + "epoch": 0.21726919403493178, + "grad_norm": 0.6244648694992065, + "learning_rate": 9.108268319589928e-06, + "loss": 0.7039, + "step": 1077 + }, + { + "epoch": 0.21747092959113876, + "grad_norm": 0.7823309302330017, + "learning_rate": 9.106404880278923e-06, + "loss": 0.766, + "step": 1078 + }, + { + "epoch": 0.21767266514734576, + "grad_norm": 0.5949699282646179, + "learning_rate": 9.104539687033891e-06, + "loss": 0.8078, + "step": 1079 + }, + { + "epoch": 0.21787440070355274, + "grad_norm": 0.4364759922027588, + "learning_rate": 9.1026727406515e-06, + "loss": 0.7588, + "step": 1080 + }, + { + "epoch": 0.21807613625975975, + "grad_norm": 1.3297545909881592, + "learning_rate": 9.100804041929161e-06, + "loss": 0.7108, + "step": 1081 + }, + { + "epoch": 0.21827787181596675, + "grad_norm": 0.5728005766868591, + "learning_rate": 9.098933591665037e-06, + "loss": 0.8183, + "step": 1082 + }, + { + "epoch": 0.21847960737217373, + "grad_norm": 0.559471607208252, + "learning_rate": 9.097061390658036e-06, + "loss": 0.8388, + "step": 1083 + }, + { + "epoch": 0.21868134292838073, + "grad_norm": 0.6557266116142273, + "learning_rate": 9.095187439707817e-06, + "loss": 0.6565, + "step": 1084 + }, + { + "epoch": 0.2188830784845877, + "grad_norm": 1.3041290044784546, + "learning_rate": 9.093311739614783e-06, + "loss": 0.677, + "step": 1085 + }, + { + "epoch": 0.21908481404079472, + "grad_norm": 0.4473462700843811, + "learning_rate": 9.091434291180088e-06, + "loss": 0.6987, + "step": 1086 + }, + { + "epoch": 0.2192865495970017, + "grad_norm": 0.5305896401405334, + "learning_rate": 9.08955509520563e-06, + "loss": 0.7524, + "step": 1087 + }, + { + "epoch": 0.2194882851532087, + "grad_norm": 0.6427012085914612, + "learning_rate": 9.087674152494052e-06, + "loss": 0.8808, + "step": 1088 + }, + { + "epoch": 0.21969002070941568, + "grad_norm": 0.9214508533477783, + "learning_rate": 9.085791463848748e-06, + "loss": 0.649, + "step": 1089 + }, + { + "epoch": 0.21989175626562268, + "grad_norm": 1.6634055376052856, + "learning_rate": 9.083907030073853e-06, + "loss": 0.8284, + "step": 1090 + }, + { + "epoch": 0.2200934918218297, + "grad_norm": 0.5022058486938477, + "learning_rate": 9.08202085197425e-06, + "loss": 0.6859, + "step": 1091 + }, + { + "epoch": 0.22029522737803667, + "grad_norm": 0.5239662528038025, + "learning_rate": 9.080132930355567e-06, + "loss": 0.6801, + "step": 1092 + }, + { + "epoch": 0.22049696293424367, + "grad_norm": 0.5457141995429993, + "learning_rate": 9.078243266024177e-06, + "loss": 0.8468, + "step": 1093 + }, + { + "epoch": 0.22069869849045065, + "grad_norm": 2.5816259384155273, + "learning_rate": 9.076351859787191e-06, + "loss": 0.7319, + "step": 1094 + }, + { + "epoch": 0.22090043404665766, + "grad_norm": 1.3747007846832275, + "learning_rate": 9.074458712452476e-06, + "loss": 0.6986, + "step": 1095 + }, + { + "epoch": 0.22110216960286463, + "grad_norm": 1.4433778524398804, + "learning_rate": 9.072563824828631e-06, + "loss": 0.7989, + "step": 1096 + }, + { + "epoch": 0.22130390515907164, + "grad_norm": 1.1537508964538574, + "learning_rate": 9.070667197725007e-06, + "loss": 0.8272, + "step": 1097 + }, + { + "epoch": 0.22150564071527862, + "grad_norm": 0.5916514992713928, + "learning_rate": 9.068768831951693e-06, + "loss": 0.758, + "step": 1098 + }, + { + "epoch": 0.22170737627148562, + "grad_norm": 1.487269639968872, + "learning_rate": 9.066868728319522e-06, + "loss": 0.8429, + "step": 1099 + }, + { + "epoch": 0.22190911182769263, + "grad_norm": 1.0207079648971558, + "learning_rate": 9.064966887640068e-06, + "loss": 0.7288, + "step": 1100 + }, + { + "epoch": 0.2221108473838996, + "grad_norm": 0.4363601505756378, + "learning_rate": 9.06306331072565e-06, + "loss": 0.6922, + "step": 1101 + }, + { + "epoch": 0.2223125829401066, + "grad_norm": 0.7018163800239563, + "learning_rate": 9.061157998389325e-06, + "loss": 0.6636, + "step": 1102 + }, + { + "epoch": 0.2225143184963136, + "grad_norm": 0.6163290739059448, + "learning_rate": 9.059250951444894e-06, + "loss": 0.7651, + "step": 1103 + }, + { + "epoch": 0.2227160540525206, + "grad_norm": 0.7058590054512024, + "learning_rate": 9.057342170706897e-06, + "loss": 0.7247, + "step": 1104 + }, + { + "epoch": 0.22291778960872757, + "grad_norm": 0.9741607904434204, + "learning_rate": 9.055431656990617e-06, + "loss": 0.6887, + "step": 1105 + }, + { + "epoch": 0.22311952516493458, + "grad_norm": 0.6746135354042053, + "learning_rate": 9.053519411112075e-06, + "loss": 0.7096, + "step": 1106 + }, + { + "epoch": 0.22332126072114158, + "grad_norm": 0.7209685444831848, + "learning_rate": 9.051605433888031e-06, + "loss": 0.6607, + "step": 1107 + }, + { + "epoch": 0.22352299627734856, + "grad_norm": 1.3863085508346558, + "learning_rate": 9.049689726135988e-06, + "loss": 0.7675, + "step": 1108 + }, + { + "epoch": 0.22372473183355557, + "grad_norm": 0.8169994354248047, + "learning_rate": 9.047772288674183e-06, + "loss": 0.6838, + "step": 1109 + }, + { + "epoch": 0.22392646738976255, + "grad_norm": 0.48424163460731506, + "learning_rate": 9.045853122321599e-06, + "loss": 0.8101, + "step": 1110 + }, + { + "epoch": 0.22412820294596955, + "grad_norm": 0.3474283814430237, + "learning_rate": 9.04393222789795e-06, + "loss": 0.6836, + "step": 1111 + }, + { + "epoch": 0.22432993850217653, + "grad_norm": 0.47879624366760254, + "learning_rate": 9.042009606223693e-06, + "loss": 0.7023, + "step": 1112 + }, + { + "epoch": 0.22453167405838353, + "grad_norm": 0.7166565656661987, + "learning_rate": 9.040085258120022e-06, + "loss": 0.8499, + "step": 1113 + }, + { + "epoch": 0.2247334096145905, + "grad_norm": 1.2024344205856323, + "learning_rate": 9.038159184408863e-06, + "loss": 0.745, + "step": 1114 + }, + { + "epoch": 0.22493514517079752, + "grad_norm": 1.2393633127212524, + "learning_rate": 9.03623138591289e-06, + "loss": 0.8218, + "step": 1115 + }, + { + "epoch": 0.22513688072700452, + "grad_norm": 0.4706745147705078, + "learning_rate": 9.034301863455504e-06, + "loss": 0.7126, + "step": 1116 + }, + { + "epoch": 0.2253386162832115, + "grad_norm": 1.2676310539245605, + "learning_rate": 9.032370617860844e-06, + "loss": 0.7014, + "step": 1117 + }, + { + "epoch": 0.2255403518394185, + "grad_norm": 0.6443676352500916, + "learning_rate": 9.03043764995379e-06, + "loss": 0.6831, + "step": 1118 + }, + { + "epoch": 0.22574208739562548, + "grad_norm": 1.2037488222122192, + "learning_rate": 9.028502960559952e-06, + "loss": 0.6523, + "step": 1119 + }, + { + "epoch": 0.2259438229518325, + "grad_norm": 0.8409974575042725, + "learning_rate": 9.026566550505677e-06, + "loss": 0.689, + "step": 1120 + }, + { + "epoch": 0.22614555850803947, + "grad_norm": 0.48111411929130554, + "learning_rate": 9.02462842061805e-06, + "loss": 0.6924, + "step": 1121 + }, + { + "epoch": 0.22634729406424647, + "grad_norm": 0.8593577146530151, + "learning_rate": 9.022688571724888e-06, + "loss": 0.6815, + "step": 1122 + }, + { + "epoch": 0.22654902962045345, + "grad_norm": 0.8607560396194458, + "learning_rate": 9.02074700465474e-06, + "loss": 0.7054, + "step": 1123 + }, + { + "epoch": 0.22675076517666046, + "grad_norm": 0.4094844460487366, + "learning_rate": 9.018803720236891e-06, + "loss": 0.7117, + "step": 1124 + }, + { + "epoch": 0.22695250073286746, + "grad_norm": 0.4774841070175171, + "learning_rate": 9.016858719301363e-06, + "loss": 0.6442, + "step": 1125 + }, + { + "epoch": 0.22715423628907444, + "grad_norm": 0.7203394174575806, + "learning_rate": 9.014912002678905e-06, + "loss": 0.7519, + "step": 1126 + }, + { + "epoch": 0.22735597184528145, + "grad_norm": 0.4779406189918518, + "learning_rate": 9.012963571200998e-06, + "loss": 0.7155, + "step": 1127 + }, + { + "epoch": 0.22755770740148842, + "grad_norm": 1.0806742906570435, + "learning_rate": 9.011013425699868e-06, + "loss": 0.6541, + "step": 1128 + }, + { + "epoch": 0.22775944295769543, + "grad_norm": 0.3512164354324341, + "learning_rate": 9.00906156700846e-06, + "loss": 0.7335, + "step": 1129 + }, + { + "epoch": 0.2279611785139024, + "grad_norm": 0.7348980903625488, + "learning_rate": 9.007107995960452e-06, + "loss": 0.6764, + "step": 1130 + }, + { + "epoch": 0.2281629140701094, + "grad_norm": 0.45355191826820374, + "learning_rate": 9.005152713390259e-06, + "loss": 0.7835, + "step": 1131 + }, + { + "epoch": 0.2283646496263164, + "grad_norm": 0.4727195203304291, + "learning_rate": 9.003195720133024e-06, + "loss": 0.6377, + "step": 1132 + }, + { + "epoch": 0.2285663851825234, + "grad_norm": 1.2743937969207764, + "learning_rate": 9.001237017024621e-06, + "loss": 0.6838, + "step": 1133 + }, + { + "epoch": 0.2287681207387304, + "grad_norm": 0.3683013916015625, + "learning_rate": 8.999276604901654e-06, + "loss": 0.6876, + "step": 1134 + }, + { + "epoch": 0.22896985629493738, + "grad_norm": 0.652779757976532, + "learning_rate": 8.997314484601458e-06, + "loss": 0.6726, + "step": 1135 + }, + { + "epoch": 0.22917159185114439, + "grad_norm": 1.0614265203475952, + "learning_rate": 8.995350656962098e-06, + "loss": 0.6525, + "step": 1136 + }, + { + "epoch": 0.22937332740735136, + "grad_norm": 0.6317399740219116, + "learning_rate": 8.993385122822364e-06, + "loss": 0.7065, + "step": 1137 + }, + { + "epoch": 0.22957506296355837, + "grad_norm": 0.37013980746269226, + "learning_rate": 8.99141788302178e-06, + "loss": 0.6774, + "step": 1138 + }, + { + "epoch": 0.22977679851976535, + "grad_norm": 0.40387916564941406, + "learning_rate": 8.989448938400596e-06, + "loss": 0.6923, + "step": 1139 + }, + { + "epoch": 0.22997853407597235, + "grad_norm": 0.4020114839076996, + "learning_rate": 8.987478289799792e-06, + "loss": 0.6752, + "step": 1140 + }, + { + "epoch": 0.23018026963217933, + "grad_norm": 0.7502828240394592, + "learning_rate": 8.98550593806107e-06, + "loss": 0.6412, + "step": 1141 + }, + { + "epoch": 0.23038200518838634, + "grad_norm": 0.4360479414463043, + "learning_rate": 8.98353188402687e-06, + "loss": 0.6961, + "step": 1142 + }, + { + "epoch": 0.23058374074459334, + "grad_norm": 0.7076632976531982, + "learning_rate": 8.98155612854035e-06, + "loss": 0.7323, + "step": 1143 + }, + { + "epoch": 0.23078547630080032, + "grad_norm": 0.4245670735836029, + "learning_rate": 8.979578672445397e-06, + "loss": 0.6964, + "step": 1144 + }, + { + "epoch": 0.23098721185700732, + "grad_norm": 0.4887491464614868, + "learning_rate": 8.977599516586625e-06, + "loss": 0.8195, + "step": 1145 + }, + { + "epoch": 0.2311889474132143, + "grad_norm": 0.816503643989563, + "learning_rate": 8.975618661809378e-06, + "loss": 0.7633, + "step": 1146 + }, + { + "epoch": 0.2313906829694213, + "grad_norm": 0.37157323956489563, + "learning_rate": 8.973636108959718e-06, + "loss": 0.7864, + "step": 1147 + }, + { + "epoch": 0.23159241852562829, + "grad_norm": 2.1369481086730957, + "learning_rate": 8.971651858884436e-06, + "loss": 0.7709, + "step": 1148 + }, + { + "epoch": 0.2317941540818353, + "grad_norm": 0.6506975889205933, + "learning_rate": 8.969665912431049e-06, + "loss": 0.9078, + "step": 1149 + }, + { + "epoch": 0.2319958896380423, + "grad_norm": 0.3176805377006531, + "learning_rate": 8.9676782704478e-06, + "loss": 0.6704, + "step": 1150 + }, + { + "epoch": 0.23219762519424927, + "grad_norm": 0.45958906412124634, + "learning_rate": 8.965688933783648e-06, + "loss": 0.8634, + "step": 1151 + }, + { + "epoch": 0.23239936075045628, + "grad_norm": 0.4763012230396271, + "learning_rate": 8.963697903288287e-06, + "loss": 0.7168, + "step": 1152 + }, + { + "epoch": 0.23260109630666326, + "grad_norm": 0.3679403066635132, + "learning_rate": 8.961705179812126e-06, + "loss": 0.6559, + "step": 1153 + }, + { + "epoch": 0.23280283186287026, + "grad_norm": 1.3070601224899292, + "learning_rate": 8.9597107642063e-06, + "loss": 0.7322, + "step": 1154 + }, + { + "epoch": 0.23300456741907724, + "grad_norm": 1.5573060512542725, + "learning_rate": 8.957714657322669e-06, + "loss": 0.8178, + "step": 1155 + }, + { + "epoch": 0.23320630297528425, + "grad_norm": 1.0047229528427124, + "learning_rate": 8.955716860013812e-06, + "loss": 0.9026, + "step": 1156 + }, + { + "epoch": 0.23340803853149122, + "grad_norm": 0.44546905159950256, + "learning_rate": 8.953717373133031e-06, + "loss": 0.6928, + "step": 1157 + }, + { + "epoch": 0.23360977408769823, + "grad_norm": 0.6220713257789612, + "learning_rate": 8.95171619753435e-06, + "loss": 0.6822, + "step": 1158 + }, + { + "epoch": 0.23381150964390524, + "grad_norm": 0.8338699340820312, + "learning_rate": 8.949713334072516e-06, + "loss": 0.7441, + "step": 1159 + }, + { + "epoch": 0.2340132452001122, + "grad_norm": 0.8864205479621887, + "learning_rate": 8.947708783602993e-06, + "loss": 0.676, + "step": 1160 + }, + { + "epoch": 0.23421498075631922, + "grad_norm": 0.6396302580833435, + "learning_rate": 8.94570254698197e-06, + "loss": 0.7031, + "step": 1161 + }, + { + "epoch": 0.2344167163125262, + "grad_norm": 0.4238170385360718, + "learning_rate": 8.94369462506635e-06, + "loss": 0.6581, + "step": 1162 + }, + { + "epoch": 0.2346184518687332, + "grad_norm": 0.36502161622047424, + "learning_rate": 8.941685018713762e-06, + "loss": 0.7065, + "step": 1163 + }, + { + "epoch": 0.23482018742494018, + "grad_norm": 0.5333518385887146, + "learning_rate": 8.939673728782555e-06, + "loss": 0.7189, + "step": 1164 + }, + { + "epoch": 0.23502192298114719, + "grad_norm": 0.4384578466415405, + "learning_rate": 8.937660756131789e-06, + "loss": 0.8163, + "step": 1165 + }, + { + "epoch": 0.23522365853735416, + "grad_norm": 0.4460729658603668, + "learning_rate": 8.935646101621252e-06, + "loss": 0.6813, + "step": 1166 + }, + { + "epoch": 0.23542539409356117, + "grad_norm": 0.53566974401474, + "learning_rate": 8.933629766111443e-06, + "loss": 0.6833, + "step": 1167 + }, + { + "epoch": 0.23562712964976817, + "grad_norm": 0.4094941020011902, + "learning_rate": 8.931611750463586e-06, + "loss": 0.6722, + "step": 1168 + }, + { + "epoch": 0.23582886520597515, + "grad_norm": 0.462923139333725, + "learning_rate": 8.929592055539615e-06, + "loss": 0.8593, + "step": 1169 + }, + { + "epoch": 0.23603060076218216, + "grad_norm": 0.5280745029449463, + "learning_rate": 8.92757068220219e-06, + "loss": 0.6685, + "step": 1170 + }, + { + "epoch": 0.23623233631838914, + "grad_norm": 0.5644258856773376, + "learning_rate": 8.925547631314679e-06, + "loss": 0.6895, + "step": 1171 + }, + { + "epoch": 0.23643407187459614, + "grad_norm": 0.4548441469669342, + "learning_rate": 8.923522903741173e-06, + "loss": 0.6942, + "step": 1172 + }, + { + "epoch": 0.23663580743080312, + "grad_norm": 0.48022985458374023, + "learning_rate": 8.921496500346477e-06, + "loss": 0.6608, + "step": 1173 + }, + { + "epoch": 0.23683754298701012, + "grad_norm": 0.5295478105545044, + "learning_rate": 8.91946842199611e-06, + "loss": 0.6691, + "step": 1174 + }, + { + "epoch": 0.2370392785432171, + "grad_norm": 0.4129369854927063, + "learning_rate": 8.917438669556307e-06, + "loss": 0.6763, + "step": 1175 + }, + { + "epoch": 0.2372410140994241, + "grad_norm": 0.4060460329055786, + "learning_rate": 8.915407243894022e-06, + "loss": 0.725, + "step": 1176 + }, + { + "epoch": 0.2374427496556311, + "grad_norm": 0.8107428550720215, + "learning_rate": 8.913374145876918e-06, + "loss": 0.8017, + "step": 1177 + }, + { + "epoch": 0.2376444852118381, + "grad_norm": 0.35032761096954346, + "learning_rate": 8.911339376373377e-06, + "loss": 0.7125, + "step": 1178 + }, + { + "epoch": 0.2378462207680451, + "grad_norm": 1.233778476715088, + "learning_rate": 8.909302936252491e-06, + "loss": 0.7081, + "step": 1179 + }, + { + "epoch": 0.23804795632425207, + "grad_norm": 0.5814195871353149, + "learning_rate": 8.90726482638407e-06, + "loss": 0.6855, + "step": 1180 + }, + { + "epoch": 0.23824969188045908, + "grad_norm": 0.5171557664871216, + "learning_rate": 8.905225047638633e-06, + "loss": 0.6714, + "step": 1181 + }, + { + "epoch": 0.23845142743666606, + "grad_norm": 1.1142185926437378, + "learning_rate": 8.903183600887412e-06, + "loss": 1.0363, + "step": 1182 + }, + { + "epoch": 0.23865316299287306, + "grad_norm": 2.59897518157959, + "learning_rate": 8.901140487002358e-06, + "loss": 0.795, + "step": 1183 + }, + { + "epoch": 0.23885489854908004, + "grad_norm": 3.8224384784698486, + "learning_rate": 8.899095706856122e-06, + "loss": 0.7541, + "step": 1184 + }, + { + "epoch": 0.23905663410528705, + "grad_norm": 0.720397412776947, + "learning_rate": 8.897049261322079e-06, + "loss": 0.6947, + "step": 1185 + }, + { + "epoch": 0.23925836966149405, + "grad_norm": 3.830878734588623, + "learning_rate": 8.895001151274309e-06, + "loss": 0.7975, + "step": 1186 + }, + { + "epoch": 0.23946010521770103, + "grad_norm": 0.5359489321708679, + "learning_rate": 8.892951377587602e-06, + "loss": 0.6846, + "step": 1187 + }, + { + "epoch": 0.23966184077390804, + "grad_norm": 0.6558648943901062, + "learning_rate": 8.890899941137461e-06, + "loss": 0.6896, + "step": 1188 + }, + { + "epoch": 0.239863576330115, + "grad_norm": 0.8913664221763611, + "learning_rate": 8.888846842800101e-06, + "loss": 0.7391, + "step": 1189 + }, + { + "epoch": 0.24006531188632202, + "grad_norm": 0.5871713757514954, + "learning_rate": 8.886792083452443e-06, + "loss": 0.8685, + "step": 1190 + }, + { + "epoch": 0.240267047442529, + "grad_norm": 0.4375171959400177, + "learning_rate": 8.884735663972118e-06, + "loss": 0.7846, + "step": 1191 + }, + { + "epoch": 0.240468782998736, + "grad_norm": 1.61861252784729, + "learning_rate": 8.882677585237467e-06, + "loss": 0.6952, + "step": 1192 + }, + { + "epoch": 0.24067051855494298, + "grad_norm": 0.9504277110099792, + "learning_rate": 8.880617848127542e-06, + "loss": 0.6648, + "step": 1193 + }, + { + "epoch": 0.24087225411114999, + "grad_norm": 0.5535653233528137, + "learning_rate": 8.8785564535221e-06, + "loss": 0.7864, + "step": 1194 + }, + { + "epoch": 0.241073989667357, + "grad_norm": 0.38245075941085815, + "learning_rate": 8.876493402301606e-06, + "loss": 0.6732, + "step": 1195 + }, + { + "epoch": 0.24127572522356397, + "grad_norm": 0.43615174293518066, + "learning_rate": 8.874428695347237e-06, + "loss": 0.7012, + "step": 1196 + }, + { + "epoch": 0.24147746077977097, + "grad_norm": 0.7267807722091675, + "learning_rate": 8.872362333540869e-06, + "loss": 0.6345, + "step": 1197 + }, + { + "epoch": 0.24167919633597795, + "grad_norm": 0.5719030499458313, + "learning_rate": 8.870294317765094e-06, + "loss": 0.7034, + "step": 1198 + }, + { + "epoch": 0.24188093189218496, + "grad_norm": 0.8841007351875305, + "learning_rate": 8.868224648903203e-06, + "loss": 0.707, + "step": 1199 + }, + { + "epoch": 0.24208266744839194, + "grad_norm": 0.48409315943717957, + "learning_rate": 8.866153327839198e-06, + "loss": 0.7201, + "step": 1200 + }, + { + "epoch": 0.24228440300459894, + "grad_norm": 0.7220364212989807, + "learning_rate": 8.864080355457782e-06, + "loss": 0.7586, + "step": 1201 + }, + { + "epoch": 0.24248613856080595, + "grad_norm": 0.43595415353775024, + "learning_rate": 8.862005732644373e-06, + "loss": 0.6996, + "step": 1202 + }, + { + "epoch": 0.24268787411701293, + "grad_norm": 0.4253355860710144, + "learning_rate": 8.859929460285078e-06, + "loss": 0.8073, + "step": 1203 + }, + { + "epoch": 0.24288960967321993, + "grad_norm": 0.6659312844276428, + "learning_rate": 8.857851539266724e-06, + "loss": 0.6698, + "step": 1204 + }, + { + "epoch": 0.2430913452294269, + "grad_norm": 0.3482789695262909, + "learning_rate": 8.855771970476834e-06, + "loss": 0.6306, + "step": 1205 + }, + { + "epoch": 0.24329308078563391, + "grad_norm": 6.6193718910217285, + "learning_rate": 8.853690754803638e-06, + "loss": 0.6382, + "step": 1206 + }, + { + "epoch": 0.2434948163418409, + "grad_norm": 0.38211026787757874, + "learning_rate": 8.851607893136065e-06, + "loss": 0.693, + "step": 1207 + }, + { + "epoch": 0.2436965518980479, + "grad_norm": 0.7665725946426392, + "learning_rate": 8.849523386363754e-06, + "loss": 0.6995, + "step": 1208 + }, + { + "epoch": 0.24389828745425488, + "grad_norm": 0.5308759212493896, + "learning_rate": 8.84743723537704e-06, + "loss": 0.7059, + "step": 1209 + }, + { + "epoch": 0.24410002301046188, + "grad_norm": 0.8600336313247681, + "learning_rate": 8.845349441066961e-06, + "loss": 0.7524, + "step": 1210 + }, + { + "epoch": 0.2443017585666689, + "grad_norm": 0.656417191028595, + "learning_rate": 8.843260004325265e-06, + "loss": 0.7514, + "step": 1211 + }, + { + "epoch": 0.24450349412287586, + "grad_norm": 0.33181649446487427, + "learning_rate": 8.84116892604439e-06, + "loss": 0.6944, + "step": 1212 + }, + { + "epoch": 0.24470522967908287, + "grad_norm": 1.0347340106964111, + "learning_rate": 8.839076207117485e-06, + "loss": 0.7253, + "step": 1213 + }, + { + "epoch": 0.24490696523528985, + "grad_norm": 1.1775801181793213, + "learning_rate": 8.83698184843839e-06, + "loss": 0.6874, + "step": 1214 + }, + { + "epoch": 0.24510870079149685, + "grad_norm": 0.4488203823566437, + "learning_rate": 8.834885850901656e-06, + "loss": 1.1538, + "step": 1215 + }, + { + "epoch": 0.24531043634770383, + "grad_norm": 0.508878231048584, + "learning_rate": 8.832788215402527e-06, + "loss": 0.8184, + "step": 1216 + }, + { + "epoch": 0.24551217190391084, + "grad_norm": 0.42795446515083313, + "learning_rate": 8.830688942836946e-06, + "loss": 0.6962, + "step": 1217 + }, + { + "epoch": 0.24571390746011781, + "grad_norm": 0.7489685416221619, + "learning_rate": 8.828588034101561e-06, + "loss": 0.6939, + "step": 1218 + }, + { + "epoch": 0.24591564301632482, + "grad_norm": 0.5963667035102844, + "learning_rate": 8.826485490093714e-06, + "loss": 0.6844, + "step": 1219 + }, + { + "epoch": 0.24611737857253183, + "grad_norm": 0.5979509353637695, + "learning_rate": 8.824381311711444e-06, + "loss": 0.6856, + "step": 1220 + }, + { + "epoch": 0.2463191141287388, + "grad_norm": 0.4428512454032898, + "learning_rate": 8.822275499853497e-06, + "loss": 0.6599, + "step": 1221 + }, + { + "epoch": 0.2465208496849458, + "grad_norm": 0.47854629158973694, + "learning_rate": 8.820168055419306e-06, + "loss": 0.6481, + "step": 1222 + }, + { + "epoch": 0.2467225852411528, + "grad_norm": 0.5568980574607849, + "learning_rate": 8.818058979309007e-06, + "loss": 0.6722, + "step": 1223 + }, + { + "epoch": 0.2469243207973598, + "grad_norm": 0.5346736311912537, + "learning_rate": 8.815948272423432e-06, + "loss": 0.7013, + "step": 1224 + }, + { + "epoch": 0.24712605635356677, + "grad_norm": 0.515552818775177, + "learning_rate": 8.81383593566411e-06, + "loss": 0.8875, + "step": 1225 + }, + { + "epoch": 0.24732779190977378, + "grad_norm": 0.581317126750946, + "learning_rate": 8.811721969933264e-06, + "loss": 0.7057, + "step": 1226 + }, + { + "epoch": 0.24752952746598075, + "grad_norm": 0.5679516792297363, + "learning_rate": 8.809606376133814e-06, + "loss": 0.701, + "step": 1227 + }, + { + "epoch": 0.24773126302218776, + "grad_norm": 0.623017430305481, + "learning_rate": 8.80748915516938e-06, + "loss": 0.7434, + "step": 1228 + }, + { + "epoch": 0.24793299857839476, + "grad_norm": 0.7623668313026428, + "learning_rate": 8.805370307944268e-06, + "loss": 0.6615, + "step": 1229 + }, + { + "epoch": 0.24813473413460174, + "grad_norm": 0.4862426817417145, + "learning_rate": 8.803249835363486e-06, + "loss": 0.6661, + "step": 1230 + }, + { + "epoch": 0.24833646969080875, + "grad_norm": 0.9921598434448242, + "learning_rate": 8.801127738332731e-06, + "loss": 0.7617, + "step": 1231 + }, + { + "epoch": 0.24853820524701573, + "grad_norm": 0.4795064926147461, + "learning_rate": 8.7990040177584e-06, + "loss": 0.8968, + "step": 1232 + }, + { + "epoch": 0.24873994080322273, + "grad_norm": 2.262944221496582, + "learning_rate": 8.796878674547578e-06, + "loss": 0.6576, + "step": 1233 + }, + { + "epoch": 0.2489416763594297, + "grad_norm": 1.5917365550994873, + "learning_rate": 8.794751709608042e-06, + "loss": 0.8318, + "step": 1234 + }, + { + "epoch": 0.24914341191563671, + "grad_norm": 0.45491623878479004, + "learning_rate": 8.79262312384827e-06, + "loss": 0.6919, + "step": 1235 + }, + { + "epoch": 0.2493451474718437, + "grad_norm": 0.7376951575279236, + "learning_rate": 8.790492918177423e-06, + "loss": 0.7886, + "step": 1236 + }, + { + "epoch": 0.2495468830280507, + "grad_norm": 0.6271589994430542, + "learning_rate": 8.788361093505358e-06, + "loss": 0.6272, + "step": 1237 + }, + { + "epoch": 0.2497486185842577, + "grad_norm": 0.5672959685325623, + "learning_rate": 8.786227650742624e-06, + "loss": 0.6987, + "step": 1238 + }, + { + "epoch": 0.24995035414046468, + "grad_norm": 0.41392165422439575, + "learning_rate": 8.784092590800462e-06, + "loss": 0.6873, + "step": 1239 + }, + { + "epoch": 0.2501520896966717, + "grad_norm": 0.5560492873191833, + "learning_rate": 8.781955914590801e-06, + "loss": 0.6692, + "step": 1240 + }, + { + "epoch": 0.2503538252528787, + "grad_norm": 1.9556437730789185, + "learning_rate": 8.77981762302626e-06, + "loss": 0.7373, + "step": 1241 + }, + { + "epoch": 0.25055556080908564, + "grad_norm": 0.4091489613056183, + "learning_rate": 8.77767771702015e-06, + "loss": 0.6626, + "step": 1242 + }, + { + "epoch": 0.25075729636529265, + "grad_norm": 0.693080723285675, + "learning_rate": 8.775536197486471e-06, + "loss": 0.6866, + "step": 1243 + }, + { + "epoch": 0.25095903192149965, + "grad_norm": 0.6191006898880005, + "learning_rate": 8.773393065339915e-06, + "loss": 0.6609, + "step": 1244 + }, + { + "epoch": 0.25116076747770666, + "grad_norm": 0.4202521741390228, + "learning_rate": 8.771248321495856e-06, + "loss": 0.7482, + "step": 1245 + }, + { + "epoch": 0.25136250303391366, + "grad_norm": 0.4725857377052307, + "learning_rate": 8.769101966870362e-06, + "loss": 0.8326, + "step": 1246 + }, + { + "epoch": 0.2515642385901206, + "grad_norm": 0.6367688775062561, + "learning_rate": 8.766954002380188e-06, + "loss": 0.6573, + "step": 1247 + }, + { + "epoch": 0.2517659741463276, + "grad_norm": 0.3952287435531616, + "learning_rate": 8.764804428942774e-06, + "loss": 0.6654, + "step": 1248 + }, + { + "epoch": 0.2519677097025346, + "grad_norm": 0.9058800935745239, + "learning_rate": 8.762653247476249e-06, + "loss": 0.6637, + "step": 1249 + }, + { + "epoch": 0.25216944525874163, + "grad_norm": 0.42118096351623535, + "learning_rate": 8.760500458899432e-06, + "loss": 0.6979, + "step": 1250 + }, + { + "epoch": 0.2523711808149486, + "grad_norm": 0.3834727704524994, + "learning_rate": 8.758346064131824e-06, + "loss": 0.6721, + "step": 1251 + }, + { + "epoch": 0.2525729163711556, + "grad_norm": 0.7268792390823364, + "learning_rate": 8.756190064093613e-06, + "loss": 0.6441, + "step": 1252 + }, + { + "epoch": 0.2527746519273626, + "grad_norm": 0.3844261169433594, + "learning_rate": 8.754032459705672e-06, + "loss": 0.677, + "step": 1253 + }, + { + "epoch": 0.2529763874835696, + "grad_norm": 1.0050066709518433, + "learning_rate": 8.751873251889563e-06, + "loss": 0.6986, + "step": 1254 + }, + { + "epoch": 0.2531781230397766, + "grad_norm": 0.4152543544769287, + "learning_rate": 8.749712441567526e-06, + "loss": 0.7067, + "step": 1255 + }, + { + "epoch": 0.25337985859598355, + "grad_norm": 0.9145545363426208, + "learning_rate": 8.747550029662493e-06, + "loss": 0.6727, + "step": 1256 + }, + { + "epoch": 0.25358159415219056, + "grad_norm": 0.5626360177993774, + "learning_rate": 8.74538601709808e-06, + "loss": 0.7, + "step": 1257 + }, + { + "epoch": 0.25378332970839756, + "grad_norm": 0.9599636197090149, + "learning_rate": 8.743220404798573e-06, + "loss": 0.731, + "step": 1258 + }, + { + "epoch": 0.25398506526460457, + "grad_norm": 0.6157480478286743, + "learning_rate": 8.741053193688964e-06, + "loss": 0.6644, + "step": 1259 + }, + { + "epoch": 0.2541868008208115, + "grad_norm": 0.6961658596992493, + "learning_rate": 8.738884384694905e-06, + "loss": 0.6765, + "step": 1260 + }, + { + "epoch": 0.2543885363770185, + "grad_norm": 1.0447839498519897, + "learning_rate": 8.73671397874275e-06, + "loss": 0.7055, + "step": 1261 + }, + { + "epoch": 0.25459027193322553, + "grad_norm": 0.5891323685646057, + "learning_rate": 8.734541976759519e-06, + "loss": 0.7109, + "step": 1262 + }, + { + "epoch": 0.25479200748943254, + "grad_norm": 1.4431084394454956, + "learning_rate": 8.732368379672924e-06, + "loss": 0.844, + "step": 1263 + }, + { + "epoch": 0.25499374304563954, + "grad_norm": 0.45965614914894104, + "learning_rate": 8.730193188411355e-06, + "loss": 0.6844, + "step": 1264 + }, + { + "epoch": 0.2551954786018465, + "grad_norm": 0.7861972451210022, + "learning_rate": 8.728016403903884e-06, + "loss": 0.6699, + "step": 1265 + }, + { + "epoch": 0.2553972141580535, + "grad_norm": 1.466948390007019, + "learning_rate": 8.725838027080261e-06, + "loss": 0.7904, + "step": 1266 + }, + { + "epoch": 0.2555989497142605, + "grad_norm": 0.543451726436615, + "learning_rate": 8.723658058870919e-06, + "loss": 0.8914, + "step": 1267 + }, + { + "epoch": 0.2558006852704675, + "grad_norm": 0.5557923913002014, + "learning_rate": 8.721476500206968e-06, + "loss": 0.6768, + "step": 1268 + }, + { + "epoch": 0.25600242082667446, + "grad_norm": 0.37838971614837646, + "learning_rate": 8.7192933520202e-06, + "loss": 0.6898, + "step": 1269 + }, + { + "epoch": 0.25620415638288146, + "grad_norm": 0.6620925664901733, + "learning_rate": 8.717108615243081e-06, + "loss": 0.6722, + "step": 1270 + }, + { + "epoch": 0.25640589193908847, + "grad_norm": 0.35290196537971497, + "learning_rate": 8.714922290808766e-06, + "loss": 0.6942, + "step": 1271 + }, + { + "epoch": 0.2566076274952955, + "grad_norm": 0.644790768623352, + "learning_rate": 8.712734379651075e-06, + "loss": 0.7593, + "step": 1272 + }, + { + "epoch": 0.2568093630515025, + "grad_norm": 0.7884232401847839, + "learning_rate": 8.710544882704516e-06, + "loss": 0.7974, + "step": 1273 + }, + { + "epoch": 0.25701109860770943, + "grad_norm": 0.6848911046981812, + "learning_rate": 8.708353800904269e-06, + "loss": 0.7131, + "step": 1274 + }, + { + "epoch": 0.25721283416391644, + "grad_norm": 0.5041124820709229, + "learning_rate": 8.706161135186192e-06, + "loss": 0.6846, + "step": 1275 + }, + { + "epoch": 0.25741456972012344, + "grad_norm": 1.2871185541152954, + "learning_rate": 8.703966886486819e-06, + "loss": 0.6742, + "step": 1276 + }, + { + "epoch": 0.25761630527633045, + "grad_norm": 0.5651084184646606, + "learning_rate": 8.701771055743363e-06, + "loss": 0.7076, + "step": 1277 + }, + { + "epoch": 0.2578180408325374, + "grad_norm": 0.4107651710510254, + "learning_rate": 8.699573643893708e-06, + "loss": 0.6315, + "step": 1278 + }, + { + "epoch": 0.2580197763887444, + "grad_norm": 0.48510411381721497, + "learning_rate": 8.697374651876419e-06, + "loss": 0.74, + "step": 1279 + }, + { + "epoch": 0.2582215119449514, + "grad_norm": 0.36969929933547974, + "learning_rate": 8.695174080630728e-06, + "loss": 1.1805, + "step": 1280 + }, + { + "epoch": 0.2584232475011584, + "grad_norm": 0.48439717292785645, + "learning_rate": 8.692971931096553e-06, + "loss": 0.7533, + "step": 1281 + }, + { + "epoch": 0.2586249830573654, + "grad_norm": 0.5932062864303589, + "learning_rate": 8.690768204214474e-06, + "loss": 0.7201, + "step": 1282 + }, + { + "epoch": 0.25882671861357237, + "grad_norm": 1.3188583850860596, + "learning_rate": 8.688562900925755e-06, + "loss": 0.6881, + "step": 1283 + }, + { + "epoch": 0.2590284541697794, + "grad_norm": 0.40501800179481506, + "learning_rate": 8.686356022172324e-06, + "loss": 0.6485, + "step": 1284 + }, + { + "epoch": 0.2592301897259864, + "grad_norm": 0.7536221146583557, + "learning_rate": 8.684147568896788e-06, + "loss": 0.6903, + "step": 1285 + }, + { + "epoch": 0.2594319252821934, + "grad_norm": 0.5470438599586487, + "learning_rate": 8.681937542042426e-06, + "loss": 0.7256, + "step": 1286 + }, + { + "epoch": 0.25963366083840034, + "grad_norm": 0.5705691576004028, + "learning_rate": 8.679725942553189e-06, + "loss": 0.6838, + "step": 1287 + }, + { + "epoch": 0.25983539639460734, + "grad_norm": 0.7795629501342773, + "learning_rate": 8.677512771373695e-06, + "loss": 0.7627, + "step": 1288 + }, + { + "epoch": 0.26003713195081435, + "grad_norm": 1.288813829421997, + "learning_rate": 8.675298029449241e-06, + "loss": 0.6883, + "step": 1289 + }, + { + "epoch": 0.26023886750702135, + "grad_norm": 0.40749382972717285, + "learning_rate": 8.67308171772579e-06, + "loss": 0.7227, + "step": 1290 + }, + { + "epoch": 0.26044060306322836, + "grad_norm": 1.280716061592102, + "learning_rate": 8.670863837149976e-06, + "loss": 0.8096, + "step": 1291 + }, + { + "epoch": 0.2606423386194353, + "grad_norm": 0.424046128988266, + "learning_rate": 8.668644388669102e-06, + "loss": 0.7505, + "step": 1292 + }, + { + "epoch": 0.2608440741756423, + "grad_norm": 0.8731921911239624, + "learning_rate": 8.666423373231145e-06, + "loss": 0.7711, + "step": 1293 + }, + { + "epoch": 0.2610458097318493, + "grad_norm": 0.40730440616607666, + "learning_rate": 8.664200791784746e-06, + "loss": 0.7384, + "step": 1294 + }, + { + "epoch": 0.2612475452880563, + "grad_norm": 0.45330461859703064, + "learning_rate": 8.66197664527922e-06, + "loss": 0.8328, + "step": 1295 + }, + { + "epoch": 0.2614492808442633, + "grad_norm": 0.6191751956939697, + "learning_rate": 8.659750934664546e-06, + "loss": 0.7547, + "step": 1296 + }, + { + "epoch": 0.2616510164004703, + "grad_norm": 0.5399580597877502, + "learning_rate": 8.657523660891376e-06, + "loss": 0.9473, + "step": 1297 + }, + { + "epoch": 0.2618527519566773, + "grad_norm": 0.5975515246391296, + "learning_rate": 8.655294824911022e-06, + "loss": 0.6853, + "step": 1298 + }, + { + "epoch": 0.2620544875128843, + "grad_norm": 0.5757783651351929, + "learning_rate": 8.65306442767547e-06, + "loss": 0.6934, + "step": 1299 + }, + { + "epoch": 0.2622562230690913, + "grad_norm": 0.5642322301864624, + "learning_rate": 8.650832470137373e-06, + "loss": 0.6925, + "step": 1300 + }, + { + "epoch": 0.26245795862529825, + "grad_norm": 0.3975115716457367, + "learning_rate": 8.648598953250045e-06, + "loss": 0.9906, + "step": 1301 + }, + { + "epoch": 0.26265969418150525, + "grad_norm": 0.4235062599182129, + "learning_rate": 8.64636387796747e-06, + "loss": 0.8266, + "step": 1302 + }, + { + "epoch": 0.26286142973771226, + "grad_norm": 0.4979153275489807, + "learning_rate": 8.644127245244298e-06, + "loss": 0.707, + "step": 1303 + }, + { + "epoch": 0.26306316529391927, + "grad_norm": 0.8910357356071472, + "learning_rate": 8.641889056035842e-06, + "loss": 1.0055, + "step": 1304 + }, + { + "epoch": 0.2632649008501262, + "grad_norm": 0.6762681603431702, + "learning_rate": 8.639649311298081e-06, + "loss": 0.948, + "step": 1305 + }, + { + "epoch": 0.2634666364063332, + "grad_norm": 0.5388845801353455, + "learning_rate": 8.637408011987657e-06, + "loss": 0.6458, + "step": 1306 + }, + { + "epoch": 0.2636683719625402, + "grad_norm": 0.4577954113483429, + "learning_rate": 8.63516515906188e-06, + "loss": 0.9071, + "step": 1307 + }, + { + "epoch": 0.26387010751874723, + "grad_norm": 0.4139573276042938, + "learning_rate": 8.63292075347872e-06, + "loss": 0.6905, + "step": 1308 + }, + { + "epoch": 0.26407184307495424, + "grad_norm": 1.0751546621322632, + "learning_rate": 8.630674796196809e-06, + "loss": 0.683, + "step": 1309 + }, + { + "epoch": 0.2642735786311612, + "grad_norm": 2.5541248321533203, + "learning_rate": 8.628427288175444e-06, + "loss": 0.7364, + "step": 1310 + }, + { + "epoch": 0.2644753141873682, + "grad_norm": 0.41766172647476196, + "learning_rate": 8.626178230374588e-06, + "loss": 0.7022, + "step": 1311 + }, + { + "epoch": 0.2646770497435752, + "grad_norm": 0.4580974578857422, + "learning_rate": 8.623927623754858e-06, + "loss": 0.7594, + "step": 1312 + }, + { + "epoch": 0.2648787852997822, + "grad_norm": 1.566453456878662, + "learning_rate": 8.621675469277538e-06, + "loss": 0.7054, + "step": 1313 + }, + { + "epoch": 0.26508052085598915, + "grad_norm": 1.0086655616760254, + "learning_rate": 8.619421767904571e-06, + "loss": 0.6916, + "step": 1314 + }, + { + "epoch": 0.26528225641219616, + "grad_norm": 0.5008938312530518, + "learning_rate": 8.617166520598563e-06, + "loss": 0.7232, + "step": 1315 + }, + { + "epoch": 0.26548399196840317, + "grad_norm": 0.32938429713249207, + "learning_rate": 8.614909728322778e-06, + "loss": 0.8034, + "step": 1316 + }, + { + "epoch": 0.26568572752461017, + "grad_norm": 0.5591095089912415, + "learning_rate": 8.612651392041138e-06, + "loss": 0.8449, + "step": 1317 + }, + { + "epoch": 0.2658874630808172, + "grad_norm": 1.0034416913986206, + "learning_rate": 8.610391512718232e-06, + "loss": 0.6897, + "step": 1318 + }, + { + "epoch": 0.2660891986370241, + "grad_norm": 0.5176849365234375, + "learning_rate": 8.6081300913193e-06, + "loss": 0.6888, + "step": 1319 + }, + { + "epoch": 0.26629093419323113, + "grad_norm": 0.4472174346446991, + "learning_rate": 8.605867128810243e-06, + "loss": 0.9786, + "step": 1320 + }, + { + "epoch": 0.26649266974943814, + "grad_norm": 0.39083603024482727, + "learning_rate": 8.603602626157624e-06, + "loss": 0.8769, + "step": 1321 + }, + { + "epoch": 0.26669440530564514, + "grad_norm": 0.40016594529151917, + "learning_rate": 8.601336584328659e-06, + "loss": 0.8001, + "step": 1322 + }, + { + "epoch": 0.26689614086185215, + "grad_norm": 0.42085206508636475, + "learning_rate": 8.599069004291224e-06, + "loss": 0.6967, + "step": 1323 + }, + { + "epoch": 0.2670978764180591, + "grad_norm": 0.5440914630889893, + "learning_rate": 8.596799887013852e-06, + "loss": 1.0482, + "step": 1324 + }, + { + "epoch": 0.2672996119742661, + "grad_norm": 0.900872528553009, + "learning_rate": 8.594529233465728e-06, + "loss": 0.7054, + "step": 1325 + }, + { + "epoch": 0.2675013475304731, + "grad_norm": 0.3810652494430542, + "learning_rate": 8.592257044616701e-06, + "loss": 0.716, + "step": 1326 + }, + { + "epoch": 0.2677030830866801, + "grad_norm": 0.5799583196640015, + "learning_rate": 8.589983321437271e-06, + "loss": 0.7983, + "step": 1327 + }, + { + "epoch": 0.26790481864288707, + "grad_norm": 0.5827012658119202, + "learning_rate": 8.587708064898595e-06, + "loss": 0.709, + "step": 1328 + }, + { + "epoch": 0.26810655419909407, + "grad_norm": 1.0978662967681885, + "learning_rate": 8.585431275972483e-06, + "loss": 0.6835, + "step": 1329 + }, + { + "epoch": 0.2683082897553011, + "grad_norm": 0.4511360824108124, + "learning_rate": 8.5831529556314e-06, + "loss": 0.7022, + "step": 1330 + }, + { + "epoch": 0.2685100253115081, + "grad_norm": 0.6490656137466431, + "learning_rate": 8.580873104848466e-06, + "loss": 0.7947, + "step": 1331 + }, + { + "epoch": 0.2687117608677151, + "grad_norm": 0.6834023594856262, + "learning_rate": 8.578591724597455e-06, + "loss": 0.7441, + "step": 1332 + }, + { + "epoch": 0.26891349642392204, + "grad_norm": 0.40996038913726807, + "learning_rate": 8.576308815852793e-06, + "loss": 0.669, + "step": 1333 + }, + { + "epoch": 0.26911523198012904, + "grad_norm": 2.8881947994232178, + "learning_rate": 8.57402437958956e-06, + "loss": 0.7299, + "step": 1334 + }, + { + "epoch": 0.26931696753633605, + "grad_norm": 2.5765886306762695, + "learning_rate": 8.57173841678349e-06, + "loss": 0.6747, + "step": 1335 + }, + { + "epoch": 0.26951870309254305, + "grad_norm": 0.8167450428009033, + "learning_rate": 8.569450928410963e-06, + "loss": 0.7206, + "step": 1336 + }, + { + "epoch": 0.26972043864875, + "grad_norm": 2.003079652786255, + "learning_rate": 8.567161915449018e-06, + "loss": 0.6806, + "step": 1337 + }, + { + "epoch": 0.269922174204957, + "grad_norm": 2.5011754035949707, + "learning_rate": 8.56487137887534e-06, + "loss": 0.6929, + "step": 1338 + }, + { + "epoch": 0.270123909761164, + "grad_norm": 0.7322660088539124, + "learning_rate": 8.562579319668265e-06, + "loss": 0.7239, + "step": 1339 + }, + { + "epoch": 0.270325645317371, + "grad_norm": 0.47642382979393005, + "learning_rate": 8.560285738806784e-06, + "loss": 0.6749, + "step": 1340 + }, + { + "epoch": 0.270527380873578, + "grad_norm": 0.5918588638305664, + "learning_rate": 8.557990637270533e-06, + "loss": 0.6878, + "step": 1341 + }, + { + "epoch": 0.270729116429785, + "grad_norm": 0.4040104150772095, + "learning_rate": 8.5556940160398e-06, + "loss": 0.7198, + "step": 1342 + }, + { + "epoch": 0.270930851985992, + "grad_norm": 0.516697883605957, + "learning_rate": 8.553395876095523e-06, + "loss": 0.769, + "step": 1343 + }, + { + "epoch": 0.271132587542199, + "grad_norm": 1.1435527801513672, + "learning_rate": 8.551096218419283e-06, + "loss": 0.6739, + "step": 1344 + }, + { + "epoch": 0.271334323098406, + "grad_norm": 0.4685196876525879, + "learning_rate": 8.548795043993316e-06, + "loss": 0.8076, + "step": 1345 + }, + { + "epoch": 0.27153605865461294, + "grad_norm": 0.4881542921066284, + "learning_rate": 8.546492353800504e-06, + "loss": 0.659, + "step": 1346 + }, + { + "epoch": 0.27173779421081995, + "grad_norm": 0.31347280740737915, + "learning_rate": 8.544188148824376e-06, + "loss": 1.4194, + "step": 1347 + }, + { + "epoch": 0.27193952976702696, + "grad_norm": 0.5140544772148132, + "learning_rate": 8.541882430049103e-06, + "loss": 0.6736, + "step": 1348 + }, + { + "epoch": 0.27214126532323396, + "grad_norm": 0.35953038930892944, + "learning_rate": 8.539575198459512e-06, + "loss": 0.6883, + "step": 1349 + }, + { + "epoch": 0.27234300087944097, + "grad_norm": 0.5027438402175903, + "learning_rate": 8.537266455041069e-06, + "loss": 0.8255, + "step": 1350 + }, + { + "epoch": 0.2725447364356479, + "grad_norm": 0.4743795096874237, + "learning_rate": 8.534956200779889e-06, + "loss": 0.7119, + "step": 1351 + }, + { + "epoch": 0.2727464719918549, + "grad_norm": 0.5884958505630493, + "learning_rate": 8.532644436662732e-06, + "loss": 0.8181, + "step": 1352 + }, + { + "epoch": 0.2729482075480619, + "grad_norm": 0.5815306901931763, + "learning_rate": 8.530331163677e-06, + "loss": 0.6907, + "step": 1353 + }, + { + "epoch": 0.27314994310426893, + "grad_norm": 0.45392102003097534, + "learning_rate": 8.528016382810744e-06, + "loss": 0.8513, + "step": 1354 + }, + { + "epoch": 0.2733516786604759, + "grad_norm": 0.4220741391181946, + "learning_rate": 8.525700095052655e-06, + "loss": 0.8672, + "step": 1355 + }, + { + "epoch": 0.2735534142166829, + "grad_norm": 0.867940366268158, + "learning_rate": 8.523382301392071e-06, + "loss": 0.7781, + "step": 1356 + }, + { + "epoch": 0.2737551497728899, + "grad_norm": 0.5932605862617493, + "learning_rate": 8.52106300281897e-06, + "loss": 0.8358, + "step": 1357 + }, + { + "epoch": 0.2739568853290969, + "grad_norm": 0.5759530663490295, + "learning_rate": 8.518742200323977e-06, + "loss": 0.835, + "step": 1358 + }, + { + "epoch": 0.2741586208853039, + "grad_norm": 0.37414512038230896, + "learning_rate": 8.516419894898356e-06, + "loss": 0.7588, + "step": 1359 + }, + { + "epoch": 0.27436035644151086, + "grad_norm": 0.8124709725379944, + "learning_rate": 8.51409608753401e-06, + "loss": 0.7914, + "step": 1360 + }, + { + "epoch": 0.27456209199771786, + "grad_norm": 0.7299355864524841, + "learning_rate": 8.511770779223491e-06, + "loss": 0.8218, + "step": 1361 + }, + { + "epoch": 0.27476382755392487, + "grad_norm": 0.8825898766517639, + "learning_rate": 8.50944397095999e-06, + "loss": 0.6635, + "step": 1362 + }, + { + "epoch": 0.27496556311013187, + "grad_norm": 0.5448117256164551, + "learning_rate": 8.507115663737331e-06, + "loss": 0.6729, + "step": 1363 + }, + { + "epoch": 0.2751672986663388, + "grad_norm": 0.8231639266014099, + "learning_rate": 8.504785858549989e-06, + "loss": 0.7734, + "step": 1364 + }, + { + "epoch": 0.2753690342225458, + "grad_norm": 1.6986454725265503, + "learning_rate": 8.502454556393071e-06, + "loss": 0.8105, + "step": 1365 + }, + { + "epoch": 0.27557076977875283, + "grad_norm": 1.8813633918762207, + "learning_rate": 8.50012175826233e-06, + "loss": 0.8676, + "step": 1366 + }, + { + "epoch": 0.27577250533495984, + "grad_norm": 0.4902789294719696, + "learning_rate": 8.49778746515415e-06, + "loss": 0.7333, + "step": 1367 + }, + { + "epoch": 0.27597424089116684, + "grad_norm": 0.3125844895839691, + "learning_rate": 8.495451678065563e-06, + "loss": 0.822, + "step": 1368 + }, + { + "epoch": 0.2761759764473738, + "grad_norm": 0.8880373239517212, + "learning_rate": 8.493114397994229e-06, + "loss": 0.7292, + "step": 1369 + }, + { + "epoch": 0.2763777120035808, + "grad_norm": 0.3109457492828369, + "learning_rate": 8.490775625938452e-06, + "loss": 0.703, + "step": 1370 + }, + { + "epoch": 0.2765794475597878, + "grad_norm": 0.5545622706413269, + "learning_rate": 8.488435362897176e-06, + "loss": 0.7946, + "step": 1371 + }, + { + "epoch": 0.2767811831159948, + "grad_norm": 1.6414847373962402, + "learning_rate": 8.486093609869972e-06, + "loss": 0.7223, + "step": 1372 + }, + { + "epoch": 0.27698291867220176, + "grad_norm": 0.4702077805995941, + "learning_rate": 8.483750367857056e-06, + "loss": 0.6668, + "step": 1373 + }, + { + "epoch": 0.27718465422840877, + "grad_norm": 0.7001679539680481, + "learning_rate": 8.481405637859277e-06, + "loss": 0.7024, + "step": 1374 + }, + { + "epoch": 0.27738638978461577, + "grad_norm": 0.6269954442977905, + "learning_rate": 8.479059420878121e-06, + "loss": 0.729, + "step": 1375 + }, + { + "epoch": 0.2775881253408228, + "grad_norm": 0.39072608947753906, + "learning_rate": 8.476711717915707e-06, + "loss": 0.8695, + "step": 1376 + }, + { + "epoch": 0.2777898608970298, + "grad_norm": 0.5733974575996399, + "learning_rate": 8.474362529974787e-06, + "loss": 0.6856, + "step": 1377 + }, + { + "epoch": 0.27799159645323673, + "grad_norm": 0.47215527296066284, + "learning_rate": 8.472011858058751e-06, + "loss": 0.8618, + "step": 1378 + }, + { + "epoch": 0.27819333200944374, + "grad_norm": 0.7275539040565491, + "learning_rate": 8.469659703171624e-06, + "loss": 0.849, + "step": 1379 + }, + { + "epoch": 0.27839506756565074, + "grad_norm": 0.6656959652900696, + "learning_rate": 8.467306066318063e-06, + "loss": 0.7511, + "step": 1380 + }, + { + "epoch": 0.27859680312185775, + "grad_norm": 0.7071893215179443, + "learning_rate": 8.46495094850335e-06, + "loss": 0.6808, + "step": 1381 + }, + { + "epoch": 0.2787985386780647, + "grad_norm": 1.0060806274414062, + "learning_rate": 8.462594350733414e-06, + "loss": 0.6832, + "step": 1382 + }, + { + "epoch": 0.2790002742342717, + "grad_norm": 0.5994428992271423, + "learning_rate": 8.460236274014805e-06, + "loss": 0.6658, + "step": 1383 + }, + { + "epoch": 0.2792020097904787, + "grad_norm": 0.5965285897254944, + "learning_rate": 8.457876719354708e-06, + "loss": 0.661, + "step": 1384 + }, + { + "epoch": 0.2794037453466857, + "grad_norm": 0.4691231846809387, + "learning_rate": 8.455515687760943e-06, + "loss": 0.8201, + "step": 1385 + }, + { + "epoch": 0.2796054809028927, + "grad_norm": 0.7659153938293457, + "learning_rate": 8.453153180241954e-06, + "loss": 0.7907, + "step": 1386 + }, + { + "epoch": 0.2798072164590997, + "grad_norm": 0.48467427492141724, + "learning_rate": 8.450789197806819e-06, + "loss": 0.8428, + "step": 1387 + }, + { + "epoch": 0.2800089520153067, + "grad_norm": 0.800972044467926, + "learning_rate": 8.448423741465249e-06, + "loss": 0.7075, + "step": 1388 + }, + { + "epoch": 0.2802106875715137, + "grad_norm": 0.5204843878746033, + "learning_rate": 8.446056812227579e-06, + "loss": 0.6352, + "step": 1389 + }, + { + "epoch": 0.2804124231277207, + "grad_norm": 0.414917916059494, + "learning_rate": 8.443688411104775e-06, + "loss": 0.669, + "step": 1390 + }, + { + "epoch": 0.28061415868392764, + "grad_norm": 0.316938579082489, + "learning_rate": 8.441318539108433e-06, + "loss": 0.7909, + "step": 1391 + }, + { + "epoch": 0.28081589424013464, + "grad_norm": 0.5244396924972534, + "learning_rate": 8.43894719725078e-06, + "loss": 0.7192, + "step": 1392 + }, + { + "epoch": 0.28101762979634165, + "grad_norm": 0.5841974020004272, + "learning_rate": 8.43657438654466e-06, + "loss": 0.7106, + "step": 1393 + }, + { + "epoch": 0.28121936535254866, + "grad_norm": 0.41134485602378845, + "learning_rate": 8.434200108003556e-06, + "loss": 0.6971, + "step": 1394 + }, + { + "epoch": 0.28142110090875566, + "grad_norm": 0.3490750789642334, + "learning_rate": 8.431824362641573e-06, + "loss": 0.6944, + "step": 1395 + }, + { + "epoch": 0.2816228364649626, + "grad_norm": 0.5032677054405212, + "learning_rate": 8.429447151473443e-06, + "loss": 0.8536, + "step": 1396 + }, + { + "epoch": 0.2818245720211696, + "grad_norm": 0.4385986924171448, + "learning_rate": 8.427068475514524e-06, + "loss": 0.7629, + "step": 1397 + }, + { + "epoch": 0.2820263075773766, + "grad_norm": 0.779082179069519, + "learning_rate": 8.424688335780799e-06, + "loss": 0.6729, + "step": 1398 + }, + { + "epoch": 0.28222804313358363, + "grad_norm": 0.8364329934120178, + "learning_rate": 8.42230673328888e-06, + "loss": 0.7799, + "step": 1399 + }, + { + "epoch": 0.2824297786897906, + "grad_norm": 0.4209824502468109, + "learning_rate": 8.419923669055995e-06, + "loss": 0.7129, + "step": 1400 + }, + { + "epoch": 0.2826315142459976, + "grad_norm": 0.4362705945968628, + "learning_rate": 8.417539144100008e-06, + "loss": 0.6584, + "step": 1401 + }, + { + "epoch": 0.2828332498022046, + "grad_norm": 0.4876041114330292, + "learning_rate": 8.415153159439397e-06, + "loss": 0.7438, + "step": 1402 + }, + { + "epoch": 0.2830349853584116, + "grad_norm": 0.513187825679779, + "learning_rate": 8.412765716093273e-06, + "loss": 0.6959, + "step": 1403 + }, + { + "epoch": 0.2832367209146186, + "grad_norm": 0.4529809355735779, + "learning_rate": 8.410376815081356e-06, + "loss": 0.7872, + "step": 1404 + }, + { + "epoch": 0.28343845647082555, + "grad_norm": 0.39496833086013794, + "learning_rate": 8.407986457424002e-06, + "loss": 0.7546, + "step": 1405 + }, + { + "epoch": 0.28364019202703256, + "grad_norm": 14.25558853149414, + "learning_rate": 8.405594644142186e-06, + "loss": 0.83, + "step": 1406 + }, + { + "epoch": 0.28384192758323956, + "grad_norm": 0.7426337003707886, + "learning_rate": 8.4032013762575e-06, + "loss": 0.6679, + "step": 1407 + }, + { + "epoch": 0.28404366313944657, + "grad_norm": 2.217550039291382, + "learning_rate": 8.400806654792161e-06, + "loss": 0.7844, + "step": 1408 + }, + { + "epoch": 0.2842453986956535, + "grad_norm": 2.4115359783172607, + "learning_rate": 8.398410480769007e-06, + "loss": 0.6876, + "step": 1409 + }, + { + "epoch": 0.2844471342518605, + "grad_norm": 2.4654650688171387, + "learning_rate": 8.396012855211494e-06, + "loss": 0.7054, + "step": 1410 + }, + { + "epoch": 0.28464886980806753, + "grad_norm": 0.7609624862670898, + "learning_rate": 8.393613779143703e-06, + "loss": 0.8621, + "step": 1411 + }, + { + "epoch": 0.28485060536427453, + "grad_norm": 0.45478445291519165, + "learning_rate": 8.391213253590325e-06, + "loss": 0.7032, + "step": 1412 + }, + { + "epoch": 0.28505234092048154, + "grad_norm": 0.6453161835670471, + "learning_rate": 8.388811279576682e-06, + "loss": 0.6764, + "step": 1413 + }, + { + "epoch": 0.2852540764766885, + "grad_norm": 0.4519941210746765, + "learning_rate": 8.386407858128707e-06, + "loss": 0.6926, + "step": 1414 + }, + { + "epoch": 0.2854558120328955, + "grad_norm": 1.1275166273117065, + "learning_rate": 8.384002990272951e-06, + "loss": 0.6854, + "step": 1415 + }, + { + "epoch": 0.2856575475891025, + "grad_norm": 0.391759991645813, + "learning_rate": 8.381596677036588e-06, + "loss": 0.6812, + "step": 1416 + }, + { + "epoch": 0.2858592831453095, + "grad_norm": 0.3690997064113617, + "learning_rate": 8.379188919447405e-06, + "loss": 0.6598, + "step": 1417 + }, + { + "epoch": 0.28606101870151646, + "grad_norm": 0.6712028384208679, + "learning_rate": 8.376779718533806e-06, + "loss": 0.7243, + "step": 1418 + }, + { + "epoch": 0.28626275425772346, + "grad_norm": 0.6204757690429688, + "learning_rate": 8.374369075324813e-06, + "loss": 0.7349, + "step": 1419 + }, + { + "epoch": 0.28646448981393047, + "grad_norm": 0.6927712559700012, + "learning_rate": 8.371956990850065e-06, + "loss": 0.6993, + "step": 1420 + }, + { + "epoch": 0.2866662253701375, + "grad_norm": 0.42022544145584106, + "learning_rate": 8.369543466139816e-06, + "loss": 0.7363, + "step": 1421 + }, + { + "epoch": 0.2868679609263445, + "grad_norm": 0.6742234826087952, + "learning_rate": 8.367128502224931e-06, + "loss": 0.7005, + "step": 1422 + }, + { + "epoch": 0.28706969648255143, + "grad_norm": 0.5271845459938049, + "learning_rate": 8.364712100136897e-06, + "loss": 0.7629, + "step": 1423 + }, + { + "epoch": 0.28727143203875843, + "grad_norm": 0.4005318284034729, + "learning_rate": 8.362294260907808e-06, + "loss": 0.641, + "step": 1424 + }, + { + "epoch": 0.28747316759496544, + "grad_norm": 1.1153535842895508, + "learning_rate": 8.359874985570378e-06, + "loss": 0.8169, + "step": 1425 + }, + { + "epoch": 0.28767490315117245, + "grad_norm": 1.0508934259414673, + "learning_rate": 8.35745427515793e-06, + "loss": 0.7177, + "step": 1426 + }, + { + "epoch": 0.28787663870737945, + "grad_norm": 0.40963393449783325, + "learning_rate": 8.355032130704402e-06, + "loss": 0.6816, + "step": 1427 + }, + { + "epoch": 0.2880783742635864, + "grad_norm": 0.8009536266326904, + "learning_rate": 8.352608553244344e-06, + "loss": 0.72, + "step": 1428 + }, + { + "epoch": 0.2882801098197934, + "grad_norm": 0.8653877973556519, + "learning_rate": 8.350183543812918e-06, + "loss": 0.6564, + "step": 1429 + }, + { + "epoch": 0.2884818453760004, + "grad_norm": 0.4677664339542389, + "learning_rate": 8.3477571034459e-06, + "loss": 0.712, + "step": 1430 + }, + { + "epoch": 0.2886835809322074, + "grad_norm": 1.347430944442749, + "learning_rate": 8.34532923317967e-06, + "loss": 0.6571, + "step": 1431 + }, + { + "epoch": 0.28888531648841437, + "grad_norm": 0.5276641845703125, + "learning_rate": 8.342899934051229e-06, + "loss": 0.7271, + "step": 1432 + }, + { + "epoch": 0.2890870520446214, + "grad_norm": 1.0363144874572754, + "learning_rate": 8.34046920709818e-06, + "loss": 0.9796, + "step": 1433 + }, + { + "epoch": 0.2892887876008284, + "grad_norm": 0.7945374846458435, + "learning_rate": 8.338037053358739e-06, + "loss": 0.6992, + "step": 1434 + }, + { + "epoch": 0.2894905231570354, + "grad_norm": 0.7973695397377014, + "learning_rate": 8.335603473871734e-06, + "loss": 0.8969, + "step": 1435 + }, + { + "epoch": 0.2896922587132424, + "grad_norm": 0.40596818923950195, + "learning_rate": 8.333168469676595e-06, + "loss": 0.704, + "step": 1436 + }, + { + "epoch": 0.28989399426944934, + "grad_norm": 0.43208691477775574, + "learning_rate": 8.330732041813367e-06, + "loss": 0.7658, + "step": 1437 + }, + { + "epoch": 0.29009572982565635, + "grad_norm": 0.9563391804695129, + "learning_rate": 8.328294191322703e-06, + "loss": 0.7315, + "step": 1438 + }, + { + "epoch": 0.29029746538186335, + "grad_norm": 0.36794593930244446, + "learning_rate": 8.325854919245859e-06, + "loss": 0.8485, + "step": 1439 + }, + { + "epoch": 0.29049920093807036, + "grad_norm": 0.3880400061607361, + "learning_rate": 8.323414226624699e-06, + "loss": 0.6999, + "step": 1440 + }, + { + "epoch": 0.2907009364942773, + "grad_norm": 0.68841952085495, + "learning_rate": 8.320972114501698e-06, + "loss": 0.9734, + "step": 1441 + }, + { + "epoch": 0.2909026720504843, + "grad_norm": 0.5601948499679565, + "learning_rate": 8.318528583919933e-06, + "loss": 0.8702, + "step": 1442 + }, + { + "epoch": 0.2911044076066913, + "grad_norm": 0.442568302154541, + "learning_rate": 8.31608363592309e-06, + "loss": 0.6594, + "step": 1443 + }, + { + "epoch": 0.2913061431628983, + "grad_norm": 0.7428321838378906, + "learning_rate": 8.313637271555462e-06, + "loss": 0.6561, + "step": 1444 + }, + { + "epoch": 0.29150787871910533, + "grad_norm": 2.2169814109802246, + "learning_rate": 8.311189491861938e-06, + "loss": 0.669, + "step": 1445 + }, + { + "epoch": 0.2917096142753123, + "grad_norm": 2.1682381629943848, + "learning_rate": 8.30874029788802e-06, + "loss": 0.791, + "step": 1446 + }, + { + "epoch": 0.2919113498315193, + "grad_norm": 0.6268622875213623, + "learning_rate": 8.306289690679812e-06, + "loss": 0.7264, + "step": 1447 + }, + { + "epoch": 0.2921130853877263, + "grad_norm": 0.757176399230957, + "learning_rate": 8.30383767128402e-06, + "loss": 0.7165, + "step": 1448 + }, + { + "epoch": 0.2923148209439333, + "grad_norm": 0.4244846999645233, + "learning_rate": 8.301384240747957e-06, + "loss": 0.7516, + "step": 1449 + }, + { + "epoch": 0.29251655650014025, + "grad_norm": 1.070397138595581, + "learning_rate": 8.298929400119533e-06, + "loss": 0.6561, + "step": 1450 + }, + { + "epoch": 0.29271829205634725, + "grad_norm": 0.5053624510765076, + "learning_rate": 8.296473150447263e-06, + "loss": 0.6578, + "step": 1451 + }, + { + "epoch": 0.29292002761255426, + "grad_norm": 0.4902132451534271, + "learning_rate": 8.294015492780267e-06, + "loss": 0.7171, + "step": 1452 + }, + { + "epoch": 0.29312176316876126, + "grad_norm": 0.3744256794452667, + "learning_rate": 8.291556428168263e-06, + "loss": 0.6644, + "step": 1453 + }, + { + "epoch": 0.29332349872496827, + "grad_norm": 0.6036210060119629, + "learning_rate": 8.289095957661569e-06, + "loss": 0.8321, + "step": 1454 + }, + { + "epoch": 0.2935252342811752, + "grad_norm": 0.6904403567314148, + "learning_rate": 8.286634082311107e-06, + "loss": 0.6942, + "step": 1455 + }, + { + "epoch": 0.2937269698373822, + "grad_norm": 0.5230683088302612, + "learning_rate": 8.284170803168393e-06, + "loss": 0.6435, + "step": 1456 + }, + { + "epoch": 0.29392870539358923, + "grad_norm": 0.3730091154575348, + "learning_rate": 8.28170612128555e-06, + "loss": 0.6557, + "step": 1457 + }, + { + "epoch": 0.29413044094979623, + "grad_norm": 0.44985833764076233, + "learning_rate": 8.279240037715297e-06, + "loss": 0.7341, + "step": 1458 + }, + { + "epoch": 0.2943321765060032, + "grad_norm": 0.4488007724285126, + "learning_rate": 8.27677255351095e-06, + "loss": 0.7637, + "step": 1459 + }, + { + "epoch": 0.2945339120622102, + "grad_norm": 4.375908374786377, + "learning_rate": 8.274303669726427e-06, + "loss": 0.8729, + "step": 1460 + }, + { + "epoch": 0.2947356476184172, + "grad_norm": 0.7124999165534973, + "learning_rate": 8.271833387416237e-06, + "loss": 0.6708, + "step": 1461 + }, + { + "epoch": 0.2949373831746242, + "grad_norm": 0.5048505663871765, + "learning_rate": 8.269361707635494e-06, + "loss": 0.6511, + "step": 1462 + }, + { + "epoch": 0.2951391187308312, + "grad_norm": 0.9429525136947632, + "learning_rate": 8.266888631439907e-06, + "loss": 0.8462, + "step": 1463 + }, + { + "epoch": 0.29534085428703816, + "grad_norm": 0.49526622891426086, + "learning_rate": 8.264414159885776e-06, + "loss": 0.7101, + "step": 1464 + }, + { + "epoch": 0.29554258984324516, + "grad_norm": 0.623329758644104, + "learning_rate": 8.261938294030003e-06, + "loss": 0.6689, + "step": 1465 + }, + { + "epoch": 0.29574432539945217, + "grad_norm": 1.0023547410964966, + "learning_rate": 8.259461034930088e-06, + "loss": 0.7473, + "step": 1466 + }, + { + "epoch": 0.2959460609556592, + "grad_norm": 0.555940568447113, + "learning_rate": 8.256982383644114e-06, + "loss": 0.6968, + "step": 1467 + }, + { + "epoch": 0.2961477965118661, + "grad_norm": 2.2776830196380615, + "learning_rate": 8.254502341230771e-06, + "loss": 0.76, + "step": 1468 + }, + { + "epoch": 0.29634953206807313, + "grad_norm": 2.1908066272735596, + "learning_rate": 8.252020908749338e-06, + "loss": 0.7083, + "step": 1469 + }, + { + "epoch": 0.29655126762428013, + "grad_norm": 2.6084344387054443, + "learning_rate": 8.24953808725969e-06, + "loss": 0.6853, + "step": 1470 + }, + { + "epoch": 0.29675300318048714, + "grad_norm": 2.2120113372802734, + "learning_rate": 8.24705387782229e-06, + "loss": 0.6915, + "step": 1471 + }, + { + "epoch": 0.29695473873669415, + "grad_norm": 1.1153708696365356, + "learning_rate": 8.244568281498198e-06, + "loss": 0.6737, + "step": 1472 + }, + { + "epoch": 0.2971564742929011, + "grad_norm": 1.088796615600586, + "learning_rate": 8.24208129934907e-06, + "loss": 0.6738, + "step": 1473 + }, + { + "epoch": 0.2973582098491081, + "grad_norm": 0.6631084680557251, + "learning_rate": 8.239592932437144e-06, + "loss": 0.6507, + "step": 1474 + }, + { + "epoch": 0.2975599454053151, + "grad_norm": 0.6017811298370361, + "learning_rate": 8.237103181825257e-06, + "loss": 0.9657, + "step": 1475 + }, + { + "epoch": 0.2977616809615221, + "grad_norm": 0.7511520385742188, + "learning_rate": 8.234612048576838e-06, + "loss": 0.629, + "step": 1476 + }, + { + "epoch": 0.29796341651772906, + "grad_norm": 0.593137800693512, + "learning_rate": 8.2321195337559e-06, + "loss": 0.6513, + "step": 1477 + }, + { + "epoch": 0.29816515207393607, + "grad_norm": 0.5652965307235718, + "learning_rate": 8.229625638427052e-06, + "loss": 0.6914, + "step": 1478 + }, + { + "epoch": 0.2983668876301431, + "grad_norm": 0.4325360357761383, + "learning_rate": 8.22713036365549e-06, + "loss": 0.6658, + "step": 1479 + }, + { + "epoch": 0.2985686231863501, + "grad_norm": 0.382667601108551, + "learning_rate": 8.224633710506997e-06, + "loss": 0.8343, + "step": 1480 + }, + { + "epoch": 0.2987703587425571, + "grad_norm": 2.504685640335083, + "learning_rate": 8.222135680047952e-06, + "loss": 0.7681, + "step": 1481 + }, + { + "epoch": 0.29897209429876404, + "grad_norm": 0.8869113922119141, + "learning_rate": 8.219636273345315e-06, + "loss": 0.6972, + "step": 1482 + }, + { + "epoch": 0.29917382985497104, + "grad_norm": 0.6380051374435425, + "learning_rate": 8.217135491466636e-06, + "loss": 0.6778, + "step": 1483 + }, + { + "epoch": 0.29937556541117805, + "grad_norm": 0.4614868462085724, + "learning_rate": 8.214633335480055e-06, + "loss": 0.7061, + "step": 1484 + }, + { + "epoch": 0.29957730096738505, + "grad_norm": 0.5545341968536377, + "learning_rate": 8.212129806454294e-06, + "loss": 0.7063, + "step": 1485 + }, + { + "epoch": 0.299779036523592, + "grad_norm": 0.4980104863643646, + "learning_rate": 8.209624905458667e-06, + "loss": 0.65, + "step": 1486 + }, + { + "epoch": 0.299980772079799, + "grad_norm": 0.619178831577301, + "learning_rate": 8.20711863356307e-06, + "loss": 0.7838, + "step": 1487 + }, + { + "epoch": 0.300182507636006, + "grad_norm": 0.4087018072605133, + "learning_rate": 8.204610991837983e-06, + "loss": 0.7019, + "step": 1488 + }, + { + "epoch": 0.300384243192213, + "grad_norm": 0.39627236127853394, + "learning_rate": 8.202101981354478e-06, + "loss": 0.6936, + "step": 1489 + }, + { + "epoch": 0.30058597874842, + "grad_norm": 1.5058199167251587, + "learning_rate": 8.199591603184205e-06, + "loss": 0.8135, + "step": 1490 + }, + { + "epoch": 0.300787714304627, + "grad_norm": 0.5494144558906555, + "learning_rate": 8.197079858399403e-06, + "loss": 0.8652, + "step": 1491 + }, + { + "epoch": 0.300989449860834, + "grad_norm": 0.7970489263534546, + "learning_rate": 8.19456674807289e-06, + "loss": 0.7358, + "step": 1492 + }, + { + "epoch": 0.301191185417041, + "grad_norm": 0.7870729565620422, + "learning_rate": 8.19205227327807e-06, + "loss": 0.773, + "step": 1493 + }, + { + "epoch": 0.301392920973248, + "grad_norm": 1.1276938915252686, + "learning_rate": 8.189536435088931e-06, + "loss": 0.6871, + "step": 1494 + }, + { + "epoch": 0.30159465652945494, + "grad_norm": 4.686948776245117, + "learning_rate": 8.18701923458004e-06, + "loss": 0.6465, + "step": 1495 + }, + { + "epoch": 0.30179639208566195, + "grad_norm": 2.210895538330078, + "learning_rate": 8.184500672826547e-06, + "loss": 0.6671, + "step": 1496 + }, + { + "epoch": 0.30199812764186895, + "grad_norm": 3.591681957244873, + "learning_rate": 8.181980750904185e-06, + "loss": 0.6889, + "step": 1497 + }, + { + "epoch": 0.30219986319807596, + "grad_norm": 2.6738879680633545, + "learning_rate": 8.179459469889269e-06, + "loss": 0.6789, + "step": 1498 + }, + { + "epoch": 0.30240159875428296, + "grad_norm": 1.4650682210922241, + "learning_rate": 8.176936830858689e-06, + "loss": 0.7287, + "step": 1499 + }, + { + "epoch": 0.3026033343104899, + "grad_norm": 2.2726752758026123, + "learning_rate": 8.17441283488992e-06, + "loss": 0.6666, + "step": 1500 + }, + { + "epoch": 0.3028050698666969, + "grad_norm": 0.5218955278396606, + "learning_rate": 8.171887483061014e-06, + "loss": 0.7004, + "step": 1501 + }, + { + "epoch": 0.3030068054229039, + "grad_norm": 0.559872031211853, + "learning_rate": 8.169360776450606e-06, + "loss": 0.6717, + "step": 1502 + }, + { + "epoch": 0.30320854097911093, + "grad_norm": 0.760443389415741, + "learning_rate": 8.166832716137905e-06, + "loss": 0.7983, + "step": 1503 + }, + { + "epoch": 0.3034102765353179, + "grad_norm": 0.8818268775939941, + "learning_rate": 8.164303303202698e-06, + "loss": 0.7724, + "step": 1504 + }, + { + "epoch": 0.3036120120915249, + "grad_norm": 0.5333113670349121, + "learning_rate": 8.161772538725357e-06, + "loss": 0.6922, + "step": 1505 + }, + { + "epoch": 0.3038137476477319, + "grad_norm": 0.5082593560218811, + "learning_rate": 8.15924042378682e-06, + "loss": 0.6744, + "step": 1506 + }, + { + "epoch": 0.3040154832039389, + "grad_norm": 0.5612021684646606, + "learning_rate": 8.156706959468611e-06, + "loss": 0.763, + "step": 1507 + }, + { + "epoch": 0.3042172187601459, + "grad_norm": 0.4409393072128296, + "learning_rate": 8.15417214685283e-06, + "loss": 0.7668, + "step": 1508 + }, + { + "epoch": 0.30441895431635285, + "grad_norm": 0.7158373594284058, + "learning_rate": 8.151635987022146e-06, + "loss": 0.788, + "step": 1509 + }, + { + "epoch": 0.30462068987255986, + "grad_norm": 0.7124815583229065, + "learning_rate": 8.149098481059807e-06, + "loss": 0.7019, + "step": 1510 + }, + { + "epoch": 0.30482242542876686, + "grad_norm": 0.5579118132591248, + "learning_rate": 8.146559630049639e-06, + "loss": 0.7112, + "step": 1511 + }, + { + "epoch": 0.30502416098497387, + "grad_norm": 0.4145357310771942, + "learning_rate": 8.14401943507604e-06, + "loss": 0.7051, + "step": 1512 + }, + { + "epoch": 0.3052258965411808, + "grad_norm": 0.43273279070854187, + "learning_rate": 8.14147789722398e-06, + "loss": 0.8772, + "step": 1513 + }, + { + "epoch": 0.3054276320973878, + "grad_norm": 0.4729679226875305, + "learning_rate": 8.138935017579007e-06, + "loss": 0.6842, + "step": 1514 + }, + { + "epoch": 0.30562936765359483, + "grad_norm": 3.5257089138031006, + "learning_rate": 8.136390797227235e-06, + "loss": 0.7959, + "step": 1515 + }, + { + "epoch": 0.30583110320980184, + "grad_norm": 0.608936071395874, + "learning_rate": 8.133845237255361e-06, + "loss": 0.679, + "step": 1516 + }, + { + "epoch": 0.30603283876600884, + "grad_norm": 0.5570759773254395, + "learning_rate": 8.131298338750648e-06, + "loss": 0.6549, + "step": 1517 + }, + { + "epoch": 0.3062345743222158, + "grad_norm": 0.5858903527259827, + "learning_rate": 8.128750102800929e-06, + "loss": 0.8579, + "step": 1518 + }, + { + "epoch": 0.3064363098784228, + "grad_norm": 0.6043578386306763, + "learning_rate": 8.12620053049461e-06, + "loss": 0.8372, + "step": 1519 + }, + { + "epoch": 0.3066380454346298, + "grad_norm": 0.4177265465259552, + "learning_rate": 8.12364962292067e-06, + "loss": 0.683, + "step": 1520 + }, + { + "epoch": 0.3068397809908368, + "grad_norm": 0.5823288559913635, + "learning_rate": 8.121097381168654e-06, + "loss": 0.6338, + "step": 1521 + }, + { + "epoch": 0.3070415165470438, + "grad_norm": 1.1234952211380005, + "learning_rate": 8.118543806328682e-06, + "loss": 0.6671, + "step": 1522 + }, + { + "epoch": 0.30724325210325076, + "grad_norm": 0.36531540751457214, + "learning_rate": 8.11598889949144e-06, + "loss": 0.723, + "step": 1523 + }, + { + "epoch": 0.30744498765945777, + "grad_norm": 0.8904269337654114, + "learning_rate": 8.113432661748187e-06, + "loss": 0.6854, + "step": 1524 + }, + { + "epoch": 0.3076467232156648, + "grad_norm": 0.38234153389930725, + "learning_rate": 8.110875094190742e-06, + "loss": 0.7636, + "step": 1525 + }, + { + "epoch": 0.3078484587718718, + "grad_norm": 1.4962918758392334, + "learning_rate": 8.108316197911498e-06, + "loss": 0.8294, + "step": 1526 + }, + { + "epoch": 0.30805019432807873, + "grad_norm": 4.313572883605957, + "learning_rate": 8.105755974003418e-06, + "loss": 0.6851, + "step": 1527 + }, + { + "epoch": 0.30825192988428574, + "grad_norm": 0.4833182394504547, + "learning_rate": 8.103194423560026e-06, + "loss": 0.7248, + "step": 1528 + }, + { + "epoch": 0.30845366544049274, + "grad_norm": 0.40504032373428345, + "learning_rate": 8.100631547675417e-06, + "loss": 0.6765, + "step": 1529 + }, + { + "epoch": 0.30865540099669975, + "grad_norm": 0.66014564037323, + "learning_rate": 8.09806734744425e-06, + "loss": 0.6544, + "step": 1530 + }, + { + "epoch": 0.30885713655290675, + "grad_norm": 0.6813423037528992, + "learning_rate": 8.095501823961752e-06, + "loss": 0.679, + "step": 1531 + }, + { + "epoch": 0.3090588721091137, + "grad_norm": 0.6496931314468384, + "learning_rate": 8.092934978323708e-06, + "loss": 0.8407, + "step": 1532 + }, + { + "epoch": 0.3092606076653207, + "grad_norm": 0.7875673174858093, + "learning_rate": 8.090366811626477e-06, + "loss": 0.8551, + "step": 1533 + }, + { + "epoch": 0.3094623432215277, + "grad_norm": 0.4039202034473419, + "learning_rate": 8.087797324966981e-06, + "loss": 0.8194, + "step": 1534 + }, + { + "epoch": 0.3096640787777347, + "grad_norm": 0.4680841565132141, + "learning_rate": 8.085226519442697e-06, + "loss": 0.6882, + "step": 1535 + }, + { + "epoch": 0.30986581433394167, + "grad_norm": 0.37298035621643066, + "learning_rate": 8.082654396151676e-06, + "loss": 0.6403, + "step": 1536 + }, + { + "epoch": 0.3100675498901487, + "grad_norm": 0.43404385447502136, + "learning_rate": 8.080080956192525e-06, + "loss": 0.7122, + "step": 1537 + }, + { + "epoch": 0.3102692854463557, + "grad_norm": 0.4201224148273468, + "learning_rate": 8.077506200664416e-06, + "loss": 0.8081, + "step": 1538 + }, + { + "epoch": 0.3104710210025627, + "grad_norm": 0.8215480446815491, + "learning_rate": 8.074930130667085e-06, + "loss": 0.6545, + "step": 1539 + }, + { + "epoch": 0.3106727565587697, + "grad_norm": 0.8744224309921265, + "learning_rate": 8.072352747300823e-06, + "loss": 0.6507, + "step": 1540 + }, + { + "epoch": 0.31087449211497664, + "grad_norm": 0.9224565625190735, + "learning_rate": 8.06977405166649e-06, + "loss": 0.6646, + "step": 1541 + }, + { + "epoch": 0.31107622767118365, + "grad_norm": 0.9008786678314209, + "learning_rate": 8.067194044865499e-06, + "loss": 0.6832, + "step": 1542 + }, + { + "epoch": 0.31127796322739065, + "grad_norm": 0.5391042828559875, + "learning_rate": 8.064612727999827e-06, + "loss": 0.683, + "step": 1543 + }, + { + "epoch": 0.31147969878359766, + "grad_norm": 0.7307866811752319, + "learning_rate": 8.062030102172013e-06, + "loss": 0.6636, + "step": 1544 + }, + { + "epoch": 0.3116814343398046, + "grad_norm": 1.0967941284179688, + "learning_rate": 8.05944616848515e-06, + "loss": 0.7132, + "step": 1545 + }, + { + "epoch": 0.3118831698960116, + "grad_norm": 0.661615788936615, + "learning_rate": 8.056860928042892e-06, + "loss": 0.8316, + "step": 1546 + }, + { + "epoch": 0.3120849054522186, + "grad_norm": 0.5277570486068726, + "learning_rate": 8.054274381949449e-06, + "loss": 0.7624, + "step": 1547 + }, + { + "epoch": 0.3122866410084256, + "grad_norm": 0.4380006790161133, + "learning_rate": 8.051686531309595e-06, + "loss": 0.7945, + "step": 1548 + }, + { + "epoch": 0.31248837656463263, + "grad_norm": 0.48799121379852295, + "learning_rate": 8.049097377228653e-06, + "loss": 0.7093, + "step": 1549 + }, + { + "epoch": 0.3126901121208396, + "grad_norm": 0.4029648005962372, + "learning_rate": 8.046506920812508e-06, + "loss": 0.9447, + "step": 1550 + }, + { + "epoch": 0.3128918476770466, + "grad_norm": 0.8204885125160217, + "learning_rate": 8.0439151631676e-06, + "loss": 0.8118, + "step": 1551 + }, + { + "epoch": 0.3130935832332536, + "grad_norm": 2.713876962661743, + "learning_rate": 8.041322105400923e-06, + "loss": 0.7025, + "step": 1552 + }, + { + "epoch": 0.3132953187894606, + "grad_norm": 0.3328094482421875, + "learning_rate": 8.03872774862003e-06, + "loss": 0.7096, + "step": 1553 + }, + { + "epoch": 0.31349705434566755, + "grad_norm": 0.75003582239151, + "learning_rate": 8.036132093933025e-06, + "loss": 0.7072, + "step": 1554 + }, + { + "epoch": 0.31369878990187455, + "grad_norm": 1.3351184129714966, + "learning_rate": 8.03353514244857e-06, + "loss": 0.7103, + "step": 1555 + }, + { + "epoch": 0.31390052545808156, + "grad_norm": 0.38253384828567505, + "learning_rate": 8.030936895275875e-06, + "loss": 0.6638, + "step": 1556 + }, + { + "epoch": 0.31410226101428856, + "grad_norm": 0.43413951992988586, + "learning_rate": 8.028337353524712e-06, + "loss": 0.8378, + "step": 1557 + }, + { + "epoch": 0.31430399657049557, + "grad_norm": 0.5159037709236145, + "learning_rate": 8.025736518305398e-06, + "loss": 0.7017, + "step": 1558 + }, + { + "epoch": 0.3145057321267025, + "grad_norm": 0.9960720539093018, + "learning_rate": 8.023134390728808e-06, + "loss": 0.6939, + "step": 1559 + }, + { + "epoch": 0.3147074676829095, + "grad_norm": 0.8572008609771729, + "learning_rate": 8.020530971906365e-06, + "loss": 0.6778, + "step": 1560 + }, + { + "epoch": 0.31490920323911653, + "grad_norm": 0.5154844522476196, + "learning_rate": 8.017926262950048e-06, + "loss": 0.7486, + "step": 1561 + }, + { + "epoch": 0.31511093879532354, + "grad_norm": 0.722102701663971, + "learning_rate": 8.015320264972381e-06, + "loss": 0.691, + "step": 1562 + }, + { + "epoch": 0.3153126743515305, + "grad_norm": 0.7588250041007996, + "learning_rate": 8.012712979086444e-06, + "loss": 0.6785, + "step": 1563 + }, + { + "epoch": 0.3155144099077375, + "grad_norm": 1.2562438249588013, + "learning_rate": 8.010104406405865e-06, + "loss": 0.6535, + "step": 1564 + }, + { + "epoch": 0.3157161454639445, + "grad_norm": 0.39819613099098206, + "learning_rate": 8.00749454804482e-06, + "loss": 0.6738, + "step": 1565 + }, + { + "epoch": 0.3159178810201515, + "grad_norm": 1.0211012363433838, + "learning_rate": 8.004883405118036e-06, + "loss": 0.6788, + "step": 1566 + }, + { + "epoch": 0.3161196165763585, + "grad_norm": 0.6734877824783325, + "learning_rate": 8.00227097874079e-06, + "loss": 0.7398, + "step": 1567 + }, + { + "epoch": 0.31632135213256546, + "grad_norm": 0.4316209554672241, + "learning_rate": 7.999657270028904e-06, + "loss": 0.7092, + "step": 1568 + }, + { + "epoch": 0.31652308768877246, + "grad_norm": 0.39186379313468933, + "learning_rate": 7.997042280098752e-06, + "loss": 0.902, + "step": 1569 + }, + { + "epoch": 0.31672482324497947, + "grad_norm": 0.9745991826057434, + "learning_rate": 7.99442601006725e-06, + "loss": 0.6493, + "step": 1570 + }, + { + "epoch": 0.3169265588011865, + "grad_norm": 0.48621875047683716, + "learning_rate": 7.991808461051862e-06, + "loss": 0.6901, + "step": 1571 + }, + { + "epoch": 0.3171282943573934, + "grad_norm": 0.6053562164306641, + "learning_rate": 7.989189634170603e-06, + "loss": 0.8829, + "step": 1572 + }, + { + "epoch": 0.31733002991360043, + "grad_norm": 1.5087822675704956, + "learning_rate": 7.986569530542028e-06, + "loss": 0.6407, + "step": 1573 + }, + { + "epoch": 0.31753176546980744, + "grad_norm": 0.5080868601799011, + "learning_rate": 7.983948151285242e-06, + "loss": 0.6927, + "step": 1574 + }, + { + "epoch": 0.31773350102601444, + "grad_norm": 0.3664741516113281, + "learning_rate": 7.981325497519892e-06, + "loss": 0.6811, + "step": 1575 + }, + { + "epoch": 0.31793523658222145, + "grad_norm": 0.9207780957221985, + "learning_rate": 7.978701570366167e-06, + "loss": 0.7989, + "step": 1576 + }, + { + "epoch": 0.3181369721384284, + "grad_norm": 0.4849698543548584, + "learning_rate": 7.976076370944805e-06, + "loss": 0.6831, + "step": 1577 + }, + { + "epoch": 0.3183387076946354, + "grad_norm": 0.33219724893569946, + "learning_rate": 7.973449900377086e-06, + "loss": 0.7534, + "step": 1578 + }, + { + "epoch": 0.3185404432508424, + "grad_norm": 0.77428138256073, + "learning_rate": 7.970822159784832e-06, + "loss": 0.7303, + "step": 1579 + }, + { + "epoch": 0.3187421788070494, + "grad_norm": 0.3515469431877136, + "learning_rate": 7.968193150290408e-06, + "loss": 0.6644, + "step": 1580 + }, + { + "epoch": 0.31894391436325636, + "grad_norm": 0.4491312801837921, + "learning_rate": 7.96556287301672e-06, + "loss": 0.6831, + "step": 1581 + }, + { + "epoch": 0.31914564991946337, + "grad_norm": 0.6044942140579224, + "learning_rate": 7.962931329087214e-06, + "loss": 0.6645, + "step": 1582 + }, + { + "epoch": 0.3193473854756704, + "grad_norm": 0.6661559343338013, + "learning_rate": 7.96029851962588e-06, + "loss": 0.6956, + "step": 1583 + }, + { + "epoch": 0.3195491210318774, + "grad_norm": 0.54176926612854, + "learning_rate": 7.95766444575725e-06, + "loss": 0.7459, + "step": 1584 + }, + { + "epoch": 0.3197508565880844, + "grad_norm": 0.3712012767791748, + "learning_rate": 7.955029108606392e-06, + "loss": 0.968, + "step": 1585 + }, + { + "epoch": 0.31995259214429134, + "grad_norm": 0.4860132932662964, + "learning_rate": 7.952392509298916e-06, + "loss": 0.9221, + "step": 1586 + }, + { + "epoch": 0.32015432770049834, + "grad_norm": 0.41063064336776733, + "learning_rate": 7.94975464896097e-06, + "loss": 0.6869, + "step": 1587 + }, + { + "epoch": 0.32035606325670535, + "grad_norm": 0.46989545226097107, + "learning_rate": 7.947115528719241e-06, + "loss": 0.7325, + "step": 1588 + }, + { + "epoch": 0.32055779881291235, + "grad_norm": 0.613256573677063, + "learning_rate": 7.944475149700954e-06, + "loss": 0.6995, + "step": 1589 + }, + { + "epoch": 0.3207595343691193, + "grad_norm": 0.7676635384559631, + "learning_rate": 7.941833513033873e-06, + "loss": 0.662, + "step": 1590 + }, + { + "epoch": 0.3209612699253263, + "grad_norm": 0.3710860311985016, + "learning_rate": 7.939190619846296e-06, + "loss": 0.6619, + "step": 1591 + }, + { + "epoch": 0.3211630054815333, + "grad_norm": 0.41502392292022705, + "learning_rate": 7.93654647126706e-06, + "loss": 0.6537, + "step": 1592 + }, + { + "epoch": 0.3213647410377403, + "grad_norm": 0.5116865038871765, + "learning_rate": 7.933901068425539e-06, + "loss": 0.6757, + "step": 1593 + }, + { + "epoch": 0.3215664765939473, + "grad_norm": 0.761396586894989, + "learning_rate": 7.93125441245164e-06, + "loss": 0.6889, + "step": 1594 + }, + { + "epoch": 0.3217682121501543, + "grad_norm": 0.8247814178466797, + "learning_rate": 7.928606504475809e-06, + "loss": 0.7393, + "step": 1595 + }, + { + "epoch": 0.3219699477063613, + "grad_norm": 0.5235460996627808, + "learning_rate": 7.925957345629023e-06, + "loss": 0.7118, + "step": 1596 + }, + { + "epoch": 0.3221716832625683, + "grad_norm": 0.8428860902786255, + "learning_rate": 7.923306937042796e-06, + "loss": 0.6925, + "step": 1597 + }, + { + "epoch": 0.3223734188187753, + "grad_norm": 0.7909860014915466, + "learning_rate": 7.920655279849173e-06, + "loss": 0.6794, + "step": 1598 + }, + { + "epoch": 0.32257515437498224, + "grad_norm": 0.5924726724624634, + "learning_rate": 7.918002375180733e-06, + "loss": 0.6811, + "step": 1599 + }, + { + "epoch": 0.32277688993118925, + "grad_norm": 0.8714628219604492, + "learning_rate": 7.915348224170593e-06, + "loss": 0.7976, + "step": 1600 + }, + { + "epoch": 0.32297862548739625, + "grad_norm": 0.34327223896980286, + "learning_rate": 7.912692827952395e-06, + "loss": 0.7037, + "step": 1601 + }, + { + "epoch": 0.32318036104360326, + "grad_norm": 0.3930150866508484, + "learning_rate": 7.910036187660316e-06, + "loss": 0.647, + "step": 1602 + }, + { + "epoch": 0.32338209659981026, + "grad_norm": 0.39237427711486816, + "learning_rate": 7.907378304429065e-06, + "loss": 0.66, + "step": 1603 + }, + { + "epoch": 0.3235838321560172, + "grad_norm": 0.7589460015296936, + "learning_rate": 7.904719179393881e-06, + "loss": 0.6372, + "step": 1604 + }, + { + "epoch": 0.3237855677122242, + "grad_norm": 1.4176976680755615, + "learning_rate": 7.902058813690532e-06, + "loss": 0.6219, + "step": 1605 + }, + { + "epoch": 0.3239873032684312, + "grad_norm": 0.6004815101623535, + "learning_rate": 7.899397208455323e-06, + "loss": 0.677, + "step": 1606 + }, + { + "epoch": 0.32418903882463823, + "grad_norm": 1.2083632946014404, + "learning_rate": 7.896734364825076e-06, + "loss": 0.7007, + "step": 1607 + }, + { + "epoch": 0.3243907743808452, + "grad_norm": 0.7155832648277283, + "learning_rate": 7.894070283937152e-06, + "loss": 0.6921, + "step": 1608 + }, + { + "epoch": 0.3245925099370522, + "grad_norm": 0.3577544391155243, + "learning_rate": 7.891404966929439e-06, + "loss": 0.6632, + "step": 1609 + }, + { + "epoch": 0.3247942454932592, + "grad_norm": 0.49218058586120605, + "learning_rate": 7.888738414940352e-06, + "loss": 0.6419, + "step": 1610 + }, + { + "epoch": 0.3249959810494662, + "grad_norm": 0.4804253578186035, + "learning_rate": 7.886070629108826e-06, + "loss": 0.904, + "step": 1611 + }, + { + "epoch": 0.3251977166056732, + "grad_norm": 0.36306464672088623, + "learning_rate": 7.883401610574338e-06, + "loss": 0.812, + "step": 1612 + }, + { + "epoch": 0.32539945216188015, + "grad_norm": 0.41663479804992676, + "learning_rate": 7.880731360476877e-06, + "loss": 0.9999, + "step": 1613 + }, + { + "epoch": 0.32560118771808716, + "grad_norm": 0.42178666591644287, + "learning_rate": 7.878059879956967e-06, + "loss": 0.8646, + "step": 1614 + }, + { + "epoch": 0.32580292327429416, + "grad_norm": 0.4636440575122833, + "learning_rate": 7.875387170155657e-06, + "loss": 0.7127, + "step": 1615 + }, + { + "epoch": 0.32600465883050117, + "grad_norm": 0.4064319431781769, + "learning_rate": 7.872713232214517e-06, + "loss": 0.8338, + "step": 1616 + }, + { + "epoch": 0.3262063943867082, + "grad_norm": 0.47023361921310425, + "learning_rate": 7.87003806727564e-06, + "loss": 0.6782, + "step": 1617 + }, + { + "epoch": 0.3264081299429151, + "grad_norm": 0.3830969035625458, + "learning_rate": 7.867361676481654e-06, + "loss": 0.6551, + "step": 1618 + }, + { + "epoch": 0.32660986549912213, + "grad_norm": 0.44314447045326233, + "learning_rate": 7.864684060975699e-06, + "loss": 0.9076, + "step": 1619 + }, + { + "epoch": 0.32681160105532914, + "grad_norm": 0.476024866104126, + "learning_rate": 7.86200522190144e-06, + "loss": 0.7747, + "step": 1620 + }, + { + "epoch": 0.32701333661153614, + "grad_norm": 0.44015395641326904, + "learning_rate": 7.859325160403073e-06, + "loss": 0.6402, + "step": 1621 + }, + { + "epoch": 0.3272150721677431, + "grad_norm": 0.5018024444580078, + "learning_rate": 7.856643877625304e-06, + "loss": 0.6621, + "step": 1622 + }, + { + "epoch": 0.3274168077239501, + "grad_norm": 1.147547721862793, + "learning_rate": 7.853961374713367e-06, + "loss": 0.712, + "step": 1623 + }, + { + "epoch": 0.3276185432801571, + "grad_norm": 0.7580029964447021, + "learning_rate": 7.851277652813023e-06, + "loss": 0.697, + "step": 1624 + }, + { + "epoch": 0.3278202788363641, + "grad_norm": 1.268939733505249, + "learning_rate": 7.848592713070542e-06, + "loss": 0.6805, + "step": 1625 + }, + { + "epoch": 0.3280220143925711, + "grad_norm": 0.46011170744895935, + "learning_rate": 7.845906556632721e-06, + "loss": 0.6714, + "step": 1626 + }, + { + "epoch": 0.32822374994877807, + "grad_norm": 4.070643901824951, + "learning_rate": 7.843219184646877e-06, + "loss": 0.6859, + "step": 1627 + }, + { + "epoch": 0.32842548550498507, + "grad_norm": 1.8737590312957764, + "learning_rate": 7.84053059826084e-06, + "loss": 0.6949, + "step": 1628 + }, + { + "epoch": 0.3286272210611921, + "grad_norm": 0.5250210762023926, + "learning_rate": 7.837840798622969e-06, + "loss": 0.667, + "step": 1629 + }, + { + "epoch": 0.3288289566173991, + "grad_norm": 0.40614551305770874, + "learning_rate": 7.83514978688213e-06, + "loss": 0.7115, + "step": 1630 + }, + { + "epoch": 0.32903069217360603, + "grad_norm": 0.43287792801856995, + "learning_rate": 7.832457564187715e-06, + "loss": 0.7591, + "step": 1631 + }, + { + "epoch": 0.32923242772981304, + "grad_norm": 0.598949134349823, + "learning_rate": 7.82976413168963e-06, + "loss": 0.7173, + "step": 1632 + }, + { + "epoch": 0.32943416328602004, + "grad_norm": 1.1374804973602295, + "learning_rate": 7.827069490538298e-06, + "loss": 0.8068, + "step": 1633 + }, + { + "epoch": 0.32963589884222705, + "grad_norm": 0.4957578182220459, + "learning_rate": 7.82437364188466e-06, + "loss": 0.6748, + "step": 1634 + }, + { + "epoch": 0.32983763439843405, + "grad_norm": 0.5811397433280945, + "learning_rate": 7.821676586880167e-06, + "loss": 0.7066, + "step": 1635 + }, + { + "epoch": 0.330039369954641, + "grad_norm": 0.3692084848880768, + "learning_rate": 7.818978326676793e-06, + "loss": 0.6955, + "step": 1636 + }, + { + "epoch": 0.330241105510848, + "grad_norm": 0.40660807490348816, + "learning_rate": 7.81627886242702e-06, + "loss": 0.7185, + "step": 1637 + }, + { + "epoch": 0.330442841067055, + "grad_norm": 0.31255900859832764, + "learning_rate": 7.813578195283852e-06, + "loss": 0.6788, + "step": 1638 + }, + { + "epoch": 0.330644576623262, + "grad_norm": 0.2971351146697998, + "learning_rate": 7.810876326400796e-06, + "loss": 0.6293, + "step": 1639 + }, + { + "epoch": 0.33084631217946897, + "grad_norm": 0.36985236406326294, + "learning_rate": 7.808173256931883e-06, + "loss": 0.8046, + "step": 1640 + }, + { + "epoch": 0.331048047735676, + "grad_norm": 0.41634565591812134, + "learning_rate": 7.805468988031652e-06, + "loss": 0.7201, + "step": 1641 + }, + { + "epoch": 0.331249783291883, + "grad_norm": 0.3758183717727661, + "learning_rate": 7.802763520855152e-06, + "loss": 0.7648, + "step": 1642 + }, + { + "epoch": 0.33145151884809, + "grad_norm": 0.426986962556839, + "learning_rate": 7.80005685655795e-06, + "loss": 0.7397, + "step": 1643 + }, + { + "epoch": 0.331653254404297, + "grad_norm": 1.3208926916122437, + "learning_rate": 7.797348996296116e-06, + "loss": 0.671, + "step": 1644 + }, + { + "epoch": 0.33185498996050394, + "grad_norm": 0.3618689477443695, + "learning_rate": 7.794639941226238e-06, + "loss": 0.6987, + "step": 1645 + }, + { + "epoch": 0.33205672551671095, + "grad_norm": 0.7262416481971741, + "learning_rate": 7.791929692505411e-06, + "loss": 0.6579, + "step": 1646 + }, + { + "epoch": 0.33225846107291795, + "grad_norm": 0.7027097344398499, + "learning_rate": 7.789218251291244e-06, + "loss": 0.7859, + "step": 1647 + }, + { + "epoch": 0.33246019662912496, + "grad_norm": 1.2748175859451294, + "learning_rate": 7.786505618741848e-06, + "loss": 0.6828, + "step": 1648 + }, + { + "epoch": 0.3326619321853319, + "grad_norm": 0.4840089976787567, + "learning_rate": 7.783791796015848e-06, + "loss": 0.6823, + "step": 1649 + }, + { + "epoch": 0.3328636677415389, + "grad_norm": 0.38921627402305603, + "learning_rate": 7.781076784272377e-06, + "loss": 0.6761, + "step": 1650 + }, + { + "epoch": 0.3330654032977459, + "grad_norm": 0.30266091227531433, + "learning_rate": 7.778360584671072e-06, + "loss": 0.6835, + "step": 1651 + }, + { + "epoch": 0.3332671388539529, + "grad_norm": 0.43497520685195923, + "learning_rate": 7.775643198372085e-06, + "loss": 0.7037, + "step": 1652 + }, + { + "epoch": 0.33346887441015993, + "grad_norm": 0.42013806104660034, + "learning_rate": 7.772924626536068e-06, + "loss": 0.6906, + "step": 1653 + }, + { + "epoch": 0.3336706099663669, + "grad_norm": 0.3756752014160156, + "learning_rate": 7.770204870324181e-06, + "loss": 0.7698, + "step": 1654 + }, + { + "epoch": 0.3338723455225739, + "grad_norm": 0.5245344042778015, + "learning_rate": 7.76748393089809e-06, + "loss": 0.651, + "step": 1655 + }, + { + "epoch": 0.3340740810787809, + "grad_norm": 0.9904852509498596, + "learning_rate": 7.764761809419969e-06, + "loss": 0.6821, + "step": 1656 + }, + { + "epoch": 0.3342758166349879, + "grad_norm": 0.3961763083934784, + "learning_rate": 7.762038507052494e-06, + "loss": 0.6742, + "step": 1657 + }, + { + "epoch": 0.33447755219119485, + "grad_norm": 0.38408246636390686, + "learning_rate": 7.759314024958846e-06, + "loss": 0.708, + "step": 1658 + }, + { + "epoch": 0.33467928774740185, + "grad_norm": 0.8345887660980225, + "learning_rate": 7.75658836430271e-06, + "loss": 0.642, + "step": 1659 + }, + { + "epoch": 0.33488102330360886, + "grad_norm": 0.5460199117660522, + "learning_rate": 7.753861526248274e-06, + "loss": 0.6872, + "step": 1660 + }, + { + "epoch": 0.33508275885981587, + "grad_norm": 0.6602993607521057, + "learning_rate": 7.751133511960228e-06, + "loss": 0.7681, + "step": 1661 + }, + { + "epoch": 0.33528449441602287, + "grad_norm": 1.3037577867507935, + "learning_rate": 7.748404322603768e-06, + "loss": 0.6629, + "step": 1662 + }, + { + "epoch": 0.3354862299722298, + "grad_norm": 0.7454012036323547, + "learning_rate": 7.74567395934459e-06, + "loss": 0.7028, + "step": 1663 + }, + { + "epoch": 0.3356879655284368, + "grad_norm": 0.7407185435295105, + "learning_rate": 7.74294242334889e-06, + "loss": 0.8085, + "step": 1664 + }, + { + "epoch": 0.33588970108464383, + "grad_norm": 1.609656810760498, + "learning_rate": 7.740209715783365e-06, + "loss": 0.9629, + "step": 1665 + }, + { + "epoch": 0.33609143664085084, + "grad_norm": 0.6212112307548523, + "learning_rate": 7.737475837815215e-06, + "loss": 0.6332, + "step": 1666 + }, + { + "epoch": 0.3362931721970578, + "grad_norm": 0.6354268789291382, + "learning_rate": 7.734740790612137e-06, + "loss": 0.7686, + "step": 1667 + }, + { + "epoch": 0.3364949077532648, + "grad_norm": 0.8016752600669861, + "learning_rate": 7.732004575342328e-06, + "loss": 0.6746, + "step": 1668 + }, + { + "epoch": 0.3366966433094718, + "grad_norm": 1.6977438926696777, + "learning_rate": 7.729267193174483e-06, + "loss": 0.6971, + "step": 1669 + }, + { + "epoch": 0.3368983788656788, + "grad_norm": 0.3582017421722412, + "learning_rate": 7.726528645277801e-06, + "loss": 0.7068, + "step": 1670 + }, + { + "epoch": 0.3371001144218858, + "grad_norm": 1.4036459922790527, + "learning_rate": 7.723788932821977e-06, + "loss": 0.7213, + "step": 1671 + }, + { + "epoch": 0.33730184997809276, + "grad_norm": 0.34059593081474304, + "learning_rate": 7.721048056977192e-06, + "loss": 0.644, + "step": 1672 + }, + { + "epoch": 0.33750358553429977, + "grad_norm": 6.393365383148193, + "learning_rate": 7.71830601891414e-06, + "loss": 0.671, + "step": 1673 + }, + { + "epoch": 0.33770532109050677, + "grad_norm": 0.9649179577827454, + "learning_rate": 7.715562819804005e-06, + "loss": 0.6975, + "step": 1674 + }, + { + "epoch": 0.3379070566467138, + "grad_norm": 1.1787409782409668, + "learning_rate": 7.712818460818464e-06, + "loss": 0.8055, + "step": 1675 + }, + { + "epoch": 0.3381087922029207, + "grad_norm": 0.6837486028671265, + "learning_rate": 7.710072943129692e-06, + "loss": 0.8101, + "step": 1676 + }, + { + "epoch": 0.33831052775912773, + "grad_norm": 0.3825688660144806, + "learning_rate": 7.707326267910358e-06, + "loss": 0.8263, + "step": 1677 + }, + { + "epoch": 0.33851226331533474, + "grad_norm": 0.3459135591983795, + "learning_rate": 7.70457843633363e-06, + "loss": 0.6956, + "step": 1678 + }, + { + "epoch": 0.33871399887154174, + "grad_norm": 0.42721888422966003, + "learning_rate": 7.70182944957316e-06, + "loss": 0.728, + "step": 1679 + }, + { + "epoch": 0.33891573442774875, + "grad_norm": 0.41401034593582153, + "learning_rate": 7.699079308803105e-06, + "loss": 0.7388, + "step": 1680 + }, + { + "epoch": 0.3391174699839557, + "grad_norm": 0.4155450463294983, + "learning_rate": 7.696328015198107e-06, + "loss": 0.6918, + "step": 1681 + }, + { + "epoch": 0.3393192055401627, + "grad_norm": 0.9530971646308899, + "learning_rate": 7.693575569933302e-06, + "loss": 0.6704, + "step": 1682 + }, + { + "epoch": 0.3395209410963697, + "grad_norm": 0.46446821093559265, + "learning_rate": 7.69082197418432e-06, + "loss": 0.7255, + "step": 1683 + }, + { + "epoch": 0.3397226766525767, + "grad_norm": 0.8986880779266357, + "learning_rate": 7.688067229127283e-06, + "loss": 0.6523, + "step": 1684 + }, + { + "epoch": 0.33992441220878367, + "grad_norm": 0.5720349550247192, + "learning_rate": 7.685311335938797e-06, + "loss": 0.761, + "step": 1685 + }, + { + "epoch": 0.34012614776499067, + "grad_norm": 0.4581213593482971, + "learning_rate": 7.682554295795968e-06, + "loss": 0.7814, + "step": 1686 + }, + { + "epoch": 0.3403278833211977, + "grad_norm": 0.40103885531425476, + "learning_rate": 7.679796109876385e-06, + "loss": 0.6647, + "step": 1687 + }, + { + "epoch": 0.3405296188774047, + "grad_norm": 0.388505756855011, + "learning_rate": 7.67703677935813e-06, + "loss": 0.6565, + "step": 1688 + }, + { + "epoch": 0.3407313544336117, + "grad_norm": 0.36097803711891174, + "learning_rate": 7.67427630541977e-06, + "loss": 0.7654, + "step": 1689 + }, + { + "epoch": 0.34093308998981864, + "grad_norm": 0.3629458546638489, + "learning_rate": 7.671514689240366e-06, + "loss": 0.6841, + "step": 1690 + }, + { + "epoch": 0.34113482554602564, + "grad_norm": 0.6155198216438293, + "learning_rate": 7.668751931999464e-06, + "loss": 0.7549, + "step": 1691 + }, + { + "epoch": 0.34133656110223265, + "grad_norm": 0.4708308279514313, + "learning_rate": 7.665988034877093e-06, + "loss": 0.6257, + "step": 1692 + }, + { + "epoch": 0.34153829665843966, + "grad_norm": 0.59466552734375, + "learning_rate": 7.663222999053774e-06, + "loss": 0.7247, + "step": 1693 + }, + { + "epoch": 0.3417400322146466, + "grad_norm": 0.4716358482837677, + "learning_rate": 7.660456825710518e-06, + "loss": 0.6991, + "step": 1694 + }, + { + "epoch": 0.3419417677708536, + "grad_norm": 0.808247983455658, + "learning_rate": 7.657689516028814e-06, + "loss": 0.6722, + "step": 1695 + }, + { + "epoch": 0.3421435033270606, + "grad_norm": 0.7132179141044617, + "learning_rate": 7.654921071190637e-06, + "loss": 0.7261, + "step": 1696 + }, + { + "epoch": 0.3423452388832676, + "grad_norm": 0.4227313995361328, + "learning_rate": 7.652151492378455e-06, + "loss": 0.6658, + "step": 1697 + }, + { + "epoch": 0.3425469744394746, + "grad_norm": 0.32644885778427124, + "learning_rate": 7.649380780775211e-06, + "loss": 0.6655, + "step": 1698 + }, + { + "epoch": 0.3427487099956816, + "grad_norm": 1.1053694486618042, + "learning_rate": 7.646608937564338e-06, + "loss": 0.6884, + "step": 1699 + }, + { + "epoch": 0.3429504455518886, + "grad_norm": 0.3581966757774353, + "learning_rate": 7.643835963929747e-06, + "loss": 0.6848, + "step": 1700 + }, + { + "epoch": 0.3431521811080956, + "grad_norm": 0.5573354959487915, + "learning_rate": 7.641061861055837e-06, + "loss": 0.802, + "step": 1701 + }, + { + "epoch": 0.3433539166643026, + "grad_norm": 0.8281745910644531, + "learning_rate": 7.638286630127487e-06, + "loss": 0.6741, + "step": 1702 + }, + { + "epoch": 0.34355565222050954, + "grad_norm": 0.5986734628677368, + "learning_rate": 7.635510272330058e-06, + "loss": 0.6812, + "step": 1703 + }, + { + "epoch": 0.34375738777671655, + "grad_norm": 0.587714672088623, + "learning_rate": 7.63273278884939e-06, + "loss": 0.7008, + "step": 1704 + }, + { + "epoch": 0.34395912333292356, + "grad_norm": 0.6945699453353882, + "learning_rate": 7.62995418087181e-06, + "loss": 0.6674, + "step": 1705 + }, + { + "epoch": 0.34416085888913056, + "grad_norm": 2.0080175399780273, + "learning_rate": 7.6271744495841185e-06, + "loss": 0.677, + "step": 1706 + }, + { + "epoch": 0.34436259444533757, + "grad_norm": 1.2509560585021973, + "learning_rate": 7.624393596173598e-06, + "loss": 0.682, + "step": 1707 + }, + { + "epoch": 0.3445643300015445, + "grad_norm": 0.977975606918335, + "learning_rate": 7.621611621828016e-06, + "loss": 0.7995, + "step": 1708 + }, + { + "epoch": 0.3447660655577515, + "grad_norm": 0.7222344875335693, + "learning_rate": 7.618828527735607e-06, + "loss": 0.8359, + "step": 1709 + }, + { + "epoch": 0.3449678011139585, + "grad_norm": 0.7610874772071838, + "learning_rate": 7.616044315085092e-06, + "loss": 0.6729, + "step": 1710 + }, + { + "epoch": 0.34516953667016553, + "grad_norm": 0.5035162568092346, + "learning_rate": 7.613258985065672e-06, + "loss": 0.7004, + "step": 1711 + }, + { + "epoch": 0.34537127222637254, + "grad_norm": 0.31400299072265625, + "learning_rate": 7.61047253886702e-06, + "loss": 0.6807, + "step": 1712 + }, + { + "epoch": 0.3455730077825795, + "grad_norm": 0.41363200545310974, + "learning_rate": 7.607684977679284e-06, + "loss": 0.7384, + "step": 1713 + }, + { + "epoch": 0.3457747433387865, + "grad_norm": 0.42264917492866516, + "learning_rate": 7.604896302693094e-06, + "loss": 0.679, + "step": 1714 + }, + { + "epoch": 0.3459764788949935, + "grad_norm": 0.8459030985832214, + "learning_rate": 7.602106515099554e-06, + "loss": 0.6999, + "step": 1715 + }, + { + "epoch": 0.3461782144512005, + "grad_norm": 0.5736342668533325, + "learning_rate": 7.599315616090242e-06, + "loss": 0.7783, + "step": 1716 + }, + { + "epoch": 0.34637995000740746, + "grad_norm": 0.42105481028556824, + "learning_rate": 7.596523606857209e-06, + "loss": 0.6519, + "step": 1717 + }, + { + "epoch": 0.34658168556361446, + "grad_norm": 0.40403833985328674, + "learning_rate": 7.593730488592985e-06, + "loss": 0.7062, + "step": 1718 + }, + { + "epoch": 0.34678342111982147, + "grad_norm": 0.51008141040802, + "learning_rate": 7.590936262490569e-06, + "loss": 0.7837, + "step": 1719 + }, + { + "epoch": 0.34698515667602847, + "grad_norm": 0.680470883846283, + "learning_rate": 7.588140929743437e-06, + "loss": 0.6716, + "step": 1720 + }, + { + "epoch": 0.3471868922322355, + "grad_norm": 0.3377155661582947, + "learning_rate": 7.585344491545535e-06, + "loss": 0.7046, + "step": 1721 + }, + { + "epoch": 0.34738862778844243, + "grad_norm": 0.3631691336631775, + "learning_rate": 7.58254694909128e-06, + "loss": 0.6441, + "step": 1722 + }, + { + "epoch": 0.34759036334464943, + "grad_norm": 0.4910680055618286, + "learning_rate": 7.579748303575567e-06, + "loss": 0.6494, + "step": 1723 + }, + { + "epoch": 0.34779209890085644, + "grad_norm": 0.33177176117897034, + "learning_rate": 7.576948556193755e-06, + "loss": 0.6973, + "step": 1724 + }, + { + "epoch": 0.34799383445706344, + "grad_norm": 0.4534423351287842, + "learning_rate": 7.574147708141675e-06, + "loss": 0.6672, + "step": 1725 + }, + { + "epoch": 0.3481955700132704, + "grad_norm": 0.7091371417045593, + "learning_rate": 7.5713457606156335e-06, + "loss": 0.6313, + "step": 1726 + }, + { + "epoch": 0.3483973055694774, + "grad_norm": 0.38050803542137146, + "learning_rate": 7.568542714812401e-06, + "loss": 0.7431, + "step": 1727 + }, + { + "epoch": 0.3485990411256844, + "grad_norm": 0.7393271327018738, + "learning_rate": 7.565738571929217e-06, + "loss": 0.7006, + "step": 1728 + }, + { + "epoch": 0.3488007766818914, + "grad_norm": 0.35073035955429077, + "learning_rate": 7.562933333163792e-06, + "loss": 0.6593, + "step": 1729 + }, + { + "epoch": 0.3490025122380984, + "grad_norm": 0.9442110657691956, + "learning_rate": 7.5601269997143055e-06, + "loss": 0.6577, + "step": 1730 + }, + { + "epoch": 0.34920424779430537, + "grad_norm": 1.0707677602767944, + "learning_rate": 7.557319572779402e-06, + "loss": 0.6477, + "step": 1731 + }, + { + "epoch": 0.3494059833505124, + "grad_norm": 0.3912695050239563, + "learning_rate": 7.554511053558196e-06, + "loss": 1.0967, + "step": 1732 + }, + { + "epoch": 0.3496077189067194, + "grad_norm": 0.7106683850288391, + "learning_rate": 7.551701443250263e-06, + "loss": 0.8774, + "step": 1733 + }, + { + "epoch": 0.3498094544629264, + "grad_norm": 0.7584543824195862, + "learning_rate": 7.54889074305565e-06, + "loss": 0.7291, + "step": 1734 + }, + { + "epoch": 0.35001119001913333, + "grad_norm": 0.3926626145839691, + "learning_rate": 7.546078954174868e-06, + "loss": 0.718, + "step": 1735 + }, + { + "epoch": 0.35021292557534034, + "grad_norm": 1.3000050783157349, + "learning_rate": 7.543266077808893e-06, + "loss": 0.7198, + "step": 1736 + }, + { + "epoch": 0.35041466113154734, + "grad_norm": 0.4094867706298828, + "learning_rate": 7.540452115159163e-06, + "loss": 0.7017, + "step": 1737 + }, + { + "epoch": 0.35061639668775435, + "grad_norm": 0.3614434003829956, + "learning_rate": 7.5376370674275834e-06, + "loss": 0.6872, + "step": 1738 + }, + { + "epoch": 0.35081813224396136, + "grad_norm": 1.8085640668869019, + "learning_rate": 7.5348209358165225e-06, + "loss": 0.7319, + "step": 1739 + }, + { + "epoch": 0.3510198678001683, + "grad_norm": 0.7284565567970276, + "learning_rate": 7.53200372152881e-06, + "loss": 0.6596, + "step": 1740 + }, + { + "epoch": 0.3512216033563753, + "grad_norm": 0.4147509038448334, + "learning_rate": 7.529185425767738e-06, + "loss": 0.7772, + "step": 1741 + }, + { + "epoch": 0.3514233389125823, + "grad_norm": 0.35822349786758423, + "learning_rate": 7.526366049737063e-06, + "loss": 0.7323, + "step": 1742 + }, + { + "epoch": 0.3516250744687893, + "grad_norm": 0.38672930002212524, + "learning_rate": 7.523545594641001e-06, + "loss": 0.8405, + "step": 1743 + }, + { + "epoch": 0.3518268100249963, + "grad_norm": 0.3860418498516083, + "learning_rate": 7.520724061684227e-06, + "loss": 0.6463, + "step": 1744 + }, + { + "epoch": 0.3520285455812033, + "grad_norm": 0.5697163343429565, + "learning_rate": 7.51790145207188e-06, + "loss": 0.701, + "step": 1745 + }, + { + "epoch": 0.3522302811374103, + "grad_norm": 0.6017259955406189, + "learning_rate": 7.51507776700956e-06, + "loss": 0.6839, + "step": 1746 + }, + { + "epoch": 0.3524320166936173, + "grad_norm": 0.46007320284843445, + "learning_rate": 7.512253007703321e-06, + "loss": 0.7561, + "step": 1747 + }, + { + "epoch": 0.3526337522498243, + "grad_norm": 0.740803062915802, + "learning_rate": 7.509427175359678e-06, + "loss": 0.6879, + "step": 1748 + }, + { + "epoch": 0.35283548780603124, + "grad_norm": 0.4627668857574463, + "learning_rate": 7.506600271185605e-06, + "loss": 0.6998, + "step": 1749 + }, + { + "epoch": 0.35303722336223825, + "grad_norm": 0.3997352123260498, + "learning_rate": 7.503772296388536e-06, + "loss": 0.6721, + "step": 1750 + }, + { + "epoch": 0.35323895891844526, + "grad_norm": 0.49099141359329224, + "learning_rate": 7.500943252176359e-06, + "loss": 0.7366, + "step": 1751 + }, + { + "epoch": 0.35344069447465226, + "grad_norm": 0.3792705833911896, + "learning_rate": 7.498113139757418e-06, + "loss": 0.6561, + "step": 1752 + }, + { + "epoch": 0.3536424300308592, + "grad_norm": 0.5688077211380005, + "learning_rate": 7.4952819603405155e-06, + "loss": 0.6516, + "step": 1753 + }, + { + "epoch": 0.3538441655870662, + "grad_norm": 0.5022661089897156, + "learning_rate": 7.492449715134912e-06, + "loss": 0.7017, + "step": 1754 + }, + { + "epoch": 0.3540459011432732, + "grad_norm": 0.4993700683116913, + "learning_rate": 7.489616405350319e-06, + "loss": 0.6686, + "step": 1755 + }, + { + "epoch": 0.35424763669948023, + "grad_norm": 1.4127341508865356, + "learning_rate": 7.4867820321969005e-06, + "loss": 0.8506, + "step": 1756 + }, + { + "epoch": 0.35444937225568723, + "grad_norm": 0.5744199752807617, + "learning_rate": 7.483946596885283e-06, + "loss": 0.6632, + "step": 1757 + }, + { + "epoch": 0.3546511078118942, + "grad_norm": 0.5674511194229126, + "learning_rate": 7.481110100626542e-06, + "loss": 0.6733, + "step": 1758 + }, + { + "epoch": 0.3548528433681012, + "grad_norm": 0.45668545365333557, + "learning_rate": 7.478272544632204e-06, + "loss": 0.6698, + "step": 1759 + }, + { + "epoch": 0.3550545789243082, + "grad_norm": 0.4792921841144562, + "learning_rate": 7.47543393011425e-06, + "loss": 0.6587, + "step": 1760 + }, + { + "epoch": 0.3552563144805152, + "grad_norm": 0.6189489960670471, + "learning_rate": 7.472594258285115e-06, + "loss": 1.0455, + "step": 1761 + }, + { + "epoch": 0.35545805003672215, + "grad_norm": 0.5455827116966248, + "learning_rate": 7.469753530357684e-06, + "loss": 0.7175, + "step": 1762 + }, + { + "epoch": 0.35565978559292916, + "grad_norm": 0.7381955981254578, + "learning_rate": 7.466911747545291e-06, + "loss": 0.6858, + "step": 1763 + }, + { + "epoch": 0.35586152114913616, + "grad_norm": 1.1056345701217651, + "learning_rate": 7.464068911061726e-06, + "loss": 0.6447, + "step": 1764 + }, + { + "epoch": 0.35606325670534317, + "grad_norm": 2.1146862506866455, + "learning_rate": 7.461225022121223e-06, + "loss": 0.6942, + "step": 1765 + }, + { + "epoch": 0.3562649922615502, + "grad_norm": 1.0887856483459473, + "learning_rate": 7.45838008193847e-06, + "loss": 0.7291, + "step": 1766 + }, + { + "epoch": 0.3564667278177571, + "grad_norm": 0.6987454891204834, + "learning_rate": 7.455534091728603e-06, + "loss": 0.6633, + "step": 1767 + }, + { + "epoch": 0.35666846337396413, + "grad_norm": 0.9622281789779663, + "learning_rate": 7.452687052707201e-06, + "loss": 0.7049, + "step": 1768 + }, + { + "epoch": 0.35687019893017113, + "grad_norm": 0.8182410597801208, + "learning_rate": 7.4498389660903025e-06, + "loss": 1.1296, + "step": 1769 + }, + { + "epoch": 0.35707193448637814, + "grad_norm": 1.42489755153656, + "learning_rate": 7.446989833094381e-06, + "loss": 0.7751, + "step": 1770 + }, + { + "epoch": 0.3572736700425851, + "grad_norm": 0.5022891759872437, + "learning_rate": 7.444139654936367e-06, + "loss": 0.6574, + "step": 1771 + }, + { + "epoch": 0.3574754055987921, + "grad_norm": 0.913811445236206, + "learning_rate": 7.441288432833628e-06, + "loss": 0.7729, + "step": 1772 + }, + { + "epoch": 0.3576771411549991, + "grad_norm": 0.987787127494812, + "learning_rate": 7.438436168003987e-06, + "loss": 0.6526, + "step": 1773 + }, + { + "epoch": 0.3578788767112061, + "grad_norm": 0.8746455311775208, + "learning_rate": 7.435582861665705e-06, + "loss": 0.678, + "step": 1774 + }, + { + "epoch": 0.3580806122674131, + "grad_norm": 0.4354274868965149, + "learning_rate": 7.432728515037494e-06, + "loss": 0.7056, + "step": 1775 + }, + { + "epoch": 0.35828234782362006, + "grad_norm": 0.3449755311012268, + "learning_rate": 7.429873129338503e-06, + "loss": 0.814, + "step": 1776 + }, + { + "epoch": 0.35848408337982707, + "grad_norm": 0.741226851940155, + "learning_rate": 7.4270167057883295e-06, + "loss": 0.7221, + "step": 1777 + }, + { + "epoch": 0.3586858189360341, + "grad_norm": 1.018054723739624, + "learning_rate": 7.424159245607016e-06, + "loss": 0.816, + "step": 1778 + }, + { + "epoch": 0.3588875544922411, + "grad_norm": 0.4770512282848358, + "learning_rate": 7.421300750015043e-06, + "loss": 0.8044, + "step": 1779 + }, + { + "epoch": 0.35908929004844803, + "grad_norm": 0.7181556820869446, + "learning_rate": 7.418441220233336e-06, + "loss": 0.6637, + "step": 1780 + }, + { + "epoch": 0.35929102560465503, + "grad_norm": 0.49955129623413086, + "learning_rate": 7.415580657483263e-06, + "loss": 0.6818, + "step": 1781 + }, + { + "epoch": 0.35949276116086204, + "grad_norm": 0.6910608410835266, + "learning_rate": 7.412719062986632e-06, + "loss": 0.7449, + "step": 1782 + }, + { + "epoch": 0.35969449671706905, + "grad_norm": 0.35166415572166443, + "learning_rate": 7.40985643796569e-06, + "loss": 0.6331, + "step": 1783 + }, + { + "epoch": 0.35989623227327605, + "grad_norm": 0.5472837090492249, + "learning_rate": 7.406992783643127e-06, + "loss": 0.6728, + "step": 1784 + }, + { + "epoch": 0.360097967829483, + "grad_norm": 0.5765179395675659, + "learning_rate": 7.4041281012420695e-06, + "loss": 0.652, + "step": 1785 + }, + { + "epoch": 0.36029970338569, + "grad_norm": 0.642083466053009, + "learning_rate": 7.401262391986088e-06, + "loss": 0.7795, + "step": 1786 + }, + { + "epoch": 0.360501438941897, + "grad_norm": 0.39941856265068054, + "learning_rate": 7.398395657099189e-06, + "loss": 0.662, + "step": 1787 + }, + { + "epoch": 0.360703174498104, + "grad_norm": 0.4457746148109436, + "learning_rate": 7.395527897805812e-06, + "loss": 0.6827, + "step": 1788 + }, + { + "epoch": 0.36090491005431097, + "grad_norm": 0.44422203302383423, + "learning_rate": 7.392659115330844e-06, + "loss": 0.7186, + "step": 1789 + }, + { + "epoch": 0.361106645610518, + "grad_norm": 0.4708141088485718, + "learning_rate": 7.389789310899602e-06, + "loss": 0.6707, + "step": 1790 + }, + { + "epoch": 0.361308381166725, + "grad_norm": 0.7885355353355408, + "learning_rate": 7.38691848573784e-06, + "loss": 0.651, + "step": 1791 + }, + { + "epoch": 0.361510116722932, + "grad_norm": 0.7183753848075867, + "learning_rate": 7.3840466410717505e-06, + "loss": 0.6756, + "step": 1792 + }, + { + "epoch": 0.361711852279139, + "grad_norm": 0.8749204277992249, + "learning_rate": 7.381173778127961e-06, + "loss": 0.6698, + "step": 1793 + }, + { + "epoch": 0.36191358783534594, + "grad_norm": 0.4229263961315155, + "learning_rate": 7.378299898133533e-06, + "loss": 0.6847, + "step": 1794 + }, + { + "epoch": 0.36211532339155295, + "grad_norm": 1.5481194257736206, + "learning_rate": 7.3754250023159615e-06, + "loss": 0.9223, + "step": 1795 + }, + { + "epoch": 0.36231705894775995, + "grad_norm": 0.6774659752845764, + "learning_rate": 7.372549091903175e-06, + "loss": 0.6662, + "step": 1796 + }, + { + "epoch": 0.36251879450396696, + "grad_norm": 0.4385301470756531, + "learning_rate": 7.36967216812354e-06, + "loss": 0.7054, + "step": 1797 + }, + { + "epoch": 0.3627205300601739, + "grad_norm": 0.7024504542350769, + "learning_rate": 7.366794232205852e-06, + "loss": 0.7764, + "step": 1798 + }, + { + "epoch": 0.3629222656163809, + "grad_norm": 0.4045291543006897, + "learning_rate": 7.36391528537934e-06, + "loss": 0.6346, + "step": 1799 + }, + { + "epoch": 0.3631240011725879, + "grad_norm": 0.39111268520355225, + "learning_rate": 7.36103532887366e-06, + "loss": 0.7232, + "step": 1800 + }, + { + "epoch": 0.3633257367287949, + "grad_norm": 0.44023269414901733, + "learning_rate": 7.358154363918909e-06, + "loss": 0.6935, + "step": 1801 + }, + { + "epoch": 0.36352747228500193, + "grad_norm": 0.3415777087211609, + "learning_rate": 7.355272391745605e-06, + "loss": 0.6837, + "step": 1802 + }, + { + "epoch": 0.3637292078412089, + "grad_norm": 0.32775235176086426, + "learning_rate": 7.352389413584704e-06, + "loss": 0.7245, + "step": 1803 + }, + { + "epoch": 0.3639309433974159, + "grad_norm": 0.4504493176937103, + "learning_rate": 7.349505430667585e-06, + "loss": 0.6969, + "step": 1804 + }, + { + "epoch": 0.3641326789536229, + "grad_norm": 0.5969768762588501, + "learning_rate": 7.3466204442260605e-06, + "loss": 0.6916, + "step": 1805 + }, + { + "epoch": 0.3643344145098299, + "grad_norm": 0.5859495401382446, + "learning_rate": 7.343734455492372e-06, + "loss": 0.7688, + "step": 1806 + }, + { + "epoch": 0.36453615006603685, + "grad_norm": 0.431937038898468, + "learning_rate": 7.340847465699186e-06, + "loss": 0.6623, + "step": 1807 + }, + { + "epoch": 0.36473788562224385, + "grad_norm": 0.4382317364215851, + "learning_rate": 7.3379594760795955e-06, + "loss": 0.684, + "step": 1808 + }, + { + "epoch": 0.36493962117845086, + "grad_norm": 0.33924439549446106, + "learning_rate": 7.335070487867127e-06, + "loss": 0.6919, + "step": 1809 + }, + { + "epoch": 0.36514135673465786, + "grad_norm": 0.8796870112419128, + "learning_rate": 7.332180502295729e-06, + "loss": 0.7005, + "step": 1810 + }, + { + "epoch": 0.36534309229086487, + "grad_norm": 1.7464920282363892, + "learning_rate": 7.329289520599776e-06, + "loss": 0.6226, + "step": 1811 + }, + { + "epoch": 0.3655448278470718, + "grad_norm": 0.5517187714576721, + "learning_rate": 7.326397544014065e-06, + "loss": 0.6956, + "step": 1812 + }, + { + "epoch": 0.3657465634032788, + "grad_norm": 0.4252607226371765, + "learning_rate": 7.32350457377383e-06, + "loss": 0.687, + "step": 1813 + }, + { + "epoch": 0.36594829895948583, + "grad_norm": 0.49049055576324463, + "learning_rate": 7.320610611114713e-06, + "loss": 0.7759, + "step": 1814 + }, + { + "epoch": 0.36615003451569283, + "grad_norm": 0.3037787079811096, + "learning_rate": 7.317715657272793e-06, + "loss": 0.7135, + "step": 1815 + }, + { + "epoch": 0.36635177007189984, + "grad_norm": 2.116178274154663, + "learning_rate": 7.314819713484561e-06, + "loss": 0.6389, + "step": 1816 + }, + { + "epoch": 0.3665535056281068, + "grad_norm": 0.47844094038009644, + "learning_rate": 7.3119227809869445e-06, + "loss": 0.7052, + "step": 1817 + }, + { + "epoch": 0.3667552411843138, + "grad_norm": 0.5032902359962463, + "learning_rate": 7.309024861017281e-06, + "loss": 0.6915, + "step": 1818 + }, + { + "epoch": 0.3669569767405208, + "grad_norm": 2.9803664684295654, + "learning_rate": 7.306125954813335e-06, + "loss": 0.6944, + "step": 1819 + }, + { + "epoch": 0.3671587122967278, + "grad_norm": 1.1328582763671875, + "learning_rate": 7.303226063613293e-06, + "loss": 0.7092, + "step": 1820 + }, + { + "epoch": 0.36736044785293476, + "grad_norm": 1.1794075965881348, + "learning_rate": 7.300325188655762e-06, + "loss": 0.7334, + "step": 1821 + }, + { + "epoch": 0.36756218340914176, + "grad_norm": 0.4251624643802643, + "learning_rate": 7.297423331179766e-06, + "loss": 0.6757, + "step": 1822 + }, + { + "epoch": 0.36776391896534877, + "grad_norm": 0.35774368047714233, + "learning_rate": 7.294520492424752e-06, + "loss": 0.701, + "step": 1823 + }, + { + "epoch": 0.3679656545215558, + "grad_norm": 0.34498193860054016, + "learning_rate": 7.291616673630583e-06, + "loss": 0.8526, + "step": 1824 + }, + { + "epoch": 0.3681673900777628, + "grad_norm": 0.5169105529785156, + "learning_rate": 7.288711876037546e-06, + "loss": 0.7012, + "step": 1825 + }, + { + "epoch": 0.36836912563396973, + "grad_norm": 0.7580662369728088, + "learning_rate": 7.28580610088634e-06, + "loss": 0.7087, + "step": 1826 + }, + { + "epoch": 0.36857086119017674, + "grad_norm": 0.5409523844718933, + "learning_rate": 7.282899349418086e-06, + "loss": 0.6959, + "step": 1827 + }, + { + "epoch": 0.36877259674638374, + "grad_norm": 0.44074133038520813, + "learning_rate": 7.279991622874319e-06, + "loss": 0.641, + "step": 1828 + }, + { + "epoch": 0.36897433230259075, + "grad_norm": 0.45611658692359924, + "learning_rate": 7.277082922496993e-06, + "loss": 0.8013, + "step": 1829 + }, + { + "epoch": 0.3691760678587977, + "grad_norm": 2.360243558883667, + "learning_rate": 7.2741732495284745e-06, + "loss": 0.675, + "step": 1830 + }, + { + "epoch": 0.3693778034150047, + "grad_norm": 0.5340917110443115, + "learning_rate": 7.27126260521155e-06, + "loss": 0.7439, + "step": 1831 + }, + { + "epoch": 0.3695795389712117, + "grad_norm": 0.8571584224700928, + "learning_rate": 7.268350990789415e-06, + "loss": 0.737, + "step": 1832 + }, + { + "epoch": 0.3697812745274187, + "grad_norm": 1.4758659601211548, + "learning_rate": 7.265438407505686e-06, + "loss": 0.852, + "step": 1833 + }, + { + "epoch": 0.3699830100836257, + "grad_norm": 0.7456501722335815, + "learning_rate": 7.262524856604389e-06, + "loss": 0.6932, + "step": 1834 + }, + { + "epoch": 0.37018474563983267, + "grad_norm": 0.36184829473495483, + "learning_rate": 7.259610339329965e-06, + "loss": 0.651, + "step": 1835 + }, + { + "epoch": 0.3703864811960397, + "grad_norm": 0.3770706057548523, + "learning_rate": 7.256694856927267e-06, + "loss": 0.7797, + "step": 1836 + }, + { + "epoch": 0.3705882167522467, + "grad_norm": 0.4544236361980438, + "learning_rate": 7.253778410641557e-06, + "loss": 0.6685, + "step": 1837 + }, + { + "epoch": 0.3707899523084537, + "grad_norm": 0.3603919446468353, + "learning_rate": 7.2508610017185175e-06, + "loss": 0.8271, + "step": 1838 + }, + { + "epoch": 0.37099168786466064, + "grad_norm": 0.5313817858695984, + "learning_rate": 7.247942631404232e-06, + "loss": 0.6747, + "step": 1839 + }, + { + "epoch": 0.37119342342086764, + "grad_norm": 0.3739321231842041, + "learning_rate": 7.245023300945203e-06, + "loss": 0.7191, + "step": 1840 + }, + { + "epoch": 0.37139515897707465, + "grad_norm": 0.3855895400047302, + "learning_rate": 7.242103011588339e-06, + "loss": 0.6782, + "step": 1841 + }, + { + "epoch": 0.37159689453328165, + "grad_norm": 0.4556311070919037, + "learning_rate": 7.239181764580956e-06, + "loss": 0.6905, + "step": 1842 + }, + { + "epoch": 0.37179863008948866, + "grad_norm": 0.4981215000152588, + "learning_rate": 7.236259561170783e-06, + "loss": 0.6933, + "step": 1843 + }, + { + "epoch": 0.3720003656456956, + "grad_norm": 0.47854724526405334, + "learning_rate": 7.233336402605956e-06, + "loss": 0.8892, + "step": 1844 + }, + { + "epoch": 0.3722021012019026, + "grad_norm": 0.6122666597366333, + "learning_rate": 7.23041229013502e-06, + "loss": 0.668, + "step": 1845 + }, + { + "epoch": 0.3724038367581096, + "grad_norm": 0.5111199021339417, + "learning_rate": 7.227487225006926e-06, + "loss": 0.8229, + "step": 1846 + }, + { + "epoch": 0.3726055723143166, + "grad_norm": 0.6023356914520264, + "learning_rate": 7.22456120847103e-06, + "loss": 1.0255, + "step": 1847 + }, + { + "epoch": 0.3728073078705236, + "grad_norm": 0.4147641062736511, + "learning_rate": 7.2216342417771e-06, + "loss": 0.6807, + "step": 1848 + }, + { + "epoch": 0.3730090434267306, + "grad_norm": 0.9486438632011414, + "learning_rate": 7.218706326175304e-06, + "loss": 0.6871, + "step": 1849 + }, + { + "epoch": 0.3732107789829376, + "grad_norm": 0.8997410535812378, + "learning_rate": 7.215777462916221e-06, + "loss": 0.8113, + "step": 1850 + }, + { + "epoch": 0.3734125145391446, + "grad_norm": 1.2337889671325684, + "learning_rate": 7.212847653250828e-06, + "loss": 0.7935, + "step": 1851 + }, + { + "epoch": 0.3736142500953516, + "grad_norm": 0.511205792427063, + "learning_rate": 7.2099168984305124e-06, + "loss": 0.7142, + "step": 1852 + }, + { + "epoch": 0.37381598565155855, + "grad_norm": 3.531510353088379, + "learning_rate": 7.206985199707062e-06, + "loss": 0.6798, + "step": 1853 + }, + { + "epoch": 0.37401772120776555, + "grad_norm": 1.0379084348678589, + "learning_rate": 7.204052558332668e-06, + "loss": 0.7005, + "step": 1854 + }, + { + "epoch": 0.37421945676397256, + "grad_norm": 2.9178311824798584, + "learning_rate": 7.2011189755599255e-06, + "loss": 0.6898, + "step": 1855 + }, + { + "epoch": 0.37442119232017956, + "grad_norm": 2.786430597305298, + "learning_rate": 7.19818445264183e-06, + "loss": 0.6328, + "step": 1856 + }, + { + "epoch": 0.3746229278763865, + "grad_norm": 3.562833070755005, + "learning_rate": 7.19524899083178e-06, + "loss": 0.6735, + "step": 1857 + }, + { + "epoch": 0.3748246634325935, + "grad_norm": 1.2733399868011475, + "learning_rate": 7.192312591383575e-06, + "loss": 0.6778, + "step": 1858 + }, + { + "epoch": 0.3750263989888005, + "grad_norm": 0.535793662071228, + "learning_rate": 7.189375255551413e-06, + "loss": 0.8744, + "step": 1859 + }, + { + "epoch": 0.37522813454500753, + "grad_norm": 1.285027265548706, + "learning_rate": 7.186436984589895e-06, + "loss": 0.9023, + "step": 1860 + }, + { + "epoch": 0.37542987010121454, + "grad_norm": 0.45449426770210266, + "learning_rate": 7.18349777975402e-06, + "loss": 0.9073, + "step": 1861 + }, + { + "epoch": 0.3756316056574215, + "grad_norm": 0.36496907472610474, + "learning_rate": 7.180557642299184e-06, + "loss": 0.7054, + "step": 1862 + }, + { + "epoch": 0.3758333412136285, + "grad_norm": 0.6334181427955627, + "learning_rate": 7.177616573481185e-06, + "loss": 0.7651, + "step": 1863 + }, + { + "epoch": 0.3760350767698355, + "grad_norm": 0.410114586353302, + "learning_rate": 7.1746745745562165e-06, + "loss": 0.6655, + "step": 1864 + }, + { + "epoch": 0.3762368123260425, + "grad_norm": 0.35170572996139526, + "learning_rate": 7.171731646780867e-06, + "loss": 0.692, + "step": 1865 + }, + { + "epoch": 0.37643854788224945, + "grad_norm": 0.4408973455429077, + "learning_rate": 7.168787791412128e-06, + "loss": 0.6569, + "step": 1866 + }, + { + "epoch": 0.37664028343845646, + "grad_norm": 0.3591359257698059, + "learning_rate": 7.165843009707383e-06, + "loss": 0.6774, + "step": 1867 + }, + { + "epoch": 0.37684201899466346, + "grad_norm": 0.48465201258659363, + "learning_rate": 7.162897302924409e-06, + "loss": 0.9298, + "step": 1868 + }, + { + "epoch": 0.37704375455087047, + "grad_norm": 0.4542745351791382, + "learning_rate": 7.1599506723213845e-06, + "loss": 0.6479, + "step": 1869 + }, + { + "epoch": 0.3772454901070775, + "grad_norm": 0.4889806807041168, + "learning_rate": 7.157003119156876e-06, + "loss": 0.8561, + "step": 1870 + }, + { + "epoch": 0.3774472256632844, + "grad_norm": 0.40656688809394836, + "learning_rate": 7.154054644689847e-06, + "loss": 0.7979, + "step": 1871 + }, + { + "epoch": 0.37764896121949143, + "grad_norm": 0.419414758682251, + "learning_rate": 7.151105250179658e-06, + "loss": 0.6655, + "step": 1872 + }, + { + "epoch": 0.37785069677569844, + "grad_norm": 0.4601416289806366, + "learning_rate": 7.1481549368860545e-06, + "loss": 0.7036, + "step": 1873 + }, + { + "epoch": 0.37805243233190544, + "grad_norm": 0.5763274431228638, + "learning_rate": 7.145203706069183e-06, + "loss": 0.6789, + "step": 1874 + }, + { + "epoch": 0.3782541678881124, + "grad_norm": 0.42279934883117676, + "learning_rate": 7.142251558989573e-06, + "loss": 0.6988, + "step": 1875 + }, + { + "epoch": 0.3784559034443194, + "grad_norm": 0.3470363914966583, + "learning_rate": 7.139298496908155e-06, + "loss": 0.8392, + "step": 1876 + }, + { + "epoch": 0.3786576390005264, + "grad_norm": 0.34533339738845825, + "learning_rate": 7.136344521086242e-06, + "loss": 0.6961, + "step": 1877 + }, + { + "epoch": 0.3788593745567334, + "grad_norm": 0.4589783847332001, + "learning_rate": 7.133389632785543e-06, + "loss": 0.868, + "step": 1878 + }, + { + "epoch": 0.3790611101129404, + "grad_norm": 0.9338458776473999, + "learning_rate": 7.1304338332681534e-06, + "loss": 0.7152, + "step": 1879 + }, + { + "epoch": 0.37926284566914736, + "grad_norm": 0.48263630270957947, + "learning_rate": 7.127477123796559e-06, + "loss": 0.8125, + "step": 1880 + }, + { + "epoch": 0.37946458122535437, + "grad_norm": 0.443023145198822, + "learning_rate": 7.124519505633633e-06, + "loss": 0.6722, + "step": 1881 + }, + { + "epoch": 0.3796663167815614, + "grad_norm": 0.45879748463630676, + "learning_rate": 7.121560980042641e-06, + "loss": 0.8428, + "step": 1882 + }, + { + "epoch": 0.3798680523377684, + "grad_norm": 0.5562877655029297, + "learning_rate": 7.11860154828723e-06, + "loss": 0.8586, + "step": 1883 + }, + { + "epoch": 0.38006978789397533, + "grad_norm": 0.6089495420455933, + "learning_rate": 7.1156412116314374e-06, + "loss": 0.7096, + "step": 1884 + }, + { + "epoch": 0.38027152345018234, + "grad_norm": 0.45152896642684937, + "learning_rate": 7.112679971339689e-06, + "loss": 0.6575, + "step": 1885 + }, + { + "epoch": 0.38047325900638934, + "grad_norm": 0.5453755259513855, + "learning_rate": 7.109717828676792e-06, + "loss": 0.6727, + "step": 1886 + }, + { + "epoch": 0.38067499456259635, + "grad_norm": 0.33480140566825867, + "learning_rate": 7.106754784907942e-06, + "loss": 0.6577, + "step": 1887 + }, + { + "epoch": 0.38087673011880335, + "grad_norm": 0.44161325693130493, + "learning_rate": 7.10379084129872e-06, + "loss": 0.704, + "step": 1888 + }, + { + "epoch": 0.3810784656750103, + "grad_norm": 0.4320142865180969, + "learning_rate": 7.100825999115089e-06, + "loss": 0.6513, + "step": 1889 + }, + { + "epoch": 0.3812802012312173, + "grad_norm": 0.38756075501441956, + "learning_rate": 7.097860259623397e-06, + "loss": 0.7126, + "step": 1890 + }, + { + "epoch": 0.3814819367874243, + "grad_norm": 0.45298585295677185, + "learning_rate": 7.094893624090375e-06, + "loss": 0.6851, + "step": 1891 + }, + { + "epoch": 0.3816836723436313, + "grad_norm": 0.3509814143180847, + "learning_rate": 7.091926093783139e-06, + "loss": 0.6829, + "step": 1892 + }, + { + "epoch": 0.38188540789983827, + "grad_norm": 0.5120080709457397, + "learning_rate": 7.088957669969182e-06, + "loss": 0.7134, + "step": 1893 + }, + { + "epoch": 0.3820871434560453, + "grad_norm": 0.389632910490036, + "learning_rate": 7.085988353916385e-06, + "loss": 0.7099, + "step": 1894 + }, + { + "epoch": 0.3822888790122523, + "grad_norm": 0.39382511377334595, + "learning_rate": 7.083018146893003e-06, + "loss": 0.6807, + "step": 1895 + }, + { + "epoch": 0.3824906145684593, + "grad_norm": 0.5153424739837646, + "learning_rate": 7.08004705016768e-06, + "loss": 0.6804, + "step": 1896 + }, + { + "epoch": 0.3826923501246663, + "grad_norm": 0.37517619132995605, + "learning_rate": 7.0770750650094335e-06, + "loss": 0.7282, + "step": 1897 + }, + { + "epoch": 0.38289408568087324, + "grad_norm": 0.705549418926239, + "learning_rate": 7.07410219268766e-06, + "loss": 0.6881, + "step": 1898 + }, + { + "epoch": 0.38309582123708025, + "grad_norm": 0.8489635586738586, + "learning_rate": 7.071128434472141e-06, + "loss": 0.7333, + "step": 1899 + }, + { + "epoch": 0.38329755679328725, + "grad_norm": 0.7200409173965454, + "learning_rate": 7.06815379163303e-06, + "loss": 0.6978, + "step": 1900 + }, + { + "epoch": 0.38349929234949426, + "grad_norm": 0.6117933392524719, + "learning_rate": 7.065178265440864e-06, + "loss": 0.6218, + "step": 1901 + }, + { + "epoch": 0.3837010279057012, + "grad_norm": 0.41154125332832336, + "learning_rate": 7.0622018571665514e-06, + "loss": 0.7046, + "step": 1902 + }, + { + "epoch": 0.3839027634619082, + "grad_norm": 0.5020780563354492, + "learning_rate": 7.059224568081381e-06, + "loss": 0.8056, + "step": 1903 + }, + { + "epoch": 0.3841044990181152, + "grad_norm": 0.7355788350105286, + "learning_rate": 7.056246399457019e-06, + "loss": 0.7505, + "step": 1904 + }, + { + "epoch": 0.3843062345743222, + "grad_norm": 0.8149549961090088, + "learning_rate": 7.053267352565504e-06, + "loss": 0.659, + "step": 1905 + }, + { + "epoch": 0.38450797013052923, + "grad_norm": 0.37660858035087585, + "learning_rate": 7.05028742867925e-06, + "loss": 0.723, + "step": 1906 + }, + { + "epoch": 0.3847097056867362, + "grad_norm": 0.5093876123428345, + "learning_rate": 7.047306629071048e-06, + "loss": 0.6905, + "step": 1907 + }, + { + "epoch": 0.3849114412429432, + "grad_norm": 0.47235724329948425, + "learning_rate": 7.044324955014062e-06, + "loss": 0.6641, + "step": 1908 + }, + { + "epoch": 0.3851131767991502, + "grad_norm": 0.5246691107749939, + "learning_rate": 7.04134240778183e-06, + "loss": 0.6654, + "step": 1909 + }, + { + "epoch": 0.3853149123553572, + "grad_norm": 0.41439288854599, + "learning_rate": 7.03835898864826e-06, + "loss": 0.6438, + "step": 1910 + }, + { + "epoch": 0.3855166479115642, + "grad_norm": 0.47759121656417847, + "learning_rate": 7.0353746988876345e-06, + "loss": 0.698, + "step": 1911 + }, + { + "epoch": 0.38571838346777115, + "grad_norm": 0.5373578071594238, + "learning_rate": 7.032389539774611e-06, + "loss": 0.7928, + "step": 1912 + }, + { + "epoch": 0.38592011902397816, + "grad_norm": 0.38632825016975403, + "learning_rate": 7.029403512584214e-06, + "loss": 0.6464, + "step": 1913 + }, + { + "epoch": 0.38612185458018516, + "grad_norm": 0.743184506893158, + "learning_rate": 7.026416618591838e-06, + "loss": 0.697, + "step": 1914 + }, + { + "epoch": 0.38632359013639217, + "grad_norm": 0.36446547508239746, + "learning_rate": 7.0234288590732516e-06, + "loss": 0.648, + "step": 1915 + }, + { + "epoch": 0.3865253256925991, + "grad_norm": 0.32801979780197144, + "learning_rate": 7.020440235304593e-06, + "loss": 0.6475, + "step": 1916 + }, + { + "epoch": 0.3867270612488061, + "grad_norm": 0.48668283224105835, + "learning_rate": 7.017450748562364e-06, + "loss": 0.9108, + "step": 1917 + }, + { + "epoch": 0.38692879680501313, + "grad_norm": 0.5561202764511108, + "learning_rate": 7.0144604001234405e-06, + "loss": 0.6994, + "step": 1918 + }, + { + "epoch": 0.38713053236122014, + "grad_norm": 0.3585546016693115, + "learning_rate": 7.011469191265066e-06, + "loss": 0.7193, + "step": 1919 + }, + { + "epoch": 0.38733226791742714, + "grad_norm": 0.627954363822937, + "learning_rate": 7.008477123264849e-06, + "loss": 0.6728, + "step": 1920 + }, + { + "epoch": 0.3875340034736341, + "grad_norm": 0.47432732582092285, + "learning_rate": 7.005484197400765e-06, + "loss": 0.6372, + "step": 1921 + }, + { + "epoch": 0.3877357390298411, + "grad_norm": 0.6232267618179321, + "learning_rate": 7.00249041495116e-06, + "loss": 0.8147, + "step": 1922 + }, + { + "epoch": 0.3879374745860481, + "grad_norm": 0.5535650849342346, + "learning_rate": 6.99949577719474e-06, + "loss": 0.6625, + "step": 1923 + }, + { + "epoch": 0.3881392101422551, + "grad_norm": 0.5244125127792358, + "learning_rate": 6.996500285410582e-06, + "loss": 0.7226, + "step": 1924 + }, + { + "epoch": 0.38834094569846206, + "grad_norm": 0.8325016498565674, + "learning_rate": 6.993503940878126e-06, + "loss": 0.7708, + "step": 1925 + }, + { + "epoch": 0.38854268125466906, + "grad_norm": 0.4260006546974182, + "learning_rate": 6.990506744877171e-06, + "loss": 0.7013, + "step": 1926 + }, + { + "epoch": 0.38874441681087607, + "grad_norm": 0.3316943049430847, + "learning_rate": 6.987508698687886e-06, + "loss": 0.6514, + "step": 1927 + }, + { + "epoch": 0.3889461523670831, + "grad_norm": 0.31761300563812256, + "learning_rate": 6.984509803590802e-06, + "loss": 0.6922, + "step": 1928 + }, + { + "epoch": 0.3891478879232901, + "grad_norm": 0.5712217092514038, + "learning_rate": 6.981510060866812e-06, + "loss": 0.7326, + "step": 1929 + }, + { + "epoch": 0.38934962347949703, + "grad_norm": 0.4465492367744446, + "learning_rate": 6.97850947179717e-06, + "loss": 0.9044, + "step": 1930 + }, + { + "epoch": 0.38955135903570404, + "grad_norm": 0.350298136472702, + "learning_rate": 6.97550803766349e-06, + "loss": 0.6799, + "step": 1931 + }, + { + "epoch": 0.38975309459191104, + "grad_norm": 0.4485641121864319, + "learning_rate": 6.972505759747754e-06, + "loss": 0.8262, + "step": 1932 + }, + { + "epoch": 0.38995483014811805, + "grad_norm": 0.5809760093688965, + "learning_rate": 6.969502639332298e-06, + "loss": 0.6894, + "step": 1933 + }, + { + "epoch": 0.390156565704325, + "grad_norm": 0.3777012825012207, + "learning_rate": 6.9664986776998155e-06, + "loss": 0.6713, + "step": 1934 + }, + { + "epoch": 0.390358301260532, + "grad_norm": 0.4996185898780823, + "learning_rate": 6.963493876133367e-06, + "loss": 0.6522, + "step": 1935 + }, + { + "epoch": 0.390560036816739, + "grad_norm": 0.7318883538246155, + "learning_rate": 6.960488235916367e-06, + "loss": 0.7009, + "step": 1936 + }, + { + "epoch": 0.390761772372946, + "grad_norm": 0.46615687012672424, + "learning_rate": 6.957481758332592e-06, + "loss": 0.8314, + "step": 1937 + }, + { + "epoch": 0.390963507929153, + "grad_norm": 0.3986649811267853, + "learning_rate": 6.954474444666169e-06, + "loss": 0.6622, + "step": 1938 + }, + { + "epoch": 0.39116524348535997, + "grad_norm": 0.3693607747554779, + "learning_rate": 6.951466296201587e-06, + "loss": 0.6999, + "step": 1939 + }, + { + "epoch": 0.391366979041567, + "grad_norm": 0.5240568518638611, + "learning_rate": 6.948457314223693e-06, + "loss": 0.8285, + "step": 1940 + }, + { + "epoch": 0.391568714597774, + "grad_norm": 0.5109261870384216, + "learning_rate": 6.945447500017689e-06, + "loss": 0.6924, + "step": 1941 + }, + { + "epoch": 0.391770450153981, + "grad_norm": 0.5352274179458618, + "learning_rate": 6.942436854869129e-06, + "loss": 0.6813, + "step": 1942 + }, + { + "epoch": 0.39197218571018794, + "grad_norm": 0.6227556467056274, + "learning_rate": 6.939425380063924e-06, + "loss": 0.6969, + "step": 1943 + }, + { + "epoch": 0.39217392126639494, + "grad_norm": 0.7124168276786804, + "learning_rate": 6.936413076888344e-06, + "loss": 0.6694, + "step": 1944 + }, + { + "epoch": 0.39237565682260195, + "grad_norm": 0.44350770115852356, + "learning_rate": 6.933399946629005e-06, + "loss": 0.653, + "step": 1945 + }, + { + "epoch": 0.39257739237880895, + "grad_norm": 0.48074135184288025, + "learning_rate": 6.930385990572879e-06, + "loss": 0.6888, + "step": 1946 + }, + { + "epoch": 0.39277912793501596, + "grad_norm": 0.400717169046402, + "learning_rate": 6.927371210007293e-06, + "loss": 0.7551, + "step": 1947 + }, + { + "epoch": 0.3929808634912229, + "grad_norm": 0.5485891699790955, + "learning_rate": 6.924355606219927e-06, + "loss": 0.7881, + "step": 1948 + }, + { + "epoch": 0.3931825990474299, + "grad_norm": 0.37272897362709045, + "learning_rate": 6.921339180498807e-06, + "loss": 0.6898, + "step": 1949 + }, + { + "epoch": 0.3933843346036369, + "grad_norm": 0.5032504796981812, + "learning_rate": 6.918321934132315e-06, + "loss": 0.7051, + "step": 1950 + }, + { + "epoch": 0.3935860701598439, + "grad_norm": 0.3550852835178375, + "learning_rate": 6.915303868409182e-06, + "loss": 0.6538, + "step": 1951 + }, + { + "epoch": 0.3937878057160509, + "grad_norm": 0.7708501219749451, + "learning_rate": 6.9122849846184895e-06, + "loss": 0.649, + "step": 1952 + }, + { + "epoch": 0.3939895412722579, + "grad_norm": 0.878424882888794, + "learning_rate": 6.909265284049664e-06, + "loss": 0.6987, + "step": 1953 + }, + { + "epoch": 0.3941912768284649, + "grad_norm": 0.4390408992767334, + "learning_rate": 6.90624476799249e-06, + "loss": 0.7713, + "step": 1954 + }, + { + "epoch": 0.3943930123846719, + "grad_norm": 0.7288705110549927, + "learning_rate": 6.903223437737092e-06, + "loss": 0.6889, + "step": 1955 + }, + { + "epoch": 0.3945947479408789, + "grad_norm": 1.3917056322097778, + "learning_rate": 6.900201294573946e-06, + "loss": 0.9714, + "step": 1956 + }, + { + "epoch": 0.39479648349708585, + "grad_norm": 0.41537991166114807, + "learning_rate": 6.897178339793875e-06, + "loss": 0.6585, + "step": 1957 + }, + { + "epoch": 0.39499821905329285, + "grad_norm": 1.4110198020935059, + "learning_rate": 6.894154574688046e-06, + "loss": 0.7028, + "step": 1958 + }, + { + "epoch": 0.39519995460949986, + "grad_norm": 0.4615449607372284, + "learning_rate": 6.891130000547979e-06, + "loss": 0.6434, + "step": 1959 + }, + { + "epoch": 0.39540169016570686, + "grad_norm": 0.5821374654769897, + "learning_rate": 6.888104618665529e-06, + "loss": 0.6351, + "step": 1960 + }, + { + "epoch": 0.3956034257219138, + "grad_norm": 0.7208396792411804, + "learning_rate": 6.885078430332905e-06, + "loss": 0.7269, + "step": 1961 + }, + { + "epoch": 0.3958051612781208, + "grad_norm": 0.544691801071167, + "learning_rate": 6.8820514368426565e-06, + "loss": 0.6914, + "step": 1962 + }, + { + "epoch": 0.3960068968343278, + "grad_norm": 0.4440980553627014, + "learning_rate": 6.879023639487676e-06, + "loss": 0.7014, + "step": 1963 + }, + { + "epoch": 0.39620863239053483, + "grad_norm": 0.6505277156829834, + "learning_rate": 6.875995039561206e-06, + "loss": 0.673, + "step": 1964 + }, + { + "epoch": 0.39641036794674184, + "grad_norm": 0.7644113898277283, + "learning_rate": 6.872965638356823e-06, + "loss": 0.7049, + "step": 1965 + }, + { + "epoch": 0.3966121035029488, + "grad_norm": 0.6489436626434326, + "learning_rate": 6.869935437168449e-06, + "loss": 0.6419, + "step": 1966 + }, + { + "epoch": 0.3968138390591558, + "grad_norm": 0.7834652662277222, + "learning_rate": 6.8669044372903495e-06, + "loss": 0.6944, + "step": 1967 + }, + { + "epoch": 0.3970155746153628, + "grad_norm": 0.35680845379829407, + "learning_rate": 6.86387264001713e-06, + "loss": 0.6467, + "step": 1968 + }, + { + "epoch": 0.3972173101715698, + "grad_norm": 0.9005623459815979, + "learning_rate": 6.860840046643736e-06, + "loss": 0.7223, + "step": 1969 + }, + { + "epoch": 0.39741904572777675, + "grad_norm": 0.5492415428161621, + "learning_rate": 6.857806658465453e-06, + "loss": 0.6834, + "step": 1970 + }, + { + "epoch": 0.39762078128398376, + "grad_norm": 0.3760004937648773, + "learning_rate": 6.854772476777909e-06, + "loss": 0.6643, + "step": 1971 + }, + { + "epoch": 0.39782251684019077, + "grad_norm": 0.36829209327697754, + "learning_rate": 6.851737502877066e-06, + "loss": 0.6687, + "step": 1972 + }, + { + "epoch": 0.39802425239639777, + "grad_norm": 0.4438265264034271, + "learning_rate": 6.8487017380592266e-06, + "loss": 0.7908, + "step": 1973 + }, + { + "epoch": 0.3982259879526048, + "grad_norm": 0.5089324712753296, + "learning_rate": 6.845665183621033e-06, + "loss": 0.7331, + "step": 1974 + }, + { + "epoch": 0.3984277235088117, + "grad_norm": 0.7725141644477844, + "learning_rate": 6.842627840859461e-06, + "loss": 1.1026, + "step": 1975 + }, + { + "epoch": 0.39862945906501873, + "grad_norm": 0.39934641122817993, + "learning_rate": 6.839589711071828e-06, + "loss": 0.7376, + "step": 1976 + }, + { + "epoch": 0.39883119462122574, + "grad_norm": 0.6319762468338013, + "learning_rate": 6.836550795555781e-06, + "loss": 1.0085, + "step": 1977 + }, + { + "epoch": 0.39903293017743274, + "grad_norm": 0.33538317680358887, + "learning_rate": 6.833511095609309e-06, + "loss": 0.6938, + "step": 1978 + }, + { + "epoch": 0.3992346657336397, + "grad_norm": 0.6401187777519226, + "learning_rate": 6.830470612530733e-06, + "loss": 0.7073, + "step": 1979 + }, + { + "epoch": 0.3994364012898467, + "grad_norm": 0.5242562890052795, + "learning_rate": 6.827429347618709e-06, + "loss": 0.7513, + "step": 1980 + }, + { + "epoch": 0.3996381368460537, + "grad_norm": 1.2201377153396606, + "learning_rate": 6.824387302172225e-06, + "loss": 0.6687, + "step": 1981 + }, + { + "epoch": 0.3998398724022607, + "grad_norm": 0.39438146352767944, + "learning_rate": 6.821344477490605e-06, + "loss": 0.6536, + "step": 1982 + }, + { + "epoch": 0.4000416079584677, + "grad_norm": 0.4241214692592621, + "learning_rate": 6.818300874873508e-06, + "loss": 0.8196, + "step": 1983 + }, + { + "epoch": 0.40024334351467467, + "grad_norm": 1.178167462348938, + "learning_rate": 6.815256495620919e-06, + "loss": 0.6902, + "step": 1984 + }, + { + "epoch": 0.40044507907088167, + "grad_norm": 0.3699394464492798, + "learning_rate": 6.812211341033158e-06, + "loss": 0.6524, + "step": 1985 + }, + { + "epoch": 0.4006468146270887, + "grad_norm": 1.5307899713516235, + "learning_rate": 6.8091654124108765e-06, + "loss": 0.7088, + "step": 1986 + }, + { + "epoch": 0.4008485501832957, + "grad_norm": 0.925547182559967, + "learning_rate": 6.8061187110550586e-06, + "loss": 0.701, + "step": 1987 + }, + { + "epoch": 0.40105028573950263, + "grad_norm": 0.5421550869941711, + "learning_rate": 6.803071238267011e-06, + "loss": 0.6445, + "step": 1988 + }, + { + "epoch": 0.40125202129570964, + "grad_norm": 0.5064270496368408, + "learning_rate": 6.800022995348381e-06, + "loss": 0.7091, + "step": 1989 + }, + { + "epoch": 0.40145375685191664, + "grad_norm": 0.4140400290489197, + "learning_rate": 6.796973983601135e-06, + "loss": 0.6602, + "step": 1990 + }, + { + "epoch": 0.40165549240812365, + "grad_norm": 0.49159950017929077, + "learning_rate": 6.793924204327572e-06, + "loss": 0.6981, + "step": 1991 + }, + { + "epoch": 0.40185722796433065, + "grad_norm": 0.3385809361934662, + "learning_rate": 6.790873658830321e-06, + "loss": 0.6857, + "step": 1992 + }, + { + "epoch": 0.4020589635205376, + "grad_norm": 2.379497528076172, + "learning_rate": 6.787822348412333e-06, + "loss": 0.7229, + "step": 1993 + }, + { + "epoch": 0.4022606990767446, + "grad_norm": 0.310107946395874, + "learning_rate": 6.784770274376888e-06, + "loss": 0.7501, + "step": 1994 + }, + { + "epoch": 0.4024624346329516, + "grad_norm": 0.503961980342865, + "learning_rate": 6.781717438027594e-06, + "loss": 0.6566, + "step": 1995 + }, + { + "epoch": 0.4026641701891586, + "grad_norm": 0.3085574805736542, + "learning_rate": 6.7786638406683845e-06, + "loss": 0.6601, + "step": 1996 + }, + { + "epoch": 0.40286590574536557, + "grad_norm": 0.6734681129455566, + "learning_rate": 6.775609483603516e-06, + "loss": 0.7521, + "step": 1997 + }, + { + "epoch": 0.4030676413015726, + "grad_norm": 0.4044412076473236, + "learning_rate": 6.772554368137567e-06, + "loss": 0.8514, + "step": 1998 + }, + { + "epoch": 0.4032693768577796, + "grad_norm": 0.7736945152282715, + "learning_rate": 6.7694984955754465e-06, + "loss": 0.665, + "step": 1999 + }, + { + "epoch": 0.4034711124139866, + "grad_norm": 0.5825895667076111, + "learning_rate": 6.766441867222384e-06, + "loss": 0.705, + "step": 2000 + }, + { + "epoch": 0.4036728479701936, + "grad_norm": 5.248704433441162, + "learning_rate": 6.763384484383929e-06, + "loss": 0.6642, + "step": 2001 + }, + { + "epoch": 0.40387458352640054, + "grad_norm": 0.5176589488983154, + "learning_rate": 6.760326348365955e-06, + "loss": 0.6548, + "step": 2002 + }, + { + "epoch": 0.40407631908260755, + "grad_norm": 0.41134941577911377, + "learning_rate": 6.757267460474663e-06, + "loss": 0.6557, + "step": 2003 + }, + { + "epoch": 0.40427805463881455, + "grad_norm": 3.7623648643493652, + "learning_rate": 6.754207822016565e-06, + "loss": 0.7574, + "step": 2004 + }, + { + "epoch": 0.40447979019502156, + "grad_norm": 1.6524170637130737, + "learning_rate": 6.7511474342985e-06, + "loss": 0.6833, + "step": 2005 + }, + { + "epoch": 0.40468152575122857, + "grad_norm": 1.2042206525802612, + "learning_rate": 6.748086298627624e-06, + "loss": 0.6984, + "step": 2006 + }, + { + "epoch": 0.4048832613074355, + "grad_norm": 0.7427788376808167, + "learning_rate": 6.745024416311418e-06, + "loss": 0.6826, + "step": 2007 + }, + { + "epoch": 0.4050849968636425, + "grad_norm": 0.5733217597007751, + "learning_rate": 6.7419617886576735e-06, + "loss": 0.6935, + "step": 2008 + }, + { + "epoch": 0.4052867324198495, + "grad_norm": 0.3138233423233032, + "learning_rate": 6.738898416974507e-06, + "loss": 0.7866, + "step": 2009 + }, + { + "epoch": 0.40548846797605653, + "grad_norm": 2.404067277908325, + "learning_rate": 6.7358343025703506e-06, + "loss": 0.6897, + "step": 2010 + }, + { + "epoch": 0.4056902035322635, + "grad_norm": 0.4286998212337494, + "learning_rate": 6.732769446753954e-06, + "loss": 0.7369, + "step": 2011 + }, + { + "epoch": 0.4058919390884705, + "grad_norm": 0.8496800661087036, + "learning_rate": 6.729703850834381e-06, + "loss": 0.7423, + "step": 2012 + }, + { + "epoch": 0.4060936746446775, + "grad_norm": 0.5670628547668457, + "learning_rate": 6.7266375161210175e-06, + "loss": 0.7706, + "step": 2013 + }, + { + "epoch": 0.4062954102008845, + "grad_norm": 0.5016106367111206, + "learning_rate": 6.723570443923557e-06, + "loss": 0.6655, + "step": 2014 + }, + { + "epoch": 0.4064971457570915, + "grad_norm": 0.4219368100166321, + "learning_rate": 6.7205026355520145e-06, + "loss": 0.7931, + "step": 2015 + }, + { + "epoch": 0.40669888131329845, + "grad_norm": 0.5831752419471741, + "learning_rate": 6.717434092316716e-06, + "loss": 0.753, + "step": 2016 + }, + { + "epoch": 0.40690061686950546, + "grad_norm": 0.5141821503639221, + "learning_rate": 6.7143648155283025e-06, + "loss": 0.7004, + "step": 2017 + }, + { + "epoch": 0.40710235242571247, + "grad_norm": 0.43418243527412415, + "learning_rate": 6.71129480649773e-06, + "loss": 0.684, + "step": 2018 + }, + { + "epoch": 0.40730408798191947, + "grad_norm": 0.3431943356990814, + "learning_rate": 6.708224066536263e-06, + "loss": 0.7785, + "step": 2019 + }, + { + "epoch": 0.4075058235381264, + "grad_norm": 0.40281733870506287, + "learning_rate": 6.705152596955483e-06, + "loss": 0.6922, + "step": 2020 + }, + { + "epoch": 0.4077075590943334, + "grad_norm": 0.6138989925384521, + "learning_rate": 6.70208039906728e-06, + "loss": 0.6413, + "step": 2021 + }, + { + "epoch": 0.40790929465054043, + "grad_norm": 0.8132925033569336, + "learning_rate": 6.699007474183854e-06, + "loss": 0.8472, + "step": 2022 + }, + { + "epoch": 0.40811103020674744, + "grad_norm": 0.5439250469207764, + "learning_rate": 6.695933823617719e-06, + "loss": 0.7646, + "step": 2023 + }, + { + "epoch": 0.40831276576295444, + "grad_norm": 0.48923900723457336, + "learning_rate": 6.6928594486817e-06, + "loss": 0.6508, + "step": 2024 + }, + { + "epoch": 0.4085145013191614, + "grad_norm": 0.6131086945533752, + "learning_rate": 6.689784350688926e-06, + "loss": 0.6921, + "step": 2025 + }, + { + "epoch": 0.4087162368753684, + "grad_norm": 0.44216862320899963, + "learning_rate": 6.686708530952836e-06, + "loss": 0.7758, + "step": 2026 + }, + { + "epoch": 0.4089179724315754, + "grad_norm": 0.4120861291885376, + "learning_rate": 6.6836319907871825e-06, + "loss": 1.0358, + "step": 2027 + }, + { + "epoch": 0.4091197079877824, + "grad_norm": 0.6558131575584412, + "learning_rate": 6.68055473150602e-06, + "loss": 0.7933, + "step": 2028 + }, + { + "epoch": 0.40932144354398936, + "grad_norm": 1.0560377836227417, + "learning_rate": 6.677476754423714e-06, + "loss": 1.1087, + "step": 2029 + }, + { + "epoch": 0.40952317910019637, + "grad_norm": 0.3615643382072449, + "learning_rate": 6.674398060854931e-06, + "loss": 0.7684, + "step": 2030 + }, + { + "epoch": 0.40972491465640337, + "grad_norm": 0.636316180229187, + "learning_rate": 6.671318652114652e-06, + "loss": 0.6682, + "step": 2031 + }, + { + "epoch": 0.4099266502126104, + "grad_norm": 1.1835805177688599, + "learning_rate": 6.668238529518157e-06, + "loss": 0.7633, + "step": 2032 + }, + { + "epoch": 0.4101283857688174, + "grad_norm": 0.6155040860176086, + "learning_rate": 6.66515769438103e-06, + "loss": 0.6595, + "step": 2033 + }, + { + "epoch": 0.41033012132502433, + "grad_norm": 0.5014193058013916, + "learning_rate": 6.662076148019168e-06, + "loss": 0.7761, + "step": 2034 + }, + { + "epoch": 0.41053185688123134, + "grad_norm": 0.4848555624485016, + "learning_rate": 6.65899389174876e-06, + "loss": 0.6308, + "step": 2035 + }, + { + "epoch": 0.41073359243743834, + "grad_norm": 0.44919511675834656, + "learning_rate": 6.655910926886308e-06, + "loss": 0.6908, + "step": 2036 + }, + { + "epoch": 0.41093532799364535, + "grad_norm": 1.0742298364639282, + "learning_rate": 6.65282725474861e-06, + "loss": 0.7018, + "step": 2037 + }, + { + "epoch": 0.4111370635498523, + "grad_norm": 0.46099919080734253, + "learning_rate": 6.649742876652772e-06, + "loss": 0.8293, + "step": 2038 + }, + { + "epoch": 0.4113387991060593, + "grad_norm": 0.6284064650535583, + "learning_rate": 6.646657793916196e-06, + "loss": 0.679, + "step": 2039 + }, + { + "epoch": 0.4115405346622663, + "grad_norm": 0.4924802780151367, + "learning_rate": 6.643572007856587e-06, + "loss": 0.8707, + "step": 2040 + }, + { + "epoch": 0.4117422702184733, + "grad_norm": 0.4875141978263855, + "learning_rate": 6.640485519791953e-06, + "loss": 0.6645, + "step": 2041 + }, + { + "epoch": 0.4119440057746803, + "grad_norm": 1.0110448598861694, + "learning_rate": 6.637398331040597e-06, + "loss": 0.7211, + "step": 2042 + }, + { + "epoch": 0.41214574133088727, + "grad_norm": 0.8589099049568176, + "learning_rate": 6.634310442921124e-06, + "loss": 0.6869, + "step": 2043 + }, + { + "epoch": 0.4123474768870943, + "grad_norm": 0.5140877962112427, + "learning_rate": 6.63122185675244e-06, + "loss": 0.624, + "step": 2044 + }, + { + "epoch": 0.4125492124433013, + "grad_norm": 0.43134400248527527, + "learning_rate": 6.628132573853745e-06, + "loss": 0.6992, + "step": 2045 + }, + { + "epoch": 0.4127509479995083, + "grad_norm": 0.6019126772880554, + "learning_rate": 6.6250425955445386e-06, + "loss": 0.7856, + "step": 2046 + }, + { + "epoch": 0.41295268355571524, + "grad_norm": 1.1939761638641357, + "learning_rate": 6.621951923144616e-06, + "loss": 0.8442, + "step": 2047 + }, + { + "epoch": 0.41315441911192224, + "grad_norm": 0.8242788314819336, + "learning_rate": 6.618860557974073e-06, + "loss": 0.6458, + "step": 2048 + }, + { + "epoch": 0.41335615466812925, + "grad_norm": 0.6458016037940979, + "learning_rate": 6.615768501353297e-06, + "loss": 0.7739, + "step": 2049 + }, + { + "epoch": 0.41355789022433626, + "grad_norm": 0.5104942917823792, + "learning_rate": 6.612675754602968e-06, + "loss": 0.7799, + "step": 2050 + }, + { + "epoch": 0.41375962578054326, + "grad_norm": 0.6379034519195557, + "learning_rate": 6.60958231904407e-06, + "loss": 0.7387, + "step": 2051 + }, + { + "epoch": 0.4139613613367502, + "grad_norm": 1.0765997171401978, + "learning_rate": 6.606488195997876e-06, + "loss": 0.681, + "step": 2052 + }, + { + "epoch": 0.4141630968929572, + "grad_norm": 0.9007116556167603, + "learning_rate": 6.603393386785948e-06, + "loss": 0.7555, + "step": 2053 + }, + { + "epoch": 0.4143648324491642, + "grad_norm": 0.48005685210227966, + "learning_rate": 6.60029789273015e-06, + "loss": 0.6721, + "step": 2054 + }, + { + "epoch": 0.4145665680053712, + "grad_norm": 0.7423443794250488, + "learning_rate": 6.5972017151526325e-06, + "loss": 1.1185, + "step": 2055 + }, + { + "epoch": 0.4147683035615782, + "grad_norm": 0.4530531167984009, + "learning_rate": 6.59410485537584e-06, + "loss": 1.0106, + "step": 2056 + }, + { + "epoch": 0.4149700391177852, + "grad_norm": 0.42010125517845154, + "learning_rate": 6.591007314722508e-06, + "loss": 0.7337, + "step": 2057 + }, + { + "epoch": 0.4151717746739922, + "grad_norm": 0.4540041387081146, + "learning_rate": 6.587909094515663e-06, + "loss": 0.6685, + "step": 2058 + }, + { + "epoch": 0.4153735102301992, + "grad_norm": 0.6442490816116333, + "learning_rate": 6.584810196078622e-06, + "loss": 0.6358, + "step": 2059 + }, + { + "epoch": 0.4155752457864062, + "grad_norm": 0.42252811789512634, + "learning_rate": 6.58171062073499e-06, + "loss": 0.6725, + "step": 2060 + }, + { + "epoch": 0.41577698134261315, + "grad_norm": 0.7584266662597656, + "learning_rate": 6.578610369808663e-06, + "loss": 0.7934, + "step": 2061 + }, + { + "epoch": 0.41597871689882016, + "grad_norm": 0.5483881831169128, + "learning_rate": 6.575509444623825e-06, + "loss": 0.7312, + "step": 2062 + }, + { + "epoch": 0.41618045245502716, + "grad_norm": 0.4764006733894348, + "learning_rate": 6.57240784650495e-06, + "loss": 0.7083, + "step": 2063 + }, + { + "epoch": 0.41638218801123417, + "grad_norm": 0.8024596571922302, + "learning_rate": 6.569305576776794e-06, + "loss": 0.832, + "step": 2064 + }, + { + "epoch": 0.4165839235674411, + "grad_norm": 0.5202115178108215, + "learning_rate": 6.566202636764406e-06, + "loss": 0.6965, + "step": 2065 + }, + { + "epoch": 0.4167856591236481, + "grad_norm": 0.8023049831390381, + "learning_rate": 6.563099027793116e-06, + "loss": 0.6656, + "step": 2066 + }, + { + "epoch": 0.41698739467985513, + "grad_norm": 0.3753010928630829, + "learning_rate": 6.559994751188545e-06, + "loss": 0.688, + "step": 2067 + }, + { + "epoch": 0.41718913023606213, + "grad_norm": 0.6880113482475281, + "learning_rate": 6.5568898082765945e-06, + "loss": 0.6656, + "step": 2068 + }, + { + "epoch": 0.41739086579226914, + "grad_norm": 0.64495450258255, + "learning_rate": 6.553784200383453e-06, + "loss": 0.6485, + "step": 2069 + }, + { + "epoch": 0.4175926013484761, + "grad_norm": 0.7110615968704224, + "learning_rate": 6.550677928835592e-06, + "loss": 0.7502, + "step": 2070 + }, + { + "epoch": 0.4177943369046831, + "grad_norm": 0.4838223457336426, + "learning_rate": 6.54757099495977e-06, + "loss": 0.6866, + "step": 2071 + }, + { + "epoch": 0.4179960724608901, + "grad_norm": 0.9055322408676147, + "learning_rate": 6.544463400083021e-06, + "loss": 0.6705, + "step": 2072 + }, + { + "epoch": 0.4181978080170971, + "grad_norm": 0.38350167870521545, + "learning_rate": 6.541355145532669e-06, + "loss": 0.6856, + "step": 2073 + }, + { + "epoch": 0.41839954357330406, + "grad_norm": 0.6029995083808899, + "learning_rate": 6.538246232636316e-06, + "loss": 0.6904, + "step": 2074 + }, + { + "epoch": 0.41860127912951106, + "grad_norm": 0.36843833327293396, + "learning_rate": 6.535136662721844e-06, + "loss": 0.7428, + "step": 2075 + }, + { + "epoch": 0.41880301468571807, + "grad_norm": 0.7784242630004883, + "learning_rate": 6.5320264371174195e-06, + "loss": 0.7155, + "step": 2076 + }, + { + "epoch": 0.4190047502419251, + "grad_norm": 0.38750869035720825, + "learning_rate": 6.528915557151484e-06, + "loss": 0.708, + "step": 2077 + }, + { + "epoch": 0.4192064857981321, + "grad_norm": 0.9216073155403137, + "learning_rate": 6.525804024152765e-06, + "loss": 0.8106, + "step": 2078 + }, + { + "epoch": 0.41940822135433903, + "grad_norm": 0.7102227807044983, + "learning_rate": 6.522691839450262e-06, + "loss": 0.7246, + "step": 2079 + }, + { + "epoch": 0.41960995691054603, + "grad_norm": 0.6181046962738037, + "learning_rate": 6.51957900437326e-06, + "loss": 0.6921, + "step": 2080 + }, + { + "epoch": 0.41981169246675304, + "grad_norm": 0.432643324136734, + "learning_rate": 6.5164655202513135e-06, + "loss": 0.6676, + "step": 2081 + }, + { + "epoch": 0.42001342802296004, + "grad_norm": 0.37533438205718994, + "learning_rate": 6.5133513884142605e-06, + "loss": 0.6286, + "step": 2082 + }, + { + "epoch": 0.420215163579167, + "grad_norm": 0.7802932858467102, + "learning_rate": 6.510236610192215e-06, + "loss": 0.6887, + "step": 2083 + }, + { + "epoch": 0.420416899135374, + "grad_norm": 0.3948320746421814, + "learning_rate": 6.507121186915567e-06, + "loss": 0.6774, + "step": 2084 + }, + { + "epoch": 0.420618634691581, + "grad_norm": 0.3722754418849945, + "learning_rate": 6.5040051199149755e-06, + "loss": 0.8555, + "step": 2085 + }, + { + "epoch": 0.420820370247788, + "grad_norm": 0.4183351695537567, + "learning_rate": 6.500888410521385e-06, + "loss": 0.8037, + "step": 2086 + }, + { + "epoch": 0.421022105803995, + "grad_norm": 0.48188161849975586, + "learning_rate": 6.497771060066008e-06, + "loss": 0.7292, + "step": 2087 + }, + { + "epoch": 0.42122384136020197, + "grad_norm": 1.078413724899292, + "learning_rate": 6.494653069880332e-06, + "loss": 0.7182, + "step": 2088 + }, + { + "epoch": 0.421425576916409, + "grad_norm": 0.39380356669425964, + "learning_rate": 6.491534441296117e-06, + "loss": 0.6689, + "step": 2089 + }, + { + "epoch": 0.421627312472616, + "grad_norm": 0.39865002036094666, + "learning_rate": 6.488415175645395e-06, + "loss": 0.6478, + "step": 2090 + }, + { + "epoch": 0.421829048028823, + "grad_norm": 0.5632877945899963, + "learning_rate": 6.485295274260476e-06, + "loss": 0.8674, + "step": 2091 + }, + { + "epoch": 0.42203078358502993, + "grad_norm": 0.2843717336654663, + "learning_rate": 6.4821747384739344e-06, + "loss": 0.653, + "step": 2092 + }, + { + "epoch": 0.42223251914123694, + "grad_norm": 0.4226774573326111, + "learning_rate": 6.479053569618616e-06, + "loss": 0.7398, + "step": 2093 + }, + { + "epoch": 0.42243425469744394, + "grad_norm": 0.3823719918727875, + "learning_rate": 6.475931769027643e-06, + "loss": 0.6444, + "step": 2094 + }, + { + "epoch": 0.42263599025365095, + "grad_norm": 1.0371155738830566, + "learning_rate": 6.472809338034405e-06, + "loss": 0.6676, + "step": 2095 + }, + { + "epoch": 0.42283772580985796, + "grad_norm": 0.5268231630325317, + "learning_rate": 6.469686277972556e-06, + "loss": 0.749, + "step": 2096 + }, + { + "epoch": 0.4230394613660649, + "grad_norm": 0.35234931111335754, + "learning_rate": 6.466562590176021e-06, + "loss": 0.6727, + "step": 2097 + }, + { + "epoch": 0.4232411969222719, + "grad_norm": 4.1712775230407715, + "learning_rate": 6.463438275978998e-06, + "loss": 0.7347, + "step": 2098 + }, + { + "epoch": 0.4234429324784789, + "grad_norm": 0.4240151047706604, + "learning_rate": 6.4603133367159486e-06, + "loss": 0.8569, + "step": 2099 + }, + { + "epoch": 0.4236446680346859, + "grad_norm": 0.40674498677253723, + "learning_rate": 6.4571877737216e-06, + "loss": 0.66, + "step": 2100 + }, + { + "epoch": 0.4238464035908929, + "grad_norm": 0.37719449400901794, + "learning_rate": 6.454061588330947e-06, + "loss": 0.6394, + "step": 2101 + }, + { + "epoch": 0.4240481391470999, + "grad_norm": 0.6937292218208313, + "learning_rate": 6.450934781879254e-06, + "loss": 0.6966, + "step": 2102 + }, + { + "epoch": 0.4242498747033069, + "grad_norm": 0.6843068599700928, + "learning_rate": 6.447807355702047e-06, + "loss": 0.7033, + "step": 2103 + }, + { + "epoch": 0.4244516102595139, + "grad_norm": 0.4702962338924408, + "learning_rate": 6.444679311135112e-06, + "loss": 0.8195, + "step": 2104 + }, + { + "epoch": 0.4246533458157209, + "grad_norm": 1.1637814044952393, + "learning_rate": 6.441550649514509e-06, + "loss": 0.6992, + "step": 2105 + }, + { + "epoch": 0.42485508137192785, + "grad_norm": 0.48818790912628174, + "learning_rate": 6.4384213721765565e-06, + "loss": 0.7211, + "step": 2106 + }, + { + "epoch": 0.42505681692813485, + "grad_norm": 0.47552841901779175, + "learning_rate": 6.4352914804578345e-06, + "loss": 0.8105, + "step": 2107 + }, + { + "epoch": 0.42525855248434186, + "grad_norm": 0.4511565566062927, + "learning_rate": 6.43216097569519e-06, + "loss": 0.7177, + "step": 2108 + }, + { + "epoch": 0.42546028804054886, + "grad_norm": 0.38846555352211, + "learning_rate": 6.429029859225725e-06, + "loss": 0.6874, + "step": 2109 + }, + { + "epoch": 0.42566202359675587, + "grad_norm": 0.5311095714569092, + "learning_rate": 6.42589813238681e-06, + "loss": 0.8165, + "step": 2110 + }, + { + "epoch": 0.4258637591529628, + "grad_norm": 1.02147376537323, + "learning_rate": 6.422765796516071e-06, + "loss": 0.7223, + "step": 2111 + }, + { + "epoch": 0.4260654947091698, + "grad_norm": 0.3974895477294922, + "learning_rate": 6.419632852951398e-06, + "loss": 0.8251, + "step": 2112 + }, + { + "epoch": 0.42626723026537683, + "grad_norm": 0.6259453296661377, + "learning_rate": 6.416499303030939e-06, + "loss": 0.9337, + "step": 2113 + }, + { + "epoch": 0.42646896582158383, + "grad_norm": 0.5818927884101868, + "learning_rate": 6.413365148093097e-06, + "loss": 0.6606, + "step": 2114 + }, + { + "epoch": 0.4266707013777908, + "grad_norm": 2.3462257385253906, + "learning_rate": 6.410230389476542e-06, + "loss": 0.6489, + "step": 2115 + }, + { + "epoch": 0.4268724369339978, + "grad_norm": 1.1948621273040771, + "learning_rate": 6.407095028520194e-06, + "loss": 0.8109, + "step": 2116 + }, + { + "epoch": 0.4270741724902048, + "grad_norm": 1.4963581562042236, + "learning_rate": 6.403959066563234e-06, + "loss": 0.7941, + "step": 2117 + }, + { + "epoch": 0.4272759080464118, + "grad_norm": 0.8830330967903137, + "learning_rate": 6.4008225049450974e-06, + "loss": 0.8084, + "step": 2118 + }, + { + "epoch": 0.4274776436026188, + "grad_norm": 0.3570726811885834, + "learning_rate": 6.397685345005482e-06, + "loss": 0.6967, + "step": 2119 + }, + { + "epoch": 0.42767937915882576, + "grad_norm": 0.5791197419166565, + "learning_rate": 6.394547588084331e-06, + "loss": 0.8182, + "step": 2120 + }, + { + "epoch": 0.42788111471503276, + "grad_norm": 0.39418864250183105, + "learning_rate": 6.3914092355218494e-06, + "loss": 0.7964, + "step": 2121 + }, + { + "epoch": 0.42808285027123977, + "grad_norm": 0.8419885039329529, + "learning_rate": 6.388270288658498e-06, + "loss": 0.7087, + "step": 2122 + }, + { + "epoch": 0.4282845858274468, + "grad_norm": 0.9715898036956787, + "learning_rate": 6.385130748834986e-06, + "loss": 0.7973, + "step": 2123 + }, + { + "epoch": 0.4284863213836537, + "grad_norm": 0.5298724174499512, + "learning_rate": 6.38199061739228e-06, + "loss": 0.6844, + "step": 2124 + }, + { + "epoch": 0.42868805693986073, + "grad_norm": 0.4409395754337311, + "learning_rate": 6.378849895671594e-06, + "loss": 0.6847, + "step": 2125 + }, + { + "epoch": 0.42888979249606773, + "grad_norm": 0.6608665585517883, + "learning_rate": 6.375708585014403e-06, + "loss": 0.6416, + "step": 2126 + }, + { + "epoch": 0.42909152805227474, + "grad_norm": 0.35339251160621643, + "learning_rate": 6.372566686762427e-06, + "loss": 0.6711, + "step": 2127 + }, + { + "epoch": 0.42929326360848175, + "grad_norm": 0.3852989375591278, + "learning_rate": 6.369424202257637e-06, + "loss": 0.6684, + "step": 2128 + }, + { + "epoch": 0.4294949991646887, + "grad_norm": 0.428529292345047, + "learning_rate": 6.366281132842256e-06, + "loss": 0.8359, + "step": 2129 + }, + { + "epoch": 0.4296967347208957, + "grad_norm": 0.45827290415763855, + "learning_rate": 6.363137479858759e-06, + "loss": 0.8202, + "step": 2130 + }, + { + "epoch": 0.4298984702771027, + "grad_norm": 0.3035009503364563, + "learning_rate": 6.359993244649865e-06, + "loss": 0.6537, + "step": 2131 + }, + { + "epoch": 0.4301002058333097, + "grad_norm": 0.39818131923675537, + "learning_rate": 6.356848428558546e-06, + "loss": 0.6618, + "step": 2132 + }, + { + "epoch": 0.43030194138951666, + "grad_norm": 0.47690775990486145, + "learning_rate": 6.35370303292802e-06, + "loss": 0.6937, + "step": 2133 + }, + { + "epoch": 0.43050367694572367, + "grad_norm": 0.4136514365673065, + "learning_rate": 6.350557059101757e-06, + "loss": 0.6429, + "step": 2134 + }, + { + "epoch": 0.4307054125019307, + "grad_norm": 0.6138867139816284, + "learning_rate": 6.347410508423464e-06, + "loss": 0.6956, + "step": 2135 + }, + { + "epoch": 0.4309071480581377, + "grad_norm": 0.8477742075920105, + "learning_rate": 6.344263382237106e-06, + "loss": 0.7875, + "step": 2136 + }, + { + "epoch": 0.4311088836143447, + "grad_norm": 0.5474502444267273, + "learning_rate": 6.341115681886885e-06, + "loss": 0.6698, + "step": 2137 + }, + { + "epoch": 0.43131061917055163, + "grad_norm": 0.6278495192527771, + "learning_rate": 6.337967408717254e-06, + "loss": 0.6913, + "step": 2138 + }, + { + "epoch": 0.43151235472675864, + "grad_norm": 0.40545186400413513, + "learning_rate": 6.334818564072906e-06, + "loss": 0.629, + "step": 2139 + }, + { + "epoch": 0.43171409028296565, + "grad_norm": 0.4732603430747986, + "learning_rate": 6.331669149298781e-06, + "loss": 0.6154, + "step": 2140 + }, + { + "epoch": 0.43191582583917265, + "grad_norm": 0.7837668657302856, + "learning_rate": 6.328519165740063e-06, + "loss": 0.77, + "step": 2141 + }, + { + "epoch": 0.4321175613953796, + "grad_norm": 0.35569074749946594, + "learning_rate": 6.325368614742177e-06, + "loss": 0.661, + "step": 2142 + }, + { + "epoch": 0.4323192969515866, + "grad_norm": 0.5048931837081909, + "learning_rate": 6.322217497650794e-06, + "loss": 0.6205, + "step": 2143 + }, + { + "epoch": 0.4325210325077936, + "grad_norm": 0.4467771053314209, + "learning_rate": 6.3190658158118205e-06, + "loss": 0.6533, + "step": 2144 + }, + { + "epoch": 0.4327227680640006, + "grad_norm": 0.3703531324863434, + "learning_rate": 6.315913570571408e-06, + "loss": 0.6979, + "step": 2145 + }, + { + "epoch": 0.4329245036202076, + "grad_norm": 0.8720201849937439, + "learning_rate": 6.312760763275949e-06, + "loss": 0.6939, + "step": 2146 + }, + { + "epoch": 0.4331262391764146, + "grad_norm": 0.3208676278591156, + "learning_rate": 6.3096073952720775e-06, + "loss": 0.6957, + "step": 2147 + }, + { + "epoch": 0.4333279747326216, + "grad_norm": 0.351581335067749, + "learning_rate": 6.306453467906663e-06, + "loss": 0.658, + "step": 2148 + }, + { + "epoch": 0.4335297102888286, + "grad_norm": 0.44626230001449585, + "learning_rate": 6.303298982526813e-06, + "loss": 0.7386, + "step": 2149 + }, + { + "epoch": 0.4337314458450356, + "grad_norm": 0.42948535084724426, + "learning_rate": 6.300143940479881e-06, + "loss": 0.6874, + "step": 2150 + }, + { + "epoch": 0.43393318140124254, + "grad_norm": 0.4214824140071869, + "learning_rate": 6.296988343113453e-06, + "loss": 0.8205, + "step": 2151 + }, + { + "epoch": 0.43413491695744955, + "grad_norm": 0.36822807788848877, + "learning_rate": 6.29383219177535e-06, + "loss": 0.6291, + "step": 2152 + }, + { + "epoch": 0.43433665251365655, + "grad_norm": 0.9206970930099487, + "learning_rate": 6.290675487813632e-06, + "loss": 0.695, + "step": 2153 + }, + { + "epoch": 0.43453838806986356, + "grad_norm": 0.3771558403968811, + "learning_rate": 6.2875182325765995e-06, + "loss": 0.8117, + "step": 2154 + }, + { + "epoch": 0.43474012362607056, + "grad_norm": 0.8105612993240356, + "learning_rate": 6.284360427412781e-06, + "loss": 0.6993, + "step": 2155 + }, + { + "epoch": 0.4349418591822775, + "grad_norm": 0.9626283645629883, + "learning_rate": 6.281202073670942e-06, + "loss": 0.6851, + "step": 2156 + }, + { + "epoch": 0.4351435947384845, + "grad_norm": 0.3421020209789276, + "learning_rate": 6.2780431727000865e-06, + "loss": 0.6535, + "step": 2157 + }, + { + "epoch": 0.4353453302946915, + "grad_norm": 0.40301311016082764, + "learning_rate": 6.274883725849449e-06, + "loss": 0.7201, + "step": 2158 + }, + { + "epoch": 0.43554706585089853, + "grad_norm": 0.32986417412757874, + "learning_rate": 6.271723734468496e-06, + "loss": 0.7125, + "step": 2159 + }, + { + "epoch": 0.4357488014071055, + "grad_norm": 0.3987983167171478, + "learning_rate": 6.268563199906925e-06, + "loss": 0.668, + "step": 2160 + }, + { + "epoch": 0.4359505369633125, + "grad_norm": 1.0826027393341064, + "learning_rate": 6.2654021235146745e-06, + "loss": 0.8437, + "step": 2161 + }, + { + "epoch": 0.4361522725195195, + "grad_norm": 0.6496895551681519, + "learning_rate": 6.2622405066419046e-06, + "loss": 0.694, + "step": 2162 + }, + { + "epoch": 0.4363540080757265, + "grad_norm": 0.9457875490188599, + "learning_rate": 6.25907835063901e-06, + "loss": 0.6251, + "step": 2163 + }, + { + "epoch": 0.4365557436319335, + "grad_norm": 0.4565484821796417, + "learning_rate": 6.2559156568566185e-06, + "loss": 0.7898, + "step": 2164 + }, + { + "epoch": 0.43675747918814045, + "grad_norm": 1.442725658416748, + "learning_rate": 6.252752426645581e-06, + "loss": 0.7358, + "step": 2165 + }, + { + "epoch": 0.43695921474434746, + "grad_norm": 1.2952994108200073, + "learning_rate": 6.249588661356983e-06, + "loss": 0.6899, + "step": 2166 + }, + { + "epoch": 0.43716095030055446, + "grad_norm": 1.1040253639221191, + "learning_rate": 6.246424362342139e-06, + "loss": 0.7184, + "step": 2167 + }, + { + "epoch": 0.43736268585676147, + "grad_norm": 1.4704509973526, + "learning_rate": 6.243259530952585e-06, + "loss": 0.6943, + "step": 2168 + }, + { + "epoch": 0.4375644214129684, + "grad_norm": 1.258582592010498, + "learning_rate": 6.240094168540091e-06, + "loss": 0.8548, + "step": 2169 + }, + { + "epoch": 0.4377661569691754, + "grad_norm": 0.42416098713874817, + "learning_rate": 6.236928276456652e-06, + "loss": 0.7267, + "step": 2170 + }, + { + "epoch": 0.43796789252538243, + "grad_norm": 0.40356701612472534, + "learning_rate": 6.233761856054488e-06, + "loss": 0.7985, + "step": 2171 + }, + { + "epoch": 0.43816962808158944, + "grad_norm": 0.360816091299057, + "learning_rate": 6.230594908686045e-06, + "loss": 0.6553, + "step": 2172 + }, + { + "epoch": 0.43837136363779644, + "grad_norm": 0.8857923746109009, + "learning_rate": 6.227427435703997e-06, + "loss": 0.6654, + "step": 2173 + }, + { + "epoch": 0.4385730991940034, + "grad_norm": 0.4565891921520233, + "learning_rate": 6.224259438461235e-06, + "loss": 0.7137, + "step": 2174 + }, + { + "epoch": 0.4387748347502104, + "grad_norm": 0.4995157718658447, + "learning_rate": 6.221090918310885e-06, + "loss": 0.9537, + "step": 2175 + }, + { + "epoch": 0.4389765703064174, + "grad_norm": 0.4343486428260803, + "learning_rate": 6.217921876606285e-06, + "loss": 0.6915, + "step": 2176 + }, + { + "epoch": 0.4391783058626244, + "grad_norm": 0.37906357645988464, + "learning_rate": 6.214752314701003e-06, + "loss": 0.8003, + "step": 2177 + }, + { + "epoch": 0.43938004141883136, + "grad_norm": 0.5478761792182922, + "learning_rate": 6.2115822339488296e-06, + "loss": 0.7823, + "step": 2178 + }, + { + "epoch": 0.43958177697503836, + "grad_norm": 0.5704029202461243, + "learning_rate": 6.208411635703771e-06, + "loss": 0.6973, + "step": 2179 + }, + { + "epoch": 0.43978351253124537, + "grad_norm": 0.5843176245689392, + "learning_rate": 6.205240521320059e-06, + "loss": 0.6887, + "step": 2180 + }, + { + "epoch": 0.4399852480874524, + "grad_norm": 0.3240997791290283, + "learning_rate": 6.2020688921521454e-06, + "loss": 0.761, + "step": 2181 + }, + { + "epoch": 0.4401869836436594, + "grad_norm": 0.3329768478870392, + "learning_rate": 6.1988967495547016e-06, + "loss": 0.627, + "step": 2182 + }, + { + "epoch": 0.44038871919986633, + "grad_norm": 0.4571416974067688, + "learning_rate": 6.195724094882618e-06, + "loss": 0.8011, + "step": 2183 + }, + { + "epoch": 0.44059045475607334, + "grad_norm": 0.5178765058517456, + "learning_rate": 6.192550929491002e-06, + "loss": 0.6937, + "step": 2184 + }, + { + "epoch": 0.44079219031228034, + "grad_norm": 1.414166808128357, + "learning_rate": 6.189377254735184e-06, + "loss": 0.6913, + "step": 2185 + }, + { + "epoch": 0.44099392586848735, + "grad_norm": 0.5404744744300842, + "learning_rate": 6.186203071970708e-06, + "loss": 0.8938, + "step": 2186 + }, + { + "epoch": 0.4411956614246943, + "grad_norm": 0.4626302123069763, + "learning_rate": 6.183028382553334e-06, + "loss": 0.6291, + "step": 2187 + }, + { + "epoch": 0.4413973969809013, + "grad_norm": 0.5483360290527344, + "learning_rate": 6.179853187839041e-06, + "loss": 0.678, + "step": 2188 + }, + { + "epoch": 0.4415991325371083, + "grad_norm": 0.3046145737171173, + "learning_rate": 6.176677489184024e-06, + "loss": 0.6738, + "step": 2189 + }, + { + "epoch": 0.4418008680933153, + "grad_norm": 0.6439849138259888, + "learning_rate": 6.173501287944692e-06, + "loss": 0.6438, + "step": 2190 + }, + { + "epoch": 0.4420026036495223, + "grad_norm": 0.5328143239021301, + "learning_rate": 6.170324585477669e-06, + "loss": 0.6751, + "step": 2191 + }, + { + "epoch": 0.44220433920572927, + "grad_norm": 1.3237669467926025, + "learning_rate": 6.167147383139793e-06, + "loss": 0.7781, + "step": 2192 + }, + { + "epoch": 0.4424060747619363, + "grad_norm": 0.4817297160625458, + "learning_rate": 6.163969682288115e-06, + "loss": 0.6905, + "step": 2193 + }, + { + "epoch": 0.4426078103181433, + "grad_norm": 0.6590097546577454, + "learning_rate": 6.160791484279901e-06, + "loss": 0.6987, + "step": 2194 + }, + { + "epoch": 0.4428095458743503, + "grad_norm": 0.3818492293357849, + "learning_rate": 6.157612790472626e-06, + "loss": 0.6845, + "step": 2195 + }, + { + "epoch": 0.44301128143055724, + "grad_norm": 0.5485851764678955, + "learning_rate": 6.154433602223979e-06, + "loss": 0.6867, + "step": 2196 + }, + { + "epoch": 0.44321301698676424, + "grad_norm": 1.5924900770187378, + "learning_rate": 6.1512539208918634e-06, + "loss": 0.6557, + "step": 2197 + }, + { + "epoch": 0.44341475254297125, + "grad_norm": 0.5956331491470337, + "learning_rate": 6.1480737478343844e-06, + "loss": 0.6194, + "step": 2198 + }, + { + "epoch": 0.44361648809917825, + "grad_norm": 0.3365457355976105, + "learning_rate": 6.144893084409865e-06, + "loss": 0.6527, + "step": 2199 + }, + { + "epoch": 0.44381822365538526, + "grad_norm": 0.47032344341278076, + "learning_rate": 6.141711931976835e-06, + "loss": 0.8081, + "step": 2200 + }, + { + "epoch": 0.4440199592115922, + "grad_norm": 0.6154392957687378, + "learning_rate": 6.138530291894033e-06, + "loss": 0.6787, + "step": 2201 + }, + { + "epoch": 0.4442216947677992, + "grad_norm": 0.39080673456192017, + "learning_rate": 6.135348165520405e-06, + "loss": 0.6672, + "step": 2202 + }, + { + "epoch": 0.4444234303240062, + "grad_norm": 0.4939618408679962, + "learning_rate": 6.132165554215108e-06, + "loss": 0.7767, + "step": 2203 + }, + { + "epoch": 0.4446251658802132, + "grad_norm": 0.6760823130607605, + "learning_rate": 6.128982459337502e-06, + "loss": 0.6899, + "step": 2204 + }, + { + "epoch": 0.44482690143642023, + "grad_norm": 0.5723358392715454, + "learning_rate": 6.1257988822471556e-06, + "loss": 0.6011, + "step": 2205 + }, + { + "epoch": 0.4450286369926272, + "grad_norm": 0.30772972106933594, + "learning_rate": 6.122614824303845e-06, + "loss": 0.779, + "step": 2206 + }, + { + "epoch": 0.4452303725488342, + "grad_norm": 0.9321260452270508, + "learning_rate": 6.119430286867548e-06, + "loss": 0.6848, + "step": 2207 + }, + { + "epoch": 0.4454321081050412, + "grad_norm": 0.7563874125480652, + "learning_rate": 6.11624527129845e-06, + "loss": 0.6603, + "step": 2208 + }, + { + "epoch": 0.4456338436612482, + "grad_norm": 0.4485842287540436, + "learning_rate": 6.1130597789569376e-06, + "loss": 0.7239, + "step": 2209 + }, + { + "epoch": 0.44583557921745515, + "grad_norm": 0.3827153742313385, + "learning_rate": 6.109873811203609e-06, + "loss": 0.8143, + "step": 2210 + }, + { + "epoch": 0.44603731477366215, + "grad_norm": 0.41676798462867737, + "learning_rate": 6.106687369399254e-06, + "loss": 0.6945, + "step": 2211 + }, + { + "epoch": 0.44623905032986916, + "grad_norm": 0.5404744148254395, + "learning_rate": 6.103500454904871e-06, + "loss": 0.6511, + "step": 2212 + }, + { + "epoch": 0.44644078588607616, + "grad_norm": 0.6789629459381104, + "learning_rate": 6.100313069081662e-06, + "loss": 0.6963, + "step": 2213 + }, + { + "epoch": 0.44664252144228317, + "grad_norm": 0.3863263428211212, + "learning_rate": 6.097125213291029e-06, + "loss": 0.6801, + "step": 2214 + }, + { + "epoch": 0.4468442569984901, + "grad_norm": 1.1624631881713867, + "learning_rate": 6.093936888894573e-06, + "loss": 0.7014, + "step": 2215 + }, + { + "epoch": 0.4470459925546971, + "grad_norm": 0.3568146526813507, + "learning_rate": 6.0907480972540915e-06, + "loss": 0.6531, + "step": 2216 + }, + { + "epoch": 0.44724772811090413, + "grad_norm": 0.5413874387741089, + "learning_rate": 6.087558839731594e-06, + "loss": 0.6698, + "step": 2217 + }, + { + "epoch": 0.44744946366711114, + "grad_norm": 0.5732948780059814, + "learning_rate": 6.084369117689276e-06, + "loss": 0.659, + "step": 2218 + }, + { + "epoch": 0.4476511992233181, + "grad_norm": 0.3843459486961365, + "learning_rate": 6.0811789324895365e-06, + "loss": 0.6573, + "step": 2219 + }, + { + "epoch": 0.4478529347795251, + "grad_norm": 0.4645818769931793, + "learning_rate": 6.0779882854949745e-06, + "loss": 0.6783, + "step": 2220 + }, + { + "epoch": 0.4480546703357321, + "grad_norm": 0.4214332103729248, + "learning_rate": 6.074797178068385e-06, + "loss": 0.685, + "step": 2221 + }, + { + "epoch": 0.4482564058919391, + "grad_norm": 0.4040057361125946, + "learning_rate": 6.071605611572755e-06, + "loss": 0.6768, + "step": 2222 + }, + { + "epoch": 0.4484581414481461, + "grad_norm": 0.5158857107162476, + "learning_rate": 6.068413587371274e-06, + "loss": 0.7078, + "step": 2223 + }, + { + "epoch": 0.44865987700435306, + "grad_norm": 0.37071144580841064, + "learning_rate": 6.0652211068273226e-06, + "loss": 0.6374, + "step": 2224 + }, + { + "epoch": 0.44886161256056006, + "grad_norm": 0.4070124924182892, + "learning_rate": 6.062028171304481e-06, + "loss": 0.7792, + "step": 2225 + }, + { + "epoch": 0.44906334811676707, + "grad_norm": 0.40504300594329834, + "learning_rate": 6.058834782166516e-06, + "loss": 0.6528, + "step": 2226 + }, + { + "epoch": 0.4492650836729741, + "grad_norm": 0.553276538848877, + "learning_rate": 6.055640940777398e-06, + "loss": 0.6665, + "step": 2227 + }, + { + "epoch": 0.449466819229181, + "grad_norm": 0.41161584854125977, + "learning_rate": 6.052446648501283e-06, + "loss": 0.8529, + "step": 2228 + }, + { + "epoch": 0.44966855478538803, + "grad_norm": 0.4330577552318573, + "learning_rate": 6.049251906702522e-06, + "loss": 0.8221, + "step": 2229 + }, + { + "epoch": 0.44987029034159504, + "grad_norm": 0.43640753626823425, + "learning_rate": 6.046056716745659e-06, + "loss": 0.6714, + "step": 2230 + }, + { + "epoch": 0.45007202589780204, + "grad_norm": 0.5692388415336609, + "learning_rate": 6.042861079995428e-06, + "loss": 0.8885, + "step": 2231 + }, + { + "epoch": 0.45027376145400905, + "grad_norm": 0.49734705686569214, + "learning_rate": 6.039664997816753e-06, + "loss": 0.6671, + "step": 2232 + }, + { + "epoch": 0.450475497010216, + "grad_norm": 0.3377302289009094, + "learning_rate": 6.036468471574751e-06, + "loss": 0.7033, + "step": 2233 + }, + { + "epoch": 0.450677232566423, + "grad_norm": 0.40693655610084534, + "learning_rate": 6.033271502634729e-06, + "loss": 0.666, + "step": 2234 + }, + { + "epoch": 0.45087896812263, + "grad_norm": 0.4038968086242676, + "learning_rate": 6.030074092362178e-06, + "loss": 0.6822, + "step": 2235 + }, + { + "epoch": 0.451080703678837, + "grad_norm": 0.7137503623962402, + "learning_rate": 6.026876242122782e-06, + "loss": 0.6872, + "step": 2236 + }, + { + "epoch": 0.45128243923504396, + "grad_norm": 0.4246538579463959, + "learning_rate": 6.023677953282412e-06, + "loss": 0.8179, + "step": 2237 + }, + { + "epoch": 0.45148417479125097, + "grad_norm": 0.5865968465805054, + "learning_rate": 6.020479227207127e-06, + "loss": 0.6563, + "step": 2238 + }, + { + "epoch": 0.451685910347458, + "grad_norm": 0.40608182549476624, + "learning_rate": 6.0172800652631706e-06, + "loss": 0.7347, + "step": 2239 + }, + { + "epoch": 0.451887645903665, + "grad_norm": 0.7402623891830444, + "learning_rate": 6.014080468816972e-06, + "loss": 0.7196, + "step": 2240 + }, + { + "epoch": 0.452089381459872, + "grad_norm": 0.37510138750076294, + "learning_rate": 6.010880439235153e-06, + "loss": 1.0391, + "step": 2241 + }, + { + "epoch": 0.45229111701607894, + "grad_norm": 0.45044469833374023, + "learning_rate": 6.0076799778845105e-06, + "loss": 0.6827, + "step": 2242 + }, + { + "epoch": 0.45249285257228594, + "grad_norm": 0.4569462835788727, + "learning_rate": 6.004479086132033e-06, + "loss": 0.6673, + "step": 2243 + }, + { + "epoch": 0.45269458812849295, + "grad_norm": 0.9873457551002502, + "learning_rate": 6.001277765344888e-06, + "loss": 0.641, + "step": 2244 + }, + { + "epoch": 0.45289632368469995, + "grad_norm": 2.229550361633301, + "learning_rate": 5.998076016890432e-06, + "loss": 0.7482, + "step": 2245 + }, + { + "epoch": 0.4530980592409069, + "grad_norm": 0.4194773733615875, + "learning_rate": 5.994873842136198e-06, + "loss": 0.6847, + "step": 2246 + }, + { + "epoch": 0.4532997947971139, + "grad_norm": 0.3104577958583832, + "learning_rate": 5.991671242449906e-06, + "loss": 0.7933, + "step": 2247 + }, + { + "epoch": 0.4535015303533209, + "grad_norm": 0.3971177637577057, + "learning_rate": 5.988468219199451e-06, + "loss": 0.8049, + "step": 2248 + }, + { + "epoch": 0.4537032659095279, + "grad_norm": 0.34223371744155884, + "learning_rate": 5.985264773752919e-06, + "loss": 0.6613, + "step": 2249 + }, + { + "epoch": 0.4539050014657349, + "grad_norm": 0.4462783932685852, + "learning_rate": 5.982060907478568e-06, + "loss": 0.8033, + "step": 2250 + }, + { + "epoch": 0.4541067370219419, + "grad_norm": 0.9888842701911926, + "learning_rate": 5.978856621744837e-06, + "loss": 0.7698, + "step": 2251 + }, + { + "epoch": 0.4543084725781489, + "grad_norm": 0.4110148549079895, + "learning_rate": 5.975651917920347e-06, + "loss": 0.8328, + "step": 2252 + }, + { + "epoch": 0.4545102081343559, + "grad_norm": 1.3794505596160889, + "learning_rate": 5.9724467973738965e-06, + "loss": 0.6795, + "step": 2253 + }, + { + "epoch": 0.4547119436905629, + "grad_norm": 0.5329458713531494, + "learning_rate": 5.969241261474461e-06, + "loss": 0.7191, + "step": 2254 + }, + { + "epoch": 0.45491367924676984, + "grad_norm": 0.3493848145008087, + "learning_rate": 5.966035311591194e-06, + "loss": 0.8103, + "step": 2255 + }, + { + "epoch": 0.45511541480297685, + "grad_norm": 0.6665728092193604, + "learning_rate": 5.962828949093424e-06, + "loss": 0.7297, + "step": 2256 + }, + { + "epoch": 0.45531715035918385, + "grad_norm": 0.39233672618865967, + "learning_rate": 5.959622175350661e-06, + "loss": 0.6564, + "step": 2257 + }, + { + "epoch": 0.45551888591539086, + "grad_norm": 0.4845694303512573, + "learning_rate": 5.9564149917325845e-06, + "loss": 0.7227, + "step": 2258 + }, + { + "epoch": 0.45572062147159786, + "grad_norm": 0.5083968639373779, + "learning_rate": 5.953207399609053e-06, + "loss": 0.7313, + "step": 2259 + }, + { + "epoch": 0.4559223570278048, + "grad_norm": 0.6814498901367188, + "learning_rate": 5.9499994003500975e-06, + "loss": 0.6717, + "step": 2260 + }, + { + "epoch": 0.4561240925840118, + "grad_norm": 0.3610351085662842, + "learning_rate": 5.946790995325924e-06, + "loss": 0.7272, + "step": 2261 + }, + { + "epoch": 0.4563258281402188, + "grad_norm": 0.3892957270145416, + "learning_rate": 5.943582185906911e-06, + "loss": 0.6967, + "step": 2262 + }, + { + "epoch": 0.45652756369642583, + "grad_norm": 0.45099541544914246, + "learning_rate": 5.940372973463612e-06, + "loss": 0.9041, + "step": 2263 + }, + { + "epoch": 0.4567292992526328, + "grad_norm": 0.41133007407188416, + "learning_rate": 5.937163359366747e-06, + "loss": 0.7562, + "step": 2264 + }, + { + "epoch": 0.4569310348088398, + "grad_norm": 1.6504077911376953, + "learning_rate": 5.933953344987215e-06, + "loss": 0.7812, + "step": 2265 + }, + { + "epoch": 0.4571327703650468, + "grad_norm": 0.4914233982563019, + "learning_rate": 5.9307429316960805e-06, + "loss": 0.6183, + "step": 2266 + }, + { + "epoch": 0.4573345059212538, + "grad_norm": 0.5438753962516785, + "learning_rate": 5.927532120864582e-06, + "loss": 0.6406, + "step": 2267 + }, + { + "epoch": 0.4575362414774608, + "grad_norm": 0.7342792749404907, + "learning_rate": 5.924320913864124e-06, + "loss": 0.6925, + "step": 2268 + }, + { + "epoch": 0.45773797703366775, + "grad_norm": 0.4444006681442261, + "learning_rate": 5.921109312066282e-06, + "loss": 0.7297, + "step": 2269 + }, + { + "epoch": 0.45793971258987476, + "grad_norm": 1.223875880241394, + "learning_rate": 5.917897316842803e-06, + "loss": 0.6645, + "step": 2270 + }, + { + "epoch": 0.45814144814608176, + "grad_norm": 0.3372178077697754, + "learning_rate": 5.914684929565596e-06, + "loss": 0.7088, + "step": 2271 + }, + { + "epoch": 0.45834318370228877, + "grad_norm": 0.5606435537338257, + "learning_rate": 5.911472151606743e-06, + "loss": 0.8502, + "step": 2272 + }, + { + "epoch": 0.4585449192584957, + "grad_norm": 0.32245010137557983, + "learning_rate": 5.908258984338491e-06, + "loss": 0.7166, + "step": 2273 + }, + { + "epoch": 0.4587466548147027, + "grad_norm": 0.8593288660049438, + "learning_rate": 5.905045429133252e-06, + "loss": 0.6902, + "step": 2274 + }, + { + "epoch": 0.45894839037090973, + "grad_norm": 0.4007122218608856, + "learning_rate": 5.901831487363605e-06, + "loss": 0.6997, + "step": 2275 + }, + { + "epoch": 0.45915012592711674, + "grad_norm": 0.6089879274368286, + "learning_rate": 5.8986171604022925e-06, + "loss": 0.7023, + "step": 2276 + }, + { + "epoch": 0.45935186148332374, + "grad_norm": 0.4027574360370636, + "learning_rate": 5.895402449622226e-06, + "loss": 0.6861, + "step": 2277 + }, + { + "epoch": 0.4595535970395307, + "grad_norm": 0.9821425676345825, + "learning_rate": 5.8921873563964745e-06, + "loss": 0.6767, + "step": 2278 + }, + { + "epoch": 0.4597553325957377, + "grad_norm": 0.41719210147857666, + "learning_rate": 5.8889718820982754e-06, + "loss": 0.8088, + "step": 2279 + }, + { + "epoch": 0.4599570681519447, + "grad_norm": 0.5477870106697083, + "learning_rate": 5.885756028101025e-06, + "loss": 0.6984, + "step": 2280 + }, + { + "epoch": 0.4601588037081517, + "grad_norm": 0.37010979652404785, + "learning_rate": 5.882539795778287e-06, + "loss": 0.7369, + "step": 2281 + }, + { + "epoch": 0.46036053926435866, + "grad_norm": 6.755899906158447, + "learning_rate": 5.879323186503783e-06, + "loss": 0.7295, + "step": 2282 + }, + { + "epoch": 0.46056227482056566, + "grad_norm": 0.49411219358444214, + "learning_rate": 5.876106201651392e-06, + "loss": 0.6711, + "step": 2283 + }, + { + "epoch": 0.46076401037677267, + "grad_norm": 0.47513481974601746, + "learning_rate": 5.872888842595163e-06, + "loss": 0.6993, + "step": 2284 + }, + { + "epoch": 0.4609657459329797, + "grad_norm": 0.3968546986579895, + "learning_rate": 5.869671110709296e-06, + "loss": 0.8708, + "step": 2285 + }, + { + "epoch": 0.4611674814891867, + "grad_norm": 0.6728366613388062, + "learning_rate": 5.866453007368154e-06, + "loss": 0.6404, + "step": 2286 + }, + { + "epoch": 0.46136921704539363, + "grad_norm": 0.7965627312660217, + "learning_rate": 5.86323453394626e-06, + "loss": 0.649, + "step": 2287 + }, + { + "epoch": 0.46157095260160064, + "grad_norm": 0.8242064118385315, + "learning_rate": 5.860015691818292e-06, + "loss": 0.6907, + "step": 2288 + }, + { + "epoch": 0.46177268815780764, + "grad_norm": 0.3499292731285095, + "learning_rate": 5.856796482359089e-06, + "loss": 0.6556, + "step": 2289 + }, + { + "epoch": 0.46197442371401465, + "grad_norm": 0.3247630000114441, + "learning_rate": 5.853576906943641e-06, + "loss": 0.8573, + "step": 2290 + }, + { + "epoch": 0.4621761592702216, + "grad_norm": 0.5543577671051025, + "learning_rate": 5.8503569669471e-06, + "loss": 0.6823, + "step": 2291 + }, + { + "epoch": 0.4623778948264286, + "grad_norm": 0.8258817791938782, + "learning_rate": 5.847136663744772e-06, + "loss": 0.8946, + "step": 2292 + }, + { + "epoch": 0.4625796303826356, + "grad_norm": 0.7056267857551575, + "learning_rate": 5.843915998712117e-06, + "loss": 0.8463, + "step": 2293 + }, + { + "epoch": 0.4627813659388426, + "grad_norm": 0.6977629065513611, + "learning_rate": 5.840694973224752e-06, + "loss": 0.6792, + "step": 2294 + }, + { + "epoch": 0.4629831014950496, + "grad_norm": 0.30397582054138184, + "learning_rate": 5.837473588658444e-06, + "loss": 0.6731, + "step": 2295 + }, + { + "epoch": 0.46318483705125657, + "grad_norm": 0.8290293216705322, + "learning_rate": 5.8342518463891195e-06, + "loss": 0.6895, + "step": 2296 + }, + { + "epoch": 0.4633865726074636, + "grad_norm": 1.0042904615402222, + "learning_rate": 5.831029747792851e-06, + "loss": 0.6827, + "step": 2297 + }, + { + "epoch": 0.4635883081636706, + "grad_norm": 0.8199013471603394, + "learning_rate": 5.827807294245867e-06, + "loss": 0.6703, + "step": 2298 + }, + { + "epoch": 0.4637900437198776, + "grad_norm": 0.4754752516746521, + "learning_rate": 5.824584487124546e-06, + "loss": 0.8643, + "step": 2299 + }, + { + "epoch": 0.4639917792760846, + "grad_norm": 0.33841854333877563, + "learning_rate": 5.821361327805419e-06, + "loss": 0.7325, + "step": 2300 + }, + { + "epoch": 0.46419351483229154, + "grad_norm": 0.8162549734115601, + "learning_rate": 5.8181378176651696e-06, + "loss": 0.6413, + "step": 2301 + }, + { + "epoch": 0.46439525038849855, + "grad_norm": 0.7209305167198181, + "learning_rate": 5.814913958080625e-06, + "loss": 0.6518, + "step": 2302 + }, + { + "epoch": 0.46459698594470555, + "grad_norm": 0.6614891886711121, + "learning_rate": 5.811689750428765e-06, + "loss": 0.7149, + "step": 2303 + }, + { + "epoch": 0.46479872150091256, + "grad_norm": 0.5822973251342773, + "learning_rate": 5.808465196086719e-06, + "loss": 0.7242, + "step": 2304 + }, + { + "epoch": 0.4650004570571195, + "grad_norm": 0.904875636100769, + "learning_rate": 5.805240296431765e-06, + "loss": 0.6861, + "step": 2305 + }, + { + "epoch": 0.4652021926133265, + "grad_norm": 0.4102171063423157, + "learning_rate": 5.802015052841328e-06, + "loss": 0.6944, + "step": 2306 + }, + { + "epoch": 0.4654039281695335, + "grad_norm": 0.5328947305679321, + "learning_rate": 5.798789466692974e-06, + "loss": 0.6568, + "step": 2307 + }, + { + "epoch": 0.4656056637257405, + "grad_norm": 0.7053859233856201, + "learning_rate": 5.795563539364424e-06, + "loss": 0.6781, + "step": 2308 + }, + { + "epoch": 0.46580739928194753, + "grad_norm": 0.38549986481666565, + "learning_rate": 5.7923372722335415e-06, + "loss": 0.6573, + "step": 2309 + }, + { + "epoch": 0.4660091348381545, + "grad_norm": 0.5039394497871399, + "learning_rate": 5.7891106666783325e-06, + "loss": 0.6786, + "step": 2310 + }, + { + "epoch": 0.4662108703943615, + "grad_norm": 0.3344820439815521, + "learning_rate": 5.78588372407695e-06, + "loss": 0.9494, + "step": 2311 + }, + { + "epoch": 0.4664126059505685, + "grad_norm": 0.451581746339798, + "learning_rate": 5.782656445807695e-06, + "loss": 0.6447, + "step": 2312 + }, + { + "epoch": 0.4666143415067755, + "grad_norm": 1.1283626556396484, + "learning_rate": 5.779428833249003e-06, + "loss": 0.6672, + "step": 2313 + }, + { + "epoch": 0.46681607706298245, + "grad_norm": 0.5923398733139038, + "learning_rate": 5.776200887779458e-06, + "loss": 0.6433, + "step": 2314 + }, + { + "epoch": 0.46701781261918945, + "grad_norm": 0.8438357710838318, + "learning_rate": 5.7729726107777855e-06, + "loss": 0.6469, + "step": 2315 + }, + { + "epoch": 0.46721954817539646, + "grad_norm": 0.8239353895187378, + "learning_rate": 5.769744003622852e-06, + "loss": 0.7574, + "step": 2316 + }, + { + "epoch": 0.46742128373160347, + "grad_norm": 0.6511779427528381, + "learning_rate": 5.766515067693665e-06, + "loss": 0.8149, + "step": 2317 + }, + { + "epoch": 0.46762301928781047, + "grad_norm": 0.5120037794113159, + "learning_rate": 5.7632858043693726e-06, + "loss": 0.7148, + "step": 2318 + }, + { + "epoch": 0.4678247548440174, + "grad_norm": 0.5220741629600525, + "learning_rate": 5.760056215029263e-06, + "loss": 0.6585, + "step": 2319 + }, + { + "epoch": 0.4680264904002244, + "grad_norm": 0.3358466327190399, + "learning_rate": 5.756826301052764e-06, + "loss": 0.6783, + "step": 2320 + }, + { + "epoch": 0.46822822595643143, + "grad_norm": 0.7241223454475403, + "learning_rate": 5.753596063819441e-06, + "loss": 0.6732, + "step": 2321 + }, + { + "epoch": 0.46842996151263844, + "grad_norm": 0.45295026898384094, + "learning_rate": 5.750365504708998e-06, + "loss": 0.6714, + "step": 2322 + }, + { + "epoch": 0.4686316970688454, + "grad_norm": 0.7881321310997009, + "learning_rate": 5.747134625101275e-06, + "loss": 0.6594, + "step": 2323 + }, + { + "epoch": 0.4688334326250524, + "grad_norm": 0.6307108402252197, + "learning_rate": 5.7439034263762526e-06, + "loss": 0.6983, + "step": 2324 + }, + { + "epoch": 0.4690351681812594, + "grad_norm": 0.3908398151397705, + "learning_rate": 5.740671909914044e-06, + "loss": 0.7215, + "step": 2325 + }, + { + "epoch": 0.4692369037374664, + "grad_norm": 0.33366483449935913, + "learning_rate": 5.7374400770949e-06, + "loss": 0.671, + "step": 2326 + }, + { + "epoch": 0.4694386392936734, + "grad_norm": 0.4603020250797272, + "learning_rate": 5.734207929299206e-06, + "loss": 0.6932, + "step": 2327 + }, + { + "epoch": 0.46964037484988036, + "grad_norm": 0.6322477459907532, + "learning_rate": 5.730975467907481e-06, + "loss": 0.8183, + "step": 2328 + }, + { + "epoch": 0.46984211040608737, + "grad_norm": 0.3986794650554657, + "learning_rate": 5.727742694300381e-06, + "loss": 0.6771, + "step": 2329 + }, + { + "epoch": 0.47004384596229437, + "grad_norm": 0.3935398459434509, + "learning_rate": 5.724509609858693e-06, + "loss": 0.6629, + "step": 2330 + }, + { + "epoch": 0.4702455815185014, + "grad_norm": 0.38128989934921265, + "learning_rate": 5.7212762159633335e-06, + "loss": 0.7115, + "step": 2331 + }, + { + "epoch": 0.4704473170747083, + "grad_norm": 0.7728134393692017, + "learning_rate": 5.718042513995359e-06, + "loss": 0.6499, + "step": 2332 + }, + { + "epoch": 0.47064905263091533, + "grad_norm": 0.8129534721374512, + "learning_rate": 5.714808505335952e-06, + "loss": 0.8108, + "step": 2333 + }, + { + "epoch": 0.47085078818712234, + "grad_norm": 0.47534510493278503, + "learning_rate": 5.711574191366427e-06, + "loss": 0.7435, + "step": 2334 + }, + { + "epoch": 0.47105252374332934, + "grad_norm": 0.700268566608429, + "learning_rate": 5.708339573468227e-06, + "loss": 0.6906, + "step": 2335 + }, + { + "epoch": 0.47125425929953635, + "grad_norm": 0.34931257367134094, + "learning_rate": 5.705104653022931e-06, + "loss": 0.667, + "step": 2336 + }, + { + "epoch": 0.4714559948557433, + "grad_norm": 0.9690538644790649, + "learning_rate": 5.701869431412243e-06, + "loss": 0.6849, + "step": 2337 + }, + { + "epoch": 0.4716577304119503, + "grad_norm": 0.36594158411026, + "learning_rate": 5.698633910017993e-06, + "loss": 0.6393, + "step": 2338 + }, + { + "epoch": 0.4718594659681573, + "grad_norm": 0.34695130586624146, + "learning_rate": 5.695398090222141e-06, + "loss": 0.7614, + "step": 2339 + }, + { + "epoch": 0.4720612015243643, + "grad_norm": 0.5252466797828674, + "learning_rate": 5.69216197340678e-06, + "loss": 0.8025, + "step": 2340 + }, + { + "epoch": 0.47226293708057127, + "grad_norm": 0.34003615379333496, + "learning_rate": 5.6889255609541236e-06, + "loss": 0.669, + "step": 2341 + }, + { + "epoch": 0.47246467263677827, + "grad_norm": 0.8574584722518921, + "learning_rate": 5.68568885424651e-06, + "loss": 0.6524, + "step": 2342 + }, + { + "epoch": 0.4726664081929853, + "grad_norm": 0.39517006278038025, + "learning_rate": 5.682451854666411e-06, + "loss": 0.6406, + "step": 2343 + }, + { + "epoch": 0.4728681437491923, + "grad_norm": 0.6471952199935913, + "learning_rate": 5.6792145635964156e-06, + "loss": 0.8623, + "step": 2344 + }, + { + "epoch": 0.4730698793053993, + "grad_norm": 0.4071753919124603, + "learning_rate": 5.675976982419243e-06, + "loss": 0.8123, + "step": 2345 + }, + { + "epoch": 0.47327161486160624, + "grad_norm": 0.5992933511734009, + "learning_rate": 5.672739112517732e-06, + "loss": 0.764, + "step": 2346 + }, + { + "epoch": 0.47347335041781324, + "grad_norm": 0.3176419734954834, + "learning_rate": 5.669500955274847e-06, + "loss": 0.6406, + "step": 2347 + }, + { + "epoch": 0.47367508597402025, + "grad_norm": 0.5436950922012329, + "learning_rate": 5.666262512073676e-06, + "loss": 0.6682, + "step": 2348 + }, + { + "epoch": 0.47387682153022725, + "grad_norm": 0.5929723381996155, + "learning_rate": 5.663023784297426e-06, + "loss": 0.6594, + "step": 2349 + }, + { + "epoch": 0.4740785570864342, + "grad_norm": 0.9469460248947144, + "learning_rate": 5.65978477332943e-06, + "loss": 0.6563, + "step": 2350 + }, + { + "epoch": 0.4742802926426412, + "grad_norm": 0.9490289688110352, + "learning_rate": 5.656545480553135e-06, + "loss": 0.659, + "step": 2351 + }, + { + "epoch": 0.4744820281988482, + "grad_norm": 0.40818971395492554, + "learning_rate": 5.653305907352118e-06, + "loss": 0.6625, + "step": 2352 + }, + { + "epoch": 0.4746837637550552, + "grad_norm": 0.354064017534256, + "learning_rate": 5.650066055110067e-06, + "loss": 0.7012, + "step": 2353 + }, + { + "epoch": 0.4748854993112622, + "grad_norm": 0.5287762880325317, + "learning_rate": 5.646825925210795e-06, + "loss": 0.8059, + "step": 2354 + }, + { + "epoch": 0.4750872348674692, + "grad_norm": 0.4946046769618988, + "learning_rate": 5.6435855190382284e-06, + "loss": 0.6788, + "step": 2355 + }, + { + "epoch": 0.4752889704236762, + "grad_norm": 0.3169674873352051, + "learning_rate": 5.640344837976417e-06, + "loss": 0.6524, + "step": 2356 + }, + { + "epoch": 0.4754907059798832, + "grad_norm": 0.36257556080818176, + "learning_rate": 5.637103883409525e-06, + "loss": 0.6573, + "step": 2357 + }, + { + "epoch": 0.4756924415360902, + "grad_norm": 1.1422092914581299, + "learning_rate": 5.6338626567218335e-06, + "loss": 0.7541, + "step": 2358 + }, + { + "epoch": 0.47589417709229714, + "grad_norm": 0.3955131769180298, + "learning_rate": 5.63062115929774e-06, + "loss": 0.8808, + "step": 2359 + }, + { + "epoch": 0.47609591264850415, + "grad_norm": 0.705595850944519, + "learning_rate": 5.627379392521758e-06, + "loss": 0.7939, + "step": 2360 + }, + { + "epoch": 0.47629764820471115, + "grad_norm": 0.5015299320220947, + "learning_rate": 5.624137357778519e-06, + "loss": 0.7529, + "step": 2361 + }, + { + "epoch": 0.47649938376091816, + "grad_norm": 1.0474416017532349, + "learning_rate": 5.620895056452761e-06, + "loss": 0.6746, + "step": 2362 + }, + { + "epoch": 0.47670111931712517, + "grad_norm": 0.6244195103645325, + "learning_rate": 5.617652489929342e-06, + "loss": 0.7225, + "step": 2363 + }, + { + "epoch": 0.4769028548733321, + "grad_norm": 0.41735851764678955, + "learning_rate": 5.614409659593234e-06, + "loss": 0.73, + "step": 2364 + }, + { + "epoch": 0.4771045904295391, + "grad_norm": 0.42424479126930237, + "learning_rate": 5.61116656682952e-06, + "loss": 0.6849, + "step": 2365 + }, + { + "epoch": 0.4773063259857461, + "grad_norm": 1.1443331241607666, + "learning_rate": 5.607923213023392e-06, + "loss": 0.8486, + "step": 2366 + }, + { + "epoch": 0.47750806154195313, + "grad_norm": 0.38195037841796875, + "learning_rate": 5.604679599560159e-06, + "loss": 0.661, + "step": 2367 + }, + { + "epoch": 0.4777097970981601, + "grad_norm": 0.5178070664405823, + "learning_rate": 5.601435727825237e-06, + "loss": 0.6907, + "step": 2368 + }, + { + "epoch": 0.4779115326543671, + "grad_norm": 0.6520282030105591, + "learning_rate": 5.598191599204153e-06, + "loss": 0.7514, + "step": 2369 + }, + { + "epoch": 0.4781132682105741, + "grad_norm": 0.9794221520423889, + "learning_rate": 5.594947215082545e-06, + "loss": 0.6626, + "step": 2370 + }, + { + "epoch": 0.4783150037667811, + "grad_norm": 0.46517887711524963, + "learning_rate": 5.59170257684616e-06, + "loss": 0.8643, + "step": 2371 + }, + { + "epoch": 0.4785167393229881, + "grad_norm": 0.6313295364379883, + "learning_rate": 5.588457685880851e-06, + "loss": 0.7549, + "step": 2372 + }, + { + "epoch": 0.47871847487919506, + "grad_norm": 0.33170410990715027, + "learning_rate": 5.585212543572585e-06, + "loss": 0.898, + "step": 2373 + }, + { + "epoch": 0.47892021043540206, + "grad_norm": 0.9356674551963806, + "learning_rate": 5.5819671513074256e-06, + "loss": 0.7104, + "step": 2374 + }, + { + "epoch": 0.47912194599160907, + "grad_norm": 0.4888406991958618, + "learning_rate": 5.578721510471554e-06, + "loss": 0.8359, + "step": 2375 + }, + { + "epoch": 0.47932368154781607, + "grad_norm": 0.5194557905197144, + "learning_rate": 5.575475622451255e-06, + "loss": 0.6816, + "step": 2376 + }, + { + "epoch": 0.479525417104023, + "grad_norm": 0.4818446934223175, + "learning_rate": 5.572229488632913e-06, + "loss": 0.6913, + "step": 2377 + }, + { + "epoch": 0.47972715266023, + "grad_norm": 0.9433100819587708, + "learning_rate": 5.568983110403025e-06, + "loss": 0.6474, + "step": 2378 + }, + { + "epoch": 0.47992888821643703, + "grad_norm": 0.5533755421638489, + "learning_rate": 5.565736489148188e-06, + "loss": 0.8286, + "step": 2379 + }, + { + "epoch": 0.48013062377264404, + "grad_norm": 0.416610449552536, + "learning_rate": 5.562489626255104e-06, + "loss": 0.6741, + "step": 2380 + }, + { + "epoch": 0.48033235932885104, + "grad_norm": 0.38036978244781494, + "learning_rate": 5.559242523110577e-06, + "loss": 0.6823, + "step": 2381 + }, + { + "epoch": 0.480534094885058, + "grad_norm": 0.33574825525283813, + "learning_rate": 5.555995181101517e-06, + "loss": 0.7284, + "step": 2382 + }, + { + "epoch": 0.480735830441265, + "grad_norm": 0.5829797983169556, + "learning_rate": 5.552747601614932e-06, + "loss": 0.6931, + "step": 2383 + }, + { + "epoch": 0.480937565997472, + "grad_norm": 1.0719894170761108, + "learning_rate": 5.549499786037932e-06, + "loss": 0.6811, + "step": 2384 + }, + { + "epoch": 0.481139301553679, + "grad_norm": 0.27220767736434937, + "learning_rate": 5.5462517357577325e-06, + "loss": 0.7297, + "step": 2385 + }, + { + "epoch": 0.48134103710988596, + "grad_norm": 0.42966559529304504, + "learning_rate": 5.543003452161644e-06, + "loss": 0.6712, + "step": 2386 + }, + { + "epoch": 0.48154277266609297, + "grad_norm": 1.9052566289901733, + "learning_rate": 5.539754936637079e-06, + "loss": 0.6926, + "step": 2387 + }, + { + "epoch": 0.48174450822229997, + "grad_norm": 0.33792710304260254, + "learning_rate": 5.536506190571546e-06, + "loss": 0.6334, + "step": 2388 + }, + { + "epoch": 0.481946243778507, + "grad_norm": 0.7136744260787964, + "learning_rate": 5.5332572153526574e-06, + "loss": 0.6855, + "step": 2389 + }, + { + "epoch": 0.482147979334714, + "grad_norm": 2.199124574661255, + "learning_rate": 5.530008012368119e-06, + "loss": 0.6911, + "step": 2390 + }, + { + "epoch": 0.48234971489092093, + "grad_norm": 1.1589012145996094, + "learning_rate": 5.526758583005736e-06, + "loss": 0.7076, + "step": 2391 + }, + { + "epoch": 0.48255145044712794, + "grad_norm": 1.6382700204849243, + "learning_rate": 5.52350892865341e-06, + "loss": 0.6658, + "step": 2392 + }, + { + "epoch": 0.48275318600333494, + "grad_norm": 1.535251259803772, + "learning_rate": 5.520259050699138e-06, + "loss": 0.8544, + "step": 2393 + }, + { + "epoch": 0.48295492155954195, + "grad_norm": 0.8162296414375305, + "learning_rate": 5.517008950531013e-06, + "loss": 0.7948, + "step": 2394 + }, + { + "epoch": 0.4831566571157489, + "grad_norm": 0.4203198254108429, + "learning_rate": 5.5137586295372215e-06, + "loss": 0.6362, + "step": 2395 + }, + { + "epoch": 0.4833583926719559, + "grad_norm": 0.3570404648780823, + "learning_rate": 5.510508089106049e-06, + "loss": 0.6748, + "step": 2396 + }, + { + "epoch": 0.4835601282281629, + "grad_norm": 0.5715703964233398, + "learning_rate": 5.507257330625869e-06, + "loss": 0.6902, + "step": 2397 + }, + { + "epoch": 0.4837618637843699, + "grad_norm": 0.3944864571094513, + "learning_rate": 5.50400635548515e-06, + "loss": 0.6548, + "step": 2398 + }, + { + "epoch": 0.4839635993405769, + "grad_norm": 0.45001280307769775, + "learning_rate": 5.500755165072453e-06, + "loss": 0.7976, + "step": 2399 + }, + { + "epoch": 0.48416533489678387, + "grad_norm": 0.5098247528076172, + "learning_rate": 5.497503760776436e-06, + "loss": 0.6763, + "step": 2400 + }, + { + "epoch": 0.4843670704529909, + "grad_norm": 0.478303998708725, + "learning_rate": 5.4942521439858386e-06, + "loss": 0.6613, + "step": 2401 + }, + { + "epoch": 0.4845688060091979, + "grad_norm": 0.4981384873390198, + "learning_rate": 5.491000316089499e-06, + "loss": 0.7707, + "step": 2402 + }, + { + "epoch": 0.4847705415654049, + "grad_norm": 0.579944908618927, + "learning_rate": 5.487748278476342e-06, + "loss": 0.7008, + "step": 2403 + }, + { + "epoch": 0.4849722771216119, + "grad_norm": 0.4089328348636627, + "learning_rate": 5.484496032535385e-06, + "loss": 0.6602, + "step": 2404 + }, + { + "epoch": 0.48517401267781884, + "grad_norm": 0.6986388564109802, + "learning_rate": 5.48124357965573e-06, + "loss": 0.6808, + "step": 2405 + }, + { + "epoch": 0.48537574823402585, + "grad_norm": 0.3838619589805603, + "learning_rate": 5.477990921226569e-06, + "loss": 0.6708, + "step": 2406 + }, + { + "epoch": 0.48557748379023286, + "grad_norm": 0.38885095715522766, + "learning_rate": 5.474738058637185e-06, + "loss": 0.6856, + "step": 2407 + }, + { + "epoch": 0.48577921934643986, + "grad_norm": 2.982579469680786, + "learning_rate": 5.471484993276945e-06, + "loss": 0.7715, + "step": 2408 + }, + { + "epoch": 0.4859809549026468, + "grad_norm": 0.6595232486724854, + "learning_rate": 5.4682317265353025e-06, + "loss": 0.6493, + "step": 2409 + }, + { + "epoch": 0.4861826904588538, + "grad_norm": 0.39451122283935547, + "learning_rate": 5.464978259801797e-06, + "loss": 0.6916, + "step": 2410 + }, + { + "epoch": 0.4863844260150608, + "grad_norm": 0.5617831349372864, + "learning_rate": 5.461724594466059e-06, + "loss": 0.7956, + "step": 2411 + }, + { + "epoch": 0.48658616157126783, + "grad_norm": 1.7667847871780396, + "learning_rate": 5.458470731917794e-06, + "loss": 0.7746, + "step": 2412 + }, + { + "epoch": 0.48678789712747483, + "grad_norm": 1.2532994747161865, + "learning_rate": 5.455216673546798e-06, + "loss": 0.6748, + "step": 2413 + }, + { + "epoch": 0.4869896326836818, + "grad_norm": 0.6899948716163635, + "learning_rate": 5.451962420742951e-06, + "loss": 0.7031, + "step": 2414 + }, + { + "epoch": 0.4871913682398888, + "grad_norm": 0.8961516618728638, + "learning_rate": 5.448707974896214e-06, + "loss": 0.7372, + "step": 2415 + }, + { + "epoch": 0.4873931037960958, + "grad_norm": 1.3983064889907837, + "learning_rate": 5.445453337396629e-06, + "loss": 0.6774, + "step": 2416 + }, + { + "epoch": 0.4875948393523028, + "grad_norm": 0.7456166744232178, + "learning_rate": 5.442198509634324e-06, + "loss": 0.6806, + "step": 2417 + }, + { + "epoch": 0.48779657490850975, + "grad_norm": 0.5494102835655212, + "learning_rate": 5.438943492999504e-06, + "loss": 0.6734, + "step": 2418 + }, + { + "epoch": 0.48799831046471676, + "grad_norm": 1.1771401166915894, + "learning_rate": 5.435688288882461e-06, + "loss": 0.813, + "step": 2419 + }, + { + "epoch": 0.48820004602092376, + "grad_norm": 0.3710864186286926, + "learning_rate": 5.432432898673558e-06, + "loss": 0.7281, + "step": 2420 + }, + { + "epoch": 0.48840178157713077, + "grad_norm": 0.4745769500732422, + "learning_rate": 5.429177323763245e-06, + "loss": 0.7416, + "step": 2421 + }, + { + "epoch": 0.4886035171333378, + "grad_norm": 1.0034306049346924, + "learning_rate": 5.425921565542047e-06, + "loss": 0.6967, + "step": 2422 + }, + { + "epoch": 0.4888052526895447, + "grad_norm": 0.498399555683136, + "learning_rate": 5.4226656254005686e-06, + "loss": 0.6518, + "step": 2423 + }, + { + "epoch": 0.48900698824575173, + "grad_norm": 0.38673850893974304, + "learning_rate": 5.4194095047294935e-06, + "loss": 0.7897, + "step": 2424 + }, + { + "epoch": 0.48920872380195873, + "grad_norm": 0.32360196113586426, + "learning_rate": 5.41615320491958e-06, + "loss": 0.7278, + "step": 2425 + }, + { + "epoch": 0.48941045935816574, + "grad_norm": 0.7270029783248901, + "learning_rate": 5.412896727361663e-06, + "loss": 0.6923, + "step": 2426 + }, + { + "epoch": 0.4896121949143727, + "grad_norm": 0.5027311444282532, + "learning_rate": 5.409640073446654e-06, + "loss": 0.6943, + "step": 2427 + }, + { + "epoch": 0.4898139304705797, + "grad_norm": 0.6347085237503052, + "learning_rate": 5.406383244565543e-06, + "loss": 0.7144, + "step": 2428 + }, + { + "epoch": 0.4900156660267867, + "grad_norm": 0.40251901745796204, + "learning_rate": 5.40312624210939e-06, + "loss": 0.6694, + "step": 2429 + }, + { + "epoch": 0.4902174015829937, + "grad_norm": 0.5375779271125793, + "learning_rate": 5.3998690674693286e-06, + "loss": 0.6423, + "step": 2430 + }, + { + "epoch": 0.4904191371392007, + "grad_norm": 0.5994971394538879, + "learning_rate": 5.396611722036573e-06, + "loss": 0.6715, + "step": 2431 + }, + { + "epoch": 0.49062087269540766, + "grad_norm": 0.38045647740364075, + "learning_rate": 5.393354207202404e-06, + "loss": 0.7003, + "step": 2432 + }, + { + "epoch": 0.49082260825161467, + "grad_norm": 0.4006095230579376, + "learning_rate": 5.390096524358175e-06, + "loss": 0.9497, + "step": 2433 + }, + { + "epoch": 0.4910243438078217, + "grad_norm": 0.7631081938743591, + "learning_rate": 5.386838674895311e-06, + "loss": 0.6707, + "step": 2434 + }, + { + "epoch": 0.4912260793640287, + "grad_norm": 0.3868980407714844, + "learning_rate": 5.383580660205313e-06, + "loss": 0.657, + "step": 2435 + }, + { + "epoch": 0.49142781492023563, + "grad_norm": 0.3521217405796051, + "learning_rate": 5.3803224816797495e-06, + "loss": 0.8157, + "step": 2436 + }, + { + "epoch": 0.49162955047644263, + "grad_norm": 0.43658751249313354, + "learning_rate": 5.3770641407102554e-06, + "loss": 0.6828, + "step": 2437 + }, + { + "epoch": 0.49183128603264964, + "grad_norm": 0.375871479511261, + "learning_rate": 5.373805638688542e-06, + "loss": 0.6761, + "step": 2438 + }, + { + "epoch": 0.49203302158885664, + "grad_norm": 0.8562093377113342, + "learning_rate": 5.370546977006383e-06, + "loss": 0.7243, + "step": 2439 + }, + { + "epoch": 0.49223475714506365, + "grad_norm": 0.3729056119918823, + "learning_rate": 5.367288157055626e-06, + "loss": 0.7204, + "step": 2440 + }, + { + "epoch": 0.4924364927012706, + "grad_norm": 0.8656922578811646, + "learning_rate": 5.36402918022818e-06, + "loss": 0.8264, + "step": 2441 + }, + { + "epoch": 0.4926382282574776, + "grad_norm": 0.4582517445087433, + "learning_rate": 5.360770047916025e-06, + "loss": 0.6895, + "step": 2442 + }, + { + "epoch": 0.4928399638136846, + "grad_norm": 0.7141796946525574, + "learning_rate": 5.3575107615112084e-06, + "loss": 0.7993, + "step": 2443 + }, + { + "epoch": 0.4930416993698916, + "grad_norm": 0.6745219826698303, + "learning_rate": 5.35425132240584e-06, + "loss": 0.6812, + "step": 2444 + }, + { + "epoch": 0.49324343492609857, + "grad_norm": 0.3646477162837982, + "learning_rate": 5.350991731992098e-06, + "loss": 0.6652, + "step": 2445 + }, + { + "epoch": 0.4934451704823056, + "grad_norm": 0.8088419437408447, + "learning_rate": 5.3477319916622215e-06, + "loss": 0.7443, + "step": 2446 + }, + { + "epoch": 0.4936469060385126, + "grad_norm": 0.41926172375679016, + "learning_rate": 5.344472102808519e-06, + "loss": 0.6607, + "step": 2447 + }, + { + "epoch": 0.4938486415947196, + "grad_norm": 1.1572102308273315, + "learning_rate": 5.341212066823356e-06, + "loss": 0.7375, + "step": 2448 + }, + { + "epoch": 0.4940503771509266, + "grad_norm": 0.433287650346756, + "learning_rate": 5.337951885099167e-06, + "loss": 0.7003, + "step": 2449 + }, + { + "epoch": 0.49425211270713354, + "grad_norm": 0.48769572377204895, + "learning_rate": 5.334691559028442e-06, + "loss": 0.8846, + "step": 2450 + }, + { + "epoch": 0.49445384826334055, + "grad_norm": 0.565112829208374, + "learning_rate": 5.331431090003739e-06, + "loss": 0.6548, + "step": 2451 + }, + { + "epoch": 0.49465558381954755, + "grad_norm": 1.2953200340270996, + "learning_rate": 5.328170479417676e-06, + "loss": 0.6966, + "step": 2452 + }, + { + "epoch": 0.49485731937575456, + "grad_norm": 1.8815406560897827, + "learning_rate": 5.324909728662929e-06, + "loss": 0.6353, + "step": 2453 + }, + { + "epoch": 0.4950590549319615, + "grad_norm": 1.499898076057434, + "learning_rate": 5.321648839132233e-06, + "loss": 0.6948, + "step": 2454 + }, + { + "epoch": 0.4952607904881685, + "grad_norm": 2.585374355316162, + "learning_rate": 5.318387812218386e-06, + "loss": 0.6784, + "step": 2455 + }, + { + "epoch": 0.4954625260443755, + "grad_norm": 1.7011631727218628, + "learning_rate": 5.315126649314244e-06, + "loss": 0.6391, + "step": 2456 + }, + { + "epoch": 0.4956642616005825, + "grad_norm": 1.2485464811325073, + "learning_rate": 5.311865351812718e-06, + "loss": 0.6842, + "step": 2457 + }, + { + "epoch": 0.49586599715678953, + "grad_norm": 1.5083630084991455, + "learning_rate": 5.308603921106777e-06, + "loss": 0.6877, + "step": 2458 + }, + { + "epoch": 0.4960677327129965, + "grad_norm": 0.4522298574447632, + "learning_rate": 5.305342358589452e-06, + "loss": 0.6042, + "step": 2459 + }, + { + "epoch": 0.4962694682692035, + "grad_norm": 0.3971395790576935, + "learning_rate": 5.302080665653826e-06, + "loss": 0.6506, + "step": 2460 + }, + { + "epoch": 0.4964712038254105, + "grad_norm": 0.4974903166294098, + "learning_rate": 5.298818843693035e-06, + "loss": 0.7048, + "step": 2461 + }, + { + "epoch": 0.4966729393816175, + "grad_norm": 0.8520945310592651, + "learning_rate": 5.295556894100278e-06, + "loss": 0.9368, + "step": 2462 + }, + { + "epoch": 0.49687467493782445, + "grad_norm": 0.36369588971138, + "learning_rate": 5.292294818268801e-06, + "loss": 0.6535, + "step": 2463 + }, + { + "epoch": 0.49707641049403145, + "grad_norm": 0.39193958044052124, + "learning_rate": 5.289032617591908e-06, + "loss": 0.6962, + "step": 2464 + }, + { + "epoch": 0.49727814605023846, + "grad_norm": 0.5720986127853394, + "learning_rate": 5.285770293462954e-06, + "loss": 0.6672, + "step": 2465 + }, + { + "epoch": 0.49747988160644546, + "grad_norm": 0.444812536239624, + "learning_rate": 5.2825078472753476e-06, + "loss": 0.6347, + "step": 2466 + }, + { + "epoch": 0.49768161716265247, + "grad_norm": 0.45876580476760864, + "learning_rate": 5.2792452804225535e-06, + "loss": 0.725, + "step": 2467 + }, + { + "epoch": 0.4978833527188594, + "grad_norm": 0.6092671751976013, + "learning_rate": 5.275982594298081e-06, + "loss": 0.763, + "step": 2468 + }, + { + "epoch": 0.4980850882750664, + "grad_norm": 0.5046172142028809, + "learning_rate": 5.2727197902954954e-06, + "loss": 0.7659, + "step": 2469 + }, + { + "epoch": 0.49828682383127343, + "grad_norm": 0.3817034959793091, + "learning_rate": 5.2694568698084085e-06, + "loss": 0.8996, + "step": 2470 + }, + { + "epoch": 0.49848855938748043, + "grad_norm": 0.3855259418487549, + "learning_rate": 5.266193834230485e-06, + "loss": 0.7581, + "step": 2471 + }, + { + "epoch": 0.4986902949436874, + "grad_norm": 0.6069688200950623, + "learning_rate": 5.262930684955439e-06, + "loss": 0.7928, + "step": 2472 + }, + { + "epoch": 0.4988920304998944, + "grad_norm": 0.3956807851791382, + "learning_rate": 5.25966742337703e-06, + "loss": 0.66, + "step": 2473 + }, + { + "epoch": 0.4990937660561014, + "grad_norm": 0.43637824058532715, + "learning_rate": 5.256404050889069e-06, + "loss": 0.7644, + "step": 2474 + }, + { + "epoch": 0.4992955016123084, + "grad_norm": 0.49486860632896423, + "learning_rate": 5.253140568885412e-06, + "loss": 0.7436, + "step": 2475 + }, + { + "epoch": 0.4994972371685154, + "grad_norm": 0.4755359888076782, + "learning_rate": 5.249876978759961e-06, + "loss": 0.6725, + "step": 2476 + }, + { + "epoch": 0.49969897272472236, + "grad_norm": 0.5608794689178467, + "learning_rate": 5.246613281906669e-06, + "loss": 0.6263, + "step": 2477 + }, + { + "epoch": 0.49990070828092936, + "grad_norm": 0.692234992980957, + "learning_rate": 5.243349479719528e-06, + "loss": 0.6567, + "step": 2478 + }, + { + "epoch": 0.5001024438371363, + "grad_norm": 0.9859315156936646, + "learning_rate": 5.240085573592579e-06, + "loss": 0.7421, + "step": 2479 + }, + { + "epoch": 0.5003041793933434, + "grad_norm": 0.5274341106414795, + "learning_rate": 5.236821564919909e-06, + "loss": 0.6897, + "step": 2480 + }, + { + "epoch": 0.5005059149495503, + "grad_norm": 0.6965314149856567, + "learning_rate": 5.233557455095645e-06, + "loss": 0.6506, + "step": 2481 + }, + { + "epoch": 0.5007076505057574, + "grad_norm": 0.578256368637085, + "learning_rate": 5.230293245513956e-06, + "loss": 0.6574, + "step": 2482 + }, + { + "epoch": 0.5009093860619643, + "grad_norm": 0.34265127778053284, + "learning_rate": 5.22702893756906e-06, + "loss": 0.6711, + "step": 2483 + }, + { + "epoch": 0.5011111216181713, + "grad_norm": 0.5414084196090698, + "learning_rate": 5.2237645326552125e-06, + "loss": 0.6894, + "step": 2484 + }, + { + "epoch": 0.5013128571743783, + "grad_norm": 0.41821205615997314, + "learning_rate": 5.220500032166709e-06, + "loss": 0.8267, + "step": 2485 + }, + { + "epoch": 0.5015145927305853, + "grad_norm": 1.5558137893676758, + "learning_rate": 5.2172354374978905e-06, + "loss": 0.7558, + "step": 2486 + }, + { + "epoch": 0.5017163282867924, + "grad_norm": 1.021885871887207, + "learning_rate": 5.213970750043135e-06, + "loss": 0.8056, + "step": 2487 + }, + { + "epoch": 0.5019180638429993, + "grad_norm": 1.5603513717651367, + "learning_rate": 5.210705971196861e-06, + "loss": 0.6761, + "step": 2488 + }, + { + "epoch": 0.5021197993992063, + "grad_norm": 0.47618329524993896, + "learning_rate": 5.207441102353524e-06, + "loss": 0.6797, + "step": 2489 + }, + { + "epoch": 0.5023215349554133, + "grad_norm": 0.36025211215019226, + "learning_rate": 5.204176144907624e-06, + "loss": 0.8224, + "step": 2490 + }, + { + "epoch": 0.5025232705116203, + "grad_norm": 1.2106201648712158, + "learning_rate": 5.20091110025369e-06, + "loss": 0.717, + "step": 2491 + }, + { + "epoch": 0.5027250060678273, + "grad_norm": 0.40544193983078003, + "learning_rate": 5.197645969786297e-06, + "loss": 0.6746, + "step": 2492 + }, + { + "epoch": 0.5029267416240343, + "grad_norm": 0.34461602568626404, + "learning_rate": 5.194380754900049e-06, + "loss": 0.7017, + "step": 2493 + }, + { + "epoch": 0.5031284771802412, + "grad_norm": 0.5607613921165466, + "learning_rate": 5.1911154569895915e-06, + "loss": 0.6919, + "step": 2494 + }, + { + "epoch": 0.5033302127364483, + "grad_norm": 0.49895989894866943, + "learning_rate": 5.187850077449604e-06, + "loss": 0.8624, + "step": 2495 + }, + { + "epoch": 0.5035319482926552, + "grad_norm": 0.5202046036720276, + "learning_rate": 5.1845846176748005e-06, + "loss": 0.7062, + "step": 2496 + }, + { + "epoch": 0.5037336838488622, + "grad_norm": 0.42681393027305603, + "learning_rate": 5.181319079059928e-06, + "loss": 0.7279, + "step": 2497 + }, + { + "epoch": 0.5039354194050693, + "grad_norm": 0.3662031292915344, + "learning_rate": 5.178053462999768e-06, + "loss": 0.6648, + "step": 2498 + }, + { + "epoch": 0.5041371549612762, + "grad_norm": 0.4268733859062195, + "learning_rate": 5.174787770889138e-06, + "loss": 0.62, + "step": 2499 + }, + { + "epoch": 0.5043388905174833, + "grad_norm": 0.4190344214439392, + "learning_rate": 5.1715220041228835e-06, + "loss": 0.6409, + "step": 2500 + }, + { + "epoch": 0.5045406260736902, + "grad_norm": 0.4919296205043793, + "learning_rate": 5.168256164095885e-06, + "loss": 0.6389, + "step": 2501 + }, + { + "epoch": 0.5047423616298972, + "grad_norm": 0.32788679003715515, + "learning_rate": 5.164990252203052e-06, + "loss": 0.6515, + "step": 2502 + }, + { + "epoch": 0.5049440971861042, + "grad_norm": 0.4756167232990265, + "learning_rate": 5.1617242698393265e-06, + "loss": 0.6804, + "step": 2503 + }, + { + "epoch": 0.5051458327423112, + "grad_norm": 0.49856603145599365, + "learning_rate": 5.15845821839968e-06, + "loss": 0.6365, + "step": 2504 + }, + { + "epoch": 0.5053475682985182, + "grad_norm": 0.3664408028125763, + "learning_rate": 5.155192099279113e-06, + "loss": 0.6819, + "step": 2505 + }, + { + "epoch": 0.5055493038547252, + "grad_norm": 0.637521505355835, + "learning_rate": 5.151925913872657e-06, + "loss": 0.6986, + "step": 2506 + }, + { + "epoch": 0.5057510394109321, + "grad_norm": 0.29554644227027893, + "learning_rate": 5.148659663575367e-06, + "loss": 0.7517, + "step": 2507 + }, + { + "epoch": 0.5059527749671392, + "grad_norm": 0.42966410517692566, + "learning_rate": 5.1453933497823326e-06, + "loss": 0.7161, + "step": 2508 + }, + { + "epoch": 0.5061545105233461, + "grad_norm": 0.32631874084472656, + "learning_rate": 5.1421269738886635e-06, + "loss": 0.638, + "step": 2509 + }, + { + "epoch": 0.5063562460795532, + "grad_norm": 0.39070239663124084, + "learning_rate": 5.138860537289502e-06, + "loss": 0.7415, + "step": 2510 + }, + { + "epoch": 0.5065579816357602, + "grad_norm": 0.5472754240036011, + "learning_rate": 5.135594041380012e-06, + "loss": 0.6592, + "step": 2511 + }, + { + "epoch": 0.5067597171919671, + "grad_norm": 0.603266716003418, + "learning_rate": 5.132327487555385e-06, + "loss": 0.6929, + "step": 2512 + }, + { + "epoch": 0.5069614527481742, + "grad_norm": 1.3803391456604004, + "learning_rate": 5.129060877210835e-06, + "loss": 0.6848, + "step": 2513 + }, + { + "epoch": 0.5071631883043811, + "grad_norm": 0.5233571529388428, + "learning_rate": 5.125794211741602e-06, + "loss": 1.0529, + "step": 2514 + }, + { + "epoch": 0.5073649238605881, + "grad_norm": 0.5816826224327087, + "learning_rate": 5.122527492542954e-06, + "loss": 0.6554, + "step": 2515 + }, + { + "epoch": 0.5075666594167951, + "grad_norm": 0.4094732105731964, + "learning_rate": 5.119260721010171e-06, + "loss": 0.6973, + "step": 2516 + }, + { + "epoch": 0.5077683949730021, + "grad_norm": 0.6474040746688843, + "learning_rate": 5.1159938985385625e-06, + "loss": 0.6685, + "step": 2517 + }, + { + "epoch": 0.5079701305292091, + "grad_norm": 0.47420260310173035, + "learning_rate": 5.112727026523461e-06, + "loss": 0.6705, + "step": 2518 + }, + { + "epoch": 0.5081718660854161, + "grad_norm": 0.4265378415584564, + "learning_rate": 5.1094601063602176e-06, + "loss": 0.8524, + "step": 2519 + }, + { + "epoch": 0.508373601641623, + "grad_norm": 1.0773259401321411, + "learning_rate": 5.1061931394442045e-06, + "loss": 0.6885, + "step": 2520 + }, + { + "epoch": 0.5085753371978301, + "grad_norm": 0.4289955496788025, + "learning_rate": 5.1029261271708104e-06, + "loss": 0.6799, + "step": 2521 + }, + { + "epoch": 0.508777072754037, + "grad_norm": 0.5529792904853821, + "learning_rate": 5.099659070935451e-06, + "loss": 0.8266, + "step": 2522 + }, + { + "epoch": 0.5089788083102441, + "grad_norm": 0.7462150454521179, + "learning_rate": 5.096391972133554e-06, + "loss": 0.6685, + "step": 2523 + }, + { + "epoch": 0.5091805438664511, + "grad_norm": 0.5908946394920349, + "learning_rate": 5.093124832160569e-06, + "loss": 0.668, + "step": 2524 + }, + { + "epoch": 0.509382279422658, + "grad_norm": 0.7674736976623535, + "learning_rate": 5.089857652411961e-06, + "loss": 0.6632, + "step": 2525 + }, + { + "epoch": 0.5095840149788651, + "grad_norm": 0.40639930963516235, + "learning_rate": 5.086590434283212e-06, + "loss": 0.6175, + "step": 2526 + }, + { + "epoch": 0.509785750535072, + "grad_norm": 0.33126306533813477, + "learning_rate": 5.083323179169824e-06, + "loss": 0.6474, + "step": 2527 + }, + { + "epoch": 0.5099874860912791, + "grad_norm": 0.30662772059440613, + "learning_rate": 5.080055888467308e-06, + "loss": 0.7701, + "step": 2528 + }, + { + "epoch": 0.510189221647486, + "grad_norm": 0.3923175036907196, + "learning_rate": 5.076788563571198e-06, + "loss": 0.6567, + "step": 2529 + }, + { + "epoch": 0.510390957203693, + "grad_norm": 0.5640500783920288, + "learning_rate": 5.073521205877038e-06, + "loss": 0.6456, + "step": 2530 + }, + { + "epoch": 0.5105926927599, + "grad_norm": 0.6147575378417969, + "learning_rate": 5.0702538167803864e-06, + "loss": 0.6634, + "step": 2531 + }, + { + "epoch": 0.510794428316107, + "grad_norm": 0.4823729693889618, + "learning_rate": 5.0669863976768145e-06, + "loss": 0.6735, + "step": 2532 + }, + { + "epoch": 0.510996163872314, + "grad_norm": 0.47359368205070496, + "learning_rate": 5.063718949961909e-06, + "loss": 0.6704, + "step": 2533 + }, + { + "epoch": 0.511197899428521, + "grad_norm": 2.3694827556610107, + "learning_rate": 5.060451475031267e-06, + "loss": 0.6973, + "step": 2534 + }, + { + "epoch": 0.511399634984728, + "grad_norm": 0.32952338457107544, + "learning_rate": 5.057183974280498e-06, + "loss": 0.7599, + "step": 2535 + }, + { + "epoch": 0.511601370540935, + "grad_norm": 0.5751401782035828, + "learning_rate": 5.053916449105219e-06, + "loss": 0.7763, + "step": 2536 + }, + { + "epoch": 0.511803106097142, + "grad_norm": 0.3479835093021393, + "learning_rate": 5.050648900901064e-06, + "loss": 0.6997, + "step": 2537 + }, + { + "epoch": 0.5120048416533489, + "grad_norm": 0.3522459864616394, + "learning_rate": 5.047381331063672e-06, + "loss": 0.8358, + "step": 2538 + }, + { + "epoch": 0.512206577209556, + "grad_norm": 1.7044692039489746, + "learning_rate": 5.044113740988692e-06, + "loss": 0.8016, + "step": 2539 + }, + { + "epoch": 0.5124083127657629, + "grad_norm": 0.4563331604003906, + "learning_rate": 5.040846132071783e-06, + "loss": 0.8318, + "step": 2540 + }, + { + "epoch": 0.51261004832197, + "grad_norm": 0.47234177589416504, + "learning_rate": 5.03757850570861e-06, + "loss": 0.6481, + "step": 2541 + }, + { + "epoch": 0.5128117838781769, + "grad_norm": 0.6565883755683899, + "learning_rate": 5.034310863294847e-06, + "loss": 0.8461, + "step": 2542 + }, + { + "epoch": 0.5130135194343839, + "grad_norm": 0.44650590419769287, + "learning_rate": 5.0310432062261764e-06, + "loss": 0.684, + "step": 2543 + }, + { + "epoch": 0.513215254990591, + "grad_norm": 0.4767093062400818, + "learning_rate": 5.027775535898283e-06, + "loss": 0.6856, + "step": 2544 + }, + { + "epoch": 0.5134169905467979, + "grad_norm": 0.5772445797920227, + "learning_rate": 5.024507853706858e-06, + "loss": 0.6855, + "step": 2545 + }, + { + "epoch": 0.513618726103005, + "grad_norm": 0.46750447154045105, + "learning_rate": 5.021240161047601e-06, + "loss": 0.6753, + "step": 2546 + }, + { + "epoch": 0.5138204616592119, + "grad_norm": 0.733974277973175, + "learning_rate": 5.0179724593162146e-06, + "loss": 0.6634, + "step": 2547 + }, + { + "epoch": 0.5140221972154189, + "grad_norm": 0.4525078535079956, + "learning_rate": 5.014704749908404e-06, + "loss": 0.7676, + "step": 2548 + }, + { + "epoch": 0.5142239327716259, + "grad_norm": 0.4221421778202057, + "learning_rate": 5.011437034219875e-06, + "loss": 0.6615, + "step": 2549 + }, + { + "epoch": 0.5144256683278329, + "grad_norm": 0.6879943013191223, + "learning_rate": 5.0081693136463435e-06, + "loss": 0.7909, + "step": 2550 + }, + { + "epoch": 0.5146274038840399, + "grad_norm": 0.47413426637649536, + "learning_rate": 5.004901589583524e-06, + "loss": 0.649, + "step": 2551 + }, + { + "epoch": 0.5148291394402469, + "grad_norm": 0.4079788625240326, + "learning_rate": 5.0016338634271285e-06, + "loss": 0.6717, + "step": 2552 + }, + { + "epoch": 0.5150308749964538, + "grad_norm": 0.5021528601646423, + "learning_rate": 4.998366136572874e-06, + "loss": 0.6573, + "step": 2553 + }, + { + "epoch": 0.5152326105526609, + "grad_norm": 0.5983197689056396, + "learning_rate": 4.995098410416478e-06, + "loss": 0.6964, + "step": 2554 + }, + { + "epoch": 0.5154343461088678, + "grad_norm": 0.4596244990825653, + "learning_rate": 4.9918306863536565e-06, + "loss": 0.6676, + "step": 2555 + }, + { + "epoch": 0.5156360816650748, + "grad_norm": 0.4215291142463684, + "learning_rate": 4.988562965780127e-06, + "loss": 0.8117, + "step": 2556 + }, + { + "epoch": 0.5158378172212819, + "grad_norm": 1.1139490604400635, + "learning_rate": 4.985295250091598e-06, + "loss": 0.6803, + "step": 2557 + }, + { + "epoch": 0.5160395527774888, + "grad_norm": 0.5091721415519714, + "learning_rate": 4.982027540683785e-06, + "loss": 0.6929, + "step": 2558 + }, + { + "epoch": 0.5162412883336959, + "grad_norm": 0.6055331826210022, + "learning_rate": 4.9787598389524e-06, + "loss": 0.7099, + "step": 2559 + }, + { + "epoch": 0.5164430238899028, + "grad_norm": 1.519207239151001, + "learning_rate": 4.975492146293143e-06, + "loss": 0.7387, + "step": 2560 + }, + { + "epoch": 0.5166447594461098, + "grad_norm": 0.8859824538230896, + "learning_rate": 4.97222446410172e-06, + "loss": 0.8139, + "step": 2561 + }, + { + "epoch": 0.5168464950023168, + "grad_norm": 1.6021531820297241, + "learning_rate": 4.968956793773825e-06, + "loss": 0.687, + "step": 2562 + }, + { + "epoch": 0.5170482305585238, + "grad_norm": 7.087365627288818, + "learning_rate": 4.965689136705153e-06, + "loss": 0.7062, + "step": 2563 + }, + { + "epoch": 0.5172499661147308, + "grad_norm": 0.7062385082244873, + "learning_rate": 4.9624214942913916e-06, + "loss": 0.699, + "step": 2564 + }, + { + "epoch": 0.5174517016709378, + "grad_norm": 0.4900060296058655, + "learning_rate": 4.959153867928218e-06, + "loss": 0.7065, + "step": 2565 + }, + { + "epoch": 0.5176534372271447, + "grad_norm": 0.656950056552887, + "learning_rate": 4.955886259011308e-06, + "loss": 0.67, + "step": 2566 + }, + { + "epoch": 0.5178551727833518, + "grad_norm": 0.38856950402259827, + "learning_rate": 4.95261866893633e-06, + "loss": 0.6693, + "step": 2567 + }, + { + "epoch": 0.5180569083395588, + "grad_norm": 0.4750538766384125, + "learning_rate": 4.949351099098937e-06, + "loss": 0.6941, + "step": 2568 + }, + { + "epoch": 0.5182586438957658, + "grad_norm": 0.327970027923584, + "learning_rate": 4.946083550894782e-06, + "loss": 0.7079, + "step": 2569 + }, + { + "epoch": 0.5184603794519728, + "grad_norm": 0.5009106397628784, + "learning_rate": 4.942816025719505e-06, + "loss": 0.7175, + "step": 2570 + }, + { + "epoch": 0.5186621150081797, + "grad_norm": 0.7603925466537476, + "learning_rate": 4.939548524968734e-06, + "loss": 0.6647, + "step": 2571 + }, + { + "epoch": 0.5188638505643868, + "grad_norm": 0.41649743914604187, + "learning_rate": 4.936281050038091e-06, + "loss": 0.6842, + "step": 2572 + }, + { + "epoch": 0.5190655861205937, + "grad_norm": 0.43867188692092896, + "learning_rate": 4.933013602323186e-06, + "loss": 0.6821, + "step": 2573 + }, + { + "epoch": 0.5192673216768007, + "grad_norm": 0.5117209553718567, + "learning_rate": 4.929746183219615e-06, + "loss": 0.7396, + "step": 2574 + }, + { + "epoch": 0.5194690572330077, + "grad_norm": 1.3436833620071411, + "learning_rate": 4.926478794122965e-06, + "loss": 0.644, + "step": 2575 + }, + { + "epoch": 0.5196707927892147, + "grad_norm": 1.052332878112793, + "learning_rate": 4.923211436428804e-06, + "loss": 0.6689, + "step": 2576 + }, + { + "epoch": 0.5198725283454217, + "grad_norm": 0.5234437584877014, + "learning_rate": 4.919944111532692e-06, + "loss": 0.6491, + "step": 2577 + }, + { + "epoch": 0.5200742639016287, + "grad_norm": 0.46440157294273376, + "learning_rate": 4.91667682083018e-06, + "loss": 0.7633, + "step": 2578 + }, + { + "epoch": 0.5202759994578356, + "grad_norm": 0.45638078451156616, + "learning_rate": 4.91340956571679e-06, + "loss": 0.658, + "step": 2579 + }, + { + "epoch": 0.5204777350140427, + "grad_norm": 0.34317252039909363, + "learning_rate": 4.910142347588041e-06, + "loss": 0.6906, + "step": 2580 + }, + { + "epoch": 0.5206794705702497, + "grad_norm": 0.6732125282287598, + "learning_rate": 4.906875167839433e-06, + "loss": 0.9408, + "step": 2581 + }, + { + "epoch": 0.5208812061264567, + "grad_norm": 0.6680876612663269, + "learning_rate": 4.903608027866447e-06, + "loss": 0.7812, + "step": 2582 + }, + { + "epoch": 0.5210829416826637, + "grad_norm": 0.8660580515861511, + "learning_rate": 4.90034092906455e-06, + "loss": 1.1006, + "step": 2583 + }, + { + "epoch": 0.5212846772388706, + "grad_norm": 0.478946715593338, + "learning_rate": 4.89707387282919e-06, + "loss": 0.8267, + "step": 2584 + }, + { + "epoch": 0.5214864127950777, + "grad_norm": 0.3759441375732422, + "learning_rate": 4.893806860555797e-06, + "loss": 0.6477, + "step": 2585 + }, + { + "epoch": 0.5216881483512846, + "grad_norm": 0.39136189222335815, + "learning_rate": 4.890539893639782e-06, + "loss": 0.6646, + "step": 2586 + }, + { + "epoch": 0.5218898839074917, + "grad_norm": 0.4921991527080536, + "learning_rate": 4.88727297347654e-06, + "loss": 0.7019, + "step": 2587 + }, + { + "epoch": 0.5220916194636986, + "grad_norm": 0.3825063407421112, + "learning_rate": 4.884006101461438e-06, + "loss": 0.8176, + "step": 2588 + }, + { + "epoch": 0.5222933550199056, + "grad_norm": 0.3946991562843323, + "learning_rate": 4.880739278989832e-06, + "loss": 0.671, + "step": 2589 + }, + { + "epoch": 0.5224950905761127, + "grad_norm": 0.5453450679779053, + "learning_rate": 4.877472507457049e-06, + "loss": 0.8733, + "step": 2590 + }, + { + "epoch": 0.5226968261323196, + "grad_norm": 1.0942749977111816, + "learning_rate": 4.874205788258397e-06, + "loss": 0.843, + "step": 2591 + }, + { + "epoch": 0.5228985616885266, + "grad_norm": 0.8049074411392212, + "learning_rate": 4.870939122789167e-06, + "loss": 0.6479, + "step": 2592 + }, + { + "epoch": 0.5231002972447336, + "grad_norm": 0.5376644134521484, + "learning_rate": 4.867672512444616e-06, + "loss": 0.6544, + "step": 2593 + }, + { + "epoch": 0.5233020328009406, + "grad_norm": 0.632375955581665, + "learning_rate": 4.8644059586199885e-06, + "loss": 0.682, + "step": 2594 + }, + { + "epoch": 0.5235037683571476, + "grad_norm": 1.4146294593811035, + "learning_rate": 4.8611394627105e-06, + "loss": 0.6603, + "step": 2595 + }, + { + "epoch": 0.5237055039133546, + "grad_norm": 0.33007383346557617, + "learning_rate": 4.857873026111338e-06, + "loss": 0.7352, + "step": 2596 + }, + { + "epoch": 0.5239072394695615, + "grad_norm": 0.8308141231536865, + "learning_rate": 4.854606650217668e-06, + "loss": 0.7489, + "step": 2597 + }, + { + "epoch": 0.5241089750257686, + "grad_norm": 0.9379283785820007, + "learning_rate": 4.851340336424635e-06, + "loss": 0.6454, + "step": 2598 + }, + { + "epoch": 0.5243107105819755, + "grad_norm": 1.1256037950515747, + "learning_rate": 4.848074086127345e-06, + "loss": 0.6926, + "step": 2599 + }, + { + "epoch": 0.5245124461381826, + "grad_norm": 0.892929196357727, + "learning_rate": 4.844807900720888e-06, + "loss": 0.6503, + "step": 2600 + }, + { + "epoch": 0.5247141816943895, + "grad_norm": 0.7106110453605652, + "learning_rate": 4.841541781600322e-06, + "loss": 0.7273, + "step": 2601 + }, + { + "epoch": 0.5249159172505965, + "grad_norm": 0.5377501249313354, + "learning_rate": 4.838275730160675e-06, + "loss": 0.7661, + "step": 2602 + }, + { + "epoch": 0.5251176528068036, + "grad_norm": 0.4693327248096466, + "learning_rate": 4.835009747796951e-06, + "loss": 0.6407, + "step": 2603 + }, + { + "epoch": 0.5253193883630105, + "grad_norm": 0.9355971813201904, + "learning_rate": 4.831743835904117e-06, + "loss": 0.6412, + "step": 2604 + }, + { + "epoch": 0.5255211239192176, + "grad_norm": 0.6056692600250244, + "learning_rate": 4.828477995877117e-06, + "loss": 0.6361, + "step": 2605 + }, + { + "epoch": 0.5257228594754245, + "grad_norm": 0.4959450364112854, + "learning_rate": 4.825212229110864e-06, + "loss": 0.8288, + "step": 2606 + }, + { + "epoch": 0.5259245950316315, + "grad_norm": 0.8087174892425537, + "learning_rate": 4.821946537000234e-06, + "loss": 0.7336, + "step": 2607 + }, + { + "epoch": 0.5261263305878385, + "grad_norm": 0.6815080642700195, + "learning_rate": 4.818680920940074e-06, + "loss": 0.6745, + "step": 2608 + }, + { + "epoch": 0.5263280661440455, + "grad_norm": 0.4571034908294678, + "learning_rate": 4.815415382325202e-06, + "loss": 0.713, + "step": 2609 + }, + { + "epoch": 0.5265298017002524, + "grad_norm": 0.361689031124115, + "learning_rate": 4.8121499225503974e-06, + "loss": 0.6651, + "step": 2610 + }, + { + "epoch": 0.5267315372564595, + "grad_norm": 0.7336861491203308, + "learning_rate": 4.808884543010409e-06, + "loss": 0.6711, + "step": 2611 + }, + { + "epoch": 0.5269332728126664, + "grad_norm": 0.6097553968429565, + "learning_rate": 4.805619245099953e-06, + "loss": 0.694, + "step": 2612 + }, + { + "epoch": 0.5271350083688735, + "grad_norm": 0.3593609035015106, + "learning_rate": 4.802354030213704e-06, + "loss": 0.6593, + "step": 2613 + }, + { + "epoch": 0.5273367439250805, + "grad_norm": 0.5035977363586426, + "learning_rate": 4.7990888997463106e-06, + "loss": 0.7097, + "step": 2614 + }, + { + "epoch": 0.5275384794812874, + "grad_norm": 0.3543822169303894, + "learning_rate": 4.795823855092379e-06, + "loss": 0.7509, + "step": 2615 + }, + { + "epoch": 0.5277402150374945, + "grad_norm": 0.6724951863288879, + "learning_rate": 4.792558897646477e-06, + "loss": 0.672, + "step": 2616 + }, + { + "epoch": 0.5279419505937014, + "grad_norm": 0.35657966136932373, + "learning_rate": 4.789294028803141e-06, + "loss": 0.6623, + "step": 2617 + }, + { + "epoch": 0.5281436861499085, + "grad_norm": 0.6644230484962463, + "learning_rate": 4.786029249956866e-06, + "loss": 0.6991, + "step": 2618 + }, + { + "epoch": 0.5283454217061154, + "grad_norm": 0.40029600262641907, + "learning_rate": 4.78276456250211e-06, + "loss": 0.6563, + "step": 2619 + }, + { + "epoch": 0.5285471572623224, + "grad_norm": 0.44226789474487305, + "learning_rate": 4.779499967833292e-06, + "loss": 0.6879, + "step": 2620 + }, + { + "epoch": 0.5287488928185294, + "grad_norm": 0.4395109713077545, + "learning_rate": 4.776235467344789e-06, + "loss": 0.7705, + "step": 2621 + }, + { + "epoch": 0.5289506283747364, + "grad_norm": 0.41959723830223083, + "learning_rate": 4.772971062430941e-06, + "loss": 0.6486, + "step": 2622 + }, + { + "epoch": 0.5291523639309434, + "grad_norm": 0.5506021976470947, + "learning_rate": 4.769706754486046e-06, + "loss": 0.6839, + "step": 2623 + }, + { + "epoch": 0.5293540994871504, + "grad_norm": 0.6641210913658142, + "learning_rate": 4.766442544904357e-06, + "loss": 0.7063, + "step": 2624 + }, + { + "epoch": 0.5295558350433573, + "grad_norm": 0.9038758277893066, + "learning_rate": 4.763178435080091e-06, + "loss": 0.7615, + "step": 2625 + }, + { + "epoch": 0.5297575705995644, + "grad_norm": 0.3876363933086395, + "learning_rate": 4.759914426407422e-06, + "loss": 0.6912, + "step": 2626 + }, + { + "epoch": 0.5299593061557714, + "grad_norm": 0.3626331090927124, + "learning_rate": 4.7566505202804736e-06, + "loss": 0.6377, + "step": 2627 + }, + { + "epoch": 0.5301610417119783, + "grad_norm": 8.561905860900879, + "learning_rate": 4.7533867180933324e-06, + "loss": 0.8409, + "step": 2628 + }, + { + "epoch": 0.5303627772681854, + "grad_norm": 0.750117838382721, + "learning_rate": 4.75012302124004e-06, + "loss": 0.8042, + "step": 2629 + }, + { + "epoch": 0.5305645128243923, + "grad_norm": 0.4639262557029724, + "learning_rate": 4.746859431114589e-06, + "loss": 0.8446, + "step": 2630 + }, + { + "epoch": 0.5307662483805994, + "grad_norm": 0.44851747155189514, + "learning_rate": 4.743595949110934e-06, + "loss": 0.8183, + "step": 2631 + }, + { + "epoch": 0.5309679839368063, + "grad_norm": 0.4516337811946869, + "learning_rate": 4.7403325766229705e-06, + "loss": 0.7315, + "step": 2632 + }, + { + "epoch": 0.5311697194930133, + "grad_norm": 0.6819981932640076, + "learning_rate": 4.737069315044562e-06, + "loss": 0.7033, + "step": 2633 + }, + { + "epoch": 0.5313714550492203, + "grad_norm": 0.4693869948387146, + "learning_rate": 4.7338061657695164e-06, + "loss": 0.7256, + "step": 2634 + }, + { + "epoch": 0.5315731906054273, + "grad_norm": 0.5674824714660645, + "learning_rate": 4.730543130191594e-06, + "loss": 0.7296, + "step": 2635 + }, + { + "epoch": 0.5317749261616344, + "grad_norm": 0.3459540009498596, + "learning_rate": 4.727280209704507e-06, + "loss": 0.659, + "step": 2636 + }, + { + "epoch": 0.5319766617178413, + "grad_norm": 0.5303491353988647, + "learning_rate": 4.7240174057019205e-06, + "loss": 0.6834, + "step": 2637 + }, + { + "epoch": 0.5321783972740483, + "grad_norm": 1.3827285766601562, + "learning_rate": 4.720754719577448e-06, + "loss": 0.7446, + "step": 2638 + }, + { + "epoch": 0.5323801328302553, + "grad_norm": 0.5295196175575256, + "learning_rate": 4.717492152724652e-06, + "loss": 0.8191, + "step": 2639 + }, + { + "epoch": 0.5325818683864623, + "grad_norm": 0.7308456301689148, + "learning_rate": 4.714229706537048e-06, + "loss": 0.6954, + "step": 2640 + }, + { + "epoch": 0.5327836039426693, + "grad_norm": 0.5110881924629211, + "learning_rate": 4.710967382408094e-06, + "loss": 0.6741, + "step": 2641 + }, + { + "epoch": 0.5329853394988763, + "grad_norm": 0.48731887340545654, + "learning_rate": 4.707705181731202e-06, + "loss": 0.7082, + "step": 2642 + }, + { + "epoch": 0.5331870750550832, + "grad_norm": 0.35509830713272095, + "learning_rate": 4.7044431058997245e-06, + "loss": 0.6748, + "step": 2643 + }, + { + "epoch": 0.5333888106112903, + "grad_norm": 0.7154707312583923, + "learning_rate": 4.701181156306965e-06, + "loss": 0.6625, + "step": 2644 + }, + { + "epoch": 0.5335905461674972, + "grad_norm": 0.38529646396636963, + "learning_rate": 4.697919334346177e-06, + "loss": 0.8088, + "step": 2645 + }, + { + "epoch": 0.5337922817237043, + "grad_norm": 0.45116767287254333, + "learning_rate": 4.6946576414105485e-06, + "loss": 0.7078, + "step": 2646 + }, + { + "epoch": 0.5339940172799112, + "grad_norm": 0.43628716468811035, + "learning_rate": 4.691396078893223e-06, + "loss": 0.6913, + "step": 2647 + }, + { + "epoch": 0.5341957528361182, + "grad_norm": 2.0100085735321045, + "learning_rate": 4.6881346481872844e-06, + "loss": 0.6698, + "step": 2648 + }, + { + "epoch": 0.5343974883923253, + "grad_norm": 1.209532380104065, + "learning_rate": 4.684873350685758e-06, + "loss": 0.7494, + "step": 2649 + }, + { + "epoch": 0.5345992239485322, + "grad_norm": 0.6590553522109985, + "learning_rate": 4.681612187781614e-06, + "loss": 0.6414, + "step": 2650 + }, + { + "epoch": 0.5348009595047392, + "grad_norm": 0.7591909170150757, + "learning_rate": 4.678351160867769e-06, + "loss": 0.6677, + "step": 2651 + }, + { + "epoch": 0.5350026950609462, + "grad_norm": 0.5568974614143372, + "learning_rate": 4.675090271337072e-06, + "loss": 0.7136, + "step": 2652 + }, + { + "epoch": 0.5352044306171532, + "grad_norm": 1.6308541297912598, + "learning_rate": 4.6718295205823235e-06, + "loss": 0.769, + "step": 2653 + }, + { + "epoch": 0.5354061661733602, + "grad_norm": 0.45534488558769226, + "learning_rate": 4.668568909996263e-06, + "loss": 0.735, + "step": 2654 + }, + { + "epoch": 0.5356079017295672, + "grad_norm": 0.4230724275112152, + "learning_rate": 4.66530844097156e-06, + "loss": 0.7746, + "step": 2655 + }, + { + "epoch": 0.5358096372857741, + "grad_norm": 1.834598183631897, + "learning_rate": 4.662048114900837e-06, + "loss": 0.6822, + "step": 2656 + }, + { + "epoch": 0.5360113728419812, + "grad_norm": 0.5896740555763245, + "learning_rate": 4.6587879331766465e-06, + "loss": 0.6524, + "step": 2657 + }, + { + "epoch": 0.5362131083981881, + "grad_norm": 0.9899855852127075, + "learning_rate": 4.655527897191482e-06, + "loss": 0.8368, + "step": 2658 + }, + { + "epoch": 0.5364148439543952, + "grad_norm": 1.0821828842163086, + "learning_rate": 4.652268008337779e-06, + "loss": 0.8366, + "step": 2659 + }, + { + "epoch": 0.5366165795106022, + "grad_norm": 0.3686734139919281, + "learning_rate": 4.649008268007903e-06, + "loss": 0.6939, + "step": 2660 + }, + { + "epoch": 0.5368183150668091, + "grad_norm": 1.32119619846344, + "learning_rate": 4.64574867759416e-06, + "loss": 0.6831, + "step": 2661 + }, + { + "epoch": 0.5370200506230162, + "grad_norm": 0.6293189525604248, + "learning_rate": 4.642489238488794e-06, + "loss": 0.6749, + "step": 2662 + }, + { + "epoch": 0.5372217861792231, + "grad_norm": 0.7143060564994812, + "learning_rate": 4.639229952083976e-06, + "loss": 0.6889, + "step": 2663 + }, + { + "epoch": 0.5374235217354302, + "grad_norm": 0.4893825352191925, + "learning_rate": 4.635970819771822e-06, + "loss": 0.6325, + "step": 2664 + }, + { + "epoch": 0.5376252572916371, + "grad_norm": 0.44156011939048767, + "learning_rate": 4.632711842944377e-06, + "loss": 0.6458, + "step": 2665 + }, + { + "epoch": 0.5378269928478441, + "grad_norm": 0.39792585372924805, + "learning_rate": 4.629453022993618e-06, + "loss": 0.7571, + "step": 2666 + }, + { + "epoch": 0.5380287284040511, + "grad_norm": 0.3987579643726349, + "learning_rate": 4.626194361311459e-06, + "loss": 0.7626, + "step": 2667 + }, + { + "epoch": 0.5382304639602581, + "grad_norm": 1.033350944519043, + "learning_rate": 4.622935859289745e-06, + "loss": 0.8932, + "step": 2668 + }, + { + "epoch": 0.538432199516465, + "grad_norm": 0.35241934657096863, + "learning_rate": 4.619677518320252e-06, + "loss": 0.846, + "step": 2669 + }, + { + "epoch": 0.5386339350726721, + "grad_norm": 0.35918742418289185, + "learning_rate": 4.616419339794689e-06, + "loss": 0.6199, + "step": 2670 + }, + { + "epoch": 0.538835670628879, + "grad_norm": 0.774231493473053, + "learning_rate": 4.613161325104691e-06, + "loss": 0.6777, + "step": 2671 + }, + { + "epoch": 0.5390374061850861, + "grad_norm": 0.6578600406646729, + "learning_rate": 4.609903475641827e-06, + "loss": 0.7652, + "step": 2672 + }, + { + "epoch": 0.5392391417412931, + "grad_norm": 0.3553310036659241, + "learning_rate": 4.606645792797599e-06, + "loss": 0.675, + "step": 2673 + }, + { + "epoch": 0.5394408772975, + "grad_norm": 0.7471398711204529, + "learning_rate": 4.603388277963428e-06, + "loss": 0.6971, + "step": 2674 + }, + { + "epoch": 0.5396426128537071, + "grad_norm": 0.37425556778907776, + "learning_rate": 4.600130932530671e-06, + "loss": 0.6655, + "step": 2675 + }, + { + "epoch": 0.539844348409914, + "grad_norm": 0.5239168405532837, + "learning_rate": 4.596873757890612e-06, + "loss": 0.6452, + "step": 2676 + }, + { + "epoch": 0.5400460839661211, + "grad_norm": 0.6423122882843018, + "learning_rate": 4.593616755434458e-06, + "loss": 0.7154, + "step": 2677 + }, + { + "epoch": 0.540247819522328, + "grad_norm": 0.9147586822509766, + "learning_rate": 4.590359926553346e-06, + "loss": 0.6945, + "step": 2678 + }, + { + "epoch": 0.540449555078535, + "grad_norm": 0.5066952109336853, + "learning_rate": 4.587103272638339e-06, + "loss": 0.9241, + "step": 2679 + }, + { + "epoch": 0.540651290634742, + "grad_norm": 0.5868780612945557, + "learning_rate": 4.583846795080422e-06, + "loss": 0.6798, + "step": 2680 + }, + { + "epoch": 0.540853026190949, + "grad_norm": 0.3115403950214386, + "learning_rate": 4.580590495270507e-06, + "loss": 0.8002, + "step": 2681 + }, + { + "epoch": 0.541054761747156, + "grad_norm": 1.4777621030807495, + "learning_rate": 4.577334374599433e-06, + "loss": 0.6667, + "step": 2682 + }, + { + "epoch": 0.541256497303363, + "grad_norm": 0.6413019895553589, + "learning_rate": 4.574078434457955e-06, + "loss": 0.7311, + "step": 2683 + }, + { + "epoch": 0.54145823285957, + "grad_norm": 0.40882059931755066, + "learning_rate": 4.570822676236758e-06, + "loss": 0.6541, + "step": 2684 + }, + { + "epoch": 0.541659968415777, + "grad_norm": 0.6062789559364319, + "learning_rate": 4.567567101326444e-06, + "loss": 0.6362, + "step": 2685 + }, + { + "epoch": 0.541861703971984, + "grad_norm": 0.42461323738098145, + "learning_rate": 4.5643117111175405e-06, + "loss": 0.673, + "step": 2686 + }, + { + "epoch": 0.5420634395281909, + "grad_norm": 0.9184004068374634, + "learning_rate": 4.5610565070004965e-06, + "loss": 0.656, + "step": 2687 + }, + { + "epoch": 0.542265175084398, + "grad_norm": 0.4189847409725189, + "learning_rate": 4.557801490365678e-06, + "loss": 0.6666, + "step": 2688 + }, + { + "epoch": 0.5424669106406049, + "grad_norm": 0.4108622074127197, + "learning_rate": 4.5545466626033715e-06, + "loss": 0.6443, + "step": 2689 + }, + { + "epoch": 0.542668646196812, + "grad_norm": 0.8937840461730957, + "learning_rate": 4.551292025103789e-06, + "loss": 0.708, + "step": 2690 + }, + { + "epoch": 0.5428703817530189, + "grad_norm": 0.5205972790718079, + "learning_rate": 4.548037579257051e-06, + "loss": 0.6702, + "step": 2691 + }, + { + "epoch": 0.5430721173092259, + "grad_norm": 0.6844760775566101, + "learning_rate": 4.5447833264532015e-06, + "loss": 0.6735, + "step": 2692 + }, + { + "epoch": 0.543273852865433, + "grad_norm": 0.8154087662696838, + "learning_rate": 4.541529268082208e-06, + "loss": 0.6228, + "step": 2693 + }, + { + "epoch": 0.5434755884216399, + "grad_norm": 0.631812572479248, + "learning_rate": 4.538275405533943e-06, + "loss": 0.7052, + "step": 2694 + }, + { + "epoch": 0.543677323977847, + "grad_norm": 0.7014383673667908, + "learning_rate": 4.535021740198202e-06, + "loss": 0.7789, + "step": 2695 + }, + { + "epoch": 0.5438790595340539, + "grad_norm": 0.7482444643974304, + "learning_rate": 4.531768273464699e-06, + "loss": 0.8676, + "step": 2696 + }, + { + "epoch": 0.5440807950902609, + "grad_norm": 0.42839550971984863, + "learning_rate": 4.5285150067230565e-06, + "loss": 0.7038, + "step": 2697 + }, + { + "epoch": 0.5442825306464679, + "grad_norm": 0.4500325620174408, + "learning_rate": 4.525261941362818e-06, + "loss": 0.6795, + "step": 2698 + }, + { + "epoch": 0.5444842662026749, + "grad_norm": 5.416876792907715, + "learning_rate": 4.522009078773433e-06, + "loss": 0.714, + "step": 2699 + }, + { + "epoch": 0.5446860017588819, + "grad_norm": 0.491653710603714, + "learning_rate": 4.518756420344272e-06, + "loss": 0.7071, + "step": 2700 + }, + { + "epoch": 0.5448877373150889, + "grad_norm": 0.35450485348701477, + "learning_rate": 4.515503967464619e-06, + "loss": 0.6644, + "step": 2701 + }, + { + "epoch": 0.5450894728712958, + "grad_norm": 0.4445685148239136, + "learning_rate": 4.512251721523659e-06, + "loss": 0.6412, + "step": 2702 + }, + { + "epoch": 0.5452912084275029, + "grad_norm": 0.8250188827514648, + "learning_rate": 4.508999683910503e-06, + "loss": 0.6637, + "step": 2703 + }, + { + "epoch": 0.5454929439837098, + "grad_norm": 0.38135024905204773, + "learning_rate": 4.505747856014163e-06, + "loss": 0.698, + "step": 2704 + }, + { + "epoch": 0.5456946795399168, + "grad_norm": 0.35442817211151123, + "learning_rate": 4.502496239223566e-06, + "loss": 0.6717, + "step": 2705 + }, + { + "epoch": 0.5458964150961239, + "grad_norm": 0.5252602100372314, + "learning_rate": 4.499244834927547e-06, + "loss": 0.8047, + "step": 2706 + }, + { + "epoch": 0.5460981506523308, + "grad_norm": 0.40941813588142395, + "learning_rate": 4.495993644514851e-06, + "loss": 0.7421, + "step": 2707 + }, + { + "epoch": 0.5462998862085379, + "grad_norm": 0.4588121175765991, + "learning_rate": 4.492742669374133e-06, + "loss": 0.6431, + "step": 2708 + }, + { + "epoch": 0.5465016217647448, + "grad_norm": 0.4406556487083435, + "learning_rate": 4.489491910893951e-06, + "loss": 0.7046, + "step": 2709 + }, + { + "epoch": 0.5467033573209518, + "grad_norm": 0.7718446850776672, + "learning_rate": 4.486241370462779e-06, + "loss": 0.6692, + "step": 2710 + }, + { + "epoch": 0.5469050928771588, + "grad_norm": 0.38533902168273926, + "learning_rate": 4.482991049468989e-06, + "loss": 0.8709, + "step": 2711 + }, + { + "epoch": 0.5471068284333658, + "grad_norm": 0.8942533135414124, + "learning_rate": 4.479740949300864e-06, + "loss": 0.6439, + "step": 2712 + }, + { + "epoch": 0.5473085639895728, + "grad_norm": 0.5363048315048218, + "learning_rate": 4.476491071346591e-06, + "loss": 0.7975, + "step": 2713 + }, + { + "epoch": 0.5475102995457798, + "grad_norm": 0.5082899928092957, + "learning_rate": 4.473241416994265e-06, + "loss": 0.7761, + "step": 2714 + }, + { + "epoch": 0.5477120351019867, + "grad_norm": 0.4341282844543457, + "learning_rate": 4.469991987631883e-06, + "loss": 0.6799, + "step": 2715 + }, + { + "epoch": 0.5479137706581938, + "grad_norm": 0.4084181785583496, + "learning_rate": 4.466742784647344e-06, + "loss": 0.7703, + "step": 2716 + }, + { + "epoch": 0.5481155062144007, + "grad_norm": 0.4442834258079529, + "learning_rate": 4.463493809428454e-06, + "loss": 0.6482, + "step": 2717 + }, + { + "epoch": 0.5483172417706078, + "grad_norm": 0.5705946087837219, + "learning_rate": 4.460245063362925e-06, + "loss": 0.6905, + "step": 2718 + }, + { + "epoch": 0.5485189773268148, + "grad_norm": 0.30586788058280945, + "learning_rate": 4.456996547838358e-06, + "loss": 0.6811, + "step": 2719 + }, + { + "epoch": 0.5487207128830217, + "grad_norm": 0.8047022223472595, + "learning_rate": 4.4537482642422675e-06, + "loss": 0.6551, + "step": 2720 + }, + { + "epoch": 0.5489224484392288, + "grad_norm": 0.8891247510910034, + "learning_rate": 4.450500213962069e-06, + "loss": 0.7088, + "step": 2721 + }, + { + "epoch": 0.5491241839954357, + "grad_norm": 1.040446400642395, + "learning_rate": 4.447252398385071e-06, + "loss": 0.933, + "step": 2722 + }, + { + "epoch": 0.5493259195516427, + "grad_norm": 0.3285609483718872, + "learning_rate": 4.444004818898484e-06, + "loss": 0.713, + "step": 2723 + }, + { + "epoch": 0.5495276551078497, + "grad_norm": 0.3531717360019684, + "learning_rate": 4.440757476889424e-06, + "loss": 0.6311, + "step": 2724 + }, + { + "epoch": 0.5497293906640567, + "grad_norm": 0.35809552669525146, + "learning_rate": 4.437510373744897e-06, + "loss": 0.6997, + "step": 2725 + }, + { + "epoch": 0.5499311262202637, + "grad_norm": 0.5928544998168945, + "learning_rate": 4.4342635108518145e-06, + "loss": 1.145, + "step": 2726 + }, + { + "epoch": 0.5501328617764707, + "grad_norm": 1.4170804023742676, + "learning_rate": 4.4310168895969755e-06, + "loss": 0.8147, + "step": 2727 + }, + { + "epoch": 0.5503345973326776, + "grad_norm": 0.4764517545700073, + "learning_rate": 4.427770511367087e-06, + "loss": 0.6892, + "step": 2728 + }, + { + "epoch": 0.5505363328888847, + "grad_norm": 0.5142654180526733, + "learning_rate": 4.424524377548747e-06, + "loss": 0.8374, + "step": 2729 + }, + { + "epoch": 0.5507380684450917, + "grad_norm": 0.4085748493671417, + "learning_rate": 4.421278489528447e-06, + "loss": 0.8751, + "step": 2730 + }, + { + "epoch": 0.5509398040012987, + "grad_norm": 1.015724539756775, + "learning_rate": 4.418032848692575e-06, + "loss": 0.6493, + "step": 2731 + }, + { + "epoch": 0.5511415395575057, + "grad_norm": 0.4627054035663605, + "learning_rate": 4.414787456427419e-06, + "loss": 0.8622, + "step": 2732 + }, + { + "epoch": 0.5513432751137126, + "grad_norm": 0.35189080238342285, + "learning_rate": 4.41154231411915e-06, + "loss": 0.6559, + "step": 2733 + }, + { + "epoch": 0.5515450106699197, + "grad_norm": 0.4287737011909485, + "learning_rate": 4.408297423153841e-06, + "loss": 0.6751, + "step": 2734 + }, + { + "epoch": 0.5517467462261266, + "grad_norm": 0.7251502871513367, + "learning_rate": 4.4050527849174555e-06, + "loss": 0.7138, + "step": 2735 + }, + { + "epoch": 0.5519484817823337, + "grad_norm": 1.1452211141586304, + "learning_rate": 4.4018084007958475e-06, + "loss": 0.7997, + "step": 2736 + }, + { + "epoch": 0.5521502173385406, + "grad_norm": 0.4358304738998413, + "learning_rate": 4.398564272174764e-06, + "loss": 0.7486, + "step": 2737 + }, + { + "epoch": 0.5523519528947476, + "grad_norm": 0.4466962516307831, + "learning_rate": 4.3953204004398434e-06, + "loss": 0.6605, + "step": 2738 + }, + { + "epoch": 0.5525536884509546, + "grad_norm": 0.330139696598053, + "learning_rate": 4.392076786976609e-06, + "loss": 0.6841, + "step": 2739 + }, + { + "epoch": 0.5527554240071616, + "grad_norm": 0.3641626536846161, + "learning_rate": 4.388833433170482e-06, + "loss": 0.6266, + "step": 2740 + }, + { + "epoch": 0.5529571595633687, + "grad_norm": 0.3895452916622162, + "learning_rate": 4.3855903404067665e-06, + "loss": 0.6705, + "step": 2741 + }, + { + "epoch": 0.5531588951195756, + "grad_norm": 0.3152562379837036, + "learning_rate": 4.382347510070659e-06, + "loss": 0.6698, + "step": 2742 + }, + { + "epoch": 0.5533606306757826, + "grad_norm": 0.41445258259773254, + "learning_rate": 4.379104943547242e-06, + "loss": 0.7837, + "step": 2743 + }, + { + "epoch": 0.5535623662319896, + "grad_norm": 0.3514803647994995, + "learning_rate": 4.3758626422214836e-06, + "loss": 0.6789, + "step": 2744 + }, + { + "epoch": 0.5537641017881966, + "grad_norm": 0.3821863532066345, + "learning_rate": 4.372620607478242e-06, + "loss": 0.8194, + "step": 2745 + }, + { + "epoch": 0.5539658373444035, + "grad_norm": 0.46488285064697266, + "learning_rate": 4.369378840702263e-06, + "loss": 0.6812, + "step": 2746 + }, + { + "epoch": 0.5541675729006106, + "grad_norm": 0.7570337653160095, + "learning_rate": 4.366137343278168e-06, + "loss": 0.6632, + "step": 2747 + }, + { + "epoch": 0.5543693084568175, + "grad_norm": 0.5926573872566223, + "learning_rate": 4.362896116590475e-06, + "loss": 0.9003, + "step": 2748 + }, + { + "epoch": 0.5545710440130246, + "grad_norm": 0.6851587891578674, + "learning_rate": 4.359655162023585e-06, + "loss": 0.8732, + "step": 2749 + }, + { + "epoch": 0.5547727795692315, + "grad_norm": 0.44598305225372314, + "learning_rate": 4.356414480961773e-06, + "loss": 1.022, + "step": 2750 + }, + { + "epoch": 0.5549745151254385, + "grad_norm": 0.44037872552871704, + "learning_rate": 4.353174074789207e-06, + "loss": 0.6514, + "step": 2751 + }, + { + "epoch": 0.5551762506816456, + "grad_norm": 0.5381491184234619, + "learning_rate": 4.349933944889934e-06, + "loss": 0.6869, + "step": 2752 + }, + { + "epoch": 0.5553779862378525, + "grad_norm": 1.0382646322250366, + "learning_rate": 4.346694092647883e-06, + "loss": 0.6822, + "step": 2753 + }, + { + "epoch": 0.5555797217940596, + "grad_norm": 0.8446259498596191, + "learning_rate": 4.343454519446866e-06, + "loss": 0.7246, + "step": 2754 + }, + { + "epoch": 0.5557814573502665, + "grad_norm": 0.3245503902435303, + "learning_rate": 4.340215226670572e-06, + "loss": 0.7802, + "step": 2755 + }, + { + "epoch": 0.5559831929064735, + "grad_norm": 0.41590356826782227, + "learning_rate": 4.336976215702574e-06, + "loss": 0.6587, + "step": 2756 + }, + { + "epoch": 0.5561849284626805, + "grad_norm": 0.478512167930603, + "learning_rate": 4.333737487926326e-06, + "loss": 0.6288, + "step": 2757 + }, + { + "epoch": 0.5563866640188875, + "grad_norm": 0.3754807114601135, + "learning_rate": 4.330499044725154e-06, + "loss": 0.6632, + "step": 2758 + }, + { + "epoch": 0.5565883995750945, + "grad_norm": 0.34584978222846985, + "learning_rate": 4.327260887482269e-06, + "loss": 0.6748, + "step": 2759 + }, + { + "epoch": 0.5567901351313015, + "grad_norm": 0.45819729566574097, + "learning_rate": 4.324023017580759e-06, + "loss": 0.701, + "step": 2760 + }, + { + "epoch": 0.5569918706875084, + "grad_norm": 0.6208237409591675, + "learning_rate": 4.320785436403585e-06, + "loss": 0.6867, + "step": 2761 + }, + { + "epoch": 0.5571936062437155, + "grad_norm": 0.4650062322616577, + "learning_rate": 4.31754814533359e-06, + "loss": 0.6537, + "step": 2762 + }, + { + "epoch": 0.5573953417999225, + "grad_norm": 0.5537533760070801, + "learning_rate": 4.3143111457534905e-06, + "loss": 0.6722, + "step": 2763 + }, + { + "epoch": 0.5575970773561294, + "grad_norm": 0.8163173794746399, + "learning_rate": 4.311074439045878e-06, + "loss": 0.714, + "step": 2764 + }, + { + "epoch": 0.5577988129123365, + "grad_norm": 0.5011228322982788, + "learning_rate": 4.30783802659322e-06, + "loss": 0.6721, + "step": 2765 + }, + { + "epoch": 0.5580005484685434, + "grad_norm": 0.4537436068058014, + "learning_rate": 4.30460190977786e-06, + "loss": 0.662, + "step": 2766 + }, + { + "epoch": 0.5582022840247505, + "grad_norm": 0.5868704915046692, + "learning_rate": 4.301366089982009e-06, + "loss": 0.7899, + "step": 2767 + }, + { + "epoch": 0.5584040195809574, + "grad_norm": 0.309226930141449, + "learning_rate": 4.29813056858776e-06, + "loss": 0.6806, + "step": 2768 + }, + { + "epoch": 0.5586057551371644, + "grad_norm": 0.4315721392631531, + "learning_rate": 4.2948953469770695e-06, + "loss": 0.8418, + "step": 2769 + }, + { + "epoch": 0.5588074906933714, + "grad_norm": 0.5131277441978455, + "learning_rate": 4.291660426531773e-06, + "loss": 0.7035, + "step": 2770 + }, + { + "epoch": 0.5590092262495784, + "grad_norm": 0.44210201501846313, + "learning_rate": 4.2884258086335755e-06, + "loss": 0.6561, + "step": 2771 + }, + { + "epoch": 0.5592109618057854, + "grad_norm": 0.49398237466812134, + "learning_rate": 4.285191494664049e-06, + "loss": 0.6542, + "step": 2772 + }, + { + "epoch": 0.5594126973619924, + "grad_norm": 0.515116274356842, + "learning_rate": 4.281957486004642e-06, + "loss": 0.6596, + "step": 2773 + }, + { + "epoch": 0.5596144329181993, + "grad_norm": 0.5115038752555847, + "learning_rate": 4.278723784036667e-06, + "loss": 0.739, + "step": 2774 + }, + { + "epoch": 0.5598161684744064, + "grad_norm": 0.6491000056266785, + "learning_rate": 4.275490390141309e-06, + "loss": 0.6157, + "step": 2775 + }, + { + "epoch": 0.5600179040306134, + "grad_norm": 0.33408376574516296, + "learning_rate": 4.272257305699619e-06, + "loss": 0.7125, + "step": 2776 + }, + { + "epoch": 0.5602196395868204, + "grad_norm": 0.40321362018585205, + "learning_rate": 4.26902453209252e-06, + "loss": 0.6438, + "step": 2777 + }, + { + "epoch": 0.5604213751430274, + "grad_norm": 0.5445034503936768, + "learning_rate": 4.265792070700796e-06, + "loss": 0.669, + "step": 2778 + }, + { + "epoch": 0.5606231106992343, + "grad_norm": 0.43536266684532166, + "learning_rate": 4.262559922905101e-06, + "loss": 0.6817, + "step": 2779 + }, + { + "epoch": 0.5608248462554414, + "grad_norm": 0.39567360281944275, + "learning_rate": 4.259328090085958e-06, + "loss": 0.7887, + "step": 2780 + }, + { + "epoch": 0.5610265818116483, + "grad_norm": 0.8040300607681274, + "learning_rate": 4.256096573623748e-06, + "loss": 0.7735, + "step": 2781 + }, + { + "epoch": 0.5612283173678553, + "grad_norm": 0.5638187527656555, + "learning_rate": 4.252865374898726e-06, + "loss": 0.8225, + "step": 2782 + }, + { + "epoch": 0.5614300529240623, + "grad_norm": 0.39516234397888184, + "learning_rate": 4.249634495291004e-06, + "loss": 0.6774, + "step": 2783 + }, + { + "epoch": 0.5616317884802693, + "grad_norm": 0.33776554465293884, + "learning_rate": 4.24640393618056e-06, + "loss": 0.6498, + "step": 2784 + }, + { + "epoch": 0.5618335240364764, + "grad_norm": 0.43080976605415344, + "learning_rate": 4.243173698947238e-06, + "loss": 0.6616, + "step": 2785 + }, + { + "epoch": 0.5620352595926833, + "grad_norm": 0.36215585470199585, + "learning_rate": 4.239943784970738e-06, + "loss": 0.6085, + "step": 2786 + }, + { + "epoch": 0.5622369951488903, + "grad_norm": 0.40813156962394714, + "learning_rate": 4.236714195630627e-06, + "loss": 0.7525, + "step": 2787 + }, + { + "epoch": 0.5624387307050973, + "grad_norm": 0.5586144924163818, + "learning_rate": 4.233484932306337e-06, + "loss": 0.8477, + "step": 2788 + }, + { + "epoch": 0.5626404662613043, + "grad_norm": 0.32251301407814026, + "learning_rate": 4.23025599637715e-06, + "loss": 0.7107, + "step": 2789 + }, + { + "epoch": 0.5628422018175113, + "grad_norm": 0.29961180686950684, + "learning_rate": 4.227027389222215e-06, + "loss": 0.9627, + "step": 2790 + }, + { + "epoch": 0.5630439373737183, + "grad_norm": 0.7028844952583313, + "learning_rate": 4.223799112220543e-06, + "loss": 0.6415, + "step": 2791 + }, + { + "epoch": 0.5632456729299252, + "grad_norm": 0.3743568956851959, + "learning_rate": 4.2205711667509986e-06, + "loss": 0.6619, + "step": 2792 + }, + { + "epoch": 0.5634474084861323, + "grad_norm": 0.348414808511734, + "learning_rate": 4.217343554192308e-06, + "loss": 0.6367, + "step": 2793 + }, + { + "epoch": 0.5636491440423392, + "grad_norm": 0.42004287242889404, + "learning_rate": 4.214116275923051e-06, + "loss": 0.6912, + "step": 2794 + }, + { + "epoch": 0.5638508795985463, + "grad_norm": 0.450578510761261, + "learning_rate": 4.210889333321668e-06, + "loss": 0.7102, + "step": 2795 + }, + { + "epoch": 0.5640526151547532, + "grad_norm": 0.36887484788894653, + "learning_rate": 4.207662727766462e-06, + "loss": 0.7953, + "step": 2796 + }, + { + "epoch": 0.5642543507109602, + "grad_norm": 0.47906526923179626, + "learning_rate": 4.204436460635578e-06, + "loss": 0.6876, + "step": 2797 + }, + { + "epoch": 0.5644560862671673, + "grad_norm": 0.3474639654159546, + "learning_rate": 4.201210533307028e-06, + "loss": 0.6823, + "step": 2798 + }, + { + "epoch": 0.5646578218233742, + "grad_norm": 1.1059390306472778, + "learning_rate": 4.1979849471586755e-06, + "loss": 0.6545, + "step": 2799 + }, + { + "epoch": 0.5648595573795812, + "grad_norm": 0.47281643748283386, + "learning_rate": 4.1947597035682355e-06, + "loss": 0.6701, + "step": 2800 + }, + { + "epoch": 0.5650612929357882, + "grad_norm": 0.4572257995605469, + "learning_rate": 4.191534803913281e-06, + "loss": 0.6721, + "step": 2801 + }, + { + "epoch": 0.5652630284919952, + "grad_norm": 0.39056044816970825, + "learning_rate": 4.188310249571236e-06, + "loss": 0.6467, + "step": 2802 + }, + { + "epoch": 0.5654647640482022, + "grad_norm": 0.600731611251831, + "learning_rate": 4.185086041919376e-06, + "loss": 0.8154, + "step": 2803 + }, + { + "epoch": 0.5656664996044092, + "grad_norm": 0.3447350561618805, + "learning_rate": 4.18186218233483e-06, + "loss": 0.7342, + "step": 2804 + }, + { + "epoch": 0.5658682351606161, + "grad_norm": 1.207696795463562, + "learning_rate": 4.178638672194582e-06, + "loss": 0.6334, + "step": 2805 + }, + { + "epoch": 0.5660699707168232, + "grad_norm": 0.6632784605026245, + "learning_rate": 4.1754155128754545e-06, + "loss": 0.6567, + "step": 2806 + }, + { + "epoch": 0.5662717062730301, + "grad_norm": 0.4278070032596588, + "learning_rate": 4.172192705754135e-06, + "loss": 0.617, + "step": 2807 + }, + { + "epoch": 0.5664734418292372, + "grad_norm": 0.9952390193939209, + "learning_rate": 4.168970252207151e-06, + "loss": 0.6974, + "step": 2808 + }, + { + "epoch": 0.5666751773854442, + "grad_norm": 0.358598917722702, + "learning_rate": 4.165748153610881e-06, + "loss": 0.7966, + "step": 2809 + }, + { + "epoch": 0.5668769129416511, + "grad_norm": 0.46342357993125916, + "learning_rate": 4.1625264113415564e-06, + "loss": 0.6444, + "step": 2810 + }, + { + "epoch": 0.5670786484978582, + "grad_norm": 0.34472960233688354, + "learning_rate": 4.159305026775249e-06, + "loss": 0.664, + "step": 2811 + }, + { + "epoch": 0.5672803840540651, + "grad_norm": 0.4725569188594818, + "learning_rate": 4.156084001287883e-06, + "loss": 0.6403, + "step": 2812 + }, + { + "epoch": 0.5674821196102722, + "grad_norm": 0.34548354148864746, + "learning_rate": 4.152863336255231e-06, + "loss": 0.6669, + "step": 2813 + }, + { + "epoch": 0.5676838551664791, + "grad_norm": 0.6782296299934387, + "learning_rate": 4.149643033052902e-06, + "loss": 0.7029, + "step": 2814 + }, + { + "epoch": 0.5678855907226861, + "grad_norm": 0.38759613037109375, + "learning_rate": 4.1464230930563595e-06, + "loss": 0.6613, + "step": 2815 + }, + { + "epoch": 0.5680873262788931, + "grad_norm": 0.37541335821151733, + "learning_rate": 4.143203517640914e-06, + "loss": 0.6692, + "step": 2816 + }, + { + "epoch": 0.5682890618351001, + "grad_norm": 0.33899807929992676, + "learning_rate": 4.1399843081817085e-06, + "loss": 0.6809, + "step": 2817 + }, + { + "epoch": 0.568490797391307, + "grad_norm": 0.36863020062446594, + "learning_rate": 4.136765466053741e-06, + "loss": 0.7819, + "step": 2818 + }, + { + "epoch": 0.5686925329475141, + "grad_norm": 0.5838175415992737, + "learning_rate": 4.133546992631847e-06, + "loss": 0.6702, + "step": 2819 + }, + { + "epoch": 0.568894268503721, + "grad_norm": 1.1761834621429443, + "learning_rate": 4.130328889290705e-06, + "loss": 0.8095, + "step": 2820 + }, + { + "epoch": 0.5690960040599281, + "grad_norm": 0.6819433569908142, + "learning_rate": 4.127111157404841e-06, + "loss": 0.7284, + "step": 2821 + }, + { + "epoch": 0.5692977396161351, + "grad_norm": 0.51947021484375, + "learning_rate": 4.1238937983486085e-06, + "loss": 0.6761, + "step": 2822 + }, + { + "epoch": 0.569499475172342, + "grad_norm": 0.40705233812332153, + "learning_rate": 4.120676813496219e-06, + "loss": 0.7013, + "step": 2823 + }, + { + "epoch": 0.5697012107285491, + "grad_norm": 0.3811868727207184, + "learning_rate": 4.117460204221715e-06, + "loss": 0.7444, + "step": 2824 + }, + { + "epoch": 0.569902946284756, + "grad_norm": 0.6068965196609497, + "learning_rate": 4.114243971898976e-06, + "loss": 0.6504, + "step": 2825 + }, + { + "epoch": 0.5701046818409631, + "grad_norm": 0.460427463054657, + "learning_rate": 4.111028117901726e-06, + "loss": 0.6779, + "step": 2826 + }, + { + "epoch": 0.57030641739717, + "grad_norm": 0.31617289781570435, + "learning_rate": 4.107812643603528e-06, + "loss": 0.6728, + "step": 2827 + }, + { + "epoch": 0.570508152953377, + "grad_norm": 0.48123976588249207, + "learning_rate": 4.104597550377776e-06, + "loss": 0.6909, + "step": 2828 + }, + { + "epoch": 0.570709888509584, + "grad_norm": 0.3782070577144623, + "learning_rate": 4.1013828395977075e-06, + "loss": 0.6668, + "step": 2829 + }, + { + "epoch": 0.570911624065791, + "grad_norm": 0.7894898653030396, + "learning_rate": 4.098168512636397e-06, + "loss": 0.8079, + "step": 2830 + }, + { + "epoch": 0.571113359621998, + "grad_norm": 1.8633605241775513, + "learning_rate": 4.094954570866748e-06, + "loss": 0.6558, + "step": 2831 + }, + { + "epoch": 0.571315095178205, + "grad_norm": 0.42616137862205505, + "learning_rate": 4.0917410156615085e-06, + "loss": 0.7138, + "step": 2832 + }, + { + "epoch": 0.571516830734412, + "grad_norm": 0.5962389707565308, + "learning_rate": 4.088527848393258e-06, + "loss": 0.6977, + "step": 2833 + }, + { + "epoch": 0.571718566290619, + "grad_norm": 0.4381890892982483, + "learning_rate": 4.085315070434405e-06, + "loss": 0.6546, + "step": 2834 + }, + { + "epoch": 0.571920301846826, + "grad_norm": 0.4706403613090515, + "learning_rate": 4.0821026831572e-06, + "loss": 0.6852, + "step": 2835 + }, + { + "epoch": 0.5721220374030329, + "grad_norm": 0.7240921854972839, + "learning_rate": 4.078890687933719e-06, + "loss": 0.6789, + "step": 2836 + }, + { + "epoch": 0.57232377295924, + "grad_norm": 0.8354261517524719, + "learning_rate": 4.075679086135877e-06, + "loss": 0.6875, + "step": 2837 + }, + { + "epoch": 0.5725255085154469, + "grad_norm": 0.6532930731773376, + "learning_rate": 4.07246787913542e-06, + "loss": 0.699, + "step": 2838 + }, + { + "epoch": 0.572727244071654, + "grad_norm": 0.45959559082984924, + "learning_rate": 4.06925706830392e-06, + "loss": 0.6808, + "step": 2839 + }, + { + "epoch": 0.5729289796278609, + "grad_norm": 0.461574912071228, + "learning_rate": 4.066046655012786e-06, + "loss": 0.6926, + "step": 2840 + }, + { + "epoch": 0.5731307151840679, + "grad_norm": 0.33318328857421875, + "learning_rate": 4.062836640633256e-06, + "loss": 0.7878, + "step": 2841 + }, + { + "epoch": 0.573332450740275, + "grad_norm": 0.4128797948360443, + "learning_rate": 4.05962702653639e-06, + "loss": 0.6467, + "step": 2842 + }, + { + "epoch": 0.5735341862964819, + "grad_norm": 0.41282278299331665, + "learning_rate": 4.056417814093089e-06, + "loss": 0.6423, + "step": 2843 + }, + { + "epoch": 0.573735921852689, + "grad_norm": 0.38421866297721863, + "learning_rate": 4.053209004674079e-06, + "loss": 0.6681, + "step": 2844 + }, + { + "epoch": 0.5739376574088959, + "grad_norm": 0.4734240174293518, + "learning_rate": 4.050000599649905e-06, + "loss": 0.7335, + "step": 2845 + }, + { + "epoch": 0.5741393929651029, + "grad_norm": 0.8877872228622437, + "learning_rate": 4.046792600390948e-06, + "loss": 0.7402, + "step": 2846 + }, + { + "epoch": 0.5743411285213099, + "grad_norm": 0.407859742641449, + "learning_rate": 4.043585008267418e-06, + "loss": 0.856, + "step": 2847 + }, + { + "epoch": 0.5745428640775169, + "grad_norm": 0.36701953411102295, + "learning_rate": 4.040377824649341e-06, + "loss": 0.6531, + "step": 2848 + }, + { + "epoch": 0.5747445996337239, + "grad_norm": 0.32425543665885925, + "learning_rate": 4.0371710509065775e-06, + "loss": 0.7612, + "step": 2849 + }, + { + "epoch": 0.5749463351899309, + "grad_norm": 0.5345488786697388, + "learning_rate": 4.033964688408808e-06, + "loss": 0.67, + "step": 2850 + }, + { + "epoch": 0.5751480707461378, + "grad_norm": 0.37572526931762695, + "learning_rate": 4.0307587385255395e-06, + "loss": 0.6508, + "step": 2851 + }, + { + "epoch": 0.5753498063023449, + "grad_norm": 0.5175938010215759, + "learning_rate": 4.027553202626105e-06, + "loss": 0.7204, + "step": 2852 + }, + { + "epoch": 0.5755515418585518, + "grad_norm": 0.38628992438316345, + "learning_rate": 4.0243480820796544e-06, + "loss": 0.6626, + "step": 2853 + }, + { + "epoch": 0.5757532774147589, + "grad_norm": 0.39906197786331177, + "learning_rate": 4.021143378255164e-06, + "loss": 0.8014, + "step": 2854 + }, + { + "epoch": 0.5759550129709659, + "grad_norm": 0.5469081997871399, + "learning_rate": 4.017939092521434e-06, + "loss": 0.646, + "step": 2855 + }, + { + "epoch": 0.5761567485271728, + "grad_norm": 0.4166348874568939, + "learning_rate": 4.014735226247082e-06, + "loss": 0.7024, + "step": 2856 + }, + { + "epoch": 0.5763584840833799, + "grad_norm": 0.42324933409690857, + "learning_rate": 4.011531780800549e-06, + "loss": 0.6633, + "step": 2857 + }, + { + "epoch": 0.5765602196395868, + "grad_norm": 0.3424137532711029, + "learning_rate": 4.0083287575500965e-06, + "loss": 0.636, + "step": 2858 + }, + { + "epoch": 0.5767619551957938, + "grad_norm": 0.6285327076911926, + "learning_rate": 4.005126157863803e-06, + "loss": 0.8236, + "step": 2859 + }, + { + "epoch": 0.5769636907520008, + "grad_norm": 0.5726880431175232, + "learning_rate": 4.001923983109569e-06, + "loss": 0.6326, + "step": 2860 + }, + { + "epoch": 0.5771654263082078, + "grad_norm": 0.7701642513275146, + "learning_rate": 3.998722234655113e-06, + "loss": 0.6819, + "step": 2861 + }, + { + "epoch": 0.5773671618644148, + "grad_norm": 1.3106602430343628, + "learning_rate": 3.995520913867968e-06, + "loss": 0.7423, + "step": 2862 + }, + { + "epoch": 0.5775688974206218, + "grad_norm": 0.4269159138202667, + "learning_rate": 3.992320022115492e-06, + "loss": 0.646, + "step": 2863 + }, + { + "epoch": 0.5777706329768287, + "grad_norm": 0.75742107629776, + "learning_rate": 3.989119560764849e-06, + "loss": 0.7595, + "step": 2864 + }, + { + "epoch": 0.5779723685330358, + "grad_norm": 0.8314652442932129, + "learning_rate": 3.985919531183029e-06, + "loss": 0.6935, + "step": 2865 + }, + { + "epoch": 0.5781741040892427, + "grad_norm": 0.7508164048194885, + "learning_rate": 3.982719934736832e-06, + "loss": 0.6716, + "step": 2866 + }, + { + "epoch": 0.5783758396454498, + "grad_norm": 1.0965166091918945, + "learning_rate": 3.979520772792875e-06, + "loss": 0.676, + "step": 2867 + }, + { + "epoch": 0.5785775752016568, + "grad_norm": 1.353140115737915, + "learning_rate": 3.976322046717589e-06, + "loss": 0.7143, + "step": 2868 + }, + { + "epoch": 0.5787793107578637, + "grad_norm": 0.7627206444740295, + "learning_rate": 3.973123757877219e-06, + "loss": 0.8499, + "step": 2869 + }, + { + "epoch": 0.5789810463140708, + "grad_norm": 0.46488526463508606, + "learning_rate": 3.969925907637823e-06, + "loss": 0.9204, + "step": 2870 + }, + { + "epoch": 0.5791827818702777, + "grad_norm": 0.43276649713516235, + "learning_rate": 3.966728497365272e-06, + "loss": 0.7419, + "step": 2871 + }, + { + "epoch": 0.5793845174264848, + "grad_norm": 0.3913387656211853, + "learning_rate": 3.96353152842525e-06, + "loss": 0.7284, + "step": 2872 + }, + { + "epoch": 0.5795862529826917, + "grad_norm": 0.3979531526565552, + "learning_rate": 3.9603350021832485e-06, + "loss": 0.6834, + "step": 2873 + }, + { + "epoch": 0.5797879885388987, + "grad_norm": 0.4591613709926605, + "learning_rate": 3.9571389200045735e-06, + "loss": 0.6881, + "step": 2874 + }, + { + "epoch": 0.5799897240951057, + "grad_norm": 0.685820996761322, + "learning_rate": 3.953943283254342e-06, + "loss": 0.7704, + "step": 2875 + }, + { + "epoch": 0.5801914596513127, + "grad_norm": 1.0198726654052734, + "learning_rate": 3.950748093297479e-06, + "loss": 0.7098, + "step": 2876 + }, + { + "epoch": 0.5803931952075196, + "grad_norm": 0.4752916693687439, + "learning_rate": 3.947553351498719e-06, + "loss": 0.6746, + "step": 2877 + }, + { + "epoch": 0.5805949307637267, + "grad_norm": 0.8071935176849365, + "learning_rate": 3.9443590592226025e-06, + "loss": 0.6935, + "step": 2878 + }, + { + "epoch": 0.5807966663199337, + "grad_norm": 0.6545006036758423, + "learning_rate": 3.941165217833484e-06, + "loss": 0.65, + "step": 2879 + }, + { + "epoch": 0.5809984018761407, + "grad_norm": 0.5821269750595093, + "learning_rate": 3.937971828695522e-06, + "loss": 0.8889, + "step": 2880 + }, + { + "epoch": 0.5812001374323477, + "grad_norm": 0.6591540575027466, + "learning_rate": 3.934778893172679e-06, + "loss": 0.6388, + "step": 2881 + }, + { + "epoch": 0.5814018729885546, + "grad_norm": 0.9008996486663818, + "learning_rate": 3.931586412628727e-06, + "loss": 0.6927, + "step": 2882 + }, + { + "epoch": 0.5816036085447617, + "grad_norm": 0.7184357643127441, + "learning_rate": 3.928394388427247e-06, + "loss": 0.681, + "step": 2883 + }, + { + "epoch": 0.5818053441009686, + "grad_norm": 0.41039785742759705, + "learning_rate": 3.925202821931618e-06, + "loss": 0.885, + "step": 2884 + }, + { + "epoch": 0.5820070796571757, + "grad_norm": 0.9089972972869873, + "learning_rate": 3.9220117145050254e-06, + "loss": 0.685, + "step": 2885 + }, + { + "epoch": 0.5822088152133826, + "grad_norm": 0.5806027054786682, + "learning_rate": 3.918821067510464e-06, + "loss": 0.6892, + "step": 2886 + }, + { + "epoch": 0.5824105507695896, + "grad_norm": 1.8731859922409058, + "learning_rate": 3.915630882310726e-06, + "loss": 0.8295, + "step": 2887 + }, + { + "epoch": 0.5826122863257966, + "grad_norm": 0.8789365887641907, + "learning_rate": 3.912441160268407e-06, + "loss": 0.6816, + "step": 2888 + }, + { + "epoch": 0.5828140218820036, + "grad_norm": 0.4402933716773987, + "learning_rate": 3.909251902745909e-06, + "loss": 0.8881, + "step": 2889 + }, + { + "epoch": 0.5830157574382107, + "grad_norm": 0.4205271005630493, + "learning_rate": 3.90606311110543e-06, + "loss": 0.6722, + "step": 2890 + }, + { + "epoch": 0.5832174929944176, + "grad_norm": 0.39811578392982483, + "learning_rate": 3.9028747867089735e-06, + "loss": 0.7116, + "step": 2891 + }, + { + "epoch": 0.5834192285506246, + "grad_norm": 0.36638522148132324, + "learning_rate": 3.899686930918339e-06, + "loss": 0.6378, + "step": 2892 + }, + { + "epoch": 0.5836209641068316, + "grad_norm": 0.5602337718009949, + "learning_rate": 3.89649954509513e-06, + "loss": 0.6696, + "step": 2893 + }, + { + "epoch": 0.5838226996630386, + "grad_norm": 0.37899842858314514, + "learning_rate": 3.893312630600749e-06, + "loss": 0.7303, + "step": 2894 + }, + { + "epoch": 0.5840244352192455, + "grad_norm": 0.8907220363616943, + "learning_rate": 3.890126188796393e-06, + "loss": 0.6649, + "step": 2895 + }, + { + "epoch": 0.5842261707754526, + "grad_norm": 1.020248532295227, + "learning_rate": 3.8869402210430616e-06, + "loss": 0.679, + "step": 2896 + }, + { + "epoch": 0.5844279063316595, + "grad_norm": 0.6201417446136475, + "learning_rate": 3.883754728701552e-06, + "loss": 0.656, + "step": 2897 + }, + { + "epoch": 0.5846296418878666, + "grad_norm": 0.3928530514240265, + "learning_rate": 3.8805697131324525e-06, + "loss": 0.9927, + "step": 2898 + }, + { + "epoch": 0.5848313774440735, + "grad_norm": 0.3769809603691101, + "learning_rate": 3.877385175696156e-06, + "loss": 0.7605, + "step": 2899 + }, + { + "epoch": 0.5850331130002805, + "grad_norm": 0.6660411953926086, + "learning_rate": 3.874201117752846e-06, + "loss": 0.6641, + "step": 2900 + }, + { + "epoch": 0.5852348485564876, + "grad_norm": 0.41550612449645996, + "learning_rate": 3.8710175406625e-06, + "loss": 0.7132, + "step": 2901 + }, + { + "epoch": 0.5854365841126945, + "grad_norm": 0.7428571581840515, + "learning_rate": 3.867834445784893e-06, + "loss": 0.6619, + "step": 2902 + }, + { + "epoch": 0.5856383196689016, + "grad_norm": 0.9024825096130371, + "learning_rate": 3.864651834479596e-06, + "loss": 0.6359, + "step": 2903 + }, + { + "epoch": 0.5858400552251085, + "grad_norm": 0.6011614799499512, + "learning_rate": 3.861469708105969e-06, + "loss": 0.6514, + "step": 2904 + }, + { + "epoch": 0.5860417907813155, + "grad_norm": 0.4725446403026581, + "learning_rate": 3.8582880680231675e-06, + "loss": 0.7775, + "step": 2905 + }, + { + "epoch": 0.5862435263375225, + "grad_norm": 0.5164721012115479, + "learning_rate": 3.855106915590137e-06, + "loss": 0.6267, + "step": 2906 + }, + { + "epoch": 0.5864452618937295, + "grad_norm": 0.3842906951904297, + "learning_rate": 3.851926252165616e-06, + "loss": 0.6418, + "step": 2907 + }, + { + "epoch": 0.5866469974499365, + "grad_norm": 0.5270214676856995, + "learning_rate": 3.848746079108139e-06, + "loss": 0.7971, + "step": 2908 + }, + { + "epoch": 0.5868487330061435, + "grad_norm": 0.32814523577690125, + "learning_rate": 3.845566397776022e-06, + "loss": 0.7647, + "step": 2909 + }, + { + "epoch": 0.5870504685623504, + "grad_norm": 1.0125082731246948, + "learning_rate": 3.842387209527374e-06, + "loss": 0.7725, + "step": 2910 + }, + { + "epoch": 0.5872522041185575, + "grad_norm": 0.4370710253715515, + "learning_rate": 3.839208515720102e-06, + "loss": 0.6698, + "step": 2911 + }, + { + "epoch": 0.5874539396747644, + "grad_norm": 0.3766273558139801, + "learning_rate": 3.836030317711886e-06, + "loss": 0.628, + "step": 2912 + }, + { + "epoch": 0.5876556752309714, + "grad_norm": 0.5694656372070312, + "learning_rate": 3.832852616860208e-06, + "loss": 0.6578, + "step": 2913 + }, + { + "epoch": 0.5878574107871785, + "grad_norm": 0.5162098407745361, + "learning_rate": 3.829675414522332e-06, + "loss": 0.691, + "step": 2914 + }, + { + "epoch": 0.5880591463433854, + "grad_norm": 1.029689073562622, + "learning_rate": 3.82649871205531e-06, + "loss": 0.7757, + "step": 2915 + }, + { + "epoch": 0.5882608818995925, + "grad_norm": 1.1007672548294067, + "learning_rate": 3.8233225108159765e-06, + "loss": 0.7941, + "step": 2916 + }, + { + "epoch": 0.5884626174557994, + "grad_norm": 0.3063526451587677, + "learning_rate": 3.82014681216096e-06, + "loss": 0.632, + "step": 2917 + }, + { + "epoch": 0.5886643530120064, + "grad_norm": 0.2978290319442749, + "learning_rate": 3.8169716174466675e-06, + "loss": 0.6563, + "step": 2918 + }, + { + "epoch": 0.5888660885682134, + "grad_norm": 0.4835878312587738, + "learning_rate": 3.813796928029295e-06, + "loss": 0.6915, + "step": 2919 + }, + { + "epoch": 0.5890678241244204, + "grad_norm": 0.3862401247024536, + "learning_rate": 3.8106227452648175e-06, + "loss": 0.6784, + "step": 2920 + }, + { + "epoch": 0.5892695596806274, + "grad_norm": 0.44164806604385376, + "learning_rate": 3.8074490705089983e-06, + "loss": 0.6522, + "step": 2921 + }, + { + "epoch": 0.5894712952368344, + "grad_norm": 0.5748169422149658, + "learning_rate": 3.8042759051173843e-06, + "loss": 0.6835, + "step": 2922 + }, + { + "epoch": 0.5896730307930413, + "grad_norm": 0.5566649436950684, + "learning_rate": 3.8011032504453e-06, + "loss": 0.666, + "step": 2923 + }, + { + "epoch": 0.5898747663492484, + "grad_norm": 0.5561574697494507, + "learning_rate": 3.7979311078478554e-06, + "loss": 0.7103, + "step": 2924 + }, + { + "epoch": 0.5900765019054554, + "grad_norm": 0.4142785966396332, + "learning_rate": 3.7947594786799424e-06, + "loss": 0.8713, + "step": 2925 + }, + { + "epoch": 0.5902782374616624, + "grad_norm": 1.048613429069519, + "learning_rate": 3.7915883642962303e-06, + "loss": 0.6412, + "step": 2926 + }, + { + "epoch": 0.5904799730178694, + "grad_norm": 0.685620129108429, + "learning_rate": 3.7884177660511713e-06, + "loss": 0.6561, + "step": 2927 + }, + { + "epoch": 0.5906817085740763, + "grad_norm": 0.43489912152290344, + "learning_rate": 3.785247685298998e-06, + "loss": 0.6759, + "step": 2928 + }, + { + "epoch": 0.5908834441302834, + "grad_norm": 0.32432207465171814, + "learning_rate": 3.7820781233937163e-06, + "loss": 0.6417, + "step": 2929 + }, + { + "epoch": 0.5910851796864903, + "grad_norm": 0.9676985144615173, + "learning_rate": 3.7789090816891157e-06, + "loss": 0.6837, + "step": 2930 + }, + { + "epoch": 0.5912869152426973, + "grad_norm": 0.40368932485580444, + "learning_rate": 3.7757405615387657e-06, + "loss": 0.6609, + "step": 2931 + }, + { + "epoch": 0.5914886507989043, + "grad_norm": 0.6651648879051208, + "learning_rate": 3.7725725642960047e-06, + "loss": 0.7401, + "step": 2932 + }, + { + "epoch": 0.5916903863551113, + "grad_norm": 0.35699695348739624, + "learning_rate": 3.7694050913139555e-06, + "loss": 0.6083, + "step": 2933 + }, + { + "epoch": 0.5918921219113183, + "grad_norm": 0.5533597469329834, + "learning_rate": 3.7662381439455133e-06, + "loss": 0.6542, + "step": 2934 + }, + { + "epoch": 0.5920938574675253, + "grad_norm": 0.7516672611236572, + "learning_rate": 3.763071723543349e-06, + "loss": 0.8206, + "step": 2935 + }, + { + "epoch": 0.5922955930237322, + "grad_norm": 0.5364474654197693, + "learning_rate": 3.7599058314599112e-06, + "loss": 0.6639, + "step": 2936 + }, + { + "epoch": 0.5924973285799393, + "grad_norm": 0.753676176071167, + "learning_rate": 3.756740469047416e-06, + "loss": 0.6909, + "step": 2937 + }, + { + "epoch": 0.5926990641361463, + "grad_norm": 0.8867906332015991, + "learning_rate": 3.7535756376578625e-06, + "loss": 0.7263, + "step": 2938 + }, + { + "epoch": 0.5929007996923533, + "grad_norm": 0.46517348289489746, + "learning_rate": 3.7504113386430187e-06, + "loss": 0.7492, + "step": 2939 + }, + { + "epoch": 0.5931025352485603, + "grad_norm": 3.5035691261291504, + "learning_rate": 3.747247573354421e-06, + "loss": 0.6719, + "step": 2940 + }, + { + "epoch": 0.5933042708047672, + "grad_norm": 0.43149906396865845, + "learning_rate": 3.744084343143383e-06, + "loss": 0.6979, + "step": 2941 + }, + { + "epoch": 0.5935060063609743, + "grad_norm": 0.405324250459671, + "learning_rate": 3.740921649360991e-06, + "loss": 0.6662, + "step": 2942 + }, + { + "epoch": 0.5937077419171812, + "grad_norm": 0.40969419479370117, + "learning_rate": 3.7377594933580967e-06, + "loss": 0.6335, + "step": 2943 + }, + { + "epoch": 0.5939094774733883, + "grad_norm": 0.3887074291706085, + "learning_rate": 3.7345978764853276e-06, + "loss": 0.6206, + "step": 2944 + }, + { + "epoch": 0.5941112130295952, + "grad_norm": 0.33942726254463196, + "learning_rate": 3.7314368000930754e-06, + "loss": 0.6788, + "step": 2945 + }, + { + "epoch": 0.5943129485858022, + "grad_norm": 0.8974020481109619, + "learning_rate": 3.7282762655315065e-06, + "loss": 0.6798, + "step": 2946 + }, + { + "epoch": 0.5945146841420093, + "grad_norm": 0.4469338655471802, + "learning_rate": 3.7251162741505543e-06, + "loss": 0.6575, + "step": 2947 + }, + { + "epoch": 0.5947164196982162, + "grad_norm": 0.5734437704086304, + "learning_rate": 3.7219568272999148e-06, + "loss": 0.7013, + "step": 2948 + }, + { + "epoch": 0.5949181552544233, + "grad_norm": 0.38166043162345886, + "learning_rate": 3.7187979263290585e-06, + "loss": 0.6895, + "step": 2949 + }, + { + "epoch": 0.5951198908106302, + "grad_norm": 0.32561951875686646, + "learning_rate": 3.7156395725872213e-06, + "loss": 0.7999, + "step": 2950 + }, + { + "epoch": 0.5953216263668372, + "grad_norm": 0.3810766637325287, + "learning_rate": 3.712481767423402e-06, + "loss": 0.7402, + "step": 2951 + }, + { + "epoch": 0.5955233619230442, + "grad_norm": 0.3509414494037628, + "learning_rate": 3.7093245121863673e-06, + "loss": 0.6636, + "step": 2952 + }, + { + "epoch": 0.5957250974792512, + "grad_norm": 0.5389799475669861, + "learning_rate": 3.706167808224652e-06, + "loss": 0.7185, + "step": 2953 + }, + { + "epoch": 0.5959268330354581, + "grad_norm": 1.3201884031295776, + "learning_rate": 3.7030116568865486e-06, + "loss": 0.7929, + "step": 2954 + }, + { + "epoch": 0.5961285685916652, + "grad_norm": 0.3962297737598419, + "learning_rate": 3.6998560595201188e-06, + "loss": 0.6616, + "step": 2955 + }, + { + "epoch": 0.5963303041478721, + "grad_norm": 0.8047513961791992, + "learning_rate": 3.696701017473189e-06, + "loss": 0.6579, + "step": 2956 + }, + { + "epoch": 0.5965320397040792, + "grad_norm": 0.3982381522655487, + "learning_rate": 3.6935465320933393e-06, + "loss": 0.6751, + "step": 2957 + }, + { + "epoch": 0.5967337752602861, + "grad_norm": 0.3599437475204468, + "learning_rate": 3.6903926047279254e-06, + "loss": 0.6433, + "step": 2958 + }, + { + "epoch": 0.5969355108164931, + "grad_norm": 0.8588778972625732, + "learning_rate": 3.6872392367240523e-06, + "loss": 0.8927, + "step": 2959 + }, + { + "epoch": 0.5971372463727002, + "grad_norm": 0.38870593905448914, + "learning_rate": 3.684086429428594e-06, + "loss": 0.6456, + "step": 2960 + }, + { + "epoch": 0.5973389819289071, + "grad_norm": 1.5577501058578491, + "learning_rate": 3.680934184188182e-06, + "loss": 0.7435, + "step": 2961 + }, + { + "epoch": 0.5975407174851142, + "grad_norm": 1.114391803741455, + "learning_rate": 3.6777825023492076e-06, + "loss": 0.8232, + "step": 2962 + }, + { + "epoch": 0.5977424530413211, + "grad_norm": 0.6805806756019592, + "learning_rate": 3.6746313852578226e-06, + "loss": 0.6773, + "step": 2963 + }, + { + "epoch": 0.5979441885975281, + "grad_norm": 0.6296008229255676, + "learning_rate": 3.671480834259939e-06, + "loss": 0.6876, + "step": 2964 + }, + { + "epoch": 0.5981459241537351, + "grad_norm": 1.0204540491104126, + "learning_rate": 3.6683308507012196e-06, + "loss": 0.6512, + "step": 2965 + }, + { + "epoch": 0.5983476597099421, + "grad_norm": 0.6919869184494019, + "learning_rate": 3.6651814359270955e-06, + "loss": 0.6719, + "step": 2966 + }, + { + "epoch": 0.5985493952661491, + "grad_norm": 1.0824942588806152, + "learning_rate": 3.6620325912827493e-06, + "loss": 0.848, + "step": 2967 + }, + { + "epoch": 0.5987511308223561, + "grad_norm": 0.3988608419895172, + "learning_rate": 3.658884318113117e-06, + "loss": 0.6716, + "step": 2968 + }, + { + "epoch": 0.598952866378563, + "grad_norm": 1.047755479812622, + "learning_rate": 3.6557366177628956e-06, + "loss": 0.6931, + "step": 2969 + }, + { + "epoch": 0.5991546019347701, + "grad_norm": 0.7173445224761963, + "learning_rate": 3.652589491576537e-06, + "loss": 0.6335, + "step": 2970 + }, + { + "epoch": 0.599356337490977, + "grad_norm": 1.1091235876083374, + "learning_rate": 3.6494429408982446e-06, + "loss": 0.622, + "step": 2971 + }, + { + "epoch": 0.599558073047184, + "grad_norm": 0.8816885352134705, + "learning_rate": 3.6462969670719807e-06, + "loss": 0.6661, + "step": 2972 + }, + { + "epoch": 0.5997598086033911, + "grad_norm": 1.5705469846725464, + "learning_rate": 3.6431515714414552e-06, + "loss": 0.6991, + "step": 2973 + }, + { + "epoch": 0.599961544159598, + "grad_norm": 0.774057924747467, + "learning_rate": 3.6400067553501362e-06, + "loss": 0.6694, + "step": 2974 + }, + { + "epoch": 0.6001632797158051, + "grad_norm": 0.43345385789871216, + "learning_rate": 3.6368625201412443e-06, + "loss": 0.8072, + "step": 2975 + }, + { + "epoch": 0.600365015272012, + "grad_norm": 0.3652766942977905, + "learning_rate": 3.6337188671577463e-06, + "loss": 0.6596, + "step": 2976 + }, + { + "epoch": 0.600566750828219, + "grad_norm": 0.3644733428955078, + "learning_rate": 3.630575797742365e-06, + "loss": 0.6502, + "step": 2977 + }, + { + "epoch": 0.600768486384426, + "grad_norm": 0.8941843509674072, + "learning_rate": 3.627433313237576e-06, + "loss": 0.6439, + "step": 2978 + }, + { + "epoch": 0.600970221940633, + "grad_norm": 0.9603281021118164, + "learning_rate": 3.6242914149855984e-06, + "loss": 0.6655, + "step": 2979 + }, + { + "epoch": 0.60117195749684, + "grad_norm": 0.37791603803634644, + "learning_rate": 3.621150104328407e-06, + "loss": 0.7303, + "step": 2980 + }, + { + "epoch": 0.601373693053047, + "grad_norm": 0.862527072429657, + "learning_rate": 3.6180093826077236e-06, + "loss": 0.6355, + "step": 2981 + }, + { + "epoch": 0.601575428609254, + "grad_norm": 0.4360196888446808, + "learning_rate": 3.614869251165015e-06, + "loss": 0.6691, + "step": 2982 + }, + { + "epoch": 0.601777164165461, + "grad_norm": 0.38961103558540344, + "learning_rate": 3.611729711341503e-06, + "loss": 0.6645, + "step": 2983 + }, + { + "epoch": 0.601978899721668, + "grad_norm": 0.6375663876533508, + "learning_rate": 3.6085907644781522e-06, + "loss": 0.6759, + "step": 2984 + }, + { + "epoch": 0.602180635277875, + "grad_norm": 0.6156184673309326, + "learning_rate": 3.6054524119156696e-06, + "loss": 0.6485, + "step": 2985 + }, + { + "epoch": 0.602382370834082, + "grad_norm": 0.4622036814689636, + "learning_rate": 3.602314654994521e-06, + "loss": 0.6766, + "step": 2986 + }, + { + "epoch": 0.6025841063902889, + "grad_norm": 0.43395689129829407, + "learning_rate": 3.599177495054903e-06, + "loss": 0.7704, + "step": 2987 + }, + { + "epoch": 0.602785841946496, + "grad_norm": 0.3791487216949463, + "learning_rate": 3.5960409334367676e-06, + "loss": 0.6537, + "step": 2988 + }, + { + "epoch": 0.6029875775027029, + "grad_norm": 0.4992687702178955, + "learning_rate": 3.592904971479808e-06, + "loss": 0.6854, + "step": 2989 + }, + { + "epoch": 0.6031893130589099, + "grad_norm": 0.3665289878845215, + "learning_rate": 3.589769610523459e-06, + "loss": 0.6498, + "step": 2990 + }, + { + "epoch": 0.6033910486151169, + "grad_norm": 0.6164460778236389, + "learning_rate": 3.5866348519069034e-06, + "loss": 0.6605, + "step": 2991 + }, + { + "epoch": 0.6035927841713239, + "grad_norm": 0.39422157406806946, + "learning_rate": 3.5835006969690634e-06, + "loss": 0.6493, + "step": 2992 + }, + { + "epoch": 0.603794519727531, + "grad_norm": 0.6252549886703491, + "learning_rate": 3.5803671470486023e-06, + "loss": 0.6322, + "step": 2993 + }, + { + "epoch": 0.6039962552837379, + "grad_norm": 1.0483990907669067, + "learning_rate": 3.5772342034839293e-06, + "loss": 0.6724, + "step": 2994 + }, + { + "epoch": 0.6041979908399449, + "grad_norm": 1.0793009996414185, + "learning_rate": 3.574101867613192e-06, + "loss": 0.6607, + "step": 2995 + }, + { + "epoch": 0.6043997263961519, + "grad_norm": 0.4432700276374817, + "learning_rate": 3.570970140774277e-06, + "loss": 0.6635, + "step": 2996 + }, + { + "epoch": 0.6046014619523589, + "grad_norm": 0.41216427087783813, + "learning_rate": 3.567839024304812e-06, + "loss": 0.616, + "step": 2997 + }, + { + "epoch": 0.6048031975085659, + "grad_norm": 0.9142566323280334, + "learning_rate": 3.5647085195421668e-06, + "loss": 0.6757, + "step": 2998 + }, + { + "epoch": 0.6050049330647729, + "grad_norm": 0.3180692791938782, + "learning_rate": 3.5615786278234443e-06, + "loss": 0.785, + "step": 2999 + }, + { + "epoch": 0.6052066686209798, + "grad_norm": 2.0407209396362305, + "learning_rate": 3.5584493504854924e-06, + "loss": 0.6865, + "step": 3000 + }, + { + "epoch": 0.6054084041771869, + "grad_norm": 0.601322591304779, + "learning_rate": 3.555320688864889e-06, + "loss": 0.7127, + "step": 3001 + }, + { + "epoch": 0.6056101397333938, + "grad_norm": 0.4150068163871765, + "learning_rate": 3.552192644297955e-06, + "loss": 0.7162, + "step": 3002 + }, + { + "epoch": 0.6058118752896009, + "grad_norm": 1.607946515083313, + "learning_rate": 3.5490652181207474e-06, + "loss": 0.6517, + "step": 3003 + }, + { + "epoch": 0.6060136108458078, + "grad_norm": 3.0131125450134277, + "learning_rate": 3.545938411669053e-06, + "loss": 0.8067, + "step": 3004 + }, + { + "epoch": 0.6062153464020148, + "grad_norm": 0.8337088227272034, + "learning_rate": 3.5428122262784005e-06, + "loss": 0.7968, + "step": 3005 + }, + { + "epoch": 0.6064170819582219, + "grad_norm": 1.514702320098877, + "learning_rate": 3.539686663284053e-06, + "loss": 0.6373, + "step": 3006 + }, + { + "epoch": 0.6066188175144288, + "grad_norm": 1.8276013135910034, + "learning_rate": 3.536561724021003e-06, + "loss": 0.7206, + "step": 3007 + }, + { + "epoch": 0.6068205530706358, + "grad_norm": 0.4989999532699585, + "learning_rate": 3.5334374098239797e-06, + "loss": 0.6142, + "step": 3008 + }, + { + "epoch": 0.6070222886268428, + "grad_norm": 0.3233503997325897, + "learning_rate": 3.5303137220274467e-06, + "loss": 0.6769, + "step": 3009 + }, + { + "epoch": 0.6072240241830498, + "grad_norm": 0.4093765616416931, + "learning_rate": 3.5271906619655966e-06, + "loss": 0.6632, + "step": 3010 + }, + { + "epoch": 0.6074257597392568, + "grad_norm": 0.8114385604858398, + "learning_rate": 3.524068230972356e-06, + "loss": 0.6268, + "step": 3011 + }, + { + "epoch": 0.6076274952954638, + "grad_norm": 0.4605998396873474, + "learning_rate": 3.5209464303813843e-06, + "loss": 0.684, + "step": 3012 + }, + { + "epoch": 0.6078292308516707, + "grad_norm": 0.4391426742076874, + "learning_rate": 3.5178252615260677e-06, + "loss": 0.681, + "step": 3013 + }, + { + "epoch": 0.6080309664078778, + "grad_norm": 0.3784283697605133, + "learning_rate": 3.5147047257395268e-06, + "loss": 0.8089, + "step": 3014 + }, + { + "epoch": 0.6082327019640847, + "grad_norm": 0.5586382746696472, + "learning_rate": 3.5115848243546065e-06, + "loss": 0.6608, + "step": 3015 + }, + { + "epoch": 0.6084344375202918, + "grad_norm": 0.5277711749076843, + "learning_rate": 3.508465558703885e-06, + "loss": 0.7175, + "step": 3016 + }, + { + "epoch": 0.6086361730764988, + "grad_norm": 0.420762836933136, + "learning_rate": 3.505346930119671e-06, + "loss": 0.6385, + "step": 3017 + }, + { + "epoch": 0.6088379086327057, + "grad_norm": 0.3296513557434082, + "learning_rate": 3.5022289399339933e-06, + "loss": 0.7085, + "step": 3018 + }, + { + "epoch": 0.6090396441889128, + "grad_norm": 0.41073331236839294, + "learning_rate": 3.4991115894786152e-06, + "loss": 0.737, + "step": 3019 + }, + { + "epoch": 0.6092413797451197, + "grad_norm": 0.29459112882614136, + "learning_rate": 3.4959948800850253e-06, + "loss": 0.661, + "step": 3020 + }, + { + "epoch": 0.6094431153013268, + "grad_norm": 0.49085837602615356, + "learning_rate": 3.492878813084435e-06, + "loss": 0.674, + "step": 3021 + }, + { + "epoch": 0.6096448508575337, + "grad_norm": 0.427566796541214, + "learning_rate": 3.489763389807784e-06, + "loss": 0.6497, + "step": 3022 + }, + { + "epoch": 0.6098465864137407, + "grad_norm": 0.8970413208007812, + "learning_rate": 3.4866486115857407e-06, + "loss": 0.707, + "step": 3023 + }, + { + "epoch": 0.6100483219699477, + "grad_norm": 0.3847060799598694, + "learning_rate": 3.483534479748688e-06, + "loss": 0.6836, + "step": 3024 + }, + { + "epoch": 0.6102500575261547, + "grad_norm": 1.063359022140503, + "learning_rate": 3.480420995626741e-06, + "loss": 0.6425, + "step": 3025 + }, + { + "epoch": 0.6104517930823616, + "grad_norm": 0.6723124384880066, + "learning_rate": 3.4773081605497393e-06, + "loss": 0.7339, + "step": 3026 + }, + { + "epoch": 0.6106535286385687, + "grad_norm": 0.41069871187210083, + "learning_rate": 3.4741959758472367e-06, + "loss": 0.6732, + "step": 3027 + }, + { + "epoch": 0.6108552641947756, + "grad_norm": 0.608799934387207, + "learning_rate": 3.4710844428485176e-06, + "loss": 0.6761, + "step": 3028 + }, + { + "epoch": 0.6110569997509827, + "grad_norm": 0.4474925398826599, + "learning_rate": 3.4679735628825826e-06, + "loss": 0.6193, + "step": 3029 + }, + { + "epoch": 0.6112587353071897, + "grad_norm": 0.669126570224762, + "learning_rate": 3.464863337278157e-06, + "loss": 0.6864, + "step": 3030 + }, + { + "epoch": 0.6114604708633966, + "grad_norm": 0.3941359519958496, + "learning_rate": 3.461753767363687e-06, + "loss": 0.6738, + "step": 3031 + }, + { + "epoch": 0.6116622064196037, + "grad_norm": 0.45722004771232605, + "learning_rate": 3.458644854467331e-06, + "loss": 0.6721, + "step": 3032 + }, + { + "epoch": 0.6118639419758106, + "grad_norm": 0.28656086325645447, + "learning_rate": 3.455536599916979e-06, + "loss": 0.6457, + "step": 3033 + }, + { + "epoch": 0.6120656775320177, + "grad_norm": 0.7220064401626587, + "learning_rate": 3.452429005040232e-06, + "loss": 0.6899, + "step": 3034 + }, + { + "epoch": 0.6122674130882246, + "grad_norm": 0.6459056735038757, + "learning_rate": 3.449322071164408e-06, + "loss": 0.664, + "step": 3035 + }, + { + "epoch": 0.6124691486444316, + "grad_norm": 0.4235333502292633, + "learning_rate": 3.446215799616548e-06, + "loss": 0.7266, + "step": 3036 + }, + { + "epoch": 0.6126708842006386, + "grad_norm": 0.3836830258369446, + "learning_rate": 3.443110191723407e-06, + "loss": 0.7827, + "step": 3037 + }, + { + "epoch": 0.6128726197568456, + "grad_norm": 0.5448535084724426, + "learning_rate": 3.440005248811457e-06, + "loss": 0.7843, + "step": 3038 + }, + { + "epoch": 0.6130743553130527, + "grad_norm": 0.44550132751464844, + "learning_rate": 3.4369009722068846e-06, + "loss": 0.6652, + "step": 3039 + }, + { + "epoch": 0.6132760908692596, + "grad_norm": 0.4314553439617157, + "learning_rate": 3.4337973632355958e-06, + "loss": 0.6526, + "step": 3040 + }, + { + "epoch": 0.6134778264254666, + "grad_norm": 0.5170155167579651, + "learning_rate": 3.4306944232232065e-06, + "loss": 0.6846, + "step": 3041 + }, + { + "epoch": 0.6136795619816736, + "grad_norm": 0.6112193465232849, + "learning_rate": 3.427592153495053e-06, + "loss": 0.6449, + "step": 3042 + }, + { + "epoch": 0.6138812975378806, + "grad_norm": 0.846665620803833, + "learning_rate": 3.424490555376176e-06, + "loss": 0.7245, + "step": 3043 + }, + { + "epoch": 0.6140830330940876, + "grad_norm": 0.38036710023880005, + "learning_rate": 3.421389630191338e-06, + "loss": 0.7464, + "step": 3044 + }, + { + "epoch": 0.6142847686502946, + "grad_norm": 0.4693972170352936, + "learning_rate": 3.4182893792650117e-06, + "loss": 0.7562, + "step": 3045 + }, + { + "epoch": 0.6144865042065015, + "grad_norm": 1.0809375047683716, + "learning_rate": 3.41518980392138e-06, + "loss": 0.6967, + "step": 3046 + }, + { + "epoch": 0.6146882397627086, + "grad_norm": 0.3993861973285675, + "learning_rate": 3.4120909054843375e-06, + "loss": 0.6257, + "step": 3047 + }, + { + "epoch": 0.6148899753189155, + "grad_norm": 1.0259984731674194, + "learning_rate": 3.4089926852774934e-06, + "loss": 0.6815, + "step": 3048 + }, + { + "epoch": 0.6150917108751225, + "grad_norm": 1.449548363685608, + "learning_rate": 3.4058951446241604e-06, + "loss": 0.6787, + "step": 3049 + }, + { + "epoch": 0.6152934464313295, + "grad_norm": 0.5463994741439819, + "learning_rate": 3.402798284847368e-06, + "loss": 0.6876, + "step": 3050 + }, + { + "epoch": 0.6154951819875365, + "grad_norm": 1.2420287132263184, + "learning_rate": 3.3997021072698524e-06, + "loss": 0.6332, + "step": 3051 + }, + { + "epoch": 0.6156969175437436, + "grad_norm": 0.3920036554336548, + "learning_rate": 3.396606613214053e-06, + "loss": 0.8062, + "step": 3052 + }, + { + "epoch": 0.6158986530999505, + "grad_norm": 0.4213649034500122, + "learning_rate": 3.3935118040021255e-06, + "loss": 0.6945, + "step": 3053 + }, + { + "epoch": 0.6161003886561575, + "grad_norm": 1.0378633737564087, + "learning_rate": 3.390417680955931e-06, + "loss": 0.855, + "step": 3054 + }, + { + "epoch": 0.6163021242123645, + "grad_norm": 0.718075692653656, + "learning_rate": 3.387324245397032e-06, + "loss": 0.8364, + "step": 3055 + }, + { + "epoch": 0.6165038597685715, + "grad_norm": 0.8293585777282715, + "learning_rate": 3.384231498646706e-06, + "loss": 0.6713, + "step": 3056 + }, + { + "epoch": 0.6167055953247785, + "grad_norm": 0.3795190155506134, + "learning_rate": 3.381139442025928e-06, + "loss": 0.8182, + "step": 3057 + }, + { + "epoch": 0.6169073308809855, + "grad_norm": 0.36966368556022644, + "learning_rate": 3.3780480768553834e-06, + "loss": 0.6691, + "step": 3058 + }, + { + "epoch": 0.6171090664371924, + "grad_norm": 0.6272523403167725, + "learning_rate": 3.374957404455464e-06, + "loss": 0.6929, + "step": 3059 + }, + { + "epoch": 0.6173108019933995, + "grad_norm": 0.32975462079048157, + "learning_rate": 3.371867426146256e-06, + "loss": 0.7372, + "step": 3060 + }, + { + "epoch": 0.6175125375496064, + "grad_norm": 0.42231565713882446, + "learning_rate": 3.368778143247561e-06, + "loss": 0.7876, + "step": 3061 + }, + { + "epoch": 0.6177142731058135, + "grad_norm": 0.37638795375823975, + "learning_rate": 3.3656895570788778e-06, + "loss": 0.691, + "step": 3062 + }, + { + "epoch": 0.6179160086620205, + "grad_norm": 0.4994615316390991, + "learning_rate": 3.3626016689594053e-06, + "loss": 0.7033, + "step": 3063 + }, + { + "epoch": 0.6181177442182274, + "grad_norm": 0.4467025697231293, + "learning_rate": 3.3595144802080493e-06, + "loss": 0.6819, + "step": 3064 + }, + { + "epoch": 0.6183194797744345, + "grad_norm": 0.3670813739299774, + "learning_rate": 3.356427992143415e-06, + "loss": 0.8562, + "step": 3065 + }, + { + "epoch": 0.6185212153306414, + "grad_norm": 0.5851613879203796, + "learning_rate": 3.3533422060838056e-06, + "loss": 0.8069, + "step": 3066 + }, + { + "epoch": 0.6187229508868484, + "grad_norm": 0.624320387840271, + "learning_rate": 3.350257123347229e-06, + "loss": 0.6733, + "step": 3067 + }, + { + "epoch": 0.6189246864430554, + "grad_norm": 0.5927186608314514, + "learning_rate": 3.34717274525139e-06, + "loss": 0.6508, + "step": 3068 + }, + { + "epoch": 0.6191264219992624, + "grad_norm": 1.0810381174087524, + "learning_rate": 3.3440890731136925e-06, + "loss": 0.7074, + "step": 3069 + }, + { + "epoch": 0.6193281575554694, + "grad_norm": 0.3897158205509186, + "learning_rate": 3.3410061082512422e-06, + "loss": 0.7887, + "step": 3070 + }, + { + "epoch": 0.6195298931116764, + "grad_norm": 0.9414277076721191, + "learning_rate": 3.337923851980834e-06, + "loss": 0.8225, + "step": 3071 + }, + { + "epoch": 0.6197316286678833, + "grad_norm": 0.43071287870407104, + "learning_rate": 3.3348423056189705e-06, + "loss": 0.6808, + "step": 3072 + }, + { + "epoch": 0.6199333642240904, + "grad_norm": 0.4197927713394165, + "learning_rate": 3.331761470481846e-06, + "loss": 0.9375, + "step": 3073 + }, + { + "epoch": 0.6201350997802973, + "grad_norm": 0.4770767390727997, + "learning_rate": 3.3286813478853495e-06, + "loss": 0.6511, + "step": 3074 + }, + { + "epoch": 0.6203368353365044, + "grad_norm": 0.47283339500427246, + "learning_rate": 3.3256019391450696e-06, + "loss": 0.6466, + "step": 3075 + }, + { + "epoch": 0.6205385708927114, + "grad_norm": 0.42882150411605835, + "learning_rate": 3.3225232455762885e-06, + "loss": 0.74, + "step": 3076 + }, + { + "epoch": 0.6207403064489183, + "grad_norm": 0.36606308817863464, + "learning_rate": 3.319445268493981e-06, + "loss": 0.6657, + "step": 3077 + }, + { + "epoch": 0.6209420420051254, + "grad_norm": 0.5328860878944397, + "learning_rate": 3.316368009212818e-06, + "loss": 0.7469, + "step": 3078 + }, + { + "epoch": 0.6211437775613323, + "grad_norm": 0.33745500445365906, + "learning_rate": 3.3132914690471657e-06, + "loss": 0.6671, + "step": 3079 + }, + { + "epoch": 0.6213455131175394, + "grad_norm": 0.3988508880138397, + "learning_rate": 3.310215649311075e-06, + "loss": 0.7865, + "step": 3080 + }, + { + "epoch": 0.6215472486737463, + "grad_norm": 0.601906955242157, + "learning_rate": 3.3071405513182996e-06, + "loss": 0.7031, + "step": 3081 + }, + { + "epoch": 0.6217489842299533, + "grad_norm": 0.8292602896690369, + "learning_rate": 3.304066176382281e-06, + "loss": 0.6719, + "step": 3082 + }, + { + "epoch": 0.6219507197861603, + "grad_norm": 0.38256916403770447, + "learning_rate": 3.300992525816147e-06, + "loss": 0.6376, + "step": 3083 + }, + { + "epoch": 0.6221524553423673, + "grad_norm": 0.6803867220878601, + "learning_rate": 3.297919600932723e-06, + "loss": 0.797, + "step": 3084 + }, + { + "epoch": 0.6223541908985742, + "grad_norm": 0.461038738489151, + "learning_rate": 3.2948474030445187e-06, + "loss": 0.6493, + "step": 3085 + }, + { + "epoch": 0.6225559264547813, + "grad_norm": 0.3889191150665283, + "learning_rate": 3.2917759334637376e-06, + "loss": 0.6486, + "step": 3086 + }, + { + "epoch": 0.6227576620109883, + "grad_norm": 0.3315489888191223, + "learning_rate": 3.288705193502272e-06, + "loss": 0.6665, + "step": 3087 + }, + { + "epoch": 0.6229593975671953, + "grad_norm": 0.7866511940956116, + "learning_rate": 3.2856351844716983e-06, + "loss": 1.0142, + "step": 3088 + }, + { + "epoch": 0.6231611331234023, + "grad_norm": 0.48360222578048706, + "learning_rate": 3.2825659076832848e-06, + "loss": 0.7912, + "step": 3089 + }, + { + "epoch": 0.6233628686796092, + "grad_norm": 0.4154527187347412, + "learning_rate": 3.2794973644479884e-06, + "loss": 0.6703, + "step": 3090 + }, + { + "epoch": 0.6235646042358163, + "grad_norm": 0.3557988703250885, + "learning_rate": 3.276429556076445e-06, + "loss": 0.7134, + "step": 3091 + }, + { + "epoch": 0.6237663397920232, + "grad_norm": 0.8320627212524414, + "learning_rate": 3.2733624838789846e-06, + "loss": 0.6844, + "step": 3092 + }, + { + "epoch": 0.6239680753482303, + "grad_norm": 0.9393786191940308, + "learning_rate": 3.2702961491656197e-06, + "loss": 0.6608, + "step": 3093 + }, + { + "epoch": 0.6241698109044372, + "grad_norm": 0.876509428024292, + "learning_rate": 3.267230553246047e-06, + "loss": 0.6976, + "step": 3094 + }, + { + "epoch": 0.6243715464606442, + "grad_norm": 0.36942392587661743, + "learning_rate": 3.26416569742965e-06, + "loss": 0.6621, + "step": 3095 + }, + { + "epoch": 0.6245732820168513, + "grad_norm": 0.5300217270851135, + "learning_rate": 3.261101583025494e-06, + "loss": 0.7066, + "step": 3096 + }, + { + "epoch": 0.6247750175730582, + "grad_norm": 1.0432391166687012, + "learning_rate": 3.258038211342327e-06, + "loss": 0.6486, + "step": 3097 + }, + { + "epoch": 0.6249767531292653, + "grad_norm": 0.483392596244812, + "learning_rate": 3.254975583688585e-06, + "loss": 0.6736, + "step": 3098 + }, + { + "epoch": 0.6251784886854722, + "grad_norm": 1.1535712480545044, + "learning_rate": 3.2519137013723775e-06, + "loss": 0.68, + "step": 3099 + }, + { + "epoch": 0.6253802242416792, + "grad_norm": 0.8637328743934631, + "learning_rate": 3.2488525657015014e-06, + "loss": 0.6433, + "step": 3100 + }, + { + "epoch": 0.6255819597978862, + "grad_norm": 2.4023852348327637, + "learning_rate": 3.2457921779834372e-06, + "loss": 0.6498, + "step": 3101 + }, + { + "epoch": 0.6257836953540932, + "grad_norm": 0.5843347907066345, + "learning_rate": 3.2427325395253386e-06, + "loss": 0.736, + "step": 3102 + }, + { + "epoch": 0.6259854309103001, + "grad_norm": 0.5199889540672302, + "learning_rate": 3.2396736516340443e-06, + "loss": 0.6761, + "step": 3103 + }, + { + "epoch": 0.6261871664665072, + "grad_norm": 0.45910605788230896, + "learning_rate": 3.2366155156160726e-06, + "loss": 0.8176, + "step": 3104 + }, + { + "epoch": 0.6263889020227141, + "grad_norm": 0.9626832604408264, + "learning_rate": 3.2335581327776178e-06, + "loss": 0.6933, + "step": 3105 + }, + { + "epoch": 0.6265906375789212, + "grad_norm": 0.3536245822906494, + "learning_rate": 3.2305015044245534e-06, + "loss": 0.7737, + "step": 3106 + }, + { + "epoch": 0.6267923731351281, + "grad_norm": 0.5715398788452148, + "learning_rate": 3.2274456318624344e-06, + "loss": 0.7681, + "step": 3107 + }, + { + "epoch": 0.6269941086913351, + "grad_norm": 0.6852706074714661, + "learning_rate": 3.2243905163964863e-06, + "loss": 0.6293, + "step": 3108 + }, + { + "epoch": 0.6271958442475422, + "grad_norm": 0.5433623194694519, + "learning_rate": 3.221336159331618e-06, + "loss": 0.6476, + "step": 3109 + }, + { + "epoch": 0.6273975798037491, + "grad_norm": 0.48686903715133667, + "learning_rate": 3.218282561972407e-06, + "loss": 0.6762, + "step": 3110 + }, + { + "epoch": 0.6275993153599562, + "grad_norm": 0.47076553106307983, + "learning_rate": 3.2152297256231137e-06, + "loss": 0.6213, + "step": 3111 + }, + { + "epoch": 0.6278010509161631, + "grad_norm": 0.5113581418991089, + "learning_rate": 3.21217765158767e-06, + "loss": 0.781, + "step": 3112 + }, + { + "epoch": 0.6280027864723701, + "grad_norm": 0.3782995045185089, + "learning_rate": 3.209126341169681e-06, + "loss": 0.7276, + "step": 3113 + }, + { + "epoch": 0.6282045220285771, + "grad_norm": 0.5735915899276733, + "learning_rate": 3.2060757956724286e-06, + "loss": 0.6898, + "step": 3114 + }, + { + "epoch": 0.6284062575847841, + "grad_norm": 0.39048486948013306, + "learning_rate": 3.203026016398867e-06, + "loss": 0.6437, + "step": 3115 + }, + { + "epoch": 0.6286079931409911, + "grad_norm": 0.6044697165489197, + "learning_rate": 3.1999770046516198e-06, + "loss": 0.6616, + "step": 3116 + }, + { + "epoch": 0.6288097286971981, + "grad_norm": 0.3670637607574463, + "learning_rate": 3.1969287617329887e-06, + "loss": 0.8468, + "step": 3117 + }, + { + "epoch": 0.629011464253405, + "grad_norm": 0.6543151140213013, + "learning_rate": 3.1938812889449444e-06, + "loss": 0.657, + "step": 3118 + }, + { + "epoch": 0.6292131998096121, + "grad_norm": 0.32406216859817505, + "learning_rate": 3.1908345875891243e-06, + "loss": 0.774, + "step": 3119 + }, + { + "epoch": 0.629414935365819, + "grad_norm": 0.44408175349235535, + "learning_rate": 3.1877886589668423e-06, + "loss": 1.0477, + "step": 3120 + }, + { + "epoch": 0.629616670922026, + "grad_norm": 0.45666587352752686, + "learning_rate": 3.1847435043790833e-06, + "loss": 0.6835, + "step": 3121 + }, + { + "epoch": 0.6298184064782331, + "grad_norm": 0.33028221130371094, + "learning_rate": 3.181699125126493e-06, + "loss": 0.6404, + "step": 3122 + }, + { + "epoch": 0.63002014203444, + "grad_norm": 0.6274064183235168, + "learning_rate": 3.178655522509395e-06, + "loss": 0.6529, + "step": 3123 + }, + { + "epoch": 0.6302218775906471, + "grad_norm": 0.7759518027305603, + "learning_rate": 3.1756126978277756e-06, + "loss": 0.6167, + "step": 3124 + }, + { + "epoch": 0.630423613146854, + "grad_norm": 0.3837743401527405, + "learning_rate": 3.1725706523812925e-06, + "loss": 0.6501, + "step": 3125 + }, + { + "epoch": 0.630625348703061, + "grad_norm": 0.5663520097732544, + "learning_rate": 3.169529387469269e-06, + "loss": 0.6922, + "step": 3126 + }, + { + "epoch": 0.630827084259268, + "grad_norm": 0.6764240860939026, + "learning_rate": 3.1664889043906928e-06, + "loss": 0.6241, + "step": 3127 + }, + { + "epoch": 0.631028819815475, + "grad_norm": 1.6318018436431885, + "learning_rate": 3.1634492044442195e-06, + "loss": 0.8103, + "step": 3128 + }, + { + "epoch": 0.631230555371682, + "grad_norm": 0.42009663581848145, + "learning_rate": 3.160410288928175e-06, + "loss": 0.7029, + "step": 3129 + }, + { + "epoch": 0.631432290927889, + "grad_norm": 0.49394020438194275, + "learning_rate": 3.1573721591405405e-06, + "loss": 1.0464, + "step": 3130 + }, + { + "epoch": 0.631634026484096, + "grad_norm": 0.4648574888706207, + "learning_rate": 3.154334816378969e-06, + "loss": 0.6562, + "step": 3131 + }, + { + "epoch": 0.631835762040303, + "grad_norm": 0.47843027114868164, + "learning_rate": 3.151298261940775e-06, + "loss": 0.7575, + "step": 3132 + }, + { + "epoch": 0.63203749759651, + "grad_norm": 0.7636077404022217, + "learning_rate": 3.148262497122935e-06, + "loss": 0.8223, + "step": 3133 + }, + { + "epoch": 0.632239233152717, + "grad_norm": 0.378098726272583, + "learning_rate": 3.145227523222092e-06, + "loss": 0.7765, + "step": 3134 + }, + { + "epoch": 0.632440968708924, + "grad_norm": 0.5029193758964539, + "learning_rate": 3.1421933415345473e-06, + "loss": 0.6579, + "step": 3135 + }, + { + "epoch": 0.6326427042651309, + "grad_norm": 0.6376214027404785, + "learning_rate": 3.1391599533562644e-06, + "loss": 0.6408, + "step": 3136 + }, + { + "epoch": 0.632844439821338, + "grad_norm": 1.2196003198623657, + "learning_rate": 3.1361273599828722e-06, + "loss": 0.6752, + "step": 3137 + }, + { + "epoch": 0.6330461753775449, + "grad_norm": 0.7059176564216614, + "learning_rate": 3.1330955627096526e-06, + "loss": 0.6517, + "step": 3138 + }, + { + "epoch": 0.633247910933752, + "grad_norm": 0.6646839380264282, + "learning_rate": 3.130064562831553e-06, + "loss": 0.6575, + "step": 3139 + }, + { + "epoch": 0.6334496464899589, + "grad_norm": 3.0043044090270996, + "learning_rate": 3.1270343616431795e-06, + "loss": 0.641, + "step": 3140 + }, + { + "epoch": 0.6336513820461659, + "grad_norm": 0.34532269835472107, + "learning_rate": 3.1240049604387955e-06, + "loss": 0.7085, + "step": 3141 + }, + { + "epoch": 0.633853117602373, + "grad_norm": 0.719542384147644, + "learning_rate": 3.1209763605123233e-06, + "loss": 0.6261, + "step": 3142 + }, + { + "epoch": 0.6340548531585799, + "grad_norm": 0.4158155024051666, + "learning_rate": 3.117948563157346e-06, + "loss": 0.6272, + "step": 3143 + }, + { + "epoch": 0.6342565887147869, + "grad_norm": 0.49435046315193176, + "learning_rate": 3.1149215696670963e-06, + "loss": 0.7016, + "step": 3144 + }, + { + "epoch": 0.6344583242709939, + "grad_norm": 0.5325908660888672, + "learning_rate": 3.111895381334472e-06, + "loss": 0.7781, + "step": 3145 + }, + { + "epoch": 0.6346600598272009, + "grad_norm": 0.48683398962020874, + "learning_rate": 3.108869999452024e-06, + "loss": 0.7391, + "step": 3146 + }, + { + "epoch": 0.6348617953834079, + "grad_norm": 0.31066611409187317, + "learning_rate": 3.105845425311954e-06, + "loss": 0.6373, + "step": 3147 + }, + { + "epoch": 0.6350635309396149, + "grad_norm": 0.6181333661079407, + "learning_rate": 3.102821660206125e-06, + "loss": 0.7017, + "step": 3148 + }, + { + "epoch": 0.6352652664958218, + "grad_norm": 0.3839402198791504, + "learning_rate": 3.099798705426055e-06, + "loss": 0.787, + "step": 3149 + }, + { + "epoch": 0.6354670020520289, + "grad_norm": 0.6559051871299744, + "learning_rate": 3.0967765622629085e-06, + "loss": 0.6686, + "step": 3150 + }, + { + "epoch": 0.6356687376082358, + "grad_norm": 0.35817965865135193, + "learning_rate": 3.0937552320075116e-06, + "loss": 0.67, + "step": 3151 + }, + { + "epoch": 0.6358704731644429, + "grad_norm": 0.5216743350028992, + "learning_rate": 3.0907347159503364e-06, + "loss": 0.6393, + "step": 3152 + }, + { + "epoch": 0.6360722087206498, + "grad_norm": 0.3688260018825531, + "learning_rate": 3.0877150153815126e-06, + "loss": 0.7921, + "step": 3153 + }, + { + "epoch": 0.6362739442768568, + "grad_norm": 1.2055909633636475, + "learning_rate": 3.0846961315908206e-06, + "loss": 0.7709, + "step": 3154 + }, + { + "epoch": 0.6364756798330639, + "grad_norm": 0.5644564032554626, + "learning_rate": 3.0816780658676857e-06, + "loss": 0.6773, + "step": 3155 + }, + { + "epoch": 0.6366774153892708, + "grad_norm": 0.6851127743721008, + "learning_rate": 3.0786608195011938e-06, + "loss": 0.6381, + "step": 3156 + }, + { + "epoch": 0.6368791509454779, + "grad_norm": 0.33154991269111633, + "learning_rate": 3.0756443937800757e-06, + "loss": 0.6474, + "step": 3157 + }, + { + "epoch": 0.6370808865016848, + "grad_norm": 0.6636943817138672, + "learning_rate": 3.0726287899927075e-06, + "loss": 0.7441, + "step": 3158 + }, + { + "epoch": 0.6372826220578918, + "grad_norm": 0.4501652717590332, + "learning_rate": 3.069614009427123e-06, + "loss": 0.7027, + "step": 3159 + }, + { + "epoch": 0.6374843576140988, + "grad_norm": 0.48597389459609985, + "learning_rate": 3.0666000533709984e-06, + "loss": 0.7337, + "step": 3160 + }, + { + "epoch": 0.6376860931703058, + "grad_norm": 0.9662090539932251, + "learning_rate": 3.063586923111658e-06, + "loss": 0.7431, + "step": 3161 + }, + { + "epoch": 0.6378878287265127, + "grad_norm": 0.403626024723053, + "learning_rate": 3.0605746199360755e-06, + "loss": 0.6392, + "step": 3162 + }, + { + "epoch": 0.6380895642827198, + "grad_norm": 0.43942490220069885, + "learning_rate": 3.057563145130873e-06, + "loss": 0.6931, + "step": 3163 + }, + { + "epoch": 0.6382912998389267, + "grad_norm": 0.5516834855079651, + "learning_rate": 3.054552499982312e-06, + "loss": 0.7797, + "step": 3164 + }, + { + "epoch": 0.6384930353951338, + "grad_norm": 0.9289595484733582, + "learning_rate": 3.0515426857763087e-06, + "loss": 0.6549, + "step": 3165 + }, + { + "epoch": 0.6386947709513408, + "grad_norm": 0.3789737820625305, + "learning_rate": 3.0485337037984146e-06, + "loss": 0.6868, + "step": 3166 + }, + { + "epoch": 0.6388965065075477, + "grad_norm": 0.761205792427063, + "learning_rate": 3.045525555333834e-06, + "loss": 0.7468, + "step": 3167 + }, + { + "epoch": 0.6390982420637548, + "grad_norm": 0.46798014640808105, + "learning_rate": 3.0425182416674117e-06, + "loss": 0.6358, + "step": 3168 + }, + { + "epoch": 0.6392999776199617, + "grad_norm": 0.4467335641384125, + "learning_rate": 3.0395117640836337e-06, + "loss": 0.6945, + "step": 3169 + }, + { + "epoch": 0.6395017131761688, + "grad_norm": 0.41640374064445496, + "learning_rate": 3.0365061238666336e-06, + "loss": 0.6827, + "step": 3170 + }, + { + "epoch": 0.6397034487323757, + "grad_norm": 0.3545227348804474, + "learning_rate": 3.0335013223001865e-06, + "loss": 0.783, + "step": 3171 + }, + { + "epoch": 0.6399051842885827, + "grad_norm": 1.406578540802002, + "learning_rate": 3.0304973606677044e-06, + "loss": 0.6595, + "step": 3172 + }, + { + "epoch": 0.6401069198447897, + "grad_norm": 1.4224039316177368, + "learning_rate": 3.027494240252246e-06, + "loss": 0.6437, + "step": 3173 + }, + { + "epoch": 0.6403086554009967, + "grad_norm": 0.3247050344944, + "learning_rate": 3.024491962336511e-06, + "loss": 0.6654, + "step": 3174 + }, + { + "epoch": 0.6405103909572037, + "grad_norm": 0.3300015330314636, + "learning_rate": 3.021490528202831e-06, + "loss": 0.7923, + "step": 3175 + }, + { + "epoch": 0.6407121265134107, + "grad_norm": 0.48039737343788147, + "learning_rate": 3.018489939133188e-06, + "loss": 0.691, + "step": 3176 + }, + { + "epoch": 0.6409138620696176, + "grad_norm": 3.007985830307007, + "learning_rate": 3.0154901964091993e-06, + "loss": 0.7091, + "step": 3177 + }, + { + "epoch": 0.6411155976258247, + "grad_norm": 0.9376941323280334, + "learning_rate": 3.0124913013121148e-06, + "loss": 0.7613, + "step": 3178 + }, + { + "epoch": 0.6413173331820317, + "grad_norm": 1.181395411491394, + "learning_rate": 3.009493255122831e-06, + "loss": 0.6758, + "step": 3179 + }, + { + "epoch": 0.6415190687382386, + "grad_norm": 0.3755936026573181, + "learning_rate": 3.0064960591218763e-06, + "loss": 0.7249, + "step": 3180 + }, + { + "epoch": 0.6417208042944457, + "grad_norm": 0.40885961055755615, + "learning_rate": 3.0034997145894178e-06, + "loss": 0.689, + "step": 3181 + }, + { + "epoch": 0.6419225398506526, + "grad_norm": 0.41958701610565186, + "learning_rate": 3.0005042228052604e-06, + "loss": 0.7136, + "step": 3182 + }, + { + "epoch": 0.6421242754068597, + "grad_norm": 0.4304403066635132, + "learning_rate": 2.9975095850488412e-06, + "loss": 0.7982, + "step": 3183 + }, + { + "epoch": 0.6423260109630666, + "grad_norm": 0.4791124761104584, + "learning_rate": 2.9945158025992354e-06, + "loss": 0.7134, + "step": 3184 + }, + { + "epoch": 0.6425277465192736, + "grad_norm": 0.7359556555747986, + "learning_rate": 2.991522876735154e-06, + "loss": 0.8083, + "step": 3185 + }, + { + "epoch": 0.6427294820754806, + "grad_norm": 0.48312368988990784, + "learning_rate": 2.9885308087349364e-06, + "loss": 0.6765, + "step": 3186 + }, + { + "epoch": 0.6429312176316876, + "grad_norm": 0.6698987483978271, + "learning_rate": 2.9855395998765607e-06, + "loss": 0.6487, + "step": 3187 + }, + { + "epoch": 0.6431329531878947, + "grad_norm": 0.44107022881507874, + "learning_rate": 2.982549251437638e-06, + "loss": 0.6513, + "step": 3188 + }, + { + "epoch": 0.6433346887441016, + "grad_norm": 0.9434458017349243, + "learning_rate": 2.979559764695409e-06, + "loss": 0.6799, + "step": 3189 + }, + { + "epoch": 0.6435364243003086, + "grad_norm": 0.5217306017875671, + "learning_rate": 2.9765711409267484e-06, + "loss": 0.6945, + "step": 3190 + }, + { + "epoch": 0.6437381598565156, + "grad_norm": 0.6917257308959961, + "learning_rate": 2.9735833814081627e-06, + "loss": 0.7392, + "step": 3191 + }, + { + "epoch": 0.6439398954127226, + "grad_norm": 1.2743409872055054, + "learning_rate": 2.9705964874157865e-06, + "loss": 0.6708, + "step": 3192 + }, + { + "epoch": 0.6441416309689296, + "grad_norm": 0.4855993390083313, + "learning_rate": 2.967610460225391e-06, + "loss": 0.8801, + "step": 3193 + }, + { + "epoch": 0.6443433665251366, + "grad_norm": 1.0808053016662598, + "learning_rate": 2.964625301112366e-06, + "loss": 0.7138, + "step": 3194 + }, + { + "epoch": 0.6445451020813435, + "grad_norm": 0.6945085525512695, + "learning_rate": 2.9616410113517405e-06, + "loss": 0.8535, + "step": 3195 + }, + { + "epoch": 0.6447468376375506, + "grad_norm": 0.3475915789604187, + "learning_rate": 2.9586575922181724e-06, + "loss": 0.8371, + "step": 3196 + }, + { + "epoch": 0.6449485731937575, + "grad_norm": 0.6857219338417053, + "learning_rate": 2.9556750449859396e-06, + "loss": 0.6513, + "step": 3197 + }, + { + "epoch": 0.6451503087499645, + "grad_norm": 0.4195650517940521, + "learning_rate": 2.952693370928953e-06, + "loss": 0.6669, + "step": 3198 + }, + { + "epoch": 0.6453520443061715, + "grad_norm": 0.7740408182144165, + "learning_rate": 2.9497125713207518e-06, + "loss": 0.6673, + "step": 3199 + }, + { + "epoch": 0.6455537798623785, + "grad_norm": 0.34402042627334595, + "learning_rate": 2.9467326474344983e-06, + "loss": 0.6437, + "step": 3200 + }, + { + "epoch": 0.6457555154185856, + "grad_norm": 0.7659520506858826, + "learning_rate": 2.943753600542982e-06, + "loss": 0.6758, + "step": 3201 + }, + { + "epoch": 0.6459572509747925, + "grad_norm": 0.3940645754337311, + "learning_rate": 2.940775431918621e-06, + "loss": 0.6867, + "step": 3202 + }, + { + "epoch": 0.6461589865309995, + "grad_norm": 0.5474492311477661, + "learning_rate": 2.9377981428334494e-06, + "loss": 0.9003, + "step": 3203 + }, + { + "epoch": 0.6463607220872065, + "grad_norm": 0.4333469867706299, + "learning_rate": 2.9348217345591367e-06, + "loss": 0.6497, + "step": 3204 + }, + { + "epoch": 0.6465624576434135, + "grad_norm": 0.31425702571868896, + "learning_rate": 2.9318462083669706e-06, + "loss": 0.7416, + "step": 3205 + }, + { + "epoch": 0.6467641931996205, + "grad_norm": 0.6604920625686646, + "learning_rate": 2.9288715655278605e-06, + "loss": 0.7005, + "step": 3206 + }, + { + "epoch": 0.6469659287558275, + "grad_norm": 1.00324285030365, + "learning_rate": 2.9258978073123413e-06, + "loss": 0.7507, + "step": 3207 + }, + { + "epoch": 0.6471676643120344, + "grad_norm": 0.3944019675254822, + "learning_rate": 2.9229249349905686e-06, + "loss": 0.8014, + "step": 3208 + }, + { + "epoch": 0.6473693998682415, + "grad_norm": 0.4360457956790924, + "learning_rate": 2.9199529498323207e-06, + "loss": 0.7602, + "step": 3209 + }, + { + "epoch": 0.6475711354244484, + "grad_norm": 0.4374183416366577, + "learning_rate": 2.916981853106997e-06, + "loss": 0.657, + "step": 3210 + }, + { + "epoch": 0.6477728709806555, + "grad_norm": 0.335193008184433, + "learning_rate": 2.9140116460836175e-06, + "loss": 0.6482, + "step": 3211 + }, + { + "epoch": 0.6479746065368625, + "grad_norm": 0.43707412481307983, + "learning_rate": 2.9110423300308182e-06, + "loss": 0.7353, + "step": 3212 + }, + { + "epoch": 0.6481763420930694, + "grad_norm": 1.7312297821044922, + "learning_rate": 2.9080739062168626e-06, + "loss": 0.6955, + "step": 3213 + }, + { + "epoch": 0.6483780776492765, + "grad_norm": 1.1668918132781982, + "learning_rate": 2.9051063759096264e-06, + "loss": 0.6381, + "step": 3214 + }, + { + "epoch": 0.6485798132054834, + "grad_norm": 1.7101664543151855, + "learning_rate": 2.9021397403766034e-06, + "loss": 0.6756, + "step": 3215 + }, + { + "epoch": 0.6487815487616904, + "grad_norm": 0.6171029210090637, + "learning_rate": 2.8991740008849117e-06, + "loss": 0.7846, + "step": 3216 + }, + { + "epoch": 0.6489832843178974, + "grad_norm": 0.3693191707134247, + "learning_rate": 2.896209158701281e-06, + "loss": 0.6294, + "step": 3217 + }, + { + "epoch": 0.6491850198741044, + "grad_norm": 0.40041401982307434, + "learning_rate": 2.8932452150920576e-06, + "loss": 0.909, + "step": 3218 + }, + { + "epoch": 0.6493867554303114, + "grad_norm": 0.3019446134567261, + "learning_rate": 2.89028217132321e-06, + "loss": 0.6771, + "step": 3219 + }, + { + "epoch": 0.6495884909865184, + "grad_norm": 0.5917576551437378, + "learning_rate": 2.887320028660312e-06, + "loss": 0.7757, + "step": 3220 + }, + { + "epoch": 0.6497902265427253, + "grad_norm": 0.4102381467819214, + "learning_rate": 2.884358788368563e-06, + "loss": 0.6877, + "step": 3221 + }, + { + "epoch": 0.6499919620989324, + "grad_norm": 0.39108479022979736, + "learning_rate": 2.8813984517127723e-06, + "loss": 0.7356, + "step": 3222 + }, + { + "epoch": 0.6501936976551393, + "grad_norm": 0.30545833706855774, + "learning_rate": 2.87843901995736e-06, + "loss": 0.6435, + "step": 3223 + }, + { + "epoch": 0.6503954332113464, + "grad_norm": 0.7336997389793396, + "learning_rate": 2.875480494366367e-06, + "loss": 0.7055, + "step": 3224 + }, + { + "epoch": 0.6505971687675534, + "grad_norm": 0.31208398938179016, + "learning_rate": 2.872522876203443e-06, + "loss": 0.7695, + "step": 3225 + }, + { + "epoch": 0.6507989043237603, + "grad_norm": 0.5648106336593628, + "learning_rate": 2.8695661667318465e-06, + "loss": 0.6477, + "step": 3226 + }, + { + "epoch": 0.6510006398799674, + "grad_norm": 0.546466588973999, + "learning_rate": 2.8666103672144597e-06, + "loss": 0.7945, + "step": 3227 + }, + { + "epoch": 0.6512023754361743, + "grad_norm": 0.4231576919555664, + "learning_rate": 2.8636554789137587e-06, + "loss": 0.6658, + "step": 3228 + }, + { + "epoch": 0.6514041109923814, + "grad_norm": 0.6607463955879211, + "learning_rate": 2.860701503091845e-06, + "loss": 0.6531, + "step": 3229 + }, + { + "epoch": 0.6516058465485883, + "grad_norm": 0.6346076130867004, + "learning_rate": 2.8577484410104283e-06, + "loss": 0.6459, + "step": 3230 + }, + { + "epoch": 0.6518075821047953, + "grad_norm": 0.593224823474884, + "learning_rate": 2.8547962939308187e-06, + "loss": 0.6421, + "step": 3231 + }, + { + "epoch": 0.6520093176610023, + "grad_norm": 0.3373255431652069, + "learning_rate": 2.8518450631139467e-06, + "loss": 0.7888, + "step": 3232 + }, + { + "epoch": 0.6522110532172093, + "grad_norm": 0.4355120062828064, + "learning_rate": 2.8488947498203445e-06, + "loss": 0.6563, + "step": 3233 + }, + { + "epoch": 0.6524127887734164, + "grad_norm": 1.12356698513031, + "learning_rate": 2.8459453553101526e-06, + "loss": 0.6683, + "step": 3234 + }, + { + "epoch": 0.6526145243296233, + "grad_norm": 0.37360039353370667, + "learning_rate": 2.8429968808431275e-06, + "loss": 0.7465, + "step": 3235 + }, + { + "epoch": 0.6528162598858303, + "grad_norm": 0.5207381844520569, + "learning_rate": 2.840049327678618e-06, + "loss": 0.7123, + "step": 3236 + }, + { + "epoch": 0.6530179954420373, + "grad_norm": 0.47542569041252136, + "learning_rate": 2.8371026970755903e-06, + "loss": 0.6815, + "step": 3237 + }, + { + "epoch": 0.6532197309982443, + "grad_norm": 0.49312570691108704, + "learning_rate": 2.8341569902926198e-06, + "loss": 0.6957, + "step": 3238 + }, + { + "epoch": 0.6534214665544512, + "grad_norm": 0.39625513553619385, + "learning_rate": 2.8312122085878725e-06, + "loss": 0.7877, + "step": 3239 + }, + { + "epoch": 0.6536232021106583, + "grad_norm": 1.3322360515594482, + "learning_rate": 2.8282683532191333e-06, + "loss": 0.7897, + "step": 3240 + }, + { + "epoch": 0.6538249376668652, + "grad_norm": 1.1602030992507935, + "learning_rate": 2.825325425443786e-06, + "loss": 0.67, + "step": 3241 + }, + { + "epoch": 0.6540266732230723, + "grad_norm": 0.5422742962837219, + "learning_rate": 2.8223834265188154e-06, + "loss": 0.9461, + "step": 3242 + }, + { + "epoch": 0.6542284087792792, + "grad_norm": 1.4504823684692383, + "learning_rate": 2.8194423577008167e-06, + "loss": 0.6599, + "step": 3243 + }, + { + "epoch": 0.6544301443354862, + "grad_norm": 0.8231688737869263, + "learning_rate": 2.816502220245982e-06, + "loss": 0.6647, + "step": 3244 + }, + { + "epoch": 0.6546318798916932, + "grad_norm": 1.09976327419281, + "learning_rate": 2.8135630154101044e-06, + "loss": 0.6879, + "step": 3245 + }, + { + "epoch": 0.6548336154479002, + "grad_norm": 0.37235990166664124, + "learning_rate": 2.810624744448588e-06, + "loss": 0.6403, + "step": 3246 + }, + { + "epoch": 0.6550353510041073, + "grad_norm": 0.3631633520126343, + "learning_rate": 2.807687408616427e-06, + "loss": 0.6878, + "step": 3247 + }, + { + "epoch": 0.6552370865603142, + "grad_norm": 0.5159462094306946, + "learning_rate": 2.8047510091682223e-06, + "loss": 0.6744, + "step": 3248 + }, + { + "epoch": 0.6554388221165212, + "grad_norm": 0.81461101770401, + "learning_rate": 2.801815547358173e-06, + "loss": 0.6666, + "step": 3249 + }, + { + "epoch": 0.6556405576727282, + "grad_norm": 0.5176712870597839, + "learning_rate": 2.7988810244400766e-06, + "loss": 0.6644, + "step": 3250 + }, + { + "epoch": 0.6558422932289352, + "grad_norm": 0.4347783625125885, + "learning_rate": 2.795947441667334e-06, + "loss": 0.6467, + "step": 3251 + }, + { + "epoch": 0.6560440287851422, + "grad_norm": 0.5249154567718506, + "learning_rate": 2.79301480029294e-06, + "loss": 0.6968, + "step": 3252 + }, + { + "epoch": 0.6562457643413492, + "grad_norm": 0.4152076244354248, + "learning_rate": 2.7900831015694884e-06, + "loss": 0.8355, + "step": 3253 + }, + { + "epoch": 0.6564474998975561, + "grad_norm": 0.6886522769927979, + "learning_rate": 2.787152346749173e-06, + "loss": 0.6496, + "step": 3254 + }, + { + "epoch": 0.6566492354537632, + "grad_norm": 0.4261277914047241, + "learning_rate": 2.784222537083781e-06, + "loss": 0.6611, + "step": 3255 + }, + { + "epoch": 0.6568509710099701, + "grad_norm": 0.41764065623283386, + "learning_rate": 2.7812936738246977e-06, + "loss": 0.7819, + "step": 3256 + }, + { + "epoch": 0.6570527065661771, + "grad_norm": 0.4790710508823395, + "learning_rate": 2.7783657582229006e-06, + "loss": 0.6821, + "step": 3257 + }, + { + "epoch": 0.6572544421223842, + "grad_norm": 0.7666770815849304, + "learning_rate": 2.775438791528971e-06, + "loss": 0.6297, + "step": 3258 + }, + { + "epoch": 0.6574561776785911, + "grad_norm": 0.3256152272224426, + "learning_rate": 2.7725127749930768e-06, + "loss": 0.6162, + "step": 3259 + }, + { + "epoch": 0.6576579132347982, + "grad_norm": 0.39912089705467224, + "learning_rate": 2.7695877098649828e-06, + "loss": 0.7555, + "step": 3260 + }, + { + "epoch": 0.6578596487910051, + "grad_norm": 0.3404659330844879, + "learning_rate": 2.766663597394044e-06, + "loss": 0.6616, + "step": 3261 + }, + { + "epoch": 0.6580613843472121, + "grad_norm": 0.7698021531105042, + "learning_rate": 2.7637404388292184e-06, + "loss": 0.6816, + "step": 3262 + }, + { + "epoch": 0.6582631199034191, + "grad_norm": 0.4176563620567322, + "learning_rate": 2.760818235419046e-06, + "loss": 0.6545, + "step": 3263 + }, + { + "epoch": 0.6584648554596261, + "grad_norm": 0.9553281664848328, + "learning_rate": 2.757896988411662e-06, + "loss": 0.6904, + "step": 3264 + }, + { + "epoch": 0.6586665910158331, + "grad_norm": 0.3938625752925873, + "learning_rate": 2.7549766990547973e-06, + "loss": 0.7029, + "step": 3265 + }, + { + "epoch": 0.6588683265720401, + "grad_norm": 0.5802119970321655, + "learning_rate": 2.7520573685957685e-06, + "loss": 0.7366, + "step": 3266 + }, + { + "epoch": 0.659070062128247, + "grad_norm": 0.4900028109550476, + "learning_rate": 2.7491389982814846e-06, + "loss": 0.6723, + "step": 3267 + }, + { + "epoch": 0.6592717976844541, + "grad_norm": 0.47692182660102844, + "learning_rate": 2.746221589358443e-06, + "loss": 0.8045, + "step": 3268 + }, + { + "epoch": 0.659473533240661, + "grad_norm": 0.3741524815559387, + "learning_rate": 2.7433051430727353e-06, + "loss": 0.6767, + "step": 3269 + }, + { + "epoch": 0.6596752687968681, + "grad_norm": 1.0025602579116821, + "learning_rate": 2.7403896606700363e-06, + "loss": 0.86, + "step": 3270 + }, + { + "epoch": 0.6598770043530751, + "grad_norm": 0.3644028306007385, + "learning_rate": 2.7374751433956103e-06, + "loss": 0.6842, + "step": 3271 + }, + { + "epoch": 0.660078739909282, + "grad_norm": 0.7453895211219788, + "learning_rate": 2.734561592494314e-06, + "loss": 0.6767, + "step": 3272 + }, + { + "epoch": 0.6602804754654891, + "grad_norm": 0.6459788084030151, + "learning_rate": 2.7316490092105856e-06, + "loss": 0.6485, + "step": 3273 + }, + { + "epoch": 0.660482211021696, + "grad_norm": 0.9067775011062622, + "learning_rate": 2.7287373947884523e-06, + "loss": 0.7792, + "step": 3274 + }, + { + "epoch": 0.660683946577903, + "grad_norm": 0.42903923988342285, + "learning_rate": 2.7258267504715276e-06, + "loss": 0.6674, + "step": 3275 + }, + { + "epoch": 0.66088568213411, + "grad_norm": 0.4832659363746643, + "learning_rate": 2.7229170775030078e-06, + "loss": 0.6905, + "step": 3276 + }, + { + "epoch": 0.661087417690317, + "grad_norm": 0.3498452305793762, + "learning_rate": 2.720008377125682e-06, + "loss": 0.7758, + "step": 3277 + }, + { + "epoch": 0.661289153246524, + "grad_norm": 0.5607253909111023, + "learning_rate": 2.7171006505819153e-06, + "loss": 0.6126, + "step": 3278 + }, + { + "epoch": 0.661490888802731, + "grad_norm": 0.6438088417053223, + "learning_rate": 2.7141938991136597e-06, + "loss": 0.6824, + "step": 3279 + }, + { + "epoch": 0.6616926243589379, + "grad_norm": 0.5905462503433228, + "learning_rate": 2.711288123962455e-06, + "loss": 0.6258, + "step": 3280 + }, + { + "epoch": 0.661894359915145, + "grad_norm": 0.3438940942287445, + "learning_rate": 2.7083833263694182e-06, + "loss": 0.6689, + "step": 3281 + }, + { + "epoch": 0.662096095471352, + "grad_norm": 0.49798575043678284, + "learning_rate": 2.7054795075752494e-06, + "loss": 0.8654, + "step": 3282 + }, + { + "epoch": 0.662297831027559, + "grad_norm": 0.5611802339553833, + "learning_rate": 2.702576668820237e-06, + "loss": 0.6325, + "step": 3283 + }, + { + "epoch": 0.662499566583766, + "grad_norm": 0.3897034823894501, + "learning_rate": 2.6996748113442397e-06, + "loss": 0.7167, + "step": 3284 + }, + { + "epoch": 0.6627013021399729, + "grad_norm": 0.30359792709350586, + "learning_rate": 2.696773936386706e-06, + "loss": 0.7844, + "step": 3285 + }, + { + "epoch": 0.66290303769618, + "grad_norm": 0.7668525576591492, + "learning_rate": 2.6938740451866674e-06, + "loss": 0.7037, + "step": 3286 + }, + { + "epoch": 0.6631047732523869, + "grad_norm": 0.38820183277130127, + "learning_rate": 2.690975138982721e-06, + "loss": 0.6852, + "step": 3287 + }, + { + "epoch": 0.663306508808594, + "grad_norm": 0.4365038275718689, + "learning_rate": 2.6880772190130576e-06, + "loss": 0.7843, + "step": 3288 + }, + { + "epoch": 0.6635082443648009, + "grad_norm": 0.4769340753555298, + "learning_rate": 2.6851802865154403e-06, + "loss": 0.8124, + "step": 3289 + }, + { + "epoch": 0.6637099799210079, + "grad_norm": 1.5809507369995117, + "learning_rate": 2.6822843427272094e-06, + "loss": 0.6859, + "step": 3290 + }, + { + "epoch": 0.663911715477215, + "grad_norm": 0.3428616523742676, + "learning_rate": 2.6793893888852885e-06, + "loss": 0.6811, + "step": 3291 + }, + { + "epoch": 0.6641134510334219, + "grad_norm": 0.5171772837638855, + "learning_rate": 2.676495426226172e-06, + "loss": 0.7681, + "step": 3292 + }, + { + "epoch": 0.6643151865896288, + "grad_norm": 1.7425131797790527, + "learning_rate": 2.6736024559859335e-06, + "loss": 0.8144, + "step": 3293 + }, + { + "epoch": 0.6645169221458359, + "grad_norm": 0.3819194734096527, + "learning_rate": 2.6707104794002283e-06, + "loss": 0.673, + "step": 3294 + }, + { + "epoch": 0.6647186577020429, + "grad_norm": 0.41104787588119507, + "learning_rate": 2.6678194977042727e-06, + "loss": 0.8096, + "step": 3295 + }, + { + "epoch": 0.6649203932582499, + "grad_norm": 1.118825078010559, + "learning_rate": 2.6649295121328745e-06, + "loss": 0.6665, + "step": 3296 + }, + { + "epoch": 0.6651221288144569, + "grad_norm": 0.38198497891426086, + "learning_rate": 2.6620405239204066e-06, + "loss": 0.6618, + "step": 3297 + }, + { + "epoch": 0.6653238643706638, + "grad_norm": 5.1874237060546875, + "learning_rate": 2.6591525343008164e-06, + "loss": 0.7858, + "step": 3298 + }, + { + "epoch": 0.6655255999268709, + "grad_norm": 0.32285603880882263, + "learning_rate": 2.6562655445076296e-06, + "loss": 0.7059, + "step": 3299 + }, + { + "epoch": 0.6657273354830778, + "grad_norm": 0.6265811920166016, + "learning_rate": 2.6533795557739407e-06, + "loss": 0.6951, + "step": 3300 + }, + { + "epoch": 0.6659290710392849, + "grad_norm": 0.3196472227573395, + "learning_rate": 2.650494569332415e-06, + "loss": 0.6171, + "step": 3301 + }, + { + "epoch": 0.6661308065954918, + "grad_norm": 0.39705124497413635, + "learning_rate": 2.647610586415299e-06, + "loss": 0.6602, + "step": 3302 + }, + { + "epoch": 0.6663325421516988, + "grad_norm": 0.44583508372306824, + "learning_rate": 2.644727608254396e-06, + "loss": 0.6791, + "step": 3303 + }, + { + "epoch": 0.6665342777079059, + "grad_norm": 0.3522447645664215, + "learning_rate": 2.6418456360810918e-06, + "loss": 0.6254, + "step": 3304 + }, + { + "epoch": 0.6667360132641128, + "grad_norm": 0.7186568379402161, + "learning_rate": 2.6389646711263417e-06, + "loss": 0.6565, + "step": 3305 + }, + { + "epoch": 0.6669377488203199, + "grad_norm": 0.7441889047622681, + "learning_rate": 2.6360847146206624e-06, + "loss": 0.6145, + "step": 3306 + }, + { + "epoch": 0.6671394843765268, + "grad_norm": 0.3817283809185028, + "learning_rate": 2.633205767794149e-06, + "loss": 0.6737, + "step": 3307 + }, + { + "epoch": 0.6673412199327338, + "grad_norm": 1.152697205543518, + "learning_rate": 2.6303278318764613e-06, + "loss": 0.6301, + "step": 3308 + }, + { + "epoch": 0.6675429554889408, + "grad_norm": 0.7283335328102112, + "learning_rate": 2.6274509080968252e-06, + "loss": 0.7515, + "step": 3309 + }, + { + "epoch": 0.6677446910451478, + "grad_norm": 0.6745430827140808, + "learning_rate": 2.6245749976840406e-06, + "loss": 0.6622, + "step": 3310 + }, + { + "epoch": 0.6679464266013547, + "grad_norm": 0.3779880702495575, + "learning_rate": 2.621700101866469e-06, + "loss": 0.6511, + "step": 3311 + }, + { + "epoch": 0.6681481621575618, + "grad_norm": 0.40893808007240295, + "learning_rate": 2.618826221872039e-06, + "loss": 0.6498, + "step": 3312 + }, + { + "epoch": 0.6683498977137687, + "grad_norm": 0.3666467070579529, + "learning_rate": 2.61595335892825e-06, + "loss": 0.6254, + "step": 3313 + }, + { + "epoch": 0.6685516332699758, + "grad_norm": 0.8473466634750366, + "learning_rate": 2.6130815142621614e-06, + "loss": 0.7145, + "step": 3314 + }, + { + "epoch": 0.6687533688261827, + "grad_norm": 0.6159899830818176, + "learning_rate": 2.6102106891004002e-06, + "loss": 0.6858, + "step": 3315 + }, + { + "epoch": 0.6689551043823897, + "grad_norm": 0.5568851828575134, + "learning_rate": 2.6073408846691582e-06, + "loss": 0.7757, + "step": 3316 + }, + { + "epoch": 0.6691568399385968, + "grad_norm": 0.6199424862861633, + "learning_rate": 2.6044721021941887e-06, + "loss": 0.6802, + "step": 3317 + }, + { + "epoch": 0.6693585754948037, + "grad_norm": 0.3639499247074127, + "learning_rate": 2.601604342900814e-06, + "loss": 0.6053, + "step": 3318 + }, + { + "epoch": 0.6695603110510108, + "grad_norm": 0.4354953467845917, + "learning_rate": 2.5987376080139136e-06, + "loss": 0.6559, + "step": 3319 + }, + { + "epoch": 0.6697620466072177, + "grad_norm": 0.481569766998291, + "learning_rate": 2.5958718987579313e-06, + "loss": 0.6343, + "step": 3320 + }, + { + "epoch": 0.6699637821634247, + "grad_norm": 0.4260782301425934, + "learning_rate": 2.5930072163568752e-06, + "loss": 0.7423, + "step": 3321 + }, + { + "epoch": 0.6701655177196317, + "grad_norm": 0.46539121866226196, + "learning_rate": 2.590143562034312e-06, + "loss": 0.7246, + "step": 3322 + }, + { + "epoch": 0.6703672532758387, + "grad_norm": 0.53533536195755, + "learning_rate": 2.5872809370133704e-06, + "loss": 0.6694, + "step": 3323 + }, + { + "epoch": 0.6705689888320457, + "grad_norm": 0.783592700958252, + "learning_rate": 2.5844193425167374e-06, + "loss": 0.6935, + "step": 3324 + }, + { + "epoch": 0.6707707243882527, + "grad_norm": 0.364938884973526, + "learning_rate": 2.581558779766664e-06, + "loss": 0.6913, + "step": 3325 + }, + { + "epoch": 0.6709724599444596, + "grad_norm": 0.3413155674934387, + "learning_rate": 2.5786992499849584e-06, + "loss": 0.7899, + "step": 3326 + }, + { + "epoch": 0.6711741955006667, + "grad_norm": 0.6075354814529419, + "learning_rate": 2.575840754392984e-06, + "loss": 0.6825, + "step": 3327 + }, + { + "epoch": 0.6713759310568737, + "grad_norm": 0.6059809923171997, + "learning_rate": 2.5729832942116705e-06, + "loss": 0.6617, + "step": 3328 + }, + { + "epoch": 0.6715776666130807, + "grad_norm": 0.49375784397125244, + "learning_rate": 2.570126870661499e-06, + "loss": 0.6622, + "step": 3329 + }, + { + "epoch": 0.6717794021692877, + "grad_norm": 0.46630603075027466, + "learning_rate": 2.5672714849625084e-06, + "loss": 0.6755, + "step": 3330 + }, + { + "epoch": 0.6719811377254946, + "grad_norm": 0.5884243249893188, + "learning_rate": 2.5644171383342965e-06, + "loss": 0.6426, + "step": 3331 + }, + { + "epoch": 0.6721828732817017, + "grad_norm": 1.362908124923706, + "learning_rate": 2.5615638319960133e-06, + "loss": 0.6702, + "step": 3332 + }, + { + "epoch": 0.6723846088379086, + "grad_norm": 1.3717610836029053, + "learning_rate": 2.5587115671663732e-06, + "loss": 0.7833, + "step": 3333 + }, + { + "epoch": 0.6725863443941156, + "grad_norm": 0.3735629618167877, + "learning_rate": 2.555860345063636e-06, + "loss": 0.6799, + "step": 3334 + }, + { + "epoch": 0.6727880799503226, + "grad_norm": 0.8326557874679565, + "learning_rate": 2.553010166905619e-06, + "loss": 0.6809, + "step": 3335 + }, + { + "epoch": 0.6729898155065296, + "grad_norm": 0.7335181832313538, + "learning_rate": 2.5501610339096987e-06, + "loss": 0.6632, + "step": 3336 + }, + { + "epoch": 0.6731915510627366, + "grad_norm": 2.6886038780212402, + "learning_rate": 2.547312947292799e-06, + "loss": 0.7022, + "step": 3337 + }, + { + "epoch": 0.6733932866189436, + "grad_norm": 0.48173320293426514, + "learning_rate": 2.5444659082713978e-06, + "loss": 0.8309, + "step": 3338 + }, + { + "epoch": 0.6735950221751505, + "grad_norm": 1.1179413795471191, + "learning_rate": 2.5416199180615297e-06, + "loss": 0.6615, + "step": 3339 + }, + { + "epoch": 0.6737967577313576, + "grad_norm": 0.6155341267585754, + "learning_rate": 2.5387749778787775e-06, + "loss": 0.67, + "step": 3340 + }, + { + "epoch": 0.6739984932875646, + "grad_norm": 0.5885789394378662, + "learning_rate": 2.535931088938274e-06, + "loss": 0.6407, + "step": 3341 + }, + { + "epoch": 0.6742002288437716, + "grad_norm": 0.6222666501998901, + "learning_rate": 2.5330882524547107e-06, + "loss": 0.8534, + "step": 3342 + }, + { + "epoch": 0.6744019643999786, + "grad_norm": 0.6253111362457275, + "learning_rate": 2.530246469642318e-06, + "loss": 0.6266, + "step": 3343 + }, + { + "epoch": 0.6746036999561855, + "grad_norm": 0.3401983976364136, + "learning_rate": 2.5274057417148866e-06, + "loss": 0.6534, + "step": 3344 + }, + { + "epoch": 0.6748054355123926, + "grad_norm": 0.6543856263160706, + "learning_rate": 2.524566069885752e-06, + "loss": 0.7743, + "step": 3345 + }, + { + "epoch": 0.6750071710685995, + "grad_norm": 0.5683391690254211, + "learning_rate": 2.5217274553677975e-06, + "loss": 0.7044, + "step": 3346 + }, + { + "epoch": 0.6752089066248066, + "grad_norm": 0.34736117720603943, + "learning_rate": 2.5188898993734594e-06, + "loss": 0.8291, + "step": 3347 + }, + { + "epoch": 0.6754106421810135, + "grad_norm": 0.389544814825058, + "learning_rate": 2.5160534031147175e-06, + "loss": 0.659, + "step": 3348 + }, + { + "epoch": 0.6756123777372205, + "grad_norm": 1.0548666715621948, + "learning_rate": 2.5132179678030995e-06, + "loss": 0.686, + "step": 3349 + }, + { + "epoch": 0.6758141132934276, + "grad_norm": 0.40486329793930054, + "learning_rate": 2.5103835946496846e-06, + "loss": 0.784, + "step": 3350 + }, + { + "epoch": 0.6760158488496345, + "grad_norm": 0.37598225474357605, + "learning_rate": 2.507550284865089e-06, + "loss": 0.7081, + "step": 3351 + }, + { + "epoch": 0.6762175844058415, + "grad_norm": 0.5921080708503723, + "learning_rate": 2.504718039659483e-06, + "loss": 0.7131, + "step": 3352 + }, + { + "epoch": 0.6764193199620485, + "grad_norm": 0.35993483662605286, + "learning_rate": 2.5018868602425846e-06, + "loss": 0.9476, + "step": 3353 + }, + { + "epoch": 0.6766210555182555, + "grad_norm": 0.45840582251548767, + "learning_rate": 2.499056747823642e-06, + "loss": 0.7662, + "step": 3354 + }, + { + "epoch": 0.6768227910744625, + "grad_norm": 0.6039509773254395, + "learning_rate": 2.4962277036114648e-06, + "loss": 0.6869, + "step": 3355 + }, + { + "epoch": 0.6770245266306695, + "grad_norm": 0.6492475867271423, + "learning_rate": 2.493399728814396e-06, + "loss": 0.6659, + "step": 3356 + }, + { + "epoch": 0.6772262621868764, + "grad_norm": 0.3330156207084656, + "learning_rate": 2.4905728246403226e-06, + "loss": 0.7897, + "step": 3357 + }, + { + "epoch": 0.6774279977430835, + "grad_norm": 0.5170933604240417, + "learning_rate": 2.4877469922966823e-06, + "loss": 0.8187, + "step": 3358 + }, + { + "epoch": 0.6776297332992904, + "grad_norm": 0.5229772925376892, + "learning_rate": 2.484922232990441e-06, + "loss": 0.698, + "step": 3359 + }, + { + "epoch": 0.6778314688554975, + "grad_norm": 0.40928876399993896, + "learning_rate": 2.4820985479281184e-06, + "loss": 0.6308, + "step": 3360 + }, + { + "epoch": 0.6780332044117044, + "grad_norm": 0.6356455087661743, + "learning_rate": 2.479275938315775e-06, + "loss": 0.6408, + "step": 3361 + }, + { + "epoch": 0.6782349399679114, + "grad_norm": 0.405601441860199, + "learning_rate": 2.4764544053590005e-06, + "loss": 0.9088, + "step": 3362 + }, + { + "epoch": 0.6784366755241185, + "grad_norm": 0.5116518139839172, + "learning_rate": 2.4736339502629385e-06, + "loss": 0.7813, + "step": 3363 + }, + { + "epoch": 0.6786384110803254, + "grad_norm": 0.4038514196872711, + "learning_rate": 2.4708145742322643e-06, + "loss": 0.67, + "step": 3364 + }, + { + "epoch": 0.6788401466365325, + "grad_norm": 0.44164231419563293, + "learning_rate": 2.4679962784711915e-06, + "loss": 0.6275, + "step": 3365 + }, + { + "epoch": 0.6790418821927394, + "grad_norm": 0.45643317699432373, + "learning_rate": 2.4651790641834788e-06, + "loss": 0.726, + "step": 3366 + }, + { + "epoch": 0.6792436177489464, + "grad_norm": 1.0231688022613525, + "learning_rate": 2.4623629325724186e-06, + "loss": 0.751, + "step": 3367 + }, + { + "epoch": 0.6794453533051534, + "grad_norm": 0.40992307662963867, + "learning_rate": 2.4595478848408377e-06, + "loss": 0.6971, + "step": 3368 + }, + { + "epoch": 0.6796470888613604, + "grad_norm": 0.6936296820640564, + "learning_rate": 2.4567339221911086e-06, + "loss": 0.6235, + "step": 3369 + }, + { + "epoch": 0.6798488244175673, + "grad_norm": 0.5006226897239685, + "learning_rate": 2.4539210458251333e-06, + "loss": 0.6386, + "step": 3370 + }, + { + "epoch": 0.6800505599737744, + "grad_norm": 0.424445778131485, + "learning_rate": 2.4511092569443518e-06, + "loss": 0.6737, + "step": 3371 + }, + { + "epoch": 0.6802522955299813, + "grad_norm": 0.6313624978065491, + "learning_rate": 2.4482985567497395e-06, + "loss": 0.7417, + "step": 3372 + }, + { + "epoch": 0.6804540310861884, + "grad_norm": 0.6919423341751099, + "learning_rate": 2.4454889464418052e-06, + "loss": 0.6836, + "step": 3373 + }, + { + "epoch": 0.6806557666423954, + "grad_norm": 1.5375241041183472, + "learning_rate": 2.4426804272205985e-06, + "loss": 0.6692, + "step": 3374 + }, + { + "epoch": 0.6808575021986023, + "grad_norm": 1.3544081449508667, + "learning_rate": 2.4398730002856958e-06, + "loss": 0.8126, + "step": 3375 + }, + { + "epoch": 0.6810592377548094, + "grad_norm": 0.39466825127601624, + "learning_rate": 2.437066666836208e-06, + "loss": 0.6005, + "step": 3376 + }, + { + "epoch": 0.6812609733110163, + "grad_norm": 0.6122323870658875, + "learning_rate": 2.434261428070785e-06, + "loss": 0.6519, + "step": 3377 + }, + { + "epoch": 0.6814627088672234, + "grad_norm": 1.1176679134368896, + "learning_rate": 2.4314572851876016e-06, + "loss": 0.6268, + "step": 3378 + }, + { + "epoch": 0.6816644444234303, + "grad_norm": 0.4447222352027893, + "learning_rate": 2.4286542393843665e-06, + "loss": 0.6636, + "step": 3379 + }, + { + "epoch": 0.6818661799796373, + "grad_norm": 0.8198567628860474, + "learning_rate": 2.425852291858325e-06, + "loss": 0.6355, + "step": 3380 + }, + { + "epoch": 0.6820679155358443, + "grad_norm": 0.8290604948997498, + "learning_rate": 2.423051443806247e-06, + "loss": 0.6412, + "step": 3381 + }, + { + "epoch": 0.6822696510920513, + "grad_norm": 0.42411890625953674, + "learning_rate": 2.4202516964244347e-06, + "loss": 0.6699, + "step": 3382 + }, + { + "epoch": 0.6824713866482583, + "grad_norm": 0.4729047119617462, + "learning_rate": 2.4174530509087193e-06, + "loss": 0.7003, + "step": 3383 + }, + { + "epoch": 0.6826731222044653, + "grad_norm": 0.6185622215270996, + "learning_rate": 2.4146555084544665e-06, + "loss": 0.6598, + "step": 3384 + }, + { + "epoch": 0.6828748577606722, + "grad_norm": 0.6863579750061035, + "learning_rate": 2.4118590702565643e-06, + "loss": 0.6678, + "step": 3385 + }, + { + "epoch": 0.6830765933168793, + "grad_norm": 0.34752097725868225, + "learning_rate": 2.4090637375094323e-06, + "loss": 0.6857, + "step": 3386 + }, + { + "epoch": 0.6832783288730863, + "grad_norm": 0.3720141649246216, + "learning_rate": 2.4062695114070156e-06, + "loss": 0.8465, + "step": 3387 + }, + { + "epoch": 0.6834800644292932, + "grad_norm": 0.5772742629051208, + "learning_rate": 2.4034763931427917e-06, + "loss": 0.6451, + "step": 3388 + }, + { + "epoch": 0.6836817999855003, + "grad_norm": 0.488405704498291, + "learning_rate": 2.40068438390976e-06, + "loss": 0.746, + "step": 3389 + }, + { + "epoch": 0.6838835355417072, + "grad_norm": 0.48354852199554443, + "learning_rate": 2.3978934849004477e-06, + "loss": 0.6652, + "step": 3390 + }, + { + "epoch": 0.6840852710979143, + "grad_norm": 0.5033637881278992, + "learning_rate": 2.395103697306906e-06, + "loss": 0.6791, + "step": 3391 + }, + { + "epoch": 0.6842870066541212, + "grad_norm": 0.4847155809402466, + "learning_rate": 2.3923150223207176e-06, + "loss": 0.7022, + "step": 3392 + }, + { + "epoch": 0.6844887422103282, + "grad_norm": 0.5099661946296692, + "learning_rate": 2.3895274611329826e-06, + "loss": 0.6725, + "step": 3393 + }, + { + "epoch": 0.6846904777665352, + "grad_norm": 0.49720004200935364, + "learning_rate": 2.3867410149343284e-06, + "loss": 0.7407, + "step": 3394 + }, + { + "epoch": 0.6848922133227422, + "grad_norm": 0.7642709016799927, + "learning_rate": 2.383955684914908e-06, + "loss": 0.6789, + "step": 3395 + }, + { + "epoch": 0.6850939488789493, + "grad_norm": 0.4113779366016388, + "learning_rate": 2.3811714722643954e-06, + "loss": 0.922, + "step": 3396 + }, + { + "epoch": 0.6852956844351562, + "grad_norm": 0.6020419001579285, + "learning_rate": 2.3783883781719857e-06, + "loss": 0.6606, + "step": 3397 + }, + { + "epoch": 0.6854974199913632, + "grad_norm": 4.857358455657959, + "learning_rate": 2.3756064038264033e-06, + "loss": 0.7021, + "step": 3398 + }, + { + "epoch": 0.6856991555475702, + "grad_norm": 0.2977675497531891, + "learning_rate": 2.3728255504158827e-06, + "loss": 0.834, + "step": 3399 + }, + { + "epoch": 0.6859008911037772, + "grad_norm": 0.39246630668640137, + "learning_rate": 2.3700458191281913e-06, + "loss": 0.6267, + "step": 3400 + }, + { + "epoch": 0.6861026266599842, + "grad_norm": 0.57973712682724, + "learning_rate": 2.3672672111506104e-06, + "loss": 0.7184, + "step": 3401 + }, + { + "epoch": 0.6863043622161912, + "grad_norm": 0.3879867196083069, + "learning_rate": 2.3644897276699426e-06, + "loss": 0.6248, + "step": 3402 + }, + { + "epoch": 0.6865060977723981, + "grad_norm": 0.6185869574546814, + "learning_rate": 2.3617133698725137e-06, + "loss": 0.824, + "step": 3403 + }, + { + "epoch": 0.6867078333286052, + "grad_norm": 0.4408000111579895, + "learning_rate": 2.358938138944164e-06, + "loss": 0.6526, + "step": 3404 + }, + { + "epoch": 0.6869095688848121, + "grad_norm": 0.5326852798461914, + "learning_rate": 2.3561640360702525e-06, + "loss": 0.7627, + "step": 3405 + }, + { + "epoch": 0.6871113044410191, + "grad_norm": 1.1812150478363037, + "learning_rate": 2.353391062435665e-06, + "loss": 0.6987, + "step": 3406 + }, + { + "epoch": 0.6873130399972261, + "grad_norm": 0.4056047797203064, + "learning_rate": 2.3506192192247893e-06, + "loss": 0.8717, + "step": 3407 + }, + { + "epoch": 0.6875147755534331, + "grad_norm": 0.4317014515399933, + "learning_rate": 2.3478485076215444e-06, + "loss": 0.7081, + "step": 3408 + }, + { + "epoch": 0.6877165111096402, + "grad_norm": 0.37390607595443726, + "learning_rate": 2.3450789288093646e-06, + "loss": 0.7089, + "step": 3409 + }, + { + "epoch": 0.6879182466658471, + "grad_norm": 0.5218164920806885, + "learning_rate": 2.342310483971188e-06, + "loss": 0.6372, + "step": 3410 + }, + { + "epoch": 0.6881199822220541, + "grad_norm": 0.42212942242622375, + "learning_rate": 2.3395431742894836e-06, + "loss": 0.6279, + "step": 3411 + }, + { + "epoch": 0.6883217177782611, + "grad_norm": 0.37575986981391907, + "learning_rate": 2.336777000946227e-06, + "loss": 0.6768, + "step": 3412 + }, + { + "epoch": 0.6885234533344681, + "grad_norm": 0.5202623009681702, + "learning_rate": 2.334011965122909e-06, + "loss": 0.819, + "step": 3413 + }, + { + "epoch": 0.6887251888906751, + "grad_norm": 0.46885550022125244, + "learning_rate": 2.331248068000539e-06, + "loss": 0.733, + "step": 3414 + }, + { + "epoch": 0.6889269244468821, + "grad_norm": 0.7351022362709045, + "learning_rate": 2.328485310759635e-06, + "loss": 0.665, + "step": 3415 + }, + { + "epoch": 0.689128660003089, + "grad_norm": 0.6357284188270569, + "learning_rate": 2.3257236945802292e-06, + "loss": 0.6295, + "step": 3416 + }, + { + "epoch": 0.6893303955592961, + "grad_norm": 0.3708368241786957, + "learning_rate": 2.3229632206418727e-06, + "loss": 0.7518, + "step": 3417 + }, + { + "epoch": 0.689532131115503, + "grad_norm": 0.8100972771644592, + "learning_rate": 2.3202038901236157e-06, + "loss": 0.6639, + "step": 3418 + }, + { + "epoch": 0.6897338666717101, + "grad_norm": 0.47530439496040344, + "learning_rate": 2.317445704204033e-06, + "loss": 0.6972, + "step": 3419 + }, + { + "epoch": 0.689935602227917, + "grad_norm": 0.4638540744781494, + "learning_rate": 2.3146886640612045e-06, + "loss": 0.6631, + "step": 3420 + }, + { + "epoch": 0.690137337784124, + "grad_norm": 0.501802921295166, + "learning_rate": 2.3119327708727187e-06, + "loss": 0.7817, + "step": 3421 + }, + { + "epoch": 0.6903390733403311, + "grad_norm": 0.46486896276474, + "learning_rate": 2.3091780258156805e-06, + "loss": 0.7204, + "step": 3422 + }, + { + "epoch": 0.690540808896538, + "grad_norm": 0.9126157760620117, + "learning_rate": 2.3064244300667e-06, + "loss": 0.6801, + "step": 3423 + }, + { + "epoch": 0.6907425444527451, + "grad_norm": 0.42806971073150635, + "learning_rate": 2.3036719848018942e-06, + "loss": 0.6358, + "step": 3424 + }, + { + "epoch": 0.690944280008952, + "grad_norm": 0.30322638154029846, + "learning_rate": 2.3009206911968984e-06, + "loss": 0.6602, + "step": 3425 + }, + { + "epoch": 0.691146015565159, + "grad_norm": 0.4272553324699402, + "learning_rate": 2.2981705504268415e-06, + "loss": 0.6592, + "step": 3426 + }, + { + "epoch": 0.691347751121366, + "grad_norm": 0.6703861355781555, + "learning_rate": 2.295421563666372e-06, + "loss": 0.6591, + "step": 3427 + }, + { + "epoch": 0.691549486677573, + "grad_norm": 0.5437256693840027, + "learning_rate": 2.292673732089644e-06, + "loss": 0.6519, + "step": 3428 + }, + { + "epoch": 0.6917512222337799, + "grad_norm": 0.39333558082580566, + "learning_rate": 2.2899270568703096e-06, + "loss": 0.731, + "step": 3429 + }, + { + "epoch": 0.691952957789987, + "grad_norm": 0.7978708744049072, + "learning_rate": 2.2871815391815377e-06, + "loss": 0.8463, + "step": 3430 + }, + { + "epoch": 0.692154693346194, + "grad_norm": 0.37055301666259766, + "learning_rate": 2.2844371801959965e-06, + "loss": 0.6729, + "step": 3431 + }, + { + "epoch": 0.692356428902401, + "grad_norm": 0.4103529453277588, + "learning_rate": 2.281693981085859e-06, + "loss": 1.1052, + "step": 3432 + }, + { + "epoch": 0.692558164458608, + "grad_norm": 0.4024542570114136, + "learning_rate": 2.2789519430228084e-06, + "loss": 0.6711, + "step": 3433 + }, + { + "epoch": 0.6927599000148149, + "grad_norm": 0.9240769147872925, + "learning_rate": 2.2762110671780263e-06, + "loss": 0.6759, + "step": 3434 + }, + { + "epoch": 0.692961635571022, + "grad_norm": 0.5204855799674988, + "learning_rate": 2.2734713547221976e-06, + "loss": 0.6282, + "step": 3435 + }, + { + "epoch": 0.6931633711272289, + "grad_norm": 0.7119300365447998, + "learning_rate": 2.270732806825517e-06, + "loss": 0.844, + "step": 3436 + }, + { + "epoch": 0.693365106683436, + "grad_norm": 0.4279820919036865, + "learning_rate": 2.2679954246576754e-06, + "loss": 0.6066, + "step": 3437 + }, + { + "epoch": 0.6935668422396429, + "grad_norm": 0.3645946681499481, + "learning_rate": 2.265259209387867e-06, + "loss": 0.6672, + "step": 3438 + }, + { + "epoch": 0.6937685777958499, + "grad_norm": 0.35464754700660706, + "learning_rate": 2.262524162184789e-06, + "loss": 0.6452, + "step": 3439 + }, + { + "epoch": 0.6939703133520569, + "grad_norm": 0.4235239028930664, + "learning_rate": 2.2597902842166366e-06, + "loss": 0.6188, + "step": 3440 + }, + { + "epoch": 0.6941720489082639, + "grad_norm": 0.6387593150138855, + "learning_rate": 2.2570575766511115e-06, + "loss": 0.6596, + "step": 3441 + }, + { + "epoch": 0.694373784464471, + "grad_norm": 0.4509795904159546, + "learning_rate": 2.254326040655412e-06, + "loss": 0.7899, + "step": 3442 + }, + { + "epoch": 0.6945755200206779, + "grad_norm": 1.2646552324295044, + "learning_rate": 2.2515956773962315e-06, + "loss": 0.6117, + "step": 3443 + }, + { + "epoch": 0.6947772555768849, + "grad_norm": 0.4494341313838959, + "learning_rate": 2.2488664880397726e-06, + "loss": 0.8168, + "step": 3444 + }, + { + "epoch": 0.6949789911330919, + "grad_norm": 0.43138113617897034, + "learning_rate": 2.2461384737517283e-06, + "loss": 0.6593, + "step": 3445 + }, + { + "epoch": 0.6951807266892989, + "grad_norm": 1.265650749206543, + "learning_rate": 2.2434116356972927e-06, + "loss": 0.6795, + "step": 3446 + }, + { + "epoch": 0.6953824622455058, + "grad_norm": 0.7934123277664185, + "learning_rate": 2.240685975041155e-06, + "loss": 0.6411, + "step": 3447 + }, + { + "epoch": 0.6955841978017129, + "grad_norm": 0.4912724196910858, + "learning_rate": 2.237961492947507e-06, + "loss": 0.6871, + "step": 3448 + }, + { + "epoch": 0.6957859333579198, + "grad_norm": 0.4764300286769867, + "learning_rate": 2.2352381905800325e-06, + "loss": 0.6373, + "step": 3449 + }, + { + "epoch": 0.6959876689141269, + "grad_norm": 0.6754266023635864, + "learning_rate": 2.23251606910191e-06, + "loss": 0.6287, + "step": 3450 + }, + { + "epoch": 0.6961894044703338, + "grad_norm": 0.8246191143989563, + "learning_rate": 2.2297951296758203e-06, + "loss": 0.6471, + "step": 3451 + }, + { + "epoch": 0.6963911400265408, + "grad_norm": 1.140021562576294, + "learning_rate": 2.227075373463934e-06, + "loss": 0.8084, + "step": 3452 + }, + { + "epoch": 0.6965928755827479, + "grad_norm": 0.5953003764152527, + "learning_rate": 2.2243568016279167e-06, + "loss": 0.7284, + "step": 3453 + }, + { + "epoch": 0.6967946111389548, + "grad_norm": 0.3563099801540375, + "learning_rate": 2.221639415328928e-06, + "loss": 0.6609, + "step": 3454 + }, + { + "epoch": 0.6969963466951619, + "grad_norm": 0.3192271888256073, + "learning_rate": 2.2189232157276247e-06, + "loss": 0.6704, + "step": 3455 + }, + { + "epoch": 0.6971980822513688, + "grad_norm": 1.3229840993881226, + "learning_rate": 2.216208203984154e-06, + "loss": 0.7263, + "step": 3456 + }, + { + "epoch": 0.6973998178075758, + "grad_norm": 0.6065989136695862, + "learning_rate": 2.2134943812581544e-06, + "loss": 0.6356, + "step": 3457 + }, + { + "epoch": 0.6976015533637828, + "grad_norm": 0.48584502935409546, + "learning_rate": 2.210781748708757e-06, + "loss": 0.6244, + "step": 3458 + }, + { + "epoch": 0.6978032889199898, + "grad_norm": 0.5548803806304932, + "learning_rate": 2.2080703074945894e-06, + "loss": 0.6113, + "step": 3459 + }, + { + "epoch": 0.6980050244761968, + "grad_norm": 0.6471741199493408, + "learning_rate": 2.205360058773764e-06, + "loss": 0.6809, + "step": 3460 + }, + { + "epoch": 0.6982067600324038, + "grad_norm": 0.5157921314239502, + "learning_rate": 2.202651003703885e-06, + "loss": 0.6554, + "step": 3461 + }, + { + "epoch": 0.6984084955886107, + "grad_norm": 0.4163440465927124, + "learning_rate": 2.199943143442052e-06, + "loss": 0.6819, + "step": 3462 + }, + { + "epoch": 0.6986102311448178, + "grad_norm": 0.3754201829433441, + "learning_rate": 2.1972364791448488e-06, + "loss": 0.6534, + "step": 3463 + }, + { + "epoch": 0.6988119667010247, + "grad_norm": 0.6087068319320679, + "learning_rate": 2.194531011968348e-06, + "loss": 0.7028, + "step": 3464 + }, + { + "epoch": 0.6990137022572317, + "grad_norm": 0.7134766578674316, + "learning_rate": 2.1918267430681184e-06, + "loss": 0.6727, + "step": 3465 + }, + { + "epoch": 0.6992154378134388, + "grad_norm": 0.4389779567718506, + "learning_rate": 2.1891236735992044e-06, + "loss": 0.6918, + "step": 3466 + }, + { + "epoch": 0.6994171733696457, + "grad_norm": 0.8246768116950989, + "learning_rate": 2.18642180471615e-06, + "loss": 0.7179, + "step": 3467 + }, + { + "epoch": 0.6996189089258528, + "grad_norm": 1.204871654510498, + "learning_rate": 2.1837211375729812e-06, + "loss": 0.6657, + "step": 3468 + }, + { + "epoch": 0.6998206444820597, + "grad_norm": 0.44929951429367065, + "learning_rate": 2.181021673323208e-06, + "loss": 0.6923, + "step": 3469 + }, + { + "epoch": 0.7000223800382667, + "grad_norm": 0.862220823764801, + "learning_rate": 2.178323413119834e-06, + "loss": 0.8135, + "step": 3470 + }, + { + "epoch": 0.7002241155944737, + "grad_norm": 0.5849249958992004, + "learning_rate": 2.1756263581153427e-06, + "loss": 0.6884, + "step": 3471 + }, + { + "epoch": 0.7004258511506807, + "grad_norm": 0.8377878069877625, + "learning_rate": 2.1729305094617016e-06, + "loss": 0.7828, + "step": 3472 + }, + { + "epoch": 0.7006275867068877, + "grad_norm": 0.6120553016662598, + "learning_rate": 2.170235868310372e-06, + "loss": 0.6917, + "step": 3473 + }, + { + "epoch": 0.7008293222630947, + "grad_norm": 0.5731369853019714, + "learning_rate": 2.167542435812286e-06, + "loss": 0.6501, + "step": 3474 + }, + { + "epoch": 0.7010310578193016, + "grad_norm": 0.5575307607650757, + "learning_rate": 2.16485021311787e-06, + "loss": 0.6643, + "step": 3475 + }, + { + "epoch": 0.7012327933755087, + "grad_norm": 0.5431971549987793, + "learning_rate": 2.162159201377034e-06, + "loss": 0.8182, + "step": 3476 + }, + { + "epoch": 0.7014345289317157, + "grad_norm": 0.5338951349258423, + "learning_rate": 2.1594694017391604e-06, + "loss": 0.6866, + "step": 3477 + }, + { + "epoch": 0.7016362644879227, + "grad_norm": 0.31306493282318115, + "learning_rate": 2.156780815353125e-06, + "loss": 0.6562, + "step": 3478 + }, + { + "epoch": 0.7018380000441297, + "grad_norm": 0.8306349515914917, + "learning_rate": 2.15409344336728e-06, + "loss": 0.6357, + "step": 3479 + }, + { + "epoch": 0.7020397356003366, + "grad_norm": 0.4163278639316559, + "learning_rate": 2.151407286929458e-06, + "loss": 0.6896, + "step": 3480 + }, + { + "epoch": 0.7022414711565437, + "grad_norm": 0.6696937680244446, + "learning_rate": 2.1487223471869793e-06, + "loss": 0.6761, + "step": 3481 + }, + { + "epoch": 0.7024432067127506, + "grad_norm": 0.3525125980377197, + "learning_rate": 2.1460386252866327e-06, + "loss": 0.6543, + "step": 3482 + }, + { + "epoch": 0.7026449422689576, + "grad_norm": 0.5017916560173035, + "learning_rate": 2.143356122374697e-06, + "loss": 0.684, + "step": 3483 + }, + { + "epoch": 0.7028466778251646, + "grad_norm": 0.5240808725357056, + "learning_rate": 2.140674839596931e-06, + "loss": 0.6856, + "step": 3484 + }, + { + "epoch": 0.7030484133813716, + "grad_norm": 0.4257897436618805, + "learning_rate": 2.1379947780985603e-06, + "loss": 0.8324, + "step": 3485 + }, + { + "epoch": 0.7032501489375786, + "grad_norm": 0.4334295094013214, + "learning_rate": 2.1353159390243035e-06, + "loss": 0.6699, + "step": 3486 + }, + { + "epoch": 0.7034518844937856, + "grad_norm": 0.3629571497440338, + "learning_rate": 2.132638323518348e-06, + "loss": 0.6528, + "step": 3487 + }, + { + "epoch": 0.7036536200499925, + "grad_norm": 0.469107449054718, + "learning_rate": 2.129961932724359e-06, + "loss": 0.6625, + "step": 3488 + }, + { + "epoch": 0.7038553556061996, + "grad_norm": 0.4057089686393738, + "learning_rate": 2.1272867677854853e-06, + "loss": 0.9242, + "step": 3489 + }, + { + "epoch": 0.7040570911624066, + "grad_norm": 0.7630118131637573, + "learning_rate": 2.124612829844345e-06, + "loss": 0.6927, + "step": 3490 + }, + { + "epoch": 0.7042588267186136, + "grad_norm": 0.35844919085502625, + "learning_rate": 2.121940120043033e-06, + "loss": 0.8758, + "step": 3491 + }, + { + "epoch": 0.7044605622748206, + "grad_norm": 0.7905420660972595, + "learning_rate": 2.119268639523124e-06, + "loss": 0.707, + "step": 3492 + }, + { + "epoch": 0.7046622978310275, + "grad_norm": 0.4721790850162506, + "learning_rate": 2.1165983894256647e-06, + "loss": 0.6773, + "step": 3493 + }, + { + "epoch": 0.7048640333872346, + "grad_norm": 0.34849104285240173, + "learning_rate": 2.113929370891176e-06, + "loss": 0.6873, + "step": 3494 + }, + { + "epoch": 0.7050657689434415, + "grad_norm": 0.3804130554199219, + "learning_rate": 2.1112615850596518e-06, + "loss": 0.6803, + "step": 3495 + }, + { + "epoch": 0.7052675044996486, + "grad_norm": 0.9019719958305359, + "learning_rate": 2.1085950330705613e-06, + "loss": 0.802, + "step": 3496 + }, + { + "epoch": 0.7054692400558555, + "grad_norm": 0.37380659580230713, + "learning_rate": 2.105929716062848e-06, + "loss": 0.6707, + "step": 3497 + }, + { + "epoch": 0.7056709756120625, + "grad_norm": 0.3215074837207794, + "learning_rate": 2.103265635174926e-06, + "loss": 0.6622, + "step": 3498 + }, + { + "epoch": 0.7058727111682696, + "grad_norm": 0.8287083506584167, + "learning_rate": 2.1006027915446785e-06, + "loss": 0.785, + "step": 3499 + }, + { + "epoch": 0.7060744467244765, + "grad_norm": 0.3489381968975067, + "learning_rate": 2.0979411863094677e-06, + "loss": 0.7725, + "step": 3500 + }, + { + "epoch": 0.7062761822806835, + "grad_norm": 0.38414129614830017, + "learning_rate": 2.095280820606121e-06, + "loss": 0.8481, + "step": 3501 + }, + { + "epoch": 0.7064779178368905, + "grad_norm": 0.8350195288658142, + "learning_rate": 2.0926216955709355e-06, + "loss": 0.8169, + "step": 3502 + }, + { + "epoch": 0.7066796533930975, + "grad_norm": 0.41042858362197876, + "learning_rate": 2.0899638123396847e-06, + "loss": 0.8681, + "step": 3503 + }, + { + "epoch": 0.7068813889493045, + "grad_norm": 0.36468562483787537, + "learning_rate": 2.0873071720476067e-06, + "loss": 0.6322, + "step": 3504 + }, + { + "epoch": 0.7070831245055115, + "grad_norm": 0.6368513107299805, + "learning_rate": 2.084651775829409e-06, + "loss": 0.7808, + "step": 3505 + }, + { + "epoch": 0.7072848600617184, + "grad_norm": 0.4741699993610382, + "learning_rate": 2.0819976248192664e-06, + "loss": 0.7226, + "step": 3506 + }, + { + "epoch": 0.7074865956179255, + "grad_norm": 1.307973027229309, + "learning_rate": 2.0793447201508288e-06, + "loss": 0.6619, + "step": 3507 + }, + { + "epoch": 0.7076883311741324, + "grad_norm": 1.281394124031067, + "learning_rate": 2.0766930629572057e-06, + "loss": 0.7279, + "step": 3508 + }, + { + "epoch": 0.7078900667303395, + "grad_norm": 0.7751509547233582, + "learning_rate": 2.0740426543709783e-06, + "loss": 0.6551, + "step": 3509 + }, + { + "epoch": 0.7080918022865464, + "grad_norm": 0.9136074781417847, + "learning_rate": 2.071393495524191e-06, + "loss": 0.7842, + "step": 3510 + }, + { + "epoch": 0.7082935378427534, + "grad_norm": 0.6318010687828064, + "learning_rate": 2.0687455875483603e-06, + "loss": 0.6892, + "step": 3511 + }, + { + "epoch": 0.7084952733989605, + "grad_norm": 0.5524263381958008, + "learning_rate": 2.0660989315744624e-06, + "loss": 0.7104, + "step": 3512 + }, + { + "epoch": 0.7086970089551674, + "grad_norm": 0.6849998831748962, + "learning_rate": 2.0634535287329416e-06, + "loss": 0.7497, + "step": 3513 + }, + { + "epoch": 0.7088987445113745, + "grad_norm": 0.5287047028541565, + "learning_rate": 2.060809380153705e-06, + "loss": 0.6998, + "step": 3514 + }, + { + "epoch": 0.7091004800675814, + "grad_norm": 0.3894512355327606, + "learning_rate": 2.058166486966128e-06, + "loss": 0.7638, + "step": 3515 + }, + { + "epoch": 0.7093022156237884, + "grad_norm": 0.6588128805160522, + "learning_rate": 2.0555248502990473e-06, + "loss": 0.6625, + "step": 3516 + }, + { + "epoch": 0.7095039511799954, + "grad_norm": 1.2160587310791016, + "learning_rate": 2.0528844712807588e-06, + "loss": 0.6813, + "step": 3517 + }, + { + "epoch": 0.7097056867362024, + "grad_norm": 0.5312079787254333, + "learning_rate": 2.05024535103903e-06, + "loss": 0.6315, + "step": 3518 + }, + { + "epoch": 0.7099074222924093, + "grad_norm": 0.4518246054649353, + "learning_rate": 2.0476074907010853e-06, + "loss": 0.8286, + "step": 3519 + }, + { + "epoch": 0.7101091578486164, + "grad_norm": 0.4721282422542572, + "learning_rate": 2.044970891393608e-06, + "loss": 0.7441, + "step": 3520 + }, + { + "epoch": 0.7103108934048233, + "grad_norm": 0.43414926528930664, + "learning_rate": 2.042335554242752e-06, + "loss": 0.6512, + "step": 3521 + }, + { + "epoch": 0.7105126289610304, + "grad_norm": 0.4045238494873047, + "learning_rate": 2.039701480374121e-06, + "loss": 0.6694, + "step": 3522 + }, + { + "epoch": 0.7107143645172374, + "grad_norm": 0.4696193039417267, + "learning_rate": 2.0370686709127885e-06, + "loss": 0.6588, + "step": 3523 + }, + { + "epoch": 0.7109161000734443, + "grad_norm": 2.4165115356445312, + "learning_rate": 2.0344371269832834e-06, + "loss": 0.7298, + "step": 3524 + }, + { + "epoch": 0.7111178356296514, + "grad_norm": 0.5125167965888977, + "learning_rate": 2.031806849709593e-06, + "loss": 0.69, + "step": 3525 + }, + { + "epoch": 0.7113195711858583, + "grad_norm": 0.4065336287021637, + "learning_rate": 2.0291778402151685e-06, + "loss": 0.663, + "step": 3526 + }, + { + "epoch": 0.7115213067420654, + "grad_norm": 0.6046711802482605, + "learning_rate": 2.026550099622914e-06, + "loss": 0.698, + "step": 3527 + }, + { + "epoch": 0.7117230422982723, + "grad_norm": 0.886115550994873, + "learning_rate": 2.0239236290551946e-06, + "loss": 0.6353, + "step": 3528 + }, + { + "epoch": 0.7119247778544793, + "grad_norm": 0.6150587201118469, + "learning_rate": 2.021298429633834e-06, + "loss": 0.8005, + "step": 3529 + }, + { + "epoch": 0.7121265134106863, + "grad_norm": 0.8583071827888489, + "learning_rate": 2.01867450248011e-06, + "loss": 0.7305, + "step": 3530 + }, + { + "epoch": 0.7123282489668933, + "grad_norm": 1.2208572626113892, + "learning_rate": 2.016051848714758e-06, + "loss": 0.7093, + "step": 3531 + }, + { + "epoch": 0.7125299845231003, + "grad_norm": 0.5126375555992126, + "learning_rate": 2.0134304694579737e-06, + "loss": 0.6569, + "step": 3532 + }, + { + "epoch": 0.7127317200793073, + "grad_norm": 1.7419109344482422, + "learning_rate": 2.0108103658293982e-06, + "loss": 1.2269, + "step": 3533 + }, + { + "epoch": 0.7129334556355142, + "grad_norm": 0.8870477676391602, + "learning_rate": 2.008191538948139e-06, + "loss": 0.6706, + "step": 3534 + }, + { + "epoch": 0.7131351911917213, + "grad_norm": 0.6514060497283936, + "learning_rate": 2.005573989932753e-06, + "loss": 0.7235, + "step": 3535 + }, + { + "epoch": 0.7133369267479283, + "grad_norm": 0.5482868552207947, + "learning_rate": 2.0029577199012496e-06, + "loss": 0.7448, + "step": 3536 + }, + { + "epoch": 0.7135386623041353, + "grad_norm": 0.6119092702865601, + "learning_rate": 2.0003427299710966e-06, + "loss": 0.6549, + "step": 3537 + }, + { + "epoch": 0.7137403978603423, + "grad_norm": 0.35968175530433655, + "learning_rate": 1.9977290212592116e-06, + "loss": 0.672, + "step": 3538 + }, + { + "epoch": 0.7139421334165492, + "grad_norm": 0.9781152606010437, + "learning_rate": 1.9951165948819646e-06, + "loss": 0.6671, + "step": 3539 + }, + { + "epoch": 0.7141438689727563, + "grad_norm": 0.5758945941925049, + "learning_rate": 1.9925054519551833e-06, + "loss": 0.8566, + "step": 3540 + }, + { + "epoch": 0.7143456045289632, + "grad_norm": 1.2972782850265503, + "learning_rate": 1.989895593594137e-06, + "loss": 0.6791, + "step": 3541 + }, + { + "epoch": 0.7145473400851702, + "grad_norm": 0.7332674264907837, + "learning_rate": 1.987287020913556e-06, + "loss": 0.7392, + "step": 3542 + }, + { + "epoch": 0.7147490756413772, + "grad_norm": 0.35885506868362427, + "learning_rate": 1.984679735027621e-06, + "loss": 0.6427, + "step": 3543 + }, + { + "epoch": 0.7149508111975842, + "grad_norm": 0.8930257558822632, + "learning_rate": 1.9820737370499533e-06, + "loss": 0.7992, + "step": 3544 + }, + { + "epoch": 0.7151525467537913, + "grad_norm": 0.38415658473968506, + "learning_rate": 1.979469028093635e-06, + "loss": 0.7889, + "step": 3545 + }, + { + "epoch": 0.7153542823099982, + "grad_norm": 0.41735655069351196, + "learning_rate": 1.9768656092711934e-06, + "loss": 0.7093, + "step": 3546 + }, + { + "epoch": 0.7155560178662052, + "grad_norm": 0.3132960796356201, + "learning_rate": 1.974263481694602e-06, + "loss": 0.6574, + "step": 3547 + }, + { + "epoch": 0.7157577534224122, + "grad_norm": 0.8277822732925415, + "learning_rate": 1.9716626464752896e-06, + "loss": 0.6673, + "step": 3548 + }, + { + "epoch": 0.7159594889786192, + "grad_norm": 0.7590373158454895, + "learning_rate": 1.9690631047241267e-06, + "loss": 0.6493, + "step": 3549 + }, + { + "epoch": 0.7161612245348262, + "grad_norm": 0.5453904867172241, + "learning_rate": 1.9664648575514316e-06, + "loss": 0.6598, + "step": 3550 + }, + { + "epoch": 0.7163629600910332, + "grad_norm": 0.4179770350456238, + "learning_rate": 1.963867906066978e-06, + "loss": 0.7093, + "step": 3551 + }, + { + "epoch": 0.7165646956472401, + "grad_norm": 0.3255173861980438, + "learning_rate": 1.9612722513799714e-06, + "loss": 0.8126, + "step": 3552 + }, + { + "epoch": 0.7167664312034472, + "grad_norm": 0.6156246662139893, + "learning_rate": 1.9586778945990785e-06, + "loss": 0.6545, + "step": 3553 + }, + { + "epoch": 0.7169681667596541, + "grad_norm": 1.3091801404953003, + "learning_rate": 1.9560848368324024e-06, + "loss": 0.6836, + "step": 3554 + }, + { + "epoch": 0.7171699023158612, + "grad_norm": 1.0971754789352417, + "learning_rate": 1.953493079187493e-06, + "loss": 0.9682, + "step": 3555 + }, + { + "epoch": 0.7173716378720681, + "grad_norm": 1.8149343729019165, + "learning_rate": 1.9509026227713487e-06, + "loss": 0.6908, + "step": 3556 + }, + { + "epoch": 0.7175733734282751, + "grad_norm": 0.5145043134689331, + "learning_rate": 1.948313468690407e-06, + "loss": 0.6739, + "step": 3557 + }, + { + "epoch": 0.7177751089844822, + "grad_norm": 0.4650350511074066, + "learning_rate": 1.9457256180505507e-06, + "loss": 0.645, + "step": 3558 + }, + { + "epoch": 0.7179768445406891, + "grad_norm": 0.39293235540390015, + "learning_rate": 1.9431390719571096e-06, + "loss": 0.8465, + "step": 3559 + }, + { + "epoch": 0.7181785800968961, + "grad_norm": 0.564153254032135, + "learning_rate": 1.940553831514852e-06, + "loss": 0.6868, + "step": 3560 + }, + { + "epoch": 0.7183803156531031, + "grad_norm": 0.373946875333786, + "learning_rate": 1.9379698978279886e-06, + "loss": 0.62, + "step": 3561 + }, + { + "epoch": 0.7185820512093101, + "grad_norm": 0.7261645197868347, + "learning_rate": 1.935387272000175e-06, + "loss": 0.6233, + "step": 3562 + }, + { + "epoch": 0.7187837867655171, + "grad_norm": 0.46915268898010254, + "learning_rate": 1.932805955134503e-06, + "loss": 0.6634, + "step": 3563 + }, + { + "epoch": 0.7189855223217241, + "grad_norm": 1.2086641788482666, + "learning_rate": 1.9302259483335123e-06, + "loss": 0.6569, + "step": 3564 + }, + { + "epoch": 0.719187257877931, + "grad_norm": 0.520490825176239, + "learning_rate": 1.9276472526991785e-06, + "loss": 0.6266, + "step": 3565 + }, + { + "epoch": 0.7193889934341381, + "grad_norm": 0.367524117231369, + "learning_rate": 1.925069869332916e-06, + "loss": 0.7286, + "step": 3566 + }, + { + "epoch": 0.719590728990345, + "grad_norm": 0.3706212639808655, + "learning_rate": 1.9224937993355846e-06, + "loss": 0.7045, + "step": 3567 + }, + { + "epoch": 0.7197924645465521, + "grad_norm": 0.6001324653625488, + "learning_rate": 1.9199190438074767e-06, + "loss": 0.6536, + "step": 3568 + }, + { + "epoch": 0.719994200102759, + "grad_norm": 0.4473857581615448, + "learning_rate": 1.9173456038483244e-06, + "loss": 0.7551, + "step": 3569 + }, + { + "epoch": 0.720195935658966, + "grad_norm": 0.5655458569526672, + "learning_rate": 1.914773480557304e-06, + "loss": 0.6492, + "step": 3570 + }, + { + "epoch": 0.7203976712151731, + "grad_norm": 0.651198148727417, + "learning_rate": 1.9122026750330213e-06, + "loss": 0.6725, + "step": 3571 + }, + { + "epoch": 0.72059940677138, + "grad_norm": 1.1214271783828735, + "learning_rate": 1.9096331883735237e-06, + "loss": 0.7058, + "step": 3572 + }, + { + "epoch": 0.7208011423275871, + "grad_norm": 0.4652702510356903, + "learning_rate": 1.9070650216762927e-06, + "loss": 0.7483, + "step": 3573 + }, + { + "epoch": 0.721002877883794, + "grad_norm": 0.4463115930557251, + "learning_rate": 1.9044981760382502e-06, + "loss": 0.6675, + "step": 3574 + }, + { + "epoch": 0.721204613440001, + "grad_norm": 0.4675081670284271, + "learning_rate": 1.9019326525557508e-06, + "loss": 0.6827, + "step": 3575 + }, + { + "epoch": 0.721406348996208, + "grad_norm": 1.2795963287353516, + "learning_rate": 1.8993684523245842e-06, + "loss": 0.6651, + "step": 3576 + }, + { + "epoch": 0.721608084552415, + "grad_norm": 0.4441238045692444, + "learning_rate": 1.896805576439974e-06, + "loss": 0.691, + "step": 3577 + }, + { + "epoch": 0.7218098201086219, + "grad_norm": 0.4510972797870636, + "learning_rate": 1.8942440259965833e-06, + "loss": 0.702, + "step": 3578 + }, + { + "epoch": 0.722011555664829, + "grad_norm": 0.4195414185523987, + "learning_rate": 1.891683802088503e-06, + "loss": 0.6515, + "step": 3579 + }, + { + "epoch": 0.722213291221036, + "grad_norm": 0.3845515847206116, + "learning_rate": 1.8891249058092609e-06, + "loss": 0.6648, + "step": 3580 + }, + { + "epoch": 0.722415026777243, + "grad_norm": 0.5072640776634216, + "learning_rate": 1.8865673382518146e-06, + "loss": 0.6789, + "step": 3581 + }, + { + "epoch": 0.72261676233345, + "grad_norm": 0.8547187447547913, + "learning_rate": 1.8840111005085598e-06, + "loss": 0.6483, + "step": 3582 + }, + { + "epoch": 0.7228184978896569, + "grad_norm": 0.5213097929954529, + "learning_rate": 1.8814561936713195e-06, + "loss": 0.6461, + "step": 3583 + }, + { + "epoch": 0.723020233445864, + "grad_norm": 3.7029454708099365, + "learning_rate": 1.878902618831347e-06, + "loss": 0.6881, + "step": 3584 + }, + { + "epoch": 0.7232219690020709, + "grad_norm": 0.5109856128692627, + "learning_rate": 1.8763503770793323e-06, + "loss": 0.6796, + "step": 3585 + }, + { + "epoch": 0.723423704558278, + "grad_norm": 0.512251079082489, + "learning_rate": 1.8737994695053924e-06, + "loss": 0.6862, + "step": 3586 + }, + { + "epoch": 0.7236254401144849, + "grad_norm": 0.5038248896598816, + "learning_rate": 1.8712498971990723e-06, + "loss": 0.6449, + "step": 3587 + }, + { + "epoch": 0.7238271756706919, + "grad_norm": 0.7192299365997314, + "learning_rate": 1.8687016612493542e-06, + "loss": 0.6209, + "step": 3588 + }, + { + "epoch": 0.7240289112268989, + "grad_norm": 0.8483881950378418, + "learning_rate": 1.8661547627446386e-06, + "loss": 0.6305, + "step": 3589 + }, + { + "epoch": 0.7242306467831059, + "grad_norm": 0.4165220260620117, + "learning_rate": 1.8636092027727653e-06, + "loss": 0.6671, + "step": 3590 + }, + { + "epoch": 0.724432382339313, + "grad_norm": 0.505834698677063, + "learning_rate": 1.8610649824209958e-06, + "loss": 0.6653, + "step": 3591 + }, + { + "epoch": 0.7246341178955199, + "grad_norm": 0.44209980964660645, + "learning_rate": 1.8585221027760209e-06, + "loss": 0.6618, + "step": 3592 + }, + { + "epoch": 0.7248358534517269, + "grad_norm": 1.2882723808288574, + "learning_rate": 1.8559805649239614e-06, + "loss": 0.7678, + "step": 3593 + }, + { + "epoch": 0.7250375890079339, + "grad_norm": 0.33671534061431885, + "learning_rate": 1.8534403699503622e-06, + "loss": 0.8213, + "step": 3594 + }, + { + "epoch": 0.7252393245641409, + "grad_norm": 0.6320860981941223, + "learning_rate": 1.850901518940193e-06, + "loss": 0.6464, + "step": 3595 + }, + { + "epoch": 0.7254410601203478, + "grad_norm": 0.37541621923446655, + "learning_rate": 1.8483640129778575e-06, + "loss": 0.6329, + "step": 3596 + }, + { + "epoch": 0.7256427956765549, + "grad_norm": 0.3355324864387512, + "learning_rate": 1.8458278531471712e-06, + "loss": 0.7638, + "step": 3597 + }, + { + "epoch": 0.7258445312327618, + "grad_norm": 0.6320685744285583, + "learning_rate": 1.8432930405313871e-06, + "loss": 0.7502, + "step": 3598 + }, + { + "epoch": 0.7260462667889689, + "grad_norm": 0.5309786200523376, + "learning_rate": 1.8407595762131814e-06, + "loss": 0.6699, + "step": 3599 + }, + { + "epoch": 0.7262480023451758, + "grad_norm": 0.4337455630302429, + "learning_rate": 1.8382274612746447e-06, + "loss": 0.654, + "step": 3600 + }, + { + "epoch": 0.7264497379013828, + "grad_norm": 0.6220811605453491, + "learning_rate": 1.8356966967973027e-06, + "loss": 0.6629, + "step": 3601 + }, + { + "epoch": 0.7266514734575898, + "grad_norm": 0.6550498008728027, + "learning_rate": 1.833167283862098e-06, + "loss": 0.6817, + "step": 3602 + }, + { + "epoch": 0.7268532090137968, + "grad_norm": 0.3714157044887543, + "learning_rate": 1.8306392235493946e-06, + "loss": 0.8233, + "step": 3603 + }, + { + "epoch": 0.7270549445700039, + "grad_norm": 0.41854748129844666, + "learning_rate": 1.8281125169389868e-06, + "loss": 0.6654, + "step": 3604 + }, + { + "epoch": 0.7272566801262108, + "grad_norm": 0.7979785203933716, + "learning_rate": 1.825587165110082e-06, + "loss": 0.6649, + "step": 3605 + }, + { + "epoch": 0.7274584156824178, + "grad_norm": 0.6475619077682495, + "learning_rate": 1.823063169141312e-06, + "loss": 0.8118, + "step": 3606 + }, + { + "epoch": 0.7276601512386248, + "grad_norm": 0.7294145226478577, + "learning_rate": 1.8205405301107343e-06, + "loss": 0.6871, + "step": 3607 + }, + { + "epoch": 0.7278618867948318, + "grad_norm": 0.3854321539402008, + "learning_rate": 1.818019249095816e-06, + "loss": 0.6471, + "step": 3608 + }, + { + "epoch": 0.7280636223510388, + "grad_norm": 0.8849390745162964, + "learning_rate": 1.815499327173455e-06, + "loss": 0.8196, + "step": 3609 + }, + { + "epoch": 0.7282653579072458, + "grad_norm": 1.1094517707824707, + "learning_rate": 1.8129807654199628e-06, + "loss": 0.6813, + "step": 3610 + }, + { + "epoch": 0.7284670934634527, + "grad_norm": 0.33865752816200256, + "learning_rate": 1.8104635649110702e-06, + "loss": 0.766, + "step": 3611 + }, + { + "epoch": 0.7286688290196598, + "grad_norm": 0.3571256995201111, + "learning_rate": 1.8079477267219308e-06, + "loss": 0.6486, + "step": 3612 + }, + { + "epoch": 0.7288705645758667, + "grad_norm": 0.7489669322967529, + "learning_rate": 1.8054332519271118e-06, + "loss": 0.6319, + "step": 3613 + }, + { + "epoch": 0.7290723001320737, + "grad_norm": 1.4024207592010498, + "learning_rate": 1.8029201416005976e-06, + "loss": 0.696, + "step": 3614 + }, + { + "epoch": 0.7292740356882808, + "grad_norm": 0.4047021269798279, + "learning_rate": 1.8004083968157953e-06, + "loss": 0.896, + "step": 3615 + }, + { + "epoch": 0.7294757712444877, + "grad_norm": 0.4339413642883301, + "learning_rate": 1.7978980186455236e-06, + "loss": 0.7205, + "step": 3616 + }, + { + "epoch": 0.7296775068006948, + "grad_norm": 0.3869302570819855, + "learning_rate": 1.7953890081620174e-06, + "loss": 0.7952, + "step": 3617 + }, + { + "epoch": 0.7298792423569017, + "grad_norm": 0.6974090933799744, + "learning_rate": 1.7928813664369339e-06, + "loss": 0.6494, + "step": 3618 + }, + { + "epoch": 0.7300809779131087, + "grad_norm": 0.6910544037818909, + "learning_rate": 1.790375094541335e-06, + "loss": 0.6995, + "step": 3619 + }, + { + "epoch": 0.7302827134693157, + "grad_norm": 0.5370467901229858, + "learning_rate": 1.7878701935457076e-06, + "loss": 0.7044, + "step": 3620 + }, + { + "epoch": 0.7304844490255227, + "grad_norm": 0.5953854322433472, + "learning_rate": 1.7853666645199474e-06, + "loss": 0.8002, + "step": 3621 + }, + { + "epoch": 0.7306861845817297, + "grad_norm": 0.4186118543148041, + "learning_rate": 1.7828645085333645e-06, + "loss": 0.7228, + "step": 3622 + }, + { + "epoch": 0.7308879201379367, + "grad_norm": 0.38932132720947266, + "learning_rate": 1.7803637266546864e-06, + "loss": 0.7806, + "step": 3623 + }, + { + "epoch": 0.7310896556941436, + "grad_norm": 0.6305902600288391, + "learning_rate": 1.7778643199520496e-06, + "loss": 0.6624, + "step": 3624 + }, + { + "epoch": 0.7312913912503507, + "grad_norm": 0.37308868765830994, + "learning_rate": 1.775366289493003e-06, + "loss": 0.6912, + "step": 3625 + }, + { + "epoch": 0.7314931268065576, + "grad_norm": 0.5473910570144653, + "learning_rate": 1.772869636344512e-06, + "loss": 0.6916, + "step": 3626 + }, + { + "epoch": 0.7316948623627647, + "grad_norm": 0.8070292472839355, + "learning_rate": 1.7703743615729501e-06, + "loss": 0.7116, + "step": 3627 + }, + { + "epoch": 0.7318965979189717, + "grad_norm": 0.7206965684890747, + "learning_rate": 1.7678804662441019e-06, + "loss": 0.6904, + "step": 3628 + }, + { + "epoch": 0.7320983334751786, + "grad_norm": 0.31374138593673706, + "learning_rate": 1.7653879514231631e-06, + "loss": 0.7582, + "step": 3629 + }, + { + "epoch": 0.7323000690313857, + "grad_norm": 0.5226505994796753, + "learning_rate": 1.7628968181747435e-06, + "loss": 0.656, + "step": 3630 + }, + { + "epoch": 0.7325018045875926, + "grad_norm": 0.6887264847755432, + "learning_rate": 1.760407067562858e-06, + "loss": 0.8081, + "step": 3631 + }, + { + "epoch": 0.7327035401437997, + "grad_norm": 0.4167690575122833, + "learning_rate": 1.757918700650933e-06, + "loss": 1.3488, + "step": 3632 + }, + { + "epoch": 0.7329052757000066, + "grad_norm": 0.3777919113636017, + "learning_rate": 1.7554317185018016e-06, + "loss": 0.737, + "step": 3633 + }, + { + "epoch": 0.7331070112562136, + "grad_norm": 0.3365021347999573, + "learning_rate": 1.7529461221777117e-06, + "loss": 0.6321, + "step": 3634 + }, + { + "epoch": 0.7333087468124206, + "grad_norm": 0.5992380380630493, + "learning_rate": 1.7504619127403122e-06, + "loss": 0.7048, + "step": 3635 + }, + { + "epoch": 0.7335104823686276, + "grad_norm": 0.5630524754524231, + "learning_rate": 1.7479790912506628e-06, + "loss": 0.6618, + "step": 3636 + }, + { + "epoch": 0.7337122179248345, + "grad_norm": 0.8216602206230164, + "learning_rate": 1.745497658769229e-06, + "loss": 0.6668, + "step": 3637 + }, + { + "epoch": 0.7339139534810416, + "grad_norm": 0.6757683157920837, + "learning_rate": 1.743017616355887e-06, + "loss": 0.6826, + "step": 3638 + }, + { + "epoch": 0.7341156890372486, + "grad_norm": 0.45060646533966064, + "learning_rate": 1.740538965069915e-06, + "loss": 0.7319, + "step": 3639 + }, + { + "epoch": 0.7343174245934556, + "grad_norm": 0.4027143716812134, + "learning_rate": 1.7380617059699961e-06, + "loss": 0.816, + "step": 3640 + }, + { + "epoch": 0.7345191601496626, + "grad_norm": 0.6248337626457214, + "learning_rate": 1.735585840114225e-06, + "loss": 0.6808, + "step": 3641 + }, + { + "epoch": 0.7347208957058695, + "grad_norm": 0.3696412146091461, + "learning_rate": 1.7331113685600954e-06, + "loss": 0.6959, + "step": 3642 + }, + { + "epoch": 0.7349226312620766, + "grad_norm": 0.6471091508865356, + "learning_rate": 1.7306382923645054e-06, + "loss": 0.6513, + "step": 3643 + }, + { + "epoch": 0.7351243668182835, + "grad_norm": 0.31260403990745544, + "learning_rate": 1.7281666125837637e-06, + "loss": 0.6608, + "step": 3644 + }, + { + "epoch": 0.7353261023744906, + "grad_norm": 0.45326605439186096, + "learning_rate": 1.7256963302735752e-06, + "loss": 0.7701, + "step": 3645 + }, + { + "epoch": 0.7355278379306975, + "grad_norm": 0.39521533250808716, + "learning_rate": 1.7232274464890509e-06, + "loss": 0.6616, + "step": 3646 + }, + { + "epoch": 0.7357295734869045, + "grad_norm": 0.5993699431419373, + "learning_rate": 1.7207599622847042e-06, + "loss": 0.7022, + "step": 3647 + }, + { + "epoch": 0.7359313090431115, + "grad_norm": 2.394320487976074, + "learning_rate": 1.7182938787144498e-06, + "loss": 0.6558, + "step": 3648 + }, + { + "epoch": 0.7361330445993185, + "grad_norm": 0.6650750041007996, + "learning_rate": 1.7158291968316076e-06, + "loss": 0.8178, + "step": 3649 + }, + { + "epoch": 0.7363347801555256, + "grad_norm": 0.7132764458656311, + "learning_rate": 1.7133659176888956e-06, + "loss": 0.6896, + "step": 3650 + }, + { + "epoch": 0.7365365157117325, + "grad_norm": 0.5961502194404602, + "learning_rate": 1.710904042338431e-06, + "loss": 0.6361, + "step": 3651 + }, + { + "epoch": 0.7367382512679395, + "grad_norm": 0.3709481358528137, + "learning_rate": 1.7084435718317372e-06, + "loss": 0.7271, + "step": 3652 + }, + { + "epoch": 0.7369399868241465, + "grad_norm": 1.152391791343689, + "learning_rate": 1.705984507219733e-06, + "loss": 0.6716, + "step": 3653 + }, + { + "epoch": 0.7371417223803535, + "grad_norm": 0.5584093332290649, + "learning_rate": 1.7035268495527358e-06, + "loss": 0.7522, + "step": 3654 + }, + { + "epoch": 0.7373434579365604, + "grad_norm": 0.4153057932853699, + "learning_rate": 1.7010705998804694e-06, + "loss": 0.6646, + "step": 3655 + }, + { + "epoch": 0.7375451934927675, + "grad_norm": 0.29575854539871216, + "learning_rate": 1.6986157592520442e-06, + "loss": 0.7195, + "step": 3656 + }, + { + "epoch": 0.7377469290489744, + "grad_norm": 0.6873073577880859, + "learning_rate": 1.6961623287159784e-06, + "loss": 0.6508, + "step": 3657 + }, + { + "epoch": 0.7379486646051815, + "grad_norm": 0.5382766127586365, + "learning_rate": 1.6937103093201895e-06, + "loss": 0.7678, + "step": 3658 + }, + { + "epoch": 0.7381504001613884, + "grad_norm": 0.8277750015258789, + "learning_rate": 1.6912597021119802e-06, + "loss": 0.7059, + "step": 3659 + }, + { + "epoch": 0.7383521357175954, + "grad_norm": 1.0506656169891357, + "learning_rate": 1.6888105081380628e-06, + "loss": 0.6723, + "step": 3660 + }, + { + "epoch": 0.7385538712738025, + "grad_norm": 0.4117637276649475, + "learning_rate": 1.68636272844454e-06, + "loss": 0.6578, + "step": 3661 + }, + { + "epoch": 0.7387556068300094, + "grad_norm": 0.8198413252830505, + "learning_rate": 1.6839163640769084e-06, + "loss": 0.6617, + "step": 3662 + }, + { + "epoch": 0.7389573423862165, + "grad_norm": 0.3305530250072479, + "learning_rate": 1.6814714160800683e-06, + "loss": 1.0195, + "step": 3663 + }, + { + "epoch": 0.7391590779424234, + "grad_norm": 0.41136759519577026, + "learning_rate": 1.6790278854983033e-06, + "loss": 0.6396, + "step": 3664 + }, + { + "epoch": 0.7393608134986304, + "grad_norm": 0.9290247559547424, + "learning_rate": 1.6765857733753016e-06, + "loss": 0.7637, + "step": 3665 + }, + { + "epoch": 0.7395625490548374, + "grad_norm": 0.8298892378807068, + "learning_rate": 1.6741450807541448e-06, + "loss": 0.7454, + "step": 3666 + }, + { + "epoch": 0.7397642846110444, + "grad_norm": 0.7271751165390015, + "learning_rate": 1.671705808677298e-06, + "loss": 0.6321, + "step": 3667 + }, + { + "epoch": 0.7399660201672514, + "grad_norm": 0.4080258011817932, + "learning_rate": 1.6692679581866334e-06, + "loss": 0.6645, + "step": 3668 + }, + { + "epoch": 0.7401677557234584, + "grad_norm": 0.4282958507537842, + "learning_rate": 1.6668315303234068e-06, + "loss": 0.7104, + "step": 3669 + }, + { + "epoch": 0.7403694912796653, + "grad_norm": 0.7578421235084534, + "learning_rate": 1.6643965261282675e-06, + "loss": 0.6384, + "step": 3670 + }, + { + "epoch": 0.7405712268358724, + "grad_norm": 1.0067808628082275, + "learning_rate": 1.6619629466412613e-06, + "loss": 0.8314, + "step": 3671 + }, + { + "epoch": 0.7407729623920793, + "grad_norm": 0.4014785885810852, + "learning_rate": 1.6595307929018216e-06, + "loss": 0.6424, + "step": 3672 + }, + { + "epoch": 0.7409746979482863, + "grad_norm": 0.5424908995628357, + "learning_rate": 1.6571000659487719e-06, + "loss": 0.8547, + "step": 3673 + }, + { + "epoch": 0.7411764335044934, + "grad_norm": 0.32863613963127136, + "learning_rate": 1.6546707668203322e-06, + "loss": 0.6912, + "step": 3674 + }, + { + "epoch": 0.7413781690607003, + "grad_norm": 0.7613126635551453, + "learning_rate": 1.652242896554102e-06, + "loss": 0.7847, + "step": 3675 + }, + { + "epoch": 0.7415799046169074, + "grad_norm": 0.38640883564949036, + "learning_rate": 1.6498164561870834e-06, + "loss": 0.7502, + "step": 3676 + }, + { + "epoch": 0.7417816401731143, + "grad_norm": 0.5415723323822021, + "learning_rate": 1.6473914467556578e-06, + "loss": 0.6831, + "step": 3677 + }, + { + "epoch": 0.7419833757293213, + "grad_norm": 0.5771734714508057, + "learning_rate": 1.644967869295599e-06, + "loss": 0.6915, + "step": 3678 + }, + { + "epoch": 0.7421851112855283, + "grad_norm": 0.30845147371292114, + "learning_rate": 1.6425457248420712e-06, + "loss": 0.6984, + "step": 3679 + }, + { + "epoch": 0.7423868468417353, + "grad_norm": 0.2992367148399353, + "learning_rate": 1.6401250144296239e-06, + "loss": 0.7463, + "step": 3680 + }, + { + "epoch": 0.7425885823979423, + "grad_norm": 0.38862472772598267, + "learning_rate": 1.6377057390921919e-06, + "loss": 0.691, + "step": 3681 + }, + { + "epoch": 0.7427903179541493, + "grad_norm": 0.6588892340660095, + "learning_rate": 1.6352878998631044e-06, + "loss": 0.6742, + "step": 3682 + }, + { + "epoch": 0.7429920535103562, + "grad_norm": 0.35232171416282654, + "learning_rate": 1.6328714977750698e-06, + "loss": 0.6368, + "step": 3683 + }, + { + "epoch": 0.7431937890665633, + "grad_norm": 0.7427819967269897, + "learning_rate": 1.6304565338601864e-06, + "loss": 0.7213, + "step": 3684 + }, + { + "epoch": 0.7433955246227703, + "grad_norm": 0.35415828227996826, + "learning_rate": 1.628043009149935e-06, + "loss": 0.791, + "step": 3685 + }, + { + "epoch": 0.7435972601789773, + "grad_norm": 0.4176258444786072, + "learning_rate": 1.6256309246751879e-06, + "loss": 0.6655, + "step": 3686 + }, + { + "epoch": 0.7437989957351843, + "grad_norm": 0.45703744888305664, + "learning_rate": 1.6232202814661963e-06, + "loss": 0.7076, + "step": 3687 + }, + { + "epoch": 0.7440007312913912, + "grad_norm": 0.5029721260070801, + "learning_rate": 1.6208110805525983e-06, + "loss": 0.8178, + "step": 3688 + }, + { + "epoch": 0.7442024668475983, + "grad_norm": 0.647709310054779, + "learning_rate": 1.6184033229634134e-06, + "loss": 0.6565, + "step": 3689 + }, + { + "epoch": 0.7444042024038052, + "grad_norm": 0.6276201009750366, + "learning_rate": 1.61599700972705e-06, + "loss": 0.6759, + "step": 3690 + }, + { + "epoch": 0.7446059379600122, + "grad_norm": 0.6077056527137756, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.666, + "step": 3691 + }, + { + "epoch": 0.7448076735162192, + "grad_norm": 0.5409219264984131, + "learning_rate": 1.6111887204233184e-06, + "loss": 0.7746, + "step": 3692 + }, + { + "epoch": 0.7450094090724262, + "grad_norm": 0.38843217492103577, + "learning_rate": 1.608786746409675e-06, + "loss": 0.7357, + "step": 3693 + }, + { + "epoch": 0.7452111446286332, + "grad_norm": 0.5379346013069153, + "learning_rate": 1.606386220856299e-06, + "loss": 0.6834, + "step": 3694 + }, + { + "epoch": 0.7454128801848402, + "grad_norm": 0.7650083899497986, + "learning_rate": 1.603987144788507e-06, + "loss": 0.8181, + "step": 3695 + }, + { + "epoch": 0.7456146157410471, + "grad_norm": 0.48006683588027954, + "learning_rate": 1.6015895192309933e-06, + "loss": 0.7082, + "step": 3696 + }, + { + "epoch": 0.7458163512972542, + "grad_norm": 0.42755845189094543, + "learning_rate": 1.5991933452078396e-06, + "loss": 0.6766, + "step": 3697 + }, + { + "epoch": 0.7460180868534612, + "grad_norm": 0.4909815192222595, + "learning_rate": 1.596798623742501e-06, + "loss": 0.7053, + "step": 3698 + }, + { + "epoch": 0.7462198224096682, + "grad_norm": 0.4889410138130188, + "learning_rate": 1.5944053558578144e-06, + "loss": 0.6501, + "step": 3699 + }, + { + "epoch": 0.7464215579658752, + "grad_norm": 0.6778377890586853, + "learning_rate": 1.5920135425759974e-06, + "loss": 0.7152, + "step": 3700 + }, + { + "epoch": 0.7466232935220821, + "grad_norm": 2.5792107582092285, + "learning_rate": 1.5896231849186456e-06, + "loss": 0.7679, + "step": 3701 + }, + { + "epoch": 0.7468250290782892, + "grad_norm": 0.7936316132545471, + "learning_rate": 1.5872342839067305e-06, + "loss": 0.6517, + "step": 3702 + }, + { + "epoch": 0.7470267646344961, + "grad_norm": 0.5701306462287903, + "learning_rate": 1.5848468405606038e-06, + "loss": 0.7991, + "step": 3703 + }, + { + "epoch": 0.7472285001907032, + "grad_norm": 0.4055497646331787, + "learning_rate": 1.5824608558999927e-06, + "loss": 0.6636, + "step": 3704 + }, + { + "epoch": 0.7474302357469101, + "grad_norm": 0.34075456857681274, + "learning_rate": 1.5800763309440053e-06, + "loss": 0.8077, + "step": 3705 + }, + { + "epoch": 0.7476319713031171, + "grad_norm": 0.7685133218765259, + "learning_rate": 1.5776932667111228e-06, + "loss": 0.6494, + "step": 3706 + }, + { + "epoch": 0.7478337068593242, + "grad_norm": 0.39524295926094055, + "learning_rate": 1.5753116642192013e-06, + "loss": 1.0224, + "step": 3707 + }, + { + "epoch": 0.7480354424155311, + "grad_norm": 0.4503965377807617, + "learning_rate": 1.572931524485477e-06, + "loss": 0.6458, + "step": 3708 + }, + { + "epoch": 0.748237177971738, + "grad_norm": 0.48717793822288513, + "learning_rate": 1.5705528485265586e-06, + "loss": 0.6641, + "step": 3709 + }, + { + "epoch": 0.7484389135279451, + "grad_norm": 0.3593592941761017, + "learning_rate": 1.5681756373584272e-06, + "loss": 0.9094, + "step": 3710 + }, + { + "epoch": 0.7486406490841521, + "grad_norm": 0.5503544211387634, + "learning_rate": 1.5657998919964462e-06, + "loss": 0.6513, + "step": 3711 + }, + { + "epoch": 0.7488423846403591, + "grad_norm": 0.45195677876472473, + "learning_rate": 1.5634256134553416e-06, + "loss": 0.6276, + "step": 3712 + }, + { + "epoch": 0.7490441201965661, + "grad_norm": 0.4969177842140198, + "learning_rate": 1.561052802749221e-06, + "loss": 0.7005, + "step": 3713 + }, + { + "epoch": 0.749245855752773, + "grad_norm": 0.6967068314552307, + "learning_rate": 1.5586814608915673e-06, + "loss": 0.6904, + "step": 3714 + }, + { + "epoch": 0.7494475913089801, + "grad_norm": 0.42649900913238525, + "learning_rate": 1.5563115888952252e-06, + "loss": 0.8253, + "step": 3715 + }, + { + "epoch": 0.749649326865187, + "grad_norm": 0.5606639981269836, + "learning_rate": 1.553943187772422e-06, + "loss": 0.8229, + "step": 3716 + }, + { + "epoch": 0.7498510624213941, + "grad_norm": 0.4556725025177002, + "learning_rate": 1.5515762585347526e-06, + "loss": 0.6659, + "step": 3717 + }, + { + "epoch": 0.750052797977601, + "grad_norm": 0.3770776391029358, + "learning_rate": 1.5492108021931806e-06, + "loss": 0.7185, + "step": 3718 + }, + { + "epoch": 0.750254533533808, + "grad_norm": 1.0561859607696533, + "learning_rate": 1.5468468197580478e-06, + "loss": 0.7136, + "step": 3719 + }, + { + "epoch": 0.7504562690900151, + "grad_norm": 0.6821918487548828, + "learning_rate": 1.544484312239059e-06, + "loss": 0.6605, + "step": 3720 + }, + { + "epoch": 0.750658004646222, + "grad_norm": 0.5854676365852356, + "learning_rate": 1.542123280645292e-06, + "loss": 0.6434, + "step": 3721 + }, + { + "epoch": 0.7508597402024291, + "grad_norm": 0.6247336864471436, + "learning_rate": 1.5397637259851977e-06, + "loss": 0.6861, + "step": 3722 + }, + { + "epoch": 0.751061475758636, + "grad_norm": 0.3600952625274658, + "learning_rate": 1.5374056492665879e-06, + "loss": 0.7089, + "step": 3723 + }, + { + "epoch": 0.751263211314843, + "grad_norm": 0.34228378534317017, + "learning_rate": 1.5350490514966509e-06, + "loss": 0.7864, + "step": 3724 + }, + { + "epoch": 0.75146494687105, + "grad_norm": 0.7408826947212219, + "learning_rate": 1.5326939336819408e-06, + "loss": 0.6609, + "step": 3725 + }, + { + "epoch": 0.751666682427257, + "grad_norm": 0.31936943531036377, + "learning_rate": 1.5303402968283758e-06, + "loss": 0.6578, + "step": 3726 + }, + { + "epoch": 0.751868417983464, + "grad_norm": 0.4063432216644287, + "learning_rate": 1.527988141941249e-06, + "loss": 0.6571, + "step": 3727 + }, + { + "epoch": 0.752070153539671, + "grad_norm": 0.3507099449634552, + "learning_rate": 1.5256374700252151e-06, + "loss": 0.6911, + "step": 3728 + }, + { + "epoch": 0.7522718890958779, + "grad_norm": 0.4853664040565491, + "learning_rate": 1.5232882820842948e-06, + "loss": 0.6991, + "step": 3729 + }, + { + "epoch": 0.752473624652085, + "grad_norm": 0.3933629095554352, + "learning_rate": 1.520940579121881e-06, + "loss": 0.6584, + "step": 3730 + }, + { + "epoch": 0.752675360208292, + "grad_norm": 0.37084707617759705, + "learning_rate": 1.5185943621407233e-06, + "loss": 0.655, + "step": 3731 + }, + { + "epoch": 0.7528770957644989, + "grad_norm": 0.3501735329627991, + "learning_rate": 1.5162496321429438e-06, + "loss": 0.6804, + "step": 3732 + }, + { + "epoch": 0.753078831320706, + "grad_norm": 0.5413325428962708, + "learning_rate": 1.5139063901300298e-06, + "loss": 0.6807, + "step": 3733 + }, + { + "epoch": 0.7532805668769129, + "grad_norm": 0.3382270038127899, + "learning_rate": 1.5115646371028258e-06, + "loss": 0.6519, + "step": 3734 + }, + { + "epoch": 0.75348230243312, + "grad_norm": 0.37832415103912354, + "learning_rate": 1.5092243740615486e-06, + "loss": 0.6879, + "step": 3735 + }, + { + "epoch": 0.7536840379893269, + "grad_norm": 0.4899078905582428, + "learning_rate": 1.5068856020057732e-06, + "loss": 0.6914, + "step": 3736 + }, + { + "epoch": 0.7538857735455339, + "grad_norm": 0.786450207233429, + "learning_rate": 1.5045483219344387e-06, + "loss": 0.6993, + "step": 3737 + }, + { + "epoch": 0.7540875091017409, + "grad_norm": 0.6078961491584778, + "learning_rate": 1.5022125348458504e-06, + "loss": 0.6498, + "step": 3738 + }, + { + "epoch": 0.7542892446579479, + "grad_norm": 0.39420047402381897, + "learning_rate": 1.4998782417376723e-06, + "loss": 0.6692, + "step": 3739 + }, + { + "epoch": 0.754490980214155, + "grad_norm": 0.7297521233558655, + "learning_rate": 1.4975454436069292e-06, + "loss": 0.6502, + "step": 3740 + }, + { + "epoch": 0.7546927157703619, + "grad_norm": 0.4037453830242157, + "learning_rate": 1.4952141414500143e-06, + "loss": 0.6704, + "step": 3741 + }, + { + "epoch": 0.7548944513265688, + "grad_norm": 0.5245139002799988, + "learning_rate": 1.4928843362626705e-06, + "loss": 0.6671, + "step": 3742 + }, + { + "epoch": 0.7550961868827759, + "grad_norm": 0.5826756954193115, + "learning_rate": 1.4905560290400128e-06, + "loss": 0.642, + "step": 3743 + }, + { + "epoch": 0.7552979224389829, + "grad_norm": 0.3041967749595642, + "learning_rate": 1.4882292207765104e-06, + "loss": 0.7224, + "step": 3744 + }, + { + "epoch": 0.7554996579951899, + "grad_norm": 0.31114375591278076, + "learning_rate": 1.4859039124659908e-06, + "loss": 0.7237, + "step": 3745 + }, + { + "epoch": 0.7557013935513969, + "grad_norm": 0.8395849466323853, + "learning_rate": 1.4835801051016463e-06, + "loss": 0.6635, + "step": 3746 + }, + { + "epoch": 0.7559031291076038, + "grad_norm": 0.3680986166000366, + "learning_rate": 1.4812577996760242e-06, + "loss": 0.8131, + "step": 3747 + }, + { + "epoch": 0.7561048646638109, + "grad_norm": 1.124172329902649, + "learning_rate": 1.4789369971810298e-06, + "loss": 0.6186, + "step": 3748 + }, + { + "epoch": 0.7563066002200178, + "grad_norm": 0.3392429053783417, + "learning_rate": 1.47661769860793e-06, + "loss": 0.6952, + "step": 3749 + }, + { + "epoch": 0.7565083357762248, + "grad_norm": 0.37163037061691284, + "learning_rate": 1.474299904947346e-06, + "loss": 0.7324, + "step": 3750 + }, + { + "epoch": 0.7567100713324318, + "grad_norm": 0.3731152415275574, + "learning_rate": 1.471983617189258e-06, + "loss": 0.6832, + "step": 3751 + }, + { + "epoch": 0.7569118068886388, + "grad_norm": 1.5635799169540405, + "learning_rate": 1.469668836323001e-06, + "loss": 0.6204, + "step": 3752 + }, + { + "epoch": 0.7571135424448459, + "grad_norm": 0.32818305492401123, + "learning_rate": 1.4673555633372699e-06, + "loss": 0.6593, + "step": 3753 + }, + { + "epoch": 0.7573152780010528, + "grad_norm": 0.5359674692153931, + "learning_rate": 1.4650437992201122e-06, + "loss": 0.6488, + "step": 3754 + }, + { + "epoch": 0.7575170135572598, + "grad_norm": 0.39580339193344116, + "learning_rate": 1.4627335449589331e-06, + "loss": 0.665, + "step": 3755 + }, + { + "epoch": 0.7577187491134668, + "grad_norm": 0.3998722732067108, + "learning_rate": 1.4604248015404886e-06, + "loss": 0.6748, + "step": 3756 + }, + { + "epoch": 0.7579204846696738, + "grad_norm": 0.3832513093948364, + "learning_rate": 1.4581175699508982e-06, + "loss": 0.6579, + "step": 3757 + }, + { + "epoch": 0.7581222202258808, + "grad_norm": 0.7050317525863647, + "learning_rate": 1.455811851175627e-06, + "loss": 0.6625, + "step": 3758 + }, + { + "epoch": 0.7583239557820878, + "grad_norm": 0.37937480211257935, + "learning_rate": 1.4535076461994974e-06, + "loss": 0.6636, + "step": 3759 + }, + { + "epoch": 0.7585256913382947, + "grad_norm": 0.7320890426635742, + "learning_rate": 1.4512049560066837e-06, + "loss": 0.652, + "step": 3760 + }, + { + "epoch": 0.7587274268945018, + "grad_norm": 0.39705395698547363, + "learning_rate": 1.4489037815807178e-06, + "loss": 0.6796, + "step": 3761 + }, + { + "epoch": 0.7589291624507087, + "grad_norm": 0.48778223991394043, + "learning_rate": 1.4466041239044792e-06, + "loss": 0.7062, + "step": 3762 + }, + { + "epoch": 0.7591308980069158, + "grad_norm": 0.40511614084243774, + "learning_rate": 1.4443059839601998e-06, + "loss": 0.6851, + "step": 3763 + }, + { + "epoch": 0.7593326335631227, + "grad_norm": 0.40412962436676025, + "learning_rate": 1.4420093627294673e-06, + "loss": 0.824, + "step": 3764 + }, + { + "epoch": 0.7595343691193297, + "grad_norm": 0.6214047074317932, + "learning_rate": 1.4397142611932174e-06, + "loss": 0.6574, + "step": 3765 + }, + { + "epoch": 0.7597361046755368, + "grad_norm": 0.7142421007156372, + "learning_rate": 1.4374206803317354e-06, + "loss": 0.6656, + "step": 3766 + }, + { + "epoch": 0.7599378402317437, + "grad_norm": 0.49294382333755493, + "learning_rate": 1.4351286211246618e-06, + "loss": 0.7274, + "step": 3767 + }, + { + "epoch": 0.7601395757879507, + "grad_norm": 0.7711848616600037, + "learning_rate": 1.4328380845509837e-06, + "loss": 0.6402, + "step": 3768 + }, + { + "epoch": 0.7603413113441577, + "grad_norm": 3.8337807655334473, + "learning_rate": 1.430549071589038e-06, + "loss": 0.6335, + "step": 3769 + }, + { + "epoch": 0.7605430469003647, + "grad_norm": 0.3326573669910431, + "learning_rate": 1.428261583216512e-06, + "loss": 0.7242, + "step": 3770 + }, + { + "epoch": 0.7607447824565717, + "grad_norm": 0.4140056073665619, + "learning_rate": 1.4259756204104396e-06, + "loss": 0.6446, + "step": 3771 + }, + { + "epoch": 0.7609465180127787, + "grad_norm": 0.42758747935295105, + "learning_rate": 1.4236911841472074e-06, + "loss": 0.6707, + "step": 3772 + }, + { + "epoch": 0.7611482535689856, + "grad_norm": 0.5065056085586548, + "learning_rate": 1.4214082754025466e-06, + "loss": 0.6566, + "step": 3773 + }, + { + "epoch": 0.7613499891251927, + "grad_norm": 0.48952701687812805, + "learning_rate": 1.4191268951515348e-06, + "loss": 0.6657, + "step": 3774 + }, + { + "epoch": 0.7615517246813996, + "grad_norm": 1.1300623416900635, + "learning_rate": 1.4168470443686017e-06, + "loss": 0.6557, + "step": 3775 + }, + { + "epoch": 0.7617534602376067, + "grad_norm": 0.532576322555542, + "learning_rate": 1.414568724027519e-06, + "loss": 0.6368, + "step": 3776 + }, + { + "epoch": 0.7619551957938137, + "grad_norm": 0.5727057456970215, + "learning_rate": 1.4122919351014052e-06, + "loss": 0.6731, + "step": 3777 + }, + { + "epoch": 0.7621569313500206, + "grad_norm": 1.5019340515136719, + "learning_rate": 1.4100166785627301e-06, + "loss": 0.6507, + "step": 3778 + }, + { + "epoch": 0.7623586669062277, + "grad_norm": 0.39014142751693726, + "learning_rate": 1.4077429553832995e-06, + "loss": 0.7623, + "step": 3779 + }, + { + "epoch": 0.7625604024624346, + "grad_norm": 0.36659300327301025, + "learning_rate": 1.4054707665342721e-06, + "loss": 0.733, + "step": 3780 + }, + { + "epoch": 0.7627621380186417, + "grad_norm": 0.48996803164482117, + "learning_rate": 1.403200112986151e-06, + "loss": 0.6413, + "step": 3781 + }, + { + "epoch": 0.7629638735748486, + "grad_norm": 0.33237341046333313, + "learning_rate": 1.400930995708777e-06, + "loss": 0.6667, + "step": 3782 + }, + { + "epoch": 0.7631656091310556, + "grad_norm": 0.3961053192615509, + "learning_rate": 1.3986634156713418e-06, + "loss": 0.6613, + "step": 3783 + }, + { + "epoch": 0.7633673446872626, + "grad_norm": 0.44538700580596924, + "learning_rate": 1.3963973738423774e-06, + "loss": 0.8809, + "step": 3784 + }, + { + "epoch": 0.7635690802434696, + "grad_norm": 0.5624479055404663, + "learning_rate": 1.3941328711897568e-06, + "loss": 0.6743, + "step": 3785 + }, + { + "epoch": 0.7637708157996765, + "grad_norm": 0.349399596452713, + "learning_rate": 1.391869908680703e-06, + "loss": 0.6448, + "step": 3786 + }, + { + "epoch": 0.7639725513558836, + "grad_norm": 0.3919467329978943, + "learning_rate": 1.3896084872817695e-06, + "loss": 0.6094, + "step": 3787 + }, + { + "epoch": 0.7641742869120906, + "grad_norm": 0.45314642786979675, + "learning_rate": 1.3873486079588617e-06, + "loss": 0.8347, + "step": 3788 + }, + { + "epoch": 0.7643760224682976, + "grad_norm": 0.7018961906433105, + "learning_rate": 1.3850902716772251e-06, + "loss": 0.7033, + "step": 3789 + }, + { + "epoch": 0.7645777580245046, + "grad_norm": 0.3727651536464691, + "learning_rate": 1.382833479401438e-06, + "loss": 0.6793, + "step": 3790 + }, + { + "epoch": 0.7647794935807115, + "grad_norm": 0.34330493211746216, + "learning_rate": 1.3805782320954297e-06, + "loss": 0.8555, + "step": 3791 + }, + { + "epoch": 0.7649812291369186, + "grad_norm": 0.492082417011261, + "learning_rate": 1.3783245307224635e-06, + "loss": 0.7536, + "step": 3792 + }, + { + "epoch": 0.7651829646931255, + "grad_norm": 0.3740497827529907, + "learning_rate": 1.3760723762451428e-06, + "loss": 0.8049, + "step": 3793 + }, + { + "epoch": 0.7653847002493326, + "grad_norm": 0.3092051148414612, + "learning_rate": 1.373821769625413e-06, + "loss": 0.7887, + "step": 3794 + }, + { + "epoch": 0.7655864358055395, + "grad_norm": 0.3884614109992981, + "learning_rate": 1.3715727118245558e-06, + "loss": 0.6814, + "step": 3795 + }, + { + "epoch": 0.7657881713617465, + "grad_norm": 1.359918236732483, + "learning_rate": 1.3693252038031912e-06, + "loss": 0.6626, + "step": 3796 + }, + { + "epoch": 0.7659899069179535, + "grad_norm": 0.4398966133594513, + "learning_rate": 1.3670792465212828e-06, + "loss": 0.6604, + "step": 3797 + }, + { + "epoch": 0.7661916424741605, + "grad_norm": 0.7652633190155029, + "learning_rate": 1.3648348409381208e-06, + "loss": 0.675, + "step": 3798 + }, + { + "epoch": 0.7663933780303676, + "grad_norm": 0.8095738291740417, + "learning_rate": 1.3625919880123438e-06, + "loss": 0.6632, + "step": 3799 + }, + { + "epoch": 0.7665951135865745, + "grad_norm": 0.5652156472206116, + "learning_rate": 1.3603506887019214e-06, + "loss": 0.7734, + "step": 3800 + }, + { + "epoch": 0.7667968491427815, + "grad_norm": 0.45031893253326416, + "learning_rate": 1.3581109439641587e-06, + "loss": 0.6172, + "step": 3801 + }, + { + "epoch": 0.7669985846989885, + "grad_norm": 0.49738746881484985, + "learning_rate": 1.3558727547557032e-06, + "loss": 0.7687, + "step": 3802 + }, + { + "epoch": 0.7672003202551955, + "grad_norm": 3.2418699264526367, + "learning_rate": 1.3536361220325312e-06, + "loss": 0.6724, + "step": 3803 + }, + { + "epoch": 0.7674020558114024, + "grad_norm": 0.34520092606544495, + "learning_rate": 1.3514010467499556e-06, + "loss": 0.6646, + "step": 3804 + }, + { + "epoch": 0.7676037913676095, + "grad_norm": 0.5189719796180725, + "learning_rate": 1.3491675298626279e-06, + "loss": 0.8133, + "step": 3805 + }, + { + "epoch": 0.7678055269238164, + "grad_norm": 0.7521995902061462, + "learning_rate": 1.3469355723245303e-06, + "loss": 0.6499, + "step": 3806 + }, + { + "epoch": 0.7680072624800235, + "grad_norm": 0.6148568987846375, + "learning_rate": 1.3447051750889783e-06, + "loss": 1.0773, + "step": 3807 + }, + { + "epoch": 0.7682089980362304, + "grad_norm": 2.7657034397125244, + "learning_rate": 1.3424763391086253e-06, + "loss": 0.6823, + "step": 3808 + }, + { + "epoch": 0.7684107335924374, + "grad_norm": 0.35511335730552673, + "learning_rate": 1.3402490653354544e-06, + "loss": 0.7114, + "step": 3809 + }, + { + "epoch": 0.7686124691486445, + "grad_norm": 2.8116512298583984, + "learning_rate": 1.338023354720781e-06, + "loss": 0.8214, + "step": 3810 + }, + { + "epoch": 0.7688142047048514, + "grad_norm": 0.5073435306549072, + "learning_rate": 1.3357992082152555e-06, + "loss": 0.7818, + "step": 3811 + }, + { + "epoch": 0.7690159402610585, + "grad_norm": 0.4251355230808258, + "learning_rate": 1.3335766267688566e-06, + "loss": 0.6838, + "step": 3812 + }, + { + "epoch": 0.7692176758172654, + "grad_norm": 0.3123939633369446, + "learning_rate": 1.3313556113308994e-06, + "loss": 0.6698, + "step": 3813 + }, + { + "epoch": 0.7694194113734724, + "grad_norm": 0.44618505239486694, + "learning_rate": 1.3291361628500266e-06, + "loss": 0.7761, + "step": 3814 + }, + { + "epoch": 0.7696211469296794, + "grad_norm": 1.669847011566162, + "learning_rate": 1.326918282274211e-06, + "loss": 0.7316, + "step": 3815 + }, + { + "epoch": 0.7698228824858864, + "grad_norm": 0.8414665460586548, + "learning_rate": 1.3247019705507596e-06, + "loss": 0.7481, + "step": 3816 + }, + { + "epoch": 0.7700246180420934, + "grad_norm": 0.5543730854988098, + "learning_rate": 1.3224872286263058e-06, + "loss": 0.778, + "step": 3817 + }, + { + "epoch": 0.7702263535983004, + "grad_norm": 0.7356978058815002, + "learning_rate": 1.3202740574468132e-06, + "loss": 0.6415, + "step": 3818 + }, + { + "epoch": 0.7704280891545073, + "grad_norm": 0.37701356410980225, + "learning_rate": 1.3180624579575741e-06, + "loss": 0.6663, + "step": 3819 + }, + { + "epoch": 0.7706298247107144, + "grad_norm": 0.37720897793769836, + "learning_rate": 1.3158524311032128e-06, + "loss": 0.7926, + "step": 3820 + }, + { + "epoch": 0.7708315602669213, + "grad_norm": 0.6713565587997437, + "learning_rate": 1.3136439778276782e-06, + "loss": 0.6525, + "step": 3821 + }, + { + "epoch": 0.7710332958231284, + "grad_norm": 0.8381698131561279, + "learning_rate": 1.3114370990742465e-06, + "loss": 0.6224, + "step": 3822 + }, + { + "epoch": 0.7712350313793354, + "grad_norm": 0.3800964653491974, + "learning_rate": 1.309231795785526e-06, + "loss": 0.6969, + "step": 3823 + }, + { + "epoch": 0.7714367669355423, + "grad_norm": 0.544260561466217, + "learning_rate": 1.3070280689034486e-06, + "loss": 0.6555, + "step": 3824 + }, + { + "epoch": 0.7716385024917494, + "grad_norm": 0.3614006042480469, + "learning_rate": 1.304825919369273e-06, + "loss": 0.6744, + "step": 3825 + }, + { + "epoch": 0.7718402380479563, + "grad_norm": 0.4283572733402252, + "learning_rate": 1.3026253481235845e-06, + "loss": 0.6823, + "step": 3826 + }, + { + "epoch": 0.7720419736041633, + "grad_norm": 0.4926629960536957, + "learning_rate": 1.3004263561062935e-06, + "loss": 0.7654, + "step": 3827 + }, + { + "epoch": 0.7722437091603703, + "grad_norm": 0.5868778228759766, + "learning_rate": 1.2982289442566392e-06, + "loss": 0.6969, + "step": 3828 + }, + { + "epoch": 0.7724454447165773, + "grad_norm": 0.3782004117965698, + "learning_rate": 1.2960331135131826e-06, + "loss": 0.6601, + "step": 3829 + }, + { + "epoch": 0.7726471802727843, + "grad_norm": 0.5834161639213562, + "learning_rate": 1.2938388648138089e-06, + "loss": 0.6755, + "step": 3830 + }, + { + "epoch": 0.7728489158289913, + "grad_norm": 0.3004898726940155, + "learning_rate": 1.291646199095732e-06, + "loss": 0.7482, + "step": 3831 + }, + { + "epoch": 0.7730506513851982, + "grad_norm": 0.8956354856491089, + "learning_rate": 1.289455117295485e-06, + "loss": 0.661, + "step": 3832 + }, + { + "epoch": 0.7732523869414053, + "grad_norm": 0.34841060638427734, + "learning_rate": 1.2872656203489242e-06, + "loss": 0.6674, + "step": 3833 + }, + { + "epoch": 0.7734541224976123, + "grad_norm": 0.36636319756507874, + "learning_rate": 1.2850777091912364e-06, + "loss": 0.7774, + "step": 3834 + }, + { + "epoch": 0.7736558580538193, + "grad_norm": 0.3407168388366699, + "learning_rate": 1.2828913847569185e-06, + "loss": 0.6519, + "step": 3835 + }, + { + "epoch": 0.7738575936100263, + "grad_norm": 0.41359943151474, + "learning_rate": 1.2807066479798013e-06, + "loss": 0.7158, + "step": 3836 + }, + { + "epoch": 0.7740593291662332, + "grad_norm": 0.3872455656528473, + "learning_rate": 1.2785234997930345e-06, + "loss": 0.6615, + "step": 3837 + }, + { + "epoch": 0.7742610647224403, + "grad_norm": 0.4519384205341339, + "learning_rate": 1.2763419411290823e-06, + "loss": 0.6828, + "step": 3838 + }, + { + "epoch": 0.7744628002786472, + "grad_norm": 1.394004464149475, + "learning_rate": 1.2741619729197403e-06, + "loss": 0.812, + "step": 3839 + }, + { + "epoch": 0.7746645358348543, + "grad_norm": 0.38254714012145996, + "learning_rate": 1.2719835960961173e-06, + "loss": 0.6816, + "step": 3840 + }, + { + "epoch": 0.7748662713910612, + "grad_norm": 0.390522837638855, + "learning_rate": 1.2698068115886453e-06, + "loss": 0.6515, + "step": 3841 + }, + { + "epoch": 0.7750680069472682, + "grad_norm": 0.47609326243400574, + "learning_rate": 1.2676316203270766e-06, + "loss": 0.651, + "step": 3842 + }, + { + "epoch": 0.7752697425034752, + "grad_norm": 0.7987368106842041, + "learning_rate": 1.265458023240483e-06, + "loss": 0.6626, + "step": 3843 + }, + { + "epoch": 0.7754714780596822, + "grad_norm": 0.4103650450706482, + "learning_rate": 1.2632860212572518e-06, + "loss": 0.7753, + "step": 3844 + }, + { + "epoch": 0.7756732136158891, + "grad_norm": 1.0546495914459229, + "learning_rate": 1.2611156153050963e-06, + "loss": 0.6805, + "step": 3845 + }, + { + "epoch": 0.7758749491720962, + "grad_norm": 1.0925642251968384, + "learning_rate": 1.2589468063110382e-06, + "loss": 0.7166, + "step": 3846 + }, + { + "epoch": 0.7760766847283032, + "grad_norm": 0.4217831492424011, + "learning_rate": 1.2567795952014272e-06, + "loss": 0.6609, + "step": 3847 + }, + { + "epoch": 0.7762784202845102, + "grad_norm": 1.4595791101455688, + "learning_rate": 1.2546139829019238e-06, + "loss": 0.6633, + "step": 3848 + }, + { + "epoch": 0.7764801558407172, + "grad_norm": 0.33885642886161804, + "learning_rate": 1.2524499703375065e-06, + "loss": 0.6822, + "step": 3849 + }, + { + "epoch": 0.7766818913969241, + "grad_norm": 0.380485475063324, + "learning_rate": 1.2502875584324748e-06, + "loss": 0.6736, + "step": 3850 + }, + { + "epoch": 0.7768836269531312, + "grad_norm": 0.486627459526062, + "learning_rate": 1.2481267481104398e-06, + "loss": 0.6623, + "step": 3851 + }, + { + "epoch": 0.7770853625093381, + "grad_norm": 0.4864477515220642, + "learning_rate": 1.245967540294329e-06, + "loss": 0.6338, + "step": 3852 + }, + { + "epoch": 0.7772870980655452, + "grad_norm": 0.5462962985038757, + "learning_rate": 1.2438099359063898e-06, + "loss": 0.7502, + "step": 3853 + }, + { + "epoch": 0.7774888336217521, + "grad_norm": 0.4412570595741272, + "learning_rate": 1.2416539358681772e-06, + "loss": 0.6794, + "step": 3854 + }, + { + "epoch": 0.7776905691779591, + "grad_norm": 0.32791343331336975, + "learning_rate": 1.2394995411005672e-06, + "loss": 0.8837, + "step": 3855 + }, + { + "epoch": 0.7778923047341662, + "grad_norm": 0.6076446175575256, + "learning_rate": 1.237346752523752e-06, + "loss": 0.6854, + "step": 3856 + }, + { + "epoch": 0.7780940402903731, + "grad_norm": 0.3447825610637665, + "learning_rate": 1.2351955710572272e-06, + "loss": 0.6444, + "step": 3857 + }, + { + "epoch": 0.7782957758465802, + "grad_norm": 0.6088526844978333, + "learning_rate": 1.233045997619814e-06, + "loss": 0.6993, + "step": 3858 + }, + { + "epoch": 0.7784975114027871, + "grad_norm": 0.5636637806892395, + "learning_rate": 1.23089803312964e-06, + "loss": 0.6693, + "step": 3859 + }, + { + "epoch": 0.7786992469589941, + "grad_norm": 0.3898247480392456, + "learning_rate": 1.2287516785041447e-06, + "loss": 0.6597, + "step": 3860 + }, + { + "epoch": 0.7789009825152011, + "grad_norm": 0.3873283565044403, + "learning_rate": 1.2266069346600862e-06, + "loss": 0.6447, + "step": 3861 + }, + { + "epoch": 0.7791027180714081, + "grad_norm": 0.8242816925048828, + "learning_rate": 1.224463802513529e-06, + "loss": 0.8405, + "step": 3862 + }, + { + "epoch": 0.779304453627615, + "grad_norm": 0.9445183873176575, + "learning_rate": 1.2223222829798503e-06, + "loss": 0.672, + "step": 3863 + }, + { + "epoch": 0.7795061891838221, + "grad_norm": 0.4672654867172241, + "learning_rate": 1.2201823769737408e-06, + "loss": 0.6316, + "step": 3864 + }, + { + "epoch": 0.779707924740029, + "grad_norm": 0.43315044045448303, + "learning_rate": 1.2180440854092007e-06, + "loss": 0.6807, + "step": 3865 + }, + { + "epoch": 0.7799096602962361, + "grad_norm": 0.47044748067855835, + "learning_rate": 1.2159074091995387e-06, + "loss": 0.6416, + "step": 3866 + }, + { + "epoch": 0.780111395852443, + "grad_norm": 0.361098051071167, + "learning_rate": 1.2137723492573766e-06, + "loss": 0.651, + "step": 3867 + }, + { + "epoch": 0.78031313140865, + "grad_norm": 0.5044724345207214, + "learning_rate": 1.2116389064946427e-06, + "loss": 0.6424, + "step": 3868 + }, + { + "epoch": 0.7805148669648571, + "grad_norm": 0.3531644642353058, + "learning_rate": 1.209507081822579e-06, + "loss": 0.666, + "step": 3869 + }, + { + "epoch": 0.780716602521064, + "grad_norm": 0.7237960696220398, + "learning_rate": 1.2073768761517325e-06, + "loss": 0.6426, + "step": 3870 + }, + { + "epoch": 0.7809183380772711, + "grad_norm": 0.5849398374557495, + "learning_rate": 1.2052482903919577e-06, + "loss": 0.758, + "step": 3871 + }, + { + "epoch": 0.781120073633478, + "grad_norm": 0.47558656334877014, + "learning_rate": 1.2031213254524237e-06, + "loss": 0.6467, + "step": 3872 + }, + { + "epoch": 0.781321809189685, + "grad_norm": 0.6503086686134338, + "learning_rate": 1.2009959822416012e-06, + "loss": 0.7555, + "step": 3873 + }, + { + "epoch": 0.781523544745892, + "grad_norm": 0.8717564940452576, + "learning_rate": 1.1988722616672698e-06, + "loss": 0.6304, + "step": 3874 + }, + { + "epoch": 0.781725280302099, + "grad_norm": 0.5620495676994324, + "learning_rate": 1.1967501646365147e-06, + "loss": 0.6471, + "step": 3875 + }, + { + "epoch": 0.781927015858306, + "grad_norm": 0.46963924169540405, + "learning_rate": 1.1946296920557327e-06, + "loss": 0.6977, + "step": 3876 + }, + { + "epoch": 0.782128751414513, + "grad_norm": 0.4893326461315155, + "learning_rate": 1.1925108448306217e-06, + "loss": 0.7063, + "step": 3877 + }, + { + "epoch": 0.7823304869707199, + "grad_norm": 1.2923660278320312, + "learning_rate": 1.1903936238661868e-06, + "loss": 0.7065, + "step": 3878 + }, + { + "epoch": 0.782532222526927, + "grad_norm": 1.1199201345443726, + "learning_rate": 1.1882780300667374e-06, + "loss": 0.6457, + "step": 3879 + }, + { + "epoch": 0.782733958083134, + "grad_norm": 0.42425259947776794, + "learning_rate": 1.1861640643358925e-06, + "loss": 0.6513, + "step": 3880 + }, + { + "epoch": 0.7829356936393409, + "grad_norm": 0.8278416991233826, + "learning_rate": 1.18405172757657e-06, + "loss": 0.6506, + "step": 3881 + }, + { + "epoch": 0.783137429195548, + "grad_norm": 1.3610056638717651, + "learning_rate": 1.1819410206909942e-06, + "loss": 0.5924, + "step": 3882 + }, + { + "epoch": 0.7833391647517549, + "grad_norm": 1.611168384552002, + "learning_rate": 1.1798319445806955e-06, + "loss": 0.6485, + "step": 3883 + }, + { + "epoch": 0.783540900307962, + "grad_norm": 0.36074623465538025, + "learning_rate": 1.1777245001465048e-06, + "loss": 0.6858, + "step": 3884 + }, + { + "epoch": 0.7837426358641689, + "grad_norm": 0.9947795867919922, + "learning_rate": 1.1756186882885566e-06, + "loss": 0.7189, + "step": 3885 + }, + { + "epoch": 0.7839443714203759, + "grad_norm": 0.6028972268104553, + "learning_rate": 1.1735145099062872e-06, + "loss": 0.7583, + "step": 3886 + }, + { + "epoch": 0.7841461069765829, + "grad_norm": 1.2518178224563599, + "learning_rate": 1.1714119658984402e-06, + "loss": 0.694, + "step": 3887 + }, + { + "epoch": 0.7843478425327899, + "grad_norm": 0.311381995677948, + "learning_rate": 1.169311057163055e-06, + "loss": 1.0158, + "step": 3888 + }, + { + "epoch": 0.784549578088997, + "grad_norm": 0.5026519298553467, + "learning_rate": 1.167211784597474e-06, + "loss": 0.6602, + "step": 3889 + }, + { + "epoch": 0.7847513136452039, + "grad_norm": 0.7679435610771179, + "learning_rate": 1.1651141490983442e-06, + "loss": 0.6701, + "step": 3890 + }, + { + "epoch": 0.7849530492014108, + "grad_norm": 0.34757447242736816, + "learning_rate": 1.1630181515616102e-06, + "loss": 0.6911, + "step": 3891 + }, + { + "epoch": 0.7851547847576179, + "grad_norm": 0.4210948348045349, + "learning_rate": 1.1609237928825174e-06, + "loss": 0.7271, + "step": 3892 + }, + { + "epoch": 0.7853565203138249, + "grad_norm": 0.3013748824596405, + "learning_rate": 1.1588310739556113e-06, + "loss": 0.6697, + "step": 3893 + }, + { + "epoch": 0.7855582558700319, + "grad_norm": 0.8086436986923218, + "learning_rate": 1.156739995674736e-06, + "loss": 0.6954, + "step": 3894 + }, + { + "epoch": 0.7857599914262389, + "grad_norm": 0.5072692632675171, + "learning_rate": 1.1546505589330391e-06, + "loss": 0.772, + "step": 3895 + }, + { + "epoch": 0.7859617269824458, + "grad_norm": 0.7558883428573608, + "learning_rate": 1.152562764622963e-06, + "loss": 0.6673, + "step": 3896 + }, + { + "epoch": 0.7861634625386529, + "grad_norm": 0.8792096972465515, + "learning_rate": 1.1504766136362471e-06, + "loss": 0.6173, + "step": 3897 + }, + { + "epoch": 0.7863651980948598, + "grad_norm": 0.7532789707183838, + "learning_rate": 1.1483921068639353e-06, + "loss": 0.6114, + "step": 3898 + }, + { + "epoch": 0.7865669336510668, + "grad_norm": 0.6622703075408936, + "learning_rate": 1.1463092451963637e-06, + "loss": 1.0339, + "step": 3899 + }, + { + "epoch": 0.7867686692072738, + "grad_norm": 0.4823106825351715, + "learning_rate": 1.1442280295231656e-06, + "loss": 0.7618, + "step": 3900 + }, + { + "epoch": 0.7869704047634808, + "grad_norm": 0.4021747410297394, + "learning_rate": 1.1421484607332778e-06, + "loss": 0.6722, + "step": 3901 + }, + { + "epoch": 0.7871721403196879, + "grad_norm": 0.5340956449508667, + "learning_rate": 1.1400705397149226e-06, + "loss": 0.8126, + "step": 3902 + }, + { + "epoch": 0.7873738758758948, + "grad_norm": 0.3946157693862915, + "learning_rate": 1.1379942673556287e-06, + "loss": 0.6543, + "step": 3903 + }, + { + "epoch": 0.7875756114321018, + "grad_norm": 0.7702094316482544, + "learning_rate": 1.1359196445422187e-06, + "loss": 0.7597, + "step": 3904 + }, + { + "epoch": 0.7877773469883088, + "grad_norm": 0.3735303580760956, + "learning_rate": 1.1338466721608039e-06, + "loss": 0.6527, + "step": 3905 + }, + { + "epoch": 0.7879790825445158, + "grad_norm": 0.4215957522392273, + "learning_rate": 1.1317753510967989e-06, + "loss": 0.6755, + "step": 3906 + }, + { + "epoch": 0.7881808181007228, + "grad_norm": 2.6547322273254395, + "learning_rate": 1.1297056822349083e-06, + "loss": 0.8418, + "step": 3907 + }, + { + "epoch": 0.7883825536569298, + "grad_norm": 0.33156833052635193, + "learning_rate": 1.1276376664591315e-06, + "loss": 0.6247, + "step": 3908 + }, + { + "epoch": 0.7885842892131367, + "grad_norm": 0.522953450679779, + "learning_rate": 1.125571304652766e-06, + "loss": 0.8095, + "step": 3909 + }, + { + "epoch": 0.7887860247693438, + "grad_norm": 0.32513847947120667, + "learning_rate": 1.1235065976983944e-06, + "loss": 0.6806, + "step": 3910 + }, + { + "epoch": 0.7889877603255507, + "grad_norm": 0.586698055267334, + "learning_rate": 1.1214435464779006e-06, + "loss": 0.6676, + "step": 3911 + }, + { + "epoch": 0.7891894958817578, + "grad_norm": 0.805479884147644, + "learning_rate": 1.1193821518724602e-06, + "loss": 0.7554, + "step": 3912 + }, + { + "epoch": 0.7893912314379647, + "grad_norm": 0.4024786949157715, + "learning_rate": 1.1173224147625339e-06, + "loss": 0.6999, + "step": 3913 + }, + { + "epoch": 0.7895929669941717, + "grad_norm": 1.362681269645691, + "learning_rate": 1.1152643360278847e-06, + "loss": 0.6082, + "step": 3914 + }, + { + "epoch": 0.7897947025503788, + "grad_norm": 0.38716137409210205, + "learning_rate": 1.1132079165475601e-06, + "loss": 0.8036, + "step": 3915 + }, + { + "epoch": 0.7899964381065857, + "grad_norm": 0.44183582067489624, + "learning_rate": 1.1111531571999e-06, + "loss": 0.8228, + "step": 3916 + }, + { + "epoch": 0.7901981736627928, + "grad_norm": 0.6368033289909363, + "learning_rate": 1.1091000588625395e-06, + "loss": 0.6956, + "step": 3917 + }, + { + "epoch": 0.7903999092189997, + "grad_norm": 0.5896937847137451, + "learning_rate": 1.1070486224124e-06, + "loss": 0.8112, + "step": 3918 + }, + { + "epoch": 0.7906016447752067, + "grad_norm": 0.4653731882572174, + "learning_rate": 1.104998848725692e-06, + "loss": 0.7249, + "step": 3919 + }, + { + "epoch": 0.7908033803314137, + "grad_norm": 0.6194247007369995, + "learning_rate": 1.1029507386779225e-06, + "loss": 0.6755, + "step": 3920 + }, + { + "epoch": 0.7910051158876207, + "grad_norm": 0.34444916248321533, + "learning_rate": 1.1009042931438784e-06, + "loss": 0.6422, + "step": 3921 + }, + { + "epoch": 0.7912068514438276, + "grad_norm": 0.45076173543930054, + "learning_rate": 1.0988595129976444e-06, + "loss": 0.6559, + "step": 3922 + }, + { + "epoch": 0.7914085870000347, + "grad_norm": 0.41652804613113403, + "learning_rate": 1.096816399112589e-06, + "loss": 0.6954, + "step": 3923 + }, + { + "epoch": 0.7916103225562416, + "grad_norm": 0.3207223117351532, + "learning_rate": 1.0947749523613683e-06, + "loss": 0.7796, + "step": 3924 + }, + { + "epoch": 0.7918120581124487, + "grad_norm": 0.3924807608127594, + "learning_rate": 1.0927351736159314e-06, + "loss": 0.6771, + "step": 3925 + }, + { + "epoch": 0.7920137936686557, + "grad_norm": 1.2680659294128418, + "learning_rate": 1.09069706374751e-06, + "loss": 0.7168, + "step": 3926 + }, + { + "epoch": 0.7922155292248626, + "grad_norm": 1.0150847434997559, + "learning_rate": 1.088660623626624e-06, + "loss": 0.7073, + "step": 3927 + }, + { + "epoch": 0.7924172647810697, + "grad_norm": 0.3826291859149933, + "learning_rate": 1.0866258541230835e-06, + "loss": 0.7506, + "step": 3928 + }, + { + "epoch": 0.7926190003372766, + "grad_norm": 0.8095358610153198, + "learning_rate": 1.0845927561059805e-06, + "loss": 0.6207, + "step": 3929 + }, + { + "epoch": 0.7928207358934837, + "grad_norm": 0.41174301505088806, + "learning_rate": 1.0825613304436938e-06, + "loss": 0.6418, + "step": 3930 + }, + { + "epoch": 0.7930224714496906, + "grad_norm": 0.4840712547302246, + "learning_rate": 1.0805315780038922e-06, + "loss": 0.684, + "step": 3931 + }, + { + "epoch": 0.7932242070058976, + "grad_norm": 0.5175725817680359, + "learning_rate": 1.078503499653525e-06, + "loss": 0.6481, + "step": 3932 + }, + { + "epoch": 0.7934259425621046, + "grad_norm": 0.424094557762146, + "learning_rate": 1.0764770962588278e-06, + "loss": 0.7043, + "step": 3933 + }, + { + "epoch": 0.7936276781183116, + "grad_norm": 0.40806031227111816, + "learning_rate": 1.074452368685322e-06, + "loss": 0.6706, + "step": 3934 + }, + { + "epoch": 0.7938294136745186, + "grad_norm": 0.8501918911933899, + "learning_rate": 1.0724293177978106e-06, + "loss": 0.8923, + "step": 3935 + }, + { + "epoch": 0.7940311492307256, + "grad_norm": 1.5072156190872192, + "learning_rate": 1.0704079444603855e-06, + "loss": 0.7205, + "step": 3936 + }, + { + "epoch": 0.7942328847869325, + "grad_norm": 0.581558346748352, + "learning_rate": 1.0683882495364163e-06, + "loss": 0.8027, + "step": 3937 + }, + { + "epoch": 0.7944346203431396, + "grad_norm": 0.35465800762176514, + "learning_rate": 1.0663702338885579e-06, + "loss": 0.8202, + "step": 3938 + }, + { + "epoch": 0.7946363558993466, + "grad_norm": 0.8600527048110962, + "learning_rate": 1.0643538983787505e-06, + "loss": 0.6867, + "step": 3939 + }, + { + "epoch": 0.7948380914555535, + "grad_norm": 0.49030637741088867, + "learning_rate": 1.062339243868213e-06, + "loss": 0.6903, + "step": 3940 + }, + { + "epoch": 0.7950398270117606, + "grad_norm": 0.5355114340782166, + "learning_rate": 1.0603262712174477e-06, + "loss": 0.9598, + "step": 3941 + }, + { + "epoch": 0.7952415625679675, + "grad_norm": 0.5693348050117493, + "learning_rate": 1.0583149812862382e-06, + "loss": 0.6499, + "step": 3942 + }, + { + "epoch": 0.7954432981241746, + "grad_norm": 0.3693627417087555, + "learning_rate": 1.0563053749336516e-06, + "loss": 0.8187, + "step": 3943 + }, + { + "epoch": 0.7956450336803815, + "grad_norm": 0.31983208656311035, + "learning_rate": 1.0542974530180327e-06, + "loss": 0.6465, + "step": 3944 + }, + { + "epoch": 0.7958467692365885, + "grad_norm": 0.9442469477653503, + "learning_rate": 1.0522912163970073e-06, + "loss": 0.6709, + "step": 3945 + }, + { + "epoch": 0.7960485047927955, + "grad_norm": 0.4609791934490204, + "learning_rate": 1.0502866659274847e-06, + "loss": 0.6847, + "step": 3946 + }, + { + "epoch": 0.7962502403490025, + "grad_norm": 1.130595326423645, + "learning_rate": 1.0482838024656505e-06, + "loss": 0.6629, + "step": 3947 + }, + { + "epoch": 0.7964519759052096, + "grad_norm": 0.37832605838775635, + "learning_rate": 1.0462826268669707e-06, + "loss": 0.7392, + "step": 3948 + }, + { + "epoch": 0.7966537114614165, + "grad_norm": 0.35009559988975525, + "learning_rate": 1.0442831399861903e-06, + "loss": 0.6671, + "step": 3949 + }, + { + "epoch": 0.7968554470176235, + "grad_norm": 0.526996910572052, + "learning_rate": 1.0422853426773322e-06, + "loss": 0.629, + "step": 3950 + }, + { + "epoch": 0.7970571825738305, + "grad_norm": 0.4254051744937897, + "learning_rate": 1.040289235793701e-06, + "loss": 0.7922, + "step": 3951 + }, + { + "epoch": 0.7972589181300375, + "grad_norm": 0.6463653445243835, + "learning_rate": 1.0382948201878767e-06, + "loss": 0.8352, + "step": 3952 + }, + { + "epoch": 0.7974606536862445, + "grad_norm": 0.4028523862361908, + "learning_rate": 1.0363020967117143e-06, + "loss": 0.6736, + "step": 3953 + }, + { + "epoch": 0.7976623892424515, + "grad_norm": 0.6316519379615784, + "learning_rate": 1.034311066216353e-06, + "loss": 0.6796, + "step": 3954 + }, + { + "epoch": 0.7978641247986584, + "grad_norm": 0.37582528591156006, + "learning_rate": 1.0323217295522026e-06, + "loss": 0.6582, + "step": 3955 + }, + { + "epoch": 0.7980658603548655, + "grad_norm": 0.8689025640487671, + "learning_rate": 1.0303340875689505e-06, + "loss": 0.6513, + "step": 3956 + }, + { + "epoch": 0.7982675959110724, + "grad_norm": 0.45216962695121765, + "learning_rate": 1.028348141115565e-06, + "loss": 0.6753, + "step": 3957 + }, + { + "epoch": 0.7984693314672794, + "grad_norm": 0.5104213356971741, + "learning_rate": 1.0263638910402834e-06, + "loss": 0.7656, + "step": 3958 + }, + { + "epoch": 0.7986710670234864, + "grad_norm": 0.35348913073539734, + "learning_rate": 1.024381338190622e-06, + "loss": 0.6837, + "step": 3959 + }, + { + "epoch": 0.7988728025796934, + "grad_norm": 0.42005017399787903, + "learning_rate": 1.0224004834133755e-06, + "loss": 0.6548, + "step": 3960 + }, + { + "epoch": 0.7990745381359005, + "grad_norm": 0.5248255133628845, + "learning_rate": 1.0204213275546037e-06, + "loss": 0.6795, + "step": 3961 + }, + { + "epoch": 0.7992762736921074, + "grad_norm": 0.6113191843032837, + "learning_rate": 1.0184438714596518e-06, + "loss": 0.8211, + "step": 3962 + }, + { + "epoch": 0.7994780092483144, + "grad_norm": 0.480577290058136, + "learning_rate": 1.0164681159731316e-06, + "loss": 0.6789, + "step": 3963 + }, + { + "epoch": 0.7996797448045214, + "grad_norm": 0.5853450894355774, + "learning_rate": 1.0144940619389298e-06, + "loss": 0.668, + "step": 3964 + }, + { + "epoch": 0.7998814803607284, + "grad_norm": 0.8544267416000366, + "learning_rate": 1.01252171020021e-06, + "loss": 0.718, + "step": 3965 + }, + { + "epoch": 0.8000832159169354, + "grad_norm": 0.4894731938838959, + "learning_rate": 1.0105510615994051e-06, + "loss": 0.6801, + "step": 3966 + }, + { + "epoch": 0.8002849514731424, + "grad_norm": 0.5039870738983154, + "learning_rate": 1.00858211697822e-06, + "loss": 0.6748, + "step": 3967 + }, + { + "epoch": 0.8004866870293493, + "grad_norm": 0.6922500729560852, + "learning_rate": 1.006614877177638e-06, + "loss": 0.655, + "step": 3968 + }, + { + "epoch": 0.8006884225855564, + "grad_norm": 2.2016608715057373, + "learning_rate": 1.0046493430379029e-06, + "loss": 0.852, + "step": 3969 + }, + { + "epoch": 0.8008901581417633, + "grad_norm": 0.44182875752449036, + "learning_rate": 1.0026855153985409e-06, + "loss": 0.8245, + "step": 3970 + }, + { + "epoch": 0.8010918936979704, + "grad_norm": 0.35396745800971985, + "learning_rate": 1.000723395098347e-06, + "loss": 0.6556, + "step": 3971 + }, + { + "epoch": 0.8012936292541774, + "grad_norm": 0.33134302496910095, + "learning_rate": 9.987629829753799e-07, + "loss": 0.6738, + "step": 3972 + }, + { + "epoch": 0.8014953648103843, + "grad_norm": 0.5623152852058411, + "learning_rate": 9.968042798669775e-07, + "loss": 0.6553, + "step": 3973 + }, + { + "epoch": 0.8016971003665914, + "grad_norm": 0.7911794781684875, + "learning_rate": 9.94847286609743e-07, + "loss": 0.6228, + "step": 3974 + }, + { + "epoch": 0.8018988359227983, + "grad_norm": 0.39568525552749634, + "learning_rate": 9.928920040395495e-07, + "loss": 0.7458, + "step": 3975 + }, + { + "epoch": 0.8021005714790053, + "grad_norm": 0.3557928800582886, + "learning_rate": 9.90938432991544e-07, + "loss": 0.7113, + "step": 3976 + }, + { + "epoch": 0.8023023070352123, + "grad_norm": 0.8656487464904785, + "learning_rate": 9.889865743001332e-07, + "loss": 0.6891, + "step": 3977 + }, + { + "epoch": 0.8025040425914193, + "grad_norm": 0.665074348449707, + "learning_rate": 9.87036428799001e-07, + "loss": 0.6932, + "step": 3978 + }, + { + "epoch": 0.8027057781476263, + "grad_norm": 0.6446415185928345, + "learning_rate": 9.850879973210993e-07, + "loss": 0.66, + "step": 3979 + }, + { + "epoch": 0.8029075137038333, + "grad_norm": 0.7038710117340088, + "learning_rate": 9.831412806986395e-07, + "loss": 0.6797, + "step": 3980 + }, + { + "epoch": 0.8031092492600402, + "grad_norm": 0.48777586221694946, + "learning_rate": 9.811962797631102e-07, + "loss": 0.6634, + "step": 3981 + }, + { + "epoch": 0.8033109848162473, + "grad_norm": 0.33813929557800293, + "learning_rate": 9.792529953452622e-07, + "loss": 0.8279, + "step": 3982 + }, + { + "epoch": 0.8035127203724542, + "grad_norm": 0.9014752507209778, + "learning_rate": 9.773114282751134e-07, + "loss": 0.7631, + "step": 3983 + }, + { + "epoch": 0.8037144559286613, + "grad_norm": 0.8523035645484924, + "learning_rate": 9.753715793819502e-07, + "loss": 0.7108, + "step": 3984 + }, + { + "epoch": 0.8039161914848683, + "grad_norm": 0.7649599313735962, + "learning_rate": 9.734334494943237e-07, + "loss": 0.638, + "step": 3985 + }, + { + "epoch": 0.8041179270410752, + "grad_norm": 0.41619718074798584, + "learning_rate": 9.714970394400492e-07, + "loss": 0.7235, + "step": 3986 + }, + { + "epoch": 0.8043196625972823, + "grad_norm": 0.35346657037734985, + "learning_rate": 9.695623500462114e-07, + "loss": 0.6938, + "step": 3987 + }, + { + "epoch": 0.8045213981534892, + "grad_norm": 0.5827614068984985, + "learning_rate": 9.676293821391568e-07, + "loss": 0.6514, + "step": 3988 + }, + { + "epoch": 0.8047231337096963, + "grad_norm": 0.9623099565505981, + "learning_rate": 9.656981365444983e-07, + "loss": 0.7336, + "step": 3989 + }, + { + "epoch": 0.8049248692659032, + "grad_norm": 0.3670915365219116, + "learning_rate": 9.637686140871121e-07, + "loss": 0.6056, + "step": 3990 + }, + { + "epoch": 0.8051266048221102, + "grad_norm": 0.7231318950653076, + "learning_rate": 9.618408155911369e-07, + "loss": 0.7102, + "step": 3991 + }, + { + "epoch": 0.8053283403783172, + "grad_norm": 0.9028728008270264, + "learning_rate": 9.599147418799803e-07, + "loss": 0.8215, + "step": 3992 + }, + { + "epoch": 0.8055300759345242, + "grad_norm": 0.6327657103538513, + "learning_rate": 9.579903937763086e-07, + "loss": 0.6552, + "step": 3993 + }, + { + "epoch": 0.8057318114907311, + "grad_norm": 0.7391289472579956, + "learning_rate": 9.560677721020506e-07, + "loss": 0.6449, + "step": 3994 + }, + { + "epoch": 0.8059335470469382, + "grad_norm": 0.49933743476867676, + "learning_rate": 9.541468776784025e-07, + "loss": 0.6742, + "step": 3995 + }, + { + "epoch": 0.8061352826031452, + "grad_norm": 0.9601204991340637, + "learning_rate": 9.522277113258177e-07, + "loss": 0.6404, + "step": 3996 + }, + { + "epoch": 0.8063370181593522, + "grad_norm": 0.4408121109008789, + "learning_rate": 9.503102738640146e-07, + "loss": 0.6537, + "step": 3997 + }, + { + "epoch": 0.8065387537155592, + "grad_norm": 0.35162702202796936, + "learning_rate": 9.483945661119698e-07, + "loss": 0.7622, + "step": 3998 + }, + { + "epoch": 0.8067404892717661, + "grad_norm": 0.3745332360267639, + "learning_rate": 9.464805888879264e-07, + "loss": 0.6869, + "step": 3999 + }, + { + "epoch": 0.8069422248279732, + "grad_norm": 0.36443084478378296, + "learning_rate": 9.445683430093843e-07, + "loss": 0.8024, + "step": 4000 + }, + { + "epoch": 0.8071439603841801, + "grad_norm": 0.8786391019821167, + "learning_rate": 9.426578292931033e-07, + "loss": 0.6622, + "step": 4001 + }, + { + "epoch": 0.8073456959403872, + "grad_norm": 0.3806266188621521, + "learning_rate": 9.407490485551068e-07, + "loss": 0.6544, + "step": 4002 + }, + { + "epoch": 0.8075474314965941, + "grad_norm": 0.3530179262161255, + "learning_rate": 9.388420016106764e-07, + "loss": 0.7667, + "step": 4003 + }, + { + "epoch": 0.8077491670528011, + "grad_norm": 0.5868592858314514, + "learning_rate": 9.36936689274352e-07, + "loss": 0.6702, + "step": 4004 + }, + { + "epoch": 0.8079509026090081, + "grad_norm": 0.31058308482170105, + "learning_rate": 9.350331123599327e-07, + "loss": 0.9213, + "step": 4005 + }, + { + "epoch": 0.8081526381652151, + "grad_norm": 0.4726583659648895, + "learning_rate": 9.331312716804791e-07, + "loss": 0.6907, + "step": 4006 + }, + { + "epoch": 0.8083543737214222, + "grad_norm": 0.3896176815032959, + "learning_rate": 9.312311680483083e-07, + "loss": 0.8602, + "step": 4007 + }, + { + "epoch": 0.8085561092776291, + "grad_norm": 0.5245814919471741, + "learning_rate": 9.293328022749942e-07, + "loss": 0.6245, + "step": 4008 + }, + { + "epoch": 0.8087578448338361, + "grad_norm": 0.42944633960723877, + "learning_rate": 9.27436175171369e-07, + "loss": 0.8182, + "step": 4009 + }, + { + "epoch": 0.8089595803900431, + "grad_norm": 1.1788564920425415, + "learning_rate": 9.255412875475256e-07, + "loss": 0.6822, + "step": 4010 + }, + { + "epoch": 0.8091613159462501, + "grad_norm": 0.9582619071006775, + "learning_rate": 9.2364814021281e-07, + "loss": 0.8864, + "step": 4011 + }, + { + "epoch": 0.8093630515024571, + "grad_norm": 0.6898618936538696, + "learning_rate": 9.217567339758254e-07, + "loss": 0.6338, + "step": 4012 + }, + { + "epoch": 0.8095647870586641, + "grad_norm": 0.6961838006973267, + "learning_rate": 9.198670696444339e-07, + "loss": 0.6896, + "step": 4013 + }, + { + "epoch": 0.809766522614871, + "grad_norm": 0.3257087767124176, + "learning_rate": 9.179791480257511e-07, + "loss": 0.7154, + "step": 4014 + }, + { + "epoch": 0.8099682581710781, + "grad_norm": 0.4184440076351166, + "learning_rate": 9.160929699261479e-07, + "loss": 0.6807, + "step": 4015 + }, + { + "epoch": 0.810169993727285, + "grad_norm": 0.8447985649108887, + "learning_rate": 9.142085361512548e-07, + "loss": 0.6644, + "step": 4016 + }, + { + "epoch": 0.810371729283492, + "grad_norm": 0.49792590737342834, + "learning_rate": 9.123258475059493e-07, + "loss": 0.6581, + "step": 4017 + }, + { + "epoch": 0.810573464839699, + "grad_norm": 0.32772713899612427, + "learning_rate": 9.104449047943725e-07, + "loss": 0.6658, + "step": 4018 + }, + { + "epoch": 0.810775200395906, + "grad_norm": 0.5296732187271118, + "learning_rate": 9.08565708819914e-07, + "loss": 0.6357, + "step": 4019 + }, + { + "epoch": 0.8109769359521131, + "grad_norm": 0.5747442245483398, + "learning_rate": 9.066882603852173e-07, + "loss": 0.6868, + "step": 4020 + }, + { + "epoch": 0.81117867150832, + "grad_norm": 0.36947518587112427, + "learning_rate": 9.048125602921843e-07, + "loss": 0.641, + "step": 4021 + }, + { + "epoch": 0.811380407064527, + "grad_norm": 0.3003344237804413, + "learning_rate": 9.029386093419651e-07, + "loss": 0.6279, + "step": 4022 + }, + { + "epoch": 0.811582142620734, + "grad_norm": 0.6348669528961182, + "learning_rate": 9.010664083349635e-07, + "loss": 0.6277, + "step": 4023 + }, + { + "epoch": 0.811783878176941, + "grad_norm": 0.4441215991973877, + "learning_rate": 8.991959580708409e-07, + "loss": 0.7389, + "step": 4024 + }, + { + "epoch": 0.811985613733148, + "grad_norm": 0.5426074266433716, + "learning_rate": 8.973272593485011e-07, + "loss": 0.6705, + "step": 4025 + }, + { + "epoch": 0.812187349289355, + "grad_norm": 0.9262121319770813, + "learning_rate": 8.954603129661088e-07, + "loss": 0.656, + "step": 4026 + }, + { + "epoch": 0.8123890848455619, + "grad_norm": 0.46151402592658997, + "learning_rate": 8.935951197210796e-07, + "loss": 0.7806, + "step": 4027 + }, + { + "epoch": 0.812590820401769, + "grad_norm": 0.9116199612617493, + "learning_rate": 8.917316804100723e-07, + "loss": 0.7286, + "step": 4028 + }, + { + "epoch": 0.812792555957976, + "grad_norm": 0.5533108711242676, + "learning_rate": 8.898699958290063e-07, + "loss": 0.6867, + "step": 4029 + }, + { + "epoch": 0.812994291514183, + "grad_norm": 0.6026671528816223, + "learning_rate": 8.880100667730457e-07, + "loss": 0.6471, + "step": 4030 + }, + { + "epoch": 0.81319602707039, + "grad_norm": 0.35685989260673523, + "learning_rate": 8.861518940366043e-07, + "loss": 0.7645, + "step": 4031 + }, + { + "epoch": 0.8133977626265969, + "grad_norm": 0.42927679419517517, + "learning_rate": 8.842954784133517e-07, + "loss": 0.8251, + "step": 4032 + }, + { + "epoch": 0.813599498182804, + "grad_norm": 0.5092020034790039, + "learning_rate": 8.824408206962004e-07, + "loss": 0.668, + "step": 4033 + }, + { + "epoch": 0.8138012337390109, + "grad_norm": 0.6452570557594299, + "learning_rate": 8.805879216773139e-07, + "loss": 0.6546, + "step": 4034 + }, + { + "epoch": 0.8140029692952179, + "grad_norm": 0.4207552373409271, + "learning_rate": 8.787367821481096e-07, + "loss": 0.7028, + "step": 4035 + }, + { + "epoch": 0.8142047048514249, + "grad_norm": 0.3666224479675293, + "learning_rate": 8.768874028992431e-07, + "loss": 0.6957, + "step": 4036 + }, + { + "epoch": 0.8144064404076319, + "grad_norm": 0.544642448425293, + "learning_rate": 8.750397847206288e-07, + "loss": 0.666, + "step": 4037 + }, + { + "epoch": 0.8146081759638389, + "grad_norm": 0.3639770448207855, + "learning_rate": 8.731939284014223e-07, + "loss": 0.6683, + "step": 4038 + }, + { + "epoch": 0.8148099115200459, + "grad_norm": 0.5589380264282227, + "learning_rate": 8.713498347300281e-07, + "loss": 0.8243, + "step": 4039 + }, + { + "epoch": 0.8150116470762528, + "grad_norm": 0.3419032692909241, + "learning_rate": 8.695075044940998e-07, + "loss": 0.6522, + "step": 4040 + }, + { + "epoch": 0.8152133826324599, + "grad_norm": 0.9368754625320435, + "learning_rate": 8.676669384805359e-07, + "loss": 0.8444, + "step": 4041 + }, + { + "epoch": 0.8154151181886669, + "grad_norm": 0.3633244037628174, + "learning_rate": 8.658281374754807e-07, + "loss": 0.6971, + "step": 4042 + }, + { + "epoch": 0.8156168537448739, + "grad_norm": 0.40811094641685486, + "learning_rate": 8.639911022643288e-07, + "loss": 0.679, + "step": 4043 + }, + { + "epoch": 0.8158185893010809, + "grad_norm": 0.5470287799835205, + "learning_rate": 8.621558336317132e-07, + "loss": 0.7524, + "step": 4044 + }, + { + "epoch": 0.8160203248572878, + "grad_norm": 0.4651123583316803, + "learning_rate": 8.60322332361519e-07, + "loss": 0.7018, + "step": 4045 + }, + { + "epoch": 0.8162220604134949, + "grad_norm": 0.3326999843120575, + "learning_rate": 8.584905992368764e-07, + "loss": 0.6496, + "step": 4046 + }, + { + "epoch": 0.8164237959697018, + "grad_norm": 1.3912816047668457, + "learning_rate": 8.56660635040153e-07, + "loss": 0.8146, + "step": 4047 + }, + { + "epoch": 0.8166255315259089, + "grad_norm": 0.3894466459751129, + "learning_rate": 8.548324405529696e-07, + "loss": 0.7605, + "step": 4048 + }, + { + "epoch": 0.8168272670821158, + "grad_norm": 0.5294163227081299, + "learning_rate": 8.530060165561871e-07, + "loss": 0.7352, + "step": 4049 + }, + { + "epoch": 0.8170290026383228, + "grad_norm": 0.4345068335533142, + "learning_rate": 8.511813638299082e-07, + "loss": 0.7604, + "step": 4050 + }, + { + "epoch": 0.8172307381945298, + "grad_norm": 0.7005764842033386, + "learning_rate": 8.493584831534845e-07, + "loss": 0.7812, + "step": 4051 + }, + { + "epoch": 0.8174324737507368, + "grad_norm": 0.44860848784446716, + "learning_rate": 8.475373753055067e-07, + "loss": 0.6624, + "step": 4052 + }, + { + "epoch": 0.8176342093069437, + "grad_norm": 0.9805493354797363, + "learning_rate": 8.457180410638072e-07, + "loss": 0.709, + "step": 4053 + }, + { + "epoch": 0.8178359448631508, + "grad_norm": 0.35368284583091736, + "learning_rate": 8.439004812054658e-07, + "loss": 0.7585, + "step": 4054 + }, + { + "epoch": 0.8180376804193578, + "grad_norm": 1.1566565036773682, + "learning_rate": 8.420846965068003e-07, + "loss": 0.762, + "step": 4055 + }, + { + "epoch": 0.8182394159755648, + "grad_norm": 0.9465445280075073, + "learning_rate": 8.402706877433708e-07, + "loss": 0.7044, + "step": 4056 + }, + { + "epoch": 0.8184411515317718, + "grad_norm": 0.5295639038085938, + "learning_rate": 8.384584556899805e-07, + "loss": 0.6468, + "step": 4057 + }, + { + "epoch": 0.8186428870879787, + "grad_norm": 0.4885746240615845, + "learning_rate": 8.366480011206707e-07, + "loss": 0.8564, + "step": 4058 + }, + { + "epoch": 0.8188446226441858, + "grad_norm": 0.4677394926548004, + "learning_rate": 8.348393248087289e-07, + "loss": 0.7055, + "step": 4059 + }, + { + "epoch": 0.8190463582003927, + "grad_norm": 0.5112569332122803, + "learning_rate": 8.330324275266777e-07, + "loss": 0.8003, + "step": 4060 + }, + { + "epoch": 0.8192480937565998, + "grad_norm": 0.7537760138511658, + "learning_rate": 8.312273100462809e-07, + "loss": 0.6848, + "step": 4061 + }, + { + "epoch": 0.8194498293128067, + "grad_norm": 0.3288614749908447, + "learning_rate": 8.294239731385456e-07, + "loss": 1.0145, + "step": 4062 + }, + { + "epoch": 0.8196515648690137, + "grad_norm": 0.6865113377571106, + "learning_rate": 8.276224175737152e-07, + "loss": 0.7945, + "step": 4063 + }, + { + "epoch": 0.8198533004252208, + "grad_norm": 0.46431106328964233, + "learning_rate": 8.258226441212719e-07, + "loss": 0.6765, + "step": 4064 + }, + { + "epoch": 0.8200550359814277, + "grad_norm": 0.6039952635765076, + "learning_rate": 8.240246535499369e-07, + "loss": 0.6528, + "step": 4065 + }, + { + "epoch": 0.8202567715376348, + "grad_norm": 0.473136842250824, + "learning_rate": 8.222284466276731e-07, + "loss": 0.6434, + "step": 4066 + }, + { + "epoch": 0.8204585070938417, + "grad_norm": 0.46378180384635925, + "learning_rate": 8.20434024121678e-07, + "loss": 0.7357, + "step": 4067 + }, + { + "epoch": 0.8206602426500487, + "grad_norm": 0.5360262989997864, + "learning_rate": 8.186413867983872e-07, + "loss": 0.6118, + "step": 4068 + }, + { + "epoch": 0.8208619782062557, + "grad_norm": 0.32590314745903015, + "learning_rate": 8.168505354234774e-07, + "loss": 0.6723, + "step": 4069 + }, + { + "epoch": 0.8210637137624627, + "grad_norm": 0.364083468914032, + "learning_rate": 8.150614707618576e-07, + "loss": 0.8355, + "step": 4070 + }, + { + "epoch": 0.8212654493186696, + "grad_norm": 0.5447162985801697, + "learning_rate": 8.132741935776767e-07, + "loss": 0.8829, + "step": 4071 + }, + { + "epoch": 0.8214671848748767, + "grad_norm": 0.9954445362091064, + "learning_rate": 8.114887046343184e-07, + "loss": 0.6878, + "step": 4072 + }, + { + "epoch": 0.8216689204310836, + "grad_norm": 0.7198444604873657, + "learning_rate": 8.097050046944039e-07, + "loss": 0.6585, + "step": 4073 + }, + { + "epoch": 0.8218706559872907, + "grad_norm": 1.0638344287872314, + "learning_rate": 8.079230945197908e-07, + "loss": 0.7947, + "step": 4074 + }, + { + "epoch": 0.8220723915434976, + "grad_norm": 1.1641714572906494, + "learning_rate": 8.061429748715705e-07, + "loss": 0.6389, + "step": 4075 + }, + { + "epoch": 0.8222741270997046, + "grad_norm": 0.6671349406242371, + "learning_rate": 8.043646465100696e-07, + "loss": 0.6476, + "step": 4076 + }, + { + "epoch": 0.8224758626559117, + "grad_norm": 0.4042210280895233, + "learning_rate": 8.02588110194853e-07, + "loss": 0.6908, + "step": 4077 + }, + { + "epoch": 0.8226775982121186, + "grad_norm": 0.49477052688598633, + "learning_rate": 8.008133666847156e-07, + "loss": 0.7094, + "step": 4078 + }, + { + "epoch": 0.8228793337683257, + "grad_norm": 0.5312076807022095, + "learning_rate": 7.990404167376886e-07, + "loss": 0.7471, + "step": 4079 + }, + { + "epoch": 0.8230810693245326, + "grad_norm": 0.4298705756664276, + "learning_rate": 7.972692611110384e-07, + "loss": 0.716, + "step": 4080 + }, + { + "epoch": 0.8232828048807396, + "grad_norm": 0.5681132674217224, + "learning_rate": 7.954999005612629e-07, + "loss": 0.6814, + "step": 4081 + }, + { + "epoch": 0.8234845404369466, + "grad_norm": 0.49187588691711426, + "learning_rate": 7.937323358440935e-07, + "loss": 0.674, + "step": 4082 + }, + { + "epoch": 0.8236862759931536, + "grad_norm": 0.4805457293987274, + "learning_rate": 7.919665677144983e-07, + "loss": 0.6772, + "step": 4083 + }, + { + "epoch": 0.8238880115493606, + "grad_norm": 0.41921690106391907, + "learning_rate": 7.902025969266702e-07, + "loss": 0.6761, + "step": 4084 + }, + { + "epoch": 0.8240897471055676, + "grad_norm": 0.5359138250350952, + "learning_rate": 7.884404242340421e-07, + "loss": 0.6255, + "step": 4085 + }, + { + "epoch": 0.8242914826617745, + "grad_norm": 0.39744481444358826, + "learning_rate": 7.866800503892758e-07, + "loss": 0.7994, + "step": 4086 + }, + { + "epoch": 0.8244932182179816, + "grad_norm": 0.47074541449546814, + "learning_rate": 7.849214761442637e-07, + "loss": 0.6598, + "step": 4087 + }, + { + "epoch": 0.8246949537741886, + "grad_norm": 0.4479901194572449, + "learning_rate": 7.83164702250132e-07, + "loss": 0.6677, + "step": 4088 + }, + { + "epoch": 0.8248966893303955, + "grad_norm": 3.9795687198638916, + "learning_rate": 7.814097294572365e-07, + "loss": 0.6239, + "step": 4089 + }, + { + "epoch": 0.8250984248866026, + "grad_norm": 0.34133467078208923, + "learning_rate": 7.796565585151621e-07, + "loss": 0.7294, + "step": 4090 + }, + { + "epoch": 0.8253001604428095, + "grad_norm": 0.5068195462226868, + "learning_rate": 7.779051901727297e-07, + "loss": 0.6838, + "step": 4091 + }, + { + "epoch": 0.8255018959990166, + "grad_norm": 6.181421279907227, + "learning_rate": 7.761556251779823e-07, + "loss": 0.6886, + "step": 4092 + }, + { + "epoch": 0.8257036315552235, + "grad_norm": 0.48550984263420105, + "learning_rate": 7.744078642781982e-07, + "loss": 0.6524, + "step": 4093 + }, + { + "epoch": 0.8259053671114305, + "grad_norm": 0.8800036311149597, + "learning_rate": 7.726619082198871e-07, + "loss": 0.6432, + "step": 4094 + }, + { + "epoch": 0.8261071026676375, + "grad_norm": 0.6388117074966431, + "learning_rate": 7.709177577487786e-07, + "loss": 0.6767, + "step": 4095 + }, + { + "epoch": 0.8263088382238445, + "grad_norm": 0.3175949454307556, + "learning_rate": 7.691754136098417e-07, + "loss": 0.692, + "step": 4096 + }, + { + "epoch": 0.8265105737800515, + "grad_norm": 0.7242339253425598, + "learning_rate": 7.674348765472672e-07, + "loss": 0.6378, + "step": 4097 + }, + { + "epoch": 0.8267123093362585, + "grad_norm": 0.3929362893104553, + "learning_rate": 7.656961473044744e-07, + "loss": 0.675, + "step": 4098 + }, + { + "epoch": 0.8269140448924654, + "grad_norm": 1.0461013317108154, + "learning_rate": 7.63959226624117e-07, + "loss": 0.675, + "step": 4099 + }, + { + "epoch": 0.8271157804486725, + "grad_norm": 0.5271601676940918, + "learning_rate": 7.622241152480652e-07, + "loss": 0.6348, + "step": 4100 + }, + { + "epoch": 0.8273175160048795, + "grad_norm": 1.4197477102279663, + "learning_rate": 7.604908139174255e-07, + "loss": 0.7237, + "step": 4101 + }, + { + "epoch": 0.8275192515610865, + "grad_norm": 0.4259219765663147, + "learning_rate": 7.587593233725305e-07, + "loss": 0.6484, + "step": 4102 + }, + { + "epoch": 0.8277209871172935, + "grad_norm": 0.34800437092781067, + "learning_rate": 7.570296443529318e-07, + "loss": 0.6178, + "step": 4103 + }, + { + "epoch": 0.8279227226735004, + "grad_norm": 0.5846043825149536, + "learning_rate": 7.55301777597417e-07, + "loss": 0.6755, + "step": 4104 + }, + { + "epoch": 0.8281244582297075, + "grad_norm": 0.34522733092308044, + "learning_rate": 7.535757238439939e-07, + "loss": 0.6354, + "step": 4105 + }, + { + "epoch": 0.8283261937859144, + "grad_norm": 0.5267305374145508, + "learning_rate": 7.518514838298957e-07, + "loss": 0.741, + "step": 4106 + }, + { + "epoch": 0.8285279293421214, + "grad_norm": 0.5945020318031311, + "learning_rate": 7.501290582915849e-07, + "loss": 0.7373, + "step": 4107 + }, + { + "epoch": 0.8287296648983284, + "grad_norm": 0.4800272583961487, + "learning_rate": 7.484084479647458e-07, + "loss": 0.6785, + "step": 4108 + }, + { + "epoch": 0.8289314004545354, + "grad_norm": 0.37526294589042664, + "learning_rate": 7.466896535842865e-07, + "loss": 0.6325, + "step": 4109 + }, + { + "epoch": 0.8291331360107425, + "grad_norm": 0.7243527770042419, + "learning_rate": 7.449726758843434e-07, + "loss": 0.6231, + "step": 4110 + }, + { + "epoch": 0.8293348715669494, + "grad_norm": 0.4649500250816345, + "learning_rate": 7.432575155982741e-07, + "loss": 0.7683, + "step": 4111 + }, + { + "epoch": 0.8295366071231564, + "grad_norm": 0.532231330871582, + "learning_rate": 7.415441734586604e-07, + "loss": 0.6805, + "step": 4112 + }, + { + "epoch": 0.8297383426793634, + "grad_norm": 0.7913029193878174, + "learning_rate": 7.398326501973069e-07, + "loss": 0.6648, + "step": 4113 + }, + { + "epoch": 0.8299400782355704, + "grad_norm": 0.5430890917778015, + "learning_rate": 7.381229465452417e-07, + "loss": 0.6817, + "step": 4114 + }, + { + "epoch": 0.8301418137917774, + "grad_norm": 0.3268798291683197, + "learning_rate": 7.364150632327182e-07, + "loss": 0.6853, + "step": 4115 + }, + { + "epoch": 0.8303435493479844, + "grad_norm": 0.39360764622688293, + "learning_rate": 7.347090009892089e-07, + "loss": 0.6283, + "step": 4116 + }, + { + "epoch": 0.8305452849041913, + "grad_norm": 0.44346925616264343, + "learning_rate": 7.330047605434087e-07, + "loss": 0.6706, + "step": 4117 + }, + { + "epoch": 0.8307470204603984, + "grad_norm": 0.6690890789031982, + "learning_rate": 7.313023426232374e-07, + "loss": 0.6884, + "step": 4118 + }, + { + "epoch": 0.8309487560166053, + "grad_norm": 0.6090205311775208, + "learning_rate": 7.296017479558338e-07, + "loss": 0.8355, + "step": 4119 + }, + { + "epoch": 0.8311504915728124, + "grad_norm": 0.8864596486091614, + "learning_rate": 7.279029772675572e-07, + "loss": 0.6982, + "step": 4120 + }, + { + "epoch": 0.8313522271290193, + "grad_norm": 0.3951992690563202, + "learning_rate": 7.262060312839908e-07, + "loss": 0.7975, + "step": 4121 + }, + { + "epoch": 0.8315539626852263, + "grad_norm": 0.4038160443305969, + "learning_rate": 7.24510910729937e-07, + "loss": 0.6582, + "step": 4122 + }, + { + "epoch": 0.8317556982414334, + "grad_norm": 0.36711713671684265, + "learning_rate": 7.228176163294171e-07, + "loss": 0.6611, + "step": 4123 + }, + { + "epoch": 0.8319574337976403, + "grad_norm": 0.3582766056060791, + "learning_rate": 7.211261488056731e-07, + "loss": 0.7128, + "step": 4124 + }, + { + "epoch": 0.8321591693538474, + "grad_norm": 0.40361008048057556, + "learning_rate": 7.194365088811689e-07, + "loss": 0.6902, + "step": 4125 + }, + { + "epoch": 0.8323609049100543, + "grad_norm": 0.38358166813850403, + "learning_rate": 7.17748697277586e-07, + "loss": 0.7614, + "step": 4126 + }, + { + "epoch": 0.8325626404662613, + "grad_norm": 0.33210864663124084, + "learning_rate": 7.160627147158244e-07, + "loss": 0.8363, + "step": 4127 + }, + { + "epoch": 0.8327643760224683, + "grad_norm": 1.1483681201934814, + "learning_rate": 7.143785619160026e-07, + "loss": 0.7553, + "step": 4128 + }, + { + "epoch": 0.8329661115786753, + "grad_norm": 0.6336138844490051, + "learning_rate": 7.126962395974607e-07, + "loss": 0.7545, + "step": 4129 + }, + { + "epoch": 0.8331678471348822, + "grad_norm": 0.38122066855430603, + "learning_rate": 7.110157484787538e-07, + "loss": 0.7034, + "step": 4130 + }, + { + "epoch": 0.8333695826910893, + "grad_norm": 1.3744087219238281, + "learning_rate": 7.093370892776558e-07, + "loss": 0.7234, + "step": 4131 + }, + { + "epoch": 0.8335713182472962, + "grad_norm": 0.5250982642173767, + "learning_rate": 7.076602627111573e-07, + "loss": 0.6685, + "step": 4132 + }, + { + "epoch": 0.8337730538035033, + "grad_norm": 0.4979982078075409, + "learning_rate": 7.059852694954694e-07, + "loss": 0.6442, + "step": 4133 + }, + { + "epoch": 0.8339747893597103, + "grad_norm": 0.41418156027793884, + "learning_rate": 7.04312110346016e-07, + "loss": 0.7095, + "step": 4134 + }, + { + "epoch": 0.8341765249159172, + "grad_norm": 0.4083440601825714, + "learning_rate": 7.026407859774393e-07, + "loss": 0.7809, + "step": 4135 + }, + { + "epoch": 0.8343782604721243, + "grad_norm": 0.5151472091674805, + "learning_rate": 7.009712971035998e-07, + "loss": 0.6721, + "step": 4136 + }, + { + "epoch": 0.8345799960283312, + "grad_norm": 0.9469529390335083, + "learning_rate": 6.993036444375706e-07, + "loss": 0.6296, + "step": 4137 + }, + { + "epoch": 0.8347817315845383, + "grad_norm": 0.8012638688087463, + "learning_rate": 6.976378286916414e-07, + "loss": 0.6933, + "step": 4138 + }, + { + "epoch": 0.8349834671407452, + "grad_norm": 0.581827700138092, + "learning_rate": 6.959738505773211e-07, + "loss": 0.8131, + "step": 4139 + }, + { + "epoch": 0.8351852026969522, + "grad_norm": 0.3581453263759613, + "learning_rate": 6.943117108053265e-07, + "loss": 0.8009, + "step": 4140 + }, + { + "epoch": 0.8353869382531592, + "grad_norm": 0.396505743265152, + "learning_rate": 6.926514100855964e-07, + "loss": 0.6704, + "step": 4141 + }, + { + "epoch": 0.8355886738093662, + "grad_norm": 0.38942739367485046, + "learning_rate": 6.909929491272799e-07, + "loss": 0.6357, + "step": 4142 + }, + { + "epoch": 0.8357904093655733, + "grad_norm": 1.1855741739273071, + "learning_rate": 6.893363286387405e-07, + "loss": 0.6746, + "step": 4143 + }, + { + "epoch": 0.8359921449217802, + "grad_norm": 0.3587276339530945, + "learning_rate": 6.876815493275585e-07, + "loss": 0.7073, + "step": 4144 + }, + { + "epoch": 0.8361938804779872, + "grad_norm": 0.45886123180389404, + "learning_rate": 6.860286119005255e-07, + "loss": 0.7007, + "step": 4145 + }, + { + "epoch": 0.8363956160341942, + "grad_norm": 0.48944273591041565, + "learning_rate": 6.843775170636441e-07, + "loss": 0.614, + "step": 4146 + }, + { + "epoch": 0.8365973515904012, + "grad_norm": 0.4077273905277252, + "learning_rate": 6.827282655221373e-07, + "loss": 0.7222, + "step": 4147 + }, + { + "epoch": 0.8367990871466081, + "grad_norm": 0.537196695804596, + "learning_rate": 6.810808579804306e-07, + "loss": 0.6385, + "step": 4148 + }, + { + "epoch": 0.8370008227028152, + "grad_norm": 0.34784817695617676, + "learning_rate": 6.794352951421695e-07, + "loss": 0.6262, + "step": 4149 + }, + { + "epoch": 0.8372025582590221, + "grad_norm": 0.5768676400184631, + "learning_rate": 6.777915777102123e-07, + "loss": 0.6929, + "step": 4150 + }, + { + "epoch": 0.8374042938152292, + "grad_norm": 0.5870742201805115, + "learning_rate": 6.761497063866207e-07, + "loss": 0.7894, + "step": 4151 + }, + { + "epoch": 0.8376060293714361, + "grad_norm": 1.8497966527938843, + "learning_rate": 6.745096818726776e-07, + "loss": 0.6915, + "step": 4152 + }, + { + "epoch": 0.8378077649276431, + "grad_norm": 0.4410600960254669, + "learning_rate": 6.728715048688711e-07, + "loss": 0.6588, + "step": 4153 + }, + { + "epoch": 0.8380095004838501, + "grad_norm": 0.4844304025173187, + "learning_rate": 6.712351760749014e-07, + "loss": 0.9008, + "step": 4154 + }, + { + "epoch": 0.8382112360400571, + "grad_norm": 0.47696301341056824, + "learning_rate": 6.696006961896812e-07, + "loss": 0.7537, + "step": 4155 + }, + { + "epoch": 0.8384129715962642, + "grad_norm": 0.46135079860687256, + "learning_rate": 6.679680659113313e-07, + "loss": 0.6742, + "step": 4156 + }, + { + "epoch": 0.8386147071524711, + "grad_norm": 0.6165065169334412, + "learning_rate": 6.66337285937183e-07, + "loss": 0.6473, + "step": 4157 + }, + { + "epoch": 0.8388164427086781, + "grad_norm": 0.6609275937080383, + "learning_rate": 6.647083569637797e-07, + "loss": 0.6924, + "step": 4158 + }, + { + "epoch": 0.8390181782648851, + "grad_norm": 0.4120253026485443, + "learning_rate": 6.630812796868679e-07, + "loss": 0.7029, + "step": 4159 + }, + { + "epoch": 0.8392199138210921, + "grad_norm": 0.7023719549179077, + "learning_rate": 6.61456054801411e-07, + "loss": 0.704, + "step": 4160 + }, + { + "epoch": 0.8394216493772991, + "grad_norm": 0.3818644881248474, + "learning_rate": 6.598326830015761e-07, + "loss": 0.6771, + "step": 4161 + }, + { + "epoch": 0.8396233849335061, + "grad_norm": 0.40819141268730164, + "learning_rate": 6.582111649807399e-07, + "loss": 0.6666, + "step": 4162 + }, + { + "epoch": 0.839825120489713, + "grad_norm": 0.45097702741622925, + "learning_rate": 6.565915014314895e-07, + "loss": 0.7478, + "step": 4163 + }, + { + "epoch": 0.8400268560459201, + "grad_norm": 0.4159963130950928, + "learning_rate": 6.549736930456163e-07, + "loss": 0.6349, + "step": 4164 + }, + { + "epoch": 0.840228591602127, + "grad_norm": 0.7589116096496582, + "learning_rate": 6.533577405141211e-07, + "loss": 0.9, + "step": 4165 + }, + { + "epoch": 0.840430327158334, + "grad_norm": 0.6266133189201355, + "learning_rate": 6.517436445272135e-07, + "loss": 0.6469, + "step": 4166 + }, + { + "epoch": 0.840632062714541, + "grad_norm": 0.8853380084037781, + "learning_rate": 6.501314057743085e-07, + "loss": 0.6754, + "step": 4167 + }, + { + "epoch": 0.840833798270748, + "grad_norm": 0.6288461089134216, + "learning_rate": 6.48521024944026e-07, + "loss": 0.6727, + "step": 4168 + }, + { + "epoch": 0.8410355338269551, + "grad_norm": 2.7457149028778076, + "learning_rate": 6.46912502724198e-07, + "loss": 0.6679, + "step": 4169 + }, + { + "epoch": 0.841237269383162, + "grad_norm": 1.6510045528411865, + "learning_rate": 6.453058398018541e-07, + "loss": 0.7959, + "step": 4170 + }, + { + "epoch": 0.841439004939369, + "grad_norm": 0.7876770496368408, + "learning_rate": 6.43701036863239e-07, + "loss": 0.8301, + "step": 4171 + }, + { + "epoch": 0.841640740495576, + "grad_norm": 0.2771237790584564, + "learning_rate": 6.420980945937971e-07, + "loss": 0.7384, + "step": 4172 + }, + { + "epoch": 0.841842476051783, + "grad_norm": 0.6143053770065308, + "learning_rate": 6.40497013678178e-07, + "loss": 0.6745, + "step": 4173 + }, + { + "epoch": 0.84204421160799, + "grad_norm": 0.3154948055744171, + "learning_rate": 6.388977948002406e-07, + "loss": 0.6943, + "step": 4174 + }, + { + "epoch": 0.842245947164197, + "grad_norm": 0.3753282129764557, + "learning_rate": 6.373004386430442e-07, + "loss": 0.6293, + "step": 4175 + }, + { + "epoch": 0.8424476827204039, + "grad_norm": 0.677807629108429, + "learning_rate": 6.357049458888537e-07, + "loss": 0.6477, + "step": 4176 + }, + { + "epoch": 0.842649418276611, + "grad_norm": 0.4451962113380432, + "learning_rate": 6.341113172191399e-07, + "loss": 0.7745, + "step": 4177 + }, + { + "epoch": 0.842851153832818, + "grad_norm": 0.425942599773407, + "learning_rate": 6.325195533145751e-07, + "loss": 0.6755, + "step": 4178 + }, + { + "epoch": 0.843052889389025, + "grad_norm": 0.8061302900314331, + "learning_rate": 6.309296548550359e-07, + "loss": 0.6743, + "step": 4179 + }, + { + "epoch": 0.843254624945232, + "grad_norm": 0.5910888314247131, + "learning_rate": 6.293416225196009e-07, + "loss": 0.6547, + "step": 4180 + }, + { + "epoch": 0.8434563605014389, + "grad_norm": 0.42811521887779236, + "learning_rate": 6.277554569865557e-07, + "loss": 0.6636, + "step": 4181 + }, + { + "epoch": 0.843658096057646, + "grad_norm": 0.5060998201370239, + "learning_rate": 6.261711589333847e-07, + "loss": 0.6498, + "step": 4182 + }, + { + "epoch": 0.8438598316138529, + "grad_norm": 0.8295542001724243, + "learning_rate": 6.245887290367752e-07, + "loss": 0.6737, + "step": 4183 + }, + { + "epoch": 0.8440615671700599, + "grad_norm": 0.34962189197540283, + "learning_rate": 6.230081679726157e-07, + "loss": 0.674, + "step": 4184 + }, + { + "epoch": 0.8442633027262669, + "grad_norm": 0.47532859444618225, + "learning_rate": 6.214294764160012e-07, + "loss": 0.6495, + "step": 4185 + }, + { + "epoch": 0.8444650382824739, + "grad_norm": 0.8541948199272156, + "learning_rate": 6.198526550412232e-07, + "loss": 0.7932, + "step": 4186 + }, + { + "epoch": 0.8446667738386809, + "grad_norm": 0.4823712408542633, + "learning_rate": 6.182777045217764e-07, + "loss": 0.7874, + "step": 4187 + }, + { + "epoch": 0.8448685093948879, + "grad_norm": 0.4878910779953003, + "learning_rate": 6.167046255303543e-07, + "loss": 0.7106, + "step": 4188 + }, + { + "epoch": 0.8450702449510948, + "grad_norm": 1.4342825412750244, + "learning_rate": 6.151334187388552e-07, + "loss": 0.6694, + "step": 4189 + }, + { + "epoch": 0.8452719805073019, + "grad_norm": 0.973486602306366, + "learning_rate": 6.13564084818375e-07, + "loss": 0.7091, + "step": 4190 + }, + { + "epoch": 0.8454737160635089, + "grad_norm": 1.177097201347351, + "learning_rate": 6.119966244392084e-07, + "loss": 0.6511, + "step": 4191 + }, + { + "epoch": 0.8456754516197159, + "grad_norm": 0.44476157426834106, + "learning_rate": 6.104310382708539e-07, + "loss": 0.7403, + "step": 4192 + }, + { + "epoch": 0.8458771871759229, + "grad_norm": 0.37803569436073303, + "learning_rate": 6.088673269820061e-07, + "loss": 0.6753, + "step": 4193 + }, + { + "epoch": 0.8460789227321298, + "grad_norm": 0.6805874109268188, + "learning_rate": 6.073054912405591e-07, + "loss": 0.8085, + "step": 4194 + }, + { + "epoch": 0.8462806582883369, + "grad_norm": 0.6585365533828735, + "learning_rate": 6.057455317136063e-07, + "loss": 0.8038, + "step": 4195 + }, + { + "epoch": 0.8464823938445438, + "grad_norm": 0.34971868991851807, + "learning_rate": 6.041874490674416e-07, + "loss": 0.649, + "step": 4196 + }, + { + "epoch": 0.8466841294007509, + "grad_norm": 0.9895550012588501, + "learning_rate": 6.026312439675553e-07, + "loss": 0.8025, + "step": 4197 + }, + { + "epoch": 0.8468858649569578, + "grad_norm": 0.4490432143211365, + "learning_rate": 6.010769170786351e-07, + "loss": 0.6689, + "step": 4198 + }, + { + "epoch": 0.8470876005131648, + "grad_norm": 0.3404206335544586, + "learning_rate": 5.995244690645679e-07, + "loss": 0.6602, + "step": 4199 + }, + { + "epoch": 0.8472893360693718, + "grad_norm": 0.5209308862686157, + "learning_rate": 5.979739005884382e-07, + "loss": 0.6826, + "step": 4200 + }, + { + "epoch": 0.8474910716255788, + "grad_norm": 2.569395065307617, + "learning_rate": 5.964252123125275e-07, + "loss": 0.6958, + "step": 4201 + }, + { + "epoch": 0.8476928071817857, + "grad_norm": 0.3720131814479828, + "learning_rate": 5.948784048983125e-07, + "loss": 0.6304, + "step": 4202 + }, + { + "epoch": 0.8478945427379928, + "grad_norm": 0.30389517545700073, + "learning_rate": 5.933334790064698e-07, + "loss": 0.6829, + "step": 4203 + }, + { + "epoch": 0.8480962782941998, + "grad_norm": 0.4956578314304352, + "learning_rate": 5.917904352968695e-07, + "loss": 0.7114, + "step": 4204 + }, + { + "epoch": 0.8482980138504068, + "grad_norm": 0.576766848564148, + "learning_rate": 5.902492744285776e-07, + "loss": 0.6591, + "step": 4205 + }, + { + "epoch": 0.8484997494066138, + "grad_norm": 0.4476238191127777, + "learning_rate": 5.887099970598614e-07, + "loss": 0.7116, + "step": 4206 + }, + { + "epoch": 0.8487014849628207, + "grad_norm": 1.312357783317566, + "learning_rate": 5.87172603848174e-07, + "loss": 0.6716, + "step": 4207 + }, + { + "epoch": 0.8489032205190278, + "grad_norm": 0.3982757329940796, + "learning_rate": 5.856370954501722e-07, + "loss": 0.6598, + "step": 4208 + }, + { + "epoch": 0.8491049560752347, + "grad_norm": 0.3557843565940857, + "learning_rate": 5.841034725217049e-07, + "loss": 0.7635, + "step": 4209 + }, + { + "epoch": 0.8493066916314418, + "grad_norm": 0.5288363099098206, + "learning_rate": 5.82571735717814e-07, + "loss": 0.7273, + "step": 4210 + }, + { + "epoch": 0.8495084271876487, + "grad_norm": 0.38838717341423035, + "learning_rate": 5.810418856927385e-07, + "loss": 0.6438, + "step": 4211 + }, + { + "epoch": 0.8497101627438557, + "grad_norm": 0.481218546628952, + "learning_rate": 5.795139230999103e-07, + "loss": 0.8037, + "step": 4212 + }, + { + "epoch": 0.8499118983000628, + "grad_norm": 0.5310901999473572, + "learning_rate": 5.779878485919538e-07, + "loss": 0.7274, + "step": 4213 + }, + { + "epoch": 0.8501136338562697, + "grad_norm": 0.46764230728149414, + "learning_rate": 5.76463662820691e-07, + "loss": 0.8457, + "step": 4214 + }, + { + "epoch": 0.8503153694124768, + "grad_norm": 0.7696524858474731, + "learning_rate": 5.749413664371312e-07, + "loss": 0.7506, + "step": 4215 + }, + { + "epoch": 0.8505171049686837, + "grad_norm": 0.5245659947395325, + "learning_rate": 5.734209600914814e-07, + "loss": 0.6286, + "step": 4216 + }, + { + "epoch": 0.8507188405248907, + "grad_norm": 0.3059857487678528, + "learning_rate": 5.719024444331422e-07, + "loss": 0.7241, + "step": 4217 + }, + { + "epoch": 0.8509205760810977, + "grad_norm": 0.43941038846969604, + "learning_rate": 5.703858201107004e-07, + "loss": 0.6359, + "step": 4218 + }, + { + "epoch": 0.8511223116373047, + "grad_norm": 0.39626121520996094, + "learning_rate": 5.688710877719417e-07, + "loss": 0.6723, + "step": 4219 + }, + { + "epoch": 0.8513240471935117, + "grad_norm": 0.630386233329773, + "learning_rate": 5.673582480638395e-07, + "loss": 0.7543, + "step": 4220 + }, + { + "epoch": 0.8515257827497187, + "grad_norm": 0.34314990043640137, + "learning_rate": 5.658473016325605e-07, + "loss": 0.702, + "step": 4221 + }, + { + "epoch": 0.8517275183059256, + "grad_norm": 0.5030164122581482, + "learning_rate": 5.643382491234645e-07, + "loss": 0.9247, + "step": 4222 + }, + { + "epoch": 0.8519292538621327, + "grad_norm": 0.32690751552581787, + "learning_rate": 5.628310911810969e-07, + "loss": 0.6695, + "step": 4223 + }, + { + "epoch": 0.8521309894183396, + "grad_norm": 0.3914249539375305, + "learning_rate": 5.613258284491984e-07, + "loss": 0.6439, + "step": 4224 + }, + { + "epoch": 0.8523327249745466, + "grad_norm": 0.49372297525405884, + "learning_rate": 5.598224615707026e-07, + "loss": 0.6305, + "step": 4225 + }, + { + "epoch": 0.8525344605307537, + "grad_norm": 0.5601586699485779, + "learning_rate": 5.583209911877247e-07, + "loss": 0.6901, + "step": 4226 + }, + { + "epoch": 0.8527361960869606, + "grad_norm": 0.6586346626281738, + "learning_rate": 5.568214179415787e-07, + "loss": 0.651, + "step": 4227 + }, + { + "epoch": 0.8529379316431677, + "grad_norm": 0.4916522204875946, + "learning_rate": 5.553237424727631e-07, + "loss": 0.6494, + "step": 4228 + }, + { + "epoch": 0.8531396671993746, + "grad_norm": 1.484217643737793, + "learning_rate": 5.538279654209666e-07, + "loss": 0.6313, + "step": 4229 + }, + { + "epoch": 0.8533414027555816, + "grad_norm": 0.574791431427002, + "learning_rate": 5.523340874250704e-07, + "loss": 0.6747, + "step": 4230 + }, + { + "epoch": 0.8535431383117886, + "grad_norm": 0.5068508982658386, + "learning_rate": 5.508421091231403e-07, + "loss": 0.722, + "step": 4231 + }, + { + "epoch": 0.8537448738679956, + "grad_norm": 0.5957054495811462, + "learning_rate": 5.493520311524315e-07, + "loss": 0.645, + "step": 4232 + }, + { + "epoch": 0.8539466094242026, + "grad_norm": 0.31677567958831787, + "learning_rate": 5.478638541493903e-07, + "loss": 0.7713, + "step": 4233 + }, + { + "epoch": 0.8541483449804096, + "grad_norm": 0.36994820833206177, + "learning_rate": 5.463775787496484e-07, + "loss": 0.6373, + "step": 4234 + }, + { + "epoch": 0.8543500805366165, + "grad_norm": 0.35878241062164307, + "learning_rate": 5.448932055880262e-07, + "loss": 0.6331, + "step": 4235 + }, + { + "epoch": 0.8545518160928236, + "grad_norm": 0.4238692820072174, + "learning_rate": 5.434107352985313e-07, + "loss": 0.6168, + "step": 4236 + }, + { + "epoch": 0.8547535516490306, + "grad_norm": 0.4130517542362213, + "learning_rate": 5.41930168514358e-07, + "loss": 0.6304, + "step": 4237 + }, + { + "epoch": 0.8549552872052376, + "grad_norm": 0.5014340877532959, + "learning_rate": 5.404515058678894e-07, + "loss": 0.7457, + "step": 4238 + }, + { + "epoch": 0.8551570227614446, + "grad_norm": 0.610929548740387, + "learning_rate": 5.389747479906943e-07, + "loss": 0.6477, + "step": 4239 + }, + { + "epoch": 0.8553587583176515, + "grad_norm": 0.364946186542511, + "learning_rate": 5.374998955135258e-07, + "loss": 0.6872, + "step": 4240 + }, + { + "epoch": 0.8555604938738586, + "grad_norm": 0.39241814613342285, + "learning_rate": 5.360269490663278e-07, + "loss": 0.6155, + "step": 4241 + }, + { + "epoch": 0.8557622294300655, + "grad_norm": 0.34818553924560547, + "learning_rate": 5.345559092782266e-07, + "loss": 0.6519, + "step": 4242 + }, + { + "epoch": 0.8559639649862725, + "grad_norm": 0.3692716062068939, + "learning_rate": 5.330867767775333e-07, + "loss": 0.6915, + "step": 4243 + }, + { + "epoch": 0.8561657005424795, + "grad_norm": 0.3212972581386566, + "learning_rate": 5.316195521917484e-07, + "loss": 0.7803, + "step": 4244 + }, + { + "epoch": 0.8563674360986865, + "grad_norm": 0.43347883224487305, + "learning_rate": 5.301542361475548e-07, + "loss": 0.6335, + "step": 4245 + }, + { + "epoch": 0.8565691716548935, + "grad_norm": 0.6067093014717102, + "learning_rate": 5.286908292708198e-07, + "loss": 0.6424, + "step": 4246 + }, + { + "epoch": 0.8567709072111005, + "grad_norm": 0.6528324484825134, + "learning_rate": 5.272293321865951e-07, + "loss": 0.6596, + "step": 4247 + }, + { + "epoch": 0.8569726427673074, + "grad_norm": 0.5397018790245056, + "learning_rate": 5.257697455191197e-07, + "loss": 0.6386, + "step": 4248 + }, + { + "epoch": 0.8571743783235145, + "grad_norm": 0.4248422384262085, + "learning_rate": 5.243120698918136e-07, + "loss": 0.8655, + "step": 4249 + }, + { + "epoch": 0.8573761138797215, + "grad_norm": 0.33575424551963806, + "learning_rate": 5.228563059272812e-07, + "loss": 0.7742, + "step": 4250 + }, + { + "epoch": 0.8575778494359285, + "grad_norm": 0.5652782320976257, + "learning_rate": 5.2140245424731e-07, + "loss": 0.6833, + "step": 4251 + }, + { + "epoch": 0.8577795849921355, + "grad_norm": 0.41048964858055115, + "learning_rate": 5.199505154728729e-07, + "loss": 0.6791, + "step": 4252 + }, + { + "epoch": 0.8579813205483424, + "grad_norm": 0.7275264859199524, + "learning_rate": 5.185004902241241e-07, + "loss": 0.6563, + "step": 4253 + }, + { + "epoch": 0.8581830561045495, + "grad_norm": 0.5408958196640015, + "learning_rate": 5.170523791204002e-07, + "loss": 0.6719, + "step": 4254 + }, + { + "epoch": 0.8583847916607564, + "grad_norm": 0.3260270357131958, + "learning_rate": 5.156061827802195e-07, + "loss": 0.8123, + "step": 4255 + }, + { + "epoch": 0.8585865272169635, + "grad_norm": 0.5259901285171509, + "learning_rate": 5.141619018212851e-07, + "loss": 0.656, + "step": 4256 + }, + { + "epoch": 0.8587882627731704, + "grad_norm": 0.3833652436733246, + "learning_rate": 5.127195368604809e-07, + "loss": 0.6741, + "step": 4257 + }, + { + "epoch": 0.8589899983293774, + "grad_norm": 0.8140313029289246, + "learning_rate": 5.112790885138703e-07, + "loss": 0.8541, + "step": 4258 + }, + { + "epoch": 0.8591917338855845, + "grad_norm": 0.42131373286247253, + "learning_rate": 5.098405573967013e-07, + "loss": 0.677, + "step": 4259 + }, + { + "epoch": 0.8593934694417914, + "grad_norm": 0.4936279356479645, + "learning_rate": 5.084039441234013e-07, + "loss": 0.7666, + "step": 4260 + }, + { + "epoch": 0.8595952049979984, + "grad_norm": 0.3809494376182556, + "learning_rate": 5.069692493075778e-07, + "loss": 0.645, + "step": 4261 + }, + { + "epoch": 0.8597969405542054, + "grad_norm": 0.7333827018737793, + "learning_rate": 5.055364735620222e-07, + "loss": 0.6462, + "step": 4262 + }, + { + "epoch": 0.8599986761104124, + "grad_norm": 0.3328392207622528, + "learning_rate": 5.041056174987008e-07, + "loss": 0.6537, + "step": 4263 + }, + { + "epoch": 0.8602004116666194, + "grad_norm": 0.4985945522785187, + "learning_rate": 5.026766817287654e-07, + "loss": 0.6892, + "step": 4264 + }, + { + "epoch": 0.8604021472228264, + "grad_norm": 0.6473126411437988, + "learning_rate": 5.012496668625444e-07, + "loss": 0.7757, + "step": 4265 + }, + { + "epoch": 0.8606038827790333, + "grad_norm": 0.5193613767623901, + "learning_rate": 4.998245735095459e-07, + "loss": 0.6941, + "step": 4266 + }, + { + "epoch": 0.8608056183352404, + "grad_norm": 0.5102809071540833, + "learning_rate": 4.984014022784595e-07, + "loss": 0.8465, + "step": 4267 + }, + { + "epoch": 0.8610073538914473, + "grad_norm": 0.4982369542121887, + "learning_rate": 4.969801537771512e-07, + "loss": 0.717, + "step": 4268 + }, + { + "epoch": 0.8612090894476544, + "grad_norm": 0.6398607492446899, + "learning_rate": 4.955608286126673e-07, + "loss": 0.6681, + "step": 4269 + }, + { + "epoch": 0.8614108250038613, + "grad_norm": 0.37193191051483154, + "learning_rate": 4.941434273912321e-07, + "loss": 0.7803, + "step": 4270 + }, + { + "epoch": 0.8616125605600683, + "grad_norm": 0.31802332401275635, + "learning_rate": 4.927279507182486e-07, + "loss": 0.6695, + "step": 4271 + }, + { + "epoch": 0.8618142961162754, + "grad_norm": 0.5764533877372742, + "learning_rate": 4.91314399198296e-07, + "loss": 0.6397, + "step": 4272 + }, + { + "epoch": 0.8620160316724823, + "grad_norm": 0.360847532749176, + "learning_rate": 4.899027734351358e-07, + "loss": 0.719, + "step": 4273 + }, + { + "epoch": 0.8622177672286894, + "grad_norm": 0.7284826636314392, + "learning_rate": 4.88493074031699e-07, + "loss": 0.8214, + "step": 4274 + }, + { + "epoch": 0.8624195027848963, + "grad_norm": 0.44335803389549255, + "learning_rate": 4.870853015901028e-07, + "loss": 1.1041, + "step": 4275 + }, + { + "epoch": 0.8626212383411033, + "grad_norm": 0.5888170599937439, + "learning_rate": 4.856794567116352e-07, + "loss": 0.6641, + "step": 4276 + }, + { + "epoch": 0.8628229738973103, + "grad_norm": 0.48649922013282776, + "learning_rate": 4.842755399967625e-07, + "loss": 0.6491, + "step": 4277 + }, + { + "epoch": 0.8630247094535173, + "grad_norm": 0.9031432271003723, + "learning_rate": 4.828735520451294e-07, + "loss": 0.6338, + "step": 4278 + }, + { + "epoch": 0.8632264450097242, + "grad_norm": 0.7475653886795044, + "learning_rate": 4.814734934555543e-07, + "loss": 0.7146, + "step": 4279 + }, + { + "epoch": 0.8634281805659313, + "grad_norm": 0.7083207368850708, + "learning_rate": 4.800753648260309e-07, + "loss": 0.6607, + "step": 4280 + }, + { + "epoch": 0.8636299161221382, + "grad_norm": 0.5013923645019531, + "learning_rate": 4.786791667537338e-07, + "loss": 0.6543, + "step": 4281 + }, + { + "epoch": 0.8638316516783453, + "grad_norm": 0.6404758095741272, + "learning_rate": 4.772848998350049e-07, + "loss": 0.7574, + "step": 4282 + }, + { + "epoch": 0.8640333872345523, + "grad_norm": 1.0840024948120117, + "learning_rate": 4.7589256466536835e-07, + "loss": 0.6509, + "step": 4283 + }, + { + "epoch": 0.8642351227907592, + "grad_norm": 0.3333173990249634, + "learning_rate": 4.7450216183952127e-07, + "loss": 0.7079, + "step": 4284 + }, + { + "epoch": 0.8644368583469663, + "grad_norm": 0.4083701968193054, + "learning_rate": 4.7311369195133127e-07, + "loss": 0.6664, + "step": 4285 + }, + { + "epoch": 0.8646385939031732, + "grad_norm": 0.37062057852745056, + "learning_rate": 4.717271555938474e-07, + "loss": 0.7771, + "step": 4286 + }, + { + "epoch": 0.8648403294593803, + "grad_norm": 0.8348171710968018, + "learning_rate": 4.7034255335928704e-07, + "loss": 0.6784, + "step": 4287 + }, + { + "epoch": 0.8650420650155872, + "grad_norm": 0.3087136745452881, + "learning_rate": 4.689598858390432e-07, + "loss": 0.6582, + "step": 4288 + }, + { + "epoch": 0.8652438005717942, + "grad_norm": 0.5123614072799683, + "learning_rate": 4.6757915362368567e-07, + "loss": 0.6496, + "step": 4289 + }, + { + "epoch": 0.8654455361280012, + "grad_norm": 0.5696089863777161, + "learning_rate": 4.6620035730295277e-07, + "loss": 0.6654, + "step": 4290 + }, + { + "epoch": 0.8656472716842082, + "grad_norm": 0.4323718845844269, + "learning_rate": 4.6482349746575783e-07, + "loss": 0.6986, + "step": 4291 + }, + { + "epoch": 0.8658490072404152, + "grad_norm": 0.9498506188392639, + "learning_rate": 4.634485747001899e-07, + "loss": 0.6253, + "step": 4292 + }, + { + "epoch": 0.8660507427966222, + "grad_norm": 0.4021576941013336, + "learning_rate": 4.620755895935042e-07, + "loss": 0.6582, + "step": 4293 + }, + { + "epoch": 0.8662524783528291, + "grad_norm": 0.41058310866355896, + "learning_rate": 4.6070454273213605e-07, + "loss": 0.6798, + "step": 4294 + }, + { + "epoch": 0.8664542139090362, + "grad_norm": 0.7499169707298279, + "learning_rate": 4.5933543470168706e-07, + "loss": 0.6811, + "step": 4295 + }, + { + "epoch": 0.8666559494652432, + "grad_norm": 0.6942950487136841, + "learning_rate": 4.5796826608693277e-07, + "loss": 0.6765, + "step": 4296 + }, + { + "epoch": 0.8668576850214501, + "grad_norm": 0.511210560798645, + "learning_rate": 4.566030374718211e-07, + "loss": 0.8112, + "step": 4297 + }, + { + "epoch": 0.8670594205776572, + "grad_norm": 0.3508337140083313, + "learning_rate": 4.552397494394706e-07, + "loss": 0.7067, + "step": 4298 + }, + { + "epoch": 0.8672611561338641, + "grad_norm": 1.2191749811172485, + "learning_rate": 4.5387840257216987e-07, + "loss": 0.7768, + "step": 4299 + }, + { + "epoch": 0.8674628916900712, + "grad_norm": 0.6152402758598328, + "learning_rate": 4.5251899745138104e-07, + "loss": 0.6522, + "step": 4300 + }, + { + "epoch": 0.8676646272462781, + "grad_norm": 0.3791225254535675, + "learning_rate": 4.5116153465773525e-07, + "loss": 0.658, + "step": 4301 + }, + { + "epoch": 0.8678663628024851, + "grad_norm": 0.7504114508628845, + "learning_rate": 4.4980601477103257e-07, + "loss": 0.6723, + "step": 4302 + }, + { + "epoch": 0.8680680983586921, + "grad_norm": 0.520866870880127, + "learning_rate": 4.4845243837024543e-07, + "loss": 0.6935, + "step": 4303 + }, + { + "epoch": 0.8682698339148991, + "grad_norm": 0.8700990676879883, + "learning_rate": 4.4710080603351634e-07, + "loss": 0.6538, + "step": 4304 + }, + { + "epoch": 0.8684715694711062, + "grad_norm": 0.39416974782943726, + "learning_rate": 4.457511183381563e-07, + "loss": 0.7885, + "step": 4305 + }, + { + "epoch": 0.8686733050273131, + "grad_norm": 0.7392798662185669, + "learning_rate": 4.444033758606453e-07, + "loss": 0.6649, + "step": 4306 + }, + { + "epoch": 0.86887504058352, + "grad_norm": 0.6805448532104492, + "learning_rate": 4.4305757917663284e-07, + "loss": 0.8631, + "step": 4307 + }, + { + "epoch": 0.8690767761397271, + "grad_norm": 0.7489762902259827, + "learning_rate": 4.4171372886093967e-07, + "loss": 0.7059, + "step": 4308 + }, + { + "epoch": 0.8692785116959341, + "grad_norm": 0.5760902762413025, + "learning_rate": 4.4037182548755166e-07, + "loss": 0.7396, + "step": 4309 + }, + { + "epoch": 0.8694802472521411, + "grad_norm": 0.8316147327423096, + "learning_rate": 4.390318696296247e-07, + "loss": 0.6357, + "step": 4310 + }, + { + "epoch": 0.8696819828083481, + "grad_norm": 0.516345739364624, + "learning_rate": 4.376938618594828e-07, + "loss": 0.6736, + "step": 4311 + }, + { + "epoch": 0.869883718364555, + "grad_norm": 1.7527180910110474, + "learning_rate": 4.363578027486187e-07, + "loss": 0.6342, + "step": 4312 + }, + { + "epoch": 0.8700854539207621, + "grad_norm": 0.6879702806472778, + "learning_rate": 4.3502369286769154e-07, + "loss": 0.6328, + "step": 4313 + }, + { + "epoch": 0.870287189476969, + "grad_norm": 0.8283377289772034, + "learning_rate": 4.3369153278652765e-07, + "loss": 0.6603, + "step": 4314 + }, + { + "epoch": 0.8704889250331761, + "grad_norm": 1.3900506496429443, + "learning_rate": 4.323613230741236e-07, + "loss": 0.8108, + "step": 4315 + }, + { + "epoch": 0.870690660589383, + "grad_norm": 0.36297035217285156, + "learning_rate": 4.310330642986382e-07, + "loss": 0.7027, + "step": 4316 + }, + { + "epoch": 0.87089239614559, + "grad_norm": 0.6657307744026184, + "learning_rate": 4.2970675702739997e-07, + "loss": 0.6216, + "step": 4317 + }, + { + "epoch": 0.8710941317017971, + "grad_norm": 0.40114277601242065, + "learning_rate": 4.283824018269045e-07, + "loss": 0.878, + "step": 4318 + }, + { + "epoch": 0.871295867258004, + "grad_norm": 0.3651154041290283, + "learning_rate": 4.270599992628116e-07, + "loss": 0.678, + "step": 4319 + }, + { + "epoch": 0.871497602814211, + "grad_norm": 0.4124424457550049, + "learning_rate": 4.257395498999478e-07, + "loss": 0.6754, + "step": 4320 + }, + { + "epoch": 0.871699338370418, + "grad_norm": 0.42094066739082336, + "learning_rate": 4.244210543023053e-07, + "loss": 0.6665, + "step": 4321 + }, + { + "epoch": 0.871901073926625, + "grad_norm": 0.35365965962409973, + "learning_rate": 4.231045130330419e-07, + "loss": 0.7355, + "step": 4322 + }, + { + "epoch": 0.872102809482832, + "grad_norm": 0.3624404966831207, + "learning_rate": 4.2178992665448226e-07, + "loss": 0.7504, + "step": 4323 + }, + { + "epoch": 0.872304545039039, + "grad_norm": 0.6722658276557922, + "learning_rate": 4.204772957281128e-07, + "loss": 0.7163, + "step": 4324 + }, + { + "epoch": 0.8725062805952459, + "grad_norm": 0.3923650085926056, + "learning_rate": 4.191666208145867e-07, + "loss": 0.6459, + "step": 4325 + }, + { + "epoch": 0.872708016151453, + "grad_norm": 0.42511051893234253, + "learning_rate": 4.1785790247372226e-07, + "loss": 0.7555, + "step": 4326 + }, + { + "epoch": 0.8729097517076599, + "grad_norm": 0.4749056398868561, + "learning_rate": 4.1655114126450125e-07, + "loss": 0.661, + "step": 4327 + }, + { + "epoch": 0.873111487263867, + "grad_norm": 0.31862133741378784, + "learning_rate": 4.152463377450683e-07, + "loss": 0.6398, + "step": 4328 + }, + { + "epoch": 0.873313222820074, + "grad_norm": 0.48377200961112976, + "learning_rate": 4.139434924727359e-07, + "loss": 0.6006, + "step": 4329 + }, + { + "epoch": 0.8735149583762809, + "grad_norm": 0.43817344307899475, + "learning_rate": 4.1264260600397343e-07, + "loss": 0.6488, + "step": 4330 + }, + { + "epoch": 0.873716693932488, + "grad_norm": 0.8320906758308411, + "learning_rate": 4.113436788944197e-07, + "loss": 0.6923, + "step": 4331 + }, + { + "epoch": 0.8739184294886949, + "grad_norm": 0.6215797066688538, + "learning_rate": 4.10046711698876e-07, + "loss": 0.7747, + "step": 4332 + }, + { + "epoch": 0.874120165044902, + "grad_norm": 0.45883363485336304, + "learning_rate": 4.0875170497130135e-07, + "loss": 0.697, + "step": 4333 + }, + { + "epoch": 0.8743219006011089, + "grad_norm": 0.4302061200141907, + "learning_rate": 4.074586592648244e-07, + "loss": 0.6485, + "step": 4334 + }, + { + "epoch": 0.8745236361573159, + "grad_norm": 0.4158835709095001, + "learning_rate": 4.0616757513173123e-07, + "loss": 0.6602, + "step": 4335 + }, + { + "epoch": 0.8747253717135229, + "grad_norm": 0.5009157061576843, + "learning_rate": 4.048784531234706e-07, + "loss": 0.6721, + "step": 4336 + }, + { + "epoch": 0.8749271072697299, + "grad_norm": 0.37360620498657227, + "learning_rate": 4.035912937906578e-07, + "loss": 0.6843, + "step": 4337 + }, + { + "epoch": 0.8751288428259368, + "grad_norm": 0.447135865688324, + "learning_rate": 4.023060976830623e-07, + "loss": 0.6044, + "step": 4338 + }, + { + "epoch": 0.8753305783821439, + "grad_norm": 0.336240291595459, + "learning_rate": 4.010228653496207e-07, + "loss": 0.8823, + "step": 4339 + }, + { + "epoch": 0.8755323139383508, + "grad_norm": 0.338085800409317, + "learning_rate": 3.997415973384311e-07, + "loss": 0.6601, + "step": 4340 + }, + { + "epoch": 0.8757340494945579, + "grad_norm": 0.6273344159126282, + "learning_rate": 3.9846229419674754e-07, + "loss": 0.6359, + "step": 4341 + }, + { + "epoch": 0.8759357850507649, + "grad_norm": 0.42052680253982544, + "learning_rate": 3.9718495647099007e-07, + "loss": 0.7733, + "step": 4342 + }, + { + "epoch": 0.8761375206069718, + "grad_norm": 0.373045951128006, + "learning_rate": 3.9590958470673626e-07, + "loss": 0.7387, + "step": 4343 + }, + { + "epoch": 0.8763392561631789, + "grad_norm": 0.926173985004425, + "learning_rate": 3.9463617944872465e-07, + "loss": 0.6867, + "step": 4344 + }, + { + "epoch": 0.8765409917193858, + "grad_norm": 0.5256276726722717, + "learning_rate": 3.933647412408548e-07, + "loss": 0.6511, + "step": 4345 + }, + { + "epoch": 0.8767427272755929, + "grad_norm": 0.8077823519706726, + "learning_rate": 3.920952706261855e-07, + "loss": 0.7477, + "step": 4346 + }, + { + "epoch": 0.8769444628317998, + "grad_norm": 0.567271888256073, + "learning_rate": 3.9082776814693355e-07, + "loss": 0.798, + "step": 4347 + }, + { + "epoch": 0.8771461983880068, + "grad_norm": 0.3140374720096588, + "learning_rate": 3.8956223434447936e-07, + "loss": 0.7335, + "step": 4348 + }, + { + "epoch": 0.8773479339442138, + "grad_norm": 0.9564724564552307, + "learning_rate": 3.8829866975935603e-07, + "loss": 0.7208, + "step": 4349 + }, + { + "epoch": 0.8775496695004208, + "grad_norm": 0.6215482354164124, + "learning_rate": 3.870370749312624e-07, + "loss": 0.7061, + "step": 4350 + }, + { + "epoch": 0.8777514050566279, + "grad_norm": 0.33347561955451965, + "learning_rate": 3.857774503990513e-07, + "loss": 0.7846, + "step": 4351 + }, + { + "epoch": 0.8779531406128348, + "grad_norm": 1.3723175525665283, + "learning_rate": 3.845197967007347e-07, + "loss": 0.656, + "step": 4352 + }, + { + "epoch": 0.8781548761690418, + "grad_norm": 0.5448706746101379, + "learning_rate": 3.832641143734861e-07, + "loss": 0.6686, + "step": 4353 + }, + { + "epoch": 0.8783566117252488, + "grad_norm": 0.7323155999183655, + "learning_rate": 3.820104039536326e-07, + "loss": 0.6493, + "step": 4354 + }, + { + "epoch": 0.8785583472814558, + "grad_norm": 1.221942663192749, + "learning_rate": 3.8075866597666044e-07, + "loss": 0.7042, + "step": 4355 + }, + { + "epoch": 0.8787600828376627, + "grad_norm": 0.4284684360027313, + "learning_rate": 3.795089009772157e-07, + "loss": 0.7094, + "step": 4356 + }, + { + "epoch": 0.8789618183938698, + "grad_norm": 0.5355064272880554, + "learning_rate": 3.782611094890992e-07, + "loss": 0.646, + "step": 4357 + }, + { + "epoch": 0.8791635539500767, + "grad_norm": 0.4629518687725067, + "learning_rate": 3.7701529204526856e-07, + "loss": 0.6725, + "step": 4358 + }, + { + "epoch": 0.8793652895062838, + "grad_norm": 0.356670618057251, + "learning_rate": 3.757714491778419e-07, + "loss": 0.6576, + "step": 4359 + }, + { + "epoch": 0.8795670250624907, + "grad_norm": 0.5200281143188477, + "learning_rate": 3.745295814180877e-07, + "loss": 0.6276, + "step": 4360 + }, + { + "epoch": 0.8797687606186977, + "grad_norm": 0.5747634768486023, + "learning_rate": 3.7328968929643714e-07, + "loss": 0.7729, + "step": 4361 + }, + { + "epoch": 0.8799704961749047, + "grad_norm": 0.41098442673683167, + "learning_rate": 3.7205177334247445e-07, + "loss": 0.682, + "step": 4362 + }, + { + "epoch": 0.8801722317311117, + "grad_norm": 0.3811708986759186, + "learning_rate": 3.7081583408493883e-07, + "loss": 0.7493, + "step": 4363 + }, + { + "epoch": 0.8803739672873188, + "grad_norm": 0.5595073103904724, + "learning_rate": 3.69581872051728e-07, + "loss": 0.793, + "step": 4364 + }, + { + "epoch": 0.8805757028435257, + "grad_norm": 0.410376638174057, + "learning_rate": 3.6834988776989323e-07, + "loss": 0.6694, + "step": 4365 + }, + { + "epoch": 0.8807774383997327, + "grad_norm": 0.36430269479751587, + "learning_rate": 3.671198817656413e-07, + "loss": 0.6765, + "step": 4366 + }, + { + "epoch": 0.8809791739559397, + "grad_norm": 0.6053842902183533, + "learning_rate": 3.658918545643353e-07, + "loss": 0.8619, + "step": 4367 + }, + { + "epoch": 0.8811809095121467, + "grad_norm": 0.6663161516189575, + "learning_rate": 3.6466580669049123e-07, + "loss": 0.7408, + "step": 4368 + }, + { + "epoch": 0.8813826450683537, + "grad_norm": 0.41885098814964294, + "learning_rate": 3.6344173866778075e-07, + "loss": 0.6584, + "step": 4369 + }, + { + "epoch": 0.8815843806245607, + "grad_norm": 0.5261691808700562, + "learning_rate": 3.62219651019029e-07, + "loss": 0.6922, + "step": 4370 + }, + { + "epoch": 0.8817861161807676, + "grad_norm": 0.4592227339744568, + "learning_rate": 3.609995442662173e-07, + "loss": 0.679, + "step": 4371 + }, + { + "epoch": 0.8819878517369747, + "grad_norm": 0.4438103139400482, + "learning_rate": 3.597814189304788e-07, + "loss": 0.7043, + "step": 4372 + }, + { + "epoch": 0.8821895872931816, + "grad_norm": 0.5406025052070618, + "learning_rate": 3.585652755321012e-07, + "loss": 0.733, + "step": 4373 + }, + { + "epoch": 0.8823913228493886, + "grad_norm": 0.3534887731075287, + "learning_rate": 3.573511145905245e-07, + "loss": 0.6839, + "step": 4374 + }, + { + "epoch": 0.8825930584055957, + "grad_norm": 0.4145587980747223, + "learning_rate": 3.561389366243451e-07, + "loss": 0.7142, + "step": 4375 + }, + { + "epoch": 0.8827947939618026, + "grad_norm": 0.5214899182319641, + "learning_rate": 3.5492874215130926e-07, + "loss": 0.766, + "step": 4376 + }, + { + "epoch": 0.8829965295180097, + "grad_norm": 0.38252052664756775, + "learning_rate": 3.5372053168831744e-07, + "loss": 0.6699, + "step": 4377 + }, + { + "epoch": 0.8831982650742166, + "grad_norm": 0.5145108699798584, + "learning_rate": 3.5251430575142074e-07, + "loss": 0.6906, + "step": 4378 + }, + { + "epoch": 0.8834000006304236, + "grad_norm": 0.27400287985801697, + "learning_rate": 3.5131006485582653e-07, + "loss": 0.6277, + "step": 4379 + }, + { + "epoch": 0.8836017361866306, + "grad_norm": 0.4707011878490448, + "learning_rate": 3.501078095158911e-07, + "loss": 0.6461, + "step": 4380 + }, + { + "epoch": 0.8838034717428376, + "grad_norm": 0.3479291498661041, + "learning_rate": 3.4890754024512254e-07, + "loss": 0.7901, + "step": 4381 + }, + { + "epoch": 0.8840052072990446, + "grad_norm": 0.6620512008666992, + "learning_rate": 3.477092575561836e-07, + "loss": 0.6257, + "step": 4382 + }, + { + "epoch": 0.8842069428552516, + "grad_norm": 0.3701179623603821, + "learning_rate": 3.465129619608859e-07, + "loss": 0.6799, + "step": 4383 + }, + { + "epoch": 0.8844086784114585, + "grad_norm": 0.876181423664093, + "learning_rate": 3.453186539701925e-07, + "loss": 0.719, + "step": 4384 + }, + { + "epoch": 0.8846104139676656, + "grad_norm": 0.9063430428504944, + "learning_rate": 3.441263340942197e-07, + "loss": 0.6176, + "step": 4385 + }, + { + "epoch": 0.8848121495238725, + "grad_norm": 0.4452477693557739, + "learning_rate": 3.429360028422307e-07, + "loss": 0.6694, + "step": 4386 + }, + { + "epoch": 0.8850138850800796, + "grad_norm": 0.3298512399196625, + "learning_rate": 3.4174766072264333e-07, + "loss": 0.6696, + "step": 4387 + }, + { + "epoch": 0.8852156206362866, + "grad_norm": 0.8097718358039856, + "learning_rate": 3.405613082430237e-07, + "loss": 0.6662, + "step": 4388 + }, + { + "epoch": 0.8854173561924935, + "grad_norm": 0.39331236481666565, + "learning_rate": 3.393769459100876e-07, + "loss": 0.7255, + "step": 4389 + }, + { + "epoch": 0.8856190917487006, + "grad_norm": 0.33854588866233826, + "learning_rate": 3.3819457422970327e-07, + "loss": 0.7015, + "step": 4390 + }, + { + "epoch": 0.8858208273049075, + "grad_norm": 0.4164811074733734, + "learning_rate": 3.3701419370688657e-07, + "loss": 0.6659, + "step": 4391 + }, + { + "epoch": 0.8860225628611145, + "grad_norm": 0.40010184049606323, + "learning_rate": 3.3583580484580215e-07, + "loss": 0.779, + "step": 4392 + }, + { + "epoch": 0.8862242984173215, + "grad_norm": 0.3427859842777252, + "learning_rate": 3.3465940814976784e-07, + "loss": 0.6434, + "step": 4393 + }, + { + "epoch": 0.8864260339735285, + "grad_norm": 0.4118961691856384, + "learning_rate": 3.334850041212462e-07, + "loss": 0.6414, + "step": 4394 + }, + { + "epoch": 0.8866277695297355, + "grad_norm": 0.576214611530304, + "learning_rate": 3.3231259326184983e-07, + "loss": 0.7513, + "step": 4395 + }, + { + "epoch": 0.8868295050859425, + "grad_norm": 0.45370420813560486, + "learning_rate": 3.311421760723438e-07, + "loss": 0.7095, + "step": 4396 + }, + { + "epoch": 0.8870312406421494, + "grad_norm": 0.37329503893852234, + "learning_rate": 3.299737530526348e-07, + "loss": 0.6451, + "step": 4397 + }, + { + "epoch": 0.8872329761983565, + "grad_norm": 0.4043911099433899, + "learning_rate": 3.2880732470178366e-07, + "loss": 0.6876, + "step": 4398 + }, + { + "epoch": 0.8874347117545635, + "grad_norm": 0.4774644672870636, + "learning_rate": 3.276428915179969e-07, + "loss": 0.7071, + "step": 4399 + }, + { + "epoch": 0.8876364473107705, + "grad_norm": 0.30243927240371704, + "learning_rate": 3.264804539986283e-07, + "loss": 0.6641, + "step": 4400 + }, + { + "epoch": 0.8878381828669775, + "grad_norm": 0.7250840067863464, + "learning_rate": 3.2532001264018067e-07, + "loss": 0.8407, + "step": 4401 + }, + { + "epoch": 0.8880399184231844, + "grad_norm": 1.1073092222213745, + "learning_rate": 3.241615679383031e-07, + "loss": 0.6304, + "step": 4402 + }, + { + "epoch": 0.8882416539793915, + "grad_norm": 0.6125013828277588, + "learning_rate": 3.2300512038779155e-07, + "loss": 0.849, + "step": 4403 + }, + { + "epoch": 0.8884433895355984, + "grad_norm": 0.3922661542892456, + "learning_rate": 3.2185067048259245e-07, + "loss": 0.7053, + "step": 4404 + }, + { + "epoch": 0.8886451250918055, + "grad_norm": 0.5089098215103149, + "learning_rate": 3.2069821871579255e-07, + "loss": 0.7376, + "step": 4405 + }, + { + "epoch": 0.8888468606480124, + "grad_norm": 4.080984592437744, + "learning_rate": 3.1954776557963086e-07, + "loss": 0.6724, + "step": 4406 + }, + { + "epoch": 0.8890485962042194, + "grad_norm": 0.42470672726631165, + "learning_rate": 3.183993115654921e-07, + "loss": 0.8208, + "step": 4407 + }, + { + "epoch": 0.8892503317604264, + "grad_norm": 0.38804891705513, + "learning_rate": 3.172528571639022e-07, + "loss": 0.6946, + "step": 4408 + }, + { + "epoch": 0.8894520673166334, + "grad_norm": 0.7295994758605957, + "learning_rate": 3.161084028645395e-07, + "loss": 0.7716, + "step": 4409 + }, + { + "epoch": 0.8896538028728405, + "grad_norm": 0.3722836971282959, + "learning_rate": 3.1496594915622405e-07, + "loss": 0.6619, + "step": 4410 + }, + { + "epoch": 0.8898555384290474, + "grad_norm": 0.42691469192504883, + "learning_rate": 3.1382549652692164e-07, + "loss": 0.7169, + "step": 4411 + }, + { + "epoch": 0.8900572739852544, + "grad_norm": 0.5380913615226746, + "learning_rate": 3.126870454637454e-07, + "loss": 0.6632, + "step": 4412 + }, + { + "epoch": 0.8902590095414614, + "grad_norm": 0.4028385579586029, + "learning_rate": 3.115505964529519e-07, + "loss": 0.7287, + "step": 4413 + }, + { + "epoch": 0.8904607450976684, + "grad_norm": 0.45696067810058594, + "learning_rate": 3.1041614997994295e-07, + "loss": 0.6113, + "step": 4414 + }, + { + "epoch": 0.8906624806538753, + "grad_norm": 1.2479190826416016, + "learning_rate": 3.0928370652926586e-07, + "loss": 0.731, + "step": 4415 + }, + { + "epoch": 0.8908642162100824, + "grad_norm": 0.767214298248291, + "learning_rate": 3.0815326658460986e-07, + "loss": 0.6698, + "step": 4416 + }, + { + "epoch": 0.8910659517662893, + "grad_norm": 0.46267926692962646, + "learning_rate": 3.0702483062881206e-07, + "loss": 0.7059, + "step": 4417 + }, + { + "epoch": 0.8912676873224964, + "grad_norm": 0.751915693283081, + "learning_rate": 3.058983991438508e-07, + "loss": 0.7794, + "step": 4418 + }, + { + "epoch": 0.8914694228787033, + "grad_norm": 0.42991894483566284, + "learning_rate": 3.047739726108484e-07, + "loss": 0.6285, + "step": 4419 + }, + { + "epoch": 0.8916711584349103, + "grad_norm": 0.40284815430641174, + "learning_rate": 3.036515515100735e-07, + "loss": 0.6686, + "step": 4420 + }, + { + "epoch": 0.8918728939911174, + "grad_norm": 0.3892557919025421, + "learning_rate": 3.02531136320936e-07, + "loss": 0.6799, + "step": 4421 + }, + { + "epoch": 0.8920746295473243, + "grad_norm": 0.5194017291069031, + "learning_rate": 3.01412727521988e-07, + "loss": 0.6733, + "step": 4422 + }, + { + "epoch": 0.8922763651035314, + "grad_norm": 0.36624839901924133, + "learning_rate": 3.0029632559092747e-07, + "loss": 0.8639, + "step": 4423 + }, + { + "epoch": 0.8924781006597383, + "grad_norm": 0.36247923970222473, + "learning_rate": 2.991819310045929e-07, + "loss": 0.638, + "step": 4424 + }, + { + "epoch": 0.8926798362159453, + "grad_norm": 0.32094642519950867, + "learning_rate": 2.9806954423896696e-07, + "loss": 0.6623, + "step": 4425 + }, + { + "epoch": 0.8928815717721523, + "grad_norm": 0.5981245040893555, + "learning_rate": 2.9695916576917285e-07, + "loss": 0.7036, + "step": 4426 + }, + { + "epoch": 0.8930833073283593, + "grad_norm": 0.4451741874217987, + "learning_rate": 2.9585079606947843e-07, + "loss": 0.8021, + "step": 4427 + }, + { + "epoch": 0.8932850428845663, + "grad_norm": 0.39480602741241455, + "learning_rate": 2.947444356132917e-07, + "loss": 0.8903, + "step": 4428 + }, + { + "epoch": 0.8934867784407733, + "grad_norm": 0.6016244292259216, + "learning_rate": 2.93640084873163e-07, + "loss": 0.6515, + "step": 4429 + }, + { + "epoch": 0.8936885139969802, + "grad_norm": 0.34783855080604553, + "learning_rate": 2.9253774432078384e-07, + "loss": 0.9107, + "step": 4430 + }, + { + "epoch": 0.8938902495531873, + "grad_norm": 0.42876172065734863, + "learning_rate": 2.914374144269888e-07, + "loss": 0.6776, + "step": 4431 + }, + { + "epoch": 0.8940919851093942, + "grad_norm": 0.909610390663147, + "learning_rate": 2.903390956617519e-07, + "loss": 0.7213, + "step": 4432 + }, + { + "epoch": 0.8942937206656012, + "grad_norm": 1.040679693222046, + "learning_rate": 2.8924278849418784e-07, + "loss": 0.7602, + "step": 4433 + }, + { + "epoch": 0.8944954562218083, + "grad_norm": 0.5264043211936951, + "learning_rate": 2.881484933925549e-07, + "loss": 0.649, + "step": 4434 + }, + { + "epoch": 0.8946971917780152, + "grad_norm": 0.4022894501686096, + "learning_rate": 2.870562108242486e-07, + "loss": 0.7097, + "step": 4435 + }, + { + "epoch": 0.8948989273342223, + "grad_norm": 0.41743919253349304, + "learning_rate": 2.8596594125580745e-07, + "loss": 0.6597, + "step": 4436 + }, + { + "epoch": 0.8951006628904292, + "grad_norm": 0.8420581221580505, + "learning_rate": 2.8487768515290783e-07, + "loss": 0.6856, + "step": 4437 + }, + { + "epoch": 0.8953023984466362, + "grad_norm": 0.7341222763061523, + "learning_rate": 2.8379144298036845e-07, + "loss": 0.774, + "step": 4438 + }, + { + "epoch": 0.8955041340028432, + "grad_norm": 3.0229249000549316, + "learning_rate": 2.827072152021465e-07, + "loss": 0.6685, + "step": 4439 + }, + { + "epoch": 0.8957058695590502, + "grad_norm": 0.44219347834587097, + "learning_rate": 2.816250022813383e-07, + "loss": 0.8718, + "step": 4440 + }, + { + "epoch": 0.8959076051152572, + "grad_norm": 0.7286803722381592, + "learning_rate": 2.8054480468018117e-07, + "loss": 0.6332, + "step": 4441 + }, + { + "epoch": 0.8961093406714642, + "grad_norm": 0.4334128797054291, + "learning_rate": 2.7946662286005124e-07, + "loss": 0.6492, + "step": 4442 + }, + { + "epoch": 0.8963110762276711, + "grad_norm": 0.45339667797088623, + "learning_rate": 2.783904572814622e-07, + "loss": 0.7465, + "step": 4443 + }, + { + "epoch": 0.8965128117838782, + "grad_norm": 0.5342560410499573, + "learning_rate": 2.7731630840406754e-07, + "loss": 0.6591, + "step": 4444 + }, + { + "epoch": 0.8967145473400852, + "grad_norm": 0.3707166910171509, + "learning_rate": 2.7624417668665917e-07, + "loss": 0.6863, + "step": 4445 + }, + { + "epoch": 0.8969162828962922, + "grad_norm": 0.4466363787651062, + "learning_rate": 2.751740625871691e-07, + "loss": 0.727, + "step": 4446 + }, + { + "epoch": 0.8971180184524992, + "grad_norm": 0.5713307857513428, + "learning_rate": 2.7410596656266497e-07, + "loss": 0.6592, + "step": 4447 + }, + { + "epoch": 0.8973197540087061, + "grad_norm": 0.4353967010974884, + "learning_rate": 2.730398890693536e-07, + "loss": 0.6307, + "step": 4448 + }, + { + "epoch": 0.8975214895649132, + "grad_norm": 0.4011313021183014, + "learning_rate": 2.7197583056258027e-07, + "loss": 0.6981, + "step": 4449 + }, + { + "epoch": 0.8977232251211201, + "grad_norm": 0.37026599049568176, + "learning_rate": 2.7091379149682683e-07, + "loss": 0.8362, + "step": 4450 + }, + { + "epoch": 0.8979249606773271, + "grad_norm": 0.4338156580924988, + "learning_rate": 2.698537723257127e-07, + "loss": 0.7418, + "step": 4451 + }, + { + "epoch": 0.8981266962335341, + "grad_norm": 0.539256751537323, + "learning_rate": 2.687957735019969e-07, + "loss": 0.6242, + "step": 4452 + }, + { + "epoch": 0.8983284317897411, + "grad_norm": 0.4187043607234955, + "learning_rate": 2.6773979547757013e-07, + "loss": 0.6234, + "step": 4453 + }, + { + "epoch": 0.8985301673459481, + "grad_norm": 0.34601306915283203, + "learning_rate": 2.666858387034654e-07, + "loss": 0.6453, + "step": 4454 + }, + { + "epoch": 0.8987319029021551, + "grad_norm": 0.44681617617607117, + "learning_rate": 2.656339036298522e-07, + "loss": 0.6852, + "step": 4455 + }, + { + "epoch": 0.898933638458362, + "grad_norm": 1.701403021812439, + "learning_rate": 2.6458399070603047e-07, + "loss": 0.6536, + "step": 4456 + }, + { + "epoch": 0.8991353740145691, + "grad_norm": 0.5173128843307495, + "learning_rate": 2.635361003804443e-07, + "loss": 0.6447, + "step": 4457 + }, + { + "epoch": 0.8993371095707761, + "grad_norm": 1.2008379697799683, + "learning_rate": 2.6249023310066845e-07, + "loss": 0.7079, + "step": 4458 + }, + { + "epoch": 0.8995388451269831, + "grad_norm": 0.9323862195014954, + "learning_rate": 2.6144638931341503e-07, + "loss": 0.6737, + "step": 4459 + }, + { + "epoch": 0.8997405806831901, + "grad_norm": 0.3327551484107971, + "learning_rate": 2.604045694645341e-07, + "loss": 0.6379, + "step": 4460 + }, + { + "epoch": 0.899942316239397, + "grad_norm": 0.4596746265888214, + "learning_rate": 2.593647739990068e-07, + "loss": 0.6822, + "step": 4461 + }, + { + "epoch": 0.9001440517956041, + "grad_norm": 0.6696921586990356, + "learning_rate": 2.583270033609536e-07, + "loss": 0.687, + "step": 4462 + }, + { + "epoch": 0.900345787351811, + "grad_norm": 0.48210087418556213, + "learning_rate": 2.572912579936304e-07, + "loss": 0.7884, + "step": 4463 + }, + { + "epoch": 0.9005475229080181, + "grad_norm": 0.8259105086326599, + "learning_rate": 2.5625753833942337e-07, + "loss": 0.78, + "step": 4464 + }, + { + "epoch": 0.900749258464225, + "grad_norm": 0.40220457315444946, + "learning_rate": 2.552258448398576e-07, + "loss": 0.6503, + "step": 4465 + }, + { + "epoch": 0.900950994020432, + "grad_norm": 0.4821997880935669, + "learning_rate": 2.5419617793559224e-07, + "loss": 0.6525, + "step": 4466 + }, + { + "epoch": 0.901152729576639, + "grad_norm": 1.67778480052948, + "learning_rate": 2.5316853806641895e-07, + "loss": 0.6351, + "step": 4467 + }, + { + "epoch": 0.901354465132846, + "grad_norm": 0.44449669122695923, + "learning_rate": 2.521429256712665e-07, + "loss": 0.6526, + "step": 4468 + }, + { + "epoch": 0.901556200689053, + "grad_norm": 0.5264053344726562, + "learning_rate": 2.5111934118819514e-07, + "loss": 0.7608, + "step": 4469 + }, + { + "epoch": 0.90175793624526, + "grad_norm": 0.47850438952445984, + "learning_rate": 2.5009778505439895e-07, + "loss": 0.6309, + "step": 4470 + }, + { + "epoch": 0.901959671801467, + "grad_norm": 0.8120545744895935, + "learning_rate": 2.49078257706209e-07, + "loss": 0.6336, + "step": 4471 + }, + { + "epoch": 0.902161407357674, + "grad_norm": 0.4928573966026306, + "learning_rate": 2.480607595790846e-07, + "loss": 0.7334, + "step": 4472 + }, + { + "epoch": 0.902363142913881, + "grad_norm": 0.582403302192688, + "learning_rate": 2.470452911076227e-07, + "loss": 0.6507, + "step": 4473 + }, + { + "epoch": 0.9025648784700879, + "grad_norm": 0.5206587910652161, + "learning_rate": 2.460318527255523e-07, + "loss": 0.6185, + "step": 4474 + }, + { + "epoch": 0.902766614026295, + "grad_norm": 1.0327239036560059, + "learning_rate": 2.450204448657328e-07, + "loss": 0.6549, + "step": 4475 + }, + { + "epoch": 0.9029683495825019, + "grad_norm": 0.4468756914138794, + "learning_rate": 2.4401106796016037e-07, + "loss": 0.6491, + "step": 4476 + }, + { + "epoch": 0.903170085138709, + "grad_norm": 0.35880833864212036, + "learning_rate": 2.430037224399606e-07, + "loss": 0.6615, + "step": 4477 + }, + { + "epoch": 0.903371820694916, + "grad_norm": 0.6757314801216125, + "learning_rate": 2.4199840873539217e-07, + "loss": 0.6449, + "step": 4478 + }, + { + "epoch": 0.9035735562511229, + "grad_norm": 0.9163317084312439, + "learning_rate": 2.409951272758471e-07, + "loss": 0.6517, + "step": 4479 + }, + { + "epoch": 0.90377529180733, + "grad_norm": 0.4744824171066284, + "learning_rate": 2.399938784898481e-07, + "loss": 0.6863, + "step": 4480 + }, + { + "epoch": 0.9039770273635369, + "grad_norm": 0.36794766783714294, + "learning_rate": 2.3899466280504936e-07, + "loss": 0.6666, + "step": 4481 + }, + { + "epoch": 0.904178762919744, + "grad_norm": 0.6534149646759033, + "learning_rate": 2.3799748064823935e-07, + "loss": 0.6995, + "step": 4482 + }, + { + "epoch": 0.9043804984759509, + "grad_norm": 0.3607742190361023, + "learning_rate": 2.3700233244533412e-07, + "loss": 0.6806, + "step": 4483 + }, + { + "epoch": 0.9045822340321579, + "grad_norm": 0.4232555627822876, + "learning_rate": 2.3600921862138414e-07, + "loss": 0.6792, + "step": 4484 + }, + { + "epoch": 0.9047839695883649, + "grad_norm": 0.6024295687675476, + "learning_rate": 2.3501813960056962e-07, + "loss": 0.6654, + "step": 4485 + }, + { + "epoch": 0.9049857051445719, + "grad_norm": 0.6922175288200378, + "learning_rate": 2.3402909580620025e-07, + "loss": 0.7664, + "step": 4486 + }, + { + "epoch": 0.9051874407007788, + "grad_norm": 0.8140506148338318, + "learning_rate": 2.330420876607198e-07, + "loss": 0.7117, + "step": 4487 + }, + { + "epoch": 0.9053891762569859, + "grad_norm": 0.7923047542572021, + "learning_rate": 2.3205711558570043e-07, + "loss": 0.6498, + "step": 4488 + }, + { + "epoch": 0.9055909118131928, + "grad_norm": 0.9481064677238464, + "learning_rate": 2.3107418000184345e-07, + "loss": 0.668, + "step": 4489 + }, + { + "epoch": 0.9057926473693999, + "grad_norm": 0.44208043813705444, + "learning_rate": 2.3009328132898355e-07, + "loss": 0.8772, + "step": 4490 + }, + { + "epoch": 0.9059943829256069, + "grad_norm": 0.6147819757461548, + "learning_rate": 2.2911441998608342e-07, + "loss": 0.6579, + "step": 4491 + }, + { + "epoch": 0.9061961184818138, + "grad_norm": 0.39272868633270264, + "learning_rate": 2.2813759639123577e-07, + "loss": 0.6526, + "step": 4492 + }, + { + "epoch": 0.9063978540380209, + "grad_norm": 0.5846182703971863, + "learning_rate": 2.2716281096166137e-07, + "loss": 0.8201, + "step": 4493 + }, + { + "epoch": 0.9065995895942278, + "grad_norm": 0.3155399262905121, + "learning_rate": 2.2619006411371437e-07, + "loss": 0.6852, + "step": 4494 + }, + { + "epoch": 0.9068013251504349, + "grad_norm": 0.38381192088127136, + "learning_rate": 2.2521935626287516e-07, + "loss": 0.7007, + "step": 4495 + }, + { + "epoch": 0.9070030607066418, + "grad_norm": 0.6401692628860474, + "learning_rate": 2.242506878237538e-07, + "loss": 0.6858, + "step": 4496 + }, + { + "epoch": 0.9072047962628488, + "grad_norm": 0.540847659111023, + "learning_rate": 2.2328405921008877e-07, + "loss": 0.7831, + "step": 4497 + }, + { + "epoch": 0.9074065318190558, + "grad_norm": 0.4094178080558777, + "learning_rate": 2.2231947083474925e-07, + "loss": 0.765, + "step": 4498 + }, + { + "epoch": 0.9076082673752628, + "grad_norm": 0.38613298535346985, + "learning_rate": 2.213569231097312e-07, + "loss": 0.7178, + "step": 4499 + }, + { + "epoch": 0.9078100029314699, + "grad_norm": 1.2126609086990356, + "learning_rate": 2.203964164461597e-07, + "loss": 0.6322, + "step": 4500 + }, + { + "epoch": 0.9080117384876768, + "grad_norm": 0.5331121683120728, + "learning_rate": 2.1943795125428659e-07, + "loss": 0.7294, + "step": 4501 + }, + { + "epoch": 0.9082134740438838, + "grad_norm": 0.3965674042701721, + "learning_rate": 2.1848152794349487e-07, + "loss": 0.6751, + "step": 4502 + }, + { + "epoch": 0.9084152096000908, + "grad_norm": 0.3532261550426483, + "learning_rate": 2.1752714692229282e-07, + "loss": 0.8164, + "step": 4503 + }, + { + "epoch": 0.9086169451562978, + "grad_norm": 1.104665756225586, + "learning_rate": 2.1657480859831603e-07, + "loss": 0.6374, + "step": 4504 + }, + { + "epoch": 0.9088186807125048, + "grad_norm": 0.8871171474456787, + "learning_rate": 2.156245133783308e-07, + "loss": 0.7202, + "step": 4505 + }, + { + "epoch": 0.9090204162687118, + "grad_norm": 0.5117438435554504, + "learning_rate": 2.1467626166822742e-07, + "loss": 0.8585, + "step": 4506 + }, + { + "epoch": 0.9092221518249187, + "grad_norm": 0.5775021910667419, + "learning_rate": 2.1373005387302416e-07, + "loss": 0.7866, + "step": 4507 + }, + { + "epoch": 0.9094238873811258, + "grad_norm": 0.6503050923347473, + "learning_rate": 2.127858903968677e-07, + "loss": 0.6747, + "step": 4508 + }, + { + "epoch": 0.9096256229373327, + "grad_norm": 0.4406130313873291, + "learning_rate": 2.1184377164303106e-07, + "loss": 0.656, + "step": 4509 + }, + { + "epoch": 0.9098273584935397, + "grad_norm": 0.41005194187164307, + "learning_rate": 2.1090369801391231e-07, + "loss": 0.7006, + "step": 4510 + }, + { + "epoch": 0.9100290940497467, + "grad_norm": 0.38209623098373413, + "learning_rate": 2.0996566991103752e-07, + "loss": 0.6916, + "step": 4511 + }, + { + "epoch": 0.9102308296059537, + "grad_norm": 0.49952998757362366, + "learning_rate": 2.0902968773505838e-07, + "loss": 0.7505, + "step": 4512 + }, + { + "epoch": 0.9104325651621608, + "grad_norm": 0.6335734724998474, + "learning_rate": 2.0809575188575404e-07, + "loss": 0.6507, + "step": 4513 + }, + { + "epoch": 0.9106343007183677, + "grad_norm": 0.46581634879112244, + "learning_rate": 2.0716386276202815e-07, + "loss": 0.6842, + "step": 4514 + }, + { + "epoch": 0.9108360362745747, + "grad_norm": 0.437761515378952, + "learning_rate": 2.0623402076190956e-07, + "loss": 0.6598, + "step": 4515 + }, + { + "epoch": 0.9110377718307817, + "grad_norm": 0.4070280194282532, + "learning_rate": 2.0530622628255613e-07, + "loss": 0.658, + "step": 4516 + }, + { + "epoch": 0.9112395073869887, + "grad_norm": 0.4778706431388855, + "learning_rate": 2.04380479720247e-07, + "loss": 0.7122, + "step": 4517 + }, + { + "epoch": 0.9114412429431957, + "grad_norm": 0.5471115708351135, + "learning_rate": 2.0345678147038807e-07, + "loss": 0.6866, + "step": 4518 + }, + { + "epoch": 0.9116429784994027, + "grad_norm": 1.0620917081832886, + "learning_rate": 2.0253513192751374e-07, + "loss": 0.6739, + "step": 4519 + }, + { + "epoch": 0.9118447140556096, + "grad_norm": 1.0109601020812988, + "learning_rate": 2.0161553148527692e-07, + "loss": 0.7746, + "step": 4520 + }, + { + "epoch": 0.9120464496118167, + "grad_norm": 0.51026850938797, + "learning_rate": 2.0069798053646005e-07, + "loss": 0.6865, + "step": 4521 + }, + { + "epoch": 0.9122481851680236, + "grad_norm": 0.48596811294555664, + "learning_rate": 1.9978247947297025e-07, + "loss": 0.7223, + "step": 4522 + }, + { + "epoch": 0.9124499207242307, + "grad_norm": 1.3675061464309692, + "learning_rate": 1.9886902868583525e-07, + "loss": 0.6901, + "step": 4523 + }, + { + "epoch": 0.9126516562804377, + "grad_norm": 0.43187323212623596, + "learning_rate": 1.9795762856521183e-07, + "loss": 0.7347, + "step": 4524 + }, + { + "epoch": 0.9128533918366446, + "grad_norm": 0.5734922885894775, + "learning_rate": 1.9704827950037753e-07, + "loss": 0.6259, + "step": 4525 + }, + { + "epoch": 0.9130551273928517, + "grad_norm": 0.36623337864875793, + "learning_rate": 1.9614098187973495e-07, + "loss": 0.6741, + "step": 4526 + }, + { + "epoch": 0.9132568629490586, + "grad_norm": 1.3819425106048584, + "learning_rate": 1.9523573609081137e-07, + "loss": 0.7965, + "step": 4527 + }, + { + "epoch": 0.9134585985052656, + "grad_norm": 0.6897246837615967, + "learning_rate": 1.9433254252025524e-07, + "loss": 0.7002, + "step": 4528 + }, + { + "epoch": 0.9136603340614726, + "grad_norm": 0.500214159488678, + "learning_rate": 1.9343140155384023e-07, + "loss": 0.9337, + "step": 4529 + }, + { + "epoch": 0.9138620696176796, + "grad_norm": 0.33126023411750793, + "learning_rate": 1.9253231357646507e-07, + "loss": 0.6866, + "step": 4530 + }, + { + "epoch": 0.9140638051738866, + "grad_norm": 0.3791239261627197, + "learning_rate": 1.9163527897214706e-07, + "loss": 0.6401, + "step": 4531 + }, + { + "epoch": 0.9142655407300936, + "grad_norm": 0.8828220963478088, + "learning_rate": 1.9074029812403084e-07, + "loss": 0.64, + "step": 4532 + }, + { + "epoch": 0.9144672762863005, + "grad_norm": 0.3454236388206482, + "learning_rate": 1.8984737141438113e-07, + "loss": 0.6586, + "step": 4533 + }, + { + "epoch": 0.9146690118425076, + "grad_norm": 0.4951605796813965, + "learning_rate": 1.889564992245857e-07, + "loss": 0.6759, + "step": 4534 + }, + { + "epoch": 0.9148707473987145, + "grad_norm": 1.244110345840454, + "learning_rate": 1.880676819351568e-07, + "loss": 0.6697, + "step": 4535 + }, + { + "epoch": 0.9150724829549216, + "grad_norm": 0.743262529373169, + "learning_rate": 1.871809199257263e-07, + "loss": 0.8218, + "step": 4536 + }, + { + "epoch": 0.9152742185111286, + "grad_norm": 0.8847272396087646, + "learning_rate": 1.8629621357504902e-07, + "loss": 0.6451, + "step": 4537 + }, + { + "epoch": 0.9154759540673355, + "grad_norm": 0.793997585773468, + "learning_rate": 1.8541356326100436e-07, + "loss": 0.7275, + "step": 4538 + }, + { + "epoch": 0.9156776896235426, + "grad_norm": 0.8958233594894409, + "learning_rate": 1.8453296936058796e-07, + "loss": 0.6335, + "step": 4539 + }, + { + "epoch": 0.9158794251797495, + "grad_norm": 0.5523783564567566, + "learning_rate": 1.8365443224992286e-07, + "loss": 0.6393, + "step": 4540 + }, + { + "epoch": 0.9160811607359566, + "grad_norm": 0.3359818458557129, + "learning_rate": 1.8277795230425054e-07, + "loss": 0.7353, + "step": 4541 + }, + { + "epoch": 0.9162828962921635, + "grad_norm": 0.5899154543876648, + "learning_rate": 1.8190352989793325e-07, + "loss": 0.7054, + "step": 4542 + }, + { + "epoch": 0.9164846318483705, + "grad_norm": 0.4736073911190033, + "learning_rate": 1.810311654044583e-07, + "loss": 0.7244, + "step": 4543 + }, + { + "epoch": 0.9166863674045775, + "grad_norm": 0.6302992701530457, + "learning_rate": 1.8016085919642934e-07, + "loss": 0.6488, + "step": 4544 + }, + { + "epoch": 0.9168881029607845, + "grad_norm": 0.7711555361747742, + "learning_rate": 1.7929261164557287e-07, + "loss": 0.7604, + "step": 4545 + }, + { + "epoch": 0.9170898385169914, + "grad_norm": 1.415159821510315, + "learning_rate": 1.7842642312273728e-07, + "loss": 0.7046, + "step": 4546 + }, + { + "epoch": 0.9172915740731985, + "grad_norm": 0.38230201601982117, + "learning_rate": 1.7756229399788993e-07, + "loss": 0.7197, + "step": 4547 + }, + { + "epoch": 0.9174933096294055, + "grad_norm": 0.4994561970233917, + "learning_rate": 1.7670022464011837e-07, + "loss": 0.8675, + "step": 4548 + }, + { + "epoch": 0.9176950451856125, + "grad_norm": 0.6563410758972168, + "learning_rate": 1.758402154176314e-07, + "loss": 1.1117, + "step": 4549 + }, + { + "epoch": 0.9178967807418195, + "grad_norm": 0.4620409905910492, + "learning_rate": 1.7498226669775854e-07, + "loss": 0.643, + "step": 4550 + }, + { + "epoch": 0.9180985162980264, + "grad_norm": 0.37006834149360657, + "learning_rate": 1.741263788469466e-07, + "loss": 0.684, + "step": 4551 + }, + { + "epoch": 0.9183002518542335, + "grad_norm": 0.32972970604896545, + "learning_rate": 1.7327255223076434e-07, + "loss": 0.7611, + "step": 4552 + }, + { + "epoch": 0.9185019874104404, + "grad_norm": 0.5621365308761597, + "learning_rate": 1.7242078721389888e-07, + "loss": 0.7297, + "step": 4553 + }, + { + "epoch": 0.9187037229666475, + "grad_norm": 0.43346965312957764, + "learning_rate": 1.7157108416015867e-07, + "loss": 0.6915, + "step": 4554 + }, + { + "epoch": 0.9189054585228544, + "grad_norm": 2.2155392169952393, + "learning_rate": 1.7072344343246948e-07, + "loss": 0.7675, + "step": 4555 + }, + { + "epoch": 0.9191071940790614, + "grad_norm": 0.7277094721794128, + "learning_rate": 1.6987786539287677e-07, + "loss": 0.642, + "step": 4556 + }, + { + "epoch": 0.9193089296352684, + "grad_norm": 0.3713609576225281, + "learning_rate": 1.6903435040254545e-07, + "loss": 0.7663, + "step": 4557 + }, + { + "epoch": 0.9195106651914754, + "grad_norm": 0.746296226978302, + "learning_rate": 1.681928988217596e-07, + "loss": 0.6506, + "step": 4558 + }, + { + "epoch": 0.9197124007476825, + "grad_norm": 0.3724304735660553, + "learning_rate": 1.6735351100992003e-07, + "loss": 0.6939, + "step": 4559 + }, + { + "epoch": 0.9199141363038894, + "grad_norm": 0.5999610424041748, + "learning_rate": 1.6651618732554774e-07, + "loss": 0.6673, + "step": 4560 + }, + { + "epoch": 0.9201158718600964, + "grad_norm": 0.8278008103370667, + "learning_rate": 1.6568092812628223e-07, + "loss": 0.7195, + "step": 4561 + }, + { + "epoch": 0.9203176074163034, + "grad_norm": 0.7241888046264648, + "learning_rate": 1.648477337688803e-07, + "loss": 0.6596, + "step": 4562 + }, + { + "epoch": 0.9205193429725104, + "grad_norm": 0.49117833375930786, + "learning_rate": 1.6401660460921675e-07, + "loss": 0.6889, + "step": 4563 + }, + { + "epoch": 0.9207210785287173, + "grad_norm": 0.990657389163971, + "learning_rate": 1.631875410022865e-07, + "loss": 0.7313, + "step": 4564 + }, + { + "epoch": 0.9209228140849244, + "grad_norm": 0.3560831546783447, + "learning_rate": 1.6236054330219853e-07, + "loss": 0.7822, + "step": 4565 + }, + { + "epoch": 0.9211245496411313, + "grad_norm": 0.41908150911331177, + "learning_rate": 1.6153561186218247e-07, + "loss": 0.7748, + "step": 4566 + }, + { + "epoch": 0.9213262851973384, + "grad_norm": 0.5297964811325073, + "learning_rate": 1.6071274703458428e-07, + "loss": 0.6574, + "step": 4567 + }, + { + "epoch": 0.9215280207535453, + "grad_norm": 0.6379546523094177, + "learning_rate": 1.5989194917086615e-07, + "loss": 0.6792, + "step": 4568 + }, + { + "epoch": 0.9217297563097523, + "grad_norm": 0.8528479337692261, + "learning_rate": 1.5907321862160985e-07, + "loss": 0.6961, + "step": 4569 + }, + { + "epoch": 0.9219314918659594, + "grad_norm": 0.396762490272522, + "learning_rate": 1.582565557365129e-07, + "loss": 0.813, + "step": 4570 + }, + { + "epoch": 0.9221332274221663, + "grad_norm": 0.3264789879322052, + "learning_rate": 1.5744196086438789e-07, + "loss": 0.7792, + "step": 4571 + }, + { + "epoch": 0.9223349629783734, + "grad_norm": 0.42081427574157715, + "learning_rate": 1.566294343531677e-07, + "loss": 0.6821, + "step": 4572 + }, + { + "epoch": 0.9225366985345803, + "grad_norm": 0.5739784240722656, + "learning_rate": 1.5581897654989963e-07, + "loss": 0.6688, + "step": 4573 + }, + { + "epoch": 0.9227384340907873, + "grad_norm": 0.38827821612358093, + "learning_rate": 1.5501058780074685e-07, + "loss": 0.6339, + "step": 4574 + }, + { + "epoch": 0.9229401696469943, + "grad_norm": 0.8106147646903992, + "learning_rate": 1.5420426845099035e-07, + "loss": 0.6988, + "step": 4575 + }, + { + "epoch": 0.9231419052032013, + "grad_norm": 0.46809065341949463, + "learning_rate": 1.5340001884502577e-07, + "loss": 0.7949, + "step": 4576 + }, + { + "epoch": 0.9233436407594083, + "grad_norm": 0.3986304998397827, + "learning_rate": 1.5259783932636608e-07, + "loss": 0.8216, + "step": 4577 + }, + { + "epoch": 0.9235453763156153, + "grad_norm": 0.4268011748790741, + "learning_rate": 1.5179773023763998e-07, + "loss": 0.6838, + "step": 4578 + }, + { + "epoch": 0.9237471118718222, + "grad_norm": 0.6723108887672424, + "learning_rate": 1.5099969192058972e-07, + "loss": 0.6683, + "step": 4579 + }, + { + "epoch": 0.9239488474280293, + "grad_norm": 4.252002716064453, + "learning_rate": 1.5020372471607593e-07, + "loss": 0.742, + "step": 4580 + }, + { + "epoch": 0.9241505829842362, + "grad_norm": 1.975690245628357, + "learning_rate": 1.4940982896407275e-07, + "loss": 0.8189, + "step": 4581 + }, + { + "epoch": 0.9243523185404432, + "grad_norm": 0.6956148743629456, + "learning_rate": 1.4861800500367007e-07, + "loss": 0.7083, + "step": 4582 + }, + { + "epoch": 0.9245540540966503, + "grad_norm": 0.56894850730896, + "learning_rate": 1.4782825317307348e-07, + "loss": 0.6524, + "step": 4583 + }, + { + "epoch": 0.9247557896528572, + "grad_norm": 0.3585595190525055, + "learning_rate": 1.4704057380960313e-07, + "loss": 0.665, + "step": 4584 + }, + { + "epoch": 0.9249575252090643, + "grad_norm": 0.48725804686546326, + "learning_rate": 1.4625496724969324e-07, + "loss": 0.6495, + "step": 4585 + }, + { + "epoch": 0.9251592607652712, + "grad_norm": 0.9633391499519348, + "learning_rate": 1.454714338288943e-07, + "loss": 0.7812, + "step": 4586 + }, + { + "epoch": 0.9253609963214782, + "grad_norm": 0.3100288212299347, + "learning_rate": 1.4468997388186857e-07, + "loss": 0.6137, + "step": 4587 + }, + { + "epoch": 0.9255627318776852, + "grad_norm": 0.49807772040367126, + "learning_rate": 1.439105877423963e-07, + "loss": 0.7728, + "step": 4588 + }, + { + "epoch": 0.9257644674338922, + "grad_norm": 0.6176745295524597, + "learning_rate": 1.4313327574336899e-07, + "loss": 0.658, + "step": 4589 + }, + { + "epoch": 0.9259662029900992, + "grad_norm": 0.7888067960739136, + "learning_rate": 1.4235803821679328e-07, + "loss": 0.6395, + "step": 4590 + }, + { + "epoch": 0.9261679385463062, + "grad_norm": 0.3679797649383545, + "learning_rate": 1.415848754937904e-07, + "loss": 0.8599, + "step": 4591 + }, + { + "epoch": 0.9263696741025131, + "grad_norm": 0.7646364569664001, + "learning_rate": 1.408137879045951e-07, + "loss": 0.708, + "step": 4592 + }, + { + "epoch": 0.9265714096587202, + "grad_norm": 0.5600611567497253, + "learning_rate": 1.4004477577855392e-07, + "loss": 0.6493, + "step": 4593 + }, + { + "epoch": 0.9267731452149272, + "grad_norm": 0.762845516204834, + "learning_rate": 1.3927783944413075e-07, + "loss": 0.6445, + "step": 4594 + }, + { + "epoch": 0.9269748807711342, + "grad_norm": 0.32098323106765747, + "learning_rate": 1.385129792288986e-07, + "loss": 0.6962, + "step": 4595 + }, + { + "epoch": 0.9271766163273412, + "grad_norm": 0.379790723323822, + "learning_rate": 1.377501954595467e-07, + "loss": 0.727, + "step": 4596 + }, + { + "epoch": 0.9273783518835481, + "grad_norm": 0.7628555297851562, + "learning_rate": 1.369894884618772e-07, + "loss": 0.7543, + "step": 4597 + }, + { + "epoch": 0.9275800874397552, + "grad_norm": 0.4495057761669159, + "learning_rate": 1.3623085856080298e-07, + "loss": 0.6668, + "step": 4598 + }, + { + "epoch": 0.9277818229959621, + "grad_norm": 0.4083491563796997, + "learning_rate": 1.3547430608035207e-07, + "loss": 0.6652, + "step": 4599 + }, + { + "epoch": 0.9279835585521692, + "grad_norm": 0.49711883068084717, + "learning_rate": 1.3471983134366374e-07, + "loss": 0.6794, + "step": 4600 + }, + { + "epoch": 0.9281852941083761, + "grad_norm": 0.6337301135063171, + "learning_rate": 1.3396743467299077e-07, + "loss": 0.6634, + "step": 4601 + }, + { + "epoch": 0.9283870296645831, + "grad_norm": 0.6049329042434692, + "learning_rate": 1.3321711638969836e-07, + "loss": 0.6498, + "step": 4602 + }, + { + "epoch": 0.9285887652207901, + "grad_norm": 0.3812504708766937, + "learning_rate": 1.3246887681426346e-07, + "loss": 0.6642, + "step": 4603 + }, + { + "epoch": 0.9287905007769971, + "grad_norm": 0.41788598895072937, + "learning_rate": 1.3172271626627486e-07, + "loss": 0.634, + "step": 4604 + }, + { + "epoch": 0.928992236333204, + "grad_norm": 0.5177899599075317, + "learning_rate": 1.3097863506443432e-07, + "loss": 0.6881, + "step": 4605 + }, + { + "epoch": 0.9291939718894111, + "grad_norm": 0.4392205774784088, + "learning_rate": 1.3023663352655424e-07, + "loss": 0.6254, + "step": 4606 + }, + { + "epoch": 0.9293957074456181, + "grad_norm": 0.3761579692363739, + "learning_rate": 1.294967119695606e-07, + "loss": 0.7679, + "step": 4607 + }, + { + "epoch": 0.9295974430018251, + "grad_norm": 0.47355636954307556, + "learning_rate": 1.287588707094889e-07, + "loss": 0.6452, + "step": 4608 + }, + { + "epoch": 0.9297991785580321, + "grad_norm": 0.40907222032546997, + "learning_rate": 1.2802311006148703e-07, + "loss": 0.8158, + "step": 4609 + }, + { + "epoch": 0.930000914114239, + "grad_norm": 0.4282612204551697, + "learning_rate": 1.272894303398148e-07, + "loss": 0.7309, + "step": 4610 + }, + { + "epoch": 0.9302026496704461, + "grad_norm": 0.41782447695732117, + "learning_rate": 1.2655783185784253e-07, + "loss": 0.6939, + "step": 4611 + }, + { + "epoch": 0.930404385226653, + "grad_norm": 0.4852448105812073, + "learning_rate": 1.2582831492805092e-07, + "loss": 0.6524, + "step": 4612 + }, + { + "epoch": 0.9306061207828601, + "grad_norm": 0.3634573817253113, + "learning_rate": 1.2510087986203346e-07, + "loss": 0.6512, + "step": 4613 + }, + { + "epoch": 0.930807856339067, + "grad_norm": 0.5622749924659729, + "learning_rate": 1.2437552697049327e-07, + "loss": 0.8715, + "step": 4614 + }, + { + "epoch": 0.931009591895274, + "grad_norm": 0.4542370140552521, + "learning_rate": 1.2365225656324308e-07, + "loss": 0.6442, + "step": 4615 + }, + { + "epoch": 0.931211327451481, + "grad_norm": 0.36814215779304504, + "learning_rate": 1.2293106894920803e-07, + "loss": 0.6481, + "step": 4616 + }, + { + "epoch": 0.931413063007688, + "grad_norm": 0.3866225481033325, + "learning_rate": 1.2221196443642336e-07, + "loss": 0.6577, + "step": 4617 + }, + { + "epoch": 0.9316147985638951, + "grad_norm": 0.42240482568740845, + "learning_rate": 1.214949433320334e-07, + "loss": 0.8365, + "step": 4618 + }, + { + "epoch": 0.931816534120102, + "grad_norm": 0.7792031168937683, + "learning_rate": 1.2078000594229312e-07, + "loss": 0.6615, + "step": 4619 + }, + { + "epoch": 0.932018269676309, + "grad_norm": 0.36705565452575684, + "learning_rate": 1.200671525725683e-07, + "loss": 0.6638, + "step": 4620 + }, + { + "epoch": 0.932220005232516, + "grad_norm": 0.4371699094772339, + "learning_rate": 1.1935638352733424e-07, + "loss": 0.7768, + "step": 4621 + }, + { + "epoch": 0.932421740788723, + "grad_norm": 0.3500683605670929, + "learning_rate": 1.1864769911017482e-07, + "loss": 0.703, + "step": 4622 + }, + { + "epoch": 0.9326234763449299, + "grad_norm": 0.6728479266166687, + "learning_rate": 1.1794109962378452e-07, + "loss": 0.6745, + "step": 4623 + }, + { + "epoch": 0.932825211901137, + "grad_norm": 0.349743515253067, + "learning_rate": 1.1723658536996807e-07, + "loss": 0.6736, + "step": 4624 + }, + { + "epoch": 0.9330269474573439, + "grad_norm": 0.9413335919380188, + "learning_rate": 1.1653415664963807e-07, + "loss": 0.7034, + "step": 4625 + }, + { + "epoch": 0.933228683013551, + "grad_norm": 1.0594662427902222, + "learning_rate": 1.1583381376281733e-07, + "loss": 0.7869, + "step": 4626 + }, + { + "epoch": 0.933430418569758, + "grad_norm": 0.5527055859565735, + "learning_rate": 1.1513555700863655e-07, + "loss": 0.6817, + "step": 4627 + }, + { + "epoch": 0.9336321541259649, + "grad_norm": 1.049141526222229, + "learning_rate": 1.1443938668533716e-07, + "loss": 0.8054, + "step": 4628 + }, + { + "epoch": 0.933833889682172, + "grad_norm": 0.5355727672576904, + "learning_rate": 1.1374530309026799e-07, + "loss": 0.6708, + "step": 4629 + }, + { + "epoch": 0.9340356252383789, + "grad_norm": 0.48663362860679626, + "learning_rate": 1.1305330651988689e-07, + "loss": 0.6783, + "step": 4630 + }, + { + "epoch": 0.934237360794586, + "grad_norm": 0.4144569933414459, + "learning_rate": 1.1236339726976132e-07, + "loss": 0.749, + "step": 4631 + }, + { + "epoch": 0.9344390963507929, + "grad_norm": 0.41263678669929504, + "learning_rate": 1.1167557563456611e-07, + "loss": 0.657, + "step": 4632 + }, + { + "epoch": 0.9346408319069999, + "grad_norm": 1.1964731216430664, + "learning_rate": 1.1098984190808403e-07, + "loss": 0.7634, + "step": 4633 + }, + { + "epoch": 0.9348425674632069, + "grad_norm": 0.40326234698295593, + "learning_rate": 1.1030619638320805e-07, + "loss": 0.7715, + "step": 4634 + }, + { + "epoch": 0.9350443030194139, + "grad_norm": 0.3794019818305969, + "learning_rate": 1.0962463935193624e-07, + "loss": 0.6335, + "step": 4635 + }, + { + "epoch": 0.9352460385756209, + "grad_norm": 0.4727846682071686, + "learning_rate": 1.089451711053774e-07, + "loss": 0.8265, + "step": 4636 + }, + { + "epoch": 0.9354477741318279, + "grad_norm": 0.5812278389930725, + "learning_rate": 1.0826779193374715e-07, + "loss": 0.7067, + "step": 4637 + }, + { + "epoch": 0.9356495096880348, + "grad_norm": 0.360893577337265, + "learning_rate": 1.0759250212636795e-07, + "loss": 0.6921, + "step": 4638 + }, + { + "epoch": 0.9358512452442419, + "grad_norm": 0.5948578715324402, + "learning_rate": 1.0691930197167133e-07, + "loss": 0.746, + "step": 4639 + }, + { + "epoch": 0.9360529808004489, + "grad_norm": 0.8942117094993591, + "learning_rate": 1.0624819175719558e-07, + "loss": 0.6549, + "step": 4640 + }, + { + "epoch": 0.9362547163566558, + "grad_norm": 0.9222234487533569, + "learning_rate": 1.0557917176958532e-07, + "loss": 0.7215, + "step": 4641 + }, + { + "epoch": 0.9364564519128629, + "grad_norm": 0.9265431761741638, + "learning_rate": 1.0491224229459529e-07, + "loss": 0.8325, + "step": 4642 + }, + { + "epoch": 0.9366581874690698, + "grad_norm": 0.3459104299545288, + "learning_rate": 1.0424740361708374e-07, + "loss": 0.6419, + "step": 4643 + }, + { + "epoch": 0.9368599230252769, + "grad_norm": 0.521603524684906, + "learning_rate": 1.0358465602101796e-07, + "loss": 0.6269, + "step": 4644 + }, + { + "epoch": 0.9370616585814838, + "grad_norm": 0.6259787678718567, + "learning_rate": 1.0292399978947265e-07, + "loss": 0.715, + "step": 4645 + }, + { + "epoch": 0.9372633941376908, + "grad_norm": 0.6800326108932495, + "learning_rate": 1.0226543520462707e-07, + "loss": 0.6915, + "step": 4646 + }, + { + "epoch": 0.9374651296938978, + "grad_norm": 0.47363513708114624, + "learning_rate": 1.0160896254776897e-07, + "loss": 0.6676, + "step": 4647 + }, + { + "epoch": 0.9376668652501048, + "grad_norm": 0.6539434790611267, + "learning_rate": 1.0095458209929243e-07, + "loss": 0.657, + "step": 4648 + }, + { + "epoch": 0.9378686008063118, + "grad_norm": 0.4961993992328644, + "learning_rate": 1.0030229413869607e-07, + "loss": 0.671, + "step": 4649 + }, + { + "epoch": 0.9380703363625188, + "grad_norm": 0.4845615327358246, + "learning_rate": 9.965209894458761e-08, + "loss": 0.624, + "step": 4650 + }, + { + "epoch": 0.9382720719187257, + "grad_norm": 0.37729620933532715, + "learning_rate": 9.900399679467876e-08, + "loss": 0.6853, + "step": 4651 + }, + { + "epoch": 0.9384738074749328, + "grad_norm": 0.41500183939933777, + "learning_rate": 9.835798796578755e-08, + "loss": 0.6308, + "step": 4652 + }, + { + "epoch": 0.9386755430311398, + "grad_norm": 1.0571080446243286, + "learning_rate": 9.771407273383938e-08, + "loss": 0.6582, + "step": 4653 + }, + { + "epoch": 0.9388772785873468, + "grad_norm": 0.31755977869033813, + "learning_rate": 9.707225137386256e-08, + "loss": 0.6526, + "step": 4654 + }, + { + "epoch": 0.9390790141435538, + "grad_norm": 0.3515841066837311, + "learning_rate": 9.643252415999504e-08, + "loss": 0.7137, + "step": 4655 + }, + { + "epoch": 0.9392807496997607, + "grad_norm": 1.5465117692947388, + "learning_rate": 9.579489136547659e-08, + "loss": 0.6578, + "step": 4656 + }, + { + "epoch": 0.9394824852559678, + "grad_norm": 1.811485767364502, + "learning_rate": 9.51593532626538e-08, + "loss": 0.6434, + "step": 4657 + }, + { + "epoch": 0.9396842208121747, + "grad_norm": 0.4483201205730438, + "learning_rate": 9.452591012297951e-08, + "loss": 0.6697, + "step": 4658 + }, + { + "epoch": 0.9398859563683817, + "grad_norm": 0.3444378077983856, + "learning_rate": 9.389456221701121e-08, + "loss": 0.7016, + "step": 4659 + }, + { + "epoch": 0.9400876919245887, + "grad_norm": 0.3389723300933838, + "learning_rate": 9.326530981440985e-08, + "loss": 0.7512, + "step": 4660 + }, + { + "epoch": 0.9402894274807957, + "grad_norm": 0.45492416620254517, + "learning_rate": 9.263815318394376e-08, + "loss": 0.6854, + "step": 4661 + }, + { + "epoch": 0.9404911630370028, + "grad_norm": 0.609420120716095, + "learning_rate": 9.201309259348479e-08, + "loss": 0.7446, + "step": 4662 + }, + { + "epoch": 0.9406928985932097, + "grad_norm": 0.41878068447113037, + "learning_rate": 9.139012831000937e-08, + "loss": 0.6859, + "step": 4663 + }, + { + "epoch": 0.9408946341494167, + "grad_norm": 1.0418484210968018, + "learning_rate": 9.076926059959967e-08, + "loss": 0.6834, + "step": 4664 + }, + { + "epoch": 0.9410963697056237, + "grad_norm": 0.4487755298614502, + "learning_rate": 9.015048972744079e-08, + "loss": 0.6682, + "step": 4665 + }, + { + "epoch": 0.9412981052618307, + "grad_norm": 0.46032407879829407, + "learning_rate": 8.953381595782462e-08, + "loss": 0.6496, + "step": 4666 + }, + { + "epoch": 0.9414998408180377, + "grad_norm": 0.3330506384372711, + "learning_rate": 8.891923955414438e-08, + "loss": 0.7225, + "step": 4667 + }, + { + "epoch": 0.9417015763742447, + "grad_norm": 0.36100640892982483, + "learning_rate": 8.8306760778899e-08, + "loss": 0.6906, + "step": 4668 + }, + { + "epoch": 0.9419033119304516, + "grad_norm": 0.5503403544425964, + "learning_rate": 8.769637989369195e-08, + "loss": 0.721, + "step": 4669 + }, + { + "epoch": 0.9421050474866587, + "grad_norm": 0.49627622961997986, + "learning_rate": 8.708809715922973e-08, + "loss": 0.7463, + "step": 4670 + }, + { + "epoch": 0.9423067830428656, + "grad_norm": 0.4225630760192871, + "learning_rate": 8.648191283532337e-08, + "loss": 0.6516, + "step": 4671 + }, + { + "epoch": 0.9425085185990727, + "grad_norm": 1.131819248199463, + "learning_rate": 8.587782718088688e-08, + "loss": 0.6205, + "step": 4672 + }, + { + "epoch": 0.9427102541552796, + "grad_norm": 0.6810362339019775, + "learning_rate": 8.527584045393833e-08, + "loss": 0.6979, + "step": 4673 + }, + { + "epoch": 0.9429119897114866, + "grad_norm": 0.7575717568397522, + "learning_rate": 8.46759529115998e-08, + "loss": 0.733, + "step": 4674 + }, + { + "epoch": 0.9431137252676937, + "grad_norm": 0.31936702132225037, + "learning_rate": 8.407816481009524e-08, + "loss": 0.7006, + "step": 4675 + }, + { + "epoch": 0.9433154608239006, + "grad_norm": 0.4622681140899658, + "learning_rate": 8.34824764047526e-08, + "loss": 0.7778, + "step": 4676 + }, + { + "epoch": 0.9435171963801076, + "grad_norm": 0.4197992980480194, + "learning_rate": 8.288888795000504e-08, + "loss": 0.7495, + "step": 4677 + }, + { + "epoch": 0.9437189319363146, + "grad_norm": 0.5172026753425598, + "learning_rate": 8.229739969938533e-08, + "loss": 0.6222, + "step": 4678 + }, + { + "epoch": 0.9439206674925216, + "grad_norm": 0.444822758436203, + "learning_rate": 8.17080119055308e-08, + "loss": 0.674, + "step": 4679 + }, + { + "epoch": 0.9441224030487286, + "grad_norm": 0.35563191771507263, + "learning_rate": 8.11207248201834e-08, + "loss": 0.719, + "step": 4680 + }, + { + "epoch": 0.9443241386049356, + "grad_norm": 0.557724118232727, + "learning_rate": 8.053553869418418e-08, + "loss": 0.8011, + "step": 4681 + }, + { + "epoch": 0.9445258741611425, + "grad_norm": 0.516588568687439, + "learning_rate": 7.995245377747984e-08, + "loss": 0.6691, + "step": 4682 + }, + { + "epoch": 0.9447276097173496, + "grad_norm": 1.4332866668701172, + "learning_rate": 7.937147031911785e-08, + "loss": 0.6962, + "step": 4683 + }, + { + "epoch": 0.9449293452735565, + "grad_norm": 0.4007475972175598, + "learning_rate": 7.879258856724913e-08, + "loss": 0.6687, + "step": 4684 + }, + { + "epoch": 0.9451310808297636, + "grad_norm": 0.6435663104057312, + "learning_rate": 7.821580876912705e-08, + "loss": 0.8308, + "step": 4685 + }, + { + "epoch": 0.9453328163859706, + "grad_norm": 0.9628399610519409, + "learning_rate": 7.764113117110506e-08, + "loss": 0.6806, + "step": 4686 + }, + { + "epoch": 0.9455345519421775, + "grad_norm": 0.33533141016960144, + "learning_rate": 7.706855601864238e-08, + "loss": 0.6426, + "step": 4687 + }, + { + "epoch": 0.9457362874983846, + "grad_norm": 0.5895031094551086, + "learning_rate": 7.649808355629729e-08, + "loss": 0.6142, + "step": 4688 + }, + { + "epoch": 0.9459380230545915, + "grad_norm": 0.7705016136169434, + "learning_rate": 7.592971402773042e-08, + "loss": 0.6929, + "step": 4689 + }, + { + "epoch": 0.9461397586107986, + "grad_norm": 0.40751463174819946, + "learning_rate": 7.536344767570536e-08, + "loss": 0.6995, + "step": 4690 + }, + { + "epoch": 0.9463414941670055, + "grad_norm": 0.7165273427963257, + "learning_rate": 7.479928474208586e-08, + "loss": 0.6518, + "step": 4691 + }, + { + "epoch": 0.9465432297232125, + "grad_norm": 0.5997741222381592, + "learning_rate": 7.423722546783918e-08, + "loss": 0.6692, + "step": 4692 + }, + { + "epoch": 0.9467449652794195, + "grad_norm": 0.7366072535514832, + "learning_rate": 7.367727009303216e-08, + "loss": 0.6946, + "step": 4693 + }, + { + "epoch": 0.9469467008356265, + "grad_norm": 0.38420969247817993, + "learning_rate": 7.311941885683405e-08, + "loss": 0.705, + "step": 4694 + }, + { + "epoch": 0.9471484363918334, + "grad_norm": 0.37986132502555847, + "learning_rate": 7.25636719975148e-08, + "loss": 0.6504, + "step": 4695 + }, + { + "epoch": 0.9473501719480405, + "grad_norm": 0.5251741409301758, + "learning_rate": 7.201002975244676e-08, + "loss": 0.6562, + "step": 4696 + }, + { + "epoch": 0.9475519075042474, + "grad_norm": 0.5595903396606445, + "learning_rate": 7.145849235810131e-08, + "loss": 0.7736, + "step": 4697 + }, + { + "epoch": 0.9477536430604545, + "grad_norm": 0.36443448066711426, + "learning_rate": 7.090906005005283e-08, + "loss": 0.8156, + "step": 4698 + }, + { + "epoch": 0.9479553786166615, + "grad_norm": 0.3789260983467102, + "learning_rate": 7.036173306297522e-08, + "loss": 0.7545, + "step": 4699 + }, + { + "epoch": 0.9481571141728684, + "grad_norm": 1.2341930866241455, + "learning_rate": 6.981651163064374e-08, + "loss": 0.7395, + "step": 4700 + }, + { + "epoch": 0.9483588497290755, + "grad_norm": 0.9317732453346252, + "learning_rate": 6.927339598593485e-08, + "loss": 0.81, + "step": 4701 + }, + { + "epoch": 0.9485605852852824, + "grad_norm": 0.46789565682411194, + "learning_rate": 6.873238636082358e-08, + "loss": 0.6293, + "step": 4702 + }, + { + "epoch": 0.9487623208414895, + "grad_norm": 0.8435044884681702, + "learning_rate": 6.819348298638839e-08, + "loss": 0.7131, + "step": 4703 + }, + { + "epoch": 0.9489640563976964, + "grad_norm": 0.5798925161361694, + "learning_rate": 6.765668609280519e-08, + "loss": 0.7062, + "step": 4704 + }, + { + "epoch": 0.9491657919539034, + "grad_norm": 0.49503710865974426, + "learning_rate": 6.71219959093522e-08, + "loss": 0.8745, + "step": 4705 + }, + { + "epoch": 0.9493675275101104, + "grad_norm": 0.36657482385635376, + "learning_rate": 6.658941266440677e-08, + "loss": 0.6438, + "step": 4706 + }, + { + "epoch": 0.9495692630663174, + "grad_norm": 0.9516401290893555, + "learning_rate": 6.605893658544693e-08, + "loss": 0.6972, + "step": 4707 + }, + { + "epoch": 0.9497709986225245, + "grad_norm": 0.5418645739555359, + "learning_rate": 6.553056789905032e-08, + "loss": 0.634, + "step": 4708 + }, + { + "epoch": 0.9499727341787314, + "grad_norm": 0.43398237228393555, + "learning_rate": 6.500430683089532e-08, + "loss": 0.7135, + "step": 4709 + }, + { + "epoch": 0.9501744697349384, + "grad_norm": 0.4190267324447632, + "learning_rate": 6.448015360575821e-08, + "loss": 0.6784, + "step": 4710 + }, + { + "epoch": 0.9503762052911454, + "grad_norm": 0.38937604427337646, + "learning_rate": 6.395810844751604e-08, + "loss": 0.8755, + "step": 4711 + }, + { + "epoch": 0.9505779408473524, + "grad_norm": 0.4863095283508301, + "learning_rate": 6.343817157914712e-08, + "loss": 0.8249, + "step": 4712 + }, + { + "epoch": 0.9507796764035594, + "grad_norm": 0.5813011527061462, + "learning_rate": 6.292034322272656e-08, + "loss": 0.6882, + "step": 4713 + }, + { + "epoch": 0.9509814119597664, + "grad_norm": 0.3933367431163788, + "learning_rate": 6.240462359942967e-08, + "loss": 0.7558, + "step": 4714 + }, + { + "epoch": 0.9511831475159733, + "grad_norm": 0.4394649267196655, + "learning_rate": 6.189101292953247e-08, + "loss": 0.6481, + "step": 4715 + }, + { + "epoch": 0.9513848830721804, + "grad_norm": 0.39462268352508545, + "learning_rate": 6.137951143240783e-08, + "loss": 0.6553, + "step": 4716 + }, + { + "epoch": 0.9515866186283873, + "grad_norm": 0.7447942495346069, + "learning_rate": 6.087011932653097e-08, + "loss": 0.6813, + "step": 4717 + }, + { + "epoch": 0.9517883541845943, + "grad_norm": 0.48867762088775635, + "learning_rate": 6.036283682947231e-08, + "loss": 0.6461, + "step": 4718 + }, + { + "epoch": 0.9519900897408013, + "grad_norm": 0.320430189371109, + "learning_rate": 5.98576641579035e-08, + "loss": 0.6561, + "step": 4719 + }, + { + "epoch": 0.9521918252970083, + "grad_norm": 0.659767746925354, + "learning_rate": 5.935460152759642e-08, + "loss": 0.676, + "step": 4720 + }, + { + "epoch": 0.9523935608532154, + "grad_norm": 0.7774620652198792, + "learning_rate": 5.8853649153417515e-08, + "loss": 0.6979, + "step": 4721 + }, + { + "epoch": 0.9525952964094223, + "grad_norm": 0.3772054612636566, + "learning_rate": 5.835480724933562e-08, + "loss": 0.7595, + "step": 4722 + }, + { + "epoch": 0.9527970319656293, + "grad_norm": 0.33030465245246887, + "learning_rate": 5.7858076028416975e-08, + "loss": 0.6754, + "step": 4723 + }, + { + "epoch": 0.9529987675218363, + "grad_norm": 0.47675344347953796, + "learning_rate": 5.736345570282575e-08, + "loss": 0.8973, + "step": 4724 + }, + { + "epoch": 0.9532005030780433, + "grad_norm": 0.5579949021339417, + "learning_rate": 5.687094648382518e-08, + "loss": 0.703, + "step": 4725 + }, + { + "epoch": 0.9534022386342503, + "grad_norm": 0.47706276178359985, + "learning_rate": 5.638054858177644e-08, + "loss": 0.707, + "step": 4726 + }, + { + "epoch": 0.9536039741904573, + "grad_norm": 0.3631863296031952, + "learning_rate": 5.589226220613919e-08, + "loss": 0.6557, + "step": 4727 + }, + { + "epoch": 0.9538057097466642, + "grad_norm": 0.5944626927375793, + "learning_rate": 5.5406087565471054e-08, + "loss": 0.6658, + "step": 4728 + }, + { + "epoch": 0.9540074453028713, + "grad_norm": 0.579210102558136, + "learning_rate": 5.492202486742759e-08, + "loss": 0.7539, + "step": 4729 + }, + { + "epoch": 0.9542091808590782, + "grad_norm": 0.4768008887767792, + "learning_rate": 5.44400743187623e-08, + "loss": 0.6639, + "step": 4730 + }, + { + "epoch": 0.9544109164152853, + "grad_norm": 0.3371616005897522, + "learning_rate": 5.396023612532719e-08, + "loss": 0.6591, + "step": 4731 + }, + { + "epoch": 0.9546126519714923, + "grad_norm": 0.34802260994911194, + "learning_rate": 5.348251049207054e-08, + "loss": 0.9614, + "step": 4732 + }, + { + "epoch": 0.9548143875276992, + "grad_norm": 0.36523744463920593, + "learning_rate": 5.300689762304023e-08, + "loss": 0.8026, + "step": 4733 + }, + { + "epoch": 0.9550161230839063, + "grad_norm": 0.5702638626098633, + "learning_rate": 5.2533397721379887e-08, + "loss": 0.5845, + "step": 4734 + }, + { + "epoch": 0.9552178586401132, + "grad_norm": 0.44271722435951233, + "learning_rate": 5.206201098933217e-08, + "loss": 0.8374, + "step": 4735 + }, + { + "epoch": 0.9554195941963202, + "grad_norm": 0.5539191365242004, + "learning_rate": 5.159273762823658e-08, + "loss": 0.6885, + "step": 4736 + }, + { + "epoch": 0.9556213297525272, + "grad_norm": 0.6696466207504272, + "learning_rate": 5.112557783852945e-08, + "loss": 0.6233, + "step": 4737 + }, + { + "epoch": 0.9558230653087342, + "grad_norm": 0.33567628264427185, + "learning_rate": 5.0660531819745065e-08, + "loss": 0.6452, + "step": 4738 + }, + { + "epoch": 0.9560248008649412, + "grad_norm": 0.8079859614372253, + "learning_rate": 5.0197599770514524e-08, + "loss": 0.6868, + "step": 4739 + }, + { + "epoch": 0.9562265364211482, + "grad_norm": 0.4152475595474243, + "learning_rate": 4.9736781888566345e-08, + "loss": 0.6861, + "step": 4740 + }, + { + "epoch": 0.9564282719773551, + "grad_norm": 0.3019008934497833, + "learning_rate": 4.927807837072529e-08, + "loss": 0.6617, + "step": 4741 + }, + { + "epoch": 0.9566300075335622, + "grad_norm": 0.43031102418899536, + "learning_rate": 4.882148941291298e-08, + "loss": 0.8175, + "step": 4742 + }, + { + "epoch": 0.9568317430897691, + "grad_norm": 0.5170923471450806, + "learning_rate": 4.836701521015008e-08, + "loss": 0.6904, + "step": 4743 + }, + { + "epoch": 0.9570334786459762, + "grad_norm": 0.44814711809158325, + "learning_rate": 4.791465595655132e-08, + "loss": 0.6623, + "step": 4744 + }, + { + "epoch": 0.9572352142021832, + "grad_norm": 2.267430305480957, + "learning_rate": 4.746441184532879e-08, + "loss": 0.801, + "step": 4745 + }, + { + "epoch": 0.9574369497583901, + "grad_norm": 0.6424638628959656, + "learning_rate": 4.701628306879202e-08, + "loss": 0.7212, + "step": 4746 + }, + { + "epoch": 0.9576386853145972, + "grad_norm": 0.7151271104812622, + "learning_rate": 4.657026981834623e-08, + "loss": 0.64, + "step": 4747 + }, + { + "epoch": 0.9578404208708041, + "grad_norm": 1.2689751386642456, + "learning_rate": 4.612637228449346e-08, + "loss": 0.6775, + "step": 4748 + }, + { + "epoch": 0.9580421564270112, + "grad_norm": 0.46473121643066406, + "learning_rate": 4.568459065683206e-08, + "loss": 0.799, + "step": 4749 + }, + { + "epoch": 0.9582438919832181, + "grad_norm": 0.44650596380233765, + "learning_rate": 4.524492512405554e-08, + "loss": 0.6784, + "step": 4750 + }, + { + "epoch": 0.9584456275394251, + "grad_norm": 0.7469905018806458, + "learning_rate": 4.4807375873955336e-08, + "loss": 0.6511, + "step": 4751 + }, + { + "epoch": 0.9586473630956321, + "grad_norm": 0.5191195607185364, + "learning_rate": 4.437194309341808e-08, + "loss": 0.7911, + "step": 4752 + }, + { + "epoch": 0.9588490986518391, + "grad_norm": 1.0587661266326904, + "learning_rate": 4.393862696842666e-08, + "loss": 0.6558, + "step": 4753 + }, + { + "epoch": 0.959050834208046, + "grad_norm": 0.4224800169467926, + "learning_rate": 4.350742768405913e-08, + "loss": 0.6558, + "step": 4754 + }, + { + "epoch": 0.9592525697642531, + "grad_norm": 0.4730607271194458, + "learning_rate": 4.307834542449096e-08, + "loss": 0.6371, + "step": 4755 + }, + { + "epoch": 0.95945430532046, + "grad_norm": 0.6827614903450012, + "learning_rate": 4.26513803729911e-08, + "loss": 0.7278, + "step": 4756 + }, + { + "epoch": 0.9596560408766671, + "grad_norm": 0.78753262758255, + "learning_rate": 4.2226532711927005e-08, + "loss": 0.6537, + "step": 4757 + }, + { + "epoch": 0.9598577764328741, + "grad_norm": 0.39794209599494934, + "learning_rate": 4.180380262275907e-08, + "loss": 0.6444, + "step": 4758 + }, + { + "epoch": 0.960059511989081, + "grad_norm": 0.3673033118247986, + "learning_rate": 4.138319028604509e-08, + "loss": 0.8612, + "step": 4759 + }, + { + "epoch": 0.9602612475452881, + "grad_norm": 0.659838855266571, + "learning_rate": 4.0964695881437475e-08, + "loss": 0.6587, + "step": 4760 + }, + { + "epoch": 0.960462983101495, + "grad_norm": 0.6321941614151001, + "learning_rate": 4.054831958768435e-08, + "loss": 0.6918, + "step": 4761 + }, + { + "epoch": 0.9606647186577021, + "grad_norm": 0.5344189405441284, + "learning_rate": 4.0134061582628446e-08, + "loss": 0.7182, + "step": 4762 + }, + { + "epoch": 0.960866454213909, + "grad_norm": 0.797895610332489, + "learning_rate": 3.9721922043208797e-08, + "loss": 0.6649, + "step": 4763 + }, + { + "epoch": 0.961068189770116, + "grad_norm": 0.526902437210083, + "learning_rate": 3.931190114545902e-08, + "loss": 0.624, + "step": 4764 + }, + { + "epoch": 0.961269925326323, + "grad_norm": 0.4661473035812378, + "learning_rate": 3.8903999064507923e-08, + "loss": 0.6719, + "step": 4765 + }, + { + "epoch": 0.96147166088253, + "grad_norm": 0.5854642987251282, + "learning_rate": 3.849821597457892e-08, + "loss": 0.6386, + "step": 4766 + }, + { + "epoch": 0.9616733964387371, + "grad_norm": 0.49384617805480957, + "learning_rate": 3.809455204899115e-08, + "loss": 0.6907, + "step": 4767 + }, + { + "epoch": 0.961875131994944, + "grad_norm": 0.49993792176246643, + "learning_rate": 3.769300746015836e-08, + "loss": 0.7345, + "step": 4768 + }, + { + "epoch": 0.962076867551151, + "grad_norm": 0.6220202445983887, + "learning_rate": 3.72935823795878e-08, + "loss": 0.7145, + "step": 4769 + }, + { + "epoch": 0.962278603107358, + "grad_norm": 0.8840739727020264, + "learning_rate": 3.689627697788356e-08, + "loss": 0.7168, + "step": 4770 + }, + { + "epoch": 0.962480338663565, + "grad_norm": 0.34587568044662476, + "learning_rate": 3.650109142474323e-08, + "loss": 0.6895, + "step": 4771 + }, + { + "epoch": 0.9626820742197719, + "grad_norm": 0.3984651267528534, + "learning_rate": 3.610802588895845e-08, + "loss": 0.6525, + "step": 4772 + }, + { + "epoch": 0.962883809775979, + "grad_norm": 1.5117933750152588, + "learning_rate": 3.571708053841716e-08, + "loss": 0.7711, + "step": 4773 + }, + { + "epoch": 0.9630855453321859, + "grad_norm": 0.4134984314441681, + "learning_rate": 3.532825554009966e-08, + "loss": 0.6177, + "step": 4774 + }, + { + "epoch": 0.963287280888393, + "grad_norm": 0.6590429544448853, + "learning_rate": 3.49415510600809e-08, + "loss": 0.7145, + "step": 4775 + }, + { + "epoch": 0.9634890164445999, + "grad_norm": 0.44440943002700806, + "learning_rate": 3.455696726353208e-08, + "loss": 0.6574, + "step": 4776 + }, + { + "epoch": 0.9636907520008069, + "grad_norm": 0.529772162437439, + "learning_rate": 3.417450431471625e-08, + "loss": 0.6641, + "step": 4777 + }, + { + "epoch": 0.963892487557014, + "grad_norm": 0.7167370915412903, + "learning_rate": 3.379416237699218e-08, + "loss": 0.8083, + "step": 4778 + }, + { + "epoch": 0.9640942231132209, + "grad_norm": 1.930140495300293, + "learning_rate": 3.341594161281214e-08, + "loss": 0.657, + "step": 4779 + }, + { + "epoch": 0.964295958669428, + "grad_norm": 0.4069491922855377, + "learning_rate": 3.303984218372136e-08, + "loss": 0.7373, + "step": 4780 + }, + { + "epoch": 0.9644976942256349, + "grad_norm": 0.38006487488746643, + "learning_rate": 3.2665864250360777e-08, + "loss": 0.6501, + "step": 4781 + }, + { + "epoch": 0.9646994297818419, + "grad_norm": 0.4782893657684326, + "learning_rate": 3.2294007972464845e-08, + "loss": 0.6497, + "step": 4782 + }, + { + "epoch": 0.9649011653380489, + "grad_norm": 0.32621467113494873, + "learning_rate": 3.19242735088604e-08, + "loss": 0.8127, + "step": 4783 + }, + { + "epoch": 0.9651029008942559, + "grad_norm": 0.5363353490829468, + "learning_rate": 3.155666101747001e-08, + "loss": 0.6488, + "step": 4784 + }, + { + "epoch": 0.9653046364504629, + "grad_norm": 0.4314109981060028, + "learning_rate": 3.119117065530808e-08, + "loss": 0.8215, + "step": 4785 + }, + { + "epoch": 0.9655063720066699, + "grad_norm": 0.38545098900794983, + "learning_rate": 3.082780257848361e-08, + "loss": 0.6359, + "step": 4786 + }, + { + "epoch": 0.9657081075628768, + "grad_norm": 0.41607731580734253, + "learning_rate": 3.046655694219969e-08, + "loss": 0.6006, + "step": 4787 + }, + { + "epoch": 0.9659098431190839, + "grad_norm": 0.7114999890327454, + "learning_rate": 3.0107433900751216e-08, + "loss": 0.7779, + "step": 4788 + }, + { + "epoch": 0.9661115786752908, + "grad_norm": 0.5416654944419861, + "learning_rate": 2.9750433607527163e-08, + "loss": 0.6834, + "step": 4789 + }, + { + "epoch": 0.9663133142314978, + "grad_norm": 0.6984711289405823, + "learning_rate": 2.9395556215011113e-08, + "loss": 0.6532, + "step": 4790 + }, + { + "epoch": 0.9665150497877049, + "grad_norm": 0.5756306648254395, + "learning_rate": 2.9042801874777925e-08, + "loss": 0.7076, + "step": 4791 + }, + { + "epoch": 0.9667167853439118, + "grad_norm": 0.7223901152610779, + "learning_rate": 2.8692170737497083e-08, + "loss": 0.6489, + "step": 4792 + }, + { + "epoch": 0.9669185209001189, + "grad_norm": 0.4999725818634033, + "learning_rate": 2.8343662952931005e-08, + "loss": 0.7008, + "step": 4793 + }, + { + "epoch": 0.9671202564563258, + "grad_norm": 0.700188398361206, + "learning_rate": 2.7997278669933405e-08, + "loss": 0.6829, + "step": 4794 + }, + { + "epoch": 0.9673219920125328, + "grad_norm": 0.726335883140564, + "learning_rate": 2.765301803645426e-08, + "loss": 0.7448, + "step": 4795 + }, + { + "epoch": 0.9675237275687398, + "grad_norm": 0.7787569761276245, + "learning_rate": 2.7310881199533736e-08, + "loss": 0.6284, + "step": 4796 + }, + { + "epoch": 0.9677254631249468, + "grad_norm": 0.5372986197471619, + "learning_rate": 2.69708683053066e-08, + "loss": 0.6637, + "step": 4797 + }, + { + "epoch": 0.9679271986811538, + "grad_norm": 1.1246623992919922, + "learning_rate": 2.6632979498998347e-08, + "loss": 0.6536, + "step": 4798 + }, + { + "epoch": 0.9681289342373608, + "grad_norm": 0.39886438846588135, + "learning_rate": 2.629721492492965e-08, + "loss": 0.6609, + "step": 4799 + }, + { + "epoch": 0.9683306697935677, + "grad_norm": 0.9603978991508484, + "learning_rate": 2.5963574726512454e-08, + "loss": 0.6716, + "step": 4800 + }, + { + "epoch": 0.9685324053497748, + "grad_norm": 0.40881672501564026, + "learning_rate": 2.5632059046251655e-08, + "loss": 0.6358, + "step": 4801 + }, + { + "epoch": 0.9687341409059818, + "grad_norm": 0.9251068830490112, + "learning_rate": 2.53026680257451e-08, + "loss": 0.6458, + "step": 4802 + }, + { + "epoch": 0.9689358764621888, + "grad_norm": 0.9787841439247131, + "learning_rate": 2.4975401805682475e-08, + "loss": 0.7053, + "step": 4803 + }, + { + "epoch": 0.9691376120183958, + "grad_norm": 0.4082057774066925, + "learning_rate": 2.4650260525846404e-08, + "loss": 0.8172, + "step": 4804 + }, + { + "epoch": 0.9693393475746027, + "grad_norm": 0.5025261044502258, + "learning_rate": 2.4327244325111354e-08, + "loss": 0.8375, + "step": 4805 + }, + { + "epoch": 0.9695410831308098, + "grad_norm": 0.45469751954078674, + "learning_rate": 2.4006353341444745e-08, + "loss": 0.822, + "step": 4806 + }, + { + "epoch": 0.9697428186870167, + "grad_norm": 0.6082906126976013, + "learning_rate": 2.3687587711905825e-08, + "loss": 0.6237, + "step": 4807 + }, + { + "epoch": 0.9699445542432238, + "grad_norm": 0.8955042362213135, + "learning_rate": 2.3370947572646796e-08, + "loss": 0.8761, + "step": 4808 + }, + { + "epoch": 0.9701462897994307, + "grad_norm": 0.3564375638961792, + "learning_rate": 2.3056433058911142e-08, + "loss": 0.6063, + "step": 4809 + }, + { + "epoch": 0.9703480253556377, + "grad_norm": 0.4515208303928375, + "learning_rate": 2.274404430503474e-08, + "loss": 0.7077, + "step": 4810 + }, + { + "epoch": 0.9705497609118447, + "grad_norm": 0.4937743842601776, + "learning_rate": 2.2433781444445858e-08, + "loss": 0.6696, + "step": 4811 + }, + { + "epoch": 0.9707514964680517, + "grad_norm": 0.7430995106697083, + "learning_rate": 2.2125644609664042e-08, + "loss": 0.7526, + "step": 4812 + }, + { + "epoch": 0.9709532320242587, + "grad_norm": 1.061241865158081, + "learning_rate": 2.1819633932301797e-08, + "loss": 0.6511, + "step": 4813 + }, + { + "epoch": 0.9711549675804657, + "grad_norm": 0.3608217239379883, + "learning_rate": 2.1515749543061792e-08, + "loss": 0.6436, + "step": 4814 + }, + { + "epoch": 0.9713567031366727, + "grad_norm": 0.3461396396160126, + "learning_rate": 2.1213991571740755e-08, + "loss": 0.6623, + "step": 4815 + }, + { + "epoch": 0.9715584386928797, + "grad_norm": 0.46052438020706177, + "learning_rate": 2.0914360147225033e-08, + "loss": 0.6507, + "step": 4816 + }, + { + "epoch": 0.9717601742490867, + "grad_norm": 0.3347858786582947, + "learning_rate": 2.0616855397494472e-08, + "loss": 0.6507, + "step": 4817 + }, + { + "epoch": 0.9719619098052936, + "grad_norm": 1.116852879524231, + "learning_rate": 2.0321477449619098e-08, + "loss": 0.6457, + "step": 4818 + }, + { + "epoch": 0.9721636453615007, + "grad_norm": 0.8443088531494141, + "learning_rate": 2.0028226429762433e-08, + "loss": 0.7697, + "step": 4819 + }, + { + "epoch": 0.9723653809177076, + "grad_norm": 0.7438956499099731, + "learning_rate": 1.9737102463176504e-08, + "loss": 0.8288, + "step": 4820 + }, + { + "epoch": 0.9725671164739147, + "grad_norm": 0.5241464972496033, + "learning_rate": 1.944810567420796e-08, + "loss": 0.678, + "step": 4821 + }, + { + "epoch": 0.9727688520301216, + "grad_norm": 0.4894596040248871, + "learning_rate": 1.9161236186293063e-08, + "loss": 1.0671, + "step": 4822 + }, + { + "epoch": 0.9729705875863286, + "grad_norm": 0.7723326683044434, + "learning_rate": 1.8876494121959908e-08, + "loss": 0.6433, + "step": 4823 + }, + { + "epoch": 0.9731723231425357, + "grad_norm": 0.46578657627105713, + "learning_rate": 1.8593879602828434e-08, + "loss": 0.7017, + "step": 4824 + }, + { + "epoch": 0.9733740586987426, + "grad_norm": 0.671375036239624, + "learning_rate": 1.831339274960875e-08, + "loss": 0.6607, + "step": 4825 + }, + { + "epoch": 0.9735757942549497, + "grad_norm": 0.4858841001987457, + "learning_rate": 1.8035033682103353e-08, + "loss": 0.7833, + "step": 4826 + }, + { + "epoch": 0.9737775298111566, + "grad_norm": 0.5803720951080322, + "learning_rate": 1.7758802519204922e-08, + "loss": 0.6924, + "step": 4827 + }, + { + "epoch": 0.9739792653673636, + "grad_norm": 2.0239927768707275, + "learning_rate": 1.7484699378897962e-08, + "loss": 0.7044, + "step": 4828 + }, + { + "epoch": 0.9741810009235706, + "grad_norm": 1.0976885557174683, + "learning_rate": 1.721272437825827e-08, + "loss": 0.6594, + "step": 4829 + }, + { + "epoch": 0.9743827364797776, + "grad_norm": 0.34820252656936646, + "learning_rate": 1.6942877633451815e-08, + "loss": 0.8396, + "step": 4830 + }, + { + "epoch": 0.9745844720359845, + "grad_norm": 0.6627471446990967, + "learning_rate": 1.6675159259735285e-08, + "loss": 0.677, + "step": 4831 + }, + { + "epoch": 0.9747862075921916, + "grad_norm": 0.5685920715332031, + "learning_rate": 1.6409569371458323e-08, + "loss": 0.8709, + "step": 4832 + }, + { + "epoch": 0.9749879431483985, + "grad_norm": 0.439563512802124, + "learning_rate": 1.6146108082059075e-08, + "loss": 0.6527, + "step": 4833 + }, + { + "epoch": 0.9751896787046056, + "grad_norm": 1.0685017108917236, + "learning_rate": 1.5884775504068083e-08, + "loss": 0.7461, + "step": 4834 + }, + { + "epoch": 0.9753914142608126, + "grad_norm": 0.8692358136177063, + "learning_rate": 1.562557174910606e-08, + "loss": 0.6507, + "step": 4835 + }, + { + "epoch": 0.9755931498170195, + "grad_norm": 0.4101846218109131, + "learning_rate": 1.5368496927884447e-08, + "loss": 0.6549, + "step": 4836 + }, + { + "epoch": 0.9757948853732266, + "grad_norm": 0.41572755575180054, + "learning_rate": 1.5113551150204853e-08, + "loss": 0.6295, + "step": 4837 + }, + { + "epoch": 0.9759966209294335, + "grad_norm": 0.35794496536254883, + "learning_rate": 1.4860734524961285e-08, + "loss": 0.6945, + "step": 4838 + }, + { + "epoch": 0.9761983564856406, + "grad_norm": 0.6017243266105652, + "learning_rate": 1.4610047160136254e-08, + "loss": 0.7921, + "step": 4839 + }, + { + "epoch": 0.9764000920418475, + "grad_norm": 0.34502172470092773, + "learning_rate": 1.4361489162804109e-08, + "loss": 0.713, + "step": 4840 + }, + { + "epoch": 0.9766018275980545, + "grad_norm": 0.6996079683303833, + "learning_rate": 1.411506063912882e-08, + "loss": 0.7164, + "step": 4841 + }, + { + "epoch": 0.9768035631542615, + "grad_norm": 0.4020618200302124, + "learning_rate": 1.387076169436563e-08, + "loss": 0.6264, + "step": 4842 + }, + { + "epoch": 0.9770052987104685, + "grad_norm": 1.5375221967697144, + "learning_rate": 1.3628592432861077e-08, + "loss": 0.8928, + "step": 4843 + }, + { + "epoch": 0.9772070342666755, + "grad_norm": 0.7670455574989319, + "learning_rate": 1.3388552958048529e-08, + "loss": 0.6474, + "step": 4844 + }, + { + "epoch": 0.9774087698228825, + "grad_norm": 0.4482732117176056, + "learning_rate": 1.3150643372455973e-08, + "loss": 0.6652, + "step": 4845 + }, + { + "epoch": 0.9776105053790894, + "grad_norm": 0.4064268469810486, + "learning_rate": 1.2914863777698794e-08, + "loss": 0.6561, + "step": 4846 + }, + { + "epoch": 0.9778122409352965, + "grad_norm": 0.745275616645813, + "learning_rate": 1.2681214274483655e-08, + "loss": 0.6295, + "step": 4847 + }, + { + "epoch": 0.9780139764915035, + "grad_norm": 1.2450134754180908, + "learning_rate": 1.244969496260795e-08, + "loss": 0.6317, + "step": 4848 + }, + { + "epoch": 0.9782157120477104, + "grad_norm": 0.8189371228218079, + "learning_rate": 1.2220305940957578e-08, + "loss": 0.6434, + "step": 4849 + }, + { + "epoch": 0.9784174476039175, + "grad_norm": 0.8365606665611267, + "learning_rate": 1.199304730750972e-08, + "loss": 0.6992, + "step": 4850 + }, + { + "epoch": 0.9786191831601244, + "grad_norm": 0.40646567940711975, + "learning_rate": 1.1767919159332286e-08, + "loss": 0.7065, + "step": 4851 + }, + { + "epoch": 0.9788209187163315, + "grad_norm": 0.6693828701972961, + "learning_rate": 1.1544921592581138e-08, + "loss": 0.6444, + "step": 4852 + }, + { + "epoch": 0.9790226542725384, + "grad_norm": 0.7508454918861389, + "learning_rate": 1.1324054702504528e-08, + "loss": 0.6489, + "step": 4853 + }, + { + "epoch": 0.9792243898287454, + "grad_norm": 0.4734887480735779, + "learning_rate": 1.1105318583438663e-08, + "loss": 0.6898, + "step": 4854 + }, + { + "epoch": 0.9794261253849524, + "grad_norm": 0.7000921368598938, + "learning_rate": 1.0888713328810474e-08, + "loss": 0.6613, + "step": 4855 + }, + { + "epoch": 0.9796278609411594, + "grad_norm": 0.569733738899231, + "learning_rate": 1.0674239031137069e-08, + "loss": 0.6622, + "step": 4856 + }, + { + "epoch": 0.9798295964973665, + "grad_norm": 0.376286119222641, + "learning_rate": 1.0461895782025166e-08, + "loss": 0.6579, + "step": 4857 + }, + { + "epoch": 0.9800313320535734, + "grad_norm": 1.5786285400390625, + "learning_rate": 1.0251683672170554e-08, + "loss": 0.7327, + "step": 4858 + }, + { + "epoch": 0.9802330676097804, + "grad_norm": 0.4468984007835388, + "learning_rate": 1.0043602791360297e-08, + "loss": 0.6494, + "step": 4859 + }, + { + "epoch": 0.9804348031659874, + "grad_norm": 0.31856024265289307, + "learning_rate": 9.837653228469413e-09, + "loss": 0.6477, + "step": 4860 + }, + { + "epoch": 0.9806365387221944, + "grad_norm": 0.6004407405853271, + "learning_rate": 9.633835071463094e-09, + "loss": 0.7727, + "step": 4861 + }, + { + "epoch": 0.9808382742784014, + "grad_norm": 0.8156235218048096, + "learning_rate": 9.432148407397257e-09, + "loss": 0.6428, + "step": 4862 + }, + { + "epoch": 0.9810400098346084, + "grad_norm": 0.3582519590854645, + "learning_rate": 9.232593322416883e-09, + "loss": 0.6174, + "step": 4863 + }, + { + "epoch": 0.9812417453908153, + "grad_norm": 0.4001818299293518, + "learning_rate": 9.035169901754902e-09, + "loss": 0.6688, + "step": 4864 + }, + { + "epoch": 0.9814434809470224, + "grad_norm": 0.4952855110168457, + "learning_rate": 8.839878229736643e-09, + "loss": 0.6671, + "step": 4865 + }, + { + "epoch": 0.9816452165032293, + "grad_norm": 0.34948718547821045, + "learning_rate": 8.646718389774267e-09, + "loss": 0.6613, + "step": 4866 + }, + { + "epoch": 0.9818469520594363, + "grad_norm": 0.6136161684989929, + "learning_rate": 8.455690464371224e-09, + "loss": 0.7906, + "step": 4867 + }, + { + "epoch": 0.9820486876156433, + "grad_norm": 0.5521015524864197, + "learning_rate": 8.266794535118915e-09, + "loss": 0.7189, + "step": 4868 + }, + { + "epoch": 0.9822504231718503, + "grad_norm": 0.5386425256729126, + "learning_rate": 8.08003068269947e-09, + "loss": 0.6319, + "step": 4869 + }, + { + "epoch": 0.9824521587280574, + "grad_norm": 1.521870493888855, + "learning_rate": 7.895398986883518e-09, + "loss": 0.8588, + "step": 4870 + }, + { + "epoch": 0.9826538942842643, + "grad_norm": 1.0882370471954346, + "learning_rate": 7.71289952653187e-09, + "loss": 0.6604, + "step": 4871 + }, + { + "epoch": 0.9828556298404713, + "grad_norm": 0.4301353991031647, + "learning_rate": 7.532532379592728e-09, + "loss": 0.6396, + "step": 4872 + }, + { + "epoch": 0.9830573653966783, + "grad_norm": 0.7053226232528687, + "learning_rate": 7.354297623105577e-09, + "loss": 0.6267, + "step": 4873 + }, + { + "epoch": 0.9832591009528853, + "grad_norm": 0.5108756422996521, + "learning_rate": 7.1781953331984125e-09, + "loss": 0.8235, + "step": 4874 + }, + { + "epoch": 0.9834608365090923, + "grad_norm": 0.5003656148910522, + "learning_rate": 7.004225585088287e-09, + "loss": 0.6547, + "step": 4875 + }, + { + "epoch": 0.9836625720652993, + "grad_norm": 1.7763170003890991, + "learning_rate": 6.832388453080762e-09, + "loss": 0.6739, + "step": 4876 + }, + { + "epoch": 0.9838643076215062, + "grad_norm": 0.5061050653457642, + "learning_rate": 6.662684010572129e-09, + "loss": 0.8183, + "step": 4877 + }, + { + "epoch": 0.9840660431777133, + "grad_norm": 0.46445515751838684, + "learning_rate": 6.495112330046072e-09, + "loss": 0.6868, + "step": 4878 + }, + { + "epoch": 0.9842677787339202, + "grad_norm": 0.35919633507728577, + "learning_rate": 6.329673483076448e-09, + "loss": 0.6631, + "step": 4879 + }, + { + "epoch": 0.9844695142901273, + "grad_norm": 0.6525983810424805, + "learning_rate": 6.166367540325624e-09, + "loss": 0.6468, + "step": 4880 + }, + { + "epoch": 0.9846712498463343, + "grad_norm": 0.4751875102519989, + "learning_rate": 6.005194571545581e-09, + "loss": 0.6931, + "step": 4881 + }, + { + "epoch": 0.9848729854025412, + "grad_norm": 0.560192346572876, + "learning_rate": 5.846154645575697e-09, + "loss": 0.8019, + "step": 4882 + }, + { + "epoch": 0.9850747209587483, + "grad_norm": 0.5137495398521423, + "learning_rate": 5.689247830346079e-09, + "loss": 0.68, + "step": 4883 + }, + { + "epoch": 0.9852764565149552, + "grad_norm": 0.4108632504940033, + "learning_rate": 5.534474192875339e-09, + "loss": 0.8589, + "step": 4884 + }, + { + "epoch": 0.9854781920711622, + "grad_norm": 0.3733404278755188, + "learning_rate": 5.381833799269487e-09, + "loss": 0.6393, + "step": 4885 + }, + { + "epoch": 0.9856799276273692, + "grad_norm": 0.4676814377307892, + "learning_rate": 5.231326714725815e-09, + "loss": 0.6373, + "step": 4886 + }, + { + "epoch": 0.9858816631835762, + "grad_norm": 0.6283035278320312, + "learning_rate": 5.082953003528457e-09, + "loss": 0.9995, + "step": 4887 + }, + { + "epoch": 0.9860833987397832, + "grad_norm": 0.49630334973335266, + "learning_rate": 4.936712729051163e-09, + "loss": 0.7571, + "step": 4888 + }, + { + "epoch": 0.9862851342959902, + "grad_norm": 0.45282241702079773, + "learning_rate": 4.792605953756191e-09, + "loss": 0.8741, + "step": 4889 + }, + { + "epoch": 0.9864868698521971, + "grad_norm": 0.4955006241798401, + "learning_rate": 4.650632739194305e-09, + "loss": 0.6935, + "step": 4890 + }, + { + "epoch": 0.9866886054084042, + "grad_norm": 0.7275412082672119, + "learning_rate": 4.510793146006442e-09, + "loss": 0.9353, + "step": 4891 + }, + { + "epoch": 0.9868903409646111, + "grad_norm": 0.4954439103603363, + "learning_rate": 4.373087233919826e-09, + "loss": 0.6653, + "step": 4892 + }, + { + "epoch": 0.9870920765208182, + "grad_norm": 0.3884679079055786, + "learning_rate": 4.2375150617529615e-09, + "loss": 0.6975, + "step": 4893 + }, + { + "epoch": 0.9872938120770252, + "grad_norm": 0.42031583189964294, + "learning_rate": 4.104076687410086e-09, + "loss": 0.7541, + "step": 4894 + }, + { + "epoch": 0.9874955476332321, + "grad_norm": 0.3728240728378296, + "learning_rate": 3.972772167886718e-09, + "loss": 0.6141, + "step": 4895 + }, + { + "epoch": 0.9876972831894392, + "grad_norm": 0.4875032305717468, + "learning_rate": 3.843601559265775e-09, + "loss": 0.8307, + "step": 4896 + }, + { + "epoch": 0.9878990187456461, + "grad_norm": 0.7961161136627197, + "learning_rate": 3.716564916718124e-09, + "loss": 0.7869, + "step": 4897 + }, + { + "epoch": 0.9881007543018532, + "grad_norm": 0.7322169542312622, + "learning_rate": 3.591662294504805e-09, + "loss": 0.6661, + "step": 4898 + }, + { + "epoch": 0.9883024898580601, + "grad_norm": 0.3394564986228943, + "learning_rate": 3.4688937459737004e-09, + "loss": 0.69, + "step": 4899 + }, + { + "epoch": 0.9885042254142671, + "grad_norm": 0.9545552730560303, + "learning_rate": 3.3482593235617533e-09, + "loss": 0.6647, + "step": 4900 + }, + { + "epoch": 0.9887059609704741, + "grad_norm": 0.3521050214767456, + "learning_rate": 3.2297590787955248e-09, + "loss": 0.6554, + "step": 4901 + }, + { + "epoch": 0.9889076965266811, + "grad_norm": 0.4236242473125458, + "learning_rate": 3.1133930622878618e-09, + "loss": 0.8047, + "step": 4902 + }, + { + "epoch": 0.9891094320828882, + "grad_norm": 0.4478450119495392, + "learning_rate": 2.9991613237417837e-09, + "loss": 0.7946, + "step": 4903 + }, + { + "epoch": 0.9893111676390951, + "grad_norm": 0.8264681696891785, + "learning_rate": 2.8870639119482622e-09, + "loss": 0.7464, + "step": 4904 + }, + { + "epoch": 0.989512903195302, + "grad_norm": 1.0444552898406982, + "learning_rate": 2.7771008747867757e-09, + "loss": 0.6486, + "step": 4905 + }, + { + "epoch": 0.9897146387515091, + "grad_norm": 0.34184062480926514, + "learning_rate": 2.669272259223643e-09, + "loss": 0.6797, + "step": 4906 + }, + { + "epoch": 0.9899163743077161, + "grad_norm": 0.42163538932800293, + "learning_rate": 2.563578111315912e-09, + "loss": 0.6164, + "step": 4907 + }, + { + "epoch": 0.990118109863923, + "grad_norm": 0.41893240809440613, + "learning_rate": 2.460018476207471e-09, + "loss": 0.6805, + "step": 4908 + }, + { + "epoch": 0.9903198454201301, + "grad_norm": 0.4893782138824463, + "learning_rate": 2.3585933981312704e-09, + "loss": 0.6462, + "step": 4909 + }, + { + "epoch": 0.990521580976337, + "grad_norm": 0.3577900230884552, + "learning_rate": 2.2593029204076578e-09, + "loss": 0.7524, + "step": 4910 + }, + { + "epoch": 0.9907233165325441, + "grad_norm": 0.3445039391517639, + "learning_rate": 2.1621470854454874e-09, + "loss": 0.6344, + "step": 4911 + }, + { + "epoch": 0.990925052088751, + "grad_norm": 0.4528195261955261, + "learning_rate": 2.067125934742675e-09, + "loss": 0.6931, + "step": 4912 + }, + { + "epoch": 0.991126787644958, + "grad_norm": 0.5998149514198303, + "learning_rate": 1.9742395088845346e-09, + "loss": 0.7681, + "step": 4913 + }, + { + "epoch": 0.991328523201165, + "grad_norm": 0.3263251483440399, + "learning_rate": 1.8834878475454398e-09, + "loss": 0.7044, + "step": 4914 + }, + { + "epoch": 0.991530258757372, + "grad_norm": 0.6413541436195374, + "learning_rate": 1.794870989486608e-09, + "loss": 0.7415, + "step": 4915 + }, + { + "epoch": 0.9917319943135791, + "grad_norm": 0.5035390853881836, + "learning_rate": 1.708388972558317e-09, + "loss": 0.8024, + "step": 4916 + }, + { + "epoch": 0.991933729869786, + "grad_norm": 0.5376717448234558, + "learning_rate": 1.6240418336993525e-09, + "loss": 0.6732, + "step": 4917 + }, + { + "epoch": 0.992135465425993, + "grad_norm": 0.6315147280693054, + "learning_rate": 1.5418296089358964e-09, + "loss": 0.6264, + "step": 4918 + }, + { + "epoch": 0.9923372009822, + "grad_norm": 0.43847543001174927, + "learning_rate": 1.4617523333820827e-09, + "loss": 0.7628, + "step": 4919 + }, + { + "epoch": 0.992538936538407, + "grad_norm": 0.465602844953537, + "learning_rate": 1.3838100412416622e-09, + "loss": 0.6496, + "step": 4920 + }, + { + "epoch": 0.992740672094614, + "grad_norm": 0.43424639105796814, + "learning_rate": 1.3080027658052275e-09, + "loss": 0.818, + "step": 4921 + }, + { + "epoch": 0.992942407650821, + "grad_norm": 0.46224871277809143, + "learning_rate": 1.2343305394507677e-09, + "loss": 0.6756, + "step": 4922 + }, + { + "epoch": 0.9931441432070279, + "grad_norm": 0.8015269637107849, + "learning_rate": 1.1627933936464442e-09, + "loss": 0.7697, + "step": 4923 + }, + { + "epoch": 0.993345878763235, + "grad_norm": 0.36045822501182556, + "learning_rate": 1.0933913589461497e-09, + "loss": 0.7097, + "step": 4924 + }, + { + "epoch": 0.9935476143194419, + "grad_norm": 0.5318560600280762, + "learning_rate": 1.0261244649945045e-09, + "loss": 0.8177, + "step": 4925 + }, + { + "epoch": 0.9937493498756489, + "grad_norm": 0.32715970277786255, + "learning_rate": 9.60992740521305e-10, + "loss": 0.6823, + "step": 4926 + }, + { + "epoch": 0.993951085431856, + "grad_norm": 0.5524141192436218, + "learning_rate": 8.979962133459641e-10, + "loss": 0.6753, + "step": 4927 + }, + { + "epoch": 0.9941528209880629, + "grad_norm": 0.5175045132637024, + "learning_rate": 8.371349103764026e-10, + "loss": 0.7542, + "step": 4928 + }, + { + "epoch": 0.99435455654427, + "grad_norm": 0.42321813106536865, + "learning_rate": 7.784088576068272e-10, + "loss": 0.7561, + "step": 4929 + }, + { + "epoch": 0.9945562921004769, + "grad_norm": 0.7016764283180237, + "learning_rate": 7.218180801210617e-10, + "loss": 1.0079, + "step": 4930 + }, + { + "epoch": 0.9947580276566839, + "grad_norm": 0.3303213119506836, + "learning_rate": 6.673626020903267e-10, + "loss": 0.7609, + "step": 4931 + }, + { + "epoch": 0.9949597632128909, + "grad_norm": 0.3997403681278229, + "learning_rate": 6.150424467732397e-10, + "loss": 0.7826, + "step": 4932 + }, + { + "epoch": 0.9951614987690979, + "grad_norm": 0.5176541209220886, + "learning_rate": 5.648576365169245e-10, + "loss": 0.7491, + "step": 4933 + }, + { + "epoch": 0.9953632343253049, + "grad_norm": 1.5391536951065063, + "learning_rate": 5.168081927564572e-10, + "loss": 0.6885, + "step": 4934 + }, + { + "epoch": 0.9955649698815119, + "grad_norm": 0.4760850965976715, + "learning_rate": 4.708941360148655e-10, + "loss": 0.6359, + "step": 4935 + }, + { + "epoch": 0.9957667054377188, + "grad_norm": 0.41108208894729614, + "learning_rate": 4.2711548590368414e-10, + "loss": 1.0169, + "step": 4936 + }, + { + "epoch": 0.9959684409939259, + "grad_norm": 0.7306568622589111, + "learning_rate": 3.854722611201789e-10, + "loss": 0.6983, + "step": 4937 + }, + { + "epoch": 0.9961701765501328, + "grad_norm": 2.0669448375701904, + "learning_rate": 3.459644794523431e-10, + "loss": 0.6308, + "step": 4938 + }, + { + "epoch": 0.9963719121063399, + "grad_norm": 0.8391724824905396, + "learning_rate": 3.0859215777445663e-10, + "loss": 0.9293, + "step": 4939 + }, + { + "epoch": 0.9965736476625469, + "grad_norm": 0.6950282454490662, + "learning_rate": 2.7335531204930597e-10, + "loss": 0.6589, + "step": 4940 + }, + { + "epoch": 0.9967753832187538, + "grad_norm": 0.36784011125564575, + "learning_rate": 2.4025395732651947e-10, + "loss": 0.679, + "step": 4941 + }, + { + "epoch": 0.9969771187749609, + "grad_norm": 0.3888319134712219, + "learning_rate": 2.0928810774534237e-10, + "loss": 0.6201, + "step": 4942 + }, + { + "epoch": 0.9971788543311678, + "grad_norm": 0.3902921974658966, + "learning_rate": 1.8045777653130648e-10, + "loss": 0.6859, + "step": 4943 + }, + { + "epoch": 0.9973805898873748, + "grad_norm": 2.439812421798706, + "learning_rate": 1.5376297599845046e-10, + "loss": 0.688, + "step": 4944 + }, + { + "epoch": 0.9975823254435818, + "grad_norm": 0.39363893866539, + "learning_rate": 1.2920371754931994e-10, + "loss": 0.8237, + "step": 4945 + }, + { + "epoch": 0.9977840609997888, + "grad_norm": 0.4545292258262634, + "learning_rate": 1.0678001167274688e-10, + "loss": 0.639, + "step": 4946 + }, + { + "epoch": 0.9979857965559958, + "grad_norm": 0.307205468416214, + "learning_rate": 8.649186794773556e-11, + "loss": 0.649, + "step": 4947 + }, + { + "epoch": 0.9981875321122028, + "grad_norm": 0.48131927847862244, + "learning_rate": 6.833929503846648e-11, + "loss": 0.6642, + "step": 4948 + }, + { + "epoch": 0.9983892676684097, + "grad_norm": 0.4924654960632324, + "learning_rate": 5.2322300698737225e-11, + "loss": 0.6458, + "step": 4949 + }, + { + "epoch": 0.9985910032246168, + "grad_norm": 0.4332221448421478, + "learning_rate": 3.8440891769742085e-11, + "loss": 0.7168, + "step": 4950 + }, + { + "epoch": 0.9987927387808238, + "grad_norm": 0.5997542142868042, + "learning_rate": 2.6695074181182225e-11, + "loss": 0.6842, + "step": 4951 + }, + { + "epoch": 0.9989944743370308, + "grad_norm": 0.7213303446769714, + "learning_rate": 1.708485294904527e-11, + "loss": 0.8125, + "step": 4952 + }, + { + "epoch": 0.9991962098932378, + "grad_norm": 0.8712928891181946, + "learning_rate": 9.610232178380862e-12, + "loss": 0.6654, + "step": 4953 + }, + { + "epoch": 0.9993979454494447, + "grad_norm": 0.37508413195610046, + "learning_rate": 4.271215061635303e-12, + "loss": 0.689, + "step": 4954 + }, + { + "epoch": 0.9995996810056518, + "grad_norm": 0.3845532536506653, + "learning_rate": 1.0678038792066857e-12, + "loss": 0.6463, + "step": 4955 + }, + { + "epoch": 0.9998014165618587, + "grad_norm": 0.40240800380706787, + "learning_rate": 0.0, + "loss": 0.6359, + "step": 4956 + }, + { + "epoch": 0.9998014165618587, + "step": 4956, + "total_flos": 6.502525695783076e+18, + "train_loss": 0.7414010693292833, + "train_runtime": 166490.8866, + "train_samples_per_second": 7.622, + "train_steps_per_second": 0.03 + } + ], + "logging_steps": 1.0, + "max_steps": 4956, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.502525695783076e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}