diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,98924 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9998584271253628, + "eval_steps": 500, + "global_step": 14126, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00014157287463721952, + "grad_norm": 63.84707491036199, + "learning_rate": 1.179245283018868e-08, + "loss": 1.8583, + "step": 1 + }, + { + "epoch": 0.00028314574927443904, + "grad_norm": 55.97125814452673, + "learning_rate": 2.358490566037736e-08, + "loss": 1.6209, + "step": 2 + }, + { + "epoch": 0.00042471862391165854, + "grad_norm": 55.47292457882607, + "learning_rate": 3.537735849056604e-08, + "loss": 1.8056, + "step": 3 + }, + { + "epoch": 0.0005662914985488781, + "grad_norm": 106.85951093901282, + "learning_rate": 4.716981132075472e-08, + "loss": 1.7069, + "step": 4 + }, + { + "epoch": 0.0007078643731860976, + "grad_norm": 39.30327824322117, + "learning_rate": 5.89622641509434e-08, + "loss": 1.4781, + "step": 5 + }, + { + "epoch": 0.0008494372478233171, + "grad_norm": 57.27069810407522, + "learning_rate": 7.075471698113208e-08, + "loss": 1.4853, + "step": 6 + }, + { + "epoch": 0.0009910101224605366, + "grad_norm": 60.03881800942412, + "learning_rate": 8.254716981132076e-08, + "loss": 1.6872, + "step": 7 + }, + { + "epoch": 0.0011325829970977562, + "grad_norm": 54.691218623571096, + "learning_rate": 9.433962264150944e-08, + "loss": 1.6029, + "step": 8 + }, + { + "epoch": 0.0012741558717349756, + "grad_norm": 39.057755961924514, + "learning_rate": 1.0613207547169811e-07, + "loss": 1.722, + "step": 9 + }, + { + "epoch": 0.0014157287463721952, + "grad_norm": 52.518007514387385, + "learning_rate": 1.179245283018868e-07, + "loss": 1.5892, + "step": 10 + }, + { + "epoch": 0.0015573016210094145, + "grad_norm": 40.99188300939895, + "learning_rate": 1.297169811320755e-07, + "loss": 1.7743, + "step": 11 + }, + { + "epoch": 0.0016988744956466342, + "grad_norm": 38.71449986762066, + "learning_rate": 1.4150943396226417e-07, + "loss": 1.6641, + "step": 12 + }, + { + "epoch": 0.0018404473702838535, + "grad_norm": 28.593979464584738, + "learning_rate": 1.5330188679245283e-07, + "loss": 1.7126, + "step": 13 + }, + { + "epoch": 0.001982020244921073, + "grad_norm": 25.95256998202442, + "learning_rate": 1.6509433962264153e-07, + "loss": 1.687, + "step": 14 + }, + { + "epoch": 0.0021235931195582925, + "grad_norm": 22.958692525925773, + "learning_rate": 1.768867924528302e-07, + "loss": 1.4344, + "step": 15 + }, + { + "epoch": 0.0022651659941955123, + "grad_norm": 35.51739849289535, + "learning_rate": 1.886792452830189e-07, + "loss": 1.4988, + "step": 16 + }, + { + "epoch": 0.0024067388688327317, + "grad_norm": 31.415543945110795, + "learning_rate": 2.0047169811320755e-07, + "loss": 1.664, + "step": 17 + }, + { + "epoch": 0.002548311743469951, + "grad_norm": 38.73468226733714, + "learning_rate": 2.1226415094339622e-07, + "loss": 1.6399, + "step": 18 + }, + { + "epoch": 0.0026898846181071705, + "grad_norm": 39.95800560481044, + "learning_rate": 2.2405660377358492e-07, + "loss": 1.562, + "step": 19 + }, + { + "epoch": 0.0028314574927443903, + "grad_norm": 38.70349717221139, + "learning_rate": 2.358490566037736e-07, + "loss": 1.6775, + "step": 20 + }, + { + "epoch": 0.0029730303673816097, + "grad_norm": 37.92284068609741, + "learning_rate": 2.476415094339623e-07, + "loss": 1.5582, + "step": 21 + }, + { + 
"epoch": 0.003114603242018829, + "grad_norm": 29.92503654968528, + "learning_rate": 2.59433962264151e-07, + "loss": 1.7453, + "step": 22 + }, + { + "epoch": 0.0032561761166560485, + "grad_norm": 17.986794608085052, + "learning_rate": 2.7122641509433966e-07, + "loss": 1.483, + "step": 23 + }, + { + "epoch": 0.0033977489912932683, + "grad_norm": 19.196691641288172, + "learning_rate": 2.8301886792452833e-07, + "loss": 1.7226, + "step": 24 + }, + { + "epoch": 0.0035393218659304877, + "grad_norm": 19.784474090535298, + "learning_rate": 2.94811320754717e-07, + "loss": 1.6687, + "step": 25 + }, + { + "epoch": 0.003680894740567707, + "grad_norm": 19.379568041385834, + "learning_rate": 3.0660377358490567e-07, + "loss": 1.6947, + "step": 26 + }, + { + "epoch": 0.003822467615204927, + "grad_norm": 20.93732144863233, + "learning_rate": 3.183962264150944e-07, + "loss": 1.5904, + "step": 27 + }, + { + "epoch": 0.003964040489842146, + "grad_norm": 20.618692809294735, + "learning_rate": 3.3018867924528305e-07, + "loss": 1.7195, + "step": 28 + }, + { + "epoch": 0.004105613364479366, + "grad_norm": 21.44965376141011, + "learning_rate": 3.419811320754717e-07, + "loss": 1.5418, + "step": 29 + }, + { + "epoch": 0.004247186239116585, + "grad_norm": 24.713102329691683, + "learning_rate": 3.537735849056604e-07, + "loss": 1.5442, + "step": 30 + }, + { + "epoch": 0.004388759113753805, + "grad_norm": 22.915087984212906, + "learning_rate": 3.6556603773584905e-07, + "loss": 1.5081, + "step": 31 + }, + { + "epoch": 0.004530331988391025, + "grad_norm": 16.63497923592595, + "learning_rate": 3.773584905660378e-07, + "loss": 1.6978, + "step": 32 + }, + { + "epoch": 0.004671904863028244, + "grad_norm": 17.82796580505173, + "learning_rate": 3.8915094339622644e-07, + "loss": 1.5757, + "step": 33 + }, + { + "epoch": 0.0048134777376654635, + "grad_norm": 14.200117105350365, + "learning_rate": 4.009433962264151e-07, + "loss": 1.4431, + "step": 34 + }, + { + "epoch": 0.004955050612302682, + "grad_norm": 19.485917704063397, + "learning_rate": 4.127358490566038e-07, + "loss": 1.7673, + "step": 35 + }, + { + "epoch": 0.005096623486939902, + "grad_norm": 16.828062434598177, + "learning_rate": 4.2452830188679244e-07, + "loss": 1.647, + "step": 36 + }, + { + "epoch": 0.005238196361577122, + "grad_norm": 18.20557923369578, + "learning_rate": 4.3632075471698116e-07, + "loss": 1.6429, + "step": 37 + }, + { + "epoch": 0.005379769236214341, + "grad_norm": 17.973172010231668, + "learning_rate": 4.4811320754716983e-07, + "loss": 1.6643, + "step": 38 + }, + { + "epoch": 0.005521342110851561, + "grad_norm": 21.95264401163474, + "learning_rate": 4.599056603773585e-07, + "loss": 1.7284, + "step": 39 + }, + { + "epoch": 0.005662914985488781, + "grad_norm": 20.081733017261296, + "learning_rate": 4.716981132075472e-07, + "loss": 1.5981, + "step": 40 + }, + { + "epoch": 0.005804487860126, + "grad_norm": 15.272347507461708, + "learning_rate": 4.834905660377359e-07, + "loss": 1.3697, + "step": 41 + }, + { + "epoch": 0.005946060734763219, + "grad_norm": 16.281213449155228, + "learning_rate": 4.952830188679246e-07, + "loss": 1.5138, + "step": 42 + }, + { + "epoch": 0.006087633609400439, + "grad_norm": 18.346455727037323, + "learning_rate": 5.070754716981133e-07, + "loss": 1.6156, + "step": 43 + }, + { + "epoch": 0.006229206484037658, + "grad_norm": 17.027650673277623, + "learning_rate": 5.18867924528302e-07, + "loss": 1.6837, + "step": 44 + }, + { + "epoch": 0.006370779358674878, + "grad_norm": 17.554328510737477, + "learning_rate": 
5.306603773584906e-07, + "loss": 1.6768, + "step": 45 + }, + { + "epoch": 0.006512352233312097, + "grad_norm": 14.118845991574137, + "learning_rate": 5.424528301886793e-07, + "loss": 1.5004, + "step": 46 + }, + { + "epoch": 0.006653925107949317, + "grad_norm": 14.766144790784827, + "learning_rate": 5.542452830188679e-07, + "loss": 1.5461, + "step": 47 + }, + { + "epoch": 0.006795497982586537, + "grad_norm": 21.987237869964265, + "learning_rate": 5.660377358490567e-07, + "loss": 1.4939, + "step": 48 + }, + { + "epoch": 0.0069370708572237556, + "grad_norm": 19.499311386078485, + "learning_rate": 5.778301886792454e-07, + "loss": 1.7227, + "step": 49 + }, + { + "epoch": 0.007078643731860975, + "grad_norm": 15.671376000748497, + "learning_rate": 5.89622641509434e-07, + "loss": 1.6233, + "step": 50 + }, + { + "epoch": 0.007220216606498195, + "grad_norm": 17.439377769079268, + "learning_rate": 6.014150943396227e-07, + "loss": 1.7287, + "step": 51 + }, + { + "epoch": 0.007361789481135414, + "grad_norm": 14.003872472065261, + "learning_rate": 6.132075471698113e-07, + "loss": 1.6392, + "step": 52 + }, + { + "epoch": 0.007503362355772634, + "grad_norm": 13.756472087420967, + "learning_rate": 6.25e-07, + "loss": 1.4719, + "step": 53 + }, + { + "epoch": 0.007644935230409854, + "grad_norm": 13.88924412251876, + "learning_rate": 6.367924528301888e-07, + "loss": 1.4071, + "step": 54 + }, + { + "epoch": 0.007786508105047073, + "grad_norm": 18.097433239101893, + "learning_rate": 6.485849056603774e-07, + "loss": 1.5474, + "step": 55 + }, + { + "epoch": 0.007928080979684293, + "grad_norm": 19.15859169516871, + "learning_rate": 6.603773584905661e-07, + "loss": 1.694, + "step": 56 + }, + { + "epoch": 0.008069653854321512, + "grad_norm": 19.846306583587932, + "learning_rate": 6.721698113207547e-07, + "loss": 1.681, + "step": 57 + }, + { + "epoch": 0.008211226728958732, + "grad_norm": 15.972953825333935, + "learning_rate": 6.839622641509434e-07, + "loss": 1.4197, + "step": 58 + }, + { + "epoch": 0.00835279960359595, + "grad_norm": 15.230586968956427, + "learning_rate": 6.957547169811322e-07, + "loss": 1.6777, + "step": 59 + }, + { + "epoch": 0.00849437247823317, + "grad_norm": 14.869030179257814, + "learning_rate": 7.075471698113208e-07, + "loss": 1.5724, + "step": 60 + }, + { + "epoch": 0.00863594535287039, + "grad_norm": 18.825678166994823, + "learning_rate": 7.193396226415095e-07, + "loss": 1.7128, + "step": 61 + }, + { + "epoch": 0.00877751822750761, + "grad_norm": 21.544871501986485, + "learning_rate": 7.311320754716981e-07, + "loss": 1.5879, + "step": 62 + }, + { + "epoch": 0.00891909110214483, + "grad_norm": 14.199419165538846, + "learning_rate": 7.429245283018868e-07, + "loss": 1.4421, + "step": 63 + }, + { + "epoch": 0.00906066397678205, + "grad_norm": 16.192808952358394, + "learning_rate": 7.547169811320755e-07, + "loss": 1.726, + "step": 64 + }, + { + "epoch": 0.009202236851419267, + "grad_norm": 15.347477291206577, + "learning_rate": 7.665094339622642e-07, + "loss": 1.5107, + "step": 65 + }, + { + "epoch": 0.009343809726056487, + "grad_norm": 16.19939348003788, + "learning_rate": 7.783018867924529e-07, + "loss": 1.5814, + "step": 66 + }, + { + "epoch": 0.009485382600693707, + "grad_norm": 13.51898158735264, + "learning_rate": 7.900943396226415e-07, + "loss": 1.5254, + "step": 67 + }, + { + "epoch": 0.009626955475330927, + "grad_norm": 17.11927688640152, + "learning_rate": 8.018867924528302e-07, + "loss": 1.7402, + "step": 68 + }, + { + "epoch": 0.009768528349968147, + "grad_norm": 16.34138684666224, + 
"learning_rate": 8.136792452830189e-07, + "loss": 1.6328, + "step": 69 + }, + { + "epoch": 0.009910101224605365, + "grad_norm": 15.33422058413311, + "learning_rate": 8.254716981132076e-07, + "loss": 1.5055, + "step": 70 + }, + { + "epoch": 0.010051674099242585, + "grad_norm": 16.604385789345123, + "learning_rate": 8.372641509433963e-07, + "loss": 1.8437, + "step": 71 + }, + { + "epoch": 0.010193246973879804, + "grad_norm": 22.59078770129195, + "learning_rate": 8.490566037735849e-07, + "loss": 1.7667, + "step": 72 + }, + { + "epoch": 0.010334819848517024, + "grad_norm": 15.591242831389156, + "learning_rate": 8.608490566037736e-07, + "loss": 1.6976, + "step": 73 + }, + { + "epoch": 0.010476392723154244, + "grad_norm": 16.853932885559306, + "learning_rate": 8.726415094339623e-07, + "loss": 1.8321, + "step": 74 + }, + { + "epoch": 0.010617965597791464, + "grad_norm": 17.897127127665716, + "learning_rate": 8.844339622641509e-07, + "loss": 1.6987, + "step": 75 + }, + { + "epoch": 0.010759538472428682, + "grad_norm": 19.453441889989225, + "learning_rate": 8.962264150943397e-07, + "loss": 1.5366, + "step": 76 + }, + { + "epoch": 0.010901111347065902, + "grad_norm": 15.923868730208563, + "learning_rate": 9.080188679245283e-07, + "loss": 1.6872, + "step": 77 + }, + { + "epoch": 0.011042684221703122, + "grad_norm": 14.695457867436243, + "learning_rate": 9.19811320754717e-07, + "loss": 1.6081, + "step": 78 + }, + { + "epoch": 0.011184257096340341, + "grad_norm": 13.220793581875874, + "learning_rate": 9.316037735849057e-07, + "loss": 1.5088, + "step": 79 + }, + { + "epoch": 0.011325829970977561, + "grad_norm": 16.509900446221373, + "learning_rate": 9.433962264150944e-07, + "loss": 1.6059, + "step": 80 + }, + { + "epoch": 0.01146740284561478, + "grad_norm": 19.10985010013447, + "learning_rate": 9.551886792452833e-07, + "loss": 1.5191, + "step": 81 + }, + { + "epoch": 0.011608975720252, + "grad_norm": 13.444186086422903, + "learning_rate": 9.669811320754719e-07, + "loss": 1.5635, + "step": 82 + }, + { + "epoch": 0.011750548594889219, + "grad_norm": 15.684371575512676, + "learning_rate": 9.787735849056605e-07, + "loss": 1.657, + "step": 83 + }, + { + "epoch": 0.011892121469526439, + "grad_norm": 17.437719637673652, + "learning_rate": 9.90566037735849e-07, + "loss": 1.7334, + "step": 84 + }, + { + "epoch": 0.012033694344163659, + "grad_norm": 16.434440066899146, + "learning_rate": 1.002358490566038e-06, + "loss": 1.4781, + "step": 85 + }, + { + "epoch": 0.012175267218800878, + "grad_norm": 14.027730453629607, + "learning_rate": 1.0141509433962265e-06, + "loss": 1.5501, + "step": 86 + }, + { + "epoch": 0.012316840093438097, + "grad_norm": 12.857080526049462, + "learning_rate": 1.0259433962264152e-06, + "loss": 1.4878, + "step": 87 + }, + { + "epoch": 0.012458412968075316, + "grad_norm": 14.444642253075566, + "learning_rate": 1.037735849056604e-06, + "loss": 1.7252, + "step": 88 + }, + { + "epoch": 0.012599985842712536, + "grad_norm": 19.848812869147945, + "learning_rate": 1.0495283018867926e-06, + "loss": 1.5859, + "step": 89 + }, + { + "epoch": 0.012741558717349756, + "grad_norm": 14.409536785275609, + "learning_rate": 1.0613207547169812e-06, + "loss": 1.3952, + "step": 90 + }, + { + "epoch": 0.012883131591986976, + "grad_norm": 13.819869203072765, + "learning_rate": 1.07311320754717e-06, + "loss": 1.5638, + "step": 91 + }, + { + "epoch": 0.013024704466624194, + "grad_norm": 12.706403808863712, + "learning_rate": 1.0849056603773587e-06, + "loss": 1.6233, + "step": 92 + }, + { + "epoch": 
0.013166277341261414, + "grad_norm": 14.19695004227773, + "learning_rate": 1.0966981132075473e-06, + "loss": 1.5352, + "step": 93 + }, + { + "epoch": 0.013307850215898634, + "grad_norm": 13.606861605060066, + "learning_rate": 1.1084905660377359e-06, + "loss": 1.4425, + "step": 94 + }, + { + "epoch": 0.013449423090535853, + "grad_norm": 15.755666530871718, + "learning_rate": 1.1202830188679247e-06, + "loss": 1.4741, + "step": 95 + }, + { + "epoch": 0.013590995965173073, + "grad_norm": 14.877809403486939, + "learning_rate": 1.1320754716981133e-06, + "loss": 1.5541, + "step": 96 + }, + { + "epoch": 0.013732568839810293, + "grad_norm": 13.255857504466126, + "learning_rate": 1.143867924528302e-06, + "loss": 1.5647, + "step": 97 + }, + { + "epoch": 0.013874141714447511, + "grad_norm": 13.855458838451487, + "learning_rate": 1.1556603773584908e-06, + "loss": 1.5705, + "step": 98 + }, + { + "epoch": 0.014015714589084731, + "grad_norm": 13.55374692792118, + "learning_rate": 1.1674528301886794e-06, + "loss": 1.4713, + "step": 99 + }, + { + "epoch": 0.01415728746372195, + "grad_norm": 14.95035118277431, + "learning_rate": 1.179245283018868e-06, + "loss": 1.6928, + "step": 100 + }, + { + "epoch": 0.01429886033835917, + "grad_norm": 14.35835058233753, + "learning_rate": 1.1910377358490568e-06, + "loss": 1.4509, + "step": 101 + }, + { + "epoch": 0.01444043321299639, + "grad_norm": 14.356722946912853, + "learning_rate": 1.2028301886792454e-06, + "loss": 1.6253, + "step": 102 + }, + { + "epoch": 0.01458200608763361, + "grad_norm": 13.253531507191182, + "learning_rate": 1.214622641509434e-06, + "loss": 1.4329, + "step": 103 + }, + { + "epoch": 0.014723578962270828, + "grad_norm": 12.068835872461198, + "learning_rate": 1.2264150943396227e-06, + "loss": 1.5758, + "step": 104 + }, + { + "epoch": 0.014865151836908048, + "grad_norm": 16.067234870286583, + "learning_rate": 1.2382075471698115e-06, + "loss": 1.6764, + "step": 105 + }, + { + "epoch": 0.015006724711545268, + "grad_norm": 13.277565550515734, + "learning_rate": 1.25e-06, + "loss": 1.5468, + "step": 106 + }, + { + "epoch": 0.015148297586182488, + "grad_norm": 13.694581590592911, + "learning_rate": 1.261792452830189e-06, + "loss": 1.5405, + "step": 107 + }, + { + "epoch": 0.015289870460819708, + "grad_norm": 13.575376524366472, + "learning_rate": 1.2735849056603775e-06, + "loss": 1.4807, + "step": 108 + }, + { + "epoch": 0.015431443335456926, + "grad_norm": 13.251138890101696, + "learning_rate": 1.2853773584905664e-06, + "loss": 1.7341, + "step": 109 + }, + { + "epoch": 0.015573016210094145, + "grad_norm": 12.48415499649706, + "learning_rate": 1.2971698113207548e-06, + "loss": 1.6548, + "step": 110 + }, + { + "epoch": 0.015714589084731365, + "grad_norm": 14.528873503585993, + "learning_rate": 1.3089622641509436e-06, + "loss": 1.5195, + "step": 111 + }, + { + "epoch": 0.015856161959368585, + "grad_norm": 10.910482465249267, + "learning_rate": 1.3207547169811322e-06, + "loss": 1.4978, + "step": 112 + }, + { + "epoch": 0.015997734834005805, + "grad_norm": 13.338750183946267, + "learning_rate": 1.332547169811321e-06, + "loss": 1.5451, + "step": 113 + }, + { + "epoch": 0.016139307708643025, + "grad_norm": 14.897790711740459, + "learning_rate": 1.3443396226415094e-06, + "loss": 1.6466, + "step": 114 + }, + { + "epoch": 0.016280880583280245, + "grad_norm": 15.685948511737843, + "learning_rate": 1.3561320754716983e-06, + "loss": 1.4926, + "step": 115 + }, + { + "epoch": 0.016422453457917464, + "grad_norm": 13.013156982019078, + "learning_rate": 
1.3679245283018869e-06, + "loss": 1.5655, + "step": 116 + }, + { + "epoch": 0.016564026332554684, + "grad_norm": 15.222617658566685, + "learning_rate": 1.3797169811320757e-06, + "loss": 1.6512, + "step": 117 + }, + { + "epoch": 0.0167055992071919, + "grad_norm": 14.619333065909409, + "learning_rate": 1.3915094339622643e-06, + "loss": 1.4915, + "step": 118 + }, + { + "epoch": 0.01684717208182912, + "grad_norm": 13.37392642266976, + "learning_rate": 1.4033018867924531e-06, + "loss": 1.6999, + "step": 119 + }, + { + "epoch": 0.01698874495646634, + "grad_norm": 16.901172376067493, + "learning_rate": 1.4150943396226415e-06, + "loss": 1.4944, + "step": 120 + }, + { + "epoch": 0.01713031783110356, + "grad_norm": 13.734780506314639, + "learning_rate": 1.4268867924528304e-06, + "loss": 1.5935, + "step": 121 + }, + { + "epoch": 0.01727189070574078, + "grad_norm": 17.245475728592375, + "learning_rate": 1.438679245283019e-06, + "loss": 1.7136, + "step": 122 + }, + { + "epoch": 0.017413463580378, + "grad_norm": 16.03460765276547, + "learning_rate": 1.4504716981132078e-06, + "loss": 1.6388, + "step": 123 + }, + { + "epoch": 0.01755503645501522, + "grad_norm": 14.105209665129605, + "learning_rate": 1.4622641509433962e-06, + "loss": 1.5156, + "step": 124 + }, + { + "epoch": 0.01769660932965244, + "grad_norm": 14.131155107694996, + "learning_rate": 1.474056603773585e-06, + "loss": 1.4179, + "step": 125 + }, + { + "epoch": 0.01783818220428966, + "grad_norm": 13.715693174528676, + "learning_rate": 1.4858490566037737e-06, + "loss": 1.5075, + "step": 126 + }, + { + "epoch": 0.01797975507892688, + "grad_norm": 13.156183890496015, + "learning_rate": 1.4976415094339625e-06, + "loss": 1.5233, + "step": 127 + }, + { + "epoch": 0.0181213279535641, + "grad_norm": 14.943304617964309, + "learning_rate": 1.509433962264151e-06, + "loss": 1.576, + "step": 128 + }, + { + "epoch": 0.018262900828201315, + "grad_norm": 15.375239176556835, + "learning_rate": 1.52122641509434e-06, + "loss": 1.5976, + "step": 129 + }, + { + "epoch": 0.018404473702838535, + "grad_norm": 16.322948865568645, + "learning_rate": 1.5330188679245283e-06, + "loss": 1.5658, + "step": 130 + }, + { + "epoch": 0.018546046577475755, + "grad_norm": 14.668642315593237, + "learning_rate": 1.5448113207547172e-06, + "loss": 1.6371, + "step": 131 + }, + { + "epoch": 0.018687619452112975, + "grad_norm": 12.54557487621071, + "learning_rate": 1.5566037735849058e-06, + "loss": 1.6308, + "step": 132 + }, + { + "epoch": 0.018829192326750194, + "grad_norm": 13.730156468710524, + "learning_rate": 1.5683962264150946e-06, + "loss": 1.5501, + "step": 133 + }, + { + "epoch": 0.018970765201387414, + "grad_norm": 12.947855320378775, + "learning_rate": 1.580188679245283e-06, + "loss": 1.6422, + "step": 134 + }, + { + "epoch": 0.019112338076024634, + "grad_norm": 15.812208090036545, + "learning_rate": 1.5919811320754718e-06, + "loss": 1.4228, + "step": 135 + }, + { + "epoch": 0.019253910950661854, + "grad_norm": 12.463533844914142, + "learning_rate": 1.6037735849056604e-06, + "loss": 1.3262, + "step": 136 + }, + { + "epoch": 0.019395483825299074, + "grad_norm": 18.82834624481006, + "learning_rate": 1.6155660377358493e-06, + "loss": 1.8582, + "step": 137 + }, + { + "epoch": 0.019537056699936294, + "grad_norm": 14.707654265522446, + "learning_rate": 1.6273584905660379e-06, + "loss": 1.4652, + "step": 138 + }, + { + "epoch": 0.019678629574573513, + "grad_norm": 13.564922063753375, + "learning_rate": 1.6391509433962267e-06, + "loss": 1.6464, + "step": 139 + }, + { + "epoch": 
0.01982020244921073, + "grad_norm": 13.096845383862064, + "learning_rate": 1.650943396226415e-06, + "loss": 1.4895, + "step": 140 + }, + { + "epoch": 0.01996177532384795, + "grad_norm": 14.912794807634462, + "learning_rate": 1.662735849056604e-06, + "loss": 1.577, + "step": 141 + }, + { + "epoch": 0.02010334819848517, + "grad_norm": 16.625749843580834, + "learning_rate": 1.6745283018867925e-06, + "loss": 1.5344, + "step": 142 + }, + { + "epoch": 0.02024492107312239, + "grad_norm": 13.118412362779342, + "learning_rate": 1.6863207547169814e-06, + "loss": 1.5609, + "step": 143 + }, + { + "epoch": 0.02038649394775961, + "grad_norm": 12.789803401339752, + "learning_rate": 1.6981132075471698e-06, + "loss": 1.4033, + "step": 144 + }, + { + "epoch": 0.02052806682239683, + "grad_norm": 13.561377131657066, + "learning_rate": 1.7099056603773586e-06, + "loss": 1.4907, + "step": 145 + }, + { + "epoch": 0.02066963969703405, + "grad_norm": 16.932227183179787, + "learning_rate": 1.7216981132075472e-06, + "loss": 1.7734, + "step": 146 + }, + { + "epoch": 0.02081121257167127, + "grad_norm": 15.48548652670461, + "learning_rate": 1.733490566037736e-06, + "loss": 1.6198, + "step": 147 + }, + { + "epoch": 0.020952785446308488, + "grad_norm": 15.300986683264753, + "learning_rate": 1.7452830188679247e-06, + "loss": 1.5848, + "step": 148 + }, + { + "epoch": 0.021094358320945708, + "grad_norm": 16.437703294548463, + "learning_rate": 1.7570754716981135e-06, + "loss": 1.5339, + "step": 149 + }, + { + "epoch": 0.021235931195582928, + "grad_norm": 13.028215160135677, + "learning_rate": 1.7688679245283019e-06, + "loss": 1.4365, + "step": 150 + }, + { + "epoch": 0.021377504070220144, + "grad_norm": 15.91159045908045, + "learning_rate": 1.7806603773584907e-06, + "loss": 1.6195, + "step": 151 + }, + { + "epoch": 0.021519076944857364, + "grad_norm": 16.954751326082516, + "learning_rate": 1.7924528301886793e-06, + "loss": 1.551, + "step": 152 + }, + { + "epoch": 0.021660649819494584, + "grad_norm": 15.046821760517183, + "learning_rate": 1.8042452830188682e-06, + "loss": 1.825, + "step": 153 + }, + { + "epoch": 0.021802222694131804, + "grad_norm": 15.091352378961156, + "learning_rate": 1.8160377358490566e-06, + "loss": 1.6292, + "step": 154 + }, + { + "epoch": 0.021943795568769024, + "grad_norm": 14.432539182550636, + "learning_rate": 1.8278301886792454e-06, + "loss": 1.5909, + "step": 155 + }, + { + "epoch": 0.022085368443406243, + "grad_norm": 15.350198730772796, + "learning_rate": 1.839622641509434e-06, + "loss": 1.3723, + "step": 156 + }, + { + "epoch": 0.022226941318043463, + "grad_norm": 12.139384438307225, + "learning_rate": 1.8514150943396228e-06, + "loss": 1.5192, + "step": 157 + }, + { + "epoch": 0.022368514192680683, + "grad_norm": 14.336121103218012, + "learning_rate": 1.8632075471698114e-06, + "loss": 1.5375, + "step": 158 + }, + { + "epoch": 0.022510087067317903, + "grad_norm": 12.27092155247805, + "learning_rate": 1.8750000000000003e-06, + "loss": 1.4674, + "step": 159 + }, + { + "epoch": 0.022651659941955123, + "grad_norm": 14.587461108995125, + "learning_rate": 1.8867924528301889e-06, + "loss": 1.6306, + "step": 160 + }, + { + "epoch": 0.022793232816592342, + "grad_norm": 18.411363393349042, + "learning_rate": 1.8985849056603775e-06, + "loss": 1.5669, + "step": 161 + }, + { + "epoch": 0.02293480569122956, + "grad_norm": 13.522875584079742, + "learning_rate": 1.9103773584905665e-06, + "loss": 1.5095, + "step": 162 + }, + { + "epoch": 0.02307637856586678, + "grad_norm": 12.772640695713509, + "learning_rate": 
1.9221698113207547e-06, + "loss": 1.5482, + "step": 163 + }, + { + "epoch": 0.023217951440504, + "grad_norm": 13.092669094145359, + "learning_rate": 1.9339622641509438e-06, + "loss": 1.6376, + "step": 164 + }, + { + "epoch": 0.023359524315141218, + "grad_norm": 14.703520393527892, + "learning_rate": 1.9457547169811324e-06, + "loss": 1.4219, + "step": 165 + }, + { + "epoch": 0.023501097189778438, + "grad_norm": 14.982875967771507, + "learning_rate": 1.957547169811321e-06, + "loss": 1.6373, + "step": 166 + }, + { + "epoch": 0.023642670064415658, + "grad_norm": 13.031265807656778, + "learning_rate": 1.9693396226415096e-06, + "loss": 1.5, + "step": 167 + }, + { + "epoch": 0.023784242939052878, + "grad_norm": 13.171651419736513, + "learning_rate": 1.981132075471698e-06, + "loss": 1.5495, + "step": 168 + }, + { + "epoch": 0.023925815813690098, + "grad_norm": 12.024859427980658, + "learning_rate": 1.992924528301887e-06, + "loss": 1.5937, + "step": 169 + }, + { + "epoch": 0.024067388688327317, + "grad_norm": 11.309674494989553, + "learning_rate": 2.004716981132076e-06, + "loss": 1.5181, + "step": 170 + }, + { + "epoch": 0.024208961562964537, + "grad_norm": 12.096265585430954, + "learning_rate": 2.0165094339622645e-06, + "loss": 1.5963, + "step": 171 + }, + { + "epoch": 0.024350534437601757, + "grad_norm": 19.9050879026962, + "learning_rate": 2.028301886792453e-06, + "loss": 1.6418, + "step": 172 + }, + { + "epoch": 0.024492107312238973, + "grad_norm": 11.237402071754202, + "learning_rate": 2.0400943396226417e-06, + "loss": 1.3917, + "step": 173 + }, + { + "epoch": 0.024633680186876193, + "grad_norm": 24.721165979040737, + "learning_rate": 2.0518867924528303e-06, + "loss": 1.6966, + "step": 174 + }, + { + "epoch": 0.024775253061513413, + "grad_norm": 13.442270115367128, + "learning_rate": 2.063679245283019e-06, + "loss": 1.3873, + "step": 175 + }, + { + "epoch": 0.024916825936150633, + "grad_norm": 18.067810333952256, + "learning_rate": 2.075471698113208e-06, + "loss": 1.6409, + "step": 176 + }, + { + "epoch": 0.025058398810787853, + "grad_norm": 12.152404647669659, + "learning_rate": 2.087264150943396e-06, + "loss": 1.5051, + "step": 177 + }, + { + "epoch": 0.025199971685425072, + "grad_norm": 11.072792566895318, + "learning_rate": 2.099056603773585e-06, + "loss": 1.5607, + "step": 178 + }, + { + "epoch": 0.025341544560062292, + "grad_norm": 17.038763394325674, + "learning_rate": 2.110849056603774e-06, + "loss": 1.6831, + "step": 179 + }, + { + "epoch": 0.025483117434699512, + "grad_norm": 12.727139791809998, + "learning_rate": 2.1226415094339624e-06, + "loss": 1.4876, + "step": 180 + }, + { + "epoch": 0.025624690309336732, + "grad_norm": 15.984658515630796, + "learning_rate": 2.134433962264151e-06, + "loss": 1.4248, + "step": 181 + }, + { + "epoch": 0.02576626318397395, + "grad_norm": 13.670608200239414, + "learning_rate": 2.14622641509434e-06, + "loss": 1.5833, + "step": 182 + }, + { + "epoch": 0.02590783605861117, + "grad_norm": 11.84797500498503, + "learning_rate": 2.1580188679245283e-06, + "loss": 1.5871, + "step": 183 + }, + { + "epoch": 0.026049408933248388, + "grad_norm": 15.782458087267877, + "learning_rate": 2.1698113207547173e-06, + "loss": 1.6402, + "step": 184 + }, + { + "epoch": 0.026190981807885608, + "grad_norm": 15.444899048430552, + "learning_rate": 2.181603773584906e-06, + "loss": 1.4687, + "step": 185 + }, + { + "epoch": 0.026332554682522828, + "grad_norm": 12.700725408547353, + "learning_rate": 2.1933962264150945e-06, + "loss": 1.7111, + "step": 186 + }, + { + "epoch": 
0.026474127557160047, + "grad_norm": 12.791023826392621, + "learning_rate": 2.205188679245283e-06, + "loss": 1.5864, + "step": 187 + }, + { + "epoch": 0.026615700431797267, + "grad_norm": 19.092763023028272, + "learning_rate": 2.2169811320754718e-06, + "loss": 1.5508, + "step": 188 + }, + { + "epoch": 0.026757273306434487, + "grad_norm": 14.931716118391561, + "learning_rate": 2.2287735849056604e-06, + "loss": 1.6208, + "step": 189 + }, + { + "epoch": 0.026898846181071707, + "grad_norm": 12.647548368609947, + "learning_rate": 2.2405660377358494e-06, + "loss": 1.523, + "step": 190 + }, + { + "epoch": 0.027040419055708927, + "grad_norm": 16.25359128735333, + "learning_rate": 2.252358490566038e-06, + "loss": 1.6727, + "step": 191 + }, + { + "epoch": 0.027181991930346146, + "grad_norm": 17.20422126956601, + "learning_rate": 2.2641509433962266e-06, + "loss": 1.626, + "step": 192 + }, + { + "epoch": 0.027323564804983366, + "grad_norm": 21.05076711529958, + "learning_rate": 2.2759433962264153e-06, + "loss": 1.7074, + "step": 193 + }, + { + "epoch": 0.027465137679620586, + "grad_norm": 16.915585237460196, + "learning_rate": 2.287735849056604e-06, + "loss": 1.5558, + "step": 194 + }, + { + "epoch": 0.027606710554257806, + "grad_norm": 21.809565917902717, + "learning_rate": 2.2995283018867925e-06, + "loss": 1.7469, + "step": 195 + }, + { + "epoch": 0.027748283428895022, + "grad_norm": 14.04482124429427, + "learning_rate": 2.3113207547169815e-06, + "loss": 1.4616, + "step": 196 + }, + { + "epoch": 0.027889856303532242, + "grad_norm": 17.697562977753815, + "learning_rate": 2.3231132075471697e-06, + "loss": 1.4925, + "step": 197 + }, + { + "epoch": 0.028031429178169462, + "grad_norm": 19.46590631820318, + "learning_rate": 2.3349056603773588e-06, + "loss": 1.5952, + "step": 198 + }, + { + "epoch": 0.02817300205280668, + "grad_norm": 10.736677919476863, + "learning_rate": 2.3466981132075474e-06, + "loss": 1.5475, + "step": 199 + }, + { + "epoch": 0.0283145749274439, + "grad_norm": 16.821841909683226, + "learning_rate": 2.358490566037736e-06, + "loss": 1.5889, + "step": 200 + }, + { + "epoch": 0.02845614780208112, + "grad_norm": 14.392989338210592, + "learning_rate": 2.3702830188679246e-06, + "loss": 1.6246, + "step": 201 + }, + { + "epoch": 0.02859772067671834, + "grad_norm": 14.68586936177538, + "learning_rate": 2.3820754716981136e-06, + "loss": 1.5908, + "step": 202 + }, + { + "epoch": 0.02873929355135556, + "grad_norm": 11.30225393046411, + "learning_rate": 2.393867924528302e-06, + "loss": 1.6981, + "step": 203 + }, + { + "epoch": 0.02888086642599278, + "grad_norm": 15.139140431280994, + "learning_rate": 2.405660377358491e-06, + "loss": 1.4584, + "step": 204 + }, + { + "epoch": 0.02902243930063, + "grad_norm": 18.136757030031752, + "learning_rate": 2.4174528301886795e-06, + "loss": 1.7026, + "step": 205 + }, + { + "epoch": 0.02916401217526722, + "grad_norm": 16.90510732910601, + "learning_rate": 2.429245283018868e-06, + "loss": 1.7415, + "step": 206 + }, + { + "epoch": 0.029305585049904437, + "grad_norm": 15.73847903356735, + "learning_rate": 2.4410377358490567e-06, + "loss": 1.5474, + "step": 207 + }, + { + "epoch": 0.029447157924541657, + "grad_norm": 13.883660440304627, + "learning_rate": 2.4528301886792453e-06, + "loss": 1.5644, + "step": 208 + }, + { + "epoch": 0.029588730799178876, + "grad_norm": 15.753096987489972, + "learning_rate": 2.464622641509434e-06, + "loss": 1.4789, + "step": 209 + }, + { + "epoch": 0.029730303673816096, + "grad_norm": 11.703151728315607, + "learning_rate": 
2.476415094339623e-06, + "loss": 1.5977, + "step": 210 + }, + { + "epoch": 0.029871876548453316, + "grad_norm": 20.35579220681886, + "learning_rate": 2.4882075471698116e-06, + "loss": 1.53, + "step": 211 + }, + { + "epoch": 0.030013449423090536, + "grad_norm": 13.078082413871822, + "learning_rate": 2.5e-06, + "loss": 1.4975, + "step": 212 + }, + { + "epoch": 0.030155022297727756, + "grad_norm": 25.05771536116359, + "learning_rate": 2.511792452830189e-06, + "loss": 1.5969, + "step": 213 + }, + { + "epoch": 0.030296595172364976, + "grad_norm": 15.77066089665197, + "learning_rate": 2.523584905660378e-06, + "loss": 1.5186, + "step": 214 + }, + { + "epoch": 0.030438168047002195, + "grad_norm": 12.83569677478928, + "learning_rate": 2.535377358490566e-06, + "loss": 1.6209, + "step": 215 + }, + { + "epoch": 0.030579740921639415, + "grad_norm": 20.116665648571242, + "learning_rate": 2.547169811320755e-06, + "loss": 1.7061, + "step": 216 + }, + { + "epoch": 0.030721313796276635, + "grad_norm": 14.944487529505027, + "learning_rate": 2.5589622641509437e-06, + "loss": 1.6152, + "step": 217 + }, + { + "epoch": 0.03086288667091385, + "grad_norm": 11.449867107836203, + "learning_rate": 2.5707547169811327e-06, + "loss": 1.6268, + "step": 218 + }, + { + "epoch": 0.03100445954555107, + "grad_norm": 15.189741809049485, + "learning_rate": 2.582547169811321e-06, + "loss": 1.6159, + "step": 219 + }, + { + "epoch": 0.03114603242018829, + "grad_norm": 13.791001103102749, + "learning_rate": 2.5943396226415095e-06, + "loss": 1.547, + "step": 220 + }, + { + "epoch": 0.031287605294825514, + "grad_norm": 17.008551746681174, + "learning_rate": 2.6061320754716986e-06, + "loss": 1.7294, + "step": 221 + }, + { + "epoch": 0.03142917816946273, + "grad_norm": 16.712832992078088, + "learning_rate": 2.617924528301887e-06, + "loss": 1.4712, + "step": 222 + }, + { + "epoch": 0.03157075104409995, + "grad_norm": 20.859240402187467, + "learning_rate": 2.6297169811320754e-06, + "loss": 1.5791, + "step": 223 + }, + { + "epoch": 0.03171232391873717, + "grad_norm": 13.946644609870269, + "learning_rate": 2.6415094339622644e-06, + "loss": 1.5184, + "step": 224 + }, + { + "epoch": 0.03185389679337439, + "grad_norm": 16.748550580071193, + "learning_rate": 2.653301886792453e-06, + "loss": 1.4621, + "step": 225 + }, + { + "epoch": 0.03199546966801161, + "grad_norm": 18.75302384471135, + "learning_rate": 2.665094339622642e-06, + "loss": 1.5516, + "step": 226 + }, + { + "epoch": 0.032137042542648826, + "grad_norm": 13.769395500217366, + "learning_rate": 2.6768867924528303e-06, + "loss": 1.578, + "step": 227 + }, + { + "epoch": 0.03227861541728605, + "grad_norm": 15.258649530612034, + "learning_rate": 2.688679245283019e-06, + "loss": 1.474, + "step": 228 + }, + { + "epoch": 0.032420188291923266, + "grad_norm": 17.45684463356843, + "learning_rate": 2.700471698113208e-06, + "loss": 1.5765, + "step": 229 + }, + { + "epoch": 0.03256176116656049, + "grad_norm": 14.429139197588968, + "learning_rate": 2.7122641509433965e-06, + "loss": 1.6272, + "step": 230 + }, + { + "epoch": 0.032703334041197706, + "grad_norm": 11.525824699767284, + "learning_rate": 2.724056603773585e-06, + "loss": 1.496, + "step": 231 + }, + { + "epoch": 0.03284490691583493, + "grad_norm": 13.750744053909694, + "learning_rate": 2.7358490566037738e-06, + "loss": 1.571, + "step": 232 + }, + { + "epoch": 0.032986479790472145, + "grad_norm": 15.490688849758717, + "learning_rate": 2.7476415094339624e-06, + "loss": 1.4308, + "step": 233 + }, + { + "epoch": 0.03312805266510937, + 
"grad_norm": 13.793800642217017, + "learning_rate": 2.7594339622641514e-06, + "loss": 1.4714, + "step": 234 + }, + { + "epoch": 0.033269625539746585, + "grad_norm": 10.697327139167207, + "learning_rate": 2.7712264150943396e-06, + "loss": 1.3354, + "step": 235 + }, + { + "epoch": 0.0334111984143838, + "grad_norm": 12.554985265992823, + "learning_rate": 2.7830188679245286e-06, + "loss": 1.5042, + "step": 236 + }, + { + "epoch": 0.033552771289021024, + "grad_norm": 11.094891900407966, + "learning_rate": 2.7948113207547173e-06, + "loss": 1.439, + "step": 237 + }, + { + "epoch": 0.03369434416365824, + "grad_norm": 9.912370108947599, + "learning_rate": 2.8066037735849063e-06, + "loss": 1.499, + "step": 238 + }, + { + "epoch": 0.033835917038295464, + "grad_norm": 11.257202216608063, + "learning_rate": 2.8183962264150945e-06, + "loss": 1.5, + "step": 239 + }, + { + "epoch": 0.03397748991293268, + "grad_norm": 12.816795559463603, + "learning_rate": 2.830188679245283e-06, + "loss": 1.8102, + "step": 240 + }, + { + "epoch": 0.034119062787569904, + "grad_norm": 14.413740151375256, + "learning_rate": 2.841981132075472e-06, + "loss": 1.4316, + "step": 241 + }, + { + "epoch": 0.03426063566220712, + "grad_norm": 13.237587927219195, + "learning_rate": 2.8537735849056608e-06, + "loss": 1.6965, + "step": 242 + }, + { + "epoch": 0.03440220853684434, + "grad_norm": 10.692188684490473, + "learning_rate": 2.865566037735849e-06, + "loss": 1.6786, + "step": 243 + }, + { + "epoch": 0.03454378141148156, + "grad_norm": 12.375605404619984, + "learning_rate": 2.877358490566038e-06, + "loss": 1.627, + "step": 244 + }, + { + "epoch": 0.03468535428611878, + "grad_norm": 11.62038732096559, + "learning_rate": 2.8891509433962266e-06, + "loss": 1.4265, + "step": 245 + }, + { + "epoch": 0.034826927160756, + "grad_norm": 12.351094902399785, + "learning_rate": 2.9009433962264156e-06, + "loss": 1.5533, + "step": 246 + }, + { + "epoch": 0.034968500035393216, + "grad_norm": 11.119967576123726, + "learning_rate": 2.912735849056604e-06, + "loss": 1.4086, + "step": 247 + }, + { + "epoch": 0.03511007291003044, + "grad_norm": 12.860572805506845, + "learning_rate": 2.9245283018867924e-06, + "loss": 1.417, + "step": 248 + }, + { + "epoch": 0.035251645784667655, + "grad_norm": 13.736791196383779, + "learning_rate": 2.9363207547169815e-06, + "loss": 1.5673, + "step": 249 + }, + { + "epoch": 0.03539321865930488, + "grad_norm": 12.089991416707239, + "learning_rate": 2.94811320754717e-06, + "loss": 1.6234, + "step": 250 + }, + { + "epoch": 0.035534791533942095, + "grad_norm": 12.865588850295339, + "learning_rate": 2.9599056603773587e-06, + "loss": 1.4899, + "step": 251 + }, + { + "epoch": 0.03567636440857932, + "grad_norm": 11.37346328583593, + "learning_rate": 2.9716981132075473e-06, + "loss": 1.3134, + "step": 252 + }, + { + "epoch": 0.035817937283216535, + "grad_norm": 12.952226154353376, + "learning_rate": 2.983490566037736e-06, + "loss": 1.7041, + "step": 253 + }, + { + "epoch": 0.03595951015785376, + "grad_norm": 16.47740892503468, + "learning_rate": 2.995283018867925e-06, + "loss": 1.7105, + "step": 254 + }, + { + "epoch": 0.036101083032490974, + "grad_norm": 12.259772314848023, + "learning_rate": 3.007075471698113e-06, + "loss": 1.5359, + "step": 255 + }, + { + "epoch": 0.0362426559071282, + "grad_norm": 13.15361696113112, + "learning_rate": 3.018867924528302e-06, + "loss": 1.4668, + "step": 256 + }, + { + "epoch": 0.036384228781765414, + "grad_norm": 11.85160884961273, + "learning_rate": 3.030660377358491e-06, + "loss": 1.4925, + 
"step": 257 + }, + { + "epoch": 0.03652580165640263, + "grad_norm": 14.00304434237399, + "learning_rate": 3.04245283018868e-06, + "loss": 1.5922, + "step": 258 + }, + { + "epoch": 0.036667374531039854, + "grad_norm": 15.004935406320534, + "learning_rate": 3.054245283018868e-06, + "loss": 1.6927, + "step": 259 + }, + { + "epoch": 0.03680894740567707, + "grad_norm": 11.863595083517646, + "learning_rate": 3.0660377358490567e-06, + "loss": 1.4076, + "step": 260 + }, + { + "epoch": 0.03695052028031429, + "grad_norm": 11.706887876582863, + "learning_rate": 3.0778301886792457e-06, + "loss": 1.428, + "step": 261 + }, + { + "epoch": 0.03709209315495151, + "grad_norm": 12.609342931921738, + "learning_rate": 3.0896226415094343e-06, + "loss": 1.3543, + "step": 262 + }, + { + "epoch": 0.03723366602958873, + "grad_norm": 15.37357379532275, + "learning_rate": 3.1014150943396225e-06, + "loss": 1.5881, + "step": 263 + }, + { + "epoch": 0.03737523890422595, + "grad_norm": 10.541909638470802, + "learning_rate": 3.1132075471698115e-06, + "loss": 1.4197, + "step": 264 + }, + { + "epoch": 0.03751681177886317, + "grad_norm": 12.994250860281323, + "learning_rate": 3.125e-06, + "loss": 1.4946, + "step": 265 + }, + { + "epoch": 0.03765838465350039, + "grad_norm": 13.65478731261437, + "learning_rate": 3.136792452830189e-06, + "loss": 1.5497, + "step": 266 + }, + { + "epoch": 0.03779995752813761, + "grad_norm": 11.254530608982128, + "learning_rate": 3.148584905660378e-06, + "loss": 1.5299, + "step": 267 + }, + { + "epoch": 0.03794153040277483, + "grad_norm": 11.656297126245217, + "learning_rate": 3.160377358490566e-06, + "loss": 1.5153, + "step": 268 + }, + { + "epoch": 0.038083103277412045, + "grad_norm": 13.755509815814488, + "learning_rate": 3.172169811320755e-06, + "loss": 1.427, + "step": 269 + }, + { + "epoch": 0.03822467615204927, + "grad_norm": 13.587216124600417, + "learning_rate": 3.1839622641509436e-06, + "loss": 1.5298, + "step": 270 + }, + { + "epoch": 0.038366249026686484, + "grad_norm": 12.112294884865117, + "learning_rate": 3.1957547169811327e-06, + "loss": 1.5709, + "step": 271 + }, + { + "epoch": 0.03850782190132371, + "grad_norm": 13.637326325622755, + "learning_rate": 3.207547169811321e-06, + "loss": 1.5438, + "step": 272 + }, + { + "epoch": 0.038649394775960924, + "grad_norm": 17.207797851112897, + "learning_rate": 3.2193396226415095e-06, + "loss": 1.5918, + "step": 273 + }, + { + "epoch": 0.03879096765059815, + "grad_norm": 14.940634681875654, + "learning_rate": 3.2311320754716985e-06, + "loss": 1.5988, + "step": 274 + }, + { + "epoch": 0.038932540525235364, + "grad_norm": 16.91909026024671, + "learning_rate": 3.242924528301887e-06, + "loss": 1.6323, + "step": 275 + }, + { + "epoch": 0.03907411339987259, + "grad_norm": 12.352112846491524, + "learning_rate": 3.2547169811320758e-06, + "loss": 1.579, + "step": 276 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 14.36729636055142, + "learning_rate": 3.2665094339622644e-06, + "loss": 1.6939, + "step": 277 + }, + { + "epoch": 0.03935725914914703, + "grad_norm": 14.536952846001634, + "learning_rate": 3.2783018867924534e-06, + "loss": 1.3646, + "step": 278 + }, + { + "epoch": 0.03949883202378424, + "grad_norm": 13.0292785763344, + "learning_rate": 3.290094339622642e-06, + "loss": 1.6463, + "step": 279 + }, + { + "epoch": 0.03964040489842146, + "grad_norm": 13.19163817071076, + "learning_rate": 3.30188679245283e-06, + "loss": 1.5869, + "step": 280 + }, + { + "epoch": 0.03978197777305868, + "grad_norm": 15.541825008182306, + "learning_rate": 
3.3136792452830192e-06, + "loss": 1.3878, + "step": 281 + }, + { + "epoch": 0.0399235506476959, + "grad_norm": 14.602579260011632, + "learning_rate": 3.325471698113208e-06, + "loss": 1.6486, + "step": 282 + }, + { + "epoch": 0.04006512352233312, + "grad_norm": 12.854613524613935, + "learning_rate": 3.337264150943397e-06, + "loss": 1.4761, + "step": 283 + }, + { + "epoch": 0.04020669639697034, + "grad_norm": 14.591155187043375, + "learning_rate": 3.349056603773585e-06, + "loss": 1.4768, + "step": 284 + }, + { + "epoch": 0.04034826927160756, + "grad_norm": 14.00770574764119, + "learning_rate": 3.3608490566037737e-06, + "loss": 1.4841, + "step": 285 + }, + { + "epoch": 0.04048984214624478, + "grad_norm": 12.684547118273601, + "learning_rate": 3.3726415094339627e-06, + "loss": 1.6088, + "step": 286 + }, + { + "epoch": 0.040631415020882, + "grad_norm": 14.077606360972457, + "learning_rate": 3.3844339622641514e-06, + "loss": 1.4525, + "step": 287 + }, + { + "epoch": 0.04077298789551922, + "grad_norm": 12.37692030062122, + "learning_rate": 3.3962264150943395e-06, + "loss": 1.461, + "step": 288 + }, + { + "epoch": 0.04091456077015644, + "grad_norm": 12.70991347864987, + "learning_rate": 3.4080188679245286e-06, + "loss": 1.6119, + "step": 289 + }, + { + "epoch": 0.04105613364479366, + "grad_norm": 14.625094421603585, + "learning_rate": 3.419811320754717e-06, + "loss": 1.5845, + "step": 290 + }, + { + "epoch": 0.041197706519430874, + "grad_norm": 12.650931439391318, + "learning_rate": 3.4316037735849062e-06, + "loss": 1.511, + "step": 291 + }, + { + "epoch": 0.0413392793940681, + "grad_norm": 17.681685391698824, + "learning_rate": 3.4433962264150944e-06, + "loss": 1.6627, + "step": 292 + }, + { + "epoch": 0.041480852268705314, + "grad_norm": 13.852901972742254, + "learning_rate": 3.455188679245283e-06, + "loss": 1.4634, + "step": 293 + }, + { + "epoch": 0.04162242514334254, + "grad_norm": 11.235050356199906, + "learning_rate": 3.466981132075472e-06, + "loss": 1.5729, + "step": 294 + }, + { + "epoch": 0.04176399801797975, + "grad_norm": 15.061645483573551, + "learning_rate": 3.4787735849056607e-06, + "loss": 1.44, + "step": 295 + }, + { + "epoch": 0.041905570892616976, + "grad_norm": 13.213340861000459, + "learning_rate": 3.4905660377358493e-06, + "loss": 1.5443, + "step": 296 + }, + { + "epoch": 0.04204714376725419, + "grad_norm": 12.829084873878257, + "learning_rate": 3.502358490566038e-06, + "loss": 1.51, + "step": 297 + }, + { + "epoch": 0.042188716641891416, + "grad_norm": 11.839341764968793, + "learning_rate": 3.514150943396227e-06, + "loss": 1.5636, + "step": 298 + }, + { + "epoch": 0.04233028951652863, + "grad_norm": 12.463975188126554, + "learning_rate": 3.5259433962264156e-06, + "loss": 1.5464, + "step": 299 + }, + { + "epoch": 0.042471862391165856, + "grad_norm": 12.505469994572625, + "learning_rate": 3.5377358490566038e-06, + "loss": 1.6278, + "step": 300 + }, + { + "epoch": 0.04261343526580307, + "grad_norm": 11.95865948771701, + "learning_rate": 3.549528301886793e-06, + "loss": 1.4534, + "step": 301 + }, + { + "epoch": 0.04275500814044029, + "grad_norm": 13.1569510882452, + "learning_rate": 3.5613207547169814e-06, + "loss": 1.4099, + "step": 302 + }, + { + "epoch": 0.04289658101507751, + "grad_norm": 12.407375947607674, + "learning_rate": 3.5731132075471705e-06, + "loss": 1.5631, + "step": 303 + }, + { + "epoch": 0.04303815388971473, + "grad_norm": 11.495179866198267, + "learning_rate": 3.5849056603773586e-06, + "loss": 1.4924, + "step": 304 + }, + { + "epoch": 0.04317972676435195, + 
"grad_norm": 10.190325198988146, + "learning_rate": 3.5966981132075473e-06, + "loss": 1.3476, + "step": 305 + }, + { + "epoch": 0.04332129963898917, + "grad_norm": 11.96765291051355, + "learning_rate": 3.6084905660377363e-06, + "loss": 1.5363, + "step": 306 + }, + { + "epoch": 0.04346287251362639, + "grad_norm": 15.508449097450175, + "learning_rate": 3.620283018867925e-06, + "loss": 1.3793, + "step": 307 + }, + { + "epoch": 0.04360444538826361, + "grad_norm": 12.019986356959162, + "learning_rate": 3.632075471698113e-06, + "loss": 1.4787, + "step": 308 + }, + { + "epoch": 0.04374601826290083, + "grad_norm": 11.371062886423248, + "learning_rate": 3.643867924528302e-06, + "loss": 1.675, + "step": 309 + }, + { + "epoch": 0.04388759113753805, + "grad_norm": 18.77545310132771, + "learning_rate": 3.6556603773584908e-06, + "loss": 1.6515, + "step": 310 + }, + { + "epoch": 0.04402916401217527, + "grad_norm": 10.37820311588431, + "learning_rate": 3.66745283018868e-06, + "loss": 1.4924, + "step": 311 + }, + { + "epoch": 0.04417073688681249, + "grad_norm": 13.989321859855567, + "learning_rate": 3.679245283018868e-06, + "loss": 1.6333, + "step": 312 + }, + { + "epoch": 0.0443123097614497, + "grad_norm": 18.14246111374286, + "learning_rate": 3.6910377358490566e-06, + "loss": 1.4851, + "step": 313 + }, + { + "epoch": 0.044453882636086926, + "grad_norm": 12.43205655848772, + "learning_rate": 3.7028301886792456e-06, + "loss": 1.554, + "step": 314 + }, + { + "epoch": 0.04459545551072414, + "grad_norm": 14.370121136223124, + "learning_rate": 3.7146226415094343e-06, + "loss": 1.6656, + "step": 315 + }, + { + "epoch": 0.044737028385361366, + "grad_norm": 13.226365224828282, + "learning_rate": 3.726415094339623e-06, + "loss": 1.458, + "step": 316 + }, + { + "epoch": 0.04487860125999858, + "grad_norm": 12.40553231649751, + "learning_rate": 3.7382075471698115e-06, + "loss": 1.5017, + "step": 317 + }, + { + "epoch": 0.045020174134635806, + "grad_norm": 10.67209375919637, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.4327, + "step": 318 + }, + { + "epoch": 0.04516174700927302, + "grad_norm": 10.722152481335444, + "learning_rate": 3.761792452830189e-06, + "loss": 1.4864, + "step": 319 + }, + { + "epoch": 0.045303319883910245, + "grad_norm": 10.303894521469555, + "learning_rate": 3.7735849056603777e-06, + "loss": 1.4328, + "step": 320 + }, + { + "epoch": 0.04544489275854746, + "grad_norm": 12.898338128024236, + "learning_rate": 3.7853773584905664e-06, + "loss": 1.6387, + "step": 321 + }, + { + "epoch": 0.045586465633184685, + "grad_norm": 12.044024772261295, + "learning_rate": 3.797169811320755e-06, + "loss": 1.3398, + "step": 322 + }, + { + "epoch": 0.0457280385078219, + "grad_norm": 13.523557101579728, + "learning_rate": 3.808962264150944e-06, + "loss": 1.5648, + "step": 323 + }, + { + "epoch": 0.04586961138245912, + "grad_norm": 15.072970095181454, + "learning_rate": 3.820754716981133e-06, + "loss": 1.5107, + "step": 324 + }, + { + "epoch": 0.04601118425709634, + "grad_norm": 12.612250509526008, + "learning_rate": 3.832547169811321e-06, + "loss": 1.4006, + "step": 325 + }, + { + "epoch": 0.04615275713173356, + "grad_norm": 11.13828470225099, + "learning_rate": 3.8443396226415094e-06, + "loss": 1.3893, + "step": 326 + }, + { + "epoch": 0.04629433000637078, + "grad_norm": 13.189801779089771, + "learning_rate": 3.856132075471699e-06, + "loss": 1.3141, + "step": 327 + }, + { + "epoch": 0.046435902881008, + "grad_norm": 18.802420212817683, + "learning_rate": 3.8679245283018875e-06, + "loss": 1.7083, + "step": 
328 + }, + { + "epoch": 0.04657747575564522, + "grad_norm": 13.115984626949476, + "learning_rate": 3.879716981132075e-06, + "loss": 1.422, + "step": 329 + }, + { + "epoch": 0.046719048630282436, + "grad_norm": 14.232245377019035, + "learning_rate": 3.891509433962265e-06, + "loss": 1.6212, + "step": 330 + }, + { + "epoch": 0.04686062150491966, + "grad_norm": 12.930792650088017, + "learning_rate": 3.903301886792453e-06, + "loss": 1.5937, + "step": 331 + }, + { + "epoch": 0.047002194379556876, + "grad_norm": 11.525853825052804, + "learning_rate": 3.915094339622642e-06, + "loss": 1.4803, + "step": 332 + }, + { + "epoch": 0.0471437672541941, + "grad_norm": 15.47951951101817, + "learning_rate": 3.926886792452831e-06, + "loss": 1.6232, + "step": 333 + }, + { + "epoch": 0.047285340128831316, + "grad_norm": 9.06063634191386, + "learning_rate": 3.938679245283019e-06, + "loss": 1.3336, + "step": 334 + }, + { + "epoch": 0.04742691300346853, + "grad_norm": 18.19059637919712, + "learning_rate": 3.950471698113208e-06, + "loss": 1.5712, + "step": 335 + }, + { + "epoch": 0.047568485878105755, + "grad_norm": 15.011710618329829, + "learning_rate": 3.962264150943396e-06, + "loss": 1.6409, + "step": 336 + }, + { + "epoch": 0.04771005875274297, + "grad_norm": 13.490860282325459, + "learning_rate": 3.974056603773585e-06, + "loss": 1.4341, + "step": 337 + }, + { + "epoch": 0.047851631627380195, + "grad_norm": 18.730439829878613, + "learning_rate": 3.985849056603774e-06, + "loss": 1.4, + "step": 338 + }, + { + "epoch": 0.04799320450201741, + "grad_norm": 13.530274530009429, + "learning_rate": 3.997641509433962e-06, + "loss": 1.5709, + "step": 339 + }, + { + "epoch": 0.048134777376654635, + "grad_norm": 14.513655973489136, + "learning_rate": 4.009433962264152e-06, + "loss": 1.5607, + "step": 340 + }, + { + "epoch": 0.04827635025129185, + "grad_norm": 18.01383948762555, + "learning_rate": 4.0212264150943395e-06, + "loss": 1.6144, + "step": 341 + }, + { + "epoch": 0.048417923125929074, + "grad_norm": 11.15823030393373, + "learning_rate": 4.033018867924529e-06, + "loss": 1.477, + "step": 342 + }, + { + "epoch": 0.04855949600056629, + "grad_norm": 13.235963692277236, + "learning_rate": 4.0448113207547176e-06, + "loss": 1.4943, + "step": 343 + }, + { + "epoch": 0.048701068875203514, + "grad_norm": 16.775677940438893, + "learning_rate": 4.056603773584906e-06, + "loss": 1.583, + "step": 344 + }, + { + "epoch": 0.04884264174984073, + "grad_norm": 13.915051090477029, + "learning_rate": 4.068396226415095e-06, + "loss": 1.4982, + "step": 345 + }, + { + "epoch": 0.04898421462447795, + "grad_norm": 11.261778366560899, + "learning_rate": 4.080188679245283e-06, + "loss": 1.5752, + "step": 346 + }, + { + "epoch": 0.04912578749911517, + "grad_norm": 11.391966522296192, + "learning_rate": 4.091981132075472e-06, + "loss": 1.4978, + "step": 347 + }, + { + "epoch": 0.049267360373752386, + "grad_norm": 13.380136804722362, + "learning_rate": 4.103773584905661e-06, + "loss": 1.5026, + "step": 348 + }, + { + "epoch": 0.04940893324838961, + "grad_norm": 16.3502494769647, + "learning_rate": 4.115566037735849e-06, + "loss": 1.4355, + "step": 349 + }, + { + "epoch": 0.049550506123026826, + "grad_norm": 11.965934721115605, + "learning_rate": 4.127358490566038e-06, + "loss": 1.4808, + "step": 350 + }, + { + "epoch": 0.04969207899766405, + "grad_norm": 10.20245985401334, + "learning_rate": 4.1391509433962265e-06, + "loss": 1.3701, + "step": 351 + }, + { + "epoch": 0.049833651872301266, + "grad_norm": 15.36983446636422, + "learning_rate": 
4.150943396226416e-06, + "loss": 1.7683, + "step": 352 + }, + { + "epoch": 0.04997522474693849, + "grad_norm": 14.325277563471774, + "learning_rate": 4.162735849056604e-06, + "loss": 1.5218, + "step": 353 + }, + { + "epoch": 0.050116797621575705, + "grad_norm": 11.758763068925779, + "learning_rate": 4.174528301886792e-06, + "loss": 1.5006, + "step": 354 + }, + { + "epoch": 0.05025837049621293, + "grad_norm": 10.218577905541508, + "learning_rate": 4.186320754716982e-06, + "loss": 1.3826, + "step": 355 + }, + { + "epoch": 0.050399943370850145, + "grad_norm": 12.837478628762943, + "learning_rate": 4.19811320754717e-06, + "loss": 1.6801, + "step": 356 + }, + { + "epoch": 0.05054151624548736, + "grad_norm": 12.981278657399459, + "learning_rate": 4.209905660377359e-06, + "loss": 1.5594, + "step": 357 + }, + { + "epoch": 0.050683089120124585, + "grad_norm": 12.257283439427072, + "learning_rate": 4.221698113207548e-06, + "loss": 1.5344, + "step": 358 + }, + { + "epoch": 0.0508246619947618, + "grad_norm": 10.046586051808182, + "learning_rate": 4.233490566037736e-06, + "loss": 1.4747, + "step": 359 + }, + { + "epoch": 0.050966234869399024, + "grad_norm": 12.50178881243569, + "learning_rate": 4.245283018867925e-06, + "loss": 1.6338, + "step": 360 + }, + { + "epoch": 0.05110780774403624, + "grad_norm": 14.844289155504434, + "learning_rate": 4.2570754716981135e-06, + "loss": 1.474, + "step": 361 + }, + { + "epoch": 0.051249380618673464, + "grad_norm": 11.063001373481269, + "learning_rate": 4.268867924528302e-06, + "loss": 1.5917, + "step": 362 + }, + { + "epoch": 0.05139095349331068, + "grad_norm": 11.631744069260355, + "learning_rate": 4.280660377358491e-06, + "loss": 1.4944, + "step": 363 + }, + { + "epoch": 0.0515325263679479, + "grad_norm": 11.792182039426654, + "learning_rate": 4.29245283018868e-06, + "loss": 1.483, + "step": 364 + }, + { + "epoch": 0.05167409924258512, + "grad_norm": 10.048679883806955, + "learning_rate": 4.304245283018868e-06, + "loss": 1.3792, + "step": 365 + }, + { + "epoch": 0.05181567211722234, + "grad_norm": 16.281829179357622, + "learning_rate": 4.3160377358490565e-06, + "loss": 1.6723, + "step": 366 + }, + { + "epoch": 0.05195724499185956, + "grad_norm": 12.927769046684508, + "learning_rate": 4.327830188679246e-06, + "loss": 1.6836, + "step": 367 + }, + { + "epoch": 0.052098817866496776, + "grad_norm": 17.67335575554652, + "learning_rate": 4.339622641509435e-06, + "loss": 1.4297, + "step": 368 + }, + { + "epoch": 0.052240390741134, + "grad_norm": 13.404726453164931, + "learning_rate": 4.351415094339622e-06, + "loss": 1.4923, + "step": 369 + }, + { + "epoch": 0.052381963615771215, + "grad_norm": 10.458356474791275, + "learning_rate": 4.363207547169812e-06, + "loss": 1.5893, + "step": 370 + }, + { + "epoch": 0.05252353649040844, + "grad_norm": 12.813243118693643, + "learning_rate": 4.3750000000000005e-06, + "loss": 1.4019, + "step": 371 + }, + { + "epoch": 0.052665109365045655, + "grad_norm": 14.204267499943844, + "learning_rate": 4.386792452830189e-06, + "loss": 1.4609, + "step": 372 + }, + { + "epoch": 0.05280668223968288, + "grad_norm": 11.568275452643846, + "learning_rate": 4.398584905660378e-06, + "loss": 1.5952, + "step": 373 + }, + { + "epoch": 0.052948255114320095, + "grad_norm": 10.301470782998356, + "learning_rate": 4.410377358490566e-06, + "loss": 1.5027, + "step": 374 + }, + { + "epoch": 0.05308982798895732, + "grad_norm": 9.724272210902896, + "learning_rate": 4.422169811320755e-06, + "loss": 1.2069, + "step": 375 + }, + { + "epoch": 0.053231400863594534, + 
"grad_norm": 10.561957892777674, + "learning_rate": 4.4339622641509435e-06, + "loss": 1.3286, + "step": 376 + }, + { + "epoch": 0.05337297373823176, + "grad_norm": 11.343865680367463, + "learning_rate": 4.445754716981133e-06, + "loss": 1.4346, + "step": 377 + }, + { + "epoch": 0.053514546612868974, + "grad_norm": 12.051240241461535, + "learning_rate": 4.457547169811321e-06, + "loss": 1.4667, + "step": 378 + }, + { + "epoch": 0.0536561194875062, + "grad_norm": 9.953691068209203, + "learning_rate": 4.469339622641509e-06, + "loss": 1.558, + "step": 379 + }, + { + "epoch": 0.053797692362143414, + "grad_norm": 11.5291451701543, + "learning_rate": 4.481132075471699e-06, + "loss": 1.3692, + "step": 380 + }, + { + "epoch": 0.05393926523678063, + "grad_norm": 10.910131425074693, + "learning_rate": 4.4929245283018875e-06, + "loss": 1.3542, + "step": 381 + }, + { + "epoch": 0.05408083811141785, + "grad_norm": 12.734940554926041, + "learning_rate": 4.504716981132076e-06, + "loss": 1.5591, + "step": 382 + }, + { + "epoch": 0.05422241098605507, + "grad_norm": 10.05784319105054, + "learning_rate": 4.516509433962265e-06, + "loss": 1.4132, + "step": 383 + }, + { + "epoch": 0.05436398386069229, + "grad_norm": 15.70995605074091, + "learning_rate": 4.528301886792453e-06, + "loss": 1.5423, + "step": 384 + }, + { + "epoch": 0.05450555673532951, + "grad_norm": 12.757097849128012, + "learning_rate": 4.540094339622642e-06, + "loss": 1.7114, + "step": 385 + }, + { + "epoch": 0.05464712960996673, + "grad_norm": 15.60328080328531, + "learning_rate": 4.5518867924528305e-06, + "loss": 1.6, + "step": 386 + }, + { + "epoch": 0.05478870248460395, + "grad_norm": 12.499285257886061, + "learning_rate": 4.563679245283019e-06, + "loss": 1.6051, + "step": 387 + }, + { + "epoch": 0.05493027535924117, + "grad_norm": 11.840270323889293, + "learning_rate": 4.575471698113208e-06, + "loss": 1.4581, + "step": 388 + }, + { + "epoch": 0.05507184823387839, + "grad_norm": 13.803985042434586, + "learning_rate": 4.587264150943397e-06, + "loss": 1.5296, + "step": 389 + }, + { + "epoch": 0.05521342110851561, + "grad_norm": 11.070902074433647, + "learning_rate": 4.599056603773585e-06, + "loss": 1.3439, + "step": 390 + }, + { + "epoch": 0.05535499398315283, + "grad_norm": 9.05115214227112, + "learning_rate": 4.610849056603774e-06, + "loss": 1.3973, + "step": 391 + }, + { + "epoch": 0.055496566857790045, + "grad_norm": 11.169374583604366, + "learning_rate": 4.622641509433963e-06, + "loss": 1.5362, + "step": 392 + }, + { + "epoch": 0.05563813973242727, + "grad_norm": 13.861730880658898, + "learning_rate": 4.634433962264152e-06, + "loss": 1.4226, + "step": 393 + }, + { + "epoch": 0.055779712607064484, + "grad_norm": 11.698157036548869, + "learning_rate": 4.6462264150943394e-06, + "loss": 1.3848, + "step": 394 + }, + { + "epoch": 0.05592128548170171, + "grad_norm": 12.369582714792198, + "learning_rate": 4.658018867924529e-06, + "loss": 1.6363, + "step": 395 + }, + { + "epoch": 0.056062858356338924, + "grad_norm": 12.205755558342139, + "learning_rate": 4.6698113207547175e-06, + "loss": 1.3974, + "step": 396 + }, + { + "epoch": 0.05620443123097615, + "grad_norm": 11.182746772524984, + "learning_rate": 4.681603773584906e-06, + "loss": 1.449, + "step": 397 + }, + { + "epoch": 0.05634600410561336, + "grad_norm": 12.037298094263358, + "learning_rate": 4.693396226415095e-06, + "loss": 1.6078, + "step": 398 + }, + { + "epoch": 0.05648757698025059, + "grad_norm": 13.599835316278018, + "learning_rate": 4.705188679245283e-06, + "loss": 1.4239, + "step": 399 + 
}, + { + "epoch": 0.0566291498548878, + "grad_norm": 10.903349307577322, + "learning_rate": 4.716981132075472e-06, + "loss": 1.3857, + "step": 400 + }, + { + "epoch": 0.056770722729525026, + "grad_norm": 11.681769776758465, + "learning_rate": 4.728773584905661e-06, + "loss": 1.6518, + "step": 401 + }, + { + "epoch": 0.05691229560416224, + "grad_norm": 14.112844993084424, + "learning_rate": 4.740566037735849e-06, + "loss": 1.4499, + "step": 402 + }, + { + "epoch": 0.05705386847879946, + "grad_norm": 10.543848307026332, + "learning_rate": 4.752358490566038e-06, + "loss": 1.5575, + "step": 403 + }, + { + "epoch": 0.05719544135343668, + "grad_norm": 11.887331693242022, + "learning_rate": 4.764150943396227e-06, + "loss": 1.4608, + "step": 404 + }, + { + "epoch": 0.0573370142280739, + "grad_norm": 15.629967472093822, + "learning_rate": 4.775943396226416e-06, + "loss": 1.6656, + "step": 405 + }, + { + "epoch": 0.05747858710271112, + "grad_norm": 13.111115004831658, + "learning_rate": 4.787735849056604e-06, + "loss": 1.5731, + "step": 406 + }, + { + "epoch": 0.05762015997734834, + "grad_norm": 16.26221558588247, + "learning_rate": 4.799528301886793e-06, + "loss": 1.5037, + "step": 407 + }, + { + "epoch": 0.05776173285198556, + "grad_norm": 10.063603879668307, + "learning_rate": 4.811320754716982e-06, + "loss": 1.331, + "step": 408 + }, + { + "epoch": 0.05790330572662278, + "grad_norm": 12.313716925109118, + "learning_rate": 4.82311320754717e-06, + "loss": 1.4243, + "step": 409 + }, + { + "epoch": 0.05804487860126, + "grad_norm": 14.280535189646583, + "learning_rate": 4.834905660377359e-06, + "loss": 1.5115, + "step": 410 + }, + { + "epoch": 0.05818645147589722, + "grad_norm": 13.245199233472292, + "learning_rate": 4.8466981132075476e-06, + "loss": 1.5789, + "step": 411 + }, + { + "epoch": 0.05832802435053444, + "grad_norm": 11.715478058852781, + "learning_rate": 4.858490566037736e-06, + "loss": 1.6146, + "step": 412 + }, + { + "epoch": 0.05846959722517166, + "grad_norm": 13.590734107798122, + "learning_rate": 4.870283018867925e-06, + "loss": 1.4784, + "step": 413 + }, + { + "epoch": 0.058611170099808874, + "grad_norm": 11.480304603921802, + "learning_rate": 4.882075471698113e-06, + "loss": 1.5629, + "step": 414 + }, + { + "epoch": 0.0587527429744461, + "grad_norm": 13.021725620468496, + "learning_rate": 4.893867924528302e-06, + "loss": 1.5649, + "step": 415 + }, + { + "epoch": 0.05889431584908331, + "grad_norm": 15.366355885130233, + "learning_rate": 4.905660377358491e-06, + "loss": 1.4631, + "step": 416 + }, + { + "epoch": 0.05903588872372054, + "grad_norm": 9.805928939708133, + "learning_rate": 4.91745283018868e-06, + "loss": 1.4948, + "step": 417 + }, + { + "epoch": 0.05917746159835775, + "grad_norm": 10.019636329858344, + "learning_rate": 4.929245283018868e-06, + "loss": 1.2915, + "step": 418 + }, + { + "epoch": 0.059319034472994976, + "grad_norm": 15.484086758637831, + "learning_rate": 4.9410377358490565e-06, + "loss": 1.6348, + "step": 419 + }, + { + "epoch": 0.05946060734763219, + "grad_norm": 10.959424524165442, + "learning_rate": 4.952830188679246e-06, + "loss": 1.5184, + "step": 420 + }, + { + "epoch": 0.059602180222269416, + "grad_norm": 13.137251304065545, + "learning_rate": 4.9646226415094346e-06, + "loss": 1.5784, + "step": 421 + }, + { + "epoch": 0.05974375309690663, + "grad_norm": 14.124406920278435, + "learning_rate": 4.976415094339623e-06, + "loss": 1.5202, + "step": 422 + }, + { + "epoch": 0.059885325971543855, + "grad_norm": 11.937542181290004, + "learning_rate": 
4.988207547169812e-06, + "loss": 1.5244, + "step": 423 + }, + { + "epoch": 0.06002689884618107, + "grad_norm": 9.742690256173312, + "learning_rate": 5e-06, + "loss": 1.2642, + "step": 424 + }, + { + "epoch": 0.06016847172081829, + "grad_norm": 9.822308928431587, + "learning_rate": 4.999999934288433e-06, + "loss": 1.4309, + "step": 425 + }, + { + "epoch": 0.06031004459545551, + "grad_norm": 11.332820955549721, + "learning_rate": 4.999999737153732e-06, + "loss": 1.4631, + "step": 426 + }, + { + "epoch": 0.06045161747009273, + "grad_norm": 11.556825967385235, + "learning_rate": 4.999999408595909e-06, + "loss": 1.4601, + "step": 427 + }, + { + "epoch": 0.06059319034472995, + "grad_norm": 14.063105998227519, + "learning_rate": 4.999998948614983e-06, + "loss": 1.4723, + "step": 428 + }, + { + "epoch": 0.06073476321936717, + "grad_norm": 12.153318703859375, + "learning_rate": 4.999998357210974e-06, + "loss": 1.8034, + "step": 429 + }, + { + "epoch": 0.06087633609400439, + "grad_norm": 12.941784135555968, + "learning_rate": 4.999997634383916e-06, + "loss": 1.3495, + "step": 430 + }, + { + "epoch": 0.06101790896864161, + "grad_norm": 13.802442937041016, + "learning_rate": 4.9999967801338475e-06, + "loss": 1.4215, + "step": 431 + }, + { + "epoch": 0.06115948184327883, + "grad_norm": 10.468728865417098, + "learning_rate": 4.9999957944608115e-06, + "loss": 1.4304, + "step": 432 + }, + { + "epoch": 0.06130105471791605, + "grad_norm": 13.090205693854543, + "learning_rate": 4.999994677364861e-06, + "loss": 1.5751, + "step": 433 + }, + { + "epoch": 0.06144262759255327, + "grad_norm": 10.844246361005618, + "learning_rate": 4.999993428846054e-06, + "loss": 1.438, + "step": 434 + }, + { + "epoch": 0.061584200467190486, + "grad_norm": 10.791189018686111, + "learning_rate": 4.999992048904457e-06, + "loss": 1.3169, + "step": 435 + }, + { + "epoch": 0.0617257733418277, + "grad_norm": 10.443122491527468, + "learning_rate": 4.999990537540142e-06, + "loss": 1.4441, + "step": 436 + }, + { + "epoch": 0.061867346216464926, + "grad_norm": 10.822367669219775, + "learning_rate": 4.999988894753189e-06, + "loss": 1.4573, + "step": 437 + }, + { + "epoch": 0.06200891909110214, + "grad_norm": 12.175743027089506, + "learning_rate": 4.999987120543682e-06, + "loss": 1.5634, + "step": 438 + }, + { + "epoch": 0.062150491965739366, + "grad_norm": 10.986843217091664, + "learning_rate": 4.999985214911718e-06, + "loss": 1.6051, + "step": 439 + }, + { + "epoch": 0.06229206484037658, + "grad_norm": 13.378633094839254, + "learning_rate": 4.9999831778573945e-06, + "loss": 1.3025, + "step": 440 + }, + { + "epoch": 0.062433637715013805, + "grad_norm": 11.404716827609914, + "learning_rate": 4.99998100938082e-06, + "loss": 1.5891, + "step": 441 + }, + { + "epoch": 0.06257521058965103, + "grad_norm": 10.475412918728841, + "learning_rate": 4.999978709482108e-06, + "loss": 1.6087, + "step": 442 + }, + { + "epoch": 0.06271678346428824, + "grad_norm": 10.405053055576193, + "learning_rate": 4.999976278161378e-06, + "loss": 1.601, + "step": 443 + }, + { + "epoch": 0.06285835633892546, + "grad_norm": 10.537284774663883, + "learning_rate": 4.9999737154187596e-06, + "loss": 1.5019, + "step": 444 + }, + { + "epoch": 0.06299992921356268, + "grad_norm": 11.446071060613304, + "learning_rate": 4.999971021254387e-06, + "loss": 1.4298, + "step": 445 + }, + { + "epoch": 0.0631415020881999, + "grad_norm": 12.062722633955818, + "learning_rate": 4.9999681956684025e-06, + "loss": 1.53, + "step": 446 + }, + { + "epoch": 0.06328307496283712, + "grad_norm": 
11.428702217443588, + "learning_rate": 4.999965238660954e-06, + "loss": 1.5598, + "step": 447 + }, + { + "epoch": 0.06342464783747434, + "grad_norm": 11.700069094722862, + "learning_rate": 4.999962150232197e-06, + "loss": 1.6052, + "step": 448 + }, + { + "epoch": 0.06356622071211156, + "grad_norm": 11.728143370705704, + "learning_rate": 4.999958930382293e-06, + "loss": 1.5422, + "step": 449 + }, + { + "epoch": 0.06370779358674877, + "grad_norm": 9.764397188439913, + "learning_rate": 4.999955579111413e-06, + "loss": 1.5556, + "step": 450 + }, + { + "epoch": 0.063849366461386, + "grad_norm": 12.1334758903876, + "learning_rate": 4.999952096419731e-06, + "loss": 1.4828, + "step": 451 + }, + { + "epoch": 0.06399093933602322, + "grad_norm": 12.090117626138596, + "learning_rate": 4.999948482307433e-06, + "loss": 1.5335, + "step": 452 + }, + { + "epoch": 0.06413251221066044, + "grad_norm": 11.912947565659325, + "learning_rate": 4.999944736774706e-06, + "loss": 1.5197, + "step": 453 + }, + { + "epoch": 0.06427408508529765, + "grad_norm": 11.46458754505041, + "learning_rate": 4.999940859821749e-06, + "loss": 1.229, + "step": 454 + }, + { + "epoch": 0.06441565795993488, + "grad_norm": 12.99614423645138, + "learning_rate": 4.999936851448764e-06, + "loss": 1.4235, + "step": 455 + }, + { + "epoch": 0.0645572308345721, + "grad_norm": 13.802513689733237, + "learning_rate": 4.9999327116559634e-06, + "loss": 1.5288, + "step": 456 + }, + { + "epoch": 0.06469880370920932, + "grad_norm": 11.233554942796694, + "learning_rate": 4.999928440443565e-06, + "loss": 1.4566, + "step": 457 + }, + { + "epoch": 0.06484037658384653, + "grad_norm": 10.988725259314979, + "learning_rate": 4.999924037811792e-06, + "loss": 1.5973, + "step": 458 + }, + { + "epoch": 0.06498194945848375, + "grad_norm": 12.718898762950648, + "learning_rate": 4.9999195037608765e-06, + "loss": 1.2781, + "step": 459 + }, + { + "epoch": 0.06512352233312098, + "grad_norm": 12.0159862845961, + "learning_rate": 4.999914838291056e-06, + "loss": 1.5239, + "step": 460 + }, + { + "epoch": 0.0652650952077582, + "grad_norm": 13.805173285701349, + "learning_rate": 4.999910041402577e-06, + "loss": 1.5116, + "step": 461 + }, + { + "epoch": 0.06540666808239541, + "grad_norm": 16.44639248235289, + "learning_rate": 4.999905113095691e-06, + "loss": 1.6311, + "step": 462 + }, + { + "epoch": 0.06554824095703263, + "grad_norm": 13.603456596470998, + "learning_rate": 4.999900053370657e-06, + "loss": 1.5138, + "step": 463 + }, + { + "epoch": 0.06568981383166986, + "grad_norm": 16.035405087137455, + "learning_rate": 4.999894862227741e-06, + "loss": 1.563, + "step": 464 + }, + { + "epoch": 0.06583138670630707, + "grad_norm": 11.927684323468345, + "learning_rate": 4.999889539667217e-06, + "loss": 1.5195, + "step": 465 + }, + { + "epoch": 0.06597295958094429, + "grad_norm": 15.219609731715929, + "learning_rate": 4.999884085689363e-06, + "loss": 1.6859, + "step": 466 + }, + { + "epoch": 0.0661145324555815, + "grad_norm": 13.508590790548228, + "learning_rate": 4.9998785002944665e-06, + "loss": 1.6788, + "step": 467 + }, + { + "epoch": 0.06625610533021874, + "grad_norm": 14.311418384178145, + "learning_rate": 4.999872783482822e-06, + "loss": 1.8242, + "step": 468 + }, + { + "epoch": 0.06639767820485595, + "grad_norm": 14.750245690320032, + "learning_rate": 4.999866935254729e-06, + "loss": 1.5594, + "step": 469 + }, + { + "epoch": 0.06653925107949317, + "grad_norm": 11.51696786077198, + "learning_rate": 4.999860955610495e-06, + "loss": 1.6254, + "step": 470 + }, + { + "epoch": 
0.06668082395413039, + "grad_norm": 13.51714225587735, + "learning_rate": 4.9998548445504345e-06, + "loss": 1.6437, + "step": 471 + }, + { + "epoch": 0.0668223968287676, + "grad_norm": 12.538607371023605, + "learning_rate": 4.999848602074869e-06, + "loss": 1.608, + "step": 472 + }, + { + "epoch": 0.06696396970340483, + "grad_norm": 11.632870870614132, + "learning_rate": 4.999842228184127e-06, + "loss": 1.5766, + "step": 473 + }, + { + "epoch": 0.06710554257804205, + "grad_norm": 14.677573627903866, + "learning_rate": 4.999835722878542e-06, + "loss": 1.4659, + "step": 474 + }, + { + "epoch": 0.06724711545267927, + "grad_norm": 12.649120291175391, + "learning_rate": 4.999829086158458e-06, + "loss": 1.7262, + "step": 475 + }, + { + "epoch": 0.06738868832731648, + "grad_norm": 10.81804818929813, + "learning_rate": 4.999822318024222e-06, + "loss": 1.5644, + "step": 476 + }, + { + "epoch": 0.06753026120195371, + "grad_norm": 15.7947715317208, + "learning_rate": 4.999815418476191e-06, + "loss": 1.488, + "step": 477 + }, + { + "epoch": 0.06767183407659093, + "grad_norm": 14.375597070650503, + "learning_rate": 4.9998083875147275e-06, + "loss": 1.7659, + "step": 478 + }, + { + "epoch": 0.06781340695122814, + "grad_norm": 10.219952713769262, + "learning_rate": 4.9998012251402005e-06, + "loss": 1.4859, + "step": 479 + }, + { + "epoch": 0.06795497982586536, + "grad_norm": 13.255501234894185, + "learning_rate": 4.9997939313529875e-06, + "loss": 1.4737, + "step": 480 + }, + { + "epoch": 0.06809655270050258, + "grad_norm": 16.134297374159573, + "learning_rate": 4.999786506153471e-06, + "loss": 1.5892, + "step": 481 + }, + { + "epoch": 0.06823812557513981, + "grad_norm": 12.068032374646158, + "learning_rate": 4.999778949542042e-06, + "loss": 1.305, + "step": 482 + }, + { + "epoch": 0.06837969844977702, + "grad_norm": 11.761297111339843, + "learning_rate": 4.999771261519099e-06, + "loss": 1.569, + "step": 483 + }, + { + "epoch": 0.06852127132441424, + "grad_norm": 18.301492031954748, + "learning_rate": 4.999763442085043e-06, + "loss": 1.4771, + "step": 484 + }, + { + "epoch": 0.06866284419905146, + "grad_norm": 13.788722220482681, + "learning_rate": 4.999755491240287e-06, + "loss": 1.4208, + "step": 485 + }, + { + "epoch": 0.06880441707368869, + "grad_norm": 8.969656762197795, + "learning_rate": 4.999747408985249e-06, + "loss": 1.5328, + "step": 486 + }, + { + "epoch": 0.0689459899483259, + "grad_norm": 9.461358436651519, + "learning_rate": 4.9997391953203535e-06, + "loss": 1.4762, + "step": 487 + }, + { + "epoch": 0.06908756282296312, + "grad_norm": 11.80433849169713, + "learning_rate": 4.999730850246032e-06, + "loss": 1.5599, + "step": 488 + }, + { + "epoch": 0.06922913569760034, + "grad_norm": 16.4909384149201, + "learning_rate": 4.999722373762725e-06, + "loss": 1.5207, + "step": 489 + }, + { + "epoch": 0.06937070857223757, + "grad_norm": 12.207963714385697, + "learning_rate": 4.999713765870875e-06, + "loss": 1.4793, + "step": 490 + }, + { + "epoch": 0.06951228144687478, + "grad_norm": 11.416758590291046, + "learning_rate": 4.999705026570937e-06, + "loss": 1.4224, + "step": 491 + }, + { + "epoch": 0.069653854321512, + "grad_norm": 13.547776065765369, + "learning_rate": 4.999696155863369e-06, + "loss": 1.5234, + "step": 492 + }, + { + "epoch": 0.06979542719614922, + "grad_norm": 15.924343048869737, + "learning_rate": 4.999687153748638e-06, + "loss": 1.4671, + "step": 493 + }, + { + "epoch": 0.06993700007078643, + "grad_norm": 13.914658513870084, + "learning_rate": 4.9996780202272175e-06, + "loss": 1.3998, 
+ "step": 494 + }, + { + "epoch": 0.07007857294542366, + "grad_norm": 12.99757538706288, + "learning_rate": 4.999668755299588e-06, + "loss": 1.4688, + "step": 495 + }, + { + "epoch": 0.07022014582006088, + "grad_norm": 13.80811360993397, + "learning_rate": 4.999659358966235e-06, + "loss": 1.3305, + "step": 496 + }, + { + "epoch": 0.0703617186946981, + "grad_norm": 14.82587842735054, + "learning_rate": 4.999649831227654e-06, + "loss": 1.5667, + "step": 497 + }, + { + "epoch": 0.07050329156933531, + "grad_norm": 9.403285779304255, + "learning_rate": 4.999640172084345e-06, + "loss": 1.4057, + "step": 498 + }, + { + "epoch": 0.07064486444397254, + "grad_norm": 11.47047681567363, + "learning_rate": 4.999630381536815e-06, + "loss": 1.2765, + "step": 499 + }, + { + "epoch": 0.07078643731860976, + "grad_norm": 13.185388766688336, + "learning_rate": 4.99962045958558e-06, + "loss": 1.4575, + "step": 500 + }, + { + "epoch": 0.07092801019324697, + "grad_norm": 12.577961834151319, + "learning_rate": 4.999610406231162e-06, + "loss": 1.5344, + "step": 501 + }, + { + "epoch": 0.07106958306788419, + "grad_norm": 11.805460452027061, + "learning_rate": 4.999600221474089e-06, + "loss": 1.5744, + "step": 502 + }, + { + "epoch": 0.0712111559425214, + "grad_norm": 12.9381964307195, + "learning_rate": 4.999589905314895e-06, + "loss": 1.414, + "step": 503 + }, + { + "epoch": 0.07135272881715864, + "grad_norm": 17.075614165267783, + "learning_rate": 4.9995794577541235e-06, + "loss": 1.68, + "step": 504 + }, + { + "epoch": 0.07149430169179585, + "grad_norm": 10.936315156247892, + "learning_rate": 4.999568878792324e-06, + "loss": 1.4558, + "step": 505 + }, + { + "epoch": 0.07163587456643307, + "grad_norm": 12.795593719242952, + "learning_rate": 4.999558168430053e-06, + "loss": 1.421, + "step": 506 + }, + { + "epoch": 0.07177744744107029, + "grad_norm": 15.565551948797163, + "learning_rate": 4.999547326667872e-06, + "loss": 1.554, + "step": 507 + }, + { + "epoch": 0.07191902031570752, + "grad_norm": 11.833314994117245, + "learning_rate": 4.999536353506352e-06, + "loss": 1.4762, + "step": 508 + }, + { + "epoch": 0.07206059319034473, + "grad_norm": 9.09547098660986, + "learning_rate": 4.99952524894607e-06, + "loss": 1.4605, + "step": 509 + }, + { + "epoch": 0.07220216606498195, + "grad_norm": 13.719351718367221, + "learning_rate": 4.999514012987609e-06, + "loss": 1.619, + "step": 510 + }, + { + "epoch": 0.07234373893961916, + "grad_norm": 17.19131349739237, + "learning_rate": 4.99950264563156e-06, + "loss": 1.4046, + "step": 511 + }, + { + "epoch": 0.0724853118142564, + "grad_norm": 13.688050141448368, + "learning_rate": 4.99949114687852e-06, + "loss": 1.5382, + "step": 512 + }, + { + "epoch": 0.07262688468889361, + "grad_norm": 18.777691953163217, + "learning_rate": 4.9994795167290954e-06, + "loss": 1.5931, + "step": 513 + }, + { + "epoch": 0.07276845756353083, + "grad_norm": 13.890788674865284, + "learning_rate": 4.999467755183895e-06, + "loss": 1.4583, + "step": 514 + }, + { + "epoch": 0.07291003043816804, + "grad_norm": 11.512549808821065, + "learning_rate": 4.999455862243539e-06, + "loss": 1.3902, + "step": 515 + }, + { + "epoch": 0.07305160331280526, + "grad_norm": 14.734938350842361, + "learning_rate": 4.999443837908653e-06, + "loss": 1.6652, + "step": 516 + }, + { + "epoch": 0.07319317618744249, + "grad_norm": 11.054571899240978, + "learning_rate": 4.999431682179867e-06, + "loss": 1.3328, + "step": 517 + }, + { + "epoch": 0.07333474906207971, + "grad_norm": 12.59049517881073, + "learning_rate": 
4.999419395057821e-06, + "loss": 1.6232, + "step": 518 + }, + { + "epoch": 0.07347632193671692, + "grad_norm": 14.59989195483218, + "learning_rate": 4.999406976543162e-06, + "loss": 1.5964, + "step": 519 + }, + { + "epoch": 0.07361789481135414, + "grad_norm": 9.666164999035255, + "learning_rate": 4.999394426636541e-06, + "loss": 1.3423, + "step": 520 + }, + { + "epoch": 0.07375946768599137, + "grad_norm": 11.517200377740053, + "learning_rate": 4.9993817453386185e-06, + "loss": 1.4648, + "step": 521 + }, + { + "epoch": 0.07390104056062859, + "grad_norm": 11.925381406046832, + "learning_rate": 4.999368932650062e-06, + "loss": 1.2968, + "step": 522 + }, + { + "epoch": 0.0740426134352658, + "grad_norm": 11.650467079851818, + "learning_rate": 4.999355988571544e-06, + "loss": 1.5174, + "step": 523 + }, + { + "epoch": 0.07418418630990302, + "grad_norm": 11.306361097609225, + "learning_rate": 4.999342913103745e-06, + "loss": 1.6329, + "step": 524 + }, + { + "epoch": 0.07432575918454024, + "grad_norm": 9.930882583841326, + "learning_rate": 4.999329706247353e-06, + "loss": 1.5081, + "step": 525 + }, + { + "epoch": 0.07446733205917747, + "grad_norm": 9.176820114003824, + "learning_rate": 4.999316368003062e-06, + "loss": 1.6258, + "step": 526 + }, + { + "epoch": 0.07460890493381468, + "grad_norm": 10.538498554453883, + "learning_rate": 4.999302898371572e-06, + "loss": 1.5378, + "step": 527 + }, + { + "epoch": 0.0747504778084519, + "grad_norm": 11.622740166985487, + "learning_rate": 4.999289297353593e-06, + "loss": 1.288, + "step": 528 + }, + { + "epoch": 0.07489205068308911, + "grad_norm": 12.277252400829425, + "learning_rate": 4.9992755649498395e-06, + "loss": 1.4761, + "step": 529 + }, + { + "epoch": 0.07503362355772634, + "grad_norm": 10.835349927179722, + "learning_rate": 4.999261701161033e-06, + "loss": 1.5363, + "step": 530 + }, + { + "epoch": 0.07517519643236356, + "grad_norm": 13.249466219532735, + "learning_rate": 4.999247705987902e-06, + "loss": 1.5437, + "step": 531 + }, + { + "epoch": 0.07531676930700078, + "grad_norm": 12.464650333425736, + "learning_rate": 4.999233579431183e-06, + "loss": 1.4988, + "step": 532 + }, + { + "epoch": 0.075458342181638, + "grad_norm": 14.822912491855673, + "learning_rate": 4.999219321491618e-06, + "loss": 1.5167, + "step": 533 + }, + { + "epoch": 0.07559991505627522, + "grad_norm": 11.172517942054395, + "learning_rate": 4.999204932169958e-06, + "loss": 1.3949, + "step": 534 + }, + { + "epoch": 0.07574148793091244, + "grad_norm": 11.874834240208234, + "learning_rate": 4.999190411466956e-06, + "loss": 1.4766, + "step": 535 + }, + { + "epoch": 0.07588306080554966, + "grad_norm": 14.305089106357231, + "learning_rate": 4.999175759383379e-06, + "loss": 1.582, + "step": 536 + }, + { + "epoch": 0.07602463368018687, + "grad_norm": 10.74180254992077, + "learning_rate": 4.9991609759199954e-06, + "loss": 1.4935, + "step": 537 + }, + { + "epoch": 0.07616620655482409, + "grad_norm": 10.922164192791504, + "learning_rate": 4.9991460610775825e-06, + "loss": 1.4974, + "step": 538 + }, + { + "epoch": 0.07630777942946132, + "grad_norm": 16.01472653729851, + "learning_rate": 4.999131014856925e-06, + "loss": 1.6579, + "step": 539 + }, + { + "epoch": 0.07644935230409854, + "grad_norm": 13.823007858807301, + "learning_rate": 4.999115837258813e-06, + "loss": 1.4678, + "step": 540 + }, + { + "epoch": 0.07659092517873575, + "grad_norm": 14.743865725720848, + "learning_rate": 4.999100528284045e-06, + "loss": 1.3637, + "step": 541 + }, + { + "epoch": 0.07673249805337297, + "grad_norm": 
20.605775693803896, + "learning_rate": 4.999085087933426e-06, + "loss": 1.5534, + "step": 542 + }, + { + "epoch": 0.0768740709280102, + "grad_norm": 12.025224876283824, + "learning_rate": 4.999069516207767e-06, + "loss": 1.4447, + "step": 543 + }, + { + "epoch": 0.07701564380264742, + "grad_norm": 15.570002819298677, + "learning_rate": 4.9990538131078885e-06, + "loss": 1.5326, + "step": 544 + }, + { + "epoch": 0.07715721667728463, + "grad_norm": 14.216791911018847, + "learning_rate": 4.9990379786346126e-06, + "loss": 1.547, + "step": 545 + }, + { + "epoch": 0.07729878955192185, + "grad_norm": 12.584309790579436, + "learning_rate": 4.999022012788774e-06, + "loss": 1.3976, + "step": 546 + }, + { + "epoch": 0.07744036242655906, + "grad_norm": 11.70358458814709, + "learning_rate": 4.999005915571211e-06, + "loss": 1.3942, + "step": 547 + }, + { + "epoch": 0.0775819353011963, + "grad_norm": 13.50626538729023, + "learning_rate": 4.998989686982771e-06, + "loss": 1.3389, + "step": 548 + }, + { + "epoch": 0.07772350817583351, + "grad_norm": 12.007381711937532, + "learning_rate": 4.998973327024306e-06, + "loss": 1.6479, + "step": 549 + }, + { + "epoch": 0.07786508105047073, + "grad_norm": 10.783130072712611, + "learning_rate": 4.998956835696676e-06, + "loss": 1.6616, + "step": 550 + }, + { + "epoch": 0.07800665392510794, + "grad_norm": 14.521164279634496, + "learning_rate": 4.99894021300075e-06, + "loss": 1.5159, + "step": 551 + }, + { + "epoch": 0.07814822679974517, + "grad_norm": 10.74762681892005, + "learning_rate": 4.998923458937399e-06, + "loss": 1.498, + "step": 552 + }, + { + "epoch": 0.07828979967438239, + "grad_norm": 10.794268987798898, + "learning_rate": 4.998906573507506e-06, + "loss": 1.6019, + "step": 553 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 10.770347677727079, + "learning_rate": 4.998889556711958e-06, + "loss": 1.3818, + "step": 554 + }, + { + "epoch": 0.07857294542365682, + "grad_norm": 13.408105681774469, + "learning_rate": 4.998872408551648e-06, + "loss": 1.5437, + "step": 555 + }, + { + "epoch": 0.07871451829829405, + "grad_norm": 12.028758873593839, + "learning_rate": 4.998855129027479e-06, + "loss": 1.4907, + "step": 556 + }, + { + "epoch": 0.07885609117293127, + "grad_norm": 10.56384666029411, + "learning_rate": 4.998837718140359e-06, + "loss": 1.2869, + "step": 557 + }, + { + "epoch": 0.07899766404756849, + "grad_norm": 12.28478983952787, + "learning_rate": 4.998820175891204e-06, + "loss": 1.3529, + "step": 558 + }, + { + "epoch": 0.0791392369222057, + "grad_norm": 11.792219564642705, + "learning_rate": 4.998802502280936e-06, + "loss": 1.4622, + "step": 559 + }, + { + "epoch": 0.07928080979684292, + "grad_norm": 13.329092368542602, + "learning_rate": 4.998784697310483e-06, + "loss": 1.3649, + "step": 560 + }, + { + "epoch": 0.07942238267148015, + "grad_norm": 11.057869324559023, + "learning_rate": 4.998766760980781e-06, + "loss": 1.2895, + "step": 561 + }, + { + "epoch": 0.07956395554611737, + "grad_norm": 17.875739235697534, + "learning_rate": 4.998748693292774e-06, + "loss": 1.4857, + "step": 562 + }, + { + "epoch": 0.07970552842075458, + "grad_norm": 11.603381328386702, + "learning_rate": 4.9987304942474115e-06, + "loss": 1.4248, + "step": 563 + }, + { + "epoch": 0.0798471012953918, + "grad_norm": 11.887607026044563, + "learning_rate": 4.99871216384565e-06, + "loss": 1.5283, + "step": 564 + }, + { + "epoch": 0.07998867417002903, + "grad_norm": 15.473891388311657, + "learning_rate": 4.998693702088453e-06, + "loss": 1.3257, + "step": 565 + }, + { + "epoch": 
0.08013024704466624, + "grad_norm": 12.20093347699105, + "learning_rate": 4.998675108976792e-06, + "loss": 1.4658, + "step": 566 + }, + { + "epoch": 0.08027181991930346, + "grad_norm": 9.263750320864883, + "learning_rate": 4.998656384511643e-06, + "loss": 1.3546, + "step": 567 + }, + { + "epoch": 0.08041339279394068, + "grad_norm": 11.217979468446721, + "learning_rate": 4.998637528693991e-06, + "loss": 1.3715, + "step": 568 + }, + { + "epoch": 0.0805549656685779, + "grad_norm": 10.859395374306427, + "learning_rate": 4.998618541524827e-06, + "loss": 1.5041, + "step": 569 + }, + { + "epoch": 0.08069653854321512, + "grad_norm": 11.334176050905937, + "learning_rate": 4.998599423005149e-06, + "loss": 1.5975, + "step": 570 + }, + { + "epoch": 0.08083811141785234, + "grad_norm": 12.853243621048783, + "learning_rate": 4.998580173135963e-06, + "loss": 1.6349, + "step": 571 + }, + { + "epoch": 0.08097968429248956, + "grad_norm": 12.323518576584444, + "learning_rate": 4.99856079191828e-06, + "loss": 1.6108, + "step": 572 + }, + { + "epoch": 0.08112125716712677, + "grad_norm": 12.192347418413284, + "learning_rate": 4.998541279353119e-06, + "loss": 1.3957, + "step": 573 + }, + { + "epoch": 0.081262830041764, + "grad_norm": 13.405047453896273, + "learning_rate": 4.998521635441506e-06, + "loss": 1.6809, + "step": 574 + }, + { + "epoch": 0.08140440291640122, + "grad_norm": 15.857695031095117, + "learning_rate": 4.998501860184474e-06, + "loss": 1.5762, + "step": 575 + }, + { + "epoch": 0.08154597579103844, + "grad_norm": 9.232440674893594, + "learning_rate": 4.998481953583062e-06, + "loss": 1.3994, + "step": 576 + }, + { + "epoch": 0.08168754866567565, + "grad_norm": 12.481748760651428, + "learning_rate": 4.998461915638316e-06, + "loss": 1.5891, + "step": 577 + }, + { + "epoch": 0.08182912154031288, + "grad_norm": 12.296861350453034, + "learning_rate": 4.9984417463512916e-06, + "loss": 1.5558, + "step": 578 + }, + { + "epoch": 0.0819706944149501, + "grad_norm": 15.326900514929013, + "learning_rate": 4.998421445723046e-06, + "loss": 1.5534, + "step": 579 + }, + { + "epoch": 0.08211226728958732, + "grad_norm": 10.31201799595626, + "learning_rate": 4.9984010137546475e-06, + "loss": 1.4455, + "step": 580 + }, + { + "epoch": 0.08225384016422453, + "grad_norm": 12.5087042448355, + "learning_rate": 4.998380450447172e-06, + "loss": 1.3342, + "step": 581 + }, + { + "epoch": 0.08239541303886175, + "grad_norm": 13.580405514579809, + "learning_rate": 4.998359755801699e-06, + "loss": 1.4596, + "step": 582 + }, + { + "epoch": 0.08253698591349898, + "grad_norm": 13.054087288902624, + "learning_rate": 4.9983389298193165e-06, + "loss": 1.3979, + "step": 583 + }, + { + "epoch": 0.0826785587881362, + "grad_norm": 16.097481363554984, + "learning_rate": 4.998317972501119e-06, + "loss": 1.5228, + "step": 584 + }, + { + "epoch": 0.08282013166277341, + "grad_norm": 10.742935698375636, + "learning_rate": 4.9982968838482085e-06, + "loss": 1.4778, + "step": 585 + }, + { + "epoch": 0.08296170453741063, + "grad_norm": 9.903097234646703, + "learning_rate": 4.998275663861692e-06, + "loss": 1.4816, + "step": 586 + }, + { + "epoch": 0.08310327741204786, + "grad_norm": 11.122389240600851, + "learning_rate": 4.998254312542689e-06, + "loss": 1.4134, + "step": 587 + }, + { + "epoch": 0.08324485028668507, + "grad_norm": 14.183161269570082, + "learning_rate": 4.998232829892319e-06, + "loss": 1.3665, + "step": 588 + }, + { + "epoch": 0.08338642316132229, + "grad_norm": 11.155595861753048, + "learning_rate": 4.998211215911711e-06, + "loss": 
1.6073, + "step": 589 + }, + { + "epoch": 0.0835279960359595, + "grad_norm": 11.930498396638235, + "learning_rate": 4.998189470602003e-06, + "loss": 1.5122, + "step": 590 + }, + { + "epoch": 0.08366956891059672, + "grad_norm": 11.281233430887244, + "learning_rate": 4.998167593964337e-06, + "loss": 1.6088, + "step": 591 + }, + { + "epoch": 0.08381114178523395, + "grad_norm": 10.698549503872774, + "learning_rate": 4.998145585999864e-06, + "loss": 1.718, + "step": 592 + }, + { + "epoch": 0.08395271465987117, + "grad_norm": 12.130629029219023, + "learning_rate": 4.998123446709739e-06, + "loss": 1.4406, + "step": 593 + }, + { + "epoch": 0.08409428753450839, + "grad_norm": 12.019215140464599, + "learning_rate": 4.998101176095128e-06, + "loss": 1.5776, + "step": 594 + }, + { + "epoch": 0.0842358604091456, + "grad_norm": 13.126647845775103, + "learning_rate": 4.9980787741572e-06, + "loss": 1.5982, + "step": 595 + }, + { + "epoch": 0.08437743328378283, + "grad_norm": 10.516419424625877, + "learning_rate": 4.998056240897134e-06, + "loss": 1.4333, + "step": 596 + }, + { + "epoch": 0.08451900615842005, + "grad_norm": 10.987187987184102, + "learning_rate": 4.9980335763161145e-06, + "loss": 1.3892, + "step": 597 + }, + { + "epoch": 0.08466057903305726, + "grad_norm": 11.449621895623208, + "learning_rate": 4.998010780415332e-06, + "loss": 1.5521, + "step": 598 + }, + { + "epoch": 0.08480215190769448, + "grad_norm": 10.550068960823195, + "learning_rate": 4.997987853195985e-06, + "loss": 1.3055, + "step": 599 + }, + { + "epoch": 0.08494372478233171, + "grad_norm": 12.453996769007968, + "learning_rate": 4.99796479465928e-06, + "loss": 1.5673, + "step": 600 + }, + { + "epoch": 0.08508529765696893, + "grad_norm": 12.33823860840792, + "learning_rate": 4.997941604806428e-06, + "loss": 1.5271, + "step": 601 + }, + { + "epoch": 0.08522687053160614, + "grad_norm": 9.999167979913496, + "learning_rate": 4.997918283638647e-06, + "loss": 1.5314, + "step": 602 + }, + { + "epoch": 0.08536844340624336, + "grad_norm": 12.236385954821447, + "learning_rate": 4.9978948311571666e-06, + "loss": 1.3992, + "step": 603 + }, + { + "epoch": 0.08551001628088058, + "grad_norm": 13.226262548282254, + "learning_rate": 4.997871247363217e-06, + "loss": 1.5569, + "step": 604 + }, + { + "epoch": 0.08565158915551781, + "grad_norm": 11.555384740626607, + "learning_rate": 4.997847532258037e-06, + "loss": 1.3373, + "step": 605 + }, + { + "epoch": 0.08579316203015502, + "grad_norm": 11.366308895383472, + "learning_rate": 4.997823685842875e-06, + "loss": 1.4023, + "step": 606 + }, + { + "epoch": 0.08593473490479224, + "grad_norm": 13.47912566204801, + "learning_rate": 4.997799708118985e-06, + "loss": 1.4767, + "step": 607 + }, + { + "epoch": 0.08607630777942946, + "grad_norm": 13.01151645838697, + "learning_rate": 4.997775599087627e-06, + "loss": 1.4361, + "step": 608 + }, + { + "epoch": 0.08621788065406669, + "grad_norm": 20.13331498572585, + "learning_rate": 4.997751358750068e-06, + "loss": 1.4365, + "step": 609 + }, + { + "epoch": 0.0863594535287039, + "grad_norm": 13.686224054933337, + "learning_rate": 4.997726987107582e-06, + "loss": 1.5838, + "step": 610 + }, + { + "epoch": 0.08650102640334112, + "grad_norm": 27.791031478951307, + "learning_rate": 4.997702484161451e-06, + "loss": 1.6395, + "step": 611 + }, + { + "epoch": 0.08664259927797834, + "grad_norm": 13.08566405964313, + "learning_rate": 4.997677849912963e-06, + "loss": 1.6016, + "step": 612 + }, + { + "epoch": 0.08678417215261557, + "grad_norm": 12.454622836717409, + 
"learning_rate": 4.997653084363412e-06, + "loss": 1.4053, + "step": 613 + }, + { + "epoch": 0.08692574502725278, + "grad_norm": 33.64316859287707, + "learning_rate": 4.997628187514101e-06, + "loss": 1.4172, + "step": 614 + }, + { + "epoch": 0.08706731790189, + "grad_norm": 11.693265257941347, + "learning_rate": 4.997603159366339e-06, + "loss": 1.3308, + "step": 615 + }, + { + "epoch": 0.08720889077652721, + "grad_norm": 63.69815600254846, + "learning_rate": 4.99757799992144e-06, + "loss": 1.6563, + "step": 616 + }, + { + "epoch": 0.08735046365116443, + "grad_norm": 19.55322981982053, + "learning_rate": 4.997552709180729e-06, + "loss": 1.5322, + "step": 617 + }, + { + "epoch": 0.08749203652580166, + "grad_norm": 13.867243067670357, + "learning_rate": 4.997527287145534e-06, + "loss": 1.6664, + "step": 618 + }, + { + "epoch": 0.08763360940043888, + "grad_norm": 11.806626893343958, + "learning_rate": 4.997501733817191e-06, + "loss": 1.4898, + "step": 619 + }, + { + "epoch": 0.0877751822750761, + "grad_norm": 12.095407140785596, + "learning_rate": 4.997476049197046e-06, + "loss": 1.4713, + "step": 620 + }, + { + "epoch": 0.08791675514971331, + "grad_norm": 12.976527811701077, + "learning_rate": 4.9974502332864464e-06, + "loss": 1.6953, + "step": 621 + }, + { + "epoch": 0.08805832802435054, + "grad_norm": 15.734348816078015, + "learning_rate": 4.99742428608675e-06, + "loss": 1.4128, + "step": 622 + }, + { + "epoch": 0.08819990089898776, + "grad_norm": 12.002069930523964, + "learning_rate": 4.9973982075993204e-06, + "loss": 1.5239, + "step": 623 + }, + { + "epoch": 0.08834147377362497, + "grad_norm": 9.998125663576547, + "learning_rate": 4.99737199782553e-06, + "loss": 1.4491, + "step": 624 + }, + { + "epoch": 0.08848304664826219, + "grad_norm": 18.504706531538385, + "learning_rate": 4.997345656766755e-06, + "loss": 1.5198, + "step": 625 + }, + { + "epoch": 0.0886246195228994, + "grad_norm": 15.063154910717005, + "learning_rate": 4.997319184424382e-06, + "loss": 1.6458, + "step": 626 + }, + { + "epoch": 0.08876619239753664, + "grad_norm": 33.441875078844824, + "learning_rate": 4.997292580799801e-06, + "loss": 1.4896, + "step": 627 + }, + { + "epoch": 0.08890776527217385, + "grad_norm": 99.67360796620136, + "learning_rate": 4.997265845894411e-06, + "loss": 1.4678, + "step": 628 + }, + { + "epoch": 0.08904933814681107, + "grad_norm": 75.94944607681481, + "learning_rate": 4.997238979709617e-06, + "loss": 1.7554, + "step": 629 + }, + { + "epoch": 0.08919091102144829, + "grad_norm": 29.49600140902312, + "learning_rate": 4.997211982246833e-06, + "loss": 1.4393, + "step": 630 + }, + { + "epoch": 0.08933248389608552, + "grad_norm": 60.09388609903985, + "learning_rate": 4.997184853507476e-06, + "loss": 1.5791, + "step": 631 + }, + { + "epoch": 0.08947405677072273, + "grad_norm": 261.71309135673744, + "learning_rate": 4.997157593492974e-06, + "loss": 2.1542, + "step": 632 + }, + { + "epoch": 0.08961562964535995, + "grad_norm": 833.5889274696492, + "learning_rate": 4.997130202204759e-06, + "loss": 8.2476, + "step": 633 + }, + { + "epoch": 0.08975720251999716, + "grad_norm": 956.0032034465156, + "learning_rate": 4.997102679644271e-06, + "loss": 14.22, + "step": 634 + }, + { + "epoch": 0.0898987753946344, + "grad_norm": 430.23429300208045, + "learning_rate": 4.997075025812957e-06, + "loss": 8.0691, + "step": 635 + }, + { + "epoch": 0.09004034826927161, + "grad_norm": 362.30667258553217, + "learning_rate": 4.997047240712272e-06, + "loss": 5.1684, + "step": 636 + }, + { + "epoch": 0.09018192114390883, + 
"grad_norm": 349.0120543226265, + "learning_rate": 4.997019324343674e-06, + "loss": 5.9287, + "step": 637 + }, + { + "epoch": 0.09032349401854604, + "grad_norm": 200.0047362720443, + "learning_rate": 4.996991276708633e-06, + "loss": 4.6246, + "step": 638 + }, + { + "epoch": 0.09046506689318326, + "grad_norm": 156.94261491067363, + "learning_rate": 4.996963097808622e-06, + "loss": 3.2692, + "step": 639 + }, + { + "epoch": 0.09060663976782049, + "grad_norm": 112.93656612784895, + "learning_rate": 4.996934787645123e-06, + "loss": 3.019, + "step": 640 + }, + { + "epoch": 0.0907482126424577, + "grad_norm": 154.87078157087927, + "learning_rate": 4.996906346219623e-06, + "loss": 2.68, + "step": 641 + }, + { + "epoch": 0.09088978551709492, + "grad_norm": 83.58988016743564, + "learning_rate": 4.996877773533619e-06, + "loss": 2.8031, + "step": 642 + }, + { + "epoch": 0.09103135839173214, + "grad_norm": 40.53634557406476, + "learning_rate": 4.996849069588612e-06, + "loss": 2.2218, + "step": 643 + }, + { + "epoch": 0.09117293126636937, + "grad_norm": 89.59449663972113, + "learning_rate": 4.996820234386112e-06, + "loss": 2.2926, + "step": 644 + }, + { + "epoch": 0.09131450414100659, + "grad_norm": 39.404171361711505, + "learning_rate": 4.996791267927632e-06, + "loss": 2.4727, + "step": 645 + }, + { + "epoch": 0.0914560770156438, + "grad_norm": 32.55745559835815, + "learning_rate": 4.996762170214698e-06, + "loss": 2.1037, + "step": 646 + }, + { + "epoch": 0.09159764989028102, + "grad_norm": 35.91285670017337, + "learning_rate": 4.996732941248839e-06, + "loss": 2.2015, + "step": 647 + }, + { + "epoch": 0.09173922276491824, + "grad_norm": 27.37430101312235, + "learning_rate": 4.99670358103159e-06, + "loss": 2.0208, + "step": 648 + }, + { + "epoch": 0.09188079563955547, + "grad_norm": 19.10302828604056, + "learning_rate": 4.996674089564495e-06, + "loss": 1.9139, + "step": 649 + }, + { + "epoch": 0.09202236851419268, + "grad_norm": 24.40575483652626, + "learning_rate": 4.9966444668491055e-06, + "loss": 1.915, + "step": 650 + }, + { + "epoch": 0.0921639413888299, + "grad_norm": 23.90962368964574, + "learning_rate": 4.996614712886978e-06, + "loss": 1.9129, + "step": 651 + }, + { + "epoch": 0.09230551426346711, + "grad_norm": 14.977682168305135, + "learning_rate": 4.996584827679676e-06, + "loss": 1.8413, + "step": 652 + }, + { + "epoch": 0.09244708713810434, + "grad_norm": 17.727523111918195, + "learning_rate": 4.996554811228772e-06, + "loss": 1.8157, + "step": 653 + }, + { + "epoch": 0.09258866001274156, + "grad_norm": 17.307209184212624, + "learning_rate": 4.996524663535842e-06, + "loss": 1.8923, + "step": 654 + }, + { + "epoch": 0.09273023288737878, + "grad_norm": 16.586874016777166, + "learning_rate": 4.996494384602473e-06, + "loss": 1.7986, + "step": 655 + }, + { + "epoch": 0.092871805762016, + "grad_norm": 14.67964904033014, + "learning_rate": 4.996463974430255e-06, + "loss": 1.8594, + "step": 656 + }, + { + "epoch": 0.09301337863665322, + "grad_norm": 17.107621556839256, + "learning_rate": 4.996433433020788e-06, + "loss": 1.8018, + "step": 657 + }, + { + "epoch": 0.09315495151129044, + "grad_norm": 18.781724463640845, + "learning_rate": 4.996402760375676e-06, + "loss": 1.7647, + "step": 658 + }, + { + "epoch": 0.09329652438592766, + "grad_norm": 15.103173514851989, + "learning_rate": 4.996371956496532e-06, + "loss": 1.7632, + "step": 659 + }, + { + "epoch": 0.09343809726056487, + "grad_norm": 22.185477323076338, + "learning_rate": 4.996341021384976e-06, + "loss": 1.884, + "step": 660 + }, + { + "epoch": 
0.09357967013520209, + "grad_norm": 13.069686154112201, + "learning_rate": 4.996309955042634e-06, + "loss": 1.6154, + "step": 661 + }, + { + "epoch": 0.09372124300983932, + "grad_norm": 14.43765377606823, + "learning_rate": 4.996278757471139e-06, + "loss": 1.807, + "step": 662 + }, + { + "epoch": 0.09386281588447654, + "grad_norm": 15.026096277341548, + "learning_rate": 4.996247428672132e-06, + "loss": 1.7516, + "step": 663 + }, + { + "epoch": 0.09400438875911375, + "grad_norm": 12.58441952360512, + "learning_rate": 4.996215968647258e-06, + "loss": 1.6755, + "step": 664 + }, + { + "epoch": 0.09414596163375097, + "grad_norm": 14.5020887251657, + "learning_rate": 4.996184377398171e-06, + "loss": 1.7809, + "step": 665 + }, + { + "epoch": 0.0942875345083882, + "grad_norm": 15.816945532915353, + "learning_rate": 4.996152654926534e-06, + "loss": 1.8357, + "step": 666 + }, + { + "epoch": 0.09442910738302542, + "grad_norm": 12.124910649235858, + "learning_rate": 4.996120801234012e-06, + "loss": 1.8485, + "step": 667 + }, + { + "epoch": 0.09457068025766263, + "grad_norm": 12.146126186881817, + "learning_rate": 4.996088816322281e-06, + "loss": 1.7427, + "step": 668 + }, + { + "epoch": 0.09471225313229985, + "grad_norm": 12.383358377163507, + "learning_rate": 4.996056700193023e-06, + "loss": 1.6613, + "step": 669 + }, + { + "epoch": 0.09485382600693706, + "grad_norm": 12.55856944982097, + "learning_rate": 4.996024452847924e-06, + "loss": 1.7892, + "step": 670 + }, + { + "epoch": 0.0949953988815743, + "grad_norm": 9.979963065100076, + "learning_rate": 4.9959920742886815e-06, + "loss": 1.8824, + "step": 671 + }, + { + "epoch": 0.09513697175621151, + "grad_norm": 15.623803665138553, + "learning_rate": 4.995959564516997e-06, + "loss": 1.7777, + "step": 672 + }, + { + "epoch": 0.09527854463084873, + "grad_norm": 11.113736740995106, + "learning_rate": 4.995926923534578e-06, + "loss": 1.6039, + "step": 673 + }, + { + "epoch": 0.09542011750548594, + "grad_norm": 12.044700020506296, + "learning_rate": 4.995894151343143e-06, + "loss": 1.6588, + "step": 674 + }, + { + "epoch": 0.09556169038012317, + "grad_norm": 9.863062342867801, + "learning_rate": 4.9958612479444125e-06, + "loss": 1.5908, + "step": 675 + }, + { + "epoch": 0.09570326325476039, + "grad_norm": 11.539173577664423, + "learning_rate": 4.995828213340118e-06, + "loss": 1.7817, + "step": 676 + }, + { + "epoch": 0.0958448361293976, + "grad_norm": 10.94668420205549, + "learning_rate": 4.995795047531994e-06, + "loss": 1.6401, + "step": 677 + }, + { + "epoch": 0.09598640900403482, + "grad_norm": 11.794534251423743, + "learning_rate": 4.995761750521787e-06, + "loss": 1.7014, + "step": 678 + }, + { + "epoch": 0.09612798187867205, + "grad_norm": 11.350778803384346, + "learning_rate": 4.995728322311244e-06, + "loss": 1.4844, + "step": 679 + }, + { + "epoch": 0.09626955475330927, + "grad_norm": 10.883551732004706, + "learning_rate": 4.995694762902125e-06, + "loss": 1.6731, + "step": 680 + }, + { + "epoch": 0.09641112762794649, + "grad_norm": 11.79099863988548, + "learning_rate": 4.9956610722961936e-06, + "loss": 1.6883, + "step": 681 + }, + { + "epoch": 0.0965527005025837, + "grad_norm": 13.557387834408889, + "learning_rate": 4.99562725049522e-06, + "loss": 1.6599, + "step": 682 + }, + { + "epoch": 0.09669427337722092, + "grad_norm": 9.681447753462669, + "learning_rate": 4.9955932975009825e-06, + "loss": 1.7209, + "step": 683 + }, + { + "epoch": 0.09683584625185815, + "grad_norm": 10.832149577878763, + "learning_rate": 4.995559213315267e-06, + "loss": 1.6219, 
+ "step": 684 + }, + { + "epoch": 0.09697741912649536, + "grad_norm": 11.1240398549858, + "learning_rate": 4.9955249979398625e-06, + "loss": 1.5699, + "step": 685 + }, + { + "epoch": 0.09711899200113258, + "grad_norm": 12.402177041202409, + "learning_rate": 4.995490651376571e-06, + "loss": 1.6338, + "step": 686 + }, + { + "epoch": 0.0972605648757698, + "grad_norm": 8.826652687653288, + "learning_rate": 4.9954561736271966e-06, + "loss": 1.4361, + "step": 687 + }, + { + "epoch": 0.09740213775040703, + "grad_norm": 12.081370718284296, + "learning_rate": 4.995421564693551e-06, + "loss": 1.694, + "step": 688 + }, + { + "epoch": 0.09754371062504424, + "grad_norm": 11.568901970275812, + "learning_rate": 4.995386824577455e-06, + "loss": 1.6181, + "step": 689 + }, + { + "epoch": 0.09768528349968146, + "grad_norm": 10.520255759723716, + "learning_rate": 4.995351953280735e-06, + "loss": 1.6927, + "step": 690 + }, + { + "epoch": 0.09782685637431868, + "grad_norm": 11.6620333235776, + "learning_rate": 4.995316950805223e-06, + "loss": 1.5393, + "step": 691 + }, + { + "epoch": 0.0979684292489559, + "grad_norm": 10.624676867788585, + "learning_rate": 4.995281817152759e-06, + "loss": 1.5751, + "step": 692 + }, + { + "epoch": 0.09811000212359312, + "grad_norm": 15.811520029693984, + "learning_rate": 4.995246552325191e-06, + "loss": 1.7659, + "step": 693 + }, + { + "epoch": 0.09825157499823034, + "grad_norm": 9.260416300501493, + "learning_rate": 4.9952111563243715e-06, + "loss": 1.5984, + "step": 694 + }, + { + "epoch": 0.09839314787286756, + "grad_norm": 10.574289818402663, + "learning_rate": 4.995175629152162e-06, + "loss": 1.4672, + "step": 695 + }, + { + "epoch": 0.09853472074750477, + "grad_norm": 12.619810945331093, + "learning_rate": 4.995139970810431e-06, + "loss": 1.5841, + "step": 696 + }, + { + "epoch": 0.098676293622142, + "grad_norm": 10.688343037154207, + "learning_rate": 4.995104181301052e-06, + "loss": 1.6306, + "step": 697 + }, + { + "epoch": 0.09881786649677922, + "grad_norm": 10.64254246042611, + "learning_rate": 4.995068260625906e-06, + "loss": 1.6155, + "step": 698 + }, + { + "epoch": 0.09895943937141644, + "grad_norm": 11.752506719188478, + "learning_rate": 4.995032208786883e-06, + "loss": 1.5808, + "step": 699 + }, + { + "epoch": 0.09910101224605365, + "grad_norm": 9.882519140902916, + "learning_rate": 4.994996025785876e-06, + "loss": 1.5292, + "step": 700 + }, + { + "epoch": 0.09924258512069088, + "grad_norm": 11.49757293299805, + "learning_rate": 4.99495971162479e-06, + "loss": 1.767, + "step": 701 + }, + { + "epoch": 0.0993841579953281, + "grad_norm": 9.63913115515447, + "learning_rate": 4.9949232663055304e-06, + "loss": 1.4986, + "step": 702 + }, + { + "epoch": 0.09952573086996531, + "grad_norm": 11.89502320924333, + "learning_rate": 4.994886689830015e-06, + "loss": 1.8406, + "step": 703 + }, + { + "epoch": 0.09966730374460253, + "grad_norm": 11.211246456979733, + "learning_rate": 4.994849982200168e-06, + "loss": 1.6265, + "step": 704 + }, + { + "epoch": 0.09980887661923975, + "grad_norm": 11.459386344843717, + "learning_rate": 4.994813143417917e-06, + "loss": 1.6349, + "step": 705 + }, + { + "epoch": 0.09995044949387698, + "grad_norm": 13.459429213053982, + "learning_rate": 4.994776173485199e-06, + "loss": 1.5345, + "step": 706 + }, + { + "epoch": 0.1000920223685142, + "grad_norm": 10.669543077147102, + "learning_rate": 4.994739072403958e-06, + "loss": 1.5845, + "step": 707 + }, + { + "epoch": 0.10023359524315141, + "grad_norm": 10.475920541596698, + "learning_rate": 
4.994701840176144e-06, + "loss": 1.6864, + "step": 708 + }, + { + "epoch": 0.10037516811778863, + "grad_norm": 13.86117497295447, + "learning_rate": 4.994664476803714e-06, + "loss": 1.5763, + "step": 709 + }, + { + "epoch": 0.10051674099242586, + "grad_norm": 9.628325049016143, + "learning_rate": 4.9946269822886335e-06, + "loss": 1.6373, + "step": 710 + }, + { + "epoch": 0.10065831386706307, + "grad_norm": 9.945693375095866, + "learning_rate": 4.994589356632872e-06, + "loss": 1.6476, + "step": 711 + }, + { + "epoch": 0.10079988674170029, + "grad_norm": 9.312378748161972, + "learning_rate": 4.994551599838408e-06, + "loss": 1.5423, + "step": 712 + }, + { + "epoch": 0.1009414596163375, + "grad_norm": 11.47901860099513, + "learning_rate": 4.994513711907227e-06, + "loss": 1.5144, + "step": 713 + }, + { + "epoch": 0.10108303249097472, + "grad_norm": 8.8315855012461, + "learning_rate": 4.994475692841319e-06, + "loss": 1.4727, + "step": 714 + }, + { + "epoch": 0.10122460536561195, + "grad_norm": 10.96861019208443, + "learning_rate": 4.9944375426426846e-06, + "loss": 1.6411, + "step": 715 + }, + { + "epoch": 0.10136617824024917, + "grad_norm": 10.498060501444645, + "learning_rate": 4.994399261313329e-06, + "loss": 1.3594, + "step": 716 + }, + { + "epoch": 0.10150775111488639, + "grad_norm": 13.073329980808614, + "learning_rate": 4.994360848855264e-06, + "loss": 1.41, + "step": 717 + }, + { + "epoch": 0.1016493239895236, + "grad_norm": 10.602114095934954, + "learning_rate": 4.994322305270508e-06, + "loss": 1.5643, + "step": 718 + }, + { + "epoch": 0.10179089686416083, + "grad_norm": 11.847373549009317, + "learning_rate": 4.994283630561089e-06, + "loss": 1.7558, + "step": 719 + }, + { + "epoch": 0.10193246973879805, + "grad_norm": 11.042725556660663, + "learning_rate": 4.994244824729039e-06, + "loss": 1.5328, + "step": 720 + }, + { + "epoch": 0.10207404261343526, + "grad_norm": 12.460832189993873, + "learning_rate": 4.994205887776399e-06, + "loss": 1.666, + "step": 721 + }, + { + "epoch": 0.10221561548807248, + "grad_norm": 7.086175714797533, + "learning_rate": 4.9941668197052155e-06, + "loss": 1.4206, + "step": 722 + }, + { + "epoch": 0.10235718836270971, + "grad_norm": 9.902336815825974, + "learning_rate": 4.9941276205175405e-06, + "loss": 1.5832, + "step": 723 + }, + { + "epoch": 0.10249876123734693, + "grad_norm": 15.084604845494844, + "learning_rate": 4.994088290215438e-06, + "loss": 1.6585, + "step": 724 + }, + { + "epoch": 0.10264033411198414, + "grad_norm": 9.829517550887385, + "learning_rate": 4.994048828800972e-06, + "loss": 1.445, + "step": 725 + }, + { + "epoch": 0.10278190698662136, + "grad_norm": 12.583692600266737, + "learning_rate": 4.994009236276219e-06, + "loss": 1.6067, + "step": 726 + }, + { + "epoch": 0.10292347986125858, + "grad_norm": 9.39235291793068, + "learning_rate": 4.993969512643261e-06, + "loss": 1.5841, + "step": 727 + }, + { + "epoch": 0.1030650527358958, + "grad_norm": 10.776265902404061, + "learning_rate": 4.993929657904185e-06, + "loss": 1.6634, + "step": 728 + }, + { + "epoch": 0.10320662561053302, + "grad_norm": 9.663516572721834, + "learning_rate": 4.993889672061087e-06, + "loss": 1.4487, + "step": 729 + }, + { + "epoch": 0.10334819848517024, + "grad_norm": 12.028268578441255, + "learning_rate": 4.993849555116067e-06, + "loss": 1.4712, + "step": 730 + }, + { + "epoch": 0.10348977135980746, + "grad_norm": 9.400223635996023, + "learning_rate": 4.993809307071236e-06, + "loss": 1.3275, + "step": 731 + }, + { + "epoch": 0.10363134423444469, + "grad_norm": 
10.443834404230483, + "learning_rate": 4.99376892792871e-06, + "loss": 1.3998, + "step": 732 + }, + { + "epoch": 0.1037729171090819, + "grad_norm": 11.198554177286507, + "learning_rate": 4.99372841769061e-06, + "loss": 1.4169, + "step": 733 + }, + { + "epoch": 0.10391448998371912, + "grad_norm": 9.572683857994956, + "learning_rate": 4.9936877763590664e-06, + "loss": 1.5673, + "step": 734 + }, + { + "epoch": 0.10405606285835634, + "grad_norm": 11.28651132235244, + "learning_rate": 4.9936470039362165e-06, + "loss": 1.5267, + "step": 735 + }, + { + "epoch": 0.10419763573299355, + "grad_norm": 12.49624333198219, + "learning_rate": 4.993606100424202e-06, + "loss": 1.6075, + "step": 736 + }, + { + "epoch": 0.10433920860763078, + "grad_norm": 12.185571136426, + "learning_rate": 4.993565065825175e-06, + "loss": 1.4884, + "step": 737 + }, + { + "epoch": 0.104480781482268, + "grad_norm": 12.593044819742017, + "learning_rate": 4.9935239001412915e-06, + "loss": 1.3621, + "step": 738 + }, + { + "epoch": 0.10462235435690521, + "grad_norm": 10.88173633520613, + "learning_rate": 4.993482603374715e-06, + "loss": 1.437, + "step": 739 + }, + { + "epoch": 0.10476392723154243, + "grad_norm": 12.702063109060768, + "learning_rate": 4.993441175527619e-06, + "loss": 1.5878, + "step": 740 + }, + { + "epoch": 0.10490550010617966, + "grad_norm": 9.250486618851296, + "learning_rate": 4.993399616602178e-06, + "loss": 1.5772, + "step": 741 + }, + { + "epoch": 0.10504707298081688, + "grad_norm": 13.010314957297764, + "learning_rate": 4.99335792660058e-06, + "loss": 1.625, + "step": 742 + }, + { + "epoch": 0.1051886458554541, + "grad_norm": 11.67811052271982, + "learning_rate": 4.993316105525013e-06, + "loss": 1.4829, + "step": 743 + }, + { + "epoch": 0.10533021873009131, + "grad_norm": 10.065743438961869, + "learning_rate": 4.993274153377678e-06, + "loss": 1.589, + "step": 744 + }, + { + "epoch": 0.10547179160472854, + "grad_norm": 10.570939704461074, + "learning_rate": 4.993232070160781e-06, + "loss": 1.6034, + "step": 745 + }, + { + "epoch": 0.10561336447936576, + "grad_norm": 10.904233729722684, + "learning_rate": 4.993189855876531e-06, + "loss": 1.4638, + "step": 746 + }, + { + "epoch": 0.10575493735400297, + "grad_norm": 12.525173017026365, + "learning_rate": 4.993147510527151e-06, + "loss": 1.57, + "step": 747 + }, + { + "epoch": 0.10589651022864019, + "grad_norm": 12.329211671847425, + "learning_rate": 4.993105034114864e-06, + "loss": 1.5369, + "step": 748 + }, + { + "epoch": 0.1060380831032774, + "grad_norm": 11.219209709418317, + "learning_rate": 4.993062426641906e-06, + "loss": 1.4416, + "step": 749 + }, + { + "epoch": 0.10617965597791464, + "grad_norm": 10.033176700556501, + "learning_rate": 4.993019688110514e-06, + "loss": 1.4368, + "step": 750 + }, + { + "epoch": 0.10632122885255185, + "grad_norm": 10.200623093067227, + "learning_rate": 4.992976818522936e-06, + "loss": 1.3937, + "step": 751 + }, + { + "epoch": 0.10646280172718907, + "grad_norm": 12.069340955034255, + "learning_rate": 4.992933817881426e-06, + "loss": 1.477, + "step": 752 + }, + { + "epoch": 0.10660437460182628, + "grad_norm": 10.264897383127257, + "learning_rate": 4.992890686188243e-06, + "loss": 1.4088, + "step": 753 + }, + { + "epoch": 0.10674594747646352, + "grad_norm": 10.24734872163302, + "learning_rate": 4.992847423445657e-06, + "loss": 1.3864, + "step": 754 + }, + { + "epoch": 0.10688752035110073, + "grad_norm": 11.786857424036752, + "learning_rate": 4.992804029655939e-06, + "loss": 1.3912, + "step": 755 + }, + { + "epoch": 
0.10702909322573795, + "grad_norm": 12.549588528761051, + "learning_rate": 4.992760504821373e-06, + "loss": 1.484, + "step": 756 + }, + { + "epoch": 0.10717066610037516, + "grad_norm": 10.94714557946, + "learning_rate": 4.992716848944245e-06, + "loss": 1.4681, + "step": 757 + }, + { + "epoch": 0.1073122389750124, + "grad_norm": 8.658335961153906, + "learning_rate": 4.992673062026851e-06, + "loss": 1.4373, + "step": 758 + }, + { + "epoch": 0.10745381184964961, + "grad_norm": 12.69355276265401, + "learning_rate": 4.992629144071494e-06, + "loss": 1.4432, + "step": 759 + }, + { + "epoch": 0.10759538472428683, + "grad_norm": 11.701831537333161, + "learning_rate": 4.99258509508048e-06, + "loss": 1.4592, + "step": 760 + }, + { + "epoch": 0.10773695759892404, + "grad_norm": 13.35638501353876, + "learning_rate": 4.9925409150561264e-06, + "loss": 1.3696, + "step": 761 + }, + { + "epoch": 0.10787853047356126, + "grad_norm": 9.58063099672512, + "learning_rate": 4.992496604000756e-06, + "loss": 1.6092, + "step": 762 + }, + { + "epoch": 0.10802010334819849, + "grad_norm": 16.54592515274248, + "learning_rate": 4.992452161916698e-06, + "loss": 1.5258, + "step": 763 + }, + { + "epoch": 0.1081616762228357, + "grad_norm": 12.742116847213687, + "learning_rate": 4.992407588806287e-06, + "loss": 1.5443, + "step": 764 + }, + { + "epoch": 0.10830324909747292, + "grad_norm": 12.395040910967996, + "learning_rate": 4.9923628846718685e-06, + "loss": 1.4597, + "step": 765 + }, + { + "epoch": 0.10844482197211014, + "grad_norm": 10.327990924928727, + "learning_rate": 4.992318049515791e-06, + "loss": 1.433, + "step": 766 + }, + { + "epoch": 0.10858639484674737, + "grad_norm": 15.89998072976918, + "learning_rate": 4.992273083340412e-06, + "loss": 1.5208, + "step": 767 + }, + { + "epoch": 0.10872796772138459, + "grad_norm": 14.913173898848786, + "learning_rate": 4.992227986148096e-06, + "loss": 1.3765, + "step": 768 + }, + { + "epoch": 0.1088695405960218, + "grad_norm": 8.909830437790372, + "learning_rate": 4.992182757941212e-06, + "loss": 1.3759, + "step": 769 + }, + { + "epoch": 0.10901111347065902, + "grad_norm": 10.51755980877003, + "learning_rate": 4.992137398722139e-06, + "loss": 1.3914, + "step": 770 + }, + { + "epoch": 0.10915268634529623, + "grad_norm": 17.414420875910842, + "learning_rate": 4.992091908493262e-06, + "loss": 1.5107, + "step": 771 + }, + { + "epoch": 0.10929425921993347, + "grad_norm": 10.713914853898272, + "learning_rate": 4.992046287256971e-06, + "loss": 1.419, + "step": 772 + }, + { + "epoch": 0.10943583209457068, + "grad_norm": 11.411827751156705, + "learning_rate": 4.992000535015664e-06, + "loss": 1.4031, + "step": 773 + }, + { + "epoch": 0.1095774049692079, + "grad_norm": 10.160995736175956, + "learning_rate": 4.991954651771748e-06, + "loss": 1.4584, + "step": 774 + }, + { + "epoch": 0.10971897784384511, + "grad_norm": 15.79323782775351, + "learning_rate": 4.991908637527634e-06, + "loss": 1.5055, + "step": 775 + }, + { + "epoch": 0.10986055071848234, + "grad_norm": 10.146282293163274, + "learning_rate": 4.991862492285741e-06, + "loss": 1.5186, + "step": 776 + }, + { + "epoch": 0.11000212359311956, + "grad_norm": 9.54971795589322, + "learning_rate": 4.991816216048494e-06, + "loss": 1.3946, + "step": 777 + }, + { + "epoch": 0.11014369646775678, + "grad_norm": 12.08818773997633, + "learning_rate": 4.991769808818328e-06, + "loss": 1.4581, + "step": 778 + }, + { + "epoch": 0.110285269342394, + "grad_norm": 9.196663574940889, + "learning_rate": 4.991723270597679e-06, + "loss": 1.2776, + "step": 779 
+ }, + { + "epoch": 0.11042684221703122, + "grad_norm": 10.64143912944135, + "learning_rate": 4.9916766013889975e-06, + "loss": 1.4212, + "step": 780 + }, + { + "epoch": 0.11056841509166844, + "grad_norm": 9.471418250287282, + "learning_rate": 4.991629801194734e-06, + "loss": 1.486, + "step": 781 + }, + { + "epoch": 0.11070998796630566, + "grad_norm": 10.307093827384318, + "learning_rate": 4.9915828700173495e-06, + "loss": 1.527, + "step": 782 + }, + { + "epoch": 0.11085156084094287, + "grad_norm": 19.750176151310967, + "learning_rate": 4.991535807859312e-06, + "loss": 1.346, + "step": 783 + }, + { + "epoch": 0.11099313371558009, + "grad_norm": 11.494394014040061, + "learning_rate": 4.991488614723094e-06, + "loss": 1.5973, + "step": 784 + }, + { + "epoch": 0.11113470659021732, + "grad_norm": 15.014741393259003, + "learning_rate": 4.991441290611177e-06, + "loss": 1.597, + "step": 785 + }, + { + "epoch": 0.11127627946485454, + "grad_norm": 10.178794464497663, + "learning_rate": 4.991393835526051e-06, + "loss": 1.5616, + "step": 786 + }, + { + "epoch": 0.11141785233949175, + "grad_norm": 11.773146536509142, + "learning_rate": 4.991346249470207e-06, + "loss": 1.4924, + "step": 787 + }, + { + "epoch": 0.11155942521412897, + "grad_norm": 10.935837275959694, + "learning_rate": 4.991298532446149e-06, + "loss": 1.3476, + "step": 788 + }, + { + "epoch": 0.1117009980887662, + "grad_norm": 9.815156807795844, + "learning_rate": 4.991250684456385e-06, + "loss": 1.2554, + "step": 789 + }, + { + "epoch": 0.11184257096340341, + "grad_norm": 11.011685839898497, + "learning_rate": 4.9912027055034295e-06, + "loss": 1.5356, + "step": 790 + }, + { + "epoch": 0.11198414383804063, + "grad_norm": 9.50859203293161, + "learning_rate": 4.9911545955898055e-06, + "loss": 1.4638, + "step": 791 + }, + { + "epoch": 0.11212571671267785, + "grad_norm": 12.33184936118635, + "learning_rate": 4.991106354718042e-06, + "loss": 1.4693, + "step": 792 + }, + { + "epoch": 0.11226728958731506, + "grad_norm": 10.995423838940871, + "learning_rate": 4.991057982890674e-06, + "loss": 1.4925, + "step": 793 + }, + { + "epoch": 0.1124088624619523, + "grad_norm": 9.053989386779618, + "learning_rate": 4.991009480110246e-06, + "loss": 1.5498, + "step": 794 + }, + { + "epoch": 0.11255043533658951, + "grad_norm": 10.427955512206157, + "learning_rate": 4.990960846379307e-06, + "loss": 1.482, + "step": 795 + }, + { + "epoch": 0.11269200821122673, + "grad_norm": 9.968764158615533, + "learning_rate": 4.990912081700413e-06, + "loss": 1.4278, + "step": 796 + }, + { + "epoch": 0.11283358108586394, + "grad_norm": 10.74128689406558, + "learning_rate": 4.990863186076129e-06, + "loss": 1.3529, + "step": 797 + }, + { + "epoch": 0.11297515396050117, + "grad_norm": 10.610967367728726, + "learning_rate": 4.990814159509025e-06, + "loss": 1.6191, + "step": 798 + }, + { + "epoch": 0.11311672683513839, + "grad_norm": 9.35216905862347, + "learning_rate": 4.990765002001677e-06, + "loss": 1.4112, + "step": 799 + }, + { + "epoch": 0.1132582997097756, + "grad_norm": 9.862259383946967, + "learning_rate": 4.99071571355667e-06, + "loss": 1.6273, + "step": 800 + }, + { + "epoch": 0.11339987258441282, + "grad_norm": 12.344843948756122, + "learning_rate": 4.990666294176596e-06, + "loss": 1.5961, + "step": 801 + }, + { + "epoch": 0.11354144545905005, + "grad_norm": 10.674613456070595, + "learning_rate": 4.990616743864051e-06, + "loss": 1.4296, + "step": 802 + }, + { + "epoch": 0.11368301833368727, + "grad_norm": 11.333649649600204, + "learning_rate": 4.99056706262164e-06, + 
"loss": 1.4046, + "step": 803 + }, + { + "epoch": 0.11382459120832449, + "grad_norm": 15.383584703986472, + "learning_rate": 4.990517250451978e-06, + "loss": 1.4302, + "step": 804 + }, + { + "epoch": 0.1139661640829617, + "grad_norm": 13.52525792466491, + "learning_rate": 4.99046730735768e-06, + "loss": 1.4589, + "step": 805 + }, + { + "epoch": 0.11410773695759892, + "grad_norm": 10.307005372134572, + "learning_rate": 4.990417233341373e-06, + "loss": 1.4986, + "step": 806 + }, + { + "epoch": 0.11424930983223615, + "grad_norm": 11.592885275619347, + "learning_rate": 4.990367028405688e-06, + "loss": 1.3795, + "step": 807 + }, + { + "epoch": 0.11439088270687336, + "grad_norm": 15.118048465469903, + "learning_rate": 4.990316692553265e-06, + "loss": 1.5222, + "step": 808 + }, + { + "epoch": 0.11453245558151058, + "grad_norm": 11.349931866277464, + "learning_rate": 4.990266225786751e-06, + "loss": 1.3799, + "step": 809 + }, + { + "epoch": 0.1146740284561478, + "grad_norm": 10.834248274289514, + "learning_rate": 4.9902156281087985e-06, + "loss": 1.5128, + "step": 810 + }, + { + "epoch": 0.11481560133078503, + "grad_norm": 13.086762450676243, + "learning_rate": 4.990164899522068e-06, + "loss": 1.3853, + "step": 811 + }, + { + "epoch": 0.11495717420542224, + "grad_norm": 10.476260278312358, + "learning_rate": 4.990114040029224e-06, + "loss": 1.4376, + "step": 812 + }, + { + "epoch": 0.11509874708005946, + "grad_norm": 12.393281014591762, + "learning_rate": 4.990063049632943e-06, + "loss": 1.5966, + "step": 813 + }, + { + "epoch": 0.11524031995469668, + "grad_norm": 10.488510748769379, + "learning_rate": 4.9900119283359025e-06, + "loss": 1.5068, + "step": 814 + }, + { + "epoch": 0.11538189282933389, + "grad_norm": 10.36782000178222, + "learning_rate": 4.989960676140793e-06, + "loss": 1.4215, + "step": 815 + }, + { + "epoch": 0.11552346570397112, + "grad_norm": 10.0770037833189, + "learning_rate": 4.989909293050307e-06, + "loss": 1.4633, + "step": 816 + }, + { + "epoch": 0.11566503857860834, + "grad_norm": 10.392385233304351, + "learning_rate": 4.989857779067146e-06, + "loss": 1.353, + "step": 817 + }, + { + "epoch": 0.11580661145324556, + "grad_norm": 10.705423951892149, + "learning_rate": 4.989806134194018e-06, + "loss": 1.4568, + "step": 818 + }, + { + "epoch": 0.11594818432788277, + "grad_norm": 12.003048827374808, + "learning_rate": 4.9897543584336376e-06, + "loss": 1.4292, + "step": 819 + }, + { + "epoch": 0.11608975720252, + "grad_norm": 12.192993803836309, + "learning_rate": 4.989702451788727e-06, + "loss": 1.5632, + "step": 820 + }, + { + "epoch": 0.11623133007715722, + "grad_norm": 9.816705859310579, + "learning_rate": 4.989650414262015e-06, + "loss": 1.3982, + "step": 821 + }, + { + "epoch": 0.11637290295179444, + "grad_norm": 10.165301330273191, + "learning_rate": 4.989598245856238e-06, + "loss": 1.348, + "step": 822 + }, + { + "epoch": 0.11651447582643165, + "grad_norm": 11.256418389348127, + "learning_rate": 4.989545946574136e-06, + "loss": 1.4242, + "step": 823 + }, + { + "epoch": 0.11665604870106888, + "grad_norm": 12.114404827365192, + "learning_rate": 4.989493516418461e-06, + "loss": 1.5775, + "step": 824 + }, + { + "epoch": 0.1167976215757061, + "grad_norm": 10.59761017544352, + "learning_rate": 4.9894409553919675e-06, + "loss": 1.6157, + "step": 825 + }, + { + "epoch": 0.11693919445034331, + "grad_norm": 11.471377370625415, + "learning_rate": 4.98938826349742e-06, + "loss": 1.4617, + "step": 826 + }, + { + "epoch": 0.11708076732498053, + "grad_norm": 11.59241340109617, + 
"learning_rate": 4.989335440737587e-06, + "loss": 1.4185, + "step": 827 + }, + { + "epoch": 0.11722234019961775, + "grad_norm": 9.193007706188174, + "learning_rate": 4.989282487115246e-06, + "loss": 1.5507, + "step": 828 + }, + { + "epoch": 0.11736391307425498, + "grad_norm": 13.295391712766436, + "learning_rate": 4.98922940263318e-06, + "loss": 1.6229, + "step": 829 + }, + { + "epoch": 0.1175054859488922, + "grad_norm": 10.240517300413984, + "learning_rate": 4.989176187294182e-06, + "loss": 1.4442, + "step": 830 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 9.963497296699972, + "learning_rate": 4.989122841101047e-06, + "loss": 1.5861, + "step": 831 + }, + { + "epoch": 0.11778863169816663, + "grad_norm": 10.78252334815084, + "learning_rate": 4.98906936405658e-06, + "loss": 1.5788, + "step": 832 + }, + { + "epoch": 0.11793020457280386, + "grad_norm": 9.906121382495096, + "learning_rate": 4.989015756163593e-06, + "loss": 1.54, + "step": 833 + }, + { + "epoch": 0.11807177744744107, + "grad_norm": 13.692338480210042, + "learning_rate": 4.988962017424903e-06, + "loss": 1.4181, + "step": 834 + }, + { + "epoch": 0.11821335032207829, + "grad_norm": 11.349746338282603, + "learning_rate": 4.988908147843336e-06, + "loss": 1.5935, + "step": 835 + }, + { + "epoch": 0.1183549231967155, + "grad_norm": 10.335050214655617, + "learning_rate": 4.988854147421724e-06, + "loss": 1.5457, + "step": 836 + }, + { + "epoch": 0.11849649607135272, + "grad_norm": 12.173157375884323, + "learning_rate": 4.988800016162904e-06, + "loss": 1.461, + "step": 837 + }, + { + "epoch": 0.11863806894598995, + "grad_norm": 10.167308057554223, + "learning_rate": 4.9887457540697235e-06, + "loss": 1.5739, + "step": 838 + }, + { + "epoch": 0.11877964182062717, + "grad_norm": 9.823647551113389, + "learning_rate": 4.988691361145035e-06, + "loss": 1.384, + "step": 839 + }, + { + "epoch": 0.11892121469526439, + "grad_norm": 9.498573196076897, + "learning_rate": 4.988636837391696e-06, + "loss": 1.4092, + "step": 840 + }, + { + "epoch": 0.1190627875699016, + "grad_norm": 11.079607994774504, + "learning_rate": 4.988582182812575e-06, + "loss": 1.5839, + "step": 841 + }, + { + "epoch": 0.11920436044453883, + "grad_norm": 11.704907188650488, + "learning_rate": 4.988527397410544e-06, + "loss": 1.4571, + "step": 842 + }, + { + "epoch": 0.11934593331917605, + "grad_norm": 10.202082956096094, + "learning_rate": 4.988472481188484e-06, + "loss": 1.5641, + "step": 843 + }, + { + "epoch": 0.11948750619381326, + "grad_norm": 10.106707213562862, + "learning_rate": 4.988417434149279e-06, + "loss": 1.4081, + "step": 844 + }, + { + "epoch": 0.11962907906845048, + "grad_norm": 8.744827921740907, + "learning_rate": 4.988362256295827e-06, + "loss": 1.5319, + "step": 845 + }, + { + "epoch": 0.11977065194308771, + "grad_norm": 10.992488030051264, + "learning_rate": 4.988306947631025e-06, + "loss": 1.4399, + "step": 846 + }, + { + "epoch": 0.11991222481772493, + "grad_norm": 11.09574661555242, + "learning_rate": 4.988251508157784e-06, + "loss": 1.4727, + "step": 847 + }, + { + "epoch": 0.12005379769236214, + "grad_norm": 13.265182146244614, + "learning_rate": 4.988195937879015e-06, + "loss": 1.4219, + "step": 848 + }, + { + "epoch": 0.12019537056699936, + "grad_norm": 13.773681223027987, + "learning_rate": 4.988140236797642e-06, + "loss": 1.5204, + "step": 849 + }, + { + "epoch": 0.12033694344163658, + "grad_norm": 9.276748747367115, + "learning_rate": 4.988084404916591e-06, + "loss": 1.4569, + "step": 850 + }, + { + "epoch": 0.1204785163162738, + 
"grad_norm": 12.315193307701454, + "learning_rate": 4.988028442238798e-06, + "loss": 1.4235, + "step": 851 + }, + { + "epoch": 0.12062008919091102, + "grad_norm": 10.236480356564556, + "learning_rate": 4.987972348767206e-06, + "loss": 1.3441, + "step": 852 + }, + { + "epoch": 0.12076166206554824, + "grad_norm": 12.074289519769955, + "learning_rate": 4.987916124504761e-06, + "loss": 1.4432, + "step": 853 + }, + { + "epoch": 0.12090323494018546, + "grad_norm": 10.731491478950769, + "learning_rate": 4.9878597694544215e-06, + "loss": 1.5405, + "step": 854 + }, + { + "epoch": 0.12104480781482269, + "grad_norm": 10.38792735481934, + "learning_rate": 4.987803283619149e-06, + "loss": 1.4894, + "step": 855 + }, + { + "epoch": 0.1211863806894599, + "grad_norm": 13.297343764688083, + "learning_rate": 4.987746667001913e-06, + "loss": 1.4352, + "step": 856 + }, + { + "epoch": 0.12132795356409712, + "grad_norm": 11.564159377338768, + "learning_rate": 4.98768991960569e-06, + "loss": 1.4035, + "step": 857 + }, + { + "epoch": 0.12146952643873433, + "grad_norm": 10.788319268616478, + "learning_rate": 4.987633041433462e-06, + "loss": 1.5716, + "step": 858 + }, + { + "epoch": 0.12161109931337155, + "grad_norm": 10.078419099996575, + "learning_rate": 4.98757603248822e-06, + "loss": 1.5625, + "step": 859 + }, + { + "epoch": 0.12175267218800878, + "grad_norm": 11.779628328376388, + "learning_rate": 4.987518892772961e-06, + "loss": 1.4673, + "step": 860 + }, + { + "epoch": 0.121894245062646, + "grad_norm": 10.348308377077378, + "learning_rate": 4.987461622290688e-06, + "loss": 1.3049, + "step": 861 + }, + { + "epoch": 0.12203581793728321, + "grad_norm": 10.941873079451087, + "learning_rate": 4.987404221044413e-06, + "loss": 1.283, + "step": 862 + }, + { + "epoch": 0.12217739081192043, + "grad_norm": 10.38829457242791, + "learning_rate": 4.9873466890371525e-06, + "loss": 1.5517, + "step": 863 + }, + { + "epoch": 0.12231896368655766, + "grad_norm": 10.487017643853756, + "learning_rate": 4.987289026271931e-06, + "loss": 1.3882, + "step": 864 + }, + { + "epoch": 0.12246053656119488, + "grad_norm": 10.47360617894341, + "learning_rate": 4.98723123275178e-06, + "loss": 1.3254, + "step": 865 + }, + { + "epoch": 0.1226021094358321, + "grad_norm": 9.719686873929946, + "learning_rate": 4.987173308479738e-06, + "loss": 1.442, + "step": 866 + }, + { + "epoch": 0.12274368231046931, + "grad_norm": 10.340243664347854, + "learning_rate": 4.98711525345885e-06, + "loss": 1.4562, + "step": 867 + }, + { + "epoch": 0.12288525518510654, + "grad_norm": 12.508585308535032, + "learning_rate": 4.987057067692167e-06, + "loss": 1.4443, + "step": 868 + }, + { + "epoch": 0.12302682805974376, + "grad_norm": 10.737156492340773, + "learning_rate": 4.986998751182748e-06, + "loss": 1.4647, + "step": 869 + }, + { + "epoch": 0.12316840093438097, + "grad_norm": 8.198778145170808, + "learning_rate": 4.98694030393366e-06, + "loss": 1.4141, + "step": 870 + }, + { + "epoch": 0.12330997380901819, + "grad_norm": 10.764822388781512, + "learning_rate": 4.986881725947974e-06, + "loss": 1.5512, + "step": 871 + }, + { + "epoch": 0.1234515466836554, + "grad_norm": 10.528233670641663, + "learning_rate": 4.98682301722877e-06, + "loss": 1.4611, + "step": 872 + }, + { + "epoch": 0.12359311955829264, + "grad_norm": 12.60769101305485, + "learning_rate": 4.986764177779134e-06, + "loss": 1.5082, + "step": 873 + }, + { + "epoch": 0.12373469243292985, + "grad_norm": 12.16832801937511, + "learning_rate": 4.986705207602161e-06, + "loss": 1.5998, + "step": 874 + }, + { + 
"epoch": 0.12387626530756707, + "grad_norm": 9.704389652514578, + "learning_rate": 4.986646106700948e-06, + "loss": 1.4805, + "step": 875 + }, + { + "epoch": 0.12401783818220428, + "grad_norm": 9.208338764816224, + "learning_rate": 4.986586875078603e-06, + "loss": 1.5827, + "step": 876 + }, + { + "epoch": 0.12415941105684151, + "grad_norm": 12.778534915624864, + "learning_rate": 4.98652751273824e-06, + "loss": 1.4114, + "step": 877 + }, + { + "epoch": 0.12430098393147873, + "grad_norm": 12.000310893799858, + "learning_rate": 4.986468019682981e-06, + "loss": 1.2916, + "step": 878 + }, + { + "epoch": 0.12444255680611595, + "grad_norm": 11.080675958609522, + "learning_rate": 4.98640839591595e-06, + "loss": 1.4159, + "step": 879 + }, + { + "epoch": 0.12458412968075316, + "grad_norm": 7.961734570852571, + "learning_rate": 4.986348641440286e-06, + "loss": 1.3765, + "step": 880 + }, + { + "epoch": 0.12472570255539038, + "grad_norm": 10.40012855450332, + "learning_rate": 4.986288756259126e-06, + "loss": 1.4546, + "step": 881 + }, + { + "epoch": 0.12486727543002761, + "grad_norm": 11.743833181625646, + "learning_rate": 4.986228740375621e-06, + "loss": 1.4184, + "step": 882 + }, + { + "epoch": 0.12500884830466483, + "grad_norm": 10.347958541262122, + "learning_rate": 4.986168593792924e-06, + "loss": 1.4743, + "step": 883 + }, + { + "epoch": 0.12515042117930206, + "grad_norm": 10.223492469738929, + "learning_rate": 4.986108316514199e-06, + "loss": 1.4581, + "step": 884 + }, + { + "epoch": 0.12529199405393926, + "grad_norm": 10.089092112925025, + "learning_rate": 4.986047908542613e-06, + "loss": 1.5233, + "step": 885 + }, + { + "epoch": 0.1254335669285765, + "grad_norm": 14.159141666238437, + "learning_rate": 4.9859873698813425e-06, + "loss": 1.428, + "step": 886 + }, + { + "epoch": 0.1255751398032137, + "grad_norm": 12.766450927611377, + "learning_rate": 4.985926700533569e-06, + "loss": 1.4086, + "step": 887 + }, + { + "epoch": 0.12571671267785092, + "grad_norm": 10.652181167396979, + "learning_rate": 4.985865900502482e-06, + "loss": 1.3771, + "step": 888 + }, + { + "epoch": 0.12585828555248815, + "grad_norm": 11.66516786666278, + "learning_rate": 4.985804969791278e-06, + "loss": 1.401, + "step": 889 + }, + { + "epoch": 0.12599985842712536, + "grad_norm": 8.81737194257069, + "learning_rate": 4.9857439084031614e-06, + "loss": 1.3604, + "step": 890 + }, + { + "epoch": 0.12614143130176259, + "grad_norm": 8.947010875207106, + "learning_rate": 4.985682716341341e-06, + "loss": 1.3473, + "step": 891 + }, + { + "epoch": 0.1262830041763998, + "grad_norm": 11.084417495667587, + "learning_rate": 4.985621393609032e-06, + "loss": 1.5848, + "step": 892 + }, + { + "epoch": 0.12642457705103702, + "grad_norm": 10.582898909496079, + "learning_rate": 4.985559940209462e-06, + "loss": 1.3582, + "step": 893 + }, + { + "epoch": 0.12656614992567425, + "grad_norm": 10.705546886117466, + "learning_rate": 4.985498356145858e-06, + "loss": 1.3085, + "step": 894 + }, + { + "epoch": 0.12670772280031145, + "grad_norm": 9.78502915960381, + "learning_rate": 4.985436641421458e-06, + "loss": 1.3261, + "step": 895 + }, + { + "epoch": 0.12684929567494868, + "grad_norm": 11.02536588654222, + "learning_rate": 4.985374796039508e-06, + "loss": 1.4308, + "step": 896 + }, + { + "epoch": 0.1269908685495859, + "grad_norm": 10.518302950592801, + "learning_rate": 4.985312820003258e-06, + "loss": 1.4952, + "step": 897 + }, + { + "epoch": 0.1271324414242231, + "grad_norm": 9.649568190326738, + "learning_rate": 4.985250713315966e-06, + "loss": 
1.3738, + "step": 898 + }, + { + "epoch": 0.12727401429886034, + "grad_norm": 11.193004085324974, + "learning_rate": 4.985188475980898e-06, + "loss": 1.4633, + "step": 899 + }, + { + "epoch": 0.12741558717349755, + "grad_norm": 10.687334382854196, + "learning_rate": 4.985126108001323e-06, + "loss": 1.433, + "step": 900 + }, + { + "epoch": 0.12755716004813478, + "grad_norm": 9.90608171867615, + "learning_rate": 4.985063609380522e-06, + "loss": 1.4008, + "step": 901 + }, + { + "epoch": 0.127698732922772, + "grad_norm": 9.797498375925842, + "learning_rate": 4.985000980121782e-06, + "loss": 1.3425, + "step": 902 + }, + { + "epoch": 0.1278403057974092, + "grad_norm": 9.58152842456169, + "learning_rate": 4.984938220228391e-06, + "loss": 1.4432, + "step": 903 + }, + { + "epoch": 0.12798187867204644, + "grad_norm": 10.445653232218362, + "learning_rate": 4.9848753297036515e-06, + "loss": 1.2645, + "step": 904 + }, + { + "epoch": 0.12812345154668364, + "grad_norm": 10.15854804229604, + "learning_rate": 4.984812308550869e-06, + "loss": 1.3024, + "step": 905 + }, + { + "epoch": 0.12826502442132087, + "grad_norm": 7.922158621802766, + "learning_rate": 4.984749156773355e-06, + "loss": 1.3801, + "step": 906 + }, + { + "epoch": 0.1284065972959581, + "grad_norm": 10.465745181523857, + "learning_rate": 4.984685874374432e-06, + "loss": 1.3546, + "step": 907 + }, + { + "epoch": 0.1285481701705953, + "grad_norm": 11.597037206659994, + "learning_rate": 4.984622461357425e-06, + "loss": 1.5502, + "step": 908 + }, + { + "epoch": 0.12868974304523254, + "grad_norm": 11.275372863152908, + "learning_rate": 4.984558917725667e-06, + "loss": 1.3862, + "step": 909 + }, + { + "epoch": 0.12883131591986977, + "grad_norm": 10.701044434528159, + "learning_rate": 4.9844952434825e-06, + "loss": 1.3987, + "step": 910 + }, + { + "epoch": 0.12897288879450697, + "grad_norm": 9.506166966142747, + "learning_rate": 4.98443143863127e-06, + "loss": 1.4511, + "step": 911 + }, + { + "epoch": 0.1291144616691442, + "grad_norm": 10.842656940699149, + "learning_rate": 4.984367503175332e-06, + "loss": 1.5191, + "step": 912 + }, + { + "epoch": 0.1292560345437814, + "grad_norm": 10.155933739066274, + "learning_rate": 4.984303437118047e-06, + "loss": 1.4432, + "step": 913 + }, + { + "epoch": 0.12939760741841863, + "grad_norm": 11.086667661164087, + "learning_rate": 4.984239240462783e-06, + "loss": 1.5723, + "step": 914 + }, + { + "epoch": 0.12953918029305586, + "grad_norm": 11.679889342946558, + "learning_rate": 4.984174913212913e-06, + "loss": 1.4797, + "step": 915 + }, + { + "epoch": 0.12968075316769306, + "grad_norm": 10.454857196607941, + "learning_rate": 4.984110455371822e-06, + "loss": 1.5402, + "step": 916 + }, + { + "epoch": 0.1298223260423303, + "grad_norm": 10.358202837743237, + "learning_rate": 4.984045866942895e-06, + "loss": 1.5083, + "step": 917 + }, + { + "epoch": 0.1299638989169675, + "grad_norm": 7.661321711840226, + "learning_rate": 4.98398114792953e-06, + "loss": 1.2321, + "step": 918 + }, + { + "epoch": 0.13010547179160473, + "grad_norm": 10.27901835701197, + "learning_rate": 4.983916298335127e-06, + "loss": 1.397, + "step": 919 + }, + { + "epoch": 0.13024704466624196, + "grad_norm": 11.040304385425774, + "learning_rate": 4.9838513181630975e-06, + "loss": 1.3515, + "step": 920 + }, + { + "epoch": 0.13038861754087916, + "grad_norm": 12.075057850574735, + "learning_rate": 4.983786207416856e-06, + "loss": 1.4311, + "step": 921 + }, + { + "epoch": 0.1305301904155164, + "grad_norm": 9.13195159744221, + "learning_rate": 
4.983720966099826e-06, + "loss": 1.3987, + "step": 922 + }, + { + "epoch": 0.13067176329015362, + "grad_norm": 10.793177986118781, + "learning_rate": 4.983655594215436e-06, + "loss": 1.4039, + "step": 923 + }, + { + "epoch": 0.13081333616479082, + "grad_norm": 10.003489457712563, + "learning_rate": 4.983590091767123e-06, + "loss": 1.3258, + "step": 924 + }, + { + "epoch": 0.13095490903942805, + "grad_norm": 9.98012360276489, + "learning_rate": 4.983524458758331e-06, + "loss": 1.51, + "step": 925 + }, + { + "epoch": 0.13109648191406525, + "grad_norm": 12.440679555896454, + "learning_rate": 4.98345869519251e-06, + "loss": 1.3865, + "step": 926 + }, + { + "epoch": 0.13123805478870249, + "grad_norm": 11.088330817677909, + "learning_rate": 4.9833928010731185e-06, + "loss": 1.4459, + "step": 927 + }, + { + "epoch": 0.13137962766333972, + "grad_norm": 10.823825439443981, + "learning_rate": 4.983326776403618e-06, + "loss": 1.4539, + "step": 928 + }, + { + "epoch": 0.13152120053797692, + "grad_norm": 7.95486233364943, + "learning_rate": 4.983260621187479e-06, + "loss": 1.2448, + "step": 929 + }, + { + "epoch": 0.13166277341261415, + "grad_norm": 10.145504757510489, + "learning_rate": 4.983194335428183e-06, + "loss": 1.436, + "step": 930 + }, + { + "epoch": 0.13180434628725135, + "grad_norm": 10.442519756843666, + "learning_rate": 4.9831279191292114e-06, + "loss": 1.4316, + "step": 931 + }, + { + "epoch": 0.13194591916188858, + "grad_norm": 11.453988286018543, + "learning_rate": 4.983061372294057e-06, + "loss": 1.4451, + "step": 932 + }, + { + "epoch": 0.1320874920365258, + "grad_norm": 11.32529598295614, + "learning_rate": 4.982994694926217e-06, + "loss": 1.6512, + "step": 933 + }, + { + "epoch": 0.132229064911163, + "grad_norm": 9.290568288250851, + "learning_rate": 4.9829278870291975e-06, + "loss": 1.3172, + "step": 934 + }, + { + "epoch": 0.13237063778580024, + "grad_norm": 9.28137494654954, + "learning_rate": 4.982860948606511e-06, + "loss": 1.4008, + "step": 935 + }, + { + "epoch": 0.13251221066043747, + "grad_norm": 9.267330533771199, + "learning_rate": 4.9827938796616745e-06, + "loss": 1.5217, + "step": 936 + }, + { + "epoch": 0.13265378353507468, + "grad_norm": 8.726702819624991, + "learning_rate": 4.982726680198217e-06, + "loss": 1.4964, + "step": 937 + }, + { + "epoch": 0.1327953564097119, + "grad_norm": 8.565016328046726, + "learning_rate": 4.982659350219668e-06, + "loss": 1.4946, + "step": 938 + }, + { + "epoch": 0.1329369292843491, + "grad_norm": 10.88863911850454, + "learning_rate": 4.982591889729567e-06, + "loss": 1.299, + "step": 939 + }, + { + "epoch": 0.13307850215898634, + "grad_norm": 10.512614710632137, + "learning_rate": 4.982524298731463e-06, + "loss": 1.3458, + "step": 940 + }, + { + "epoch": 0.13322007503362357, + "grad_norm": 10.163914625893948, + "learning_rate": 4.982456577228907e-06, + "loss": 1.5148, + "step": 941 + }, + { + "epoch": 0.13336164790826077, + "grad_norm": 9.119460004915313, + "learning_rate": 4.98238872522546e-06, + "loss": 1.41, + "step": 942 + }, + { + "epoch": 0.133503220782898, + "grad_norm": 9.26413286362214, + "learning_rate": 4.982320742724688e-06, + "loss": 1.5651, + "step": 943 + }, + { + "epoch": 0.1336447936575352, + "grad_norm": 8.904985947166805, + "learning_rate": 4.982252629730167e-06, + "loss": 1.4956, + "step": 944 + }, + { + "epoch": 0.13378636653217243, + "grad_norm": 11.05793762955555, + "learning_rate": 4.982184386245475e-06, + "loss": 1.4573, + "step": 945 + }, + { + "epoch": 0.13392793940680967, + "grad_norm": 10.553403650637136, + 
"learning_rate": 4.9821160122742e-06, + "loss": 1.4644, + "step": 946 + }, + { + "epoch": 0.13406951228144687, + "grad_norm": 10.332657412500502, + "learning_rate": 4.982047507819938e-06, + "loss": 1.4091, + "step": 947 + }, + { + "epoch": 0.1342110851560841, + "grad_norm": 10.716554644895497, + "learning_rate": 4.981978872886288e-06, + "loss": 1.4655, + "step": 948 + }, + { + "epoch": 0.1343526580307213, + "grad_norm": 10.035199585104953, + "learning_rate": 4.981910107476861e-06, + "loss": 1.312, + "step": 949 + }, + { + "epoch": 0.13449423090535853, + "grad_norm": 10.590095542354252, + "learning_rate": 4.9818412115952685e-06, + "loss": 1.3752, + "step": 950 + }, + { + "epoch": 0.13463580377999576, + "grad_norm": 8.911330958841548, + "learning_rate": 4.981772185245135e-06, + "loss": 1.2763, + "step": 951 + }, + { + "epoch": 0.13477737665463296, + "grad_norm": 10.757805762638595, + "learning_rate": 4.981703028430088e-06, + "loss": 1.5149, + "step": 952 + }, + { + "epoch": 0.1349189495292702, + "grad_norm": 11.379414466851905, + "learning_rate": 4.981633741153764e-06, + "loss": 1.4202, + "step": 953 + }, + { + "epoch": 0.13506052240390742, + "grad_norm": 9.974184665485742, + "learning_rate": 4.981564323419804e-06, + "loss": 1.4334, + "step": 954 + }, + { + "epoch": 0.13520209527854463, + "grad_norm": 9.387766372822831, + "learning_rate": 4.981494775231857e-06, + "loss": 1.3727, + "step": 955 + }, + { + "epoch": 0.13534366815318186, + "grad_norm": 10.292683745461241, + "learning_rate": 4.981425096593582e-06, + "loss": 1.4829, + "step": 956 + }, + { + "epoch": 0.13548524102781906, + "grad_norm": 9.037239174356914, + "learning_rate": 4.981355287508638e-06, + "loss": 1.3898, + "step": 957 + }, + { + "epoch": 0.1356268139024563, + "grad_norm": 9.444467561980234, + "learning_rate": 4.981285347980698e-06, + "loss": 1.3918, + "step": 958 + }, + { + "epoch": 0.13576838677709352, + "grad_norm": 10.264587442810049, + "learning_rate": 4.981215278013436e-06, + "loss": 1.508, + "step": 959 + }, + { + "epoch": 0.13590995965173072, + "grad_norm": 8.388120479913654, + "learning_rate": 4.981145077610538e-06, + "loss": 1.2418, + "step": 960 + }, + { + "epoch": 0.13605153252636795, + "grad_norm": 10.001835463877837, + "learning_rate": 4.981074746775693e-06, + "loss": 1.5085, + "step": 961 + }, + { + "epoch": 0.13619310540100515, + "grad_norm": 10.002614251790238, + "learning_rate": 4.9810042855125985e-06, + "loss": 1.3224, + "step": 962 + }, + { + "epoch": 0.13633467827564238, + "grad_norm": 11.522435735058568, + "learning_rate": 4.980933693824959e-06, + "loss": 1.5053, + "step": 963 + }, + { + "epoch": 0.13647625115027961, + "grad_norm": 9.132206827382193, + "learning_rate": 4.9808629717164845e-06, + "loss": 1.4331, + "step": 964 + }, + { + "epoch": 0.13661782402491682, + "grad_norm": 11.769479391273071, + "learning_rate": 4.980792119190894e-06, + "loss": 1.3642, + "step": 965 + }, + { + "epoch": 0.13675939689955405, + "grad_norm": 9.163667454054414, + "learning_rate": 4.98072113625191e-06, + "loss": 1.3382, + "step": 966 + }, + { + "epoch": 0.13690096977419128, + "grad_norm": 12.495151647629797, + "learning_rate": 4.980650022903267e-06, + "loss": 1.4572, + "step": 967 + }, + { + "epoch": 0.13704254264882848, + "grad_norm": 10.404018403385697, + "learning_rate": 4.980578779148702e-06, + "loss": 1.4466, + "step": 968 + }, + { + "epoch": 0.1371841155234657, + "grad_norm": 8.252379565534559, + "learning_rate": 4.98050740499196e-06, + "loss": 1.4243, + "step": 969 + }, + { + "epoch": 0.1373256883981029, + 
"grad_norm": 12.612479771880459, + "learning_rate": 4.980435900436793e-06, + "loss": 1.2932, + "step": 970 + }, + { + "epoch": 0.13746726127274014, + "grad_norm": 9.709028183679578, + "learning_rate": 4.98036426548696e-06, + "loss": 1.4334, + "step": 971 + }, + { + "epoch": 0.13760883414737737, + "grad_norm": 11.45066734268241, + "learning_rate": 4.980292500146227e-06, + "loss": 1.6652, + "step": 972 + }, + { + "epoch": 0.13775040702201458, + "grad_norm": 11.82048418786417, + "learning_rate": 4.980220604418367e-06, + "loss": 1.7111, + "step": 973 + }, + { + "epoch": 0.1378919798966518, + "grad_norm": 8.866555938411734, + "learning_rate": 4.980148578307159e-06, + "loss": 1.4161, + "step": 974 + }, + { + "epoch": 0.138033552771289, + "grad_norm": 8.834389629426116, + "learning_rate": 4.98007642181639e-06, + "loss": 1.3748, + "step": 975 + }, + { + "epoch": 0.13817512564592624, + "grad_norm": 11.778432655921506, + "learning_rate": 4.980004134949853e-06, + "loss": 1.5432, + "step": 976 + }, + { + "epoch": 0.13831669852056347, + "grad_norm": 10.606405705043555, + "learning_rate": 4.979931717711347e-06, + "loss": 1.3311, + "step": 977 + }, + { + "epoch": 0.13845827139520067, + "grad_norm": 8.512196485847344, + "learning_rate": 4.979859170104679e-06, + "loss": 1.5009, + "step": 978 + }, + { + "epoch": 0.1385998442698379, + "grad_norm": 8.835874467889678, + "learning_rate": 4.979786492133665e-06, + "loss": 1.3571, + "step": 979 + }, + { + "epoch": 0.13874141714447513, + "grad_norm": 9.35339674014811, + "learning_rate": 4.979713683802123e-06, + "loss": 1.4426, + "step": 980 + }, + { + "epoch": 0.13888299001911233, + "grad_norm": 11.408329353489727, + "learning_rate": 4.979640745113883e-06, + "loss": 1.4879, + "step": 981 + }, + { + "epoch": 0.13902456289374956, + "grad_norm": 12.359477470735134, + "learning_rate": 4.979567676072776e-06, + "loss": 1.4438, + "step": 982 + }, + { + "epoch": 0.13916613576838677, + "grad_norm": 10.322607647183696, + "learning_rate": 4.979494476682647e-06, + "loss": 1.4606, + "step": 983 + }, + { + "epoch": 0.139307708643024, + "grad_norm": 10.086477204648922, + "learning_rate": 4.979421146947341e-06, + "loss": 1.4126, + "step": 984 + }, + { + "epoch": 0.13944928151766123, + "grad_norm": 10.64736953272068, + "learning_rate": 4.979347686870714e-06, + "loss": 1.5668, + "step": 985 + }, + { + "epoch": 0.13959085439229843, + "grad_norm": 10.613767543262023, + "learning_rate": 4.979274096456629e-06, + "loss": 1.2888, + "step": 986 + }, + { + "epoch": 0.13973242726693566, + "grad_norm": 10.09852284119904, + "learning_rate": 4.979200375708951e-06, + "loss": 1.6008, + "step": 987 + }, + { + "epoch": 0.13987400014157286, + "grad_norm": 9.035037609332392, + "learning_rate": 4.97912652463156e-06, + "loss": 1.6409, + "step": 988 + }, + { + "epoch": 0.1400155730162101, + "grad_norm": 12.794224055778182, + "learning_rate": 4.979052543228335e-06, + "loss": 1.385, + "step": 989 + }, + { + "epoch": 0.14015714589084732, + "grad_norm": 11.261915212472951, + "learning_rate": 4.978978431503167e-06, + "loss": 1.3993, + "step": 990 + }, + { + "epoch": 0.14029871876548453, + "grad_norm": 11.296968536200769, + "learning_rate": 4.978904189459951e-06, + "loss": 1.494, + "step": 991 + }, + { + "epoch": 0.14044029164012176, + "grad_norm": 9.663204713219619, + "learning_rate": 4.97882981710259e-06, + "loss": 1.41, + "step": 992 + }, + { + "epoch": 0.14058186451475896, + "grad_norm": 11.121170928011969, + "learning_rate": 4.978755314434994e-06, + "loss": 1.5727, + "step": 993 + }, + { + "epoch": 
0.1407234373893962, + "grad_norm": 13.115443647111098, + "learning_rate": 4.978680681461079e-06, + "loss": 1.2923, + "step": 994 + }, + { + "epoch": 0.14086501026403342, + "grad_norm": 10.048980160110816, + "learning_rate": 4.978605918184769e-06, + "loss": 1.4329, + "step": 995 + }, + { + "epoch": 0.14100658313867062, + "grad_norm": 8.097353806761502, + "learning_rate": 4.978531024609994e-06, + "loss": 1.5344, + "step": 996 + }, + { + "epoch": 0.14114815601330785, + "grad_norm": 9.60738514712689, + "learning_rate": 4.978456000740691e-06, + "loss": 1.4355, + "step": 997 + }, + { + "epoch": 0.14128972888794508, + "grad_norm": 11.914958342344496, + "learning_rate": 4.9783808465808035e-06, + "loss": 1.4211, + "step": 998 + }, + { + "epoch": 0.14143130176258228, + "grad_norm": 13.603430236855129, + "learning_rate": 4.978305562134284e-06, + "loss": 1.5307, + "step": 999 + }, + { + "epoch": 0.14157287463721951, + "grad_norm": 8.529147269769355, + "learning_rate": 4.978230147405089e-06, + "loss": 1.3799, + "step": 1000 + }, + { + "epoch": 0.14171444751185672, + "grad_norm": 9.365108306959094, + "learning_rate": 4.978154602397182e-06, + "loss": 1.5159, + "step": 1001 + }, + { + "epoch": 0.14185602038649395, + "grad_norm": 11.15910409701634, + "learning_rate": 4.978078927114536e-06, + "loss": 1.4008, + "step": 1002 + }, + { + "epoch": 0.14199759326113118, + "grad_norm": 10.990043642549228, + "learning_rate": 4.978003121561128e-06, + "loss": 1.3022, + "step": 1003 + }, + { + "epoch": 0.14213916613576838, + "grad_norm": 9.246816963820704, + "learning_rate": 4.977927185740944e-06, + "loss": 1.4544, + "step": 1004 + }, + { + "epoch": 0.1422807390104056, + "grad_norm": 10.453005301071016, + "learning_rate": 4.977851119657976e-06, + "loss": 1.3737, + "step": 1005 + }, + { + "epoch": 0.1424223118850428, + "grad_norm": 10.253497387681884, + "learning_rate": 4.977774923316221e-06, + "loss": 1.4559, + "step": 1006 + }, + { + "epoch": 0.14256388475968004, + "grad_norm": 10.453303451041243, + "learning_rate": 4.977698596719686e-06, + "loss": 1.3894, + "step": 1007 + }, + { + "epoch": 0.14270545763431727, + "grad_norm": 9.37325952268419, + "learning_rate": 4.977622139872384e-06, + "loss": 1.47, + "step": 1008 + }, + { + "epoch": 0.14284703050895448, + "grad_norm": 9.131975825795323, + "learning_rate": 4.977545552778333e-06, + "loss": 1.4402, + "step": 1009 + }, + { + "epoch": 0.1429886033835917, + "grad_norm": 9.718049738638728, + "learning_rate": 4.97746883544156e-06, + "loss": 1.4262, + "step": 1010 + }, + { + "epoch": 0.14313017625822894, + "grad_norm": 10.116358707607029, + "learning_rate": 4.977391987866097e-06, + "loss": 1.445, + "step": 1011 + }, + { + "epoch": 0.14327174913286614, + "grad_norm": 8.425311291657307, + "learning_rate": 4.9773150100559844e-06, + "loss": 1.4091, + "step": 1012 + }, + { + "epoch": 0.14341332200750337, + "grad_norm": 10.365558786672377, + "learning_rate": 4.9772379020152695e-06, + "loss": 1.4616, + "step": 1013 + }, + { + "epoch": 0.14355489488214057, + "grad_norm": 15.885079135331438, + "learning_rate": 4.977160663748005e-06, + "loss": 1.3573, + "step": 1014 + }, + { + "epoch": 0.1436964677567778, + "grad_norm": 10.502108044410171, + "learning_rate": 4.977083295258251e-06, + "loss": 1.3841, + "step": 1015 + }, + { + "epoch": 0.14383804063141503, + "grad_norm": 13.155202067268448, + "learning_rate": 4.977005796550076e-06, + "loss": 1.5351, + "step": 1016 + }, + { + "epoch": 0.14397961350605223, + "grad_norm": 11.606896527369852, + "learning_rate": 4.976928167627553e-06, + 
"loss": 1.5193, + "step": 1017 + }, + { + "epoch": 0.14412118638068946, + "grad_norm": 13.255399935173825, + "learning_rate": 4.976850408494762e-06, + "loss": 1.4378, + "step": 1018 + }, + { + "epoch": 0.14426275925532667, + "grad_norm": 9.288782832601399, + "learning_rate": 4.976772519155793e-06, + "loss": 1.4433, + "step": 1019 + }, + { + "epoch": 0.1444043321299639, + "grad_norm": 10.427544694437525, + "learning_rate": 4.976694499614739e-06, + "loss": 1.4251, + "step": 1020 + }, + { + "epoch": 0.14454590500460113, + "grad_norm": 12.341231695250656, + "learning_rate": 4.976616349875702e-06, + "loss": 1.2695, + "step": 1021 + }, + { + "epoch": 0.14468747787923833, + "grad_norm": 11.298174644834262, + "learning_rate": 4.9765380699427905e-06, + "loss": 1.3877, + "step": 1022 + }, + { + "epoch": 0.14482905075387556, + "grad_norm": 10.643813407561243, + "learning_rate": 4.9764596598201185e-06, + "loss": 1.4918, + "step": 1023 + }, + { + "epoch": 0.1449706236285128, + "grad_norm": 9.563820342619284, + "learning_rate": 4.97638111951181e-06, + "loss": 1.4276, + "step": 1024 + }, + { + "epoch": 0.14511219650315, + "grad_norm": 13.82821603518763, + "learning_rate": 4.976302449021991e-06, + "loss": 1.6607, + "step": 1025 + }, + { + "epoch": 0.14525376937778722, + "grad_norm": 12.852151392282781, + "learning_rate": 4.9762236483547985e-06, + "loss": 1.4211, + "step": 1026 + }, + { + "epoch": 0.14539534225242443, + "grad_norm": 10.573630342998147, + "learning_rate": 4.976144717514376e-06, + "loss": 1.6094, + "step": 1027 + }, + { + "epoch": 0.14553691512706166, + "grad_norm": 10.723908809469082, + "learning_rate": 4.976065656504873e-06, + "loss": 1.4079, + "step": 1028 + }, + { + "epoch": 0.14567848800169889, + "grad_norm": 12.01102259161513, + "learning_rate": 4.975986465330443e-06, + "loss": 1.3311, + "step": 1029 + }, + { + "epoch": 0.1458200608763361, + "grad_norm": 11.423504714444578, + "learning_rate": 4.975907143995251e-06, + "loss": 1.4104, + "step": 1030 + }, + { + "epoch": 0.14596163375097332, + "grad_norm": 11.202373484752886, + "learning_rate": 4.975827692503467e-06, + "loss": 1.6661, + "step": 1031 + }, + { + "epoch": 0.14610320662561052, + "grad_norm": 9.624426490244147, + "learning_rate": 4.975748110859267e-06, + "loss": 1.3012, + "step": 1032 + }, + { + "epoch": 0.14624477950024775, + "grad_norm": 10.496106470382133, + "learning_rate": 4.975668399066835e-06, + "loss": 1.2818, + "step": 1033 + }, + { + "epoch": 0.14638635237488498, + "grad_norm": 9.560498794886763, + "learning_rate": 4.975588557130361e-06, + "loss": 1.3187, + "step": 1034 + }, + { + "epoch": 0.14652792524952218, + "grad_norm": 11.358597887577485, + "learning_rate": 4.9755085850540426e-06, + "loss": 1.3526, + "step": 1035 + }, + { + "epoch": 0.14666949812415941, + "grad_norm": 13.353662315679955, + "learning_rate": 4.975428482842083e-06, + "loss": 1.5669, + "step": 1036 + }, + { + "epoch": 0.14681107099879662, + "grad_norm": 9.508693933182485, + "learning_rate": 4.975348250498695e-06, + "loss": 1.4339, + "step": 1037 + }, + { + "epoch": 0.14695264387343385, + "grad_norm": 10.67028743167267, + "learning_rate": 4.975267888028094e-06, + "loss": 1.5514, + "step": 1038 + }, + { + "epoch": 0.14709421674807108, + "grad_norm": 12.432852485365197, + "learning_rate": 4.975187395434506e-06, + "loss": 1.5356, + "step": 1039 + }, + { + "epoch": 0.14723578962270828, + "grad_norm": 11.009804257793228, + "learning_rate": 4.975106772722164e-06, + "loss": 1.3748, + "step": 1040 + }, + { + "epoch": 0.1473773624973455, + "grad_norm": 
13.361634608794887, + "learning_rate": 4.975026019895302e-06, + "loss": 1.3196, + "step": 1041 + }, + { + "epoch": 0.14751893537198274, + "grad_norm": 10.463307528853358, + "learning_rate": 4.9749451369581694e-06, + "loss": 1.3407, + "step": 1042 + }, + { + "epoch": 0.14766050824661994, + "grad_norm": 10.88527746278865, + "learning_rate": 4.974864123915015e-06, + "loss": 1.3184, + "step": 1043 + }, + { + "epoch": 0.14780208112125717, + "grad_norm": 10.910183872033597, + "learning_rate": 4.9747829807701e-06, + "loss": 1.4277, + "step": 1044 + }, + { + "epoch": 0.14794365399589438, + "grad_norm": 12.437827561248763, + "learning_rate": 4.974701707527688e-06, + "loss": 1.3132, + "step": 1045 + }, + { + "epoch": 0.1480852268705316, + "grad_norm": 11.367001582457263, + "learning_rate": 4.9746203041920534e-06, + "loss": 1.4632, + "step": 1046 + }, + { + "epoch": 0.14822679974516884, + "grad_norm": 11.41785448860892, + "learning_rate": 4.974538770767474e-06, + "loss": 1.5009, + "step": 1047 + }, + { + "epoch": 0.14836837261980604, + "grad_norm": 13.5083699593978, + "learning_rate": 4.9744571072582365e-06, + "loss": 1.3323, + "step": 1048 + }, + { + "epoch": 0.14850994549444327, + "grad_norm": 11.626322353176043, + "learning_rate": 4.974375313668633e-06, + "loss": 1.397, + "step": 1049 + }, + { + "epoch": 0.14865151836908047, + "grad_norm": 11.361327285596067, + "learning_rate": 4.974293390002966e-06, + "loss": 1.5719, + "step": 1050 + }, + { + "epoch": 0.1487930912437177, + "grad_norm": 9.16248762642518, + "learning_rate": 4.97421133626554e-06, + "loss": 1.4475, + "step": 1051 + }, + { + "epoch": 0.14893466411835493, + "grad_norm": 10.151214196252738, + "learning_rate": 4.9741291524606684e-06, + "loss": 1.4556, + "step": 1052 + }, + { + "epoch": 0.14907623699299213, + "grad_norm": 10.747275428599734, + "learning_rate": 4.974046838592672e-06, + "loss": 1.1976, + "step": 1053 + }, + { + "epoch": 0.14921780986762936, + "grad_norm": 12.961171190064734, + "learning_rate": 4.973964394665878e-06, + "loss": 1.3879, + "step": 1054 + }, + { + "epoch": 0.1493593827422666, + "grad_norm": 10.603018607975477, + "learning_rate": 4.973881820684621e-06, + "loss": 1.4065, + "step": 1055 + }, + { + "epoch": 0.1495009556169038, + "grad_norm": 9.31069785480428, + "learning_rate": 4.973799116653241e-06, + "loss": 1.4934, + "step": 1056 + }, + { + "epoch": 0.14964252849154103, + "grad_norm": 16.04268959383983, + "learning_rate": 4.973716282576086e-06, + "loss": 1.5151, + "step": 1057 + }, + { + "epoch": 0.14978410136617823, + "grad_norm": 11.236340516625793, + "learning_rate": 4.9736333184575105e-06, + "loss": 1.4289, + "step": 1058 + }, + { + "epoch": 0.14992567424081546, + "grad_norm": 9.126239013163671, + "learning_rate": 4.973550224301875e-06, + "loss": 1.5669, + "step": 1059 + }, + { + "epoch": 0.1500672471154527, + "grad_norm": 9.198298537440863, + "learning_rate": 4.9734670001135495e-06, + "loss": 1.6833, + "step": 1060 + }, + { + "epoch": 0.1502088199900899, + "grad_norm": 9.131406860257547, + "learning_rate": 4.973383645896908e-06, + "loss": 1.3644, + "step": 1061 + }, + { + "epoch": 0.15035039286472712, + "grad_norm": 9.334936367608764, + "learning_rate": 4.973300161656332e-06, + "loss": 1.3722, + "step": 1062 + }, + { + "epoch": 0.15049196573936433, + "grad_norm": 14.091431285321775, + "learning_rate": 4.973216547396212e-06, + "loss": 1.5158, + "step": 1063 + }, + { + "epoch": 0.15063353861400156, + "grad_norm": 11.170176813131523, + "learning_rate": 4.9731328031209414e-06, + "loss": 1.4306, + "step": 1064 + 
}, + { + "epoch": 0.15077511148863879, + "grad_norm": 8.582498849345885, + "learning_rate": 4.973048928834923e-06, + "loss": 1.5167, + "step": 1065 + }, + { + "epoch": 0.150916684363276, + "grad_norm": 9.433267010387112, + "learning_rate": 4.972964924542567e-06, + "loss": 1.4454, + "step": 1066 + }, + { + "epoch": 0.15105825723791322, + "grad_norm": 14.339828320783036, + "learning_rate": 4.9728807902482885e-06, + "loss": 1.2547, + "step": 1067 + }, + { + "epoch": 0.15119983011255045, + "grad_norm": 14.778738699689868, + "learning_rate": 4.97279652595651e-06, + "loss": 1.5662, + "step": 1068 + }, + { + "epoch": 0.15134140298718765, + "grad_norm": 9.461519306919731, + "learning_rate": 4.972712131671663e-06, + "loss": 1.4678, + "step": 1069 + }, + { + "epoch": 0.15148297586182488, + "grad_norm": 10.811195956406142, + "learning_rate": 4.972627607398183e-06, + "loss": 1.5634, + "step": 1070 + }, + { + "epoch": 0.15162454873646208, + "grad_norm": 11.419955111814554, + "learning_rate": 4.972542953140513e-06, + "loss": 1.4625, + "step": 1071 + }, + { + "epoch": 0.1517661216110993, + "grad_norm": 14.291934768045316, + "learning_rate": 4.972458168903104e-06, + "loss": 1.4495, + "step": 1072 + }, + { + "epoch": 0.15190769448573654, + "grad_norm": 11.688620773676435, + "learning_rate": 4.972373254690411e-06, + "loss": 1.3111, + "step": 1073 + }, + { + "epoch": 0.15204926736037375, + "grad_norm": 10.369326959323612, + "learning_rate": 4.972288210506902e-06, + "loss": 1.2632, + "step": 1074 + }, + { + "epoch": 0.15219084023501098, + "grad_norm": 8.705770696521368, + "learning_rate": 4.972203036357043e-06, + "loss": 1.3816, + "step": 1075 + }, + { + "epoch": 0.15233241310964818, + "grad_norm": 8.262297279537284, + "learning_rate": 4.972117732245314e-06, + "loss": 1.4605, + "step": 1076 + }, + { + "epoch": 0.1524739859842854, + "grad_norm": 14.19886035912907, + "learning_rate": 4.972032298176201e-06, + "loss": 1.4351, + "step": 1077 + }, + { + "epoch": 0.15261555885892264, + "grad_norm": 11.331854854711217, + "learning_rate": 4.9719467341541914e-06, + "loss": 1.4399, + "step": 1078 + }, + { + "epoch": 0.15275713173355984, + "grad_norm": 11.01909886769136, + "learning_rate": 4.971861040183785e-06, + "loss": 1.4564, + "step": 1079 + }, + { + "epoch": 0.15289870460819707, + "grad_norm": 9.907998878303376, + "learning_rate": 4.971775216269488e-06, + "loss": 1.3605, + "step": 1080 + }, + { + "epoch": 0.1530402774828343, + "grad_norm": 11.40083882776011, + "learning_rate": 4.971689262415811e-06, + "loss": 1.4275, + "step": 1081 + }, + { + "epoch": 0.1531818503574715, + "grad_norm": 10.459875917592225, + "learning_rate": 4.971603178627271e-06, + "loss": 1.3963, + "step": 1082 + }, + { + "epoch": 0.15332342323210874, + "grad_norm": 11.487214652119425, + "learning_rate": 4.971516964908396e-06, + "loss": 1.3589, + "step": 1083 + }, + { + "epoch": 0.15346499610674594, + "grad_norm": 11.151934304546012, + "learning_rate": 4.9714306212637165e-06, + "loss": 1.4228, + "step": 1084 + }, + { + "epoch": 0.15360656898138317, + "grad_norm": 8.532627539454115, + "learning_rate": 4.971344147697772e-06, + "loss": 1.3938, + "step": 1085 + }, + { + "epoch": 0.1537481418560204, + "grad_norm": 9.998560229605014, + "learning_rate": 4.9712575442151086e-06, + "loss": 1.4513, + "step": 1086 + }, + { + "epoch": 0.1538897147306576, + "grad_norm": 12.77024699737728, + "learning_rate": 4.971170810820279e-06, + "loss": 1.3814, + "step": 1087 + }, + { + "epoch": 0.15403128760529483, + "grad_norm": 10.3076896776969, + "learning_rate": 
4.971083947517842e-06, + "loss": 1.342, + "step": 1088 + }, + { + "epoch": 0.15417286047993203, + "grad_norm": 12.411860276070437, + "learning_rate": 4.970996954312365e-06, + "loss": 1.4412, + "step": 1089 + }, + { + "epoch": 0.15431443335456926, + "grad_norm": 10.3758588688178, + "learning_rate": 4.97090983120842e-06, + "loss": 1.3477, + "step": 1090 + }, + { + "epoch": 0.1544560062292065, + "grad_norm": 10.890165781768179, + "learning_rate": 4.970822578210587e-06, + "loss": 1.5636, + "step": 1091 + }, + { + "epoch": 0.1545975791038437, + "grad_norm": 9.416283208640872, + "learning_rate": 4.970735195323454e-06, + "loss": 1.2581, + "step": 1092 + }, + { + "epoch": 0.15473915197848093, + "grad_norm": 8.270322795135762, + "learning_rate": 4.970647682551614e-06, + "loss": 1.3414, + "step": 1093 + }, + { + "epoch": 0.15488072485311813, + "grad_norm": 8.862284402955435, + "learning_rate": 4.970560039899668e-06, + "loss": 1.4531, + "step": 1094 + }, + { + "epoch": 0.15502229772775536, + "grad_norm": 11.31599094206851, + "learning_rate": 4.970472267372223e-06, + "loss": 1.425, + "step": 1095 + }, + { + "epoch": 0.1551638706023926, + "grad_norm": 10.413127654027031, + "learning_rate": 4.9703843649738926e-06, + "loss": 1.3869, + "step": 1096 + }, + { + "epoch": 0.1553054434770298, + "grad_norm": 8.314629074145461, + "learning_rate": 4.970296332709298e-06, + "loss": 1.1668, + "step": 1097 + }, + { + "epoch": 0.15544701635166702, + "grad_norm": 9.403806248559635, + "learning_rate": 4.970208170583066e-06, + "loss": 1.3294, + "step": 1098 + }, + { + "epoch": 0.15558858922630425, + "grad_norm": 9.383870315319077, + "learning_rate": 4.9701198785998335e-06, + "loss": 1.373, + "step": 1099 + }, + { + "epoch": 0.15573016210094145, + "grad_norm": 8.001678290755633, + "learning_rate": 4.970031456764242e-06, + "loss": 1.3347, + "step": 1100 + }, + { + "epoch": 0.15587173497557869, + "grad_norm": 10.719288448233202, + "learning_rate": 4.969942905080936e-06, + "loss": 1.4413, + "step": 1101 + }, + { + "epoch": 0.1560133078502159, + "grad_norm": 7.976514676031953, + "learning_rate": 4.969854223554575e-06, + "loss": 1.5117, + "step": 1102 + }, + { + "epoch": 0.15615488072485312, + "grad_norm": 11.376611616387244, + "learning_rate": 4.969765412189819e-06, + "loss": 1.5925, + "step": 1103 + }, + { + "epoch": 0.15629645359949035, + "grad_norm": 9.816093418587734, + "learning_rate": 4.969676470991336e-06, + "loss": 1.4351, + "step": 1104 + }, + { + "epoch": 0.15643802647412755, + "grad_norm": 9.03913635381994, + "learning_rate": 4.969587399963802e-06, + "loss": 1.3264, + "step": 1105 + }, + { + "epoch": 0.15657959934876478, + "grad_norm": 9.631109503254123, + "learning_rate": 4.969498199111901e-06, + "loss": 1.2027, + "step": 1106 + }, + { + "epoch": 0.15672117222340198, + "grad_norm": 10.377132702132887, + "learning_rate": 4.9694088684403205e-06, + "loss": 1.5278, + "step": 1107 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 10.224965482993962, + "learning_rate": 4.969319407953756e-06, + "loss": 1.426, + "step": 1108 + }, + { + "epoch": 0.15700431797267644, + "grad_norm": 8.847604511438128, + "learning_rate": 4.969229817656913e-06, + "loss": 1.364, + "step": 1109 + }, + { + "epoch": 0.15714589084731365, + "grad_norm": 10.746890305810197, + "learning_rate": 4.969140097554499e-06, + "loss": 1.4391, + "step": 1110 + }, + { + "epoch": 0.15728746372195088, + "grad_norm": 10.371233152900208, + "learning_rate": 4.969050247651231e-06, + "loss": 1.2657, + "step": 1111 + }, + { + "epoch": 0.1574290365965881, + 
"grad_norm": 7.79930844297882, + "learning_rate": 4.968960267951833e-06, + "loss": 1.3751, + "step": 1112 + }, + { + "epoch": 0.1575706094712253, + "grad_norm": 9.171062686957574, + "learning_rate": 4.9688701584610345e-06, + "loss": 1.3753, + "step": 1113 + }, + { + "epoch": 0.15771218234586254, + "grad_norm": 9.098981578137046, + "learning_rate": 4.968779919183573e-06, + "loss": 1.5349, + "step": 1114 + }, + { + "epoch": 0.15785375522049974, + "grad_norm": 10.660480811635745, + "learning_rate": 4.96868955012419e-06, + "loss": 1.4799, + "step": 1115 + }, + { + "epoch": 0.15799532809513697, + "grad_norm": 9.575802793787465, + "learning_rate": 4.96859905128764e-06, + "loss": 1.3096, + "step": 1116 + }, + { + "epoch": 0.1581369009697742, + "grad_norm": 8.891622101929187, + "learning_rate": 4.968508422678679e-06, + "loss": 1.5135, + "step": 1117 + }, + { + "epoch": 0.1582784738444114, + "grad_norm": 9.200077089733066, + "learning_rate": 4.968417664302069e-06, + "loss": 1.287, + "step": 1118 + }, + { + "epoch": 0.15842004671904863, + "grad_norm": 10.188585094867273, + "learning_rate": 4.968326776162584e-06, + "loss": 1.4761, + "step": 1119 + }, + { + "epoch": 0.15856161959368584, + "grad_norm": 9.168421028335906, + "learning_rate": 4.968235758265001e-06, + "loss": 1.3717, + "step": 1120 + }, + { + "epoch": 0.15870319246832307, + "grad_norm": 9.34101751172199, + "learning_rate": 4.968144610614104e-06, + "loss": 1.3951, + "step": 1121 + }, + { + "epoch": 0.1588447653429603, + "grad_norm": 11.010672160600489, + "learning_rate": 4.9680533332146855e-06, + "loss": 1.4431, + "step": 1122 + }, + { + "epoch": 0.1589863382175975, + "grad_norm": 11.218364400798432, + "learning_rate": 4.967961926071543e-06, + "loss": 1.3748, + "step": 1123 + }, + { + "epoch": 0.15912791109223473, + "grad_norm": 11.261779382749543, + "learning_rate": 4.967870389189483e-06, + "loss": 1.2774, + "step": 1124 + }, + { + "epoch": 0.15926948396687196, + "grad_norm": 11.603817909976467, + "learning_rate": 4.967778722573317e-06, + "loss": 1.4539, + "step": 1125 + }, + { + "epoch": 0.15941105684150916, + "grad_norm": 10.122911402660465, + "learning_rate": 4.967686926227862e-06, + "loss": 1.6403, + "step": 1126 + }, + { + "epoch": 0.1595526297161464, + "grad_norm": 10.39772614783685, + "learning_rate": 4.967595000157946e-06, + "loss": 1.3066, + "step": 1127 + }, + { + "epoch": 0.1596942025907836, + "grad_norm": 10.90454368031861, + "learning_rate": 4.967502944368402e-06, + "loss": 1.3928, + "step": 1128 + }, + { + "epoch": 0.15983577546542083, + "grad_norm": 11.933325871584245, + "learning_rate": 4.967410758864066e-06, + "loss": 1.438, + "step": 1129 + }, + { + "epoch": 0.15997734834005806, + "grad_norm": 9.831154360975924, + "learning_rate": 4.967318443649788e-06, + "loss": 1.4727, + "step": 1130 + }, + { + "epoch": 0.16011892121469526, + "grad_norm": 11.812427661815542, + "learning_rate": 4.967225998730419e-06, + "loss": 1.3898, + "step": 1131 + }, + { + "epoch": 0.1602604940893325, + "grad_norm": 11.030757903883215, + "learning_rate": 4.967133424110817e-06, + "loss": 1.5569, + "step": 1132 + }, + { + "epoch": 0.1604020669639697, + "grad_norm": 9.17609263054572, + "learning_rate": 4.967040719795853e-06, + "loss": 1.5918, + "step": 1133 + }, + { + "epoch": 0.16054363983860692, + "grad_norm": 8.318264217508672, + "learning_rate": 4.966947885790396e-06, + "loss": 1.2553, + "step": 1134 + }, + { + "epoch": 0.16068521271324415, + "grad_norm": 10.842414180036881, + "learning_rate": 4.966854922099329e-06, + "loss": 1.4688, + "step": 1135 
+ }, + { + "epoch": 0.16082678558788135, + "grad_norm": 12.299192721539187, + "learning_rate": 4.966761828727537e-06, + "loss": 1.4029, + "step": 1136 + }, + { + "epoch": 0.16096835846251858, + "grad_norm": 11.009137606545018, + "learning_rate": 4.9666686056799165e-06, + "loss": 1.3821, + "step": 1137 + }, + { + "epoch": 0.1611099313371558, + "grad_norm": 11.163513725359461, + "learning_rate": 4.966575252961365e-06, + "loss": 1.5941, + "step": 1138 + }, + { + "epoch": 0.16125150421179302, + "grad_norm": 9.323887768397675, + "learning_rate": 4.966481770576793e-06, + "loss": 1.339, + "step": 1139 + }, + { + "epoch": 0.16139307708643025, + "grad_norm": 8.477286115427065, + "learning_rate": 4.9663881585311126e-06, + "loss": 1.3777, + "step": 1140 + }, + { + "epoch": 0.16153464996106745, + "grad_norm": 9.243336932565304, + "learning_rate": 4.9662944168292455e-06, + "loss": 1.3516, + "step": 1141 + }, + { + "epoch": 0.16167622283570468, + "grad_norm": 12.333965663336954, + "learning_rate": 4.966200545476121e-06, + "loss": 1.4765, + "step": 1142 + }, + { + "epoch": 0.1618177957103419, + "grad_norm": 9.071883611776828, + "learning_rate": 4.966106544476672e-06, + "loss": 1.3394, + "step": 1143 + }, + { + "epoch": 0.1619593685849791, + "grad_norm": 11.024835466813666, + "learning_rate": 4.9660124138358415e-06, + "loss": 1.3858, + "step": 1144 + }, + { + "epoch": 0.16210094145961634, + "grad_norm": 8.484420410933952, + "learning_rate": 4.965918153558576e-06, + "loss": 1.3445, + "step": 1145 + }, + { + "epoch": 0.16224251433425355, + "grad_norm": 8.714771525221783, + "learning_rate": 4.965823763649832e-06, + "loss": 1.4312, + "step": 1146 + }, + { + "epoch": 0.16238408720889078, + "grad_norm": 9.34404843469373, + "learning_rate": 4.965729244114572e-06, + "loss": 1.2111, + "step": 1147 + }, + { + "epoch": 0.162525660083528, + "grad_norm": 9.389282739969744, + "learning_rate": 4.965634594957763e-06, + "loss": 1.4717, + "step": 1148 + }, + { + "epoch": 0.1626672329581652, + "grad_norm": 10.538472854034158, + "learning_rate": 4.9655398161843836e-06, + "loss": 1.3414, + "step": 1149 + }, + { + "epoch": 0.16280880583280244, + "grad_norm": 9.512602628337234, + "learning_rate": 4.965444907799413e-06, + "loss": 1.5278, + "step": 1150 + }, + { + "epoch": 0.16295037870743964, + "grad_norm": 8.153282356425157, + "learning_rate": 4.9653498698078425e-06, + "loss": 1.3913, + "step": 1151 + }, + { + "epoch": 0.16309195158207687, + "grad_norm": 8.679582459563552, + "learning_rate": 4.965254702214668e-06, + "loss": 1.3723, + "step": 1152 + }, + { + "epoch": 0.1632335244567141, + "grad_norm": 13.238001735209368, + "learning_rate": 4.96515940502489e-06, + "loss": 1.3045, + "step": 1153 + }, + { + "epoch": 0.1633750973313513, + "grad_norm": 12.673422233300508, + "learning_rate": 4.9650639782435225e-06, + "loss": 1.5039, + "step": 1154 + }, + { + "epoch": 0.16351667020598853, + "grad_norm": 8.495696380793566, + "learning_rate": 4.964968421875579e-06, + "loss": 1.2364, + "step": 1155 + }, + { + "epoch": 0.16365824308062576, + "grad_norm": 12.792491045470287, + "learning_rate": 4.964872735926083e-06, + "loss": 1.284, + "step": 1156 + }, + { + "epoch": 0.16379981595526297, + "grad_norm": 11.505837327310823, + "learning_rate": 4.964776920400066e-06, + "loss": 1.3871, + "step": 1157 + }, + { + "epoch": 0.1639413888299002, + "grad_norm": 8.938741957797301, + "learning_rate": 4.964680975302563e-06, + "loss": 1.3476, + "step": 1158 + }, + { + "epoch": 0.1640829617045374, + "grad_norm": 8.73522617512371, + "learning_rate": 
4.96458490063862e-06, + "loss": 1.403, + "step": 1159 + }, + { + "epoch": 0.16422453457917463, + "grad_norm": 9.957762591447775, + "learning_rate": 4.964488696413285e-06, + "loss": 1.4244, + "step": 1160 + }, + { + "epoch": 0.16436610745381186, + "grad_norm": 11.11307907054668, + "learning_rate": 4.964392362631618e-06, + "loss": 1.5481, + "step": 1161 + }, + { + "epoch": 0.16450768032844906, + "grad_norm": 11.926048018429686, + "learning_rate": 4.964295899298682e-06, + "loss": 1.423, + "step": 1162 + }, + { + "epoch": 0.1646492532030863, + "grad_norm": 11.528167726184716, + "learning_rate": 4.964199306419548e-06, + "loss": 1.3061, + "step": 1163 + }, + { + "epoch": 0.1647908260777235, + "grad_norm": 8.566770504966597, + "learning_rate": 4.964102583999293e-06, + "loss": 1.3728, + "step": 1164 + }, + { + "epoch": 0.16493239895236073, + "grad_norm": 8.776371479723135, + "learning_rate": 4.964005732043003e-06, + "loss": 1.3834, + "step": 1165 + }, + { + "epoch": 0.16507397182699796, + "grad_norm": 11.007583085127115, + "learning_rate": 4.9639087505557694e-06, + "loss": 1.4465, + "step": 1166 + }, + { + "epoch": 0.16521554470163516, + "grad_norm": 10.09922996078321, + "learning_rate": 4.96381163954269e-06, + "loss": 1.6076, + "step": 1167 + }, + { + "epoch": 0.1653571175762724, + "grad_norm": 10.567291798035304, + "learning_rate": 4.963714399008869e-06, + "loss": 1.4837, + "step": 1168 + }, + { + "epoch": 0.16549869045090962, + "grad_norm": 11.65468032535977, + "learning_rate": 4.9636170289594195e-06, + "loss": 1.5359, + "step": 1169 + }, + { + "epoch": 0.16564026332554682, + "grad_norm": 9.775782607962624, + "learning_rate": 4.96351952939946e-06, + "loss": 1.452, + "step": 1170 + }, + { + "epoch": 0.16578183620018405, + "grad_norm": 13.114437830660444, + "learning_rate": 4.9634219003341156e-06, + "loss": 1.528, + "step": 1171 + }, + { + "epoch": 0.16592340907482125, + "grad_norm": 11.979407125077575, + "learning_rate": 4.963324141768519e-06, + "loss": 1.5519, + "step": 1172 + }, + { + "epoch": 0.16606498194945848, + "grad_norm": 10.885796109876429, + "learning_rate": 4.963226253707808e-06, + "loss": 1.5312, + "step": 1173 + }, + { + "epoch": 0.16620655482409571, + "grad_norm": 10.604622721779831, + "learning_rate": 4.96312823615713e-06, + "loss": 1.396, + "step": 1174 + }, + { + "epoch": 0.16634812769873292, + "grad_norm": 11.798653373819695, + "learning_rate": 4.963030089121636e-06, + "loss": 1.2915, + "step": 1175 + }, + { + "epoch": 0.16648970057337015, + "grad_norm": 11.491131887701354, + "learning_rate": 4.9629318126064884e-06, + "loss": 1.4561, + "step": 1176 + }, + { + "epoch": 0.16663127344800735, + "grad_norm": 11.972271991182767, + "learning_rate": 4.962833406616851e-06, + "loss": 1.5285, + "step": 1177 + }, + { + "epoch": 0.16677284632264458, + "grad_norm": 9.390168393121174, + "learning_rate": 4.9627348711578996e-06, + "loss": 1.3934, + "step": 1178 + }, + { + "epoch": 0.1669144191972818, + "grad_norm": 10.882036760646354, + "learning_rate": 4.96263620623481e-06, + "loss": 1.2298, + "step": 1179 + }, + { + "epoch": 0.167055992071919, + "grad_norm": 9.64544567100882, + "learning_rate": 4.962537411852772e-06, + "loss": 1.3061, + "step": 1180 + }, + { + "epoch": 0.16719756494655624, + "grad_norm": 9.806827533508105, + "learning_rate": 4.962438488016979e-06, + "loss": 1.3396, + "step": 1181 + }, + { + "epoch": 0.16733913782119345, + "grad_norm": 9.71201299479839, + "learning_rate": 4.9623394347326306e-06, + "loss": 1.4096, + "step": 1182 + }, + { + "epoch": 0.16748071069583068, + 
"grad_norm": 13.5619841446755, + "learning_rate": 4.9622402520049336e-06, + "loss": 1.4384, + "step": 1183 + }, + { + "epoch": 0.1676222835704679, + "grad_norm": 10.734509685176699, + "learning_rate": 4.962140939839103e-06, + "loss": 1.5649, + "step": 1184 + }, + { + "epoch": 0.1677638564451051, + "grad_norm": 11.315798789572646, + "learning_rate": 4.962041498240359e-06, + "loss": 1.4355, + "step": 1185 + }, + { + "epoch": 0.16790542931974234, + "grad_norm": 11.55416916412609, + "learning_rate": 4.961941927213928e-06, + "loss": 1.3427, + "step": 1186 + }, + { + "epoch": 0.16804700219437957, + "grad_norm": 9.02151418558346, + "learning_rate": 4.961842226765047e-06, + "loss": 1.1961, + "step": 1187 + }, + { + "epoch": 0.16818857506901677, + "grad_norm": 12.82667077913701, + "learning_rate": 4.9617423968989556e-06, + "loss": 1.4451, + "step": 1188 + }, + { + "epoch": 0.168330147943654, + "grad_norm": 12.902665423253215, + "learning_rate": 4.961642437620901e-06, + "loss": 1.3572, + "step": 1189 + }, + { + "epoch": 0.1684717208182912, + "grad_norm": 10.824847805724099, + "learning_rate": 4.96154234893614e-06, + "loss": 1.2855, + "step": 1190 + }, + { + "epoch": 0.16861329369292843, + "grad_norm": 10.783067456072368, + "learning_rate": 4.961442130849933e-06, + "loss": 1.5119, + "step": 1191 + }, + { + "epoch": 0.16875486656756566, + "grad_norm": 13.67412593077536, + "learning_rate": 4.961341783367548e-06, + "loss": 1.4632, + "step": 1192 + }, + { + "epoch": 0.16889643944220287, + "grad_norm": 10.060369230437537, + "learning_rate": 4.96124130649426e-06, + "loss": 1.5454, + "step": 1193 + }, + { + "epoch": 0.1690380123168401, + "grad_norm": 12.049706825077218, + "learning_rate": 4.961140700235353e-06, + "loss": 1.2218, + "step": 1194 + }, + { + "epoch": 0.1691795851914773, + "grad_norm": 11.13142672296197, + "learning_rate": 4.961039964596114e-06, + "loss": 1.4, + "step": 1195 + }, + { + "epoch": 0.16932115806611453, + "grad_norm": 11.373651780935813, + "learning_rate": 4.9609390995818395e-06, + "loss": 1.4734, + "step": 1196 + }, + { + "epoch": 0.16946273094075176, + "grad_norm": 10.533761016597309, + "learning_rate": 4.960838105197831e-06, + "loss": 1.3499, + "step": 1197 + }, + { + "epoch": 0.16960430381538896, + "grad_norm": 13.118413525937502, + "learning_rate": 4.960736981449399e-06, + "loss": 1.4746, + "step": 1198 + }, + { + "epoch": 0.1697458766900262, + "grad_norm": 12.616305237149032, + "learning_rate": 4.960635728341858e-06, + "loss": 1.4723, + "step": 1199 + }, + { + "epoch": 0.16988744956466342, + "grad_norm": 8.847532508102674, + "learning_rate": 4.960534345880531e-06, + "loss": 1.3297, + "step": 1200 + }, + { + "epoch": 0.17002902243930063, + "grad_norm": 14.640137752864607, + "learning_rate": 4.960432834070749e-06, + "loss": 1.4149, + "step": 1201 + }, + { + "epoch": 0.17017059531393786, + "grad_norm": 12.616302213523927, + "learning_rate": 4.960331192917847e-06, + "loss": 1.3855, + "step": 1202 + }, + { + "epoch": 0.17031216818857506, + "grad_norm": 14.617087285867052, + "learning_rate": 4.960229422427169e-06, + "loss": 1.3622, + "step": 1203 + }, + { + "epoch": 0.1704537410632123, + "grad_norm": 8.348884060236406, + "learning_rate": 4.960127522604065e-06, + "loss": 1.4954, + "step": 1204 + }, + { + "epoch": 0.17059531393784952, + "grad_norm": 9.819201082567886, + "learning_rate": 4.96002549345389e-06, + "loss": 1.4255, + "step": 1205 + }, + { + "epoch": 0.17073688681248672, + "grad_norm": 9.15721901202432, + "learning_rate": 4.95992333498201e-06, + "loss": 1.3908, + "step": 1206 
+ }, + { + "epoch": 0.17087845968712395, + "grad_norm": 10.813551305917178, + "learning_rate": 4.9598210471937945e-06, + "loss": 1.326, + "step": 1207 + }, + { + "epoch": 0.17102003256176115, + "grad_norm": 11.933726088668827, + "learning_rate": 4.959718630094621e-06, + "loss": 1.4383, + "step": 1208 + }, + { + "epoch": 0.17116160543639838, + "grad_norm": 9.099036918232361, + "learning_rate": 4.9596160836898735e-06, + "loss": 1.3666, + "step": 1209 + }, + { + "epoch": 0.17130317831103561, + "grad_norm": 10.218174349468343, + "learning_rate": 4.959513407984941e-06, + "loss": 1.572, + "step": 1210 + }, + { + "epoch": 0.17144475118567282, + "grad_norm": 12.859644946095846, + "learning_rate": 4.9594106029852234e-06, + "loss": 1.396, + "step": 1211 + }, + { + "epoch": 0.17158632406031005, + "grad_norm": 9.24482829914984, + "learning_rate": 4.959307668696124e-06, + "loss": 1.2843, + "step": 1212 + }, + { + "epoch": 0.17172789693494728, + "grad_norm": 10.265967980262618, + "learning_rate": 4.959204605123055e-06, + "loss": 1.4165, + "step": 1213 + }, + { + "epoch": 0.17186946980958448, + "grad_norm": 8.351821481455735, + "learning_rate": 4.959101412271433e-06, + "loss": 1.2746, + "step": 1214 + }, + { + "epoch": 0.1720110426842217, + "grad_norm": 11.04192684994654, + "learning_rate": 4.958998090146683e-06, + "loss": 1.5011, + "step": 1215 + }, + { + "epoch": 0.1721526155588589, + "grad_norm": 10.159462382592904, + "learning_rate": 4.9588946387542366e-06, + "loss": 1.3807, + "step": 1216 + }, + { + "epoch": 0.17229418843349614, + "grad_norm": 11.15726959954774, + "learning_rate": 4.958791058099533e-06, + "loss": 1.5969, + "step": 1217 + }, + { + "epoch": 0.17243576130813337, + "grad_norm": 11.748912355121231, + "learning_rate": 4.9586873481880175e-06, + "loss": 1.4639, + "step": 1218 + }, + { + "epoch": 0.17257733418277058, + "grad_norm": 11.24281725998454, + "learning_rate": 4.95858350902514e-06, + "loss": 1.432, + "step": 1219 + }, + { + "epoch": 0.1727189070574078, + "grad_norm": 11.426358158512341, + "learning_rate": 4.958479540616362e-06, + "loss": 1.4727, + "step": 1220 + }, + { + "epoch": 0.172860479932045, + "grad_norm": 10.459059691313787, + "learning_rate": 4.958375442967147e-06, + "loss": 1.2013, + "step": 1221 + }, + { + "epoch": 0.17300205280668224, + "grad_norm": 10.903855200227994, + "learning_rate": 4.958271216082968e-06, + "loss": 1.5968, + "step": 1222 + }, + { + "epoch": 0.17314362568131947, + "grad_norm": 8.562422035029401, + "learning_rate": 4.958166859969304e-06, + "loss": 1.2934, + "step": 1223 + }, + { + "epoch": 0.17328519855595667, + "grad_norm": 9.916248943175406, + "learning_rate": 4.958062374631641e-06, + "loss": 1.3958, + "step": 1224 + }, + { + "epoch": 0.1734267714305939, + "grad_norm": 10.51843568782201, + "learning_rate": 4.957957760075472e-06, + "loss": 1.4059, + "step": 1225 + }, + { + "epoch": 0.17356834430523113, + "grad_norm": 8.415946501587218, + "learning_rate": 4.957853016306297e-06, + "loss": 1.1663, + "step": 1226 + }, + { + "epoch": 0.17370991717986833, + "grad_norm": 11.55198067200119, + "learning_rate": 4.95774814332962e-06, + "loss": 1.5978, + "step": 1227 + }, + { + "epoch": 0.17385149005450556, + "grad_norm": 9.472669942000413, + "learning_rate": 4.957643141150958e-06, + "loss": 1.3526, + "step": 1228 + }, + { + "epoch": 0.17399306292914277, + "grad_norm": 11.487637716802354, + "learning_rate": 4.957538009775826e-06, + "loss": 1.4369, + "step": 1229 + }, + { + "epoch": 0.17413463580378, + "grad_norm": 10.38057145502646, + "learning_rate": 
4.957432749209755e-06, + "loss": 1.2956, + "step": 1230 + }, + { + "epoch": 0.17427620867841723, + "grad_norm": 8.574535960106083, + "learning_rate": 4.957327359458276e-06, + "loss": 1.4177, + "step": 1231 + }, + { + "epoch": 0.17441778155305443, + "grad_norm": 10.08844327049869, + "learning_rate": 4.95722184052693e-06, + "loss": 1.4099, + "step": 1232 + }, + { + "epoch": 0.17455935442769166, + "grad_norm": 9.842078851202182, + "learning_rate": 4.957116192421264e-06, + "loss": 1.4446, + "step": 1233 + }, + { + "epoch": 0.17470092730232886, + "grad_norm": 11.444186564711783, + "learning_rate": 4.957010415146833e-06, + "loss": 1.5742, + "step": 1234 + }, + { + "epoch": 0.1748425001769661, + "grad_norm": 9.278005321806237, + "learning_rate": 4.956904508709195e-06, + "loss": 1.4318, + "step": 1235 + }, + { + "epoch": 0.17498407305160332, + "grad_norm": 9.250776103684913, + "learning_rate": 4.956798473113919e-06, + "loss": 1.3097, + "step": 1236 + }, + { + "epoch": 0.17512564592624053, + "grad_norm": 9.33031178565744, + "learning_rate": 4.95669230836658e-06, + "loss": 1.5937, + "step": 1237 + }, + { + "epoch": 0.17526721880087776, + "grad_norm": 12.027497893011923, + "learning_rate": 4.9565860144727575e-06, + "loss": 1.4439, + "step": 1238 + }, + { + "epoch": 0.17540879167551496, + "grad_norm": 9.08825853374944, + "learning_rate": 4.956479591438039e-06, + "loss": 1.3547, + "step": 1239 + }, + { + "epoch": 0.1755503645501522, + "grad_norm": 8.161937148592736, + "learning_rate": 4.956373039268022e-06, + "loss": 1.2735, + "step": 1240 + }, + { + "epoch": 0.17569193742478942, + "grad_norm": 10.205633769263143, + "learning_rate": 4.9562663579683045e-06, + "loss": 1.4412, + "step": 1241 + }, + { + "epoch": 0.17583351029942662, + "grad_norm": 9.403487398638212, + "learning_rate": 4.9561595475444965e-06, + "loss": 1.3559, + "step": 1242 + }, + { + "epoch": 0.17597508317406385, + "grad_norm": 9.108079664415538, + "learning_rate": 4.956052608002212e-06, + "loss": 1.4586, + "step": 1243 + }, + { + "epoch": 0.17611665604870108, + "grad_norm": 9.24952077912984, + "learning_rate": 4.955945539347075e-06, + "loss": 1.3813, + "step": 1244 + }, + { + "epoch": 0.17625822892333828, + "grad_norm": 10.600258057530036, + "learning_rate": 4.95583834158471e-06, + "loss": 1.4605, + "step": 1245 + }, + { + "epoch": 0.17639980179797551, + "grad_norm": 10.200256224761281, + "learning_rate": 4.955731014720756e-06, + "loss": 1.4307, + "step": 1246 + }, + { + "epoch": 0.17654137467261272, + "grad_norm": 8.814155585425187, + "learning_rate": 4.955623558760852e-06, + "loss": 1.3241, + "step": 1247 + }, + { + "epoch": 0.17668294754724995, + "grad_norm": 11.308718223084396, + "learning_rate": 4.955515973710651e-06, + "loss": 1.4646, + "step": 1248 + }, + { + "epoch": 0.17682452042188718, + "grad_norm": 9.743309268441, + "learning_rate": 4.955408259575804e-06, + "loss": 1.4439, + "step": 1249 + }, + { + "epoch": 0.17696609329652438, + "grad_norm": 9.243629530095223, + "learning_rate": 4.955300416361977e-06, + "loss": 1.4048, + "step": 1250 + }, + { + "epoch": 0.1771076661711616, + "grad_norm": 9.133956485104214, + "learning_rate": 4.955192444074837e-06, + "loss": 1.3724, + "step": 1251 + }, + { + "epoch": 0.1772492390457988, + "grad_norm": 10.55718210192026, + "learning_rate": 4.9550843427200605e-06, + "loss": 1.543, + "step": 1252 + }, + { + "epoch": 0.17739081192043604, + "grad_norm": 9.931020867769547, + "learning_rate": 4.9549761123033316e-06, + "loss": 1.4349, + "step": 1253 + }, + { + "epoch": 0.17753238479507327, + 
"grad_norm": 10.301284887668823, + "learning_rate": 4.9548677528303385e-06, + "loss": 1.2981, + "step": 1254 + }, + { + "epoch": 0.17767395766971047, + "grad_norm": 12.779747461919179, + "learning_rate": 4.954759264306778e-06, + "loss": 1.4602, + "step": 1255 + }, + { + "epoch": 0.1778155305443477, + "grad_norm": 10.095981032461012, + "learning_rate": 4.954650646738354e-06, + "loss": 1.4877, + "step": 1256 + }, + { + "epoch": 0.17795710341898494, + "grad_norm": 9.880629468008708, + "learning_rate": 4.954541900130775e-06, + "loss": 1.4322, + "step": 1257 + }, + { + "epoch": 0.17809867629362214, + "grad_norm": 9.479925126693503, + "learning_rate": 4.9544330244897586e-06, + "loss": 1.4071, + "step": 1258 + }, + { + "epoch": 0.17824024916825937, + "grad_norm": 13.439660754690644, + "learning_rate": 4.954324019821028e-06, + "loss": 1.5452, + "step": 1259 + }, + { + "epoch": 0.17838182204289657, + "grad_norm": 11.647775304995545, + "learning_rate": 4.954214886130315e-06, + "loss": 1.2609, + "step": 1260 + }, + { + "epoch": 0.1785233949175338, + "grad_norm": 11.642780447464066, + "learning_rate": 4.954105623423354e-06, + "loss": 1.4583, + "step": 1261 + }, + { + "epoch": 0.17866496779217103, + "grad_norm": 8.863395303502623, + "learning_rate": 4.953996231705891e-06, + "loss": 1.5065, + "step": 1262 + }, + { + "epoch": 0.17880654066680823, + "grad_norm": 10.095532144820766, + "learning_rate": 4.953886710983676e-06, + "loss": 1.4411, + "step": 1263 + }, + { + "epoch": 0.17894811354144546, + "grad_norm": 11.069173922570961, + "learning_rate": 4.9537770612624655e-06, + "loss": 1.3173, + "step": 1264 + }, + { + "epoch": 0.17908968641608267, + "grad_norm": 14.270529431450896, + "learning_rate": 4.9536672825480255e-06, + "loss": 1.6317, + "step": 1265 + }, + { + "epoch": 0.1792312592907199, + "grad_norm": 11.0541626283303, + "learning_rate": 4.953557374846125e-06, + "loss": 1.4635, + "step": 1266 + }, + { + "epoch": 0.17937283216535713, + "grad_norm": 9.765344917858565, + "learning_rate": 4.953447338162543e-06, + "loss": 1.4278, + "step": 1267 + }, + { + "epoch": 0.17951440503999433, + "grad_norm": 11.277619768444119, + "learning_rate": 4.953337172503064e-06, + "loss": 1.521, + "step": 1268 + }, + { + "epoch": 0.17965597791463156, + "grad_norm": 11.9747054221091, + "learning_rate": 4.953226877873479e-06, + "loss": 1.4342, + "step": 1269 + }, + { + "epoch": 0.1797975507892688, + "grad_norm": 12.10329421417826, + "learning_rate": 4.953116454279587e-06, + "loss": 1.4381, + "step": 1270 + }, + { + "epoch": 0.179939123663906, + "grad_norm": 9.971966932943507, + "learning_rate": 4.953005901727191e-06, + "loss": 1.3245, + "step": 1271 + }, + { + "epoch": 0.18008069653854322, + "grad_norm": 9.615502365565073, + "learning_rate": 4.952895220222104e-06, + "loss": 1.3928, + "step": 1272 + }, + { + "epoch": 0.18022226941318042, + "grad_norm": 17.01965273152077, + "learning_rate": 4.952784409770145e-06, + "loss": 1.6055, + "step": 1273 + }, + { + "epoch": 0.18036384228781766, + "grad_norm": 13.720290390811819, + "learning_rate": 4.952673470377137e-06, + "loss": 1.4027, + "step": 1274 + }, + { + "epoch": 0.18050541516245489, + "grad_norm": 9.988762836087634, + "learning_rate": 4.952562402048915e-06, + "loss": 1.4202, + "step": 1275 + }, + { + "epoch": 0.1806469880370921, + "grad_norm": 8.607270082122529, + "learning_rate": 4.952451204791315e-06, + "loss": 1.3766, + "step": 1276 + }, + { + "epoch": 0.18078856091172932, + "grad_norm": 10.341802592361498, + "learning_rate": 4.952339878610185e-06, + "loss": 1.4664, + 
"step": 1277 + }, + { + "epoch": 0.18093013378636652, + "grad_norm": 9.014496043374562, + "learning_rate": 4.952228423511375e-06, + "loss": 1.3828, + "step": 1278 + }, + { + "epoch": 0.18107170666100375, + "grad_norm": 12.217489894662155, + "learning_rate": 4.952116839500747e-06, + "loss": 1.4517, + "step": 1279 + }, + { + "epoch": 0.18121327953564098, + "grad_norm": 8.323724267487021, + "learning_rate": 4.9520051265841626e-06, + "loss": 1.3604, + "step": 1280 + }, + { + "epoch": 0.18135485241027818, + "grad_norm": 9.62268514168148, + "learning_rate": 4.951893284767498e-06, + "loss": 1.2875, + "step": 1281 + }, + { + "epoch": 0.1814964252849154, + "grad_norm": 9.685343840821625, + "learning_rate": 4.951781314056633e-06, + "loss": 1.3752, + "step": 1282 + }, + { + "epoch": 0.18163799815955262, + "grad_norm": 10.753015139705075, + "learning_rate": 4.951669214457451e-06, + "loss": 1.4195, + "step": 1283 + }, + { + "epoch": 0.18177957103418985, + "grad_norm": 10.590308426498227, + "learning_rate": 4.951556985975847e-06, + "loss": 1.4208, + "step": 1284 + }, + { + "epoch": 0.18192114390882708, + "grad_norm": 9.819562375015314, + "learning_rate": 4.95144462861772e-06, + "loss": 1.277, + "step": 1285 + }, + { + "epoch": 0.18206271678346428, + "grad_norm": 7.9818799325146506, + "learning_rate": 4.951332142388976e-06, + "loss": 1.3249, + "step": 1286 + }, + { + "epoch": 0.1822042896581015, + "grad_norm": 10.943998487996671, + "learning_rate": 4.95121952729553e-06, + "loss": 1.4502, + "step": 1287 + }, + { + "epoch": 0.18234586253273874, + "grad_norm": 10.979825679875306, + "learning_rate": 4.951106783343301e-06, + "loss": 1.4487, + "step": 1288 + }, + { + "epoch": 0.18248743540737594, + "grad_norm": 9.770841575461832, + "learning_rate": 4.950993910538216e-06, + "loss": 1.3723, + "step": 1289 + }, + { + "epoch": 0.18262900828201317, + "grad_norm": 10.980378771087224, + "learning_rate": 4.950880908886208e-06, + "loss": 1.4405, + "step": 1290 + }, + { + "epoch": 0.18277058115665037, + "grad_norm": 11.481429655261184, + "learning_rate": 4.95076777839322e-06, + "loss": 1.4285, + "step": 1291 + }, + { + "epoch": 0.1829121540312876, + "grad_norm": 8.182483330093966, + "learning_rate": 4.950654519065196e-06, + "loss": 1.3006, + "step": 1292 + }, + { + "epoch": 0.18305372690592484, + "grad_norm": 9.096813826255365, + "learning_rate": 4.950541130908091e-06, + "loss": 1.3831, + "step": 1293 + }, + { + "epoch": 0.18319529978056204, + "grad_norm": 9.853855335137629, + "learning_rate": 4.9504276139278655e-06, + "loss": 1.2732, + "step": 1294 + }, + { + "epoch": 0.18333687265519927, + "grad_norm": 11.495887352674895, + "learning_rate": 4.950313968130488e-06, + "loss": 1.3993, + "step": 1295 + }, + { + "epoch": 0.18347844552983647, + "grad_norm": 10.910807272243023, + "learning_rate": 4.950200193521932e-06, + "loss": 1.5819, + "step": 1296 + }, + { + "epoch": 0.1836200184044737, + "grad_norm": 11.001623207318126, + "learning_rate": 4.950086290108179e-06, + "loss": 1.2754, + "step": 1297 + }, + { + "epoch": 0.18376159127911093, + "grad_norm": 9.095434498193997, + "learning_rate": 4.949972257895217e-06, + "loss": 1.3008, + "step": 1298 + }, + { + "epoch": 0.18390316415374813, + "grad_norm": 10.568705344260884, + "learning_rate": 4.94985809688904e-06, + "loss": 1.5203, + "step": 1299 + }, + { + "epoch": 0.18404473702838536, + "grad_norm": 9.619732889444375, + "learning_rate": 4.949743807095649e-06, + "loss": 1.3809, + "step": 1300 + }, + { + "epoch": 0.1841863099030226, + "grad_norm": 9.65499682149336, + 
"learning_rate": 4.9496293885210535e-06, + "loss": 1.4684, + "step": 1301 + }, + { + "epoch": 0.1843278827776598, + "grad_norm": 9.362241073715378, + "learning_rate": 4.949514841171266e-06, + "loss": 1.3595, + "step": 1302 + }, + { + "epoch": 0.18446945565229703, + "grad_norm": 8.556109863203659, + "learning_rate": 4.949400165052312e-06, + "loss": 1.4326, + "step": 1303 + }, + { + "epoch": 0.18461102852693423, + "grad_norm": 10.676470860227965, + "learning_rate": 4.949285360170216e-06, + "loss": 1.3866, + "step": 1304 + }, + { + "epoch": 0.18475260140157146, + "grad_norm": 9.947864429809405, + "learning_rate": 4.949170426531016e-06, + "loss": 1.3765, + "step": 1305 + }, + { + "epoch": 0.1848941742762087, + "grad_norm": 8.53969237298366, + "learning_rate": 4.9490553641407515e-06, + "loss": 1.3969, + "step": 1306 + }, + { + "epoch": 0.1850357471508459, + "grad_norm": 10.972005420801684, + "learning_rate": 4.948940173005474e-06, + "loss": 1.5874, + "step": 1307 + }, + { + "epoch": 0.18517732002548312, + "grad_norm": 8.153466228276447, + "learning_rate": 4.948824853131237e-06, + "loss": 1.2992, + "step": 1308 + }, + { + "epoch": 0.18531889290012032, + "grad_norm": 8.134814628828991, + "learning_rate": 4.948709404524103e-06, + "loss": 1.3673, + "step": 1309 + }, + { + "epoch": 0.18546046577475755, + "grad_norm": 11.671786756067023, + "learning_rate": 4.948593827190142e-06, + "loss": 1.5002, + "step": 1310 + }, + { + "epoch": 0.18560203864939478, + "grad_norm": 10.968404998135554, + "learning_rate": 4.9484781211354286e-06, + "loss": 1.4449, + "step": 1311 + }, + { + "epoch": 0.185743611524032, + "grad_norm": 9.692236745199738, + "learning_rate": 4.948362286366047e-06, + "loss": 1.3675, + "step": 1312 + }, + { + "epoch": 0.18588518439866922, + "grad_norm": 8.732902170656994, + "learning_rate": 4.948246322888085e-06, + "loss": 1.3472, + "step": 1313 + }, + { + "epoch": 0.18602675727330645, + "grad_norm": 9.47669533733155, + "learning_rate": 4.948130230707639e-06, + "loss": 1.4672, + "step": 1314 + }, + { + "epoch": 0.18616833014794365, + "grad_norm": 10.157481896442764, + "learning_rate": 4.9480140098308125e-06, + "loss": 1.2868, + "step": 1315 + }, + { + "epoch": 0.18630990302258088, + "grad_norm": 9.390438135100014, + "learning_rate": 4.947897660263715e-06, + "loss": 1.3004, + "step": 1316 + }, + { + "epoch": 0.18645147589721808, + "grad_norm": 11.66483692171289, + "learning_rate": 4.947781182012462e-06, + "loss": 1.4928, + "step": 1317 + }, + { + "epoch": 0.1865930487718553, + "grad_norm": 9.531605798302733, + "learning_rate": 4.947664575083179e-06, + "loss": 1.3808, + "step": 1318 + }, + { + "epoch": 0.18673462164649254, + "grad_norm": 9.881572226398355, + "learning_rate": 4.947547839481993e-06, + "loss": 1.3443, + "step": 1319 + }, + { + "epoch": 0.18687619452112975, + "grad_norm": 12.592229330104862, + "learning_rate": 4.947430975215043e-06, + "loss": 1.4111, + "step": 1320 + }, + { + "epoch": 0.18701776739576698, + "grad_norm": 7.8929867675237295, + "learning_rate": 4.94731398228847e-06, + "loss": 1.394, + "step": 1321 + }, + { + "epoch": 0.18715934027040418, + "grad_norm": 8.915758270861408, + "learning_rate": 4.947196860708426e-06, + "loss": 1.3197, + "step": 1322 + }, + { + "epoch": 0.1873009131450414, + "grad_norm": 10.281649225246678, + "learning_rate": 4.947079610481069e-06, + "loss": 1.4726, + "step": 1323 + }, + { + "epoch": 0.18744248601967864, + "grad_norm": 10.346913387005015, + "learning_rate": 4.946962231612561e-06, + "loss": 1.4057, + "step": 1324 + }, + { + "epoch": 
0.18758405889431584, + "grad_norm": 10.85186446382872, + "learning_rate": 4.946844724109073e-06, + "loss": 1.4484, + "step": 1325 + }, + { + "epoch": 0.18772563176895307, + "grad_norm": 10.397265338822459, + "learning_rate": 4.946727087976782e-06, + "loss": 1.3758, + "step": 1326 + }, + { + "epoch": 0.18786720464359027, + "grad_norm": 10.717269750764162, + "learning_rate": 4.946609323221873e-06, + "loss": 1.5345, + "step": 1327 + }, + { + "epoch": 0.1880087775182275, + "grad_norm": 8.243639256772507, + "learning_rate": 4.946491429850535e-06, + "loss": 1.5037, + "step": 1328 + }, + { + "epoch": 0.18815035039286473, + "grad_norm": 8.874825059482907, + "learning_rate": 4.946373407868967e-06, + "loss": 1.3401, + "step": 1329 + }, + { + "epoch": 0.18829192326750194, + "grad_norm": 12.616091162701146, + "learning_rate": 4.946255257283374e-06, + "loss": 1.3707, + "step": 1330 + }, + { + "epoch": 0.18843349614213917, + "grad_norm": 9.352716844742478, + "learning_rate": 4.946136978099966e-06, + "loss": 1.2701, + "step": 1331 + }, + { + "epoch": 0.1885750690167764, + "grad_norm": 10.501154154969147, + "learning_rate": 4.94601857032496e-06, + "loss": 1.3217, + "step": 1332 + }, + { + "epoch": 0.1887166418914136, + "grad_norm": 8.499716810109074, + "learning_rate": 4.9459000339645824e-06, + "loss": 1.3314, + "step": 1333 + }, + { + "epoch": 0.18885821476605083, + "grad_norm": 14.166111355537906, + "learning_rate": 4.9457813690250635e-06, + "loss": 1.4995, + "step": 1334 + }, + { + "epoch": 0.18899978764068803, + "grad_norm": 9.226727877434055, + "learning_rate": 4.9456625755126415e-06, + "loss": 1.2562, + "step": 1335 + }, + { + "epoch": 0.18914136051532526, + "grad_norm": 10.948314965598017, + "learning_rate": 4.945543653433562e-06, + "loss": 1.4601, + "step": 1336 + }, + { + "epoch": 0.1892829333899625, + "grad_norm": 9.773726717468312, + "learning_rate": 4.945424602794076e-06, + "loss": 1.3103, + "step": 1337 + }, + { + "epoch": 0.1894245062645997, + "grad_norm": 12.455311964883823, + "learning_rate": 4.945305423600441e-06, + "loss": 1.3241, + "step": 1338 + }, + { + "epoch": 0.18956607913923693, + "grad_norm": 11.529004878618576, + "learning_rate": 4.945186115858925e-06, + "loss": 1.3585, + "step": 1339 + }, + { + "epoch": 0.18970765201387413, + "grad_norm": 11.345787685011533, + "learning_rate": 4.945066679575796e-06, + "loss": 1.4424, + "step": 1340 + }, + { + "epoch": 0.18984922488851136, + "grad_norm": 10.194054864362695, + "learning_rate": 4.944947114757336e-06, + "loss": 1.2237, + "step": 1341 + }, + { + "epoch": 0.1899907977631486, + "grad_norm": 10.348267090431245, + "learning_rate": 4.944827421409829e-06, + "loss": 1.4031, + "step": 1342 + }, + { + "epoch": 0.1901323706377858, + "grad_norm": 9.089459327926136, + "learning_rate": 4.944707599539567e-06, + "loss": 1.4427, + "step": 1343 + }, + { + "epoch": 0.19027394351242302, + "grad_norm": 9.225534612779986, + "learning_rate": 4.94458764915285e-06, + "loss": 1.3805, + "step": 1344 + }, + { + "epoch": 0.19041551638706025, + "grad_norm": 10.78111271425273, + "learning_rate": 4.944467570255983e-06, + "loss": 1.3714, + "step": 1345 + }, + { + "epoch": 0.19055708926169745, + "grad_norm": 11.521000801694974, + "learning_rate": 4.944347362855278e-06, + "loss": 1.516, + "step": 1346 + }, + { + "epoch": 0.19069866213633468, + "grad_norm": 9.098280993738083, + "learning_rate": 4.9442270269570545e-06, + "loss": 1.4196, + "step": 1347 + }, + { + "epoch": 0.1908402350109719, + "grad_norm": 8.869595453612424, + "learning_rate": 4.94410656256764e-06, + 
"loss": 1.3407, + "step": 1348 + }, + { + "epoch": 0.19098180788560912, + "grad_norm": 10.113305767761899, + "learning_rate": 4.943985969693365e-06, + "loss": 1.4174, + "step": 1349 + }, + { + "epoch": 0.19112338076024635, + "grad_norm": 11.61574719446589, + "learning_rate": 4.94386524834057e-06, + "loss": 1.5299, + "step": 1350 + }, + { + "epoch": 0.19126495363488355, + "grad_norm": 11.476767915324704, + "learning_rate": 4.943744398515601e-06, + "loss": 1.3625, + "step": 1351 + }, + { + "epoch": 0.19140652650952078, + "grad_norm": 8.57043757386515, + "learning_rate": 4.943623420224811e-06, + "loss": 1.4615, + "step": 1352 + }, + { + "epoch": 0.19154809938415798, + "grad_norm": 10.85613394473715, + "learning_rate": 4.94350231347456e-06, + "loss": 1.5389, + "step": 1353 + }, + { + "epoch": 0.1916896722587952, + "grad_norm": 9.767020603402306, + "learning_rate": 4.943381078271214e-06, + "loss": 1.4154, + "step": 1354 + }, + { + "epoch": 0.19183124513343244, + "grad_norm": 11.56796943590867, + "learning_rate": 4.943259714621148e-06, + "loss": 1.3023, + "step": 1355 + }, + { + "epoch": 0.19197281800806965, + "grad_norm": 10.458139810380096, + "learning_rate": 4.943138222530739e-06, + "loss": 1.4922, + "step": 1356 + }, + { + "epoch": 0.19211439088270688, + "grad_norm": 10.467323684318963, + "learning_rate": 4.943016602006376e-06, + "loss": 1.3893, + "step": 1357 + }, + { + "epoch": 0.1922559637573441, + "grad_norm": 10.520513568714776, + "learning_rate": 4.942894853054452e-06, + "loss": 1.3297, + "step": 1358 + }, + { + "epoch": 0.1923975366319813, + "grad_norm": 11.262506611575473, + "learning_rate": 4.942772975681366e-06, + "loss": 1.5517, + "step": 1359 + }, + { + "epoch": 0.19253910950661854, + "grad_norm": 10.089802918035847, + "learning_rate": 4.942650969893527e-06, + "loss": 1.3762, + "step": 1360 + }, + { + "epoch": 0.19268068238125574, + "grad_norm": 12.919759335003654, + "learning_rate": 4.942528835697348e-06, + "loss": 1.3492, + "step": 1361 + }, + { + "epoch": 0.19282225525589297, + "grad_norm": 10.337501136567539, + "learning_rate": 4.942406573099249e-06, + "loss": 1.3081, + "step": 1362 + }, + { + "epoch": 0.1929638281305302, + "grad_norm": 9.915372577078744, + "learning_rate": 4.942284182105658e-06, + "loss": 1.3189, + "step": 1363 + }, + { + "epoch": 0.1931054010051674, + "grad_norm": 9.440456710371095, + "learning_rate": 4.942161662723007e-06, + "loss": 1.404, + "step": 1364 + }, + { + "epoch": 0.19324697387980463, + "grad_norm": 9.267884980760257, + "learning_rate": 4.94203901495774e-06, + "loss": 1.2984, + "step": 1365 + }, + { + "epoch": 0.19338854675444184, + "grad_norm": 8.356946018308943, + "learning_rate": 4.9419162388163025e-06, + "loss": 1.3929, + "step": 1366 + }, + { + "epoch": 0.19353011962907907, + "grad_norm": 8.44305101687967, + "learning_rate": 4.941793334305149e-06, + "loss": 1.2583, + "step": 1367 + }, + { + "epoch": 0.1936716925037163, + "grad_norm": 11.71486198523089, + "learning_rate": 4.94167030143074e-06, + "loss": 1.468, + "step": 1368 + }, + { + "epoch": 0.1938132653783535, + "grad_norm": 10.06407200635613, + "learning_rate": 4.941547140199545e-06, + "loss": 1.4349, + "step": 1369 + }, + { + "epoch": 0.19395483825299073, + "grad_norm": 9.51950760730035, + "learning_rate": 4.9414238506180365e-06, + "loss": 1.3515, + "step": 1370 + }, + { + "epoch": 0.19409641112762796, + "grad_norm": 7.932155942768058, + "learning_rate": 4.941300432692697e-06, + "loss": 1.5073, + "step": 1371 + }, + { + "epoch": 0.19423798400226516, + "grad_norm": 9.294788697104853, + 
"learning_rate": 4.941176886430014e-06, + "loss": 1.3969, + "step": 1372 + }, + { + "epoch": 0.1943795568769024, + "grad_norm": 9.259619170364177, + "learning_rate": 4.941053211836482e-06, + "loss": 1.4073, + "step": 1373 + }, + { + "epoch": 0.1945211297515396, + "grad_norm": 9.642528080245363, + "learning_rate": 4.940929408918603e-06, + "loss": 1.3788, + "step": 1374 + }, + { + "epoch": 0.19466270262617683, + "grad_norm": 10.991841845578161, + "learning_rate": 4.940805477682885e-06, + "loss": 1.4656, + "step": 1375 + }, + { + "epoch": 0.19480427550081406, + "grad_norm": 10.30256090641879, + "learning_rate": 4.940681418135843e-06, + "loss": 1.3944, + "step": 1376 + }, + { + "epoch": 0.19494584837545126, + "grad_norm": 9.159377465561763, + "learning_rate": 4.940557230283999e-06, + "loss": 1.5978, + "step": 1377 + }, + { + "epoch": 0.1950874212500885, + "grad_norm": 10.441637875049674, + "learning_rate": 4.94043291413388e-06, + "loss": 1.5533, + "step": 1378 + }, + { + "epoch": 0.1952289941247257, + "grad_norm": 10.462395678220009, + "learning_rate": 4.9403084696920234e-06, + "loss": 1.4965, + "step": 1379 + }, + { + "epoch": 0.19537056699936292, + "grad_norm": 9.704357026000535, + "learning_rate": 4.940183896964969e-06, + "loss": 1.0718, + "step": 1380 + }, + { + "epoch": 0.19551213987400015, + "grad_norm": 9.32829798245702, + "learning_rate": 4.940059195959268e-06, + "loss": 1.3682, + "step": 1381 + }, + { + "epoch": 0.19565371274863735, + "grad_norm": 9.641984098765143, + "learning_rate": 4.939934366681474e-06, + "loss": 1.312, + "step": 1382 + }, + { + "epoch": 0.19579528562327458, + "grad_norm": 8.531528119035794, + "learning_rate": 4.93980940913815e-06, + "loss": 1.2792, + "step": 1383 + }, + { + "epoch": 0.1959368584979118, + "grad_norm": 9.18557407667937, + "learning_rate": 4.939684323335864e-06, + "loss": 1.3971, + "step": 1384 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 14.209544259802602, + "learning_rate": 4.939559109281192e-06, + "loss": 1.3889, + "step": 1385 + }, + { + "epoch": 0.19622000424718625, + "grad_norm": 9.241449889337112, + "learning_rate": 4.939433766980717e-06, + "loss": 1.3047, + "step": 1386 + }, + { + "epoch": 0.19636157712182345, + "grad_norm": 8.97744977664523, + "learning_rate": 4.939308296441028e-06, + "loss": 1.3446, + "step": 1387 + }, + { + "epoch": 0.19650314999646068, + "grad_norm": 8.184111614730293, + "learning_rate": 4.939182697668721e-06, + "loss": 1.2236, + "step": 1388 + }, + { + "epoch": 0.1966447228710979, + "grad_norm": 9.471581768957849, + "learning_rate": 4.939056970670397e-06, + "loss": 1.6158, + "step": 1389 + }, + { + "epoch": 0.1967862957457351, + "grad_norm": 10.4675492694097, + "learning_rate": 4.938931115452668e-06, + "loss": 1.4516, + "step": 1390 + }, + { + "epoch": 0.19692786862037234, + "grad_norm": 10.746604205660734, + "learning_rate": 4.938805132022148e-06, + "loss": 1.4387, + "step": 1391 + }, + { + "epoch": 0.19706944149500955, + "grad_norm": 9.334096968155919, + "learning_rate": 4.9386790203854605e-06, + "loss": 1.2403, + "step": 1392 + }, + { + "epoch": 0.19721101436964678, + "grad_norm": 10.209141498157155, + "learning_rate": 4.938552780549236e-06, + "loss": 1.3922, + "step": 1393 + }, + { + "epoch": 0.197352587244284, + "grad_norm": 8.658130648030172, + "learning_rate": 4.93842641252011e-06, + "loss": 1.3079, + "step": 1394 + }, + { + "epoch": 0.1974941601189212, + "grad_norm": 10.782423226472128, + "learning_rate": 4.938299916304725e-06, + "loss": 1.2893, + "step": 1395 + }, + { + "epoch": 0.19763573299355844, 
+ "grad_norm": 8.928600228399423, + "learning_rate": 4.938173291909732e-06, + "loss": 1.3279, + "step": 1396 + }, + { + "epoch": 0.19777730586819564, + "grad_norm": 10.50895200315378, + "learning_rate": 4.9380465393417875e-06, + "loss": 1.393, + "step": 1397 + }, + { + "epoch": 0.19791887874283287, + "grad_norm": 9.963414597020888, + "learning_rate": 4.937919658607554e-06, + "loss": 1.3332, + "step": 1398 + }, + { + "epoch": 0.1980604516174701, + "grad_norm": 9.79007836845289, + "learning_rate": 4.937792649713701e-06, + "loss": 1.3706, + "step": 1399 + }, + { + "epoch": 0.1982020244921073, + "grad_norm": 9.076923967184847, + "learning_rate": 4.937665512666907e-06, + "loss": 1.4893, + "step": 1400 + }, + { + "epoch": 0.19834359736674453, + "grad_norm": 10.10727260608377, + "learning_rate": 4.937538247473854e-06, + "loss": 1.4697, + "step": 1401 + }, + { + "epoch": 0.19848517024138176, + "grad_norm": 9.718934084997962, + "learning_rate": 4.9374108541412336e-06, + "loss": 1.3553, + "step": 1402 + }, + { + "epoch": 0.19862674311601897, + "grad_norm": 9.911916369734234, + "learning_rate": 4.937283332675741e-06, + "loss": 1.4497, + "step": 1403 + }, + { + "epoch": 0.1987683159906562, + "grad_norm": 11.253819135271124, + "learning_rate": 4.937155683084082e-06, + "loss": 1.5985, + "step": 1404 + }, + { + "epoch": 0.1989098888652934, + "grad_norm": 10.103666082088216, + "learning_rate": 4.937027905372965e-06, + "loss": 1.3939, + "step": 1405 + }, + { + "epoch": 0.19905146173993063, + "grad_norm": 10.927866502496574, + "learning_rate": 4.936899999549108e-06, + "loss": 1.5311, + "step": 1406 + }, + { + "epoch": 0.19919303461456786, + "grad_norm": 10.733651793437026, + "learning_rate": 4.936771965619236e-06, + "loss": 1.3847, + "step": 1407 + }, + { + "epoch": 0.19933460748920506, + "grad_norm": 9.43246750199912, + "learning_rate": 4.936643803590079e-06, + "loss": 1.4025, + "step": 1408 + }, + { + "epoch": 0.1994761803638423, + "grad_norm": 9.15864401496592, + "learning_rate": 4.936515513468373e-06, + "loss": 1.3563, + "step": 1409 + }, + { + "epoch": 0.1996177532384795, + "grad_norm": 8.389102279048686, + "learning_rate": 4.9363870952608634e-06, + "loss": 1.258, + "step": 1410 + }, + { + "epoch": 0.19975932611311673, + "grad_norm": 8.7190982653338, + "learning_rate": 4.936258548974301e-06, + "loss": 1.3672, + "step": 1411 + }, + { + "epoch": 0.19990089898775396, + "grad_norm": 11.76299460138393, + "learning_rate": 4.936129874615443e-06, + "loss": 1.4857, + "step": 1412 + }, + { + "epoch": 0.20004247186239116, + "grad_norm": 8.861578450357246, + "learning_rate": 4.9360010721910545e-06, + "loss": 1.3943, + "step": 1413 + }, + { + "epoch": 0.2001840447370284, + "grad_norm": 9.649234927193751, + "learning_rate": 4.935872141707906e-06, + "loss": 1.3514, + "step": 1414 + }, + { + "epoch": 0.20032561761166562, + "grad_norm": 10.492978609415738, + "learning_rate": 4.935743083172775e-06, + "loss": 1.3357, + "step": 1415 + }, + { + "epoch": 0.20046719048630282, + "grad_norm": 8.690188204677437, + "learning_rate": 4.935613896592446e-06, + "loss": 1.3472, + "step": 1416 + }, + { + "epoch": 0.20060876336094005, + "grad_norm": 10.656528144853313, + "learning_rate": 4.93548458197371e-06, + "loss": 1.3655, + "step": 1417 + }, + { + "epoch": 0.20075033623557725, + "grad_norm": 10.809358526318215, + "learning_rate": 4.935355139323367e-06, + "loss": 1.4321, + "step": 1418 + }, + { + "epoch": 0.20089190911021448, + "grad_norm": 8.865083554084443, + "learning_rate": 4.93522556864822e-06, + "loss": 1.2932, + "step": 1419 
+ }, + { + "epoch": 0.20103348198485171, + "grad_norm": 10.815000741000068, + "learning_rate": 4.935095869955079e-06, + "loss": 1.5396, + "step": 1420 + }, + { + "epoch": 0.20117505485948892, + "grad_norm": 9.005548674089031, + "learning_rate": 4.934966043250765e-06, + "loss": 1.3457, + "step": 1421 + }, + { + "epoch": 0.20131662773412615, + "grad_norm": 9.060395516021266, + "learning_rate": 4.934836088542102e-06, + "loss": 1.2628, + "step": 1422 + }, + { + "epoch": 0.20145820060876335, + "grad_norm": 8.312645473497977, + "learning_rate": 4.934706005835921e-06, + "loss": 1.4745, + "step": 1423 + }, + { + "epoch": 0.20159977348340058, + "grad_norm": 10.644330918413031, + "learning_rate": 4.9345757951390605e-06, + "loss": 1.3411, + "step": 1424 + }, + { + "epoch": 0.2017413463580378, + "grad_norm": 11.138239151631005, + "learning_rate": 4.934445456458366e-06, + "loss": 1.2896, + "step": 1425 + }, + { + "epoch": 0.201882919232675, + "grad_norm": 10.525070777347013, + "learning_rate": 4.934314989800689e-06, + "loss": 1.4747, + "step": 1426 + }, + { + "epoch": 0.20202449210731224, + "grad_norm": 8.294145785376697, + "learning_rate": 4.934184395172888e-06, + "loss": 1.3776, + "step": 1427 + }, + { + "epoch": 0.20216606498194944, + "grad_norm": 8.686104861419429, + "learning_rate": 4.934053672581828e-06, + "loss": 1.3788, + "step": 1428 + }, + { + "epoch": 0.20230763785658668, + "grad_norm": 11.833327888881524, + "learning_rate": 4.933922822034381e-06, + "loss": 1.4218, + "step": 1429 + }, + { + "epoch": 0.2024492107312239, + "grad_norm": 10.166543011356309, + "learning_rate": 4.933791843537427e-06, + "loss": 1.3424, + "step": 1430 + }, + { + "epoch": 0.2025907836058611, + "grad_norm": 7.839016523372746, + "learning_rate": 4.933660737097851e-06, + "loss": 1.2105, + "step": 1431 + }, + { + "epoch": 0.20273235648049834, + "grad_norm": 9.891923912536239, + "learning_rate": 4.933529502722544e-06, + "loss": 1.4459, + "step": 1432 + }, + { + "epoch": 0.20287392935513557, + "grad_norm": 8.607093245584341, + "learning_rate": 4.933398140418405e-06, + "loss": 1.3991, + "step": 1433 + }, + { + "epoch": 0.20301550222977277, + "grad_norm": 9.503408523238928, + "learning_rate": 4.933266650192341e-06, + "loss": 1.4349, + "step": 1434 + }, + { + "epoch": 0.20315707510441, + "grad_norm": 9.388264541884109, + "learning_rate": 4.933135032051263e-06, + "loss": 1.4349, + "step": 1435 + }, + { + "epoch": 0.2032986479790472, + "grad_norm": 9.108658044255888, + "learning_rate": 4.933003286002091e-06, + "loss": 1.2488, + "step": 1436 + }, + { + "epoch": 0.20344022085368443, + "grad_norm": 10.038995622345332, + "learning_rate": 4.932871412051749e-06, + "loss": 1.3661, + "step": 1437 + }, + { + "epoch": 0.20358179372832166, + "grad_norm": 9.773724765962118, + "learning_rate": 4.932739410207172e-06, + "loss": 1.4237, + "step": 1438 + }, + { + "epoch": 0.20372336660295887, + "grad_norm": 9.72634875204394, + "learning_rate": 4.932607280475299e-06, + "loss": 1.3969, + "step": 1439 + }, + { + "epoch": 0.2038649394775961, + "grad_norm": 8.614924641266866, + "learning_rate": 4.932475022863074e-06, + "loss": 1.4623, + "step": 1440 + }, + { + "epoch": 0.2040065123522333, + "grad_norm": 9.338927999073288, + "learning_rate": 4.932342637377451e-06, + "loss": 1.3346, + "step": 1441 + }, + { + "epoch": 0.20414808522687053, + "grad_norm": 9.639524919508643, + "learning_rate": 4.93221012402539e-06, + "loss": 1.2798, + "step": 1442 + }, + { + "epoch": 0.20428965810150776, + "grad_norm": 11.458817091469005, + "learning_rate": 
4.9320774828138555e-06, + "loss": 1.3737, + "step": 1443 + }, + { + "epoch": 0.20443123097614496, + "grad_norm": 9.541156560819365, + "learning_rate": 4.931944713749821e-06, + "loss": 1.214, + "step": 1444 + }, + { + "epoch": 0.2045728038507822, + "grad_norm": 13.484618855522426, + "learning_rate": 4.9318118168402665e-06, + "loss": 1.3556, + "step": 1445 + }, + { + "epoch": 0.20471437672541942, + "grad_norm": 9.414953115288782, + "learning_rate": 4.931678792092177e-06, + "loss": 1.3065, + "step": 1446 + }, + { + "epoch": 0.20485594960005662, + "grad_norm": 8.550091516551518, + "learning_rate": 4.9315456395125475e-06, + "loss": 1.3077, + "step": 1447 + }, + { + "epoch": 0.20499752247469386, + "grad_norm": 8.23908598339627, + "learning_rate": 4.931412359108377e-06, + "loss": 1.2783, + "step": 1448 + }, + { + "epoch": 0.20513909534933106, + "grad_norm": 9.594267293795038, + "learning_rate": 4.931278950886671e-06, + "loss": 1.405, + "step": 1449 + }, + { + "epoch": 0.2052806682239683, + "grad_norm": 9.335629407834787, + "learning_rate": 4.931145414854444e-06, + "loss": 1.3645, + "step": 1450 + }, + { + "epoch": 0.20542224109860552, + "grad_norm": 10.11307416683987, + "learning_rate": 4.931011751018715e-06, + "loss": 1.4034, + "step": 1451 + }, + { + "epoch": 0.20556381397324272, + "grad_norm": 10.46357986547024, + "learning_rate": 4.930877959386511e-06, + "loss": 1.4326, + "step": 1452 + }, + { + "epoch": 0.20570538684787995, + "grad_norm": 9.78150427734581, + "learning_rate": 4.930744039964866e-06, + "loss": 1.2919, + "step": 1453 + }, + { + "epoch": 0.20584695972251715, + "grad_norm": 8.010249724382671, + "learning_rate": 4.930609992760818e-06, + "loss": 1.3415, + "step": 1454 + }, + { + "epoch": 0.20598853259715438, + "grad_norm": 9.774920575924133, + "learning_rate": 4.930475817781415e-06, + "loss": 1.3131, + "step": 1455 + }, + { + "epoch": 0.2061301054717916, + "grad_norm": 9.266830803311725, + "learning_rate": 4.930341515033712e-06, + "loss": 1.3776, + "step": 1456 + }, + { + "epoch": 0.20627167834642882, + "grad_norm": 9.520081427089686, + "learning_rate": 4.930207084524766e-06, + "loss": 1.3398, + "step": 1457 + }, + { + "epoch": 0.20641325122106605, + "grad_norm": 9.454012742406992, + "learning_rate": 4.930072526261647e-06, + "loss": 1.4492, + "step": 1458 + }, + { + "epoch": 0.20655482409570328, + "grad_norm": 10.682764621620075, + "learning_rate": 4.9299378402514265e-06, + "loss": 1.4325, + "step": 1459 + }, + { + "epoch": 0.20669639697034048, + "grad_norm": 9.799595991876599, + "learning_rate": 4.9298030265011856e-06, + "loss": 1.5617, + "step": 1460 + }, + { + "epoch": 0.2068379698449777, + "grad_norm": 9.761590572887972, + "learning_rate": 4.929668085018011e-06, + "loss": 1.3869, + "step": 1461 + }, + { + "epoch": 0.2069795427196149, + "grad_norm": 7.827007526121643, + "learning_rate": 4.929533015808997e-06, + "loss": 1.282, + "step": 1462 + }, + { + "epoch": 0.20712111559425214, + "grad_norm": 10.306805882404044, + "learning_rate": 4.929397818881244e-06, + "loss": 1.2548, + "step": 1463 + }, + { + "epoch": 0.20726268846888937, + "grad_norm": 9.453074557587525, + "learning_rate": 4.929262494241859e-06, + "loss": 1.3542, + "step": 1464 + }, + { + "epoch": 0.20740426134352657, + "grad_norm": 10.337846898228419, + "learning_rate": 4.929127041897957e-06, + "loss": 1.462, + "step": 1465 + }, + { + "epoch": 0.2075458342181638, + "grad_norm": 8.907188821132765, + "learning_rate": 4.928991461856656e-06, + "loss": 1.2085, + "step": 1466 + }, + { + "epoch": 0.207687407092801, + 
"grad_norm": 10.152215995733318, + "learning_rate": 4.928855754125086e-06, + "loss": 1.3402, + "step": 1467 + }, + { + "epoch": 0.20782897996743824, + "grad_norm": 7.677415140987904, + "learning_rate": 4.92871991871038e-06, + "loss": 1.25, + "step": 1468 + }, + { + "epoch": 0.20797055284207547, + "grad_norm": 11.047024176856974, + "learning_rate": 4.928583955619678e-06, + "loss": 1.4294, + "step": 1469 + }, + { + "epoch": 0.20811212571671267, + "grad_norm": 9.63685333813596, + "learning_rate": 4.928447864860129e-06, + "loss": 1.4023, + "step": 1470 + }, + { + "epoch": 0.2082536985913499, + "grad_norm": 9.649631837136853, + "learning_rate": 4.928311646438887e-06, + "loss": 1.4555, + "step": 1471 + }, + { + "epoch": 0.2083952714659871, + "grad_norm": 10.339229091335824, + "learning_rate": 4.9281753003631114e-06, + "loss": 1.4349, + "step": 1472 + }, + { + "epoch": 0.20853684434062433, + "grad_norm": 10.985794605101114, + "learning_rate": 4.928038826639971e-06, + "loss": 1.3927, + "step": 1473 + }, + { + "epoch": 0.20867841721526156, + "grad_norm": 8.813004283139145, + "learning_rate": 4.92790222527664e-06, + "loss": 1.2054, + "step": 1474 + }, + { + "epoch": 0.20881999008989877, + "grad_norm": 11.576492668123501, + "learning_rate": 4.927765496280299e-06, + "loss": 1.3388, + "step": 1475 + }, + { + "epoch": 0.208961562964536, + "grad_norm": 9.990122685409386, + "learning_rate": 4.927628639658137e-06, + "loss": 1.4458, + "step": 1476 + }, + { + "epoch": 0.20910313583917323, + "grad_norm": 10.015028151702547, + "learning_rate": 4.927491655417347e-06, + "loss": 1.4468, + "step": 1477 + }, + { + "epoch": 0.20924470871381043, + "grad_norm": 10.625184539987485, + "learning_rate": 4.927354543565131e-06, + "loss": 1.5867, + "step": 1478 + }, + { + "epoch": 0.20938628158844766, + "grad_norm": 10.03079821558568, + "learning_rate": 4.927217304108696e-06, + "loss": 1.4135, + "step": 1479 + }, + { + "epoch": 0.20952785446308486, + "grad_norm": 12.49812791156921, + "learning_rate": 4.927079937055257e-06, + "loss": 1.4131, + "step": 1480 + }, + { + "epoch": 0.2096694273377221, + "grad_norm": 11.975690378186123, + "learning_rate": 4.926942442412036e-06, + "loss": 1.4465, + "step": 1481 + }, + { + "epoch": 0.20981100021235932, + "grad_norm": 10.92930810691147, + "learning_rate": 4.92680482018626e-06, + "loss": 1.336, + "step": 1482 + }, + { + "epoch": 0.20995257308699652, + "grad_norm": 9.474153393286983, + "learning_rate": 4.9266670703851645e-06, + "loss": 1.3764, + "step": 1483 + }, + { + "epoch": 0.21009414596163375, + "grad_norm": 11.55842741304782, + "learning_rate": 4.92652919301599e-06, + "loss": 1.3923, + "step": 1484 + }, + { + "epoch": 0.21023571883627096, + "grad_norm": 11.010146576531975, + "learning_rate": 4.9263911880859855e-06, + "loss": 1.4297, + "step": 1485 + }, + { + "epoch": 0.2103772917109082, + "grad_norm": 8.994491162753759, + "learning_rate": 4.926253055602405e-06, + "loss": 1.463, + "step": 1486 + }, + { + "epoch": 0.21051886458554542, + "grad_norm": 11.51499028313542, + "learning_rate": 4.926114795572511e-06, + "loss": 1.4958, + "step": 1487 + }, + { + "epoch": 0.21066043746018262, + "grad_norm": 10.013220630863167, + "learning_rate": 4.925976408003571e-06, + "loss": 1.4517, + "step": 1488 + }, + { + "epoch": 0.21080201033481985, + "grad_norm": 8.711222581517825, + "learning_rate": 4.92583789290286e-06, + "loss": 1.3842, + "step": 1489 + }, + { + "epoch": 0.21094358320945708, + "grad_norm": 9.893552325859085, + "learning_rate": 4.9256992502776605e-06, + "loss": 1.4708, + "step": 1490 
+ }, + { + "epoch": 0.21108515608409428, + "grad_norm": 9.82769920021081, + "learning_rate": 4.925560480135258e-06, + "loss": 1.531, + "step": 1491 + }, + { + "epoch": 0.2112267289587315, + "grad_norm": 10.151662500589689, + "learning_rate": 4.925421582482952e-06, + "loss": 1.4268, + "step": 1492 + }, + { + "epoch": 0.21136830183336872, + "grad_norm": 10.37730804876039, + "learning_rate": 4.925282557328041e-06, + "loss": 1.4085, + "step": 1493 + }, + { + "epoch": 0.21150987470800595, + "grad_norm": 9.263557602275899, + "learning_rate": 4.925143404677835e-06, + "loss": 1.4033, + "step": 1494 + }, + { + "epoch": 0.21165144758264318, + "grad_norm": 11.239835767530355, + "learning_rate": 4.925004124539648e-06, + "loss": 1.5579, + "step": 1495 + }, + { + "epoch": 0.21179302045728038, + "grad_norm": 8.403023820397742, + "learning_rate": 4.924864716920801e-06, + "loss": 1.363, + "step": 1496 + }, + { + "epoch": 0.2119345933319176, + "grad_norm": 9.122488433922555, + "learning_rate": 4.9247251818286255e-06, + "loss": 1.4555, + "step": 1497 + }, + { + "epoch": 0.2120761662065548, + "grad_norm": 10.882268121143184, + "learning_rate": 4.924585519270454e-06, + "loss": 1.4714, + "step": 1498 + }, + { + "epoch": 0.21221773908119204, + "grad_norm": 9.522579864864083, + "learning_rate": 4.9244457292536305e-06, + "loss": 1.4043, + "step": 1499 + }, + { + "epoch": 0.21235931195582927, + "grad_norm": 7.675962644962224, + "learning_rate": 4.924305811785502e-06, + "loss": 1.3249, + "step": 1500 + }, + { + "epoch": 0.21250088483046647, + "grad_norm": 11.385660060131903, + "learning_rate": 4.9241657668734256e-06, + "loss": 1.3023, + "step": 1501 + }, + { + "epoch": 0.2126424577051037, + "grad_norm": 9.791932923504836, + "learning_rate": 4.9240255945247616e-06, + "loss": 1.4569, + "step": 1502 + }, + { + "epoch": 0.21278403057974093, + "grad_norm": 11.497188722063415, + "learning_rate": 4.9238852947468796e-06, + "loss": 1.3869, + "step": 1503 + }, + { + "epoch": 0.21292560345437814, + "grad_norm": 11.432940465407265, + "learning_rate": 4.9237448675471555e-06, + "loss": 1.4429, + "step": 1504 + }, + { + "epoch": 0.21306717632901537, + "grad_norm": 8.152751771653257, + "learning_rate": 4.9236043129329705e-06, + "loss": 1.4326, + "step": 1505 + }, + { + "epoch": 0.21320874920365257, + "grad_norm": 9.175219990018462, + "learning_rate": 4.923463630911714e-06, + "loss": 1.2867, + "step": 1506 + }, + { + "epoch": 0.2133503220782898, + "grad_norm": 10.35566398698452, + "learning_rate": 4.9233228214907815e-06, + "loss": 1.4544, + "step": 1507 + }, + { + "epoch": 0.21349189495292703, + "grad_norm": 8.200842316327368, + "learning_rate": 4.923181884677574e-06, + "loss": 1.2899, + "step": 1508 + }, + { + "epoch": 0.21363346782756423, + "grad_norm": 10.63916364093135, + "learning_rate": 4.923040820479504e-06, + "loss": 1.5962, + "step": 1509 + }, + { + "epoch": 0.21377504070220146, + "grad_norm": 9.402805039707358, + "learning_rate": 4.922899628903983e-06, + "loss": 1.5193, + "step": 1510 + }, + { + "epoch": 0.21391661357683867, + "grad_norm": 10.463490545674233, + "learning_rate": 4.9227583099584355e-06, + "loss": 1.485, + "step": 1511 + }, + { + "epoch": 0.2140581864514759, + "grad_norm": 10.342380951695795, + "learning_rate": 4.92261686365029e-06, + "loss": 1.2978, + "step": 1512 + }, + { + "epoch": 0.21419975932611313, + "grad_norm": 7.959717182521778, + "learning_rate": 4.9224752899869835e-06, + "loss": 1.2951, + "step": 1513 + }, + { + "epoch": 0.21434133220075033, + "grad_norm": 9.874261756052995, + "learning_rate": 
4.922333588975956e-06, + "loss": 1.4394, + "step": 1514 + }, + { + "epoch": 0.21448290507538756, + "grad_norm": 9.687301141482168, + "learning_rate": 4.922191760624659e-06, + "loss": 1.2053, + "step": 1515 + }, + { + "epoch": 0.2146244779500248, + "grad_norm": 9.969404719266992, + "learning_rate": 4.922049804940546e-06, + "loss": 1.365, + "step": 1516 + }, + { + "epoch": 0.214766050824662, + "grad_norm": 11.503286140969267, + "learning_rate": 4.9219077219310804e-06, + "loss": 1.5029, + "step": 1517 + }, + { + "epoch": 0.21490762369929922, + "grad_norm": 8.344084200791741, + "learning_rate": 4.921765511603733e-06, + "loss": 1.2568, + "step": 1518 + }, + { + "epoch": 0.21504919657393642, + "grad_norm": 10.369480365325689, + "learning_rate": 4.921623173965978e-06, + "loss": 1.258, + "step": 1519 + }, + { + "epoch": 0.21519076944857365, + "grad_norm": 10.343594368400561, + "learning_rate": 4.921480709025298e-06, + "loss": 1.3097, + "step": 1520 + }, + { + "epoch": 0.21533234232321088, + "grad_norm": 11.490199017506793, + "learning_rate": 4.921338116789183e-06, + "loss": 1.4997, + "step": 1521 + }, + { + "epoch": 0.2154739151978481, + "grad_norm": 11.184934524392238, + "learning_rate": 4.921195397265129e-06, + "loss": 1.5516, + "step": 1522 + }, + { + "epoch": 0.21561548807248532, + "grad_norm": 8.81242641120649, + "learning_rate": 4.921052550460638e-06, + "loss": 1.4745, + "step": 1523 + }, + { + "epoch": 0.21575706094712252, + "grad_norm": 12.776630788730003, + "learning_rate": 4.920909576383219e-06, + "loss": 1.3625, + "step": 1524 + }, + { + "epoch": 0.21589863382175975, + "grad_norm": 9.412588626196095, + "learning_rate": 4.920766475040389e-06, + "loss": 1.2281, + "step": 1525 + }, + { + "epoch": 0.21604020669639698, + "grad_norm": 11.305747024035359, + "learning_rate": 4.920623246439671e-06, + "loss": 1.704, + "step": 1526 + }, + { + "epoch": 0.21618177957103418, + "grad_norm": 8.806599257983905, + "learning_rate": 4.920479890588593e-06, + "loss": 1.504, + "step": 1527 + }, + { + "epoch": 0.2163233524456714, + "grad_norm": 12.800416486640007, + "learning_rate": 4.920336407494692e-06, + "loss": 1.4451, + "step": 1528 + }, + { + "epoch": 0.21646492532030862, + "grad_norm": 16.260756468495995, + "learning_rate": 4.920192797165511e-06, + "loss": 1.4871, + "step": 1529 + }, + { + "epoch": 0.21660649819494585, + "grad_norm": 9.968345131619689, + "learning_rate": 4.9200490596086e-06, + "loss": 1.3504, + "step": 1530 + }, + { + "epoch": 0.21674807106958308, + "grad_norm": 8.707142092862213, + "learning_rate": 4.919905194831514e-06, + "loss": 1.2746, + "step": 1531 + }, + { + "epoch": 0.21688964394422028, + "grad_norm": 10.074383086823635, + "learning_rate": 4.919761202841815e-06, + "loss": 1.1736, + "step": 1532 + }, + { + "epoch": 0.2170312168188575, + "grad_norm": 10.549729069851303, + "learning_rate": 4.919617083647074e-06, + "loss": 1.5386, + "step": 1533 + }, + { + "epoch": 0.21717278969349474, + "grad_norm": 9.286707856396614, + "learning_rate": 4.9194728372548685e-06, + "loss": 1.3205, + "step": 1534 + }, + { + "epoch": 0.21731436256813194, + "grad_norm": 10.474389586951514, + "learning_rate": 4.919328463672779e-06, + "loss": 1.2737, + "step": 1535 + }, + { + "epoch": 0.21745593544276917, + "grad_norm": 7.7907568524492135, + "learning_rate": 4.919183962908397e-06, + "loss": 1.2586, + "step": 1536 + }, + { + "epoch": 0.21759750831740637, + "grad_norm": 9.552640731561565, + "learning_rate": 4.919039334969317e-06, + "loss": 1.4414, + "step": 1537 + }, + { + "epoch": 0.2177390811920436, + 
"grad_norm": 8.932601760212025, + "learning_rate": 4.918894579863143e-06, + "loss": 1.3331, + "step": 1538 + }, + { + "epoch": 0.21788065406668083, + "grad_norm": 10.6544860990873, + "learning_rate": 4.9187496975974845e-06, + "loss": 1.3279, + "step": 1539 + }, + { + "epoch": 0.21802222694131804, + "grad_norm": 10.126579255695134, + "learning_rate": 4.918604688179959e-06, + "loss": 1.3122, + "step": 1540 + }, + { + "epoch": 0.21816379981595527, + "grad_norm": 9.577808766955373, + "learning_rate": 4.918459551618187e-06, + "loss": 1.3191, + "step": 1541 + }, + { + "epoch": 0.21830537269059247, + "grad_norm": 8.286966445341198, + "learning_rate": 4.9183142879198e-06, + "loss": 1.3315, + "step": 1542 + }, + { + "epoch": 0.2184469455652297, + "grad_norm": 10.14895771857571, + "learning_rate": 4.918168897092435e-06, + "loss": 1.2022, + "step": 1543 + }, + { + "epoch": 0.21858851843986693, + "grad_norm": 11.990143860883718, + "learning_rate": 4.9180233791437326e-06, + "loss": 1.3755, + "step": 1544 + }, + { + "epoch": 0.21873009131450413, + "grad_norm": 8.827266361036774, + "learning_rate": 4.917877734081345e-06, + "loss": 1.1649, + "step": 1545 + }, + { + "epoch": 0.21887166418914136, + "grad_norm": 8.717116117347544, + "learning_rate": 4.917731961912927e-06, + "loss": 1.3766, + "step": 1546 + }, + { + "epoch": 0.2190132370637786, + "grad_norm": 9.903261329528647, + "learning_rate": 4.917586062646144e-06, + "loss": 1.4392, + "step": 1547 + }, + { + "epoch": 0.2191548099384158, + "grad_norm": 9.025065380081525, + "learning_rate": 4.917440036288663e-06, + "loss": 1.2832, + "step": 1548 + }, + { + "epoch": 0.21929638281305303, + "grad_norm": 10.680669375241546, + "learning_rate": 4.917293882848162e-06, + "loss": 1.4145, + "step": 1549 + }, + { + "epoch": 0.21943795568769023, + "grad_norm": 10.0970132471772, + "learning_rate": 4.9171476023323245e-06, + "loss": 1.3864, + "step": 1550 + }, + { + "epoch": 0.21957952856232746, + "grad_norm": 9.670280263442997, + "learning_rate": 4.917001194748839e-06, + "loss": 1.3396, + "step": 1551 + }, + { + "epoch": 0.2197211014369647, + "grad_norm": 9.124816056579236, + "learning_rate": 4.916854660105404e-06, + "loss": 1.14, + "step": 1552 + }, + { + "epoch": 0.2198626743116019, + "grad_norm": 8.583098868220068, + "learning_rate": 4.916707998409721e-06, + "loss": 1.2965, + "step": 1553 + }, + { + "epoch": 0.22000424718623912, + "grad_norm": 8.499189899212224, + "learning_rate": 4.916561209669501e-06, + "loss": 1.3778, + "step": 1554 + }, + { + "epoch": 0.22014582006087632, + "grad_norm": 8.095214239988254, + "learning_rate": 4.9164142938924595e-06, + "loss": 1.2705, + "step": 1555 + }, + { + "epoch": 0.22028739293551355, + "grad_norm": 10.851570585287664, + "learning_rate": 4.916267251086321e-06, + "loss": 1.2945, + "step": 1556 + }, + { + "epoch": 0.22042896581015078, + "grad_norm": 9.497433767547173, + "learning_rate": 4.916120081258814e-06, + "loss": 1.3682, + "step": 1557 + }, + { + "epoch": 0.220570538684788, + "grad_norm": 10.39243331891458, + "learning_rate": 4.915972784417676e-06, + "loss": 1.4524, + "step": 1558 + }, + { + "epoch": 0.22071211155942522, + "grad_norm": 9.775144969128615, + "learning_rate": 4.91582536057065e-06, + "loss": 1.2462, + "step": 1559 + }, + { + "epoch": 0.22085368443406245, + "grad_norm": 9.66073514111617, + "learning_rate": 4.915677809725487e-06, + "loss": 1.4234, + "step": 1560 + }, + { + "epoch": 0.22099525730869965, + "grad_norm": 8.891409733738664, + "learning_rate": 4.915530131889942e-06, + "loss": 1.2246, + "step": 1561 + }, 
+ { + "epoch": 0.22113683018333688, + "grad_norm": 13.191685595391373, + "learning_rate": 4.915382327071778e-06, + "loss": 1.5836, + "step": 1562 + }, + { + "epoch": 0.22127840305797408, + "grad_norm": 9.54100303063396, + "learning_rate": 4.915234395278768e-06, + "loss": 1.4657, + "step": 1563 + }, + { + "epoch": 0.2214199759326113, + "grad_norm": 8.921518402188381, + "learning_rate": 4.915086336518686e-06, + "loss": 1.3817, + "step": 1564 + }, + { + "epoch": 0.22156154880724854, + "grad_norm": 9.487431292982443, + "learning_rate": 4.914938150799315e-06, + "loss": 1.3946, + "step": 1565 + }, + { + "epoch": 0.22170312168188575, + "grad_norm": 9.859678333289454, + "learning_rate": 4.914789838128447e-06, + "loss": 1.4318, + "step": 1566 + }, + { + "epoch": 0.22184469455652298, + "grad_norm": 10.066287257979292, + "learning_rate": 4.914641398513879e-06, + "loss": 1.3646, + "step": 1567 + }, + { + "epoch": 0.22198626743116018, + "grad_norm": 8.845870232261811, + "learning_rate": 4.914492831963411e-06, + "loss": 1.2332, + "step": 1568 + }, + { + "epoch": 0.2221278403057974, + "grad_norm": 9.381856623484143, + "learning_rate": 4.914344138484856e-06, + "loss": 1.2141, + "step": 1569 + }, + { + "epoch": 0.22226941318043464, + "grad_norm": 10.420039574370403, + "learning_rate": 4.91419531808603e-06, + "loss": 1.3892, + "step": 1570 + }, + { + "epoch": 0.22241098605507184, + "grad_norm": 9.315888268488251, + "learning_rate": 4.914046370774757e-06, + "loss": 1.4113, + "step": 1571 + }, + { + "epoch": 0.22255255892970907, + "grad_norm": 11.676591484516749, + "learning_rate": 4.913897296558865e-06, + "loss": 1.463, + "step": 1572 + }, + { + "epoch": 0.22269413180434627, + "grad_norm": 11.136296398405735, + "learning_rate": 4.913748095446192e-06, + "loss": 1.5605, + "step": 1573 + }, + { + "epoch": 0.2228357046789835, + "grad_norm": 10.908263799350056, + "learning_rate": 4.9135987674445815e-06, + "loss": 1.4836, + "step": 1574 + }, + { + "epoch": 0.22297727755362073, + "grad_norm": 9.158242070787844, + "learning_rate": 4.913449312561884e-06, + "loss": 1.502, + "step": 1575 + }, + { + "epoch": 0.22311885042825794, + "grad_norm": 9.901404895705024, + "learning_rate": 4.913299730805956e-06, + "loss": 1.3387, + "step": 1576 + }, + { + "epoch": 0.22326042330289517, + "grad_norm": 8.824922071766634, + "learning_rate": 4.913150022184659e-06, + "loss": 1.232, + "step": 1577 + }, + { + "epoch": 0.2234019961775324, + "grad_norm": 10.485567255351706, + "learning_rate": 4.913000186705866e-06, + "loss": 1.4787, + "step": 1578 + }, + { + "epoch": 0.2235435690521696, + "grad_norm": 9.56291287596723, + "learning_rate": 4.912850224377452e-06, + "loss": 1.2489, + "step": 1579 + }, + { + "epoch": 0.22368514192680683, + "grad_norm": 10.293026565302522, + "learning_rate": 4.912700135207301e-06, + "loss": 1.3544, + "step": 1580 + }, + { + "epoch": 0.22382671480144403, + "grad_norm": 9.67762499149839, + "learning_rate": 4.9125499192033035e-06, + "loss": 1.4636, + "step": 1581 + }, + { + "epoch": 0.22396828767608126, + "grad_norm": 9.821962107248758, + "learning_rate": 4.912399576373354e-06, + "loss": 1.3858, + "step": 1582 + }, + { + "epoch": 0.2241098605507185, + "grad_norm": 8.301830815998416, + "learning_rate": 4.9122491067253586e-06, + "loss": 1.4359, + "step": 1583 + }, + { + "epoch": 0.2242514334253557, + "grad_norm": 9.757385195682991, + "learning_rate": 4.912098510267226e-06, + "loss": 1.3356, + "step": 1584 + }, + { + "epoch": 0.22439300629999293, + "grad_norm": 8.647216490139689, + "learning_rate": 
4.911947787006873e-06, + "loss": 1.2541, + "step": 1585 + }, + { + "epoch": 0.22453457917463013, + "grad_norm": 9.041208235081065, + "learning_rate": 4.911796936952224e-06, + "loss": 1.3042, + "step": 1586 + }, + { + "epoch": 0.22467615204926736, + "grad_norm": 9.185618512837276, + "learning_rate": 4.911645960111208e-06, + "loss": 1.4306, + "step": 1587 + }, + { + "epoch": 0.2248177249239046, + "grad_norm": 10.021144637402955, + "learning_rate": 4.911494856491762e-06, + "loss": 1.5394, + "step": 1588 + }, + { + "epoch": 0.2249592977985418, + "grad_norm": 9.980277640048945, + "learning_rate": 4.91134362610183e-06, + "loss": 1.4319, + "step": 1589 + }, + { + "epoch": 0.22510087067317902, + "grad_norm": 9.069226524405297, + "learning_rate": 4.9111922689493605e-06, + "loss": 1.3635, + "step": 1590 + }, + { + "epoch": 0.22524244354781625, + "grad_norm": 7.0769618680777535, + "learning_rate": 4.911040785042313e-06, + "loss": 1.3485, + "step": 1591 + }, + { + "epoch": 0.22538401642245345, + "grad_norm": 10.317627158207513, + "learning_rate": 4.910889174388647e-06, + "loss": 1.5086, + "step": 1592 + }, + { + "epoch": 0.22552558929709068, + "grad_norm": 11.564739938228927, + "learning_rate": 4.910737436996335e-06, + "loss": 1.3707, + "step": 1593 + }, + { + "epoch": 0.2256671621717279, + "grad_norm": 8.329061659481301, + "learning_rate": 4.910585572873355e-06, + "loss": 1.2721, + "step": 1594 + }, + { + "epoch": 0.22580873504636512, + "grad_norm": 10.209611919025589, + "learning_rate": 4.910433582027688e-06, + "loss": 1.3935, + "step": 1595 + }, + { + "epoch": 0.22595030792100235, + "grad_norm": 11.074631068713101, + "learning_rate": 4.910281464467325e-06, + "loss": 1.309, + "step": 1596 + }, + { + "epoch": 0.22609188079563955, + "grad_norm": 8.066686681717352, + "learning_rate": 4.910129220200263e-06, + "loss": 1.2175, + "step": 1597 + }, + { + "epoch": 0.22623345367027678, + "grad_norm": 9.460936693589973, + "learning_rate": 4.909976849234504e-06, + "loss": 1.4403, + "step": 1598 + }, + { + "epoch": 0.22637502654491398, + "grad_norm": 7.817846315213723, + "learning_rate": 4.90982435157806e-06, + "loss": 1.2833, + "step": 1599 + }, + { + "epoch": 0.2265165994195512, + "grad_norm": 13.046914179394765, + "learning_rate": 4.909671727238946e-06, + "loss": 1.381, + "step": 1600 + }, + { + "epoch": 0.22665817229418844, + "grad_norm": 11.524448446121172, + "learning_rate": 4.909518976225186e-06, + "loss": 1.3667, + "step": 1601 + }, + { + "epoch": 0.22679974516882564, + "grad_norm": 8.563768724435814, + "learning_rate": 4.90936609854481e-06, + "loss": 1.2994, + "step": 1602 + }, + { + "epoch": 0.22694131804346288, + "grad_norm": 11.121294754725476, + "learning_rate": 4.909213094205855e-06, + "loss": 1.5774, + "step": 1603 + }, + { + "epoch": 0.2270828909181001, + "grad_norm": 9.873572137671983, + "learning_rate": 4.909059963216363e-06, + "loss": 1.4187, + "step": 1604 + }, + { + "epoch": 0.2272244637927373, + "grad_norm": 9.402712945857036, + "learning_rate": 4.908906705584387e-06, + "loss": 1.5649, + "step": 1605 + }, + { + "epoch": 0.22736603666737454, + "grad_norm": 10.690728664704022, + "learning_rate": 4.90875332131798e-06, + "loss": 1.4951, + "step": 1606 + }, + { + "epoch": 0.22750760954201174, + "grad_norm": 9.96713157598651, + "learning_rate": 4.908599810425208e-06, + "loss": 1.4153, + "step": 1607 + }, + { + "epoch": 0.22764918241664897, + "grad_norm": 9.447231101019717, + "learning_rate": 4.90844617291414e-06, + "loss": 1.4038, + "step": 1608 + }, + { + "epoch": 0.2277907552912862, + 
"grad_norm": 7.696831722656702, + "learning_rate": 4.908292408792852e-06, + "loss": 1.1037, + "step": 1609 + }, + { + "epoch": 0.2279323281659234, + "grad_norm": 12.810625339226467, + "learning_rate": 4.908138518069428e-06, + "loss": 1.4156, + "step": 1610 + }, + { + "epoch": 0.22807390104056063, + "grad_norm": 11.009752978361107, + "learning_rate": 4.907984500751956e-06, + "loss": 1.3505, + "step": 1611 + }, + { + "epoch": 0.22821547391519784, + "grad_norm": 8.448708556313873, + "learning_rate": 4.907830356848537e-06, + "loss": 1.3187, + "step": 1612 + }, + { + "epoch": 0.22835704678983507, + "grad_norm": 8.473776814009934, + "learning_rate": 4.907676086367269e-06, + "loss": 1.3345, + "step": 1613 + }, + { + "epoch": 0.2284986196644723, + "grad_norm": 9.500966876167459, + "learning_rate": 4.907521689316265e-06, + "loss": 1.3949, + "step": 1614 + }, + { + "epoch": 0.2286401925391095, + "grad_norm": 9.174339782077602, + "learning_rate": 4.907367165703643e-06, + "loss": 1.4587, + "step": 1615 + }, + { + "epoch": 0.22878176541374673, + "grad_norm": 13.026373886311438, + "learning_rate": 4.907212515537522e-06, + "loss": 1.4158, + "step": 1616 + }, + { + "epoch": 0.22892333828838393, + "grad_norm": 10.207110854669489, + "learning_rate": 4.907057738826034e-06, + "loss": 1.361, + "step": 1617 + }, + { + "epoch": 0.22906491116302116, + "grad_norm": 8.539587397379364, + "learning_rate": 4.906902835577316e-06, + "loss": 1.391, + "step": 1618 + }, + { + "epoch": 0.2292064840376584, + "grad_norm": 9.698584730586582, + "learning_rate": 4.906747805799511e-06, + "loss": 1.3132, + "step": 1619 + }, + { + "epoch": 0.2293480569122956, + "grad_norm": 11.331294007171163, + "learning_rate": 4.906592649500767e-06, + "loss": 1.5416, + "step": 1620 + }, + { + "epoch": 0.22948962978693282, + "grad_norm": 10.57046262827608, + "learning_rate": 4.906437366689244e-06, + "loss": 1.6555, + "step": 1621 + }, + { + "epoch": 0.22963120266157006, + "grad_norm": 10.152667261702979, + "learning_rate": 4.9062819573731015e-06, + "loss": 1.2847, + "step": 1622 + }, + { + "epoch": 0.22977277553620726, + "grad_norm": 8.514039954262222, + "learning_rate": 4.906126421560511e-06, + "loss": 1.3593, + "step": 1623 + }, + { + "epoch": 0.2299143484108445, + "grad_norm": 10.430494687280499, + "learning_rate": 4.905970759259648e-06, + "loss": 1.3334, + "step": 1624 + }, + { + "epoch": 0.2300559212854817, + "grad_norm": 8.974351575799485, + "learning_rate": 4.905814970478697e-06, + "loss": 1.2964, + "step": 1625 + }, + { + "epoch": 0.23019749416011892, + "grad_norm": 10.66846864102488, + "learning_rate": 4.905659055225847e-06, + "loss": 1.4034, + "step": 1626 + }, + { + "epoch": 0.23033906703475615, + "grad_norm": 12.024542824030904, + "learning_rate": 4.905503013509293e-06, + "loss": 1.2904, + "step": 1627 + }, + { + "epoch": 0.23048063990939335, + "grad_norm": 8.517342770409815, + "learning_rate": 4.90534684533724e-06, + "loss": 1.3125, + "step": 1628 + }, + { + "epoch": 0.23062221278403058, + "grad_norm": 10.118128952938918, + "learning_rate": 4.905190550717897e-06, + "loss": 1.3416, + "step": 1629 + }, + { + "epoch": 0.23076378565866779, + "grad_norm": 10.58166715066676, + "learning_rate": 4.90503412965948e-06, + "loss": 1.4196, + "step": 1630 + }, + { + "epoch": 0.23090535853330502, + "grad_norm": 10.459125341895492, + "learning_rate": 4.904877582170212e-06, + "loss": 1.2175, + "step": 1631 + }, + { + "epoch": 0.23104693140794225, + "grad_norm": 8.637466046367328, + "learning_rate": 4.904720908258323e-06, + "loss": 1.4072, + "step": 
1632 + }, + { + "epoch": 0.23118850428257945, + "grad_norm": 8.981553565133337, + "learning_rate": 4.904564107932048e-06, + "loss": 1.3897, + "step": 1633 + }, + { + "epoch": 0.23133007715721668, + "grad_norm": 8.679080093789553, + "learning_rate": 4.904407181199631e-06, + "loss": 1.2936, + "step": 1634 + }, + { + "epoch": 0.2314716500318539, + "grad_norm": 9.810835581743877, + "learning_rate": 4.904250128069322e-06, + "loss": 1.2341, + "step": 1635 + }, + { + "epoch": 0.2316132229064911, + "grad_norm": 12.685215335592517, + "learning_rate": 4.904092948549376e-06, + "loss": 1.2228, + "step": 1636 + }, + { + "epoch": 0.23175479578112834, + "grad_norm": 8.812257587534678, + "learning_rate": 4.9039356426480565e-06, + "loss": 1.3251, + "step": 1637 + }, + { + "epoch": 0.23189636865576554, + "grad_norm": 10.319637903174545, + "learning_rate": 4.903778210373632e-06, + "loss": 1.3607, + "step": 1638 + }, + { + "epoch": 0.23203794153040277, + "grad_norm": 8.831457214732149, + "learning_rate": 4.90362065173438e-06, + "loss": 1.5066, + "step": 1639 + }, + { + "epoch": 0.23217951440504, + "grad_norm": 8.415631926229512, + "learning_rate": 4.9034629667385825e-06, + "loss": 1.3136, + "step": 1640 + }, + { + "epoch": 0.2323210872796772, + "grad_norm": 10.356234205253495, + "learning_rate": 4.903305155394529e-06, + "loss": 1.2954, + "step": 1641 + }, + { + "epoch": 0.23246266015431444, + "grad_norm": 10.94311030776935, + "learning_rate": 4.903147217710515e-06, + "loss": 1.4084, + "step": 1642 + }, + { + "epoch": 0.23260423302895164, + "grad_norm": 9.512761428990155, + "learning_rate": 4.902989153694843e-06, + "loss": 1.3351, + "step": 1643 + }, + { + "epoch": 0.23274580590358887, + "grad_norm": 9.122249240915252, + "learning_rate": 4.902830963355825e-06, + "loss": 1.5306, + "step": 1644 + }, + { + "epoch": 0.2328873787782261, + "grad_norm": 9.457005837558793, + "learning_rate": 4.902672646701774e-06, + "loss": 1.1273, + "step": 1645 + }, + { + "epoch": 0.2330289516528633, + "grad_norm": 11.188009751283927, + "learning_rate": 4.902514203741013e-06, + "loss": 1.2471, + "step": 1646 + }, + { + "epoch": 0.23317052452750053, + "grad_norm": 11.38010166330633, + "learning_rate": 4.902355634481872e-06, + "loss": 1.3883, + "step": 1647 + }, + { + "epoch": 0.23331209740213776, + "grad_norm": 9.37294655245592, + "learning_rate": 4.9021969389326866e-06, + "loss": 1.4865, + "step": 1648 + }, + { + "epoch": 0.23345367027677497, + "grad_norm": 8.538683660397764, + "learning_rate": 4.902038117101798e-06, + "loss": 1.2729, + "step": 1649 + }, + { + "epoch": 0.2335952431514122, + "grad_norm": 10.898910019998487, + "learning_rate": 4.901879168997559e-06, + "loss": 1.1859, + "step": 1650 + }, + { + "epoch": 0.2337368160260494, + "grad_norm": 9.525670353886804, + "learning_rate": 4.901720094628322e-06, + "loss": 1.2875, + "step": 1651 + }, + { + "epoch": 0.23387838890068663, + "grad_norm": 8.954051663840815, + "learning_rate": 4.901560894002449e-06, + "loss": 1.4121, + "step": 1652 + }, + { + "epoch": 0.23401996177532386, + "grad_norm": 8.598487685547711, + "learning_rate": 4.9014015671283124e-06, + "loss": 1.3599, + "step": 1653 + }, + { + "epoch": 0.23416153464996106, + "grad_norm": 8.721104020390575, + "learning_rate": 4.901242114014285e-06, + "loss": 1.3055, + "step": 1654 + }, + { + "epoch": 0.2343031075245983, + "grad_norm": 7.788706443247301, + "learning_rate": 4.901082534668751e-06, + "loss": 1.2268, + "step": 1655 + }, + { + "epoch": 0.2344446803992355, + "grad_norm": 9.324060011216865, + "learning_rate": 
4.900922829100097e-06, + "loss": 1.3695, + "step": 1656 + }, + { + "epoch": 0.23458625327387272, + "grad_norm": 9.354517419663356, + "learning_rate": 4.900762997316722e-06, + "loss": 1.362, + "step": 1657 + }, + { + "epoch": 0.23472782614850995, + "grad_norm": 10.500341863970617, + "learning_rate": 4.900603039327024e-06, + "loss": 1.3581, + "step": 1658 + }, + { + "epoch": 0.23486939902314716, + "grad_norm": 15.164161111863805, + "learning_rate": 4.9004429551394155e-06, + "loss": 1.6463, + "step": 1659 + }, + { + "epoch": 0.2350109718977844, + "grad_norm": 9.727202931350604, + "learning_rate": 4.900282744762311e-06, + "loss": 1.4743, + "step": 1660 + }, + { + "epoch": 0.23515254477242162, + "grad_norm": 12.043032101276772, + "learning_rate": 4.900122408204132e-06, + "loss": 1.3099, + "step": 1661 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 10.007947815122481, + "learning_rate": 4.899961945473307e-06, + "loss": 1.4496, + "step": 1662 + }, + { + "epoch": 0.23543569052169605, + "grad_norm": 10.400705871102572, + "learning_rate": 4.899801356578273e-06, + "loss": 1.2004, + "step": 1663 + }, + { + "epoch": 0.23557726339633325, + "grad_norm": 8.743759545736166, + "learning_rate": 4.89964064152747e-06, + "loss": 1.3951, + "step": 1664 + }, + { + "epoch": 0.23571883627097048, + "grad_norm": 9.95839504930427, + "learning_rate": 4.899479800329348e-06, + "loss": 1.3875, + "step": 1665 + }, + { + "epoch": 0.2358604091456077, + "grad_norm": 8.988035090066873, + "learning_rate": 4.899318832992363e-06, + "loss": 1.2153, + "step": 1666 + }, + { + "epoch": 0.23600198202024492, + "grad_norm": 9.662321188235719, + "learning_rate": 4.8991577395249755e-06, + "loss": 1.2486, + "step": 1667 + }, + { + "epoch": 0.23614355489488215, + "grad_norm": 9.509140034811779, + "learning_rate": 4.898996519935654e-06, + "loss": 1.3043, + "step": 1668 + }, + { + "epoch": 0.23628512776951935, + "grad_norm": 10.722584741906369, + "learning_rate": 4.898835174232875e-06, + "loss": 1.397, + "step": 1669 + }, + { + "epoch": 0.23642670064415658, + "grad_norm": 10.266395667656825, + "learning_rate": 4.898673702425118e-06, + "loss": 1.3905, + "step": 1670 + }, + { + "epoch": 0.2365682735187938, + "grad_norm": 11.577767844103333, + "learning_rate": 4.898512104520875e-06, + "loss": 1.3519, + "step": 1671 + }, + { + "epoch": 0.236709846393431, + "grad_norm": 8.060713695799183, + "learning_rate": 4.898350380528638e-06, + "loss": 1.2698, + "step": 1672 + }, + { + "epoch": 0.23685141926806824, + "grad_norm": 9.369060618599924, + "learning_rate": 4.8981885304569095e-06, + "loss": 1.3674, + "step": 1673 + }, + { + "epoch": 0.23699299214270544, + "grad_norm": 9.038452233342502, + "learning_rate": 4.898026554314199e-06, + "loss": 1.3332, + "step": 1674 + }, + { + "epoch": 0.23713456501734267, + "grad_norm": 9.613269742348862, + "learning_rate": 4.89786445210902e-06, + "loss": 1.3539, + "step": 1675 + }, + { + "epoch": 0.2372761378919799, + "grad_norm": 9.799111726947858, + "learning_rate": 4.897702223849895e-06, + "loss": 1.312, + "step": 1676 + }, + { + "epoch": 0.2374177107666171, + "grad_norm": 8.748959288695652, + "learning_rate": 4.897539869545351e-06, + "loss": 1.4057, + "step": 1677 + }, + { + "epoch": 0.23755928364125434, + "grad_norm": 7.760409593685605, + "learning_rate": 4.897377389203925e-06, + "loss": 1.2402, + "step": 1678 + }, + { + "epoch": 0.23770085651589157, + "grad_norm": 11.64249801416198, + "learning_rate": 4.897214782834156e-06, + "loss": 1.4157, + "step": 1679 + }, + { + "epoch": 0.23784242939052877, + 
"grad_norm": 8.229238105817943, + "learning_rate": 4.897052050444595e-06, + "loss": 1.3665, + "step": 1680 + }, + { + "epoch": 0.237984002265166, + "grad_norm": 7.593852226919293, + "learning_rate": 4.8968891920437936e-06, + "loss": 1.1967, + "step": 1681 + }, + { + "epoch": 0.2381255751398032, + "grad_norm": 9.979226469897354, + "learning_rate": 4.896726207640315e-06, + "loss": 1.3473, + "step": 1682 + }, + { + "epoch": 0.23826714801444043, + "grad_norm": 8.58880939301547, + "learning_rate": 4.896563097242727e-06, + "loss": 1.5724, + "step": 1683 + }, + { + "epoch": 0.23840872088907766, + "grad_norm": 9.94291222937597, + "learning_rate": 4.896399860859603e-06, + "loss": 1.4598, + "step": 1684 + }, + { + "epoch": 0.23855029376371487, + "grad_norm": 9.952034164370069, + "learning_rate": 4.896236498499526e-06, + "loss": 1.5053, + "step": 1685 + }, + { + "epoch": 0.2386918666383521, + "grad_norm": 7.619764187585733, + "learning_rate": 4.896073010171083e-06, + "loss": 1.2752, + "step": 1686 + }, + { + "epoch": 0.2388334395129893, + "grad_norm": 9.338836092260399, + "learning_rate": 4.895909395882868e-06, + "loss": 1.3313, + "step": 1687 + }, + { + "epoch": 0.23897501238762653, + "grad_norm": 9.362751601623902, + "learning_rate": 4.895745655643482e-06, + "loss": 1.4038, + "step": 1688 + }, + { + "epoch": 0.23911658526226376, + "grad_norm": 9.66912911281533, + "learning_rate": 4.895581789461534e-06, + "loss": 1.2267, + "step": 1689 + }, + { + "epoch": 0.23925815813690096, + "grad_norm": 9.689796618520989, + "learning_rate": 4.895417797345638e-06, + "loss": 1.143, + "step": 1690 + }, + { + "epoch": 0.2393997310115382, + "grad_norm": 9.572147464392623, + "learning_rate": 4.895253679304414e-06, + "loss": 1.4366, + "step": 1691 + }, + { + "epoch": 0.23954130388617542, + "grad_norm": 8.358240011295786, + "learning_rate": 4.8950894353464905e-06, + "loss": 1.378, + "step": 1692 + }, + { + "epoch": 0.23968287676081262, + "grad_norm": 9.216533019308079, + "learning_rate": 4.8949250654805e-06, + "loss": 1.4015, + "step": 1693 + }, + { + "epoch": 0.23982444963544985, + "grad_norm": 8.690030614449993, + "learning_rate": 4.894760569715086e-06, + "loss": 1.3284, + "step": 1694 + }, + { + "epoch": 0.23996602251008706, + "grad_norm": 11.055928665416811, + "learning_rate": 4.894595948058893e-06, + "loss": 1.4149, + "step": 1695 + }, + { + "epoch": 0.2401075953847243, + "grad_norm": 8.332000117147649, + "learning_rate": 4.894431200520578e-06, + "loss": 1.4507, + "step": 1696 + }, + { + "epoch": 0.24024916825936152, + "grad_norm": 9.263371880354907, + "learning_rate": 4.894266327108799e-06, + "loss": 1.3002, + "step": 1697 + }, + { + "epoch": 0.24039074113399872, + "grad_norm": 9.191360596731075, + "learning_rate": 4.894101327832225e-06, + "loss": 1.2742, + "step": 1698 + }, + { + "epoch": 0.24053231400863595, + "grad_norm": 10.207296970206572, + "learning_rate": 4.8939362026995295e-06, + "loss": 1.217, + "step": 1699 + }, + { + "epoch": 0.24067388688327315, + "grad_norm": 10.276559797637246, + "learning_rate": 4.893770951719392e-06, + "loss": 1.5503, + "step": 1700 + }, + { + "epoch": 0.24081545975791038, + "grad_norm": 9.112668418636103, + "learning_rate": 4.893605574900501e-06, + "loss": 1.4779, + "step": 1701 + }, + { + "epoch": 0.2409570326325476, + "grad_norm": 11.374350728379511, + "learning_rate": 4.893440072251549e-06, + "loss": 1.5177, + "step": 1702 + }, + { + "epoch": 0.24109860550718482, + "grad_norm": 11.652303155309777, + "learning_rate": 4.893274443781239e-06, + "loss": 1.5781, + "step": 1703 + }, 
+ { + "epoch": 0.24124017838182205, + "grad_norm": 12.607579062004232, + "learning_rate": 4.893108689498274e-06, + "loss": 1.3777, + "step": 1704 + }, + { + "epoch": 0.24138175125645928, + "grad_norm": 8.336872366597296, + "learning_rate": 4.89294280941137e-06, + "loss": 1.314, + "step": 1705 + }, + { + "epoch": 0.24152332413109648, + "grad_norm": 10.876179598269266, + "learning_rate": 4.892776803529246e-06, + "loss": 1.3776, + "step": 1706 + }, + { + "epoch": 0.2416648970057337, + "grad_norm": 8.940338324304475, + "learning_rate": 4.892610671860631e-06, + "loss": 1.2331, + "step": 1707 + }, + { + "epoch": 0.2418064698803709, + "grad_norm": 9.929130820720436, + "learning_rate": 4.892444414414257e-06, + "loss": 1.2559, + "step": 1708 + }, + { + "epoch": 0.24194804275500814, + "grad_norm": 9.012840859240992, + "learning_rate": 4.892278031198864e-06, + "loss": 1.1816, + "step": 1709 + }, + { + "epoch": 0.24208961562964537, + "grad_norm": 8.873244420883173, + "learning_rate": 4.892111522223198e-06, + "loss": 1.1529, + "step": 1710 + }, + { + "epoch": 0.24223118850428257, + "grad_norm": 8.357351810511986, + "learning_rate": 4.891944887496013e-06, + "loss": 1.346, + "step": 1711 + }, + { + "epoch": 0.2423727613789198, + "grad_norm": 13.78304301597704, + "learning_rate": 4.8917781270260686e-06, + "loss": 1.4652, + "step": 1712 + }, + { + "epoch": 0.242514334253557, + "grad_norm": 9.061841638262784, + "learning_rate": 4.891611240822132e-06, + "loss": 1.356, + "step": 1713 + }, + { + "epoch": 0.24265590712819424, + "grad_norm": 10.13390345057164, + "learning_rate": 4.891444228892975e-06, + "loss": 1.4301, + "step": 1714 + }, + { + "epoch": 0.24279748000283147, + "grad_norm": 9.907586907027992, + "learning_rate": 4.891277091247379e-06, + "loss": 1.4988, + "step": 1715 + }, + { + "epoch": 0.24293905287746867, + "grad_norm": 10.640612810886925, + "learning_rate": 4.891109827894129e-06, + "loss": 1.4077, + "step": 1716 + }, + { + "epoch": 0.2430806257521059, + "grad_norm": 9.641339193688495, + "learning_rate": 4.890942438842018e-06, + "loss": 1.4508, + "step": 1717 + }, + { + "epoch": 0.2432221986267431, + "grad_norm": 8.104592855535921, + "learning_rate": 4.890774924099845e-06, + "loss": 1.3778, + "step": 1718 + }, + { + "epoch": 0.24336377150138033, + "grad_norm": 8.739912921812172, + "learning_rate": 4.890607283676418e-06, + "loss": 1.2415, + "step": 1719 + }, + { + "epoch": 0.24350534437601756, + "grad_norm": 7.720582319839733, + "learning_rate": 4.890439517580548e-06, + "loss": 1.4206, + "step": 1720 + }, + { + "epoch": 0.24364691725065477, + "grad_norm": 9.93679519169237, + "learning_rate": 4.890271625821056e-06, + "loss": 1.3853, + "step": 1721 + }, + { + "epoch": 0.243788490125292, + "grad_norm": 10.219665672970185, + "learning_rate": 4.890103608406765e-06, + "loss": 1.4307, + "step": 1722 + }, + { + "epoch": 0.24393006299992923, + "grad_norm": 9.05278725881263, + "learning_rate": 4.889935465346511e-06, + "loss": 1.3939, + "step": 1723 + }, + { + "epoch": 0.24407163587456643, + "grad_norm": 8.912648897144413, + "learning_rate": 4.8897671966491315e-06, + "loss": 1.3925, + "step": 1724 + }, + { + "epoch": 0.24421320874920366, + "grad_norm": 8.3541113137356, + "learning_rate": 4.889598802323471e-06, + "loss": 1.3552, + "step": 1725 + }, + { + "epoch": 0.24435478162384086, + "grad_norm": 10.266420191328713, + "learning_rate": 4.8894302823783845e-06, + "loss": 1.3436, + "step": 1726 + }, + { + "epoch": 0.2444963544984781, + "grad_norm": 9.228732429662308, + "learning_rate": 4.88926163682273e-06, + 
"loss": 1.3366, + "step": 1727 + }, + { + "epoch": 0.24463792737311532, + "grad_norm": 9.289283846053936, + "learning_rate": 4.889092865665372e-06, + "loss": 1.3931, + "step": 1728 + }, + { + "epoch": 0.24477950024775252, + "grad_norm": 12.577450486572687, + "learning_rate": 4.888923968915183e-06, + "loss": 1.4741, + "step": 1729 + }, + { + "epoch": 0.24492107312238975, + "grad_norm": 9.58980158468327, + "learning_rate": 4.888754946581044e-06, + "loss": 1.1805, + "step": 1730 + }, + { + "epoch": 0.24506264599702696, + "grad_norm": 11.5471828971974, + "learning_rate": 4.8885857986718365e-06, + "loss": 1.575, + "step": 1731 + }, + { + "epoch": 0.2452042188716642, + "grad_norm": 9.41806758036075, + "learning_rate": 4.888416525196455e-06, + "loss": 1.587, + "step": 1732 + }, + { + "epoch": 0.24534579174630142, + "grad_norm": 7.028531918209731, + "learning_rate": 4.8882471261637985e-06, + "loss": 1.2723, + "step": 1733 + }, + { + "epoch": 0.24548736462093862, + "grad_norm": 8.448151823095642, + "learning_rate": 4.888077601582772e-06, + "loss": 1.274, + "step": 1734 + }, + { + "epoch": 0.24562893749557585, + "grad_norm": 9.243440106235264, + "learning_rate": 4.887907951462284e-06, + "loss": 1.3902, + "step": 1735 + }, + { + "epoch": 0.24577051037021308, + "grad_norm": 9.17627970322911, + "learning_rate": 4.8877381758112576e-06, + "loss": 1.5082, + "step": 1736 + }, + { + "epoch": 0.24591208324485028, + "grad_norm": 9.767402182041149, + "learning_rate": 4.887568274638616e-06, + "loss": 1.3993, + "step": 1737 + }, + { + "epoch": 0.2460536561194875, + "grad_norm": 9.856089581119617, + "learning_rate": 4.887398247953289e-06, + "loss": 1.397, + "step": 1738 + }, + { + "epoch": 0.24619522899412472, + "grad_norm": 8.905495702021897, + "learning_rate": 4.887228095764216e-06, + "loss": 1.5131, + "step": 1739 + }, + { + "epoch": 0.24633680186876195, + "grad_norm": 11.292939361882377, + "learning_rate": 4.887057818080343e-06, + "loss": 1.4903, + "step": 1740 + }, + { + "epoch": 0.24647837474339918, + "grad_norm": 10.18083525223277, + "learning_rate": 4.886887414910621e-06, + "loss": 1.4748, + "step": 1741 + }, + { + "epoch": 0.24661994761803638, + "grad_norm": 8.959165549249951, + "learning_rate": 4.8867168862640056e-06, + "loss": 1.4526, + "step": 1742 + }, + { + "epoch": 0.2467615204926736, + "grad_norm": 8.657862764359596, + "learning_rate": 4.886546232149464e-06, + "loss": 1.3539, + "step": 1743 + }, + { + "epoch": 0.2469030933673108, + "grad_norm": 10.992457144547211, + "learning_rate": 4.886375452575967e-06, + "loss": 1.255, + "step": 1744 + }, + { + "epoch": 0.24704466624194804, + "grad_norm": 11.584327344073476, + "learning_rate": 4.886204547552491e-06, + "loss": 1.3928, + "step": 1745 + }, + { + "epoch": 0.24718623911658527, + "grad_norm": 9.703310560826328, + "learning_rate": 4.886033517088021e-06, + "loss": 1.4206, + "step": 1746 + }, + { + "epoch": 0.24732781199122247, + "grad_norm": 8.688226683158607, + "learning_rate": 4.885862361191549e-06, + "loss": 1.2731, + "step": 1747 + }, + { + "epoch": 0.2474693848658597, + "grad_norm": 9.303503004344948, + "learning_rate": 4.885691079872071e-06, + "loss": 1.3939, + "step": 1748 + }, + { + "epoch": 0.24761095774049693, + "grad_norm": 8.541955065316758, + "learning_rate": 4.885519673138592e-06, + "loss": 1.4727, + "step": 1749 + }, + { + "epoch": 0.24775253061513414, + "grad_norm": 9.170345146314782, + "learning_rate": 4.8853481410001225e-06, + "loss": 1.3429, + "step": 1750 + }, + { + "epoch": 0.24789410348977137, + "grad_norm": 12.619419186340785, + 
"learning_rate": 4.88517648346568e-06, + "loss": 1.4545, + "step": 1751 + }, + { + "epoch": 0.24803567636440857, + "grad_norm": 11.632230089517176, + "learning_rate": 4.885004700544288e-06, + "loss": 1.438, + "step": 1752 + }, + { + "epoch": 0.2481772492390458, + "grad_norm": 10.723204818617951, + "learning_rate": 4.884832792244977e-06, + "loss": 1.3659, + "step": 1753 + }, + { + "epoch": 0.24831882211368303, + "grad_norm": 10.115121802870892, + "learning_rate": 4.884660758576785e-06, + "loss": 1.4124, + "step": 1754 + }, + { + "epoch": 0.24846039498832023, + "grad_norm": 11.597157597060354, + "learning_rate": 4.884488599548755e-06, + "loss": 1.3241, + "step": 1755 + }, + { + "epoch": 0.24860196786295746, + "grad_norm": 11.011307539878857, + "learning_rate": 4.884316315169936e-06, + "loss": 1.4026, + "step": 1756 + }, + { + "epoch": 0.24874354073759466, + "grad_norm": 10.238371789350719, + "learning_rate": 4.8841439054493864e-06, + "loss": 1.243, + "step": 1757 + }, + { + "epoch": 0.2488851136122319, + "grad_norm": 8.087079101175645, + "learning_rate": 4.88397137039617e-06, + "loss": 1.2365, + "step": 1758 + }, + { + "epoch": 0.24902668648686913, + "grad_norm": 12.277673098509636, + "learning_rate": 4.883798710019356e-06, + "loss": 1.349, + "step": 1759 + }, + { + "epoch": 0.24916825936150633, + "grad_norm": 12.560781788397655, + "learning_rate": 4.883625924328022e-06, + "loss": 1.4135, + "step": 1760 + }, + { + "epoch": 0.24930983223614356, + "grad_norm": 9.514720554307546, + "learning_rate": 4.88345301333125e-06, + "loss": 1.2723, + "step": 1761 + }, + { + "epoch": 0.24945140511078076, + "grad_norm": 10.5907993764921, + "learning_rate": 4.88327997703813e-06, + "loss": 1.4432, + "step": 1762 + }, + { + "epoch": 0.249592977985418, + "grad_norm": 8.667673003629726, + "learning_rate": 4.883106815457758e-06, + "loss": 1.4231, + "step": 1763 + }, + { + "epoch": 0.24973455086005522, + "grad_norm": 10.686868213919347, + "learning_rate": 4.882933528599239e-06, + "loss": 1.4557, + "step": 1764 + }, + { + "epoch": 0.24987612373469242, + "grad_norm": 11.97963958249025, + "learning_rate": 4.882760116471681e-06, + "loss": 1.4463, + "step": 1765 + }, + { + "epoch": 0.25001769660932965, + "grad_norm": 7.557266568318147, + "learning_rate": 4.8825865790841995e-06, + "loss": 1.258, + "step": 1766 + }, + { + "epoch": 0.2501592694839669, + "grad_norm": 10.213426048847342, + "learning_rate": 4.882412916445919e-06, + "loss": 1.2871, + "step": 1767 + }, + { + "epoch": 0.2503008423586041, + "grad_norm": 11.878247268798159, + "learning_rate": 4.882239128565968e-06, + "loss": 1.2109, + "step": 1768 + }, + { + "epoch": 0.2504424152332413, + "grad_norm": 8.776285852191403, + "learning_rate": 4.882065215453481e-06, + "loss": 1.4418, + "step": 1769 + }, + { + "epoch": 0.2505839881078785, + "grad_norm": 9.613834791433263, + "learning_rate": 4.881891177117602e-06, + "loss": 1.4288, + "step": 1770 + }, + { + "epoch": 0.25072556098251575, + "grad_norm": 10.7902818967182, + "learning_rate": 4.881717013567481e-06, + "loss": 1.3688, + "step": 1771 + }, + { + "epoch": 0.250867133857153, + "grad_norm": 10.669246321912187, + "learning_rate": 4.881542724812272e-06, + "loss": 1.4007, + "step": 1772 + }, + { + "epoch": 0.2510087067317902, + "grad_norm": 10.113154133881503, + "learning_rate": 4.881368310861137e-06, + "loss": 1.4825, + "step": 1773 + }, + { + "epoch": 0.2511502796064274, + "grad_norm": 10.15737824262107, + "learning_rate": 4.881193771723246e-06, + "loss": 1.3691, + "step": 1774 + }, + { + "epoch": 
0.2512918524810646, + "grad_norm": 8.958153821935685, + "learning_rate": 4.881019107407774e-06, + "loss": 1.3584, + "step": 1775 + }, + { + "epoch": 0.25143342535570185, + "grad_norm": 7.9781402910491686, + "learning_rate": 4.8808443179239025e-06, + "loss": 1.4088, + "step": 1776 + }, + { + "epoch": 0.2515749982303391, + "grad_norm": 8.74323381668603, + "learning_rate": 4.880669403280821e-06, + "loss": 1.4369, + "step": 1777 + }, + { + "epoch": 0.2517165711049763, + "grad_norm": 10.360190242950038, + "learning_rate": 4.880494363487723e-06, + "loss": 1.2489, + "step": 1778 + }, + { + "epoch": 0.2518581439796135, + "grad_norm": 10.13336401315696, + "learning_rate": 4.880319198553813e-06, + "loss": 1.4695, + "step": 1779 + }, + { + "epoch": 0.2519997168542507, + "grad_norm": 8.947678965089477, + "learning_rate": 4.880143908488296e-06, + "loss": 1.2819, + "step": 1780 + }, + { + "epoch": 0.25214128972888794, + "grad_norm": 9.015849674274804, + "learning_rate": 4.87996849330039e-06, + "loss": 1.3514, + "step": 1781 + }, + { + "epoch": 0.25228286260352517, + "grad_norm": 10.088924612727988, + "learning_rate": 4.8797929529993135e-06, + "loss": 1.5156, + "step": 1782 + }, + { + "epoch": 0.2524244354781624, + "grad_norm": 8.865327103846937, + "learning_rate": 4.8796172875942965e-06, + "loss": 1.2843, + "step": 1783 + }, + { + "epoch": 0.2525660083527996, + "grad_norm": 8.957168810953055, + "learning_rate": 4.879441497094572e-06, + "loss": 1.3637, + "step": 1784 + }, + { + "epoch": 0.2527075812274368, + "grad_norm": 13.102375403373905, + "learning_rate": 4.879265581509384e-06, + "loss": 1.4375, + "step": 1785 + }, + { + "epoch": 0.25284915410207404, + "grad_norm": 11.229706835567569, + "learning_rate": 4.8790895408479776e-06, + "loss": 1.3752, + "step": 1786 + }, + { + "epoch": 0.25299072697671127, + "grad_norm": 10.632626298539535, + "learning_rate": 4.878913375119608e-06, + "loss": 1.4521, + "step": 1787 + }, + { + "epoch": 0.2531322998513485, + "grad_norm": 8.629137014512771, + "learning_rate": 4.878737084333536e-06, + "loss": 1.3273, + "step": 1788 + }, + { + "epoch": 0.2532738727259857, + "grad_norm": 9.45304630971897, + "learning_rate": 4.878560668499029e-06, + "loss": 1.3267, + "step": 1789 + }, + { + "epoch": 0.2534154456006229, + "grad_norm": 10.458959755748335, + "learning_rate": 4.8783841276253605e-06, + "loss": 1.358, + "step": 1790 + }, + { + "epoch": 0.25355701847526013, + "grad_norm": 11.156984791212528, + "learning_rate": 4.8782074617218135e-06, + "loss": 1.282, + "step": 1791 + }, + { + "epoch": 0.25369859134989736, + "grad_norm": 9.316918880026389, + "learning_rate": 4.878030670797672e-06, + "loss": 1.3982, + "step": 1792 + }, + { + "epoch": 0.2538401642245346, + "grad_norm": 8.758544673481051, + "learning_rate": 4.877853754862232e-06, + "loss": 1.3952, + "step": 1793 + }, + { + "epoch": 0.2539817370991718, + "grad_norm": 10.387056629987645, + "learning_rate": 4.8776767139247936e-06, + "loss": 1.5105, + "step": 1794 + }, + { + "epoch": 0.254123309973809, + "grad_norm": 9.84767910486737, + "learning_rate": 4.877499547994662e-06, + "loss": 1.5199, + "step": 1795 + }, + { + "epoch": 0.2542648828484462, + "grad_norm": 10.686633337027995, + "learning_rate": 4.877322257081153e-06, + "loss": 1.3842, + "step": 1796 + }, + { + "epoch": 0.25440645572308346, + "grad_norm": 9.648616996053033, + "learning_rate": 4.877144841193585e-06, + "loss": 1.3408, + "step": 1797 + }, + { + "epoch": 0.2545480285977207, + "grad_norm": 9.353597803525506, + "learning_rate": 4.876967300341285e-06, + "loss": 
1.5396, + "step": 1798 + }, + { + "epoch": 0.2546896014723579, + "grad_norm": 10.98662239189239, + "learning_rate": 4.876789634533587e-06, + "loss": 1.4615, + "step": 1799 + }, + { + "epoch": 0.2548311743469951, + "grad_norm": 8.387047599951227, + "learning_rate": 4.876611843779829e-06, + "loss": 1.5166, + "step": 1800 + }, + { + "epoch": 0.2549727472216323, + "grad_norm": 7.872352518376193, + "learning_rate": 4.876433928089359e-06, + "loss": 1.3683, + "step": 1801 + }, + { + "epoch": 0.25511432009626955, + "grad_norm": 9.831721631745918, + "learning_rate": 4.87625588747153e-06, + "loss": 1.3574, + "step": 1802 + }, + { + "epoch": 0.2552558929709068, + "grad_norm": 8.218928694142232, + "learning_rate": 4.8760777219357e-06, + "loss": 1.275, + "step": 1803 + }, + { + "epoch": 0.255397465845544, + "grad_norm": 11.27077312535665, + "learning_rate": 4.875899431491236e-06, + "loss": 1.5835, + "step": 1804 + }, + { + "epoch": 0.2555390387201812, + "grad_norm": 9.42315756195873, + "learning_rate": 4.875721016147511e-06, + "loss": 1.4386, + "step": 1805 + }, + { + "epoch": 0.2556806115948184, + "grad_norm": 8.615944794374384, + "learning_rate": 4.875542475913902e-06, + "loss": 1.3392, + "step": 1806 + }, + { + "epoch": 0.25582218446945565, + "grad_norm": 9.641346711235418, + "learning_rate": 4.875363810799798e-06, + "loss": 1.3076, + "step": 1807 + }, + { + "epoch": 0.2559637573440929, + "grad_norm": 10.790150028914537, + "learning_rate": 4.87518502081459e-06, + "loss": 1.4063, + "step": 1808 + }, + { + "epoch": 0.2561053302187301, + "grad_norm": 8.801133637668812, + "learning_rate": 4.875006105967675e-06, + "loss": 1.353, + "step": 1809 + }, + { + "epoch": 0.2562469030933673, + "grad_norm": 9.257405796050744, + "learning_rate": 4.87482706626846e-06, + "loss": 1.4443, + "step": 1810 + }, + { + "epoch": 0.2563884759680045, + "grad_norm": 10.689816229501226, + "learning_rate": 4.874647901726358e-06, + "loss": 1.3274, + "step": 1811 + }, + { + "epoch": 0.25653004884264174, + "grad_norm": 10.14103540374742, + "learning_rate": 4.874468612350786e-06, + "loss": 1.3938, + "step": 1812 + }, + { + "epoch": 0.256671621717279, + "grad_norm": 10.808067162814007, + "learning_rate": 4.874289198151168e-06, + "loss": 1.3144, + "step": 1813 + }, + { + "epoch": 0.2568131945919162, + "grad_norm": 10.335521179257242, + "learning_rate": 4.87410965913694e-06, + "loss": 1.3895, + "step": 1814 + }, + { + "epoch": 0.25695476746655344, + "grad_norm": 12.318419297113254, + "learning_rate": 4.873929995317535e-06, + "loss": 1.3761, + "step": 1815 + }, + { + "epoch": 0.2570963403411906, + "grad_norm": 8.865177790563525, + "learning_rate": 4.873750206702401e-06, + "loss": 1.357, + "step": 1816 + }, + { + "epoch": 0.25723791321582784, + "grad_norm": 9.750063773704692, + "learning_rate": 4.873570293300989e-06, + "loss": 1.5309, + "step": 1817 + }, + { + "epoch": 0.25737948609046507, + "grad_norm": 10.433660292047263, + "learning_rate": 4.873390255122756e-06, + "loss": 1.2496, + "step": 1818 + }, + { + "epoch": 0.2575210589651023, + "grad_norm": 10.087053563710661, + "learning_rate": 4.873210092177167e-06, + "loss": 1.3889, + "step": 1819 + }, + { + "epoch": 0.25766263183973953, + "grad_norm": 8.385001061291021, + "learning_rate": 4.873029804473694e-06, + "loss": 1.3556, + "step": 1820 + }, + { + "epoch": 0.2578042047143767, + "grad_norm": 8.861289596221617, + "learning_rate": 4.8728493920218126e-06, + "loss": 1.3844, + "step": 1821 + }, + { + "epoch": 0.25794577758901394, + "grad_norm": 10.213150029873978, + "learning_rate": 
4.872668854831008e-06, + "loss": 1.2854, + "step": 1822 + }, + { + "epoch": 0.25808735046365117, + "grad_norm": 10.120587929998342, + "learning_rate": 4.87248819291077e-06, + "loss": 1.3369, + "step": 1823 + }, + { + "epoch": 0.2582289233382884, + "grad_norm": 9.8152458090956, + "learning_rate": 4.872307406270598e-06, + "loss": 1.2661, + "step": 1824 + }, + { + "epoch": 0.2583704962129256, + "grad_norm": 10.365514992609471, + "learning_rate": 4.872126494919994e-06, + "loss": 1.4486, + "step": 1825 + }, + { + "epoch": 0.2585120690875628, + "grad_norm": 9.776313291558687, + "learning_rate": 4.871945458868469e-06, + "loss": 1.3378, + "step": 1826 + }, + { + "epoch": 0.25865364196220003, + "grad_norm": 8.63784143681163, + "learning_rate": 4.87176429812554e-06, + "loss": 1.4257, + "step": 1827 + }, + { + "epoch": 0.25879521483683726, + "grad_norm": 10.775967130162009, + "learning_rate": 4.87158301270073e-06, + "loss": 1.354, + "step": 1828 + }, + { + "epoch": 0.2589367877114745, + "grad_norm": 12.254277319453413, + "learning_rate": 4.87140160260357e-06, + "loss": 1.4764, + "step": 1829 + }, + { + "epoch": 0.2590783605861117, + "grad_norm": 8.861479009832344, + "learning_rate": 4.871220067843595e-06, + "loss": 1.3488, + "step": 1830 + }, + { + "epoch": 0.2592199334607489, + "grad_norm": 11.697663972230805, + "learning_rate": 4.8710384084303495e-06, + "loss": 1.4801, + "step": 1831 + }, + { + "epoch": 0.2593615063353861, + "grad_norm": 11.13085474836498, + "learning_rate": 4.870856624373383e-06, + "loss": 1.3689, + "step": 1832 + }, + { + "epoch": 0.25950307921002336, + "grad_norm": 10.306524591622281, + "learning_rate": 4.870674715682252e-06, + "loss": 1.5555, + "step": 1833 + }, + { + "epoch": 0.2596446520846606, + "grad_norm": 11.985992521424032, + "learning_rate": 4.870492682366518e-06, + "loss": 1.3943, + "step": 1834 + }, + { + "epoch": 0.2597862249592978, + "grad_norm": 9.614215704659568, + "learning_rate": 4.8703105244357504e-06, + "loss": 1.2294, + "step": 1835 + }, + { + "epoch": 0.259927797833935, + "grad_norm": 9.682165633775497, + "learning_rate": 4.870128241899527e-06, + "loss": 1.4515, + "step": 1836 + }, + { + "epoch": 0.2600693707085722, + "grad_norm": 12.579544271635251, + "learning_rate": 4.86994583476743e-06, + "loss": 1.6148, + "step": 1837 + }, + { + "epoch": 0.26021094358320945, + "grad_norm": 9.445803595587103, + "learning_rate": 4.8697633030490465e-06, + "loss": 1.2611, + "step": 1838 + }, + { + "epoch": 0.2603525164578467, + "grad_norm": 12.742091699517244, + "learning_rate": 4.869580646753973e-06, + "loss": 1.4125, + "step": 1839 + }, + { + "epoch": 0.2604940893324839, + "grad_norm": 15.499562903363195, + "learning_rate": 4.869397865891812e-06, + "loss": 1.6298, + "step": 1840 + }, + { + "epoch": 0.2606356622071211, + "grad_norm": 9.573827877960644, + "learning_rate": 4.869214960472172e-06, + "loss": 1.3679, + "step": 1841 + }, + { + "epoch": 0.2607772350817583, + "grad_norm": 9.681040720835574, + "learning_rate": 4.869031930504668e-06, + "loss": 1.4703, + "step": 1842 + }, + { + "epoch": 0.26091880795639555, + "grad_norm": 10.250644198387416, + "learning_rate": 4.8688487759989215e-06, + "loss": 1.3452, + "step": 1843 + }, + { + "epoch": 0.2610603808310328, + "grad_norm": 25.619043467381715, + "learning_rate": 4.868665496964562e-06, + "loss": 1.4307, + "step": 1844 + }, + { + "epoch": 0.26120195370567, + "grad_norm": 11.75217628615813, + "learning_rate": 4.868482093411223e-06, + "loss": 1.3149, + "step": 1845 + }, + { + "epoch": 0.26134352658030724, + "grad_norm": 
9.785206929758978, + "learning_rate": 4.868298565348546e-06, + "loss": 1.3673, + "step": 1846 + }, + { + "epoch": 0.2614850994549444, + "grad_norm": 9.272386833646094, + "learning_rate": 4.8681149127861795e-06, + "loss": 1.4277, + "step": 1847 + }, + { + "epoch": 0.26162667232958164, + "grad_norm": 8.482483261276265, + "learning_rate": 4.8679311357337774e-06, + "loss": 1.2528, + "step": 1848 + }, + { + "epoch": 0.2617682452042189, + "grad_norm": 11.661307784428129, + "learning_rate": 4.867747234201003e-06, + "loss": 1.4143, + "step": 1849 + }, + { + "epoch": 0.2619098180788561, + "grad_norm": 11.414583856162913, + "learning_rate": 4.86756320819752e-06, + "loss": 1.4213, + "step": 1850 + }, + { + "epoch": 0.26205139095349334, + "grad_norm": 9.379412617135493, + "learning_rate": 4.867379057733005e-06, + "loss": 1.3199, + "step": 1851 + }, + { + "epoch": 0.2621929638281305, + "grad_norm": 10.210630404955493, + "learning_rate": 4.867194782817138e-06, + "loss": 1.3957, + "step": 1852 + }, + { + "epoch": 0.26233453670276774, + "grad_norm": 11.11131902712099, + "learning_rate": 4.867010383459606e-06, + "loss": 1.3366, + "step": 1853 + }, + { + "epoch": 0.26247610957740497, + "grad_norm": 9.702525834459776, + "learning_rate": 4.8668258596701035e-06, + "loss": 1.3718, + "step": 1854 + }, + { + "epoch": 0.2626176824520422, + "grad_norm": 10.184146258865917, + "learning_rate": 4.86664121145833e-06, + "loss": 1.3626, + "step": 1855 + }, + { + "epoch": 0.26275925532667943, + "grad_norm": 9.450047923778945, + "learning_rate": 4.866456438833993e-06, + "loss": 1.2997, + "step": 1856 + }, + { + "epoch": 0.2629008282013166, + "grad_norm": 9.335849649822025, + "learning_rate": 4.866271541806806e-06, + "loss": 1.4811, + "step": 1857 + }, + { + "epoch": 0.26304240107595384, + "grad_norm": 13.515806763589923, + "learning_rate": 4.8660865203864885e-06, + "loss": 1.4664, + "step": 1858 + }, + { + "epoch": 0.26318397395059107, + "grad_norm": 9.684551750212245, + "learning_rate": 4.865901374582766e-06, + "loss": 1.2608, + "step": 1859 + }, + { + "epoch": 0.2633255468252283, + "grad_norm": 9.947015393291347, + "learning_rate": 4.865716104405373e-06, + "loss": 1.3728, + "step": 1860 + }, + { + "epoch": 0.2634671196998655, + "grad_norm": 10.768844056784179, + "learning_rate": 4.865530709864048e-06, + "loss": 1.3748, + "step": 1861 + }, + { + "epoch": 0.2636086925745027, + "grad_norm": 9.376886610300959, + "learning_rate": 4.865345190968537e-06, + "loss": 1.2756, + "step": 1862 + }, + { + "epoch": 0.26375026544913993, + "grad_norm": 10.0538569702392, + "learning_rate": 4.865159547728593e-06, + "loss": 1.2088, + "step": 1863 + }, + { + "epoch": 0.26389183832377716, + "grad_norm": 10.352365092706851, + "learning_rate": 4.8649737801539755e-06, + "loss": 1.2887, + "step": 1864 + }, + { + "epoch": 0.2640334111984144, + "grad_norm": 10.536725802506536, + "learning_rate": 4.86478788825445e-06, + "loss": 1.392, + "step": 1865 + }, + { + "epoch": 0.2641749840730516, + "grad_norm": 11.943269144436346, + "learning_rate": 4.864601872039788e-06, + "loss": 1.505, + "step": 1866 + }, + { + "epoch": 0.2643165569476888, + "grad_norm": 9.6190317649297, + "learning_rate": 4.864415731519769e-06, + "loss": 1.5084, + "step": 1867 + }, + { + "epoch": 0.264458129822326, + "grad_norm": 11.585192048675218, + "learning_rate": 4.864229466704178e-06, + "loss": 1.433, + "step": 1868 + }, + { + "epoch": 0.26459970269696326, + "grad_norm": 10.509837673686787, + "learning_rate": 4.864043077602807e-06, + "loss": 1.4894, + "step": 1869 + }, + { + 
"epoch": 0.2647412755716005, + "grad_norm": 11.645941791245486, + "learning_rate": 4.863856564225453e-06, + "loss": 1.357, + "step": 1870 + }, + { + "epoch": 0.2648828484462377, + "grad_norm": 11.743442085131425, + "learning_rate": 4.863669926581924e-06, + "loss": 1.4374, + "step": 1871 + }, + { + "epoch": 0.26502442132087495, + "grad_norm": 10.688018964693525, + "learning_rate": 4.863483164682027e-06, + "loss": 1.3877, + "step": 1872 + }, + { + "epoch": 0.2651659941955121, + "grad_norm": 11.02342884866333, + "learning_rate": 4.863296278535584e-06, + "loss": 1.3793, + "step": 1873 + }, + { + "epoch": 0.26530756707014935, + "grad_norm": 12.766841760402462, + "learning_rate": 4.863109268152417e-06, + "loss": 1.1848, + "step": 1874 + }, + { + "epoch": 0.2654491399447866, + "grad_norm": 9.137272355235451, + "learning_rate": 4.862922133542358e-06, + "loss": 1.5362, + "step": 1875 + }, + { + "epoch": 0.2655907128194238, + "grad_norm": 9.547542667298572, + "learning_rate": 4.862734874715245e-06, + "loss": 1.42, + "step": 1876 + }, + { + "epoch": 0.26573228569406104, + "grad_norm": 10.379498345338224, + "learning_rate": 4.8625474916809205e-06, + "loss": 1.4147, + "step": 1877 + }, + { + "epoch": 0.2658738585686982, + "grad_norm": 12.462324815014327, + "learning_rate": 4.862359984449236e-06, + "loss": 1.462, + "step": 1878 + }, + { + "epoch": 0.26601543144333545, + "grad_norm": 9.485630210004869, + "learning_rate": 4.862172353030049e-06, + "loss": 1.3242, + "step": 1879 + }, + { + "epoch": 0.2661570043179727, + "grad_norm": 12.425897294923388, + "learning_rate": 4.861984597433223e-06, + "loss": 1.4224, + "step": 1880 + }, + { + "epoch": 0.2662985771926099, + "grad_norm": 10.464057075461488, + "learning_rate": 4.861796717668626e-06, + "loss": 1.4043, + "step": 1881 + }, + { + "epoch": 0.26644015006724714, + "grad_norm": 11.016549232614944, + "learning_rate": 4.8616087137461385e-06, + "loss": 1.5395, + "step": 1882 + }, + { + "epoch": 0.2665817229418843, + "grad_norm": 9.690211156664347, + "learning_rate": 4.861420585675641e-06, + "loss": 1.3816, + "step": 1883 + }, + { + "epoch": 0.26672329581652154, + "grad_norm": 14.342447325616572, + "learning_rate": 4.861232333467024e-06, + "loss": 1.3882, + "step": 1884 + }, + { + "epoch": 0.2668648686911588, + "grad_norm": 9.293748653696523, + "learning_rate": 4.8610439571301845e-06, + "loss": 1.3119, + "step": 1885 + }, + { + "epoch": 0.267006441565796, + "grad_norm": 11.992063440929966, + "learning_rate": 4.860855456675024e-06, + "loss": 1.4479, + "step": 1886 + }, + { + "epoch": 0.26714801444043323, + "grad_norm": 9.826906938380253, + "learning_rate": 4.860666832111453e-06, + "loss": 1.4371, + "step": 1887 + }, + { + "epoch": 0.2672895873150704, + "grad_norm": 9.154920878628351, + "learning_rate": 4.860478083449387e-06, + "loss": 1.4633, + "step": 1888 + }, + { + "epoch": 0.26743116018970764, + "grad_norm": 9.752857669783292, + "learning_rate": 4.8602892106987474e-06, + "loss": 1.3054, + "step": 1889 + }, + { + "epoch": 0.26757273306434487, + "grad_norm": 8.475932436261068, + "learning_rate": 4.860100213869464e-06, + "loss": 1.2999, + "step": 1890 + }, + { + "epoch": 0.2677143059389821, + "grad_norm": 8.97104522365065, + "learning_rate": 4.859911092971473e-06, + "loss": 1.2309, + "step": 1891 + }, + { + "epoch": 0.26785587881361933, + "grad_norm": 9.451696366736734, + "learning_rate": 4.8597218480147145e-06, + "loss": 1.2767, + "step": 1892 + }, + { + "epoch": 0.2679974516882565, + "grad_norm": 8.014641714482527, + "learning_rate": 4.859532479009138e-06, + 
"loss": 1.1932, + "step": 1893 + }, + { + "epoch": 0.26813902456289374, + "grad_norm": 11.629725654495386, + "learning_rate": 4.859342985964699e-06, + "loss": 1.3231, + "step": 1894 + }, + { + "epoch": 0.26828059743753097, + "grad_norm": 9.769939281388176, + "learning_rate": 4.8591533688913584e-06, + "loss": 1.23, + "step": 1895 + }, + { + "epoch": 0.2684221703121682, + "grad_norm": 10.904951220459699, + "learning_rate": 4.858963627799084e-06, + "loss": 1.592, + "step": 1896 + }, + { + "epoch": 0.2685637431868054, + "grad_norm": 9.943557524481657, + "learning_rate": 4.85877376269785e-06, + "loss": 1.3084, + "step": 1897 + }, + { + "epoch": 0.2687053160614426, + "grad_norm": 10.102923402624453, + "learning_rate": 4.858583773597639e-06, + "loss": 1.3706, + "step": 1898 + }, + { + "epoch": 0.26884688893607983, + "grad_norm": 10.146896985414664, + "learning_rate": 4.858393660508437e-06, + "loss": 1.2742, + "step": 1899 + }, + { + "epoch": 0.26898846181071706, + "grad_norm": 9.431736683153934, + "learning_rate": 4.85820342344024e-06, + "loss": 1.2557, + "step": 1900 + }, + { + "epoch": 0.2691300346853543, + "grad_norm": 10.70029009844677, + "learning_rate": 4.8580130624030454e-06, + "loss": 1.3216, + "step": 1901 + }, + { + "epoch": 0.2692716075599915, + "grad_norm": 10.80380340291094, + "learning_rate": 4.857822577406864e-06, + "loss": 1.5813, + "step": 1902 + }, + { + "epoch": 0.26941318043462875, + "grad_norm": 9.866532570554881, + "learning_rate": 4.8576319684617064e-06, + "loss": 1.4123, + "step": 1903 + }, + { + "epoch": 0.2695547533092659, + "grad_norm": 10.394361698870842, + "learning_rate": 4.857441235577596e-06, + "loss": 1.1787, + "step": 1904 + }, + { + "epoch": 0.26969632618390316, + "grad_norm": 11.198761966944613, + "learning_rate": 4.857250378764556e-06, + "loss": 1.3047, + "step": 1905 + }, + { + "epoch": 0.2698378990585404, + "grad_norm": 10.737684070221778, + "learning_rate": 4.857059398032622e-06, + "loss": 1.3463, + "step": 1906 + }, + { + "epoch": 0.2699794719331776, + "grad_norm": 10.136406019381134, + "learning_rate": 4.8568682933918325e-06, + "loss": 1.4447, + "step": 1907 + }, + { + "epoch": 0.27012104480781485, + "grad_norm": 9.349867027275542, + "learning_rate": 4.856677064852234e-06, + "loss": 1.199, + "step": 1908 + }, + { + "epoch": 0.270262617682452, + "grad_norm": 11.251256914919654, + "learning_rate": 4.85648571242388e-06, + "loss": 1.2627, + "step": 1909 + }, + { + "epoch": 0.27040419055708925, + "grad_norm": 10.620895131349018, + "learning_rate": 4.856294236116829e-06, + "loss": 1.5058, + "step": 1910 + }, + { + "epoch": 0.2705457634317265, + "grad_norm": 7.975654752155834, + "learning_rate": 4.856102635941147e-06, + "loss": 1.2701, + "step": 1911 + }, + { + "epoch": 0.2706873363063637, + "grad_norm": 10.944255028993451, + "learning_rate": 4.855910911906906e-06, + "loss": 1.437, + "step": 1912 + }, + { + "epoch": 0.27082890918100094, + "grad_norm": 10.130582047787438, + "learning_rate": 4.855719064024185e-06, + "loss": 1.3837, + "step": 1913 + }, + { + "epoch": 0.2709704820556381, + "grad_norm": 9.609745769015806, + "learning_rate": 4.855527092303069e-06, + "loss": 1.3622, + "step": 1914 + }, + { + "epoch": 0.27111205493027535, + "grad_norm": 9.2090608013133, + "learning_rate": 4.855334996753651e-06, + "loss": 1.2581, + "step": 1915 + }, + { + "epoch": 0.2712536278049126, + "grad_norm": 9.10017047764315, + "learning_rate": 4.8551427773860284e-06, + "loss": 1.3496, + "step": 1916 + }, + { + "epoch": 0.2713952006795498, + "grad_norm": 7.9957819786099025, + 
"learning_rate": 4.854950434210305e-06, + "loss": 1.2388, + "step": 1917 + }, + { + "epoch": 0.27153677355418704, + "grad_norm": 8.005434574528154, + "learning_rate": 4.854757967236594e-06, + "loss": 1.2465, + "step": 1918 + }, + { + "epoch": 0.2716783464288242, + "grad_norm": 9.568635057074166, + "learning_rate": 4.8545653764750125e-06, + "loss": 1.3637, + "step": 1919 + }, + { + "epoch": 0.27181991930346144, + "grad_norm": 10.909140829492271, + "learning_rate": 4.8543726619356846e-06, + "loss": 1.4389, + "step": 1920 + }, + { + "epoch": 0.2719614921780987, + "grad_norm": 10.665906004804295, + "learning_rate": 4.854179823628741e-06, + "loss": 1.3744, + "step": 1921 + }, + { + "epoch": 0.2721030650527359, + "grad_norm": 8.95579906960276, + "learning_rate": 4.85398686156432e-06, + "loss": 1.2805, + "step": 1922 + }, + { + "epoch": 0.27224463792737313, + "grad_norm": 10.50865361700942, + "learning_rate": 4.853793775752564e-06, + "loss": 1.2663, + "step": 1923 + }, + { + "epoch": 0.2723862108020103, + "grad_norm": 10.097436379376049, + "learning_rate": 4.853600566203625e-06, + "loss": 1.3178, + "step": 1924 + }, + { + "epoch": 0.27252778367664754, + "grad_norm": 8.67743962088031, + "learning_rate": 4.8534072329276594e-06, + "loss": 1.2158, + "step": 1925 + }, + { + "epoch": 0.27266935655128477, + "grad_norm": 8.556544996683733, + "learning_rate": 4.85321377593483e-06, + "loss": 1.3911, + "step": 1926 + }, + { + "epoch": 0.272810929425922, + "grad_norm": 9.698226798097874, + "learning_rate": 4.853020195235307e-06, + "loss": 1.4272, + "step": 1927 + }, + { + "epoch": 0.27295250230055923, + "grad_norm": 10.302246915304751, + "learning_rate": 4.852826490839266e-06, + "loss": 1.2483, + "step": 1928 + }, + { + "epoch": 0.2730940751751964, + "grad_norm": 11.675696302802391, + "learning_rate": 4.852632662756892e-06, + "loss": 1.3846, + "step": 1929 + }, + { + "epoch": 0.27323564804983363, + "grad_norm": 7.2144516867790545, + "learning_rate": 4.852438710998373e-06, + "loss": 1.246, + "step": 1930 + }, + { + "epoch": 0.27337722092447087, + "grad_norm": 9.600005753833319, + "learning_rate": 4.852244635573905e-06, + "loss": 1.5645, + "step": 1931 + }, + { + "epoch": 0.2735187937991081, + "grad_norm": 8.484666345135867, + "learning_rate": 4.85205043649369e-06, + "loss": 1.219, + "step": 1932 + }, + { + "epoch": 0.2736603666737453, + "grad_norm": 9.689184817578784, + "learning_rate": 4.851856113767937e-06, + "loss": 1.3263, + "step": 1933 + }, + { + "epoch": 0.27380193954838256, + "grad_norm": 9.307810991795245, + "learning_rate": 4.851661667406862e-06, + "loss": 1.4133, + "step": 1934 + }, + { + "epoch": 0.27394351242301973, + "grad_norm": 9.754623490599695, + "learning_rate": 4.851467097420687e-06, + "loss": 1.4888, + "step": 1935 + }, + { + "epoch": 0.27408508529765696, + "grad_norm": 8.180806692773718, + "learning_rate": 4.8512724038196395e-06, + "loss": 1.3118, + "step": 1936 + }, + { + "epoch": 0.2742266581722942, + "grad_norm": 12.632332362744878, + "learning_rate": 4.8510775866139556e-06, + "loss": 1.4125, + "step": 1937 + }, + { + "epoch": 0.2743682310469314, + "grad_norm": 9.28282938148082, + "learning_rate": 4.850882645813875e-06, + "loss": 1.3902, + "step": 1938 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 10.114202322627666, + "learning_rate": 4.850687581429647e-06, + "loss": 1.4037, + "step": 1939 + }, + { + "epoch": 0.2746513767962058, + "grad_norm": 9.808811150393407, + "learning_rate": 4.8504923934715265e-06, + "loss": 1.427, + "step": 1940 + }, + { + "epoch": 
0.27479294967084306, + "grad_norm": 9.491298505181108, + "learning_rate": 4.850297081949773e-06, + "loss": 1.5069, + "step": 1941 + }, + { + "epoch": 0.2749345225454803, + "grad_norm": 9.977631920095577, + "learning_rate": 4.850101646874654e-06, + "loss": 1.3893, + "step": 1942 + }, + { + "epoch": 0.2750760954201175, + "grad_norm": 10.46994549202416, + "learning_rate": 4.8499060882564435e-06, + "loss": 1.3556, + "step": 1943 + }, + { + "epoch": 0.27521766829475475, + "grad_norm": 9.725718854602446, + "learning_rate": 4.849710406105422e-06, + "loss": 1.4852, + "step": 1944 + }, + { + "epoch": 0.2753592411693919, + "grad_norm": 10.098251614460064, + "learning_rate": 4.849514600431877e-06, + "loss": 1.3532, + "step": 1945 + }, + { + "epoch": 0.27550081404402915, + "grad_norm": 8.879811017199177, + "learning_rate": 4.849318671246101e-06, + "loss": 1.2974, + "step": 1946 + }, + { + "epoch": 0.2756423869186664, + "grad_norm": 9.693837314116562, + "learning_rate": 4.849122618558395e-06, + "loss": 1.4952, + "step": 1947 + }, + { + "epoch": 0.2757839597933036, + "grad_norm": 11.265295523853162, + "learning_rate": 4.848926442379064e-06, + "loss": 1.4421, + "step": 1948 + }, + { + "epoch": 0.27592553266794084, + "grad_norm": 10.757786968923952, + "learning_rate": 4.8487301427184204e-06, + "loss": 1.2949, + "step": 1949 + }, + { + "epoch": 0.276067105542578, + "grad_norm": 8.529530110805215, + "learning_rate": 4.848533719586787e-06, + "loss": 1.264, + "step": 1950 + }, + { + "epoch": 0.27620867841721525, + "grad_norm": 10.537805707123933, + "learning_rate": 4.848337172994485e-06, + "loss": 1.4141, + "step": 1951 + }, + { + "epoch": 0.2763502512918525, + "grad_norm": 9.552210638819629, + "learning_rate": 4.848140502951849e-06, + "loss": 1.2039, + "step": 1952 + }, + { + "epoch": 0.2764918241664897, + "grad_norm": 10.133326368162205, + "learning_rate": 4.847943709469218e-06, + "loss": 1.4797, + "step": 1953 + }, + { + "epoch": 0.27663339704112694, + "grad_norm": 11.02267027306589, + "learning_rate": 4.8477467925569365e-06, + "loss": 1.0908, + "step": 1954 + }, + { + "epoch": 0.2767749699157641, + "grad_norm": 9.920407356389113, + "learning_rate": 4.847549752225356e-06, + "loss": 1.2937, + "step": 1955 + }, + { + "epoch": 0.27691654279040134, + "grad_norm": 11.381119978621324, + "learning_rate": 4.847352588484837e-06, + "loss": 1.316, + "step": 1956 + }, + { + "epoch": 0.2770581156650386, + "grad_norm": 10.807182283269242, + "learning_rate": 4.847155301345743e-06, + "loss": 1.4705, + "step": 1957 + }, + { + "epoch": 0.2771996885396758, + "grad_norm": 10.051293619130263, + "learning_rate": 4.846957890818444e-06, + "loss": 1.4419, + "step": 1958 + }, + { + "epoch": 0.27734126141431303, + "grad_norm": 10.352681985456337, + "learning_rate": 4.846760356913318e-06, + "loss": 1.3314, + "step": 1959 + }, + { + "epoch": 0.27748283428895026, + "grad_norm": 9.487264026627356, + "learning_rate": 4.846562699640751e-06, + "loss": 1.124, + "step": 1960 + }, + { + "epoch": 0.27762440716358744, + "grad_norm": 10.627881904655172, + "learning_rate": 4.846364919011132e-06, + "loss": 1.5308, + "step": 1961 + }, + { + "epoch": 0.27776598003822467, + "grad_norm": 8.04871461450838, + "learning_rate": 4.8461670150348585e-06, + "loss": 1.3599, + "step": 1962 + }, + { + "epoch": 0.2779075529128619, + "grad_norm": 10.816390026847712, + "learning_rate": 4.8459689877223346e-06, + "loss": 1.3575, + "step": 1963 + }, + { + "epoch": 0.27804912578749913, + "grad_norm": 11.074341034928903, + "learning_rate": 4.845770837083971e-06, + 
"loss": 1.2996, + "step": 1964 + }, + { + "epoch": 0.27819069866213636, + "grad_norm": 10.519679201985928, + "learning_rate": 4.845572563130182e-06, + "loss": 1.3343, + "step": 1965 + }, + { + "epoch": 0.27833227153677353, + "grad_norm": 8.789659701932791, + "learning_rate": 4.845374165871394e-06, + "loss": 1.2693, + "step": 1966 + }, + { + "epoch": 0.27847384441141076, + "grad_norm": 8.067910914446038, + "learning_rate": 4.845175645318034e-06, + "loss": 1.3378, + "step": 1967 + }, + { + "epoch": 0.278615417286048, + "grad_norm": 7.679658689862033, + "learning_rate": 4.844977001480539e-06, + "loss": 1.3687, + "step": 1968 + }, + { + "epoch": 0.2787569901606852, + "grad_norm": 8.344708676930072, + "learning_rate": 4.8447782343693515e-06, + "loss": 1.188, + "step": 1969 + }, + { + "epoch": 0.27889856303532246, + "grad_norm": 12.059576915702399, + "learning_rate": 4.844579343994921e-06, + "loss": 1.3455, + "step": 1970 + }, + { + "epoch": 0.27904013590995963, + "grad_norm": 11.259797301830984, + "learning_rate": 4.844380330367701e-06, + "loss": 1.4203, + "step": 1971 + }, + { + "epoch": 0.27918170878459686, + "grad_norm": 8.331168185931979, + "learning_rate": 4.844181193498157e-06, + "loss": 1.3567, + "step": 1972 + }, + { + "epoch": 0.2793232816592341, + "grad_norm": 8.81365569632706, + "learning_rate": 4.843981933396755e-06, + "loss": 1.5111, + "step": 1973 + }, + { + "epoch": 0.2794648545338713, + "grad_norm": 10.072144619146606, + "learning_rate": 4.84378255007397e-06, + "loss": 1.2982, + "step": 1974 + }, + { + "epoch": 0.27960642740850855, + "grad_norm": 11.847540336970066, + "learning_rate": 4.843583043540284e-06, + "loss": 1.3606, + "step": 1975 + }, + { + "epoch": 0.2797480002831457, + "grad_norm": 10.01517327252559, + "learning_rate": 4.8433834138061856e-06, + "loss": 1.3367, + "step": 1976 + }, + { + "epoch": 0.27988957315778296, + "grad_norm": 13.726369075899587, + "learning_rate": 4.843183660882168e-06, + "loss": 1.4496, + "step": 1977 + }, + { + "epoch": 0.2800311460324202, + "grad_norm": 9.386937386914827, + "learning_rate": 4.842983784778732e-06, + "loss": 1.3173, + "step": 1978 + }, + { + "epoch": 0.2801727189070574, + "grad_norm": 7.588107803359174, + "learning_rate": 4.842783785506386e-06, + "loss": 1.3139, + "step": 1979 + }, + { + "epoch": 0.28031429178169465, + "grad_norm": 9.688932054561542, + "learning_rate": 4.842583663075643e-06, + "loss": 1.4811, + "step": 1980 + }, + { + "epoch": 0.2804558646563318, + "grad_norm": 8.887345015707597, + "learning_rate": 4.842383417497024e-06, + "loss": 1.3017, + "step": 1981 + }, + { + "epoch": 0.28059743753096905, + "grad_norm": 8.33532380809767, + "learning_rate": 4.842183048781055e-06, + "loss": 1.2095, + "step": 1982 + }, + { + "epoch": 0.2807390104056063, + "grad_norm": 10.630173085796129, + "learning_rate": 4.84198255693827e-06, + "loss": 1.4592, + "step": 1983 + }, + { + "epoch": 0.2808805832802435, + "grad_norm": 9.672621591868083, + "learning_rate": 4.841781941979207e-06, + "loss": 1.2442, + "step": 1984 + }, + { + "epoch": 0.28102215615488074, + "grad_norm": 8.915264506628198, + "learning_rate": 4.8415812039144145e-06, + "loss": 1.2733, + "step": 1985 + }, + { + "epoch": 0.2811637290295179, + "grad_norm": 8.765121907911315, + "learning_rate": 4.841380342754444e-06, + "loss": 1.3803, + "step": 1986 + }, + { + "epoch": 0.28130530190415515, + "grad_norm": 9.515717203730215, + "learning_rate": 4.841179358509854e-06, + "loss": 1.513, + "step": 1987 + }, + { + "epoch": 0.2814468747787924, + "grad_norm": 9.364828873846323, + 
"learning_rate": 4.840978251191212e-06, + "loss": 1.3883, + "step": 1988 + }, + { + "epoch": 0.2815884476534296, + "grad_norm": 14.787267641513688, + "learning_rate": 4.840777020809087e-06, + "loss": 1.3672, + "step": 1989 + }, + { + "epoch": 0.28173002052806684, + "grad_norm": 9.983094612370994, + "learning_rate": 4.8405756673740606e-06, + "loss": 1.2824, + "step": 1990 + }, + { + "epoch": 0.28187159340270407, + "grad_norm": 10.514028714279199, + "learning_rate": 4.840374190896716e-06, + "loss": 1.3531, + "step": 1991 + }, + { + "epoch": 0.28201316627734124, + "grad_norm": 9.283529188006591, + "learning_rate": 4.840172591387646e-06, + "loss": 1.3788, + "step": 1992 + }, + { + "epoch": 0.2821547391519785, + "grad_norm": 11.377814646833341, + "learning_rate": 4.839970868857447e-06, + "loss": 1.4467, + "step": 1993 + }, + { + "epoch": 0.2822963120266157, + "grad_norm": 9.835263423834206, + "learning_rate": 4.839769023316725e-06, + "loss": 1.4185, + "step": 1994 + }, + { + "epoch": 0.28243788490125293, + "grad_norm": 11.399009099358839, + "learning_rate": 4.83956705477609e-06, + "loss": 1.3016, + "step": 1995 + }, + { + "epoch": 0.28257945777589016, + "grad_norm": 10.303404710124003, + "learning_rate": 4.839364963246159e-06, + "loss": 1.4154, + "step": 1996 + }, + { + "epoch": 0.28272103065052734, + "grad_norm": 10.894929360891176, + "learning_rate": 4.839162748737556e-06, + "loss": 1.5901, + "step": 1997 + }, + { + "epoch": 0.28286260352516457, + "grad_norm": 14.749064270338845, + "learning_rate": 4.838960411260911e-06, + "loss": 1.5944, + "step": 1998 + }, + { + "epoch": 0.2830041763998018, + "grad_norm": 8.72961448910649, + "learning_rate": 4.838757950826862e-06, + "loss": 1.3303, + "step": 1999 + }, + { + "epoch": 0.28314574927443903, + "grad_norm": 9.252895417565165, + "learning_rate": 4.838555367446052e-06, + "loss": 1.4545, + "step": 2000 + }, + { + "epoch": 0.28328732214907626, + "grad_norm": 12.666460604413691, + "learning_rate": 4.838352661129129e-06, + "loss": 1.5531, + "step": 2001 + }, + { + "epoch": 0.28342889502371343, + "grad_norm": 11.027641592351262, + "learning_rate": 4.838149831886751e-06, + "loss": 1.2482, + "step": 2002 + }, + { + "epoch": 0.28357046789835066, + "grad_norm": 8.643086363940329, + "learning_rate": 4.8379468797295785e-06, + "loss": 1.3245, + "step": 2003 + }, + { + "epoch": 0.2837120407729879, + "grad_norm": 8.615468826511181, + "learning_rate": 4.837743804668282e-06, + "loss": 1.1585, + "step": 2004 + }, + { + "epoch": 0.2838536136476251, + "grad_norm": 9.691767976799346, + "learning_rate": 4.837540606713538e-06, + "loss": 1.4313, + "step": 2005 + }, + { + "epoch": 0.28399518652226236, + "grad_norm": 11.322768129967159, + "learning_rate": 4.837337285876026e-06, + "loss": 1.5276, + "step": 2006 + }, + { + "epoch": 0.28413675939689953, + "grad_norm": 12.132190577870652, + "learning_rate": 4.837133842166436e-06, + "loss": 1.3579, + "step": 2007 + }, + { + "epoch": 0.28427833227153676, + "grad_norm": 9.503406516223414, + "learning_rate": 4.8369302755954625e-06, + "loss": 1.3617, + "step": 2008 + }, + { + "epoch": 0.284419905146174, + "grad_norm": 7.5562390348671995, + "learning_rate": 4.836726586173807e-06, + "loss": 1.2545, + "step": 2009 + }, + { + "epoch": 0.2845614780208112, + "grad_norm": 11.79843868982579, + "learning_rate": 4.836522773912178e-06, + "loss": 1.5949, + "step": 2010 + }, + { + "epoch": 0.28470305089544845, + "grad_norm": 14.08126135638135, + "learning_rate": 4.836318838821288e-06, + "loss": 1.4672, + "step": 2011 + }, + { + "epoch": 
0.2848446237700856, + "grad_norm": 11.859600160494319, + "learning_rate": 4.836114780911859e-06, + "loss": 1.5006, + "step": 2012 + }, + { + "epoch": 0.28498619664472286, + "grad_norm": 7.772269463124775, + "learning_rate": 4.835910600194618e-06, + "loss": 1.2247, + "step": 2013 + }, + { + "epoch": 0.2851277695193601, + "grad_norm": 7.605368685541653, + "learning_rate": 4.835706296680298e-06, + "loss": 1.2956, + "step": 2014 + }, + { + "epoch": 0.2852693423939973, + "grad_norm": 9.696617118610071, + "learning_rate": 4.83550187037964e-06, + "loss": 1.3699, + "step": 2015 + }, + { + "epoch": 0.28541091526863455, + "grad_norm": 9.873680316289425, + "learning_rate": 4.8352973213033894e-06, + "loss": 1.5184, + "step": 2016 + }, + { + "epoch": 0.2855524881432718, + "grad_norm": 8.855312808783564, + "learning_rate": 4.835092649462301e-06, + "loss": 1.3545, + "step": 2017 + }, + { + "epoch": 0.28569406101790895, + "grad_norm": 9.14654922869651, + "learning_rate": 4.834887854867132e-06, + "loss": 1.3961, + "step": 2018 + }, + { + "epoch": 0.2858356338925462, + "grad_norm": 10.564496997101084, + "learning_rate": 4.83468293752865e-06, + "loss": 1.4352, + "step": 2019 + }, + { + "epoch": 0.2859772067671834, + "grad_norm": 9.868649915751798, + "learning_rate": 4.834477897457627e-06, + "loss": 1.3999, + "step": 2020 + }, + { + "epoch": 0.28611877964182064, + "grad_norm": 11.329772913543366, + "learning_rate": 4.834272734664841e-06, + "loss": 1.4467, + "step": 2021 + }, + { + "epoch": 0.28626035251645787, + "grad_norm": 12.157721658388613, + "learning_rate": 4.8340674491610786e-06, + "loss": 1.3021, + "step": 2022 + }, + { + "epoch": 0.28640192539109505, + "grad_norm": 11.861203179658489, + "learning_rate": 4.83386204095713e-06, + "loss": 1.4495, + "step": 2023 + }, + { + "epoch": 0.2865434982657323, + "grad_norm": 8.633196046343983, + "learning_rate": 4.833656510063794e-06, + "loss": 1.2742, + "step": 2024 + }, + { + "epoch": 0.2866850711403695, + "grad_norm": 10.730482259046829, + "learning_rate": 4.833450856491875e-06, + "loss": 1.3002, + "step": 2025 + }, + { + "epoch": 0.28682664401500674, + "grad_norm": 10.899440968028008, + "learning_rate": 4.833245080252186e-06, + "loss": 1.51, + "step": 2026 + }, + { + "epoch": 0.28696821688964397, + "grad_norm": 13.488952072082164, + "learning_rate": 4.833039181355542e-06, + "loss": 1.5339, + "step": 2027 + }, + { + "epoch": 0.28710978976428114, + "grad_norm": 11.608685602638158, + "learning_rate": 4.832833159812768e-06, + "loss": 1.3757, + "step": 2028 + }, + { + "epoch": 0.2872513626389184, + "grad_norm": 13.041299800567474, + "learning_rate": 4.832627015634694e-06, + "loss": 1.3655, + "step": 2029 + }, + { + "epoch": 0.2873929355135556, + "grad_norm": 9.930386666474933, + "learning_rate": 4.832420748832157e-06, + "loss": 1.4826, + "step": 2030 + }, + { + "epoch": 0.28753450838819283, + "grad_norm": 8.906193461991597, + "learning_rate": 4.832214359416001e-06, + "loss": 1.3304, + "step": 2031 + }, + { + "epoch": 0.28767608126283006, + "grad_norm": 10.91497401347542, + "learning_rate": 4.8320078473970745e-06, + "loss": 1.3706, + "step": 2032 + }, + { + "epoch": 0.28781765413746724, + "grad_norm": 10.19635486170525, + "learning_rate": 4.831801212786234e-06, + "loss": 1.2404, + "step": 2033 + }, + { + "epoch": 0.28795922701210447, + "grad_norm": 11.555077061975966, + "learning_rate": 4.831594455594343e-06, + "loss": 1.3211, + "step": 2034 + }, + { + "epoch": 0.2881007998867417, + "grad_norm": 8.767365580306892, + "learning_rate": 4.8313875758322695e-06, + "loss": 
1.3108, + "step": 2035 + }, + { + "epoch": 0.28824237276137893, + "grad_norm": 10.575707118424162, + "learning_rate": 4.83118057351089e-06, + "loss": 1.2787, + "step": 2036 + }, + { + "epoch": 0.28838394563601616, + "grad_norm": 7.918289848031053, + "learning_rate": 4.830973448641086e-06, + "loss": 1.2351, + "step": 2037 + }, + { + "epoch": 0.28852551851065333, + "grad_norm": 9.8764477404937, + "learning_rate": 4.830766201233746e-06, + "loss": 1.3567, + "step": 2038 + }, + { + "epoch": 0.28866709138529056, + "grad_norm": 8.545454224488715, + "learning_rate": 4.8305588312997635e-06, + "loss": 1.4107, + "step": 2039 + }, + { + "epoch": 0.2888086642599278, + "grad_norm": 9.403464275513539, + "learning_rate": 4.8303513388500414e-06, + "loss": 1.2542, + "step": 2040 + }, + { + "epoch": 0.288950237134565, + "grad_norm": 9.766972172703436, + "learning_rate": 4.8301437238954875e-06, + "loss": 1.4099, + "step": 2041 + }, + { + "epoch": 0.28909181000920225, + "grad_norm": 9.382850310212119, + "learning_rate": 4.829935986447015e-06, + "loss": 1.3965, + "step": 2042 + }, + { + "epoch": 0.28923338288383943, + "grad_norm": 10.309873304208356, + "learning_rate": 4.829728126515545e-06, + "loss": 1.3775, + "step": 2043 + }, + { + "epoch": 0.28937495575847666, + "grad_norm": 10.739647071765171, + "learning_rate": 4.829520144112005e-06, + "loss": 1.4254, + "step": 2044 + }, + { + "epoch": 0.2895165286331139, + "grad_norm": 11.775895822064212, + "learning_rate": 4.829312039247328e-06, + "loss": 1.4168, + "step": 2045 + }, + { + "epoch": 0.2896581015077511, + "grad_norm": 10.802443574044261, + "learning_rate": 4.829103811932453e-06, + "loss": 1.538, + "step": 2046 + }, + { + "epoch": 0.28979967438238835, + "grad_norm": 11.099190938520973, + "learning_rate": 4.828895462178329e-06, + "loss": 1.4369, + "step": 2047 + }, + { + "epoch": 0.2899412472570256, + "grad_norm": 11.018880766094274, + "learning_rate": 4.828686989995905e-06, + "loss": 1.4746, + "step": 2048 + }, + { + "epoch": 0.29008282013166276, + "grad_norm": 11.373787951608913, + "learning_rate": 4.828478395396143e-06, + "loss": 1.4016, + "step": 2049 + }, + { + "epoch": 0.2902243930063, + "grad_norm": 9.844088303338495, + "learning_rate": 4.828269678390008e-06, + "loss": 1.3384, + "step": 2050 + }, + { + "epoch": 0.2903659658809372, + "grad_norm": 11.415093158036015, + "learning_rate": 4.828060838988473e-06, + "loss": 1.4478, + "step": 2051 + }, + { + "epoch": 0.29050753875557445, + "grad_norm": 10.191616233104105, + "learning_rate": 4.827851877202515e-06, + "loss": 1.3685, + "step": 2052 + }, + { + "epoch": 0.2906491116302117, + "grad_norm": 9.62846411256708, + "learning_rate": 4.827642793043119e-06, + "loss": 1.3796, + "step": 2053 + }, + { + "epoch": 0.29079068450484885, + "grad_norm": 9.293404272712676, + "learning_rate": 4.827433586521277e-06, + "loss": 1.272, + "step": 2054 + }, + { + "epoch": 0.2909322573794861, + "grad_norm": 10.509401018403713, + "learning_rate": 4.827224257647987e-06, + "loss": 1.3307, + "step": 2055 + }, + { + "epoch": 0.2910738302541233, + "grad_norm": 8.479932095957537, + "learning_rate": 4.827014806434254e-06, + "loss": 1.1794, + "step": 2056 + }, + { + "epoch": 0.29121540312876054, + "grad_norm": 9.930651338081262, + "learning_rate": 4.826805232891087e-06, + "loss": 1.3967, + "step": 2057 + }, + { + "epoch": 0.29135697600339777, + "grad_norm": 11.034848239617377, + "learning_rate": 4.826595537029503e-06, + "loss": 1.2711, + "step": 2058 + }, + { + "epoch": 0.29149854887803495, + "grad_norm": 10.979466781037157, + 
"learning_rate": 4.826385718860527e-06, + "loss": 1.4204, + "step": 2059 + }, + { + "epoch": 0.2916401217526722, + "grad_norm": 8.986111415081487, + "learning_rate": 4.826175778395188e-06, + "loss": 1.348, + "step": 2060 + }, + { + "epoch": 0.2917816946273094, + "grad_norm": 9.564909986599863, + "learning_rate": 4.825965715644523e-06, + "loss": 1.3141, + "step": 2061 + }, + { + "epoch": 0.29192326750194664, + "grad_norm": 10.359869161243589, + "learning_rate": 4.825755530619576e-06, + "loss": 1.4402, + "step": 2062 + }, + { + "epoch": 0.29206484037658387, + "grad_norm": 9.44027123594978, + "learning_rate": 4.825545223331392e-06, + "loss": 1.2864, + "step": 2063 + }, + { + "epoch": 0.29220641325122104, + "grad_norm": 10.67094593835234, + "learning_rate": 4.825334793791032e-06, + "loss": 1.3271, + "step": 2064 + }, + { + "epoch": 0.2923479861258583, + "grad_norm": 11.241704298163324, + "learning_rate": 4.825124242009556e-06, + "loss": 1.5296, + "step": 2065 + }, + { + "epoch": 0.2924895590004955, + "grad_norm": 10.268329513772821, + "learning_rate": 4.824913567998031e-06, + "loss": 1.4393, + "step": 2066 + }, + { + "epoch": 0.29263113187513273, + "grad_norm": 9.270317240433643, + "learning_rate": 4.8247027717675335e-06, + "loss": 1.4089, + "step": 2067 + }, + { + "epoch": 0.29277270474976996, + "grad_norm": 11.519778947828641, + "learning_rate": 4.8244918533291444e-06, + "loss": 1.419, + "step": 2068 + }, + { + "epoch": 0.29291427762440714, + "grad_norm": 9.027187397228857, + "learning_rate": 4.824280812693952e-06, + "loss": 1.3113, + "step": 2069 + }, + { + "epoch": 0.29305585049904437, + "grad_norm": 8.875857029757482, + "learning_rate": 4.824069649873051e-06, + "loss": 1.3806, + "step": 2070 + }, + { + "epoch": 0.2931974233736816, + "grad_norm": 9.06220029138033, + "learning_rate": 4.82385836487754e-06, + "loss": 1.4462, + "step": 2071 + }, + { + "epoch": 0.29333899624831883, + "grad_norm": 9.218802552154237, + "learning_rate": 4.823646957718529e-06, + "loss": 1.2526, + "step": 2072 + }, + { + "epoch": 0.29348056912295606, + "grad_norm": 10.128778576735844, + "learning_rate": 4.823435428407129e-06, + "loss": 1.4443, + "step": 2073 + }, + { + "epoch": 0.29362214199759323, + "grad_norm": 11.148948695138746, + "learning_rate": 4.823223776954462e-06, + "loss": 1.4588, + "step": 2074 + }, + { + "epoch": 0.29376371487223046, + "grad_norm": 13.59133924834034, + "learning_rate": 4.8230120033716525e-06, + "loss": 1.5776, + "step": 2075 + }, + { + "epoch": 0.2939052877468677, + "grad_norm": 9.776923542083875, + "learning_rate": 4.822800107669835e-06, + "loss": 1.3683, + "step": 2076 + }, + { + "epoch": 0.2940468606215049, + "grad_norm": 9.139804483739056, + "learning_rate": 4.822588089860146e-06, + "loss": 1.4074, + "step": 2077 + }, + { + "epoch": 0.29418843349614215, + "grad_norm": 8.997344154986054, + "learning_rate": 4.822375949953735e-06, + "loss": 1.3273, + "step": 2078 + }, + { + "epoch": 0.2943300063707794, + "grad_norm": 10.825133247524493, + "learning_rate": 4.82216368796175e-06, + "loss": 1.3079, + "step": 2079 + }, + { + "epoch": 0.29447157924541656, + "grad_norm": 8.97234546221119, + "learning_rate": 4.8219513038953534e-06, + "loss": 1.4166, + "step": 2080 + }, + { + "epoch": 0.2946131521200538, + "grad_norm": 8.189103995882034, + "learning_rate": 4.821738797765707e-06, + "loss": 1.2642, + "step": 2081 + }, + { + "epoch": 0.294754724994691, + "grad_norm": 8.90490712532391, + "learning_rate": 4.8215261695839825e-06, + "loss": 1.3606, + "step": 2082 + }, + { + "epoch": 
0.29489629786932825, + "grad_norm": 10.550594323941135, + "learning_rate": 4.821313419361359e-06, + "loss": 1.2495, + "step": 2083 + }, + { + "epoch": 0.2950378707439655, + "grad_norm": 8.704619958985637, + "learning_rate": 4.82110054710902e-06, + "loss": 1.2937, + "step": 2084 + }, + { + "epoch": 0.29517944361860265, + "grad_norm": 10.505472482825432, + "learning_rate": 4.820887552838156e-06, + "loss": 1.3797, + "step": 2085 + }, + { + "epoch": 0.2953210164932399, + "grad_norm": 8.57094096658788, + "learning_rate": 4.820674436559964e-06, + "loss": 1.5596, + "step": 2086 + }, + { + "epoch": 0.2954625893678771, + "grad_norm": 12.362399381068032, + "learning_rate": 4.8204611982856465e-06, + "loss": 1.4066, + "step": 2087 + }, + { + "epoch": 0.29560416224251435, + "grad_norm": 10.221129160872696, + "learning_rate": 4.820247838026414e-06, + "loss": 1.3604, + "step": 2088 + }, + { + "epoch": 0.2957457351171516, + "grad_norm": 9.675624730642598, + "learning_rate": 4.820034355793483e-06, + "loss": 1.3795, + "step": 2089 + }, + { + "epoch": 0.29588730799178875, + "grad_norm": 8.660483526602356, + "learning_rate": 4.819820751598076e-06, + "loss": 1.4066, + "step": 2090 + }, + { + "epoch": 0.296028880866426, + "grad_norm": 8.859606650686182, + "learning_rate": 4.819607025451422e-06, + "loss": 1.3032, + "step": 2091 + }, + { + "epoch": 0.2961704537410632, + "grad_norm": 9.576872751541536, + "learning_rate": 4.819393177364756e-06, + "loss": 1.3406, + "step": 2092 + }, + { + "epoch": 0.29631202661570044, + "grad_norm": 10.99337777293214, + "learning_rate": 4.81917920734932e-06, + "loss": 1.4148, + "step": 2093 + }, + { + "epoch": 0.29645359949033767, + "grad_norm": 11.93353109639854, + "learning_rate": 4.818965115416362e-06, + "loss": 1.3994, + "step": 2094 + }, + { + "epoch": 0.29659517236497485, + "grad_norm": 9.7823189343279, + "learning_rate": 4.818750901577137e-06, + "loss": 1.3582, + "step": 2095 + }, + { + "epoch": 0.2967367452396121, + "grad_norm": 9.221642819425533, + "learning_rate": 4.818536565842907e-06, + "loss": 1.3051, + "step": 2096 + }, + { + "epoch": 0.2968783181142493, + "grad_norm": 8.290119531595701, + "learning_rate": 4.8183221082249375e-06, + "loss": 1.1853, + "step": 2097 + }, + { + "epoch": 0.29701989098888654, + "grad_norm": 8.182777498821219, + "learning_rate": 4.8181075287345045e-06, + "loss": 1.2591, + "step": 2098 + }, + { + "epoch": 0.29716146386352377, + "grad_norm": 10.577796468887414, + "learning_rate": 4.817892827382886e-06, + "loss": 1.5066, + "step": 2099 + }, + { + "epoch": 0.29730303673816094, + "grad_norm": 11.463507428508805, + "learning_rate": 4.81767800418137e-06, + "loss": 1.3751, + "step": 2100 + }, + { + "epoch": 0.29744460961279817, + "grad_norm": 9.603549587150138, + "learning_rate": 4.8174630591412495e-06, + "loss": 1.383, + "step": 2101 + }, + { + "epoch": 0.2975861824874354, + "grad_norm": 10.575090297015864, + "learning_rate": 4.817247992273824e-06, + "loss": 1.3946, + "step": 2102 + }, + { + "epoch": 0.29772775536207263, + "grad_norm": 8.71051883803338, + "learning_rate": 4.8170328035904e-06, + "loss": 1.3906, + "step": 2103 + }, + { + "epoch": 0.29786932823670986, + "grad_norm": 10.461757226097943, + "learning_rate": 4.816817493102289e-06, + "loss": 1.326, + "step": 2104 + }, + { + "epoch": 0.2980109011113471, + "grad_norm": 10.508427099242667, + "learning_rate": 4.81660206082081e-06, + "loss": 1.3709, + "step": 2105 + }, + { + "epoch": 0.29815247398598427, + "grad_norm": 9.618032336216583, + "learning_rate": 4.816386506757287e-06, + "loss": 1.161, 
+ "step": 2106 + }, + { + "epoch": 0.2982940468606215, + "grad_norm": 9.327085810912275, + "learning_rate": 4.816170830923053e-06, + "loss": 1.21, + "step": 2107 + }, + { + "epoch": 0.29843561973525873, + "grad_norm": 9.141068615704167, + "learning_rate": 4.815955033329446e-06, + "loss": 1.2455, + "step": 2108 + }, + { + "epoch": 0.29857719260989596, + "grad_norm": 13.491622020576093, + "learning_rate": 4.815739113987809e-06, + "loss": 1.6216, + "step": 2109 + }, + { + "epoch": 0.2987187654845332, + "grad_norm": 10.044947129258064, + "learning_rate": 4.815523072909494e-06, + "loss": 1.3361, + "step": 2110 + }, + { + "epoch": 0.29886033835917036, + "grad_norm": 9.918100290827184, + "learning_rate": 4.815306910105857e-06, + "loss": 1.245, + "step": 2111 + }, + { + "epoch": 0.2990019112338076, + "grad_norm": 9.750497267342277, + "learning_rate": 4.815090625588263e-06, + "loss": 1.317, + "step": 2112 + }, + { + "epoch": 0.2991434841084448, + "grad_norm": 10.802003208453035, + "learning_rate": 4.81487421936808e-06, + "loss": 1.3706, + "step": 2113 + }, + { + "epoch": 0.29928505698308205, + "grad_norm": 9.461971261131815, + "learning_rate": 4.814657691456685e-06, + "loss": 1.3726, + "step": 2114 + }, + { + "epoch": 0.2994266298577193, + "grad_norm": 10.012700979665933, + "learning_rate": 4.814441041865463e-06, + "loss": 1.462, + "step": 2115 + }, + { + "epoch": 0.29956820273235646, + "grad_norm": 8.594717186340898, + "learning_rate": 4.814224270605799e-06, + "loss": 1.3599, + "step": 2116 + }, + { + "epoch": 0.2997097756069937, + "grad_norm": 9.51419372317741, + "learning_rate": 4.814007377689093e-06, + "loss": 1.4325, + "step": 2117 + }, + { + "epoch": 0.2998513484816309, + "grad_norm": 9.524917449993948, + "learning_rate": 4.813790363126743e-06, + "loss": 1.2791, + "step": 2118 + }, + { + "epoch": 0.29999292135626815, + "grad_norm": 8.204016413619698, + "learning_rate": 4.813573226930158e-06, + "loss": 1.34, + "step": 2119 + }, + { + "epoch": 0.3001344942309054, + "grad_norm": 9.31606925818716, + "learning_rate": 4.813355969110755e-06, + "loss": 1.2854, + "step": 2120 + }, + { + "epoch": 0.30027606710554255, + "grad_norm": 10.39322871806616, + "learning_rate": 4.813138589679953e-06, + "loss": 1.3706, + "step": 2121 + }, + { + "epoch": 0.3004176399801798, + "grad_norm": 9.830934350438758, + "learning_rate": 4.812921088649181e-06, + "loss": 1.3557, + "step": 2122 + }, + { + "epoch": 0.300559212854817, + "grad_norm": 10.891898870194083, + "learning_rate": 4.812703466029871e-06, + "loss": 1.3287, + "step": 2123 + }, + { + "epoch": 0.30070078572945425, + "grad_norm": 9.249139281771752, + "learning_rate": 4.812485721833465e-06, + "loss": 1.3189, + "step": 2124 + }, + { + "epoch": 0.3008423586040915, + "grad_norm": 9.101850434689048, + "learning_rate": 4.812267856071407e-06, + "loss": 1.3882, + "step": 2125 + }, + { + "epoch": 0.30098393147872865, + "grad_norm": 10.922647211502055, + "learning_rate": 4.812049868755154e-06, + "loss": 1.3182, + "step": 2126 + }, + { + "epoch": 0.3011255043533659, + "grad_norm": 11.62284978854288, + "learning_rate": 4.8118317598961625e-06, + "loss": 1.3851, + "step": 2127 + }, + { + "epoch": 0.3012670772280031, + "grad_norm": 9.72555333306426, + "learning_rate": 4.811613529505899e-06, + "loss": 1.3358, + "step": 2128 + }, + { + "epoch": 0.30140865010264034, + "grad_norm": 8.110378313565906, + "learning_rate": 4.811395177595836e-06, + "loss": 1.2919, + "step": 2129 + }, + { + "epoch": 0.30155022297727757, + "grad_norm": 10.578732629450947, + "learning_rate": 
4.811176704177452e-06, + "loss": 1.4054, + "step": 2130 + }, + { + "epoch": 0.30169179585191475, + "grad_norm": 8.637127297692386, + "learning_rate": 4.810958109262232e-06, + "loss": 1.2347, + "step": 2131 + }, + { + "epoch": 0.301833368726552, + "grad_norm": 10.674299330575776, + "learning_rate": 4.810739392861667e-06, + "loss": 1.4166, + "step": 2132 + }, + { + "epoch": 0.3019749416011892, + "grad_norm": 10.467692854156777, + "learning_rate": 4.810520554987256e-06, + "loss": 1.4355, + "step": 2133 + }, + { + "epoch": 0.30211651447582644, + "grad_norm": 10.889367445552464, + "learning_rate": 4.810301595650501e-06, + "loss": 1.3214, + "step": 2134 + }, + { + "epoch": 0.30225808735046367, + "grad_norm": 10.895388728468278, + "learning_rate": 4.810082514862915e-06, + "loss": 1.5507, + "step": 2135 + }, + { + "epoch": 0.3023996602251009, + "grad_norm": 8.831910744561583, + "learning_rate": 4.809863312636013e-06, + "loss": 1.3049, + "step": 2136 + }, + { + "epoch": 0.30254123309973807, + "grad_norm": 8.917596596209899, + "learning_rate": 4.8096439889813186e-06, + "loss": 1.5085, + "step": 2137 + }, + { + "epoch": 0.3026828059743753, + "grad_norm": 8.665414524101644, + "learning_rate": 4.809424543910363e-06, + "loss": 1.3044, + "step": 2138 + }, + { + "epoch": 0.30282437884901253, + "grad_norm": 10.472495613160971, + "learning_rate": 4.80920497743468e-06, + "loss": 1.2828, + "step": 2139 + }, + { + "epoch": 0.30296595172364976, + "grad_norm": 10.248632200205922, + "learning_rate": 4.808985289565813e-06, + "loss": 1.4064, + "step": 2140 + }, + { + "epoch": 0.303107524598287, + "grad_norm": 11.866525939221257, + "learning_rate": 4.808765480315312e-06, + "loss": 1.4072, + "step": 2141 + }, + { + "epoch": 0.30324909747292417, + "grad_norm": 9.57578685903809, + "learning_rate": 4.80854554969473e-06, + "loss": 1.2522, + "step": 2142 + }, + { + "epoch": 0.3033906703475614, + "grad_norm": 10.062908661878268, + "learning_rate": 4.80832549771563e-06, + "loss": 1.2444, + "step": 2143 + }, + { + "epoch": 0.3035322432221986, + "grad_norm": 8.320168536019775, + "learning_rate": 4.808105324389581e-06, + "loss": 1.4341, + "step": 2144 + }, + { + "epoch": 0.30367381609683586, + "grad_norm": 9.577208931009238, + "learning_rate": 4.807885029728155e-06, + "loss": 1.2097, + "step": 2145 + }, + { + "epoch": 0.3038153889714731, + "grad_norm": 11.866626879450358, + "learning_rate": 4.807664613742934e-06, + "loss": 1.387, + "step": 2146 + }, + { + "epoch": 0.30395696184611026, + "grad_norm": 11.990442284451305, + "learning_rate": 4.807444076445506e-06, + "loss": 1.3301, + "step": 2147 + }, + { + "epoch": 0.3040985347207475, + "grad_norm": 8.813060120441902, + "learning_rate": 4.807223417847462e-06, + "loss": 1.2563, + "step": 2148 + }, + { + "epoch": 0.3042401075953847, + "grad_norm": 8.843693062427137, + "learning_rate": 4.807002637960403e-06, + "loss": 1.4235, + "step": 2149 + }, + { + "epoch": 0.30438168047002195, + "grad_norm": 9.913397199283489, + "learning_rate": 4.806781736795937e-06, + "loss": 1.3566, + "step": 2150 + }, + { + "epoch": 0.3045232533446592, + "grad_norm": 12.563138271837364, + "learning_rate": 4.806560714365674e-06, + "loss": 1.2917, + "step": 2151 + }, + { + "epoch": 0.30466482621929636, + "grad_norm": 11.165381020699757, + "learning_rate": 4.806339570681234e-06, + "loss": 1.3521, + "step": 2152 + }, + { + "epoch": 0.3048063990939336, + "grad_norm": 9.014799876940645, + "learning_rate": 4.8061183057542424e-06, + "loss": 1.3374, + "step": 2153 + }, + { + "epoch": 0.3049479719685708, + 
"grad_norm": 8.309626261802624, + "learning_rate": 4.805896919596332e-06, + "loss": 1.2374, + "step": 2154 + }, + { + "epoch": 0.30508954484320805, + "grad_norm": 8.973004863710912, + "learning_rate": 4.805675412219139e-06, + "loss": 1.4541, + "step": 2155 + }, + { + "epoch": 0.3052311177178453, + "grad_norm": 9.853985021958795, + "learning_rate": 4.805453783634309e-06, + "loss": 1.2393, + "step": 2156 + }, + { + "epoch": 0.30537269059248245, + "grad_norm": 12.049780904590458, + "learning_rate": 4.805232033853493e-06, + "loss": 1.512, + "step": 2157 + }, + { + "epoch": 0.3055142634671197, + "grad_norm": 10.628002864379267, + "learning_rate": 4.805010162888347e-06, + "loss": 1.3717, + "step": 2158 + }, + { + "epoch": 0.3056558363417569, + "grad_norm": 8.528608758666236, + "learning_rate": 4.804788170750536e-06, + "loss": 1.4219, + "step": 2159 + }, + { + "epoch": 0.30579740921639414, + "grad_norm": 6.78648322201603, + "learning_rate": 4.804566057451729e-06, + "loss": 1.1867, + "step": 2160 + }, + { + "epoch": 0.3059389820910314, + "grad_norm": 12.751289452068354, + "learning_rate": 4.8043438230036034e-06, + "loss": 1.3117, + "step": 2161 + }, + { + "epoch": 0.3060805549656686, + "grad_norm": 7.218541526261204, + "learning_rate": 4.804121467417841e-06, + "loss": 1.2878, + "step": 2162 + }, + { + "epoch": 0.3062221278403058, + "grad_norm": 10.59821200855693, + "learning_rate": 4.8038989907061305e-06, + "loss": 1.4835, + "step": 2163 + }, + { + "epoch": 0.306363700714943, + "grad_norm": 10.455945191897145, + "learning_rate": 4.803676392880168e-06, + "loss": 1.3167, + "step": 2164 + }, + { + "epoch": 0.30650527358958024, + "grad_norm": 8.600561487365184, + "learning_rate": 4.803453673951656e-06, + "loss": 1.3833, + "step": 2165 + }, + { + "epoch": 0.30664684646421747, + "grad_norm": 10.799638982319617, + "learning_rate": 4.803230833932302e-06, + "loss": 1.2631, + "step": 2166 + }, + { + "epoch": 0.3067884193388547, + "grad_norm": 11.055646767125527, + "learning_rate": 4.803007872833819e-06, + "loss": 1.4719, + "step": 2167 + }, + { + "epoch": 0.3069299922134919, + "grad_norm": 8.734533973843961, + "learning_rate": 4.8027847906679305e-06, + "loss": 1.3097, + "step": 2168 + }, + { + "epoch": 0.3070715650881291, + "grad_norm": 9.331467941610928, + "learning_rate": 4.802561587446362e-06, + "loss": 1.3733, + "step": 2169 + }, + { + "epoch": 0.30721313796276634, + "grad_norm": 13.078388097525329, + "learning_rate": 4.802338263180848e-06, + "loss": 1.5777, + "step": 2170 + }, + { + "epoch": 0.30735471083740357, + "grad_norm": 10.253762647803976, + "learning_rate": 4.802114817883128e-06, + "loss": 1.2179, + "step": 2171 + }, + { + "epoch": 0.3074962837120408, + "grad_norm": 9.510203052362655, + "learning_rate": 4.801891251564949e-06, + "loss": 1.288, + "step": 2172 + }, + { + "epoch": 0.30763785658667797, + "grad_norm": 10.616907246577124, + "learning_rate": 4.801667564238063e-06, + "loss": 1.4672, + "step": 2173 + }, + { + "epoch": 0.3077794294613152, + "grad_norm": 12.412792104797134, + "learning_rate": 4.801443755914229e-06, + "loss": 1.3913, + "step": 2174 + }, + { + "epoch": 0.30792100233595243, + "grad_norm": 13.572873696849353, + "learning_rate": 4.801219826605213e-06, + "loss": 1.4041, + "step": 2175 + }, + { + "epoch": 0.30806257521058966, + "grad_norm": 9.510719676098345, + "learning_rate": 4.8009957763227875e-06, + "loss": 1.4456, + "step": 2176 + }, + { + "epoch": 0.3082041480852269, + "grad_norm": 10.465766686624884, + "learning_rate": 4.800771605078728e-06, + "loss": 1.4303, + "step": 
2177 + }, + { + "epoch": 0.30834572095986407, + "grad_norm": 9.451444113859127, + "learning_rate": 4.800547312884822e-06, + "loss": 1.3074, + "step": 2178 + }, + { + "epoch": 0.3084872938345013, + "grad_norm": 10.8182594086711, + "learning_rate": 4.800322899752859e-06, + "loss": 1.3681, + "step": 2179 + }, + { + "epoch": 0.3086288667091385, + "grad_norm": 8.224565544364923, + "learning_rate": 4.800098365694636e-06, + "loss": 1.3655, + "step": 2180 + }, + { + "epoch": 0.30877043958377576, + "grad_norm": 11.465042720803257, + "learning_rate": 4.799873710721958e-06, + "loss": 1.4696, + "step": 2181 + }, + { + "epoch": 0.308912012458413, + "grad_norm": 9.94256424370667, + "learning_rate": 4.799648934846633e-06, + "loss": 1.3052, + "step": 2182 + }, + { + "epoch": 0.30905358533305016, + "grad_norm": 7.477948131135514, + "learning_rate": 4.799424038080478e-06, + "loss": 1.2722, + "step": 2183 + }, + { + "epoch": 0.3091951582076874, + "grad_norm": 9.86239437535031, + "learning_rate": 4.799199020435316e-06, + "loss": 1.3953, + "step": 2184 + }, + { + "epoch": 0.3093367310823246, + "grad_norm": 10.05652663832517, + "learning_rate": 4.798973881922975e-06, + "loss": 1.3786, + "step": 2185 + }, + { + "epoch": 0.30947830395696185, + "grad_norm": 9.342370381138767, + "learning_rate": 4.798748622555293e-06, + "loss": 1.3066, + "step": 2186 + }, + { + "epoch": 0.3096198768315991, + "grad_norm": 10.84374788927393, + "learning_rate": 4.798523242344109e-06, + "loss": 1.5314, + "step": 2187 + }, + { + "epoch": 0.30976144970623626, + "grad_norm": 8.644672018595156, + "learning_rate": 4.798297741301271e-06, + "loss": 1.3766, + "step": 2188 + }, + { + "epoch": 0.3099030225808735, + "grad_norm": 9.999206892987539, + "learning_rate": 4.798072119438636e-06, + "loss": 1.3785, + "step": 2189 + }, + { + "epoch": 0.3100445954555107, + "grad_norm": 7.997373149661199, + "learning_rate": 4.797846376768062e-06, + "loss": 1.2384, + "step": 2190 + }, + { + "epoch": 0.31018616833014795, + "grad_norm": 8.239192009726715, + "learning_rate": 4.797620513301418e-06, + "loss": 1.3864, + "step": 2191 + }, + { + "epoch": 0.3103277412047852, + "grad_norm": 7.8792294240705205, + "learning_rate": 4.797394529050577e-06, + "loss": 1.3436, + "step": 2192 + }, + { + "epoch": 0.3104693140794224, + "grad_norm": 11.454182116758368, + "learning_rate": 4.797168424027419e-06, + "loss": 1.4547, + "step": 2193 + }, + { + "epoch": 0.3106108869540596, + "grad_norm": 8.108193724927705, + "learning_rate": 4.796942198243828e-06, + "loss": 1.2429, + "step": 2194 + }, + { + "epoch": 0.3107524598286968, + "grad_norm": 9.064592212020067, + "learning_rate": 4.796715851711699e-06, + "loss": 1.3266, + "step": 2195 + }, + { + "epoch": 0.31089403270333404, + "grad_norm": 9.215443991419207, + "learning_rate": 4.7964893844429315e-06, + "loss": 1.3649, + "step": 2196 + }, + { + "epoch": 0.3110356055779713, + "grad_norm": 8.390685921275578, + "learning_rate": 4.796262796449428e-06, + "loss": 1.1835, + "step": 2197 + }, + { + "epoch": 0.3111771784526085, + "grad_norm": 10.643321318850102, + "learning_rate": 4.7960360877431025e-06, + "loss": 1.222, + "step": 2198 + }, + { + "epoch": 0.3113187513272457, + "grad_norm": 9.401739616938295, + "learning_rate": 4.795809258335872e-06, + "loss": 1.4686, + "step": 2199 + }, + { + "epoch": 0.3114603242018829, + "grad_norm": 11.001899208493398, + "learning_rate": 4.795582308239659e-06, + "loss": 1.2981, + "step": 2200 + }, + { + "epoch": 0.31160189707652014, + "grad_norm": 11.63112290449771, + "learning_rate": 
4.795355237466397e-06, + "loss": 1.5022, + "step": 2201 + }, + { + "epoch": 0.31174346995115737, + "grad_norm": 12.078431987932431, + "learning_rate": 4.795128046028021e-06, + "loss": 1.4802, + "step": 2202 + }, + { + "epoch": 0.3118850428257946, + "grad_norm": 9.821779953256707, + "learning_rate": 4.794900733936476e-06, + "loss": 1.3743, + "step": 2203 + }, + { + "epoch": 0.3120266157004318, + "grad_norm": 10.702598569968993, + "learning_rate": 4.794673301203709e-06, + "loss": 1.4356, + "step": 2204 + }, + { + "epoch": 0.312168188575069, + "grad_norm": 11.096660393434009, + "learning_rate": 4.794445747841679e-06, + "loss": 1.4567, + "step": 2205 + }, + { + "epoch": 0.31230976144970624, + "grad_norm": 12.722072158069109, + "learning_rate": 4.794218073862346e-06, + "loss": 1.2958, + "step": 2206 + }, + { + "epoch": 0.31245133432434347, + "grad_norm": 8.722778702118541, + "learning_rate": 4.79399027927768e-06, + "loss": 1.2662, + "step": 2207 + }, + { + "epoch": 0.3125929071989807, + "grad_norm": 13.433128604007218, + "learning_rate": 4.793762364099655e-06, + "loss": 1.5376, + "step": 2208 + }, + { + "epoch": 0.31273448007361787, + "grad_norm": 9.228789058449195, + "learning_rate": 4.793534328340253e-06, + "loss": 1.3998, + "step": 2209 + }, + { + "epoch": 0.3128760529482551, + "grad_norm": 11.223913508043225, + "learning_rate": 4.7933061720114615e-06, + "loss": 1.3947, + "step": 2210 + }, + { + "epoch": 0.31301762582289233, + "grad_norm": 16.991958511388095, + "learning_rate": 4.793077895125274e-06, + "loss": 1.4702, + "step": 2211 + }, + { + "epoch": 0.31315919869752956, + "grad_norm": 11.11055580870813, + "learning_rate": 4.792849497693692e-06, + "loss": 1.3359, + "step": 2212 + }, + { + "epoch": 0.3133007715721668, + "grad_norm": 9.2300348038601, + "learning_rate": 4.7926209797287216e-06, + "loss": 1.3559, + "step": 2213 + }, + { + "epoch": 0.31344234444680397, + "grad_norm": 7.532482505395667, + "learning_rate": 4.792392341242375e-06, + "loss": 1.3341, + "step": 2214 + }, + { + "epoch": 0.3135839173214412, + "grad_norm": 12.45021212600098, + "learning_rate": 4.792163582246674e-06, + "loss": 1.4448, + "step": 2215 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 15.270172784873221, + "learning_rate": 4.791934702753641e-06, + "loss": 1.4395, + "step": 2216 + }, + { + "epoch": 0.31386706307071566, + "grad_norm": 8.514443187775203, + "learning_rate": 4.79170570277531e-06, + "loss": 1.3032, + "step": 2217 + }, + { + "epoch": 0.3140086359453529, + "grad_norm": 8.84213101830233, + "learning_rate": 4.791476582323719e-06, + "loss": 1.4588, + "step": 2218 + }, + { + "epoch": 0.31415020881999006, + "grad_norm": 10.468526262767735, + "learning_rate": 4.791247341410913e-06, + "loss": 1.4874, + "step": 2219 + }, + { + "epoch": 0.3142917816946273, + "grad_norm": 8.343746342461746, + "learning_rate": 4.791017980048942e-06, + "loss": 1.3926, + "step": 2220 + }, + { + "epoch": 0.3144333545692645, + "grad_norm": 9.74692506474598, + "learning_rate": 4.790788498249864e-06, + "loss": 1.4704, + "step": 2221 + }, + { + "epoch": 0.31457492744390175, + "grad_norm": 11.245977403119074, + "learning_rate": 4.790558896025743e-06, + "loss": 1.546, + "step": 2222 + }, + { + "epoch": 0.314716500318539, + "grad_norm": 9.823785402967365, + "learning_rate": 4.79032917338865e-06, + "loss": 1.4752, + "step": 2223 + }, + { + "epoch": 0.3148580731931762, + "grad_norm": 9.742700827870408, + "learning_rate": 4.790099330350658e-06, + "loss": 1.3732, + "step": 2224 + }, + { + "epoch": 0.3149996460678134, + "grad_norm": 
9.50971768927896, + "learning_rate": 4.789869366923853e-06, + "loss": 1.3883, + "step": 2225 + }, + { + "epoch": 0.3151412189424506, + "grad_norm": 12.088388442296017, + "learning_rate": 4.789639283120323e-06, + "loss": 1.3612, + "step": 2226 + }, + { + "epoch": 0.31528279181708785, + "grad_norm": 9.25986223023589, + "learning_rate": 4.789409078952162e-06, + "loss": 1.2207, + "step": 2227 + }, + { + "epoch": 0.3154243646917251, + "grad_norm": 10.62953789238573, + "learning_rate": 4.789178754431474e-06, + "loss": 1.24, + "step": 2228 + }, + { + "epoch": 0.3155659375663623, + "grad_norm": 9.507486304875876, + "learning_rate": 4.788948309570365e-06, + "loss": 1.3571, + "step": 2229 + }, + { + "epoch": 0.3157075104409995, + "grad_norm": 10.187523215799851, + "learning_rate": 4.78871774438095e-06, + "loss": 1.3901, + "step": 2230 + }, + { + "epoch": 0.3158490833156367, + "grad_norm": 9.175083619446099, + "learning_rate": 4.78848705887535e-06, + "loss": 1.2577, + "step": 2231 + }, + { + "epoch": 0.31599065619027394, + "grad_norm": 10.80137811980996, + "learning_rate": 4.788256253065692e-06, + "loss": 1.3968, + "step": 2232 + }, + { + "epoch": 0.3161322290649112, + "grad_norm": 10.744839871412914, + "learning_rate": 4.7880253269641085e-06, + "loss": 1.3011, + "step": 2233 + }, + { + "epoch": 0.3162738019395484, + "grad_norm": 10.274067233551241, + "learning_rate": 4.787794280582739e-06, + "loss": 1.4676, + "step": 2234 + }, + { + "epoch": 0.3164153748141856, + "grad_norm": 9.015605959549868, + "learning_rate": 4.787563113933731e-06, + "loss": 1.2941, + "step": 2235 + }, + { + "epoch": 0.3165569476888228, + "grad_norm": 8.817414773259303, + "learning_rate": 4.787331827029236e-06, + "loss": 1.2263, + "step": 2236 + }, + { + "epoch": 0.31669852056346004, + "grad_norm": 8.401299021957715, + "learning_rate": 4.787100419881412e-06, + "loss": 1.2617, + "step": 2237 + }, + { + "epoch": 0.31684009343809727, + "grad_norm": 10.51100744295333, + "learning_rate": 4.7868688925024245e-06, + "loss": 1.2427, + "step": 2238 + }, + { + "epoch": 0.3169816663127345, + "grad_norm": 12.607021712520405, + "learning_rate": 4.786637244904444e-06, + "loss": 1.2824, + "step": 2239 + }, + { + "epoch": 0.3171232391873717, + "grad_norm": 8.439413574239076, + "learning_rate": 4.786405477099648e-06, + "loss": 1.2391, + "step": 2240 + }, + { + "epoch": 0.3172648120620089, + "grad_norm": 10.710677151399155, + "learning_rate": 4.786173589100222e-06, + "loss": 1.4805, + "step": 2241 + }, + { + "epoch": 0.31740638493664614, + "grad_norm": 10.869154839107956, + "learning_rate": 4.785941580918354e-06, + "loss": 1.1829, + "step": 2242 + }, + { + "epoch": 0.31754795781128337, + "grad_norm": 8.519773931951628, + "learning_rate": 4.785709452566243e-06, + "loss": 1.4121, + "step": 2243 + }, + { + "epoch": 0.3176895306859206, + "grad_norm": 8.634256012656206, + "learning_rate": 4.785477204056089e-06, + "loss": 1.3843, + "step": 2244 + }, + { + "epoch": 0.31783110356055777, + "grad_norm": 9.937636656391353, + "learning_rate": 4.785244835400103e-06, + "loss": 1.3349, + "step": 2245 + }, + { + "epoch": 0.317972676435195, + "grad_norm": 11.30760263899498, + "learning_rate": 4.7850123466105e-06, + "loss": 1.3647, + "step": 2246 + }, + { + "epoch": 0.31811424930983223, + "grad_norm": 8.575640097880678, + "learning_rate": 4.784779737699502e-06, + "loss": 1.2162, + "step": 2247 + }, + { + "epoch": 0.31825582218446946, + "grad_norm": 10.839489014465883, + "learning_rate": 4.7845470086793365e-06, + "loss": 1.433, + "step": 2248 + }, + { + "epoch": 
0.3183973950591067, + "grad_norm": 7.58261082599368, + "learning_rate": 4.784314159562238e-06, + "loss": 1.2934, + "step": 2249 + }, + { + "epoch": 0.3185389679337439, + "grad_norm": 9.281120119566825, + "learning_rate": 4.7840811903604475e-06, + "loss": 1.5035, + "step": 2250 + }, + { + "epoch": 0.3186805408083811, + "grad_norm": 8.427742296128955, + "learning_rate": 4.783848101086212e-06, + "loss": 1.3066, + "step": 2251 + }, + { + "epoch": 0.3188221136830183, + "grad_norm": 9.039179821786153, + "learning_rate": 4.783614891751785e-06, + "loss": 1.2166, + "step": 2252 + }, + { + "epoch": 0.31896368655765556, + "grad_norm": 8.460719731875514, + "learning_rate": 4.783381562369425e-06, + "loss": 1.4452, + "step": 2253 + }, + { + "epoch": 0.3191052594322928, + "grad_norm": 8.689664941436671, + "learning_rate": 4.7831481129514e-06, + "loss": 1.4565, + "step": 2254 + }, + { + "epoch": 0.31924683230693, + "grad_norm": 10.443022038162649, + "learning_rate": 4.78291454350998e-06, + "loss": 1.4063, + "step": 2255 + }, + { + "epoch": 0.3193884051815672, + "grad_norm": 11.192383451555802, + "learning_rate": 4.782680854057445e-06, + "loss": 1.4301, + "step": 2256 + }, + { + "epoch": 0.3195299780562044, + "grad_norm": 9.652357977528972, + "learning_rate": 4.78244704460608e-06, + "loss": 1.485, + "step": 2257 + }, + { + "epoch": 0.31967155093084165, + "grad_norm": 10.762238389223887, + "learning_rate": 4.782213115168176e-06, + "loss": 1.4877, + "step": 2258 + }, + { + "epoch": 0.3198131238054789, + "grad_norm": 10.95125425636706, + "learning_rate": 4.781979065756029e-06, + "loss": 1.3698, + "step": 2259 + }, + { + "epoch": 0.3199546966801161, + "grad_norm": 12.830967540577124, + "learning_rate": 4.781744896381945e-06, + "loss": 1.4194, + "step": 2260 + }, + { + "epoch": 0.3200962695547533, + "grad_norm": 8.694504363558034, + "learning_rate": 4.781510607058233e-06, + "loss": 1.3703, + "step": 2261 + }, + { + "epoch": 0.3202378424293905, + "grad_norm": 9.848073827173959, + "learning_rate": 4.781276197797209e-06, + "loss": 1.447, + "step": 2262 + }, + { + "epoch": 0.32037941530402775, + "grad_norm": 10.886910066268372, + "learning_rate": 4.781041668611197e-06, + "loss": 1.3141, + "step": 2263 + }, + { + "epoch": 0.320520988178665, + "grad_norm": 9.080760999878482, + "learning_rate": 4.780807019512525e-06, + "loss": 1.4078, + "step": 2264 + }, + { + "epoch": 0.3206625610533022, + "grad_norm": 10.550238540628365, + "learning_rate": 4.7805722505135285e-06, + "loss": 1.4502, + "step": 2265 + }, + { + "epoch": 0.3208041339279394, + "grad_norm": 10.188039192019483, + "learning_rate": 4.7803373616265495e-06, + "loss": 1.3492, + "step": 2266 + }, + { + "epoch": 0.3209457068025766, + "grad_norm": 8.190436615536745, + "learning_rate": 4.780102352863935e-06, + "loss": 1.3142, + "step": 2267 + }, + { + "epoch": 0.32108727967721384, + "grad_norm": 9.016211849265735, + "learning_rate": 4.77986722423804e-06, + "loss": 1.256, + "step": 2268 + }, + { + "epoch": 0.3212288525518511, + "grad_norm": 9.012697375883526, + "learning_rate": 4.779631975761226e-06, + "loss": 1.3936, + "step": 2269 + }, + { + "epoch": 0.3213704254264883, + "grad_norm": 11.531250661627691, + "learning_rate": 4.779396607445858e-06, + "loss": 1.4671, + "step": 2270 + }, + { + "epoch": 0.3215119983011255, + "grad_norm": 8.395392798389949, + "learning_rate": 4.779161119304311e-06, + "loss": 1.4031, + "step": 2271 + }, + { + "epoch": 0.3216535711757627, + "grad_norm": 10.577285440279857, + "learning_rate": 4.7789255113489615e-06, + "loss": 1.2265, + 
"step": 2272 + }, + { + "epoch": 0.32179514405039994, + "grad_norm": 8.285497878682744, + "learning_rate": 4.778689783592198e-06, + "loss": 1.3038, + "step": 2273 + }, + { + "epoch": 0.32193671692503717, + "grad_norm": 7.762175317629787, + "learning_rate": 4.778453936046412e-06, + "loss": 1.2744, + "step": 2274 + }, + { + "epoch": 0.3220782897996744, + "grad_norm": 9.52923782871387, + "learning_rate": 4.778217968724002e-06, + "loss": 1.3339, + "step": 2275 + }, + { + "epoch": 0.3222198626743116, + "grad_norm": 9.323837035939329, + "learning_rate": 4.777981881637372e-06, + "loss": 1.4908, + "step": 2276 + }, + { + "epoch": 0.3223614355489488, + "grad_norm": 11.126942325725407, + "learning_rate": 4.777745674798931e-06, + "loss": 1.2494, + "step": 2277 + }, + { + "epoch": 0.32250300842358604, + "grad_norm": 11.58282301256605, + "learning_rate": 4.7775093482211e-06, + "loss": 1.4775, + "step": 2278 + }, + { + "epoch": 0.32264458129822327, + "grad_norm": 8.630166538085906, + "learning_rate": 4.7772729019163e-06, + "loss": 1.2937, + "step": 2279 + }, + { + "epoch": 0.3227861541728605, + "grad_norm": 9.8891087434977, + "learning_rate": 4.777036335896962e-06, + "loss": 1.5527, + "step": 2280 + }, + { + "epoch": 0.3229277270474977, + "grad_norm": 9.82861753845805, + "learning_rate": 4.776799650175521e-06, + "loss": 1.4105, + "step": 2281 + }, + { + "epoch": 0.3230692999221349, + "grad_norm": 9.576287994209807, + "learning_rate": 4.7765628447644214e-06, + "loss": 1.3974, + "step": 2282 + }, + { + "epoch": 0.32321087279677213, + "grad_norm": 8.882462493791532, + "learning_rate": 4.776325919676109e-06, + "loss": 1.3288, + "step": 2283 + }, + { + "epoch": 0.32335244567140936, + "grad_norm": 11.379957523099067, + "learning_rate": 4.7760888749230414e-06, + "loss": 1.3705, + "step": 2284 + }, + { + "epoch": 0.3234940185460466, + "grad_norm": 10.140843934221756, + "learning_rate": 4.775851710517678e-06, + "loss": 1.3088, + "step": 2285 + }, + { + "epoch": 0.3236355914206838, + "grad_norm": 7.171915424041849, + "learning_rate": 4.775614426472488e-06, + "loss": 1.3219, + "step": 2286 + }, + { + "epoch": 0.323777164295321, + "grad_norm": 9.703786240804492, + "learning_rate": 4.775377022799944e-06, + "loss": 1.371, + "step": 2287 + }, + { + "epoch": 0.3239187371699582, + "grad_norm": 9.111841617028283, + "learning_rate": 4.7751394995125266e-06, + "loss": 1.1909, + "step": 2288 + }, + { + "epoch": 0.32406031004459546, + "grad_norm": 9.973538674605301, + "learning_rate": 4.7749018566227214e-06, + "loss": 1.4851, + "step": 2289 + }, + { + "epoch": 0.3242018829192327, + "grad_norm": 8.377841752019132, + "learning_rate": 4.774664094143022e-06, + "loss": 1.1724, + "step": 2290 + }, + { + "epoch": 0.3243434557938699, + "grad_norm": 10.320210851245248, + "learning_rate": 4.774426212085928e-06, + "loss": 1.3609, + "step": 2291 + }, + { + "epoch": 0.3244850286685071, + "grad_norm": 11.170033037620051, + "learning_rate": 4.774188210463944e-06, + "loss": 1.4117, + "step": 2292 + }, + { + "epoch": 0.3246266015431443, + "grad_norm": 8.646501360597163, + "learning_rate": 4.77395008928958e-06, + "loss": 1.2985, + "step": 2293 + }, + { + "epoch": 0.32476817441778155, + "grad_norm": 10.119684401706246, + "learning_rate": 4.773711848575357e-06, + "loss": 1.3323, + "step": 2294 + }, + { + "epoch": 0.3249097472924188, + "grad_norm": 10.590571734130396, + "learning_rate": 4.773473488333797e-06, + "loss": 1.4364, + "step": 2295 + }, + { + "epoch": 0.325051320167056, + "grad_norm": 9.923796027735438, + "learning_rate": 
4.77323500857743e-06, + "loss": 1.3196, + "step": 2296 + }, + { + "epoch": 0.3251928930416932, + "grad_norm": 8.945486488482613, + "learning_rate": 4.772996409318794e-06, + "loss": 1.3218, + "step": 2297 + }, + { + "epoch": 0.3253344659163304, + "grad_norm": 8.596988026338062, + "learning_rate": 4.772757690570432e-06, + "loss": 1.4391, + "step": 2298 + }, + { + "epoch": 0.32547603879096765, + "grad_norm": 8.844135181679235, + "learning_rate": 4.772518852344893e-06, + "loss": 1.3219, + "step": 2299 + }, + { + "epoch": 0.3256176116656049, + "grad_norm": 11.80762651616795, + "learning_rate": 4.772279894654732e-06, + "loss": 1.4989, + "step": 2300 + }, + { + "epoch": 0.3257591845402421, + "grad_norm": 8.014139554580577, + "learning_rate": 4.772040817512511e-06, + "loss": 1.2117, + "step": 2301 + }, + { + "epoch": 0.3259007574148793, + "grad_norm": 8.646793419412093, + "learning_rate": 4.7718016209307996e-06, + "loss": 1.4292, + "step": 2302 + }, + { + "epoch": 0.3260423302895165, + "grad_norm": 10.227015334464264, + "learning_rate": 4.77156230492217e-06, + "loss": 1.4192, + "step": 2303 + }, + { + "epoch": 0.32618390316415374, + "grad_norm": 9.686631292955465, + "learning_rate": 4.771322869499203e-06, + "loss": 1.2747, + "step": 2304 + }, + { + "epoch": 0.326325476038791, + "grad_norm": 9.322450378236802, + "learning_rate": 4.7710833146744874e-06, + "loss": 1.3684, + "step": 2305 + }, + { + "epoch": 0.3264670489134282, + "grad_norm": 10.317098066423863, + "learning_rate": 4.770843640460615e-06, + "loss": 1.4558, + "step": 2306 + }, + { + "epoch": 0.32660862178806543, + "grad_norm": 10.763358399366023, + "learning_rate": 4.770603846870185e-06, + "loss": 1.4163, + "step": 2307 + }, + { + "epoch": 0.3267501946627026, + "grad_norm": 10.909116701615794, + "learning_rate": 4.770363933915805e-06, + "loss": 1.3577, + "step": 2308 + }, + { + "epoch": 0.32689176753733984, + "grad_norm": 8.153054031408262, + "learning_rate": 4.770123901610085e-06, + "loss": 1.257, + "step": 2309 + }, + { + "epoch": 0.32703334041197707, + "grad_norm": 10.641424073451596, + "learning_rate": 4.769883749965645e-06, + "loss": 1.315, + "step": 2310 + }, + { + "epoch": 0.3271749132866143, + "grad_norm": 9.483400800718607, + "learning_rate": 4.7696434789951074e-06, + "loss": 1.2914, + "step": 2311 + }, + { + "epoch": 0.32731648616125153, + "grad_norm": 10.545548238308163, + "learning_rate": 4.769403088711105e-06, + "loss": 1.3721, + "step": 2312 + }, + { + "epoch": 0.3274580590358887, + "grad_norm": 10.520092948822052, + "learning_rate": 4.7691625791262756e-06, + "loss": 1.4145, + "step": 2313 + }, + { + "epoch": 0.32759963191052593, + "grad_norm": 13.6578024673556, + "learning_rate": 4.76892195025326e-06, + "loss": 1.453, + "step": 2314 + }, + { + "epoch": 0.32774120478516316, + "grad_norm": 9.464754275010426, + "learning_rate": 4.768681202104709e-06, + "loss": 1.3407, + "step": 2315 + }, + { + "epoch": 0.3278827776598004, + "grad_norm": 9.949224117833259, + "learning_rate": 4.7684403346932795e-06, + "loss": 1.3194, + "step": 2316 + }, + { + "epoch": 0.3280243505344376, + "grad_norm": 8.540068487745906, + "learning_rate": 4.768199348031633e-06, + "loss": 1.3057, + "step": 2317 + }, + { + "epoch": 0.3281659234090748, + "grad_norm": 11.972481009117194, + "learning_rate": 4.7679582421324385e-06, + "loss": 1.5252, + "step": 2318 + }, + { + "epoch": 0.32830749628371203, + "grad_norm": 9.23813879052422, + "learning_rate": 4.76771701700837e-06, + "loss": 1.215, + "step": 2319 + }, + { + "epoch": 0.32844906915834926, + "grad_norm": 
10.913021575942759, + "learning_rate": 4.767475672672108e-06, + "loss": 1.3623, + "step": 2320 + }, + { + "epoch": 0.3285906420329865, + "grad_norm": 9.77781192696513, + "learning_rate": 4.767234209136341e-06, + "loss": 1.4048, + "step": 2321 + }, + { + "epoch": 0.3287322149076237, + "grad_norm": 10.640822533385833, + "learning_rate": 4.7669926264137625e-06, + "loss": 1.3395, + "step": 2322 + }, + { + "epoch": 0.3288737877822609, + "grad_norm": 8.838765279578391, + "learning_rate": 4.766750924517071e-06, + "loss": 1.3281, + "step": 2323 + }, + { + "epoch": 0.3290153606568981, + "grad_norm": 10.264661769878245, + "learning_rate": 4.766509103458975e-06, + "loss": 1.3471, + "step": 2324 + }, + { + "epoch": 0.32915693353153536, + "grad_norm": 10.652897729796523, + "learning_rate": 4.766267163252185e-06, + "loss": 1.5196, + "step": 2325 + }, + { + "epoch": 0.3292985064061726, + "grad_norm": 9.872582634671536, + "learning_rate": 4.766025103909419e-06, + "loss": 1.2749, + "step": 2326 + }, + { + "epoch": 0.3294400792808098, + "grad_norm": 9.250830123356033, + "learning_rate": 4.765782925443404e-06, + "loss": 1.279, + "step": 2327 + }, + { + "epoch": 0.329581652155447, + "grad_norm": 8.12116984542271, + "learning_rate": 4.76554062786687e-06, + "loss": 1.3079, + "step": 2328 + }, + { + "epoch": 0.3297232250300842, + "grad_norm": 9.87970438094102, + "learning_rate": 4.765298211192554e-06, + "loss": 1.3782, + "step": 2329 + }, + { + "epoch": 0.32986479790472145, + "grad_norm": 9.47212627383424, + "learning_rate": 4.7650556754332e-06, + "loss": 1.2616, + "step": 2330 + }, + { + "epoch": 0.3300063707793587, + "grad_norm": 8.945253650188514, + "learning_rate": 4.7648130206015585e-06, + "loss": 1.2611, + "step": 2331 + }, + { + "epoch": 0.3301479436539959, + "grad_norm": 11.157412513836093, + "learning_rate": 4.764570246710385e-06, + "loss": 1.3572, + "step": 2332 + }, + { + "epoch": 0.3302895165286331, + "grad_norm": 10.66238432594512, + "learning_rate": 4.764327353772442e-06, + "loss": 1.603, + "step": 2333 + }, + { + "epoch": 0.3304310894032703, + "grad_norm": 8.616191401800505, + "learning_rate": 4.764084341800499e-06, + "loss": 1.419, + "step": 2334 + }, + { + "epoch": 0.33057266227790755, + "grad_norm": 6.801129572678017, + "learning_rate": 4.763841210807329e-06, + "loss": 1.1657, + "step": 2335 + }, + { + "epoch": 0.3307142351525448, + "grad_norm": 10.550619633272545, + "learning_rate": 4.763597960805716e-06, + "loss": 1.3385, + "step": 2336 + }, + { + "epoch": 0.330855808027182, + "grad_norm": 9.40182725718388, + "learning_rate": 4.763354591808446e-06, + "loss": 1.4718, + "step": 2337 + }, + { + "epoch": 0.33099738090181924, + "grad_norm": 10.659467373651529, + "learning_rate": 4.763111103828312e-06, + "loss": 1.3803, + "step": 2338 + }, + { + "epoch": 0.3311389537764564, + "grad_norm": 9.795682259555777, + "learning_rate": 4.762867496878114e-06, + "loss": 1.2628, + "step": 2339 + }, + { + "epoch": 0.33128052665109364, + "grad_norm": 8.87586089780851, + "learning_rate": 4.76262377097066e-06, + "loss": 1.6375, + "step": 2340 + }, + { + "epoch": 0.3314220995257309, + "grad_norm": 8.792083682133196, + "learning_rate": 4.762379926118761e-06, + "loss": 1.4179, + "step": 2341 + }, + { + "epoch": 0.3315636724003681, + "grad_norm": 9.767461546057447, + "learning_rate": 4.762135962335237e-06, + "loss": 1.2659, + "step": 2342 + }, + { + "epoch": 0.33170524527500533, + "grad_norm": 8.979724934534595, + "learning_rate": 4.7618918796329115e-06, + "loss": 1.3012, + "step": 2343 + }, + { + "epoch": 
0.3318468181496425, + "grad_norm": 8.851819350311374, + "learning_rate": 4.761647678024617e-06, + "loss": 1.3692, + "step": 2344 + }, + { + "epoch": 0.33198839102427974, + "grad_norm": 10.004613956322306, + "learning_rate": 4.76140335752319e-06, + "loss": 1.3045, + "step": 2345 + }, + { + "epoch": 0.33212996389891697, + "grad_norm": 9.74926989829519, + "learning_rate": 4.7611589181414745e-06, + "loss": 1.3116, + "step": 2346 + }, + { + "epoch": 0.3322715367735542, + "grad_norm": 8.33745666218122, + "learning_rate": 4.76091435989232e-06, + "loss": 1.2299, + "step": 2347 + }, + { + "epoch": 0.33241310964819143, + "grad_norm": 8.930186874296824, + "learning_rate": 4.760669682788584e-06, + "loss": 1.446, + "step": 2348 + }, + { + "epoch": 0.3325546825228286, + "grad_norm": 8.476120110143675, + "learning_rate": 4.760424886843129e-06, + "loss": 1.3183, + "step": 2349 + }, + { + "epoch": 0.33269625539746583, + "grad_norm": 11.249620219284072, + "learning_rate": 4.7601799720688235e-06, + "loss": 1.4387, + "step": 2350 + }, + { + "epoch": 0.33283782827210306, + "grad_norm": 9.76422021146034, + "learning_rate": 4.759934938478541e-06, + "loss": 1.3479, + "step": 2351 + }, + { + "epoch": 0.3329794011467403, + "grad_norm": 10.096779383469613, + "learning_rate": 4.7596897860851644e-06, + "loss": 1.3358, + "step": 2352 + }, + { + "epoch": 0.3331209740213775, + "grad_norm": 8.040990718703837, + "learning_rate": 4.75944451490158e-06, + "loss": 1.311, + "step": 2353 + }, + { + "epoch": 0.3332625468960147, + "grad_norm": 9.257857825574698, + "learning_rate": 4.759199124940683e-06, + "loss": 1.3211, + "step": 2354 + }, + { + "epoch": 0.33340411977065193, + "grad_norm": 8.645797643626135, + "learning_rate": 4.7589536162153725e-06, + "loss": 1.2343, + "step": 2355 + }, + { + "epoch": 0.33354569264528916, + "grad_norm": 10.122427908013456, + "learning_rate": 4.758707988738555e-06, + "loss": 1.3853, + "step": 2356 + }, + { + "epoch": 0.3336872655199264, + "grad_norm": 8.854674799680984, + "learning_rate": 4.758462242523141e-06, + "loss": 1.3234, + "step": 2357 + }, + { + "epoch": 0.3338288383945636, + "grad_norm": 8.896651385987262, + "learning_rate": 4.758216377582052e-06, + "loss": 1.238, + "step": 2358 + }, + { + "epoch": 0.3339704112692008, + "grad_norm": 8.67421711403997, + "learning_rate": 4.757970393928212e-06, + "loss": 1.1752, + "step": 2359 + }, + { + "epoch": 0.334111984143838, + "grad_norm": 9.84112145584989, + "learning_rate": 4.757724291574552e-06, + "loss": 1.3281, + "step": 2360 + }, + { + "epoch": 0.33425355701847526, + "grad_norm": 6.92637278036478, + "learning_rate": 4.7574780705340094e-06, + "loss": 1.2771, + "step": 2361 + }, + { + "epoch": 0.3343951298931125, + "grad_norm": 8.924763173612963, + "learning_rate": 4.757231730819528e-06, + "loss": 1.3431, + "step": 2362 + }, + { + "epoch": 0.3345367027677497, + "grad_norm": 9.724217682975759, + "learning_rate": 4.7569852724440565e-06, + "loss": 1.3765, + "step": 2363 + }, + { + "epoch": 0.3346782756423869, + "grad_norm": 12.263390520990209, + "learning_rate": 4.7567386954205535e-06, + "loss": 1.526, + "step": 2364 + }, + { + "epoch": 0.3348198485170241, + "grad_norm": 11.672985509210665, + "learning_rate": 4.756491999761979e-06, + "loss": 1.2542, + "step": 2365 + }, + { + "epoch": 0.33496142139166135, + "grad_norm": 10.361603694203067, + "learning_rate": 4.756245185481304e-06, + "loss": 1.3895, + "step": 2366 + }, + { + "epoch": 0.3351029942662986, + "grad_norm": 10.492933892964775, + "learning_rate": 4.755998252591501e-06, + "loss": 1.3694, + 
"step": 2367 + }, + { + "epoch": 0.3352445671409358, + "grad_norm": 9.905220242958933, + "learning_rate": 4.755751201105552e-06, + "loss": 1.4377, + "step": 2368 + }, + { + "epoch": 0.33538614001557304, + "grad_norm": 11.117811645229855, + "learning_rate": 4.755504031036444e-06, + "loss": 1.3639, + "step": 2369 + }, + { + "epoch": 0.3355277128902102, + "grad_norm": 9.824765066196598, + "learning_rate": 4.75525674239717e-06, + "loss": 1.3643, + "step": 2370 + }, + { + "epoch": 0.33566928576484745, + "grad_norm": 10.139854554707627, + "learning_rate": 4.755009335200732e-06, + "loss": 1.3937, + "step": 2371 + }, + { + "epoch": 0.3358108586394847, + "grad_norm": 11.270852662996557, + "learning_rate": 4.754761809460135e-06, + "loss": 1.5049, + "step": 2372 + }, + { + "epoch": 0.3359524315141219, + "grad_norm": 10.269748550501484, + "learning_rate": 4.75451416518839e-06, + "loss": 1.3755, + "step": 2373 + }, + { + "epoch": 0.33609400438875914, + "grad_norm": 10.037125147450634, + "learning_rate": 4.754266402398517e-06, + "loss": 1.3799, + "step": 2374 + }, + { + "epoch": 0.3362355772633963, + "grad_norm": 9.123615316600787, + "learning_rate": 4.754018521103539e-06, + "loss": 1.2609, + "step": 2375 + }, + { + "epoch": 0.33637715013803354, + "grad_norm": 9.59543617167594, + "learning_rate": 4.75377052131649e-06, + "loss": 1.3512, + "step": 2376 + }, + { + "epoch": 0.3365187230126708, + "grad_norm": 9.356217757725632, + "learning_rate": 4.753522403050403e-06, + "loss": 1.2956, + "step": 2377 + }, + { + "epoch": 0.336660295887308, + "grad_norm": 11.232466194650437, + "learning_rate": 4.7532741663183255e-06, + "loss": 1.328, + "step": 2378 + }, + { + "epoch": 0.33680186876194523, + "grad_norm": 10.207759628264077, + "learning_rate": 4.753025811133304e-06, + "loss": 1.1741, + "step": 2379 + }, + { + "epoch": 0.3369434416365824, + "grad_norm": 12.413959865253826, + "learning_rate": 4.752777337508395e-06, + "loss": 1.2947, + "step": 2380 + }, + { + "epoch": 0.33708501451121964, + "grad_norm": 9.485770561195777, + "learning_rate": 4.752528745456663e-06, + "loss": 1.3986, + "step": 2381 + }, + { + "epoch": 0.33722658738585687, + "grad_norm": 9.665196887999182, + "learning_rate": 4.752280034991172e-06, + "loss": 1.2756, + "step": 2382 + }, + { + "epoch": 0.3373681602604941, + "grad_norm": 12.751096192153932, + "learning_rate": 4.752031206125e-06, + "loss": 1.3544, + "step": 2383 + }, + { + "epoch": 0.33750973313513133, + "grad_norm": 11.549174780491962, + "learning_rate": 4.751782258871227e-06, + "loss": 1.3731, + "step": 2384 + }, + { + "epoch": 0.3376513060097685, + "grad_norm": 12.603433752688131, + "learning_rate": 4.751533193242941e-06, + "loss": 1.5942, + "step": 2385 + }, + { + "epoch": 0.33779287888440573, + "grad_norm": 9.164659823750066, + "learning_rate": 4.751284009253232e-06, + "loss": 1.3042, + "step": 2386 + }, + { + "epoch": 0.33793445175904296, + "grad_norm": 10.708119985233825, + "learning_rate": 4.7510347069152015e-06, + "loss": 1.4192, + "step": 2387 + }, + { + "epoch": 0.3380760246336802, + "grad_norm": 9.668203516720293, + "learning_rate": 4.750785286241955e-06, + "loss": 1.4275, + "step": 2388 + }, + { + "epoch": 0.3382175975083174, + "grad_norm": 8.267988072682467, + "learning_rate": 4.750535747246604e-06, + "loss": 1.3317, + "step": 2389 + }, + { + "epoch": 0.3383591703829546, + "grad_norm": 8.24473264909652, + "learning_rate": 4.750286089942267e-06, + "loss": 1.2047, + "step": 2390 + }, + { + "epoch": 0.33850074325759183, + "grad_norm": 8.861423047141479, + "learning_rate": 
4.750036314342069e-06, + "loss": 1.3129, + "step": 2391 + }, + { + "epoch": 0.33864231613222906, + "grad_norm": 8.67741764030618, + "learning_rate": 4.7497864204591386e-06, + "loss": 1.3807, + "step": 2392 + }, + { + "epoch": 0.3387838890068663, + "grad_norm": 9.661263852419838, + "learning_rate": 4.749536408306614e-06, + "loss": 1.3761, + "step": 2393 + }, + { + "epoch": 0.3389254618815035, + "grad_norm": 9.263844207955861, + "learning_rate": 4.749286277897637e-06, + "loss": 1.385, + "step": 2394 + }, + { + "epoch": 0.33906703475614075, + "grad_norm": 10.32197939675294, + "learning_rate": 4.749036029245358e-06, + "loss": 1.2286, + "step": 2395 + }, + { + "epoch": 0.3392086076307779, + "grad_norm": 8.958861106288976, + "learning_rate": 4.7487856623629325e-06, + "loss": 1.3419, + "step": 2396 + }, + { + "epoch": 0.33935018050541516, + "grad_norm": 9.740873320042171, + "learning_rate": 4.748535177263522e-06, + "loss": 1.3405, + "step": 2397 + }, + { + "epoch": 0.3394917533800524, + "grad_norm": 8.535847637186006, + "learning_rate": 4.748284573960292e-06, + "loss": 1.4036, + "step": 2398 + }, + { + "epoch": 0.3396333262546896, + "grad_norm": 8.808180595909478, + "learning_rate": 4.748033852466419e-06, + "loss": 1.3464, + "step": 2399 + }, + { + "epoch": 0.33977489912932685, + "grad_norm": 10.380372276612926, + "learning_rate": 4.747783012795083e-06, + "loss": 1.4047, + "step": 2400 + }, + { + "epoch": 0.339916472003964, + "grad_norm": 10.109497635582475, + "learning_rate": 4.747532054959469e-06, + "loss": 1.328, + "step": 2401 + }, + { + "epoch": 0.34005804487860125, + "grad_norm": 8.817781637705364, + "learning_rate": 4.747280978972772e-06, + "loss": 1.4293, + "step": 2402 + }, + { + "epoch": 0.3401996177532385, + "grad_norm": 10.975016920211367, + "learning_rate": 4.747029784848189e-06, + "loss": 1.3992, + "step": 2403 + }, + { + "epoch": 0.3403411906278757, + "grad_norm": 10.78901264640455, + "learning_rate": 4.746778472598927e-06, + "loss": 1.2605, + "step": 2404 + }, + { + "epoch": 0.34048276350251294, + "grad_norm": 8.43916677330391, + "learning_rate": 4.746527042238194e-06, + "loss": 1.2425, + "step": 2405 + }, + { + "epoch": 0.3406243363771501, + "grad_norm": 10.790810765872196, + "learning_rate": 4.74627549377921e-06, + "loss": 1.4551, + "step": 2406 + }, + { + "epoch": 0.34076590925178735, + "grad_norm": 9.619568716679655, + "learning_rate": 4.746023827235198e-06, + "loss": 1.2343, + "step": 2407 + }, + { + "epoch": 0.3409074821264246, + "grad_norm": 9.11386976373442, + "learning_rate": 4.745772042619389e-06, + "loss": 1.2838, + "step": 2408 + }, + { + "epoch": 0.3410490550010618, + "grad_norm": 10.973562896416166, + "learning_rate": 4.745520139945018e-06, + "loss": 1.3601, + "step": 2409 + }, + { + "epoch": 0.34119062787569904, + "grad_norm": 7.775809105480474, + "learning_rate": 4.745268119225327e-06, + "loss": 1.2794, + "step": 2410 + }, + { + "epoch": 0.3413322007503362, + "grad_norm": 8.633637015206949, + "learning_rate": 4.745015980473565e-06, + "loss": 1.4651, + "step": 2411 + }, + { + "epoch": 0.34147377362497344, + "grad_norm": 9.306035812915697, + "learning_rate": 4.744763723702988e-06, + "loss": 1.4775, + "step": 2412 + }, + { + "epoch": 0.3416153464996107, + "grad_norm": 10.251613955084439, + "learning_rate": 4.744511348926855e-06, + "loss": 1.3409, + "step": 2413 + }, + { + "epoch": 0.3417569193742479, + "grad_norm": 10.238900478460199, + "learning_rate": 4.7442588561584336e-06, + "loss": 1.182, + "step": 2414 + }, + { + "epoch": 0.34189849224888513, + "grad_norm": 
11.61571566731328, + "learning_rate": 4.744006245410998e-06, + "loss": 1.3698, + "step": 2415 + }, + { + "epoch": 0.3420400651235223, + "grad_norm": 8.391679239374142, + "learning_rate": 4.743753516697827e-06, + "loss": 1.2354, + "step": 2416 + }, + { + "epoch": 0.34218163799815954, + "grad_norm": 11.567051337127461, + "learning_rate": 4.743500670032207e-06, + "loss": 1.3622, + "step": 2417 + }, + { + "epoch": 0.34232321087279677, + "grad_norm": 9.015487484734386, + "learning_rate": 4.743247705427429e-06, + "loss": 1.3385, + "step": 2418 + }, + { + "epoch": 0.342464783747434, + "grad_norm": 11.7226816321368, + "learning_rate": 4.742994622896793e-06, + "loss": 1.4042, + "step": 2419 + }, + { + "epoch": 0.34260635662207123, + "grad_norm": 9.418489217913308, + "learning_rate": 4.7427414224536014e-06, + "loss": 1.2313, + "step": 2420 + }, + { + "epoch": 0.3427479294967084, + "grad_norm": 12.917606114504816, + "learning_rate": 4.742488104111165e-06, + "loss": 1.5169, + "step": 2421 + }, + { + "epoch": 0.34288950237134563, + "grad_norm": 9.235900856117038, + "learning_rate": 4.742234667882802e-06, + "loss": 1.2721, + "step": 2422 + }, + { + "epoch": 0.34303107524598286, + "grad_norm": 9.727391170146843, + "learning_rate": 4.7419811137818335e-06, + "loss": 1.4337, + "step": 2423 + }, + { + "epoch": 0.3431726481206201, + "grad_norm": 9.8200840889308, + "learning_rate": 4.7417274418215895e-06, + "loss": 1.3806, + "step": 2424 + }, + { + "epoch": 0.3433142209952573, + "grad_norm": 8.212820720942101, + "learning_rate": 4.741473652015407e-06, + "loss": 1.3629, + "step": 2425 + }, + { + "epoch": 0.34345579386989455, + "grad_norm": 11.93597050799697, + "learning_rate": 4.741219744376624e-06, + "loss": 1.3855, + "step": 2426 + }, + { + "epoch": 0.34359736674453173, + "grad_norm": 11.034623189558712, + "learning_rate": 4.740965718918591e-06, + "loss": 1.4437, + "step": 2427 + }, + { + "epoch": 0.34373893961916896, + "grad_norm": 12.193340447536567, + "learning_rate": 4.74071157565466e-06, + "loss": 1.4606, + "step": 2428 + }, + { + "epoch": 0.3438805124938062, + "grad_norm": 9.871309387484393, + "learning_rate": 4.740457314598194e-06, + "loss": 1.362, + "step": 2429 + }, + { + "epoch": 0.3440220853684434, + "grad_norm": 10.137094691366865, + "learning_rate": 4.740202935762557e-06, + "loss": 1.3682, + "step": 2430 + }, + { + "epoch": 0.34416365824308065, + "grad_norm": 9.869608505817517, + "learning_rate": 4.739948439161122e-06, + "loss": 1.3692, + "step": 2431 + }, + { + "epoch": 0.3443052311177178, + "grad_norm": 8.566807463957137, + "learning_rate": 4.7396938248072675e-06, + "loss": 1.2732, + "step": 2432 + }, + { + "epoch": 0.34444680399235506, + "grad_norm": 9.217956301902774, + "learning_rate": 4.739439092714379e-06, + "loss": 1.1819, + "step": 2433 + }, + { + "epoch": 0.3445883768669923, + "grad_norm": 10.107586615505276, + "learning_rate": 4.7391842428958454e-06, + "loss": 1.2226, + "step": 2434 + }, + { + "epoch": 0.3447299497416295, + "grad_norm": 8.767509162752287, + "learning_rate": 4.738929275365068e-06, + "loss": 1.4763, + "step": 2435 + }, + { + "epoch": 0.34487152261626675, + "grad_norm": 8.334738905306446, + "learning_rate": 4.738674190135447e-06, + "loss": 1.2977, + "step": 2436 + }, + { + "epoch": 0.3450130954909039, + "grad_norm": 8.396271523020278, + "learning_rate": 4.7384189872203935e-06, + "loss": 1.2404, + "step": 2437 + }, + { + "epoch": 0.34515466836554115, + "grad_norm": 8.997820272339313, + "learning_rate": 4.738163666633322e-06, + "loss": 1.3469, + "step": 2438 + }, + { + 
"epoch": 0.3452962412401784, + "grad_norm": 9.050831827113756, + "learning_rate": 4.737908228387656e-06, + "loss": 1.3806, + "step": 2439 + }, + { + "epoch": 0.3454378141148156, + "grad_norm": 9.733095430924351, + "learning_rate": 4.737652672496823e-06, + "loss": 1.4262, + "step": 2440 + }, + { + "epoch": 0.34557938698945284, + "grad_norm": 8.937461586182746, + "learning_rate": 4.737396998974257e-06, + "loss": 1.2047, + "step": 2441 + }, + { + "epoch": 0.34572095986409, + "grad_norm": 9.778144709127428, + "learning_rate": 4.7371412078334e-06, + "loss": 1.4933, + "step": 2442 + }, + { + "epoch": 0.34586253273872725, + "grad_norm": 12.563072684979815, + "learning_rate": 4.736885299087698e-06, + "loss": 1.4023, + "step": 2443 + }, + { + "epoch": 0.3460041056133645, + "grad_norm": 11.858895715570377, + "learning_rate": 4.7366292727506025e-06, + "loss": 1.5855, + "step": 2444 + }, + { + "epoch": 0.3461456784880017, + "grad_norm": 8.26963504082835, + "learning_rate": 4.736373128835574e-06, + "loss": 1.3152, + "step": 2445 + }, + { + "epoch": 0.34628725136263894, + "grad_norm": 12.37015384612325, + "learning_rate": 4.736116867356079e-06, + "loss": 1.4039, + "step": 2446 + }, + { + "epoch": 0.3464288242372761, + "grad_norm": 12.95708895449918, + "learning_rate": 4.735860488325586e-06, + "loss": 1.4388, + "step": 2447 + }, + { + "epoch": 0.34657039711191334, + "grad_norm": 10.891559139344553, + "learning_rate": 4.735603991757576e-06, + "loss": 1.4925, + "step": 2448 + }, + { + "epoch": 0.34671196998655057, + "grad_norm": 11.73288795608477, + "learning_rate": 4.735347377665529e-06, + "loss": 1.3055, + "step": 2449 + }, + { + "epoch": 0.3468535428611878, + "grad_norm": 9.458170705957222, + "learning_rate": 4.735090646062939e-06, + "loss": 1.4278, + "step": 2450 + }, + { + "epoch": 0.34699511573582503, + "grad_norm": 11.603566746068587, + "learning_rate": 4.7348337969632985e-06, + "loss": 1.368, + "step": 2451 + }, + { + "epoch": 0.34713668861046226, + "grad_norm": 14.982937072560656, + "learning_rate": 4.734576830380113e-06, + "loss": 1.3213, + "step": 2452 + }, + { + "epoch": 0.34727826148509944, + "grad_norm": 8.606679283189969, + "learning_rate": 4.7343197463268895e-06, + "loss": 1.2988, + "step": 2453 + }, + { + "epoch": 0.34741983435973667, + "grad_norm": 7.014946919659801, + "learning_rate": 4.734062544817143e-06, + "loss": 1.2639, + "step": 2454 + }, + { + "epoch": 0.3475614072343739, + "grad_norm": 9.345754351022096, + "learning_rate": 4.733805225864393e-06, + "loss": 1.4731, + "step": 2455 + }, + { + "epoch": 0.34770298010901113, + "grad_norm": 14.54838539419839, + "learning_rate": 4.733547789482169e-06, + "loss": 1.3937, + "step": 2456 + }, + { + "epoch": 0.34784455298364836, + "grad_norm": 11.69058399004602, + "learning_rate": 4.733290235684002e-06, + "loss": 1.2455, + "step": 2457 + }, + { + "epoch": 0.34798612585828553, + "grad_norm": 9.08465393918911, + "learning_rate": 4.733032564483434e-06, + "loss": 1.4812, + "step": 2458 + }, + { + "epoch": 0.34812769873292276, + "grad_norm": 8.110299294778146, + "learning_rate": 4.732774775894009e-06, + "loss": 1.2938, + "step": 2459 + }, + { + "epoch": 0.34826927160756, + "grad_norm": 10.381884760917202, + "learning_rate": 4.732516869929278e-06, + "loss": 1.5566, + "step": 2460 + }, + { + "epoch": 0.3484108444821972, + "grad_norm": 8.859596316955647, + "learning_rate": 4.732258846602801e-06, + "loss": 1.432, + "step": 2461 + }, + { + "epoch": 0.34855241735683445, + "grad_norm": 9.5869617614461, + "learning_rate": 4.73200070592814e-06, + "loss": 
1.233, + "step": 2462 + }, + { + "epoch": 0.34869399023147163, + "grad_norm": 10.436229148749794, + "learning_rate": 4.731742447918866e-06, + "loss": 1.3385, + "step": 2463 + }, + { + "epoch": 0.34883556310610886, + "grad_norm": 9.503508472075383, + "learning_rate": 4.731484072588556e-06, + "loss": 1.3426, + "step": 2464 + }, + { + "epoch": 0.3489771359807461, + "grad_norm": 8.858808334500859, + "learning_rate": 4.731225579950791e-06, + "loss": 1.3543, + "step": 2465 + }, + { + "epoch": 0.3491187088553833, + "grad_norm": 12.200260622336298, + "learning_rate": 4.730966970019163e-06, + "loss": 1.3303, + "step": 2466 + }, + { + "epoch": 0.34926028173002055, + "grad_norm": 9.928332436675584, + "learning_rate": 4.730708242807263e-06, + "loss": 1.4258, + "step": 2467 + }, + { + "epoch": 0.3494018546046577, + "grad_norm": 10.947767920289662, + "learning_rate": 4.730449398328695e-06, + "loss": 1.4295, + "step": 2468 + }, + { + "epoch": 0.34954342747929495, + "grad_norm": 9.226009704412736, + "learning_rate": 4.7301904365970656e-06, + "loss": 1.4534, + "step": 2469 + }, + { + "epoch": 0.3496850003539322, + "grad_norm": 9.048945109391118, + "learning_rate": 4.7299313576259865e-06, + "loss": 1.1861, + "step": 2470 + }, + { + "epoch": 0.3498265732285694, + "grad_norm": 9.72319846444092, + "learning_rate": 4.72967216142908e-06, + "loss": 1.344, + "step": 2471 + }, + { + "epoch": 0.34996814610320665, + "grad_norm": 10.12867048631322, + "learning_rate": 4.729412848019969e-06, + "loss": 1.4683, + "step": 2472 + }, + { + "epoch": 0.3501097189778438, + "grad_norm": 9.249141343961933, + "learning_rate": 4.729153417412288e-06, + "loss": 1.2761, + "step": 2473 + }, + { + "epoch": 0.35025129185248105, + "grad_norm": 10.481486393009149, + "learning_rate": 4.7288938696196735e-06, + "loss": 1.2911, + "step": 2474 + }, + { + "epoch": 0.3503928647271183, + "grad_norm": 10.44193561894608, + "learning_rate": 4.728634204655771e-06, + "loss": 1.5429, + "step": 2475 + }, + { + "epoch": 0.3505344376017555, + "grad_norm": 10.907640953228649, + "learning_rate": 4.728374422534229e-06, + "loss": 1.4166, + "step": 2476 + }, + { + "epoch": 0.35067601047639274, + "grad_norm": 14.27181407964746, + "learning_rate": 4.728114523268705e-06, + "loss": 1.2956, + "step": 2477 + }, + { + "epoch": 0.3508175833510299, + "grad_norm": 10.388570711518629, + "learning_rate": 4.727854506872863e-06, + "loss": 1.3249, + "step": 2478 + }, + { + "epoch": 0.35095915622566715, + "grad_norm": 8.7308860184121, + "learning_rate": 4.72759437336037e-06, + "loss": 1.4873, + "step": 2479 + }, + { + "epoch": 0.3511007291003044, + "grad_norm": 8.70348221817957, + "learning_rate": 4.727334122744902e-06, + "loss": 1.4942, + "step": 2480 + }, + { + "epoch": 0.3512423019749416, + "grad_norm": 8.534638231562738, + "learning_rate": 4.72707375504014e-06, + "loss": 1.3337, + "step": 2481 + }, + { + "epoch": 0.35138387484957884, + "grad_norm": 10.996827448324478, + "learning_rate": 4.726813270259772e-06, + "loss": 1.2276, + "step": 2482 + }, + { + "epoch": 0.35152544772421607, + "grad_norm": 11.042791882648, + "learning_rate": 4.7265526684174894e-06, + "loss": 1.2565, + "step": 2483 + }, + { + "epoch": 0.35166702059885324, + "grad_norm": 9.603142430655552, + "learning_rate": 4.7262919495269946e-06, + "loss": 1.229, + "step": 2484 + }, + { + "epoch": 0.35180859347349047, + "grad_norm": 8.540522304884776, + "learning_rate": 4.726031113601991e-06, + "loss": 1.419, + "step": 2485 + }, + { + "epoch": 0.3519501663481277, + "grad_norm": 9.607416514724568, + "learning_rate": 
4.725770160656191e-06, + "loss": 1.2241, + "step": 2486 + }, + { + "epoch": 0.35209173922276493, + "grad_norm": 10.378372930995697, + "learning_rate": 4.725509090703314e-06, + "loss": 1.4612, + "step": 2487 + }, + { + "epoch": 0.35223331209740216, + "grad_norm": 9.008105548680692, + "learning_rate": 4.725247903757084e-06, + "loss": 1.1582, + "step": 2488 + }, + { + "epoch": 0.35237488497203934, + "grad_norm": 8.306999509132439, + "learning_rate": 4.7249865998312306e-06, + "loss": 1.3716, + "step": 2489 + }, + { + "epoch": 0.35251645784667657, + "grad_norm": 8.346260978825198, + "learning_rate": 4.72472517893949e-06, + "loss": 1.4011, + "step": 2490 + }, + { + "epoch": 0.3526580307213138, + "grad_norm": 9.520598316143964, + "learning_rate": 4.724463641095606e-06, + "loss": 1.1861, + "step": 2491 + }, + { + "epoch": 0.35279960359595103, + "grad_norm": 13.112219409831521, + "learning_rate": 4.7242019863133275e-06, + "loss": 1.3587, + "step": 2492 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 7.442681310954779, + "learning_rate": 4.723940214606408e-06, + "loss": 1.3025, + "step": 2493 + }, + { + "epoch": 0.35308274934522543, + "grad_norm": 10.590268083300435, + "learning_rate": 4.723678325988611e-06, + "loss": 1.4549, + "step": 2494 + }, + { + "epoch": 0.35322432221986266, + "grad_norm": 9.431411092607188, + "learning_rate": 4.723416320473702e-06, + "loss": 1.2775, + "step": 2495 + }, + { + "epoch": 0.3533658950944999, + "grad_norm": 10.66411687257201, + "learning_rate": 4.723154198075454e-06, + "loss": 1.4799, + "step": 2496 + }, + { + "epoch": 0.3535074679691371, + "grad_norm": 11.284105610490844, + "learning_rate": 4.7228919588076484e-06, + "loss": 1.2211, + "step": 2497 + }, + { + "epoch": 0.35364904084377435, + "grad_norm": 8.226551371075093, + "learning_rate": 4.722629602684069e-06, + "loss": 1.3003, + "step": 2498 + }, + { + "epoch": 0.35379061371841153, + "grad_norm": 9.354155701175598, + "learning_rate": 4.72236712971851e-06, + "loss": 1.2904, + "step": 2499 + }, + { + "epoch": 0.35393218659304876, + "grad_norm": 11.82630396421898, + "learning_rate": 4.7221045399247666e-06, + "loss": 1.5433, + "step": 2500 + }, + { + "epoch": 0.354073759467686, + "grad_norm": 9.106249212449562, + "learning_rate": 4.721841833316645e-06, + "loss": 1.4807, + "step": 2501 + }, + { + "epoch": 0.3542153323423232, + "grad_norm": 9.820050681424316, + "learning_rate": 4.721579009907955e-06, + "loss": 1.3702, + "step": 2502 + }, + { + "epoch": 0.35435690521696045, + "grad_norm": 9.252923864196717, + "learning_rate": 4.721316069712514e-06, + "loss": 1.2999, + "step": 2503 + }, + { + "epoch": 0.3544984780915976, + "grad_norm": 11.23537970070629, + "learning_rate": 4.721053012744142e-06, + "loss": 1.3154, + "step": 2504 + }, + { + "epoch": 0.35464005096623485, + "grad_norm": 9.254745091122723, + "learning_rate": 4.7207898390166695e-06, + "loss": 1.2765, + "step": 2505 + }, + { + "epoch": 0.3547816238408721, + "grad_norm": 9.426008288940956, + "learning_rate": 4.720526548543931e-06, + "loss": 1.3789, + "step": 2506 + }, + { + "epoch": 0.3549231967155093, + "grad_norm": 7.894592375521029, + "learning_rate": 4.720263141339768e-06, + "loss": 1.2328, + "step": 2507 + }, + { + "epoch": 0.35506476959014655, + "grad_norm": 10.412398987425513, + "learning_rate": 4.719999617418027e-06, + "loss": 1.3842, + "step": 2508 + }, + { + "epoch": 0.3552063424647837, + "grad_norm": 8.363266654004077, + "learning_rate": 4.719735976792562e-06, + "loss": 1.3067, + "step": 2509 + }, + { + "epoch": 0.35534791533942095, + 
"grad_norm": 10.013587494463577, + "learning_rate": 4.71947221947723e-06, + "loss": 1.3353, + "step": 2510 + }, + { + "epoch": 0.3554894882140582, + "grad_norm": 8.351958048689559, + "learning_rate": 4.7192083454859e-06, + "loss": 1.2974, + "step": 2511 + }, + { + "epoch": 0.3556310610886954, + "grad_norm": 10.256202867807607, + "learning_rate": 4.7189443548324415e-06, + "loss": 1.4175, + "step": 2512 + }, + { + "epoch": 0.35577263396333264, + "grad_norm": 10.632951341741382, + "learning_rate": 4.7186802475307325e-06, + "loss": 1.461, + "step": 2513 + }, + { + "epoch": 0.35591420683796987, + "grad_norm": 9.058180628112021, + "learning_rate": 4.7184160235946576e-06, + "loss": 1.3578, + "step": 2514 + }, + { + "epoch": 0.35605577971260705, + "grad_norm": 10.103644938943871, + "learning_rate": 4.7181516830381065e-06, + "loss": 1.2364, + "step": 2515 + }, + { + "epoch": 0.3561973525872443, + "grad_norm": 10.070669707953847, + "learning_rate": 4.717887225874976e-06, + "loss": 1.2538, + "step": 2516 + }, + { + "epoch": 0.3563389254618815, + "grad_norm": 10.586652420508395, + "learning_rate": 4.717622652119166e-06, + "loss": 1.2633, + "step": 2517 + }, + { + "epoch": 0.35648049833651874, + "grad_norm": 10.587766865110952, + "learning_rate": 4.717357961784587e-06, + "loss": 1.3731, + "step": 2518 + }, + { + "epoch": 0.35662207121115597, + "grad_norm": 10.341077384379568, + "learning_rate": 4.717093154885154e-06, + "loss": 1.5045, + "step": 2519 + }, + { + "epoch": 0.35676364408579314, + "grad_norm": 8.080699635799112, + "learning_rate": 4.716828231434787e-06, + "loss": 1.3251, + "step": 2520 + }, + { + "epoch": 0.35690521696043037, + "grad_norm": 8.70455027882777, + "learning_rate": 4.716563191447413e-06, + "loss": 1.3304, + "step": 2521 + }, + { + "epoch": 0.3570467898350676, + "grad_norm": 8.930146720338772, + "learning_rate": 4.7162980349369645e-06, + "loss": 1.286, + "step": 2522 + }, + { + "epoch": 0.35718836270970483, + "grad_norm": 9.43124323718901, + "learning_rate": 4.716032761917381e-06, + "loss": 1.2823, + "step": 2523 + }, + { + "epoch": 0.35732993558434206, + "grad_norm": 10.635766173375877, + "learning_rate": 4.715767372402608e-06, + "loss": 1.3656, + "step": 2524 + }, + { + "epoch": 0.35747150845897924, + "grad_norm": 8.165111886803722, + "learning_rate": 4.715501866406595e-06, + "loss": 1.2738, + "step": 2525 + }, + { + "epoch": 0.35761308133361647, + "grad_norm": 10.392780556487878, + "learning_rate": 4.715236243943302e-06, + "loss": 1.3055, + "step": 2526 + }, + { + "epoch": 0.3577546542082537, + "grad_norm": 9.055298778843307, + "learning_rate": 4.714970505026691e-06, + "loss": 1.2822, + "step": 2527 + }, + { + "epoch": 0.3578962270828909, + "grad_norm": 9.908153651869883, + "learning_rate": 4.714704649670732e-06, + "loss": 1.473, + "step": 2528 + }, + { + "epoch": 0.35803779995752816, + "grad_norm": 8.787560201468528, + "learning_rate": 4.7144386778894e-06, + "loss": 1.2644, + "step": 2529 + }, + { + "epoch": 0.35817937283216533, + "grad_norm": 9.619176514844213, + "learning_rate": 4.71417258969668e-06, + "loss": 1.2005, + "step": 2530 + }, + { + "epoch": 0.35832094570680256, + "grad_norm": 11.070465151448406, + "learning_rate": 4.713906385106556e-06, + "loss": 1.586, + "step": 2531 + }, + { + "epoch": 0.3584625185814398, + "grad_norm": 8.011054031420331, + "learning_rate": 4.7136400641330245e-06, + "loss": 1.2618, + "step": 2532 + }, + { + "epoch": 0.358604091456077, + "grad_norm": 11.086488394191859, + "learning_rate": 4.713373626790086e-06, + "loss": 1.2252, + "step": 2533 + 
}, + { + "epoch": 0.35874566433071425, + "grad_norm": 7.931008876204439, + "learning_rate": 4.713107073091746e-06, + "loss": 1.4278, + "step": 2534 + }, + { + "epoch": 0.35888723720535143, + "grad_norm": 10.805763571686525, + "learning_rate": 4.712840403052018e-06, + "loss": 1.5341, + "step": 2535 + }, + { + "epoch": 0.35902881007998866, + "grad_norm": 7.047689657213527, + "learning_rate": 4.712573616684919e-06, + "loss": 1.2776, + "step": 2536 + }, + { + "epoch": 0.3591703829546259, + "grad_norm": 11.720719886517344, + "learning_rate": 4.712306714004475e-06, + "loss": 1.4437, + "step": 2537 + }, + { + "epoch": 0.3593119558292631, + "grad_norm": 8.554445808515027, + "learning_rate": 4.712039695024717e-06, + "loss": 1.4308, + "step": 2538 + }, + { + "epoch": 0.35945352870390035, + "grad_norm": 8.896887211310553, + "learning_rate": 4.7117725597596814e-06, + "loss": 1.2809, + "step": 2539 + }, + { + "epoch": 0.3595951015785376, + "grad_norm": 9.862351828115997, + "learning_rate": 4.711505308223412e-06, + "loss": 1.3493, + "step": 2540 + }, + { + "epoch": 0.35973667445317475, + "grad_norm": 11.091800830086019, + "learning_rate": 4.711237940429956e-06, + "loss": 1.4282, + "step": 2541 + }, + { + "epoch": 0.359878247327812, + "grad_norm": 9.58673415724864, + "learning_rate": 4.710970456393371e-06, + "loss": 1.4434, + "step": 2542 + }, + { + "epoch": 0.3600198202024492, + "grad_norm": 9.502048472350499, + "learning_rate": 4.710702856127718e-06, + "loss": 1.4473, + "step": 2543 + }, + { + "epoch": 0.36016139307708644, + "grad_norm": 9.141518886580661, + "learning_rate": 4.710435139647064e-06, + "loss": 1.1372, + "step": 2544 + }, + { + "epoch": 0.3603029659517237, + "grad_norm": 10.122395121492021, + "learning_rate": 4.710167306965483e-06, + "loss": 1.3913, + "step": 2545 + }, + { + "epoch": 0.36044453882636085, + "grad_norm": 9.19423091623836, + "learning_rate": 4.709899358097055e-06, + "loss": 1.2436, + "step": 2546 + }, + { + "epoch": 0.3605861117009981, + "grad_norm": 9.615650342350623, + "learning_rate": 4.709631293055865e-06, + "loss": 1.2671, + "step": 2547 + }, + { + "epoch": 0.3607276845756353, + "grad_norm": 9.863310256764278, + "learning_rate": 4.7093631118560054e-06, + "loss": 1.4019, + "step": 2548 + }, + { + "epoch": 0.36086925745027254, + "grad_norm": 9.976731121935407, + "learning_rate": 4.709094814511574e-06, + "loss": 1.5184, + "step": 2549 + }, + { + "epoch": 0.36101083032490977, + "grad_norm": 9.239052557653622, + "learning_rate": 4.708826401036677e-06, + "loss": 1.2005, + "step": 2550 + }, + { + "epoch": 0.36115240319954695, + "grad_norm": 9.617007417333058, + "learning_rate": 4.708557871445422e-06, + "loss": 1.3951, + "step": 2551 + }, + { + "epoch": 0.3612939760741842, + "grad_norm": 7.736659720156256, + "learning_rate": 4.708289225751926e-06, + "loss": 1.3479, + "step": 2552 + }, + { + "epoch": 0.3614355489488214, + "grad_norm": 11.298348189623333, + "learning_rate": 4.7080204639703125e-06, + "loss": 1.5634, + "step": 2553 + }, + { + "epoch": 0.36157712182345864, + "grad_norm": 10.599437166062389, + "learning_rate": 4.707751586114709e-06, + "loss": 1.5393, + "step": 2554 + }, + { + "epoch": 0.36171869469809587, + "grad_norm": 10.190222627942495, + "learning_rate": 4.7074825921992516e-06, + "loss": 1.3217, + "step": 2555 + }, + { + "epoch": 0.36186026757273304, + "grad_norm": 8.11949041336671, + "learning_rate": 4.70721348223808e-06, + "loss": 1.3592, + "step": 2556 + }, + { + "epoch": 0.36200184044737027, + "grad_norm": 9.79908953738656, + "learning_rate": 
4.706944256245342e-06, + "loss": 1.3233, + "step": 2557 + }, + { + "epoch": 0.3621434133220075, + "grad_norm": 8.344955985820024, + "learning_rate": 4.706674914235189e-06, + "loss": 1.2246, + "step": 2558 + }, + { + "epoch": 0.36228498619664473, + "grad_norm": 9.858591861726184, + "learning_rate": 4.706405456221782e-06, + "loss": 1.4378, + "step": 2559 + }, + { + "epoch": 0.36242655907128196, + "grad_norm": 13.634505849346333, + "learning_rate": 4.706135882219285e-06, + "loss": 1.3204, + "step": 2560 + }, + { + "epoch": 0.36256813194591914, + "grad_norm": 11.384796283894643, + "learning_rate": 4.705866192241869e-06, + "loss": 1.2606, + "step": 2561 + }, + { + "epoch": 0.36270970482055637, + "grad_norm": 8.436329852308592, + "learning_rate": 4.705596386303713e-06, + "loss": 1.2888, + "step": 2562 + }, + { + "epoch": 0.3628512776951936, + "grad_norm": 11.52850590337958, + "learning_rate": 4.705326464418999e-06, + "loss": 1.3721, + "step": 2563 + }, + { + "epoch": 0.3629928505698308, + "grad_norm": 9.831755775590613, + "learning_rate": 4.705056426601917e-06, + "loss": 1.5144, + "step": 2564 + }, + { + "epoch": 0.36313442344446806, + "grad_norm": 10.01247505251554, + "learning_rate": 4.704786272866663e-06, + "loss": 1.338, + "step": 2565 + }, + { + "epoch": 0.36327599631910523, + "grad_norm": 10.284787946192402, + "learning_rate": 4.704516003227439e-06, + "loss": 1.3269, + "step": 2566 + }, + { + "epoch": 0.36341756919374246, + "grad_norm": 8.643244368741025, + "learning_rate": 4.704245617698452e-06, + "loss": 1.4173, + "step": 2567 + }, + { + "epoch": 0.3635591420683797, + "grad_norm": 10.542767574034684, + "learning_rate": 4.703975116293916e-06, + "loss": 1.5485, + "step": 2568 + }, + { + "epoch": 0.3637007149430169, + "grad_norm": 9.856377534398018, + "learning_rate": 4.703704499028052e-06, + "loss": 1.3953, + "step": 2569 + }, + { + "epoch": 0.36384228781765415, + "grad_norm": 10.847304231662275, + "learning_rate": 4.703433765915086e-06, + "loss": 1.4219, + "step": 2570 + }, + { + "epoch": 0.3639838606922914, + "grad_norm": 9.12910264684973, + "learning_rate": 4.7031629169692495e-06, + "loss": 1.3368, + "step": 2571 + }, + { + "epoch": 0.36412543356692856, + "grad_norm": 8.077602701623388, + "learning_rate": 4.702891952204781e-06, + "loss": 1.3273, + "step": 2572 + }, + { + "epoch": 0.3642670064415658, + "grad_norm": 9.766174984512872, + "learning_rate": 4.702620871635926e-06, + "loss": 1.3585, + "step": 2573 + }, + { + "epoch": 0.364408579316203, + "grad_norm": 11.398080737862665, + "learning_rate": 4.702349675276933e-06, + "loss": 1.3, + "step": 2574 + }, + { + "epoch": 0.36455015219084025, + "grad_norm": 8.936586133211529, + "learning_rate": 4.702078363142061e-06, + "loss": 1.3563, + "step": 2575 + }, + { + "epoch": 0.3646917250654775, + "grad_norm": 11.762919364369385, + "learning_rate": 4.70180693524557e-06, + "loss": 1.427, + "step": 2576 + }, + { + "epoch": 0.36483329794011465, + "grad_norm": 9.284959867243806, + "learning_rate": 4.7015353916017305e-06, + "loss": 1.3218, + "step": 2577 + }, + { + "epoch": 0.3649748708147519, + "grad_norm": 7.756534897630775, + "learning_rate": 4.701263732224817e-06, + "loss": 1.343, + "step": 2578 + }, + { + "epoch": 0.3651164436893891, + "grad_norm": 10.33163803304204, + "learning_rate": 4.700991957129111e-06, + "loss": 1.3813, + "step": 2579 + }, + { + "epoch": 0.36525801656402634, + "grad_norm": 9.570955016697164, + "learning_rate": 4.700720066328899e-06, + "loss": 1.3358, + "step": 2580 + }, + { + "epoch": 0.3653995894386636, + "grad_norm": 
10.654731709446887, + "learning_rate": 4.7004480598384736e-06, + "loss": 1.3442, + "step": 2581 + }, + { + "epoch": 0.36554116231330075, + "grad_norm": 11.515159235195753, + "learning_rate": 4.700175937672134e-06, + "loss": 1.3133, + "step": 2582 + }, + { + "epoch": 0.365682735187938, + "grad_norm": 9.084550641818753, + "learning_rate": 4.699903699844186e-06, + "loss": 1.2631, + "step": 2583 + }, + { + "epoch": 0.3658243080625752, + "grad_norm": 9.161513342302756, + "learning_rate": 4.699631346368941e-06, + "loss": 1.3486, + "step": 2584 + }, + { + "epoch": 0.36596588093721244, + "grad_norm": 9.07806953242521, + "learning_rate": 4.699358877260717e-06, + "loss": 1.222, + "step": 2585 + }, + { + "epoch": 0.36610745381184967, + "grad_norm": 8.25347549751533, + "learning_rate": 4.699086292533836e-06, + "loss": 1.3341, + "step": 2586 + }, + { + "epoch": 0.36624902668648684, + "grad_norm": 9.102691973238905, + "learning_rate": 4.698813592202628e-06, + "loss": 1.31, + "step": 2587 + }, + { + "epoch": 0.3663905995611241, + "grad_norm": 9.77130381760277, + "learning_rate": 4.69854077628143e-06, + "loss": 1.5145, + "step": 2588 + }, + { + "epoch": 0.3665321724357613, + "grad_norm": 8.266146493768744, + "learning_rate": 4.698267844784582e-06, + "loss": 1.388, + "step": 2589 + }, + { + "epoch": 0.36667374531039854, + "grad_norm": 10.632951341741382, + "learning_rate": 4.697994797726433e-06, + "loss": 1.1958, + "step": 2590 + }, + { + "epoch": 0.36681531818503577, + "grad_norm": 8.01537847604577, + "learning_rate": 4.6977216351213355e-06, + "loss": 1.2619, + "step": 2591 + }, + { + "epoch": 0.36695689105967294, + "grad_norm": 8.58004366848021, + "learning_rate": 4.697448356983651e-06, + "loss": 1.3429, + "step": 2592 + }, + { + "epoch": 0.36709846393431017, + "grad_norm": 11.742743666136708, + "learning_rate": 4.697174963327744e-06, + "loss": 1.4673, + "step": 2593 + }, + { + "epoch": 0.3672400368089474, + "grad_norm": 8.234105838668965, + "learning_rate": 4.696901454167989e-06, + "loss": 1.3037, + "step": 2594 + }, + { + "epoch": 0.36738160968358463, + "grad_norm": 9.222872571981243, + "learning_rate": 4.696627829518761e-06, + "loss": 1.327, + "step": 2595 + }, + { + "epoch": 0.36752318255822186, + "grad_norm": 9.925308522538637, + "learning_rate": 4.696354089394447e-06, + "loss": 1.4799, + "step": 2596 + }, + { + "epoch": 0.3676647554328591, + "grad_norm": 8.405925985565462, + "learning_rate": 4.696080233809436e-06, + "loss": 1.3716, + "step": 2597 + }, + { + "epoch": 0.36780632830749627, + "grad_norm": 9.89121662799051, + "learning_rate": 4.695806262778124e-06, + "loss": 1.3769, + "step": 2598 + }, + { + "epoch": 0.3679479011821335, + "grad_norm": 9.528220173531517, + "learning_rate": 4.695532176314914e-06, + "loss": 1.3324, + "step": 2599 + }, + { + "epoch": 0.3680894740567707, + "grad_norm": 9.626974324446632, + "learning_rate": 4.695257974434215e-06, + "loss": 1.2736, + "step": 2600 + }, + { + "epoch": 0.36823104693140796, + "grad_norm": 10.581946895427905, + "learning_rate": 4.694983657150442e-06, + "loss": 1.1887, + "step": 2601 + }, + { + "epoch": 0.3683726198060452, + "grad_norm": 9.93566378537222, + "learning_rate": 4.6947092244780134e-06, + "loss": 1.4897, + "step": 2602 + }, + { + "epoch": 0.36851419268068236, + "grad_norm": 9.94136787811848, + "learning_rate": 4.694434676431358e-06, + "loss": 1.3721, + "step": 2603 + }, + { + "epoch": 0.3686557655553196, + "grad_norm": 8.673867046678605, + "learning_rate": 4.694160013024907e-06, + "loss": 1.198, + "step": 2604 + }, + { + "epoch": 
0.3687973384299568, + "grad_norm": 9.563005022662937, + "learning_rate": 4.693885234273101e-06, + "loss": 1.2854, + "step": 2605 + }, + { + "epoch": 0.36893891130459405, + "grad_norm": 12.187924421696435, + "learning_rate": 4.693610340190384e-06, + "loss": 1.5656, + "step": 2606 + }, + { + "epoch": 0.3690804841792313, + "grad_norm": 10.934364436150762, + "learning_rate": 4.693335330791207e-06, + "loss": 1.3658, + "step": 2607 + }, + { + "epoch": 0.36922205705386846, + "grad_norm": 8.821289873329006, + "learning_rate": 4.693060206090028e-06, + "loss": 1.5378, + "step": 2608 + }, + { + "epoch": 0.3693636299285057, + "grad_norm": 7.801966287289253, + "learning_rate": 4.692784966101308e-06, + "loss": 1.2421, + "step": 2609 + }, + { + "epoch": 0.3695052028031429, + "grad_norm": 11.45595774675814, + "learning_rate": 4.6925096108395175e-06, + "loss": 1.2819, + "step": 2610 + }, + { + "epoch": 0.36964677567778015, + "grad_norm": 11.173782889973548, + "learning_rate": 4.692234140319131e-06, + "loss": 1.5621, + "step": 2611 + }, + { + "epoch": 0.3697883485524174, + "grad_norm": 8.675178846994669, + "learning_rate": 4.691958554554631e-06, + "loss": 1.3765, + "step": 2612 + }, + { + "epoch": 0.36992992142705455, + "grad_norm": 9.859656280035791, + "learning_rate": 4.6916828535605044e-06, + "loss": 1.2981, + "step": 2613 + }, + { + "epoch": 0.3700714943016918, + "grad_norm": 11.949955538742799, + "learning_rate": 4.691407037351244e-06, + "loss": 1.3316, + "step": 2614 + }, + { + "epoch": 0.370213067176329, + "grad_norm": 8.437451171733715, + "learning_rate": 4.69113110594135e-06, + "loss": 1.4813, + "step": 2615 + }, + { + "epoch": 0.37035464005096624, + "grad_norm": 11.145629614753819, + "learning_rate": 4.690855059345327e-06, + "loss": 1.4193, + "step": 2616 + }, + { + "epoch": 0.3704962129256035, + "grad_norm": 10.327466796585226, + "learning_rate": 4.690578897577687e-06, + "loss": 1.233, + "step": 2617 + }, + { + "epoch": 0.37063778580024065, + "grad_norm": 12.800371188503737, + "learning_rate": 4.690302620652949e-06, + "loss": 1.4209, + "step": 2618 + }, + { + "epoch": 0.3707793586748779, + "grad_norm": 8.79636935852523, + "learning_rate": 4.690026228585634e-06, + "loss": 1.2573, + "step": 2619 + }, + { + "epoch": 0.3709209315495151, + "grad_norm": 13.274270914462567, + "learning_rate": 4.689749721390273e-06, + "loss": 1.3474, + "step": 2620 + }, + { + "epoch": 0.37106250442415234, + "grad_norm": 12.466305906682395, + "learning_rate": 4.689473099081403e-06, + "loss": 1.5922, + "step": 2621 + }, + { + "epoch": 0.37120407729878957, + "grad_norm": 10.053928302121475, + "learning_rate": 4.689196361673565e-06, + "loss": 1.3351, + "step": 2622 + }, + { + "epoch": 0.37134565017342674, + "grad_norm": 11.146311374763563, + "learning_rate": 4.688919509181305e-06, + "loss": 1.4052, + "step": 2623 + }, + { + "epoch": 0.371487223048064, + "grad_norm": 10.038712147339838, + "learning_rate": 4.68864254161918e-06, + "loss": 1.2739, + "step": 2624 + }, + { + "epoch": 0.3716287959227012, + "grad_norm": 11.223543041125476, + "learning_rate": 4.6883654590017475e-06, + "loss": 1.4639, + "step": 2625 + }, + { + "epoch": 0.37177036879733844, + "grad_norm": 11.361158732029713, + "learning_rate": 4.688088261343575e-06, + "loss": 1.2961, + "step": 2626 + }, + { + "epoch": 0.37191194167197567, + "grad_norm": 7.985938350264703, + "learning_rate": 4.687810948659234e-06, + "loss": 1.3817, + "step": 2627 + }, + { + "epoch": 0.3720535145466129, + "grad_norm": 9.298589570755938, + "learning_rate": 4.687533520963302e-06, + "loss": 
1.3278, + "step": 2628 + }, + { + "epoch": 0.37219508742125007, + "grad_norm": 12.996455369555637, + "learning_rate": 4.6872559782703655e-06, + "loss": 1.4197, + "step": 2629 + }, + { + "epoch": 0.3723366602958873, + "grad_norm": 9.333826824039127, + "learning_rate": 4.686978320595012e-06, + "loss": 1.1977, + "step": 2630 + }, + { + "epoch": 0.37247823317052453, + "grad_norm": 9.01050420876609, + "learning_rate": 4.686700547951839e-06, + "loss": 1.3206, + "step": 2631 + }, + { + "epoch": 0.37261980604516176, + "grad_norm": 8.41033632020997, + "learning_rate": 4.686422660355448e-06, + "loss": 1.4791, + "step": 2632 + }, + { + "epoch": 0.372761378919799, + "grad_norm": 8.891115413072436, + "learning_rate": 4.686144657820449e-06, + "loss": 1.3807, + "step": 2633 + }, + { + "epoch": 0.37290295179443617, + "grad_norm": 9.967835387363213, + "learning_rate": 4.685866540361456e-06, + "loss": 1.4185, + "step": 2634 + }, + { + "epoch": 0.3730445246690734, + "grad_norm": 8.444053083129162, + "learning_rate": 4.685588307993087e-06, + "loss": 1.1858, + "step": 2635 + }, + { + "epoch": 0.3731860975437106, + "grad_norm": 9.64394189293854, + "learning_rate": 4.6853099607299725e-06, + "loss": 1.2906, + "step": 2636 + }, + { + "epoch": 0.37332767041834786, + "grad_norm": 9.894015389483874, + "learning_rate": 4.685031498586741e-06, + "loss": 1.1827, + "step": 2637 + }, + { + "epoch": 0.3734692432929851, + "grad_norm": 7.9781833238632665, + "learning_rate": 4.684752921578033e-06, + "loss": 1.4153, + "step": 2638 + }, + { + "epoch": 0.37361081616762226, + "grad_norm": 9.808859374514649, + "learning_rate": 4.684474229718494e-06, + "loss": 1.3973, + "step": 2639 + }, + { + "epoch": 0.3737523890422595, + "grad_norm": 15.439946648698253, + "learning_rate": 4.6841954230227725e-06, + "loss": 1.5312, + "step": 2640 + }, + { + "epoch": 0.3738939619168967, + "grad_norm": 10.989721524543365, + "learning_rate": 4.683916501505527e-06, + "loss": 1.3442, + "step": 2641 + }, + { + "epoch": 0.37403553479153395, + "grad_norm": 9.180143771905295, + "learning_rate": 4.6836374651814186e-06, + "loss": 1.3311, + "step": 2642 + }, + { + "epoch": 0.3741771076661712, + "grad_norm": 8.477762191889235, + "learning_rate": 4.6833583140651175e-06, + "loss": 1.1947, + "step": 2643 + }, + { + "epoch": 0.37431868054080836, + "grad_norm": 11.395662760925411, + "learning_rate": 4.6830790481712975e-06, + "loss": 1.4976, + "step": 2644 + }, + { + "epoch": 0.3744602534154456, + "grad_norm": 8.805416209030543, + "learning_rate": 4.68279966751464e-06, + "loss": 1.2381, + "step": 2645 + }, + { + "epoch": 0.3746018262900828, + "grad_norm": 8.292539106569494, + "learning_rate": 4.682520172109831e-06, + "loss": 1.1332, + "step": 2646 + }, + { + "epoch": 0.37474339916472005, + "grad_norm": 8.345654445285714, + "learning_rate": 4.682240561971565e-06, + "loss": 1.3987, + "step": 2647 + }, + { + "epoch": 0.3748849720393573, + "grad_norm": 7.703319121539835, + "learning_rate": 4.681960837114539e-06, + "loss": 1.2385, + "step": 2648 + }, + { + "epoch": 0.37502654491399445, + "grad_norm": 14.325787768581163, + "learning_rate": 4.681680997553459e-06, + "loss": 1.4275, + "step": 2649 + }, + { + "epoch": 0.3751681177886317, + "grad_norm": 9.022912001067278, + "learning_rate": 4.681401043303036e-06, + "loss": 1.2391, + "step": 2650 + }, + { + "epoch": 0.3753096906632689, + "grad_norm": 9.914498442075129, + "learning_rate": 4.681120974377985e-06, + "loss": 1.3294, + "step": 2651 + }, + { + "epoch": 0.37545126353790614, + "grad_norm": 9.79979335022077, + 
"learning_rate": 4.680840790793032e-06, + "loss": 1.2942, + "step": 2652 + }, + { + "epoch": 0.3755928364125434, + "grad_norm": 11.574444184439741, + "learning_rate": 4.680560492562904e-06, + "loss": 1.3636, + "step": 2653 + }, + { + "epoch": 0.37573440928718055, + "grad_norm": 10.376484224713236, + "learning_rate": 4.680280079702339e-06, + "loss": 1.419, + "step": 2654 + }, + { + "epoch": 0.3758759821618178, + "grad_norm": 12.461187301571378, + "learning_rate": 4.679999552226073e-06, + "loss": 1.3748, + "step": 2655 + }, + { + "epoch": 0.376017555036455, + "grad_norm": 10.263335322443575, + "learning_rate": 4.679718910148858e-06, + "loss": 1.441, + "step": 2656 + }, + { + "epoch": 0.37615912791109224, + "grad_norm": 10.1665512662081, + "learning_rate": 4.679438153485444e-06, + "loss": 1.2852, + "step": 2657 + }, + { + "epoch": 0.37630070078572947, + "grad_norm": 12.658632708774833, + "learning_rate": 4.679157282250592e-06, + "loss": 1.2924, + "step": 2658 + }, + { + "epoch": 0.3764422736603667, + "grad_norm": 15.837187662635163, + "learning_rate": 4.678876296459066e-06, + "loss": 1.5171, + "step": 2659 + }, + { + "epoch": 0.3765838465350039, + "grad_norm": 11.478213695886547, + "learning_rate": 4.678595196125638e-06, + "loss": 1.5487, + "step": 2660 + }, + { + "epoch": 0.3767254194096411, + "grad_norm": 10.227289114274276, + "learning_rate": 4.678313981265086e-06, + "loss": 1.3614, + "step": 2661 + }, + { + "epoch": 0.37686699228427833, + "grad_norm": 10.87434964044007, + "learning_rate": 4.678032651892191e-06, + "loss": 1.4133, + "step": 2662 + }, + { + "epoch": 0.37700856515891557, + "grad_norm": 11.892068582295328, + "learning_rate": 4.677751208021744e-06, + "loss": 1.2706, + "step": 2663 + }, + { + "epoch": 0.3771501380335528, + "grad_norm": 9.977311909010739, + "learning_rate": 4.677469649668539e-06, + "loss": 1.4324, + "step": 2664 + }, + { + "epoch": 0.37729171090818997, + "grad_norm": 11.637420254115533, + "learning_rate": 4.677187976847379e-06, + "loss": 1.2458, + "step": 2665 + }, + { + "epoch": 0.3774332837828272, + "grad_norm": 9.215099167851632, + "learning_rate": 4.67690618957307e-06, + "loss": 1.3439, + "step": 2666 + }, + { + "epoch": 0.37757485665746443, + "grad_norm": 10.28201652846374, + "learning_rate": 4.676624287860425e-06, + "loss": 1.5099, + "step": 2667 + }, + { + "epoch": 0.37771642953210166, + "grad_norm": 10.335607175962174, + "learning_rate": 4.676342271724266e-06, + "loss": 1.4562, + "step": 2668 + }, + { + "epoch": 0.3778580024067389, + "grad_norm": 8.475984643321867, + "learning_rate": 4.676060141179415e-06, + "loss": 1.3604, + "step": 2669 + }, + { + "epoch": 0.37799957528137607, + "grad_norm": 12.355138387734803, + "learning_rate": 4.675777896240706e-06, + "loss": 1.4093, + "step": 2670 + }, + { + "epoch": 0.3781411481560133, + "grad_norm": 10.95136572276936, + "learning_rate": 4.675495536922975e-06, + "loss": 1.4342, + "step": 2671 + }, + { + "epoch": 0.3782827210306505, + "grad_norm": 10.148173245813789, + "learning_rate": 4.675213063241065e-06, + "loss": 1.1531, + "step": 2672 + }, + { + "epoch": 0.37842429390528776, + "grad_norm": 10.22154715515793, + "learning_rate": 4.674930475209827e-06, + "loss": 1.243, + "step": 2673 + }, + { + "epoch": 0.378565866779925, + "grad_norm": 8.33009346606149, + "learning_rate": 4.674647772844115e-06, + "loss": 1.236, + "step": 2674 + }, + { + "epoch": 0.37870743965456216, + "grad_norm": 9.257224483567423, + "learning_rate": 4.674364956158791e-06, + "loss": 1.4091, + "step": 2675 + }, + { + "epoch": 0.3788490125291994, 
+ "grad_norm": 10.818658210633869, + "learning_rate": 4.674082025168723e-06, + "loss": 1.3547, + "step": 2676 + }, + { + "epoch": 0.3789905854038366, + "grad_norm": 11.154588078682158, + "learning_rate": 4.673798979888784e-06, + "loss": 1.3831, + "step": 2677 + }, + { + "epoch": 0.37913215827847385, + "grad_norm": 10.073039157021416, + "learning_rate": 4.673515820333853e-06, + "loss": 1.5053, + "step": 2678 + }, + { + "epoch": 0.3792737311531111, + "grad_norm": 11.065952602773786, + "learning_rate": 4.673232546518817e-06, + "loss": 1.418, + "step": 2679 + }, + { + "epoch": 0.37941530402774826, + "grad_norm": 9.691340122204865, + "learning_rate": 4.672949158458565e-06, + "loss": 1.3683, + "step": 2680 + }, + { + "epoch": 0.3795568769023855, + "grad_norm": 11.200709552087083, + "learning_rate": 4.672665656167997e-06, + "loss": 1.3431, + "step": 2681 + }, + { + "epoch": 0.3796984497770227, + "grad_norm": 10.345372559673116, + "learning_rate": 4.672382039662016e-06, + "loss": 1.3055, + "step": 2682 + }, + { + "epoch": 0.37984002265165995, + "grad_norm": 11.661562284364363, + "learning_rate": 4.672098308955529e-06, + "loss": 1.3483, + "step": 2683 + }, + { + "epoch": 0.3799815955262972, + "grad_norm": 12.983219173322889, + "learning_rate": 4.671814464063455e-06, + "loss": 1.5273, + "step": 2684 + }, + { + "epoch": 0.3801231684009344, + "grad_norm": 10.982784671615228, + "learning_rate": 4.671530505000714e-06, + "loss": 1.2853, + "step": 2685 + }, + { + "epoch": 0.3802647412755716, + "grad_norm": 7.978578019569369, + "learning_rate": 4.671246431782234e-06, + "loss": 1.387, + "step": 2686 + }, + { + "epoch": 0.3804063141502088, + "grad_norm": 8.038322214457287, + "learning_rate": 4.670962244422946e-06, + "loss": 1.4877, + "step": 2687 + }, + { + "epoch": 0.38054788702484604, + "grad_norm": 10.98707028743225, + "learning_rate": 4.670677942937793e-06, + "loss": 1.5384, + "step": 2688 + }, + { + "epoch": 0.3806894598994833, + "grad_norm": 11.313081673689375, + "learning_rate": 4.6703935273417195e-06, + "loss": 1.3609, + "step": 2689 + }, + { + "epoch": 0.3808310327741205, + "grad_norm": 10.412177703009249, + "learning_rate": 4.670108997649676e-06, + "loss": 1.3185, + "step": 2690 + }, + { + "epoch": 0.3809726056487577, + "grad_norm": 10.054040610867874, + "learning_rate": 4.66982435387662e-06, + "loss": 1.4195, + "step": 2691 + }, + { + "epoch": 0.3811141785233949, + "grad_norm": 9.306332587701245, + "learning_rate": 4.669539596037517e-06, + "loss": 1.4194, + "step": 2692 + }, + { + "epoch": 0.38125575139803214, + "grad_norm": 10.6979344165708, + "learning_rate": 4.669254724147334e-06, + "loss": 1.4598, + "step": 2693 + }, + { + "epoch": 0.38139732427266937, + "grad_norm": 10.702752901834854, + "learning_rate": 4.6689697382210475e-06, + "loss": 1.3117, + "step": 2694 + }, + { + "epoch": 0.3815388971473066, + "grad_norm": 9.911225907364658, + "learning_rate": 4.668684638273639e-06, + "loss": 1.2393, + "step": 2695 + }, + { + "epoch": 0.3816804700219438, + "grad_norm": 9.745084648410577, + "learning_rate": 4.668399424320097e-06, + "loss": 1.3565, + "step": 2696 + }, + { + "epoch": 0.381822042896581, + "grad_norm": 9.837591079232126, + "learning_rate": 4.668114096375413e-06, + "loss": 1.5382, + "step": 2697 + }, + { + "epoch": 0.38196361577121823, + "grad_norm": 8.864272820801578, + "learning_rate": 4.6678286544545894e-06, + "loss": 1.5159, + "step": 2698 + }, + { + "epoch": 0.38210518864585546, + "grad_norm": 9.222782403970946, + "learning_rate": 4.667543098572627e-06, + "loss": 1.4108, + "step": 2699 
+ }, + { + "epoch": 0.3822467615204927, + "grad_norm": 8.373216581482302, + "learning_rate": 4.667257428744542e-06, + "loss": 1.368, + "step": 2700 + }, + { + "epoch": 0.38238833439512987, + "grad_norm": 11.697078268296252, + "learning_rate": 4.6669716449853505e-06, + "loss": 1.406, + "step": 2701 + }, + { + "epoch": 0.3825299072697671, + "grad_norm": 9.665612086507297, + "learning_rate": 4.666685747310075e-06, + "loss": 1.3514, + "step": 2702 + }, + { + "epoch": 0.38267148014440433, + "grad_norm": 8.82546022420051, + "learning_rate": 4.666399735733745e-06, + "loss": 1.2088, + "step": 2703 + }, + { + "epoch": 0.38281305301904156, + "grad_norm": 8.3727855886308, + "learning_rate": 4.666113610271395e-06, + "loss": 1.2886, + "step": 2704 + }, + { + "epoch": 0.3829546258936788, + "grad_norm": 9.699658842084707, + "learning_rate": 4.66582737093807e-06, + "loss": 1.3807, + "step": 2705 + }, + { + "epoch": 0.38309619876831597, + "grad_norm": 9.08050222343912, + "learning_rate": 4.665541017748813e-06, + "loss": 1.283, + "step": 2706 + }, + { + "epoch": 0.3832377716429532, + "grad_norm": 10.709286975081415, + "learning_rate": 4.665254550718681e-06, + "loss": 1.3762, + "step": 2707 + }, + { + "epoch": 0.3833793445175904, + "grad_norm": 8.933920830534039, + "learning_rate": 4.6649679698627306e-06, + "loss": 1.271, + "step": 2708 + }, + { + "epoch": 0.38352091739222766, + "grad_norm": 8.920955256472721, + "learning_rate": 4.664681275196028e-06, + "loss": 1.4052, + "step": 2709 + }, + { + "epoch": 0.3836624902668649, + "grad_norm": 11.627357161896787, + "learning_rate": 4.664394466733646e-06, + "loss": 1.4375, + "step": 2710 + }, + { + "epoch": 0.38380406314150206, + "grad_norm": 8.045155402579875, + "learning_rate": 4.66410754449066e-06, + "loss": 1.3756, + "step": 2711 + }, + { + "epoch": 0.3839456360161393, + "grad_norm": 10.57837382577547, + "learning_rate": 4.6638205084821544e-06, + "loss": 1.3694, + "step": 2712 + }, + { + "epoch": 0.3840872088907765, + "grad_norm": 7.904616598040143, + "learning_rate": 4.6635333587232175e-06, + "loss": 1.3144, + "step": 2713 + }, + { + "epoch": 0.38422878176541375, + "grad_norm": 13.495878932805574, + "learning_rate": 4.663246095228946e-06, + "loss": 1.3702, + "step": 2714 + }, + { + "epoch": 0.384370354640051, + "grad_norm": 12.100674167484106, + "learning_rate": 4.66295871801444e-06, + "loss": 1.4133, + "step": 2715 + }, + { + "epoch": 0.3845119275146882, + "grad_norm": 8.980030794154597, + "learning_rate": 4.662671227094806e-06, + "loss": 1.3135, + "step": 2716 + }, + { + "epoch": 0.3846535003893254, + "grad_norm": 8.29613105039171, + "learning_rate": 4.662383622485159e-06, + "loss": 1.3489, + "step": 2717 + }, + { + "epoch": 0.3847950732639626, + "grad_norm": 11.41272558049922, + "learning_rate": 4.662095904200617e-06, + "loss": 1.2931, + "step": 2718 + }, + { + "epoch": 0.38493664613859985, + "grad_norm": 11.209020959874518, + "learning_rate": 4.661808072256306e-06, + "loss": 1.3658, + "step": 2719 + }, + { + "epoch": 0.3850782190132371, + "grad_norm": 8.623691625456964, + "learning_rate": 4.661520126667356e-06, + "loss": 1.3799, + "step": 2720 + }, + { + "epoch": 0.3852197918878743, + "grad_norm": 10.883893465249034, + "learning_rate": 4.6612320674489045e-06, + "loss": 1.3583, + "step": 2721 + }, + { + "epoch": 0.3853613647625115, + "grad_norm": 10.05328705585598, + "learning_rate": 4.660943894616095e-06, + "loss": 1.4679, + "step": 2722 + }, + { + "epoch": 0.3855029376371487, + "grad_norm": 11.036612873996656, + "learning_rate": 4.660655608184076e-06, + 
"loss": 1.4387, + "step": 2723 + }, + { + "epoch": 0.38564451051178594, + "grad_norm": 8.735430112925894, + "learning_rate": 4.660367208168004e-06, + "loss": 1.2773, + "step": 2724 + }, + { + "epoch": 0.3857860833864232, + "grad_norm": 8.406797710287742, + "learning_rate": 4.660078694583037e-06, + "loss": 1.3671, + "step": 2725 + }, + { + "epoch": 0.3859276562610604, + "grad_norm": 9.412165913725703, + "learning_rate": 4.6597900674443445e-06, + "loss": 1.4734, + "step": 2726 + }, + { + "epoch": 0.3860692291356976, + "grad_norm": 10.023768977737015, + "learning_rate": 4.659501326767098e-06, + "loss": 1.5199, + "step": 2727 + }, + { + "epoch": 0.3862108020103348, + "grad_norm": 10.510573376841101, + "learning_rate": 4.6592124725664776e-06, + "loss": 1.3955, + "step": 2728 + }, + { + "epoch": 0.38635237488497204, + "grad_norm": 9.624021009718746, + "learning_rate": 4.6589235048576676e-06, + "loss": 1.2931, + "step": 2729 + }, + { + "epoch": 0.38649394775960927, + "grad_norm": 9.93348276764094, + "learning_rate": 4.658634423655858e-06, + "loss": 1.2739, + "step": 2730 + }, + { + "epoch": 0.3866355206342465, + "grad_norm": 10.070906450676684, + "learning_rate": 4.658345228976246e-06, + "loss": 1.3344, + "step": 2731 + }, + { + "epoch": 0.3867770935088837, + "grad_norm": 10.792909724888364, + "learning_rate": 4.658055920834036e-06, + "loss": 1.3413, + "step": 2732 + }, + { + "epoch": 0.3869186663835209, + "grad_norm": 8.08021055117189, + "learning_rate": 4.6577664992444345e-06, + "loss": 1.2123, + "step": 2733 + }, + { + "epoch": 0.38706023925815813, + "grad_norm": 8.778612711453633, + "learning_rate": 4.657476964222657e-06, + "loss": 1.4105, + "step": 2734 + }, + { + "epoch": 0.38720181213279536, + "grad_norm": 8.768424119284179, + "learning_rate": 4.657187315783925e-06, + "loss": 1.3075, + "step": 2735 + }, + { + "epoch": 0.3873433850074326, + "grad_norm": 7.45720926974518, + "learning_rate": 4.656897553943463e-06, + "loss": 1.2876, + "step": 2736 + }, + { + "epoch": 0.38748495788206977, + "grad_norm": 10.199335069113612, + "learning_rate": 4.656607678716506e-06, + "loss": 1.3631, + "step": 2737 + }, + { + "epoch": 0.387626530756707, + "grad_norm": 8.402863568445126, + "learning_rate": 4.656317690118291e-06, + "loss": 1.3287, + "step": 2738 + }, + { + "epoch": 0.38776810363134423, + "grad_norm": 8.454443076102102, + "learning_rate": 4.6560275881640615e-06, + "loss": 1.3103, + "step": 2739 + }, + { + "epoch": 0.38790967650598146, + "grad_norm": 9.652602608705179, + "learning_rate": 4.655737372869071e-06, + "loss": 1.3687, + "step": 2740 + }, + { + "epoch": 0.3880512493806187, + "grad_norm": 8.847429029229119, + "learning_rate": 4.655447044248573e-06, + "loss": 1.2548, + "step": 2741 + }, + { + "epoch": 0.3881928222552559, + "grad_norm": 7.5432575156727815, + "learning_rate": 4.655156602317832e-06, + "loss": 1.2472, + "step": 2742 + }, + { + "epoch": 0.3883343951298931, + "grad_norm": 8.129569309364532, + "learning_rate": 4.654866047092115e-06, + "loss": 1.2768, + "step": 2743 + }, + { + "epoch": 0.3884759680045303, + "grad_norm": 9.115501999834368, + "learning_rate": 4.654575378586696e-06, + "loss": 1.3474, + "step": 2744 + }, + { + "epoch": 0.38861754087916756, + "grad_norm": 7.998165158618632, + "learning_rate": 4.6542845968168575e-06, + "loss": 1.1801, + "step": 2745 + }, + { + "epoch": 0.3887591137538048, + "grad_norm": 9.96245431159634, + "learning_rate": 4.653993701797883e-06, + "loss": 1.3783, + "step": 2746 + }, + { + "epoch": 0.388900686628442, + "grad_norm": 8.852742398467992, + 
"learning_rate": 4.653702693545066e-06, + "loss": 1.2266, + "step": 2747 + }, + { + "epoch": 0.3890422595030792, + "grad_norm": 10.187163364956724, + "learning_rate": 4.653411572073704e-06, + "loss": 1.3879, + "step": 2748 + }, + { + "epoch": 0.3891838323777164, + "grad_norm": 10.219071409437914, + "learning_rate": 4.6531203373991015e-06, + "loss": 1.4126, + "step": 2749 + }, + { + "epoch": 0.38932540525235365, + "grad_norm": 9.434443300902773, + "learning_rate": 4.652828989536567e-06, + "loss": 1.2364, + "step": 2750 + }, + { + "epoch": 0.3894669781269909, + "grad_norm": 8.683922339711959, + "learning_rate": 4.6525375285014195e-06, + "loss": 1.4456, + "step": 2751 + }, + { + "epoch": 0.3896085510016281, + "grad_norm": 9.651316150783934, + "learning_rate": 4.652245954308979e-06, + "loss": 1.4638, + "step": 2752 + }, + { + "epoch": 0.3897501238762653, + "grad_norm": 10.917035822171608, + "learning_rate": 4.651954266974573e-06, + "loss": 1.3792, + "step": 2753 + }, + { + "epoch": 0.3898916967509025, + "grad_norm": 9.697098240939585, + "learning_rate": 4.651662466513536e-06, + "loss": 1.2616, + "step": 2754 + }, + { + "epoch": 0.39003326962553975, + "grad_norm": 8.890800487750711, + "learning_rate": 4.651370552941207e-06, + "loss": 1.3711, + "step": 2755 + }, + { + "epoch": 0.390174842500177, + "grad_norm": 10.705604611266635, + "learning_rate": 4.651078526272932e-06, + "loss": 1.2014, + "step": 2756 + }, + { + "epoch": 0.3903164153748142, + "grad_norm": 11.739655831538196, + "learning_rate": 4.6507863865240635e-06, + "loss": 1.3189, + "step": 2757 + }, + { + "epoch": 0.3904579882494514, + "grad_norm": 10.28018730260118, + "learning_rate": 4.650494133709958e-06, + "loss": 1.4561, + "step": 2758 + }, + { + "epoch": 0.3905995611240886, + "grad_norm": 8.92066319348082, + "learning_rate": 4.650201767845979e-06, + "loss": 1.2872, + "step": 2759 + }, + { + "epoch": 0.39074113399872584, + "grad_norm": 8.437134685378606, + "learning_rate": 4.649909288947497e-06, + "loss": 1.3115, + "step": 2760 + }, + { + "epoch": 0.3908827068733631, + "grad_norm": 11.024685989717575, + "learning_rate": 4.649616697029886e-06, + "loss": 1.1658, + "step": 2761 + }, + { + "epoch": 0.3910242797480003, + "grad_norm": 9.286247783049062, + "learning_rate": 4.649323992108529e-06, + "loss": 1.3053, + "step": 2762 + }, + { + "epoch": 0.3911658526226375, + "grad_norm": 9.096330309397095, + "learning_rate": 4.649031174198812e-06, + "loss": 1.3373, + "step": 2763 + }, + { + "epoch": 0.3913074254972747, + "grad_norm": 9.109302134410548, + "learning_rate": 4.648738243316128e-06, + "loss": 1.5556, + "step": 2764 + }, + { + "epoch": 0.39144899837191194, + "grad_norm": 8.309587699862083, + "learning_rate": 4.648445199475877e-06, + "loss": 1.2552, + "step": 2765 + }, + { + "epoch": 0.39159057124654917, + "grad_norm": 9.707568835717588, + "learning_rate": 4.648152042693464e-06, + "loss": 1.2814, + "step": 2766 + }, + { + "epoch": 0.3917321441211864, + "grad_norm": 10.405905411430599, + "learning_rate": 4.6478587729843e-06, + "loss": 1.3209, + "step": 2767 + }, + { + "epoch": 0.3918737169958236, + "grad_norm": 13.19552931937763, + "learning_rate": 4.647565390363802e-06, + "loss": 1.3155, + "step": 2768 + }, + { + "epoch": 0.3920152898704608, + "grad_norm": 10.406235336889068, + "learning_rate": 4.6472718948473915e-06, + "loss": 1.233, + "step": 2769 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 11.194516501708828, + "learning_rate": 4.6469782864504995e-06, + "loss": 1.2991, + "step": 2770 + }, + { + "epoch": 
0.39229843561973526, + "grad_norm": 8.913558798624878, + "learning_rate": 4.64668456518856e-06, + "loss": 1.2707, + "step": 2771 + }, + { + "epoch": 0.3924400084943725, + "grad_norm": 11.988411075787747, + "learning_rate": 4.646390731077013e-06, + "loss": 1.3388, + "step": 2772 + }, + { + "epoch": 0.3925815813690097, + "grad_norm": 11.490905482749241, + "learning_rate": 4.646096784131306e-06, + "loss": 1.3681, + "step": 2773 + }, + { + "epoch": 0.3927231542436469, + "grad_norm": 10.511080390431855, + "learning_rate": 4.645802724366891e-06, + "loss": 1.3291, + "step": 2774 + }, + { + "epoch": 0.39286472711828413, + "grad_norm": 11.634510374959449, + "learning_rate": 4.645508551799227e-06, + "loss": 1.4062, + "step": 2775 + }, + { + "epoch": 0.39300629999292136, + "grad_norm": 10.07523691357906, + "learning_rate": 4.645214266443778e-06, + "loss": 1.3857, + "step": 2776 + }, + { + "epoch": 0.3931478728675586, + "grad_norm": 8.936610891226685, + "learning_rate": 4.644919868316014e-06, + "loss": 1.3691, + "step": 2777 + }, + { + "epoch": 0.3932894457421958, + "grad_norm": 9.13612628009058, + "learning_rate": 4.644625357431414e-06, + "loss": 1.2615, + "step": 2778 + }, + { + "epoch": 0.393431018616833, + "grad_norm": 11.423029182450906, + "learning_rate": 4.6443307338054565e-06, + "loss": 1.3913, + "step": 2779 + }, + { + "epoch": 0.3935725914914702, + "grad_norm": 7.98584472506825, + "learning_rate": 4.644035997453631e-06, + "loss": 1.2388, + "step": 2780 + }, + { + "epoch": 0.39371416436610746, + "grad_norm": 9.030338439344428, + "learning_rate": 4.643741148391432e-06, + "loss": 1.4152, + "step": 2781 + }, + { + "epoch": 0.3938557372407447, + "grad_norm": 12.553341388048157, + "learning_rate": 4.64344618663436e-06, + "loss": 1.445, + "step": 2782 + }, + { + "epoch": 0.3939973101153819, + "grad_norm": 8.373602451608157, + "learning_rate": 4.643151112197919e-06, + "loss": 1.2911, + "step": 2783 + }, + { + "epoch": 0.3941388829900191, + "grad_norm": 8.977310401828671, + "learning_rate": 4.642855925097622e-06, + "loss": 1.3993, + "step": 2784 + }, + { + "epoch": 0.3942804558646563, + "grad_norm": 8.865866245864009, + "learning_rate": 4.642560625348988e-06, + "loss": 1.2814, + "step": 2785 + }, + { + "epoch": 0.39442202873929355, + "grad_norm": 9.939664784848443, + "learning_rate": 4.642265212967539e-06, + "loss": 1.2551, + "step": 2786 + }, + { + "epoch": 0.3945636016139308, + "grad_norm": 10.9379509533152, + "learning_rate": 4.6419696879688046e-06, + "loss": 1.2219, + "step": 2787 + }, + { + "epoch": 0.394705174488568, + "grad_norm": 7.322482346252645, + "learning_rate": 4.641674050368321e-06, + "loss": 1.1737, + "step": 2788 + }, + { + "epoch": 0.3948467473632052, + "grad_norm": 10.018388244989497, + "learning_rate": 4.641378300181629e-06, + "loss": 1.412, + "step": 2789 + }, + { + "epoch": 0.3949883202378424, + "grad_norm": 11.767286206182083, + "learning_rate": 4.641082437424277e-06, + "loss": 1.4521, + "step": 2790 + }, + { + "epoch": 0.39512989311247965, + "grad_norm": 8.076085672529674, + "learning_rate": 4.6407864621118184e-06, + "loss": 1.4024, + "step": 2791 + }, + { + "epoch": 0.3952714659871169, + "grad_norm": 8.468239631617429, + "learning_rate": 4.640490374259811e-06, + "loss": 1.383, + "step": 2792 + }, + { + "epoch": 0.3954130388617541, + "grad_norm": 7.924437099165028, + "learning_rate": 4.6401941738838204e-06, + "loss": 1.319, + "step": 2793 + }, + { + "epoch": 0.3955546117363913, + "grad_norm": 9.86252743097124, + "learning_rate": 4.639897860999418e-06, + "loss": 1.3689, + 
"step": 2794 + }, + { + "epoch": 0.3956961846110285, + "grad_norm": 8.282860250286808, + "learning_rate": 4.639601435622182e-06, + "loss": 1.3725, + "step": 2795 + }, + { + "epoch": 0.39583775748566574, + "grad_norm": 10.388787358060425, + "learning_rate": 4.639304897767692e-06, + "loss": 1.4397, + "step": 2796 + }, + { + "epoch": 0.395979330360303, + "grad_norm": 10.028938383624613, + "learning_rate": 4.63900824745154e-06, + "loss": 1.3749, + "step": 2797 + }, + { + "epoch": 0.3961209032349402, + "grad_norm": 7.044476762865275, + "learning_rate": 4.638711484689319e-06, + "loss": 1.1255, + "step": 2798 + }, + { + "epoch": 0.3962624761095774, + "grad_norm": 8.991914720151247, + "learning_rate": 4.638414609496628e-06, + "loss": 1.2067, + "step": 2799 + }, + { + "epoch": 0.3964040489842146, + "grad_norm": 10.556095506566336, + "learning_rate": 4.638117621889078e-06, + "loss": 1.3289, + "step": 2800 + }, + { + "epoch": 0.39654562185885184, + "grad_norm": 8.471367575752785, + "learning_rate": 4.637820521882278e-06, + "loss": 1.246, + "step": 2801 + }, + { + "epoch": 0.39668719473348907, + "grad_norm": 11.04779592773327, + "learning_rate": 4.637523309491847e-06, + "loss": 1.4815, + "step": 2802 + }, + { + "epoch": 0.3968287676081263, + "grad_norm": 9.4329758467408, + "learning_rate": 4.63722598473341e-06, + "loss": 1.3778, + "step": 2803 + }, + { + "epoch": 0.39697034048276353, + "grad_norm": 10.393342131814256, + "learning_rate": 4.636928547622596e-06, + "loss": 1.2944, + "step": 2804 + }, + { + "epoch": 0.3971119133574007, + "grad_norm": 11.15508223547347, + "learning_rate": 4.636630998175042e-06, + "loss": 1.3217, + "step": 2805 + }, + { + "epoch": 0.39725348623203793, + "grad_norm": 9.297082209481816, + "learning_rate": 4.636333336406389e-06, + "loss": 1.3225, + "step": 2806 + }, + { + "epoch": 0.39739505910667516, + "grad_norm": 9.970988342211008, + "learning_rate": 4.636035562332286e-06, + "loss": 1.5343, + "step": 2807 + }, + { + "epoch": 0.3975366319813124, + "grad_norm": 9.364636605987817, + "learning_rate": 4.6357376759683856e-06, + "loss": 1.3674, + "step": 2808 + }, + { + "epoch": 0.3976782048559496, + "grad_norm": 9.198120207098746, + "learning_rate": 4.635439677330349e-06, + "loss": 1.3061, + "step": 2809 + }, + { + "epoch": 0.3978197777305868, + "grad_norm": 10.595354796981825, + "learning_rate": 4.635141566433839e-06, + "loss": 1.2876, + "step": 2810 + }, + { + "epoch": 0.39796135060522403, + "grad_norm": 8.575999512544305, + "learning_rate": 4.6348433432945314e-06, + "loss": 1.185, + "step": 2811 + }, + { + "epoch": 0.39810292347986126, + "grad_norm": 9.850691609379794, + "learning_rate": 4.6345450079281e-06, + "loss": 1.4283, + "step": 2812 + }, + { + "epoch": 0.3982444963544985, + "grad_norm": 9.972257278177693, + "learning_rate": 4.634246560350229e-06, + "loss": 1.3128, + "step": 2813 + }, + { + "epoch": 0.3983860692291357, + "grad_norm": 11.871154162399751, + "learning_rate": 4.633948000576607e-06, + "loss": 1.3242, + "step": 2814 + }, + { + "epoch": 0.3985276421037729, + "grad_norm": 11.338866809019997, + "learning_rate": 4.63364932862293e-06, + "loss": 1.4601, + "step": 2815 + }, + { + "epoch": 0.3986692149784101, + "grad_norm": 9.59754217791897, + "learning_rate": 4.633350544504899e-06, + "loss": 1.1109, + "step": 2816 + }, + { + "epoch": 0.39881078785304735, + "grad_norm": 7.835878594463289, + "learning_rate": 4.63305164823822e-06, + "loss": 1.1786, + "step": 2817 + }, + { + "epoch": 0.3989523607276846, + "grad_norm": 8.248203428762295, + "learning_rate": 
4.632752639838607e-06, + "loss": 1.1452, + "step": 2818 + }, + { + "epoch": 0.3990939336023218, + "grad_norm": 8.088448338278244, + "learning_rate": 4.632453519321778e-06, + "loss": 1.2562, + "step": 2819 + }, + { + "epoch": 0.399235506476959, + "grad_norm": 11.195044674701155, + "learning_rate": 4.632154286703457e-06, + "loss": 1.297, + "step": 2820 + }, + { + "epoch": 0.3993770793515962, + "grad_norm": 9.451737533767353, + "learning_rate": 4.6318549419993765e-06, + "loss": 1.2949, + "step": 2821 + }, + { + "epoch": 0.39951865222623345, + "grad_norm": 10.780707569574624, + "learning_rate": 4.63155548522527e-06, + "loss": 1.4362, + "step": 2822 + }, + { + "epoch": 0.3996602251008707, + "grad_norm": 9.296989478680677, + "learning_rate": 4.6312559163968805e-06, + "loss": 1.3343, + "step": 2823 + }, + { + "epoch": 0.3998017979755079, + "grad_norm": 10.3425318066885, + "learning_rate": 4.630956235529957e-06, + "loss": 1.5448, + "step": 2824 + }, + { + "epoch": 0.3999433708501451, + "grad_norm": 10.618772948331774, + "learning_rate": 4.630656442640254e-06, + "loss": 1.3558, + "step": 2825 + }, + { + "epoch": 0.4000849437247823, + "grad_norm": 10.152813796712843, + "learning_rate": 4.63035653774353e-06, + "loss": 1.3271, + "step": 2826 + }, + { + "epoch": 0.40022651659941955, + "grad_norm": 8.183332241495943, + "learning_rate": 4.6300565208555505e-06, + "loss": 1.2925, + "step": 2827 + }, + { + "epoch": 0.4003680894740568, + "grad_norm": 10.801148911576888, + "learning_rate": 4.629756391992088e-06, + "loss": 1.3348, + "step": 2828 + }, + { + "epoch": 0.400509662348694, + "grad_norm": 9.96218397531914, + "learning_rate": 4.629456151168921e-06, + "loss": 1.4639, + "step": 2829 + }, + { + "epoch": 0.40065123522333124, + "grad_norm": 9.211466352634778, + "learning_rate": 4.629155798401832e-06, + "loss": 1.5077, + "step": 2830 + }, + { + "epoch": 0.4007928080979684, + "grad_norm": 9.341980833634594, + "learning_rate": 4.628855333706609e-06, + "loss": 1.2824, + "step": 2831 + }, + { + "epoch": 0.40093438097260564, + "grad_norm": 7.8075101252235815, + "learning_rate": 4.62855475709905e-06, + "loss": 1.3058, + "step": 2832 + }, + { + "epoch": 0.40107595384724287, + "grad_norm": 8.69829475966837, + "learning_rate": 4.628254068594953e-06, + "loss": 1.3313, + "step": 2833 + }, + { + "epoch": 0.4012175267218801, + "grad_norm": 8.456397475530592, + "learning_rate": 4.627953268210127e-06, + "loss": 1.4015, + "step": 2834 + }, + { + "epoch": 0.40135909959651733, + "grad_norm": 8.863178385393617, + "learning_rate": 4.627652355960384e-06, + "loss": 1.3053, + "step": 2835 + }, + { + "epoch": 0.4015006724711545, + "grad_norm": 10.561840871977786, + "learning_rate": 4.627351331861544e-06, + "loss": 1.2979, + "step": 2836 + }, + { + "epoch": 0.40164224534579174, + "grad_norm": 9.036847448994934, + "learning_rate": 4.6270501959294315e-06, + "loss": 1.4141, + "step": 2837 + }, + { + "epoch": 0.40178381822042897, + "grad_norm": 9.120070706812838, + "learning_rate": 4.6267489481798744e-06, + "loss": 1.3535, + "step": 2838 + }, + { + "epoch": 0.4019253910950662, + "grad_norm": 8.640507122980987, + "learning_rate": 4.626447588628712e-06, + "loss": 1.2851, + "step": 2839 + }, + { + "epoch": 0.40206696396970343, + "grad_norm": 9.134253839295447, + "learning_rate": 4.626146117291784e-06, + "loss": 1.1421, + "step": 2840 + }, + { + "epoch": 0.4022085368443406, + "grad_norm": 10.464587850314656, + "learning_rate": 4.625844534184941e-06, + "loss": 1.1815, + "step": 2841 + }, + { + "epoch": 0.40235010971897783, + "grad_norm": 
9.60055370323459, + "learning_rate": 4.625542839324036e-06, + "loss": 1.3255, + "step": 2842 + }, + { + "epoch": 0.40249168259361506, + "grad_norm": 10.245405702689148, + "learning_rate": 4.625241032724929e-06, + "loss": 1.3262, + "step": 2843 + }, + { + "epoch": 0.4026332554682523, + "grad_norm": 7.434780617448631, + "learning_rate": 4.624939114403485e-06, + "loss": 1.32, + "step": 2844 + }, + { + "epoch": 0.4027748283428895, + "grad_norm": 9.711470941452674, + "learning_rate": 4.624637084375576e-06, + "loss": 1.4756, + "step": 2845 + }, + { + "epoch": 0.4029164012175267, + "grad_norm": 7.57320233155615, + "learning_rate": 4.62433494265708e-06, + "loss": 1.216, + "step": 2846 + }, + { + "epoch": 0.40305797409216393, + "grad_norm": 8.387464215675575, + "learning_rate": 4.62403268926388e-06, + "loss": 1.2944, + "step": 2847 + }, + { + "epoch": 0.40319954696680116, + "grad_norm": 7.673864662120323, + "learning_rate": 4.623730324211865e-06, + "loss": 1.1791, + "step": 2848 + }, + { + "epoch": 0.4033411198414384, + "grad_norm": 10.604403649900334, + "learning_rate": 4.623427847516931e-06, + "loss": 1.3722, + "step": 2849 + }, + { + "epoch": 0.4034826927160756, + "grad_norm": 12.270283624655633, + "learning_rate": 4.623125259194978e-06, + "loss": 1.3518, + "step": 2850 + }, + { + "epoch": 0.4036242655907128, + "grad_norm": 9.992890310139918, + "learning_rate": 4.622822559261913e-06, + "loss": 1.3393, + "step": 2851 + }, + { + "epoch": 0.40376583846535, + "grad_norm": 11.817354042567347, + "learning_rate": 4.622519747733649e-06, + "loss": 1.4199, + "step": 2852 + }, + { + "epoch": 0.40390741133998725, + "grad_norm": 7.2337277083698615, + "learning_rate": 4.622216824626104e-06, + "loss": 1.2217, + "step": 2853 + }, + { + "epoch": 0.4040489842146245, + "grad_norm": 9.182868880234652, + "learning_rate": 4.621913789955204e-06, + "loss": 1.3101, + "step": 2854 + }, + { + "epoch": 0.4041905570892617, + "grad_norm": 10.463219300477402, + "learning_rate": 4.621610643736878e-06, + "loss": 1.3445, + "step": 2855 + }, + { + "epoch": 0.4043321299638989, + "grad_norm": 9.077553079418923, + "learning_rate": 4.621307385987062e-06, + "loss": 1.2472, + "step": 2856 + }, + { + "epoch": 0.4044737028385361, + "grad_norm": 9.411493911774432, + "learning_rate": 4.621004016721699e-06, + "loss": 1.2347, + "step": 2857 + }, + { + "epoch": 0.40461527571317335, + "grad_norm": 7.8653359834043926, + "learning_rate": 4.620700535956735e-06, + "loss": 1.3161, + "step": 2858 + }, + { + "epoch": 0.4047568485878106, + "grad_norm": 11.654349082598415, + "learning_rate": 4.620396943708127e-06, + "loss": 1.5757, + "step": 2859 + }, + { + "epoch": 0.4048984214624478, + "grad_norm": 7.7945744057797235, + "learning_rate": 4.6200932399918304e-06, + "loss": 1.2532, + "step": 2860 + }, + { + "epoch": 0.40503999433708504, + "grad_norm": 9.359071755312797, + "learning_rate": 4.619789424823815e-06, + "loss": 1.3075, + "step": 2861 + }, + { + "epoch": 0.4051815672117222, + "grad_norm": 8.619348858449364, + "learning_rate": 4.619485498220049e-06, + "loss": 1.4258, + "step": 2862 + }, + { + "epoch": 0.40532314008635945, + "grad_norm": 10.107078609811163, + "learning_rate": 4.6191814601965115e-06, + "loss": 1.3049, + "step": 2863 + }, + { + "epoch": 0.4054647129609967, + "grad_norm": 11.694191715251074, + "learning_rate": 4.618877310769184e-06, + "loss": 1.3057, + "step": 2864 + }, + { + "epoch": 0.4056062858356339, + "grad_norm": 8.537532295847361, + "learning_rate": 4.6185730499540565e-06, + "loss": 1.395, + "step": 2865 + }, + { + "epoch": 
0.40574785871027114, + "grad_norm": 10.253081067987258, + "learning_rate": 4.618268677767124e-06, + "loss": 1.3485, + "step": 2866 + }, + { + "epoch": 0.4058894315849083, + "grad_norm": 8.835785531407565, + "learning_rate": 4.617964194224386e-06, + "loss": 1.3219, + "step": 2867 + }, + { + "epoch": 0.40603100445954554, + "grad_norm": 10.33394026255705, + "learning_rate": 4.617659599341849e-06, + "loss": 1.3645, + "step": 2868 + }, + { + "epoch": 0.40617257733418277, + "grad_norm": 8.01096927107509, + "learning_rate": 4.617354893135527e-06, + "loss": 1.3142, + "step": 2869 + }, + { + "epoch": 0.40631415020882, + "grad_norm": 11.839712939519876, + "learning_rate": 4.617050075621436e-06, + "loss": 1.4168, + "step": 2870 + }, + { + "epoch": 0.40645572308345723, + "grad_norm": 9.343058378719682, + "learning_rate": 4.6167451468156015e-06, + "loss": 1.4057, + "step": 2871 + }, + { + "epoch": 0.4065972959580944, + "grad_norm": 9.023607868386256, + "learning_rate": 4.616440106734053e-06, + "loss": 1.3064, + "step": 2872 + }, + { + "epoch": 0.40673886883273164, + "grad_norm": 11.444121231732979, + "learning_rate": 4.6161349553928255e-06, + "loss": 1.465, + "step": 2873 + }, + { + "epoch": 0.40688044170736887, + "grad_norm": 8.79233662973868, + "learning_rate": 4.615829692807962e-06, + "loss": 1.358, + "step": 2874 + }, + { + "epoch": 0.4070220145820061, + "grad_norm": 9.538569807096017, + "learning_rate": 4.61552431899551e-06, + "loss": 1.2733, + "step": 2875 + }, + { + "epoch": 0.4071635874566433, + "grad_norm": 10.202008547024192, + "learning_rate": 4.615218833971521e-06, + "loss": 1.3722, + "step": 2876 + }, + { + "epoch": 0.4073051603312805, + "grad_norm": 9.294024477652883, + "learning_rate": 4.614913237752054e-06, + "loss": 1.4096, + "step": 2877 + }, + { + "epoch": 0.40744673320591773, + "grad_norm": 7.522779330959049, + "learning_rate": 4.614607530353177e-06, + "loss": 1.2713, + "step": 2878 + }, + { + "epoch": 0.40758830608055496, + "grad_norm": 11.881676654088205, + "learning_rate": 4.614301711790958e-06, + "loss": 1.3986, + "step": 2879 + }, + { + "epoch": 0.4077298789551922, + "grad_norm": 10.853539700744847, + "learning_rate": 4.613995782081474e-06, + "loss": 1.2894, + "step": 2880 + }, + { + "epoch": 0.4078714518298294, + "grad_norm": 8.327340781557309, + "learning_rate": 4.6136897412408084e-06, + "loss": 1.3703, + "step": 2881 + }, + { + "epoch": 0.4080130247044666, + "grad_norm": 8.498762601654276, + "learning_rate": 4.61338358928505e-06, + "loss": 1.3157, + "step": 2882 + }, + { + "epoch": 0.40815459757910383, + "grad_norm": 9.725242286154831, + "learning_rate": 4.6130773262302905e-06, + "loss": 1.519, + "step": 2883 + }, + { + "epoch": 0.40829617045374106, + "grad_norm": 9.786682375376444, + "learning_rate": 4.612770952092632e-06, + "loss": 1.314, + "step": 2884 + }, + { + "epoch": 0.4084377433283783, + "grad_norm": 10.386248637819218, + "learning_rate": 4.612464466888181e-06, + "loss": 1.462, + "step": 2885 + }, + { + "epoch": 0.4085793162030155, + "grad_norm": 13.302955608799403, + "learning_rate": 4.612157870633047e-06, + "loss": 1.3059, + "step": 2886 + }, + { + "epoch": 0.40872088907765275, + "grad_norm": 8.745500115935739, + "learning_rate": 4.61185116334335e-06, + "loss": 1.172, + "step": 2887 + }, + { + "epoch": 0.4088624619522899, + "grad_norm": 8.994894486932834, + "learning_rate": 4.61154434503521e-06, + "loss": 1.3584, + "step": 2888 + }, + { + "epoch": 0.40900403482692715, + "grad_norm": 10.070377654463393, + "learning_rate": 4.611237415724759e-06, + "loss": 1.3452, + 
"step": 2889 + }, + { + "epoch": 0.4091456077015644, + "grad_norm": 12.937222242829083, + "learning_rate": 4.610930375428132e-06, + "loss": 1.3379, + "step": 2890 + }, + { + "epoch": 0.4092871805762016, + "grad_norm": 12.90620448217609, + "learning_rate": 4.610623224161468e-06, + "loss": 1.4758, + "step": 2891 + }, + { + "epoch": 0.40942875345083884, + "grad_norm": 8.02744306888072, + "learning_rate": 4.610315961940916e-06, + "loss": 1.3856, + "step": 2892 + }, + { + "epoch": 0.409570326325476, + "grad_norm": 8.773157483844008, + "learning_rate": 4.610008588782626e-06, + "loss": 1.3475, + "step": 2893 + }, + { + "epoch": 0.40971189920011325, + "grad_norm": 11.1904950954351, + "learning_rate": 4.609701104702759e-06, + "loss": 1.3716, + "step": 2894 + }, + { + "epoch": 0.4098534720747505, + "grad_norm": 12.397059651897111, + "learning_rate": 4.609393509717478e-06, + "loss": 1.2934, + "step": 2895 + }, + { + "epoch": 0.4099950449493877, + "grad_norm": 9.448914351494302, + "learning_rate": 4.6090858038429535e-06, + "loss": 1.4021, + "step": 2896 + }, + { + "epoch": 0.41013661782402494, + "grad_norm": 8.241076180262304, + "learning_rate": 4.6087779870953595e-06, + "loss": 1.4614, + "step": 2897 + }, + { + "epoch": 0.4102781906986621, + "grad_norm": 9.309347758488592, + "learning_rate": 4.608470059490879e-06, + "loss": 1.2883, + "step": 2898 + }, + { + "epoch": 0.41041976357329935, + "grad_norm": 7.377042584511843, + "learning_rate": 4.6081620210457e-06, + "loss": 1.1782, + "step": 2899 + }, + { + "epoch": 0.4105613364479366, + "grad_norm": 9.38246732230916, + "learning_rate": 4.6078538717760165e-06, + "loss": 1.2599, + "step": 2900 + }, + { + "epoch": 0.4107029093225738, + "grad_norm": 9.005225889601766, + "learning_rate": 4.607545611698025e-06, + "loss": 1.3542, + "step": 2901 + }, + { + "epoch": 0.41084448219721104, + "grad_norm": 8.880245753831225, + "learning_rate": 4.607237240827933e-06, + "loss": 1.2004, + "step": 2902 + }, + { + "epoch": 0.4109860550718482, + "grad_norm": 7.717043297998502, + "learning_rate": 4.606928759181951e-06, + "loss": 1.3793, + "step": 2903 + }, + { + "epoch": 0.41112762794648544, + "grad_norm": 8.960946865459361, + "learning_rate": 4.6066201667762944e-06, + "loss": 1.3845, + "step": 2904 + }, + { + "epoch": 0.41126920082112267, + "grad_norm": 9.703055806919153, + "learning_rate": 4.606311463627186e-06, + "loss": 1.2743, + "step": 2905 + }, + { + "epoch": 0.4114107736957599, + "grad_norm": 9.742251716408244, + "learning_rate": 4.606002649750856e-06, + "loss": 1.4747, + "step": 2906 + }, + { + "epoch": 0.41155234657039713, + "grad_norm": 9.531209576630468, + "learning_rate": 4.605693725163536e-06, + "loss": 1.236, + "step": 2907 + }, + { + "epoch": 0.4116939194450343, + "grad_norm": 8.90294706435257, + "learning_rate": 4.605384689881467e-06, + "loss": 1.3823, + "step": 2908 + }, + { + "epoch": 0.41183549231967154, + "grad_norm": 9.436535514406144, + "learning_rate": 4.605075543920895e-06, + "loss": 1.2442, + "step": 2909 + }, + { + "epoch": 0.41197706519430877, + "grad_norm": 9.742146385613925, + "learning_rate": 4.604766287298071e-06, + "loss": 1.3926, + "step": 2910 + }, + { + "epoch": 0.412118638068946, + "grad_norm": 9.452176234025794, + "learning_rate": 4.604456920029252e-06, + "loss": 1.3479, + "step": 2911 + }, + { + "epoch": 0.4122602109435832, + "grad_norm": 10.318224618476947, + "learning_rate": 4.604147442130703e-06, + "loss": 1.3946, + "step": 2912 + }, + { + "epoch": 0.4124017838182204, + "grad_norm": 8.778446279358885, + "learning_rate": 
4.603837853618691e-06, + "loss": 1.4644, + "step": 2913 + }, + { + "epoch": 0.41254335669285763, + "grad_norm": 8.673991067133677, + "learning_rate": 4.603528154509492e-06, + "loss": 1.3483, + "step": 2914 + }, + { + "epoch": 0.41268492956749486, + "grad_norm": 11.179017671342658, + "learning_rate": 4.6032183448193865e-06, + "loss": 1.4868, + "step": 2915 + }, + { + "epoch": 0.4128265024421321, + "grad_norm": 10.5387982673974, + "learning_rate": 4.602908424564661e-06, + "loss": 1.3698, + "step": 2916 + }, + { + "epoch": 0.4129680753167693, + "grad_norm": 8.414427002898615, + "learning_rate": 4.602598393761607e-06, + "loss": 1.2531, + "step": 2917 + }, + { + "epoch": 0.41310964819140655, + "grad_norm": 8.128276927075175, + "learning_rate": 4.602288252426524e-06, + "loss": 1.261, + "step": 2918 + }, + { + "epoch": 0.41325122106604373, + "grad_norm": 11.478268199913288, + "learning_rate": 4.601978000575715e-06, + "loss": 1.4729, + "step": 2919 + }, + { + "epoch": 0.41339279394068096, + "grad_norm": 10.49689292349709, + "learning_rate": 4.6016676382254895e-06, + "loss": 1.4525, + "step": 2920 + }, + { + "epoch": 0.4135343668153182, + "grad_norm": 8.703406392724292, + "learning_rate": 4.601357165392163e-06, + "loss": 1.4825, + "step": 2921 + }, + { + "epoch": 0.4136759396899554, + "grad_norm": 7.874606894717849, + "learning_rate": 4.601046582092058e-06, + "loss": 1.1975, + "step": 2922 + }, + { + "epoch": 0.41381751256459265, + "grad_norm": 9.090142547194754, + "learning_rate": 4.6007358883414996e-06, + "loss": 1.3577, + "step": 2923 + }, + { + "epoch": 0.4139590854392298, + "grad_norm": 9.477396124067015, + "learning_rate": 4.600425084156823e-06, + "loss": 1.3033, + "step": 2924 + }, + { + "epoch": 0.41410065831386705, + "grad_norm": 9.51908242940734, + "learning_rate": 4.6001141695543655e-06, + "loss": 1.3843, + "step": 2925 + }, + { + "epoch": 0.4142422311885043, + "grad_norm": 10.102764061127266, + "learning_rate": 4.599803144550472e-06, + "loss": 1.4004, + "step": 2926 + }, + { + "epoch": 0.4143838040631415, + "grad_norm": 10.238350179208066, + "learning_rate": 4.5994920091614935e-06, + "loss": 1.3924, + "step": 2927 + }, + { + "epoch": 0.41452537693777874, + "grad_norm": 8.889701594181536, + "learning_rate": 4.5991807634037846e-06, + "loss": 1.2656, + "step": 2928 + }, + { + "epoch": 0.4146669498124159, + "grad_norm": 10.40860169560794, + "learning_rate": 4.598869407293708e-06, + "loss": 1.447, + "step": 2929 + }, + { + "epoch": 0.41480852268705315, + "grad_norm": 9.649687577166116, + "learning_rate": 4.5985579408476324e-06, + "loss": 1.333, + "step": 2930 + }, + { + "epoch": 0.4149500955616904, + "grad_norm": 9.680687656410393, + "learning_rate": 4.5982463640819304e-06, + "loss": 1.2791, + "step": 2931 + }, + { + "epoch": 0.4150916684363276, + "grad_norm": 9.052368815097559, + "learning_rate": 4.597934677012982e-06, + "loss": 1.3863, + "step": 2932 + }, + { + "epoch": 0.41523324131096484, + "grad_norm": 9.591604583123926, + "learning_rate": 4.597622879657171e-06, + "loss": 1.2454, + "step": 2933 + }, + { + "epoch": 0.415374814185602, + "grad_norm": 9.10072965945438, + "learning_rate": 4.597310972030889e-06, + "loss": 1.4127, + "step": 2934 + }, + { + "epoch": 0.41551638706023925, + "grad_norm": 13.008030098592887, + "learning_rate": 4.596998954150534e-06, + "loss": 1.421, + "step": 2935 + }, + { + "epoch": 0.4156579599348765, + "grad_norm": 9.193379914532505, + "learning_rate": 4.596686826032507e-06, + "loss": 1.2345, + "step": 2936 + }, + { + "epoch": 0.4157995328095137, + "grad_norm": 
10.191871875143248, + "learning_rate": 4.596374587693218e-06, + "loss": 1.3568, + "step": 2937 + }, + { + "epoch": 0.41594110568415094, + "grad_norm": 8.622760869053648, + "learning_rate": 4.596062239149079e-06, + "loss": 1.3892, + "step": 2938 + }, + { + "epoch": 0.4160826785587881, + "grad_norm": 9.51838230717833, + "learning_rate": 4.595749780416511e-06, + "loss": 1.1925, + "step": 2939 + }, + { + "epoch": 0.41622425143342534, + "grad_norm": 11.959013561790695, + "learning_rate": 4.59543721151194e-06, + "loss": 1.3084, + "step": 2940 + }, + { + "epoch": 0.41636582430806257, + "grad_norm": 10.736073544200616, + "learning_rate": 4.595124532451797e-06, + "loss": 1.4099, + "step": 2941 + }, + { + "epoch": 0.4165073971826998, + "grad_norm": 9.052495235177435, + "learning_rate": 4.5948117432525195e-06, + "loss": 1.3936, + "step": 2942 + }, + { + "epoch": 0.41664897005733703, + "grad_norm": 11.190913015418015, + "learning_rate": 4.594498843930551e-06, + "loss": 1.3138, + "step": 2943 + }, + { + "epoch": 0.4167905429319742, + "grad_norm": 10.946567112248879, + "learning_rate": 4.59418583450234e-06, + "loss": 1.4157, + "step": 2944 + }, + { + "epoch": 0.41693211580661144, + "grad_norm": 10.756453595182405, + "learning_rate": 4.593872714984341e-06, + "loss": 1.3117, + "step": 2945 + }, + { + "epoch": 0.41707368868124867, + "grad_norm": 8.822330267709267, + "learning_rate": 4.593559485393015e-06, + "loss": 1.2645, + "step": 2946 + }, + { + "epoch": 0.4172152615558859, + "grad_norm": 10.83254540341245, + "learning_rate": 4.593246145744827e-06, + "loss": 1.191, + "step": 2947 + }, + { + "epoch": 0.4173568344305231, + "grad_norm": 9.487513316752018, + "learning_rate": 4.59293269605625e-06, + "loss": 1.398, + "step": 2948 + }, + { + "epoch": 0.41749840730516036, + "grad_norm": 9.329909468658277, + "learning_rate": 4.592619136343762e-06, + "loss": 1.2172, + "step": 2949 + }, + { + "epoch": 0.41763998017979753, + "grad_norm": 11.37047243664999, + "learning_rate": 4.592305466623847e-06, + "loss": 1.4658, + "step": 2950 + }, + { + "epoch": 0.41778155305443476, + "grad_norm": 9.002633874984127, + "learning_rate": 4.591991686912993e-06, + "loss": 1.2678, + "step": 2951 + }, + { + "epoch": 0.417923125929072, + "grad_norm": 8.490146984899338, + "learning_rate": 4.591677797227696e-06, + "loss": 1.2745, + "step": 2952 + }, + { + "epoch": 0.4180646988037092, + "grad_norm": 8.768196150458209, + "learning_rate": 4.591363797584457e-06, + "loss": 1.2899, + "step": 2953 + }, + { + "epoch": 0.41820627167834645, + "grad_norm": 10.501775682258156, + "learning_rate": 4.591049687999782e-06, + "loss": 1.3561, + "step": 2954 + }, + { + "epoch": 0.4183478445529836, + "grad_norm": 7.304186407027378, + "learning_rate": 4.590735468490184e-06, + "loss": 1.386, + "step": 2955 + }, + { + "epoch": 0.41848941742762086, + "grad_norm": 8.552900070115108, + "learning_rate": 4.590421139072182e-06, + "loss": 1.2299, + "step": 2956 + }, + { + "epoch": 0.4186309903022581, + "grad_norm": 9.898295677752749, + "learning_rate": 4.590106699762299e-06, + "loss": 1.3088, + "step": 2957 + }, + { + "epoch": 0.4187725631768953, + "grad_norm": 9.343023673823636, + "learning_rate": 4.589792150577065e-06, + "loss": 1.3906, + "step": 2958 + }, + { + "epoch": 0.41891413605153255, + "grad_norm": 10.519419196743536, + "learning_rate": 4.589477491533016e-06, + "loss": 1.4471, + "step": 2959 + }, + { + "epoch": 0.4190557089261697, + "grad_norm": 9.798166484974889, + "learning_rate": 4.589162722646694e-06, + "loss": 1.3329, + "step": 2960 + }, + { + "epoch": 
0.41919728180080695, + "grad_norm": 11.082837736670301, + "learning_rate": 4.588847843934645e-06, + "loss": 1.3647, + "step": 2961 + }, + { + "epoch": 0.4193388546754442, + "grad_norm": 8.226680280199128, + "learning_rate": 4.588532855413422e-06, + "loss": 1.3566, + "step": 2962 + }, + { + "epoch": 0.4194804275500814, + "grad_norm": 9.032017869875911, + "learning_rate": 4.588217757099584e-06, + "loss": 1.186, + "step": 2963 + }, + { + "epoch": 0.41962200042471864, + "grad_norm": 7.809783707479816, + "learning_rate": 4.587902549009696e-06, + "loss": 1.4253, + "step": 2964 + }, + { + "epoch": 0.4197635732993558, + "grad_norm": 8.521746026165802, + "learning_rate": 4.587587231160329e-06, + "loss": 1.2788, + "step": 2965 + }, + { + "epoch": 0.41990514617399305, + "grad_norm": 10.238144880577883, + "learning_rate": 4.5872718035680554e-06, + "loss": 1.4273, + "step": 2966 + }, + { + "epoch": 0.4200467190486303, + "grad_norm": 8.24442634729054, + "learning_rate": 4.586956266249461e-06, + "loss": 1.236, + "step": 2967 + }, + { + "epoch": 0.4201882919232675, + "grad_norm": 8.178961341782408, + "learning_rate": 4.586640619221131e-06, + "loss": 1.229, + "step": 2968 + }, + { + "epoch": 0.42032986479790474, + "grad_norm": 9.866781170498507, + "learning_rate": 4.586324862499661e-06, + "loss": 1.3249, + "step": 2969 + }, + { + "epoch": 0.4204714376725419, + "grad_norm": 8.712818602944314, + "learning_rate": 4.586008996101646e-06, + "loss": 1.269, + "step": 2970 + }, + { + "epoch": 0.42061301054717914, + "grad_norm": 8.419087077687205, + "learning_rate": 4.5856930200436955e-06, + "loss": 1.2025, + "step": 2971 + }, + { + "epoch": 0.4207545834218164, + "grad_norm": 8.613630679561709, + "learning_rate": 4.585376934342418e-06, + "loss": 1.2191, + "step": 2972 + }, + { + "epoch": 0.4208961562964536, + "grad_norm": 9.209983252096947, + "learning_rate": 4.585060739014429e-06, + "loss": 1.3578, + "step": 2973 + }, + { + "epoch": 0.42103772917109084, + "grad_norm": 9.854652011094336, + "learning_rate": 4.584744434076352e-06, + "loss": 1.2992, + "step": 2974 + }, + { + "epoch": 0.42117930204572807, + "grad_norm": 8.504343605973187, + "learning_rate": 4.584428019544815e-06, + "loss": 1.2136, + "step": 2975 + }, + { + "epoch": 0.42132087492036524, + "grad_norm": 8.476324881462116, + "learning_rate": 4.58411149543645e-06, + "loss": 1.3656, + "step": 2976 + }, + { + "epoch": 0.42146244779500247, + "grad_norm": 8.168322752072866, + "learning_rate": 4.583794861767899e-06, + "loss": 1.3296, + "step": 2977 + }, + { + "epoch": 0.4216040206696397, + "grad_norm": 9.02228626652345, + "learning_rate": 4.583478118555806e-06, + "loss": 1.2181, + "step": 2978 + }, + { + "epoch": 0.42174559354427693, + "grad_norm": 9.331980153036126, + "learning_rate": 4.583161265816821e-06, + "loss": 1.0937, + "step": 2979 + }, + { + "epoch": 0.42188716641891416, + "grad_norm": 11.656178655738431, + "learning_rate": 4.582844303567602e-06, + "loss": 1.3695, + "step": 2980 + }, + { + "epoch": 0.42202873929355134, + "grad_norm": 9.286296666924486, + "learning_rate": 4.58252723182481e-06, + "loss": 1.2848, + "step": 2981 + }, + { + "epoch": 0.42217031216818857, + "grad_norm": 9.805842187982886, + "learning_rate": 4.582210050605115e-06, + "loss": 1.1828, + "step": 2982 + }, + { + "epoch": 0.4223118850428258, + "grad_norm": 7.966396088840117, + "learning_rate": 4.58189275992519e-06, + "loss": 1.265, + "step": 2983 + }, + { + "epoch": 0.422453457917463, + "grad_norm": 10.98751990063702, + "learning_rate": 4.581575359801715e-06, + "loss": 1.3565, + 
"step": 2984 + }, + { + "epoch": 0.42259503079210026, + "grad_norm": 12.293565782292568, + "learning_rate": 4.581257850251376e-06, + "loss": 1.4169, + "step": 2985 + }, + { + "epoch": 0.42273660366673743, + "grad_norm": 8.987911583146824, + "learning_rate": 4.580940231290864e-06, + "loss": 1.4284, + "step": 2986 + }, + { + "epoch": 0.42287817654137466, + "grad_norm": 8.220514028399519, + "learning_rate": 4.580622502936875e-06, + "loss": 1.2508, + "step": 2987 + }, + { + "epoch": 0.4230197494160119, + "grad_norm": 11.443337873603712, + "learning_rate": 4.580304665206111e-06, + "loss": 1.2951, + "step": 2988 + }, + { + "epoch": 0.4231613222906491, + "grad_norm": 9.432100684122794, + "learning_rate": 4.579986718115283e-06, + "loss": 1.3227, + "step": 2989 + }, + { + "epoch": 0.42330289516528635, + "grad_norm": 9.840079068122092, + "learning_rate": 4.579668661681105e-06, + "loss": 1.349, + "step": 2990 + }, + { + "epoch": 0.4234444680399235, + "grad_norm": 9.27899162996418, + "learning_rate": 4.579350495920295e-06, + "loss": 1.2828, + "step": 2991 + }, + { + "epoch": 0.42358604091456076, + "grad_norm": 6.697593131391463, + "learning_rate": 4.579032220849581e-06, + "loss": 1.1961, + "step": 2992 + }, + { + "epoch": 0.423727613789198, + "grad_norm": 7.976592389834424, + "learning_rate": 4.578713836485692e-06, + "loss": 1.3202, + "step": 2993 + }, + { + "epoch": 0.4238691866638352, + "grad_norm": 7.958975987342206, + "learning_rate": 4.578395342845367e-06, + "loss": 1.2972, + "step": 2994 + }, + { + "epoch": 0.42401075953847245, + "grad_norm": 10.002984555231855, + "learning_rate": 4.578076739945349e-06, + "loss": 1.3501, + "step": 2995 + }, + { + "epoch": 0.4241523324131096, + "grad_norm": 9.943777728566578, + "learning_rate": 4.577758027802386e-06, + "loss": 1.3184, + "step": 2996 + }, + { + "epoch": 0.42429390528774685, + "grad_norm": 9.976442436386, + "learning_rate": 4.5774392064332325e-06, + "loss": 1.2964, + "step": 2997 + }, + { + "epoch": 0.4244354781623841, + "grad_norm": 8.876351696829097, + "learning_rate": 4.577120275854649e-06, + "loss": 1.2717, + "step": 2998 + }, + { + "epoch": 0.4245770510370213, + "grad_norm": 9.088474277075257, + "learning_rate": 4.576801236083402e-06, + "loss": 1.3159, + "step": 2999 + }, + { + "epoch": 0.42471862391165854, + "grad_norm": 9.734267624558276, + "learning_rate": 4.576482087136262e-06, + "loss": 1.3154, + "step": 3000 + }, + { + "epoch": 0.4248601967862957, + "grad_norm": 7.1059749982862686, + "learning_rate": 4.576162829030007e-06, + "loss": 1.2792, + "step": 3001 + }, + { + "epoch": 0.42500176966093295, + "grad_norm": 9.015256454366025, + "learning_rate": 4.57584346178142e-06, + "loss": 1.3329, + "step": 3002 + }, + { + "epoch": 0.4251433425355702, + "grad_norm": 10.009769245426174, + "learning_rate": 4.5755239854072904e-06, + "loss": 1.4544, + "step": 3003 + }, + { + "epoch": 0.4252849154102074, + "grad_norm": 8.499095644244717, + "learning_rate": 4.575204399924412e-06, + "loss": 1.4657, + "step": 3004 + }, + { + "epoch": 0.42542648828484464, + "grad_norm": 9.011095201525563, + "learning_rate": 4.574884705349586e-06, + "loss": 1.2891, + "step": 3005 + }, + { + "epoch": 0.42556806115948187, + "grad_norm": 10.60061684019275, + "learning_rate": 4.574564901699618e-06, + "loss": 1.3723, + "step": 3006 + }, + { + "epoch": 0.42570963403411904, + "grad_norm": 9.796833725740408, + "learning_rate": 4.57424498899132e-06, + "loss": 1.2056, + "step": 3007 + }, + { + "epoch": 0.4258512069087563, + "grad_norm": 9.61568247641865, + "learning_rate": 
4.573924967241509e-06, + "loss": 1.2731, + "step": 3008 + }, + { + "epoch": 0.4259927797833935, + "grad_norm": 9.058445516790618, + "learning_rate": 4.57360483646701e-06, + "loss": 1.3853, + "step": 3009 + }, + { + "epoch": 0.42613435265803074, + "grad_norm": 8.174806029092142, + "learning_rate": 4.57328459668465e-06, + "loss": 1.2625, + "step": 3010 + }, + { + "epoch": 0.42627592553266797, + "grad_norm": 11.209794489314948, + "learning_rate": 4.572964247911265e-06, + "loss": 1.4644, + "step": 3011 + }, + { + "epoch": 0.42641749840730514, + "grad_norm": 10.459453588623003, + "learning_rate": 4.572643790163696e-06, + "loss": 1.4552, + "step": 3012 + }, + { + "epoch": 0.42655907128194237, + "grad_norm": 9.489729700449727, + "learning_rate": 4.572323223458786e-06, + "loss": 1.4446, + "step": 3013 + }, + { + "epoch": 0.4267006441565796, + "grad_norm": 9.840557052847748, + "learning_rate": 4.572002547813391e-06, + "loss": 1.2101, + "step": 3014 + }, + { + "epoch": 0.42684221703121683, + "grad_norm": 10.00866781327487, + "learning_rate": 4.571681763244367e-06, + "loss": 1.2079, + "step": 3015 + }, + { + "epoch": 0.42698378990585406, + "grad_norm": 10.449534895918365, + "learning_rate": 4.571360869768578e-06, + "loss": 1.2348, + "step": 3016 + }, + { + "epoch": 0.42712536278049124, + "grad_norm": 10.278261254655153, + "learning_rate": 4.571039867402891e-06, + "loss": 1.4295, + "step": 3017 + }, + { + "epoch": 0.42726693565512847, + "grad_norm": 10.991873079858179, + "learning_rate": 4.570718756164183e-06, + "loss": 1.2489, + "step": 3018 + }, + { + "epoch": 0.4274085085297657, + "grad_norm": 8.425358831987745, + "learning_rate": 4.570397536069335e-06, + "loss": 1.2335, + "step": 3019 + }, + { + "epoch": 0.4275500814044029, + "grad_norm": 9.089839973006306, + "learning_rate": 4.570076207135231e-06, + "loss": 1.3523, + "step": 3020 + }, + { + "epoch": 0.42769165427904016, + "grad_norm": 8.701501773698364, + "learning_rate": 4.569754769378765e-06, + "loss": 1.2193, + "step": 3021 + }, + { + "epoch": 0.42783322715367733, + "grad_norm": 9.068797481946252, + "learning_rate": 4.569433222816834e-06, + "loss": 1.4387, + "step": 3022 + }, + { + "epoch": 0.42797480002831456, + "grad_norm": 8.790540674308929, + "learning_rate": 4.569111567466341e-06, + "loss": 1.2454, + "step": 3023 + }, + { + "epoch": 0.4281163729029518, + "grad_norm": 9.05835034305364, + "learning_rate": 4.568789803344196e-06, + "loss": 1.4937, + "step": 3024 + }, + { + "epoch": 0.428257945777589, + "grad_norm": 9.197134351168167, + "learning_rate": 4.568467930467314e-06, + "loss": 1.4753, + "step": 3025 + }, + { + "epoch": 0.42839951865222625, + "grad_norm": 9.287261146564804, + "learning_rate": 4.568145948852614e-06, + "loss": 1.1303, + "step": 3026 + }, + { + "epoch": 0.4285410915268634, + "grad_norm": 9.020376662279382, + "learning_rate": 4.567823858517024e-06, + "loss": 1.4017, + "step": 3027 + }, + { + "epoch": 0.42868266440150066, + "grad_norm": 8.378096733913527, + "learning_rate": 4.567501659477477e-06, + "loss": 1.2225, + "step": 3028 + }, + { + "epoch": 0.4288242372761379, + "grad_norm": 10.151587346172649, + "learning_rate": 4.567179351750908e-06, + "loss": 1.4223, + "step": 3029 + }, + { + "epoch": 0.4289658101507751, + "grad_norm": 10.24201058497568, + "learning_rate": 4.566856935354262e-06, + "loss": 1.3197, + "step": 3030 + }, + { + "epoch": 0.42910738302541235, + "grad_norm": 8.686756129629982, + "learning_rate": 4.566534410304488e-06, + "loss": 1.3219, + "step": 3031 + }, + { + "epoch": 0.4292489559000496, + "grad_norm": 
11.93073756225058, + "learning_rate": 4.566211776618541e-06, + "loss": 1.3179, + "step": 3032 + }, + { + "epoch": 0.42939052877468675, + "grad_norm": 8.736220927340872, + "learning_rate": 4.565889034313382e-06, + "loss": 1.3479, + "step": 3033 + }, + { + "epoch": 0.429532101649324, + "grad_norm": 9.728271530327135, + "learning_rate": 4.565566183405976e-06, + "loss": 1.4753, + "step": 3034 + }, + { + "epoch": 0.4296736745239612, + "grad_norm": 10.451881232058778, + "learning_rate": 4.565243223913297e-06, + "loss": 1.295, + "step": 3035 + }, + { + "epoch": 0.42981524739859844, + "grad_norm": 9.851195798215826, + "learning_rate": 4.564920155852321e-06, + "loss": 1.4287, + "step": 3036 + }, + { + "epoch": 0.4299568202732357, + "grad_norm": 9.16591012318092, + "learning_rate": 4.564596979240031e-06, + "loss": 1.2543, + "step": 3037 + }, + { + "epoch": 0.43009839314787285, + "grad_norm": 10.521734358978012, + "learning_rate": 4.564273694093419e-06, + "loss": 1.3226, + "step": 3038 + }, + { + "epoch": 0.4302399660225101, + "grad_norm": 9.824007126821114, + "learning_rate": 4.5639503004294774e-06, + "loss": 1.3749, + "step": 3039 + }, + { + "epoch": 0.4303815388971473, + "grad_norm": 10.908357520449416, + "learning_rate": 4.5636267982652075e-06, + "loss": 1.2588, + "step": 3040 + }, + { + "epoch": 0.43052311177178454, + "grad_norm": 10.841541252222067, + "learning_rate": 4.5633031876176156e-06, + "loss": 1.3138, + "step": 3041 + }, + { + "epoch": 0.43066468464642177, + "grad_norm": 8.834673318053065, + "learning_rate": 4.562979468503713e-06, + "loss": 1.2716, + "step": 3042 + }, + { + "epoch": 0.43080625752105894, + "grad_norm": 10.236994983108065, + "learning_rate": 4.562655640940519e-06, + "loss": 1.2831, + "step": 3043 + }, + { + "epoch": 0.4309478303956962, + "grad_norm": 10.790343764435288, + "learning_rate": 4.562331704945055e-06, + "loss": 1.423, + "step": 3044 + }, + { + "epoch": 0.4310894032703334, + "grad_norm": 11.044993195779897, + "learning_rate": 4.562007660534351e-06, + "loss": 1.3862, + "step": 3045 + }, + { + "epoch": 0.43123097614497063, + "grad_norm": 12.332888689703527, + "learning_rate": 4.5616835077254425e-06, + "loss": 1.4661, + "step": 3046 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 10.849197118911611, + "learning_rate": 4.561359246535369e-06, + "loss": 1.4065, + "step": 3047 + }, + { + "epoch": 0.43151412189424504, + "grad_norm": 8.376650974892996, + "learning_rate": 4.561034876981177e-06, + "loss": 1.3778, + "step": 3048 + }, + { + "epoch": 0.43165569476888227, + "grad_norm": 8.628744003822115, + "learning_rate": 4.560710399079918e-06, + "loss": 1.083, + "step": 3049 + }, + { + "epoch": 0.4317972676435195, + "grad_norm": 11.673885473602233, + "learning_rate": 4.56038581284865e-06, + "loss": 1.3416, + "step": 3050 + }, + { + "epoch": 0.43193884051815673, + "grad_norm": 9.612728073529647, + "learning_rate": 4.560061118304436e-06, + "loss": 1.2255, + "step": 3051 + }, + { + "epoch": 0.43208041339279396, + "grad_norm": 8.484020696446608, + "learning_rate": 4.559736315464345e-06, + "loss": 1.3194, + "step": 3052 + }, + { + "epoch": 0.43222198626743114, + "grad_norm": 8.821593442471594, + "learning_rate": 4.559411404345452e-06, + "loss": 1.2654, + "step": 3053 + }, + { + "epoch": 0.43236355914206837, + "grad_norm": 9.367404556623004, + "learning_rate": 4.5590863849648364e-06, + "loss": 1.4809, + "step": 3054 + }, + { + "epoch": 0.4325051320167056, + "grad_norm": 10.422537123480637, + "learning_rate": 4.5587612573395855e-06, + "loss": 1.1988, + "step": 3055 + }, + { 
+ "epoch": 0.4326467048913428, + "grad_norm": 10.311931062667696, + "learning_rate": 4.55843602148679e-06, + "loss": 1.3014, + "step": 3056 + }, + { + "epoch": 0.43278827776598006, + "grad_norm": 9.492143293364139, + "learning_rate": 4.558110677423548e-06, + "loss": 1.2735, + "step": 3057 + }, + { + "epoch": 0.43292985064061723, + "grad_norm": 7.195667701262488, + "learning_rate": 4.557785225166962e-06, + "loss": 1.0809, + "step": 3058 + }, + { + "epoch": 0.43307142351525446, + "grad_norm": 9.930121603660076, + "learning_rate": 4.5574596647341414e-06, + "loss": 1.3027, + "step": 3059 + }, + { + "epoch": 0.4332129963898917, + "grad_norm": 11.033317393803296, + "learning_rate": 4.5571339961422e-06, + "loss": 1.4836, + "step": 3060 + }, + { + "epoch": 0.4333545692645289, + "grad_norm": 11.598193034479081, + "learning_rate": 4.5568082194082584e-06, + "loss": 1.3783, + "step": 3061 + }, + { + "epoch": 0.43349614213916615, + "grad_norm": 8.999477371300124, + "learning_rate": 4.556482334549442e-06, + "loss": 1.2521, + "step": 3062 + }, + { + "epoch": 0.4336377150138034, + "grad_norm": 10.854868177318998, + "learning_rate": 4.556156341582884e-06, + "loss": 1.3511, + "step": 3063 + }, + { + "epoch": 0.43377928788844056, + "grad_norm": 9.737455465388773, + "learning_rate": 4.555830240525719e-06, + "loss": 1.242, + "step": 3064 + }, + { + "epoch": 0.4339208607630778, + "grad_norm": 9.264873609262324, + "learning_rate": 4.5555040313950915e-06, + "loss": 1.3034, + "step": 3065 + }, + { + "epoch": 0.434062433637715, + "grad_norm": 11.170402888696575, + "learning_rate": 4.555177714208149e-06, + "loss": 1.2996, + "step": 3066 + }, + { + "epoch": 0.43420400651235225, + "grad_norm": 12.213147842575614, + "learning_rate": 4.554851288982047e-06, + "loss": 1.1687, + "step": 3067 + }, + { + "epoch": 0.4343455793869895, + "grad_norm": 10.859375, + "learning_rate": 4.554524755733946e-06, + "loss": 1.4498, + "step": 3068 + }, + { + "epoch": 0.43448715226162665, + "grad_norm": 7.512556881300493, + "learning_rate": 4.554198114481009e-06, + "loss": 1.1112, + "step": 3069 + }, + { + "epoch": 0.4346287251362639, + "grad_norm": 9.410281511853098, + "learning_rate": 4.553871365240409e-06, + "loss": 1.2818, + "step": 3070 + }, + { + "epoch": 0.4347702980109011, + "grad_norm": 10.569840809066328, + "learning_rate": 4.553544508029323e-06, + "loss": 1.3229, + "step": 3071 + }, + { + "epoch": 0.43491187088553834, + "grad_norm": 10.448983642214847, + "learning_rate": 4.5532175428649335e-06, + "loss": 1.3744, + "step": 3072 + }, + { + "epoch": 0.4350534437601756, + "grad_norm": 11.05334058987602, + "learning_rate": 4.5528904697644296e-06, + "loss": 1.3426, + "step": 3073 + }, + { + "epoch": 0.43519501663481275, + "grad_norm": 11.329418703067429, + "learning_rate": 4.552563288745004e-06, + "loss": 1.2047, + "step": 3074 + }, + { + "epoch": 0.43533658950945, + "grad_norm": 9.204631999387082, + "learning_rate": 4.552235999823856e-06, + "loss": 1.2107, + "step": 3075 + }, + { + "epoch": 0.4354781623840872, + "grad_norm": 11.009199630375718, + "learning_rate": 4.551908603018191e-06, + "loss": 1.3022, + "step": 3076 + }, + { + "epoch": 0.43561973525872444, + "grad_norm": 9.02502649361481, + "learning_rate": 4.551581098345222e-06, + "loss": 1.2204, + "step": 3077 + }, + { + "epoch": 0.43576130813336167, + "grad_norm": 10.874110042944858, + "learning_rate": 4.551253485822164e-06, + "loss": 1.3745, + "step": 3078 + }, + { + "epoch": 0.43590288100799884, + "grad_norm": 9.553690124507765, + "learning_rate": 4.55092576546624e-06, + "loss": 
1.2848, + "step": 3079 + }, + { + "epoch": 0.4360444538826361, + "grad_norm": 9.0072026571534, + "learning_rate": 4.550597937294677e-06, + "loss": 1.3833, + "step": 3080 + }, + { + "epoch": 0.4361860267572733, + "grad_norm": 8.863687962379633, + "learning_rate": 4.55027000132471e-06, + "loss": 1.2718, + "step": 3081 + }, + { + "epoch": 0.43632759963191053, + "grad_norm": 12.993981949138272, + "learning_rate": 4.549941957573578e-06, + "loss": 1.2825, + "step": 3082 + }, + { + "epoch": 0.43646917250654776, + "grad_norm": 10.601714344666076, + "learning_rate": 4.549613806058526e-06, + "loss": 1.2658, + "step": 3083 + }, + { + "epoch": 0.43661074538118494, + "grad_norm": 8.193230174413383, + "learning_rate": 4.5492855467968036e-06, + "loss": 1.3301, + "step": 3084 + }, + { + "epoch": 0.43675231825582217, + "grad_norm": 9.33320722040623, + "learning_rate": 4.548957179805668e-06, + "loss": 1.261, + "step": 3085 + }, + { + "epoch": 0.4368938911304594, + "grad_norm": 10.67340014438182, + "learning_rate": 4.548628705102382e-06, + "loss": 1.4827, + "step": 3086 + }, + { + "epoch": 0.43703546400509663, + "grad_norm": 8.798425134708724, + "learning_rate": 4.5483001227042126e-06, + "loss": 1.2903, + "step": 3087 + }, + { + "epoch": 0.43717703687973386, + "grad_norm": 9.737127952666615, + "learning_rate": 4.5479714326284316e-06, + "loss": 1.2286, + "step": 3088 + }, + { + "epoch": 0.43731860975437103, + "grad_norm": 7.151878654321032, + "learning_rate": 4.547642634892321e-06, + "loss": 1.1878, + "step": 3089 + }, + { + "epoch": 0.43746018262900827, + "grad_norm": 6.8817668557844245, + "learning_rate": 4.547313729513163e-06, + "loss": 1.2536, + "step": 3090 + }, + { + "epoch": 0.4376017555036455, + "grad_norm": 10.294464431178763, + "learning_rate": 4.546984716508249e-06, + "loss": 1.513, + "step": 3091 + }, + { + "epoch": 0.4377433283782827, + "grad_norm": 9.439208425534611, + "learning_rate": 4.546655595894875e-06, + "loss": 1.3117, + "step": 3092 + }, + { + "epoch": 0.43788490125291996, + "grad_norm": 9.89486049220282, + "learning_rate": 4.546326367690342e-06, + "loss": 1.4076, + "step": 3093 + }, + { + "epoch": 0.4380264741275572, + "grad_norm": 7.474559360134471, + "learning_rate": 4.545997031911958e-06, + "loss": 1.2868, + "step": 3094 + }, + { + "epoch": 0.43816804700219436, + "grad_norm": 7.963975381028554, + "learning_rate": 4.545667588577035e-06, + "loss": 1.2087, + "step": 3095 + }, + { + "epoch": 0.4383096198768316, + "grad_norm": 11.794429459884004, + "learning_rate": 4.545338037702893e-06, + "loss": 1.4677, + "step": 3096 + }, + { + "epoch": 0.4384511927514688, + "grad_norm": 10.917537237112153, + "learning_rate": 4.545008379306854e-06, + "loss": 1.3663, + "step": 3097 + }, + { + "epoch": 0.43859276562610605, + "grad_norm": 9.276785345036858, + "learning_rate": 4.5446786134062515e-06, + "loss": 1.3235, + "step": 3098 + }, + { + "epoch": 0.4387343385007433, + "grad_norm": 8.705132682617345, + "learning_rate": 4.544348740018417e-06, + "loss": 1.3545, + "step": 3099 + }, + { + "epoch": 0.43887591137538046, + "grad_norm": 11.00221923636318, + "learning_rate": 4.544018759160694e-06, + "loss": 1.4109, + "step": 3100 + }, + { + "epoch": 0.4390174842500177, + "grad_norm": 10.718087668016032, + "learning_rate": 4.5436886708504295e-06, + "loss": 1.332, + "step": 3101 + }, + { + "epoch": 0.4391590571246549, + "grad_norm": 12.594016552700234, + "learning_rate": 4.543358475104975e-06, + "loss": 1.3964, + "step": 3102 + }, + { + "epoch": 0.43930062999929215, + "grad_norm": 8.729894591019557, + 
"learning_rate": 4.543028171941689e-06, + "loss": 1.265, + "step": 3103 + }, + { + "epoch": 0.4394422028739294, + "grad_norm": 9.425929372282077, + "learning_rate": 4.5426977613779355e-06, + "loss": 1.3441, + "step": 3104 + }, + { + "epoch": 0.43958377574856655, + "grad_norm": 11.25893200106864, + "learning_rate": 4.542367243431084e-06, + "loss": 1.4898, + "step": 3105 + }, + { + "epoch": 0.4397253486232038, + "grad_norm": 9.60517168613436, + "learning_rate": 4.54203661811851e-06, + "loss": 1.4478, + "step": 3106 + }, + { + "epoch": 0.439866921497841, + "grad_norm": 9.132640409520643, + "learning_rate": 4.541705885457593e-06, + "loss": 1.3283, + "step": 3107 + }, + { + "epoch": 0.44000849437247824, + "grad_norm": 9.664628526522065, + "learning_rate": 4.541375045465719e-06, + "loss": 1.2846, + "step": 3108 + }, + { + "epoch": 0.4401500672471155, + "grad_norm": 10.291805703978165, + "learning_rate": 4.541044098160281e-06, + "loss": 1.3529, + "step": 3109 + }, + { + "epoch": 0.44029164012175265, + "grad_norm": 9.426709606427169, + "learning_rate": 4.540713043558678e-06, + "loss": 1.3674, + "step": 3110 + }, + { + "epoch": 0.4404332129963899, + "grad_norm": 8.417076428199078, + "learning_rate": 4.54038188167831e-06, + "loss": 1.237, + "step": 3111 + }, + { + "epoch": 0.4405747858710271, + "grad_norm": 11.901394392110635, + "learning_rate": 4.54005061253659e-06, + "loss": 1.3909, + "step": 3112 + }, + { + "epoch": 0.44071635874566434, + "grad_norm": 9.76257804028607, + "learning_rate": 4.539719236150929e-06, + "loss": 1.2295, + "step": 3113 + }, + { + "epoch": 0.44085793162030157, + "grad_norm": 11.185736661073284, + "learning_rate": 4.53938775253875e-06, + "loss": 1.3937, + "step": 3114 + }, + { + "epoch": 0.44099950449493874, + "grad_norm": 10.165759520369177, + "learning_rate": 4.539056161717477e-06, + "loss": 1.2309, + "step": 3115 + }, + { + "epoch": 0.441141077369576, + "grad_norm": 9.44335715324431, + "learning_rate": 4.5387244637045414e-06, + "loss": 1.3241, + "step": 3116 + }, + { + "epoch": 0.4412826502442132, + "grad_norm": 8.8982047166929, + "learning_rate": 4.53839265851738e-06, + "loss": 1.2732, + "step": 3117 + }, + { + "epoch": 0.44142422311885043, + "grad_norm": 11.560728195116742, + "learning_rate": 4.538060746173438e-06, + "loss": 1.2203, + "step": 3118 + }, + { + "epoch": 0.44156579599348766, + "grad_norm": 10.134885883187554, + "learning_rate": 4.537728726690162e-06, + "loss": 1.4116, + "step": 3119 + }, + { + "epoch": 0.4417073688681249, + "grad_norm": 10.522975670777418, + "learning_rate": 4.537396600085006e-06, + "loss": 1.3016, + "step": 3120 + }, + { + "epoch": 0.44184894174276207, + "grad_norm": 9.166501500346758, + "learning_rate": 4.537064366375429e-06, + "loss": 1.3841, + "step": 3121 + }, + { + "epoch": 0.4419905146173993, + "grad_norm": 8.760118138461644, + "learning_rate": 4.5367320255788985e-06, + "loss": 1.2636, + "step": 3122 + }, + { + "epoch": 0.44213208749203653, + "grad_norm": 10.317603495702443, + "learning_rate": 4.536399577712883e-06, + "loss": 1.4575, + "step": 3123 + }, + { + "epoch": 0.44227366036667376, + "grad_norm": 11.414296444813326, + "learning_rate": 4.536067022794861e-06, + "loss": 1.24, + "step": 3124 + }, + { + "epoch": 0.442415233241311, + "grad_norm": 11.971259349585534, + "learning_rate": 4.535734360842313e-06, + "loss": 1.4596, + "step": 3125 + }, + { + "epoch": 0.44255680611594816, + "grad_norm": 10.195941446716104, + "learning_rate": 4.535401591872729e-06, + "loss": 1.3361, + "step": 3126 + }, + { + "epoch": 0.4426983789905854, + 
"grad_norm": 10.43556899065362, + "learning_rate": 4.5350687159036e-06, + "loss": 1.2408, + "step": 3127 + }, + { + "epoch": 0.4428399518652226, + "grad_norm": 11.75071746583646, + "learning_rate": 4.5347357329524254e-06, + "loss": 1.4091, + "step": 3128 + }, + { + "epoch": 0.44298152473985986, + "grad_norm": 12.364239570875988, + "learning_rate": 4.534402643036711e-06, + "loss": 1.2198, + "step": 3129 + }, + { + "epoch": 0.4431230976144971, + "grad_norm": 12.726608660329116, + "learning_rate": 4.534069446173967e-06, + "loss": 1.4787, + "step": 3130 + }, + { + "epoch": 0.44326467048913426, + "grad_norm": 7.861125773793248, + "learning_rate": 4.533736142381708e-06, + "loss": 1.1898, + "step": 3131 + }, + { + "epoch": 0.4434062433637715, + "grad_norm": 8.447383263270469, + "learning_rate": 4.533402731677457e-06, + "loss": 1.2915, + "step": 3132 + }, + { + "epoch": 0.4435478162384087, + "grad_norm": 10.905808229434765, + "learning_rate": 4.53306921407874e-06, + "loss": 1.3213, + "step": 3133 + }, + { + "epoch": 0.44368938911304595, + "grad_norm": 11.396845705765067, + "learning_rate": 4.532735589603091e-06, + "loss": 1.1893, + "step": 3134 + }, + { + "epoch": 0.4438309619876832, + "grad_norm": 8.867132863956126, + "learning_rate": 4.5324018582680476e-06, + "loss": 1.2807, + "step": 3135 + }, + { + "epoch": 0.44397253486232036, + "grad_norm": 11.019615024273902, + "learning_rate": 4.532068020091154e-06, + "loss": 1.3727, + "step": 3136 + }, + { + "epoch": 0.4441141077369576, + "grad_norm": 7.731520000606309, + "learning_rate": 4.531734075089959e-06, + "loss": 1.1939, + "step": 3137 + }, + { + "epoch": 0.4442556806115948, + "grad_norm": 9.294963941581106, + "learning_rate": 4.53140002328202e-06, + "loss": 1.3626, + "step": 3138 + }, + { + "epoch": 0.44439725348623205, + "grad_norm": 8.911547989044056, + "learning_rate": 4.531065864684896e-06, + "loss": 1.2683, + "step": 3139 + }, + { + "epoch": 0.4445388263608693, + "grad_norm": 11.140847908775385, + "learning_rate": 4.530731599316153e-06, + "loss": 1.2741, + "step": 3140 + }, + { + "epoch": 0.44468039923550645, + "grad_norm": 8.95785658403159, + "learning_rate": 4.530397227193365e-06, + "loss": 1.2367, + "step": 3141 + }, + { + "epoch": 0.4448219721101437, + "grad_norm": 8.660210870373534, + "learning_rate": 4.530062748334109e-06, + "loss": 1.4926, + "step": 3142 + }, + { + "epoch": 0.4449635449847809, + "grad_norm": 9.44908108557739, + "learning_rate": 4.529728162755966e-06, + "loss": 1.2297, + "step": 3143 + }, + { + "epoch": 0.44510511785941814, + "grad_norm": 9.266787991541394, + "learning_rate": 4.5293934704765285e-06, + "loss": 1.1719, + "step": 3144 + }, + { + "epoch": 0.4452466907340554, + "grad_norm": 8.746465786666304, + "learning_rate": 4.529058671513389e-06, + "loss": 1.2027, + "step": 3145 + }, + { + "epoch": 0.44538826360869255, + "grad_norm": 8.659352762117331, + "learning_rate": 4.528723765884149e-06, + "loss": 1.3485, + "step": 3146 + }, + { + "epoch": 0.4455298364833298, + "grad_norm": 9.683806472048373, + "learning_rate": 4.528388753606412e-06, + "loss": 1.4244, + "step": 3147 + }, + { + "epoch": 0.445671409357967, + "grad_norm": 9.861943752136924, + "learning_rate": 4.528053634697791e-06, + "loss": 1.3722, + "step": 3148 + }, + { + "epoch": 0.44581298223260424, + "grad_norm": 8.834934977112434, + "learning_rate": 4.527718409175903e-06, + "loss": 1.2167, + "step": 3149 + }, + { + "epoch": 0.44595455510724147, + "grad_norm": 9.459291474018004, + "learning_rate": 4.52738307705837e-06, + "loss": 1.348, + "step": 3150 + }, + { + 
"epoch": 0.4460961279818787, + "grad_norm": 8.471428816980751, + "learning_rate": 4.52704763836282e-06, + "loss": 1.3216, + "step": 3151 + }, + { + "epoch": 0.4462377008565159, + "grad_norm": 7.678654241273744, + "learning_rate": 4.526712093106888e-06, + "loss": 1.3076, + "step": 3152 + }, + { + "epoch": 0.4463792737311531, + "grad_norm": 10.061071448479344, + "learning_rate": 4.5263764413082115e-06, + "loss": 1.3076, + "step": 3153 + }, + { + "epoch": 0.44652084660579033, + "grad_norm": 9.874505525594687, + "learning_rate": 4.5260406829844364e-06, + "loss": 1.3502, + "step": 3154 + }, + { + "epoch": 0.44666241948042756, + "grad_norm": 10.6748528856776, + "learning_rate": 4.525704818153214e-06, + "loss": 1.4079, + "step": 3155 + }, + { + "epoch": 0.4468039923550648, + "grad_norm": 9.156270414466363, + "learning_rate": 4.525368846832199e-06, + "loss": 1.4252, + "step": 3156 + }, + { + "epoch": 0.44694556522970197, + "grad_norm": 9.830513328894392, + "learning_rate": 4.525032769039054e-06, + "loss": 1.4442, + "step": 3157 + }, + { + "epoch": 0.4470871381043392, + "grad_norm": 9.060527297416336, + "learning_rate": 4.524696584791447e-06, + "loss": 1.264, + "step": 3158 + }, + { + "epoch": 0.44722871097897643, + "grad_norm": 10.566238012754942, + "learning_rate": 4.524360294107049e-06, + "loss": 1.397, + "step": 3159 + }, + { + "epoch": 0.44737028385361366, + "grad_norm": 8.874897701049942, + "learning_rate": 4.5240238970035414e-06, + "loss": 1.4483, + "step": 3160 + }, + { + "epoch": 0.4475118567282509, + "grad_norm": 8.433540828197781, + "learning_rate": 4.523687393498605e-06, + "loss": 1.2369, + "step": 3161 + }, + { + "epoch": 0.44765342960288806, + "grad_norm": 8.078385192104404, + "learning_rate": 4.523350783609932e-06, + "loss": 1.3928, + "step": 3162 + }, + { + "epoch": 0.4477950024775253, + "grad_norm": 11.374170294044735, + "learning_rate": 4.523014067355217e-06, + "loss": 1.4235, + "step": 3163 + }, + { + "epoch": 0.4479365753521625, + "grad_norm": 11.331362010572622, + "learning_rate": 4.52267724475216e-06, + "loss": 1.4934, + "step": 3164 + }, + { + "epoch": 0.44807814822679976, + "grad_norm": 10.30197438695764, + "learning_rate": 4.52234031581847e-06, + "loss": 1.3151, + "step": 3165 + }, + { + "epoch": 0.448219721101437, + "grad_norm": 9.272745571016422, + "learning_rate": 4.5220032805718575e-06, + "loss": 1.3527, + "step": 3166 + }, + { + "epoch": 0.44836129397607416, + "grad_norm": 9.110467076476512, + "learning_rate": 4.521666139030039e-06, + "loss": 1.291, + "step": 3167 + }, + { + "epoch": 0.4485028668507114, + "grad_norm": 11.446045065079726, + "learning_rate": 4.52132889121074e-06, + "loss": 1.3233, + "step": 3168 + }, + { + "epoch": 0.4486444397253486, + "grad_norm": 9.804212435577556, + "learning_rate": 4.520991537131687e-06, + "loss": 1.4171, + "step": 3169 + }, + { + "epoch": 0.44878601259998585, + "grad_norm": 9.107125529638738, + "learning_rate": 4.520654076810617e-06, + "loss": 1.1498, + "step": 3170 + }, + { + "epoch": 0.4489275854746231, + "grad_norm": 10.668184728681512, + "learning_rate": 4.520316510265268e-06, + "loss": 1.3108, + "step": 3171 + }, + { + "epoch": 0.44906915834926026, + "grad_norm": 12.098556154978008, + "learning_rate": 4.519978837513388e-06, + "loss": 1.3501, + "step": 3172 + }, + { + "epoch": 0.4492107312238975, + "grad_norm": 8.42656400296088, + "learning_rate": 4.519641058572725e-06, + "loss": 1.3183, + "step": 3173 + }, + { + "epoch": 0.4493523040985347, + "grad_norm": 9.976170567257345, + "learning_rate": 4.519303173461038e-06, + "loss": 
1.3579, + "step": 3174 + }, + { + "epoch": 0.44949387697317195, + "grad_norm": 10.836201175707757, + "learning_rate": 4.5189651821960885e-06, + "loss": 1.2673, + "step": 3175 + }, + { + "epoch": 0.4496354498478092, + "grad_norm": 10.987552883120745, + "learning_rate": 4.518627084795646e-06, + "loss": 1.3422, + "step": 3176 + }, + { + "epoch": 0.4497770227224464, + "grad_norm": 9.642205651290443, + "learning_rate": 4.5182888812774814e-06, + "loss": 1.2973, + "step": 3177 + }, + { + "epoch": 0.4499185955970836, + "grad_norm": 9.726344045614074, + "learning_rate": 4.517950571659376e-06, + "loss": 1.3535, + "step": 3178 + }, + { + "epoch": 0.4500601684717208, + "grad_norm": 12.45520721985983, + "learning_rate": 4.517612155959114e-06, + "loss": 1.2379, + "step": 3179 + }, + { + "epoch": 0.45020174134635804, + "grad_norm": 9.546269224336575, + "learning_rate": 4.5172736341944845e-06, + "loss": 1.3366, + "step": 3180 + }, + { + "epoch": 0.45034331422099527, + "grad_norm": 8.812116898670821, + "learning_rate": 4.516935006383285e-06, + "loss": 1.3875, + "step": 3181 + }, + { + "epoch": 0.4504848870956325, + "grad_norm": 11.13176632968247, + "learning_rate": 4.516596272543316e-06, + "loss": 1.2997, + "step": 3182 + }, + { + "epoch": 0.4506264599702697, + "grad_norm": 10.404922538150025, + "learning_rate": 4.516257432692383e-06, + "loss": 1.1973, + "step": 3183 + }, + { + "epoch": 0.4507680328449069, + "grad_norm": 9.60335653753985, + "learning_rate": 4.515918486848302e-06, + "loss": 1.2088, + "step": 3184 + }, + { + "epoch": 0.45090960571954414, + "grad_norm": 9.756709602277994, + "learning_rate": 4.5155794350288885e-06, + "loss": 1.4947, + "step": 3185 + }, + { + "epoch": 0.45105117859418137, + "grad_norm": 10.096271967493822, + "learning_rate": 4.515240277251968e-06, + "loss": 1.2796, + "step": 3186 + }, + { + "epoch": 0.4511927514688186, + "grad_norm": 8.147424826964356, + "learning_rate": 4.514901013535368e-06, + "loss": 1.196, + "step": 3187 + }, + { + "epoch": 0.4513343243434558, + "grad_norm": 7.610143477782887, + "learning_rate": 4.514561643896924e-06, + "loss": 1.2552, + "step": 3188 + }, + { + "epoch": 0.451475897218093, + "grad_norm": 10.23272663848731, + "learning_rate": 4.514222168354476e-06, + "loss": 1.3236, + "step": 3189 + }, + { + "epoch": 0.45161747009273023, + "grad_norm": 9.633191474829236, + "learning_rate": 4.513882586925872e-06, + "loss": 1.4203, + "step": 3190 + }, + { + "epoch": 0.45175904296736746, + "grad_norm": 9.282904994358034, + "learning_rate": 4.51354289962896e-06, + "loss": 1.273, + "step": 3191 + }, + { + "epoch": 0.4519006158420047, + "grad_norm": 9.641142152736746, + "learning_rate": 4.5132031064816e-06, + "loss": 1.3095, + "step": 3192 + }, + { + "epoch": 0.45204218871664187, + "grad_norm": 9.781113498698094, + "learning_rate": 4.512863207501654e-06, + "loss": 1.2868, + "step": 3193 + }, + { + "epoch": 0.4521837615912791, + "grad_norm": 8.88302421485274, + "learning_rate": 4.51252320270699e-06, + "loss": 1.2813, + "step": 3194 + }, + { + "epoch": 0.45232533446591633, + "grad_norm": 8.370907822830228, + "learning_rate": 4.512183092115482e-06, + "loss": 1.373, + "step": 3195 + }, + { + "epoch": 0.45246690734055356, + "grad_norm": 7.942942277975461, + "learning_rate": 4.511842875745009e-06, + "loss": 1.2062, + "step": 3196 + }, + { + "epoch": 0.4526084802151908, + "grad_norm": 12.005361630777399, + "learning_rate": 4.511502553613456e-06, + "loss": 1.3965, + "step": 3197 + }, + { + "epoch": 0.45275005308982796, + "grad_norm": 8.87677671888801, + "learning_rate": 
4.511162125738714e-06, + "loss": 1.3295, + "step": 3198 + }, + { + "epoch": 0.4528916259644652, + "grad_norm": 8.942758573740159, + "learning_rate": 4.510821592138678e-06, + "loss": 1.4553, + "step": 3199 + }, + { + "epoch": 0.4530331988391024, + "grad_norm": 8.010966413969177, + "learning_rate": 4.510480952831251e-06, + "loss": 1.3207, + "step": 3200 + }, + { + "epoch": 0.45317477171373965, + "grad_norm": 8.886272310214732, + "learning_rate": 4.510140207834339e-06, + "loss": 1.2813, + "step": 3201 + }, + { + "epoch": 0.4533163445883769, + "grad_norm": 11.775548552028134, + "learning_rate": 4.509799357165855e-06, + "loss": 1.4606, + "step": 3202 + }, + { + "epoch": 0.45345791746301406, + "grad_norm": 14.433285052517551, + "learning_rate": 4.509458400843717e-06, + "loss": 1.4704, + "step": 3203 + }, + { + "epoch": 0.4535994903376513, + "grad_norm": 8.541884951399734, + "learning_rate": 4.50911733888585e-06, + "loss": 1.2468, + "step": 3204 + }, + { + "epoch": 0.4537410632122885, + "grad_norm": 8.993390093210254, + "learning_rate": 4.508776171310183e-06, + "loss": 1.163, + "step": 3205 + }, + { + "epoch": 0.45388263608692575, + "grad_norm": 10.70473620685259, + "learning_rate": 4.5084348981346495e-06, + "loss": 1.6362, + "step": 3206 + }, + { + "epoch": 0.454024208961563, + "grad_norm": 11.7699046260888, + "learning_rate": 4.5080935193771905e-06, + "loss": 1.2889, + "step": 3207 + }, + { + "epoch": 0.4541657818362002, + "grad_norm": 9.366538743563066, + "learning_rate": 4.5077520350557534e-06, + "loss": 1.3717, + "step": 3208 + }, + { + "epoch": 0.4543073547108374, + "grad_norm": 9.072879293359689, + "learning_rate": 4.5074104451882886e-06, + "loss": 1.3097, + "step": 3209 + }, + { + "epoch": 0.4544489275854746, + "grad_norm": 12.280965476586497, + "learning_rate": 4.507068749792754e-06, + "loss": 1.4465, + "step": 3210 + }, + { + "epoch": 0.45459050046011185, + "grad_norm": 8.913846386923435, + "learning_rate": 4.50672694888711e-06, + "loss": 1.3334, + "step": 3211 + }, + { + "epoch": 0.4547320733347491, + "grad_norm": 7.284352963421194, + "learning_rate": 4.506385042489329e-06, + "loss": 1.3025, + "step": 3212 + }, + { + "epoch": 0.4548736462093863, + "grad_norm": 9.897400377913518, + "learning_rate": 4.5060430306173805e-06, + "loss": 1.278, + "step": 3213 + }, + { + "epoch": 0.4550152190840235, + "grad_norm": 10.443245225963498, + "learning_rate": 4.505700913289246e-06, + "loss": 1.4003, + "step": 3214 + }, + { + "epoch": 0.4551567919586607, + "grad_norm": 7.951699119708999, + "learning_rate": 4.505358690522911e-06, + "loss": 1.4502, + "step": 3215 + }, + { + "epoch": 0.45529836483329794, + "grad_norm": 8.612044624946524, + "learning_rate": 4.505016362336364e-06, + "loss": 1.2054, + "step": 3216 + }, + { + "epoch": 0.45543993770793517, + "grad_norm": 8.499526515563648, + "learning_rate": 4.504673928747601e-06, + "loss": 1.2961, + "step": 3217 + }, + { + "epoch": 0.4555815105825724, + "grad_norm": 9.39633639751752, + "learning_rate": 4.504331389774626e-06, + "loss": 1.4038, + "step": 3218 + }, + { + "epoch": 0.4557230834572096, + "grad_norm": 10.632916541785246, + "learning_rate": 4.503988745435443e-06, + "loss": 1.3619, + "step": 3219 + }, + { + "epoch": 0.4558646563318468, + "grad_norm": 10.004456480743851, + "learning_rate": 4.503645995748067e-06, + "loss": 1.335, + "step": 3220 + }, + { + "epoch": 0.45600622920648404, + "grad_norm": 10.117666720672581, + "learning_rate": 4.503303140730515e-06, + "loss": 1.3547, + "step": 3221 + }, + { + "epoch": 0.45614780208112127, + "grad_norm": 
9.632834676576257, + "learning_rate": 4.502960180400809e-06, + "loss": 1.4667, + "step": 3222 + }, + { + "epoch": 0.4562893749557585, + "grad_norm": 8.971782530517686, + "learning_rate": 4.5026171147769816e-06, + "loss": 1.3089, + "step": 3223 + }, + { + "epoch": 0.4564309478303957, + "grad_norm": 9.50421771735819, + "learning_rate": 4.5022739438770655e-06, + "loss": 1.2125, + "step": 3224 + }, + { + "epoch": 0.4565725207050329, + "grad_norm": 7.235031197038416, + "learning_rate": 4.5019306677191e-06, + "loss": 1.2378, + "step": 3225 + }, + { + "epoch": 0.45671409357967013, + "grad_norm": 8.898889330879568, + "learning_rate": 4.501587286321133e-06, + "loss": 1.1922, + "step": 3226 + }, + { + "epoch": 0.45685566645430736, + "grad_norm": 9.749798112392988, + "learning_rate": 4.501243799701215e-06, + "loss": 1.3738, + "step": 3227 + }, + { + "epoch": 0.4569972393289446, + "grad_norm": 11.875630894013984, + "learning_rate": 4.500900207877402e-06, + "loss": 1.5478, + "step": 3228 + }, + { + "epoch": 0.45713881220358177, + "grad_norm": 9.774898721701993, + "learning_rate": 4.500556510867756e-06, + "loss": 1.3492, + "step": 3229 + }, + { + "epoch": 0.457280385078219, + "grad_norm": 8.716197525538256, + "learning_rate": 4.500212708690348e-06, + "loss": 1.4596, + "step": 3230 + }, + { + "epoch": 0.45742195795285623, + "grad_norm": 8.814170733137102, + "learning_rate": 4.499868801363248e-06, + "loss": 1.3531, + "step": 3231 + }, + { + "epoch": 0.45756353082749346, + "grad_norm": 9.629444631880752, + "learning_rate": 4.499524788904537e-06, + "loss": 1.3064, + "step": 3232 + }, + { + "epoch": 0.4577051037021307, + "grad_norm": 9.418471396919452, + "learning_rate": 4.4991806713322986e-06, + "loss": 1.3221, + "step": 3233 + }, + { + "epoch": 0.45784667657676786, + "grad_norm": 7.7351576900374885, + "learning_rate": 4.498836448664622e-06, + "loss": 1.2575, + "step": 3234 + }, + { + "epoch": 0.4579882494514051, + "grad_norm": 8.952964794596326, + "learning_rate": 4.498492120919604e-06, + "loss": 1.2034, + "step": 3235 + }, + { + "epoch": 0.4581298223260423, + "grad_norm": 6.566863434969175, + "learning_rate": 4.498147688115346e-06, + "loss": 1.1197, + "step": 3236 + }, + { + "epoch": 0.45827139520067955, + "grad_norm": 10.026478616019684, + "learning_rate": 4.497803150269954e-06, + "loss": 1.2201, + "step": 3237 + }, + { + "epoch": 0.4584129680753168, + "grad_norm": 11.348740670000433, + "learning_rate": 4.4974585074015394e-06, + "loss": 1.2637, + "step": 3238 + }, + { + "epoch": 0.458554540949954, + "grad_norm": 12.90940452614785, + "learning_rate": 4.497113759528221e-06, + "loss": 1.2358, + "step": 3239 + }, + { + "epoch": 0.4586961138245912, + "grad_norm": 13.178044549715898, + "learning_rate": 4.4967689066681205e-06, + "loss": 1.4231, + "step": 3240 + }, + { + "epoch": 0.4588376866992284, + "grad_norm": 10.142630591827034, + "learning_rate": 4.496423948839369e-06, + "loss": 1.3004, + "step": 3241 + }, + { + "epoch": 0.45897925957386565, + "grad_norm": 10.967747308270118, + "learning_rate": 4.496078886060098e-06, + "loss": 1.3302, + "step": 3242 + }, + { + "epoch": 0.4591208324485029, + "grad_norm": 10.541721469500478, + "learning_rate": 4.495733718348449e-06, + "loss": 1.3297, + "step": 3243 + }, + { + "epoch": 0.4592624053231401, + "grad_norm": 9.200843307986775, + "learning_rate": 4.4953884457225645e-06, + "loss": 1.274, + "step": 3244 + }, + { + "epoch": 0.4594039781977773, + "grad_norm": 8.245540453962352, + "learning_rate": 4.4950430682005995e-06, + "loss": 1.2966, + "step": 3245 + }, + { + 
"epoch": 0.4595455510724145, + "grad_norm": 10.052984251982366, + "learning_rate": 4.4946975858007066e-06, + "loss": 1.494, + "step": 3246 + }, + { + "epoch": 0.45968712394705175, + "grad_norm": 8.351571178520983, + "learning_rate": 4.494351998541049e-06, + "loss": 1.1227, + "step": 3247 + }, + { + "epoch": 0.459828696821689, + "grad_norm": 10.239792369883302, + "learning_rate": 4.494006306439795e-06, + "loss": 1.3481, + "step": 3248 + }, + { + "epoch": 0.4599702696963262, + "grad_norm": 9.406344492491533, + "learning_rate": 4.493660509515115e-06, + "loss": 1.266, + "step": 3249 + }, + { + "epoch": 0.4601118425709634, + "grad_norm": 7.671390432682611, + "learning_rate": 4.493314607785189e-06, + "loss": 1.2963, + "step": 3250 + }, + { + "epoch": 0.4602534154456006, + "grad_norm": 9.019766400139526, + "learning_rate": 4.492968601268202e-06, + "loss": 1.3098, + "step": 3251 + }, + { + "epoch": 0.46039498832023784, + "grad_norm": 9.496522367306566, + "learning_rate": 4.492622489982339e-06, + "loss": 1.4174, + "step": 3252 + }, + { + "epoch": 0.46053656119487507, + "grad_norm": 9.37830182741074, + "learning_rate": 4.4922762739457995e-06, + "loss": 1.2159, + "step": 3253 + }, + { + "epoch": 0.4606781340695123, + "grad_norm": 8.859737112997529, + "learning_rate": 4.49192995317678e-06, + "loss": 1.3583, + "step": 3254 + }, + { + "epoch": 0.4608197069441495, + "grad_norm": 8.56592441217854, + "learning_rate": 4.491583527693489e-06, + "loss": 1.341, + "step": 3255 + }, + { + "epoch": 0.4609612798187867, + "grad_norm": 9.862574232085578, + "learning_rate": 4.491236997514138e-06, + "loss": 1.3556, + "step": 3256 + }, + { + "epoch": 0.46110285269342394, + "grad_norm": 7.131503968058599, + "learning_rate": 4.490890362656941e-06, + "loss": 1.1969, + "step": 3257 + }, + { + "epoch": 0.46124442556806117, + "grad_norm": 9.748361303096184, + "learning_rate": 4.490543623140123e-06, + "loss": 1.4553, + "step": 3258 + }, + { + "epoch": 0.4613859984426984, + "grad_norm": 8.828153951141688, + "learning_rate": 4.490196778981911e-06, + "loss": 1.4396, + "step": 3259 + }, + { + "epoch": 0.46152757131733557, + "grad_norm": 8.37164011501522, + "learning_rate": 4.489849830200538e-06, + "loss": 1.1906, + "step": 3260 + }, + { + "epoch": 0.4616691441919728, + "grad_norm": 9.207988703796193, + "learning_rate": 4.489502776814243e-06, + "loss": 1.3078, + "step": 3261 + }, + { + "epoch": 0.46181071706661003, + "grad_norm": 8.774819622739894, + "learning_rate": 4.4891556188412705e-06, + "loss": 1.2048, + "step": 3262 + }, + { + "epoch": 0.46195228994124726, + "grad_norm": 10.246473124290933, + "learning_rate": 4.48880835629987e-06, + "loss": 1.3487, + "step": 3263 + }, + { + "epoch": 0.4620938628158845, + "grad_norm": 8.833248737817822, + "learning_rate": 4.488460989208298e-06, + "loss": 1.1817, + "step": 3264 + }, + { + "epoch": 0.4622354356905217, + "grad_norm": 8.943249540368903, + "learning_rate": 4.4881135175848145e-06, + "loss": 1.2893, + "step": 3265 + }, + { + "epoch": 0.4623770085651589, + "grad_norm": 8.038399567995242, + "learning_rate": 4.4877659414476845e-06, + "loss": 1.2918, + "step": 3266 + }, + { + "epoch": 0.46251858143979613, + "grad_norm": 9.364879791579932, + "learning_rate": 4.487418260815182e-06, + "loss": 1.335, + "step": 3267 + }, + { + "epoch": 0.46266015431443336, + "grad_norm": 9.23252909585772, + "learning_rate": 4.487070475705584e-06, + "loss": 1.4068, + "step": 3268 + }, + { + "epoch": 0.4628017271890706, + "grad_norm": 9.358687386703906, + "learning_rate": 4.486722586137171e-06, + "loss": 
1.3595, + "step": 3269 + }, + { + "epoch": 0.4629433000637078, + "grad_norm": 11.284439608173185, + "learning_rate": 4.486374592128235e-06, + "loss": 1.2428, + "step": 3270 + }, + { + "epoch": 0.463084872938345, + "grad_norm": 12.492526452366869, + "learning_rate": 4.486026493697067e-06, + "loss": 1.4356, + "step": 3271 + }, + { + "epoch": 0.4632264458129822, + "grad_norm": 9.686459983866293, + "learning_rate": 4.485678290861967e-06, + "loss": 1.2912, + "step": 3272 + }, + { + "epoch": 0.46336801868761945, + "grad_norm": 9.577028494918476, + "learning_rate": 4.485329983641239e-06, + "loss": 1.2799, + "step": 3273 + }, + { + "epoch": 0.4635095915622567, + "grad_norm": 9.931194488197397, + "learning_rate": 4.484981572053195e-06, + "loss": 1.3687, + "step": 3274 + }, + { + "epoch": 0.4636511644368939, + "grad_norm": 9.679305222223388, + "learning_rate": 4.48463305611615e-06, + "loss": 1.5219, + "step": 3275 + }, + { + "epoch": 0.4637927373115311, + "grad_norm": 11.033254814060635, + "learning_rate": 4.484284435848424e-06, + "loss": 1.2891, + "step": 3276 + }, + { + "epoch": 0.4639343101861683, + "grad_norm": 9.583450007073964, + "learning_rate": 4.483935711268346e-06, + "loss": 1.3474, + "step": 3277 + }, + { + "epoch": 0.46407588306080555, + "grad_norm": 8.243442588089335, + "learning_rate": 4.483586882394247e-06, + "loss": 1.2774, + "step": 3278 + }, + { + "epoch": 0.4642174559354428, + "grad_norm": 10.087503584963931, + "learning_rate": 4.483237949244463e-06, + "loss": 1.3466, + "step": 3279 + }, + { + "epoch": 0.46435902881008, + "grad_norm": 8.750990893251434, + "learning_rate": 4.4828889118373395e-06, + "loss": 1.3056, + "step": 3280 + }, + { + "epoch": 0.4645006016847172, + "grad_norm": 8.361928731567641, + "learning_rate": 4.482539770191225e-06, + "loss": 1.2845, + "step": 3281 + }, + { + "epoch": 0.4646421745593544, + "grad_norm": 10.802102795506414, + "learning_rate": 4.482190524324473e-06, + "loss": 1.2964, + "step": 3282 + }, + { + "epoch": 0.46478374743399165, + "grad_norm": 9.15804546576013, + "learning_rate": 4.481841174255443e-06, + "loss": 1.3128, + "step": 3283 + }, + { + "epoch": 0.4649253203086289, + "grad_norm": 7.88685974327599, + "learning_rate": 4.481491720002499e-06, + "loss": 1.1702, + "step": 3284 + }, + { + "epoch": 0.4650668931832661, + "grad_norm": 7.434902987947156, + "learning_rate": 4.481142161584014e-06, + "loss": 1.2375, + "step": 3285 + }, + { + "epoch": 0.4652084660579033, + "grad_norm": 9.736891321787704, + "learning_rate": 4.480792499018362e-06, + "loss": 1.5175, + "step": 3286 + }, + { + "epoch": 0.4653500389325405, + "grad_norm": 10.790820310732434, + "learning_rate": 4.4804427323239265e-06, + "loss": 1.4002, + "step": 3287 + }, + { + "epoch": 0.46549161180717774, + "grad_norm": 10.921232671227274, + "learning_rate": 4.480092861519092e-06, + "loss": 1.2724, + "step": 3288 + }, + { + "epoch": 0.46563318468181497, + "grad_norm": 8.312275130593289, + "learning_rate": 4.479742886622254e-06, + "loss": 1.2592, + "step": 3289 + }, + { + "epoch": 0.4657747575564522, + "grad_norm": 9.479073615543173, + "learning_rate": 4.479392807651807e-06, + "loss": 1.3858, + "step": 3290 + }, + { + "epoch": 0.4659163304310894, + "grad_norm": 9.41287636783323, + "learning_rate": 4.479042624626156e-06, + "loss": 1.4659, + "step": 3291 + }, + { + "epoch": 0.4660579033057266, + "grad_norm": 8.56819575970695, + "learning_rate": 4.47869233756371e-06, + "loss": 1.3994, + "step": 3292 + }, + { + "epoch": 0.46619947618036384, + "grad_norm": 9.527841027151528, + "learning_rate": 
4.478341946482884e-06, + "loss": 1.3773, + "step": 3293 + }, + { + "epoch": 0.46634104905500107, + "grad_norm": 8.57228231265464, + "learning_rate": 4.4779914514020964e-06, + "loss": 1.2585, + "step": 3294 + }, + { + "epoch": 0.4664826219296383, + "grad_norm": 8.727010116093956, + "learning_rate": 4.4776408523397725e-06, + "loss": 1.2767, + "step": 3295 + }, + { + "epoch": 0.4666241948042755, + "grad_norm": 9.725105390982797, + "learning_rate": 4.477290149314344e-06, + "loss": 1.473, + "step": 3296 + }, + { + "epoch": 0.4667657676789127, + "grad_norm": 9.541599545527314, + "learning_rate": 4.476939342344246e-06, + "loss": 1.3345, + "step": 3297 + }, + { + "epoch": 0.46690734055354993, + "grad_norm": 9.200681611394767, + "learning_rate": 4.4765884314479226e-06, + "loss": 1.2867, + "step": 3298 + }, + { + "epoch": 0.46704891342818716, + "grad_norm": 10.27103220119708, + "learning_rate": 4.4762374166438185e-06, + "loss": 1.2709, + "step": 3299 + }, + { + "epoch": 0.4671904863028244, + "grad_norm": 11.200905722371957, + "learning_rate": 4.475886297950386e-06, + "loss": 1.2594, + "step": 3300 + }, + { + "epoch": 0.4673320591774616, + "grad_norm": 7.771025899865278, + "learning_rate": 4.475535075386085e-06, + "loss": 1.1906, + "step": 3301 + }, + { + "epoch": 0.4674736320520988, + "grad_norm": 11.82315148387866, + "learning_rate": 4.475183748969377e-06, + "loss": 1.3449, + "step": 3302 + }, + { + "epoch": 0.46761520492673603, + "grad_norm": 11.009657002025362, + "learning_rate": 4.474832318718733e-06, + "loss": 1.4395, + "step": 3303 + }, + { + "epoch": 0.46775677780137326, + "grad_norm": 10.85257944130266, + "learning_rate": 4.474480784652627e-06, + "loss": 1.3836, + "step": 3304 + }, + { + "epoch": 0.4678983506760105, + "grad_norm": 9.984152252625822, + "learning_rate": 4.474129146789538e-06, + "loss": 1.2862, + "step": 3305 + }, + { + "epoch": 0.4680399235506477, + "grad_norm": 10.859866782929188, + "learning_rate": 4.473777405147952e-06, + "loss": 1.3728, + "step": 3306 + }, + { + "epoch": 0.4681814964252849, + "grad_norm": 9.750273089374616, + "learning_rate": 4.473425559746358e-06, + "loss": 1.5771, + "step": 3307 + }, + { + "epoch": 0.4683230692999221, + "grad_norm": 9.665149525819315, + "learning_rate": 4.473073610603255e-06, + "loss": 1.3418, + "step": 3308 + }, + { + "epoch": 0.46846464217455935, + "grad_norm": 9.522228937234468, + "learning_rate": 4.4727215577371445e-06, + "loss": 1.3143, + "step": 3309 + }, + { + "epoch": 0.4686062150491966, + "grad_norm": 10.783238324351624, + "learning_rate": 4.472369401166531e-06, + "loss": 1.4904, + "step": 3310 + }, + { + "epoch": 0.4687477879238338, + "grad_norm": 8.783965535970037, + "learning_rate": 4.472017140909929e-06, + "loss": 1.2859, + "step": 3311 + }, + { + "epoch": 0.468889360798471, + "grad_norm": 8.246028984823829, + "learning_rate": 4.471664776985857e-06, + "loss": 1.228, + "step": 3312 + }, + { + "epoch": 0.4690309336731082, + "grad_norm": 9.71298900733152, + "learning_rate": 4.471312309412837e-06, + "loss": 1.3732, + "step": 3313 + }, + { + "epoch": 0.46917250654774545, + "grad_norm": 10.049951156573435, + "learning_rate": 4.470959738209399e-06, + "loss": 1.4026, + "step": 3314 + }, + { + "epoch": 0.4693140794223827, + "grad_norm": 9.235875248267622, + "learning_rate": 4.470607063394077e-06, + "loss": 1.2798, + "step": 3315 + }, + { + "epoch": 0.4694556522970199, + "grad_norm": 8.449984434215045, + "learning_rate": 4.470254284985411e-06, + "loss": 1.2991, + "step": 3316 + }, + { + "epoch": 0.4695972251716571, + "grad_norm": 
9.86079718100529, + "learning_rate": 4.469901403001947e-06, + "loss": 1.3747, + "step": 3317 + }, + { + "epoch": 0.4697387980462943, + "grad_norm": 7.97149228052482, + "learning_rate": 4.469548417462234e-06, + "loss": 1.3058, + "step": 3318 + }, + { + "epoch": 0.46988037092093154, + "grad_norm": 9.18021191986082, + "learning_rate": 4.46919532838483e-06, + "loss": 1.3873, + "step": 3319 + }, + { + "epoch": 0.4700219437955688, + "grad_norm": 8.841378153397082, + "learning_rate": 4.468842135788296e-06, + "loss": 1.355, + "step": 3320 + }, + { + "epoch": 0.470163516670206, + "grad_norm": 9.986842081985067, + "learning_rate": 4.468488839691199e-06, + "loss": 1.2938, + "step": 3321 + }, + { + "epoch": 0.47030508954484324, + "grad_norm": 8.56606112862133, + "learning_rate": 4.468135440112111e-06, + "loss": 1.2546, + "step": 3322 + }, + { + "epoch": 0.4704466624194804, + "grad_norm": 11.083652080646713, + "learning_rate": 4.467781937069611e-06, + "loss": 1.2982, + "step": 3323 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 8.18163945937268, + "learning_rate": 4.467428330582281e-06, + "loss": 1.2117, + "step": 3324 + }, + { + "epoch": 0.47072980816875487, + "grad_norm": 7.566663116364619, + "learning_rate": 4.467074620668711e-06, + "loss": 1.238, + "step": 3325 + }, + { + "epoch": 0.4708713810433921, + "grad_norm": 8.230004325114674, + "learning_rate": 4.466720807347495e-06, + "loss": 1.2425, + "step": 3326 + }, + { + "epoch": 0.47101295391802933, + "grad_norm": 9.049692509100922, + "learning_rate": 4.466366890637232e-06, + "loss": 1.1213, + "step": 3327 + }, + { + "epoch": 0.4711545267926665, + "grad_norm": 9.390071706609776, + "learning_rate": 4.466012870556529e-06, + "loss": 1.2452, + "step": 3328 + }, + { + "epoch": 0.47129609966730374, + "grad_norm": 8.88200982727318, + "learning_rate": 4.4656587471239944e-06, + "loss": 1.4023, + "step": 3329 + }, + { + "epoch": 0.47143767254194097, + "grad_norm": 8.632074964959676, + "learning_rate": 4.4653045203582455e-06, + "loss": 1.3724, + "step": 3330 + }, + { + "epoch": 0.4715792454165782, + "grad_norm": 10.313202435238768, + "learning_rate": 4.464950190277903e-06, + "loss": 1.29, + "step": 3331 + }, + { + "epoch": 0.4717208182912154, + "grad_norm": 8.109906541165891, + "learning_rate": 4.464595756901594e-06, + "loss": 1.3016, + "step": 3332 + }, + { + "epoch": 0.4718623911658526, + "grad_norm": 10.934266053709356, + "learning_rate": 4.4642412202479515e-06, + "loss": 1.3023, + "step": 3333 + }, + { + "epoch": 0.47200396404048983, + "grad_norm": 11.156242477457198, + "learning_rate": 4.463886580335612e-06, + "loss": 1.4055, + "step": 3334 + }, + { + "epoch": 0.47214553691512706, + "grad_norm": 9.938144878839939, + "learning_rate": 4.463531837183221e-06, + "loss": 1.2743, + "step": 3335 + }, + { + "epoch": 0.4722871097897643, + "grad_norm": 8.150472765357119, + "learning_rate": 4.463176990809423e-06, + "loss": 1.3304, + "step": 3336 + }, + { + "epoch": 0.4724286826644015, + "grad_norm": 9.369379418665783, + "learning_rate": 4.462822041232876e-06, + "loss": 1.2673, + "step": 3337 + }, + { + "epoch": 0.4725702555390387, + "grad_norm": 10.743470448956318, + "learning_rate": 4.462466988472237e-06, + "loss": 1.2548, + "step": 3338 + }, + { + "epoch": 0.4727118284136759, + "grad_norm": 9.183108986700587, + "learning_rate": 4.462111832546172e-06, + "loss": 1.428, + "step": 3339 + }, + { + "epoch": 0.47285340128831316, + "grad_norm": 10.822847035061335, + "learning_rate": 4.461756573473352e-06, + "loss": 1.3728, + "step": 3340 + }, + { + "epoch": 
0.4729949741629504, + "grad_norm": 11.407641182284355, + "learning_rate": 4.4614012112724494e-06, + "loss": 1.2326, + "step": 3341 + }, + { + "epoch": 0.4731365470375876, + "grad_norm": 8.606294555105483, + "learning_rate": 4.461045745962149e-06, + "loss": 1.3514, + "step": 3342 + }, + { + "epoch": 0.4732781199122248, + "grad_norm": 9.657803604919257, + "learning_rate": 4.460690177561136e-06, + "loss": 1.3295, + "step": 3343 + }, + { + "epoch": 0.473419692786862, + "grad_norm": 9.597390742155765, + "learning_rate": 4.460334506088102e-06, + "loss": 1.2861, + "step": 3344 + }, + { + "epoch": 0.47356126566149925, + "grad_norm": 12.029684227456043, + "learning_rate": 4.459978731561745e-06, + "loss": 1.4862, + "step": 3345 + }, + { + "epoch": 0.4737028385361365, + "grad_norm": 10.5469733791708, + "learning_rate": 4.459622854000767e-06, + "loss": 1.1787, + "step": 3346 + }, + { + "epoch": 0.4738444114107737, + "grad_norm": 12.360378682776698, + "learning_rate": 4.4592668734238775e-06, + "loss": 1.392, + "step": 3347 + }, + { + "epoch": 0.4739859842854109, + "grad_norm": 9.819201471061527, + "learning_rate": 4.458910789849789e-06, + "loss": 1.2712, + "step": 3348 + }, + { + "epoch": 0.4741275571600481, + "grad_norm": 8.532456756312325, + "learning_rate": 4.45855460329722e-06, + "loss": 1.3634, + "step": 3349 + }, + { + "epoch": 0.47426913003468535, + "grad_norm": 10.730650409706396, + "learning_rate": 4.458198313784897e-06, + "loss": 1.4829, + "step": 3350 + }, + { + "epoch": 0.4744107029093226, + "grad_norm": 8.714331597736713, + "learning_rate": 4.457841921331549e-06, + "loss": 1.2372, + "step": 3351 + }, + { + "epoch": 0.4745522757839598, + "grad_norm": 11.919138263045435, + "learning_rate": 4.457485425955911e-06, + "loss": 1.1936, + "step": 3352 + }, + { + "epoch": 0.47469384865859704, + "grad_norm": 11.785309676454068, + "learning_rate": 4.457128827676722e-06, + "loss": 1.4456, + "step": 3353 + }, + { + "epoch": 0.4748354215332342, + "grad_norm": 13.32696807240603, + "learning_rate": 4.456772126512732e-06, + "loss": 1.3831, + "step": 3354 + }, + { + "epoch": 0.47497699440787144, + "grad_norm": 10.549274538959532, + "learning_rate": 4.456415322482689e-06, + "loss": 1.2348, + "step": 3355 + }, + { + "epoch": 0.4751185672825087, + "grad_norm": 7.4648473271511575, + "learning_rate": 4.456058415605352e-06, + "loss": 1.3127, + "step": 3356 + }, + { + "epoch": 0.4752601401571459, + "grad_norm": 8.485942660621474, + "learning_rate": 4.4557014058994815e-06, + "loss": 1.1686, + "step": 3357 + }, + { + "epoch": 0.47540171303178314, + "grad_norm": 8.97252870501425, + "learning_rate": 4.455344293383847e-06, + "loss": 1.2194, + "step": 3358 + }, + { + "epoch": 0.4755432859064203, + "grad_norm": 10.287742165071036, + "learning_rate": 4.454987078077221e-06, + "loss": 1.3788, + "step": 3359 + }, + { + "epoch": 0.47568485878105754, + "grad_norm": 8.953767923169142, + "learning_rate": 4.454629759998382e-06, + "loss": 1.3881, + "step": 3360 + }, + { + "epoch": 0.47582643165569477, + "grad_norm": 8.667323552204207, + "learning_rate": 4.454272339166114e-06, + "loss": 1.2893, + "step": 3361 + }, + { + "epoch": 0.475968004530332, + "grad_norm": 7.590423232027882, + "learning_rate": 4.453914815599206e-06, + "loss": 1.2031, + "step": 3362 + }, + { + "epoch": 0.47610957740496923, + "grad_norm": 8.132311069886736, + "learning_rate": 4.453557189316454e-06, + "loss": 1.4738, + "step": 3363 + }, + { + "epoch": 0.4762511502796064, + "grad_norm": 8.708753317115999, + "learning_rate": 4.453199460336656e-06, + "loss": 1.2831, 
+ "step": 3364 + }, + { + "epoch": 0.47639272315424364, + "grad_norm": 11.766892647306772, + "learning_rate": 4.452841628678619e-06, + "loss": 1.4864, + "step": 3365 + }, + { + "epoch": 0.47653429602888087, + "grad_norm": 9.33298691606233, + "learning_rate": 4.452483694361154e-06, + "loss": 1.3037, + "step": 3366 + }, + { + "epoch": 0.4766758689035181, + "grad_norm": 8.070994552343896, + "learning_rate": 4.452125657403077e-06, + "loss": 1.3575, + "step": 3367 + }, + { + "epoch": 0.4768174417781553, + "grad_norm": 7.9543317018003235, + "learning_rate": 4.45176751782321e-06, + "loss": 1.3963, + "step": 3368 + }, + { + "epoch": 0.4769590146527925, + "grad_norm": 10.643802655683942, + "learning_rate": 4.451409275640379e-06, + "loss": 1.2926, + "step": 3369 + }, + { + "epoch": 0.47710058752742973, + "grad_norm": 8.969720555336176, + "learning_rate": 4.451050930873418e-06, + "loss": 1.0985, + "step": 3370 + }, + { + "epoch": 0.47724216040206696, + "grad_norm": 10.472906488874377, + "learning_rate": 4.450692483541165e-06, + "loss": 1.2868, + "step": 3371 + }, + { + "epoch": 0.4773837332767042, + "grad_norm": 9.501467541228129, + "learning_rate": 4.450333933662462e-06, + "loss": 1.3563, + "step": 3372 + }, + { + "epoch": 0.4775253061513414, + "grad_norm": 9.55311872278069, + "learning_rate": 4.449975281256158e-06, + "loss": 1.3114, + "step": 3373 + }, + { + "epoch": 0.4776668790259786, + "grad_norm": 9.753607840474299, + "learning_rate": 4.4496165263411075e-06, + "loss": 1.3747, + "step": 3374 + }, + { + "epoch": 0.4778084519006158, + "grad_norm": 10.520492175522701, + "learning_rate": 4.4492576689361705e-06, + "loss": 1.3805, + "step": 3375 + }, + { + "epoch": 0.47795002477525306, + "grad_norm": 9.4708016064559, + "learning_rate": 4.448898709060211e-06, + "loss": 1.3336, + "step": 3376 + }, + { + "epoch": 0.4780915976498903, + "grad_norm": 9.41028880860868, + "learning_rate": 4.448539646732099e-06, + "loss": 1.367, + "step": 3377 + }, + { + "epoch": 0.4782331705245275, + "grad_norm": 9.48183632171207, + "learning_rate": 4.448180481970711e-06, + "loss": 1.4832, + "step": 3378 + }, + { + "epoch": 0.4783747433991647, + "grad_norm": 8.148024580561211, + "learning_rate": 4.447821214794928e-06, + "loss": 1.3189, + "step": 3379 + }, + { + "epoch": 0.4785163162738019, + "grad_norm": 9.112478365565025, + "learning_rate": 4.447461845223636e-06, + "loss": 1.4257, + "step": 3380 + }, + { + "epoch": 0.47865788914843915, + "grad_norm": 9.032214261753031, + "learning_rate": 4.447102373275727e-06, + "loss": 1.1645, + "step": 3381 + }, + { + "epoch": 0.4787994620230764, + "grad_norm": 9.151593896005348, + "learning_rate": 4.446742798970097e-06, + "loss": 1.2593, + "step": 3382 + }, + { + "epoch": 0.4789410348977136, + "grad_norm": 7.459905391226147, + "learning_rate": 4.44638312232565e-06, + "loss": 1.1881, + "step": 3383 + }, + { + "epoch": 0.47908260777235084, + "grad_norm": 8.48430845680627, + "learning_rate": 4.446023343361294e-06, + "loss": 1.4111, + "step": 3384 + }, + { + "epoch": 0.479224180646988, + "grad_norm": 8.674326617135314, + "learning_rate": 4.445663462095943e-06, + "loss": 1.316, + "step": 3385 + }, + { + "epoch": 0.47936575352162525, + "grad_norm": 10.636365488877686, + "learning_rate": 4.445303478548513e-06, + "loss": 1.3581, + "step": 3386 + }, + { + "epoch": 0.4795073263962625, + "grad_norm": 9.766792508335524, + "learning_rate": 4.4449433927379295e-06, + "loss": 1.24, + "step": 3387 + }, + { + "epoch": 0.4796488992708997, + "grad_norm": 8.294771721695485, + "learning_rate": 
4.444583204683123e-06, + "loss": 1.2707, + "step": 3388 + }, + { + "epoch": 0.47979047214553694, + "grad_norm": 8.190590311651555, + "learning_rate": 4.444222914403027e-06, + "loss": 1.4181, + "step": 3389 + }, + { + "epoch": 0.4799320450201741, + "grad_norm": 9.852753121822847, + "learning_rate": 4.443862521916582e-06, + "loss": 1.3177, + "step": 3390 + }, + { + "epoch": 0.48007361789481134, + "grad_norm": 9.840110856959189, + "learning_rate": 4.443502027242733e-06, + "loss": 1.2816, + "step": 3391 + }, + { + "epoch": 0.4802151907694486, + "grad_norm": 7.748540679373898, + "learning_rate": 4.443141430400432e-06, + "loss": 1.1012, + "step": 3392 + }, + { + "epoch": 0.4803567636440858, + "grad_norm": 8.496620403935568, + "learning_rate": 4.4427807314086355e-06, + "loss": 1.3381, + "step": 3393 + }, + { + "epoch": 0.48049833651872303, + "grad_norm": 9.451851751158548, + "learning_rate": 4.442419930286304e-06, + "loss": 1.4037, + "step": 3394 + }, + { + "epoch": 0.4806399093933602, + "grad_norm": 12.605350726709645, + "learning_rate": 4.442059027052406e-06, + "loss": 1.2878, + "step": 3395 + }, + { + "epoch": 0.48078148226799744, + "grad_norm": 9.130671580622293, + "learning_rate": 4.441698021725911e-06, + "loss": 1.3892, + "step": 3396 + }, + { + "epoch": 0.48092305514263467, + "grad_norm": 8.490407579970672, + "learning_rate": 4.4413369143258e-06, + "loss": 1.2068, + "step": 3397 + }, + { + "epoch": 0.4810646280172719, + "grad_norm": 9.469724095161197, + "learning_rate": 4.440975704871055e-06, + "loss": 1.3062, + "step": 3398 + }, + { + "epoch": 0.48120620089190913, + "grad_norm": 8.864194067280023, + "learning_rate": 4.4406143933806646e-06, + "loss": 1.308, + "step": 3399 + }, + { + "epoch": 0.4813477737665463, + "grad_norm": 9.4638872911025, + "learning_rate": 4.4402529798736224e-06, + "loss": 1.2528, + "step": 3400 + }, + { + "epoch": 0.48148934664118354, + "grad_norm": 9.904593633088695, + "learning_rate": 4.439891464368927e-06, + "loss": 1.4716, + "step": 3401 + }, + { + "epoch": 0.48163091951582077, + "grad_norm": 9.161474618613534, + "learning_rate": 4.439529846885585e-06, + "loss": 1.2288, + "step": 3402 + }, + { + "epoch": 0.481772492390458, + "grad_norm": 9.821604010265974, + "learning_rate": 4.439168127442604e-06, + "loss": 1.3187, + "step": 3403 + }, + { + "epoch": 0.4819140652650952, + "grad_norm": 10.094562887522011, + "learning_rate": 4.438806306059001e-06, + "loss": 1.4525, + "step": 3404 + }, + { + "epoch": 0.4820556381397324, + "grad_norm": 8.621567637490994, + "learning_rate": 4.438444382753796e-06, + "loss": 1.37, + "step": 3405 + }, + { + "epoch": 0.48219721101436963, + "grad_norm": 9.63505959888786, + "learning_rate": 4.438082357546015e-06, + "loss": 1.3754, + "step": 3406 + }, + { + "epoch": 0.48233878388900686, + "grad_norm": 8.11166790425772, + "learning_rate": 4.4377202304546905e-06, + "loss": 1.3439, + "step": 3407 + }, + { + "epoch": 0.4824803567636441, + "grad_norm": 11.120298270746657, + "learning_rate": 4.437358001498857e-06, + "loss": 1.3521, + "step": 3408 + }, + { + "epoch": 0.4826219296382813, + "grad_norm": 11.543246348184848, + "learning_rate": 4.436995670697559e-06, + "loss": 1.459, + "step": 3409 + }, + { + "epoch": 0.48276350251291855, + "grad_norm": 8.839181751338929, + "learning_rate": 4.436633238069843e-06, + "loss": 1.31, + "step": 3410 + }, + { + "epoch": 0.4829050753875557, + "grad_norm": 10.705106454361804, + "learning_rate": 4.436270703634761e-06, + "loss": 1.2944, + "step": 3411 + }, + { + "epoch": 0.48304664826219296, + "grad_norm": 
8.098644058328288, + "learning_rate": 4.435908067411372e-06, + "loss": 1.2795, + "step": 3412 + }, + { + "epoch": 0.4831882211368302, + "grad_norm": 9.73433737954207, + "learning_rate": 4.435545329418739e-06, + "loss": 1.3463, + "step": 3413 + }, + { + "epoch": 0.4833297940114674, + "grad_norm": 10.624746880601696, + "learning_rate": 4.435182489675931e-06, + "loss": 1.521, + "step": 3414 + }, + { + "epoch": 0.48347136688610465, + "grad_norm": 10.558780895724006, + "learning_rate": 4.434819548202024e-06, + "loss": 1.2736, + "step": 3415 + }, + { + "epoch": 0.4836129397607418, + "grad_norm": 7.880278014492468, + "learning_rate": 4.434456505016094e-06, + "loss": 1.2651, + "step": 3416 + }, + { + "epoch": 0.48375451263537905, + "grad_norm": 8.683512039932705, + "learning_rate": 4.43409336013723e-06, + "loss": 1.4235, + "step": 3417 + }, + { + "epoch": 0.4838960855100163, + "grad_norm": 8.59988816654326, + "learning_rate": 4.433730113584519e-06, + "loss": 1.2992, + "step": 3418 + }, + { + "epoch": 0.4840376583846535, + "grad_norm": 11.107621925126141, + "learning_rate": 4.433366765377057e-06, + "loss": 1.4835, + "step": 3419 + }, + { + "epoch": 0.48417923125929074, + "grad_norm": 12.15130467572374, + "learning_rate": 4.433003315533947e-06, + "loss": 1.2637, + "step": 3420 + }, + { + "epoch": 0.4843208041339279, + "grad_norm": 11.528056542474294, + "learning_rate": 4.432639764074294e-06, + "loss": 1.4657, + "step": 3421 + }, + { + "epoch": 0.48446237700856515, + "grad_norm": 10.737668793920294, + "learning_rate": 4.4322761110172085e-06, + "loss": 1.3733, + "step": 3422 + }, + { + "epoch": 0.4846039498832024, + "grad_norm": 9.893377273681345, + "learning_rate": 4.43191235638181e-06, + "loss": 1.2788, + "step": 3423 + }, + { + "epoch": 0.4847455227578396, + "grad_norm": 8.617032558953825, + "learning_rate": 4.431548500187218e-06, + "loss": 1.3411, + "step": 3424 + }, + { + "epoch": 0.48488709563247684, + "grad_norm": 9.163736973384975, + "learning_rate": 4.431184542452563e-06, + "loss": 1.4696, + "step": 3425 + }, + { + "epoch": 0.485028668507114, + "grad_norm": 9.653885734025703, + "learning_rate": 4.430820483196976e-06, + "loss": 1.268, + "step": 3426 + }, + { + "epoch": 0.48517024138175124, + "grad_norm": 7.877979986300471, + "learning_rate": 4.430456322439596e-06, + "loss": 1.2177, + "step": 3427 + }, + { + "epoch": 0.4853118142563885, + "grad_norm": 11.880387218599703, + "learning_rate": 4.430092060199566e-06, + "loss": 1.2671, + "step": 3428 + }, + { + "epoch": 0.4854533871310257, + "grad_norm": 10.003591655415608, + "learning_rate": 4.429727696496036e-06, + "loss": 1.3087, + "step": 3429 + }, + { + "epoch": 0.48559496000566293, + "grad_norm": 9.194522587142236, + "learning_rate": 4.42936323134816e-06, + "loss": 1.3968, + "step": 3430 + }, + { + "epoch": 0.4857365328803001, + "grad_norm": 9.455427712177533, + "learning_rate": 4.4289986647750975e-06, + "loss": 1.2233, + "step": 3431 + }, + { + "epoch": 0.48587810575493734, + "grad_norm": 8.772100385781199, + "learning_rate": 4.428633996796012e-06, + "loss": 1.3964, + "step": 3432 + }, + { + "epoch": 0.48601967862957457, + "grad_norm": 8.549537815741443, + "learning_rate": 4.4282692274300775e-06, + "loss": 1.3488, + "step": 3433 + }, + { + "epoch": 0.4861612515042118, + "grad_norm": 8.196448641009534, + "learning_rate": 4.427904356696467e-06, + "loss": 1.276, + "step": 3434 + }, + { + "epoch": 0.48630282437884903, + "grad_norm": 11.087022401657862, + "learning_rate": 4.427539384614361e-06, + "loss": 1.4526, + "step": 3435 + }, + { + "epoch": 
0.4864443972534862, + "grad_norm": 8.414942448329127, + "learning_rate": 4.427174311202948e-06, + "loss": 1.2913, + "step": 3436 + }, + { + "epoch": 0.48658597012812344, + "grad_norm": 8.885956354659529, + "learning_rate": 4.426809136481417e-06, + "loss": 1.4002, + "step": 3437 + }, + { + "epoch": 0.48672754300276067, + "grad_norm": 7.577955358120551, + "learning_rate": 4.426443860468967e-06, + "loss": 1.2457, + "step": 3438 + }, + { + "epoch": 0.4868691158773979, + "grad_norm": 10.244151982536602, + "learning_rate": 4.4260784831848e-06, + "loss": 1.2117, + "step": 3439 + }, + { + "epoch": 0.4870106887520351, + "grad_norm": 11.175580883930003, + "learning_rate": 4.425713004648123e-06, + "loss": 1.3842, + "step": 3440 + }, + { + "epoch": 0.48715226162667236, + "grad_norm": 9.023359290110195, + "learning_rate": 4.4253474248781494e-06, + "loss": 1.3275, + "step": 3441 + }, + { + "epoch": 0.48729383450130953, + "grad_norm": 10.093258683379188, + "learning_rate": 4.424981743894097e-06, + "loss": 1.3398, + "step": 3442 + }, + { + "epoch": 0.48743540737594676, + "grad_norm": 8.537192709713947, + "learning_rate": 4.42461596171519e-06, + "loss": 1.429, + "step": 3443 + }, + { + "epoch": 0.487576980250584, + "grad_norm": 9.05612273933079, + "learning_rate": 4.424250078360657e-06, + "loss": 1.158, + "step": 3444 + }, + { + "epoch": 0.4877185531252212, + "grad_norm": 9.549594922487223, + "learning_rate": 4.4238840938497315e-06, + "loss": 1.2439, + "step": 3445 + }, + { + "epoch": 0.48786012599985845, + "grad_norm": 8.946737144332852, + "learning_rate": 4.423518008201655e-06, + "loss": 1.2446, + "step": 3446 + }, + { + "epoch": 0.4880016988744956, + "grad_norm": 8.746167896905442, + "learning_rate": 4.42315182143567e-06, + "loss": 1.4049, + "step": 3447 + }, + { + "epoch": 0.48814327174913286, + "grad_norm": 10.593720472626623, + "learning_rate": 4.422785533571028e-06, + "loss": 1.3256, + "step": 3448 + }, + { + "epoch": 0.4882848446237701, + "grad_norm": 9.233199249652802, + "learning_rate": 4.422419144626984e-06, + "loss": 1.3141, + "step": 3449 + }, + { + "epoch": 0.4884264174984073, + "grad_norm": 8.103139254862365, + "learning_rate": 4.4220526546228e-06, + "loss": 1.2957, + "step": 3450 + }, + { + "epoch": 0.48856799037304455, + "grad_norm": 9.156296661570522, + "learning_rate": 4.4216860635777395e-06, + "loss": 1.2217, + "step": 3451 + }, + { + "epoch": 0.4887095632476817, + "grad_norm": 7.781067627757413, + "learning_rate": 4.4213193715110755e-06, + "loss": 1.186, + "step": 3452 + }, + { + "epoch": 0.48885113612231895, + "grad_norm": 7.799748137882678, + "learning_rate": 4.420952578442086e-06, + "loss": 1.3251, + "step": 3453 + }, + { + "epoch": 0.4889927089969562, + "grad_norm": 9.797772866800601, + "learning_rate": 4.420585684390051e-06, + "loss": 1.3668, + "step": 3454 + }, + { + "epoch": 0.4891342818715934, + "grad_norm": 8.752253650901952, + "learning_rate": 4.420218689374259e-06, + "loss": 1.1432, + "step": 3455 + }, + { + "epoch": 0.48927585474623064, + "grad_norm": 11.068069693855694, + "learning_rate": 4.419851593414002e-06, + "loss": 1.4114, + "step": 3456 + }, + { + "epoch": 0.4894174276208678, + "grad_norm": 9.288293702416162, + "learning_rate": 4.4194843965285786e-06, + "loss": 1.3921, + "step": 3457 + }, + { + "epoch": 0.48955900049550505, + "grad_norm": 7.513001172300603, + "learning_rate": 4.419117098737291e-06, + "loss": 1.3331, + "step": 3458 + }, + { + "epoch": 0.4897005733701423, + "grad_norm": 7.596131504698291, + "learning_rate": 4.418749700059449e-06, + "loss": 1.2346, + 
"step": 3459 + }, + { + "epoch": 0.4898421462447795, + "grad_norm": 10.067346768005244, + "learning_rate": 4.418382200514366e-06, + "loss": 1.215, + "step": 3460 + }, + { + "epoch": 0.48998371911941674, + "grad_norm": 8.49556751243838, + "learning_rate": 4.418014600121361e-06, + "loss": 1.0883, + "step": 3461 + }, + { + "epoch": 0.4901252919940539, + "grad_norm": 8.735744962581585, + "learning_rate": 4.4176468988997586e-06, + "loss": 1.4575, + "step": 3462 + }, + { + "epoch": 0.49026686486869114, + "grad_norm": 10.256025822599518, + "learning_rate": 4.4172790968688885e-06, + "loss": 1.3552, + "step": 3463 + }, + { + "epoch": 0.4904084377433284, + "grad_norm": 10.129171383437003, + "learning_rate": 4.416911194048086e-06, + "loss": 1.368, + "step": 3464 + }, + { + "epoch": 0.4905500106179656, + "grad_norm": 9.082178258218812, + "learning_rate": 4.4165431904566915e-06, + "loss": 1.3062, + "step": 3465 + }, + { + "epoch": 0.49069158349260283, + "grad_norm": 7.39781100914661, + "learning_rate": 4.416175086114049e-06, + "loss": 1.1872, + "step": 3466 + }, + { + "epoch": 0.49083315636724006, + "grad_norm": 8.249298065841819, + "learning_rate": 4.415806881039513e-06, + "loss": 1.2094, + "step": 3467 + }, + { + "epoch": 0.49097472924187724, + "grad_norm": 9.215722986879458, + "learning_rate": 4.415438575252438e-06, + "loss": 1.3325, + "step": 3468 + }, + { + "epoch": 0.49111630211651447, + "grad_norm": 9.235067325464357, + "learning_rate": 4.415070168772184e-06, + "loss": 1.2323, + "step": 3469 + }, + { + "epoch": 0.4912578749911517, + "grad_norm": 11.979672062481901, + "learning_rate": 4.414701661618119e-06, + "loss": 1.3187, + "step": 3470 + }, + { + "epoch": 0.49139944786578893, + "grad_norm": 9.125377490452749, + "learning_rate": 4.414333053809616e-06, + "loss": 1.2218, + "step": 3471 + }, + { + "epoch": 0.49154102074042616, + "grad_norm": 10.200375897830003, + "learning_rate": 4.413964345366051e-06, + "loss": 1.3388, + "step": 3472 + }, + { + "epoch": 0.49168259361506333, + "grad_norm": 10.976519753586695, + "learning_rate": 4.413595536306808e-06, + "loss": 1.4129, + "step": 3473 + }, + { + "epoch": 0.49182416648970056, + "grad_norm": 9.693768054741206, + "learning_rate": 4.4132266266512745e-06, + "loss": 1.4295, + "step": 3474 + }, + { + "epoch": 0.4919657393643378, + "grad_norm": 11.58491413846918, + "learning_rate": 4.412857616418844e-06, + "loss": 1.3145, + "step": 3475 + }, + { + "epoch": 0.492107312238975, + "grad_norm": 12.517276269228674, + "learning_rate": 4.412488505628915e-06, + "loss": 1.4455, + "step": 3476 + }, + { + "epoch": 0.49224888511361226, + "grad_norm": 7.812865714096454, + "learning_rate": 4.41211929430089e-06, + "loss": 1.221, + "step": 3477 + }, + { + "epoch": 0.49239045798824943, + "grad_norm": 9.548812744477795, + "learning_rate": 4.411749982454181e-06, + "loss": 1.3289, + "step": 3478 + }, + { + "epoch": 0.49253203086288666, + "grad_norm": 10.069094564271321, + "learning_rate": 4.4113805701082e-06, + "loss": 1.3622, + "step": 3479 + }, + { + "epoch": 0.4926736037375239, + "grad_norm": 9.197308553002918, + "learning_rate": 4.411011057282368e-06, + "loss": 1.2031, + "step": 3480 + }, + { + "epoch": 0.4928151766121611, + "grad_norm": 10.996033039907896, + "learning_rate": 4.41064144399611e-06, + "loss": 1.4659, + "step": 3481 + }, + { + "epoch": 0.49295674948679835, + "grad_norm": 9.667449481004462, + "learning_rate": 4.4102717302688556e-06, + "loss": 1.4367, + "step": 3482 + }, + { + "epoch": 0.4930983223614355, + "grad_norm": 8.600762408860733, + "learning_rate": 
4.40990191612004e-06, + "loss": 1.2087, + "step": 3483 + }, + { + "epoch": 0.49323989523607276, + "grad_norm": 9.517159875313913, + "learning_rate": 4.409532001569106e-06, + "loss": 1.2846, + "step": 3484 + }, + { + "epoch": 0.49338146811071, + "grad_norm": 8.894193285190958, + "learning_rate": 4.4091619866354975e-06, + "loss": 1.2055, + "step": 3485 + }, + { + "epoch": 0.4935230409853472, + "grad_norm": 8.597307502297594, + "learning_rate": 4.408791871338667e-06, + "loss": 1.273, + "step": 3486 + }, + { + "epoch": 0.49366461385998445, + "grad_norm": 8.340951971889258, + "learning_rate": 4.4084216556980715e-06, + "loss": 1.152, + "step": 3487 + }, + { + "epoch": 0.4938061867346216, + "grad_norm": 9.637160112891344, + "learning_rate": 4.408051339733172e-06, + "loss": 1.2504, + "step": 3488 + }, + { + "epoch": 0.49394775960925885, + "grad_norm": 8.322996926109685, + "learning_rate": 4.407680923463437e-06, + "loss": 1.3272, + "step": 3489 + }, + { + "epoch": 0.4940893324838961, + "grad_norm": 8.646537096419909, + "learning_rate": 4.407310406908338e-06, + "loss": 1.2358, + "step": 3490 + }, + { + "epoch": 0.4942309053585333, + "grad_norm": 8.791834632269415, + "learning_rate": 4.406939790087353e-06, + "loss": 1.2557, + "step": 3491 + }, + { + "epoch": 0.49437247823317054, + "grad_norm": 12.798504885138806, + "learning_rate": 4.406569073019965e-06, + "loss": 1.491, + "step": 3492 + }, + { + "epoch": 0.4945140511078077, + "grad_norm": 10.362869340782074, + "learning_rate": 4.406198255725662e-06, + "loss": 1.3549, + "step": 3493 + }, + { + "epoch": 0.49465562398244495, + "grad_norm": 9.330569766393012, + "learning_rate": 4.4058273382239395e-06, + "loss": 1.1749, + "step": 3494 + }, + { + "epoch": 0.4947971968570822, + "grad_norm": 11.090738064943888, + "learning_rate": 4.4054563205342935e-06, + "loss": 1.3639, + "step": 3495 + }, + { + "epoch": 0.4949387697317194, + "grad_norm": 9.794254128846651, + "learning_rate": 4.4050852026762295e-06, + "loss": 1.2714, + "step": 3496 + }, + { + "epoch": 0.49508034260635664, + "grad_norm": 10.010516549577023, + "learning_rate": 4.404713984669257e-06, + "loss": 1.2737, + "step": 3497 + }, + { + "epoch": 0.49522191548099387, + "grad_norm": 10.07306225792103, + "learning_rate": 4.404342666532891e-06, + "loss": 1.3022, + "step": 3498 + }, + { + "epoch": 0.49536348835563104, + "grad_norm": 9.733838502847256, + "learning_rate": 4.403971248286651e-06, + "loss": 1.4025, + "step": 3499 + }, + { + "epoch": 0.4955050612302683, + "grad_norm": 7.948625592120372, + "learning_rate": 4.403599729950062e-06, + "loss": 1.2245, + "step": 3500 + }, + { + "epoch": 0.4956466341049055, + "grad_norm": 9.864400849203642, + "learning_rate": 4.403228111542654e-06, + "loss": 1.364, + "step": 3501 + }, + { + "epoch": 0.49578820697954273, + "grad_norm": 9.020945864732456, + "learning_rate": 4.402856393083964e-06, + "loss": 1.317, + "step": 3502 + }, + { + "epoch": 0.49592977985417996, + "grad_norm": 8.336857266795088, + "learning_rate": 4.402484574593532e-06, + "loss": 1.182, + "step": 3503 + }, + { + "epoch": 0.49607135272881714, + "grad_norm": 9.669971383495257, + "learning_rate": 4.402112656090904e-06, + "loss": 1.322, + "step": 3504 + }, + { + "epoch": 0.49621292560345437, + "grad_norm": 10.250370670222278, + "learning_rate": 4.401740637595633e-06, + "loss": 1.3543, + "step": 3505 + }, + { + "epoch": 0.4963544984780916, + "grad_norm": 8.281678847239164, + "learning_rate": 4.401368519127274e-06, + "loss": 1.2596, + "step": 3506 + }, + { + "epoch": 0.49649607135272883, + "grad_norm": 
8.381710723115901, + "learning_rate": 4.400996300705389e-06, + "loss": 1.3443, + "step": 3507 + }, + { + "epoch": 0.49663764422736606, + "grad_norm": 12.359167586755186, + "learning_rate": 4.400623982349547e-06, + "loss": 1.3197, + "step": 3508 + }, + { + "epoch": 0.49677921710200323, + "grad_norm": 10.596853516179195, + "learning_rate": 4.400251564079319e-06, + "loss": 1.2304, + "step": 3509 + }, + { + "epoch": 0.49692078997664046, + "grad_norm": 8.16007724575805, + "learning_rate": 4.399879045914283e-06, + "loss": 1.2855, + "step": 3510 + }, + { + "epoch": 0.4970623628512777, + "grad_norm": 7.999841688497068, + "learning_rate": 4.399506427874023e-06, + "loss": 1.2656, + "step": 3511 + }, + { + "epoch": 0.4972039357259149, + "grad_norm": 9.19851335870389, + "learning_rate": 4.399133709978126e-06, + "loss": 1.2572, + "step": 3512 + }, + { + "epoch": 0.49734550860055216, + "grad_norm": 10.543692011015143, + "learning_rate": 4.398760892246185e-06, + "loss": 1.1363, + "step": 3513 + }, + { + "epoch": 0.49748708147518933, + "grad_norm": 9.467562660833908, + "learning_rate": 4.398387974697801e-06, + "loss": 1.2989, + "step": 3514 + }, + { + "epoch": 0.49762865434982656, + "grad_norm": 7.648051453804365, + "learning_rate": 4.398014957352576e-06, + "loss": 1.2101, + "step": 3515 + }, + { + "epoch": 0.4977702272244638, + "grad_norm": 9.913867416446465, + "learning_rate": 4.3976418402301196e-06, + "loss": 1.2813, + "step": 3516 + }, + { + "epoch": 0.497911800099101, + "grad_norm": 9.01964375065179, + "learning_rate": 4.397268623350047e-06, + "loss": 1.3348, + "step": 3517 + }, + { + "epoch": 0.49805337297373825, + "grad_norm": 7.68456702378329, + "learning_rate": 4.396895306731978e-06, + "loss": 1.361, + "step": 3518 + }, + { + "epoch": 0.4981949458483754, + "grad_norm": 9.472732763866244, + "learning_rate": 4.396521890395536e-06, + "loss": 1.3689, + "step": 3519 + }, + { + "epoch": 0.49833651872301266, + "grad_norm": 9.573949404616071, + "learning_rate": 4.396148374360354e-06, + "loss": 1.3755, + "step": 3520 + }, + { + "epoch": 0.4984780915976499, + "grad_norm": 10.137299778627336, + "learning_rate": 4.395774758646064e-06, + "loss": 1.3647, + "step": 3521 + }, + { + "epoch": 0.4986196644722871, + "grad_norm": 9.6458344318782, + "learning_rate": 4.395401043272309e-06, + "loss": 1.3787, + "step": 3522 + }, + { + "epoch": 0.49876123734692435, + "grad_norm": 8.89610896401944, + "learning_rate": 4.395027228258735e-06, + "loss": 1.2711, + "step": 3523 + }, + { + "epoch": 0.4989028102215615, + "grad_norm": 7.0881088614273775, + "learning_rate": 4.3946533136249926e-06, + "loss": 1.0665, + "step": 3524 + }, + { + "epoch": 0.49904438309619875, + "grad_norm": 7.967308721912542, + "learning_rate": 4.394279299390737e-06, + "loss": 1.2285, + "step": 3525 + }, + { + "epoch": 0.499185955970836, + "grad_norm": 8.603327848050146, + "learning_rate": 4.393905185575632e-06, + "loss": 1.1893, + "step": 3526 + }, + { + "epoch": 0.4993275288454732, + "grad_norm": 7.9065891841460605, + "learning_rate": 4.393530972199344e-06, + "loss": 1.2525, + "step": 3527 + }, + { + "epoch": 0.49946910172011044, + "grad_norm": 9.666588443132678, + "learning_rate": 4.393156659281545e-06, + "loss": 1.3401, + "step": 3528 + }, + { + "epoch": 0.4996106745947477, + "grad_norm": 9.827809824536981, + "learning_rate": 4.39278224684191e-06, + "loss": 1.1489, + "step": 3529 + }, + { + "epoch": 0.49975224746938485, + "grad_norm": 9.545156271101385, + "learning_rate": 4.392407734900125e-06, + "loss": 1.4004, + "step": 3530 + }, + { + "epoch": 
0.4998938203440221, + "grad_norm": 11.119915089454581, + "learning_rate": 4.392033123475876e-06, + "loss": 1.3414, + "step": 3531 + }, + { + "epoch": 0.5000353932186593, + "grad_norm": 9.601895828606867, + "learning_rate": 4.3916584125888575e-06, + "loss": 1.2758, + "step": 3532 + }, + { + "epoch": 0.5001769660932965, + "grad_norm": 9.405676535934713, + "learning_rate": 4.391283602258765e-06, + "loss": 1.2702, + "step": 3533 + }, + { + "epoch": 0.5003185389679338, + "grad_norm": 8.478410566830677, + "learning_rate": 4.390908692505305e-06, + "loss": 1.3231, + "step": 3534 + }, + { + "epoch": 0.500460111842571, + "grad_norm": 11.824971714752111, + "learning_rate": 4.390533683348184e-06, + "loss": 1.2985, + "step": 3535 + }, + { + "epoch": 0.5006016847172082, + "grad_norm": 11.866457145069932, + "learning_rate": 4.390158574807118e-06, + "loss": 1.2969, + "step": 3536 + }, + { + "epoch": 0.5007432575918453, + "grad_norm": 9.758740151186469, + "learning_rate": 4.389783366901824e-06, + "loss": 1.3713, + "step": 3537 + }, + { + "epoch": 0.5008848304664826, + "grad_norm": 8.803027100538342, + "learning_rate": 4.3894080596520286e-06, + "loss": 1.3853, + "step": 3538 + }, + { + "epoch": 0.5010264033411198, + "grad_norm": 7.430376247567812, + "learning_rate": 4.38903265307746e-06, + "loss": 1.1696, + "step": 3539 + }, + { + "epoch": 0.501167976215757, + "grad_norm": 11.430323616312567, + "learning_rate": 4.388657147197852e-06, + "loss": 1.2981, + "step": 3540 + }, + { + "epoch": 0.5013095490903943, + "grad_norm": 11.699443080069626, + "learning_rate": 4.388281542032948e-06, + "loss": 1.4888, + "step": 3541 + }, + { + "epoch": 0.5014511219650315, + "grad_norm": 10.964092254707024, + "learning_rate": 4.38790583760249e-06, + "loss": 1.4003, + "step": 3542 + }, + { + "epoch": 0.5015926948396687, + "grad_norm": 10.87441699339735, + "learning_rate": 4.3875300339262304e-06, + "loss": 1.0645, + "step": 3543 + }, + { + "epoch": 0.501734267714306, + "grad_norm": 8.311762956339816, + "learning_rate": 4.387154131023924e-06, + "loss": 1.2044, + "step": 3544 + }, + { + "epoch": 0.5018758405889432, + "grad_norm": 10.6818265077422, + "learning_rate": 4.386778128915332e-06, + "loss": 1.378, + "step": 3545 + }, + { + "epoch": 0.5020174134635804, + "grad_norm": 11.54980695819131, + "learning_rate": 4.386402027620221e-06, + "loss": 1.3755, + "step": 3546 + }, + { + "epoch": 0.5021589863382176, + "grad_norm": 10.072475251418078, + "learning_rate": 4.386025827158362e-06, + "loss": 1.316, + "step": 3547 + }, + { + "epoch": 0.5023005592128548, + "grad_norm": 9.081131507851355, + "learning_rate": 4.385649527549531e-06, + "loss": 1.1906, + "step": 3548 + }, + { + "epoch": 0.502442132087492, + "grad_norm": 10.1457130586883, + "learning_rate": 4.385273128813511e-06, + "loss": 1.276, + "step": 3549 + }, + { + "epoch": 0.5025837049621292, + "grad_norm": 7.972295234277907, + "learning_rate": 4.384896630970088e-06, + "loss": 1.4091, + "step": 3550 + }, + { + "epoch": 0.5027252778367665, + "grad_norm": 7.956317133594052, + "learning_rate": 4.384520034039054e-06, + "loss": 1.242, + "step": 3551 + }, + { + "epoch": 0.5028668507114037, + "grad_norm": 8.472580609966649, + "learning_rate": 4.384143338040207e-06, + "loss": 1.3097, + "step": 3552 + }, + { + "epoch": 0.5030084235860409, + "grad_norm": 9.706047563965718, + "learning_rate": 4.3837665429933505e-06, + "loss": 1.3847, + "step": 3553 + }, + { + "epoch": 0.5031499964606782, + "grad_norm": 9.229073850510874, + "learning_rate": 4.383389648918291e-06, + "loss": 1.2678, + "step": 3554 
+ }, + { + "epoch": 0.5032915693353154, + "grad_norm": 8.90064456620124, + "learning_rate": 4.3830126558348425e-06, + "loss": 1.2976, + "step": 3555 + }, + { + "epoch": 0.5034331422099526, + "grad_norm": 11.393540918353834, + "learning_rate": 4.382635563762822e-06, + "loss": 1.4591, + "step": 3556 + }, + { + "epoch": 0.5035747150845898, + "grad_norm": 9.831399587377154, + "learning_rate": 4.382258372722054e-06, + "loss": 1.3694, + "step": 3557 + }, + { + "epoch": 0.503716287959227, + "grad_norm": 7.35223625573914, + "learning_rate": 4.381881082732367e-06, + "loss": 1.2936, + "step": 3558 + }, + { + "epoch": 0.5038578608338642, + "grad_norm": 9.41638894360809, + "learning_rate": 4.381503693813594e-06, + "loss": 1.3006, + "step": 3559 + }, + { + "epoch": 0.5039994337085014, + "grad_norm": 11.04441881729042, + "learning_rate": 4.381126205985575e-06, + "loss": 1.3834, + "step": 3560 + }, + { + "epoch": 0.5041410065831387, + "grad_norm": 9.708416808810474, + "learning_rate": 4.380748619268154e-06, + "loss": 1.1973, + "step": 3561 + }, + { + "epoch": 0.5042825794577759, + "grad_norm": 10.062146293157456, + "learning_rate": 4.3803709336811804e-06, + "loss": 1.3512, + "step": 3562 + }, + { + "epoch": 0.5044241523324131, + "grad_norm": 8.74817224213864, + "learning_rate": 4.379993149244509e-06, + "loss": 1.3635, + "step": 3563 + }, + { + "epoch": 0.5045657252070503, + "grad_norm": 8.895195991025156, + "learning_rate": 4.379615265978e-06, + "loss": 1.1614, + "step": 3564 + }, + { + "epoch": 0.5047072980816876, + "grad_norm": 9.433851332534102, + "learning_rate": 4.379237283901518e-06, + "loss": 1.0763, + "step": 3565 + }, + { + "epoch": 0.5048488709563248, + "grad_norm": 10.162210915971169, + "learning_rate": 4.378859203034932e-06, + "loss": 1.2676, + "step": 3566 + }, + { + "epoch": 0.504990443830962, + "grad_norm": 7.859618166125194, + "learning_rate": 4.378481023398119e-06, + "loss": 1.1299, + "step": 3567 + }, + { + "epoch": 0.5051320167055992, + "grad_norm": 8.570361461516935, + "learning_rate": 4.37810274501096e-06, + "loss": 1.3747, + "step": 3568 + }, + { + "epoch": 0.5052735895802364, + "grad_norm": 9.698002198549267, + "learning_rate": 4.37772436789334e-06, + "loss": 1.344, + "step": 3569 + }, + { + "epoch": 0.5054151624548736, + "grad_norm": 10.693662060499207, + "learning_rate": 4.377345892065149e-06, + "loss": 1.4562, + "step": 3570 + }, + { + "epoch": 0.5055567353295108, + "grad_norm": 8.804870766033025, + "learning_rate": 4.376967317546285e-06, + "loss": 1.2438, + "step": 3571 + }, + { + "epoch": 0.5056983082041481, + "grad_norm": 11.900100681936872, + "learning_rate": 4.376588644356649e-06, + "loss": 1.478, + "step": 3572 + }, + { + "epoch": 0.5058398810787853, + "grad_norm": 10.120219668088787, + "learning_rate": 4.376209872516146e-06, + "loss": 1.2629, + "step": 3573 + }, + { + "epoch": 0.5059814539534225, + "grad_norm": 9.222704643694506, + "learning_rate": 4.37583100204469e-06, + "loss": 1.3411, + "step": 3574 + }, + { + "epoch": 0.5061230268280598, + "grad_norm": 14.113778036682808, + "learning_rate": 4.375452032962197e-06, + "loss": 1.4009, + "step": 3575 + }, + { + "epoch": 0.506264599702697, + "grad_norm": 9.563826325630156, + "learning_rate": 4.375072965288589e-06, + "loss": 1.3481, + "step": 3576 + }, + { + "epoch": 0.5064061725773342, + "grad_norm": 10.58969353625942, + "learning_rate": 4.374693799043792e-06, + "loss": 1.4274, + "step": 3577 + }, + { + "epoch": 0.5065477454519715, + "grad_norm": 8.766919589534027, + "learning_rate": 4.374314534247741e-06, + "loss": 1.1455, 
+ "step": 3578 + }, + { + "epoch": 0.5066893183266086, + "grad_norm": 11.15061864120296, + "learning_rate": 4.3739351709203725e-06, + "loss": 1.2594, + "step": 3579 + }, + { + "epoch": 0.5068308912012458, + "grad_norm": 10.297119878655883, + "learning_rate": 4.3735557090816295e-06, + "loss": 1.3052, + "step": 3580 + }, + { + "epoch": 0.506972464075883, + "grad_norm": 9.146231003257101, + "learning_rate": 4.37317614875146e-06, + "loss": 1.287, + "step": 3581 + }, + { + "epoch": 0.5071140369505203, + "grad_norm": 9.882043571865589, + "learning_rate": 4.372796489949816e-06, + "loss": 1.3389, + "step": 3582 + }, + { + "epoch": 0.5072556098251575, + "grad_norm": 7.811374186460933, + "learning_rate": 4.3724167326966575e-06, + "loss": 1.1854, + "step": 3583 + }, + { + "epoch": 0.5073971826997947, + "grad_norm": 10.818054536518021, + "learning_rate": 4.372036877011948e-06, + "loss": 1.3886, + "step": 3584 + }, + { + "epoch": 0.507538755574432, + "grad_norm": 9.699870818546213, + "learning_rate": 4.371656922915655e-06, + "loss": 1.3545, + "step": 3585 + }, + { + "epoch": 0.5076803284490692, + "grad_norm": 11.007507883138821, + "learning_rate": 4.3712768704277535e-06, + "loss": 1.393, + "step": 3586 + }, + { + "epoch": 0.5078219013237064, + "grad_norm": 11.568869655985734, + "learning_rate": 4.3708967195682215e-06, + "loss": 1.4189, + "step": 3587 + }, + { + "epoch": 0.5079634741983436, + "grad_norm": 10.3037623524479, + "learning_rate": 4.3705164703570444e-06, + "loss": 1.2912, + "step": 3588 + }, + { + "epoch": 0.5081050470729808, + "grad_norm": 11.373174164505347, + "learning_rate": 4.3701361228142115e-06, + "loss": 1.207, + "step": 3589 + }, + { + "epoch": 0.508246619947618, + "grad_norm": 11.237121670711891, + "learning_rate": 4.369755676959717e-06, + "loss": 1.3908, + "step": 3590 + }, + { + "epoch": 0.5083881928222552, + "grad_norm": 8.524258721752355, + "learning_rate": 4.36937513281356e-06, + "loss": 1.4201, + "step": 3591 + }, + { + "epoch": 0.5085297656968925, + "grad_norm": 10.984852157799436, + "learning_rate": 4.3689944903957475e-06, + "loss": 1.3405, + "step": 3592 + }, + { + "epoch": 0.5086713385715297, + "grad_norm": 11.462342013036782, + "learning_rate": 4.368613749726287e-06, + "loss": 1.2763, + "step": 3593 + }, + { + "epoch": 0.5088129114461669, + "grad_norm": 10.990650712135832, + "learning_rate": 4.368232910825196e-06, + "loss": 1.2023, + "step": 3594 + }, + { + "epoch": 0.5089544843208041, + "grad_norm": 10.185433581884306, + "learning_rate": 4.367851973712492e-06, + "loss": 1.4747, + "step": 3595 + }, + { + "epoch": 0.5090960571954414, + "grad_norm": 9.753900774678078, + "learning_rate": 4.367470938408204e-06, + "loss": 1.1592, + "step": 3596 + }, + { + "epoch": 0.5092376300700786, + "grad_norm": 11.563049633262372, + "learning_rate": 4.367089804932362e-06, + "loss": 1.4018, + "step": 3597 + }, + { + "epoch": 0.5093792029447158, + "grad_norm": 9.857701082396586, + "learning_rate": 4.366708573304999e-06, + "loss": 1.2494, + "step": 3598 + }, + { + "epoch": 0.5095207758193531, + "grad_norm": 9.952626741354987, + "learning_rate": 4.36632724354616e-06, + "loss": 1.3116, + "step": 3599 + }, + { + "epoch": 0.5096623486939902, + "grad_norm": 8.726732107237186, + "learning_rate": 4.365945815675888e-06, + "loss": 1.2629, + "step": 3600 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 9.500316012296437, + "learning_rate": 4.365564289714237e-06, + "loss": 1.2569, + "step": 3601 + }, + { + "epoch": 0.5099454944432646, + "grad_norm": 11.518580146816396, + "learning_rate": 
4.365182665681261e-06, + "loss": 1.395, + "step": 3602 + }, + { + "epoch": 0.5100870673179019, + "grad_norm": 10.404613469289115, + "learning_rate": 4.364800943597024e-06, + "loss": 1.2535, + "step": 3603 + }, + { + "epoch": 0.5102286401925391, + "grad_norm": 11.070715660663398, + "learning_rate": 4.364419123481592e-06, + "loss": 1.24, + "step": 3604 + }, + { + "epoch": 0.5103702130671763, + "grad_norm": 8.089870627822323, + "learning_rate": 4.364037205355036e-06, + "loss": 1.1674, + "step": 3605 + }, + { + "epoch": 0.5105117859418136, + "grad_norm": 11.554065494160891, + "learning_rate": 4.3636551892374346e-06, + "loss": 1.4152, + "step": 3606 + }, + { + "epoch": 0.5106533588164508, + "grad_norm": 8.718538233875382, + "learning_rate": 4.3632730751488695e-06, + "loss": 1.3727, + "step": 3607 + }, + { + "epoch": 0.510794931691088, + "grad_norm": 8.05165683921299, + "learning_rate": 4.362890863109428e-06, + "loss": 1.2115, + "step": 3608 + }, + { + "epoch": 0.5109365045657253, + "grad_norm": 11.408877051924852, + "learning_rate": 4.362508553139203e-06, + "loss": 1.2151, + "step": 3609 + }, + { + "epoch": 0.5110780774403624, + "grad_norm": 9.166796550408547, + "learning_rate": 4.362126145258292e-06, + "loss": 1.2188, + "step": 3610 + }, + { + "epoch": 0.5112196503149996, + "grad_norm": 10.785255285220781, + "learning_rate": 4.361743639486797e-06, + "loss": 1.2269, + "step": 3611 + }, + { + "epoch": 0.5113612231896368, + "grad_norm": 10.09594702652033, + "learning_rate": 4.361361035844829e-06, + "loss": 1.4284, + "step": 3612 + }, + { + "epoch": 0.5115027960642741, + "grad_norm": 8.383403604363346, + "learning_rate": 4.360978334352498e-06, + "loss": 1.2688, + "step": 3613 + }, + { + "epoch": 0.5116443689389113, + "grad_norm": 11.109635963003862, + "learning_rate": 4.360595535029924e-06, + "loss": 1.2878, + "step": 3614 + }, + { + "epoch": 0.5117859418135485, + "grad_norm": 10.5244122048835, + "learning_rate": 4.36021263789723e-06, + "loss": 1.333, + "step": 3615 + }, + { + "epoch": 0.5119275146881858, + "grad_norm": 8.728630346849018, + "learning_rate": 4.359829642974544e-06, + "loss": 1.3274, + "step": 3616 + }, + { + "epoch": 0.512069087562823, + "grad_norm": 9.188280552514641, + "learning_rate": 4.359446550282001e-06, + "loss": 1.3884, + "step": 3617 + }, + { + "epoch": 0.5122106604374602, + "grad_norm": 8.004876081280718, + "learning_rate": 4.359063359839739e-06, + "loss": 1.2561, + "step": 3618 + }, + { + "epoch": 0.5123522333120974, + "grad_norm": 8.528752782408311, + "learning_rate": 4.358680071667903e-06, + "loss": 1.3053, + "step": 3619 + }, + { + "epoch": 0.5124938061867346, + "grad_norm": 9.882068663336362, + "learning_rate": 4.35829668578664e-06, + "loss": 1.3428, + "step": 3620 + }, + { + "epoch": 0.5126353790613718, + "grad_norm": 9.819680083554626, + "learning_rate": 4.357913202216108e-06, + "loss": 1.2256, + "step": 3621 + }, + { + "epoch": 0.512776951936009, + "grad_norm": 9.118700335068658, + "learning_rate": 4.357529620976463e-06, + "loss": 1.3428, + "step": 3622 + }, + { + "epoch": 0.5129185248106463, + "grad_norm": 8.052748823280151, + "learning_rate": 4.3571459420878705e-06, + "loss": 1.2902, + "step": 3623 + }, + { + "epoch": 0.5130600976852835, + "grad_norm": 10.424274402862382, + "learning_rate": 4.3567621655705015e-06, + "loss": 1.263, + "step": 3624 + }, + { + "epoch": 0.5132016705599207, + "grad_norm": 9.079106131765146, + "learning_rate": 4.356378291444529e-06, + "loss": 1.2425, + "step": 3625 + }, + { + "epoch": 0.513343243434558, + "grad_norm": 9.654248076386382, 
+ "learning_rate": 4.355994319730135e-06, + "loss": 1.1795, + "step": 3626 + }, + { + "epoch": 0.5134848163091952, + "grad_norm": 9.326823234779999, + "learning_rate": 4.355610250447503e-06, + "loss": 1.334, + "step": 3627 + }, + { + "epoch": 0.5136263891838324, + "grad_norm": 8.823396912122204, + "learning_rate": 4.355226083616824e-06, + "loss": 1.3492, + "step": 3628 + }, + { + "epoch": 0.5137679620584696, + "grad_norm": 10.588597667950674, + "learning_rate": 4.354841819258293e-06, + "loss": 1.4545, + "step": 3629 + }, + { + "epoch": 0.5139095349331069, + "grad_norm": 9.90376245692629, + "learning_rate": 4.35445745739211e-06, + "loss": 1.3612, + "step": 3630 + }, + { + "epoch": 0.514051107807744, + "grad_norm": 7.019821984623312, + "learning_rate": 4.354072998038482e-06, + "loss": 1.2451, + "step": 3631 + }, + { + "epoch": 0.5141926806823812, + "grad_norm": 9.001079494586808, + "learning_rate": 4.353688441217618e-06, + "loss": 1.4295, + "step": 3632 + }, + { + "epoch": 0.5143342535570185, + "grad_norm": 9.289357353131576, + "learning_rate": 4.353303786949735e-06, + "loss": 1.3349, + "step": 3633 + }, + { + "epoch": 0.5144758264316557, + "grad_norm": 10.835440758373545, + "learning_rate": 4.352919035255055e-06, + "loss": 1.3358, + "step": 3634 + }, + { + "epoch": 0.5146173993062929, + "grad_norm": 9.205593017837504, + "learning_rate": 4.352534186153802e-06, + "loss": 1.3733, + "step": 3635 + }, + { + "epoch": 0.5147589721809301, + "grad_norm": 8.234261036035372, + "learning_rate": 4.352149239666208e-06, + "loss": 1.2727, + "step": 3636 + }, + { + "epoch": 0.5149005450555674, + "grad_norm": 8.594434010420503, + "learning_rate": 4.35176419581251e-06, + "loss": 1.3088, + "step": 3637 + }, + { + "epoch": 0.5150421179302046, + "grad_norm": 9.286494253748415, + "learning_rate": 4.351379054612949e-06, + "loss": 1.3359, + "step": 3638 + }, + { + "epoch": 0.5151836908048418, + "grad_norm": 11.955219638774025, + "learning_rate": 4.35099381608777e-06, + "loss": 1.3881, + "step": 3639 + }, + { + "epoch": 0.5153252636794791, + "grad_norm": 9.614671987095223, + "learning_rate": 4.3506084802572276e-06, + "loss": 1.2515, + "step": 3640 + }, + { + "epoch": 0.5154668365541162, + "grad_norm": 9.39151093275718, + "learning_rate": 4.350223047141577e-06, + "loss": 1.5442, + "step": 3641 + }, + { + "epoch": 0.5156084094287534, + "grad_norm": 10.597233292449017, + "learning_rate": 4.349837516761081e-06, + "loss": 1.2808, + "step": 3642 + }, + { + "epoch": 0.5157499823033906, + "grad_norm": 11.097924363181543, + "learning_rate": 4.3494518891360054e-06, + "loss": 1.404, + "step": 3643 + }, + { + "epoch": 0.5158915551780279, + "grad_norm": 9.34133931067636, + "learning_rate": 4.3490661642866225e-06, + "loss": 1.2152, + "step": 3644 + }, + { + "epoch": 0.5160331280526651, + "grad_norm": 11.91366034157324, + "learning_rate": 4.3486803422332115e-06, + "loss": 1.3273, + "step": 3645 + }, + { + "epoch": 0.5161747009273023, + "grad_norm": 8.33580295843862, + "learning_rate": 4.348294422996052e-06, + "loss": 1.1015, + "step": 3646 + }, + { + "epoch": 0.5163162738019396, + "grad_norm": 8.955641467763272, + "learning_rate": 4.347908406595433e-06, + "loss": 1.3313, + "step": 3647 + }, + { + "epoch": 0.5164578466765768, + "grad_norm": 12.988385807849822, + "learning_rate": 4.3475222930516484e-06, + "loss": 1.396, + "step": 3648 + }, + { + "epoch": 0.516599419551214, + "grad_norm": 7.239411349735131, + "learning_rate": 4.347136082384993e-06, + "loss": 1.2539, + "step": 3649 + }, + { + "epoch": 0.5167409924258513, + 
"grad_norm": 8.506620633629561, + "learning_rate": 4.3467497746157715e-06, + "loss": 1.1788, + "step": 3650 + }, + { + "epoch": 0.5168825653004884, + "grad_norm": 14.303134302624242, + "learning_rate": 4.3463633697642905e-06, + "loss": 1.3677, + "step": 3651 + }, + { + "epoch": 0.5170241381751256, + "grad_norm": 10.011149485132657, + "learning_rate": 4.345976867850865e-06, + "loss": 1.3191, + "step": 3652 + }, + { + "epoch": 0.5171657110497628, + "grad_norm": 9.000102572386416, + "learning_rate": 4.345590268895812e-06, + "loss": 1.195, + "step": 3653 + }, + { + "epoch": 0.5173072839244001, + "grad_norm": 7.775654078995617, + "learning_rate": 4.3452035729194544e-06, + "loss": 1.1391, + "step": 3654 + }, + { + "epoch": 0.5174488567990373, + "grad_norm": 7.1321180158041, + "learning_rate": 4.34481677994212e-06, + "loss": 1.2042, + "step": 3655 + }, + { + "epoch": 0.5175904296736745, + "grad_norm": 9.070762918806732, + "learning_rate": 4.3444298899841445e-06, + "loss": 1.3308, + "step": 3656 + }, + { + "epoch": 0.5177320025483118, + "grad_norm": 10.013534922934658, + "learning_rate": 4.344042903065864e-06, + "loss": 1.3021, + "step": 3657 + }, + { + "epoch": 0.517873575422949, + "grad_norm": 7.335698035478601, + "learning_rate": 4.3436558192076225e-06, + "loss": 1.2039, + "step": 3658 + }, + { + "epoch": 0.5180151482975862, + "grad_norm": 10.284138838897249, + "learning_rate": 4.3432686384297705e-06, + "loss": 1.4557, + "step": 3659 + }, + { + "epoch": 0.5181567211722234, + "grad_norm": 8.810776562277079, + "learning_rate": 4.34288136075266e-06, + "loss": 1.2821, + "step": 3660 + }, + { + "epoch": 0.5182982940468607, + "grad_norm": 10.57991316992278, + "learning_rate": 4.34249398619665e-06, + "loss": 1.2703, + "step": 3661 + }, + { + "epoch": 0.5184398669214978, + "grad_norm": 6.48705910891587, + "learning_rate": 4.342106514782106e-06, + "loss": 1.1235, + "step": 3662 + }, + { + "epoch": 0.518581439796135, + "grad_norm": 7.753502085263255, + "learning_rate": 4.341718946529395e-06, + "loss": 1.2455, + "step": 3663 + }, + { + "epoch": 0.5187230126707723, + "grad_norm": 8.439793705882364, + "learning_rate": 4.341331281458893e-06, + "loss": 1.1997, + "step": 3664 + }, + { + "epoch": 0.5188645855454095, + "grad_norm": 10.299057956318027, + "learning_rate": 4.3409435195909785e-06, + "loss": 1.3715, + "step": 3665 + }, + { + "epoch": 0.5190061584200467, + "grad_norm": 8.981694572982525, + "learning_rate": 4.340555660946035e-06, + "loss": 1.355, + "step": 3666 + }, + { + "epoch": 0.519147731294684, + "grad_norm": 9.441629276947932, + "learning_rate": 4.340167705544454e-06, + "loss": 1.4047, + "step": 3667 + }, + { + "epoch": 0.5192893041693212, + "grad_norm": 9.941222831068275, + "learning_rate": 4.339779653406628e-06, + "loss": 1.3555, + "step": 3668 + }, + { + "epoch": 0.5194308770439584, + "grad_norm": 10.917321299563834, + "learning_rate": 4.3393915045529575e-06, + "loss": 1.3726, + "step": 3669 + }, + { + "epoch": 0.5195724499185956, + "grad_norm": 8.371894830137022, + "learning_rate": 4.339003259003848e-06, + "loss": 1.1966, + "step": 3670 + }, + { + "epoch": 0.5197140227932329, + "grad_norm": 8.603376621677885, + "learning_rate": 4.338614916779706e-06, + "loss": 1.2495, + "step": 3671 + }, + { + "epoch": 0.51985559566787, + "grad_norm": 9.122696677272973, + "learning_rate": 4.3382264779009504e-06, + "loss": 1.226, + "step": 3672 + }, + { + "epoch": 0.5199971685425072, + "grad_norm": 8.785167105512413, + "learning_rate": 4.337837942388e-06, + "loss": 1.4481, + "step": 3673 + }, + { + "epoch": 
0.5201387414171444, + "grad_norm": 10.615710696822555, + "learning_rate": 4.337449310261279e-06, + "loss": 1.4717, + "step": 3674 + }, + { + "epoch": 0.5202803142917817, + "grad_norm": 8.372563705224755, + "learning_rate": 4.337060581541217e-06, + "loss": 1.3384, + "step": 3675 + }, + { + "epoch": 0.5204218871664189, + "grad_norm": 10.21246910137232, + "learning_rate": 4.336671756248251e-06, + "loss": 1.3008, + "step": 3676 + }, + { + "epoch": 0.5205634600410561, + "grad_norm": 8.116534708923036, + "learning_rate": 4.33628283440282e-06, + "loss": 1.2539, + "step": 3677 + }, + { + "epoch": 0.5207050329156934, + "grad_norm": 8.300047302111306, + "learning_rate": 4.335893816025369e-06, + "loss": 1.3735, + "step": 3678 + }, + { + "epoch": 0.5208466057903306, + "grad_norm": 11.848857171042514, + "learning_rate": 4.33550470113635e-06, + "loss": 1.2463, + "step": 3679 + }, + { + "epoch": 0.5209881786649678, + "grad_norm": 10.828314885487831, + "learning_rate": 4.335115489756217e-06, + "loss": 1.138, + "step": 3680 + }, + { + "epoch": 0.5211297515396051, + "grad_norm": 12.146072795376535, + "learning_rate": 4.33472618190543e-06, + "loss": 1.3875, + "step": 3681 + }, + { + "epoch": 0.5212713244142422, + "grad_norm": 9.531580184470373, + "learning_rate": 4.334336777604458e-06, + "loss": 1.4485, + "step": 3682 + }, + { + "epoch": 0.5214128972888794, + "grad_norm": 8.865400253529078, + "learning_rate": 4.333947276873767e-06, + "loss": 1.4834, + "step": 3683 + }, + { + "epoch": 0.5215544701635166, + "grad_norm": 8.923370075722556, + "learning_rate": 4.333557679733836e-06, + "loss": 1.3215, + "step": 3684 + }, + { + "epoch": 0.5216960430381539, + "grad_norm": 8.08497833607637, + "learning_rate": 4.333167986205145e-06, + "loss": 1.258, + "step": 3685 + }, + { + "epoch": 0.5218376159127911, + "grad_norm": 8.797134747913619, + "learning_rate": 4.33277819630818e-06, + "loss": 1.3794, + "step": 3686 + }, + { + "epoch": 0.5219791887874283, + "grad_norm": 8.57299206415875, + "learning_rate": 4.332388310063431e-06, + "loss": 1.4255, + "step": 3687 + }, + { + "epoch": 0.5221207616620656, + "grad_norm": 7.5749540321289865, + "learning_rate": 4.331998327491396e-06, + "loss": 1.2958, + "step": 3688 + }, + { + "epoch": 0.5222623345367028, + "grad_norm": 9.085293432943997, + "learning_rate": 4.331608248612574e-06, + "loss": 1.3056, + "step": 3689 + }, + { + "epoch": 0.52240390741134, + "grad_norm": 9.309875118348167, + "learning_rate": 4.331218073447472e-06, + "loss": 1.2508, + "step": 3690 + }, + { + "epoch": 0.5225454802859772, + "grad_norm": 6.781244655905873, + "learning_rate": 4.330827802016603e-06, + "loss": 1.074, + "step": 3691 + }, + { + "epoch": 0.5226870531606145, + "grad_norm": 8.588133374085094, + "learning_rate": 4.3304374343404794e-06, + "loss": 1.1986, + "step": 3692 + }, + { + "epoch": 0.5228286260352516, + "grad_norm": 8.845392836175764, + "learning_rate": 4.330046970439625e-06, + "loss": 1.3916, + "step": 3693 + }, + { + "epoch": 0.5229701989098888, + "grad_norm": 9.157712643815552, + "learning_rate": 4.329656410334567e-06, + "loss": 1.4306, + "step": 3694 + }, + { + "epoch": 0.5231117717845261, + "grad_norm": 7.468568424587415, + "learning_rate": 4.329265754045835e-06, + "loss": 1.3161, + "step": 3695 + }, + { + "epoch": 0.5232533446591633, + "grad_norm": 9.661809906984868, + "learning_rate": 4.328875001593966e-06, + "loss": 1.3855, + "step": 3696 + }, + { + "epoch": 0.5233949175338005, + "grad_norm": 10.046344559165032, + "learning_rate": 4.3284841529995025e-06, + "loss": 1.2471, + "step": 3697 + 
}, + { + "epoch": 0.5235364904084377, + "grad_norm": 9.783668487197572, + "learning_rate": 4.32809320828299e-06, + "loss": 1.3684, + "step": 3698 + }, + { + "epoch": 0.523678063283075, + "grad_norm": 8.446637214703637, + "learning_rate": 4.327702167464981e-06, + "loss": 1.3046, + "step": 3699 + }, + { + "epoch": 0.5238196361577122, + "grad_norm": 9.662885342868043, + "learning_rate": 4.327311030566033e-06, + "loss": 1.2554, + "step": 3700 + }, + { + "epoch": 0.5239612090323494, + "grad_norm": 9.533697095568884, + "learning_rate": 4.326919797606705e-06, + "loss": 1.299, + "step": 3701 + }, + { + "epoch": 0.5241027819069867, + "grad_norm": 10.043671804810923, + "learning_rate": 4.326528468607566e-06, + "loss": 1.461, + "step": 3702 + }, + { + "epoch": 0.5242443547816238, + "grad_norm": 9.063247123702022, + "learning_rate": 4.3261370435891866e-06, + "loss": 1.3196, + "step": 3703 + }, + { + "epoch": 0.524385927656261, + "grad_norm": 8.418371600714407, + "learning_rate": 4.325745522572145e-06, + "loss": 1.2188, + "step": 3704 + }, + { + "epoch": 0.5245275005308982, + "grad_norm": 8.679098553938903, + "learning_rate": 4.325353905577023e-06, + "loss": 1.2621, + "step": 3705 + }, + { + "epoch": 0.5246690734055355, + "grad_norm": 9.902775970596736, + "learning_rate": 4.324962192624407e-06, + "loss": 1.3326, + "step": 3706 + }, + { + "epoch": 0.5248106462801727, + "grad_norm": 8.49646954987845, + "learning_rate": 4.324570383734888e-06, + "loss": 1.3364, + "step": 3707 + }, + { + "epoch": 0.5249522191548099, + "grad_norm": 9.435015016512502, + "learning_rate": 4.3241784789290665e-06, + "loss": 1.3184, + "step": 3708 + }, + { + "epoch": 0.5250937920294472, + "grad_norm": 7.803848236751282, + "learning_rate": 4.323786478227541e-06, + "loss": 1.2536, + "step": 3709 + }, + { + "epoch": 0.5252353649040844, + "grad_norm": 8.806566337497168, + "learning_rate": 4.323394381650921e-06, + "loss": 1.3474, + "step": 3710 + }, + { + "epoch": 0.5253769377787216, + "grad_norm": 11.336184498299778, + "learning_rate": 4.323002189219818e-06, + "loss": 1.2696, + "step": 3711 + }, + { + "epoch": 0.5255185106533589, + "grad_norm": 8.99746456460422, + "learning_rate": 4.322609900954848e-06, + "loss": 1.376, + "step": 3712 + }, + { + "epoch": 0.525660083527996, + "grad_norm": 8.925488062744979, + "learning_rate": 4.322217516876635e-06, + "loss": 1.3037, + "step": 3713 + }, + { + "epoch": 0.5258016564026332, + "grad_norm": 8.48769430351045, + "learning_rate": 4.321825037005807e-06, + "loss": 1.4551, + "step": 3714 + }, + { + "epoch": 0.5259432292772704, + "grad_norm": 10.143781032738074, + "learning_rate": 4.321432461362994e-06, + "loss": 1.2989, + "step": 3715 + }, + { + "epoch": 0.5260848021519077, + "grad_norm": 8.499526964376551, + "learning_rate": 4.3210397899688355e-06, + "loss": 1.2254, + "step": 3716 + }, + { + "epoch": 0.5262263750265449, + "grad_norm": 9.220475374821806, + "learning_rate": 4.320647022843972e-06, + "loss": 1.2873, + "step": 3717 + }, + { + "epoch": 0.5263679479011821, + "grad_norm": 9.261975346376955, + "learning_rate": 4.320254160009053e-06, + "loss": 1.2735, + "step": 3718 + }, + { + "epoch": 0.5265095207758194, + "grad_norm": 8.534135379451435, + "learning_rate": 4.31986120148473e-06, + "loss": 1.3807, + "step": 3719 + }, + { + "epoch": 0.5266510936504566, + "grad_norm": 7.39427330506257, + "learning_rate": 4.31946814729166e-06, + "loss": 1.1963, + "step": 3720 + }, + { + "epoch": 0.5267926665250938, + "grad_norm": 7.837385414660903, + "learning_rate": 4.319074997450506e-06, + "loss": 1.2716, + 
"step": 3721 + }, + { + "epoch": 0.526934239399731, + "grad_norm": 10.246172678880134, + "learning_rate": 4.318681751981937e-06, + "loss": 1.2566, + "step": 3722 + }, + { + "epoch": 0.5270758122743683, + "grad_norm": 9.192920149973514, + "learning_rate": 4.318288410906623e-06, + "loss": 1.2458, + "step": 3723 + }, + { + "epoch": 0.5272173851490054, + "grad_norm": 9.242925877230254, + "learning_rate": 4.3178949742452435e-06, + "loss": 1.4706, + "step": 3724 + }, + { + "epoch": 0.5273589580236426, + "grad_norm": 9.36737645764377, + "learning_rate": 4.317501442018481e-06, + "loss": 1.4173, + "step": 3725 + }, + { + "epoch": 0.5275005308982799, + "grad_norm": 9.726227168541872, + "learning_rate": 4.317107814247022e-06, + "loss": 1.3803, + "step": 3726 + }, + { + "epoch": 0.5276421037729171, + "grad_norm": 7.187997817339006, + "learning_rate": 4.316714090951562e-06, + "loss": 1.2546, + "step": 3727 + }, + { + "epoch": 0.5277836766475543, + "grad_norm": 8.389041800996536, + "learning_rate": 4.316320272152795e-06, + "loss": 1.2617, + "step": 3728 + }, + { + "epoch": 0.5279252495221916, + "grad_norm": 8.012227727158999, + "learning_rate": 4.315926357871426e-06, + "loss": 1.1987, + "step": 3729 + }, + { + "epoch": 0.5280668223968288, + "grad_norm": 11.816103431037186, + "learning_rate": 4.3155323481281625e-06, + "loss": 1.4112, + "step": 3730 + }, + { + "epoch": 0.528208395271466, + "grad_norm": 9.837851267727011, + "learning_rate": 4.3151382429437175e-06, + "loss": 1.3665, + "step": 3731 + }, + { + "epoch": 0.5283499681461032, + "grad_norm": 8.2459026912672, + "learning_rate": 4.314744042338808e-06, + "loss": 1.3858, + "step": 3732 + }, + { + "epoch": 0.5284915410207405, + "grad_norm": 8.227777317117797, + "learning_rate": 4.314349746334158e-06, + "loss": 1.325, + "step": 3733 + }, + { + "epoch": 0.5286331138953776, + "grad_norm": 8.044267724603506, + "learning_rate": 4.313955354950494e-06, + "loss": 1.3163, + "step": 3734 + }, + { + "epoch": 0.5287746867700148, + "grad_norm": 9.149196785447762, + "learning_rate": 4.313560868208549e-06, + "loss": 1.3381, + "step": 3735 + }, + { + "epoch": 0.528916259644652, + "grad_norm": 10.910249605199411, + "learning_rate": 4.313166286129063e-06, + "loss": 1.2926, + "step": 3736 + }, + { + "epoch": 0.5290578325192893, + "grad_norm": 9.420299088251358, + "learning_rate": 4.312771608732776e-06, + "loss": 1.3802, + "step": 3737 + }, + { + "epoch": 0.5291994053939265, + "grad_norm": 7.1822373736466165, + "learning_rate": 4.312376836040437e-06, + "loss": 1.2094, + "step": 3738 + }, + { + "epoch": 0.5293409782685637, + "grad_norm": 9.514509264047438, + "learning_rate": 4.3119819680728e-06, + "loss": 1.3992, + "step": 3739 + }, + { + "epoch": 0.529482551143201, + "grad_norm": 9.179810088942089, + "learning_rate": 4.311587004850622e-06, + "loss": 1.2906, + "step": 3740 + }, + { + "epoch": 0.5296241240178382, + "grad_norm": 8.193832627497429, + "learning_rate": 4.311191946394665e-06, + "loss": 1.2981, + "step": 3741 + }, + { + "epoch": 0.5297656968924754, + "grad_norm": 9.607520146186268, + "learning_rate": 4.3107967927256985e-06, + "loss": 1.2537, + "step": 3742 + }, + { + "epoch": 0.5299072697671127, + "grad_norm": 9.35474945042749, + "learning_rate": 4.310401543864495e-06, + "loss": 1.3894, + "step": 3743 + }, + { + "epoch": 0.5300488426417499, + "grad_norm": 8.19207868603825, + "learning_rate": 4.3100061998318325e-06, + "loss": 1.3194, + "step": 3744 + }, + { + "epoch": 0.530190415516387, + "grad_norm": 9.319360855518532, + "learning_rate": 4.309610760648493e-06, + 
"loss": 1.2224, + "step": 3745 + }, + { + "epoch": 0.5303319883910242, + "grad_norm": 8.166832383738633, + "learning_rate": 4.309215226335265e-06, + "loss": 1.4899, + "step": 3746 + }, + { + "epoch": 0.5304735612656615, + "grad_norm": 7.6209468387460975, + "learning_rate": 4.308819596912942e-06, + "loss": 1.2953, + "step": 3747 + }, + { + "epoch": 0.5306151341402987, + "grad_norm": 8.33983460186852, + "learning_rate": 4.308423872402322e-06, + "loss": 1.3378, + "step": 3748 + }, + { + "epoch": 0.5307567070149359, + "grad_norm": 11.117907717249562, + "learning_rate": 4.308028052824207e-06, + "loss": 1.2586, + "step": 3749 + }, + { + "epoch": 0.5308982798895732, + "grad_norm": 8.57107537768061, + "learning_rate": 4.307632138199405e-06, + "loss": 1.3302, + "step": 3750 + }, + { + "epoch": 0.5310398527642104, + "grad_norm": 8.718196509509175, + "learning_rate": 4.30723612854873e-06, + "loss": 1.3645, + "step": 3751 + }, + { + "epoch": 0.5311814256388476, + "grad_norm": 8.200230609163198, + "learning_rate": 4.306840023892998e-06, + "loss": 1.2245, + "step": 3752 + }, + { + "epoch": 0.5313229985134849, + "grad_norm": 7.938204756560521, + "learning_rate": 4.306443824253035e-06, + "loss": 1.3367, + "step": 3753 + }, + { + "epoch": 0.5314645713881221, + "grad_norm": 8.581486722455251, + "learning_rate": 4.306047529649665e-06, + "loss": 1.27, + "step": 3754 + }, + { + "epoch": 0.5316061442627592, + "grad_norm": 9.008671291997818, + "learning_rate": 4.305651140103725e-06, + "loss": 1.2821, + "step": 3755 + }, + { + "epoch": 0.5317477171373964, + "grad_norm": 8.407587221562652, + "learning_rate": 4.305254655636049e-06, + "loss": 1.3098, + "step": 3756 + }, + { + "epoch": 0.5318892900120337, + "grad_norm": 8.358605122988571, + "learning_rate": 4.304858076267483e-06, + "loss": 1.3409, + "step": 3757 + }, + { + "epoch": 0.5320308628866709, + "grad_norm": 10.766707101594408, + "learning_rate": 4.304461402018873e-06, + "loss": 1.3799, + "step": 3758 + }, + { + "epoch": 0.5321724357613081, + "grad_norm": 8.36639098464267, + "learning_rate": 4.304064632911073e-06, + "loss": 1.2808, + "step": 3759 + }, + { + "epoch": 0.5323140086359454, + "grad_norm": 8.113480603593183, + "learning_rate": 4.303667768964941e-06, + "loss": 1.1976, + "step": 3760 + }, + { + "epoch": 0.5324555815105826, + "grad_norm": 8.901602834322565, + "learning_rate": 4.303270810201339e-06, + "loss": 1.2706, + "step": 3761 + }, + { + "epoch": 0.5325971543852198, + "grad_norm": 9.071849130980599, + "learning_rate": 4.302873756641135e-06, + "loss": 1.2448, + "step": 3762 + }, + { + "epoch": 0.532738727259857, + "grad_norm": 10.29780150718335, + "learning_rate": 4.302476608305201e-06, + "loss": 1.5945, + "step": 3763 + }, + { + "epoch": 0.5328803001344943, + "grad_norm": 7.131010498518458, + "learning_rate": 4.3020793652144165e-06, + "loss": 1.3053, + "step": 3764 + }, + { + "epoch": 0.5330218730091314, + "grad_norm": 9.07283094141536, + "learning_rate": 4.301682027389663e-06, + "loss": 1.3515, + "step": 3765 + }, + { + "epoch": 0.5331634458837686, + "grad_norm": 11.180529932554803, + "learning_rate": 4.301284594851829e-06, + "loss": 1.3664, + "step": 3766 + }, + { + "epoch": 0.5333050187584059, + "grad_norm": 11.172287811975364, + "learning_rate": 4.300887067621807e-06, + "loss": 1.4482, + "step": 3767 + }, + { + "epoch": 0.5334465916330431, + "grad_norm": 10.850151700835907, + "learning_rate": 4.300489445720495e-06, + "loss": 1.3595, + "step": 3768 + }, + { + "epoch": 0.5335881645076803, + "grad_norm": 7.363800338135322, + "learning_rate": 
4.300091729168795e-06, + "loss": 1.1683, + "step": 3769 + }, + { + "epoch": 0.5337297373823175, + "grad_norm": 10.455684696291378, + "learning_rate": 4.299693917987615e-06, + "loss": 1.2372, + "step": 3770 + }, + { + "epoch": 0.5338713102569548, + "grad_norm": 10.216469606508069, + "learning_rate": 4.299296012197868e-06, + "loss": 1.2713, + "step": 3771 + }, + { + "epoch": 0.534012883131592, + "grad_norm": 8.05126122492692, + "learning_rate": 4.29889801182047e-06, + "loss": 1.2125, + "step": 3772 + }, + { + "epoch": 0.5341544560062292, + "grad_norm": 8.856083441805563, + "learning_rate": 4.298499916876347e-06, + "loss": 1.2822, + "step": 3773 + }, + { + "epoch": 0.5342960288808665, + "grad_norm": 9.315437416867665, + "learning_rate": 4.298101727386422e-06, + "loss": 1.2636, + "step": 3774 + }, + { + "epoch": 0.5344376017555037, + "grad_norm": 8.90434250063711, + "learning_rate": 4.297703443371632e-06, + "loss": 1.2247, + "step": 3775 + }, + { + "epoch": 0.5345791746301408, + "grad_norm": 9.398827548665354, + "learning_rate": 4.2973050648529114e-06, + "loss": 1.351, + "step": 3776 + }, + { + "epoch": 0.534720747504778, + "grad_norm": 8.691716111049539, + "learning_rate": 4.296906591851203e-06, + "loss": 1.2276, + "step": 3777 + }, + { + "epoch": 0.5348623203794153, + "grad_norm": 7.4955217342981495, + "learning_rate": 4.2965080243874555e-06, + "loss": 1.2306, + "step": 3778 + }, + { + "epoch": 0.5350038932540525, + "grad_norm": 9.717986932463495, + "learning_rate": 4.296109362482621e-06, + "loss": 1.2148, + "step": 3779 + }, + { + "epoch": 0.5351454661286897, + "grad_norm": 12.919732170990125, + "learning_rate": 4.2957106061576565e-06, + "loss": 1.449, + "step": 3780 + }, + { + "epoch": 0.535287039003327, + "grad_norm": 8.310619629161563, + "learning_rate": 4.295311755433525e-06, + "loss": 1.2079, + "step": 3781 + }, + { + "epoch": 0.5354286118779642, + "grad_norm": 8.333207803416514, + "learning_rate": 4.294912810331191e-06, + "loss": 1.2454, + "step": 3782 + }, + { + "epoch": 0.5355701847526014, + "grad_norm": 8.226709493137134, + "learning_rate": 4.2945137708716315e-06, + "loss": 1.2467, + "step": 3783 + }, + { + "epoch": 0.5357117576272387, + "grad_norm": 8.564012825412403, + "learning_rate": 4.294114637075819e-06, + "loss": 1.25, + "step": 3784 + }, + { + "epoch": 0.5358533305018759, + "grad_norm": 10.986431075906522, + "learning_rate": 4.293715408964738e-06, + "loss": 1.2417, + "step": 3785 + }, + { + "epoch": 0.535994903376513, + "grad_norm": 12.830127332119446, + "learning_rate": 4.293316086559377e-06, + "loss": 1.343, + "step": 3786 + }, + { + "epoch": 0.5361364762511502, + "grad_norm": 11.530465282835376, + "learning_rate": 4.292916669880726e-06, + "loss": 1.2581, + "step": 3787 + }, + { + "epoch": 0.5362780491257875, + "grad_norm": 7.81021768420434, + "learning_rate": 4.292517158949781e-06, + "loss": 1.3628, + "step": 3788 + }, + { + "epoch": 0.5364196220004247, + "grad_norm": 11.09230053526097, + "learning_rate": 4.292117553787547e-06, + "loss": 1.2315, + "step": 3789 + }, + { + "epoch": 0.5365611948750619, + "grad_norm": 12.064321429840138, + "learning_rate": 4.291717854415029e-06, + "loss": 1.4015, + "step": 3790 + }, + { + "epoch": 0.5367027677496992, + "grad_norm": 11.411218015901953, + "learning_rate": 4.29131806085324e-06, + "loss": 1.4319, + "step": 3791 + }, + { + "epoch": 0.5368443406243364, + "grad_norm": 9.096376439607955, + "learning_rate": 4.2909181731231955e-06, + "loss": 1.399, + "step": 3792 + }, + { + "epoch": 0.5369859134989736, + "grad_norm": 
10.253796502388877, + "learning_rate": 4.290518191245918e-06, + "loss": 1.5076, + "step": 3793 + }, + { + "epoch": 0.5371274863736109, + "grad_norm": 8.416439191809895, + "learning_rate": 4.290118115242434e-06, + "loss": 1.3435, + "step": 3794 + }, + { + "epoch": 0.5372690592482481, + "grad_norm": 12.65265610142386, + "learning_rate": 4.289717945133775e-06, + "loss": 1.3518, + "step": 3795 + }, + { + "epoch": 0.5374106321228852, + "grad_norm": 8.239730454267566, + "learning_rate": 4.289317680940979e-06, + "loss": 1.2544, + "step": 3796 + }, + { + "epoch": 0.5375522049975224, + "grad_norm": 10.11796344152822, + "learning_rate": 4.288917322685087e-06, + "loss": 1.4314, + "step": 3797 + }, + { + "epoch": 0.5376937778721597, + "grad_norm": 10.265843869551295, + "learning_rate": 4.288516870387145e-06, + "loss": 1.3034, + "step": 3798 + }, + { + "epoch": 0.5378353507467969, + "grad_norm": 7.357765102659108, + "learning_rate": 4.288116324068205e-06, + "loss": 1.3147, + "step": 3799 + }, + { + "epoch": 0.5379769236214341, + "grad_norm": 10.353789191525902, + "learning_rate": 4.287715683749322e-06, + "loss": 1.119, + "step": 3800 + }, + { + "epoch": 0.5381184964960714, + "grad_norm": 11.382719334081665, + "learning_rate": 4.287314949451559e-06, + "loss": 1.3458, + "step": 3801 + }, + { + "epoch": 0.5382600693707086, + "grad_norm": 10.026398718615196, + "learning_rate": 4.286914121195982e-06, + "loss": 1.2459, + "step": 3802 + }, + { + "epoch": 0.5384016422453458, + "grad_norm": 9.596730517905605, + "learning_rate": 4.286513199003661e-06, + "loss": 1.3503, + "step": 3803 + }, + { + "epoch": 0.538543215119983, + "grad_norm": 9.417496051103955, + "learning_rate": 4.2861121828956745e-06, + "loss": 1.3815, + "step": 3804 + }, + { + "epoch": 0.5386847879946203, + "grad_norm": 8.762975608554052, + "learning_rate": 4.285711072893102e-06, + "loss": 1.3892, + "step": 3805 + }, + { + "epoch": 0.5388263608692575, + "grad_norm": 10.854305176147093, + "learning_rate": 4.28530986901703e-06, + "loss": 1.3796, + "step": 3806 + }, + { + "epoch": 0.5389679337438946, + "grad_norm": 9.194964017398329, + "learning_rate": 4.2849085712885495e-06, + "loss": 1.3069, + "step": 3807 + }, + { + "epoch": 0.5391095066185319, + "grad_norm": 8.462941785631767, + "learning_rate": 4.284507179728756e-06, + "loss": 1.189, + "step": 3808 + }, + { + "epoch": 0.5392510794931691, + "grad_norm": 8.592889694154024, + "learning_rate": 4.2841056943587505e-06, + "loss": 1.4696, + "step": 3809 + }, + { + "epoch": 0.5393926523678063, + "grad_norm": 10.675523617340488, + "learning_rate": 4.283704115199639e-06, + "loss": 1.2997, + "step": 3810 + }, + { + "epoch": 0.5395342252424435, + "grad_norm": 9.256598516132543, + "learning_rate": 4.283302442272532e-06, + "loss": 1.3584, + "step": 3811 + }, + { + "epoch": 0.5396757981170808, + "grad_norm": 9.037229043742473, + "learning_rate": 4.282900675598546e-06, + "loss": 1.3556, + "step": 3812 + }, + { + "epoch": 0.539817370991718, + "grad_norm": 9.087580210083816, + "learning_rate": 4.2824988151988e-06, + "loss": 1.2756, + "step": 3813 + }, + { + "epoch": 0.5399589438663552, + "grad_norm": 9.099506876233734, + "learning_rate": 4.282096861094421e-06, + "loss": 1.3425, + "step": 3814 + }, + { + "epoch": 0.5401005167409925, + "grad_norm": 7.9981274797525606, + "learning_rate": 4.281694813306538e-06, + "loss": 1.3205, + "step": 3815 + }, + { + "epoch": 0.5402420896156297, + "grad_norm": 10.945932505525846, + "learning_rate": 4.281292671856288e-06, + "loss": 1.2949, + "step": 3816 + }, + { + "epoch": 
0.5403836624902668, + "grad_norm": 9.951468001911397, + "learning_rate": 4.28089043676481e-06, + "loss": 1.366, + "step": 3817 + }, + { + "epoch": 0.540525235364904, + "grad_norm": 8.392730164654362, + "learning_rate": 4.28048810805325e-06, + "loss": 1.4103, + "step": 3818 + }, + { + "epoch": 0.5406668082395413, + "grad_norm": 10.772852183950455, + "learning_rate": 4.280085685742758e-06, + "loss": 1.51, + "step": 3819 + }, + { + "epoch": 0.5408083811141785, + "grad_norm": 9.209863964157194, + "learning_rate": 4.279683169854488e-06, + "loss": 1.3649, + "step": 3820 + }, + { + "epoch": 0.5409499539888157, + "grad_norm": 9.89426445554246, + "learning_rate": 4.279280560409601e-06, + "loss": 1.2052, + "step": 3821 + }, + { + "epoch": 0.541091526863453, + "grad_norm": 8.602507966316967, + "learning_rate": 4.278877857429261e-06, + "loss": 1.3298, + "step": 3822 + }, + { + "epoch": 0.5412330997380902, + "grad_norm": 7.288258335446747, + "learning_rate": 4.278475060934639e-06, + "loss": 1.3723, + "step": 3823 + }, + { + "epoch": 0.5413746726127274, + "grad_norm": 10.062815785635973, + "learning_rate": 4.278072170946909e-06, + "loss": 1.3148, + "step": 3824 + }, + { + "epoch": 0.5415162454873647, + "grad_norm": 9.021728565175907, + "learning_rate": 4.277669187487251e-06, + "loss": 1.2265, + "step": 3825 + }, + { + "epoch": 0.5416578183620019, + "grad_norm": 9.875565911024456, + "learning_rate": 4.2772661105768495e-06, + "loss": 1.3717, + "step": 3826 + }, + { + "epoch": 0.541799391236639, + "grad_norm": 9.287798385653032, + "learning_rate": 4.276862940236894e-06, + "loss": 1.1936, + "step": 3827 + }, + { + "epoch": 0.5419409641112762, + "grad_norm": 7.438987775466173, + "learning_rate": 4.276459676488578e-06, + "loss": 1.1359, + "step": 3828 + }, + { + "epoch": 0.5420825369859135, + "grad_norm": 9.610728594959726, + "learning_rate": 4.276056319353101e-06, + "loss": 1.3033, + "step": 3829 + }, + { + "epoch": 0.5422241098605507, + "grad_norm": 9.769129058844467, + "learning_rate": 4.275652868851669e-06, + "loss": 1.1335, + "step": 3830 + }, + { + "epoch": 0.5423656827351879, + "grad_norm": 9.548318157595707, + "learning_rate": 4.275249325005488e-06, + "loss": 1.4988, + "step": 3831 + }, + { + "epoch": 0.5425072556098252, + "grad_norm": 8.376892331543802, + "learning_rate": 4.2748456878357746e-06, + "loss": 1.2484, + "step": 3832 + }, + { + "epoch": 0.5426488284844624, + "grad_norm": 8.369703297523408, + "learning_rate": 4.274441957363747e-06, + "loss": 1.3197, + "step": 3833 + }, + { + "epoch": 0.5427904013590996, + "grad_norm": 10.780061783480837, + "learning_rate": 4.274038133610629e-06, + "loss": 1.2089, + "step": 3834 + }, + { + "epoch": 0.5429319742337368, + "grad_norm": 8.484618237909494, + "learning_rate": 4.273634216597648e-06, + "loss": 1.265, + "step": 3835 + }, + { + "epoch": 0.5430735471083741, + "grad_norm": 9.896298388880451, + "learning_rate": 4.273230206346039e-06, + "loss": 1.2697, + "step": 3836 + }, + { + "epoch": 0.5432151199830113, + "grad_norm": 9.02052256083359, + "learning_rate": 4.27282610287704e-06, + "loss": 1.2435, + "step": 3837 + }, + { + "epoch": 0.5433566928576484, + "grad_norm": 9.389666262414828, + "learning_rate": 4.272421906211895e-06, + "loss": 1.2807, + "step": 3838 + }, + { + "epoch": 0.5434982657322857, + "grad_norm": 8.862092854303704, + "learning_rate": 4.272017616371853e-06, + "loss": 1.133, + "step": 3839 + }, + { + "epoch": 0.5436398386069229, + "grad_norm": 8.87649781388704, + "learning_rate": 4.2716132333781646e-06, + "loss": 1.2745, + "step": 3840 + }, + 
{ + "epoch": 0.5437814114815601, + "grad_norm": 8.060252741407746, + "learning_rate": 4.27120875725209e-06, + "loss": 1.3365, + "step": 3841 + }, + { + "epoch": 0.5439229843561973, + "grad_norm": 10.195015785313576, + "learning_rate": 4.270804188014892e-06, + "loss": 1.2496, + "step": 3842 + }, + { + "epoch": 0.5440645572308346, + "grad_norm": 9.210757757735228, + "learning_rate": 4.270399525687839e-06, + "loss": 1.3056, + "step": 3843 + }, + { + "epoch": 0.5442061301054718, + "grad_norm": 9.63697209081263, + "learning_rate": 4.269994770292201e-06, + "loss": 1.3461, + "step": 3844 + }, + { + "epoch": 0.544347702980109, + "grad_norm": 10.935560653958545, + "learning_rate": 4.269589921849259e-06, + "loss": 1.346, + "step": 3845 + }, + { + "epoch": 0.5444892758547463, + "grad_norm": 7.843649570992402, + "learning_rate": 4.269184980380294e-06, + "loss": 1.2839, + "step": 3846 + }, + { + "epoch": 0.5446308487293835, + "grad_norm": 8.791173357630155, + "learning_rate": 4.268779945906594e-06, + "loss": 1.4653, + "step": 3847 + }, + { + "epoch": 0.5447724216040206, + "grad_norm": 8.60438750370791, + "learning_rate": 4.26837481844945e-06, + "loss": 1.2194, + "step": 3848 + }, + { + "epoch": 0.5449139944786578, + "grad_norm": 11.286698565752527, + "learning_rate": 4.267969598030162e-06, + "loss": 1.2291, + "step": 3849 + }, + { + "epoch": 0.5450555673532951, + "grad_norm": 11.30844937299892, + "learning_rate": 4.267564284670029e-06, + "loss": 1.3229, + "step": 3850 + }, + { + "epoch": 0.5451971402279323, + "grad_norm": 7.571405133110846, + "learning_rate": 4.267158878390361e-06, + "loss": 1.3607, + "step": 3851 + }, + { + "epoch": 0.5453387131025695, + "grad_norm": 9.64490778679715, + "learning_rate": 4.266753379212467e-06, + "loss": 1.4209, + "step": 3852 + }, + { + "epoch": 0.5454802859772068, + "grad_norm": 9.44048418775323, + "learning_rate": 4.266347787157666e-06, + "loss": 1.2349, + "step": 3853 + }, + { + "epoch": 0.545621858851844, + "grad_norm": 10.792723811349312, + "learning_rate": 4.265942102247278e-06, + "loss": 1.2019, + "step": 3854 + }, + { + "epoch": 0.5457634317264812, + "grad_norm": 8.462127686372307, + "learning_rate": 4.265536324502631e-06, + "loss": 1.4757, + "step": 3855 + }, + { + "epoch": 0.5459050046011185, + "grad_norm": 9.020336909800465, + "learning_rate": 4.265130453945056e-06, + "loss": 1.4742, + "step": 3856 + }, + { + "epoch": 0.5460465774757557, + "grad_norm": 9.984397923976031, + "learning_rate": 4.26472449059589e-06, + "loss": 1.1975, + "step": 3857 + }, + { + "epoch": 0.5461881503503928, + "grad_norm": 10.000811353193699, + "learning_rate": 4.264318434476472e-06, + "loss": 1.1722, + "step": 3858 + }, + { + "epoch": 0.54632972322503, + "grad_norm": 9.09923270207997, + "learning_rate": 4.26391228560815e-06, + "loss": 1.2925, + "step": 3859 + }, + { + "epoch": 0.5464712960996673, + "grad_norm": 8.567224688730155, + "learning_rate": 4.263506044012275e-06, + "loss": 1.4225, + "step": 3860 + }, + { + "epoch": 0.5466128689743045, + "grad_norm": 8.330194670565131, + "learning_rate": 4.2630997097102e-06, + "loss": 1.1049, + "step": 3861 + }, + { + "epoch": 0.5467544418489417, + "grad_norm": 9.5470500124609, + "learning_rate": 4.26269328272329e-06, + "loss": 1.2031, + "step": 3862 + }, + { + "epoch": 0.546896014723579, + "grad_norm": 9.326081274101805, + "learning_rate": 4.262286763072908e-06, + "loss": 1.1552, + "step": 3863 + }, + { + "epoch": 0.5470375875982162, + "grad_norm": 8.004953758015642, + "learning_rate": 4.261880150780424e-06, + "loss": 1.2716, + "step": 3864 
+ }, + { + "epoch": 0.5471791604728534, + "grad_norm": 9.31978368418076, + "learning_rate": 4.261473445867215e-06, + "loss": 1.2814, + "step": 3865 + }, + { + "epoch": 0.5473207333474907, + "grad_norm": 10.30458902886044, + "learning_rate": 4.26106664835466e-06, + "loss": 1.2704, + "step": 3866 + }, + { + "epoch": 0.5474623062221279, + "grad_norm": 9.483258000919923, + "learning_rate": 4.260659758264145e-06, + "loss": 1.444, + "step": 3867 + }, + { + "epoch": 0.5476038790967651, + "grad_norm": 8.50272056573912, + "learning_rate": 4.260252775617058e-06, + "loss": 1.2459, + "step": 3868 + }, + { + "epoch": 0.5477454519714022, + "grad_norm": 10.287330568570779, + "learning_rate": 4.259845700434797e-06, + "loss": 1.3593, + "step": 3869 + }, + { + "epoch": 0.5478870248460395, + "grad_norm": 10.38580899022063, + "learning_rate": 4.259438532738759e-06, + "loss": 1.3086, + "step": 3870 + }, + { + "epoch": 0.5480285977206767, + "grad_norm": 13.206299313310083, + "learning_rate": 4.259031272550349e-06, + "loss": 1.1755, + "step": 3871 + }, + { + "epoch": 0.5481701705953139, + "grad_norm": 8.029775996331997, + "learning_rate": 4.258623919890976e-06, + "loss": 1.2079, + "step": 3872 + }, + { + "epoch": 0.5483117434699512, + "grad_norm": 9.207539611946636, + "learning_rate": 4.258216474782056e-06, + "loss": 1.2685, + "step": 3873 + }, + { + "epoch": 0.5484533163445884, + "grad_norm": 8.362834236068592, + "learning_rate": 4.257808937245006e-06, + "loss": 1.2919, + "step": 3874 + }, + { + "epoch": 0.5485948892192256, + "grad_norm": 10.937712051515847, + "learning_rate": 4.257401307301251e-06, + "loss": 1.4021, + "step": 3875 + }, + { + "epoch": 0.5487364620938628, + "grad_norm": 8.617910816481006, + "learning_rate": 4.25699358497222e-06, + "loss": 1.2984, + "step": 3876 + }, + { + "epoch": 0.5488780349685001, + "grad_norm": 9.977715266578716, + "learning_rate": 4.256585770279345e-06, + "loss": 1.4345, + "step": 3877 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 8.197416168277508, + "learning_rate": 4.256177863244067e-06, + "loss": 1.2174, + "step": 3878 + }, + { + "epoch": 0.5491611807177744, + "grad_norm": 9.198009474658006, + "learning_rate": 4.255769863887829e-06, + "loss": 1.2334, + "step": 3879 + }, + { + "epoch": 0.5493027535924117, + "grad_norm": 10.268482571147738, + "learning_rate": 4.2553617722320775e-06, + "loss": 1.4306, + "step": 3880 + }, + { + "epoch": 0.5494443264670489, + "grad_norm": 8.838350083015575, + "learning_rate": 4.254953588298266e-06, + "loss": 1.2747, + "step": 3881 + }, + { + "epoch": 0.5495858993416861, + "grad_norm": 9.648404684281239, + "learning_rate": 4.254545312107854e-06, + "loss": 1.2828, + "step": 3882 + }, + { + "epoch": 0.5497274722163233, + "grad_norm": 9.836843047448543, + "learning_rate": 4.254136943682302e-06, + "loss": 1.3361, + "step": 3883 + }, + { + "epoch": 0.5498690450909606, + "grad_norm": 10.753850912498377, + "learning_rate": 4.253728483043081e-06, + "loss": 1.3186, + "step": 3884 + }, + { + "epoch": 0.5500106179655978, + "grad_norm": 9.270613512726316, + "learning_rate": 4.253319930211659e-06, + "loss": 1.2812, + "step": 3885 + }, + { + "epoch": 0.550152190840235, + "grad_norm": 9.870724295618695, + "learning_rate": 4.252911285209516e-06, + "loss": 1.3674, + "step": 3886 + }, + { + "epoch": 0.5502937637148723, + "grad_norm": 8.822222169267008, + "learning_rate": 4.252502548058134e-06, + "loss": 1.265, + "step": 3887 + }, + { + "epoch": 0.5504353365895095, + "grad_norm": 8.88414712104428, + "learning_rate": 4.252093718779e-06, + "loss": 1.179, 
+ "step": 3888 + }, + { + "epoch": 0.5505769094641467, + "grad_norm": 9.032675449114896, + "learning_rate": 4.2516847973936045e-06, + "loss": 1.1383, + "step": 3889 + }, + { + "epoch": 0.5507184823387838, + "grad_norm": 8.974505023536421, + "learning_rate": 4.251275783923447e-06, + "loss": 1.3363, + "step": 3890 + }, + { + "epoch": 0.5508600552134211, + "grad_norm": 11.909944216503886, + "learning_rate": 4.250866678390026e-06, + "loss": 1.3671, + "step": 3891 + }, + { + "epoch": 0.5510016280880583, + "grad_norm": 9.085628068005905, + "learning_rate": 4.25045748081485e-06, + "loss": 1.2891, + "step": 3892 + }, + { + "epoch": 0.5511432009626955, + "grad_norm": 10.615959360305052, + "learning_rate": 4.250048191219429e-06, + "loss": 1.3517, + "step": 3893 + }, + { + "epoch": 0.5512847738373328, + "grad_norm": 8.26563699934148, + "learning_rate": 4.24963880962528e-06, + "loss": 1.2922, + "step": 3894 + }, + { + "epoch": 0.55142634671197, + "grad_norm": 8.213152353513921, + "learning_rate": 4.249229336053924e-06, + "loss": 1.1266, + "step": 3895 + }, + { + "epoch": 0.5515679195866072, + "grad_norm": 10.072769137659511, + "learning_rate": 4.248819770526884e-06, + "loss": 1.2876, + "step": 3896 + }, + { + "epoch": 0.5517094924612445, + "grad_norm": 10.924830133789195, + "learning_rate": 4.248410113065694e-06, + "loss": 1.3718, + "step": 3897 + }, + { + "epoch": 0.5518510653358817, + "grad_norm": 8.877531738224516, + "learning_rate": 4.248000363691888e-06, + "loss": 1.3678, + "step": 3898 + }, + { + "epoch": 0.5519926382105189, + "grad_norm": 7.9190900071755745, + "learning_rate": 4.247590522427006e-06, + "loss": 1.166, + "step": 3899 + }, + { + "epoch": 0.552134211085156, + "grad_norm": 9.59546837343879, + "learning_rate": 4.2471805892925935e-06, + "loss": 1.2101, + "step": 3900 + }, + { + "epoch": 0.5522757839597933, + "grad_norm": 10.600981008371052, + "learning_rate": 4.2467705643102005e-06, + "loss": 1.2175, + "step": 3901 + }, + { + "epoch": 0.5524173568344305, + "grad_norm": 9.066829515378432, + "learning_rate": 4.246360447501381e-06, + "loss": 1.2571, + "step": 3902 + }, + { + "epoch": 0.5525589297090677, + "grad_norm": 8.946474064868202, + "learning_rate": 4.245950238887695e-06, + "loss": 1.3015, + "step": 3903 + }, + { + "epoch": 0.552700502583705, + "grad_norm": 8.60411395656398, + "learning_rate": 4.245539938490706e-06, + "loss": 1.2021, + "step": 3904 + }, + { + "epoch": 0.5528420754583422, + "grad_norm": 10.611472819911233, + "learning_rate": 4.245129546331985e-06, + "loss": 1.3199, + "step": 3905 + }, + { + "epoch": 0.5529836483329794, + "grad_norm": 9.527245252412126, + "learning_rate": 4.244719062433105e-06, + "loss": 1.3476, + "step": 3906 + }, + { + "epoch": 0.5531252212076166, + "grad_norm": 10.188152643372986, + "learning_rate": 4.2443084868156434e-06, + "loss": 1.2356, + "step": 3907 + }, + { + "epoch": 0.5532667940822539, + "grad_norm": 7.848986576048916, + "learning_rate": 4.243897819501187e-06, + "loss": 1.3208, + "step": 3908 + }, + { + "epoch": 0.5534083669568911, + "grad_norm": 8.180269033023102, + "learning_rate": 4.243487060511321e-06, + "loss": 1.15, + "step": 3909 + }, + { + "epoch": 0.5535499398315282, + "grad_norm": 9.1096928379103, + "learning_rate": 4.243076209867642e-06, + "loss": 1.222, + "step": 3910 + }, + { + "epoch": 0.5536915127061655, + "grad_norm": 11.483003744665417, + "learning_rate": 4.242665267591744e-06, + "loss": 1.3441, + "step": 3911 + }, + { + "epoch": 0.5538330855808027, + "grad_norm": 8.968909072794455, + "learning_rate": 
4.242254233705234e-06, + "loss": 1.1886, + "step": 3912 + }, + { + "epoch": 0.5539746584554399, + "grad_norm": 8.052105020655896, + "learning_rate": 4.241843108229718e-06, + "loss": 1.2047, + "step": 3913 + }, + { + "epoch": 0.5541162313300771, + "grad_norm": 9.475547641840706, + "learning_rate": 4.241431891186808e-06, + "loss": 1.3334, + "step": 3914 + }, + { + "epoch": 0.5542578042047144, + "grad_norm": 8.920308257793701, + "learning_rate": 4.241020582598122e-06, + "loss": 1.4727, + "step": 3915 + }, + { + "epoch": 0.5543993770793516, + "grad_norm": 12.93356130602243, + "learning_rate": 4.240609182485282e-06, + "loss": 1.4707, + "step": 3916 + }, + { + "epoch": 0.5545409499539888, + "grad_norm": 9.251476685471157, + "learning_rate": 4.240197690869916e-06, + "loss": 1.281, + "step": 3917 + }, + { + "epoch": 0.5546825228286261, + "grad_norm": 9.269773638450157, + "learning_rate": 4.239786107773655e-06, + "loss": 1.2668, + "step": 3918 + }, + { + "epoch": 0.5548240957032633, + "grad_norm": 9.605753096645309, + "learning_rate": 4.239374433218134e-06, + "loss": 1.2175, + "step": 3919 + }, + { + "epoch": 0.5549656685779005, + "grad_norm": 9.44602934560068, + "learning_rate": 4.238962667224997e-06, + "loss": 1.1684, + "step": 3920 + }, + { + "epoch": 0.5551072414525376, + "grad_norm": 8.4638518072182, + "learning_rate": 4.238550809815889e-06, + "loss": 1.2356, + "step": 3921 + }, + { + "epoch": 0.5552488143271749, + "grad_norm": 9.708291464348312, + "learning_rate": 4.238138861012461e-06, + "loss": 1.3439, + "step": 3922 + }, + { + "epoch": 0.5553903872018121, + "grad_norm": 8.007427105817477, + "learning_rate": 4.23772682083637e-06, + "loss": 1.0959, + "step": 3923 + }, + { + "epoch": 0.5555319600764493, + "grad_norm": 10.06603145826833, + "learning_rate": 4.237314689309275e-06, + "loss": 1.235, + "step": 3924 + }, + { + "epoch": 0.5556735329510866, + "grad_norm": 10.216550257788677, + "learning_rate": 4.236902466452843e-06, + "loss": 1.303, + "step": 3925 + }, + { + "epoch": 0.5558151058257238, + "grad_norm": 9.261371530422032, + "learning_rate": 4.2364901522887415e-06, + "loss": 1.3251, + "step": 3926 + }, + { + "epoch": 0.555956678700361, + "grad_norm": 7.663141628863676, + "learning_rate": 4.236077746838649e-06, + "loss": 1.2319, + "step": 3927 + }, + { + "epoch": 0.5560982515749983, + "grad_norm": 9.258850400021618, + "learning_rate": 4.2356652501242435e-06, + "loss": 1.4111, + "step": 3928 + }, + { + "epoch": 0.5562398244496355, + "grad_norm": 10.22504569418847, + "learning_rate": 4.235252662167211e-06, + "loss": 1.3654, + "step": 3929 + }, + { + "epoch": 0.5563813973242727, + "grad_norm": 10.49624239578871, + "learning_rate": 4.234839982989238e-06, + "loss": 1.2713, + "step": 3930 + }, + { + "epoch": 0.5565229701989098, + "grad_norm": 10.477913652035491, + "learning_rate": 4.234427212612021e-06, + "loss": 1.3615, + "step": 3931 + }, + { + "epoch": 0.5566645430735471, + "grad_norm": 8.923713775189336, + "learning_rate": 4.23401435105726e-06, + "loss": 1.4025, + "step": 3932 + }, + { + "epoch": 0.5568061159481843, + "grad_norm": 7.778498764529385, + "learning_rate": 4.2336013983466565e-06, + "loss": 1.1741, + "step": 3933 + }, + { + "epoch": 0.5569476888228215, + "grad_norm": 8.31098086594486, + "learning_rate": 4.233188354501921e-06, + "loss": 1.444, + "step": 3934 + }, + { + "epoch": 0.5570892616974588, + "grad_norm": 10.002611963569816, + "learning_rate": 4.2327752195447645e-06, + "loss": 1.3012, + "step": 3935 + }, + { + "epoch": 0.557230834572096, + "grad_norm": 9.265312922916868, + 
"learning_rate": 4.232361993496908e-06, + "loss": 1.1693, + "step": 3936 + }, + { + "epoch": 0.5573724074467332, + "grad_norm": 7.822657215541612, + "learning_rate": 4.231948676380073e-06, + "loss": 1.258, + "step": 3937 + }, + { + "epoch": 0.5575139803213705, + "grad_norm": 10.559329307527287, + "learning_rate": 4.231535268215987e-06, + "loss": 1.3802, + "step": 3938 + }, + { + "epoch": 0.5576555531960077, + "grad_norm": 8.962176209882966, + "learning_rate": 4.231121769026383e-06, + "loss": 1.2706, + "step": 3939 + }, + { + "epoch": 0.5577971260706449, + "grad_norm": 8.705316291685286, + "learning_rate": 4.230708178832999e-06, + "loss": 1.2669, + "step": 3940 + }, + { + "epoch": 0.557938698945282, + "grad_norm": 7.607993750568635, + "learning_rate": 4.230294497657576e-06, + "loss": 1.3831, + "step": 3941 + }, + { + "epoch": 0.5580802718199193, + "grad_norm": 8.035168593512639, + "learning_rate": 4.2298807255218615e-06, + "loss": 1.1535, + "step": 3942 + }, + { + "epoch": 0.5582218446945565, + "grad_norm": 10.220256542711056, + "learning_rate": 4.229466862447608e-06, + "loss": 1.4397, + "step": 3943 + }, + { + "epoch": 0.5583634175691937, + "grad_norm": 9.789522712770406, + "learning_rate": 4.22905290845657e-06, + "loss": 1.3219, + "step": 3944 + }, + { + "epoch": 0.558504990443831, + "grad_norm": 9.173154673426021, + "learning_rate": 4.22863886357051e-06, + "loss": 1.299, + "step": 3945 + }, + { + "epoch": 0.5586465633184682, + "grad_norm": 9.434174008422328, + "learning_rate": 4.228224727811194e-06, + "loss": 1.3329, + "step": 3946 + }, + { + "epoch": 0.5587881361931054, + "grad_norm": 9.233007959171328, + "learning_rate": 4.227810501200393e-06, + "loss": 1.3644, + "step": 3947 + }, + { + "epoch": 0.5589297090677426, + "grad_norm": 11.008831293787644, + "learning_rate": 4.227396183759882e-06, + "loss": 1.3278, + "step": 3948 + }, + { + "epoch": 0.5590712819423799, + "grad_norm": 8.857562920941488, + "learning_rate": 4.226981775511442e-06, + "loss": 1.3322, + "step": 3949 + }, + { + "epoch": 0.5592128548170171, + "grad_norm": 8.709322737806284, + "learning_rate": 4.2265672764768565e-06, + "loss": 1.305, + "step": 3950 + }, + { + "epoch": 0.5593544276916543, + "grad_norm": 11.057660957215434, + "learning_rate": 4.226152686677918e-06, + "loss": 1.3068, + "step": 3951 + }, + { + "epoch": 0.5594960005662915, + "grad_norm": 7.660894238736714, + "learning_rate": 4.22573800613642e-06, + "loss": 1.1924, + "step": 3952 + }, + { + "epoch": 0.5596375734409287, + "grad_norm": 9.41320340997155, + "learning_rate": 4.22532323487416e-06, + "loss": 1.4382, + "step": 3953 + }, + { + "epoch": 0.5597791463155659, + "grad_norm": 10.710179942780526, + "learning_rate": 4.224908372912946e-06, + "loss": 1.4634, + "step": 3954 + }, + { + "epoch": 0.5599207191902031, + "grad_norm": 10.883030173814651, + "learning_rate": 4.224493420274584e-06, + "loss": 1.3562, + "step": 3955 + }, + { + "epoch": 0.5600622920648404, + "grad_norm": 9.825760937603238, + "learning_rate": 4.224078376980888e-06, + "loss": 1.1998, + "step": 3956 + }, + { + "epoch": 0.5602038649394776, + "grad_norm": 8.823075678509786, + "learning_rate": 4.223663243053679e-06, + "loss": 1.2976, + "step": 3957 + }, + { + "epoch": 0.5603454378141148, + "grad_norm": 8.931672016741794, + "learning_rate": 4.2232480185147775e-06, + "loss": 1.1244, + "step": 3958 + }, + { + "epoch": 0.5604870106887521, + "grad_norm": 8.269618434383352, + "learning_rate": 4.222832703386013e-06, + "loss": 1.2311, + "step": 3959 + }, + { + "epoch": 0.5606285835633893, + "grad_norm": 
9.878983092697384, + "learning_rate": 4.222417297689217e-06, + "loss": 1.4189, + "step": 3960 + }, + { + "epoch": 0.5607701564380265, + "grad_norm": 9.339911141146457, + "learning_rate": 4.2220018014462284e-06, + "loss": 1.1555, + "step": 3961 + }, + { + "epoch": 0.5609117293126636, + "grad_norm": 10.378360066311808, + "learning_rate": 4.221586214678889e-06, + "loss": 1.2975, + "step": 3962 + }, + { + "epoch": 0.5610533021873009, + "grad_norm": 7.9985571990732, + "learning_rate": 4.221170537409046e-06, + "loss": 1.2423, + "step": 3963 + }, + { + "epoch": 0.5611948750619381, + "grad_norm": 7.330204585240074, + "learning_rate": 4.220754769658551e-06, + "loss": 1.3254, + "step": 3964 + }, + { + "epoch": 0.5613364479365753, + "grad_norm": 9.173440776831514, + "learning_rate": 4.220338911449262e-06, + "loss": 1.2561, + "step": 3965 + }, + { + "epoch": 0.5614780208112126, + "grad_norm": 8.968869942849981, + "learning_rate": 4.219922962803038e-06, + "loss": 1.2153, + "step": 3966 + }, + { + "epoch": 0.5616195936858498, + "grad_norm": 11.015273847461547, + "learning_rate": 4.2195069237417466e-06, + "loss": 1.2902, + "step": 3967 + }, + { + "epoch": 0.561761166560487, + "grad_norm": 9.710709266098105, + "learning_rate": 4.219090794287258e-06, + "loss": 1.4394, + "step": 3968 + }, + { + "epoch": 0.5619027394351243, + "grad_norm": 8.141751436581574, + "learning_rate": 4.218674574461449e-06, + "loss": 1.2047, + "step": 3969 + }, + { + "epoch": 0.5620443123097615, + "grad_norm": 10.936495140782327, + "learning_rate": 4.218258264286198e-06, + "loss": 1.3615, + "step": 3970 + }, + { + "epoch": 0.5621858851843987, + "grad_norm": 8.813053194904189, + "learning_rate": 4.217841863783393e-06, + "loss": 1.2381, + "step": 3971 + }, + { + "epoch": 0.5623274580590358, + "grad_norm": 7.62163907382335, + "learning_rate": 4.21742537297492e-06, + "loss": 1.1516, + "step": 3972 + }, + { + "epoch": 0.5624690309336731, + "grad_norm": 8.123646784847365, + "learning_rate": 4.217008791882678e-06, + "loss": 1.2296, + "step": 3973 + }, + { + "epoch": 0.5626106038083103, + "grad_norm": 11.302506727500555, + "learning_rate": 4.216592120528562e-06, + "loss": 1.2831, + "step": 3974 + }, + { + "epoch": 0.5627521766829475, + "grad_norm": 9.564798714593193, + "learning_rate": 4.216175358934479e-06, + "loss": 1.2436, + "step": 3975 + }, + { + "epoch": 0.5628937495575848, + "grad_norm": 8.25523071927827, + "learning_rate": 4.215758507122337e-06, + "loss": 1.156, + "step": 3976 + }, + { + "epoch": 0.563035322432222, + "grad_norm": 9.74254499206256, + "learning_rate": 4.21534156511405e-06, + "loss": 1.3321, + "step": 3977 + }, + { + "epoch": 0.5631768953068592, + "grad_norm": 7.734892610126242, + "learning_rate": 4.214924532931534e-06, + "loss": 1.2227, + "step": 3978 + }, + { + "epoch": 0.5633184681814964, + "grad_norm": 11.81290689403372, + "learning_rate": 4.214507410596716e-06, + "loss": 1.4239, + "step": 3979 + }, + { + "epoch": 0.5634600410561337, + "grad_norm": 9.190044855085125, + "learning_rate": 4.214090198131522e-06, + "loss": 1.3598, + "step": 3980 + }, + { + "epoch": 0.5636016139307709, + "grad_norm": 8.522732125945588, + "learning_rate": 4.2136728955578835e-06, + "loss": 1.3838, + "step": 3981 + }, + { + "epoch": 0.5637431868054081, + "grad_norm": 8.72695591380433, + "learning_rate": 4.2132555028977386e-06, + "loss": 1.3263, + "step": 3982 + }, + { + "epoch": 0.5638847596800453, + "grad_norm": 9.908969060738846, + "learning_rate": 4.212838020173029e-06, + "loss": 1.3238, + "step": 3983 + }, + { + "epoch": 
0.5640263325546825, + "grad_norm": 8.703608008084833, + "learning_rate": 4.212420447405703e-06, + "loss": 1.4024, + "step": 3984 + }, + { + "epoch": 0.5641679054293197, + "grad_norm": 8.65483924579801, + "learning_rate": 4.21200278461771e-06, + "loss": 1.2956, + "step": 3985 + }, + { + "epoch": 0.564309478303957, + "grad_norm": 9.057567437952997, + "learning_rate": 4.211585031831007e-06, + "loss": 1.2764, + "step": 3986 + }, + { + "epoch": 0.5644510511785942, + "grad_norm": 10.333656018869052, + "learning_rate": 4.211167189067556e-06, + "loss": 1.1176, + "step": 3987 + }, + { + "epoch": 0.5645926240532314, + "grad_norm": 10.712143711143144, + "learning_rate": 4.210749256349322e-06, + "loss": 1.3899, + "step": 3988 + }, + { + "epoch": 0.5647341969278686, + "grad_norm": 8.730181675229876, + "learning_rate": 4.210331233698274e-06, + "loss": 1.2285, + "step": 3989 + }, + { + "epoch": 0.5648757698025059, + "grad_norm": 11.994088942294203, + "learning_rate": 4.209913121136389e-06, + "loss": 1.5159, + "step": 3990 + }, + { + "epoch": 0.5650173426771431, + "grad_norm": 7.743951467582645, + "learning_rate": 4.209494918685646e-06, + "loss": 1.2967, + "step": 3991 + }, + { + "epoch": 0.5651589155517803, + "grad_norm": 6.975223288841671, + "learning_rate": 4.20907662636803e-06, + "loss": 1.2729, + "step": 3992 + }, + { + "epoch": 0.5653004884264174, + "grad_norm": 12.26026890918735, + "learning_rate": 4.208658244205529e-06, + "loss": 1.3463, + "step": 3993 + }, + { + "epoch": 0.5654420613010547, + "grad_norm": 10.536044062517234, + "learning_rate": 4.208239772220139e-06, + "loss": 1.2454, + "step": 3994 + }, + { + "epoch": 0.5655836341756919, + "grad_norm": 8.749478133851532, + "learning_rate": 4.207821210433858e-06, + "loss": 1.3127, + "step": 3995 + }, + { + "epoch": 0.5657252070503291, + "grad_norm": 11.028668585179208, + "learning_rate": 4.20740255886869e-06, + "loss": 1.389, + "step": 3996 + }, + { + "epoch": 0.5658667799249664, + "grad_norm": 8.687641828737739, + "learning_rate": 4.206983817546641e-06, + "loss": 1.3978, + "step": 3997 + }, + { + "epoch": 0.5660083527996036, + "grad_norm": 9.167787749198574, + "learning_rate": 4.206564986489726e-06, + "loss": 1.2349, + "step": 3998 + }, + { + "epoch": 0.5661499256742408, + "grad_norm": 9.44003282121028, + "learning_rate": 4.206146065719963e-06, + "loss": 1.3295, + "step": 3999 + }, + { + "epoch": 0.5662914985488781, + "grad_norm": 9.429496959161291, + "learning_rate": 4.205727055259372e-06, + "loss": 1.3879, + "step": 4000 + }, + { + "epoch": 0.5664330714235153, + "grad_norm": 7.914240602558501, + "learning_rate": 4.2053079551299835e-06, + "loss": 1.3111, + "step": 4001 + }, + { + "epoch": 0.5665746442981525, + "grad_norm": 9.314233400965227, + "learning_rate": 4.204888765353826e-06, + "loss": 1.3822, + "step": 4002 + }, + { + "epoch": 0.5667162171727896, + "grad_norm": 9.724507972018511, + "learning_rate": 4.204469485952938e-06, + "loss": 1.324, + "step": 4003 + }, + { + "epoch": 0.5668577900474269, + "grad_norm": 10.77763328149509, + "learning_rate": 4.204050116949359e-06, + "loss": 1.4136, + "step": 4004 + }, + { + "epoch": 0.5669993629220641, + "grad_norm": 7.25211145632437, + "learning_rate": 4.203630658365136e-06, + "loss": 1.289, + "step": 4005 + }, + { + "epoch": 0.5671409357967013, + "grad_norm": 8.932308796828204, + "learning_rate": 4.203211110222321e-06, + "loss": 1.2495, + "step": 4006 + }, + { + "epoch": 0.5672825086713386, + "grad_norm": 9.763473591142724, + "learning_rate": 4.202791472542968e-06, + "loss": 1.2553, + "step": 4007 + 
}, + { + "epoch": 0.5674240815459758, + "grad_norm": 10.20450263778067, + "learning_rate": 4.202371745349135e-06, + "loss": 1.5306, + "step": 4008 + }, + { + "epoch": 0.567565654420613, + "grad_norm": 11.344267862232368, + "learning_rate": 4.2019519286628895e-06, + "loss": 1.2417, + "step": 4009 + }, + { + "epoch": 0.5677072272952502, + "grad_norm": 8.322937801049381, + "learning_rate": 4.2015320225063e-06, + "loss": 1.2648, + "step": 4010 + }, + { + "epoch": 0.5678488001698875, + "grad_norm": 7.998736520175512, + "learning_rate": 4.201112026901442e-06, + "loss": 1.1849, + "step": 4011 + }, + { + "epoch": 0.5679903730445247, + "grad_norm": 11.424821674449397, + "learning_rate": 4.200691941870392e-06, + "loss": 1.4517, + "step": 4012 + }, + { + "epoch": 0.5681319459191619, + "grad_norm": 8.34412763219636, + "learning_rate": 4.200271767435235e-06, + "loss": 1.2483, + "step": 4013 + }, + { + "epoch": 0.5682735187937991, + "grad_norm": 13.818855584067746, + "learning_rate": 4.199851503618059e-06, + "loss": 1.5549, + "step": 4014 + }, + { + "epoch": 0.5684150916684363, + "grad_norm": 8.583192645157553, + "learning_rate": 4.1994311504409566e-06, + "loss": 1.1992, + "step": 4015 + }, + { + "epoch": 0.5685566645430735, + "grad_norm": 9.186966445896456, + "learning_rate": 4.199010707926026e-06, + "loss": 1.3832, + "step": 4016 + }, + { + "epoch": 0.5686982374177108, + "grad_norm": 9.861566605293731, + "learning_rate": 4.19859017609537e-06, + "loss": 1.2229, + "step": 4017 + }, + { + "epoch": 0.568839810292348, + "grad_norm": 9.354184655481612, + "learning_rate": 4.198169554971095e-06, + "loss": 1.2292, + "step": 4018 + }, + { + "epoch": 0.5689813831669852, + "grad_norm": 10.081232109455994, + "learning_rate": 4.197748844575311e-06, + "loss": 1.2947, + "step": 4019 + }, + { + "epoch": 0.5691229560416224, + "grad_norm": 9.903906511916261, + "learning_rate": 4.197328044930137e-06, + "loss": 1.2948, + "step": 4020 + }, + { + "epoch": 0.5692645289162597, + "grad_norm": 10.387209039645866, + "learning_rate": 4.196907156057694e-06, + "loss": 1.4249, + "step": 4021 + }, + { + "epoch": 0.5694061017908969, + "grad_norm": 12.67081109224471, + "learning_rate": 4.196486177980107e-06, + "loss": 1.3234, + "step": 4022 + }, + { + "epoch": 0.5695476746655341, + "grad_norm": 9.520234893978374, + "learning_rate": 4.196065110719505e-06, + "loss": 1.2537, + "step": 4023 + }, + { + "epoch": 0.5696892475401713, + "grad_norm": 8.172388939533159, + "learning_rate": 4.195643954298026e-06, + "loss": 1.2794, + "step": 4024 + }, + { + "epoch": 0.5698308204148085, + "grad_norm": 10.854697382296315, + "learning_rate": 4.195222708737809e-06, + "loss": 1.3495, + "step": 4025 + }, + { + "epoch": 0.5699723932894457, + "grad_norm": 9.63085085903852, + "learning_rate": 4.1948013740609976e-06, + "loss": 1.4094, + "step": 4026 + }, + { + "epoch": 0.5701139661640829, + "grad_norm": 9.714263371033178, + "learning_rate": 4.194379950289742e-06, + "loss": 1.3291, + "step": 4027 + }, + { + "epoch": 0.5702555390387202, + "grad_norm": 10.67618716068799, + "learning_rate": 4.193958437446195e-06, + "loss": 1.289, + "step": 4028 + }, + { + "epoch": 0.5703971119133574, + "grad_norm": 11.130568578683015, + "learning_rate": 4.193536835552517e-06, + "loss": 1.4433, + "step": 4029 + }, + { + "epoch": 0.5705386847879946, + "grad_norm": 7.7055558708036225, + "learning_rate": 4.19311514463087e-06, + "loss": 1.1338, + "step": 4030 + }, + { + "epoch": 0.5706802576626319, + "grad_norm": 10.845592858873562, + "learning_rate": 4.192693364703422e-06, + "loss": 
1.1937, + "step": 4031 + }, + { + "epoch": 0.5708218305372691, + "grad_norm": 11.160384579393353, + "learning_rate": 4.192271495792346e-06, + "loss": 1.476, + "step": 4032 + }, + { + "epoch": 0.5709634034119063, + "grad_norm": 11.97586748329903, + "learning_rate": 4.191849537919819e-06, + "loss": 1.4384, + "step": 4033 + }, + { + "epoch": 0.5711049762865436, + "grad_norm": 11.02026269561534, + "learning_rate": 4.191427491108024e-06, + "loss": 1.3535, + "step": 4034 + }, + { + "epoch": 0.5712465491611807, + "grad_norm": 11.076597380129169, + "learning_rate": 4.191005355379147e-06, + "loss": 1.4715, + "step": 4035 + }, + { + "epoch": 0.5713881220358179, + "grad_norm": 13.630783673355127, + "learning_rate": 4.190583130755379e-06, + "loss": 1.5196, + "step": 4036 + }, + { + "epoch": 0.5715296949104551, + "grad_norm": 9.454915733022409, + "learning_rate": 4.190160817258916e-06, + "loss": 1.1094, + "step": 4037 + }, + { + "epoch": 0.5716712677850924, + "grad_norm": 7.027689528214836, + "learning_rate": 4.189738414911959e-06, + "loss": 1.2616, + "step": 4038 + }, + { + "epoch": 0.5718128406597296, + "grad_norm": 9.746620962067965, + "learning_rate": 4.189315923736715e-06, + "loss": 1.2875, + "step": 4039 + }, + { + "epoch": 0.5719544135343668, + "grad_norm": 9.14818230589843, + "learning_rate": 4.18889334375539e-06, + "loss": 1.1365, + "step": 4040 + }, + { + "epoch": 0.572095986409004, + "grad_norm": 8.50184611299814, + "learning_rate": 4.188470674990203e-06, + "loss": 1.2362, + "step": 4041 + }, + { + "epoch": 0.5722375592836413, + "grad_norm": 11.061030230973415, + "learning_rate": 4.1880479174633715e-06, + "loss": 1.2669, + "step": 4042 + }, + { + "epoch": 0.5723791321582785, + "grad_norm": 12.490187799219258, + "learning_rate": 4.187625071197119e-06, + "loss": 1.2018, + "step": 4043 + }, + { + "epoch": 0.5725207050329157, + "grad_norm": 8.406156518125663, + "learning_rate": 4.187202136213675e-06, + "loss": 1.1499, + "step": 4044 + }, + { + "epoch": 0.5726622779075529, + "grad_norm": 11.233616067137405, + "learning_rate": 4.186779112535273e-06, + "loss": 1.33, + "step": 4045 + }, + { + "epoch": 0.5728038507821901, + "grad_norm": 8.913889182008104, + "learning_rate": 4.186356000184151e-06, + "loss": 1.3218, + "step": 4046 + }, + { + "epoch": 0.5729454236568273, + "grad_norm": 9.400851807654062, + "learning_rate": 4.185932799182551e-06, + "loss": 1.2802, + "step": 4047 + }, + { + "epoch": 0.5730869965314646, + "grad_norm": 12.035056089326805, + "learning_rate": 4.185509509552721e-06, + "loss": 1.3415, + "step": 4048 + }, + { + "epoch": 0.5732285694061018, + "grad_norm": 9.969855459498937, + "learning_rate": 4.185086131316914e-06, + "loss": 1.2827, + "step": 4049 + }, + { + "epoch": 0.573370142280739, + "grad_norm": 11.782683672299973, + "learning_rate": 4.184662664497383e-06, + "loss": 1.3546, + "step": 4050 + }, + { + "epoch": 0.5735117151553762, + "grad_norm": 9.273937285789769, + "learning_rate": 4.184239109116393e-06, + "loss": 1.2622, + "step": 4051 + }, + { + "epoch": 0.5736532880300135, + "grad_norm": 11.027567910845995, + "learning_rate": 4.183815465196209e-06, + "loss": 1.2695, + "step": 4052 + }, + { + "epoch": 0.5737948609046507, + "grad_norm": 12.565017068458923, + "learning_rate": 4.183391732759102e-06, + "loss": 1.2512, + "step": 4053 + }, + { + "epoch": 0.5739364337792879, + "grad_norm": 10.499490634734011, + "learning_rate": 4.182967911827347e-06, + "loss": 1.1947, + "step": 4054 + }, + { + "epoch": 0.574078006653925, + "grad_norm": 9.128055008852044, + "learning_rate": 
4.182544002423223e-06, + "loss": 1.4035, + "step": 4055 + }, + { + "epoch": 0.5742195795285623, + "grad_norm": 12.010612881004189, + "learning_rate": 4.182120004569015e-06, + "loss": 1.3527, + "step": 4056 + }, + { + "epoch": 0.5743611524031995, + "grad_norm": 12.484041937633258, + "learning_rate": 4.181695918287013e-06, + "loss": 1.3475, + "step": 4057 + }, + { + "epoch": 0.5745027252778367, + "grad_norm": 8.742132955454283, + "learning_rate": 4.181271743599511e-06, + "loss": 1.125, + "step": 4058 + }, + { + "epoch": 0.574644298152474, + "grad_norm": 9.796007425201829, + "learning_rate": 4.180847480528806e-06, + "loss": 1.3344, + "step": 4059 + }, + { + "epoch": 0.5747858710271112, + "grad_norm": 8.785474962260151, + "learning_rate": 4.180423129097203e-06, + "loss": 1.274, + "step": 4060 + }, + { + "epoch": 0.5749274439017484, + "grad_norm": 9.487116057778044, + "learning_rate": 4.179998689327009e-06, + "loss": 1.2426, + "step": 4061 + }, + { + "epoch": 0.5750690167763857, + "grad_norm": 12.359359567586763, + "learning_rate": 4.1795741612405365e-06, + "loss": 1.2552, + "step": 4062 + }, + { + "epoch": 0.5752105896510229, + "grad_norm": 11.006467131899587, + "learning_rate": 4.179149544860102e-06, + "loss": 1.4515, + "step": 4063 + }, + { + "epoch": 0.5753521625256601, + "grad_norm": 9.084932752243848, + "learning_rate": 4.178724840208029e-06, + "loss": 1.3155, + "step": 4064 + }, + { + "epoch": 0.5754937354002974, + "grad_norm": 7.823565162026991, + "learning_rate": 4.178300047306643e-06, + "loss": 1.1647, + "step": 4065 + }, + { + "epoch": 0.5756353082749345, + "grad_norm": 8.734492920436105, + "learning_rate": 4.177875166178274e-06, + "loss": 1.2078, + "step": 4066 + }, + { + "epoch": 0.5757768811495717, + "grad_norm": 11.718471350853799, + "learning_rate": 4.17745019684526e-06, + "loss": 1.3783, + "step": 4067 + }, + { + "epoch": 0.5759184540242089, + "grad_norm": 8.271578687109903, + "learning_rate": 4.177025139329939e-06, + "loss": 1.2457, + "step": 4068 + }, + { + "epoch": 0.5760600268988462, + "grad_norm": 10.270998032018625, + "learning_rate": 4.176599993654657e-06, + "loss": 1.4844, + "step": 4069 + }, + { + "epoch": 0.5762015997734834, + "grad_norm": 8.147896299740696, + "learning_rate": 4.176174759841762e-06, + "loss": 1.1219, + "step": 4070 + }, + { + "epoch": 0.5763431726481206, + "grad_norm": 11.015605953551797, + "learning_rate": 4.175749437913611e-06, + "loss": 1.4256, + "step": 4071 + }, + { + "epoch": 0.5764847455227579, + "grad_norm": 11.520507439987385, + "learning_rate": 4.175324027892562e-06, + "loss": 1.3405, + "step": 4072 + }, + { + "epoch": 0.5766263183973951, + "grad_norm": 10.44394872945461, + "learning_rate": 4.174898529800977e-06, + "loss": 1.3389, + "step": 4073 + }, + { + "epoch": 0.5767678912720323, + "grad_norm": 10.107789282271604, + "learning_rate": 4.1744729436612255e-06, + "loss": 1.4504, + "step": 4074 + }, + { + "epoch": 0.5769094641466695, + "grad_norm": 11.903219322898467, + "learning_rate": 4.174047269495681e-06, + "loss": 1.338, + "step": 4075 + }, + { + "epoch": 0.5770510370213067, + "grad_norm": 9.693550435415567, + "learning_rate": 4.173621507326719e-06, + "loss": 1.1975, + "step": 4076 + }, + { + "epoch": 0.5771926098959439, + "grad_norm": 9.151979459512775, + "learning_rate": 4.1731956571767215e-06, + "loss": 1.379, + "step": 4077 + }, + { + "epoch": 0.5773341827705811, + "grad_norm": 8.620201655814906, + "learning_rate": 4.172769719068076e-06, + "loss": 1.2086, + "step": 4078 + }, + { + "epoch": 0.5774757556452184, + "grad_norm": 
8.879476788202476, + "learning_rate": 4.172343693023174e-06, + "loss": 1.2427, + "step": 4079 + }, + { + "epoch": 0.5776173285198556, + "grad_norm": 9.814871143150103, + "learning_rate": 4.171917579064412e-06, + "loss": 1.2655, + "step": 4080 + }, + { + "epoch": 0.5777589013944928, + "grad_norm": 9.95580711936251, + "learning_rate": 4.1714913772141885e-06, + "loss": 1.3481, + "step": 4081 + }, + { + "epoch": 0.57790047426913, + "grad_norm": 9.202632146050789, + "learning_rate": 4.171065087494909e-06, + "loss": 1.2457, + "step": 4082 + }, + { + "epoch": 0.5780420471437673, + "grad_norm": 9.79335671848165, + "learning_rate": 4.170638709928984e-06, + "loss": 1.3516, + "step": 4083 + }, + { + "epoch": 0.5781836200184045, + "grad_norm": 9.862630315773043, + "learning_rate": 4.170212244538829e-06, + "loss": 1.2601, + "step": 4084 + }, + { + "epoch": 0.5783251928930417, + "grad_norm": 10.801914214990221, + "learning_rate": 4.169785691346861e-06, + "loss": 1.4142, + "step": 4085 + }, + { + "epoch": 0.5784667657676789, + "grad_norm": 8.500223044666653, + "learning_rate": 4.169359050375505e-06, + "loss": 1.2821, + "step": 4086 + }, + { + "epoch": 0.5786083386423161, + "grad_norm": 9.396585664141773, + "learning_rate": 4.168932321647186e-06, + "loss": 1.254, + "step": 4087 + }, + { + "epoch": 0.5787499115169533, + "grad_norm": 6.856409079028495, + "learning_rate": 4.168505505184341e-06, + "loss": 1.1527, + "step": 4088 + }, + { + "epoch": 0.5788914843915905, + "grad_norm": 10.286032263064163, + "learning_rate": 4.168078601009407e-06, + "loss": 1.2787, + "step": 4089 + }, + { + "epoch": 0.5790330572662278, + "grad_norm": 9.009980919211248, + "learning_rate": 4.167651609144822e-06, + "loss": 1.0996, + "step": 4090 + }, + { + "epoch": 0.579174630140865, + "grad_norm": 7.641831878018797, + "learning_rate": 4.167224529613038e-06, + "loss": 1.2995, + "step": 4091 + }, + { + "epoch": 0.5793162030155022, + "grad_norm": 10.481753526124427, + "learning_rate": 4.166797362436502e-06, + "loss": 1.4115, + "step": 4092 + }, + { + "epoch": 0.5794577758901395, + "grad_norm": 9.842518756704212, + "learning_rate": 4.1663701076376715e-06, + "loss": 1.376, + "step": 4093 + }, + { + "epoch": 0.5795993487647767, + "grad_norm": 9.700514585079084, + "learning_rate": 4.1659427652390075e-06, + "loss": 1.4156, + "step": 4094 + }, + { + "epoch": 0.5797409216394139, + "grad_norm": 8.148497423077844, + "learning_rate": 4.165515335262974e-06, + "loss": 1.2253, + "step": 4095 + }, + { + "epoch": 0.5798824945140512, + "grad_norm": 7.88431301145494, + "learning_rate": 4.165087817732041e-06, + "loss": 1.1943, + "step": 4096 + }, + { + "epoch": 0.5800240673886883, + "grad_norm": 7.080211205645827, + "learning_rate": 4.164660212668684e-06, + "loss": 1.2877, + "step": 4097 + }, + { + "epoch": 0.5801656402633255, + "grad_norm": 10.296864626796747, + "learning_rate": 4.164232520095379e-06, + "loss": 1.2709, + "step": 4098 + }, + { + "epoch": 0.5803072131379627, + "grad_norm": 8.931127450474609, + "learning_rate": 4.163804740034613e-06, + "loss": 1.3159, + "step": 4099 + }, + { + "epoch": 0.5804487860126, + "grad_norm": 8.644934133319559, + "learning_rate": 4.163376872508872e-06, + "loss": 1.4045, + "step": 4100 + }, + { + "epoch": 0.5805903588872372, + "grad_norm": 9.140295299876549, + "learning_rate": 4.162948917540649e-06, + "loss": 1.4402, + "step": 4101 + }, + { + "epoch": 0.5807319317618744, + "grad_norm": 9.348685399625547, + "learning_rate": 4.162520875152441e-06, + "loss": 1.1882, + "step": 4102 + }, + { + "epoch": 
0.5808735046365117, + "grad_norm": 10.385102282898783, + "learning_rate": 4.1620927453667515e-06, + "loss": 1.4739, + "step": 4103 + }, + { + "epoch": 0.5810150775111489, + "grad_norm": 9.167091174844405, + "learning_rate": 4.161664528206084e-06, + "loss": 1.3071, + "step": 4104 + }, + { + "epoch": 0.5811566503857861, + "grad_norm": 8.357660363724474, + "learning_rate": 4.1612362236929524e-06, + "loss": 1.2992, + "step": 4105 + }, + { + "epoch": 0.5812982232604234, + "grad_norm": 9.159604444333835, + "learning_rate": 4.16080783184987e-06, + "loss": 1.458, + "step": 4106 + }, + { + "epoch": 0.5814397961350605, + "grad_norm": 7.483690712373594, + "learning_rate": 4.16037935269936e-06, + "loss": 1.2443, + "step": 4107 + }, + { + "epoch": 0.5815813690096977, + "grad_norm": 9.158667339682387, + "learning_rate": 4.159950786263944e-06, + "loss": 1.0712, + "step": 4108 + }, + { + "epoch": 0.5817229418843349, + "grad_norm": 10.829086723389711, + "learning_rate": 4.159522132566153e-06, + "loss": 1.3074, + "step": 4109 + }, + { + "epoch": 0.5818645147589722, + "grad_norm": 7.526434027312035, + "learning_rate": 4.159093391628521e-06, + "loss": 1.1811, + "step": 4110 + }, + { + "epoch": 0.5820060876336094, + "grad_norm": 11.85923025171052, + "learning_rate": 4.158664563473587e-06, + "loss": 1.2287, + "step": 4111 + }, + { + "epoch": 0.5821476605082466, + "grad_norm": 8.351366545730844, + "learning_rate": 4.158235648123894e-06, + "loss": 1.3547, + "step": 4112 + }, + { + "epoch": 0.5822892333828839, + "grad_norm": 9.583570217577288, + "learning_rate": 4.1578066456019885e-06, + "loss": 1.3035, + "step": 4113 + }, + { + "epoch": 0.5824308062575211, + "grad_norm": 7.866899471825269, + "learning_rate": 4.157377555930424e-06, + "loss": 1.1571, + "step": 4114 + }, + { + "epoch": 0.5825723791321583, + "grad_norm": 10.043854872055746, + "learning_rate": 4.156948379131757e-06, + "loss": 1.3169, + "step": 4115 + }, + { + "epoch": 0.5827139520067955, + "grad_norm": 8.264059859643414, + "learning_rate": 4.15651911522855e-06, + "loss": 1.172, + "step": 4116 + }, + { + "epoch": 0.5828555248814327, + "grad_norm": 8.42883850856781, + "learning_rate": 4.1560897642433674e-06, + "loss": 1.172, + "step": 4117 + }, + { + "epoch": 0.5829970977560699, + "grad_norm": 7.2746221804745534, + "learning_rate": 4.155660326198781e-06, + "loss": 1.1268, + "step": 4118 + }, + { + "epoch": 0.5831386706307071, + "grad_norm": 10.52055925572508, + "learning_rate": 4.155230801117366e-06, + "loss": 1.2304, + "step": 4119 + }, + { + "epoch": 0.5832802435053444, + "grad_norm": 9.94000251017794, + "learning_rate": 4.154801189021701e-06, + "loss": 1.2415, + "step": 4120 + }, + { + "epoch": 0.5834218163799816, + "grad_norm": 8.713967382258353, + "learning_rate": 4.154371489934372e-06, + "loss": 1.4548, + "step": 4121 + }, + { + "epoch": 0.5835633892546188, + "grad_norm": 10.188475392459557, + "learning_rate": 4.153941703877967e-06, + "loss": 1.3028, + "step": 4122 + }, + { + "epoch": 0.583704962129256, + "grad_norm": 9.225317939717156, + "learning_rate": 4.153511830875081e-06, + "loss": 1.3946, + "step": 4123 + }, + { + "epoch": 0.5838465350038933, + "grad_norm": 11.208570021691097, + "learning_rate": 4.15308187094831e-06, + "loss": 1.3035, + "step": 4124 + }, + { + "epoch": 0.5839881078785305, + "grad_norm": 9.236145366200445, + "learning_rate": 4.152651824120258e-06, + "loss": 1.1708, + "step": 4125 + }, + { + "epoch": 0.5841296807531677, + "grad_norm": 7.85116239069157, + "learning_rate": 4.152221690413531e-06, + "loss": 1.2654, + "step": 4126 
+ }, + { + "epoch": 0.584271253627805, + "grad_norm": 10.347390447581798, + "learning_rate": 4.151791469850743e-06, + "loss": 1.2106, + "step": 4127 + }, + { + "epoch": 0.5844128265024421, + "grad_norm": 10.209683657237347, + "learning_rate": 4.151361162454509e-06, + "loss": 1.3812, + "step": 4128 + }, + { + "epoch": 0.5845543993770793, + "grad_norm": 8.414446497011804, + "learning_rate": 4.150930768247449e-06, + "loss": 1.3775, + "step": 4129 + }, + { + "epoch": 0.5846959722517165, + "grad_norm": 10.989011304289425, + "learning_rate": 4.15050028725219e-06, + "loss": 1.2695, + "step": 4130 + }, + { + "epoch": 0.5848375451263538, + "grad_norm": 8.541710333983909, + "learning_rate": 4.1500697194913615e-06, + "loss": 1.3316, + "step": 4131 + }, + { + "epoch": 0.584979118000991, + "grad_norm": 10.84539940682025, + "learning_rate": 4.149639064987598e-06, + "loss": 1.3944, + "step": 4132 + }, + { + "epoch": 0.5851206908756282, + "grad_norm": 10.01601272284824, + "learning_rate": 4.149208323763539e-06, + "loss": 1.3725, + "step": 4133 + }, + { + "epoch": 0.5852622637502655, + "grad_norm": 8.069735333759937, + "learning_rate": 4.148777495841829e-06, + "loss": 1.2595, + "step": 4134 + }, + { + "epoch": 0.5854038366249027, + "grad_norm": 9.111119831153397, + "learning_rate": 4.1483465812451144e-06, + "loss": 1.3365, + "step": 4135 + }, + { + "epoch": 0.5855454094995399, + "grad_norm": 9.181374512536406, + "learning_rate": 4.147915579996049e-06, + "loss": 1.3463, + "step": 4136 + }, + { + "epoch": 0.5856869823741772, + "grad_norm": 8.296689708530078, + "learning_rate": 4.147484492117291e-06, + "loss": 1.2156, + "step": 4137 + }, + { + "epoch": 0.5858285552488143, + "grad_norm": 7.238002189119403, + "learning_rate": 4.147053317631501e-06, + "loss": 1.2255, + "step": 4138 + }, + { + "epoch": 0.5859701281234515, + "grad_norm": 10.512198129281874, + "learning_rate": 4.146622056561347e-06, + "loss": 1.4638, + "step": 4139 + }, + { + "epoch": 0.5861117009980887, + "grad_norm": 9.387938662666281, + "learning_rate": 4.146190708929498e-06, + "loss": 1.4163, + "step": 4140 + }, + { + "epoch": 0.586253273872726, + "grad_norm": 11.186801313181775, + "learning_rate": 4.145759274758632e-06, + "loss": 1.3861, + "step": 4141 + }, + { + "epoch": 0.5863948467473632, + "grad_norm": 8.815845909447436, + "learning_rate": 4.145327754071427e-06, + "loss": 1.1801, + "step": 4142 + }, + { + "epoch": 0.5865364196220004, + "grad_norm": 8.116792729813298, + "learning_rate": 4.1448961468905706e-06, + "loss": 1.2811, + "step": 4143 + }, + { + "epoch": 0.5866779924966377, + "grad_norm": 7.484573775739817, + "learning_rate": 4.1444644532387485e-06, + "loss": 1.2223, + "step": 4144 + }, + { + "epoch": 0.5868195653712749, + "grad_norm": 7.521497054325849, + "learning_rate": 4.1440326731386575e-06, + "loss": 1.2205, + "step": 4145 + }, + { + "epoch": 0.5869611382459121, + "grad_norm": 9.35156576336618, + "learning_rate": 4.143600806612993e-06, + "loss": 1.2512, + "step": 4146 + }, + { + "epoch": 0.5871027111205493, + "grad_norm": 12.910634919265384, + "learning_rate": 4.143168853684461e-06, + "loss": 1.428, + "step": 4147 + }, + { + "epoch": 0.5872442839951865, + "grad_norm": 8.992089079609087, + "learning_rate": 4.142736814375768e-06, + "loss": 1.3322, + "step": 4148 + }, + { + "epoch": 0.5873858568698237, + "grad_norm": 7.200080807550281, + "learning_rate": 4.142304688709624e-06, + "loss": 1.0917, + "step": 4149 + }, + { + "epoch": 0.5875274297444609, + "grad_norm": 8.400130643282802, + "learning_rate": 4.141872476708748e-06, + 
"loss": 1.2513, + "step": 4150 + }, + { + "epoch": 0.5876690026190982, + "grad_norm": 7.3763599758227585, + "learning_rate": 4.1414401783958605e-06, + "loss": 1.3233, + "step": 4151 + }, + { + "epoch": 0.5878105754937354, + "grad_norm": 7.93254475347408, + "learning_rate": 4.141007793793686e-06, + "loss": 1.3307, + "step": 4152 + }, + { + "epoch": 0.5879521483683726, + "grad_norm": 8.321346766703016, + "learning_rate": 4.140575322924955e-06, + "loss": 1.35, + "step": 4153 + }, + { + "epoch": 0.5880937212430098, + "grad_norm": 9.427070564275422, + "learning_rate": 4.140142765812404e-06, + "loss": 1.2608, + "step": 4154 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 7.743133210009949, + "learning_rate": 4.13971012247877e-06, + "loss": 1.2402, + "step": 4155 + }, + { + "epoch": 0.5883768669922843, + "grad_norm": 8.820637726510695, + "learning_rate": 4.139277392946797e-06, + "loss": 1.4348, + "step": 4156 + }, + { + "epoch": 0.5885184398669215, + "grad_norm": 6.945427841069822, + "learning_rate": 4.138844577239234e-06, + "loss": 1.2472, + "step": 4157 + }, + { + "epoch": 0.5886600127415588, + "grad_norm": 9.800819004635331, + "learning_rate": 4.138411675378833e-06, + "loss": 1.3971, + "step": 4158 + }, + { + "epoch": 0.5888015856161959, + "grad_norm": 9.414456763657963, + "learning_rate": 4.137978687388352e-06, + "loss": 1.1955, + "step": 4159 + }, + { + "epoch": 0.5889431584908331, + "grad_norm": 7.976632322478442, + "learning_rate": 4.137545613290554e-06, + "loss": 1.3104, + "step": 4160 + }, + { + "epoch": 0.5890847313654703, + "grad_norm": 7.448090266269734, + "learning_rate": 4.137112453108203e-06, + "loss": 1.3195, + "step": 4161 + }, + { + "epoch": 0.5892263042401076, + "grad_norm": 9.052823076358859, + "learning_rate": 4.136679206864072e-06, + "loss": 1.4387, + "step": 4162 + }, + { + "epoch": 0.5893678771147448, + "grad_norm": 7.524941996100551, + "learning_rate": 4.136245874580935e-06, + "loss": 1.3678, + "step": 4163 + }, + { + "epoch": 0.589509449989382, + "grad_norm": 7.463197056405142, + "learning_rate": 4.135812456281571e-06, + "loss": 1.3537, + "step": 4164 + }, + { + "epoch": 0.5896510228640193, + "grad_norm": 7.563400845428782, + "learning_rate": 4.1353789519887685e-06, + "loss": 1.397, + "step": 4165 + }, + { + "epoch": 0.5897925957386565, + "grad_norm": 9.066798801955226, + "learning_rate": 4.134945361725312e-06, + "loss": 1.3751, + "step": 4166 + }, + { + "epoch": 0.5899341686132937, + "grad_norm": 6.957897819854299, + "learning_rate": 4.134511685513998e-06, + "loss": 1.3364, + "step": 4167 + }, + { + "epoch": 0.590075741487931, + "grad_norm": 9.610120096240221, + "learning_rate": 4.134077923377622e-06, + "loss": 1.5167, + "step": 4168 + }, + { + "epoch": 0.5902173143625681, + "grad_norm": 9.717573186639518, + "learning_rate": 4.13364407533899e-06, + "loss": 1.4057, + "step": 4169 + }, + { + "epoch": 0.5903588872372053, + "grad_norm": 8.115303239403964, + "learning_rate": 4.133210141420905e-06, + "loss": 1.3296, + "step": 4170 + }, + { + "epoch": 0.5905004601118425, + "grad_norm": 10.199956661955076, + "learning_rate": 4.132776121646182e-06, + "loss": 1.3797, + "step": 4171 + }, + { + "epoch": 0.5906420329864798, + "grad_norm": 8.412870501540102, + "learning_rate": 4.132342016037635e-06, + "loss": 1.2732, + "step": 4172 + }, + { + "epoch": 0.590783605861117, + "grad_norm": 9.307024478901885, + "learning_rate": 4.131907824618086e-06, + "loss": 1.1531, + "step": 4173 + }, + { + "epoch": 0.5909251787357542, + "grad_norm": 9.919377530780071, + "learning_rate": 
4.131473547410359e-06, + "loss": 1.3308, + "step": 4174 + }, + { + "epoch": 0.5910667516103915, + "grad_norm": 9.214601572564778, + "learning_rate": 4.131039184437283e-06, + "loss": 1.4607, + "step": 4175 + }, + { + "epoch": 0.5912083244850287, + "grad_norm": 10.293931554827278, + "learning_rate": 4.130604735721695e-06, + "loss": 1.3565, + "step": 4176 + }, + { + "epoch": 0.5913498973596659, + "grad_norm": 9.854639624011657, + "learning_rate": 4.130170201286432e-06, + "loss": 1.3207, + "step": 4177 + }, + { + "epoch": 0.5914914702343032, + "grad_norm": 8.998376487952997, + "learning_rate": 4.129735581154336e-06, + "loss": 1.2457, + "step": 4178 + }, + { + "epoch": 0.5916330431089404, + "grad_norm": 10.279192408444557, + "learning_rate": 4.129300875348255e-06, + "loss": 1.2339, + "step": 4179 + }, + { + "epoch": 0.5917746159835775, + "grad_norm": 8.394476718645588, + "learning_rate": 4.128866083891043e-06, + "loss": 1.4164, + "step": 4180 + }, + { + "epoch": 0.5919161888582147, + "grad_norm": 10.717050133978974, + "learning_rate": 4.128431206805556e-06, + "loss": 1.1917, + "step": 4181 + }, + { + "epoch": 0.592057761732852, + "grad_norm": 8.370711865706038, + "learning_rate": 4.127996244114654e-06, + "loss": 1.1718, + "step": 4182 + }, + { + "epoch": 0.5921993346074892, + "grad_norm": 8.82795647654415, + "learning_rate": 4.127561195841203e-06, + "loss": 1.2888, + "step": 4183 + }, + { + "epoch": 0.5923409074821264, + "grad_norm": 8.620091022621585, + "learning_rate": 4.1271260620080745e-06, + "loss": 1.3475, + "step": 4184 + }, + { + "epoch": 0.5924824803567637, + "grad_norm": 8.80815677616248, + "learning_rate": 4.126690842638141e-06, + "loss": 1.2851, + "step": 4185 + }, + { + "epoch": 0.5926240532314009, + "grad_norm": 8.839685375431701, + "learning_rate": 4.1262555377542834e-06, + "loss": 1.1419, + "step": 4186 + }, + { + "epoch": 0.5927656261060381, + "grad_norm": 8.113681362756838, + "learning_rate": 4.125820147379384e-06, + "loss": 1.2025, + "step": 4187 + }, + { + "epoch": 0.5929071989806753, + "grad_norm": 10.177735686828258, + "learning_rate": 4.125384671536333e-06, + "loss": 1.459, + "step": 4188 + }, + { + "epoch": 0.5930487718553126, + "grad_norm": 9.713183019867452, + "learning_rate": 4.124949110248021e-06, + "loss": 1.3714, + "step": 4189 + }, + { + "epoch": 0.5931903447299497, + "grad_norm": 7.59974540484784, + "learning_rate": 4.124513463537346e-06, + "loss": 1.2469, + "step": 4190 + }, + { + "epoch": 0.5933319176045869, + "grad_norm": 9.409538834690197, + "learning_rate": 4.124077731427209e-06, + "loss": 1.335, + "step": 4191 + }, + { + "epoch": 0.5934734904792242, + "grad_norm": 8.758904422200649, + "learning_rate": 4.123641913940518e-06, + "loss": 1.4199, + "step": 4192 + }, + { + "epoch": 0.5936150633538614, + "grad_norm": 10.996955797137643, + "learning_rate": 4.123206011100182e-06, + "loss": 1.409, + "step": 4193 + }, + { + "epoch": 0.5937566362284986, + "grad_norm": 9.221507549042661, + "learning_rate": 4.122770022929114e-06, + "loss": 1.3449, + "step": 4194 + }, + { + "epoch": 0.5938982091031358, + "grad_norm": 10.022329196326133, + "learning_rate": 4.1223339494502375e-06, + "loss": 1.2695, + "step": 4195 + }, + { + "epoch": 0.5940397819777731, + "grad_norm": 8.257001737783147, + "learning_rate": 4.1218977906864754e-06, + "loss": 1.1852, + "step": 4196 + }, + { + "epoch": 0.5941813548524103, + "grad_norm": 9.57662897421455, + "learning_rate": 4.121461546660756e-06, + "loss": 1.3706, + "step": 4197 + }, + { + "epoch": 0.5943229277270475, + "grad_norm": 
10.405780037013608, + "learning_rate": 4.121025217396011e-06, + "loss": 1.3716, + "step": 4198 + }, + { + "epoch": 0.5944645006016848, + "grad_norm": 11.104956583885436, + "learning_rate": 4.12058880291518e-06, + "loss": 1.3602, + "step": 4199 + }, + { + "epoch": 0.5946060734763219, + "grad_norm": 9.207577313303904, + "learning_rate": 4.120152303241203e-06, + "loss": 1.3722, + "step": 4200 + }, + { + "epoch": 0.5947476463509591, + "grad_norm": 6.501513378321704, + "learning_rate": 4.119715718397028e-06, + "loss": 1.2333, + "step": 4201 + }, + { + "epoch": 0.5948892192255963, + "grad_norm": 9.852741506695175, + "learning_rate": 4.119279048405606e-06, + "loss": 1.251, + "step": 4202 + }, + { + "epoch": 0.5950307921002336, + "grad_norm": 8.915282905612663, + "learning_rate": 4.1188422932898905e-06, + "loss": 1.3836, + "step": 4203 + }, + { + "epoch": 0.5951723649748708, + "grad_norm": 9.124308520728862, + "learning_rate": 4.1184054530728436e-06, + "loss": 1.3777, + "step": 4204 + }, + { + "epoch": 0.595313937849508, + "grad_norm": 10.068472469963751, + "learning_rate": 4.117968527777428e-06, + "loss": 1.3827, + "step": 4205 + }, + { + "epoch": 0.5954555107241453, + "grad_norm": 9.045062527585406, + "learning_rate": 4.117531517426614e-06, + "loss": 1.2726, + "step": 4206 + }, + { + "epoch": 0.5955970835987825, + "grad_norm": 8.24668262715544, + "learning_rate": 4.117094422043374e-06, + "loss": 1.3372, + "step": 4207 + }, + { + "epoch": 0.5957386564734197, + "grad_norm": 8.056614346548052, + "learning_rate": 4.116657241650687e-06, + "loss": 1.3905, + "step": 4208 + }, + { + "epoch": 0.595880229348057, + "grad_norm": 8.549830509758978, + "learning_rate": 4.116219976271533e-06, + "loss": 1.3367, + "step": 4209 + }, + { + "epoch": 0.5960218022226942, + "grad_norm": 11.800541390106192, + "learning_rate": 4.1157826259289e-06, + "loss": 1.3447, + "step": 4210 + }, + { + "epoch": 0.5961633750973313, + "grad_norm": 8.434524576642383, + "learning_rate": 4.115345190645779e-06, + "loss": 1.3059, + "step": 4211 + }, + { + "epoch": 0.5963049479719685, + "grad_norm": 8.272085048658898, + "learning_rate": 4.114907670445166e-06, + "loss": 1.537, + "step": 4212 + }, + { + "epoch": 0.5964465208466058, + "grad_norm": 9.336312590529836, + "learning_rate": 4.114470065350061e-06, + "loss": 1.3672, + "step": 4213 + }, + { + "epoch": 0.596588093721243, + "grad_norm": 9.214118026610077, + "learning_rate": 4.114032375383469e-06, + "loss": 1.295, + "step": 4214 + }, + { + "epoch": 0.5967296665958802, + "grad_norm": 9.80538585349092, + "learning_rate": 4.113594600568398e-06, + "loss": 1.1247, + "step": 4215 + }, + { + "epoch": 0.5968712394705175, + "grad_norm": 9.829609526817789, + "learning_rate": 4.113156740927862e-06, + "loss": 1.2777, + "step": 4216 + }, + { + "epoch": 0.5970128123451547, + "grad_norm": 7.534249495889328, + "learning_rate": 4.1127187964848785e-06, + "loss": 1.1646, + "step": 4217 + }, + { + "epoch": 0.5971543852197919, + "grad_norm": 10.20447086261397, + "learning_rate": 4.112280767262471e-06, + "loss": 1.1244, + "step": 4218 + }, + { + "epoch": 0.5972959580944291, + "grad_norm": 9.114117547622206, + "learning_rate": 4.111842653283665e-06, + "loss": 1.3489, + "step": 4219 + }, + { + "epoch": 0.5974375309690664, + "grad_norm": 10.14670111504554, + "learning_rate": 4.1114044545714935e-06, + "loss": 1.1647, + "step": 4220 + }, + { + "epoch": 0.5975791038437035, + "grad_norm": 9.722992096951577, + "learning_rate": 4.110966171148991e-06, + "loss": 1.2979, + "step": 4221 + }, + { + "epoch": 
0.5977206767183407, + "grad_norm": 10.70686415354381, + "learning_rate": 4.110527803039198e-06, + "loss": 1.3059, + "step": 4222 + }, + { + "epoch": 0.597862249592978, + "grad_norm": 10.674833588564962, + "learning_rate": 4.11008935026516e-06, + "loss": 1.4859, + "step": 4223 + }, + { + "epoch": 0.5980038224676152, + "grad_norm": 10.76235571220698, + "learning_rate": 4.109650812849924e-06, + "loss": 1.4563, + "step": 4224 + }, + { + "epoch": 0.5981453953422524, + "grad_norm": 9.715954148443087, + "learning_rate": 4.109212190816546e-06, + "loss": 1.3545, + "step": 4225 + }, + { + "epoch": 0.5982869682168896, + "grad_norm": 10.243787045554392, + "learning_rate": 4.108773484188082e-06, + "loss": 1.1764, + "step": 4226 + }, + { + "epoch": 0.5984285410915269, + "grad_norm": 10.56306013619718, + "learning_rate": 4.1083346929875966e-06, + "loss": 1.3388, + "step": 4227 + }, + { + "epoch": 0.5985701139661641, + "grad_norm": 8.89106478550345, + "learning_rate": 4.107895817238155e-06, + "loss": 1.306, + "step": 4228 + }, + { + "epoch": 0.5987116868408013, + "grad_norm": 8.917144429844951, + "learning_rate": 4.107456856962829e-06, + "loss": 1.2348, + "step": 4229 + }, + { + "epoch": 0.5988532597154386, + "grad_norm": 9.68269238695269, + "learning_rate": 4.107017812184695e-06, + "loss": 1.2823, + "step": 4230 + }, + { + "epoch": 0.5989948325900757, + "grad_norm": 9.62401268589265, + "learning_rate": 4.106578682926833e-06, + "loss": 1.2783, + "step": 4231 + }, + { + "epoch": 0.5991364054647129, + "grad_norm": 9.71476325295862, + "learning_rate": 4.106139469212327e-06, + "loss": 1.3511, + "step": 4232 + }, + { + "epoch": 0.5992779783393501, + "grad_norm": 7.214090007409789, + "learning_rate": 4.105700171064267e-06, + "loss": 1.0994, + "step": 4233 + }, + { + "epoch": 0.5994195512139874, + "grad_norm": 9.486793172086204, + "learning_rate": 4.105260788505746e-06, + "loss": 1.2135, + "step": 4234 + }, + { + "epoch": 0.5995611240886246, + "grad_norm": 9.596202226296024, + "learning_rate": 4.104821321559863e-06, + "loss": 1.2072, + "step": 4235 + }, + { + "epoch": 0.5997026969632618, + "grad_norm": 10.421154632150293, + "learning_rate": 4.1043817702497195e-06, + "loss": 1.4463, + "step": 4236 + }, + { + "epoch": 0.5998442698378991, + "grad_norm": 9.422343430649175, + "learning_rate": 4.103942134598422e-06, + "loss": 1.2408, + "step": 4237 + }, + { + "epoch": 0.5999858427125363, + "grad_norm": 9.46151688783821, + "learning_rate": 4.103502414629082e-06, + "loss": 1.2977, + "step": 4238 + }, + { + "epoch": 0.6001274155871735, + "grad_norm": 9.539230857569382, + "learning_rate": 4.103062610364817e-06, + "loss": 1.4523, + "step": 4239 + }, + { + "epoch": 0.6002689884618108, + "grad_norm": 10.558063004025533, + "learning_rate": 4.102622721828746e-06, + "loss": 1.3707, + "step": 4240 + }, + { + "epoch": 0.600410561336448, + "grad_norm": 9.85381778370956, + "learning_rate": 4.102182749043993e-06, + "loss": 1.262, + "step": 4241 + }, + { + "epoch": 0.6005521342110851, + "grad_norm": 10.878000481256418, + "learning_rate": 4.101742692033687e-06, + "loss": 1.2124, + "step": 4242 + }, + { + "epoch": 0.6006937070857223, + "grad_norm": 10.844341690674522, + "learning_rate": 4.101302550820962e-06, + "loss": 1.3568, + "step": 4243 + }, + { + "epoch": 0.6008352799603596, + "grad_norm": 8.609963841733427, + "learning_rate": 4.100862325428957e-06, + "loss": 1.2617, + "step": 4244 + }, + { + "epoch": 0.6009768528349968, + "grad_norm": 8.546043453178394, + "learning_rate": 4.1004220158808114e-06, + "loss": 1.2455, + "step": 4245 
+ }, + { + "epoch": 0.601118425709634, + "grad_norm": 9.939477495855856, + "learning_rate": 4.0999816221996755e-06, + "loss": 1.2697, + "step": 4246 + }, + { + "epoch": 0.6012599985842713, + "grad_norm": 10.979689487312152, + "learning_rate": 4.099541144408698e-06, + "loss": 1.1973, + "step": 4247 + }, + { + "epoch": 0.6014015714589085, + "grad_norm": 9.62615127613904, + "learning_rate": 4.099100582531035e-06, + "loss": 1.3901, + "step": 4248 + }, + { + "epoch": 0.6015431443335457, + "grad_norm": 8.984460342665317, + "learning_rate": 4.098659936589847e-06, + "loss": 1.4636, + "step": 4249 + }, + { + "epoch": 0.601684717208183, + "grad_norm": 8.417668298727525, + "learning_rate": 4.098219206608298e-06, + "loss": 1.2802, + "step": 4250 + }, + { + "epoch": 0.6018262900828202, + "grad_norm": 9.138177840821893, + "learning_rate": 4.0977783926095575e-06, + "loss": 1.1306, + "step": 4251 + }, + { + "epoch": 0.6019678629574573, + "grad_norm": 11.411769587028205, + "learning_rate": 4.097337494616798e-06, + "loss": 1.2857, + "step": 4252 + }, + { + "epoch": 0.6021094358320945, + "grad_norm": 7.890900801569177, + "learning_rate": 4.096896512653197e-06, + "loss": 1.1621, + "step": 4253 + }, + { + "epoch": 0.6022510087067318, + "grad_norm": 10.981042652202577, + "learning_rate": 4.096455446741937e-06, + "loss": 1.2874, + "step": 4254 + }, + { + "epoch": 0.602392581581369, + "grad_norm": 9.548491545574015, + "learning_rate": 4.096014296906205e-06, + "loss": 1.1753, + "step": 4255 + }, + { + "epoch": 0.6025341544560062, + "grad_norm": 11.598824513826349, + "learning_rate": 4.095573063169191e-06, + "loss": 1.1754, + "step": 4256 + }, + { + "epoch": 0.6026757273306435, + "grad_norm": 9.9142802810497, + "learning_rate": 4.095131745554092e-06, + "loss": 1.2718, + "step": 4257 + }, + { + "epoch": 0.6028173002052807, + "grad_norm": 10.000480640304714, + "learning_rate": 4.094690344084105e-06, + "loss": 1.3403, + "step": 4258 + }, + { + "epoch": 0.6029588730799179, + "grad_norm": 10.037900817393258, + "learning_rate": 4.094248858782436e-06, + "loss": 1.158, + "step": 4259 + }, + { + "epoch": 0.6031004459545551, + "grad_norm": 10.100806447683011, + "learning_rate": 4.093807289672294e-06, + "loss": 1.31, + "step": 4260 + }, + { + "epoch": 0.6032420188291924, + "grad_norm": 9.691503079171659, + "learning_rate": 4.09336563677689e-06, + "loss": 1.3099, + "step": 4261 + }, + { + "epoch": 0.6033835917038295, + "grad_norm": 9.808028253063437, + "learning_rate": 4.092923900119443e-06, + "loss": 1.1953, + "step": 4262 + }, + { + "epoch": 0.6035251645784667, + "grad_norm": 9.744472795438199, + "learning_rate": 4.092482079723175e-06, + "loss": 1.3315, + "step": 4263 + }, + { + "epoch": 0.603666737453104, + "grad_norm": 8.99530712271313, + "learning_rate": 4.09204017561131e-06, + "loss": 1.242, + "step": 4264 + }, + { + "epoch": 0.6038083103277412, + "grad_norm": 12.424569160467584, + "learning_rate": 4.091598187807082e-06, + "loss": 1.2044, + "step": 4265 + }, + { + "epoch": 0.6039498832023784, + "grad_norm": 12.105131564963601, + "learning_rate": 4.091156116333723e-06, + "loss": 1.2879, + "step": 4266 + }, + { + "epoch": 0.6040914560770156, + "grad_norm": 7.171434309972857, + "learning_rate": 4.090713961214473e-06, + "loss": 1.1638, + "step": 4267 + }, + { + "epoch": 0.6042330289516529, + "grad_norm": 8.370008205104439, + "learning_rate": 4.090271722472577e-06, + "loss": 1.2424, + "step": 4268 + }, + { + "epoch": 0.6043746018262901, + "grad_norm": 10.844627322894086, + "learning_rate": 4.089829400131282e-06, + "loss": 
1.1316, + "step": 4269 + }, + { + "epoch": 0.6045161747009273, + "grad_norm": 9.781013266529165, + "learning_rate": 4.0893869942138405e-06, + "loss": 1.2994, + "step": 4270 + }, + { + "epoch": 0.6046577475755646, + "grad_norm": 8.716459239974045, + "learning_rate": 4.08894450474351e-06, + "loss": 1.2168, + "step": 4271 + }, + { + "epoch": 0.6047993204502018, + "grad_norm": 9.759488697104283, + "learning_rate": 4.088501931743551e-06, + "loss": 1.311, + "step": 4272 + }, + { + "epoch": 0.6049408933248389, + "grad_norm": 9.158968056599848, + "learning_rate": 4.0880592752372315e-06, + "loss": 1.2884, + "step": 4273 + }, + { + "epoch": 0.6050824661994761, + "grad_norm": 11.083603207928403, + "learning_rate": 4.087616535247819e-06, + "loss": 1.4595, + "step": 4274 + }, + { + "epoch": 0.6052240390741134, + "grad_norm": 8.023514997039317, + "learning_rate": 4.087173711798589e-06, + "loss": 1.222, + "step": 4275 + }, + { + "epoch": 0.6053656119487506, + "grad_norm": 9.005092028074007, + "learning_rate": 4.086730804912821e-06, + "loss": 1.3289, + "step": 4276 + }, + { + "epoch": 0.6055071848233878, + "grad_norm": 9.634798685137381, + "learning_rate": 4.086287814613797e-06, + "loss": 1.3519, + "step": 4277 + }, + { + "epoch": 0.6056487576980251, + "grad_norm": 9.78098050559753, + "learning_rate": 4.085844740924805e-06, + "loss": 1.3494, + "step": 4278 + }, + { + "epoch": 0.6057903305726623, + "grad_norm": 9.153631571527534, + "learning_rate": 4.085401583869138e-06, + "loss": 1.3494, + "step": 4279 + }, + { + "epoch": 0.6059319034472995, + "grad_norm": 8.53490104381045, + "learning_rate": 4.0849583434700915e-06, + "loss": 1.2609, + "step": 4280 + }, + { + "epoch": 0.6060734763219368, + "grad_norm": 12.055369112494091, + "learning_rate": 4.0845150197509675e-06, + "loss": 1.3751, + "step": 4281 + }, + { + "epoch": 0.606215049196574, + "grad_norm": 7.8527823795820435, + "learning_rate": 4.08407161273507e-06, + "loss": 1.2882, + "step": 4282 + }, + { + "epoch": 0.6063566220712111, + "grad_norm": 9.31534527829262, + "learning_rate": 4.083628122445708e-06, + "loss": 1.1896, + "step": 4283 + }, + { + "epoch": 0.6064981949458483, + "grad_norm": 11.455248460210553, + "learning_rate": 4.083184548906198e-06, + "loss": 1.2071, + "step": 4284 + }, + { + "epoch": 0.6066397678204856, + "grad_norm": 8.307179060185732, + "learning_rate": 4.082740892139856e-06, + "loss": 1.2738, + "step": 4285 + }, + { + "epoch": 0.6067813406951228, + "grad_norm": 9.623605205984113, + "learning_rate": 4.082297152170005e-06, + "loss": 1.2996, + "step": 4286 + }, + { + "epoch": 0.60692291356976, + "grad_norm": 10.917704603244719, + "learning_rate": 4.081853329019973e-06, + "loss": 1.4357, + "step": 4287 + }, + { + "epoch": 0.6070644864443973, + "grad_norm": 9.55334872506981, + "learning_rate": 4.081409422713091e-06, + "loss": 1.4415, + "step": 4288 + }, + { + "epoch": 0.6072060593190345, + "grad_norm": 9.294485807760646, + "learning_rate": 4.080965433272695e-06, + "loss": 1.1785, + "step": 4289 + }, + { + "epoch": 0.6073476321936717, + "grad_norm": 8.309806215159057, + "learning_rate": 4.080521360722124e-06, + "loss": 1.1485, + "step": 4290 + }, + { + "epoch": 0.607489205068309, + "grad_norm": 9.302460654630957, + "learning_rate": 4.080077205084725e-06, + "loss": 1.2852, + "step": 4291 + }, + { + "epoch": 0.6076307779429462, + "grad_norm": 9.344085993514787, + "learning_rate": 4.079632966383845e-06, + "loss": 1.2185, + "step": 4292 + }, + { + "epoch": 0.6077723508175833, + "grad_norm": 8.94647491765044, + "learning_rate": 
4.079188644642838e-06, + "loss": 1.2808, + "step": 4293 + }, + { + "epoch": 0.6079139236922205, + "grad_norm": 10.361379852309616, + "learning_rate": 4.07874423988506e-06, + "loss": 1.1069, + "step": 4294 + }, + { + "epoch": 0.6080554965668578, + "grad_norm": 10.021622741017607, + "learning_rate": 4.078299752133876e-06, + "loss": 1.3216, + "step": 4295 + }, + { + "epoch": 0.608197069441495, + "grad_norm": 11.272311994448845, + "learning_rate": 4.07785518141265e-06, + "loss": 1.5121, + "step": 4296 + }, + { + "epoch": 0.6083386423161322, + "grad_norm": 9.02219155697343, + "learning_rate": 4.077410527744754e-06, + "loss": 1.3684, + "step": 4297 + }, + { + "epoch": 0.6084802151907694, + "grad_norm": 10.189397155479309, + "learning_rate": 4.076965791153562e-06, + "loss": 1.4353, + "step": 4298 + }, + { + "epoch": 0.6086217880654067, + "grad_norm": 13.017259730961824, + "learning_rate": 4.076520971662456e-06, + "loss": 1.314, + "step": 4299 + }, + { + "epoch": 0.6087633609400439, + "grad_norm": 9.891247481152135, + "learning_rate": 4.076076069294817e-06, + "loss": 1.3424, + "step": 4300 + }, + { + "epoch": 0.6089049338146811, + "grad_norm": 11.572815288897404, + "learning_rate": 4.075631084074033e-06, + "loss": 1.2986, + "step": 4301 + }, + { + "epoch": 0.6090465066893184, + "grad_norm": 8.77712687830095, + "learning_rate": 4.075186016023499e-06, + "loss": 1.3688, + "step": 4302 + }, + { + "epoch": 0.6091880795639556, + "grad_norm": 8.600504270451511, + "learning_rate": 4.074740865166611e-06, + "loss": 1.3807, + "step": 4303 + }, + { + "epoch": 0.6093296524385927, + "grad_norm": 10.845304086448616, + "learning_rate": 4.074295631526769e-06, + "loss": 1.3597, + "step": 4304 + }, + { + "epoch": 0.60947122531323, + "grad_norm": 11.430640660017158, + "learning_rate": 4.07385031512738e-06, + "loss": 1.2049, + "step": 4305 + }, + { + "epoch": 0.6096127981878672, + "grad_norm": 8.998097960499113, + "learning_rate": 4.0734049159918535e-06, + "loss": 1.3524, + "step": 4306 + }, + { + "epoch": 0.6097543710625044, + "grad_norm": 10.099125331358954, + "learning_rate": 4.072959434143603e-06, + "loss": 1.3238, + "step": 4307 + }, + { + "epoch": 0.6098959439371416, + "grad_norm": 8.104859257958067, + "learning_rate": 4.0725138696060485e-06, + "loss": 1.2972, + "step": 4308 + }, + { + "epoch": 0.6100375168117789, + "grad_norm": 9.414390311330948, + "learning_rate": 4.072068222402612e-06, + "loss": 1.4536, + "step": 4309 + }, + { + "epoch": 0.6101790896864161, + "grad_norm": 9.167110732911059, + "learning_rate": 4.0716224925567225e-06, + "loss": 1.2934, + "step": 4310 + }, + { + "epoch": 0.6103206625610533, + "grad_norm": 11.20647198467304, + "learning_rate": 4.071176680091809e-06, + "loss": 1.4046, + "step": 4311 + }, + { + "epoch": 0.6104622354356906, + "grad_norm": 10.38236499106139, + "learning_rate": 4.07073078503131e-06, + "loss": 1.1812, + "step": 4312 + }, + { + "epoch": 0.6106038083103278, + "grad_norm": 8.389854807819326, + "learning_rate": 4.070284807398664e-06, + "loss": 1.2544, + "step": 4313 + }, + { + "epoch": 0.6107453811849649, + "grad_norm": 8.94157476837156, + "learning_rate": 4.069838747217317e-06, + "loss": 1.3234, + "step": 4314 + }, + { + "epoch": 0.6108869540596021, + "grad_norm": 9.157473537730715, + "learning_rate": 4.069392604510717e-06, + "loss": 1.2179, + "step": 4315 + }, + { + "epoch": 0.6110285269342394, + "grad_norm": 12.845059738762712, + "learning_rate": 4.068946379302318e-06, + "loss": 1.4357, + "step": 4316 + }, + { + "epoch": 0.6111700998088766, + "grad_norm": 
10.336433519844078, + "learning_rate": 4.068500071615578e-06, + "loss": 1.3255, + "step": 4317 + }, + { + "epoch": 0.6113116726835138, + "grad_norm": 8.36549817724073, + "learning_rate": 4.068053681473959e-06, + "loss": 1.276, + "step": 4318 + }, + { + "epoch": 0.6114532455581511, + "grad_norm": 8.726178685885811, + "learning_rate": 4.067607208900927e-06, + "loss": 1.3458, + "step": 4319 + }, + { + "epoch": 0.6115948184327883, + "grad_norm": 9.763780685218354, + "learning_rate": 4.067160653919952e-06, + "loss": 1.2625, + "step": 4320 + }, + { + "epoch": 0.6117363913074255, + "grad_norm": 12.194023694711582, + "learning_rate": 4.066714016554511e-06, + "loss": 1.3425, + "step": 4321 + }, + { + "epoch": 0.6118779641820628, + "grad_norm": 13.066840075944624, + "learning_rate": 4.066267296828083e-06, + "loss": 1.3261, + "step": 4322 + }, + { + "epoch": 0.6120195370567, + "grad_norm": 8.86730795623029, + "learning_rate": 4.06582049476415e-06, + "loss": 1.3282, + "step": 4323 + }, + { + "epoch": 0.6121611099313372, + "grad_norm": 10.5959848392849, + "learning_rate": 4.065373610386201e-06, + "loss": 1.2778, + "step": 4324 + }, + { + "epoch": 0.6123026828059743, + "grad_norm": 10.289744291548223, + "learning_rate": 4.064926643717729e-06, + "loss": 1.404, + "step": 4325 + }, + { + "epoch": 0.6124442556806116, + "grad_norm": 11.386765651704435, + "learning_rate": 4.06447959478223e-06, + "loss": 1.3306, + "step": 4326 + }, + { + "epoch": 0.6125858285552488, + "grad_norm": 9.620277732599186, + "learning_rate": 4.0640324636032044e-06, + "loss": 1.3497, + "step": 4327 + }, + { + "epoch": 0.612727401429886, + "grad_norm": 10.244025000836997, + "learning_rate": 4.0635852502041595e-06, + "loss": 1.3035, + "step": 4328 + }, + { + "epoch": 0.6128689743045233, + "grad_norm": 10.280581374297597, + "learning_rate": 4.0631379546086045e-06, + "loss": 1.2415, + "step": 4329 + }, + { + "epoch": 0.6130105471791605, + "grad_norm": 9.4787387849142, + "learning_rate": 4.0626905768400515e-06, + "loss": 1.2434, + "step": 4330 + }, + { + "epoch": 0.6131521200537977, + "grad_norm": 9.311914207928481, + "learning_rate": 4.06224311692202e-06, + "loss": 1.1213, + "step": 4331 + }, + { + "epoch": 0.6132936929284349, + "grad_norm": 7.873500620684918, + "learning_rate": 4.0617955748780336e-06, + "loss": 1.1822, + "step": 4332 + }, + { + "epoch": 0.6134352658030722, + "grad_norm": 7.421714893421096, + "learning_rate": 4.061347950731617e-06, + "loss": 1.2243, + "step": 4333 + }, + { + "epoch": 0.6135768386777094, + "grad_norm": 11.878649341598734, + "learning_rate": 4.060900244506304e-06, + "loss": 1.2123, + "step": 4334 + }, + { + "epoch": 0.6137184115523465, + "grad_norm": 8.516117746638393, + "learning_rate": 4.060452456225629e-06, + "loss": 1.253, + "step": 4335 + }, + { + "epoch": 0.6138599844269838, + "grad_norm": 9.725951835122416, + "learning_rate": 4.060004585913131e-06, + "loss": 1.291, + "step": 4336 + }, + { + "epoch": 0.614001557301621, + "grad_norm": 8.311586258079037, + "learning_rate": 4.059556633592356e-06, + "loss": 1.1378, + "step": 4337 + }, + { + "epoch": 0.6141431301762582, + "grad_norm": 7.500441728935063, + "learning_rate": 4.0591085992868504e-06, + "loss": 1.2248, + "step": 4338 + }, + { + "epoch": 0.6142847030508954, + "grad_norm": 9.221870333942821, + "learning_rate": 4.05866048302017e-06, + "loss": 1.2561, + "step": 4339 + }, + { + "epoch": 0.6144262759255327, + "grad_norm": 9.140482270520266, + "learning_rate": 4.058212284815869e-06, + "loss": 1.29, + "step": 4340 + }, + { + "epoch": 0.6145678488001699, 
+ "grad_norm": 9.589586777361518, + "learning_rate": 4.057764004697511e-06, + "loss": 1.2393, + "step": 4341 + }, + { + "epoch": 0.6147094216748071, + "grad_norm": 9.207989532358697, + "learning_rate": 4.05731564268866e-06, + "loss": 1.3892, + "step": 4342 + }, + { + "epoch": 0.6148509945494444, + "grad_norm": 9.763891252348897, + "learning_rate": 4.056867198812886e-06, + "loss": 1.4448, + "step": 4343 + }, + { + "epoch": 0.6149925674240816, + "grad_norm": 11.942147992787588, + "learning_rate": 4.056418673093766e-06, + "loss": 1.3005, + "step": 4344 + }, + { + "epoch": 0.6151341402987187, + "grad_norm": 9.049141555452925, + "learning_rate": 4.055970065554876e-06, + "loss": 1.3959, + "step": 4345 + }, + { + "epoch": 0.6152757131733559, + "grad_norm": 8.473453130748076, + "learning_rate": 4.0555213762198e-06, + "loss": 1.3395, + "step": 4346 + }, + { + "epoch": 0.6154172860479932, + "grad_norm": 9.13131954889881, + "learning_rate": 4.055072605112125e-06, + "loss": 1.2652, + "step": 4347 + }, + { + "epoch": 0.6155588589226304, + "grad_norm": 11.622568183776055, + "learning_rate": 4.054623752255443e-06, + "loss": 1.2696, + "step": 4348 + }, + { + "epoch": 0.6157004317972676, + "grad_norm": 8.86277810589854, + "learning_rate": 4.0541748176733485e-06, + "loss": 1.1152, + "step": 4349 + }, + { + "epoch": 0.6158420046719049, + "grad_norm": 10.210629284153743, + "learning_rate": 4.0537258013894434e-06, + "loss": 1.2609, + "step": 4350 + }, + { + "epoch": 0.6159835775465421, + "grad_norm": 10.16110686152681, + "learning_rate": 4.053276703427332e-06, + "loss": 1.3304, + "step": 4351 + }, + { + "epoch": 0.6161251504211793, + "grad_norm": 7.342904029980636, + "learning_rate": 4.052827523810622e-06, + "loss": 1.2114, + "step": 4352 + }, + { + "epoch": 0.6162667232958166, + "grad_norm": 8.93247022682009, + "learning_rate": 4.052378262562926e-06, + "loss": 1.1988, + "step": 4353 + }, + { + "epoch": 0.6164082961704538, + "grad_norm": 7.926688453039072, + "learning_rate": 4.051928919707863e-06, + "loss": 1.2389, + "step": 4354 + }, + { + "epoch": 0.616549869045091, + "grad_norm": 9.35602245882298, + "learning_rate": 4.051479495269054e-06, + "loss": 1.2066, + "step": 4355 + }, + { + "epoch": 0.6166914419197281, + "grad_norm": 9.588768078674214, + "learning_rate": 4.051029989270125e-06, + "loss": 1.2625, + "step": 4356 + }, + { + "epoch": 0.6168330147943654, + "grad_norm": 8.102382695644515, + "learning_rate": 4.0505804017347055e-06, + "loss": 1.1829, + "step": 4357 + }, + { + "epoch": 0.6169745876690026, + "grad_norm": 8.763865357166047, + "learning_rate": 4.05013073268643e-06, + "loss": 1.2749, + "step": 4358 + }, + { + "epoch": 0.6171161605436398, + "grad_norm": 9.376656754650416, + "learning_rate": 4.049680982148938e-06, + "loss": 1.2917, + "step": 4359 + }, + { + "epoch": 0.617257733418277, + "grad_norm": 8.891846906365966, + "learning_rate": 4.049231150145873e-06, + "loss": 1.2704, + "step": 4360 + }, + { + "epoch": 0.6173993062929143, + "grad_norm": 9.9846481739576, + "learning_rate": 4.048781236700882e-06, + "loss": 1.2862, + "step": 4361 + }, + { + "epoch": 0.6175408791675515, + "grad_norm": 10.69322577684809, + "learning_rate": 4.048331241837615e-06, + "loss": 1.149, + "step": 4362 + }, + { + "epoch": 0.6176824520421887, + "grad_norm": 8.190978730796422, + "learning_rate": 4.047881165579729e-06, + "loss": 1.2451, + "step": 4363 + }, + { + "epoch": 0.617824024916826, + "grad_norm": 8.909295668929516, + "learning_rate": 4.047431007950885e-06, + "loss": 1.3481, + "step": 4364 + }, + { + "epoch": 
0.6179655977914632, + "grad_norm": 8.943529775961817, + "learning_rate": 4.046980768974746e-06, + "loss": 1.3652, + "step": 4365 + }, + { + "epoch": 0.6181071706661003, + "grad_norm": 9.302819462462542, + "learning_rate": 4.046530448674982e-06, + "loss": 1.1403, + "step": 4366 + }, + { + "epoch": 0.6182487435407376, + "grad_norm": 8.630988212222935, + "learning_rate": 4.046080047075265e-06, + "loss": 1.2074, + "step": 4367 + }, + { + "epoch": 0.6183903164153748, + "grad_norm": 8.552134679035367, + "learning_rate": 4.045629564199274e-06, + "loss": 1.1836, + "step": 4368 + }, + { + "epoch": 0.618531889290012, + "grad_norm": 10.569827455611858, + "learning_rate": 4.045179000070688e-06, + "loss": 1.3122, + "step": 4369 + }, + { + "epoch": 0.6186734621646492, + "grad_norm": 11.181030448689926, + "learning_rate": 4.044728354713195e-06, + "loss": 1.3547, + "step": 4370 + }, + { + "epoch": 0.6188150350392865, + "grad_norm": 8.524274832123506, + "learning_rate": 4.044277628150482e-06, + "loss": 1.3675, + "step": 4371 + }, + { + "epoch": 0.6189566079139237, + "grad_norm": 9.082983821012423, + "learning_rate": 4.0438268204062485e-06, + "loss": 1.333, + "step": 4372 + }, + { + "epoch": 0.6190981807885609, + "grad_norm": 8.800754809519757, + "learning_rate": 4.043375931504189e-06, + "loss": 1.255, + "step": 4373 + }, + { + "epoch": 0.6192397536631982, + "grad_norm": 8.375915932744451, + "learning_rate": 4.042924961468007e-06, + "loss": 1.2986, + "step": 4374 + }, + { + "epoch": 0.6193813265378354, + "grad_norm": 9.08539336289815, + "learning_rate": 4.04247391032141e-06, + "loss": 1.1271, + "step": 4375 + }, + { + "epoch": 0.6195228994124725, + "grad_norm": 9.768504653553885, + "learning_rate": 4.042022778088111e-06, + "loss": 1.2372, + "step": 4376 + }, + { + "epoch": 0.6196644722871097, + "grad_norm": 7.838719430388462, + "learning_rate": 4.0415715647918235e-06, + "loss": 1.1449, + "step": 4377 + }, + { + "epoch": 0.619806045161747, + "grad_norm": 8.621618520220153, + "learning_rate": 4.041120270456268e-06, + "loss": 1.3527, + "step": 4378 + }, + { + "epoch": 0.6199476180363842, + "grad_norm": 8.911129334331513, + "learning_rate": 4.04066889510517e-06, + "loss": 1.2397, + "step": 4379 + }, + { + "epoch": 0.6200891909110214, + "grad_norm": 10.775846769335178, + "learning_rate": 4.040217438762256e-06, + "loss": 1.4797, + "step": 4380 + }, + { + "epoch": 0.6202307637856587, + "grad_norm": 9.671456914933854, + "learning_rate": 4.03976590145126e-06, + "loss": 1.1854, + "step": 4381 + }, + { + "epoch": 0.6203723366602959, + "grad_norm": 9.402945004404703, + "learning_rate": 4.0393142831959186e-06, + "loss": 1.3047, + "step": 4382 + }, + { + "epoch": 0.6205139095349331, + "grad_norm": 8.974202801250302, + "learning_rate": 4.038862584019973e-06, + "loss": 1.2111, + "step": 4383 + }, + { + "epoch": 0.6206554824095704, + "grad_norm": 10.26057855830139, + "learning_rate": 4.038410803947169e-06, + "loss": 1.2288, + "step": 4384 + }, + { + "epoch": 0.6207970552842076, + "grad_norm": 11.59381712202605, + "learning_rate": 4.037958943001257e-06, + "loss": 1.3663, + "step": 4385 + }, + { + "epoch": 0.6209386281588448, + "grad_norm": 8.342693819212881, + "learning_rate": 4.0375070012059884e-06, + "loss": 1.2905, + "step": 4386 + }, + { + "epoch": 0.6210802010334819, + "grad_norm": 10.36827880818875, + "learning_rate": 4.037054978585124e-06, + "loss": 1.4141, + "step": 4387 + }, + { + "epoch": 0.6212217739081192, + "grad_norm": 9.43481770980686, + "learning_rate": 4.036602875162426e-06, + "loss": 1.2055, + "step": 4388 
+ }, + { + "epoch": 0.6213633467827564, + "grad_norm": 8.82685840263986, + "learning_rate": 4.03615069096166e-06, + "loss": 1.266, + "step": 4389 + }, + { + "epoch": 0.6215049196573936, + "grad_norm": 9.695654408883595, + "learning_rate": 4.035698426006597e-06, + "loss": 1.1519, + "step": 4390 + }, + { + "epoch": 0.6216464925320309, + "grad_norm": 9.887336847283025, + "learning_rate": 4.035246080321014e-06, + "loss": 1.3082, + "step": 4391 + }, + { + "epoch": 0.6217880654066681, + "grad_norm": 8.890014842034942, + "learning_rate": 4.034793653928688e-06, + "loss": 1.3871, + "step": 4392 + }, + { + "epoch": 0.6219296382813053, + "grad_norm": 10.477784770353304, + "learning_rate": 4.034341146853406e-06, + "loss": 1.2215, + "step": 4393 + }, + { + "epoch": 0.6220712111559425, + "grad_norm": 8.452794659095188, + "learning_rate": 4.0338885591189515e-06, + "loss": 1.2396, + "step": 4394 + }, + { + "epoch": 0.6222127840305798, + "grad_norm": 9.584911537140272, + "learning_rate": 4.033435890749121e-06, + "loss": 1.1965, + "step": 4395 + }, + { + "epoch": 0.622354356905217, + "grad_norm": 9.52252938976845, + "learning_rate": 4.032983141767708e-06, + "loss": 1.2357, + "step": 4396 + }, + { + "epoch": 0.6224959297798541, + "grad_norm": 8.172842635857384, + "learning_rate": 4.032530312198515e-06, + "loss": 1.2753, + "step": 4397 + }, + { + "epoch": 0.6226375026544914, + "grad_norm": 10.321045819896657, + "learning_rate": 4.032077402065346e-06, + "loss": 1.4806, + "step": 4398 + }, + { + "epoch": 0.6227790755291286, + "grad_norm": 8.470497091900022, + "learning_rate": 4.03162441139201e-06, + "loss": 1.3248, + "step": 4399 + }, + { + "epoch": 0.6229206484037658, + "grad_norm": 10.099197098950839, + "learning_rate": 4.031171340202321e-06, + "loss": 1.3563, + "step": 4400 + }, + { + "epoch": 0.623062221278403, + "grad_norm": 8.549661854978398, + "learning_rate": 4.030718188520096e-06, + "loss": 1.1736, + "step": 4401 + }, + { + "epoch": 0.6232037941530403, + "grad_norm": 10.075012767395563, + "learning_rate": 4.030264956369158e-06, + "loss": 1.4943, + "step": 4402 + }, + { + "epoch": 0.6233453670276775, + "grad_norm": 11.016665232355791, + "learning_rate": 4.029811643773332e-06, + "loss": 1.3043, + "step": 4403 + }, + { + "epoch": 0.6234869399023147, + "grad_norm": 11.725973685042309, + "learning_rate": 4.029358250756448e-06, + "loss": 1.2226, + "step": 4404 + }, + { + "epoch": 0.623628512776952, + "grad_norm": 11.92602690747642, + "learning_rate": 4.028904777342341e-06, + "loss": 1.2731, + "step": 4405 + }, + { + "epoch": 0.6237700856515892, + "grad_norm": 11.967686060536293, + "learning_rate": 4.02845122355485e-06, + "loss": 1.4376, + "step": 4406 + }, + { + "epoch": 0.6239116585262263, + "grad_norm": 10.688182429784016, + "learning_rate": 4.027997589417818e-06, + "loss": 1.4111, + "step": 4407 + }, + { + "epoch": 0.6240532314008636, + "grad_norm": 11.467711523785965, + "learning_rate": 4.027543874955092e-06, + "loss": 1.2651, + "step": 4408 + }, + { + "epoch": 0.6241948042755008, + "grad_norm": 10.433258839626689, + "learning_rate": 4.027090080190524e-06, + "loss": 1.2079, + "step": 4409 + }, + { + "epoch": 0.624336377150138, + "grad_norm": 11.481852268626117, + "learning_rate": 4.026636205147969e-06, + "loss": 1.4114, + "step": 4410 + }, + { + "epoch": 0.6244779500247752, + "grad_norm": 11.123649600932957, + "learning_rate": 4.026182249851287e-06, + "loss": 1.28, + "step": 4411 + }, + { + "epoch": 0.6246195228994125, + "grad_norm": 10.145505133509248, + "learning_rate": 4.025728214324341e-06, + "loss": 
1.2318, + "step": 4412 + }, + { + "epoch": 0.6247610957740497, + "grad_norm": 9.781870861884947, + "learning_rate": 4.025274098591001e-06, + "loss": 1.2932, + "step": 4413 + }, + { + "epoch": 0.6249026686486869, + "grad_norm": 10.08448692427527, + "learning_rate": 4.02481990267514e-06, + "loss": 1.2873, + "step": 4414 + }, + { + "epoch": 0.6250442415233242, + "grad_norm": 7.828986997980695, + "learning_rate": 4.024365626600632e-06, + "loss": 1.2186, + "step": 4415 + }, + { + "epoch": 0.6251858143979614, + "grad_norm": 9.27869274731743, + "learning_rate": 4.023911270391361e-06, + "loss": 1.2698, + "step": 4416 + }, + { + "epoch": 0.6253273872725986, + "grad_norm": 10.657578729476972, + "learning_rate": 4.02345683407121e-06, + "loss": 1.3076, + "step": 4417 + }, + { + "epoch": 0.6254689601472357, + "grad_norm": 11.769588294335705, + "learning_rate": 4.02300231766407e-06, + "loss": 1.2803, + "step": 4418 + }, + { + "epoch": 0.625610533021873, + "grad_norm": 10.233733877819535, + "learning_rate": 4.022547721193833e-06, + "loss": 1.1996, + "step": 4419 + }, + { + "epoch": 0.6257521058965102, + "grad_norm": 10.409854299091814, + "learning_rate": 4.022093044684397e-06, + "loss": 1.4227, + "step": 4420 + }, + { + "epoch": 0.6258936787711474, + "grad_norm": 9.251183099091174, + "learning_rate": 4.021638288159666e-06, + "loss": 1.2448, + "step": 4421 + }, + { + "epoch": 0.6260352516457847, + "grad_norm": 9.291910849699835, + "learning_rate": 4.021183451643544e-06, + "loss": 1.3266, + "step": 4422 + }, + { + "epoch": 0.6261768245204219, + "grad_norm": 9.08552730090275, + "learning_rate": 4.020728535159942e-06, + "loss": 1.2715, + "step": 4423 + }, + { + "epoch": 0.6263183973950591, + "grad_norm": 8.493141548869925, + "learning_rate": 4.020273538732775e-06, + "loss": 1.3049, + "step": 4424 + }, + { + "epoch": 0.6264599702696964, + "grad_norm": 10.482063231881783, + "learning_rate": 4.019818462385962e-06, + "loss": 1.3682, + "step": 4425 + }, + { + "epoch": 0.6266015431443336, + "grad_norm": 9.497973978938342, + "learning_rate": 4.019363306143425e-06, + "loss": 1.2477, + "step": 4426 + }, + { + "epoch": 0.6267431160189708, + "grad_norm": 8.90372300019268, + "learning_rate": 4.018908070029093e-06, + "loss": 1.2161, + "step": 4427 + }, + { + "epoch": 0.6268846888936079, + "grad_norm": 9.09937062864253, + "learning_rate": 4.018452754066895e-06, + "loss": 1.2541, + "step": 4428 + }, + { + "epoch": 0.6270262617682452, + "grad_norm": 8.838147656828678, + "learning_rate": 4.017997358280769e-06, + "loss": 1.302, + "step": 4429 + }, + { + "epoch": 0.6271678346428824, + "grad_norm": 12.753683717430734, + "learning_rate": 4.017541882694653e-06, + "loss": 1.3656, + "step": 4430 + }, + { + "epoch": 0.6273094075175196, + "grad_norm": 10.017429802165253, + "learning_rate": 4.017086327332492e-06, + "loss": 1.2565, + "step": 4431 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 11.503099106871156, + "learning_rate": 4.0166306922182335e-06, + "loss": 1.3535, + "step": 4432 + }, + { + "epoch": 0.6275925532667941, + "grad_norm": 10.404212729326993, + "learning_rate": 4.016174977375831e-06, + "loss": 1.4475, + "step": 4433 + }, + { + "epoch": 0.6277341261414313, + "grad_norm": 7.578385244403827, + "learning_rate": 4.01571918282924e-06, + "loss": 1.3443, + "step": 4434 + }, + { + "epoch": 0.6278756990160685, + "grad_norm": 8.271647864012978, + "learning_rate": 4.015263308602422e-06, + "loss": 1.1755, + "step": 4435 + }, + { + "epoch": 0.6280172718907058, + "grad_norm": 11.524885369845808, + "learning_rate": 
4.014807354719342e-06, + "loss": 1.4746, + "step": 4436 + }, + { + "epoch": 0.628158844765343, + "grad_norm": 9.406298260232301, + "learning_rate": 4.014351321203969e-06, + "loss": 1.4196, + "step": 4437 + }, + { + "epoch": 0.6283004176399801, + "grad_norm": 9.374548735884028, + "learning_rate": 4.013895208080275e-06, + "loss": 1.2803, + "step": 4438 + }, + { + "epoch": 0.6284419905146174, + "grad_norm": 10.284823182273238, + "learning_rate": 4.013439015372239e-06, + "loss": 1.2783, + "step": 4439 + }, + { + "epoch": 0.6285835633892546, + "grad_norm": 10.43897753096968, + "learning_rate": 4.012982743103844e-06, + "loss": 1.3695, + "step": 4440 + }, + { + "epoch": 0.6287251362638918, + "grad_norm": 8.799163900016607, + "learning_rate": 4.012526391299073e-06, + "loss": 1.1853, + "step": 4441 + }, + { + "epoch": 0.628866709138529, + "grad_norm": 8.66641376811064, + "learning_rate": 4.012069959981917e-06, + "loss": 1.3673, + "step": 4442 + }, + { + "epoch": 0.6290082820131663, + "grad_norm": 9.568712796787368, + "learning_rate": 4.0116134491763716e-06, + "loss": 1.3232, + "step": 4443 + }, + { + "epoch": 0.6291498548878035, + "grad_norm": 9.12903077445023, + "learning_rate": 4.0111568589064335e-06, + "loss": 1.1527, + "step": 4444 + }, + { + "epoch": 0.6292914277624407, + "grad_norm": 9.48670591466438, + "learning_rate": 4.010700189196106e-06, + "loss": 1.2234, + "step": 4445 + }, + { + "epoch": 0.629433000637078, + "grad_norm": 8.116013942324873, + "learning_rate": 4.010243440069397e-06, + "loss": 1.247, + "step": 4446 + }, + { + "epoch": 0.6295745735117152, + "grad_norm": 10.827171294693223, + "learning_rate": 4.0097866115503156e-06, + "loss": 1.27, + "step": 4447 + }, + { + "epoch": 0.6297161463863524, + "grad_norm": 8.804276328991843, + "learning_rate": 4.009329703662878e-06, + "loss": 1.2204, + "step": 4448 + }, + { + "epoch": 0.6298577192609895, + "grad_norm": 9.27141751509756, + "learning_rate": 4.008872716431104e-06, + "loss": 1.2983, + "step": 4449 + }, + { + "epoch": 0.6299992921356268, + "grad_norm": 7.840256357478151, + "learning_rate": 4.008415649879015e-06, + "loss": 1.2159, + "step": 4450 + }, + { + "epoch": 0.630140865010264, + "grad_norm": 9.115769407575526, + "learning_rate": 4.007958504030641e-06, + "loss": 1.2202, + "step": 4451 + }, + { + "epoch": 0.6302824378849012, + "grad_norm": 9.649540122345412, + "learning_rate": 4.007501278910013e-06, + "loss": 1.1243, + "step": 4452 + }, + { + "epoch": 0.6304240107595385, + "grad_norm": 11.260927784506528, + "learning_rate": 4.007043974541166e-06, + "loss": 1.3887, + "step": 4453 + }, + { + "epoch": 0.6305655836341757, + "grad_norm": 9.371582629103854, + "learning_rate": 4.006586590948141e-06, + "loss": 1.0903, + "step": 4454 + }, + { + "epoch": 0.6307071565088129, + "grad_norm": 9.773959334215307, + "learning_rate": 4.006129128154983e-06, + "loss": 1.2734, + "step": 4455 + }, + { + "epoch": 0.6308487293834502, + "grad_norm": 9.232998869676168, + "learning_rate": 4.00567158618574e-06, + "loss": 1.3845, + "step": 4456 + }, + { + "epoch": 0.6309903022580874, + "grad_norm": 11.92470771863213, + "learning_rate": 4.0052139650644625e-06, + "loss": 1.4115, + "step": 4457 + }, + { + "epoch": 0.6311318751327246, + "grad_norm": 10.621958218658449, + "learning_rate": 4.004756264815211e-06, + "loss": 1.4166, + "step": 4458 + }, + { + "epoch": 0.6312734480073617, + "grad_norm": 9.669514554451531, + "learning_rate": 4.004298485462044e-06, + "loss": 1.4698, + "step": 4459 + }, + { + "epoch": 0.631415020881999, + "grad_norm": 8.072225220304759, + 
"learning_rate": 4.003840627029028e-06, + "loss": 1.2047, + "step": 4460 + }, + { + "epoch": 0.6315565937566362, + "grad_norm": 9.158387022639156, + "learning_rate": 4.00338268954023e-06, + "loss": 1.3197, + "step": 4461 + }, + { + "epoch": 0.6316981666312734, + "grad_norm": 12.285420182975656, + "learning_rate": 4.002924673019726e-06, + "loss": 1.3133, + "step": 4462 + }, + { + "epoch": 0.6318397395059107, + "grad_norm": 11.332261502314353, + "learning_rate": 4.002466577491593e-06, + "loss": 1.4357, + "step": 4463 + }, + { + "epoch": 0.6319813123805479, + "grad_norm": 8.076309560872891, + "learning_rate": 4.002008402979911e-06, + "loss": 1.3517, + "step": 4464 + }, + { + "epoch": 0.6321228852551851, + "grad_norm": 8.863774036680981, + "learning_rate": 4.001550149508768e-06, + "loss": 1.3846, + "step": 4465 + }, + { + "epoch": 0.6322644581298223, + "grad_norm": 7.941229957616521, + "learning_rate": 4.001091817102253e-06, + "loss": 1.367, + "step": 4466 + }, + { + "epoch": 0.6324060310044596, + "grad_norm": 11.668571607291621, + "learning_rate": 4.000633405784461e-06, + "loss": 1.2163, + "step": 4467 + }, + { + "epoch": 0.6325476038790968, + "grad_norm": 10.807793623822427, + "learning_rate": 4.000174915579489e-06, + "loss": 1.1841, + "step": 4468 + }, + { + "epoch": 0.632689176753734, + "grad_norm": 8.982912160121842, + "learning_rate": 3.999716346511442e-06, + "loss": 1.2435, + "step": 4469 + }, + { + "epoch": 0.6328307496283712, + "grad_norm": 10.236803072541548, + "learning_rate": 3.999257698604423e-06, + "loss": 1.4105, + "step": 4470 + }, + { + "epoch": 0.6329723225030084, + "grad_norm": 8.06798278328643, + "learning_rate": 3.998798971882545e-06, + "loss": 1.2638, + "step": 4471 + }, + { + "epoch": 0.6331138953776456, + "grad_norm": 7.203514767111955, + "learning_rate": 3.998340166369923e-06, + "loss": 1.1306, + "step": 4472 + }, + { + "epoch": 0.6332554682522828, + "grad_norm": 10.393339929617337, + "learning_rate": 3.997881282090676e-06, + "loss": 1.2977, + "step": 4473 + }, + { + "epoch": 0.6333970411269201, + "grad_norm": 10.344383566085737, + "learning_rate": 3.997422319068926e-06, + "loss": 1.3082, + "step": 4474 + }, + { + "epoch": 0.6335386140015573, + "grad_norm": 7.95229828440498, + "learning_rate": 3.996963277328802e-06, + "loss": 1.1325, + "step": 4475 + }, + { + "epoch": 0.6336801868761945, + "grad_norm": 10.863500368609975, + "learning_rate": 3.996504156894434e-06, + "loss": 1.3121, + "step": 4476 + }, + { + "epoch": 0.6338217597508318, + "grad_norm": 9.411853831935074, + "learning_rate": 3.996044957789959e-06, + "loss": 1.2073, + "step": 4477 + }, + { + "epoch": 0.633963332625469, + "grad_norm": 9.085565508560904, + "learning_rate": 3.995585680039515e-06, + "loss": 1.4125, + "step": 4478 + }, + { + "epoch": 0.6341049055001062, + "grad_norm": 9.45590295249336, + "learning_rate": 3.995126323667248e-06, + "loss": 1.3561, + "step": 4479 + }, + { + "epoch": 0.6342464783747433, + "grad_norm": 10.933386503260895, + "learning_rate": 3.994666888697304e-06, + "loss": 1.2764, + "step": 4480 + }, + { + "epoch": 0.6343880512493806, + "grad_norm": 10.147579306597777, + "learning_rate": 3.994207375153836e-06, + "loss": 1.3019, + "step": 4481 + }, + { + "epoch": 0.6345296241240178, + "grad_norm": 9.289153667288112, + "learning_rate": 3.993747783061001e-06, + "loss": 1.4612, + "step": 4482 + }, + { + "epoch": 0.634671196998655, + "grad_norm": 10.569251436489454, + "learning_rate": 3.99328811244296e-06, + "loss": 1.3308, + "step": 4483 + }, + { + "epoch": 0.6348127698732923, + 
"grad_norm": 12.556287453473571, + "learning_rate": 3.9928283633238755e-06, + "loss": 1.3408, + "step": 4484 + }, + { + "epoch": 0.6349543427479295, + "grad_norm": 8.894221163484776, + "learning_rate": 3.992368535727917e-06, + "loss": 1.1773, + "step": 4485 + }, + { + "epoch": 0.6350959156225667, + "grad_norm": 10.943097758395686, + "learning_rate": 3.991908629679257e-06, + "loss": 1.3305, + "step": 4486 + }, + { + "epoch": 0.635237488497204, + "grad_norm": 9.905810999243544, + "learning_rate": 3.991448645202073e-06, + "loss": 1.2113, + "step": 4487 + }, + { + "epoch": 0.6353790613718412, + "grad_norm": 11.318922717052597, + "learning_rate": 3.990988582320546e-06, + "loss": 1.2476, + "step": 4488 + }, + { + "epoch": 0.6355206342464784, + "grad_norm": 10.109773088544346, + "learning_rate": 3.990528441058861e-06, + "loss": 1.1969, + "step": 4489 + }, + { + "epoch": 0.6356622071211155, + "grad_norm": 7.2795064909784415, + "learning_rate": 3.990068221441207e-06, + "loss": 1.2358, + "step": 4490 + }, + { + "epoch": 0.6358037799957528, + "grad_norm": 8.193110982155556, + "learning_rate": 3.989607923491777e-06, + "loss": 1.1882, + "step": 4491 + }, + { + "epoch": 0.63594535287039, + "grad_norm": 9.34835650845536, + "learning_rate": 3.98914754723477e-06, + "loss": 1.3959, + "step": 4492 + }, + { + "epoch": 0.6360869257450272, + "grad_norm": 9.646380173498251, + "learning_rate": 3.988687092694386e-06, + "loss": 1.354, + "step": 4493 + }, + { + "epoch": 0.6362284986196645, + "grad_norm": 10.527208950899494, + "learning_rate": 3.988226559894832e-06, + "loss": 1.2932, + "step": 4494 + }, + { + "epoch": 0.6363700714943017, + "grad_norm": 9.236564157284928, + "learning_rate": 3.9877659488603186e-06, + "loss": 1.2389, + "step": 4495 + }, + { + "epoch": 0.6365116443689389, + "grad_norm": 8.212324175991961, + "learning_rate": 3.9873052596150565e-06, + "loss": 1.3639, + "step": 4496 + }, + { + "epoch": 0.6366532172435762, + "grad_norm": 10.585008103228365, + "learning_rate": 3.986844492183267e-06, + "loss": 1.2861, + "step": 4497 + }, + { + "epoch": 0.6367947901182134, + "grad_norm": 9.748597655560516, + "learning_rate": 3.986383646589171e-06, + "loss": 1.1851, + "step": 4498 + }, + { + "epoch": 0.6369363629928506, + "grad_norm": 8.370581073165793, + "learning_rate": 3.985922722856996e-06, + "loss": 1.4422, + "step": 4499 + }, + { + "epoch": 0.6370779358674878, + "grad_norm": 8.164878286095746, + "learning_rate": 3.9854617210109705e-06, + "loss": 1.3022, + "step": 4500 + }, + { + "epoch": 0.637219508742125, + "grad_norm": 7.482927345641961, + "learning_rate": 3.985000641075329e-06, + "loss": 1.2614, + "step": 4501 + }, + { + "epoch": 0.6373610816167622, + "grad_norm": 9.260509397530884, + "learning_rate": 3.984539483074313e-06, + "loss": 1.3309, + "step": 4502 + }, + { + "epoch": 0.6375026544913994, + "grad_norm": 8.636141008491853, + "learning_rate": 3.984078247032162e-06, + "loss": 1.242, + "step": 4503 + }, + { + "epoch": 0.6376442273660367, + "grad_norm": 7.992221149810026, + "learning_rate": 3.983616932973124e-06, + "loss": 1.3112, + "step": 4504 + }, + { + "epoch": 0.6377858002406739, + "grad_norm": 7.9025510797536285, + "learning_rate": 3.98315554092145e-06, + "loss": 1.1481, + "step": 4505 + }, + { + "epoch": 0.6379273731153111, + "grad_norm": 7.188198685894846, + "learning_rate": 3.982694070901396e-06, + "loss": 1.3925, + "step": 4506 + }, + { + "epoch": 0.6380689459899483, + "grad_norm": 8.100737512238469, + "learning_rate": 3.98223252293722e-06, + "loss": 1.2566, + "step": 4507 + }, + { + 
"epoch": 0.6382105188645856, + "grad_norm": 10.982000710235333, + "learning_rate": 3.9817708970531855e-06, + "loss": 1.2336, + "step": 4508 + }, + { + "epoch": 0.6383520917392228, + "grad_norm": 10.084725233958215, + "learning_rate": 3.9813091932735596e-06, + "loss": 1.323, + "step": 4509 + }, + { + "epoch": 0.63849366461386, + "grad_norm": 8.383923232127488, + "learning_rate": 3.9808474116226135e-06, + "loss": 1.2865, + "step": 4510 + }, + { + "epoch": 0.6386352374884972, + "grad_norm": 8.226071885595955, + "learning_rate": 3.980385552124624e-06, + "loss": 1.1954, + "step": 4511 + }, + { + "epoch": 0.6387768103631344, + "grad_norm": 8.082549014498912, + "learning_rate": 3.979923614803869e-06, + "loss": 1.4014, + "step": 4512 + }, + { + "epoch": 0.6389183832377716, + "grad_norm": 8.504475032655142, + "learning_rate": 3.979461599684633e-06, + "loss": 1.2166, + "step": 4513 + }, + { + "epoch": 0.6390599561124088, + "grad_norm": 7.697082501865713, + "learning_rate": 3.978999506791205e-06, + "loss": 1.2058, + "step": 4514 + }, + { + "epoch": 0.6392015289870461, + "grad_norm": 8.27877321452279, + "learning_rate": 3.978537336147875e-06, + "loss": 1.1736, + "step": 4515 + }, + { + "epoch": 0.6393431018616833, + "grad_norm": 10.339033175042475, + "learning_rate": 3.97807508777894e-06, + "loss": 1.1608, + "step": 4516 + }, + { + "epoch": 0.6394846747363205, + "grad_norm": 9.005202591061163, + "learning_rate": 3.977612761708699e-06, + "loss": 1.2999, + "step": 4517 + }, + { + "epoch": 0.6396262476109578, + "grad_norm": 8.041114063415552, + "learning_rate": 3.977150357961457e-06, + "loss": 1.1359, + "step": 4518 + }, + { + "epoch": 0.639767820485595, + "grad_norm": 8.760230922355202, + "learning_rate": 3.976687876561523e-06, + "loss": 1.2871, + "step": 4519 + }, + { + "epoch": 0.6399093933602322, + "grad_norm": 10.793807790631337, + "learning_rate": 3.976225317533208e-06, + "loss": 1.278, + "step": 4520 + }, + { + "epoch": 0.6400509662348693, + "grad_norm": 9.946114507013334, + "learning_rate": 3.9757626809008274e-06, + "loss": 1.3256, + "step": 4521 + }, + { + "epoch": 0.6401925391095066, + "grad_norm": 8.377708794095906, + "learning_rate": 3.975299966688705e-06, + "loss": 1.175, + "step": 4522 + }, + { + "epoch": 0.6403341119841438, + "grad_norm": 8.167464340594504, + "learning_rate": 3.974837174921162e-06, + "loss": 1.2688, + "step": 4523 + }, + { + "epoch": 0.640475684858781, + "grad_norm": 9.493890102575717, + "learning_rate": 3.974374305622529e-06, + "loss": 1.1958, + "step": 4524 + }, + { + "epoch": 0.6406172577334183, + "grad_norm": 7.390338271145841, + "learning_rate": 3.973911358817139e-06, + "loss": 1.1987, + "step": 4525 + }, + { + "epoch": 0.6407588306080555, + "grad_norm": 9.730456204829327, + "learning_rate": 3.973448334529326e-06, + "loss": 1.1872, + "step": 4526 + }, + { + "epoch": 0.6409004034826927, + "grad_norm": 10.135041332188823, + "learning_rate": 3.972985232783434e-06, + "loss": 1.3345, + "step": 4527 + }, + { + "epoch": 0.64104197635733, + "grad_norm": 10.050203569922104, + "learning_rate": 3.972522053603806e-06, + "loss": 1.2604, + "step": 4528 + }, + { + "epoch": 0.6411835492319672, + "grad_norm": 9.80803292029676, + "learning_rate": 3.972058797014792e-06, + "loss": 1.246, + "step": 4529 + }, + { + "epoch": 0.6413251221066044, + "grad_norm": 8.94614616407338, + "learning_rate": 3.971595463040744e-06, + "loss": 1.2121, + "step": 4530 + }, + { + "epoch": 0.6414666949812416, + "grad_norm": 12.27530159201006, + "learning_rate": 3.97113205170602e-06, + "loss": 1.1771, + "step": 
4531 + }, + { + "epoch": 0.6416082678558788, + "grad_norm": 11.287395800408442, + "learning_rate": 3.970668563034982e-06, + "loss": 1.289, + "step": 4532 + }, + { + "epoch": 0.641749840730516, + "grad_norm": 8.999899545744725, + "learning_rate": 3.9702049970519925e-06, + "loss": 1.2248, + "step": 4533 + }, + { + "epoch": 0.6418914136051532, + "grad_norm": 9.297963516239264, + "learning_rate": 3.969741353781424e-06, + "loss": 1.2605, + "step": 4534 + }, + { + "epoch": 0.6420329864797905, + "grad_norm": 8.791894509013241, + "learning_rate": 3.969277633247648e-06, + "loss": 1.2902, + "step": 4535 + }, + { + "epoch": 0.6421745593544277, + "grad_norm": 9.192746279947057, + "learning_rate": 3.968813835475043e-06, + "loss": 1.1481, + "step": 4536 + }, + { + "epoch": 0.6423161322290649, + "grad_norm": 8.710676057595707, + "learning_rate": 3.968349960487988e-06, + "loss": 1.288, + "step": 4537 + }, + { + "epoch": 0.6424577051037021, + "grad_norm": 7.43571643119981, + "learning_rate": 3.967886008310872e-06, + "loss": 1.1648, + "step": 4538 + }, + { + "epoch": 0.6425992779783394, + "grad_norm": 9.329103146998845, + "learning_rate": 3.967421978968083e-06, + "loss": 1.2601, + "step": 4539 + }, + { + "epoch": 0.6427408508529766, + "grad_norm": 10.373412596817337, + "learning_rate": 3.966957872484013e-06, + "loss": 1.2354, + "step": 4540 + }, + { + "epoch": 0.6428824237276138, + "grad_norm": 10.495111689961456, + "learning_rate": 3.966493688883064e-06, + "loss": 1.2922, + "step": 4541 + }, + { + "epoch": 0.643023996602251, + "grad_norm": 11.185066512570454, + "learning_rate": 3.966029428189634e-06, + "loss": 1.3681, + "step": 4542 + }, + { + "epoch": 0.6431655694768882, + "grad_norm": 8.456977572722263, + "learning_rate": 3.965565090428129e-06, + "loss": 1.2131, + "step": 4543 + }, + { + "epoch": 0.6433071423515254, + "grad_norm": 8.71356462626483, + "learning_rate": 3.965100675622962e-06, + "loss": 1.3791, + "step": 4544 + }, + { + "epoch": 0.6434487152261626, + "grad_norm": 10.646160282457034, + "learning_rate": 3.9646361837985435e-06, + "loss": 1.2969, + "step": 4545 + }, + { + "epoch": 0.6435902881007999, + "grad_norm": 11.544338169614319, + "learning_rate": 3.964171614979294e-06, + "loss": 1.2989, + "step": 4546 + }, + { + "epoch": 0.6437318609754371, + "grad_norm": 8.382203605274299, + "learning_rate": 3.963706969189634e-06, + "loss": 1.13, + "step": 4547 + }, + { + "epoch": 0.6438734338500743, + "grad_norm": 11.14617516299471, + "learning_rate": 3.963242246453989e-06, + "loss": 1.3873, + "step": 4548 + }, + { + "epoch": 0.6440150067247116, + "grad_norm": 8.286849061652171, + "learning_rate": 3.962777446796791e-06, + "loss": 1.3765, + "step": 4549 + }, + { + "epoch": 0.6441565795993488, + "grad_norm": 9.197138913639249, + "learning_rate": 3.962312570242473e-06, + "loss": 1.3262, + "step": 4550 + }, + { + "epoch": 0.644298152473986, + "grad_norm": 9.135437313275439, + "learning_rate": 3.961847616815474e-06, + "loss": 1.2912, + "step": 4551 + }, + { + "epoch": 0.6444397253486231, + "grad_norm": 9.019064315003195, + "learning_rate": 3.961382586540236e-06, + "loss": 1.3678, + "step": 4552 + }, + { + "epoch": 0.6445812982232604, + "grad_norm": 11.228449886472635, + "learning_rate": 3.960917479441204e-06, + "loss": 1.3546, + "step": 4553 + }, + { + "epoch": 0.6447228710978976, + "grad_norm": 8.524037201061368, + "learning_rate": 3.96045229554283e-06, + "loss": 1.2301, + "step": 4554 + }, + { + "epoch": 0.6448644439725348, + "grad_norm": 10.986792872114505, + "learning_rate": 3.959987034869568e-06, + 
"loss": 1.2826, + "step": 4555 + }, + { + "epoch": 0.6450060168471721, + "grad_norm": 8.755498411935568, + "learning_rate": 3.959521697445876e-06, + "loss": 1.1196, + "step": 4556 + }, + { + "epoch": 0.6451475897218093, + "grad_norm": 8.305228588028992, + "learning_rate": 3.9590562832962174e-06, + "loss": 1.2003, + "step": 4557 + }, + { + "epoch": 0.6452891625964465, + "grad_norm": 9.141973725282194, + "learning_rate": 3.958590792445057e-06, + "loss": 1.0965, + "step": 4558 + }, + { + "epoch": 0.6454307354710838, + "grad_norm": 7.296154828034984, + "learning_rate": 3.958125224916866e-06, + "loss": 1.3694, + "step": 4559 + }, + { + "epoch": 0.645572308345721, + "grad_norm": 8.54069649994083, + "learning_rate": 3.95765958073612e-06, + "loss": 1.2157, + "step": 4560 + }, + { + "epoch": 0.6457138812203582, + "grad_norm": 8.535806075091122, + "learning_rate": 3.957193859927295e-06, + "loss": 1.1252, + "step": 4561 + }, + { + "epoch": 0.6458554540949955, + "grad_norm": 8.37248898341131, + "learning_rate": 3.9567280625148776e-06, + "loss": 1.2161, + "step": 4562 + }, + { + "epoch": 0.6459970269696326, + "grad_norm": 9.247824335693648, + "learning_rate": 3.956262188523351e-06, + "loss": 1.335, + "step": 4563 + }, + { + "epoch": 0.6461385998442698, + "grad_norm": 8.42045714381008, + "learning_rate": 3.955796237977207e-06, + "loss": 1.3997, + "step": 4564 + }, + { + "epoch": 0.646280172718907, + "grad_norm": 8.951732906744448, + "learning_rate": 3.955330210900941e-06, + "loss": 1.308, + "step": 4565 + }, + { + "epoch": 0.6464217455935443, + "grad_norm": 7.34964492874406, + "learning_rate": 3.95486410731905e-06, + "loss": 1.1371, + "step": 4566 + }, + { + "epoch": 0.6465633184681815, + "grad_norm": 9.438853589937983, + "learning_rate": 3.954397927256037e-06, + "loss": 1.2917, + "step": 4567 + }, + { + "epoch": 0.6467048913428187, + "grad_norm": 9.596369183898245, + "learning_rate": 3.953931670736411e-06, + "loss": 1.062, + "step": 4568 + }, + { + "epoch": 0.646846464217456, + "grad_norm": 7.392884116850849, + "learning_rate": 3.953465337784681e-06, + "loss": 1.2698, + "step": 4569 + }, + { + "epoch": 0.6469880370920932, + "grad_norm": 8.37752847792459, + "learning_rate": 3.952998928425361e-06, + "loss": 1.2358, + "step": 4570 + }, + { + "epoch": 0.6471296099667304, + "grad_norm": 9.829551702388155, + "learning_rate": 3.9525324426829716e-06, + "loss": 1.2021, + "step": 4571 + }, + { + "epoch": 0.6472711828413676, + "grad_norm": 8.378887583749604, + "learning_rate": 3.952065880582034e-06, + "loss": 1.2161, + "step": 4572 + }, + { + "epoch": 0.6474127557160048, + "grad_norm": 9.996242198609522, + "learning_rate": 3.951599242147076e-06, + "loss": 1.4276, + "step": 4573 + }, + { + "epoch": 0.647554328590642, + "grad_norm": 7.370737686816128, + "learning_rate": 3.951132527402629e-06, + "loss": 1.2587, + "step": 4574 + }, + { + "epoch": 0.6476959014652792, + "grad_norm": 7.792620948927396, + "learning_rate": 3.950665736373226e-06, + "loss": 1.1225, + "step": 4575 + }, + { + "epoch": 0.6478374743399165, + "grad_norm": 10.221100423137583, + "learning_rate": 3.950198869083407e-06, + "loss": 1.3911, + "step": 4576 + }, + { + "epoch": 0.6479790472145537, + "grad_norm": 9.008337185906202, + "learning_rate": 3.949731925557715e-06, + "loss": 1.2453, + "step": 4577 + }, + { + "epoch": 0.6481206200891909, + "grad_norm": 8.818027791731108, + "learning_rate": 3.949264905820697e-06, + "loss": 1.3209, + "step": 4578 + }, + { + "epoch": 0.6482621929638281, + "grad_norm": 8.400859479984259, + "learning_rate": 
3.948797809896903e-06, + "loss": 1.3933, + "step": 4579 + }, + { + "epoch": 0.6484037658384654, + "grad_norm": 8.540756797401697, + "learning_rate": 3.948330637810888e-06, + "loss": 1.1702, + "step": 4580 + }, + { + "epoch": 0.6485453387131026, + "grad_norm": 10.753573156505897, + "learning_rate": 3.947863389587212e-06, + "loss": 1.2679, + "step": 4581 + }, + { + "epoch": 0.6486869115877398, + "grad_norm": 9.196627488015393, + "learning_rate": 3.947396065250437e-06, + "loss": 1.1645, + "step": 4582 + }, + { + "epoch": 0.648828484462377, + "grad_norm": 8.509767193954985, + "learning_rate": 3.9469286648251304e-06, + "loss": 1.1602, + "step": 4583 + }, + { + "epoch": 0.6489700573370142, + "grad_norm": 8.003479677657937, + "learning_rate": 3.946461188335863e-06, + "loss": 1.2507, + "step": 4584 + }, + { + "epoch": 0.6491116302116514, + "grad_norm": 7.837138151488563, + "learning_rate": 3.945993635807209e-06, + "loss": 1.3798, + "step": 4585 + }, + { + "epoch": 0.6492532030862886, + "grad_norm": 8.407113069948473, + "learning_rate": 3.945526007263747e-06, + "loss": 1.289, + "step": 4586 + }, + { + "epoch": 0.6493947759609259, + "grad_norm": 7.552325253174438, + "learning_rate": 3.945058302730061e-06, + "loss": 1.3831, + "step": 4587 + }, + { + "epoch": 0.6495363488355631, + "grad_norm": 9.39706996947615, + "learning_rate": 3.944590522230738e-06, + "loss": 1.311, + "step": 4588 + }, + { + "epoch": 0.6496779217102003, + "grad_norm": 7.979705340841863, + "learning_rate": 3.9441226657903686e-06, + "loss": 1.1337, + "step": 4589 + }, + { + "epoch": 0.6498194945848376, + "grad_norm": 10.095939091784356, + "learning_rate": 3.943654733433547e-06, + "loss": 1.1595, + "step": 4590 + }, + { + "epoch": 0.6499610674594748, + "grad_norm": 8.854476193272337, + "learning_rate": 3.943186725184872e-06, + "loss": 1.3115, + "step": 4591 + }, + { + "epoch": 0.650102640334112, + "grad_norm": 9.967975072128452, + "learning_rate": 3.942718641068947e-06, + "loss": 1.2999, + "step": 4592 + }, + { + "epoch": 0.6502442132087493, + "grad_norm": 8.894950257066286, + "learning_rate": 3.94225048111038e-06, + "loss": 1.2693, + "step": 4593 + }, + { + "epoch": 0.6503857860833864, + "grad_norm": 10.949477607424397, + "learning_rate": 3.941782245333781e-06, + "loss": 1.2845, + "step": 4594 + }, + { + "epoch": 0.6505273589580236, + "grad_norm": 8.579297595316525, + "learning_rate": 3.941313933763763e-06, + "loss": 1.3784, + "step": 4595 + }, + { + "epoch": 0.6506689318326608, + "grad_norm": 9.680855521236069, + "learning_rate": 3.9408455464249466e-06, + "loss": 1.194, + "step": 4596 + }, + { + "epoch": 0.6508105047072981, + "grad_norm": 10.294165015872993, + "learning_rate": 3.9403770833419535e-06, + "loss": 1.2491, + "step": 4597 + }, + { + "epoch": 0.6509520775819353, + "grad_norm": 10.598087109399412, + "learning_rate": 3.939908544539412e-06, + "loss": 1.285, + "step": 4598 + }, + { + "epoch": 0.6510936504565725, + "grad_norm": 7.861733297012155, + "learning_rate": 3.9394399300419516e-06, + "loss": 1.3311, + "step": 4599 + }, + { + "epoch": 0.6512352233312098, + "grad_norm": 10.966459986199114, + "learning_rate": 3.938971239874208e-06, + "loss": 1.2349, + "step": 4600 + }, + { + "epoch": 0.651376796205847, + "grad_norm": 8.929810957835322, + "learning_rate": 3.938502474060818e-06, + "loss": 1.1407, + "step": 4601 + }, + { + "epoch": 0.6515183690804842, + "grad_norm": 11.06092814684765, + "learning_rate": 3.938033632626426e-06, + "loss": 1.2706, + "step": 4602 + }, + { + "epoch": 0.6516599419551214, + "grad_norm": 
8.566818596151958, + "learning_rate": 3.937564715595678e-06, + "loss": 1.2413, + "step": 4603 + }, + { + "epoch": 0.6518015148297586, + "grad_norm": 10.213578061524162, + "learning_rate": 3.937095722993225e-06, + "loss": 1.3742, + "step": 4604 + }, + { + "epoch": 0.6519430877043958, + "grad_norm": 9.928866492784712, + "learning_rate": 3.936626654843722e-06, + "loss": 1.2688, + "step": 4605 + }, + { + "epoch": 0.652084660579033, + "grad_norm": 7.009017041301434, + "learning_rate": 3.936157511171827e-06, + "loss": 1.197, + "step": 4606 + }, + { + "epoch": 0.6522262334536703, + "grad_norm": 8.583935268986389, + "learning_rate": 3.935688292002201e-06, + "loss": 1.2885, + "step": 4607 + }, + { + "epoch": 0.6523678063283075, + "grad_norm": 10.19460792883934, + "learning_rate": 3.935218997359513e-06, + "loss": 1.3376, + "step": 4608 + }, + { + "epoch": 0.6525093792029447, + "grad_norm": 13.232404433798468, + "learning_rate": 3.934749627268433e-06, + "loss": 1.4152, + "step": 4609 + }, + { + "epoch": 0.652650952077582, + "grad_norm": 11.668081215979571, + "learning_rate": 3.934280181753634e-06, + "loss": 1.3362, + "step": 4610 + }, + { + "epoch": 0.6527925249522192, + "grad_norm": 11.183078339612146, + "learning_rate": 3.9338106608397955e-06, + "loss": 1.3332, + "step": 4611 + }, + { + "epoch": 0.6529340978268564, + "grad_norm": 9.257939411049948, + "learning_rate": 3.9333410645516e-06, + "loss": 1.421, + "step": 4612 + }, + { + "epoch": 0.6530756707014936, + "grad_norm": 9.200625224222621, + "learning_rate": 3.932871392913733e-06, + "loss": 1.2629, + "step": 4613 + }, + { + "epoch": 0.6532172435761309, + "grad_norm": 8.1308882284649, + "learning_rate": 3.932401645950885e-06, + "loss": 1.374, + "step": 4614 + }, + { + "epoch": 0.653358816450768, + "grad_norm": 9.282588977751994, + "learning_rate": 3.931931823687751e-06, + "loss": 1.4547, + "step": 4615 + }, + { + "epoch": 0.6535003893254052, + "grad_norm": 11.301839790122738, + "learning_rate": 3.931461926149029e-06, + "loss": 1.33, + "step": 4616 + }, + { + "epoch": 0.6536419622000424, + "grad_norm": 11.148793696679174, + "learning_rate": 3.930991953359421e-06, + "loss": 1.3109, + "step": 4617 + }, + { + "epoch": 0.6537835350746797, + "grad_norm": 11.73103912928153, + "learning_rate": 3.930521905343632e-06, + "loss": 1.5208, + "step": 4618 + }, + { + "epoch": 0.6539251079493169, + "grad_norm": 9.841946294323677, + "learning_rate": 3.930051782126374e-06, + "loss": 1.183, + "step": 4619 + }, + { + "epoch": 0.6540666808239541, + "grad_norm": 14.472099559131497, + "learning_rate": 3.92958158373236e-06, + "loss": 1.3281, + "step": 4620 + }, + { + "epoch": 0.6542082536985914, + "grad_norm": 8.873107708422875, + "learning_rate": 3.929111310186307e-06, + "loss": 1.2404, + "step": 4621 + }, + { + "epoch": 0.6543498265732286, + "grad_norm": 10.24282921032727, + "learning_rate": 3.928640961512939e-06, + "loss": 1.3684, + "step": 4622 + }, + { + "epoch": 0.6544913994478658, + "grad_norm": 8.252328717413885, + "learning_rate": 3.9281705377369814e-06, + "loss": 1.1585, + "step": 4623 + }, + { + "epoch": 0.6546329723225031, + "grad_norm": 11.980247134810401, + "learning_rate": 3.927700038883162e-06, + "loss": 1.3393, + "step": 4624 + }, + { + "epoch": 0.6547745451971402, + "grad_norm": 12.187084953870288, + "learning_rate": 3.927229464976218e-06, + "loss": 1.3627, + "step": 4625 + }, + { + "epoch": 0.6549161180717774, + "grad_norm": 10.376033501682006, + "learning_rate": 3.9267588160408845e-06, + "loss": 1.264, + "step": 4626 + }, + { + "epoch": 
0.6550576909464146, + "grad_norm": 7.409167523900158, + "learning_rate": 3.926288092101903e-06, + "loss": 1.248, + "step": 4627 + }, + { + "epoch": 0.6551992638210519, + "grad_norm": 11.873949506372462, + "learning_rate": 3.92581729318402e-06, + "loss": 1.3143, + "step": 4628 + }, + { + "epoch": 0.6553408366956891, + "grad_norm": 8.455597633973381, + "learning_rate": 3.925346419311986e-06, + "loss": 1.2605, + "step": 4629 + }, + { + "epoch": 0.6554824095703263, + "grad_norm": 8.984553751618533, + "learning_rate": 3.924875470510553e-06, + "loss": 1.2186, + "step": 4630 + }, + { + "epoch": 0.6556239824449636, + "grad_norm": 9.111036930971114, + "learning_rate": 3.924404446804479e-06, + "loss": 1.2979, + "step": 4631 + }, + { + "epoch": 0.6557655553196008, + "grad_norm": 9.777319560247744, + "learning_rate": 3.923933348218525e-06, + "loss": 1.2109, + "step": 4632 + }, + { + "epoch": 0.655907128194238, + "grad_norm": 10.421919289284642, + "learning_rate": 3.923462174777458e-06, + "loss": 1.2298, + "step": 4633 + }, + { + "epoch": 0.6560487010688753, + "grad_norm": 7.759112999157275, + "learning_rate": 3.922990926506044e-06, + "loss": 1.1936, + "step": 4634 + }, + { + "epoch": 0.6561902739435124, + "grad_norm": 8.203826234908814, + "learning_rate": 3.922519603429059e-06, + "loss": 1.3227, + "step": 4635 + }, + { + "epoch": 0.6563318468181496, + "grad_norm": 11.33607614276003, + "learning_rate": 3.922048205571279e-06, + "loss": 1.3759, + "step": 4636 + }, + { + "epoch": 0.6564734196927868, + "grad_norm": 8.75694217491944, + "learning_rate": 3.921576732957486e-06, + "loss": 1.2932, + "step": 4637 + }, + { + "epoch": 0.6566149925674241, + "grad_norm": 10.196205959414359, + "learning_rate": 3.9211051856124625e-06, + "loss": 1.3996, + "step": 4638 + }, + { + "epoch": 0.6567565654420613, + "grad_norm": 9.274089478581619, + "learning_rate": 3.920633563560999e-06, + "loss": 1.2138, + "step": 4639 + }, + { + "epoch": 0.6568981383166985, + "grad_norm": 9.49254194940074, + "learning_rate": 3.92016186682789e-06, + "loss": 1.1334, + "step": 4640 + }, + { + "epoch": 0.6570397111913358, + "grad_norm": 9.033510763342555, + "learning_rate": 3.919690095437929e-06, + "loss": 1.2722, + "step": 4641 + }, + { + "epoch": 0.657181284065973, + "grad_norm": 9.76655190913598, + "learning_rate": 3.9192182494159196e-06, + "loss": 1.247, + "step": 4642 + }, + { + "epoch": 0.6573228569406102, + "grad_norm": 9.152801801784737, + "learning_rate": 3.918746328786665e-06, + "loss": 1.3533, + "step": 4643 + }, + { + "epoch": 0.6574644298152474, + "grad_norm": 8.640647074169362, + "learning_rate": 3.918274333574972e-06, + "loss": 1.217, + "step": 4644 + }, + { + "epoch": 0.6576060026898847, + "grad_norm": 9.642594938396826, + "learning_rate": 3.9178022638056565e-06, + "loss": 1.3, + "step": 4645 + }, + { + "epoch": 0.6577475755645218, + "grad_norm": 9.574504821562575, + "learning_rate": 3.9173301195035326e-06, + "loss": 1.2672, + "step": 4646 + }, + { + "epoch": 0.657889148439159, + "grad_norm": 9.426161263967236, + "learning_rate": 3.916857900693421e-06, + "loss": 1.2747, + "step": 4647 + }, + { + "epoch": 0.6580307213137963, + "grad_norm": 9.189313437102497, + "learning_rate": 3.916385607400146e-06, + "loss": 1.3363, + "step": 4648 + }, + { + "epoch": 0.6581722941884335, + "grad_norm": 8.093173091167255, + "learning_rate": 3.915913239648535e-06, + "loss": 1.1261, + "step": 4649 + }, + { + "epoch": 0.6583138670630707, + "grad_norm": 7.913039597949131, + "learning_rate": 3.915440797463422e-06, + "loss": 1.2451, + "step": 4650 + }, 
+ { + "epoch": 0.6584554399377079, + "grad_norm": 7.413631318138842, + "learning_rate": 3.914968280869642e-06, + "loss": 1.1505, + "step": 4651 + }, + { + "epoch": 0.6585970128123452, + "grad_norm": 10.025152617170404, + "learning_rate": 3.9144956898920336e-06, + "loss": 1.3655, + "step": 4652 + }, + { + "epoch": 0.6587385856869824, + "grad_norm": 8.98979286371059, + "learning_rate": 3.914023024555441e-06, + "loss": 1.2845, + "step": 4653 + }, + { + "epoch": 0.6588801585616196, + "grad_norm": 8.799575309909315, + "learning_rate": 3.913550284884714e-06, + "loss": 1.3207, + "step": 4654 + }, + { + "epoch": 0.6590217314362569, + "grad_norm": 9.287413121018616, + "learning_rate": 3.913077470904701e-06, + "loss": 1.1765, + "step": 4655 + }, + { + "epoch": 0.659163304310894, + "grad_norm": 7.852379903424508, + "learning_rate": 3.912604582640259e-06, + "loss": 1.1587, + "step": 4656 + }, + { + "epoch": 0.6593048771855312, + "grad_norm": 8.19189847486885, + "learning_rate": 3.912131620116249e-06, + "loss": 1.2102, + "step": 4657 + }, + { + "epoch": 0.6594464500601684, + "grad_norm": 8.565648300656925, + "learning_rate": 3.9116585833575305e-06, + "loss": 1.1998, + "step": 4658 + }, + { + "epoch": 0.6595880229348057, + "grad_norm": 8.754304535506554, + "learning_rate": 3.911185472388974e-06, + "loss": 1.3548, + "step": 4659 + }, + { + "epoch": 0.6597295958094429, + "grad_norm": 9.265908659891755, + "learning_rate": 3.91071228723545e-06, + "loss": 1.2935, + "step": 4660 + }, + { + "epoch": 0.6598711686840801, + "grad_norm": 11.044067544047369, + "learning_rate": 3.9102390279218315e-06, + "loss": 1.3586, + "step": 4661 + }, + { + "epoch": 0.6600127415587174, + "grad_norm": 9.680372803500187, + "learning_rate": 3.909765694473e-06, + "loss": 1.1364, + "step": 4662 + }, + { + "epoch": 0.6601543144333546, + "grad_norm": 8.844968462467845, + "learning_rate": 3.909292286913836e-06, + "loss": 1.2279, + "step": 4663 + }, + { + "epoch": 0.6602958873079918, + "grad_norm": 11.758176515336904, + "learning_rate": 3.908818805269229e-06, + "loss": 1.2748, + "step": 4664 + }, + { + "epoch": 0.660437460182629, + "grad_norm": 8.570045877376154, + "learning_rate": 3.908345249564066e-06, + "loss": 1.4263, + "step": 4665 + }, + { + "epoch": 0.6605790330572662, + "grad_norm": 10.146288308680722, + "learning_rate": 3.907871619823244e-06, + "loss": 1.2092, + "step": 4666 + }, + { + "epoch": 0.6607206059319034, + "grad_norm": 10.397212872806872, + "learning_rate": 3.907397916071661e-06, + "loss": 1.5012, + "step": 4667 + }, + { + "epoch": 0.6608621788065406, + "grad_norm": 11.05734184398697, + "learning_rate": 3.90692413833422e-06, + "loss": 1.3039, + "step": 4668 + }, + { + "epoch": 0.6610037516811779, + "grad_norm": 10.163919880340673, + "learning_rate": 3.906450286635824e-06, + "loss": 1.2885, + "step": 4669 + }, + { + "epoch": 0.6611453245558151, + "grad_norm": 9.551042461918751, + "learning_rate": 3.905976361001385e-06, + "loss": 1.3176, + "step": 4670 + }, + { + "epoch": 0.6612868974304523, + "grad_norm": 8.718251203809992, + "learning_rate": 3.905502361455819e-06, + "loss": 1.2059, + "step": 4671 + }, + { + "epoch": 0.6614284703050896, + "grad_norm": 8.670378354802613, + "learning_rate": 3.9050282880240405e-06, + "loss": 1.1592, + "step": 4672 + }, + { + "epoch": 0.6615700431797268, + "grad_norm": 8.335801127926214, + "learning_rate": 3.904554140730973e-06, + "loss": 1.2916, + "step": 4673 + }, + { + "epoch": 0.661711616054364, + "grad_norm": 8.47951327528545, + "learning_rate": 3.904079919601543e-06, + "loss": 1.0678, 
+ "step": 4674 + }, + { + "epoch": 0.6618531889290012, + "grad_norm": 9.280965986947637, + "learning_rate": 3.903605624660676e-06, + "loss": 1.2001, + "step": 4675 + }, + { + "epoch": 0.6619947618036385, + "grad_norm": 10.093796106947714, + "learning_rate": 3.903131255933309e-06, + "loss": 1.4236, + "step": 4676 + }, + { + "epoch": 0.6621363346782756, + "grad_norm": 9.994122876242301, + "learning_rate": 3.902656813444378e-06, + "loss": 1.4558, + "step": 4677 + }, + { + "epoch": 0.6622779075529128, + "grad_norm": 8.271412660182028, + "learning_rate": 3.902182297218824e-06, + "loss": 1.2751, + "step": 4678 + }, + { + "epoch": 0.6624194804275501, + "grad_norm": 11.09582225088974, + "learning_rate": 3.901707707281592e-06, + "loss": 1.2278, + "step": 4679 + }, + { + "epoch": 0.6625610533021873, + "grad_norm": 11.37197265583069, + "learning_rate": 3.901233043657632e-06, + "loss": 1.1396, + "step": 4680 + }, + { + "epoch": 0.6627026261768245, + "grad_norm": 8.634188415521551, + "learning_rate": 3.900758306371895e-06, + "loss": 1.2587, + "step": 4681 + }, + { + "epoch": 0.6628441990514617, + "grad_norm": 6.909001833510338, + "learning_rate": 3.900283495449339e-06, + "loss": 1.1305, + "step": 4682 + }, + { + "epoch": 0.662985771926099, + "grad_norm": 8.721562413922255, + "learning_rate": 3.899808610914923e-06, + "loss": 1.4387, + "step": 4683 + }, + { + "epoch": 0.6631273448007362, + "grad_norm": 8.905918904892822, + "learning_rate": 3.899333652793612e-06, + "loss": 1.1504, + "step": 4684 + }, + { + "epoch": 0.6632689176753734, + "grad_norm": 12.640640089028386, + "learning_rate": 3.898858621110374e-06, + "loss": 1.4105, + "step": 4685 + }, + { + "epoch": 0.6634104905500107, + "grad_norm": 9.240920688618631, + "learning_rate": 3.898383515890182e-06, + "loss": 1.2304, + "step": 4686 + }, + { + "epoch": 0.6635520634246478, + "grad_norm": 7.7611262532569425, + "learning_rate": 3.89790833715801e-06, + "loss": 1.3515, + "step": 4687 + }, + { + "epoch": 0.663693636299285, + "grad_norm": 8.562253182745163, + "learning_rate": 3.897433084938841e-06, + "loss": 1.3841, + "step": 4688 + }, + { + "epoch": 0.6638352091739222, + "grad_norm": 8.944901822587136, + "learning_rate": 3.8969577592576555e-06, + "loss": 1.292, + "step": 4689 + }, + { + "epoch": 0.6639767820485595, + "grad_norm": 9.118050214913257, + "learning_rate": 3.896482360139443e-06, + "loss": 1.1421, + "step": 4690 + }, + { + "epoch": 0.6641183549231967, + "grad_norm": 10.32537260329973, + "learning_rate": 3.896006887609193e-06, + "loss": 1.2994, + "step": 4691 + }, + { + "epoch": 0.6642599277978339, + "grad_norm": 9.20856287973637, + "learning_rate": 3.8955313416919026e-06, + "loss": 1.4732, + "step": 4692 + }, + { + "epoch": 0.6644015006724712, + "grad_norm": 9.649592305006053, + "learning_rate": 3.89505572241257e-06, + "loss": 1.3284, + "step": 4693 + }, + { + "epoch": 0.6645430735471084, + "grad_norm": 8.540492825369933, + "learning_rate": 3.894580029796198e-06, + "loss": 1.2739, + "step": 4694 + }, + { + "epoch": 0.6646846464217456, + "grad_norm": 8.204920283458588, + "learning_rate": 3.894104263867794e-06, + "loss": 1.2845, + "step": 4695 + }, + { + "epoch": 0.6648262192963829, + "grad_norm": 10.383306280583572, + "learning_rate": 3.893628424652368e-06, + "loss": 1.2526, + "step": 4696 + }, + { + "epoch": 0.66496779217102, + "grad_norm": 11.082563062781432, + "learning_rate": 3.893152512174935e-06, + "loss": 1.248, + "step": 4697 + }, + { + "epoch": 0.6651093650456572, + "grad_norm": 9.469452986099938, + "learning_rate": 
3.892676526460513e-06, + "loss": 1.1932, + "step": 4698 + }, + { + "epoch": 0.6652509379202944, + "grad_norm": 10.380858088840466, + "learning_rate": 3.8922004675341244e-06, + "loss": 1.5706, + "step": 4699 + }, + { + "epoch": 0.6653925107949317, + "grad_norm": 10.109827046259172, + "learning_rate": 3.891724335420796e-06, + "loss": 1.3353, + "step": 4700 + }, + { + "epoch": 0.6655340836695689, + "grad_norm": 11.433855320291208, + "learning_rate": 3.891248130145556e-06, + "loss": 1.2226, + "step": 4701 + }, + { + "epoch": 0.6656756565442061, + "grad_norm": 8.960476026151422, + "learning_rate": 3.8907718517334405e-06, + "loss": 1.2614, + "step": 4702 + }, + { + "epoch": 0.6658172294188434, + "grad_norm": 9.644308564283158, + "learning_rate": 3.890295500209485e-06, + "loss": 1.1152, + "step": 4703 + }, + { + "epoch": 0.6659588022934806, + "grad_norm": 10.801067680989524, + "learning_rate": 3.8898190755987314e-06, + "loss": 1.3476, + "step": 4704 + }, + { + "epoch": 0.6661003751681178, + "grad_norm": 11.518616576309467, + "learning_rate": 3.889342577926225e-06, + "loss": 1.3235, + "step": 4705 + }, + { + "epoch": 0.666241948042755, + "grad_norm": 11.056218147124127, + "learning_rate": 3.888866007217017e-06, + "loss": 1.387, + "step": 4706 + }, + { + "epoch": 0.6663835209173923, + "grad_norm": 9.772245412161313, + "learning_rate": 3.888389363496157e-06, + "loss": 1.3894, + "step": 4707 + }, + { + "epoch": 0.6665250937920294, + "grad_norm": 10.211513932060612, + "learning_rate": 3.887912646788704e-06, + "loss": 1.238, + "step": 4708 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 9.377092051733293, + "learning_rate": 3.8874358571197164e-06, + "loss": 1.3729, + "step": 4709 + }, + { + "epoch": 0.6668082395413039, + "grad_norm": 8.944960248250041, + "learning_rate": 3.886958994514263e-06, + "loss": 1.1385, + "step": 4710 + }, + { + "epoch": 0.6669498124159411, + "grad_norm": 10.487787683335444, + "learning_rate": 3.8864820589974075e-06, + "loss": 1.1926, + "step": 4711 + }, + { + "epoch": 0.6670913852905783, + "grad_norm": 6.89901008692141, + "learning_rate": 3.886005050594225e-06, + "loss": 1.1615, + "step": 4712 + }, + { + "epoch": 0.6672329581652156, + "grad_norm": 9.964138578782789, + "learning_rate": 3.88552796932979e-06, + "loss": 1.2164, + "step": 4713 + }, + { + "epoch": 0.6673745310398528, + "grad_norm": 10.48815140457335, + "learning_rate": 3.885050815229182e-06, + "loss": 1.3762, + "step": 4714 + }, + { + "epoch": 0.66751610391449, + "grad_norm": 8.75151877165026, + "learning_rate": 3.884573588317486e-06, + "loss": 1.2617, + "step": 4715 + }, + { + "epoch": 0.6676576767891272, + "grad_norm": 8.611999001070169, + "learning_rate": 3.88409628861979e-06, + "loss": 1.1672, + "step": 4716 + }, + { + "epoch": 0.6677992496637645, + "grad_norm": 11.104187431578458, + "learning_rate": 3.883618916161183e-06, + "loss": 1.457, + "step": 4717 + }, + { + "epoch": 0.6679408225384016, + "grad_norm": 8.473591789347655, + "learning_rate": 3.883141470966761e-06, + "loss": 1.2161, + "step": 4718 + }, + { + "epoch": 0.6680823954130388, + "grad_norm": 10.49672248186681, + "learning_rate": 3.8826639530616235e-06, + "loss": 1.3224, + "step": 4719 + }, + { + "epoch": 0.668223968287676, + "grad_norm": 8.533561421366027, + "learning_rate": 3.8821863624708725e-06, + "loss": 1.2082, + "step": 4720 + }, + { + "epoch": 0.6683655411623133, + "grad_norm": 10.534537054245224, + "learning_rate": 3.881708699219616e-06, + "loss": 1.239, + "step": 4721 + }, + { + "epoch": 0.6685071140369505, + "grad_norm": 
7.541590259930702, + "learning_rate": 3.881230963332963e-06, + "loss": 1.2193, + "step": 4722 + }, + { + "epoch": 0.6686486869115877, + "grad_norm": 8.801551890083255, + "learning_rate": 3.880753154836028e-06, + "loss": 1.2039, + "step": 4723 + }, + { + "epoch": 0.668790259786225, + "grad_norm": 11.77716430729799, + "learning_rate": 3.880275273753929e-06, + "loss": 1.4025, + "step": 4724 + }, + { + "epoch": 0.6689318326608622, + "grad_norm": 8.987702763462723, + "learning_rate": 3.879797320111788e-06, + "loss": 1.3582, + "step": 4725 + }, + { + "epoch": 0.6690734055354994, + "grad_norm": 8.809895015441754, + "learning_rate": 3.879319293934732e-06, + "loss": 1.3058, + "step": 4726 + }, + { + "epoch": 0.6692149784101367, + "grad_norm": 9.009608755476444, + "learning_rate": 3.878841195247888e-06, + "loss": 1.3183, + "step": 4727 + }, + { + "epoch": 0.6693565512847738, + "grad_norm": 12.265779880590618, + "learning_rate": 3.87836302407639e-06, + "loss": 1.288, + "step": 4728 + }, + { + "epoch": 0.669498124159411, + "grad_norm": 11.517952216718108, + "learning_rate": 3.877884780445377e-06, + "loss": 1.3632, + "step": 4729 + }, + { + "epoch": 0.6696396970340482, + "grad_norm": 7.897654086086359, + "learning_rate": 3.877406464379987e-06, + "loss": 1.1174, + "step": 4730 + }, + { + "epoch": 0.6697812699086855, + "grad_norm": 9.41320259947224, + "learning_rate": 3.876928075905368e-06, + "loss": 1.2543, + "step": 4731 + }, + { + "epoch": 0.6699228427833227, + "grad_norm": 13.186361399306671, + "learning_rate": 3.876449615046665e-06, + "loss": 1.3903, + "step": 4732 + }, + { + "epoch": 0.6700644156579599, + "grad_norm": 9.53177949028747, + "learning_rate": 3.875971081829033e-06, + "loss": 1.4155, + "step": 4733 + }, + { + "epoch": 0.6702059885325972, + "grad_norm": 9.03423157152232, + "learning_rate": 3.875492476277627e-06, + "loss": 1.3188, + "step": 4734 + }, + { + "epoch": 0.6703475614072344, + "grad_norm": 8.80547252766976, + "learning_rate": 3.875013798417606e-06, + "loss": 1.2529, + "step": 4735 + }, + { + "epoch": 0.6704891342818716, + "grad_norm": 10.940343519881257, + "learning_rate": 3.874535048274136e-06, + "loss": 1.1599, + "step": 4736 + }, + { + "epoch": 0.6706307071565089, + "grad_norm": 8.179205733633644, + "learning_rate": 3.8740562258723845e-06, + "loss": 1.1824, + "step": 4737 + }, + { + "epoch": 0.6707722800311461, + "grad_norm": 10.56226488562752, + "learning_rate": 3.87357733123752e-06, + "loss": 1.4012, + "step": 4738 + }, + { + "epoch": 0.6709138529057832, + "grad_norm": 10.997049456136981, + "learning_rate": 3.87309836439472e-06, + "loss": 1.3415, + "step": 4739 + }, + { + "epoch": 0.6710554257804204, + "grad_norm": 10.250767376714947, + "learning_rate": 3.872619325369162e-06, + "loss": 1.256, + "step": 4740 + }, + { + "epoch": 0.6711969986550577, + "grad_norm": 7.77096871129424, + "learning_rate": 3.872140214186031e-06, + "loss": 1.0503, + "step": 4741 + }, + { + "epoch": 0.6713385715296949, + "grad_norm": 10.395123189294367, + "learning_rate": 3.871661030870512e-06, + "loss": 1.2908, + "step": 4742 + }, + { + "epoch": 0.6714801444043321, + "grad_norm": 12.245377271704093, + "learning_rate": 3.871181775447794e-06, + "loss": 1.3588, + "step": 4743 + }, + { + "epoch": 0.6716217172789694, + "grad_norm": 9.24244546409327, + "learning_rate": 3.870702447943073e-06, + "loss": 1.3548, + "step": 4744 + }, + { + "epoch": 0.6717632901536066, + "grad_norm": 8.16697391269404, + "learning_rate": 3.870223048381546e-06, + "loss": 1.1961, + "step": 4745 + }, + { + "epoch": 
0.6719048630282438, + "grad_norm": 7.159235302458413, + "learning_rate": 3.869743576788416e-06, + "loss": 1.1588, + "step": 4746 + }, + { + "epoch": 0.672046435902881, + "grad_norm": 7.961058012075459, + "learning_rate": 3.869264033188887e-06, + "loss": 1.2391, + "step": 4747 + }, + { + "epoch": 0.6721880087775183, + "grad_norm": 7.66129258367897, + "learning_rate": 3.868784417608169e-06, + "loss": 1.0882, + "step": 4748 + }, + { + "epoch": 0.6723295816521554, + "grad_norm": 7.844432056454443, + "learning_rate": 3.868304730071475e-06, + "loss": 1.1944, + "step": 4749 + }, + { + "epoch": 0.6724711545267926, + "grad_norm": 8.717104301869284, + "learning_rate": 3.86782497060402e-06, + "loss": 1.2575, + "step": 4750 + }, + { + "epoch": 0.6726127274014299, + "grad_norm": 10.275795089513982, + "learning_rate": 3.867345139231028e-06, + "loss": 1.3419, + "step": 4751 + }, + { + "epoch": 0.6727543002760671, + "grad_norm": 9.047092151835782, + "learning_rate": 3.86686523597772e-06, + "loss": 1.4378, + "step": 4752 + }, + { + "epoch": 0.6728958731507043, + "grad_norm": 9.094596063117345, + "learning_rate": 3.866385260869327e-06, + "loss": 1.2947, + "step": 4753 + }, + { + "epoch": 0.6730374460253415, + "grad_norm": 9.2048591055129, + "learning_rate": 3.86590521393108e-06, + "loss": 1.149, + "step": 4754 + }, + { + "epoch": 0.6731790188999788, + "grad_norm": 9.13083994817452, + "learning_rate": 3.865425095188214e-06, + "loss": 1.1344, + "step": 4755 + }, + { + "epoch": 0.673320591774616, + "grad_norm": 8.096169364213143, + "learning_rate": 3.864944904665967e-06, + "loss": 1.2516, + "step": 4756 + }, + { + "epoch": 0.6734621646492532, + "grad_norm": 8.777600599129128, + "learning_rate": 3.864464642389586e-06, + "loss": 1.3176, + "step": 4757 + }, + { + "epoch": 0.6736037375238905, + "grad_norm": 9.952646288895002, + "learning_rate": 3.863984308384317e-06, + "loss": 1.1161, + "step": 4758 + }, + { + "epoch": 0.6737453103985277, + "grad_norm": 7.307060631389395, + "learning_rate": 3.8635039026754075e-06, + "loss": 1.1842, + "step": 4759 + }, + { + "epoch": 0.6738868832731648, + "grad_norm": 9.024237738460457, + "learning_rate": 3.863023425288116e-06, + "loss": 1.2689, + "step": 4760 + }, + { + "epoch": 0.674028456147802, + "grad_norm": 7.500455206726154, + "learning_rate": 3.862542876247699e-06, + "loss": 1.2734, + "step": 4761 + }, + { + "epoch": 0.6741700290224393, + "grad_norm": 9.76164958146263, + "learning_rate": 3.862062255579419e-06, + "loss": 1.3511, + "step": 4762 + }, + { + "epoch": 0.6743116018970765, + "grad_norm": 7.188269001830398, + "learning_rate": 3.861581563308542e-06, + "loss": 1.2622, + "step": 4763 + }, + { + "epoch": 0.6744531747717137, + "grad_norm": 9.486964869830933, + "learning_rate": 3.861100799460336e-06, + "loss": 1.2705, + "step": 4764 + }, + { + "epoch": 0.674594747646351, + "grad_norm": 7.801895635067691, + "learning_rate": 3.860619964060078e-06, + "loss": 1.131, + "step": 4765 + }, + { + "epoch": 0.6747363205209882, + "grad_norm": 8.33878844684469, + "learning_rate": 3.860139057133042e-06, + "loss": 1.3314, + "step": 4766 + }, + { + "epoch": 0.6748778933956254, + "grad_norm": 8.569900321959429, + "learning_rate": 3.85965807870451e-06, + "loss": 1.3622, + "step": 4767 + }, + { + "epoch": 0.6750194662702627, + "grad_norm": 12.0173983015658, + "learning_rate": 3.859177028799766e-06, + "loss": 1.3004, + "step": 4768 + }, + { + "epoch": 0.6751610391448999, + "grad_norm": 8.833300992318694, + "learning_rate": 3.858695907444101e-06, + "loss": 1.3106, + "step": 4769 + }, + { + 
"epoch": 0.675302612019537, + "grad_norm": 7.292680273904279, + "learning_rate": 3.858214714662804e-06, + "loss": 1.3107, + "step": 4770 + }, + { + "epoch": 0.6754441848941742, + "grad_norm": 8.149689707085622, + "learning_rate": 3.857733450481172e-06, + "loss": 1.3722, + "step": 4771 + }, + { + "epoch": 0.6755857577688115, + "grad_norm": 10.553113027921103, + "learning_rate": 3.857252114924504e-06, + "loss": 1.2356, + "step": 4772 + }, + { + "epoch": 0.6757273306434487, + "grad_norm": 9.691910459601532, + "learning_rate": 3.8567707080181054e-06, + "loss": 1.3115, + "step": 4773 + }, + { + "epoch": 0.6758689035180859, + "grad_norm": 8.159470430281411, + "learning_rate": 3.856289229787283e-06, + "loss": 1.3113, + "step": 4774 + }, + { + "epoch": 0.6760104763927232, + "grad_norm": 8.482748588789217, + "learning_rate": 3.855807680257347e-06, + "loss": 1.3203, + "step": 4775 + }, + { + "epoch": 0.6761520492673604, + "grad_norm": 8.542061351558479, + "learning_rate": 3.85532605945361e-06, + "loss": 1.2293, + "step": 4776 + }, + { + "epoch": 0.6762936221419976, + "grad_norm": 6.992166768474853, + "learning_rate": 3.854844367401395e-06, + "loss": 1.1647, + "step": 4777 + }, + { + "epoch": 0.6764351950166348, + "grad_norm": 7.472507696674221, + "learning_rate": 3.854362604126021e-06, + "loss": 1.1595, + "step": 4778 + }, + { + "epoch": 0.6765767678912721, + "grad_norm": 9.8817162188401, + "learning_rate": 3.853880769652815e-06, + "loss": 1.2575, + "step": 4779 + }, + { + "epoch": 0.6767183407659092, + "grad_norm": 9.18851304470186, + "learning_rate": 3.853398864007105e-06, + "loss": 1.364, + "step": 4780 + }, + { + "epoch": 0.6768599136405464, + "grad_norm": 8.503753730832546, + "learning_rate": 3.852916887214227e-06, + "loss": 1.3348, + "step": 4781 + }, + { + "epoch": 0.6770014865151837, + "grad_norm": 7.019435060380182, + "learning_rate": 3.852434839299517e-06, + "loss": 1.2191, + "step": 4782 + }, + { + "epoch": 0.6771430593898209, + "grad_norm": 10.773125547853985, + "learning_rate": 3.851952720288316e-06, + "loss": 1.2722, + "step": 4783 + }, + { + "epoch": 0.6772846322644581, + "grad_norm": 9.792468575180543, + "learning_rate": 3.851470530205969e-06, + "loss": 1.3847, + "step": 4784 + }, + { + "epoch": 0.6774262051390953, + "grad_norm": 9.84385889386694, + "learning_rate": 3.8509882690778234e-06, + "loss": 1.2809, + "step": 4785 + }, + { + "epoch": 0.6775677780137326, + "grad_norm": 9.130853734958151, + "learning_rate": 3.850505936929232e-06, + "loss": 1.1152, + "step": 4786 + }, + { + "epoch": 0.6777093508883698, + "grad_norm": 9.01465600159809, + "learning_rate": 3.8500235337855495e-06, + "loss": 1.4216, + "step": 4787 + }, + { + "epoch": 0.677850923763007, + "grad_norm": 8.528043374872746, + "learning_rate": 3.849541059672137e-06, + "loss": 1.2083, + "step": 4788 + }, + { + "epoch": 0.6779924966376443, + "grad_norm": 8.468393691334017, + "learning_rate": 3.8490585146143574e-06, + "loss": 1.2187, + "step": 4789 + }, + { + "epoch": 0.6781340695122815, + "grad_norm": 7.867391634795735, + "learning_rate": 3.848575898637579e-06, + "loss": 1.2716, + "step": 4790 + }, + { + "epoch": 0.6782756423869186, + "grad_norm": 8.042000192106672, + "learning_rate": 3.84809321176717e-06, + "loss": 1.383, + "step": 4791 + }, + { + "epoch": 0.6784172152615559, + "grad_norm": 9.972081312378254, + "learning_rate": 3.8476104540285054e-06, + "loss": 1.3433, + "step": 4792 + }, + { + "epoch": 0.6785587881361931, + "grad_norm": 10.570669051174251, + "learning_rate": 3.847127625446964e-06, + "loss": 1.2914, + 
"step": 4793 + }, + { + "epoch": 0.6787003610108303, + "grad_norm": 9.185658795897464, + "learning_rate": 3.846644726047928e-06, + "loss": 1.3038, + "step": 4794 + }, + { + "epoch": 0.6788419338854675, + "grad_norm": 9.106672720434691, + "learning_rate": 3.846161755856784e-06, + "loss": 1.3459, + "step": 4795 + }, + { + "epoch": 0.6789835067601048, + "grad_norm": 8.133883271099426, + "learning_rate": 3.84567871489892e-06, + "loss": 1.2426, + "step": 4796 + }, + { + "epoch": 0.679125079634742, + "grad_norm": 7.802122991144482, + "learning_rate": 3.845195603199728e-06, + "loss": 1.3181, + "step": 4797 + }, + { + "epoch": 0.6792666525093792, + "grad_norm": 8.620689310008075, + "learning_rate": 3.844712420784607e-06, + "loss": 1.2222, + "step": 4798 + }, + { + "epoch": 0.6794082253840165, + "grad_norm": 8.608577407473824, + "learning_rate": 3.844229167678957e-06, + "loss": 1.239, + "step": 4799 + }, + { + "epoch": 0.6795497982586537, + "grad_norm": 8.809920562489927, + "learning_rate": 3.843745843908181e-06, + "loss": 1.3082, + "step": 4800 + }, + { + "epoch": 0.6796913711332908, + "grad_norm": 8.164666170863365, + "learning_rate": 3.843262449497689e-06, + "loss": 1.2073, + "step": 4801 + }, + { + "epoch": 0.679832944007928, + "grad_norm": 8.885976531504966, + "learning_rate": 3.842778984472891e-06, + "loss": 1.276, + "step": 4802 + }, + { + "epoch": 0.6799745168825653, + "grad_norm": 9.771479885505165, + "learning_rate": 3.842295448859203e-06, + "loss": 1.3372, + "step": 4803 + }, + { + "epoch": 0.6801160897572025, + "grad_norm": 8.959611337270239, + "learning_rate": 3.841811842682044e-06, + "loss": 1.3028, + "step": 4804 + }, + { + "epoch": 0.6802576626318397, + "grad_norm": 8.911352362449506, + "learning_rate": 3.841328165966837e-06, + "loss": 1.2702, + "step": 4805 + }, + { + "epoch": 0.680399235506477, + "grad_norm": 8.827737823368171, + "learning_rate": 3.84084441873901e-06, + "loss": 1.3341, + "step": 4806 + }, + { + "epoch": 0.6805408083811142, + "grad_norm": 7.497514948815697, + "learning_rate": 3.840360601023989e-06, + "loss": 1.3327, + "step": 4807 + }, + { + "epoch": 0.6806823812557514, + "grad_norm": 9.542872412402595, + "learning_rate": 3.839876712847211e-06, + "loss": 1.3502, + "step": 4808 + }, + { + "epoch": 0.6808239541303887, + "grad_norm": 9.708653347869701, + "learning_rate": 3.839392754234115e-06, + "loss": 1.3405, + "step": 4809 + }, + { + "epoch": 0.6809655270050259, + "grad_norm": 7.378071549451005, + "learning_rate": 3.8389087252101395e-06, + "loss": 1.2281, + "step": 4810 + }, + { + "epoch": 0.681107099879663, + "grad_norm": 9.017813537578562, + "learning_rate": 3.838424625800732e-06, + "loss": 1.3734, + "step": 4811 + }, + { + "epoch": 0.6812486727543002, + "grad_norm": 8.087267308373692, + "learning_rate": 3.837940456031338e-06, + "loss": 1.2613, + "step": 4812 + }, + { + "epoch": 0.6813902456289375, + "grad_norm": 8.939736253231622, + "learning_rate": 3.837456215927413e-06, + "loss": 1.3696, + "step": 4813 + }, + { + "epoch": 0.6815318185035747, + "grad_norm": 9.082651608517153, + "learning_rate": 3.8369719055144115e-06, + "loss": 1.3684, + "step": 4814 + }, + { + "epoch": 0.6816733913782119, + "grad_norm": 8.346916826458289, + "learning_rate": 3.836487524817794e-06, + "loss": 1.357, + "step": 4815 + }, + { + "epoch": 0.6818149642528492, + "grad_norm": 9.876793420883551, + "learning_rate": 3.836003073863024e-06, + "loss": 1.4028, + "step": 4816 + }, + { + "epoch": 0.6819565371274864, + "grad_norm": 8.284023065333212, + "learning_rate": 3.8355185526755676e-06, + 
"loss": 1.217, + "step": 4817 + }, + { + "epoch": 0.6820981100021236, + "grad_norm": 8.66388065845582, + "learning_rate": 3.835033961280898e-06, + "loss": 1.2823, + "step": 4818 + }, + { + "epoch": 0.6822396828767608, + "grad_norm": 7.6570730584385585, + "learning_rate": 3.834549299704487e-06, + "loss": 1.2476, + "step": 4819 + }, + { + "epoch": 0.6823812557513981, + "grad_norm": 7.969892113313176, + "learning_rate": 3.8340645679718155e-06, + "loss": 1.2261, + "step": 4820 + }, + { + "epoch": 0.6825228286260353, + "grad_norm": 7.802978572915281, + "learning_rate": 3.833579766108365e-06, + "loss": 1.235, + "step": 4821 + }, + { + "epoch": 0.6826644015006724, + "grad_norm": 8.494741720726738, + "learning_rate": 3.83309489413962e-06, + "loss": 1.1466, + "step": 4822 + }, + { + "epoch": 0.6828059743753097, + "grad_norm": 8.76625425966628, + "learning_rate": 3.83260995209107e-06, + "loss": 1.2334, + "step": 4823 + }, + { + "epoch": 0.6829475472499469, + "grad_norm": 7.87206274083574, + "learning_rate": 3.832124939988208e-06, + "loss": 1.2439, + "step": 4824 + }, + { + "epoch": 0.6830891201245841, + "grad_norm": 10.990223441382067, + "learning_rate": 3.831639857856532e-06, + "loss": 1.2422, + "step": 4825 + }, + { + "epoch": 0.6832306929992213, + "grad_norm": 7.787539226990389, + "learning_rate": 3.831154705721542e-06, + "loss": 1.2779, + "step": 4826 + }, + { + "epoch": 0.6833722658738586, + "grad_norm": 10.78724039589768, + "learning_rate": 3.830669483608741e-06, + "loss": 1.4133, + "step": 4827 + }, + { + "epoch": 0.6835138387484958, + "grad_norm": 10.05320509480547, + "learning_rate": 3.830184191543638e-06, + "loss": 1.3761, + "step": 4828 + }, + { + "epoch": 0.683655411623133, + "grad_norm": 10.306256618000264, + "learning_rate": 3.829698829551743e-06, + "loss": 1.3405, + "step": 4829 + }, + { + "epoch": 0.6837969844977703, + "grad_norm": 8.523670224824661, + "learning_rate": 3.829213397658572e-06, + "loss": 1.3476, + "step": 4830 + }, + { + "epoch": 0.6839385573724075, + "grad_norm": 8.277314715764321, + "learning_rate": 3.828727895889644e-06, + "loss": 1.2608, + "step": 4831 + }, + { + "epoch": 0.6840801302470446, + "grad_norm": 8.800583161191694, + "learning_rate": 3.828242324270482e-06, + "loss": 1.3519, + "step": 4832 + }, + { + "epoch": 0.6842217031216818, + "grad_norm": 8.351556105294557, + "learning_rate": 3.82775668282661e-06, + "loss": 1.2853, + "step": 4833 + }, + { + "epoch": 0.6843632759963191, + "grad_norm": 8.904091878055368, + "learning_rate": 3.827270971583561e-06, + "loss": 1.2004, + "step": 4834 + }, + { + "epoch": 0.6845048488709563, + "grad_norm": 9.734357757308164, + "learning_rate": 3.826785190566865e-06, + "loss": 1.1402, + "step": 4835 + }, + { + "epoch": 0.6846464217455935, + "grad_norm": 9.730894884160554, + "learning_rate": 3.826299339802062e-06, + "loss": 1.2034, + "step": 4836 + }, + { + "epoch": 0.6847879946202308, + "grad_norm": 8.717583472309267, + "learning_rate": 3.825813419314691e-06, + "loss": 1.2383, + "step": 4837 + }, + { + "epoch": 0.684929567494868, + "grad_norm": 10.936473514890187, + "learning_rate": 3.825327429130297e-06, + "loss": 1.3807, + "step": 4838 + }, + { + "epoch": 0.6850711403695052, + "grad_norm": 9.229014743447191, + "learning_rate": 3.824841369274429e-06, + "loss": 1.1613, + "step": 4839 + }, + { + "epoch": 0.6852127132441425, + "grad_norm": 9.637919684745738, + "learning_rate": 3.824355239772637e-06, + "loss": 1.3447, + "step": 4840 + }, + { + "epoch": 0.6853542861187797, + "grad_norm": 8.461368511067114, + "learning_rate": 
3.823869040650478e-06, + "loss": 1.3274, + "step": 4841 + }, + { + "epoch": 0.6854958589934168, + "grad_norm": 8.53732899261385, + "learning_rate": 3.823382771933512e-06, + "loss": 1.2686, + "step": 4842 + }, + { + "epoch": 0.685637431868054, + "grad_norm": 9.630034480048037, + "learning_rate": 3.822896433647299e-06, + "loss": 1.1866, + "step": 4843 + }, + { + "epoch": 0.6857790047426913, + "grad_norm": 10.554048847836528, + "learning_rate": 3.8224100258174066e-06, + "loss": 1.1205, + "step": 4844 + }, + { + "epoch": 0.6859205776173285, + "grad_norm": 12.381863626893974, + "learning_rate": 3.821923548469405e-06, + "loss": 1.3488, + "step": 4845 + }, + { + "epoch": 0.6860621504919657, + "grad_norm": 9.246195242630547, + "learning_rate": 3.82143700162887e-06, + "loss": 1.3653, + "step": 4846 + }, + { + "epoch": 0.686203723366603, + "grad_norm": 7.1759803341497195, + "learning_rate": 3.820950385321375e-06, + "loss": 1.2145, + "step": 4847 + }, + { + "epoch": 0.6863452962412402, + "grad_norm": 7.657642969583354, + "learning_rate": 3.820463699572505e-06, + "loss": 1.1532, + "step": 4848 + }, + { + "epoch": 0.6864868691158774, + "grad_norm": 8.965665386503888, + "learning_rate": 3.819976944407841e-06, + "loss": 1.2173, + "step": 4849 + }, + { + "epoch": 0.6866284419905146, + "grad_norm": 11.30741607911509, + "learning_rate": 3.819490119852975e-06, + "loss": 1.2635, + "step": 4850 + }, + { + "epoch": 0.6867700148651519, + "grad_norm": 10.939592782484274, + "learning_rate": 3.819003225933497e-06, + "loss": 1.3329, + "step": 4851 + }, + { + "epoch": 0.6869115877397891, + "grad_norm": 8.384501519032378, + "learning_rate": 3.818516262675001e-06, + "loss": 1.2366, + "step": 4852 + }, + { + "epoch": 0.6870531606144262, + "grad_norm": 7.4198756044556236, + "learning_rate": 3.81802923010309e-06, + "loss": 1.1622, + "step": 4853 + }, + { + "epoch": 0.6871947334890635, + "grad_norm": 7.859488818214743, + "learning_rate": 3.817542128243365e-06, + "loss": 1.2607, + "step": 4854 + }, + { + "epoch": 0.6873363063637007, + "grad_norm": 9.076631459272814, + "learning_rate": 3.817054957121432e-06, + "loss": 1.3061, + "step": 4855 + }, + { + "epoch": 0.6874778792383379, + "grad_norm": 8.62395083915556, + "learning_rate": 3.8165677167629025e-06, + "loss": 1.0993, + "step": 4856 + }, + { + "epoch": 0.6876194521129751, + "grad_norm": 8.468221162281976, + "learning_rate": 3.81608040719339e-06, + "loss": 1.259, + "step": 4857 + }, + { + "epoch": 0.6877610249876124, + "grad_norm": 9.043172918861481, + "learning_rate": 3.8155930284385116e-06, + "loss": 1.5566, + "step": 4858 + }, + { + "epoch": 0.6879025978622496, + "grad_norm": 8.081348713023363, + "learning_rate": 3.815105580523888e-06, + "loss": 1.3152, + "step": 4859 + }, + { + "epoch": 0.6880441707368868, + "grad_norm": 10.90814035204633, + "learning_rate": 3.814618063475145e-06, + "loss": 1.2363, + "step": 4860 + }, + { + "epoch": 0.6881857436115241, + "grad_norm": 8.673167749222445, + "learning_rate": 3.814130477317911e-06, + "loss": 1.3721, + "step": 4861 + }, + { + "epoch": 0.6883273164861613, + "grad_norm": 7.016159479049006, + "learning_rate": 3.8136428220778177e-06, + "loss": 1.183, + "step": 4862 + }, + { + "epoch": 0.6884688893607984, + "grad_norm": 9.97570443402761, + "learning_rate": 3.8131550977805005e-06, + "loss": 1.2668, + "step": 4863 + }, + { + "epoch": 0.6886104622354356, + "grad_norm": 9.893252730268289, + "learning_rate": 3.8126673044515993e-06, + "loss": 1.398, + "step": 4864 + }, + { + "epoch": 0.6887520351100729, + "grad_norm": 
8.145822458349386, + "learning_rate": 3.812179442116756e-06, + "loss": 1.2755, + "step": 4865 + }, + { + "epoch": 0.6888936079847101, + "grad_norm": 9.068607771406871, + "learning_rate": 3.811691510801618e-06, + "loss": 1.2065, + "step": 4866 + }, + { + "epoch": 0.6890351808593473, + "grad_norm": 9.543888904932176, + "learning_rate": 3.8112035105318353e-06, + "loss": 1.4804, + "step": 4867 + }, + { + "epoch": 0.6891767537339846, + "grad_norm": 9.212144250095648, + "learning_rate": 3.8107154413330616e-06, + "loss": 1.3663, + "step": 4868 + }, + { + "epoch": 0.6893183266086218, + "grad_norm": 8.87680766004302, + "learning_rate": 3.8102273032309554e-06, + "loss": 1.324, + "step": 4869 + }, + { + "epoch": 0.689459899483259, + "grad_norm": 10.407569961917481, + "learning_rate": 3.809739096251176e-06, + "loss": 1.2862, + "step": 4870 + }, + { + "epoch": 0.6896014723578963, + "grad_norm": 8.306561406643771, + "learning_rate": 3.809250820419389e-06, + "loss": 1.2067, + "step": 4871 + }, + { + "epoch": 0.6897430452325335, + "grad_norm": 13.55818409961679, + "learning_rate": 3.808762475761263e-06, + "loss": 1.3582, + "step": 4872 + }, + { + "epoch": 0.6898846181071706, + "grad_norm": 9.277232318409736, + "learning_rate": 3.808274062302469e-06, + "loss": 1.1801, + "step": 4873 + }, + { + "epoch": 0.6900261909818078, + "grad_norm": 8.73827120934269, + "learning_rate": 3.807785580068683e-06, + "loss": 1.2244, + "step": 4874 + }, + { + "epoch": 0.6901677638564451, + "grad_norm": 8.894662915554903, + "learning_rate": 3.8072970290855843e-06, + "loss": 1.233, + "step": 4875 + }, + { + "epoch": 0.6903093367310823, + "grad_norm": 10.834295455072864, + "learning_rate": 3.8068084093788554e-06, + "loss": 1.3662, + "step": 4876 + }, + { + "epoch": 0.6904509096057195, + "grad_norm": 10.548957042757422, + "learning_rate": 3.806319720974183e-06, + "loss": 1.3206, + "step": 4877 + }, + { + "epoch": 0.6905924824803568, + "grad_norm": 9.53101586226249, + "learning_rate": 3.8058309638972567e-06, + "loss": 1.4186, + "step": 4878 + }, + { + "epoch": 0.690734055354994, + "grad_norm": 7.531737252456372, + "learning_rate": 3.805342138173771e-06, + "loss": 1.3403, + "step": 4879 + }, + { + "epoch": 0.6908756282296312, + "grad_norm": 6.571754542256723, + "learning_rate": 3.8048532438294215e-06, + "loss": 1.3278, + "step": 4880 + }, + { + "epoch": 0.6910172011042685, + "grad_norm": 9.468575554679171, + "learning_rate": 3.8043642808899106e-06, + "loss": 1.2402, + "step": 4881 + }, + { + "epoch": 0.6911587739789057, + "grad_norm": 9.293587342142708, + "learning_rate": 3.8038752493809416e-06, + "loss": 1.3141, + "step": 4882 + }, + { + "epoch": 0.6913003468535429, + "grad_norm": 8.455438829580713, + "learning_rate": 3.803386149328223e-06, + "loss": 1.2819, + "step": 4883 + }, + { + "epoch": 0.69144191972818, + "grad_norm": 8.30907168715291, + "learning_rate": 3.8028969807574665e-06, + "loss": 1.297, + "step": 4884 + }, + { + "epoch": 0.6915834926028173, + "grad_norm": 8.69552614784137, + "learning_rate": 3.8024077436943875e-06, + "loss": 1.324, + "step": 4885 + }, + { + "epoch": 0.6917250654774545, + "grad_norm": 9.752911646451848, + "learning_rate": 3.8019184381647044e-06, + "loss": 1.3015, + "step": 4886 + }, + { + "epoch": 0.6918666383520917, + "grad_norm": 9.233345090489069, + "learning_rate": 3.8014290641941392e-06, + "loss": 1.379, + "step": 4887 + }, + { + "epoch": 0.692008211226729, + "grad_norm": 9.109040399756417, + "learning_rate": 3.800939621808419e-06, + "loss": 1.2431, + "step": 4888 + }, + { + "epoch": 
0.6921497841013662, + "grad_norm": 11.482537321083617, + "learning_rate": 3.8004501110332726e-06, + "loss": 1.3521, + "step": 4889 + }, + { + "epoch": 0.6922913569760034, + "grad_norm": 8.180537634158695, + "learning_rate": 3.799960531894434e-06, + "loss": 1.3423, + "step": 4890 + }, + { + "epoch": 0.6924329298506406, + "grad_norm": 8.634630659075832, + "learning_rate": 3.7994708844176385e-06, + "loss": 1.3508, + "step": 4891 + }, + { + "epoch": 0.6925745027252779, + "grad_norm": 8.636722725151731, + "learning_rate": 3.7989811686286283e-06, + "loss": 1.3805, + "step": 4892 + }, + { + "epoch": 0.6927160755999151, + "grad_norm": 10.306962809714598, + "learning_rate": 3.7984913845531466e-06, + "loss": 1.5042, + "step": 4893 + }, + { + "epoch": 0.6928576484745522, + "grad_norm": 6.41779073142109, + "learning_rate": 3.798001532216941e-06, + "loss": 1.0354, + "step": 4894 + }, + { + "epoch": 0.6929992213491895, + "grad_norm": 10.878149519237665, + "learning_rate": 3.7975116116457626e-06, + "loss": 1.2593, + "step": 4895 + }, + { + "epoch": 0.6931407942238267, + "grad_norm": 11.194988451019368, + "learning_rate": 3.7970216228653667e-06, + "loss": 1.2403, + "step": 4896 + }, + { + "epoch": 0.6932823670984639, + "grad_norm": 10.147827036104207, + "learning_rate": 3.7965315659015108e-06, + "loss": 1.2987, + "step": 4897 + }, + { + "epoch": 0.6934239399731011, + "grad_norm": 9.562818733837345, + "learning_rate": 3.7960414407799565e-06, + "loss": 1.2448, + "step": 4898 + }, + { + "epoch": 0.6935655128477384, + "grad_norm": 7.588697471581375, + "learning_rate": 3.795551247526471e-06, + "loss": 1.2787, + "step": 4899 + }, + { + "epoch": 0.6937070857223756, + "grad_norm": 9.40032265292016, + "learning_rate": 3.795060986166822e-06, + "loss": 1.1789, + "step": 4900 + }, + { + "epoch": 0.6938486585970128, + "grad_norm": 7.7511015078478325, + "learning_rate": 3.794570656726784e-06, + "loss": 1.3604, + "step": 4901 + }, + { + "epoch": 0.6939902314716501, + "grad_norm": 8.439650424027358, + "learning_rate": 3.79408025923213e-06, + "loss": 1.2041, + "step": 4902 + }, + { + "epoch": 0.6941318043462873, + "grad_norm": 7.807440500365535, + "learning_rate": 3.793589793708642e-06, + "loss": 1.2344, + "step": 4903 + }, + { + "epoch": 0.6942733772209245, + "grad_norm": 10.730722575037614, + "learning_rate": 3.7930992601821028e-06, + "loss": 1.315, + "step": 4904 + }, + { + "epoch": 0.6944149500955616, + "grad_norm": 10.432444187419975, + "learning_rate": 3.7926086586783008e-06, + "loss": 1.4356, + "step": 4905 + }, + { + "epoch": 0.6945565229701989, + "grad_norm": 8.661101045807259, + "learning_rate": 3.7921179892230246e-06, + "loss": 1.2717, + "step": 4906 + }, + { + "epoch": 0.6946980958448361, + "grad_norm": 8.00855798740308, + "learning_rate": 3.7916272518420694e-06, + "loss": 1.2995, + "step": 4907 + }, + { + "epoch": 0.6948396687194733, + "grad_norm": 7.93278182968453, + "learning_rate": 3.791136446561233e-06, + "loss": 1.3208, + "step": 4908 + }, + { + "epoch": 0.6949812415941106, + "grad_norm": 9.795611773124005, + "learning_rate": 3.7906455734063156e-06, + "loss": 1.1817, + "step": 4909 + }, + { + "epoch": 0.6951228144687478, + "grad_norm": 9.790631654836352, + "learning_rate": 3.7901546324031236e-06, + "loss": 1.3801, + "step": 4910 + }, + { + "epoch": 0.695264387343385, + "grad_norm": 9.033039483315065, + "learning_rate": 3.7896636235774636e-06, + "loss": 1.3694, + "step": 4911 + }, + { + "epoch": 0.6954059602180223, + "grad_norm": 10.933521179198245, + "learning_rate": 3.789172546955149e-06, + "loss": 
1.2528, + "step": 4912 + }, + { + "epoch": 0.6955475330926595, + "grad_norm": 9.977625802895078, + "learning_rate": 3.7886814025619944e-06, + "loss": 1.2312, + "step": 4913 + }, + { + "epoch": 0.6956891059672967, + "grad_norm": 9.23632420140274, + "learning_rate": 3.7881901904238203e-06, + "loss": 1.2931, + "step": 4914 + }, + { + "epoch": 0.6958306788419338, + "grad_norm": 9.593054134219733, + "learning_rate": 3.7876989105664476e-06, + "loss": 1.1769, + "step": 4915 + }, + { + "epoch": 0.6959722517165711, + "grad_norm": 10.587787401786535, + "learning_rate": 3.7872075630157035e-06, + "loss": 1.2248, + "step": 4916 + }, + { + "epoch": 0.6961138245912083, + "grad_norm": 10.627061352198849, + "learning_rate": 3.786716147797418e-06, + "loss": 1.2254, + "step": 4917 + }, + { + "epoch": 0.6962553974658455, + "grad_norm": 9.291088502512686, + "learning_rate": 3.786224664937424e-06, + "loss": 1.3238, + "step": 4918 + }, + { + "epoch": 0.6963969703404828, + "grad_norm": 9.503380023578906, + "learning_rate": 3.7857331144615576e-06, + "loss": 1.2969, + "step": 4919 + }, + { + "epoch": 0.69653854321512, + "grad_norm": 9.650350503040963, + "learning_rate": 3.785241496395661e-06, + "loss": 1.3473, + "step": 4920 + }, + { + "epoch": 0.6966801160897572, + "grad_norm": 11.706166420967653, + "learning_rate": 3.7847498107655768e-06, + "loss": 1.2749, + "step": 4921 + }, + { + "epoch": 0.6968216889643944, + "grad_norm": 10.62040305829329, + "learning_rate": 3.7842580575971533e-06, + "loss": 1.2057, + "step": 4922 + }, + { + "epoch": 0.6969632618390317, + "grad_norm": 9.882318030490707, + "learning_rate": 3.783766236916241e-06, + "loss": 1.3365, + "step": 4923 + }, + { + "epoch": 0.6971048347136689, + "grad_norm": 8.968806994320214, + "learning_rate": 3.7832743487486945e-06, + "loss": 1.2977, + "step": 4924 + }, + { + "epoch": 0.697246407588306, + "grad_norm": 8.612794948006497, + "learning_rate": 3.782782393120373e-06, + "loss": 1.2234, + "step": 4925 + }, + { + "epoch": 0.6973879804629433, + "grad_norm": 7.715316193999354, + "learning_rate": 3.7822903700571372e-06, + "loss": 1.3272, + "step": 4926 + }, + { + "epoch": 0.6975295533375805, + "grad_norm": 9.49020000733384, + "learning_rate": 3.781798279584853e-06, + "loss": 1.3736, + "step": 4927 + }, + { + "epoch": 0.6976711262122177, + "grad_norm": 8.702009420619065, + "learning_rate": 3.7813061217293887e-06, + "loss": 1.4032, + "step": 4928 + }, + { + "epoch": 0.697812699086855, + "grad_norm": 8.669712216222022, + "learning_rate": 3.7808138965166167e-06, + "loss": 1.3451, + "step": 4929 + }, + { + "epoch": 0.6979542719614922, + "grad_norm": 8.419875889676677, + "learning_rate": 3.780321603972414e-06, + "loss": 1.2725, + "step": 4930 + }, + { + "epoch": 0.6980958448361294, + "grad_norm": 8.896024489005432, + "learning_rate": 3.7798292441226584e-06, + "loss": 1.2032, + "step": 4931 + }, + { + "epoch": 0.6982374177107666, + "grad_norm": 9.903564859374029, + "learning_rate": 3.7793368169932343e-06, + "loss": 1.3223, + "step": 4932 + }, + { + "epoch": 0.6983789905854039, + "grad_norm": 9.68202300756629, + "learning_rate": 3.7788443226100274e-06, + "loss": 1.324, + "step": 4933 + }, + { + "epoch": 0.6985205634600411, + "grad_norm": 8.713065533228137, + "learning_rate": 3.7783517609989284e-06, + "loss": 1.1389, + "step": 4934 + }, + { + "epoch": 0.6986621363346783, + "grad_norm": 8.591505388681894, + "learning_rate": 3.77785913218583e-06, + "loss": 1.3227, + "step": 4935 + }, + { + "epoch": 0.6988037092093154, + "grad_norm": 8.368398313418838, + "learning_rate": 
3.77736643619663e-06, + "loss": 1.3473, + "step": 4936 + }, + { + "epoch": 0.6989452820839527, + "grad_norm": 7.110504664284424, + "learning_rate": 3.776873673057229e-06, + "loss": 1.2929, + "step": 4937 + }, + { + "epoch": 0.6990868549585899, + "grad_norm": 8.805998872921933, + "learning_rate": 3.776380842793531e-06, + "loss": 1.2378, + "step": 4938 + }, + { + "epoch": 0.6992284278332271, + "grad_norm": 7.941074557678906, + "learning_rate": 3.775887945431444e-06, + "loss": 1.295, + "step": 4939 + }, + { + "epoch": 0.6993700007078644, + "grad_norm": 9.061558325941162, + "learning_rate": 3.775394980996879e-06, + "loss": 1.3317, + "step": 4940 + }, + { + "epoch": 0.6995115735825016, + "grad_norm": 8.211162357173276, + "learning_rate": 3.77490194951575e-06, + "loss": 1.2875, + "step": 4941 + }, + { + "epoch": 0.6996531464571388, + "grad_norm": 10.379192929800633, + "learning_rate": 3.7744088510139763e-06, + "loss": 1.1362, + "step": 4942 + }, + { + "epoch": 0.6997947193317761, + "grad_norm": 9.463786520735693, + "learning_rate": 3.773915685517481e-06, + "loss": 1.1189, + "step": 4943 + }, + { + "epoch": 0.6999362922064133, + "grad_norm": 9.401063217744227, + "learning_rate": 3.7734224530521867e-06, + "loss": 1.2399, + "step": 4944 + }, + { + "epoch": 0.7000778650810505, + "grad_norm": 7.873647180041247, + "learning_rate": 3.772929153644024e-06, + "loss": 1.2202, + "step": 4945 + }, + { + "epoch": 0.7002194379556876, + "grad_norm": 8.531410523186462, + "learning_rate": 3.772435787318925e-06, + "loss": 1.269, + "step": 4946 + }, + { + "epoch": 0.7003610108303249, + "grad_norm": 10.453007125760099, + "learning_rate": 3.771942354102825e-06, + "loss": 1.3982, + "step": 4947 + }, + { + "epoch": 0.7005025837049621, + "grad_norm": 8.377815798069728, + "learning_rate": 3.7714488540216637e-06, + "loss": 1.3188, + "step": 4948 + }, + { + "epoch": 0.7006441565795993, + "grad_norm": 9.427031312802487, + "learning_rate": 3.7709552871013844e-06, + "loss": 1.2008, + "step": 4949 + }, + { + "epoch": 0.7007857294542366, + "grad_norm": 11.28356470258672, + "learning_rate": 3.770461653367934e-06, + "loss": 1.3115, + "step": 4950 + }, + { + "epoch": 0.7009273023288738, + "grad_norm": 14.18976914328925, + "learning_rate": 3.769967952847261e-06, + "loss": 1.3739, + "step": 4951 + }, + { + "epoch": 0.701068875203511, + "grad_norm": 7.797662179232041, + "learning_rate": 3.7694741855653195e-06, + "loss": 1.1321, + "step": 4952 + }, + { + "epoch": 0.7012104480781483, + "grad_norm": 11.12324767295709, + "learning_rate": 3.7689803515480674e-06, + "loss": 1.4091, + "step": 4953 + }, + { + "epoch": 0.7013520209527855, + "grad_norm": 9.021052850485358, + "learning_rate": 3.7684864508214638e-06, + "loss": 1.1714, + "step": 4954 + }, + { + "epoch": 0.7014935938274227, + "grad_norm": 9.991549927127876, + "learning_rate": 3.7679924834114735e-06, + "loss": 1.3709, + "step": 4955 + }, + { + "epoch": 0.7016351667020598, + "grad_norm": 8.742622535115187, + "learning_rate": 3.7674984493440632e-06, + "loss": 1.122, + "step": 4956 + }, + { + "epoch": 0.7017767395766971, + "grad_norm": 8.72004673656573, + "learning_rate": 3.7670043486452047e-06, + "loss": 1.3288, + "step": 4957 + }, + { + "epoch": 0.7019183124513343, + "grad_norm": 9.946118725913186, + "learning_rate": 3.7665101813408726e-06, + "loss": 1.2583, + "step": 4958 + }, + { + "epoch": 0.7020598853259715, + "grad_norm": 9.257154017984972, + "learning_rate": 3.766015947457046e-06, + "loss": 1.3945, + "step": 4959 + }, + { + "epoch": 0.7022014582006088, + "grad_norm": 
8.272610746433783, + "learning_rate": 3.7655216470197033e-06, + "loss": 1.1771, + "step": 4960 + }, + { + "epoch": 0.702343031075246, + "grad_norm": 9.948398587735074, + "learning_rate": 3.7650272800548316e-06, + "loss": 1.1787, + "step": 4961 + }, + { + "epoch": 0.7024846039498832, + "grad_norm": 8.159066017404639, + "learning_rate": 3.764532846588419e-06, + "loss": 1.3023, + "step": 4962 + }, + { + "epoch": 0.7026261768245204, + "grad_norm": 9.168489263424764, + "learning_rate": 3.764038346646457e-06, + "loss": 1.3405, + "step": 4963 + }, + { + "epoch": 0.7027677496991577, + "grad_norm": 8.404070603580204, + "learning_rate": 3.7635437802549426e-06, + "loss": 1.2132, + "step": 4964 + }, + { + "epoch": 0.7029093225737949, + "grad_norm": 11.718537758494685, + "learning_rate": 3.7630491474398734e-06, + "loss": 1.2017, + "step": 4965 + }, + { + "epoch": 0.7030508954484321, + "grad_norm": 12.474050212779227, + "learning_rate": 3.7625544482272523e-06, + "loss": 1.2649, + "step": 4966 + }, + { + "epoch": 0.7031924683230693, + "grad_norm": 6.741970231145808, + "learning_rate": 3.762059682643085e-06, + "loss": 1.1768, + "step": 4967 + }, + { + "epoch": 0.7033340411977065, + "grad_norm": 11.35848773077972, + "learning_rate": 3.7615648507133816e-06, + "loss": 1.3184, + "step": 4968 + }, + { + "epoch": 0.7034756140723437, + "grad_norm": 8.199391356419246, + "learning_rate": 3.7610699524641547e-06, + "loss": 1.3233, + "step": 4969 + }, + { + "epoch": 0.7036171869469809, + "grad_norm": 10.094598031810287, + "learning_rate": 3.7605749879214203e-06, + "loss": 1.2596, + "step": 4970 + }, + { + "epoch": 0.7037587598216182, + "grad_norm": 8.296501194351212, + "learning_rate": 3.760079957111199e-06, + "loss": 1.3658, + "step": 4971 + }, + { + "epoch": 0.7039003326962554, + "grad_norm": 7.5787169549729025, + "learning_rate": 3.7595848600595135e-06, + "loss": 1.2563, + "step": 4972 + }, + { + "epoch": 0.7040419055708926, + "grad_norm": 10.078567471512608, + "learning_rate": 3.7590896967923917e-06, + "loss": 1.2453, + "step": 4973 + }, + { + "epoch": 0.7041834784455299, + "grad_norm": 8.939170414540452, + "learning_rate": 3.7585944673358632e-06, + "loss": 1.231, + "step": 4974 + }, + { + "epoch": 0.7043250513201671, + "grad_norm": 8.413405992261696, + "learning_rate": 3.758099171715962e-06, + "loss": 1.2109, + "step": 4975 + }, + { + "epoch": 0.7044666241948043, + "grad_norm": 8.775853355190959, + "learning_rate": 3.7576038099587252e-06, + "loss": 1.1238, + "step": 4976 + }, + { + "epoch": 0.7046081970694414, + "grad_norm": 10.32144572340954, + "learning_rate": 3.7571083820901943e-06, + "loss": 1.4115, + "step": 4977 + }, + { + "epoch": 0.7047497699440787, + "grad_norm": 9.182614227589786, + "learning_rate": 3.7566128881364116e-06, + "loss": 1.3227, + "step": 4978 + }, + { + "epoch": 0.7048913428187159, + "grad_norm": 9.414778483535498, + "learning_rate": 3.7561173281234276e-06, + "loss": 1.2265, + "step": 4979 + }, + { + "epoch": 0.7050329156933531, + "grad_norm": 8.004611593493797, + "learning_rate": 3.755621702077293e-06, + "loss": 1.3549, + "step": 4980 + }, + { + "epoch": 0.7051744885679904, + "grad_norm": 9.88227942917567, + "learning_rate": 3.7551260100240604e-06, + "loss": 1.4022, + "step": 4981 + }, + { + "epoch": 0.7053160614426276, + "grad_norm": 9.64821964898289, + "learning_rate": 3.7546302519897904e-06, + "loss": 1.2917, + "step": 4982 + }, + { + "epoch": 0.7054576343172648, + "grad_norm": 9.18839762967279, + "learning_rate": 3.7541344280005427e-06, + "loss": 1.1958, + "step": 4983 + }, + { + 
"epoch": 0.7055992071919021, + "grad_norm": 9.064649024561103, + "learning_rate": 3.7536385380823835e-06, + "loss": 1.3827, + "step": 4984 + }, + { + "epoch": 0.7057407800665393, + "grad_norm": 8.231336811276988, + "learning_rate": 3.753142582261381e-06, + "loss": 1.4214, + "step": 4985 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 10.374437431801649, + "learning_rate": 3.7526465605636075e-06, + "loss": 1.26, + "step": 4986 + }, + { + "epoch": 0.7060239258158136, + "grad_norm": 9.600248142850104, + "learning_rate": 3.7521504730151382e-06, + "loss": 1.1791, + "step": 4987 + }, + { + "epoch": 0.7061654986904509, + "grad_norm": 6.98349887891427, + "learning_rate": 3.751654319642052e-06, + "loss": 1.2551, + "step": 4988 + }, + { + "epoch": 0.7063070715650881, + "grad_norm": 10.001691675149065, + "learning_rate": 3.7511581004704317e-06, + "loss": 1.2154, + "step": 4989 + }, + { + "epoch": 0.7064486444397253, + "grad_norm": 9.385124741112929, + "learning_rate": 3.750661815526363e-06, + "loss": 1.2442, + "step": 4990 + }, + { + "epoch": 0.7065902173143626, + "grad_norm": 10.464403029983401, + "learning_rate": 3.7501654648359353e-06, + "loss": 1.3223, + "step": 4991 + }, + { + "epoch": 0.7067317901889998, + "grad_norm": 8.444390090410522, + "learning_rate": 3.7496690484252413e-06, + "loss": 1.0963, + "step": 4992 + }, + { + "epoch": 0.706873363063637, + "grad_norm": 9.482831197718014, + "learning_rate": 3.7491725663203765e-06, + "loss": 1.2321, + "step": 4993 + }, + { + "epoch": 0.7070149359382742, + "grad_norm": 8.805193963500367, + "learning_rate": 3.748676018547442e-06, + "loss": 1.3718, + "step": 4994 + }, + { + "epoch": 0.7071565088129115, + "grad_norm": 11.822903688846585, + "learning_rate": 3.7481794051325404e-06, + "loss": 1.3167, + "step": 4995 + }, + { + "epoch": 0.7072980816875487, + "grad_norm": 9.544257821373009, + "learning_rate": 3.7476827261017777e-06, + "loss": 1.3914, + "step": 4996 + }, + { + "epoch": 0.7074396545621859, + "grad_norm": 9.001110432256226, + "learning_rate": 3.747185981481265e-06, + "loss": 1.2617, + "step": 4997 + }, + { + "epoch": 0.7075812274368231, + "grad_norm": 8.184641098701322, + "learning_rate": 3.7466891712971144e-06, + "loss": 1.1824, + "step": 4998 + }, + { + "epoch": 0.7077228003114603, + "grad_norm": 12.008651158090935, + "learning_rate": 3.7461922955754445e-06, + "loss": 1.286, + "step": 4999 + }, + { + "epoch": 0.7078643731860975, + "grad_norm": 9.794927522119508, + "learning_rate": 3.745695354342374e-06, + "loss": 1.2449, + "step": 5000 + }, + { + "epoch": 0.7080059460607347, + "grad_norm": 8.958425074854249, + "learning_rate": 3.745198347624027e-06, + "loss": 1.2338, + "step": 5001 + }, + { + "epoch": 0.708147518935372, + "grad_norm": 11.579249771403788, + "learning_rate": 3.744701275446533e-06, + "loss": 1.4563, + "step": 5002 + }, + { + "epoch": 0.7082890918100092, + "grad_norm": 10.495944738661272, + "learning_rate": 3.7442041378360204e-06, + "loss": 1.3621, + "step": 5003 + }, + { + "epoch": 0.7084306646846464, + "grad_norm": 8.77236217182866, + "learning_rate": 3.743706934818624e-06, + "loss": 1.3206, + "step": 5004 + }, + { + "epoch": 0.7085722375592837, + "grad_norm": 9.502509337844415, + "learning_rate": 3.743209666420481e-06, + "loss": 1.1139, + "step": 5005 + }, + { + "epoch": 0.7087138104339209, + "grad_norm": 10.7020432419806, + "learning_rate": 3.7427123326677326e-06, + "loss": 1.2896, + "step": 5006 + }, + { + "epoch": 0.7088553833085581, + "grad_norm": 10.623382086145739, + "learning_rate": 3.7422149335865244e-06, + 
"loss": 1.2087, + "step": 5007 + }, + { + "epoch": 0.7089969561831952, + "grad_norm": 9.880895086353236, + "learning_rate": 3.7417174692030027e-06, + "loss": 1.2494, + "step": 5008 + }, + { + "epoch": 0.7091385290578325, + "grad_norm": 9.604660142866114, + "learning_rate": 3.74121993954332e-06, + "loss": 1.3388, + "step": 5009 + }, + { + "epoch": 0.7092801019324697, + "grad_norm": 9.17151439679219, + "learning_rate": 3.74072234463363e-06, + "loss": 1.2659, + "step": 5010 + }, + { + "epoch": 0.7094216748071069, + "grad_norm": 11.273154611402175, + "learning_rate": 3.7402246845000916e-06, + "loss": 1.4765, + "step": 5011 + }, + { + "epoch": 0.7095632476817442, + "grad_norm": 9.971095846514118, + "learning_rate": 3.7397269591688666e-06, + "loss": 1.1789, + "step": 5012 + }, + { + "epoch": 0.7097048205563814, + "grad_norm": 9.702325318043528, + "learning_rate": 3.73922916866612e-06, + "loss": 1.2988, + "step": 5013 + }, + { + "epoch": 0.7098463934310186, + "grad_norm": 10.191715421479442, + "learning_rate": 3.7387313130180192e-06, + "loss": 1.405, + "step": 5014 + }, + { + "epoch": 0.7099879663056559, + "grad_norm": 12.596437683312306, + "learning_rate": 3.7382333922507375e-06, + "loss": 1.2271, + "step": 5015 + }, + { + "epoch": 0.7101295391802931, + "grad_norm": 12.016182478533766, + "learning_rate": 3.7377354063904484e-06, + "loss": 1.2301, + "step": 5016 + }, + { + "epoch": 0.7102711120549303, + "grad_norm": 9.42498596352258, + "learning_rate": 3.7372373554633334e-06, + "loss": 1.3243, + "step": 5017 + }, + { + "epoch": 0.7104126849295674, + "grad_norm": 7.656363848404175, + "learning_rate": 3.7367392394955726e-06, + "loss": 1.2675, + "step": 5018 + }, + { + "epoch": 0.7105542578042047, + "grad_norm": 8.849569070960417, + "learning_rate": 3.7362410585133523e-06, + "loss": 1.2072, + "step": 5019 + }, + { + "epoch": 0.7106958306788419, + "grad_norm": 7.657419294479364, + "learning_rate": 3.7357428125428612e-06, + "loss": 1.1971, + "step": 5020 + }, + { + "epoch": 0.7108374035534791, + "grad_norm": 9.371855348246527, + "learning_rate": 3.7352445016102917e-06, + "loss": 1.3024, + "step": 5021 + }, + { + "epoch": 0.7109789764281164, + "grad_norm": 8.012282479557655, + "learning_rate": 3.7347461257418403e-06, + "loss": 1.2458, + "step": 5022 + }, + { + "epoch": 0.7111205493027536, + "grad_norm": 9.751261702844575, + "learning_rate": 3.7342476849637053e-06, + "loss": 1.2864, + "step": 5023 + }, + { + "epoch": 0.7112621221773908, + "grad_norm": 10.964090863001292, + "learning_rate": 3.7337491793020898e-06, + "loss": 1.4105, + "step": 5024 + }, + { + "epoch": 0.711403695052028, + "grad_norm": 9.97001386421487, + "learning_rate": 3.7332506087832e-06, + "loss": 1.1974, + "step": 5025 + }, + { + "epoch": 0.7115452679266653, + "grad_norm": 8.936195545102652, + "learning_rate": 3.7327519734332453e-06, + "loss": 1.3504, + "step": 5026 + }, + { + "epoch": 0.7116868408013025, + "grad_norm": 9.123390787445707, + "learning_rate": 3.732253273278438e-06, + "loss": 1.3465, + "step": 5027 + }, + { + "epoch": 0.7118284136759397, + "grad_norm": 7.1676284085584285, + "learning_rate": 3.731754508344996e-06, + "loss": 1.1825, + "step": 5028 + }, + { + "epoch": 0.7119699865505769, + "grad_norm": 8.013093247374155, + "learning_rate": 3.731255678659137e-06, + "loss": 1.2602, + "step": 5029 + }, + { + "epoch": 0.7121115594252141, + "grad_norm": 9.722948547358968, + "learning_rate": 3.730756784247085e-06, + "loss": 1.0952, + "step": 5030 + }, + { + "epoch": 0.7122531322998513, + "grad_norm": 8.54383542200233, + 
"learning_rate": 3.730257825135067e-06, + "loss": 1.3665, + "step": 5031 + }, + { + "epoch": 0.7123947051744886, + "grad_norm": 10.047621442673556, + "learning_rate": 3.7297588013493124e-06, + "loss": 1.2516, + "step": 5032 + }, + { + "epoch": 0.7125362780491258, + "grad_norm": 8.435079948078071, + "learning_rate": 3.7292597129160547e-06, + "loss": 1.2167, + "step": 5033 + }, + { + "epoch": 0.712677850923763, + "grad_norm": 9.458597008703082, + "learning_rate": 3.72876055986153e-06, + "loss": 1.2492, + "step": 5034 + }, + { + "epoch": 0.7128194237984002, + "grad_norm": 8.587319149239676, + "learning_rate": 3.7282613422119794e-06, + "loss": 1.2576, + "step": 5035 + }, + { + "epoch": 0.7129609966730375, + "grad_norm": 10.347435055676614, + "learning_rate": 3.7277620599936453e-06, + "loss": 1.1688, + "step": 5036 + }, + { + "epoch": 0.7131025695476747, + "grad_norm": 9.786348324344, + "learning_rate": 3.7272627132327753e-06, + "loss": 1.3657, + "step": 5037 + }, + { + "epoch": 0.7132441424223119, + "grad_norm": 7.784879152196689, + "learning_rate": 3.7267633019556194e-06, + "loss": 1.1277, + "step": 5038 + }, + { + "epoch": 0.713385715296949, + "grad_norm": 9.534572935097172, + "learning_rate": 3.726263826188432e-06, + "loss": 1.2699, + "step": 5039 + }, + { + "epoch": 0.7135272881715863, + "grad_norm": 8.423108748337999, + "learning_rate": 3.7257642859574694e-06, + "loss": 1.2522, + "step": 5040 + }, + { + "epoch": 0.7136688610462235, + "grad_norm": 8.928291325126445, + "learning_rate": 3.7252646812889926e-06, + "loss": 1.3809, + "step": 5041 + }, + { + "epoch": 0.7138104339208607, + "grad_norm": 7.569674784881908, + "learning_rate": 3.724765012209264e-06, + "loss": 1.131, + "step": 5042 + }, + { + "epoch": 0.713952006795498, + "grad_norm": 9.579735874047046, + "learning_rate": 3.7242652787445527e-06, + "loss": 1.4202, + "step": 5043 + }, + { + "epoch": 0.7140935796701352, + "grad_norm": 10.404194396846076, + "learning_rate": 3.723765480921129e-06, + "loss": 1.3617, + "step": 5044 + }, + { + "epoch": 0.7142351525447724, + "grad_norm": 8.316968132879904, + "learning_rate": 3.7232656187652655e-06, + "loss": 1.1848, + "step": 5045 + }, + { + "epoch": 0.7143767254194097, + "grad_norm": 8.295859294437525, + "learning_rate": 3.7227656923032406e-06, + "loss": 1.1614, + "step": 5046 + }, + { + "epoch": 0.7145182982940469, + "grad_norm": 10.247191625916914, + "learning_rate": 3.7222657015613354e-06, + "loss": 1.339, + "step": 5047 + }, + { + "epoch": 0.7146598711686841, + "grad_norm": 7.654310580253742, + "learning_rate": 3.7217656465658335e-06, + "loss": 1.1711, + "step": 5048 + }, + { + "epoch": 0.7148014440433214, + "grad_norm": 10.291922829923928, + "learning_rate": 3.721265527343023e-06, + "loss": 1.3388, + "step": 5049 + }, + { + "epoch": 0.7149430169179585, + "grad_norm": 10.685927247445367, + "learning_rate": 3.7207653439191944e-06, + "loss": 1.2639, + "step": 5050 + }, + { + "epoch": 0.7150845897925957, + "grad_norm": 8.494415243678121, + "learning_rate": 3.720265096320641e-06, + "loss": 1.3357, + "step": 5051 + }, + { + "epoch": 0.7152261626672329, + "grad_norm": 10.687088093990381, + "learning_rate": 3.7197647845736616e-06, + "loss": 1.3178, + "step": 5052 + }, + { + "epoch": 0.7153677355418702, + "grad_norm": 11.36310063989331, + "learning_rate": 3.719264408704557e-06, + "loss": 1.0806, + "step": 5053 + }, + { + "epoch": 0.7155093084165074, + "grad_norm": 11.5902011920361, + "learning_rate": 3.718763968739632e-06, + "loss": 1.284, + "step": 5054 + }, + { + "epoch": 0.7156508812911446, + 
"grad_norm": 8.875307319585607, + "learning_rate": 3.718263464705194e-06, + "loss": 1.2202, + "step": 5055 + }, + { + "epoch": 0.7157924541657819, + "grad_norm": 12.197269221626772, + "learning_rate": 3.7177628966275535e-06, + "loss": 1.1655, + "step": 5056 + }, + { + "epoch": 0.7159340270404191, + "grad_norm": 9.117819273044319, + "learning_rate": 3.717262264533026e-06, + "loss": 1.2692, + "step": 5057 + }, + { + "epoch": 0.7160755999150563, + "grad_norm": 9.817635716878883, + "learning_rate": 3.716761568447928e-06, + "loss": 1.2274, + "step": 5058 + }, + { + "epoch": 0.7162171727896935, + "grad_norm": 8.612251922622267, + "learning_rate": 3.7162608083985824e-06, + "loss": 1.3801, + "step": 5059 + }, + { + "epoch": 0.7163587456643307, + "grad_norm": 9.877895268801257, + "learning_rate": 3.715759984411313e-06, + "loss": 1.3553, + "step": 5060 + }, + { + "epoch": 0.7165003185389679, + "grad_norm": 10.316661018365927, + "learning_rate": 3.715259096512447e-06, + "loss": 1.3419, + "step": 5061 + }, + { + "epoch": 0.7166418914136051, + "grad_norm": 8.241782980298373, + "learning_rate": 3.7147581447283172e-06, + "loss": 1.118, + "step": 5062 + }, + { + "epoch": 0.7167834642882424, + "grad_norm": 10.945054241131407, + "learning_rate": 3.714257129085257e-06, + "loss": 1.3159, + "step": 5063 + }, + { + "epoch": 0.7169250371628796, + "grad_norm": 8.084213471098414, + "learning_rate": 3.7137560496096054e-06, + "loss": 1.3527, + "step": 5064 + }, + { + "epoch": 0.7170666100375168, + "grad_norm": 12.411323029035007, + "learning_rate": 3.7132549063277033e-06, + "loss": 1.2772, + "step": 5065 + }, + { + "epoch": 0.717208182912154, + "grad_norm": 12.841622652097639, + "learning_rate": 3.712753699265895e-06, + "loss": 1.2729, + "step": 5066 + }, + { + "epoch": 0.7173497557867913, + "grad_norm": 7.832156180814695, + "learning_rate": 3.712252428450529e-06, + "loss": 1.1791, + "step": 5067 + }, + { + "epoch": 0.7174913286614285, + "grad_norm": 10.107018975967463, + "learning_rate": 3.7117510939079563e-06, + "loss": 1.4276, + "step": 5068 + }, + { + "epoch": 0.7176329015360657, + "grad_norm": 8.051396257137835, + "learning_rate": 3.7112496956645326e-06, + "loss": 1.252, + "step": 5069 + }, + { + "epoch": 0.7177744744107029, + "grad_norm": 8.89724179507664, + "learning_rate": 3.710748233746616e-06, + "loss": 1.1197, + "step": 5070 + }, + { + "epoch": 0.7179160472853401, + "grad_norm": 11.028007571124057, + "learning_rate": 3.7102467081805676e-06, + "loss": 1.2125, + "step": 5071 + }, + { + "epoch": 0.7180576201599773, + "grad_norm": 7.730168965653262, + "learning_rate": 3.709745118992751e-06, + "loss": 1.2158, + "step": 5072 + }, + { + "epoch": 0.7181991930346145, + "grad_norm": 9.114115454881091, + "learning_rate": 3.709243466209537e-06, + "loss": 1.301, + "step": 5073 + }, + { + "epoch": 0.7183407659092518, + "grad_norm": 8.239047091299833, + "learning_rate": 3.7087417498572946e-06, + "loss": 1.2783, + "step": 5074 + }, + { + "epoch": 0.718482338783889, + "grad_norm": 9.781960166037347, + "learning_rate": 3.7082399699623996e-06, + "loss": 1.2208, + "step": 5075 + }, + { + "epoch": 0.7186239116585262, + "grad_norm": 9.778672533807311, + "learning_rate": 3.707738126551231e-06, + "loss": 1.2108, + "step": 5076 + }, + { + "epoch": 0.7187654845331635, + "grad_norm": 8.112918263425525, + "learning_rate": 3.707236219650169e-06, + "loss": 1.1804, + "step": 5077 + }, + { + "epoch": 0.7189070574078007, + "grad_norm": 10.245028150602643, + "learning_rate": 3.7067342492855997e-06, + "loss": 1.5004, + "step": 5078 + }, + 
{ + "epoch": 0.7190486302824379, + "grad_norm": 9.740069295961744, + "learning_rate": 3.7062322154839098e-06, + "loss": 1.3034, + "step": 5079 + }, + { + "epoch": 0.7191902031570752, + "grad_norm": 9.389542756896832, + "learning_rate": 3.7057301182714924e-06, + "loss": 1.3087, + "step": 5080 + }, + { + "epoch": 0.7193317760317123, + "grad_norm": 9.929339819222902, + "learning_rate": 3.705227957674742e-06, + "loss": 1.197, + "step": 5081 + }, + { + "epoch": 0.7194733489063495, + "grad_norm": 9.69728391709, + "learning_rate": 3.7047257337200554e-06, + "loss": 1.2315, + "step": 5082 + }, + { + "epoch": 0.7196149217809867, + "grad_norm": 10.551050967257806, + "learning_rate": 3.704223446433836e-06, + "loss": 1.3738, + "step": 5083 + }, + { + "epoch": 0.719756494655624, + "grad_norm": 8.099172064770919, + "learning_rate": 3.703721095842488e-06, + "loss": 1.2885, + "step": 5084 + }, + { + "epoch": 0.7198980675302612, + "grad_norm": 11.808551108093699, + "learning_rate": 3.703218681972419e-06, + "loss": 1.2763, + "step": 5085 + }, + { + "epoch": 0.7200396404048984, + "grad_norm": 9.39011273756184, + "learning_rate": 3.702716204850042e-06, + "loss": 1.2684, + "step": 5086 + }, + { + "epoch": 0.7201812132795357, + "grad_norm": 8.913345241191541, + "learning_rate": 3.7022136645017704e-06, + "loss": 1.21, + "step": 5087 + }, + { + "epoch": 0.7203227861541729, + "grad_norm": 8.793358325165967, + "learning_rate": 3.701711060954023e-06, + "loss": 1.2362, + "step": 5088 + }, + { + "epoch": 0.7204643590288101, + "grad_norm": 10.928511233836288, + "learning_rate": 3.701208394233221e-06, + "loss": 1.3687, + "step": 5089 + }, + { + "epoch": 0.7206059319034473, + "grad_norm": 12.936455578502573, + "learning_rate": 3.7007056643657884e-06, + "loss": 1.2005, + "step": 5090 + }, + { + "epoch": 0.7207475047780845, + "grad_norm": 10.206311789966241, + "learning_rate": 3.700202871378156e-06, + "loss": 1.1142, + "step": 5091 + }, + { + "epoch": 0.7208890776527217, + "grad_norm": 8.977644812771183, + "learning_rate": 3.6997000152967526e-06, + "loss": 1.2408, + "step": 5092 + }, + { + "epoch": 0.7210306505273589, + "grad_norm": 8.699778709666584, + "learning_rate": 3.699197096148014e-06, + "loss": 1.2759, + "step": 5093 + }, + { + "epoch": 0.7211722234019962, + "grad_norm": 9.161047814420874, + "learning_rate": 3.698694113958379e-06, + "loss": 1.2863, + "step": 5094 + }, + { + "epoch": 0.7213137962766334, + "grad_norm": 11.545414028219508, + "learning_rate": 3.6981910687542873e-06, + "loss": 1.3082, + "step": 5095 + }, + { + "epoch": 0.7214553691512706, + "grad_norm": 9.716589390058633, + "learning_rate": 3.697687960562185e-06, + "loss": 1.2858, + "step": 5096 + }, + { + "epoch": 0.7215969420259079, + "grad_norm": 9.355097689666481, + "learning_rate": 3.697184789408519e-06, + "loss": 1.2806, + "step": 5097 + }, + { + "epoch": 0.7217385149005451, + "grad_norm": 8.547630151168912, + "learning_rate": 3.6966815553197416e-06, + "loss": 1.2902, + "step": 5098 + }, + { + "epoch": 0.7218800877751823, + "grad_norm": 10.05820615151271, + "learning_rate": 3.696178258322307e-06, + "loss": 1.2081, + "step": 5099 + }, + { + "epoch": 0.7220216606498195, + "grad_norm": 10.637390093117958, + "learning_rate": 3.6956748984426736e-06, + "loss": 1.2723, + "step": 5100 + }, + { + "epoch": 0.7221632335244567, + "grad_norm": 9.689525367915023, + "learning_rate": 3.695171475707302e-06, + "loss": 1.3748, + "step": 5101 + }, + { + "epoch": 0.7223048063990939, + "grad_norm": 9.350124468459422, + "learning_rate": 3.694667990142658e-06, + "loss": 
1.2944, + "step": 5102 + }, + { + "epoch": 0.7224463792737311, + "grad_norm": 8.448328825793327, + "learning_rate": 3.6941644417752077e-06, + "loss": 1.1687, + "step": 5103 + }, + { + "epoch": 0.7225879521483684, + "grad_norm": 9.714593225308157, + "learning_rate": 3.6936608306314227e-06, + "loss": 1.2232, + "step": 5104 + }, + { + "epoch": 0.7227295250230056, + "grad_norm": 10.601770476266939, + "learning_rate": 3.6931571567377785e-06, + "loss": 1.2866, + "step": 5105 + }, + { + "epoch": 0.7228710978976428, + "grad_norm": 9.290412669958478, + "learning_rate": 3.692653420120752e-06, + "loss": 1.3162, + "step": 5106 + }, + { + "epoch": 0.72301267077228, + "grad_norm": 8.447918824094849, + "learning_rate": 3.6921496208068253e-06, + "loss": 1.084, + "step": 5107 + }, + { + "epoch": 0.7231542436469173, + "grad_norm": 9.640653093936983, + "learning_rate": 3.691645758822481e-06, + "loss": 1.3319, + "step": 5108 + }, + { + "epoch": 0.7232958165215545, + "grad_norm": 10.499879382212713, + "learning_rate": 3.6911418341942078e-06, + "loss": 1.2394, + "step": 5109 + }, + { + "epoch": 0.7234373893961917, + "grad_norm": 8.463411908059792, + "learning_rate": 3.690637846948497e-06, + "loss": 1.3194, + "step": 5110 + }, + { + "epoch": 0.723578962270829, + "grad_norm": 11.193678188892353, + "learning_rate": 3.6901337971118415e-06, + "loss": 1.3656, + "step": 5111 + }, + { + "epoch": 0.7237205351454661, + "grad_norm": 11.75890840511494, + "learning_rate": 3.6896296847107406e-06, + "loss": 1.2511, + "step": 5112 + }, + { + "epoch": 0.7238621080201033, + "grad_norm": 9.590755430187953, + "learning_rate": 3.6891255097716937e-06, + "loss": 1.2729, + "step": 5113 + }, + { + "epoch": 0.7240036808947405, + "grad_norm": 9.97994739342778, + "learning_rate": 3.6886212723212057e-06, + "loss": 1.3541, + "step": 5114 + }, + { + "epoch": 0.7241452537693778, + "grad_norm": 8.372523610676062, + "learning_rate": 3.6881169723857833e-06, + "loss": 1.3336, + "step": 5115 + }, + { + "epoch": 0.724286826644015, + "grad_norm": 10.088960910975104, + "learning_rate": 3.687612609991938e-06, + "loss": 1.2664, + "step": 5116 + }, + { + "epoch": 0.7244283995186522, + "grad_norm": 8.184992981513293, + "learning_rate": 3.6871081851661825e-06, + "loss": 1.1688, + "step": 5117 + }, + { + "epoch": 0.7245699723932895, + "grad_norm": 10.104178034720302, + "learning_rate": 3.686603697935036e-06, + "loss": 1.2276, + "step": 5118 + }, + { + "epoch": 0.7247115452679267, + "grad_norm": 8.67186356277399, + "learning_rate": 3.6860991483250167e-06, + "loss": 1.2104, + "step": 5119 + }, + { + "epoch": 0.7248531181425639, + "grad_norm": 9.984238219182414, + "learning_rate": 3.6855945363626504e-06, + "loss": 1.2063, + "step": 5120 + }, + { + "epoch": 0.7249946910172012, + "grad_norm": 11.920107329224615, + "learning_rate": 3.685089862074463e-06, + "loss": 1.5612, + "step": 5121 + }, + { + "epoch": 0.7251362638918383, + "grad_norm": 9.639229300940677, + "learning_rate": 3.684585125486985e-06, + "loss": 1.3208, + "step": 5122 + }, + { + "epoch": 0.7252778367664755, + "grad_norm": 11.766652744994605, + "learning_rate": 3.684080326626751e-06, + "loss": 1.153, + "step": 5123 + }, + { + "epoch": 0.7254194096411127, + "grad_norm": 9.629451762564607, + "learning_rate": 3.683575465520297e-06, + "loss": 1.2977, + "step": 5124 + }, + { + "epoch": 0.72556098251575, + "grad_norm": 8.08162626613411, + "learning_rate": 3.6830705421941624e-06, + "loss": 1.2736, + "step": 5125 + }, + { + "epoch": 0.7257025553903872, + "grad_norm": 8.238129832801244, + "learning_rate": 
3.6825655566748927e-06, + "loss": 1.2341, + "step": 5126 + }, + { + "epoch": 0.7258441282650244, + "grad_norm": 9.32968213552852, + "learning_rate": 3.6820605089890323e-06, + "loss": 1.4364, + "step": 5127 + }, + { + "epoch": 0.7259857011396617, + "grad_norm": 9.24666019665914, + "learning_rate": 3.6815553991631323e-06, + "loss": 1.2405, + "step": 5128 + }, + { + "epoch": 0.7261272740142989, + "grad_norm": 12.25977044791406, + "learning_rate": 3.681050227223747e-06, + "loss": 1.2526, + "step": 5129 + }, + { + "epoch": 0.7262688468889361, + "grad_norm": 10.39845474207342, + "learning_rate": 3.6805449931974313e-06, + "loss": 1.3095, + "step": 5130 + }, + { + "epoch": 0.7264104197635733, + "grad_norm": 8.981352242861403, + "learning_rate": 3.6800396971107456e-06, + "loss": 1.2415, + "step": 5131 + }, + { + "epoch": 0.7265519926382105, + "grad_norm": 9.19225826440444, + "learning_rate": 3.6795343389902534e-06, + "loss": 1.3882, + "step": 5132 + }, + { + "epoch": 0.7266935655128477, + "grad_norm": 8.356703629089258, + "learning_rate": 3.6790289188625196e-06, + "loss": 1.0816, + "step": 5133 + }, + { + "epoch": 0.7268351383874849, + "grad_norm": 9.05614337950522, + "learning_rate": 3.678523436754115e-06, + "loss": 1.1913, + "step": 5134 + }, + { + "epoch": 0.7269767112621222, + "grad_norm": 9.848994138838494, + "learning_rate": 3.678017892691612e-06, + "loss": 1.2706, + "step": 5135 + }, + { + "epoch": 0.7271182841367594, + "grad_norm": 11.712371310867532, + "learning_rate": 3.677512286701587e-06, + "loss": 1.3705, + "step": 5136 + }, + { + "epoch": 0.7272598570113966, + "grad_norm": 10.811858229802482, + "learning_rate": 3.677006618810619e-06, + "loss": 1.3665, + "step": 5137 + }, + { + "epoch": 0.7274014298860338, + "grad_norm": 10.016031004105134, + "learning_rate": 3.676500889045291e-06, + "loss": 1.2419, + "step": 5138 + }, + { + "epoch": 0.7275430027606711, + "grad_norm": 8.983784372569948, + "learning_rate": 3.6759950974321883e-06, + "loss": 1.2226, + "step": 5139 + }, + { + "epoch": 0.7276845756353083, + "grad_norm": 12.871581577439795, + "learning_rate": 3.6754892439979e-06, + "loss": 1.4036, + "step": 5140 + }, + { + "epoch": 0.7278261485099455, + "grad_norm": 10.447809849526141, + "learning_rate": 3.6749833287690183e-06, + "loss": 1.3349, + "step": 5141 + }, + { + "epoch": 0.7279677213845828, + "grad_norm": 10.822068054786103, + "learning_rate": 3.6744773517721394e-06, + "loss": 1.4621, + "step": 5142 + }, + { + "epoch": 0.7281092942592199, + "grad_norm": 10.550093548256012, + "learning_rate": 3.6739713130338617e-06, + "loss": 1.1373, + "step": 5143 + }, + { + "epoch": 0.7282508671338571, + "grad_norm": 10.37083222657552, + "learning_rate": 3.673465212580788e-06, + "loss": 1.3429, + "step": 5144 + }, + { + "epoch": 0.7283924400084943, + "grad_norm": 10.45853520131423, + "learning_rate": 3.672959050439523e-06, + "loss": 1.2876, + "step": 5145 + }, + { + "epoch": 0.7285340128831316, + "grad_norm": 7.2325695578022104, + "learning_rate": 3.672452826636675e-06, + "loss": 1.2238, + "step": 5146 + }, + { + "epoch": 0.7286755857577688, + "grad_norm": 9.470589739151634, + "learning_rate": 3.671946541198856e-06, + "loss": 1.2213, + "step": 5147 + }, + { + "epoch": 0.728817158632406, + "grad_norm": 10.43340618689048, + "learning_rate": 3.671440194152681e-06, + "loss": 1.239, + "step": 5148 + }, + { + "epoch": 0.7289587315070433, + "grad_norm": 9.251342675989996, + "learning_rate": 3.670933785524769e-06, + "loss": 1.1187, + "step": 5149 + }, + { + "epoch": 0.7291003043816805, + "grad_norm": 
11.6569385478604, + "learning_rate": 3.6704273153417407e-06, + "loss": 1.2649, + "step": 5150 + }, + { + "epoch": 0.7292418772563177, + "grad_norm": 9.851036257381164, + "learning_rate": 3.669920783630221e-06, + "loss": 1.2911, + "step": 5151 + }, + { + "epoch": 0.729383450130955, + "grad_norm": 8.520755783014176, + "learning_rate": 3.669414190416838e-06, + "loss": 1.1824, + "step": 5152 + }, + { + "epoch": 0.7295250230055921, + "grad_norm": 8.439489511536348, + "learning_rate": 3.6689075357282235e-06, + "loss": 1.3285, + "step": 5153 + }, + { + "epoch": 0.7296665958802293, + "grad_norm": 14.098412260658446, + "learning_rate": 3.668400819591011e-06, + "loss": 1.3154, + "step": 5154 + }, + { + "epoch": 0.7298081687548665, + "grad_norm": 13.188097031805327, + "learning_rate": 3.6678940420318385e-06, + "loss": 1.3574, + "step": 5155 + }, + { + "epoch": 0.7299497416295038, + "grad_norm": 11.443517884046846, + "learning_rate": 3.6673872030773473e-06, + "loss": 1.3242, + "step": 5156 + }, + { + "epoch": 0.730091314504141, + "grad_norm": 8.977181648169891, + "learning_rate": 3.666880302754181e-06, + "loss": 1.1358, + "step": 5157 + }, + { + "epoch": 0.7302328873787782, + "grad_norm": 10.533142100984456, + "learning_rate": 3.6663733410889875e-06, + "loss": 1.2687, + "step": 5158 + }, + { + "epoch": 0.7303744602534155, + "grad_norm": 11.251551881352613, + "learning_rate": 3.665866318108417e-06, + "loss": 1.2583, + "step": 5159 + }, + { + "epoch": 0.7305160331280527, + "grad_norm": 13.205848115076765, + "learning_rate": 3.665359233839124e-06, + "loss": 1.2862, + "step": 5160 + }, + { + "epoch": 0.7306576060026899, + "grad_norm": 10.22570675969889, + "learning_rate": 3.6648520883077644e-06, + "loss": 1.2053, + "step": 5161 + }, + { + "epoch": 0.7307991788773271, + "grad_norm": 8.696329364115625, + "learning_rate": 3.6643448815409994e-06, + "loss": 1.4039, + "step": 5162 + }, + { + "epoch": 0.7309407517519643, + "grad_norm": 7.9356937305664434, + "learning_rate": 3.663837613565492e-06, + "loss": 1.0407, + "step": 5163 + }, + { + "epoch": 0.7310823246266015, + "grad_norm": 13.382554817279846, + "learning_rate": 3.663330284407908e-06, + "loss": 1.3257, + "step": 5164 + }, + { + "epoch": 0.7312238975012387, + "grad_norm": 10.880869016162036, + "learning_rate": 3.6628228940949195e-06, + "loss": 1.169, + "step": 5165 + }, + { + "epoch": 0.731365470375876, + "grad_norm": 7.946631278059502, + "learning_rate": 3.662315442653199e-06, + "loss": 1.2242, + "step": 5166 + }, + { + "epoch": 0.7315070432505132, + "grad_norm": 11.988402166219794, + "learning_rate": 3.661807930109422e-06, + "loss": 1.4152, + "step": 5167 + }, + { + "epoch": 0.7316486161251504, + "grad_norm": 7.152900811611465, + "learning_rate": 3.6613003564902678e-06, + "loss": 1.2662, + "step": 5168 + }, + { + "epoch": 0.7317901889997876, + "grad_norm": 9.56794214906076, + "learning_rate": 3.66079272182242e-06, + "loss": 1.3481, + "step": 5169 + }, + { + "epoch": 0.7319317618744249, + "grad_norm": 13.643073457587942, + "learning_rate": 3.6602850261325645e-06, + "loss": 1.3628, + "step": 5170 + }, + { + "epoch": 0.7320733347490621, + "grad_norm": 11.209311592682038, + "learning_rate": 3.6597772694473902e-06, + "loss": 1.3356, + "step": 5171 + }, + { + "epoch": 0.7322149076236993, + "grad_norm": 10.96889710939011, + "learning_rate": 3.6592694517935895e-06, + "loss": 1.3145, + "step": 5172 + }, + { + "epoch": 0.7323564804983366, + "grad_norm": 8.02562329495105, + "learning_rate": 3.6587615731978583e-06, + "loss": 1.173, + "step": 5173 + }, + { + 
"epoch": 0.7324980533729737, + "grad_norm": 8.242029212046422, + "learning_rate": 3.658253633686895e-06, + "loss": 1.2253, + "step": 5174 + }, + { + "epoch": 0.7326396262476109, + "grad_norm": 9.624548963407243, + "learning_rate": 3.6577456332874025e-06, + "loss": 1.2072, + "step": 5175 + }, + { + "epoch": 0.7327811991222482, + "grad_norm": 11.886017196065142, + "learning_rate": 3.657237572026085e-06, + "loss": 1.2405, + "step": 5176 + }, + { + "epoch": 0.7329227719968854, + "grad_norm": 11.947720770766738, + "learning_rate": 3.656729449929651e-06, + "loss": 1.4367, + "step": 5177 + }, + { + "epoch": 0.7330643448715226, + "grad_norm": 9.6891135563987, + "learning_rate": 3.656221267024812e-06, + "loss": 1.2576, + "step": 5178 + }, + { + "epoch": 0.7332059177461598, + "grad_norm": 9.565747875083682, + "learning_rate": 3.6557130233382833e-06, + "loss": 1.391, + "step": 5179 + }, + { + "epoch": 0.7333474906207971, + "grad_norm": 8.39992265211552, + "learning_rate": 3.6552047188967827e-06, + "loss": 1.2423, + "step": 5180 + }, + { + "epoch": 0.7334890634954343, + "grad_norm": 12.055189378121888, + "learning_rate": 3.6546963537270314e-06, + "loss": 1.1793, + "step": 5181 + }, + { + "epoch": 0.7336306363700715, + "grad_norm": 10.032347812177951, + "learning_rate": 3.654187927855754e-06, + "loss": 1.307, + "step": 5182 + }, + { + "epoch": 0.7337722092447088, + "grad_norm": 10.115008663790391, + "learning_rate": 3.6536794413096775e-06, + "loss": 1.1515, + "step": 5183 + }, + { + "epoch": 0.7339137821193459, + "grad_norm": 7.275401635897708, + "learning_rate": 3.6531708941155337e-06, + "loss": 1.1348, + "step": 5184 + }, + { + "epoch": 0.7340553549939831, + "grad_norm": 8.263025808989074, + "learning_rate": 3.652662286300055e-06, + "loss": 1.1498, + "step": 5185 + }, + { + "epoch": 0.7341969278686203, + "grad_norm": 9.488657554806307, + "learning_rate": 3.6521536178899798e-06, + "loss": 1.159, + "step": 5186 + }, + { + "epoch": 0.7343385007432576, + "grad_norm": 10.357143248008382, + "learning_rate": 3.6516448889120475e-06, + "loss": 1.2947, + "step": 5187 + }, + { + "epoch": 0.7344800736178948, + "grad_norm": 7.931221468778715, + "learning_rate": 3.651136099393003e-06, + "loss": 1.3574, + "step": 5188 + }, + { + "epoch": 0.734621646492532, + "grad_norm": 9.464668829636222, + "learning_rate": 3.650627249359591e-06, + "loss": 1.3439, + "step": 5189 + }, + { + "epoch": 0.7347632193671693, + "grad_norm": 9.078569568772332, + "learning_rate": 3.650118338838563e-06, + "loss": 1.2525, + "step": 5190 + }, + { + "epoch": 0.7349047922418065, + "grad_norm": 8.137622594422638, + "learning_rate": 3.6496093678566713e-06, + "loss": 1.286, + "step": 5191 + }, + { + "epoch": 0.7350463651164437, + "grad_norm": 10.942569625873524, + "learning_rate": 3.649100336440673e-06, + "loss": 1.2613, + "step": 5192 + }, + { + "epoch": 0.735187937991081, + "grad_norm": 7.51561941674039, + "learning_rate": 3.648591244617326e-06, + "loss": 1.2604, + "step": 5193 + }, + { + "epoch": 0.7353295108657182, + "grad_norm": 10.150951141741903, + "learning_rate": 3.648082092413394e-06, + "loss": 1.3052, + "step": 5194 + }, + { + "epoch": 0.7354710837403553, + "grad_norm": 10.146964279735998, + "learning_rate": 3.6475728798556426e-06, + "loss": 1.314, + "step": 5195 + }, + { + "epoch": 0.7356126566149925, + "grad_norm": 9.023857285284148, + "learning_rate": 3.6470636069708405e-06, + "loss": 1.277, + "step": 5196 + }, + { + "epoch": 0.7357542294896298, + "grad_norm": 9.044065892542598, + "learning_rate": 3.6465542737857603e-06, + "loss": 
1.1713, + "step": 5197 + }, + { + "epoch": 0.735895802364267, + "grad_norm": 9.533871949814973, + "learning_rate": 3.646044880327176e-06, + "loss": 1.2165, + "step": 5198 + }, + { + "epoch": 0.7360373752389042, + "grad_norm": 10.233152360175358, + "learning_rate": 3.6455354266218675e-06, + "loss": 1.2245, + "step": 5199 + }, + { + "epoch": 0.7361789481135415, + "grad_norm": 9.382753548290033, + "learning_rate": 3.645025912696615e-06, + "loss": 1.3147, + "step": 5200 + }, + { + "epoch": 0.7363205209881787, + "grad_norm": 7.402912917184953, + "learning_rate": 3.644516338578204e-06, + "loss": 1.3148, + "step": 5201 + }, + { + "epoch": 0.7364620938628159, + "grad_norm": 8.865087857141843, + "learning_rate": 3.644006704293423e-06, + "loss": 1.3511, + "step": 5202 + }, + { + "epoch": 0.7366036667374531, + "grad_norm": 8.840681318721947, + "learning_rate": 3.643497009869063e-06, + "loss": 1.2473, + "step": 5203 + }, + { + "epoch": 0.7367452396120904, + "grad_norm": 9.007718061761024, + "learning_rate": 3.642987255331917e-06, + "loss": 1.2677, + "step": 5204 + }, + { + "epoch": 0.7368868124867275, + "grad_norm": 11.260637806365692, + "learning_rate": 3.642477440708784e-06, + "loss": 1.4095, + "step": 5205 + }, + { + "epoch": 0.7370283853613647, + "grad_norm": 11.671754726622622, + "learning_rate": 3.641967566026463e-06, + "loss": 1.2258, + "step": 5206 + }, + { + "epoch": 0.737169958236002, + "grad_norm": 9.308226558536221, + "learning_rate": 3.641457631311759e-06, + "loss": 1.2104, + "step": 5207 + }, + { + "epoch": 0.7373115311106392, + "grad_norm": 9.40385411568065, + "learning_rate": 3.6409476365914786e-06, + "loss": 1.2737, + "step": 5208 + }, + { + "epoch": 0.7374531039852764, + "grad_norm": 9.16649192874707, + "learning_rate": 3.6404375818924315e-06, + "loss": 1.1916, + "step": 5209 + }, + { + "epoch": 0.7375946768599136, + "grad_norm": 11.417330608877974, + "learning_rate": 3.639927467241431e-06, + "loss": 1.1516, + "step": 5210 + }, + { + "epoch": 0.7377362497345509, + "grad_norm": 11.48392125485434, + "learning_rate": 3.639417292665293e-06, + "loss": 1.2893, + "step": 5211 + }, + { + "epoch": 0.7378778226091881, + "grad_norm": 9.155499215966035, + "learning_rate": 3.638907058190838e-06, + "loss": 1.3363, + "step": 5212 + }, + { + "epoch": 0.7380193954838253, + "grad_norm": 11.557946870627923, + "learning_rate": 3.638396763844889e-06, + "loss": 1.3463, + "step": 5213 + }, + { + "epoch": 0.7381609683584626, + "grad_norm": 9.347628499749867, + "learning_rate": 3.63788640965427e-06, + "loss": 1.1756, + "step": 5214 + }, + { + "epoch": 0.7383025412330997, + "grad_norm": 10.072452527897951, + "learning_rate": 3.637375995645811e-06, + "loss": 1.2819, + "step": 5215 + }, + { + "epoch": 0.7384441141077369, + "grad_norm": 10.296166634397544, + "learning_rate": 3.6368655218463435e-06, + "loss": 1.3802, + "step": 5216 + }, + { + "epoch": 0.7385856869823741, + "grad_norm": 8.96127039350056, + "learning_rate": 3.636354988282704e-06, + "loss": 1.1792, + "step": 5217 + }, + { + "epoch": 0.7387272598570114, + "grad_norm": 9.653528910318924, + "learning_rate": 3.635844394981729e-06, + "loss": 1.1588, + "step": 5218 + }, + { + "epoch": 0.7388688327316486, + "grad_norm": 7.880949164993448, + "learning_rate": 3.6353337419702627e-06, + "loss": 1.1791, + "step": 5219 + }, + { + "epoch": 0.7390104056062858, + "grad_norm": 9.767389293753098, + "learning_rate": 3.6348230292751476e-06, + "loss": 1.1924, + "step": 5220 + }, + { + "epoch": 0.7391519784809231, + "grad_norm": 10.268147476545984, + "learning_rate": 
3.6343122569232313e-06, + "loss": 1.2072, + "step": 5221 + }, + { + "epoch": 0.7392935513555603, + "grad_norm": 7.454385774767277, + "learning_rate": 3.6338014249413657e-06, + "loss": 1.0635, + "step": 5222 + }, + { + "epoch": 0.7394351242301975, + "grad_norm": 9.895154256442344, + "learning_rate": 3.6332905333564046e-06, + "loss": 1.2943, + "step": 5223 + }, + { + "epoch": 0.7395766971048348, + "grad_norm": 9.344166009630145, + "learning_rate": 3.632779582195205e-06, + "loss": 1.371, + "step": 5224 + }, + { + "epoch": 0.739718269979472, + "grad_norm": 10.983969712139917, + "learning_rate": 3.6322685714846277e-06, + "loss": 1.3387, + "step": 5225 + }, + { + "epoch": 0.7398598428541091, + "grad_norm": 7.9552967875975416, + "learning_rate": 3.631757501251536e-06, + "loss": 1.2348, + "step": 5226 + }, + { + "epoch": 0.7400014157287463, + "grad_norm": 10.672934796503263, + "learning_rate": 3.631246371522796e-06, + "loss": 1.3486, + "step": 5227 + }, + { + "epoch": 0.7401429886033836, + "grad_norm": 7.750074570820043, + "learning_rate": 3.6307351823252778e-06, + "loss": 1.3397, + "step": 5228 + }, + { + "epoch": 0.7402845614780208, + "grad_norm": 8.244614201409192, + "learning_rate": 3.6302239336858547e-06, + "loss": 1.1519, + "step": 5229 + }, + { + "epoch": 0.740426134352658, + "grad_norm": 7.881839022909351, + "learning_rate": 3.6297126256314013e-06, + "loss": 1.2336, + "step": 5230 + }, + { + "epoch": 0.7405677072272953, + "grad_norm": 7.101234017172676, + "learning_rate": 3.629201258188798e-06, + "loss": 1.1502, + "step": 5231 + }, + { + "epoch": 0.7407092801019325, + "grad_norm": 7.568237902250983, + "learning_rate": 3.6286898313849267e-06, + "loss": 1.3364, + "step": 5232 + }, + { + "epoch": 0.7408508529765697, + "grad_norm": 7.549710390713181, + "learning_rate": 3.6281783452466725e-06, + "loss": 1.1825, + "step": 5233 + }, + { + "epoch": 0.740992425851207, + "grad_norm": 7.560515844921469, + "learning_rate": 3.6276667998009242e-06, + "loss": 1.234, + "step": 5234 + }, + { + "epoch": 0.7411339987258442, + "grad_norm": 8.701372446131074, + "learning_rate": 3.627155195074572e-06, + "loss": 1.2128, + "step": 5235 + }, + { + "epoch": 0.7412755716004813, + "grad_norm": 7.467335842436728, + "learning_rate": 3.6266435310945125e-06, + "loss": 1.1976, + "step": 5236 + }, + { + "epoch": 0.7414171444751185, + "grad_norm": 9.152944339155766, + "learning_rate": 3.6261318078876416e-06, + "loss": 1.2441, + "step": 5237 + }, + { + "epoch": 0.7415587173497558, + "grad_norm": 10.944610899957333, + "learning_rate": 3.625620025480862e-06, + "loss": 1.3752, + "step": 5238 + }, + { + "epoch": 0.741700290224393, + "grad_norm": 7.505441344802678, + "learning_rate": 3.625108183901077e-06, + "loss": 1.302, + "step": 5239 + }, + { + "epoch": 0.7418418630990302, + "grad_norm": 10.305213527253336, + "learning_rate": 3.624596283175194e-06, + "loss": 1.3533, + "step": 5240 + }, + { + "epoch": 0.7419834359736674, + "grad_norm": 8.32104420198139, + "learning_rate": 3.6240843233301228e-06, + "loss": 1.2248, + "step": 5241 + }, + { + "epoch": 0.7421250088483047, + "grad_norm": 11.633731967104188, + "learning_rate": 3.623572304392776e-06, + "loss": 1.2812, + "step": 5242 + }, + { + "epoch": 0.7422665817229419, + "grad_norm": 8.398006893030669, + "learning_rate": 3.6230602263900714e-06, + "loss": 1.376, + "step": 5243 + }, + { + "epoch": 0.7424081545975791, + "grad_norm": 8.585537180015796, + "learning_rate": 3.6225480893489283e-06, + "loss": 1.1023, + "step": 5244 + }, + { + "epoch": 0.7425497274722164, + "grad_norm": 
10.077017032725113, + "learning_rate": 3.6220358932962696e-06, + "loss": 1.2636, + "step": 5245 + }, + { + "epoch": 0.7426913003468535, + "grad_norm": 8.6280246904256, + "learning_rate": 3.6215236382590197e-06, + "loss": 1.4506, + "step": 5246 + }, + { + "epoch": 0.7428328732214907, + "grad_norm": 9.937228984855937, + "learning_rate": 3.621011324264109e-06, + "loss": 1.2891, + "step": 5247 + }, + { + "epoch": 0.742974446096128, + "grad_norm": 9.42783532840345, + "learning_rate": 3.620498951338468e-06, + "loss": 1.273, + "step": 5248 + }, + { + "epoch": 0.7431160189707652, + "grad_norm": 8.656916733827684, + "learning_rate": 3.6199865195090333e-06, + "loss": 1.1711, + "step": 5249 + }, + { + "epoch": 0.7432575918454024, + "grad_norm": 11.235246266415, + "learning_rate": 3.619474028802743e-06, + "loss": 1.2557, + "step": 5250 + }, + { + "epoch": 0.7433991647200396, + "grad_norm": 9.600454367375463, + "learning_rate": 3.618961479246537e-06, + "loss": 1.2663, + "step": 5251 + }, + { + "epoch": 0.7435407375946769, + "grad_norm": 9.765776561323893, + "learning_rate": 3.6184488708673605e-06, + "loss": 1.2687, + "step": 5252 + }, + { + "epoch": 0.7436823104693141, + "grad_norm": 12.398651640400258, + "learning_rate": 3.61793620369216e-06, + "loss": 1.3407, + "step": 5253 + }, + { + "epoch": 0.7438238833439513, + "grad_norm": 13.636893304883964, + "learning_rate": 3.617423477747888e-06, + "loss": 1.3015, + "step": 5254 + }, + { + "epoch": 0.7439654562185886, + "grad_norm": 11.255447425168075, + "learning_rate": 3.616910693061496e-06, + "loss": 1.2419, + "step": 5255 + }, + { + "epoch": 0.7441070290932258, + "grad_norm": 8.31845361423784, + "learning_rate": 3.6163978496599428e-06, + "loss": 1.3042, + "step": 5256 + }, + { + "epoch": 0.7442486019678629, + "grad_norm": 10.546068762587073, + "learning_rate": 3.6158849475701863e-06, + "loss": 1.1896, + "step": 5257 + }, + { + "epoch": 0.7443901748425001, + "grad_norm": 10.675714073151251, + "learning_rate": 3.6153719868191905e-06, + "loss": 1.3167, + "step": 5258 + }, + { + "epoch": 0.7445317477171374, + "grad_norm": 10.373663391053274, + "learning_rate": 3.614858967433921e-06, + "loss": 1.2217, + "step": 5259 + }, + { + "epoch": 0.7446733205917746, + "grad_norm": 8.450029127082058, + "learning_rate": 3.6143458894413463e-06, + "loss": 1.3335, + "step": 5260 + }, + { + "epoch": 0.7448148934664118, + "grad_norm": 6.388803833473635, + "learning_rate": 3.613832752868439e-06, + "loss": 1.1314, + "step": 5261 + }, + { + "epoch": 0.7449564663410491, + "grad_norm": 8.252981860801983, + "learning_rate": 3.613319557742175e-06, + "loss": 1.3585, + "step": 5262 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 6.856671402255315, + "learning_rate": 3.6128063040895318e-06, + "loss": 1.2872, + "step": 5263 + }, + { + "epoch": 0.7452396120903235, + "grad_norm": 8.472556296955371, + "learning_rate": 3.612292991937491e-06, + "loss": 1.3495, + "step": 5264 + }, + { + "epoch": 0.7453811849649608, + "grad_norm": 9.880284694013914, + "learning_rate": 3.6117796213130367e-06, + "loss": 1.2844, + "step": 5265 + }, + { + "epoch": 0.745522757839598, + "grad_norm": 7.931238302788852, + "learning_rate": 3.6112661922431576e-06, + "loss": 1.2506, + "step": 5266 + }, + { + "epoch": 0.7456643307142351, + "grad_norm": 7.967856921845544, + "learning_rate": 3.610752704754842e-06, + "loss": 1.3228, + "step": 5267 + }, + { + "epoch": 0.7458059035888723, + "grad_norm": 8.608071784163315, + "learning_rate": 3.610239158875085e-06, + "loss": 1.191, + "step": 5268 + }, + { + "epoch": 
0.7459474764635096, + "grad_norm": 8.973574095278705, + "learning_rate": 3.609725554630884e-06, + "loss": 1.3282, + "step": 5269 + }, + { + "epoch": 0.7460890493381468, + "grad_norm": 9.432502282226027, + "learning_rate": 3.609211892049238e-06, + "loss": 1.2991, + "step": 5270 + }, + { + "epoch": 0.746230622212784, + "grad_norm": 8.022942071790416, + "learning_rate": 3.60869817115715e-06, + "loss": 1.1099, + "step": 5271 + }, + { + "epoch": 0.7463721950874213, + "grad_norm": 8.421975101590862, + "learning_rate": 3.6081843919816263e-06, + "loss": 1.2083, + "step": 5272 + }, + { + "epoch": 0.7465137679620585, + "grad_norm": 10.732731633666823, + "learning_rate": 3.6076705545496743e-06, + "loss": 1.2363, + "step": 5273 + }, + { + "epoch": 0.7466553408366957, + "grad_norm": 7.49359238612331, + "learning_rate": 3.6071566588883077e-06, + "loss": 1.1887, + "step": 5274 + }, + { + "epoch": 0.7467969137113329, + "grad_norm": 11.894291997799039, + "learning_rate": 3.606642705024541e-06, + "loss": 1.1765, + "step": 5275 + }, + { + "epoch": 0.7469384865859702, + "grad_norm": 10.302552760543819, + "learning_rate": 3.6061286929853915e-06, + "loss": 1.4551, + "step": 5276 + }, + { + "epoch": 0.7470800594606073, + "grad_norm": 10.404042602663031, + "learning_rate": 3.6056146227978827e-06, + "loss": 1.4332, + "step": 5277 + }, + { + "epoch": 0.7472216323352445, + "grad_norm": 10.008923935649095, + "learning_rate": 3.6051004944890373e-06, + "loss": 1.3871, + "step": 5278 + }, + { + "epoch": 0.7473632052098818, + "grad_norm": 9.475782344999525, + "learning_rate": 3.6045863080858824e-06, + "loss": 1.1792, + "step": 5279 + }, + { + "epoch": 0.747504778084519, + "grad_norm": 11.504719968037847, + "learning_rate": 3.604072063615449e-06, + "loss": 1.2039, + "step": 5280 + }, + { + "epoch": 0.7476463509591562, + "grad_norm": 9.388876449105211, + "learning_rate": 3.6035577611047713e-06, + "loss": 1.1879, + "step": 5281 + }, + { + "epoch": 0.7477879238337934, + "grad_norm": 8.516453245988806, + "learning_rate": 3.603043400580884e-06, + "loss": 1.1095, + "step": 5282 + }, + { + "epoch": 0.7479294967084307, + "grad_norm": 8.259742302003605, + "learning_rate": 3.6025289820708277e-06, + "loss": 1.1363, + "step": 5283 + }, + { + "epoch": 0.7480710695830679, + "grad_norm": 10.252895783545359, + "learning_rate": 3.6020145056016454e-06, + "loss": 1.1861, + "step": 5284 + }, + { + "epoch": 0.7482126424577051, + "grad_norm": 8.41947401709925, + "learning_rate": 3.601499971200382e-06, + "loss": 1.2405, + "step": 5285 + }, + { + "epoch": 0.7483542153323424, + "grad_norm": 8.139128153831477, + "learning_rate": 3.600985378894086e-06, + "loss": 1.2881, + "step": 5286 + }, + { + "epoch": 0.7484957882069796, + "grad_norm": 7.33889155279139, + "learning_rate": 3.6004707287098104e-06, + "loss": 1.1257, + "step": 5287 + }, + { + "epoch": 0.7486373610816167, + "grad_norm": 7.309652442445243, + "learning_rate": 3.5999560206746088e-06, + "loss": 1.1952, + "step": 5288 + }, + { + "epoch": 0.748778933956254, + "grad_norm": 9.943298567445673, + "learning_rate": 3.5994412548155387e-06, + "loss": 1.2129, + "step": 5289 + }, + { + "epoch": 0.7489205068308912, + "grad_norm": 8.876568722768898, + "learning_rate": 3.5989264311596617e-06, + "loss": 1.1777, + "step": 5290 + }, + { + "epoch": 0.7490620797055284, + "grad_norm": 10.343759219808014, + "learning_rate": 3.598411549734042e-06, + "loss": 1.3628, + "step": 5291 + }, + { + "epoch": 0.7492036525801656, + "grad_norm": 7.60362098164228, + "learning_rate": 3.5978966105657465e-06, + "loss": 1.266, 
+ "step": 5292 + }, + { + "epoch": 0.7493452254548029, + "grad_norm": 7.800356773628911, + "learning_rate": 3.597381613681845e-06, + "loss": 1.3083, + "step": 5293 + }, + { + "epoch": 0.7494867983294401, + "grad_norm": 9.358927873684262, + "learning_rate": 3.5968665591094097e-06, + "loss": 1.3291, + "step": 5294 + }, + { + "epoch": 0.7496283712040773, + "grad_norm": 8.452653402828005, + "learning_rate": 3.5963514468755172e-06, + "loss": 1.2212, + "step": 5295 + }, + { + "epoch": 0.7497699440787146, + "grad_norm": 8.017151090561205, + "learning_rate": 3.5958362770072465e-06, + "loss": 1.276, + "step": 5296 + }, + { + "epoch": 0.7499115169533518, + "grad_norm": 7.80914869526292, + "learning_rate": 3.59532104953168e-06, + "loss": 1.2712, + "step": 5297 + }, + { + "epoch": 0.7500530898279889, + "grad_norm": 7.932903009919819, + "learning_rate": 3.5948057644759025e-06, + "loss": 1.2068, + "step": 5298 + }, + { + "epoch": 0.7501946627026261, + "grad_norm": 9.928133791210303, + "learning_rate": 3.5942904218670025e-06, + "loss": 1.2994, + "step": 5299 + }, + { + "epoch": 0.7503362355772634, + "grad_norm": 8.069404898141885, + "learning_rate": 3.5937750217320712e-06, + "loss": 1.3877, + "step": 5300 + }, + { + "epoch": 0.7504778084519006, + "grad_norm": 8.728716441874845, + "learning_rate": 3.5932595640982023e-06, + "loss": 1.2207, + "step": 5301 + }, + { + "epoch": 0.7506193813265378, + "grad_norm": 7.979224647198435, + "learning_rate": 3.592744048992493e-06, + "loss": 1.2443, + "step": 5302 + }, + { + "epoch": 0.7507609542011751, + "grad_norm": 9.121182413915955, + "learning_rate": 3.5922284764420445e-06, + "loss": 1.3691, + "step": 5303 + }, + { + "epoch": 0.7509025270758123, + "grad_norm": 9.537829120201927, + "learning_rate": 3.5917128464739586e-06, + "loss": 1.2502, + "step": 5304 + }, + { + "epoch": 0.7510440999504495, + "grad_norm": 8.272773521515093, + "learning_rate": 3.5911971591153426e-06, + "loss": 1.258, + "step": 5305 + }, + { + "epoch": 0.7511856728250867, + "grad_norm": 9.101662251331055, + "learning_rate": 3.590681414393306e-06, + "loss": 1.256, + "step": 5306 + }, + { + "epoch": 0.751327245699724, + "grad_norm": 7.359632094766185, + "learning_rate": 3.5901656123349606e-06, + "loss": 1.2355, + "step": 5307 + }, + { + "epoch": 0.7514688185743611, + "grad_norm": 10.783641251457539, + "learning_rate": 3.5896497529674213e-06, + "loss": 1.2127, + "step": 5308 + }, + { + "epoch": 0.7516103914489983, + "grad_norm": 8.58459695446475, + "learning_rate": 3.589133836317808e-06, + "loss": 1.2717, + "step": 5309 + }, + { + "epoch": 0.7517519643236356, + "grad_norm": 9.141900702244968, + "learning_rate": 3.5886178624132407e-06, + "loss": 1.2816, + "step": 5310 + }, + { + "epoch": 0.7518935371982728, + "grad_norm": 8.704105890359212, + "learning_rate": 3.5881018312808435e-06, + "loss": 1.3212, + "step": 5311 + }, + { + "epoch": 0.75203511007291, + "grad_norm": 9.366763145882077, + "learning_rate": 3.5875857429477447e-06, + "loss": 1.2922, + "step": 5312 + }, + { + "epoch": 0.7521766829475472, + "grad_norm": 10.042524710015382, + "learning_rate": 3.5870695974410734e-06, + "loss": 1.3256, + "step": 5313 + }, + { + "epoch": 0.7523182558221845, + "grad_norm": 11.201364120789213, + "learning_rate": 3.586553394787965e-06, + "loss": 1.2475, + "step": 5314 + }, + { + "epoch": 0.7524598286968217, + "grad_norm": 9.741371053273692, + "learning_rate": 3.5860371350155547e-06, + "loss": 1.4311, + "step": 5315 + }, + { + "epoch": 0.7526014015714589, + "grad_norm": 8.629194041573577, + "learning_rate": 
3.5855208181509817e-06, + "loss": 1.1594, + "step": 5316 + }, + { + "epoch": 0.7527429744460962, + "grad_norm": 10.454647392409242, + "learning_rate": 3.585004444221389e-06, + "loss": 1.3115, + "step": 5317 + }, + { + "epoch": 0.7528845473207334, + "grad_norm": 10.059580124573532, + "learning_rate": 3.584488013253921e-06, + "loss": 1.2082, + "step": 5318 + }, + { + "epoch": 0.7530261201953705, + "grad_norm": 9.382911700662108, + "learning_rate": 3.5839715252757273e-06, + "loss": 1.2735, + "step": 5319 + }, + { + "epoch": 0.7531676930700077, + "grad_norm": 8.450132055210041, + "learning_rate": 3.583454980313959e-06, + "loss": 1.158, + "step": 5320 + }, + { + "epoch": 0.753309265944645, + "grad_norm": 8.475178998559098, + "learning_rate": 3.58293837839577e-06, + "loss": 1.1053, + "step": 5321 + }, + { + "epoch": 0.7534508388192822, + "grad_norm": 8.929461512204934, + "learning_rate": 3.5824217195483178e-06, + "loss": 1.1672, + "step": 5322 + }, + { + "epoch": 0.7535924116939194, + "grad_norm": 9.61319672796549, + "learning_rate": 3.581905003798763e-06, + "loss": 1.1835, + "step": 5323 + }, + { + "epoch": 0.7537339845685567, + "grad_norm": 7.649149938489514, + "learning_rate": 3.581388231174269e-06, + "loss": 1.2791, + "step": 5324 + }, + { + "epoch": 0.7538755574431939, + "grad_norm": 9.523851668854968, + "learning_rate": 3.580871401702002e-06, + "loss": 1.3424, + "step": 5325 + }, + { + "epoch": 0.7540171303178311, + "grad_norm": 9.407599976449601, + "learning_rate": 3.5803545154091312e-06, + "loss": 1.2421, + "step": 5326 + }, + { + "epoch": 0.7541587031924684, + "grad_norm": 8.847290624343792, + "learning_rate": 3.5798375723228283e-06, + "loss": 1.3853, + "step": 5327 + }, + { + "epoch": 0.7543002760671056, + "grad_norm": 9.164333485843152, + "learning_rate": 3.57932057247027e-06, + "loss": 1.2116, + "step": 5328 + }, + { + "epoch": 0.7544418489417427, + "grad_norm": 8.626470025880915, + "learning_rate": 3.5788035158786346e-06, + "loss": 1.2988, + "step": 5329 + }, + { + "epoch": 0.7545834218163799, + "grad_norm": 9.651666337267233, + "learning_rate": 3.5782864025751025e-06, + "loss": 1.2906, + "step": 5330 + }, + { + "epoch": 0.7547249946910172, + "grad_norm": 8.668026841851304, + "learning_rate": 3.577769232586858e-06, + "loss": 1.2244, + "step": 5331 + }, + { + "epoch": 0.7548665675656544, + "grad_norm": 7.459332134633462, + "learning_rate": 3.5772520059410887e-06, + "loss": 1.1949, + "step": 5332 + }, + { + "epoch": 0.7550081404402916, + "grad_norm": 9.00175077575516, + "learning_rate": 3.576734722664984e-06, + "loss": 1.2193, + "step": 5333 + }, + { + "epoch": 0.7551497133149289, + "grad_norm": 12.356309127087288, + "learning_rate": 3.576217382785738e-06, + "loss": 1.2756, + "step": 5334 + }, + { + "epoch": 0.7552912861895661, + "grad_norm": 10.230640271567118, + "learning_rate": 3.5756999863305475e-06, + "loss": 1.3386, + "step": 5335 + }, + { + "epoch": 0.7554328590642033, + "grad_norm": 8.503283147101781, + "learning_rate": 3.57518253332661e-06, + "loss": 1.2507, + "step": 5336 + }, + { + "epoch": 0.7555744319388406, + "grad_norm": 8.088929850901941, + "learning_rate": 3.574665023801129e-06, + "loss": 1.2718, + "step": 5337 + }, + { + "epoch": 0.7557160048134778, + "grad_norm": 10.92199933986805, + "learning_rate": 3.5741474577813086e-06, + "loss": 1.4943, + "step": 5338 + }, + { + "epoch": 0.755857577688115, + "grad_norm": 8.557361703881325, + "learning_rate": 3.573629835294357e-06, + "loss": 1.1788, + "step": 5339 + }, + { + "epoch": 0.7559991505627521, + "grad_norm": 
9.624587409353866, + "learning_rate": 3.5731121563674863e-06, + "loss": 1.4047, + "step": 5340 + }, + { + "epoch": 0.7561407234373894, + "grad_norm": 9.233238498829625, + "learning_rate": 3.572594421027909e-06, + "loss": 1.2387, + "step": 5341 + }, + { + "epoch": 0.7562822963120266, + "grad_norm": 9.328867615959279, + "learning_rate": 3.572076629302843e-06, + "loss": 1.2711, + "step": 5342 + }, + { + "epoch": 0.7564238691866638, + "grad_norm": 13.469992171004186, + "learning_rate": 3.571558781219508e-06, + "loss": 1.1601, + "step": 5343 + }, + { + "epoch": 0.756565442061301, + "grad_norm": 10.2317122162172, + "learning_rate": 3.5710408768051262e-06, + "loss": 1.3433, + "step": 5344 + }, + { + "epoch": 0.7567070149359383, + "grad_norm": 10.51368656968832, + "learning_rate": 3.5705229160869247e-06, + "loss": 1.2007, + "step": 5345 + }, + { + "epoch": 0.7568485878105755, + "grad_norm": 10.6477020076838, + "learning_rate": 3.570004899092133e-06, + "loss": 1.2715, + "step": 5346 + }, + { + "epoch": 0.7569901606852127, + "grad_norm": 7.604044901948542, + "learning_rate": 3.5694868258479798e-06, + "loss": 1.2446, + "step": 5347 + }, + { + "epoch": 0.75713173355985, + "grad_norm": 10.246955605550797, + "learning_rate": 3.5689686963817023e-06, + "loss": 1.2238, + "step": 5348 + }, + { + "epoch": 0.7572733064344872, + "grad_norm": 11.310056981112679, + "learning_rate": 3.568450510720537e-06, + "loss": 1.2834, + "step": 5349 + }, + { + "epoch": 0.7574148793091243, + "grad_norm": 8.219199280733267, + "learning_rate": 3.567932268891725e-06, + "loss": 1.1608, + "step": 5350 + }, + { + "epoch": 0.7575564521837616, + "grad_norm": 7.860543924395659, + "learning_rate": 3.5674139709225104e-06, + "loss": 1.1325, + "step": 5351 + }, + { + "epoch": 0.7576980250583988, + "grad_norm": 8.352059901727884, + "learning_rate": 3.5668956168401392e-06, + "loss": 1.1746, + "step": 5352 + }, + { + "epoch": 0.757839597933036, + "grad_norm": 10.535924943568263, + "learning_rate": 3.5663772066718606e-06, + "loss": 1.1046, + "step": 5353 + }, + { + "epoch": 0.7579811708076732, + "grad_norm": 8.203800195426775, + "learning_rate": 3.565858740444927e-06, + "loss": 1.4213, + "step": 5354 + }, + { + "epoch": 0.7581227436823105, + "grad_norm": 10.349988824958112, + "learning_rate": 3.5653402181865954e-06, + "loss": 1.3312, + "step": 5355 + }, + { + "epoch": 0.7582643165569477, + "grad_norm": 8.670470747789953, + "learning_rate": 3.564821639924122e-06, + "loss": 1.1981, + "step": 5356 + }, + { + "epoch": 0.7584058894315849, + "grad_norm": 7.199453746418972, + "learning_rate": 3.5643030056847695e-06, + "loss": 1.3764, + "step": 5357 + }, + { + "epoch": 0.7585474623062222, + "grad_norm": 8.10137368316011, + "learning_rate": 3.5637843154958006e-06, + "loss": 1.1499, + "step": 5358 + }, + { + "epoch": 0.7586890351808594, + "grad_norm": 9.54389250223609, + "learning_rate": 3.563265569384484e-06, + "loss": 1.4019, + "step": 5359 + }, + { + "epoch": 0.7588306080554965, + "grad_norm": 9.521531850843177, + "learning_rate": 3.56274676737809e-06, + "loss": 1.3365, + "step": 5360 + }, + { + "epoch": 0.7589721809301337, + "grad_norm": 9.950297626280458, + "learning_rate": 3.5622279095038896e-06, + "loss": 1.3648, + "step": 5361 + }, + { + "epoch": 0.759113753804771, + "grad_norm": 9.067591848105435, + "learning_rate": 3.5617089957891614e-06, + "loss": 1.2903, + "step": 5362 + }, + { + "epoch": 0.7592553266794082, + "grad_norm": 9.284668983060193, + "learning_rate": 3.561190026261182e-06, + "loss": 1.2457, + "step": 5363 + }, + { + "epoch": 
0.7593968995540454, + "grad_norm": 7.642858136002562, + "learning_rate": 3.5606710009472335e-06, + "loss": 1.3543, + "step": 5364 + }, + { + "epoch": 0.7595384724286827, + "grad_norm": 9.692873934739175, + "learning_rate": 3.560151919874602e-06, + "loss": 1.2858, + "step": 5365 + }, + { + "epoch": 0.7596800453033199, + "grad_norm": 9.272634495561725, + "learning_rate": 3.5596327830705746e-06, + "loss": 1.3427, + "step": 5366 + }, + { + "epoch": 0.7598216181779571, + "grad_norm": 8.203252417165192, + "learning_rate": 3.559113590562443e-06, + "loss": 1.215, + "step": 5367 + }, + { + "epoch": 0.7599631910525944, + "grad_norm": 9.338349992295925, + "learning_rate": 3.558594342377498e-06, + "loss": 1.3898, + "step": 5368 + }, + { + "epoch": 0.7601047639272316, + "grad_norm": 10.978270830140868, + "learning_rate": 3.5580750385430385e-06, + "loss": 1.3058, + "step": 5369 + }, + { + "epoch": 0.7602463368018688, + "grad_norm": 9.009701056903365, + "learning_rate": 3.557555679086363e-06, + "loss": 1.2145, + "step": 5370 + }, + { + "epoch": 0.7603879096765059, + "grad_norm": 9.228056164157632, + "learning_rate": 3.5570362640347743e-06, + "loss": 1.2172, + "step": 5371 + }, + { + "epoch": 0.7605294825511432, + "grad_norm": 9.74740370392471, + "learning_rate": 3.556516793415577e-06, + "loss": 1.2049, + "step": 5372 + }, + { + "epoch": 0.7606710554257804, + "grad_norm": 8.704978430478599, + "learning_rate": 3.5559972672560795e-06, + "loss": 1.2718, + "step": 5373 + }, + { + "epoch": 0.7608126283004176, + "grad_norm": 15.921987126997234, + "learning_rate": 3.5554776855835934e-06, + "loss": 1.1393, + "step": 5374 + }, + { + "epoch": 0.7609542011750549, + "grad_norm": 8.619904270569892, + "learning_rate": 3.554958048425432e-06, + "loss": 1.2907, + "step": 5375 + }, + { + "epoch": 0.7610957740496921, + "grad_norm": 8.557927825245395, + "learning_rate": 3.5544383558089128e-06, + "loss": 1.3369, + "step": 5376 + }, + { + "epoch": 0.7612373469243293, + "grad_norm": 9.796514817815751, + "learning_rate": 3.5539186077613562e-06, + "loss": 1.281, + "step": 5377 + }, + { + "epoch": 0.7613789197989665, + "grad_norm": 10.80376138531395, + "learning_rate": 3.553398804310083e-06, + "loss": 1.2066, + "step": 5378 + }, + { + "epoch": 0.7615204926736038, + "grad_norm": 10.597387718907134, + "learning_rate": 3.5528789454824205e-06, + "loss": 1.1269, + "step": 5379 + }, + { + "epoch": 0.761662065548241, + "grad_norm": 10.899414368742155, + "learning_rate": 3.5523590313056965e-06, + "loss": 1.4492, + "step": 5380 + }, + { + "epoch": 0.7618036384228781, + "grad_norm": 8.384127980281914, + "learning_rate": 3.551839061807244e-06, + "loss": 1.2166, + "step": 5381 + }, + { + "epoch": 0.7619452112975154, + "grad_norm": 9.63551608197587, + "learning_rate": 3.551319037014396e-06, + "loss": 1.0935, + "step": 5382 + }, + { + "epoch": 0.7620867841721526, + "grad_norm": 8.622255192635718, + "learning_rate": 3.5507989569544896e-06, + "loss": 1.1544, + "step": 5383 + }, + { + "epoch": 0.7622283570467898, + "grad_norm": 9.178292364062873, + "learning_rate": 3.550278821654866e-06, + "loss": 1.3381, + "step": 5384 + }, + { + "epoch": 0.762369929921427, + "grad_norm": 8.601797989368405, + "learning_rate": 3.5497586311428676e-06, + "loss": 1.1961, + "step": 5385 + }, + { + "epoch": 0.7625115027960643, + "grad_norm": 12.600705178524484, + "learning_rate": 3.5492383854458405e-06, + "loss": 1.2077, + "step": 5386 + }, + { + "epoch": 0.7626530756707015, + "grad_norm": 10.239226098618627, + "learning_rate": 3.548718084591134e-06, + "loss": 1.3073, 
+ "step": 5387 + }, + { + "epoch": 0.7627946485453387, + "grad_norm": 8.755206057979205, + "learning_rate": 3.5481977286061e-06, + "loss": 1.219, + "step": 5388 + }, + { + "epoch": 0.762936221419976, + "grad_norm": 11.836689719289856, + "learning_rate": 3.547677317518093e-06, + "loss": 1.2465, + "step": 5389 + }, + { + "epoch": 0.7630777942946132, + "grad_norm": 8.15844182639479, + "learning_rate": 3.54715685135447e-06, + "loss": 1.2314, + "step": 5390 + }, + { + "epoch": 0.7632193671692503, + "grad_norm": 6.587824034276553, + "learning_rate": 3.546636330142593e-06, + "loss": 1.1834, + "step": 5391 + }, + { + "epoch": 0.7633609400438875, + "grad_norm": 9.534615744743064, + "learning_rate": 3.5461157539098236e-06, + "loss": 1.305, + "step": 5392 + }, + { + "epoch": 0.7635025129185248, + "grad_norm": 9.207615428804877, + "learning_rate": 3.5455951226835296e-06, + "loss": 1.1615, + "step": 5393 + }, + { + "epoch": 0.763644085793162, + "grad_norm": 9.551452238699834, + "learning_rate": 3.5450744364910794e-06, + "loss": 1.3527, + "step": 5394 + }, + { + "epoch": 0.7637856586677992, + "grad_norm": 9.11376512324515, + "learning_rate": 3.544553695359845e-06, + "loss": 1.2108, + "step": 5395 + }, + { + "epoch": 0.7639272315424365, + "grad_norm": 8.49560523020809, + "learning_rate": 3.5440328993172023e-06, + "loss": 1.3475, + "step": 5396 + }, + { + "epoch": 0.7640688044170737, + "grad_norm": 11.042948368945643, + "learning_rate": 3.5435120483905285e-06, + "loss": 1.3518, + "step": 5397 + }, + { + "epoch": 0.7642103772917109, + "grad_norm": 8.200392029858047, + "learning_rate": 3.542991142607204e-06, + "loss": 1.3248, + "step": 5398 + }, + { + "epoch": 0.7643519501663482, + "grad_norm": 9.924163506375212, + "learning_rate": 3.5424701819946137e-06, + "loss": 1.2304, + "step": 5399 + }, + { + "epoch": 0.7644935230409854, + "grad_norm": 9.450466116253374, + "learning_rate": 3.5419491665801424e-06, + "loss": 1.3625, + "step": 5400 + }, + { + "epoch": 0.7646350959156226, + "grad_norm": 8.495996766687846, + "learning_rate": 3.54142809639118e-06, + "loss": 1.3641, + "step": 5401 + }, + { + "epoch": 0.7647766687902597, + "grad_norm": 8.419900807863344, + "learning_rate": 3.54090697145512e-06, + "loss": 1.3652, + "step": 5402 + }, + { + "epoch": 0.764918241664897, + "grad_norm": 8.328496012197839, + "learning_rate": 3.5403857917993554e-06, + "loss": 1.3622, + "step": 5403 + }, + { + "epoch": 0.7650598145395342, + "grad_norm": 9.873553206292073, + "learning_rate": 3.5398645574512876e-06, + "loss": 1.3174, + "step": 5404 + }, + { + "epoch": 0.7652013874141714, + "grad_norm": 8.460556965703965, + "learning_rate": 3.5393432684383137e-06, + "loss": 1.3148, + "step": 5405 + }, + { + "epoch": 0.7653429602888087, + "grad_norm": 9.382275415909536, + "learning_rate": 3.5388219247878395e-06, + "loss": 1.1494, + "step": 5406 + }, + { + "epoch": 0.7654845331634459, + "grad_norm": 7.977982980231364, + "learning_rate": 3.5383005265272713e-06, + "loss": 1.3072, + "step": 5407 + }, + { + "epoch": 0.7656261060380831, + "grad_norm": 8.750325660094877, + "learning_rate": 3.537779073684019e-06, + "loss": 1.2862, + "step": 5408 + }, + { + "epoch": 0.7657676789127204, + "grad_norm": 7.7065483982202645, + "learning_rate": 3.5372575662854937e-06, + "loss": 1.1662, + "step": 5409 + }, + { + "epoch": 0.7659092517873576, + "grad_norm": 10.094998970352883, + "learning_rate": 3.536736004359112e-06, + "loss": 1.4555, + "step": 5410 + }, + { + "epoch": 0.7660508246619948, + "grad_norm": 9.310068926858252, + "learning_rate": 
3.536214387932292e-06, + "loss": 1.398, + "step": 5411 + }, + { + "epoch": 0.7661923975366319, + "grad_norm": 7.72973074144275, + "learning_rate": 3.535692717032454e-06, + "loss": 1.2172, + "step": 5412 + }, + { + "epoch": 0.7663339704112692, + "grad_norm": 8.507916522550483, + "learning_rate": 3.535170991687022e-06, + "loss": 1.2608, + "step": 5413 + }, + { + "epoch": 0.7664755432859064, + "grad_norm": 9.001397766078558, + "learning_rate": 3.5346492119234225e-06, + "loss": 1.3417, + "step": 5414 + }, + { + "epoch": 0.7666171161605436, + "grad_norm": 11.749411710256364, + "learning_rate": 3.5341273777690867e-06, + "loss": 1.2301, + "step": 5415 + }, + { + "epoch": 0.7667586890351809, + "grad_norm": 8.631702417315703, + "learning_rate": 3.5336054892514437e-06, + "loss": 1.2455, + "step": 5416 + }, + { + "epoch": 0.7669002619098181, + "grad_norm": 8.752309875776856, + "learning_rate": 3.5330835463979318e-06, + "loss": 1.1372, + "step": 5417 + }, + { + "epoch": 0.7670418347844553, + "grad_norm": 10.427093617640304, + "learning_rate": 3.532561549235988e-06, + "loss": 1.2277, + "step": 5418 + }, + { + "epoch": 0.7671834076590925, + "grad_norm": 10.4905654664633, + "learning_rate": 3.532039497793054e-06, + "loss": 1.1747, + "step": 5419 + }, + { + "epoch": 0.7673249805337298, + "grad_norm": 8.752088896863066, + "learning_rate": 3.5315173920965736e-06, + "loss": 1.2775, + "step": 5420 + }, + { + "epoch": 0.767466553408367, + "grad_norm": 8.510440921320878, + "learning_rate": 3.5309952321739922e-06, + "loss": 1.1961, + "step": 5421 + }, + { + "epoch": 0.7676081262830041, + "grad_norm": 10.48789934709729, + "learning_rate": 3.53047301805276e-06, + "loss": 1.376, + "step": 5422 + }, + { + "epoch": 0.7677496991576414, + "grad_norm": 8.165620646017544, + "learning_rate": 3.5299507497603303e-06, + "loss": 1.2991, + "step": 5423 + }, + { + "epoch": 0.7678912720322786, + "grad_norm": 6.60089447433665, + "learning_rate": 3.5294284273241565e-06, + "loss": 1.186, + "step": 5424 + }, + { + "epoch": 0.7680328449069158, + "grad_norm": 8.916626784571172, + "learning_rate": 3.5289060507716986e-06, + "loss": 1.3774, + "step": 5425 + }, + { + "epoch": 0.768174417781553, + "grad_norm": 13.241632086434814, + "learning_rate": 3.528383620130417e-06, + "loss": 1.1918, + "step": 5426 + }, + { + "epoch": 0.7683159906561903, + "grad_norm": 13.688613637273384, + "learning_rate": 3.527861135427775e-06, + "loss": 1.3085, + "step": 5427 + }, + { + "epoch": 0.7684575635308275, + "grad_norm": 8.99781136921836, + "learning_rate": 3.5273385966912398e-06, + "loss": 1.1974, + "step": 5428 + }, + { + "epoch": 0.7685991364054647, + "grad_norm": 9.285371937571643, + "learning_rate": 3.52681600394828e-06, + "loss": 1.1918, + "step": 5429 + }, + { + "epoch": 0.768740709280102, + "grad_norm": 11.113968083455099, + "learning_rate": 3.526293357226369e-06, + "loss": 1.2307, + "step": 5430 + }, + { + "epoch": 0.7688822821547392, + "grad_norm": 9.08327444448251, + "learning_rate": 3.5257706565529813e-06, + "loss": 1.2019, + "step": 5431 + }, + { + "epoch": 0.7690238550293764, + "grad_norm": 9.248765192866204, + "learning_rate": 3.525247901955595e-06, + "loss": 1.3567, + "step": 5432 + }, + { + "epoch": 0.7691654279040135, + "grad_norm": 9.429215793141697, + "learning_rate": 3.5247250934616907e-06, + "loss": 1.32, + "step": 5433 + }, + { + "epoch": 0.7693070007786508, + "grad_norm": 9.146949601463799, + "learning_rate": 3.524202231098753e-06, + "loss": 1.3089, + "step": 5434 + }, + { + "epoch": 0.769448573653288, + "grad_norm": 
11.329950688011552, + "learning_rate": 3.5236793148942673e-06, + "loss": 1.3104, + "step": 5435 + }, + { + "epoch": 0.7695901465279252, + "grad_norm": 10.168649282114918, + "learning_rate": 3.5231563448757233e-06, + "loss": 1.2718, + "step": 5436 + }, + { + "epoch": 0.7697317194025625, + "grad_norm": 9.745619743608495, + "learning_rate": 3.5226333210706133e-06, + "loss": 1.3217, + "step": 5437 + }, + { + "epoch": 0.7698732922771997, + "grad_norm": 10.09635546807822, + "learning_rate": 3.5221102435064314e-06, + "loss": 1.1219, + "step": 5438 + }, + { + "epoch": 0.7700148651518369, + "grad_norm": 8.353417214685116, + "learning_rate": 3.5215871122106767e-06, + "loss": 1.2251, + "step": 5439 + }, + { + "epoch": 0.7701564380264742, + "grad_norm": 8.419198539809734, + "learning_rate": 3.5210639272108487e-06, + "loss": 1.2997, + "step": 5440 + }, + { + "epoch": 0.7702980109011114, + "grad_norm": 9.619544526735945, + "learning_rate": 3.520540688534453e-06, + "loss": 1.2629, + "step": 5441 + }, + { + "epoch": 0.7704395837757486, + "grad_norm": 8.994618396308697, + "learning_rate": 3.520017396208993e-06, + "loss": 1.211, + "step": 5442 + }, + { + "epoch": 0.7705811566503857, + "grad_norm": 9.242419874376912, + "learning_rate": 3.519494050261979e-06, + "loss": 1.2418, + "step": 5443 + }, + { + "epoch": 0.770722729525023, + "grad_norm": 8.18689801281692, + "learning_rate": 3.518970650720923e-06, + "loss": 1.221, + "step": 5444 + }, + { + "epoch": 0.7708643023996602, + "grad_norm": 7.619345147136426, + "learning_rate": 3.5184471976133396e-06, + "loss": 1.4087, + "step": 5445 + }, + { + "epoch": 0.7710058752742974, + "grad_norm": 9.541033816832709, + "learning_rate": 3.517923690966747e-06, + "loss": 1.4306, + "step": 5446 + }, + { + "epoch": 0.7711474481489347, + "grad_norm": 11.760383725296167, + "learning_rate": 3.5174001308086643e-06, + "loss": 1.1966, + "step": 5447 + }, + { + "epoch": 0.7712890210235719, + "grad_norm": 8.014610300662433, + "learning_rate": 3.516876517166615e-06, + "loss": 1.1912, + "step": 5448 + }, + { + "epoch": 0.7714305938982091, + "grad_norm": 8.30662891450797, + "learning_rate": 3.5163528500681266e-06, + "loss": 1.1034, + "step": 5449 + }, + { + "epoch": 0.7715721667728463, + "grad_norm": 8.91239594001532, + "learning_rate": 3.515829129540726e-06, + "loss": 1.3547, + "step": 5450 + }, + { + "epoch": 0.7717137396474836, + "grad_norm": 9.30009299611182, + "learning_rate": 3.5153053556119454e-06, + "loss": 1.3894, + "step": 5451 + }, + { + "epoch": 0.7718553125221208, + "grad_norm": 8.937526889573842, + "learning_rate": 3.51478152830932e-06, + "loss": 1.3018, + "step": 5452 + }, + { + "epoch": 0.7719968853967579, + "grad_norm": 8.883434746922614, + "learning_rate": 3.514257647660385e-06, + "loss": 1.3539, + "step": 5453 + }, + { + "epoch": 0.7721384582713952, + "grad_norm": 8.192155519052347, + "learning_rate": 3.5137337136926825e-06, + "loss": 1.115, + "step": 5454 + }, + { + "epoch": 0.7722800311460324, + "grad_norm": 7.968102522332182, + "learning_rate": 3.5132097264337546e-06, + "loss": 1.2732, + "step": 5455 + }, + { + "epoch": 0.7724216040206696, + "grad_norm": 7.907011575922194, + "learning_rate": 3.512685685911147e-06, + "loss": 1.1516, + "step": 5456 + }, + { + "epoch": 0.7725631768953068, + "grad_norm": 8.322253936494374, + "learning_rate": 3.5121615921524084e-06, + "loss": 1.2185, + "step": 5457 + }, + { + "epoch": 0.7727047497699441, + "grad_norm": 7.840670889351063, + "learning_rate": 3.5116374451850887e-06, + "loss": 1.138, + "step": 5458 + }, + { + "epoch": 
0.7728463226445813, + "grad_norm": 8.161330938815745, + "learning_rate": 3.511113245036743e-06, + "loss": 1.2449, + "step": 5459 + }, + { + "epoch": 0.7729878955192185, + "grad_norm": 9.14046223813423, + "learning_rate": 3.510588991734928e-06, + "loss": 1.2158, + "step": 5460 + }, + { + "epoch": 0.7731294683938558, + "grad_norm": 7.26171638607748, + "learning_rate": 3.510064685307203e-06, + "loss": 1.1781, + "step": 5461 + }, + { + "epoch": 0.773271041268493, + "grad_norm": 8.644723647881532, + "learning_rate": 3.5095403257811313e-06, + "loss": 1.2568, + "step": 5462 + }, + { + "epoch": 0.7734126141431302, + "grad_norm": 10.316894704571471, + "learning_rate": 3.5090159131842773e-06, + "loss": 1.4347, + "step": 5463 + }, + { + "epoch": 0.7735541870177673, + "grad_norm": 9.650224009073208, + "learning_rate": 3.5084914475442085e-06, + "loss": 1.3756, + "step": 5464 + }, + { + "epoch": 0.7736957598924046, + "grad_norm": 7.680512268545334, + "learning_rate": 3.5079669288884965e-06, + "loss": 1.3281, + "step": 5465 + }, + { + "epoch": 0.7738373327670418, + "grad_norm": 8.421780785651126, + "learning_rate": 3.507442357244715e-06, + "loss": 1.1736, + "step": 5466 + }, + { + "epoch": 0.773978905641679, + "grad_norm": 7.567845244633938, + "learning_rate": 3.5069177326404393e-06, + "loss": 1.2378, + "step": 5467 + }, + { + "epoch": 0.7741204785163163, + "grad_norm": 9.771867926399542, + "learning_rate": 3.5063930551032494e-06, + "loss": 1.1704, + "step": 5468 + }, + { + "epoch": 0.7742620513909535, + "grad_norm": 9.400107168033129, + "learning_rate": 3.5058683246607273e-06, + "loss": 1.1966, + "step": 5469 + }, + { + "epoch": 0.7744036242655907, + "grad_norm": 9.412853267773007, + "learning_rate": 3.505343541340457e-06, + "loss": 1.3289, + "step": 5470 + }, + { + "epoch": 0.774545197140228, + "grad_norm": 7.500725774616487, + "learning_rate": 3.5048187051700265e-06, + "loss": 1.2478, + "step": 5471 + }, + { + "epoch": 0.7746867700148652, + "grad_norm": 9.864557853647026, + "learning_rate": 3.5042938161770257e-06, + "loss": 1.2422, + "step": 5472 + }, + { + "epoch": 0.7748283428895024, + "grad_norm": 10.158962227993797, + "learning_rate": 3.5037688743890484e-06, + "loss": 1.173, + "step": 5473 + }, + { + "epoch": 0.7749699157641395, + "grad_norm": 7.924725442937898, + "learning_rate": 3.50324387983369e-06, + "loss": 1.2199, + "step": 5474 + }, + { + "epoch": 0.7751114886387768, + "grad_norm": 10.143574572237297, + "learning_rate": 3.502718832538548e-06, + "loss": 1.2245, + "step": 5475 + }, + { + "epoch": 0.775253061513414, + "grad_norm": 7.611037932304793, + "learning_rate": 3.502193732531225e-06, + "loss": 1.2681, + "step": 5476 + }, + { + "epoch": 0.7753946343880512, + "grad_norm": 9.975076133035943, + "learning_rate": 3.5016685798393244e-06, + "loss": 1.4298, + "step": 5477 + }, + { + "epoch": 0.7755362072626885, + "grad_norm": 9.45192439754437, + "learning_rate": 3.5011433744904543e-06, + "loss": 1.2472, + "step": 5478 + }, + { + "epoch": 0.7756777801373257, + "grad_norm": 8.689539827688932, + "learning_rate": 3.5006181165122233e-06, + "loss": 1.263, + "step": 5479 + }, + { + "epoch": 0.7758193530119629, + "grad_norm": 9.533825935850604, + "learning_rate": 3.500092805932244e-06, + "loss": 1.2074, + "step": 5480 + }, + { + "epoch": 0.7759609258866002, + "grad_norm": 9.124429763325452, + "learning_rate": 3.499567442778131e-06, + "loss": 1.1691, + "step": 5481 + }, + { + "epoch": 0.7761024987612374, + "grad_norm": 10.084326534905426, + "learning_rate": 3.4990420270775026e-06, + "loss": 1.303, + 
"step": 5482 + }, + { + "epoch": 0.7762440716358746, + "grad_norm": 7.407887466910665, + "learning_rate": 3.4985165588579806e-06, + "loss": 1.1125, + "step": 5483 + }, + { + "epoch": 0.7763856445105118, + "grad_norm": 9.494042787438186, + "learning_rate": 3.497991038147187e-06, + "loss": 1.2572, + "step": 5484 + }, + { + "epoch": 0.776527217385149, + "grad_norm": 8.71984112656079, + "learning_rate": 3.497465464972749e-06, + "loss": 1.2741, + "step": 5485 + }, + { + "epoch": 0.7766687902597862, + "grad_norm": 7.4607521553385405, + "learning_rate": 3.496939839362295e-06, + "loss": 1.2329, + "step": 5486 + }, + { + "epoch": 0.7768103631344234, + "grad_norm": 7.2149621843284155, + "learning_rate": 3.496414161343457e-06, + "loss": 1.2614, + "step": 5487 + }, + { + "epoch": 0.7769519360090607, + "grad_norm": 8.936484112390936, + "learning_rate": 3.49588843094387e-06, + "loss": 1.39, + "step": 5488 + }, + { + "epoch": 0.7770935088836979, + "grad_norm": 8.528876229394358, + "learning_rate": 3.4953626481911707e-06, + "loss": 1.4697, + "step": 5489 + }, + { + "epoch": 0.7772350817583351, + "grad_norm": 9.885489771378024, + "learning_rate": 3.4948368131129984e-06, + "loss": 1.1757, + "step": 5490 + }, + { + "epoch": 0.7773766546329723, + "grad_norm": 8.873166606816543, + "learning_rate": 3.4943109257369973e-06, + "loss": 1.3923, + "step": 5491 + }, + { + "epoch": 0.7775182275076096, + "grad_norm": 9.690695352554203, + "learning_rate": 3.493784986090812e-06, + "loss": 1.2505, + "step": 5492 + }, + { + "epoch": 0.7776598003822468, + "grad_norm": 8.375543377457715, + "learning_rate": 3.4932589942020912e-06, + "loss": 1.151, + "step": 5493 + }, + { + "epoch": 0.777801373256884, + "grad_norm": 8.710869184149155, + "learning_rate": 3.4927329500984857e-06, + "loss": 1.4355, + "step": 5494 + }, + { + "epoch": 0.7779429461315212, + "grad_norm": 8.375614428400024, + "learning_rate": 3.4922068538076493e-06, + "loss": 1.2463, + "step": 5495 + }, + { + "epoch": 0.7780845190061584, + "grad_norm": 8.486554900235255, + "learning_rate": 3.4916807053572376e-06, + "loss": 1.3791, + "step": 5496 + }, + { + "epoch": 0.7782260918807956, + "grad_norm": 7.281954358786332, + "learning_rate": 3.4911545047749113e-06, + "loss": 1.248, + "step": 5497 + }, + { + "epoch": 0.7783676647554328, + "grad_norm": 10.757751154395715, + "learning_rate": 3.4906282520883312e-06, + "loss": 1.2237, + "step": 5498 + }, + { + "epoch": 0.7785092376300701, + "grad_norm": 9.567778682432806, + "learning_rate": 3.4901019473251635e-06, + "loss": 1.3286, + "step": 5499 + }, + { + "epoch": 0.7786508105047073, + "grad_norm": 7.4761318612917655, + "learning_rate": 3.489575590513074e-06, + "loss": 1.2674, + "step": 5500 + }, + { + "epoch": 0.7787923833793445, + "grad_norm": 7.788549469454166, + "learning_rate": 3.4890491816797333e-06, + "loss": 1.2032, + "step": 5501 + }, + { + "epoch": 0.7789339562539818, + "grad_norm": 8.113897161564072, + "learning_rate": 3.4885227208528148e-06, + "loss": 1.2634, + "step": 5502 + }, + { + "epoch": 0.779075529128619, + "grad_norm": 7.740300908116774, + "learning_rate": 3.487996208059994e-06, + "loss": 1.253, + "step": 5503 + }, + { + "epoch": 0.7792171020032562, + "grad_norm": 8.218236174676042, + "learning_rate": 3.48746964332895e-06, + "loss": 1.1058, + "step": 5504 + }, + { + "epoch": 0.7793586748778933, + "grad_norm": 8.751082435140301, + "learning_rate": 3.486943026687362e-06, + "loss": 1.4038, + "step": 5505 + }, + { + "epoch": 0.7795002477525306, + "grad_norm": 10.953999483078908, + "learning_rate": 
3.486416358162916e-06, + "loss": 1.2419, + "step": 5506 + }, + { + "epoch": 0.7796418206271678, + "grad_norm": 16.530116484059103, + "learning_rate": 3.4858896377832966e-06, + "loss": 1.2148, + "step": 5507 + }, + { + "epoch": 0.779783393501805, + "grad_norm": 10.025584489711887, + "learning_rate": 3.4853628655761946e-06, + "loss": 1.1716, + "step": 5508 + }, + { + "epoch": 0.7799249663764423, + "grad_norm": 9.203683729325618, + "learning_rate": 3.4848360415693013e-06, + "loss": 1.2937, + "step": 5509 + }, + { + "epoch": 0.7800665392510795, + "grad_norm": 10.032283171214885, + "learning_rate": 3.484309165790312e-06, + "loss": 1.3109, + "step": 5510 + }, + { + "epoch": 0.7802081121257167, + "grad_norm": 9.29328646603793, + "learning_rate": 3.4837822382669235e-06, + "loss": 1.2025, + "step": 5511 + }, + { + "epoch": 0.780349685000354, + "grad_norm": 14.092542778465873, + "learning_rate": 3.4832552590268363e-06, + "loss": 1.2795, + "step": 5512 + }, + { + "epoch": 0.7804912578749912, + "grad_norm": 8.985934817042288, + "learning_rate": 3.4827282280977527e-06, + "loss": 1.4931, + "step": 5513 + }, + { + "epoch": 0.7806328307496284, + "grad_norm": 8.988789676243961, + "learning_rate": 3.4822011455073788e-06, + "loss": 1.3214, + "step": 5514 + }, + { + "epoch": 0.7807744036242656, + "grad_norm": 12.091474925299256, + "learning_rate": 3.4816740112834248e-06, + "loss": 1.1619, + "step": 5515 + }, + { + "epoch": 0.7809159764989028, + "grad_norm": 10.994343083330826, + "learning_rate": 3.4811468254535984e-06, + "loss": 1.0956, + "step": 5516 + }, + { + "epoch": 0.78105754937354, + "grad_norm": 22.159586452109515, + "learning_rate": 3.4806195880456158e-06, + "loss": 1.0633, + "step": 5517 + }, + { + "epoch": 0.7811991222481772, + "grad_norm": 8.229526431432403, + "learning_rate": 3.4800922990871924e-06, + "loss": 1.1971, + "step": 5518 + }, + { + "epoch": 0.7813406951228145, + "grad_norm": 7.868738985546302, + "learning_rate": 3.479564958606047e-06, + "loss": 1.2206, + "step": 5519 + }, + { + "epoch": 0.7814822679974517, + "grad_norm": 10.99448603366848, + "learning_rate": 3.4790375666299026e-06, + "loss": 1.2235, + "step": 5520 + }, + { + "epoch": 0.7816238408720889, + "grad_norm": 10.759739921291226, + "learning_rate": 3.478510123186483e-06, + "loss": 1.3007, + "step": 5521 + }, + { + "epoch": 0.7817654137467261, + "grad_norm": 8.434034299806498, + "learning_rate": 3.477982628303516e-06, + "loss": 1.2613, + "step": 5522 + }, + { + "epoch": 0.7819069866213634, + "grad_norm": 9.521582731863782, + "learning_rate": 3.4774550820087317e-06, + "loss": 1.1967, + "step": 5523 + }, + { + "epoch": 0.7820485594960006, + "grad_norm": 15.624236797766965, + "learning_rate": 3.476927484329863e-06, + "loss": 1.3067, + "step": 5524 + }, + { + "epoch": 0.7821901323706378, + "grad_norm": 8.909475926871563, + "learning_rate": 3.4763998352946436e-06, + "loss": 1.3097, + "step": 5525 + }, + { + "epoch": 0.782331705245275, + "grad_norm": 9.015574225376954, + "learning_rate": 3.4758721349308146e-06, + "loss": 1.1777, + "step": 5526 + }, + { + "epoch": 0.7824732781199122, + "grad_norm": 9.180314556375718, + "learning_rate": 3.4753443832661134e-06, + "loss": 1.3338, + "step": 5527 + }, + { + "epoch": 0.7826148509945494, + "grad_norm": 8.44331080651946, + "learning_rate": 3.4748165803282856e-06, + "loss": 1.0747, + "step": 5528 + }, + { + "epoch": 0.7827564238691866, + "grad_norm": 8.588667708829776, + "learning_rate": 3.4742887261450776e-06, + "loss": 1.1963, + "step": 5529 + }, + { + "epoch": 0.7828979967438239, + 
"grad_norm": 8.882259355028989, + "learning_rate": 3.4737608207442373e-06, + "loss": 1.1885, + "step": 5530 + }, + { + "epoch": 0.7830395696184611, + "grad_norm": 10.73378542151359, + "learning_rate": 3.4732328641535174e-06, + "loss": 1.3579, + "step": 5531 + }, + { + "epoch": 0.7831811424930983, + "grad_norm": 9.662255256188494, + "learning_rate": 3.472704856400671e-06, + "loss": 1.225, + "step": 5532 + }, + { + "epoch": 0.7833227153677356, + "grad_norm": 10.6484375, + "learning_rate": 3.4721767975134557e-06, + "loss": 1.3898, + "step": 5533 + }, + { + "epoch": 0.7834642882423728, + "grad_norm": 7.664505474265059, + "learning_rate": 3.471648687519631e-06, + "loss": 1.0599, + "step": 5534 + }, + { + "epoch": 0.78360586111701, + "grad_norm": 9.354009297044728, + "learning_rate": 3.4711205264469583e-06, + "loss": 1.2397, + "step": 5535 + }, + { + "epoch": 0.7837474339916471, + "grad_norm": 7.536578181187557, + "learning_rate": 3.470592314323205e-06, + "loss": 1.2021, + "step": 5536 + }, + { + "epoch": 0.7838890068662844, + "grad_norm": 9.640071018321455, + "learning_rate": 3.4700640511761373e-06, + "loss": 1.3154, + "step": 5537 + }, + { + "epoch": 0.7840305797409216, + "grad_norm": 8.46283766102455, + "learning_rate": 3.4695357370335255e-06, + "loss": 1.2629, + "step": 5538 + }, + { + "epoch": 0.7841721526155588, + "grad_norm": 8.580180159947018, + "learning_rate": 3.4690073719231426e-06, + "loss": 1.2506, + "step": 5539 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 8.282409356493746, + "learning_rate": 3.468478955872765e-06, + "loss": 1.3113, + "step": 5540 + }, + { + "epoch": 0.7844552983648333, + "grad_norm": 9.065026924105942, + "learning_rate": 3.4679504889101704e-06, + "loss": 1.2778, + "step": 5541 + }, + { + "epoch": 0.7845968712394705, + "grad_norm": 9.151498857315627, + "learning_rate": 3.4674219710631406e-06, + "loss": 1.1897, + "step": 5542 + }, + { + "epoch": 0.7847384441141078, + "grad_norm": 9.849976716764399, + "learning_rate": 3.466893402359459e-06, + "loss": 1.1618, + "step": 5543 + }, + { + "epoch": 0.784880016988745, + "grad_norm": 8.524146843335014, + "learning_rate": 3.4663647828269124e-06, + "loss": 1.2727, + "step": 5544 + }, + { + "epoch": 0.7850215898633822, + "grad_norm": 8.557075508593202, + "learning_rate": 3.46583611249329e-06, + "loss": 1.2128, + "step": 5545 + }, + { + "epoch": 0.7851631627380194, + "grad_norm": 9.751943150092535, + "learning_rate": 3.465307391386383e-06, + "loss": 1.3209, + "step": 5546 + }, + { + "epoch": 0.7853047356126566, + "grad_norm": 9.700664017749192, + "learning_rate": 3.464778619533987e-06, + "loss": 1.3515, + "step": 5547 + }, + { + "epoch": 0.7854463084872938, + "grad_norm": 8.403841829491865, + "learning_rate": 3.4642497969638973e-06, + "loss": 1.2963, + "step": 5548 + }, + { + "epoch": 0.785587881361931, + "grad_norm": 9.735347595122198, + "learning_rate": 3.463720923703915e-06, + "loss": 1.2579, + "step": 5549 + }, + { + "epoch": 0.7857294542365683, + "grad_norm": 10.904572016220243, + "learning_rate": 3.4631919997818415e-06, + "loss": 1.4626, + "step": 5550 + }, + { + "epoch": 0.7858710271112055, + "grad_norm": 9.190132853779492, + "learning_rate": 3.4626630252254835e-06, + "loss": 1.3259, + "step": 5551 + }, + { + "epoch": 0.7860125999858427, + "grad_norm": 7.690800400291191, + "learning_rate": 3.462134000062649e-06, + "loss": 1.2157, + "step": 5552 + }, + { + "epoch": 0.78615417286048, + "grad_norm": 7.796214960633291, + "learning_rate": 3.4616049243211463e-06, + "loss": 1.3386, + "step": 5553 + }, + { + "epoch": 
0.7862957457351172, + "grad_norm": 9.094391370880093, + "learning_rate": 3.46107579802879e-06, + "loss": 1.1755, + "step": 5554 + }, + { + "epoch": 0.7864373186097544, + "grad_norm": 9.618787866310903, + "learning_rate": 3.4605466212133957e-06, + "loss": 1.3907, + "step": 5555 + }, + { + "epoch": 0.7865788914843916, + "grad_norm": 9.569144141054892, + "learning_rate": 3.460017393902782e-06, + "loss": 1.2509, + "step": 5556 + }, + { + "epoch": 0.7867204643590288, + "grad_norm": 8.434056914678669, + "learning_rate": 3.4594881161247694e-06, + "loss": 1.2551, + "step": 5557 + }, + { + "epoch": 0.786862037233666, + "grad_norm": 10.435352218671603, + "learning_rate": 3.458958787907182e-06, + "loss": 1.2707, + "step": 5558 + }, + { + "epoch": 0.7870036101083032, + "grad_norm": 8.046871681583054, + "learning_rate": 3.458429409277846e-06, + "loss": 1.1766, + "step": 5559 + }, + { + "epoch": 0.7871451829829405, + "grad_norm": 10.451977950541863, + "learning_rate": 3.4578999802645905e-06, + "loss": 1.488, + "step": 5560 + }, + { + "epoch": 0.7872867558575777, + "grad_norm": 10.717291818976726, + "learning_rate": 3.457370500895247e-06, + "loss": 1.2668, + "step": 5561 + }, + { + "epoch": 0.7874283287322149, + "grad_norm": 8.462857494381838, + "learning_rate": 3.4568409711976515e-06, + "loss": 1.269, + "step": 5562 + }, + { + "epoch": 0.7875699016068521, + "grad_norm": 11.117160049157958, + "learning_rate": 3.4563113911996395e-06, + "loss": 1.4372, + "step": 5563 + }, + { + "epoch": 0.7877114744814894, + "grad_norm": 8.639611734920488, + "learning_rate": 3.455781760929049e-06, + "loss": 1.2509, + "step": 5564 + }, + { + "epoch": 0.7878530473561266, + "grad_norm": 7.696141046369617, + "learning_rate": 3.4552520804137248e-06, + "loss": 1.1885, + "step": 5565 + }, + { + "epoch": 0.7879946202307638, + "grad_norm": 10.143700178944195, + "learning_rate": 3.4547223496815115e-06, + "loss": 1.2476, + "step": 5566 + }, + { + "epoch": 0.788136193105401, + "grad_norm": 6.984152975969817, + "learning_rate": 3.4541925687602553e-06, + "loss": 1.237, + "step": 5567 + }, + { + "epoch": 0.7882777659800382, + "grad_norm": 7.960806683634468, + "learning_rate": 3.453662737677808e-06, + "loss": 1.2389, + "step": 5568 + }, + { + "epoch": 0.7884193388546754, + "grad_norm": 8.492173125562811, + "learning_rate": 3.4531328564620215e-06, + "loss": 1.3729, + "step": 5569 + }, + { + "epoch": 0.7885609117293126, + "grad_norm": 8.232115346828913, + "learning_rate": 3.452602925140751e-06, + "loss": 1.2559, + "step": 5570 + }, + { + "epoch": 0.7887024846039499, + "grad_norm": 8.852262356489724, + "learning_rate": 3.4520729437418553e-06, + "loss": 1.2993, + "step": 5571 + }, + { + "epoch": 0.7888440574785871, + "grad_norm": 11.383204592809435, + "learning_rate": 3.4515429122931955e-06, + "loss": 1.5127, + "step": 5572 + }, + { + "epoch": 0.7889856303532243, + "grad_norm": 9.526026761040029, + "learning_rate": 3.451012830822633e-06, + "loss": 1.2808, + "step": 5573 + }, + { + "epoch": 0.7891272032278616, + "grad_norm": 8.604970037041321, + "learning_rate": 3.4504826993580364e-06, + "loss": 1.2112, + "step": 5574 + }, + { + "epoch": 0.7892687761024988, + "grad_norm": 6.93880608438577, + "learning_rate": 3.449952517927272e-06, + "loss": 1.2835, + "step": 5575 + }, + { + "epoch": 0.789410348977136, + "grad_norm": 7.063499717510955, + "learning_rate": 3.4494222865582126e-06, + "loss": 1.1498, + "step": 5576 + }, + { + "epoch": 0.7895519218517733, + "grad_norm": 7.834703068660118, + "learning_rate": 3.4488920052787313e-06, + "loss": 1.321, + 
"step": 5577 + }, + { + "epoch": 0.7896934947264104, + "grad_norm": 8.524971129069286, + "learning_rate": 3.4483616741167046e-06, + "loss": 1.2464, + "step": 5578 + }, + { + "epoch": 0.7898350676010476, + "grad_norm": 8.158739199753223, + "learning_rate": 3.4478312931000123e-06, + "loss": 1.3348, + "step": 5579 + }, + { + "epoch": 0.7899766404756848, + "grad_norm": 7.9625032960893884, + "learning_rate": 3.4473008622565353e-06, + "loss": 1.274, + "step": 5580 + }, + { + "epoch": 0.7901182133503221, + "grad_norm": 8.497624682471015, + "learning_rate": 3.4467703816141584e-06, + "loss": 1.2214, + "step": 5581 + }, + { + "epoch": 0.7902597862249593, + "grad_norm": 10.317705909591357, + "learning_rate": 3.4462398512007684e-06, + "loss": 1.4899, + "step": 5582 + }, + { + "epoch": 0.7904013590995965, + "grad_norm": 8.237014723797609, + "learning_rate": 3.445709271044255e-06, + "loss": 1.2765, + "step": 5583 + }, + { + "epoch": 0.7905429319742338, + "grad_norm": 8.349577622381144, + "learning_rate": 3.445178641172511e-06, + "loss": 1.4235, + "step": 5584 + }, + { + "epoch": 0.790684504848871, + "grad_norm": 8.211595329262305, + "learning_rate": 3.44464796161343e-06, + "loss": 1.298, + "step": 5585 + }, + { + "epoch": 0.7908260777235082, + "grad_norm": 7.690084627675469, + "learning_rate": 3.44411723239491e-06, + "loss": 1.3429, + "step": 5586 + }, + { + "epoch": 0.7909676505981454, + "grad_norm": 6.456464665288666, + "learning_rate": 3.4435864535448504e-06, + "loss": 1.2353, + "step": 5587 + }, + { + "epoch": 0.7911092234727826, + "grad_norm": 7.188827524737252, + "learning_rate": 3.443055625091155e-06, + "loss": 1.2068, + "step": 5588 + }, + { + "epoch": 0.7912507963474198, + "grad_norm": 8.277983399069694, + "learning_rate": 3.4425247470617294e-06, + "loss": 1.239, + "step": 5589 + }, + { + "epoch": 0.791392369222057, + "grad_norm": 8.61755403515569, + "learning_rate": 3.44199381948448e-06, + "loss": 1.1848, + "step": 5590 + }, + { + "epoch": 0.7915339420966943, + "grad_norm": 8.636622904121047, + "learning_rate": 3.441462842387318e-06, + "loss": 1.244, + "step": 5591 + }, + { + "epoch": 0.7916755149713315, + "grad_norm": 10.733984083584279, + "learning_rate": 3.4409318157981565e-06, + "loss": 1.2015, + "step": 5592 + }, + { + "epoch": 0.7918170878459687, + "grad_norm": 8.407007346373183, + "learning_rate": 3.4404007397449104e-06, + "loss": 1.2199, + "step": 5593 + }, + { + "epoch": 0.791958660720606, + "grad_norm": 8.554856501612614, + "learning_rate": 3.439869614255499e-06, + "loss": 1.1478, + "step": 5594 + }, + { + "epoch": 0.7921002335952432, + "grad_norm": 8.861314564611071, + "learning_rate": 3.4393384393578427e-06, + "loss": 1.2609, + "step": 5595 + }, + { + "epoch": 0.7922418064698804, + "grad_norm": 8.379438448278982, + "learning_rate": 3.438807215079865e-06, + "loss": 1.2001, + "step": 5596 + }, + { + "epoch": 0.7923833793445176, + "grad_norm": 8.867109632780352, + "learning_rate": 3.438275941449492e-06, + "loss": 1.3096, + "step": 5597 + }, + { + "epoch": 0.7925249522191548, + "grad_norm": 8.696471048857305, + "learning_rate": 3.437744618494653e-06, + "loss": 1.2164, + "step": 5598 + }, + { + "epoch": 0.792666525093792, + "grad_norm": 8.979009097340152, + "learning_rate": 3.437213246243277e-06, + "loss": 1.1722, + "step": 5599 + }, + { + "epoch": 0.7928080979684292, + "grad_norm": 9.212779448992647, + "learning_rate": 3.4366818247233015e-06, + "loss": 1.3019, + "step": 5600 + }, + { + "epoch": 0.7929496708430664, + "grad_norm": 7.874520907890752, + "learning_rate": 
3.4361503539626593e-06, + "loss": 1.0359, + "step": 5601 + }, + { + "epoch": 0.7930912437177037, + "grad_norm": 9.574102008034691, + "learning_rate": 3.4356188339892915e-06, + "loss": 1.1509, + "step": 5602 + }, + { + "epoch": 0.7932328165923409, + "grad_norm": 8.488210692729073, + "learning_rate": 3.4350872648311396e-06, + "loss": 1.1364, + "step": 5603 + }, + { + "epoch": 0.7933743894669781, + "grad_norm": 8.547791259301881, + "learning_rate": 3.434555646516147e-06, + "loss": 1.3167, + "step": 5604 + }, + { + "epoch": 0.7935159623416154, + "grad_norm": 9.546013076757285, + "learning_rate": 3.434023979072262e-06, + "loss": 1.3116, + "step": 5605 + }, + { + "epoch": 0.7936575352162526, + "grad_norm": 9.470644116193256, + "learning_rate": 3.4334922625274312e-06, + "loss": 1.216, + "step": 5606 + }, + { + "epoch": 0.7937991080908898, + "grad_norm": 9.151447169192824, + "learning_rate": 3.432960496909609e-06, + "loss": 1.4724, + "step": 5607 + }, + { + "epoch": 0.7939406809655271, + "grad_norm": 9.052620389317445, + "learning_rate": 3.4324286822467496e-06, + "loss": 1.3992, + "step": 5608 + }, + { + "epoch": 0.7940822538401642, + "grad_norm": 8.265736223997656, + "learning_rate": 3.431896818566809e-06, + "loss": 1.1354, + "step": 5609 + }, + { + "epoch": 0.7942238267148014, + "grad_norm": 7.843102173820278, + "learning_rate": 3.4313649058977473e-06, + "loss": 1.2133, + "step": 5610 + }, + { + "epoch": 0.7943653995894386, + "grad_norm": 7.995851633257364, + "learning_rate": 3.4308329442675276e-06, + "loss": 1.1694, + "step": 5611 + }, + { + "epoch": 0.7945069724640759, + "grad_norm": 8.525993991750363, + "learning_rate": 3.430300933704114e-06, + "loss": 1.3214, + "step": 5612 + }, + { + "epoch": 0.7946485453387131, + "grad_norm": 8.147266570800516, + "learning_rate": 3.4297688742354728e-06, + "loss": 1.1988, + "step": 5613 + }, + { + "epoch": 0.7947901182133503, + "grad_norm": 7.242645216286927, + "learning_rate": 3.4292367658895764e-06, + "loss": 1.2691, + "step": 5614 + }, + { + "epoch": 0.7949316910879876, + "grad_norm": 8.2230538242661, + "learning_rate": 3.4287046086943956e-06, + "loss": 1.2563, + "step": 5615 + }, + { + "epoch": 0.7950732639626248, + "grad_norm": 9.038339544743504, + "learning_rate": 3.428172402677906e-06, + "loss": 1.1168, + "step": 5616 + }, + { + "epoch": 0.795214836837262, + "grad_norm": 8.276043561611514, + "learning_rate": 3.4276401478680856e-06, + "loss": 1.3338, + "step": 5617 + }, + { + "epoch": 0.7953564097118992, + "grad_norm": 8.339188718421314, + "learning_rate": 3.427107844292914e-06, + "loss": 1.223, + "step": 5618 + }, + { + "epoch": 0.7954979825865364, + "grad_norm": 9.479483284046633, + "learning_rate": 3.426575491980374e-06, + "loss": 1.0724, + "step": 5619 + }, + { + "epoch": 0.7956395554611736, + "grad_norm": 8.523904733412785, + "learning_rate": 3.426043090958452e-06, + "loss": 1.2427, + "step": 5620 + }, + { + "epoch": 0.7957811283358108, + "grad_norm": 7.696081070835789, + "learning_rate": 3.4255106412551352e-06, + "loss": 1.1902, + "step": 5621 + }, + { + "epoch": 0.7959227012104481, + "grad_norm": 8.22717642236786, + "learning_rate": 3.4249781428984143e-06, + "loss": 1.1763, + "step": 5622 + }, + { + "epoch": 0.7960642740850853, + "grad_norm": 8.097816417856402, + "learning_rate": 3.424445595916281e-06, + "loss": 1.2244, + "step": 5623 + }, + { + "epoch": 0.7962058469597225, + "grad_norm": 9.24424440868479, + "learning_rate": 3.423913000336732e-06, + "loss": 1.4762, + "step": 5624 + }, + { + "epoch": 0.7963474198343597, + "grad_norm": 
9.483687198340979, + "learning_rate": 3.423380356187766e-06, + "loss": 1.1896, + "step": 5625 + }, + { + "epoch": 0.796488992708997, + "grad_norm": 8.566310508043404, + "learning_rate": 3.422847663497384e-06, + "loss": 1.3442, + "step": 5626 + }, + { + "epoch": 0.7966305655836342, + "grad_norm": 9.111993586697889, + "learning_rate": 3.4223149222935875e-06, + "loss": 1.2176, + "step": 5627 + }, + { + "epoch": 0.7967721384582714, + "grad_norm": 7.070711746324367, + "learning_rate": 3.421782132604383e-06, + "loss": 1.2957, + "step": 5628 + }, + { + "epoch": 0.7969137113329087, + "grad_norm": 11.02989642346222, + "learning_rate": 3.4212492944577796e-06, + "loss": 1.2928, + "step": 5629 + }, + { + "epoch": 0.7970552842075458, + "grad_norm": 8.58350863478324, + "learning_rate": 3.420716407881788e-06, + "loss": 1.4643, + "step": 5630 + }, + { + "epoch": 0.797196857082183, + "grad_norm": 9.116241850593676, + "learning_rate": 3.4201834729044208e-06, + "loss": 1.2764, + "step": 5631 + }, + { + "epoch": 0.7973384299568202, + "grad_norm": 7.7847855590215955, + "learning_rate": 3.4196504895536948e-06, + "loss": 1.1452, + "step": 5632 + }, + { + "epoch": 0.7974800028314575, + "grad_norm": 8.556360870155725, + "learning_rate": 3.419117457857628e-06, + "loss": 1.1578, + "step": 5633 + }, + { + "epoch": 0.7976215757060947, + "grad_norm": 8.951461024881942, + "learning_rate": 3.4185843778442417e-06, + "loss": 1.0394, + "step": 5634 + }, + { + "epoch": 0.7977631485807319, + "grad_norm": 10.7570965431011, + "learning_rate": 3.4180512495415603e-06, + "loss": 1.1933, + "step": 5635 + }, + { + "epoch": 0.7979047214553692, + "grad_norm": 9.825068303227512, + "learning_rate": 3.417518072977609e-06, + "loss": 1.2837, + "step": 5636 + }, + { + "epoch": 0.7980462943300064, + "grad_norm": 9.592960685542053, + "learning_rate": 3.4169848481804165e-06, + "loss": 1.3342, + "step": 5637 + }, + { + "epoch": 0.7981878672046436, + "grad_norm": 10.020034176583549, + "learning_rate": 3.416451575178014e-06, + "loss": 1.2307, + "step": 5638 + }, + { + "epoch": 0.7983294400792809, + "grad_norm": 8.567049697249148, + "learning_rate": 3.4159182539984352e-06, + "loss": 1.3354, + "step": 5639 + }, + { + "epoch": 0.798471012953918, + "grad_norm": 9.662281708021961, + "learning_rate": 3.4153848846697174e-06, + "loss": 1.2123, + "step": 5640 + }, + { + "epoch": 0.7986125858285552, + "grad_norm": 9.86836077434133, + "learning_rate": 3.4148514672198986e-06, + "loss": 1.2004, + "step": 5641 + }, + { + "epoch": 0.7987541587031924, + "grad_norm": 10.748266390805302, + "learning_rate": 3.414318001677021e-06, + "loss": 1.3468, + "step": 5642 + }, + { + "epoch": 0.7988957315778297, + "grad_norm": 9.166969248083715, + "learning_rate": 3.4137844880691275e-06, + "loss": 1.2797, + "step": 5643 + }, + { + "epoch": 0.7990373044524669, + "grad_norm": 8.436891434939822, + "learning_rate": 3.413250926424264e-06, + "loss": 1.1905, + "step": 5644 + }, + { + "epoch": 0.7991788773271041, + "grad_norm": 6.7295366152076115, + "learning_rate": 3.4127173167704807e-06, + "loss": 1.2038, + "step": 5645 + }, + { + "epoch": 0.7993204502017414, + "grad_norm": 8.626195852333316, + "learning_rate": 3.4121836591358288e-06, + "loss": 1.3686, + "step": 5646 + }, + { + "epoch": 0.7994620230763786, + "grad_norm": 8.057552268497568, + "learning_rate": 3.4116499535483623e-06, + "loss": 1.2902, + "step": 5647 + }, + { + "epoch": 0.7996035959510158, + "grad_norm": 11.803151139888648, + "learning_rate": 3.4111162000361363e-06, + "loss": 1.1868, + "step": 5648 + }, + { + "epoch": 
0.799745168825653, + "grad_norm": 9.451934890865063, + "learning_rate": 3.4105823986272125e-06, + "loss": 1.2612, + "step": 5649 + }, + { + "epoch": 0.7998867417002902, + "grad_norm": 10.10039743075872, + "learning_rate": 3.41004854934965e-06, + "loss": 1.3255, + "step": 5650 + }, + { + "epoch": 0.8000283145749274, + "grad_norm": 10.933035151749698, + "learning_rate": 3.4095146522315144e-06, + "loss": 1.3242, + "step": 5651 + }, + { + "epoch": 0.8001698874495646, + "grad_norm": 7.958998514174557, + "learning_rate": 3.408980707300871e-06, + "loss": 1.2654, + "step": 5652 + }, + { + "epoch": 0.8003114603242019, + "grad_norm": 11.123939034951617, + "learning_rate": 3.4084467145857903e-06, + "loss": 1.1701, + "step": 5653 + }, + { + "epoch": 0.8004530331988391, + "grad_norm": 10.711848492212592, + "learning_rate": 3.4079126741143427e-06, + "loss": 1.1418, + "step": 5654 + }, + { + "epoch": 0.8005946060734763, + "grad_norm": 10.441935984270799, + "learning_rate": 3.407378585914603e-06, + "loss": 1.2137, + "step": 5655 + }, + { + "epoch": 0.8007361789481136, + "grad_norm": 8.93912048587752, + "learning_rate": 3.4068444500146476e-06, + "loss": 1.2602, + "step": 5656 + }, + { + "epoch": 0.8008777518227508, + "grad_norm": 8.427958200605431, + "learning_rate": 3.4063102664425557e-06, + "loss": 1.2651, + "step": 5657 + }, + { + "epoch": 0.801019324697388, + "grad_norm": 10.770274361150376, + "learning_rate": 3.405776035226409e-06, + "loss": 1.2877, + "step": 5658 + }, + { + "epoch": 0.8011608975720252, + "grad_norm": 9.005203861892205, + "learning_rate": 3.405241756394291e-06, + "loss": 1.219, + "step": 5659 + }, + { + "epoch": 0.8013024704466625, + "grad_norm": 9.356698852454713, + "learning_rate": 3.4047074299742894e-06, + "loss": 1.1592, + "step": 5660 + }, + { + "epoch": 0.8014440433212996, + "grad_norm": 11.599161946508382, + "learning_rate": 3.4041730559944918e-06, + "loss": 1.3808, + "step": 5661 + }, + { + "epoch": 0.8015856161959368, + "grad_norm": 9.418640289983973, + "learning_rate": 3.403638634482992e-06, + "loss": 1.2182, + "step": 5662 + }, + { + "epoch": 0.801727189070574, + "grad_norm": 8.685475065407557, + "learning_rate": 3.403104165467883e-06, + "loss": 1.062, + "step": 5663 + }, + { + "epoch": 0.8018687619452113, + "grad_norm": 9.927485956524551, + "learning_rate": 3.4025696489772607e-06, + "loss": 1.1007, + "step": 5664 + }, + { + "epoch": 0.8020103348198485, + "grad_norm": 9.949100656935578, + "learning_rate": 3.402035085039225e-06, + "loss": 1.2968, + "step": 5665 + }, + { + "epoch": 0.8021519076944857, + "grad_norm": 7.956457852437678, + "learning_rate": 3.401500473681878e-06, + "loss": 1.1431, + "step": 5666 + }, + { + "epoch": 0.802293480569123, + "grad_norm": 9.215133940561063, + "learning_rate": 3.4009658149333223e-06, + "loss": 1.3138, + "step": 5667 + }, + { + "epoch": 0.8024350534437602, + "grad_norm": 10.059827366707106, + "learning_rate": 3.4004311088216667e-06, + "loss": 1.2733, + "step": 5668 + }, + { + "epoch": 0.8025766263183974, + "grad_norm": 9.659862447513568, + "learning_rate": 3.3998963553750186e-06, + "loss": 1.3302, + "step": 5669 + }, + { + "epoch": 0.8027181991930347, + "grad_norm": 9.559092194937163, + "learning_rate": 3.3993615546214898e-06, + "loss": 1.259, + "step": 5670 + }, + { + "epoch": 0.8028597720676718, + "grad_norm": 9.024914482747105, + "learning_rate": 3.3988267065891945e-06, + "loss": 1.3378, + "step": 5671 + }, + { + "epoch": 0.803001344942309, + "grad_norm": 7.609980063694268, + "learning_rate": 3.39829181130625e-06, + "loss": 1.1763, + 
"step": 5672 + }, + { + "epoch": 0.8031429178169462, + "grad_norm": 9.446252263396868, + "learning_rate": 3.3977568688007745e-06, + "loss": 1.1, + "step": 5673 + }, + { + "epoch": 0.8032844906915835, + "grad_norm": 7.7497236141020345, + "learning_rate": 3.3972218791008902e-06, + "loss": 1.2873, + "step": 5674 + }, + { + "epoch": 0.8034260635662207, + "grad_norm": 10.646799500300988, + "learning_rate": 3.3966868422347204e-06, + "loss": 1.2446, + "step": 5675 + }, + { + "epoch": 0.8035676364408579, + "grad_norm": 8.674363117859908, + "learning_rate": 3.3961517582303916e-06, + "loss": 1.2408, + "step": 5676 + }, + { + "epoch": 0.8037092093154952, + "grad_norm": 9.125404244443734, + "learning_rate": 3.395616627116033e-06, + "loss": 1.1184, + "step": 5677 + }, + { + "epoch": 0.8038507821901324, + "grad_norm": 9.05781044478802, + "learning_rate": 3.395081448919777e-06, + "loss": 1.4428, + "step": 5678 + }, + { + "epoch": 0.8039923550647696, + "grad_norm": 7.770786343211715, + "learning_rate": 3.394546223669756e-06, + "loss": 1.1872, + "step": 5679 + }, + { + "epoch": 0.8041339279394069, + "grad_norm": 8.151431710333018, + "learning_rate": 3.394010951394107e-06, + "loss": 1.1701, + "step": 5680 + }, + { + "epoch": 0.804275500814044, + "grad_norm": 9.815665930972642, + "learning_rate": 3.3934756321209693e-06, + "loss": 1.0741, + "step": 5681 + }, + { + "epoch": 0.8044170736886812, + "grad_norm": 7.856385528924963, + "learning_rate": 3.3929402658784837e-06, + "loss": 1.3475, + "step": 5682 + }, + { + "epoch": 0.8045586465633184, + "grad_norm": 10.336521723261129, + "learning_rate": 3.3924048526947937e-06, + "loss": 1.3204, + "step": 5683 + }, + { + "epoch": 0.8047002194379557, + "grad_norm": 9.272296735726865, + "learning_rate": 3.3918693925980455e-06, + "loss": 1.2548, + "step": 5684 + }, + { + "epoch": 0.8048417923125929, + "grad_norm": 9.000115711739879, + "learning_rate": 3.3913338856163897e-06, + "loss": 1.1703, + "step": 5685 + }, + { + "epoch": 0.8049833651872301, + "grad_norm": 7.944264090505631, + "learning_rate": 3.390798331777976e-06, + "loss": 1.3635, + "step": 5686 + }, + { + "epoch": 0.8051249380618674, + "grad_norm": 8.843932025235844, + "learning_rate": 3.390262731110957e-06, + "loss": 1.2866, + "step": 5687 + }, + { + "epoch": 0.8052665109365046, + "grad_norm": 8.707819824653747, + "learning_rate": 3.3897270836434914e-06, + "loss": 1.3295, + "step": 5688 + }, + { + "epoch": 0.8054080838111418, + "grad_norm": 9.813498282221088, + "learning_rate": 3.3891913894037354e-06, + "loss": 1.1751, + "step": 5689 + }, + { + "epoch": 0.805549656685779, + "grad_norm": 7.50491858370216, + "learning_rate": 3.3886556484198517e-06, + "loss": 1.0916, + "step": 5690 + }, + { + "epoch": 0.8056912295604163, + "grad_norm": 9.5539827999274, + "learning_rate": 3.388119860720003e-06, + "loss": 1.2394, + "step": 5691 + }, + { + "epoch": 0.8058328024350534, + "grad_norm": 11.081805781214477, + "learning_rate": 3.3875840263323552e-06, + "loss": 1.2033, + "step": 5692 + }, + { + "epoch": 0.8059743753096906, + "grad_norm": 7.940553093006278, + "learning_rate": 3.3870481452850765e-06, + "loss": 1.3941, + "step": 5693 + }, + { + "epoch": 0.8061159481843279, + "grad_norm": 8.269844924967325, + "learning_rate": 3.386512217606339e-06, + "loss": 1.2094, + "step": 5694 + }, + { + "epoch": 0.8062575210589651, + "grad_norm": 10.456945157471525, + "learning_rate": 3.385976243324316e-06, + "loss": 1.4803, + "step": 5695 + }, + { + "epoch": 0.8063990939336023, + "grad_norm": 8.420499275246902, + "learning_rate": 
3.3854402224671813e-06, + "loss": 1.164, + "step": 5696 + }, + { + "epoch": 0.8065406668082395, + "grad_norm": 9.44990381262535, + "learning_rate": 3.3849041550631145e-06, + "loss": 1.303, + "step": 5697 + }, + { + "epoch": 0.8066822396828768, + "grad_norm": 8.779848468837159, + "learning_rate": 3.384368041140296e-06, + "loss": 1.2503, + "step": 5698 + }, + { + "epoch": 0.806823812557514, + "grad_norm": 8.759397419449323, + "learning_rate": 3.383831880726909e-06, + "loss": 1.2359, + "step": 5699 + }, + { + "epoch": 0.8069653854321512, + "grad_norm": 7.709895565052969, + "learning_rate": 3.3832956738511395e-06, + "loss": 1.294, + "step": 5700 + }, + { + "epoch": 0.8071069583067885, + "grad_norm": 9.090776619379504, + "learning_rate": 3.3827594205411746e-06, + "loss": 1.1822, + "step": 5701 + }, + { + "epoch": 0.8072485311814256, + "grad_norm": 8.03849922483032, + "learning_rate": 3.3822231208252053e-06, + "loss": 1.0817, + "step": 5702 + }, + { + "epoch": 0.8073901040560628, + "grad_norm": 7.125607916345821, + "learning_rate": 3.3816867747314242e-06, + "loss": 1.2914, + "step": 5703 + }, + { + "epoch": 0.8075316769307, + "grad_norm": 8.453011277539096, + "learning_rate": 3.381150382288027e-06, + "loss": 1.2479, + "step": 5704 + }, + { + "epoch": 0.8076732498053373, + "grad_norm": 10.073548879313872, + "learning_rate": 3.380613943523211e-06, + "loss": 1.309, + "step": 5705 + }, + { + "epoch": 0.8078148226799745, + "grad_norm": 8.562512919840845, + "learning_rate": 3.3800774584651767e-06, + "loss": 1.2482, + "step": 5706 + }, + { + "epoch": 0.8079563955546117, + "grad_norm": 10.208308960762253, + "learning_rate": 3.379540927142127e-06, + "loss": 1.2201, + "step": 5707 + }, + { + "epoch": 0.808097968429249, + "grad_norm": 7.7457839819239025, + "learning_rate": 3.3790043495822663e-06, + "loss": 1.2991, + "step": 5708 + }, + { + "epoch": 0.8082395413038862, + "grad_norm": 8.12549718656087, + "learning_rate": 3.378467725813802e-06, + "loss": 1.1942, + "step": 5709 + }, + { + "epoch": 0.8083811141785234, + "grad_norm": 10.461712376197559, + "learning_rate": 3.3779310558649447e-06, + "loss": 1.1936, + "step": 5710 + }, + { + "epoch": 0.8085226870531607, + "grad_norm": 9.241004074830567, + "learning_rate": 3.3773943397639068e-06, + "loss": 1.2278, + "step": 5711 + }, + { + "epoch": 0.8086642599277978, + "grad_norm": 9.421854351267797, + "learning_rate": 3.3768575775389022e-06, + "loss": 1.3372, + "step": 5712 + }, + { + "epoch": 0.808805832802435, + "grad_norm": 8.77460747082682, + "learning_rate": 3.3763207692181483e-06, + "loss": 1.1974, + "step": 5713 + }, + { + "epoch": 0.8089474056770722, + "grad_norm": 9.397134108629418, + "learning_rate": 3.375783914829865e-06, + "loss": 1.1459, + "step": 5714 + }, + { + "epoch": 0.8090889785517095, + "grad_norm": 8.770899635722774, + "learning_rate": 3.3752470144022745e-06, + "loss": 1.222, + "step": 5715 + }, + { + "epoch": 0.8092305514263467, + "grad_norm": 9.86846862353186, + "learning_rate": 3.374710067963602e-06, + "loss": 1.2676, + "step": 5716 + }, + { + "epoch": 0.8093721243009839, + "grad_norm": 7.810323427270744, + "learning_rate": 3.374173075542072e-06, + "loss": 1.2889, + "step": 5717 + }, + { + "epoch": 0.8095136971756212, + "grad_norm": 11.692840498394666, + "learning_rate": 3.373636037165916e-06, + "loss": 1.1544, + "step": 5718 + }, + { + "epoch": 0.8096552700502584, + "grad_norm": 10.841585938262828, + "learning_rate": 3.373098952863365e-06, + "loss": 1.2813, + "step": 5719 + }, + { + "epoch": 0.8097968429248956, + "grad_norm": 
8.097200225340721, + "learning_rate": 3.372561822662652e-06, + "loss": 1.2274, + "step": 5720 + }, + { + "epoch": 0.8099384157995329, + "grad_norm": 9.111290653392937, + "learning_rate": 3.3720246465920154e-06, + "loss": 1.2787, + "step": 5721 + }, + { + "epoch": 0.8100799886741701, + "grad_norm": 7.576439235421684, + "learning_rate": 3.3714874246796935e-06, + "loss": 1.3201, + "step": 5722 + }, + { + "epoch": 0.8102215615488072, + "grad_norm": 11.635460525926364, + "learning_rate": 3.3709501569539277e-06, + "loss": 1.3422, + "step": 5723 + }, + { + "epoch": 0.8103631344234444, + "grad_norm": 9.257492330302275, + "learning_rate": 3.370412843442961e-06, + "loss": 1.3257, + "step": 5724 + }, + { + "epoch": 0.8105047072980817, + "grad_norm": 11.3304826479781, + "learning_rate": 3.3698754841750403e-06, + "loss": 1.1832, + "step": 5725 + }, + { + "epoch": 0.8106462801727189, + "grad_norm": 8.619545358940007, + "learning_rate": 3.369338079178414e-06, + "loss": 1.2059, + "step": 5726 + }, + { + "epoch": 0.8107878530473561, + "grad_norm": 9.574552632132153, + "learning_rate": 3.368800628481333e-06, + "loss": 1.1932, + "step": 5727 + }, + { + "epoch": 0.8109294259219934, + "grad_norm": 11.274154841176012, + "learning_rate": 3.3682631321120507e-06, + "loss": 1.1458, + "step": 5728 + }, + { + "epoch": 0.8110709987966306, + "grad_norm": 9.77512584712588, + "learning_rate": 3.3677255900988236e-06, + "loss": 1.3902, + "step": 5729 + }, + { + "epoch": 0.8112125716712678, + "grad_norm": 9.738156289937336, + "learning_rate": 3.3671880024699085e-06, + "loss": 1.4208, + "step": 5730 + }, + { + "epoch": 0.811354144545905, + "grad_norm": 8.874653124410738, + "learning_rate": 3.3666503692535667e-06, + "loss": 1.1856, + "step": 5731 + }, + { + "epoch": 0.8114957174205423, + "grad_norm": 8.190131543492267, + "learning_rate": 3.3661126904780624e-06, + "loss": 1.3293, + "step": 5732 + }, + { + "epoch": 0.8116372902951794, + "grad_norm": 10.124061446782248, + "learning_rate": 3.3655749661716585e-06, + "loss": 1.2568, + "step": 5733 + }, + { + "epoch": 0.8117788631698166, + "grad_norm": 8.001928573842616, + "learning_rate": 3.3650371963626243e-06, + "loss": 1.2065, + "step": 5734 + }, + { + "epoch": 0.8119204360444539, + "grad_norm": 9.421370510177331, + "learning_rate": 3.3644993810792297e-06, + "loss": 1.1964, + "step": 5735 + }, + { + "epoch": 0.8120620089190911, + "grad_norm": 10.04224361382039, + "learning_rate": 3.3639615203497467e-06, + "loss": 1.2417, + "step": 5736 + }, + { + "epoch": 0.8122035817937283, + "grad_norm": 9.133219740038243, + "learning_rate": 3.3634236142024516e-06, + "loss": 1.1444, + "step": 5737 + }, + { + "epoch": 0.8123451546683655, + "grad_norm": 8.613305165783125, + "learning_rate": 3.362885662665621e-06, + "loss": 1.0802, + "step": 5738 + }, + { + "epoch": 0.8124867275430028, + "grad_norm": 9.283473714957596, + "learning_rate": 3.3623476657675342e-06, + "loss": 1.1694, + "step": 5739 + }, + { + "epoch": 0.81262830041764, + "grad_norm": 10.280085257091324, + "learning_rate": 3.3618096235364734e-06, + "loss": 1.1441, + "step": 5740 + }, + { + "epoch": 0.8127698732922772, + "grad_norm": 8.17799163114885, + "learning_rate": 3.361271536000723e-06, + "loss": 1.1994, + "step": 5741 + }, + { + "epoch": 0.8129114461669145, + "grad_norm": 8.906841485978498, + "learning_rate": 3.3607334031885707e-06, + "loss": 1.4136, + "step": 5742 + }, + { + "epoch": 0.8130530190415516, + "grad_norm": 7.647586825857762, + "learning_rate": 3.3601952251283056e-06, + "loss": 1.2736, + "step": 5743 + }, + { + 
"epoch": 0.8131945919161888, + "grad_norm": 8.735988188180169, + "learning_rate": 3.359657001848218e-06, + "loss": 1.3136, + "step": 5744 + }, + { + "epoch": 0.813336164790826, + "grad_norm": 9.902353380789203, + "learning_rate": 3.359118733376603e-06, + "loss": 1.2284, + "step": 5745 + }, + { + "epoch": 0.8134777376654633, + "grad_norm": 11.10953192190948, + "learning_rate": 3.358580419741757e-06, + "loss": 1.295, + "step": 5746 + }, + { + "epoch": 0.8136193105401005, + "grad_norm": 8.691002887409763, + "learning_rate": 3.3580420609719783e-06, + "loss": 1.2711, + "step": 5747 + }, + { + "epoch": 0.8137608834147377, + "grad_norm": 9.449332997836615, + "learning_rate": 3.3575036570955687e-06, + "loss": 1.4344, + "step": 5748 + }, + { + "epoch": 0.813902456289375, + "grad_norm": 7.7101357766947505, + "learning_rate": 3.356965208140831e-06, + "loss": 1.2796, + "step": 5749 + }, + { + "epoch": 0.8140440291640122, + "grad_norm": 12.521884529218672, + "learning_rate": 3.3564267141360706e-06, + "loss": 1.2829, + "step": 5750 + }, + { + "epoch": 0.8141856020386494, + "grad_norm": 9.171050208469559, + "learning_rate": 3.3558881751095975e-06, + "loss": 1.2371, + "step": 5751 + }, + { + "epoch": 0.8143271749132867, + "grad_norm": 8.35984318928376, + "learning_rate": 3.3553495910897206e-06, + "loss": 1.3266, + "step": 5752 + }, + { + "epoch": 0.8144687477879239, + "grad_norm": 9.186319080673886, + "learning_rate": 3.354810962104754e-06, + "loss": 1.3528, + "step": 5753 + }, + { + "epoch": 0.814610320662561, + "grad_norm": 9.92245515343721, + "learning_rate": 3.354272288183012e-06, + "loss": 1.2931, + "step": 5754 + }, + { + "epoch": 0.8147518935371982, + "grad_norm": 9.000545485178202, + "learning_rate": 3.353733569352813e-06, + "loss": 1.1994, + "step": 5755 + }, + { + "epoch": 0.8148934664118355, + "grad_norm": 9.546811866863674, + "learning_rate": 3.3531948056424766e-06, + "loss": 1.2789, + "step": 5756 + }, + { + "epoch": 0.8150350392864727, + "grad_norm": 8.987409898415056, + "learning_rate": 3.352655997080325e-06, + "loss": 1.2027, + "step": 5757 + }, + { + "epoch": 0.8151766121611099, + "grad_norm": 9.203767867376, + "learning_rate": 3.3521171436946844e-06, + "loss": 1.2488, + "step": 5758 + }, + { + "epoch": 0.8153181850357472, + "grad_norm": 9.96048176040768, + "learning_rate": 3.35157824551388e-06, + "loss": 1.3027, + "step": 5759 + }, + { + "epoch": 0.8154597579103844, + "grad_norm": 8.354264738374507, + "learning_rate": 3.351039302566243e-06, + "loss": 1.1957, + "step": 5760 + }, + { + "epoch": 0.8156013307850216, + "grad_norm": 7.372815083277029, + "learning_rate": 3.350500314880104e-06, + "loss": 1.2295, + "step": 5761 + }, + { + "epoch": 0.8157429036596588, + "grad_norm": 7.665575226203444, + "learning_rate": 3.3499612824837978e-06, + "loss": 1.1923, + "step": 5762 + }, + { + "epoch": 0.8158844765342961, + "grad_norm": 9.862287620123983, + "learning_rate": 3.3494222054056606e-06, + "loss": 1.2083, + "step": 5763 + }, + { + "epoch": 0.8160260494089332, + "grad_norm": 8.435976244065039, + "learning_rate": 3.3488830836740315e-06, + "loss": 1.018, + "step": 5764 + }, + { + "epoch": 0.8161676222835704, + "grad_norm": 8.804736457966149, + "learning_rate": 3.3483439173172517e-06, + "loss": 1.2604, + "step": 5765 + }, + { + "epoch": 0.8163091951582077, + "grad_norm": 10.398662378414539, + "learning_rate": 3.347804706363664e-06, + "loss": 1.2921, + "step": 5766 + }, + { + "epoch": 0.8164507680328449, + "grad_norm": 8.126078842137382, + "learning_rate": 3.3472654508416157e-06, + "loss": 1.2832, + 
"step": 5767 + }, + { + "epoch": 0.8165923409074821, + "grad_norm": 10.62705596778715, + "learning_rate": 3.346726150779455e-06, + "loss": 1.3776, + "step": 5768 + }, + { + "epoch": 0.8167339137821193, + "grad_norm": 10.797273885201337, + "learning_rate": 3.3461868062055313e-06, + "loss": 1.2638, + "step": 5769 + }, + { + "epoch": 0.8168754866567566, + "grad_norm": 9.257458540833651, + "learning_rate": 3.345647417148198e-06, + "loss": 1.3619, + "step": 5770 + }, + { + "epoch": 0.8170170595313938, + "grad_norm": 8.406975583665952, + "learning_rate": 3.3451079836358107e-06, + "loss": 1.2552, + "step": 5771 + }, + { + "epoch": 0.817158632406031, + "grad_norm": 8.469831446892526, + "learning_rate": 3.344568505696727e-06, + "loss": 1.2308, + "step": 5772 + }, + { + "epoch": 0.8173002052806683, + "grad_norm": 8.184268226232922, + "learning_rate": 3.3440289833593053e-06, + "loss": 1.1529, + "step": 5773 + }, + { + "epoch": 0.8174417781553055, + "grad_norm": 9.639955073841557, + "learning_rate": 3.3434894166519104e-06, + "loss": 1.2679, + "step": 5774 + }, + { + "epoch": 0.8175833510299426, + "grad_norm": 9.865645988183447, + "learning_rate": 3.3429498056029066e-06, + "loss": 1.218, + "step": 5775 + }, + { + "epoch": 0.8177249239045798, + "grad_norm": 9.256978881904157, + "learning_rate": 3.342410150240659e-06, + "loss": 1.1522, + "step": 5776 + }, + { + "epoch": 0.8178664967792171, + "grad_norm": 10.285385459608909, + "learning_rate": 3.3418704505935383e-06, + "loss": 1.2748, + "step": 5777 + }, + { + "epoch": 0.8180080696538543, + "grad_norm": 9.182132736160627, + "learning_rate": 3.341330706689916e-06, + "loss": 1.2354, + "step": 5778 + }, + { + "epoch": 0.8181496425284915, + "grad_norm": 7.2766399933970805, + "learning_rate": 3.3407909185581656e-06, + "loss": 1.222, + "step": 5779 + }, + { + "epoch": 0.8182912154031288, + "grad_norm": 7.855705967095196, + "learning_rate": 3.340251086226663e-06, + "loss": 1.06, + "step": 5780 + }, + { + "epoch": 0.818432788277766, + "grad_norm": 9.061272899816085, + "learning_rate": 3.339711209723788e-06, + "loss": 1.1216, + "step": 5781 + }, + { + "epoch": 0.8185743611524032, + "grad_norm": 10.818833100579893, + "learning_rate": 3.33917128907792e-06, + "loss": 1.2011, + "step": 5782 + }, + { + "epoch": 0.8187159340270405, + "grad_norm": 8.167004740458038, + "learning_rate": 3.3386313243174436e-06, + "loss": 1.2743, + "step": 5783 + }, + { + "epoch": 0.8188575069016777, + "grad_norm": 8.46982739341356, + "learning_rate": 3.338091315470744e-06, + "loss": 1.1859, + "step": 5784 + }, + { + "epoch": 0.8189990797763148, + "grad_norm": 8.770135436453103, + "learning_rate": 3.337551262566209e-06, + "loss": 1.2815, + "step": 5785 + }, + { + "epoch": 0.819140652650952, + "grad_norm": 9.622831422326774, + "learning_rate": 3.337011165632228e-06, + "loss": 1.4267, + "step": 5786 + }, + { + "epoch": 0.8192822255255893, + "grad_norm": 9.777491618313533, + "learning_rate": 3.3364710246971937e-06, + "loss": 1.2398, + "step": 5787 + }, + { + "epoch": 0.8194237984002265, + "grad_norm": 8.19824631709619, + "learning_rate": 3.335930839789502e-06, + "loss": 1.1298, + "step": 5788 + }, + { + "epoch": 0.8195653712748637, + "grad_norm": 10.421669656311938, + "learning_rate": 3.335390610937549e-06, + "loss": 1.3415, + "step": 5789 + }, + { + "epoch": 0.819706944149501, + "grad_norm": 7.026395350990291, + "learning_rate": 3.3348503381697358e-06, + "loss": 1.2138, + "step": 5790 + }, + { + "epoch": 0.8198485170241382, + "grad_norm": 8.888978078182783, + "learning_rate": 
3.3343100215144614e-06, + "loss": 1.2158, + "step": 5791 + }, + { + "epoch": 0.8199900898987754, + "grad_norm": 8.125238972597446, + "learning_rate": 3.3337696610001314e-06, + "loss": 1.2409, + "step": 5792 + }, + { + "epoch": 0.8201316627734127, + "grad_norm": 9.622902381513626, + "learning_rate": 3.333229256655153e-06, + "loss": 1.2563, + "step": 5793 + }, + { + "epoch": 0.8202732356480499, + "grad_norm": 9.127172760103395, + "learning_rate": 3.332688808507932e-06, + "loss": 1.0686, + "step": 5794 + }, + { + "epoch": 0.820414808522687, + "grad_norm": 8.432657844520547, + "learning_rate": 3.332148316586882e-06, + "loss": 1.4241, + "step": 5795 + }, + { + "epoch": 0.8205563813973242, + "grad_norm": 7.637573131954107, + "learning_rate": 3.3316077809204168e-06, + "loss": 1.2642, + "step": 5796 + }, + { + "epoch": 0.8206979542719615, + "grad_norm": 10.29096317332192, + "learning_rate": 3.3310672015369495e-06, + "loss": 1.3033, + "step": 5797 + }, + { + "epoch": 0.8208395271465987, + "grad_norm": 10.722875040459101, + "learning_rate": 3.330526578464899e-06, + "loss": 1.26, + "step": 5798 + }, + { + "epoch": 0.8209811000212359, + "grad_norm": 8.417976000576925, + "learning_rate": 3.329985911732686e-06, + "loss": 1.3613, + "step": 5799 + }, + { + "epoch": 0.8211226728958732, + "grad_norm": 8.071176990393278, + "learning_rate": 3.329445201368732e-06, + "loss": 1.4301, + "step": 5800 + }, + { + "epoch": 0.8212642457705104, + "grad_norm": 10.513087517721587, + "learning_rate": 3.3289044474014624e-06, + "loss": 1.4134, + "step": 5801 + }, + { + "epoch": 0.8214058186451476, + "grad_norm": 9.155695459023248, + "learning_rate": 3.3283636498593043e-06, + "loss": 1.1919, + "step": 5802 + }, + { + "epoch": 0.8215473915197848, + "grad_norm": 7.6811899479256, + "learning_rate": 3.3278228087706863e-06, + "loss": 1.2305, + "step": 5803 + }, + { + "epoch": 0.8216889643944221, + "grad_norm": 9.794453542205899, + "learning_rate": 3.327281924164041e-06, + "loss": 1.1461, + "step": 5804 + }, + { + "epoch": 0.8218305372690593, + "grad_norm": 9.580737306896062, + "learning_rate": 3.3267409960678015e-06, + "loss": 1.3496, + "step": 5805 + }, + { + "epoch": 0.8219721101436964, + "grad_norm": 8.389678390213234, + "learning_rate": 3.326200024510405e-06, + "loss": 1.1337, + "step": 5806 + }, + { + "epoch": 0.8221136830183337, + "grad_norm": 8.652449121113456, + "learning_rate": 3.3256590095202883e-06, + "loss": 1.2108, + "step": 5807 + }, + { + "epoch": 0.8222552558929709, + "grad_norm": 7.339662623746403, + "learning_rate": 3.3251179511258934e-06, + "loss": 1.2666, + "step": 5808 + }, + { + "epoch": 0.8223968287676081, + "grad_norm": 13.083294392586522, + "learning_rate": 3.324576849355663e-06, + "loss": 1.2809, + "step": 5809 + }, + { + "epoch": 0.8225384016422453, + "grad_norm": 9.316673214204384, + "learning_rate": 3.3240357042380423e-06, + "loss": 1.2696, + "step": 5810 + }, + { + "epoch": 0.8226799745168826, + "grad_norm": 7.149379313811368, + "learning_rate": 3.3234945158014792e-06, + "loss": 1.3139, + "step": 5811 + }, + { + "epoch": 0.8228215473915198, + "grad_norm": 8.732767629380161, + "learning_rate": 3.322953284074424e-06, + "loss": 1.1775, + "step": 5812 + }, + { + "epoch": 0.822963120266157, + "grad_norm": 11.730762072797903, + "learning_rate": 3.3224120090853275e-06, + "loss": 1.2239, + "step": 5813 + }, + { + "epoch": 0.8231046931407943, + "grad_norm": 9.660363564925648, + "learning_rate": 3.321870690862645e-06, + "loss": 1.1945, + "step": 5814 + }, + { + "epoch": 0.8232462660154315, + "grad_norm": 
7.426209671689228, + "learning_rate": 3.3213293294348335e-06, + "loss": 1.2712, + "step": 5815 + }, + { + "epoch": 0.8233878388900686, + "grad_norm": 8.505761830310371, + "learning_rate": 3.3207879248303513e-06, + "loss": 1.2508, + "step": 5816 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 10.314720700763818, + "learning_rate": 3.3202464770776597e-06, + "loss": 1.046, + "step": 5817 + }, + { + "epoch": 0.8236709846393431, + "grad_norm": 8.783344928461151, + "learning_rate": 3.319704986205223e-06, + "loss": 1.2614, + "step": 5818 + }, + { + "epoch": 0.8238125575139803, + "grad_norm": 10.047122175103915, + "learning_rate": 3.3191634522415064e-06, + "loss": 1.2368, + "step": 5819 + }, + { + "epoch": 0.8239541303886175, + "grad_norm": 7.597951723447952, + "learning_rate": 3.3186218752149767e-06, + "loss": 1.1545, + "step": 5820 + }, + { + "epoch": 0.8240957032632548, + "grad_norm": 8.045624333937768, + "learning_rate": 3.3180802551541063e-06, + "loss": 1.2201, + "step": 5821 + }, + { + "epoch": 0.824237276137892, + "grad_norm": 7.8146779190830316, + "learning_rate": 3.3175385920873674e-06, + "loss": 1.3016, + "step": 5822 + }, + { + "epoch": 0.8243788490125292, + "grad_norm": 8.732118919132462, + "learning_rate": 3.316996886043234e-06, + "loss": 1.2619, + "step": 5823 + }, + { + "epoch": 0.8245204218871665, + "grad_norm": 8.040869744167964, + "learning_rate": 3.3164551370501826e-06, + "loss": 1.4168, + "step": 5824 + }, + { + "epoch": 0.8246619947618037, + "grad_norm": 8.449517627920025, + "learning_rate": 3.3159133451366937e-06, + "loss": 1.2166, + "step": 5825 + }, + { + "epoch": 0.8248035676364408, + "grad_norm": 11.931230586392338, + "learning_rate": 3.315371510331249e-06, + "loss": 1.4154, + "step": 5826 + }, + { + "epoch": 0.824945140511078, + "grad_norm": 8.812474460429526, + "learning_rate": 3.3148296326623327e-06, + "loss": 1.1507, + "step": 5827 + }, + { + "epoch": 0.8250867133857153, + "grad_norm": 8.239547581520755, + "learning_rate": 3.3142877121584295e-06, + "loss": 1.149, + "step": 5828 + }, + { + "epoch": 0.8252282862603525, + "grad_norm": 8.408971409803843, + "learning_rate": 3.313745748848028e-06, + "loss": 1.3267, + "step": 5829 + }, + { + "epoch": 0.8253698591349897, + "grad_norm": 10.075580695671118, + "learning_rate": 3.3132037427596193e-06, + "loss": 1.1872, + "step": 5830 + }, + { + "epoch": 0.825511432009627, + "grad_norm": 10.988502389016347, + "learning_rate": 3.3126616939216967e-06, + "loss": 1.2302, + "step": 5831 + }, + { + "epoch": 0.8256530048842642, + "grad_norm": 7.6696261899492, + "learning_rate": 3.3121196023627543e-06, + "loss": 1.1779, + "step": 5832 + }, + { + "epoch": 0.8257945777589014, + "grad_norm": 8.894897507006826, + "learning_rate": 3.31157746811129e-06, + "loss": 1.1744, + "step": 5833 + }, + { + "epoch": 0.8259361506335386, + "grad_norm": 8.8592458241244, + "learning_rate": 3.311035291195803e-06, + "loss": 1.2951, + "step": 5834 + }, + { + "epoch": 0.8260777235081759, + "grad_norm": 9.51600543484687, + "learning_rate": 3.3104930716447965e-06, + "loss": 1.2819, + "step": 5835 + }, + { + "epoch": 0.8262192963828131, + "grad_norm": 9.815609578943633, + "learning_rate": 3.3099508094867727e-06, + "loss": 1.1567, + "step": 5836 + }, + { + "epoch": 0.8263608692574502, + "grad_norm": 8.073682024371784, + "learning_rate": 3.3094085047502395e-06, + "loss": 1.2914, + "step": 5837 + }, + { + "epoch": 0.8265024421320875, + "grad_norm": 9.37007683232907, + "learning_rate": 3.308866157463705e-06, + "loss": 1.2548, + "step": 5838 + }, + { + "epoch": 
0.8266440150067247, + "grad_norm": 10.269663488117791, + "learning_rate": 3.3083237676556777e-06, + "loss": 1.327, + "step": 5839 + }, + { + "epoch": 0.8267855878813619, + "grad_norm": 7.846174009938747, + "learning_rate": 3.3077813353546744e-06, + "loss": 1.21, + "step": 5840 + }, + { + "epoch": 0.8269271607559991, + "grad_norm": 10.08779703338724, + "learning_rate": 3.307238860589208e-06, + "loss": 1.2295, + "step": 5841 + }, + { + "epoch": 0.8270687336306364, + "grad_norm": 9.073215227145528, + "learning_rate": 3.3066963433877967e-06, + "loss": 1.2399, + "step": 5842 + }, + { + "epoch": 0.8272103065052736, + "grad_norm": 7.751481683667731, + "learning_rate": 3.306153783778961e-06, + "loss": 1.2157, + "step": 5843 + }, + { + "epoch": 0.8273518793799108, + "grad_norm": 8.698710501682985, + "learning_rate": 3.305611181791221e-06, + "loss": 1.1845, + "step": 5844 + }, + { + "epoch": 0.8274934522545481, + "grad_norm": 9.467346288794612, + "learning_rate": 3.305068537453102e-06, + "loss": 1.2113, + "step": 5845 + }, + { + "epoch": 0.8276350251291853, + "grad_norm": 9.2788896738296, + "learning_rate": 3.3045258507931306e-06, + "loss": 1.2204, + "step": 5846 + }, + { + "epoch": 0.8277765980038224, + "grad_norm": 12.44496000069688, + "learning_rate": 3.3039831218398346e-06, + "loss": 1.173, + "step": 5847 + }, + { + "epoch": 0.8279181708784596, + "grad_norm": 8.790503354117442, + "learning_rate": 3.303440350621745e-06, + "loss": 1.1795, + "step": 5848 + }, + { + "epoch": 0.8280597437530969, + "grad_norm": 12.828205884336963, + "learning_rate": 3.3028975371673966e-06, + "loss": 1.491, + "step": 5849 + }, + { + "epoch": 0.8282013166277341, + "grad_norm": 10.69936564338739, + "learning_rate": 3.3023546815053227e-06, + "loss": 1.2983, + "step": 5850 + }, + { + "epoch": 0.8283428895023713, + "grad_norm": 11.888193611262697, + "learning_rate": 3.301811783664061e-06, + "loss": 1.2951, + "step": 5851 + }, + { + "epoch": 0.8284844623770086, + "grad_norm": 9.806834146230866, + "learning_rate": 3.3012688436721518e-06, + "loss": 1.3313, + "step": 5852 + }, + { + "epoch": 0.8286260352516458, + "grad_norm": 8.858471590567891, + "learning_rate": 3.3007258615581372e-06, + "loss": 1.2521, + "step": 5853 + }, + { + "epoch": 0.828767608126283, + "grad_norm": 7.465035636188253, + "learning_rate": 3.300182837350561e-06, + "loss": 1.2956, + "step": 5854 + }, + { + "epoch": 0.8289091810009203, + "grad_norm": 7.403056941236864, + "learning_rate": 3.29963977107797e-06, + "loss": 1.2105, + "step": 5855 + }, + { + "epoch": 0.8290507538755575, + "grad_norm": 13.747503990426658, + "learning_rate": 3.2990966627689126e-06, + "loss": 1.2676, + "step": 5856 + }, + { + "epoch": 0.8291923267501946, + "grad_norm": 11.429846365522728, + "learning_rate": 3.2985535124519387e-06, + "loss": 1.3988, + "step": 5857 + }, + { + "epoch": 0.8293338996248318, + "grad_norm": 9.509389553754678, + "learning_rate": 3.2980103201556023e-06, + "loss": 1.1809, + "step": 5858 + }, + { + "epoch": 0.8294754724994691, + "grad_norm": 9.228402570405391, + "learning_rate": 3.297467085908459e-06, + "loss": 1.2574, + "step": 5859 + }, + { + "epoch": 0.8296170453741063, + "grad_norm": 9.144524158343042, + "learning_rate": 3.2969238097390655e-06, + "loss": 1.1609, + "step": 5860 + }, + { + "epoch": 0.8297586182487435, + "grad_norm": 10.280244447644245, + "learning_rate": 3.2963804916759805e-06, + "loss": 1.1501, + "step": 5861 + }, + { + "epoch": 0.8299001911233808, + "grad_norm": 12.538659091057458, + "learning_rate": 3.295837131747768e-06, + "loss": 1.3012, + 
"step": 5862 + }, + { + "epoch": 0.830041763998018, + "grad_norm": 11.570277552065722, + "learning_rate": 3.2952937299829902e-06, + "loss": 1.2281, + "step": 5863 + }, + { + "epoch": 0.8301833368726552, + "grad_norm": 7.935119512243277, + "learning_rate": 3.294750286410214e-06, + "loss": 1.2364, + "step": 5864 + }, + { + "epoch": 0.8303249097472925, + "grad_norm": 9.001117636903704, + "learning_rate": 3.2942068010580088e-06, + "loss": 1.2286, + "step": 5865 + }, + { + "epoch": 0.8304664826219297, + "grad_norm": 9.382190439152438, + "learning_rate": 3.2936632739549437e-06, + "loss": 1.1816, + "step": 5866 + }, + { + "epoch": 0.8306080554965669, + "grad_norm": 8.158136025505124, + "learning_rate": 3.2931197051295915e-06, + "loss": 1.2333, + "step": 5867 + }, + { + "epoch": 0.830749628371204, + "grad_norm": 10.72579004183302, + "learning_rate": 3.2925760946105277e-06, + "loss": 1.2405, + "step": 5868 + }, + { + "epoch": 0.8308912012458413, + "grad_norm": 8.455551617098322, + "learning_rate": 3.2920324424263305e-06, + "loss": 1.145, + "step": 5869 + }, + { + "epoch": 0.8310327741204785, + "grad_norm": 10.306222565604594, + "learning_rate": 3.291488748605578e-06, + "loss": 1.3003, + "step": 5870 + }, + { + "epoch": 0.8311743469951157, + "grad_norm": 9.836105042258534, + "learning_rate": 3.290945013176852e-06, + "loss": 1.3071, + "step": 5871 + }, + { + "epoch": 0.831315919869753, + "grad_norm": 8.795622118123552, + "learning_rate": 3.2904012361687367e-06, + "loss": 1.1783, + "step": 5872 + }, + { + "epoch": 0.8314574927443902, + "grad_norm": 11.163468961107272, + "learning_rate": 3.2898574176098176e-06, + "loss": 1.2682, + "step": 5873 + }, + { + "epoch": 0.8315990656190274, + "grad_norm": 9.070793198260308, + "learning_rate": 3.2893135575286828e-06, + "loss": 1.2204, + "step": 5874 + }, + { + "epoch": 0.8317406384936646, + "grad_norm": 8.488735139440864, + "learning_rate": 3.288769655953923e-06, + "loss": 1.1961, + "step": 5875 + }, + { + "epoch": 0.8318822113683019, + "grad_norm": 9.540679569631852, + "learning_rate": 3.2882257129141305e-06, + "loss": 1.2737, + "step": 5876 + }, + { + "epoch": 0.8320237842429391, + "grad_norm": 7.417867699005636, + "learning_rate": 3.287681728437899e-06, + "loss": 1.2604, + "step": 5877 + }, + { + "epoch": 0.8321653571175762, + "grad_norm": 8.749735364999305, + "learning_rate": 3.2871377025538274e-06, + "loss": 1.3587, + "step": 5878 + }, + { + "epoch": 0.8323069299922135, + "grad_norm": 9.502875445023617, + "learning_rate": 3.2865936352905144e-06, + "loss": 1.3759, + "step": 5879 + }, + { + "epoch": 0.8324485028668507, + "grad_norm": 9.086425348306756, + "learning_rate": 3.28604952667656e-06, + "loss": 1.2752, + "step": 5880 + }, + { + "epoch": 0.8325900757414879, + "grad_norm": 7.650496333358282, + "learning_rate": 3.2855053767405674e-06, + "loss": 1.2235, + "step": 5881 + }, + { + "epoch": 0.8327316486161251, + "grad_norm": 9.169045942922196, + "learning_rate": 3.2849611855111433e-06, + "loss": 1.2497, + "step": 5882 + }, + { + "epoch": 0.8328732214907624, + "grad_norm": 7.007511196229601, + "learning_rate": 3.284416953016895e-06, + "loss": 1.2661, + "step": 5883 + }, + { + "epoch": 0.8330147943653996, + "grad_norm": 8.770174148205026, + "learning_rate": 3.2838726792864315e-06, + "loss": 1.2316, + "step": 5884 + }, + { + "epoch": 0.8331563672400368, + "grad_norm": 8.785004705646017, + "learning_rate": 3.2833283643483672e-06, + "loss": 1.2102, + "step": 5885 + }, + { + "epoch": 0.8332979401146741, + "grad_norm": 8.202097219840237, + "learning_rate": 
3.2827840082313147e-06, + "loss": 1.2252, + "step": 5886 + }, + { + "epoch": 0.8334395129893113, + "grad_norm": 8.091768353573316, + "learning_rate": 3.28223961096389e-06, + "loss": 1.2711, + "step": 5887 + }, + { + "epoch": 0.8335810858639484, + "grad_norm": 9.38766844251655, + "learning_rate": 3.281695172574712e-06, + "loss": 1.4509, + "step": 5888 + }, + { + "epoch": 0.8337226587385856, + "grad_norm": 9.851819996263103, + "learning_rate": 3.281150693092402e-06, + "loss": 1.177, + "step": 5889 + }, + { + "epoch": 0.8338642316132229, + "grad_norm": 9.992160776205468, + "learning_rate": 3.280606172545582e-06, + "loss": 1.3416, + "step": 5890 + }, + { + "epoch": 0.8340058044878601, + "grad_norm": 9.248953270326307, + "learning_rate": 3.280061610962878e-06, + "loss": 1.3421, + "step": 5891 + }, + { + "epoch": 0.8341473773624973, + "grad_norm": 7.9433341625030165, + "learning_rate": 3.279517008372917e-06, + "loss": 1.1594, + "step": 5892 + }, + { + "epoch": 0.8342889502371346, + "grad_norm": 7.67769412988689, + "learning_rate": 3.2789723648043276e-06, + "loss": 1.1809, + "step": 5893 + }, + { + "epoch": 0.8344305231117718, + "grad_norm": 9.193892765557488, + "learning_rate": 3.2784276802857418e-06, + "loss": 1.3067, + "step": 5894 + }, + { + "epoch": 0.834572095986409, + "grad_norm": 8.800630408161377, + "learning_rate": 3.2778829548457935e-06, + "loss": 1.2551, + "step": 5895 + }, + { + "epoch": 0.8347136688610463, + "grad_norm": 8.271316731993585, + "learning_rate": 3.277338188513119e-06, + "loss": 1.1988, + "step": 5896 + }, + { + "epoch": 0.8348552417356835, + "grad_norm": 9.210744504710501, + "learning_rate": 3.2767933813163542e-06, + "loss": 1.2474, + "step": 5897 + }, + { + "epoch": 0.8349968146103207, + "grad_norm": 9.696502243681417, + "learning_rate": 3.276248533284141e-06, + "loss": 1.2096, + "step": 5898 + }, + { + "epoch": 0.8351383874849578, + "grad_norm": 8.953909794628911, + "learning_rate": 3.2757036444451212e-06, + "loss": 1.2615, + "step": 5899 + }, + { + "epoch": 0.8352799603595951, + "grad_norm": 9.833463980458792, + "learning_rate": 3.2751587148279395e-06, + "loss": 1.3571, + "step": 5900 + }, + { + "epoch": 0.8354215332342323, + "grad_norm": 9.189432991808445, + "learning_rate": 3.274613744461242e-06, + "loss": 1.1349, + "step": 5901 + }, + { + "epoch": 0.8355631061088695, + "grad_norm": 11.322111146145142, + "learning_rate": 3.2740687333736776e-06, + "loss": 1.2061, + "step": 5902 + }, + { + "epoch": 0.8357046789835068, + "grad_norm": 7.51288261264391, + "learning_rate": 3.2735236815938975e-06, + "loss": 1.1914, + "step": 5903 + }, + { + "epoch": 0.835846251858144, + "grad_norm": 6.728519026668225, + "learning_rate": 3.2729785891505533e-06, + "loss": 1.0824, + "step": 5904 + }, + { + "epoch": 0.8359878247327812, + "grad_norm": 6.970345763465099, + "learning_rate": 3.2724334560723015e-06, + "loss": 1.1031, + "step": 5905 + }, + { + "epoch": 0.8361293976074184, + "grad_norm": 8.38452517744278, + "learning_rate": 3.271888282387799e-06, + "loss": 1.2494, + "step": 5906 + }, + { + "epoch": 0.8362709704820557, + "grad_norm": 10.571697135960369, + "learning_rate": 3.2713430681257046e-06, + "loss": 1.3539, + "step": 5907 + }, + { + "epoch": 0.8364125433566929, + "grad_norm": 9.695758277549011, + "learning_rate": 3.2707978133146805e-06, + "loss": 1.1409, + "step": 5908 + }, + { + "epoch": 0.83655411623133, + "grad_norm": 9.531607799380488, + "learning_rate": 3.27025251798339e-06, + "loss": 1.1902, + "step": 5909 + }, + { + "epoch": 0.8366956891059673, + "grad_norm": 
7.746753135544362, + "learning_rate": 3.2697071821604986e-06, + "loss": 1.1991, + "step": 5910 + }, + { + "epoch": 0.8368372619806045, + "grad_norm": 9.695204691936869, + "learning_rate": 3.2691618058746757e-06, + "loss": 1.2063, + "step": 5911 + }, + { + "epoch": 0.8369788348552417, + "grad_norm": 9.21134377085892, + "learning_rate": 3.26861638915459e-06, + "loss": 1.2594, + "step": 5912 + }, + { + "epoch": 0.837120407729879, + "grad_norm": 10.77521626903754, + "learning_rate": 3.2680709320289123e-06, + "loss": 1.1042, + "step": 5913 + }, + { + "epoch": 0.8372619806045162, + "grad_norm": 8.913165917470812, + "learning_rate": 3.26752543452632e-06, + "loss": 1.2468, + "step": 5914 + }, + { + "epoch": 0.8374035534791534, + "grad_norm": 8.1899270689981, + "learning_rate": 3.266979896675487e-06, + "loss": 1.2606, + "step": 5915 + }, + { + "epoch": 0.8375451263537906, + "grad_norm": 9.838575571438293, + "learning_rate": 3.266434318505093e-06, + "loss": 1.2783, + "step": 5916 + }, + { + "epoch": 0.8376866992284279, + "grad_norm": 9.342483077387378, + "learning_rate": 3.2658887000438183e-06, + "loss": 1.2648, + "step": 5917 + }, + { + "epoch": 0.8378282721030651, + "grad_norm": 12.773456015850494, + "learning_rate": 3.265343041320346e-06, + "loss": 1.3355, + "step": 5918 + }, + { + "epoch": 0.8379698449777023, + "grad_norm": 7.877220446275991, + "learning_rate": 3.26479734236336e-06, + "loss": 1.1559, + "step": 5919 + }, + { + "epoch": 0.8381114178523394, + "grad_norm": 9.416039730121085, + "learning_rate": 3.2642516032015486e-06, + "loss": 1.336, + "step": 5920 + }, + { + "epoch": 0.8382529907269767, + "grad_norm": 7.639406696070628, + "learning_rate": 3.2637058238635995e-06, + "loss": 1.1896, + "step": 5921 + }, + { + "epoch": 0.8383945636016139, + "grad_norm": 8.139557459093387, + "learning_rate": 3.2631600043782054e-06, + "loss": 1.1648, + "step": 5922 + }, + { + "epoch": 0.8385361364762511, + "grad_norm": 10.672186694752686, + "learning_rate": 3.262614144774059e-06, + "loss": 1.2457, + "step": 5923 + }, + { + "epoch": 0.8386777093508884, + "grad_norm": 8.42366260814379, + "learning_rate": 3.2620682450798557e-06, + "loss": 1.2058, + "step": 5924 + }, + { + "epoch": 0.8388192822255256, + "grad_norm": 10.84174814352451, + "learning_rate": 3.2615223053242924e-06, + "loss": 1.3454, + "step": 5925 + }, + { + "epoch": 0.8389608551001628, + "grad_norm": 8.120361412536688, + "learning_rate": 3.2609763255360696e-06, + "loss": 1.1696, + "step": 5926 + }, + { + "epoch": 0.8391024279748001, + "grad_norm": 7.961398694396769, + "learning_rate": 3.2604303057438883e-06, + "loss": 1.1316, + "step": 5927 + }, + { + "epoch": 0.8392440008494373, + "grad_norm": 9.398236178226144, + "learning_rate": 3.2598842459764535e-06, + "loss": 1.2886, + "step": 5928 + }, + { + "epoch": 0.8393855737240745, + "grad_norm": 10.35071525129442, + "learning_rate": 3.2593381462624705e-06, + "loss": 1.2996, + "step": 5929 + }, + { + "epoch": 0.8395271465987116, + "grad_norm": 7.930304923238268, + "learning_rate": 3.2587920066306474e-06, + "loss": 1.2238, + "step": 5930 + }, + { + "epoch": 0.8396687194733489, + "grad_norm": 8.11050812847449, + "learning_rate": 3.258245827109693e-06, + "loss": 1.209, + "step": 5931 + }, + { + "epoch": 0.8398102923479861, + "grad_norm": 7.8951229154985345, + "learning_rate": 3.2576996077283222e-06, + "loss": 1.0637, + "step": 5932 + }, + { + "epoch": 0.8399518652226233, + "grad_norm": 8.496254488422016, + "learning_rate": 3.2571533485152485e-06, + "loss": 1.194, + "step": 5933 + }, + { + "epoch": 
0.8400934380972606, + "grad_norm": 10.145973617137413, + "learning_rate": 3.256607049499187e-06, + "loss": 1.2642, + "step": 5934 + }, + { + "epoch": 0.8402350109718978, + "grad_norm": 10.435746279736733, + "learning_rate": 3.256060710708857e-06, + "loss": 1.3127, + "step": 5935 + }, + { + "epoch": 0.840376583846535, + "grad_norm": 10.192721848177571, + "learning_rate": 3.255514332172979e-06, + "loss": 1.2476, + "step": 5936 + }, + { + "epoch": 0.8405181567211722, + "grad_norm": 10.540442555237133, + "learning_rate": 3.2549679139202756e-06, + "loss": 1.1851, + "step": 5937 + }, + { + "epoch": 0.8406597295958095, + "grad_norm": 7.808616710665342, + "learning_rate": 3.254421455979472e-06, + "loss": 1.2253, + "step": 5938 + }, + { + "epoch": 0.8408013024704467, + "grad_norm": 10.094010388112943, + "learning_rate": 3.253874958379296e-06, + "loss": 1.216, + "step": 5939 + }, + { + "epoch": 0.8409428753450838, + "grad_norm": 9.84151876684608, + "learning_rate": 3.253328421148475e-06, + "loss": 1.1844, + "step": 5940 + }, + { + "epoch": 0.8410844482197211, + "grad_norm": 8.648513366311324, + "learning_rate": 3.2527818443157406e-06, + "loss": 1.2491, + "step": 5941 + }, + { + "epoch": 0.8412260210943583, + "grad_norm": 6.803487152847784, + "learning_rate": 3.2522352279098256e-06, + "loss": 1.1703, + "step": 5942 + }, + { + "epoch": 0.8413675939689955, + "grad_norm": 7.481080160228838, + "learning_rate": 3.251688571959466e-06, + "loss": 1.305, + "step": 5943 + }, + { + "epoch": 0.8415091668436327, + "grad_norm": 8.045954323847813, + "learning_rate": 3.2511418764933983e-06, + "loss": 1.3141, + "step": 5944 + }, + { + "epoch": 0.84165073971827, + "grad_norm": 8.906669740657314, + "learning_rate": 3.2505951415403625e-06, + "loss": 1.2247, + "step": 5945 + }, + { + "epoch": 0.8417923125929072, + "grad_norm": 7.575555802403678, + "learning_rate": 3.2500483671290993e-06, + "loss": 1.318, + "step": 5946 + }, + { + "epoch": 0.8419338854675444, + "grad_norm": 9.327365965768417, + "learning_rate": 3.2495015532883533e-06, + "loss": 1.4416, + "step": 5947 + }, + { + "epoch": 0.8420754583421817, + "grad_norm": 8.152872021845422, + "learning_rate": 3.248954700046869e-06, + "loss": 1.182, + "step": 5948 + }, + { + "epoch": 0.8422170312168189, + "grad_norm": 9.350193825410996, + "learning_rate": 3.248407807433396e-06, + "loss": 1.2032, + "step": 5949 + }, + { + "epoch": 0.8423586040914561, + "grad_norm": 7.698327851487255, + "learning_rate": 3.2478608754766804e-06, + "loss": 1.2997, + "step": 5950 + }, + { + "epoch": 0.8425001769660933, + "grad_norm": 8.673698604710193, + "learning_rate": 3.2473139042054773e-06, + "loss": 1.3685, + "step": 5951 + }, + { + "epoch": 0.8426417498407305, + "grad_norm": 9.475962696135664, + "learning_rate": 3.2467668936485397e-06, + "loss": 1.3351, + "step": 5952 + }, + { + "epoch": 0.8427833227153677, + "grad_norm": 8.400287768566418, + "learning_rate": 3.2462198438346227e-06, + "loss": 1.125, + "step": 5953 + }, + { + "epoch": 0.8429248955900049, + "grad_norm": 9.852919603814815, + "learning_rate": 3.2456727547924855e-06, + "loss": 1.1806, + "step": 5954 + }, + { + "epoch": 0.8430664684646422, + "grad_norm": 8.749423198762027, + "learning_rate": 3.245125626550888e-06, + "loss": 1.1974, + "step": 5955 + }, + { + "epoch": 0.8432080413392794, + "grad_norm": 7.781660321918624, + "learning_rate": 3.244578459138591e-06, + "loss": 1.2147, + "step": 5956 + }, + { + "epoch": 0.8433496142139166, + "grad_norm": 8.094941393974079, + "learning_rate": 3.2440312525843596e-06, + "loss": 1.1997, + 
"step": 5957 + }, + { + "epoch": 0.8434911870885539, + "grad_norm": 7.639718031148727, + "learning_rate": 3.24348400691696e-06, + "loss": 1.0505, + "step": 5958 + }, + { + "epoch": 0.8436327599631911, + "grad_norm": 9.651538279575405, + "learning_rate": 3.2429367221651603e-06, + "loss": 1.1792, + "step": 5959 + }, + { + "epoch": 0.8437743328378283, + "grad_norm": 11.092514098190486, + "learning_rate": 3.242389398357732e-06, + "loss": 1.353, + "step": 5960 + }, + { + "epoch": 0.8439159057124654, + "grad_norm": 10.22972222481505, + "learning_rate": 3.2418420355234466e-06, + "loss": 1.3402, + "step": 5961 + }, + { + "epoch": 0.8440574785871027, + "grad_norm": 9.32096774794875, + "learning_rate": 3.2412946336910778e-06, + "loss": 1.3673, + "step": 5962 + }, + { + "epoch": 0.8441990514617399, + "grad_norm": 6.828757948531263, + "learning_rate": 3.240747192889403e-06, + "loss": 1.183, + "step": 5963 + }, + { + "epoch": 0.8443406243363771, + "grad_norm": 8.794795873028674, + "learning_rate": 3.240199713147201e-06, + "loss": 1.2729, + "step": 5964 + }, + { + "epoch": 0.8444821972110144, + "grad_norm": 8.25533468990114, + "learning_rate": 3.239652194493251e-06, + "loss": 1.2694, + "step": 5965 + }, + { + "epoch": 0.8446237700856516, + "grad_norm": 11.079490588214602, + "learning_rate": 3.2391046369563374e-06, + "loss": 1.2609, + "step": 5966 + }, + { + "epoch": 0.8447653429602888, + "grad_norm": 8.438377052217483, + "learning_rate": 3.2385570405652444e-06, + "loss": 1.1424, + "step": 5967 + }, + { + "epoch": 0.844906915834926, + "grad_norm": 9.007149293939804, + "learning_rate": 3.2380094053487576e-06, + "loss": 1.228, + "step": 5968 + }, + { + "epoch": 0.8450484887095633, + "grad_norm": 8.81846773708338, + "learning_rate": 3.237461731335667e-06, + "loss": 1.4507, + "step": 5969 + }, + { + "epoch": 0.8451900615842005, + "grad_norm": 7.5825670574781165, + "learning_rate": 3.2369140185547643e-06, + "loss": 1.0591, + "step": 5970 + }, + { + "epoch": 0.8453316344588376, + "grad_norm": 10.779274048999659, + "learning_rate": 3.23636626703484e-06, + "loss": 1.3676, + "step": 5971 + }, + { + "epoch": 0.8454732073334749, + "grad_norm": 7.6259790010921, + "learning_rate": 3.2358184768046895e-06, + "loss": 1.2247, + "step": 5972 + }, + { + "epoch": 0.8456147802081121, + "grad_norm": 11.201681514942614, + "learning_rate": 3.235270647893111e-06, + "loss": 1.1276, + "step": 5973 + }, + { + "epoch": 0.8457563530827493, + "grad_norm": 9.818074774881085, + "learning_rate": 3.2347227803289027e-06, + "loss": 1.2806, + "step": 5974 + }, + { + "epoch": 0.8458979259573866, + "grad_norm": 7.222767954365378, + "learning_rate": 3.234174874140866e-06, + "loss": 1.1001, + "step": 5975 + }, + { + "epoch": 0.8460394988320238, + "grad_norm": 11.55956532119781, + "learning_rate": 3.2336269293578032e-06, + "loss": 1.2543, + "step": 5976 + }, + { + "epoch": 0.846181071706661, + "grad_norm": 8.873289561327308, + "learning_rate": 3.23307894600852e-06, + "loss": 1.3062, + "step": 5977 + }, + { + "epoch": 0.8463226445812982, + "grad_norm": 10.334516108921907, + "learning_rate": 3.2325309241218227e-06, + "loss": 1.2763, + "step": 5978 + }, + { + "epoch": 0.8464642174559355, + "grad_norm": 9.900144988260234, + "learning_rate": 3.2319828637265217e-06, + "loss": 1.2396, + "step": 5979 + }, + { + "epoch": 0.8466057903305727, + "grad_norm": 10.179822404132967, + "learning_rate": 3.2314347648514265e-06, + "loss": 1.359, + "step": 5980 + }, + { + "epoch": 0.8467473632052099, + "grad_norm": 7.872244216480219, + "learning_rate": 
3.2308866275253516e-06, + "loss": 1.2057, + "step": 5981 + }, + { + "epoch": 0.846888936079847, + "grad_norm": 8.396227907007185, + "learning_rate": 3.230338451777112e-06, + "loss": 1.1566, + "step": 5982 + }, + { + "epoch": 0.8470305089544843, + "grad_norm": 11.956201733313774, + "learning_rate": 3.2297902376355238e-06, + "loss": 1.3092, + "step": 5983 + }, + { + "epoch": 0.8471720818291215, + "grad_norm": 9.012131038737452, + "learning_rate": 3.2292419851294072e-06, + "loss": 1.151, + "step": 5984 + }, + { + "epoch": 0.8473136547037587, + "grad_norm": 10.014262137920706, + "learning_rate": 3.2286936942875837e-06, + "loss": 1.2233, + "step": 5985 + }, + { + "epoch": 0.847455227578396, + "grad_norm": 9.311658987689967, + "learning_rate": 3.2281453651388755e-06, + "loss": 1.4721, + "step": 5986 + }, + { + "epoch": 0.8475968004530332, + "grad_norm": 8.422596520142307, + "learning_rate": 3.227596997712108e-06, + "loss": 1.2042, + "step": 5987 + }, + { + "epoch": 0.8477383733276704, + "grad_norm": 10.057164643623237, + "learning_rate": 3.2270485920361093e-06, + "loss": 1.2514, + "step": 5988 + }, + { + "epoch": 0.8478799462023077, + "grad_norm": 8.329368970766746, + "learning_rate": 3.2265001481397084e-06, + "loss": 1.267, + "step": 5989 + }, + { + "epoch": 0.8480215190769449, + "grad_norm": 8.052719926693927, + "learning_rate": 3.225951666051736e-06, + "loss": 1.1719, + "step": 5990 + }, + { + "epoch": 0.8481630919515821, + "grad_norm": 7.057432263840115, + "learning_rate": 3.225403145801026e-06, + "loss": 1.3219, + "step": 5991 + }, + { + "epoch": 0.8483046648262192, + "grad_norm": 8.847047440422479, + "learning_rate": 3.2248545874164145e-06, + "loss": 1.2698, + "step": 5992 + }, + { + "epoch": 0.8484462377008565, + "grad_norm": 7.280664248501923, + "learning_rate": 3.2243059909267367e-06, + "loss": 1.297, + "step": 5993 + }, + { + "epoch": 0.8485878105754937, + "grad_norm": 8.48797879336312, + "learning_rate": 3.2237573563608333e-06, + "loss": 1.311, + "step": 5994 + }, + { + "epoch": 0.8487293834501309, + "grad_norm": 8.988031694710315, + "learning_rate": 3.2232086837475444e-06, + "loss": 1.4122, + "step": 5995 + }, + { + "epoch": 0.8488709563247682, + "grad_norm": 9.963019849742707, + "learning_rate": 3.222659973115715e-06, + "loss": 1.2663, + "step": 5996 + }, + { + "epoch": 0.8490125291994054, + "grad_norm": 8.693258668465782, + "learning_rate": 3.2221112244941905e-06, + "loss": 1.2414, + "step": 5997 + }, + { + "epoch": 0.8491541020740426, + "grad_norm": 8.520684151559928, + "learning_rate": 3.2215624379118164e-06, + "loss": 1.1709, + "step": 5998 + }, + { + "epoch": 0.8492956749486799, + "grad_norm": 7.9094366956676145, + "learning_rate": 3.2210136133974434e-06, + "loss": 1.2435, + "step": 5999 + }, + { + "epoch": 0.8494372478233171, + "grad_norm": 9.976226394715253, + "learning_rate": 3.220464750979922e-06, + "loss": 1.2943, + "step": 6000 + }, + { + "epoch": 0.8495788206979543, + "grad_norm": 7.314166800136883, + "learning_rate": 3.219915850688106e-06, + "loss": 1.2773, + "step": 6001 + }, + { + "epoch": 0.8497203935725914, + "grad_norm": 7.5908735188256555, + "learning_rate": 3.2193669125508504e-06, + "loss": 1.2011, + "step": 6002 + }, + { + "epoch": 0.8498619664472287, + "grad_norm": 9.56006985371499, + "learning_rate": 3.218817936597013e-06, + "loss": 1.1682, + "step": 6003 + }, + { + "epoch": 0.8500035393218659, + "grad_norm": 8.236316314536825, + "learning_rate": 3.218268922855452e-06, + "loss": 1.2421, + "step": 6004 + }, + { + "epoch": 0.8501451121965031, + "grad_norm": 
9.454581660754897, + "learning_rate": 3.2177198713550295e-06, + "loss": 1.1552, + "step": 6005 + }, + { + "epoch": 0.8502866850711404, + "grad_norm": 8.564607903746783, + "learning_rate": 3.2171707821246083e-06, + "loss": 1.1245, + "step": 6006 + }, + { + "epoch": 0.8504282579457776, + "grad_norm": 9.372428846641172, + "learning_rate": 3.216621655193055e-06, + "loss": 1.1873, + "step": 6007 + }, + { + "epoch": 0.8505698308204148, + "grad_norm": 9.311787213178555, + "learning_rate": 3.216072490589235e-06, + "loss": 1.3156, + "step": 6008 + }, + { + "epoch": 0.850711403695052, + "grad_norm": 8.376580843543383, + "learning_rate": 3.2155232883420172e-06, + "loss": 1.3195, + "step": 6009 + }, + { + "epoch": 0.8508529765696893, + "grad_norm": 9.64917127776558, + "learning_rate": 3.2149740484802736e-06, + "loss": 1.1685, + "step": 6010 + }, + { + "epoch": 0.8509945494443265, + "grad_norm": 7.860700916361363, + "learning_rate": 3.2144247710328787e-06, + "loss": 1.2483, + "step": 6011 + }, + { + "epoch": 0.8511361223189637, + "grad_norm": 7.960749420849317, + "learning_rate": 3.2138754560287057e-06, + "loss": 1.0661, + "step": 6012 + }, + { + "epoch": 0.8512776951936009, + "grad_norm": 10.04510548909813, + "learning_rate": 3.2133261034966325e-06, + "loss": 1.2465, + "step": 6013 + }, + { + "epoch": 0.8514192680682381, + "grad_norm": 10.601575813481876, + "learning_rate": 3.2127767134655374e-06, + "loss": 1.3014, + "step": 6014 + }, + { + "epoch": 0.8515608409428753, + "grad_norm": 7.71862372785442, + "learning_rate": 3.2122272859643022e-06, + "loss": 1.2404, + "step": 6015 + }, + { + "epoch": 0.8517024138175125, + "grad_norm": 9.810915679348446, + "learning_rate": 3.2116778210218103e-06, + "loss": 1.1378, + "step": 6016 + }, + { + "epoch": 0.8518439866921498, + "grad_norm": 8.481789772887845, + "learning_rate": 3.211128318666945e-06, + "loss": 1.395, + "step": 6017 + }, + { + "epoch": 0.851985559566787, + "grad_norm": 8.790704707690534, + "learning_rate": 3.2105787789285947e-06, + "loss": 1.1797, + "step": 6018 + }, + { + "epoch": 0.8521271324414242, + "grad_norm": 9.415893072799756, + "learning_rate": 3.2100292018356477e-06, + "loss": 1.3236, + "step": 6019 + }, + { + "epoch": 0.8522687053160615, + "grad_norm": 9.083268564909089, + "learning_rate": 3.209479587416995e-06, + "loss": 1.1641, + "step": 6020 + }, + { + "epoch": 0.8524102781906987, + "grad_norm": 8.653164640799414, + "learning_rate": 3.208929935701529e-06, + "loss": 1.2693, + "step": 6021 + }, + { + "epoch": 0.8525518510653359, + "grad_norm": 9.603643330005594, + "learning_rate": 3.2083802467181452e-06, + "loss": 1.3006, + "step": 6022 + }, + { + "epoch": 0.852693423939973, + "grad_norm": 8.733038893921972, + "learning_rate": 3.2078305204957406e-06, + "loss": 1.2698, + "step": 6023 + }, + { + "epoch": 0.8528349968146103, + "grad_norm": 10.092009875643113, + "learning_rate": 3.2072807570632125e-06, + "loss": 1.3358, + "step": 6024 + }, + { + "epoch": 0.8529765696892475, + "grad_norm": 10.350414514764198, + "learning_rate": 3.2067309564494626e-06, + "loss": 1.0834, + "step": 6025 + }, + { + "epoch": 0.8531181425638847, + "grad_norm": 7.626689035920815, + "learning_rate": 3.206181118683393e-06, + "loss": 1.1821, + "step": 6026 + }, + { + "epoch": 0.853259715438522, + "grad_norm": 10.659402241220505, + "learning_rate": 3.205631243793909e-06, + "loss": 1.2764, + "step": 6027 + }, + { + "epoch": 0.8534012883131592, + "grad_norm": 9.231815919465598, + "learning_rate": 3.2050813318099166e-06, + "loss": 1.2872, + "step": 6028 + }, + { + "epoch": 
0.8535428611877964, + "grad_norm": 9.655401396267628, + "learning_rate": 3.204531382760325e-06, + "loss": 1.2321, + "step": 6029 + }, + { + "epoch": 0.8536844340624337, + "grad_norm": 8.710313441070832, + "learning_rate": 3.203981396674043e-06, + "loss": 1.2565, + "step": 6030 + }, + { + "epoch": 0.8538260069370709, + "grad_norm": 9.490206840680884, + "learning_rate": 3.2034313735799837e-06, + "loss": 1.3511, + "step": 6031 + }, + { + "epoch": 0.8539675798117081, + "grad_norm": 8.059488369816233, + "learning_rate": 3.2028813135070625e-06, + "loss": 1.2268, + "step": 6032 + }, + { + "epoch": 0.8541091526863452, + "grad_norm": 9.852717502054597, + "learning_rate": 3.2023312164841937e-06, + "loss": 1.3017, + "step": 6033 + }, + { + "epoch": 0.8542507255609825, + "grad_norm": 8.497331536989229, + "learning_rate": 3.201781082540297e-06, + "loss": 1.172, + "step": 6034 + }, + { + "epoch": 0.8543922984356197, + "grad_norm": 7.972992848342258, + "learning_rate": 3.201230911704292e-06, + "loss": 1.2276, + "step": 6035 + }, + { + "epoch": 0.8545338713102569, + "grad_norm": 9.667958491156174, + "learning_rate": 3.2006807040051013e-06, + "loss": 1.4034, + "step": 6036 + }, + { + "epoch": 0.8546754441848942, + "grad_norm": 10.002536070629203, + "learning_rate": 3.2001304594716476e-06, + "loss": 1.3692, + "step": 6037 + }, + { + "epoch": 0.8548170170595314, + "grad_norm": 10.498332663261158, + "learning_rate": 3.1995801781328585e-06, + "loss": 1.2534, + "step": 6038 + }, + { + "epoch": 0.8549585899341686, + "grad_norm": 9.011609960196182, + "learning_rate": 3.1990298600176607e-06, + "loss": 1.3948, + "step": 6039 + }, + { + "epoch": 0.8551001628088059, + "grad_norm": 8.275613039479264, + "learning_rate": 3.198479505154984e-06, + "loss": 1.2394, + "step": 6040 + }, + { + "epoch": 0.8552417356834431, + "grad_norm": 8.0969265039912, + "learning_rate": 3.197929113573761e-06, + "loss": 1.1914, + "step": 6041 + }, + { + "epoch": 0.8553833085580803, + "grad_norm": 8.167269107110942, + "learning_rate": 3.197378685302925e-06, + "loss": 1.2016, + "step": 6042 + }, + { + "epoch": 0.8555248814327175, + "grad_norm": 8.928372504061928, + "learning_rate": 3.196828220371411e-06, + "loss": 1.2362, + "step": 6043 + }, + { + "epoch": 0.8556664543073547, + "grad_norm": 9.760376714721684, + "learning_rate": 3.196277718808157e-06, + "loss": 1.1691, + "step": 6044 + }, + { + "epoch": 0.8558080271819919, + "grad_norm": 9.785750746037538, + "learning_rate": 3.195727180642104e-06, + "loss": 1.3898, + "step": 6045 + }, + { + "epoch": 0.8559496000566291, + "grad_norm": 8.369219706817152, + "learning_rate": 3.1951766059021905e-06, + "loss": 1.2656, + "step": 6046 + }, + { + "epoch": 0.8560911729312664, + "grad_norm": 8.68670562844701, + "learning_rate": 3.1946259946173607e-06, + "loss": 1.2107, + "step": 6047 + }, + { + "epoch": 0.8562327458059036, + "grad_norm": 9.268045504876891, + "learning_rate": 3.1940753468165607e-06, + "loss": 1.4617, + "step": 6048 + }, + { + "epoch": 0.8563743186805408, + "grad_norm": 8.548751597141473, + "learning_rate": 3.193524662528738e-06, + "loss": 1.3136, + "step": 6049 + }, + { + "epoch": 0.856515891555178, + "grad_norm": 10.551033613003723, + "learning_rate": 3.192973941782841e-06, + "loss": 1.3367, + "step": 6050 + }, + { + "epoch": 0.8566574644298153, + "grad_norm": 8.335216258629707, + "learning_rate": 3.1924231846078198e-06, + "loss": 1.1175, + "step": 6051 + }, + { + "epoch": 0.8567990373044525, + "grad_norm": 8.372705857077545, + "learning_rate": 3.1918723910326283e-06, + "loss": 1.3324, + 
"step": 6052 + }, + { + "epoch": 0.8569406101790897, + "grad_norm": 8.86869953724522, + "learning_rate": 3.1913215610862208e-06, + "loss": 1.1618, + "step": 6053 + }, + { + "epoch": 0.8570821830537269, + "grad_norm": 8.900697282200705, + "learning_rate": 3.1907706947975546e-06, + "loss": 1.2607, + "step": 6054 + }, + { + "epoch": 0.8572237559283641, + "grad_norm": 8.509305908729562, + "learning_rate": 3.190219792195588e-06, + "loss": 1.1501, + "step": 6055 + }, + { + "epoch": 0.8573653288030013, + "grad_norm": 8.656242067629488, + "learning_rate": 3.189668853309282e-06, + "loss": 1.166, + "step": 6056 + }, + { + "epoch": 0.8575069016776385, + "grad_norm": 7.400062004035791, + "learning_rate": 3.189117878167598e-06, + "loss": 1.2186, + "step": 6057 + }, + { + "epoch": 0.8576484745522758, + "grad_norm": 8.890661470844572, + "learning_rate": 3.1885668667995006e-06, + "loss": 1.2709, + "step": 6058 + }, + { + "epoch": 0.857790047426913, + "grad_norm": 10.507634974685335, + "learning_rate": 3.1880158192339574e-06, + "loss": 1.238, + "step": 6059 + }, + { + "epoch": 0.8579316203015502, + "grad_norm": 7.893747931666643, + "learning_rate": 3.1874647354999354e-06, + "loss": 1.2133, + "step": 6060 + }, + { + "epoch": 0.8580731931761875, + "grad_norm": 8.877500369889196, + "learning_rate": 3.186913615626405e-06, + "loss": 1.2945, + "step": 6061 + }, + { + "epoch": 0.8582147660508247, + "grad_norm": 7.831505203655477, + "learning_rate": 3.186362459642337e-06, + "loss": 1.1996, + "step": 6062 + }, + { + "epoch": 0.8583563389254619, + "grad_norm": 8.21376449163375, + "learning_rate": 3.1858112675767074e-06, + "loss": 1.2313, + "step": 6063 + }, + { + "epoch": 0.8584979118000992, + "grad_norm": 9.67649442580104, + "learning_rate": 3.18526003945849e-06, + "loss": 1.3579, + "step": 6064 + }, + { + "epoch": 0.8586394846747363, + "grad_norm": 9.595758581981514, + "learning_rate": 3.184708775316663e-06, + "loss": 1.164, + "step": 6065 + }, + { + "epoch": 0.8587810575493735, + "grad_norm": 8.67610573881022, + "learning_rate": 3.184157475180208e-06, + "loss": 1.3042, + "step": 6066 + }, + { + "epoch": 0.8589226304240107, + "grad_norm": 8.2723345286979, + "learning_rate": 3.183606139078103e-06, + "loss": 1.2736, + "step": 6067 + }, + { + "epoch": 0.859064203298648, + "grad_norm": 8.983256129379544, + "learning_rate": 3.1830547670393337e-06, + "loss": 1.2973, + "step": 6068 + }, + { + "epoch": 0.8592057761732852, + "grad_norm": 8.265092396297572, + "learning_rate": 3.1825033590928844e-06, + "loss": 1.2415, + "step": 6069 + }, + { + "epoch": 0.8593473490479224, + "grad_norm": 8.016236042133684, + "learning_rate": 3.181951915267742e-06, + "loss": 1.2429, + "step": 6070 + }, + { + "epoch": 0.8594889219225597, + "grad_norm": 8.923258926296079, + "learning_rate": 3.181400435592897e-06, + "loss": 1.0945, + "step": 6071 + }, + { + "epoch": 0.8596304947971969, + "grad_norm": 9.033573683277893, + "learning_rate": 3.180848920097338e-06, + "loss": 1.3172, + "step": 6072 + }, + { + "epoch": 0.8597720676718341, + "grad_norm": 7.908890234335632, + "learning_rate": 3.1802973688100596e-06, + "loss": 1.2942, + "step": 6073 + }, + { + "epoch": 0.8599136405464713, + "grad_norm": 7.9609044367259925, + "learning_rate": 3.179745781760055e-06, + "loss": 1.1271, + "step": 6074 + }, + { + "epoch": 0.8600552134211085, + "grad_norm": 9.097553100263244, + "learning_rate": 3.1791941589763225e-06, + "loss": 1.1573, + "step": 6075 + }, + { + "epoch": 0.8601967862957457, + "grad_norm": 9.419602153176168, + "learning_rate": 
3.178642500487859e-06, + "loss": 1.2492, + "step": 6076 + }, + { + "epoch": 0.8603383591703829, + "grad_norm": 8.22695663971543, + "learning_rate": 3.1780908063236653e-06, + "loss": 1.198, + "step": 6077 + }, + { + "epoch": 0.8604799320450202, + "grad_norm": 10.67159725514762, + "learning_rate": 3.1775390765127433e-06, + "loss": 1.2491, + "step": 6078 + }, + { + "epoch": 0.8606215049196574, + "grad_norm": 9.910763647872164, + "learning_rate": 3.1769873110840977e-06, + "loss": 1.2277, + "step": 6079 + }, + { + "epoch": 0.8607630777942946, + "grad_norm": 9.335910532126322, + "learning_rate": 3.176435510066734e-06, + "loss": 1.1241, + "step": 6080 + }, + { + "epoch": 0.8609046506689318, + "grad_norm": 10.707512216714663, + "learning_rate": 3.175883673489659e-06, + "loss": 1.1552, + "step": 6081 + }, + { + "epoch": 0.8610462235435691, + "grad_norm": 7.7651241762704855, + "learning_rate": 3.1753318013818848e-06, + "loss": 1.2507, + "step": 6082 + }, + { + "epoch": 0.8611877964182063, + "grad_norm": 8.810932425822045, + "learning_rate": 3.1747798937724207e-06, + "loss": 1.2015, + "step": 6083 + }, + { + "epoch": 0.8613293692928435, + "grad_norm": 10.945875002275043, + "learning_rate": 3.1742279506902798e-06, + "loss": 1.2305, + "step": 6084 + }, + { + "epoch": 0.8614709421674807, + "grad_norm": 10.208743547774072, + "learning_rate": 3.173675972164479e-06, + "loss": 1.1556, + "step": 6085 + }, + { + "epoch": 0.8616125150421179, + "grad_norm": 8.60718454510283, + "learning_rate": 3.1731239582240343e-06, + "loss": 1.2858, + "step": 6086 + }, + { + "epoch": 0.8617540879167551, + "grad_norm": 8.708075219911949, + "learning_rate": 3.1725719088979655e-06, + "loss": 1.2691, + "step": 6087 + }, + { + "epoch": 0.8618956607913923, + "grad_norm": 7.826167975573024, + "learning_rate": 3.172019824215293e-06, + "loss": 1.3113, + "step": 6088 + }, + { + "epoch": 0.8620372336660296, + "grad_norm": 10.18467326820444, + "learning_rate": 3.171467704205039e-06, + "loss": 1.3, + "step": 6089 + }, + { + "epoch": 0.8621788065406668, + "grad_norm": 9.601817960266162, + "learning_rate": 3.1709155488962283e-06, + "loss": 1.2323, + "step": 6090 + }, + { + "epoch": 0.862320379415304, + "grad_norm": 6.853295484406172, + "learning_rate": 3.1703633583178885e-06, + "loss": 1.0844, + "step": 6091 + }, + { + "epoch": 0.8624619522899413, + "grad_norm": 7.907625945611181, + "learning_rate": 3.1698111324990454e-06, + "loss": 1.1177, + "step": 6092 + }, + { + "epoch": 0.8626035251645785, + "grad_norm": 7.203794369657736, + "learning_rate": 3.169258871468731e-06, + "loss": 1.1412, + "step": 6093 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 9.315047562089223, + "learning_rate": 3.1687065752559777e-06, + "loss": 1.3111, + "step": 6094 + }, + { + "epoch": 0.862886670913853, + "grad_norm": 10.154362429762909, + "learning_rate": 3.168154243889817e-06, + "loss": 1.162, + "step": 6095 + }, + { + "epoch": 0.8630282437884901, + "grad_norm": 9.841352479610192, + "learning_rate": 3.1676018773992866e-06, + "loss": 1.2833, + "step": 6096 + }, + { + "epoch": 0.8631698166631273, + "grad_norm": 9.929264134611527, + "learning_rate": 3.1670494758134234e-06, + "loss": 1.2461, + "step": 6097 + }, + { + "epoch": 0.8633113895377645, + "grad_norm": 10.576350955028193, + "learning_rate": 3.1664970391612666e-06, + "loss": 1.2649, + "step": 6098 + }, + { + "epoch": 0.8634529624124018, + "grad_norm": 9.369578510522432, + "learning_rate": 3.1659445674718563e-06, + "loss": 1.1844, + "step": 6099 + }, + { + "epoch": 0.863594535287039, + "grad_norm": 
9.874847024626034, + "learning_rate": 3.165392060774238e-06, + "loss": 1.2554, + "step": 6100 + }, + { + "epoch": 0.8637361081616762, + "grad_norm": 9.161155662430863, + "learning_rate": 3.1648395190974546e-06, + "loss": 1.2064, + "step": 6101 + }, + { + "epoch": 0.8638776810363135, + "grad_norm": 10.498626619166522, + "learning_rate": 3.1642869424705537e-06, + "loss": 1.3747, + "step": 6102 + }, + { + "epoch": 0.8640192539109507, + "grad_norm": 8.37863763409754, + "learning_rate": 3.1637343309225833e-06, + "loss": 1.1301, + "step": 6103 + }, + { + "epoch": 0.8641608267855879, + "grad_norm": 8.209253198295839, + "learning_rate": 3.163181684482594e-06, + "loss": 1.2942, + "step": 6104 + }, + { + "epoch": 0.8643023996602252, + "grad_norm": 8.717902467067004, + "learning_rate": 3.162629003179638e-06, + "loss": 1.0343, + "step": 6105 + }, + { + "epoch": 0.8644439725348623, + "grad_norm": 9.063624660817961, + "learning_rate": 3.1620762870427703e-06, + "loss": 1.2129, + "step": 6106 + }, + { + "epoch": 0.8645855454094995, + "grad_norm": 8.170367070780177, + "learning_rate": 3.1615235361010442e-06, + "loss": 1.2354, + "step": 6107 + }, + { + "epoch": 0.8647271182841367, + "grad_norm": 8.73143346180062, + "learning_rate": 3.1609707503835203e-06, + "loss": 1.3822, + "step": 6108 + }, + { + "epoch": 0.864868691158774, + "grad_norm": 8.416385708894223, + "learning_rate": 3.1604179299192565e-06, + "loss": 1.2537, + "step": 6109 + }, + { + "epoch": 0.8650102640334112, + "grad_norm": 8.597989898682806, + "learning_rate": 3.1598650747373144e-06, + "loss": 1.1489, + "step": 6110 + }, + { + "epoch": 0.8651518369080484, + "grad_norm": 9.483470791947363, + "learning_rate": 3.1593121848667575e-06, + "loss": 1.2462, + "step": 6111 + }, + { + "epoch": 0.8652934097826857, + "grad_norm": 9.257287531265487, + "learning_rate": 3.158759260336651e-06, + "loss": 1.1838, + "step": 6112 + }, + { + "epoch": 0.8654349826573229, + "grad_norm": 9.273222357577026, + "learning_rate": 3.1582063011760604e-06, + "loss": 1.3338, + "step": 6113 + }, + { + "epoch": 0.8655765555319601, + "grad_norm": 9.044224484221745, + "learning_rate": 3.1576533074140564e-06, + "loss": 1.1965, + "step": 6114 + }, + { + "epoch": 0.8657181284065973, + "grad_norm": 10.618349035639946, + "learning_rate": 3.157100279079708e-06, + "loss": 1.2321, + "step": 6115 + }, + { + "epoch": 0.8658597012812345, + "grad_norm": 8.14886163355665, + "learning_rate": 3.1565472162020876e-06, + "loss": 1.1802, + "step": 6116 + }, + { + "epoch": 0.8660012741558717, + "grad_norm": 9.188975105460322, + "learning_rate": 3.15599411881027e-06, + "loss": 1.1916, + "step": 6117 + }, + { + "epoch": 0.8661428470305089, + "grad_norm": 9.180630769355785, + "learning_rate": 3.15544098693333e-06, + "loss": 1.1746, + "step": 6118 + }, + { + "epoch": 0.8662844199051462, + "grad_norm": 7.898463821744593, + "learning_rate": 3.1548878206003477e-06, + "loss": 1.2612, + "step": 6119 + }, + { + "epoch": 0.8664259927797834, + "grad_norm": 7.492097633308143, + "learning_rate": 3.1543346198403998e-06, + "loss": 1.2228, + "step": 6120 + }, + { + "epoch": 0.8665675656544206, + "grad_norm": 8.837282221807284, + "learning_rate": 3.1537813846825684e-06, + "loss": 1.0724, + "step": 6121 + }, + { + "epoch": 0.8667091385290578, + "grad_norm": 12.743080206121613, + "learning_rate": 3.1532281151559372e-06, + "loss": 1.3951, + "step": 6122 + }, + { + "epoch": 0.8668507114036951, + "grad_norm": 8.466677253696501, + "learning_rate": 3.152674811289591e-06, + "loss": 1.2453, + "step": 6123 + }, + { + "epoch": 
0.8669922842783323, + "grad_norm": 7.811274562094449, + "learning_rate": 3.152121473112618e-06, + "loss": 1.2697, + "step": 6124 + }, + { + "epoch": 0.8671338571529695, + "grad_norm": 7.440308881931242, + "learning_rate": 3.151568100654104e-06, + "loss": 1.2005, + "step": 6125 + }, + { + "epoch": 0.8672754300276068, + "grad_norm": 12.430934687586097, + "learning_rate": 3.1510146939431414e-06, + "loss": 1.2692, + "step": 6126 + }, + { + "epoch": 0.8674170029022439, + "grad_norm": 8.392846976508851, + "learning_rate": 3.150461253008822e-06, + "loss": 1.2748, + "step": 6127 + }, + { + "epoch": 0.8675585757768811, + "grad_norm": 9.299057648199158, + "learning_rate": 3.149907777880239e-06, + "loss": 1.3429, + "step": 6128 + }, + { + "epoch": 0.8677001486515183, + "grad_norm": 9.282779657459576, + "learning_rate": 3.1493542685864886e-06, + "loss": 1.3, + "step": 6129 + }, + { + "epoch": 0.8678417215261556, + "grad_norm": 16.316471244932796, + "learning_rate": 3.1488007251566687e-06, + "loss": 1.1889, + "step": 6130 + }, + { + "epoch": 0.8679832944007928, + "grad_norm": 8.259359425974958, + "learning_rate": 3.1482471476198784e-06, + "loss": 1.2825, + "step": 6131 + }, + { + "epoch": 0.86812486727543, + "grad_norm": 7.937624232003322, + "learning_rate": 3.1476935360052184e-06, + "loss": 1.3326, + "step": 6132 + }, + { + "epoch": 0.8682664401500673, + "grad_norm": 11.294201630859776, + "learning_rate": 3.1471398903417926e-06, + "loss": 1.1909, + "step": 6133 + }, + { + "epoch": 0.8684080130247045, + "grad_norm": 6.915198143404774, + "learning_rate": 3.146586210658706e-06, + "loss": 1.1509, + "step": 6134 + }, + { + "epoch": 0.8685495858993417, + "grad_norm": 8.833809269973045, + "learning_rate": 3.1460324969850643e-06, + "loss": 1.2966, + "step": 6135 + }, + { + "epoch": 0.868691158773979, + "grad_norm": 8.908806256646118, + "learning_rate": 3.1454787493499746e-06, + "loss": 1.2386, + "step": 6136 + }, + { + "epoch": 0.8688327316486161, + "grad_norm": 9.625118898920645, + "learning_rate": 3.144924967782549e-06, + "loss": 1.3764, + "step": 6137 + }, + { + "epoch": 0.8689743045232533, + "grad_norm": 8.837201069439306, + "learning_rate": 3.144371152311899e-06, + "loss": 1.0703, + "step": 6138 + }, + { + "epoch": 0.8691158773978905, + "grad_norm": 8.921064296573237, + "learning_rate": 3.143817302967138e-06, + "loss": 1.2298, + "step": 6139 + }, + { + "epoch": 0.8692574502725278, + "grad_norm": 8.292923670685298, + "learning_rate": 3.1432634197773814e-06, + "loss": 1.2824, + "step": 6140 + }, + { + "epoch": 0.869399023147165, + "grad_norm": 9.6700617210525, + "learning_rate": 3.142709502771747e-06, + "loss": 1.397, + "step": 6141 + }, + { + "epoch": 0.8695405960218022, + "grad_norm": 9.01959976560161, + "learning_rate": 3.142155551979352e-06, + "loss": 1.4066, + "step": 6142 + }, + { + "epoch": 0.8696821688964395, + "grad_norm": 9.036659600529381, + "learning_rate": 3.1416015674293195e-06, + "loss": 1.1132, + "step": 6143 + }, + { + "epoch": 0.8698237417710767, + "grad_norm": 8.657952206704206, + "learning_rate": 3.14104754915077e-06, + "loss": 1.2351, + "step": 6144 + }, + { + "epoch": 0.8699653146457139, + "grad_norm": 10.163269809596528, + "learning_rate": 3.1404934971728297e-06, + "loss": 1.1825, + "step": 6145 + }, + { + "epoch": 0.8701068875203511, + "grad_norm": 9.084094607681527, + "learning_rate": 3.1399394115246235e-06, + "loss": 1.3014, + "step": 6146 + }, + { + "epoch": 0.8702484603949883, + "grad_norm": 9.720747273294863, + "learning_rate": 3.1393852922352795e-06, + "loss": 1.2511, + 
"step": 6147 + }, + { + "epoch": 0.8703900332696255, + "grad_norm": 7.691501475413566, + "learning_rate": 3.138831139333928e-06, + "loss": 1.1642, + "step": 6148 + }, + { + "epoch": 0.8705316061442627, + "grad_norm": 8.610946041152088, + "learning_rate": 3.1382769528496993e-06, + "loss": 1.2282, + "step": 6149 + }, + { + "epoch": 0.8706731790189, + "grad_norm": 8.25823240196019, + "learning_rate": 3.1377227328117264e-06, + "loss": 1.2698, + "step": 6150 + }, + { + "epoch": 0.8708147518935372, + "grad_norm": 8.726627633066018, + "learning_rate": 3.137168479249146e-06, + "loss": 1.2825, + "step": 6151 + }, + { + "epoch": 0.8709563247681744, + "grad_norm": 9.625126032809177, + "learning_rate": 3.1366141921910936e-06, + "loss": 1.2148, + "step": 6152 + }, + { + "epoch": 0.8710978976428116, + "grad_norm": 8.815340490662207, + "learning_rate": 3.136059871666708e-06, + "loss": 1.2335, + "step": 6153 + }, + { + "epoch": 0.8712394705174489, + "grad_norm": 9.122727620658344, + "learning_rate": 3.1355055177051286e-06, + "loss": 1.2651, + "step": 6154 + }, + { + "epoch": 0.8713810433920861, + "grad_norm": 9.919587119339397, + "learning_rate": 3.1349511303354983e-06, + "loss": 1.2769, + "step": 6155 + }, + { + "epoch": 0.8715226162667233, + "grad_norm": 6.947436940008911, + "learning_rate": 3.134396709586961e-06, + "loss": 1.1077, + "step": 6156 + }, + { + "epoch": 0.8716641891413606, + "grad_norm": 8.691300912283015, + "learning_rate": 3.133842255488661e-06, + "loss": 1.136, + "step": 6157 + }, + { + "epoch": 0.8718057620159977, + "grad_norm": 10.504832654270999, + "learning_rate": 3.133287768069746e-06, + "loss": 1.2395, + "step": 6158 + }, + { + "epoch": 0.8719473348906349, + "grad_norm": 10.272733459716308, + "learning_rate": 3.1327332473593657e-06, + "loss": 1.2766, + "step": 6159 + }, + { + "epoch": 0.8720889077652721, + "grad_norm": 8.582276609487591, + "learning_rate": 3.1321786933866705e-06, + "loss": 1.3397, + "step": 6160 + }, + { + "epoch": 0.8722304806399094, + "grad_norm": 11.051068795415043, + "learning_rate": 3.131624106180813e-06, + "loss": 1.2918, + "step": 6161 + }, + { + "epoch": 0.8723720535145466, + "grad_norm": 9.264466391779619, + "learning_rate": 3.1310694857709467e-06, + "loss": 1.216, + "step": 6162 + }, + { + "epoch": 0.8725136263891838, + "grad_norm": 8.068350155450434, + "learning_rate": 3.130514832186228e-06, + "loss": 1.1015, + "step": 6163 + }, + { + "epoch": 0.8726551992638211, + "grad_norm": 9.545344503189517, + "learning_rate": 3.129960145455815e-06, + "loss": 1.2871, + "step": 6164 + }, + { + "epoch": 0.8727967721384583, + "grad_norm": 9.749367717879837, + "learning_rate": 3.129405425608867e-06, + "loss": 1.1633, + "step": 6165 + }, + { + "epoch": 0.8729383450130955, + "grad_norm": 9.814826057902147, + "learning_rate": 3.128850672674545e-06, + "loss": 1.3736, + "step": 6166 + }, + { + "epoch": 0.8730799178877328, + "grad_norm": 11.054649886751381, + "learning_rate": 3.1282958866820113e-06, + "loss": 1.3458, + "step": 6167 + }, + { + "epoch": 0.8732214907623699, + "grad_norm": 10.868149978720432, + "learning_rate": 3.127741067660432e-06, + "loss": 1.3641, + "step": 6168 + }, + { + "epoch": 0.8733630636370071, + "grad_norm": 10.926928484861666, + "learning_rate": 3.127186215638973e-06, + "loss": 1.1623, + "step": 6169 + }, + { + "epoch": 0.8735046365116443, + "grad_norm": 13.343348048139532, + "learning_rate": 3.1266313306468018e-06, + "loss": 1.2626, + "step": 6170 + }, + { + "epoch": 0.8736462093862816, + "grad_norm": 8.716519196857178, + "learning_rate": 
3.1260764127130887e-06, + "loss": 1.3274, + "step": 6171 + }, + { + "epoch": 0.8737877822609188, + "grad_norm": 8.68069476884897, + "learning_rate": 3.125521461867006e-06, + "loss": 1.2352, + "step": 6172 + }, + { + "epoch": 0.873929355135556, + "grad_norm": 8.593420181881209, + "learning_rate": 3.1249664781377257e-06, + "loss": 1.1867, + "step": 6173 + }, + { + "epoch": 0.8740709280101933, + "grad_norm": 9.156637867078102, + "learning_rate": 3.1244114615544242e-06, + "loss": 1.2022, + "step": 6174 + }, + { + "epoch": 0.8742125008848305, + "grad_norm": 8.704658962100948, + "learning_rate": 3.1238564121462776e-06, + "loss": 1.2517, + "step": 6175 + }, + { + "epoch": 0.8743540737594677, + "grad_norm": 9.489813312265957, + "learning_rate": 3.1233013299424646e-06, + "loss": 1.3737, + "step": 6176 + }, + { + "epoch": 0.874495646634105, + "grad_norm": 7.858112453728899, + "learning_rate": 3.122746214972166e-06, + "loss": 1.2309, + "step": 6177 + }, + { + "epoch": 0.8746372195087421, + "grad_norm": 11.779224170984797, + "learning_rate": 3.122191067264563e-06, + "loss": 1.3173, + "step": 6178 + }, + { + "epoch": 0.8747787923833793, + "grad_norm": 9.597799335014006, + "learning_rate": 3.121635886848839e-06, + "loss": 1.2484, + "step": 6179 + }, + { + "epoch": 0.8749203652580165, + "grad_norm": 8.234741434416701, + "learning_rate": 3.12108067375418e-06, + "loss": 1.1983, + "step": 6180 + }, + { + "epoch": 0.8750619381326538, + "grad_norm": 12.614440732360448, + "learning_rate": 3.120525428009773e-06, + "loss": 1.3316, + "step": 6181 + }, + { + "epoch": 0.875203511007291, + "grad_norm": 9.748421565692619, + "learning_rate": 3.1199701496448074e-06, + "loss": 1.2508, + "step": 6182 + }, + { + "epoch": 0.8753450838819282, + "grad_norm": 8.664674040776731, + "learning_rate": 3.119414838688473e-06, + "loss": 1.0831, + "step": 6183 + }, + { + "epoch": 0.8754866567565655, + "grad_norm": 9.923755665758396, + "learning_rate": 3.1188594951699623e-06, + "loss": 1.3047, + "step": 6184 + }, + { + "epoch": 0.8756282296312027, + "grad_norm": 10.05510065256936, + "learning_rate": 3.1183041191184695e-06, + "loss": 1.2947, + "step": 6185 + }, + { + "epoch": 0.8757698025058399, + "grad_norm": 9.69909210675476, + "learning_rate": 3.11774871056319e-06, + "loss": 1.0044, + "step": 6186 + }, + { + "epoch": 0.8759113753804771, + "grad_norm": 11.551607517158326, + "learning_rate": 3.1171932695333216e-06, + "loss": 1.2423, + "step": 6187 + }, + { + "epoch": 0.8760529482551144, + "grad_norm": 10.346217664958685, + "learning_rate": 3.1166377960580635e-06, + "loss": 1.2481, + "step": 6188 + }, + { + "epoch": 0.8761945211297515, + "grad_norm": 7.19876330669039, + "learning_rate": 3.116082290166616e-06, + "loss": 1.1278, + "step": 6189 + }, + { + "epoch": 0.8763360940043887, + "grad_norm": 10.908496002390967, + "learning_rate": 3.1155267518881816e-06, + "loss": 1.0993, + "step": 6190 + }, + { + "epoch": 0.876477666879026, + "grad_norm": 10.89707512766112, + "learning_rate": 3.114971181251965e-06, + "loss": 1.3674, + "step": 6191 + }, + { + "epoch": 0.8766192397536632, + "grad_norm": 7.913766778239129, + "learning_rate": 3.1144155782871723e-06, + "loss": 1.3281, + "step": 6192 + }, + { + "epoch": 0.8767608126283004, + "grad_norm": 9.805160207105834, + "learning_rate": 3.113859943023011e-06, + "loss": 1.1855, + "step": 6193 + }, + { + "epoch": 0.8769023855029376, + "grad_norm": 11.930608387836148, + "learning_rate": 3.1133042754886896e-06, + "loss": 1.3658, + "step": 6194 + }, + { + "epoch": 0.8770439583775749, + "grad_norm": 
10.675895235281537, + "learning_rate": 3.1127485757134194e-06, + "loss": 1.2449, + "step": 6195 + }, + { + "epoch": 0.8771855312522121, + "grad_norm": 8.063233807903499, + "learning_rate": 3.1121928437264138e-06, + "loss": 1.2144, + "step": 6196 + }, + { + "epoch": 0.8773271041268493, + "grad_norm": 8.35360581440967, + "learning_rate": 3.111637079556887e-06, + "loss": 1.1791, + "step": 6197 + }, + { + "epoch": 0.8774686770014866, + "grad_norm": 9.852241656060515, + "learning_rate": 3.1110812832340552e-06, + "loss": 1.1812, + "step": 6198 + }, + { + "epoch": 0.8776102498761237, + "grad_norm": 8.886815332579557, + "learning_rate": 3.1105254547871354e-06, + "loss": 1.2226, + "step": 6199 + }, + { + "epoch": 0.8777518227507609, + "grad_norm": 14.150947975781737, + "learning_rate": 3.1099695942453485e-06, + "loss": 1.2749, + "step": 6200 + }, + { + "epoch": 0.8778933956253981, + "grad_norm": 12.544345168366702, + "learning_rate": 3.109413701637914e-06, + "loss": 1.166, + "step": 6201 + }, + { + "epoch": 0.8780349685000354, + "grad_norm": 11.015474705313359, + "learning_rate": 3.108857776994056e-06, + "loss": 1.1435, + "step": 6202 + }, + { + "epoch": 0.8781765413746726, + "grad_norm": 8.460712067056804, + "learning_rate": 3.108301820342998e-06, + "loss": 1.2881, + "step": 6203 + }, + { + "epoch": 0.8783181142493098, + "grad_norm": 10.58203521536932, + "learning_rate": 3.107745831713968e-06, + "loss": 1.271, + "step": 6204 + }, + { + "epoch": 0.8784596871239471, + "grad_norm": 10.399197958317243, + "learning_rate": 3.107189811136192e-06, + "loss": 1.2306, + "step": 6205 + }, + { + "epoch": 0.8786012599985843, + "grad_norm": 11.508938176868636, + "learning_rate": 3.1066337586389007e-06, + "loss": 1.2203, + "step": 6206 + }, + { + "epoch": 0.8787428328732215, + "grad_norm": 13.19477650720903, + "learning_rate": 3.1060776742513247e-06, + "loss": 1.289, + "step": 6207 + }, + { + "epoch": 0.8788844057478588, + "grad_norm": 8.389526067904132, + "learning_rate": 3.1055215580026976e-06, + "loss": 1.2408, + "step": 6208 + }, + { + "epoch": 0.879025978622496, + "grad_norm": 8.567153000540376, + "learning_rate": 3.1049654099222542e-06, + "loss": 1.085, + "step": 6209 + }, + { + "epoch": 0.8791675514971331, + "grad_norm": 13.222869736090468, + "learning_rate": 3.104409230039229e-06, + "loss": 1.1543, + "step": 6210 + }, + { + "epoch": 0.8793091243717703, + "grad_norm": 10.778192501238323, + "learning_rate": 3.103853018382862e-06, + "loss": 1.4546, + "step": 6211 + }, + { + "epoch": 0.8794506972464076, + "grad_norm": 10.71419399403868, + "learning_rate": 3.1032967749823917e-06, + "loss": 1.2073, + "step": 6212 + }, + { + "epoch": 0.8795922701210448, + "grad_norm": 11.757383583331197, + "learning_rate": 3.10274049986706e-06, + "loss": 1.4574, + "step": 6213 + }, + { + "epoch": 0.879733842995682, + "grad_norm": 9.64233462416621, + "learning_rate": 3.1021841930661108e-06, + "loss": 1.3004, + "step": 6214 + }, + { + "epoch": 0.8798754158703193, + "grad_norm": 10.823913547889575, + "learning_rate": 3.1016278546087864e-06, + "loss": 1.1158, + "step": 6215 + }, + { + "epoch": 0.8800169887449565, + "grad_norm": 9.058262327513097, + "learning_rate": 3.101071484524334e-06, + "loss": 1.2567, + "step": 6216 + }, + { + "epoch": 0.8801585616195937, + "grad_norm": 10.913446975641413, + "learning_rate": 3.100515082842002e-06, + "loss": 1.3773, + "step": 6217 + }, + { + "epoch": 0.880300134494231, + "grad_norm": 9.231709723366734, + "learning_rate": 3.09995864959104e-06, + "loss": 1.2831, + "step": 6218 + }, + { + "epoch": 
0.8804417073688682, + "grad_norm": 11.593767109572955, + "learning_rate": 3.0994021848006996e-06, + "loss": 1.2663, + "step": 6219 + }, + { + "epoch": 0.8805832802435053, + "grad_norm": 8.250789257790121, + "learning_rate": 3.0988456885002327e-06, + "loss": 1.2291, + "step": 6220 + }, + { + "epoch": 0.8807248531181425, + "grad_norm": 9.724780208192689, + "learning_rate": 3.0982891607188948e-06, + "loss": 1.4442, + "step": 6221 + }, + { + "epoch": 0.8808664259927798, + "grad_norm": 10.207108238563457, + "learning_rate": 3.0977326014859415e-06, + "loss": 1.2751, + "step": 6222 + }, + { + "epoch": 0.881007998867417, + "grad_norm": 10.516041808348872, + "learning_rate": 3.0971760108306316e-06, + "loss": 1.1888, + "step": 6223 + }, + { + "epoch": 0.8811495717420542, + "grad_norm": 9.070451707453499, + "learning_rate": 3.0966193887822232e-06, + "loss": 1.1995, + "step": 6224 + }, + { + "epoch": 0.8812911446166914, + "grad_norm": 10.467348830555297, + "learning_rate": 3.096062735369979e-06, + "loss": 1.0655, + "step": 6225 + }, + { + "epoch": 0.8814327174913287, + "grad_norm": 8.867144049315348, + "learning_rate": 3.095506050623161e-06, + "loss": 1.3422, + "step": 6226 + }, + { + "epoch": 0.8815742903659659, + "grad_norm": 7.997808633127509, + "learning_rate": 3.0949493345710343e-06, + "loss": 1.1864, + "step": 6227 + }, + { + "epoch": 0.8817158632406031, + "grad_norm": 10.800985743430326, + "learning_rate": 3.094392587242864e-06, + "loss": 1.1408, + "step": 6228 + }, + { + "epoch": 0.8818574361152404, + "grad_norm": 12.255609590333076, + "learning_rate": 3.093835808667919e-06, + "loss": 1.1045, + "step": 6229 + }, + { + "epoch": 0.8819990089898775, + "grad_norm": 8.201521420643932, + "learning_rate": 3.0932789988754695e-06, + "loss": 1.3115, + "step": 6230 + }, + { + "epoch": 0.8821405818645147, + "grad_norm": 12.992990217798392, + "learning_rate": 3.0927221578947843e-06, + "loss": 1.4626, + "step": 6231 + }, + { + "epoch": 0.882282154739152, + "grad_norm": 8.699447210880836, + "learning_rate": 3.092165285755137e-06, + "loss": 1.4467, + "step": 6232 + }, + { + "epoch": 0.8824237276137892, + "grad_norm": 7.931901774819256, + "learning_rate": 3.0916083824858017e-06, + "loss": 1.0405, + "step": 6233 + }, + { + "epoch": 0.8825653004884264, + "grad_norm": 19.112844156942685, + "learning_rate": 3.091051448116056e-06, + "loss": 1.293, + "step": 6234 + }, + { + "epoch": 0.8827068733630636, + "grad_norm": 13.97918789099699, + "learning_rate": 3.090494482675176e-06, + "loss": 1.3832, + "step": 6235 + }, + { + "epoch": 0.8828484462377009, + "grad_norm": 10.544778075688807, + "learning_rate": 3.0899374861924413e-06, + "loss": 1.2645, + "step": 6236 + }, + { + "epoch": 0.8829900191123381, + "grad_norm": 8.357674969511322, + "learning_rate": 3.0893804586971327e-06, + "loss": 1.2761, + "step": 6237 + }, + { + "epoch": 0.8831315919869753, + "grad_norm": 9.183390626076694, + "learning_rate": 3.088823400218533e-06, + "loss": 1.2273, + "step": 6238 + }, + { + "epoch": 0.8832731648616126, + "grad_norm": 13.27352601684729, + "learning_rate": 3.0882663107859256e-06, + "loss": 1.2152, + "step": 6239 + }, + { + "epoch": 0.8834147377362498, + "grad_norm": 11.009608840299743, + "learning_rate": 3.0877091904285976e-06, + "loss": 1.2795, + "step": 6240 + }, + { + "epoch": 0.8835563106108869, + "grad_norm": 12.436869389635149, + "learning_rate": 3.087152039175835e-06, + "loss": 1.1974, + "step": 6241 + }, + { + "epoch": 0.8836978834855241, + "grad_norm": 8.58545320375501, + "learning_rate": 3.0865948570569283e-06, + "loss": 
1.2387, + "step": 6242 + }, + { + "epoch": 0.8838394563601614, + "grad_norm": 9.959082624478592, + "learning_rate": 3.086037644101167e-06, + "loss": 1.3251, + "step": 6243 + }, + { + "epoch": 0.8839810292347986, + "grad_norm": 8.662489202143117, + "learning_rate": 3.0854804003378437e-06, + "loss": 1.3044, + "step": 6244 + }, + { + "epoch": 0.8841226021094358, + "grad_norm": 7.929577575823687, + "learning_rate": 3.084923125796252e-06, + "loss": 1.2051, + "step": 6245 + }, + { + "epoch": 0.8842641749840731, + "grad_norm": 11.780150992133375, + "learning_rate": 3.0843658205056886e-06, + "loss": 1.2823, + "step": 6246 + }, + { + "epoch": 0.8844057478587103, + "grad_norm": 9.443724341771068, + "learning_rate": 3.0838084844954485e-06, + "loss": 1.3908, + "step": 6247 + }, + { + "epoch": 0.8845473207333475, + "grad_norm": 10.81132051065373, + "learning_rate": 3.0832511177948326e-06, + "loss": 1.2362, + "step": 6248 + }, + { + "epoch": 0.8846888936079847, + "grad_norm": 8.869660826588069, + "learning_rate": 3.0826937204331403e-06, + "loss": 1.1382, + "step": 6249 + }, + { + "epoch": 0.884830466482622, + "grad_norm": 8.489392562002145, + "learning_rate": 3.0821362924396732e-06, + "loss": 1.3003, + "step": 6250 + }, + { + "epoch": 0.8849720393572591, + "grad_norm": 9.210850528374385, + "learning_rate": 3.081578833843736e-06, + "loss": 1.2464, + "step": 6251 + }, + { + "epoch": 0.8851136122318963, + "grad_norm": 7.777346938174259, + "learning_rate": 3.0810213446746323e-06, + "loss": 1.252, + "step": 6252 + }, + { + "epoch": 0.8852551851065336, + "grad_norm": 7.951372174537412, + "learning_rate": 3.0804638249616704e-06, + "loss": 1.2446, + "step": 6253 + }, + { + "epoch": 0.8853967579811708, + "grad_norm": 7.513421066615898, + "learning_rate": 3.0799062747341574e-06, + "loss": 1.1525, + "step": 6254 + }, + { + "epoch": 0.885538330855808, + "grad_norm": 10.735704009340601, + "learning_rate": 3.0793486940214034e-06, + "loss": 1.4418, + "step": 6255 + }, + { + "epoch": 0.8856799037304453, + "grad_norm": 9.873968529249096, + "learning_rate": 3.0787910828527217e-06, + "loss": 1.0402, + "step": 6256 + }, + { + "epoch": 0.8858214766050825, + "grad_norm": 8.914748463828905, + "learning_rate": 3.0782334412574244e-06, + "loss": 1.1807, + "step": 6257 + }, + { + "epoch": 0.8859630494797197, + "grad_norm": 8.572905295184084, + "learning_rate": 3.0776757692648256e-06, + "loss": 1.1416, + "step": 6258 + }, + { + "epoch": 0.8861046223543569, + "grad_norm": 7.582828658641088, + "learning_rate": 3.0771180669042422e-06, + "loss": 1.1477, + "step": 6259 + }, + { + "epoch": 0.8862461952289942, + "grad_norm": 9.78028743017284, + "learning_rate": 3.076560334204993e-06, + "loss": 1.4051, + "step": 6260 + }, + { + "epoch": 0.8863877681036313, + "grad_norm": 8.25312237463277, + "learning_rate": 3.0760025711963964e-06, + "loss": 1.1215, + "step": 6261 + }, + { + "epoch": 0.8865293409782685, + "grad_norm": 8.5216873847724, + "learning_rate": 3.0754447779077745e-06, + "loss": 1.1845, + "step": 6262 + }, + { + "epoch": 0.8866709138529058, + "grad_norm": 9.089197861265342, + "learning_rate": 3.0748869543684495e-06, + "loss": 1.2284, + "step": 6263 + }, + { + "epoch": 0.886812486727543, + "grad_norm": 8.252472940358402, + "learning_rate": 3.0743291006077458e-06, + "loss": 1.3061, + "step": 6264 + }, + { + "epoch": 0.8869540596021802, + "grad_norm": 8.324055606097213, + "learning_rate": 3.0737712166549897e-06, + "loss": 1.2924, + "step": 6265 + }, + { + "epoch": 0.8870956324768174, + "grad_norm": 9.882026586833828, + 
"learning_rate": 3.073213302539508e-06, + "loss": 1.3726, + "step": 6266 + }, + { + "epoch": 0.8872372053514547, + "grad_norm": 9.356928790689976, + "learning_rate": 3.072655358290632e-06, + "loss": 1.2517, + "step": 6267 + }, + { + "epoch": 0.8873787782260919, + "grad_norm": 8.146684554975801, + "learning_rate": 3.07209738393769e-06, + "loss": 1.122, + "step": 6268 + }, + { + "epoch": 0.8875203511007291, + "grad_norm": 8.935958622685519, + "learning_rate": 3.0715393795100146e-06, + "loss": 1.1829, + "step": 6269 + }, + { + "epoch": 0.8876619239753664, + "grad_norm": 7.780509699033722, + "learning_rate": 3.07098134503694e-06, + "loss": 1.1908, + "step": 6270 + }, + { + "epoch": 0.8878034968500036, + "grad_norm": 6.587059929931397, + "learning_rate": 3.0704232805478025e-06, + "loss": 1.2292, + "step": 6271 + }, + { + "epoch": 0.8879450697246407, + "grad_norm": 8.489507145139982, + "learning_rate": 3.0698651860719387e-06, + "loss": 1.1308, + "step": 6272 + }, + { + "epoch": 0.8880866425992779, + "grad_norm": 8.469093681765916, + "learning_rate": 3.0693070616386862e-06, + "loss": 1.3385, + "step": 6273 + }, + { + "epoch": 0.8882282154739152, + "grad_norm": 7.816546559444701, + "learning_rate": 3.0687489072773864e-06, + "loss": 1.3707, + "step": 6274 + }, + { + "epoch": 0.8883697883485524, + "grad_norm": 9.332008767370365, + "learning_rate": 3.0681907230173803e-06, + "loss": 1.0924, + "step": 6275 + }, + { + "epoch": 0.8885113612231896, + "grad_norm": 9.345183297651214, + "learning_rate": 3.0676325088880122e-06, + "loss": 1.2227, + "step": 6276 + }, + { + "epoch": 0.8886529340978269, + "grad_norm": 7.323114500091154, + "learning_rate": 3.067074264918626e-06, + "loss": 1.1269, + "step": 6277 + }, + { + "epoch": 0.8887945069724641, + "grad_norm": 9.873520752413489, + "learning_rate": 3.0665159911385677e-06, + "loss": 1.1985, + "step": 6278 + }, + { + "epoch": 0.8889360798471013, + "grad_norm": 8.438264035141222, + "learning_rate": 3.0659576875771868e-06, + "loss": 1.246, + "step": 6279 + }, + { + "epoch": 0.8890776527217386, + "grad_norm": 8.227804671621433, + "learning_rate": 3.065399354263833e-06, + "loss": 1.3775, + "step": 6280 + }, + { + "epoch": 0.8892192255963758, + "grad_norm": 10.380270848498299, + "learning_rate": 3.0648409912278553e-06, + "loss": 1.2751, + "step": 6281 + }, + { + "epoch": 0.8893607984710129, + "grad_norm": 8.464969931569918, + "learning_rate": 3.064282598498609e-06, + "loss": 1.1406, + "step": 6282 + }, + { + "epoch": 0.8895023713456501, + "grad_norm": 7.667740649864468, + "learning_rate": 3.063724176105447e-06, + "loss": 1.1024, + "step": 6283 + }, + { + "epoch": 0.8896439442202874, + "grad_norm": 8.55685884971953, + "learning_rate": 3.0631657240777254e-06, + "loss": 1.1826, + "step": 6284 + }, + { + "epoch": 0.8897855170949246, + "grad_norm": 8.897831307545623, + "learning_rate": 3.062607242444801e-06, + "loss": 1.2255, + "step": 6285 + }, + { + "epoch": 0.8899270899695618, + "grad_norm": 7.788706443247301, + "learning_rate": 3.0620487312360337e-06, + "loss": 1.0592, + "step": 6286 + }, + { + "epoch": 0.890068662844199, + "grad_norm": 8.420668251902205, + "learning_rate": 3.0614901904807836e-06, + "loss": 1.2572, + "step": 6287 + }, + { + "epoch": 0.8902102357188363, + "grad_norm": 8.319471605101247, + "learning_rate": 3.060931620208414e-06, + "loss": 1.1948, + "step": 6288 + }, + { + "epoch": 0.8903518085934735, + "grad_norm": 9.061063665809847, + "learning_rate": 3.060373020448286e-06, + "loss": 1.1543, + "step": 6289 + }, + { + "epoch": 0.8904933814681107, + 
"grad_norm": 8.059221887631711, + "learning_rate": 3.0598143912297667e-06, + "loss": 1.1737, + "step": 6290 + }, + { + "epoch": 0.890634954342748, + "grad_norm": 8.693668069460275, + "learning_rate": 3.0592557325822225e-06, + "loss": 1.0537, + "step": 6291 + }, + { + "epoch": 0.8907765272173851, + "grad_norm": 8.589498682103821, + "learning_rate": 3.0586970445350206e-06, + "loss": 1.3416, + "step": 6292 + }, + { + "epoch": 0.8909181000920223, + "grad_norm": 10.75862272559365, + "learning_rate": 3.0581383271175324e-06, + "loss": 1.2821, + "step": 6293 + }, + { + "epoch": 0.8910596729666596, + "grad_norm": 11.863845887122709, + "learning_rate": 3.0575795803591278e-06, + "loss": 1.1512, + "step": 6294 + }, + { + "epoch": 0.8912012458412968, + "grad_norm": 7.085923233774917, + "learning_rate": 3.0570208042891815e-06, + "loss": 1.1885, + "step": 6295 + }, + { + "epoch": 0.891342818715934, + "grad_norm": 8.363418542339936, + "learning_rate": 3.0564619989370656e-06, + "loss": 1.0867, + "step": 6296 + }, + { + "epoch": 0.8914843915905712, + "grad_norm": 9.525281494466613, + "learning_rate": 3.055903164332158e-06, + "loss": 1.4128, + "step": 6297 + }, + { + "epoch": 0.8916259644652085, + "grad_norm": 9.724744511952688, + "learning_rate": 3.055344300503836e-06, + "loss": 1.2141, + "step": 6298 + }, + { + "epoch": 0.8917675373398457, + "grad_norm": 8.45707906308655, + "learning_rate": 3.0547854074814777e-06, + "loss": 1.161, + "step": 6299 + }, + { + "epoch": 0.8919091102144829, + "grad_norm": 9.322097236523927, + "learning_rate": 3.0542264852944635e-06, + "loss": 1.2592, + "step": 6300 + }, + { + "epoch": 0.8920506830891202, + "grad_norm": 7.94706354191339, + "learning_rate": 3.0536675339721774e-06, + "loss": 1.2628, + "step": 6301 + }, + { + "epoch": 0.8921922559637574, + "grad_norm": 8.212132331595773, + "learning_rate": 3.053108553544001e-06, + "loss": 1.216, + "step": 6302 + }, + { + "epoch": 0.8923338288383945, + "grad_norm": 8.626764531629439, + "learning_rate": 3.052549544039321e-06, + "loss": 1.3845, + "step": 6303 + }, + { + "epoch": 0.8924754017130317, + "grad_norm": 8.610408658598406, + "learning_rate": 3.0519905054875237e-06, + "loss": 1.2056, + "step": 6304 + }, + { + "epoch": 0.892616974587669, + "grad_norm": 8.907822640759006, + "learning_rate": 3.0514314379179967e-06, + "loss": 1.3217, + "step": 6305 + }, + { + "epoch": 0.8927585474623062, + "grad_norm": 8.712597935918353, + "learning_rate": 3.05087234136013e-06, + "loss": 1.1248, + "step": 6306 + }, + { + "epoch": 0.8929001203369434, + "grad_norm": 8.114042434652882, + "learning_rate": 3.0503132158433145e-06, + "loss": 1.2559, + "step": 6307 + }, + { + "epoch": 0.8930416932115807, + "grad_norm": 8.643509174779199, + "learning_rate": 3.049754061396944e-06, + "loss": 1.316, + "step": 6308 + }, + { + "epoch": 0.8931832660862179, + "grad_norm": 7.972266763820414, + "learning_rate": 3.049194878050413e-06, + "loss": 1.1483, + "step": 6309 + }, + { + "epoch": 0.8933248389608551, + "grad_norm": 9.517291745191272, + "learning_rate": 3.048635665833116e-06, + "loss": 1.1012, + "step": 6310 + }, + { + "epoch": 0.8934664118354924, + "grad_norm": 9.599382666766015, + "learning_rate": 3.048076424774452e-06, + "loss": 1.2113, + "step": 6311 + }, + { + "epoch": 0.8936079847101296, + "grad_norm": 8.847115998268386, + "learning_rate": 3.0475171549038187e-06, + "loss": 1.2254, + "step": 6312 + }, + { + "epoch": 0.8937495575847667, + "grad_norm": 7.4237364030936055, + "learning_rate": 3.0469578562506165e-06, + "loss": 1.2262, + "step": 6313 + }, + { + 
"epoch": 0.8938911304594039, + "grad_norm": 8.02243424989855, + "learning_rate": 3.046398528844248e-06, + "loss": 1.1974, + "step": 6314 + }, + { + "epoch": 0.8940327033340412, + "grad_norm": 7.168885382892981, + "learning_rate": 3.0458391727141156e-06, + "loss": 1.2026, + "step": 6315 + }, + { + "epoch": 0.8941742762086784, + "grad_norm": 6.7336074143831075, + "learning_rate": 3.045279787889625e-06, + "loss": 1.1784, + "step": 6316 + }, + { + "epoch": 0.8943158490833156, + "grad_norm": 8.246716857527161, + "learning_rate": 3.044720374400183e-06, + "loss": 1.3041, + "step": 6317 + }, + { + "epoch": 0.8944574219579529, + "grad_norm": 10.182478527230957, + "learning_rate": 3.044160932275197e-06, + "loss": 1.455, + "step": 6318 + }, + { + "epoch": 0.8945989948325901, + "grad_norm": 9.790415798938945, + "learning_rate": 3.043601461544076e-06, + "loss": 1.3586, + "step": 6319 + }, + { + "epoch": 0.8947405677072273, + "grad_norm": 10.072049934366444, + "learning_rate": 3.0430419622362327e-06, + "loss": 1.1644, + "step": 6320 + }, + { + "epoch": 0.8948821405818645, + "grad_norm": 10.442161387185852, + "learning_rate": 3.0424824343810773e-06, + "loss": 1.2317, + "step": 6321 + }, + { + "epoch": 0.8950237134565018, + "grad_norm": 8.196208021258023, + "learning_rate": 3.0419228780080246e-06, + "loss": 1.1599, + "step": 6322 + }, + { + "epoch": 0.8951652863311389, + "grad_norm": 8.816398894408762, + "learning_rate": 3.041363293146491e-06, + "loss": 1.298, + "step": 6323 + }, + { + "epoch": 0.8953068592057761, + "grad_norm": 8.682125048000696, + "learning_rate": 3.0408036798258924e-06, + "loss": 1.1652, + "step": 6324 + }, + { + "epoch": 0.8954484320804134, + "grad_norm": 7.546881318336512, + "learning_rate": 3.040244038075648e-06, + "loss": 1.1111, + "step": 6325 + }, + { + "epoch": 0.8955900049550506, + "grad_norm": 8.191727107598927, + "learning_rate": 3.0396843679251777e-06, + "loss": 1.3247, + "step": 6326 + }, + { + "epoch": 0.8957315778296878, + "grad_norm": 7.873745772982424, + "learning_rate": 3.0391246694039016e-06, + "loss": 1.0939, + "step": 6327 + }, + { + "epoch": 0.895873150704325, + "grad_norm": 10.660554167637478, + "learning_rate": 3.038564942541244e-06, + "loss": 1.3446, + "step": 6328 + }, + { + "epoch": 0.8960147235789623, + "grad_norm": 12.354959926686895, + "learning_rate": 3.0380051873666287e-06, + "loss": 1.1454, + "step": 6329 + }, + { + "epoch": 0.8961562964535995, + "grad_norm": 9.744282537739398, + "learning_rate": 3.037445403909482e-06, + "loss": 1.1725, + "step": 6330 + }, + { + "epoch": 0.8962978693282367, + "grad_norm": 9.57631428515616, + "learning_rate": 3.0368855921992314e-06, + "loss": 1.2289, + "step": 6331 + }, + { + "epoch": 0.896439442202874, + "grad_norm": 8.637791091275792, + "learning_rate": 3.036325752265305e-06, + "loss": 1.2423, + "step": 6332 + }, + { + "epoch": 0.8965810150775112, + "grad_norm": 9.678657286026382, + "learning_rate": 3.035765884137134e-06, + "loss": 1.2476, + "step": 6333 + }, + { + "epoch": 0.8967225879521483, + "grad_norm": 10.290877174232365, + "learning_rate": 3.0352059878441496e-06, + "loss": 1.1264, + "step": 6334 + }, + { + "epoch": 0.8968641608267856, + "grad_norm": 7.6848092681307865, + "learning_rate": 3.0346460634157865e-06, + "loss": 1.3084, + "step": 6335 + }, + { + "epoch": 0.8970057337014228, + "grad_norm": 11.74172877305885, + "learning_rate": 3.034086110881478e-06, + "loss": 1.3236, + "step": 6336 + }, + { + "epoch": 0.89714730657606, + "grad_norm": 8.817999239946845, + "learning_rate": 3.0335261302706605e-06, + "loss": 
1.3416, + "step": 6337 + }, + { + "epoch": 0.8972888794506972, + "grad_norm": 8.309859006858492, + "learning_rate": 3.032966121612772e-06, + "loss": 1.2956, + "step": 6338 + }, + { + "epoch": 0.8974304523253345, + "grad_norm": 9.168981456370474, + "learning_rate": 3.0324060849372526e-06, + "loss": 1.1325, + "step": 6339 + }, + { + "epoch": 0.8975720251999717, + "grad_norm": 8.691068725890075, + "learning_rate": 3.0318460202735417e-06, + "loss": 1.1302, + "step": 6340 + }, + { + "epoch": 0.8977135980746089, + "grad_norm": 9.030806903463096, + "learning_rate": 3.0312859276510833e-06, + "loss": 1.183, + "step": 6341 + }, + { + "epoch": 0.8978551709492462, + "grad_norm": 10.264567746046767, + "learning_rate": 3.0307258070993186e-06, + "loss": 1.1714, + "step": 6342 + }, + { + "epoch": 0.8979967438238834, + "grad_norm": 7.715284797519527, + "learning_rate": 3.0301656586476943e-06, + "loss": 1.1966, + "step": 6343 + }, + { + "epoch": 0.8981383166985205, + "grad_norm": 7.756251613177209, + "learning_rate": 3.029605482325656e-06, + "loss": 1.3079, + "step": 6344 + }, + { + "epoch": 0.8982798895731577, + "grad_norm": 8.977807552091035, + "learning_rate": 3.0290452781626526e-06, + "loss": 1.2568, + "step": 6345 + }, + { + "epoch": 0.898421462447795, + "grad_norm": 9.4983338350518, + "learning_rate": 3.028485046188134e-06, + "loss": 1.2757, + "step": 6346 + }, + { + "epoch": 0.8985630353224322, + "grad_norm": 9.000370441866286, + "learning_rate": 3.0279247864315508e-06, + "loss": 1.2352, + "step": 6347 + }, + { + "epoch": 0.8987046081970694, + "grad_norm": 10.457652846205875, + "learning_rate": 3.0273644989223543e-06, + "loss": 1.3436, + "step": 6348 + }, + { + "epoch": 0.8988461810717067, + "grad_norm": 12.142736302904506, + "learning_rate": 3.0268041836900002e-06, + "loss": 1.4648, + "step": 6349 + }, + { + "epoch": 0.8989877539463439, + "grad_norm": 8.781843389370461, + "learning_rate": 3.026243840763942e-06, + "loss": 1.3238, + "step": 6350 + }, + { + "epoch": 0.8991293268209811, + "grad_norm": 11.492445745101499, + "learning_rate": 3.025683470173638e-06, + "loss": 1.298, + "step": 6351 + }, + { + "epoch": 0.8992708996956184, + "grad_norm": 10.954769081875119, + "learning_rate": 3.0251230719485465e-06, + "loss": 1.1836, + "step": 6352 + }, + { + "epoch": 0.8994124725702556, + "grad_norm": 11.941380693871901, + "learning_rate": 3.0245626461181256e-06, + "loss": 1.1857, + "step": 6353 + }, + { + "epoch": 0.8995540454448928, + "grad_norm": 8.936855906506064, + "learning_rate": 3.024002192711838e-06, + "loss": 1.2258, + "step": 6354 + }, + { + "epoch": 0.8996956183195299, + "grad_norm": 7.986312840074637, + "learning_rate": 3.023441711759146e-06, + "loss": 1.2752, + "step": 6355 + }, + { + "epoch": 0.8998371911941672, + "grad_norm": 11.525054838832045, + "learning_rate": 3.0228812032895133e-06, + "loss": 1.1743, + "step": 6356 + }, + { + "epoch": 0.8999787640688044, + "grad_norm": 10.301047884774349, + "learning_rate": 3.022320667332406e-06, + "loss": 1.248, + "step": 6357 + }, + { + "epoch": 0.9001203369434416, + "grad_norm": 10.042012653220539, + "learning_rate": 3.02176010391729e-06, + "loss": 1.1239, + "step": 6358 + }, + { + "epoch": 0.9002619098180789, + "grad_norm": 9.33120998570728, + "learning_rate": 3.021199513073635e-06, + "loss": 1.1528, + "step": 6359 + }, + { + "epoch": 0.9004034826927161, + "grad_norm": 8.919390491137746, + "learning_rate": 3.0206388948309094e-06, + "loss": 1.2946, + "step": 6360 + }, + { + "epoch": 0.9005450555673533, + "grad_norm": 8.181794719469742, + 
"learning_rate": 3.020078249218586e-06, + "loss": 1.2956, + "step": 6361 + }, + { + "epoch": 0.9006866284419905, + "grad_norm": 9.456690394905484, + "learning_rate": 3.019517576266137e-06, + "loss": 1.2716, + "step": 6362 + }, + { + "epoch": 0.9008282013166278, + "grad_norm": 9.038060982626948, + "learning_rate": 3.0189568760030363e-06, + "loss": 1.0887, + "step": 6363 + }, + { + "epoch": 0.900969774191265, + "grad_norm": 10.872618666628656, + "learning_rate": 3.018396148458759e-06, + "loss": 1.2292, + "step": 6364 + }, + { + "epoch": 0.9011113470659021, + "grad_norm": 11.774350521374643, + "learning_rate": 3.0178353936627835e-06, + "loss": 1.2357, + "step": 6365 + }, + { + "epoch": 0.9012529199405394, + "grad_norm": 9.797577803863254, + "learning_rate": 3.017274611644587e-06, + "loss": 1.2752, + "step": 6366 + }, + { + "epoch": 0.9013944928151766, + "grad_norm": 7.063485135938562, + "learning_rate": 3.016713802433649e-06, + "loss": 1.1336, + "step": 6367 + }, + { + "epoch": 0.9015360656898138, + "grad_norm": 9.75770890094624, + "learning_rate": 3.016152966059453e-06, + "loss": 1.3332, + "step": 6368 + }, + { + "epoch": 0.901677638564451, + "grad_norm": 9.642026035883752, + "learning_rate": 3.01559210255148e-06, + "loss": 1.1531, + "step": 6369 + }, + { + "epoch": 0.9018192114390883, + "grad_norm": 9.768905698935143, + "learning_rate": 3.0150312119392144e-06, + "loss": 1.3142, + "step": 6370 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 11.648996177288263, + "learning_rate": 3.0144702942521424e-06, + "loss": 1.2917, + "step": 6371 + }, + { + "epoch": 0.9021023571883627, + "grad_norm": 8.213550387813088, + "learning_rate": 3.0139093495197504e-06, + "loss": 1.1842, + "step": 6372 + }, + { + "epoch": 0.902243930063, + "grad_norm": 9.290728009856117, + "learning_rate": 3.0133483777715272e-06, + "loss": 1.3223, + "step": 6373 + }, + { + "epoch": 0.9023855029376372, + "grad_norm": 10.550309770565166, + "learning_rate": 3.0127873790369627e-06, + "loss": 1.2476, + "step": 6374 + }, + { + "epoch": 0.9025270758122743, + "grad_norm": 8.935648266159733, + "learning_rate": 3.0122263533455474e-06, + "loss": 1.1548, + "step": 6375 + }, + { + "epoch": 0.9026686486869115, + "grad_norm": 9.669337024388536, + "learning_rate": 3.0116653007267753e-06, + "loss": 1.3593, + "step": 6376 + }, + { + "epoch": 0.9028102215615488, + "grad_norm": 9.007638868393784, + "learning_rate": 3.0111042212101394e-06, + "loss": 1.1329, + "step": 6377 + }, + { + "epoch": 0.902951794436186, + "grad_norm": 10.42905765303239, + "learning_rate": 3.0105431148251364e-06, + "loss": 1.1712, + "step": 6378 + }, + { + "epoch": 0.9030933673108232, + "grad_norm": 8.758220190109261, + "learning_rate": 3.0099819816012623e-06, + "loss": 1.2157, + "step": 6379 + }, + { + "epoch": 0.9032349401854605, + "grad_norm": 10.446368258981018, + "learning_rate": 3.0094208215680156e-06, + "loss": 1.1432, + "step": 6380 + }, + { + "epoch": 0.9033765130600977, + "grad_norm": 9.30316267483239, + "learning_rate": 3.008859634754895e-06, + "loss": 1.1996, + "step": 6381 + }, + { + "epoch": 0.9035180859347349, + "grad_norm": 9.381474001635317, + "learning_rate": 3.0082984211914033e-06, + "loss": 1.1329, + "step": 6382 + }, + { + "epoch": 0.9036596588093722, + "grad_norm": 9.473282034742006, + "learning_rate": 3.007737180907044e-06, + "loss": 1.2563, + "step": 6383 + }, + { + "epoch": 0.9038012316840094, + "grad_norm": 10.849906646748813, + "learning_rate": 3.007175913931319e-06, + "loss": 1.1662, + "step": 6384 + }, + { + "epoch": 0.9039428045586466, + 
"grad_norm": 8.447360232490439, + "learning_rate": 3.006614620293734e-06, + "loss": 1.1733, + "step": 6385 + }, + { + "epoch": 0.9040843774332837, + "grad_norm": 7.321208235452361, + "learning_rate": 3.0060533000237964e-06, + "loss": 1.121, + "step": 6386 + }, + { + "epoch": 0.904225950307921, + "grad_norm": 8.539812534963763, + "learning_rate": 3.005491953151014e-06, + "loss": 1.2219, + "step": 6387 + }, + { + "epoch": 0.9043675231825582, + "grad_norm": 8.221925071410151, + "learning_rate": 3.0049305797048965e-06, + "loss": 1.2439, + "step": 6388 + }, + { + "epoch": 0.9045090960571954, + "grad_norm": 8.538068456979847, + "learning_rate": 3.0043691797149548e-06, + "loss": 1.1283, + "step": 6389 + }, + { + "epoch": 0.9046506689318327, + "grad_norm": 9.595499779992288, + "learning_rate": 3.003807753210702e-06, + "loss": 1.0282, + "step": 6390 + }, + { + "epoch": 0.9047922418064699, + "grad_norm": 9.08147049680157, + "learning_rate": 3.0032463002216504e-06, + "loss": 1.3119, + "step": 6391 + }, + { + "epoch": 0.9049338146811071, + "grad_norm": 9.664257100026372, + "learning_rate": 3.0026848207773163e-06, + "loss": 1.3406, + "step": 6392 + }, + { + "epoch": 0.9050753875557443, + "grad_norm": 9.729871660348634, + "learning_rate": 3.0021233149072164e-06, + "loss": 1.3096, + "step": 6393 + }, + { + "epoch": 0.9052169604303816, + "grad_norm": 8.502453618179688, + "learning_rate": 3.0015617826408684e-06, + "loss": 1.289, + "step": 6394 + }, + { + "epoch": 0.9053585333050188, + "grad_norm": 8.43590570151675, + "learning_rate": 3.001000224007791e-06, + "loss": 1.3218, + "step": 6395 + }, + { + "epoch": 0.9055001061796559, + "grad_norm": 8.786962422712644, + "learning_rate": 3.000438639037505e-06, + "loss": 1.2339, + "step": 6396 + }, + { + "epoch": 0.9056416790542932, + "grad_norm": 9.193820569813285, + "learning_rate": 2.9998770277595337e-06, + "loss": 1.2073, + "step": 6397 + }, + { + "epoch": 0.9057832519289304, + "grad_norm": 6.436003575885318, + "learning_rate": 2.999315390203399e-06, + "loss": 1.2531, + "step": 6398 + }, + { + "epoch": 0.9059248248035676, + "grad_norm": 8.586831376922637, + "learning_rate": 2.9987537263986277e-06, + "loss": 1.2088, + "step": 6399 + }, + { + "epoch": 0.9060663976782048, + "grad_norm": 8.42936212373085, + "learning_rate": 2.998192036374744e-06, + "loss": 1.3507, + "step": 6400 + }, + { + "epoch": 0.9062079705528421, + "grad_norm": 8.628376175575918, + "learning_rate": 2.9976303201612765e-06, + "loss": 1.1113, + "step": 6401 + }, + { + "epoch": 0.9063495434274793, + "grad_norm": 8.35556645133388, + "learning_rate": 2.9970685777877545e-06, + "loss": 1.2264, + "step": 6402 + }, + { + "epoch": 0.9064911163021165, + "grad_norm": 7.044515481156799, + "learning_rate": 2.9965068092837074e-06, + "loss": 1.1671, + "step": 6403 + }, + { + "epoch": 0.9066326891767538, + "grad_norm": 9.767202997508102, + "learning_rate": 2.9959450146786674e-06, + "loss": 1.3445, + "step": 6404 + }, + { + "epoch": 0.906774262051391, + "grad_norm": 9.968555986674934, + "learning_rate": 2.995383194002169e-06, + "loss": 1.2756, + "step": 6405 + }, + { + "epoch": 0.9069158349260281, + "grad_norm": 9.036995192235757, + "learning_rate": 2.9948213472837443e-06, + "loss": 1.1977, + "step": 6406 + }, + { + "epoch": 0.9070574078006653, + "grad_norm": 8.40312914013007, + "learning_rate": 2.994259474552931e-06, + "loss": 1.183, + "step": 6407 + }, + { + "epoch": 0.9071989806753026, + "grad_norm": 7.6390796182885, + "learning_rate": 2.993697575839265e-06, + "loss": 1.1469, + "step": 6408 + }, + { + 
"epoch": 0.9073405535499398, + "grad_norm": 8.691859626517783, + "learning_rate": 2.9931356511722857e-06, + "loss": 1.3147, + "step": 6409 + }, + { + "epoch": 0.907482126424577, + "grad_norm": 7.447806774120077, + "learning_rate": 2.9925737005815337e-06, + "loss": 1.1726, + "step": 6410 + }, + { + "epoch": 0.9076236992992143, + "grad_norm": 8.972764661812407, + "learning_rate": 2.9920117240965487e-06, + "loss": 1.1582, + "step": 6411 + }, + { + "epoch": 0.9077652721738515, + "grad_norm": 9.121758708362865, + "learning_rate": 2.991449721746875e-06, + "loss": 1.2689, + "step": 6412 + }, + { + "epoch": 0.9079068450484887, + "grad_norm": 8.512862403090121, + "learning_rate": 2.9908876935620544e-06, + "loss": 1.3412, + "step": 6413 + }, + { + "epoch": 0.908048417923126, + "grad_norm": 10.784472528468195, + "learning_rate": 2.990325639571635e-06, + "loss": 1.3371, + "step": 6414 + }, + { + "epoch": 0.9081899907977632, + "grad_norm": 8.208516178579654, + "learning_rate": 2.9897635598051626e-06, + "loss": 1.1457, + "step": 6415 + }, + { + "epoch": 0.9083315636724004, + "grad_norm": 6.974895419085345, + "learning_rate": 2.9892014542921845e-06, + "loss": 1.2132, + "step": 6416 + }, + { + "epoch": 0.9084731365470375, + "grad_norm": 9.179742353559515, + "learning_rate": 2.9886393230622507e-06, + "loss": 1.1152, + "step": 6417 + }, + { + "epoch": 0.9086147094216748, + "grad_norm": 7.738695821965245, + "learning_rate": 2.9880771661449115e-06, + "loss": 1.2667, + "step": 6418 + }, + { + "epoch": 0.908756282296312, + "grad_norm": 8.59065551530366, + "learning_rate": 2.9875149835697203e-06, + "loss": 1.2835, + "step": 6419 + }, + { + "epoch": 0.9088978551709492, + "grad_norm": 7.358845491251419, + "learning_rate": 2.98695277536623e-06, + "loss": 1.1297, + "step": 6420 + }, + { + "epoch": 0.9090394280455865, + "grad_norm": 7.874679074484436, + "learning_rate": 2.9863905415639954e-06, + "loss": 1.2685, + "step": 6421 + }, + { + "epoch": 0.9091810009202237, + "grad_norm": 8.333876121645767, + "learning_rate": 2.9858282821925723e-06, + "loss": 1.2, + "step": 6422 + }, + { + "epoch": 0.9093225737948609, + "grad_norm": 7.335895379771253, + "learning_rate": 2.985265997281519e-06, + "loss": 1.2327, + "step": 6423 + }, + { + "epoch": 0.9094641466694982, + "grad_norm": 8.320982770830675, + "learning_rate": 2.984703686860394e-06, + "loss": 1.2931, + "step": 6424 + }, + { + "epoch": 0.9096057195441354, + "grad_norm": 7.602678991266058, + "learning_rate": 2.984141350958757e-06, + "loss": 1.1223, + "step": 6425 + }, + { + "epoch": 0.9097472924187726, + "grad_norm": 7.659656249617517, + "learning_rate": 2.9835789896061707e-06, + "loss": 1.2239, + "step": 6426 + }, + { + "epoch": 0.9098888652934097, + "grad_norm": 8.158327269300234, + "learning_rate": 2.9830166028321975e-06, + "loss": 1.1796, + "step": 6427 + }, + { + "epoch": 0.910030438168047, + "grad_norm": 11.612345709458385, + "learning_rate": 2.9824541906664018e-06, + "loss": 1.2315, + "step": 6428 + }, + { + "epoch": 0.9101720110426842, + "grad_norm": 7.8789401415359315, + "learning_rate": 2.9818917531383483e-06, + "loss": 0.9946, + "step": 6429 + }, + { + "epoch": 0.9103135839173214, + "grad_norm": 10.622936093201087, + "learning_rate": 2.981329290277605e-06, + "loss": 1.3208, + "step": 6430 + }, + { + "epoch": 0.9104551567919587, + "grad_norm": 8.029761744250797, + "learning_rate": 2.980766802113741e-06, + "loss": 1.2817, + "step": 6431 + }, + { + "epoch": 0.9105967296665959, + "grad_norm": 8.318997475305581, + "learning_rate": 2.9802042886763234e-06, + "loss": 
1.2706, + "step": 6432 + }, + { + "epoch": 0.9107383025412331, + "grad_norm": 7.846921970921121, + "learning_rate": 2.9796417499949244e-06, + "loss": 1.2824, + "step": 6433 + }, + { + "epoch": 0.9108798754158703, + "grad_norm": 9.585738126364923, + "learning_rate": 2.9790791860991165e-06, + "loss": 1.1358, + "step": 6434 + }, + { + "epoch": 0.9110214482905076, + "grad_norm": 7.428398654311898, + "learning_rate": 2.9785165970184724e-06, + "loss": 1.102, + "step": 6435 + }, + { + "epoch": 0.9111630211651448, + "grad_norm": 7.55728978779877, + "learning_rate": 2.977953982782569e-06, + "loss": 1.1591, + "step": 6436 + }, + { + "epoch": 0.9113045940397819, + "grad_norm": 8.136612326026292, + "learning_rate": 2.97739134342098e-06, + "loss": 1.182, + "step": 6437 + }, + { + "epoch": 0.9114461669144192, + "grad_norm": 8.580713656251788, + "learning_rate": 2.9768286789632845e-06, + "loss": 1.2164, + "step": 6438 + }, + { + "epoch": 0.9115877397890564, + "grad_norm": 11.073400956356378, + "learning_rate": 2.9762659894390603e-06, + "loss": 1.0768, + "step": 6439 + }, + { + "epoch": 0.9117293126636936, + "grad_norm": 10.099464900883595, + "learning_rate": 2.9757032748778886e-06, + "loss": 1.2209, + "step": 6440 + }, + { + "epoch": 0.9118708855383308, + "grad_norm": 9.563535148092916, + "learning_rate": 2.97514053530935e-06, + "loss": 1.3414, + "step": 6441 + }, + { + "epoch": 0.9120124584129681, + "grad_norm": 7.784923253302788, + "learning_rate": 2.9745777707630284e-06, + "loss": 1.2911, + "step": 6442 + }, + { + "epoch": 0.9121540312876053, + "grad_norm": 9.855012390965028, + "learning_rate": 2.974014981268507e-06, + "loss": 1.2118, + "step": 6443 + }, + { + "epoch": 0.9122956041622425, + "grad_norm": 9.307070384555304, + "learning_rate": 2.973452166855372e-06, + "loss": 1.2809, + "step": 6444 + }, + { + "epoch": 0.9124371770368798, + "grad_norm": 10.073003558810091, + "learning_rate": 2.972889327553209e-06, + "loss": 1.3, + "step": 6445 + }, + { + "epoch": 0.912578749911517, + "grad_norm": 10.35201502548613, + "learning_rate": 2.972326463391606e-06, + "loss": 1.3025, + "step": 6446 + }, + { + "epoch": 0.9127203227861542, + "grad_norm": 8.559207478137715, + "learning_rate": 2.971763574400154e-06, + "loss": 1.2166, + "step": 6447 + }, + { + "epoch": 0.9128618956607913, + "grad_norm": 8.528292524090906, + "learning_rate": 2.971200660608442e-06, + "loss": 1.2967, + "step": 6448 + }, + { + "epoch": 0.9130034685354286, + "grad_norm": 10.303260687089928, + "learning_rate": 2.970637722046063e-06, + "loss": 1.3361, + "step": 6449 + }, + { + "epoch": 0.9131450414100658, + "grad_norm": 10.061814563223052, + "learning_rate": 2.9700747587426097e-06, + "loss": 1.1667, + "step": 6450 + }, + { + "epoch": 0.913286614284703, + "grad_norm": 7.867206168128918, + "learning_rate": 2.9695117707276774e-06, + "loss": 1.0266, + "step": 6451 + }, + { + "epoch": 0.9134281871593403, + "grad_norm": 8.960659511680898, + "learning_rate": 2.9689487580308613e-06, + "loss": 1.202, + "step": 6452 + }, + { + "epoch": 0.9135697600339775, + "grad_norm": 8.085401080486003, + "learning_rate": 2.9683857206817583e-06, + "loss": 1.1715, + "step": 6453 + }, + { + "epoch": 0.9137113329086147, + "grad_norm": 8.000528318126287, + "learning_rate": 2.9678226587099674e-06, + "loss": 1.1593, + "step": 6454 + }, + { + "epoch": 0.913852905783252, + "grad_norm": 10.43202038223522, + "learning_rate": 2.967259572145088e-06, + "loss": 1.2522, + "step": 6455 + }, + { + "epoch": 0.9139944786578892, + "grad_norm": 9.455900128555237, + "learning_rate": 
2.966696461016721e-06, + "loss": 1.2506, + "step": 6456 + }, + { + "epoch": 0.9141360515325264, + "grad_norm": 8.153396516535878, + "learning_rate": 2.966133325354469e-06, + "loss": 1.3149, + "step": 6457 + }, + { + "epoch": 0.9142776244071635, + "grad_norm": 7.568649440005999, + "learning_rate": 2.9655701651879364e-06, + "loss": 1.1497, + "step": 6458 + }, + { + "epoch": 0.9144191972818008, + "grad_norm": 9.723909731748408, + "learning_rate": 2.965006980546727e-06, + "loss": 1.2849, + "step": 6459 + }, + { + "epoch": 0.914560770156438, + "grad_norm": 9.52303332753405, + "learning_rate": 2.9644437714604475e-06, + "loss": 1.2456, + "step": 6460 + }, + { + "epoch": 0.9147023430310752, + "grad_norm": 10.213129486891646, + "learning_rate": 2.963880537958705e-06, + "loss": 1.3195, + "step": 6461 + }, + { + "epoch": 0.9148439159057125, + "grad_norm": 9.98045651907274, + "learning_rate": 2.9633172800711085e-06, + "loss": 1.2047, + "step": 6462 + }, + { + "epoch": 0.9149854887803497, + "grad_norm": 8.873159298277992, + "learning_rate": 2.962753997827268e-06, + "loss": 1.1601, + "step": 6463 + }, + { + "epoch": 0.9151270616549869, + "grad_norm": 8.262692484727237, + "learning_rate": 2.962190691256795e-06, + "loss": 1.3333, + "step": 6464 + }, + { + "epoch": 0.9152686345296241, + "grad_norm": 8.146948176305182, + "learning_rate": 2.961627360389302e-06, + "loss": 1.2101, + "step": 6465 + }, + { + "epoch": 0.9154102074042614, + "grad_norm": 8.489085651312072, + "learning_rate": 2.9610640052544026e-06, + "loss": 1.2931, + "step": 6466 + }, + { + "epoch": 0.9155517802788986, + "grad_norm": 6.91591330718934, + "learning_rate": 2.960500625881712e-06, + "loss": 1.1977, + "step": 6467 + }, + { + "epoch": 0.9156933531535357, + "grad_norm": 9.004312541673825, + "learning_rate": 2.9599372223008483e-06, + "loss": 1.3075, + "step": 6468 + }, + { + "epoch": 0.915834926028173, + "grad_norm": 10.500690255864582, + "learning_rate": 2.9593737945414264e-06, + "loss": 1.2912, + "step": 6469 + }, + { + "epoch": 0.9159764989028102, + "grad_norm": 8.578844940047063, + "learning_rate": 2.9588103426330665e-06, + "loss": 1.1719, + "step": 6470 + }, + { + "epoch": 0.9161180717774474, + "grad_norm": 8.140275416858122, + "learning_rate": 2.95824686660539e-06, + "loss": 1.116, + "step": 6471 + }, + { + "epoch": 0.9162596446520846, + "grad_norm": 11.202481090390933, + "learning_rate": 2.957683366488017e-06, + "loss": 1.3659, + "step": 6472 + }, + { + "epoch": 0.9164012175267219, + "grad_norm": 7.892674789837395, + "learning_rate": 2.9571198423105708e-06, + "loss": 1.2416, + "step": 6473 + }, + { + "epoch": 0.9165427904013591, + "grad_norm": 8.13595265386528, + "learning_rate": 2.956556294102675e-06, + "loss": 1.2847, + "step": 6474 + }, + { + "epoch": 0.9166843632759963, + "grad_norm": 8.775409969305965, + "learning_rate": 2.9559927218939555e-06, + "loss": 1.1961, + "step": 6475 + }, + { + "epoch": 0.9168259361506336, + "grad_norm": 9.962084416201629, + "learning_rate": 2.9554291257140384e-06, + "loss": 1.4218, + "step": 6476 + }, + { + "epoch": 0.9169675090252708, + "grad_norm": 8.54542119076858, + "learning_rate": 2.9548655055925516e-06, + "loss": 1.2313, + "step": 6477 + }, + { + "epoch": 0.917109081899908, + "grad_norm": 9.070677967577941, + "learning_rate": 2.954301861559124e-06, + "loss": 1.1667, + "step": 6478 + }, + { + "epoch": 0.9172506547745451, + "grad_norm": 11.356856076029167, + "learning_rate": 2.9537381936433873e-06, + "loss": 1.3187, + "step": 6479 + }, + { + "epoch": 0.9173922276491824, + "grad_norm": 
7.624975735985344, + "learning_rate": 2.953174501874971e-06, + "loss": 1.2858, + "step": 6480 + }, + { + "epoch": 0.9175338005238196, + "grad_norm": 9.766443715680877, + "learning_rate": 2.9526107862835103e-06, + "loss": 1.2515, + "step": 6481 + }, + { + "epoch": 0.9176753733984568, + "grad_norm": 8.73792589101974, + "learning_rate": 2.952047046898637e-06, + "loss": 1.1683, + "step": 6482 + }, + { + "epoch": 0.9178169462730941, + "grad_norm": 10.562436075412148, + "learning_rate": 2.9514832837499884e-06, + "loss": 1.416, + "step": 6483 + }, + { + "epoch": 0.9179585191477313, + "grad_norm": 9.160659717640556, + "learning_rate": 2.9509194968671995e-06, + "loss": 1.2619, + "step": 6484 + }, + { + "epoch": 0.9181000920223685, + "grad_norm": 8.062768266339994, + "learning_rate": 2.9503556862799094e-06, + "loss": 1.3337, + "step": 6485 + }, + { + "epoch": 0.9182416648970058, + "grad_norm": 8.993661980002743, + "learning_rate": 2.949791852017756e-06, + "loss": 1.2085, + "step": 6486 + }, + { + "epoch": 0.918383237771643, + "grad_norm": 8.161877324187012, + "learning_rate": 2.949227994110381e-06, + "loss": 1.2753, + "step": 6487 + }, + { + "epoch": 0.9185248106462802, + "grad_norm": 9.470916802278627, + "learning_rate": 2.948664112587425e-06, + "loss": 1.2525, + "step": 6488 + }, + { + "epoch": 0.9186663835209173, + "grad_norm": 8.161135091184468, + "learning_rate": 2.9481002074785315e-06, + "loss": 1.2482, + "step": 6489 + }, + { + "epoch": 0.9188079563955546, + "grad_norm": 8.90278338509666, + "learning_rate": 2.9475362788133437e-06, + "loss": 1.0873, + "step": 6490 + }, + { + "epoch": 0.9189495292701918, + "grad_norm": 8.68302835330378, + "learning_rate": 2.946972326621507e-06, + "loss": 1.2802, + "step": 6491 + }, + { + "epoch": 0.919091102144829, + "grad_norm": 9.005396178922695, + "learning_rate": 2.946408350932669e-06, + "loss": 1.2585, + "step": 6492 + }, + { + "epoch": 0.9192326750194663, + "grad_norm": 8.448294057698961, + "learning_rate": 2.9458443517764767e-06, + "loss": 1.2888, + "step": 6493 + }, + { + "epoch": 0.9193742478941035, + "grad_norm": 9.657258113840788, + "learning_rate": 2.9452803291825793e-06, + "loss": 1.2292, + "step": 6494 + }, + { + "epoch": 0.9195158207687407, + "grad_norm": 8.737244382074353, + "learning_rate": 2.9447162831806275e-06, + "loss": 1.2961, + "step": 6495 + }, + { + "epoch": 0.919657393643378, + "grad_norm": 8.853173293693288, + "learning_rate": 2.944152213800272e-06, + "loss": 1.1379, + "step": 6496 + }, + { + "epoch": 0.9197989665180152, + "grad_norm": 8.280969003192558, + "learning_rate": 2.9435881210711652e-06, + "loss": 1.1281, + "step": 6497 + }, + { + "epoch": 0.9199405393926524, + "grad_norm": 7.902628314126904, + "learning_rate": 2.943024005022962e-06, + "loss": 1.2456, + "step": 6498 + }, + { + "epoch": 0.9200821122672896, + "grad_norm": 12.032726167558103, + "learning_rate": 2.9424598656853176e-06, + "loss": 1.3131, + "step": 6499 + }, + { + "epoch": 0.9202236851419268, + "grad_norm": 9.63629557682036, + "learning_rate": 2.9418957030878876e-06, + "loss": 1.2276, + "step": 6500 + }, + { + "epoch": 0.920365258016564, + "grad_norm": 8.784113189801802, + "learning_rate": 2.9413315172603296e-06, + "loss": 1.2601, + "step": 6501 + }, + { + "epoch": 0.9205068308912012, + "grad_norm": 9.711407307119067, + "learning_rate": 2.9407673082323033e-06, + "loss": 1.3415, + "step": 6502 + }, + { + "epoch": 0.9206484037658385, + "grad_norm": 8.853689478562737, + "learning_rate": 2.9402030760334684e-06, + "loss": 1.1936, + "step": 6503 + }, + { + "epoch": 
0.9207899766404757, + "grad_norm": 7.8336369340649235, + "learning_rate": 2.9396388206934858e-06, + "loss": 1.2454, + "step": 6504 + }, + { + "epoch": 0.9209315495151129, + "grad_norm": 7.996675754344251, + "learning_rate": 2.9390745422420186e-06, + "loss": 1.2818, + "step": 6505 + }, + { + "epoch": 0.9210731223897501, + "grad_norm": 7.803238406749364, + "learning_rate": 2.9385102407087296e-06, + "loss": 1.2055, + "step": 6506 + }, + { + "epoch": 0.9212146952643874, + "grad_norm": 8.03171309386534, + "learning_rate": 2.937945916123284e-06, + "loss": 1.2708, + "step": 6507 + }, + { + "epoch": 0.9213562681390246, + "grad_norm": 10.345677867453121, + "learning_rate": 2.9373815685153485e-06, + "loss": 1.1563, + "step": 6508 + }, + { + "epoch": 0.9214978410136618, + "grad_norm": 8.579224229433933, + "learning_rate": 2.9368171979145898e-06, + "loss": 1.2308, + "step": 6509 + }, + { + "epoch": 0.921639413888299, + "grad_norm": 9.380351255952977, + "learning_rate": 2.936252804350677e-06, + "loss": 1.14, + "step": 6510 + }, + { + "epoch": 0.9217809867629362, + "grad_norm": 8.47768164775049, + "learning_rate": 2.9356883878532794e-06, + "loss": 1.3185, + "step": 6511 + }, + { + "epoch": 0.9219225596375734, + "grad_norm": 8.530777802866218, + "learning_rate": 2.9351239484520684e-06, + "loss": 1.0688, + "step": 6512 + }, + { + "epoch": 0.9220641325122106, + "grad_norm": 8.894424886825613, + "learning_rate": 2.9345594861767157e-06, + "loss": 1.2556, + "step": 6513 + }, + { + "epoch": 0.9222057053868479, + "grad_norm": 10.457352631524797, + "learning_rate": 2.9339950010568945e-06, + "loss": 1.2066, + "step": 6514 + }, + { + "epoch": 0.9223472782614851, + "grad_norm": 10.22220882009466, + "learning_rate": 2.9334304931222795e-06, + "loss": 1.2513, + "step": 6515 + }, + { + "epoch": 0.9224888511361223, + "grad_norm": 8.56878476014102, + "learning_rate": 2.932865962402548e-06, + "loss": 1.2552, + "step": 6516 + }, + { + "epoch": 0.9226304240107596, + "grad_norm": 8.176954160670288, + "learning_rate": 2.9323014089273743e-06, + "loss": 1.2333, + "step": 6517 + }, + { + "epoch": 0.9227719968853968, + "grad_norm": 8.04221317101134, + "learning_rate": 2.9317368327264383e-06, + "loss": 1.1657, + "step": 6518 + }, + { + "epoch": 0.922913569760034, + "grad_norm": 7.82155968841718, + "learning_rate": 2.9311722338294193e-06, + "loss": 1.2114, + "step": 6519 + }, + { + "epoch": 0.9230551426346711, + "grad_norm": 9.961220890576586, + "learning_rate": 2.930607612265997e-06, + "loss": 1.2957, + "step": 6520 + }, + { + "epoch": 0.9231967155093084, + "grad_norm": 9.857127953749107, + "learning_rate": 2.9300429680658538e-06, + "loss": 1.1555, + "step": 6521 + }, + { + "epoch": 0.9233382883839456, + "grad_norm": 10.170434055724266, + "learning_rate": 2.9294783012586725e-06, + "loss": 1.2641, + "step": 6522 + }, + { + "epoch": 0.9234798612585828, + "grad_norm": 7.777075202938326, + "learning_rate": 2.9289136118741367e-06, + "loss": 1.2041, + "step": 6523 + }, + { + "epoch": 0.9236214341332201, + "grad_norm": 7.917105839912152, + "learning_rate": 2.9283488999419324e-06, + "loss": 1.1089, + "step": 6524 + }, + { + "epoch": 0.9237630070078573, + "grad_norm": 9.263016486343394, + "learning_rate": 2.927784165491746e-06, + "loss": 1.1236, + "step": 6525 + }, + { + "epoch": 0.9239045798824945, + "grad_norm": 8.203189173856124, + "learning_rate": 2.927219408553265e-06, + "loss": 1.217, + "step": 6526 + }, + { + "epoch": 0.9240461527571318, + "grad_norm": 10.661088040991569, + "learning_rate": 2.926654629156178e-06, + "loss": 1.1884, + 
"step": 6527 + }, + { + "epoch": 0.924187725631769, + "grad_norm": 9.314354219109502, + "learning_rate": 2.926089827330175e-06, + "loss": 1.1752, + "step": 6528 + }, + { + "epoch": 0.9243292985064062, + "grad_norm": 9.09723274159167, + "learning_rate": 2.925525003104948e-06, + "loss": 1.3269, + "step": 6529 + }, + { + "epoch": 0.9244708713810434, + "grad_norm": 8.588628623125846, + "learning_rate": 2.924960156510188e-06, + "loss": 1.2631, + "step": 6530 + }, + { + "epoch": 0.9246124442556806, + "grad_norm": 8.772254327360027, + "learning_rate": 2.9243952875755905e-06, + "loss": 1.3361, + "step": 6531 + }, + { + "epoch": 0.9247540171303178, + "grad_norm": 8.669865335828824, + "learning_rate": 2.923830396330849e-06, + "loss": 1.3818, + "step": 6532 + }, + { + "epoch": 0.924895590004955, + "grad_norm": 9.035643462598728, + "learning_rate": 2.9232654828056596e-06, + "loss": 1.3856, + "step": 6533 + }, + { + "epoch": 0.9250371628795923, + "grad_norm": 9.141663686310784, + "learning_rate": 2.9227005470297194e-06, + "loss": 1.3205, + "step": 6534 + }, + { + "epoch": 0.9251787357542295, + "grad_norm": 9.26223687790492, + "learning_rate": 2.922135589032726e-06, + "loss": 1.3437, + "step": 6535 + }, + { + "epoch": 0.9253203086288667, + "grad_norm": 8.514968258792406, + "learning_rate": 2.9215706088443794e-06, + "loss": 1.3431, + "step": 6536 + }, + { + "epoch": 0.925461881503504, + "grad_norm": 7.885474127119602, + "learning_rate": 2.921005606494381e-06, + "loss": 1.2736, + "step": 6537 + }, + { + "epoch": 0.9256034543781412, + "grad_norm": 8.976660665667548, + "learning_rate": 2.9204405820124315e-06, + "loss": 1.3204, + "step": 6538 + }, + { + "epoch": 0.9257450272527784, + "grad_norm": 10.771088607586536, + "learning_rate": 2.9198755354282337e-06, + "loss": 1.468, + "step": 6539 + }, + { + "epoch": 0.9258866001274156, + "grad_norm": 10.145839766879591, + "learning_rate": 2.9193104667714926e-06, + "loss": 1.242, + "step": 6540 + }, + { + "epoch": 0.9260281730020528, + "grad_norm": 8.732201921785366, + "learning_rate": 2.9187453760719126e-06, + "loss": 1.2131, + "step": 6541 + }, + { + "epoch": 0.92616974587669, + "grad_norm": 8.998410826357194, + "learning_rate": 2.918180263359201e-06, + "loss": 1.2405, + "step": 6542 + }, + { + "epoch": 0.9263113187513272, + "grad_norm": 8.836307049080233, + "learning_rate": 2.9176151286630642e-06, + "loss": 1.2127, + "step": 6543 + }, + { + "epoch": 0.9264528916259644, + "grad_norm": 9.40945126627075, + "learning_rate": 2.917049972013211e-06, + "loss": 1.2778, + "step": 6544 + }, + { + "epoch": 0.9265944645006017, + "grad_norm": 8.517241550260758, + "learning_rate": 2.9164847934393523e-06, + "loss": 1.3066, + "step": 6545 + }, + { + "epoch": 0.9267360373752389, + "grad_norm": 8.341810822614773, + "learning_rate": 2.9159195929711985e-06, + "loss": 1.0654, + "step": 6546 + }, + { + "epoch": 0.9268776102498761, + "grad_norm": 8.256160863240677, + "learning_rate": 2.915354370638462e-06, + "loss": 1.2866, + "step": 6547 + }, + { + "epoch": 0.9270191831245134, + "grad_norm": 9.54032930777451, + "learning_rate": 2.914789126470856e-06, + "loss": 1.1501, + "step": 6548 + }, + { + "epoch": 0.9271607559991506, + "grad_norm": 7.692091648295752, + "learning_rate": 2.914223860498095e-06, + "loss": 1.2652, + "step": 6549 + }, + { + "epoch": 0.9273023288737878, + "grad_norm": 9.025024802895064, + "learning_rate": 2.9136585727498946e-06, + "loss": 1.2444, + "step": 6550 + }, + { + "epoch": 0.927443901748425, + "grad_norm": 9.81864124725936, + "learning_rate": 
2.9130932632559707e-06, + "loss": 1.17, + "step": 6551 + }, + { + "epoch": 0.9275854746230622, + "grad_norm": 8.530824755455992, + "learning_rate": 2.912527932046042e-06, + "loss": 1.1944, + "step": 6552 + }, + { + "epoch": 0.9277270474976994, + "grad_norm": 8.884459276634882, + "learning_rate": 2.911962579149828e-06, + "loss": 1.148, + "step": 6553 + }, + { + "epoch": 0.9278686203723366, + "grad_norm": 8.602029481107795, + "learning_rate": 2.9113972045970483e-06, + "loss": 1.0878, + "step": 6554 + }, + { + "epoch": 0.9280101932469739, + "grad_norm": 8.691055558233913, + "learning_rate": 2.910831808417424e-06, + "loss": 1.3351, + "step": 6555 + }, + { + "epoch": 0.9281517661216111, + "grad_norm": 12.100091577608966, + "learning_rate": 2.910266390640678e-06, + "loss": 1.3048, + "step": 6556 + }, + { + "epoch": 0.9282933389962483, + "grad_norm": 10.632452292300533, + "learning_rate": 2.909700951296534e-06, + "loss": 1.2498, + "step": 6557 + }, + { + "epoch": 0.9284349118708856, + "grad_norm": 7.623131695098826, + "learning_rate": 2.9091354904147175e-06, + "loss": 1.1857, + "step": 6558 + }, + { + "epoch": 0.9285764847455228, + "grad_norm": 8.922412435241423, + "learning_rate": 2.908570008024951e-06, + "loss": 1.2422, + "step": 6559 + }, + { + "epoch": 0.92871805762016, + "grad_norm": 8.38762021383673, + "learning_rate": 2.9080045041569647e-06, + "loss": 1.0616, + "step": 6560 + }, + { + "epoch": 0.9288596304947973, + "grad_norm": 11.360767221466672, + "learning_rate": 2.9074389788404867e-06, + "loss": 1.3668, + "step": 6561 + }, + { + "epoch": 0.9290012033694344, + "grad_norm": 8.864397189692093, + "learning_rate": 2.9068734321052445e-06, + "loss": 1.2145, + "step": 6562 + }, + { + "epoch": 0.9291427762440716, + "grad_norm": 10.646781585525403, + "learning_rate": 2.9063078639809707e-06, + "loss": 1.2051, + "step": 6563 + }, + { + "epoch": 0.9292843491187088, + "grad_norm": 6.992434364095357, + "learning_rate": 2.905742274497394e-06, + "loss": 1.0544, + "step": 6564 + }, + { + "epoch": 0.9294259219933461, + "grad_norm": 8.031606703380897, + "learning_rate": 2.9051766636842488e-06, + "loss": 1.2495, + "step": 6565 + }, + { + "epoch": 0.9295674948679833, + "grad_norm": 10.29987056613816, + "learning_rate": 2.9046110315712682e-06, + "loss": 1.2713, + "step": 6566 + }, + { + "epoch": 0.9297090677426205, + "grad_norm": 7.349402017985725, + "learning_rate": 2.904045378188187e-06, + "loss": 1.2658, + "step": 6567 + }, + { + "epoch": 0.9298506406172578, + "grad_norm": 8.982236073515942, + "learning_rate": 2.9034797035647427e-06, + "loss": 1.2825, + "step": 6568 + }, + { + "epoch": 0.929992213491895, + "grad_norm": 8.777587561283344, + "learning_rate": 2.9029140077306717e-06, + "loss": 1.2847, + "step": 6569 + }, + { + "epoch": 0.9301337863665322, + "grad_norm": 11.367242536378793, + "learning_rate": 2.902348290715711e-06, + "loss": 1.2888, + "step": 6570 + }, + { + "epoch": 0.9302753592411694, + "grad_norm": 9.303505464512968, + "learning_rate": 2.9017825525496e-06, + "loss": 1.2233, + "step": 6571 + }, + { + "epoch": 0.9304169321158066, + "grad_norm": 10.395080987670774, + "learning_rate": 2.9012167932620806e-06, + "loss": 1.1406, + "step": 6572 + }, + { + "epoch": 0.9305585049904438, + "grad_norm": 9.30059257998474, + "learning_rate": 2.900651012882893e-06, + "loss": 1.2561, + "step": 6573 + }, + { + "epoch": 0.930700077865081, + "grad_norm": 8.809778970290406, + "learning_rate": 2.9000852114417804e-06, + "loss": 1.2869, + "step": 6574 + }, + { + "epoch": 0.9308416507397183, + "grad_norm": 
8.26092915109754, + "learning_rate": 2.899519388968487e-06, + "loss": 1.2141, + "step": 6575 + }, + { + "epoch": 0.9309832236143555, + "grad_norm": 9.375918737534555, + "learning_rate": 2.898953545492757e-06, + "loss": 1.1435, + "step": 6576 + }, + { + "epoch": 0.9311247964889927, + "grad_norm": 9.410420149241686, + "learning_rate": 2.8983876810443364e-06, + "loss": 1.2931, + "step": 6577 + }, + { + "epoch": 0.9312663693636299, + "grad_norm": 7.852693724867427, + "learning_rate": 2.8978217956529726e-06, + "loss": 1.2174, + "step": 6578 + }, + { + "epoch": 0.9314079422382672, + "grad_norm": 8.63799865431531, + "learning_rate": 2.8972558893484145e-06, + "loss": 1.3064, + "step": 6579 + }, + { + "epoch": 0.9315495151129044, + "grad_norm": 9.545467191745589, + "learning_rate": 2.8966899621604094e-06, + "loss": 1.2518, + "step": 6580 + }, + { + "epoch": 0.9316910879875416, + "grad_norm": 7.515757981866646, + "learning_rate": 2.8961240141187085e-06, + "loss": 1.1501, + "step": 6581 + }, + { + "epoch": 0.9318326608621788, + "grad_norm": 9.66726165343509, + "learning_rate": 2.8955580452530642e-06, + "loss": 1.3135, + "step": 6582 + }, + { + "epoch": 0.931974233736816, + "grad_norm": 9.553780363724155, + "learning_rate": 2.8949920555932283e-06, + "loss": 1.3534, + "step": 6583 + }, + { + "epoch": 0.9321158066114532, + "grad_norm": 10.275422737725433, + "learning_rate": 2.8944260451689544e-06, + "loss": 1.2514, + "step": 6584 + }, + { + "epoch": 0.9322573794860904, + "grad_norm": 11.076326684302103, + "learning_rate": 2.8938600140099975e-06, + "loss": 1.2176, + "step": 6585 + }, + { + "epoch": 0.9323989523607277, + "grad_norm": 10.177300525379346, + "learning_rate": 2.893293962146114e-06, + "loss": 1.3281, + "step": 6586 + }, + { + "epoch": 0.9325405252353649, + "grad_norm": 9.81252993439403, + "learning_rate": 2.8927278896070593e-06, + "loss": 1.3277, + "step": 6587 + }, + { + "epoch": 0.9326820981100021, + "grad_norm": 9.789854317536792, + "learning_rate": 2.8921617964225923e-06, + "loss": 1.1849, + "step": 6588 + }, + { + "epoch": 0.9328236709846394, + "grad_norm": 8.39506654732316, + "learning_rate": 2.8915956826224724e-06, + "loss": 1.1546, + "step": 6589 + }, + { + "epoch": 0.9329652438592766, + "grad_norm": 8.86667640322196, + "learning_rate": 2.8910295482364594e-06, + "loss": 1.3569, + "step": 6590 + }, + { + "epoch": 0.9331068167339138, + "grad_norm": 10.095854832060486, + "learning_rate": 2.8904633932943145e-06, + "loss": 1.4124, + "step": 6591 + }, + { + "epoch": 0.933248389608551, + "grad_norm": 12.49996154779242, + "learning_rate": 2.8898972178258e-06, + "loss": 1.2336, + "step": 6592 + }, + { + "epoch": 0.9333899624831882, + "grad_norm": 8.006989286954632, + "learning_rate": 2.8893310218606797e-06, + "loss": 1.2139, + "step": 6593 + }, + { + "epoch": 0.9335315353578254, + "grad_norm": 8.39306650567314, + "learning_rate": 2.8887648054287176e-06, + "loss": 1.154, + "step": 6594 + }, + { + "epoch": 0.9336731082324626, + "grad_norm": 9.482620403753367, + "learning_rate": 2.888198568559681e-06, + "loss": 1.3703, + "step": 6595 + }, + { + "epoch": 0.9338146811070999, + "grad_norm": 8.343588609599017, + "learning_rate": 2.887632311283333e-06, + "loss": 1.1616, + "step": 6596 + }, + { + "epoch": 0.9339562539817371, + "grad_norm": 9.855993207780553, + "learning_rate": 2.8870660336294444e-06, + "loss": 1.3033, + "step": 6597 + }, + { + "epoch": 0.9340978268563743, + "grad_norm": 10.421036030230361, + "learning_rate": 2.886499735627783e-06, + "loss": 1.2913, + "step": 6598 + }, + { + "epoch": 
0.9342393997310116, + "grad_norm": 8.3549984892793, + "learning_rate": 2.885933417308118e-06, + "loss": 1.1509, + "step": 6599 + }, + { + "epoch": 0.9343809726056488, + "grad_norm": 7.817584527086021, + "learning_rate": 2.8853670787002224e-06, + "loss": 1.2136, + "step": 6600 + }, + { + "epoch": 0.934522545480286, + "grad_norm": 9.11718666318548, + "learning_rate": 2.8848007198338663e-06, + "loss": 1.2433, + "step": 6601 + }, + { + "epoch": 0.9346641183549232, + "grad_norm": 8.048604183141013, + "learning_rate": 2.884234340738823e-06, + "loss": 1.2108, + "step": 6602 + }, + { + "epoch": 0.9348056912295604, + "grad_norm": 9.074352852942575, + "learning_rate": 2.883667941444867e-06, + "loss": 1.3088, + "step": 6603 + }, + { + "epoch": 0.9349472641041976, + "grad_norm": 8.91064473192452, + "learning_rate": 2.8831015219817725e-06, + "loss": 1.226, + "step": 6604 + }, + { + "epoch": 0.9350888369788348, + "grad_norm": 9.804149403187383, + "learning_rate": 2.882535082379318e-06, + "loss": 1.1824, + "step": 6605 + }, + { + "epoch": 0.9352304098534721, + "grad_norm": 8.09444751458716, + "learning_rate": 2.8819686226672794e-06, + "loss": 1.3012, + "step": 6606 + }, + { + "epoch": 0.9353719827281093, + "grad_norm": 10.753721081088338, + "learning_rate": 2.881402142875435e-06, + "loss": 1.322, + "step": 6607 + }, + { + "epoch": 0.9355135556027465, + "grad_norm": 11.677907342390627, + "learning_rate": 2.880835643033564e-06, + "loss": 1.3912, + "step": 6608 + }, + { + "epoch": 0.9356551284773837, + "grad_norm": 10.115198736722796, + "learning_rate": 2.8802691231714463e-06, + "loss": 1.4155, + "step": 6609 + }, + { + "epoch": 0.935796701352021, + "grad_norm": 9.13534586468311, + "learning_rate": 2.879702583318866e-06, + "loss": 1.3405, + "step": 6610 + }, + { + "epoch": 0.9359382742266582, + "grad_norm": 10.582909362791248, + "learning_rate": 2.8791360235056024e-06, + "loss": 1.4045, + "step": 6611 + }, + { + "epoch": 0.9360798471012954, + "grad_norm": 7.476164006980943, + "learning_rate": 2.878569443761442e-06, + "loss": 1.3615, + "step": 6612 + }, + { + "epoch": 0.9362214199759326, + "grad_norm": 7.875077262378302, + "learning_rate": 2.878002844116168e-06, + "loss": 1.1884, + "step": 6613 + }, + { + "epoch": 0.9363629928505698, + "grad_norm": 9.875924755206867, + "learning_rate": 2.877436224599566e-06, + "loss": 1.3544, + "step": 6614 + }, + { + "epoch": 0.936504565725207, + "grad_norm": 8.316409002171927, + "learning_rate": 2.876869585241423e-06, + "loss": 1.2387, + "step": 6615 + }, + { + "epoch": 0.9366461385998442, + "grad_norm": 8.63823447520573, + "learning_rate": 2.8763029260715282e-06, + "loss": 1.2931, + "step": 6616 + }, + { + "epoch": 0.9367877114744815, + "grad_norm": 11.335977881561432, + "learning_rate": 2.8757362471196677e-06, + "loss": 1.3856, + "step": 6617 + }, + { + "epoch": 0.9369292843491187, + "grad_norm": 9.340413087781002, + "learning_rate": 2.875169548415633e-06, + "loss": 1.3035, + "step": 6618 + }, + { + "epoch": 0.9370708572237559, + "grad_norm": 8.148765667079441, + "learning_rate": 2.874602829989215e-06, + "loss": 1.1685, + "step": 6619 + }, + { + "epoch": 0.9372124300983932, + "grad_norm": 9.296628804513292, + "learning_rate": 2.8740360918702053e-06, + "loss": 1.1287, + "step": 6620 + }, + { + "epoch": 0.9373540029730304, + "grad_norm": 9.537239969091024, + "learning_rate": 2.873469334088398e-06, + "loss": 1.3485, + "step": 6621 + }, + { + "epoch": 0.9374955758476676, + "grad_norm": 6.564642856061305, + "learning_rate": 2.8729025566735856e-06, + "loss": 1.1953, + "step": 
6622 + }, + { + "epoch": 0.9376371487223049, + "grad_norm": 7.5244512617712855, + "learning_rate": 2.8723357596555644e-06, + "loss": 1.3298, + "step": 6623 + }, + { + "epoch": 0.937778721596942, + "grad_norm": 10.462845597660719, + "learning_rate": 2.871768943064129e-06, + "loss": 1.1884, + "step": 6624 + }, + { + "epoch": 0.9379202944715792, + "grad_norm": 12.074268668005839, + "learning_rate": 2.8712021069290786e-06, + "loss": 1.3375, + "step": 6625 + }, + { + "epoch": 0.9380618673462164, + "grad_norm": 9.815056534622306, + "learning_rate": 2.87063525128021e-06, + "loss": 1.245, + "step": 6626 + }, + { + "epoch": 0.9382034402208537, + "grad_norm": 9.321579162015976, + "learning_rate": 2.870068376147322e-06, + "loss": 1.3478, + "step": 6627 + }, + { + "epoch": 0.9383450130954909, + "grad_norm": 11.36984639220873, + "learning_rate": 2.8695014815602157e-06, + "loss": 1.16, + "step": 6628 + }, + { + "epoch": 0.9384865859701281, + "grad_norm": 9.01681854439789, + "learning_rate": 2.8689345675486917e-06, + "loss": 1.3142, + "step": 6629 + }, + { + "epoch": 0.9386281588447654, + "grad_norm": 8.121628281771189, + "learning_rate": 2.868367634142553e-06, + "loss": 1.1232, + "step": 6630 + }, + { + "epoch": 0.9387697317194026, + "grad_norm": 8.358435804424156, + "learning_rate": 2.8678006813716024e-06, + "loss": 1.2222, + "step": 6631 + }, + { + "epoch": 0.9389113045940398, + "grad_norm": 7.502891237556854, + "learning_rate": 2.8672337092656444e-06, + "loss": 1.315, + "step": 6632 + }, + { + "epoch": 0.939052877468677, + "grad_norm": 7.91995631350404, + "learning_rate": 2.8666667178544833e-06, + "loss": 1.0387, + "step": 6633 + }, + { + "epoch": 0.9391944503433142, + "grad_norm": 7.539960327448809, + "learning_rate": 2.866099707167927e-06, + "loss": 1.0807, + "step": 6634 + }, + { + "epoch": 0.9393360232179514, + "grad_norm": 7.293464075807713, + "learning_rate": 2.8655326772357816e-06, + "loss": 1.2311, + "step": 6635 + }, + { + "epoch": 0.9394775960925886, + "grad_norm": 10.682224689473923, + "learning_rate": 2.8649656280878563e-06, + "loss": 1.2224, + "step": 6636 + }, + { + "epoch": 0.9396191689672259, + "grad_norm": 8.749801633493657, + "learning_rate": 2.8643985597539597e-06, + "loss": 1.1853, + "step": 6637 + }, + { + "epoch": 0.9397607418418631, + "grad_norm": 8.766213789968038, + "learning_rate": 2.863831472263904e-06, + "loss": 1.2071, + "step": 6638 + }, + { + "epoch": 0.9399023147165003, + "grad_norm": 7.2149629774088835, + "learning_rate": 2.8632643656474974e-06, + "loss": 1.124, + "step": 6639 + }, + { + "epoch": 0.9400438875911376, + "grad_norm": 7.72372585023067, + "learning_rate": 2.8626972399345543e-06, + "loss": 1.1952, + "step": 6640 + }, + { + "epoch": 0.9401854604657748, + "grad_norm": 8.779337067135474, + "learning_rate": 2.8621300951548877e-06, + "loss": 1.2185, + "step": 6641 + }, + { + "epoch": 0.940327033340412, + "grad_norm": 11.538343775221612, + "learning_rate": 2.861562931338312e-06, + "loss": 1.3245, + "step": 6642 + }, + { + "epoch": 0.9404686062150492, + "grad_norm": 9.511468891097136, + "learning_rate": 2.8609957485146433e-06, + "loss": 1.1659, + "step": 6643 + }, + { + "epoch": 0.9406101790896865, + "grad_norm": 9.478499325467647, + "learning_rate": 2.8604285467136966e-06, + "loss": 1.3014, + "step": 6644 + }, + { + "epoch": 0.9407517519643236, + "grad_norm": 10.434167752602654, + "learning_rate": 2.8598613259652895e-06, + "loss": 1.2932, + "step": 6645 + }, + { + "epoch": 0.9408933248389608, + "grad_norm": 8.258077655097292, + "learning_rate": 
2.8592940862992417e-06, + "loss": 1.357, + "step": 6646 + }, + { + "epoch": 0.941034897713598, + "grad_norm": 9.970529236057146, + "learning_rate": 2.858726827745372e-06, + "loss": 1.2924, + "step": 6647 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 9.385962828488799, + "learning_rate": 2.858159550333499e-06, + "loss": 1.1887, + "step": 6648 + }, + { + "epoch": 0.9413180434628725, + "grad_norm": 9.759135734564026, + "learning_rate": 2.8575922540934464e-06, + "loss": 1.2149, + "step": 6649 + }, + { + "epoch": 0.9414596163375097, + "grad_norm": 9.761685533531923, + "learning_rate": 2.857024939055036e-06, + "loss": 1.1797, + "step": 6650 + }, + { + "epoch": 0.941601189212147, + "grad_norm": 8.749087476831289, + "learning_rate": 2.8564576052480895e-06, + "loss": 1.0866, + "step": 6651 + }, + { + "epoch": 0.9417427620867842, + "grad_norm": 8.643305716195238, + "learning_rate": 2.8558902527024337e-06, + "loss": 1.1655, + "step": 6652 + }, + { + "epoch": 0.9418843349614214, + "grad_norm": 9.836617347061923, + "learning_rate": 2.8553228814478927e-06, + "loss": 1.2315, + "step": 6653 + }, + { + "epoch": 0.9420259078360587, + "grad_norm": 11.094659472513573, + "learning_rate": 2.8547554915142923e-06, + "loss": 1.1191, + "step": 6654 + }, + { + "epoch": 0.9421674807106958, + "grad_norm": 10.738379296569487, + "learning_rate": 2.8541880829314604e-06, + "loss": 1.2574, + "step": 6655 + }, + { + "epoch": 0.942309053585333, + "grad_norm": 9.502183763580636, + "learning_rate": 2.8536206557292244e-06, + "loss": 1.2936, + "step": 6656 + }, + { + "epoch": 0.9424506264599702, + "grad_norm": 8.120911963603932, + "learning_rate": 2.8530532099374146e-06, + "loss": 1.3893, + "step": 6657 + }, + { + "epoch": 0.9425921993346075, + "grad_norm": 8.22825809404439, + "learning_rate": 2.8524857455858606e-06, + "loss": 1.2178, + "step": 6658 + }, + { + "epoch": 0.9427337722092447, + "grad_norm": 10.257884273001952, + "learning_rate": 2.8519182627043953e-06, + "loss": 1.242, + "step": 6659 + }, + { + "epoch": 0.9428753450838819, + "grad_norm": 8.29622301316567, + "learning_rate": 2.8513507613228474e-06, + "loss": 1.152, + "step": 6660 + }, + { + "epoch": 0.9430169179585192, + "grad_norm": 8.595488993086631, + "learning_rate": 2.8507832414710528e-06, + "loss": 1.2203, + "step": 6661 + }, + { + "epoch": 0.9431584908331564, + "grad_norm": 8.479722463276685, + "learning_rate": 2.850215703178845e-06, + "loss": 1.1957, + "step": 6662 + }, + { + "epoch": 0.9433000637077936, + "grad_norm": 10.341718122634523, + "learning_rate": 2.8496481464760585e-06, + "loss": 1.2457, + "step": 6663 + }, + { + "epoch": 0.9434416365824309, + "grad_norm": 9.997703288501908, + "learning_rate": 2.8490805713925298e-06, + "loss": 1.2865, + "step": 6664 + }, + { + "epoch": 0.943583209457068, + "grad_norm": 10.00966215647743, + "learning_rate": 2.848512977958095e-06, + "loss": 1.2826, + "step": 6665 + }, + { + "epoch": 0.9437247823317052, + "grad_norm": 8.2884025346956, + "learning_rate": 2.8479453662025937e-06, + "loss": 1.1933, + "step": 6666 + }, + { + "epoch": 0.9438663552063424, + "grad_norm": 8.962681009547659, + "learning_rate": 2.847377736155863e-06, + "loss": 1.1813, + "step": 6667 + }, + { + "epoch": 0.9440079280809797, + "grad_norm": 9.53113433267418, + "learning_rate": 2.8468100878477443e-06, + "loss": 1.2806, + "step": 6668 + }, + { + "epoch": 0.9441495009556169, + "grad_norm": 7.465224451470957, + "learning_rate": 2.8462424213080786e-06, + "loss": 1.172, + "step": 6669 + }, + { + "epoch": 0.9442910738302541, + "grad_norm": 
9.754021622140058, + "learning_rate": 2.845674736566706e-06, + "loss": 1.1674, + "step": 6670 + }, + { + "epoch": 0.9444326467048914, + "grad_norm": 10.837934094175306, + "learning_rate": 2.8451070336534703e-06, + "loss": 1.2877, + "step": 6671 + }, + { + "epoch": 0.9445742195795286, + "grad_norm": 8.216037549223708, + "learning_rate": 2.8445393125982152e-06, + "loss": 1.2972, + "step": 6672 + }, + { + "epoch": 0.9447157924541658, + "grad_norm": 8.300658546887211, + "learning_rate": 2.8439715734307856e-06, + "loss": 1.2543, + "step": 6673 + }, + { + "epoch": 0.944857365328803, + "grad_norm": 9.202218443385549, + "learning_rate": 2.8434038161810266e-06, + "loss": 1.1877, + "step": 6674 + }, + { + "epoch": 0.9449989382034403, + "grad_norm": 7.0570895645838725, + "learning_rate": 2.8428360408787857e-06, + "loss": 1.1736, + "step": 6675 + }, + { + "epoch": 0.9451405110780774, + "grad_norm": 8.71189386566219, + "learning_rate": 2.84226824755391e-06, + "loss": 1.1358, + "step": 6676 + }, + { + "epoch": 0.9452820839527146, + "grad_norm": 10.216462512151994, + "learning_rate": 2.8417004362362465e-06, + "loss": 1.2196, + "step": 6677 + }, + { + "epoch": 0.9454236568273519, + "grad_norm": 8.727450278672416, + "learning_rate": 2.8411326069556456e-06, + "loss": 1.1787, + "step": 6678 + }, + { + "epoch": 0.9455652297019891, + "grad_norm": 7.6877515798201586, + "learning_rate": 2.840564759741959e-06, + "loss": 1.0634, + "step": 6679 + }, + { + "epoch": 0.9457068025766263, + "grad_norm": 9.35676122987493, + "learning_rate": 2.8399968946250373e-06, + "loss": 1.2692, + "step": 6680 + }, + { + "epoch": 0.9458483754512635, + "grad_norm": 8.242019955346866, + "learning_rate": 2.839429011634731e-06, + "loss": 1.1728, + "step": 6681 + }, + { + "epoch": 0.9459899483259008, + "grad_norm": 8.42091876601179, + "learning_rate": 2.8388611108008957e-06, + "loss": 1.1809, + "step": 6682 + }, + { + "epoch": 0.946131521200538, + "grad_norm": 9.156832835945472, + "learning_rate": 2.838293192153384e-06, + "loss": 1.0894, + "step": 6683 + }, + { + "epoch": 0.9462730940751752, + "grad_norm": 9.752816600378836, + "learning_rate": 2.8377252557220513e-06, + "loss": 1.1777, + "step": 6684 + }, + { + "epoch": 0.9464146669498125, + "grad_norm": 7.14149145967106, + "learning_rate": 2.8371573015367543e-06, + "loss": 1.0272, + "step": 6685 + }, + { + "epoch": 0.9465562398244496, + "grad_norm": 7.558869554480379, + "learning_rate": 2.836589329627349e-06, + "loss": 1.0719, + "step": 6686 + }, + { + "epoch": 0.9466978126990868, + "grad_norm": 9.61855110027257, + "learning_rate": 2.8360213400236936e-06, + "loss": 1.3298, + "step": 6687 + }, + { + "epoch": 0.946839385573724, + "grad_norm": 9.068647732936899, + "learning_rate": 2.8354533327556465e-06, + "loss": 1.3652, + "step": 6688 + }, + { + "epoch": 0.9469809584483613, + "grad_norm": 9.62198740393909, + "learning_rate": 2.834885307853068e-06, + "loss": 1.0451, + "step": 6689 + }, + { + "epoch": 0.9471225313229985, + "grad_norm": 8.424316052484524, + "learning_rate": 2.8343172653458194e-06, + "loss": 1.3598, + "step": 6690 + }, + { + "epoch": 0.9472641041976357, + "grad_norm": 9.631444185689432, + "learning_rate": 2.833749205263761e-06, + "loss": 1.2112, + "step": 6691 + }, + { + "epoch": 0.947405677072273, + "grad_norm": 8.663914561406047, + "learning_rate": 2.8331811276367554e-06, + "loss": 1.3214, + "step": 6692 + }, + { + "epoch": 0.9475472499469102, + "grad_norm": 9.474846718386218, + "learning_rate": 2.832613032494666e-06, + "loss": 1.2373, + "step": 6693 + }, + { + "epoch": 
0.9476888228215474, + "grad_norm": 8.070017540592383, + "learning_rate": 2.8320449198673585e-06, + "loss": 1.2941, + "step": 6694 + }, + { + "epoch": 0.9478303956961847, + "grad_norm": 11.21716195345503, + "learning_rate": 2.8314767897846963e-06, + "loss": 1.3578, + "step": 6695 + }, + { + "epoch": 0.9479719685708218, + "grad_norm": 10.92286339300041, + "learning_rate": 2.830908642276547e-06, + "loss": 1.2414, + "step": 6696 + }, + { + "epoch": 0.948113541445459, + "grad_norm": 9.3684771415976, + "learning_rate": 2.830340477372777e-06, + "loss": 1.194, + "step": 6697 + }, + { + "epoch": 0.9482551143200962, + "grad_norm": 9.048311900138472, + "learning_rate": 2.829772295103254e-06, + "loss": 1.293, + "step": 6698 + }, + { + "epoch": 0.9483966871947335, + "grad_norm": 8.21412395057383, + "learning_rate": 2.829204095497848e-06, + "loss": 1.2903, + "step": 6699 + }, + { + "epoch": 0.9485382600693707, + "grad_norm": 10.927307262374736, + "learning_rate": 2.828635878586428e-06, + "loss": 1.3195, + "step": 6700 + }, + { + "epoch": 0.9486798329440079, + "grad_norm": 11.07254796148452, + "learning_rate": 2.828067644398864e-06, + "loss": 1.3441, + "step": 6701 + }, + { + "epoch": 0.9488214058186452, + "grad_norm": 10.045070551410403, + "learning_rate": 2.8274993929650297e-06, + "loss": 1.2464, + "step": 6702 + }, + { + "epoch": 0.9489629786932824, + "grad_norm": 10.989669457168947, + "learning_rate": 2.826931124314796e-06, + "loss": 1.2063, + "step": 6703 + }, + { + "epoch": 0.9491045515679196, + "grad_norm": 7.9623782545331085, + "learning_rate": 2.826362838478037e-06, + "loss": 1.1038, + "step": 6704 + }, + { + "epoch": 0.9492461244425568, + "grad_norm": 9.658052442896864, + "learning_rate": 2.825794535484627e-06, + "loss": 1.2891, + "step": 6705 + }, + { + "epoch": 0.9493876973171941, + "grad_norm": 8.674017014451389, + "learning_rate": 2.825226215364441e-06, + "loss": 1.2031, + "step": 6706 + }, + { + "epoch": 0.9495292701918312, + "grad_norm": 8.15375582973021, + "learning_rate": 2.824657878147355e-06, + "loss": 1.1695, + "step": 6707 + }, + { + "epoch": 0.9496708430664684, + "grad_norm": 9.403665485359785, + "learning_rate": 2.8240895238632473e-06, + "loss": 1.1995, + "step": 6708 + }, + { + "epoch": 0.9498124159411057, + "grad_norm": 9.62037329506635, + "learning_rate": 2.8235211525419937e-06, + "loss": 1.159, + "step": 6709 + }, + { + "epoch": 0.9499539888157429, + "grad_norm": 11.371698256647262, + "learning_rate": 2.8229527642134753e-06, + "loss": 1.3686, + "step": 6710 + }, + { + "epoch": 0.9500955616903801, + "grad_norm": 9.444839957937598, + "learning_rate": 2.8223843589075705e-06, + "loss": 1.3162, + "step": 6711 + }, + { + "epoch": 0.9502371345650173, + "grad_norm": 7.781367167008184, + "learning_rate": 2.8218159366541615e-06, + "loss": 1.0773, + "step": 6712 + }, + { + "epoch": 0.9503787074396546, + "grad_norm": 10.177178706825584, + "learning_rate": 2.8212474974831277e-06, + "loss": 1.1938, + "step": 6713 + }, + { + "epoch": 0.9505202803142918, + "grad_norm": 9.689766698652575, + "learning_rate": 2.8206790414243525e-06, + "loss": 1.233, + "step": 6714 + }, + { + "epoch": 0.950661853188929, + "grad_norm": 11.059873797229539, + "learning_rate": 2.8201105685077184e-06, + "loss": 1.3191, + "step": 6715 + }, + { + "epoch": 0.9508034260635663, + "grad_norm": 7.8112943405624105, + "learning_rate": 2.8195420787631113e-06, + "loss": 1.1986, + "step": 6716 + }, + { + "epoch": 0.9509449989382034, + "grad_norm": 9.008086915811397, + "learning_rate": 2.818973572220416e-06, + "loss": 1.1906, + 
"step": 6717 + }, + { + "epoch": 0.9510865718128406, + "grad_norm": 9.613768526862186, + "learning_rate": 2.818405048909517e-06, + "loss": 1.2139, + "step": 6718 + }, + { + "epoch": 0.9512281446874779, + "grad_norm": 9.910347942322238, + "learning_rate": 2.817836508860302e-06, + "loss": 1.1895, + "step": 6719 + }, + { + "epoch": 0.9513697175621151, + "grad_norm": 8.13985364777688, + "learning_rate": 2.817267952102659e-06, + "loss": 1.3553, + "step": 6720 + }, + { + "epoch": 0.9515112904367523, + "grad_norm": 8.009168616116577, + "learning_rate": 2.8166993786664757e-06, + "loss": 1.2722, + "step": 6721 + }, + { + "epoch": 0.9516528633113895, + "grad_norm": 10.192194880731844, + "learning_rate": 2.816130788581643e-06, + "loss": 1.3005, + "step": 6722 + }, + { + "epoch": 0.9517944361860268, + "grad_norm": 11.828695830223436, + "learning_rate": 2.8155621818780497e-06, + "loss": 1.3706, + "step": 6723 + }, + { + "epoch": 0.951936009060664, + "grad_norm": 9.347325282090017, + "learning_rate": 2.8149935585855885e-06, + "loss": 1.2674, + "step": 6724 + }, + { + "epoch": 0.9520775819353012, + "grad_norm": 9.533119293513296, + "learning_rate": 2.8144249187341506e-06, + "loss": 1.0612, + "step": 6725 + }, + { + "epoch": 0.9522191548099385, + "grad_norm": 8.130122052129325, + "learning_rate": 2.8138562623536293e-06, + "loss": 1.2535, + "step": 6726 + }, + { + "epoch": 0.9523607276845756, + "grad_norm": 9.106597738754484, + "learning_rate": 2.8132875894739175e-06, + "loss": 1.2063, + "step": 6727 + }, + { + "epoch": 0.9525023005592128, + "grad_norm": 8.951163564732669, + "learning_rate": 2.812718900124912e-06, + "loss": 1.1636, + "step": 6728 + }, + { + "epoch": 0.95264387343385, + "grad_norm": 9.735434975019453, + "learning_rate": 2.8121501943365066e-06, + "loss": 1.3208, + "step": 6729 + }, + { + "epoch": 0.9527854463084873, + "grad_norm": 10.947793356760267, + "learning_rate": 2.8115814721385975e-06, + "loss": 1.3122, + "step": 6730 + }, + { + "epoch": 0.9529270191831245, + "grad_norm": 10.693640300248262, + "learning_rate": 2.8110127335610833e-06, + "loss": 1.1756, + "step": 6731 + }, + { + "epoch": 0.9530685920577617, + "grad_norm": 8.222967073994642, + "learning_rate": 2.8104439786338617e-06, + "loss": 1.1511, + "step": 6732 + }, + { + "epoch": 0.953210164932399, + "grad_norm": 12.19957021988089, + "learning_rate": 2.809875207386832e-06, + "loss": 1.2904, + "step": 6733 + }, + { + "epoch": 0.9533517378070362, + "grad_norm": 9.71107773744778, + "learning_rate": 2.809306419849893e-06, + "loss": 1.4051, + "step": 6734 + }, + { + "epoch": 0.9534933106816734, + "grad_norm": 9.331471620803617, + "learning_rate": 2.8087376160529463e-06, + "loss": 1.284, + "step": 6735 + }, + { + "epoch": 0.9536348835563107, + "grad_norm": 11.550949678557279, + "learning_rate": 2.808168796025893e-06, + "loss": 1.1989, + "step": 6736 + }, + { + "epoch": 0.9537764564309479, + "grad_norm": 12.279651489282674, + "learning_rate": 2.8075999597986364e-06, + "loss": 1.2099, + "step": 6737 + }, + { + "epoch": 0.953918029305585, + "grad_norm": 11.756331665469277, + "learning_rate": 2.8070311074010793e-06, + "loss": 1.3089, + "step": 6738 + }, + { + "epoch": 0.9540596021802222, + "grad_norm": 10.087303914146121, + "learning_rate": 2.806462238863125e-06, + "loss": 1.2616, + "step": 6739 + }, + { + "epoch": 0.9542011750548595, + "grad_norm": 10.370676633561121, + "learning_rate": 2.8058933542146804e-06, + "loss": 1.0093, + "step": 6740 + }, + { + "epoch": 0.9543427479294967, + "grad_norm": 15.277681335761336, + "learning_rate": 
2.80532445348565e-06, + "loss": 1.3581, + "step": 6741 + }, + { + "epoch": 0.9544843208041339, + "grad_norm": 12.232899466017987, + "learning_rate": 2.8047555367059404e-06, + "loss": 1.4334, + "step": 6742 + }, + { + "epoch": 0.9546258936787712, + "grad_norm": 9.48459942974786, + "learning_rate": 2.80418660390546e-06, + "loss": 1.0948, + "step": 6743 + }, + { + "epoch": 0.9547674665534084, + "grad_norm": 10.2445236090826, + "learning_rate": 2.803617655114116e-06, + "loss": 1.248, + "step": 6744 + }, + { + "epoch": 0.9549090394280456, + "grad_norm": 8.632542946866362, + "learning_rate": 2.803048690361818e-06, + "loss": 1.1209, + "step": 6745 + }, + { + "epoch": 0.9550506123026828, + "grad_norm": 8.678577698935745, + "learning_rate": 2.8024797096784766e-06, + "loss": 1.1799, + "step": 6746 + }, + { + "epoch": 0.9551921851773201, + "grad_norm": 11.284371322001617, + "learning_rate": 2.8019107130940025e-06, + "loss": 1.1782, + "step": 6747 + }, + { + "epoch": 0.9553337580519572, + "grad_norm": 8.964302901032788, + "learning_rate": 2.8013417006383078e-06, + "loss": 1.2962, + "step": 6748 + }, + { + "epoch": 0.9554753309265944, + "grad_norm": 8.894267055255048, + "learning_rate": 2.8007726723413046e-06, + "loss": 1.1119, + "step": 6749 + }, + { + "epoch": 0.9556169038012317, + "grad_norm": 12.9287114907305, + "learning_rate": 2.800203628232906e-06, + "loss": 1.3522, + "step": 6750 + }, + { + "epoch": 0.9557584766758689, + "grad_norm": 7.423010809472061, + "learning_rate": 2.7996345683430266e-06, + "loss": 1.1029, + "step": 6751 + }, + { + "epoch": 0.9559000495505061, + "grad_norm": 7.135210718306006, + "learning_rate": 2.799065492701581e-06, + "loss": 1.1291, + "step": 6752 + }, + { + "epoch": 0.9560416224251433, + "grad_norm": 11.898341318143755, + "learning_rate": 2.7984964013384853e-06, + "loss": 1.3048, + "step": 6753 + }, + { + "epoch": 0.9561831952997806, + "grad_norm": 11.185522149712938, + "learning_rate": 2.7979272942836566e-06, + "loss": 1.3042, + "step": 6754 + }, + { + "epoch": 0.9563247681744178, + "grad_norm": 9.982277997703605, + "learning_rate": 2.7973581715670124e-06, + "loss": 1.272, + "step": 6755 + }, + { + "epoch": 0.956466341049055, + "grad_norm": 8.709874164359917, + "learning_rate": 2.7967890332184705e-06, + "loss": 1.2257, + "step": 6756 + }, + { + "epoch": 0.9566079139236923, + "grad_norm": 7.691879885588205, + "learning_rate": 2.7962198792679506e-06, + "loss": 1.1192, + "step": 6757 + }, + { + "epoch": 0.9567494867983294, + "grad_norm": 8.540626822344755, + "learning_rate": 2.795650709745373e-06, + "loss": 1.1699, + "step": 6758 + }, + { + "epoch": 0.9568910596729666, + "grad_norm": 10.538374757489738, + "learning_rate": 2.7950815246806575e-06, + "loss": 1.2333, + "step": 6759 + }, + { + "epoch": 0.9570326325476038, + "grad_norm": 11.345002244501073, + "learning_rate": 2.794512324103726e-06, + "loss": 1.286, + "step": 6760 + }, + { + "epoch": 0.9571742054222411, + "grad_norm": 10.294283967798245, + "learning_rate": 2.7939431080445016e-06, + "loss": 1.2395, + "step": 6761 + }, + { + "epoch": 0.9573157782968783, + "grad_norm": 7.0718465078793855, + "learning_rate": 2.7933738765329073e-06, + "loss": 1.1913, + "step": 6762 + }, + { + "epoch": 0.9574573511715155, + "grad_norm": 7.779537395462685, + "learning_rate": 2.7928046295988666e-06, + "loss": 1.1875, + "step": 6763 + }, + { + "epoch": 0.9575989240461528, + "grad_norm": 8.56115533893088, + "learning_rate": 2.792235367272305e-06, + "loss": 1.0891, + "step": 6764 + }, + { + "epoch": 0.95774049692079, + "grad_norm": 
8.55687177807783, + "learning_rate": 2.7916660895831487e-06, + "loss": 1.1343, + "step": 6765 + }, + { + "epoch": 0.9578820697954272, + "grad_norm": 9.368613140183031, + "learning_rate": 2.791096796561323e-06, + "loss": 1.3072, + "step": 6766 + }, + { + "epoch": 0.9580236426700645, + "grad_norm": 9.204747625286357, + "learning_rate": 2.790527488236755e-06, + "loss": 1.3667, + "step": 6767 + }, + { + "epoch": 0.9581652155447017, + "grad_norm": 7.11704278290027, + "learning_rate": 2.7899581646393746e-06, + "loss": 1.1033, + "step": 6768 + }, + { + "epoch": 0.9583067884193388, + "grad_norm": 9.534550129880236, + "learning_rate": 2.789388825799109e-06, + "loss": 1.2974, + "step": 6769 + }, + { + "epoch": 0.958448361293976, + "grad_norm": 8.966253803868739, + "learning_rate": 2.78881947174589e-06, + "loss": 1.161, + "step": 6770 + }, + { + "epoch": 0.9585899341686133, + "grad_norm": 8.989070613969483, + "learning_rate": 2.788250102509646e-06, + "loss": 1.2873, + "step": 6771 + }, + { + "epoch": 0.9587315070432505, + "grad_norm": 7.3011488232609185, + "learning_rate": 2.7876807181203085e-06, + "loss": 1.1408, + "step": 6772 + }, + { + "epoch": 0.9588730799178877, + "grad_norm": 8.961812276803613, + "learning_rate": 2.7871113186078102e-06, + "loss": 1.339, + "step": 6773 + }, + { + "epoch": 0.959014652792525, + "grad_norm": 9.435664320514672, + "learning_rate": 2.786541904002085e-06, + "loss": 1.1699, + "step": 6774 + }, + { + "epoch": 0.9591562256671622, + "grad_norm": 8.149750557107033, + "learning_rate": 2.785972474333064e-06, + "loss": 1.2308, + "step": 6775 + }, + { + "epoch": 0.9592977985417994, + "grad_norm": 9.812349160365313, + "learning_rate": 2.7854030296306846e-06, + "loss": 1.1855, + "step": 6776 + }, + { + "epoch": 0.9594393714164366, + "grad_norm": 7.781170090108004, + "learning_rate": 2.7848335699248796e-06, + "loss": 1.1634, + "step": 6777 + }, + { + "epoch": 0.9595809442910739, + "grad_norm": 7.651292589477605, + "learning_rate": 2.7842640952455867e-06, + "loss": 1.2265, + "step": 6778 + }, + { + "epoch": 0.959722517165711, + "grad_norm": 7.977948314056554, + "learning_rate": 2.783694605622743e-06, + "loss": 1.2134, + "step": 6779 + }, + { + "epoch": 0.9598640900403482, + "grad_norm": 8.116024752801456, + "learning_rate": 2.7831251010862847e-06, + "loss": 1.0956, + "step": 6780 + }, + { + "epoch": 0.9600056629149855, + "grad_norm": 7.218599127972222, + "learning_rate": 2.7825555816661503e-06, + "loss": 1.1939, + "step": 6781 + }, + { + "epoch": 0.9601472357896227, + "grad_norm": 8.665751286606842, + "learning_rate": 2.7819860473922805e-06, + "loss": 1.2139, + "step": 6782 + }, + { + "epoch": 0.9602888086642599, + "grad_norm": 9.267588209623069, + "learning_rate": 2.781416498294614e-06, + "loss": 1.1568, + "step": 6783 + }, + { + "epoch": 0.9604303815388971, + "grad_norm": 8.906223872631633, + "learning_rate": 2.7808469344030923e-06, + "loss": 1.2797, + "step": 6784 + }, + { + "epoch": 0.9605719544135344, + "grad_norm": 7.787460606262596, + "learning_rate": 2.780277355747657e-06, + "loss": 1.1727, + "step": 6785 + }, + { + "epoch": 0.9607135272881716, + "grad_norm": 8.694141072541774, + "learning_rate": 2.7797077623582503e-06, + "loss": 1.2807, + "step": 6786 + }, + { + "epoch": 0.9608551001628088, + "grad_norm": 13.779870896309054, + "learning_rate": 2.779138154264814e-06, + "loss": 1.1821, + "step": 6787 + }, + { + "epoch": 0.9609966730374461, + "grad_norm": 12.026936498464018, + "learning_rate": 2.778568531497294e-06, + "loss": 1.2051, + "step": 6788 + }, + { + "epoch": 
0.9611382459120833, + "grad_norm": 7.0609272243680765, + "learning_rate": 2.777998894085634e-06, + "loss": 1.1013, + "step": 6789 + }, + { + "epoch": 0.9612798187867204, + "grad_norm": 11.759092019249934, + "learning_rate": 2.7774292420597784e-06, + "loss": 1.1277, + "step": 6790 + }, + { + "epoch": 0.9614213916613576, + "grad_norm": 12.403394394474555, + "learning_rate": 2.776859575449675e-06, + "loss": 1.2321, + "step": 6791 + }, + { + "epoch": 0.9615629645359949, + "grad_norm": 9.635096815144466, + "learning_rate": 2.7762898942852705e-06, + "loss": 1.1893, + "step": 6792 + }, + { + "epoch": 0.9617045374106321, + "grad_norm": 9.582157847971276, + "learning_rate": 2.775720198596512e-06, + "loss": 1.2294, + "step": 6793 + }, + { + "epoch": 0.9618461102852693, + "grad_norm": 10.949881036538958, + "learning_rate": 2.7751504884133484e-06, + "loss": 1.2419, + "step": 6794 + }, + { + "epoch": 0.9619876831599066, + "grad_norm": 9.256407709214153, + "learning_rate": 2.7745807637657287e-06, + "loss": 1.2259, + "step": 6795 + }, + { + "epoch": 0.9621292560345438, + "grad_norm": 8.986366966676, + "learning_rate": 2.774011024683603e-06, + "loss": 1.1775, + "step": 6796 + }, + { + "epoch": 0.962270828909181, + "grad_norm": 9.635300710169384, + "learning_rate": 2.7734412711969215e-06, + "loss": 1.1805, + "step": 6797 + }, + { + "epoch": 0.9624124017838183, + "grad_norm": 12.047289176975536, + "learning_rate": 2.7728715033356366e-06, + "loss": 1.1853, + "step": 6798 + }, + { + "epoch": 0.9625539746584555, + "grad_norm": 10.516026210074628, + "learning_rate": 2.7723017211297006e-06, + "loss": 1.2077, + "step": 6799 + }, + { + "epoch": 0.9626955475330926, + "grad_norm": 9.285853416322787, + "learning_rate": 2.7717319246090657e-06, + "loss": 1.3549, + "step": 6800 + }, + { + "epoch": 0.9628371204077298, + "grad_norm": 9.115124518903578, + "learning_rate": 2.7711621138036864e-06, + "loss": 1.1669, + "step": 6801 + }, + { + "epoch": 0.9629786932823671, + "grad_norm": 11.605474009170804, + "learning_rate": 2.7705922887435172e-06, + "loss": 1.3779, + "step": 6802 + }, + { + "epoch": 0.9631202661570043, + "grad_norm": 9.729126717092655, + "learning_rate": 2.770022449458513e-06, + "loss": 1.1608, + "step": 6803 + }, + { + "epoch": 0.9632618390316415, + "grad_norm": 9.6604034478582, + "learning_rate": 2.7694525959786297e-06, + "loss": 1.2459, + "step": 6804 + }, + { + "epoch": 0.9634034119062788, + "grad_norm": 10.115209296225428, + "learning_rate": 2.7688827283338236e-06, + "loss": 1.1445, + "step": 6805 + }, + { + "epoch": 0.963544984780916, + "grad_norm": 10.226904552267582, + "learning_rate": 2.7683128465540545e-06, + "loss": 1.1872, + "step": 6806 + }, + { + "epoch": 0.9636865576555532, + "grad_norm": 8.215287672266014, + "learning_rate": 2.7677429506692788e-06, + "loss": 1.2181, + "step": 6807 + }, + { + "epoch": 0.9638281305301905, + "grad_norm": 7.688158456779034, + "learning_rate": 2.7671730407094553e-06, + "loss": 1.287, + "step": 6808 + }, + { + "epoch": 0.9639697034048277, + "grad_norm": 10.801384476825687, + "learning_rate": 2.7666031167045444e-06, + "loss": 1.3857, + "step": 6809 + }, + { + "epoch": 0.9641112762794648, + "grad_norm": 8.96447311665763, + "learning_rate": 2.766033178684506e-06, + "loss": 1.2443, + "step": 6810 + }, + { + "epoch": 0.964252849154102, + "grad_norm": 8.28223801962788, + "learning_rate": 2.7654632266793025e-06, + "loss": 1.309, + "step": 6811 + }, + { + "epoch": 0.9643944220287393, + "grad_norm": 8.100642388384529, + "learning_rate": 2.764893260718895e-06, + "loss": 
1.1993, + "step": 6812 + }, + { + "epoch": 0.9645359949033765, + "grad_norm": 9.109710844205708, + "learning_rate": 2.764323280833246e-06, + "loss": 1.1567, + "step": 6813 + }, + { + "epoch": 0.9646775677780137, + "grad_norm": 8.685445199469529, + "learning_rate": 2.7637532870523193e-06, + "loss": 1.187, + "step": 6814 + }, + { + "epoch": 0.964819140652651, + "grad_norm": 10.407845956118349, + "learning_rate": 2.7631832794060787e-06, + "loss": 1.2914, + "step": 6815 + }, + { + "epoch": 0.9649607135272882, + "grad_norm": 8.8739644910589, + "learning_rate": 2.7626132579244896e-06, + "loss": 1.2506, + "step": 6816 + }, + { + "epoch": 0.9651022864019254, + "grad_norm": 7.525050227102152, + "learning_rate": 2.7620432226375175e-06, + "loss": 1.1452, + "step": 6817 + }, + { + "epoch": 0.9652438592765626, + "grad_norm": 10.033387333755243, + "learning_rate": 2.761473173575129e-06, + "loss": 1.3081, + "step": 6818 + }, + { + "epoch": 0.9653854321511999, + "grad_norm": 9.549092785949249, + "learning_rate": 2.7609031107672896e-06, + "loss": 1.2597, + "step": 6819 + }, + { + "epoch": 0.9655270050258371, + "grad_norm": 7.9333684786918335, + "learning_rate": 2.7603330342439686e-06, + "loss": 1.2474, + "step": 6820 + }, + { + "epoch": 0.9656685779004742, + "grad_norm": 8.514025616719064, + "learning_rate": 2.7597629440351346e-06, + "loss": 1.2384, + "step": 6821 + }, + { + "epoch": 0.9658101507751115, + "grad_norm": 8.963746698943055, + "learning_rate": 2.7591928401707555e-06, + "loss": 1.2677, + "step": 6822 + }, + { + "epoch": 0.9659517236497487, + "grad_norm": 8.317880366359118, + "learning_rate": 2.7586227226808038e-06, + "loss": 1.0506, + "step": 6823 + }, + { + "epoch": 0.9660932965243859, + "grad_norm": 7.989184937041572, + "learning_rate": 2.7580525915952465e-06, + "loss": 1.2297, + "step": 6824 + }, + { + "epoch": 0.9662348693990231, + "grad_norm": 7.4073534437262545, + "learning_rate": 2.757482446944058e-06, + "loss": 1.1061, + "step": 6825 + }, + { + "epoch": 0.9663764422736604, + "grad_norm": 8.961118421914868, + "learning_rate": 2.756912288757209e-06, + "loss": 1.1123, + "step": 6826 + }, + { + "epoch": 0.9665180151482976, + "grad_norm": 8.917801068835317, + "learning_rate": 2.7563421170646714e-06, + "loss": 1.2417, + "step": 6827 + }, + { + "epoch": 0.9666595880229348, + "grad_norm": 12.467908022950361, + "learning_rate": 2.7557719318964216e-06, + "loss": 1.2221, + "step": 6828 + }, + { + "epoch": 0.9668011608975721, + "grad_norm": 9.585531983196285, + "learning_rate": 2.755201733282431e-06, + "loss": 1.2904, + "step": 6829 + }, + { + "epoch": 0.9669427337722093, + "grad_norm": 9.878735185969493, + "learning_rate": 2.754631521252676e-06, + "loss": 1.3014, + "step": 6830 + }, + { + "epoch": 0.9670843066468464, + "grad_norm": 9.015026687523328, + "learning_rate": 2.7540612958371315e-06, + "loss": 1.2131, + "step": 6831 + }, + { + "epoch": 0.9672258795214836, + "grad_norm": 8.835075734390841, + "learning_rate": 2.7534910570657743e-06, + "loss": 1.1847, + "step": 6832 + }, + { + "epoch": 0.9673674523961209, + "grad_norm": 9.451198312178875, + "learning_rate": 2.752920804968581e-06, + "loss": 1.244, + "step": 6833 + }, + { + "epoch": 0.9675090252707581, + "grad_norm": 8.339795722197774, + "learning_rate": 2.7523505395755296e-06, + "loss": 1.1699, + "step": 6834 + }, + { + "epoch": 0.9676505981453953, + "grad_norm": 8.201437698493212, + "learning_rate": 2.7517802609165985e-06, + "loss": 1.1847, + "step": 6835 + }, + { + "epoch": 0.9677921710200326, + "grad_norm": 9.093995815245096, + 
"learning_rate": 2.751209969021767e-06, + "loss": 1.2135, + "step": 6836 + }, + { + "epoch": 0.9679337438946698, + "grad_norm": 9.679525920513795, + "learning_rate": 2.750639663921014e-06, + "loss": 1.2543, + "step": 6837 + }, + { + "epoch": 0.968075316769307, + "grad_norm": 8.99707704339065, + "learning_rate": 2.7500693456443217e-06, + "loss": 1.2358, + "step": 6838 + }, + { + "epoch": 0.9682168896439443, + "grad_norm": 7.232786593305299, + "learning_rate": 2.749499014221671e-06, + "loss": 1.2684, + "step": 6839 + }, + { + "epoch": 0.9683584625185815, + "grad_norm": 7.636355589990381, + "learning_rate": 2.748928669683042e-06, + "loss": 1.0955, + "step": 6840 + }, + { + "epoch": 0.9685000353932186, + "grad_norm": 7.772285659794125, + "learning_rate": 2.748358312058418e-06, + "loss": 1.2902, + "step": 6841 + }, + { + "epoch": 0.9686416082678558, + "grad_norm": 9.126189689943768, + "learning_rate": 2.7477879413777834e-06, + "loss": 1.133, + "step": 6842 + }, + { + "epoch": 0.9687831811424931, + "grad_norm": 10.261113910253565, + "learning_rate": 2.7472175576711213e-06, + "loss": 1.2474, + "step": 6843 + }, + { + "epoch": 0.9689247540171303, + "grad_norm": 11.603930997709758, + "learning_rate": 2.7466471609684175e-06, + "loss": 1.3023, + "step": 6844 + }, + { + "epoch": 0.9690663268917675, + "grad_norm": 9.440700367308189, + "learning_rate": 2.7460767512996556e-06, + "loss": 1.1517, + "step": 6845 + }, + { + "epoch": 0.9692078997664048, + "grad_norm": 9.684792023229157, + "learning_rate": 2.745506328694822e-06, + "loss": 1.0535, + "step": 6846 + }, + { + "epoch": 0.969349472641042, + "grad_norm": 9.124844902497188, + "learning_rate": 2.7449358931839042e-06, + "loss": 1.1206, + "step": 6847 + }, + { + "epoch": 0.9694910455156792, + "grad_norm": 8.102630338859298, + "learning_rate": 2.7443654447968894e-06, + "loss": 1.2422, + "step": 6848 + }, + { + "epoch": 0.9696326183903164, + "grad_norm": 10.34293641178191, + "learning_rate": 2.7437949835637644e-06, + "loss": 1.3034, + "step": 6849 + }, + { + "epoch": 0.9697741912649537, + "grad_norm": 10.262781501282506, + "learning_rate": 2.7432245095145193e-06, + "loss": 1.3562, + "step": 6850 + }, + { + "epoch": 0.9699157641395909, + "grad_norm": 7.535063351957804, + "learning_rate": 2.7426540226791437e-06, + "loss": 1.1684, + "step": 6851 + }, + { + "epoch": 0.970057337014228, + "grad_norm": 9.36825155875238, + "learning_rate": 2.7420835230876264e-06, + "loss": 1.3498, + "step": 6852 + }, + { + "epoch": 0.9701989098888653, + "grad_norm": 8.334614871348942, + "learning_rate": 2.7415130107699588e-06, + "loss": 1.373, + "step": 6853 + }, + { + "epoch": 0.9703404827635025, + "grad_norm": 9.61680908450269, + "learning_rate": 2.740942485756133e-06, + "loss": 1.2375, + "step": 6854 + }, + { + "epoch": 0.9704820556381397, + "grad_norm": 7.804914774750157, + "learning_rate": 2.7403719480761406e-06, + "loss": 1.1462, + "step": 6855 + }, + { + "epoch": 0.970623628512777, + "grad_norm": 9.38624163243318, + "learning_rate": 2.7398013977599722e-06, + "loss": 1.2008, + "step": 6856 + }, + { + "epoch": 0.9707652013874142, + "grad_norm": 9.411750883204464, + "learning_rate": 2.7392308348376243e-06, + "loss": 1.3367, + "step": 6857 + }, + { + "epoch": 0.9709067742620514, + "grad_norm": 9.621263051063197, + "learning_rate": 2.73866025933909e-06, + "loss": 1.2406, + "step": 6858 + }, + { + "epoch": 0.9710483471366886, + "grad_norm": 8.550025484816734, + "learning_rate": 2.738089671294364e-06, + "loss": 1.1948, + "step": 6859 + }, + { + "epoch": 0.9711899200113259, + 
"grad_norm": 8.318497638041473, + "learning_rate": 2.7375190707334416e-06, + "loss": 1.3192, + "step": 6860 + }, + { + "epoch": 0.9713314928859631, + "grad_norm": 9.371181269782015, + "learning_rate": 2.736948457686318e-06, + "loss": 1.3007, + "step": 6861 + }, + { + "epoch": 0.9714730657606002, + "grad_norm": 17.021315730841266, + "learning_rate": 2.736377832182991e-06, + "loss": 1.1558, + "step": 6862 + }, + { + "epoch": 0.9716146386352374, + "grad_norm": 11.397491687755728, + "learning_rate": 2.7358071942534574e-06, + "loss": 1.3553, + "step": 6863 + }, + { + "epoch": 0.9717562115098747, + "grad_norm": 9.128772530578198, + "learning_rate": 2.735236543927715e-06, + "loss": 1.1386, + "step": 6864 + }, + { + "epoch": 0.9718977843845119, + "grad_norm": 8.414082448328257, + "learning_rate": 2.734665881235764e-06, + "loss": 1.1614, + "step": 6865 + }, + { + "epoch": 0.9720393572591491, + "grad_norm": 8.681180782605177, + "learning_rate": 2.7340952062076022e-06, + "loss": 1.2788, + "step": 6866 + }, + { + "epoch": 0.9721809301337864, + "grad_norm": 7.974424727687184, + "learning_rate": 2.73352451887323e-06, + "loss": 1.1504, + "step": 6867 + }, + { + "epoch": 0.9723225030084236, + "grad_norm": 8.883126849648896, + "learning_rate": 2.7329538192626478e-06, + "loss": 1.2526, + "step": 6868 + }, + { + "epoch": 0.9724640758830608, + "grad_norm": 7.582118289846873, + "learning_rate": 2.7323831074058572e-06, + "loss": 1.2576, + "step": 6869 + }, + { + "epoch": 0.9726056487576981, + "grad_norm": 9.532737940978299, + "learning_rate": 2.7318123833328598e-06, + "loss": 1.197, + "step": 6870 + }, + { + "epoch": 0.9727472216323353, + "grad_norm": 7.593381268375378, + "learning_rate": 2.731241647073658e-06, + "loss": 1.183, + "step": 6871 + }, + { + "epoch": 0.9728887945069724, + "grad_norm": 8.961202708994994, + "learning_rate": 2.730670898658255e-06, + "loss": 1.2875, + "step": 6872 + }, + { + "epoch": 0.9730303673816096, + "grad_norm": 9.389703232500674, + "learning_rate": 2.7301001381166553e-06, + "loss": 1.1677, + "step": 6873 + }, + { + "epoch": 0.9731719402562469, + "grad_norm": 9.132471657454992, + "learning_rate": 2.729529365478863e-06, + "loss": 1.2873, + "step": 6874 + }, + { + "epoch": 0.9733135131308841, + "grad_norm": 8.328304553646033, + "learning_rate": 2.7289585807748832e-06, + "loss": 1.2309, + "step": 6875 + }, + { + "epoch": 0.9734550860055213, + "grad_norm": 7.786822303932303, + "learning_rate": 2.7283877840347217e-06, + "loss": 1.0204, + "step": 6876 + }, + { + "epoch": 0.9735966588801586, + "grad_norm": 7.540143219171967, + "learning_rate": 2.7278169752883845e-06, + "loss": 1.2125, + "step": 6877 + }, + { + "epoch": 0.9737382317547958, + "grad_norm": 8.005334029558336, + "learning_rate": 2.727246154565878e-06, + "loss": 1.1226, + "step": 6878 + }, + { + "epoch": 0.973879804629433, + "grad_norm": 7.951958171634691, + "learning_rate": 2.726675321897211e-06, + "loss": 1.2192, + "step": 6879 + }, + { + "epoch": 0.9740213775040703, + "grad_norm": 8.015506022204224, + "learning_rate": 2.7261044773123913e-06, + "loss": 1.3853, + "step": 6880 + }, + { + "epoch": 0.9741629503787075, + "grad_norm": 8.678138134527106, + "learning_rate": 2.725533620841429e-06, + "loss": 1.2095, + "step": 6881 + }, + { + "epoch": 0.9743045232533447, + "grad_norm": 8.403758307241079, + "learning_rate": 2.7249627525143313e-06, + "loss": 1.2609, + "step": 6882 + }, + { + "epoch": 0.9744460961279818, + "grad_norm": 9.143916341190327, + "learning_rate": 2.7243918723611095e-06, + "loss": 1.1683, + "step": 6883 + }, + { 
+ "epoch": 0.9745876690026191, + "grad_norm": 9.408454712680845, + "learning_rate": 2.7238209804117744e-06, + "loss": 1.2685, + "step": 6884 + }, + { + "epoch": 0.9747292418772563, + "grad_norm": 8.221527442560143, + "learning_rate": 2.7232500766963373e-06, + "loss": 1.182, + "step": 6885 + }, + { + "epoch": 0.9748708147518935, + "grad_norm": 9.99897112321809, + "learning_rate": 2.72267916124481e-06, + "loss": 1.2708, + "step": 6886 + }, + { + "epoch": 0.9750123876265308, + "grad_norm": 7.211240619475449, + "learning_rate": 2.722108234087205e-06, + "loss": 1.1409, + "step": 6887 + }, + { + "epoch": 0.975153960501168, + "grad_norm": 11.228463815598424, + "learning_rate": 2.7215372952535364e-06, + "loss": 1.3685, + "step": 6888 + }, + { + "epoch": 0.9752955333758052, + "grad_norm": 11.003711767783498, + "learning_rate": 2.7209663447738164e-06, + "loss": 1.3019, + "step": 6889 + }, + { + "epoch": 0.9754371062504424, + "grad_norm": 9.040779550749697, + "learning_rate": 2.7203953826780615e-06, + "loss": 1.178, + "step": 6890 + }, + { + "epoch": 0.9755786791250797, + "grad_norm": 8.382346958910182, + "learning_rate": 2.719824408996285e-06, + "loss": 1.2288, + "step": 6891 + }, + { + "epoch": 0.9757202519997169, + "grad_norm": 7.6662723882159485, + "learning_rate": 2.7192534237585037e-06, + "loss": 1.1314, + "step": 6892 + }, + { + "epoch": 0.975861824874354, + "grad_norm": 9.65478465001595, + "learning_rate": 2.7186824269947334e-06, + "loss": 1.2274, + "step": 6893 + }, + { + "epoch": 0.9760033977489913, + "grad_norm": 10.431812312819034, + "learning_rate": 2.71811141873499e-06, + "loss": 1.2369, + "step": 6894 + }, + { + "epoch": 0.9761449706236285, + "grad_norm": 10.860950031912637, + "learning_rate": 2.717540399009293e-06, + "loss": 1.3003, + "step": 6895 + }, + { + "epoch": 0.9762865434982657, + "grad_norm": 10.73794447452984, + "learning_rate": 2.716969367847659e-06, + "loss": 1.2471, + "step": 6896 + }, + { + "epoch": 0.9764281163729029, + "grad_norm": 9.05550014216261, + "learning_rate": 2.7163983252801076e-06, + "loss": 1.2145, + "step": 6897 + }, + { + "epoch": 0.9765696892475402, + "grad_norm": 7.74147869603237, + "learning_rate": 2.7158272713366573e-06, + "loss": 1.0228, + "step": 6898 + }, + { + "epoch": 0.9767112621221774, + "grad_norm": 9.065973287136158, + "learning_rate": 2.715256206047328e-06, + "loss": 1.2788, + "step": 6899 + }, + { + "epoch": 0.9768528349968146, + "grad_norm": 9.08185693717128, + "learning_rate": 2.7146851294421404e-06, + "loss": 1.2308, + "step": 6900 + }, + { + "epoch": 0.9769944078714519, + "grad_norm": 8.65525046406533, + "learning_rate": 2.714114041551115e-06, + "loss": 1.1772, + "step": 6901 + }, + { + "epoch": 0.9771359807460891, + "grad_norm": 11.416864175317855, + "learning_rate": 2.7135429424042758e-06, + "loss": 1.1866, + "step": 6902 + }, + { + "epoch": 0.9772775536207262, + "grad_norm": 8.508470241878594, + "learning_rate": 2.712971832031642e-06, + "loss": 1.1496, + "step": 6903 + }, + { + "epoch": 0.9774191264953634, + "grad_norm": 11.846634244371769, + "learning_rate": 2.712400710463239e-06, + "loss": 1.2424, + "step": 6904 + }, + { + "epoch": 0.9775606993700007, + "grad_norm": 10.40688415977804, + "learning_rate": 2.7118295777290875e-06, + "loss": 1.377, + "step": 6905 + }, + { + "epoch": 0.9777022722446379, + "grad_norm": 10.443639354473545, + "learning_rate": 2.711258433859214e-06, + "loss": 1.2962, + "step": 6906 + }, + { + "epoch": 0.9778438451192751, + "grad_norm": 9.988084178736353, + "learning_rate": 2.710687278883642e-06, + "loss": 
1.2723, + "step": 6907 + }, + { + "epoch": 0.9779854179939124, + "grad_norm": 11.151299754111836, + "learning_rate": 2.7101161128323967e-06, + "loss": 1.1949, + "step": 6908 + }, + { + "epoch": 0.9781269908685496, + "grad_norm": 9.229667793486664, + "learning_rate": 2.7095449357355042e-06, + "loss": 1.0558, + "step": 6909 + }, + { + "epoch": 0.9782685637431868, + "grad_norm": 9.601129433632648, + "learning_rate": 2.7089737476229906e-06, + "loss": 1.2438, + "step": 6910 + }, + { + "epoch": 0.9784101366178241, + "grad_norm": 10.102100237040748, + "learning_rate": 2.7084025485248827e-06, + "loss": 1.1734, + "step": 6911 + }, + { + "epoch": 0.9785517094924613, + "grad_norm": 9.178696755046097, + "learning_rate": 2.7078313384712084e-06, + "loss": 1.3564, + "step": 6912 + }, + { + "epoch": 0.9786932823670985, + "grad_norm": 8.574309954668683, + "learning_rate": 2.7072601174919965e-06, + "loss": 1.1461, + "step": 6913 + }, + { + "epoch": 0.9788348552417356, + "grad_norm": 10.274845807108896, + "learning_rate": 2.7066888856172737e-06, + "loss": 1.4199, + "step": 6914 + }, + { + "epoch": 0.9789764281163729, + "grad_norm": 7.7895049770101075, + "learning_rate": 2.70611764287707e-06, + "loss": 1.0802, + "step": 6915 + }, + { + "epoch": 0.9791180009910101, + "grad_norm": 7.749541730560034, + "learning_rate": 2.7055463893014156e-06, + "loss": 1.1431, + "step": 6916 + }, + { + "epoch": 0.9792595738656473, + "grad_norm": 8.968540734165767, + "learning_rate": 2.7049751249203414e-06, + "loss": 1.0827, + "step": 6917 + }, + { + "epoch": 0.9794011467402846, + "grad_norm": 8.332591621451444, + "learning_rate": 2.7044038497638782e-06, + "loss": 1.2306, + "step": 6918 + }, + { + "epoch": 0.9795427196149218, + "grad_norm": 8.115950019212505, + "learning_rate": 2.7038325638620563e-06, + "loss": 1.1321, + "step": 6919 + }, + { + "epoch": 0.979684292489559, + "grad_norm": 8.376326270918236, + "learning_rate": 2.7032612672449084e-06, + "loss": 1.2603, + "step": 6920 + }, + { + "epoch": 0.9798258653641962, + "grad_norm": 7.132507384544104, + "learning_rate": 2.7026899599424674e-06, + "loss": 1.1337, + "step": 6921 + }, + { + "epoch": 0.9799674382388335, + "grad_norm": 7.559724156428531, + "learning_rate": 2.702118641984766e-06, + "loss": 1.1001, + "step": 6922 + }, + { + "epoch": 0.9801090111134707, + "grad_norm": 11.61432510339804, + "learning_rate": 2.7015473134018382e-06, + "loss": 1.2655, + "step": 6923 + }, + { + "epoch": 0.9802505839881078, + "grad_norm": 7.88909063245442, + "learning_rate": 2.700975974223719e-06, + "loss": 1.2066, + "step": 6924 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 8.175875499635918, + "learning_rate": 2.700404624480443e-06, + "loss": 1.1803, + "step": 6925 + }, + { + "epoch": 0.9805337297373823, + "grad_norm": 8.71493085666351, + "learning_rate": 2.699833264202044e-06, + "loss": 1.1722, + "step": 6926 + }, + { + "epoch": 0.9806753026120195, + "grad_norm": 8.9036077495042, + "learning_rate": 2.6992618934185604e-06, + "loss": 1.1914, + "step": 6927 + }, + { + "epoch": 0.9808168754866567, + "grad_norm": 10.444006074223529, + "learning_rate": 2.698690512160027e-06, + "loss": 1.3527, + "step": 6928 + }, + { + "epoch": 0.980958448361294, + "grad_norm": 7.672874867220718, + "learning_rate": 2.6981191204564825e-06, + "loss": 1.1391, + "step": 6929 + }, + { + "epoch": 0.9811000212359312, + "grad_norm": 8.627884534492065, + "learning_rate": 2.6975477183379624e-06, + "loss": 1.0999, + "step": 6930 + }, + { + "epoch": 0.9812415941105684, + "grad_norm": 8.552696240138207, + 
"learning_rate": 2.6969763058345067e-06, + "loss": 1.1112, + "step": 6931 + }, + { + "epoch": 0.9813831669852057, + "grad_norm": 10.068284167093017, + "learning_rate": 2.696404882976153e-06, + "loss": 1.2233, + "step": 6932 + }, + { + "epoch": 0.9815247398598429, + "grad_norm": 9.21052872621892, + "learning_rate": 2.6958334497929416e-06, + "loss": 1.1868, + "step": 6933 + }, + { + "epoch": 0.9816663127344801, + "grad_norm": 7.955335388594219, + "learning_rate": 2.695262006314912e-06, + "loss": 1.1145, + "step": 6934 + }, + { + "epoch": 0.9818078856091172, + "grad_norm": 10.054057684722839, + "learning_rate": 2.694690552572104e-06, + "loss": 1.2774, + "step": 6935 + }, + { + "epoch": 0.9819494584837545, + "grad_norm": 8.777287685485165, + "learning_rate": 2.6941190885945582e-06, + "loss": 1.2422, + "step": 6936 + }, + { + "epoch": 0.9820910313583917, + "grad_norm": 8.478787600380333, + "learning_rate": 2.6935476144123173e-06, + "loss": 1.2292, + "step": 6937 + }, + { + "epoch": 0.9822326042330289, + "grad_norm": 7.632055551562677, + "learning_rate": 2.692976130055422e-06, + "loss": 1.1047, + "step": 6938 + }, + { + "epoch": 0.9823741771076662, + "grad_norm": 8.200942324945947, + "learning_rate": 2.692404635553917e-06, + "loss": 1.2703, + "step": 6939 + }, + { + "epoch": 0.9825157499823034, + "grad_norm": 8.44131315745753, + "learning_rate": 2.691833130937842e-06, + "loss": 1.0987, + "step": 6940 + }, + { + "epoch": 0.9826573228569406, + "grad_norm": 8.869232022693017, + "learning_rate": 2.6912616162372434e-06, + "loss": 1.21, + "step": 6941 + }, + { + "epoch": 0.9827988957315779, + "grad_norm": 9.767317822028904, + "learning_rate": 2.690690091482164e-06, + "loss": 1.2053, + "step": 6942 + }, + { + "epoch": 0.9829404686062151, + "grad_norm": 8.240471626043554, + "learning_rate": 2.6901185567026484e-06, + "loss": 1.0489, + "step": 6943 + }, + { + "epoch": 0.9830820414808523, + "grad_norm": 9.865787893115943, + "learning_rate": 2.689547011928742e-06, + "loss": 1.1645, + "step": 6944 + }, + { + "epoch": 0.9832236143554894, + "grad_norm": 8.824204484135215, + "learning_rate": 2.6889754571904907e-06, + "loss": 1.1923, + "step": 6945 + }, + { + "epoch": 0.9833651872301267, + "grad_norm": 9.617908193574086, + "learning_rate": 2.68840389251794e-06, + "loss": 1.2233, + "step": 6946 + }, + { + "epoch": 0.9835067601047639, + "grad_norm": 10.093474865638754, + "learning_rate": 2.687832317941138e-06, + "loss": 1.3215, + "step": 6947 + }, + { + "epoch": 0.9836483329794011, + "grad_norm": 10.184490110268277, + "learning_rate": 2.687260733490131e-06, + "loss": 1.3175, + "step": 6948 + }, + { + "epoch": 0.9837899058540384, + "grad_norm": 8.262630619742165, + "learning_rate": 2.6866891391949664e-06, + "loss": 1.275, + "step": 6949 + }, + { + "epoch": 0.9839314787286756, + "grad_norm": 9.016236388633711, + "learning_rate": 2.6861175350856937e-06, + "loss": 1.103, + "step": 6950 + }, + { + "epoch": 0.9840730516033128, + "grad_norm": 8.931087727871095, + "learning_rate": 2.6855459211923603e-06, + "loss": 1.2476, + "step": 6951 + }, + { + "epoch": 0.98421462447795, + "grad_norm": 7.636386811440051, + "learning_rate": 2.6849742975450165e-06, + "loss": 1.1124, + "step": 6952 + }, + { + "epoch": 0.9843561973525873, + "grad_norm": 8.58606367912288, + "learning_rate": 2.684402664173711e-06, + "loss": 1.2197, + "step": 6953 + }, + { + "epoch": 0.9844977702272245, + "grad_norm": 8.185205502082896, + "learning_rate": 2.6838310211084954e-06, + "loss": 1.4081, + "step": 6954 + }, + { + "epoch": 0.9846393431018616, + 
"grad_norm": 9.78376752246259, + "learning_rate": 2.6832593683794206e-06, + "loss": 1.3271, + "step": 6955 + }, + { + "epoch": 0.9847809159764989, + "grad_norm": 8.74959192686829, + "learning_rate": 2.6826877060165373e-06, + "loss": 1.2404, + "step": 6956 + }, + { + "epoch": 0.9849224888511361, + "grad_norm": 7.483184018872096, + "learning_rate": 2.6821160340498975e-06, + "loss": 1.1457, + "step": 6957 + }, + { + "epoch": 0.9850640617257733, + "grad_norm": 9.448405249319649, + "learning_rate": 2.681544352509553e-06, + "loss": 1.3839, + "step": 6958 + }, + { + "epoch": 0.9852056346004106, + "grad_norm": 7.673394885883759, + "learning_rate": 2.6809726614255575e-06, + "loss": 1.2553, + "step": 6959 + }, + { + "epoch": 0.9853472074750478, + "grad_norm": 8.796769624285606, + "learning_rate": 2.680400960827965e-06, + "loss": 1.2977, + "step": 6960 + }, + { + "epoch": 0.985488780349685, + "grad_norm": 7.690039982685573, + "learning_rate": 2.679829250746827e-06, + "loss": 1.2095, + "step": 6961 + }, + { + "epoch": 0.9856303532243222, + "grad_norm": 9.624927074601981, + "learning_rate": 2.6792575312122005e-06, + "loss": 1.2079, + "step": 6962 + }, + { + "epoch": 0.9857719260989595, + "grad_norm": 9.040372788161216, + "learning_rate": 2.6786858022541385e-06, + "loss": 1.2774, + "step": 6963 + }, + { + "epoch": 0.9859134989735967, + "grad_norm": 8.219852272061324, + "learning_rate": 2.6781140639026975e-06, + "loss": 1.2119, + "step": 6964 + }, + { + "epoch": 0.9860550718482339, + "grad_norm": 7.350012393863116, + "learning_rate": 2.6775423161879333e-06, + "loss": 1.1852, + "step": 6965 + }, + { + "epoch": 0.986196644722871, + "grad_norm": 7.344168922974842, + "learning_rate": 2.676970559139902e-06, + "loss": 1.1703, + "step": 6966 + }, + { + "epoch": 0.9863382175975083, + "grad_norm": 9.683837986028497, + "learning_rate": 2.676398792788659e-06, + "loss": 1.2587, + "step": 6967 + }, + { + "epoch": 0.9864797904721455, + "grad_norm": 8.230452528015576, + "learning_rate": 2.675827017164264e-06, + "loss": 1.2781, + "step": 6968 + }, + { + "epoch": 0.9866213633467827, + "grad_norm": 7.651247718136377, + "learning_rate": 2.675255232296774e-06, + "loss": 1.1719, + "step": 6969 + }, + { + "epoch": 0.98676293622142, + "grad_norm": 8.000490650389677, + "learning_rate": 2.674683438216247e-06, + "loss": 1.1515, + "step": 6970 + }, + { + "epoch": 0.9869045090960572, + "grad_norm": 7.655943074205081, + "learning_rate": 2.674111634952742e-06, + "loss": 1.2224, + "step": 6971 + }, + { + "epoch": 0.9870460819706944, + "grad_norm": 8.606003781455467, + "learning_rate": 2.673539822536318e-06, + "loss": 1.1354, + "step": 6972 + }, + { + "epoch": 0.9871876548453317, + "grad_norm": 9.031785572895629, + "learning_rate": 2.672968000997035e-06, + "loss": 1.1807, + "step": 6973 + }, + { + "epoch": 0.9873292277199689, + "grad_norm": 6.967298963895185, + "learning_rate": 2.6723961703649525e-06, + "loss": 1.1244, + "step": 6974 + }, + { + "epoch": 0.9874708005946061, + "grad_norm": 7.055124127805281, + "learning_rate": 2.6718243306701317e-06, + "loss": 1.2148, + "step": 6975 + }, + { + "epoch": 0.9876123734692432, + "grad_norm": 6.892932047179288, + "learning_rate": 2.6712524819426355e-06, + "loss": 1.1844, + "step": 6976 + }, + { + "epoch": 0.9877539463438805, + "grad_norm": 9.176941078518144, + "learning_rate": 2.6706806242125232e-06, + "loss": 1.3817, + "step": 6977 + }, + { + "epoch": 0.9878955192185177, + "grad_norm": 8.900253258005675, + "learning_rate": 2.670108757509858e-06, + "loss": 1.262, + "step": 6978 + }, + { + 
"epoch": 0.9880370920931549, + "grad_norm": 9.7833678661389, + "learning_rate": 2.6695368818647015e-06, + "loss": 1.1912, + "step": 6979 + }, + { + "epoch": 0.9881786649677922, + "grad_norm": 10.023315333798578, + "learning_rate": 2.668964997307118e-06, + "loss": 1.2809, + "step": 6980 + }, + { + "epoch": 0.9883202378424294, + "grad_norm": 7.764126116821529, + "learning_rate": 2.6683931038671705e-06, + "loss": 1.2382, + "step": 6981 + }, + { + "epoch": 0.9884618107170666, + "grad_norm": 8.48031311032905, + "learning_rate": 2.6678212015749234e-06, + "loss": 1.2131, + "step": 6982 + }, + { + "epoch": 0.9886033835917039, + "grad_norm": 8.213150960127766, + "learning_rate": 2.6672492904604403e-06, + "loss": 1.1728, + "step": 6983 + }, + { + "epoch": 0.9887449564663411, + "grad_norm": 9.345924558178712, + "learning_rate": 2.6666773705537873e-06, + "loss": 1.1832, + "step": 6984 + }, + { + "epoch": 0.9888865293409783, + "grad_norm": 9.08312199431309, + "learning_rate": 2.6661054418850286e-06, + "loss": 1.2423, + "step": 6985 + }, + { + "epoch": 0.9890281022156154, + "grad_norm": 9.115927588743926, + "learning_rate": 2.665533504484231e-06, + "loss": 1.2104, + "step": 6986 + }, + { + "epoch": 0.9891696750902527, + "grad_norm": 9.995620913128151, + "learning_rate": 2.6649615583814613e-06, + "loss": 1.2574, + "step": 6987 + }, + { + "epoch": 0.9893112479648899, + "grad_norm": 9.211373174068735, + "learning_rate": 2.6643896036067847e-06, + "loss": 1.2619, + "step": 6988 + }, + { + "epoch": 0.9894528208395271, + "grad_norm": 9.183614103683315, + "learning_rate": 2.6638176401902693e-06, + "loss": 1.24, + "step": 6989 + }, + { + "epoch": 0.9895943937141644, + "grad_norm": 8.256173338378948, + "learning_rate": 2.6632456681619817e-06, + "loss": 1.1479, + "step": 6990 + }, + { + "epoch": 0.9897359665888016, + "grad_norm": 9.787569484732, + "learning_rate": 2.662673687551992e-06, + "loss": 1.2323, + "step": 6991 + }, + { + "epoch": 0.9898775394634388, + "grad_norm": 9.85565763542737, + "learning_rate": 2.6621016983903686e-06, + "loss": 1.0817, + "step": 6992 + }, + { + "epoch": 0.990019112338076, + "grad_norm": 8.624915523391781, + "learning_rate": 2.661529700707179e-06, + "loss": 1.2362, + "step": 6993 + }, + { + "epoch": 0.9901606852127133, + "grad_norm": 9.389969737823764, + "learning_rate": 2.6609576945324933e-06, + "loss": 1.1308, + "step": 6994 + }, + { + "epoch": 0.9903022580873505, + "grad_norm": 9.168258343849736, + "learning_rate": 2.6603856798963817e-06, + "loss": 1.156, + "step": 6995 + }, + { + "epoch": 0.9904438309619877, + "grad_norm": 10.76236705454238, + "learning_rate": 2.6598136568289144e-06, + "loss": 1.3487, + "step": 6996 + }, + { + "epoch": 0.9905854038366249, + "grad_norm": 8.462586133960885, + "learning_rate": 2.6592416253601626e-06, + "loss": 1.269, + "step": 6997 + }, + { + "epoch": 0.9907269767112621, + "grad_norm": 10.796861220708465, + "learning_rate": 2.658669585520197e-06, + "loss": 1.2753, + "step": 6998 + }, + { + "epoch": 0.9908685495858993, + "grad_norm": 9.814840827230322, + "learning_rate": 2.65809753733909e-06, + "loss": 1.1819, + "step": 6999 + }, + { + "epoch": 0.9910101224605365, + "grad_norm": 10.902407780961198, + "learning_rate": 2.657525480846913e-06, + "loss": 1.3346, + "step": 7000 + }, + { + "epoch": 0.9911516953351738, + "grad_norm": 7.986173363779295, + "learning_rate": 2.6569534160737386e-06, + "loss": 1.0989, + "step": 7001 + }, + { + "epoch": 0.991293268209811, + "grad_norm": 9.373850841028128, + "learning_rate": 2.656381343049641e-06, + "loss": 1.2983, 
+ "step": 7002 + }, + { + "epoch": 0.9914348410844482, + "grad_norm": 9.527294100936645, + "learning_rate": 2.655809261804693e-06, + "loss": 1.1775, + "step": 7003 + }, + { + "epoch": 0.9915764139590855, + "grad_norm": 10.926670489711066, + "learning_rate": 2.655237172368967e-06, + "loss": 1.1785, + "step": 7004 + }, + { + "epoch": 0.9917179868337227, + "grad_norm": 9.850782225865405, + "learning_rate": 2.654665074772539e-06, + "loss": 1.2633, + "step": 7005 + }, + { + "epoch": 0.9918595597083599, + "grad_norm": 9.485398967840075, + "learning_rate": 2.6540929690454835e-06, + "loss": 1.2236, + "step": 7006 + }, + { + "epoch": 0.992001132582997, + "grad_norm": 10.871357625705107, + "learning_rate": 2.653520855217876e-06, + "loss": 1.2835, + "step": 7007 + }, + { + "epoch": 0.9921427054576343, + "grad_norm": 9.437269599577899, + "learning_rate": 2.652948733319792e-06, + "loss": 1.2935, + "step": 7008 + }, + { + "epoch": 0.9922842783322715, + "grad_norm": 8.530975002006704, + "learning_rate": 2.652376603381306e-06, + "loss": 1.2001, + "step": 7009 + }, + { + "epoch": 0.9924258512069087, + "grad_norm": 9.670609645620525, + "learning_rate": 2.651804465432496e-06, + "loss": 1.2522, + "step": 7010 + }, + { + "epoch": 0.992567424081546, + "grad_norm": 9.51511625987643, + "learning_rate": 2.6512323195034384e-06, + "loss": 1.2304, + "step": 7011 + }, + { + "epoch": 0.9927089969561832, + "grad_norm": 8.215554199842325, + "learning_rate": 2.6506601656242105e-06, + "loss": 1.166, + "step": 7012 + }, + { + "epoch": 0.9928505698308204, + "grad_norm": 9.827261348554488, + "learning_rate": 2.65008800382489e-06, + "loss": 1.2859, + "step": 7013 + }, + { + "epoch": 0.9929921427054577, + "grad_norm": 9.039601830009836, + "learning_rate": 2.6495158341355548e-06, + "loss": 1.0756, + "step": 7014 + }, + { + "epoch": 0.9931337155800949, + "grad_norm": 8.518054860468851, + "learning_rate": 2.648943656586284e-06, + "loss": 1.0618, + "step": 7015 + }, + { + "epoch": 0.9932752884547321, + "grad_norm": 10.047614608760224, + "learning_rate": 2.648371471207156e-06, + "loss": 1.3582, + "step": 7016 + }, + { + "epoch": 0.9934168613293692, + "grad_norm": 9.621515609407403, + "learning_rate": 2.6477992780282507e-06, + "loss": 1.0955, + "step": 7017 + }, + { + "epoch": 0.9935584342040065, + "grad_norm": 10.624664660379066, + "learning_rate": 2.6472270770796475e-06, + "loss": 1.3452, + "step": 7018 + }, + { + "epoch": 0.9937000070786437, + "grad_norm": 9.488084648925502, + "learning_rate": 2.646654868391427e-06, + "loss": 1.1763, + "step": 7019 + }, + { + "epoch": 0.9938415799532809, + "grad_norm": 10.226087636391934, + "learning_rate": 2.646082651993668e-06, + "loss": 1.2588, + "step": 7020 + }, + { + "epoch": 0.9939831528279182, + "grad_norm": 9.08797478570261, + "learning_rate": 2.6455104279164546e-06, + "loss": 1.2209, + "step": 7021 + }, + { + "epoch": 0.9941247257025554, + "grad_norm": 8.597349654489049, + "learning_rate": 2.6449381961898658e-06, + "loss": 1.0251, + "step": 7022 + }, + { + "epoch": 0.9942662985771926, + "grad_norm": 10.984157598088641, + "learning_rate": 2.644365956843984e-06, + "loss": 1.2307, + "step": 7023 + }, + { + "epoch": 0.9944078714518299, + "grad_norm": 10.148459677492127, + "learning_rate": 2.643793709908892e-06, + "loss": 1.1305, + "step": 7024 + }, + { + "epoch": 0.9945494443264671, + "grad_norm": 9.56977517624255, + "learning_rate": 2.6432214554146717e-06, + "loss": 1.1742, + "step": 7025 + }, + { + "epoch": 0.9946910172011043, + "grad_norm": 8.588187119981288, + "learning_rate": 
2.6426491933914062e-06, + "loss": 1.0909, + "step": 7026 + }, + { + "epoch": 0.9948325900757415, + "grad_norm": 10.265247819959136, + "learning_rate": 2.642076923869178e-06, + "loss": 1.1916, + "step": 7027 + }, + { + "epoch": 0.9949741629503787, + "grad_norm": 10.311317329792717, + "learning_rate": 2.6415046468780726e-06, + "loss": 1.2954, + "step": 7028 + }, + { + "epoch": 0.9951157358250159, + "grad_norm": 8.922084077700164, + "learning_rate": 2.6409323624481743e-06, + "loss": 1.3435, + "step": 7029 + }, + { + "epoch": 0.9952573086996531, + "grad_norm": 7.947887682247349, + "learning_rate": 2.6403600706095655e-06, + "loss": 1.151, + "step": 7030 + }, + { + "epoch": 0.9953988815742904, + "grad_norm": 9.666406912985915, + "learning_rate": 2.6397877713923333e-06, + "loss": 1.4629, + "step": 7031 + }, + { + "epoch": 0.9955404544489276, + "grad_norm": 10.263840425887668, + "learning_rate": 2.6392154648265617e-06, + "loss": 1.1688, + "step": 7032 + }, + { + "epoch": 0.9956820273235648, + "grad_norm": 7.892056597777875, + "learning_rate": 2.6386431509423373e-06, + "loss": 1.2165, + "step": 7033 + }, + { + "epoch": 0.995823600198202, + "grad_norm": 8.444099162666513, + "learning_rate": 2.6380708297697456e-06, + "loss": 1.3189, + "step": 7034 + }, + { + "epoch": 0.9959651730728393, + "grad_norm": 7.916307648666143, + "learning_rate": 2.637498501338873e-06, + "loss": 1.0153, + "step": 7035 + }, + { + "epoch": 0.9961067459474765, + "grad_norm": 7.6700879907845225, + "learning_rate": 2.6369261656798067e-06, + "loss": 1.3518, + "step": 7036 + }, + { + "epoch": 0.9962483188221137, + "grad_norm": 9.91047073123906, + "learning_rate": 2.636353822822635e-06, + "loss": 1.2152, + "step": 7037 + }, + { + "epoch": 0.9963898916967509, + "grad_norm": 8.761205854638034, + "learning_rate": 2.6357814727974434e-06, + "loss": 1.218, + "step": 7038 + }, + { + "epoch": 0.9965314645713881, + "grad_norm": 9.690952005930583, + "learning_rate": 2.6352091156343213e-06, + "loss": 1.3257, + "step": 7039 + }, + { + "epoch": 0.9966730374460253, + "grad_norm": 7.988620771360553, + "learning_rate": 2.6346367513633574e-06, + "loss": 1.193, + "step": 7040 + }, + { + "epoch": 0.9968146103206625, + "grad_norm": 7.801890745616922, + "learning_rate": 2.6340643800146387e-06, + "loss": 1.2243, + "step": 7041 + }, + { + "epoch": 0.9969561831952998, + "grad_norm": 8.198431041693901, + "learning_rate": 2.6334920016182565e-06, + "loss": 1.2215, + "step": 7042 + }, + { + "epoch": 0.997097756069937, + "grad_norm": 8.54841424168034, + "learning_rate": 2.6329196162042987e-06, + "loss": 1.1432, + "step": 7043 + }, + { + "epoch": 0.9972393289445742, + "grad_norm": 8.337351885485123, + "learning_rate": 2.6323472238028564e-06, + "loss": 1.0609, + "step": 7044 + }, + { + "epoch": 0.9973809018192115, + "grad_norm": 10.117814139618655, + "learning_rate": 2.6317748244440194e-06, + "loss": 1.2827, + "step": 7045 + }, + { + "epoch": 0.9975224746938487, + "grad_norm": 9.519660282004061, + "learning_rate": 2.6312024181578776e-06, + "loss": 1.2342, + "step": 7046 + }, + { + "epoch": 0.9976640475684859, + "grad_norm": 9.047589683963098, + "learning_rate": 2.6306300049745227e-06, + "loss": 1.2003, + "step": 7047 + }, + { + "epoch": 0.997805620443123, + "grad_norm": 7.590477760366289, + "learning_rate": 2.6300575849240455e-06, + "loss": 1.0887, + "step": 7048 + }, + { + "epoch": 0.9979471933177603, + "grad_norm": 8.613690466469551, + "learning_rate": 2.629485158036538e-06, + "loss": 1.1703, + "step": 7049 + }, + { + "epoch": 0.9980887661923975, + "grad_norm": 
9.023699603877315, + "learning_rate": 2.6289127243420924e-06, + "loss": 1.1895, + "step": 7050 + }, + { + "epoch": 0.9982303390670347, + "grad_norm": 8.671928226891831, + "learning_rate": 2.628340283870801e-06, + "loss": 1.1918, + "step": 7051 + }, + { + "epoch": 0.998371911941672, + "grad_norm": 9.863318378642989, + "learning_rate": 2.627767836652757e-06, + "loss": 1.2431, + "step": 7052 + }, + { + "epoch": 0.9985134848163092, + "grad_norm": 7.879488437913523, + "learning_rate": 2.627195382718053e-06, + "loss": 1.1791, + "step": 7053 + }, + { + "epoch": 0.9986550576909464, + "grad_norm": 9.281600996887859, + "learning_rate": 2.626622922096782e-06, + "loss": 1.2246, + "step": 7054 + }, + { + "epoch": 0.9987966305655837, + "grad_norm": 6.961691801600995, + "learning_rate": 2.626050454819039e-06, + "loss": 1.1959, + "step": 7055 + }, + { + "epoch": 0.9989382034402209, + "grad_norm": 8.449928003489743, + "learning_rate": 2.6254779809149174e-06, + "loss": 1.2544, + "step": 7056 + }, + { + "epoch": 0.9990797763148581, + "grad_norm": 10.108870857792736, + "learning_rate": 2.6249055004145118e-06, + "loss": 1.2686, + "step": 7057 + }, + { + "epoch": 0.9992213491894953, + "grad_norm": 9.230316251252962, + "learning_rate": 2.6243330133479173e-06, + "loss": 1.387, + "step": 7058 + }, + { + "epoch": 0.9993629220641325, + "grad_norm": 9.95298395700694, + "learning_rate": 2.6237605197452287e-06, + "loss": 1.3227, + "step": 7059 + }, + { + "epoch": 0.9995044949387697, + "grad_norm": 9.313629286734718, + "learning_rate": 2.6231880196365423e-06, + "loss": 1.0544, + "step": 7060 + }, + { + "epoch": 0.9996460678134069, + "grad_norm": 8.087784737670175, + "learning_rate": 2.6226155130519536e-06, + "loss": 1.2075, + "step": 7061 + }, + { + "epoch": 0.9997876406880442, + "grad_norm": 7.428065880141785, + "learning_rate": 2.6220430000215584e-06, + "loss": 1.2952, + "step": 7062 + }, + { + "epoch": 0.9999292135626814, + "grad_norm": 6.510009101826155, + "learning_rate": 2.6214704805754537e-06, + "loss": 1.1718, + "step": 7063 + }, + { + "epoch": 1.0000707864373186, + "grad_norm": 10.349935750715813, + "learning_rate": 2.620897954743736e-06, + "loss": 1.1248, + "step": 7064 + }, + { + "epoch": 1.0002123593119558, + "grad_norm": 7.592974839069222, + "learning_rate": 2.6203254225565034e-06, + "loss": 1.0205, + "step": 7065 + }, + { + "epoch": 1.000353932186593, + "grad_norm": 9.214016594658414, + "learning_rate": 2.619752884043854e-06, + "loss": 1.0973, + "step": 7066 + }, + { + "epoch": 1.0004955050612303, + "grad_norm": 8.688788229296113, + "learning_rate": 2.619180339235884e-06, + "loss": 1.0005, + "step": 7067 + }, + { + "epoch": 1.0006370779358675, + "grad_norm": 8.655717192960484, + "learning_rate": 2.618607788162692e-06, + "loss": 1.0573, + "step": 7068 + }, + { + "epoch": 1.0007786508105048, + "grad_norm": 7.8980663299132265, + "learning_rate": 2.618035230854378e-06, + "loss": 0.9802, + "step": 7069 + }, + { + "epoch": 1.000920223685142, + "grad_norm": 8.96437098767057, + "learning_rate": 2.6174626673410385e-06, + "loss": 1.0547, + "step": 7070 + }, + { + "epoch": 1.0010617965597792, + "grad_norm": 9.114165680535223, + "learning_rate": 2.616890097652775e-06, + "loss": 1.1219, + "step": 7071 + }, + { + "epoch": 1.0012033694344165, + "grad_norm": 9.533132098370423, + "learning_rate": 2.6163175218196862e-06, + "loss": 1.0271, + "step": 7072 + }, + { + "epoch": 1.0013449423090537, + "grad_norm": 10.47848522683149, + "learning_rate": 2.615744939871872e-06, + "loss": 1.1114, + "step": 7073 + }, + { + "epoch": 
1.0014865151836907, + "grad_norm": 8.846134578722125, + "learning_rate": 2.6151723518394327e-06, + "loss": 1.0637, + "step": 7074 + }, + { + "epoch": 1.001628088058328, + "grad_norm": 8.399767336575088, + "learning_rate": 2.6145997577524683e-06, + "loss": 1.0347, + "step": 7075 + }, + { + "epoch": 1.0017696609329652, + "grad_norm": 9.040024662490557, + "learning_rate": 2.6140271576410807e-06, + "loss": 0.9576, + "step": 7076 + }, + { + "epoch": 1.0019112338076024, + "grad_norm": 7.336748917202514, + "learning_rate": 2.613454551535371e-06, + "loss": 0.9849, + "step": 7077 + }, + { + "epoch": 1.0020528066822396, + "grad_norm": 8.731090057590174, + "learning_rate": 2.6128819394654385e-06, + "loss": 1.0454, + "step": 7078 + }, + { + "epoch": 1.0021943795568768, + "grad_norm": 8.909624069623348, + "learning_rate": 2.6123093214613875e-06, + "loss": 1.0204, + "step": 7079 + }, + { + "epoch": 1.002335952431514, + "grad_norm": 12.345101661557093, + "learning_rate": 2.6117366975533187e-06, + "loss": 1.0752, + "step": 7080 + }, + { + "epoch": 1.0024775253061513, + "grad_norm": 7.750708270779547, + "learning_rate": 2.6111640677713356e-06, + "loss": 1.0356, + "step": 7081 + }, + { + "epoch": 1.0026190981807885, + "grad_norm": 9.784476336158802, + "learning_rate": 2.6105914321455405e-06, + "loss": 1.083, + "step": 7082 + }, + { + "epoch": 1.0027606710554258, + "grad_norm": 7.996059162342757, + "learning_rate": 2.6100187907060365e-06, + "loss": 1.0148, + "step": 7083 + }, + { + "epoch": 1.002902243930063, + "grad_norm": 10.897380731135184, + "learning_rate": 2.609446143482926e-06, + "loss": 1.0549, + "step": 7084 + }, + { + "epoch": 1.0030438168047002, + "grad_norm": 9.67243307595802, + "learning_rate": 2.6088734905063134e-06, + "loss": 1.1552, + "step": 7085 + }, + { + "epoch": 1.0031853896793375, + "grad_norm": 8.55095702052097, + "learning_rate": 2.6083008318063023e-06, + "loss": 1.0147, + "step": 7086 + }, + { + "epoch": 1.0033269625539747, + "grad_norm": 8.224528902702374, + "learning_rate": 2.6077281674129974e-06, + "loss": 1.1196, + "step": 7087 + }, + { + "epoch": 1.003468535428612, + "grad_norm": 9.537612742386349, + "learning_rate": 2.607155497356504e-06, + "loss": 1.1787, + "step": 7088 + }, + { + "epoch": 1.0036101083032491, + "grad_norm": 9.026766499014714, + "learning_rate": 2.6065828216669254e-06, + "loss": 0.9629, + "step": 7089 + }, + { + "epoch": 1.0037516811778864, + "grad_norm": 9.90086049665476, + "learning_rate": 2.606010140374367e-06, + "loss": 1.1771, + "step": 7090 + }, + { + "epoch": 1.0038932540525236, + "grad_norm": 6.835348188884107, + "learning_rate": 2.6054374535089345e-06, + "loss": 1.0329, + "step": 7091 + }, + { + "epoch": 1.0040348269271608, + "grad_norm": 7.016166547166488, + "learning_rate": 2.604864761100734e-06, + "loss": 0.9642, + "step": 7092 + }, + { + "epoch": 1.004176399801798, + "grad_norm": 8.884205516871816, + "learning_rate": 2.604292063179871e-06, + "loss": 0.992, + "step": 7093 + }, + { + "epoch": 1.0043179726764353, + "grad_norm": 9.072399126103322, + "learning_rate": 2.6037193597764524e-06, + "loss": 1.1243, + "step": 7094 + }, + { + "epoch": 1.0044595455510723, + "grad_norm": 7.6535293416212005, + "learning_rate": 2.6031466509205843e-06, + "loss": 0.9634, + "step": 7095 + }, + { + "epoch": 1.0046011184257095, + "grad_norm": 8.157128297717874, + "learning_rate": 2.6025739366423735e-06, + "loss": 1.0443, + "step": 7096 + }, + { + "epoch": 1.0047426913003468, + "grad_norm": 10.492365286677835, + "learning_rate": 2.602001216971927e-06, + "loss": 1.0894, + 
"step": 7097 + }, + { + "epoch": 1.004884264174984, + "grad_norm": 10.362846885921584, + "learning_rate": 2.601428491939354e-06, + "loss": 1.0285, + "step": 7098 + }, + { + "epoch": 1.0050258370496212, + "grad_norm": 9.666360740643901, + "learning_rate": 2.600855761574759e-06, + "loss": 1.0366, + "step": 7099 + }, + { + "epoch": 1.0051674099242585, + "grad_norm": 9.042829541315111, + "learning_rate": 2.6002830259082527e-06, + "loss": 1.0044, + "step": 7100 + }, + { + "epoch": 1.0053089827988957, + "grad_norm": 10.153598286642223, + "learning_rate": 2.5997102849699424e-06, + "loss": 1.0295, + "step": 7101 + }, + { + "epoch": 1.005450555673533, + "grad_norm": 10.381207918360863, + "learning_rate": 2.5991375387899364e-06, + "loss": 1.1255, + "step": 7102 + }, + { + "epoch": 1.0055921285481701, + "grad_norm": 7.951932266821886, + "learning_rate": 2.598564787398345e-06, + "loss": 1.1094, + "step": 7103 + }, + { + "epoch": 1.0057337014228074, + "grad_norm": 9.06960465382368, + "learning_rate": 2.5979920308252753e-06, + "loss": 1.0108, + "step": 7104 + }, + { + "epoch": 1.0058752742974446, + "grad_norm": 9.612095889562106, + "learning_rate": 2.597419269100838e-06, + "loss": 1.0878, + "step": 7105 + }, + { + "epoch": 1.0060168471720818, + "grad_norm": 9.29212720154473, + "learning_rate": 2.596846502255142e-06, + "loss": 0.9923, + "step": 7106 + }, + { + "epoch": 1.006158420046719, + "grad_norm": 7.4034699320597035, + "learning_rate": 2.596273730318298e-06, + "loss": 0.8166, + "step": 7107 + }, + { + "epoch": 1.0062999929213563, + "grad_norm": 9.774496361131327, + "learning_rate": 2.595700953320415e-06, + "loss": 1.0381, + "step": 7108 + }, + { + "epoch": 1.0064415657959935, + "grad_norm": 9.572975554231508, + "learning_rate": 2.595128171291605e-06, + "loss": 1.0603, + "step": 7109 + }, + { + "epoch": 1.0065831386706308, + "grad_norm": 11.085369372444706, + "learning_rate": 2.5945553842619776e-06, + "loss": 0.9761, + "step": 7110 + }, + { + "epoch": 1.006724711545268, + "grad_norm": 10.341637709764326, + "learning_rate": 2.5939825922616443e-06, + "loss": 1.101, + "step": 7111 + }, + { + "epoch": 1.0068662844199052, + "grad_norm": 9.832170924216474, + "learning_rate": 2.593409795320716e-06, + "loss": 0.9372, + "step": 7112 + }, + { + "epoch": 1.0070078572945425, + "grad_norm": 8.177896006508137, + "learning_rate": 2.5928369934693043e-06, + "loss": 0.982, + "step": 7113 + }, + { + "epoch": 1.0071494301691797, + "grad_norm": 9.35012895627774, + "learning_rate": 2.592264186737522e-06, + "loss": 1.0891, + "step": 7114 + }, + { + "epoch": 1.007291003043817, + "grad_norm": 9.434187756272687, + "learning_rate": 2.5916913751554795e-06, + "loss": 0.9491, + "step": 7115 + }, + { + "epoch": 1.007432575918454, + "grad_norm": 10.203950109607332, + "learning_rate": 2.5911185587532895e-06, + "loss": 1.1604, + "step": 7116 + }, + { + "epoch": 1.0075741487930912, + "grad_norm": 9.87648945452343, + "learning_rate": 2.5905457375610647e-06, + "loss": 1.0827, + "step": 7117 + }, + { + "epoch": 1.0077157216677284, + "grad_norm": 10.02684879901851, + "learning_rate": 2.5899729116089183e-06, + "loss": 0.9636, + "step": 7118 + }, + { + "epoch": 1.0078572945423656, + "grad_norm": 8.437332264681112, + "learning_rate": 2.589400080926964e-06, + "loss": 1.0512, + "step": 7119 + }, + { + "epoch": 1.0079988674170028, + "grad_norm": 10.372435103346973, + "learning_rate": 2.5888272455453136e-06, + "loss": 1.0843, + "step": 7120 + }, + { + "epoch": 1.00814044029164, + "grad_norm": 9.08145201445881, + "learning_rate": 
2.5882544054940806e-06, + "loss": 0.9635, + "step": 7121 + }, + { + "epoch": 1.0082820131662773, + "grad_norm": 12.139173266243901, + "learning_rate": 2.5876815608033797e-06, + "loss": 1.1186, + "step": 7122 + }, + { + "epoch": 1.0084235860409145, + "grad_norm": 9.865721387516158, + "learning_rate": 2.587108711503324e-06, + "loss": 1.0222, + "step": 7123 + }, + { + "epoch": 1.0085651589155518, + "grad_norm": 11.387404835677073, + "learning_rate": 2.586535857624028e-06, + "loss": 1.0469, + "step": 7124 + }, + { + "epoch": 1.008706731790189, + "grad_norm": 11.370568386824518, + "learning_rate": 2.5859629991956075e-06, + "loss": 0.9239, + "step": 7125 + }, + { + "epoch": 1.0088483046648262, + "grad_norm": 8.027429763073247, + "learning_rate": 2.585390136248176e-06, + "loss": 0.9598, + "step": 7126 + }, + { + "epoch": 1.0089898775394635, + "grad_norm": 7.954605534314255, + "learning_rate": 2.5848172688118482e-06, + "loss": 1.0084, + "step": 7127 + }, + { + "epoch": 1.0091314504141007, + "grad_norm": 9.208363206449588, + "learning_rate": 2.5842443969167402e-06, + "loss": 1.0427, + "step": 7128 + }, + { + "epoch": 1.009273023288738, + "grad_norm": 9.660323286945463, + "learning_rate": 2.583671520592967e-06, + "loss": 0.9827, + "step": 7129 + }, + { + "epoch": 1.0094145961633751, + "grad_norm": 8.825098001514098, + "learning_rate": 2.583098639870644e-06, + "loss": 0.9594, + "step": 7130 + }, + { + "epoch": 1.0095561690380124, + "grad_norm": 9.901100529083205, + "learning_rate": 2.582525754779888e-06, + "loss": 0.958, + "step": 7131 + }, + { + "epoch": 1.0096977419126496, + "grad_norm": 9.223055386984658, + "learning_rate": 2.581952865350815e-06, + "loss": 1.0264, + "step": 7132 + }, + { + "epoch": 1.0098393147872868, + "grad_norm": 9.986792043485446, + "learning_rate": 2.58137997161354e-06, + "loss": 1.1449, + "step": 7133 + }, + { + "epoch": 1.009980887661924, + "grad_norm": 8.776861322690156, + "learning_rate": 2.580807073598181e-06, + "loss": 1.0334, + "step": 7134 + }, + { + "epoch": 1.0101224605365613, + "grad_norm": 10.076033122288022, + "learning_rate": 2.580234171334855e-06, + "loss": 1.0493, + "step": 7135 + }, + { + "epoch": 1.0102640334111985, + "grad_norm": 8.826514821355117, + "learning_rate": 2.5796612648536776e-06, + "loss": 1.1065, + "step": 7136 + }, + { + "epoch": 1.0104056062858355, + "grad_norm": 9.758353151930553, + "learning_rate": 2.579088354184767e-06, + "loss": 1.0187, + "step": 7137 + }, + { + "epoch": 1.0105471791604728, + "grad_norm": 7.717624103424435, + "learning_rate": 2.5785154393582405e-06, + "loss": 1.0211, + "step": 7138 + }, + { + "epoch": 1.01068875203511, + "grad_norm": 9.51919463653939, + "learning_rate": 2.577942520404216e-06, + "loss": 1.068, + "step": 7139 + }, + { + "epoch": 1.0108303249097472, + "grad_norm": 7.488422868053142, + "learning_rate": 2.577369597352812e-06, + "loss": 1.0417, + "step": 7140 + }, + { + "epoch": 1.0109718977843845, + "grad_norm": 10.566323214784644, + "learning_rate": 2.5767966702341454e-06, + "loss": 1.2169, + "step": 7141 + }, + { + "epoch": 1.0111134706590217, + "grad_norm": 11.25021124747411, + "learning_rate": 2.576223739078335e-06, + "loss": 1.0513, + "step": 7142 + }, + { + "epoch": 1.011255043533659, + "grad_norm": 7.124617984636226, + "learning_rate": 2.5756508039155e-06, + "loss": 0.9381, + "step": 7143 + }, + { + "epoch": 1.0113966164082961, + "grad_norm": 8.886081707936135, + "learning_rate": 2.575077864775758e-06, + "loss": 1.0706, + "step": 7144 + }, + { + "epoch": 1.0115381892829334, + "grad_norm": 
8.963680309854073, + "learning_rate": 2.5745049216892286e-06, + "loss": 1.0373, + "step": 7145 + }, + { + "epoch": 1.0116797621575706, + "grad_norm": 9.214227737464034, + "learning_rate": 2.5739319746860312e-06, + "loss": 0.9432, + "step": 7146 + }, + { + "epoch": 1.0118213350322078, + "grad_norm": 9.390360138215714, + "learning_rate": 2.5733590237962854e-06, + "loss": 1.047, + "step": 7147 + }, + { + "epoch": 1.011962907906845, + "grad_norm": 7.3188696612862625, + "learning_rate": 2.57278606905011e-06, + "loss": 1.0271, + "step": 7148 + }, + { + "epoch": 1.0121044807814823, + "grad_norm": 8.995122223511917, + "learning_rate": 2.572213110477625e-06, + "loss": 0.9565, + "step": 7149 + }, + { + "epoch": 1.0122460536561195, + "grad_norm": 8.770384232597928, + "learning_rate": 2.571640148108951e-06, + "loss": 1.0733, + "step": 7150 + }, + { + "epoch": 1.0123876265307568, + "grad_norm": 9.157454792216251, + "learning_rate": 2.5710671819742083e-06, + "loss": 1.0573, + "step": 7151 + }, + { + "epoch": 1.012529199405394, + "grad_norm": 10.980713669560421, + "learning_rate": 2.5704942121035163e-06, + "loss": 1.1159, + "step": 7152 + }, + { + "epoch": 1.0126707722800312, + "grad_norm": 9.387897215870133, + "learning_rate": 2.5699212385269954e-06, + "loss": 1.0929, + "step": 7153 + }, + { + "epoch": 1.0128123451546684, + "grad_norm": 10.731119297944588, + "learning_rate": 2.569348261274768e-06, + "loss": 0.9617, + "step": 7154 + }, + { + "epoch": 1.0129539180293057, + "grad_norm": 8.948075446665543, + "learning_rate": 2.5687752803769538e-06, + "loss": 1.0633, + "step": 7155 + }, + { + "epoch": 1.013095490903943, + "grad_norm": 10.395149978062186, + "learning_rate": 2.5682022958636752e-06, + "loss": 1.0466, + "step": 7156 + }, + { + "epoch": 1.01323706377858, + "grad_norm": 8.77749108062265, + "learning_rate": 2.5676293077650528e-06, + "loss": 0.9077, + "step": 7157 + }, + { + "epoch": 1.0133786366532171, + "grad_norm": 9.498784840817716, + "learning_rate": 2.5670563161112073e-06, + "loss": 1.1627, + "step": 7158 + }, + { + "epoch": 1.0135202095278544, + "grad_norm": 8.94861429238663, + "learning_rate": 2.5664833209322614e-06, + "loss": 0.9476, + "step": 7159 + }, + { + "epoch": 1.0136617824024916, + "grad_norm": 9.011132454784791, + "learning_rate": 2.565910322258337e-06, + "loss": 1.0639, + "step": 7160 + }, + { + "epoch": 1.0138033552771288, + "grad_norm": 9.421186279632058, + "learning_rate": 2.5653373201195554e-06, + "loss": 1.0923, + "step": 7161 + }, + { + "epoch": 1.013944928151766, + "grad_norm": 8.303178422244164, + "learning_rate": 2.564764314546041e-06, + "loss": 0.9811, + "step": 7162 + }, + { + "epoch": 1.0140865010264033, + "grad_norm": 10.655473876611788, + "learning_rate": 2.564191305567914e-06, + "loss": 1.0617, + "step": 7163 + }, + { + "epoch": 1.0142280739010405, + "grad_norm": 10.46165294066748, + "learning_rate": 2.563618293215298e-06, + "loss": 1.0123, + "step": 7164 + }, + { + "epoch": 1.0143696467756778, + "grad_norm": 8.461734582722238, + "learning_rate": 2.563045277518316e-06, + "loss": 1.0883, + "step": 7165 + }, + { + "epoch": 1.014511219650315, + "grad_norm": 8.17755641338187, + "learning_rate": 2.5624722585070907e-06, + "loss": 0.9451, + "step": 7166 + }, + { + "epoch": 1.0146527925249522, + "grad_norm": 9.049204366705958, + "learning_rate": 2.5618992362117453e-06, + "loss": 1.0367, + "step": 7167 + }, + { + "epoch": 1.0147943653995894, + "grad_norm": 8.578077416904685, + "learning_rate": 2.561326210662403e-06, + "loss": 1.0955, + "step": 7168 + }, + { + "epoch": 
1.0149359382742267, + "grad_norm": 10.11759960852271, + "learning_rate": 2.5607531818891877e-06, + "loss": 1.1771, + "step": 7169 + }, + { + "epoch": 1.015077511148864, + "grad_norm": 6.825976839130764, + "learning_rate": 2.5601801499222227e-06, + "loss": 0.9695, + "step": 7170 + }, + { + "epoch": 1.0152190840235011, + "grad_norm": 10.015344291478725, + "learning_rate": 2.5596071147916325e-06, + "loss": 1.0821, + "step": 7171 + }, + { + "epoch": 1.0153606568981384, + "grad_norm": 11.435980357123547, + "learning_rate": 2.5590340765275414e-06, + "loss": 0.9436, + "step": 7172 + }, + { + "epoch": 1.0155022297727756, + "grad_norm": 7.254143155881706, + "learning_rate": 2.558461035160072e-06, + "loss": 1.0127, + "step": 7173 + }, + { + "epoch": 1.0156438026474128, + "grad_norm": 7.773501540643867, + "learning_rate": 2.5578879907193495e-06, + "loss": 1.0092, + "step": 7174 + }, + { + "epoch": 1.01578537552205, + "grad_norm": 8.633247302398733, + "learning_rate": 2.557314943235498e-06, + "loss": 1.0217, + "step": 7175 + }, + { + "epoch": 1.0159269483966873, + "grad_norm": 8.480161516342818, + "learning_rate": 2.556741892738643e-06, + "loss": 1.0268, + "step": 7176 + }, + { + "epoch": 1.0160685212713245, + "grad_norm": 11.015896494881272, + "learning_rate": 2.5561688392589095e-06, + "loss": 1.0972, + "step": 7177 + }, + { + "epoch": 1.0162100941459615, + "grad_norm": 7.824115144954966, + "learning_rate": 2.555595782826423e-06, + "loss": 1.0008, + "step": 7178 + }, + { + "epoch": 1.0163516670205988, + "grad_norm": 8.665244156907994, + "learning_rate": 2.555022723471306e-06, + "loss": 1.0914, + "step": 7179 + }, + { + "epoch": 1.016493239895236, + "grad_norm": 8.544636825602673, + "learning_rate": 2.554449661223686e-06, + "loss": 0.9705, + "step": 7180 + }, + { + "epoch": 1.0166348127698732, + "grad_norm": 10.63805350489679, + "learning_rate": 2.553876596113688e-06, + "loss": 1.0275, + "step": 7181 + }, + { + "epoch": 1.0167763856445104, + "grad_norm": 9.337738859295357, + "learning_rate": 2.5533035281714368e-06, + "loss": 1.0911, + "step": 7182 + }, + { + "epoch": 1.0169179585191477, + "grad_norm": 9.965160716244668, + "learning_rate": 2.5527304574270596e-06, + "loss": 1.0496, + "step": 7183 + }, + { + "epoch": 1.017059531393785, + "grad_norm": 8.557922030492009, + "learning_rate": 2.5521573839106815e-06, + "loss": 1.0635, + "step": 7184 + }, + { + "epoch": 1.0172011042684221, + "grad_norm": 10.246496951059244, + "learning_rate": 2.551584307652428e-06, + "loss": 1.0226, + "step": 7185 + }, + { + "epoch": 1.0173426771430594, + "grad_norm": 8.45142125924803, + "learning_rate": 2.551011228682427e-06, + "loss": 0.9665, + "step": 7186 + }, + { + "epoch": 1.0174842500176966, + "grad_norm": 9.230590251263392, + "learning_rate": 2.5504381470308034e-06, + "loss": 1.0244, + "step": 7187 + }, + { + "epoch": 1.0176258228923338, + "grad_norm": 9.548135177927497, + "learning_rate": 2.549865062727684e-06, + "loss": 1.026, + "step": 7188 + }, + { + "epoch": 1.017767395766971, + "grad_norm": 9.620206754052777, + "learning_rate": 2.5492919758031953e-06, + "loss": 1.0964, + "step": 7189 + }, + { + "epoch": 1.0179089686416083, + "grad_norm": 8.205351260405653, + "learning_rate": 2.5487188862874635e-06, + "loss": 1.0282, + "step": 7190 + }, + { + "epoch": 1.0180505415162455, + "grad_norm": 7.7150266987801555, + "learning_rate": 2.5481457942106165e-06, + "loss": 1.0056, + "step": 7191 + }, + { + "epoch": 1.0181921143908828, + "grad_norm": 11.69638099452735, + "learning_rate": 2.547572699602781e-06, + "loss": 1.0727, + 
"step": 7192 + }, + { + "epoch": 1.01833368726552, + "grad_norm": 11.486109430040582, + "learning_rate": 2.5469996024940853e-06, + "loss": 1.0631, + "step": 7193 + }, + { + "epoch": 1.0184752601401572, + "grad_norm": 9.376638854150878, + "learning_rate": 2.5464265029146546e-06, + "loss": 1.0251, + "step": 7194 + }, + { + "epoch": 1.0186168330147944, + "grad_norm": 11.199590362142052, + "learning_rate": 2.545853400894617e-06, + "loss": 1.0833, + "step": 7195 + }, + { + "epoch": 1.0187584058894317, + "grad_norm": 9.791521133531727, + "learning_rate": 2.545280296464101e-06, + "loss": 1.0162, + "step": 7196 + }, + { + "epoch": 1.018899978764069, + "grad_norm": 8.926042375146455, + "learning_rate": 2.544707189653233e-06, + "loss": 1.0494, + "step": 7197 + }, + { + "epoch": 1.019041551638706, + "grad_norm": 9.823594351429893, + "learning_rate": 2.5441340804921413e-06, + "loss": 0.9484, + "step": 7198 + }, + { + "epoch": 1.0191831245133431, + "grad_norm": 9.860123256818266, + "learning_rate": 2.5435609690109545e-06, + "loss": 1.0485, + "step": 7199 + }, + { + "epoch": 1.0193246973879804, + "grad_norm": 8.378878478246516, + "learning_rate": 2.5429878552398e-06, + "loss": 1.1666, + "step": 7200 + }, + { + "epoch": 1.0194662702626176, + "grad_norm": 8.746856996900014, + "learning_rate": 2.5424147392088057e-06, + "loss": 1.0509, + "step": 7201 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 9.432546768375849, + "learning_rate": 2.5418416209481002e-06, + "loss": 1.1939, + "step": 7202 + }, + { + "epoch": 1.019749416011892, + "grad_norm": 9.917693739417784, + "learning_rate": 2.541268500487812e-06, + "loss": 1.0888, + "step": 7203 + }, + { + "epoch": 1.0198909888865293, + "grad_norm": 9.547358074546763, + "learning_rate": 2.540695377858069e-06, + "loss": 0.9846, + "step": 7204 + }, + { + "epoch": 1.0200325617611665, + "grad_norm": 12.113097966002933, + "learning_rate": 2.540122253089001e-06, + "loss": 1.1663, + "step": 7205 + }, + { + "epoch": 1.0201741346358038, + "grad_norm": 8.950601431815123, + "learning_rate": 2.539549126210735e-06, + "loss": 1.0652, + "step": 7206 + }, + { + "epoch": 1.020315707510441, + "grad_norm": 9.949265526735347, + "learning_rate": 2.5389759972534024e-06, + "loss": 1.0736, + "step": 7207 + }, + { + "epoch": 1.0204572803850782, + "grad_norm": 9.47549450069572, + "learning_rate": 2.53840286624713e-06, + "loss": 0.9575, + "step": 7208 + }, + { + "epoch": 1.0205988532597154, + "grad_norm": 9.963165345224834, + "learning_rate": 2.5378297332220474e-06, + "loss": 1.1346, + "step": 7209 + }, + { + "epoch": 1.0207404261343527, + "grad_norm": 8.401053825759051, + "learning_rate": 2.5372565982082843e-06, + "loss": 0.9566, + "step": 7210 + }, + { + "epoch": 1.02088199900899, + "grad_norm": 7.934368086365268, + "learning_rate": 2.5366834612359697e-06, + "loss": 0.9967, + "step": 7211 + }, + { + "epoch": 1.0210235718836271, + "grad_norm": 8.753131850926675, + "learning_rate": 2.5361103223352325e-06, + "loss": 0.9269, + "step": 7212 + }, + { + "epoch": 1.0211651447582644, + "grad_norm": 9.222535057838837, + "learning_rate": 2.5355371815362017e-06, + "loss": 1.0411, + "step": 7213 + }, + { + "epoch": 1.0213067176329016, + "grad_norm": 8.19546191549328, + "learning_rate": 2.534964038869009e-06, + "loss": 1.1464, + "step": 7214 + }, + { + "epoch": 1.0214482905075388, + "grad_norm": 8.459426984567369, + "learning_rate": 2.534390894363783e-06, + "loss": 1.04, + "step": 7215 + }, + { + "epoch": 1.021589863382176, + "grad_norm": 10.381869695955015, + "learning_rate": 
2.533817748050653e-06, + "loss": 1.087, + "step": 7216 + }, + { + "epoch": 1.0217314362568133, + "grad_norm": 7.918574079984671, + "learning_rate": 2.533244599959749e-06, + "loss": 1.023, + "step": 7217 + }, + { + "epoch": 1.0218730091314505, + "grad_norm": 11.403206197042374, + "learning_rate": 2.5326714501212014e-06, + "loss": 1.0915, + "step": 7218 + }, + { + "epoch": 1.0220145820060877, + "grad_norm": 9.441068467271643, + "learning_rate": 2.53209829856514e-06, + "loss": 1.0268, + "step": 7219 + }, + { + "epoch": 1.0221561548807248, + "grad_norm": 8.884919546575595, + "learning_rate": 2.531525145321695e-06, + "loss": 1.09, + "step": 7220 + }, + { + "epoch": 1.022297727755362, + "grad_norm": 8.785609564782895, + "learning_rate": 2.5309519904209962e-06, + "loss": 1.0902, + "step": 7221 + }, + { + "epoch": 1.0224393006299992, + "grad_norm": 12.433507230495394, + "learning_rate": 2.5303788338931744e-06, + "loss": 1.147, + "step": 7222 + }, + { + "epoch": 1.0225808735046364, + "grad_norm": 10.29183053775523, + "learning_rate": 2.5298056757683604e-06, + "loss": 1.0077, + "step": 7223 + }, + { + "epoch": 1.0227224463792737, + "grad_norm": 8.193947153752951, + "learning_rate": 2.529232516076684e-06, + "loss": 0.9938, + "step": 7224 + }, + { + "epoch": 1.022864019253911, + "grad_norm": 8.92523461468919, + "learning_rate": 2.528659354848277e-06, + "loss": 1.0438, + "step": 7225 + }, + { + "epoch": 1.0230055921285481, + "grad_norm": 8.24513563610174, + "learning_rate": 2.5280861921132677e-06, + "loss": 1.0239, + "step": 7226 + }, + { + "epoch": 1.0231471650031854, + "grad_norm": 8.231844257983463, + "learning_rate": 2.5275130279017884e-06, + "loss": 1.122, + "step": 7227 + }, + { + "epoch": 1.0232887378778226, + "grad_norm": 12.310886955133261, + "learning_rate": 2.52693986224397e-06, + "loss": 1.0044, + "step": 7228 + }, + { + "epoch": 1.0234303107524598, + "grad_norm": 11.021060203327586, + "learning_rate": 2.526366695169943e-06, + "loss": 1.1637, + "step": 7229 + }, + { + "epoch": 1.023571883627097, + "grad_norm": 10.062609180308305, + "learning_rate": 2.5257935267098395e-06, + "loss": 1.1228, + "step": 7230 + }, + { + "epoch": 1.0237134565017343, + "grad_norm": 9.810574287924572, + "learning_rate": 2.5252203568937884e-06, + "loss": 1.0034, + "step": 7231 + }, + { + "epoch": 1.0238550293763715, + "grad_norm": 9.491493432058034, + "learning_rate": 2.524647185751922e-06, + "loss": 1.1027, + "step": 7232 + }, + { + "epoch": 1.0239966022510087, + "grad_norm": 6.961238079667053, + "learning_rate": 2.5240740133143714e-06, + "loss": 0.9253, + "step": 7233 + }, + { + "epoch": 1.024138175125646, + "grad_norm": 13.351635070567843, + "learning_rate": 2.5235008396112688e-06, + "loss": 1.1571, + "step": 7234 + }, + { + "epoch": 1.0242797480002832, + "grad_norm": 8.834162067428565, + "learning_rate": 2.5229276646727428e-06, + "loss": 1.0838, + "step": 7235 + }, + { + "epoch": 1.0244213208749204, + "grad_norm": 8.872081867855023, + "learning_rate": 2.5223544885289287e-06, + "loss": 1.1049, + "step": 7236 + }, + { + "epoch": 1.0245628937495577, + "grad_norm": 11.348024009746421, + "learning_rate": 2.5217813112099543e-06, + "loss": 1.1046, + "step": 7237 + }, + { + "epoch": 1.024704466624195, + "grad_norm": 9.17545364464128, + "learning_rate": 2.521208132745953e-06, + "loss": 1.0508, + "step": 7238 + }, + { + "epoch": 1.0248460394988321, + "grad_norm": 8.577409002165496, + "learning_rate": 2.520634953167056e-06, + "loss": 0.9555, + "step": 7239 + }, + { + "epoch": 1.0249876123734691, + "grad_norm": 
9.250358780784786, + "learning_rate": 2.5200617725033947e-06, + "loss": 0.9257, + "step": 7240 + }, + { + "epoch": 1.0251291852481064, + "grad_norm": 10.287810762886764, + "learning_rate": 2.519488590785102e-06, + "loss": 1.1022, + "step": 7241 + }, + { + "epoch": 1.0252707581227436, + "grad_norm": 9.129629972382078, + "learning_rate": 2.5189154080423073e-06, + "loss": 1.0405, + "step": 7242 + }, + { + "epoch": 1.0254123309973808, + "grad_norm": 8.782785518966518, + "learning_rate": 2.518342224305144e-06, + "loss": 0.9807, + "step": 7243 + }, + { + "epoch": 1.025553903872018, + "grad_norm": 7.839211659402874, + "learning_rate": 2.517769039603744e-06, + "loss": 0.9566, + "step": 7244 + }, + { + "epoch": 1.0256954767466553, + "grad_norm": 7.088152454088607, + "learning_rate": 2.517195853968239e-06, + "loss": 0.9241, + "step": 7245 + }, + { + "epoch": 1.0258370496212925, + "grad_norm": 8.657126482166161, + "learning_rate": 2.516622667428761e-06, + "loss": 0.9712, + "step": 7246 + }, + { + "epoch": 1.0259786224959297, + "grad_norm": 7.952885892080382, + "learning_rate": 2.516049480015441e-06, + "loss": 0.9556, + "step": 7247 + }, + { + "epoch": 1.026120195370567, + "grad_norm": 8.929676820176248, + "learning_rate": 2.5154762917584125e-06, + "loss": 1.0468, + "step": 7248 + }, + { + "epoch": 1.0262617682452042, + "grad_norm": 9.330052980066075, + "learning_rate": 2.5149031026878063e-06, + "loss": 1.1408, + "step": 7249 + }, + { + "epoch": 1.0264033411198414, + "grad_norm": 8.860770409975691, + "learning_rate": 2.5143299128337543e-06, + "loss": 1.0949, + "step": 7250 + }, + { + "epoch": 1.0265449139944787, + "grad_norm": 7.866402186241999, + "learning_rate": 2.513756722226391e-06, + "loss": 0.9972, + "step": 7251 + }, + { + "epoch": 1.026686486869116, + "grad_norm": 9.913592292090142, + "learning_rate": 2.5131835308958467e-06, + "loss": 0.9892, + "step": 7252 + }, + { + "epoch": 1.0268280597437531, + "grad_norm": 8.694672841195906, + "learning_rate": 2.512610338872254e-06, + "loss": 0.8639, + "step": 7253 + }, + { + "epoch": 1.0269696326183904, + "grad_norm": 8.352250815736129, + "learning_rate": 2.512037146185745e-06, + "loss": 1.0022, + "step": 7254 + }, + { + "epoch": 1.0271112054930276, + "grad_norm": 9.034648744123233, + "learning_rate": 2.511463952866452e-06, + "loss": 1.0295, + "step": 7255 + }, + { + "epoch": 1.0272527783676648, + "grad_norm": 12.870532992337028, + "learning_rate": 2.510890758944508e-06, + "loss": 0.9877, + "step": 7256 + }, + { + "epoch": 1.027394351242302, + "grad_norm": 9.056142537050043, + "learning_rate": 2.5103175644500444e-06, + "loss": 1.0977, + "step": 7257 + }, + { + "epoch": 1.0275359241169393, + "grad_norm": 10.565357070138768, + "learning_rate": 2.5097443694131947e-06, + "loss": 1.1082, + "step": 7258 + }, + { + "epoch": 1.0276774969915765, + "grad_norm": 9.135355468916195, + "learning_rate": 2.50917117386409e-06, + "loss": 1.0075, + "step": 7259 + }, + { + "epoch": 1.0278190698662137, + "grad_norm": 10.615494369228642, + "learning_rate": 2.508597977832864e-06, + "loss": 1.0026, + "step": 7260 + }, + { + "epoch": 1.0279606427408507, + "grad_norm": 8.119303892768455, + "learning_rate": 2.508024781349649e-06, + "loss": 0.9348, + "step": 7261 + }, + { + "epoch": 1.028102215615488, + "grad_norm": 9.136560928788729, + "learning_rate": 2.5074515844445774e-06, + "loss": 1.1098, + "step": 7262 + }, + { + "epoch": 1.0282437884901252, + "grad_norm": 8.891667149075104, + "learning_rate": 2.5068783871477807e-06, + "loss": 1.0721, + "step": 7263 + }, + { + "epoch": 
1.0283853613647624, + "grad_norm": 8.245032924866274, + "learning_rate": 2.5063051894893925e-06, + "loss": 0.9986, + "step": 7264 + }, + { + "epoch": 1.0285269342393997, + "grad_norm": 9.597677315427635, + "learning_rate": 2.5057319914995454e-06, + "loss": 1.0767, + "step": 7265 + }, + { + "epoch": 1.028668507114037, + "grad_norm": 9.33685109285128, + "learning_rate": 2.5051587932083715e-06, + "loss": 1.0223, + "step": 7266 + }, + { + "epoch": 1.0288100799886741, + "grad_norm": 10.114580609484593, + "learning_rate": 2.504585594646004e-06, + "loss": 1.039, + "step": 7267 + }, + { + "epoch": 1.0289516528633114, + "grad_norm": 8.56440078910748, + "learning_rate": 2.504012395842576e-06, + "loss": 1.0217, + "step": 7268 + }, + { + "epoch": 1.0290932257379486, + "grad_norm": 8.636419725007807, + "learning_rate": 2.5034391968282186e-06, + "loss": 1.0884, + "step": 7269 + }, + { + "epoch": 1.0292347986125858, + "grad_norm": 10.11137471769495, + "learning_rate": 2.502865997633065e-06, + "loss": 1.1048, + "step": 7270 + }, + { + "epoch": 1.029376371487223, + "grad_norm": 10.473687763914702, + "learning_rate": 2.502292798287248e-06, + "loss": 1.0733, + "step": 7271 + }, + { + "epoch": 1.0295179443618603, + "grad_norm": 8.3903635790572, + "learning_rate": 2.5017195988208997e-06, + "loss": 0.9721, + "step": 7272 + }, + { + "epoch": 1.0296595172364975, + "grad_norm": 8.16401110181189, + "learning_rate": 2.5011463992641548e-06, + "loss": 1.0333, + "step": 7273 + }, + { + "epoch": 1.0298010901111347, + "grad_norm": 8.631377143116778, + "learning_rate": 2.500573199647144e-06, + "loss": 1.0556, + "step": 7274 + }, + { + "epoch": 1.029942662985772, + "grad_norm": 11.880383365495106, + "learning_rate": 2.5e-06, + "loss": 1.1122, + "step": 7275 + }, + { + "epoch": 1.0300842358604092, + "grad_norm": 9.3504936863106, + "learning_rate": 2.499426800352857e-06, + "loss": 1.0004, + "step": 7276 + }, + { + "epoch": 1.0302258087350464, + "grad_norm": 9.003241908788414, + "learning_rate": 2.4988536007358456e-06, + "loss": 0.955, + "step": 7277 + }, + { + "epoch": 1.0303673816096837, + "grad_norm": 8.999557060362184, + "learning_rate": 2.4982804011791003e-06, + "loss": 1.1626, + "step": 7278 + }, + { + "epoch": 1.030508954484321, + "grad_norm": 9.885799635023366, + "learning_rate": 2.497707201712753e-06, + "loss": 1.0781, + "step": 7279 + }, + { + "epoch": 1.0306505273589581, + "grad_norm": 7.837037881128567, + "learning_rate": 2.4971340023669356e-06, + "loss": 0.9846, + "step": 7280 + }, + { + "epoch": 1.0307921002335951, + "grad_norm": 9.463151642749464, + "learning_rate": 2.4965608031717827e-06, + "loss": 1.1419, + "step": 7281 + }, + { + "epoch": 1.0309336731082324, + "grad_norm": 9.428022665923123, + "learning_rate": 2.4959876041574256e-06, + "loss": 0.9622, + "step": 7282 + }, + { + "epoch": 1.0310752459828696, + "grad_norm": 10.522703783813123, + "learning_rate": 2.4954144053539966e-06, + "loss": 1.0655, + "step": 7283 + }, + { + "epoch": 1.0312168188575068, + "grad_norm": 8.590322025822095, + "learning_rate": 2.494841206791629e-06, + "loss": 1.0003, + "step": 7284 + }, + { + "epoch": 1.031358391732144, + "grad_norm": 9.29645605345438, + "learning_rate": 2.4942680085004554e-06, + "loss": 1.0324, + "step": 7285 + }, + { + "epoch": 1.0314999646067813, + "grad_norm": 7.536115032638253, + "learning_rate": 2.4936948105106084e-06, + "loss": 0.9096, + "step": 7286 + }, + { + "epoch": 1.0316415374814185, + "grad_norm": 8.864273251146768, + "learning_rate": 2.4931216128522197e-06, + "loss": 0.9755, + "step": 7287 + }, + { 
+ "epoch": 1.0317831103560557, + "grad_norm": 7.956715550852483, + "learning_rate": 2.4925484155554235e-06, + "loss": 0.9847, + "step": 7288 + }, + { + "epoch": 1.031924683230693, + "grad_norm": 9.871921107615298, + "learning_rate": 2.491975218650351e-06, + "loss": 0.9689, + "step": 7289 + }, + { + "epoch": 1.0320662561053302, + "grad_norm": 10.073551151420881, + "learning_rate": 2.491402022167136e-06, + "loss": 1.0183, + "step": 7290 + }, + { + "epoch": 1.0322078289799674, + "grad_norm": 11.743574617628344, + "learning_rate": 2.4908288261359108e-06, + "loss": 1.0932, + "step": 7291 + }, + { + "epoch": 1.0323494018546047, + "grad_norm": 9.75289287199232, + "learning_rate": 2.4902556305868065e-06, + "loss": 1.0286, + "step": 7292 + }, + { + "epoch": 1.032490974729242, + "grad_norm": 10.267832804585868, + "learning_rate": 2.4896824355499565e-06, + "loss": 1.0464, + "step": 7293 + }, + { + "epoch": 1.0326325476038791, + "grad_norm": 10.294089048970994, + "learning_rate": 2.489109241055493e-06, + "loss": 1.0827, + "step": 7294 + }, + { + "epoch": 1.0327741204785164, + "grad_norm": 11.620779932737612, + "learning_rate": 2.4885360471335483e-06, + "loss": 1.0525, + "step": 7295 + }, + { + "epoch": 1.0329156933531536, + "grad_norm": 10.42995339958336, + "learning_rate": 2.4879628538142557e-06, + "loss": 1.0662, + "step": 7296 + }, + { + "epoch": 1.0330572662277908, + "grad_norm": 10.621031614727222, + "learning_rate": 2.4873896611277467e-06, + "loss": 1.1333, + "step": 7297 + }, + { + "epoch": 1.033198839102428, + "grad_norm": 9.380692444967499, + "learning_rate": 2.4868164691041537e-06, + "loss": 1.1156, + "step": 7298 + }, + { + "epoch": 1.0333404119770653, + "grad_norm": 8.694695655625061, + "learning_rate": 2.4862432777736094e-06, + "loss": 0.9902, + "step": 7299 + }, + { + "epoch": 1.0334819848517025, + "grad_norm": 9.050077777338652, + "learning_rate": 2.4856700871662452e-06, + "loss": 0.9603, + "step": 7300 + }, + { + "epoch": 1.0336235577263397, + "grad_norm": 9.74959309046729, + "learning_rate": 2.4850968973121945e-06, + "loss": 1.1073, + "step": 7301 + }, + { + "epoch": 1.033765130600977, + "grad_norm": 8.970964430863843, + "learning_rate": 2.4845237082415887e-06, + "loss": 1.0156, + "step": 7302 + }, + { + "epoch": 1.033906703475614, + "grad_norm": 8.618972662579965, + "learning_rate": 2.48395051998456e-06, + "loss": 1.0402, + "step": 7303 + }, + { + "epoch": 1.0340482763502512, + "grad_norm": 9.235620818035528, + "learning_rate": 2.48337733257124e-06, + "loss": 1.0212, + "step": 7304 + }, + { + "epoch": 1.0341898492248884, + "grad_norm": 9.57262248907578, + "learning_rate": 2.482804146031762e-06, + "loss": 0.9959, + "step": 7305 + }, + { + "epoch": 1.0343314220995257, + "grad_norm": 8.340188627236834, + "learning_rate": 2.482230960396256e-06, + "loss": 0.9693, + "step": 7306 + }, + { + "epoch": 1.034472994974163, + "grad_norm": 9.287712544490276, + "learning_rate": 2.4816577756948564e-06, + "loss": 1.0662, + "step": 7307 + }, + { + "epoch": 1.0346145678488001, + "grad_norm": 8.462053304628322, + "learning_rate": 2.481084591957693e-06, + "loss": 1.1267, + "step": 7308 + }, + { + "epoch": 1.0347561407234374, + "grad_norm": 11.214588636727413, + "learning_rate": 2.480511409214899e-06, + "loss": 1.1207, + "step": 7309 + }, + { + "epoch": 1.0348977135980746, + "grad_norm": 9.54198214302319, + "learning_rate": 2.479938227496605e-06, + "loss": 1.0575, + "step": 7310 + }, + { + "epoch": 1.0350392864727118, + "grad_norm": 9.402952712544899, + "learning_rate": 2.479365046832944e-06, + "loss": 
1.0026, + "step": 7311 + }, + { + "epoch": 1.035180859347349, + "grad_norm": 8.191436520121705, + "learning_rate": 2.478791867254047e-06, + "loss": 1.0332, + "step": 7312 + }, + { + "epoch": 1.0353224322219863, + "grad_norm": 11.444758545681307, + "learning_rate": 2.478218688790047e-06, + "loss": 0.9679, + "step": 7313 + }, + { + "epoch": 1.0354640050966235, + "grad_norm": 8.623665084365504, + "learning_rate": 2.477645511471073e-06, + "loss": 0.9727, + "step": 7314 + }, + { + "epoch": 1.0356055779712607, + "grad_norm": 8.860463877777198, + "learning_rate": 2.4770723353272576e-06, + "loss": 1.066, + "step": 7315 + }, + { + "epoch": 1.035747150845898, + "grad_norm": 11.712251453347701, + "learning_rate": 2.4764991603887325e-06, + "loss": 1.0889, + "step": 7316 + }, + { + "epoch": 1.0358887237205352, + "grad_norm": 9.75037285507525, + "learning_rate": 2.475925986685629e-06, + "loss": 1.1371, + "step": 7317 + }, + { + "epoch": 1.0360302965951724, + "grad_norm": 8.850869053127239, + "learning_rate": 2.4753528142480784e-06, + "loss": 0.991, + "step": 7318 + }, + { + "epoch": 1.0361718694698097, + "grad_norm": 8.93911664519959, + "learning_rate": 2.4747796431062124e-06, + "loss": 1.0719, + "step": 7319 + }, + { + "epoch": 1.036313442344447, + "grad_norm": 8.0298306290756, + "learning_rate": 2.4742064732901618e-06, + "loss": 1.0858, + "step": 7320 + }, + { + "epoch": 1.0364550152190841, + "grad_norm": 8.31682640437022, + "learning_rate": 2.473633304830057e-06, + "loss": 1.0728, + "step": 7321 + }, + { + "epoch": 1.0365965880937213, + "grad_norm": 9.66277914692689, + "learning_rate": 2.4730601377560305e-06, + "loss": 1.0537, + "step": 7322 + }, + { + "epoch": 1.0367381609683584, + "grad_norm": 7.129602919726228, + "learning_rate": 2.4724869720982124e-06, + "loss": 0.9731, + "step": 7323 + }, + { + "epoch": 1.0368797338429956, + "grad_norm": 9.141506785058443, + "learning_rate": 2.471913807886733e-06, + "loss": 1.0211, + "step": 7324 + }, + { + "epoch": 1.0370213067176328, + "grad_norm": 8.8534637052443, + "learning_rate": 2.4713406451517247e-06, + "loss": 1.0429, + "step": 7325 + }, + { + "epoch": 1.03716287959227, + "grad_norm": 9.690250129423944, + "learning_rate": 2.4707674839233168e-06, + "loss": 1.1006, + "step": 7326 + }, + { + "epoch": 1.0373044524669073, + "grad_norm": 9.407754872742245, + "learning_rate": 2.4701943242316405e-06, + "loss": 1.0018, + "step": 7327 + }, + { + "epoch": 1.0374460253415445, + "grad_norm": 8.519139451817402, + "learning_rate": 2.469621166106826e-06, + "loss": 1.0132, + "step": 7328 + }, + { + "epoch": 1.0375875982161817, + "grad_norm": 10.000706075536733, + "learning_rate": 2.4690480095790046e-06, + "loss": 0.9392, + "step": 7329 + }, + { + "epoch": 1.037729171090819, + "grad_norm": 10.873054768891961, + "learning_rate": 2.468474854678306e-06, + "loss": 0.9717, + "step": 7330 + }, + { + "epoch": 1.0378707439654562, + "grad_norm": 8.878506679581594, + "learning_rate": 2.4679017014348606e-06, + "loss": 1.0104, + "step": 7331 + }, + { + "epoch": 1.0380123168400934, + "grad_norm": 7.3275094170251185, + "learning_rate": 2.467328549878799e-06, + "loss": 0.9902, + "step": 7332 + }, + { + "epoch": 1.0381538897147307, + "grad_norm": 7.872470510021129, + "learning_rate": 2.4667554000402513e-06, + "loss": 0.9466, + "step": 7333 + }, + { + "epoch": 1.038295462589368, + "grad_norm": 10.171192058366787, + "learning_rate": 2.4661822519493485e-06, + "loss": 0.9781, + "step": 7334 + }, + { + "epoch": 1.0384370354640051, + "grad_norm": 10.240633895446434, + "learning_rate": 
2.465609105636218e-06, + "loss": 1.0264, + "step": 7335 + }, + { + "epoch": 1.0385786083386424, + "grad_norm": 9.741452113634114, + "learning_rate": 2.465035961130992e-06, + "loss": 1.0165, + "step": 7336 + }, + { + "epoch": 1.0387201812132796, + "grad_norm": 10.279005367980915, + "learning_rate": 2.4644628184637987e-06, + "loss": 1.0178, + "step": 7337 + }, + { + "epoch": 1.0388617540879168, + "grad_norm": 9.043027807582533, + "learning_rate": 2.4638896776647684e-06, + "loss": 1.0256, + "step": 7338 + }, + { + "epoch": 1.039003326962554, + "grad_norm": 9.66011202224004, + "learning_rate": 2.463316538764031e-06, + "loss": 1.1664, + "step": 7339 + }, + { + "epoch": 1.0391448998371913, + "grad_norm": 8.868530064248995, + "learning_rate": 2.462743401791716e-06, + "loss": 1.0465, + "step": 7340 + }, + { + "epoch": 1.0392864727118285, + "grad_norm": 8.067198338077409, + "learning_rate": 2.462170266777953e-06, + "loss": 1.0009, + "step": 7341 + }, + { + "epoch": 1.0394280455864657, + "grad_norm": 9.768803779517471, + "learning_rate": 2.4615971337528704e-06, + "loss": 0.9289, + "step": 7342 + }, + { + "epoch": 1.039569618461103, + "grad_norm": 10.1133755487589, + "learning_rate": 2.461024002746598e-06, + "loss": 0.9745, + "step": 7343 + }, + { + "epoch": 1.03971119133574, + "grad_norm": 8.309729092769201, + "learning_rate": 2.4604508737892653e-06, + "loss": 0.9507, + "step": 7344 + }, + { + "epoch": 1.0398527642103772, + "grad_norm": 10.395034015227385, + "learning_rate": 2.459877746911e-06, + "loss": 1.2991, + "step": 7345 + }, + { + "epoch": 1.0399943370850144, + "grad_norm": 9.754591032104488, + "learning_rate": 2.4593046221419317e-06, + "loss": 1.159, + "step": 7346 + }, + { + "epoch": 1.0401359099596517, + "grad_norm": 11.094382684423568, + "learning_rate": 2.4587314995121893e-06, + "loss": 1.0649, + "step": 7347 + }, + { + "epoch": 1.040277482834289, + "grad_norm": 8.311428373857861, + "learning_rate": 2.458158379051901e-06, + "loss": 1.0387, + "step": 7348 + }, + { + "epoch": 1.0404190557089261, + "grad_norm": 10.406479841563673, + "learning_rate": 2.4575852607911956e-06, + "loss": 1.1433, + "step": 7349 + }, + { + "epoch": 1.0405606285835634, + "grad_norm": 8.129972843439184, + "learning_rate": 2.457012144760201e-06, + "loss": 0.946, + "step": 7350 + }, + { + "epoch": 1.0407022014582006, + "grad_norm": 7.983931377949258, + "learning_rate": 2.4564390309890463e-06, + "loss": 1.0915, + "step": 7351 + }, + { + "epoch": 1.0408437743328378, + "grad_norm": 8.98471212003383, + "learning_rate": 2.455865919507859e-06, + "loss": 1.0293, + "step": 7352 + }, + { + "epoch": 1.040985347207475, + "grad_norm": 9.251211550987946, + "learning_rate": 2.4552928103467677e-06, + "loss": 1.1346, + "step": 7353 + }, + { + "epoch": 1.0411269200821123, + "grad_norm": 10.349844344438305, + "learning_rate": 2.4547197035359e-06, + "loss": 1.0218, + "step": 7354 + }, + { + "epoch": 1.0412684929567495, + "grad_norm": 8.896466580319744, + "learning_rate": 2.454146599105384e-06, + "loss": 1.0052, + "step": 7355 + }, + { + "epoch": 1.0414100658313867, + "grad_norm": 10.809884697707666, + "learning_rate": 2.4535734970853466e-06, + "loss": 1.1439, + "step": 7356 + }, + { + "epoch": 1.041551638706024, + "grad_norm": 9.177263227093112, + "learning_rate": 2.453000397505916e-06, + "loss": 1.0017, + "step": 7357 + }, + { + "epoch": 1.0416932115806612, + "grad_norm": 8.239271181336937, + "learning_rate": 2.4524273003972194e-06, + "loss": 1.0709, + "step": 7358 + }, + { + "epoch": 1.0418347844552984, + "grad_norm": 
7.7441253546507705, + "learning_rate": 2.451854205789384e-06, + "loss": 0.9886, + "step": 7359 + }, + { + "epoch": 1.0419763573299357, + "grad_norm": 10.203309692767512, + "learning_rate": 2.4512811137125374e-06, + "loss": 1.0331, + "step": 7360 + }, + { + "epoch": 1.0421179302045729, + "grad_norm": 10.221731888257008, + "learning_rate": 2.4507080241968055e-06, + "loss": 0.9936, + "step": 7361 + }, + { + "epoch": 1.0422595030792101, + "grad_norm": 11.09669133267727, + "learning_rate": 2.450134937272317e-06, + "loss": 1.1567, + "step": 7362 + }, + { + "epoch": 1.0424010759538473, + "grad_norm": 9.745252186862425, + "learning_rate": 2.449561852969197e-06, + "loss": 0.9645, + "step": 7363 + }, + { + "epoch": 1.0425426488284844, + "grad_norm": 8.504579544573645, + "learning_rate": 2.448988771317573e-06, + "loss": 1.0211, + "step": 7364 + }, + { + "epoch": 1.0426842217031216, + "grad_norm": 8.022961090712451, + "learning_rate": 2.448415692347572e-06, + "loss": 1.0442, + "step": 7365 + }, + { + "epoch": 1.0428257945777588, + "grad_norm": 10.282160107063358, + "learning_rate": 2.4478426160893197e-06, + "loss": 1.1275, + "step": 7366 + }, + { + "epoch": 1.042967367452396, + "grad_norm": 8.16393120034968, + "learning_rate": 2.4472695425729412e-06, + "loss": 1.1157, + "step": 7367 + }, + { + "epoch": 1.0431089403270333, + "grad_norm": 8.89120765747817, + "learning_rate": 2.4466964718285636e-06, + "loss": 1.0701, + "step": 7368 + }, + { + "epoch": 1.0432505132016705, + "grad_norm": 9.635308628335745, + "learning_rate": 2.446123403886313e-06, + "loss": 1.081, + "step": 7369 + }, + { + "epoch": 1.0433920860763077, + "grad_norm": 8.584886231421201, + "learning_rate": 2.445550338776315e-06, + "loss": 0.8563, + "step": 7370 + }, + { + "epoch": 1.043533658950945, + "grad_norm": 7.9834294365797245, + "learning_rate": 2.4449772765286947e-06, + "loss": 0.9142, + "step": 7371 + }, + { + "epoch": 1.0436752318255822, + "grad_norm": 10.942927643243989, + "learning_rate": 2.4444042171735784e-06, + "loss": 1.1677, + "step": 7372 + }, + { + "epoch": 1.0438168047002194, + "grad_norm": 8.37626160186342, + "learning_rate": 2.4438311607410905e-06, + "loss": 1.0234, + "step": 7373 + }, + { + "epoch": 1.0439583775748567, + "grad_norm": 9.574424339897918, + "learning_rate": 2.443258107261357e-06, + "loss": 0.9915, + "step": 7374 + }, + { + "epoch": 1.0440999504494939, + "grad_norm": 13.070961579818592, + "learning_rate": 2.4426850567645022e-06, + "loss": 1.0307, + "step": 7375 + }, + { + "epoch": 1.0442415233241311, + "grad_norm": 10.370333437879081, + "learning_rate": 2.442112009280652e-06, + "loss": 1.0932, + "step": 7376 + }, + { + "epoch": 1.0443830961987683, + "grad_norm": 9.21237117102722, + "learning_rate": 2.4415389648399294e-06, + "loss": 1.0574, + "step": 7377 + }, + { + "epoch": 1.0445246690734056, + "grad_norm": 7.347513735908579, + "learning_rate": 2.44096592347246e-06, + "loss": 0.952, + "step": 7378 + }, + { + "epoch": 1.0446662419480428, + "grad_norm": 8.85267819328409, + "learning_rate": 2.440392885208368e-06, + "loss": 1.0987, + "step": 7379 + }, + { + "epoch": 1.04480781482268, + "grad_norm": 9.278448125057066, + "learning_rate": 2.4398198500777777e-06, + "loss": 1.0547, + "step": 7380 + }, + { + "epoch": 1.0449493876973173, + "grad_norm": 8.58921844301995, + "learning_rate": 2.4392468181108127e-06, + "loss": 0.9522, + "step": 7381 + }, + { + "epoch": 1.0450909605719545, + "grad_norm": 12.089131895440282, + "learning_rate": 2.438673789337598e-06, + "loss": 1.078, + "step": 7382 + }, + { + "epoch": 
1.0452325334465917, + "grad_norm": 8.988046549385789, + "learning_rate": 2.4381007637882555e-06, + "loss": 1.1976, + "step": 7383 + }, + { + "epoch": 1.045374106321229, + "grad_norm": 8.456043804217646, + "learning_rate": 2.4375277414929098e-06, + "loss": 1.1808, + "step": 7384 + }, + { + "epoch": 1.045515679195866, + "grad_norm": 9.133745157280476, + "learning_rate": 2.4369547224816843e-06, + "loss": 1.0313, + "step": 7385 + }, + { + "epoch": 1.0456572520705032, + "grad_norm": 8.002178372394187, + "learning_rate": 2.436381706784702e-06, + "loss": 0.9878, + "step": 7386 + }, + { + "epoch": 1.0457988249451404, + "grad_norm": 9.547720064761984, + "learning_rate": 2.435808694432087e-06, + "loss": 0.9941, + "step": 7387 + }, + { + "epoch": 1.0459403978197777, + "grad_norm": 8.642122825224687, + "learning_rate": 2.4352356854539607e-06, + "loss": 1.0064, + "step": 7388 + }, + { + "epoch": 1.0460819706944149, + "grad_norm": 10.086612977235673, + "learning_rate": 2.434662679880445e-06, + "loss": 1.1187, + "step": 7389 + }, + { + "epoch": 1.0462235435690521, + "grad_norm": 10.648991324785252, + "learning_rate": 2.4340896777416636e-06, + "loss": 1.049, + "step": 7390 + }, + { + "epoch": 1.0463651164436893, + "grad_norm": 8.315536059305694, + "learning_rate": 2.4335166790677395e-06, + "loss": 0.9957, + "step": 7391 + }, + { + "epoch": 1.0465066893183266, + "grad_norm": 8.231352104648611, + "learning_rate": 2.4329436838887936e-06, + "loss": 1.0116, + "step": 7392 + }, + { + "epoch": 1.0466482621929638, + "grad_norm": 7.912954028754473, + "learning_rate": 2.432370692234948e-06, + "loss": 1.0076, + "step": 7393 + }, + { + "epoch": 1.046789835067601, + "grad_norm": 10.820444352761625, + "learning_rate": 2.431797704136325e-06, + "loss": 1.0035, + "step": 7394 + }, + { + "epoch": 1.0469314079422383, + "grad_norm": 7.989490997563392, + "learning_rate": 2.431224719623046e-06, + "loss": 0.9078, + "step": 7395 + }, + { + "epoch": 1.0470729808168755, + "grad_norm": 7.90563407579562, + "learning_rate": 2.430651738725232e-06, + "loss": 1.0209, + "step": 7396 + }, + { + "epoch": 1.0472145536915127, + "grad_norm": 9.444083841458216, + "learning_rate": 2.430078761473005e-06, + "loss": 1.1117, + "step": 7397 + }, + { + "epoch": 1.04735612656615, + "grad_norm": 8.365569769499228, + "learning_rate": 2.429505787896485e-06, + "loss": 1.0587, + "step": 7398 + }, + { + "epoch": 1.0474976994407872, + "grad_norm": 8.880494472036515, + "learning_rate": 2.428932818025793e-06, + "loss": 1.115, + "step": 7399 + }, + { + "epoch": 1.0476392723154244, + "grad_norm": 10.051198358401917, + "learning_rate": 2.42835985189105e-06, + "loss": 1.1142, + "step": 7400 + }, + { + "epoch": 1.0477808451900616, + "grad_norm": 10.293889679508263, + "learning_rate": 2.427786889522376e-06, + "loss": 1.0585, + "step": 7401 + }, + { + "epoch": 1.0479224180646989, + "grad_norm": 8.739877131316424, + "learning_rate": 2.427213930949891e-06, + "loss": 0.9236, + "step": 7402 + }, + { + "epoch": 1.048063990939336, + "grad_norm": 7.40055248156992, + "learning_rate": 2.426640976203716e-06, + "loss": 0.9182, + "step": 7403 + }, + { + "epoch": 1.0482055638139733, + "grad_norm": 7.439979460493059, + "learning_rate": 2.4260680253139696e-06, + "loss": 1.0562, + "step": 7404 + }, + { + "epoch": 1.0483471366886103, + "grad_norm": 10.835450968033774, + "learning_rate": 2.425495078310772e-06, + "loss": 1.0225, + "step": 7405 + }, + { + "epoch": 1.0484887095632476, + "grad_norm": 8.07585563732428, + "learning_rate": 2.424922135224243e-06, + "loss": 0.9842, + "step": 
7406 + }, + { + "epoch": 1.0486302824378848, + "grad_norm": 9.526546530939338, + "learning_rate": 2.4243491960845004e-06, + "loss": 1.0997, + "step": 7407 + }, + { + "epoch": 1.048771855312522, + "grad_norm": 10.040784826113764, + "learning_rate": 2.4237762609216666e-06, + "loss": 1.0446, + "step": 7408 + }, + { + "epoch": 1.0489134281871593, + "grad_norm": 8.212250318752563, + "learning_rate": 2.423203329765856e-06, + "loss": 0.9347, + "step": 7409 + }, + { + "epoch": 1.0490550010617965, + "grad_norm": 10.446095838825421, + "learning_rate": 2.4226304026471894e-06, + "loss": 1.051, + "step": 7410 + }, + { + "epoch": 1.0491965739364337, + "grad_norm": 9.696006928318683, + "learning_rate": 2.4220574795957844e-06, + "loss": 1.1205, + "step": 7411 + }, + { + "epoch": 1.049338146811071, + "grad_norm": 9.253727754583934, + "learning_rate": 2.4214845606417604e-06, + "loss": 1.081, + "step": 7412 + }, + { + "epoch": 1.0494797196857082, + "grad_norm": 7.838217436885164, + "learning_rate": 2.4209116458152334e-06, + "loss": 0.9763, + "step": 7413 + }, + { + "epoch": 1.0496212925603454, + "grad_norm": 8.383122391318883, + "learning_rate": 2.4203387351463228e-06, + "loss": 0.9703, + "step": 7414 + }, + { + "epoch": 1.0497628654349827, + "grad_norm": 8.015021050438037, + "learning_rate": 2.4197658286651456e-06, + "loss": 1.024, + "step": 7415 + }, + { + "epoch": 1.0499044383096199, + "grad_norm": 8.809010782118085, + "learning_rate": 2.419192926401819e-06, + "loss": 1.0705, + "step": 7416 + }, + { + "epoch": 1.050046011184257, + "grad_norm": 11.197766926082155, + "learning_rate": 2.41862002838646e-06, + "loss": 1.0028, + "step": 7417 + }, + { + "epoch": 1.0501875840588943, + "grad_norm": 9.275878173712432, + "learning_rate": 2.4180471346491864e-06, + "loss": 1.0607, + "step": 7418 + }, + { + "epoch": 1.0503291569335316, + "grad_norm": 9.164316835638369, + "learning_rate": 2.4174742452201123e-06, + "loss": 1.0635, + "step": 7419 + }, + { + "epoch": 1.0504707298081688, + "grad_norm": 8.922479558928234, + "learning_rate": 2.4169013601293563e-06, + "loss": 1.0073, + "step": 7420 + }, + { + "epoch": 1.050612302682806, + "grad_norm": 8.78936588018064, + "learning_rate": 2.4163284794070333e-06, + "loss": 1.0175, + "step": 7421 + }, + { + "epoch": 1.0507538755574433, + "grad_norm": 8.696424990603179, + "learning_rate": 2.4157556030832606e-06, + "loss": 0.9943, + "step": 7422 + }, + { + "epoch": 1.0508954484320805, + "grad_norm": 8.219926525075314, + "learning_rate": 2.415182731188152e-06, + "loss": 0.9774, + "step": 7423 + }, + { + "epoch": 1.0510370213067177, + "grad_norm": 9.341655789952293, + "learning_rate": 2.4146098637518248e-06, + "loss": 1.0768, + "step": 7424 + }, + { + "epoch": 1.051178594181355, + "grad_norm": 11.264832954234892, + "learning_rate": 2.414037000804393e-06, + "loss": 0.9999, + "step": 7425 + }, + { + "epoch": 1.0513201670559922, + "grad_norm": 9.55175376886544, + "learning_rate": 2.413464142375972e-06, + "loss": 0.9889, + "step": 7426 + }, + { + "epoch": 1.0514617399306292, + "grad_norm": 11.649746714445863, + "learning_rate": 2.412891288496677e-06, + "loss": 0.9963, + "step": 7427 + }, + { + "epoch": 1.0516033128052664, + "grad_norm": 10.661961789427814, + "learning_rate": 2.4123184391966216e-06, + "loss": 1.0136, + "step": 7428 + }, + { + "epoch": 1.0517448856799037, + "grad_norm": 7.904336931576289, + "learning_rate": 2.411745594505921e-06, + "loss": 0.9287, + "step": 7429 + }, + { + "epoch": 1.0518864585545409, + "grad_norm": 10.12675140680247, + "learning_rate": 
2.411172754454688e-06, + "loss": 0.9755, + "step": 7430 + }, + { + "epoch": 1.0520280314291781, + "grad_norm": 8.39187561603367, + "learning_rate": 2.410599919073037e-06, + "loss": 1.0251, + "step": 7431 + }, + { + "epoch": 1.0521696043038153, + "grad_norm": 9.198270751312615, + "learning_rate": 2.410027088391082e-06, + "loss": 1.0477, + "step": 7432 + }, + { + "epoch": 1.0523111771784526, + "grad_norm": 10.456864901086286, + "learning_rate": 2.4094542624389357e-06, + "loss": 1.016, + "step": 7433 + }, + { + "epoch": 1.0524527500530898, + "grad_norm": 10.90719294365678, + "learning_rate": 2.4088814412467117e-06, + "loss": 1.144, + "step": 7434 + }, + { + "epoch": 1.052594322927727, + "grad_norm": 9.978391570781524, + "learning_rate": 2.4083086248445213e-06, + "loss": 1.103, + "step": 7435 + }, + { + "epoch": 1.0527358958023643, + "grad_norm": 8.334473442923722, + "learning_rate": 2.4077358132624786e-06, + "loss": 1.034, + "step": 7436 + }, + { + "epoch": 1.0528774686770015, + "grad_norm": 9.319795144903601, + "learning_rate": 2.4071630065306956e-06, + "loss": 1.109, + "step": 7437 + }, + { + "epoch": 1.0530190415516387, + "grad_norm": 10.292705241527363, + "learning_rate": 2.406590204679284e-06, + "loss": 1.0838, + "step": 7438 + }, + { + "epoch": 1.053160614426276, + "grad_norm": 12.020376072235376, + "learning_rate": 2.406017407738356e-06, + "loss": 0.9354, + "step": 7439 + }, + { + "epoch": 1.0533021873009132, + "grad_norm": 10.526694379322697, + "learning_rate": 2.4054446157380237e-06, + "loss": 1.0142, + "step": 7440 + }, + { + "epoch": 1.0534437601755504, + "grad_norm": 9.033335936523663, + "learning_rate": 2.404871828708396e-06, + "loss": 1.102, + "step": 7441 + }, + { + "epoch": 1.0535853330501876, + "grad_norm": 9.269411493774454, + "learning_rate": 2.4042990466795857e-06, + "loss": 1.065, + "step": 7442 + }, + { + "epoch": 1.0537269059248249, + "grad_norm": 10.041772950006996, + "learning_rate": 2.4037262696817034e-06, + "loss": 1.0405, + "step": 7443 + }, + { + "epoch": 1.053868478799462, + "grad_norm": 7.9139371750935785, + "learning_rate": 2.403153497744859e-06, + "loss": 1.0156, + "step": 7444 + }, + { + "epoch": 1.0540100516740993, + "grad_norm": 11.498725488775307, + "learning_rate": 2.402580730899163e-06, + "loss": 0.9957, + "step": 7445 + }, + { + "epoch": 1.0541516245487366, + "grad_norm": 9.634300197492818, + "learning_rate": 2.4020079691747256e-06, + "loss": 1.0286, + "step": 7446 + }, + { + "epoch": 1.0542931974233736, + "grad_norm": 9.790758672508519, + "learning_rate": 2.4014352126016562e-06, + "loss": 1.1437, + "step": 7447 + }, + { + "epoch": 1.0544347702980108, + "grad_norm": 11.123388966699501, + "learning_rate": 2.4008624612100636e-06, + "loss": 1.0142, + "step": 7448 + }, + { + "epoch": 1.054576343172648, + "grad_norm": 8.86699003428257, + "learning_rate": 2.400289715030058e-06, + "loss": 1.0012, + "step": 7449 + }, + { + "epoch": 1.0547179160472853, + "grad_norm": 9.74539271338942, + "learning_rate": 2.3997169740917485e-06, + "loss": 1.0673, + "step": 7450 + }, + { + "epoch": 1.0548594889219225, + "grad_norm": 9.064014808070317, + "learning_rate": 2.3991442384252417e-06, + "loss": 0.9631, + "step": 7451 + }, + { + "epoch": 1.0550010617965597, + "grad_norm": 8.205789652780748, + "learning_rate": 2.3985715080606473e-06, + "loss": 0.9411, + "step": 7452 + }, + { + "epoch": 1.055142634671197, + "grad_norm": 10.336576342573002, + "learning_rate": 2.3979987830280733e-06, + "loss": 1.0801, + "step": 7453 + }, + { + "epoch": 1.0552842075458342, + "grad_norm": 
10.655507528890503, + "learning_rate": 2.3974260633576274e-06, + "loss": 0.9103, + "step": 7454 + }, + { + "epoch": 1.0554257804204714, + "grad_norm": 10.800628672068774, + "learning_rate": 2.3968533490794165e-06, + "loss": 1.0545, + "step": 7455 + }, + { + "epoch": 1.0555673532951086, + "grad_norm": 8.01789808854545, + "learning_rate": 2.3962806402235484e-06, + "loss": 0.9241, + "step": 7456 + }, + { + "epoch": 1.0557089261697459, + "grad_norm": 9.37761722907368, + "learning_rate": 2.3957079368201293e-06, + "loss": 1.0685, + "step": 7457 + }, + { + "epoch": 1.055850499044383, + "grad_norm": 9.14335813171532, + "learning_rate": 2.395135238899266e-06, + "loss": 0.9655, + "step": 7458 + }, + { + "epoch": 1.0559920719190203, + "grad_norm": 10.899534415004487, + "learning_rate": 2.3945625464910654e-06, + "loss": 1.048, + "step": 7459 + }, + { + "epoch": 1.0561336447936576, + "grad_norm": 8.692197997645557, + "learning_rate": 2.3939898596256334e-06, + "loss": 1.0757, + "step": 7460 + }, + { + "epoch": 1.0562752176682948, + "grad_norm": 9.893519166433249, + "learning_rate": 2.3934171783330763e-06, + "loss": 1.0491, + "step": 7461 + }, + { + "epoch": 1.056416790542932, + "grad_norm": 8.223221291452829, + "learning_rate": 2.3928445026434973e-06, + "loss": 0.9495, + "step": 7462 + }, + { + "epoch": 1.0565583634175693, + "grad_norm": 10.593648454297613, + "learning_rate": 2.3922718325870034e-06, + "loss": 1.1302, + "step": 7463 + }, + { + "epoch": 1.0566999362922065, + "grad_norm": 10.237682477840863, + "learning_rate": 2.391699168193698e-06, + "loss": 1.0432, + "step": 7464 + }, + { + "epoch": 1.0568415091668437, + "grad_norm": 9.583816206556483, + "learning_rate": 2.3911265094936874e-06, + "loss": 1.1287, + "step": 7465 + }, + { + "epoch": 1.056983082041481, + "grad_norm": 9.153815769374454, + "learning_rate": 2.390553856517075e-06, + "loss": 0.9714, + "step": 7466 + }, + { + "epoch": 1.0571246549161182, + "grad_norm": 7.5271466105152625, + "learning_rate": 2.3899812092939644e-06, + "loss": 0.9705, + "step": 7467 + }, + { + "epoch": 1.0572662277907552, + "grad_norm": 11.066749575507353, + "learning_rate": 2.38940856785446e-06, + "loss": 0.969, + "step": 7468 + }, + { + "epoch": 1.0574078006653924, + "grad_norm": 11.928444828245961, + "learning_rate": 2.3888359322286644e-06, + "loss": 1.1624, + "step": 7469 + }, + { + "epoch": 1.0575493735400296, + "grad_norm": 9.401907187621479, + "learning_rate": 2.3882633024466813e-06, + "loss": 1.1833, + "step": 7470 + }, + { + "epoch": 1.0576909464146669, + "grad_norm": 9.678072372061866, + "learning_rate": 2.3876906785386133e-06, + "loss": 0.9596, + "step": 7471 + }, + { + "epoch": 1.057832519289304, + "grad_norm": 10.176842480050949, + "learning_rate": 2.3871180605345623e-06, + "loss": 1.1021, + "step": 7472 + }, + { + "epoch": 1.0579740921639413, + "grad_norm": 8.927904219629756, + "learning_rate": 2.3865454484646307e-06, + "loss": 0.938, + "step": 7473 + }, + { + "epoch": 1.0581156650385786, + "grad_norm": 8.439479567405346, + "learning_rate": 2.3859728423589197e-06, + "loss": 1.0449, + "step": 7474 + }, + { + "epoch": 1.0582572379132158, + "grad_norm": 11.674893848010125, + "learning_rate": 2.385400242247532e-06, + "loss": 1.0956, + "step": 7475 + }, + { + "epoch": 1.058398810787853, + "grad_norm": 10.11644882981521, + "learning_rate": 2.384827648160568e-06, + "loss": 1.0673, + "step": 7476 + }, + { + "epoch": 1.0585403836624903, + "grad_norm": 7.42717584139516, + "learning_rate": 2.3842550601281288e-06, + "loss": 0.9683, + "step": 7477 + }, + { + 
"epoch": 1.0586819565371275, + "grad_norm": 9.009872531814796, + "learning_rate": 2.3836824781803146e-06, + "loss": 1.0957, + "step": 7478 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 8.403560846444936, + "learning_rate": 2.3831099023472253e-06, + "loss": 0.9979, + "step": 7479 + }, + { + "epoch": 1.058965102286402, + "grad_norm": 9.178549630275455, + "learning_rate": 2.382537332658962e-06, + "loss": 1.1397, + "step": 7480 + }, + { + "epoch": 1.0591066751610392, + "grad_norm": 8.419751750702453, + "learning_rate": 2.3819647691456226e-06, + "loss": 1.0023, + "step": 7481 + }, + { + "epoch": 1.0592482480356764, + "grad_norm": 8.335860619373683, + "learning_rate": 2.3813922118373094e-06, + "loss": 0.9092, + "step": 7482 + }, + { + "epoch": 1.0593898209103136, + "grad_norm": 9.399728941560213, + "learning_rate": 2.3808196607641176e-06, + "loss": 1.0917, + "step": 7483 + }, + { + "epoch": 1.0595313937849509, + "grad_norm": 8.07240006957663, + "learning_rate": 2.3802471159561473e-06, + "loss": 0.9735, + "step": 7484 + }, + { + "epoch": 1.059672966659588, + "grad_norm": 9.443841887556566, + "learning_rate": 2.379674577443497e-06, + "loss": 0.9612, + "step": 7485 + }, + { + "epoch": 1.0598145395342253, + "grad_norm": 9.864320799178298, + "learning_rate": 2.3791020452562647e-06, + "loss": 1.0167, + "step": 7486 + }, + { + "epoch": 1.0599561124088626, + "grad_norm": 8.57554935219281, + "learning_rate": 2.378529519424547e-06, + "loss": 1.014, + "step": 7487 + }, + { + "epoch": 1.0600976852834996, + "grad_norm": 9.815218992195033, + "learning_rate": 2.377956999978442e-06, + "loss": 1.0771, + "step": 7488 + }, + { + "epoch": 1.0602392581581368, + "grad_norm": 7.617897853495991, + "learning_rate": 2.3773844869480473e-06, + "loss": 1.0386, + "step": 7489 + }, + { + "epoch": 1.060380831032774, + "grad_norm": 8.427827391216127, + "learning_rate": 2.376811980363458e-06, + "loss": 0.9884, + "step": 7490 + }, + { + "epoch": 1.0605224039074113, + "grad_norm": 44.78719893537173, + "learning_rate": 2.3762394802547717e-06, + "loss": 1.0641, + "step": 7491 + }, + { + "epoch": 1.0606639767820485, + "grad_norm": 9.825761714071778, + "learning_rate": 2.375666986652083e-06, + "loss": 1.0613, + "step": 7492 + }, + { + "epoch": 1.0608055496566857, + "grad_norm": 8.474914784410952, + "learning_rate": 2.375094499585489e-06, + "loss": 0.9479, + "step": 7493 + }, + { + "epoch": 1.060947122531323, + "grad_norm": 9.035022831915931, + "learning_rate": 2.3745220190850834e-06, + "loss": 0.8919, + "step": 7494 + }, + { + "epoch": 1.0610886954059602, + "grad_norm": 8.373772374607414, + "learning_rate": 2.3739495451809617e-06, + "loss": 0.9939, + "step": 7495 + }, + { + "epoch": 1.0612302682805974, + "grad_norm": 9.37769289123942, + "learning_rate": 2.3733770779032185e-06, + "loss": 0.9442, + "step": 7496 + }, + { + "epoch": 1.0613718411552346, + "grad_norm": 11.435227131791658, + "learning_rate": 2.372804617281948e-06, + "loss": 1.1097, + "step": 7497 + }, + { + "epoch": 1.0615134140298719, + "grad_norm": 10.723540278335053, + "learning_rate": 2.3722321633472435e-06, + "loss": 1.1448, + "step": 7498 + }, + { + "epoch": 1.061654986904509, + "grad_norm": 8.292190867456213, + "learning_rate": 2.3716597161291993e-06, + "loss": 0.9924, + "step": 7499 + }, + { + "epoch": 1.0617965597791463, + "grad_norm": 9.894225900835185, + "learning_rate": 2.371087275657908e-06, + "loss": 1.0008, + "step": 7500 + }, + { + "epoch": 1.0619381326537836, + "grad_norm": 9.736379254818104, + "learning_rate": 2.3705148419634627e-06, + "loss": 
1.1855, + "step": 7501 + }, + { + "epoch": 1.0620797055284208, + "grad_norm": 10.372292406602622, + "learning_rate": 2.3699424150759553e-06, + "loss": 1.0181, + "step": 7502 + }, + { + "epoch": 1.062221278403058, + "grad_norm": 9.725822009965597, + "learning_rate": 2.369369995025479e-06, + "loss": 1.0559, + "step": 7503 + }, + { + "epoch": 1.0623628512776953, + "grad_norm": 9.425636363111478, + "learning_rate": 2.3687975818421236e-06, + "loss": 0.9781, + "step": 7504 + }, + { + "epoch": 1.0625044241523325, + "grad_norm": 7.006820081101397, + "learning_rate": 2.3682251755559823e-06, + "loss": 0.9362, + "step": 7505 + }, + { + "epoch": 1.0626459970269697, + "grad_norm": 7.867854918701063, + "learning_rate": 2.367652776197145e-06, + "loss": 1.0182, + "step": 7506 + }, + { + "epoch": 1.062787569901607, + "grad_norm": 10.38405205049092, + "learning_rate": 2.3670803837957017e-06, + "loss": 1.0864, + "step": 7507 + }, + { + "epoch": 1.0629291427762442, + "grad_norm": 10.040905260103932, + "learning_rate": 2.3665079983817443e-06, + "loss": 0.9872, + "step": 7508 + }, + { + "epoch": 1.0630707156508814, + "grad_norm": 8.5514905556082, + "learning_rate": 2.3659356199853617e-06, + "loss": 0.9157, + "step": 7509 + }, + { + "epoch": 1.0632122885255184, + "grad_norm": 9.411094254567818, + "learning_rate": 2.365363248636643e-06, + "loss": 1.0827, + "step": 7510 + }, + { + "epoch": 1.0633538614001556, + "grad_norm": 11.395897083464472, + "learning_rate": 2.3647908843656787e-06, + "loss": 1.0242, + "step": 7511 + }, + { + "epoch": 1.0634954342747929, + "grad_norm": 10.519360087263284, + "learning_rate": 2.364218527202557e-06, + "loss": 1.08, + "step": 7512 + }, + { + "epoch": 1.06363700714943, + "grad_norm": 11.938757066298232, + "learning_rate": 2.3636461771773655e-06, + "loss": 1.1544, + "step": 7513 + }, + { + "epoch": 1.0637785800240673, + "grad_norm": 10.222980894969249, + "learning_rate": 2.363073834320194e-06, + "loss": 1.0384, + "step": 7514 + }, + { + "epoch": 1.0639201528987046, + "grad_norm": 11.570729888867548, + "learning_rate": 2.3625014986611282e-06, + "loss": 1.0998, + "step": 7515 + }, + { + "epoch": 1.0640617257733418, + "grad_norm": 8.366500869060625, + "learning_rate": 2.3619291702302557e-06, + "loss": 1.0219, + "step": 7516 + }, + { + "epoch": 1.064203298647979, + "grad_norm": 8.409324792707098, + "learning_rate": 2.3613568490576635e-06, + "loss": 1.0752, + "step": 7517 + }, + { + "epoch": 1.0643448715226163, + "grad_norm": 8.350524209750949, + "learning_rate": 2.360784535173439e-06, + "loss": 1.0525, + "step": 7518 + }, + { + "epoch": 1.0644864443972535, + "grad_norm": 8.792160912191006, + "learning_rate": 2.3602122286076675e-06, + "loss": 1.0706, + "step": 7519 + }, + { + "epoch": 1.0646280172718907, + "grad_norm": 9.499224179361471, + "learning_rate": 2.359639929390435e-06, + "loss": 1.0254, + "step": 7520 + }, + { + "epoch": 1.064769590146528, + "grad_norm": 9.421575791398507, + "learning_rate": 2.359067637551827e-06, + "loss": 0.9523, + "step": 7521 + }, + { + "epoch": 1.0649111630211652, + "grad_norm": 8.576775672280183, + "learning_rate": 2.3584953531219278e-06, + "loss": 1.0711, + "step": 7522 + }, + { + "epoch": 1.0650527358958024, + "grad_norm": 9.480943942175166, + "learning_rate": 2.3579230761308223e-06, + "loss": 1.0658, + "step": 7523 + }, + { + "epoch": 1.0651943087704396, + "grad_norm": 8.876783164970865, + "learning_rate": 2.3573508066085954e-06, + "loss": 1.0383, + "step": 7524 + }, + { + "epoch": 1.0653358816450769, + "grad_norm": 9.358605456677292, + "learning_rate": 
2.3567785445853295e-06, + "loss": 1.0861, + "step": 7525 + }, + { + "epoch": 1.065477454519714, + "grad_norm": 8.960908977858923, + "learning_rate": 2.356206290091109e-06, + "loss": 0.9274, + "step": 7526 + }, + { + "epoch": 1.0656190273943513, + "grad_norm": 9.123749111607712, + "learning_rate": 2.355634043156017e-06, + "loss": 1.053, + "step": 7527 + }, + { + "epoch": 1.0657606002689886, + "grad_norm": 9.29662264954259, + "learning_rate": 2.355061803810135e-06, + "loss": 0.9494, + "step": 7528 + }, + { + "epoch": 1.0659021731436256, + "grad_norm": 10.611773707111244, + "learning_rate": 2.354489572083546e-06, + "loss": 1.1368, + "step": 7529 + }, + { + "epoch": 1.0660437460182628, + "grad_norm": 8.31471165655434, + "learning_rate": 2.3539173480063323e-06, + "loss": 1.0472, + "step": 7530 + }, + { + "epoch": 1.0661853188929, + "grad_norm": 8.195277123970358, + "learning_rate": 2.3533451316085744e-06, + "loss": 1.0943, + "step": 7531 + }, + { + "epoch": 1.0663268917675373, + "grad_norm": 8.93126882733976, + "learning_rate": 2.352772922920353e-06, + "loss": 0.9857, + "step": 7532 + }, + { + "epoch": 1.0664684646421745, + "grad_norm": 7.963243444400351, + "learning_rate": 2.3522007219717493e-06, + "loss": 1.0022, + "step": 7533 + }, + { + "epoch": 1.0666100375168117, + "grad_norm": 8.245094459228564, + "learning_rate": 2.351628528792844e-06, + "loss": 0.9591, + "step": 7534 + }, + { + "epoch": 1.066751610391449, + "grad_norm": 8.538979853642822, + "learning_rate": 2.3510563434137175e-06, + "loss": 1.0668, + "step": 7535 + }, + { + "epoch": 1.0668931832660862, + "grad_norm": 9.144042329945544, + "learning_rate": 2.3504841658644465e-06, + "loss": 1.0432, + "step": 7536 + }, + { + "epoch": 1.0670347561407234, + "grad_norm": 7.936675261656128, + "learning_rate": 2.3499119961751114e-06, + "loss": 0.9805, + "step": 7537 + }, + { + "epoch": 1.0671763290153606, + "grad_norm": 10.3025509092077, + "learning_rate": 2.3493398343757904e-06, + "loss": 1.1458, + "step": 7538 + }, + { + "epoch": 1.0673179018899979, + "grad_norm": 9.632969714983286, + "learning_rate": 2.3487676804965624e-06, + "loss": 1.081, + "step": 7539 + }, + { + "epoch": 1.067459474764635, + "grad_norm": 9.041277008332742, + "learning_rate": 2.3481955345675052e-06, + "loss": 1.0794, + "step": 7540 + }, + { + "epoch": 1.0676010476392723, + "grad_norm": 9.211785637372715, + "learning_rate": 2.347623396618695e-06, + "loss": 1.0748, + "step": 7541 + }, + { + "epoch": 1.0677426205139096, + "grad_norm": 9.046387127254302, + "learning_rate": 2.3470512666802094e-06, + "loss": 1.0845, + "step": 7542 + }, + { + "epoch": 1.0678841933885468, + "grad_norm": 8.90225933414891, + "learning_rate": 2.3464791447821244e-06, + "loss": 0.9886, + "step": 7543 + }, + { + "epoch": 1.068025766263184, + "grad_norm": 8.421351372288587, + "learning_rate": 2.3459070309545165e-06, + "loss": 1.0193, + "step": 7544 + }, + { + "epoch": 1.0681673391378212, + "grad_norm": 10.534755044270058, + "learning_rate": 2.345334925227461e-06, + "loss": 1.1358, + "step": 7545 + }, + { + "epoch": 1.0683089120124585, + "grad_norm": 9.497812922846123, + "learning_rate": 2.344762827631034e-06, + "loss": 1.0011, + "step": 7546 + }, + { + "epoch": 1.0684504848870957, + "grad_norm": 8.24182926502297, + "learning_rate": 2.3441907381953084e-06, + "loss": 1.0364, + "step": 7547 + }, + { + "epoch": 1.068592057761733, + "grad_norm": 9.148699775408875, + "learning_rate": 2.3436186569503598e-06, + "loss": 1.0329, + "step": 7548 + }, + { + "epoch": 1.0687336306363702, + "grad_norm": 
9.043820407385098, + "learning_rate": 2.343046583926262e-06, + "loss": 1.0228, + "step": 7549 + }, + { + "epoch": 1.0688752035110074, + "grad_norm": 8.594758907396386, + "learning_rate": 2.3424745191530877e-06, + "loss": 1.0809, + "step": 7550 + }, + { + "epoch": 1.0690167763856444, + "grad_norm": 9.344665686606108, + "learning_rate": 2.3419024626609112e-06, + "loss": 1.1405, + "step": 7551 + }, + { + "epoch": 1.0691583492602816, + "grad_norm": 9.549499850133268, + "learning_rate": 2.341330414479804e-06, + "loss": 1.0882, + "step": 7552 + }, + { + "epoch": 1.0692999221349189, + "grad_norm": 10.230383361017264, + "learning_rate": 2.340758374639838e-06, + "loss": 1.0373, + "step": 7553 + }, + { + "epoch": 1.069441495009556, + "grad_norm": 8.692472283640338, + "learning_rate": 2.3401863431710864e-06, + "loss": 1.0514, + "step": 7554 + }, + { + "epoch": 1.0695830678841933, + "grad_norm": 9.637445503605244, + "learning_rate": 2.3396143201036187e-06, + "loss": 1.1465, + "step": 7555 + }, + { + "epoch": 1.0697246407588306, + "grad_norm": 8.662155395523184, + "learning_rate": 2.3390423054675084e-06, + "loss": 1.0576, + "step": 7556 + }, + { + "epoch": 1.0698662136334678, + "grad_norm": 8.536790103916127, + "learning_rate": 2.3384702992928228e-06, + "loss": 1.0677, + "step": 7557 + }, + { + "epoch": 1.070007786508105, + "grad_norm": 10.436694451044024, + "learning_rate": 2.337898301609633e-06, + "loss": 1.0987, + "step": 7558 + }, + { + "epoch": 1.0701493593827422, + "grad_norm": 10.19896029988991, + "learning_rate": 2.3373263124480086e-06, + "loss": 0.9822, + "step": 7559 + }, + { + "epoch": 1.0702909322573795, + "grad_norm": 9.661233054496758, + "learning_rate": 2.336754331838019e-06, + "loss": 0.9748, + "step": 7560 + }, + { + "epoch": 1.0704325051320167, + "grad_norm": 8.559019397648099, + "learning_rate": 2.3361823598097316e-06, + "loss": 0.9739, + "step": 7561 + }, + { + "epoch": 1.070574078006654, + "grad_norm": 9.554060259588468, + "learning_rate": 2.335610396393216e-06, + "loss": 1.1289, + "step": 7562 + }, + { + "epoch": 1.0707156508812912, + "grad_norm": 8.463159496579914, + "learning_rate": 2.3350384416185395e-06, + "loss": 1.032, + "step": 7563 + }, + { + "epoch": 1.0708572237559284, + "grad_norm": 10.191357590196215, + "learning_rate": 2.334466495515769e-06, + "loss": 0.998, + "step": 7564 + }, + { + "epoch": 1.0709987966305656, + "grad_norm": 9.053430267549146, + "learning_rate": 2.3338945581149713e-06, + "loss": 0.9941, + "step": 7565 + }, + { + "epoch": 1.0711403695052029, + "grad_norm": 8.129001981656385, + "learning_rate": 2.333322629446213e-06, + "loss": 1.0167, + "step": 7566 + }, + { + "epoch": 1.07128194237984, + "grad_norm": 9.492179864323061, + "learning_rate": 2.33275070953956e-06, + "loss": 1.0354, + "step": 7567 + }, + { + "epoch": 1.0714235152544773, + "grad_norm": 8.602694209156223, + "learning_rate": 2.3321787984250774e-06, + "loss": 0.9789, + "step": 7568 + }, + { + "epoch": 1.0715650881291146, + "grad_norm": 8.797910476912437, + "learning_rate": 2.33160689613283e-06, + "loss": 0.9363, + "step": 7569 + }, + { + "epoch": 1.0717066610037518, + "grad_norm": 6.969957460905765, + "learning_rate": 2.3310350026928826e-06, + "loss": 1.0095, + "step": 7570 + }, + { + "epoch": 1.0718482338783888, + "grad_norm": 9.413726167487841, + "learning_rate": 2.330463118135299e-06, + "loss": 1.0979, + "step": 7571 + }, + { + "epoch": 1.071989806753026, + "grad_norm": 8.387036683977179, + "learning_rate": 2.3298912424901434e-06, + "loss": 1.0238, + "step": 7572 + }, + { + "epoch": 
1.0721313796276633, + "grad_norm": 9.939060688423432, + "learning_rate": 2.3293193757874776e-06, + "loss": 1.0362, + "step": 7573 + }, + { + "epoch": 1.0722729525023005, + "grad_norm": 10.570062762663268, + "learning_rate": 2.3287475180573653e-06, + "loss": 1.0544, + "step": 7574 + }, + { + "epoch": 1.0724145253769377, + "grad_norm": 9.105055241173675, + "learning_rate": 2.328175669329868e-06, + "loss": 1.025, + "step": 7575 + }, + { + "epoch": 1.072556098251575, + "grad_norm": 11.088854416067612, + "learning_rate": 2.327603829635048e-06, + "loss": 1.1367, + "step": 7576 + }, + { + "epoch": 1.0726976711262122, + "grad_norm": 9.655531378119365, + "learning_rate": 2.3270319990029668e-06, + "loss": 1.0396, + "step": 7577 + }, + { + "epoch": 1.0728392440008494, + "grad_norm": 10.027969156577688, + "learning_rate": 2.326460177463683e-06, + "loss": 0.9739, + "step": 7578 + }, + { + "epoch": 1.0729808168754866, + "grad_norm": 10.362189783622785, + "learning_rate": 2.325888365047259e-06, + "loss": 1.1163, + "step": 7579 + }, + { + "epoch": 1.0731223897501239, + "grad_norm": 11.000120682487847, + "learning_rate": 2.325316561783754e-06, + "loss": 0.9568, + "step": 7580 + }, + { + "epoch": 1.073263962624761, + "grad_norm": 9.672524573678682, + "learning_rate": 2.324744767703227e-06, + "loss": 1.1665, + "step": 7581 + }, + { + "epoch": 1.0734055354993983, + "grad_norm": 8.545132809896494, + "learning_rate": 2.3241729828357367e-06, + "loss": 0.9102, + "step": 7582 + }, + { + "epoch": 1.0735471083740356, + "grad_norm": 8.571139912058946, + "learning_rate": 2.3236012072113414e-06, + "loss": 0.9142, + "step": 7583 + }, + { + "epoch": 1.0736886812486728, + "grad_norm": 10.062648606239625, + "learning_rate": 2.323029440860099e-06, + "loss": 1.0738, + "step": 7584 + }, + { + "epoch": 1.07383025412331, + "grad_norm": 8.952926021049146, + "learning_rate": 2.322457683812067e-06, + "loss": 1.0582, + "step": 7585 + }, + { + "epoch": 1.0739718269979472, + "grad_norm": 11.544865537895447, + "learning_rate": 2.3218859360973025e-06, + "loss": 0.981, + "step": 7586 + }, + { + "epoch": 1.0741133998725845, + "grad_norm": 9.927080173765455, + "learning_rate": 2.3213141977458615e-06, + "loss": 1.0019, + "step": 7587 + }, + { + "epoch": 1.0742549727472217, + "grad_norm": 9.426869853993988, + "learning_rate": 2.320742468787801e-06, + "loss": 0.9743, + "step": 7588 + }, + { + "epoch": 1.074396545621859, + "grad_norm": 8.573862375139457, + "learning_rate": 2.3201707492531743e-06, + "loss": 0.9915, + "step": 7589 + }, + { + "epoch": 1.0745381184964962, + "grad_norm": 8.825072930658722, + "learning_rate": 2.3195990391720364e-06, + "loss": 0.9889, + "step": 7590 + }, + { + "epoch": 1.0746796913711334, + "grad_norm": 9.107280091178687, + "learning_rate": 2.319027338574443e-06, + "loss": 1.1608, + "step": 7591 + }, + { + "epoch": 1.0748212642457706, + "grad_norm": 8.363964040466389, + "learning_rate": 2.318455647490448e-06, + "loss": 0.9965, + "step": 7592 + }, + { + "epoch": 1.0749628371204076, + "grad_norm": 10.239768900051644, + "learning_rate": 2.3178839659501033e-06, + "loss": 1.1727, + "step": 7593 + }, + { + "epoch": 1.0751044099950449, + "grad_norm": 8.010476881444953, + "learning_rate": 2.3173122939834635e-06, + "loss": 1.0997, + "step": 7594 + }, + { + "epoch": 1.075245982869682, + "grad_norm": 10.515513630655608, + "learning_rate": 2.31674063162058e-06, + "loss": 0.9931, + "step": 7595 + }, + { + "epoch": 1.0753875557443193, + "grad_norm": 8.430326902447431, + "learning_rate": 2.316168978891505e-06, + "loss": 0.9655, + 
"step": 7596 + }, + { + "epoch": 1.0755291286189566, + "grad_norm": 8.19697872420553, + "learning_rate": 2.315597335826289e-06, + "loss": 1.0555, + "step": 7597 + }, + { + "epoch": 1.0756707014935938, + "grad_norm": 8.090950382271474, + "learning_rate": 2.3150257024549847e-06, + "loss": 1.0724, + "step": 7598 + }, + { + "epoch": 1.075812274368231, + "grad_norm": 10.104813031369051, + "learning_rate": 2.314454078807641e-06, + "loss": 1.0371, + "step": 7599 + }, + { + "epoch": 1.0759538472428682, + "grad_norm": 10.284888832333436, + "learning_rate": 2.3138824649143076e-06, + "loss": 1.1612, + "step": 7600 + }, + { + "epoch": 1.0760954201175055, + "grad_norm": 10.839606909011716, + "learning_rate": 2.313310860805034e-06, + "loss": 1.1999, + "step": 7601 + }, + { + "epoch": 1.0762369929921427, + "grad_norm": 9.432213521570846, + "learning_rate": 2.31273926650987e-06, + "loss": 1.0189, + "step": 7602 + }, + { + "epoch": 1.07637856586678, + "grad_norm": 10.746136348061698, + "learning_rate": 2.312167682058863e-06, + "loss": 1.1511, + "step": 7603 + }, + { + "epoch": 1.0765201387414172, + "grad_norm": 8.736116566645366, + "learning_rate": 2.3115961074820604e-06, + "loss": 1.01, + "step": 7604 + }, + { + "epoch": 1.0766617116160544, + "grad_norm": 8.468705405930791, + "learning_rate": 2.31102454280951e-06, + "loss": 1.0307, + "step": 7605 + }, + { + "epoch": 1.0768032844906916, + "grad_norm": 9.659659860755362, + "learning_rate": 2.3104529880712586e-06, + "loss": 1.1288, + "step": 7606 + }, + { + "epoch": 1.0769448573653289, + "grad_norm": 9.13959955185465, + "learning_rate": 2.309881443297352e-06, + "loss": 0.9819, + "step": 7607 + }, + { + "epoch": 1.077086430239966, + "grad_norm": 9.486434888798534, + "learning_rate": 2.3093099085178366e-06, + "loss": 1.0496, + "step": 7608 + }, + { + "epoch": 1.0772280031146033, + "grad_norm": 10.195095857971255, + "learning_rate": 2.308738383762758e-06, + "loss": 1.0185, + "step": 7609 + }, + { + "epoch": 1.0773695759892405, + "grad_norm": 10.244302049788686, + "learning_rate": 2.308166869062159e-06, + "loss": 1.034, + "step": 7610 + }, + { + "epoch": 1.0775111488638778, + "grad_norm": 8.815227546373148, + "learning_rate": 2.3075953644460847e-06, + "loss": 1.0702, + "step": 7611 + }, + { + "epoch": 1.0776527217385148, + "grad_norm": 8.303317167781172, + "learning_rate": 2.3070238699445783e-06, + "loss": 1.0735, + "step": 7612 + }, + { + "epoch": 1.077794294613152, + "grad_norm": 8.720823636135416, + "learning_rate": 2.306452385587683e-06, + "loss": 1.0025, + "step": 7613 + }, + { + "epoch": 1.0779358674877892, + "grad_norm": 9.55585563050503, + "learning_rate": 2.305880911405442e-06, + "loss": 1.0346, + "step": 7614 + }, + { + "epoch": 1.0780774403624265, + "grad_norm": 10.31422141764217, + "learning_rate": 2.3053094474278967e-06, + "loss": 1.0165, + "step": 7615 + }, + { + "epoch": 1.0782190132370637, + "grad_norm": 8.179505616709916, + "learning_rate": 2.3047379936850885e-06, + "loss": 0.9981, + "step": 7616 + }, + { + "epoch": 1.078360586111701, + "grad_norm": 12.459715359346598, + "learning_rate": 2.3041665502070584e-06, + "loss": 1.0345, + "step": 7617 + }, + { + "epoch": 1.0785021589863382, + "grad_norm": 9.739345108355511, + "learning_rate": 2.3035951170238468e-06, + "loss": 1.1414, + "step": 7618 + }, + { + "epoch": 1.0786437318609754, + "grad_norm": 8.550311469935476, + "learning_rate": 2.3030236941654933e-06, + "loss": 0.9981, + "step": 7619 + }, + { + "epoch": 1.0787853047356126, + "grad_norm": 11.159426453321418, + "learning_rate": 
2.302452281662038e-06, + "loss": 1.1189, + "step": 7620 + }, + { + "epoch": 1.0789268776102499, + "grad_norm": 9.114550734688729, + "learning_rate": 2.3018808795435187e-06, + "loss": 0.9502, + "step": 7621 + }, + { + "epoch": 1.079068450484887, + "grad_norm": 10.070544326753613, + "learning_rate": 2.3013094878399735e-06, + "loss": 1.0786, + "step": 7622 + }, + { + "epoch": 1.0792100233595243, + "grad_norm": 8.176592601135354, + "learning_rate": 2.3007381065814405e-06, + "loss": 0.9752, + "step": 7623 + }, + { + "epoch": 1.0793515962341615, + "grad_norm": 12.16553760323732, + "learning_rate": 2.3001667357979564e-06, + "loss": 1.0666, + "step": 7624 + }, + { + "epoch": 1.0794931691087988, + "grad_norm": 7.9957342695971185, + "learning_rate": 2.2995953755195584e-06, + "loss": 1.0594, + "step": 7625 + }, + { + "epoch": 1.079634741983436, + "grad_norm": 8.24833801174847, + "learning_rate": 2.2990240257762817e-06, + "loss": 1.0021, + "step": 7626 + }, + { + "epoch": 1.0797763148580732, + "grad_norm": 9.774932673776094, + "learning_rate": 2.298452686598162e-06, + "loss": 1.0486, + "step": 7627 + }, + { + "epoch": 1.0799178877327105, + "grad_norm": 8.911125481589451, + "learning_rate": 2.2978813580152347e-06, + "loss": 1.1131, + "step": 7628 + }, + { + "epoch": 1.0800594606073477, + "grad_norm": 11.145918135751186, + "learning_rate": 2.297310040057533e-06, + "loss": 1.0947, + "step": 7629 + }, + { + "epoch": 1.080201033481985, + "grad_norm": 9.426859737435226, + "learning_rate": 2.296738732755093e-06, + "loss": 1.0032, + "step": 7630 + }, + { + "epoch": 1.0803426063566222, + "grad_norm": 7.989684607018534, + "learning_rate": 2.296167436137945e-06, + "loss": 1.0034, + "step": 7631 + }, + { + "epoch": 1.0804841792312594, + "grad_norm": 8.108459538584823, + "learning_rate": 2.2955961502361235e-06, + "loss": 0.9895, + "step": 7632 + }, + { + "epoch": 1.0806257521058966, + "grad_norm": 10.055539584864327, + "learning_rate": 2.2950248750796594e-06, + "loss": 1.1081, + "step": 7633 + }, + { + "epoch": 1.0807673249805336, + "grad_norm": 8.766694627531109, + "learning_rate": 2.2944536106985848e-06, + "loss": 1.0177, + "step": 7634 + }, + { + "epoch": 1.0809088978551709, + "grad_norm": 8.586219179012323, + "learning_rate": 2.2938823571229303e-06, + "loss": 0.9885, + "step": 7635 + }, + { + "epoch": 1.081050470729808, + "grad_norm": 9.754665725588383, + "learning_rate": 2.2933111143827268e-06, + "loss": 1.0543, + "step": 7636 + }, + { + "epoch": 1.0811920436044453, + "grad_norm": 9.79943405385632, + "learning_rate": 2.2927398825080043e-06, + "loss": 1.0073, + "step": 7637 + }, + { + "epoch": 1.0813336164790825, + "grad_norm": 9.57366491028661, + "learning_rate": 2.2921686615287916e-06, + "loss": 0.9666, + "step": 7638 + }, + { + "epoch": 1.0814751893537198, + "grad_norm": 9.657501040899787, + "learning_rate": 2.2915974514751173e-06, + "loss": 1.0884, + "step": 7639 + }, + { + "epoch": 1.081616762228357, + "grad_norm": 9.903028283169048, + "learning_rate": 2.29102625237701e-06, + "loss": 1.0415, + "step": 7640 + }, + { + "epoch": 1.0817583351029942, + "grad_norm": 11.397057913213763, + "learning_rate": 2.290455064264497e-06, + "loss": 1.0836, + "step": 7641 + }, + { + "epoch": 1.0818999079776315, + "grad_norm": 9.603969436793887, + "learning_rate": 2.2898838871676037e-06, + "loss": 1.0682, + "step": 7642 + }, + { + "epoch": 1.0820414808522687, + "grad_norm": 8.117496253301463, + "learning_rate": 2.2893127211163583e-06, + "loss": 1.0038, + "step": 7643 + }, + { + "epoch": 1.082183053726906, + "grad_norm": 
11.419017095004964, + "learning_rate": 2.2887415661407866e-06, + "loss": 1.08, + "step": 7644 + }, + { + "epoch": 1.0823246266015432, + "grad_norm": 9.815669428674125, + "learning_rate": 2.288170422270913e-06, + "loss": 0.9932, + "step": 7645 + }, + { + "epoch": 1.0824661994761804, + "grad_norm": 9.353419171367896, + "learning_rate": 2.287599289536762e-06, + "loss": 1.0943, + "step": 7646 + }, + { + "epoch": 1.0826077723508176, + "grad_norm": 9.570405771229169, + "learning_rate": 2.2870281679683582e-06, + "loss": 1.0414, + "step": 7647 + }, + { + "epoch": 1.0827493452254549, + "grad_norm": 9.470628810094594, + "learning_rate": 2.2864570575957246e-06, + "loss": 0.9579, + "step": 7648 + }, + { + "epoch": 1.082890918100092, + "grad_norm": 7.667450602160556, + "learning_rate": 2.2858859584488848e-06, + "loss": 0.9566, + "step": 7649 + }, + { + "epoch": 1.0830324909747293, + "grad_norm": 8.345708838562869, + "learning_rate": 2.28531487055786e-06, + "loss": 0.9589, + "step": 7650 + }, + { + "epoch": 1.0831740638493665, + "grad_norm": 8.064585807745384, + "learning_rate": 2.2847437939526735e-06, + "loss": 1.0519, + "step": 7651 + }, + { + "epoch": 1.0833156367240038, + "grad_norm": 9.78636898360678, + "learning_rate": 2.2841727286633444e-06, + "loss": 1.1408, + "step": 7652 + }, + { + "epoch": 1.083457209598641, + "grad_norm": 9.294435325243633, + "learning_rate": 2.2836016747198937e-06, + "loss": 0.9799, + "step": 7653 + }, + { + "epoch": 1.083598782473278, + "grad_norm": 10.168030277378195, + "learning_rate": 2.283030632152342e-06, + "loss": 1.1197, + "step": 7654 + }, + { + "epoch": 1.0837403553479152, + "grad_norm": 10.03081456840678, + "learning_rate": 2.282459600990708e-06, + "loss": 0.9611, + "step": 7655 + }, + { + "epoch": 1.0838819282225525, + "grad_norm": 9.644854392242456, + "learning_rate": 2.2818885812650105e-06, + "loss": 1.0302, + "step": 7656 + }, + { + "epoch": 1.0840235010971897, + "grad_norm": 7.801215483031815, + "learning_rate": 2.281317573005268e-06, + "loss": 0.9302, + "step": 7657 + }, + { + "epoch": 1.084165073971827, + "grad_norm": 9.939059920806143, + "learning_rate": 2.2807465762414967e-06, + "loss": 1.126, + "step": 7658 + }, + { + "epoch": 1.0843066468464642, + "grad_norm": 11.202817862116559, + "learning_rate": 2.280175591003715e-06, + "loss": 1.1611, + "step": 7659 + }, + { + "epoch": 1.0844482197211014, + "grad_norm": 7.416443067744781, + "learning_rate": 2.279604617321939e-06, + "loss": 1.0521, + "step": 7660 + }, + { + "epoch": 1.0845897925957386, + "grad_norm": 9.568454060628161, + "learning_rate": 2.279033655226183e-06, + "loss": 1.0449, + "step": 7661 + }, + { + "epoch": 1.0847313654703759, + "grad_norm": 9.001526597144595, + "learning_rate": 2.278462704746465e-06, + "loss": 1.0712, + "step": 7662 + }, + { + "epoch": 1.084872938345013, + "grad_norm": 9.36044116944916, + "learning_rate": 2.277891765912796e-06, + "loss": 1.1602, + "step": 7663 + }, + { + "epoch": 1.0850145112196503, + "grad_norm": 11.156793319337334, + "learning_rate": 2.2773208387551906e-06, + "loss": 1.0423, + "step": 7664 + }, + { + "epoch": 1.0851560840942875, + "grad_norm": 7.3507467500386054, + "learning_rate": 2.2767499233036635e-06, + "loss": 0.9809, + "step": 7665 + }, + { + "epoch": 1.0852976569689248, + "grad_norm": 8.960702083192503, + "learning_rate": 2.2761790195882264e-06, + "loss": 1.0466, + "step": 7666 + }, + { + "epoch": 1.085439229843562, + "grad_norm": 7.927822431144372, + "learning_rate": 2.275608127638891e-06, + "loss": 1.0564, + "step": 7667 + }, + { + "epoch": 
1.0855808027181992, + "grad_norm": 9.043047633970193, + "learning_rate": 2.2750372474856696e-06, + "loss": 1.0261, + "step": 7668 + }, + { + "epoch": 1.0857223755928365, + "grad_norm": 10.09663279085552, + "learning_rate": 2.274466379158572e-06, + "loss": 1.0218, + "step": 7669 + }, + { + "epoch": 1.0858639484674737, + "grad_norm": 8.525699137101144, + "learning_rate": 2.2738955226876086e-06, + "loss": 0.9961, + "step": 7670 + }, + { + "epoch": 1.086005521342111, + "grad_norm": 11.99271807661776, + "learning_rate": 2.273324678102789e-06, + "loss": 1.0604, + "step": 7671 + }, + { + "epoch": 1.0861470942167482, + "grad_norm": 9.177452354213951, + "learning_rate": 2.272753845434122e-06, + "loss": 1.0426, + "step": 7672 + }, + { + "epoch": 1.0862886670913854, + "grad_norm": 10.419361915307864, + "learning_rate": 2.272183024711617e-06, + "loss": 1.0957, + "step": 7673 + }, + { + "epoch": 1.0864302399660226, + "grad_norm": 10.99726105316168, + "learning_rate": 2.2716122159652795e-06, + "loss": 1.0077, + "step": 7674 + }, + { + "epoch": 1.0865718128406598, + "grad_norm": 8.882301872828833, + "learning_rate": 2.2710414192251176e-06, + "loss": 0.9087, + "step": 7675 + }, + { + "epoch": 1.0867133857152969, + "grad_norm": 10.913501503902568, + "learning_rate": 2.2704706345211375e-06, + "loss": 1.0816, + "step": 7676 + }, + { + "epoch": 1.086854958589934, + "grad_norm": 10.37287347900406, + "learning_rate": 2.269899861883345e-06, + "loss": 1.0579, + "step": 7677 + }, + { + "epoch": 1.0869965314645713, + "grad_norm": 12.051901791501297, + "learning_rate": 2.269329101341745e-06, + "loss": 1.0264, + "step": 7678 + }, + { + "epoch": 1.0871381043392085, + "grad_norm": 11.378512668055935, + "learning_rate": 2.268758352926343e-06, + "loss": 1.0168, + "step": 7679 + }, + { + "epoch": 1.0872796772138458, + "grad_norm": 9.31274577590942, + "learning_rate": 2.268187616667141e-06, + "loss": 1.0531, + "step": 7680 + }, + { + "epoch": 1.087421250088483, + "grad_norm": 10.252377490204902, + "learning_rate": 2.267616892594143e-06, + "loss": 1.0819, + "step": 7681 + }, + { + "epoch": 1.0875628229631202, + "grad_norm": 8.939547644314054, + "learning_rate": 2.2670461807373526e-06, + "loss": 1.0136, + "step": 7682 + }, + { + "epoch": 1.0877043958377575, + "grad_norm": 10.368618760706887, + "learning_rate": 2.2664754811267713e-06, + "loss": 0.9791, + "step": 7683 + }, + { + "epoch": 1.0878459687123947, + "grad_norm": 11.510761698849269, + "learning_rate": 2.265904793792399e-06, + "loss": 1.1665, + "step": 7684 + }, + { + "epoch": 1.087987541587032, + "grad_norm": 9.947644699666387, + "learning_rate": 2.2653341187642368e-06, + "loss": 0.9754, + "step": 7685 + }, + { + "epoch": 1.0881291144616692, + "grad_norm": 10.969700090173813, + "learning_rate": 2.2647634560722857e-06, + "loss": 1.1402, + "step": 7686 + }, + { + "epoch": 1.0882706873363064, + "grad_norm": 10.189176643807434, + "learning_rate": 2.264192805746543e-06, + "loss": 1.0696, + "step": 7687 + }, + { + "epoch": 1.0884122602109436, + "grad_norm": 9.120044355451904, + "learning_rate": 2.2636221678170097e-06, + "loss": 1.0583, + "step": 7688 + }, + { + "epoch": 1.0885538330855808, + "grad_norm": 9.18988961005092, + "learning_rate": 2.2630515423136827e-06, + "loss": 0.945, + "step": 7689 + }, + { + "epoch": 1.088695405960218, + "grad_norm": 11.108340015299667, + "learning_rate": 2.2624809292665593e-06, + "loss": 0.997, + "step": 7690 + }, + { + "epoch": 1.0888369788348553, + "grad_norm": 7.679130153669242, + "learning_rate": 2.2619103287056366e-06, + "loss": 0.9474, 
+ "step": 7691 + }, + { + "epoch": 1.0889785517094925, + "grad_norm": 8.779118506600176, + "learning_rate": 2.26133974066091e-06, + "loss": 0.8826, + "step": 7692 + }, + { + "epoch": 1.0891201245841298, + "grad_norm": 9.125149242278049, + "learning_rate": 2.2607691651623757e-06, + "loss": 0.962, + "step": 7693 + }, + { + "epoch": 1.089261697458767, + "grad_norm": 8.997584548641981, + "learning_rate": 2.260198602240028e-06, + "loss": 0.9691, + "step": 7694 + }, + { + "epoch": 1.089403270333404, + "grad_norm": 8.385621580538452, + "learning_rate": 2.259628051923861e-06, + "loss": 0.9728, + "step": 7695 + }, + { + "epoch": 1.0895448432080412, + "grad_norm": 9.623169960332676, + "learning_rate": 2.259057514243868e-06, + "loss": 0.9338, + "step": 7696 + }, + { + "epoch": 1.0896864160826785, + "grad_norm": 10.274288522261378, + "learning_rate": 2.2584869892300416e-06, + "loss": 1.0994, + "step": 7697 + }, + { + "epoch": 1.0898279889573157, + "grad_norm": 8.513218642945667, + "learning_rate": 2.2579164769123744e-06, + "loss": 0.9245, + "step": 7698 + }, + { + "epoch": 1.089969561831953, + "grad_norm": 9.786391981602726, + "learning_rate": 2.257345977320857e-06, + "loss": 0.9919, + "step": 7699 + }, + { + "epoch": 1.0901111347065902, + "grad_norm": 11.641735213416291, + "learning_rate": 2.256775490485481e-06, + "loss": 1.1819, + "step": 7700 + }, + { + "epoch": 1.0902527075812274, + "grad_norm": 9.37426510791521, + "learning_rate": 2.256205016436236e-06, + "loss": 1.0364, + "step": 7701 + }, + { + "epoch": 1.0903942804558646, + "grad_norm": 8.332175924669883, + "learning_rate": 2.255634555203112e-06, + "loss": 0.9456, + "step": 7702 + }, + { + "epoch": 1.0905358533305018, + "grad_norm": 10.42595504667337, + "learning_rate": 2.2550641068160966e-06, + "loss": 1.0606, + "step": 7703 + }, + { + "epoch": 1.090677426205139, + "grad_norm": 9.828479366251528, + "learning_rate": 2.254493671305179e-06, + "loss": 0.9927, + "step": 7704 + }, + { + "epoch": 1.0908189990797763, + "grad_norm": 9.255296170417694, + "learning_rate": 2.253923248700346e-06, + "loss": 0.9486, + "step": 7705 + }, + { + "epoch": 1.0909605719544135, + "grad_norm": 8.101819114336346, + "learning_rate": 2.2533528390315838e-06, + "loss": 1.0565, + "step": 7706 + }, + { + "epoch": 1.0911021448290508, + "grad_norm": 9.014622148242594, + "learning_rate": 2.252782442328879e-06, + "loss": 1.0009, + "step": 7707 + }, + { + "epoch": 1.091243717703688, + "grad_norm": 7.9628506234431855, + "learning_rate": 2.2522120586222174e-06, + "loss": 0.9535, + "step": 7708 + }, + { + "epoch": 1.0913852905783252, + "grad_norm": 8.272793810535871, + "learning_rate": 2.2516416879415825e-06, + "loss": 0.9023, + "step": 7709 + }, + { + "epoch": 1.0915268634529625, + "grad_norm": 8.443421497067906, + "learning_rate": 2.2510713303169588e-06, + "loss": 0.8847, + "step": 7710 + }, + { + "epoch": 1.0916684363275997, + "grad_norm": 8.624934541757947, + "learning_rate": 2.25050098577833e-06, + "loss": 1.0302, + "step": 7711 + }, + { + "epoch": 1.091810009202237, + "grad_norm": 8.205256419525002, + "learning_rate": 2.2499306543556783e-06, + "loss": 0.9775, + "step": 7712 + }, + { + "epoch": 1.0919515820768741, + "grad_norm": 8.186967439293428, + "learning_rate": 2.2493603360789855e-06, + "loss": 0.9296, + "step": 7713 + }, + { + "epoch": 1.0920931549515114, + "grad_norm": 8.639929193277208, + "learning_rate": 2.2487900309782333e-06, + "loss": 1.0713, + "step": 7714 + }, + { + "epoch": 1.0922347278261486, + "grad_norm": 8.928655342797635, + "learning_rate": 
2.2482197390834027e-06, + "loss": 1.0599, + "step": 7715 + }, + { + "epoch": 1.0923763007007858, + "grad_norm": 10.45201736762921, + "learning_rate": 2.2476494604244712e-06, + "loss": 1.112, + "step": 7716 + }, + { + "epoch": 1.0925178735754228, + "grad_norm": 8.775411273414946, + "learning_rate": 2.24707919503142e-06, + "loss": 1.056, + "step": 7717 + }, + { + "epoch": 1.09265944645006, + "grad_norm": 8.390255371135678, + "learning_rate": 2.246508942934227e-06, + "loss": 0.9532, + "step": 7718 + }, + { + "epoch": 1.0928010193246973, + "grad_norm": 7.990181381210233, + "learning_rate": 2.2459387041628694e-06, + "loss": 1.1127, + "step": 7719 + }, + { + "epoch": 1.0929425921993345, + "grad_norm": 8.45343141123838, + "learning_rate": 2.2453684787473252e-06, + "loss": 1.077, + "step": 7720 + }, + { + "epoch": 1.0930841650739718, + "grad_norm": 9.221113723051952, + "learning_rate": 2.24479826671757e-06, + "loss": 1.1885, + "step": 7721 + }, + { + "epoch": 1.093225737948609, + "grad_norm": 11.063909925710634, + "learning_rate": 2.2442280681035792e-06, + "loss": 1.1772, + "step": 7722 + }, + { + "epoch": 1.0933673108232462, + "grad_norm": 8.349031184293695, + "learning_rate": 2.2436578829353286e-06, + "loss": 1.0305, + "step": 7723 + }, + { + "epoch": 1.0935088836978835, + "grad_norm": 7.962855414059384, + "learning_rate": 2.243087711242792e-06, + "loss": 0.939, + "step": 7724 + }, + { + "epoch": 1.0936504565725207, + "grad_norm": 8.687542153674709, + "learning_rate": 2.242517553055943e-06, + "loss": 0.9381, + "step": 7725 + }, + { + "epoch": 1.093792029447158, + "grad_norm": 9.488402264177855, + "learning_rate": 2.2419474084047544e-06, + "loss": 1.0553, + "step": 7726 + }, + { + "epoch": 1.0939336023217952, + "grad_norm": 9.454007900401786, + "learning_rate": 2.241377277319198e-06, + "loss": 0.9759, + "step": 7727 + }, + { + "epoch": 1.0940751751964324, + "grad_norm": 7.6079789590634865, + "learning_rate": 2.240807159829245e-06, + "loss": 0.9607, + "step": 7728 + }, + { + "epoch": 1.0942167480710696, + "grad_norm": 9.27520493437784, + "learning_rate": 2.2402370559648663e-06, + "loss": 0.9963, + "step": 7729 + }, + { + "epoch": 1.0943583209457068, + "grad_norm": 8.21726924190423, + "learning_rate": 2.239666965756032e-06, + "loss": 0.9653, + "step": 7730 + }, + { + "epoch": 1.094499893820344, + "grad_norm": 9.022025812868549, + "learning_rate": 2.2390968892327108e-06, + "loss": 0.8999, + "step": 7731 + }, + { + "epoch": 1.0946414666949813, + "grad_norm": 9.147770727863199, + "learning_rate": 2.2385268264248717e-06, + "loss": 1.089, + "step": 7732 + }, + { + "epoch": 1.0947830395696185, + "grad_norm": 7.946780328940491, + "learning_rate": 2.2379567773624825e-06, + "loss": 0.8407, + "step": 7733 + }, + { + "epoch": 1.0949246124442558, + "grad_norm": 8.03118492723516, + "learning_rate": 2.2373867420755104e-06, + "loss": 0.9642, + "step": 7734 + }, + { + "epoch": 1.095066185318893, + "grad_norm": 7.612126473719699, + "learning_rate": 2.2368167205939213e-06, + "loss": 0.9955, + "step": 7735 + }, + { + "epoch": 1.0952077581935302, + "grad_norm": 8.96515947837272, + "learning_rate": 2.236246712947682e-06, + "loss": 1.0186, + "step": 7736 + }, + { + "epoch": 1.0953493310681672, + "grad_norm": 8.014281876079592, + "learning_rate": 2.2356767191667554e-06, + "loss": 0.9941, + "step": 7737 + }, + { + "epoch": 1.0954909039428045, + "grad_norm": 8.536370050991284, + "learning_rate": 2.235106739281106e-06, + "loss": 0.9696, + "step": 7738 + }, + { + "epoch": 1.0956324768174417, + "grad_norm": 8.662389678032387, 
+ "learning_rate": 2.2345367733206984e-06, + "loss": 1.0061, + "step": 7739 + }, + { + "epoch": 1.095774049692079, + "grad_norm": 9.343277220741811, + "learning_rate": 2.2339668213154943e-06, + "loss": 1.041, + "step": 7740 + }, + { + "epoch": 1.0959156225667162, + "grad_norm": 9.206414714720061, + "learning_rate": 2.2333968832954564e-06, + "loss": 1.1017, + "step": 7741 + }, + { + "epoch": 1.0960571954413534, + "grad_norm": 10.134804958429648, + "learning_rate": 2.2328269592905455e-06, + "loss": 1.006, + "step": 7742 + }, + { + "epoch": 1.0961987683159906, + "grad_norm": 9.102404484256867, + "learning_rate": 2.232257049330722e-06, + "loss": 0.9261, + "step": 7743 + }, + { + "epoch": 1.0963403411906278, + "grad_norm": 9.874009481078184, + "learning_rate": 2.231687153445946e-06, + "loss": 1.0771, + "step": 7744 + }, + { + "epoch": 1.096481914065265, + "grad_norm": 9.080172860941047, + "learning_rate": 2.231117271666176e-06, + "loss": 1.0088, + "step": 7745 + }, + { + "epoch": 1.0966234869399023, + "grad_norm": 10.036347897206909, + "learning_rate": 2.2305474040213707e-06, + "loss": 1.0919, + "step": 7746 + }, + { + "epoch": 1.0967650598145395, + "grad_norm": 8.800795120384098, + "learning_rate": 2.229977550541488e-06, + "loss": 1.1106, + "step": 7747 + }, + { + "epoch": 1.0969066326891768, + "grad_norm": 9.406771116300888, + "learning_rate": 2.2294077112564836e-06, + "loss": 1.0828, + "step": 7748 + }, + { + "epoch": 1.097048205563814, + "grad_norm": 10.89126177859493, + "learning_rate": 2.2288378861963144e-06, + "loss": 1.0741, + "step": 7749 + }, + { + "epoch": 1.0971897784384512, + "grad_norm": 9.505126373077008, + "learning_rate": 2.228268075390935e-06, + "loss": 1.0401, + "step": 7750 + }, + { + "epoch": 1.0973313513130885, + "grad_norm": 9.297048563899821, + "learning_rate": 2.2276982788703003e-06, + "loss": 1.0652, + "step": 7751 + }, + { + "epoch": 1.0974729241877257, + "grad_norm": 6.338555073991658, + "learning_rate": 2.227128496664364e-06, + "loss": 0.9475, + "step": 7752 + }, + { + "epoch": 1.097614497062363, + "grad_norm": 8.560655826864853, + "learning_rate": 2.226558728803079e-06, + "loss": 0.9878, + "step": 7753 + }, + { + "epoch": 1.0977560699370001, + "grad_norm": 6.600506976914343, + "learning_rate": 2.225988975316398e-06, + "loss": 0.9095, + "step": 7754 + }, + { + "epoch": 1.0978976428116374, + "grad_norm": 10.377256217046888, + "learning_rate": 2.2254192362342718e-06, + "loss": 1.1227, + "step": 7755 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 10.846653266559272, + "learning_rate": 2.224849511586652e-06, + "loss": 1.0037, + "step": 7756 + }, + { + "epoch": 1.0981807885609118, + "grad_norm": 8.596047721946888, + "learning_rate": 2.224279801403489e-06, + "loss": 1.0701, + "step": 7757 + }, + { + "epoch": 1.098322361435549, + "grad_norm": 8.894401726933529, + "learning_rate": 2.2237101057147308e-06, + "loss": 1.0191, + "step": 7758 + }, + { + "epoch": 1.098463934310186, + "grad_norm": 11.463059512993793, + "learning_rate": 2.223140424550326e-06, + "loss": 0.9573, + "step": 7759 + }, + { + "epoch": 1.0986055071848233, + "grad_norm": 8.979626802365766, + "learning_rate": 2.2225707579402225e-06, + "loss": 0.993, + "step": 7760 + }, + { + "epoch": 1.0987470800594605, + "grad_norm": 9.261402422371114, + "learning_rate": 2.222001105914367e-06, + "loss": 0.9979, + "step": 7761 + }, + { + "epoch": 1.0988886529340978, + "grad_norm": 9.639204764587811, + "learning_rate": 2.2214314685027067e-06, + "loss": 1.0512, + "step": 7762 + }, + { + "epoch": 1.099030225808735, + 
"grad_norm": 8.806966139817424, + "learning_rate": 2.2208618457351862e-06, + "loss": 1.1104, + "step": 7763 + }, + { + "epoch": 1.0991717986833722, + "grad_norm": 8.827257718045892, + "learning_rate": 2.2202922376417505e-06, + "loss": 1.0458, + "step": 7764 + }, + { + "epoch": 1.0993133715580095, + "grad_norm": 9.11874175041929, + "learning_rate": 2.219722644252343e-06, + "loss": 1.0584, + "step": 7765 + }, + { + "epoch": 1.0994549444326467, + "grad_norm": 8.84603755198302, + "learning_rate": 2.2191530655969077e-06, + "loss": 0.9996, + "step": 7766 + }, + { + "epoch": 1.099596517307284, + "grad_norm": 9.130026491091215, + "learning_rate": 2.2185835017053857e-06, + "loss": 1.011, + "step": 7767 + }, + { + "epoch": 1.0997380901819211, + "grad_norm": 7.634757630623852, + "learning_rate": 2.2180139526077203e-06, + "loss": 0.9961, + "step": 7768 + }, + { + "epoch": 1.0998796630565584, + "grad_norm": 7.979406075998614, + "learning_rate": 2.21744441833385e-06, + "loss": 0.8946, + "step": 7769 + }, + { + "epoch": 1.1000212359311956, + "grad_norm": 9.563863021348292, + "learning_rate": 2.2168748989137166e-06, + "loss": 1.0558, + "step": 7770 + }, + { + "epoch": 1.1001628088058328, + "grad_norm": 8.542379755787602, + "learning_rate": 2.2163053943772585e-06, + "loss": 0.9893, + "step": 7771 + }, + { + "epoch": 1.10030438168047, + "grad_norm": 7.921303632547718, + "learning_rate": 2.2157359047544137e-06, + "loss": 1.0239, + "step": 7772 + }, + { + "epoch": 1.1004459545551073, + "grad_norm": 9.18988794966205, + "learning_rate": 2.215166430075121e-06, + "loss": 1.0612, + "step": 7773 + }, + { + "epoch": 1.1005875274297445, + "grad_norm": 8.202046059997011, + "learning_rate": 2.2145969703693167e-06, + "loss": 0.9253, + "step": 7774 + }, + { + "epoch": 1.1007291003043818, + "grad_norm": 8.514714687582755, + "learning_rate": 2.2140275256669365e-06, + "loss": 1.0718, + "step": 7775 + }, + { + "epoch": 1.100870673179019, + "grad_norm": 9.586372048524634, + "learning_rate": 2.2134580959979164e-06, + "loss": 1.065, + "step": 7776 + }, + { + "epoch": 1.1010122460536562, + "grad_norm": 10.079152230836819, + "learning_rate": 2.2128886813921906e-06, + "loss": 1.0661, + "step": 7777 + }, + { + "epoch": 1.1011538189282932, + "grad_norm": 8.512827450452162, + "learning_rate": 2.2123192818796928e-06, + "loss": 1.049, + "step": 7778 + }, + { + "epoch": 1.1012953918029305, + "grad_norm": 8.659169940592378, + "learning_rate": 2.211749897490356e-06, + "loss": 1.097, + "step": 7779 + }, + { + "epoch": 1.1014369646775677, + "grad_norm": 9.101764935382853, + "learning_rate": 2.2111805282541114e-06, + "loss": 0.9961, + "step": 7780 + }, + { + "epoch": 1.101578537552205, + "grad_norm": 8.450917066660761, + "learning_rate": 2.2106111742008914e-06, + "loss": 1.0164, + "step": 7781 + }, + { + "epoch": 1.1017201104268421, + "grad_norm": 9.923255163603937, + "learning_rate": 2.2100418353606262e-06, + "loss": 1.0182, + "step": 7782 + }, + { + "epoch": 1.1018616833014794, + "grad_norm": 8.211094064426694, + "learning_rate": 2.2094725117632454e-06, + "loss": 0.959, + "step": 7783 + }, + { + "epoch": 1.1020032561761166, + "grad_norm": 8.42353807211339, + "learning_rate": 2.2089032034386775e-06, + "loss": 1.0588, + "step": 7784 + }, + { + "epoch": 1.1021448290507538, + "grad_norm": 8.487069111421844, + "learning_rate": 2.208333910416852e-06, + "loss": 0.9729, + "step": 7785 + }, + { + "epoch": 1.102286401925391, + "grad_norm": 7.11172935990434, + "learning_rate": 2.2077646327276948e-06, + "loss": 1.0025, + "step": 7786 + }, + { + 
"epoch": 1.1024279748000283, + "grad_norm": 8.418096993657706, + "learning_rate": 2.207195370401134e-06, + "loss": 1.0769, + "step": 7787 + }, + { + "epoch": 1.1025695476746655, + "grad_norm": 9.769274318142827, + "learning_rate": 2.206626123467093e-06, + "loss": 1.0167, + "step": 7788 + }, + { + "epoch": 1.1027111205493028, + "grad_norm": 9.893347583907866, + "learning_rate": 2.2060568919554997e-06, + "loss": 1.031, + "step": 7789 + }, + { + "epoch": 1.10285269342394, + "grad_norm": 9.281943760897754, + "learning_rate": 2.205487675896275e-06, + "loss": 1.005, + "step": 7790 + }, + { + "epoch": 1.1029942662985772, + "grad_norm": 10.364650850060139, + "learning_rate": 2.2049184753193438e-06, + "loss": 1.1171, + "step": 7791 + }, + { + "epoch": 1.1031358391732144, + "grad_norm": 8.53594461462041, + "learning_rate": 2.2043492902546284e-06, + "loss": 1.0729, + "step": 7792 + }, + { + "epoch": 1.1032774120478517, + "grad_norm": 10.022475733890593, + "learning_rate": 2.20378012073205e-06, + "loss": 1.0861, + "step": 7793 + }, + { + "epoch": 1.103418984922489, + "grad_norm": 6.9351247209419045, + "learning_rate": 2.20321096678153e-06, + "loss": 0.946, + "step": 7794 + }, + { + "epoch": 1.1035605577971261, + "grad_norm": 11.310217189710283, + "learning_rate": 2.202641828432988e-06, + "loss": 1.1024, + "step": 7795 + }, + { + "epoch": 1.1037021306717634, + "grad_norm": 11.556890003066082, + "learning_rate": 2.202072705716344e-06, + "loss": 1.0444, + "step": 7796 + }, + { + "epoch": 1.1038437035464006, + "grad_norm": 9.071852915464635, + "learning_rate": 2.201503598661515e-06, + "loss": 1.0538, + "step": 7797 + }, + { + "epoch": 1.1039852764210378, + "grad_norm": 9.933639832276123, + "learning_rate": 2.2009345072984198e-06, + "loss": 1.0406, + "step": 7798 + }, + { + "epoch": 1.104126849295675, + "grad_norm": 10.296545275212079, + "learning_rate": 2.2003654316569746e-06, + "loss": 1.1622, + "step": 7799 + }, + { + "epoch": 1.104268422170312, + "grad_norm": 8.596945428646444, + "learning_rate": 2.1997963717670952e-06, + "loss": 0.9745, + "step": 7800 + }, + { + "epoch": 1.1044099950449493, + "grad_norm": 8.02059146136597, + "learning_rate": 2.1992273276586966e-06, + "loss": 1.0064, + "step": 7801 + }, + { + "epoch": 1.1045515679195865, + "grad_norm": 8.61662881376805, + "learning_rate": 2.1986582993616926e-06, + "loss": 0.9164, + "step": 7802 + }, + { + "epoch": 1.1046931407942238, + "grad_norm": 9.493205803146713, + "learning_rate": 2.198089286905998e-06, + "loss": 1.0304, + "step": 7803 + }, + { + "epoch": 1.104834713668861, + "grad_norm": 9.839652622573125, + "learning_rate": 2.197520290321524e-06, + "loss": 1.0535, + "step": 7804 + }, + { + "epoch": 1.1049762865434982, + "grad_norm": 11.799986655017612, + "learning_rate": 2.1969513096381823e-06, + "loss": 0.9748, + "step": 7805 + }, + { + "epoch": 1.1051178594181355, + "grad_norm": 11.471650715638162, + "learning_rate": 2.1963823448858852e-06, + "loss": 1.0501, + "step": 7806 + }, + { + "epoch": 1.1052594322927727, + "grad_norm": 10.218384155817365, + "learning_rate": 2.195813396094541e-06, + "loss": 1.0938, + "step": 7807 + }, + { + "epoch": 1.10540100516741, + "grad_norm": 8.75339158948391, + "learning_rate": 2.19524446329406e-06, + "loss": 0.8999, + "step": 7808 + }, + { + "epoch": 1.1055425780420471, + "grad_norm": 8.846795625913355, + "learning_rate": 2.1946755465143505e-06, + "loss": 1.0111, + "step": 7809 + }, + { + "epoch": 1.1056841509166844, + "grad_norm": 11.613022407138933, + "learning_rate": 2.1941066457853213e-06, + "loss": 1.0618, 
+ "step": 7810 + }, + { + "epoch": 1.1058257237913216, + "grad_norm": 10.392189584977613, + "learning_rate": 2.1935377611368758e-06, + "loss": 0.9997, + "step": 7811 + }, + { + "epoch": 1.1059672966659588, + "grad_norm": 10.425432916155463, + "learning_rate": 2.192968892598922e-06, + "loss": 0.9846, + "step": 7812 + }, + { + "epoch": 1.106108869540596, + "grad_norm": 7.836066506446779, + "learning_rate": 2.1924000402013644e-06, + "loss": 0.886, + "step": 7813 + }, + { + "epoch": 1.1062504424152333, + "grad_norm": 8.872744851301038, + "learning_rate": 2.1918312039741075e-06, + "loss": 1.0536, + "step": 7814 + }, + { + "epoch": 1.1063920152898705, + "grad_norm": 9.285995143792984, + "learning_rate": 2.1912623839470545e-06, + "loss": 1.1311, + "step": 7815 + }, + { + "epoch": 1.1065335881645078, + "grad_norm": 10.056498947523584, + "learning_rate": 2.190693580150108e-06, + "loss": 1.0296, + "step": 7816 + }, + { + "epoch": 1.106675161039145, + "grad_norm": 9.135165052671505, + "learning_rate": 2.190124792613169e-06, + "loss": 0.929, + "step": 7817 + }, + { + "epoch": 1.1068167339137822, + "grad_norm": 9.019150175335474, + "learning_rate": 2.1895560213661387e-06, + "loss": 1.0635, + "step": 7818 + }, + { + "epoch": 1.1069583067884192, + "grad_norm": 11.394734796347898, + "learning_rate": 2.188987266438917e-06, + "loss": 1.1556, + "step": 7819 + }, + { + "epoch": 1.1070998796630565, + "grad_norm": 8.757656126642535, + "learning_rate": 2.188418527861403e-06, + "loss": 0.9113, + "step": 7820 + }, + { + "epoch": 1.1072414525376937, + "grad_norm": 7.98163117146029, + "learning_rate": 2.1878498056634946e-06, + "loss": 0.931, + "step": 7821 + }, + { + "epoch": 1.107383025412331, + "grad_norm": 10.444348675263972, + "learning_rate": 2.187281099875089e-06, + "loss": 1.0304, + "step": 7822 + }, + { + "epoch": 1.1075245982869681, + "grad_norm": 8.888515872546098, + "learning_rate": 2.186712410526083e-06, + "loss": 0.9608, + "step": 7823 + }, + { + "epoch": 1.1076661711616054, + "grad_norm": 8.011196407734664, + "learning_rate": 2.186143737646372e-06, + "loss": 1.0289, + "step": 7824 + }, + { + "epoch": 1.1078077440362426, + "grad_norm": 10.80088720562857, + "learning_rate": 2.18557508126585e-06, + "loss": 1.1047, + "step": 7825 + }, + { + "epoch": 1.1079493169108798, + "grad_norm": 10.38545564257094, + "learning_rate": 2.1850064414144124e-06, + "loss": 1.0457, + "step": 7826 + }, + { + "epoch": 1.108090889785517, + "grad_norm": 10.136529080595478, + "learning_rate": 2.1844378181219507e-06, + "loss": 0.9977, + "step": 7827 + }, + { + "epoch": 1.1082324626601543, + "grad_norm": 9.23349175529589, + "learning_rate": 2.183869211418358e-06, + "loss": 0.8995, + "step": 7828 + }, + { + "epoch": 1.1083740355347915, + "grad_norm": 9.801688488673975, + "learning_rate": 2.1833006213335243e-06, + "loss": 1.0122, + "step": 7829 + }, + { + "epoch": 1.1085156084094288, + "grad_norm": 10.161674482874288, + "learning_rate": 2.1827320478973414e-06, + "loss": 1.0053, + "step": 7830 + }, + { + "epoch": 1.108657181284066, + "grad_norm": 8.761062168889595, + "learning_rate": 2.1821634911396993e-06, + "loss": 1.0247, + "step": 7831 + }, + { + "epoch": 1.1087987541587032, + "grad_norm": 7.865806905456412, + "learning_rate": 2.1815949510904843e-06, + "loss": 0.9465, + "step": 7832 + }, + { + "epoch": 1.1089403270333404, + "grad_norm": 10.398850935033302, + "learning_rate": 2.1810264277795856e-06, + "loss": 1.0947, + "step": 7833 + }, + { + "epoch": 1.1090818999079777, + "grad_norm": 10.371144509056576, + "learning_rate": 
2.180457921236889e-06, + "loss": 0.9703, + "step": 7834 + }, + { + "epoch": 1.109223472782615, + "grad_norm": 9.485849784303518, + "learning_rate": 2.1798894314922824e-06, + "loss": 1.0218, + "step": 7835 + }, + { + "epoch": 1.1093650456572521, + "grad_norm": 9.205507238897301, + "learning_rate": 2.1793209585756483e-06, + "loss": 0.9762, + "step": 7836 + }, + { + "epoch": 1.1095066185318894, + "grad_norm": 8.52784431865079, + "learning_rate": 2.178752502516873e-06, + "loss": 1.029, + "step": 7837 + }, + { + "epoch": 1.1096481914065266, + "grad_norm": 8.883766250606223, + "learning_rate": 2.1781840633458394e-06, + "loss": 0.8951, + "step": 7838 + }, + { + "epoch": 1.1097897642811638, + "grad_norm": 8.201810720607961, + "learning_rate": 2.177615641092429e-06, + "loss": 0.9395, + "step": 7839 + }, + { + "epoch": 1.109931337155801, + "grad_norm": 9.167014190559643, + "learning_rate": 2.1770472357865247e-06, + "loss": 1.0432, + "step": 7840 + }, + { + "epoch": 1.110072910030438, + "grad_norm": 8.607058232405286, + "learning_rate": 2.1764788474580062e-06, + "loss": 1.0063, + "step": 7841 + }, + { + "epoch": 1.1102144829050753, + "grad_norm": 8.790181352395855, + "learning_rate": 2.175910476136754e-06, + "loss": 1.0843, + "step": 7842 + }, + { + "epoch": 1.1103560557797125, + "grad_norm": 7.807874118759506, + "learning_rate": 2.1753421218526458e-06, + "loss": 0.9903, + "step": 7843 + }, + { + "epoch": 1.1104976286543498, + "grad_norm": 9.813986890356132, + "learning_rate": 2.1747737846355603e-06, + "loss": 1.0126, + "step": 7844 + }, + { + "epoch": 1.110639201528987, + "grad_norm": 8.732767192554459, + "learning_rate": 2.1742054645153744e-06, + "loss": 1.0369, + "step": 7845 + }, + { + "epoch": 1.1107807744036242, + "grad_norm": 12.243096099419025, + "learning_rate": 2.173637161521964e-06, + "loss": 1.1151, + "step": 7846 + }, + { + "epoch": 1.1109223472782614, + "grad_norm": 11.19269019412841, + "learning_rate": 2.1730688756852046e-06, + "loss": 1.1045, + "step": 7847 + }, + { + "epoch": 1.1110639201528987, + "grad_norm": 9.477838064014165, + "learning_rate": 2.172500607034971e-06, + "loss": 1.1106, + "step": 7848 + }, + { + "epoch": 1.111205493027536, + "grad_norm": 8.211750952064394, + "learning_rate": 2.1719323556011364e-06, + "loss": 0.9405, + "step": 7849 + }, + { + "epoch": 1.1113470659021731, + "grad_norm": 9.046782656183504, + "learning_rate": 2.171364121413573e-06, + "loss": 1.0377, + "step": 7850 + }, + { + "epoch": 1.1114886387768104, + "grad_norm": 9.894427540292448, + "learning_rate": 2.170795904502153e-06, + "loss": 1.0051, + "step": 7851 + }, + { + "epoch": 1.1116302116514476, + "grad_norm": 10.025012207023819, + "learning_rate": 2.170227704896746e-06, + "loss": 1.1424, + "step": 7852 + }, + { + "epoch": 1.1117717845260848, + "grad_norm": 8.998719442182932, + "learning_rate": 2.169659522627224e-06, + "loss": 1.0936, + "step": 7853 + }, + { + "epoch": 1.111913357400722, + "grad_norm": 9.21873303427994, + "learning_rate": 2.1690913577234542e-06, + "loss": 1.0375, + "step": 7854 + }, + { + "epoch": 1.1120549302753593, + "grad_norm": 9.00291226210823, + "learning_rate": 2.1685232102153045e-06, + "loss": 1.0632, + "step": 7855 + }, + { + "epoch": 1.1121965031499965, + "grad_norm": 9.883803680532738, + "learning_rate": 2.1679550801326428e-06, + "loss": 1.0728, + "step": 7856 + }, + { + "epoch": 1.1123380760246337, + "grad_norm": 9.28426345557889, + "learning_rate": 2.167386967505335e-06, + "loss": 1.0415, + "step": 7857 + }, + { + "epoch": 1.112479648899271, + "grad_norm": 
8.567408136521475, + "learning_rate": 2.1668188723632454e-06, + "loss": 0.8916, + "step": 7858 + }, + { + "epoch": 1.1126212217739082, + "grad_norm": 10.247226246743708, + "learning_rate": 2.1662507947362397e-06, + "loss": 1.1484, + "step": 7859 + }, + { + "epoch": 1.1127627946485454, + "grad_norm": 9.908659921647887, + "learning_rate": 2.165682734654181e-06, + "loss": 1.0896, + "step": 7860 + }, + { + "epoch": 1.1129043675231824, + "grad_norm": 9.299683218862425, + "learning_rate": 2.165114692146932e-06, + "loss": 1.1774, + "step": 7861 + }, + { + "epoch": 1.1130459403978197, + "grad_norm": 9.145647908780806, + "learning_rate": 2.1645466672443535e-06, + "loss": 1.0031, + "step": 7862 + }, + { + "epoch": 1.113187513272457, + "grad_norm": 9.504962227478908, + "learning_rate": 2.163978659976308e-06, + "loss": 0.9978, + "step": 7863 + }, + { + "epoch": 1.1133290861470941, + "grad_norm": 10.092625227579543, + "learning_rate": 2.163410670372652e-06, + "loss": 1.0091, + "step": 7864 + }, + { + "epoch": 1.1134706590217314, + "grad_norm": 8.239877675454006, + "learning_rate": 2.1628426984632465e-06, + "loss": 1.0185, + "step": 7865 + }, + { + "epoch": 1.1136122318963686, + "grad_norm": 8.629191831229402, + "learning_rate": 2.1622747442779495e-06, + "loss": 1.0476, + "step": 7866 + }, + { + "epoch": 1.1137538047710058, + "grad_norm": 9.088570814259015, + "learning_rate": 2.161706807846617e-06, + "loss": 1.0619, + "step": 7867 + }, + { + "epoch": 1.113895377645643, + "grad_norm": 8.060916715603657, + "learning_rate": 2.161138889199105e-06, + "loss": 1.0739, + "step": 7868 + }, + { + "epoch": 1.1140369505202803, + "grad_norm": 8.998134843625234, + "learning_rate": 2.1605709883652693e-06, + "loss": 1.0203, + "step": 7869 + }, + { + "epoch": 1.1141785233949175, + "grad_norm": 8.444906869016803, + "learning_rate": 2.160003105374964e-06, + "loss": 0.9814, + "step": 7870 + }, + { + "epoch": 1.1143200962695547, + "grad_norm": 10.015457794650516, + "learning_rate": 2.1594352402580413e-06, + "loss": 1.0041, + "step": 7871 + }, + { + "epoch": 1.114461669144192, + "grad_norm": 7.8688240659832065, + "learning_rate": 2.1588673930443544e-06, + "loss": 1.1133, + "step": 7872 + }, + { + "epoch": 1.1146032420188292, + "grad_norm": 11.050582069497834, + "learning_rate": 2.1582995637637543e-06, + "loss": 0.9915, + "step": 7873 + }, + { + "epoch": 1.1147448148934664, + "grad_norm": 7.951269026902787, + "learning_rate": 2.1577317524460917e-06, + "loss": 1.0291, + "step": 7874 + }, + { + "epoch": 1.1148863877681037, + "grad_norm": 9.644200186070584, + "learning_rate": 2.157163959121215e-06, + "loss": 0.9561, + "step": 7875 + }, + { + "epoch": 1.115027960642741, + "grad_norm": 8.964656094849694, + "learning_rate": 2.1565961838189738e-06, + "loss": 0.9856, + "step": 7876 + }, + { + "epoch": 1.1151695335173781, + "grad_norm": 9.878142424257014, + "learning_rate": 2.156028426569215e-06, + "loss": 1.0686, + "step": 7877 + }, + { + "epoch": 1.1153111063920154, + "grad_norm": 9.01360818131613, + "learning_rate": 2.155460687401785e-06, + "loss": 0.9961, + "step": 7878 + }, + { + "epoch": 1.1154526792666526, + "grad_norm": 9.991307486048147, + "learning_rate": 2.1548929663465305e-06, + "loss": 1.1562, + "step": 7879 + }, + { + "epoch": 1.1155942521412898, + "grad_norm": 10.60197844845406, + "learning_rate": 2.154325263433295e-06, + "loss": 0.9516, + "step": 7880 + }, + { + "epoch": 1.115735825015927, + "grad_norm": 9.548024509483174, + "learning_rate": 2.1537575786919222e-06, + "loss": 1.0218, + "step": 7881 + }, + { + "epoch": 
1.1158773978905643, + "grad_norm": 11.71265727032219, + "learning_rate": 2.1531899121522557e-06, + "loss": 1.1345, + "step": 7882 + }, + { + "epoch": 1.1160189707652013, + "grad_norm": 8.893298131061908, + "learning_rate": 2.152622263844137e-06, + "loss": 1.0003, + "step": 7883 + }, + { + "epoch": 1.1161605436398385, + "grad_norm": 9.550234039794194, + "learning_rate": 2.152054633797408e-06, + "loss": 1.0498, + "step": 7884 + }, + { + "epoch": 1.1163021165144758, + "grad_norm": 10.2149105966981, + "learning_rate": 2.1514870220419063e-06, + "loss": 1.0762, + "step": 7885 + }, + { + "epoch": 1.116443689389113, + "grad_norm": 8.609192889549673, + "learning_rate": 2.150919428607472e-06, + "loss": 1.038, + "step": 7886 + }, + { + "epoch": 1.1165852622637502, + "grad_norm": 10.285628386547529, + "learning_rate": 2.1503518535239427e-06, + "loss": 1.0682, + "step": 7887 + }, + { + "epoch": 1.1167268351383874, + "grad_norm": 11.22115340561929, + "learning_rate": 2.149784296821156e-06, + "loss": 1.0644, + "step": 7888 + }, + { + "epoch": 1.1168684080130247, + "grad_norm": 8.99617262096342, + "learning_rate": 2.1492167585289476e-06, + "loss": 1.077, + "step": 7889 + }, + { + "epoch": 1.117009980887662, + "grad_norm": 9.741669837640051, + "learning_rate": 2.148649238677153e-06, + "loss": 1.0438, + "step": 7890 + }, + { + "epoch": 1.1171515537622991, + "grad_norm": 8.135227749853852, + "learning_rate": 2.148081737295606e-06, + "loss": 1.0089, + "step": 7891 + }, + { + "epoch": 1.1172931266369364, + "grad_norm": 9.527539540961762, + "learning_rate": 2.147514254414139e-06, + "loss": 1.0526, + "step": 7892 + }, + { + "epoch": 1.1174346995115736, + "grad_norm": 8.062439910901324, + "learning_rate": 2.146946790062586e-06, + "loss": 0.9349, + "step": 7893 + }, + { + "epoch": 1.1175762723862108, + "grad_norm": 7.62080943536809, + "learning_rate": 2.146379344270776e-06, + "loss": 1.0176, + "step": 7894 + }, + { + "epoch": 1.117717845260848, + "grad_norm": 7.9262274047356245, + "learning_rate": 2.145811917068541e-06, + "loss": 0.9441, + "step": 7895 + }, + { + "epoch": 1.1178594181354853, + "grad_norm": 8.196296916418405, + "learning_rate": 2.145244508485709e-06, + "loss": 0.932, + "step": 7896 + }, + { + "epoch": 1.1180009910101225, + "grad_norm": 9.486013858483439, + "learning_rate": 2.1446771185521086e-06, + "loss": 1.0541, + "step": 7897 + }, + { + "epoch": 1.1181425638847597, + "grad_norm": 9.564372758175217, + "learning_rate": 2.1441097472975667e-06, + "loss": 1.0544, + "step": 7898 + }, + { + "epoch": 1.118284136759397, + "grad_norm": 9.37944149347349, + "learning_rate": 2.143542394751911e-06, + "loss": 1.1405, + "step": 7899 + }, + { + "epoch": 1.1184257096340342, + "grad_norm": 8.286115721199963, + "learning_rate": 2.142975060944965e-06, + "loss": 1.0873, + "step": 7900 + }, + { + "epoch": 1.1185672825086714, + "grad_norm": 10.514465177378666, + "learning_rate": 2.1424077459065544e-06, + "loss": 0.9992, + "step": 7901 + }, + { + "epoch": 1.1187088553833084, + "grad_norm": 9.345594345304754, + "learning_rate": 2.1418404496665015e-06, + "loss": 0.9512, + "step": 7902 + }, + { + "epoch": 1.1188504282579457, + "grad_norm": 10.627173347343694, + "learning_rate": 2.1412731722546294e-06, + "loss": 1.1375, + "step": 7903 + }, + { + "epoch": 1.118992001132583, + "grad_norm": 7.8341594287503575, + "learning_rate": 2.1407059137007587e-06, + "loss": 0.9872, + "step": 7904 + }, + { + "epoch": 1.1191335740072201, + "grad_norm": 7.246573855388598, + "learning_rate": 2.14013867403471e-06, + "loss": 0.9653, + "step": 
7905 + }, + { + "epoch": 1.1192751468818574, + "grad_norm": 7.9611737307014065, + "learning_rate": 2.139571453286305e-06, + "loss": 0.9162, + "step": 7906 + }, + { + "epoch": 1.1194167197564946, + "grad_norm": 10.936081799796302, + "learning_rate": 2.139004251485358e-06, + "loss": 1.0329, + "step": 7907 + }, + { + "epoch": 1.1195582926311318, + "grad_norm": 9.701249535689252, + "learning_rate": 2.138437068661689e-06, + "loss": 0.9006, + "step": 7908 + }, + { + "epoch": 1.119699865505769, + "grad_norm": 10.442739667710677, + "learning_rate": 2.1378699048451136e-06, + "loss": 1.1187, + "step": 7909 + }, + { + "epoch": 1.1198414383804063, + "grad_norm": 9.723252998083826, + "learning_rate": 2.1373027600654465e-06, + "loss": 0.9159, + "step": 7910 + }, + { + "epoch": 1.1199830112550435, + "grad_norm": 8.813965587627559, + "learning_rate": 2.1367356343525035e-06, + "loss": 1.0564, + "step": 7911 + }, + { + "epoch": 1.1201245841296807, + "grad_norm": 8.378785601549803, + "learning_rate": 2.1361685277360973e-06, + "loss": 0.9797, + "step": 7912 + }, + { + "epoch": 1.120266157004318, + "grad_norm": 8.817735347867902, + "learning_rate": 2.1356014402460403e-06, + "loss": 0.9677, + "step": 7913 + }, + { + "epoch": 1.1204077298789552, + "grad_norm": 10.871655881756382, + "learning_rate": 2.1350343719121437e-06, + "loss": 1.0749, + "step": 7914 + }, + { + "epoch": 1.1205493027535924, + "grad_norm": 7.10053939919507, + "learning_rate": 2.134467322764218e-06, + "loss": 0.9512, + "step": 7915 + }, + { + "epoch": 1.1206908756282297, + "grad_norm": 9.676013461711928, + "learning_rate": 2.1339002928320737e-06, + "loss": 1.0426, + "step": 7916 + }, + { + "epoch": 1.120832448502867, + "grad_norm": 10.29967316044126, + "learning_rate": 2.133333282145517e-06, + "loss": 0.9794, + "step": 7917 + }, + { + "epoch": 1.1209740213775041, + "grad_norm": 10.862816661788816, + "learning_rate": 2.1327662907343564e-06, + "loss": 1.0977, + "step": 7918 + }, + { + "epoch": 1.1211155942521414, + "grad_norm": 9.01062825272406, + "learning_rate": 2.1321993186283985e-06, + "loss": 0.9615, + "step": 7919 + }, + { + "epoch": 1.1212571671267786, + "grad_norm": 11.193598784464207, + "learning_rate": 2.1316323658574477e-06, + "loss": 1.0648, + "step": 7920 + }, + { + "epoch": 1.1213987400014158, + "grad_norm": 10.931330221327816, + "learning_rate": 2.1310654324513087e-06, + "loss": 1.0338, + "step": 7921 + }, + { + "epoch": 1.121540312876053, + "grad_norm": 9.76840038686806, + "learning_rate": 2.130498518439785e-06, + "loss": 1.0693, + "step": 7922 + }, + { + "epoch": 1.1216818857506903, + "grad_norm": 10.560737056441324, + "learning_rate": 2.1299316238526786e-06, + "loss": 1.1267, + "step": 7923 + }, + { + "epoch": 1.1218234586253273, + "grad_norm": 8.199281558686302, + "learning_rate": 2.129364748719791e-06, + "loss": 1.1759, + "step": 7924 + }, + { + "epoch": 1.1219650314999645, + "grad_norm": 7.599564449478493, + "learning_rate": 2.128797893070922e-06, + "loss": 0.919, + "step": 7925 + }, + { + "epoch": 1.1221066043746017, + "grad_norm": 8.951948105185245, + "learning_rate": 2.1282310569358704e-06, + "loss": 1.0479, + "step": 7926 + }, + { + "epoch": 1.122248177249239, + "grad_norm": 9.727749205970403, + "learning_rate": 2.127664240344437e-06, + "loss": 1.0551, + "step": 7927 + }, + { + "epoch": 1.1223897501238762, + "grad_norm": 7.470684232002254, + "learning_rate": 2.1270974433264152e-06, + "loss": 0.9578, + "step": 7928 + }, + { + "epoch": 1.1225313229985134, + "grad_norm": 8.669891295494056, + "learning_rate": 
2.126530665911603e-06, + "loss": 1.0469, + "step": 7929 + }, + { + "epoch": 1.1226728958731507, + "grad_norm": 8.567713577284362, + "learning_rate": 2.125963908129795e-06, + "loss": 0.9389, + "step": 7930 + }, + { + "epoch": 1.122814468747788, + "grad_norm": 9.16401837558644, + "learning_rate": 2.125397170010786e-06, + "loss": 1.0092, + "step": 7931 + }, + { + "epoch": 1.1229560416224251, + "grad_norm": 10.13174326953104, + "learning_rate": 2.124830451584368e-06, + "loss": 1.0343, + "step": 7932 + }, + { + "epoch": 1.1230976144970624, + "grad_norm": 9.618365094054163, + "learning_rate": 2.124263752880333e-06, + "loss": 1.0939, + "step": 7933 + }, + { + "epoch": 1.1232391873716996, + "grad_norm": 11.733338575650981, + "learning_rate": 2.123697073928473e-06, + "loss": 1.0159, + "step": 7934 + }, + { + "epoch": 1.1233807602463368, + "grad_norm": 8.545054686467873, + "learning_rate": 2.123130414758577e-06, + "loss": 0.9889, + "step": 7935 + }, + { + "epoch": 1.123522333120974, + "grad_norm": 8.334928843029157, + "learning_rate": 2.122563775400434e-06, + "loss": 0.896, + "step": 7936 + }, + { + "epoch": 1.1236639059956113, + "grad_norm": 9.977700738342767, + "learning_rate": 2.1219971558838333e-06, + "loss": 1.12, + "step": 7937 + }, + { + "epoch": 1.1238054788702485, + "grad_norm": 8.392824705113393, + "learning_rate": 2.1214305562385592e-06, + "loss": 0.8673, + "step": 7938 + }, + { + "epoch": 1.1239470517448857, + "grad_norm": 8.691418978221018, + "learning_rate": 2.120863976494398e-06, + "loss": 0.9854, + "step": 7939 + }, + { + "epoch": 1.124088624619523, + "grad_norm": 9.860537597980722, + "learning_rate": 2.1202974166811354e-06, + "loss": 1.115, + "step": 7940 + }, + { + "epoch": 1.1242301974941602, + "grad_norm": 8.813308570507694, + "learning_rate": 2.119730876828554e-06, + "loss": 1.0559, + "step": 7941 + }, + { + "epoch": 1.1243717703687974, + "grad_norm": 8.64098436046342, + "learning_rate": 2.1191643569664373e-06, + "loss": 1.0031, + "step": 7942 + }, + { + "epoch": 1.1245133432434347, + "grad_norm": 9.451181360103217, + "learning_rate": 2.1185978571245665e-06, + "loss": 1.0964, + "step": 7943 + }, + { + "epoch": 1.1246549161180717, + "grad_norm": 8.222707745533526, + "learning_rate": 2.1180313773327214e-06, + "loss": 1.0209, + "step": 7944 + }, + { + "epoch": 1.124796488992709, + "grad_norm": 8.393217854734337, + "learning_rate": 2.1174649176206826e-06, + "loss": 0.9588, + "step": 7945 + }, + { + "epoch": 1.1249380618673461, + "grad_norm": 8.402324681812393, + "learning_rate": 2.116898478018227e-06, + "loss": 1.0144, + "step": 7946 + }, + { + "epoch": 1.1250796347419834, + "grad_norm": 9.69474472362979, + "learning_rate": 2.1163320585551335e-06, + "loss": 1.0302, + "step": 7947 + }, + { + "epoch": 1.1252212076166206, + "grad_norm": 9.926973345621892, + "learning_rate": 2.115765659261178e-06, + "loss": 1.2149, + "step": 7948 + }, + { + "epoch": 1.1253627804912578, + "grad_norm": 8.64509651690605, + "learning_rate": 2.115199280166135e-06, + "loss": 0.9183, + "step": 7949 + }, + { + "epoch": 1.125504353365895, + "grad_norm": 8.126274126360604, + "learning_rate": 2.1146329212997784e-06, + "loss": 0.994, + "step": 7950 + }, + { + "epoch": 1.1256459262405323, + "grad_norm": 8.090560932870853, + "learning_rate": 2.1140665826918823e-06, + "loss": 0.9284, + "step": 7951 + }, + { + "epoch": 1.1257874991151695, + "grad_norm": 8.756191134203897, + "learning_rate": 2.113500264372218e-06, + "loss": 1.1091, + "step": 7952 + }, + { + "epoch": 1.1259290719898067, + "grad_norm": 9.883311962428369, 
+ "learning_rate": 2.1129339663705565e-06, + "loss": 1.1051, + "step": 7953 + }, + { + "epoch": 1.126070644864444, + "grad_norm": 9.553291624323785, + "learning_rate": 2.1123676887166678e-06, + "loss": 1.0834, + "step": 7954 + }, + { + "epoch": 1.1262122177390812, + "grad_norm": 8.609951436152828, + "learning_rate": 2.11180143144032e-06, + "loss": 1.0195, + "step": 7955 + }, + { + "epoch": 1.1263537906137184, + "grad_norm": 9.452036594754286, + "learning_rate": 2.1112351945712824e-06, + "loss": 1.1198, + "step": 7956 + }, + { + "epoch": 1.1264953634883557, + "grad_norm": 10.486633148560145, + "learning_rate": 2.1106689781393203e-06, + "loss": 1.1534, + "step": 7957 + }, + { + "epoch": 1.126636936362993, + "grad_norm": 8.843902694408394, + "learning_rate": 2.1101027821742013e-06, + "loss": 1.0617, + "step": 7958 + }, + { + "epoch": 1.1267785092376301, + "grad_norm": 9.135781803177736, + "learning_rate": 2.1095366067056868e-06, + "loss": 0.9613, + "step": 7959 + }, + { + "epoch": 1.1269200821122674, + "grad_norm": 8.07449418600715, + "learning_rate": 2.108970451763542e-06, + "loss": 0.9439, + "step": 7960 + }, + { + "epoch": 1.1270616549869046, + "grad_norm": 9.810047013134154, + "learning_rate": 2.1084043173775284e-06, + "loss": 1.1328, + "step": 7961 + }, + { + "epoch": 1.1272032278615418, + "grad_norm": 8.86066751622065, + "learning_rate": 2.1078382035774085e-06, + "loss": 0.8801, + "step": 7962 + }, + { + "epoch": 1.127344800736179, + "grad_norm": 8.685554121806623, + "learning_rate": 2.1072721103929415e-06, + "loss": 1.0805, + "step": 7963 + }, + { + "epoch": 1.1274863736108163, + "grad_norm": 8.932267798270466, + "learning_rate": 2.106706037853887e-06, + "loss": 0.9953, + "step": 7964 + }, + { + "epoch": 1.1276279464854535, + "grad_norm": 12.640280361687802, + "learning_rate": 2.106139985990003e-06, + "loss": 1.235, + "step": 7965 + }, + { + "epoch": 1.1277695193600905, + "grad_norm": 9.746966941720295, + "learning_rate": 2.105573954831046e-06, + "loss": 1.0546, + "step": 7966 + }, + { + "epoch": 1.1279110922347277, + "grad_norm": 9.411191535775206, + "learning_rate": 2.105007944406772e-06, + "loss": 1.0139, + "step": 7967 + }, + { + "epoch": 1.128052665109365, + "grad_norm": 7.278335184327521, + "learning_rate": 2.104441954746936e-06, + "loss": 0.8612, + "step": 7968 + }, + { + "epoch": 1.1281942379840022, + "grad_norm": 11.216958925729553, + "learning_rate": 2.1038759858812924e-06, + "loss": 1.0682, + "step": 7969 + }, + { + "epoch": 1.1283358108586394, + "grad_norm": 10.717381159179364, + "learning_rate": 2.103310037839592e-06, + "loss": 0.9938, + "step": 7970 + }, + { + "epoch": 1.1284773837332767, + "grad_norm": 8.87819173667609, + "learning_rate": 2.1027441106515872e-06, + "loss": 0.9956, + "step": 7971 + }, + { + "epoch": 1.128618956607914, + "grad_norm": 9.42883468854897, + "learning_rate": 2.1021782043470282e-06, + "loss": 1.0717, + "step": 7972 + }, + { + "epoch": 1.1287605294825511, + "grad_norm": 9.333884449990864, + "learning_rate": 2.1016123189556644e-06, + "loss": 1.0782, + "step": 7973 + }, + { + "epoch": 1.1289021023571884, + "grad_norm": 8.99169411453239, + "learning_rate": 2.101046454507244e-06, + "loss": 0.9551, + "step": 7974 + }, + { + "epoch": 1.1290436752318256, + "grad_norm": 10.829521760675371, + "learning_rate": 2.100480611031514e-06, + "loss": 1.1676, + "step": 7975 + }, + { + "epoch": 1.1291852481064628, + "grad_norm": 8.839318556951635, + "learning_rate": 2.0999147885582204e-06, + "loss": 1.0783, + "step": 7976 + }, + { + "epoch": 1.1293268209811, + 
"grad_norm": 11.716345456433597, + "learning_rate": 2.099348987117108e-06, + "loss": 1.0465, + "step": 7977 + }, + { + "epoch": 1.1294683938557373, + "grad_norm": 8.572201321600355, + "learning_rate": 2.09878320673792e-06, + "loss": 1.0768, + "step": 7978 + }, + { + "epoch": 1.1296099667303745, + "grad_norm": 9.357807722359567, + "learning_rate": 2.0982174474504004e-06, + "loss": 0.9845, + "step": 7979 + }, + { + "epoch": 1.1297515396050117, + "grad_norm": 10.147456379392803, + "learning_rate": 2.097651709284291e-06, + "loss": 1.0364, + "step": 7980 + }, + { + "epoch": 1.129893112479649, + "grad_norm": 8.505086386733575, + "learning_rate": 2.09708599226933e-06, + "loss": 0.9784, + "step": 7981 + }, + { + "epoch": 1.1300346853542862, + "grad_norm": 9.681298812240769, + "learning_rate": 2.096520296435258e-06, + "loss": 1.2178, + "step": 7982 + }, + { + "epoch": 1.1301762582289234, + "grad_norm": 10.40769274908589, + "learning_rate": 2.0959546218118133e-06, + "loss": 1.0085, + "step": 7983 + }, + { + "epoch": 1.1303178311035604, + "grad_norm": 10.107909295312538, + "learning_rate": 2.095388968428732e-06, + "loss": 1.0502, + "step": 7984 + }, + { + "epoch": 1.1304594039781977, + "grad_norm": 9.561202615726977, + "learning_rate": 2.094823336315752e-06, + "loss": 1.1084, + "step": 7985 + }, + { + "epoch": 1.130600976852835, + "grad_norm": 8.687706814506589, + "learning_rate": 2.0942577255026068e-06, + "loss": 0.9856, + "step": 7986 + }, + { + "epoch": 1.1307425497274721, + "grad_norm": 10.15867872102143, + "learning_rate": 2.0936921360190305e-06, + "loss": 1.0634, + "step": 7987 + }, + { + "epoch": 1.1308841226021094, + "grad_norm": 9.78312728465563, + "learning_rate": 2.0931265678947555e-06, + "loss": 1.1421, + "step": 7988 + }, + { + "epoch": 1.1310256954767466, + "grad_norm": 9.323734342779536, + "learning_rate": 2.0925610211595137e-06, + "loss": 0.9878, + "step": 7989 + }, + { + "epoch": 1.1311672683513838, + "grad_norm": 9.468234309389475, + "learning_rate": 2.0919954958430357e-06, + "loss": 1.1031, + "step": 7990 + }, + { + "epoch": 1.131308841226021, + "grad_norm": 7.9022341695627745, + "learning_rate": 2.0914299919750497e-06, + "loss": 1.0088, + "step": 7991 + }, + { + "epoch": 1.1314504141006583, + "grad_norm": 8.476129561236496, + "learning_rate": 2.090864509585284e-06, + "loss": 0.9829, + "step": 7992 + }, + { + "epoch": 1.1315919869752955, + "grad_norm": 8.070563017953445, + "learning_rate": 2.0902990487034664e-06, + "loss": 1.0507, + "step": 7993 + }, + { + "epoch": 1.1317335598499327, + "grad_norm": 9.435109625231533, + "learning_rate": 2.0897336093593223e-06, + "loss": 1.1032, + "step": 7994 + }, + { + "epoch": 1.13187513272457, + "grad_norm": 8.201140943097776, + "learning_rate": 2.0891681915825763e-06, + "loss": 1.014, + "step": 7995 + }, + { + "epoch": 1.1320167055992072, + "grad_norm": 10.103534314270846, + "learning_rate": 2.0886027954029525e-06, + "loss": 1.0338, + "step": 7996 + }, + { + "epoch": 1.1321582784738444, + "grad_norm": 8.456362740610247, + "learning_rate": 2.0880374208501724e-06, + "loss": 1.0103, + "step": 7997 + }, + { + "epoch": 1.1322998513484817, + "grad_norm": 8.061569751627257, + "learning_rate": 2.0874720679539585e-06, + "loss": 1.0092, + "step": 7998 + }, + { + "epoch": 1.1324414242231189, + "grad_norm": 8.685051662223334, + "learning_rate": 2.0869067367440297e-06, + "loss": 1.0877, + "step": 7999 + }, + { + "epoch": 1.1325829970977561, + "grad_norm": 11.152793199130796, + "learning_rate": 2.0863414272501067e-06, + "loss": 0.9724, + "step": 8000 + }, + 
{ + "epoch": 1.1327245699723933, + "grad_norm": 8.752993262435776, + "learning_rate": 2.0857761395019064e-06, + "loss": 1.1512, + "step": 8001 + }, + { + "epoch": 1.1328661428470306, + "grad_norm": 7.755569917555624, + "learning_rate": 2.0852108735291448e-06, + "loss": 1.0533, + "step": 8002 + }, + { + "epoch": 1.1330077157216678, + "grad_norm": 9.39111570577313, + "learning_rate": 2.0846456293615384e-06, + "loss": 1.0496, + "step": 8003 + }, + { + "epoch": 1.133149288596305, + "grad_norm": 8.627694413971724, + "learning_rate": 2.084080407028802e-06, + "loss": 0.9824, + "step": 8004 + }, + { + "epoch": 1.1332908614709423, + "grad_norm": 9.60940636110228, + "learning_rate": 2.083515206560648e-06, + "loss": 0.9822, + "step": 8005 + }, + { + "epoch": 1.1334324343455795, + "grad_norm": 7.814035249543191, + "learning_rate": 2.0829500279867895e-06, + "loss": 0.9972, + "step": 8006 + }, + { + "epoch": 1.1335740072202167, + "grad_norm": 7.174974224828485, + "learning_rate": 2.082384871336936e-06, + "loss": 0.8575, + "step": 8007 + }, + { + "epoch": 1.1337155800948537, + "grad_norm": 9.122722602819199, + "learning_rate": 2.081819736640799e-06, + "loss": 1.045, + "step": 8008 + }, + { + "epoch": 1.133857152969491, + "grad_norm": 8.935204698024856, + "learning_rate": 2.0812546239280873e-06, + "loss": 1.1077, + "step": 8009 + }, + { + "epoch": 1.1339987258441282, + "grad_norm": 8.292838111312054, + "learning_rate": 2.0806895332285078e-06, + "loss": 0.9519, + "step": 8010 + }, + { + "epoch": 1.1341402987187654, + "grad_norm": 8.12898837281319, + "learning_rate": 2.080124464571767e-06, + "loss": 1.071, + "step": 8011 + }, + { + "epoch": 1.1342818715934027, + "grad_norm": 8.235889273863974, + "learning_rate": 2.0795594179875697e-06, + "loss": 1.0817, + "step": 8012 + }, + { + "epoch": 1.1344234444680399, + "grad_norm": 8.647352362478319, + "learning_rate": 2.0789943935056196e-06, + "loss": 0.9052, + "step": 8013 + }, + { + "epoch": 1.1345650173426771, + "grad_norm": 10.492335110419221, + "learning_rate": 2.078429391155621e-06, + "loss": 1.0947, + "step": 8014 + }, + { + "epoch": 1.1347065902173143, + "grad_norm": 10.051136115844729, + "learning_rate": 2.0778644109672747e-06, + "loss": 1.0322, + "step": 8015 + }, + { + "epoch": 1.1348481630919516, + "grad_norm": 8.969945954124912, + "learning_rate": 2.077299452970282e-06, + "loss": 0.9908, + "step": 8016 + }, + { + "epoch": 1.1349897359665888, + "grad_norm": 9.964929117504777, + "learning_rate": 2.0767345171943412e-06, + "loss": 1.0698, + "step": 8017 + }, + { + "epoch": 1.135131308841226, + "grad_norm": 7.516043732650395, + "learning_rate": 2.0761696036691515e-06, + "loss": 1.0633, + "step": 8018 + }, + { + "epoch": 1.1352728817158633, + "grad_norm": 9.581862051986596, + "learning_rate": 2.07560471242441e-06, + "loss": 0.9925, + "step": 8019 + }, + { + "epoch": 1.1354144545905005, + "grad_norm": 8.488256981960246, + "learning_rate": 2.075039843489812e-06, + "loss": 1.1314, + "step": 8020 + }, + { + "epoch": 1.1355560274651377, + "grad_norm": 9.457431386103288, + "learning_rate": 2.0744749968950527e-06, + "loss": 1.0696, + "step": 8021 + }, + { + "epoch": 1.135697600339775, + "grad_norm": 9.089939013655298, + "learning_rate": 2.073910172669826e-06, + "loss": 1.1089, + "step": 8022 + }, + { + "epoch": 1.1358391732144122, + "grad_norm": 7.467943730588255, + "learning_rate": 2.0733453708438233e-06, + "loss": 0.9835, + "step": 8023 + }, + { + "epoch": 1.1359807460890494, + "grad_norm": 8.916624645478768, + "learning_rate": 2.072780591446736e-06, + "loss": 
0.9835, + "step": 8024 + }, + { + "epoch": 1.1361223189636867, + "grad_norm": 10.029849706343358, + "learning_rate": 2.072215834508255e-06, + "loss": 1.1443, + "step": 8025 + }, + { + "epoch": 1.1362638918383237, + "grad_norm": 9.251054033591085, + "learning_rate": 2.0716511000580684e-06, + "loss": 0.9697, + "step": 8026 + }, + { + "epoch": 1.1364054647129609, + "grad_norm": 8.842683222293118, + "learning_rate": 2.071086388125864e-06, + "loss": 1.0331, + "step": 8027 + }, + { + "epoch": 1.1365470375875981, + "grad_norm": 9.931943863672046, + "learning_rate": 2.0705216987413284e-06, + "loss": 1.1126, + "step": 8028 + }, + { + "epoch": 1.1366886104622353, + "grad_norm": 8.02191355643612, + "learning_rate": 2.069957031934147e-06, + "loss": 1.028, + "step": 8029 + }, + { + "epoch": 1.1368301833368726, + "grad_norm": 10.592853699523642, + "learning_rate": 2.0693923877340032e-06, + "loss": 1.0255, + "step": 8030 + }, + { + "epoch": 1.1369717562115098, + "grad_norm": 8.438373887759946, + "learning_rate": 2.0688277661705807e-06, + "loss": 0.923, + "step": 8031 + }, + { + "epoch": 1.137113329086147, + "grad_norm": 9.124596573782172, + "learning_rate": 2.0682631672735616e-06, + "loss": 1.0087, + "step": 8032 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 10.565259945330373, + "learning_rate": 2.0676985910726265e-06, + "loss": 0.9799, + "step": 8033 + }, + { + "epoch": 1.1373964748354215, + "grad_norm": 12.358729908694286, + "learning_rate": 2.0671340375974536e-06, + "loss": 1.1541, + "step": 8034 + }, + { + "epoch": 1.1375380477100587, + "grad_norm": 8.40083450531804, + "learning_rate": 2.066569506877721e-06, + "loss": 0.9223, + "step": 8035 + }, + { + "epoch": 1.137679620584696, + "grad_norm": 7.7088455717330415, + "learning_rate": 2.066004998943106e-06, + "loss": 0.989, + "step": 8036 + }, + { + "epoch": 1.1378211934593332, + "grad_norm": 9.187298208083691, + "learning_rate": 2.065440513823285e-06, + "loss": 0.9814, + "step": 8037 + }, + { + "epoch": 1.1379627663339704, + "grad_norm": 7.9804340946231145, + "learning_rate": 2.064876051547932e-06, + "loss": 0.9448, + "step": 8038 + }, + { + "epoch": 1.1381043392086077, + "grad_norm": 9.854569172182817, + "learning_rate": 2.064311612146721e-06, + "loss": 1.0189, + "step": 8039 + }, + { + "epoch": 1.1382459120832449, + "grad_norm": 7.092473793344417, + "learning_rate": 2.0637471956493236e-06, + "loss": 0.9163, + "step": 8040 + }, + { + "epoch": 1.1383874849578821, + "grad_norm": 9.049028578410537, + "learning_rate": 2.0631828020854106e-06, + "loss": 0.9935, + "step": 8041 + }, + { + "epoch": 1.1385290578325193, + "grad_norm": 8.806393502922766, + "learning_rate": 2.062618431484652e-06, + "loss": 1.1103, + "step": 8042 + }, + { + "epoch": 1.1386706307071566, + "grad_norm": 11.133159945236166, + "learning_rate": 2.062054083876717e-06, + "loss": 0.9773, + "step": 8043 + }, + { + "epoch": 1.1388122035817938, + "grad_norm": 7.561351342098338, + "learning_rate": 2.0614897592912716e-06, + "loss": 0.9507, + "step": 8044 + }, + { + "epoch": 1.138953776456431, + "grad_norm": 9.866084069587288, + "learning_rate": 2.060925457757983e-06, + "loss": 1.0788, + "step": 8045 + }, + { + "epoch": 1.1390953493310683, + "grad_norm": 9.291410799024574, + "learning_rate": 2.060361179306515e-06, + "loss": 0.9243, + "step": 8046 + }, + { + "epoch": 1.1392369222057055, + "grad_norm": 7.633834973417174, + "learning_rate": 2.0597969239665325e-06, + "loss": 1.0265, + "step": 8047 + }, + { + "epoch": 1.1393784950803427, + "grad_norm": 9.38968291928471, + "learning_rate": 
2.0592326917676975e-06, + "loss": 1.0203, + "step": 8048 + }, + { + "epoch": 1.1395200679549797, + "grad_norm": 9.676870113086508, + "learning_rate": 2.0586684827396708e-06, + "loss": 0.9761, + "step": 8049 + }, + { + "epoch": 1.139661640829617, + "grad_norm": 9.849908555279246, + "learning_rate": 2.0581042969121136e-06, + "loss": 0.9953, + "step": 8050 + }, + { + "epoch": 1.1398032137042542, + "grad_norm": 9.767549419761359, + "learning_rate": 2.0575401343146832e-06, + "loss": 1.0537, + "step": 8051 + }, + { + "epoch": 1.1399447865788914, + "grad_norm": 8.768162215660515, + "learning_rate": 2.056975994977038e-06, + "loss": 0.9714, + "step": 8052 + }, + { + "epoch": 1.1400863594535287, + "grad_norm": 12.742974832933378, + "learning_rate": 2.0564118789288347e-06, + "loss": 0.9571, + "step": 8053 + }, + { + "epoch": 1.1402279323281659, + "grad_norm": 8.553929404195076, + "learning_rate": 2.0558477861997293e-06, + "loss": 0.9957, + "step": 8054 + }, + { + "epoch": 1.1403695052028031, + "grad_norm": 7.878918838301793, + "learning_rate": 2.0552837168193738e-06, + "loss": 1.0468, + "step": 8055 + }, + { + "epoch": 1.1405110780774403, + "grad_norm": 8.437293834286782, + "learning_rate": 2.0547196708174215e-06, + "loss": 0.9866, + "step": 8056 + }, + { + "epoch": 1.1406526509520776, + "grad_norm": 10.676700244016684, + "learning_rate": 2.054155648223524e-06, + "loss": 1.0787, + "step": 8057 + }, + { + "epoch": 1.1407942238267148, + "grad_norm": 8.643597441655785, + "learning_rate": 2.0535916490673313e-06, + "loss": 0.9822, + "step": 8058 + }, + { + "epoch": 1.140935796701352, + "grad_norm": 10.821220629761171, + "learning_rate": 2.0530276733784933e-06, + "loss": 1.0839, + "step": 8059 + }, + { + "epoch": 1.1410773695759893, + "grad_norm": 8.0929341144338, + "learning_rate": 2.052463721186657e-06, + "loss": 0.9958, + "step": 8060 + }, + { + "epoch": 1.1412189424506265, + "grad_norm": 10.852548509132149, + "learning_rate": 2.0518997925214694e-06, + "loss": 1.0119, + "step": 8061 + }, + { + "epoch": 1.1413605153252637, + "grad_norm": 9.942020947351606, + "learning_rate": 2.0513358874125754e-06, + "loss": 0.9566, + "step": 8062 + }, + { + "epoch": 1.141502088199901, + "grad_norm": 8.28661014618425, + "learning_rate": 2.0507720058896195e-06, + "loss": 0.9255, + "step": 8063 + }, + { + "epoch": 1.1416436610745382, + "grad_norm": 10.33645603208482, + "learning_rate": 2.0502081479822447e-06, + "loss": 1.0833, + "step": 8064 + }, + { + "epoch": 1.1417852339491754, + "grad_norm": 10.449865634182142, + "learning_rate": 2.0496443137200915e-06, + "loss": 1.078, + "step": 8065 + }, + { + "epoch": 1.1419268068238126, + "grad_norm": 7.924155243817792, + "learning_rate": 2.0490805031328013e-06, + "loss": 0.9409, + "step": 8066 + }, + { + "epoch": 1.1420683796984497, + "grad_norm": 9.61418356568906, + "learning_rate": 2.0485167162500124e-06, + "loss": 0.9887, + "step": 8067 + }, + { + "epoch": 1.1422099525730869, + "grad_norm": 9.63394660801552, + "learning_rate": 2.047952953101363e-06, + "loss": 1.101, + "step": 8068 + }, + { + "epoch": 1.1423515254477241, + "grad_norm": 10.812077332101891, + "learning_rate": 2.0473892137164906e-06, + "loss": 1.008, + "step": 8069 + }, + { + "epoch": 1.1424930983223613, + "grad_norm": 10.885017078615808, + "learning_rate": 2.0468254981250293e-06, + "loss": 1.0406, + "step": 8070 + }, + { + "epoch": 1.1426346711969986, + "grad_norm": 11.843984475746275, + "learning_rate": 2.0462618063566135e-06, + "loss": 0.9937, + "step": 8071 + }, + { + "epoch": 1.1427762440716358, + "grad_norm": 
8.070204727620181, + "learning_rate": 2.045698138440876e-06, + "loss": 1.0747, + "step": 8072 + }, + { + "epoch": 1.142917816946273, + "grad_norm": 8.8077847466633, + "learning_rate": 2.045134494407449e-06, + "loss": 0.9604, + "step": 8073 + }, + { + "epoch": 1.1430593898209103, + "grad_norm": 9.887253510577235, + "learning_rate": 2.044570874285963e-06, + "loss": 1.1546, + "step": 8074 + }, + { + "epoch": 1.1432009626955475, + "grad_norm": 10.890931134730645, + "learning_rate": 2.044007278106046e-06, + "loss": 1.0558, + "step": 8075 + }, + { + "epoch": 1.1433425355701847, + "grad_norm": 13.41527788702615, + "learning_rate": 2.043443705897326e-06, + "loss": 0.9699, + "step": 8076 + }, + { + "epoch": 1.143484108444822, + "grad_norm": 10.238131467093984, + "learning_rate": 2.042880157689431e-06, + "loss": 1.0226, + "step": 8077 + }, + { + "epoch": 1.1436256813194592, + "grad_norm": 9.233603714749467, + "learning_rate": 2.0423166335119844e-06, + "loss": 1.0827, + "step": 8078 + }, + { + "epoch": 1.1437672541940964, + "grad_norm": 9.626435804407919, + "learning_rate": 2.0417531333946113e-06, + "loss": 1.1515, + "step": 8079 + }, + { + "epoch": 1.1439088270687336, + "grad_norm": 8.869090087047711, + "learning_rate": 2.041189657366934e-06, + "loss": 1.052, + "step": 8080 + }, + { + "epoch": 1.1440503999433709, + "grad_norm": 9.034394980493946, + "learning_rate": 2.040626205458574e-06, + "loss": 0.991, + "step": 8081 + }, + { + "epoch": 1.144191972818008, + "grad_norm": 10.058958958782597, + "learning_rate": 2.0400627776991526e-06, + "loss": 1.0415, + "step": 8082 + }, + { + "epoch": 1.1443335456926453, + "grad_norm": 13.921460694592854, + "learning_rate": 2.039499374118288e-06, + "loss": 1.0467, + "step": 8083 + }, + { + "epoch": 1.1444751185672826, + "grad_norm": 8.230305138234975, + "learning_rate": 2.0389359947455978e-06, + "loss": 0.9296, + "step": 8084 + }, + { + "epoch": 1.1446166914419198, + "grad_norm": 10.088149084600454, + "learning_rate": 2.0383726396106983e-06, + "loss": 1.0213, + "step": 8085 + }, + { + "epoch": 1.144758264316557, + "grad_norm": 10.705583944273691, + "learning_rate": 2.0378093087432067e-06, + "loss": 1.101, + "step": 8086 + }, + { + "epoch": 1.1448998371911943, + "grad_norm": 8.217631797000534, + "learning_rate": 2.037246002172733e-06, + "loss": 1.2336, + "step": 8087 + }, + { + "epoch": 1.1450414100658315, + "grad_norm": 7.678301758957597, + "learning_rate": 2.0366827199288923e-06, + "loss": 0.9199, + "step": 8088 + }, + { + "epoch": 1.1451829829404687, + "grad_norm": 10.33578802506713, + "learning_rate": 2.036119462041296e-06, + "loss": 1.0549, + "step": 8089 + }, + { + "epoch": 1.145324555815106, + "grad_norm": 9.325724588423201, + "learning_rate": 2.0355562285395537e-06, + "loss": 1.04, + "step": 8090 + }, + { + "epoch": 1.145466128689743, + "grad_norm": 11.812436704112825, + "learning_rate": 2.0349930194532734e-06, + "loss": 1.1286, + "step": 8091 + }, + { + "epoch": 1.1456077015643802, + "grad_norm": 9.149119650592585, + "learning_rate": 2.034429834812064e-06, + "loss": 0.9491, + "step": 8092 + }, + { + "epoch": 1.1457492744390174, + "grad_norm": 7.924789705103675, + "learning_rate": 2.033866674645531e-06, + "loss": 0.9858, + "step": 8093 + }, + { + "epoch": 1.1458908473136546, + "grad_norm": 8.853859235624757, + "learning_rate": 2.0333035389832795e-06, + "loss": 1.0049, + "step": 8094 + }, + { + "epoch": 1.1460324201882919, + "grad_norm": 10.188231646704645, + "learning_rate": 2.0327404278549127e-06, + "loss": 1.0459, + "step": 8095 + }, + { + "epoch": 
1.146173993062929, + "grad_norm": 8.818815524954788, + "learning_rate": 2.032177341290034e-06, + "loss": 1.0456, + "step": 8096 + }, + { + "epoch": 1.1463155659375663, + "grad_norm": 10.423848069764974, + "learning_rate": 2.031614279318243e-06, + "loss": 1.0331, + "step": 8097 + }, + { + "epoch": 1.1464571388122036, + "grad_norm": 10.372402003761556, + "learning_rate": 2.03105124196914e-06, + "loss": 1.0609, + "step": 8098 + }, + { + "epoch": 1.1465987116868408, + "grad_norm": 12.180992854199632, + "learning_rate": 2.030488229272323e-06, + "loss": 1.0503, + "step": 8099 + }, + { + "epoch": 1.146740284561478, + "grad_norm": 8.213922859849525, + "learning_rate": 2.0299252412573907e-06, + "loss": 0.9891, + "step": 8100 + }, + { + "epoch": 1.1468818574361153, + "grad_norm": 8.187037331124762, + "learning_rate": 2.0293622779539372e-06, + "loss": 1.0672, + "step": 8101 + }, + { + "epoch": 1.1470234303107525, + "grad_norm": 9.492834098990514, + "learning_rate": 2.0287993393915585e-06, + "loss": 1.0816, + "step": 8102 + }, + { + "epoch": 1.1471650031853897, + "grad_norm": 8.328336158351433, + "learning_rate": 2.0282364255998465e-06, + "loss": 0.9507, + "step": 8103 + }, + { + "epoch": 1.147306576060027, + "grad_norm": 11.574823853571637, + "learning_rate": 2.027673536608394e-06, + "loss": 1.179, + "step": 8104 + }, + { + "epoch": 1.1474481489346642, + "grad_norm": 10.101774350664257, + "learning_rate": 2.0271106724467915e-06, + "loss": 1.0451, + "step": 8105 + }, + { + "epoch": 1.1475897218093014, + "grad_norm": 10.570831804848034, + "learning_rate": 2.0265478331446285e-06, + "loss": 1.0749, + "step": 8106 + }, + { + "epoch": 1.1477312946839386, + "grad_norm": 9.13600059971582, + "learning_rate": 2.025985018731494e-06, + "loss": 0.9994, + "step": 8107 + }, + { + "epoch": 1.1478728675585759, + "grad_norm": 8.742297897119078, + "learning_rate": 2.0254222292369725e-06, + "loss": 0.9962, + "step": 8108 + }, + { + "epoch": 1.1480144404332129, + "grad_norm": 8.370113484514553, + "learning_rate": 2.024859464690651e-06, + "loss": 1.0271, + "step": 8109 + }, + { + "epoch": 1.14815601330785, + "grad_norm": 9.431426462351604, + "learning_rate": 2.0242967251221118e-06, + "loss": 0.97, + "step": 8110 + }, + { + "epoch": 1.1482975861824873, + "grad_norm": 8.517266183593625, + "learning_rate": 2.02373401056094e-06, + "loss": 0.9592, + "step": 8111 + }, + { + "epoch": 1.1484391590571246, + "grad_norm": 7.77563126629314, + "learning_rate": 2.0231713210367163e-06, + "loss": 1.0124, + "step": 8112 + }, + { + "epoch": 1.1485807319317618, + "grad_norm": 9.49314351860047, + "learning_rate": 2.0226086565790207e-06, + "loss": 0.9668, + "step": 8113 + }, + { + "epoch": 1.148722304806399, + "grad_norm": 10.58981132994853, + "learning_rate": 2.022046017217432e-06, + "loss": 1.0644, + "step": 8114 + }, + { + "epoch": 1.1488638776810363, + "grad_norm": 9.590851684556291, + "learning_rate": 2.0214834029815276e-06, + "loss": 1.0311, + "step": 8115 + }, + { + "epoch": 1.1490054505556735, + "grad_norm": 9.89141717182084, + "learning_rate": 2.020920813900884e-06, + "loss": 1.1153, + "step": 8116 + }, + { + "epoch": 1.1491470234303107, + "grad_norm": 9.09175386838647, + "learning_rate": 2.020358250005077e-06, + "loss": 1.0654, + "step": 8117 + }, + { + "epoch": 1.149288596304948, + "grad_norm": 9.520371930205938, + "learning_rate": 2.019795711323678e-06, + "loss": 0.9853, + "step": 8118 + }, + { + "epoch": 1.1494301691795852, + "grad_norm": 9.488998065631101, + "learning_rate": 2.0192331978862604e-06, + "loss": 1.0678, + "step": 
8119 + }, + { + "epoch": 1.1495717420542224, + "grad_norm": 8.176968622719155, + "learning_rate": 2.0186707097223952e-06, + "loss": 0.9608, + "step": 8120 + }, + { + "epoch": 1.1497133149288596, + "grad_norm": 8.470760092830687, + "learning_rate": 2.018108246861652e-06, + "loss": 1.0261, + "step": 8121 + }, + { + "epoch": 1.1498548878034969, + "grad_norm": 8.87064179344633, + "learning_rate": 2.017545809333599e-06, + "loss": 1.0997, + "step": 8122 + }, + { + "epoch": 1.149996460678134, + "grad_norm": 10.86678661756094, + "learning_rate": 2.0169833971678033e-06, + "loss": 1.0236, + "step": 8123 + }, + { + "epoch": 1.1501380335527713, + "grad_norm": 9.423436884689501, + "learning_rate": 2.0164210103938297e-06, + "loss": 1.117, + "step": 8124 + }, + { + "epoch": 1.1502796064274086, + "grad_norm": 9.930916770880534, + "learning_rate": 2.0158586490412436e-06, + "loss": 1.0899, + "step": 8125 + }, + { + "epoch": 1.1504211793020458, + "grad_norm": 9.290606063274302, + "learning_rate": 2.0152963131396068e-06, + "loss": 1.0653, + "step": 8126 + }, + { + "epoch": 1.150562752176683, + "grad_norm": 10.575052781570776, + "learning_rate": 2.0147340027184816e-06, + "loss": 0.9412, + "step": 8127 + }, + { + "epoch": 1.1507043250513203, + "grad_norm": 9.185759710416963, + "learning_rate": 2.014171717807429e-06, + "loss": 1.129, + "step": 8128 + }, + { + "epoch": 1.1508458979259575, + "grad_norm": 10.477013631576055, + "learning_rate": 2.013609458436006e-06, + "loss": 1.1289, + "step": 8129 + }, + { + "epoch": 1.1509874708005947, + "grad_norm": 11.419735312793279, + "learning_rate": 2.013047224633771e-06, + "loss": 1.1081, + "step": 8130 + }, + { + "epoch": 1.151129043675232, + "grad_norm": 7.320833853249558, + "learning_rate": 2.0124850164302805e-06, + "loss": 0.9368, + "step": 8131 + }, + { + "epoch": 1.151270616549869, + "grad_norm": 8.932787526475936, + "learning_rate": 2.0119228338550894e-06, + "loss": 1.057, + "step": 8132 + }, + { + "epoch": 1.1514121894245062, + "grad_norm": 9.652750411936315, + "learning_rate": 2.0113606769377497e-06, + "loss": 1.0295, + "step": 8133 + }, + { + "epoch": 1.1515537622991434, + "grad_norm": 8.514669438216016, + "learning_rate": 2.010798545707816e-06, + "loss": 1.1143, + "step": 8134 + }, + { + "epoch": 1.1516953351737806, + "grad_norm": 10.096614277719382, + "learning_rate": 2.0102364401948378e-06, + "loss": 1.0335, + "step": 8135 + }, + { + "epoch": 1.1518369080484179, + "grad_norm": 11.770738845490966, + "learning_rate": 2.009674360428365e-06, + "loss": 1.1081, + "step": 8136 + }, + { + "epoch": 1.151978480923055, + "grad_norm": 9.479871205287953, + "learning_rate": 2.009112306437945e-06, + "loss": 1.016, + "step": 8137 + }, + { + "epoch": 1.1521200537976923, + "grad_norm": 10.886039655218973, + "learning_rate": 2.008550278253127e-06, + "loss": 1.0603, + "step": 8138 + }, + { + "epoch": 1.1522616266723296, + "grad_norm": 10.827472177177764, + "learning_rate": 2.0079882759034517e-06, + "loss": 1.1815, + "step": 8139 + }, + { + "epoch": 1.1524031995469668, + "grad_norm": 10.517367577105647, + "learning_rate": 2.007426299418467e-06, + "loss": 1.112, + "step": 8140 + }, + { + "epoch": 1.152544772421604, + "grad_norm": 10.999976765001206, + "learning_rate": 2.0068643488277147e-06, + "loss": 1.0547, + "step": 8141 + }, + { + "epoch": 1.1526863452962413, + "grad_norm": 9.282788287267488, + "learning_rate": 2.0063024241607356e-06, + "loss": 0.9826, + "step": 8142 + }, + { + "epoch": 1.1528279181708785, + "grad_norm": 11.79943835974699, + "learning_rate": 
2.00574052544707e-06, + "loss": 1.0112, + "step": 8143 + }, + { + "epoch": 1.1529694910455157, + "grad_norm": 11.938208753706325, + "learning_rate": 2.005178652716256e-06, + "loss": 1.0901, + "step": 8144 + }, + { + "epoch": 1.153111063920153, + "grad_norm": 7.917425286603298, + "learning_rate": 2.004616805997832e-06, + "loss": 0.88, + "step": 8145 + }, + { + "epoch": 1.1532526367947902, + "grad_norm": 8.715347993592342, + "learning_rate": 2.0040549853213326e-06, + "loss": 1.0154, + "step": 8146 + }, + { + "epoch": 1.1533942096694274, + "grad_norm": 11.061114380692997, + "learning_rate": 2.003493190716293e-06, + "loss": 1.1288, + "step": 8147 + }, + { + "epoch": 1.1535357825440646, + "grad_norm": 7.1519365262451435, + "learning_rate": 2.0029314222122463e-06, + "loss": 0.9903, + "step": 8148 + }, + { + "epoch": 1.1536773554187019, + "grad_norm": 9.228278560116479, + "learning_rate": 2.0023696798387247e-06, + "loss": 0.9655, + "step": 8149 + }, + { + "epoch": 1.1538189282933389, + "grad_norm": 9.402343344282293, + "learning_rate": 2.001807963625257e-06, + "loss": 0.9923, + "step": 8150 + }, + { + "epoch": 1.153960501167976, + "grad_norm": 8.887664355597252, + "learning_rate": 2.0012462736013735e-06, + "loss": 1.0333, + "step": 8151 + }, + { + "epoch": 1.1541020740426133, + "grad_norm": 9.05291030202572, + "learning_rate": 2.0006846097966016e-06, + "loss": 0.917, + "step": 8152 + }, + { + "epoch": 1.1542436469172506, + "grad_norm": 10.581778905259961, + "learning_rate": 2.000122972240467e-06, + "loss": 1.0396, + "step": 8153 + }, + { + "epoch": 1.1543852197918878, + "grad_norm": 10.856446674082964, + "learning_rate": 1.9995613609624957e-06, + "loss": 1.0469, + "step": 8154 + }, + { + "epoch": 1.154526792666525, + "grad_norm": 9.356366165856798, + "learning_rate": 1.9989997759922093e-06, + "loss": 1.0474, + "step": 8155 + }, + { + "epoch": 1.1546683655411623, + "grad_norm": 9.25391696785998, + "learning_rate": 1.998438217359132e-06, + "loss": 1.1443, + "step": 8156 + }, + { + "epoch": 1.1548099384157995, + "grad_norm": 9.756099651382966, + "learning_rate": 1.997876685092784e-06, + "loss": 0.9722, + "step": 8157 + }, + { + "epoch": 1.1549515112904367, + "grad_norm": 10.962179884374404, + "learning_rate": 1.9973151792226837e-06, + "loss": 1.034, + "step": 8158 + }, + { + "epoch": 1.155093084165074, + "grad_norm": 7.806330327520931, + "learning_rate": 1.9967536997783495e-06, + "loss": 0.8793, + "step": 8159 + }, + { + "epoch": 1.1552346570397112, + "grad_norm": 10.1640486134369, + "learning_rate": 1.9961922467892997e-06, + "loss": 0.9333, + "step": 8160 + }, + { + "epoch": 1.1553762299143484, + "grad_norm": 9.569250180198576, + "learning_rate": 1.9956308202850456e-06, + "loss": 1.0597, + "step": 8161 + }, + { + "epoch": 1.1555178027889856, + "grad_norm": 9.741296649450609, + "learning_rate": 1.9950694202951044e-06, + "loss": 1.0386, + "step": 8162 + }, + { + "epoch": 1.1556593756636229, + "grad_norm": 9.58523111764204, + "learning_rate": 1.994508046848987e-06, + "loss": 0.8913, + "step": 8163 + }, + { + "epoch": 1.15580094853826, + "grad_norm": 8.838468774252558, + "learning_rate": 1.9939466999762044e-06, + "loss": 1.1497, + "step": 8164 + }, + { + "epoch": 1.1559425214128973, + "grad_norm": 8.207762361086864, + "learning_rate": 1.993385379706267e-06, + "loss": 0.9558, + "step": 8165 + }, + { + "epoch": 1.1560840942875346, + "grad_norm": 9.103413589255998, + "learning_rate": 1.9928240860686822e-06, + "loss": 0.9657, + "step": 8166 + }, + { + "epoch": 1.1562256671621718, + "grad_norm": 
8.489973100635344, + "learning_rate": 1.9922628190929567e-06, + "loss": 1.0642, + "step": 8167 + }, + { + "epoch": 1.156367240036809, + "grad_norm": 9.767981356341341, + "learning_rate": 1.9917015788085962e-06, + "loss": 0.9958, + "step": 8168 + }, + { + "epoch": 1.1565088129114462, + "grad_norm": 11.305075553766423, + "learning_rate": 1.991140365245105e-06, + "loss": 0.9525, + "step": 8169 + }, + { + "epoch": 1.1566503857860835, + "grad_norm": 8.633453649126814, + "learning_rate": 1.990579178431986e-06, + "loss": 0.9846, + "step": 8170 + }, + { + "epoch": 1.1567919586607207, + "grad_norm": 9.214477790528795, + "learning_rate": 1.990018018398739e-06, + "loss": 1.024, + "step": 8171 + }, + { + "epoch": 1.156933531535358, + "grad_norm": 11.652287441198174, + "learning_rate": 1.989456885174865e-06, + "loss": 1.151, + "step": 8172 + }, + { + "epoch": 1.157075104409995, + "grad_norm": 8.892464660282604, + "learning_rate": 1.988895778789861e-06, + "loss": 1.0767, + "step": 8173 + }, + { + "epoch": 1.1572166772846322, + "grad_norm": 10.10388543960008, + "learning_rate": 1.9883346992732256e-06, + "loss": 1.1081, + "step": 8174 + }, + { + "epoch": 1.1573582501592694, + "grad_norm": 9.928377006485508, + "learning_rate": 1.987773646654453e-06, + "loss": 1.0217, + "step": 8175 + }, + { + "epoch": 1.1574998230339066, + "grad_norm": 8.066123916820514, + "learning_rate": 1.987212620963038e-06, + "loss": 0.9915, + "step": 8176 + }, + { + "epoch": 1.1576413959085439, + "grad_norm": 8.719700259035392, + "learning_rate": 1.9866516222284736e-06, + "loss": 0.957, + "step": 8177 + }, + { + "epoch": 1.157782968783181, + "grad_norm": 9.732962957503172, + "learning_rate": 1.9860906504802496e-06, + "loss": 1.0144, + "step": 8178 + }, + { + "epoch": 1.1579245416578183, + "grad_norm": 8.930663154814743, + "learning_rate": 1.985529705747858e-06, + "loss": 1.051, + "step": 8179 + }, + { + "epoch": 1.1580661145324556, + "grad_norm": 8.170725637682926, + "learning_rate": 1.9849687880607855e-06, + "loss": 1.0091, + "step": 8180 + }, + { + "epoch": 1.1582076874070928, + "grad_norm": 8.84925266709643, + "learning_rate": 1.984407897448521e-06, + "loss": 1.0475, + "step": 8181 + }, + { + "epoch": 1.15834926028173, + "grad_norm": 9.29124164607097, + "learning_rate": 1.983847033940548e-06, + "loss": 1.0372, + "step": 8182 + }, + { + "epoch": 1.1584908331563673, + "grad_norm": 9.621220230466847, + "learning_rate": 1.9832861975663516e-06, + "loss": 1.26, + "step": 8183 + }, + { + "epoch": 1.1586324060310045, + "grad_norm": 8.51372676305718, + "learning_rate": 1.982725388355414e-06, + "loss": 1.0401, + "step": 8184 + }, + { + "epoch": 1.1587739789056417, + "grad_norm": 7.903960970901932, + "learning_rate": 1.9821646063372174e-06, + "loss": 0.9476, + "step": 8185 + }, + { + "epoch": 1.158915551780279, + "grad_norm": 9.918419520083187, + "learning_rate": 1.9816038515412412e-06, + "loss": 1.0303, + "step": 8186 + }, + { + "epoch": 1.1590571246549162, + "grad_norm": 8.684948443587528, + "learning_rate": 1.9810431239969646e-06, + "loss": 1.0609, + "step": 8187 + }, + { + "epoch": 1.1591986975295534, + "grad_norm": 8.763155393809622, + "learning_rate": 1.9804824237338636e-06, + "loss": 0.9426, + "step": 8188 + }, + { + "epoch": 1.1593402704041906, + "grad_norm": 7.973989641529681, + "learning_rate": 1.9799217507814144e-06, + "loss": 0.9821, + "step": 8189 + }, + { + "epoch": 1.1594818432788279, + "grad_norm": 8.14394622823954, + "learning_rate": 1.9793611051690905e-06, + "loss": 0.99, + "step": 8190 + }, + { + "epoch": 
1.159623416153465, + "grad_norm": 10.333581818905186, + "learning_rate": 1.978800486926366e-06, + "loss": 0.926, + "step": 8191 + }, + { + "epoch": 1.159764989028102, + "grad_norm": 8.857447930984277, + "learning_rate": 1.9782398960827105e-06, + "loss": 0.999, + "step": 8192 + }, + { + "epoch": 1.1599065619027393, + "grad_norm": 8.968933316261358, + "learning_rate": 1.977679332667595e-06, + "loss": 1.0229, + "step": 8193 + }, + { + "epoch": 1.1600481347773766, + "grad_norm": 8.399780506717729, + "learning_rate": 1.9771187967104875e-06, + "loss": 1.0581, + "step": 8194 + }, + { + "epoch": 1.1601897076520138, + "grad_norm": 10.551125084063347, + "learning_rate": 1.9765582882408544e-06, + "loss": 1.0374, + "step": 8195 + }, + { + "epoch": 1.160331280526651, + "grad_norm": 8.358396554887188, + "learning_rate": 1.9759978072881623e-06, + "loss": 0.9252, + "step": 8196 + }, + { + "epoch": 1.1604728534012883, + "grad_norm": 9.42847420283139, + "learning_rate": 1.975437353881875e-06, + "loss": 1.0217, + "step": 8197 + }, + { + "epoch": 1.1606144262759255, + "grad_norm": 8.353202123603465, + "learning_rate": 1.9748769280514544e-06, + "loss": 1.0278, + "step": 8198 + }, + { + "epoch": 1.1607559991505627, + "grad_norm": 10.79610615931156, + "learning_rate": 1.9743165298263624e-06, + "loss": 1.1285, + "step": 8199 + }, + { + "epoch": 1.1608975720252, + "grad_norm": 9.563454175229719, + "learning_rate": 1.9737561592360583e-06, + "loss": 1.1077, + "step": 8200 + }, + { + "epoch": 1.1610391448998372, + "grad_norm": 8.543681829664713, + "learning_rate": 1.97319581631e-06, + "loss": 1.0679, + "step": 8201 + }, + { + "epoch": 1.1611807177744744, + "grad_norm": 11.307339834950854, + "learning_rate": 1.9726355010776466e-06, + "loss": 0.9842, + "step": 8202 + }, + { + "epoch": 1.1613222906491116, + "grad_norm": 7.635242524357803, + "learning_rate": 1.9720752135684505e-06, + "loss": 0.9919, + "step": 8203 + }, + { + "epoch": 1.1614638635237489, + "grad_norm": 9.283479878646077, + "learning_rate": 1.9715149538118667e-06, + "loss": 0.9529, + "step": 8204 + }, + { + "epoch": 1.161605436398386, + "grad_norm": 8.75077598400654, + "learning_rate": 1.970954721837348e-06, + "loss": 1.0419, + "step": 8205 + }, + { + "epoch": 1.1617470092730233, + "grad_norm": 9.449460162532104, + "learning_rate": 1.970394517674345e-06, + "loss": 1.1112, + "step": 8206 + }, + { + "epoch": 1.1618885821476606, + "grad_norm": 8.503376009824164, + "learning_rate": 1.9698343413523065e-06, + "loss": 0.918, + "step": 8207 + }, + { + "epoch": 1.1620301550222978, + "grad_norm": 8.992479785242127, + "learning_rate": 1.969274192900682e-06, + "loss": 1.017, + "step": 8208 + }, + { + "epoch": 1.162171727896935, + "grad_norm": 9.634526677999393, + "learning_rate": 1.9687140723489175e-06, + "loss": 1.0779, + "step": 8209 + }, + { + "epoch": 1.1623133007715722, + "grad_norm": 11.338430118786011, + "learning_rate": 1.9681539797264583e-06, + "loss": 0.98, + "step": 8210 + }, + { + "epoch": 1.1624548736462095, + "grad_norm": 9.068451709429235, + "learning_rate": 1.967593915062748e-06, + "loss": 1.0472, + "step": 8211 + }, + { + "epoch": 1.1625964465208467, + "grad_norm": 9.000028398257063, + "learning_rate": 1.9670338783872277e-06, + "loss": 0.9681, + "step": 8212 + }, + { + "epoch": 1.162738019395484, + "grad_norm": 8.961002206610319, + "learning_rate": 1.9664738697293404e-06, + "loss": 0.9444, + "step": 8213 + }, + { + "epoch": 1.1628795922701212, + "grad_norm": 10.005258322092283, + "learning_rate": 1.965913889118523e-06, + "loss": 1.0289, + "step": 8214 
+ }, + { + "epoch": 1.1630211651447582, + "grad_norm": 11.098524844462192, + "learning_rate": 1.9653539365842143e-06, + "loss": 1.1094, + "step": 8215 + }, + { + "epoch": 1.1631627380193954, + "grad_norm": 9.30545822269785, + "learning_rate": 1.9647940121558508e-06, + "loss": 0.998, + "step": 8216 + }, + { + "epoch": 1.1633043108940326, + "grad_norm": 10.213211285066905, + "learning_rate": 1.9642341158628665e-06, + "loss": 1.148, + "step": 8217 + }, + { + "epoch": 1.1634458837686699, + "grad_norm": 8.964799921597656, + "learning_rate": 1.963674247734696e-06, + "loss": 0.9944, + "step": 8218 + }, + { + "epoch": 1.163587456643307, + "grad_norm": 8.873143391437981, + "learning_rate": 1.96311440780077e-06, + "loss": 1.1245, + "step": 8219 + }, + { + "epoch": 1.1637290295179443, + "grad_norm": 9.669562684262551, + "learning_rate": 1.9625545960905187e-06, + "loss": 1.0586, + "step": 8220 + }, + { + "epoch": 1.1638706023925816, + "grad_norm": 10.023129227284059, + "learning_rate": 1.961994812633372e-06, + "loss": 1.049, + "step": 8221 + }, + { + "epoch": 1.1640121752672188, + "grad_norm": 9.857345831698467, + "learning_rate": 1.961435057458757e-06, + "loss": 1.0398, + "step": 8222 + }, + { + "epoch": 1.164153748141856, + "grad_norm": 8.250859996014242, + "learning_rate": 1.9608753305960997e-06, + "loss": 1.0934, + "step": 8223 + }, + { + "epoch": 1.1642953210164932, + "grad_norm": 8.52998404140254, + "learning_rate": 1.960315632074824e-06, + "loss": 1.0028, + "step": 8224 + }, + { + "epoch": 1.1644368938911305, + "grad_norm": 9.504587370912459, + "learning_rate": 1.9597559619243527e-06, + "loss": 1.0834, + "step": 8225 + }, + { + "epoch": 1.1645784667657677, + "grad_norm": 8.651360518343886, + "learning_rate": 1.959196320174108e-06, + "loss": 1.0145, + "step": 8226 + }, + { + "epoch": 1.164720039640405, + "grad_norm": 9.41574519777349, + "learning_rate": 1.95863670685351e-06, + "loss": 1.0803, + "step": 8227 + }, + { + "epoch": 1.1648616125150422, + "grad_norm": 9.788829462758894, + "learning_rate": 1.958077121991976e-06, + "loss": 1.0567, + "step": 8228 + }, + { + "epoch": 1.1650031853896794, + "grad_norm": 10.22073055754249, + "learning_rate": 1.9575175656189236e-06, + "loss": 1.0336, + "step": 8229 + }, + { + "epoch": 1.1651447582643166, + "grad_norm": 8.063585311823438, + "learning_rate": 1.9569580377637677e-06, + "loss": 0.997, + "step": 8230 + }, + { + "epoch": 1.1652863311389539, + "grad_norm": 10.025624441738147, + "learning_rate": 1.956398538455924e-06, + "loss": 1.0174, + "step": 8231 + }, + { + "epoch": 1.165427904013591, + "grad_norm": 9.388625352560522, + "learning_rate": 1.955839067724803e-06, + "loss": 1.0907, + "step": 8232 + }, + { + "epoch": 1.165569476888228, + "grad_norm": 9.138107709597596, + "learning_rate": 1.9552796255998173e-06, + "loss": 0.9516, + "step": 8233 + }, + { + "epoch": 1.1657110497628653, + "grad_norm": 8.052065699180911, + "learning_rate": 1.9547202121103757e-06, + "loss": 1.0289, + "step": 8234 + }, + { + "epoch": 1.1658526226375026, + "grad_norm": 8.092969937825716, + "learning_rate": 1.9541608272858856e-06, + "loss": 1.0886, + "step": 8235 + }, + { + "epoch": 1.1659941955121398, + "grad_norm": 10.15959267408013, + "learning_rate": 1.953601471155753e-06, + "loss": 1.1149, + "step": 8236 + }, + { + "epoch": 1.166135768386777, + "grad_norm": 10.101014160001306, + "learning_rate": 1.9530421437493843e-06, + "loss": 0.9729, + "step": 8237 + }, + { + "epoch": 1.1662773412614142, + "grad_norm": 9.660260500347302, + "learning_rate": 1.952482845096182e-06, + "loss": 
1.002, + "step": 8238 + }, + { + "epoch": 1.1664189141360515, + "grad_norm": 9.55416247343783, + "learning_rate": 1.9519235752255487e-06, + "loss": 1.0966, + "step": 8239 + }, + { + "epoch": 1.1665604870106887, + "grad_norm": 8.363809881558959, + "learning_rate": 1.951364334166884e-06, + "loss": 0.9236, + "step": 8240 + }, + { + "epoch": 1.166702059885326, + "grad_norm": 9.819764382202491, + "learning_rate": 1.9508051219495877e-06, + "loss": 0.8954, + "step": 8241 + }, + { + "epoch": 1.1668436327599632, + "grad_norm": 9.370341046130646, + "learning_rate": 1.950245938603056e-06, + "loss": 1.1683, + "step": 8242 + }, + { + "epoch": 1.1669852056346004, + "grad_norm": 9.07307774385813, + "learning_rate": 1.949686784156686e-06, + "loss": 0.9998, + "step": 8243 + }, + { + "epoch": 1.1671267785092376, + "grad_norm": 9.877421794360659, + "learning_rate": 1.949127658639872e-06, + "loss": 1.0633, + "step": 8244 + }, + { + "epoch": 1.1672683513838749, + "grad_norm": 7.359038067857761, + "learning_rate": 1.948568562082005e-06, + "loss": 0.985, + "step": 8245 + }, + { + "epoch": 1.167409924258512, + "grad_norm": 8.076292084579979, + "learning_rate": 1.948009494512478e-06, + "loss": 0.934, + "step": 8246 + }, + { + "epoch": 1.1675514971331493, + "grad_norm": 10.808149399634713, + "learning_rate": 1.94745045596068e-06, + "loss": 1.0898, + "step": 8247 + }, + { + "epoch": 1.1676930700077865, + "grad_norm": 8.931381158419601, + "learning_rate": 1.9468914464559994e-06, + "loss": 1.0936, + "step": 8248 + }, + { + "epoch": 1.1678346428824238, + "grad_norm": 8.703561549307164, + "learning_rate": 1.9463324660278235e-06, + "loss": 1.0634, + "step": 8249 + }, + { + "epoch": 1.167976215757061, + "grad_norm": 10.220534609195054, + "learning_rate": 1.945773514705537e-06, + "loss": 1.0458, + "step": 8250 + }, + { + "epoch": 1.1681177886316982, + "grad_norm": 8.020000252664234, + "learning_rate": 1.9452145925185235e-06, + "loss": 1.1703, + "step": 8251 + }, + { + "epoch": 1.1682593615063355, + "grad_norm": 11.452819237264041, + "learning_rate": 1.9446556994961645e-06, + "loss": 1.0633, + "step": 8252 + }, + { + "epoch": 1.1684009343809727, + "grad_norm": 8.470229128786341, + "learning_rate": 1.944096835667842e-06, + "loss": 1.005, + "step": 8253 + }, + { + "epoch": 1.16854250725561, + "grad_norm": 9.479784688959104, + "learning_rate": 1.9435380010629343e-06, + "loss": 0.9811, + "step": 8254 + }, + { + "epoch": 1.1686840801302472, + "grad_norm": 8.797017233530422, + "learning_rate": 1.94297919571082e-06, + "loss": 0.9382, + "step": 8255 + }, + { + "epoch": 1.1688256530048842, + "grad_norm": 9.684815262434306, + "learning_rate": 1.942420419640873e-06, + "loss": 1.1246, + "step": 8256 + }, + { + "epoch": 1.1689672258795214, + "grad_norm": 8.037044586995854, + "learning_rate": 1.941861672882469e-06, + "loss": 0.9783, + "step": 8257 + }, + { + "epoch": 1.1691087987541586, + "grad_norm": 10.474114617778783, + "learning_rate": 1.9413029554649798e-06, + "loss": 1.067, + "step": 8258 + }, + { + "epoch": 1.1692503716287959, + "grad_norm": 10.112779188604904, + "learning_rate": 1.9407442674177783e-06, + "loss": 0.9603, + "step": 8259 + }, + { + "epoch": 1.169391944503433, + "grad_norm": 8.073101318872764, + "learning_rate": 1.9401856087702337e-06, + "loss": 0.9985, + "step": 8260 + }, + { + "epoch": 1.1695335173780703, + "grad_norm": 9.351506614677417, + "learning_rate": 1.9396269795517147e-06, + "loss": 1.0915, + "step": 8261 + }, + { + "epoch": 1.1696750902527075, + "grad_norm": 7.6305914217416095, + "learning_rate": 
1.939068379791587e-06, + "loss": 0.9656, + "step": 8262 + }, + { + "epoch": 1.1698166631273448, + "grad_norm": 9.132417773094057, + "learning_rate": 1.938509809519216e-06, + "loss": 1.0164, + "step": 8263 + }, + { + "epoch": 1.169958236001982, + "grad_norm": 11.329536549987383, + "learning_rate": 1.9379512687639663e-06, + "loss": 1.0681, + "step": 8264 + }, + { + "epoch": 1.1700998088766192, + "grad_norm": 16.62076371220508, + "learning_rate": 1.937392757555199e-06, + "loss": 0.9696, + "step": 8265 + }, + { + "epoch": 1.1702413817512565, + "grad_norm": 8.682007734434544, + "learning_rate": 1.936834275922276e-06, + "loss": 1.0164, + "step": 8266 + }, + { + "epoch": 1.1703829546258937, + "grad_norm": 7.334470429549334, + "learning_rate": 1.936275823894554e-06, + "loss": 0.9907, + "step": 8267 + }, + { + "epoch": 1.170524527500531, + "grad_norm": 8.534920709707624, + "learning_rate": 1.9357174015013917e-06, + "loss": 0.8614, + "step": 8268 + }, + { + "epoch": 1.1706661003751682, + "grad_norm": 8.875687693426915, + "learning_rate": 1.935159008772145e-06, + "loss": 1.0155, + "step": 8269 + }, + { + "epoch": 1.1708076732498054, + "grad_norm": 8.967858884982359, + "learning_rate": 1.9346006457361684e-06, + "loss": 1.0998, + "step": 8270 + }, + { + "epoch": 1.1709492461244426, + "grad_norm": 8.562670183926912, + "learning_rate": 1.9340423124228136e-06, + "loss": 1.0179, + "step": 8271 + }, + { + "epoch": 1.1710908189990799, + "grad_norm": 9.451200733901487, + "learning_rate": 1.9334840088614327e-06, + "loss": 1.0354, + "step": 8272 + }, + { + "epoch": 1.171232391873717, + "grad_norm": 9.87119499972806, + "learning_rate": 1.9329257350813753e-06, + "loss": 1.0118, + "step": 8273 + }, + { + "epoch": 1.171373964748354, + "grad_norm": 9.25532460967043, + "learning_rate": 1.932367491111989e-06, + "loss": 1.0498, + "step": 8274 + }, + { + "epoch": 1.1715155376229913, + "grad_norm": 8.689474855628593, + "learning_rate": 1.9318092769826197e-06, + "loss": 1.011, + "step": 8275 + }, + { + "epoch": 1.1716571104976286, + "grad_norm": 8.906216591220234, + "learning_rate": 1.931251092722615e-06, + "loss": 1.0057, + "step": 8276 + }, + { + "epoch": 1.1717986833722658, + "grad_norm": 11.223931521260512, + "learning_rate": 1.930692938361315e-06, + "loss": 0.9597, + "step": 8277 + }, + { + "epoch": 1.171940256246903, + "grad_norm": 9.329796620530844, + "learning_rate": 1.930134813928063e-06, + "loss": 1.0189, + "step": 8278 + }, + { + "epoch": 1.1720818291215402, + "grad_norm": 10.294401065547225, + "learning_rate": 1.9295767194521988e-06, + "loss": 1.0459, + "step": 8279 + }, + { + "epoch": 1.1722234019961775, + "grad_norm": 10.893253485372588, + "learning_rate": 1.9290186549630606e-06, + "loss": 1.003, + "step": 8280 + }, + { + "epoch": 1.1723649748708147, + "grad_norm": 9.257567738061908, + "learning_rate": 1.9284606204899862e-06, + "loss": 1.104, + "step": 8281 + }, + { + "epoch": 1.172506547745452, + "grad_norm": 8.310958375167663, + "learning_rate": 1.927902616062311e-06, + "loss": 1.0259, + "step": 8282 + }, + { + "epoch": 1.1726481206200892, + "grad_norm": 12.904545040986287, + "learning_rate": 1.9273446417093687e-06, + "loss": 0.8959, + "step": 8283 + }, + { + "epoch": 1.1727896934947264, + "grad_norm": 8.514201697998422, + "learning_rate": 1.9267866974604914e-06, + "loss": 1.003, + "step": 8284 + }, + { + "epoch": 1.1729312663693636, + "grad_norm": 10.113325759123764, + "learning_rate": 1.9262287833450107e-06, + "loss": 1.0643, + "step": 8285 + }, + { + "epoch": 1.1730728392440009, + "grad_norm": 
8.731235547282465, + "learning_rate": 1.9256708993922542e-06, + "loss": 0.939, + "step": 8286 + }, + { + "epoch": 1.173214412118638, + "grad_norm": 10.466396511341292, + "learning_rate": 1.9251130456315514e-06, + "loss": 0.9995, + "step": 8287 + }, + { + "epoch": 1.1733559849932753, + "grad_norm": 11.27555386352259, + "learning_rate": 1.9245552220922264e-06, + "loss": 1.0218, + "step": 8288 + }, + { + "epoch": 1.1734975578679125, + "grad_norm": 9.381414634891575, + "learning_rate": 1.9239974288036044e-06, + "loss": 1.132, + "step": 8289 + }, + { + "epoch": 1.1736391307425498, + "grad_norm": 8.518131888026646, + "learning_rate": 1.9234396657950076e-06, + "loss": 0.9822, + "step": 8290 + }, + { + "epoch": 1.173780703617187, + "grad_norm": 9.072841032277191, + "learning_rate": 1.922881933095758e-06, + "loss": 1.1083, + "step": 8291 + }, + { + "epoch": 1.1739222764918242, + "grad_norm": 9.200439475799596, + "learning_rate": 1.9223242307351753e-06, + "loss": 1.009, + "step": 8292 + }, + { + "epoch": 1.1740638493664615, + "grad_norm": 9.066936380451141, + "learning_rate": 1.9217665587425764e-06, + "loss": 1.0801, + "step": 8293 + }, + { + "epoch": 1.1742054222410987, + "grad_norm": 8.946575118997396, + "learning_rate": 1.9212089171472787e-06, + "loss": 1.1177, + "step": 8294 + }, + { + "epoch": 1.174346995115736, + "grad_norm": 11.718400385409838, + "learning_rate": 1.9206513059785966e-06, + "loss": 1.0042, + "step": 8295 + }, + { + "epoch": 1.1744885679903732, + "grad_norm": 11.230239467768172, + "learning_rate": 1.9200937252658435e-06, + "loss": 1.0636, + "step": 8296 + }, + { + "epoch": 1.1746301408650104, + "grad_norm": 10.215910257421768, + "learning_rate": 1.9195361750383312e-06, + "loss": 1.0209, + "step": 8297 + }, + { + "epoch": 1.1747717137396474, + "grad_norm": 8.457402019874735, + "learning_rate": 1.918978655325369e-06, + "loss": 0.9838, + "step": 8298 + }, + { + "epoch": 1.1749132866142846, + "grad_norm": 10.46016329577681, + "learning_rate": 1.9184211661562653e-06, + "loss": 0.9851, + "step": 8299 + }, + { + "epoch": 1.1750548594889219, + "grad_norm": 9.749591134130439, + "learning_rate": 1.9178637075603276e-06, + "loss": 1.1274, + "step": 8300 + }, + { + "epoch": 1.175196432363559, + "grad_norm": 10.010009715011714, + "learning_rate": 1.9173062795668606e-06, + "loss": 1.0226, + "step": 8301 + }, + { + "epoch": 1.1753380052381963, + "grad_norm": 12.891804515164917, + "learning_rate": 1.916748882205168e-06, + "loss": 1.0584, + "step": 8302 + }, + { + "epoch": 1.1754795781128335, + "grad_norm": 9.923102163283856, + "learning_rate": 1.916191515504552e-06, + "loss": 0.9875, + "step": 8303 + }, + { + "epoch": 1.1756211509874708, + "grad_norm": 9.292627213669263, + "learning_rate": 1.915634179494312e-06, + "loss": 0.9438, + "step": 8304 + }, + { + "epoch": 1.175762723862108, + "grad_norm": 10.919043090589952, + "learning_rate": 1.9150768742037477e-06, + "loss": 1.0707, + "step": 8305 + }, + { + "epoch": 1.1759042967367452, + "grad_norm": 6.787656227969401, + "learning_rate": 1.9145195996621567e-06, + "loss": 1.0433, + "step": 8306 + }, + { + "epoch": 1.1760458696113825, + "grad_norm": 8.071584861582908, + "learning_rate": 1.9139623558988334e-06, + "loss": 0.9602, + "step": 8307 + }, + { + "epoch": 1.1761874424860197, + "grad_norm": 8.913886186358864, + "learning_rate": 1.913405142943073e-06, + "loss": 0.9386, + "step": 8308 + }, + { + "epoch": 1.176329015360657, + "grad_norm": 8.853715330102068, + "learning_rate": 1.9128479608241656e-06, + "loss": 1.0128, + "step": 8309 + }, + { + 
"epoch": 1.1764705882352942, + "grad_norm": 9.302941658999167, + "learning_rate": 1.9122908095714032e-06, + "loss": 1.0369, + "step": 8310 + }, + { + "epoch": 1.1766121611099314, + "grad_norm": 9.123405003608207, + "learning_rate": 1.911733689214075e-06, + "loss": 1.0642, + "step": 8311 + }, + { + "epoch": 1.1767537339845686, + "grad_norm": 8.592442638482114, + "learning_rate": 1.911176599781468e-06, + "loss": 1.0919, + "step": 8312 + }, + { + "epoch": 1.1768953068592058, + "grad_norm": 9.676716370839836, + "learning_rate": 1.910619541302868e-06, + "loss": 1.0415, + "step": 8313 + }, + { + "epoch": 1.177036879733843, + "grad_norm": 9.64569403683055, + "learning_rate": 1.9100625138075595e-06, + "loss": 1.1778, + "step": 8314 + }, + { + "epoch": 1.1771784526084803, + "grad_norm": 9.321638500603717, + "learning_rate": 1.909505517324825e-06, + "loss": 1.0323, + "step": 8315 + }, + { + "epoch": 1.1773200254831173, + "grad_norm": 7.659545189325814, + "learning_rate": 1.9089485518839446e-06, + "loss": 1.0098, + "step": 8316 + }, + { + "epoch": 1.1774615983577545, + "grad_norm": 9.160284930466094, + "learning_rate": 1.9083916175141983e-06, + "loss": 0.9559, + "step": 8317 + }, + { + "epoch": 1.1776031712323918, + "grad_norm": 8.330857277819518, + "learning_rate": 1.9078347142448638e-06, + "loss": 1.0233, + "step": 8318 + }, + { + "epoch": 1.177744744107029, + "grad_norm": 10.933505827620763, + "learning_rate": 1.9072778421052172e-06, + "loss": 1.0061, + "step": 8319 + }, + { + "epoch": 1.1778863169816662, + "grad_norm": 8.950870782697887, + "learning_rate": 1.9067210011245318e-06, + "loss": 1.1157, + "step": 8320 + }, + { + "epoch": 1.1780278898563035, + "grad_norm": 10.323134239411127, + "learning_rate": 1.906164191332081e-06, + "loss": 1.0099, + "step": 8321 + }, + { + "epoch": 1.1781694627309407, + "grad_norm": 8.775123930053237, + "learning_rate": 1.905607412757136e-06, + "loss": 1.0073, + "step": 8322 + }, + { + "epoch": 1.178311035605578, + "grad_norm": 10.135748162227861, + "learning_rate": 1.9050506654289663e-06, + "loss": 1.0394, + "step": 8323 + }, + { + "epoch": 1.1784526084802152, + "grad_norm": 9.710812188074218, + "learning_rate": 1.9044939493768394e-06, + "loss": 1.175, + "step": 8324 + }, + { + "epoch": 1.1785941813548524, + "grad_norm": 8.534961829164379, + "learning_rate": 1.9039372646300216e-06, + "loss": 1.0643, + "step": 8325 + }, + { + "epoch": 1.1787357542294896, + "grad_norm": 9.561868484687343, + "learning_rate": 1.9033806112177772e-06, + "loss": 1.0531, + "step": 8326 + }, + { + "epoch": 1.1788773271041268, + "grad_norm": 10.71471308991151, + "learning_rate": 1.902823989169369e-06, + "loss": 1.0761, + "step": 8327 + }, + { + "epoch": 1.179018899978764, + "grad_norm": 9.73037622898607, + "learning_rate": 1.9022673985140585e-06, + "loss": 1.034, + "step": 8328 + }, + { + "epoch": 1.1791604728534013, + "grad_norm": 10.13175757688071, + "learning_rate": 1.9017108392811065e-06, + "loss": 1.068, + "step": 8329 + }, + { + "epoch": 1.1793020457280385, + "grad_norm": 8.062472084647652, + "learning_rate": 1.9011543114997684e-06, + "loss": 1.0828, + "step": 8330 + }, + { + "epoch": 1.1794436186026758, + "grad_norm": 9.247995932739842, + "learning_rate": 1.9005978151993014e-06, + "loss": 0.9995, + "step": 8331 + }, + { + "epoch": 1.179585191477313, + "grad_norm": 11.0463366343576, + "learning_rate": 1.9000413504089607e-06, + "loss": 1.1252, + "step": 8332 + }, + { + "epoch": 1.1797267643519502, + "grad_norm": 8.668194073795105, + "learning_rate": 1.8994849171579981e-06, + "loss": 
1.071, + "step": 8333 + }, + { + "epoch": 1.1798683372265875, + "grad_norm": 10.698599779010584, + "learning_rate": 1.8989285154756665e-06, + "loss": 1.1295, + "step": 8334 + }, + { + "epoch": 1.1800099101012247, + "grad_norm": 10.118423398209313, + "learning_rate": 1.8983721453912146e-06, + "loss": 1.1028, + "step": 8335 + }, + { + "epoch": 1.180151482975862, + "grad_norm": 8.915617503775765, + "learning_rate": 1.89781580693389e-06, + "loss": 0.8722, + "step": 8336 + }, + { + "epoch": 1.1802930558504992, + "grad_norm": 6.772450728295297, + "learning_rate": 1.8972595001329398e-06, + "loss": 0.9287, + "step": 8337 + }, + { + "epoch": 1.1804346287251364, + "grad_norm": 8.475731256045524, + "learning_rate": 1.8967032250176083e-06, + "loss": 0.9448, + "step": 8338 + }, + { + "epoch": 1.1805762015997734, + "grad_norm": 8.95460846812315, + "learning_rate": 1.8961469816171383e-06, + "loss": 1.0601, + "step": 8339 + }, + { + "epoch": 1.1807177744744106, + "grad_norm": 8.223276494570296, + "learning_rate": 1.8955907699607717e-06, + "loss": 0.999, + "step": 8340 + }, + { + "epoch": 1.1808593473490478, + "grad_norm": 9.896891219496498, + "learning_rate": 1.895034590077747e-06, + "loss": 1.1173, + "step": 8341 + }, + { + "epoch": 1.181000920223685, + "grad_norm": 7.756800467409932, + "learning_rate": 1.894478441997303e-06, + "loss": 0.9485, + "step": 8342 + }, + { + "epoch": 1.1811424930983223, + "grad_norm": 8.839752695758404, + "learning_rate": 1.8939223257486759e-06, + "loss": 0.9065, + "step": 8343 + }, + { + "epoch": 1.1812840659729595, + "grad_norm": 8.829082929914113, + "learning_rate": 1.8933662413611e-06, + "loss": 0.9803, + "step": 8344 + }, + { + "epoch": 1.1814256388475968, + "grad_norm": 9.662650446935766, + "learning_rate": 1.8928101888638087e-06, + "loss": 0.9456, + "step": 8345 + }, + { + "epoch": 1.181567211722234, + "grad_norm": 9.981907690423546, + "learning_rate": 1.892254168286033e-06, + "loss": 0.9484, + "step": 8346 + }, + { + "epoch": 1.1817087845968712, + "grad_norm": 8.86293477658523, + "learning_rate": 1.8916981796570023e-06, + "loss": 0.9827, + "step": 8347 + }, + { + "epoch": 1.1818503574715085, + "grad_norm": 8.293653189490085, + "learning_rate": 1.8911422230059448e-06, + "loss": 0.9345, + "step": 8348 + }, + { + "epoch": 1.1819919303461457, + "grad_norm": 9.77966685782791, + "learning_rate": 1.8905862983620863e-06, + "loss": 1.0124, + "step": 8349 + }, + { + "epoch": 1.182133503220783, + "grad_norm": 10.524728991718556, + "learning_rate": 1.8900304057546532e-06, + "loss": 1.0671, + "step": 8350 + }, + { + "epoch": 1.1822750760954202, + "grad_norm": 9.560883828012654, + "learning_rate": 1.8894745452128657e-06, + "loss": 0.9069, + "step": 8351 + }, + { + "epoch": 1.1824166489700574, + "grad_norm": 9.5882930581794, + "learning_rate": 1.8889187167659462e-06, + "loss": 1.1039, + "step": 8352 + }, + { + "epoch": 1.1825582218446946, + "grad_norm": 9.810993443387444, + "learning_rate": 1.888362920443114e-06, + "loss": 1.0151, + "step": 8353 + }, + { + "epoch": 1.1826997947193318, + "grad_norm": 10.264292730686828, + "learning_rate": 1.8878071562735873e-06, + "loss": 0.9497, + "step": 8354 + }, + { + "epoch": 1.182841367593969, + "grad_norm": 8.313834864771575, + "learning_rate": 1.887251424286581e-06, + "loss": 0.9836, + "step": 8355 + }, + { + "epoch": 1.1829829404686063, + "grad_norm": 8.42169608242812, + "learning_rate": 1.8866957245113113e-06, + "loss": 1.072, + "step": 8356 + }, + { + "epoch": 1.1831245133432433, + "grad_norm": 8.048861064070552, + "learning_rate": 
1.88614005697699e-06, + "loss": 0.9713, + "step": 8357 + }, + { + "epoch": 1.1832660862178805, + "grad_norm": 8.750900658166653, + "learning_rate": 1.8855844217128281e-06, + "loss": 0.999, + "step": 8358 + }, + { + "epoch": 1.1834076590925178, + "grad_norm": 12.41614205363702, + "learning_rate": 1.885028818748035e-06, + "loss": 1.0176, + "step": 8359 + }, + { + "epoch": 1.183549231967155, + "grad_norm": 8.357002163961491, + "learning_rate": 1.8844732481118184e-06, + "loss": 1.097, + "step": 8360 + }, + { + "epoch": 1.1836908048417922, + "grad_norm": 9.426303714669467, + "learning_rate": 1.8839177098333856e-06, + "loss": 0.9959, + "step": 8361 + }, + { + "epoch": 1.1838323777164295, + "grad_norm": 9.17443042619102, + "learning_rate": 1.8833622039419371e-06, + "loss": 1.0802, + "step": 8362 + }, + { + "epoch": 1.1839739505910667, + "grad_norm": 9.988328225715698, + "learning_rate": 1.8828067304666788e-06, + "loss": 1.1025, + "step": 8363 + }, + { + "epoch": 1.184115523465704, + "grad_norm": 9.561505035298525, + "learning_rate": 1.8822512894368106e-06, + "loss": 0.936, + "step": 8364 + }, + { + "epoch": 1.1842570963403412, + "grad_norm": 9.499761076482603, + "learning_rate": 1.8816958808815311e-06, + "loss": 0.9903, + "step": 8365 + }, + { + "epoch": 1.1843986692149784, + "grad_norm": 7.860068076423742, + "learning_rate": 1.8811405048300383e-06, + "loss": 1.0492, + "step": 8366 + }, + { + "epoch": 1.1845402420896156, + "grad_norm": 8.479740007841393, + "learning_rate": 1.8805851613115278e-06, + "loss": 1.0333, + "step": 8367 + }, + { + "epoch": 1.1846818149642528, + "grad_norm": 9.510103173884586, + "learning_rate": 1.8800298503551934e-06, + "loss": 1.0087, + "step": 8368 + }, + { + "epoch": 1.18482338783889, + "grad_norm": 11.688785257915557, + "learning_rate": 1.8794745719902274e-06, + "loss": 1.0439, + "step": 8369 + }, + { + "epoch": 1.1849649607135273, + "grad_norm": 8.970360588047162, + "learning_rate": 1.8789193262458205e-06, + "loss": 0.9948, + "step": 8370 + }, + { + "epoch": 1.1851065335881645, + "grad_norm": 8.033033358511892, + "learning_rate": 1.8783641131511624e-06, + "loss": 1.0166, + "step": 8371 + }, + { + "epoch": 1.1852481064628018, + "grad_norm": 10.647636086749014, + "learning_rate": 1.8778089327354385e-06, + "loss": 1.0928, + "step": 8372 + }, + { + "epoch": 1.185389679337439, + "grad_norm": 9.894013461703551, + "learning_rate": 1.8772537850278352e-06, + "loss": 1.0793, + "step": 8373 + }, + { + "epoch": 1.1855312522120762, + "grad_norm": 9.6374740026299, + "learning_rate": 1.876698670057536e-06, + "loss": 0.9504, + "step": 8374 + }, + { + "epoch": 1.1856728250867135, + "grad_norm": 9.380184113539896, + "learning_rate": 1.876143587853723e-06, + "loss": 1.0639, + "step": 8375 + }, + { + "epoch": 1.1858143979613507, + "grad_norm": 10.188881622426493, + "learning_rate": 1.8755885384455764e-06, + "loss": 1.044, + "step": 8376 + }, + { + "epoch": 1.185955970835988, + "grad_norm": 7.297895143741977, + "learning_rate": 1.8750335218622749e-06, + "loss": 1.0381, + "step": 8377 + }, + { + "epoch": 1.1860975437106251, + "grad_norm": 9.19722228203049, + "learning_rate": 1.8744785381329944e-06, + "loss": 1.1543, + "step": 8378 + }, + { + "epoch": 1.1862391165852624, + "grad_norm": 7.208429530683373, + "learning_rate": 1.8739235872869113e-06, + "loss": 0.8362, + "step": 8379 + }, + { + "epoch": 1.1863806894598996, + "grad_norm": 9.043787085003007, + "learning_rate": 1.8733686693531986e-06, + "loss": 0.9393, + "step": 8380 + }, + { + "epoch": 1.1865222623345366, + "grad_norm": 
6.612694594290839, + "learning_rate": 1.8728137843610276e-06, + "loss": 0.9442, + "step": 8381 + }, + { + "epoch": 1.1866638352091738, + "grad_norm": 9.276539027598947, + "learning_rate": 1.8722589323395693e-06, + "loss": 1.0456, + "step": 8382 + }, + { + "epoch": 1.186805408083811, + "grad_norm": 9.520242907847155, + "learning_rate": 1.8717041133179897e-06, + "loss": 0.9579, + "step": 8383 + }, + { + "epoch": 1.1869469809584483, + "grad_norm": 10.348520337176002, + "learning_rate": 1.871149327325456e-06, + "loss": 0.9645, + "step": 8384 + }, + { + "epoch": 1.1870885538330855, + "grad_norm": 9.312432000852828, + "learning_rate": 1.8705945743911341e-06, + "loss": 1.0256, + "step": 8385 + }, + { + "epoch": 1.1872301267077228, + "grad_norm": 9.060814852080988, + "learning_rate": 1.8700398545441857e-06, + "loss": 1.1009, + "step": 8386 + }, + { + "epoch": 1.18737169958236, + "grad_norm": 8.665480556996906, + "learning_rate": 1.8694851678137726e-06, + "loss": 1.0255, + "step": 8387 + }, + { + "epoch": 1.1875132724569972, + "grad_norm": 8.780947642703847, + "learning_rate": 1.868930514229054e-06, + "loss": 0.9973, + "step": 8388 + }, + { + "epoch": 1.1876548453316345, + "grad_norm": 7.6794806110071, + "learning_rate": 1.8683758938191877e-06, + "loss": 1.0364, + "step": 8389 + }, + { + "epoch": 1.1877964182062717, + "grad_norm": 11.074094396898817, + "learning_rate": 1.86782130661333e-06, + "loss": 1.0479, + "step": 8390 + }, + { + "epoch": 1.187937991080909, + "grad_norm": 11.867593804762773, + "learning_rate": 1.8672667526406345e-06, + "loss": 0.9321, + "step": 8391 + }, + { + "epoch": 1.1880795639555461, + "grad_norm": 7.542829420402081, + "learning_rate": 1.8667122319302542e-06, + "loss": 0.857, + "step": 8392 + }, + { + "epoch": 1.1882211368301834, + "grad_norm": 9.209379341013143, + "learning_rate": 1.8661577445113399e-06, + "loss": 1.1485, + "step": 8393 + }, + { + "epoch": 1.1883627097048206, + "grad_norm": 9.677958065338634, + "learning_rate": 1.8656032904130402e-06, + "loss": 0.9217, + "step": 8394 + }, + { + "epoch": 1.1885042825794578, + "grad_norm": 9.22545770269523, + "learning_rate": 1.8650488696645025e-06, + "loss": 1.0739, + "step": 8395 + }, + { + "epoch": 1.188645855454095, + "grad_norm": 8.190870683195293, + "learning_rate": 1.864494482294872e-06, + "loss": 1.1016, + "step": 8396 + }, + { + "epoch": 1.1887874283287323, + "grad_norm": 8.698054865730937, + "learning_rate": 1.863940128333293e-06, + "loss": 0.9625, + "step": 8397 + }, + { + "epoch": 1.1889290012033695, + "grad_norm": 8.979196877366038, + "learning_rate": 1.863385807808907e-06, + "loss": 1.0781, + "step": 8398 + }, + { + "epoch": 1.1890705740780065, + "grad_norm": 8.909527734298951, + "learning_rate": 1.8628315207508547e-06, + "loss": 1.1176, + "step": 8399 + }, + { + "epoch": 1.1892121469526438, + "grad_norm": 8.61957102759221, + "learning_rate": 1.8622772671882738e-06, + "loss": 1.043, + "step": 8400 + }, + { + "epoch": 1.189353719827281, + "grad_norm": 9.325931974990405, + "learning_rate": 1.861723047150301e-06, + "loss": 1.0411, + "step": 8401 + }, + { + "epoch": 1.1894952927019182, + "grad_norm": 9.243022039403478, + "learning_rate": 1.8611688606660728e-06, + "loss": 1.0553, + "step": 8402 + }, + { + "epoch": 1.1896368655765555, + "grad_norm": 8.484165027471997, + "learning_rate": 1.8606147077647216e-06, + "loss": 1.0197, + "step": 8403 + }, + { + "epoch": 1.1897784384511927, + "grad_norm": 10.940996581160459, + "learning_rate": 1.8600605884753775e-06, + "loss": 1.1583, + "step": 8404 + }, + { + "epoch": 
1.18992001132583, + "grad_norm": 10.398142179197619, + "learning_rate": 1.8595065028271713e-06, + "loss": 1.0532, + "step": 8405 + }, + { + "epoch": 1.1900615842004671, + "grad_norm": 10.541481187317018, + "learning_rate": 1.8589524508492308e-06, + "loss": 1.1411, + "step": 8406 + }, + { + "epoch": 1.1902031570751044, + "grad_norm": 9.975609216275457, + "learning_rate": 1.8583984325706813e-06, + "loss": 1.0064, + "step": 8407 + }, + { + "epoch": 1.1903447299497416, + "grad_norm": 9.032848599461898, + "learning_rate": 1.8578444480206487e-06, + "loss": 1.1194, + "step": 8408 + }, + { + "epoch": 1.1904863028243788, + "grad_norm": 8.593717595819589, + "learning_rate": 1.8572904972282541e-06, + "loss": 0.9969, + "step": 8409 + }, + { + "epoch": 1.190627875699016, + "grad_norm": 9.972404934039584, + "learning_rate": 1.856736580222619e-06, + "loss": 0.9462, + "step": 8410 + }, + { + "epoch": 1.1907694485736533, + "grad_norm": 9.86751340205556, + "learning_rate": 1.8561826970328623e-06, + "loss": 1.0164, + "step": 8411 + }, + { + "epoch": 1.1909110214482905, + "grad_norm": 10.392440660209543, + "learning_rate": 1.8556288476881012e-06, + "loss": 1.1156, + "step": 8412 + }, + { + "epoch": 1.1910525943229278, + "grad_norm": 9.3944788687469, + "learning_rate": 1.855075032217451e-06, + "loss": 0.9336, + "step": 8413 + }, + { + "epoch": 1.191194167197565, + "grad_norm": 9.643858826295801, + "learning_rate": 1.854521250650026e-06, + "loss": 1.026, + "step": 8414 + }, + { + "epoch": 1.1913357400722022, + "grad_norm": 10.207709551586525, + "learning_rate": 1.8539675030149373e-06, + "loss": 1.0191, + "step": 8415 + }, + { + "epoch": 1.1914773129468395, + "grad_norm": 11.684901637135473, + "learning_rate": 1.853413789341295e-06, + "loss": 1.1757, + "step": 8416 + }, + { + "epoch": 1.1916188858214767, + "grad_norm": 10.041383942440016, + "learning_rate": 1.8528601096582078e-06, + "loss": 1.0263, + "step": 8417 + }, + { + "epoch": 1.191760458696114, + "grad_norm": 8.218628392933507, + "learning_rate": 1.8523064639947818e-06, + "loss": 1.0122, + "step": 8418 + }, + { + "epoch": 1.1919020315707511, + "grad_norm": 7.842926103704157, + "learning_rate": 1.8517528523801226e-06, + "loss": 0.9544, + "step": 8419 + }, + { + "epoch": 1.1920436044453884, + "grad_norm": 9.093552420131298, + "learning_rate": 1.8511992748433321e-06, + "loss": 1.0281, + "step": 8420 + }, + { + "epoch": 1.1921851773200256, + "grad_norm": 10.367519761419217, + "learning_rate": 1.8506457314135123e-06, + "loss": 0.9604, + "step": 8421 + }, + { + "epoch": 1.1923267501946626, + "grad_norm": 8.051776230364775, + "learning_rate": 1.850092222119762e-06, + "loss": 0.9799, + "step": 8422 + }, + { + "epoch": 1.1924683230692998, + "grad_norm": 8.95352166646921, + "learning_rate": 1.849538746991179e-06, + "loss": 0.9513, + "step": 8423 + }, + { + "epoch": 1.192609895943937, + "grad_norm": 8.71759222403585, + "learning_rate": 1.8489853060568597e-06, + "loss": 0.9722, + "step": 8424 + }, + { + "epoch": 1.1927514688185743, + "grad_norm": 9.17758993654963, + "learning_rate": 1.848431899345897e-06, + "loss": 0.9069, + "step": 8425 + }, + { + "epoch": 1.1928930416932115, + "grad_norm": 9.466395321391316, + "learning_rate": 1.8478785268873834e-06, + "loss": 1.0003, + "step": 8426 + }, + { + "epoch": 1.1930346145678488, + "grad_norm": 8.887607270142018, + "learning_rate": 1.8473251887104093e-06, + "loss": 0.9416, + "step": 8427 + }, + { + "epoch": 1.193176187442486, + "grad_norm": 7.427507628260714, + "learning_rate": 1.8467718848440636e-06, + "loss": 0.9518, + 
"step": 8428 + }, + { + "epoch": 1.1933177603171232, + "grad_norm": 9.840454324918174, + "learning_rate": 1.8462186153174327e-06, + "loss": 1.0905, + "step": 8429 + }, + { + "epoch": 1.1934593331917605, + "grad_norm": 8.325285064123978, + "learning_rate": 1.8456653801596013e-06, + "loss": 0.8983, + "step": 8430 + }, + { + "epoch": 1.1936009060663977, + "grad_norm": 8.662418742713902, + "learning_rate": 1.8451121793996534e-06, + "loss": 1.0154, + "step": 8431 + }, + { + "epoch": 1.193742478941035, + "grad_norm": 9.345975987069457, + "learning_rate": 1.84455901306667e-06, + "loss": 1.0191, + "step": 8432 + }, + { + "epoch": 1.1938840518156721, + "grad_norm": 8.995095506111234, + "learning_rate": 1.8440058811897304e-06, + "loss": 0.9838, + "step": 8433 + }, + { + "epoch": 1.1940256246903094, + "grad_norm": 10.97945774761398, + "learning_rate": 1.8434527837979128e-06, + "loss": 0.916, + "step": 8434 + }, + { + "epoch": 1.1941671975649466, + "grad_norm": 8.824577118248667, + "learning_rate": 1.8428997209202935e-06, + "loss": 1.1356, + "step": 8435 + }, + { + "epoch": 1.1943087704395838, + "grad_norm": 7.489718447029333, + "learning_rate": 1.8423466925859445e-06, + "loss": 0.9395, + "step": 8436 + }, + { + "epoch": 1.194450343314221, + "grad_norm": 8.700074680051692, + "learning_rate": 1.84179369882394e-06, + "loss": 0.9982, + "step": 8437 + }, + { + "epoch": 1.1945919161888583, + "grad_norm": 9.770196976642914, + "learning_rate": 1.84124073966335e-06, + "loss": 1.0072, + "step": 8438 + }, + { + "epoch": 1.1947334890634955, + "grad_norm": 9.896742436997025, + "learning_rate": 1.8406878151332431e-06, + "loss": 1.0414, + "step": 8439 + }, + { + "epoch": 1.1948750619381325, + "grad_norm": 9.428187341834823, + "learning_rate": 1.840134925262686e-06, + "loss": 1.0463, + "step": 8440 + }, + { + "epoch": 1.1950166348127698, + "grad_norm": 12.400834676891462, + "learning_rate": 1.8395820700807444e-06, + "loss": 1.2321, + "step": 8441 + }, + { + "epoch": 1.195158207687407, + "grad_norm": 10.067449833129446, + "learning_rate": 1.8390292496164805e-06, + "loss": 1.0776, + "step": 8442 + }, + { + "epoch": 1.1952997805620442, + "grad_norm": 9.489550012948042, + "learning_rate": 1.838476463898956e-06, + "loss": 1.0913, + "step": 8443 + }, + { + "epoch": 1.1954413534366815, + "grad_norm": 9.41557220130193, + "learning_rate": 1.8379237129572307e-06, + "loss": 0.969, + "step": 8444 + }, + { + "epoch": 1.1955829263113187, + "grad_norm": 8.100176171164813, + "learning_rate": 1.8373709968203624e-06, + "loss": 1.047, + "step": 8445 + }, + { + "epoch": 1.195724499185956, + "grad_norm": 11.822466808392692, + "learning_rate": 1.8368183155174069e-06, + "loss": 0.9243, + "step": 8446 + }, + { + "epoch": 1.1958660720605931, + "grad_norm": 8.991372105461279, + "learning_rate": 1.8362656690774177e-06, + "loss": 1.0711, + "step": 8447 + }, + { + "epoch": 1.1960076449352304, + "grad_norm": 9.87166606782208, + "learning_rate": 1.8357130575294474e-06, + "loss": 1.0547, + "step": 8448 + }, + { + "epoch": 1.1961492178098676, + "grad_norm": 9.613479655612071, + "learning_rate": 1.8351604809025465e-06, + "loss": 0.9087, + "step": 8449 + }, + { + "epoch": 1.1962907906845048, + "grad_norm": 9.315114313592344, + "learning_rate": 1.8346079392257632e-06, + "loss": 0.9894, + "step": 8450 + }, + { + "epoch": 1.196432363559142, + "grad_norm": 11.338184515194042, + "learning_rate": 1.834055432528144e-06, + "loss": 0.9865, + "step": 8451 + }, + { + "epoch": 1.1965739364337793, + "grad_norm": 8.795858049982913, + "learning_rate": 
1.8335029608387342e-06, + "loss": 1.063, + "step": 8452 + }, + { + "epoch": 1.1967155093084165, + "grad_norm": 9.493275320254456, + "learning_rate": 1.8329505241865772e-06, + "loss": 0.9676, + "step": 8453 + }, + { + "epoch": 1.1968570821830538, + "grad_norm": 8.69973880770369, + "learning_rate": 1.8323981226007136e-06, + "loss": 0.9026, + "step": 8454 + }, + { + "epoch": 1.196998655057691, + "grad_norm": 10.845848562020098, + "learning_rate": 1.8318457561101833e-06, + "loss": 1.0866, + "step": 8455 + }, + { + "epoch": 1.1971402279323282, + "grad_norm": 9.405958811159369, + "learning_rate": 1.831293424744024e-06, + "loss": 0.9801, + "step": 8456 + }, + { + "epoch": 1.1972818008069654, + "grad_norm": 11.560154032489482, + "learning_rate": 1.8307411285312699e-06, + "loss": 1.0029, + "step": 8457 + }, + { + "epoch": 1.1974233736816027, + "grad_norm": 9.449168286734825, + "learning_rate": 1.8301888675009554e-06, + "loss": 0.9909, + "step": 8458 + }, + { + "epoch": 1.19756494655624, + "grad_norm": 10.470588545411603, + "learning_rate": 1.8296366416821127e-06, + "loss": 1.0513, + "step": 8459 + }, + { + "epoch": 1.1977065194308771, + "grad_norm": 10.318803929650675, + "learning_rate": 1.829084451103772e-06, + "loss": 1.0241, + "step": 8460 + }, + { + "epoch": 1.1978480923055144, + "grad_norm": 7.5630273556283285, + "learning_rate": 1.8285322957949615e-06, + "loss": 0.8926, + "step": 8461 + }, + { + "epoch": 1.1979896651801516, + "grad_norm": 9.24844347255467, + "learning_rate": 1.8279801757847077e-06, + "loss": 0.9094, + "step": 8462 + }, + { + "epoch": 1.1981312380547886, + "grad_norm": 11.606314131108158, + "learning_rate": 1.8274280911020349e-06, + "loss": 1.0174, + "step": 8463 + }, + { + "epoch": 1.1982728109294258, + "grad_norm": 10.973098110429582, + "learning_rate": 1.8268760417759659e-06, + "loss": 1.1189, + "step": 8464 + }, + { + "epoch": 1.198414383804063, + "grad_norm": 9.963640105177817, + "learning_rate": 1.8263240278355216e-06, + "loss": 1.0906, + "step": 8465 + }, + { + "epoch": 1.1985559566787003, + "grad_norm": 10.2152918759256, + "learning_rate": 1.8257720493097207e-06, + "loss": 0.9286, + "step": 8466 + }, + { + "epoch": 1.1986975295533375, + "grad_norm": 9.900023227240547, + "learning_rate": 1.825220106227581e-06, + "loss": 1.0565, + "step": 8467 + }, + { + "epoch": 1.1988391024279748, + "grad_norm": 9.160413192691683, + "learning_rate": 1.8246681986181165e-06, + "loss": 0.9151, + "step": 8468 + }, + { + "epoch": 1.198980675302612, + "grad_norm": 9.330252502230103, + "learning_rate": 1.8241163265103411e-06, + "loss": 0.9403, + "step": 8469 + }, + { + "epoch": 1.1991222481772492, + "grad_norm": 10.2582554020673, + "learning_rate": 1.8235644899332669e-06, + "loss": 1.0111, + "step": 8470 + }, + { + "epoch": 1.1992638210518864, + "grad_norm": 9.780355296789851, + "learning_rate": 1.8230126889159027e-06, + "loss": 1.0563, + "step": 8471 + }, + { + "epoch": 1.1994053939265237, + "grad_norm": 9.748936130484676, + "learning_rate": 1.822460923487257e-06, + "loss": 0.9082, + "step": 8472 + }, + { + "epoch": 1.199546966801161, + "grad_norm": 9.80572236819447, + "learning_rate": 1.8219091936763353e-06, + "loss": 0.972, + "step": 8473 + }, + { + "epoch": 1.1996885396757981, + "grad_norm": 9.567326941837607, + "learning_rate": 1.8213574995121417e-06, + "loss": 0.9879, + "step": 8474 + }, + { + "epoch": 1.1998301125504354, + "grad_norm": 8.190758908314944, + "learning_rate": 1.8208058410236777e-06, + "loss": 0.8931, + "step": 8475 + }, + { + "epoch": 1.1999716854250726, + "grad_norm": 
10.712040082746844, + "learning_rate": 1.8202542182399446e-06, + "loss": 1.0079, + "step": 8476 + }, + { + "epoch": 1.2001132582997098, + "grad_norm": 7.9683557749452625, + "learning_rate": 1.8197026311899419e-06, + "loss": 1.0501, + "step": 8477 + }, + { + "epoch": 1.200254831174347, + "grad_norm": 9.198911469788364, + "learning_rate": 1.8191510799026629e-06, + "loss": 1.0688, + "step": 8478 + }, + { + "epoch": 1.2003964040489843, + "grad_norm": 8.849491910843138, + "learning_rate": 1.8185995644071047e-06, + "loss": 1.096, + "step": 8479 + }, + { + "epoch": 1.2005379769236215, + "grad_norm": 9.582531262433106, + "learning_rate": 1.818048084732259e-06, + "loss": 0.955, + "step": 8480 + }, + { + "epoch": 1.2006795497982587, + "grad_norm": 8.406893000065528, + "learning_rate": 1.8174966409071162e-06, + "loss": 0.92, + "step": 8481 + }, + { + "epoch": 1.2008211226728958, + "grad_norm": 9.2377192449297, + "learning_rate": 1.8169452329606667e-06, + "loss": 1.1017, + "step": 8482 + }, + { + "epoch": 1.200962695547533, + "grad_norm": 8.975613934868884, + "learning_rate": 1.8163938609218974e-06, + "loss": 1.0914, + "step": 8483 + }, + { + "epoch": 1.2011042684221702, + "grad_norm": 9.429306414472265, + "learning_rate": 1.8158425248197931e-06, + "loss": 1.0969, + "step": 8484 + }, + { + "epoch": 1.2012458412968074, + "grad_norm": 8.458118254333932, + "learning_rate": 1.8152912246833368e-06, + "loss": 1.0557, + "step": 8485 + }, + { + "epoch": 1.2013874141714447, + "grad_norm": 8.710235047349649, + "learning_rate": 1.8147399605415104e-06, + "loss": 1.0271, + "step": 8486 + }, + { + "epoch": 1.201528987046082, + "grad_norm": 8.863907450196248, + "learning_rate": 1.8141887324232932e-06, + "loss": 0.9685, + "step": 8487 + }, + { + "epoch": 1.2016705599207191, + "grad_norm": 8.269614744057714, + "learning_rate": 1.8136375403576636e-06, + "loss": 1.0167, + "step": 8488 + }, + { + "epoch": 1.2018121327953564, + "grad_norm": 8.845035742584518, + "learning_rate": 1.8130863843735964e-06, + "loss": 1.0011, + "step": 8489 + }, + { + "epoch": 1.2019537056699936, + "grad_norm": 9.690662679934203, + "learning_rate": 1.8125352645000654e-06, + "loss": 1.0633, + "step": 8490 + }, + { + "epoch": 1.2020952785446308, + "grad_norm": 8.444928099629395, + "learning_rate": 1.8119841807660432e-06, + "loss": 1.0055, + "step": 8491 + }, + { + "epoch": 1.202236851419268, + "grad_norm": 11.033103030779372, + "learning_rate": 1.8114331332004998e-06, + "loss": 1.0047, + "step": 8492 + }, + { + "epoch": 1.2023784242939053, + "grad_norm": 9.103293323892201, + "learning_rate": 1.810882121832403e-06, + "loss": 1.0614, + "step": 8493 + }, + { + "epoch": 1.2025199971685425, + "grad_norm": 8.737042669680422, + "learning_rate": 1.8103311466907191e-06, + "loss": 0.9373, + "step": 8494 + }, + { + "epoch": 1.2026615700431798, + "grad_norm": 8.128472627790671, + "learning_rate": 1.8097802078044125e-06, + "loss": 1.0737, + "step": 8495 + }, + { + "epoch": 1.202803142917817, + "grad_norm": 10.080761373920607, + "learning_rate": 1.809229305202446e-06, + "loss": 1.0834, + "step": 8496 + }, + { + "epoch": 1.2029447157924542, + "grad_norm": 9.147597250722496, + "learning_rate": 1.8086784389137796e-06, + "loss": 1.0507, + "step": 8497 + }, + { + "epoch": 1.2030862886670914, + "grad_norm": 10.06873654270516, + "learning_rate": 1.8081276089673719e-06, + "loss": 1.0707, + "step": 8498 + }, + { + "epoch": 1.2032278615417287, + "grad_norm": 10.906791782858464, + "learning_rate": 1.8075768153921813e-06, + "loss": 1.0575, + "step": 8499 + }, + { + 
"epoch": 1.203369434416366, + "grad_norm": 8.851997762080762, + "learning_rate": 1.8070260582171605e-06, + "loss": 1.0014, + "step": 8500 + }, + { + "epoch": 1.2035110072910031, + "grad_norm": 9.820446902670167, + "learning_rate": 1.8064753374712629e-06, + "loss": 1.1144, + "step": 8501 + }, + { + "epoch": 1.2036525801656404, + "grad_norm": 8.156937493475167, + "learning_rate": 1.8059246531834395e-06, + "loss": 0.8863, + "step": 8502 + }, + { + "epoch": 1.2037941530402776, + "grad_norm": 9.224302733827255, + "learning_rate": 1.8053740053826399e-06, + "loss": 0.9502, + "step": 8503 + }, + { + "epoch": 1.2039357259149148, + "grad_norm": 8.327541423913313, + "learning_rate": 1.8048233940978103e-06, + "loss": 1.0274, + "step": 8504 + }, + { + "epoch": 1.2040772987895518, + "grad_norm": 9.378290844944152, + "learning_rate": 1.8042728193578968e-06, + "loss": 1.096, + "step": 8505 + }, + { + "epoch": 1.204218871664189, + "grad_norm": 9.622919031079444, + "learning_rate": 1.8037222811918426e-06, + "loss": 1.0316, + "step": 8506 + }, + { + "epoch": 1.2043604445388263, + "grad_norm": 10.332640060631096, + "learning_rate": 1.803171779628589e-06, + "loss": 1.0074, + "step": 8507 + }, + { + "epoch": 1.2045020174134635, + "grad_norm": 10.166267220409729, + "learning_rate": 1.8026213146970752e-06, + "loss": 1.0473, + "step": 8508 + }, + { + "epoch": 1.2046435902881008, + "grad_norm": 9.188669143298174, + "learning_rate": 1.80207088642624e-06, + "loss": 1.0144, + "step": 8509 + }, + { + "epoch": 1.204785163162738, + "grad_norm": 9.203208314165543, + "learning_rate": 1.8015204948450166e-06, + "loss": 1.0924, + "step": 8510 + }, + { + "epoch": 1.2049267360373752, + "grad_norm": 8.525166672674207, + "learning_rate": 1.80097013998234e-06, + "loss": 1.0815, + "step": 8511 + }, + { + "epoch": 1.2050683089120124, + "grad_norm": 9.435434683857434, + "learning_rate": 1.8004198218671423e-06, + "loss": 1.005, + "step": 8512 + }, + { + "epoch": 1.2052098817866497, + "grad_norm": 7.970364754824661, + "learning_rate": 1.7998695405283528e-06, + "loss": 0.953, + "step": 8513 + }, + { + "epoch": 1.205351454661287, + "grad_norm": 8.208617023142034, + "learning_rate": 1.7993192959948996e-06, + "loss": 1.0734, + "step": 8514 + }, + { + "epoch": 1.2054930275359241, + "grad_norm": 10.46516270452251, + "learning_rate": 1.7987690882957084e-06, + "loss": 1.0637, + "step": 8515 + }, + { + "epoch": 1.2056346004105614, + "grad_norm": 8.247452313625544, + "learning_rate": 1.7982189174597037e-06, + "loss": 0.9523, + "step": 8516 + }, + { + "epoch": 1.2057761732851986, + "grad_norm": 9.579190716098653, + "learning_rate": 1.797668783515807e-06, + "loss": 0.9919, + "step": 8517 + }, + { + "epoch": 1.2059177461598358, + "grad_norm": 9.890379699268534, + "learning_rate": 1.7971186864929386e-06, + "loss": 1.1501, + "step": 8518 + }, + { + "epoch": 1.206059319034473, + "grad_norm": 11.894086095939326, + "learning_rate": 1.7965686264200165e-06, + "loss": 1.0604, + "step": 8519 + }, + { + "epoch": 1.2062008919091103, + "grad_norm": 8.485550210677543, + "learning_rate": 1.7960186033259585e-06, + "loss": 1.0043, + "step": 8520 + }, + { + "epoch": 1.2063424647837475, + "grad_norm": 10.136129784792624, + "learning_rate": 1.7954686172396764e-06, + "loss": 0.9925, + "step": 8521 + }, + { + "epoch": 1.2064840376583847, + "grad_norm": 8.920235558380293, + "learning_rate": 1.7949186681900843e-06, + "loss": 0.9819, + "step": 8522 + }, + { + "epoch": 1.2066256105330218, + "grad_norm": 9.14431849789888, + "learning_rate": 1.7943687562060919e-06, + "loss": 
0.9472, + "step": 8523 + }, + { + "epoch": 1.206767183407659, + "grad_norm": 9.374221159054612, + "learning_rate": 1.7938188813166074e-06, + "loss": 1.0098, + "step": 8524 + }, + { + "epoch": 1.2069087562822962, + "grad_norm": 11.049859880620552, + "learning_rate": 1.7932690435505385e-06, + "loss": 1.0082, + "step": 8525 + }, + { + "epoch": 1.2070503291569334, + "grad_norm": 9.894195828059216, + "learning_rate": 1.7927192429367885e-06, + "loss": 1.0714, + "step": 8526 + }, + { + "epoch": 1.2071919020315707, + "grad_norm": 9.27727878266726, + "learning_rate": 1.79216947950426e-06, + "loss": 1.0928, + "step": 8527 + }, + { + "epoch": 1.207333474906208, + "grad_norm": 10.034445755672564, + "learning_rate": 1.7916197532818548e-06, + "loss": 1.168, + "step": 8528 + }, + { + "epoch": 1.2074750477808451, + "grad_norm": 9.146396581964414, + "learning_rate": 1.791070064298471e-06, + "loss": 0.9499, + "step": 8529 + }, + { + "epoch": 1.2076166206554824, + "grad_norm": 11.37497987850212, + "learning_rate": 1.7905204125830066e-06, + "loss": 1.064, + "step": 8530 + }, + { + "epoch": 1.2077581935301196, + "grad_norm": 9.93778828136789, + "learning_rate": 1.7899707981643538e-06, + "loss": 1.022, + "step": 8531 + }, + { + "epoch": 1.2078997664047568, + "grad_norm": 12.245645177191117, + "learning_rate": 1.7894212210714068e-06, + "loss": 1.0483, + "step": 8532 + }, + { + "epoch": 1.208041339279394, + "grad_norm": 9.274805985141237, + "learning_rate": 1.788871681333056e-06, + "loss": 1.119, + "step": 8533 + }, + { + "epoch": 1.2081829121540313, + "grad_norm": 9.192986958345399, + "learning_rate": 1.7883221789781908e-06, + "loss": 0.9025, + "step": 8534 + }, + { + "epoch": 1.2083244850286685, + "grad_norm": 8.738554089532409, + "learning_rate": 1.7877727140356982e-06, + "loss": 1.0451, + "step": 8535 + }, + { + "epoch": 1.2084660579033057, + "grad_norm": 8.847016826370018, + "learning_rate": 1.787223286534463e-06, + "loss": 1.0872, + "step": 8536 + }, + { + "epoch": 1.208607630777943, + "grad_norm": 10.104723182899537, + "learning_rate": 1.7866738965033681e-06, + "loss": 1.0752, + "step": 8537 + }, + { + "epoch": 1.2087492036525802, + "grad_norm": 11.668166218616935, + "learning_rate": 1.7861245439712945e-06, + "loss": 0.9805, + "step": 8538 + }, + { + "epoch": 1.2088907765272174, + "grad_norm": 10.609738867563642, + "learning_rate": 1.7855752289671215e-06, + "loss": 1.0796, + "step": 8539 + }, + { + "epoch": 1.2090323494018547, + "grad_norm": 9.241418930691275, + "learning_rate": 1.785025951519726e-06, + "loss": 1.0158, + "step": 8540 + }, + { + "epoch": 1.209173922276492, + "grad_norm": 9.02490645172582, + "learning_rate": 1.7844767116579836e-06, + "loss": 1.0432, + "step": 8541 + }, + { + "epoch": 1.2093154951511291, + "grad_norm": 9.838670176765149, + "learning_rate": 1.7839275094107666e-06, + "loss": 0.8862, + "step": 8542 + }, + { + "epoch": 1.2094570680257664, + "grad_norm": 9.365678145442722, + "learning_rate": 1.7833783448069464e-06, + "loss": 0.9995, + "step": 8543 + }, + { + "epoch": 1.2095986409004036, + "grad_norm": 9.299446122278054, + "learning_rate": 1.782829217875392e-06, + "loss": 1.0153, + "step": 8544 + }, + { + "epoch": 1.2097402137750408, + "grad_norm": 8.863947043498252, + "learning_rate": 1.782280128644971e-06, + "loss": 1.0726, + "step": 8545 + }, + { + "epoch": 1.2098817866496778, + "grad_norm": 13.122033937549586, + "learning_rate": 1.781731077144549e-06, + "loss": 1.0847, + "step": 8546 + }, + { + "epoch": 1.210023359524315, + "grad_norm": 9.897953446135974, + "learning_rate": 
1.781182063402988e-06, + "loss": 1.1131, + "step": 8547 + }, + { + "epoch": 1.2101649323989523, + "grad_norm": 8.50266089583551, + "learning_rate": 1.7806330874491504e-06, + "loss": 0.9699, + "step": 8548 + }, + { + "epoch": 1.2103065052735895, + "grad_norm": 9.644932704154849, + "learning_rate": 1.7800841493118942e-06, + "loss": 1.0342, + "step": 8549 + }, + { + "epoch": 1.2104480781482267, + "grad_norm": 9.084491014346709, + "learning_rate": 1.7795352490200782e-06, + "loss": 1.0142, + "step": 8550 + }, + { + "epoch": 1.210589651022864, + "grad_norm": 8.618865996998604, + "learning_rate": 1.778986386602558e-06, + "loss": 0.9499, + "step": 8551 + }, + { + "epoch": 1.2107312238975012, + "grad_norm": 11.022566795432253, + "learning_rate": 1.7784375620881847e-06, + "loss": 1.1112, + "step": 8552 + }, + { + "epoch": 1.2108727967721384, + "grad_norm": 10.354829599405045, + "learning_rate": 1.7778887755058108e-06, + "loss": 1.0095, + "step": 8553 + }, + { + "epoch": 1.2110143696467757, + "grad_norm": 8.866830423702448, + "learning_rate": 1.7773400268842855e-06, + "loss": 0.8931, + "step": 8554 + }, + { + "epoch": 1.211155942521413, + "grad_norm": 8.20765871738669, + "learning_rate": 1.7767913162524562e-06, + "loss": 0.9572, + "step": 8555 + }, + { + "epoch": 1.2112975153960501, + "grad_norm": 9.192239589825638, + "learning_rate": 1.7762426436391675e-06, + "loss": 0.9966, + "step": 8556 + }, + { + "epoch": 1.2114390882706874, + "grad_norm": 8.476657005521565, + "learning_rate": 1.775694009073264e-06, + "loss": 1.0065, + "step": 8557 + }, + { + "epoch": 1.2115806611453246, + "grad_norm": 10.03553181469278, + "learning_rate": 1.7751454125835866e-06, + "loss": 1.1265, + "step": 8558 + }, + { + "epoch": 1.2117222340199618, + "grad_norm": 8.641070004363609, + "learning_rate": 1.774596854198974e-06, + "loss": 1.0001, + "step": 8559 + }, + { + "epoch": 1.211863806894599, + "grad_norm": 10.427752483185719, + "learning_rate": 1.774048333948264e-06, + "loss": 0.9759, + "step": 8560 + }, + { + "epoch": 1.2120053797692363, + "grad_norm": 9.096224209024305, + "learning_rate": 1.773499851860292e-06, + "loss": 1.0225, + "step": 8561 + }, + { + "epoch": 1.2121469526438735, + "grad_norm": 9.51999768040733, + "learning_rate": 1.7729514079638915e-06, + "loss": 1.106, + "step": 8562 + }, + { + "epoch": 1.2122885255185107, + "grad_norm": 9.808528023351071, + "learning_rate": 1.7724030022878928e-06, + "loss": 1.0409, + "step": 8563 + }, + { + "epoch": 1.2124300983931477, + "grad_norm": 10.022798489593027, + "learning_rate": 1.7718546348611254e-06, + "loss": 0.9684, + "step": 8564 + }, + { + "epoch": 1.212571671267785, + "grad_norm": 9.123126111904348, + "learning_rate": 1.7713063057124174e-06, + "loss": 1.0391, + "step": 8565 + }, + { + "epoch": 1.2127132441424222, + "grad_norm": 9.268636127371765, + "learning_rate": 1.7707580148705936e-06, + "loss": 0.9213, + "step": 8566 + }, + { + "epoch": 1.2128548170170594, + "grad_norm": 8.495203796780165, + "learning_rate": 1.770209762364477e-06, + "loss": 1.124, + "step": 8567 + }, + { + "epoch": 1.2129963898916967, + "grad_norm": 9.186356453864663, + "learning_rate": 1.7696615482228891e-06, + "loss": 0.986, + "step": 8568 + }, + { + "epoch": 1.213137962766334, + "grad_norm": 7.326407745803101, + "learning_rate": 1.769113372474649e-06, + "loss": 1.1681, + "step": 8569 + }, + { + "epoch": 1.2132795356409711, + "grad_norm": 8.727630358650972, + "learning_rate": 1.768565235148574e-06, + "loss": 0.9633, + "step": 8570 + }, + { + "epoch": 1.2134211085156084, + "grad_norm": 
9.583175348278846, + "learning_rate": 1.7680171362734794e-06, + "loss": 0.9056, + "step": 8571 + }, + { + "epoch": 1.2135626813902456, + "grad_norm": 11.054470445638547, + "learning_rate": 1.767469075878177e-06, + "loss": 1.1372, + "step": 8572 + }, + { + "epoch": 1.2137042542648828, + "grad_norm": 11.058600304108795, + "learning_rate": 1.7669210539914813e-06, + "loss": 1.1413, + "step": 8573 + }, + { + "epoch": 1.21384582713952, + "grad_norm": 8.171315277945284, + "learning_rate": 1.7663730706421978e-06, + "loss": 1.0095, + "step": 8574 + }, + { + "epoch": 1.2139874000141573, + "grad_norm": 9.939600308708343, + "learning_rate": 1.7658251258591352e-06, + "loss": 0.989, + "step": 8575 + }, + { + "epoch": 1.2141289728887945, + "grad_norm": 8.998667724334991, + "learning_rate": 1.7652772196710982e-06, + "loss": 1.0257, + "step": 8576 + }, + { + "epoch": 1.2142705457634317, + "grad_norm": 9.994379753038666, + "learning_rate": 1.7647293521068898e-06, + "loss": 1.0341, + "step": 8577 + }, + { + "epoch": 1.214412118638069, + "grad_norm": 9.601670962187429, + "learning_rate": 1.7641815231953107e-06, + "loss": 1.0089, + "step": 8578 + }, + { + "epoch": 1.2145536915127062, + "grad_norm": 9.580979785231754, + "learning_rate": 1.763633732965161e-06, + "loss": 1.0846, + "step": 8579 + }, + { + "epoch": 1.2146952643873434, + "grad_norm": 9.809235047857763, + "learning_rate": 1.7630859814452367e-06, + "loss": 0.9812, + "step": 8580 + }, + { + "epoch": 1.2148368372619807, + "grad_norm": 8.940670704783125, + "learning_rate": 1.7625382686643328e-06, + "loss": 1.0715, + "step": 8581 + }, + { + "epoch": 1.214978410136618, + "grad_norm": 9.988050187369412, + "learning_rate": 1.7619905946512421e-06, + "loss": 1.0506, + "step": 8582 + }, + { + "epoch": 1.2151199830112551, + "grad_norm": 10.456023996011858, + "learning_rate": 1.761442959434757e-06, + "loss": 1.0251, + "step": 8583 + }, + { + "epoch": 1.2152615558858924, + "grad_norm": 10.251496740710826, + "learning_rate": 1.7608953630436632e-06, + "loss": 1.0419, + "step": 8584 + }, + { + "epoch": 1.2154031287605296, + "grad_norm": 7.841179293473804, + "learning_rate": 1.7603478055067493e-06, + "loss": 0.8789, + "step": 8585 + }, + { + "epoch": 1.2155447016351668, + "grad_norm": 8.576379372561002, + "learning_rate": 1.7598002868528002e-06, + "loss": 0.9773, + "step": 8586 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 9.489883658175327, + "learning_rate": 1.7592528071105978e-06, + "loss": 1.0412, + "step": 8587 + }, + { + "epoch": 1.215827847384441, + "grad_norm": 8.580562502333125, + "learning_rate": 1.7587053663089233e-06, + "loss": 1.0493, + "step": 8588 + }, + { + "epoch": 1.2159694202590783, + "grad_norm": 9.918633359476827, + "learning_rate": 1.7581579644765544e-06, + "loss": 1.0383, + "step": 8589 + }, + { + "epoch": 1.2161109931337155, + "grad_norm": 10.075762425958418, + "learning_rate": 1.7576106016422684e-06, + "loss": 0.9581, + "step": 8590 + }, + { + "epoch": 1.2162525660083527, + "grad_norm": 8.661514609301795, + "learning_rate": 1.7570632778348394e-06, + "loss": 1.0034, + "step": 8591 + }, + { + "epoch": 1.21639413888299, + "grad_norm": 10.765718899830878, + "learning_rate": 1.7565159930830405e-06, + "loss": 0.9802, + "step": 8592 + }, + { + "epoch": 1.2165357117576272, + "grad_norm": 9.585778319786753, + "learning_rate": 1.7559687474156412e-06, + "loss": 1.0421, + "step": 8593 + }, + { + "epoch": 1.2166772846322644, + "grad_norm": 9.661375592778006, + "learning_rate": 1.7554215408614102e-06, + "loss": 1.0471, + "step": 8594 + }, + { + 
"epoch": 1.2168188575069017, + "grad_norm": 9.330758239093234, + "learning_rate": 1.7548743734491136e-06, + "loss": 1.072, + "step": 8595 + }, + { + "epoch": 1.216960430381539, + "grad_norm": 8.369968098314168, + "learning_rate": 1.7543272452075156e-06, + "loss": 0.9833, + "step": 8596 + }, + { + "epoch": 1.2171020032561761, + "grad_norm": 9.237868730879695, + "learning_rate": 1.7537801561653777e-06, + "loss": 1.0365, + "step": 8597 + }, + { + "epoch": 1.2172435761308134, + "grad_norm": 13.367312494849552, + "learning_rate": 1.7532331063514613e-06, + "loss": 1.0782, + "step": 8598 + }, + { + "epoch": 1.2173851490054506, + "grad_norm": 8.40059973982629, + "learning_rate": 1.7526860957945233e-06, + "loss": 0.9536, + "step": 8599 + }, + { + "epoch": 1.2175267218800878, + "grad_norm": 10.0680621394656, + "learning_rate": 1.7521391245233202e-06, + "loss": 1.099, + "step": 8600 + }, + { + "epoch": 1.217668294754725, + "grad_norm": 10.697476911312611, + "learning_rate": 1.7515921925666053e-06, + "loss": 1.0487, + "step": 8601 + }, + { + "epoch": 1.2178098676293623, + "grad_norm": 9.586556287969229, + "learning_rate": 1.7510452999531308e-06, + "loss": 1.0333, + "step": 8602 + }, + { + "epoch": 1.2179514405039995, + "grad_norm": 9.122157660114002, + "learning_rate": 1.7504984467116467e-06, + "loss": 1.0818, + "step": 8603 + }, + { + "epoch": 1.2180930133786367, + "grad_norm": 9.224444166356186, + "learning_rate": 1.7499516328709016e-06, + "loss": 0.925, + "step": 8604 + }, + { + "epoch": 1.218234586253274, + "grad_norm": 11.961459257480225, + "learning_rate": 1.7494048584596388e-06, + "loss": 1.1279, + "step": 8605 + }, + { + "epoch": 1.218376159127911, + "grad_norm": 8.809055385630707, + "learning_rate": 1.7488581235066027e-06, + "loss": 1.0624, + "step": 8606 + }, + { + "epoch": 1.2185177320025482, + "grad_norm": 8.748686337632487, + "learning_rate": 1.7483114280405348e-06, + "loss": 1.1313, + "step": 8607 + }, + { + "epoch": 1.2186593048771854, + "grad_norm": 10.152938161946643, + "learning_rate": 1.747764772090175e-06, + "loss": 0.9986, + "step": 8608 + }, + { + "epoch": 1.2188008777518227, + "grad_norm": 8.851685323645867, + "learning_rate": 1.7472181556842602e-06, + "loss": 1.07, + "step": 8609 + }, + { + "epoch": 1.21894245062646, + "grad_norm": 9.823316310440225, + "learning_rate": 1.7466715788515256e-06, + "loss": 1.1456, + "step": 8610 + }, + { + "epoch": 1.2190840235010971, + "grad_norm": 7.834198139691347, + "learning_rate": 1.7461250416207045e-06, + "loss": 1.1363, + "step": 8611 + }, + { + "epoch": 1.2192255963757344, + "grad_norm": 7.741813519757954, + "learning_rate": 1.745578544020528e-06, + "loss": 1.0277, + "step": 8612 + }, + { + "epoch": 1.2193671692503716, + "grad_norm": 8.287272095459379, + "learning_rate": 1.7450320860797248e-06, + "loss": 1.0458, + "step": 8613 + }, + { + "epoch": 1.2195087421250088, + "grad_norm": 8.652380784345121, + "learning_rate": 1.7444856678270218e-06, + "loss": 1.0445, + "step": 8614 + }, + { + "epoch": 1.219650314999646, + "grad_norm": 9.792309246216398, + "learning_rate": 1.7439392892911443e-06, + "loss": 1.041, + "step": 8615 + }, + { + "epoch": 1.2197918878742833, + "grad_norm": 7.264240617018641, + "learning_rate": 1.7433929505008145e-06, + "loss": 0.9001, + "step": 8616 + }, + { + "epoch": 1.2199334607489205, + "grad_norm": 9.312582335652168, + "learning_rate": 1.7428466514847531e-06, + "loss": 1.128, + "step": 8617 + }, + { + "epoch": 1.2200750336235577, + "grad_norm": 7.419082790967392, + "learning_rate": 1.7423003922716784e-06, + "loss": 
0.9174, + "step": 8618 + }, + { + "epoch": 1.220216606498195, + "grad_norm": 10.691218928186354, + "learning_rate": 1.741754172890307e-06, + "loss": 1.0832, + "step": 8619 + }, + { + "epoch": 1.2203581793728322, + "grad_norm": 10.591104812986602, + "learning_rate": 1.7412079933693538e-06, + "loss": 1.0712, + "step": 8620 + }, + { + "epoch": 1.2204997522474694, + "grad_norm": 8.788115834434228, + "learning_rate": 1.7406618537375303e-06, + "loss": 1.0147, + "step": 8621 + }, + { + "epoch": 1.2206413251221067, + "grad_norm": 8.346845531233079, + "learning_rate": 1.740115754023547e-06, + "loss": 1.0001, + "step": 8622 + }, + { + "epoch": 1.2207828979967439, + "grad_norm": 7.387309776477671, + "learning_rate": 1.7395696942561119e-06, + "loss": 1.019, + "step": 8623 + }, + { + "epoch": 1.2209244708713811, + "grad_norm": 8.6040190775654, + "learning_rate": 1.7390236744639304e-06, + "loss": 1.0948, + "step": 8624 + }, + { + "epoch": 1.2210660437460183, + "grad_norm": 8.584652055588936, + "learning_rate": 1.7384776946757075e-06, + "loss": 0.9884, + "step": 8625 + }, + { + "epoch": 1.2212076166206556, + "grad_norm": 10.459448117928623, + "learning_rate": 1.7379317549201458e-06, + "loss": 1.1494, + "step": 8626 + }, + { + "epoch": 1.2213491894952928, + "grad_norm": 11.056462078745193, + "learning_rate": 1.7373858552259421e-06, + "loss": 1.0511, + "step": 8627 + }, + { + "epoch": 1.22149076236993, + "grad_norm": 11.920542551276636, + "learning_rate": 1.7368399956217954e-06, + "loss": 0.9935, + "step": 8628 + }, + { + "epoch": 1.221632335244567, + "grad_norm": 12.494870773813425, + "learning_rate": 1.7362941761364012e-06, + "loss": 1.1138, + "step": 8629 + }, + { + "epoch": 1.2217739081192043, + "grad_norm": 9.845198461141239, + "learning_rate": 1.7357483967984524e-06, + "loss": 0.9648, + "step": 8630 + }, + { + "epoch": 1.2219154809938415, + "grad_norm": 9.309171965229918, + "learning_rate": 1.7352026576366405e-06, + "loss": 1.0357, + "step": 8631 + }, + { + "epoch": 1.2220570538684787, + "grad_norm": 8.491342062044636, + "learning_rate": 1.734656958679655e-06, + "loss": 1.094, + "step": 8632 + }, + { + "epoch": 1.222198626743116, + "grad_norm": 9.349708315945568, + "learning_rate": 1.7341112999561823e-06, + "loss": 0.8822, + "step": 8633 + }, + { + "epoch": 1.2223401996177532, + "grad_norm": 10.299381674567554, + "learning_rate": 1.7335656814949075e-06, + "loss": 0.974, + "step": 8634 + }, + { + "epoch": 1.2224817724923904, + "grad_norm": 7.510213382701648, + "learning_rate": 1.7330201033245137e-06, + "loss": 0.9435, + "step": 8635 + }, + { + "epoch": 1.2226233453670277, + "grad_norm": 9.480929055059441, + "learning_rate": 1.7324745654736812e-06, + "loss": 1.1537, + "step": 8636 + }, + { + "epoch": 1.2227649182416649, + "grad_norm": 10.186765304940018, + "learning_rate": 1.7319290679710885e-06, + "loss": 1.1243, + "step": 8637 + }, + { + "epoch": 1.2229064911163021, + "grad_norm": 10.51043654793761, + "learning_rate": 1.7313836108454118e-06, + "loss": 0.9928, + "step": 8638 + }, + { + "epoch": 1.2230480639909393, + "grad_norm": 10.41200734011391, + "learning_rate": 1.7308381941253256e-06, + "loss": 0.9867, + "step": 8639 + }, + { + "epoch": 1.2231896368655766, + "grad_norm": 11.81033612166227, + "learning_rate": 1.7302928178395018e-06, + "loss": 1.0656, + "step": 8640 + }, + { + "epoch": 1.2233312097402138, + "grad_norm": 7.949007838495008, + "learning_rate": 1.7297474820166108e-06, + "loss": 0.9244, + "step": 8641 + }, + { + "epoch": 1.223472782614851, + "grad_norm": 9.447779431900793, + 
"learning_rate": 1.7292021866853204e-06, + "loss": 1.0183, + "step": 8642 + }, + { + "epoch": 1.2236143554894883, + "grad_norm": 10.851624721406388, + "learning_rate": 1.7286569318742962e-06, + "loss": 0.9978, + "step": 8643 + }, + { + "epoch": 1.2237559283641255, + "grad_norm": 9.88903776573479, + "learning_rate": 1.728111717612202e-06, + "loss": 1.0194, + "step": 8644 + }, + { + "epoch": 1.2238975012387627, + "grad_norm": 11.788957659822099, + "learning_rate": 1.727566543927699e-06, + "loss": 1.0137, + "step": 8645 + }, + { + "epoch": 1.2240390741134, + "grad_norm": 9.825769090519827, + "learning_rate": 1.7270214108494469e-06, + "loss": 0.911, + "step": 8646 + }, + { + "epoch": 1.224180646988037, + "grad_norm": 9.66785274551616, + "learning_rate": 1.726476318406104e-06, + "loss": 0.9926, + "step": 8647 + }, + { + "epoch": 1.2243222198626742, + "grad_norm": 9.536038756016568, + "learning_rate": 1.7259312666263235e-06, + "loss": 1.1147, + "step": 8648 + }, + { + "epoch": 1.2244637927373114, + "grad_norm": 10.841015209785871, + "learning_rate": 1.7253862555387587e-06, + "loss": 1.1747, + "step": 8649 + }, + { + "epoch": 1.2246053656119487, + "grad_norm": 9.420405183084036, + "learning_rate": 1.7248412851720613e-06, + "loss": 0.9412, + "step": 8650 + }, + { + "epoch": 1.224746938486586, + "grad_norm": 8.763618117104453, + "learning_rate": 1.7242963555548794e-06, + "loss": 1.0978, + "step": 8651 + }, + { + "epoch": 1.2248885113612231, + "grad_norm": 8.739042560152917, + "learning_rate": 1.7237514667158598e-06, + "loss": 0.9239, + "step": 8652 + }, + { + "epoch": 1.2250300842358604, + "grad_norm": 9.570629379320286, + "learning_rate": 1.723206618683646e-06, + "loss": 1.0463, + "step": 8653 + }, + { + "epoch": 1.2251716571104976, + "grad_norm": 8.831206251550343, + "learning_rate": 1.722661811486882e-06, + "loss": 1.0051, + "step": 8654 + }, + { + "epoch": 1.2253132299851348, + "grad_norm": 8.658091875906644, + "learning_rate": 1.7221170451542067e-06, + "loss": 1.0169, + "step": 8655 + }, + { + "epoch": 1.225454802859772, + "grad_norm": 7.5333816234199915, + "learning_rate": 1.721572319714258e-06, + "loss": 1.0178, + "step": 8656 + }, + { + "epoch": 1.2255963757344093, + "grad_norm": 11.068206177528554, + "learning_rate": 1.7210276351956736e-06, + "loss": 1.0446, + "step": 8657 + }, + { + "epoch": 1.2257379486090465, + "grad_norm": 7.474674444859831, + "learning_rate": 1.7204829916270842e-06, + "loss": 1.0098, + "step": 8658 + }, + { + "epoch": 1.2258795214836837, + "grad_norm": 9.359707579650259, + "learning_rate": 1.7199383890371228e-06, + "loss": 1.0338, + "step": 8659 + }, + { + "epoch": 1.226021094358321, + "grad_norm": 7.939195346753111, + "learning_rate": 1.7193938274544187e-06, + "loss": 0.9867, + "step": 8660 + }, + { + "epoch": 1.2261626672329582, + "grad_norm": 11.662170052230778, + "learning_rate": 1.718849306907599e-06, + "loss": 1.0529, + "step": 8661 + }, + { + "epoch": 1.2263042401075954, + "grad_norm": 10.208979704238935, + "learning_rate": 1.7183048274252889e-06, + "loss": 1.1053, + "step": 8662 + }, + { + "epoch": 1.2264458129822327, + "grad_norm": 11.355073011895113, + "learning_rate": 1.717760389036111e-06, + "loss": 1.0011, + "step": 8663 + }, + { + "epoch": 1.2265873858568699, + "grad_norm": 10.777964923540361, + "learning_rate": 1.7172159917686866e-06, + "loss": 1.1294, + "step": 8664 + }, + { + "epoch": 1.2267289587315071, + "grad_norm": 10.705588220206538, + "learning_rate": 1.7166716356516334e-06, + "loss": 0.9895, + "step": 8665 + }, + { + "epoch": 
1.2268705316061443, + "grad_norm": 8.887251014279968, + "learning_rate": 1.716127320713568e-06, + "loss": 0.9558, + "step": 8666 + }, + { + "epoch": 1.2270121044807816, + "grad_norm": 11.153825768975771, + "learning_rate": 1.7155830469831057e-06, + "loss": 1.0899, + "step": 8667 + }, + { + "epoch": 1.2271536773554188, + "grad_norm": 9.15444293108095, + "learning_rate": 1.7150388144888577e-06, + "loss": 1.0003, + "step": 8668 + }, + { + "epoch": 1.227295250230056, + "grad_norm": 8.376280729382335, + "learning_rate": 1.7144946232594334e-06, + "loss": 0.9502, + "step": 8669 + }, + { + "epoch": 1.2274368231046933, + "grad_norm": 9.785638476765286, + "learning_rate": 1.7139504733234413e-06, + "loss": 1.0415, + "step": 8670 + }, + { + "epoch": 1.2275783959793303, + "grad_norm": 9.496358474688344, + "learning_rate": 1.7134063647094866e-06, + "loss": 0.9669, + "step": 8671 + }, + { + "epoch": 1.2277199688539675, + "grad_norm": 10.285438496056079, + "learning_rate": 1.7128622974461728e-06, + "loss": 1.0216, + "step": 8672 + }, + { + "epoch": 1.2278615417286047, + "grad_norm": 9.75535983925051, + "learning_rate": 1.7123182715621012e-06, + "loss": 1.0014, + "step": 8673 + }, + { + "epoch": 1.228003114603242, + "grad_norm": 7.941786682353525, + "learning_rate": 1.7117742870858706e-06, + "loss": 0.9778, + "step": 8674 + }, + { + "epoch": 1.2281446874778792, + "grad_norm": 11.374412437606667, + "learning_rate": 1.7112303440460775e-06, + "loss": 1.1064, + "step": 8675 + }, + { + "epoch": 1.2282862603525164, + "grad_norm": 8.1556391578614, + "learning_rate": 1.7106864424713177e-06, + "loss": 0.989, + "step": 8676 + }, + { + "epoch": 1.2284278332271537, + "grad_norm": 10.749336044743975, + "learning_rate": 1.710142582390183e-06, + "loss": 1.0834, + "step": 8677 + }, + { + "epoch": 1.2285694061017909, + "grad_norm": 11.605226168156419, + "learning_rate": 1.709598763831264e-06, + "loss": 1.1054, + "step": 8678 + }, + { + "epoch": 1.2287109789764281, + "grad_norm": 9.758085370734316, + "learning_rate": 1.7090549868231492e-06, + "loss": 1.0601, + "step": 8679 + }, + { + "epoch": 1.2288525518510653, + "grad_norm": 9.336006962346191, + "learning_rate": 1.7085112513944235e-06, + "loss": 1.0468, + "step": 8680 + }, + { + "epoch": 1.2289941247257026, + "grad_norm": 8.821702845883793, + "learning_rate": 1.7079675575736704e-06, + "loss": 0.9766, + "step": 8681 + }, + { + "epoch": 1.2291356976003398, + "grad_norm": 9.93721554906868, + "learning_rate": 1.7074239053894725e-06, + "loss": 0.9999, + "step": 8682 + }, + { + "epoch": 1.229277270474977, + "grad_norm": 8.131012555231965, + "learning_rate": 1.7068802948704094e-06, + "loss": 0.9935, + "step": 8683 + }, + { + "epoch": 1.2294188433496143, + "grad_norm": 7.9759558317134225, + "learning_rate": 1.7063367260450576e-06, + "loss": 1.0255, + "step": 8684 + }, + { + "epoch": 1.2295604162242515, + "grad_norm": 9.014266680333996, + "learning_rate": 1.7057931989419923e-06, + "loss": 0.9762, + "step": 8685 + }, + { + "epoch": 1.2297019890988887, + "grad_norm": 8.787499426981444, + "learning_rate": 1.705249713589786e-06, + "loss": 0.9686, + "step": 8686 + }, + { + "epoch": 1.229843561973526, + "grad_norm": 10.1570515008259, + "learning_rate": 1.7047062700170104e-06, + "loss": 1.0672, + "step": 8687 + }, + { + "epoch": 1.2299851348481632, + "grad_norm": 10.057198022096616, + "learning_rate": 1.7041628682522326e-06, + "loss": 1.0043, + "step": 8688 + }, + { + "epoch": 1.2301267077228002, + "grad_norm": 9.919647879965297, + "learning_rate": 1.7036195083240203e-06, + "loss": 
1.0074, + "step": 8689 + }, + { + "epoch": 1.2302682805974374, + "grad_norm": 9.245708810507251, + "learning_rate": 1.703076190260936e-06, + "loss": 0.9921, + "step": 8690 + }, + { + "epoch": 1.2304098534720747, + "grad_norm": 12.81088916608, + "learning_rate": 1.702532914091542e-06, + "loss": 1.0534, + "step": 8691 + }, + { + "epoch": 1.2305514263467119, + "grad_norm": 7.594916611214295, + "learning_rate": 1.7019896798443984e-06, + "loss": 0.9018, + "step": 8692 + }, + { + "epoch": 1.2306929992213491, + "grad_norm": 9.34364017689953, + "learning_rate": 1.7014464875480618e-06, + "loss": 1.0415, + "step": 8693 + }, + { + "epoch": 1.2308345720959863, + "grad_norm": 9.160263692067119, + "learning_rate": 1.7009033372310884e-06, + "loss": 1.1074, + "step": 8694 + }, + { + "epoch": 1.2309761449706236, + "grad_norm": 9.367250622050665, + "learning_rate": 1.7003602289220305e-06, + "loss": 1.023, + "step": 8695 + }, + { + "epoch": 1.2311177178452608, + "grad_norm": 8.565882996017681, + "learning_rate": 1.6998171626494392e-06, + "loss": 0.9498, + "step": 8696 + }, + { + "epoch": 1.231259290719898, + "grad_norm": 9.504865103320732, + "learning_rate": 1.6992741384418632e-06, + "loss": 1.0849, + "step": 8697 + }, + { + "epoch": 1.2314008635945353, + "grad_norm": 8.96203063842467, + "learning_rate": 1.698731156327848e-06, + "loss": 1.0155, + "step": 8698 + }, + { + "epoch": 1.2315424364691725, + "grad_norm": 11.649051192125384, + "learning_rate": 1.6981882163359391e-06, + "loss": 1.1641, + "step": 8699 + }, + { + "epoch": 1.2316840093438097, + "grad_norm": 9.462701760563682, + "learning_rate": 1.6976453184946786e-06, + "loss": 1.0021, + "step": 8700 + }, + { + "epoch": 1.231825582218447, + "grad_norm": 9.154689616801898, + "learning_rate": 1.6971024628326046e-06, + "loss": 1.0303, + "step": 8701 + }, + { + "epoch": 1.2319671550930842, + "grad_norm": 8.423479652759257, + "learning_rate": 1.6965596493782555e-06, + "loss": 0.9856, + "step": 8702 + }, + { + "epoch": 1.2321087279677214, + "grad_norm": 8.319599074460598, + "learning_rate": 1.6960168781601665e-06, + "loss": 0.9093, + "step": 8703 + }, + { + "epoch": 1.2322503008423586, + "grad_norm": 10.187040166481902, + "learning_rate": 1.6954741492068698e-06, + "loss": 0.9491, + "step": 8704 + }, + { + "epoch": 1.2323918737169959, + "grad_norm": 8.770810910254394, + "learning_rate": 1.6949314625468985e-06, + "loss": 1.0696, + "step": 8705 + }, + { + "epoch": 1.232533446591633, + "grad_norm": 10.371032323852932, + "learning_rate": 1.6943888182087796e-06, + "loss": 1.0399, + "step": 8706 + }, + { + "epoch": 1.2326750194662703, + "grad_norm": 10.821075743049597, + "learning_rate": 1.6938462162210395e-06, + "loss": 0.9808, + "step": 8707 + }, + { + "epoch": 1.2328165923409076, + "grad_norm": 9.475562537411202, + "learning_rate": 1.6933036566122029e-06, + "loss": 0.991, + "step": 8708 + }, + { + "epoch": 1.2329581652155448, + "grad_norm": 10.289991564214304, + "learning_rate": 1.6927611394107918e-06, + "loss": 1.0875, + "step": 8709 + }, + { + "epoch": 1.233099738090182, + "grad_norm": 8.04447732407025, + "learning_rate": 1.6922186646453263e-06, + "loss": 0.8924, + "step": 8710 + }, + { + "epoch": 1.2332413109648193, + "grad_norm": 11.623840930540801, + "learning_rate": 1.6916762323443225e-06, + "loss": 1.0923, + "step": 8711 + }, + { + "epoch": 1.2333828838394563, + "grad_norm": 7.691024344568804, + "learning_rate": 1.6911338425362967e-06, + "loss": 0.9438, + "step": 8712 + }, + { + "epoch": 1.2335244567140935, + "grad_norm": 8.834803284990144, + 
"learning_rate": 1.6905914952497616e-06, + "loss": 0.8803, + "step": 8713 + }, + { + "epoch": 1.2336660295887307, + "grad_norm": 9.010364075141556, + "learning_rate": 1.6900491905132277e-06, + "loss": 0.9326, + "step": 8714 + }, + { + "epoch": 1.233807602463368, + "grad_norm": 11.318523678973525, + "learning_rate": 1.689506928355204e-06, + "loss": 1.093, + "step": 8715 + }, + { + "epoch": 1.2339491753380052, + "grad_norm": 9.593085946327845, + "learning_rate": 1.6889647088041972e-06, + "loss": 1.1622, + "step": 8716 + }, + { + "epoch": 1.2340907482126424, + "grad_norm": 8.991357256279978, + "learning_rate": 1.6884225318887107e-06, + "loss": 1.0424, + "step": 8717 + }, + { + "epoch": 1.2342323210872796, + "grad_norm": 9.399695663395338, + "learning_rate": 1.6878803976372465e-06, + "loss": 1.0407, + "step": 8718 + }, + { + "epoch": 1.2343738939619169, + "grad_norm": 9.167272605520454, + "learning_rate": 1.6873383060783043e-06, + "loss": 0.9277, + "step": 8719 + }, + { + "epoch": 1.234515466836554, + "grad_norm": 8.563188733990703, + "learning_rate": 1.6867962572403811e-06, + "loss": 0.9512, + "step": 8720 + }, + { + "epoch": 1.2346570397111913, + "grad_norm": 9.27859325467199, + "learning_rate": 1.6862542511519734e-06, + "loss": 0.9198, + "step": 8721 + }, + { + "epoch": 1.2347986125858286, + "grad_norm": 9.083289563368119, + "learning_rate": 1.6857122878415721e-06, + "loss": 0.8937, + "step": 8722 + }, + { + "epoch": 1.2349401854604658, + "grad_norm": 9.132737732932833, + "learning_rate": 1.6851703673376688e-06, + "loss": 1.065, + "step": 8723 + }, + { + "epoch": 1.235081758335103, + "grad_norm": 8.67471316520236, + "learning_rate": 1.6846284896687514e-06, + "loss": 1.0153, + "step": 8724 + }, + { + "epoch": 1.2352233312097403, + "grad_norm": 9.02485065706487, + "learning_rate": 1.6840866548633068e-06, + "loss": 0.9634, + "step": 8725 + }, + { + "epoch": 1.2353649040843775, + "grad_norm": 9.17366283349867, + "learning_rate": 1.6835448629498182e-06, + "loss": 0.997, + "step": 8726 + }, + { + "epoch": 1.2355064769590147, + "grad_norm": 9.290059952886345, + "learning_rate": 1.683003113956767e-06, + "loss": 1.0071, + "step": 8727 + }, + { + "epoch": 1.235648049833652, + "grad_norm": 10.389542280878034, + "learning_rate": 1.6824614079126334e-06, + "loss": 1.0603, + "step": 8728 + }, + { + "epoch": 1.2357896227082892, + "grad_norm": 9.325243941074511, + "learning_rate": 1.6819197448458935e-06, + "loss": 1.0291, + "step": 8729 + }, + { + "epoch": 1.2359311955829262, + "grad_norm": 11.063957161490144, + "learning_rate": 1.681378124785023e-06, + "loss": 1.0239, + "step": 8730 + }, + { + "epoch": 1.2360727684575634, + "grad_norm": 8.260936077731815, + "learning_rate": 1.6808365477584953e-06, + "loss": 1.1004, + "step": 8731 + }, + { + "epoch": 1.2362143413322007, + "grad_norm": 9.568194918787254, + "learning_rate": 1.6802950137947783e-06, + "loss": 1.0066, + "step": 8732 + }, + { + "epoch": 1.2363559142068379, + "grad_norm": 10.772921233777758, + "learning_rate": 1.6797535229223405e-06, + "loss": 1.0425, + "step": 8733 + }, + { + "epoch": 1.236497487081475, + "grad_norm": 10.869757079048249, + "learning_rate": 1.6792120751696495e-06, + "loss": 1.0203, + "step": 8734 + }, + { + "epoch": 1.2366390599561123, + "grad_norm": 8.321681408809553, + "learning_rate": 1.678670670565167e-06, + "loss": 0.9615, + "step": 8735 + }, + { + "epoch": 1.2367806328307496, + "grad_norm": 10.819762861152828, + "learning_rate": 1.678129309137355e-06, + "loss": 0.9188, + "step": 8736 + }, + { + "epoch": 1.2369222057053868, + 
"grad_norm": 7.745114170757356, + "learning_rate": 1.677587990914673e-06, + "loss": 0.9706, + "step": 8737 + }, + { + "epoch": 1.237063778580024, + "grad_norm": 8.183139251251387, + "learning_rate": 1.6770467159255768e-06, + "loss": 0.9432, + "step": 8738 + }, + { + "epoch": 1.2372053514546613, + "grad_norm": 9.840091861203028, + "learning_rate": 1.6765054841985212e-06, + "loss": 1.0281, + "step": 8739 + }, + { + "epoch": 1.2373469243292985, + "grad_norm": 9.792776318163206, + "learning_rate": 1.6759642957619581e-06, + "loss": 1.0371, + "step": 8740 + }, + { + "epoch": 1.2374884972039357, + "grad_norm": 9.637561082460856, + "learning_rate": 1.6754231506443375e-06, + "loss": 0.9265, + "step": 8741 + }, + { + "epoch": 1.237630070078573, + "grad_norm": 9.992718525153665, + "learning_rate": 1.6748820488741077e-06, + "loss": 1.0228, + "step": 8742 + }, + { + "epoch": 1.2377716429532102, + "grad_norm": 7.957871615079616, + "learning_rate": 1.674340990479713e-06, + "loss": 0.9177, + "step": 8743 + }, + { + "epoch": 1.2379132158278474, + "grad_norm": 9.587997053608225, + "learning_rate": 1.6737999754895965e-06, + "loss": 1.0801, + "step": 8744 + }, + { + "epoch": 1.2380547887024846, + "grad_norm": 8.074687410805854, + "learning_rate": 1.6732590039321993e-06, + "loss": 0.8634, + "step": 8745 + }, + { + "epoch": 1.2381963615771219, + "grad_norm": 8.604611389144958, + "learning_rate": 1.6727180758359598e-06, + "loss": 1.0435, + "step": 8746 + }, + { + "epoch": 1.238337934451759, + "grad_norm": 9.899258718770428, + "learning_rate": 1.6721771912293145e-06, + "loss": 1.0139, + "step": 8747 + }, + { + "epoch": 1.2384795073263963, + "grad_norm": 8.998889006812652, + "learning_rate": 1.6716363501406966e-06, + "loss": 0.9449, + "step": 8748 + }, + { + "epoch": 1.2386210802010336, + "grad_norm": 11.056711870188673, + "learning_rate": 1.6710955525985384e-06, + "loss": 1.076, + "step": 8749 + }, + { + "epoch": 1.2387626530756708, + "grad_norm": 8.986925164569593, + "learning_rate": 1.6705547986312681e-06, + "loss": 0.9852, + "step": 8750 + }, + { + "epoch": 1.238904225950308, + "grad_norm": 9.850911565900855, + "learning_rate": 1.6700140882673145e-06, + "loss": 1.1623, + "step": 8751 + }, + { + "epoch": 1.2390457988249453, + "grad_norm": 7.289272358480871, + "learning_rate": 1.669473421535101e-06, + "loss": 1.0276, + "step": 8752 + }, + { + "epoch": 1.2391873716995823, + "grad_norm": 9.180777029757186, + "learning_rate": 1.668932798463052e-06, + "loss": 1.0978, + "step": 8753 + }, + { + "epoch": 1.2393289445742195, + "grad_norm": 10.79965240484293, + "learning_rate": 1.668392219079585e-06, + "loss": 0.9198, + "step": 8754 + }, + { + "epoch": 1.2394705174488567, + "grad_norm": 9.233302123456555, + "learning_rate": 1.6678516834131184e-06, + "loss": 0.997, + "step": 8755 + }, + { + "epoch": 1.239612090323494, + "grad_norm": 8.296136108370769, + "learning_rate": 1.667311191492068e-06, + "loss": 1.0147, + "step": 8756 + }, + { + "epoch": 1.2397536631981312, + "grad_norm": 7.944821562926888, + "learning_rate": 1.6667707433448482e-06, + "loss": 0.976, + "step": 8757 + }, + { + "epoch": 1.2398952360727684, + "grad_norm": 9.926511050871117, + "learning_rate": 1.666230338999869e-06, + "loss": 1.0933, + "step": 8758 + }, + { + "epoch": 1.2400368089474056, + "grad_norm": 8.549685056362502, + "learning_rate": 1.6656899784855393e-06, + "loss": 1.019, + "step": 8759 + }, + { + "epoch": 1.2401783818220429, + "grad_norm": 10.862535722013149, + "learning_rate": 1.6651496618302653e-06, + "loss": 1.0167, + "step": 8760 + }, + { + 
"epoch": 1.24031995469668, + "grad_norm": 8.11850843062835, + "learning_rate": 1.6646093890624509e-06, + "loss": 1.0101, + "step": 8761 + }, + { + "epoch": 1.2404615275713173, + "grad_norm": 8.572726414834067, + "learning_rate": 1.6640691602104983e-06, + "loss": 1.0303, + "step": 8762 + }, + { + "epoch": 1.2406031004459546, + "grad_norm": 7.1063569422368955, + "learning_rate": 1.6635289753028073e-06, + "loss": 1.0061, + "step": 8763 + }, + { + "epoch": 1.2407446733205918, + "grad_norm": 9.088075105920911, + "learning_rate": 1.6629888343677734e-06, + "loss": 0.9873, + "step": 8764 + }, + { + "epoch": 1.240886246195229, + "grad_norm": 8.616104182386508, + "learning_rate": 1.6624487374337925e-06, + "loss": 0.8955, + "step": 8765 + }, + { + "epoch": 1.2410278190698663, + "grad_norm": 10.33086964760379, + "learning_rate": 1.661908684529257e-06, + "loss": 1.0941, + "step": 8766 + }, + { + "epoch": 1.2411693919445035, + "grad_norm": 8.174448107834301, + "learning_rate": 1.661368675682557e-06, + "loss": 1.0201, + "step": 8767 + }, + { + "epoch": 1.2413109648191407, + "grad_norm": 8.129361434576788, + "learning_rate": 1.6608287109220805e-06, + "loss": 1.0281, + "step": 8768 + }, + { + "epoch": 1.241452537693778, + "grad_norm": 9.397235593737086, + "learning_rate": 1.6602887902762132e-06, + "loss": 1.0707, + "step": 8769 + }, + { + "epoch": 1.2415941105684152, + "grad_norm": 9.50299025204534, + "learning_rate": 1.6597489137733377e-06, + "loss": 0.8212, + "step": 8770 + }, + { + "epoch": 1.2417356834430524, + "grad_norm": 8.399652891707602, + "learning_rate": 1.6592090814418354e-06, + "loss": 0.8712, + "step": 8771 + }, + { + "epoch": 1.2418772563176894, + "grad_norm": 8.10934207152515, + "learning_rate": 1.6586692933100846e-06, + "loss": 1.0564, + "step": 8772 + }, + { + "epoch": 1.2420188291923266, + "grad_norm": 8.815467280802887, + "learning_rate": 1.6581295494064615e-06, + "loss": 1.0332, + "step": 8773 + }, + { + "epoch": 1.2421604020669639, + "grad_norm": 8.369202386367794, + "learning_rate": 1.6575898497593417e-06, + "loss": 0.9394, + "step": 8774 + }, + { + "epoch": 1.242301974941601, + "grad_norm": 10.03880410659681, + "learning_rate": 1.6570501943970945e-06, + "loss": 0.9963, + "step": 8775 + }, + { + "epoch": 1.2424435478162383, + "grad_norm": 8.561945319482518, + "learning_rate": 1.65651058334809e-06, + "loss": 0.9828, + "step": 8776 + }, + { + "epoch": 1.2425851206908756, + "grad_norm": 9.144326424046318, + "learning_rate": 1.655971016640695e-06, + "loss": 0.959, + "step": 8777 + }, + { + "epoch": 1.2427266935655128, + "grad_norm": 10.274340502139601, + "learning_rate": 1.655431494303274e-06, + "loss": 0.9216, + "step": 8778 + }, + { + "epoch": 1.24286826644015, + "grad_norm": 7.7529411887628825, + "learning_rate": 1.65489201636419e-06, + "loss": 0.9195, + "step": 8779 + }, + { + "epoch": 1.2430098393147873, + "grad_norm": 9.348006793860732, + "learning_rate": 1.6543525828518025e-06, + "loss": 1.0151, + "step": 8780 + }, + { + "epoch": 1.2431514121894245, + "grad_norm": 10.93981943877628, + "learning_rate": 1.6538131937944693e-06, + "loss": 0.9956, + "step": 8781 + }, + { + "epoch": 1.2432929850640617, + "grad_norm": 9.474341425491133, + "learning_rate": 1.6532738492205456e-06, + "loss": 1.0112, + "step": 8782 + }, + { + "epoch": 1.243434557938699, + "grad_norm": 8.99863211515215, + "learning_rate": 1.652734549158384e-06, + "loss": 1.0026, + "step": 8783 + }, + { + "epoch": 1.2435761308133362, + "grad_norm": 9.639224551974028, + "learning_rate": 1.652195293636336e-06, + "loss": 1.0355, 
+ "step": 8784 + }, + { + "epoch": 1.2437177036879734, + "grad_norm": 13.708068164861889, + "learning_rate": 1.6516560826827494e-06, + "loss": 1.0775, + "step": 8785 + }, + { + "epoch": 1.2438592765626106, + "grad_norm": 9.340270143968251, + "learning_rate": 1.6511169163259693e-06, + "loss": 1.0214, + "step": 8786 + }, + { + "epoch": 1.2440008494372479, + "grad_norm": 10.622720271856974, + "learning_rate": 1.6505777945943402e-06, + "loss": 1.0345, + "step": 8787 + }, + { + "epoch": 1.244142422311885, + "grad_norm": 8.936779926876671, + "learning_rate": 1.650038717516203e-06, + "loss": 1.0001, + "step": 8788 + }, + { + "epoch": 1.2442839951865223, + "grad_norm": 9.117679115253534, + "learning_rate": 1.6494996851198965e-06, + "loss": 0.9364, + "step": 8789 + }, + { + "epoch": 1.2444255680611596, + "grad_norm": 9.641461847669087, + "learning_rate": 1.6489606974337574e-06, + "loss": 0.9957, + "step": 8790 + }, + { + "epoch": 1.2445671409357968, + "grad_norm": 9.989819112520452, + "learning_rate": 1.6484217544861204e-06, + "loss": 1.0333, + "step": 8791 + }, + { + "epoch": 1.244708713810434, + "grad_norm": 8.28714136696924, + "learning_rate": 1.6478828563053162e-06, + "loss": 1.1241, + "step": 8792 + }, + { + "epoch": 1.2448502866850713, + "grad_norm": 9.942063921025937, + "learning_rate": 1.6473440029196752e-06, + "loss": 1.0479, + "step": 8793 + }, + { + "epoch": 1.2449918595597085, + "grad_norm": 10.453668007196685, + "learning_rate": 1.6468051943575242e-06, + "loss": 1.099, + "step": 8794 + }, + { + "epoch": 1.2451334324343455, + "grad_norm": 9.438532286517477, + "learning_rate": 1.6462664306471882e-06, + "loss": 1.0963, + "step": 8795 + }, + { + "epoch": 1.2452750053089827, + "grad_norm": 10.006041037231935, + "learning_rate": 1.6457277118169893e-06, + "loss": 1.156, + "step": 8796 + }, + { + "epoch": 1.24541657818362, + "grad_norm": 7.823881114391928, + "learning_rate": 1.6451890378952472e-06, + "loss": 0.9324, + "step": 8797 + }, + { + "epoch": 1.2455581510582572, + "grad_norm": 8.085876169755334, + "learning_rate": 1.6446504089102803e-06, + "loss": 0.9447, + "step": 8798 + }, + { + "epoch": 1.2456997239328944, + "grad_norm": 10.008657522506804, + "learning_rate": 1.6441118248904038e-06, + "loss": 0.9152, + "step": 8799 + }, + { + "epoch": 1.2458412968075316, + "grad_norm": 9.523822429284934, + "learning_rate": 1.6435732858639298e-06, + "loss": 1.0813, + "step": 8800 + }, + { + "epoch": 1.2459828696821689, + "grad_norm": 10.861174114680884, + "learning_rate": 1.6430347918591693e-06, + "loss": 1.0717, + "step": 8801 + }, + { + "epoch": 1.246124442556806, + "grad_norm": 9.142495718692386, + "learning_rate": 1.6424963429044315e-06, + "loss": 1.0366, + "step": 8802 + }, + { + "epoch": 1.2462660154314433, + "grad_norm": 7.8440636803873955, + "learning_rate": 1.6419579390280217e-06, + "loss": 0.9698, + "step": 8803 + }, + { + "epoch": 1.2464075883060806, + "grad_norm": 9.227006945132715, + "learning_rate": 1.6414195802582434e-06, + "loss": 0.9781, + "step": 8804 + }, + { + "epoch": 1.2465491611807178, + "grad_norm": 8.921703971653201, + "learning_rate": 1.640881266623397e-06, + "loss": 1.0156, + "step": 8805 + }, + { + "epoch": 1.246690734055355, + "grad_norm": 8.020624754204245, + "learning_rate": 1.6403429981517831e-06, + "loss": 0.8817, + "step": 8806 + }, + { + "epoch": 1.2468323069299923, + "grad_norm": 9.447501232654888, + "learning_rate": 1.6398047748716955e-06, + "loss": 1.0614, + "step": 8807 + }, + { + "epoch": 1.2469738798046295, + "grad_norm": 8.844165805706501, + "learning_rate": 
1.6392665968114297e-06, + "loss": 1.0103, + "step": 8808 + }, + { + "epoch": 1.2471154526792667, + "grad_norm": 8.223987607479035, + "learning_rate": 1.6387284639992773e-06, + "loss": 0.9303, + "step": 8809 + }, + { + "epoch": 1.247257025553904, + "grad_norm": 9.946945212233542, + "learning_rate": 1.6381903764635274e-06, + "loss": 1.0362, + "step": 8810 + }, + { + "epoch": 1.2473985984285412, + "grad_norm": 10.501276574416218, + "learning_rate": 1.6376523342324668e-06, + "loss": 1.102, + "step": 8811 + }, + { + "epoch": 1.2475401713031784, + "grad_norm": 10.67203656768253, + "learning_rate": 1.6371143373343798e-06, + "loss": 1.0517, + "step": 8812 + }, + { + "epoch": 1.2476817441778154, + "grad_norm": 8.919885737623863, + "learning_rate": 1.6365763857975486e-06, + "loss": 1.1911, + "step": 8813 + }, + { + "epoch": 1.2478233170524526, + "grad_norm": 10.028873340314664, + "learning_rate": 1.6360384796502532e-06, + "loss": 1.0533, + "step": 8814 + }, + { + "epoch": 1.2479648899270899, + "grad_norm": 9.900718709115578, + "learning_rate": 1.635500618920771e-06, + "loss": 0.9456, + "step": 8815 + }, + { + "epoch": 1.248106462801727, + "grad_norm": 8.200680904495876, + "learning_rate": 1.634962803637377e-06, + "loss": 0.9934, + "step": 8816 + }, + { + "epoch": 1.2482480356763643, + "grad_norm": 10.359654119307251, + "learning_rate": 1.6344250338283426e-06, + "loss": 0.9905, + "step": 8817 + }, + { + "epoch": 1.2483896085510016, + "grad_norm": 8.130479578791915, + "learning_rate": 1.6338873095219391e-06, + "loss": 1.034, + "step": 8818 + }, + { + "epoch": 1.2485311814256388, + "grad_norm": 10.92531059050604, + "learning_rate": 1.6333496307464335e-06, + "loss": 1.1075, + "step": 8819 + }, + { + "epoch": 1.248672754300276, + "grad_norm": 10.057485906831452, + "learning_rate": 1.6328119975300921e-06, + "loss": 0.8949, + "step": 8820 + }, + { + "epoch": 1.2488143271749133, + "grad_norm": 9.401855658940047, + "learning_rate": 1.6322744099011772e-06, + "loss": 0.9819, + "step": 8821 + }, + { + "epoch": 1.2489559000495505, + "grad_norm": 9.37270113416047, + "learning_rate": 1.6317368678879497e-06, + "loss": 1.0554, + "step": 8822 + }, + { + "epoch": 1.2490974729241877, + "grad_norm": 8.010880700317948, + "learning_rate": 1.6311993715186674e-06, + "loss": 1.05, + "step": 8823 + }, + { + "epoch": 1.249239045798825, + "grad_norm": 10.601424326660368, + "learning_rate": 1.6306619208215862e-06, + "loss": 1.0944, + "step": 8824 + }, + { + "epoch": 1.2493806186734622, + "grad_norm": 9.764888644113336, + "learning_rate": 1.6301245158249599e-06, + "loss": 1.0645, + "step": 8825 + }, + { + "epoch": 1.2495221915480994, + "grad_norm": 10.293905243806018, + "learning_rate": 1.6295871565570392e-06, + "loss": 1.0033, + "step": 8826 + }, + { + "epoch": 1.2496637644227366, + "grad_norm": 6.776894854099877, + "learning_rate": 1.6290498430460736e-06, + "loss": 0.9759, + "step": 8827 + }, + { + "epoch": 1.2498053372973739, + "grad_norm": 10.10317789070404, + "learning_rate": 1.6285125753203073e-06, + "loss": 1.0398, + "step": 8828 + }, + { + "epoch": 1.249946910172011, + "grad_norm": 8.975861285438278, + "learning_rate": 1.6279753534079853e-06, + "loss": 1.0212, + "step": 8829 + }, + { + "epoch": 1.2500884830466483, + "grad_norm": 8.748991117897805, + "learning_rate": 1.6274381773373482e-06, + "loss": 0.8805, + "step": 8830 + }, + { + "epoch": 1.2502300559212856, + "grad_norm": 10.321195138732572, + "learning_rate": 1.6269010471366359e-06, + "loss": 1.1054, + "step": 8831 + }, + { + "epoch": 1.2503716287959228, + 
"grad_norm": 9.972636741681859, + "learning_rate": 1.6263639628340847e-06, + "loss": 1.2093, + "step": 8832 + }, + { + "epoch": 1.25051320167056, + "grad_norm": 10.681455096155739, + "learning_rate": 1.6258269244579283e-06, + "loss": 1.0249, + "step": 8833 + }, + { + "epoch": 1.2506547745451972, + "grad_norm": 8.826105963808445, + "learning_rate": 1.6252899320363992e-06, + "loss": 1.033, + "step": 8834 + }, + { + "epoch": 1.2507963474198345, + "grad_norm": 9.1868717731377, + "learning_rate": 1.6247529855977256e-06, + "loss": 1.0192, + "step": 8835 + }, + { + "epoch": 1.2509379202944717, + "grad_norm": 9.380442349347582, + "learning_rate": 1.6242160851701353e-06, + "loss": 1.0257, + "step": 8836 + }, + { + "epoch": 1.251079493169109, + "grad_norm": 9.528161320753174, + "learning_rate": 1.6236792307818528e-06, + "loss": 1.0383, + "step": 8837 + }, + { + "epoch": 1.251221066043746, + "grad_norm": 9.610224095489519, + "learning_rate": 1.6231424224610992e-06, + "loss": 0.9312, + "step": 8838 + }, + { + "epoch": 1.2513626389183832, + "grad_norm": 7.098932062344395, + "learning_rate": 1.6226056602360945e-06, + "loss": 1.0178, + "step": 8839 + }, + { + "epoch": 1.2515042117930204, + "grad_norm": 10.09002257693348, + "learning_rate": 1.6220689441350561e-06, + "loss": 1.0993, + "step": 8840 + }, + { + "epoch": 1.2516457846676576, + "grad_norm": 9.286028418624612, + "learning_rate": 1.6215322741861988e-06, + "loss": 1.137, + "step": 8841 + }, + { + "epoch": 1.2517873575422949, + "grad_norm": 8.553356774353272, + "learning_rate": 1.6209956504177345e-06, + "loss": 1.0672, + "step": 8842 + }, + { + "epoch": 1.251928930416932, + "grad_norm": 10.283585767156744, + "learning_rate": 1.6204590728578739e-06, + "loss": 1.0152, + "step": 8843 + }, + { + "epoch": 1.2520705032915693, + "grad_norm": 10.633817716306444, + "learning_rate": 1.6199225415348239e-06, + "loss": 1.0604, + "step": 8844 + }, + { + "epoch": 1.2522120761662066, + "grad_norm": 10.227642704600544, + "learning_rate": 1.6193860564767893e-06, + "loss": 0.946, + "step": 8845 + }, + { + "epoch": 1.2523536490408438, + "grad_norm": 9.056571757811811, + "learning_rate": 1.6188496177119737e-06, + "loss": 1.0804, + "step": 8846 + }, + { + "epoch": 1.252495221915481, + "grad_norm": 9.372055608593367, + "learning_rate": 1.6183132252685758e-06, + "loss": 1.1075, + "step": 8847 + }, + { + "epoch": 1.2526367947901182, + "grad_norm": 10.99200808033653, + "learning_rate": 1.6177768791747957e-06, + "loss": 1.1096, + "step": 8848 + }, + { + "epoch": 1.2527783676647555, + "grad_norm": 10.292672997411344, + "learning_rate": 1.6172405794588264e-06, + "loss": 0.9507, + "step": 8849 + }, + { + "epoch": 1.2529199405393927, + "grad_norm": 10.609561969081534, + "learning_rate": 1.616704326148862e-06, + "loss": 1.0453, + "step": 8850 + }, + { + "epoch": 1.25306151341403, + "grad_norm": 8.939633414926012, + "learning_rate": 1.6161681192730918e-06, + "loss": 0.903, + "step": 8851 + }, + { + "epoch": 1.2532030862886672, + "grad_norm": 8.703174967596686, + "learning_rate": 1.615631958859705e-06, + "loss": 0.9911, + "step": 8852 + }, + { + "epoch": 1.2533446591633042, + "grad_norm": 10.71578751710841, + "learning_rate": 1.6150958449368862e-06, + "loss": 1.0148, + "step": 8853 + }, + { + "epoch": 1.2534862320379414, + "grad_norm": 9.731651059744838, + "learning_rate": 1.6145597775328192e-06, + "loss": 0.9567, + "step": 8854 + }, + { + "epoch": 1.2536278049125786, + "grad_norm": 10.028428295823462, + "learning_rate": 1.614023756675685e-06, + "loss": 1.1196, + "step": 8855 + }, + 
{ + "epoch": 1.2537693777872159, + "grad_norm": 10.95058055714738, + "learning_rate": 1.613487782393661e-06, + "loss": 1.0311, + "step": 8856 + }, + { + "epoch": 1.253910950661853, + "grad_norm": 8.87108944985585, + "learning_rate": 1.612951854714923e-06, + "loss": 1.1054, + "step": 8857 + }, + { + "epoch": 1.2540525235364903, + "grad_norm": 9.353306606886244, + "learning_rate": 1.6124159736676452e-06, + "loss": 0.9048, + "step": 8858 + }, + { + "epoch": 1.2541940964111276, + "grad_norm": 8.260532476151328, + "learning_rate": 1.611880139279998e-06, + "loss": 0.9493, + "step": 8859 + }, + { + "epoch": 1.2543356692857648, + "grad_norm": 9.14313408738718, + "learning_rate": 1.6113443515801492e-06, + "loss": 1.1572, + "step": 8860 + }, + { + "epoch": 1.254477242160402, + "grad_norm": 7.534986653256002, + "learning_rate": 1.610808610596265e-06, + "loss": 1.0231, + "step": 8861 + }, + { + "epoch": 1.2546188150350392, + "grad_norm": 9.239922885325278, + "learning_rate": 1.6102729163565095e-06, + "loss": 0.9575, + "step": 8862 + }, + { + "epoch": 1.2547603879096765, + "grad_norm": 8.76466100495269, + "learning_rate": 1.6097372688890433e-06, + "loss": 1.0437, + "step": 8863 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 9.810208387115576, + "learning_rate": 1.6092016682220252e-06, + "loss": 0.9984, + "step": 8864 + }, + { + "epoch": 1.255043533658951, + "grad_norm": 7.799708522338977, + "learning_rate": 1.6086661143836107e-06, + "loss": 0.9667, + "step": 8865 + }, + { + "epoch": 1.2551851065335882, + "grad_norm": 6.983371329678369, + "learning_rate": 1.6081306074019543e-06, + "loss": 0.9403, + "step": 8866 + }, + { + "epoch": 1.2553266794082254, + "grad_norm": 10.813915005744443, + "learning_rate": 1.607595147305207e-06, + "loss": 1.058, + "step": 8867 + }, + { + "epoch": 1.2554682522828626, + "grad_norm": 7.959432502721589, + "learning_rate": 1.6070597341215171e-06, + "loss": 0.9496, + "step": 8868 + }, + { + "epoch": 1.2556098251574999, + "grad_norm": 9.12878464898151, + "learning_rate": 1.6065243678790321e-06, + "loss": 0.9087, + "step": 8869 + }, + { + "epoch": 1.255751398032137, + "grad_norm": 9.716475536457956, + "learning_rate": 1.6059890486058937e-06, + "loss": 1.1533, + "step": 8870 + }, + { + "epoch": 1.2558929709067743, + "grad_norm": 8.632725890402645, + "learning_rate": 1.605453776330245e-06, + "loss": 1.0051, + "step": 8871 + }, + { + "epoch": 1.2560345437814115, + "grad_norm": 7.204282853745235, + "learning_rate": 1.604918551080224e-06, + "loss": 0.9868, + "step": 8872 + }, + { + "epoch": 1.2561761166560488, + "grad_norm": 9.815623569822405, + "learning_rate": 1.6043833728839675e-06, + "loss": 1.1004, + "step": 8873 + }, + { + "epoch": 1.256317689530686, + "grad_norm": 9.771339734150855, + "learning_rate": 1.6038482417696095e-06, + "loss": 1.0565, + "step": 8874 + }, + { + "epoch": 1.2564592624053232, + "grad_norm": 8.332001490658294, + "learning_rate": 1.60331315776528e-06, + "loss": 0.9647, + "step": 8875 + }, + { + "epoch": 1.2566008352799605, + "grad_norm": 10.994647024056468, + "learning_rate": 1.6027781208991102e-06, + "loss": 1.0955, + "step": 8876 + }, + { + "epoch": 1.2567424081545977, + "grad_norm": 10.306529403239665, + "learning_rate": 1.6022431311992257e-06, + "loss": 0.9632, + "step": 8877 + }, + { + "epoch": 1.256883981029235, + "grad_norm": 8.436610648296924, + "learning_rate": 1.6017081886937502e-06, + "loss": 0.9575, + "step": 8878 + }, + { + "epoch": 1.257025553903872, + "grad_norm": 10.805594118687436, + "learning_rate": 1.6011732934108055e-06, + "loss": 
1.0269, + "step": 8879 + }, + { + "epoch": 1.2571671267785092, + "grad_norm": 9.445211127583592, + "learning_rate": 1.6006384453785115e-06, + "loss": 0.9981, + "step": 8880 + }, + { + "epoch": 1.2573086996531464, + "grad_norm": 7.333696471962218, + "learning_rate": 1.6001036446249824e-06, + "loss": 0.9285, + "step": 8881 + }, + { + "epoch": 1.2574502725277836, + "grad_norm": 9.89143298375761, + "learning_rate": 1.5995688911783341e-06, + "loss": 0.9571, + "step": 8882 + }, + { + "epoch": 1.2575918454024209, + "grad_norm": 9.126702136836244, + "learning_rate": 1.5990341850666779e-06, + "loss": 0.9359, + "step": 8883 + }, + { + "epoch": 1.257733418277058, + "grad_norm": 9.64327140604497, + "learning_rate": 1.598499526318123e-06, + "loss": 1.0468, + "step": 8884 + }, + { + "epoch": 1.2578749911516953, + "grad_norm": 8.31581542934202, + "learning_rate": 1.5979649149607755e-06, + "loss": 1.0708, + "step": 8885 + }, + { + "epoch": 1.2580165640263326, + "grad_norm": 9.6101466915752, + "learning_rate": 1.59743035102274e-06, + "loss": 1.0358, + "step": 8886 + }, + { + "epoch": 1.2581581369009698, + "grad_norm": 9.610192737055765, + "learning_rate": 1.5968958345321178e-06, + "loss": 1.0129, + "step": 8887 + }, + { + "epoch": 1.258299709775607, + "grad_norm": 7.921932062248259, + "learning_rate": 1.5963613655170082e-06, + "loss": 0.9886, + "step": 8888 + }, + { + "epoch": 1.2584412826502442, + "grad_norm": 8.226647357557358, + "learning_rate": 1.595826944005508e-06, + "loss": 1.0401, + "step": 8889 + }, + { + "epoch": 1.2585828555248815, + "grad_norm": 10.61108600376037, + "learning_rate": 1.5952925700257116e-06, + "loss": 1.0665, + "step": 8890 + }, + { + "epoch": 1.2587244283995187, + "grad_norm": 11.388309949927821, + "learning_rate": 1.5947582436057097e-06, + "loss": 0.9522, + "step": 8891 + }, + { + "epoch": 1.258866001274156, + "grad_norm": 10.65841375404621, + "learning_rate": 1.5942239647735918e-06, + "loss": 1.0386, + "step": 8892 + }, + { + "epoch": 1.2590075741487932, + "grad_norm": 10.449691505076574, + "learning_rate": 1.5936897335574453e-06, + "loss": 0.9493, + "step": 8893 + }, + { + "epoch": 1.2591491470234304, + "grad_norm": 12.93953434526396, + "learning_rate": 1.5931555499853529e-06, + "loss": 1.0549, + "step": 8894 + }, + { + "epoch": 1.2592907198980674, + "grad_norm": 8.036450791288452, + "learning_rate": 1.5926214140853976e-06, + "loss": 0.9518, + "step": 8895 + }, + { + "epoch": 1.2594322927727046, + "grad_norm": 9.039155344704676, + "learning_rate": 1.592087325885658e-06, + "loss": 0.9799, + "step": 8896 + }, + { + "epoch": 1.2595738656473419, + "grad_norm": 10.216808636307988, + "learning_rate": 1.5915532854142105e-06, + "loss": 1.0795, + "step": 8897 + }, + { + "epoch": 1.259715438521979, + "grad_norm": 9.087303996956713, + "learning_rate": 1.5910192926991291e-06, + "loss": 0.9737, + "step": 8898 + }, + { + "epoch": 1.2598570113966163, + "grad_norm": 8.059129113791013, + "learning_rate": 1.5904853477684863e-06, + "loss": 0.8801, + "step": 8899 + }, + { + "epoch": 1.2599985842712536, + "grad_norm": 10.930110506020885, + "learning_rate": 1.5899514506503499e-06, + "loss": 1.0074, + "step": 8900 + }, + { + "epoch": 1.2601401571458908, + "grad_norm": 9.346966142123962, + "learning_rate": 1.5894176013727891e-06, + "loss": 1.03, + "step": 8901 + }, + { + "epoch": 1.260281730020528, + "grad_norm": 9.829373569960195, + "learning_rate": 1.5888837999638646e-06, + "loss": 1.0743, + "step": 8902 + }, + { + "epoch": 1.2604233028951652, + "grad_norm": 10.045950032751849, + "learning_rate": 
1.5883500464516394e-06, + "loss": 0.9407, + "step": 8903 + }, + { + "epoch": 1.2605648757698025, + "grad_norm": 10.372985276542297, + "learning_rate": 1.5878163408641717e-06, + "loss": 1.017, + "step": 8904 + }, + { + "epoch": 1.2607064486444397, + "grad_norm": 9.615081432551149, + "learning_rate": 1.5872826832295197e-06, + "loss": 0.9163, + "step": 8905 + }, + { + "epoch": 1.260848021519077, + "grad_norm": 8.766473576071563, + "learning_rate": 1.5867490735757366e-06, + "loss": 0.9634, + "step": 8906 + }, + { + "epoch": 1.2609895943937142, + "grad_norm": 8.239803139270123, + "learning_rate": 1.5862155119308737e-06, + "loss": 0.9556, + "step": 8907 + }, + { + "epoch": 1.2611311672683514, + "grad_norm": 9.102179850978048, + "learning_rate": 1.5856819983229796e-06, + "loss": 0.9512, + "step": 8908 + }, + { + "epoch": 1.2612727401429886, + "grad_norm": 9.500906650044067, + "learning_rate": 1.5851485327801014e-06, + "loss": 1.0471, + "step": 8909 + }, + { + "epoch": 1.2614143130176259, + "grad_norm": 9.06357205068019, + "learning_rate": 1.5846151153302824e-06, + "loss": 1.0671, + "step": 8910 + }, + { + "epoch": 1.261555885892263, + "grad_norm": 6.916042927879601, + "learning_rate": 1.584081746001565e-06, + "loss": 0.8267, + "step": 8911 + }, + { + "epoch": 1.2616974587669003, + "grad_norm": 8.92030440901608, + "learning_rate": 1.583548424821987e-06, + "loss": 0.9514, + "step": 8912 + }, + { + "epoch": 1.2618390316415375, + "grad_norm": 8.638272894853289, + "learning_rate": 1.5830151518195846e-06, + "loss": 0.9517, + "step": 8913 + }, + { + "epoch": 1.2619806045161748, + "grad_norm": 9.910499599885766, + "learning_rate": 1.5824819270223922e-06, + "loss": 1.0185, + "step": 8914 + }, + { + "epoch": 1.262122177390812, + "grad_norm": 9.005139049280416, + "learning_rate": 1.5819487504584408e-06, + "loss": 0.9352, + "step": 8915 + }, + { + "epoch": 1.2622637502654492, + "grad_norm": 8.489302242438807, + "learning_rate": 1.5814156221557587e-06, + "loss": 0.9836, + "step": 8916 + }, + { + "epoch": 1.2624053231400865, + "grad_norm": 10.278624225391104, + "learning_rate": 1.5808825421423729e-06, + "loss": 1.0421, + "step": 8917 + }, + { + "epoch": 1.2625468960147237, + "grad_norm": 9.670244366024718, + "learning_rate": 1.5803495104463063e-06, + "loss": 1.0334, + "step": 8918 + }, + { + "epoch": 1.262688468889361, + "grad_norm": 8.751561924628396, + "learning_rate": 1.57981652709558e-06, + "loss": 1.1205, + "step": 8919 + }, + { + "epoch": 1.262830041763998, + "grad_norm": 8.895334508223037, + "learning_rate": 1.5792835921182128e-06, + "loss": 0.9956, + "step": 8920 + }, + { + "epoch": 1.2629716146386352, + "grad_norm": 9.795216494509777, + "learning_rate": 1.5787507055422201e-06, + "loss": 1.0085, + "step": 8921 + }, + { + "epoch": 1.2631131875132724, + "grad_norm": 9.059920581865331, + "learning_rate": 1.5782178673956179e-06, + "loss": 1.0175, + "step": 8922 + }, + { + "epoch": 1.2632547603879096, + "grad_norm": 10.968549677657936, + "learning_rate": 1.5776850777064137e-06, + "loss": 1.0882, + "step": 8923 + }, + { + "epoch": 1.2633963332625469, + "grad_norm": 9.804524867924105, + "learning_rate": 1.5771523365026175e-06, + "loss": 1.0481, + "step": 8924 + }, + { + "epoch": 1.263537906137184, + "grad_norm": 9.703983582183207, + "learning_rate": 1.5766196438122344e-06, + "loss": 1.0085, + "step": 8925 + }, + { + "epoch": 1.2636794790118213, + "grad_norm": 8.909520455587671, + "learning_rate": 1.5760869996632685e-06, + "loss": 0.937, + "step": 8926 + }, + { + "epoch": 1.2638210518864585, + "grad_norm": 
9.392635999704726, + "learning_rate": 1.5755544040837195e-06, + "loss": 1.0018, + "step": 8927 + }, + { + "epoch": 1.2639626247610958, + "grad_norm": 8.96889120914533, + "learning_rate": 1.575021857101587e-06, + "loss": 1.0181, + "step": 8928 + }, + { + "epoch": 1.264104197635733, + "grad_norm": 8.637901056145626, + "learning_rate": 1.5744893587448654e-06, + "loss": 0.9094, + "step": 8929 + }, + { + "epoch": 1.2642457705103702, + "grad_norm": 11.32488167471471, + "learning_rate": 1.5739569090415482e-06, + "loss": 1.0567, + "step": 8930 + }, + { + "epoch": 1.2643873433850075, + "grad_norm": 10.094752589647326, + "learning_rate": 1.573424508019626e-06, + "loss": 0.9937, + "step": 8931 + }, + { + "epoch": 1.2645289162596447, + "grad_norm": 10.092610486786274, + "learning_rate": 1.5728921557070864e-06, + "loss": 0.9859, + "step": 8932 + }, + { + "epoch": 1.264670489134282, + "grad_norm": 12.360682364087323, + "learning_rate": 1.5723598521319152e-06, + "loss": 1.117, + "step": 8933 + }, + { + "epoch": 1.2648120620089192, + "grad_norm": 8.71087487715719, + "learning_rate": 1.5718275973220944e-06, + "loss": 0.8706, + "step": 8934 + }, + { + "epoch": 1.2649536348835564, + "grad_norm": 8.823171228356749, + "learning_rate": 1.571295391305605e-06, + "loss": 0.9591, + "step": 8935 + }, + { + "epoch": 1.2650952077581934, + "grad_norm": 11.222777257440208, + "learning_rate": 1.5707632341104246e-06, + "loss": 1.1001, + "step": 8936 + }, + { + "epoch": 1.2652367806328306, + "grad_norm": 8.078214722361986, + "learning_rate": 1.5702311257645274e-06, + "loss": 0.9835, + "step": 8937 + }, + { + "epoch": 1.2653783535074679, + "grad_norm": 8.990455228590216, + "learning_rate": 1.5696990662958872e-06, + "loss": 1.0003, + "step": 8938 + }, + { + "epoch": 1.265519926382105, + "grad_norm": 9.998543442505998, + "learning_rate": 1.5691670557324734e-06, + "loss": 1.0539, + "step": 8939 + }, + { + "epoch": 1.2656614992567423, + "grad_norm": 9.111586235291005, + "learning_rate": 1.5686350941022533e-06, + "loss": 1.0081, + "step": 8940 + }, + { + "epoch": 1.2658030721313795, + "grad_norm": 9.28938076029118, + "learning_rate": 1.5681031814331918e-06, + "loss": 1.1441, + "step": 8941 + }, + { + "epoch": 1.2659446450060168, + "grad_norm": 9.675165014313668, + "learning_rate": 1.5675713177532514e-06, + "loss": 1.0411, + "step": 8942 + }, + { + "epoch": 1.266086217880654, + "grad_norm": 8.630658933164112, + "learning_rate": 1.5670395030903918e-06, + "loss": 0.8984, + "step": 8943 + }, + { + "epoch": 1.2662277907552912, + "grad_norm": 9.632667558976475, + "learning_rate": 1.5665077374725696e-06, + "loss": 0.976, + "step": 8944 + }, + { + "epoch": 1.2663693636299285, + "grad_norm": 10.972935760946894, + "learning_rate": 1.5659760209277395e-06, + "loss": 1.0773, + "step": 8945 + }, + { + "epoch": 1.2665109365045657, + "grad_norm": 9.422100108732891, + "learning_rate": 1.5654443534838537e-06, + "loss": 1.0603, + "step": 8946 + }, + { + "epoch": 1.266652509379203, + "grad_norm": 8.830348776196631, + "learning_rate": 1.564912735168861e-06, + "loss": 0.8976, + "step": 8947 + }, + { + "epoch": 1.2667940822538402, + "grad_norm": 9.602643888706918, + "learning_rate": 1.564381166010709e-06, + "loss": 1.0034, + "step": 8948 + }, + { + "epoch": 1.2669356551284774, + "grad_norm": 10.02285672153162, + "learning_rate": 1.5638496460373415e-06, + "loss": 1.069, + "step": 8949 + }, + { + "epoch": 1.2670772280031146, + "grad_norm": 8.561355848511598, + "learning_rate": 1.563318175276699e-06, + "loss": 0.9194, + "step": 8950 + }, + { + "epoch": 
1.2672188008777518, + "grad_norm": 9.900950268693633, + "learning_rate": 1.5627867537567225e-06, + "loss": 1.1824, + "step": 8951 + }, + { + "epoch": 1.267360373752389, + "grad_norm": 9.879283121024095, + "learning_rate": 1.5622553815053476e-06, + "loss": 1.0229, + "step": 8952 + }, + { + "epoch": 1.2675019466270263, + "grad_norm": 10.036782328601072, + "learning_rate": 1.5617240585505084e-06, + "loss": 1.0283, + "step": 8953 + }, + { + "epoch": 1.2676435195016635, + "grad_norm": 9.577444329275483, + "learning_rate": 1.5611927849201364e-06, + "loss": 0.9504, + "step": 8954 + }, + { + "epoch": 1.2677850923763008, + "grad_norm": 10.333198259352708, + "learning_rate": 1.5606615606421588e-06, + "loss": 1.069, + "step": 8955 + }, + { + "epoch": 1.267926665250938, + "grad_norm": 9.47535439988489, + "learning_rate": 1.5601303857445018e-06, + "loss": 0.9476, + "step": 8956 + }, + { + "epoch": 1.2680682381255752, + "grad_norm": 8.575246859631115, + "learning_rate": 1.5595992602550903e-06, + "loss": 1.028, + "step": 8957 + }, + { + "epoch": 1.2682098110002125, + "grad_norm": 9.74507877668362, + "learning_rate": 1.5590681842018446e-06, + "loss": 0.9363, + "step": 8958 + }, + { + "epoch": 1.2683513838748497, + "grad_norm": 10.355421229497718, + "learning_rate": 1.5585371576126828e-06, + "loss": 1.1036, + "step": 8959 + }, + { + "epoch": 1.268492956749487, + "grad_norm": 10.159400803325815, + "learning_rate": 1.5580061805155205e-06, + "loss": 1.0512, + "step": 8960 + }, + { + "epoch": 1.2686345296241242, + "grad_norm": 8.849039282081435, + "learning_rate": 1.5574752529382714e-06, + "loss": 0.9858, + "step": 8961 + }, + { + "epoch": 1.2687761024987612, + "grad_norm": 8.416468652592851, + "learning_rate": 1.5569443749088449e-06, + "loss": 1.0487, + "step": 8962 + }, + { + "epoch": 1.2689176753733984, + "grad_norm": 10.712452097622078, + "learning_rate": 1.5564135464551496e-06, + "loss": 1.0264, + "step": 8963 + }, + { + "epoch": 1.2690592482480356, + "grad_norm": 7.79337453909043, + "learning_rate": 1.5558827676050914e-06, + "loss": 0.9627, + "step": 8964 + }, + { + "epoch": 1.2692008211226729, + "grad_norm": 9.86887604317071, + "learning_rate": 1.555352038386571e-06, + "loss": 1.0898, + "step": 8965 + }, + { + "epoch": 1.26934239399731, + "grad_norm": 9.015212871050332, + "learning_rate": 1.55482135882749e-06, + "loss": 1.0287, + "step": 8966 + }, + { + "epoch": 1.2694839668719473, + "grad_norm": 8.942352898091414, + "learning_rate": 1.5542907289557457e-06, + "loss": 0.9464, + "step": 8967 + }, + { + "epoch": 1.2696255397465845, + "grad_norm": 11.209024022788828, + "learning_rate": 1.5537601487992325e-06, + "loss": 1.071, + "step": 8968 + }, + { + "epoch": 1.2697671126212218, + "grad_norm": 7.287650900794814, + "learning_rate": 1.5532296183858424e-06, + "loss": 0.8518, + "step": 8969 + }, + { + "epoch": 1.269908685495859, + "grad_norm": 8.435972174318673, + "learning_rate": 1.5526991377434655e-06, + "loss": 1.0161, + "step": 8970 + }, + { + "epoch": 1.2700502583704962, + "grad_norm": 9.424664592087076, + "learning_rate": 1.5521687068999885e-06, + "loss": 1.0293, + "step": 8971 + }, + { + "epoch": 1.2701918312451335, + "grad_norm": 9.644297489193217, + "learning_rate": 1.5516383258832956e-06, + "loss": 1.0086, + "step": 8972 + }, + { + "epoch": 1.2703334041197707, + "grad_norm": 11.233369530261967, + "learning_rate": 1.551107994721269e-06, + "loss": 1.0038, + "step": 8973 + }, + { + "epoch": 1.270474976994408, + "grad_norm": 10.710991278470937, + "learning_rate": 1.5505777134417876e-06, + "loss": 1.0272, 
+ "step": 8974 + }, + { + "epoch": 1.2706165498690452, + "grad_norm": 9.004755671023185, + "learning_rate": 1.550047482072729e-06, + "loss": 0.9832, + "step": 8975 + }, + { + "epoch": 1.2707581227436824, + "grad_norm": 8.989382096591267, + "learning_rate": 1.549517300641965e-06, + "loss": 1.0583, + "step": 8976 + }, + { + "epoch": 1.2708996956183194, + "grad_norm": 8.85155646651383, + "learning_rate": 1.5489871691773677e-06, + "loss": 1.0322, + "step": 8977 + }, + { + "epoch": 1.2710412684929566, + "grad_norm": 7.725302690915079, + "learning_rate": 1.5484570877068055e-06, + "loss": 0.9349, + "step": 8978 + }, + { + "epoch": 1.2711828413675939, + "grad_norm": 10.013182153216633, + "learning_rate": 1.547927056258145e-06, + "loss": 1.0468, + "step": 8979 + }, + { + "epoch": 1.271324414242231, + "grad_norm": 10.663231137465587, + "learning_rate": 1.5473970748592493e-06, + "loss": 1.0724, + "step": 8980 + }, + { + "epoch": 1.2714659871168683, + "grad_norm": 7.466476030413822, + "learning_rate": 1.5468671435379789e-06, + "loss": 0.9101, + "step": 8981 + }, + { + "epoch": 1.2716075599915055, + "grad_norm": 11.946372045676082, + "learning_rate": 1.5463372623221923e-06, + "loss": 0.9707, + "step": 8982 + }, + { + "epoch": 1.2717491328661428, + "grad_norm": 9.396793110639404, + "learning_rate": 1.5458074312397447e-06, + "loss": 1.0128, + "step": 8983 + }, + { + "epoch": 1.27189070574078, + "grad_norm": 7.8306015878748, + "learning_rate": 1.5452776503184891e-06, + "loss": 0.9315, + "step": 8984 + }, + { + "epoch": 1.2720322786154172, + "grad_norm": 8.628946921588168, + "learning_rate": 1.5447479195862752e-06, + "loss": 1.1318, + "step": 8985 + }, + { + "epoch": 1.2721738514900545, + "grad_norm": 9.458981753626004, + "learning_rate": 1.5442182390709517e-06, + "loss": 1.0379, + "step": 8986 + }, + { + "epoch": 1.2723154243646917, + "grad_norm": 9.285196512060374, + "learning_rate": 1.5436886088003622e-06, + "loss": 0.9821, + "step": 8987 + }, + { + "epoch": 1.272456997239329, + "grad_norm": 8.744887684122087, + "learning_rate": 1.5431590288023496e-06, + "loss": 1.0274, + "step": 8988 + }, + { + "epoch": 1.2725985701139662, + "grad_norm": 10.172601770551454, + "learning_rate": 1.542629499104753e-06, + "loss": 1.0418, + "step": 8989 + }, + { + "epoch": 1.2727401429886034, + "grad_norm": 7.672592471383685, + "learning_rate": 1.5421000197354099e-06, + "loss": 0.9632, + "step": 8990 + }, + { + "epoch": 1.2728817158632406, + "grad_norm": 9.736906601107531, + "learning_rate": 1.5415705907221545e-06, + "loss": 1.0115, + "step": 8991 + }, + { + "epoch": 1.2730232887378778, + "grad_norm": 9.286281878552346, + "learning_rate": 1.5410412120928189e-06, + "loss": 0.9723, + "step": 8992 + }, + { + "epoch": 1.273164861612515, + "grad_norm": 8.209689523678426, + "learning_rate": 1.5405118838752314e-06, + "loss": 0.9597, + "step": 8993 + }, + { + "epoch": 1.2733064344871523, + "grad_norm": 8.07410299793057, + "learning_rate": 1.539982606097219e-06, + "loss": 0.9347, + "step": 8994 + }, + { + "epoch": 1.2734480073617895, + "grad_norm": 10.71467748743134, + "learning_rate": 1.5394533787866045e-06, + "loss": 1.0568, + "step": 8995 + }, + { + "epoch": 1.2735895802364268, + "grad_norm": 10.87389113812929, + "learning_rate": 1.5389242019712107e-06, + "loss": 0.9972, + "step": 8996 + }, + { + "epoch": 1.273731153111064, + "grad_norm": 8.205745024299048, + "learning_rate": 1.5383950756788545e-06, + "loss": 0.9743, + "step": 8997 + }, + { + "epoch": 1.2738727259857012, + "grad_norm": 8.671661649261768, + "learning_rate": 
1.5378659999373524e-06, + "loss": 1.0148, + "step": 8998 + }, + { + "epoch": 1.2740142988603385, + "grad_norm": 9.017629099853217, + "learning_rate": 1.5373369747745171e-06, + "loss": 1.0632, + "step": 8999 + }, + { + "epoch": 1.2741558717349757, + "grad_norm": 8.311830880905449, + "learning_rate": 1.5368080002181591e-06, + "loss": 0.8972, + "step": 9000 + }, + { + "epoch": 1.274297444609613, + "grad_norm": 7.4279652232895135, + "learning_rate": 1.536279076296086e-06, + "loss": 0.8769, + "step": 9001 + }, + { + "epoch": 1.2744390174842501, + "grad_norm": 8.925758598849814, + "learning_rate": 1.5357502030361036e-06, + "loss": 1.0983, + "step": 9002 + }, + { + "epoch": 1.2745805903588872, + "grad_norm": 10.569272370064894, + "learning_rate": 1.535221380466014e-06, + "loss": 0.9836, + "step": 9003 + }, + { + "epoch": 1.2747221632335244, + "grad_norm": 8.647961997147164, + "learning_rate": 1.5346926086136171e-06, + "loss": 1.0015, + "step": 9004 + }, + { + "epoch": 1.2748637361081616, + "grad_norm": 8.75046777837231, + "learning_rate": 1.5341638875067102e-06, + "loss": 1.1112, + "step": 9005 + }, + { + "epoch": 1.2750053089827988, + "grad_norm": 9.35690962938619, + "learning_rate": 1.5336352171730876e-06, + "loss": 1.0989, + "step": 9006 + }, + { + "epoch": 1.275146881857436, + "grad_norm": 10.295016547954464, + "learning_rate": 1.5331065976405412e-06, + "loss": 1.0584, + "step": 9007 + }, + { + "epoch": 1.2752884547320733, + "grad_norm": 9.868998575009277, + "learning_rate": 1.53257802893686e-06, + "loss": 1.0431, + "step": 9008 + }, + { + "epoch": 1.2754300276067105, + "grad_norm": 10.371582571460399, + "learning_rate": 1.5320495110898304e-06, + "loss": 1.0983, + "step": 9009 + }, + { + "epoch": 1.2755716004813478, + "grad_norm": 8.200415754245663, + "learning_rate": 1.531521044127236e-06, + "loss": 0.9556, + "step": 9010 + }, + { + "epoch": 1.275713173355985, + "grad_norm": 8.101055839388508, + "learning_rate": 1.5309926280768583e-06, + "loss": 1.0405, + "step": 9011 + }, + { + "epoch": 1.2758547462306222, + "grad_norm": 9.868137341085172, + "learning_rate": 1.5304642629664756e-06, + "loss": 0.9731, + "step": 9012 + }, + { + "epoch": 1.2759963191052595, + "grad_norm": 12.58910562527521, + "learning_rate": 1.5299359488238635e-06, + "loss": 1.077, + "step": 9013 + }, + { + "epoch": 1.2761378919798967, + "grad_norm": 9.762036840805816, + "learning_rate": 1.5294076856767956e-06, + "loss": 0.9745, + "step": 9014 + }, + { + "epoch": 1.276279464854534, + "grad_norm": 10.59030446422668, + "learning_rate": 1.5288794735530416e-06, + "loss": 1.0658, + "step": 9015 + }, + { + "epoch": 1.2764210377291711, + "grad_norm": 9.556955762290773, + "learning_rate": 1.52835131248037e-06, + "loss": 0.9936, + "step": 9016 + }, + { + "epoch": 1.2765626106038084, + "grad_norm": 8.577999148809276, + "learning_rate": 1.5278232024865458e-06, + "loss": 1.0346, + "step": 9017 + }, + { + "epoch": 1.2767041834784456, + "grad_norm": 9.347747253918696, + "learning_rate": 1.5272951435993303e-06, + "loss": 0.9903, + "step": 9018 + }, + { + "epoch": 1.2768457563530826, + "grad_norm": 9.956080694010296, + "learning_rate": 1.5267671358464837e-06, + "loss": 1.0669, + "step": 9019 + }, + { + "epoch": 1.2769873292277198, + "grad_norm": 9.59046944560128, + "learning_rate": 1.5262391792557635e-06, + "loss": 1.0304, + "step": 9020 + }, + { + "epoch": 1.277128902102357, + "grad_norm": 8.4267876333093, + "learning_rate": 1.5257112738549233e-06, + "loss": 1.1083, + "step": 9021 + }, + { + "epoch": 1.2772704749769943, + "grad_norm": 
9.954272049901842, + "learning_rate": 1.525183419671715e-06, + "loss": 0.983, + "step": 9022 + }, + { + "epoch": 1.2774120478516315, + "grad_norm": 9.258147079924559, + "learning_rate": 1.5246556167338875e-06, + "loss": 0.8739, + "step": 9023 + }, + { + "epoch": 1.2775536207262688, + "grad_norm": 8.882453904508745, + "learning_rate": 1.5241278650691866e-06, + "loss": 0.8765, + "step": 9024 + }, + { + "epoch": 1.277695193600906, + "grad_norm": 8.656643965256452, + "learning_rate": 1.5236001647053564e-06, + "loss": 1.0235, + "step": 9025 + }, + { + "epoch": 1.2778367664755432, + "grad_norm": 9.398009686075575, + "learning_rate": 1.5230725156701375e-06, + "loss": 1.0261, + "step": 9026 + }, + { + "epoch": 1.2779783393501805, + "grad_norm": 9.527575175284797, + "learning_rate": 1.5225449179912683e-06, + "loss": 0.9796, + "step": 9027 + }, + { + "epoch": 1.2781199122248177, + "grad_norm": 8.015071024350977, + "learning_rate": 1.5220173716964847e-06, + "loss": 0.959, + "step": 9028 + }, + { + "epoch": 1.278261485099455, + "grad_norm": 9.707914635357215, + "learning_rate": 1.521489876813518e-06, + "loss": 0.9286, + "step": 9029 + }, + { + "epoch": 1.2784030579740921, + "grad_norm": 7.930372747870432, + "learning_rate": 1.5209624333700985e-06, + "loss": 1.0793, + "step": 9030 + }, + { + "epoch": 1.2785446308487294, + "grad_norm": 10.72075240668897, + "learning_rate": 1.520435041393954e-06, + "loss": 0.9781, + "step": 9031 + }, + { + "epoch": 1.2786862037233666, + "grad_norm": 9.879270378693562, + "learning_rate": 1.519907700912809e-06, + "loss": 1.0346, + "step": 9032 + }, + { + "epoch": 1.2788277765980038, + "grad_norm": 8.187220910826566, + "learning_rate": 1.5193804119543853e-06, + "loss": 0.9894, + "step": 9033 + }, + { + "epoch": 1.278969349472641, + "grad_norm": 8.226898215490872, + "learning_rate": 1.5188531745464023e-06, + "loss": 0.9669, + "step": 9034 + }, + { + "epoch": 1.2791109223472783, + "grad_norm": 9.698977654449148, + "learning_rate": 1.5183259887165763e-06, + "loss": 1.0364, + "step": 9035 + }, + { + "epoch": 1.2792524952219155, + "grad_norm": 8.499270688352427, + "learning_rate": 1.5177988544926208e-06, + "loss": 0.9655, + "step": 9036 + }, + { + "epoch": 1.2793940680965528, + "grad_norm": 9.949517427176627, + "learning_rate": 1.5172717719022475e-06, + "loss": 0.9626, + "step": 9037 + }, + { + "epoch": 1.27953564097119, + "grad_norm": 9.643827972789126, + "learning_rate": 1.5167447409731645e-06, + "loss": 0.9619, + "step": 9038 + }, + { + "epoch": 1.2796772138458272, + "grad_norm": 10.26041497296676, + "learning_rate": 1.5162177617330775e-06, + "loss": 1.0353, + "step": 9039 + }, + { + "epoch": 1.2798187867204645, + "grad_norm": 8.78889322753662, + "learning_rate": 1.515690834209689e-06, + "loss": 0.9741, + "step": 9040 + }, + { + "epoch": 1.2799603595951017, + "grad_norm": 9.821980361300538, + "learning_rate": 1.5151639584306993e-06, + "loss": 1.0082, + "step": 9041 + }, + { + "epoch": 1.280101932469739, + "grad_norm": 10.853516152206664, + "learning_rate": 1.5146371344238063e-06, + "loss": 1.0202, + "step": 9042 + }, + { + "epoch": 1.2802435053443761, + "grad_norm": 10.796212514158793, + "learning_rate": 1.5141103622167042e-06, + "loss": 1.1095, + "step": 9043 + }, + { + "epoch": 1.2803850782190134, + "grad_norm": 8.998704181209535, + "learning_rate": 1.513583641837085e-06, + "loss": 1.0678, + "step": 9044 + }, + { + "epoch": 1.2805266510936504, + "grad_norm": 9.432769195977915, + "learning_rate": 1.5130569733126382e-06, + "loss": 1.0856, + "step": 9045 + }, + { + "epoch": 
1.2806682239682876, + "grad_norm": 9.88705674056799, + "learning_rate": 1.5125303566710508e-06, + "loss": 1.0256, + "step": 9046 + }, + { + "epoch": 1.2808097968429248, + "grad_norm": 10.347724450629343, + "learning_rate": 1.5120037919400054e-06, + "loss": 0.9876, + "step": 9047 + }, + { + "epoch": 1.280951369717562, + "grad_norm": 9.573682442394489, + "learning_rate": 1.5114772791471848e-06, + "loss": 1.145, + "step": 9048 + }, + { + "epoch": 1.2810929425921993, + "grad_norm": 9.603942824352792, + "learning_rate": 1.5109508183202675e-06, + "loss": 1.1661, + "step": 9049 + }, + { + "epoch": 1.2812345154668365, + "grad_norm": 10.43959508644122, + "learning_rate": 1.5104244094869272e-06, + "loss": 0.986, + "step": 9050 + }, + { + "epoch": 1.2813760883414738, + "grad_norm": 9.067163569727597, + "learning_rate": 1.509898052674838e-06, + "loss": 1.0031, + "step": 9051 + }, + { + "epoch": 1.281517661216111, + "grad_norm": 9.356119904922819, + "learning_rate": 1.5093717479116696e-06, + "loss": 1.1085, + "step": 9052 + }, + { + "epoch": 1.2816592340907482, + "grad_norm": 8.994636632970767, + "learning_rate": 1.508845495225089e-06, + "loss": 1.0897, + "step": 9053 + }, + { + "epoch": 1.2818008069653855, + "grad_norm": 7.887948184572462, + "learning_rate": 1.5083192946427626e-06, + "loss": 1.0325, + "step": 9054 + }, + { + "epoch": 1.2819423798400227, + "grad_norm": 9.546357136118075, + "learning_rate": 1.5077931461923518e-06, + "loss": 1.0165, + "step": 9055 + }, + { + "epoch": 1.28208395271466, + "grad_norm": 9.825534594408488, + "learning_rate": 1.5072670499015151e-06, + "loss": 0.9491, + "step": 9056 + }, + { + "epoch": 1.2822255255892971, + "grad_norm": 10.306595655281372, + "learning_rate": 1.5067410057979094e-06, + "loss": 1.0766, + "step": 9057 + }, + { + "epoch": 1.2823670984639344, + "grad_norm": 8.409972093606159, + "learning_rate": 1.5062150139091882e-06, + "loss": 0.9335, + "step": 9058 + }, + { + "epoch": 1.2825086713385716, + "grad_norm": 10.138000054496965, + "learning_rate": 1.505689074263003e-06, + "loss": 1.1344, + "step": 9059 + }, + { + "epoch": 1.2826502442132086, + "grad_norm": 9.880090487911353, + "learning_rate": 1.505163186887002e-06, + "loss": 0.9886, + "step": 9060 + }, + { + "epoch": 1.2827918170878458, + "grad_norm": 8.738395188824674, + "learning_rate": 1.5046373518088303e-06, + "loss": 1.0736, + "step": 9061 + }, + { + "epoch": 1.282933389962483, + "grad_norm": 8.123808788128523, + "learning_rate": 1.5041115690561308e-06, + "loss": 0.9611, + "step": 9062 + }, + { + "epoch": 1.2830749628371203, + "grad_norm": 9.432304114100589, + "learning_rate": 1.5035858386565433e-06, + "loss": 1.0986, + "step": 9063 + }, + { + "epoch": 1.2832165357117575, + "grad_norm": 8.980220676760695, + "learning_rate": 1.5030601606377054e-06, + "loss": 1.0519, + "step": 9064 + }, + { + "epoch": 1.2833581085863948, + "grad_norm": 10.307144531515734, + "learning_rate": 1.5025345350272518e-06, + "loss": 0.9533, + "step": 9065 + }, + { + "epoch": 1.283499681461032, + "grad_norm": 9.154347921969366, + "learning_rate": 1.502008961852814e-06, + "loss": 1.0107, + "step": 9066 + }, + { + "epoch": 1.2836412543356692, + "grad_norm": 7.997523639783535, + "learning_rate": 1.5014834411420204e-06, + "loss": 1.0032, + "step": 9067 + }, + { + "epoch": 1.2837828272103065, + "grad_norm": 8.026716919717385, + "learning_rate": 1.5009579729224982e-06, + "loss": 1.0337, + "step": 9068 + }, + { + "epoch": 1.2839244000849437, + "grad_norm": 9.273985000555443, + "learning_rate": 1.5004325572218698e-06, + "loss": 0.9099, 
+ "step": 9069 + }, + { + "epoch": 1.284065972959581, + "grad_norm": 9.202535976348916, + "learning_rate": 1.4999071940677578e-06, + "loss": 0.9909, + "step": 9070 + }, + { + "epoch": 1.2842075458342181, + "grad_norm": 8.255052810952883, + "learning_rate": 1.4993818834877783e-06, + "loss": 0.9638, + "step": 9071 + }, + { + "epoch": 1.2843491187088554, + "grad_norm": 11.463235885976484, + "learning_rate": 1.498856625509547e-06, + "loss": 0.9906, + "step": 9072 + }, + { + "epoch": 1.2844906915834926, + "grad_norm": 7.8055980254345645, + "learning_rate": 1.4983314201606764e-06, + "loss": 0.9826, + "step": 9073 + }, + { + "epoch": 1.2846322644581298, + "grad_norm": 10.092170898877429, + "learning_rate": 1.497806267468776e-06, + "loss": 1.0812, + "step": 9074 + }, + { + "epoch": 1.284773837332767, + "grad_norm": 10.614376726220566, + "learning_rate": 1.4972811674614523e-06, + "loss": 0.9293, + "step": 9075 + }, + { + "epoch": 1.2849154102074043, + "grad_norm": 10.92283265986574, + "learning_rate": 1.4967561201663108e-06, + "loss": 0.9896, + "step": 9076 + }, + { + "epoch": 1.2850569830820415, + "grad_norm": 11.434644665787062, + "learning_rate": 1.4962311256109518e-06, + "loss": 0.9402, + "step": 9077 + }, + { + "epoch": 1.2851985559566788, + "grad_norm": 10.203216225185598, + "learning_rate": 1.4957061838229743e-06, + "loss": 1.011, + "step": 9078 + }, + { + "epoch": 1.285340128831316, + "grad_norm": 7.50309206483882, + "learning_rate": 1.4951812948299737e-06, + "loss": 0.9378, + "step": 9079 + }, + { + "epoch": 1.2854817017059532, + "grad_norm": 8.5307889820777, + "learning_rate": 1.494656458659543e-06, + "loss": 1.1479, + "step": 9080 + }, + { + "epoch": 1.2856232745805904, + "grad_norm": 10.350780851974669, + "learning_rate": 1.4941316753392738e-06, + "loss": 1.0419, + "step": 9081 + }, + { + "epoch": 1.2857648474552277, + "grad_norm": 9.136930844053897, + "learning_rate": 1.493606944896751e-06, + "loss": 1.0618, + "step": 9082 + }, + { + "epoch": 1.285906420329865, + "grad_norm": 8.3414971099704, + "learning_rate": 1.4930822673595613e-06, + "loss": 1.0033, + "step": 9083 + }, + { + "epoch": 1.2860479932045021, + "grad_norm": 10.110844642379005, + "learning_rate": 1.4925576427552864e-06, + "loss": 1.0289, + "step": 9084 + }, + { + "epoch": 1.2861895660791394, + "grad_norm": 8.542506578290515, + "learning_rate": 1.4920330711115043e-06, + "loss": 0.9765, + "step": 9085 + }, + { + "epoch": 1.2863311389537764, + "grad_norm": 8.86926169985901, + "learning_rate": 1.4915085524557924e-06, + "loss": 1.059, + "step": 9086 + }, + { + "epoch": 1.2864727118284136, + "grad_norm": 8.738360701751, + "learning_rate": 1.4909840868157237e-06, + "loss": 0.9547, + "step": 9087 + }, + { + "epoch": 1.2866142847030508, + "grad_norm": 9.088320235184588, + "learning_rate": 1.4904596742188695e-06, + "loss": 0.996, + "step": 9088 + }, + { + "epoch": 1.286755857577688, + "grad_norm": 8.720699406844112, + "learning_rate": 1.4899353146927975e-06, + "loss": 0.9485, + "step": 9089 + }, + { + "epoch": 1.2868974304523253, + "grad_norm": 9.00899522333551, + "learning_rate": 1.4894110082650726e-06, + "loss": 0.9032, + "step": 9090 + }, + { + "epoch": 1.2870390033269625, + "grad_norm": 9.959538811113601, + "learning_rate": 1.488886754963258e-06, + "loss": 1.039, + "step": 9091 + }, + { + "epoch": 1.2871805762015998, + "grad_norm": 7.344570162811766, + "learning_rate": 1.4883625548149125e-06, + "loss": 0.9606, + "step": 9092 + }, + { + "epoch": 1.287322149076237, + "grad_norm": 7.50200499119665, + "learning_rate": 
1.4878384078475933e-06, + "loss": 0.9493, + "step": 9093 + }, + { + "epoch": 1.2874637219508742, + "grad_norm": 10.045265365444909, + "learning_rate": 1.4873143140888537e-06, + "loss": 1.1145, + "step": 9094 + }, + { + "epoch": 1.2876052948255114, + "grad_norm": 7.869588536675362, + "learning_rate": 1.486790273566246e-06, + "loss": 0.9708, + "step": 9095 + }, + { + "epoch": 1.2877468677001487, + "grad_norm": 11.044041293090862, + "learning_rate": 1.486266286307318e-06, + "loss": 1.0763, + "step": 9096 + }, + { + "epoch": 1.287888440574786, + "grad_norm": 9.951878539984769, + "learning_rate": 1.4857423523396157e-06, + "loss": 1.0614, + "step": 9097 + }, + { + "epoch": 1.2880300134494231, + "grad_norm": 10.076186071963734, + "learning_rate": 1.4852184716906808e-06, + "loss": 0.913, + "step": 9098 + }, + { + "epoch": 1.2881715863240604, + "grad_norm": 9.206968272482957, + "learning_rate": 1.484694644388055e-06, + "loss": 0.9647, + "step": 9099 + }, + { + "epoch": 1.2883131591986976, + "grad_norm": 8.749812096894255, + "learning_rate": 1.4841708704592745e-06, + "loss": 0.978, + "step": 9100 + }, + { + "epoch": 1.2884547320733348, + "grad_norm": 10.78601853330418, + "learning_rate": 1.4836471499318738e-06, + "loss": 1.0866, + "step": 9101 + }, + { + "epoch": 1.2885963049479718, + "grad_norm": 9.544145509210166, + "learning_rate": 1.4831234828333856e-06, + "loss": 1.0592, + "step": 9102 + }, + { + "epoch": 1.288737877822609, + "grad_norm": 9.68810088232233, + "learning_rate": 1.4825998691913372e-06, + "loss": 0.983, + "step": 9103 + }, + { + "epoch": 1.2888794506972463, + "grad_norm": 11.212577460626386, + "learning_rate": 1.482076309033254e-06, + "loss": 0.9137, + "step": 9104 + }, + { + "epoch": 1.2890210235718835, + "grad_norm": 10.493197462792933, + "learning_rate": 1.481552802386661e-06, + "loss": 1.0005, + "step": 9105 + }, + { + "epoch": 1.2891625964465208, + "grad_norm": 11.205216514473536, + "learning_rate": 1.4810293492790778e-06, + "loss": 1.1734, + "step": 9106 + }, + { + "epoch": 1.289304169321158, + "grad_norm": 10.469701376430042, + "learning_rate": 1.480505949738022e-06, + "loss": 1.1547, + "step": 9107 + }, + { + "epoch": 1.2894457421957952, + "grad_norm": 8.84934319251801, + "learning_rate": 1.4799826037910082e-06, + "loss": 1.026, + "step": 9108 + }, + { + "epoch": 1.2895873150704324, + "grad_norm": 11.005087282977678, + "learning_rate": 1.479459311465548e-06, + "loss": 1.0625, + "step": 9109 + }, + { + "epoch": 1.2897288879450697, + "grad_norm": 9.731920352257092, + "learning_rate": 1.478936072789151e-06, + "loss": 1.0354, + "step": 9110 + }, + { + "epoch": 1.289870460819707, + "grad_norm": 9.047332488567633, + "learning_rate": 1.4784128877893237e-06, + "loss": 0.9962, + "step": 9111 + }, + { + "epoch": 1.2900120336943441, + "grad_norm": 8.029109922035985, + "learning_rate": 1.477889756493569e-06, + "loss": 0.9668, + "step": 9112 + }, + { + "epoch": 1.2901536065689814, + "grad_norm": 9.501184891541776, + "learning_rate": 1.4773666789293881e-06, + "loss": 1.0886, + "step": 9113 + }, + { + "epoch": 1.2902951794436186, + "grad_norm": 7.738029341085241, + "learning_rate": 1.4768436551242776e-06, + "loss": 0.9879, + "step": 9114 + }, + { + "epoch": 1.2904367523182558, + "grad_norm": 9.231305588520637, + "learning_rate": 1.4763206851057338e-06, + "loss": 0.9892, + "step": 9115 + }, + { + "epoch": 1.290578325192893, + "grad_norm": 10.009828696447634, + "learning_rate": 1.4757977689012482e-06, + "loss": 0.9909, + "step": 9116 + }, + { + "epoch": 1.2907198980675303, + "grad_norm": 
9.002315223448033, + "learning_rate": 1.4752749065383099e-06, + "loss": 0.9638, + "step": 9117 + }, + { + "epoch": 1.2908614709421675, + "grad_norm": 7.899176165926533, + "learning_rate": 1.4747520980444058e-06, + "loss": 0.9875, + "step": 9118 + }, + { + "epoch": 1.2910030438168048, + "grad_norm": 10.919748080076033, + "learning_rate": 1.4742293434470196e-06, + "loss": 1.2018, + "step": 9119 + }, + { + "epoch": 1.291144616691442, + "grad_norm": 10.299688715999352, + "learning_rate": 1.4737066427736317e-06, + "loss": 1.018, + "step": 9120 + }, + { + "epoch": 1.2912861895660792, + "grad_norm": 11.274633945519861, + "learning_rate": 1.4731839960517202e-06, + "loss": 1.0342, + "step": 9121 + }, + { + "epoch": 1.2914277624407164, + "grad_norm": 9.205652689672313, + "learning_rate": 1.4726614033087604e-06, + "loss": 1.06, + "step": 9122 + }, + { + "epoch": 1.2915693353153537, + "grad_norm": 9.9017982468089, + "learning_rate": 1.4721388645722262e-06, + "loss": 1.0607, + "step": 9123 + }, + { + "epoch": 1.291710908189991, + "grad_norm": 9.270207781885794, + "learning_rate": 1.4716163798695842e-06, + "loss": 1.0007, + "step": 9124 + }, + { + "epoch": 1.2918524810646281, + "grad_norm": 10.55473773755161, + "learning_rate": 1.4710939492283022e-06, + "loss": 0.9933, + "step": 9125 + }, + { + "epoch": 1.2919940539392654, + "grad_norm": 10.004961690224084, + "learning_rate": 1.4705715726758444e-06, + "loss": 1.0361, + "step": 9126 + }, + { + "epoch": 1.2921356268139026, + "grad_norm": 10.01790655991868, + "learning_rate": 1.4700492502396708e-06, + "loss": 1.0062, + "step": 9127 + }, + { + "epoch": 1.2922771996885396, + "grad_norm": 8.630361466247281, + "learning_rate": 1.4695269819472406e-06, + "loss": 0.9947, + "step": 9128 + }, + { + "epoch": 1.2924187725631768, + "grad_norm": 8.957563169034652, + "learning_rate": 1.4690047678260086e-06, + "loss": 1.1213, + "step": 9129 + }, + { + "epoch": 1.292560345437814, + "grad_norm": 9.154200405868378, + "learning_rate": 1.468482607903427e-06, + "loss": 0.8793, + "step": 9130 + }, + { + "epoch": 1.2927019183124513, + "grad_norm": 9.352318754665454, + "learning_rate": 1.467960502206946e-06, + "loss": 0.9854, + "step": 9131 + }, + { + "epoch": 1.2928434911870885, + "grad_norm": 9.35465892241334, + "learning_rate": 1.4674384507640115e-06, + "loss": 1.0034, + "step": 9132 + }, + { + "epoch": 1.2929850640617258, + "grad_norm": 9.114934935554643, + "learning_rate": 1.466916453602068e-06, + "loss": 1.0388, + "step": 9133 + }, + { + "epoch": 1.293126636936363, + "grad_norm": 9.00750292521484, + "learning_rate": 1.4663945107485567e-06, + "loss": 1.0283, + "step": 9134 + }, + { + "epoch": 1.2932682098110002, + "grad_norm": 10.448381245642642, + "learning_rate": 1.465872622230915e-06, + "loss": 0.9898, + "step": 9135 + }, + { + "epoch": 1.2934097826856374, + "grad_norm": 9.259431309549713, + "learning_rate": 1.4653507880765783e-06, + "loss": 0.8986, + "step": 9136 + }, + { + "epoch": 1.2935513555602747, + "grad_norm": 9.549283736765492, + "learning_rate": 1.464829008312979e-06, + "loss": 1.0168, + "step": 9137 + }, + { + "epoch": 1.293692928434912, + "grad_norm": 8.657490885787867, + "learning_rate": 1.464307282967547e-06, + "loss": 1.0379, + "step": 9138 + }, + { + "epoch": 1.2938345013095491, + "grad_norm": 11.020997900115736, + "learning_rate": 1.4637856120677088e-06, + "loss": 1.0004, + "step": 9139 + }, + { + "epoch": 1.2939760741841864, + "grad_norm": 10.764218530563678, + "learning_rate": 1.4632639956408884e-06, + "loss": 0.9795, + "step": 9140 + }, + { + "epoch": 
1.2941176470588236, + "grad_norm": 8.185205502082896, + "learning_rate": 1.4627424337145069e-06, + "loss": 0.9732, + "step": 9141 + }, + { + "epoch": 1.2942592199334608, + "grad_norm": 10.681941856981693, + "learning_rate": 1.462220926315982e-06, + "loss": 1.0373, + "step": 9142 + }, + { + "epoch": 1.2944007928080978, + "grad_norm": 8.820710381858424, + "learning_rate": 1.4616994734727293e-06, + "loss": 1.091, + "step": 9143 + }, + { + "epoch": 1.294542365682735, + "grad_norm": 9.250143513983817, + "learning_rate": 1.461178075212162e-06, + "loss": 1.0615, + "step": 9144 + }, + { + "epoch": 1.2946839385573723, + "grad_norm": 9.137071541526884, + "learning_rate": 1.4606567315616876e-06, + "loss": 1.0008, + "step": 9145 + }, + { + "epoch": 1.2948255114320095, + "grad_norm": 11.30996355315659, + "learning_rate": 1.4601354425487141e-06, + "loss": 1.083, + "step": 9146 + }, + { + "epoch": 1.2949670843066468, + "grad_norm": 10.576059880812847, + "learning_rate": 1.4596142082006448e-06, + "loss": 0.9081, + "step": 9147 + }, + { + "epoch": 1.295108657181284, + "grad_norm": 10.067346768005244, + "learning_rate": 1.4590930285448807e-06, + "loss": 1.0478, + "step": 9148 + }, + { + "epoch": 1.2952502300559212, + "grad_norm": 7.698905857178998, + "learning_rate": 1.4585719036088205e-06, + "loss": 1.1135, + "step": 9149 + }, + { + "epoch": 1.2953918029305584, + "grad_norm": 10.41718402849331, + "learning_rate": 1.458050833419858e-06, + "loss": 1.0757, + "step": 9150 + }, + { + "epoch": 1.2955333758051957, + "grad_norm": 9.83300621221372, + "learning_rate": 1.4575298180053875e-06, + "loss": 0.9724, + "step": 9151 + }, + { + "epoch": 1.295674948679833, + "grad_norm": 10.378437254175948, + "learning_rate": 1.4570088573927959e-06, + "loss": 1.0005, + "step": 9152 + }, + { + "epoch": 1.2958165215544701, + "grad_norm": 9.965427526632975, + "learning_rate": 1.4564879516094721e-06, + "loss": 0.933, + "step": 9153 + }, + { + "epoch": 1.2959580944291074, + "grad_norm": 7.851423302170954, + "learning_rate": 1.4559671006827977e-06, + "loss": 0.8402, + "step": 9154 + }, + { + "epoch": 1.2960996673037446, + "grad_norm": 13.233355738495149, + "learning_rate": 1.4554463046401554e-06, + "loss": 0.9269, + "step": 9155 + }, + { + "epoch": 1.2962412401783818, + "grad_norm": 10.686651898915477, + "learning_rate": 1.4549255635089219e-06, + "loss": 1.0889, + "step": 9156 + }, + { + "epoch": 1.296382813053019, + "grad_norm": 9.522585072596819, + "learning_rate": 1.4544048773164712e-06, + "loss": 0.9597, + "step": 9157 + }, + { + "epoch": 1.2965243859276563, + "grad_norm": 9.273220300742011, + "learning_rate": 1.4538842460901774e-06, + "loss": 0.9552, + "step": 9158 + }, + { + "epoch": 1.2966659588022935, + "grad_norm": 9.308501133876227, + "learning_rate": 1.453363669857408e-06, + "loss": 0.9467, + "step": 9159 + }, + { + "epoch": 1.2968075316769307, + "grad_norm": 11.35312301085518, + "learning_rate": 1.4528431486455311e-06, + "loss": 1.0987, + "step": 9160 + }, + { + "epoch": 1.296949104551568, + "grad_norm": 10.043320853293846, + "learning_rate": 1.4523226824819081e-06, + "loss": 1.0955, + "step": 9161 + }, + { + "epoch": 1.2970906774262052, + "grad_norm": 9.464077542629823, + "learning_rate": 1.4518022713939e-06, + "loss": 0.9905, + "step": 9162 + }, + { + "epoch": 1.2972322503008424, + "grad_norm": 10.0126807873962, + "learning_rate": 1.4512819154088665e-06, + "loss": 1.0091, + "step": 9163 + }, + { + "epoch": 1.2973738231754797, + "grad_norm": 9.99697677689282, + "learning_rate": 1.4507616145541595e-06, + "loss": 1.0881, 
+ "step": 9164 + }, + { + "epoch": 1.297515396050117, + "grad_norm": 10.214304105606823, + "learning_rate": 1.4502413688571332e-06, + "loss": 0.9918, + "step": 9165 + }, + { + "epoch": 1.2976569689247541, + "grad_norm": 9.662715191930841, + "learning_rate": 1.4497211783451355e-06, + "loss": 1.0239, + "step": 9166 + }, + { + "epoch": 1.2977985417993914, + "grad_norm": 11.011281903693032, + "learning_rate": 1.4492010430455108e-06, + "loss": 1.0773, + "step": 9167 + }, + { + "epoch": 1.2979401146740286, + "grad_norm": 8.968078375467911, + "learning_rate": 1.4486809629856052e-06, + "loss": 1.0759, + "step": 9168 + }, + { + "epoch": 1.2980816875486656, + "grad_norm": 9.944452121194141, + "learning_rate": 1.4481609381927565e-06, + "loss": 0.9592, + "step": 9169 + }, + { + "epoch": 1.2982232604233028, + "grad_norm": 9.039537264028171, + "learning_rate": 1.4476409686943039e-06, + "loss": 1.0487, + "step": 9170 + }, + { + "epoch": 1.29836483329794, + "grad_norm": 10.204583383641461, + "learning_rate": 1.4471210545175795e-06, + "loss": 0.9428, + "step": 9171 + }, + { + "epoch": 1.2985064061725773, + "grad_norm": 8.97496960138045, + "learning_rate": 1.446601195689918e-06, + "loss": 1.0442, + "step": 9172 + }, + { + "epoch": 1.2986479790472145, + "grad_norm": 7.737389672498846, + "learning_rate": 1.4460813922386446e-06, + "loss": 1.0238, + "step": 9173 + }, + { + "epoch": 1.2987895519218517, + "grad_norm": 9.66997414491781, + "learning_rate": 1.4455616441910878e-06, + "loss": 1.0838, + "step": 9174 + }, + { + "epoch": 1.298931124796489, + "grad_norm": 9.31320167304628, + "learning_rate": 1.445041951574568e-06, + "loss": 0.9115, + "step": 9175 + }, + { + "epoch": 1.2990726976711262, + "grad_norm": 9.305198315900752, + "learning_rate": 1.4445223144164073e-06, + "loss": 1.1832, + "step": 9176 + }, + { + "epoch": 1.2992142705457634, + "grad_norm": 9.184878851227094, + "learning_rate": 1.4440027327439215e-06, + "loss": 0.9302, + "step": 9177 + }, + { + "epoch": 1.2993558434204007, + "grad_norm": 9.717765144909661, + "learning_rate": 1.443483206584424e-06, + "loss": 1.1071, + "step": 9178 + }, + { + "epoch": 1.299497416295038, + "grad_norm": 9.203429238127, + "learning_rate": 1.4429637359652271e-06, + "loss": 1.0073, + "step": 9179 + }, + { + "epoch": 1.2996389891696751, + "grad_norm": 11.247666519604431, + "learning_rate": 1.4424443209136375e-06, + "loss": 1.0551, + "step": 9180 + }, + { + "epoch": 1.2997805620443124, + "grad_norm": 9.597893134032317, + "learning_rate": 1.4419249614569626e-06, + "loss": 0.9677, + "step": 9181 + }, + { + "epoch": 1.2999221349189496, + "grad_norm": 10.395505564892312, + "learning_rate": 1.4414056576225025e-06, + "loss": 0.9605, + "step": 9182 + }, + { + "epoch": 1.3000637077935868, + "grad_norm": 9.236269683878868, + "learning_rate": 1.4408864094375586e-06, + "loss": 1.0012, + "step": 9183 + }, + { + "epoch": 1.300205280668224, + "grad_norm": 9.170130081169319, + "learning_rate": 1.4403672169294252e-06, + "loss": 1.0954, + "step": 9184 + }, + { + "epoch": 1.300346853542861, + "grad_norm": 11.050683558798488, + "learning_rate": 1.4398480801253976e-06, + "loss": 1.0856, + "step": 9185 + }, + { + "epoch": 1.3004884264174983, + "grad_norm": 9.314138793120893, + "learning_rate": 1.4393289990527665e-06, + "loss": 0.9389, + "step": 9186 + }, + { + "epoch": 1.3006299992921355, + "grad_norm": 8.797139084207691, + "learning_rate": 1.4388099737388196e-06, + "loss": 1.0298, + "step": 9187 + }, + { + "epoch": 1.3007715721667727, + "grad_norm": 9.65269706076364, + "learning_rate": 
1.4382910042108405e-06, + "loss": 0.9415, + "step": 9188 + }, + { + "epoch": 1.30091314504141, + "grad_norm": 11.022749178434733, + "learning_rate": 1.437772090496111e-06, + "loss": 1.0113, + "step": 9189 + }, + { + "epoch": 1.3010547179160472, + "grad_norm": 8.606818454781896, + "learning_rate": 1.4372532326219104e-06, + "loss": 1.0574, + "step": 9190 + }, + { + "epoch": 1.3011962907906844, + "grad_norm": 9.597474608451083, + "learning_rate": 1.4367344306155163e-06, + "loss": 1.0117, + "step": 9191 + }, + { + "epoch": 1.3013378636653217, + "grad_norm": 9.319865136869396, + "learning_rate": 1.4362156845041992e-06, + "loss": 0.9161, + "step": 9192 + }, + { + "epoch": 1.301479436539959, + "grad_norm": 10.973525005101946, + "learning_rate": 1.4356969943152315e-06, + "loss": 0.9902, + "step": 9193 + }, + { + "epoch": 1.3016210094145961, + "grad_norm": 7.999466401424506, + "learning_rate": 1.435178360075878e-06, + "loss": 0.8888, + "step": 9194 + }, + { + "epoch": 1.3017625822892334, + "grad_norm": 7.896913828601488, + "learning_rate": 1.4346597818134052e-06, + "loss": 1.09, + "step": 9195 + }, + { + "epoch": 1.3019041551638706, + "grad_norm": 8.401615496101684, + "learning_rate": 1.4341412595550724e-06, + "loss": 1.0049, + "step": 9196 + }, + { + "epoch": 1.3020457280385078, + "grad_norm": 9.552512544385614, + "learning_rate": 1.4336227933281398e-06, + "loss": 1.1129, + "step": 9197 + }, + { + "epoch": 1.302187300913145, + "grad_norm": 9.548521109108664, + "learning_rate": 1.433104383159862e-06, + "loss": 0.9204, + "step": 9198 + }, + { + "epoch": 1.3023288737877823, + "grad_norm": 9.273727502442089, + "learning_rate": 1.43258602907749e-06, + "loss": 1.0326, + "step": 9199 + }, + { + "epoch": 1.3024704466624195, + "grad_norm": 9.175675652595869, + "learning_rate": 1.432067731108276e-06, + "loss": 1.0182, + "step": 9200 + }, + { + "epoch": 1.3026120195370567, + "grad_norm": 7.542113766844245, + "learning_rate": 1.4315494892794635e-06, + "loss": 0.9209, + "step": 9201 + }, + { + "epoch": 1.302753592411694, + "grad_norm": 8.886170570317967, + "learning_rate": 1.4310313036182994e-06, + "loss": 1.0321, + "step": 9202 + }, + { + "epoch": 1.3028951652863312, + "grad_norm": 10.825199144747291, + "learning_rate": 1.4305131741520209e-06, + "loss": 1.0001, + "step": 9203 + }, + { + "epoch": 1.3030367381609684, + "grad_norm": 10.455914545692137, + "learning_rate": 1.4299951009078688e-06, + "loss": 1.034, + "step": 9204 + }, + { + "epoch": 1.3031783110356057, + "grad_norm": 9.197619204753181, + "learning_rate": 1.429477083913075e-06, + "loss": 0.9569, + "step": 9205 + }, + { + "epoch": 1.303319883910243, + "grad_norm": 10.162028854728565, + "learning_rate": 1.4289591231948742e-06, + "loss": 1.0533, + "step": 9206 + }, + { + "epoch": 1.3034614567848801, + "grad_norm": 6.704818973927971, + "learning_rate": 1.4284412187804925e-06, + "loss": 0.8744, + "step": 9207 + }, + { + "epoch": 1.3036030296595174, + "grad_norm": 10.607677745314358, + "learning_rate": 1.4279233706971579e-06, + "loss": 0.9951, + "step": 9208 + }, + { + "epoch": 1.3037446025341546, + "grad_norm": 8.095617601704477, + "learning_rate": 1.4274055789720923e-06, + "loss": 0.9625, + "step": 9209 + }, + { + "epoch": 1.3038861754087916, + "grad_norm": 8.76535561431538, + "learning_rate": 1.4268878436325145e-06, + "loss": 1.0731, + "step": 9210 + }, + { + "epoch": 1.3040277482834288, + "grad_norm": 8.912609092183844, + "learning_rate": 1.4263701647056439e-06, + "loss": 0.9855, + "step": 9211 + }, + { + "epoch": 1.304169321158066, + "grad_norm": 
10.383608268743142, + "learning_rate": 1.425852542218692e-06, + "loss": 1.0372, + "step": 9212 + }, + { + "epoch": 1.3043108940327033, + "grad_norm": 9.114651599446733, + "learning_rate": 1.4253349761988714e-06, + "loss": 1.0325, + "step": 9213 + }, + { + "epoch": 1.3044524669073405, + "grad_norm": 8.892046823094308, + "learning_rate": 1.4248174666733905e-06, + "loss": 1.0282, + "step": 9214 + }, + { + "epoch": 1.3045940397819777, + "grad_norm": 8.298699187410337, + "learning_rate": 1.4243000136694527e-06, + "loss": 0.9995, + "step": 9215 + }, + { + "epoch": 1.304735612656615, + "grad_norm": 9.254085978626128, + "learning_rate": 1.423782617214262e-06, + "loss": 1.0137, + "step": 9216 + }, + { + "epoch": 1.3048771855312522, + "grad_norm": 7.793641790328642, + "learning_rate": 1.4232652773350159e-06, + "loss": 0.9762, + "step": 9217 + }, + { + "epoch": 1.3050187584058894, + "grad_norm": 8.887764361563582, + "learning_rate": 1.4227479940589122e-06, + "loss": 1.178, + "step": 9218 + }, + { + "epoch": 1.3051603312805267, + "grad_norm": 7.492718277031045, + "learning_rate": 1.422230767413143e-06, + "loss": 0.9416, + "step": 9219 + }, + { + "epoch": 1.305301904155164, + "grad_norm": 8.953247282514623, + "learning_rate": 1.421713597424898e-06, + "loss": 1.0597, + "step": 9220 + }, + { + "epoch": 1.3054434770298011, + "grad_norm": 8.942958205473385, + "learning_rate": 1.4211964841213663e-06, + "loss": 1.0513, + "step": 9221 + }, + { + "epoch": 1.3055850499044384, + "grad_norm": 9.439691756436911, + "learning_rate": 1.4206794275297298e-06, + "loss": 1.0112, + "step": 9222 + }, + { + "epoch": 1.3057266227790756, + "grad_norm": 8.570162943088874, + "learning_rate": 1.4201624276771723e-06, + "loss": 0.9729, + "step": 9223 + }, + { + "epoch": 1.3058681956537128, + "grad_norm": 10.52952566094652, + "learning_rate": 1.4196454845908696e-06, + "loss": 0.9895, + "step": 9224 + }, + { + "epoch": 1.30600976852835, + "grad_norm": 9.617897088083895, + "learning_rate": 1.4191285982979992e-06, + "loss": 1.0664, + "step": 9225 + }, + { + "epoch": 1.306151341402987, + "grad_norm": 10.127655434603351, + "learning_rate": 1.4186117688257317e-06, + "loss": 1.0758, + "step": 9226 + }, + { + "epoch": 1.3062929142776243, + "grad_norm": 8.586346242545487, + "learning_rate": 1.4180949962012377e-06, + "loss": 0.9836, + "step": 9227 + }, + { + "epoch": 1.3064344871522615, + "grad_norm": 9.65981347956372, + "learning_rate": 1.4175782804516824e-06, + "loss": 0.9318, + "step": 9228 + }, + { + "epoch": 1.3065760600268987, + "grad_norm": 9.600701910425693, + "learning_rate": 1.417061621604231e-06, + "loss": 1.0413, + "step": 9229 + }, + { + "epoch": 1.306717632901536, + "grad_norm": 9.748774524848143, + "learning_rate": 1.4165450196860423e-06, + "loss": 0.9689, + "step": 9230 + }, + { + "epoch": 1.3068592057761732, + "grad_norm": 8.561893191032485, + "learning_rate": 1.4160284747242731e-06, + "loss": 0.994, + "step": 9231 + }, + { + "epoch": 1.3070007786508104, + "grad_norm": 8.857102090768123, + "learning_rate": 1.4155119867460799e-06, + "loss": 0.9995, + "step": 9232 + }, + { + "epoch": 1.3071423515254477, + "grad_norm": 8.661546319425574, + "learning_rate": 1.4149955557786118e-06, + "loss": 1.004, + "step": 9233 + }, + { + "epoch": 1.307283924400085, + "grad_norm": 9.607962453344769, + "learning_rate": 1.4144791818490194e-06, + "loss": 1.1404, + "step": 9234 + }, + { + "epoch": 1.3074254972747221, + "grad_norm": 9.657909460588549, + "learning_rate": 1.4139628649844462e-06, + "loss": 0.8855, + "step": 9235 + }, + { + "epoch": 
1.3075670701493594, + "grad_norm": 8.326682933896343, + "learning_rate": 1.4134466052120349e-06, + "loss": 0.9045, + "step": 9236 + }, + { + "epoch": 1.3077086430239966, + "grad_norm": 8.465965798672759, + "learning_rate": 1.412930402558927e-06, + "loss": 0.9687, + "step": 9237 + }, + { + "epoch": 1.3078502158986338, + "grad_norm": 7.793979757022779, + "learning_rate": 1.412414257052256e-06, + "loss": 0.9504, + "step": 9238 + }, + { + "epoch": 1.307991788773271, + "grad_norm": 7.73239622138331, + "learning_rate": 1.4118981687191573e-06, + "loss": 0.8713, + "step": 9239 + }, + { + "epoch": 1.3081333616479083, + "grad_norm": 8.682905779933531, + "learning_rate": 1.411382137586761e-06, + "loss": 0.9947, + "step": 9240 + }, + { + "epoch": 1.3082749345225455, + "grad_norm": 8.909981143473356, + "learning_rate": 1.4108661636821928e-06, + "loss": 1.0371, + "step": 9241 + }, + { + "epoch": 1.3084165073971827, + "grad_norm": 10.124347430262944, + "learning_rate": 1.4103502470325791e-06, + "loss": 1.107, + "step": 9242 + }, + { + "epoch": 1.30855808027182, + "grad_norm": 9.545207425836596, + "learning_rate": 1.4098343876650398e-06, + "loss": 1.0248, + "step": 9243 + }, + { + "epoch": 1.3086996531464572, + "grad_norm": 10.11932289240645, + "learning_rate": 1.4093185856066945e-06, + "loss": 0.9979, + "step": 9244 + }, + { + "epoch": 1.3088412260210944, + "grad_norm": 9.836347818337877, + "learning_rate": 1.4088028408846572e-06, + "loss": 1.0582, + "step": 9245 + }, + { + "epoch": 1.3089827988957317, + "grad_norm": 8.514058772250973, + "learning_rate": 1.4082871535260418e-06, + "loss": 0.9716, + "step": 9246 + }, + { + "epoch": 1.3091243717703689, + "grad_norm": 9.825079174547797, + "learning_rate": 1.4077715235579559e-06, + "loss": 0.9755, + "step": 9247 + }, + { + "epoch": 1.3092659446450061, + "grad_norm": 8.409707191622555, + "learning_rate": 1.4072559510075073e-06, + "loss": 1.0548, + "step": 9248 + }, + { + "epoch": 1.3094075175196433, + "grad_norm": 8.690192594336455, + "learning_rate": 1.4067404359017977e-06, + "loss": 1.0918, + "step": 9249 + }, + { + "epoch": 1.3095490903942806, + "grad_norm": 9.019883549989496, + "learning_rate": 1.4062249782679294e-06, + "loss": 1.0154, + "step": 9250 + }, + { + "epoch": 1.3096906632689178, + "grad_norm": 9.698872247007357, + "learning_rate": 1.4057095781329983e-06, + "loss": 0.9101, + "step": 9251 + }, + { + "epoch": 1.3098322361435548, + "grad_norm": 9.080068672357415, + "learning_rate": 1.4051942355240977e-06, + "loss": 0.923, + "step": 9252 + }, + { + "epoch": 1.309973809018192, + "grad_norm": 8.085760584568222, + "learning_rate": 1.404678950468321e-06, + "loss": 1.1172, + "step": 9253 + }, + { + "epoch": 1.3101153818928293, + "grad_norm": 9.168204253739184, + "learning_rate": 1.4041637229927541e-06, + "loss": 1.0795, + "step": 9254 + }, + { + "epoch": 1.3102569547674665, + "grad_norm": 9.833545445455165, + "learning_rate": 1.403648553124484e-06, + "loss": 1.0066, + "step": 9255 + }, + { + "epoch": 1.3103985276421037, + "grad_norm": 8.560290420454177, + "learning_rate": 1.4031334408905911e-06, + "loss": 1.0598, + "step": 9256 + }, + { + "epoch": 1.310540100516741, + "grad_norm": 9.91338565521012, + "learning_rate": 1.4026183863181563e-06, + "loss": 0.9445, + "step": 9257 + }, + { + "epoch": 1.3106816733913782, + "grad_norm": 9.923540783517186, + "learning_rate": 1.4021033894342539e-06, + "loss": 1.0357, + "step": 9258 + }, + { + "epoch": 1.3108232462660154, + "grad_norm": 9.98873801263007, + "learning_rate": 1.4015884502659574e-06, + "loss": 1.0552, + 
"step": 9259 + }, + { + "epoch": 1.3109648191406527, + "grad_norm": 8.925662010281217, + "learning_rate": 1.4010735688403383e-06, + "loss": 1.0466, + "step": 9260 + }, + { + "epoch": 1.31110639201529, + "grad_norm": 8.653439723194976, + "learning_rate": 1.4005587451844621e-06, + "loss": 1.0211, + "step": 9261 + }, + { + "epoch": 1.3112479648899271, + "grad_norm": 8.255595303840952, + "learning_rate": 1.4000439793253931e-06, + "loss": 0.9291, + "step": 9262 + }, + { + "epoch": 1.3113895377645644, + "grad_norm": 9.138866183617655, + "learning_rate": 1.3995292712901908e-06, + "loss": 1.0579, + "step": 9263 + }, + { + "epoch": 1.3115311106392016, + "grad_norm": 10.438035048973239, + "learning_rate": 1.3990146211059141e-06, + "loss": 0.9722, + "step": 9264 + }, + { + "epoch": 1.3116726835138388, + "grad_norm": 9.06995921469606, + "learning_rate": 1.398500028799619e-06, + "loss": 0.9909, + "step": 9265 + }, + { + "epoch": 1.311814256388476, + "grad_norm": 8.691410639043836, + "learning_rate": 1.397985494398355e-06, + "loss": 0.946, + "step": 9266 + }, + { + "epoch": 1.311955829263113, + "grad_norm": 11.05422888569457, + "learning_rate": 1.3974710179291729e-06, + "loss": 1.0536, + "step": 9267 + }, + { + "epoch": 1.3120974021377503, + "grad_norm": 8.46029139312252, + "learning_rate": 1.3969565994191165e-06, + "loss": 1.078, + "step": 9268 + }, + { + "epoch": 1.3122389750123875, + "grad_norm": 10.602842676714035, + "learning_rate": 1.3964422388952298e-06, + "loss": 1.0038, + "step": 9269 + }, + { + "epoch": 1.3123805478870247, + "grad_norm": 9.376588000272566, + "learning_rate": 1.3959279363845508e-06, + "loss": 0.9828, + "step": 9270 + }, + { + "epoch": 1.312522120761662, + "grad_norm": 7.569749116355859, + "learning_rate": 1.3954136919141182e-06, + "loss": 0.954, + "step": 9271 + }, + { + "epoch": 1.3126636936362992, + "grad_norm": 8.928598092175589, + "learning_rate": 1.3948995055109641e-06, + "loss": 1.0501, + "step": 9272 + }, + { + "epoch": 1.3128052665109364, + "grad_norm": 7.536348128990452, + "learning_rate": 1.3943853772021179e-06, + "loss": 1.002, + "step": 9273 + }, + { + "epoch": 1.3129468393855737, + "grad_norm": 8.032799715967993, + "learning_rate": 1.3938713070146093e-06, + "loss": 0.9542, + "step": 9274 + }, + { + "epoch": 1.313088412260211, + "grad_norm": 8.395009747443297, + "learning_rate": 1.3933572949754598e-06, + "loss": 0.9093, + "step": 9275 + }, + { + "epoch": 1.3132299851348481, + "grad_norm": 8.880803319663096, + "learning_rate": 1.3928433411116938e-06, + "loss": 1.0384, + "step": 9276 + }, + { + "epoch": 1.3133715580094854, + "grad_norm": 10.846637088646991, + "learning_rate": 1.3923294454503263e-06, + "loss": 1.0619, + "step": 9277 + }, + { + "epoch": 1.3135131308841226, + "grad_norm": 8.209033400475185, + "learning_rate": 1.3918156080183754e-06, + "loss": 0.8577, + "step": 9278 + }, + { + "epoch": 1.3136547037587598, + "grad_norm": 9.779688701394917, + "learning_rate": 1.3913018288428503e-06, + "loss": 1.0353, + "step": 9279 + }, + { + "epoch": 1.313796276633397, + "grad_norm": 8.836042840331405, + "learning_rate": 1.3907881079507623e-06, + "loss": 1.0757, + "step": 9280 + }, + { + "epoch": 1.3139378495080343, + "grad_norm": 9.591035042477698, + "learning_rate": 1.3902744453691158e-06, + "loss": 0.959, + "step": 9281 + }, + { + "epoch": 1.3140794223826715, + "grad_norm": 9.27924034889908, + "learning_rate": 1.3897608411249153e-06, + "loss": 1.0454, + "step": 9282 + }, + { + "epoch": 1.3142209952573087, + "grad_norm": 9.314965657750871, + "learning_rate": 
1.3892472952451592e-06, + "loss": 0.9813, + "step": 9283 + }, + { + "epoch": 1.314362568131946, + "grad_norm": 9.19217692596175, + "learning_rate": 1.3887338077568437e-06, + "loss": 0.9924, + "step": 9284 + }, + { + "epoch": 1.3145041410065832, + "grad_norm": 8.386554547618216, + "learning_rate": 1.3882203786869644e-06, + "loss": 0.9874, + "step": 9285 + }, + { + "epoch": 1.3146457138812204, + "grad_norm": 9.15552463197064, + "learning_rate": 1.3877070080625098e-06, + "loss": 0.9185, + "step": 9286 + }, + { + "epoch": 1.3147872867558577, + "grad_norm": 9.316207659702064, + "learning_rate": 1.3871936959104684e-06, + "loss": 1.0378, + "step": 9287 + }, + { + "epoch": 1.3149288596304949, + "grad_norm": 8.982334601867286, + "learning_rate": 1.3866804422578256e-06, + "loss": 0.9764, + "step": 9288 + }, + { + "epoch": 1.3150704325051321, + "grad_norm": 10.336124986489285, + "learning_rate": 1.386167247131561e-06, + "loss": 1.0397, + "step": 9289 + }, + { + "epoch": 1.3152120053797693, + "grad_norm": 8.190121762368486, + "learning_rate": 1.3856541105586545e-06, + "loss": 0.9885, + "step": 9290 + }, + { + "epoch": 1.3153535782544066, + "grad_norm": 9.599639377178447, + "learning_rate": 1.3851410325660796e-06, + "loss": 0.9801, + "step": 9291 + }, + { + "epoch": 1.3154951511290438, + "grad_norm": 7.8240237275994415, + "learning_rate": 1.3846280131808103e-06, + "loss": 0.9057, + "step": 9292 + }, + { + "epoch": 1.3156367240036808, + "grad_norm": 9.234415070370972, + "learning_rate": 1.3841150524298148e-06, + "loss": 1.1202, + "step": 9293 + }, + { + "epoch": 1.315778296878318, + "grad_norm": 8.810141822807864, + "learning_rate": 1.3836021503400583e-06, + "loss": 1.1233, + "step": 9294 + }, + { + "epoch": 1.3159198697529553, + "grad_norm": 8.75636365273538, + "learning_rate": 1.3830893069385046e-06, + "loss": 0.9229, + "step": 9295 + }, + { + "epoch": 1.3160614426275925, + "grad_norm": 8.83292268002438, + "learning_rate": 1.3825765222521127e-06, + "loss": 0.9919, + "step": 9296 + }, + { + "epoch": 1.3162030155022297, + "grad_norm": 8.868865996147264, + "learning_rate": 1.3820637963078406e-06, + "loss": 1.0244, + "step": 9297 + }, + { + "epoch": 1.316344588376867, + "grad_norm": 10.118970794813524, + "learning_rate": 1.3815511291326404e-06, + "loss": 0.9589, + "step": 9298 + }, + { + "epoch": 1.3164861612515042, + "grad_norm": 8.25273040902837, + "learning_rate": 1.3810385207534641e-06, + "loss": 1.0533, + "step": 9299 + }, + { + "epoch": 1.3166277341261414, + "grad_norm": 10.682998155244666, + "learning_rate": 1.3805259711972577e-06, + "loss": 1.203, + "step": 9300 + }, + { + "epoch": 1.3167693070007787, + "grad_norm": 8.243720699371023, + "learning_rate": 1.380013480490967e-06, + "loss": 0.9706, + "step": 9301 + }, + { + "epoch": 1.3169108798754159, + "grad_norm": 7.677327939350389, + "learning_rate": 1.3795010486615318e-06, + "loss": 0.9221, + "step": 9302 + }, + { + "epoch": 1.3170524527500531, + "grad_norm": 9.65815000143378, + "learning_rate": 1.3789886757358916e-06, + "loss": 1.0548, + "step": 9303 + }, + { + "epoch": 1.3171940256246903, + "grad_norm": 8.047131462084263, + "learning_rate": 1.3784763617409814e-06, + "loss": 1.0044, + "step": 9304 + }, + { + "epoch": 1.3173355984993276, + "grad_norm": 8.556701924640617, + "learning_rate": 1.3779641067037313e-06, + "loss": 1.0343, + "step": 9305 + }, + { + "epoch": 1.3174771713739648, + "grad_norm": 12.655101582734764, + "learning_rate": 1.3774519106510725e-06, + "loss": 0.9896, + "step": 9306 + }, + { + "epoch": 1.317618744248602, + "grad_norm": 
8.221871715096977, + "learning_rate": 1.3769397736099288e-06, + "loss": 0.8893, + "step": 9307 + }, + { + "epoch": 1.3177603171232393, + "grad_norm": 8.259836055430982, + "learning_rate": 1.3764276956072248e-06, + "loss": 1.0051, + "step": 9308 + }, + { + "epoch": 1.3179018899978763, + "grad_norm": 10.77733631693745, + "learning_rate": 1.3759156766698783e-06, + "loss": 1.1019, + "step": 9309 + }, + { + "epoch": 1.3180434628725135, + "grad_norm": 8.148011003489643, + "learning_rate": 1.3754037168248063e-06, + "loss": 0.9543, + "step": 9310 + }, + { + "epoch": 1.3181850357471507, + "grad_norm": 9.128676000653234, + "learning_rate": 1.3748918160989232e-06, + "loss": 0.9453, + "step": 9311 + }, + { + "epoch": 1.318326608621788, + "grad_norm": 8.165766867548708, + "learning_rate": 1.3743799745191377e-06, + "loss": 1.0606, + "step": 9312 + }, + { + "epoch": 1.3184681814964252, + "grad_norm": 9.626503170594036, + "learning_rate": 1.3738681921123586e-06, + "loss": 0.9754, + "step": 9313 + }, + { + "epoch": 1.3186097543710624, + "grad_norm": 7.257631691773667, + "learning_rate": 1.373356468905489e-06, + "loss": 1.094, + "step": 9314 + }, + { + "epoch": 1.3187513272456997, + "grad_norm": 9.577877670007124, + "learning_rate": 1.3728448049254296e-06, + "loss": 0.9891, + "step": 9315 + }, + { + "epoch": 1.3188929001203369, + "grad_norm": 7.746647263380603, + "learning_rate": 1.3723332001990774e-06, + "loss": 0.9637, + "step": 9316 + }, + { + "epoch": 1.3190344729949741, + "grad_norm": 8.842785031138867, + "learning_rate": 1.3718216547533282e-06, + "loss": 0.9399, + "step": 9317 + }, + { + "epoch": 1.3191760458696113, + "grad_norm": 9.136311665925895, + "learning_rate": 1.3713101686150742e-06, + "loss": 0.9162, + "step": 9318 + }, + { + "epoch": 1.3193176187442486, + "grad_norm": 9.62765704054377, + "learning_rate": 1.370798741811202e-06, + "loss": 1.0525, + "step": 9319 + }, + { + "epoch": 1.3194591916188858, + "grad_norm": 10.239314766845057, + "learning_rate": 1.370287374368599e-06, + "loss": 1.0594, + "step": 9320 + }, + { + "epoch": 1.319600764493523, + "grad_norm": 9.852927347094816, + "learning_rate": 1.3697760663141457e-06, + "loss": 0.9849, + "step": 9321 + }, + { + "epoch": 1.3197423373681603, + "grad_norm": 9.212379038623494, + "learning_rate": 1.3692648176747224e-06, + "loss": 1.0392, + "step": 9322 + }, + { + "epoch": 1.3198839102427975, + "grad_norm": 9.552979361060693, + "learning_rate": 1.368753628477204e-06, + "loss": 1.0169, + "step": 9323 + }, + { + "epoch": 1.3200254831174347, + "grad_norm": 8.470713707985665, + "learning_rate": 1.3682424987484647e-06, + "loss": 0.9959, + "step": 9324 + }, + { + "epoch": 1.320167055992072, + "grad_norm": 9.446490117345668, + "learning_rate": 1.367731428515373e-06, + "loss": 0.9232, + "step": 9325 + }, + { + "epoch": 1.3203086288667092, + "grad_norm": 10.46527679675111, + "learning_rate": 1.3672204178047955e-06, + "loss": 0.9771, + "step": 9326 + }, + { + "epoch": 1.3204502017413464, + "grad_norm": 8.231684380889615, + "learning_rate": 1.3667094666435964e-06, + "loss": 1.0872, + "step": 9327 + }, + { + "epoch": 1.3205917746159836, + "grad_norm": 9.701473273642474, + "learning_rate": 1.3661985750586348e-06, + "loss": 0.9733, + "step": 9328 + }, + { + "epoch": 1.3207333474906209, + "grad_norm": 7.826571555959649, + "learning_rate": 1.36568774307677e-06, + "loss": 0.9767, + "step": 9329 + }, + { + "epoch": 1.320874920365258, + "grad_norm": 8.151851008557667, + "learning_rate": 1.3651769707248535e-06, + "loss": 0.9511, + "step": 9330 + }, + { + "epoch": 
1.3210164932398953, + "grad_norm": 9.451989778814225, + "learning_rate": 1.3646662580297385e-06, + "loss": 1.0072, + "step": 9331 + }, + { + "epoch": 1.3211580661145326, + "grad_norm": 8.948449743258168, + "learning_rate": 1.364155605018271e-06, + "loss": 0.994, + "step": 9332 + }, + { + "epoch": 1.3212996389891698, + "grad_norm": 10.451080806636961, + "learning_rate": 1.3636450117172962e-06, + "loss": 1.0278, + "step": 9333 + }, + { + "epoch": 1.321441211863807, + "grad_norm": 7.9436787263687645, + "learning_rate": 1.3631344781536565e-06, + "loss": 0.9729, + "step": 9334 + }, + { + "epoch": 1.321582784738444, + "grad_norm": 8.738044636660174, + "learning_rate": 1.3626240043541901e-06, + "loss": 0.9645, + "step": 9335 + }, + { + "epoch": 1.3217243576130813, + "grad_norm": 8.198241198723139, + "learning_rate": 1.3621135903457318e-06, + "loss": 0.8561, + "step": 9336 + }, + { + "epoch": 1.3218659304877185, + "grad_norm": 8.960855764666467, + "learning_rate": 1.3616032361551124e-06, + "loss": 1.0485, + "step": 9337 + }, + { + "epoch": 1.3220075033623557, + "grad_norm": 8.476913515197653, + "learning_rate": 1.3610929418091618e-06, + "loss": 0.9732, + "step": 9338 + }, + { + "epoch": 1.322149076236993, + "grad_norm": 9.285453281297588, + "learning_rate": 1.3605827073347074e-06, + "loss": 0.9808, + "step": 9339 + }, + { + "epoch": 1.3222906491116302, + "grad_norm": 11.979056201370204, + "learning_rate": 1.3600725327585695e-06, + "loss": 1.0286, + "step": 9340 + }, + { + "epoch": 1.3224322219862674, + "grad_norm": 7.193608350378328, + "learning_rate": 1.3595624181075695e-06, + "loss": 0.9666, + "step": 9341 + }, + { + "epoch": 1.3225737948609047, + "grad_norm": 8.192487522519679, + "learning_rate": 1.3590523634085218e-06, + "loss": 0.8864, + "step": 9342 + }, + { + "epoch": 1.3227153677355419, + "grad_norm": 9.625489457801411, + "learning_rate": 1.3585423686882415e-06, + "loss": 1.1868, + "step": 9343 + }, + { + "epoch": 1.322856940610179, + "grad_norm": 10.733526693304437, + "learning_rate": 1.3580324339735369e-06, + "loss": 0.974, + "step": 9344 + }, + { + "epoch": 1.3229985134848163, + "grad_norm": 9.70799833273667, + "learning_rate": 1.3575225592912166e-06, + "loss": 1.0443, + "step": 9345 + }, + { + "epoch": 1.3231400863594536, + "grad_norm": 10.05698978412857, + "learning_rate": 1.3570127446680838e-06, + "loss": 0.9424, + "step": 9346 + }, + { + "epoch": 1.3232816592340908, + "grad_norm": 8.681707632966939, + "learning_rate": 1.3565029901309378e-06, + "loss": 0.9985, + "step": 9347 + }, + { + "epoch": 1.323423232108728, + "grad_norm": 10.618530457973648, + "learning_rate": 1.3559932957065777e-06, + "loss": 1.0356, + "step": 9348 + }, + { + "epoch": 1.3235648049833653, + "grad_norm": 8.556988132423552, + "learning_rate": 1.3554836614217963e-06, + "loss": 0.8964, + "step": 9349 + }, + { + "epoch": 1.3237063778580023, + "grad_norm": 7.659447325447106, + "learning_rate": 1.354974087303386e-06, + "loss": 0.948, + "step": 9350 + }, + { + "epoch": 1.3238479507326395, + "grad_norm": 9.586158358167838, + "learning_rate": 1.3544645733781335e-06, + "loss": 1.1109, + "step": 9351 + }, + { + "epoch": 1.3239895236072767, + "grad_norm": 7.2331120025009055, + "learning_rate": 1.3539551196728252e-06, + "loss": 1.0244, + "step": 9352 + }, + { + "epoch": 1.324131096481914, + "grad_norm": 8.801945852676955, + "learning_rate": 1.3534457262142408e-06, + "loss": 0.9099, + "step": 9353 + }, + { + "epoch": 1.3242726693565512, + "grad_norm": 13.411445944573567, + "learning_rate": 1.3529363930291606e-06, + "loss": 
1.0798, + "step": 9354 + }, + { + "epoch": 1.3244142422311884, + "grad_norm": 10.660047106325658, + "learning_rate": 1.3524271201443578e-06, + "loss": 1.0275, + "step": 9355 + }, + { + "epoch": 1.3245558151058257, + "grad_norm": 8.196150308700574, + "learning_rate": 1.3519179075866067e-06, + "loss": 0.9788, + "step": 9356 + }, + { + "epoch": 1.3246973879804629, + "grad_norm": 8.902821091579447, + "learning_rate": 1.3514087553826753e-06, + "loss": 1.0657, + "step": 9357 + }, + { + "epoch": 1.3248389608551001, + "grad_norm": 9.3787102033342, + "learning_rate": 1.350899663559328e-06, + "loss": 0.9677, + "step": 9358 + }, + { + "epoch": 1.3249805337297373, + "grad_norm": 9.292871462684323, + "learning_rate": 1.3503906321433298e-06, + "loss": 1.0464, + "step": 9359 + }, + { + "epoch": 1.3251221066043746, + "grad_norm": 11.598700851827989, + "learning_rate": 1.3498816611614373e-06, + "loss": 1.1955, + "step": 9360 + }, + { + "epoch": 1.3252636794790118, + "grad_norm": 8.345531487747696, + "learning_rate": 1.3493727506404092e-06, + "loss": 0.9694, + "step": 9361 + }, + { + "epoch": 1.325405252353649, + "grad_norm": 10.106713630077317, + "learning_rate": 1.348863900606998e-06, + "loss": 1.1043, + "step": 9362 + }, + { + "epoch": 1.3255468252282863, + "grad_norm": 11.403742768104415, + "learning_rate": 1.3483551110879525e-06, + "loss": 1.042, + "step": 9363 + }, + { + "epoch": 1.3256883981029235, + "grad_norm": 9.202439391110941, + "learning_rate": 1.347846382110021e-06, + "loss": 1.0678, + "step": 9364 + }, + { + "epoch": 1.3258299709775607, + "grad_norm": 10.649148941063098, + "learning_rate": 1.3473377136999452e-06, + "loss": 0.9584, + "step": 9365 + }, + { + "epoch": 1.325971543852198, + "grad_norm": 8.054489532909097, + "learning_rate": 1.3468291058844673e-06, + "loss": 0.9748, + "step": 9366 + }, + { + "epoch": 1.3261131167268352, + "grad_norm": 10.857923056599146, + "learning_rate": 1.3463205586903233e-06, + "loss": 1.0231, + "step": 9367 + }, + { + "epoch": 1.3262546896014724, + "grad_norm": 10.17665768178657, + "learning_rate": 1.3458120721442464e-06, + "loss": 1.0594, + "step": 9368 + }, + { + "epoch": 1.3263962624761096, + "grad_norm": 9.710046532437904, + "learning_rate": 1.3453036462729697e-06, + "loss": 1.0389, + "step": 9369 + }, + { + "epoch": 1.3265378353507469, + "grad_norm": 8.56545100906949, + "learning_rate": 1.3447952811032177e-06, + "loss": 0.9617, + "step": 9370 + }, + { + "epoch": 1.326679408225384, + "grad_norm": 8.357991726234285, + "learning_rate": 1.3442869766617178e-06, + "loss": 0.9527, + "step": 9371 + }, + { + "epoch": 1.3268209811000213, + "grad_norm": 9.588300219467099, + "learning_rate": 1.3437787329751887e-06, + "loss": 1.0988, + "step": 9372 + }, + { + "epoch": 1.3269625539746586, + "grad_norm": 7.927296726263656, + "learning_rate": 1.3432705500703501e-06, + "loss": 1.0545, + "step": 9373 + }, + { + "epoch": 1.3271041268492958, + "grad_norm": 9.336045779314551, + "learning_rate": 1.342762427973916e-06, + "loss": 0.9728, + "step": 9374 + }, + { + "epoch": 1.327245699723933, + "grad_norm": 9.653399296670424, + "learning_rate": 1.3422543667125988e-06, + "loss": 1.0305, + "step": 9375 + }, + { + "epoch": 1.32738727259857, + "grad_norm": 7.383084795739124, + "learning_rate": 1.341746366313105e-06, + "loss": 0.9894, + "step": 9376 + }, + { + "epoch": 1.3275288454732073, + "grad_norm": 8.193428048356424, + "learning_rate": 1.3412384268021421e-06, + "loss": 0.9595, + "step": 9377 + }, + { + "epoch": 1.3276704183478445, + "grad_norm": 9.461808382708549, + 
"learning_rate": 1.3407305482064115e-06, + "loss": 1.008, + "step": 9378 + }, + { + "epoch": 1.3278119912224817, + "grad_norm": 10.332902919870332, + "learning_rate": 1.3402227305526106e-06, + "loss": 1.1508, + "step": 9379 + }, + { + "epoch": 1.327953564097119, + "grad_norm": 10.554449320152148, + "learning_rate": 1.3397149738674363e-06, + "loss": 1.0234, + "step": 9380 + }, + { + "epoch": 1.3280951369717562, + "grad_norm": 10.732479988753036, + "learning_rate": 1.3392072781775806e-06, + "loss": 1.0535, + "step": 9381 + }, + { + "epoch": 1.3282367098463934, + "grad_norm": 10.630895796430812, + "learning_rate": 1.3386996435097333e-06, + "loss": 1.1745, + "step": 9382 + }, + { + "epoch": 1.3283782827210306, + "grad_norm": 8.599214793000135, + "learning_rate": 1.3381920698905788e-06, + "loss": 0.9446, + "step": 9383 + }, + { + "epoch": 1.3285198555956679, + "grad_norm": 9.641034134602487, + "learning_rate": 1.3376845573468012e-06, + "loss": 0.9709, + "step": 9384 + }, + { + "epoch": 1.328661428470305, + "grad_norm": 7.310013568411575, + "learning_rate": 1.3371771059050803e-06, + "loss": 0.9258, + "step": 9385 + }, + { + "epoch": 1.3288030013449423, + "grad_norm": 9.605888118641168, + "learning_rate": 1.3366697155920913e-06, + "loss": 1.0765, + "step": 9386 + }, + { + "epoch": 1.3289445742195796, + "grad_norm": 9.534876599365967, + "learning_rate": 1.3361623864345086e-06, + "loss": 0.9905, + "step": 9387 + }, + { + "epoch": 1.3290861470942168, + "grad_norm": 7.7414631740514865, + "learning_rate": 1.3356551184590017e-06, + "loss": 0.9322, + "step": 9388 + }, + { + "epoch": 1.329227719968854, + "grad_norm": 8.800756976775226, + "learning_rate": 1.3351479116922372e-06, + "loss": 0.9767, + "step": 9389 + }, + { + "epoch": 1.3293692928434913, + "grad_norm": 9.803192583993603, + "learning_rate": 1.3346407661608771e-06, + "loss": 0.9746, + "step": 9390 + }, + { + "epoch": 1.3295108657181285, + "grad_norm": 9.819305586803543, + "learning_rate": 1.3341336818915832e-06, + "loss": 1.0583, + "step": 9391 + }, + { + "epoch": 1.3296524385927655, + "grad_norm": 9.01611707590717, + "learning_rate": 1.3336266589110131e-06, + "loss": 1.0699, + "step": 9392 + }, + { + "epoch": 1.3297940114674027, + "grad_norm": 12.285784711667599, + "learning_rate": 1.333119697245819e-06, + "loss": 1.1801, + "step": 9393 + }, + { + "epoch": 1.32993558434204, + "grad_norm": 8.347209770318884, + "learning_rate": 1.3326127969226535e-06, + "loss": 0.9246, + "step": 9394 + }, + { + "epoch": 1.3300771572166772, + "grad_norm": 8.026550580491685, + "learning_rate": 1.3321059579681617e-06, + "loss": 0.8929, + "step": 9395 + }, + { + "epoch": 1.3302187300913144, + "grad_norm": 10.595450205794776, + "learning_rate": 1.3315991804089897e-06, + "loss": 1.0853, + "step": 9396 + }, + { + "epoch": 1.3303603029659516, + "grad_norm": 8.406793172652335, + "learning_rate": 1.3310924642717767e-06, + "loss": 1.072, + "step": 9397 + }, + { + "epoch": 1.3305018758405889, + "grad_norm": 10.009114879761823, + "learning_rate": 1.3305858095831626e-06, + "loss": 0.9873, + "step": 9398 + }, + { + "epoch": 1.330643448715226, + "grad_norm": 7.470086014136233, + "learning_rate": 1.33007921636978e-06, + "loss": 0.9447, + "step": 9399 + }, + { + "epoch": 1.3307850215898633, + "grad_norm": 9.488741176069949, + "learning_rate": 1.3295726846582602e-06, + "loss": 1.0638, + "step": 9400 + }, + { + "epoch": 1.3309265944645006, + "grad_norm": 9.81334706920547, + "learning_rate": 1.3290662144752322e-06, + "loss": 0.9436, + "step": 9401 + }, + { + "epoch": 
1.3310681673391378, + "grad_norm": 8.184439283586796, + "learning_rate": 1.3285598058473195e-06, + "loss": 1.0645, + "step": 9402 + }, + { + "epoch": 1.331209740213775, + "grad_norm": 8.650588845733548, + "learning_rate": 1.3280534588011451e-06, + "loss": 0.8941, + "step": 9403 + }, + { + "epoch": 1.3313513130884123, + "grad_norm": 10.401473131751429, + "learning_rate": 1.3275471733633258e-06, + "loss": 1.0813, + "step": 9404 + }, + { + "epoch": 1.3314928859630495, + "grad_norm": 10.403923439011107, + "learning_rate": 1.3270409495604783e-06, + "loss": 0.9888, + "step": 9405 + }, + { + "epoch": 1.3316344588376867, + "grad_norm": 11.140258611831081, + "learning_rate": 1.3265347874192125e-06, + "loss": 1.0109, + "step": 9406 + }, + { + "epoch": 1.331776031712324, + "grad_norm": 8.702329424616767, + "learning_rate": 1.3260286869661378e-06, + "loss": 0.9798, + "step": 9407 + }, + { + "epoch": 1.3319176045869612, + "grad_norm": 9.538167475670853, + "learning_rate": 1.325522648227861e-06, + "loss": 0.9253, + "step": 9408 + }, + { + "epoch": 1.3320591774615984, + "grad_norm": 8.79601894841185, + "learning_rate": 1.3250166712309825e-06, + "loss": 1.0069, + "step": 9409 + }, + { + "epoch": 1.3322007503362356, + "grad_norm": 10.132074215948109, + "learning_rate": 1.3245107560021015e-06, + "loss": 1.0758, + "step": 9410 + }, + { + "epoch": 1.3323423232108729, + "grad_norm": 9.552726588053972, + "learning_rate": 1.324004902567813e-06, + "loss": 0.8801, + "step": 9411 + }, + { + "epoch": 1.33248389608551, + "grad_norm": 10.670493710937476, + "learning_rate": 1.3234991109547104e-06, + "loss": 1.1222, + "step": 9412 + }, + { + "epoch": 1.3326254689601473, + "grad_norm": 8.623953935517838, + "learning_rate": 1.3229933811893814e-06, + "loss": 1.0817, + "step": 9413 + }, + { + "epoch": 1.3327670418347846, + "grad_norm": 9.442576678719432, + "learning_rate": 1.3224877132984131e-06, + "loss": 1.0001, + "step": 9414 + }, + { + "epoch": 1.3329086147094218, + "grad_norm": 8.629113142607975, + "learning_rate": 1.3219821073083882e-06, + "loss": 1.0904, + "step": 9415 + }, + { + "epoch": 1.333050187584059, + "grad_norm": 8.78783845643103, + "learning_rate": 1.3214765632458852e-06, + "loss": 1.0422, + "step": 9416 + }, + { + "epoch": 1.3331917604586963, + "grad_norm": 8.003245172339419, + "learning_rate": 1.320971081137481e-06, + "loss": 0.9466, + "step": 9417 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 8.730388352538094, + "learning_rate": 1.3204656610097472e-06, + "loss": 1.084, + "step": 9418 + }, + { + "epoch": 1.3334749062079705, + "grad_norm": 9.825378519391322, + "learning_rate": 1.3199603028892548e-06, + "loss": 0.9988, + "step": 9419 + }, + { + "epoch": 1.3336164790826077, + "grad_norm": 9.253639123941067, + "learning_rate": 1.3194550068025697e-06, + "loss": 1.1228, + "step": 9420 + }, + { + "epoch": 1.333758051957245, + "grad_norm": 9.277704762712732, + "learning_rate": 1.3189497727762535e-06, + "loss": 1.0576, + "step": 9421 + }, + { + "epoch": 1.3338996248318822, + "grad_norm": 8.765102323292806, + "learning_rate": 1.318444600836868e-06, + "loss": 1.018, + "step": 9422 + }, + { + "epoch": 1.3340411977065194, + "grad_norm": 9.03244570304289, + "learning_rate": 1.3179394910109683e-06, + "loss": 0.8907, + "step": 9423 + }, + { + "epoch": 1.3341827705811566, + "grad_norm": 8.752172581841823, + "learning_rate": 1.3174344433251086e-06, + "loss": 1.0984, + "step": 9424 + }, + { + "epoch": 1.3343243434557939, + "grad_norm": 10.83891817557439, + "learning_rate": 1.3169294578058378e-06, + "loss": 1.0141, 
+ "step": 9425 + }, + { + "epoch": 1.334465916330431, + "grad_norm": 10.005884727849493, + "learning_rate": 1.3164245344797045e-06, + "loss": 1.0761, + "step": 9426 + }, + { + "epoch": 1.3346074892050683, + "grad_norm": 8.242028286376934, + "learning_rate": 1.3159196733732494e-06, + "loss": 0.9642, + "step": 9427 + }, + { + "epoch": 1.3347490620797056, + "grad_norm": 10.58411106245687, + "learning_rate": 1.3154148745130151e-06, + "loss": 0.9854, + "step": 9428 + }, + { + "epoch": 1.3348906349543428, + "grad_norm": 8.766264268272513, + "learning_rate": 1.314910137925537e-06, + "loss": 1.1182, + "step": 9429 + }, + { + "epoch": 1.33503220782898, + "grad_norm": 10.438453493709307, + "learning_rate": 1.3144054636373505e-06, + "loss": 1.0181, + "step": 9430 + }, + { + "epoch": 1.3351737807036173, + "grad_norm": 9.054239652023218, + "learning_rate": 1.313900851674984e-06, + "loss": 0.9111, + "step": 9431 + }, + { + "epoch": 1.3353153535782545, + "grad_norm": 9.237233605369942, + "learning_rate": 1.3133963020649648e-06, + "loss": 0.9884, + "step": 9432 + }, + { + "epoch": 1.3354569264528915, + "grad_norm": 9.368091530083149, + "learning_rate": 1.3128918148338183e-06, + "loss": 0.9712, + "step": 9433 + }, + { + "epoch": 1.3355984993275287, + "grad_norm": 7.323106425952724, + "learning_rate": 1.312387390008063e-06, + "loss": 0.9256, + "step": 9434 + }, + { + "epoch": 1.335740072202166, + "grad_norm": 9.439387050707605, + "learning_rate": 1.3118830276142169e-06, + "loss": 0.9979, + "step": 9435 + }, + { + "epoch": 1.3358816450768032, + "grad_norm": 8.384746744315454, + "learning_rate": 1.3113787276787951e-06, + "loss": 0.9268, + "step": 9436 + }, + { + "epoch": 1.3360232179514404, + "grad_norm": 9.42649310601944, + "learning_rate": 1.3108744902283065e-06, + "loss": 1.0394, + "step": 9437 + }, + { + "epoch": 1.3361647908260776, + "grad_norm": 9.108342264304243, + "learning_rate": 1.31037031528926e-06, + "loss": 0.9926, + "step": 9438 + }, + { + "epoch": 1.3363063637007149, + "grad_norm": 8.405221188350588, + "learning_rate": 1.309866202888158e-06, + "loss": 0.9916, + "step": 9439 + }, + { + "epoch": 1.336447936575352, + "grad_norm": 9.704074782420657, + "learning_rate": 1.3093621530515038e-06, + "loss": 1.1328, + "step": 9440 + }, + { + "epoch": 1.3365895094499893, + "grad_norm": 8.086962589762775, + "learning_rate": 1.308858165805793e-06, + "loss": 0.938, + "step": 9441 + }, + { + "epoch": 1.3367310823246266, + "grad_norm": 9.323103839266937, + "learning_rate": 1.3083542411775196e-06, + "loss": 1.1214, + "step": 9442 + }, + { + "epoch": 1.3368726551992638, + "grad_norm": 10.022230615304542, + "learning_rate": 1.307850379193176e-06, + "loss": 0.9719, + "step": 9443 + }, + { + "epoch": 1.337014228073901, + "grad_norm": 8.893381774099367, + "learning_rate": 1.3073465798792482e-06, + "loss": 1.0309, + "step": 9444 + }, + { + "epoch": 1.3371558009485383, + "grad_norm": 9.147037180670598, + "learning_rate": 1.3068428432622221e-06, + "loss": 1.0852, + "step": 9445 + }, + { + "epoch": 1.3372973738231755, + "grad_norm": 9.189057302514046, + "learning_rate": 1.3063391693685773e-06, + "loss": 1.1068, + "step": 9446 + }, + { + "epoch": 1.3374389466978127, + "grad_norm": 8.524202335213552, + "learning_rate": 1.3058355582247933e-06, + "loss": 0.9943, + "step": 9447 + }, + { + "epoch": 1.33758051957245, + "grad_norm": 8.967926093879955, + "learning_rate": 1.3053320098573428e-06, + "loss": 0.9873, + "step": 9448 + }, + { + "epoch": 1.3377220924470872, + "grad_norm": 8.441336204735997, + "learning_rate": 
1.3048285242926983e-06, + "loss": 0.9806, + "step": 9449 + }, + { + "epoch": 1.3378636653217244, + "grad_norm": 8.723296917946353, + "learning_rate": 1.3043251015573266e-06, + "loss": 1.0361, + "step": 9450 + }, + { + "epoch": 1.3380052381963616, + "grad_norm": 9.5665274721801, + "learning_rate": 1.3038217416776936e-06, + "loss": 1.0757, + "step": 9451 + }, + { + "epoch": 1.3381468110709989, + "grad_norm": 8.923215748645497, + "learning_rate": 1.3033184446802596e-06, + "loss": 0.8886, + "step": 9452 + }, + { + "epoch": 1.338288383945636, + "grad_norm": 9.780112690968693, + "learning_rate": 1.3028152105914818e-06, + "loss": 1.0142, + "step": 9453 + }, + { + "epoch": 1.3384299568202733, + "grad_norm": 10.318174338500368, + "learning_rate": 1.3023120394378167e-06, + "loss": 0.9736, + "step": 9454 + }, + { + "epoch": 1.3385715296949106, + "grad_norm": 11.090079718562103, + "learning_rate": 1.3018089312457137e-06, + "loss": 1.0266, + "step": 9455 + }, + { + "epoch": 1.3387131025695478, + "grad_norm": 7.6906635009099515, + "learning_rate": 1.3013058860416229e-06, + "loss": 0.8916, + "step": 9456 + }, + { + "epoch": 1.338854675444185, + "grad_norm": 9.354037028365205, + "learning_rate": 1.3008029038519866e-06, + "loss": 0.9122, + "step": 9457 + }, + { + "epoch": 1.3389962483188222, + "grad_norm": 8.990277018623065, + "learning_rate": 1.3002999847032476e-06, + "loss": 0.9772, + "step": 9458 + }, + { + "epoch": 1.3391378211934593, + "grad_norm": 7.470868564344636, + "learning_rate": 1.2997971286218448e-06, + "loss": 0.9139, + "step": 9459 + }, + { + "epoch": 1.3392793940680965, + "grad_norm": 11.42635081379536, + "learning_rate": 1.2992943356342111e-06, + "loss": 1.0055, + "step": 9460 + }, + { + "epoch": 1.3394209669427337, + "grad_norm": 8.207432369867998, + "learning_rate": 1.2987916057667799e-06, + "loss": 0.9767, + "step": 9461 + }, + { + "epoch": 1.339562539817371, + "grad_norm": 9.130757644820994, + "learning_rate": 1.2982889390459781e-06, + "loss": 0.9929, + "step": 9462 + }, + { + "epoch": 1.3397041126920082, + "grad_norm": 9.835356897453847, + "learning_rate": 1.297786335498231e-06, + "loss": 1.0049, + "step": 9463 + }, + { + "epoch": 1.3398456855666454, + "grad_norm": 7.288576819296154, + "learning_rate": 1.297283795149959e-06, + "loss": 0.9063, + "step": 9464 + }, + { + "epoch": 1.3399872584412826, + "grad_norm": 9.843710084864512, + "learning_rate": 1.2967813180275809e-06, + "loss": 0.8993, + "step": 9465 + }, + { + "epoch": 1.3401288313159199, + "grad_norm": 8.785986875055775, + "learning_rate": 1.2962789041575127e-06, + "loss": 1.0074, + "step": 9466 + }, + { + "epoch": 1.340270404190557, + "grad_norm": 8.638432754080878, + "learning_rate": 1.2957765535661644e-06, + "loss": 0.9533, + "step": 9467 + }, + { + "epoch": 1.3404119770651943, + "grad_norm": 9.691602269027326, + "learning_rate": 1.295274266279945e-06, + "loss": 1.0652, + "step": 9468 + }, + { + "epoch": 1.3405535499398316, + "grad_norm": 10.276591721107916, + "learning_rate": 1.2947720423252586e-06, + "loss": 1.0174, + "step": 9469 + }, + { + "epoch": 1.3406951228144688, + "grad_norm": 8.895270181619212, + "learning_rate": 1.2942698817285082e-06, + "loss": 1.0347, + "step": 9470 + }, + { + "epoch": 1.340836695689106, + "grad_norm": 11.156279748187755, + "learning_rate": 1.29376778451609e-06, + "loss": 0.9454, + "step": 9471 + }, + { + "epoch": 1.3409782685637432, + "grad_norm": 9.58115417472364, + "learning_rate": 1.2932657507144014e-06, + "loss": 1.0018, + "step": 9472 + }, + { + "epoch": 1.3411198414383805, + "grad_norm": 
8.603947695946893, + "learning_rate": 1.2927637803498323e-06, + "loss": 0.9447, + "step": 9473 + }, + { + "epoch": 1.3412614143130177, + "grad_norm": 8.72061848188206, + "learning_rate": 1.2922618734487697e-06, + "loss": 0.9232, + "step": 9474 + }, + { + "epoch": 1.3414029871876547, + "grad_norm": 14.20815303087731, + "learning_rate": 1.2917600300376012e-06, + "loss": 1.0186, + "step": 9475 + }, + { + "epoch": 1.341544560062292, + "grad_norm": 10.973110625493412, + "learning_rate": 1.2912582501427062e-06, + "loss": 1.0393, + "step": 9476 + }, + { + "epoch": 1.3416861329369292, + "grad_norm": 10.376254453999316, + "learning_rate": 1.2907565337904642e-06, + "loss": 1.0023, + "step": 9477 + }, + { + "epoch": 1.3418277058115664, + "grad_norm": 8.532938695762704, + "learning_rate": 1.290254881007249e-06, + "loss": 1.0281, + "step": 9478 + }, + { + "epoch": 1.3419692786862036, + "grad_norm": 9.83745303306371, + "learning_rate": 1.2897532918194336e-06, + "loss": 1.0012, + "step": 9479 + }, + { + "epoch": 1.3421108515608409, + "grad_norm": 9.452290042542064, + "learning_rate": 1.2892517662533844e-06, + "loss": 1.0051, + "step": 9480 + }, + { + "epoch": 1.342252424435478, + "grad_norm": 12.624333430152424, + "learning_rate": 1.2887503043354668e-06, + "loss": 1.0808, + "step": 9481 + }, + { + "epoch": 1.3423939973101153, + "grad_norm": 8.884780867004402, + "learning_rate": 1.2882489060920436e-06, + "loss": 0.937, + "step": 9482 + }, + { + "epoch": 1.3425355701847526, + "grad_norm": 10.775794376555298, + "learning_rate": 1.287747571549472e-06, + "loss": 1.0334, + "step": 9483 + }, + { + "epoch": 1.3426771430593898, + "grad_norm": 10.045871809120074, + "learning_rate": 1.2872463007341065e-06, + "loss": 0.9463, + "step": 9484 + }, + { + "epoch": 1.342818715934027, + "grad_norm": 8.30379127410641, + "learning_rate": 1.286745093672298e-06, + "loss": 1.066, + "step": 9485 + }, + { + "epoch": 1.3429602888086642, + "grad_norm": 8.489353019227, + "learning_rate": 1.2862439503903958e-06, + "loss": 0.9741, + "step": 9486 + }, + { + "epoch": 1.3431018616833015, + "grad_norm": 9.617685288066612, + "learning_rate": 1.2857428709147434e-06, + "loss": 0.8693, + "step": 9487 + }, + { + "epoch": 1.3432434345579387, + "grad_norm": 8.86918471106363, + "learning_rate": 1.285241855271683e-06, + "loss": 0.9342, + "step": 9488 + }, + { + "epoch": 1.343385007432576, + "grad_norm": 9.604067147512339, + "learning_rate": 1.2847409034875536e-06, + "loss": 1.0445, + "step": 9489 + }, + { + "epoch": 1.3435265803072132, + "grad_norm": 8.732534798182304, + "learning_rate": 1.2842400155886876e-06, + "loss": 0.9979, + "step": 9490 + }, + { + "epoch": 1.3436681531818504, + "grad_norm": 9.689647805630914, + "learning_rate": 1.2837391916014182e-06, + "loss": 1.0376, + "step": 9491 + }, + { + "epoch": 1.3438097260564876, + "grad_norm": 8.384371851558997, + "learning_rate": 1.2832384315520717e-06, + "loss": 1.033, + "step": 9492 + }, + { + "epoch": 1.3439512989311249, + "grad_norm": 8.84787225522748, + "learning_rate": 1.2827377354669752e-06, + "loss": 1.066, + "step": 9493 + }, + { + "epoch": 1.344092871805762, + "grad_norm": 8.657782573885745, + "learning_rate": 1.2822371033724478e-06, + "loss": 1.0269, + "step": 9494 + }, + { + "epoch": 1.3442344446803993, + "grad_norm": 10.392064779242697, + "learning_rate": 1.2817365352948069e-06, + "loss": 0.9845, + "step": 9495 + }, + { + "epoch": 1.3443760175550366, + "grad_norm": 9.625305963582568, + "learning_rate": 1.2812360312603689e-06, + "loss": 0.9159, + "step": 9496 + }, + { + "epoch": 
1.3445175904296738, + "grad_norm": 8.6930813870264, + "learning_rate": 1.2807355912954433e-06, + "loss": 0.9879, + "step": 9497 + }, + { + "epoch": 1.344659163304311, + "grad_norm": 9.742903645615813, + "learning_rate": 1.2802352154263392e-06, + "loss": 0.9942, + "step": 9498 + }, + { + "epoch": 1.3448007361789482, + "grad_norm": 8.924119443223184, + "learning_rate": 1.2797349036793595e-06, + "loss": 0.9481, + "step": 9499 + }, + { + "epoch": 1.3449423090535852, + "grad_norm": 9.979526160234888, + "learning_rate": 1.2792346560808068e-06, + "loss": 0.967, + "step": 9500 + }, + { + "epoch": 1.3450838819282225, + "grad_norm": 8.428024283469638, + "learning_rate": 1.2787344726569772e-06, + "loss": 0.8941, + "step": 9501 + }, + { + "epoch": 1.3452254548028597, + "grad_norm": 8.397973733561557, + "learning_rate": 1.2782343534341667e-06, + "loss": 0.9647, + "step": 9502 + }, + { + "epoch": 1.345367027677497, + "grad_norm": 9.570126353419278, + "learning_rate": 1.2777342984386648e-06, + "loss": 1.0227, + "step": 9503 + }, + { + "epoch": 1.3455086005521342, + "grad_norm": 9.174919805773934, + "learning_rate": 1.2772343076967596e-06, + "loss": 1.0053, + "step": 9504 + }, + { + "epoch": 1.3456501734267714, + "grad_norm": 9.248931410642504, + "learning_rate": 1.2767343812347356e-06, + "loss": 1.0498, + "step": 9505 + }, + { + "epoch": 1.3457917463014086, + "grad_norm": 9.638127477977156, + "learning_rate": 1.2762345190788722e-06, + "loss": 1.062, + "step": 9506 + }, + { + "epoch": 1.3459333191760459, + "grad_norm": 10.344623263741894, + "learning_rate": 1.2757347212554484e-06, + "loss": 0.9909, + "step": 9507 + }, + { + "epoch": 1.346074892050683, + "grad_norm": 8.10460932943794, + "learning_rate": 1.2752349877907364e-06, + "loss": 0.9645, + "step": 9508 + }, + { + "epoch": 1.3462164649253203, + "grad_norm": 7.937997156628527, + "learning_rate": 1.274735318711009e-06, + "loss": 0.9582, + "step": 9509 + }, + { + "epoch": 1.3463580377999576, + "grad_norm": 7.891002321191944, + "learning_rate": 1.274235714042531e-06, + "loss": 0.9381, + "step": 9510 + }, + { + "epoch": 1.3464996106745948, + "grad_norm": 8.265307472634488, + "learning_rate": 1.2737361738115681e-06, + "loss": 0.9583, + "step": 9511 + }, + { + "epoch": 1.346641183549232, + "grad_norm": 9.666125929167054, + "learning_rate": 1.2732366980443808e-06, + "loss": 1.1085, + "step": 9512 + }, + { + "epoch": 1.3467827564238692, + "grad_norm": 8.023998504608107, + "learning_rate": 1.2727372867672247e-06, + "loss": 0.9111, + "step": 9513 + }, + { + "epoch": 1.3469243292985065, + "grad_norm": 9.49374545149352, + "learning_rate": 1.2722379400063553e-06, + "loss": 1.0045, + "step": 9514 + }, + { + "epoch": 1.3470659021731437, + "grad_norm": 9.993852633199149, + "learning_rate": 1.271738657788022e-06, + "loss": 1.0083, + "step": 9515 + }, + { + "epoch": 1.3472074750477807, + "grad_norm": 8.854969039164978, + "learning_rate": 1.2712394401384703e-06, + "loss": 1.0628, + "step": 9516 + }, + { + "epoch": 1.347349047922418, + "grad_norm": 8.861117398516564, + "learning_rate": 1.2707402870839464e-06, + "loss": 1.0443, + "step": 9517 + }, + { + "epoch": 1.3474906207970552, + "grad_norm": 9.794592194414365, + "learning_rate": 1.270241198650688e-06, + "loss": 1.1491, + "step": 9518 + }, + { + "epoch": 1.3476321936716924, + "grad_norm": 9.503009921640658, + "learning_rate": 1.269742174864934e-06, + "loss": 1.0519, + "step": 9519 + }, + { + "epoch": 1.3477737665463296, + "grad_norm": 9.350716025939079, + "learning_rate": 1.2692432157529153e-06, + "loss": 0.9837, + 
"step": 9520 + }, + { + "epoch": 1.3479153394209669, + "grad_norm": 8.134345680562076, + "learning_rate": 1.268744321340864e-06, + "loss": 0.939, + "step": 9521 + }, + { + "epoch": 1.348056912295604, + "grad_norm": 9.102261155454167, + "learning_rate": 1.2682454916550046e-06, + "loss": 1.0031, + "step": 9522 + }, + { + "epoch": 1.3481984851702413, + "grad_norm": 9.79039475857887, + "learning_rate": 1.2677467267215626e-06, + "loss": 0.9879, + "step": 9523 + }, + { + "epoch": 1.3483400580448786, + "grad_norm": 9.953588741404168, + "learning_rate": 1.2672480265667553e-06, + "loss": 0.9561, + "step": 9524 + }, + { + "epoch": 1.3484816309195158, + "grad_norm": 9.155759205862843, + "learning_rate": 1.2667493912168008e-06, + "loss": 1.1067, + "step": 9525 + }, + { + "epoch": 1.348623203794153, + "grad_norm": 9.518413166541174, + "learning_rate": 1.2662508206979113e-06, + "loss": 0.9234, + "step": 9526 + }, + { + "epoch": 1.3487647766687902, + "grad_norm": 8.92670135096259, + "learning_rate": 1.2657523150362955e-06, + "loss": 1.0793, + "step": 9527 + }, + { + "epoch": 1.3489063495434275, + "grad_norm": 8.618548648023044, + "learning_rate": 1.265253874258161e-06, + "loss": 0.9885, + "step": 9528 + }, + { + "epoch": 1.3490479224180647, + "grad_norm": 9.523669821304935, + "learning_rate": 1.2647554983897087e-06, + "loss": 0.9443, + "step": 9529 + }, + { + "epoch": 1.349189495292702, + "grad_norm": 10.011396018135212, + "learning_rate": 1.2642571874571396e-06, + "loss": 1.064, + "step": 9530 + }, + { + "epoch": 1.3493310681673392, + "grad_norm": 9.341578202476725, + "learning_rate": 1.2637589414866483e-06, + "loss": 0.9889, + "step": 9531 + }, + { + "epoch": 1.3494726410419764, + "grad_norm": 8.332903380428613, + "learning_rate": 1.2632607605044272e-06, + "loss": 0.9862, + "step": 9532 + }, + { + "epoch": 1.3496142139166136, + "grad_norm": 7.259294274448248, + "learning_rate": 1.262762644536667e-06, + "loss": 0.8609, + "step": 9533 + }, + { + "epoch": 1.3497557867912509, + "grad_norm": 9.97834760666678, + "learning_rate": 1.262264593609551e-06, + "loss": 0.8963, + "step": 9534 + }, + { + "epoch": 1.349897359665888, + "grad_norm": 9.663860001288912, + "learning_rate": 1.2617666077492636e-06, + "loss": 0.8603, + "step": 9535 + }, + { + "epoch": 1.3500389325405253, + "grad_norm": 7.752447172622761, + "learning_rate": 1.2612686869819818e-06, + "loss": 0.9752, + "step": 9536 + }, + { + "epoch": 1.3501805054151625, + "grad_norm": 7.515237207380957, + "learning_rate": 1.2607708313338818e-06, + "loss": 0.9382, + "step": 9537 + }, + { + "epoch": 1.3503220782897998, + "grad_norm": 7.600907331815439, + "learning_rate": 1.2602730408311342e-06, + "loss": 1.0571, + "step": 9538 + }, + { + "epoch": 1.350463651164437, + "grad_norm": 8.152781249490923, + "learning_rate": 1.2597753154999088e-06, + "loss": 0.9751, + "step": 9539 + }, + { + "epoch": 1.3506052240390742, + "grad_norm": 8.64669768520763, + "learning_rate": 1.259277655366371e-06, + "loss": 1.0024, + "step": 9540 + }, + { + "epoch": 1.3507467969137115, + "grad_norm": 9.515286243841858, + "learning_rate": 1.2587800604566808e-06, + "loss": 0.9883, + "step": 9541 + }, + { + "epoch": 1.3508883697883485, + "grad_norm": 10.112799935437252, + "learning_rate": 1.2582825307969981e-06, + "loss": 1.0535, + "step": 9542 + }, + { + "epoch": 1.3510299426629857, + "grad_norm": 9.132760288425391, + "learning_rate": 1.257785066413476e-06, + "loss": 0.9531, + "step": 9543 + }, + { + "epoch": 1.351171515537623, + "grad_norm": 9.810043124571727, + "learning_rate": 
1.2572876673322676e-06, + "loss": 1.0128, + "step": 9544 + }, + { + "epoch": 1.3513130884122602, + "grad_norm": 8.559986051744167, + "learning_rate": 1.2567903335795191e-06, + "loss": 0.9778, + "step": 9545 + }, + { + "epoch": 1.3514546612868974, + "grad_norm": 8.425263298289723, + "learning_rate": 1.2562930651813772e-06, + "loss": 1.0095, + "step": 9546 + }, + { + "epoch": 1.3515962341615346, + "grad_norm": 7.041278384486854, + "learning_rate": 1.255795862163981e-06, + "loss": 0.9359, + "step": 9547 + }, + { + "epoch": 1.3517378070361719, + "grad_norm": 8.500342418281004, + "learning_rate": 1.2552987245534675e-06, + "loss": 0.9564, + "step": 9548 + }, + { + "epoch": 1.351879379910809, + "grad_norm": 10.954018984892564, + "learning_rate": 1.2548016523759733e-06, + "loss": 1.1077, + "step": 9549 + }, + { + "epoch": 1.3520209527854463, + "grad_norm": 9.210803729016956, + "learning_rate": 1.2543046456576267e-06, + "loss": 0.9975, + "step": 9550 + }, + { + "epoch": 1.3521625256600835, + "grad_norm": 11.844928129275218, + "learning_rate": 1.253807704424557e-06, + "loss": 1.0775, + "step": 9551 + }, + { + "epoch": 1.3523040985347208, + "grad_norm": 8.5650198910956, + "learning_rate": 1.2533108287028862e-06, + "loss": 0.9736, + "step": 9552 + }, + { + "epoch": 1.352445671409358, + "grad_norm": 10.914638495895947, + "learning_rate": 1.2528140185187362e-06, + "loss": 1.0348, + "step": 9553 + }, + { + "epoch": 1.3525872442839952, + "grad_norm": 8.559936585235999, + "learning_rate": 1.2523172738982225e-06, + "loss": 0.9718, + "step": 9554 + }, + { + "epoch": 1.3527288171586325, + "grad_norm": 11.018461514232602, + "learning_rate": 1.2518205948674593e-06, + "loss": 1.0722, + "step": 9555 + }, + { + "epoch": 1.3528703900332697, + "grad_norm": 12.489548242735491, + "learning_rate": 1.2513239814525583e-06, + "loss": 1.0353, + "step": 9556 + }, + { + "epoch": 1.3530119629079067, + "grad_norm": 7.4509307382344385, + "learning_rate": 1.250827433679624e-06, + "loss": 1.0113, + "step": 9557 + }, + { + "epoch": 1.353153535782544, + "grad_norm": 8.811009923613947, + "learning_rate": 1.2503309515747602e-06, + "loss": 0.9508, + "step": 9558 + }, + { + "epoch": 1.3532951086571812, + "grad_norm": 9.593952785672386, + "learning_rate": 1.2498345351640655e-06, + "loss": 0.9177, + "step": 9559 + }, + { + "epoch": 1.3534366815318184, + "grad_norm": 10.88590088760529, + "learning_rate": 1.2493381844736382e-06, + "loss": 1.0201, + "step": 9560 + }, + { + "epoch": 1.3535782544064556, + "grad_norm": 8.765791675526545, + "learning_rate": 1.2488418995295689e-06, + "loss": 0.9386, + "step": 9561 + }, + { + "epoch": 1.3537198272810929, + "grad_norm": 9.325786354924464, + "learning_rate": 1.2483456803579484e-06, + "loss": 1.04, + "step": 9562 + }, + { + "epoch": 1.35386140015573, + "grad_norm": 8.01180969216425, + "learning_rate": 1.2478495269848626e-06, + "loss": 0.9701, + "step": 9563 + }, + { + "epoch": 1.3540029730303673, + "grad_norm": 9.28723198361495, + "learning_rate": 1.247353439436393e-06, + "loss": 1.0943, + "step": 9564 + }, + { + "epoch": 1.3541445459050045, + "grad_norm": 7.534624918491532, + "learning_rate": 1.2468574177386198e-06, + "loss": 0.9001, + "step": 9565 + }, + { + "epoch": 1.3542861187796418, + "grad_norm": 8.662442082463308, + "learning_rate": 1.2463614619176167e-06, + "loss": 0.9839, + "step": 9566 + }, + { + "epoch": 1.354427691654279, + "grad_norm": 9.541936967733093, + "learning_rate": 1.2458655719994582e-06, + "loss": 1.0245, + "step": 9567 + }, + { + "epoch": 1.3545692645289162, + "grad_norm": 
8.377026668553949, + "learning_rate": 1.2453697480102111e-06, + "loss": 1.0731, + "step": 9568 + }, + { + "epoch": 1.3547108374035535, + "grad_norm": 9.867591879827415, + "learning_rate": 1.2448739899759398e-06, + "loss": 1.065, + "step": 9569 + }, + { + "epoch": 1.3548524102781907, + "grad_norm": 7.634324173010554, + "learning_rate": 1.2443782979227084e-06, + "loss": 1.0125, + "step": 9570 + }, + { + "epoch": 1.354993983152828, + "grad_norm": 8.159361030394255, + "learning_rate": 1.2438826718765724e-06, + "loss": 0.9996, + "step": 9571 + }, + { + "epoch": 1.3551355560274652, + "grad_norm": 8.10986844070163, + "learning_rate": 1.2433871118635888e-06, + "loss": 0.8906, + "step": 9572 + }, + { + "epoch": 1.3552771289021024, + "grad_norm": 10.653616750461913, + "learning_rate": 1.2428916179098065e-06, + "loss": 1.0464, + "step": 9573 + }, + { + "epoch": 1.3554187017767396, + "grad_norm": 9.540574012617604, + "learning_rate": 1.2423961900412756e-06, + "loss": 1.0449, + "step": 9574 + }, + { + "epoch": 1.3555602746513769, + "grad_norm": 8.23614077663672, + "learning_rate": 1.2419008282840387e-06, + "loss": 0.9628, + "step": 9575 + }, + { + "epoch": 1.355701847526014, + "grad_norm": 9.248226098778138, + "learning_rate": 1.2414055326641378e-06, + "loss": 1.0261, + "step": 9576 + }, + { + "epoch": 1.3558434204006513, + "grad_norm": 7.94722458478897, + "learning_rate": 1.2409103032076087e-06, + "loss": 1.025, + "step": 9577 + }, + { + "epoch": 1.3559849932752885, + "grad_norm": 10.081365303945768, + "learning_rate": 1.2404151399404859e-06, + "loss": 1.0034, + "step": 9578 + }, + { + "epoch": 1.3561265661499258, + "grad_norm": 8.81529894810344, + "learning_rate": 1.2399200428888023e-06, + "loss": 0.949, + "step": 9579 + }, + { + "epoch": 1.356268139024563, + "grad_norm": 8.313388404123344, + "learning_rate": 1.2394250120785806e-06, + "loss": 0.9819, + "step": 9580 + }, + { + "epoch": 1.3564097118992002, + "grad_norm": 8.72720900064379, + "learning_rate": 1.2389300475358468e-06, + "loss": 1.0425, + "step": 9581 + }, + { + "epoch": 1.3565512847738375, + "grad_norm": 8.885057365305281, + "learning_rate": 1.2384351492866192e-06, + "loss": 0.9193, + "step": 9582 + }, + { + "epoch": 1.3566928576484745, + "grad_norm": 9.695965224678954, + "learning_rate": 1.237940317356916e-06, + "loss": 1.0027, + "step": 9583 + }, + { + "epoch": 1.3568344305231117, + "grad_norm": 8.953456479738444, + "learning_rate": 1.2374455517727485e-06, + "loss": 0.9521, + "step": 9584 + }, + { + "epoch": 1.356976003397749, + "grad_norm": 8.54277227426102, + "learning_rate": 1.236950852560127e-06, + "loss": 0.8983, + "step": 9585 + }, + { + "epoch": 1.3571175762723862, + "grad_norm": 12.629437865782627, + "learning_rate": 1.2364562197450583e-06, + "loss": 1.0679, + "step": 9586 + }, + { + "epoch": 1.3572591491470234, + "grad_norm": 10.743101878697521, + "learning_rate": 1.235961653353543e-06, + "loss": 1.1284, + "step": 9587 + }, + { + "epoch": 1.3574007220216606, + "grad_norm": 9.918924112119832, + "learning_rate": 1.235467153411582e-06, + "loss": 0.9153, + "step": 9588 + }, + { + "epoch": 1.3575422948962979, + "grad_norm": 9.8967316443971, + "learning_rate": 1.2349727199451696e-06, + "loss": 0.9983, + "step": 9589 + }, + { + "epoch": 1.357683867770935, + "grad_norm": 7.551804474691768, + "learning_rate": 1.2344783529802975e-06, + "loss": 1.0082, + "step": 9590 + }, + { + "epoch": 1.3578254406455723, + "grad_norm": 9.540479650055598, + "learning_rate": 1.2339840525429559e-06, + "loss": 1.1455, + "step": 9591 + }, + { + "epoch": 
1.3579670135202095, + "grad_norm": 8.77949131632565, + "learning_rate": 1.2334898186591274e-06, + "loss": 0.915, + "step": 9592 + }, + { + "epoch": 1.3581085863948468, + "grad_norm": 10.535642528520485, + "learning_rate": 1.2329956513547957e-06, + "loss": 0.904, + "step": 9593 + }, + { + "epoch": 1.358250159269484, + "grad_norm": 10.098988216068046, + "learning_rate": 1.232501550655937e-06, + "loss": 0.8354, + "step": 9594 + }, + { + "epoch": 1.3583917321441212, + "grad_norm": 7.761559510193389, + "learning_rate": 1.2320075165885278e-06, + "loss": 0.9405, + "step": 9595 + }, + { + "epoch": 1.3585333050187585, + "grad_norm": 11.872388050533933, + "learning_rate": 1.2315135491785369e-06, + "loss": 1.0809, + "step": 9596 + }, + { + "epoch": 1.3586748778933957, + "grad_norm": 9.522017813582766, + "learning_rate": 1.2310196484519339e-06, + "loss": 1.0603, + "step": 9597 + }, + { + "epoch": 1.358816450768033, + "grad_norm": 9.913695031729032, + "learning_rate": 1.2305258144346807e-06, + "loss": 0.943, + "step": 9598 + }, + { + "epoch": 1.35895802364267, + "grad_norm": 8.870266793936104, + "learning_rate": 1.23003204715274e-06, + "loss": 1.0191, + "step": 9599 + }, + { + "epoch": 1.3590995965173072, + "grad_norm": 8.412297339074273, + "learning_rate": 1.2295383466320677e-06, + "loss": 1.0343, + "step": 9600 + }, + { + "epoch": 1.3592411693919444, + "grad_norm": 9.55969476380134, + "learning_rate": 1.229044712898616e-06, + "loss": 1.0051, + "step": 9601 + }, + { + "epoch": 1.3593827422665816, + "grad_norm": 9.076386014326115, + "learning_rate": 1.2285511459783373e-06, + "loss": 1.0018, + "step": 9602 + }, + { + "epoch": 1.3595243151412189, + "grad_norm": 10.530280637415812, + "learning_rate": 1.2280576458971757e-06, + "loss": 0.9578, + "step": 9603 + }, + { + "epoch": 1.359665888015856, + "grad_norm": 10.626367099548887, + "learning_rate": 1.2275642126810764e-06, + "loss": 1.0179, + "step": 9604 + }, + { + "epoch": 1.3598074608904933, + "grad_norm": 7.715285044736404, + "learning_rate": 1.2270708463559766e-06, + "loss": 1.0612, + "step": 9605 + }, + { + "epoch": 1.3599490337651305, + "grad_norm": 11.790027374158147, + "learning_rate": 1.226577546947814e-06, + "loss": 1.0514, + "step": 9606 + }, + { + "epoch": 1.3600906066397678, + "grad_norm": 10.371101106400404, + "learning_rate": 1.2260843144825196e-06, + "loss": 1.0323, + "step": 9607 + }, + { + "epoch": 1.360232179514405, + "grad_norm": 8.446507146222716, + "learning_rate": 1.2255911489860228e-06, + "loss": 0.9512, + "step": 9608 + }, + { + "epoch": 1.3603737523890422, + "grad_norm": 8.539988531243443, + "learning_rate": 1.2250980504842503e-06, + "loss": 0.9883, + "step": 9609 + }, + { + "epoch": 1.3605153252636795, + "grad_norm": 8.174701967552137, + "learning_rate": 1.2246050190031222e-06, + "loss": 1.0503, + "step": 9610 + }, + { + "epoch": 1.3606568981383167, + "grad_norm": 8.06895767760184, + "learning_rate": 1.2241120545685575e-06, + "loss": 0.9838, + "step": 9611 + }, + { + "epoch": 1.360798471012954, + "grad_norm": 8.693459641566802, + "learning_rate": 1.2236191572064697e-06, + "loss": 0.9402, + "step": 9612 + }, + { + "epoch": 1.3609400438875912, + "grad_norm": 9.575615558807597, + "learning_rate": 1.2231263269427716e-06, + "loss": 1.0395, + "step": 9613 + }, + { + "epoch": 1.3610816167622284, + "grad_norm": 8.248159492260417, + "learning_rate": 1.2226335638033708e-06, + "loss": 0.9826, + "step": 9614 + }, + { + "epoch": 1.3612231896368656, + "grad_norm": 10.0052899673894, + "learning_rate": 1.2221408678141702e-06, + "loss": 1.0039, + 
"step": 9615 + }, + { + "epoch": 1.3613647625115028, + "grad_norm": 9.05581228858321, + "learning_rate": 1.2216482390010726e-06, + "loss": 0.9621, + "step": 9616 + }, + { + "epoch": 1.36150633538614, + "grad_norm": 9.287766349331019, + "learning_rate": 1.2211556773899728e-06, + "loss": 0.9464, + "step": 9617 + }, + { + "epoch": 1.3616479082607773, + "grad_norm": 8.360268461306562, + "learning_rate": 1.2206631830067663e-06, + "loss": 0.9416, + "step": 9618 + }, + { + "epoch": 1.3617894811354145, + "grad_norm": 8.453319949230746, + "learning_rate": 1.2201707558773416e-06, + "loss": 1.1088, + "step": 9619 + }, + { + "epoch": 1.3619310540100518, + "grad_norm": 8.564244447643025, + "learning_rate": 1.2196783960275867e-06, + "loss": 1.0565, + "step": 9620 + }, + { + "epoch": 1.362072626884689, + "grad_norm": 8.489336393232449, + "learning_rate": 1.2191861034833841e-06, + "loss": 0.9496, + "step": 9621 + }, + { + "epoch": 1.3622141997593262, + "grad_norm": 9.506762858042821, + "learning_rate": 1.218693878270612e-06, + "loss": 0.9779, + "step": 9622 + }, + { + "epoch": 1.3623557726339635, + "grad_norm": 8.391391030121893, + "learning_rate": 1.2182017204151484e-06, + "loss": 1.0053, + "step": 9623 + }, + { + "epoch": 1.3624973455086007, + "grad_norm": 8.313958291080146, + "learning_rate": 1.2177096299428634e-06, + "loss": 0.9859, + "step": 9624 + }, + { + "epoch": 1.3626389183832377, + "grad_norm": 8.858195123703776, + "learning_rate": 1.2172176068796281e-06, + "loss": 1.013, + "step": 9625 + }, + { + "epoch": 1.362780491257875, + "grad_norm": 9.755287497315157, + "learning_rate": 1.216725651251306e-06, + "loss": 1.0487, + "step": 9626 + }, + { + "epoch": 1.3629220641325122, + "grad_norm": 8.21912084402769, + "learning_rate": 1.2162337630837604e-06, + "loss": 1.0427, + "step": 9627 + }, + { + "epoch": 1.3630636370071494, + "grad_norm": 8.381917800867997, + "learning_rate": 1.2157419424028473e-06, + "loss": 0.9835, + "step": 9628 + }, + { + "epoch": 1.3632052098817866, + "grad_norm": 10.330470847434182, + "learning_rate": 1.2152501892344232e-06, + "loss": 0.9817, + "step": 9629 + }, + { + "epoch": 1.3633467827564238, + "grad_norm": 9.689132454442273, + "learning_rate": 1.2147585036043397e-06, + "loss": 0.9478, + "step": 9630 + }, + { + "epoch": 1.363488355631061, + "grad_norm": 11.455617427895227, + "learning_rate": 1.2142668855384421e-06, + "loss": 1.075, + "step": 9631 + }, + { + "epoch": 1.3636299285056983, + "grad_norm": 9.451136961665727, + "learning_rate": 1.2137753350625774e-06, + "loss": 0.9589, + "step": 9632 + }, + { + "epoch": 1.3637715013803355, + "grad_norm": 8.820140799784737, + "learning_rate": 1.2132838522025827e-06, + "loss": 0.9763, + "step": 9633 + }, + { + "epoch": 1.3639130742549728, + "grad_norm": 9.944736365136599, + "learning_rate": 1.2127924369842975e-06, + "loss": 1.0433, + "step": 9634 + }, + { + "epoch": 1.36405464712961, + "grad_norm": 9.027407135487119, + "learning_rate": 1.212301089433553e-06, + "loss": 1.0213, + "step": 9635 + }, + { + "epoch": 1.3641962200042472, + "grad_norm": 10.150102932412766, + "learning_rate": 1.21180980957618e-06, + "loss": 1.0499, + "step": 9636 + }, + { + "epoch": 1.3643377928788845, + "grad_norm": 8.349325424630825, + "learning_rate": 1.211318597438006e-06, + "loss": 1.0617, + "step": 9637 + }, + { + "epoch": 1.3644793657535217, + "grad_norm": 6.949561828412896, + "learning_rate": 1.2108274530448513e-06, + "loss": 0.9203, + "step": 9638 + }, + { + "epoch": 1.364620938628159, + "grad_norm": 8.238000176718929, + "learning_rate": 
1.210336376422537e-06, + "loss": 1.0096, + "step": 9639 + }, + { + "epoch": 1.364762511502796, + "grad_norm": 11.087781391207134, + "learning_rate": 1.2098453675968772e-06, + "loss": 0.9337, + "step": 9640 + }, + { + "epoch": 1.3649040843774332, + "grad_norm": 9.289278918184351, + "learning_rate": 1.2093544265936848e-06, + "loss": 0.9778, + "step": 9641 + }, + { + "epoch": 1.3650456572520704, + "grad_norm": 9.652871340169373, + "learning_rate": 1.2088635534387684e-06, + "loss": 1.0751, + "step": 9642 + }, + { + "epoch": 1.3651872301267076, + "grad_norm": 9.664252363358264, + "learning_rate": 1.208372748157931e-06, + "loss": 1.0104, + "step": 9643 + }, + { + "epoch": 1.3653288030013448, + "grad_norm": 9.748744785988796, + "learning_rate": 1.2078820107769762e-06, + "loss": 1.0245, + "step": 9644 + }, + { + "epoch": 1.365470375875982, + "grad_norm": 9.191264724128226, + "learning_rate": 1.2073913413216998e-06, + "loss": 1.0178, + "step": 9645 + }, + { + "epoch": 1.3656119487506193, + "grad_norm": 7.81065603238614, + "learning_rate": 1.2069007398178978e-06, + "loss": 0.9117, + "step": 9646 + }, + { + "epoch": 1.3657535216252565, + "grad_norm": 9.530141697217465, + "learning_rate": 1.2064102062913585e-06, + "loss": 0.9524, + "step": 9647 + }, + { + "epoch": 1.3658950944998938, + "grad_norm": 8.12837548198127, + "learning_rate": 1.2059197407678714e-06, + "loss": 1.0036, + "step": 9648 + }, + { + "epoch": 1.366036667374531, + "grad_norm": 8.38041398147947, + "learning_rate": 1.2054293432732172e-06, + "loss": 0.9733, + "step": 9649 + }, + { + "epoch": 1.3661782402491682, + "grad_norm": 8.546024259247597, + "learning_rate": 1.2049390138331785e-06, + "loss": 0.9741, + "step": 9650 + }, + { + "epoch": 1.3663198131238055, + "grad_norm": 9.159183800327604, + "learning_rate": 1.204448752473529e-06, + "loss": 0.946, + "step": 9651 + }, + { + "epoch": 1.3664613859984427, + "grad_norm": 8.566475272607248, + "learning_rate": 1.2039585592200428e-06, + "loss": 0.8692, + "step": 9652 + }, + { + "epoch": 1.36660295887308, + "grad_norm": 9.055903276607665, + "learning_rate": 1.2034684340984907e-06, + "loss": 1.0206, + "step": 9653 + }, + { + "epoch": 1.3667445317477172, + "grad_norm": 10.227058602605776, + "learning_rate": 1.2029783771346344e-06, + "loss": 0.9963, + "step": 9654 + }, + { + "epoch": 1.3668861046223544, + "grad_norm": 10.182769987899405, + "learning_rate": 1.2024883883542384e-06, + "loss": 1.1043, + "step": 9655 + }, + { + "epoch": 1.3670276774969916, + "grad_norm": 10.413559188184427, + "learning_rate": 1.2019984677830597e-06, + "loss": 0.9897, + "step": 9656 + }, + { + "epoch": 1.3671692503716288, + "grad_norm": 8.485006235150736, + "learning_rate": 1.2015086154468544e-06, + "loss": 1.0187, + "step": 9657 + }, + { + "epoch": 1.367310823246266, + "grad_norm": 9.477925403076828, + "learning_rate": 1.201018831371372e-06, + "loss": 0.9543, + "step": 9658 + }, + { + "epoch": 1.3674523961209033, + "grad_norm": 9.6507789890358, + "learning_rate": 1.2005291155823612e-06, + "loss": 0.99, + "step": 9659 + }, + { + "epoch": 1.3675939689955405, + "grad_norm": 8.150358096253683, + "learning_rate": 1.200039468105567e-06, + "loss": 0.9918, + "step": 9660 + }, + { + "epoch": 1.3677355418701778, + "grad_norm": 11.500409326317545, + "learning_rate": 1.1995498889667276e-06, + "loss": 1.0365, + "step": 9661 + }, + { + "epoch": 1.367877114744815, + "grad_norm": 9.691115363051896, + "learning_rate": 1.1990603781915816e-06, + "loss": 0.9404, + "step": 9662 + }, + { + "epoch": 1.3680186876194522, + "grad_norm": 
9.51303451179268, + "learning_rate": 1.1985709358058616e-06, + "loss": 0.931, + "step": 9663 + }, + { + "epoch": 1.3681602604940895, + "grad_norm": 9.224606687023217, + "learning_rate": 1.1980815618352964e-06, + "loss": 1.1028, + "step": 9664 + }, + { + "epoch": 1.3683018333687267, + "grad_norm": 8.762761428568275, + "learning_rate": 1.1975922563056136e-06, + "loss": 1.0513, + "step": 9665 + }, + { + "epoch": 1.3684434062433637, + "grad_norm": 8.052176556822763, + "learning_rate": 1.1971030192425337e-06, + "loss": 1.0296, + "step": 9666 + }, + { + "epoch": 1.368584979118001, + "grad_norm": 7.58138420571134, + "learning_rate": 1.1966138506717776e-06, + "loss": 0.8276, + "step": 9667 + }, + { + "epoch": 1.3687265519926382, + "grad_norm": 10.58930376309093, + "learning_rate": 1.1961247506190588e-06, + "loss": 0.9945, + "step": 9668 + }, + { + "epoch": 1.3688681248672754, + "grad_norm": 9.26694071339333, + "learning_rate": 1.1956357191100903e-06, + "loss": 0.9149, + "step": 9669 + }, + { + "epoch": 1.3690096977419126, + "grad_norm": 9.445864576891863, + "learning_rate": 1.1951467561705784e-06, + "loss": 1.0755, + "step": 9670 + }, + { + "epoch": 1.3691512706165498, + "grad_norm": 11.453263556460666, + "learning_rate": 1.19465786182623e-06, + "loss": 0.999, + "step": 9671 + }, + { + "epoch": 1.369292843491187, + "grad_norm": 9.69348628990812, + "learning_rate": 1.1941690361027432e-06, + "loss": 1.0317, + "step": 9672 + }, + { + "epoch": 1.3694344163658243, + "grad_norm": 8.132649268895062, + "learning_rate": 1.1936802790258176e-06, + "loss": 0.987, + "step": 9673 + }, + { + "epoch": 1.3695759892404615, + "grad_norm": 9.723799494607452, + "learning_rate": 1.1931915906211456e-06, + "loss": 0.9963, + "step": 9674 + }, + { + "epoch": 1.3697175621150988, + "grad_norm": 8.64590530059154, + "learning_rate": 1.1927029709144163e-06, + "loss": 0.974, + "step": 9675 + }, + { + "epoch": 1.369859134989736, + "grad_norm": 8.86614763657668, + "learning_rate": 1.1922144199313181e-06, + "loss": 1.0156, + "step": 9676 + }, + { + "epoch": 1.3700007078643732, + "grad_norm": 8.127739547160967, + "learning_rate": 1.1917259376975318e-06, + "loss": 0.9515, + "step": 9677 + }, + { + "epoch": 1.3701422807390105, + "grad_norm": 9.187866618827849, + "learning_rate": 1.1912375242387384e-06, + "loss": 1.0706, + "step": 9678 + }, + { + "epoch": 1.3702838536136477, + "grad_norm": 13.043665983535421, + "learning_rate": 1.1907491795806117e-06, + "loss": 1.0553, + "step": 9679 + }, + { + "epoch": 1.370425426488285, + "grad_norm": 9.772706416875993, + "learning_rate": 1.190260903748825e-06, + "loss": 0.9948, + "step": 9680 + }, + { + "epoch": 1.3705669993629221, + "grad_norm": 9.954204219355006, + "learning_rate": 1.1897726967690454e-06, + "loss": 1.0427, + "step": 9681 + }, + { + "epoch": 1.3707085722375592, + "grad_norm": 9.8477128060157, + "learning_rate": 1.189284558666938e-06, + "loss": 0.9792, + "step": 9682 + }, + { + "epoch": 1.3708501451121964, + "grad_norm": 9.711006636922031, + "learning_rate": 1.188796489468165e-06, + "loss": 0.9786, + "step": 9683 + }, + { + "epoch": 1.3709917179868336, + "grad_norm": 8.96455481900955, + "learning_rate": 1.1883084891983828e-06, + "loss": 1.0524, + "step": 9684 + }, + { + "epoch": 1.3711332908614708, + "grad_norm": 8.458049700511822, + "learning_rate": 1.1878205578832455e-06, + "loss": 0.9422, + "step": 9685 + }, + { + "epoch": 1.371274863736108, + "grad_norm": 8.684707741747578, + "learning_rate": 1.187332695548402e-06, + "loss": 0.9003, + "step": 9686 + }, + { + "epoch": 
1.3714164366107453, + "grad_norm": 8.684567622093505, + "learning_rate": 1.1868449022194997e-06, + "loss": 0.9642, + "step": 9687 + }, + { + "epoch": 1.3715580094853825, + "grad_norm": 10.497125868029677, + "learning_rate": 1.186357177922183e-06, + "loss": 1.058, + "step": 9688 + }, + { + "epoch": 1.3716995823600198, + "grad_norm": 8.273840011012542, + "learning_rate": 1.185869522682089e-06, + "loss": 1.0504, + "step": 9689 + }, + { + "epoch": 1.371841155234657, + "grad_norm": 7.59743532776491, + "learning_rate": 1.1853819365248553e-06, + "loss": 0.9593, + "step": 9690 + }, + { + "epoch": 1.3719827281092942, + "grad_norm": 9.697407436966602, + "learning_rate": 1.184894419476112e-06, + "loss": 1.1385, + "step": 9691 + }, + { + "epoch": 1.3721243009839315, + "grad_norm": 7.157203577467698, + "learning_rate": 1.1844069715614893e-06, + "loss": 0.9424, + "step": 9692 + }, + { + "epoch": 1.3722658738585687, + "grad_norm": 8.31811242185962, + "learning_rate": 1.1839195928066101e-06, + "loss": 0.9322, + "step": 9693 + }, + { + "epoch": 1.372407446733206, + "grad_norm": 7.105316814792583, + "learning_rate": 1.183432283237098e-06, + "loss": 0.9271, + "step": 9694 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 8.621172954297469, + "learning_rate": 1.1829450428785689e-06, + "loss": 1.0171, + "step": 9695 + }, + { + "epoch": 1.3726905924824804, + "grad_norm": 9.458625239994662, + "learning_rate": 1.1824578717566358e-06, + "loss": 1.0915, + "step": 9696 + }, + { + "epoch": 1.3728321653571176, + "grad_norm": 9.587562578735634, + "learning_rate": 1.181970769896911e-06, + "loss": 0.9648, + "step": 9697 + }, + { + "epoch": 1.3729737382317548, + "grad_norm": 9.211468837383636, + "learning_rate": 1.1814837373249991e-06, + "loss": 0.968, + "step": 9698 + }, + { + "epoch": 1.373115311106392, + "grad_norm": 8.697316284169215, + "learning_rate": 1.180996774066505e-06, + "loss": 0.9122, + "step": 9699 + }, + { + "epoch": 1.3732568839810293, + "grad_norm": 8.937654934053722, + "learning_rate": 1.1805098801470259e-06, + "loss": 0.8791, + "step": 9700 + }, + { + "epoch": 1.3733984568556665, + "grad_norm": 10.495242176215008, + "learning_rate": 1.1800230555921597e-06, + "loss": 1.0434, + "step": 9701 + }, + { + "epoch": 1.3735400297303038, + "grad_norm": 7.978813488931929, + "learning_rate": 1.179536300427496e-06, + "loss": 1.0724, + "step": 9702 + }, + { + "epoch": 1.373681602604941, + "grad_norm": 9.661300967735151, + "learning_rate": 1.1790496146786257e-06, + "loss": 1.0061, + "step": 9703 + }, + { + "epoch": 1.3738231754795782, + "grad_norm": 10.29995167544982, + "learning_rate": 1.1785629983711311e-06, + "loss": 0.9946, + "step": 9704 + }, + { + "epoch": 1.3739647483542154, + "grad_norm": 9.034517007488212, + "learning_rate": 1.1780764515305942e-06, + "loss": 0.9804, + "step": 9705 + }, + { + "epoch": 1.3741063212288527, + "grad_norm": 8.330870556900992, + "learning_rate": 1.1775899741825947e-06, + "loss": 0.9625, + "step": 9706 + }, + { + "epoch": 1.37424789410349, + "grad_norm": 8.359269129002472, + "learning_rate": 1.1771035663527021e-06, + "loss": 0.9642, + "step": 9707 + }, + { + "epoch": 1.374389466978127, + "grad_norm": 9.823667355301541, + "learning_rate": 1.17661722806649e-06, + "loss": 0.9844, + "step": 9708 + }, + { + "epoch": 1.3745310398527641, + "grad_norm": 8.830320696250665, + "learning_rate": 1.1761309593495224e-06, + "loss": 1.0695, + "step": 9709 + }, + { + "epoch": 1.3746726127274014, + "grad_norm": 10.062235005366485, + "learning_rate": 1.1756447602273629e-06, + "loss": 1.0446, + 
"step": 9710 + }, + { + "epoch": 1.3748141856020386, + "grad_norm": 10.406862533000123, + "learning_rate": 1.1751586307255719e-06, + "loss": 1.023, + "step": 9711 + }, + { + "epoch": 1.3749557584766758, + "grad_norm": 9.293684211503237, + "learning_rate": 1.174672570869703e-06, + "loss": 1.049, + "step": 9712 + }, + { + "epoch": 1.375097331351313, + "grad_norm": 7.480618675064165, + "learning_rate": 1.1741865806853097e-06, + "loss": 0.9429, + "step": 9713 + }, + { + "epoch": 1.3752389042259503, + "grad_norm": 9.074196889771343, + "learning_rate": 1.1737006601979384e-06, + "loss": 1.0665, + "step": 9714 + }, + { + "epoch": 1.3753804771005875, + "grad_norm": 9.071151497453126, + "learning_rate": 1.1732148094331353e-06, + "loss": 0.9454, + "step": 9715 + }, + { + "epoch": 1.3755220499752248, + "grad_norm": 9.610676995128257, + "learning_rate": 1.1727290284164406e-06, + "loss": 1.0145, + "step": 9716 + }, + { + "epoch": 1.375663622849862, + "grad_norm": 9.566191316108345, + "learning_rate": 1.1722433171733903e-06, + "loss": 1.0013, + "step": 9717 + }, + { + "epoch": 1.3758051957244992, + "grad_norm": 9.642164110579717, + "learning_rate": 1.1717576757295192e-06, + "loss": 0.9149, + "step": 9718 + }, + { + "epoch": 1.3759467685991364, + "grad_norm": 9.516907353427923, + "learning_rate": 1.171272104110356e-06, + "loss": 0.9135, + "step": 9719 + }, + { + "epoch": 1.3760883414737737, + "grad_norm": 9.269676519220942, + "learning_rate": 1.1707866023414288e-06, + "loss": 1.0935, + "step": 9720 + }, + { + "epoch": 1.376229914348411, + "grad_norm": 7.976430744274841, + "learning_rate": 1.1703011704482577e-06, + "loss": 0.8765, + "step": 9721 + }, + { + "epoch": 1.3763714872230481, + "grad_norm": 9.126756890879413, + "learning_rate": 1.1698158084563635e-06, + "loss": 1.069, + "step": 9722 + }, + { + "epoch": 1.3765130600976851, + "grad_norm": 10.105811881133326, + "learning_rate": 1.1693305163912597e-06, + "loss": 0.9654, + "step": 9723 + }, + { + "epoch": 1.3766546329723224, + "grad_norm": 10.367817426227843, + "learning_rate": 1.1688452942784592e-06, + "loss": 1.0585, + "step": 9724 + }, + { + "epoch": 1.3767962058469596, + "grad_norm": 8.603380612235192, + "learning_rate": 1.168360142143468e-06, + "loss": 0.9179, + "step": 9725 + }, + { + "epoch": 1.3769377787215968, + "grad_norm": 10.101173529102402, + "learning_rate": 1.1678750600117914e-06, + "loss": 1.023, + "step": 9726 + }, + { + "epoch": 1.377079351596234, + "grad_norm": 8.227128200516532, + "learning_rate": 1.1673900479089314e-06, + "loss": 1.0164, + "step": 9727 + }, + { + "epoch": 1.3772209244708713, + "grad_norm": 8.463401992036571, + "learning_rate": 1.1669051058603811e-06, + "loss": 1.0171, + "step": 9728 + }, + { + "epoch": 1.3773624973455085, + "grad_norm": 9.647844823016378, + "learning_rate": 1.1664202338916364e-06, + "loss": 1.0026, + "step": 9729 + }, + { + "epoch": 1.3775040702201458, + "grad_norm": 9.715581543028614, + "learning_rate": 1.1659354320281845e-06, + "loss": 0.9628, + "step": 9730 + }, + { + "epoch": 1.377645643094783, + "grad_norm": 9.444457464271498, + "learning_rate": 1.1654507002955135e-06, + "loss": 0.9623, + "step": 9731 + }, + { + "epoch": 1.3777872159694202, + "grad_norm": 7.778744213672799, + "learning_rate": 1.1649660387191027e-06, + "loss": 0.951, + "step": 9732 + }, + { + "epoch": 1.3779287888440575, + "grad_norm": 10.012297507535148, + "learning_rate": 1.1644814473244322e-06, + "loss": 1.0129, + "step": 9733 + }, + { + "epoch": 1.3780703617186947, + "grad_norm": 9.614506140854013, + "learning_rate": 
1.163996926136977e-06, + "loss": 0.905, + "step": 9734 + }, + { + "epoch": 1.378211934593332, + "grad_norm": 8.793638565669664, + "learning_rate": 1.1635124751822063e-06, + "loss": 0.8931, + "step": 9735 + }, + { + "epoch": 1.3783535074679691, + "grad_norm": 9.122764418061085, + "learning_rate": 1.163028094485589e-06, + "loss": 1.0382, + "step": 9736 + }, + { + "epoch": 1.3784950803426064, + "grad_norm": 8.886169711748481, + "learning_rate": 1.162543784072588e-06, + "loss": 0.9906, + "step": 9737 + }, + { + "epoch": 1.3786366532172436, + "grad_norm": 9.800161975378384, + "learning_rate": 1.1620595439686632e-06, + "loss": 0.9523, + "step": 9738 + }, + { + "epoch": 1.3787782260918808, + "grad_norm": 8.633896813973163, + "learning_rate": 1.1615753741992696e-06, + "loss": 0.9608, + "step": 9739 + }, + { + "epoch": 1.378919798966518, + "grad_norm": 9.44038801656586, + "learning_rate": 1.1610912747898607e-06, + "loss": 0.9882, + "step": 9740 + }, + { + "epoch": 1.3790613718411553, + "grad_norm": 8.292505985380904, + "learning_rate": 1.1606072457658856e-06, + "loss": 0.9647, + "step": 9741 + }, + { + "epoch": 1.3792029447157925, + "grad_norm": 9.385552735917656, + "learning_rate": 1.1601232871527884e-06, + "loss": 1.0268, + "step": 9742 + }, + { + "epoch": 1.3793445175904298, + "grad_norm": 10.065547884314384, + "learning_rate": 1.1596393989760118e-06, + "loss": 1.0146, + "step": 9743 + }, + { + "epoch": 1.379486090465067, + "grad_norm": 8.76177796179279, + "learning_rate": 1.1591555812609914e-06, + "loss": 0.8555, + "step": 9744 + }, + { + "epoch": 1.3796276633397042, + "grad_norm": 9.949131714013689, + "learning_rate": 1.1586718340331634e-06, + "loss": 1.0407, + "step": 9745 + }, + { + "epoch": 1.3797692362143414, + "grad_norm": 8.863758543368423, + "learning_rate": 1.1581881573179562e-06, + "loss": 0.9986, + "step": 9746 + }, + { + "epoch": 1.3799108090889787, + "grad_norm": 9.490218095594702, + "learning_rate": 1.1577045511407977e-06, + "loss": 1.0722, + "step": 9747 + }, + { + "epoch": 1.380052381963616, + "grad_norm": 7.9554710899903265, + "learning_rate": 1.1572210155271105e-06, + "loss": 1.1103, + "step": 9748 + }, + { + "epoch": 1.380193954838253, + "grad_norm": 8.396837147668917, + "learning_rate": 1.156737550502312e-06, + "loss": 0.802, + "step": 9749 + }, + { + "epoch": 1.3803355277128901, + "grad_norm": 9.293842648196364, + "learning_rate": 1.15625415609182e-06, + "loss": 1.0517, + "step": 9750 + }, + { + "epoch": 1.3804771005875274, + "grad_norm": 10.495911665126041, + "learning_rate": 1.155770832321044e-06, + "loss": 1.0364, + "step": 9751 + }, + { + "epoch": 1.3806186734621646, + "grad_norm": 10.972023152236904, + "learning_rate": 1.1552875792153943e-06, + "loss": 1.0104, + "step": 9752 + }, + { + "epoch": 1.3807602463368018, + "grad_norm": 9.064813568504224, + "learning_rate": 1.1548043968002725e-06, + "loss": 0.9811, + "step": 9753 + }, + { + "epoch": 1.380901819211439, + "grad_norm": 9.599470489521764, + "learning_rate": 1.1543212851010819e-06, + "loss": 1.0554, + "step": 9754 + }, + { + "epoch": 1.3810433920860763, + "grad_norm": 9.406172945334307, + "learning_rate": 1.1538382441432166e-06, + "loss": 0.9954, + "step": 9755 + }, + { + "epoch": 1.3811849649607135, + "grad_norm": 7.934399337100205, + "learning_rate": 1.1533552739520715e-06, + "loss": 0.9552, + "step": 9756 + }, + { + "epoch": 1.3813265378353508, + "grad_norm": 9.24541834180268, + "learning_rate": 1.1528723745530362e-06, + "loss": 1.0918, + "step": 9757 + }, + { + "epoch": 1.381468110709988, + "grad_norm": 
8.691115251448737, + "learning_rate": 1.1523895459714948e-06, + "loss": 1.0263, + "step": 9758 + }, + { + "epoch": 1.3816096835846252, + "grad_norm": 8.31832842027288, + "learning_rate": 1.151906788232832e-06, + "loss": 0.9928, + "step": 9759 + }, + { + "epoch": 1.3817512564592624, + "grad_norm": 10.29538485637231, + "learning_rate": 1.1514241013624225e-06, + "loss": 1.0494, + "step": 9760 + }, + { + "epoch": 1.3818928293338997, + "grad_norm": 9.329824423819394, + "learning_rate": 1.1509414853856421e-06, + "loss": 0.8418, + "step": 9761 + }, + { + "epoch": 1.382034402208537, + "grad_norm": 9.049373828448669, + "learning_rate": 1.1504589403278631e-06, + "loss": 0.9206, + "step": 9762 + }, + { + "epoch": 1.3821759750831741, + "grad_norm": 8.739762338859455, + "learning_rate": 1.1499764662144505e-06, + "loss": 0.8851, + "step": 9763 + }, + { + "epoch": 1.3823175479578114, + "grad_norm": 7.533078806580503, + "learning_rate": 1.1494940630707693e-06, + "loss": 0.9249, + "step": 9764 + }, + { + "epoch": 1.3824591208324484, + "grad_norm": 8.539247446247368, + "learning_rate": 1.1490117309221772e-06, + "loss": 0.9477, + "step": 9765 + }, + { + "epoch": 1.3826006937070856, + "grad_norm": 9.21041027368857, + "learning_rate": 1.148529469794032e-06, + "loss": 0.9871, + "step": 9766 + }, + { + "epoch": 1.3827422665817228, + "grad_norm": 9.611124316765663, + "learning_rate": 1.148047279711684e-06, + "loss": 0.9922, + "step": 9767 + }, + { + "epoch": 1.38288383945636, + "grad_norm": 8.483281017609311, + "learning_rate": 1.1475651607004834e-06, + "loss": 0.9808, + "step": 9768 + }, + { + "epoch": 1.3830254123309973, + "grad_norm": 11.95913541188512, + "learning_rate": 1.1470831127857738e-06, + "loss": 1.0263, + "step": 9769 + }, + { + "epoch": 1.3831669852056345, + "grad_norm": 8.14955723969867, + "learning_rate": 1.1466011359928951e-06, + "loss": 0.8907, + "step": 9770 + }, + { + "epoch": 1.3833085580802718, + "grad_norm": 8.61914748505562, + "learning_rate": 1.146119230347187e-06, + "loss": 0.9406, + "step": 9771 + }, + { + "epoch": 1.383450130954909, + "grad_norm": 10.442629712856059, + "learning_rate": 1.14563739587398e-06, + "loss": 0.9875, + "step": 9772 + }, + { + "epoch": 1.3835917038295462, + "grad_norm": 9.86896687922214, + "learning_rate": 1.1451556325986065e-06, + "loss": 1.1369, + "step": 9773 + }, + { + "epoch": 1.3837332767041834, + "grad_norm": 6.605504266518207, + "learning_rate": 1.14467394054639e-06, + "loss": 0.9416, + "step": 9774 + }, + { + "epoch": 1.3838748495788207, + "grad_norm": 8.930052742284364, + "learning_rate": 1.144192319742655e-06, + "loss": 0.9492, + "step": 9775 + }, + { + "epoch": 1.384016422453458, + "grad_norm": 8.041633988019203, + "learning_rate": 1.1437107702127178e-06, + "loss": 0.9405, + "step": 9776 + }, + { + "epoch": 1.3841579953280951, + "grad_norm": 10.02109515085357, + "learning_rate": 1.1432292919818952e-06, + "loss": 0.9252, + "step": 9777 + }, + { + "epoch": 1.3842995682027324, + "grad_norm": 8.335754449724018, + "learning_rate": 1.1427478850754959e-06, + "loss": 1.0027, + "step": 9778 + }, + { + "epoch": 1.3844411410773696, + "grad_norm": 9.429193137672971, + "learning_rate": 1.1422665495188284e-06, + "loss": 0.9538, + "step": 9779 + }, + { + "epoch": 1.3845827139520068, + "grad_norm": 9.328256270668678, + "learning_rate": 1.1417852853371978e-06, + "loss": 1.0289, + "step": 9780 + }, + { + "epoch": 1.384724286826644, + "grad_norm": 8.897042423839597, + "learning_rate": 1.1413040925559e-06, + "loss": 0.9004, + "step": 9781 + }, + { + "epoch": 
1.3848658597012813, + "grad_norm": 8.864002990253923, + "learning_rate": 1.1408229712002345e-06, + "loss": 0.9028, + "step": 9782 + }, + { + "epoch": 1.3850074325759185, + "grad_norm": 11.159903988442284, + "learning_rate": 1.1403419212954904e-06, + "loss": 1.0192, + "step": 9783 + }, + { + "epoch": 1.3851490054505557, + "grad_norm": 8.917373296480813, + "learning_rate": 1.1398609428669582e-06, + "loss": 0.9797, + "step": 9784 + }, + { + "epoch": 1.385290578325193, + "grad_norm": 8.544977901602039, + "learning_rate": 1.1393800359399225e-06, + "loss": 0.9699, + "step": 9785 + }, + { + "epoch": 1.3854321511998302, + "grad_norm": 10.59286630371215, + "learning_rate": 1.1388992005396632e-06, + "loss": 0.9813, + "step": 9786 + }, + { + "epoch": 1.3855737240744674, + "grad_norm": 10.649876811429925, + "learning_rate": 1.1384184366914588e-06, + "loss": 1.0246, + "step": 9787 + }, + { + "epoch": 1.3857152969491047, + "grad_norm": 10.233421502577428, + "learning_rate": 1.1379377444205814e-06, + "loss": 1.0216, + "step": 9788 + }, + { + "epoch": 1.385856869823742, + "grad_norm": 9.35213601911976, + "learning_rate": 1.1374571237523015e-06, + "loss": 0.9766, + "step": 9789 + }, + { + "epoch": 1.385998442698379, + "grad_norm": 9.97610212083811, + "learning_rate": 1.1369765747118853e-06, + "loss": 1.0166, + "step": 9790 + }, + { + "epoch": 1.3861400155730161, + "grad_norm": 9.672279263184016, + "learning_rate": 1.1364960973245927e-06, + "loss": 1.0205, + "step": 9791 + }, + { + "epoch": 1.3862815884476534, + "grad_norm": 9.677590303859503, + "learning_rate": 1.136015691615685e-06, + "loss": 0.9582, + "step": 9792 + }, + { + "epoch": 1.3864231613222906, + "grad_norm": 8.60175319810181, + "learning_rate": 1.135535357610414e-06, + "loss": 0.9753, + "step": 9793 + }, + { + "epoch": 1.3865647341969278, + "grad_norm": 8.119797669694568, + "learning_rate": 1.1350550953340334e-06, + "loss": 0.9292, + "step": 9794 + }, + { + "epoch": 1.386706307071565, + "grad_norm": 9.624366640332187, + "learning_rate": 1.1345749048117872e-06, + "loss": 1.1083, + "step": 9795 + }, + { + "epoch": 1.3868478799462023, + "grad_norm": 9.54689657705213, + "learning_rate": 1.1340947860689214e-06, + "loss": 0.9207, + "step": 9796 + }, + { + "epoch": 1.3869894528208395, + "grad_norm": 9.718768447907532, + "learning_rate": 1.133614739130673e-06, + "loss": 1.0425, + "step": 9797 + }, + { + "epoch": 1.3871310256954767, + "grad_norm": 6.998280995333671, + "learning_rate": 1.13313476402228e-06, + "loss": 0.8899, + "step": 9798 + }, + { + "epoch": 1.387272598570114, + "grad_norm": 8.60782051245316, + "learning_rate": 1.1326548607689724e-06, + "loss": 0.9122, + "step": 9799 + }, + { + "epoch": 1.3874141714447512, + "grad_norm": 9.762571006839039, + "learning_rate": 1.1321750293959802e-06, + "loss": 1.0628, + "step": 9800 + }, + { + "epoch": 1.3875557443193884, + "grad_norm": 8.006145024560627, + "learning_rate": 1.1316952699285268e-06, + "loss": 0.851, + "step": 9801 + }, + { + "epoch": 1.3876973171940257, + "grad_norm": 9.210354360251632, + "learning_rate": 1.131215582391832e-06, + "loss": 1.0349, + "step": 9802 + }, + { + "epoch": 1.387838890068663, + "grad_norm": 9.645688895564668, + "learning_rate": 1.1307359668111141e-06, + "loss": 1.0336, + "step": 9803 + }, + { + "epoch": 1.3879804629433001, + "grad_norm": 8.881161122746484, + "learning_rate": 1.1302564232115848e-06, + "loss": 0.9945, + "step": 9804 + }, + { + "epoch": 1.3881220358179374, + "grad_norm": 9.031114834628669, + "learning_rate": 1.1297769516184544e-06, + "loss": 1.0237, + 
"step": 9805 + }, + { + "epoch": 1.3882636086925744, + "grad_norm": 8.372949605438626, + "learning_rate": 1.1292975520569278e-06, + "loss": 1.0458, + "step": 9806 + }, + { + "epoch": 1.3884051815672116, + "grad_norm": 8.000864935847854, + "learning_rate": 1.1288182245522063e-06, + "loss": 0.8501, + "step": 9807 + }, + { + "epoch": 1.3885467544418488, + "grad_norm": 10.081596119775163, + "learning_rate": 1.1283389691294894e-06, + "loss": 0.9817, + "step": 9808 + }, + { + "epoch": 1.388688327316486, + "grad_norm": 8.603436479843117, + "learning_rate": 1.1278597858139692e-06, + "loss": 1.0192, + "step": 9809 + }, + { + "epoch": 1.3888299001911233, + "grad_norm": 9.729457243692268, + "learning_rate": 1.127380674630838e-06, + "loss": 1.0561, + "step": 9810 + }, + { + "epoch": 1.3889714730657605, + "grad_norm": 9.09255396787346, + "learning_rate": 1.1269016356052803e-06, + "loss": 1.0057, + "step": 9811 + }, + { + "epoch": 1.3891130459403978, + "grad_norm": 9.933985442533169, + "learning_rate": 1.1264226687624815e-06, + "loss": 0.9874, + "step": 9812 + }, + { + "epoch": 1.389254618815035, + "grad_norm": 11.263769920296793, + "learning_rate": 1.1259437741276172e-06, + "loss": 1.0732, + "step": 9813 + }, + { + "epoch": 1.3893961916896722, + "grad_norm": 8.128756549708793, + "learning_rate": 1.125464951725864e-06, + "loss": 1.0266, + "step": 9814 + }, + { + "epoch": 1.3895377645643094, + "grad_norm": 9.225283618898722, + "learning_rate": 1.1249862015823943e-06, + "loss": 1.0911, + "step": 9815 + }, + { + "epoch": 1.3896793374389467, + "grad_norm": 12.471955845703892, + "learning_rate": 1.1245075237223741e-06, + "loss": 0.9459, + "step": 9816 + }, + { + "epoch": 1.389820910313584, + "grad_norm": 10.253222819486327, + "learning_rate": 1.1240289181709681e-06, + "loss": 1.0048, + "step": 9817 + }, + { + "epoch": 1.3899624831882211, + "grad_norm": 10.2462638931022, + "learning_rate": 1.1235503849533355e-06, + "loss": 1.0934, + "step": 9818 + }, + { + "epoch": 1.3901040560628584, + "grad_norm": 7.996184869870414, + "learning_rate": 1.1230719240946336e-06, + "loss": 1.0285, + "step": 9819 + }, + { + "epoch": 1.3902456289374956, + "grad_norm": 8.849773822200293, + "learning_rate": 1.1225935356200129e-06, + "loss": 1.0109, + "step": 9820 + }, + { + "epoch": 1.3903872018121328, + "grad_norm": 9.30454769718126, + "learning_rate": 1.1221152195546241e-06, + "loss": 1.0644, + "step": 9821 + }, + { + "epoch": 1.39052877468677, + "grad_norm": 8.22108571328482, + "learning_rate": 1.1216369759236108e-06, + "loss": 0.9903, + "step": 9822 + }, + { + "epoch": 1.3906703475614073, + "grad_norm": 10.58991075112519, + "learning_rate": 1.121158804752113e-06, + "loss": 0.9183, + "step": 9823 + }, + { + "epoch": 1.3908119204360445, + "grad_norm": 8.383582884131203, + "learning_rate": 1.1206807060652696e-06, + "loss": 0.9565, + "step": 9824 + }, + { + "epoch": 1.3909534933106817, + "grad_norm": 7.5930617534688984, + "learning_rate": 1.120202679888212e-06, + "loss": 0.9245, + "step": 9825 + }, + { + "epoch": 1.391095066185319, + "grad_norm": 9.230753903498519, + "learning_rate": 1.119724726246072e-06, + "loss": 1.0103, + "step": 9826 + }, + { + "epoch": 1.3912366390599562, + "grad_norm": 8.644806165844294, + "learning_rate": 1.1192468451639727e-06, + "loss": 0.9606, + "step": 9827 + }, + { + "epoch": 1.3913782119345934, + "grad_norm": 8.826027734148859, + "learning_rate": 1.1187690366670381e-06, + "loss": 1.0395, + "step": 9828 + }, + { + "epoch": 1.3915197848092307, + "grad_norm": 9.606955916925303, + "learning_rate": 
1.1182913007803847e-06, + "loss": 0.9304, + "step": 9829 + }, + { + "epoch": 1.391661357683868, + "grad_norm": 10.236502343310892, + "learning_rate": 1.117813637529127e-06, + "loss": 1.0783, + "step": 9830 + }, + { + "epoch": 1.3918029305585051, + "grad_norm": 10.49549114966309, + "learning_rate": 1.117336046938377e-06, + "loss": 1.1029, + "step": 9831 + }, + { + "epoch": 1.3919445034331421, + "grad_norm": 8.61517659135292, + "learning_rate": 1.116858529033239e-06, + "loss": 0.9538, + "step": 9832 + }, + { + "epoch": 1.3920860763077794, + "grad_norm": 8.714745261517661, + "learning_rate": 1.1163810838388187e-06, + "loss": 0.9938, + "step": 9833 + }, + { + "epoch": 1.3922276491824166, + "grad_norm": 8.848905213308655, + "learning_rate": 1.1159037113802113e-06, + "loss": 0.9565, + "step": 9834 + }, + { + "epoch": 1.3923692220570538, + "grad_norm": 9.05775316820097, + "learning_rate": 1.1154264116825147e-06, + "loss": 1.0408, + "step": 9835 + }, + { + "epoch": 1.392510794931691, + "grad_norm": 11.238404125805491, + "learning_rate": 1.1149491847708186e-06, + "loss": 1.1864, + "step": 9836 + }, + { + "epoch": 1.3926523678063283, + "grad_norm": 9.696230000402378, + "learning_rate": 1.1144720306702106e-06, + "loss": 1.0287, + "step": 9837 + }, + { + "epoch": 1.3927939406809655, + "grad_norm": 8.520940678922189, + "learning_rate": 1.113994949405776e-06, + "loss": 1.0714, + "step": 9838 + }, + { + "epoch": 1.3929355135556027, + "grad_norm": 9.409201124213386, + "learning_rate": 1.1135179410025925e-06, + "loss": 1.0663, + "step": 9839 + }, + { + "epoch": 1.39307708643024, + "grad_norm": 10.419871170506905, + "learning_rate": 1.1130410054857382e-06, + "loss": 1.0578, + "step": 9840 + }, + { + "epoch": 1.3932186593048772, + "grad_norm": 10.943281465845729, + "learning_rate": 1.1125641428802831e-06, + "loss": 0.909, + "step": 9841 + }, + { + "epoch": 1.3933602321795144, + "grad_norm": 7.777053375418068, + "learning_rate": 1.1120873532112971e-06, + "loss": 0.9003, + "step": 9842 + }, + { + "epoch": 1.3935018050541517, + "grad_norm": 9.45067722376513, + "learning_rate": 1.1116106365038443e-06, + "loss": 1.013, + "step": 9843 + }, + { + "epoch": 1.393643377928789, + "grad_norm": 12.175954818100381, + "learning_rate": 1.1111339927829842e-06, + "loss": 1.003, + "step": 9844 + }, + { + "epoch": 1.3937849508034261, + "grad_norm": 10.173304116076723, + "learning_rate": 1.1106574220737754e-06, + "loss": 1.0761, + "step": 9845 + }, + { + "epoch": 1.3939265236780634, + "grad_norm": 8.651016581340311, + "learning_rate": 1.110180924401269e-06, + "loss": 0.9594, + "step": 9846 + }, + { + "epoch": 1.3940680965527004, + "grad_norm": 8.632203121167716, + "learning_rate": 1.1097044997905162e-06, + "loss": 0.9537, + "step": 9847 + }, + { + "epoch": 1.3942096694273376, + "grad_norm": 11.34131102180693, + "learning_rate": 1.1092281482665601e-06, + "loss": 1.0468, + "step": 9848 + }, + { + "epoch": 1.3943512423019748, + "grad_norm": 8.093532250026966, + "learning_rate": 1.1087518698544444e-06, + "loss": 1.061, + "step": 9849 + }, + { + "epoch": 1.394492815176612, + "grad_norm": 8.832124543403774, + "learning_rate": 1.1082756645792046e-06, + "loss": 0.9748, + "step": 9850 + }, + { + "epoch": 1.3946343880512493, + "grad_norm": 8.673499812591423, + "learning_rate": 1.1077995324658762e-06, + "loss": 0.9349, + "step": 9851 + }, + { + "epoch": 1.3947759609258865, + "grad_norm": 9.57494706022204, + "learning_rate": 1.1073234735394872e-06, + "loss": 1.0039, + "step": 9852 + }, + { + "epoch": 1.3949175338005237, + "grad_norm": 
10.50893457937827, + "learning_rate": 1.1068474878250649e-06, + "loss": 1.0982, + "step": 9853 + }, + { + "epoch": 1.395059106675161, + "grad_norm": 7.356896633096619, + "learning_rate": 1.1063715753476334e-06, + "loss": 0.9977, + "step": 9854 + }, + { + "epoch": 1.3952006795497982, + "grad_norm": 8.159319888176812, + "learning_rate": 1.105895736132207e-06, + "loss": 0.9707, + "step": 9855 + }, + { + "epoch": 1.3953422524244354, + "grad_norm": 7.928406079113397, + "learning_rate": 1.1054199702038032e-06, + "loss": 0.8649, + "step": 9856 + }, + { + "epoch": 1.3954838252990727, + "grad_norm": 11.467022924285699, + "learning_rate": 1.104944277587431e-06, + "loss": 0.9684, + "step": 9857 + }, + { + "epoch": 1.39562539817371, + "grad_norm": 8.042555158744348, + "learning_rate": 1.1044686583080976e-06, + "loss": 0.9052, + "step": 9858 + }, + { + "epoch": 1.3957669710483471, + "grad_norm": 10.1371857581488, + "learning_rate": 1.1039931123908074e-06, + "loss": 0.9746, + "step": 9859 + }, + { + "epoch": 1.3959085439229844, + "grad_norm": 11.208382153788792, + "learning_rate": 1.1035176398605576e-06, + "loss": 1.0338, + "step": 9860 + }, + { + "epoch": 1.3960501167976216, + "grad_norm": 9.953044897023622, + "learning_rate": 1.103042240742345e-06, + "loss": 0.9811, + "step": 9861 + }, + { + "epoch": 1.3961916896722588, + "grad_norm": 10.307088645926184, + "learning_rate": 1.1025669150611594e-06, + "loss": 1.0152, + "step": 9862 + }, + { + "epoch": 1.396333262546896, + "grad_norm": 10.920247274540507, + "learning_rate": 1.1020916628419898e-06, + "loss": 0.9956, + "step": 9863 + }, + { + "epoch": 1.3964748354215333, + "grad_norm": 8.833081175845834, + "learning_rate": 1.1016164841098193e-06, + "loss": 1.0007, + "step": 9864 + }, + { + "epoch": 1.3966164082961705, + "grad_norm": 10.232605852576347, + "learning_rate": 1.1011413788896263e-06, + "loss": 1.0058, + "step": 9865 + }, + { + "epoch": 1.3967579811708077, + "grad_norm": 9.528089655994835, + "learning_rate": 1.1006663472063892e-06, + "loss": 0.9644, + "step": 9866 + }, + { + "epoch": 1.396899554045445, + "grad_norm": 9.422239786807468, + "learning_rate": 1.100191389085078e-06, + "loss": 1.1542, + "step": 9867 + }, + { + "epoch": 1.3970411269200822, + "grad_norm": 10.938013729564638, + "learning_rate": 1.0997165045506624e-06, + "loss": 1.0074, + "step": 9868 + }, + { + "epoch": 1.3971826997947194, + "grad_norm": 8.168814966925568, + "learning_rate": 1.0992416936281054e-06, + "loss": 1.0213, + "step": 9869 + }, + { + "epoch": 1.3973242726693567, + "grad_norm": 9.124174315838161, + "learning_rate": 1.098766956342369e-06, + "loss": 0.9767, + "step": 9870 + }, + { + "epoch": 1.397465845543994, + "grad_norm": 8.992548082746504, + "learning_rate": 1.0982922927184077e-06, + "loss": 1.0447, + "step": 9871 + }, + { + "epoch": 1.3976074184186311, + "grad_norm": 9.776236812464134, + "learning_rate": 1.0978177027811767e-06, + "loss": 1.023, + "step": 9872 + }, + { + "epoch": 1.3977489912932681, + "grad_norm": 8.64250066172912, + "learning_rate": 1.0973431865556225e-06, + "loss": 0.9801, + "step": 9873 + }, + { + "epoch": 1.3978905641679054, + "grad_norm": 8.773451413613188, + "learning_rate": 1.096868744066692e-06, + "loss": 0.9677, + "step": 9874 + }, + { + "epoch": 1.3980321370425426, + "grad_norm": 9.503667826078388, + "learning_rate": 1.0963943753393252e-06, + "loss": 0.9831, + "step": 9875 + }, + { + "epoch": 1.3981737099171798, + "grad_norm": 8.556507547541392, + "learning_rate": 1.095920080398459e-06, + "loss": 0.9473, + "step": 9876 + }, + { + "epoch": 
1.398315282791817, + "grad_norm": 10.339206216178496, + "learning_rate": 1.0954458592690278e-06, + "loss": 0.9401, + "step": 9877 + }, + { + "epoch": 1.3984568556664543, + "grad_norm": 10.41906242794626, + "learning_rate": 1.0949717119759597e-06, + "loss": 1.1009, + "step": 9878 + }, + { + "epoch": 1.3985984285410915, + "grad_norm": 8.455068875962255, + "learning_rate": 1.0944976385441822e-06, + "loss": 0.9754, + "step": 9879 + }, + { + "epoch": 1.3987400014157287, + "grad_norm": 8.946358085726665, + "learning_rate": 1.0940236389986148e-06, + "loss": 0.9684, + "step": 9880 + }, + { + "epoch": 1.398881574290366, + "grad_norm": 9.346867784339826, + "learning_rate": 1.0935497133641765e-06, + "loss": 0.9694, + "step": 9881 + }, + { + "epoch": 1.3990231471650032, + "grad_norm": 8.291549553994576, + "learning_rate": 1.0930758616657816e-06, + "loss": 0.9036, + "step": 9882 + }, + { + "epoch": 1.3991647200396404, + "grad_norm": 9.770689311860563, + "learning_rate": 1.0926020839283392e-06, + "loss": 0.9508, + "step": 9883 + }, + { + "epoch": 1.3993062929142777, + "grad_norm": 12.821933455569893, + "learning_rate": 1.0921283801767562e-06, + "loss": 1.0885, + "step": 9884 + }, + { + "epoch": 1.399447865788915, + "grad_norm": 8.107208962288016, + "learning_rate": 1.091654750435934e-06, + "loss": 0.8427, + "step": 9885 + }, + { + "epoch": 1.3995894386635521, + "grad_norm": 12.136076282480252, + "learning_rate": 1.0911811947307732e-06, + "loss": 1.1749, + "step": 9886 + }, + { + "epoch": 1.3997310115381894, + "grad_norm": 10.145484077557303, + "learning_rate": 1.0907077130861646e-06, + "loss": 1.0787, + "step": 9887 + }, + { + "epoch": 1.3998725844128266, + "grad_norm": 9.662377249862082, + "learning_rate": 1.0902343055270006e-06, + "loss": 0.9086, + "step": 9888 + }, + { + "epoch": 1.4000141572874636, + "grad_norm": 9.38767088062786, + "learning_rate": 1.0897609720781693e-06, + "loss": 1.0273, + "step": 9889 + }, + { + "epoch": 1.4001557301621008, + "grad_norm": 9.877597901697415, + "learning_rate": 1.089287712764551e-06, + "loss": 1.0592, + "step": 9890 + }, + { + "epoch": 1.400297303036738, + "grad_norm": 10.235603084117978, + "learning_rate": 1.0888145276110268e-06, + "loss": 0.9977, + "step": 9891 + }, + { + "epoch": 1.4004388759113753, + "grad_norm": 8.62502874839525, + "learning_rate": 1.0883414166424697e-06, + "loss": 0.9734, + "step": 9892 + }, + { + "epoch": 1.4005804487860125, + "grad_norm": 9.127843126478542, + "learning_rate": 1.0878683798837524e-06, + "loss": 0.979, + "step": 9893 + }, + { + "epoch": 1.4007220216606497, + "grad_norm": 9.891314972107672, + "learning_rate": 1.087395417359741e-06, + "loss": 1.0566, + "step": 9894 + }, + { + "epoch": 1.400863594535287, + "grad_norm": 10.167665234497091, + "learning_rate": 1.0869225290952997e-06, + "loss": 1.1065, + "step": 9895 + }, + { + "epoch": 1.4010051674099242, + "grad_norm": 8.431401061789558, + "learning_rate": 1.0864497151152879e-06, + "loss": 1.0283, + "step": 9896 + }, + { + "epoch": 1.4011467402845614, + "grad_norm": 9.248481007167685, + "learning_rate": 1.0859769754445592e-06, + "loss": 0.9978, + "step": 9897 + }, + { + "epoch": 1.4012883131591987, + "grad_norm": 10.478317398229875, + "learning_rate": 1.0855043101079677e-06, + "loss": 0.9315, + "step": 9898 + }, + { + "epoch": 1.401429886033836, + "grad_norm": 8.179632469072285, + "learning_rate": 1.085031719130359e-06, + "loss": 0.9549, + "step": 9899 + }, + { + "epoch": 1.4015714589084731, + "grad_norm": 10.750669236753865, + "learning_rate": 1.0845592025365786e-06, + "loss": 
1.0403, + "step": 9900 + }, + { + "epoch": 1.4017130317831104, + "grad_norm": 10.022738734899104, + "learning_rate": 1.0840867603514648e-06, + "loss": 1.0682, + "step": 9901 + }, + { + "epoch": 1.4018546046577476, + "grad_norm": 10.798832187386196, + "learning_rate": 1.083614392599855e-06, + "loss": 0.9885, + "step": 9902 + }, + { + "epoch": 1.4019961775323848, + "grad_norm": 10.530815681534694, + "learning_rate": 1.0831420993065798e-06, + "loss": 1.0795, + "step": 9903 + }, + { + "epoch": 1.402137750407022, + "grad_norm": 9.921241752060679, + "learning_rate": 1.0826698804964679e-06, + "loss": 0.9896, + "step": 9904 + }, + { + "epoch": 1.4022793232816593, + "grad_norm": 9.350815159151901, + "learning_rate": 1.0821977361943441e-06, + "loss": 1.0132, + "step": 9905 + }, + { + "epoch": 1.4024208961562965, + "grad_norm": 8.765175003768018, + "learning_rate": 1.0817256664250275e-06, + "loss": 0.9847, + "step": 9906 + }, + { + "epoch": 1.4025624690309337, + "grad_norm": 9.03395541630947, + "learning_rate": 1.081253671213337e-06, + "loss": 1.0261, + "step": 9907 + }, + { + "epoch": 1.402704041905571, + "grad_norm": 8.119779817198564, + "learning_rate": 1.0807817505840815e-06, + "loss": 0.9404, + "step": 9908 + }, + { + "epoch": 1.4028456147802082, + "grad_norm": 8.669083869516937, + "learning_rate": 1.0803099045620716e-06, + "loss": 0.9466, + "step": 9909 + }, + { + "epoch": 1.4029871876548454, + "grad_norm": 8.702052380743696, + "learning_rate": 1.079838133172111e-06, + "loss": 0.952, + "step": 9910 + }, + { + "epoch": 1.4031287605294827, + "grad_norm": 10.601527237183433, + "learning_rate": 1.0793664364390004e-06, + "loss": 0.9277, + "step": 9911 + }, + { + "epoch": 1.4032703334041199, + "grad_norm": 7.9785318811033905, + "learning_rate": 1.0788948143875383e-06, + "loss": 1.0133, + "step": 9912 + }, + { + "epoch": 1.4034119062787571, + "grad_norm": 9.908616418112524, + "learning_rate": 1.0784232670425148e-06, + "loss": 1.0181, + "step": 9913 + }, + { + "epoch": 1.4035534791533943, + "grad_norm": 9.385296672995304, + "learning_rate": 1.0779517944287216e-06, + "loss": 1.127, + "step": 9914 + }, + { + "epoch": 1.4036950520280314, + "grad_norm": 8.50803578822003, + "learning_rate": 1.077480396570941e-06, + "loss": 1.0146, + "step": 9915 + }, + { + "epoch": 1.4038366249026686, + "grad_norm": 10.309075174079595, + "learning_rate": 1.0770090734939564e-06, + "loss": 0.9438, + "step": 9916 + }, + { + "epoch": 1.4039781977773058, + "grad_norm": 7.906534182300036, + "learning_rate": 1.0765378252225436e-06, + "loss": 1.051, + "step": 9917 + }, + { + "epoch": 1.404119770651943, + "grad_norm": 8.682100443039538, + "learning_rate": 1.076066651781475e-06, + "loss": 0.9939, + "step": 9918 + }, + { + "epoch": 1.4042613435265803, + "grad_norm": 10.414361398561304, + "learning_rate": 1.075595553195522e-06, + "loss": 1.0599, + "step": 9919 + }, + { + "epoch": 1.4044029164012175, + "grad_norm": 8.955649134943968, + "learning_rate": 1.0751245294894474e-06, + "loss": 0.9874, + "step": 9920 + }, + { + "epoch": 1.4045444892758547, + "grad_norm": 7.882142476092889, + "learning_rate": 1.074653580688015e-06, + "loss": 0.9548, + "step": 9921 + }, + { + "epoch": 1.404686062150492, + "grad_norm": 9.618622884226454, + "learning_rate": 1.0741827068159803e-06, + "loss": 0.9802, + "step": 9922 + }, + { + "epoch": 1.4048276350251292, + "grad_norm": 8.36650132500956, + "learning_rate": 1.0737119078980981e-06, + "loss": 1.1444, + "step": 9923 + }, + { + "epoch": 1.4049692078997664, + "grad_norm": 10.99576833998965, + 
"learning_rate": 1.0732411839591167e-06, + "loss": 1.0017, + "step": 9924 + }, + { + "epoch": 1.4051107807744037, + "grad_norm": 11.120771311623564, + "learning_rate": 1.0727705350237833e-06, + "loss": 1.0354, + "step": 9925 + }, + { + "epoch": 1.4052523536490409, + "grad_norm": 8.803277133741561, + "learning_rate": 1.0722999611168377e-06, + "loss": 0.8871, + "step": 9926 + }, + { + "epoch": 1.4053939265236781, + "grad_norm": 7.775924637448923, + "learning_rate": 1.0718294622630188e-06, + "loss": 0.966, + "step": 9927 + }, + { + "epoch": 1.4055354993983153, + "grad_norm": 9.316487322760453, + "learning_rate": 1.071359038487062e-06, + "loss": 1.008, + "step": 9928 + }, + { + "epoch": 1.4056770722729526, + "grad_norm": 12.12770412219026, + "learning_rate": 1.0708886898136932e-06, + "loss": 1.133, + "step": 9929 + }, + { + "epoch": 1.4058186451475896, + "grad_norm": 8.786018570117703, + "learning_rate": 1.0704184162676417e-06, + "loss": 0.9342, + "step": 9930 + }, + { + "epoch": 1.4059602180222268, + "grad_norm": 8.761853282098013, + "learning_rate": 1.069948217873627e-06, + "loss": 1.0629, + "step": 9931 + }, + { + "epoch": 1.406101790896864, + "grad_norm": 9.927528993117052, + "learning_rate": 1.069478094656369e-06, + "loss": 1.0585, + "step": 9932 + }, + { + "epoch": 1.4062433637715013, + "grad_norm": 9.68641942058087, + "learning_rate": 1.0690080466405803e-06, + "loss": 1.0358, + "step": 9933 + }, + { + "epoch": 1.4063849366461385, + "grad_norm": 8.489293255365014, + "learning_rate": 1.0685380738509712e-06, + "loss": 0.9752, + "step": 9934 + }, + { + "epoch": 1.4065265095207757, + "grad_norm": 8.387986775356449, + "learning_rate": 1.0680681763122493e-06, + "loss": 0.9716, + "step": 9935 + }, + { + "epoch": 1.406668082395413, + "grad_norm": 10.902521496222144, + "learning_rate": 1.067598354049115e-06, + "loss": 0.9811, + "step": 9936 + }, + { + "epoch": 1.4068096552700502, + "grad_norm": 9.243556072065656, + "learning_rate": 1.0671286070862678e-06, + "loss": 0.992, + "step": 9937 + }, + { + "epoch": 1.4069512281446874, + "grad_norm": 9.370126093094386, + "learning_rate": 1.0666589354484005e-06, + "loss": 0.9365, + "step": 9938 + }, + { + "epoch": 1.4070928010193247, + "grad_norm": 9.997209541086875, + "learning_rate": 1.066189339160205e-06, + "loss": 1.0858, + "step": 9939 + }, + { + "epoch": 1.4072343738939619, + "grad_norm": 9.726326004278487, + "learning_rate": 1.065719818246367e-06, + "loss": 1.0589, + "step": 9940 + }, + { + "epoch": 1.4073759467685991, + "grad_norm": 8.251168832921492, + "learning_rate": 1.065250372731568e-06, + "loss": 0.9036, + "step": 9941 + }, + { + "epoch": 1.4075175196432363, + "grad_norm": 7.201765945881455, + "learning_rate": 1.0647810026404878e-06, + "loss": 0.9568, + "step": 9942 + }, + { + "epoch": 1.4076590925178736, + "grad_norm": 9.494388328323415, + "learning_rate": 1.064311707997799e-06, + "loss": 1.0038, + "step": 9943 + }, + { + "epoch": 1.4078006653925108, + "grad_norm": 10.516295005006478, + "learning_rate": 1.0638424888281744e-06, + "loss": 1.1114, + "step": 9944 + }, + { + "epoch": 1.407942238267148, + "grad_norm": 9.566196101331036, + "learning_rate": 1.0633733451562787e-06, + "loss": 1.0112, + "step": 9945 + }, + { + "epoch": 1.4080838111417853, + "grad_norm": 8.960243577371115, + "learning_rate": 1.0629042770067754e-06, + "loss": 1.0299, + "step": 9946 + }, + { + "epoch": 1.4082253840164225, + "grad_norm": 9.766705018403284, + "learning_rate": 1.0624352844043224e-06, + "loss": 1.0099, + "step": 9947 + }, + { + "epoch": 1.4083669568910597, + 
"grad_norm": 9.192886953086422, + "learning_rate": 1.061966367373575e-06, + "loss": 1.109, + "step": 9948 + }, + { + "epoch": 1.408508529765697, + "grad_norm": 8.795389649660523, + "learning_rate": 1.0614975259391835e-06, + "loss": 1.0161, + "step": 9949 + }, + { + "epoch": 1.4086501026403342, + "grad_norm": 8.913048220943674, + "learning_rate": 1.0610287601257937e-06, + "loss": 1.1388, + "step": 9950 + }, + { + "epoch": 1.4087916755149714, + "grad_norm": 11.112956525352512, + "learning_rate": 1.06056006995805e-06, + "loss": 1.082, + "step": 9951 + }, + { + "epoch": 1.4089332483896087, + "grad_norm": 10.566218156251, + "learning_rate": 1.060091455460589e-06, + "loss": 1.037, + "step": 9952 + }, + { + "epoch": 1.4090748212642459, + "grad_norm": 9.428144048937709, + "learning_rate": 1.0596229166580477e-06, + "loss": 0.9881, + "step": 9953 + }, + { + "epoch": 1.409216394138883, + "grad_norm": 10.563663937695775, + "learning_rate": 1.0591544535750545e-06, + "loss": 1.0606, + "step": 9954 + }, + { + "epoch": 1.4093579670135203, + "grad_norm": 7.9918879865828885, + "learning_rate": 1.0586860662362375e-06, + "loss": 0.9399, + "step": 9955 + }, + { + "epoch": 1.4094995398881573, + "grad_norm": 8.641710540846491, + "learning_rate": 1.0582177546662203e-06, + "loss": 0.9624, + "step": 9956 + }, + { + "epoch": 1.4096411127627946, + "grad_norm": 9.54186580622285, + "learning_rate": 1.0577495188896198e-06, + "loss": 1.0746, + "step": 9957 + }, + { + "epoch": 1.4097826856374318, + "grad_norm": 7.907770907705898, + "learning_rate": 1.0572813589310524e-06, + "loss": 1.0465, + "step": 9958 + }, + { + "epoch": 1.409924258512069, + "grad_norm": 12.086603463188787, + "learning_rate": 1.0568132748151274e-06, + "loss": 1.0087, + "step": 9959 + }, + { + "epoch": 1.4100658313867063, + "grad_norm": 10.828175378107014, + "learning_rate": 1.0563452665664542e-06, + "loss": 0.9298, + "step": 9960 + }, + { + "epoch": 1.4102074042613435, + "grad_norm": 9.217026785448324, + "learning_rate": 1.055877334209632e-06, + "loss": 1.0611, + "step": 9961 + }, + { + "epoch": 1.4103489771359807, + "grad_norm": 9.700845693315987, + "learning_rate": 1.055409477769262e-06, + "loss": 1.1251, + "step": 9962 + }, + { + "epoch": 1.410490550010618, + "grad_norm": 11.901439265559308, + "learning_rate": 1.0549416972699392e-06, + "loss": 1.0218, + "step": 9963 + }, + { + "epoch": 1.4106321228852552, + "grad_norm": 9.372457744496012, + "learning_rate": 1.054473992736253e-06, + "loss": 1.0618, + "step": 9964 + }, + { + "epoch": 1.4107736957598924, + "grad_norm": 9.326154900180887, + "learning_rate": 1.0540063641927923e-06, + "loss": 0.9688, + "step": 9965 + }, + { + "epoch": 1.4109152686345297, + "grad_norm": 10.176788502811535, + "learning_rate": 1.0535388116641376e-06, + "loss": 1.0063, + "step": 9966 + }, + { + "epoch": 1.4110568415091669, + "grad_norm": 10.154598723934916, + "learning_rate": 1.0530713351748704e-06, + "loss": 1.0217, + "step": 9967 + }, + { + "epoch": 1.4111984143838041, + "grad_norm": 8.255454370019178, + "learning_rate": 1.052603934749563e-06, + "loss": 0.9024, + "step": 9968 + }, + { + "epoch": 1.4113399872584413, + "grad_norm": 10.210583331176071, + "learning_rate": 1.0521366104127885e-06, + "loss": 1.03, + "step": 9969 + }, + { + "epoch": 1.4114815601330786, + "grad_norm": 8.481595478035212, + "learning_rate": 1.0516693621891127e-06, + "loss": 0.9556, + "step": 9970 + }, + { + "epoch": 1.4116231330077158, + "grad_norm": 8.85825842762591, + "learning_rate": 1.0512021901030978e-06, + "loss": 0.8219, + "step": 9971 + }, + { 
+ "epoch": 1.4117647058823528, + "grad_norm": 7.9993407454652505, + "learning_rate": 1.0507350941793044e-06, + "loss": 0.9926, + "step": 9972 + }, + { + "epoch": 1.41190627875699, + "grad_norm": 11.715142673953816, + "learning_rate": 1.0502680744422856e-06, + "loss": 0.9621, + "step": 9973 + }, + { + "epoch": 1.4120478516316273, + "grad_norm": 9.98393217486794, + "learning_rate": 1.049801130916594e-06, + "loss": 0.8474, + "step": 9974 + }, + { + "epoch": 1.4121894245062645, + "grad_norm": 7.907567815305924, + "learning_rate": 1.0493342636267747e-06, + "loss": 1.0552, + "step": 9975 + }, + { + "epoch": 1.4123309973809017, + "grad_norm": 8.542281065142872, + "learning_rate": 1.0488674725973727e-06, + "loss": 0.9288, + "step": 9976 + }, + { + "epoch": 1.412472570255539, + "grad_norm": 9.341883648392205, + "learning_rate": 1.0484007578529246e-06, + "loss": 1.0488, + "step": 9977 + }, + { + "epoch": 1.4126141431301762, + "grad_norm": 8.019366664470377, + "learning_rate": 1.047934119417966e-06, + "loss": 0.9763, + "step": 9978 + }, + { + "epoch": 1.4127557160048134, + "grad_norm": 9.023462865224088, + "learning_rate": 1.0474675573170293e-06, + "loss": 1.0877, + "step": 9979 + }, + { + "epoch": 1.4128972888794507, + "grad_norm": 8.664746242886359, + "learning_rate": 1.047001071574639e-06, + "loss": 1.042, + "step": 9980 + }, + { + "epoch": 1.4130388617540879, + "grad_norm": 9.290147825446716, + "learning_rate": 1.0465346622153209e-06, + "loss": 0.9932, + "step": 9981 + }, + { + "epoch": 1.4131804346287251, + "grad_norm": 9.108786614654479, + "learning_rate": 1.04606832926359e-06, + "loss": 1.0223, + "step": 9982 + }, + { + "epoch": 1.4133220075033623, + "grad_norm": 8.996533892022667, + "learning_rate": 1.0456020727439635e-06, + "loss": 1.0207, + "step": 9983 + }, + { + "epoch": 1.4134635803779996, + "grad_norm": 10.586047768106509, + "learning_rate": 1.0451358926809513e-06, + "loss": 0.984, + "step": 9984 + }, + { + "epoch": 1.4136051532526368, + "grad_norm": 9.32829430201261, + "learning_rate": 1.04466978909906e-06, + "loss": 0.9047, + "step": 9985 + }, + { + "epoch": 1.413746726127274, + "grad_norm": 9.881309714670728, + "learning_rate": 1.0442037620227938e-06, + "loss": 1.0072, + "step": 9986 + }, + { + "epoch": 1.4138882990019113, + "grad_norm": 8.968860585664052, + "learning_rate": 1.0437378114766495e-06, + "loss": 0.9894, + "step": 9987 + }, + { + "epoch": 1.4140298718765485, + "grad_norm": 9.371013556737948, + "learning_rate": 1.0432719374851233e-06, + "loss": 0.9621, + "step": 9988 + }, + { + "epoch": 1.4141714447511857, + "grad_norm": 9.53251104298694, + "learning_rate": 1.0428061400727045e-06, + "loss": 0.9185, + "step": 9989 + }, + { + "epoch": 1.414313017625823, + "grad_norm": 9.849703293412533, + "learning_rate": 1.0423404192638812e-06, + "loss": 0.914, + "step": 9990 + }, + { + "epoch": 1.4144545905004602, + "grad_norm": 9.297636113146204, + "learning_rate": 1.041874775083134e-06, + "loss": 0.9649, + "step": 9991 + }, + { + "epoch": 1.4145961633750974, + "grad_norm": 8.158087395561124, + "learning_rate": 1.041409207554944e-06, + "loss": 0.915, + "step": 9992 + }, + { + "epoch": 1.4147377362497346, + "grad_norm": 9.718544715602237, + "learning_rate": 1.0409437167037843e-06, + "loss": 0.9845, + "step": 9993 + }, + { + "epoch": 1.4148793091243719, + "grad_norm": 9.201212297260499, + "learning_rate": 1.0404783025541244e-06, + "loss": 0.9492, + "step": 9994 + }, + { + "epoch": 1.415020881999009, + "grad_norm": 8.992170106819625, + "learning_rate": 1.0400129651304328e-06, + "loss": 
1.1167, + "step": 9995 + }, + { + "epoch": 1.4151624548736463, + "grad_norm": 9.82094759469344, + "learning_rate": 1.03954770445717e-06, + "loss": 0.9107, + "step": 9996 + }, + { + "epoch": 1.4153040277482836, + "grad_norm": 11.001914464637826, + "learning_rate": 1.0390825205587966e-06, + "loss": 0.989, + "step": 9997 + }, + { + "epoch": 1.4154456006229206, + "grad_norm": 9.779058728938253, + "learning_rate": 1.0386174134597649e-06, + "loss": 0.9825, + "step": 9998 + }, + { + "epoch": 1.4155871734975578, + "grad_norm": 9.901383320657658, + "learning_rate": 1.0381523831845266e-06, + "loss": 1.0791, + "step": 9999 + }, + { + "epoch": 1.415728746372195, + "grad_norm": 10.373147822129312, + "learning_rate": 1.037687429757527e-06, + "loss": 0.9561, + "step": 10000 + }, + { + "epoch": 1.4158703192468323, + "grad_norm": 8.36939929041636, + "learning_rate": 1.0372225532032087e-06, + "loss": 0.9785, + "step": 10001 + }, + { + "epoch": 1.4160118921214695, + "grad_norm": 8.230342681165473, + "learning_rate": 1.0367577535460122e-06, + "loss": 0.8896, + "step": 10002 + }, + { + "epoch": 1.4161534649961067, + "grad_norm": 8.162064273998988, + "learning_rate": 1.0362930308103675e-06, + "loss": 1.0298, + "step": 10003 + }, + { + "epoch": 1.416295037870744, + "grad_norm": 8.258453199926189, + "learning_rate": 1.0358283850207077e-06, + "loss": 0.9029, + "step": 10004 + }, + { + "epoch": 1.4164366107453812, + "grad_norm": 8.922044742392515, + "learning_rate": 1.035363816201457e-06, + "loss": 1.0004, + "step": 10005 + }, + { + "epoch": 1.4165781836200184, + "grad_norm": 9.112996188078089, + "learning_rate": 1.0348993243770395e-06, + "loss": 1.1048, + "step": 10006 + }, + { + "epoch": 1.4167197564946556, + "grad_norm": 10.826057886318091, + "learning_rate": 1.0344349095718712e-06, + "loss": 0.969, + "step": 10007 + }, + { + "epoch": 1.4168613293692929, + "grad_norm": 8.257863776301303, + "learning_rate": 1.0339705718103666e-06, + "loss": 0.8446, + "step": 10008 + }, + { + "epoch": 1.41700290224393, + "grad_norm": 8.325638791798735, + "learning_rate": 1.0335063111169372e-06, + "loss": 0.9271, + "step": 10009 + }, + { + "epoch": 1.4171444751185673, + "grad_norm": 8.0959361301662, + "learning_rate": 1.0330421275159863e-06, + "loss": 0.9124, + "step": 10010 + }, + { + "epoch": 1.4172860479932046, + "grad_norm": 8.847889500926504, + "learning_rate": 1.032578021031918e-06, + "loss": 0.9488, + "step": 10011 + }, + { + "epoch": 1.4174276208678418, + "grad_norm": 7.884875448207171, + "learning_rate": 1.032113991689128e-06, + "loss": 0.8708, + "step": 10012 + }, + { + "epoch": 1.4175691937424788, + "grad_norm": 9.680814934467561, + "learning_rate": 1.031650039512012e-06, + "loss": 0.9081, + "step": 10013 + }, + { + "epoch": 1.417710766617116, + "grad_norm": 8.111215489149933, + "learning_rate": 1.0311861645249588e-06, + "loss": 0.8486, + "step": 10014 + }, + { + "epoch": 1.4178523394917533, + "grad_norm": 8.579129964469919, + "learning_rate": 1.0307223667523524e-06, + "loss": 0.9023, + "step": 10015 + }, + { + "epoch": 1.4179939123663905, + "grad_norm": 9.977347083968983, + "learning_rate": 1.0302586462185769e-06, + "loss": 0.915, + "step": 10016 + }, + { + "epoch": 1.4181354852410277, + "grad_norm": 9.090893693390724, + "learning_rate": 1.0297950029480073e-06, + "loss": 1.024, + "step": 10017 + }, + { + "epoch": 1.418277058115665, + "grad_norm": 10.281589861878883, + "learning_rate": 1.0293314369650193e-06, + "loss": 0.9581, + "step": 10018 + }, + { + "epoch": 1.4184186309903022, + "grad_norm": 10.077475451451587, + 
"learning_rate": 1.0288679482939801e-06, + "loss": 0.9837, + "step": 10019 + }, + { + "epoch": 1.4185602038649394, + "grad_norm": 9.008164411355496, + "learning_rate": 1.0284045369592567e-06, + "loss": 1.0137, + "step": 10020 + }, + { + "epoch": 1.4187017767395766, + "grad_norm": 9.917025605018166, + "learning_rate": 1.0279412029852087e-06, + "loss": 0.9903, + "step": 10021 + }, + { + "epoch": 1.4188433496142139, + "grad_norm": 8.782629590249794, + "learning_rate": 1.0274779463961947e-06, + "loss": 0.8891, + "step": 10022 + }, + { + "epoch": 1.418984922488851, + "grad_norm": 9.959990380830218, + "learning_rate": 1.0270147672165677e-06, + "loss": 1.1136, + "step": 10023 + }, + { + "epoch": 1.4191264953634883, + "grad_norm": 10.056363906357499, + "learning_rate": 1.0265516654706748e-06, + "loss": 1.1713, + "step": 10024 + }, + { + "epoch": 1.4192680682381256, + "grad_norm": 9.597767538528572, + "learning_rate": 1.026088641182863e-06, + "loss": 1.0733, + "step": 10025 + }, + { + "epoch": 1.4194096411127628, + "grad_norm": 8.47833092872141, + "learning_rate": 1.0256256943774718e-06, + "loss": 0.9263, + "step": 10026 + }, + { + "epoch": 1.4195512139874, + "grad_norm": 10.629847957639802, + "learning_rate": 1.025162825078839e-06, + "loss": 1.0289, + "step": 10027 + }, + { + "epoch": 1.4196927868620373, + "grad_norm": 10.994680679076737, + "learning_rate": 1.0247000333112962e-06, + "loss": 0.9997, + "step": 10028 + }, + { + "epoch": 1.4198343597366745, + "grad_norm": 8.488671776134899, + "learning_rate": 1.0242373190991734e-06, + "loss": 0.9207, + "step": 10029 + }, + { + "epoch": 1.4199759326113117, + "grad_norm": 9.608594115248184, + "learning_rate": 1.0237746824667932e-06, + "loss": 0.9619, + "step": 10030 + }, + { + "epoch": 1.420117505485949, + "grad_norm": 8.768962258823999, + "learning_rate": 1.0233121234384777e-06, + "loss": 0.9249, + "step": 10031 + }, + { + "epoch": 1.4202590783605862, + "grad_norm": 11.00598640537826, + "learning_rate": 1.0228496420385434e-06, + "loss": 1.0945, + "step": 10032 + }, + { + "epoch": 1.4204006512352234, + "grad_norm": 9.459851203377307, + "learning_rate": 1.022387238291301e-06, + "loss": 0.9718, + "step": 10033 + }, + { + "epoch": 1.4205422241098606, + "grad_norm": 7.815255373481686, + "learning_rate": 1.021924912221062e-06, + "loss": 0.8313, + "step": 10034 + }, + { + "epoch": 1.4206837969844979, + "grad_norm": 6.3814332702203656, + "learning_rate": 1.021462663852126e-06, + "loss": 0.8674, + "step": 10035 + }, + { + "epoch": 1.420825369859135, + "grad_norm": 11.214812796565608, + "learning_rate": 1.0210004932087956e-06, + "loss": 1.0402, + "step": 10036 + }, + { + "epoch": 1.4209669427337723, + "grad_norm": 11.859894792627273, + "learning_rate": 1.0205384003153673e-06, + "loss": 1.025, + "step": 10037 + }, + { + "epoch": 1.4211085156084096, + "grad_norm": 8.933300819148016, + "learning_rate": 1.0200763851961313e-06, + "loss": 0.9491, + "step": 10038 + }, + { + "epoch": 1.4212500884830466, + "grad_norm": 8.670522223455828, + "learning_rate": 1.019614447875377e-06, + "loss": 0.9547, + "step": 10039 + }, + { + "epoch": 1.4213916613576838, + "grad_norm": 11.259904697482423, + "learning_rate": 1.0191525883773867e-06, + "loss": 1.0307, + "step": 10040 + }, + { + "epoch": 1.421533234232321, + "grad_norm": 8.94266558136864, + "learning_rate": 1.0186908067264415e-06, + "loss": 0.9563, + "step": 10041 + }, + { + "epoch": 1.4216748071069583, + "grad_norm": 10.100448039475502, + "learning_rate": 1.018229102946815e-06, + "loss": 0.9784, + "step": 10042 + }, + { + 
"epoch": 1.4218163799815955, + "grad_norm": 9.801698996738079, + "learning_rate": 1.0177674770627807e-06, + "loss": 1.032, + "step": 10043 + }, + { + "epoch": 1.4219579528562327, + "grad_norm": 10.391484045786152, + "learning_rate": 1.0173059290986048e-06, + "loss": 1.0662, + "step": 10044 + }, + { + "epoch": 1.42209952573087, + "grad_norm": 7.816105125227864, + "learning_rate": 1.01684445907855e-06, + "loss": 0.9509, + "step": 10045 + }, + { + "epoch": 1.4222410986055072, + "grad_norm": 9.409739914643255, + "learning_rate": 1.0163830670268768e-06, + "loss": 1.0095, + "step": 10046 + }, + { + "epoch": 1.4223826714801444, + "grad_norm": 8.951594409962325, + "learning_rate": 1.015921752967839e-06, + "loss": 0.9509, + "step": 10047 + }, + { + "epoch": 1.4225242443547816, + "grad_norm": 9.254385655981109, + "learning_rate": 1.0154605169256884e-06, + "loss": 1.0046, + "step": 10048 + }, + { + "epoch": 1.4226658172294189, + "grad_norm": 9.18845326155843, + "learning_rate": 1.014999358924671e-06, + "loss": 1.0559, + "step": 10049 + }, + { + "epoch": 1.422807390104056, + "grad_norm": 10.557515971745074, + "learning_rate": 1.014538278989031e-06, + "loss": 1.0383, + "step": 10050 + }, + { + "epoch": 1.4229489629786933, + "grad_norm": 8.65421819425134, + "learning_rate": 1.014077277143005e-06, + "loss": 0.9338, + "step": 10051 + }, + { + "epoch": 1.4230905358533306, + "grad_norm": 8.024176781757886, + "learning_rate": 1.0136163534108284e-06, + "loss": 0.916, + "step": 10052 + }, + { + "epoch": 1.4232321087279678, + "grad_norm": 9.728950666906558, + "learning_rate": 1.0131555078167328e-06, + "loss": 1.118, + "step": 10053 + }, + { + "epoch": 1.423373681602605, + "grad_norm": 8.713828608653953, + "learning_rate": 1.012694740384943e-06, + "loss": 0.9741, + "step": 10054 + }, + { + "epoch": 1.423515254477242, + "grad_norm": 11.159987392623506, + "learning_rate": 1.0122340511396833e-06, + "loss": 1.0491, + "step": 10055 + }, + { + "epoch": 1.4236568273518793, + "grad_norm": 9.162955995378466, + "learning_rate": 1.0117734401051682e-06, + "loss": 0.9688, + "step": 10056 + }, + { + "epoch": 1.4237984002265165, + "grad_norm": 9.51898665155901, + "learning_rate": 1.0113129073056149e-06, + "loss": 0.8839, + "step": 10057 + }, + { + "epoch": 1.4239399731011537, + "grad_norm": 12.051690352661959, + "learning_rate": 1.0108524527652308e-06, + "loss": 1.043, + "step": 10058 + }, + { + "epoch": 1.424081545975791, + "grad_norm": 9.369276003230294, + "learning_rate": 1.010392076508223e-06, + "loss": 1.0495, + "step": 10059 + }, + { + "epoch": 1.4242231188504282, + "grad_norm": 8.784590007874318, + "learning_rate": 1.0099317785587941e-06, + "loss": 0.9606, + "step": 10060 + }, + { + "epoch": 1.4243646917250654, + "grad_norm": 9.328017446642308, + "learning_rate": 1.0094715589411398e-06, + "loss": 0.9156, + "step": 10061 + }, + { + "epoch": 1.4245062645997026, + "grad_norm": 7.78077003806525, + "learning_rate": 1.009011417679455e-06, + "loss": 0.9424, + "step": 10062 + }, + { + "epoch": 1.4246478374743399, + "grad_norm": 8.098811742962619, + "learning_rate": 1.0085513547979272e-06, + "loss": 0.9325, + "step": 10063 + }, + { + "epoch": 1.424789410348977, + "grad_norm": 11.7426644010773, + "learning_rate": 1.0080913703207434e-06, + "loss": 1.0565, + "step": 10064 + }, + { + "epoch": 1.4249309832236143, + "grad_norm": 11.11757454965729, + "learning_rate": 1.0076314642720834e-06, + "loss": 0.9146, + "step": 10065 + }, + { + "epoch": 1.4250725560982516, + "grad_norm": 9.121701833316934, + "learning_rate": 
1.007171636676125e-06, + "loss": 0.9089, + "step": 10066 + }, + { + "epoch": 1.4252141289728888, + "grad_norm": 10.19945325695072, + "learning_rate": 1.006711887557041e-06, + "loss": 0.9306, + "step": 10067 + }, + { + "epoch": 1.425355701847526, + "grad_norm": 9.742018342557088, + "learning_rate": 1.0062522169389986e-06, + "loss": 1.0949, + "step": 10068 + }, + { + "epoch": 1.4254972747221633, + "grad_norm": 9.333642500343515, + "learning_rate": 1.0057926248461638e-06, + "loss": 0.9547, + "step": 10069 + }, + { + "epoch": 1.4256388475968005, + "grad_norm": 9.013648386683915, + "learning_rate": 1.0053331113026962e-06, + "loss": 0.9283, + "step": 10070 + }, + { + "epoch": 1.4257804204714377, + "grad_norm": 9.330328548489272, + "learning_rate": 1.0048736763327532e-06, + "loss": 1.0323, + "step": 10071 + }, + { + "epoch": 1.425921993346075, + "grad_norm": 9.819019654357366, + "learning_rate": 1.0044143199604856e-06, + "loss": 1.0756, + "step": 10072 + }, + { + "epoch": 1.4260635662207122, + "grad_norm": 9.929667523091627, + "learning_rate": 1.0039550422100424e-06, + "loss": 0.923, + "step": 10073 + }, + { + "epoch": 1.4262051390953494, + "grad_norm": 11.535361944703448, + "learning_rate": 1.0034958431055666e-06, + "loss": 1.0485, + "step": 10074 + }, + { + "epoch": 1.4263467119699866, + "grad_norm": 9.140357484733869, + "learning_rate": 1.0030367226711984e-06, + "loss": 0.906, + "step": 10075 + }, + { + "epoch": 1.4264882848446239, + "grad_norm": 9.280017706755483, + "learning_rate": 1.0025776809310752e-06, + "loss": 1.1052, + "step": 10076 + }, + { + "epoch": 1.426629857719261, + "grad_norm": 8.862522434050193, + "learning_rate": 1.0021187179093254e-06, + "loss": 0.9694, + "step": 10077 + }, + { + "epoch": 1.4267714305938983, + "grad_norm": 10.64029911567321, + "learning_rate": 1.0016598336300781e-06, + "loss": 1.0069, + "step": 10078 + }, + { + "epoch": 1.4269130034685356, + "grad_norm": 10.715097233148205, + "learning_rate": 1.0012010281174555e-06, + "loss": 1.0523, + "step": 10079 + }, + { + "epoch": 1.4270545763431726, + "grad_norm": 11.586765869324662, + "learning_rate": 1.0007423013955784e-06, + "loss": 1.0422, + "step": 10080 + }, + { + "epoch": 1.4271961492178098, + "grad_norm": 10.614584451412682, + "learning_rate": 1.0002836534885594e-06, + "loss": 0.9988, + "step": 10081 + }, + { + "epoch": 1.427337722092447, + "grad_norm": 11.247589192101449, + "learning_rate": 9.998250844205107e-07, + "loss": 1.0537, + "step": 10082 + }, + { + "epoch": 1.4274792949670843, + "grad_norm": 8.318097746604707, + "learning_rate": 9.993665942155395e-07, + "loss": 0.8959, + "step": 10083 + }, + { + "epoch": 1.4276208678417215, + "grad_norm": 9.781726959592142, + "learning_rate": 9.989081828977464e-07, + "loss": 1.1051, + "step": 10084 + }, + { + "epoch": 1.4277624407163587, + "grad_norm": 9.808014251350142, + "learning_rate": 9.984498504912321e-07, + "loss": 0.9045, + "step": 10085 + }, + { + "epoch": 1.427904013590996, + "grad_norm": 9.60707821573561, + "learning_rate": 9.979915970200888e-07, + "loss": 0.9625, + "step": 10086 + }, + { + "epoch": 1.4280455864656332, + "grad_norm": 8.437164526027557, + "learning_rate": 9.97533422508408e-07, + "loss": 0.9823, + "step": 10087 + }, + { + "epoch": 1.4281871593402704, + "grad_norm": 9.959522341262822, + "learning_rate": 9.970753269802746e-07, + "loss": 0.9167, + "step": 10088 + }, + { + "epoch": 1.4283287322149076, + "grad_norm": 9.0213086808368, + "learning_rate": 9.966173104597701e-07, + "loss": 0.9576, + "step": 10089 + }, + { + "epoch": 1.4284703050895449, 
+ "grad_norm": 8.547373532221252, + "learning_rate": 9.961593729709734e-07, + "loss": 0.9802, + "step": 10090 + }, + { + "epoch": 1.428611877964182, + "grad_norm": 11.014074857685648, + "learning_rate": 9.957015145379564e-07, + "loss": 1.0474, + "step": 10091 + }, + { + "epoch": 1.4287534508388193, + "grad_norm": 9.155254218729828, + "learning_rate": 9.9524373518479e-07, + "loss": 1.0277, + "step": 10092 + }, + { + "epoch": 1.4288950237134566, + "grad_norm": 8.970045468014206, + "learning_rate": 9.947860349355372e-07, + "loss": 0.8895, + "step": 10093 + }, + { + "epoch": 1.4290365965880938, + "grad_norm": 10.004422163702902, + "learning_rate": 9.943284138142615e-07, + "loss": 0.9628, + "step": 10094 + }, + { + "epoch": 1.429178169462731, + "grad_norm": 10.070359471864851, + "learning_rate": 9.938708718450175e-07, + "loss": 0.957, + "step": 10095 + }, + { + "epoch": 1.429319742337368, + "grad_norm": 8.205961656116859, + "learning_rate": 9.934134090518593e-07, + "loss": 0.8662, + "step": 10096 + }, + { + "epoch": 1.4294613152120053, + "grad_norm": 7.687691042627463, + "learning_rate": 9.929560254588353e-07, + "loss": 0.8529, + "step": 10097 + }, + { + "epoch": 1.4296028880866425, + "grad_norm": 9.962839508972477, + "learning_rate": 9.92498721089988e-07, + "loss": 0.9102, + "step": 10098 + }, + { + "epoch": 1.4297444609612797, + "grad_norm": 10.164451691362164, + "learning_rate": 9.9204149596936e-07, + "loss": 0.985, + "step": 10099 + }, + { + "epoch": 1.429886033835917, + "grad_norm": 7.39765244440643, + "learning_rate": 9.91584350120985e-07, + "loss": 0.8694, + "step": 10100 + }, + { + "epoch": 1.4300276067105542, + "grad_norm": 9.36281149413503, + "learning_rate": 9.911272835688973e-07, + "loss": 1.0648, + "step": 10101 + }, + { + "epoch": 1.4301691795851914, + "grad_norm": 9.22975582771073, + "learning_rate": 9.906702963371222e-07, + "loss": 0.9487, + "step": 10102 + }, + { + "epoch": 1.4303107524598286, + "grad_norm": 8.889900271840046, + "learning_rate": 9.902133884496853e-07, + "loss": 0.913, + "step": 10103 + }, + { + "epoch": 1.4304523253344659, + "grad_norm": 10.706660000176292, + "learning_rate": 9.897565599306037e-07, + "loss": 0.9916, + "step": 10104 + }, + { + "epoch": 1.430593898209103, + "grad_norm": 8.835747107116521, + "learning_rate": 9.892998108038937e-07, + "loss": 1.0926, + "step": 10105 + }, + { + "epoch": 1.4307354710837403, + "grad_norm": 7.628799992459572, + "learning_rate": 9.88843141093567e-07, + "loss": 0.9021, + "step": 10106 + }, + { + "epoch": 1.4308770439583776, + "grad_norm": 9.151071587855418, + "learning_rate": 9.88386550823629e-07, + "loss": 1.0125, + "step": 10107 + }, + { + "epoch": 1.4310186168330148, + "grad_norm": 9.271177226914151, + "learning_rate": 9.879300400180844e-07, + "loss": 1.0689, + "step": 10108 + }, + { + "epoch": 1.431160189707652, + "grad_norm": 9.263903504575694, + "learning_rate": 9.874736087009285e-07, + "loss": 1.0114, + "step": 10109 + }, + { + "epoch": 1.4313017625822892, + "grad_norm": 8.977241988398314, + "learning_rate": 9.870172568961572e-07, + "loss": 0.9999, + "step": 10110 + }, + { + "epoch": 1.4314433354569265, + "grad_norm": 9.810798254481472, + "learning_rate": 9.865609846277615e-07, + "loss": 1.1067, + "step": 10111 + }, + { + "epoch": 1.4315849083315637, + "grad_norm": 8.709787883080834, + "learning_rate": 9.861047919197254e-07, + "loss": 0.907, + "step": 10112 + }, + { + "epoch": 1.431726481206201, + "grad_norm": 8.399370406034814, + "learning_rate": 9.856486787960326e-07, + "loss": 1.0108, + "step": 10113 + }, + { + 
"epoch": 1.4318680540808382, + "grad_norm": 9.965451642571434, + "learning_rate": 9.851926452806584e-07, + "loss": 1.1269, + "step": 10114 + }, + { + "epoch": 1.4320096269554754, + "grad_norm": 9.095986841653584, + "learning_rate": 9.847366913975787e-07, + "loss": 0.9848, + "step": 10115 + }, + { + "epoch": 1.4321511998301126, + "grad_norm": 11.005586765978293, + "learning_rate": 9.842808171707602e-07, + "loss": 1.0114, + "step": 10116 + }, + { + "epoch": 1.4322927727047499, + "grad_norm": 10.437210170353795, + "learning_rate": 9.838250226241696e-07, + "loss": 0.9445, + "step": 10117 + }, + { + "epoch": 1.432434345579387, + "grad_norm": 7.588091716286473, + "learning_rate": 9.833693077817666e-07, + "loss": 0.8942, + "step": 10118 + }, + { + "epoch": 1.4325759184540243, + "grad_norm": 9.773917182598037, + "learning_rate": 9.82913672667509e-07, + "loss": 0.9805, + "step": 10119 + }, + { + "epoch": 1.4327174913286616, + "grad_norm": 10.75490121225896, + "learning_rate": 9.824581173053483e-07, + "loss": 0.9603, + "step": 10120 + }, + { + "epoch": 1.4328590642032988, + "grad_norm": 9.059269612157578, + "learning_rate": 9.820026417192322e-07, + "loss": 1.0621, + "step": 10121 + }, + { + "epoch": 1.4330006370779358, + "grad_norm": 8.892556890692306, + "learning_rate": 9.815472459331061e-07, + "loss": 1.0997, + "step": 10122 + }, + { + "epoch": 1.433142209952573, + "grad_norm": 9.914791624464074, + "learning_rate": 9.81091929970908e-07, + "loss": 1.0054, + "step": 10123 + }, + { + "epoch": 1.4332837828272103, + "grad_norm": 8.834583074104954, + "learning_rate": 9.806366938565756e-07, + "loss": 0.8961, + "step": 10124 + }, + { + "epoch": 1.4334253557018475, + "grad_norm": 9.276139313264006, + "learning_rate": 9.801815376140385e-07, + "loss": 0.8764, + "step": 10125 + }, + { + "epoch": 1.4335669285764847, + "grad_norm": 8.589605268261796, + "learning_rate": 9.797264612672256e-07, + "loss": 0.9172, + "step": 10126 + }, + { + "epoch": 1.433708501451122, + "grad_norm": 8.905071622291677, + "learning_rate": 9.792714648400584e-07, + "loss": 0.9662, + "step": 10127 + }, + { + "epoch": 1.4338500743257592, + "grad_norm": 12.152013516085193, + "learning_rate": 9.78816548356456e-07, + "loss": 0.9928, + "step": 10128 + }, + { + "epoch": 1.4339916472003964, + "grad_norm": 8.583659736821996, + "learning_rate": 9.783617118403354e-07, + "loss": 0.9449, + "step": 10129 + }, + { + "epoch": 1.4341332200750336, + "grad_norm": 8.487785089617848, + "learning_rate": 9.779069553156031e-07, + "loss": 0.9717, + "step": 10130 + }, + { + "epoch": 1.4342747929496709, + "grad_norm": 8.896203729451031, + "learning_rate": 9.774522788061685e-07, + "loss": 1.0395, + "step": 10131 + }, + { + "epoch": 1.434416365824308, + "grad_norm": 9.578796462750667, + "learning_rate": 9.769976823359311e-07, + "loss": 1.0201, + "step": 10132 + }, + { + "epoch": 1.4345579386989453, + "grad_norm": 9.82446414951957, + "learning_rate": 9.765431659287901e-07, + "loss": 0.9283, + "step": 10133 + }, + { + "epoch": 1.4346995115735826, + "grad_norm": 9.513966382926167, + "learning_rate": 9.760887296086397e-07, + "loss": 1.0969, + "step": 10134 + }, + { + "epoch": 1.4348410844482198, + "grad_norm": 7.550855514863707, + "learning_rate": 9.756343733993679e-07, + "loss": 0.926, + "step": 10135 + }, + { + "epoch": 1.434982657322857, + "grad_norm": 9.021198738103672, + "learning_rate": 9.75180097324861e-07, + "loss": 0.8644, + "step": 10136 + }, + { + "epoch": 1.435124230197494, + "grad_norm": 9.308619567604776, + "learning_rate": 9.747259014089988e-07, + 
"loss": 1.0022, + "step": 10137 + }, + { + "epoch": 1.4352658030721313, + "grad_norm": 8.338777467696431, + "learning_rate": 9.742717856756595e-07, + "loss": 0.9491, + "step": 10138 + }, + { + "epoch": 1.4354073759467685, + "grad_norm": 10.04302914399869, + "learning_rate": 9.738177501487137e-07, + "loss": 0.9359, + "step": 10139 + }, + { + "epoch": 1.4355489488214057, + "grad_norm": 10.10248690665911, + "learning_rate": 9.73363794852032e-07, + "loss": 1.049, + "step": 10140 + }, + { + "epoch": 1.435690521696043, + "grad_norm": 8.272763838101273, + "learning_rate": 9.729099198094771e-07, + "loss": 1.0008, + "step": 10141 + }, + { + "epoch": 1.4358320945706802, + "grad_norm": 7.930905704224874, + "learning_rate": 9.724561250449082e-07, + "loss": 0.908, + "step": 10142 + }, + { + "epoch": 1.4359736674453174, + "grad_norm": 8.68058578553764, + "learning_rate": 9.720024105821827e-07, + "loss": 1.0109, + "step": 10143 + }, + { + "epoch": 1.4361152403199546, + "grad_norm": 10.156219951878628, + "learning_rate": 9.715487764451504e-07, + "loss": 1.1139, + "step": 10144 + }, + { + "epoch": 1.4362568131945919, + "grad_norm": 8.415691306628174, + "learning_rate": 9.7109522265766e-07, + "loss": 0.9177, + "step": 10145 + }, + { + "epoch": 1.436398386069229, + "grad_norm": 8.948381109178221, + "learning_rate": 9.70641749243553e-07, + "loss": 0.9136, + "step": 10146 + }, + { + "epoch": 1.4365399589438663, + "grad_norm": 9.348628272957349, + "learning_rate": 9.701883562266696e-07, + "loss": 1.0116, + "step": 10147 + }, + { + "epoch": 1.4366815318185036, + "grad_norm": 9.945496994189169, + "learning_rate": 9.697350436308428e-07, + "loss": 0.9582, + "step": 10148 + }, + { + "epoch": 1.4368231046931408, + "grad_norm": 8.456042450855131, + "learning_rate": 9.692818114799038e-07, + "loss": 0.9232, + "step": 10149 + }, + { + "epoch": 1.436964677567778, + "grad_norm": 8.40226929302203, + "learning_rate": 9.688286597976804e-07, + "loss": 1.0238, + "step": 10150 + }, + { + "epoch": 1.4371062504424152, + "grad_norm": 9.053736586581213, + "learning_rate": 9.68375588607991e-07, + "loss": 0.9766, + "step": 10151 + }, + { + "epoch": 1.4372478233170525, + "grad_norm": 9.667902856467366, + "learning_rate": 9.679225979346558e-07, + "loss": 0.9836, + "step": 10152 + }, + { + "epoch": 1.4373893961916897, + "grad_norm": 8.876065472327419, + "learning_rate": 9.674696878014862e-07, + "loss": 0.912, + "step": 10153 + }, + { + "epoch": 1.437530969066327, + "grad_norm": 9.03167744717257, + "learning_rate": 9.67016858232293e-07, + "loss": 0.9116, + "step": 10154 + }, + { + "epoch": 1.4376725419409642, + "grad_norm": 10.228876072212605, + "learning_rate": 9.6656410925088e-07, + "loss": 0.957, + "step": 10155 + }, + { + "epoch": 1.4378141148156014, + "grad_norm": 9.11797030634144, + "learning_rate": 9.661114408810485e-07, + "loss": 1.0945, + "step": 10156 + }, + { + "epoch": 1.4379556876902386, + "grad_norm": 8.17628514632075, + "learning_rate": 9.656588531465954e-07, + "loss": 0.9836, + "step": 10157 + }, + { + "epoch": 1.4380972605648759, + "grad_norm": 9.925322358784184, + "learning_rate": 9.652063460713117e-07, + "loss": 1.0632, + "step": 10158 + }, + { + "epoch": 1.438238833439513, + "grad_norm": 8.436118683950992, + "learning_rate": 9.647539196789868e-07, + "loss": 0.9628, + "step": 10159 + }, + { + "epoch": 1.4383804063141503, + "grad_norm": 9.752125043860634, + "learning_rate": 9.643015739934027e-07, + "loss": 0.9117, + "step": 10160 + }, + { + "epoch": 1.4385219791887875, + "grad_norm": 8.031487962336753, + "learning_rate": 
9.638493090383408e-07, + "loss": 0.8945, + "step": 10161 + }, + { + "epoch": 1.4386635520634248, + "grad_norm": 11.1454981863431, + "learning_rate": 9.633971248375753e-07, + "loss": 1.0763, + "step": 10162 + }, + { + "epoch": 1.4388051249380618, + "grad_norm": 11.155185509767385, + "learning_rate": 9.629450214148764e-07, + "loss": 1.0194, + "step": 10163 + }, + { + "epoch": 1.438946697812699, + "grad_norm": 11.185420178641207, + "learning_rate": 9.624929987940124e-07, + "loss": 1.1827, + "step": 10164 + }, + { + "epoch": 1.4390882706873362, + "grad_norm": 10.7559400603588, + "learning_rate": 9.62041056998744e-07, + "loss": 1.0162, + "step": 10165 + }, + { + "epoch": 1.4392298435619735, + "grad_norm": 9.541099387106732, + "learning_rate": 9.615891960528314e-07, + "loss": 1.0797, + "step": 10166 + }, + { + "epoch": 1.4393714164366107, + "grad_norm": 8.998176813895196, + "learning_rate": 9.611374159800272e-07, + "loss": 1.0409, + "step": 10167 + }, + { + "epoch": 1.439512989311248, + "grad_norm": 8.214182930060426, + "learning_rate": 9.60685716804082e-07, + "loss": 1.0057, + "step": 10168 + }, + { + "epoch": 1.4396545621858852, + "grad_norm": 8.889231701704148, + "learning_rate": 9.6023409854874e-07, + "loss": 0.9224, + "step": 10169 + }, + { + "epoch": 1.4397961350605224, + "grad_norm": 9.757438756353245, + "learning_rate": 9.597825612377448e-07, + "loss": 1.0457, + "step": 10170 + }, + { + "epoch": 1.4399377079351596, + "grad_norm": 8.103614716419331, + "learning_rate": 9.593311048948306e-07, + "loss": 1.0097, + "step": 10171 + }, + { + "epoch": 1.4400792808097969, + "grad_norm": 9.10573687132055, + "learning_rate": 9.588797295437324e-07, + "loss": 1.0008, + "step": 10172 + }, + { + "epoch": 1.440220853684434, + "grad_norm": 11.122185854789361, + "learning_rate": 9.584284352081777e-07, + "loss": 1.0495, + "step": 10173 + }, + { + "epoch": 1.4403624265590713, + "grad_norm": 8.380232813111645, + "learning_rate": 9.579772219118899e-07, + "loss": 0.9646, + "step": 10174 + }, + { + "epoch": 1.4405039994337085, + "grad_norm": 9.520329056510509, + "learning_rate": 9.575260896785907e-07, + "loss": 1.0225, + "step": 10175 + }, + { + "epoch": 1.4406455723083458, + "grad_norm": 8.251870145662673, + "learning_rate": 9.570750385319939e-07, + "loss": 1.036, + "step": 10176 + }, + { + "epoch": 1.440787145182983, + "grad_norm": 10.475369219771078, + "learning_rate": 9.566240684958128e-07, + "loss": 0.9305, + "step": 10177 + }, + { + "epoch": 1.4409287180576202, + "grad_norm": 9.138217498121215, + "learning_rate": 9.561731795937526e-07, + "loss": 1.0148, + "step": 10178 + }, + { + "epoch": 1.4410702909322572, + "grad_norm": 10.359961583718025, + "learning_rate": 9.557223718495173e-07, + "loss": 1.0224, + "step": 10179 + }, + { + "epoch": 1.4412118638068945, + "grad_norm": 9.289882152126077, + "learning_rate": 9.552716452868064e-07, + "loss": 1.0684, + "step": 10180 + }, + { + "epoch": 1.4413534366815317, + "grad_norm": 9.263026781842575, + "learning_rate": 9.548209999293122e-07, + "loss": 1.0122, + "step": 10181 + }, + { + "epoch": 1.441495009556169, + "grad_norm": 9.725376825950667, + "learning_rate": 9.543704358007281e-07, + "loss": 0.9418, + "step": 10182 + }, + { + "epoch": 1.4416365824308062, + "grad_norm": 8.419540619630354, + "learning_rate": 9.539199529247356e-07, + "loss": 1.0594, + "step": 10183 + }, + { + "epoch": 1.4417781553054434, + "grad_norm": 9.789206684143656, + "learning_rate": 9.534695513250183e-07, + "loss": 1.0601, + "step": 10184 + }, + { + "epoch": 1.4419197281800806, + "grad_norm": 
10.288702122105677, + "learning_rate": 9.530192310252548e-07, + "loss": 0.9696, + "step": 10185 + }, + { + "epoch": 1.4420613010547179, + "grad_norm": 10.25360899866767, + "learning_rate": 9.525689920491157e-07, + "loss": 1.0402, + "step": 10186 + }, + { + "epoch": 1.442202873929355, + "grad_norm": 8.224429644720479, + "learning_rate": 9.521188344202717e-07, + "loss": 1.0821, + "step": 10187 + }, + { + "epoch": 1.4423444468039923, + "grad_norm": 8.729037651862702, + "learning_rate": 9.516687581623857e-07, + "loss": 0.9145, + "step": 10188 + }, + { + "epoch": 1.4424860196786295, + "grad_norm": 8.20074556264265, + "learning_rate": 9.512187632991193e-07, + "loss": 0.9674, + "step": 10189 + }, + { + "epoch": 1.4426275925532668, + "grad_norm": 7.856680982811624, + "learning_rate": 9.50768849854127e-07, + "loss": 0.9765, + "step": 10190 + }, + { + "epoch": 1.442769165427904, + "grad_norm": 11.524594089203248, + "learning_rate": 9.503190178510618e-07, + "loss": 1.0368, + "step": 10191 + }, + { + "epoch": 1.4429107383025412, + "grad_norm": 12.096201251500277, + "learning_rate": 9.498692673135698e-07, + "loss": 0.8879, + "step": 10192 + }, + { + "epoch": 1.4430523111771785, + "grad_norm": 8.67950334877219, + "learning_rate": 9.494195982652951e-07, + "loss": 0.9537, + "step": 10193 + }, + { + "epoch": 1.4431938840518157, + "grad_norm": 9.694411439544934, + "learning_rate": 9.489700107298763e-07, + "loss": 0.9988, + "step": 10194 + }, + { + "epoch": 1.443335456926453, + "grad_norm": 10.172832016231702, + "learning_rate": 9.485205047309465e-07, + "loss": 1.0809, + "step": 10195 + }, + { + "epoch": 1.4434770298010902, + "grad_norm": 11.031393163686804, + "learning_rate": 9.480710802921377e-07, + "loss": 0.9861, + "step": 10196 + }, + { + "epoch": 1.4436186026757274, + "grad_norm": 9.438166512706905, + "learning_rate": 9.476217374370741e-07, + "loss": 0.9313, + "step": 10197 + }, + { + "epoch": 1.4437601755503646, + "grad_norm": 8.82821012472787, + "learning_rate": 9.471724761893794e-07, + "loss": 0.927, + "step": 10198 + }, + { + "epoch": 1.4439017484250019, + "grad_norm": 9.487641577989882, + "learning_rate": 9.467232965726689e-07, + "loss": 0.912, + "step": 10199 + }, + { + "epoch": 1.444043321299639, + "grad_norm": 8.309961375893852, + "learning_rate": 9.462741986105573e-07, + "loss": 0.9259, + "step": 10200 + }, + { + "epoch": 1.4441848941742763, + "grad_norm": 9.890988311603317, + "learning_rate": 9.458251823266518e-07, + "loss": 0.9539, + "step": 10201 + }, + { + "epoch": 1.4443264670489135, + "grad_norm": 11.753212996754135, + "learning_rate": 9.453762477445574e-07, + "loss": 0.9804, + "step": 10202 + }, + { + "epoch": 1.4444680399235508, + "grad_norm": 8.55331618931936, + "learning_rate": 9.449273948878762e-07, + "loss": 1.0172, + "step": 10203 + }, + { + "epoch": 1.444609612798188, + "grad_norm": 10.043370610153278, + "learning_rate": 9.444786237802009e-07, + "loss": 0.9363, + "step": 10204 + }, + { + "epoch": 1.444751185672825, + "grad_norm": 11.931448635967653, + "learning_rate": 9.440299344451251e-07, + "loss": 0.9206, + "step": 10205 + }, + { + "epoch": 1.4448927585474622, + "grad_norm": 7.771798027096086, + "learning_rate": 9.435813269062349e-07, + "loss": 0.9382, + "step": 10206 + }, + { + "epoch": 1.4450343314220995, + "grad_norm": 9.457513670058106, + "learning_rate": 9.431328011871135e-07, + "loss": 0.9953, + "step": 10207 + }, + { + "epoch": 1.4451759042967367, + "grad_norm": 9.211657262219516, + "learning_rate": 9.426843573113409e-07, + "loss": 0.9446, + "step": 10208 + }, + { + 
"epoch": 1.445317477171374, + "grad_norm": 9.189155273799441, + "learning_rate": 9.422359953024895e-07, + "loss": 0.9659, + "step": 10209 + }, + { + "epoch": 1.4454590500460112, + "grad_norm": 9.934925441606165, + "learning_rate": 9.417877151841315e-07, + "loss": 1.0852, + "step": 10210 + }, + { + "epoch": 1.4456006229206484, + "grad_norm": 9.33438263398897, + "learning_rate": 9.413395169798303e-07, + "loss": 0.8961, + "step": 10211 + }, + { + "epoch": 1.4457421957952856, + "grad_norm": 10.786387405395624, + "learning_rate": 9.408914007131495e-07, + "loss": 1.0328, + "step": 10212 + }, + { + "epoch": 1.4458837686699229, + "grad_norm": 10.68875096902496, + "learning_rate": 9.404433664076442e-07, + "loss": 1.0679, + "step": 10213 + }, + { + "epoch": 1.44602534154456, + "grad_norm": 10.323479373708969, + "learning_rate": 9.399954140868695e-07, + "loss": 1.1236, + "step": 10214 + }, + { + "epoch": 1.4461669144191973, + "grad_norm": 9.461111682954797, + "learning_rate": 9.395475437743723e-07, + "loss": 0.9808, + "step": 10215 + }, + { + "epoch": 1.4463084872938345, + "grad_norm": 8.400734605910683, + "learning_rate": 9.390997554936964e-07, + "loss": 0.9493, + "step": 10216 + }, + { + "epoch": 1.4464500601684718, + "grad_norm": 10.139695793569588, + "learning_rate": 9.386520492683835e-07, + "loss": 0.9693, + "step": 10217 + }, + { + "epoch": 1.446591633043109, + "grad_norm": 8.921125871570597, + "learning_rate": 9.382044251219672e-07, + "loss": 0.9745, + "step": 10218 + }, + { + "epoch": 1.4467332059177462, + "grad_norm": 9.114856673728731, + "learning_rate": 9.377568830779807e-07, + "loss": 1.0036, + "step": 10219 + }, + { + "epoch": 1.4468747787923832, + "grad_norm": 8.685023112510626, + "learning_rate": 9.373094231599491e-07, + "loss": 0.964, + "step": 10220 + }, + { + "epoch": 1.4470163516670205, + "grad_norm": 10.959902420166445, + "learning_rate": 9.368620453913968e-07, + "loss": 1.0837, + "step": 10221 + }, + { + "epoch": 1.4471579245416577, + "grad_norm": 9.199577023278112, + "learning_rate": 9.364147497958404e-07, + "loss": 0.9574, + "step": 10222 + }, + { + "epoch": 1.447299497416295, + "grad_norm": 9.696344878556365, + "learning_rate": 9.359675363967958e-07, + "loss": 1.0083, + "step": 10223 + }, + { + "epoch": 1.4474410702909322, + "grad_norm": 10.3569196780358, + "learning_rate": 9.355204052177705e-07, + "loss": 0.9051, + "step": 10224 + }, + { + "epoch": 1.4475826431655694, + "grad_norm": 10.30378937874428, + "learning_rate": 9.350733562822717e-07, + "loss": 1.0302, + "step": 10225 + }, + { + "epoch": 1.4477242160402066, + "grad_norm": 9.498034223657521, + "learning_rate": 9.346263896138e-07, + "loss": 1.0057, + "step": 10226 + }, + { + "epoch": 1.4478657889148439, + "grad_norm": 9.298256446076959, + "learning_rate": 9.341795052358507e-07, + "loss": 0.9409, + "step": 10227 + }, + { + "epoch": 1.448007361789481, + "grad_norm": 10.521489994719587, + "learning_rate": 9.337327031719185e-07, + "loss": 1.047, + "step": 10228 + }, + { + "epoch": 1.4481489346641183, + "grad_norm": 9.97744725545325, + "learning_rate": 9.332859834454891e-07, + "loss": 1.0758, + "step": 10229 + }, + { + "epoch": 1.4482905075387555, + "grad_norm": 9.455967499420547, + "learning_rate": 9.328393460800475e-07, + "loss": 0.9993, + "step": 10230 + }, + { + "epoch": 1.4484320804133928, + "grad_norm": 10.031317309572659, + "learning_rate": 9.323927910990735e-07, + "loss": 0.9294, + "step": 10231 + }, + { + "epoch": 1.44857365328803, + "grad_norm": 9.12251728720058, + "learning_rate": 9.31946318526041e-07, + "loss": 
0.8772, + "step": 10232 + }, + { + "epoch": 1.4487152261626672, + "grad_norm": 12.111008584369184, + "learning_rate": 9.314999283844223e-07, + "loss": 1.0386, + "step": 10233 + }, + { + "epoch": 1.4488567990373045, + "grad_norm": 7.74814534299038, + "learning_rate": 9.310536206976819e-07, + "loss": 0.9548, + "step": 10234 + }, + { + "epoch": 1.4489983719119417, + "grad_norm": 8.937071890014378, + "learning_rate": 9.306073954892844e-07, + "loss": 1.0806, + "step": 10235 + }, + { + "epoch": 1.449139944786579, + "grad_norm": 10.00324692465624, + "learning_rate": 9.301612527826844e-07, + "loss": 0.9316, + "step": 10236 + }, + { + "epoch": 1.4492815176612162, + "grad_norm": 9.843357817088352, + "learning_rate": 9.297151926013368e-07, + "loss": 1.0034, + "step": 10237 + }, + { + "epoch": 1.4494230905358534, + "grad_norm": 12.762827086857497, + "learning_rate": 9.292692149686913e-07, + "loss": 1.078, + "step": 10238 + }, + { + "epoch": 1.4495646634104906, + "grad_norm": 9.387819197875094, + "learning_rate": 9.288233199081914e-07, + "loss": 1.0899, + "step": 10239 + }, + { + "epoch": 1.4497062362851278, + "grad_norm": 11.101371103095884, + "learning_rate": 9.283775074432788e-07, + "loss": 0.9459, + "step": 10240 + }, + { + "epoch": 1.449847809159765, + "grad_norm": 7.608120103949022, + "learning_rate": 9.279317775973879e-07, + "loss": 0.8648, + "step": 10241 + }, + { + "epoch": 1.4499893820344023, + "grad_norm": 8.648416327887363, + "learning_rate": 9.274861303939523e-07, + "loss": 0.9715, + "step": 10242 + }, + { + "epoch": 1.4501309549090395, + "grad_norm": 9.187961281335713, + "learning_rate": 9.270405658563972e-07, + "loss": 1.0015, + "step": 10243 + }, + { + "epoch": 1.4502725277836768, + "grad_norm": 9.104924523230848, + "learning_rate": 9.265950840081475e-07, + "loss": 0.9995, + "step": 10244 + }, + { + "epoch": 1.450414100658314, + "grad_norm": 10.651698418525505, + "learning_rate": 9.261496848726204e-07, + "loss": 1.0393, + "step": 10245 + }, + { + "epoch": 1.450555673532951, + "grad_norm": 8.721940745396688, + "learning_rate": 9.257043684732316e-07, + "loss": 1.0426, + "step": 10246 + }, + { + "epoch": 1.4506972464075882, + "grad_norm": 8.739087520777028, + "learning_rate": 9.252591348333906e-07, + "loss": 0.9123, + "step": 10247 + }, + { + "epoch": 1.4508388192822255, + "grad_norm": 9.904786973614112, + "learning_rate": 9.248139839765013e-07, + "loss": 1.0077, + "step": 10248 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 8.151563211316338, + "learning_rate": 9.243689159259677e-07, + "loss": 0.9692, + "step": 10249 + }, + { + "epoch": 1.4511219650315, + "grad_norm": 11.138781979608803, + "learning_rate": 9.239239307051842e-07, + "loss": 1.0883, + "step": 10250 + }, + { + "epoch": 1.4512635379061372, + "grad_norm": 10.237834875570767, + "learning_rate": 9.234790283375456e-07, + "loss": 1.025, + "step": 10251 + }, + { + "epoch": 1.4514051107807744, + "grad_norm": 9.32133280052926, + "learning_rate": 9.230342088464381e-07, + "loss": 0.9356, + "step": 10252 + }, + { + "epoch": 1.4515466836554116, + "grad_norm": 9.167182722882043, + "learning_rate": 9.225894722552462e-07, + "loss": 0.9286, + "step": 10253 + }, + { + "epoch": 1.4516882565300488, + "grad_norm": 8.769486880455684, + "learning_rate": 9.221448185873505e-07, + "loss": 1.0286, + "step": 10254 + }, + { + "epoch": 1.451829829404686, + "grad_norm": 9.619096010366079, + "learning_rate": 9.217002478661244e-07, + "loss": 0.9794, + "step": 10255 + }, + { + "epoch": 1.4519714022793233, + "grad_norm": 8.309915929706452, + 
"learning_rate": 9.212557601149411e-07, + "loss": 0.9966, + "step": 10256 + }, + { + "epoch": 1.4521129751539605, + "grad_norm": 7.411023368233429, + "learning_rate": 9.208113553571638e-07, + "loss": 0.8686, + "step": 10257 + }, + { + "epoch": 1.4522545480285978, + "grad_norm": 10.867216284334633, + "learning_rate": 9.203670336161558e-07, + "loss": 1.1452, + "step": 10258 + }, + { + "epoch": 1.452396120903235, + "grad_norm": 9.586642238637108, + "learning_rate": 9.199227949152758e-07, + "loss": 0.9755, + "step": 10259 + }, + { + "epoch": 1.4525376937778722, + "grad_norm": 10.952414496392166, + "learning_rate": 9.194786392778757e-07, + "loss": 0.9568, + "step": 10260 + }, + { + "epoch": 1.4526792666525095, + "grad_norm": 9.191006983731434, + "learning_rate": 9.190345667273059e-07, + "loss": 0.9026, + "step": 10261 + }, + { + "epoch": 1.4528208395271465, + "grad_norm": 8.497243097545761, + "learning_rate": 9.185905772869091e-07, + "loss": 0.9793, + "step": 10262 + }, + { + "epoch": 1.4529624124017837, + "grad_norm": 8.225017753976138, + "learning_rate": 9.181466709800274e-07, + "loss": 0.9565, + "step": 10263 + }, + { + "epoch": 1.453103985276421, + "grad_norm": 8.998337062216214, + "learning_rate": 9.177028478299948e-07, + "loss": 1.0158, + "step": 10264 + }, + { + "epoch": 1.4532455581510582, + "grad_norm": 8.125523007505905, + "learning_rate": 9.172591078601448e-07, + "loss": 0.9665, + "step": 10265 + }, + { + "epoch": 1.4533871310256954, + "grad_norm": 8.93843724598381, + "learning_rate": 9.168154510938024e-07, + "loss": 1.0247, + "step": 10266 + }, + { + "epoch": 1.4535287039003326, + "grad_norm": 8.863685380137678, + "learning_rate": 9.163718775542921e-07, + "loss": 1.0645, + "step": 10267 + }, + { + "epoch": 1.4536702767749698, + "grad_norm": 10.802129634413387, + "learning_rate": 9.159283872649313e-07, + "loss": 0.944, + "step": 10268 + }, + { + "epoch": 1.453811849649607, + "grad_norm": 8.078870137599397, + "learning_rate": 9.154849802490332e-07, + "loss": 1.0331, + "step": 10269 + }, + { + "epoch": 1.4539534225242443, + "grad_norm": 11.273327864591632, + "learning_rate": 9.150416565299092e-07, + "loss": 0.9304, + "step": 10270 + }, + { + "epoch": 1.4540949953988815, + "grad_norm": 9.075336073860605, + "learning_rate": 9.145984161308627e-07, + "loss": 1.0268, + "step": 10271 + }, + { + "epoch": 1.4542365682735188, + "grad_norm": 10.688450821191594, + "learning_rate": 9.14155259075196e-07, + "loss": 1.0567, + "step": 10272 + }, + { + "epoch": 1.454378141148156, + "grad_norm": 10.94368547195068, + "learning_rate": 9.137121853862041e-07, + "loss": 1.0636, + "step": 10273 + }, + { + "epoch": 1.4545197140227932, + "grad_norm": 8.336908056930161, + "learning_rate": 9.132691950871808e-07, + "loss": 0.9923, + "step": 10274 + }, + { + "epoch": 1.4546612868974305, + "grad_norm": 8.234006232847904, + "learning_rate": 9.128262882014117e-07, + "loss": 0.9921, + "step": 10275 + }, + { + "epoch": 1.4548028597720677, + "grad_norm": 7.451211807723926, + "learning_rate": 9.123834647521812e-07, + "loss": 0.958, + "step": 10276 + }, + { + "epoch": 1.454944432646705, + "grad_norm": 8.418975160981915, + "learning_rate": 9.119407247627701e-07, + "loss": 0.9784, + "step": 10277 + }, + { + "epoch": 1.4550860055213422, + "grad_norm": 11.276553880482885, + "learning_rate": 9.114980682564492e-07, + "loss": 0.9813, + "step": 10278 + }, + { + "epoch": 1.4552275783959794, + "grad_norm": 8.944725263885232, + "learning_rate": 9.110554952564912e-07, + "loss": 1.0743, + "step": 10279 + }, + { + "epoch": 
1.4553691512706166, + "grad_norm": 8.08056556535228, + "learning_rate": 9.106130057861604e-07, + "loss": 1.0248, + "step": 10280 + }, + { + "epoch": 1.4555107241452538, + "grad_norm": 9.607122290574566, + "learning_rate": 9.101705998687185e-07, + "loss": 1.0776, + "step": 10281 + }, + { + "epoch": 1.455652297019891, + "grad_norm": 11.489546295318437, + "learning_rate": 9.097282775274238e-07, + "loss": 0.9292, + "step": 10282 + }, + { + "epoch": 1.4557938698945283, + "grad_norm": 7.6867799499228, + "learning_rate": 9.092860387855271e-07, + "loss": 0.82, + "step": 10283 + }, + { + "epoch": 1.4559354427691655, + "grad_norm": 10.766239408946012, + "learning_rate": 9.088438836662777e-07, + "loss": 1.0324, + "step": 10284 + }, + { + "epoch": 1.4560770156438028, + "grad_norm": 10.48399586567657, + "learning_rate": 9.084018121929184e-07, + "loss": 1.0342, + "step": 10285 + }, + { + "epoch": 1.45621858851844, + "grad_norm": 9.449203409195897, + "learning_rate": 9.079598243886897e-07, + "loss": 0.971, + "step": 10286 + }, + { + "epoch": 1.4563601613930772, + "grad_norm": 9.016401392994316, + "learning_rate": 9.075179202768253e-07, + "loss": 0.9794, + "step": 10287 + }, + { + "epoch": 1.4565017342677142, + "grad_norm": 9.294997594708915, + "learning_rate": 9.070760998805569e-07, + "loss": 0.9699, + "step": 10288 + }, + { + "epoch": 1.4566433071423515, + "grad_norm": 7.553346753603715, + "learning_rate": 9.066343632231106e-07, + "loss": 0.8835, + "step": 10289 + }, + { + "epoch": 1.4567848800169887, + "grad_norm": 9.167247222086866, + "learning_rate": 9.061927103277068e-07, + "loss": 0.9342, + "step": 10290 + }, + { + "epoch": 1.456926452891626, + "grad_norm": 9.547621377841152, + "learning_rate": 9.057511412175646e-07, + "loss": 0.9632, + "step": 10291 + }, + { + "epoch": 1.4570680257662632, + "grad_norm": 10.595958558269029, + "learning_rate": 9.053096559158956e-07, + "loss": 1.0458, + "step": 10292 + }, + { + "epoch": 1.4572095986409004, + "grad_norm": 7.651541122085717, + "learning_rate": 9.048682544459094e-07, + "loss": 0.9265, + "step": 10293 + }, + { + "epoch": 1.4573511715155376, + "grad_norm": 8.34434798594515, + "learning_rate": 9.044269368308089e-07, + "loss": 1.0115, + "step": 10294 + }, + { + "epoch": 1.4574927443901748, + "grad_norm": 9.884672425649843, + "learning_rate": 9.039857030937957e-07, + "loss": 0.9962, + "step": 10295 + }, + { + "epoch": 1.457634317264812, + "grad_norm": 8.845834007987884, + "learning_rate": 9.03544553258063e-07, + "loss": 0.8919, + "step": 10296 + }, + { + "epoch": 1.4577758901394493, + "grad_norm": 9.916428592506147, + "learning_rate": 9.031034873468039e-07, + "loss": 0.9861, + "step": 10297 + }, + { + "epoch": 1.4579174630140865, + "grad_norm": 9.658824984719892, + "learning_rate": 9.026625053832028e-07, + "loss": 1.0353, + "step": 10298 + }, + { + "epoch": 1.4580590358887238, + "grad_norm": 9.912510190479866, + "learning_rate": 9.022216073904433e-07, + "loss": 0.9936, + "step": 10299 + }, + { + "epoch": 1.458200608763361, + "grad_norm": 8.7262958427813, + "learning_rate": 9.017807933917027e-07, + "loss": 0.9915, + "step": 10300 + }, + { + "epoch": 1.4583421816379982, + "grad_norm": 8.889459999599113, + "learning_rate": 9.013400634101535e-07, + "loss": 0.9044, + "step": 10301 + }, + { + "epoch": 1.4584837545126355, + "grad_norm": 8.784767613819309, + "learning_rate": 9.008994174689659e-07, + "loss": 0.9208, + "step": 10302 + }, + { + "epoch": 1.4586253273872725, + "grad_norm": 8.889101243131643, + "learning_rate": 9.004588555913027e-07, + "loss": 0.9879, + 
"step": 10303 + }, + { + "epoch": 1.4587669002619097, + "grad_norm": 8.08788425764963, + "learning_rate": 9.000183778003246e-07, + "loss": 0.9507, + "step": 10304 + }, + { + "epoch": 1.458908473136547, + "grad_norm": 9.67694264704202, + "learning_rate": 8.995779841191884e-07, + "loss": 1.0461, + "step": 10305 + }, + { + "epoch": 1.4590500460111842, + "grad_norm": 11.704667319516043, + "learning_rate": 8.991376745710436e-07, + "loss": 1.0189, + "step": 10306 + }, + { + "epoch": 1.4591916188858214, + "grad_norm": 8.35185117017479, + "learning_rate": 8.986974491790381e-07, + "loss": 0.986, + "step": 10307 + }, + { + "epoch": 1.4593331917604586, + "grad_norm": 9.958570107766908, + "learning_rate": 8.982573079663132e-07, + "loss": 0.9673, + "step": 10308 + }, + { + "epoch": 1.4594747646350958, + "grad_norm": 10.258943331256187, + "learning_rate": 8.978172509560087e-07, + "loss": 1.1126, + "step": 10309 + }, + { + "epoch": 1.459616337509733, + "grad_norm": 8.736102156911633, + "learning_rate": 8.973772781712553e-07, + "loss": 1.0102, + "step": 10310 + }, + { + "epoch": 1.4597579103843703, + "grad_norm": 7.820611267785059, + "learning_rate": 8.969373896351833e-07, + "loss": 0.9388, + "step": 10311 + }, + { + "epoch": 1.4598994832590075, + "grad_norm": 9.637515563556459, + "learning_rate": 8.964975853709179e-07, + "loss": 0.9051, + "step": 10312 + }, + { + "epoch": 1.4600410561336448, + "grad_norm": 9.168353208811462, + "learning_rate": 8.960578654015783e-07, + "loss": 1.0697, + "step": 10313 + }, + { + "epoch": 1.460182629008282, + "grad_norm": 8.883489712119445, + "learning_rate": 8.956182297502817e-07, + "loss": 0.9942, + "step": 10314 + }, + { + "epoch": 1.4603242018829192, + "grad_norm": 10.36074069818087, + "learning_rate": 8.951786784401376e-07, + "loss": 1.0869, + "step": 10315 + }, + { + "epoch": 1.4604657747575565, + "grad_norm": 10.428640660244643, + "learning_rate": 8.947392114942547e-07, + "loss": 1.0486, + "step": 10316 + }, + { + "epoch": 1.4606073476321937, + "grad_norm": 9.440235272078102, + "learning_rate": 8.942998289357333e-07, + "loss": 0.9273, + "step": 10317 + }, + { + "epoch": 1.460748920506831, + "grad_norm": 8.839606834402558, + "learning_rate": 8.938605307876738e-07, + "loss": 0.8906, + "step": 10318 + }, + { + "epoch": 1.4608904933814681, + "grad_norm": 8.80439331321412, + "learning_rate": 8.934213170731676e-07, + "loss": 0.9555, + "step": 10319 + }, + { + "epoch": 1.4610320662561054, + "grad_norm": 9.334208538255643, + "learning_rate": 8.929821878153058e-07, + "loss": 0.9483, + "step": 10320 + }, + { + "epoch": 1.4611736391307426, + "grad_norm": 7.356755594333405, + "learning_rate": 8.92543143037172e-07, + "loss": 0.9561, + "step": 10321 + }, + { + "epoch": 1.4613152120053798, + "grad_norm": 9.06419156828788, + "learning_rate": 8.921041827618459e-07, + "loss": 1.0116, + "step": 10322 + }, + { + "epoch": 1.461456784880017, + "grad_norm": 9.710599271940678, + "learning_rate": 8.916653070124048e-07, + "loss": 1.0403, + "step": 10323 + }, + { + "epoch": 1.4615983577546543, + "grad_norm": 9.747604075734978, + "learning_rate": 8.912265158119185e-07, + "loss": 1.0045, + "step": 10324 + }, + { + "epoch": 1.4617399306292915, + "grad_norm": 10.119824252204571, + "learning_rate": 8.907878091834554e-07, + "loss": 0.9501, + "step": 10325 + }, + { + "epoch": 1.4618815035039288, + "grad_norm": 9.323854628677925, + "learning_rate": 8.903491871500767e-07, + "loss": 0.9679, + "step": 10326 + }, + { + "epoch": 1.462023076378566, + "grad_norm": 10.94358682472671, + "learning_rate": 
8.899106497348409e-07, + "loss": 0.9778, + "step": 10327 + }, + { + "epoch": 1.4621646492532032, + "grad_norm": 9.041386285000208, + "learning_rate": 8.894721969608025e-07, + "loss": 0.9216, + "step": 10328 + }, + { + "epoch": 1.4623062221278402, + "grad_norm": 9.230552643900387, + "learning_rate": 8.890338288510089e-07, + "loss": 1.0482, + "step": 10329 + }, + { + "epoch": 1.4624477950024775, + "grad_norm": 9.224304388021412, + "learning_rate": 8.885955454285078e-07, + "loss": 0.8981, + "step": 10330 + }, + { + "epoch": 1.4625893678771147, + "grad_norm": 9.138755568047518, + "learning_rate": 8.881573467163354e-07, + "loss": 0.9902, + "step": 10331 + }, + { + "epoch": 1.462730940751752, + "grad_norm": 9.338056686128, + "learning_rate": 8.877192327375303e-07, + "loss": 0.9726, + "step": 10332 + }, + { + "epoch": 1.4628725136263891, + "grad_norm": 9.523112641224056, + "learning_rate": 8.872812035151221e-07, + "loss": 1.0659, + "step": 10333 + }, + { + "epoch": 1.4630140865010264, + "grad_norm": 8.470444400663201, + "learning_rate": 8.868432590721384e-07, + "loss": 1.0284, + "step": 10334 + }, + { + "epoch": 1.4631556593756636, + "grad_norm": 8.284688444478181, + "learning_rate": 8.86405399431603e-07, + "loss": 1.0402, + "step": 10335 + }, + { + "epoch": 1.4632972322503008, + "grad_norm": 10.459992255651112, + "learning_rate": 8.859676246165314e-07, + "loss": 1.0138, + "step": 10336 + }, + { + "epoch": 1.463438805124938, + "grad_norm": 8.252265850213362, + "learning_rate": 8.855299346499394e-07, + "loss": 0.8866, + "step": 10337 + }, + { + "epoch": 1.4635803779995753, + "grad_norm": 10.923530420413682, + "learning_rate": 8.850923295548339e-07, + "loss": 1.1281, + "step": 10338 + }, + { + "epoch": 1.4637219508742125, + "grad_norm": 7.998394089685505, + "learning_rate": 8.846548093542215e-07, + "loss": 0.8914, + "step": 10339 + }, + { + "epoch": 1.4638635237488498, + "grad_norm": 9.980505442691847, + "learning_rate": 8.842173740711002e-07, + "loss": 1.0034, + "step": 10340 + }, + { + "epoch": 1.464005096623487, + "grad_norm": 9.882645749581371, + "learning_rate": 8.83780023728468e-07, + "loss": 0.9885, + "step": 10341 + }, + { + "epoch": 1.4641466694981242, + "grad_norm": 9.02029292813029, + "learning_rate": 8.833427583493146e-07, + "loss": 0.9547, + "step": 10342 + }, + { + "epoch": 1.4642882423727615, + "grad_norm": 9.381727729237664, + "learning_rate": 8.829055779566262e-07, + "loss": 1.086, + "step": 10343 + }, + { + "epoch": 1.4644298152473987, + "grad_norm": 8.556117888406664, + "learning_rate": 8.824684825733865e-07, + "loss": 0.9395, + "step": 10344 + }, + { + "epoch": 1.4645713881220357, + "grad_norm": 10.116013671648554, + "learning_rate": 8.82031472222572e-07, + "loss": 0.9729, + "step": 10345 + }, + { + "epoch": 1.464712960996673, + "grad_norm": 8.879713928838463, + "learning_rate": 8.815945469271573e-07, + "loss": 0.9355, + "step": 10346 + }, + { + "epoch": 1.4648545338713101, + "grad_norm": 7.446436537986328, + "learning_rate": 8.811577067101096e-07, + "loss": 0.9128, + "step": 10347 + }, + { + "epoch": 1.4649961067459474, + "grad_norm": 9.090977196871796, + "learning_rate": 8.807209515943952e-07, + "loss": 0.9786, + "step": 10348 + }, + { + "epoch": 1.4651376796205846, + "grad_norm": 8.416903300468267, + "learning_rate": 8.802842816029722e-07, + "loss": 0.8487, + "step": 10349 + }, + { + "epoch": 1.4652792524952218, + "grad_norm": 8.746043591442817, + "learning_rate": 8.798476967587965e-07, + "loss": 0.8342, + "step": 10350 + }, + { + "epoch": 1.465420825369859, + "grad_norm": 
9.934111011281919, + "learning_rate": 8.794111970848205e-07, + "loss": 1.1222, + "step": 10351 + }, + { + "epoch": 1.4655623982444963, + "grad_norm": 11.50271508052046, + "learning_rate": 8.789747826039893e-07, + "loss": 1.0281, + "step": 10352 + }, + { + "epoch": 1.4657039711191335, + "grad_norm": 9.524881405229747, + "learning_rate": 8.785384533392452e-07, + "loss": 1.014, + "step": 10353 + }, + { + "epoch": 1.4658455439937708, + "grad_norm": 8.65966905626667, + "learning_rate": 8.78102209313525e-07, + "loss": 0.9687, + "step": 10354 + }, + { + "epoch": 1.465987116868408, + "grad_norm": 10.459773801615263, + "learning_rate": 8.776660505497619e-07, + "loss": 0.9685, + "step": 10355 + }, + { + "epoch": 1.4661286897430452, + "grad_norm": 10.21724136835375, + "learning_rate": 8.772299770708859e-07, + "loss": 1.0777, + "step": 10356 + }, + { + "epoch": 1.4662702626176825, + "grad_norm": 10.119026588396716, + "learning_rate": 8.767939888998192e-07, + "loss": 1.0351, + "step": 10357 + }, + { + "epoch": 1.4664118354923197, + "grad_norm": 10.735886291106404, + "learning_rate": 8.763580860594828e-07, + "loss": 1.0275, + "step": 10358 + }, + { + "epoch": 1.466553408366957, + "grad_norm": 11.522107974149902, + "learning_rate": 8.759222685727905e-07, + "loss": 0.94, + "step": 10359 + }, + { + "epoch": 1.4666949812415941, + "grad_norm": 12.404277039279133, + "learning_rate": 8.754865364626544e-07, + "loss": 0.9629, + "step": 10360 + }, + { + "epoch": 1.4668365541162314, + "grad_norm": 8.64937034476151, + "learning_rate": 8.750508897519791e-07, + "loss": 1.0043, + "step": 10361 + }, + { + "epoch": 1.4669781269908686, + "grad_norm": 9.215552030722687, + "learning_rate": 8.746153284636677e-07, + "loss": 0.9209, + "step": 10362 + }, + { + "epoch": 1.4671196998655058, + "grad_norm": 9.704225732654434, + "learning_rate": 8.741798526206164e-07, + "loss": 1.0775, + "step": 10363 + }, + { + "epoch": 1.467261272740143, + "grad_norm": 9.390654248082571, + "learning_rate": 8.737444622457172e-07, + "loss": 1.0019, + "step": 10364 + }, + { + "epoch": 1.4674028456147803, + "grad_norm": 10.294863514525817, + "learning_rate": 8.733091573618599e-07, + "loss": 0.9004, + "step": 10365 + }, + { + "epoch": 1.4675444184894175, + "grad_norm": 8.31510253549415, + "learning_rate": 8.728739379919266e-07, + "loss": 0.9733, + "step": 10366 + }, + { + "epoch": 1.4676859913640548, + "grad_norm": 9.722558554024028, + "learning_rate": 8.724388041587976e-07, + "loss": 1.0151, + "step": 10367 + }, + { + "epoch": 1.467827564238692, + "grad_norm": 10.901803845575389, + "learning_rate": 8.720037558853464e-07, + "loss": 1.0574, + "step": 10368 + }, + { + "epoch": 1.4679691371133292, + "grad_norm": 9.430461760104278, + "learning_rate": 8.71568793194445e-07, + "loss": 1.0714, + "step": 10369 + }, + { + "epoch": 1.4681107099879662, + "grad_norm": 6.6818410157029975, + "learning_rate": 8.711339161089568e-07, + "loss": 0.911, + "step": 10370 + }, + { + "epoch": 1.4682522828626035, + "grad_norm": 9.494472300894923, + "learning_rate": 8.706991246517449e-07, + "loss": 0.8727, + "step": 10371 + }, + { + "epoch": 1.4683938557372407, + "grad_norm": 10.185910715277716, + "learning_rate": 8.702644188456646e-07, + "loss": 0.9891, + "step": 10372 + }, + { + "epoch": 1.468535428611878, + "grad_norm": 9.57090519529497, + "learning_rate": 8.698297987135693e-07, + "loss": 1.0787, + "step": 10373 + }, + { + "epoch": 1.4686770014865151, + "grad_norm": 10.266002537768474, + "learning_rate": 8.69395264278306e-07, + "loss": 1.1111, + "step": 10374 + }, + { + 
"epoch": 1.4688185743611524, + "grad_norm": 8.972606932820497, + "learning_rate": 8.689608155627169e-07, + "loss": 0.898, + "step": 10375 + }, + { + "epoch": 1.4689601472357896, + "grad_norm": 7.956060860686356, + "learning_rate": 8.685264525896426e-07, + "loss": 0.9236, + "step": 10376 + }, + { + "epoch": 1.4691017201104268, + "grad_norm": 8.994879219469906, + "learning_rate": 8.680921753819152e-07, + "loss": 1.0083, + "step": 10377 + }, + { + "epoch": 1.469243292985064, + "grad_norm": 8.540548211041092, + "learning_rate": 8.676579839623653e-07, + "loss": 0.9582, + "step": 10378 + }, + { + "epoch": 1.4693848658597013, + "grad_norm": 9.556756582361597, + "learning_rate": 8.672238783538189e-07, + "loss": 0.916, + "step": 10379 + }, + { + "epoch": 1.4695264387343385, + "grad_norm": 8.609222133823225, + "learning_rate": 8.667898585790951e-07, + "loss": 0.9575, + "step": 10380 + }, + { + "epoch": 1.4696680116089758, + "grad_norm": 9.063283741702959, + "learning_rate": 8.663559246610115e-07, + "loss": 0.9954, + "step": 10381 + }, + { + "epoch": 1.469809584483613, + "grad_norm": 9.276208400923476, + "learning_rate": 8.659220766223778e-07, + "loss": 0.96, + "step": 10382 + }, + { + "epoch": 1.4699511573582502, + "grad_norm": 6.880284723112391, + "learning_rate": 8.654883144860038e-07, + "loss": 0.9481, + "step": 10383 + }, + { + "epoch": 1.4700927302328874, + "grad_norm": 9.816061551971332, + "learning_rate": 8.650546382746888e-07, + "loss": 1.0152, + "step": 10384 + }, + { + "epoch": 1.4702343031075247, + "grad_norm": 8.871141051448088, + "learning_rate": 8.646210480112325e-07, + "loss": 0.889, + "step": 10385 + }, + { + "epoch": 1.4703758759821617, + "grad_norm": 8.73000164787915, + "learning_rate": 8.641875437184288e-07, + "loss": 0.947, + "step": 10386 + }, + { + "epoch": 1.470517448856799, + "grad_norm": 8.724167101435501, + "learning_rate": 8.637541254190657e-07, + "loss": 1.0007, + "step": 10387 + }, + { + "epoch": 1.4706590217314361, + "grad_norm": 11.4024876072203, + "learning_rate": 8.633207931359292e-07, + "loss": 1.0317, + "step": 10388 + }, + { + "epoch": 1.4708005946060734, + "grad_norm": 8.974138189820792, + "learning_rate": 8.628875468917969e-07, + "loss": 0.9475, + "step": 10389 + }, + { + "epoch": 1.4709421674807106, + "grad_norm": 8.263005034324113, + "learning_rate": 8.624543867094468e-07, + "loss": 1.0664, + "step": 10390 + }, + { + "epoch": 1.4710837403553478, + "grad_norm": 8.44344950832356, + "learning_rate": 8.620213126116476e-07, + "loss": 0.9228, + "step": 10391 + }, + { + "epoch": 1.471225313229985, + "grad_norm": 9.277445311944085, + "learning_rate": 8.615883246211676e-07, + "loss": 0.9247, + "step": 10392 + }, + { + "epoch": 1.4713668861046223, + "grad_norm": 9.906662411934857, + "learning_rate": 8.611554227607665e-07, + "loss": 1.0225, + "step": 10393 + }, + { + "epoch": 1.4715084589792595, + "grad_norm": 8.911303134074346, + "learning_rate": 8.607226070532041e-07, + "loss": 0.9711, + "step": 10394 + }, + { + "epoch": 1.4716500318538968, + "grad_norm": 9.36477388238307, + "learning_rate": 8.602898775212317e-07, + "loss": 0.9784, + "step": 10395 + }, + { + "epoch": 1.471791604728534, + "grad_norm": 8.942006502214396, + "learning_rate": 8.598572341875975e-07, + "loss": 0.9156, + "step": 10396 + }, + { + "epoch": 1.4719331776031712, + "grad_norm": 9.840032935358769, + "learning_rate": 8.594246770750459e-07, + "loss": 1.0393, + "step": 10397 + }, + { + "epoch": 1.4720747504778084, + "grad_norm": 10.423681191289223, + "learning_rate": 8.589922062063149e-07, + "loss": 
1.0821, + "step": 10398 + }, + { + "epoch": 1.4722163233524457, + "grad_norm": 9.16582189193036, + "learning_rate": 8.58559821604141e-07, + "loss": 1.0729, + "step": 10399 + }, + { + "epoch": 1.472357896227083, + "grad_norm": 8.560465995725048, + "learning_rate": 8.581275232912526e-07, + "loss": 1.0583, + "step": 10400 + }, + { + "epoch": 1.4724994691017201, + "grad_norm": 10.531653150318485, + "learning_rate": 8.57695311290376e-07, + "loss": 1.0265, + "step": 10401 + }, + { + "epoch": 1.4726410419763574, + "grad_norm": 9.577801597823251, + "learning_rate": 8.572631856242333e-07, + "loss": 0.9593, + "step": 10402 + }, + { + "epoch": 1.4727826148509946, + "grad_norm": 10.177244676438784, + "learning_rate": 8.56831146315539e-07, + "loss": 0.9982, + "step": 10403 + }, + { + "epoch": 1.4729241877256318, + "grad_norm": 7.634821835220937, + "learning_rate": 8.563991933870072e-07, + "loss": 0.8248, + "step": 10404 + }, + { + "epoch": 1.473065760600269, + "grad_norm": 9.585858308174315, + "learning_rate": 8.559673268613442e-07, + "loss": 0.9436, + "step": 10405 + }, + { + "epoch": 1.4732073334749063, + "grad_norm": 9.02773884561083, + "learning_rate": 8.555355467612527e-07, + "loss": 1.114, + "step": 10406 + }, + { + "epoch": 1.4733489063495435, + "grad_norm": 9.723890508991726, + "learning_rate": 8.551038531094308e-07, + "loss": 1.0558, + "step": 10407 + }, + { + "epoch": 1.4734904792241807, + "grad_norm": 6.903054443109315, + "learning_rate": 8.546722459285727e-07, + "loss": 0.9419, + "step": 10408 + }, + { + "epoch": 1.473632052098818, + "grad_norm": 9.339635038576647, + "learning_rate": 8.54240725241369e-07, + "loss": 0.9194, + "step": 10409 + }, + { + "epoch": 1.4737736249734552, + "grad_norm": 9.282442266776302, + "learning_rate": 8.538092910705021e-07, + "loss": 0.9418, + "step": 10410 + }, + { + "epoch": 1.4739151978480924, + "grad_norm": 11.039017922216257, + "learning_rate": 8.533779434386544e-07, + "loss": 1.1073, + "step": 10411 + }, + { + "epoch": 1.4740567707227294, + "grad_norm": 8.210254992893939, + "learning_rate": 8.529466823684993e-07, + "loss": 0.9286, + "step": 10412 + }, + { + "epoch": 1.4741983435973667, + "grad_norm": 8.272097499781928, + "learning_rate": 8.525155078827099e-07, + "loss": 0.9146, + "step": 10413 + }, + { + "epoch": 1.474339916472004, + "grad_norm": 9.190035723096884, + "learning_rate": 8.520844200039511e-07, + "loss": 1.0539, + "step": 10414 + }, + { + "epoch": 1.4744814893466411, + "grad_norm": 9.72370965629787, + "learning_rate": 8.516534187548864e-07, + "loss": 0.9986, + "step": 10415 + }, + { + "epoch": 1.4746230622212784, + "grad_norm": 8.633372790075839, + "learning_rate": 8.512225041581726e-07, + "loss": 0.9963, + "step": 10416 + }, + { + "epoch": 1.4747646350959156, + "grad_norm": 7.768526145551748, + "learning_rate": 8.507916762364613e-07, + "loss": 0.8687, + "step": 10417 + }, + { + "epoch": 1.4749062079705528, + "grad_norm": 9.609827939655588, + "learning_rate": 8.503609350124029e-07, + "loss": 0.9683, + "step": 10418 + }, + { + "epoch": 1.47504778084519, + "grad_norm": 9.529401556160527, + "learning_rate": 8.49930280508639e-07, + "loss": 0.9046, + "step": 10419 + }, + { + "epoch": 1.4751893537198273, + "grad_norm": 8.942743643837417, + "learning_rate": 8.494997127478111e-07, + "loss": 1.0498, + "step": 10420 + }, + { + "epoch": 1.4753309265944645, + "grad_norm": 8.524479341688558, + "learning_rate": 8.490692317525514e-07, + "loss": 0.8731, + "step": 10421 + }, + { + "epoch": 1.4754724994691018, + "grad_norm": 10.063448843438898, + 
"learning_rate": 8.486388375454924e-07, + "loss": 1.0862, + "step": 10422 + }, + { + "epoch": 1.475614072343739, + "grad_norm": 9.220273890864554, + "learning_rate": 8.482085301492574e-07, + "loss": 0.9128, + "step": 10423 + }, + { + "epoch": 1.4757556452183762, + "grad_norm": 10.681856505806682, + "learning_rate": 8.477783095864683e-07, + "loss": 1.0094, + "step": 10424 + }, + { + "epoch": 1.4758972180930134, + "grad_norm": 9.500201976034468, + "learning_rate": 8.473481758797425e-07, + "loss": 1.0692, + "step": 10425 + }, + { + "epoch": 1.4760387909676507, + "grad_norm": 7.965697178713785, + "learning_rate": 8.469181290516906e-07, + "loss": 0.9378, + "step": 10426 + }, + { + "epoch": 1.4761803638422877, + "grad_norm": 8.014890164903619, + "learning_rate": 8.464881691249202e-07, + "loss": 0.9164, + "step": 10427 + }, + { + "epoch": 1.476321936716925, + "grad_norm": 10.812572324654246, + "learning_rate": 8.460582961220332e-07, + "loss": 0.9901, + "step": 10428 + }, + { + "epoch": 1.4764635095915621, + "grad_norm": 10.211993955963774, + "learning_rate": 8.456285100656289e-07, + "loss": 1.0194, + "step": 10429 + }, + { + "epoch": 1.4766050824661994, + "grad_norm": 9.727104104524322, + "learning_rate": 8.451988109782997e-07, + "loss": 0.9091, + "step": 10430 + }, + { + "epoch": 1.4767466553408366, + "grad_norm": 11.346547520060893, + "learning_rate": 8.447691988826348e-07, + "loss": 1.0239, + "step": 10431 + }, + { + "epoch": 1.4768882282154738, + "grad_norm": 9.479307024129708, + "learning_rate": 8.443396738012199e-07, + "loss": 0.9626, + "step": 10432 + }, + { + "epoch": 1.477029801090111, + "grad_norm": 8.429622335243955, + "learning_rate": 8.439102357566331e-07, + "loss": 1.1085, + "step": 10433 + }, + { + "epoch": 1.4771713739647483, + "grad_norm": 8.075354449639018, + "learning_rate": 8.434808847714512e-07, + "loss": 1.0201, + "step": 10434 + }, + { + "epoch": 1.4773129468393855, + "grad_norm": 8.24035682053788, + "learning_rate": 8.430516208682429e-07, + "loss": 0.9616, + "step": 10435 + }, + { + "epoch": 1.4774545197140228, + "grad_norm": 9.040793052928427, + "learning_rate": 8.426224440695765e-07, + "loss": 0.9966, + "step": 10436 + }, + { + "epoch": 1.47759609258866, + "grad_norm": 10.513864718473382, + "learning_rate": 8.421933543980126e-07, + "loss": 0.9605, + "step": 10437 + }, + { + "epoch": 1.4777376654632972, + "grad_norm": 12.063112233491509, + "learning_rate": 8.417643518761068e-07, + "loss": 0.9655, + "step": 10438 + }, + { + "epoch": 1.4778792383379344, + "grad_norm": 8.257205937248793, + "learning_rate": 8.413354365264137e-07, + "loss": 0.9317, + "step": 10439 + }, + { + "epoch": 1.4780208112125717, + "grad_norm": 8.84472262692098, + "learning_rate": 8.40906608371479e-07, + "loss": 1.026, + "step": 10440 + }, + { + "epoch": 1.478162384087209, + "grad_norm": 9.778704522266182, + "learning_rate": 8.404778674338476e-07, + "loss": 0.9935, + "step": 10441 + }, + { + "epoch": 1.4783039569618461, + "grad_norm": 9.800218416232333, + "learning_rate": 8.400492137360564e-07, + "loss": 0.9632, + "step": 10442 + }, + { + "epoch": 1.4784455298364834, + "grad_norm": 9.201218101467939, + "learning_rate": 8.396206473006413e-07, + "loss": 1.0215, + "step": 10443 + }, + { + "epoch": 1.4785871027111206, + "grad_norm": 10.546452537835293, + "learning_rate": 8.391921681501297e-07, + "loss": 0.975, + "step": 10444 + }, + { + "epoch": 1.4787286755857578, + "grad_norm": 8.409426858175758, + "learning_rate": 8.387637763070486e-07, + "loss": 0.9248, + "step": 10445 + }, + { + "epoch": 
1.478870248460395, + "grad_norm": 8.498572734606125, + "learning_rate": 8.383354717939163e-07, + "loss": 0.9985, + "step": 10446 + }, + { + "epoch": 1.4790118213350323, + "grad_norm": 10.236644697034293, + "learning_rate": 8.379072546332498e-07, + "loss": 1.0653, + "step": 10447 + }, + { + "epoch": 1.4791533942096695, + "grad_norm": 7.886642085359799, + "learning_rate": 8.374791248475597e-07, + "loss": 0.9279, + "step": 10448 + }, + { + "epoch": 1.4792949670843067, + "grad_norm": 9.693318250277965, + "learning_rate": 8.370510824593517e-07, + "loss": 1.0287, + "step": 10449 + }, + { + "epoch": 1.479436539958944, + "grad_norm": 8.755374674641821, + "learning_rate": 8.366231274911291e-07, + "loss": 0.9344, + "step": 10450 + }, + { + "epoch": 1.4795781128335812, + "grad_norm": 11.543711640645864, + "learning_rate": 8.361952599653875e-07, + "loss": 1.011, + "step": 10451 + }, + { + "epoch": 1.4797196857082184, + "grad_norm": 10.669356801632743, + "learning_rate": 8.357674799046206e-07, + "loss": 0.9875, + "step": 10452 + }, + { + "epoch": 1.4798612585828554, + "grad_norm": 9.431236361120233, + "learning_rate": 8.353397873313171e-07, + "loss": 0.9505, + "step": 10453 + }, + { + "epoch": 1.4800028314574927, + "grad_norm": 10.430918554453795, + "learning_rate": 8.34912182267959e-07, + "loss": 1.0267, + "step": 10454 + }, + { + "epoch": 1.48014440433213, + "grad_norm": 9.047636062682557, + "learning_rate": 8.34484664737027e-07, + "loss": 0.9372, + "step": 10455 + }, + { + "epoch": 1.4802859772067671, + "grad_norm": 10.109326700039178, + "learning_rate": 8.340572347609932e-07, + "loss": 1.065, + "step": 10456 + }, + { + "epoch": 1.4804275500814044, + "grad_norm": 8.207386820742213, + "learning_rate": 8.336298923623301e-07, + "loss": 1.0411, + "step": 10457 + }, + { + "epoch": 1.4805691229560416, + "grad_norm": 8.445042382648328, + "learning_rate": 8.332026375634994e-07, + "loss": 0.9299, + "step": 10458 + }, + { + "epoch": 1.4807106958306788, + "grad_norm": 8.25118131560732, + "learning_rate": 8.327754703869631e-07, + "loss": 0.9254, + "step": 10459 + }, + { + "epoch": 1.480852268705316, + "grad_norm": 8.641682289376725, + "learning_rate": 8.323483908551783e-07, + "loss": 0.967, + "step": 10460 + }, + { + "epoch": 1.4809938415799533, + "grad_norm": 9.73434796031056, + "learning_rate": 8.319213989905942e-07, + "loss": 1.0437, + "step": 10461 + }, + { + "epoch": 1.4811354144545905, + "grad_norm": 9.443567611781114, + "learning_rate": 8.314944948156589e-07, + "loss": 1.0333, + "step": 10462 + }, + { + "epoch": 1.4812769873292277, + "grad_norm": 9.251327007062368, + "learning_rate": 8.310676783528135e-07, + "loss": 1.0128, + "step": 10463 + }, + { + "epoch": 1.481418560203865, + "grad_norm": 8.582567298063049, + "learning_rate": 8.306409496244966e-07, + "loss": 0.9647, + "step": 10464 + }, + { + "epoch": 1.4815601330785022, + "grad_norm": 8.23330247180833, + "learning_rate": 8.302143086531392e-07, + "loss": 0.991, + "step": 10465 + }, + { + "epoch": 1.4817017059531394, + "grad_norm": 9.524105208211942, + "learning_rate": 8.297877554611717e-07, + "loss": 1.0912, + "step": 10466 + }, + { + "epoch": 1.4818432788277767, + "grad_norm": 10.355008639226575, + "learning_rate": 8.293612900710155e-07, + "loss": 1.0355, + "step": 10467 + }, + { + "epoch": 1.481984851702414, + "grad_norm": 9.605787249447108, + "learning_rate": 8.289349125050914e-07, + "loss": 0.9288, + "step": 10468 + }, + { + "epoch": 1.482126424577051, + "grad_norm": 10.113531705570034, + "learning_rate": 8.28508622785813e-07, + "loss": 0.9754, + 
"step": 10469 + }, + { + "epoch": 1.4822679974516881, + "grad_norm": 8.47260807458065, + "learning_rate": 8.280824209355892e-07, + "loss": 1.0291, + "step": 10470 + }, + { + "epoch": 1.4824095703263254, + "grad_norm": 10.287273833707719, + "learning_rate": 8.276563069768267e-07, + "loss": 1.0163, + "step": 10471 + }, + { + "epoch": 1.4825511432009626, + "grad_norm": 8.29844038684402, + "learning_rate": 8.272302809319243e-07, + "loss": 0.9279, + "step": 10472 + }, + { + "epoch": 1.4826927160755998, + "grad_norm": 9.911695457798556, + "learning_rate": 8.268043428232798e-07, + "loss": 0.959, + "step": 10473 + }, + { + "epoch": 1.482834288950237, + "grad_norm": 9.968109397275628, + "learning_rate": 8.263784926732824e-07, + "loss": 0.9986, + "step": 10474 + }, + { + "epoch": 1.4829758618248743, + "grad_norm": 9.02579447031188, + "learning_rate": 8.259527305043197e-07, + "loss": 0.9676, + "step": 10475 + }, + { + "epoch": 1.4831174346995115, + "grad_norm": 11.02156346191499, + "learning_rate": 8.255270563387746e-07, + "loss": 1.0002, + "step": 10476 + }, + { + "epoch": 1.4832590075741487, + "grad_norm": 7.940016220037771, + "learning_rate": 8.251014701990229e-07, + "loss": 0.9783, + "step": 10477 + }, + { + "epoch": 1.483400580448786, + "grad_norm": 8.306911339419518, + "learning_rate": 8.246759721074388e-07, + "loss": 0.8937, + "step": 10478 + }, + { + "epoch": 1.4835421533234232, + "grad_norm": 9.11047084491868, + "learning_rate": 8.242505620863894e-07, + "loss": 1.0967, + "step": 10479 + }, + { + "epoch": 1.4836837261980604, + "grad_norm": 9.764572599543609, + "learning_rate": 8.238252401582389e-07, + "loss": 0.935, + "step": 10480 + }, + { + "epoch": 1.4838252990726977, + "grad_norm": 9.000075445918583, + "learning_rate": 8.234000063453446e-07, + "loss": 0.938, + "step": 10481 + }, + { + "epoch": 1.483966871947335, + "grad_norm": 9.416395020294265, + "learning_rate": 8.229748606700619e-07, + "loss": 1.0043, + "step": 10482 + }, + { + "epoch": 1.4841084448219721, + "grad_norm": 9.652604189500373, + "learning_rate": 8.225498031547413e-07, + "loss": 1.0149, + "step": 10483 + }, + { + "epoch": 1.4842500176966094, + "grad_norm": 10.17565229044963, + "learning_rate": 8.221248338217258e-07, + "loss": 0.9503, + "step": 10484 + }, + { + "epoch": 1.4843915905712466, + "grad_norm": 10.02510086731811, + "learning_rate": 8.216999526933578e-07, + "loss": 0.9297, + "step": 10485 + }, + { + "epoch": 1.4845331634458838, + "grad_norm": 7.596432059150985, + "learning_rate": 8.212751597919708e-07, + "loss": 0.9475, + "step": 10486 + }, + { + "epoch": 1.484674736320521, + "grad_norm": 9.267256851978738, + "learning_rate": 8.208504551398977e-07, + "loss": 1.0078, + "step": 10487 + }, + { + "epoch": 1.4848163091951583, + "grad_norm": 7.784323701877092, + "learning_rate": 8.204258387594635e-07, + "loss": 0.8927, + "step": 10488 + }, + { + "epoch": 1.4849578820697955, + "grad_norm": 8.506220616970284, + "learning_rate": 8.200013106729915e-07, + "loss": 1.0293, + "step": 10489 + }, + { + "epoch": 1.4850994549444327, + "grad_norm": 10.65182913532801, + "learning_rate": 8.195768709027979e-07, + "loss": 0.9533, + "step": 10490 + }, + { + "epoch": 1.48524102781907, + "grad_norm": 10.226723642733472, + "learning_rate": 8.191525194711941e-07, + "loss": 1.0558, + "step": 10491 + }, + { + "epoch": 1.4853826006937072, + "grad_norm": 9.901512384972593, + "learning_rate": 8.187282564004903e-07, + "loss": 1.0268, + "step": 10492 + }, + { + "epoch": 1.4855241735683444, + "grad_norm": 11.162133468641976, + "learning_rate": 
8.183040817129873e-07, + "loss": 1.1875, + "step": 10493 + }, + { + "epoch": 1.4856657464429817, + "grad_norm": 8.635739040147708, + "learning_rate": 8.178799954309857e-07, + "loss": 0.8267, + "step": 10494 + }, + { + "epoch": 1.4858073193176187, + "grad_norm": 10.083787093448379, + "learning_rate": 8.174559975767779e-07, + "loss": 1.0018, + "step": 10495 + }, + { + "epoch": 1.485948892192256, + "grad_norm": 9.367048629991258, + "learning_rate": 8.170320881726542e-07, + "loss": 0.9774, + "step": 10496 + }, + { + "epoch": 1.4860904650668931, + "grad_norm": 34.758436756868825, + "learning_rate": 8.166082672408985e-07, + "loss": 1.1055, + "step": 10497 + }, + { + "epoch": 1.4862320379415304, + "grad_norm": 9.108993496111797, + "learning_rate": 8.161845348037906e-07, + "loss": 0.986, + "step": 10498 + }, + { + "epoch": 1.4863736108161676, + "grad_norm": 8.839085511421567, + "learning_rate": 8.157608908836071e-07, + "loss": 0.9973, + "step": 10499 + }, + { + "epoch": 1.4865151836908048, + "grad_norm": 9.342254008585803, + "learning_rate": 8.153373355026176e-07, + "loss": 0.9969, + "step": 10500 + }, + { + "epoch": 1.486656756565442, + "grad_norm": 9.060120579336093, + "learning_rate": 8.149138686830882e-07, + "loss": 0.9729, + "step": 10501 + }, + { + "epoch": 1.4867983294400793, + "grad_norm": 10.294825718994598, + "learning_rate": 8.144904904472798e-07, + "loss": 0.9956, + "step": 10502 + }, + { + "epoch": 1.4869399023147165, + "grad_norm": 7.886663367732119, + "learning_rate": 8.1406720081745e-07, + "loss": 0.9346, + "step": 10503 + }, + { + "epoch": 1.4870814751893537, + "grad_norm": 8.975052058246915, + "learning_rate": 8.136439998158499e-07, + "loss": 0.9793, + "step": 10504 + }, + { + "epoch": 1.487223048063991, + "grad_norm": 9.149411925715542, + "learning_rate": 8.132208874647271e-07, + "loss": 0.9917, + "step": 10505 + }, + { + "epoch": 1.4873646209386282, + "grad_norm": 9.74097044071858, + "learning_rate": 8.127978637863254e-07, + "loss": 0.9173, + "step": 10506 + }, + { + "epoch": 1.4875061938132654, + "grad_norm": 7.997345722471696, + "learning_rate": 8.12374928802881e-07, + "loss": 1.0369, + "step": 10507 + }, + { + "epoch": 1.4876477666879027, + "grad_norm": 9.76863078742039, + "learning_rate": 8.119520825366292e-07, + "loss": 0.9024, + "step": 10508 + }, + { + "epoch": 1.48778933956254, + "grad_norm": 9.611052079898409, + "learning_rate": 8.115293250097969e-07, + "loss": 0.9708, + "step": 10509 + }, + { + "epoch": 1.487930912437177, + "grad_norm": 8.614166975434367, + "learning_rate": 8.111066562446098e-07, + "loss": 0.9121, + "step": 10510 + }, + { + "epoch": 1.4880724853118141, + "grad_norm": 7.960165987824056, + "learning_rate": 8.106840762632867e-07, + "loss": 0.9867, + "step": 10511 + }, + { + "epoch": 1.4882140581864514, + "grad_norm": 10.090476624315592, + "learning_rate": 8.102615850880413e-07, + "loss": 1.0913, + "step": 10512 + }, + { + "epoch": 1.4883556310610886, + "grad_norm": 8.95052514267123, + "learning_rate": 8.098391827410851e-07, + "loss": 1.0337, + "step": 10513 + }, + { + "epoch": 1.4884972039357258, + "grad_norm": 9.28909863849152, + "learning_rate": 8.09416869244622e-07, + "loss": 0.8979, + "step": 10514 + }, + { + "epoch": 1.488638776810363, + "grad_norm": 8.903141590235508, + "learning_rate": 8.089946446208543e-07, + "loss": 1.0415, + "step": 10515 + }, + { + "epoch": 1.4887803496850003, + "grad_norm": 9.062771497145487, + "learning_rate": 8.085725088919766e-07, + "loss": 0.9487, + "step": 10516 + }, + { + "epoch": 1.4889219225596375, + "grad_norm": 
9.606630705696196, + "learning_rate": 8.081504620801816e-07, + "loss": 1.0539, + "step": 10517 + }, + { + "epoch": 1.4890634954342747, + "grad_norm": 12.114120164033078, + "learning_rate": 8.077285042076546e-07, + "loss": 0.9501, + "step": 10518 + }, + { + "epoch": 1.489205068308912, + "grad_norm": 9.120058576830756, + "learning_rate": 8.073066352965792e-07, + "loss": 1.0354, + "step": 10519 + }, + { + "epoch": 1.4893466411835492, + "grad_norm": 9.311548786098822, + "learning_rate": 8.068848553691308e-07, + "loss": 0.9304, + "step": 10520 + }, + { + "epoch": 1.4894882140581864, + "grad_norm": 8.833995818240986, + "learning_rate": 8.06463164447484e-07, + "loss": 1.0128, + "step": 10521 + }, + { + "epoch": 1.4896297869328237, + "grad_norm": 7.5269210847214945, + "learning_rate": 8.060415625538059e-07, + "loss": 0.9604, + "step": 10522 + }, + { + "epoch": 1.489771359807461, + "grad_norm": 8.058759427829138, + "learning_rate": 8.056200497102592e-07, + "loss": 0.9084, + "step": 10523 + }, + { + "epoch": 1.4899129326820981, + "grad_norm": 9.63302990744625, + "learning_rate": 8.051986259390038e-07, + "loss": 0.9889, + "step": 10524 + }, + { + "epoch": 1.4900545055567354, + "grad_norm": 8.473751604074703, + "learning_rate": 8.047772912621921e-07, + "loss": 0.9771, + "step": 10525 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 10.509447116700786, + "learning_rate": 8.04356045701975e-07, + "loss": 1.0418, + "step": 10526 + }, + { + "epoch": 1.4903376513060098, + "grad_norm": 11.31912694900518, + "learning_rate": 8.039348892804955e-07, + "loss": 1.0416, + "step": 10527 + }, + { + "epoch": 1.490479224180647, + "grad_norm": 10.522581613649663, + "learning_rate": 8.03513822019894e-07, + "loss": 1.0017, + "step": 10528 + }, + { + "epoch": 1.4906207970552843, + "grad_norm": 9.495608469192236, + "learning_rate": 8.030928439423069e-07, + "loss": 1.0205, + "step": 10529 + }, + { + "epoch": 1.4907623699299215, + "grad_norm": 9.48328012497316, + "learning_rate": 8.026719550698628e-07, + "loss": 1.0036, + "step": 10530 + }, + { + "epoch": 1.4909039428045587, + "grad_norm": 9.327767165587387, + "learning_rate": 8.022511554246895e-07, + "loss": 0.9552, + "step": 10531 + }, + { + "epoch": 1.491045515679196, + "grad_norm": 10.089853579892617, + "learning_rate": 8.018304450289069e-07, + "loss": 1.0296, + "step": 10532 + }, + { + "epoch": 1.4911870885538332, + "grad_norm": 9.28913067021874, + "learning_rate": 8.014098239046309e-07, + "loss": 1.0625, + "step": 10533 + }, + { + "epoch": 1.4913286614284704, + "grad_norm": 8.174666968978672, + "learning_rate": 8.009892920739745e-07, + "loss": 1.0985, + "step": 10534 + }, + { + "epoch": 1.4914702343031077, + "grad_norm": 8.6892461325509, + "learning_rate": 8.005688495590435e-07, + "loss": 1.1311, + "step": 10535 + }, + { + "epoch": 1.4916118071777447, + "grad_norm": 11.89184275342832, + "learning_rate": 8.001484963819417e-07, + "loss": 1.1134, + "step": 10536 + }, + { + "epoch": 1.491753380052382, + "grad_norm": 10.364332482962075, + "learning_rate": 7.997282325647654e-07, + "loss": 0.9262, + "step": 10537 + }, + { + "epoch": 1.4918949529270191, + "grad_norm": 9.775238237136394, + "learning_rate": 7.993080581296087e-07, + "loss": 1.026, + "step": 10538 + }, + { + "epoch": 1.4920365258016564, + "grad_norm": 9.107303966304626, + "learning_rate": 7.988879730985585e-07, + "loss": 0.9686, + "step": 10539 + }, + { + "epoch": 1.4921780986762936, + "grad_norm": 10.19158366916285, + "learning_rate": 7.984679774937002e-07, + "loss": 0.9419, + "step": 10540 + }, + { + 
"epoch": 1.4923196715509308, + "grad_norm": 7.7921620039812485, + "learning_rate": 7.980480713371106e-07, + "loss": 0.9005, + "step": 10541 + }, + { + "epoch": 1.492461244425568, + "grad_norm": 7.4057222975077, + "learning_rate": 7.976282546508654e-07, + "loss": 0.9272, + "step": 10542 + }, + { + "epoch": 1.4926028173002053, + "grad_norm": 9.853321084861042, + "learning_rate": 7.972085274570341e-07, + "loss": 1.057, + "step": 10543 + }, + { + "epoch": 1.4927443901748425, + "grad_norm": 8.176753555418843, + "learning_rate": 7.967888897776796e-07, + "loss": 0.9628, + "step": 10544 + }, + { + "epoch": 1.4928859630494797, + "grad_norm": 9.019790083986416, + "learning_rate": 7.963693416348642e-07, + "loss": 0.958, + "step": 10545 + }, + { + "epoch": 1.493027535924117, + "grad_norm": 7.90393756325961, + "learning_rate": 7.959498830506412e-07, + "loss": 0.9272, + "step": 10546 + }, + { + "epoch": 1.4931691087987542, + "grad_norm": 9.597362919052296, + "learning_rate": 7.955305140470635e-07, + "loss": 0.9645, + "step": 10547 + }, + { + "epoch": 1.4933106816733914, + "grad_norm": 11.705692595303457, + "learning_rate": 7.951112346461745e-07, + "loss": 1.0654, + "step": 10548 + }, + { + "epoch": 1.4934522545480287, + "grad_norm": 8.241427967817863, + "learning_rate": 7.946920448700168e-07, + "loss": 0.9933, + "step": 10549 + }, + { + "epoch": 1.4935938274226659, + "grad_norm": 8.614809069430057, + "learning_rate": 7.942729447406278e-07, + "loss": 0.9449, + "step": 10550 + }, + { + "epoch": 1.4937354002973031, + "grad_norm": 8.402881727460144, + "learning_rate": 7.938539342800373e-07, + "loss": 1.0034, + "step": 10551 + }, + { + "epoch": 1.4938769731719401, + "grad_norm": 10.174138018113432, + "learning_rate": 7.934350135102742e-07, + "loss": 0.9788, + "step": 10552 + }, + { + "epoch": 1.4940185460465774, + "grad_norm": 7.963633132386017, + "learning_rate": 7.930161824533597e-07, + "loss": 0.9677, + "step": 10553 + }, + { + "epoch": 1.4941601189212146, + "grad_norm": 10.01485331831199, + "learning_rate": 7.92597441131312e-07, + "loss": 1.0832, + "step": 10554 + }, + { + "epoch": 1.4943016917958518, + "grad_norm": 9.011639591790399, + "learning_rate": 7.921787895661429e-07, + "loss": 0.9665, + "step": 10555 + }, + { + "epoch": 1.494443264670489, + "grad_norm": 9.758059960458048, + "learning_rate": 7.917602277798613e-07, + "loss": 1.0582, + "step": 10556 + }, + { + "epoch": 1.4945848375451263, + "grad_norm": 9.319824205959062, + "learning_rate": 7.913417557944716e-07, + "loss": 0.9557, + "step": 10557 + }, + { + "epoch": 1.4947264104197635, + "grad_norm": 8.669997773307681, + "learning_rate": 7.909233736319711e-07, + "loss": 0.9677, + "step": 10558 + }, + { + "epoch": 1.4948679832944007, + "grad_norm": 8.51559005870125, + "learning_rate": 7.905050813143553e-07, + "loss": 0.9711, + "step": 10559 + }, + { + "epoch": 1.495009556169038, + "grad_norm": 9.80799597130548, + "learning_rate": 7.900868788636118e-07, + "loss": 0.9909, + "step": 10560 + }, + { + "epoch": 1.4951511290436752, + "grad_norm": 8.373345510550875, + "learning_rate": 7.896687663017269e-07, + "loss": 0.9807, + "step": 10561 + }, + { + "epoch": 1.4952927019183124, + "grad_norm": 9.46221476720864, + "learning_rate": 7.892507436506791e-07, + "loss": 1.0845, + "step": 10562 + }, + { + "epoch": 1.4954342747929497, + "grad_norm": 10.804681851046968, + "learning_rate": 7.888328109324448e-07, + "loss": 0.9872, + "step": 10563 + }, + { + "epoch": 1.4955758476675869, + "grad_norm": 9.036643137219325, + "learning_rate": 7.884149681689937e-07, + 
"loss": 0.9546, + "step": 10564 + }, + { + "epoch": 1.4957174205422241, + "grad_norm": 10.05867073696266, + "learning_rate": 7.879972153822907e-07, + "loss": 0.9444, + "step": 10565 + }, + { + "epoch": 1.4958589934168613, + "grad_norm": 10.47366081180698, + "learning_rate": 7.875795525942984e-07, + "loss": 1.0694, + "step": 10566 + }, + { + "epoch": 1.4960005662914986, + "grad_norm": 6.953094002300989, + "learning_rate": 7.871619798269711e-07, + "loss": 0.8915, + "step": 10567 + }, + { + "epoch": 1.4961421391661358, + "grad_norm": 9.371419400865275, + "learning_rate": 7.867444971022626e-07, + "loss": 0.93, + "step": 10568 + }, + { + "epoch": 1.496283712040773, + "grad_norm": 9.244213872055573, + "learning_rate": 7.863271044421175e-07, + "loss": 0.9873, + "step": 10569 + }, + { + "epoch": 1.4964252849154103, + "grad_norm": 7.860112968972505, + "learning_rate": 7.859098018684793e-07, + "loss": 1.0137, + "step": 10570 + }, + { + "epoch": 1.4965668577900475, + "grad_norm": 9.585537554691564, + "learning_rate": 7.854925894032842e-07, + "loss": 0.8976, + "step": 10571 + }, + { + "epoch": 1.4967084306646847, + "grad_norm": 9.296981682686221, + "learning_rate": 7.850754670684654e-07, + "loss": 0.9488, + "step": 10572 + }, + { + "epoch": 1.496850003539322, + "grad_norm": 10.172228641485994, + "learning_rate": 7.846584348859512e-07, + "loss": 0.9739, + "step": 10573 + }, + { + "epoch": 1.4969915764139592, + "grad_norm": 9.524886211206189, + "learning_rate": 7.84241492877664e-07, + "loss": 0.9419, + "step": 10574 + }, + { + "epoch": 1.4971331492885964, + "grad_norm": 9.80198815867439, + "learning_rate": 7.838246410655223e-07, + "loss": 1.0354, + "step": 10575 + }, + { + "epoch": 1.4972747221632337, + "grad_norm": 9.859460893791857, + "learning_rate": 7.834078794714389e-07, + "loss": 1.087, + "step": 10576 + }, + { + "epoch": 1.4974162950378709, + "grad_norm": 7.8889622513250774, + "learning_rate": 7.829912081173238e-07, + "loss": 0.952, + "step": 10577 + }, + { + "epoch": 1.497557867912508, + "grad_norm": 10.101764532371583, + "learning_rate": 7.825746270250803e-07, + "loss": 1.1303, + "step": 10578 + }, + { + "epoch": 1.4976994407871451, + "grad_norm": 9.102801769841808, + "learning_rate": 7.821581362166078e-07, + "loss": 0.8476, + "step": 10579 + }, + { + "epoch": 1.4978410136617824, + "grad_norm": 9.779454660637915, + "learning_rate": 7.817417357138018e-07, + "loss": 1.1036, + "step": 10580 + }, + { + "epoch": 1.4979825865364196, + "grad_norm": 9.472504831264025, + "learning_rate": 7.813254255385511e-07, + "loss": 1.011, + "step": 10581 + }, + { + "epoch": 1.4981241594110568, + "grad_norm": 8.855228806161513, + "learning_rate": 7.809092057127421e-07, + "loss": 1.0684, + "step": 10582 + }, + { + "epoch": 1.498265732285694, + "grad_norm": 9.296929572450125, + "learning_rate": 7.804930762582533e-07, + "loss": 0.9821, + "step": 10583 + }, + { + "epoch": 1.4984073051603313, + "grad_norm": 9.094597740903534, + "learning_rate": 7.800770371969624e-07, + "loss": 0.933, + "step": 10584 + }, + { + "epoch": 1.4985488780349685, + "grad_norm": 6.545595030288469, + "learning_rate": 7.796610885507391e-07, + "loss": 0.9781, + "step": 10585 + }, + { + "epoch": 1.4986904509096057, + "grad_norm": 8.14870668224484, + "learning_rate": 7.792452303414489e-07, + "loss": 0.9452, + "step": 10586 + }, + { + "epoch": 1.498832023784243, + "grad_norm": 8.311998854136824, + "learning_rate": 7.788294625909546e-07, + "loss": 1.038, + "step": 10587 + }, + { + "epoch": 1.4989735966588802, + "grad_norm": 8.353303048134004, + 
"learning_rate": 7.784137853211113e-07, + "loss": 0.9699, + "step": 10588 + }, + { + "epoch": 1.4991151695335174, + "grad_norm": 9.462880748511358, + "learning_rate": 7.779981985537724e-07, + "loss": 0.935, + "step": 10589 + }, + { + "epoch": 1.4992567424081547, + "grad_norm": 10.067726057615562, + "learning_rate": 7.775827023107835e-07, + "loss": 1.0534, + "step": 10590 + }, + { + "epoch": 1.4993983152827919, + "grad_norm": 10.209726625195724, + "learning_rate": 7.771672966139885e-07, + "loss": 0.9276, + "step": 10591 + }, + { + "epoch": 1.4995398881574291, + "grad_norm": 10.919600657891845, + "learning_rate": 7.767519814852234e-07, + "loss": 1.121, + "step": 10592 + }, + { + "epoch": 1.4996814610320661, + "grad_norm": 9.45806422810033, + "learning_rate": 7.763367569463224e-07, + "loss": 1.0609, + "step": 10593 + }, + { + "epoch": 1.4998230339067034, + "grad_norm": 8.325444976553337, + "learning_rate": 7.759216230191119e-07, + "loss": 0.9622, + "step": 10594 + }, + { + "epoch": 1.4999646067813406, + "grad_norm": 10.491790104978922, + "learning_rate": 7.755065797254172e-07, + "loss": 1.0115, + "step": 10595 + }, + { + "epoch": 1.5001061796559778, + "grad_norm": 10.923172465740372, + "learning_rate": 7.750916270870556e-07, + "loss": 1.0427, + "step": 10596 + }, + { + "epoch": 1.500247752530615, + "grad_norm": 9.535002223126092, + "learning_rate": 7.746767651258405e-07, + "loss": 0.857, + "step": 10597 + }, + { + "epoch": 1.5003893254052523, + "grad_norm": 8.473916817821117, + "learning_rate": 7.74261993863582e-07, + "loss": 0.9803, + "step": 10598 + }, + { + "epoch": 1.5005308982798895, + "grad_norm": 7.635910732908503, + "learning_rate": 7.738473133220828e-07, + "loss": 0.8751, + "step": 10599 + }, + { + "epoch": 1.5006724711545267, + "grad_norm": 7.6346032375925725, + "learning_rate": 7.734327235231443e-07, + "loss": 0.967, + "step": 10600 + }, + { + "epoch": 1.500814044029164, + "grad_norm": 9.180900019654851, + "learning_rate": 7.730182244885593e-07, + "loss": 1.0112, + "step": 10601 + }, + { + "epoch": 1.5009556169038012, + "grad_norm": 9.320557660622054, + "learning_rate": 7.726038162401184e-07, + "loss": 0.8991, + "step": 10602 + }, + { + "epoch": 1.5010971897784384, + "grad_norm": 9.565536914358804, + "learning_rate": 7.721894987996076e-07, + "loss": 1.0257, + "step": 10603 + }, + { + "epoch": 1.5012387626530757, + "grad_norm": 9.735909085453441, + "learning_rate": 7.717752721888058e-07, + "loss": 1.0535, + "step": 10604 + }, + { + "epoch": 1.5013803355277129, + "grad_norm": 9.717706654959077, + "learning_rate": 7.713611364294904e-07, + "loss": 0.9466, + "step": 10605 + }, + { + "epoch": 1.5015219084023501, + "grad_norm": 8.685568615401731, + "learning_rate": 7.709470915434309e-07, + "loss": 0.983, + "step": 10606 + }, + { + "epoch": 1.5016634812769873, + "grad_norm": 12.07347121897388, + "learning_rate": 7.705331375523928e-07, + "loss": 1.0591, + "step": 10607 + }, + { + "epoch": 1.5018050541516246, + "grad_norm": 10.532883152415648, + "learning_rate": 7.701192744781389e-07, + "loss": 1.1105, + "step": 10608 + }, + { + "epoch": 1.5019466270262618, + "grad_norm": 7.991414469961678, + "learning_rate": 7.69705502342424e-07, + "loss": 0.9266, + "step": 10609 + }, + { + "epoch": 1.502088199900899, + "grad_norm": 9.605052540291377, + "learning_rate": 7.692918211670017e-07, + "loss": 1.0391, + "step": 10610 + }, + { + "epoch": 1.5022297727755363, + "grad_norm": 10.240100825528632, + "learning_rate": 7.68878230973617e-07, + "loss": 0.9847, + "step": 10611 + }, + { + "epoch": 
1.5023713456501735, + "grad_norm": 8.287339760452433, + "learning_rate": 7.684647317840138e-07, + "loss": 0.9959, + "step": 10612 + }, + { + "epoch": 1.5025129185248107, + "grad_norm": 9.537718332173933, + "learning_rate": 7.680513236199275e-07, + "loss": 0.9851, + "step": 10613 + }, + { + "epoch": 1.502654491399448, + "grad_norm": 10.588304048202787, + "learning_rate": 7.676380065030928e-07, + "loss": 0.972, + "step": 10614 + }, + { + "epoch": 1.5027960642740852, + "grad_norm": 8.527191650040677, + "learning_rate": 7.672247804552355e-07, + "loss": 0.947, + "step": 10615 + }, + { + "epoch": 1.5029376371487224, + "grad_norm": 9.449708027845276, + "learning_rate": 7.668116454980804e-07, + "loss": 1.0377, + "step": 10616 + }, + { + "epoch": 1.5030792100233596, + "grad_norm": 10.406714443347195, + "learning_rate": 7.663986016533446e-07, + "loss": 1.0094, + "step": 10617 + }, + { + "epoch": 1.5032207828979969, + "grad_norm": 9.080247220610369, + "learning_rate": 7.659856489427409e-07, + "loss": 1.0919, + "step": 10618 + }, + { + "epoch": 1.503362355772634, + "grad_norm": 10.919145802495374, + "learning_rate": 7.655727873879792e-07, + "loss": 0.9483, + "step": 10619 + }, + { + "epoch": 1.5035039286472713, + "grad_norm": 9.288801723762807, + "learning_rate": 7.651600170107626e-07, + "loss": 0.8591, + "step": 10620 + }, + { + "epoch": 1.5036455015219086, + "grad_norm": 7.1614632753297345, + "learning_rate": 7.647473378327908e-07, + "loss": 0.897, + "step": 10621 + }, + { + "epoch": 1.5037870743965456, + "grad_norm": 10.625561508600233, + "learning_rate": 7.64334749875757e-07, + "loss": 0.9727, + "step": 10622 + }, + { + "epoch": 1.5039286472711828, + "grad_norm": 9.454243945269244, + "learning_rate": 7.639222531613519e-07, + "loss": 0.8509, + "step": 10623 + }, + { + "epoch": 1.50407022014582, + "grad_norm": 9.409400995143226, + "learning_rate": 7.635098477112588e-07, + "loss": 0.899, + "step": 10624 + }, + { + "epoch": 1.5042117930204573, + "grad_norm": 8.993203033717412, + "learning_rate": 7.63097533547158e-07, + "loss": 1.0293, + "step": 10625 + }, + { + "epoch": 1.5043533658950945, + "grad_norm": 9.665854803610111, + "learning_rate": 7.626853106907256e-07, + "loss": 1.0635, + "step": 10626 + }, + { + "epoch": 1.5044949387697317, + "grad_norm": 8.486619627832953, + "learning_rate": 7.622731791636312e-07, + "loss": 1.0205, + "step": 10627 + }, + { + "epoch": 1.504636511644369, + "grad_norm": 9.444150488772923, + "learning_rate": 7.6186113898754e-07, + "loss": 0.9947, + "step": 10628 + }, + { + "epoch": 1.5047780845190062, + "grad_norm": 8.394150431974005, + "learning_rate": 7.614491901841118e-07, + "loss": 1.0009, + "step": 10629 + }, + { + "epoch": 1.5049196573936434, + "grad_norm": 8.613999137458555, + "learning_rate": 7.610373327750034e-07, + "loss": 0.9792, + "step": 10630 + }, + { + "epoch": 1.5050612302682806, + "grad_norm": 8.341154573507232, + "learning_rate": 7.606255667818668e-07, + "loss": 0.9266, + "step": 10631 + }, + { + "epoch": 1.5052028031429177, + "grad_norm": 11.444909202578824, + "learning_rate": 7.602138922263461e-07, + "loss": 0.9699, + "step": 10632 + }, + { + "epoch": 1.5053443760175549, + "grad_norm": 8.896201156647216, + "learning_rate": 7.598023091300849e-07, + "loss": 0.9945, + "step": 10633 + }, + { + "epoch": 1.5054859488921921, + "grad_norm": 9.849415532310788, + "learning_rate": 7.593908175147177e-07, + "loss": 1.0149, + "step": 10634 + }, + { + "epoch": 1.5056275217668293, + "grad_norm": 11.076198910818265, + "learning_rate": 7.589794174018786e-07, + "loss": 
1.037, + "step": 10635 + }, + { + "epoch": 1.5057690946414666, + "grad_norm": 11.036541326239883, + "learning_rate": 7.585681088131921e-07, + "loss": 1.0438, + "step": 10636 + }, + { + "epoch": 1.5059106675161038, + "grad_norm": 10.58448661067335, + "learning_rate": 7.581568917702828e-07, + "loss": 0.9594, + "step": 10637 + }, + { + "epoch": 1.506052240390741, + "grad_norm": 8.7547507331957, + "learning_rate": 7.577457662947668e-07, + "loss": 0.9471, + "step": 10638 + }, + { + "epoch": 1.5061938132653783, + "grad_norm": 8.741977174357766, + "learning_rate": 7.573347324082558e-07, + "loss": 0.8496, + "step": 10639 + }, + { + "epoch": 1.5063353861400155, + "grad_norm": 7.305835262434812, + "learning_rate": 7.569237901323595e-07, + "loss": 0.8473, + "step": 10640 + }, + { + "epoch": 1.5064769590146527, + "grad_norm": 10.05253382383252, + "learning_rate": 7.565129394886792e-07, + "loss": 1.0053, + "step": 10641 + }, + { + "epoch": 1.50661853188929, + "grad_norm": 7.576724208082395, + "learning_rate": 7.561021804988141e-07, + "loss": 0.9412, + "step": 10642 + }, + { + "epoch": 1.5067601047639272, + "grad_norm": 8.092350547902427, + "learning_rate": 7.556915131843568e-07, + "loss": 0.9467, + "step": 10643 + }, + { + "epoch": 1.5069016776385644, + "grad_norm": 8.158801852506103, + "learning_rate": 7.552809375668965e-07, + "loss": 0.9285, + "step": 10644 + }, + { + "epoch": 1.5070432505132016, + "grad_norm": 8.818206887178262, + "learning_rate": 7.548704536680157e-07, + "loss": 0.9113, + "step": 10645 + }, + { + "epoch": 1.5071848233878389, + "grad_norm": 8.522519517620518, + "learning_rate": 7.544600615092937e-07, + "loss": 0.9896, + "step": 10646 + }, + { + "epoch": 1.507326396262476, + "grad_norm": 10.635755054696215, + "learning_rate": 7.540497611123058e-07, + "loss": 0.9882, + "step": 10647 + }, + { + "epoch": 1.5074679691371133, + "grad_norm": 9.286451121848247, + "learning_rate": 7.536395524986201e-07, + "loss": 0.9327, + "step": 10648 + }, + { + "epoch": 1.5076095420117506, + "grad_norm": 8.951233882135766, + "learning_rate": 7.532294356898006e-07, + "loss": 0.9544, + "step": 10649 + }, + { + "epoch": 1.5077511148863878, + "grad_norm": 10.42202177616378, + "learning_rate": 7.528194107074069e-07, + "loss": 0.9747, + "step": 10650 + }, + { + "epoch": 1.507892687761025, + "grad_norm": 9.093262125148497, + "learning_rate": 7.524094775729948e-07, + "loss": 0.9765, + "step": 10651 + }, + { + "epoch": 1.5080342606356623, + "grad_norm": 7.76767339880595, + "learning_rate": 7.519996363081123e-07, + "loss": 0.8716, + "step": 10652 + }, + { + "epoch": 1.5081758335102995, + "grad_norm": 11.070592991109585, + "learning_rate": 7.515898869343058e-07, + "loss": 0.9953, + "step": 10653 + }, + { + "epoch": 1.5083174063849367, + "grad_norm": 8.39990176188835, + "learning_rate": 7.51180229473116e-07, + "loss": 1.0624, + "step": 10654 + }, + { + "epoch": 1.508458979259574, + "grad_norm": 9.487885229358978, + "learning_rate": 7.507706639460768e-07, + "loss": 0.884, + "step": 10655 + }, + { + "epoch": 1.5086005521342112, + "grad_norm": 10.065443283757311, + "learning_rate": 7.503611903747202e-07, + "loss": 1.0184, + "step": 10656 + }, + { + "epoch": 1.5087421250088484, + "grad_norm": 10.210285192199809, + "learning_rate": 7.499518087805704e-07, + "loss": 1.0307, + "step": 10657 + }, + { + "epoch": 1.5088836978834856, + "grad_norm": 8.833630490852885, + "learning_rate": 7.495425191851502e-07, + "loss": 0.9901, + "step": 10658 + }, + { + "epoch": 1.5090252707581229, + "grad_norm": 9.399595828193808, + 
"learning_rate": 7.491333216099744e-07, + "loss": 0.9351, + "step": 10659 + }, + { + "epoch": 1.50916684363276, + "grad_norm": 9.980419443982905, + "learning_rate": 7.487242160765535e-07, + "loss": 1.0136, + "step": 10660 + }, + { + "epoch": 1.5093084165073973, + "grad_norm": 10.219024374541851, + "learning_rate": 7.483152026063955e-07, + "loss": 0.9687, + "step": 10661 + }, + { + "epoch": 1.5094499893820346, + "grad_norm": 9.34018029245195, + "learning_rate": 7.479062812210005e-07, + "loss": 0.9202, + "step": 10662 + }, + { + "epoch": 1.5095915622566716, + "grad_norm": 7.183639418356739, + "learning_rate": 7.474974519418668e-07, + "loss": 0.9223, + "step": 10663 + }, + { + "epoch": 1.5097331351313088, + "grad_norm": 9.708559047274424, + "learning_rate": 7.470887147904845e-07, + "loss": 1.005, + "step": 10664 + }, + { + "epoch": 1.509874708005946, + "grad_norm": 10.298688297635552, + "learning_rate": 7.466800697883422e-07, + "loss": 0.9181, + "step": 10665 + }, + { + "epoch": 1.5100162808805833, + "grad_norm": 9.09271842637545, + "learning_rate": 7.462715169569204e-07, + "loss": 0.9792, + "step": 10666 + }, + { + "epoch": 1.5101578537552205, + "grad_norm": 9.169464470421183, + "learning_rate": 7.458630563176983e-07, + "loss": 0.9206, + "step": 10667 + }, + { + "epoch": 1.5102994266298577, + "grad_norm": 8.237917751434189, + "learning_rate": 7.454546878921465e-07, + "loss": 0.9241, + "step": 10668 + }, + { + "epoch": 1.510440999504495, + "grad_norm": 8.384871856492781, + "learning_rate": 7.450464117017342e-07, + "loss": 0.8389, + "step": 10669 + }, + { + "epoch": 1.5105825723791322, + "grad_norm": 8.570435348367484, + "learning_rate": 7.446382277679235e-07, + "loss": 0.9227, + "step": 10670 + }, + { + "epoch": 1.5107241452537694, + "grad_norm": 9.092783873319245, + "learning_rate": 7.442301361121718e-07, + "loss": 0.9956, + "step": 10671 + }, + { + "epoch": 1.5108657181284066, + "grad_norm": 11.120334289795288, + "learning_rate": 7.438221367559331e-07, + "loss": 1.0708, + "step": 10672 + }, + { + "epoch": 1.5110072910030437, + "grad_norm": 9.095335936798742, + "learning_rate": 7.434142297206546e-07, + "loss": 0.9131, + "step": 10673 + }, + { + "epoch": 1.5111488638776809, + "grad_norm": 8.559528809885835, + "learning_rate": 7.43006415027781e-07, + "loss": 0.9403, + "step": 10674 + }, + { + "epoch": 1.5112904367523181, + "grad_norm": 10.066337659229841, + "learning_rate": 7.425986926987494e-07, + "loss": 0.9394, + "step": 10675 + }, + { + "epoch": 1.5114320096269553, + "grad_norm": 9.888729160842859, + "learning_rate": 7.421910627549942e-07, + "loss": 1.0814, + "step": 10676 + }, + { + "epoch": 1.5115735825015926, + "grad_norm": 12.001372576734127, + "learning_rate": 7.417835252179447e-07, + "loss": 0.9898, + "step": 10677 + }, + { + "epoch": 1.5117151553762298, + "grad_norm": 9.062702045161531, + "learning_rate": 7.413760801090239e-07, + "loss": 0.9495, + "step": 10678 + }, + { + "epoch": 1.511856728250867, + "grad_norm": 8.937772306548815, + "learning_rate": 7.409687274496516e-07, + "loss": 0.8965, + "step": 10679 + }, + { + "epoch": 1.5119983011255043, + "grad_norm": 8.748239830633208, + "learning_rate": 7.405614672612421e-07, + "loss": 0.9959, + "step": 10680 + }, + { + "epoch": 1.5121398740001415, + "grad_norm": 10.523443662419316, + "learning_rate": 7.401542995652033e-07, + "loss": 0.9809, + "step": 10681 + }, + { + "epoch": 1.5122814468747787, + "grad_norm": 9.450849174008825, + "learning_rate": 7.397472243829418e-07, + "loss": 0.9253, + "step": 10682 + }, + { + "epoch": 
1.512423019749416, + "grad_norm": 9.583970244983979, + "learning_rate": 7.393402417358552e-07, + "loss": 0.9436, + "step": 10683 + }, + { + "epoch": 1.5125645926240532, + "grad_norm": 9.031679136646947, + "learning_rate": 7.389333516453403e-07, + "loss": 0.9714, + "step": 10684 + }, + { + "epoch": 1.5127061654986904, + "grad_norm": 10.532465923986718, + "learning_rate": 7.385265541327852e-07, + "loss": 1.1005, + "step": 10685 + }, + { + "epoch": 1.5128477383733276, + "grad_norm": 8.36445431987861, + "learning_rate": 7.381198492195762e-07, + "loss": 0.8993, + "step": 10686 + }, + { + "epoch": 1.5129893112479649, + "grad_norm": 9.339066061830088, + "learning_rate": 7.377132369270926e-07, + "loss": 0.9379, + "step": 10687 + }, + { + "epoch": 1.513130884122602, + "grad_norm": 11.927612044208532, + "learning_rate": 7.373067172767107e-07, + "loss": 0.9677, + "step": 10688 + }, + { + "epoch": 1.5132724569972393, + "grad_norm": 10.170550704044258, + "learning_rate": 7.369002902897998e-07, + "loss": 0.9955, + "step": 10689 + }, + { + "epoch": 1.5134140298718766, + "grad_norm": 11.74118945359269, + "learning_rate": 7.364939559877265e-07, + "loss": 0.9679, + "step": 10690 + }, + { + "epoch": 1.5135556027465138, + "grad_norm": 7.227276938924113, + "learning_rate": 7.360877143918512e-07, + "loss": 0.9004, + "step": 10691 + }, + { + "epoch": 1.513697175621151, + "grad_norm": 11.401673953648759, + "learning_rate": 7.356815655235286e-07, + "loss": 1.0292, + "step": 10692 + }, + { + "epoch": 1.5138387484957883, + "grad_norm": 10.500263029164586, + "learning_rate": 7.352755094041114e-07, + "loss": 0.9596, + "step": 10693 + }, + { + "epoch": 1.5139803213704255, + "grad_norm": 8.70595034859308, + "learning_rate": 7.348695460549443e-07, + "loss": 0.966, + "step": 10694 + }, + { + "epoch": 1.5141218942450627, + "grad_norm": 8.879298928569227, + "learning_rate": 7.344636754973695e-07, + "loss": 0.9694, + "step": 10695 + }, + { + "epoch": 1.5142634671197, + "grad_norm": 10.128686681118996, + "learning_rate": 7.340578977527221e-07, + "loss": 0.8536, + "step": 10696 + }, + { + "epoch": 1.5144050399943372, + "grad_norm": 8.513803381712602, + "learning_rate": 7.336522128423351e-07, + "loss": 0.9073, + "step": 10697 + }, + { + "epoch": 1.5145466128689744, + "grad_norm": 8.521825258552576, + "learning_rate": 7.332466207875336e-07, + "loss": 0.8711, + "step": 10698 + }, + { + "epoch": 1.5146881857436116, + "grad_norm": 11.942192713102356, + "learning_rate": 7.328411216096399e-07, + "loss": 1.0558, + "step": 10699 + }, + { + "epoch": 1.5148297586182489, + "grad_norm": 10.242694763611176, + "learning_rate": 7.324357153299714e-07, + "loss": 1.0306, + "step": 10700 + }, + { + "epoch": 1.514971331492886, + "grad_norm": 8.546016224566092, + "learning_rate": 7.320304019698393e-07, + "loss": 1.0671, + "step": 10701 + }, + { + "epoch": 1.5151129043675233, + "grad_norm": 7.765835734219396, + "learning_rate": 7.31625181550551e-07, + "loss": 0.9858, + "step": 10702 + }, + { + "epoch": 1.5152544772421606, + "grad_norm": 9.464564037086408, + "learning_rate": 7.312200540934073e-07, + "loss": 0.9316, + "step": 10703 + }, + { + "epoch": 1.5153960501167976, + "grad_norm": 8.797088349433258, + "learning_rate": 7.308150196197064e-07, + "loss": 0.9041, + "step": 10704 + }, + { + "epoch": 1.5155376229914348, + "grad_norm": 11.905298390243823, + "learning_rate": 7.30410078150742e-07, + "loss": 1.0085, + "step": 10705 + }, + { + "epoch": 1.515679195866072, + "grad_norm": 10.310290660590486, + "learning_rate": 7.300052297077992e-07, + "loss": 
0.946, + "step": 10706 + }, + { + "epoch": 1.5158207687407093, + "grad_norm": 10.043875001652903, + "learning_rate": 7.296004743121627e-07, + "loss": 1.0319, + "step": 10707 + }, + { + "epoch": 1.5159623416153465, + "grad_norm": 9.019138755535451, + "learning_rate": 7.291958119851081e-07, + "loss": 0.9939, + "step": 10708 + }, + { + "epoch": 1.5161039144899837, + "grad_norm": 8.698082934101953, + "learning_rate": 7.287912427479105e-07, + "loss": 0.9575, + "step": 10709 + }, + { + "epoch": 1.516245487364621, + "grad_norm": 9.394357456812424, + "learning_rate": 7.283867666218355e-07, + "loss": 1.0526, + "step": 10710 + }, + { + "epoch": 1.5163870602392582, + "grad_norm": 9.700225937561768, + "learning_rate": 7.27982383628148e-07, + "loss": 0.9654, + "step": 10711 + }, + { + "epoch": 1.5165286331138954, + "grad_norm": 9.557991510834869, + "learning_rate": 7.275780937881055e-07, + "loss": 0.8925, + "step": 10712 + }, + { + "epoch": 1.5166702059885326, + "grad_norm": 7.977222441480454, + "learning_rate": 7.2717389712296e-07, + "loss": 0.9986, + "step": 10713 + }, + { + "epoch": 1.5168117788631699, + "grad_norm": 8.340699513325129, + "learning_rate": 7.267697936539619e-07, + "loss": 0.9223, + "step": 10714 + }, + { + "epoch": 1.5169533517378069, + "grad_norm": 8.505039292053565, + "learning_rate": 7.263657834023527e-07, + "loss": 0.9138, + "step": 10715 + }, + { + "epoch": 1.517094924612444, + "grad_norm": 8.75351448322932, + "learning_rate": 7.259618663893725e-07, + "loss": 0.9752, + "step": 10716 + }, + { + "epoch": 1.5172364974870813, + "grad_norm": 9.41880999003685, + "learning_rate": 7.255580426362535e-07, + "loss": 1.0748, + "step": 10717 + }, + { + "epoch": 1.5173780703617186, + "grad_norm": 9.784082946999659, + "learning_rate": 7.25154312164226e-07, + "loss": 0.9663, + "step": 10718 + }, + { + "epoch": 1.5175196432363558, + "grad_norm": 8.499707834047012, + "learning_rate": 7.247506749945118e-07, + "loss": 1.0213, + "step": 10719 + }, + { + "epoch": 1.517661216110993, + "grad_norm": 10.376849641576518, + "learning_rate": 7.243471311483322e-07, + "loss": 1.0541, + "step": 10720 + }, + { + "epoch": 1.5178027889856303, + "grad_norm": 12.028954224956289, + "learning_rate": 7.239436806468989e-07, + "loss": 1.0823, + "step": 10721 + }, + { + "epoch": 1.5179443618602675, + "grad_norm": 9.425691404202963, + "learning_rate": 7.235403235114227e-07, + "loss": 1.0524, + "step": 10722 + }, + { + "epoch": 1.5180859347349047, + "grad_norm": 11.37050934065909, + "learning_rate": 7.231370597631071e-07, + "loss": 1.0289, + "step": 10723 + }, + { + "epoch": 1.518227507609542, + "grad_norm": 7.8184447597136195, + "learning_rate": 7.227338894231509e-07, + "loss": 1.0203, + "step": 10724 + }, + { + "epoch": 1.5183690804841792, + "grad_norm": 9.350526731617391, + "learning_rate": 7.223308125127495e-07, + "loss": 0.9015, + "step": 10725 + }, + { + "epoch": 1.5185106533588164, + "grad_norm": 9.40655740127889, + "learning_rate": 7.219278290530909e-07, + "loss": 0.8697, + "step": 10726 + }, + { + "epoch": 1.5186522262334536, + "grad_norm": 8.66796875, + "learning_rate": 7.215249390653609e-07, + "loss": 0.9373, + "step": 10727 + }, + { + "epoch": 1.5187937991080909, + "grad_norm": 8.962354552978853, + "learning_rate": 7.211221425707393e-07, + "loss": 1.0046, + "step": 10728 + }, + { + "epoch": 1.518935371982728, + "grad_norm": 11.435318535561903, + "learning_rate": 7.207194395903997e-07, + "loss": 1.0444, + "step": 10729 + }, + { + "epoch": 1.5190769448573653, + "grad_norm": 12.227499714478759, + "learning_rate": 
7.203168301455129e-07, + "loss": 1.0014, + "step": 10730 + }, + { + "epoch": 1.5192185177320026, + "grad_norm": 8.944462978220102, + "learning_rate": 7.199143142572429e-07, + "loss": 0.9715, + "step": 10731 + }, + { + "epoch": 1.5193600906066398, + "grad_norm": 9.774896770429782, + "learning_rate": 7.195118919467506e-07, + "loss": 1.0474, + "step": 10732 + }, + { + "epoch": 1.519501663481277, + "grad_norm": 8.87468106408663, + "learning_rate": 7.191095632351908e-07, + "loss": 0.9354, + "step": 10733 + }, + { + "epoch": 1.5196432363559143, + "grad_norm": 8.18551495199275, + "learning_rate": 7.187073281437124e-07, + "loss": 0.9651, + "step": 10734 + }, + { + "epoch": 1.5197848092305515, + "grad_norm": 9.580693508841183, + "learning_rate": 7.183051866934626e-07, + "loss": 0.9761, + "step": 10735 + }, + { + "epoch": 1.5199263821051887, + "grad_norm": 7.841893436127159, + "learning_rate": 7.179031389055796e-07, + "loss": 0.9616, + "step": 10736 + }, + { + "epoch": 1.520067954979826, + "grad_norm": 8.763052659829524, + "learning_rate": 7.175011848012009e-07, + "loss": 1.0372, + "step": 10737 + }, + { + "epoch": 1.5202095278544632, + "grad_norm": 9.059067069350585, + "learning_rate": 7.170993244014548e-07, + "loss": 0.9439, + "step": 10738 + }, + { + "epoch": 1.5203511007291004, + "grad_norm": 9.133318727950323, + "learning_rate": 7.166975577274687e-07, + "loss": 1.0085, + "step": 10739 + }, + { + "epoch": 1.5204926736037376, + "grad_norm": 9.942418063072884, + "learning_rate": 7.162958848003615e-07, + "loss": 1.0268, + "step": 10740 + }, + { + "epoch": 1.5206342464783749, + "grad_norm": 9.668214169864825, + "learning_rate": 7.158943056412504e-07, + "loss": 1.033, + "step": 10741 + }, + { + "epoch": 1.520775819353012, + "grad_norm": 8.543415715473662, + "learning_rate": 7.154928202712447e-07, + "loss": 0.996, + "step": 10742 + }, + { + "epoch": 1.5209173922276493, + "grad_norm": 9.824541029753629, + "learning_rate": 7.150914287114513e-07, + "loss": 0.8967, + "step": 10743 + }, + { + "epoch": 1.5210589651022866, + "grad_norm": 9.284115127526794, + "learning_rate": 7.146901309829709e-07, + "loss": 0.9664, + "step": 10744 + }, + { + "epoch": 1.5212005379769238, + "grad_norm": 9.443670617608744, + "learning_rate": 7.142889271068984e-07, + "loss": 0.9824, + "step": 10745 + }, + { + "epoch": 1.5213421108515608, + "grad_norm": 9.57077645559077, + "learning_rate": 7.138878171043262e-07, + "loss": 0.9877, + "step": 10746 + }, + { + "epoch": 1.521483683726198, + "grad_norm": 10.485597087283846, + "learning_rate": 7.134868009963389e-07, + "loss": 0.9932, + "step": 10747 + }, + { + "epoch": 1.5216252566008353, + "grad_norm": 8.780587493775108, + "learning_rate": 7.13085878804019e-07, + "loss": 1.1221, + "step": 10748 + }, + { + "epoch": 1.5217668294754725, + "grad_norm": 9.542299163599763, + "learning_rate": 7.126850505484415e-07, + "loss": 0.9239, + "step": 10749 + }, + { + "epoch": 1.5219084023501097, + "grad_norm": 10.037614271147683, + "learning_rate": 7.122843162506781e-07, + "loss": 1.1384, + "step": 10750 + }, + { + "epoch": 1.522049975224747, + "grad_norm": 7.936968447691375, + "learning_rate": 7.118836759317963e-07, + "loss": 0.976, + "step": 10751 + }, + { + "epoch": 1.5221915480993842, + "grad_norm": 8.135317311349795, + "learning_rate": 7.114831296128552e-07, + "loss": 0.9636, + "step": 10752 + }, + { + "epoch": 1.5223331209740214, + "grad_norm": 9.869791323109053, + "learning_rate": 7.110826773149135e-07, + "loss": 0.9541, + "step": 10753 + }, + { + "epoch": 1.5224746938486586, + "grad_norm": 
9.005709638407659, + "learning_rate": 7.106823190590214e-07, + "loss": 1.1021, + "step": 10754 + }, + { + "epoch": 1.5226162667232959, + "grad_norm": 10.129800670956008, + "learning_rate": 7.102820548662257e-07, + "loss": 0.9987, + "step": 10755 + }, + { + "epoch": 1.5227578395979329, + "grad_norm": 9.787970138697561, + "learning_rate": 7.09881884757567e-07, + "loss": 0.9791, + "step": 10756 + }, + { + "epoch": 1.52289941247257, + "grad_norm": 11.095652758257891, + "learning_rate": 7.094818087540827e-07, + "loss": 1.0703, + "step": 10757 + }, + { + "epoch": 1.5230409853472073, + "grad_norm": 10.231261826719837, + "learning_rate": 7.090818268768057e-07, + "loss": 1.0468, + "step": 10758 + }, + { + "epoch": 1.5231825582218446, + "grad_norm": 10.047821142755124, + "learning_rate": 7.086819391467612e-07, + "loss": 0.9785, + "step": 10759 + }, + { + "epoch": 1.5233241310964818, + "grad_norm": 7.495455064147363, + "learning_rate": 7.082821455849717e-07, + "loss": 0.8861, + "step": 10760 + }, + { + "epoch": 1.523465703971119, + "grad_norm": 10.224370035104661, + "learning_rate": 7.078824462124534e-07, + "loss": 0.9594, + "step": 10761 + }, + { + "epoch": 1.5236072768457563, + "grad_norm": 8.077688651778931, + "learning_rate": 7.074828410502193e-07, + "loss": 0.9016, + "step": 10762 + }, + { + "epoch": 1.5237488497203935, + "grad_norm": 8.325352878365962, + "learning_rate": 7.07083330119275e-07, + "loss": 0.9562, + "step": 10763 + }, + { + "epoch": 1.5238904225950307, + "grad_norm": 9.847354095919862, + "learning_rate": 7.066839134406239e-07, + "loss": 0.9162, + "step": 10764 + }, + { + "epoch": 1.524031995469668, + "grad_norm": 9.0896448261968, + "learning_rate": 7.062845910352622e-07, + "loss": 1.0289, + "step": 10765 + }, + { + "epoch": 1.5241735683443052, + "grad_norm": 8.719110078109992, + "learning_rate": 7.058853629241816e-07, + "loss": 0.9337, + "step": 10766 + }, + { + "epoch": 1.5243151412189424, + "grad_norm": 6.870059093237851, + "learning_rate": 7.054862291283702e-07, + "loss": 0.9769, + "step": 10767 + }, + { + "epoch": 1.5244567140935796, + "grad_norm": 10.563303538800351, + "learning_rate": 7.050871896688091e-07, + "loss": 1.0638, + "step": 10768 + }, + { + "epoch": 1.5245982869682169, + "grad_norm": 10.03375612130649, + "learning_rate": 7.046882445664768e-07, + "loss": 0.8844, + "step": 10769 + }, + { + "epoch": 1.524739859842854, + "grad_norm": 9.752166116177449, + "learning_rate": 7.042893938423442e-07, + "loss": 0.9199, + "step": 10770 + }, + { + "epoch": 1.5248814327174913, + "grad_norm": 9.479233782743737, + "learning_rate": 7.038906375173799e-07, + "loss": 1.0425, + "step": 10771 + }, + { + "epoch": 1.5250230055921286, + "grad_norm": 8.761387417804515, + "learning_rate": 7.034919756125447e-07, + "loss": 0.8953, + "step": 10772 + }, + { + "epoch": 1.5251645784667658, + "grad_norm": 8.58439965399058, + "learning_rate": 7.030934081487969e-07, + "loss": 0.9808, + "step": 10773 + }, + { + "epoch": 1.525306151341403, + "grad_norm": 9.693266696552215, + "learning_rate": 7.026949351470894e-07, + "loss": 0.9943, + "step": 10774 + }, + { + "epoch": 1.5254477242160402, + "grad_norm": 10.047520072482245, + "learning_rate": 7.022965566283693e-07, + "loss": 0.9828, + "step": 10775 + }, + { + "epoch": 1.5255892970906775, + "grad_norm": 9.815073635546062, + "learning_rate": 7.018982726135787e-07, + "loss": 1.0293, + "step": 10776 + }, + { + "epoch": 1.5257308699653147, + "grad_norm": 7.856622232800528, + "learning_rate": 7.015000831236543e-07, + "loss": 0.9221, + "step": 10777 + }, + { + 
"epoch": 1.525872442839952, + "grad_norm": 10.840277651788773, + "learning_rate": 7.011019881795298e-07, + "loss": 1.0542, + "step": 10778 + }, + { + "epoch": 1.5260140157145892, + "grad_norm": 9.794719550191038, + "learning_rate": 7.00703987802133e-07, + "loss": 1.0875, + "step": 10779 + }, + { + "epoch": 1.5261555885892264, + "grad_norm": 8.869060409307478, + "learning_rate": 7.003060820123852e-07, + "loss": 1.0044, + "step": 10780 + }, + { + "epoch": 1.5262971614638636, + "grad_norm": 9.647814773046772, + "learning_rate": 6.999082708312055e-07, + "loss": 1.0955, + "step": 10781 + }, + { + "epoch": 1.5264387343385009, + "grad_norm": 8.266077270825965, + "learning_rate": 6.99510554279505e-07, + "loss": 0.8961, + "step": 10782 + }, + { + "epoch": 1.526580307213138, + "grad_norm": 9.466331651517574, + "learning_rate": 6.991129323781931e-07, + "loss": 1.0153, + "step": 10783 + }, + { + "epoch": 1.5267218800877753, + "grad_norm": 10.364976200089316, + "learning_rate": 6.987154051481707e-07, + "loss": 0.9216, + "step": 10784 + }, + { + "epoch": 1.5268634529624125, + "grad_norm": 8.170569233481906, + "learning_rate": 6.98317972610337e-07, + "loss": 0.8875, + "step": 10785 + }, + { + "epoch": 1.5270050258370498, + "grad_norm": 8.042422824023312, + "learning_rate": 6.979206347855843e-07, + "loss": 0.8476, + "step": 10786 + }, + { + "epoch": 1.5271465987116868, + "grad_norm": 8.983192432362532, + "learning_rate": 6.975233916947993e-07, + "loss": 0.9151, + "step": 10787 + }, + { + "epoch": 1.527288171586324, + "grad_norm": 11.776600696916296, + "learning_rate": 6.971262433588663e-07, + "loss": 0.9816, + "step": 10788 + }, + { + "epoch": 1.5274297444609612, + "grad_norm": 10.66040995999388, + "learning_rate": 6.967291897986617e-07, + "loss": 0.9557, + "step": 10789 + }, + { + "epoch": 1.5275713173355985, + "grad_norm": 10.69497472816253, + "learning_rate": 6.963322310350598e-07, + "loss": 0.9694, + "step": 10790 + }, + { + "epoch": 1.5277128902102357, + "grad_norm": 10.802401198023817, + "learning_rate": 6.959353670889269e-07, + "loss": 1.0117, + "step": 10791 + }, + { + "epoch": 1.527854463084873, + "grad_norm": 10.52829309263073, + "learning_rate": 6.955385979811275e-07, + "loss": 0.8988, + "step": 10792 + }, + { + "epoch": 1.5279960359595102, + "grad_norm": 8.60872452454213, + "learning_rate": 6.951419237325174e-07, + "loss": 0.9314, + "step": 10793 + }, + { + "epoch": 1.5281376088341474, + "grad_norm": 8.748328785017268, + "learning_rate": 6.947453443639515e-07, + "loss": 0.8901, + "step": 10794 + }, + { + "epoch": 1.5282791817087846, + "grad_norm": 9.004135029631438, + "learning_rate": 6.943488598962761e-07, + "loss": 0.9501, + "step": 10795 + }, + { + "epoch": 1.5284207545834219, + "grad_norm": 8.135778232669072, + "learning_rate": 6.939524703503356e-07, + "loss": 0.8712, + "step": 10796 + }, + { + "epoch": 1.528562327458059, + "grad_norm": 10.540163880436815, + "learning_rate": 6.93556175746967e-07, + "loss": 1.0037, + "step": 10797 + }, + { + "epoch": 1.528703900332696, + "grad_norm": 10.36177304505397, + "learning_rate": 6.931599761070027e-07, + "loss": 1.0155, + "step": 10798 + }, + { + "epoch": 1.5288454732073333, + "grad_norm": 7.559920446391067, + "learning_rate": 6.927638714512716e-07, + "loss": 0.8965, + "step": 10799 + }, + { + "epoch": 1.5289870460819706, + "grad_norm": 7.159913836765241, + "learning_rate": 6.923678618005958e-07, + "loss": 0.9671, + "step": 10800 + }, + { + "epoch": 1.5291286189566078, + "grad_norm": 9.311738872786378, + "learning_rate": 6.919719471757938e-07, + 
"loss": 0.9886, + "step": 10801 + }, + { + "epoch": 1.529270191831245, + "grad_norm": 8.97139730161679, + "learning_rate": 6.915761275976787e-07, + "loss": 1.0124, + "step": 10802 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 10.403616173820884, + "learning_rate": 6.911804030870578e-07, + "loss": 1.0548, + "step": 10803 + }, + { + "epoch": 1.5295533375805195, + "grad_norm": 9.188360264784395, + "learning_rate": 6.90784773664735e-07, + "loss": 1.0113, + "step": 10804 + }, + { + "epoch": 1.5296949104551567, + "grad_norm": 9.433365276089383, + "learning_rate": 6.903892393515068e-07, + "loss": 0.9648, + "step": 10805 + }, + { + "epoch": 1.529836483329794, + "grad_norm": 9.209918637988034, + "learning_rate": 6.89993800168168e-07, + "loss": 0.9521, + "step": 10806 + }, + { + "epoch": 1.5299780562044312, + "grad_norm": 10.447614143735724, + "learning_rate": 6.895984561355054e-07, + "loss": 1.0176, + "step": 10807 + }, + { + "epoch": 1.5301196290790684, + "grad_norm": 9.991995087084085, + "learning_rate": 6.892032072743013e-07, + "loss": 1.0487, + "step": 10808 + }, + { + "epoch": 1.5302612019537056, + "grad_norm": 9.355513602449895, + "learning_rate": 6.888080536053351e-07, + "loss": 0.9612, + "step": 10809 + }, + { + "epoch": 1.5304027748283429, + "grad_norm": 9.978983347144043, + "learning_rate": 6.884129951493785e-07, + "loss": 1.0236, + "step": 10810 + }, + { + "epoch": 1.53054434770298, + "grad_norm": 10.853871835132225, + "learning_rate": 6.880180319272006e-07, + "loss": 1.0297, + "step": 10811 + }, + { + "epoch": 1.5306859205776173, + "grad_norm": 9.319859815861218, + "learning_rate": 6.876231639595629e-07, + "loss": 0.9977, + "step": 10812 + }, + { + "epoch": 1.5308274934522546, + "grad_norm": 10.813767904571714, + "learning_rate": 6.872283912672247e-07, + "loss": 0.9525, + "step": 10813 + }, + { + "epoch": 1.5309690663268918, + "grad_norm": 8.379277745261017, + "learning_rate": 6.868337138709377e-07, + "loss": 1.0198, + "step": 10814 + }, + { + "epoch": 1.531110639201529, + "grad_norm": 11.248833150750258, + "learning_rate": 6.864391317914512e-07, + "loss": 0.9318, + "step": 10815 + }, + { + "epoch": 1.5312522120761662, + "grad_norm": 10.459301866971504, + "learning_rate": 6.860446450495068e-07, + "loss": 0.998, + "step": 10816 + }, + { + "epoch": 1.5313937849508035, + "grad_norm": 8.926102206346538, + "learning_rate": 6.856502536658433e-07, + "loss": 0.9408, + "step": 10817 + }, + { + "epoch": 1.5315353578254407, + "grad_norm": 9.941443854300195, + "learning_rate": 6.852559576611931e-07, + "loss": 1.0553, + "step": 10818 + }, + { + "epoch": 1.531676930700078, + "grad_norm": 9.167251799432544, + "learning_rate": 6.848617570562832e-07, + "loss": 0.9191, + "step": 10819 + }, + { + "epoch": 1.5318185035747152, + "grad_norm": 8.678535501718585, + "learning_rate": 6.844676518718385e-07, + "loss": 1.0511, + "step": 10820 + }, + { + "epoch": 1.5319600764493524, + "grad_norm": 9.567731635422314, + "learning_rate": 6.840736421285746e-07, + "loss": 0.9951, + "step": 10821 + }, + { + "epoch": 1.5321016493239896, + "grad_norm": 8.130437352052136, + "learning_rate": 6.83679727847206e-07, + "loss": 0.8765, + "step": 10822 + }, + { + "epoch": 1.5322432221986269, + "grad_norm": 9.986415409786257, + "learning_rate": 6.832859090484392e-07, + "loss": 1.0369, + "step": 10823 + }, + { + "epoch": 1.532384795073264, + "grad_norm": 8.939291607741756, + "learning_rate": 6.828921857529774e-07, + "loss": 1.0274, + "step": 10824 + }, + { + "epoch": 1.5325263679479013, + "grad_norm": 9.380736363510046, + 
"learning_rate": 6.824985579815194e-07, + "loss": 1.0957, + "step": 10825 + }, + { + "epoch": 1.5326679408225385, + "grad_norm": 9.147482153496401, + "learning_rate": 6.821050257547562e-07, + "loss": 0.9641, + "step": 10826 + }, + { + "epoch": 1.5328095136971758, + "grad_norm": 10.0977639168583, + "learning_rate": 6.817115890933773e-07, + "loss": 0.961, + "step": 10827 + }, + { + "epoch": 1.532951086571813, + "grad_norm": 8.902517721259741, + "learning_rate": 6.813182480180641e-07, + "loss": 1.1087, + "step": 10828 + }, + { + "epoch": 1.53309265944645, + "grad_norm": 7.974924365794118, + "learning_rate": 6.809250025494946e-07, + "loss": 0.9622, + "step": 10829 + }, + { + "epoch": 1.5332342323210872, + "grad_norm": 9.649258252034356, + "learning_rate": 6.805318527083407e-07, + "loss": 0.9455, + "step": 10830 + }, + { + "epoch": 1.5333758051957245, + "grad_norm": 9.316166712722474, + "learning_rate": 6.801387985152705e-07, + "loss": 0.9668, + "step": 10831 + }, + { + "epoch": 1.5335173780703617, + "grad_norm": 8.696836874326914, + "learning_rate": 6.797458399909476e-07, + "loss": 1.0449, + "step": 10832 + }, + { + "epoch": 1.533658950944999, + "grad_norm": 8.97688504030521, + "learning_rate": 6.793529771560278e-07, + "loss": 0.9557, + "step": 10833 + }, + { + "epoch": 1.5338005238196362, + "grad_norm": 8.28589289806259, + "learning_rate": 6.789602100311654e-07, + "loss": 0.9862, + "step": 10834 + }, + { + "epoch": 1.5339420966942734, + "grad_norm": 10.620290632563554, + "learning_rate": 6.785675386370061e-07, + "loss": 1.0876, + "step": 10835 + }, + { + "epoch": 1.5340836695689106, + "grad_norm": 9.628459754480586, + "learning_rate": 6.781749629941938e-07, + "loss": 1.0032, + "step": 10836 + }, + { + "epoch": 1.5342252424435479, + "grad_norm": 10.801080748560029, + "learning_rate": 6.777824831233645e-07, + "loss": 1.0052, + "step": 10837 + }, + { + "epoch": 1.534366815318185, + "grad_norm": 9.032645886528806, + "learning_rate": 6.773900990451523e-07, + "loss": 1.0284, + "step": 10838 + }, + { + "epoch": 1.534508388192822, + "grad_norm": 9.605347622126674, + "learning_rate": 6.769978107801837e-07, + "loss": 1.0819, + "step": 10839 + }, + { + "epoch": 1.5346499610674593, + "grad_norm": 9.996487954443793, + "learning_rate": 6.766056183490799e-07, + "loss": 0.9396, + "step": 10840 + }, + { + "epoch": 1.5347915339420966, + "grad_norm": 9.401932748988576, + "learning_rate": 6.7621352177246e-07, + "loss": 1.0378, + "step": 10841 + }, + { + "epoch": 1.5349331068167338, + "grad_norm": 9.557202436288724, + "learning_rate": 6.758215210709345e-07, + "loss": 1.0325, + "step": 10842 + }, + { + "epoch": 1.535074679691371, + "grad_norm": 9.565265330469677, + "learning_rate": 6.754296162651122e-07, + "loss": 0.9008, + "step": 10843 + }, + { + "epoch": 1.5352162525660082, + "grad_norm": 9.70160814306182, + "learning_rate": 6.750378073755939e-07, + "loss": 1.0011, + "step": 10844 + }, + { + "epoch": 1.5353578254406455, + "grad_norm": 9.16278905061309, + "learning_rate": 6.746460944229783e-07, + "loss": 0.9835, + "step": 10845 + }, + { + "epoch": 1.5354993983152827, + "grad_norm": 11.057128636719609, + "learning_rate": 6.742544774278553e-07, + "loss": 0.9851, + "step": 10846 + }, + { + "epoch": 1.53564097118992, + "grad_norm": 8.2230519686549, + "learning_rate": 6.738629564108134e-07, + "loss": 0.9544, + "step": 10847 + }, + { + "epoch": 1.5357825440645572, + "grad_norm": 10.111025738850595, + "learning_rate": 6.734715313924348e-07, + "loss": 0.9746, + "step": 10848 + }, + { + "epoch": 1.5359241169391944, + 
"grad_norm": 10.205109337571885, + "learning_rate": 6.730802023932962e-07, + "loss": 1.0783, + "step": 10849 + }, + { + "epoch": 1.5360656898138316, + "grad_norm": 8.928834355433935, + "learning_rate": 6.726889694339689e-07, + "loss": 0.8882, + "step": 10850 + }, + { + "epoch": 1.5362072626884689, + "grad_norm": 9.359384781923103, + "learning_rate": 6.72297832535019e-07, + "loss": 0.9998, + "step": 10851 + }, + { + "epoch": 1.536348835563106, + "grad_norm": 9.48416544700029, + "learning_rate": 6.719067917170105e-07, + "loss": 0.9895, + "step": 10852 + }, + { + "epoch": 1.5364904084377433, + "grad_norm": 8.695427440558827, + "learning_rate": 6.715158470004979e-07, + "loss": 0.9843, + "step": 10853 + }, + { + "epoch": 1.5366319813123805, + "grad_norm": 9.159891387428793, + "learning_rate": 6.711249984060337e-07, + "loss": 0.8669, + "step": 10854 + }, + { + "epoch": 1.5367735541870178, + "grad_norm": 9.510289693084365, + "learning_rate": 6.707342459541655e-07, + "loss": 0.9905, + "step": 10855 + }, + { + "epoch": 1.536915127061655, + "grad_norm": 9.248549888973304, + "learning_rate": 6.703435896654334e-07, + "loss": 1.0953, + "step": 10856 + }, + { + "epoch": 1.5370566999362922, + "grad_norm": 8.837643081823298, + "learning_rate": 6.699530295603751e-07, + "loss": 0.9012, + "step": 10857 + }, + { + "epoch": 1.5371982728109295, + "grad_norm": 10.532984921773346, + "learning_rate": 6.695625656595209e-07, + "loss": 1.0144, + "step": 10858 + }, + { + "epoch": 1.5373398456855667, + "grad_norm": 9.231552699659035, + "learning_rate": 6.691721979833984e-07, + "loss": 1.0992, + "step": 10859 + }, + { + "epoch": 1.537481418560204, + "grad_norm": 11.073272115625068, + "learning_rate": 6.687819265525286e-07, + "loss": 0.9751, + "step": 10860 + }, + { + "epoch": 1.5376229914348412, + "grad_norm": 7.801448482121089, + "learning_rate": 6.683917513874266e-07, + "loss": 0.9307, + "step": 10861 + }, + { + "epoch": 1.5377645643094784, + "grad_norm": 10.446319326125298, + "learning_rate": 6.680016725086053e-07, + "loss": 0.9453, + "step": 10862 + }, + { + "epoch": 1.5379061371841156, + "grad_norm": 9.89363406722486, + "learning_rate": 6.676116899365692e-07, + "loss": 0.9793, + "step": 10863 + }, + { + "epoch": 1.5380477100587528, + "grad_norm": 9.296043243386412, + "learning_rate": 6.67221803691821e-07, + "loss": 0.9917, + "step": 10864 + }, + { + "epoch": 1.53818928293339, + "grad_norm": 7.130739543809631, + "learning_rate": 6.668320137948556e-07, + "loss": 0.9053, + "step": 10865 + }, + { + "epoch": 1.5383308558080273, + "grad_norm": 8.084883026805576, + "learning_rate": 6.664423202661649e-07, + "loss": 0.9742, + "step": 10866 + }, + { + "epoch": 1.5384724286826645, + "grad_norm": 9.289917055546072, + "learning_rate": 6.660527231262334e-07, + "loss": 0.9804, + "step": 10867 + }, + { + "epoch": 1.5386140015573018, + "grad_norm": 9.503671037214712, + "learning_rate": 6.656632223955437e-07, + "loss": 0.8393, + "step": 10868 + }, + { + "epoch": 1.538755574431939, + "grad_norm": 9.983010546561289, + "learning_rate": 6.652738180945698e-07, + "loss": 0.9962, + "step": 10869 + }, + { + "epoch": 1.538897147306576, + "grad_norm": 9.00616540597741, + "learning_rate": 6.648845102437839e-07, + "loss": 0.9351, + "step": 10870 + }, + { + "epoch": 1.5390387201812132, + "grad_norm": 9.373605853370192, + "learning_rate": 6.644952988636514e-07, + "loss": 0.9496, + "step": 10871 + }, + { + "epoch": 1.5391802930558505, + "grad_norm": 9.84531547571769, + "learning_rate": 6.641061839746313e-07, + "loss": 0.9894, + "step": 10872 + }, 
+ { + "epoch": 1.5393218659304877, + "grad_norm": 10.18551897317607, + "learning_rate": 6.637171655971811e-07, + "loss": 1.055, + "step": 10873 + }, + { + "epoch": 1.539463438805125, + "grad_norm": 9.850855028249967, + "learning_rate": 6.633282437517496e-07, + "loss": 0.8661, + "step": 10874 + }, + { + "epoch": 1.5396050116797622, + "grad_norm": 9.547986154777163, + "learning_rate": 6.629394184587826e-07, + "loss": 0.9255, + "step": 10875 + }, + { + "epoch": 1.5397465845543994, + "grad_norm": 9.376732017740483, + "learning_rate": 6.625506897387215e-07, + "loss": 1.1049, + "step": 10876 + }, + { + "epoch": 1.5398881574290366, + "grad_norm": 11.034250515927313, + "learning_rate": 6.621620576119999e-07, + "loss": 0.9517, + "step": 10877 + }, + { + "epoch": 1.5400297303036738, + "grad_norm": 7.289286226721266, + "learning_rate": 6.617735220990495e-07, + "loss": 0.9117, + "step": 10878 + }, + { + "epoch": 1.540171303178311, + "grad_norm": 10.595234904689319, + "learning_rate": 6.613850832202934e-07, + "loss": 0.8983, + "step": 10879 + }, + { + "epoch": 1.5403128760529483, + "grad_norm": 9.474598302856343, + "learning_rate": 6.609967409961531e-07, + "loss": 1.0366, + "step": 10880 + }, + { + "epoch": 1.5404544489275853, + "grad_norm": 8.436438373605334, + "learning_rate": 6.606084954470434e-07, + "loss": 0.9311, + "step": 10881 + }, + { + "epoch": 1.5405960218022225, + "grad_norm": 9.570835046321022, + "learning_rate": 6.602203465933727e-07, + "loss": 1.0173, + "step": 10882 + }, + { + "epoch": 1.5407375946768598, + "grad_norm": 10.115352979790755, + "learning_rate": 6.598322944555471e-07, + "loss": 0.9074, + "step": 10883 + }, + { + "epoch": 1.540879167551497, + "grad_norm": 10.001754225407122, + "learning_rate": 6.594443390539651e-07, + "loss": 1.0526, + "step": 10884 + }, + { + "epoch": 1.5410207404261342, + "grad_norm": 7.6432781332723065, + "learning_rate": 6.590564804090224e-07, + "loss": 0.946, + "step": 10885 + }, + { + "epoch": 1.5411623133007715, + "grad_norm": 9.991570925684723, + "learning_rate": 6.586687185411073e-07, + "loss": 1.0508, + "step": 10886 + }, + { + "epoch": 1.5413038861754087, + "grad_norm": 8.987527894466231, + "learning_rate": 6.582810534706055e-07, + "loss": 0.9978, + "step": 10887 + }, + { + "epoch": 1.541445459050046, + "grad_norm": 10.544540395546898, + "learning_rate": 6.578934852178945e-07, + "loss": 1.0092, + "step": 10888 + }, + { + "epoch": 1.5415870319246832, + "grad_norm": 8.424059752568459, + "learning_rate": 6.575060138033504e-07, + "loss": 1.0183, + "step": 10889 + }, + { + "epoch": 1.5417286047993204, + "grad_norm": 8.184353522308589, + "learning_rate": 6.571186392473406e-07, + "loss": 0.8266, + "step": 10890 + }, + { + "epoch": 1.5418701776739576, + "grad_norm": 8.337625949257172, + "learning_rate": 6.567313615702304e-07, + "loss": 0.8927, + "step": 10891 + }, + { + "epoch": 1.5420117505485949, + "grad_norm": 10.87657311998925, + "learning_rate": 6.563441807923782e-07, + "loss": 1.0264, + "step": 10892 + }, + { + "epoch": 1.542153323423232, + "grad_norm": 8.732156051995764, + "learning_rate": 6.559570969341369e-07, + "loss": 0.9762, + "step": 10893 + }, + { + "epoch": 1.5422948962978693, + "grad_norm": 11.47426478870896, + "learning_rate": 6.555701100158571e-07, + "loss": 0.9076, + "step": 10894 + }, + { + "epoch": 1.5424364691725065, + "grad_norm": 11.34105269886175, + "learning_rate": 6.551832200578803e-07, + "loss": 0.9349, + "step": 10895 + }, + { + "epoch": 1.5425780420471438, + "grad_norm": 9.906314308126243, + "learning_rate": 
6.547964270805468e-07, + "loss": 0.9732, + "step": 10896 + }, + { + "epoch": 1.542719614921781, + "grad_norm": 10.665167782693707, + "learning_rate": 6.544097311041888e-07, + "loss": 1.0259, + "step": 10897 + }, + { + "epoch": 1.5428611877964182, + "grad_norm": 10.891444609133726, + "learning_rate": 6.54023132149135e-07, + "loss": 0.9673, + "step": 10898 + }, + { + "epoch": 1.5430027606710555, + "grad_norm": 9.229732269338687, + "learning_rate": 6.536366302357094e-07, + "loss": 0.9385, + "step": 10899 + }, + { + "epoch": 1.5431443335456927, + "grad_norm": 10.016895136287827, + "learning_rate": 6.532502253842288e-07, + "loss": 1.0921, + "step": 10900 + }, + { + "epoch": 1.54328590642033, + "grad_norm": 9.635935726447055, + "learning_rate": 6.528639176150072e-07, + "loss": 1.0077, + "step": 10901 + }, + { + "epoch": 1.5434274792949672, + "grad_norm": 10.056101405419211, + "learning_rate": 6.524777069483526e-07, + "loss": 1.0027, + "step": 10902 + }, + { + "epoch": 1.5435690521696044, + "grad_norm": 8.812238107672114, + "learning_rate": 6.520915934045674e-07, + "loss": 0.9952, + "step": 10903 + }, + { + "epoch": 1.5437106250442416, + "grad_norm": 6.97700948447367, + "learning_rate": 6.517055770039482e-07, + "loss": 0.9693, + "step": 10904 + }, + { + "epoch": 1.5438521979188788, + "grad_norm": 9.002387683764184, + "learning_rate": 6.51319657766789e-07, + "loss": 0.9229, + "step": 10905 + }, + { + "epoch": 1.543993770793516, + "grad_norm": 9.631455275559226, + "learning_rate": 6.509338357133776e-07, + "loss": 1.0499, + "step": 10906 + }, + { + "epoch": 1.5441353436681533, + "grad_norm": 9.59222618700457, + "learning_rate": 6.50548110863995e-07, + "loss": 1.0389, + "step": 10907 + }, + { + "epoch": 1.5442769165427905, + "grad_norm": 10.714349583026712, + "learning_rate": 6.501624832389197e-07, + "loss": 0.9874, + "step": 10908 + }, + { + "epoch": 1.5444184894174278, + "grad_norm": 9.273804012212766, + "learning_rate": 6.497769528584227e-07, + "loss": 0.9108, + "step": 10909 + }, + { + "epoch": 1.544560062292065, + "grad_norm": 8.57599595405711, + "learning_rate": 6.493915197427727e-07, + "loss": 1.0103, + "step": 10910 + }, + { + "epoch": 1.5447016351667022, + "grad_norm": 9.205806425742516, + "learning_rate": 6.490061839122297e-07, + "loss": 0.937, + "step": 10911 + }, + { + "epoch": 1.5448432080413392, + "grad_norm": 9.807208730245996, + "learning_rate": 6.486209453870523e-07, + "loss": 1.0145, + "step": 10912 + }, + { + "epoch": 1.5449847809159765, + "grad_norm": 8.894613165932627, + "learning_rate": 6.482358041874914e-07, + "loss": 0.8484, + "step": 10913 + }, + { + "epoch": 1.5451263537906137, + "grad_norm": 10.205772068212223, + "learning_rate": 6.478507603337928e-07, + "loss": 0.8997, + "step": 10914 + }, + { + "epoch": 1.545267926665251, + "grad_norm": 10.777728138532954, + "learning_rate": 6.474658138461992e-07, + "loss": 1.0574, + "step": 10915 + }, + { + "epoch": 1.5454094995398882, + "grad_norm": 8.553084271145492, + "learning_rate": 6.470809647449458e-07, + "loss": 0.971, + "step": 10916 + }, + { + "epoch": 1.5455510724145254, + "grad_norm": 8.157687122701786, + "learning_rate": 6.466962130502655e-07, + "loss": 0.8774, + "step": 10917 + }, + { + "epoch": 1.5456926452891626, + "grad_norm": 9.854077543752194, + "learning_rate": 6.463115587823824e-07, + "loss": 1.0206, + "step": 10918 + }, + { + "epoch": 1.5458342181637998, + "grad_norm": 10.321133785214911, + "learning_rate": 6.459270019615191e-07, + "loss": 0.967, + "step": 10919 + }, + { + "epoch": 1.545975791038437, + "grad_norm": 
8.520796075442561, + "learning_rate": 6.455425426078904e-07, + "loss": 0.9514, + "step": 10920 + }, + { + "epoch": 1.5461173639130743, + "grad_norm": 9.216913382991635, + "learning_rate": 6.451581807417074e-07, + "loss": 0.948, + "step": 10921 + }, + { + "epoch": 1.5462589367877113, + "grad_norm": 9.156118763057995, + "learning_rate": 6.447739163831765e-07, + "loss": 0.9701, + "step": 10922 + }, + { + "epoch": 1.5464005096623485, + "grad_norm": 8.932247725991289, + "learning_rate": 6.443897495524976e-07, + "loss": 1.0053, + "step": 10923 + }, + { + "epoch": 1.5465420825369858, + "grad_norm": 9.691651863574473, + "learning_rate": 6.440056802698658e-07, + "loss": 0.9697, + "step": 10924 + }, + { + "epoch": 1.546683655411623, + "grad_norm": 9.003681277693751, + "learning_rate": 6.436217085554708e-07, + "loss": 1.0305, + "step": 10925 + }, + { + "epoch": 1.5468252282862602, + "grad_norm": 8.898332469611713, + "learning_rate": 6.432378344294992e-07, + "loss": 1.0101, + "step": 10926 + }, + { + "epoch": 1.5469668011608975, + "grad_norm": 7.879788350782335, + "learning_rate": 6.428540579121296e-07, + "loss": 0.967, + "step": 10927 + }, + { + "epoch": 1.5471083740355347, + "grad_norm": 8.794528682152952, + "learning_rate": 6.424703790235374e-07, + "loss": 0.9169, + "step": 10928 + }, + { + "epoch": 1.547249946910172, + "grad_norm": 9.892397464108011, + "learning_rate": 6.420867977838929e-07, + "loss": 0.9802, + "step": 10929 + }, + { + "epoch": 1.5473915197848092, + "grad_norm": 8.204289816853484, + "learning_rate": 6.417033142133594e-07, + "loss": 1.0065, + "step": 10930 + }, + { + "epoch": 1.5475330926594464, + "grad_norm": 8.79675271201636, + "learning_rate": 6.413199283320979e-07, + "loss": 0.9693, + "step": 10931 + }, + { + "epoch": 1.5476746655340836, + "grad_norm": 11.908062654217009, + "learning_rate": 6.40936640160261e-07, + "loss": 1.1407, + "step": 10932 + }, + { + "epoch": 1.5478162384087208, + "grad_norm": 10.076148591936175, + "learning_rate": 6.405534497179996e-07, + "loss": 1.0452, + "step": 10933 + }, + { + "epoch": 1.547957811283358, + "grad_norm": 10.339551551878124, + "learning_rate": 6.401703570254569e-07, + "loss": 1.1995, + "step": 10934 + }, + { + "epoch": 1.5480993841579953, + "grad_norm": 9.921302118057561, + "learning_rate": 6.397873621027711e-07, + "loss": 1.0271, + "step": 10935 + }, + { + "epoch": 1.5482409570326325, + "grad_norm": 8.08364909495889, + "learning_rate": 6.394044649700773e-07, + "loss": 0.8938, + "step": 10936 + }, + { + "epoch": 1.5483825299072698, + "grad_norm": 10.234177821931665, + "learning_rate": 6.390216656475027e-07, + "loss": 1.0561, + "step": 10937 + }, + { + "epoch": 1.548524102781907, + "grad_norm": 10.56781197326787, + "learning_rate": 6.386389641551721e-07, + "loss": 1.0073, + "step": 10938 + }, + { + "epoch": 1.5486656756565442, + "grad_norm": 8.83129868976537, + "learning_rate": 6.382563605132027e-07, + "loss": 1.0433, + "step": 10939 + }, + { + "epoch": 1.5488072485311815, + "grad_norm": 9.38195624085262, + "learning_rate": 6.37873854741709e-07, + "loss": 1.0003, + "step": 10940 + }, + { + "epoch": 1.5489488214058187, + "grad_norm": 9.249675023324299, + "learning_rate": 6.374914468607976e-07, + "loss": 1.0807, + "step": 10941 + }, + { + "epoch": 1.549090394280456, + "grad_norm": 9.058403825763426, + "learning_rate": 6.37109136890573e-07, + "loss": 0.9751, + "step": 10942 + }, + { + "epoch": 1.5492319671550931, + "grad_norm": 8.783830908254364, + "learning_rate": 6.367269248511309e-07, + "loss": 0.9837, + "step": 10943 + }, + { + "epoch": 
1.5493735400297304, + "grad_norm": 8.533628027516299, + "learning_rate": 6.363448107625653e-07, + "loss": 0.9746, + "step": 10944 + }, + { + "epoch": 1.5495151129043676, + "grad_norm": 10.13352174416442, + "learning_rate": 6.359627946449648e-07, + "loss": 1.0401, + "step": 10945 + }, + { + "epoch": 1.5496566857790048, + "grad_norm": 7.127665506325922, + "learning_rate": 6.355808765184088e-07, + "loss": 0.977, + "step": 10946 + }, + { + "epoch": 1.549798258653642, + "grad_norm": 10.614545278633841, + "learning_rate": 6.351990564029767e-07, + "loss": 1.0025, + "step": 10947 + }, + { + "epoch": 1.5499398315282793, + "grad_norm": 9.664096052008247, + "learning_rate": 6.348173343187392e-07, + "loss": 1.0164, + "step": 10948 + }, + { + "epoch": 1.5500814044029165, + "grad_norm": 8.408520019797768, + "learning_rate": 6.344357102857643e-07, + "loss": 0.9125, + "step": 10949 + }, + { + "epoch": 1.5502229772775538, + "grad_norm": 9.244902159697679, + "learning_rate": 6.340541843241124e-07, + "loss": 0.9836, + "step": 10950 + }, + { + "epoch": 1.550364550152191, + "grad_norm": 10.448257111124144, + "learning_rate": 6.336727564538406e-07, + "loss": 0.9796, + "step": 10951 + }, + { + "epoch": 1.5505061230268282, + "grad_norm": 9.206780166018445, + "learning_rate": 6.332914266950011e-07, + "loss": 0.9816, + "step": 10952 + }, + { + "epoch": 1.5506476959014652, + "grad_norm": 7.8665830650844235, + "learning_rate": 6.329101950676389e-07, + "loss": 1.0046, + "step": 10953 + }, + { + "epoch": 1.5507892687761025, + "grad_norm": 8.8572067486968, + "learning_rate": 6.325290615917961e-07, + "loss": 1.0181, + "step": 10954 + }, + { + "epoch": 1.5509308416507397, + "grad_norm": 11.650285028073414, + "learning_rate": 6.321480262875082e-07, + "loss": 1.142, + "step": 10955 + }, + { + "epoch": 1.551072414525377, + "grad_norm": 8.922968648764598, + "learning_rate": 6.317670891748051e-07, + "loss": 0.9039, + "step": 10956 + }, + { + "epoch": 1.5512139874000141, + "grad_norm": 8.629540616535818, + "learning_rate": 6.313862502737139e-07, + "loss": 0.9073, + "step": 10957 + }, + { + "epoch": 1.5513555602746514, + "grad_norm": 8.99196096179674, + "learning_rate": 6.310055096042533e-07, + "loss": 1.0911, + "step": 10958 + }, + { + "epoch": 1.5514971331492886, + "grad_norm": 9.309496094273678, + "learning_rate": 6.306248671864404e-07, + "loss": 1.0205, + "step": 10959 + }, + { + "epoch": 1.5516387060239258, + "grad_norm": 10.592346278451116, + "learning_rate": 6.302443230402836e-07, + "loss": 1.0214, + "step": 10960 + }, + { + "epoch": 1.551780278898563, + "grad_norm": 8.73609997361657, + "learning_rate": 6.298638771857893e-07, + "loss": 0.875, + "step": 10961 + }, + { + "epoch": 1.5519218517732003, + "grad_norm": 9.10901276013792, + "learning_rate": 6.294835296429558e-07, + "loss": 0.9363, + "step": 10962 + }, + { + "epoch": 1.5520634246478373, + "grad_norm": 10.19852529598719, + "learning_rate": 6.291032804317789e-07, + "loss": 1.085, + "step": 10963 + }, + { + "epoch": 1.5522049975224745, + "grad_norm": 10.98751851189869, + "learning_rate": 6.28723129572247e-07, + "loss": 0.9899, + "step": 10964 + }, + { + "epoch": 1.5523465703971118, + "grad_norm": 9.63959179848939, + "learning_rate": 6.28343077084346e-07, + "loss": 0.9675, + "step": 10965 + }, + { + "epoch": 1.552488143271749, + "grad_norm": 8.740613862222524, + "learning_rate": 6.279631229880534e-07, + "loss": 0.9003, + "step": 10966 + }, + { + "epoch": 1.5526297161463862, + "grad_norm": 7.939882176291325, + "learning_rate": 6.27583267303343e-07, + "loss": 0.9581, + 
"step": 10967 + }, + { + "epoch": 1.5527712890210235, + "grad_norm": 10.028326731620426, + "learning_rate": 6.272035100501849e-07, + "loss": 1.0365, + "step": 10968 + }, + { + "epoch": 1.5529128618956607, + "grad_norm": 8.035944772472648, + "learning_rate": 6.268238512485412e-07, + "loss": 0.9533, + "step": 10969 + }, + { + "epoch": 1.553054434770298, + "grad_norm": 8.541992578258965, + "learning_rate": 6.264442909183715e-07, + "loss": 0.9482, + "step": 10970 + }, + { + "epoch": 1.5531960076449352, + "grad_norm": 11.000963689032082, + "learning_rate": 6.260648290796278e-07, + "loss": 0.9909, + "step": 10971 + }, + { + "epoch": 1.5533375805195724, + "grad_norm": 9.519800532198747, + "learning_rate": 6.256854657522587e-07, + "loss": 0.9916, + "step": 10972 + }, + { + "epoch": 1.5534791533942096, + "grad_norm": 10.355162994118713, + "learning_rate": 6.253062009562078e-07, + "loss": 1.0223, + "step": 10973 + }, + { + "epoch": 1.5536207262688468, + "grad_norm": 9.29872618124386, + "learning_rate": 6.249270347114114e-07, + "loss": 0.9936, + "step": 10974 + }, + { + "epoch": 1.553762299143484, + "grad_norm": 10.249158126869265, + "learning_rate": 6.245479670378036e-07, + "loss": 0.9408, + "step": 10975 + }, + { + "epoch": 1.5539038720181213, + "grad_norm": 10.087463878013857, + "learning_rate": 6.241689979553106e-07, + "loss": 1.0064, + "step": 10976 + }, + { + "epoch": 1.5540454448927585, + "grad_norm": 8.777850922012862, + "learning_rate": 6.237901274838546e-07, + "loss": 0.9223, + "step": 10977 + }, + { + "epoch": 1.5541870177673958, + "grad_norm": 9.546153738993317, + "learning_rate": 6.234113556433522e-07, + "loss": 1.0455, + "step": 10978 + }, + { + "epoch": 1.554328590642033, + "grad_norm": 10.500862086191907, + "learning_rate": 6.230326824537153e-07, + "loss": 1.0624, + "step": 10979 + }, + { + "epoch": 1.5544701635166702, + "grad_norm": 8.741090871092647, + "learning_rate": 6.226541079348517e-07, + "loss": 0.9611, + "step": 10980 + }, + { + "epoch": 1.5546117363913075, + "grad_norm": 9.492475239980383, + "learning_rate": 6.222756321066609e-07, + "loss": 0.974, + "step": 10981 + }, + { + "epoch": 1.5547533092659447, + "grad_norm": 8.71194684805301, + "learning_rate": 6.218972549890409e-07, + "loss": 0.9898, + "step": 10982 + }, + { + "epoch": 1.554894882140582, + "grad_norm": 7.982474681361302, + "learning_rate": 6.215189766018812e-07, + "loss": 0.8735, + "step": 10983 + }, + { + "epoch": 1.5550364550152191, + "grad_norm": 8.884531839430785, + "learning_rate": 6.211407969650687e-07, + "loss": 0.9459, + "step": 10984 + }, + { + "epoch": 1.5551780278898564, + "grad_norm": 8.464244361854607, + "learning_rate": 6.20762716098483e-07, + "loss": 0.9777, + "step": 10985 + }, + { + "epoch": 1.5553196007644936, + "grad_norm": 8.58006011870692, + "learning_rate": 6.203847340220006e-07, + "loss": 0.9469, + "step": 10986 + }, + { + "epoch": 1.5554611736391308, + "grad_norm": 9.132858028249435, + "learning_rate": 6.200068507554915e-07, + "loss": 0.9551, + "step": 10987 + }, + { + "epoch": 1.555602746513768, + "grad_norm": 8.668018040080677, + "learning_rate": 6.196290663188198e-07, + "loss": 0.9379, + "step": 10988 + }, + { + "epoch": 1.5557443193884053, + "grad_norm": 9.875193147339733, + "learning_rate": 6.192513807318468e-07, + "loss": 0.9728, + "step": 10989 + }, + { + "epoch": 1.5558858922630425, + "grad_norm": 9.083507524505823, + "learning_rate": 6.188737940144254e-07, + "loss": 0.9129, + "step": 10990 + }, + { + "epoch": 1.5560274651376798, + "grad_norm": 10.715586381710814, + "learning_rate": 
6.184963061864069e-07, + "loss": 0.9556, + "step": 10991 + }, + { + "epoch": 1.556169038012317, + "grad_norm": 8.238400714816597, + "learning_rate": 6.181189172676338e-07, + "loss": 0.9852, + "step": 10992 + }, + { + "epoch": 1.5563106108869542, + "grad_norm": 8.834214316527081, + "learning_rate": 6.177416272779468e-07, + "loss": 0.9749, + "step": 10993 + }, + { + "epoch": 1.5564521837615912, + "grad_norm": 9.763395448614952, + "learning_rate": 6.173644362371783e-07, + "loss": 1.0192, + "step": 10994 + }, + { + "epoch": 1.5565937566362285, + "grad_norm": 11.867027417246481, + "learning_rate": 6.169873441651575e-07, + "loss": 1.0293, + "step": 10995 + }, + { + "epoch": 1.5567353295108657, + "grad_norm": 10.310375757779227, + "learning_rate": 6.166103510817089e-07, + "loss": 1.0274, + "step": 10996 + }, + { + "epoch": 1.556876902385503, + "grad_norm": 9.074864443772785, + "learning_rate": 6.162334570066497e-07, + "loss": 1.0671, + "step": 10997 + }, + { + "epoch": 1.5570184752601401, + "grad_norm": 9.03513007318949, + "learning_rate": 6.158566619597933e-07, + "loss": 0.9542, + "step": 10998 + }, + { + "epoch": 1.5571600481347774, + "grad_norm": 9.855992433693693, + "learning_rate": 6.154799659609464e-07, + "loss": 1.0302, + "step": 10999 + }, + { + "epoch": 1.5573016210094146, + "grad_norm": 8.771568092383278, + "learning_rate": 6.151033690299133e-07, + "loss": 1.0592, + "step": 11000 + }, + { + "epoch": 1.5574431938840518, + "grad_norm": 8.328255543213501, + "learning_rate": 6.147268711864898e-07, + "loss": 0.8428, + "step": 11001 + }, + { + "epoch": 1.557584766758689, + "grad_norm": 9.201323405738556, + "learning_rate": 6.14350472450469e-07, + "loss": 1.0473, + "step": 11002 + }, + { + "epoch": 1.5577263396333263, + "grad_norm": 8.049450153029913, + "learning_rate": 6.139741728416387e-07, + "loss": 0.8713, + "step": 11003 + }, + { + "epoch": 1.5578679125079635, + "grad_norm": 9.938605480964117, + "learning_rate": 6.135979723797792e-07, + "loss": 1.0513, + "step": 11004 + }, + { + "epoch": 1.5580094853826005, + "grad_norm": 10.909325108759628, + "learning_rate": 6.132218710846683e-07, + "loss": 0.965, + "step": 11005 + }, + { + "epoch": 1.5581510582572378, + "grad_norm": 13.225676469141616, + "learning_rate": 6.12845868976076e-07, + "loss": 1.0435, + "step": 11006 + }, + { + "epoch": 1.558292631131875, + "grad_norm": 9.861021167422544, + "learning_rate": 6.124699660737702e-07, + "loss": 0.9165, + "step": 11007 + }, + { + "epoch": 1.5584342040065122, + "grad_norm": 9.20830976618402, + "learning_rate": 6.120941623975107e-07, + "loss": 0.9988, + "step": 11008 + }, + { + "epoch": 1.5585757768811495, + "grad_norm": 10.154019060863481, + "learning_rate": 6.117184579670527e-07, + "loss": 1.1105, + "step": 11009 + }, + { + "epoch": 1.5587173497557867, + "grad_norm": 8.773323581462105, + "learning_rate": 6.113428528021481e-07, + "loss": 1.054, + "step": 11010 + }, + { + "epoch": 1.558858922630424, + "grad_norm": 8.998008083742628, + "learning_rate": 6.109673469225408e-07, + "loss": 0.9344, + "step": 11011 + }, + { + "epoch": 1.5590004955050611, + "grad_norm": 10.287137742924148, + "learning_rate": 6.105919403479724e-07, + "loss": 1.104, + "step": 11012 + }, + { + "epoch": 1.5591420683796984, + "grad_norm": 8.855787085560783, + "learning_rate": 6.10216633098176e-07, + "loss": 1.0823, + "step": 11013 + }, + { + "epoch": 1.5592836412543356, + "grad_norm": 13.98419823561468, + "learning_rate": 6.098414251928831e-07, + "loss": 1.0142, + "step": 11014 + }, + { + "epoch": 1.5594252141289728, + "grad_norm": 
9.6779308680229, + "learning_rate": 6.094663166518161e-07, + "loss": 0.9437, + "step": 11015 + }, + { + "epoch": 1.55956678700361, + "grad_norm": 9.074045548282458, + "learning_rate": 6.090913074946958e-07, + "loss": 0.9625, + "step": 11016 + }, + { + "epoch": 1.5597083598782473, + "grad_norm": 11.0922334735894, + "learning_rate": 6.087163977412352e-07, + "loss": 1.0534, + "step": 11017 + }, + { + "epoch": 1.5598499327528845, + "grad_norm": 9.372061306995908, + "learning_rate": 6.083415874111432e-07, + "loss": 0.9916, + "step": 11018 + }, + { + "epoch": 1.5599915056275218, + "grad_norm": 7.489868186941855, + "learning_rate": 6.079668765241248e-07, + "loss": 0.9538, + "step": 11019 + }, + { + "epoch": 1.560133078502159, + "grad_norm": 8.356579008166355, + "learning_rate": 6.075922650998756e-07, + "loss": 0.9737, + "step": 11020 + }, + { + "epoch": 1.5602746513767962, + "grad_norm": 8.543962669376729, + "learning_rate": 6.072177531580909e-07, + "loss": 1.0268, + "step": 11021 + }, + { + "epoch": 1.5604162242514334, + "grad_norm": 8.1689984891612, + "learning_rate": 6.068433407184566e-07, + "loss": 0.9874, + "step": 11022 + }, + { + "epoch": 1.5605577971260707, + "grad_norm": 9.28707754167783, + "learning_rate": 6.064690278006572e-07, + "loss": 0.8961, + "step": 11023 + }, + { + "epoch": 1.560699370000708, + "grad_norm": 12.67719921768538, + "learning_rate": 6.060948144243683e-07, + "loss": 0.987, + "step": 11024 + }, + { + "epoch": 1.5608409428753451, + "grad_norm": 8.385584277892663, + "learning_rate": 6.057207006092628e-07, + "loss": 0.875, + "step": 11025 + }, + { + "epoch": 1.5609825157499824, + "grad_norm": 9.298806587679847, + "learning_rate": 6.053466863750085e-07, + "loss": 1.0237, + "step": 11026 + }, + { + "epoch": 1.5611240886246196, + "grad_norm": 11.920957277505417, + "learning_rate": 6.049727717412654e-07, + "loss": 1.0502, + "step": 11027 + }, + { + "epoch": 1.5612656614992568, + "grad_norm": 8.301438393105814, + "learning_rate": 6.045989567276913e-07, + "loss": 0.8897, + "step": 11028 + }, + { + "epoch": 1.561407234373894, + "grad_norm": 8.768949208134561, + "learning_rate": 6.042252413539368e-07, + "loss": 0.918, + "step": 11029 + }, + { + "epoch": 1.5615488072485313, + "grad_norm": 9.721813050394273, + "learning_rate": 6.038516256396473e-07, + "loss": 0.9015, + "step": 11030 + }, + { + "epoch": 1.5616903801231685, + "grad_norm": 8.988845694739014, + "learning_rate": 6.034781096044645e-07, + "loss": 0.916, + "step": 11031 + }, + { + "epoch": 1.5618319529978058, + "grad_norm": 9.614961219269004, + "learning_rate": 6.031046932680229e-07, + "loss": 0.9826, + "step": 11032 + }, + { + "epoch": 1.561973525872443, + "grad_norm": 7.54438845027369, + "learning_rate": 6.027313766499538e-07, + "loss": 0.9196, + "step": 11033 + }, + { + "epoch": 1.5621150987470802, + "grad_norm": 7.925006796656441, + "learning_rate": 6.023581597698807e-07, + "loss": 0.8265, + "step": 11034 + }, + { + "epoch": 1.5622566716217174, + "grad_norm": 10.16637416064443, + "learning_rate": 6.019850426474249e-07, + "loss": 0.937, + "step": 11035 + }, + { + "epoch": 1.5623982444963544, + "grad_norm": 9.547972970311388, + "learning_rate": 6.016120253021998e-07, + "loss": 0.9208, + "step": 11036 + }, + { + "epoch": 1.5625398173709917, + "grad_norm": 7.672076624689526, + "learning_rate": 6.012391077538154e-07, + "loss": 0.9269, + "step": 11037 + }, + { + "epoch": 1.562681390245629, + "grad_norm": 8.277648833747644, + "learning_rate": 6.008662900218748e-07, + "loss": 0.9484, + "step": 11038 + }, + { + "epoch": 
1.5628229631202661, + "grad_norm": 9.39203774081028, + "learning_rate": 6.00493572125978e-07, + "loss": 0.9529, + "step": 11039 + }, + { + "epoch": 1.5629645359949034, + "grad_norm": 9.699096039799679, + "learning_rate": 6.001209540857178e-07, + "loss": 0.9926, + "step": 11040 + }, + { + "epoch": 1.5631061088695406, + "grad_norm": 8.910852360740911, + "learning_rate": 5.997484359206815e-07, + "loss": 1.0085, + "step": 11041 + }, + { + "epoch": 1.5632476817441778, + "grad_norm": 10.211391774516159, + "learning_rate": 5.99376017650454e-07, + "loss": 0.9389, + "step": 11042 + }, + { + "epoch": 1.563389254618815, + "grad_norm": 9.64253717931098, + "learning_rate": 5.990036992946114e-07, + "loss": 0.9767, + "step": 11043 + }, + { + "epoch": 1.5635308274934523, + "grad_norm": 8.56890406897868, + "learning_rate": 5.986314808727273e-07, + "loss": 0.8983, + "step": 11044 + }, + { + "epoch": 1.5636724003680895, + "grad_norm": 9.598337474552071, + "learning_rate": 5.982593624043682e-07, + "loss": 0.9848, + "step": 11045 + }, + { + "epoch": 1.5638139732427265, + "grad_norm": 9.780063545005008, + "learning_rate": 5.978873439090968e-07, + "loss": 1.0072, + "step": 11046 + }, + { + "epoch": 1.5639555461173638, + "grad_norm": 9.11711762572303, + "learning_rate": 5.975154254064688e-07, + "loss": 1.0076, + "step": 11047 + }, + { + "epoch": 1.564097118992001, + "grad_norm": 9.862162297296807, + "learning_rate": 5.971436069160363e-07, + "loss": 0.9937, + "step": 11048 + }, + { + "epoch": 1.5642386918666382, + "grad_norm": 8.625865948094265, + "learning_rate": 5.967718884573465e-07, + "loss": 1.0984, + "step": 11049 + }, + { + "epoch": 1.5643802647412755, + "grad_norm": 9.74060779970209, + "learning_rate": 5.964002700499394e-07, + "loss": 1.0331, + "step": 11050 + }, + { + "epoch": 1.5645218376159127, + "grad_norm": 8.897739989261538, + "learning_rate": 5.960287517133506e-07, + "loss": 0.919, + "step": 11051 + }, + { + "epoch": 1.56466341049055, + "grad_norm": 9.225484579948269, + "learning_rate": 5.956573334671098e-07, + "loss": 0.9276, + "step": 11052 + }, + { + "epoch": 1.5648049833651871, + "grad_norm": 10.555822304372331, + "learning_rate": 5.952860153307433e-07, + "loss": 0.9588, + "step": 11053 + }, + { + "epoch": 1.5649465562398244, + "grad_norm": 9.538732174614372, + "learning_rate": 5.949147973237713e-07, + "loss": 1.0093, + "step": 11054 + }, + { + "epoch": 1.5650881291144616, + "grad_norm": 9.22195430606466, + "learning_rate": 5.945436794657072e-07, + "loss": 0.8985, + "step": 11055 + }, + { + "epoch": 1.5652297019890988, + "grad_norm": 11.634272333266459, + "learning_rate": 5.941726617760621e-07, + "loss": 0.9922, + "step": 11056 + }, + { + "epoch": 1.565371274863736, + "grad_norm": 9.199025508851468, + "learning_rate": 5.938017442743382e-07, + "loss": 1.015, + "step": 11057 + }, + { + "epoch": 1.5655128477383733, + "grad_norm": 9.737952589846072, + "learning_rate": 5.934309269800359e-07, + "loss": 0.91, + "step": 11058 + }, + { + "epoch": 1.5656544206130105, + "grad_norm": 7.956883829405718, + "learning_rate": 5.930602099126476e-07, + "loss": 0.9973, + "step": 11059 + }, + { + "epoch": 1.5657959934876478, + "grad_norm": 7.96349493544851, + "learning_rate": 5.926895930916629e-07, + "loss": 0.8813, + "step": 11060 + }, + { + "epoch": 1.565937566362285, + "grad_norm": 9.513022882875726, + "learning_rate": 5.923190765365641e-07, + "loss": 1.018, + "step": 11061 + }, + { + "epoch": 1.5660791392369222, + "grad_norm": 8.503993722994077, + "learning_rate": 5.919486602668281e-07, + "loss": 0.8953, + 
"step": 11062 + }, + { + "epoch": 1.5662207121115594, + "grad_norm": 9.945922736948818, + "learning_rate": 5.915783443019293e-07, + "loss": 0.9448, + "step": 11063 + }, + { + "epoch": 1.5663622849861967, + "grad_norm": 9.23593389839847, + "learning_rate": 5.912081286613334e-07, + "loss": 0.9086, + "step": 11064 + }, + { + "epoch": 1.566503857860834, + "grad_norm": 8.672067231118262, + "learning_rate": 5.908380133645033e-07, + "loss": 0.9862, + "step": 11065 + }, + { + "epoch": 1.5666454307354711, + "grad_norm": 11.191813569703701, + "learning_rate": 5.904679984308947e-07, + "loss": 0.9571, + "step": 11066 + }, + { + "epoch": 1.5667870036101084, + "grad_norm": 9.380826233220137, + "learning_rate": 5.900980838799603e-07, + "loss": 1.0357, + "step": 11067 + }, + { + "epoch": 1.5669285764847456, + "grad_norm": 11.446422994334709, + "learning_rate": 5.897282697311449e-07, + "loss": 0.9655, + "step": 11068 + }, + { + "epoch": 1.5670701493593828, + "grad_norm": 9.081607853034013, + "learning_rate": 5.8935855600389e-07, + "loss": 0.9448, + "step": 11069 + }, + { + "epoch": 1.56721172223402, + "grad_norm": 8.919432831938197, + "learning_rate": 5.889889427176318e-07, + "loss": 1.0529, + "step": 11070 + }, + { + "epoch": 1.5673532951086573, + "grad_norm": 9.816093807204368, + "learning_rate": 5.886194298917994e-07, + "loss": 0.9949, + "step": 11071 + }, + { + "epoch": 1.5674948679832945, + "grad_norm": 8.070171639320067, + "learning_rate": 5.882500175458198e-07, + "loss": 0.9511, + "step": 11072 + }, + { + "epoch": 1.5676364408579317, + "grad_norm": 10.178349229112998, + "learning_rate": 5.878807056991098e-07, + "loss": 1.1206, + "step": 11073 + }, + { + "epoch": 1.567778013732569, + "grad_norm": 9.204555743518906, + "learning_rate": 5.87511494371086e-07, + "loss": 0.8781, + "step": 11074 + }, + { + "epoch": 1.5679195866072062, + "grad_norm": 9.598216256683248, + "learning_rate": 5.871423835811566e-07, + "loss": 0.9609, + "step": 11075 + }, + { + "epoch": 1.5680611594818434, + "grad_norm": 9.28617096501062, + "learning_rate": 5.867733733487255e-07, + "loss": 0.9459, + "step": 11076 + }, + { + "epoch": 1.5682027323564804, + "grad_norm": 10.411943590700897, + "learning_rate": 5.864044636931923e-07, + "loss": 0.945, + "step": 11077 + }, + { + "epoch": 1.5683443052311177, + "grad_norm": 8.041766335721267, + "learning_rate": 5.86035654633949e-07, + "loss": 0.9544, + "step": 11078 + }, + { + "epoch": 1.568485878105755, + "grad_norm": 9.88264188958466, + "learning_rate": 5.85666946190385e-07, + "loss": 1.0569, + "step": 11079 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 9.029974296254426, + "learning_rate": 5.852983383818813e-07, + "loss": 0.9961, + "step": 11080 + }, + { + "epoch": 1.5687690238550294, + "grad_norm": 9.776297683628798, + "learning_rate": 5.84929831227817e-07, + "loss": 1.0524, + "step": 11081 + }, + { + "epoch": 1.5689105967296666, + "grad_norm": 10.950808727760347, + "learning_rate": 5.845614247475637e-07, + "loss": 0.9419, + "step": 11082 + }, + { + "epoch": 1.5690521696043038, + "grad_norm": 10.479963167431585, + "learning_rate": 5.841931189604874e-07, + "loss": 1.0495, + "step": 11083 + }, + { + "epoch": 1.569193742478941, + "grad_norm": 7.880833962551964, + "learning_rate": 5.838249138859509e-07, + "loss": 0.9553, + "step": 11084 + }, + { + "epoch": 1.5693353153535783, + "grad_norm": 8.388708936545022, + "learning_rate": 5.834568095433093e-07, + "loss": 1.0586, + "step": 11085 + }, + { + "epoch": 1.5694768882282155, + "grad_norm": 10.851384270607086, + "learning_rate": 
5.830888059519149e-07, + "loss": 0.9221, + "step": 11086 + }, + { + "epoch": 1.5696184611028527, + "grad_norm": 9.622796140748605, + "learning_rate": 5.827209031311121e-07, + "loss": 1.0704, + "step": 11087 + }, + { + "epoch": 1.5697600339774898, + "grad_norm": 9.412702508092954, + "learning_rate": 5.823531011002423e-07, + "loss": 0.9042, + "step": 11088 + }, + { + "epoch": 1.569901606852127, + "grad_norm": 9.363416509244152, + "learning_rate": 5.819853998786395e-07, + "loss": 1.0665, + "step": 11089 + }, + { + "epoch": 1.5700431797267642, + "grad_norm": 9.68866511075793, + "learning_rate": 5.816177994856347e-07, + "loss": 0.9937, + "step": 11090 + }, + { + "epoch": 1.5701847526014014, + "grad_norm": 11.933577766969657, + "learning_rate": 5.812502999405514e-07, + "loss": 0.9598, + "step": 11091 + }, + { + "epoch": 1.5703263254760387, + "grad_norm": 10.37265686784475, + "learning_rate": 5.80882901262709e-07, + "loss": 1.0108, + "step": 11092 + }, + { + "epoch": 1.570467898350676, + "grad_norm": 9.248401400609561, + "learning_rate": 5.805156034714227e-07, + "loss": 1.032, + "step": 11093 + }, + { + "epoch": 1.5706094712253131, + "grad_norm": 9.964100294443767, + "learning_rate": 5.801484065859989e-07, + "loss": 1.0037, + "step": 11094 + }, + { + "epoch": 1.5707510440999504, + "grad_norm": 8.830775148404566, + "learning_rate": 5.797813106257422e-07, + "loss": 1.0082, + "step": 11095 + }, + { + "epoch": 1.5708926169745876, + "grad_norm": 10.328624349691069, + "learning_rate": 5.794143156099497e-07, + "loss": 1.0765, + "step": 11096 + }, + { + "epoch": 1.5710341898492248, + "grad_norm": 11.330444266892462, + "learning_rate": 5.79047421557915e-07, + "loss": 1.1613, + "step": 11097 + }, + { + "epoch": 1.571175762723862, + "grad_norm": 8.442892428318174, + "learning_rate": 5.786806284889246e-07, + "loss": 0.9902, + "step": 11098 + }, + { + "epoch": 1.5713173355984993, + "grad_norm": 9.229068890492107, + "learning_rate": 5.783139364222609e-07, + "loss": 0.9467, + "step": 11099 + }, + { + "epoch": 1.5714589084731365, + "grad_norm": 9.392541369300613, + "learning_rate": 5.779473453772017e-07, + "loss": 0.9645, + "step": 11100 + }, + { + "epoch": 1.5716004813477737, + "grad_norm": 11.099023559512313, + "learning_rate": 5.775808553730164e-07, + "loss": 1.0583, + "step": 11101 + }, + { + "epoch": 1.571742054222411, + "grad_norm": 8.639161356737928, + "learning_rate": 5.772144664289728e-07, + "loss": 0.8095, + "step": 11102 + }, + { + "epoch": 1.5718836270970482, + "grad_norm": 9.281950747562485, + "learning_rate": 5.768481785643309e-07, + "loss": 1.0259, + "step": 11103 + }, + { + "epoch": 1.5720251999716854, + "grad_norm": 9.285387549049052, + "learning_rate": 5.764819917983458e-07, + "loss": 0.8707, + "step": 11104 + }, + { + "epoch": 1.5721667728463227, + "grad_norm": 9.253768565617774, + "learning_rate": 5.761159061502688e-07, + "loss": 0.9377, + "step": 11105 + }, + { + "epoch": 1.57230834572096, + "grad_norm": 9.173719802226664, + "learning_rate": 5.757499216393433e-07, + "loss": 0.9335, + "step": 11106 + }, + { + "epoch": 1.5724499185955971, + "grad_norm": 9.01140041971358, + "learning_rate": 5.753840382848105e-07, + "loss": 0.9058, + "step": 11107 + }, + { + "epoch": 1.5725914914702344, + "grad_norm": 8.461274285627496, + "learning_rate": 5.750182561059031e-07, + "loss": 0.9629, + "step": 11108 + }, + { + "epoch": 1.5727330643448716, + "grad_norm": 10.001947213372105, + "learning_rate": 5.746525751218512e-07, + "loss": 0.9247, + "step": 11109 + }, + { + "epoch": 1.5728746372195088, + 
"grad_norm": 9.800315338043855, + "learning_rate": 5.742869953518773e-07, + "loss": 0.9844, + "step": 11110 + }, + { + "epoch": 1.573016210094146, + "grad_norm": 9.904868622264326, + "learning_rate": 5.739215168152007e-07, + "loss": 0.9594, + "step": 11111 + }, + { + "epoch": 1.5731577829687833, + "grad_norm": 9.406599982458086, + "learning_rate": 5.735561395310333e-07, + "loss": 0.9334, + "step": 11112 + }, + { + "epoch": 1.5732993558434205, + "grad_norm": 10.939382859698906, + "learning_rate": 5.731908635185837e-07, + "loss": 1.0812, + "step": 11113 + }, + { + "epoch": 1.5734409287180577, + "grad_norm": 7.789863935464896, + "learning_rate": 5.728256887970537e-07, + "loss": 0.9819, + "step": 11114 + }, + { + "epoch": 1.573582501592695, + "grad_norm": 8.965033103256255, + "learning_rate": 5.724606153856396e-07, + "loss": 1.0218, + "step": 11115 + }, + { + "epoch": 1.5737240744673322, + "grad_norm": 11.510730546955278, + "learning_rate": 5.720956433035346e-07, + "loss": 1.1036, + "step": 11116 + }, + { + "epoch": 1.5738656473419694, + "grad_norm": 9.010220552342515, + "learning_rate": 5.717307725699234e-07, + "loss": 0.9885, + "step": 11117 + }, + { + "epoch": 1.5740072202166067, + "grad_norm": 10.508651075971809, + "learning_rate": 5.713660032039884e-07, + "loss": 1.0222, + "step": 11118 + }, + { + "epoch": 1.5741487930912437, + "grad_norm": 9.015519642338244, + "learning_rate": 5.710013352249039e-07, + "loss": 0.8769, + "step": 11119 + }, + { + "epoch": 1.574290365965881, + "grad_norm": 8.897133320527583, + "learning_rate": 5.706367686518414e-07, + "loss": 0.9199, + "step": 11120 + }, + { + "epoch": 1.5744319388405181, + "grad_norm": 8.929684936843465, + "learning_rate": 5.702723035039648e-07, + "loss": 0.9325, + "step": 11121 + }, + { + "epoch": 1.5745735117151554, + "grad_norm": 10.068794508965288, + "learning_rate": 5.699079398004342e-07, + "loss": 1.0588, + "step": 11122 + }, + { + "epoch": 1.5747150845897926, + "grad_norm": 10.780247915386154, + "learning_rate": 5.695436775604049e-07, + "loss": 0.998, + "step": 11123 + }, + { + "epoch": 1.5748566574644298, + "grad_norm": 10.213698698967052, + "learning_rate": 5.691795168030242e-07, + "loss": 0.9574, + "step": 11124 + }, + { + "epoch": 1.574998230339067, + "grad_norm": 7.539129290507252, + "learning_rate": 5.688154575474384e-07, + "loss": 0.9325, + "step": 11125 + }, + { + "epoch": 1.5751398032137043, + "grad_norm": 9.274784186389544, + "learning_rate": 5.684514998127822e-07, + "loss": 1.0004, + "step": 11126 + }, + { + "epoch": 1.5752813760883415, + "grad_norm": 10.697803906543228, + "learning_rate": 5.680876436181907e-07, + "loss": 0.9314, + "step": 11127 + }, + { + "epoch": 1.5754229489629787, + "grad_norm": 9.67513347208382, + "learning_rate": 5.677238889827918e-07, + "loss": 0.9936, + "step": 11128 + }, + { + "epoch": 1.5755645218376158, + "grad_norm": 8.106537015447008, + "learning_rate": 5.673602359257069e-07, + "loss": 0.9865, + "step": 11129 + }, + { + "epoch": 1.575706094712253, + "grad_norm": 8.982106965764412, + "learning_rate": 5.669966844660538e-07, + "loss": 1.026, + "step": 11130 + }, + { + "epoch": 1.5758476675868902, + "grad_norm": 9.573313464442526, + "learning_rate": 5.66633234622943e-07, + "loss": 0.9691, + "step": 11131 + }, + { + "epoch": 1.5759892404615274, + "grad_norm": 10.012807655132095, + "learning_rate": 5.662698864154823e-07, + "loss": 1.0006, + "step": 11132 + }, + { + "epoch": 1.5761308133361647, + "grad_norm": 8.84602763363418, + "learning_rate": 5.65906639862771e-07, + "loss": 1.1026, + "step": 11133 
+ }, + { + "epoch": 1.576272386210802, + "grad_norm": 9.214861550825324, + "learning_rate": 5.655434949839061e-07, + "loss": 0.9035, + "step": 11134 + }, + { + "epoch": 1.5764139590854391, + "grad_norm": 8.920817137098748, + "learning_rate": 5.651804517979775e-07, + "loss": 0.9252, + "step": 11135 + }, + { + "epoch": 1.5765555319600764, + "grad_norm": 11.138519302117869, + "learning_rate": 5.648175103240694e-07, + "loss": 0.9981, + "step": 11136 + }, + { + "epoch": 1.5766971048347136, + "grad_norm": 9.852175833405722, + "learning_rate": 5.64454670581262e-07, + "loss": 0.8099, + "step": 11137 + }, + { + "epoch": 1.5768386777093508, + "grad_norm": 8.4225566637687, + "learning_rate": 5.64091932588629e-07, + "loss": 0.8763, + "step": 11138 + }, + { + "epoch": 1.576980250583988, + "grad_norm": 8.693180559762284, + "learning_rate": 5.637292963652405e-07, + "loss": 0.9491, + "step": 11139 + }, + { + "epoch": 1.5771218234586253, + "grad_norm": 9.778824673103854, + "learning_rate": 5.63366761930158e-07, + "loss": 0.9834, + "step": 11140 + }, + { + "epoch": 1.5772633963332625, + "grad_norm": 9.625173988255964, + "learning_rate": 5.630043293024418e-07, + "loss": 1.0759, + "step": 11141 + }, + { + "epoch": 1.5774049692078997, + "grad_norm": 10.00867657947636, + "learning_rate": 5.62641998501143e-07, + "loss": 0.9334, + "step": 11142 + }, + { + "epoch": 1.577546542082537, + "grad_norm": 8.80211140705274, + "learning_rate": 5.622797695453106e-07, + "loss": 1.0084, + "step": 11143 + }, + { + "epoch": 1.5776881149571742, + "grad_norm": 8.079652033196668, + "learning_rate": 5.619176424539849e-07, + "loss": 0.9529, + "step": 11144 + }, + { + "epoch": 1.5778296878318114, + "grad_norm": 9.771050836437306, + "learning_rate": 5.615556172462039e-07, + "loss": 0.9839, + "step": 11145 + }, + { + "epoch": 1.5779712607064487, + "grad_norm": 9.260296014458211, + "learning_rate": 5.611936939409998e-07, + "loss": 0.9247, + "step": 11146 + }, + { + "epoch": 1.578112833581086, + "grad_norm": 10.129004922629678, + "learning_rate": 5.608318725573964e-07, + "loss": 0.9801, + "step": 11147 + }, + { + "epoch": 1.5782544064557231, + "grad_norm": 9.720928181102277, + "learning_rate": 5.604701531144164e-07, + "loss": 0.9917, + "step": 11148 + }, + { + "epoch": 1.5783959793303604, + "grad_norm": 7.634676187439815, + "learning_rate": 5.601085356310734e-07, + "loss": 0.9388, + "step": 11149 + }, + { + "epoch": 1.5785375522049976, + "grad_norm": 9.491779183604752, + "learning_rate": 5.597470201263783e-07, + "loss": 1.0232, + "step": 11150 + }, + { + "epoch": 1.5786791250796348, + "grad_norm": 11.184632344041834, + "learning_rate": 5.593856066193362e-07, + "loss": 1.0255, + "step": 11151 + }, + { + "epoch": 1.578820697954272, + "grad_norm": 8.855901666331356, + "learning_rate": 5.590242951289451e-07, + "loss": 1.0595, + "step": 11152 + }, + { + "epoch": 1.5789622708289093, + "grad_norm": 9.82498754447167, + "learning_rate": 5.586630856742004e-07, + "loss": 0.9797, + "step": 11153 + }, + { + "epoch": 1.5791038437035465, + "grad_norm": 9.798393460373843, + "learning_rate": 5.58301978274089e-07, + "loss": 1.0323, + "step": 11154 + }, + { + "epoch": 1.5792454165781837, + "grad_norm": 8.385279481764737, + "learning_rate": 5.579409729475954e-07, + "loss": 0.9289, + "step": 11155 + }, + { + "epoch": 1.579386989452821, + "grad_norm": 10.197781295319485, + "learning_rate": 5.575800697136968e-07, + "loss": 0.9545, + "step": 11156 + }, + { + "epoch": 1.5795285623274582, + "grad_norm": 9.719406252964028, + "learning_rate": 5.572192685913652e-07, 
+ "loss": 0.9169, + "step": 11157 + }, + { + "epoch": 1.5796701352020954, + "grad_norm": 10.807401833336588, + "learning_rate": 5.568585695995684e-07, + "loss": 1.0688, + "step": 11158 + }, + { + "epoch": 1.5798117080767327, + "grad_norm": 10.13357708119148, + "learning_rate": 5.564979727572673e-07, + "loss": 0.9404, + "step": 11159 + }, + { + "epoch": 1.5799532809513697, + "grad_norm": 10.909803101382606, + "learning_rate": 5.561374780834192e-07, + "loss": 1.0362, + "step": 11160 + }, + { + "epoch": 1.580094853826007, + "grad_norm": 8.785598275618279, + "learning_rate": 5.557770855969738e-07, + "loss": 1.0107, + "step": 11161 + }, + { + "epoch": 1.5802364267006441, + "grad_norm": 9.536346773793362, + "learning_rate": 5.554167953168779e-07, + "loss": 0.8931, + "step": 11162 + }, + { + "epoch": 1.5803779995752814, + "grad_norm": 10.065468676021437, + "learning_rate": 5.550566072620705e-07, + "loss": 0.9747, + "step": 11163 + }, + { + "epoch": 1.5805195724499186, + "grad_norm": 9.601935557108842, + "learning_rate": 5.54696521451488e-07, + "loss": 0.9786, + "step": 11164 + }, + { + "epoch": 1.5806611453245558, + "grad_norm": 10.314772846718174, + "learning_rate": 5.54336537904058e-07, + "loss": 0.9573, + "step": 11165 + }, + { + "epoch": 1.580802718199193, + "grad_norm": 8.126901726811152, + "learning_rate": 5.539766566387053e-07, + "loss": 0.9301, + "step": 11166 + }, + { + "epoch": 1.5809442910738303, + "grad_norm": 9.847776721669886, + "learning_rate": 5.536168776743503e-07, + "loss": 1.0656, + "step": 11167 + }, + { + "epoch": 1.5810858639484675, + "grad_norm": 9.136289536733358, + "learning_rate": 5.532572010299034e-07, + "loss": 1.0218, + "step": 11168 + }, + { + "epoch": 1.5812274368231047, + "grad_norm": 8.005679022197013, + "learning_rate": 5.528976267242745e-07, + "loss": 0.9856, + "step": 11169 + }, + { + "epoch": 1.581369009697742, + "grad_norm": 10.354979904648951, + "learning_rate": 5.525381547763647e-07, + "loss": 1.0614, + "step": 11170 + }, + { + "epoch": 1.581510582572379, + "grad_norm": 10.28582865787678, + "learning_rate": 5.52178785205073e-07, + "loss": 0.9737, + "step": 11171 + }, + { + "epoch": 1.5816521554470162, + "grad_norm": 9.792181079960397, + "learning_rate": 5.518195180292893e-07, + "loss": 0.9766, + "step": 11172 + }, + { + "epoch": 1.5817937283216534, + "grad_norm": 8.493543528895705, + "learning_rate": 5.514603532679011e-07, + "loss": 0.9312, + "step": 11173 + }, + { + "epoch": 1.5819353011962907, + "grad_norm": 7.969560409924841, + "learning_rate": 5.511012909397898e-07, + "loss": 0.8787, + "step": 11174 + }, + { + "epoch": 1.582076874070928, + "grad_norm": 7.570098338527742, + "learning_rate": 5.507423310638299e-07, + "loss": 0.9848, + "step": 11175 + }, + { + "epoch": 1.5822184469455651, + "grad_norm": 10.85006626653871, + "learning_rate": 5.503834736588929e-07, + "loss": 1.0832, + "step": 11176 + }, + { + "epoch": 1.5823600198202024, + "grad_norm": 9.534778179595056, + "learning_rate": 5.500247187438429e-07, + "loss": 0.9899, + "step": 11177 + }, + { + "epoch": 1.5825015926948396, + "grad_norm": 10.468750364388987, + "learning_rate": 5.496660663375389e-07, + "loss": 0.9681, + "step": 11178 + }, + { + "epoch": 1.5826431655694768, + "grad_norm": 10.400871284374261, + "learning_rate": 5.49307516458836e-07, + "loss": 0.9448, + "step": 11179 + }, + { + "epoch": 1.582784738444114, + "grad_norm": 10.162120824217556, + "learning_rate": 5.489490691265819e-07, + "loss": 0.9497, + "step": 11180 + }, + { + "epoch": 1.5829263113187513, + "grad_norm": 
9.361855205339495, + "learning_rate": 5.485907243596214e-07, + "loss": 0.9158, + "step": 11181 + }, + { + "epoch": 1.5830678841933885, + "grad_norm": 8.753280460675425, + "learning_rate": 5.482324821767904e-07, + "loss": 0.9433, + "step": 11182 + }, + { + "epoch": 1.5832094570680257, + "grad_norm": 10.320180171606214, + "learning_rate": 5.478743425969235e-07, + "loss": 0.9081, + "step": 11183 + }, + { + "epoch": 1.583351029942663, + "grad_norm": 10.198264583982038, + "learning_rate": 5.47516305638846e-07, + "loss": 1.026, + "step": 11184 + }, + { + "epoch": 1.5834926028173002, + "grad_norm": 10.054265604007957, + "learning_rate": 5.471583713213812e-07, + "loss": 0.9333, + "step": 11185 + }, + { + "epoch": 1.5836341756919374, + "grad_norm": 8.636410449325505, + "learning_rate": 5.468005396633442e-07, + "loss": 0.9974, + "step": 11186 + }, + { + "epoch": 1.5837757485665747, + "grad_norm": 8.93012408026155, + "learning_rate": 5.464428106835467e-07, + "loss": 0.9067, + "step": 11187 + }, + { + "epoch": 1.583917321441212, + "grad_norm": 10.313480954764831, + "learning_rate": 5.460851844007945e-07, + "loss": 1.0287, + "step": 11188 + }, + { + "epoch": 1.5840588943158491, + "grad_norm": 6.38031801298159, + "learning_rate": 5.457276608338862e-07, + "loss": 0.8505, + "step": 11189 + }, + { + "epoch": 1.5842004671904863, + "grad_norm": 9.25660469771639, + "learning_rate": 5.453702400016186e-07, + "loss": 0.8442, + "step": 11190 + }, + { + "epoch": 1.5843420400651236, + "grad_norm": 8.9333674340048, + "learning_rate": 5.450129219227792e-07, + "loss": 0.9508, + "step": 11191 + }, + { + "epoch": 1.5844836129397608, + "grad_norm": 8.687895621752869, + "learning_rate": 5.446557066161537e-07, + "loss": 0.9727, + "step": 11192 + }, + { + "epoch": 1.584625185814398, + "grad_norm": 10.092963126014379, + "learning_rate": 5.442985941005188e-07, + "loss": 1.0242, + "step": 11193 + }, + { + "epoch": 1.5847667586890353, + "grad_norm": 9.781708630421821, + "learning_rate": 5.439415843946493e-07, + "loss": 0.9958, + "step": 11194 + }, + { + "epoch": 1.5849083315636725, + "grad_norm": 9.150573845884441, + "learning_rate": 5.435846775173115e-07, + "loss": 1.0655, + "step": 11195 + }, + { + "epoch": 1.5850499044383097, + "grad_norm": 8.098283713027081, + "learning_rate": 5.432278734872687e-07, + "loss": 0.943, + "step": 11196 + }, + { + "epoch": 1.585191477312947, + "grad_norm": 9.240468656206142, + "learning_rate": 5.428711723232779e-07, + "loss": 1.0024, + "step": 11197 + }, + { + "epoch": 1.5853330501875842, + "grad_norm": 8.997964416881238, + "learning_rate": 5.425145740440896e-07, + "loss": 1.0445, + "step": 11198 + }, + { + "epoch": 1.5854746230622214, + "grad_norm": 9.27547760836504, + "learning_rate": 5.421580786684522e-07, + "loss": 0.9562, + "step": 11199 + }, + { + "epoch": 1.5856161959368587, + "grad_norm": 8.276230237159703, + "learning_rate": 5.418016862151032e-07, + "loss": 0.847, + "step": 11200 + }, + { + "epoch": 1.5857577688114959, + "grad_norm": 10.652398540494406, + "learning_rate": 5.414453967027797e-07, + "loss": 0.9886, + "step": 11201 + }, + { + "epoch": 1.585899341686133, + "grad_norm": 8.64873699162444, + "learning_rate": 5.410892101502119e-07, + "loss": 0.9343, + "step": 11202 + }, + { + "epoch": 1.5860409145607701, + "grad_norm": 10.22407528303007, + "learning_rate": 5.407331265761229e-07, + "loss": 1.1703, + "step": 11203 + }, + { + "epoch": 1.5861824874354074, + "grad_norm": 10.561376748937981, + "learning_rate": 5.403771459992333e-07, + "loss": 0.9298, + "step": 11204 + }, + { + 
"epoch": 1.5863240603100446, + "grad_norm": 10.435657087254901, + "learning_rate": 5.400212684382553e-07, + "loss": 0.9945, + "step": 11205 + }, + { + "epoch": 1.5864656331846818, + "grad_norm": 10.181535531121245, + "learning_rate": 5.396654939118984e-07, + "loss": 0.9668, + "step": 11206 + }, + { + "epoch": 1.586607206059319, + "grad_norm": 9.506389276310639, + "learning_rate": 5.393098224388643e-07, + "loss": 1.0025, + "step": 11207 + }, + { + "epoch": 1.5867487789339563, + "grad_norm": 9.035017765367582, + "learning_rate": 5.389542540378515e-07, + "loss": 0.9854, + "step": 11208 + }, + { + "epoch": 1.5868903518085935, + "grad_norm": 10.773921875441877, + "learning_rate": 5.385987887275512e-07, + "loss": 1.0502, + "step": 11209 + }, + { + "epoch": 1.5870319246832307, + "grad_norm": 10.252659522422212, + "learning_rate": 5.382434265266495e-07, + "loss": 0.9135, + "step": 11210 + }, + { + "epoch": 1.587173497557868, + "grad_norm": 10.516694738609544, + "learning_rate": 5.378881674538288e-07, + "loss": 0.963, + "step": 11211 + }, + { + "epoch": 1.587315070432505, + "grad_norm": 8.058010536942513, + "learning_rate": 5.375330115277635e-07, + "loss": 0.9602, + "step": 11212 + }, + { + "epoch": 1.5874566433071422, + "grad_norm": 8.771874252009454, + "learning_rate": 5.371779587671252e-07, + "loss": 0.9035, + "step": 11213 + }, + { + "epoch": 1.5875982161817794, + "grad_norm": 9.466826089224059, + "learning_rate": 5.368230091905774e-07, + "loss": 1.0542, + "step": 11214 + }, + { + "epoch": 1.5877397890564167, + "grad_norm": 9.705233579769764, + "learning_rate": 5.364681628167806e-07, + "loss": 0.9635, + "step": 11215 + }, + { + "epoch": 1.587881361931054, + "grad_norm": 7.910704070536425, + "learning_rate": 5.36113419664388e-07, + "loss": 1.0044, + "step": 11216 + }, + { + "epoch": 1.5880229348056911, + "grad_norm": 9.362913758693624, + "learning_rate": 5.357587797520491e-07, + "loss": 0.874, + "step": 11217 + }, + { + "epoch": 1.5881645076803284, + "grad_norm": 9.403184359494013, + "learning_rate": 5.354042430984061e-07, + "loss": 0.9357, + "step": 11218 + }, + { + "epoch": 1.5883060805549656, + "grad_norm": 8.875045561337718, + "learning_rate": 5.350498097220972e-07, + "loss": 0.9519, + "step": 11219 + }, + { + "epoch": 1.5884476534296028, + "grad_norm": 10.232624865267992, + "learning_rate": 5.346954796417558e-07, + "loss": 0.9909, + "step": 11220 + }, + { + "epoch": 1.58858922630424, + "grad_norm": 9.540350099933276, + "learning_rate": 5.343412528760064e-07, + "loss": 0.9636, + "step": 11221 + }, + { + "epoch": 1.5887307991788773, + "grad_norm": 9.99525148183169, + "learning_rate": 5.339871294434724e-07, + "loss": 1.1069, + "step": 11222 + }, + { + "epoch": 1.5888723720535145, + "grad_norm": 12.597280759907882, + "learning_rate": 5.336331093627683e-07, + "loss": 1.0425, + "step": 11223 + }, + { + "epoch": 1.5890139449281517, + "grad_norm": 10.035209848655033, + "learning_rate": 5.332791926525055e-07, + "loss": 1.0065, + "step": 11224 + }, + { + "epoch": 1.589155517802789, + "grad_norm": 9.783819379110922, + "learning_rate": 5.329253793312897e-07, + "loss": 1.0242, + "step": 11225 + }, + { + "epoch": 1.5892970906774262, + "grad_norm": 8.597456587016012, + "learning_rate": 5.325716694177194e-07, + "loss": 1.0197, + "step": 11226 + }, + { + "epoch": 1.5894386635520634, + "grad_norm": 9.182444316758534, + "learning_rate": 5.322180629303902e-07, + "loss": 0.9704, + "step": 11227 + }, + { + "epoch": 1.5895802364267007, + "grad_norm": 10.907915135802225, + "learning_rate": 5.318645598878894e-07, + 
"loss": 0.9941, + "step": 11228 + }, + { + "epoch": 1.5897218093013379, + "grad_norm": 8.408914703847461, + "learning_rate": 5.315111603088019e-07, + "loss": 0.9859, + "step": 11229 + }, + { + "epoch": 1.5898633821759751, + "grad_norm": 10.518445479846344, + "learning_rate": 5.311578642117049e-07, + "loss": 0.9922, + "step": 11230 + }, + { + "epoch": 1.5900049550506123, + "grad_norm": 8.289833313447629, + "learning_rate": 5.308046716151705e-07, + "loss": 0.9331, + "step": 11231 + }, + { + "epoch": 1.5901465279252496, + "grad_norm": 10.653384005060428, + "learning_rate": 5.304515825377666e-07, + "loss": 1.0366, + "step": 11232 + }, + { + "epoch": 1.5902881007998868, + "grad_norm": 11.477373503509524, + "learning_rate": 5.300985969980537e-07, + "loss": 1.1637, + "step": 11233 + }, + { + "epoch": 1.590429673674524, + "grad_norm": 9.388903264844172, + "learning_rate": 5.297457150145898e-07, + "loss": 1.0019, + "step": 11234 + }, + { + "epoch": 1.5905712465491613, + "grad_norm": 10.681046528493717, + "learning_rate": 5.293929366059236e-07, + "loss": 1.1066, + "step": 11235 + }, + { + "epoch": 1.5907128194237985, + "grad_norm": 7.654712008709361, + "learning_rate": 5.290402617906021e-07, + "loss": 0.9346, + "step": 11236 + }, + { + "epoch": 1.5908543922984357, + "grad_norm": 8.588739661592344, + "learning_rate": 5.286876905871638e-07, + "loss": 0.9497, + "step": 11237 + }, + { + "epoch": 1.590995965173073, + "grad_norm": 10.527886914906828, + "learning_rate": 5.283352230141445e-07, + "loss": 1.0091, + "step": 11238 + }, + { + "epoch": 1.5911375380477102, + "grad_norm": 8.782564438155383, + "learning_rate": 5.279828590900715e-07, + "loss": 0.9532, + "step": 11239 + }, + { + "epoch": 1.5912791109223474, + "grad_norm": 9.615361130905816, + "learning_rate": 5.276305988334701e-07, + "loss": 0.9314, + "step": 11240 + }, + { + "epoch": 1.5914206837969846, + "grad_norm": 11.251063656856827, + "learning_rate": 5.272784422628574e-07, + "loss": 0.9833, + "step": 11241 + }, + { + "epoch": 1.5915622566716219, + "grad_norm": 8.481464596354733, + "learning_rate": 5.269263893967453e-07, + "loss": 0.8925, + "step": 11242 + }, + { + "epoch": 1.5917038295462589, + "grad_norm": 11.409375240730277, + "learning_rate": 5.265744402536424e-07, + "loss": 0.9975, + "step": 11243 + }, + { + "epoch": 1.5918454024208961, + "grad_norm": 9.05790941450762, + "learning_rate": 5.262225948520491e-07, + "loss": 0.9307, + "step": 11244 + }, + { + "epoch": 1.5919869752955333, + "grad_norm": 8.588534017219784, + "learning_rate": 5.258708532104631e-07, + "loss": 0.8719, + "step": 11245 + }, + { + "epoch": 1.5921285481701706, + "grad_norm": 9.58837501481897, + "learning_rate": 5.255192153473734e-07, + "loss": 0.8895, + "step": 11246 + }, + { + "epoch": 1.5922701210448078, + "grad_norm": 9.29730664701729, + "learning_rate": 5.251676812812664e-07, + "loss": 0.9659, + "step": 11247 + }, + { + "epoch": 1.592411693919445, + "grad_norm": 10.74774785070364, + "learning_rate": 5.248162510306229e-07, + "loss": 0.9839, + "step": 11248 + }, + { + "epoch": 1.5925532667940823, + "grad_norm": 8.056179200801687, + "learning_rate": 5.244649246139152e-07, + "loss": 0.8608, + "step": 11249 + }, + { + "epoch": 1.5926948396687195, + "grad_norm": 12.011132480611346, + "learning_rate": 5.241137020496142e-07, + "loss": 1.0858, + "step": 11250 + }, + { + "epoch": 1.5928364125433567, + "grad_norm": 10.291038421935749, + "learning_rate": 5.237625833561821e-07, + "loss": 1.0057, + "step": 11251 + }, + { + "epoch": 1.592977985417994, + "grad_norm": 
9.186534597317687, + "learning_rate": 5.234115685520788e-07, + "loss": 0.9673, + "step": 11252 + }, + { + "epoch": 1.593119558292631, + "grad_norm": 9.336064983439222, + "learning_rate": 5.23060657655754e-07, + "loss": 1.0473, + "step": 11253 + }, + { + "epoch": 1.5932611311672682, + "grad_norm": 9.525640719433138, + "learning_rate": 5.227098506856563e-07, + "loss": 0.9043, + "step": 11254 + }, + { + "epoch": 1.5934027040419054, + "grad_norm": 11.648853399475028, + "learning_rate": 5.223591476602283e-07, + "loss": 0.9924, + "step": 11255 + }, + { + "epoch": 1.5935442769165427, + "grad_norm": 9.203715229375446, + "learning_rate": 5.220085485979046e-07, + "loss": 0.9014, + "step": 11256 + }, + { + "epoch": 1.5936858497911799, + "grad_norm": 9.795743788733539, + "learning_rate": 5.216580535171173e-07, + "loss": 0.9111, + "step": 11257 + }, + { + "epoch": 1.5938274226658171, + "grad_norm": 12.134567418759916, + "learning_rate": 5.213076624362903e-07, + "loss": 1.0633, + "step": 11258 + }, + { + "epoch": 1.5939689955404543, + "grad_norm": 7.855023918285414, + "learning_rate": 5.209573753738448e-07, + "loss": 0.8751, + "step": 11259 + }, + { + "epoch": 1.5941105684150916, + "grad_norm": 9.692037983869117, + "learning_rate": 5.206071923481937e-07, + "loss": 0.954, + "step": 11260 + }, + { + "epoch": 1.5942521412897288, + "grad_norm": 8.77519870109232, + "learning_rate": 5.202571133777474e-07, + "loss": 0.8821, + "step": 11261 + }, + { + "epoch": 1.594393714164366, + "grad_norm": 10.124513214089141, + "learning_rate": 5.199071384809085e-07, + "loss": 0.9866, + "step": 11262 + }, + { + "epoch": 1.5945352870390033, + "grad_norm": 11.395531538198906, + "learning_rate": 5.19557267676074e-07, + "loss": 0.9898, + "step": 11263 + }, + { + "epoch": 1.5946768599136405, + "grad_norm": 10.089285698853312, + "learning_rate": 5.192075009816381e-07, + "loss": 1.0521, + "step": 11264 + }, + { + "epoch": 1.5948184327882777, + "grad_norm": 9.968997964196614, + "learning_rate": 5.188578384159862e-07, + "loss": 0.9447, + "step": 11265 + }, + { + "epoch": 1.594960005662915, + "grad_norm": 9.717972015937228, + "learning_rate": 5.185082799975013e-07, + "loss": 0.9334, + "step": 11266 + }, + { + "epoch": 1.5951015785375522, + "grad_norm": 8.691522120014152, + "learning_rate": 5.18158825744558e-07, + "loss": 0.9523, + "step": 11267 + }, + { + "epoch": 1.5952431514121894, + "grad_norm": 9.660882030206274, + "learning_rate": 5.17809475675528e-07, + "loss": 1.0605, + "step": 11268 + }, + { + "epoch": 1.5953847242868266, + "grad_norm": 8.465747258685981, + "learning_rate": 5.174602298087755e-07, + "loss": 0.8618, + "step": 11269 + }, + { + "epoch": 1.5955262971614639, + "grad_norm": 9.935156203924864, + "learning_rate": 5.171110881626604e-07, + "loss": 1.0468, + "step": 11270 + }, + { + "epoch": 1.595667870036101, + "grad_norm": 8.902416595522498, + "learning_rate": 5.167620507555373e-07, + "loss": 0.9345, + "step": 11271 + }, + { + "epoch": 1.5958094429107383, + "grad_norm": 9.262285064798105, + "learning_rate": 5.164131176057541e-07, + "loss": 0.9217, + "step": 11272 + }, + { + "epoch": 1.5959510157853756, + "grad_norm": 9.349439030918973, + "learning_rate": 5.160642887316555e-07, + "loss": 1.0374, + "step": 11273 + }, + { + "epoch": 1.5960925886600128, + "grad_norm": 8.78396249601213, + "learning_rate": 5.157155641515766e-07, + "loss": 0.9764, + "step": 11274 + }, + { + "epoch": 1.59623416153465, + "grad_norm": 9.409799508078084, + "learning_rate": 5.153669438838507e-07, + "loss": 1.0268, + "step": 11275 + }, + { + 
"epoch": 1.5963757344092873, + "grad_norm": 10.086089543054227, + "learning_rate": 5.150184279468057e-07, + "loss": 0.8831, + "step": 11276 + }, + { + "epoch": 1.5965173072839245, + "grad_norm": 8.558972154032054, + "learning_rate": 5.146700163587612e-07, + "loss": 0.9182, + "step": 11277 + }, + { + "epoch": 1.5966588801585617, + "grad_norm": 8.789026909650163, + "learning_rate": 5.143217091380343e-07, + "loss": 0.8448, + "step": 11278 + }, + { + "epoch": 1.596800453033199, + "grad_norm": 10.156391976171106, + "learning_rate": 5.139735063029338e-07, + "loss": 1.0539, + "step": 11279 + }, + { + "epoch": 1.5969420259078362, + "grad_norm": 8.790857455600534, + "learning_rate": 5.136254078717659e-07, + "loss": 1.0294, + "step": 11280 + }, + { + "epoch": 1.5970835987824734, + "grad_norm": 8.03455566772874, + "learning_rate": 5.132774138628286e-07, + "loss": 0.9832, + "step": 11281 + }, + { + "epoch": 1.5972251716571106, + "grad_norm": 9.905932689121517, + "learning_rate": 5.129295242944168e-07, + "loss": 1.0115, + "step": 11282 + }, + { + "epoch": 1.5973667445317479, + "grad_norm": 12.404198926209485, + "learning_rate": 5.125817391848187e-07, + "loss": 1.0297, + "step": 11283 + }, + { + "epoch": 1.5975083174063849, + "grad_norm": 9.829395691156213, + "learning_rate": 5.122340585523156e-07, + "loss": 0.9234, + "step": 11284 + }, + { + "epoch": 1.5976498902810221, + "grad_norm": 7.888680579191782, + "learning_rate": 5.118864824151868e-07, + "loss": 0.8931, + "step": 11285 + }, + { + "epoch": 1.5977914631556593, + "grad_norm": 8.766422228674948, + "learning_rate": 5.115390107917024e-07, + "loss": 1.0384, + "step": 11286 + }, + { + "epoch": 1.5979330360302966, + "grad_norm": 10.005227057965516, + "learning_rate": 5.111916437001302e-07, + "loss": 0.9741, + "step": 11287 + }, + { + "epoch": 1.5980746089049338, + "grad_norm": 8.884517670410693, + "learning_rate": 5.1084438115873e-07, + "loss": 0.9016, + "step": 11288 + }, + { + "epoch": 1.598216181779571, + "grad_norm": 11.069797328895955, + "learning_rate": 5.104972231857577e-07, + "loss": 0.9878, + "step": 11289 + }, + { + "epoch": 1.5983577546542083, + "grad_norm": 8.609414877686683, + "learning_rate": 5.101501697994626e-07, + "loss": 0.9294, + "step": 11290 + }, + { + "epoch": 1.5984993275288455, + "grad_norm": 12.464193711505358, + "learning_rate": 5.098032210180901e-07, + "loss": 0.8722, + "step": 11291 + }, + { + "epoch": 1.5986409004034827, + "grad_norm": 10.866138223854064, + "learning_rate": 5.094563768598773e-07, + "loss": 1.1141, + "step": 11292 + }, + { + "epoch": 1.59878247327812, + "grad_norm": 9.075440316775786, + "learning_rate": 5.091096373430588e-07, + "loss": 0.9746, + "step": 11293 + }, + { + "epoch": 1.5989240461527572, + "grad_norm": 10.018699709646473, + "learning_rate": 5.087630024858637e-07, + "loss": 0.9266, + "step": 11294 + }, + { + "epoch": 1.5990656190273942, + "grad_norm": 8.79384548674948, + "learning_rate": 5.084164723065111e-07, + "loss": 0.9845, + "step": 11295 + }, + { + "epoch": 1.5992071919020314, + "grad_norm": 10.403169559785539, + "learning_rate": 5.080700468232206e-07, + "loss": 1.0662, + "step": 11296 + }, + { + "epoch": 1.5993487647766687, + "grad_norm": 10.309492932790148, + "learning_rate": 5.077237260542014e-07, + "loss": 1.0499, + "step": 11297 + }, + { + "epoch": 1.5994903376513059, + "grad_norm": 7.694398839320402, + "learning_rate": 5.073775100176609e-07, + "loss": 0.8774, + "step": 11298 + }, + { + "epoch": 1.5996319105259431, + "grad_norm": 9.912753019434934, + "learning_rate": 
5.070313987317992e-07, + "loss": 0.9873, + "step": 11299 + }, + { + "epoch": 1.5997734834005803, + "grad_norm": 9.46374218143437, + "learning_rate": 5.066853922148104e-07, + "loss": 1.0038, + "step": 11300 + }, + { + "epoch": 1.5999150562752176, + "grad_norm": 10.346897533764349, + "learning_rate": 5.063394904848851e-07, + "loss": 0.9202, + "step": 11301 + }, + { + "epoch": 1.6000566291498548, + "grad_norm": 9.359201369163648, + "learning_rate": 5.059936935602052e-07, + "loss": 1.0389, + "step": 11302 + }, + { + "epoch": 1.600198202024492, + "grad_norm": 9.211528471155173, + "learning_rate": 5.05648001458951e-07, + "loss": 0.9754, + "step": 11303 + }, + { + "epoch": 1.6003397748991293, + "grad_norm": 8.833129112713966, + "learning_rate": 5.053024141992935e-07, + "loss": 1.0471, + "step": 11304 + }, + { + "epoch": 1.6004813477737665, + "grad_norm": 9.299638097140821, + "learning_rate": 5.049569317994013e-07, + "loss": 0.9084, + "step": 11305 + }, + { + "epoch": 1.6006229206484037, + "grad_norm": 8.743616882139829, + "learning_rate": 5.046115542774358e-07, + "loss": 1.014, + "step": 11306 + }, + { + "epoch": 1.600764493523041, + "grad_norm": 11.04486264213781, + "learning_rate": 5.042662816515523e-07, + "loss": 1.0566, + "step": 11307 + }, + { + "epoch": 1.6009060663976782, + "grad_norm": 9.93695220397099, + "learning_rate": 5.039211139399031e-07, + "loss": 1.0597, + "step": 11308 + }, + { + "epoch": 1.6010476392723154, + "grad_norm": 9.655500166831322, + "learning_rate": 5.035760511606319e-07, + "loss": 1.05, + "step": 11309 + }, + { + "epoch": 1.6011892121469526, + "grad_norm": 9.2035606294128, + "learning_rate": 5.032310933318798e-07, + "loss": 1.0704, + "step": 11310 + }, + { + "epoch": 1.6013307850215899, + "grad_norm": 8.402276103138894, + "learning_rate": 5.028862404717796e-07, + "loss": 0.9219, + "step": 11311 + }, + { + "epoch": 1.601472357896227, + "grad_norm": 9.030832247988476, + "learning_rate": 5.025414925984612e-07, + "loss": 0.9958, + "step": 11312 + }, + { + "epoch": 1.6016139307708643, + "grad_norm": 8.169416885764734, + "learning_rate": 5.021968497300464e-07, + "loss": 0.8955, + "step": 11313 + }, + { + "epoch": 1.6017555036455016, + "grad_norm": 9.013702981054127, + "learning_rate": 5.018523118846544e-07, + "loss": 0.9711, + "step": 11314 + }, + { + "epoch": 1.6018970765201388, + "grad_norm": 9.236943283133241, + "learning_rate": 5.015078790803965e-07, + "loss": 0.925, + "step": 11315 + }, + { + "epoch": 1.602038649394776, + "grad_norm": 9.553379072180553, + "learning_rate": 5.011635513353786e-07, + "loss": 0.9241, + "step": 11316 + }, + { + "epoch": 1.6021802222694133, + "grad_norm": 9.849223428699213, + "learning_rate": 5.008193286677029e-07, + "loss": 0.9637, + "step": 11317 + }, + { + "epoch": 1.6023217951440505, + "grad_norm": 8.823651123703387, + "learning_rate": 5.004752110954642e-07, + "loss": 0.9366, + "step": 11318 + }, + { + "epoch": 1.6024633680186877, + "grad_norm": 9.412184557245382, + "learning_rate": 5.00131198636753e-07, + "loss": 0.9206, + "step": 11319 + }, + { + "epoch": 1.602604940893325, + "grad_norm": 10.108428203224578, + "learning_rate": 4.997872913096529e-07, + "loss": 0.9776, + "step": 11320 + }, + { + "epoch": 1.6027465137679622, + "grad_norm": 8.22682077956972, + "learning_rate": 4.994434891322436e-07, + "loss": 0.8809, + "step": 11321 + }, + { + "epoch": 1.6028880866425994, + "grad_norm": 8.309811723873802, + "learning_rate": 4.99099792122599e-07, + "loss": 0.9576, + "step": 11322 + }, + { + "epoch": 1.6030296595172366, + "grad_norm": 
8.869514166806194, + "learning_rate": 4.987562002987858e-07, + "loss": 1.0168, + "step": 11323 + }, + { + "epoch": 1.6031712323918739, + "grad_norm": 9.008772918769411, + "learning_rate": 4.984127136788675e-07, + "loss": 0.9182, + "step": 11324 + }, + { + "epoch": 1.603312805266511, + "grad_norm": 8.44566300729626, + "learning_rate": 4.980693322808999e-07, + "loss": 0.8853, + "step": 11325 + }, + { + "epoch": 1.603454378141148, + "grad_norm": 8.858881968616961, + "learning_rate": 4.97726056122936e-07, + "loss": 0.9341, + "step": 11326 + }, + { + "epoch": 1.6035959510157853, + "grad_norm": 8.803625521718887, + "learning_rate": 4.97382885223019e-07, + "loss": 0.9187, + "step": 11327 + }, + { + "epoch": 1.6037375238904226, + "grad_norm": 9.351770129404521, + "learning_rate": 4.970398195991908e-07, + "loss": 0.9508, + "step": 11328 + }, + { + "epoch": 1.6038790967650598, + "grad_norm": 10.09121681762668, + "learning_rate": 4.96696859269486e-07, + "loss": 0.9586, + "step": 11329 + }, + { + "epoch": 1.604020669639697, + "grad_norm": 9.703777985400398, + "learning_rate": 4.963540042519333e-07, + "loss": 1.0117, + "step": 11330 + }, + { + "epoch": 1.6041622425143343, + "grad_norm": 9.495740638359711, + "learning_rate": 4.96011254564557e-07, + "loss": 0.9882, + "step": 11331 + }, + { + "epoch": 1.6043038153889715, + "grad_norm": 11.963181916096453, + "learning_rate": 4.956686102253744e-07, + "loss": 1.1031, + "step": 11332 + }, + { + "epoch": 1.6044453882636087, + "grad_norm": 9.33105831575348, + "learning_rate": 4.953260712523992e-07, + "loss": 0.9343, + "step": 11333 + }, + { + "epoch": 1.604586961138246, + "grad_norm": 9.495369033431754, + "learning_rate": 4.949836376636366e-07, + "loss": 0.9322, + "step": 11334 + }, + { + "epoch": 1.6047285340128832, + "grad_norm": 7.8463937627783125, + "learning_rate": 4.946413094770902e-07, + "loss": 0.9708, + "step": 11335 + }, + { + "epoch": 1.6048701068875202, + "grad_norm": 10.237867292399201, + "learning_rate": 4.942990867107547e-07, + "loss": 1.0268, + "step": 11336 + }, + { + "epoch": 1.6050116797621574, + "grad_norm": 8.408273219437797, + "learning_rate": 4.939569693826202e-07, + "loss": 0.9802, + "step": 11337 + }, + { + "epoch": 1.6051532526367946, + "grad_norm": 8.329230201202805, + "learning_rate": 4.936149575106727e-07, + "loss": 1.0269, + "step": 11338 + }, + { + "epoch": 1.6052948255114319, + "grad_norm": 8.688890524254548, + "learning_rate": 4.9327305111289e-07, + "loss": 0.908, + "step": 11339 + }, + { + "epoch": 1.605436398386069, + "grad_norm": 8.96168883415698, + "learning_rate": 4.929312502072475e-07, + "loss": 0.9499, + "step": 11340 + }, + { + "epoch": 1.6055779712607063, + "grad_norm": 11.474000150050522, + "learning_rate": 4.925895548117121e-07, + "loss": 0.9098, + "step": 11341 + }, + { + "epoch": 1.6057195441353436, + "grad_norm": 10.627873649193551, + "learning_rate": 4.922479649442477e-07, + "loss": 1.0664, + "step": 11342 + }, + { + "epoch": 1.6058611170099808, + "grad_norm": 10.1515185793936, + "learning_rate": 4.919064806228099e-07, + "loss": 0.9648, + "step": 11343 + }, + { + "epoch": 1.606002689884618, + "grad_norm": 8.309953112969175, + "learning_rate": 4.915651018653511e-07, + "loss": 1.0224, + "step": 11344 + }, + { + "epoch": 1.6061442627592553, + "grad_norm": 9.082570968506067, + "learning_rate": 4.91223828689818e-07, + "loss": 0.8829, + "step": 11345 + }, + { + "epoch": 1.6062858356338925, + "grad_norm": 8.30014933256451, + "learning_rate": 4.908826611141498e-07, + "loss": 0.9367, + "step": 11346 + }, + { + "epoch": 
1.6064274085085297, + "grad_norm": 9.900672858906209, + "learning_rate": 4.905415991562834e-07, + "loss": 1.1747, + "step": 11347 + }, + { + "epoch": 1.606568981383167, + "grad_norm": 9.92376527576782, + "learning_rate": 4.902006428341457e-07, + "loss": 1.1159, + "step": 11348 + }, + { + "epoch": 1.6067105542578042, + "grad_norm": 8.239395724611015, + "learning_rate": 4.89859792165662e-07, + "loss": 0.9263, + "step": 11349 + }, + { + "epoch": 1.6068521271324414, + "grad_norm": 9.553602280204066, + "learning_rate": 4.895190471687497e-07, + "loss": 1.0716, + "step": 11350 + }, + { + "epoch": 1.6069937000070786, + "grad_norm": 9.066718020948192, + "learning_rate": 4.891784078613218e-07, + "loss": 0.9621, + "step": 11351 + }, + { + "epoch": 1.6071352728817159, + "grad_norm": 10.014013008927478, + "learning_rate": 4.888378742612865e-07, + "loss": 0.9182, + "step": 11352 + }, + { + "epoch": 1.607276845756353, + "grad_norm": 9.622094050085533, + "learning_rate": 4.884974463865438e-07, + "loss": 1.0567, + "step": 11353 + }, + { + "epoch": 1.6074184186309903, + "grad_norm": 9.650394775537967, + "learning_rate": 4.881571242549915e-07, + "loss": 0.9382, + "step": 11354 + }, + { + "epoch": 1.6075599915056276, + "grad_norm": 9.391671374730779, + "learning_rate": 4.87816907884518e-07, + "loss": 0.9378, + "step": 11355 + }, + { + "epoch": 1.6077015643802648, + "grad_norm": 9.69791172790948, + "learning_rate": 4.874767972930103e-07, + "loss": 1.0376, + "step": 11356 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 7.987924760917818, + "learning_rate": 4.871367924983458e-07, + "loss": 0.9377, + "step": 11357 + }, + { + "epoch": 1.6079847101295393, + "grad_norm": 8.23484797992297, + "learning_rate": 4.867968935184e-07, + "loss": 0.9849, + "step": 11358 + }, + { + "epoch": 1.6081262830041765, + "grad_norm": 8.848979360919566, + "learning_rate": 4.864571003710405e-07, + "loss": 0.9699, + "step": 11359 + }, + { + "epoch": 1.6082678558788137, + "grad_norm": 8.76497175805111, + "learning_rate": 4.861174130741292e-07, + "loss": 0.8854, + "step": 11360 + }, + { + "epoch": 1.608409428753451, + "grad_norm": 8.751930677763559, + "learning_rate": 4.857778316455245e-07, + "loss": 0.8753, + "step": 11361 + }, + { + "epoch": 1.6085510016280882, + "grad_norm": 11.298734435382956, + "learning_rate": 4.854383561030768e-07, + "loss": 1.0169, + "step": 11362 + }, + { + "epoch": 1.6086925745027254, + "grad_norm": 7.67847837474113, + "learning_rate": 4.85098986464633e-07, + "loss": 0.8503, + "step": 11363 + }, + { + "epoch": 1.6088341473773626, + "grad_norm": 8.370690902579275, + "learning_rate": 4.847597227480327e-07, + "loss": 0.9254, + "step": 11364 + }, + { + "epoch": 1.6089757202519999, + "grad_norm": 9.690032824764813, + "learning_rate": 4.844205649711118e-07, + "loss": 0.9781, + "step": 11365 + }, + { + "epoch": 1.609117293126637, + "grad_norm": 9.490280801298768, + "learning_rate": 4.840815131516979e-07, + "loss": 0.9595, + "step": 11366 + }, + { + "epoch": 1.609258866001274, + "grad_norm": 9.390661153866551, + "learning_rate": 4.83742567307616e-07, + "loss": 0.9854, + "step": 11367 + }, + { + "epoch": 1.6094004388759113, + "grad_norm": 9.101740626609947, + "learning_rate": 4.834037274566852e-07, + "loss": 0.909, + "step": 11368 + }, + { + "epoch": 1.6095420117505486, + "grad_norm": 10.278401545477688, + "learning_rate": 4.830649936167156e-07, + "loss": 0.9606, + "step": 11369 + }, + { + "epoch": 1.6096835846251858, + "grad_norm": 10.950780511499405, + "learning_rate": 4.827263658055161e-07, + "loss": 0.9639, + 
"step": 11370 + }, + { + "epoch": 1.609825157499823, + "grad_norm": 10.307453192161182, + "learning_rate": 4.823878440408866e-07, + "loss": 0.989, + "step": 11371 + }, + { + "epoch": 1.6099667303744603, + "grad_norm": 8.325041753074256, + "learning_rate": 4.820494283406238e-07, + "loss": 0.9372, + "step": 11372 + }, + { + "epoch": 1.6101083032490975, + "grad_norm": 12.914536298401673, + "learning_rate": 4.817111187225184e-07, + "loss": 1.0494, + "step": 11373 + }, + { + "epoch": 1.6102498761237347, + "grad_norm": 10.549058295431406, + "learning_rate": 4.813729152043542e-07, + "loss": 0.8999, + "step": 11374 + }, + { + "epoch": 1.610391448998372, + "grad_norm": 8.15344330300472, + "learning_rate": 4.810348178039112e-07, + "loss": 0.8271, + "step": 11375 + }, + { + "epoch": 1.6105330218730092, + "grad_norm": 8.539288991592544, + "learning_rate": 4.806968265389619e-07, + "loss": 0.9128, + "step": 11376 + }, + { + "epoch": 1.6106745947476464, + "grad_norm": 10.399498384556374, + "learning_rate": 4.803589414272752e-07, + "loss": 1.0517, + "step": 11377 + }, + { + "epoch": 1.6108161676222834, + "grad_norm": 8.85862015564567, + "learning_rate": 4.800211624866128e-07, + "loss": 0.8699, + "step": 11378 + }, + { + "epoch": 1.6109577404969206, + "grad_norm": 9.497095165857584, + "learning_rate": 4.796834897347319e-07, + "loss": 0.9267, + "step": 11379 + }, + { + "epoch": 1.6110993133715579, + "grad_norm": 9.087491218321281, + "learning_rate": 4.793459231893838e-07, + "loss": 1.0007, + "step": 11380 + }, + { + "epoch": 1.611240886246195, + "grad_norm": 9.674728539574282, + "learning_rate": 4.790084628683131e-07, + "loss": 1.0422, + "step": 11381 + }, + { + "epoch": 1.6113824591208323, + "grad_norm": 10.932093041323796, + "learning_rate": 4.786711087892613e-07, + "loss": 0.9649, + "step": 11382 + }, + { + "epoch": 1.6115240319954696, + "grad_norm": 10.646201847115771, + "learning_rate": 4.783338609699614e-07, + "loss": 0.9522, + "step": 11383 + }, + { + "epoch": 1.6116656048701068, + "grad_norm": 9.423435265451152, + "learning_rate": 4.779967194281438e-07, + "loss": 0.9658, + "step": 11384 + }, + { + "epoch": 1.611807177744744, + "grad_norm": 9.891471549351042, + "learning_rate": 4.776596841815304e-07, + "loss": 1.0657, + "step": 11385 + }, + { + "epoch": 1.6119487506193813, + "grad_norm": 8.34245010136996, + "learning_rate": 4.773227552478399e-07, + "loss": 0.9944, + "step": 11386 + }, + { + "epoch": 1.6120903234940185, + "grad_norm": 8.601757632891083, + "learning_rate": 4.769859326447834e-07, + "loss": 0.9776, + "step": 11387 + }, + { + "epoch": 1.6122318963686557, + "grad_norm": 10.960385515992568, + "learning_rate": 4.7664921639006877e-07, + "loss": 0.951, + "step": 11388 + }, + { + "epoch": 1.612373469243293, + "grad_norm": 9.074543284123433, + "learning_rate": 4.7631260650139595e-07, + "loss": 0.8431, + "step": 11389 + }, + { + "epoch": 1.6125150421179302, + "grad_norm": 9.396881608925803, + "learning_rate": 4.7597610299645993e-07, + "loss": 0.9065, + "step": 11390 + }, + { + "epoch": 1.6126566149925674, + "grad_norm": 9.818516921416025, + "learning_rate": 4.7563970589295185e-07, + "loss": 0.9826, + "step": 11391 + }, + { + "epoch": 1.6127981878672046, + "grad_norm": 8.364049784427799, + "learning_rate": 4.753034152085542e-07, + "loss": 0.9191, + "step": 11392 + }, + { + "epoch": 1.6129397607418419, + "grad_norm": 10.020688590566097, + "learning_rate": 4.7496723096094684e-07, + "loss": 0.9949, + "step": 11393 + }, + { + "epoch": 1.613081333616479, + "grad_norm": 9.228297988505163, + 
"learning_rate": 4.7463115316780163e-07, + "loss": 0.9977, + "step": 11394 + }, + { + "epoch": 1.6132229064911163, + "grad_norm": 10.657990702196425, + "learning_rate": 4.7429518184678667e-07, + "loss": 1.0057, + "step": 11395 + }, + { + "epoch": 1.6133644793657536, + "grad_norm": 9.363614098302728, + "learning_rate": 4.73959317015564e-07, + "loss": 1.0551, + "step": 11396 + }, + { + "epoch": 1.6135060522403908, + "grad_norm": 9.797453989445062, + "learning_rate": 4.736235586917889e-07, + "loss": 0.9832, + "step": 11397 + }, + { + "epoch": 1.613647625115028, + "grad_norm": 11.52371225837695, + "learning_rate": 4.732879068931132e-07, + "loss": 1.037, + "step": 11398 + }, + { + "epoch": 1.6137891979896652, + "grad_norm": 9.923147141009697, + "learning_rate": 4.7295236163718006e-07, + "loss": 1.089, + "step": 11399 + }, + { + "epoch": 1.6139307708643025, + "grad_norm": 9.6617364697723, + "learning_rate": 4.7261692294163134e-07, + "loss": 0.9276, + "step": 11400 + }, + { + "epoch": 1.6140723437389397, + "grad_norm": 9.063593515653281, + "learning_rate": 4.72281590824098e-07, + "loss": 0.9272, + "step": 11401 + }, + { + "epoch": 1.614213916613577, + "grad_norm": 8.974785558322331, + "learning_rate": 4.719463653022094e-07, + "loss": 0.8934, + "step": 11402 + }, + { + "epoch": 1.6143554894882142, + "grad_norm": 8.674196004850618, + "learning_rate": 4.7161124639358873e-07, + "loss": 0.8698, + "step": 11403 + }, + { + "epoch": 1.6144970623628514, + "grad_norm": 9.496186946786606, + "learning_rate": 4.7127623411585194e-07, + "loss": 0.9746, + "step": 11404 + }, + { + "epoch": 1.6146386352374886, + "grad_norm": 9.13165709249328, + "learning_rate": 4.7094132848661154e-07, + "loss": 0.9357, + "step": 11405 + }, + { + "epoch": 1.6147802081121259, + "grad_norm": 8.356081875441506, + "learning_rate": 4.706065295234719e-07, + "loss": 1.0198, + "step": 11406 + }, + { + "epoch": 1.614921780986763, + "grad_norm": 9.588722725974769, + "learning_rate": 4.702718372440343e-07, + "loss": 0.9291, + "step": 11407 + }, + { + "epoch": 1.6150633538614003, + "grad_norm": 8.65789096303103, + "learning_rate": 4.699372516658923e-07, + "loss": 0.9814, + "step": 11408 + }, + { + "epoch": 1.6152049267360373, + "grad_norm": 8.390442233519247, + "learning_rate": 4.6960277280663574e-07, + "loss": 0.8844, + "step": 11409 + }, + { + "epoch": 1.6153464996106746, + "grad_norm": 8.388786242308278, + "learning_rate": 4.692684006838477e-07, + "loss": 0.9455, + "step": 11410 + }, + { + "epoch": 1.6154880724853118, + "grad_norm": 9.472653028349967, + "learning_rate": 4.68934135315105e-07, + "loss": 0.9326, + "step": 11411 + }, + { + "epoch": 1.615629645359949, + "grad_norm": 8.975414179433884, + "learning_rate": 4.685999767179808e-07, + "loss": 0.9623, + "step": 11412 + }, + { + "epoch": 1.6157712182345862, + "grad_norm": 8.234745140370494, + "learning_rate": 4.6826592491004075e-07, + "loss": 0.9251, + "step": 11413 + }, + { + "epoch": 1.6159127911092235, + "grad_norm": 10.148105207704571, + "learning_rate": 4.679319799088466e-07, + "loss": 1.0287, + "step": 11414 + }, + { + "epoch": 1.6160543639838607, + "grad_norm": 8.542880783011894, + "learning_rate": 4.675981417319528e-07, + "loss": 1.0136, + "step": 11415 + }, + { + "epoch": 1.616195936858498, + "grad_norm": 9.73275797288667, + "learning_rate": 4.6726441039690955e-07, + "loss": 1.0162, + "step": 11416 + }, + { + "epoch": 1.6163375097331352, + "grad_norm": 8.711439342326793, + "learning_rate": 4.669307859212599e-07, + "loss": 0.9591, + "step": 11417 + }, + { + "epoch": 
1.6164790826077724, + "grad_norm": 10.176385913536812, + "learning_rate": 4.665972683225431e-07, + "loss": 1.0155, + "step": 11418 + }, + { + "epoch": 1.6166206554824094, + "grad_norm": 8.846084124950606, + "learning_rate": 4.6626385761829234e-07, + "loss": 1.0258, + "step": 11419 + }, + { + "epoch": 1.6167622283570466, + "grad_norm": 10.686125014347606, + "learning_rate": 4.6593055382603334e-07, + "loss": 1.0076, + "step": 11420 + }, + { + "epoch": 1.6169038012316839, + "grad_norm": 10.481603583034182, + "learning_rate": 4.655973569632899e-07, + "loss": 0.9747, + "step": 11421 + }, + { + "epoch": 1.617045374106321, + "grad_norm": 7.3674567487914375, + "learning_rate": 4.6526426704757545e-07, + "loss": 0.9419, + "step": 11422 + }, + { + "epoch": 1.6171869469809583, + "grad_norm": 8.708907940416434, + "learning_rate": 4.6493128409640153e-07, + "loss": 0.9545, + "step": 11423 + }, + { + "epoch": 1.6173285198555956, + "grad_norm": 9.879058004098358, + "learning_rate": 4.6459840812727227e-07, + "loss": 0.9471, + "step": 11424 + }, + { + "epoch": 1.6174700927302328, + "grad_norm": 8.428970659974961, + "learning_rate": 4.642656391576869e-07, + "loss": 0.9959, + "step": 11425 + }, + { + "epoch": 1.61761166560487, + "grad_norm": 9.97354020453248, + "learning_rate": 4.6393297720513957e-07, + "loss": 0.9806, + "step": 11426 + }, + { + "epoch": 1.6177532384795072, + "grad_norm": 8.235891589762478, + "learning_rate": 4.6360042228711684e-07, + "loss": 0.8225, + "step": 11427 + }, + { + "epoch": 1.6178948113541445, + "grad_norm": 8.145128406615449, + "learning_rate": 4.6326797442110217e-07, + "loss": 0.8719, + "step": 11428 + }, + { + "epoch": 1.6180363842287817, + "grad_norm": 7.740241767596772, + "learning_rate": 4.629356336245708e-07, + "loss": 0.8522, + "step": 11429 + }, + { + "epoch": 1.618177957103419, + "grad_norm": 9.060456565148652, + "learning_rate": 4.626033999149948e-07, + "loss": 1.0466, + "step": 11430 + }, + { + "epoch": 1.6183195299780562, + "grad_norm": 9.579378279419538, + "learning_rate": 4.622712733098386e-07, + "loss": 0.9426, + "step": 11431 + }, + { + "epoch": 1.6184611028526934, + "grad_norm": 10.342307552016052, + "learning_rate": 4.619392538265624e-07, + "loss": 0.9281, + "step": 11432 + }, + { + "epoch": 1.6186026757273306, + "grad_norm": 10.25837477038578, + "learning_rate": 4.6160734148262027e-07, + "loss": 1.0415, + "step": 11433 + }, + { + "epoch": 1.6187442486019679, + "grad_norm": 7.550301036769965, + "learning_rate": 4.612755362954596e-07, + "loss": 0.9419, + "step": 11434 + }, + { + "epoch": 1.618885821476605, + "grad_norm": 10.505465583633418, + "learning_rate": 4.609438382825246e-07, + "loss": 1.0169, + "step": 11435 + }, + { + "epoch": 1.6190273943512423, + "grad_norm": 9.967374222389774, + "learning_rate": 4.6061224746125067e-07, + "loss": 1.0317, + "step": 11436 + }, + { + "epoch": 1.6191689672258796, + "grad_norm": 7.2381903389966835, + "learning_rate": 4.602807638490711e-07, + "loss": 0.8159, + "step": 11437 + }, + { + "epoch": 1.6193105401005168, + "grad_norm": 9.635833192362513, + "learning_rate": 4.5994938746341033e-07, + "loss": 1.1217, + "step": 11438 + }, + { + "epoch": 1.619452112975154, + "grad_norm": 8.879005495348327, + "learning_rate": 4.5961811832168965e-07, + "loss": 0.8694, + "step": 11439 + }, + { + "epoch": 1.6195936858497912, + "grad_norm": 7.873746257465574, + "learning_rate": 4.592869564413227e-07, + "loss": 0.9164, + "step": 11440 + }, + { + "epoch": 1.6197352587244285, + "grad_norm": 11.343053202600755, + "learning_rate": 
4.5895590183971854e-07, + "loss": 0.9633, + "step": 11441 + }, + { + "epoch": 1.6198768315990657, + "grad_norm": 8.777726196051661, + "learning_rate": 4.5862495453428216e-07, + "loss": 0.8506, + "step": 11442 + }, + { + "epoch": 1.620018404473703, + "grad_norm": 10.144948264075166, + "learning_rate": 4.5829411454240856e-07, + "loss": 1.0048, + "step": 11443 + }, + { + "epoch": 1.6201599773483402, + "grad_norm": 9.997256856424181, + "learning_rate": 4.579633818814916e-07, + "loss": 0.9553, + "step": 11444 + }, + { + "epoch": 1.6203015502229774, + "grad_norm": 9.993348770723472, + "learning_rate": 4.576327565689165e-07, + "loss": 0.9843, + "step": 11445 + }, + { + "epoch": 1.6204431230976146, + "grad_norm": 8.96919488634179, + "learning_rate": 4.5730223862206493e-07, + "loss": 0.9325, + "step": 11446 + }, + { + "epoch": 1.6205846959722519, + "grad_norm": 9.113039303742879, + "learning_rate": 4.569718280583113e-07, + "loss": 0.9131, + "step": 11447 + }, + { + "epoch": 1.620726268846889, + "grad_norm": 8.854542539509557, + "learning_rate": 4.566415248950251e-07, + "loss": 0.9468, + "step": 11448 + }, + { + "epoch": 1.6208678417215263, + "grad_norm": 9.556914250183034, + "learning_rate": 4.5631132914957076e-07, + "loss": 0.9068, + "step": 11449 + }, + { + "epoch": 1.6210094145961633, + "grad_norm": 9.363914751592821, + "learning_rate": 4.5598124083930577e-07, + "loss": 0.915, + "step": 11450 + }, + { + "epoch": 1.6211509874708006, + "grad_norm": 9.780726994678545, + "learning_rate": 4.556512599815832e-07, + "loss": 0.8659, + "step": 11451 + }, + { + "epoch": 1.6212925603454378, + "grad_norm": 7.984320056226072, + "learning_rate": 4.553213865937492e-07, + "loss": 0.9004, + "step": 11452 + }, + { + "epoch": 1.621434133220075, + "grad_norm": 9.478394685826947, + "learning_rate": 4.5499162069314567e-07, + "loss": 0.9436, + "step": 11453 + }, + { + "epoch": 1.6215757060947122, + "grad_norm": 9.433985984340573, + "learning_rate": 4.5466196229710797e-07, + "loss": 1.0098, + "step": 11454 + }, + { + "epoch": 1.6217172789693495, + "grad_norm": 8.131816645471442, + "learning_rate": 4.5433241142296524e-07, + "loss": 0.8616, + "step": 11455 + }, + { + "epoch": 1.6218588518439867, + "grad_norm": 10.441400769849853, + "learning_rate": 4.5400296808804313e-07, + "loss": 1.1071, + "step": 11456 + }, + { + "epoch": 1.622000424718624, + "grad_norm": 8.757070246199639, + "learning_rate": 4.536736323096586e-07, + "loss": 0.8723, + "step": 11457 + }, + { + "epoch": 1.6221419975932612, + "grad_norm": 13.747383007350209, + "learning_rate": 4.5334440410512605e-07, + "loss": 1.1655, + "step": 11458 + }, + { + "epoch": 1.6222835704678984, + "grad_norm": 9.685422613074127, + "learning_rate": 4.5301528349175144e-07, + "loss": 0.9787, + "step": 11459 + }, + { + "epoch": 1.6224251433425356, + "grad_norm": 11.49146584346163, + "learning_rate": 4.526862704868376e-07, + "loss": 1.0315, + "step": 11460 + }, + { + "epoch": 1.6225667162171726, + "grad_norm": 11.617240695280024, + "learning_rate": 4.5235736510767957e-07, + "loss": 1.0132, + "step": 11461 + }, + { + "epoch": 1.6227082890918099, + "grad_norm": 8.820932236448682, + "learning_rate": 4.520285673715688e-07, + "loss": 0.9522, + "step": 11462 + }, + { + "epoch": 1.622849861966447, + "grad_norm": 9.293800371311882, + "learning_rate": 4.5169987729578897e-07, + "loss": 0.9272, + "step": 11463 + }, + { + "epoch": 1.6229914348410843, + "grad_norm": 8.26176907859966, + "learning_rate": 4.5137129489761874e-07, + "loss": 0.9098, + "step": 11464 + }, + { + "epoch": 
1.6231330077157216, + "grad_norm": 10.384450997032324, + "learning_rate": 4.510428201943326e-07, + "loss": 0.9444, + "step": 11465 + }, + { + "epoch": 1.6232745805903588, + "grad_norm": 8.137992916618783, + "learning_rate": 4.5071445320319706e-07, + "loss": 0.9438, + "step": 11466 + }, + { + "epoch": 1.623416153464996, + "grad_norm": 12.835484576230094, + "learning_rate": 4.5038619394147554e-07, + "loss": 1.0152, + "step": 11467 + }, + { + "epoch": 1.6235577263396332, + "grad_norm": 8.69029618964564, + "learning_rate": 4.500580424264225e-07, + "loss": 0.9012, + "step": 11468 + }, + { + "epoch": 1.6236992992142705, + "grad_norm": 9.057193438959038, + "learning_rate": 4.497299986752901e-07, + "loss": 0.9961, + "step": 11469 + }, + { + "epoch": 1.6238408720889077, + "grad_norm": 8.338007518871978, + "learning_rate": 4.4940206270532333e-07, + "loss": 0.8998, + "step": 11470 + }, + { + "epoch": 1.623982444963545, + "grad_norm": 9.018242044611318, + "learning_rate": 4.4907423453376034e-07, + "loss": 0.9167, + "step": 11471 + }, + { + "epoch": 1.6241240178381822, + "grad_norm": 11.259306724440854, + "learning_rate": 4.487465141778366e-07, + "loss": 0.927, + "step": 11472 + }, + { + "epoch": 1.6242655907128194, + "grad_norm": 8.405133141274955, + "learning_rate": 4.4841890165477825e-07, + "loss": 0.9586, + "step": 11473 + }, + { + "epoch": 1.6244071635874566, + "grad_norm": 9.443180219021862, + "learning_rate": 4.480913969818099e-07, + "loss": 1.0147, + "step": 11474 + }, + { + "epoch": 1.6245487364620939, + "grad_norm": 8.613074419994028, + "learning_rate": 4.4776400017614546e-07, + "loss": 0.9353, + "step": 11475 + }, + { + "epoch": 1.624690309336731, + "grad_norm": 9.994356470272752, + "learning_rate": 4.474367112549974e-07, + "loss": 0.9954, + "step": 11476 + }, + { + "epoch": 1.6248318822113683, + "grad_norm": 8.45060243885546, + "learning_rate": 4.471095302355716e-07, + "loss": 0.8757, + "step": 11477 + }, + { + "epoch": 1.6249734550860055, + "grad_norm": 8.95830073387661, + "learning_rate": 4.467824571350665e-07, + "loss": 1.0932, + "step": 11478 + }, + { + "epoch": 1.6251150279606428, + "grad_norm": 9.097470076454892, + "learning_rate": 4.4645549197067736e-07, + "loss": 1.0106, + "step": 11479 + }, + { + "epoch": 1.62525660083528, + "grad_norm": 9.29163126803328, + "learning_rate": 4.461286347595911e-07, + "loss": 0.9969, + "step": 11480 + }, + { + "epoch": 1.6253981737099172, + "grad_norm": 9.472893843669816, + "learning_rate": 4.4580188551899164e-07, + "loss": 0.8249, + "step": 11481 + }, + { + "epoch": 1.6255397465845545, + "grad_norm": 9.717664651891477, + "learning_rate": 4.4547524426605484e-07, + "loss": 0.9534, + "step": 11482 + }, + { + "epoch": 1.6256813194591917, + "grad_norm": 9.690585918526079, + "learning_rate": 4.451487110179531e-07, + "loss": 0.992, + "step": 11483 + }, + { + "epoch": 1.625822892333829, + "grad_norm": 8.982061098090139, + "learning_rate": 4.448222857918508e-07, + "loss": 0.9449, + "step": 11484 + }, + { + "epoch": 1.6259644652084662, + "grad_norm": 11.30295999294215, + "learning_rate": 4.444959686049094e-07, + "loss": 1.0637, + "step": 11485 + }, + { + "epoch": 1.6261060380831034, + "grad_norm": 9.4515256427255, + "learning_rate": 4.441697594742819e-07, + "loss": 0.8613, + "step": 11486 + }, + { + "epoch": 1.6262476109577406, + "grad_norm": 9.1941371480933, + "learning_rate": 4.4384365841711684e-07, + "loss": 1.1167, + "step": 11487 + }, + { + "epoch": 1.6263891838323778, + "grad_norm": 8.044027294960548, + "learning_rate": 4.4351766545055826e-07, + "loss": 
0.8927, + "step": 11488 + }, + { + "epoch": 1.626530756707015, + "grad_norm": 9.621147672910197, + "learning_rate": 4.4319178059174186e-07, + "loss": 1.0665, + "step": 11489 + }, + { + "epoch": 1.6266723295816523, + "grad_norm": 9.480321479851204, + "learning_rate": 4.428660038578006e-07, + "loss": 0.9444, + "step": 11490 + }, + { + "epoch": 1.6268139024562895, + "grad_norm": 10.016745470427804, + "learning_rate": 4.4254033526585917e-07, + "loss": 0.9305, + "step": 11491 + }, + { + "epoch": 1.6269554753309265, + "grad_norm": 9.194946178052657, + "learning_rate": 4.42214774833038e-07, + "loss": 0.9987, + "step": 11492 + }, + { + "epoch": 1.6270970482055638, + "grad_norm": 9.080506424414232, + "learning_rate": 4.418893225764526e-07, + "loss": 0.9358, + "step": 11493 + }, + { + "epoch": 1.627238621080201, + "grad_norm": 7.81108214673092, + "learning_rate": 4.4156397851321003e-07, + "loss": 0.9413, + "step": 11494 + }, + { + "epoch": 1.6273801939548382, + "grad_norm": 10.797722569278461, + "learning_rate": 4.412387426604156e-07, + "loss": 1.1375, + "step": 11495 + }, + { + "epoch": 1.6275217668294755, + "grad_norm": 8.958439552801114, + "learning_rate": 4.4091361503516424e-07, + "loss": 0.9289, + "step": 11496 + }, + { + "epoch": 1.6276633397041127, + "grad_norm": 9.5624652936717, + "learning_rate": 4.405885956545494e-07, + "loss": 0.9941, + "step": 11497 + }, + { + "epoch": 1.62780491257875, + "grad_norm": 10.167661107524234, + "learning_rate": 4.402636845356559e-07, + "loss": 0.9828, + "step": 11498 + }, + { + "epoch": 1.6279464854533872, + "grad_norm": 8.36342310350856, + "learning_rate": 4.3993888169556463e-07, + "loss": 0.9368, + "step": 11499 + }, + { + "epoch": 1.6280880583280244, + "grad_norm": 10.071559831716618, + "learning_rate": 4.3961418715135097e-07, + "loss": 0.9633, + "step": 11500 + }, + { + "epoch": 1.6282296312026616, + "grad_norm": 9.15973729718902, + "learning_rate": 4.3928960092008254e-07, + "loss": 0.9336, + "step": 11501 + }, + { + "epoch": 1.6283712040772986, + "grad_norm": 9.77901386867062, + "learning_rate": 4.389651230188241e-07, + "loss": 1.0228, + "step": 11502 + }, + { + "epoch": 1.6285127769519359, + "grad_norm": 9.691193301141901, + "learning_rate": 4.386407534646314e-07, + "loss": 0.9397, + "step": 11503 + }, + { + "epoch": 1.628654349826573, + "grad_norm": 10.511985114843318, + "learning_rate": 4.3831649227455806e-07, + "loss": 0.9111, + "step": 11504 + }, + { + "epoch": 1.6287959227012103, + "grad_norm": 9.18611560174554, + "learning_rate": 4.3799233946564904e-07, + "loss": 0.8326, + "step": 11505 + }, + { + "epoch": 1.6289374955758475, + "grad_norm": 9.986271017035518, + "learning_rate": 4.3766829505494574e-07, + "loss": 1.0648, + "step": 11506 + }, + { + "epoch": 1.6290790684504848, + "grad_norm": 9.91188327174386, + "learning_rate": 4.3734435905948226e-07, + "loss": 0.9953, + "step": 11507 + }, + { + "epoch": 1.629220641325122, + "grad_norm": 9.267861519310335, + "learning_rate": 4.370205314962872e-07, + "loss": 1.0238, + "step": 11508 + }, + { + "epoch": 1.6293622141997592, + "grad_norm": 9.128979376980322, + "learning_rate": 4.366968123823856e-07, + "loss": 1.1037, + "step": 11509 + }, + { + "epoch": 1.6295037870743965, + "grad_norm": 11.118128336638364, + "learning_rate": 4.36373201734793e-07, + "loss": 0.9855, + "step": 11510 + }, + { + "epoch": 1.6296453599490337, + "grad_norm": 10.061297421831187, + "learning_rate": 4.360496995705235e-07, + "loss": 0.9465, + "step": 11511 + }, + { + "epoch": 1.629786932823671, + "grad_norm": 9.262439096433331, + 
"learning_rate": 4.3572630590658136e-07, + "loss": 0.9516, + "step": 11512 + }, + { + "epoch": 1.6299285056983082, + "grad_norm": 9.609751326472534, + "learning_rate": 4.354030207599691e-07, + "loss": 1.0455, + "step": 11513 + }, + { + "epoch": 1.6300700785729454, + "grad_norm": 8.79911230985298, + "learning_rate": 4.3507984414767974e-07, + "loss": 0.9234, + "step": 11514 + }, + { + "epoch": 1.6302116514475826, + "grad_norm": 12.493082802892243, + "learning_rate": 4.347567760867036e-07, + "loss": 0.9793, + "step": 11515 + }, + { + "epoch": 1.6303532243222199, + "grad_norm": 8.806242324035201, + "learning_rate": 4.344338165940248e-07, + "loss": 0.955, + "step": 11516 + }, + { + "epoch": 1.630494797196857, + "grad_norm": 9.636883026442318, + "learning_rate": 4.341109656866188e-07, + "loss": 0.9097, + "step": 11517 + }, + { + "epoch": 1.6306363700714943, + "grad_norm": 8.73089344604611, + "learning_rate": 4.337882233814597e-07, + "loss": 0.9364, + "step": 11518 + }, + { + "epoch": 1.6307779429461315, + "grad_norm": 12.374488897076404, + "learning_rate": 4.3346558969551253e-07, + "loss": 1.0077, + "step": 11519 + }, + { + "epoch": 1.6309195158207688, + "grad_norm": 9.941339866551449, + "learning_rate": 4.331430646457391e-07, + "loss": 0.8712, + "step": 11520 + }, + { + "epoch": 1.631061088695406, + "grad_norm": 8.940941634799689, + "learning_rate": 4.3282064824909265e-07, + "loss": 0.9516, + "step": 11521 + }, + { + "epoch": 1.6312026615700432, + "grad_norm": 10.741714833351105, + "learning_rate": 4.324983405225236e-07, + "loss": 0.9691, + "step": 11522 + }, + { + "epoch": 1.6313442344446805, + "grad_norm": 8.281936789770016, + "learning_rate": 4.321761414829759e-07, + "loss": 0.9449, + "step": 11523 + }, + { + "epoch": 1.6314858073193177, + "grad_norm": 9.028523492254717, + "learning_rate": 4.3185405114738593e-07, + "loss": 0.9834, + "step": 11524 + }, + { + "epoch": 1.631627380193955, + "grad_norm": 7.593266978031143, + "learning_rate": 4.3153206953268715e-07, + "loss": 0.9946, + "step": 11525 + }, + { + "epoch": 1.6317689530685922, + "grad_norm": 8.655081219041397, + "learning_rate": 4.312101966558044e-07, + "loss": 0.9267, + "step": 11526 + }, + { + "epoch": 1.6319105259432294, + "grad_norm": 9.572231950371801, + "learning_rate": 4.308884325336596e-07, + "loss": 1.0216, + "step": 11527 + }, + { + "epoch": 1.6320520988178666, + "grad_norm": 9.406144962167891, + "learning_rate": 4.305667771831673e-07, + "loss": 0.9139, + "step": 11528 + }, + { + "epoch": 1.6321936716925038, + "grad_norm": 8.347900272813742, + "learning_rate": 4.302452306212357e-07, + "loss": 0.939, + "step": 11529 + }, + { + "epoch": 1.632335244567141, + "grad_norm": 8.794555141351799, + "learning_rate": 4.2992379286476984e-07, + "loss": 0.9116, + "step": 11530 + }, + { + "epoch": 1.6324768174417783, + "grad_norm": 7.83171879247364, + "learning_rate": 4.296024639306659e-07, + "loss": 0.8206, + "step": 11531 + }, + { + "epoch": 1.6326183903164155, + "grad_norm": 9.112078152744234, + "learning_rate": 4.292812438358174e-07, + "loss": 0.977, + "step": 11532 + }, + { + "epoch": 1.6327599631910525, + "grad_norm": 7.9875126177429925, + "learning_rate": 4.2896013259710905e-07, + "loss": 0.9885, + "step": 11533 + }, + { + "epoch": 1.6329015360656898, + "grad_norm": 9.899222880987221, + "learning_rate": 4.286391302314233e-07, + "loss": 1.0573, + "step": 11534 + }, + { + "epoch": 1.633043108940327, + "grad_norm": 9.564880872543945, + "learning_rate": 4.2831823675563324e-07, + "loss": 0.8901, + "step": 11535 + }, + { + "epoch": 
1.6331846818149642, + "grad_norm": 11.395161964032079, + "learning_rate": 4.279974521866093e-07, + "loss": 1.1026, + "step": 11536 + }, + { + "epoch": 1.6333262546896015, + "grad_norm": 8.62727834491417, + "learning_rate": 4.2767677654121375e-07, + "loss": 0.932, + "step": 11537 + }, + { + "epoch": 1.6334678275642387, + "grad_norm": 8.779311865601773, + "learning_rate": 4.2735620983630543e-07, + "loss": 0.9844, + "step": 11538 + }, + { + "epoch": 1.633609400438876, + "grad_norm": 10.601274277009662, + "learning_rate": 4.2703575208873585e-07, + "loss": 0.9738, + "step": 11539 + }, + { + "epoch": 1.6337509733135132, + "grad_norm": 9.661370854697184, + "learning_rate": 4.267154033153503e-07, + "loss": 0.9605, + "step": 11540 + }, + { + "epoch": 1.6338925461881504, + "grad_norm": 9.394789903533924, + "learning_rate": 4.26395163532991e-07, + "loss": 1.1011, + "step": 11541 + }, + { + "epoch": 1.6340341190627876, + "grad_norm": 7.960904197136549, + "learning_rate": 4.2607503275849116e-07, + "loss": 1.0098, + "step": 11542 + }, + { + "epoch": 1.6341756919374246, + "grad_norm": 9.079834243673071, + "learning_rate": 4.2575501100868085e-07, + "loss": 1.0159, + "step": 11543 + }, + { + "epoch": 1.6343172648120619, + "grad_norm": 9.548477962297676, + "learning_rate": 4.2543509830038243e-07, + "loss": 0.9385, + "step": 11544 + }, + { + "epoch": 1.634458837686699, + "grad_norm": 9.33326076297703, + "learning_rate": 4.2511529465041417e-07, + "loss": 0.9722, + "step": 11545 + }, + { + "epoch": 1.6346004105613363, + "grad_norm": 7.76870807565807, + "learning_rate": 4.2479560007558845e-07, + "loss": 0.9041, + "step": 11546 + }, + { + "epoch": 1.6347419834359735, + "grad_norm": 8.215709283337642, + "learning_rate": 4.2447601459270987e-07, + "loss": 0.8449, + "step": 11547 + }, + { + "epoch": 1.6348835563106108, + "grad_norm": 10.05191260210957, + "learning_rate": 4.241565382185808e-07, + "loss": 1.0691, + "step": 11548 + }, + { + "epoch": 1.635025129185248, + "grad_norm": 9.54347280763038, + "learning_rate": 4.238371709699937e-07, + "loss": 1.1002, + "step": 11549 + }, + { + "epoch": 1.6351667020598852, + "grad_norm": 11.58945930648197, + "learning_rate": 4.2351791286373847e-07, + "loss": 1.0213, + "step": 11550 + }, + { + "epoch": 1.6353082749345225, + "grad_norm": 7.974556516555776, + "learning_rate": 4.231987639165988e-07, + "loss": 0.9436, + "step": 11551 + }, + { + "epoch": 1.6354498478091597, + "grad_norm": 9.734138694087045, + "learning_rate": 4.2287972414535084e-07, + "loss": 0.8606, + "step": 11552 + }, + { + "epoch": 1.635591420683797, + "grad_norm": 8.697380320445724, + "learning_rate": 4.2256079356676776e-07, + "loss": 0.9555, + "step": 11553 + }, + { + "epoch": 1.6357329935584342, + "grad_norm": 9.583481453013508, + "learning_rate": 4.222419721976143e-07, + "loss": 0.9515, + "step": 11554 + }, + { + "epoch": 1.6358745664330714, + "grad_norm": 10.580747113160594, + "learning_rate": 4.2192326005465134e-07, + "loss": 1.0036, + "step": 11555 + }, + { + "epoch": 1.6360161393077086, + "grad_norm": 8.78975691701177, + "learning_rate": 4.216046571546328e-07, + "loss": 0.9879, + "step": 11556 + }, + { + "epoch": 1.6361577121823458, + "grad_norm": 10.539547874409635, + "learning_rate": 4.212861635143084e-07, + "loss": 0.9782, + "step": 11557 + }, + { + "epoch": 1.636299285056983, + "grad_norm": 10.659936172265637, + "learning_rate": 4.2096777915041964e-07, + "loss": 1.0941, + "step": 11558 + }, + { + "epoch": 1.6364408579316203, + "grad_norm": 8.7957765154236, + "learning_rate": 4.206495040797051e-07, + 
"loss": 1.09, + "step": 11559 + }, + { + "epoch": 1.6365824308062575, + "grad_norm": 9.141252649391111, + "learning_rate": 4.203313383188959e-07, + "loss": 1.0111, + "step": 11560 + }, + { + "epoch": 1.6367240036808948, + "grad_norm": 11.19092460514619, + "learning_rate": 4.200132818847169e-07, + "loss": 0.8976, + "step": 11561 + }, + { + "epoch": 1.636865576555532, + "grad_norm": 9.053854139551477, + "learning_rate": 4.1969533479388925e-07, + "loss": 1.0852, + "step": 11562 + }, + { + "epoch": 1.6370071494301692, + "grad_norm": 10.16565069740066, + "learning_rate": 4.193774970631262e-07, + "loss": 0.9579, + "step": 11563 + }, + { + "epoch": 1.6371487223048065, + "grad_norm": 9.14807305395983, + "learning_rate": 4.1905976870913747e-07, + "loss": 0.9838, + "step": 11564 + }, + { + "epoch": 1.6372902951794437, + "grad_norm": 8.403408775772744, + "learning_rate": 4.1874214974862436e-07, + "loss": 0.9894, + "step": 11565 + }, + { + "epoch": 1.637431868054081, + "grad_norm": 9.767178001515138, + "learning_rate": 4.1842464019828444e-07, + "loss": 0.8983, + "step": 11566 + }, + { + "epoch": 1.6375734409287181, + "grad_norm": 10.677156853151352, + "learning_rate": 4.1810724007480987e-07, + "loss": 0.9388, + "step": 11567 + }, + { + "epoch": 1.6377150138033554, + "grad_norm": 9.577426007452246, + "learning_rate": 4.1778994939488476e-07, + "loss": 0.98, + "step": 11568 + }, + { + "epoch": 1.6378565866779926, + "grad_norm": 8.688252148898076, + "learning_rate": 4.174727681751906e-07, + "loss": 1.0108, + "step": 11569 + }, + { + "epoch": 1.6379981595526298, + "grad_norm": 8.881540386867286, + "learning_rate": 4.1715569643239916e-07, + "loss": 0.8609, + "step": 11570 + }, + { + "epoch": 1.638139732427267, + "grad_norm": 8.109498245390537, + "learning_rate": 4.1683873418318007e-07, + "loss": 0.9006, + "step": 11571 + }, + { + "epoch": 1.6382813053019043, + "grad_norm": 9.50619706258888, + "learning_rate": 4.1652188144419516e-07, + "loss": 0.9928, + "step": 11572 + }, + { + "epoch": 1.6384228781765415, + "grad_norm": 8.73434224404077, + "learning_rate": 4.1620513823210115e-07, + "loss": 0.8444, + "step": 11573 + }, + { + "epoch": 1.6385644510511785, + "grad_norm": 9.126183002029022, + "learning_rate": 4.1588850456354995e-07, + "loss": 0.9435, + "step": 11574 + }, + { + "epoch": 1.6387060239258158, + "grad_norm": 9.298865661353036, + "learning_rate": 4.1557198045518554e-07, + "loss": 0.9176, + "step": 11575 + }, + { + "epoch": 1.638847596800453, + "grad_norm": 9.451054218566581, + "learning_rate": 4.152555659236485e-07, + "loss": 0.9513, + "step": 11576 + }, + { + "epoch": 1.6389891696750902, + "grad_norm": 9.132972056722126, + "learning_rate": 4.1493926098557127e-07, + "loss": 0.8884, + "step": 11577 + }, + { + "epoch": 1.6391307425497275, + "grad_norm": 8.264688074557114, + "learning_rate": 4.146230656575831e-07, + "loss": 0.8631, + "step": 11578 + }, + { + "epoch": 1.6392723154243647, + "grad_norm": 10.203797953496592, + "learning_rate": 4.1430697995630486e-07, + "loss": 1.0625, + "step": 11579 + }, + { + "epoch": 1.639413888299002, + "grad_norm": 9.344951845782834, + "learning_rate": 4.139910038983541e-07, + "loss": 0.958, + "step": 11580 + }, + { + "epoch": 1.6395554611736392, + "grad_norm": 8.84428873207278, + "learning_rate": 4.136751375003406e-07, + "loss": 0.9541, + "step": 11581 + }, + { + "epoch": 1.6396970340482764, + "grad_norm": 8.835416824015807, + "learning_rate": 4.133593807788691e-07, + "loss": 0.9606, + "step": 11582 + }, + { + "epoch": 1.6398386069229136, + "grad_norm": 
8.09323389477212, + "learning_rate": 4.1304373375053995e-07, + "loss": 0.805, + "step": 11583 + }, + { + "epoch": 1.6399801797975508, + "grad_norm": 9.247755860983544, + "learning_rate": 4.127281964319446e-07, + "loss": 0.8859, + "step": 11584 + }, + { + "epoch": 1.6401217526721878, + "grad_norm": 9.081806112800901, + "learning_rate": 4.1241276883967256e-07, + "loss": 0.9844, + "step": 11585 + }, + { + "epoch": 1.640263325546825, + "grad_norm": 8.68935325121015, + "learning_rate": 4.120974509903039e-07, + "loss": 0.993, + "step": 11586 + }, + { + "epoch": 1.6404048984214623, + "grad_norm": 10.103707235581588, + "learning_rate": 4.117822429004159e-07, + "loss": 0.9368, + "step": 11587 + }, + { + "epoch": 1.6405464712960995, + "grad_norm": 8.321608522154602, + "learning_rate": 4.114671445865781e-07, + "loss": 0.9352, + "step": 11588 + }, + { + "epoch": 1.6406880441707368, + "grad_norm": 9.95873252249451, + "learning_rate": 4.11152156065355e-07, + "loss": 1.0667, + "step": 11589 + }, + { + "epoch": 1.640829617045374, + "grad_norm": 8.61970246759613, + "learning_rate": 4.1083727735330677e-07, + "loss": 0.9291, + "step": 11590 + }, + { + "epoch": 1.6409711899200112, + "grad_norm": 9.458713563062707, + "learning_rate": 4.105225084669839e-07, + "loss": 0.9772, + "step": 11591 + }, + { + "epoch": 1.6411127627946485, + "grad_norm": 7.615234375, + "learning_rate": 4.1020784942293557e-07, + "loss": 0.8925, + "step": 11592 + }, + { + "epoch": 1.6412543356692857, + "grad_norm": 8.904904555034678, + "learning_rate": 4.0989330023770146e-07, + "loss": 1.0476, + "step": 11593 + }, + { + "epoch": 1.641395908543923, + "grad_norm": 10.816011244593085, + "learning_rate": 4.0957886092781897e-07, + "loss": 1.0337, + "step": 11594 + }, + { + "epoch": 1.6415374814185602, + "grad_norm": 11.89716142410331, + "learning_rate": 4.092645315098165e-07, + "loss": 1.1063, + "step": 11595 + }, + { + "epoch": 1.6416790542931974, + "grad_norm": 10.279231003672185, + "learning_rate": 4.0895031200021836e-07, + "loss": 0.9769, + "step": 11596 + }, + { + "epoch": 1.6418206271678346, + "grad_norm": 10.936800340344991, + "learning_rate": 4.0863620241554407e-07, + "loss": 0.9895, + "step": 11597 + }, + { + "epoch": 1.6419622000424718, + "grad_norm": 8.501859573701362, + "learning_rate": 4.0832220277230467e-07, + "loss": 1.0537, + "step": 11598 + }, + { + "epoch": 1.642103772917109, + "grad_norm": 10.540891314754775, + "learning_rate": 4.0800831308700773e-07, + "loss": 0.9357, + "step": 11599 + }, + { + "epoch": 1.6422453457917463, + "grad_norm": 7.909820835867117, + "learning_rate": 4.0769453337615367e-07, + "loss": 0.9581, + "step": 11600 + }, + { + "epoch": 1.6423869186663835, + "grad_norm": 10.32946454709003, + "learning_rate": 4.073808636562382e-07, + "loss": 1.0356, + "step": 11601 + }, + { + "epoch": 1.6425284915410208, + "grad_norm": 9.52808164872579, + "learning_rate": 4.070673039437506e-07, + "loss": 0.9089, + "step": 11602 + }, + { + "epoch": 1.642670064415658, + "grad_norm": 8.976209774673569, + "learning_rate": 4.0675385425517356e-07, + "loss": 0.9564, + "step": 11603 + }, + { + "epoch": 1.6428116372902952, + "grad_norm": 10.59434125032417, + "learning_rate": 4.0644051460698634e-07, + "loss": 0.9773, + "step": 11604 + }, + { + "epoch": 1.6429532101649325, + "grad_norm": 11.000929706472007, + "learning_rate": 4.0612728501565973e-07, + "loss": 0.973, + "step": 11605 + }, + { + "epoch": 1.6430947830395697, + "grad_norm": 10.296449319610552, + "learning_rate": 4.058141654976608e-07, + "loss": 0.9689, + "step": 11606 + }, + { 
+ "epoch": 1.643236355914207, + "grad_norm": 10.412739697613395, + "learning_rate": 4.055011560694494e-07, + "loss": 0.9375, + "step": 11607 + }, + { + "epoch": 1.6433779287888441, + "grad_norm": 8.980002332657339, + "learning_rate": 4.0518825674748076e-07, + "loss": 1.0587, + "step": 11608 + }, + { + "epoch": 1.6435195016634814, + "grad_norm": 10.120878911507946, + "learning_rate": 4.0487546754820304e-07, + "loss": 0.9679, + "step": 11609 + }, + { + "epoch": 1.6436610745381186, + "grad_norm": 11.652411189253385, + "learning_rate": 4.0456278848806067e-07, + "loss": 1.0362, + "step": 11610 + }, + { + "epoch": 1.6438026474127558, + "grad_norm": 9.411394607055165, + "learning_rate": 4.042502195834891e-07, + "loss": 0.9734, + "step": 11611 + }, + { + "epoch": 1.643944220287393, + "grad_norm": 8.947074829811914, + "learning_rate": 4.039377608509218e-07, + "loss": 0.905, + "step": 11612 + }, + { + "epoch": 1.6440857931620303, + "grad_norm": 8.98279537744063, + "learning_rate": 4.0362541230678316e-07, + "loss": 1.0749, + "step": 11613 + }, + { + "epoch": 1.6442273660366675, + "grad_norm": 14.14631705189773, + "learning_rate": 4.033131739674931e-07, + "loss": 0.9214, + "step": 11614 + }, + { + "epoch": 1.6443689389113048, + "grad_norm": 8.669134913385786, + "learning_rate": 4.0300104584946655e-07, + "loss": 0.9345, + "step": 11615 + }, + { + "epoch": 1.6445105117859418, + "grad_norm": 9.326536110361182, + "learning_rate": 4.026890279691109e-07, + "loss": 1.035, + "step": 11616 + }, + { + "epoch": 1.644652084660579, + "grad_norm": 10.599539376093263, + "learning_rate": 4.0237712034283004e-07, + "loss": 1.0227, + "step": 11617 + }, + { + "epoch": 1.6447936575352162, + "grad_norm": 10.475857181118842, + "learning_rate": 4.020653229870192e-07, + "loss": 0.8981, + "step": 11618 + }, + { + "epoch": 1.6449352304098535, + "grad_norm": 9.482332364729436, + "learning_rate": 4.0175363591806985e-07, + "loss": 1.026, + "step": 11619 + }, + { + "epoch": 1.6450768032844907, + "grad_norm": 9.703882553333315, + "learning_rate": 4.0144205915236797e-07, + "loss": 1.0134, + "step": 11620 + }, + { + "epoch": 1.645218376159128, + "grad_norm": 10.542076454691575, + "learning_rate": 4.0113059270629193e-07, + "loss": 1.0175, + "step": 11621 + }, + { + "epoch": 1.6453599490337651, + "grad_norm": 10.116560067417, + "learning_rate": 4.008192365962166e-07, + "loss": 1.0363, + "step": 11622 + }, + { + "epoch": 1.6455015219084024, + "grad_norm": 9.93474113507234, + "learning_rate": 4.0050799083850787e-07, + "loss": 0.9697, + "step": 11623 + }, + { + "epoch": 1.6456430947830396, + "grad_norm": 9.52131229812456, + "learning_rate": 4.0019685544952835e-07, + "loss": 0.9616, + "step": 11624 + }, + { + "epoch": 1.6457846676576768, + "grad_norm": 11.080934843432413, + "learning_rate": 3.998858304456352e-07, + "loss": 1.0026, + "step": 11625 + }, + { + "epoch": 1.6459262405323138, + "grad_norm": 9.6003609907307, + "learning_rate": 3.995749158431772e-07, + "loss": 0.9233, + "step": 11626 + }, + { + "epoch": 1.646067813406951, + "grad_norm": 8.588203110430937, + "learning_rate": 3.9926411165850054e-07, + "loss": 1.0555, + "step": 11627 + }, + { + "epoch": 1.6462093862815883, + "grad_norm": 8.91255687465417, + "learning_rate": 3.989534179079427e-07, + "loss": 0.9637, + "step": 11628 + }, + { + "epoch": 1.6463509591562255, + "grad_norm": 10.752083155151515, + "learning_rate": 3.986428346078375e-07, + "loss": 1.0052, + "step": 11629 + }, + { + "epoch": 1.6464925320308628, + "grad_norm": 7.467017830810673, + "learning_rate": 
3.983323617745111e-07, + "loss": 0.9327, + "step": 11630 + }, + { + "epoch": 1.6466341049055, + "grad_norm": 8.814458101940208, + "learning_rate": 3.980219994242859e-07, + "loss": 0.8851, + "step": 11631 + }, + { + "epoch": 1.6467756777801372, + "grad_norm": 10.377646235145807, + "learning_rate": 3.9771174757347626e-07, + "loss": 1.0057, + "step": 11632 + }, + { + "epoch": 1.6469172506547745, + "grad_norm": 10.576352758437023, + "learning_rate": 3.9740160623839314e-07, + "loss": 1.0271, + "step": 11633 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 9.078375860480781, + "learning_rate": 3.9709157543533996e-07, + "loss": 1.0319, + "step": 11634 + }, + { + "epoch": 1.647200396404049, + "grad_norm": 7.27365384051581, + "learning_rate": 3.967816551806139e-07, + "loss": 1.0377, + "step": 11635 + }, + { + "epoch": 1.6473419692786861, + "grad_norm": 8.114111074107207, + "learning_rate": 3.9647184549050865e-07, + "loss": 0.8514, + "step": 11636 + }, + { + "epoch": 1.6474835421533234, + "grad_norm": 9.357098386902212, + "learning_rate": 3.9616214638130953e-07, + "loss": 1.0231, + "step": 11637 + }, + { + "epoch": 1.6476251150279606, + "grad_norm": 9.703203627902036, + "learning_rate": 3.9585255786929816e-07, + "loss": 0.9448, + "step": 11638 + }, + { + "epoch": 1.6477666879025978, + "grad_norm": 9.964971609577596, + "learning_rate": 3.9554307997074826e-07, + "loss": 1.0127, + "step": 11639 + }, + { + "epoch": 1.647908260777235, + "grad_norm": 9.276171389741323, + "learning_rate": 3.952337127019301e-07, + "loss": 0.9524, + "step": 11640 + }, + { + "epoch": 1.6480498336518723, + "grad_norm": 9.26037675457747, + "learning_rate": 3.9492445607910574e-07, + "loss": 0.9289, + "step": 11641 + }, + { + "epoch": 1.6481914065265095, + "grad_norm": 10.195049460806985, + "learning_rate": 3.946153101185332e-07, + "loss": 1.1107, + "step": 11642 + }, + { + "epoch": 1.6483329794011468, + "grad_norm": 8.714160873626634, + "learning_rate": 3.943062748364651e-07, + "loss": 0.937, + "step": 11643 + }, + { + "epoch": 1.648474552275784, + "grad_norm": 9.753829204116013, + "learning_rate": 3.939973502491448e-07, + "loss": 1.0663, + "step": 11644 + }, + { + "epoch": 1.6486161251504212, + "grad_norm": 8.270744829320893, + "learning_rate": 3.9368853637281404e-07, + "loss": 0.9387, + "step": 11645 + }, + { + "epoch": 1.6487576980250584, + "grad_norm": 9.090649892406406, + "learning_rate": 3.93379833223706e-07, + "loss": 0.9779, + "step": 11646 + }, + { + "epoch": 1.6488992708996957, + "grad_norm": 9.759595404196492, + "learning_rate": 3.9307124081804924e-07, + "loss": 1.0579, + "step": 11647 + }, + { + "epoch": 1.649040843774333, + "grad_norm": 9.442160964870459, + "learning_rate": 3.92762759172067e-07, + "loss": 0.963, + "step": 11648 + }, + { + "epoch": 1.6491824166489701, + "grad_norm": 9.2332083389507, + "learning_rate": 3.9245438830197464e-07, + "loss": 0.9885, + "step": 11649 + }, + { + "epoch": 1.6493239895236074, + "grad_norm": 9.48551318206603, + "learning_rate": 3.9214612822398443e-07, + "loss": 1.0123, + "step": 11650 + }, + { + "epoch": 1.6494655623982446, + "grad_norm": 7.52971256756177, + "learning_rate": 3.9183797895429973e-07, + "loss": 0.8872, + "step": 11651 + }, + { + "epoch": 1.6496071352728818, + "grad_norm": 9.372001066565199, + "learning_rate": 3.9152994050912134e-07, + "loss": 0.9581, + "step": 11652 + }, + { + "epoch": 1.649748708147519, + "grad_norm": 10.593098578215582, + "learning_rate": 3.9122201290464095e-07, + "loss": 1.0242, + "step": 11653 + }, + { + "epoch": 1.6498902810221563, + 
"grad_norm": 8.677590845721074, + "learning_rate": 3.909141961570478e-07, + "loss": 0.9353, + "step": 11654 + }, + { + "epoch": 1.6500318538967935, + "grad_norm": 8.67052882287807, + "learning_rate": 3.9060649028252265e-07, + "loss": 0.8648, + "step": 11655 + }, + { + "epoch": 1.6501734267714308, + "grad_norm": 9.363742834497618, + "learning_rate": 3.9029889529724113e-07, + "loss": 0.9038, + "step": 11656 + }, + { + "epoch": 1.6503149996460678, + "grad_norm": 10.481659629974292, + "learning_rate": 3.899914112173739e-07, + "loss": 1.0738, + "step": 11657 + }, + { + "epoch": 1.650456572520705, + "grad_norm": 10.052963002245244, + "learning_rate": 3.896840380590844e-07, + "loss": 0.9978, + "step": 11658 + }, + { + "epoch": 1.6505981453953422, + "grad_norm": 9.51217553767132, + "learning_rate": 3.8937677583853224e-07, + "loss": 0.9839, + "step": 11659 + }, + { + "epoch": 1.6507397182699795, + "grad_norm": 7.943286138477851, + "learning_rate": 3.890696245718686e-07, + "loss": 0.9163, + "step": 11660 + }, + { + "epoch": 1.6508812911446167, + "grad_norm": 9.283142924339307, + "learning_rate": 3.887625842752413e-07, + "loss": 0.8638, + "step": 11661 + }, + { + "epoch": 1.651022864019254, + "grad_norm": 9.453737551176436, + "learning_rate": 3.8845565496479026e-07, + "loss": 0.9768, + "step": 11662 + }, + { + "epoch": 1.6511644368938911, + "grad_norm": 10.693761229606721, + "learning_rate": 3.8814883665665076e-07, + "loss": 0.9668, + "step": 11663 + }, + { + "epoch": 1.6513060097685284, + "grad_norm": 9.229054010419816, + "learning_rate": 3.878421293669532e-07, + "loss": 1.082, + "step": 11664 + }, + { + "epoch": 1.6514475826431656, + "grad_norm": 9.621221023442585, + "learning_rate": 3.8753553311181966e-07, + "loss": 1.0131, + "step": 11665 + }, + { + "epoch": 1.6515891555178028, + "grad_norm": 10.261872649445495, + "learning_rate": 3.8722904790736815e-07, + "loss": 1.0331, + "step": 11666 + }, + { + "epoch": 1.65173072839244, + "grad_norm": 10.050399423130065, + "learning_rate": 3.869226737697099e-07, + "loss": 0.9399, + "step": 11667 + }, + { + "epoch": 1.651872301267077, + "grad_norm": 8.437630659433934, + "learning_rate": 3.8661641071495145e-07, + "loss": 0.9281, + "step": 11668 + }, + { + "epoch": 1.6520138741417143, + "grad_norm": 8.254988116065748, + "learning_rate": 3.863102587591919e-07, + "loss": 0.9557, + "step": 11669 + }, + { + "epoch": 1.6521554470163515, + "grad_norm": 8.695376989737268, + "learning_rate": 3.860042179185261e-07, + "loss": 0.987, + "step": 11670 + }, + { + "epoch": 1.6522970198909888, + "grad_norm": 8.345280996154962, + "learning_rate": 3.8569828820904265e-07, + "loss": 0.9441, + "step": 11671 + }, + { + "epoch": 1.652438592765626, + "grad_norm": 9.77012123056123, + "learning_rate": 3.8539246964682336e-07, + "loss": 0.9748, + "step": 11672 + }, + { + "epoch": 1.6525801656402632, + "grad_norm": 8.815078249977288, + "learning_rate": 3.850867622479457e-07, + "loss": 0.9888, + "step": 11673 + }, + { + "epoch": 1.6527217385149005, + "grad_norm": 9.22728476414662, + "learning_rate": 3.847811660284795e-07, + "loss": 0.9244, + "step": 11674 + }, + { + "epoch": 1.6528633113895377, + "grad_norm": 9.72975286532564, + "learning_rate": 3.844756810044914e-07, + "loss": 0.9943, + "step": 11675 + }, + { + "epoch": 1.653004884264175, + "grad_norm": 9.470685603652074, + "learning_rate": 3.841703071920383e-07, + "loss": 0.9572, + "step": 11676 + }, + { + "epoch": 1.6531464571388121, + "grad_norm": 8.333353830948099, + "learning_rate": 3.8386504460717426e-07, + "loss": 0.9158, + "step": 
11677 + }, + { + "epoch": 1.6532880300134494, + "grad_norm": 9.215157122294455, + "learning_rate": 3.835598932659476e-07, + "loss": 0.9195, + "step": 11678 + }, + { + "epoch": 1.6534296028880866, + "grad_norm": 10.989882237616547, + "learning_rate": 3.8325485318439883e-07, + "loss": 1.1058, + "step": 11679 + }, + { + "epoch": 1.6535711757627238, + "grad_norm": 8.56531383752247, + "learning_rate": 3.829499243785645e-07, + "loss": 1.0172, + "step": 11680 + }, + { + "epoch": 1.653712748637361, + "grad_norm": 10.054908684800075, + "learning_rate": 3.8264510686447376e-07, + "loss": 1.076, + "step": 11681 + }, + { + "epoch": 1.6538543215119983, + "grad_norm": 8.656717996994715, + "learning_rate": 3.823404006581513e-07, + "loss": 1.1387, + "step": 11682 + }, + { + "epoch": 1.6539958943866355, + "grad_norm": 9.928721263103832, + "learning_rate": 3.820358057756146e-07, + "loss": 0.9767, + "step": 11683 + }, + { + "epoch": 1.6541374672612728, + "grad_norm": 9.105581445704358, + "learning_rate": 3.8173132223287693e-07, + "loss": 0.9741, + "step": 11684 + }, + { + "epoch": 1.65427904013591, + "grad_norm": 9.912507881459666, + "learning_rate": 3.814269500459436e-07, + "loss": 0.8863, + "step": 11685 + }, + { + "epoch": 1.6544206130105472, + "grad_norm": 9.847627197125925, + "learning_rate": 3.8112268923081645e-07, + "loss": 0.8754, + "step": 11686 + }, + { + "epoch": 1.6545621858851844, + "grad_norm": 9.488793841006817, + "learning_rate": 3.808185398034897e-07, + "loss": 1.0081, + "step": 11687 + }, + { + "epoch": 1.6547037587598217, + "grad_norm": 9.010872948767402, + "learning_rate": 3.8051450177995136e-07, + "loss": 0.89, + "step": 11688 + }, + { + "epoch": 1.654845331634459, + "grad_norm": 8.739953513294608, + "learning_rate": 3.802105751761859e-07, + "loss": 0.9237, + "step": 11689 + }, + { + "epoch": 1.6549869045090961, + "grad_norm": 11.258259093978449, + "learning_rate": 3.799067600081696e-07, + "loss": 0.9712, + "step": 11690 + }, + { + "epoch": 1.6551284773837334, + "grad_norm": 8.093873483312374, + "learning_rate": 3.7960305629187454e-07, + "loss": 0.9498, + "step": 11691 + }, + { + "epoch": 1.6552700502583706, + "grad_norm": 8.970968257907003, + "learning_rate": 3.792994640432651e-07, + "loss": 1.0105, + "step": 11692 + }, + { + "epoch": 1.6554116231330078, + "grad_norm": 8.003020669956438, + "learning_rate": 3.789959832783016e-07, + "loss": 0.9366, + "step": 11693 + }, + { + "epoch": 1.655553196007645, + "grad_norm": 9.412500275590343, + "learning_rate": 3.786926140129385e-07, + "loss": 1.0552, + "step": 11694 + }, + { + "epoch": 1.6556947688822823, + "grad_norm": 9.5343128723701, + "learning_rate": 3.7838935626312246e-07, + "loss": 0.8943, + "step": 11695 + }, + { + "epoch": 1.6558363417569195, + "grad_norm": 10.286814751868423, + "learning_rate": 3.780862100447971e-07, + "loss": 1.0287, + "step": 11696 + }, + { + "epoch": 1.6559779146315567, + "grad_norm": 10.419785869250632, + "learning_rate": 3.7778317537389613e-07, + "loss": 1.1071, + "step": 11697 + }, + { + "epoch": 1.656119487506194, + "grad_norm": 12.675113137246239, + "learning_rate": 3.774802522663515e-07, + "loss": 0.9939, + "step": 11698 + }, + { + "epoch": 1.656261060380831, + "grad_norm": 8.122240742414274, + "learning_rate": 3.771774407380879e-07, + "loss": 0.981, + "step": 11699 + }, + { + "epoch": 1.6564026332554682, + "grad_norm": 9.360995399169965, + "learning_rate": 3.768747408050227e-07, + "loss": 0.9692, + "step": 11700 + }, + { + "epoch": 1.6565442061301054, + "grad_norm": 8.887222255653716, + "learning_rate": 
3.765721524830701e-07, + "loss": 0.9145, + "step": 11701 + }, + { + "epoch": 1.6566857790047427, + "grad_norm": 9.816707025087702, + "learning_rate": 3.762696757881354e-07, + "loss": 0.9241, + "step": 11702 + }, + { + "epoch": 1.65682735187938, + "grad_norm": 8.703074155484336, + "learning_rate": 3.7596731073612085e-07, + "loss": 0.9322, + "step": 11703 + }, + { + "epoch": 1.6569689247540171, + "grad_norm": 9.724535823619721, + "learning_rate": 3.756650573429205e-07, + "loss": 0.9802, + "step": 11704 + }, + { + "epoch": 1.6571104976286544, + "grad_norm": 9.802769981816683, + "learning_rate": 3.7536291562442483e-07, + "loss": 0.9023, + "step": 11705 + }, + { + "epoch": 1.6572520705032916, + "grad_norm": 8.75918837806794, + "learning_rate": 3.750608855965157e-07, + "loss": 0.9733, + "step": 11706 + }, + { + "epoch": 1.6573936433779288, + "grad_norm": 9.235208593139035, + "learning_rate": 3.747589672750723e-07, + "loss": 0.9676, + "step": 11707 + }, + { + "epoch": 1.657535216252566, + "grad_norm": 10.460917078956916, + "learning_rate": 3.7445716067596506e-07, + "loss": 0.9133, + "step": 11708 + }, + { + "epoch": 1.657676789127203, + "grad_norm": 8.74570250562495, + "learning_rate": 3.7415546581505954e-07, + "loss": 0.9008, + "step": 11709 + }, + { + "epoch": 1.6578183620018403, + "grad_norm": 8.102692483877274, + "learning_rate": 3.7385388270821666e-07, + "loss": 1.0909, + "step": 11710 + }, + { + "epoch": 1.6579599348764775, + "grad_norm": 8.236062038104606, + "learning_rate": 3.735524113712891e-07, + "loss": 1.02, + "step": 11711 + }, + { + "epoch": 1.6581015077511148, + "grad_norm": 12.679485928195998, + "learning_rate": 3.7325105182012656e-07, + "loss": 0.9559, + "step": 11712 + }, + { + "epoch": 1.658243080625752, + "grad_norm": 9.680285714717035, + "learning_rate": 3.729498040705698e-07, + "loss": 0.9647, + "step": 11713 + }, + { + "epoch": 1.6583846535003892, + "grad_norm": 8.052803773879347, + "learning_rate": 3.726486681384564e-07, + "loss": 0.9541, + "step": 11714 + }, + { + "epoch": 1.6585262263750264, + "grad_norm": 7.454212805098951, + "learning_rate": 3.723476440396157e-07, + "loss": 0.8907, + "step": 11715 + }, + { + "epoch": 1.6586677992496637, + "grad_norm": 10.998951948696138, + "learning_rate": 3.7204673178987294e-07, + "loss": 1.0297, + "step": 11716 + }, + { + "epoch": 1.658809372124301, + "grad_norm": 8.884164296327517, + "learning_rate": 3.717459314050473e-07, + "loss": 0.8443, + "step": 11717 + }, + { + "epoch": 1.6589509449989381, + "grad_norm": 11.04869986087606, + "learning_rate": 3.714452429009513e-07, + "loss": 1.1421, + "step": 11718 + }, + { + "epoch": 1.6590925178735754, + "grad_norm": 9.638541072124113, + "learning_rate": 3.711446662933915e-07, + "loss": 1.0049, + "step": 11719 + }, + { + "epoch": 1.6592340907482126, + "grad_norm": 9.513624760905204, + "learning_rate": 3.708442015981689e-07, + "loss": 0.9283, + "step": 11720 + }, + { + "epoch": 1.6593756636228498, + "grad_norm": 9.16625305167407, + "learning_rate": 3.705438488310792e-07, + "loss": 0.8862, + "step": 11721 + }, + { + "epoch": 1.659517236497487, + "grad_norm": 9.148985392699036, + "learning_rate": 3.7024360800791195e-07, + "loss": 0.907, + "step": 11722 + }, + { + "epoch": 1.6596588093721243, + "grad_norm": 10.858618313340086, + "learning_rate": 3.699434791444495e-07, + "loss": 1.1472, + "step": 11723 + }, + { + "epoch": 1.6598003822467615, + "grad_norm": 11.103263966200434, + "learning_rate": 3.6964346225647097e-07, + "loss": 1.0079, + "step": 11724 + }, + { + "epoch": 1.6599419551213987, + 
"grad_norm": 10.84258446489664, + "learning_rate": 3.6934355735974647e-07, + "loss": 0.9794, + "step": 11725 + }, + { + "epoch": 1.660083527996036, + "grad_norm": 8.480609542964164, + "learning_rate": 3.690437644700431e-07, + "loss": 0.882, + "step": 11726 + }, + { + "epoch": 1.6602251008706732, + "grad_norm": 10.023330176500645, + "learning_rate": 3.687440836031195e-07, + "loss": 0.9927, + "step": 11727 + }, + { + "epoch": 1.6603666737453104, + "grad_norm": 9.30161791438432, + "learning_rate": 3.684445147747309e-07, + "loss": 0.9795, + "step": 11728 + }, + { + "epoch": 1.6605082466199477, + "grad_norm": 8.261616706668867, + "learning_rate": 3.681450580006246e-07, + "loss": 0.8965, + "step": 11729 + }, + { + "epoch": 1.660649819494585, + "grad_norm": 8.205975137307057, + "learning_rate": 3.6784571329654265e-07, + "loss": 0.8972, + "step": 11730 + }, + { + "epoch": 1.6607913923692221, + "grad_norm": 8.507657808941481, + "learning_rate": 3.675464806782222e-07, + "loss": 1.0148, + "step": 11731 + }, + { + "epoch": 1.6609329652438594, + "grad_norm": 8.450299536457827, + "learning_rate": 3.6724736016139293e-07, + "loss": 0.9593, + "step": 11732 + }, + { + "epoch": 1.6610745381184966, + "grad_norm": 8.536073320297348, + "learning_rate": 3.6694835176178e-07, + "loss": 1.0222, + "step": 11733 + }, + { + "epoch": 1.6612161109931338, + "grad_norm": 9.20541565760065, + "learning_rate": 3.666494554951014e-07, + "loss": 1.0516, + "step": 11734 + }, + { + "epoch": 1.661357683867771, + "grad_norm": 10.303812332529372, + "learning_rate": 3.6635067137707063e-07, + "loss": 1.0102, + "step": 11735 + }, + { + "epoch": 1.6614992567424083, + "grad_norm": 9.334597592785576, + "learning_rate": 3.660519994233935e-07, + "loss": 0.969, + "step": 11736 + }, + { + "epoch": 1.6616408296170455, + "grad_norm": 9.44091896627306, + "learning_rate": 3.657534396497725e-07, + "loss": 1.0129, + "step": 11737 + }, + { + "epoch": 1.6617824024916827, + "grad_norm": 10.384851764912161, + "learning_rate": 3.654549920719011e-07, + "loss": 1.0494, + "step": 11738 + }, + { + "epoch": 1.66192397536632, + "grad_norm": 8.798562140471937, + "learning_rate": 3.6515665670546956e-07, + "loss": 0.9682, + "step": 11739 + }, + { + "epoch": 1.662065548240957, + "grad_norm": 8.852484282183651, + "learning_rate": 3.6485843356616093e-07, + "loss": 0.9699, + "step": 11740 + }, + { + "epoch": 1.6622071211155942, + "grad_norm": 9.555789363080503, + "learning_rate": 3.6456032266965173e-07, + "loss": 0.9531, + "step": 11741 + }, + { + "epoch": 1.6623486939902314, + "grad_norm": 9.691112607653404, + "learning_rate": 3.6426232403161484e-07, + "loss": 0.9689, + "step": 11742 + }, + { + "epoch": 1.6624902668648687, + "grad_norm": 9.30383389110329, + "learning_rate": 3.639644376677146e-07, + "loss": 0.9443, + "step": 11743 + }, + { + "epoch": 1.662631839739506, + "grad_norm": 9.738143754670142, + "learning_rate": 3.636666635936112e-07, + "loss": 0.9184, + "step": 11744 + }, + { + "epoch": 1.6627734126141431, + "grad_norm": 8.585297689991958, + "learning_rate": 3.633690018249586e-07, + "loss": 0.8788, + "step": 11745 + }, + { + "epoch": 1.6629149854887804, + "grad_norm": 7.417358823332096, + "learning_rate": 3.6307145237740427e-07, + "loss": 0.9379, + "step": 11746 + }, + { + "epoch": 1.6630565583634176, + "grad_norm": 9.663223661932252, + "learning_rate": 3.6277401526659067e-07, + "loss": 1.0545, + "step": 11747 + }, + { + "epoch": 1.6631981312380548, + "grad_norm": 8.800592263838608, + "learning_rate": 3.624766905081528e-07, + "loss": 0.9599, + "step": 
11748 + }, + { + "epoch": 1.663339704112692, + "grad_norm": 8.103774295778114, + "learning_rate": 3.621794781177229e-07, + "loss": 0.8686, + "step": 11749 + }, + { + "epoch": 1.6634812769873293, + "grad_norm": 9.067293569919608, + "learning_rate": 3.618823781109226e-07, + "loss": 0.8172, + "step": 11750 + }, + { + "epoch": 1.6636228498619663, + "grad_norm": 8.048862011955478, + "learning_rate": 3.6158539050337146e-07, + "loss": 1.0049, + "step": 11751 + }, + { + "epoch": 1.6637644227366035, + "grad_norm": 9.410940629279946, + "learning_rate": 3.6128851531068236e-07, + "loss": 0.966, + "step": 11752 + }, + { + "epoch": 1.6639059956112408, + "grad_norm": 10.14111138868684, + "learning_rate": 3.609917525484608e-07, + "loss": 1.0306, + "step": 11753 + }, + { + "epoch": 1.664047568485878, + "grad_norm": 8.313226883162608, + "learning_rate": 3.6069510223230854e-07, + "loss": 0.9196, + "step": 11754 + }, + { + "epoch": 1.6641891413605152, + "grad_norm": 9.203099715452321, + "learning_rate": 3.603985643778188e-07, + "loss": 1.0526, + "step": 11755 + }, + { + "epoch": 1.6643307142351524, + "grad_norm": 9.2385806142035, + "learning_rate": 3.601021390005821e-07, + "loss": 0.9767, + "step": 11756 + }, + { + "epoch": 1.6644722871097897, + "grad_norm": 8.479596951101215, + "learning_rate": 3.5980582611617966e-07, + "loss": 0.982, + "step": 11757 + }, + { + "epoch": 1.664613859984427, + "grad_norm": 8.416275569276936, + "learning_rate": 3.595096257401895e-07, + "loss": 0.9726, + "step": 11758 + }, + { + "epoch": 1.6647554328590641, + "grad_norm": 8.563547334869059, + "learning_rate": 3.59213537888182e-07, + "loss": 1.017, + "step": 11759 + }, + { + "epoch": 1.6648970057337014, + "grad_norm": 9.206710971704494, + "learning_rate": 3.58917562575723e-07, + "loss": 0.9542, + "step": 11760 + }, + { + "epoch": 1.6650385786083386, + "grad_norm": 10.454632067415814, + "learning_rate": 3.586216998183714e-07, + "loss": 0.9619, + "step": 11761 + }, + { + "epoch": 1.6651801514829758, + "grad_norm": 9.41416988074132, + "learning_rate": 3.583259496316796e-07, + "loss": 0.9729, + "step": 11762 + }, + { + "epoch": 1.665321724357613, + "grad_norm": 8.667397932844063, + "learning_rate": 3.580303120311965e-07, + "loss": 0.9786, + "step": 11763 + }, + { + "epoch": 1.6654632972322503, + "grad_norm": 8.193052316579397, + "learning_rate": 3.5773478703246213e-07, + "loss": 0.9009, + "step": 11764 + }, + { + "epoch": 1.6656048701068875, + "grad_norm": 10.221107141056924, + "learning_rate": 3.5743937465101323e-07, + "loss": 0.9233, + "step": 11765 + }, + { + "epoch": 1.6657464429815247, + "grad_norm": 8.918686492415798, + "learning_rate": 3.571440749023783e-07, + "loss": 0.9677, + "step": 11766 + }, + { + "epoch": 1.665888015856162, + "grad_norm": 8.180001822100673, + "learning_rate": 3.568488878020815e-07, + "loss": 0.9568, + "step": 11767 + }, + { + "epoch": 1.6660295887307992, + "grad_norm": 10.693692025362465, + "learning_rate": 3.5655381336564127e-07, + "loss": 0.9725, + "step": 11768 + }, + { + "epoch": 1.6661711616054364, + "grad_norm": 8.441714442256938, + "learning_rate": 3.562588516085683e-07, + "loss": 0.9594, + "step": 11769 + }, + { + "epoch": 1.6663127344800737, + "grad_norm": 8.492914724928205, + "learning_rate": 3.559640025463704e-07, + "loss": 0.9117, + "step": 11770 + }, + { + "epoch": 1.666454307354711, + "grad_norm": 7.981800119534024, + "learning_rate": 3.556692661945446e-07, + "loss": 0.9668, + "step": 11771 + }, + { + "epoch": 1.6665958802293481, + "grad_norm": 8.435364403487789, + "learning_rate": 
3.553746425685875e-07, + "loss": 0.9428, + "step": 11772 + }, + { + "epoch": 1.6667374531039854, + "grad_norm": 10.16356031979067, + "learning_rate": 3.550801316839858e-07, + "loss": 1.0634, + "step": 11773 + }, + { + "epoch": 1.6668790259786226, + "grad_norm": 9.008777576635632, + "learning_rate": 3.5478573355622213e-07, + "loss": 0.9641, + "step": 11774 + }, + { + "epoch": 1.6670205988532598, + "grad_norm": 9.98134092935648, + "learning_rate": 3.544914482007736e-07, + "loss": 0.9653, + "step": 11775 + }, + { + "epoch": 1.667162171727897, + "grad_norm": 10.130051847816631, + "learning_rate": 3.541972756331091e-07, + "loss": 1.028, + "step": 11776 + }, + { + "epoch": 1.6673037446025343, + "grad_norm": 8.647387212506871, + "learning_rate": 3.5390321586869473e-07, + "loss": 0.9287, + "step": 11777 + }, + { + "epoch": 1.6674453174771715, + "grad_norm": 7.847084096707565, + "learning_rate": 3.5360926892298723e-07, + "loss": 1.0373, + "step": 11778 + }, + { + "epoch": 1.6675868903518087, + "grad_norm": 10.641364924714193, + "learning_rate": 3.5331543481144094e-07, + "loss": 1.0339, + "step": 11779 + }, + { + "epoch": 1.667728463226446, + "grad_norm": 8.126349703850883, + "learning_rate": 3.5302171354950065e-07, + "loss": 0.8772, + "step": 11780 + }, + { + "epoch": 1.6678700361010832, + "grad_norm": 11.108516182411687, + "learning_rate": 3.527281051526088e-07, + "loss": 0.9219, + "step": 11781 + }, + { + "epoch": 1.6680116089757202, + "grad_norm": 10.322464631182406, + "learning_rate": 3.5243460963619944e-07, + "loss": 1.0464, + "step": 11782 + }, + { + "epoch": 1.6681531818503574, + "grad_norm": 11.526090796231337, + "learning_rate": 3.521412270157007e-07, + "loss": 0.8959, + "step": 11783 + }, + { + "epoch": 1.6682947547249947, + "grad_norm": 11.048757864755835, + "learning_rate": 3.518479573065367e-07, + "loss": 1.0008, + "step": 11784 + }, + { + "epoch": 1.668436327599632, + "grad_norm": 10.011374299088278, + "learning_rate": 3.5155480052412344e-07, + "loss": 0.9487, + "step": 11785 + }, + { + "epoch": 1.6685779004742691, + "grad_norm": 9.508640576100705, + "learning_rate": 3.5126175668387275e-07, + "loss": 0.8857, + "step": 11786 + }, + { + "epoch": 1.6687194733489064, + "grad_norm": 8.06180492628255, + "learning_rate": 3.5096882580118866e-07, + "loss": 0.9324, + "step": 11787 + }, + { + "epoch": 1.6688610462235436, + "grad_norm": 9.003364781712342, + "learning_rate": 3.50676007891472e-07, + "loss": 0.8743, + "step": 11788 + }, + { + "epoch": 1.6690026190981808, + "grad_norm": 8.424945448520536, + "learning_rate": 3.50383302970114e-07, + "loss": 1.0064, + "step": 11789 + }, + { + "epoch": 1.669144191972818, + "grad_norm": 9.92671510881432, + "learning_rate": 3.5009071105250314e-07, + "loss": 1.0441, + "step": 11790 + }, + { + "epoch": 1.6692857648474553, + "grad_norm": 9.289647269230342, + "learning_rate": 3.497982321540211e-07, + "loss": 0.9393, + "step": 11791 + }, + { + "epoch": 1.6694273377220923, + "grad_norm": 8.3265766472248, + "learning_rate": 3.495058662900427e-07, + "loss": 1.0154, + "step": 11792 + }, + { + "epoch": 1.6695689105967295, + "grad_norm": 9.007117953492394, + "learning_rate": 3.492136134759377e-07, + "loss": 0.9564, + "step": 11793 + }, + { + "epoch": 1.6697104834713667, + "grad_norm": 9.10543146375356, + "learning_rate": 3.4892147372706854e-07, + "loss": 0.908, + "step": 11794 + }, + { + "epoch": 1.669852056346004, + "grad_norm": 9.78911510797416, + "learning_rate": 3.4862944705879364e-07, + "loss": 0.9495, + "step": 11795 + }, + { + "epoch": 1.6699936292206412, + 
"grad_norm": 10.205367258164864, + "learning_rate": 3.48337533486465e-07, + "loss": 0.9097, + "step": 11796 + }, + { + "epoch": 1.6701352020952784, + "grad_norm": 10.296903896725386, + "learning_rate": 3.480457330254275e-07, + "loss": 1.0547, + "step": 11797 + }, + { + "epoch": 1.6702767749699157, + "grad_norm": 9.442829572457223, + "learning_rate": 3.477540456910217e-07, + "loss": 0.9873, + "step": 11798 + }, + { + "epoch": 1.670418347844553, + "grad_norm": 8.792203431820912, + "learning_rate": 3.474624714985805e-07, + "loss": 0.9529, + "step": 11799 + }, + { + "epoch": 1.6705599207191901, + "grad_norm": 9.504623091286781, + "learning_rate": 3.4717101046343265e-07, + "loss": 1.013, + "step": 11800 + }, + { + "epoch": 1.6707014935938274, + "grad_norm": 8.156218663882571, + "learning_rate": 3.4687966260089913e-07, + "loss": 0.9721, + "step": 11801 + }, + { + "epoch": 1.6708430664684646, + "grad_norm": 8.502459899395017, + "learning_rate": 3.465884279262968e-07, + "loss": 0.8711, + "step": 11802 + }, + { + "epoch": 1.6709846393431018, + "grad_norm": 8.789533407523557, + "learning_rate": 3.4629730645493493e-07, + "loss": 1.0013, + "step": 11803 + }, + { + "epoch": 1.671126212217739, + "grad_norm": 10.126917527962991, + "learning_rate": 3.4600629820211755e-07, + "loss": 1.0359, + "step": 11804 + }, + { + "epoch": 1.6712677850923763, + "grad_norm": 9.076509998476364, + "learning_rate": 3.4571540318314335e-07, + "loss": 0.909, + "step": 11805 + }, + { + "epoch": 1.6714093579670135, + "grad_norm": 9.500723961801008, + "learning_rate": 3.4542462141330365e-07, + "loss": 0.9654, + "step": 11806 + }, + { + "epoch": 1.6715509308416507, + "grad_norm": 7.771400928648504, + "learning_rate": 3.4513395290788566e-07, + "loss": 1.0615, + "step": 11807 + }, + { + "epoch": 1.671692503716288, + "grad_norm": 9.947456793948916, + "learning_rate": 3.448433976821683e-07, + "loss": 0.9564, + "step": 11808 + }, + { + "epoch": 1.6718340765909252, + "grad_norm": 9.783364356889118, + "learning_rate": 3.445529557514274e-07, + "loss": 1.0065, + "step": 11809 + }, + { + "epoch": 1.6719756494655624, + "grad_norm": 10.580934227582079, + "learning_rate": 3.4426262713092963e-07, + "loss": 0.9055, + "step": 11810 + }, + { + "epoch": 1.6721172223401997, + "grad_norm": 9.559190763435879, + "learning_rate": 3.4397241183593887e-07, + "loss": 0.9582, + "step": 11811 + }, + { + "epoch": 1.672258795214837, + "grad_norm": 9.978690903909277, + "learning_rate": 3.436823098817102e-07, + "loss": 0.8356, + "step": 11812 + }, + { + "epoch": 1.6724003680894741, + "grad_norm": 9.799627912036021, + "learning_rate": 3.4339232128349527e-07, + "loss": 1.0757, + "step": 11813 + }, + { + "epoch": 1.6725419409641114, + "grad_norm": 10.908411724449058, + "learning_rate": 3.43102446056538e-07, + "loss": 0.9505, + "step": 11814 + }, + { + "epoch": 1.6726835138387486, + "grad_norm": 9.363446657122646, + "learning_rate": 3.428126842160762e-07, + "loss": 1.0524, + "step": 11815 + }, + { + "epoch": 1.6728250867133858, + "grad_norm": 8.457961301342067, + "learning_rate": 3.4252303577734376e-07, + "loss": 0.8464, + "step": 11816 + }, + { + "epoch": 1.672966659588023, + "grad_norm": 9.354485611834129, + "learning_rate": 3.4223350075556605e-07, + "loss": 0.9621, + "step": 11817 + }, + { + "epoch": 1.6731082324626603, + "grad_norm": 9.05830486144569, + "learning_rate": 3.419440791659645e-07, + "loss": 0.9729, + "step": 11818 + }, + { + "epoch": 1.6732498053372975, + "grad_norm": 10.678985594961834, + "learning_rate": 3.4165477102375386e-07, + "loss": 0.9458, + 
"step": 11819 + }, + { + "epoch": 1.6733913782119347, + "grad_norm": 9.107981239859082, + "learning_rate": 3.413655763441423e-07, + "loss": 0.9884, + "step": 11820 + }, + { + "epoch": 1.673532951086572, + "grad_norm": 10.18986399551389, + "learning_rate": 3.4107649514233343e-07, + "loss": 1.0302, + "step": 11821 + }, + { + "epoch": 1.6736745239612092, + "grad_norm": 9.874161310434049, + "learning_rate": 3.4078752743352263e-07, + "loss": 1.065, + "step": 11822 + }, + { + "epoch": 1.6738160968358462, + "grad_norm": 10.585462180864827, + "learning_rate": 3.404986732329027e-07, + "loss": 1.1469, + "step": 11823 + }, + { + "epoch": 1.6739576697104834, + "grad_norm": 8.814066862589772, + "learning_rate": 3.402099325556563e-07, + "loss": 0.9319, + "step": 11824 + }, + { + "epoch": 1.6740992425851207, + "grad_norm": 9.82492115095885, + "learning_rate": 3.3992130541696336e-07, + "loss": 0.9344, + "step": 11825 + }, + { + "epoch": 1.674240815459758, + "grad_norm": 8.477562854998435, + "learning_rate": 3.396327918319972e-07, + "loss": 0.903, + "step": 11826 + }, + { + "epoch": 1.6743823883343951, + "grad_norm": 9.440343567366702, + "learning_rate": 3.3934439181592393e-07, + "loss": 0.9049, + "step": 11827 + }, + { + "epoch": 1.6745239612090324, + "grad_norm": 7.291975643332813, + "learning_rate": 3.390561053839053e-07, + "loss": 1.0322, + "step": 11828 + }, + { + "epoch": 1.6746655340836696, + "grad_norm": 10.502238806963785, + "learning_rate": 3.3876793255109565e-07, + "loss": 0.9916, + "step": 11829 + }, + { + "epoch": 1.6748071069583068, + "grad_norm": 8.01818640163639, + "learning_rate": 3.3847987333264473e-07, + "loss": 0.9518, + "step": 11830 + }, + { + "epoch": 1.674948679832944, + "grad_norm": 11.019135217537846, + "learning_rate": 3.381919277436946e-07, + "loss": 0.9812, + "step": 11831 + }, + { + "epoch": 1.6750902527075813, + "grad_norm": 12.173396161165531, + "learning_rate": 3.3790409579938343e-07, + "loss": 1.0364, + "step": 11832 + }, + { + "epoch": 1.6752318255822183, + "grad_norm": 8.865432525255791, + "learning_rate": 3.376163775148414e-07, + "loss": 0.9214, + "step": 11833 + }, + { + "epoch": 1.6753733984568555, + "grad_norm": 10.021925731182012, + "learning_rate": 3.3732877290519437e-07, + "loss": 1.0601, + "step": 11834 + }, + { + "epoch": 1.6755149713314927, + "grad_norm": 9.153035611920558, + "learning_rate": 3.370412819855615e-07, + "loss": 0.971, + "step": 11835 + }, + { + "epoch": 1.67565654420613, + "grad_norm": 8.932999338008011, + "learning_rate": 3.3675390477105496e-07, + "loss": 0.9648, + "step": 11836 + }, + { + "epoch": 1.6757981170807672, + "grad_norm": 7.962786428908058, + "learning_rate": 3.364666412767831e-07, + "loss": 0.9761, + "step": 11837 + }, + { + "epoch": 1.6759396899554044, + "grad_norm": 7.90513802047774, + "learning_rate": 3.3617949151784623e-07, + "loss": 0.8883, + "step": 11838 + }, + { + "epoch": 1.6760812628300417, + "grad_norm": 9.45609417148436, + "learning_rate": 3.358924555093407e-07, + "loss": 0.8499, + "step": 11839 + }, + { + "epoch": 1.676222835704679, + "grad_norm": 9.371575709261172, + "learning_rate": 3.3560553326635467e-07, + "loss": 0.9934, + "step": 11840 + }, + { + "epoch": 1.6763644085793161, + "grad_norm": 11.193532670476266, + "learning_rate": 3.353187248039716e-07, + "loss": 1.1353, + "step": 11841 + }, + { + "epoch": 1.6765059814539534, + "grad_norm": 9.245526443437853, + "learning_rate": 3.3503203013727006e-07, + "loss": 1.0195, + "step": 11842 + }, + { + "epoch": 1.6766475543285906, + "grad_norm": 7.770699207556319, + 
"learning_rate": 3.3474544928131956e-07, + "loss": 0.9482, + "step": 11843 + }, + { + "epoch": 1.6767891272032278, + "grad_norm": 8.884202511213593, + "learning_rate": 3.3445898225118704e-07, + "loss": 0.9994, + "step": 11844 + }, + { + "epoch": 1.676930700077865, + "grad_norm": 9.815442075485294, + "learning_rate": 3.3417262906193096e-07, + "loss": 0.8812, + "step": 11845 + }, + { + "epoch": 1.6770722729525023, + "grad_norm": 9.916383199620654, + "learning_rate": 3.3388638972860515e-07, + "loss": 0.9923, + "step": 11846 + }, + { + "epoch": 1.6772138458271395, + "grad_norm": 8.512667473085285, + "learning_rate": 3.3360026426625615e-07, + "loss": 0.9772, + "step": 11847 + }, + { + "epoch": 1.6773554187017767, + "grad_norm": 8.345683241770722, + "learning_rate": 3.333142526899255e-07, + "loss": 0.9373, + "step": 11848 + }, + { + "epoch": 1.677496991576414, + "grad_norm": 10.53768624608355, + "learning_rate": 3.330283550146499e-07, + "loss": 0.9259, + "step": 11849 + }, + { + "epoch": 1.6776385644510512, + "grad_norm": 8.952089153237893, + "learning_rate": 3.3274257125545747e-07, + "loss": 0.9924, + "step": 11850 + }, + { + "epoch": 1.6777801373256884, + "grad_norm": 9.44404587247469, + "learning_rate": 3.3245690142737236e-07, + "loss": 0.903, + "step": 11851 + }, + { + "epoch": 1.6779217102003257, + "grad_norm": 7.210224087390132, + "learning_rate": 3.3217134554541145e-07, + "loss": 0.8641, + "step": 11852 + }, + { + "epoch": 1.6780632830749629, + "grad_norm": 9.015100738250283, + "learning_rate": 3.3188590362458696e-07, + "loss": 0.9772, + "step": 11853 + }, + { + "epoch": 1.6782048559496001, + "grad_norm": 10.707020917885384, + "learning_rate": 3.316005756799032e-07, + "loss": 1.094, + "step": 11854 + }, + { + "epoch": 1.6783464288242373, + "grad_norm": 7.877070079174525, + "learning_rate": 3.313153617263612e-07, + "loss": 0.8554, + "step": 11855 + }, + { + "epoch": 1.6784880016988746, + "grad_norm": 10.108267816350649, + "learning_rate": 3.310302617789532e-07, + "loss": 0.99, + "step": 11856 + }, + { + "epoch": 1.6786295745735118, + "grad_norm": 10.738785327724054, + "learning_rate": 3.307452758526669e-07, + "loss": 0.9674, + "step": 11857 + }, + { + "epoch": 1.678771147448149, + "grad_norm": 7.415109478070931, + "learning_rate": 3.3046040396248453e-07, + "loss": 0.954, + "step": 11858 + }, + { + "epoch": 1.6789127203227863, + "grad_norm": 10.065126064551427, + "learning_rate": 3.3017564612338013e-07, + "loss": 0.956, + "step": 11859 + }, + { + "epoch": 1.6790542931974235, + "grad_norm": 10.2767472538081, + "learning_rate": 3.298910023503249e-07, + "loss": 1.0592, + "step": 11860 + }, + { + "epoch": 1.6791958660720607, + "grad_norm": 7.79606523303107, + "learning_rate": 3.296064726582812e-07, + "loss": 0.8422, + "step": 11861 + }, + { + "epoch": 1.679337438946698, + "grad_norm": 9.64881862860601, + "learning_rate": 3.2932205706220714e-07, + "loss": 1.002, + "step": 11862 + }, + { + "epoch": 1.6794790118213352, + "grad_norm": 11.500534708611786, + "learning_rate": 3.290377555770538e-07, + "loss": 0.7866, + "step": 11863 + }, + { + "epoch": 1.6796205846959722, + "grad_norm": 7.591650904259787, + "learning_rate": 3.287535682177667e-07, + "loss": 0.9144, + "step": 11864 + }, + { + "epoch": 1.6797621575706094, + "grad_norm": 9.593241028844215, + "learning_rate": 3.2846949499928616e-07, + "loss": 0.9439, + "step": 11865 + }, + { + "epoch": 1.6799037304452467, + "grad_norm": 9.866170291614905, + "learning_rate": 3.281855359365452e-07, + "loss": 0.9068, + "step": 11866 + }, + { + "epoch": 
1.6800453033198839, + "grad_norm": 8.397437713019547, + "learning_rate": 3.27901691044471e-07, + "loss": 0.9758, + "step": 11867 + }, + { + "epoch": 1.6801868761945211, + "grad_norm": 9.565142497044649, + "learning_rate": 3.27617960337985e-07, + "loss": 0.9678, + "step": 11868 + }, + { + "epoch": 1.6803284490691583, + "grad_norm": 11.643247659671811, + "learning_rate": 3.273343438320034e-07, + "loss": 1.0641, + "step": 11869 + }, + { + "epoch": 1.6804700219437956, + "grad_norm": 8.784963453864915, + "learning_rate": 3.2705084154143504e-07, + "loss": 0.9687, + "step": 11870 + }, + { + "epoch": 1.6806115948184328, + "grad_norm": 7.894906934947024, + "learning_rate": 3.267674534811835e-07, + "loss": 0.9253, + "step": 11871 + }, + { + "epoch": 1.68075316769307, + "grad_norm": 9.751807020923858, + "learning_rate": 3.264841796661469e-07, + "loss": 1.0447, + "step": 11872 + }, + { + "epoch": 1.6808947405677073, + "grad_norm": 10.262692664100072, + "learning_rate": 3.2620102011121616e-07, + "loss": 0.9982, + "step": 11873 + }, + { + "epoch": 1.6810363134423445, + "grad_norm": 8.513578900544752, + "learning_rate": 3.259179748312774e-07, + "loss": 0.8752, + "step": 11874 + }, + { + "epoch": 1.6811778863169815, + "grad_norm": 10.098826546060778, + "learning_rate": 3.25635043841209e-07, + "loss": 1.0218, + "step": 11875 + }, + { + "epoch": 1.6813194591916187, + "grad_norm": 9.50536395777282, + "learning_rate": 3.253522271558857e-07, + "loss": 0.9274, + "step": 11876 + }, + { + "epoch": 1.681461032066256, + "grad_norm": 9.854272649623868, + "learning_rate": 3.2506952479017417e-07, + "loss": 0.8875, + "step": 11877 + }, + { + "epoch": 1.6816026049408932, + "grad_norm": 9.047955647758702, + "learning_rate": 3.247869367589354e-07, + "loss": 0.9565, + "step": 11878 + }, + { + "epoch": 1.6817441778155304, + "grad_norm": 9.260458317860728, + "learning_rate": 3.245044630770264e-07, + "loss": 0.8994, + "step": 11879 + }, + { + "epoch": 1.6818857506901677, + "grad_norm": 9.304597304873509, + "learning_rate": 3.242221037592949e-07, + "loss": 1.0018, + "step": 11880 + }, + { + "epoch": 1.6820273235648049, + "grad_norm": 10.57726235865908, + "learning_rate": 3.2393985882058555e-07, + "loss": 0.9592, + "step": 11881 + }, + { + "epoch": 1.6821688964394421, + "grad_norm": 8.249280493601207, + "learning_rate": 3.2365772827573473e-07, + "loss": 0.9758, + "step": 11882 + }, + { + "epoch": 1.6823104693140793, + "grad_norm": 10.774387110297228, + "learning_rate": 3.23375712139575e-07, + "loss": 1.0518, + "step": 11883 + }, + { + "epoch": 1.6824520421887166, + "grad_norm": 8.322223225440574, + "learning_rate": 3.230938104269307e-07, + "loss": 0.8852, + "step": 11884 + }, + { + "epoch": 1.6825936150633538, + "grad_norm": 8.819556907544174, + "learning_rate": 3.228120231526219e-07, + "loss": 1.0592, + "step": 11885 + }, + { + "epoch": 1.682735187937991, + "grad_norm": 10.29017506857364, + "learning_rate": 3.225303503314614e-07, + "loss": 1.012, + "step": 11886 + }, + { + "epoch": 1.6828767608126283, + "grad_norm": 8.374196028195922, + "learning_rate": 3.2224879197825717e-07, + "loss": 0.9008, + "step": 11887 + }, + { + "epoch": 1.6830183336872655, + "grad_norm": 8.906665885986172, + "learning_rate": 3.2196734810781007e-07, + "loss": 1.0331, + "step": 11888 + }, + { + "epoch": 1.6831599065619027, + "grad_norm": 9.188012348862596, + "learning_rate": 3.2168601873491493e-07, + "loss": 0.9511, + "step": 11889 + }, + { + "epoch": 1.68330147943654, + "grad_norm": 9.162084185440218, + "learning_rate": 3.214048038743622e-07, + 
"loss": 0.9543, + "step": 11890 + }, + { + "epoch": 1.6834430523111772, + "grad_norm": 10.329737088752738, + "learning_rate": 3.2112370354093397e-07, + "loss": 0.9283, + "step": 11891 + }, + { + "epoch": 1.6835846251858144, + "grad_norm": 9.523087004548021, + "learning_rate": 3.208427177494081e-07, + "loss": 0.962, + "step": 11892 + }, + { + "epoch": 1.6837261980604517, + "grad_norm": 8.774238366441432, + "learning_rate": 3.205618465145563e-07, + "loss": 0.9373, + "step": 11893 + }, + { + "epoch": 1.6838677709350889, + "grad_norm": 10.241426557473764, + "learning_rate": 3.202810898511424e-07, + "loss": 1.0365, + "step": 11894 + }, + { + "epoch": 1.6840093438097261, + "grad_norm": 8.280929847142819, + "learning_rate": 3.2000044777392684e-07, + "loss": 0.9601, + "step": 11895 + }, + { + "epoch": 1.6841509166843633, + "grad_norm": 9.903792885828672, + "learning_rate": 3.1971992029766197e-07, + "loss": 0.829, + "step": 11896 + }, + { + "epoch": 1.6842924895590006, + "grad_norm": 9.615652722655641, + "learning_rate": 3.194395074370957e-07, + "loss": 1.0548, + "step": 11897 + }, + { + "epoch": 1.6844340624336378, + "grad_norm": 10.079324056577406, + "learning_rate": 3.191592092069684e-07, + "loss": 1.0214, + "step": 11898 + }, + { + "epoch": 1.684575635308275, + "grad_norm": 9.242922162791476, + "learning_rate": 3.1887902562201506e-07, + "loss": 0.9221, + "step": 11899 + }, + { + "epoch": 1.6847172081829123, + "grad_norm": 10.130059002688446, + "learning_rate": 3.185989566969655e-07, + "loss": 0.8802, + "step": 11900 + }, + { + "epoch": 1.6848587810575495, + "grad_norm": 9.94511956374618, + "learning_rate": 3.1831900244654157e-07, + "loss": 1.0212, + "step": 11901 + }, + { + "epoch": 1.6850003539321867, + "grad_norm": 9.058780722520606, + "learning_rate": 3.1803916288546176e-07, + "loss": 0.8941, + "step": 11902 + }, + { + "epoch": 1.685141926806824, + "grad_norm": 9.817534303345381, + "learning_rate": 3.1775943802843546e-07, + "loss": 1.0519, + "step": 11903 + }, + { + "epoch": 1.6852834996814612, + "grad_norm": 9.346211086966147, + "learning_rate": 3.174798278901692e-07, + "loss": 0.828, + "step": 11904 + }, + { + "epoch": 1.6854250725560984, + "grad_norm": 8.951270106036935, + "learning_rate": 3.172003324853601e-07, + "loss": 1.0545, + "step": 11905 + }, + { + "epoch": 1.6855666454307354, + "grad_norm": 8.770281583295542, + "learning_rate": 3.169209518287028e-07, + "loss": 0.96, + "step": 11906 + }, + { + "epoch": 1.6857082183053727, + "grad_norm": 7.704215879749045, + "learning_rate": 3.166416859348825e-07, + "loss": 0.981, + "step": 11907 + }, + { + "epoch": 1.6858497911800099, + "grad_norm": 8.960388326404216, + "learning_rate": 3.163625348185814e-07, + "loss": 1.1275, + "step": 11908 + }, + { + "epoch": 1.6859913640546471, + "grad_norm": 8.927863200845307, + "learning_rate": 3.1608349849447385e-07, + "loss": 0.9737, + "step": 11909 + }, + { + "epoch": 1.6861329369292843, + "grad_norm": 9.827097537335263, + "learning_rate": 3.1580457697722777e-07, + "loss": 0.8913, + "step": 11910 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 10.656798407351143, + "learning_rate": 3.1552577028150677e-07, + "loss": 1.065, + "step": 11911 + }, + { + "epoch": 1.6864160826785588, + "grad_norm": 8.865094311724029, + "learning_rate": 3.152470784219669e-07, + "loss": 0.9225, + "step": 11912 + }, + { + "epoch": 1.686557655553196, + "grad_norm": 8.526312995994463, + "learning_rate": 3.1496850141325973e-07, + "loss": 0.9968, + "step": 11913 + }, + { + "epoch": 1.6866992284278333, + "grad_norm": 
9.624221175749552, + "learning_rate": 3.146900392700286e-07, + "loss": 0.9439, + "step": 11914 + }, + { + "epoch": 1.6868408013024705, + "grad_norm": 9.383336950632305, + "learning_rate": 3.1441169200691265e-07, + "loss": 1.0133, + "step": 11915 + }, + { + "epoch": 1.6869823741771075, + "grad_norm": 8.485156843436357, + "learning_rate": 3.141334596385448e-07, + "loss": 0.9938, + "step": 11916 + }, + { + "epoch": 1.6871239470517447, + "grad_norm": 10.424560566886996, + "learning_rate": 3.138553421795507e-07, + "loss": 1.026, + "step": 11917 + }, + { + "epoch": 1.687265519926382, + "grad_norm": 9.278864184620907, + "learning_rate": 3.1357733964455185e-07, + "loss": 0.9525, + "step": 11918 + }, + { + "epoch": 1.6874070928010192, + "grad_norm": 10.809381465242717, + "learning_rate": 3.1329945204816166e-07, + "loss": 1.0511, + "step": 11919 + }, + { + "epoch": 1.6875486656756564, + "grad_norm": 8.735957621600866, + "learning_rate": 3.1302167940498893e-07, + "loss": 0.9766, + "step": 11920 + }, + { + "epoch": 1.6876902385502937, + "grad_norm": 9.825362601153232, + "learning_rate": 3.127440217296354e-07, + "loss": 0.9797, + "step": 11921 + }, + { + "epoch": 1.6878318114249309, + "grad_norm": 8.123700786300049, + "learning_rate": 3.1246647903669794e-07, + "loss": 0.9128, + "step": 11922 + }, + { + "epoch": 1.6879733842995681, + "grad_norm": 8.544370741154944, + "learning_rate": 3.121890513407669e-07, + "loss": 0.8855, + "step": 11923 + }, + { + "epoch": 1.6881149571742053, + "grad_norm": 9.659503080172364, + "learning_rate": 3.119117386564255e-07, + "loss": 0.989, + "step": 11924 + }, + { + "epoch": 1.6882565300488426, + "grad_norm": 9.77817864993072, + "learning_rate": 3.1163454099825326e-07, + "loss": 0.9804, + "step": 11925 + }, + { + "epoch": 1.6883981029234798, + "grad_norm": 8.29319506285641, + "learning_rate": 3.113574583808207e-07, + "loss": 1.0307, + "step": 11926 + }, + { + "epoch": 1.688539675798117, + "grad_norm": 10.833348983362093, + "learning_rate": 3.110804908186954e-07, + "loss": 1.0891, + "step": 11927 + }, + { + "epoch": 1.6886812486727543, + "grad_norm": 11.058976986361047, + "learning_rate": 3.1080363832643593e-07, + "loss": 0.9622, + "step": 11928 + }, + { + "epoch": 1.6888228215473915, + "grad_norm": 9.028349836434284, + "learning_rate": 3.105269009185974e-07, + "loss": 0.8826, + "step": 11929 + }, + { + "epoch": 1.6889643944220287, + "grad_norm": 10.346505619185253, + "learning_rate": 3.102502786097272e-07, + "loss": 1.0466, + "step": 11930 + }, + { + "epoch": 1.689105967296666, + "grad_norm": 8.838653497807055, + "learning_rate": 3.0997377141436665e-07, + "loss": 0.9739, + "step": 11931 + }, + { + "epoch": 1.6892475401713032, + "grad_norm": 9.445258380996735, + "learning_rate": 3.096973793470523e-07, + "loss": 0.9447, + "step": 11932 + }, + { + "epoch": 1.6893891130459404, + "grad_norm": 9.04379847367905, + "learning_rate": 3.0942110242231316e-07, + "loss": 0.9191, + "step": 11933 + }, + { + "epoch": 1.6895306859205776, + "grad_norm": 9.9251728494424, + "learning_rate": 3.091449406546737e-07, + "loss": 1.0237, + "step": 11934 + }, + { + "epoch": 1.6896722587952149, + "grad_norm": 8.63080037000367, + "learning_rate": 3.088688940586507e-07, + "loss": 0.9055, + "step": 11935 + }, + { + "epoch": 1.689813831669852, + "grad_norm": 8.147046973575668, + "learning_rate": 3.0859296264875686e-07, + "loss": 0.8484, + "step": 11936 + }, + { + "epoch": 1.6899554045444893, + "grad_norm": 9.396829240774435, + "learning_rate": 3.083171464394963e-07, + "loss": 0.8984, + "step": 11937 + }, + 
{ + "epoch": 1.6900969774191266, + "grad_norm": 9.469124260987366, + "learning_rate": 3.0804144544536897e-07, + "loss": 0.9799, + "step": 11938 + }, + { + "epoch": 1.6902385502937638, + "grad_norm": 8.977116633244496, + "learning_rate": 3.0776585968086914e-07, + "loss": 0.8645, + "step": 11939 + }, + { + "epoch": 1.690380123168401, + "grad_norm": 9.467218155535035, + "learning_rate": 3.0749038916048356e-07, + "loss": 0.9405, + "step": 11940 + }, + { + "epoch": 1.6905216960430383, + "grad_norm": 10.405462195099858, + "learning_rate": 3.0721503389869344e-07, + "loss": 1.0007, + "step": 11941 + }, + { + "epoch": 1.6906632689176755, + "grad_norm": 10.236970016305904, + "learning_rate": 3.0693979390997333e-07, + "loss": 0.9135, + "step": 11942 + }, + { + "epoch": 1.6908048417923127, + "grad_norm": 10.044982447136542, + "learning_rate": 3.066646692087938e-07, + "loss": 0.9734, + "step": 11943 + }, + { + "epoch": 1.69094641466695, + "grad_norm": 9.71040167216401, + "learning_rate": 3.063896598096164e-07, + "loss": 1.0161, + "step": 11944 + }, + { + "epoch": 1.6910879875415872, + "grad_norm": 8.556292657627411, + "learning_rate": 3.0611476572689896e-07, + "loss": 0.9038, + "step": 11945 + }, + { + "epoch": 1.6912295604162244, + "grad_norm": 8.327576696156637, + "learning_rate": 3.0583998697509305e-07, + "loss": 0.8661, + "step": 11946 + }, + { + "epoch": 1.6913711332908614, + "grad_norm": 8.480032861753886, + "learning_rate": 3.055653235686426e-07, + "loss": 0.9283, + "step": 11947 + }, + { + "epoch": 1.6915127061654986, + "grad_norm": 8.909738814339402, + "learning_rate": 3.0529077552198724e-07, + "loss": 0.9836, + "step": 11948 + }, + { + "epoch": 1.6916542790401359, + "grad_norm": 18.4314203098587, + "learning_rate": 3.0501634284955867e-07, + "loss": 0.9622, + "step": 11949 + }, + { + "epoch": 1.691795851914773, + "grad_norm": 9.988615040130533, + "learning_rate": 3.0474202556578513e-07, + "loss": 0.9851, + "step": 11950 + }, + { + "epoch": 1.6919374247894103, + "grad_norm": 9.341756652707158, + "learning_rate": 3.044678236850862e-07, + "loss": 0.9136, + "step": 11951 + }, + { + "epoch": 1.6920789976640476, + "grad_norm": 9.85661013567541, + "learning_rate": 3.0419373722187645e-07, + "loss": 0.948, + "step": 11952 + }, + { + "epoch": 1.6922205705386848, + "grad_norm": 9.357950398283712, + "learning_rate": 3.039197661905652e-07, + "loss": 0.9108, + "step": 11953 + }, + { + "epoch": 1.692362143413322, + "grad_norm": 11.508190389671256, + "learning_rate": 3.0364591060555363e-07, + "loss": 0.8936, + "step": 11954 + }, + { + "epoch": 1.6925037162879593, + "grad_norm": 8.14259616175443, + "learning_rate": 3.033721704812395e-07, + "loss": 0.8438, + "step": 11955 + }, + { + "epoch": 1.6926452891625965, + "grad_norm": 8.693468417578877, + "learning_rate": 3.030985458320118e-07, + "loss": 0.9738, + "step": 11956 + }, + { + "epoch": 1.6927868620372337, + "grad_norm": 10.39374769512433, + "learning_rate": 3.028250366722563e-07, + "loss": 1.0176, + "step": 11957 + }, + { + "epoch": 1.6929284349118707, + "grad_norm": 10.352590603426199, + "learning_rate": 3.025516430163497e-07, + "loss": 1.0223, + "step": 11958 + }, + { + "epoch": 1.693070007786508, + "grad_norm": 8.718775814113892, + "learning_rate": 3.022783648786651e-07, + "loss": 0.9748, + "step": 11959 + }, + { + "epoch": 1.6932115806611452, + "grad_norm": 10.48356832252021, + "learning_rate": 3.020052022735678e-07, + "loss": 0.9554, + "step": 11960 + }, + { + "epoch": 1.6933531535357824, + "grad_norm": 8.428494543135914, + "learning_rate": 
3.017321552154187e-07, + "loss": 0.8462, + "step": 11961 + }, + { + "epoch": 1.6934947264104196, + "grad_norm": 9.624102265738177, + "learning_rate": 3.0145922371857097e-07, + "loss": 1.1041, + "step": 11962 + }, + { + "epoch": 1.6936362992850569, + "grad_norm": 9.509603765759028, + "learning_rate": 3.0118640779737225e-07, + "loss": 0.8537, + "step": 11963 + }, + { + "epoch": 1.693777872159694, + "grad_norm": 8.902846371972798, + "learning_rate": 3.009137074661647e-07, + "loss": 1.0184, + "step": 11964 + }, + { + "epoch": 1.6939194450343313, + "grad_norm": 7.659677664645526, + "learning_rate": 3.006411227392836e-07, + "loss": 0.9373, + "step": 11965 + }, + { + "epoch": 1.6940610179089686, + "grad_norm": 8.979883812915233, + "learning_rate": 3.003686536310593e-07, + "loss": 1.0425, + "step": 11966 + }, + { + "epoch": 1.6942025907836058, + "grad_norm": 10.390280260376048, + "learning_rate": 3.000963001558141e-07, + "loss": 1.0522, + "step": 11967 + }, + { + "epoch": 1.694344163658243, + "grad_norm": 9.783505895631857, + "learning_rate": 2.9982406232786614e-07, + "loss": 1.0407, + "step": 11968 + }, + { + "epoch": 1.6944857365328803, + "grad_norm": 10.16216361790011, + "learning_rate": 2.995519401615274e-07, + "loss": 0.9079, + "step": 11969 + }, + { + "epoch": 1.6946273094075175, + "grad_norm": 10.80149960879637, + "learning_rate": 2.9927993367110165e-07, + "loss": 0.9625, + "step": 11970 + }, + { + "epoch": 1.6947688822821547, + "grad_norm": 10.768702362008314, + "learning_rate": 2.9900804287088944e-07, + "loss": 1.0385, + "step": 11971 + }, + { + "epoch": 1.694910455156792, + "grad_norm": 9.615118329395704, + "learning_rate": 2.9873626777518343e-07, + "loss": 1.004, + "step": 11972 + }, + { + "epoch": 1.6950520280314292, + "grad_norm": 9.269698329983896, + "learning_rate": 2.984646083982698e-07, + "loss": 0.9411, + "step": 11973 + }, + { + "epoch": 1.6951936009060664, + "grad_norm": 9.38706052033342, + "learning_rate": 2.9819306475443096e-07, + "loss": 0.9899, + "step": 11974 + }, + { + "epoch": 1.6953351737807036, + "grad_norm": 9.115377709058938, + "learning_rate": 2.9792163685794015e-07, + "loss": 0.8801, + "step": 11975 + }, + { + "epoch": 1.6954767466553409, + "grad_norm": 8.913787329369187, + "learning_rate": 2.976503247230675e-07, + "loss": 1.0147, + "step": 11976 + }, + { + "epoch": 1.695618319529978, + "grad_norm": 9.408388217960782, + "learning_rate": 2.9737912836407477e-07, + "loss": 0.9077, + "step": 11977 + }, + { + "epoch": 1.6957598924046153, + "grad_norm": 7.586003877799191, + "learning_rate": 2.971080477952193e-07, + "loss": 0.961, + "step": 11978 + }, + { + "epoch": 1.6959014652792526, + "grad_norm": 8.617339782199318, + "learning_rate": 2.968370830307507e-07, + "loss": 1.0292, + "step": 11979 + }, + { + "epoch": 1.6960430381538898, + "grad_norm": 10.755669806357874, + "learning_rate": 2.965662340849146e-07, + "loss": 1.0223, + "step": 11980 + }, + { + "epoch": 1.696184611028527, + "grad_norm": 10.099490585322005, + "learning_rate": 2.9629550097194787e-07, + "loss": 0.9441, + "step": 11981 + }, + { + "epoch": 1.6963261839031643, + "grad_norm": 9.816980590291307, + "learning_rate": 2.960248837060842e-07, + "loss": 0.9685, + "step": 11982 + }, + { + "epoch": 1.6964677567778015, + "grad_norm": 8.860635227139078, + "learning_rate": 2.957543823015491e-07, + "loss": 1.0273, + "step": 11983 + }, + { + "epoch": 1.6966093296524387, + "grad_norm": 10.997541673099606, + "learning_rate": 2.9548399677256174e-07, + "loss": 1.0662, + "step": 11984 + }, + { + "epoch": 1.696750902527076, 
+ "grad_norm": 9.260829051003698, + "learning_rate": 2.9521372713333773e-07, + "loss": 1.0339, + "step": 11985 + }, + { + "epoch": 1.6968924754017132, + "grad_norm": 9.643650365427268, + "learning_rate": 2.9494357339808347e-07, + "loss": 0.9284, + "step": 11986 + }, + { + "epoch": 1.6970340482763504, + "grad_norm": 10.350994972650124, + "learning_rate": 2.946735355810018e-07, + "loss": 0.8918, + "step": 11987 + }, + { + "epoch": 1.6971756211509876, + "grad_norm": 11.027384915682108, + "learning_rate": 2.9440361369628773e-07, + "loss": 0.9987, + "step": 11988 + }, + { + "epoch": 1.6973171940256246, + "grad_norm": 9.493851529169294, + "learning_rate": 2.94133807758131e-07, + "loss": 0.9964, + "step": 11989 + }, + { + "epoch": 1.6974587669002619, + "grad_norm": 10.821883347166864, + "learning_rate": 2.9386411778071584e-07, + "loss": 0.9757, + "step": 11990 + }, + { + "epoch": 1.697600339774899, + "grad_norm": 9.408181027130473, + "learning_rate": 2.935945437782184e-07, + "loss": 0.9536, + "step": 11991 + }, + { + "epoch": 1.6977419126495363, + "grad_norm": 9.303503414372996, + "learning_rate": 2.933250857648112e-07, + "loss": 0.8363, + "step": 11992 + }, + { + "epoch": 1.6978834855241736, + "grad_norm": 9.8638006515469, + "learning_rate": 2.9305574375465884e-07, + "loss": 1.0356, + "step": 11993 + }, + { + "epoch": 1.6980250583988108, + "grad_norm": 9.129501277676757, + "learning_rate": 2.9278651776192073e-07, + "loss": 0.9285, + "step": 11994 + }, + { + "epoch": 1.698166631273448, + "grad_norm": 11.198716321678033, + "learning_rate": 2.925174078007487e-07, + "loss": 1.0013, + "step": 11995 + }, + { + "epoch": 1.6983082041480853, + "grad_norm": 9.177042504579532, + "learning_rate": 2.922484138852907e-07, + "loss": 1.0145, + "step": 11996 + }, + { + "epoch": 1.6984497770227225, + "grad_norm": 9.908358472872948, + "learning_rate": 2.9197953602968814e-07, + "loss": 1.0366, + "step": 11997 + }, + { + "epoch": 1.6985913498973597, + "grad_norm": 8.482830883602755, + "learning_rate": 2.917107742480743e-07, + "loss": 1.0254, + "step": 11998 + }, + { + "epoch": 1.6987329227719967, + "grad_norm": 11.541415398865933, + "learning_rate": 2.9144212855457906e-07, + "loss": 0.9426, + "step": 11999 + }, + { + "epoch": 1.698874495646634, + "grad_norm": 11.320241734380264, + "learning_rate": 2.911735989633238e-07, + "loss": 0.9643, + "step": 12000 + }, + { + "epoch": 1.6990160685212712, + "grad_norm": 8.955992873467988, + "learning_rate": 2.9090518548842594e-07, + "loss": 0.9502, + "step": 12001 + }, + { + "epoch": 1.6991576413959084, + "grad_norm": 8.02701299501068, + "learning_rate": 2.906368881439947e-07, + "loss": 0.9016, + "step": 12002 + }, + { + "epoch": 1.6992992142705456, + "grad_norm": 9.313852096815133, + "learning_rate": 2.903687069441358e-07, + "loss": 1.0138, + "step": 12003 + }, + { + "epoch": 1.6994407871451829, + "grad_norm": 9.14182559222672, + "learning_rate": 2.901006419029459e-07, + "loss": 0.9687, + "step": 12004 + }, + { + "epoch": 1.69958236001982, + "grad_norm": 7.825072403350386, + "learning_rate": 2.8983269303451715e-07, + "loss": 0.9112, + "step": 12005 + }, + { + "epoch": 1.6997239328944573, + "grad_norm": 8.889984805048003, + "learning_rate": 2.8956486035293635e-07, + "loss": 1.0099, + "step": 12006 + }, + { + "epoch": 1.6998655057690946, + "grad_norm": 8.619095259978621, + "learning_rate": 2.892971438722822e-07, + "loss": 0.871, + "step": 12007 + }, + { + "epoch": 1.7000070786437318, + "grad_norm": 9.662933505698302, + "learning_rate": 2.8902954360662925e-07, + "loss": 1.144, + 
"step": 12008 + }, + { + "epoch": 1.700148651518369, + "grad_norm": 9.346347001396204, + "learning_rate": 2.887620595700441e-07, + "loss": 0.9412, + "step": 12009 + }, + { + "epoch": 1.7002902243930063, + "grad_norm": 9.813455911713893, + "learning_rate": 2.8849469177658933e-07, + "loss": 0.9883, + "step": 12010 + }, + { + "epoch": 1.7004317972676435, + "grad_norm": 10.6520483062027, + "learning_rate": 2.8822744024031904e-07, + "loss": 1.0134, + "step": 12011 + }, + { + "epoch": 1.7005733701422807, + "grad_norm": 9.20529796808079, + "learning_rate": 2.8796030497528325e-07, + "loss": 0.914, + "step": 12012 + }, + { + "epoch": 1.700714943016918, + "grad_norm": 9.299247580003406, + "learning_rate": 2.8769328599552503e-07, + "loss": 0.9374, + "step": 12013 + }, + { + "epoch": 1.7008565158915552, + "grad_norm": 11.406695464332149, + "learning_rate": 2.874263833150814e-07, + "loss": 0.9508, + "step": 12014 + }, + { + "epoch": 1.7009980887661924, + "grad_norm": 10.321290864275673, + "learning_rate": 2.871595969479832e-07, + "loss": 1.0107, + "step": 12015 + }, + { + "epoch": 1.7011396616408296, + "grad_norm": 8.638283051743052, + "learning_rate": 2.86892926908254e-07, + "loss": 0.9346, + "step": 12016 + }, + { + "epoch": 1.7012812345154669, + "grad_norm": 8.385240812807318, + "learning_rate": 2.866263732099145e-07, + "loss": 0.9799, + "step": 12017 + }, + { + "epoch": 1.701422807390104, + "grad_norm": 9.186276308946754, + "learning_rate": 2.8635993586697555e-07, + "loss": 0.9613, + "step": 12018 + }, + { + "epoch": 1.7015643802647413, + "grad_norm": 9.420476857161137, + "learning_rate": 2.86093614893444e-07, + "loss": 0.8663, + "step": 12019 + }, + { + "epoch": 1.7017059531393786, + "grad_norm": 9.074739596411336, + "learning_rate": 2.8582741030332095e-07, + "loss": 1.0323, + "step": 12020 + }, + { + "epoch": 1.7018475260140158, + "grad_norm": 9.222580556789127, + "learning_rate": 2.8556132211059963e-07, + "loss": 0.934, + "step": 12021 + }, + { + "epoch": 1.701989098888653, + "grad_norm": 8.5696060877111, + "learning_rate": 2.852953503292688e-07, + "loss": 0.9121, + "step": 12022 + }, + { + "epoch": 1.7021306717632902, + "grad_norm": 10.448011393448837, + "learning_rate": 2.8502949497330954e-07, + "loss": 0.9729, + "step": 12023 + }, + { + "epoch": 1.7022722446379275, + "grad_norm": 9.35119944340155, + "learning_rate": 2.8476375605669905e-07, + "loss": 0.9201, + "step": 12024 + }, + { + "epoch": 1.7024138175125647, + "grad_norm": 9.750005086261694, + "learning_rate": 2.8449813359340576e-07, + "loss": 1.002, + "step": 12025 + }, + { + "epoch": 1.702555390387202, + "grad_norm": 11.052907114251415, + "learning_rate": 2.8423262759739307e-07, + "loss": 0.9178, + "step": 12026 + }, + { + "epoch": 1.7026969632618392, + "grad_norm": 9.673185145765475, + "learning_rate": 2.839672380826197e-07, + "loss": 0.9864, + "step": 12027 + }, + { + "epoch": 1.7028385361364764, + "grad_norm": 8.968428443168534, + "learning_rate": 2.8370196506303573e-07, + "loss": 0.9014, + "step": 12028 + }, + { + "epoch": 1.7029801090111136, + "grad_norm": 8.592392914864751, + "learning_rate": 2.8343680855258764e-07, + "loss": 0.9967, + "step": 12029 + }, + { + "epoch": 1.7031216818857506, + "grad_norm": 10.681531165157296, + "learning_rate": 2.83171768565213e-07, + "loss": 1.0126, + "step": 12030 + }, + { + "epoch": 1.7032632547603879, + "grad_norm": 9.748983085615304, + "learning_rate": 2.8290684511484615e-07, + "loss": 0.9565, + "step": 12031 + }, + { + "epoch": 1.703404827635025, + "grad_norm": 9.99420532181766, + 
"learning_rate": 2.826420382154127e-07, + "loss": 0.9809, + "step": 12032 + }, + { + "epoch": 1.7035464005096623, + "grad_norm": 7.686636775483338, + "learning_rate": 2.823773478808348e-07, + "loss": 0.8995, + "step": 12033 + }, + { + "epoch": 1.7036879733842996, + "grad_norm": 9.952128841069044, + "learning_rate": 2.8211277412502543e-07, + "loss": 1.0475, + "step": 12034 + }, + { + "epoch": 1.7038295462589368, + "grad_norm": 7.732995359116743, + "learning_rate": 2.818483169618941e-07, + "loss": 0.8353, + "step": 12035 + }, + { + "epoch": 1.703971119133574, + "grad_norm": 10.25273393596172, + "learning_rate": 2.8158397640534326e-07, + "loss": 1.0028, + "step": 12036 + }, + { + "epoch": 1.7041126920082112, + "grad_norm": 13.185046216285551, + "learning_rate": 2.813197524692679e-07, + "loss": 0.9964, + "step": 12037 + }, + { + "epoch": 1.7042542648828485, + "grad_norm": 10.003877269579265, + "learning_rate": 2.810556451675592e-07, + "loss": 1.0644, + "step": 12038 + }, + { + "epoch": 1.7043958377574857, + "grad_norm": 11.282504447870204, + "learning_rate": 2.807916545141004e-07, + "loss": 0.9009, + "step": 12039 + }, + { + "epoch": 1.704537410632123, + "grad_norm": 7.74158858091177, + "learning_rate": 2.805277805227702e-07, + "loss": 1.0147, + "step": 12040 + }, + { + "epoch": 1.70467898350676, + "grad_norm": 9.274094003193957, + "learning_rate": 2.8026402320743914e-07, + "loss": 0.9509, + "step": 12041 + }, + { + "epoch": 1.7048205563813972, + "grad_norm": 9.379501686117596, + "learning_rate": 2.8000038258197334e-07, + "loss": 1.1136, + "step": 12042 + }, + { + "epoch": 1.7049621292560344, + "grad_norm": 9.063732826301868, + "learning_rate": 2.7973685866023224e-07, + "loss": 0.9883, + "step": 12043 + }, + { + "epoch": 1.7051037021306716, + "grad_norm": 8.09028462931316, + "learning_rate": 2.7947345145606877e-07, + "loss": 0.884, + "step": 12044 + }, + { + "epoch": 1.7052452750053089, + "grad_norm": 11.147768531194826, + "learning_rate": 2.792101609833309e-07, + "loss": 1.0315, + "step": 12045 + }, + { + "epoch": 1.705386847879946, + "grad_norm": 9.497955503814566, + "learning_rate": 2.7894698725585866e-07, + "loss": 0.9971, + "step": 12046 + }, + { + "epoch": 1.7055284207545833, + "grad_norm": 9.66149049052656, + "learning_rate": 2.786839302874869e-07, + "loss": 0.9509, + "step": 12047 + }, + { + "epoch": 1.7056699936292206, + "grad_norm": 10.118308788008607, + "learning_rate": 2.784209900920451e-07, + "loss": 1.0315, + "step": 12048 + }, + { + "epoch": 1.7058115665038578, + "grad_norm": 9.802703048677282, + "learning_rate": 2.781581666833549e-07, + "loss": 1.0331, + "step": 12049 + }, + { + "epoch": 1.705953139378495, + "grad_norm": 10.471071993537608, + "learning_rate": 2.778954600752337e-07, + "loss": 0.9965, + "step": 12050 + }, + { + "epoch": 1.7060947122531323, + "grad_norm": 9.596143392924999, + "learning_rate": 2.776328702814909e-07, + "loss": 0.9363, + "step": 12051 + }, + { + "epoch": 1.7062362851277695, + "grad_norm": 8.950754008152225, + "learning_rate": 2.773703973159314e-07, + "loss": 1.0235, + "step": 12052 + }, + { + "epoch": 1.7063778580024067, + "grad_norm": 10.851100925546067, + "learning_rate": 2.771080411923524e-07, + "loss": 0.9603, + "step": 12053 + }, + { + "epoch": 1.706519430877044, + "grad_norm": 8.146767903316494, + "learning_rate": 2.7684580192454653e-07, + "loss": 0.9182, + "step": 12054 + }, + { + "epoch": 1.7066610037516812, + "grad_norm": 8.946750362059714, + "learning_rate": 2.7658367952629885e-07, + "loss": 0.9693, + "step": 12055 + }, + { + "epoch": 
1.7068025766263184, + "grad_norm": 8.520854275280525, + "learning_rate": 2.7632167401138996e-07, + "loss": 0.9691, + "step": 12056 + }, + { + "epoch": 1.7069441495009556, + "grad_norm": 9.204658523019233, + "learning_rate": 2.760597853935923e-07, + "loss": 0.9692, + "step": 12057 + }, + { + "epoch": 1.7070857223755929, + "grad_norm": 8.509892261150355, + "learning_rate": 2.757980136866731e-07, + "loss": 0.9017, + "step": 12058 + }, + { + "epoch": 1.70722729525023, + "grad_norm": 16.76415417593071, + "learning_rate": 2.755363589043944e-07, + "loss": 1.0981, + "step": 12059 + }, + { + "epoch": 1.7073688681248673, + "grad_norm": 12.607053181334335, + "learning_rate": 2.7527482106051025e-07, + "loss": 0.995, + "step": 12060 + }, + { + "epoch": 1.7075104409995046, + "grad_norm": 9.345797209231229, + "learning_rate": 2.7501340016877044e-07, + "loss": 1.0848, + "step": 12061 + }, + { + "epoch": 1.7076520138741418, + "grad_norm": 9.252642692201551, + "learning_rate": 2.7475209624291674e-07, + "loss": 0.9408, + "step": 12062 + }, + { + "epoch": 1.707793586748779, + "grad_norm": 9.156463724708056, + "learning_rate": 2.744909092966863e-07, + "loss": 0.9659, + "step": 12063 + }, + { + "epoch": 1.7079351596234162, + "grad_norm": 10.196371323131562, + "learning_rate": 2.742298393438092e-07, + "loss": 1.0373, + "step": 12064 + }, + { + "epoch": 1.7080767324980535, + "grad_norm": 9.98331547280711, + "learning_rate": 2.739688863980097e-07, + "loss": 1.011, + "step": 12065 + }, + { + "epoch": 1.7082183053726907, + "grad_norm": 9.610222110781555, + "learning_rate": 2.7370805047300633e-07, + "loss": 1.1545, + "step": 12066 + }, + { + "epoch": 1.708359878247328, + "grad_norm": 8.626259089908585, + "learning_rate": 2.734473315825112e-07, + "loss": 1.1023, + "step": 12067 + }, + { + "epoch": 1.7085014511219652, + "grad_norm": 8.251690315711457, + "learning_rate": 2.7318672974022936e-07, + "loss": 0.9123, + "step": 12068 + }, + { + "epoch": 1.7086430239966024, + "grad_norm": 8.94822039253389, + "learning_rate": 2.729262449598602e-07, + "loss": 0.8456, + "step": 12069 + }, + { + "epoch": 1.7087845968712396, + "grad_norm": 10.8454778432505, + "learning_rate": 2.7266587725509805e-07, + "loss": 1.015, + "step": 12070 + }, + { + "epoch": 1.7089261697458769, + "grad_norm": 8.787036658887278, + "learning_rate": 2.724056266396302e-07, + "loss": 0.9829, + "step": 12071 + }, + { + "epoch": 1.7090677426205139, + "grad_norm": 9.193449209182395, + "learning_rate": 2.7214549312713723e-07, + "loss": 0.9137, + "step": 12072 + }, + { + "epoch": 1.709209315495151, + "grad_norm": 8.079818695602556, + "learning_rate": 2.7188547673129477e-07, + "loss": 0.928, + "step": 12073 + }, + { + "epoch": 1.7093508883697883, + "grad_norm": 9.765124596554335, + "learning_rate": 2.716255774657714e-07, + "loss": 0.9102, + "step": 12074 + }, + { + "epoch": 1.7094924612444256, + "grad_norm": 7.891667484120934, + "learning_rate": 2.7136579534423003e-07, + "loss": 1.0628, + "step": 12075 + }, + { + "epoch": 1.7096340341190628, + "grad_norm": 8.780524498777554, + "learning_rate": 2.711061303803267e-07, + "loss": 1.0141, + "step": 12076 + }, + { + "epoch": 1.7097756069937, + "grad_norm": 8.359690323346081, + "learning_rate": 2.7084658258771265e-07, + "loss": 0.9683, + "step": 12077 + }, + { + "epoch": 1.7099171798683372, + "grad_norm": 8.850489767714437, + "learning_rate": 2.7058715198003155e-07, + "loss": 0.9542, + "step": 12078 + }, + { + "epoch": 1.7100587527429745, + "grad_norm": 9.839613078504051, + "learning_rate": 2.7032783857092096e-07, + 
"loss": 1.0401, + "step": 12079 + }, + { + "epoch": 1.7102003256176117, + "grad_norm": 11.410734617103017, + "learning_rate": 2.7006864237401426e-07, + "loss": 1.1198, + "step": 12080 + }, + { + "epoch": 1.710341898492249, + "grad_norm": 8.924129702238915, + "learning_rate": 2.6980956340293543e-07, + "loss": 0.8312, + "step": 12081 + }, + { + "epoch": 1.710483471366886, + "grad_norm": 9.749738640827923, + "learning_rate": 2.695506016713056e-07, + "loss": 0.9657, + "step": 12082 + }, + { + "epoch": 1.7106250442415232, + "grad_norm": 7.625617080483015, + "learning_rate": 2.692917571927373e-07, + "loss": 0.9213, + "step": 12083 + }, + { + "epoch": 1.7107666171161604, + "grad_norm": 9.499928122800663, + "learning_rate": 2.69033029980838e-07, + "loss": 1.01, + "step": 12084 + }, + { + "epoch": 1.7109081899907976, + "grad_norm": 8.416302311127206, + "learning_rate": 2.6877442004920873e-07, + "loss": 0.9777, + "step": 12085 + }, + { + "epoch": 1.7110497628654349, + "grad_norm": 7.159396217078311, + "learning_rate": 2.685159274114443e-07, + "loss": 0.8677, + "step": 12086 + }, + { + "epoch": 1.711191335740072, + "grad_norm": 10.250565676082832, + "learning_rate": 2.6825755208113454e-07, + "loss": 1.0206, + "step": 12087 + }, + { + "epoch": 1.7113329086147093, + "grad_norm": 10.79203738799522, + "learning_rate": 2.6799929407186095e-07, + "loss": 0.9905, + "step": 12088 + }, + { + "epoch": 1.7114744814893466, + "grad_norm": 9.565107800304464, + "learning_rate": 2.677411533972002e-07, + "loss": 1.0048, + "step": 12089 + }, + { + "epoch": 1.7116160543639838, + "grad_norm": 9.963936818661063, + "learning_rate": 2.674831300707223e-07, + "loss": 0.935, + "step": 12090 + }, + { + "epoch": 1.711757627238621, + "grad_norm": 12.44068875482125, + "learning_rate": 2.6722522410599196e-07, + "loss": 1.0228, + "step": 12091 + }, + { + "epoch": 1.7118992001132582, + "grad_norm": 10.758758525488227, + "learning_rate": 2.669674355165661e-07, + "loss": 1.0827, + "step": 12092 + }, + { + "epoch": 1.7120407729878955, + "grad_norm": 9.276773831178797, + "learning_rate": 2.667097643159974e-07, + "loss": 0.9878, + "step": 12093 + }, + { + "epoch": 1.7121823458625327, + "grad_norm": 10.09114726142463, + "learning_rate": 2.664522105178316e-07, + "loss": 1.1429, + "step": 12094 + }, + { + "epoch": 1.71232391873717, + "grad_norm": 9.14601995904823, + "learning_rate": 2.661947741356072e-07, + "loss": 0.9289, + "step": 12095 + }, + { + "epoch": 1.7124654916118072, + "grad_norm": 10.779722774529182, + "learning_rate": 2.6593745518285836e-07, + "loss": 0.9055, + "step": 12096 + }, + { + "epoch": 1.7126070644864444, + "grad_norm": 10.003286203207486, + "learning_rate": 2.6568025367311125e-07, + "loss": 0.9636, + "step": 12097 + }, + { + "epoch": 1.7127486373610816, + "grad_norm": 8.607789933913741, + "learning_rate": 2.654231696198878e-07, + "loss": 0.941, + "step": 12098 + }, + { + "epoch": 1.7128902102357189, + "grad_norm": 8.824245552471469, + "learning_rate": 2.651662030367019e-07, + "loss": 1.0006, + "step": 12099 + }, + { + "epoch": 1.713031783110356, + "grad_norm": 9.269828370448836, + "learning_rate": 2.64909353937062e-07, + "loss": 0.921, + "step": 12100 + }, + { + "epoch": 1.7131733559849933, + "grad_norm": 9.456549612115735, + "learning_rate": 2.646526223344714e-07, + "loss": 0.9015, + "step": 12101 + }, + { + "epoch": 1.7133149288596305, + "grad_norm": 9.327881265136492, + "learning_rate": 2.6439600824242515e-07, + "loss": 0.9839, + "step": 12102 + }, + { + "epoch": 1.7134565017342678, + "grad_norm": 
10.068245142113854, + "learning_rate": 2.6413951167441415e-07, + "loss": 0.9787, + "step": 12103 + }, + { + "epoch": 1.713598074608905, + "grad_norm": 9.488860978326937, + "learning_rate": 2.6388313264392174e-07, + "loss": 0.9184, + "step": 12104 + }, + { + "epoch": 1.7137396474835422, + "grad_norm": 9.521299477371404, + "learning_rate": 2.6362687116442605e-07, + "loss": 1.0116, + "step": 12105 + }, + { + "epoch": 1.7138812203581795, + "grad_norm": 8.18410928417935, + "learning_rate": 2.633707272493977e-07, + "loss": 0.8233, + "step": 12106 + }, + { + "epoch": 1.7140227932328167, + "grad_norm": 9.159468674221392, + "learning_rate": 2.631147009123028e-07, + "loss": 0.9196, + "step": 12107 + }, + { + "epoch": 1.714164366107454, + "grad_norm": 10.711256961524207, + "learning_rate": 2.628587921666001e-07, + "loss": 1.0642, + "step": 12108 + }, + { + "epoch": 1.7143059389820912, + "grad_norm": 10.00290485152938, + "learning_rate": 2.626030010257427e-07, + "loss": 0.985, + "step": 12109 + }, + { + "epoch": 1.7144475118567284, + "grad_norm": 11.243383199530486, + "learning_rate": 2.6234732750317765e-07, + "loss": 1.0022, + "step": 12110 + }, + { + "epoch": 1.7145890847313656, + "grad_norm": 10.612345620909485, + "learning_rate": 2.620917716123444e-07, + "loss": 0.9637, + "step": 12111 + }, + { + "epoch": 1.7147306576060029, + "grad_norm": 9.284949185217457, + "learning_rate": 2.6183633336667845e-07, + "loss": 0.9775, + "step": 12112 + }, + { + "epoch": 1.7148722304806399, + "grad_norm": 8.473823632262096, + "learning_rate": 2.615810127796073e-07, + "loss": 1.023, + "step": 12113 + }, + { + "epoch": 1.715013803355277, + "grad_norm": 8.274803560351288, + "learning_rate": 2.613258098645538e-07, + "loss": 0.9633, + "step": 12114 + }, + { + "epoch": 1.7151553762299143, + "grad_norm": 8.698669717744112, + "learning_rate": 2.610707246349328e-07, + "loss": 0.988, + "step": 12115 + }, + { + "epoch": 1.7152969491045515, + "grad_norm": 9.60770874463716, + "learning_rate": 2.608157571041542e-07, + "loss": 0.9395, + "step": 12116 + }, + { + "epoch": 1.7154385219791888, + "grad_norm": 9.753629742401312, + "learning_rate": 2.6056090728562216e-07, + "loss": 0.9432, + "step": 12117 + }, + { + "epoch": 1.715580094853826, + "grad_norm": 9.65956942591856, + "learning_rate": 2.60306175192733e-07, + "loss": 1.0883, + "step": 12118 + }, + { + "epoch": 1.7157216677284632, + "grad_norm": 8.639549919697288, + "learning_rate": 2.600515608388787e-07, + "loss": 0.9243, + "step": 12119 + }, + { + "epoch": 1.7158632406031005, + "grad_norm": 9.415764239345497, + "learning_rate": 2.5979706423744396e-07, + "loss": 0.9685, + "step": 12120 + }, + { + "epoch": 1.7160048134777377, + "grad_norm": 8.309491294227804, + "learning_rate": 2.595426854018063e-07, + "loss": 0.937, + "step": 12121 + }, + { + "epoch": 1.716146386352375, + "grad_norm": 9.918263368260973, + "learning_rate": 2.592884243453397e-07, + "loss": 1.05, + "step": 12122 + }, + { + "epoch": 1.716287959227012, + "grad_norm": 9.628386459093967, + "learning_rate": 2.590342810814095e-07, + "loss": 0.9603, + "step": 12123 + }, + { + "epoch": 1.7164295321016492, + "grad_norm": 9.147198991475392, + "learning_rate": 2.587802556233765e-07, + "loss": 0.9699, + "step": 12124 + }, + { + "epoch": 1.7165711049762864, + "grad_norm": 11.302920843405264, + "learning_rate": 2.5852634798459397e-07, + "loss": 0.9498, + "step": 12125 + }, + { + "epoch": 1.7167126778509236, + "grad_norm": 9.06531223203359, + "learning_rate": 2.5827255817841067e-07, + "loss": 0.9142, + "step": 12126 + }, + { + 
"epoch": 1.7168542507255609, + "grad_norm": 10.099630715445931, + "learning_rate": 2.580188862181668e-07, + "loss": 0.9608, + "step": 12127 + }, + { + "epoch": 1.716995823600198, + "grad_norm": 11.305644787668742, + "learning_rate": 2.5776533211719883e-07, + "loss": 1.119, + "step": 12128 + }, + { + "epoch": 1.7171373964748353, + "grad_norm": 8.917911858529658, + "learning_rate": 2.5751189588883506e-07, + "loss": 0.9621, + "step": 12129 + }, + { + "epoch": 1.7172789693494726, + "grad_norm": 9.498473195308401, + "learning_rate": 2.572585775463993e-07, + "loss": 1.0393, + "step": 12130 + }, + { + "epoch": 1.7174205422241098, + "grad_norm": 9.93690268202871, + "learning_rate": 2.57005377103208e-07, + "loss": 0.9229, + "step": 12131 + }, + { + "epoch": 1.717562115098747, + "grad_norm": 9.13678805686625, + "learning_rate": 2.567522945725709e-07, + "loss": 1.0497, + "step": 12132 + }, + { + "epoch": 1.7177036879733842, + "grad_norm": 8.902534004110063, + "learning_rate": 2.564993299677937e-07, + "loss": 0.8511, + "step": 12133 + }, + { + "epoch": 1.7178452608480215, + "grad_norm": 9.069568902551268, + "learning_rate": 2.5624648330217327e-07, + "loss": 0.8975, + "step": 12134 + }, + { + "epoch": 1.7179868337226587, + "grad_norm": 8.959566205967848, + "learning_rate": 2.559937545890029e-07, + "loss": 1.0008, + "step": 12135 + }, + { + "epoch": 1.718128406597296, + "grad_norm": 9.806759850089335, + "learning_rate": 2.557411438415669e-07, + "loss": 1.0788, + "step": 12136 + }, + { + "epoch": 1.7182699794719332, + "grad_norm": 8.901460129194572, + "learning_rate": 2.5548865107314606e-07, + "loss": 0.9771, + "step": 12137 + }, + { + "epoch": 1.7184115523465704, + "grad_norm": 9.317131375737594, + "learning_rate": 2.552362762970129e-07, + "loss": 1.0258, + "step": 12138 + }, + { + "epoch": 1.7185531252212076, + "grad_norm": 10.75378954429499, + "learning_rate": 2.54984019526435e-07, + "loss": 1.0206, + "step": 12139 + }, + { + "epoch": 1.7186946980958449, + "grad_norm": 12.236613530440493, + "learning_rate": 2.547318807746738e-07, + "loss": 1.1325, + "step": 12140 + }, + { + "epoch": 1.718836270970482, + "grad_norm": 9.984272223527165, + "learning_rate": 2.5447986005498303e-07, + "loss": 1.0947, + "step": 12141 + }, + { + "epoch": 1.7189778438451193, + "grad_norm": 7.89663702934933, + "learning_rate": 2.542279573806122e-07, + "loss": 0.9065, + "step": 12142 + }, + { + "epoch": 1.7191194167197565, + "grad_norm": 7.848879895812936, + "learning_rate": 2.539761727648024e-07, + "loss": 0.8932, + "step": 12143 + }, + { + "epoch": 1.7192609895943938, + "grad_norm": 8.00727846923577, + "learning_rate": 2.537245062207905e-07, + "loss": 0.8669, + "step": 12144 + }, + { + "epoch": 1.719402562469031, + "grad_norm": 11.547144575675949, + "learning_rate": 2.5347295776180697e-07, + "loss": 1.07, + "step": 12145 + }, + { + "epoch": 1.7195441353436682, + "grad_norm": 9.184399139902139, + "learning_rate": 2.5322152740107436e-07, + "loss": 0.9661, + "step": 12146 + }, + { + "epoch": 1.7196857082183055, + "grad_norm": 9.884117456458437, + "learning_rate": 2.5297021515181123e-07, + "loss": 0.9294, + "step": 12147 + }, + { + "epoch": 1.7198272810929427, + "grad_norm": 11.622147404964986, + "learning_rate": 2.527190210272282e-07, + "loss": 1.0718, + "step": 12148 + }, + { + "epoch": 1.71996885396758, + "grad_norm": 9.558712676773446, + "learning_rate": 2.5246794504053094e-07, + "loss": 0.9124, + "step": 12149 + }, + { + "epoch": 1.7201104268422172, + "grad_norm": 7.840757733920939, + "learning_rate": 2.522169872049174e-07, + 
"loss": 0.8814, + "step": 12150 + }, + { + "epoch": 1.7202519997168544, + "grad_norm": 9.130891752950616, + "learning_rate": 2.5196614753358136e-07, + "loss": 0.9806, + "step": 12151 + }, + { + "epoch": 1.7203935725914916, + "grad_norm": 9.275895857410191, + "learning_rate": 2.5171542603970897e-07, + "loss": 1.0379, + "step": 12152 + }, + { + "epoch": 1.7205351454661288, + "grad_norm": 8.203238931500481, + "learning_rate": 2.514648227364794e-07, + "loss": 0.7677, + "step": 12153 + }, + { + "epoch": 1.7206767183407659, + "grad_norm": 7.687570711136302, + "learning_rate": 2.512143376370682e-07, + "loss": 0.96, + "step": 12154 + }, + { + "epoch": 1.720818291215403, + "grad_norm": 9.593227906574537, + "learning_rate": 2.509639707546421e-07, + "loss": 0.9974, + "step": 12155 + }, + { + "epoch": 1.7209598640900403, + "grad_norm": 10.208988298437642, + "learning_rate": 2.507137221023634e-07, + "loss": 0.9181, + "step": 12156 + }, + { + "epoch": 1.7211014369646775, + "grad_norm": 11.350415838076842, + "learning_rate": 2.5046359169338677e-07, + "loss": 1.0141, + "step": 12157 + }, + { + "epoch": 1.7212430098393148, + "grad_norm": 9.74463408094845, + "learning_rate": 2.502135795408622e-07, + "loss": 0.9727, + "step": 12158 + }, + { + "epoch": 1.721384582713952, + "grad_norm": 10.01222283102341, + "learning_rate": 2.499636856579321e-07, + "loss": 0.9704, + "step": 12159 + }, + { + "epoch": 1.7215261555885892, + "grad_norm": 12.470723160956888, + "learning_rate": 2.4971391005773337e-07, + "loss": 0.9698, + "step": 12160 + }, + { + "epoch": 1.7216677284632265, + "grad_norm": 8.063096135301214, + "learning_rate": 2.4946425275339634e-07, + "loss": 0.9714, + "step": 12161 + }, + { + "epoch": 1.7218093013378637, + "grad_norm": 7.913806063836517, + "learning_rate": 2.492147137580458e-07, + "loss": 0.9555, + "step": 12162 + }, + { + "epoch": 1.721950874212501, + "grad_norm": 9.027021322265542, + "learning_rate": 2.4896529308479966e-07, + "loss": 0.9772, + "step": 12163 + }, + { + "epoch": 1.7220924470871382, + "grad_norm": 8.391706514184992, + "learning_rate": 2.48715990746769e-07, + "loss": 0.9307, + "step": 12164 + }, + { + "epoch": 1.7222340199617752, + "grad_norm": 10.088188410718558, + "learning_rate": 2.484668067570606e-07, + "loss": 0.9819, + "step": 12165 + }, + { + "epoch": 1.7223755928364124, + "grad_norm": 9.69594397935956, + "learning_rate": 2.482177411287728e-07, + "loss": 0.8441, + "step": 12166 + }, + { + "epoch": 1.7225171657110496, + "grad_norm": 7.857270886257473, + "learning_rate": 2.4796879387499947e-07, + "loss": 0.8536, + "step": 12167 + }, + { + "epoch": 1.7226587385856869, + "grad_norm": 8.82098023933919, + "learning_rate": 2.47719965008828e-07, + "loss": 0.9403, + "step": 12168 + }, + { + "epoch": 1.722800311460324, + "grad_norm": 9.747772353565383, + "learning_rate": 2.4747125454333805e-07, + "loss": 0.9172, + "step": 12169 + }, + { + "epoch": 1.7229418843349613, + "grad_norm": 8.990054675520481, + "learning_rate": 2.4722266249160493e-07, + "loss": 0.9912, + "step": 12170 + }, + { + "epoch": 1.7230834572095985, + "grad_norm": 9.429926985359895, + "learning_rate": 2.4697418886669654e-07, + "loss": 0.8984, + "step": 12171 + }, + { + "epoch": 1.7232250300842358, + "grad_norm": 8.851642658783117, + "learning_rate": 2.467258336816755e-07, + "loss": 0.9805, + "step": 12172 + }, + { + "epoch": 1.723366602958873, + "grad_norm": 7.406729767144597, + "learning_rate": 2.4647759694959724e-07, + "loss": 0.9574, + "step": 12173 + }, + { + "epoch": 1.7235081758335102, + "grad_norm": 
10.004499567528587, + "learning_rate": 2.462294786835109e-07, + "loss": 1.0425, + "step": 12174 + }, + { + "epoch": 1.7236497487081475, + "grad_norm": 8.835607224016346, + "learning_rate": 2.4598147889646097e-07, + "loss": 0.824, + "step": 12175 + }, + { + "epoch": 1.7237913215827847, + "grad_norm": 10.918515890807344, + "learning_rate": 2.4573359760148354e-07, + "loss": 0.9977, + "step": 12176 + }, + { + "epoch": 1.723932894457422, + "grad_norm": 10.580583069856997, + "learning_rate": 2.4548583481161044e-07, + "loss": 1.1313, + "step": 12177 + }, + { + "epoch": 1.7240744673320592, + "grad_norm": 8.807427427635519, + "learning_rate": 2.4523819053986544e-07, + "loss": 0.9808, + "step": 12178 + }, + { + "epoch": 1.7242160402066964, + "grad_norm": 10.115006778128329, + "learning_rate": 2.4499066479926807e-07, + "loss": 0.9967, + "step": 12179 + }, + { + "epoch": 1.7243576130813336, + "grad_norm": 8.584625393798815, + "learning_rate": 2.447432576028294e-07, + "loss": 0.9582, + "step": 12180 + }, + { + "epoch": 1.7244991859559708, + "grad_norm": 9.398164334532925, + "learning_rate": 2.4449596896355677e-07, + "loss": 0.9102, + "step": 12181 + }, + { + "epoch": 1.724640758830608, + "grad_norm": 8.719394454928882, + "learning_rate": 2.442487988944489e-07, + "loss": 1.0387, + "step": 12182 + }, + { + "epoch": 1.7247823317052453, + "grad_norm": 10.889150599061571, + "learning_rate": 2.440017474084999e-07, + "loss": 0.9324, + "step": 12183 + }, + { + "epoch": 1.7249239045798825, + "grad_norm": 10.850629839217392, + "learning_rate": 2.4375481451869713e-07, + "loss": 1.026, + "step": 12184 + }, + { + "epoch": 1.7250654774545198, + "grad_norm": 8.922090063492647, + "learning_rate": 2.4350800023802106e-07, + "loss": 0.9643, + "step": 12185 + }, + { + "epoch": 1.725207050329157, + "grad_norm": 8.66935140645445, + "learning_rate": 2.4326130457944713e-07, + "loss": 0.9385, + "step": 12186 + }, + { + "epoch": 1.7253486232037942, + "grad_norm": 9.36867177369464, + "learning_rate": 2.430147275559433e-07, + "loss": 0.9097, + "step": 12187 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 8.988334725232061, + "learning_rate": 2.4276826918047283e-07, + "loss": 0.9835, + "step": 12188 + }, + { + "epoch": 1.7256317689530687, + "grad_norm": 10.757576689840187, + "learning_rate": 2.425219294659908e-07, + "loss": 0.9332, + "step": 12189 + }, + { + "epoch": 1.725773341827706, + "grad_norm": 8.260344060423998, + "learning_rate": 2.422757084254479e-07, + "loss": 0.9211, + "step": 12190 + }, + { + "epoch": 1.7259149147023432, + "grad_norm": 10.785455475068613, + "learning_rate": 2.4202960607178806e-07, + "loss": 1.0413, + "step": 12191 + }, + { + "epoch": 1.7260564875769804, + "grad_norm": 8.284356452290865, + "learning_rate": 2.417836224179476e-07, + "loss": 0.8873, + "step": 12192 + }, + { + "epoch": 1.7261980604516176, + "grad_norm": 10.005483268416889, + "learning_rate": 2.4153775747685906e-07, + "loss": 0.9945, + "step": 12193 + }, + { + "epoch": 1.7263396333262548, + "grad_norm": 11.739634385402711, + "learning_rate": 2.412920112614464e-07, + "loss": 0.9282, + "step": 12194 + }, + { + "epoch": 1.726481206200892, + "grad_norm": 8.839734571105495, + "learning_rate": 2.41046383784628e-07, + "loss": 1.0704, + "step": 12195 + }, + { + "epoch": 1.726622779075529, + "grad_norm": 9.123614480375824, + "learning_rate": 2.4080087505931744e-07, + "loss": 0.9999, + "step": 12196 + }, + { + "epoch": 1.7267643519501663, + "grad_norm": 7.878575073604978, + "learning_rate": 2.4055548509841984e-07, + "loss": 0.9435, + "step": 
12197 + }, + { + "epoch": 1.7269059248248035, + "grad_norm": 8.010380209520283, + "learning_rate": 2.403102139148361e-07, + "loss": 0.9592, + "step": 12198 + }, + { + "epoch": 1.7270474976994408, + "grad_norm": 10.278852838416752, + "learning_rate": 2.400650615214592e-07, + "loss": 0.9958, + "step": 12199 + }, + { + "epoch": 1.727189070574078, + "grad_norm": 12.007539288124748, + "learning_rate": 2.3982002793117744e-07, + "loss": 1.0177, + "step": 12200 + }, + { + "epoch": 1.7273306434487152, + "grad_norm": 9.113332317420141, + "learning_rate": 2.3957511315687075e-07, + "loss": 0.9752, + "step": 12201 + }, + { + "epoch": 1.7274722163233525, + "grad_norm": 8.894944253010255, + "learning_rate": 2.393303172114159e-07, + "loss": 0.9205, + "step": 12202 + }, + { + "epoch": 1.7276137891979897, + "grad_norm": 8.332982119470675, + "learning_rate": 2.3908564010767966e-07, + "loss": 0.9414, + "step": 12203 + }, + { + "epoch": 1.727755362072627, + "grad_norm": 9.050476938399434, + "learning_rate": 2.388410818585263e-07, + "loss": 1.0249, + "step": 12204 + }, + { + "epoch": 1.7278969349472642, + "grad_norm": 10.096028262744163, + "learning_rate": 2.38596642476811e-07, + "loss": 1.0624, + "step": 12205 + }, + { + "epoch": 1.7280385078219012, + "grad_norm": 9.618764467581713, + "learning_rate": 2.383523219753839e-07, + "loss": 0.9188, + "step": 12206 + }, + { + "epoch": 1.7281800806965384, + "grad_norm": 8.448015456243306, + "learning_rate": 2.381081203670893e-07, + "loss": 0.9047, + "step": 12207 + }, + { + "epoch": 1.7283216535711756, + "grad_norm": 10.516788321958693, + "learning_rate": 2.3786403766476368e-07, + "loss": 0.9227, + "step": 12208 + }, + { + "epoch": 1.7284632264458129, + "grad_norm": 8.149375152037875, + "learning_rate": 2.3762007388123927e-07, + "loss": 0.8539, + "step": 12209 + }, + { + "epoch": 1.72860479932045, + "grad_norm": 10.269973275243649, + "learning_rate": 2.3737622902934022e-07, + "loss": 1.017, + "step": 12210 + }, + { + "epoch": 1.7287463721950873, + "grad_norm": 10.378705936978344, + "learning_rate": 2.371325031218863e-07, + "loss": 1.0143, + "step": 12211 + }, + { + "epoch": 1.7288879450697245, + "grad_norm": 7.824822313527717, + "learning_rate": 2.368888961716889e-07, + "loss": 0.8919, + "step": 12212 + }, + { + "epoch": 1.7290295179443618, + "grad_norm": 9.005965904610122, + "learning_rate": 2.366454081915548e-07, + "loss": 0.9454, + "step": 12213 + }, + { + "epoch": 1.729171090818999, + "grad_norm": 9.071487074707324, + "learning_rate": 2.3640203919428451e-07, + "loss": 0.9327, + "step": 12214 + }, + { + "epoch": 1.7293126636936362, + "grad_norm": 7.791243053759388, + "learning_rate": 2.3615878919267116e-07, + "loss": 0.9724, + "step": 12215 + }, + { + "epoch": 1.7294542365682735, + "grad_norm": 8.782151796999804, + "learning_rate": 2.359156581995023e-07, + "loss": 0.8735, + "step": 12216 + }, + { + "epoch": 1.7295958094429107, + "grad_norm": 10.802114802394092, + "learning_rate": 2.3567264622755853e-07, + "loss": 0.9309, + "step": 12217 + }, + { + "epoch": 1.729737382317548, + "grad_norm": 8.703023310671629, + "learning_rate": 2.3542975328961548e-07, + "loss": 0.988, + "step": 12218 + }, + { + "epoch": 1.7298789551921852, + "grad_norm": 9.662769277356356, + "learning_rate": 2.351869793984421e-07, + "loss": 0.9193, + "step": 12219 + }, + { + "epoch": 1.7300205280668224, + "grad_norm": 8.823522721830338, + "learning_rate": 2.3494432456680038e-07, + "loss": 0.8542, + "step": 12220 + }, + { + "epoch": 1.7301621009414596, + "grad_norm": 7.468082158703825, + 
"learning_rate": 2.3470178880744681e-07, + "loss": 0.9079, + "step": 12221 + }, + { + "epoch": 1.7303036738160968, + "grad_norm": 9.920179711836713, + "learning_rate": 2.3445937213313062e-07, + "loss": 1.0143, + "step": 12222 + }, + { + "epoch": 1.730445246690734, + "grad_norm": 9.636862046715304, + "learning_rate": 2.3421707455659664e-07, + "loss": 0.9595, + "step": 12223 + }, + { + "epoch": 1.7305868195653713, + "grad_norm": 11.479250557391584, + "learning_rate": 2.3397489609058104e-07, + "loss": 0.9513, + "step": 12224 + }, + { + "epoch": 1.7307283924400085, + "grad_norm": 9.219488185487508, + "learning_rate": 2.3373283674781588e-07, + "loss": 0.9802, + "step": 12225 + }, + { + "epoch": 1.7308699653146458, + "grad_norm": 8.449667062915621, + "learning_rate": 2.3349089654102597e-07, + "loss": 1.0481, + "step": 12226 + }, + { + "epoch": 1.731011538189283, + "grad_norm": 10.54186151095768, + "learning_rate": 2.332490754829289e-07, + "loss": 0.9228, + "step": 12227 + }, + { + "epoch": 1.7311531110639202, + "grad_norm": 9.46148382699543, + "learning_rate": 2.3300737358623843e-07, + "loss": 0.9697, + "step": 12228 + }, + { + "epoch": 1.7312946839385575, + "grad_norm": 12.119096400251701, + "learning_rate": 2.3276579086365937e-07, + "loss": 1.0096, + "step": 12229 + }, + { + "epoch": 1.7314362568131947, + "grad_norm": 8.90186595429642, + "learning_rate": 2.3252432732789264e-07, + "loss": 0.9813, + "step": 12230 + }, + { + "epoch": 1.731577829687832, + "grad_norm": 7.933618512624452, + "learning_rate": 2.3228298299163092e-07, + "loss": 0.8693, + "step": 12231 + }, + { + "epoch": 1.7317194025624691, + "grad_norm": 9.419376579761911, + "learning_rate": 2.3204175786756238e-07, + "loss": 0.9455, + "step": 12232 + }, + { + "epoch": 1.7318609754371064, + "grad_norm": 8.318994265437313, + "learning_rate": 2.3180065196836716e-07, + "loss": 0.9362, + "step": 12233 + }, + { + "epoch": 1.7320025483117436, + "grad_norm": 10.527287946280667, + "learning_rate": 2.3155966530672092e-07, + "loss": 1.1002, + "step": 12234 + }, + { + "epoch": 1.7321441211863808, + "grad_norm": 10.13729300516939, + "learning_rate": 2.3131879789529105e-07, + "loss": 1.018, + "step": 12235 + }, + { + "epoch": 1.732285694061018, + "grad_norm": 10.469155557093098, + "learning_rate": 2.3107804974674074e-07, + "loss": 0.8784, + "step": 12236 + }, + { + "epoch": 1.732427266935655, + "grad_norm": 9.24604754160401, + "learning_rate": 2.3083742087372574e-07, + "loss": 0.9436, + "step": 12237 + }, + { + "epoch": 1.7325688398102923, + "grad_norm": 8.750091552255414, + "learning_rate": 2.3059691128889504e-07, + "loss": 0.9452, + "step": 12238 + }, + { + "epoch": 1.7327104126849295, + "grad_norm": 8.969116203461265, + "learning_rate": 2.303565210048933e-07, + "loss": 0.8833, + "step": 12239 + }, + { + "epoch": 1.7328519855595668, + "grad_norm": 11.83836930979259, + "learning_rate": 2.301162500343562e-07, + "loss": 0.9921, + "step": 12240 + }, + { + "epoch": 1.732993558434204, + "grad_norm": 10.620607433009363, + "learning_rate": 2.2987609838991536e-07, + "loss": 1.1429, + "step": 12241 + }, + { + "epoch": 1.7331351313088412, + "grad_norm": 9.421730053041363, + "learning_rate": 2.2963606608419593e-07, + "loss": 0.9419, + "step": 12242 + }, + { + "epoch": 1.7332767041834785, + "grad_norm": 10.49081855074932, + "learning_rate": 2.29396153129815e-07, + "loss": 1.081, + "step": 12243 + }, + { + "epoch": 1.7334182770581157, + "grad_norm": 9.985702211523563, + "learning_rate": 2.2915635953938587e-07, + "loss": 0.9243, + "step": 12244 + }, + { + 
"epoch": 1.733559849932753, + "grad_norm": 10.851564960740362, + "learning_rate": 2.2891668532551315e-07, + "loss": 0.9696, + "step": 12245 + }, + { + "epoch": 1.7337014228073901, + "grad_norm": 10.077504220222076, + "learning_rate": 2.2867713050079732e-07, + "loss": 1.0455, + "step": 12246 + }, + { + "epoch": 1.7338429956820274, + "grad_norm": 9.496560528171996, + "learning_rate": 2.2843769507783137e-07, + "loss": 0.9531, + "step": 12247 + }, + { + "epoch": 1.7339845685566644, + "grad_norm": 9.374320450631759, + "learning_rate": 2.2819837906920134e-07, + "loss": 0.9467, + "step": 12248 + }, + { + "epoch": 1.7341261414313016, + "grad_norm": 8.932510370333375, + "learning_rate": 2.2795918248748939e-07, + "loss": 0.9179, + "step": 12249 + }, + { + "epoch": 1.7342677143059388, + "grad_norm": 6.96556942465352, + "learning_rate": 2.2772010534526822e-07, + "loss": 0.9226, + "step": 12250 + }, + { + "epoch": 1.734409287180576, + "grad_norm": 9.41452362070874, + "learning_rate": 2.2748114765510754e-07, + "loss": 1.0146, + "step": 12251 + }, + { + "epoch": 1.7345508600552133, + "grad_norm": 9.763071931895375, + "learning_rate": 2.272423094295681e-07, + "loss": 0.9774, + "step": 12252 + }, + { + "epoch": 1.7346924329298505, + "grad_norm": 10.545343855871852, + "learning_rate": 2.2700359068120624e-07, + "loss": 0.9721, + "step": 12253 + }, + { + "epoch": 1.7348340058044878, + "grad_norm": 10.53082401308327, + "learning_rate": 2.2676499142257002e-07, + "loss": 1.0, + "step": 12254 + }, + { + "epoch": 1.734975578679125, + "grad_norm": 9.362664817917802, + "learning_rate": 2.265265116662041e-07, + "loss": 1.0022, + "step": 12255 + }, + { + "epoch": 1.7351171515537622, + "grad_norm": 8.553162767231424, + "learning_rate": 2.2628815142464344e-07, + "loss": 0.9085, + "step": 12256 + }, + { + "epoch": 1.7352587244283995, + "grad_norm": 9.355653051420552, + "learning_rate": 2.2604991071041999e-07, + "loss": 1.0007, + "step": 12257 + }, + { + "epoch": 1.7354002973030367, + "grad_norm": 8.469102239849478, + "learning_rate": 2.258117895360573e-07, + "loss": 1.0184, + "step": 12258 + }, + { + "epoch": 1.735541870177674, + "grad_norm": 9.617053826624826, + "learning_rate": 2.2557378791407264e-07, + "loss": 0.9521, + "step": 12259 + }, + { + "epoch": 1.7356834430523111, + "grad_norm": 9.745099914884106, + "learning_rate": 2.2533590585697817e-07, + "loss": 0.9972, + "step": 12260 + }, + { + "epoch": 1.7358250159269484, + "grad_norm": 9.693550435415567, + "learning_rate": 2.2509814337727891e-07, + "loss": 0.9244, + "step": 12261 + }, + { + "epoch": 1.7359665888015856, + "grad_norm": 7.78806481433264, + "learning_rate": 2.2486050048747459e-07, + "loss": 0.933, + "step": 12262 + }, + { + "epoch": 1.7361081616762228, + "grad_norm": 8.735058479659218, + "learning_rate": 2.246229772000566e-07, + "loss": 0.9756, + "step": 12263 + }, + { + "epoch": 1.73624973455086, + "grad_norm": 10.337069379516304, + "learning_rate": 2.2438557352751216e-07, + "loss": 0.9255, + "step": 12264 + }, + { + "epoch": 1.7363913074254973, + "grad_norm": 8.65993336006584, + "learning_rate": 2.2414828948232186e-07, + "loss": 0.9412, + "step": 12265 + }, + { + "epoch": 1.7365328803001345, + "grad_norm": 9.56375452925266, + "learning_rate": 2.2391112507695877e-07, + "loss": 0.9092, + "step": 12266 + }, + { + "epoch": 1.7366744531747718, + "grad_norm": 9.441838561937368, + "learning_rate": 2.23674080323891e-07, + "loss": 0.9191, + "step": 12267 + }, + { + "epoch": 1.736816026049409, + "grad_norm": 8.047489831638934, + "learning_rate": 
2.2343715523557934e-07, + "loss": 0.9328, + "step": 12268 + }, + { + "epoch": 1.7369575989240462, + "grad_norm": 7.588184970561863, + "learning_rate": 2.232003498244792e-07, + "loss": 0.986, + "step": 12269 + }, + { + "epoch": 1.7370991717986835, + "grad_norm": 10.082972580522586, + "learning_rate": 2.229636641030386e-07, + "loss": 0.8987, + "step": 12270 + }, + { + "epoch": 1.7372407446733207, + "grad_norm": 9.431284897969107, + "learning_rate": 2.2272709808370013e-07, + "loss": 0.9064, + "step": 12271 + }, + { + "epoch": 1.737382317547958, + "grad_norm": 8.444218426183774, + "learning_rate": 2.2249065177890077e-07, + "loss": 0.8806, + "step": 12272 + }, + { + "epoch": 1.7375238904225951, + "grad_norm": 9.069862058825, + "learning_rate": 2.222543252010692e-07, + "loss": 0.9051, + "step": 12273 + }, + { + "epoch": 1.7376654632972324, + "grad_norm": 10.40269725741801, + "learning_rate": 2.2201811836262966e-07, + "loss": 1.081, + "step": 12274 + }, + { + "epoch": 1.7378070361718696, + "grad_norm": 10.606722920202877, + "learning_rate": 2.2178203127599883e-07, + "loss": 1.0522, + "step": 12275 + }, + { + "epoch": 1.7379486090465068, + "grad_norm": 8.199219680502567, + "learning_rate": 2.2154606395358824e-07, + "loss": 0.8656, + "step": 12276 + }, + { + "epoch": 1.738090181921144, + "grad_norm": 8.982366028786494, + "learning_rate": 2.2131021640780182e-07, + "loss": 0.9881, + "step": 12277 + }, + { + "epoch": 1.7382317547957813, + "grad_norm": 9.095689494634945, + "learning_rate": 2.2107448865103853e-07, + "loss": 0.9375, + "step": 12278 + }, + { + "epoch": 1.7383733276704183, + "grad_norm": 10.102710065689637, + "learning_rate": 2.2083888069569042e-07, + "loss": 0.9801, + "step": 12279 + }, + { + "epoch": 1.7385149005450555, + "grad_norm": 9.454312941886629, + "learning_rate": 2.2060339255414232e-07, + "loss": 0.9014, + "step": 12280 + }, + { + "epoch": 1.7386564734196928, + "grad_norm": 10.442186959332433, + "learning_rate": 2.2036802423877458e-07, + "loss": 0.9993, + "step": 12281 + }, + { + "epoch": 1.73879804629433, + "grad_norm": 9.066329252285064, + "learning_rate": 2.201327757619598e-07, + "loss": 0.9165, + "step": 12282 + }, + { + "epoch": 1.7389396191689672, + "grad_norm": 7.786863454701013, + "learning_rate": 2.198976471360656e-07, + "loss": 0.8719, + "step": 12283 + }, + { + "epoch": 1.7390811920436045, + "grad_norm": 10.143712589103263, + "learning_rate": 2.1966263837345125e-07, + "loss": 1.1036, + "step": 12284 + }, + { + "epoch": 1.7392227649182417, + "grad_norm": 9.293835260016827, + "learning_rate": 2.1942774948647245e-07, + "loss": 1.0191, + "step": 12285 + }, + { + "epoch": 1.739364337792879, + "grad_norm": 10.09293364539332, + "learning_rate": 2.1919298048747567e-07, + "loss": 0.9622, + "step": 12286 + }, + { + "epoch": 1.7395059106675161, + "grad_norm": 11.106666460102105, + "learning_rate": 2.189583313888033e-07, + "loss": 0.9584, + "step": 12287 + }, + { + "epoch": 1.7396474835421534, + "grad_norm": 9.70579524009629, + "learning_rate": 2.1872380220279127e-07, + "loss": 0.9768, + "step": 12288 + }, + { + "epoch": 1.7397890564167904, + "grad_norm": 10.345198515457243, + "learning_rate": 2.184893929417678e-07, + "loss": 1.0388, + "step": 12289 + }, + { + "epoch": 1.7399306292914276, + "grad_norm": 10.453211123991249, + "learning_rate": 2.182551036180558e-07, + "loss": 1.0447, + "step": 12290 + }, + { + "epoch": 1.7400722021660648, + "grad_norm": 9.929583004957648, + "learning_rate": 2.1802093424397126e-07, + "loss": 1.005, + "step": 12291 + }, + { + "epoch": 
1.740213775040702, + "grad_norm": 8.990832003748535, + "learning_rate": 2.1778688483182486e-07, + "loss": 0.99, + "step": 12292 + }, + { + "epoch": 1.7403553479153393, + "grad_norm": 9.838467006827516, + "learning_rate": 2.175529553939204e-07, + "loss": 0.8765, + "step": 12293 + }, + { + "epoch": 1.7404969207899765, + "grad_norm": 9.888808627523204, + "learning_rate": 2.1731914594255498e-07, + "loss": 0.9669, + "step": 12294 + }, + { + "epoch": 1.7406384936646138, + "grad_norm": 9.535611114841885, + "learning_rate": 2.1708545649002015e-07, + "loss": 0.9844, + "step": 12295 + }, + { + "epoch": 1.740780066539251, + "grad_norm": 10.29067255281964, + "learning_rate": 2.1685188704860056e-07, + "loss": 1.0723, + "step": 12296 + }, + { + "epoch": 1.7409216394138882, + "grad_norm": 9.346861662451825, + "learning_rate": 2.1661843763057522e-07, + "loss": 1.0187, + "step": 12297 + }, + { + "epoch": 1.7410632122885255, + "grad_norm": 9.420745731324955, + "learning_rate": 2.1638510824821547e-07, + "loss": 0.925, + "step": 12298 + }, + { + "epoch": 1.7412047851631627, + "grad_norm": 9.901023858041148, + "learning_rate": 2.161518989137884e-07, + "loss": 0.916, + "step": 12299 + }, + { + "epoch": 1.7413463580378, + "grad_norm": 11.000088431262972, + "learning_rate": 2.1591880963955314e-07, + "loss": 0.9787, + "step": 12300 + }, + { + "epoch": 1.7414879309124371, + "grad_norm": 9.864137880498644, + "learning_rate": 2.1568584043776237e-07, + "loss": 0.9596, + "step": 12301 + }, + { + "epoch": 1.7416295037870744, + "grad_norm": 7.916551475356103, + "learning_rate": 2.1545299132066432e-07, + "loss": 0.8779, + "step": 12302 + }, + { + "epoch": 1.7417710766617116, + "grad_norm": 10.393419942472542, + "learning_rate": 2.152202623004987e-07, + "loss": 0.9432, + "step": 12303 + }, + { + "epoch": 1.7419126495363488, + "grad_norm": 9.936213955945208, + "learning_rate": 2.1498765338950067e-07, + "loss": 0.9963, + "step": 12304 + }, + { + "epoch": 1.742054222410986, + "grad_norm": 8.562193036668116, + "learning_rate": 2.1475516459989743e-07, + "loss": 0.9829, + "step": 12305 + }, + { + "epoch": 1.7421957952856233, + "grad_norm": 8.039611975572791, + "learning_rate": 2.1452279594391167e-07, + "loss": 0.9642, + "step": 12306 + }, + { + "epoch": 1.7423373681602605, + "grad_norm": 9.296388347294391, + "learning_rate": 2.142905474337578e-07, + "loss": 0.9541, + "step": 12307 + }, + { + "epoch": 1.7424789410348978, + "grad_norm": 10.26251981967959, + "learning_rate": 2.1405841908164636e-07, + "loss": 0.9605, + "step": 12308 + }, + { + "epoch": 1.742620513909535, + "grad_norm": 8.70093447171925, + "learning_rate": 2.1382641089977867e-07, + "loss": 0.8826, + "step": 12309 + }, + { + "epoch": 1.7427620867841722, + "grad_norm": 8.715746290389452, + "learning_rate": 2.1359452290035194e-07, + "loss": 1.029, + "step": 12310 + }, + { + "epoch": 1.7429036596588094, + "grad_norm": 9.697965617044382, + "learning_rate": 2.1336275509555722e-07, + "loss": 0.8008, + "step": 12311 + }, + { + "epoch": 1.7430452325334467, + "grad_norm": 10.272527362845532, + "learning_rate": 2.1313110749757672e-07, + "loss": 1.0769, + "step": 12312 + }, + { + "epoch": 1.743186805408084, + "grad_norm": 9.745127316186865, + "learning_rate": 2.1289958011858903e-07, + "loss": 0.9183, + "step": 12313 + }, + { + "epoch": 1.7433283782827211, + "grad_norm": 8.778154255107667, + "learning_rate": 2.1266817297076469e-07, + "loss": 1.0178, + "step": 12314 + }, + { + "epoch": 1.7434699511573584, + "grad_norm": 10.455905059943811, + "learning_rate": 2.12436886066269e-07, 
+ "loss": 0.9821, + "step": 12315 + }, + { + "epoch": 1.7436115240319956, + "grad_norm": 7.849382422384694, + "learning_rate": 2.1220571941726082e-07, + "loss": 0.9082, + "step": 12316 + }, + { + "epoch": 1.7437530969066328, + "grad_norm": 10.6839775839413, + "learning_rate": 2.119746730358918e-07, + "loss": 0.9908, + "step": 12317 + }, + { + "epoch": 1.74389466978127, + "grad_norm": 8.884692420079272, + "learning_rate": 2.1174374693430865e-07, + "loss": 0.9795, + "step": 12318 + }, + { + "epoch": 1.7440362426559073, + "grad_norm": 9.698092274999969, + "learning_rate": 2.1151294112464997e-07, + "loss": 0.9567, + "step": 12319 + }, + { + "epoch": 1.7441778155305443, + "grad_norm": 11.553140347104021, + "learning_rate": 2.1128225561905024e-07, + "loss": 1.1119, + "step": 12320 + }, + { + "epoch": 1.7443193884051815, + "grad_norm": 8.926068017138455, + "learning_rate": 2.1105169042963585e-07, + "loss": 0.883, + "step": 12321 + }, + { + "epoch": 1.7444609612798188, + "grad_norm": 8.518739575206846, + "learning_rate": 2.1082124556852684e-07, + "loss": 0.8321, + "step": 12322 + }, + { + "epoch": 1.744602534154456, + "grad_norm": 8.611203423555587, + "learning_rate": 2.1059092104783824e-07, + "loss": 0.9984, + "step": 12323 + }, + { + "epoch": 1.7447441070290932, + "grad_norm": 11.264313472005538, + "learning_rate": 2.1036071687967785e-07, + "loss": 1.1026, + "step": 12324 + }, + { + "epoch": 1.7448856799037304, + "grad_norm": 9.310985467630049, + "learning_rate": 2.101306330761474e-07, + "loss": 0.89, + "step": 12325 + }, + { + "epoch": 1.7450272527783677, + "grad_norm": 8.392973785981223, + "learning_rate": 2.0990066964934193e-07, + "loss": 0.9588, + "step": 12326 + }, + { + "epoch": 1.745168825653005, + "grad_norm": 10.277823668597986, + "learning_rate": 2.096708266113512e-07, + "loss": 0.9316, + "step": 12327 + }, + { + "epoch": 1.7453103985276421, + "grad_norm": 7.129743101566559, + "learning_rate": 2.0944110397425693e-07, + "loss": 0.8784, + "step": 12328 + }, + { + "epoch": 1.7454519714022794, + "grad_norm": 8.593424620970884, + "learning_rate": 2.0921150175013616e-07, + "loss": 0.8837, + "step": 12329 + }, + { + "epoch": 1.7455935442769166, + "grad_norm": 8.51852059786613, + "learning_rate": 2.089820199510584e-07, + "loss": 0.9867, + "step": 12330 + }, + { + "epoch": 1.7457351171515536, + "grad_norm": 8.426033423005482, + "learning_rate": 2.0875265858908782e-07, + "loss": 0.9852, + "step": 12331 + }, + { + "epoch": 1.7458766900261908, + "grad_norm": 9.863097925276508, + "learning_rate": 2.0852341767628182e-07, + "loss": 1.0821, + "step": 12332 + }, + { + "epoch": 1.746018262900828, + "grad_norm": 9.521098349546563, + "learning_rate": 2.082942972246907e-07, + "loss": 0.9697, + "step": 12333 + }, + { + "epoch": 1.7461598357754653, + "grad_norm": 7.831877579929474, + "learning_rate": 2.0806529724635982e-07, + "loss": 0.9711, + "step": 12334 + }, + { + "epoch": 1.7463014086501025, + "grad_norm": 8.615965160784686, + "learning_rate": 2.0783641775332708e-07, + "loss": 0.9424, + "step": 12335 + }, + { + "epoch": 1.7464429815247398, + "grad_norm": 9.47955169454705, + "learning_rate": 2.0760765875762506e-07, + "loss": 0.9354, + "step": 12336 + }, + { + "epoch": 1.746584554399377, + "grad_norm": 9.997497626969938, + "learning_rate": 2.0737902027127888e-07, + "loss": 1.0258, + "step": 12337 + }, + { + "epoch": 1.7467261272740142, + "grad_norm": 8.521427298614148, + "learning_rate": 2.0715050230630807e-07, + "loss": 1.0145, + "step": 12338 + }, + { + "epoch": 1.7468677001486514, + "grad_norm": 
9.355826340373707, + "learning_rate": 2.069221048747261e-07, + "loss": 1.0461, + "step": 12339 + }, + { + "epoch": 1.7470092730232887, + "grad_norm": 10.560510933393157, + "learning_rate": 2.0669382798853887e-07, + "loss": 1.0186, + "step": 12340 + }, + { + "epoch": 1.747150845897926, + "grad_norm": 8.786519597414108, + "learning_rate": 2.064656716597474e-07, + "loss": 0.9409, + "step": 12341 + }, + { + "epoch": 1.7472924187725631, + "grad_norm": 9.293010209685784, + "learning_rate": 2.0623763590034567e-07, + "loss": 0.9905, + "step": 12342 + }, + { + "epoch": 1.7474339916472004, + "grad_norm": 10.188206185982198, + "learning_rate": 2.0600972072232105e-07, + "loss": 0.9602, + "step": 12343 + }, + { + "epoch": 1.7475755645218376, + "grad_norm": 7.577329110106747, + "learning_rate": 2.0578192613765453e-07, + "loss": 0.9612, + "step": 12344 + }, + { + "epoch": 1.7477171373964748, + "grad_norm": 8.383972372140573, + "learning_rate": 2.0555425215832176e-07, + "loss": 0.9345, + "step": 12345 + }, + { + "epoch": 1.747858710271112, + "grad_norm": 9.270085153857305, + "learning_rate": 2.0532669879629124e-07, + "loss": 1.0398, + "step": 12346 + }, + { + "epoch": 1.7480002831457493, + "grad_norm": 8.842416184200411, + "learning_rate": 2.050992660635248e-07, + "loss": 0.8873, + "step": 12347 + }, + { + "epoch": 1.7481418560203865, + "grad_norm": 10.696569687912985, + "learning_rate": 2.0487195397197928e-07, + "loss": 1.0581, + "step": 12348 + }, + { + "epoch": 1.7482834288950238, + "grad_norm": 11.382266898814988, + "learning_rate": 2.0464476253360344e-07, + "loss": 1.0265, + "step": 12349 + }, + { + "epoch": 1.748425001769661, + "grad_norm": 8.72554478734351, + "learning_rate": 2.044176917603413e-07, + "loss": 1.0172, + "step": 12350 + }, + { + "epoch": 1.7485665746442982, + "grad_norm": 10.1369486817612, + "learning_rate": 2.0419074166412893e-07, + "loss": 0.9216, + "step": 12351 + }, + { + "epoch": 1.7487081475189354, + "grad_norm": 9.943526833629669, + "learning_rate": 2.0396391225689817e-07, + "loss": 1.0162, + "step": 12352 + }, + { + "epoch": 1.7488497203935727, + "grad_norm": 10.324966940468979, + "learning_rate": 2.037372035505722e-07, + "loss": 0.9928, + "step": 12353 + }, + { + "epoch": 1.74899129326821, + "grad_norm": 8.738427493048707, + "learning_rate": 2.0351061555706901e-07, + "loss": 0.8993, + "step": 12354 + }, + { + "epoch": 1.7491328661428471, + "grad_norm": 10.164612317589615, + "learning_rate": 2.0328414828830078e-07, + "loss": 1.0142, + "step": 12355 + }, + { + "epoch": 1.7492744390174844, + "grad_norm": 8.13383496514617, + "learning_rate": 2.0305780175617213e-07, + "loss": 1.0036, + "step": 12356 + }, + { + "epoch": 1.7494160118921216, + "grad_norm": 11.197431365150049, + "learning_rate": 2.0283157597258241e-07, + "loss": 1.0548, + "step": 12357 + }, + { + "epoch": 1.7495575847667588, + "grad_norm": 8.80692369146154, + "learning_rate": 2.026054709494235e-07, + "loss": 0.939, + "step": 12358 + }, + { + "epoch": 1.749699157641396, + "grad_norm": 8.067177059115847, + "learning_rate": 2.0237948669858233e-07, + "loss": 0.8955, + "step": 12359 + }, + { + "epoch": 1.7498407305160333, + "grad_norm": 10.139649519120741, + "learning_rate": 2.0215362323193822e-07, + "loss": 1.0363, + "step": 12360 + }, + { + "epoch": 1.7499823033906705, + "grad_norm": 8.740260780337207, + "learning_rate": 2.0192788056136446e-07, + "loss": 0.9306, + "step": 12361 + }, + { + "epoch": 1.7501238762653075, + "grad_norm": 11.084040645506944, + "learning_rate": 2.0170225869872912e-07, + "loss": 1.064, + "step": 
12362 + }, + { + "epoch": 1.7502654491399448, + "grad_norm": 9.964633582039854, + "learning_rate": 2.0147675765589236e-07, + "loss": 0.8679, + "step": 12363 + }, + { + "epoch": 1.750407022014582, + "grad_norm": 12.807701440403873, + "learning_rate": 2.0125137744470863e-07, + "loss": 0.9899, + "step": 12364 + }, + { + "epoch": 1.7505485948892192, + "grad_norm": 8.951487446365627, + "learning_rate": 2.0102611807702539e-07, + "loss": 0.9016, + "step": 12365 + }, + { + "epoch": 1.7506901677638564, + "grad_norm": 8.535669767875891, + "learning_rate": 2.0080097956468537e-07, + "loss": 0.9291, + "step": 12366 + }, + { + "epoch": 1.7508317406384937, + "grad_norm": 8.08852143949365, + "learning_rate": 2.0057596191952327e-07, + "loss": 0.8867, + "step": 12367 + }, + { + "epoch": 1.750973313513131, + "grad_norm": 9.331021930950484, + "learning_rate": 2.0035106515336798e-07, + "loss": 0.8127, + "step": 12368 + }, + { + "epoch": 1.7511148863877681, + "grad_norm": 7.989335342663324, + "learning_rate": 2.001262892780434e-07, + "loss": 0.8885, + "step": 12369 + }, + { + "epoch": 1.7512564592624054, + "grad_norm": 7.518790801612198, + "learning_rate": 1.999016343053642e-07, + "loss": 1.0252, + "step": 12370 + }, + { + "epoch": 1.7513980321370426, + "grad_norm": 10.829953610364022, + "learning_rate": 1.996771002471415e-07, + "loss": 0.9558, + "step": 12371 + }, + { + "epoch": 1.7515396050116796, + "grad_norm": 9.85114816857699, + "learning_rate": 1.9945268711517807e-07, + "loss": 0.9599, + "step": 12372 + }, + { + "epoch": 1.7516811778863168, + "grad_norm": 10.188503847805048, + "learning_rate": 1.9922839492127199e-07, + "loss": 1.005, + "step": 12373 + }, + { + "epoch": 1.751822750760954, + "grad_norm": 9.359130448599712, + "learning_rate": 1.9900422367721355e-07, + "loss": 1.0208, + "step": 12374 + }, + { + "epoch": 1.7519643236355913, + "grad_norm": 9.584380604008794, + "learning_rate": 1.9878017339478695e-07, + "loss": 0.9632, + "step": 12375 + }, + { + "epoch": 1.7521058965102285, + "grad_norm": 10.189281097299512, + "learning_rate": 1.9855624408577136e-07, + "loss": 1.0212, + "step": 12376 + }, + { + "epoch": 1.7522474693848658, + "grad_norm": 8.257798641485532, + "learning_rate": 1.9833243576193734e-07, + "loss": 0.8773, + "step": 12377 + }, + { + "epoch": 1.752389042259503, + "grad_norm": 9.36580074375797, + "learning_rate": 1.9810874843505164e-07, + "loss": 0.9013, + "step": 12378 + }, + { + "epoch": 1.7525306151341402, + "grad_norm": 10.65577029941187, + "learning_rate": 1.9788518211687202e-07, + "loss": 0.9728, + "step": 12379 + }, + { + "epoch": 1.7526721880087774, + "grad_norm": 10.304554970954062, + "learning_rate": 1.9766173681915247e-07, + "loss": 0.969, + "step": 12380 + }, + { + "epoch": 1.7528137608834147, + "grad_norm": 9.954677107444347, + "learning_rate": 1.9743841255363827e-07, + "loss": 0.856, + "step": 12381 + }, + { + "epoch": 1.752955333758052, + "grad_norm": 9.380156052842809, + "learning_rate": 1.9721520933207006e-07, + "loss": 0.953, + "step": 12382 + }, + { + "epoch": 1.7530969066326891, + "grad_norm": 8.344602619855591, + "learning_rate": 1.9699212716618123e-07, + "loss": 0.9722, + "step": 12383 + }, + { + "epoch": 1.7532384795073264, + "grad_norm": 8.679984155455864, + "learning_rate": 1.9676916606769874e-07, + "loss": 0.8481, + "step": 12384 + }, + { + "epoch": 1.7533800523819636, + "grad_norm": 8.834665114095873, + "learning_rate": 1.9654632604834494e-07, + "loss": 1.037, + "step": 12385 + }, + { + "epoch": 1.7535216252566008, + "grad_norm": 8.519684829971752, + 
"learning_rate": 1.9632360711983212e-07, + "loss": 0.9771, + "step": 12386 + }, + { + "epoch": 1.753663198131238, + "grad_norm": 9.603078475725246, + "learning_rate": 1.9610100929387006e-07, + "loss": 0.9963, + "step": 12387 + }, + { + "epoch": 1.7538047710058753, + "grad_norm": 9.66986644885398, + "learning_rate": 1.9587853258215995e-07, + "loss": 1.0042, + "step": 12388 + }, + { + "epoch": 1.7539463438805125, + "grad_norm": 9.184303609980072, + "learning_rate": 1.9565617699639717e-07, + "loss": 0.9083, + "step": 12389 + }, + { + "epoch": 1.7540879167551497, + "grad_norm": 9.70780775314748, + "learning_rate": 1.9543394254827125e-07, + "loss": 0.9259, + "step": 12390 + }, + { + "epoch": 1.754229489629787, + "grad_norm": 8.31735615391774, + "learning_rate": 1.9521182924946426e-07, + "loss": 0.8734, + "step": 12391 + }, + { + "epoch": 1.7543710625044242, + "grad_norm": 11.462866497940617, + "learning_rate": 1.9498983711165347e-07, + "loss": 0.9948, + "step": 12392 + }, + { + "epoch": 1.7545126353790614, + "grad_norm": 9.77541189277157, + "learning_rate": 1.9476796614650766e-07, + "loss": 1.0848, + "step": 12393 + }, + { + "epoch": 1.7546542082536987, + "grad_norm": 8.731459675307642, + "learning_rate": 1.9454621636569138e-07, + "loss": 0.9391, + "step": 12394 + }, + { + "epoch": 1.754795781128336, + "grad_norm": 10.072779741641966, + "learning_rate": 1.9432458778086167e-07, + "loss": 1.0332, + "step": 12395 + }, + { + "epoch": 1.7549373540029731, + "grad_norm": 10.563932603822568, + "learning_rate": 1.9410308040366867e-07, + "loss": 0.9264, + "step": 12396 + }, + { + "epoch": 1.7550789268776104, + "grad_norm": 9.951325402090959, + "learning_rate": 1.9388169424575802e-07, + "loss": 1.051, + "step": 12397 + }, + { + "epoch": 1.7552204997522476, + "grad_norm": 10.872396223178635, + "learning_rate": 1.936604293187666e-07, + "loss": 0.9026, + "step": 12398 + }, + { + "epoch": 1.7553620726268848, + "grad_norm": 11.15368315074211, + "learning_rate": 1.93439285634327e-07, + "loss": 0.943, + "step": 12399 + }, + { + "epoch": 1.755503645501522, + "grad_norm": 8.503365691793979, + "learning_rate": 1.932182632040641e-07, + "loss": 0.9681, + "step": 12400 + }, + { + "epoch": 1.7556452183761593, + "grad_norm": 9.209925265096992, + "learning_rate": 1.929973620395975e-07, + "loss": 0.9726, + "step": 12401 + }, + { + "epoch": 1.7557867912507965, + "grad_norm": 12.232169116870113, + "learning_rate": 1.9277658215253904e-07, + "loss": 0.9771, + "step": 12402 + }, + { + "epoch": 1.7559283641254335, + "grad_norm": 8.69471978620614, + "learning_rate": 1.925559235544955e-07, + "loss": 1.0174, + "step": 12403 + }, + { + "epoch": 1.7560699370000707, + "grad_norm": 9.712903389231005, + "learning_rate": 1.9233538625706622e-07, + "loss": 0.9774, + "step": 12404 + }, + { + "epoch": 1.756211509874708, + "grad_norm": 9.307662219309934, + "learning_rate": 1.9211497027184556e-07, + "loss": 0.9663, + "step": 12405 + }, + { + "epoch": 1.7563530827493452, + "grad_norm": 9.36346865686461, + "learning_rate": 1.918946756104201e-07, + "loss": 0.9344, + "step": 12406 + }, + { + "epoch": 1.7564946556239824, + "grad_norm": 8.96054882480416, + "learning_rate": 1.9167450228436995e-07, + "loss": 1.0261, + "step": 12407 + }, + { + "epoch": 1.7566362284986197, + "grad_norm": 8.639953918334143, + "learning_rate": 1.9145445030527065e-07, + "loss": 0.905, + "step": 12408 + }, + { + "epoch": 1.756777801373257, + "grad_norm": 9.042775544642323, + "learning_rate": 1.9123451968468903e-07, + "loss": 0.97, + "step": 12409 + }, + { + "epoch": 
1.7569193742478941, + "grad_norm": 10.868188237418638, + "learning_rate": 1.910147104341875e-07, + "loss": 0.9893, + "step": 12410 + }, + { + "epoch": 1.7570609471225314, + "grad_norm": 9.390671309422004, + "learning_rate": 1.9079502256532073e-07, + "loss": 0.9814, + "step": 12411 + }, + { + "epoch": 1.7572025199971686, + "grad_norm": 9.51958414531878, + "learning_rate": 1.9057545608963807e-07, + "loss": 1.0014, + "step": 12412 + }, + { + "epoch": 1.7573440928718056, + "grad_norm": 8.904201553078837, + "learning_rate": 1.9035601101868168e-07, + "loss": 1.0235, + "step": 12413 + }, + { + "epoch": 1.7574856657464428, + "grad_norm": 9.121127208182745, + "learning_rate": 1.9013668736398761e-07, + "loss": 0.9992, + "step": 12414 + }, + { + "epoch": 1.75762723862108, + "grad_norm": 10.116211267451607, + "learning_rate": 1.899174851370858e-07, + "loss": 0.9228, + "step": 12415 + }, + { + "epoch": 1.7577688114957173, + "grad_norm": 8.694714082620292, + "learning_rate": 1.8969840434949926e-07, + "loss": 1.0065, + "step": 12416 + }, + { + "epoch": 1.7579103843703545, + "grad_norm": 9.529635333262283, + "learning_rate": 1.8947944501274517e-07, + "loss": 0.9214, + "step": 12417 + }, + { + "epoch": 1.7580519572449917, + "grad_norm": 10.533939188019577, + "learning_rate": 1.892606071383332e-07, + "loss": 0.888, + "step": 12418 + }, + { + "epoch": 1.758193530119629, + "grad_norm": 8.74377263402293, + "learning_rate": 1.8904189073776835e-07, + "loss": 0.8965, + "step": 12419 + }, + { + "epoch": 1.7583351029942662, + "grad_norm": 8.373653929985414, + "learning_rate": 1.8882329582254833e-07, + "loss": 0.938, + "step": 12420 + }, + { + "epoch": 1.7584766758689034, + "grad_norm": 9.093476071770965, + "learning_rate": 1.8860482240416424e-07, + "loss": 0.94, + "step": 12421 + }, + { + "epoch": 1.7586182487435407, + "grad_norm": 8.760951137379033, + "learning_rate": 1.8838647049410157e-07, + "loss": 0.8522, + "step": 12422 + }, + { + "epoch": 1.758759821618178, + "grad_norm": 8.922718976883093, + "learning_rate": 1.881682401038379e-07, + "loss": 1.0468, + "step": 12423 + }, + { + "epoch": 1.7589013944928151, + "grad_norm": 8.539686565856291, + "learning_rate": 1.8795013124484674e-07, + "loss": 0.8883, + "step": 12424 + }, + { + "epoch": 1.7590429673674524, + "grad_norm": 9.015635578010475, + "learning_rate": 1.8773214392859284e-07, + "loss": 0.9723, + "step": 12425 + }, + { + "epoch": 1.7591845402420896, + "grad_norm": 8.691114373610313, + "learning_rate": 1.8751427816653623e-07, + "loss": 0.9502, + "step": 12426 + }, + { + "epoch": 1.7593261131167268, + "grad_norm": 9.233239738274596, + "learning_rate": 1.8729653397012993e-07, + "loss": 0.8738, + "step": 12427 + }, + { + "epoch": 1.759467685991364, + "grad_norm": 11.790590342721725, + "learning_rate": 1.870789113508198e-07, + "loss": 0.9089, + "step": 12428 + }, + { + "epoch": 1.7596092588660013, + "grad_norm": 9.453887252971368, + "learning_rate": 1.8686141032004724e-07, + "loss": 1.041, + "step": 12429 + }, + { + "epoch": 1.7597508317406385, + "grad_norm": 9.755689086038824, + "learning_rate": 1.8664403088924533e-07, + "loss": 1.1162, + "step": 12430 + }, + { + "epoch": 1.7598924046152757, + "grad_norm": 10.57415742155285, + "learning_rate": 1.8642677306984213e-07, + "loss": 0.9731, + "step": 12431 + }, + { + "epoch": 1.760033977489913, + "grad_norm": 8.711987131969241, + "learning_rate": 1.8620963687325772e-07, + "loss": 0.9267, + "step": 12432 + }, + { + "epoch": 1.7601755503645502, + "grad_norm": 10.041174236796756, + "learning_rate": 
1.859926223109082e-07, + "loss": 0.915, + "step": 12433 + }, + { + "epoch": 1.7603171232391874, + "grad_norm": 7.139809993919136, + "learning_rate": 1.857757293942006e-07, + "loss": 0.9165, + "step": 12434 + }, + { + "epoch": 1.7604586961138247, + "grad_norm": 10.607759018360172, + "learning_rate": 1.855589581345374e-07, + "loss": 0.911, + "step": 12435 + }, + { + "epoch": 1.760600268988462, + "grad_norm": 9.56832648398358, + "learning_rate": 1.8534230854331454e-07, + "loss": 0.8656, + "step": 12436 + }, + { + "epoch": 1.7607418418630991, + "grad_norm": 9.249565732917334, + "learning_rate": 1.851257806319201e-07, + "loss": 0.9097, + "step": 12437 + }, + { + "epoch": 1.7608834147377364, + "grad_norm": 9.979124404934542, + "learning_rate": 1.8490937441173807e-07, + "loss": 1.0198, + "step": 12438 + }, + { + "epoch": 1.7610249876123736, + "grad_norm": 10.41489507233154, + "learning_rate": 1.846930898941432e-07, + "loss": 1.0692, + "step": 12439 + }, + { + "epoch": 1.7611665604870108, + "grad_norm": 8.327076457649646, + "learning_rate": 1.8447692709050668e-07, + "loss": 0.9545, + "step": 12440 + }, + { + "epoch": 1.761308133361648, + "grad_norm": 10.527462966045753, + "learning_rate": 1.842608860121914e-07, + "loss": 0.9721, + "step": 12441 + }, + { + "epoch": 1.7614497062362853, + "grad_norm": 8.318852112886383, + "learning_rate": 1.8404496667055433e-07, + "loss": 0.9888, + "step": 12442 + }, + { + "epoch": 1.7615912791109225, + "grad_norm": 10.55490362819983, + "learning_rate": 1.8382916907694725e-07, + "loss": 0.973, + "step": 12443 + }, + { + "epoch": 1.7617328519855595, + "grad_norm": 10.341837634185497, + "learning_rate": 1.8361349324271304e-07, + "loss": 1.0644, + "step": 12444 + }, + { + "epoch": 1.7618744248601967, + "grad_norm": 8.335766805744154, + "learning_rate": 1.8339793917919096e-07, + "loss": 0.9497, + "step": 12445 + }, + { + "epoch": 1.762015997734834, + "grad_norm": 10.00767413363072, + "learning_rate": 1.831825068977111e-07, + "loss": 1.0624, + "step": 12446 + }, + { + "epoch": 1.7621575706094712, + "grad_norm": 9.483345290066422, + "learning_rate": 1.8296719640960025e-07, + "loss": 0.9489, + "step": 12447 + }, + { + "epoch": 1.7622991434841084, + "grad_norm": 7.897250757006705, + "learning_rate": 1.8275200772617603e-07, + "loss": 0.9576, + "step": 12448 + }, + { + "epoch": 1.7624407163587457, + "grad_norm": 7.906454814926758, + "learning_rate": 1.8253694085875047e-07, + "loss": 0.9297, + "step": 12449 + }, + { + "epoch": 1.762582289233383, + "grad_norm": 9.724506402911842, + "learning_rate": 1.8232199581863036e-07, + "loss": 0.9329, + "step": 12450 + }, + { + "epoch": 1.7627238621080201, + "grad_norm": 9.362290783162852, + "learning_rate": 1.8210717261711448e-07, + "loss": 1.0372, + "step": 12451 + }, + { + "epoch": 1.7628654349826574, + "grad_norm": 9.906541501157715, + "learning_rate": 1.8189247126549653e-07, + "loss": 0.9685, + "step": 12452 + }, + { + "epoch": 1.7630070078572946, + "grad_norm": 9.74443443101605, + "learning_rate": 1.816778917750625e-07, + "loss": 0.9725, + "step": 12453 + }, + { + "epoch": 1.7631485807319318, + "grad_norm": 10.087145082264087, + "learning_rate": 1.8146343415709367e-07, + "loss": 0.9901, + "step": 12454 + }, + { + "epoch": 1.7632901536065688, + "grad_norm": 9.357457137991895, + "learning_rate": 1.8124909842286293e-07, + "loss": 0.9665, + "step": 12455 + }, + { + "epoch": 1.763431726481206, + "grad_norm": 9.106301994934563, + "learning_rate": 1.810348845836385e-07, + "loss": 1.001, + "step": 12456 + }, + { + "epoch": 1.7635732993558433, 
+ "grad_norm": 9.313200444243023, + "learning_rate": 1.8082079265068053e-07, + "loss": 1.0108, + "step": 12457 + }, + { + "epoch": 1.7637148722304805, + "grad_norm": 11.020884715046417, + "learning_rate": 1.806068226352445e-07, + "loss": 0.9816, + "step": 12458 + }, + { + "epoch": 1.7638564451051177, + "grad_norm": 8.665196171745, + "learning_rate": 1.8039297454857885e-07, + "loss": 0.8374, + "step": 12459 + }, + { + "epoch": 1.763998017979755, + "grad_norm": 7.833464303511236, + "learning_rate": 1.8017924840192435e-07, + "loss": 0.9802, + "step": 12460 + }, + { + "epoch": 1.7641395908543922, + "grad_norm": 7.730068294721881, + "learning_rate": 1.7996564420651758e-07, + "loss": 0.8988, + "step": 12461 + }, + { + "epoch": 1.7642811637290294, + "grad_norm": 10.202516686618193, + "learning_rate": 1.7975216197358648e-07, + "loss": 1.0772, + "step": 12462 + }, + { + "epoch": 1.7644227366036667, + "grad_norm": 9.037809847299151, + "learning_rate": 1.7953880171435455e-07, + "loss": 0.9727, + "step": 12463 + }, + { + "epoch": 1.764564309478304, + "grad_norm": 9.37498982746844, + "learning_rate": 1.7932556344003703e-07, + "loss": 0.8676, + "step": 12464 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 9.135013468481365, + "learning_rate": 1.7911244716184468e-07, + "loss": 0.8966, + "step": 12465 + }, + { + "epoch": 1.7648474552275784, + "grad_norm": 10.398555625972547, + "learning_rate": 1.7889945289098042e-07, + "loss": 0.9892, + "step": 12466 + }, + { + "epoch": 1.7649890281022156, + "grad_norm": 8.96522202701374, + "learning_rate": 1.786865806386412e-07, + "loss": 0.8309, + "step": 12467 + }, + { + "epoch": 1.7651306009768528, + "grad_norm": 10.535276101816407, + "learning_rate": 1.7847383041601772e-07, + "loss": 0.9689, + "step": 12468 + }, + { + "epoch": 1.76527217385149, + "grad_norm": 7.707651661467421, + "learning_rate": 1.7826120223429416e-07, + "loss": 0.9354, + "step": 12469 + }, + { + "epoch": 1.7654137467261273, + "grad_norm": 9.91549530377281, + "learning_rate": 1.7804869610464766e-07, + "loss": 0.918, + "step": 12470 + }, + { + "epoch": 1.7655553196007645, + "grad_norm": 9.764364371758802, + "learning_rate": 1.7783631203825007e-07, + "loss": 0.9312, + "step": 12471 + }, + { + "epoch": 1.7656968924754017, + "grad_norm": 10.057806780500743, + "learning_rate": 1.7762405004626586e-07, + "loss": 1.028, + "step": 12472 + }, + { + "epoch": 1.765838465350039, + "grad_norm": 9.301919341199202, + "learning_rate": 1.7741191013985387e-07, + "loss": 0.9831, + "step": 12473 + }, + { + "epoch": 1.7659800382246762, + "grad_norm": 10.96581122435114, + "learning_rate": 1.7719989233016572e-07, + "loss": 1.0514, + "step": 12474 + }, + { + "epoch": 1.7661216110993134, + "grad_norm": 10.120258492647697, + "learning_rate": 1.7698799662834776e-07, + "loss": 0.9736, + "step": 12475 + }, + { + "epoch": 1.7662631839739507, + "grad_norm": 8.022169389958766, + "learning_rate": 1.7677622304553833e-07, + "loss": 0.8292, + "step": 12476 + }, + { + "epoch": 1.7664047568485879, + "grad_norm": 9.25098723220975, + "learning_rate": 1.76564571592871e-07, + "loss": 0.9318, + "step": 12477 + }, + { + "epoch": 1.7665463297232251, + "grad_norm": 10.032099892807953, + "learning_rate": 1.7635304228147104e-07, + "loss": 1.0354, + "step": 12478 + }, + { + "epoch": 1.7666879025978623, + "grad_norm": 9.860911689377566, + "learning_rate": 1.7614163512245957e-07, + "loss": 1.0525, + "step": 12479 + }, + { + "epoch": 1.7668294754724996, + "grad_norm": 8.891245842091632, + "learning_rate": 1.7593035012694992e-07, + "loss": 
0.9926, + "step": 12480 + }, + { + "epoch": 1.7669710483471368, + "grad_norm": 9.35994274219704, + "learning_rate": 1.757191873060482e-07, + "loss": 0.9648, + "step": 12481 + }, + { + "epoch": 1.767112621221774, + "grad_norm": 10.550930209981413, + "learning_rate": 1.755081466708561e-07, + "loss": 1.0819, + "step": 12482 + }, + { + "epoch": 1.7672541940964113, + "grad_norm": 9.948267830980129, + "learning_rate": 1.752972282324672e-07, + "loss": 0.8663, + "step": 12483 + }, + { + "epoch": 1.7673957669710485, + "grad_norm": 9.020363552438674, + "learning_rate": 1.750864320019699e-07, + "loss": 1.0163, + "step": 12484 + }, + { + "epoch": 1.7675373398456857, + "grad_norm": 9.828382334046596, + "learning_rate": 1.7487575799044505e-07, + "loss": 0.9904, + "step": 12485 + }, + { + "epoch": 1.7676789127203227, + "grad_norm": 9.335955069936396, + "learning_rate": 1.746652062089685e-07, + "loss": 0.9225, + "step": 12486 + }, + { + "epoch": 1.76782048559496, + "grad_norm": 9.294074962401492, + "learning_rate": 1.7445477666860749e-07, + "loss": 0.9712, + "step": 12487 + }, + { + "epoch": 1.7679620584695972, + "grad_norm": 11.081598552422426, + "learning_rate": 1.7424446938042517e-07, + "loss": 0.9865, + "step": 12488 + }, + { + "epoch": 1.7681036313442344, + "grad_norm": 9.311886351163881, + "learning_rate": 1.740342843554771e-07, + "loss": 0.8697, + "step": 12489 + }, + { + "epoch": 1.7682452042188717, + "grad_norm": 10.299715382615707, + "learning_rate": 1.7382422160481193e-07, + "loss": 0.9316, + "step": 12490 + }, + { + "epoch": 1.7683867770935089, + "grad_norm": 10.15633000268971, + "learning_rate": 1.7361428113947392e-07, + "loss": 0.9108, + "step": 12491 + }, + { + "epoch": 1.7685283499681461, + "grad_norm": 9.814894851688786, + "learning_rate": 1.7340446297049784e-07, + "loss": 1.093, + "step": 12492 + }, + { + "epoch": 1.7686699228427833, + "grad_norm": 9.936924179956447, + "learning_rate": 1.7319476710891431e-07, + "loss": 0.9854, + "step": 12493 + }, + { + "epoch": 1.7688114957174206, + "grad_norm": 7.727104083174177, + "learning_rate": 1.7298519356574728e-07, + "loss": 0.8401, + "step": 12494 + }, + { + "epoch": 1.7689530685920578, + "grad_norm": 9.470192173072407, + "learning_rate": 1.7277574235201295e-07, + "loss": 0.9678, + "step": 12495 + }, + { + "epoch": 1.7690946414666948, + "grad_norm": 10.036151009595379, + "learning_rate": 1.7256641347872304e-07, + "loss": 1.0291, + "step": 12496 + }, + { + "epoch": 1.769236214341332, + "grad_norm": 9.75427387172661, + "learning_rate": 1.723572069568813e-07, + "loss": 0.9602, + "step": 12497 + }, + { + "epoch": 1.7693777872159693, + "grad_norm": 9.283422350727784, + "learning_rate": 1.7214812279748584e-07, + "loss": 0.869, + "step": 12498 + }, + { + "epoch": 1.7695193600906065, + "grad_norm": 10.687556751836711, + "learning_rate": 1.719391610115273e-07, + "loss": 1.0596, + "step": 12499 + }, + { + "epoch": 1.7696609329652437, + "grad_norm": 10.21514474341437, + "learning_rate": 1.7173032160999164e-07, + "loss": 1.0173, + "step": 12500 + }, + { + "epoch": 1.769802505839881, + "grad_norm": 11.686854380354758, + "learning_rate": 1.7152160460385703e-07, + "loss": 0.9274, + "step": 12501 + }, + { + "epoch": 1.7699440787145182, + "grad_norm": 11.735647973146152, + "learning_rate": 1.7131301000409496e-07, + "loss": 1.0789, + "step": 12502 + }, + { + "epoch": 1.7700856515891554, + "grad_norm": 10.824300864110224, + "learning_rate": 1.7110453782167218e-07, + "loss": 0.9453, + "step": 12503 + }, + { + "epoch": 1.7702272244637927, + "grad_norm": 
9.296369471549767, + "learning_rate": 1.7089618806754692e-07, + "loss": 1.0339, + "step": 12504 + }, + { + "epoch": 1.77036879733843, + "grad_norm": 10.103789919616604, + "learning_rate": 1.7068796075267264e-07, + "loss": 0.9927, + "step": 12505 + }, + { + "epoch": 1.7705103702130671, + "grad_norm": 7.815215836544589, + "learning_rate": 1.7047985588799525e-07, + "loss": 0.9494, + "step": 12506 + }, + { + "epoch": 1.7706519430877043, + "grad_norm": 9.210628539930699, + "learning_rate": 1.7027187348445522e-07, + "loss": 1.0317, + "step": 12507 + }, + { + "epoch": 1.7707935159623416, + "grad_norm": 11.137385642409708, + "learning_rate": 1.700640135529852e-07, + "loss": 1.012, + "step": 12508 + }, + { + "epoch": 1.7709350888369788, + "grad_norm": 8.529586909354888, + "learning_rate": 1.6985627610451278e-07, + "loss": 1.023, + "step": 12509 + }, + { + "epoch": 1.771076661711616, + "grad_norm": 8.461413594638383, + "learning_rate": 1.6964866114995871e-07, + "loss": 0.9703, + "step": 12510 + }, + { + "epoch": 1.7712182345862533, + "grad_norm": 9.511791741249713, + "learning_rate": 1.6944116870023675e-07, + "loss": 0.8958, + "step": 12511 + }, + { + "epoch": 1.7713598074608905, + "grad_norm": 10.360892390401913, + "learning_rate": 1.6923379876625568e-07, + "loss": 1.0038, + "step": 12512 + }, + { + "epoch": 1.7715013803355277, + "grad_norm": 9.199278048508456, + "learning_rate": 1.690265513589151e-07, + "loss": 0.9102, + "step": 12513 + }, + { + "epoch": 1.771642953210165, + "grad_norm": 7.59267540380844, + "learning_rate": 1.6881942648911077e-07, + "loss": 0.863, + "step": 12514 + }, + { + "epoch": 1.7717845260848022, + "grad_norm": 7.37738567495103, + "learning_rate": 1.6861242416773087e-07, + "loss": 0.9862, + "step": 12515 + }, + { + "epoch": 1.7719260989594394, + "grad_norm": 9.085900973298246, + "learning_rate": 1.684055444056573e-07, + "loss": 0.9686, + "step": 12516 + }, + { + "epoch": 1.7720676718340767, + "grad_norm": 9.630536357326374, + "learning_rate": 1.6819878721376637e-07, + "loss": 0.9799, + "step": 12517 + }, + { + "epoch": 1.7722092447087139, + "grad_norm": 9.335590997753904, + "learning_rate": 1.67992152602926e-07, + "loss": 1.0369, + "step": 12518 + }, + { + "epoch": 1.7723508175833511, + "grad_norm": 9.650558819180418, + "learning_rate": 1.6778564058399977e-07, + "loss": 1.0652, + "step": 12519 + }, + { + "epoch": 1.7724923904579883, + "grad_norm": 9.219898217026271, + "learning_rate": 1.6757925116784313e-07, + "loss": 0.957, + "step": 12520 + }, + { + "epoch": 1.7726339633326256, + "grad_norm": 9.87358952360115, + "learning_rate": 1.673729843653063e-07, + "loss": 0.967, + "step": 12521 + }, + { + "epoch": 1.7727755362072628, + "grad_norm": 9.889696990675843, + "learning_rate": 1.6716684018723256e-07, + "loss": 0.9599, + "step": 12522 + }, + { + "epoch": 1.7729171090819, + "grad_norm": 8.309061586940272, + "learning_rate": 1.6696081864445823e-07, + "loss": 0.8866, + "step": 12523 + }, + { + "epoch": 1.7730586819565373, + "grad_norm": 8.588240421364334, + "learning_rate": 1.6675491974781438e-07, + "loss": 0.8765, + "step": 12524 + }, + { + "epoch": 1.7732002548311745, + "grad_norm": 8.687927674668652, + "learning_rate": 1.665491435081243e-07, + "loss": 0.8543, + "step": 12525 + }, + { + "epoch": 1.7733418277058117, + "grad_norm": 8.530762151945535, + "learning_rate": 1.6634348993620624e-07, + "loss": 0.9192, + "step": 12526 + }, + { + "epoch": 1.7734834005804487, + "grad_norm": 8.616976779495195, + "learning_rate": 1.661379590428705e-07, + "loss": 0.9594, + "step": 12527 + }, 
+ { + "epoch": 1.773624973455086, + "grad_norm": 7.656064401050784, + "learning_rate": 1.6593255083892228e-07, + "loss": 0.9885, + "step": 12528 + }, + { + "epoch": 1.7737665463297232, + "grad_norm": 7.118070210594246, + "learning_rate": 1.6572726533515936e-07, + "loss": 0.8428, + "step": 12529 + }, + { + "epoch": 1.7739081192043604, + "grad_norm": 9.256897287963302, + "learning_rate": 1.6552210254237395e-07, + "loss": 1.0063, + "step": 12530 + }, + { + "epoch": 1.7740496920789977, + "grad_norm": 8.921041205839655, + "learning_rate": 1.6531706247135042e-07, + "loss": 0.951, + "step": 12531 + }, + { + "epoch": 1.7741912649536349, + "grad_norm": 8.829435052133503, + "learning_rate": 1.6511214513286826e-07, + "loss": 0.9142, + "step": 12532 + }, + { + "epoch": 1.7743328378282721, + "grad_norm": 10.886403736584363, + "learning_rate": 1.6490735053770023e-07, + "loss": 0.9697, + "step": 12533 + }, + { + "epoch": 1.7744744107029093, + "grad_norm": 8.933378536432636, + "learning_rate": 1.6470267869661105e-07, + "loss": 0.9557, + "step": 12534 + }, + { + "epoch": 1.7746159835775466, + "grad_norm": 10.132994333251887, + "learning_rate": 1.6449812962036128e-07, + "loss": 0.9591, + "step": 12535 + }, + { + "epoch": 1.7747575564521838, + "grad_norm": 9.903411940043654, + "learning_rate": 1.6429370331970285e-07, + "loss": 1.0265, + "step": 12536 + }, + { + "epoch": 1.774899129326821, + "grad_norm": 8.532740647873178, + "learning_rate": 1.640893998053833e-07, + "loss": 0.8891, + "step": 12537 + }, + { + "epoch": 1.775040702201458, + "grad_norm": 9.38087096442913, + "learning_rate": 1.6388521908814181e-07, + "loss": 1.0177, + "step": 12538 + }, + { + "epoch": 1.7751822750760953, + "grad_norm": 9.832302836932376, + "learning_rate": 1.6368116117871257e-07, + "loss": 0.9956, + "step": 12539 + }, + { + "epoch": 1.7753238479507325, + "grad_norm": 9.364017004102726, + "learning_rate": 1.6347722608782284e-07, + "loss": 0.957, + "step": 12540 + }, + { + "epoch": 1.7754654208253697, + "grad_norm": 11.464359284908886, + "learning_rate": 1.6327341382619294e-07, + "loss": 0.9263, + "step": 12541 + }, + { + "epoch": 1.775606993700007, + "grad_norm": 8.787913987458944, + "learning_rate": 1.6306972440453788e-07, + "loss": 0.8976, + "step": 12542 + }, + { + "epoch": 1.7757485665746442, + "grad_norm": 8.325463762625658, + "learning_rate": 1.6286615783356468e-07, + "loss": 0.971, + "step": 12543 + }, + { + "epoch": 1.7758901394492814, + "grad_norm": 8.536892879675984, + "learning_rate": 1.626627141239745e-07, + "loss": 0.9654, + "step": 12544 + }, + { + "epoch": 1.7760317123239187, + "grad_norm": 9.543082275215038, + "learning_rate": 1.6245939328646322e-07, + "loss": 1.0564, + "step": 12545 + }, + { + "epoch": 1.7761732851985559, + "grad_norm": 9.858151125628105, + "learning_rate": 1.622561953317181e-07, + "loss": 0.9744, + "step": 12546 + }, + { + "epoch": 1.7763148580731931, + "grad_norm": 9.036058459197093, + "learning_rate": 1.620531202704223e-07, + "loss": 0.9476, + "step": 12547 + }, + { + "epoch": 1.7764564309478303, + "grad_norm": 8.958166596982714, + "learning_rate": 1.6185016811325033e-07, + "loss": 0.9078, + "step": 12548 + }, + { + "epoch": 1.7765980038224676, + "grad_norm": 9.471883425304526, + "learning_rate": 1.6164733887087168e-07, + "loss": 1.0065, + "step": 12549 + }, + { + "epoch": 1.7767395766971048, + "grad_norm": 10.187926113835758, + "learning_rate": 1.614446325539487e-07, + "loss": 1.0507, + "step": 12550 + }, + { + "epoch": 1.776881149571742, + "grad_norm": 7.7425185978647315, + "learning_rate": 
1.6124204917313811e-07, + "loss": 0.9452, + "step": 12551 + }, + { + "epoch": 1.7770227224463793, + "grad_norm": 10.748651109389806, + "learning_rate": 1.6103958873908893e-07, + "loss": 0.9412, + "step": 12552 + }, + { + "epoch": 1.7771642953210165, + "grad_norm": 9.392409372509313, + "learning_rate": 1.608372512624448e-07, + "loss": 0.9469, + "step": 12553 + }, + { + "epoch": 1.7773058681956537, + "grad_norm": 9.326102952952134, + "learning_rate": 1.6063503675384202e-07, + "loss": 0.9053, + "step": 12554 + }, + { + "epoch": 1.777447441070291, + "grad_norm": 10.04385980950784, + "learning_rate": 1.604329452239109e-07, + "loss": 0.9152, + "step": 12555 + }, + { + "epoch": 1.7775890139449282, + "grad_norm": 7.997577777375834, + "learning_rate": 1.6023097668327574e-07, + "loss": 0.9349, + "step": 12556 + }, + { + "epoch": 1.7777305868195654, + "grad_norm": 11.738411890911953, + "learning_rate": 1.6002913114255309e-07, + "loss": 1.121, + "step": 12557 + }, + { + "epoch": 1.7778721596942026, + "grad_norm": 9.701798451239675, + "learning_rate": 1.5982740861235468e-07, + "loss": 1.0294, + "step": 12558 + }, + { + "epoch": 1.7780137325688399, + "grad_norm": 8.342651294848743, + "learning_rate": 1.5962580910328402e-07, + "loss": 0.9253, + "step": 12559 + }, + { + "epoch": 1.778155305443477, + "grad_norm": 8.264005852164516, + "learning_rate": 1.594243326259401e-07, + "loss": 0.9247, + "step": 12560 + }, + { + "epoch": 1.7782968783181143, + "grad_norm": 9.316402564857762, + "learning_rate": 1.5922297919091334e-07, + "loss": 0.9678, + "step": 12561 + }, + { + "epoch": 1.7784384511927516, + "grad_norm": 10.605980578694481, + "learning_rate": 1.590217488087892e-07, + "loss": 0.9553, + "step": 12562 + }, + { + "epoch": 1.7785800240673888, + "grad_norm": 11.098878862277564, + "learning_rate": 1.5882064149014637e-07, + "loss": 1.0829, + "step": 12563 + }, + { + "epoch": 1.778721596942026, + "grad_norm": 9.233737155424599, + "learning_rate": 1.5861965724555673e-07, + "loss": 0.907, + "step": 12564 + }, + { + "epoch": 1.7788631698166633, + "grad_norm": 9.106547890256717, + "learning_rate": 1.5841879608558652e-07, + "loss": 0.932, + "step": 12565 + }, + { + "epoch": 1.7790047426913005, + "grad_norm": 9.557161324397867, + "learning_rate": 1.5821805802079343e-07, + "loss": 1.0524, + "step": 12566 + }, + { + "epoch": 1.7791463155659377, + "grad_norm": 9.401100954553932, + "learning_rate": 1.5801744306173094e-07, + "loss": 0.9568, + "step": 12567 + }, + { + "epoch": 1.779287888440575, + "grad_norm": 8.914139529136948, + "learning_rate": 1.5781695121894563e-07, + "loss": 0.9691, + "step": 12568 + }, + { + "epoch": 1.779429461315212, + "grad_norm": 10.55766050086215, + "learning_rate": 1.5761658250297658e-07, + "loss": 0.9681, + "step": 12569 + }, + { + "epoch": 1.7795710341898492, + "grad_norm": 9.894473033903946, + "learning_rate": 1.5741633692435725e-07, + "loss": 0.9386, + "step": 12570 + }, + { + "epoch": 1.7797126070644864, + "grad_norm": 9.836401724673518, + "learning_rate": 1.572162144936143e-07, + "loss": 0.8889, + "step": 12571 + }, + { + "epoch": 1.7798541799391236, + "grad_norm": 8.499110455808834, + "learning_rate": 1.5701621522126843e-07, + "loss": 0.9562, + "step": 12572 + }, + { + "epoch": 1.7799957528137609, + "grad_norm": 7.9116686922141355, + "learning_rate": 1.568163391178326e-07, + "loss": 0.8851, + "step": 12573 + }, + { + "epoch": 1.780137325688398, + "grad_norm": 9.137572106365205, + "learning_rate": 1.5661658619381515e-07, + "loss": 0.9452, + "step": 12574 + }, + { + "epoch": 
1.7802788985630353, + "grad_norm": 9.397369552401061, + "learning_rate": 1.564169564597165e-07, + "loss": 0.8766, + "step": 12575 + }, + { + "epoch": 1.7804204714376726, + "grad_norm": 8.951571824130028, + "learning_rate": 1.5621744992603049e-07, + "loss": 1.0462, + "step": 12576 + }, + { + "epoch": 1.7805620443123098, + "grad_norm": 7.9498005871984425, + "learning_rate": 1.5601806660324598e-07, + "loss": 0.8393, + "step": 12577 + }, + { + "epoch": 1.780703617186947, + "grad_norm": 9.85963654813544, + "learning_rate": 1.558188065018437e-07, + "loss": 1.0287, + "step": 12578 + }, + { + "epoch": 1.780845190061584, + "grad_norm": 8.920173977237198, + "learning_rate": 1.5561966963229925e-07, + "loss": 0.9183, + "step": 12579 + }, + { + "epoch": 1.7809867629362213, + "grad_norm": 10.610458655767063, + "learning_rate": 1.5542065600508e-07, + "loss": 0.9157, + "step": 12580 + }, + { + "epoch": 1.7811283358108585, + "grad_norm": 9.03962081991668, + "learning_rate": 1.5522176563064928e-07, + "loss": 0.9413, + "step": 12581 + }, + { + "epoch": 1.7812699086854957, + "grad_norm": 8.203066871073805, + "learning_rate": 1.550229985194618e-07, + "loss": 0.9795, + "step": 12582 + }, + { + "epoch": 1.781411481560133, + "grad_norm": 9.653947771796075, + "learning_rate": 1.5482435468196695e-07, + "loss": 0.9558, + "step": 12583 + }, + { + "epoch": 1.7815530544347702, + "grad_norm": 8.746791578346832, + "learning_rate": 1.5462583412860692e-07, + "loss": 0.9142, + "step": 12584 + }, + { + "epoch": 1.7816946273094074, + "grad_norm": 8.851707733390393, + "learning_rate": 1.5442743686981787e-07, + "loss": 0.9191, + "step": 12585 + }, + { + "epoch": 1.7818362001840446, + "grad_norm": 9.045767655405811, + "learning_rate": 1.542291629160303e-07, + "loss": 1.1069, + "step": 12586 + }, + { + "epoch": 1.7819777730586819, + "grad_norm": 9.311652842654164, + "learning_rate": 1.5403101227766587e-07, + "loss": 1.0081, + "step": 12587 + }, + { + "epoch": 1.782119345933319, + "grad_norm": 8.883965919783078, + "learning_rate": 1.538329849651421e-07, + "loss": 0.9805, + "step": 12588 + }, + { + "epoch": 1.7822609188079563, + "grad_norm": 11.51143906599506, + "learning_rate": 1.536350809888687e-07, + "loss": 1.0494, + "step": 12589 + }, + { + "epoch": 1.7824024916825936, + "grad_norm": 9.15228998263656, + "learning_rate": 1.534373003592496e-07, + "loss": 0.9368, + "step": 12590 + }, + { + "epoch": 1.7825440645572308, + "grad_norm": 9.014593795959547, + "learning_rate": 1.5323964308668227e-07, + "loss": 0.8895, + "step": 12591 + }, + { + "epoch": 1.782685637431868, + "grad_norm": 8.886274885893203, + "learning_rate": 1.5304210918155677e-07, + "loss": 1.0013, + "step": 12592 + }, + { + "epoch": 1.7828272103065053, + "grad_norm": 10.250938930771285, + "learning_rate": 1.5284469865425784e-07, + "loss": 0.8964, + "step": 12593 + }, + { + "epoch": 1.7829687831811425, + "grad_norm": 10.149388457418029, + "learning_rate": 1.5264741151516272e-07, + "loss": 0.9657, + "step": 12594 + }, + { + "epoch": 1.7831103560557797, + "grad_norm": 9.014827382259385, + "learning_rate": 1.524502477746434e-07, + "loss": 0.9409, + "step": 12595 + }, + { + "epoch": 1.783251928930417, + "grad_norm": 10.78189500731344, + "learning_rate": 1.522532074430641e-07, + "loss": 0.91, + "step": 12596 + }, + { + "epoch": 1.7833935018050542, + "grad_norm": 9.192396040124473, + "learning_rate": 1.5205629053078262e-07, + "loss": 0.9149, + "step": 12597 + }, + { + "epoch": 1.7835350746796914, + "grad_norm": 10.069513187100721, + "learning_rate": 1.5185949704815185e-07, + 
"loss": 0.9813, + "step": 12598 + }, + { + "epoch": 1.7836766475543286, + "grad_norm": 9.143010172244457, + "learning_rate": 1.5166282700551594e-07, + "loss": 0.9468, + "step": 12599 + }, + { + "epoch": 1.7838182204289659, + "grad_norm": 9.110506016970403, + "learning_rate": 1.5146628041321443e-07, + "loss": 0.9728, + "step": 12600 + }, + { + "epoch": 1.783959793303603, + "grad_norm": 8.866250896983557, + "learning_rate": 1.5126985728157934e-07, + "loss": 1.0479, + "step": 12601 + }, + { + "epoch": 1.7841013661782403, + "grad_norm": 8.852124889046106, + "learning_rate": 1.5107355762093685e-07, + "loss": 0.9614, + "step": 12602 + }, + { + "epoch": 1.7842429390528776, + "grad_norm": 9.054288524563107, + "learning_rate": 1.5087738144160562e-07, + "loss": 0.9686, + "step": 12603 + }, + { + "epoch": 1.7843845119275148, + "grad_norm": 10.4861958908011, + "learning_rate": 1.5068132875389913e-07, + "loss": 0.8913, + "step": 12604 + }, + { + "epoch": 1.784526084802152, + "grad_norm": 7.7628724038054155, + "learning_rate": 1.5048539956812324e-07, + "loss": 0.9005, + "step": 12605 + }, + { + "epoch": 1.7846676576767893, + "grad_norm": 9.160719682165526, + "learning_rate": 1.5028959389457782e-07, + "loss": 0.9451, + "step": 12606 + }, + { + "epoch": 1.7848092305514265, + "grad_norm": 7.526334686030933, + "learning_rate": 1.5009391174355735e-07, + "loss": 0.9281, + "step": 12607 + }, + { + "epoch": 1.7849508034260637, + "grad_norm": 9.877468524902989, + "learning_rate": 1.49898353125347e-07, + "loss": 0.9212, + "step": 12608 + }, + { + "epoch": 1.785092376300701, + "grad_norm": 8.102197193677426, + "learning_rate": 1.4970291805022825e-07, + "loss": 0.9627, + "step": 12609 + }, + { + "epoch": 1.785233949175338, + "grad_norm": 9.59583967978604, + "learning_rate": 1.4950760652847422e-07, + "loss": 0.947, + "step": 12610 + }, + { + "epoch": 1.7853755220499752, + "grad_norm": 9.879450314323343, + "learning_rate": 1.4931241857035343e-07, + "loss": 0.947, + "step": 12611 + }, + { + "epoch": 1.7855170949246124, + "grad_norm": 8.572142135345997, + "learning_rate": 1.4911735418612515e-07, + "loss": 0.9738, + "step": 12612 + }, + { + "epoch": 1.7856586677992496, + "grad_norm": 9.219618520395574, + "learning_rate": 1.4892241338604506e-07, + "loss": 0.9884, + "step": 12613 + }, + { + "epoch": 1.7858002406738869, + "grad_norm": 9.88727125826789, + "learning_rate": 1.4872759618036081e-07, + "loss": 0.9184, + "step": 12614 + }, + { + "epoch": 1.785941813548524, + "grad_norm": 10.729807496917102, + "learning_rate": 1.4853290257931364e-07, + "loss": 0.839, + "step": 12615 + }, + { + "epoch": 1.7860833864231613, + "grad_norm": 9.127464066254204, + "learning_rate": 1.483383325931384e-07, + "loss": 0.965, + "step": 12616 + }, + { + "epoch": 1.7862249592977986, + "grad_norm": 8.551686384968491, + "learning_rate": 1.4814388623206333e-07, + "loss": 0.8766, + "step": 12617 + }, + { + "epoch": 1.7863665321724358, + "grad_norm": 8.829272602400495, + "learning_rate": 1.4794956350631106e-07, + "loss": 1.0021, + "step": 12618 + }, + { + "epoch": 1.786508105047073, + "grad_norm": 11.843237873368723, + "learning_rate": 1.4775536442609623e-07, + "loss": 1.0581, + "step": 12619 + }, + { + "epoch": 1.7866496779217103, + "grad_norm": 8.936465757056913, + "learning_rate": 1.4756128900162757e-07, + "loss": 0.9166, + "step": 12620 + }, + { + "epoch": 1.7867912507963473, + "grad_norm": 9.425380984430292, + "learning_rate": 1.4736733724310865e-07, + "loss": 1.0655, + "step": 12621 + }, + { + "epoch": 1.7869328236709845, + "grad_norm": 
8.558954771881282, + "learning_rate": 1.4717350916073375e-07, + "loss": 1.0835, + "step": 12622 + }, + { + "epoch": 1.7870743965456217, + "grad_norm": 10.882001705596515, + "learning_rate": 1.4697980476469392e-07, + "loss": 1.0308, + "step": 12623 + }, + { + "epoch": 1.787215969420259, + "grad_norm": 8.566444546483025, + "learning_rate": 1.4678622406517074e-07, + "loss": 0.9314, + "step": 12624 + }, + { + "epoch": 1.7873575422948962, + "grad_norm": 10.90329018027912, + "learning_rate": 1.4659276707234132e-07, + "loss": 0.9438, + "step": 12625 + }, + { + "epoch": 1.7874991151695334, + "grad_norm": 9.749796938615608, + "learning_rate": 1.4639943379637534e-07, + "loss": 1.0804, + "step": 12626 + }, + { + "epoch": 1.7876406880441706, + "grad_norm": 9.351774208521503, + "learning_rate": 1.462062242474363e-07, + "loss": 0.9972, + "step": 12627 + }, + { + "epoch": 1.7877822609188079, + "grad_norm": 9.540375290372614, + "learning_rate": 1.460131384356811e-07, + "loss": 0.9448, + "step": 12628 + }, + { + "epoch": 1.787923833793445, + "grad_norm": 10.093671768877371, + "learning_rate": 1.4582017637125967e-07, + "loss": 0.932, + "step": 12629 + }, + { + "epoch": 1.7880654066680823, + "grad_norm": 9.077991793208374, + "learning_rate": 1.4562733806431666e-07, + "loss": 1.1062, + "step": 12630 + }, + { + "epoch": 1.7882069795427196, + "grad_norm": 10.473737297337221, + "learning_rate": 1.4543462352498844e-07, + "loss": 0.942, + "step": 12631 + }, + { + "epoch": 1.7883485524173568, + "grad_norm": 7.959071845761671, + "learning_rate": 1.4524203276340687e-07, + "loss": 0.9148, + "step": 12632 + }, + { + "epoch": 1.788490125291994, + "grad_norm": 8.79076285635551, + "learning_rate": 1.4504956578969554e-07, + "loss": 0.9902, + "step": 12633 + }, + { + "epoch": 1.7886316981666313, + "grad_norm": 9.156462058257896, + "learning_rate": 1.4485722261397273e-07, + "loss": 0.8934, + "step": 12634 + }, + { + "epoch": 1.7887732710412685, + "grad_norm": 10.864472303009087, + "learning_rate": 1.4466500324634952e-07, + "loss": 1.0181, + "step": 12635 + }, + { + "epoch": 1.7889148439159057, + "grad_norm": 9.159515735868627, + "learning_rate": 1.444729076969309e-07, + "loss": 0.8702, + "step": 12636 + }, + { + "epoch": 1.789056416790543, + "grad_norm": 9.660177968915132, + "learning_rate": 1.4428093597581544e-07, + "loss": 1.0226, + "step": 12637 + }, + { + "epoch": 1.7891979896651802, + "grad_norm": 9.814752599597638, + "learning_rate": 1.4408908809309423e-07, + "loss": 1.1138, + "step": 12638 + }, + { + "epoch": 1.7893395625398174, + "grad_norm": 11.569250167512005, + "learning_rate": 1.4389736405885397e-07, + "loss": 1.0313, + "step": 12639 + }, + { + "epoch": 1.7894811354144546, + "grad_norm": 9.630102613304462, + "learning_rate": 1.4370576388317155e-07, + "loss": 0.9918, + "step": 12640 + }, + { + "epoch": 1.7896227082890919, + "grad_norm": 8.06076859210151, + "learning_rate": 1.435142875761203e-07, + "loss": 0.9797, + "step": 12641 + }, + { + "epoch": 1.789764281163729, + "grad_norm": 10.51197096211271, + "learning_rate": 1.4332293514776635e-07, + "loss": 0.9488, + "step": 12642 + }, + { + "epoch": 1.7899058540383663, + "grad_norm": 9.635390976880167, + "learning_rate": 1.4313170660816805e-07, + "loss": 0.9343, + "step": 12643 + }, + { + "epoch": 1.7900474269130036, + "grad_norm": 10.445537830053656, + "learning_rate": 1.4294060196737874e-07, + "loss": 1.0301, + "step": 12644 + }, + { + "epoch": 1.7901889997876408, + "grad_norm": 8.948073315091138, + "learning_rate": 1.4274962123544457e-07, + "loss": 0.9848, + 
"step": 12645 + }, + { + "epoch": 1.790330572662278, + "grad_norm": 11.10813774591083, + "learning_rate": 1.4255876442240524e-07, + "loss": 0.9293, + "step": 12646 + }, + { + "epoch": 1.7904721455369152, + "grad_norm": 9.465496246983575, + "learning_rate": 1.423680315382933e-07, + "loss": 0.8675, + "step": 12647 + }, + { + "epoch": 1.7906137184115525, + "grad_norm": 11.748476619581766, + "learning_rate": 1.421774225931366e-07, + "loss": 1.0237, + "step": 12648 + }, + { + "epoch": 1.7907552912861897, + "grad_norm": 7.2275975821210485, + "learning_rate": 1.4198693759695486e-07, + "loss": 0.9596, + "step": 12649 + }, + { + "epoch": 1.790896864160827, + "grad_norm": 10.350919791368483, + "learning_rate": 1.417965765597612e-07, + "loss": 0.8623, + "step": 12650 + }, + { + "epoch": 1.7910384370354642, + "grad_norm": 9.361852353032516, + "learning_rate": 1.4160633949156344e-07, + "loss": 1.0354, + "step": 12651 + }, + { + "epoch": 1.7911800099101012, + "grad_norm": 12.292564092084605, + "learning_rate": 1.4141622640236164e-07, + "loss": 0.974, + "step": 12652 + }, + { + "epoch": 1.7913215827847384, + "grad_norm": 8.986023116495712, + "learning_rate": 1.412262373021503e-07, + "loss": 0.9764, + "step": 12653 + }, + { + "epoch": 1.7914631556593756, + "grad_norm": 8.418841946445976, + "learning_rate": 1.410363722009167e-07, + "loss": 0.9389, + "step": 12654 + }, + { + "epoch": 1.7916047285340129, + "grad_norm": 9.128332914173988, + "learning_rate": 1.4084663110864262e-07, + "loss": 0.9, + "step": 12655 + }, + { + "epoch": 1.79174630140865, + "grad_norm": 10.091144993279535, + "learning_rate": 1.406570140353014e-07, + "loss": 0.887, + "step": 12656 + }, + { + "epoch": 1.7918878742832873, + "grad_norm": 9.595603142607043, + "learning_rate": 1.4046752099086236e-07, + "loss": 0.9697, + "step": 12657 + }, + { + "epoch": 1.7920294471579246, + "grad_norm": 8.222716096130195, + "learning_rate": 1.4027815198528582e-07, + "loss": 0.8758, + "step": 12658 + }, + { + "epoch": 1.7921710200325618, + "grad_norm": 9.507660838714402, + "learning_rate": 1.4008890702852774e-07, + "loss": 1.0101, + "step": 12659 + }, + { + "epoch": 1.792312592907199, + "grad_norm": 12.350415402735132, + "learning_rate": 1.398997861305365e-07, + "loss": 1.0039, + "step": 12660 + }, + { + "epoch": 1.7924541657818363, + "grad_norm": 9.49739239693236, + "learning_rate": 1.397107893012531e-07, + "loss": 0.9502, + "step": 12661 + }, + { + "epoch": 1.7925957386564733, + "grad_norm": 8.470320102087992, + "learning_rate": 1.3952191655061425e-07, + "loss": 0.8519, + "step": 12662 + }, + { + "epoch": 1.7927373115311105, + "grad_norm": 9.31463557596785, + "learning_rate": 1.393331678885476e-07, + "loss": 1.001, + "step": 12663 + }, + { + "epoch": 1.7928788844057477, + "grad_norm": 8.47822609323367, + "learning_rate": 1.3914454332497608e-07, + "loss": 1.1078, + "step": 12664 + }, + { + "epoch": 1.793020457280385, + "grad_norm": 10.639459442518868, + "learning_rate": 1.3895604286981613e-07, + "loss": 1.0103, + "step": 12665 + }, + { + "epoch": 1.7931620301550222, + "grad_norm": 9.353984012533818, + "learning_rate": 1.3876766653297597e-07, + "loss": 0.9169, + "step": 12666 + }, + { + "epoch": 1.7933036030296594, + "grad_norm": 8.413548814258009, + "learning_rate": 1.3857941432435934e-07, + "loss": 0.841, + "step": 12667 + }, + { + "epoch": 1.7934451759042966, + "grad_norm": 11.662908620922998, + "learning_rate": 1.3839128625386193e-07, + "loss": 1.0956, + "step": 12668 + }, + { + "epoch": 1.7935867487789339, + "grad_norm": 9.950777983801576, + 
"learning_rate": 1.3820328233137393e-07, + "loss": 0.8704, + "step": 12669 + }, + { + "epoch": 1.793728321653571, + "grad_norm": 8.675387714087831, + "learning_rate": 1.380154025667782e-07, + "loss": 0.9661, + "step": 12670 + }, + { + "epoch": 1.7938698945282083, + "grad_norm": 9.588800302830307, + "learning_rate": 1.3782764696995188e-07, + "loss": 1.0735, + "step": 12671 + }, + { + "epoch": 1.7940114674028456, + "grad_norm": 10.624643476899847, + "learning_rate": 1.3764001555076484e-07, + "loss": 1.0172, + "step": 12672 + }, + { + "epoch": 1.7941530402774828, + "grad_norm": 7.423842769569218, + "learning_rate": 1.374525083190803e-07, + "loss": 0.8711, + "step": 12673 + }, + { + "epoch": 1.79429461315212, + "grad_norm": 7.340616799614273, + "learning_rate": 1.372651252847562e-07, + "loss": 0.9064, + "step": 12674 + }, + { + "epoch": 1.7944361860267573, + "grad_norm": 8.55724089675736, + "learning_rate": 1.370778664576422e-07, + "loss": 0.9706, + "step": 12675 + }, + { + "epoch": 1.7945777589013945, + "grad_norm": 9.547827140914993, + "learning_rate": 1.3689073184758345e-07, + "loss": 0.9111, + "step": 12676 + }, + { + "epoch": 1.7947193317760317, + "grad_norm": 9.688080407260205, + "learning_rate": 1.3670372146441652e-07, + "loss": 0.9627, + "step": 12677 + }, + { + "epoch": 1.794860904650669, + "grad_norm": 8.558171201344562, + "learning_rate": 1.3651683531797327e-07, + "loss": 1.1064, + "step": 12678 + }, + { + "epoch": 1.7950024775253062, + "grad_norm": 9.215722986879458, + "learning_rate": 1.3633007341807726e-07, + "loss": 0.9271, + "step": 12679 + }, + { + "epoch": 1.7951440503999434, + "grad_norm": 9.286408811313438, + "learning_rate": 1.3614343577454725e-07, + "loss": 0.9678, + "step": 12680 + }, + { + "epoch": 1.7952856232745806, + "grad_norm": 9.342968554017892, + "learning_rate": 1.3595692239719404e-07, + "loss": 0.9355, + "step": 12681 + }, + { + "epoch": 1.7954271961492179, + "grad_norm": 9.927989703521185, + "learning_rate": 1.3577053329582258e-07, + "loss": 1.0076, + "step": 12682 + }, + { + "epoch": 1.795568769023855, + "grad_norm": 9.80749889660629, + "learning_rate": 1.3558426848023165e-07, + "loss": 0.9897, + "step": 12683 + }, + { + "epoch": 1.7957103418984923, + "grad_norm": 8.88616069676387, + "learning_rate": 1.3539812796021234e-07, + "loss": 0.9749, + "step": 12684 + }, + { + "epoch": 1.7958519147731296, + "grad_norm": 11.219741479951894, + "learning_rate": 1.352121117455507e-07, + "loss": 0.9479, + "step": 12685 + }, + { + "epoch": 1.7959934876477668, + "grad_norm": 10.19653144652823, + "learning_rate": 1.3502621984602477e-07, + "loss": 0.994, + "step": 12686 + }, + { + "epoch": 1.796135060522404, + "grad_norm": 10.728476690433174, + "learning_rate": 1.3484045227140697e-07, + "loss": 0.9711, + "step": 12687 + }, + { + "epoch": 1.7962766333970412, + "grad_norm": 8.5761801038306, + "learning_rate": 1.3465480903146365e-07, + "loss": 0.8826, + "step": 12688 + }, + { + "epoch": 1.7964182062716785, + "grad_norm": 8.375671815259572, + "learning_rate": 1.344692901359529e-07, + "loss": 0.9202, + "step": 12689 + }, + { + "epoch": 1.7965597791463157, + "grad_norm": 8.325220456798538, + "learning_rate": 1.3428389559462796e-07, + "loss": 0.9551, + "step": 12690 + }, + { + "epoch": 1.796701352020953, + "grad_norm": 10.627887288648429, + "learning_rate": 1.340986254172344e-07, + "loss": 0.9938, + "step": 12691 + }, + { + "epoch": 1.7968429248955902, + "grad_norm": 9.560658395991553, + "learning_rate": 1.3391347961351275e-07, + "loss": 1.053, + "step": 12692 + }, + { + "epoch": 
1.7969844977702272, + "grad_norm": 8.685457497221154, + "learning_rate": 1.337284581931944e-07, + "loss": 1.004, + "step": 12693 + }, + { + "epoch": 1.7971260706448644, + "grad_norm": 8.135353885917999, + "learning_rate": 1.3354356116600685e-07, + "loss": 0.8838, + "step": 12694 + }, + { + "epoch": 1.7972676435195016, + "grad_norm": 8.710111104998935, + "learning_rate": 1.3335878854166984e-07, + "loss": 0.9343, + "step": 12695 + }, + { + "epoch": 1.7974092163941389, + "grad_norm": 11.489811903938664, + "learning_rate": 1.3317414032989668e-07, + "loss": 1.0081, + "step": 12696 + }, + { + "epoch": 1.797550789268776, + "grad_norm": 7.888883190720102, + "learning_rate": 1.3298961654039433e-07, + "loss": 0.9216, + "step": 12697 + }, + { + "epoch": 1.7976923621434133, + "grad_norm": 9.422560836097635, + "learning_rate": 1.3280521718286255e-07, + "loss": 0.9315, + "step": 12698 + }, + { + "epoch": 1.7978339350180506, + "grad_norm": 10.698713521180983, + "learning_rate": 1.3262094226699578e-07, + "loss": 0.947, + "step": 12699 + }, + { + "epoch": 1.7979755078926878, + "grad_norm": 9.7665522997239, + "learning_rate": 1.3243679180248075e-07, + "loss": 0.953, + "step": 12700 + }, + { + "epoch": 1.798117080767325, + "grad_norm": 11.558292097491632, + "learning_rate": 1.3225276579899833e-07, + "loss": 0.9635, + "step": 12701 + }, + { + "epoch": 1.7982586536419622, + "grad_norm": 9.820504780710063, + "learning_rate": 1.3206886426622267e-07, + "loss": 0.8741, + "step": 12702 + }, + { + "epoch": 1.7984002265165993, + "grad_norm": 9.507022069393217, + "learning_rate": 1.318850872138211e-07, + "loss": 0.995, + "step": 12703 + }, + { + "epoch": 1.7985417993912365, + "grad_norm": 9.139521918616909, + "learning_rate": 1.3170143465145474e-07, + "loss": 0.9728, + "step": 12704 + }, + { + "epoch": 1.7986833722658737, + "grad_norm": 9.769362175541172, + "learning_rate": 1.3151790658877785e-07, + "loss": 0.9252, + "step": 12705 + }, + { + "epoch": 1.798824945140511, + "grad_norm": 9.992434500832015, + "learning_rate": 1.3133450303543904e-07, + "loss": 0.8868, + "step": 12706 + }, + { + "epoch": 1.7989665180151482, + "grad_norm": 10.138430506361635, + "learning_rate": 1.3115122400107872e-07, + "loss": 1.0271, + "step": 12707 + }, + { + "epoch": 1.7991080908897854, + "grad_norm": 9.599655669712888, + "learning_rate": 1.3096806949533274e-07, + "loss": 0.9676, + "step": 12708 + }, + { + "epoch": 1.7992496637644226, + "grad_norm": 8.9522574705132, + "learning_rate": 1.3078503952782845e-07, + "loss": 0.949, + "step": 12709 + }, + { + "epoch": 1.7993912366390599, + "grad_norm": 7.830652738721922, + "learning_rate": 1.306021341081881e-07, + "loss": 0.8734, + "step": 12710 + }, + { + "epoch": 1.799532809513697, + "grad_norm": 8.38970885434319, + "learning_rate": 1.304193532460274e-07, + "loss": 0.8736, + "step": 12711 + }, + { + "epoch": 1.7996743823883343, + "grad_norm": 10.97262913288746, + "learning_rate": 1.3023669695095413e-07, + "loss": 1.0721, + "step": 12712 + }, + { + "epoch": 1.7998159552629716, + "grad_norm": 8.965218197517647, + "learning_rate": 1.3005416523257126e-07, + "loss": 0.963, + "step": 12713 + }, + { + "epoch": 1.7999575281376088, + "grad_norm": 10.907585696212285, + "learning_rate": 1.2987175810047297e-07, + "loss": 1.0495, + "step": 12714 + }, + { + "epoch": 1.800099101012246, + "grad_norm": 10.545016112227088, + "learning_rate": 1.2968947556424943e-07, + "loss": 0.9067, + "step": 12715 + }, + { + "epoch": 1.8002406738868832, + "grad_norm": 10.432243805405896, + "learning_rate": 
1.2950731763348295e-07, + "loss": 1.0686, + "step": 12716 + }, + { + "epoch": 1.8003822467615205, + "grad_norm": 10.90906040302982, + "learning_rate": 1.2932528431774892e-07, + "loss": 1.0509, + "step": 12717 + }, + { + "epoch": 1.8005238196361577, + "grad_norm": 6.835618575149702, + "learning_rate": 1.291433756266175e-07, + "loss": 0.9148, + "step": 12718 + }, + { + "epoch": 1.800665392510795, + "grad_norm": 8.181315408536994, + "learning_rate": 1.289615915696507e-07, + "loss": 0.9009, + "step": 12719 + }, + { + "epoch": 1.8008069653854322, + "grad_norm": 10.349441852354813, + "learning_rate": 1.2877993215640539e-07, + "loss": 0.9566, + "step": 12720 + }, + { + "epoch": 1.8009485382600694, + "grad_norm": 10.784770711372357, + "learning_rate": 1.2859839739643054e-07, + "loss": 1.012, + "step": 12721 + }, + { + "epoch": 1.8010901111347066, + "grad_norm": 9.696950720277194, + "learning_rate": 1.2841698729927022e-07, + "loss": 1.0576, + "step": 12722 + }, + { + "epoch": 1.8012316840093439, + "grad_norm": 8.897907191040549, + "learning_rate": 1.2823570187446065e-07, + "loss": 1.0437, + "step": 12723 + }, + { + "epoch": 1.801373256883981, + "grad_norm": 8.573869938804709, + "learning_rate": 1.2805454113153121e-07, + "loss": 0.9707, + "step": 12724 + }, + { + "epoch": 1.8015148297586183, + "grad_norm": 9.269937421964018, + "learning_rate": 1.2787350508000645e-07, + "loss": 0.9418, + "step": 12725 + }, + { + "epoch": 1.8016564026332555, + "grad_norm": 8.463997383564363, + "learning_rate": 1.276925937294024e-07, + "loss": 0.8227, + "step": 12726 + }, + { + "epoch": 1.8017979755078928, + "grad_norm": 10.972424361758513, + "learning_rate": 1.2751180708923005e-07, + "loss": 1.0389, + "step": 12727 + }, + { + "epoch": 1.80193954838253, + "grad_norm": 10.344878443450442, + "learning_rate": 1.2733114516899293e-07, + "loss": 0.9716, + "step": 12728 + }, + { + "epoch": 1.8020811212571672, + "grad_norm": 9.468590058338734, + "learning_rate": 1.271506079781884e-07, + "loss": 0.9114, + "step": 12729 + }, + { + "epoch": 1.8022226941318045, + "grad_norm": 10.22380365587327, + "learning_rate": 1.2697019552630696e-07, + "loss": 1.0108, + "step": 12730 + }, + { + "epoch": 1.8023642670064417, + "grad_norm": 10.803898030178015, + "learning_rate": 1.2678990782283324e-07, + "loss": 0.9444, + "step": 12731 + }, + { + "epoch": 1.802505839881079, + "grad_norm": 10.420665207495116, + "learning_rate": 1.266097448772441e-07, + "loss": 1.0263, + "step": 12732 + }, + { + "epoch": 1.8026474127557162, + "grad_norm": 7.577629403494802, + "learning_rate": 1.264297066990111e-07, + "loss": 0.8934, + "step": 12733 + }, + { + "epoch": 1.8027889856303532, + "grad_norm": 9.447722096877332, + "learning_rate": 1.2624979329759952e-07, + "loss": 1.0135, + "step": 12734 + }, + { + "epoch": 1.8029305585049904, + "grad_norm": 9.627653474537876, + "learning_rate": 1.2607000468246533e-07, + "loss": 1.0593, + "step": 12735 + }, + { + "epoch": 1.8030721313796276, + "grad_norm": 9.499057120664167, + "learning_rate": 1.2589034086306129e-07, + "loss": 0.9265, + "step": 12736 + }, + { + "epoch": 1.8032137042542649, + "grad_norm": 9.199498237527177, + "learning_rate": 1.2571080184883178e-07, + "loss": 0.99, + "step": 12737 + }, + { + "epoch": 1.803355277128902, + "grad_norm": 9.688043000784967, + "learning_rate": 1.255313876492148e-07, + "loss": 1.1023, + "step": 12738 + }, + { + "epoch": 1.8034968500035393, + "grad_norm": 11.56542865676186, + "learning_rate": 1.2535209827364282e-07, + "loss": 1.0495, + "step": 12739 + }, + { + "epoch": 
1.8036384228781766, + "grad_norm": 11.597613490364084, + "learning_rate": 1.2517293373153993e-07, + "loss": 1.1008, + "step": 12740 + }, + { + "epoch": 1.8037799957528138, + "grad_norm": 9.885979451813547, + "learning_rate": 1.2499389403232532e-07, + "loss": 0.863, + "step": 12741 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 9.253165450904701, + "learning_rate": 1.2481497918541085e-07, + "loss": 0.9011, + "step": 12742 + }, + { + "epoch": 1.8040631415020882, + "grad_norm": 8.077847326586497, + "learning_rate": 1.246361892002021e-07, + "loss": 0.858, + "step": 12743 + }, + { + "epoch": 1.8042047143767255, + "grad_norm": 9.257869775007428, + "learning_rate": 1.2445752408609733e-07, + "loss": 0.9319, + "step": 12744 + }, + { + "epoch": 1.8043462872513625, + "grad_norm": 9.183058307311802, + "learning_rate": 1.2427898385248965e-07, + "loss": 0.8931, + "step": 12745 + }, + { + "epoch": 1.8044878601259997, + "grad_norm": 9.798510644585267, + "learning_rate": 1.2410056850876428e-07, + "loss": 0.93, + "step": 12746 + }, + { + "epoch": 1.804629433000637, + "grad_norm": 10.109708187974373, + "learning_rate": 1.239222780643004e-07, + "loss": 0.9574, + "step": 12747 + }, + { + "epoch": 1.8047710058752742, + "grad_norm": 10.859045141685348, + "learning_rate": 1.237441125284708e-07, + "loss": 0.9982, + "step": 12748 + }, + { + "epoch": 1.8049125787499114, + "grad_norm": 11.203166200782851, + "learning_rate": 1.2356607191064102e-07, + "loss": 1.059, + "step": 12749 + }, + { + "epoch": 1.8050541516245486, + "grad_norm": 8.393706879892035, + "learning_rate": 1.2338815622017137e-07, + "loss": 0.984, + "step": 12750 + }, + { + "epoch": 1.8051957244991859, + "grad_norm": 7.780154966953708, + "learning_rate": 1.2321036546641406e-07, + "loss": 0.8634, + "step": 12751 + }, + { + "epoch": 1.805337297373823, + "grad_norm": 10.602328897546888, + "learning_rate": 1.2303269965871583e-07, + "loss": 1.0428, + "step": 12752 + }, + { + "epoch": 1.8054788702484603, + "grad_norm": 8.059662075484065, + "learning_rate": 1.2285515880641585e-07, + "loss": 0.9367, + "step": 12753 + }, + { + "epoch": 1.8056204431230976, + "grad_norm": 10.009833269588507, + "learning_rate": 1.2267774291884805e-07, + "loss": 1.0259, + "step": 12754 + }, + { + "epoch": 1.8057620159977348, + "grad_norm": 8.380830471795301, + "learning_rate": 1.2250045200533855e-07, + "loss": 0.9226, + "step": 12755 + }, + { + "epoch": 1.805903588872372, + "grad_norm": 8.804443572624857, + "learning_rate": 1.2232328607520743e-07, + "loss": 0.9278, + "step": 12756 + }, + { + "epoch": 1.8060451617470092, + "grad_norm": 9.697855084992337, + "learning_rate": 1.2214624513776861e-07, + "loss": 0.9645, + "step": 12757 + }, + { + "epoch": 1.8061867346216465, + "grad_norm": 10.28946550024644, + "learning_rate": 1.219693292023283e-07, + "loss": 1.0778, + "step": 12758 + }, + { + "epoch": 1.8063283074962837, + "grad_norm": 7.05528282112313, + "learning_rate": 1.217925382781876e-07, + "loss": 0.8354, + "step": 12759 + }, + { + "epoch": 1.806469880370921, + "grad_norm": 8.868912449236747, + "learning_rate": 1.216158723746394e-07, + "loss": 0.9305, + "step": 12760 + }, + { + "epoch": 1.8066114532455582, + "grad_norm": 8.962233671649825, + "learning_rate": 1.2143933150097154e-07, + "loss": 0.9825, + "step": 12761 + }, + { + "epoch": 1.8067530261201954, + "grad_norm": 9.053137421920413, + "learning_rate": 1.2126291566646464e-07, + "loss": 0.9008, + "step": 12762 + }, + { + "epoch": 1.8068945989948326, + "grad_norm": 9.487512914676445, + "learning_rate": 
1.210866248803924e-07, + "loss": 0.9971, + "step": 12763 + }, + { + "epoch": 1.8070361718694699, + "grad_norm": 10.052365334990816, + "learning_rate": 1.20910459152023e-07, + "loss": 1.069, + "step": 12764 + }, + { + "epoch": 1.807177744744107, + "grad_norm": 10.393172194247402, + "learning_rate": 1.2073441849061645e-07, + "loss": 1.1262, + "step": 12765 + }, + { + "epoch": 1.8073193176187443, + "grad_norm": 8.870035421651238, + "learning_rate": 1.205585029054279e-07, + "loss": 0.9298, + "step": 12766 + }, + { + "epoch": 1.8074608904933815, + "grad_norm": 10.305823181826774, + "learning_rate": 1.2038271240570415e-07, + "loss": 0.9763, + "step": 12767 + }, + { + "epoch": 1.8076024633680188, + "grad_norm": 10.582236726271974, + "learning_rate": 1.2020704700068691e-07, + "loss": 1.0406, + "step": 12768 + }, + { + "epoch": 1.807744036242656, + "grad_norm": 8.623280229359503, + "learning_rate": 1.2003150669961105e-07, + "loss": 0.9175, + "step": 12769 + }, + { + "epoch": 1.8078856091172932, + "grad_norm": 8.374850143686107, + "learning_rate": 1.198560915117039e-07, + "loss": 0.9899, + "step": 12770 + }, + { + "epoch": 1.8080271819919305, + "grad_norm": 8.840743885070207, + "learning_rate": 1.1968080144618783e-07, + "loss": 0.9166, + "step": 12771 + }, + { + "epoch": 1.8081687548665677, + "grad_norm": 11.034148875479703, + "learning_rate": 1.195056365122768e-07, + "loss": 1.0506, + "step": 12772 + }, + { + "epoch": 1.808310327741205, + "grad_norm": 10.59289655370338, + "learning_rate": 1.193305967191796e-07, + "loss": 1.0595, + "step": 12773 + }, + { + "epoch": 1.8084519006158422, + "grad_norm": 9.592673573696954, + "learning_rate": 1.191556820760978e-07, + "loss": 0.9549, + "step": 12774 + }, + { + "epoch": 1.8085934734904794, + "grad_norm": 10.153403296961182, + "learning_rate": 1.1898089259222673e-07, + "loss": 0.9528, + "step": 12775 + }, + { + "epoch": 1.8087350463651164, + "grad_norm": 9.231878314269132, + "learning_rate": 1.1880622827675464e-07, + "loss": 0.9242, + "step": 12776 + }, + { + "epoch": 1.8088766192397536, + "grad_norm": 8.936455085333753, + "learning_rate": 1.1863168913886364e-07, + "loss": 0.9109, + "step": 12777 + }, + { + "epoch": 1.8090181921143909, + "grad_norm": 8.972343336402652, + "learning_rate": 1.1845727518772915e-07, + "loss": 0.8671, + "step": 12778 + }, + { + "epoch": 1.809159764989028, + "grad_norm": 10.627238677299804, + "learning_rate": 1.1828298643251967e-07, + "loss": 0.9133, + "step": 12779 + }, + { + "epoch": 1.8093013378636653, + "grad_norm": 8.428178625675246, + "learning_rate": 1.1810882288239817e-07, + "loss": 0.8308, + "step": 12780 + }, + { + "epoch": 1.8094429107383025, + "grad_norm": 8.902147492626076, + "learning_rate": 1.1793478454651952e-07, + "loss": 0.8512, + "step": 12781 + }, + { + "epoch": 1.8095844836129398, + "grad_norm": 9.65078689450272, + "learning_rate": 1.1776087143403337e-07, + "loss": 0.9325, + "step": 12782 + }, + { + "epoch": 1.809726056487577, + "grad_norm": 9.154911711604084, + "learning_rate": 1.1758708355408155e-07, + "loss": 1.009, + "step": 12783 + }, + { + "epoch": 1.8098676293622142, + "grad_norm": 8.635541141166618, + "learning_rate": 1.1741342091580038e-07, + "loss": 0.8576, + "step": 12784 + }, + { + "epoch": 1.8100092022368515, + "grad_norm": 8.596859788898529, + "learning_rate": 1.172398835283195e-07, + "loss": 1.0073, + "step": 12785 + }, + { + "epoch": 1.8101507751114885, + "grad_norm": 10.054649939833691, + "learning_rate": 1.1706647140076105e-07, + "loss": 1.0687, + "step": 12786 + }, + { + "epoch": 
1.8102923479861257, + "grad_norm": 9.225854237607173, + "learning_rate": 1.1689318454224191e-07, + "loss": 0.9986, + "step": 12787 + }, + { + "epoch": 1.810433920860763, + "grad_norm": 8.89957217784824, + "learning_rate": 1.1672002296187063e-07, + "loss": 0.9835, + "step": 12788 + }, + { + "epoch": 1.8105754937354002, + "grad_norm": 10.459002793809645, + "learning_rate": 1.1654698666875076e-07, + "loss": 0.9087, + "step": 12789 + }, + { + "epoch": 1.8107170666100374, + "grad_norm": 9.58827396138605, + "learning_rate": 1.1637407567197862e-07, + "loss": 1.0319, + "step": 12790 + }, + { + "epoch": 1.8108586394846746, + "grad_norm": 9.2888752346557, + "learning_rate": 1.162012899806439e-07, + "loss": 0.9258, + "step": 12791 + }, + { + "epoch": 1.8110002123593119, + "grad_norm": 8.752637193264329, + "learning_rate": 1.1602862960383015e-07, + "loss": 0.9687, + "step": 12792 + }, + { + "epoch": 1.811141785233949, + "grad_norm": 9.554831228477429, + "learning_rate": 1.1585609455061348e-07, + "loss": 0.9609, + "step": 12793 + }, + { + "epoch": 1.8112833581085863, + "grad_norm": 8.960098400256838, + "learning_rate": 1.1568368483006465e-07, + "loss": 1.0632, + "step": 12794 + }, + { + "epoch": 1.8114249309832235, + "grad_norm": 7.478579895747593, + "learning_rate": 1.1551140045124615e-07, + "loss": 0.9471, + "step": 12795 + }, + { + "epoch": 1.8115665038578608, + "grad_norm": 9.025846878007314, + "learning_rate": 1.1533924142321601e-07, + "loss": 1.0184, + "step": 12796 + }, + { + "epoch": 1.811708076732498, + "grad_norm": 9.34101465305216, + "learning_rate": 1.1516720775502338e-07, + "loss": 0.9815, + "step": 12797 + }, + { + "epoch": 1.8118496496071352, + "grad_norm": 8.542460583022903, + "learning_rate": 1.1499529945571269e-07, + "loss": 0.9997, + "step": 12798 + }, + { + "epoch": 1.8119912224817725, + "grad_norm": 9.45359268923126, + "learning_rate": 1.1482351653432089e-07, + "loss": 1.0937, + "step": 12799 + }, + { + "epoch": 1.8121327953564097, + "grad_norm": 10.075776434198662, + "learning_rate": 1.1465185899987797e-07, + "loss": 0.9933, + "step": 12800 + }, + { + "epoch": 1.812274368231047, + "grad_norm": 10.632498933427454, + "learning_rate": 1.1448032686140864e-07, + "loss": 0.9833, + "step": 12801 + }, + { + "epoch": 1.8124159411056842, + "grad_norm": 8.387885813360167, + "learning_rate": 1.1430892012792933e-07, + "loss": 0.8388, + "step": 12802 + }, + { + "epoch": 1.8125575139803214, + "grad_norm": 9.332236861354613, + "learning_rate": 1.1413763880845169e-07, + "loss": 0.933, + "step": 12803 + }, + { + "epoch": 1.8126990868549586, + "grad_norm": 8.657103568729383, + "learning_rate": 1.139664829119791e-07, + "loss": 0.9381, + "step": 12804 + }, + { + "epoch": 1.8128406597295958, + "grad_norm": 8.895541207894416, + "learning_rate": 1.1379545244750961e-07, + "loss": 0.9346, + "step": 12805 + }, + { + "epoch": 1.812982232604233, + "grad_norm": 8.605339309058268, + "learning_rate": 1.1362454742403356e-07, + "loss": 0.9154, + "step": 12806 + }, + { + "epoch": 1.8131238054788703, + "grad_norm": 10.262417226775852, + "learning_rate": 1.1345376785053596e-07, + "loss": 1.0922, + "step": 12807 + }, + { + "epoch": 1.8132653783535075, + "grad_norm": 8.765467025230896, + "learning_rate": 1.1328311373599493e-07, + "loss": 0.9646, + "step": 12808 + }, + { + "epoch": 1.8134069512281448, + "grad_norm": 8.186546211889024, + "learning_rate": 1.1311258508938022e-07, + "loss": 0.9584, + "step": 12809 + }, + { + "epoch": 1.813548524102782, + "grad_norm": 7.499659975609285, + "learning_rate": 
1.1294218191965745e-07, + "loss": 0.8604, + "step": 12810 + }, + { + "epoch": 1.8136900969774192, + "grad_norm": 9.243726510307587, + "learning_rate": 1.1277190423578416e-07, + "loss": 1.0135, + "step": 12811 + }, + { + "epoch": 1.8138316698520565, + "grad_norm": 8.879775360977652, + "learning_rate": 1.1260175204671181e-07, + "loss": 1.0202, + "step": 12812 + }, + { + "epoch": 1.8139732427266937, + "grad_norm": 10.790577797961848, + "learning_rate": 1.1243172536138547e-07, + "loss": 0.8829, + "step": 12813 + }, + { + "epoch": 1.814114815601331, + "grad_norm": 7.490636702851856, + "learning_rate": 1.1226182418874271e-07, + "loss": 0.9443, + "step": 12814 + }, + { + "epoch": 1.8142563884759682, + "grad_norm": 9.306364560110655, + "learning_rate": 1.1209204853771582e-07, + "loss": 0.9576, + "step": 12815 + }, + { + "epoch": 1.8143979613506054, + "grad_norm": 11.38716095839445, + "learning_rate": 1.1192239841722935e-07, + "loss": 0.9607, + "step": 12816 + }, + { + "epoch": 1.8145395342252424, + "grad_norm": 8.039465357526163, + "learning_rate": 1.1175287383620197e-07, + "loss": 0.9414, + "step": 12817 + }, + { + "epoch": 1.8146811070998796, + "grad_norm": 9.117722208773811, + "learning_rate": 1.1158347480354493e-07, + "loss": 0.9659, + "step": 12818 + }, + { + "epoch": 1.8148226799745169, + "grad_norm": 11.238124428650366, + "learning_rate": 1.1141420132816383e-07, + "loss": 1.0027, + "step": 12819 + }, + { + "epoch": 1.814964252849154, + "grad_norm": 11.24945982589883, + "learning_rate": 1.1124505341895742e-07, + "loss": 1.0878, + "step": 12820 + }, + { + "epoch": 1.8151058257237913, + "grad_norm": 8.361985299959471, + "learning_rate": 1.1107603108481718e-07, + "loss": 0.9682, + "step": 12821 + }, + { + "epoch": 1.8152473985984285, + "grad_norm": 9.996223881162537, + "learning_rate": 1.109071343346288e-07, + "loss": 0.9964, + "step": 12822 + }, + { + "epoch": 1.8153889714730658, + "grad_norm": 10.264149273804499, + "learning_rate": 1.1073836317727071e-07, + "loss": 0.9917, + "step": 12823 + }, + { + "epoch": 1.815530544347703, + "grad_norm": 9.643708908972528, + "learning_rate": 1.1056971762161584e-07, + "loss": 0.9702, + "step": 12824 + }, + { + "epoch": 1.8156721172223402, + "grad_norm": 9.680010256801246, + "learning_rate": 1.1040119767652901e-07, + "loss": 1.04, + "step": 12825 + }, + { + "epoch": 1.8158136900969775, + "grad_norm": 8.609826050174174, + "learning_rate": 1.1023280335086956e-07, + "loss": 1.0025, + "step": 12826 + }, + { + "epoch": 1.8159552629716147, + "grad_norm": 10.01825459398309, + "learning_rate": 1.1006453465348954e-07, + "loss": 0.9532, + "step": 12827 + }, + { + "epoch": 1.8160968358462517, + "grad_norm": 9.11556393534569, + "learning_rate": 1.0989639159323523e-07, + "loss": 1.021, + "step": 12828 + }, + { + "epoch": 1.816238408720889, + "grad_norm": 10.794804729269835, + "learning_rate": 1.0972837417894538e-07, + "loss": 0.9194, + "step": 12829 + }, + { + "epoch": 1.8163799815955262, + "grad_norm": 8.650099349140447, + "learning_rate": 1.0956048241945238e-07, + "loss": 0.9851, + "step": 12830 + }, + { + "epoch": 1.8165215544701634, + "grad_norm": 8.125443666522587, + "learning_rate": 1.0939271632358278e-07, + "loss": 0.9918, + "step": 12831 + }, + { + "epoch": 1.8166631273448006, + "grad_norm": 8.267134934155317, + "learning_rate": 1.0922507590015535e-07, + "loss": 0.8619, + "step": 12832 + }, + { + "epoch": 1.8168047002194379, + "grad_norm": 9.259668194818051, + "learning_rate": 1.0905756115798332e-07, + "loss": 0.9139, + "step": 12833 + }, + { + "epoch": 
1.816946273094075, + "grad_norm": 10.060883765411132, + "learning_rate": 1.0889017210587216e-07, + "loss": 0.909, + "step": 12834 + }, + { + "epoch": 1.8170878459687123, + "grad_norm": 7.865215217182405, + "learning_rate": 1.0872290875262175e-07, + "loss": 0.9594, + "step": 12835 + }, + { + "epoch": 1.8172294188433495, + "grad_norm": 7.792516434241874, + "learning_rate": 1.0855577110702536e-07, + "loss": 0.8817, + "step": 12836 + }, + { + "epoch": 1.8173709917179868, + "grad_norm": 8.588903107603256, + "learning_rate": 1.0838875917786845e-07, + "loss": 0.9011, + "step": 12837 + }, + { + "epoch": 1.817512564592624, + "grad_norm": 12.26359457716721, + "learning_rate": 1.0822187297393177e-07, + "loss": 1.0606, + "step": 12838 + }, + { + "epoch": 1.8176541374672612, + "grad_norm": 9.266538938378469, + "learning_rate": 1.0805511250398748e-07, + "loss": 0.9557, + "step": 12839 + }, + { + "epoch": 1.8177957103418985, + "grad_norm": 10.0745637027749, + "learning_rate": 1.07888477776803e-07, + "loss": 0.9889, + "step": 12840 + }, + { + "epoch": 1.8179372832165357, + "grad_norm": 8.269974081789892, + "learning_rate": 1.0772196880113716e-07, + "loss": 0.846, + "step": 12841 + }, + { + "epoch": 1.818078856091173, + "grad_norm": 8.05275356041554, + "learning_rate": 1.0755558558574325e-07, + "loss": 0.9247, + "step": 12842 + }, + { + "epoch": 1.8182204289658102, + "grad_norm": 9.486099915864767, + "learning_rate": 1.0738932813936897e-07, + "loss": 1.0297, + "step": 12843 + }, + { + "epoch": 1.8183620018404474, + "grad_norm": 7.422564215531679, + "learning_rate": 1.0722319647075347e-07, + "loss": 0.8692, + "step": 12844 + }, + { + "epoch": 1.8185035747150846, + "grad_norm": 9.844759450349912, + "learning_rate": 1.0705719058863057e-07, + "loss": 0.9411, + "step": 12845 + }, + { + "epoch": 1.8186451475897218, + "grad_norm": 12.219640987135923, + "learning_rate": 1.0689131050172635e-07, + "loss": 1.0461, + "step": 12846 + }, + { + "epoch": 1.818786720464359, + "grad_norm": 8.879173909158556, + "learning_rate": 1.0672555621876218e-07, + "loss": 0.7361, + "step": 12847 + }, + { + "epoch": 1.8189282933389963, + "grad_norm": 9.606599335532685, + "learning_rate": 1.0655992774845054e-07, + "loss": 0.9272, + "step": 12848 + }, + { + "epoch": 1.8190698662136335, + "grad_norm": 11.295104419619044, + "learning_rate": 1.0639442509949944e-07, + "loss": 1.0664, + "step": 12849 + }, + { + "epoch": 1.8192114390882708, + "grad_norm": 8.220000777418559, + "learning_rate": 1.0622904828060803e-07, + "loss": 0.9969, + "step": 12850 + }, + { + "epoch": 1.819353011962908, + "grad_norm": 10.705174515792548, + "learning_rate": 1.0606379730047134e-07, + "loss": 0.9877, + "step": 12851 + }, + { + "epoch": 1.8194945848375452, + "grad_norm": 9.865241528593229, + "learning_rate": 1.0589867216777544e-07, + "loss": 0.9762, + "step": 12852 + }, + { + "epoch": 1.8196361577121825, + "grad_norm": 8.278983788533516, + "learning_rate": 1.0573367289120118e-07, + "loss": 0.9783, + "step": 12853 + }, + { + "epoch": 1.8197777305868197, + "grad_norm": 10.53610814723049, + "learning_rate": 1.0556879947942272e-07, + "loss": 1.0008, + "step": 12854 + }, + { + "epoch": 1.819919303461457, + "grad_norm": 10.000548919870873, + "learning_rate": 1.0540405194110703e-07, + "loss": 0.9116, + "step": 12855 + }, + { + "epoch": 1.8200608763360941, + "grad_norm": 9.076320869432854, + "learning_rate": 1.0523943028491496e-07, + "loss": 0.9242, + "step": 12856 + }, + { + "epoch": 1.8202024492107314, + "grad_norm": 7.059311408591957, + "learning_rate": 
1.0507493451949984e-07, + "loss": 0.8606, + "step": 12857 + }, + { + "epoch": 1.8203440220853686, + "grad_norm": 9.976577029893491, + "learning_rate": 1.0491056465351007e-07, + "loss": 0.9011, + "step": 12858 + }, + { + "epoch": 1.8204855949600056, + "grad_norm": 10.517163735162944, + "learning_rate": 1.0474632069558621e-07, + "loss": 0.9802, + "step": 12859 + }, + { + "epoch": 1.8206271678346428, + "grad_norm": 9.47819948956366, + "learning_rate": 1.045822026543622e-07, + "loss": 0.9208, + "step": 12860 + }, + { + "epoch": 1.82076874070928, + "grad_norm": 10.315154872513807, + "learning_rate": 1.0441821053846612e-07, + "loss": 0.9776, + "step": 12861 + }, + { + "epoch": 1.8209103135839173, + "grad_norm": 10.137760363150436, + "learning_rate": 1.0425434435651776e-07, + "loss": 1.0228, + "step": 12862 + }, + { + "epoch": 1.8210518864585545, + "grad_norm": 10.693543627122667, + "learning_rate": 1.0409060411713273e-07, + "loss": 1.0402, + "step": 12863 + }, + { + "epoch": 1.8211934593331918, + "grad_norm": 10.315760981216172, + "learning_rate": 1.0392698982891775e-07, + "loss": 0.9623, + "step": 12864 + }, + { + "epoch": 1.821335032207829, + "grad_norm": 7.736159979869267, + "learning_rate": 1.0376350150047427e-07, + "loss": 0.9462, + "step": 12865 + }, + { + "epoch": 1.8214766050824662, + "grad_norm": 11.207708934612295, + "learning_rate": 1.0360013914039708e-07, + "loss": 0.9996, + "step": 12866 + }, + { + "epoch": 1.8216181779571035, + "grad_norm": 9.608478187953539, + "learning_rate": 1.0343690275727374e-07, + "loss": 0.9923, + "step": 12867 + }, + { + "epoch": 1.8217597508317407, + "grad_norm": 9.078351909276918, + "learning_rate": 1.0327379235968549e-07, + "loss": 0.8762, + "step": 12868 + }, + { + "epoch": 1.8219013237063777, + "grad_norm": 8.846694725622637, + "learning_rate": 1.0311080795620654e-07, + "loss": 0.8426, + "step": 12869 + }, + { + "epoch": 1.822042896581015, + "grad_norm": 8.693825154970355, + "learning_rate": 1.0294794955540587e-07, + "loss": 0.9012, + "step": 12870 + }, + { + "epoch": 1.8221844694556522, + "grad_norm": 10.041853484846948, + "learning_rate": 1.0278521716584361e-07, + "loss": 1.0188, + "step": 12871 + }, + { + "epoch": 1.8223260423302894, + "grad_norm": 10.3457906963091, + "learning_rate": 1.0262261079607539e-07, + "loss": 0.974, + "step": 12872 + }, + { + "epoch": 1.8224676152049266, + "grad_norm": 9.995689989159077, + "learning_rate": 1.0246013045464881e-07, + "loss": 0.9451, + "step": 12873 + }, + { + "epoch": 1.8226091880795638, + "grad_norm": 8.591487184333934, + "learning_rate": 1.0229777615010538e-07, + "loss": 1.0158, + "step": 12874 + }, + { + "epoch": 1.822750760954201, + "grad_norm": 9.990116194010168, + "learning_rate": 1.0213554789098052e-07, + "loss": 0.9816, + "step": 12875 + }, + { + "epoch": 1.8228923338288383, + "grad_norm": 11.316094770480326, + "learning_rate": 1.0197344568580153e-07, + "loss": 0.9488, + "step": 12876 + }, + { + "epoch": 1.8230339067034755, + "grad_norm": 9.3117237151541, + "learning_rate": 1.0181146954309052e-07, + "loss": 0.9755, + "step": 12877 + }, + { + "epoch": 1.8231754795781128, + "grad_norm": 9.090930619647803, + "learning_rate": 1.0164961947136232e-07, + "loss": 0.9468, + "step": 12878 + }, + { + "epoch": 1.82331705245275, + "grad_norm": 8.898374910611752, + "learning_rate": 1.0148789547912569e-07, + "loss": 0.982, + "step": 12879 + }, + { + "epoch": 1.8234586253273872, + "grad_norm": 10.439095563626354, + "learning_rate": 1.013262975748816e-07, + "loss": 0.9876, + "step": 12880 + }, + { + "epoch": 
1.8236001982020245, + "grad_norm": 7.7776603538489635, + "learning_rate": 1.011648257671255e-07, + "loss": 0.9738, + "step": 12881 + }, + { + "epoch": 1.8237417710766617, + "grad_norm": 10.684332127135027, + "learning_rate": 1.0100348006434641e-07, + "loss": 1.046, + "step": 12882 + }, + { + "epoch": 1.823883343951299, + "grad_norm": 9.463692601787782, + "learning_rate": 1.0084226047502505e-07, + "loss": 0.9212, + "step": 12883 + }, + { + "epoch": 1.8240249168259361, + "grad_norm": 10.097237282773389, + "learning_rate": 1.0068116700763769e-07, + "loss": 0.9336, + "step": 12884 + }, + { + "epoch": 1.8241664897005734, + "grad_norm": 10.976110005130531, + "learning_rate": 1.0052019967065174e-07, + "loss": 0.9587, + "step": 12885 + }, + { + "epoch": 1.8243080625752106, + "grad_norm": 9.689269070452575, + "learning_rate": 1.0035935847253015e-07, + "loss": 1.0792, + "step": 12886 + }, + { + "epoch": 1.8244496354498478, + "grad_norm": 7.2092174656674874, + "learning_rate": 1.001986434217278e-07, + "loss": 0.8763, + "step": 12887 + }, + { + "epoch": 1.824591208324485, + "grad_norm": 8.037325568553378, + "learning_rate": 1.0003805452669296e-07, + "loss": 0.9538, + "step": 12888 + }, + { + "epoch": 1.8247327811991223, + "grad_norm": 9.762523726313598, + "learning_rate": 9.987759179586886e-08, + "loss": 0.9521, + "step": 12889 + }, + { + "epoch": 1.8248743540737595, + "grad_norm": 10.017758432343488, + "learning_rate": 9.97172552376896e-08, + "loss": 0.9503, + "step": 12890 + }, + { + "epoch": 1.8250159269483968, + "grad_norm": 8.960716557460376, + "learning_rate": 9.955704486058482e-08, + "loss": 0.9259, + "step": 12891 + }, + { + "epoch": 1.825157499823034, + "grad_norm": 8.885596168794885, + "learning_rate": 9.939696067297611e-08, + "loss": 0.8666, + "step": 12892 + }, + { + "epoch": 1.8252990726976712, + "grad_norm": 10.766528176124611, + "learning_rate": 9.923700268327952e-08, + "loss": 1.031, + "step": 12893 + }, + { + "epoch": 1.8254406455723085, + "grad_norm": 11.742605926510201, + "learning_rate": 9.90771708999036e-08, + "loss": 0.9069, + "step": 12894 + }, + { + "epoch": 1.8255822184469457, + "grad_norm": 8.946917074293097, + "learning_rate": 9.891746533125024e-08, + "loss": 0.8929, + "step": 12895 + }, + { + "epoch": 1.825723791321583, + "grad_norm": 8.784639946339087, + "learning_rate": 9.87578859857155e-08, + "loss": 0.9612, + "step": 12896 + }, + { + "epoch": 1.8258653641962201, + "grad_norm": 9.244115658969562, + "learning_rate": 9.859843287168825e-08, + "loss": 0.966, + "step": 12897 + }, + { + "epoch": 1.8260069370708574, + "grad_norm": 8.50334595291874, + "learning_rate": 9.843910599755119e-08, + "loss": 0.9786, + "step": 12898 + }, + { + "epoch": 1.8261485099454946, + "grad_norm": 11.001975835732374, + "learning_rate": 9.827990537167903e-08, + "loss": 0.9132, + "step": 12899 + }, + { + "epoch": 1.8262900828201316, + "grad_norm": 8.90182995788625, + "learning_rate": 9.812083100244201e-08, + "loss": 0.9185, + "step": 12900 + }, + { + "epoch": 1.8264316556947688, + "grad_norm": 8.718808191030048, + "learning_rate": 9.796188289820152e-08, + "loss": 0.8114, + "step": 12901 + }, + { + "epoch": 1.826573228569406, + "grad_norm": 9.735792715030403, + "learning_rate": 9.780306106731419e-08, + "loss": 1.0164, + "step": 12902 + }, + { + "epoch": 1.8267148014440433, + "grad_norm": 11.095371181610593, + "learning_rate": 9.764436551812889e-08, + "loss": 1.0259, + "step": 12903 + }, + { + "epoch": 1.8268563743186805, + "grad_norm": 8.989857786961101, + "learning_rate": 9.748579625898758e-08, + 
"loss": 0.9497, + "step": 12904 + }, + { + "epoch": 1.8269979471933178, + "grad_norm": 10.68449564863148, + "learning_rate": 9.73273532982269e-08, + "loss": 1.0219, + "step": 12905 + }, + { + "epoch": 1.827139520067955, + "grad_norm": 9.110273627684794, + "learning_rate": 9.716903664417549e-08, + "loss": 0.9936, + "step": 12906 + }, + { + "epoch": 1.8272810929425922, + "grad_norm": 10.994308039396419, + "learning_rate": 9.701084630515667e-08, + "loss": 0.9856, + "step": 12907 + }, + { + "epoch": 1.8274226658172295, + "grad_norm": 9.4068927735375, + "learning_rate": 9.685278228948519e-08, + "loss": 0.8979, + "step": 12908 + }, + { + "epoch": 1.8275642386918667, + "grad_norm": 9.384677622181043, + "learning_rate": 9.669484460547135e-08, + "loss": 0.8183, + "step": 12909 + }, + { + "epoch": 1.827705811566504, + "grad_norm": 9.841055171185873, + "learning_rate": 9.653703326141794e-08, + "loss": 1.0055, + "step": 12910 + }, + { + "epoch": 1.827847384441141, + "grad_norm": 8.97906900044465, + "learning_rate": 9.637934826562001e-08, + "loss": 0.925, + "step": 12911 + }, + { + "epoch": 1.8279889573157782, + "grad_norm": 10.131673991552123, + "learning_rate": 9.622178962636813e-08, + "loss": 0.9977, + "step": 12912 + }, + { + "epoch": 1.8281305301904154, + "grad_norm": 8.550265962803026, + "learning_rate": 9.606435735194403e-08, + "loss": 0.9441, + "step": 12913 + }, + { + "epoch": 1.8282721030650526, + "grad_norm": 8.451402301789877, + "learning_rate": 9.590705145062468e-08, + "loss": 0.9574, + "step": 12914 + }, + { + "epoch": 1.8284136759396898, + "grad_norm": 9.011072764829217, + "learning_rate": 9.574987193067847e-08, + "loss": 0.9782, + "step": 12915 + }, + { + "epoch": 1.828555248814327, + "grad_norm": 9.845353059574643, + "learning_rate": 9.559281880036908e-08, + "loss": 1.0511, + "step": 12916 + }, + { + "epoch": 1.8286968216889643, + "grad_norm": 10.323618680537015, + "learning_rate": 9.54358920679524e-08, + "loss": 0.9772, + "step": 12917 + }, + { + "epoch": 1.8288383945636015, + "grad_norm": 10.84962150511248, + "learning_rate": 9.527909174167793e-08, + "loss": 1.0066, + "step": 12918 + }, + { + "epoch": 1.8289799674382388, + "grad_norm": 9.115421650435836, + "learning_rate": 9.512241782978853e-08, + "loss": 0.8382, + "step": 12919 + }, + { + "epoch": 1.829121540312876, + "grad_norm": 9.255342332638806, + "learning_rate": 9.496587034052041e-08, + "loss": 0.9287, + "step": 12920 + }, + { + "epoch": 1.8292631131875132, + "grad_norm": 9.08009429949893, + "learning_rate": 9.480944928210362e-08, + "loss": 1.0741, + "step": 12921 + }, + { + "epoch": 1.8294046860621505, + "grad_norm": 9.223901582985242, + "learning_rate": 9.46531546627602e-08, + "loss": 0.8933, + "step": 12922 + }, + { + "epoch": 1.8295462589367877, + "grad_norm": 9.656981999080582, + "learning_rate": 9.449698649070721e-08, + "loss": 0.9359, + "step": 12923 + }, + { + "epoch": 1.829687831811425, + "grad_norm": 8.192485194351466, + "learning_rate": 9.43409447741539e-08, + "loss": 0.9928, + "step": 12924 + }, + { + "epoch": 1.8298294046860621, + "grad_norm": 11.437982267624486, + "learning_rate": 9.418502952130343e-08, + "loss": 0.9966, + "step": 12925 + }, + { + "epoch": 1.8299709775606994, + "grad_norm": 8.922810039525936, + "learning_rate": 9.40292407403523e-08, + "loss": 1.0666, + "step": 12926 + }, + { + "epoch": 1.8301125504353366, + "grad_norm": 10.760340485698585, + "learning_rate": 9.38735784394898e-08, + "loss": 1.0672, + "step": 12927 + }, + { + "epoch": 1.8302541233099738, + "grad_norm": 8.760851425529102, + 
"learning_rate": 9.371804262689938e-08, + "loss": 0.927, + "step": 12928 + }, + { + "epoch": 1.830395696184611, + "grad_norm": 9.815515139990305, + "learning_rate": 9.3562633310757e-08, + "loss": 0.9425, + "step": 12929 + }, + { + "epoch": 1.8305372690592483, + "grad_norm": 11.66651705691368, + "learning_rate": 9.340735049923277e-08, + "loss": 1.1048, + "step": 12930 + }, + { + "epoch": 1.8306788419338855, + "grad_norm": 8.984788968044434, + "learning_rate": 9.325219420048964e-08, + "loss": 1.0651, + "step": 12931 + }, + { + "epoch": 1.8308204148085228, + "grad_norm": 7.6648695401251725, + "learning_rate": 9.309716442268413e-08, + "loss": 0.9509, + "step": 12932 + }, + { + "epoch": 1.83096198768316, + "grad_norm": 9.459430602924416, + "learning_rate": 9.29422611739661e-08, + "loss": 0.9915, + "step": 12933 + }, + { + "epoch": 1.8311035605577972, + "grad_norm": 8.784496209642677, + "learning_rate": 9.278748446247848e-08, + "loss": 1.033, + "step": 12934 + }, + { + "epoch": 1.8312451334324344, + "grad_norm": 10.471459610275394, + "learning_rate": 9.263283429635839e-08, + "loss": 0.9813, + "step": 12935 + }, + { + "epoch": 1.8313867063070717, + "grad_norm": 10.152111536638264, + "learning_rate": 9.247831068373458e-08, + "loss": 0.9652, + "step": 12936 + }, + { + "epoch": 1.831528279181709, + "grad_norm": 8.65816897939071, + "learning_rate": 9.23239136327314e-08, + "loss": 0.9265, + "step": 12937 + }, + { + "epoch": 1.8316698520563461, + "grad_norm": 9.238120208691505, + "learning_rate": 9.216964315146431e-08, + "loss": 0.9648, + "step": 12938 + }, + { + "epoch": 1.8318114249309834, + "grad_norm": 8.980586412479516, + "learning_rate": 9.201549924804376e-08, + "loss": 0.9927, + "step": 12939 + }, + { + "epoch": 1.8319529978056206, + "grad_norm": 11.047787986052485, + "learning_rate": 9.186148193057325e-08, + "loss": 0.851, + "step": 12940 + }, + { + "epoch": 1.8320945706802578, + "grad_norm": 11.963410862412552, + "learning_rate": 9.170759120714884e-08, + "loss": 0.909, + "step": 12941 + }, + { + "epoch": 1.8322361435548948, + "grad_norm": 9.748229428452435, + "learning_rate": 9.155382708586097e-08, + "loss": 1.0846, + "step": 12942 + }, + { + "epoch": 1.832377716429532, + "grad_norm": 10.657799213816672, + "learning_rate": 9.140018957479236e-08, + "loss": 1.0612, + "step": 12943 + }, + { + "epoch": 1.8325192893041693, + "grad_norm": 9.623194933976173, + "learning_rate": 9.124667868201986e-08, + "loss": 0.9854, + "step": 12944 + }, + { + "epoch": 1.8326608621788065, + "grad_norm": 7.671886187698691, + "learning_rate": 9.109329441561343e-08, + "loss": 0.8384, + "step": 12945 + }, + { + "epoch": 1.8328024350534438, + "grad_norm": 9.724760987156662, + "learning_rate": 9.094003678363633e-08, + "loss": 0.9189, + "step": 12946 + }, + { + "epoch": 1.832944007928081, + "grad_norm": 8.894694652237533, + "learning_rate": 9.078690579414546e-08, + "loss": 1.079, + "step": 12947 + }, + { + "epoch": 1.8330855808027182, + "grad_norm": 10.050130693335534, + "learning_rate": 9.063390145519019e-08, + "loss": 1.0162, + "step": 12948 + }, + { + "epoch": 1.8332271536773554, + "grad_norm": 10.060086736892908, + "learning_rate": 9.048102377481466e-08, + "loss": 0.942, + "step": 12949 + }, + { + "epoch": 1.8333687265519927, + "grad_norm": 11.037817364425367, + "learning_rate": 9.032827276105466e-08, + "loss": 0.9777, + "step": 12950 + }, + { + "epoch": 1.83351029942663, + "grad_norm": 9.475096335210711, + "learning_rate": 9.017564842194099e-08, + "loss": 1.0715, + "step": 12951 + }, + { + "epoch": 1.833651872301267, + 
"grad_norm": 10.532570957005486, + "learning_rate": 9.002315076549639e-08, + "loss": 0.9719, + "step": 12952 + }, + { + "epoch": 1.8337934451759041, + "grad_norm": 11.80842382754852, + "learning_rate": 8.987077979973807e-08, + "loss": 1.0606, + "step": 12953 + }, + { + "epoch": 1.8339350180505414, + "grad_norm": 10.010242938314835, + "learning_rate": 8.971853553267545e-08, + "loss": 0.9095, + "step": 12954 + }, + { + "epoch": 1.8340765909251786, + "grad_norm": 8.771740308487864, + "learning_rate": 8.956641797231214e-08, + "loss": 0.9323, + "step": 12955 + }, + { + "epoch": 1.8342181637998158, + "grad_norm": 9.542391509353063, + "learning_rate": 8.941442712664561e-08, + "loss": 1.1305, + "step": 12956 + }, + { + "epoch": 1.834359736674453, + "grad_norm": 12.194534228049159, + "learning_rate": 8.926256300366475e-08, + "loss": 1.0773, + "step": 12957 + }, + { + "epoch": 1.8345013095490903, + "grad_norm": 8.674930398535402, + "learning_rate": 8.911082561135348e-08, + "loss": 0.9571, + "step": 12958 + }, + { + "epoch": 1.8346428824237275, + "grad_norm": 11.52975991812194, + "learning_rate": 8.895921495768845e-08, + "loss": 1.0264, + "step": 12959 + }, + { + "epoch": 1.8347844552983648, + "grad_norm": 7.742597921292361, + "learning_rate": 8.880773105063994e-08, + "loss": 0.9414, + "step": 12960 + }, + { + "epoch": 1.834926028173002, + "grad_norm": 10.590726617914852, + "learning_rate": 8.865637389817077e-08, + "loss": 0.9761, + "step": 12961 + }, + { + "epoch": 1.8350676010476392, + "grad_norm": 10.297962646464045, + "learning_rate": 8.850514350823819e-08, + "loss": 0.932, + "step": 12962 + }, + { + "epoch": 1.8352091739222764, + "grad_norm": 8.705270280229987, + "learning_rate": 8.835403988879221e-08, + "loss": 0.9841, + "step": 12963 + }, + { + "epoch": 1.8353507467969137, + "grad_norm": 9.985411111145787, + "learning_rate": 8.820306304777593e-08, + "loss": 0.9991, + "step": 12964 + }, + { + "epoch": 1.835492319671551, + "grad_norm": 7.9582423925612975, + "learning_rate": 8.805221299312689e-08, + "loss": 0.8871, + "step": 12965 + }, + { + "epoch": 1.8356338925461881, + "grad_norm": 8.877767641743123, + "learning_rate": 8.790148973277401e-08, + "loss": 0.9625, + "step": 12966 + }, + { + "epoch": 1.8357754654208254, + "grad_norm": 8.578257075059058, + "learning_rate": 8.775089327464154e-08, + "loss": 0.9333, + "step": 12967 + }, + { + "epoch": 1.8359170382954626, + "grad_norm": 8.938028813436675, + "learning_rate": 8.760042362664617e-08, + "loss": 0.9849, + "step": 12968 + }, + { + "epoch": 1.8360586111700998, + "grad_norm": 10.112253713732583, + "learning_rate": 8.745008079669742e-08, + "loss": 0.9212, + "step": 12969 + }, + { + "epoch": 1.836200184044737, + "grad_norm": 8.17063879870517, + "learning_rate": 8.729986479269926e-08, + "loss": 0.866, + "step": 12970 + }, + { + "epoch": 1.8363417569193743, + "grad_norm": 10.897008614887376, + "learning_rate": 8.714977562254784e-08, + "loss": 1.0781, + "step": 12971 + }, + { + "epoch": 1.8364833297940115, + "grad_norm": 11.39893547575688, + "learning_rate": 8.699981329413409e-08, + "loss": 1.0536, + "step": 12972 + }, + { + "epoch": 1.8366249026686488, + "grad_norm": 10.03693587640717, + "learning_rate": 8.68499778153406e-08, + "loss": 0.9397, + "step": 12973 + }, + { + "epoch": 1.836766475543286, + "grad_norm": 8.868278859738345, + "learning_rate": 8.670026919404467e-08, + "loss": 0.9075, + "step": 12974 + }, + { + "epoch": 1.8369080484179232, + "grad_norm": 9.231602699603158, + "learning_rate": 8.655068743811613e-08, + "loss": 0.9314, + "step": 
12975 + }, + { + "epoch": 1.8370496212925604, + "grad_norm": 9.424937798980343, + "learning_rate": 8.640123255541838e-08, + "loss": 1.0249, + "step": 12976 + }, + { + "epoch": 1.8371911941671977, + "grad_norm": 9.27009873355107, + "learning_rate": 8.625190455380821e-08, + "loss": 0.9314, + "step": 12977 + }, + { + "epoch": 1.837332767041835, + "grad_norm": 8.692598671713798, + "learning_rate": 8.610270344113575e-08, + "loss": 0.9824, + "step": 12978 + }, + { + "epoch": 1.8374743399164721, + "grad_norm": 9.535720727223383, + "learning_rate": 8.595362922524413e-08, + "loss": 0.9426, + "step": 12979 + }, + { + "epoch": 1.8376159127911094, + "grad_norm": 8.897706977256837, + "learning_rate": 8.580468191397018e-08, + "loss": 0.9774, + "step": 12980 + }, + { + "epoch": 1.8377574856657466, + "grad_norm": 8.961154180166897, + "learning_rate": 8.565586151514427e-08, + "loss": 0.9543, + "step": 12981 + }, + { + "epoch": 1.8378990585403838, + "grad_norm": 9.771058644595929, + "learning_rate": 8.550716803658904e-08, + "loss": 0.9443, + "step": 12982 + }, + { + "epoch": 1.8380406314150208, + "grad_norm": 8.017293834408596, + "learning_rate": 8.535860148612213e-08, + "loss": 0.8564, + "step": 12983 + }, + { + "epoch": 1.838182204289658, + "grad_norm": 9.278438668941062, + "learning_rate": 8.521016187155284e-08, + "loss": 1.0057, + "step": 12984 + }, + { + "epoch": 1.8383237771642953, + "grad_norm": 8.994430090008091, + "learning_rate": 8.506184920068466e-08, + "loss": 0.9332, + "step": 12985 + }, + { + "epoch": 1.8384653500389325, + "grad_norm": 8.257211019071546, + "learning_rate": 8.491366348131469e-08, + "loss": 0.8506, + "step": 12986 + }, + { + "epoch": 1.8386069229135698, + "grad_norm": 11.278641924978158, + "learning_rate": 8.476560472123251e-08, + "loss": 1.0233, + "step": 12987 + }, + { + "epoch": 1.838748495788207, + "grad_norm": 7.954259525461102, + "learning_rate": 8.46176729282222e-08, + "loss": 0.9463, + "step": 12988 + }, + { + "epoch": 1.8388900686628442, + "grad_norm": 10.64621725466356, + "learning_rate": 8.44698681100592e-08, + "loss": 0.9825, + "step": 12989 + }, + { + "epoch": 1.8390316415374814, + "grad_norm": 9.534412897335368, + "learning_rate": 8.432219027451421e-08, + "loss": 0.9314, + "step": 12990 + }, + { + "epoch": 1.8391732144121187, + "grad_norm": 8.572839439170938, + "learning_rate": 8.41746394293505e-08, + "loss": 0.9599, + "step": 12991 + }, + { + "epoch": 1.839314787286756, + "grad_norm": 9.203549023948884, + "learning_rate": 8.402721558232463e-08, + "loss": 0.8987, + "step": 12992 + }, + { + "epoch": 1.839456360161393, + "grad_norm": 11.394045804066097, + "learning_rate": 8.387991874118678e-08, + "loss": 1.0777, + "step": 12993 + }, + { + "epoch": 1.8395979330360301, + "grad_norm": 10.181777938540307, + "learning_rate": 8.373274891367993e-08, + "loss": 0.9484, + "step": 12994 + }, + { + "epoch": 1.8397395059106674, + "grad_norm": 9.813767661075252, + "learning_rate": 8.358570610754097e-08, + "loss": 0.9506, + "step": 12995 + }, + { + "epoch": 1.8398810787853046, + "grad_norm": 7.955160363940856, + "learning_rate": 8.343879033049951e-08, + "loss": 0.9218, + "step": 12996 + }, + { + "epoch": 1.8400226516599418, + "grad_norm": 9.680444917148453, + "learning_rate": 8.329200159027939e-08, + "loss": 0.953, + "step": 12997 + }, + { + "epoch": 1.840164224534579, + "grad_norm": 9.578605702022827, + "learning_rate": 8.314533989459612e-08, + "loss": 0.9916, + "step": 12998 + }, + { + "epoch": 1.8403057974092163, + "grad_norm": 12.50054075978753, + "learning_rate": 
8.299880525116072e-08, + "loss": 0.9955, + "step": 12999 + }, + { + "epoch": 1.8404473702838535, + "grad_norm": 8.974499072703559, + "learning_rate": 8.285239766767595e-08, + "loss": 0.9163, + "step": 13000 + }, + { + "epoch": 1.8405889431584908, + "grad_norm": 11.357965814844306, + "learning_rate": 8.270611715183813e-08, + "loss": 1.0176, + "step": 13001 + }, + { + "epoch": 1.840730516033128, + "grad_norm": 9.909857156380372, + "learning_rate": 8.25599637113375e-08, + "loss": 0.8938, + "step": 13002 + }, + { + "epoch": 1.8408720889077652, + "grad_norm": 9.262306069262834, + "learning_rate": 8.241393735385684e-08, + "loss": 0.9781, + "step": 13003 + }, + { + "epoch": 1.8410136617824024, + "grad_norm": 10.71959912020275, + "learning_rate": 8.226803808707301e-08, + "loss": 0.9523, + "step": 13004 + }, + { + "epoch": 1.8411552346570397, + "grad_norm": 8.311560097238063, + "learning_rate": 8.212226591865547e-08, + "loss": 0.935, + "step": 13005 + }, + { + "epoch": 1.841296807531677, + "grad_norm": 9.355656721102237, + "learning_rate": 8.197662085626778e-08, + "loss": 0.9374, + "step": 13006 + }, + { + "epoch": 1.8414383804063141, + "grad_norm": 8.672278812388626, + "learning_rate": 8.183110290756608e-08, + "loss": 1.0015, + "step": 13007 + }, + { + "epoch": 1.8415799532809514, + "grad_norm": 9.393771896286024, + "learning_rate": 8.168571208020032e-08, + "loss": 0.9502, + "step": 13008 + }, + { + "epoch": 1.8417215261555886, + "grad_norm": 8.708969263344706, + "learning_rate": 8.154044838181385e-08, + "loss": 0.847, + "step": 13009 + }, + { + "epoch": 1.8418630990302258, + "grad_norm": 9.951260618164163, + "learning_rate": 8.139531182004223e-08, + "loss": 0.8045, + "step": 13010 + }, + { + "epoch": 1.842004671904863, + "grad_norm": 9.291207158197635, + "learning_rate": 8.125030240251575e-08, + "loss": 0.9115, + "step": 13011 + }, + { + "epoch": 1.8421462447795003, + "grad_norm": 10.611929719337763, + "learning_rate": 8.110542013685745e-08, + "loss": 1.007, + "step": 13012 + }, + { + "epoch": 1.8422878176541375, + "grad_norm": 12.28694156840785, + "learning_rate": 8.09606650306835e-08, + "loss": 0.9937, + "step": 13013 + }, + { + "epoch": 1.8424293905287747, + "grad_norm": 9.095824539196629, + "learning_rate": 8.081603709160362e-08, + "loss": 0.9475, + "step": 13014 + }, + { + "epoch": 1.842570963403412, + "grad_norm": 9.112774746158205, + "learning_rate": 8.067153632722092e-08, + "loss": 1.1256, + "step": 13015 + }, + { + "epoch": 1.8427125362780492, + "grad_norm": 10.620747871000209, + "learning_rate": 8.052716274513178e-08, + "loss": 0.9661, + "step": 13016 + }, + { + "epoch": 1.8428541091526864, + "grad_norm": 8.323687145742284, + "learning_rate": 8.038291635292545e-08, + "loss": 0.8972, + "step": 13017 + }, + { + "epoch": 1.8429956820273237, + "grad_norm": 8.970315085568343, + "learning_rate": 8.023879715818556e-08, + "loss": 0.8606, + "step": 13018 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 8.889796427915837, + "learning_rate": 8.009480516848717e-08, + "loss": 1.0637, + "step": 13019 + }, + { + "epoch": 1.8432788277765981, + "grad_norm": 8.73340406122275, + "learning_rate": 7.995094039140116e-08, + "loss": 0.9445, + "step": 13020 + }, + { + "epoch": 1.8434204006512354, + "grad_norm": 7.806131681793236, + "learning_rate": 7.980720283448957e-08, + "loss": 0.9272, + "step": 13021 + }, + { + "epoch": 1.8435619735258726, + "grad_norm": 10.588864620679505, + "learning_rate": 7.966359250530824e-08, + "loss": 1.0178, + "step": 13022 + }, + { + "epoch": 1.8437035464005098, + "grad_norm": 
8.482839877523052, + "learning_rate": 7.952010941140786e-08, + "loss": 0.9035, + "step": 13023 + }, + { + "epoch": 1.8438451192751468, + "grad_norm": 9.067777373028983, + "learning_rate": 7.937675356032987e-08, + "loss": 0.916, + "step": 13024 + }, + { + "epoch": 1.843986692149784, + "grad_norm": 10.669700389827796, + "learning_rate": 7.923352495961157e-08, + "loss": 1.0147, + "step": 13025 + }, + { + "epoch": 1.8441282650244213, + "grad_norm": 9.504766372787367, + "learning_rate": 7.909042361678165e-08, + "loss": 0.9642, + "step": 13026 + }, + { + "epoch": 1.8442698378990585, + "grad_norm": 10.146428920385032, + "learning_rate": 7.894744953936329e-08, + "loss": 0.8746, + "step": 13027 + }, + { + "epoch": 1.8444114107736957, + "grad_norm": 9.618806109348409, + "learning_rate": 7.880460273487184e-08, + "loss": 0.9346, + "step": 13028 + }, + { + "epoch": 1.844552983648333, + "grad_norm": 10.186998975170125, + "learning_rate": 7.866188321081741e-08, + "loss": 0.9592, + "step": 13029 + }, + { + "epoch": 1.8446945565229702, + "grad_norm": 8.145782184358941, + "learning_rate": 7.851929097470234e-08, + "loss": 0.9785, + "step": 13030 + }, + { + "epoch": 1.8448361293976074, + "grad_norm": 9.848632376750734, + "learning_rate": 7.837682603402258e-08, + "loss": 0.9813, + "step": 13031 + }, + { + "epoch": 1.8449777022722447, + "grad_norm": 9.158348285731106, + "learning_rate": 7.823448839626768e-08, + "loss": 0.9978, + "step": 13032 + }, + { + "epoch": 1.845119275146882, + "grad_norm": 11.895150845931335, + "learning_rate": 7.809227806891972e-08, + "loss": 0.988, + "step": 13033 + }, + { + "epoch": 1.8452608480215191, + "grad_norm": 9.049581646601991, + "learning_rate": 7.795019505945495e-08, + "loss": 1.0043, + "step": 13034 + }, + { + "epoch": 1.8454024208961561, + "grad_norm": 9.594776210510862, + "learning_rate": 7.78082393753421e-08, + "loss": 1.0956, + "step": 13035 + }, + { + "epoch": 1.8455439937707934, + "grad_norm": 11.50632999191369, + "learning_rate": 7.766641102404438e-08, + "loss": 0.9045, + "step": 13036 + }, + { + "epoch": 1.8456855666454306, + "grad_norm": 8.310621465220493, + "learning_rate": 7.75247100130172e-08, + "loss": 0.9002, + "step": 13037 + }, + { + "epoch": 1.8458271395200678, + "grad_norm": 10.403922339032578, + "learning_rate": 7.738313634970962e-08, + "loss": 1.0398, + "step": 13038 + }, + { + "epoch": 1.845968712394705, + "grad_norm": 8.788950520119975, + "learning_rate": 7.724169004156457e-08, + "loss": 1.0229, + "step": 13039 + }, + { + "epoch": 1.8461102852693423, + "grad_norm": 10.149523388426031, + "learning_rate": 7.710037109601692e-08, + "loss": 0.9554, + "step": 13040 + }, + { + "epoch": 1.8462518581439795, + "grad_norm": 9.50049348854463, + "learning_rate": 7.695917952049658e-08, + "loss": 0.8966, + "step": 13041 + }, + { + "epoch": 1.8463934310186167, + "grad_norm": 9.207439350443197, + "learning_rate": 7.68181153224254e-08, + "loss": 0.9867, + "step": 13042 + }, + { + "epoch": 1.846535003893254, + "grad_norm": 9.527365771679142, + "learning_rate": 7.66771785092188e-08, + "loss": 0.9256, + "step": 13043 + }, + { + "epoch": 1.8466765767678912, + "grad_norm": 8.36289444750076, + "learning_rate": 7.653636908828644e-08, + "loss": 0.9432, + "step": 13044 + }, + { + "epoch": 1.8468181496425284, + "grad_norm": 9.657787015492788, + "learning_rate": 7.639568706702988e-08, + "loss": 0.9931, + "step": 13045 + }, + { + "epoch": 1.8469597225171657, + "grad_norm": 9.609125298457963, + "learning_rate": 7.625513245284515e-08, + "loss": 0.9211, + "step": 13046 + }, + { + 
"epoch": 1.847101295391803, + "grad_norm": 10.58248257158639, + "learning_rate": 7.611470525312054e-08, + "loss": 1.0482, + "step": 13047 + }, + { + "epoch": 1.8472428682664401, + "grad_norm": 8.186145932579763, + "learning_rate": 7.597440547523872e-08, + "loss": 0.8642, + "step": 13048 + }, + { + "epoch": 1.8473844411410774, + "grad_norm": 9.991028766986606, + "learning_rate": 7.58342331265749e-08, + "loss": 0.9589, + "step": 13049 + }, + { + "epoch": 1.8475260140157146, + "grad_norm": 10.968936407782149, + "learning_rate": 7.56941882144982e-08, + "loss": 1.0033, + "step": 13050 + }, + { + "epoch": 1.8476675868903518, + "grad_norm": 8.697067590870832, + "learning_rate": 7.555427074636995e-08, + "loss": 0.9412, + "step": 13051 + }, + { + "epoch": 1.847809159764989, + "grad_norm": 10.176391911259264, + "learning_rate": 7.541448072954622e-08, + "loss": 0.9544, + "step": 13052 + }, + { + "epoch": 1.8479507326396263, + "grad_norm": 8.703347660414748, + "learning_rate": 7.527481817137555e-08, + "loss": 0.9829, + "step": 13053 + }, + { + "epoch": 1.8480923055142635, + "grad_norm": 9.00740255463527, + "learning_rate": 7.513528307919931e-08, + "loss": 1.0087, + "step": 13054 + }, + { + "epoch": 1.8482338783889007, + "grad_norm": 10.014287279034452, + "learning_rate": 7.499587546035358e-08, + "loss": 0.9304, + "step": 13055 + }, + { + "epoch": 1.848375451263538, + "grad_norm": 10.069911715017277, + "learning_rate": 7.48565953221661e-08, + "loss": 0.9799, + "step": 13056 + }, + { + "epoch": 1.8485170241381752, + "grad_norm": 9.758388334315292, + "learning_rate": 7.471744267195962e-08, + "loss": 1.0266, + "step": 13057 + }, + { + "epoch": 1.8486585970128124, + "grad_norm": 8.514414962050893, + "learning_rate": 7.457841751704831e-08, + "loss": 0.9963, + "step": 13058 + }, + { + "epoch": 1.8488001698874497, + "grad_norm": 9.618668492537621, + "learning_rate": 7.44395198647413e-08, + "loss": 1.0008, + "step": 13059 + }, + { + "epoch": 1.848941742762087, + "grad_norm": 9.376238931807768, + "learning_rate": 7.430074972234053e-08, + "loss": 1.0124, + "step": 13060 + }, + { + "epoch": 1.8490833156367241, + "grad_norm": 8.033866248387053, + "learning_rate": 7.416210709714016e-08, + "loss": 0.9281, + "step": 13061 + }, + { + "epoch": 1.8492248885113614, + "grad_norm": 11.12034423789292, + "learning_rate": 7.402359199642967e-08, + "loss": 0.9565, + "step": 13062 + }, + { + "epoch": 1.8493664613859986, + "grad_norm": 10.652145714032844, + "learning_rate": 7.388520442748959e-08, + "loss": 1.0056, + "step": 13063 + }, + { + "epoch": 1.8495080342606358, + "grad_norm": 8.784513145508594, + "learning_rate": 7.374694439759523e-08, + "loss": 0.8987, + "step": 13064 + }, + { + "epoch": 1.849649607135273, + "grad_norm": 9.246618116600159, + "learning_rate": 7.36088119140152e-08, + "loss": 0.9565, + "step": 13065 + }, + { + "epoch": 1.84979118000991, + "grad_norm": 8.750112042390915, + "learning_rate": 7.347080698401038e-08, + "loss": 0.9175, + "step": 13066 + }, + { + "epoch": 1.8499327528845473, + "grad_norm": 11.31989868334394, + "learning_rate": 7.333292961483634e-08, + "loss": 0.9148, + "step": 13067 + }, + { + "epoch": 1.8500743257591845, + "grad_norm": 10.13199100973195, + "learning_rate": 7.319517981374036e-08, + "loss": 0.9726, + "step": 13068 + }, + { + "epoch": 1.8502158986338217, + "grad_norm": 7.541616815504121, + "learning_rate": 7.305755758796468e-08, + "loss": 0.9055, + "step": 13069 + }, + { + "epoch": 1.850357471508459, + "grad_norm": 9.850850768550796, + "learning_rate": 7.292006294474325e-08, + 
"loss": 0.9288, + "step": 13070 + }, + { + "epoch": 1.8504990443830962, + "grad_norm": 9.684367405561318, + "learning_rate": 7.278269589130472e-08, + "loss": 0.8569, + "step": 13071 + }, + { + "epoch": 1.8506406172577334, + "grad_norm": 10.889882745172846, + "learning_rate": 7.264545643486997e-08, + "loss": 0.8868, + "step": 13072 + }, + { + "epoch": 1.8507821901323707, + "grad_norm": 8.861116537519631, + "learning_rate": 7.250834458265355e-08, + "loss": 1.0099, + "step": 13073 + }, + { + "epoch": 1.850923763007008, + "grad_norm": 10.910132823663535, + "learning_rate": 7.237136034186382e-08, + "loss": 1.0168, + "step": 13074 + }, + { + "epoch": 1.8510653358816451, + "grad_norm": 8.902057504173808, + "learning_rate": 7.223450371970114e-08, + "loss": 0.9637, + "step": 13075 + }, + { + "epoch": 1.8512069087562821, + "grad_norm": 9.516098035758768, + "learning_rate": 7.209777472336061e-08, + "loss": 0.8819, + "step": 13076 + }, + { + "epoch": 1.8513484816309194, + "grad_norm": 9.765888277701064, + "learning_rate": 7.19611733600295e-08, + "loss": 0.9684, + "step": 13077 + }, + { + "epoch": 1.8514900545055566, + "grad_norm": 8.471118553552408, + "learning_rate": 7.182469963688932e-08, + "loss": 0.9412, + "step": 13078 + }, + { + "epoch": 1.8516316273801938, + "grad_norm": 8.240771594170363, + "learning_rate": 7.168835356111376e-08, + "loss": 0.9273, + "step": 13079 + }, + { + "epoch": 1.851773200254831, + "grad_norm": 9.998574727531482, + "learning_rate": 7.155213513987124e-08, + "loss": 0.946, + "step": 13080 + }, + { + "epoch": 1.8519147731294683, + "grad_norm": 8.346526523343876, + "learning_rate": 7.141604438032218e-08, + "loss": 0.9134, + "step": 13081 + }, + { + "epoch": 1.8520563460041055, + "grad_norm": 8.766702895095676, + "learning_rate": 7.128008128962055e-08, + "loss": 0.9002, + "step": 13082 + }, + { + "epoch": 1.8521979188787427, + "grad_norm": 9.53327975313651, + "learning_rate": 7.11442458749148e-08, + "loss": 0.9651, + "step": 13083 + }, + { + "epoch": 1.85233949175338, + "grad_norm": 8.867388402872761, + "learning_rate": 7.100853814334451e-08, + "loss": 0.9324, + "step": 13084 + }, + { + "epoch": 1.8524810646280172, + "grad_norm": 10.591889614717081, + "learning_rate": 7.087295810204425e-08, + "loss": 0.933, + "step": 13085 + }, + { + "epoch": 1.8526226375026544, + "grad_norm": 9.105956808697012, + "learning_rate": 7.073750575814136e-08, + "loss": 0.8856, + "step": 13086 + }, + { + "epoch": 1.8527642103772917, + "grad_norm": 8.476567000286824, + "learning_rate": 7.060218111875628e-08, + "loss": 0.9151, + "step": 13087 + }, + { + "epoch": 1.852905783251929, + "grad_norm": 8.323353959214463, + "learning_rate": 7.046698419100356e-08, + "loss": 0.974, + "step": 13088 + }, + { + "epoch": 1.8530473561265661, + "grad_norm": 9.058255589446043, + "learning_rate": 7.033191498198949e-08, + "loss": 1.0201, + "step": 13089 + }, + { + "epoch": 1.8531889290012034, + "grad_norm": 7.393273941411471, + "learning_rate": 7.019697349881532e-08, + "loss": 0.9247, + "step": 13090 + }, + { + "epoch": 1.8533305018758406, + "grad_norm": 10.97521642936424, + "learning_rate": 7.006215974857428e-08, + "loss": 0.9166, + "step": 13091 + }, + { + "epoch": 1.8534720747504778, + "grad_norm": 9.561019882642702, + "learning_rate": 6.992747373835401e-08, + "loss": 1.006, + "step": 13092 + }, + { + "epoch": 1.853613647625115, + "grad_norm": 9.265343390008038, + "learning_rate": 6.979291547523415e-08, + "loss": 0.8939, + "step": 13093 + }, + { + "epoch": 1.8537552204997523, + "grad_norm": 9.89323884917158, + 
"learning_rate": 6.965848496628902e-08, + "loss": 1.0408, + "step": 13094 + }, + { + "epoch": 1.8538967933743895, + "grad_norm": 7.123465121735079, + "learning_rate": 6.952418221858492e-08, + "loss": 0.9842, + "step": 13095 + }, + { + "epoch": 1.8540383662490267, + "grad_norm": 8.85638926416064, + "learning_rate": 6.939000723918232e-08, + "loss": 0.8615, + "step": 13096 + }, + { + "epoch": 1.854179939123664, + "grad_norm": 8.404809991525825, + "learning_rate": 6.925596003513501e-08, + "loss": 0.8953, + "step": 13097 + }, + { + "epoch": 1.8543215119983012, + "grad_norm": 11.017442225994426, + "learning_rate": 6.912204061348904e-08, + "loss": 1.019, + "step": 13098 + }, + { + "epoch": 1.8544630848729384, + "grad_norm": 8.331790883048352, + "learning_rate": 6.898824898128515e-08, + "loss": 0.9597, + "step": 13099 + }, + { + "epoch": 1.8546046577475757, + "grad_norm": 9.624311942736153, + "learning_rate": 6.885458514555632e-08, + "loss": 0.8911, + "step": 13100 + }, + { + "epoch": 1.8547462306222129, + "grad_norm": 8.739358152656873, + "learning_rate": 6.872104911332916e-08, + "loss": 0.948, + "step": 13101 + }, + { + "epoch": 1.8548878034968501, + "grad_norm": 8.46199695438699, + "learning_rate": 6.858764089162334e-08, + "loss": 0.8473, + "step": 13102 + }, + { + "epoch": 1.8550293763714873, + "grad_norm": 12.582404769758405, + "learning_rate": 6.845436048745241e-08, + "loss": 1.0742, + "step": 13103 + }, + { + "epoch": 1.8551709492461246, + "grad_norm": 9.67196058688404, + "learning_rate": 6.832120790782326e-08, + "loss": 1.0029, + "step": 13104 + }, + { + "epoch": 1.8553125221207618, + "grad_norm": 10.604959414564975, + "learning_rate": 6.818818315973475e-08, + "loss": 1.0182, + "step": 13105 + }, + { + "epoch": 1.855454094995399, + "grad_norm": 11.162167302179085, + "learning_rate": 6.805528625018016e-08, + "loss": 1.0216, + "step": 13106 + }, + { + "epoch": 1.855595667870036, + "grad_norm": 8.546121121202397, + "learning_rate": 6.792251718614584e-08, + "loss": 0.8667, + "step": 13107 + }, + { + "epoch": 1.8557372407446733, + "grad_norm": 9.910491131758116, + "learning_rate": 6.778987597461123e-08, + "loss": 0.9804, + "step": 13108 + }, + { + "epoch": 1.8558788136193105, + "grad_norm": 9.757001661417222, + "learning_rate": 6.765736262254935e-08, + "loss": 0.9271, + "step": 13109 + }, + { + "epoch": 1.8560203864939477, + "grad_norm": 9.900943718829913, + "learning_rate": 6.752497713692629e-08, + "loss": 0.9063, + "step": 13110 + }, + { + "epoch": 1.856161959368585, + "grad_norm": 10.366774274097882, + "learning_rate": 6.73927195247015e-08, + "loss": 0.889, + "step": 13111 + }, + { + "epoch": 1.8563035322432222, + "grad_norm": 8.834075704604727, + "learning_rate": 6.726058979282774e-08, + "loss": 0.9885, + "step": 13112 + }, + { + "epoch": 1.8564451051178594, + "grad_norm": 7.936539719533101, + "learning_rate": 6.712858794825083e-08, + "loss": 1.0252, + "step": 13113 + }, + { + "epoch": 1.8565866779924967, + "grad_norm": 10.335155777553092, + "learning_rate": 6.699671399790969e-08, + "loss": 0.9567, + "step": 13114 + }, + { + "epoch": 1.856728250867134, + "grad_norm": 10.39396937190981, + "learning_rate": 6.686496794873792e-08, + "loss": 1.0478, + "step": 13115 + }, + { + "epoch": 1.8568698237417711, + "grad_norm": 9.59702029079412, + "learning_rate": 6.673334980765972e-08, + "loss": 0.9651, + "step": 13116 + }, + { + "epoch": 1.8570113966164083, + "grad_norm": 10.175292394687379, + "learning_rate": 6.660185958159537e-08, + "loss": 0.9328, + "step": 13117 + }, + { + "epoch": 
1.8571529694910454, + "grad_norm": 9.272263000207637, + "learning_rate": 6.647049727745685e-08, + "loss": 0.9154, + "step": 13118 + }, + { + "epoch": 1.8572945423656826, + "grad_norm": 9.10779695272305, + "learning_rate": 6.633926290214976e-08, + "loss": 0.8968, + "step": 13119 + }, + { + "epoch": 1.8574361152403198, + "grad_norm": 8.995993252125368, + "learning_rate": 6.620815646257301e-08, + "loss": 0.9513, + "step": 13120 + }, + { + "epoch": 1.857577688114957, + "grad_norm": 11.671939058140817, + "learning_rate": 6.607717796561858e-08, + "loss": 1.0231, + "step": 13121 + }, + { + "epoch": 1.8577192609895943, + "grad_norm": 9.819108232146613, + "learning_rate": 6.594632741817237e-08, + "loss": 0.9822, + "step": 13122 + }, + { + "epoch": 1.8578608338642315, + "grad_norm": 8.408019152676648, + "learning_rate": 6.581560482711247e-08, + "loss": 0.8869, + "step": 13123 + }, + { + "epoch": 1.8580024067388687, + "grad_norm": 10.087067556270535, + "learning_rate": 6.568501019931173e-08, + "loss": 0.9617, + "step": 13124 + }, + { + "epoch": 1.858143979613506, + "grad_norm": 9.660540865018246, + "learning_rate": 6.555454354163437e-08, + "loss": 0.963, + "step": 13125 + }, + { + "epoch": 1.8582855524881432, + "grad_norm": 8.703024187308847, + "learning_rate": 6.542420486093992e-08, + "loss": 0.901, + "step": 13126 + }, + { + "epoch": 1.8584271253627804, + "grad_norm": 9.044448869825443, + "learning_rate": 6.529399416407955e-08, + "loss": 1.0308, + "step": 13127 + }, + { + "epoch": 1.8585686982374177, + "grad_norm": 10.277133662571455, + "learning_rate": 6.516391145789836e-08, + "loss": 1.0744, + "step": 13128 + }, + { + "epoch": 1.858710271112055, + "grad_norm": 10.794214917426546, + "learning_rate": 6.503395674923529e-08, + "loss": 1.0107, + "step": 13129 + }, + { + "epoch": 1.8588518439866921, + "grad_norm": 8.803718249528993, + "learning_rate": 6.490413004492102e-08, + "loss": 1.1059, + "step": 13130 + }, + { + "epoch": 1.8589934168613294, + "grad_norm": 10.4164680970657, + "learning_rate": 6.477443135178118e-08, + "loss": 1.0361, + "step": 13131 + }, + { + "epoch": 1.8591349897359666, + "grad_norm": 9.898872588726054, + "learning_rate": 6.464486067663366e-08, + "loss": 1.0363, + "step": 13132 + }, + { + "epoch": 1.8592765626106038, + "grad_norm": 7.72906126651377, + "learning_rate": 6.451541802628969e-08, + "loss": 0.9623, + "step": 13133 + }, + { + "epoch": 1.859418135485241, + "grad_norm": 9.461080233491856, + "learning_rate": 6.438610340755464e-08, + "loss": 0.9553, + "step": 13134 + }, + { + "epoch": 1.8595597083598783, + "grad_norm": 10.32943869588143, + "learning_rate": 6.425691682722584e-08, + "loss": 0.9445, + "step": 13135 + }, + { + "epoch": 1.8597012812345155, + "grad_norm": 9.319100927119727, + "learning_rate": 6.412785829209511e-08, + "loss": 0.9638, + "step": 13136 + }, + { + "epoch": 1.8598428541091527, + "grad_norm": 8.89988679278093, + "learning_rate": 6.399892780894618e-08, + "loss": 0.8734, + "step": 13137 + }, + { + "epoch": 1.85998442698379, + "grad_norm": 9.4174486583893, + "learning_rate": 6.387012538455723e-08, + "loss": 0.9438, + "step": 13138 + }, + { + "epoch": 1.8601259998584272, + "grad_norm": 9.57461398867999, + "learning_rate": 6.37414510256995e-08, + "loss": 0.9536, + "step": 13139 + }, + { + "epoch": 1.8602675727330644, + "grad_norm": 9.097639897070962, + "learning_rate": 6.361290473913705e-08, + "loss": 0.9459, + "step": 13140 + }, + { + "epoch": 1.8604091456077017, + "grad_norm": 9.249057618671001, + "learning_rate": 6.34844865316278e-08, + "loss": 0.9651, + 
"step": 13141 + }, + { + "epoch": 1.8605507184823389, + "grad_norm": 8.672152128173241, + "learning_rate": 6.335619640992191e-08, + "loss": 0.8411, + "step": 13142 + }, + { + "epoch": 1.8606922913569761, + "grad_norm": 9.00859676497001, + "learning_rate": 6.322803438076453e-08, + "loss": 0.9454, + "step": 13143 + }, + { + "epoch": 1.8608338642316133, + "grad_norm": 8.298758485160267, + "learning_rate": 6.310000045089193e-08, + "loss": 1.0268, + "step": 13144 + }, + { + "epoch": 1.8609754371062506, + "grad_norm": 9.915302172051335, + "learning_rate": 6.297209462703569e-08, + "loss": 1.0177, + "step": 13145 + }, + { + "epoch": 1.8611170099808878, + "grad_norm": 7.44745079308549, + "learning_rate": 6.284431691591875e-08, + "loss": 0.8253, + "step": 13146 + }, + { + "epoch": 1.861258582855525, + "grad_norm": 9.71640447543101, + "learning_rate": 6.271666732425935e-08, + "loss": 0.9073, + "step": 13147 + }, + { + "epoch": 1.8614001557301623, + "grad_norm": 10.034096002160155, + "learning_rate": 6.258914585876741e-08, + "loss": 0.8306, + "step": 13148 + }, + { + "epoch": 1.8615417286047993, + "grad_norm": 8.227819044290287, + "learning_rate": 6.246175252614645e-08, + "loss": 0.9639, + "step": 13149 + }, + { + "epoch": 1.8616833014794365, + "grad_norm": 10.147896203809006, + "learning_rate": 6.233448733309388e-08, + "loss": 1.0243, + "step": 13150 + }, + { + "epoch": 1.8618248743540737, + "grad_norm": 10.211489276527146, + "learning_rate": 6.220735028629937e-08, + "loss": 1.0547, + "step": 13151 + }, + { + "epoch": 1.861966447228711, + "grad_norm": 9.224214647559856, + "learning_rate": 6.2080341392447e-08, + "loss": 0.9154, + "step": 13152 + }, + { + "epoch": 1.8621080201033482, + "grad_norm": 10.40983341138714, + "learning_rate": 6.195346065821312e-08, + "loss": 0.9972, + "step": 13153 + }, + { + "epoch": 1.8622495929779854, + "grad_norm": 9.637994095027308, + "learning_rate": 6.18267080902682e-08, + "loss": 0.9512, + "step": 13154 + }, + { + "epoch": 1.8623911658526227, + "grad_norm": 11.425861377924164, + "learning_rate": 6.170008369527496e-08, + "loss": 1.0542, + "step": 13155 + }, + { + "epoch": 1.8625327387272599, + "grad_norm": 9.46062097878354, + "learning_rate": 6.157358747989034e-08, + "loss": 0.96, + "step": 13156 + }, + { + "epoch": 1.8626743116018971, + "grad_norm": 9.566724853693113, + "learning_rate": 6.144721945076426e-08, + "loss": 0.9594, + "step": 13157 + }, + { + "epoch": 1.8628158844765343, + "grad_norm": 10.108588964925639, + "learning_rate": 6.132097961453948e-08, + "loss": 0.9837, + "step": 13158 + }, + { + "epoch": 1.8629574573511714, + "grad_norm": 8.910304809562712, + "learning_rate": 6.119486797785263e-08, + "loss": 0.9431, + "step": 13159 + }, + { + "epoch": 1.8630990302258086, + "grad_norm": 10.404625201608058, + "learning_rate": 6.106888454733284e-08, + "loss": 0.9828, + "step": 13160 + }, + { + "epoch": 1.8632406031004458, + "grad_norm": 8.827154865804856, + "learning_rate": 6.094302932960317e-08, + "loss": 0.9122, + "step": 13161 + }, + { + "epoch": 1.863382175975083, + "grad_norm": 10.316580040423704, + "learning_rate": 6.081730233127996e-08, + "loss": 0.991, + "step": 13162 + }, + { + "epoch": 1.8635237488497203, + "grad_norm": 9.8298516871769, + "learning_rate": 6.069170355897241e-08, + "loss": 0.9367, + "step": 13163 + }, + { + "epoch": 1.8636653217243575, + "grad_norm": 9.032865914314048, + "learning_rate": 6.056623301928327e-08, + "loss": 0.8885, + "step": 13164 + }, + { + "epoch": 1.8638068945989947, + "grad_norm": 8.356191895424857, + "learning_rate": 
6.04408907188081e-08, + "loss": 0.9255, + "step": 13165 + }, + { + "epoch": 1.863948467473632, + "grad_norm": 9.652427534035398, + "learning_rate": 6.031567666413663e-08, + "loss": 0.914, + "step": 13166 + }, + { + "epoch": 1.8640900403482692, + "grad_norm": 9.536217967569742, + "learning_rate": 6.019059086185053e-08, + "loss": 0.9193, + "step": 13167 + }, + { + "epoch": 1.8642316132229064, + "grad_norm": 9.854806848314082, + "learning_rate": 6.006563331852622e-08, + "loss": 0.9535, + "step": 13168 + }, + { + "epoch": 1.8643731860975437, + "grad_norm": 10.001856822239464, + "learning_rate": 5.994080404073233e-08, + "loss": 0.939, + "step": 13169 + }, + { + "epoch": 1.8645147589721809, + "grad_norm": 8.67311189095626, + "learning_rate": 5.981610303503054e-08, + "loss": 0.9379, + "step": 13170 + }, + { + "epoch": 1.8646563318468181, + "grad_norm": 9.865987407225454, + "learning_rate": 5.969153030797731e-08, + "loss": 1.0385, + "step": 13171 + }, + { + "epoch": 1.8647979047214553, + "grad_norm": 9.573757351948313, + "learning_rate": 5.9567085866120144e-08, + "loss": 1.0238, + "step": 13172 + }, + { + "epoch": 1.8649394775960926, + "grad_norm": 8.419898542576695, + "learning_rate": 5.944276971600216e-08, + "loss": 0.8576, + "step": 13173 + }, + { + "epoch": 1.8650810504707298, + "grad_norm": 11.405991141112324, + "learning_rate": 5.9318581864157563e-08, + "loss": 1.0316, + "step": 13174 + }, + { + "epoch": 1.865222623345367, + "grad_norm": 11.364754894347424, + "learning_rate": 5.919452231711559e-08, + "loss": 1.0169, + "step": 13175 + }, + { + "epoch": 1.8653641962200043, + "grad_norm": 9.476643812769453, + "learning_rate": 5.9070591081397397e-08, + "loss": 0.8696, + "step": 13176 + }, + { + "epoch": 1.8655057690946415, + "grad_norm": 9.397357780352579, + "learning_rate": 5.894678816351862e-08, + "loss": 0.9797, + "step": 13177 + }, + { + "epoch": 1.8656473419692787, + "grad_norm": 8.829254024182063, + "learning_rate": 5.8823113569986545e-08, + "loss": 0.8938, + "step": 13178 + }, + { + "epoch": 1.865788914843916, + "grad_norm": 9.301128638908265, + "learning_rate": 5.8699567307303474e-08, + "loss": 0.8945, + "step": 13179 + }, + { + "epoch": 1.8659304877185532, + "grad_norm": 8.03930972120474, + "learning_rate": 5.8576149381963935e-08, + "loss": 0.9437, + "step": 13180 + }, + { + "epoch": 1.8660720605931904, + "grad_norm": 8.77202558833268, + "learning_rate": 5.845285980045551e-08, + "loss": 0.9612, + "step": 13181 + }, + { + "epoch": 1.8662136334678276, + "grad_norm": 10.103910357708127, + "learning_rate": 5.8329698569259963e-08, + "loss": 0.9519, + "step": 13182 + }, + { + "epoch": 1.8663552063424649, + "grad_norm": 10.04034372903964, + "learning_rate": 5.820666569485156e-08, + "loss": 1.0175, + "step": 13183 + }, + { + "epoch": 1.866496779217102, + "grad_norm": 9.829998765636844, + "learning_rate": 5.808376118369791e-08, + "loss": 0.9274, + "step": 13184 + }, + { + "epoch": 1.8666383520917393, + "grad_norm": 7.857430128320299, + "learning_rate": 5.796098504226022e-08, + "loss": 0.9941, + "step": 13185 + }, + { + "epoch": 1.8667799249663766, + "grad_norm": 9.923344733048326, + "learning_rate": 5.7838337276992787e-08, + "loss": 0.9607, + "step": 13186 + }, + { + "epoch": 1.8669214978410138, + "grad_norm": 9.198758447878815, + "learning_rate": 5.7715817894342944e-08, + "loss": 0.9882, + "step": 13187 + }, + { + "epoch": 1.867063070715651, + "grad_norm": 10.3930785989365, + "learning_rate": 5.759342690075137e-08, + "loss": 0.9949, + "step": 13188 + }, + { + "epoch": 1.8672046435902883, + 
"grad_norm": 9.879566923217435, + "learning_rate": 5.7471164302652646e-08, + "loss": 1.0297, + "step": 13189 + }, + { + "epoch": 1.8673462164649253, + "grad_norm": 10.355206463577648, + "learning_rate": 5.73490301064733e-08, + "loss": 1.0071, + "step": 13190 + }, + { + "epoch": 1.8674877893395625, + "grad_norm": 8.1367267200213, + "learning_rate": 5.722702431863403e-08, + "loss": 0.9257, + "step": 13191 + }, + { + "epoch": 1.8676293622141997, + "grad_norm": 10.68796435680271, + "learning_rate": 5.710514694554886e-08, + "loss": 1.1202, + "step": 13192 + }, + { + "epoch": 1.867770935088837, + "grad_norm": 9.41941059836059, + "learning_rate": 5.6983397993624346e-08, + "loss": 0.9225, + "step": 13193 + }, + { + "epoch": 1.8679125079634742, + "grad_norm": 7.770646434782907, + "learning_rate": 5.686177746926147e-08, + "loss": 0.8764, + "step": 13194 + }, + { + "epoch": 1.8680540808381114, + "grad_norm": 9.625136733632061, + "learning_rate": 5.67402853788529e-08, + "loss": 0.952, + "step": 13195 + }, + { + "epoch": 1.8681956537127486, + "grad_norm": 10.000700735336194, + "learning_rate": 5.6618921728786026e-08, + "loss": 1.1599, + "step": 13196 + }, + { + "epoch": 1.8683372265873859, + "grad_norm": 9.374472641417622, + "learning_rate": 5.6497686525440186e-08, + "loss": 1.0372, + "step": 13197 + }, + { + "epoch": 1.868478799462023, + "grad_norm": 10.731444201658988, + "learning_rate": 5.6376579775189456e-08, + "loss": 0.9827, + "step": 13198 + }, + { + "epoch": 1.8686203723366603, + "grad_norm": 8.170436170834437, + "learning_rate": 5.625560148439929e-08, + "loss": 0.9443, + "step": 13199 + }, + { + "epoch": 1.8687619452112976, + "grad_norm": 11.406104852458329, + "learning_rate": 5.6134751659430716e-08, + "loss": 1.1051, + "step": 13200 + }, + { + "epoch": 1.8689035180859346, + "grad_norm": 10.678383312946893, + "learning_rate": 5.6014030306635606e-08, + "loss": 1.012, + "step": 13201 + }, + { + "epoch": 1.8690450909605718, + "grad_norm": 9.765119908813004, + "learning_rate": 5.589343743236081e-08, + "loss": 0.9728, + "step": 13202 + }, + { + "epoch": 1.869186663835209, + "grad_norm": 9.20505346743727, + "learning_rate": 5.577297304294543e-08, + "loss": 0.9947, + "step": 13203 + }, + { + "epoch": 1.8693282367098463, + "grad_norm": 9.614592635183632, + "learning_rate": 5.5652637144722463e-08, + "loss": 1.0008, + "step": 13204 + }, + { + "epoch": 1.8694698095844835, + "grad_norm": 9.565069912915684, + "learning_rate": 5.5532429744017957e-08, + "loss": 0.9796, + "step": 13205 + }, + { + "epoch": 1.8696113824591207, + "grad_norm": 10.25259031734576, + "learning_rate": 5.5412350847150466e-08, + "loss": 1.0645, + "step": 13206 + }, + { + "epoch": 1.869752955333758, + "grad_norm": 8.77164289373295, + "learning_rate": 5.5292400460432993e-08, + "loss": 0.9238, + "step": 13207 + }, + { + "epoch": 1.8698945282083952, + "grad_norm": 9.03517060499292, + "learning_rate": 5.517257859017161e-08, + "loss": 1.0229, + "step": 13208 + }, + { + "epoch": 1.8700361010830324, + "grad_norm": 8.555108883029384, + "learning_rate": 5.505288524266461e-08, + "loss": 0.9477, + "step": 13209 + }, + { + "epoch": 1.8701776739576697, + "grad_norm": 11.169984542849646, + "learning_rate": 5.493332042420446e-08, + "loss": 0.9355, + "step": 13210 + }, + { + "epoch": 1.8703192468323069, + "grad_norm": 13.07381784846208, + "learning_rate": 5.481388414107669e-08, + "loss": 0.9079, + "step": 13211 + }, + { + "epoch": 1.8704608197069441, + "grad_norm": 10.264490816816762, + "learning_rate": 5.469457639955961e-08, + "loss": 0.8875, + 
"step": 13212 + }, + { + "epoch": 1.8706023925815813, + "grad_norm": 8.38332442882917, + "learning_rate": 5.457539720592514e-08, + "loss": 0.8591, + "step": 13213 + }, + { + "epoch": 1.8707439654562186, + "grad_norm": 12.444692094548785, + "learning_rate": 5.445634656643884e-08, + "loss": 1.017, + "step": 13214 + }, + { + "epoch": 1.8708855383308558, + "grad_norm": 12.509947823858582, + "learning_rate": 5.4337424487359016e-08, + "loss": 0.9404, + "step": 13215 + }, + { + "epoch": 1.871027111205493, + "grad_norm": 8.536342344646213, + "learning_rate": 5.421863097493707e-08, + "loss": 0.8926, + "step": 13216 + }, + { + "epoch": 1.8711686840801303, + "grad_norm": 8.701102824852281, + "learning_rate": 5.409996603541828e-08, + "loss": 0.8891, + "step": 13217 + }, + { + "epoch": 1.8713102569547675, + "grad_norm": 8.755106716409724, + "learning_rate": 5.398142967504017e-08, + "loss": 0.9509, + "step": 13218 + }, + { + "epoch": 1.8714518298294047, + "grad_norm": 8.799613025132972, + "learning_rate": 5.386302190003495e-08, + "loss": 0.8506, + "step": 13219 + }, + { + "epoch": 1.871593402704042, + "grad_norm": 8.32791154598639, + "learning_rate": 5.3744742716626276e-08, + "loss": 0.9666, + "step": 13220 + }, + { + "epoch": 1.8717349755786792, + "grad_norm": 8.370125789813716, + "learning_rate": 5.362659213103277e-08, + "loss": 0.9233, + "step": 13221 + }, + { + "epoch": 1.8718765484533164, + "grad_norm": 10.341494588063245, + "learning_rate": 5.350857014946531e-08, + "loss": 0.9327, + "step": 13222 + }, + { + "epoch": 1.8720181213279536, + "grad_norm": 11.31799452739178, + "learning_rate": 5.339067677812782e-08, + "loss": 0.8916, + "step": 13223 + }, + { + "epoch": 1.8721596942025909, + "grad_norm": 11.420302505895886, + "learning_rate": 5.327291202321866e-08, + "loss": 1.0188, + "step": 13224 + }, + { + "epoch": 1.872301267077228, + "grad_norm": 8.975636460202844, + "learning_rate": 5.315527589092762e-08, + "loss": 0.9627, + "step": 13225 + }, + { + "epoch": 1.8724428399518653, + "grad_norm": 8.845316502163023, + "learning_rate": 5.303776838743946e-08, + "loss": 0.982, + "step": 13226 + }, + { + "epoch": 1.8725844128265026, + "grad_norm": 6.9905358777357085, + "learning_rate": 5.292038951893119e-08, + "loss": 0.8401, + "step": 13227 + }, + { + "epoch": 1.8727259857011398, + "grad_norm": 9.857518427863276, + "learning_rate": 5.2803139291573716e-08, + "loss": 0.9373, + "step": 13228 + }, + { + "epoch": 1.872867558575777, + "grad_norm": 9.684638012850113, + "learning_rate": 5.268601771153042e-08, + "loss": 0.9604, + "step": 13229 + }, + { + "epoch": 1.8730091314504143, + "grad_norm": 9.080521127811814, + "learning_rate": 5.2569024784958065e-08, + "loss": 1.0529, + "step": 13230 + }, + { + "epoch": 1.8731507043250515, + "grad_norm": 9.921502438256224, + "learning_rate": 5.2452160518007555e-08, + "loss": 0.9141, + "step": 13231 + }, + { + "epoch": 1.8732922771996885, + "grad_norm": 10.730630857439502, + "learning_rate": 5.233542491682203e-08, + "loss": 1.0272, + "step": 13232 + }, + { + "epoch": 1.8734338500743257, + "grad_norm": 10.524612281888679, + "learning_rate": 5.2218817987537976e-08, + "loss": 0.9855, + "step": 13233 + }, + { + "epoch": 1.873575422948963, + "grad_norm": 7.96389562811062, + "learning_rate": 5.210233973628548e-08, + "loss": 0.9537, + "step": 13234 + }, + { + "epoch": 1.8737169958236002, + "grad_norm": 10.320140250975502, + "learning_rate": 5.198599016918771e-08, + "loss": 0.9, + "step": 13235 + }, + { + "epoch": 1.8738585686982374, + "grad_norm": 9.625916672688932, + 
"learning_rate": 5.1869769292361425e-08, + "loss": 1.0395, + "step": 13236 + }, + { + "epoch": 1.8740001415728746, + "grad_norm": 9.380123518596175, + "learning_rate": 5.1753677111915645e-08, + "loss": 0.9394, + "step": 13237 + }, + { + "epoch": 1.8741417144475119, + "grad_norm": 10.04042275560786, + "learning_rate": 5.163771363395381e-08, + "loss": 0.954, + "step": 13238 + }, + { + "epoch": 1.874283287322149, + "grad_norm": 9.243040198671528, + "learning_rate": 5.152187886457161e-08, + "loss": 0.9706, + "step": 13239 + }, + { + "epoch": 1.8744248601967863, + "grad_norm": 8.896870489357172, + "learning_rate": 5.14061728098586e-08, + "loss": 0.9961, + "step": 13240 + }, + { + "epoch": 1.8745664330714236, + "grad_norm": 9.231353110407133, + "learning_rate": 5.1290595475897434e-08, + "loss": 0.931, + "step": 13241 + }, + { + "epoch": 1.8747080059460606, + "grad_norm": 8.674727237161994, + "learning_rate": 5.117514686876379e-08, + "loss": 0.9229, + "step": 13242 + }, + { + "epoch": 1.8748495788206978, + "grad_norm": 9.290793293731998, + "learning_rate": 5.105982699452699e-08, + "loss": 0.9451, + "step": 13243 + }, + { + "epoch": 1.874991151695335, + "grad_norm": 9.217015196940114, + "learning_rate": 5.094463585924858e-08, + "loss": 1.0649, + "step": 13244 + }, + { + "epoch": 1.8751327245699723, + "grad_norm": 9.55927815732758, + "learning_rate": 5.082957346898482e-08, + "loss": 0.9931, + "step": 13245 + }, + { + "epoch": 1.8752742974446095, + "grad_norm": 12.106163258554439, + "learning_rate": 5.0714639829784195e-08, + "loss": 0.984, + "step": 13246 + }, + { + "epoch": 1.8754158703192467, + "grad_norm": 9.476625296061522, + "learning_rate": 5.0599834947688834e-08, + "loss": 0.9971, + "step": 13247 + }, + { + "epoch": 1.875557443193884, + "grad_norm": 9.128382225813448, + "learning_rate": 5.048515882873362e-08, + "loss": 0.9432, + "step": 13248 + }, + { + "epoch": 1.8756990160685212, + "grad_norm": 10.449945944322288, + "learning_rate": 5.037061147894734e-08, + "loss": 0.9985, + "step": 13249 + }, + { + "epoch": 1.8758405889431584, + "grad_norm": 9.169764001236832, + "learning_rate": 5.0256192904351295e-08, + "loss": 0.9875, + "step": 13250 + }, + { + "epoch": 1.8759821618177956, + "grad_norm": 9.817136798820611, + "learning_rate": 5.014190311096068e-08, + "loss": 1.0201, + "step": 13251 + }, + { + "epoch": 1.8761237346924329, + "grad_norm": 8.931950480151736, + "learning_rate": 5.002774210478345e-08, + "loss": 0.9613, + "step": 13252 + }, + { + "epoch": 1.87626530756707, + "grad_norm": 8.06730756919653, + "learning_rate": 4.9913709891821207e-08, + "loss": 0.9179, + "step": 13253 + }, + { + "epoch": 1.8764068804417073, + "grad_norm": 8.356303283475874, + "learning_rate": 4.9799806478068314e-08, + "loss": 0.921, + "step": 13254 + }, + { + "epoch": 1.8765484533163446, + "grad_norm": 8.775454308902573, + "learning_rate": 4.9686031869512486e-08, + "loss": 0.9654, + "step": 13255 + }, + { + "epoch": 1.8766900261909818, + "grad_norm": 9.1686719146087, + "learning_rate": 4.9572386072135046e-08, + "loss": 0.922, + "step": 13256 + }, + { + "epoch": 1.876831599065619, + "grad_norm": 9.622760859041078, + "learning_rate": 4.945886909191011e-08, + "loss": 1.0163, + "step": 13257 + }, + { + "epoch": 1.8769731719402563, + "grad_norm": 7.829734651982721, + "learning_rate": 4.9345480934805125e-08, + "loss": 0.9285, + "step": 13258 + }, + { + "epoch": 1.8771147448148935, + "grad_norm": 8.500813164803155, + "learning_rate": 4.923222160678115e-08, + "loss": 0.8954, + "step": 13259 + }, + { + "epoch": 
1.8772563176895307, + "grad_norm": 10.27612993473452, + "learning_rate": 4.911909111379176e-08, + "loss": 0.9429, + "step": 13260 + }, + { + "epoch": 1.877397890564168, + "grad_norm": 8.52976893050524, + "learning_rate": 4.9006089461784424e-08, + "loss": 0.9801, + "step": 13261 + }, + { + "epoch": 1.8775394634388052, + "grad_norm": 8.46638303667114, + "learning_rate": 4.8893216656699386e-08, + "loss": 0.9552, + "step": 13262 + }, + { + "epoch": 1.8776810363134424, + "grad_norm": 10.000746889832389, + "learning_rate": 4.878047270447051e-08, + "loss": 0.9927, + "step": 13263 + }, + { + "epoch": 1.8778226091880796, + "grad_norm": 9.376646583916234, + "learning_rate": 4.8667857611024164e-08, + "loss": 0.9315, + "step": 13264 + }, + { + "epoch": 1.8779641820627169, + "grad_norm": 10.263740076188723, + "learning_rate": 4.8555371382280894e-08, + "loss": 1.0413, + "step": 13265 + }, + { + "epoch": 1.878105754937354, + "grad_norm": 9.480995845720138, + "learning_rate": 4.844301402415402e-08, + "loss": 0.9604, + "step": 13266 + }, + { + "epoch": 1.8782473278119913, + "grad_norm": 10.80554398833252, + "learning_rate": 4.833078554254966e-08, + "loss": 0.9783, + "step": 13267 + }, + { + "epoch": 1.8783889006866286, + "grad_norm": 8.416308656638526, + "learning_rate": 4.8218685943368094e-08, + "loss": 0.8969, + "step": 13268 + }, + { + "epoch": 1.8785304735612658, + "grad_norm": 10.847486748090546, + "learning_rate": 4.810671523250182e-08, + "loss": 1.0563, + "step": 13269 + }, + { + "epoch": 1.878672046435903, + "grad_norm": 9.085369850066645, + "learning_rate": 4.799487341583753e-08, + "loss": 0.9245, + "step": 13270 + }, + { + "epoch": 1.8788136193105403, + "grad_norm": 8.838085072101265, + "learning_rate": 4.788316049925412e-08, + "loss": 1.0472, + "step": 13271 + }, + { + "epoch": 1.8789551921851775, + "grad_norm": 9.80652023147319, + "learning_rate": 4.777157648862496e-08, + "loss": 0.998, + "step": 13272 + }, + { + "epoch": 1.8790967650598145, + "grad_norm": 8.683869625622249, + "learning_rate": 4.766012138981535e-08, + "loss": 0.9783, + "step": 13273 + }, + { + "epoch": 1.8792383379344517, + "grad_norm": 10.044013249116983, + "learning_rate": 4.754879520868477e-08, + "loss": 0.9726, + "step": 13274 + }, + { + "epoch": 1.879379910809089, + "grad_norm": 9.526731126521376, + "learning_rate": 4.743759795108549e-08, + "loss": 0.9683, + "step": 13275 + }, + { + "epoch": 1.8795214836837262, + "grad_norm": 9.313737415821798, + "learning_rate": 4.732652962286283e-08, + "loss": 1.0732, + "step": 13276 + }, + { + "epoch": 1.8796630565583634, + "grad_norm": 9.767019431694685, + "learning_rate": 4.7215590229855723e-08, + "loss": 0.9437, + "step": 13277 + }, + { + "epoch": 1.8798046294330006, + "grad_norm": 10.718827583766231, + "learning_rate": 4.710477977789618e-08, + "loss": 1.0804, + "step": 13278 + }, + { + "epoch": 1.8799462023076379, + "grad_norm": 8.788663619420875, + "learning_rate": 4.699409827280954e-08, + "loss": 0.9904, + "step": 13279 + }, + { + "epoch": 1.880087775182275, + "grad_norm": 10.488288524206043, + "learning_rate": 4.6883545720413925e-08, + "loss": 0.9651, + "step": 13280 + }, + { + "epoch": 1.8802293480569123, + "grad_norm": 8.592938083094882, + "learning_rate": 4.677312212652108e-08, + "loss": 0.8702, + "step": 13281 + }, + { + "epoch": 1.8803709209315496, + "grad_norm": 10.43129413245701, + "learning_rate": 4.666282749693607e-08, + "loss": 0.948, + "step": 13282 + }, + { + "epoch": 1.8805124938061866, + "grad_norm": 7.879288973801254, + "learning_rate": 4.655266183745705e-08, + 
"loss": 0.875, + "step": 13283 + }, + { + "epoch": 1.8806540666808238, + "grad_norm": 10.694775341468945, + "learning_rate": 4.644262515387521e-08, + "loss": 1.1069, + "step": 13284 + }, + { + "epoch": 1.880795639555461, + "grad_norm": 9.007284395431055, + "learning_rate": 4.633271745197537e-08, + "loss": 1.0015, + "step": 13285 + }, + { + "epoch": 1.8809372124300983, + "grad_norm": 8.653023569512062, + "learning_rate": 4.6222938737534864e-08, + "loss": 0.9783, + "step": 13286 + }, + { + "epoch": 1.8810787853047355, + "grad_norm": 8.561169151963718, + "learning_rate": 4.6113289016324615e-08, + "loss": 0.9322, + "step": 13287 + }, + { + "epoch": 1.8812203581793727, + "grad_norm": 9.91877027580434, + "learning_rate": 4.600376829410919e-08, + "loss": 0.8596, + "step": 13288 + }, + { + "epoch": 1.88136193105401, + "grad_norm": 8.815777973847782, + "learning_rate": 4.589437657664592e-08, + "loss": 0.903, + "step": 13289 + }, + { + "epoch": 1.8815035039286472, + "grad_norm": 10.741114292981097, + "learning_rate": 4.578511386968548e-08, + "loss": 1.0245, + "step": 13290 + }, + { + "epoch": 1.8816450768032844, + "grad_norm": 8.818082732025577, + "learning_rate": 4.567598017897162e-08, + "loss": 1.011, + "step": 13291 + }, + { + "epoch": 1.8817866496779216, + "grad_norm": 8.86751573938178, + "learning_rate": 4.556697551024142e-08, + "loss": 0.9409, + "step": 13292 + }, + { + "epoch": 1.8819282225525589, + "grad_norm": 10.017920649059832, + "learning_rate": 4.545809986922528e-08, + "loss": 1.0133, + "step": 13293 + }, + { + "epoch": 1.882069795427196, + "grad_norm": 8.990634283618556, + "learning_rate": 4.5349353261646414e-08, + "loss": 0.9205, + "step": 13294 + }, + { + "epoch": 1.8822113683018333, + "grad_norm": 8.427300964739123, + "learning_rate": 4.524073569322218e-08, + "loss": 0.9228, + "step": 13295 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 10.202797853795309, + "learning_rate": 4.5132247169661916e-08, + "loss": 0.9682, + "step": 13296 + }, + { + "epoch": 1.8824945140511078, + "grad_norm": 9.530050033325917, + "learning_rate": 4.5023887696668824e-08, + "loss": 0.9838, + "step": 13297 + }, + { + "epoch": 1.882636086925745, + "grad_norm": 9.71190026590586, + "learning_rate": 4.491565727993974e-08, + "loss": 0.9801, + "step": 13298 + }, + { + "epoch": 1.8827776598003823, + "grad_norm": 8.20485472834719, + "learning_rate": 4.480755592516372e-08, + "loss": 0.9881, + "step": 13299 + }, + { + "epoch": 1.8829192326750195, + "grad_norm": 7.8061013835995725, + "learning_rate": 4.469958363802401e-08, + "loss": 0.9568, + "step": 13300 + }, + { + "epoch": 1.8830608055496567, + "grad_norm": 9.424635449551012, + "learning_rate": 4.459174042419634e-08, + "loss": 1.0009, + "step": 13301 + }, + { + "epoch": 1.883202378424294, + "grad_norm": 8.20200187623924, + "learning_rate": 4.448402628935034e-08, + "loss": 0.9783, + "step": 13302 + }, + { + "epoch": 1.8833439512989312, + "grad_norm": 8.738539683818118, + "learning_rate": 4.437644123914758e-08, + "loss": 0.8802, + "step": 13303 + }, + { + "epoch": 1.8834855241735684, + "grad_norm": 9.613350692430009, + "learning_rate": 4.426898527924467e-08, + "loss": 0.9048, + "step": 13304 + }, + { + "epoch": 1.8836270970482056, + "grad_norm": 10.40065562309111, + "learning_rate": 4.4161658415290135e-08, + "loss": 0.9539, + "step": 13305 + }, + { + "epoch": 1.8837686699228429, + "grad_norm": 10.8680608249011, + "learning_rate": 4.405446065292612e-08, + "loss": 0.9691, + "step": 13306 + }, + { + "epoch": 1.88391024279748, + "grad_norm": 8.00131882287373, + 
"learning_rate": 4.3947391997787857e-08, + "loss": 0.9092, + "step": 13307 + }, + { + "epoch": 1.8840518156721173, + "grad_norm": 9.624039639208204, + "learning_rate": 4.384045245550389e-08, + "loss": 0.9545, + "step": 13308 + }, + { + "epoch": 1.8841933885467546, + "grad_norm": 10.888272309092619, + "learning_rate": 4.373364203169583e-08, + "loss": 0.9556, + "step": 13309 + }, + { + "epoch": 1.8843349614213918, + "grad_norm": 10.6046112107264, + "learning_rate": 4.362696073197864e-08, + "loss": 0.9528, + "step": 13310 + }, + { + "epoch": 1.884476534296029, + "grad_norm": 8.718632741707017, + "learning_rate": 4.35204085619606e-08, + "loss": 1.0298, + "step": 13311 + }, + { + "epoch": 1.8846181071706662, + "grad_norm": 9.05153060530804, + "learning_rate": 4.3413985527243353e-08, + "loss": 1.0097, + "step": 13312 + }, + { + "epoch": 1.8847596800453035, + "grad_norm": 7.848387301066319, + "learning_rate": 4.330769163342102e-08, + "loss": 0.9568, + "step": 13313 + }, + { + "epoch": 1.8849012529199405, + "grad_norm": 10.560864564587622, + "learning_rate": 4.320152688608165e-08, + "loss": 0.921, + "step": 13314 + }, + { + "epoch": 1.8850428257945777, + "grad_norm": 12.616120794691607, + "learning_rate": 4.309549129080576e-08, + "loss": 0.9689, + "step": 13315 + }, + { + "epoch": 1.885184398669215, + "grad_norm": 7.705250166409467, + "learning_rate": 4.298958485316834e-08, + "loss": 0.869, + "step": 13316 + }, + { + "epoch": 1.8853259715438522, + "grad_norm": 8.577057207420006, + "learning_rate": 4.2883807578736337e-08, + "loss": 0.9285, + "step": 13317 + }, + { + "epoch": 1.8854675444184894, + "grad_norm": 11.701084993052929, + "learning_rate": 4.277815947307029e-08, + "loss": 1.0675, + "step": 13318 + }, + { + "epoch": 1.8856091172931266, + "grad_norm": 8.645361707455297, + "learning_rate": 4.267264054172465e-08, + "loss": 0.9967, + "step": 13319 + }, + { + "epoch": 1.8857506901677639, + "grad_norm": 10.413934293331234, + "learning_rate": 4.256725079024554e-08, + "loss": 0.8381, + "step": 13320 + }, + { + "epoch": 1.885892263042401, + "grad_norm": 9.981143721060665, + "learning_rate": 4.2461990224174076e-08, + "loss": 1.0067, + "step": 13321 + }, + { + "epoch": 1.8860338359170383, + "grad_norm": 9.85311744271039, + "learning_rate": 4.235685884904306e-08, + "loss": 0.9281, + "step": 13322 + }, + { + "epoch": 1.8861754087916756, + "grad_norm": 10.451370251632977, + "learning_rate": 4.2251856670379733e-08, + "loss": 0.9824, + "step": 13323 + }, + { + "epoch": 1.8863169816663128, + "grad_norm": 9.26018767329415, + "learning_rate": 4.214698369370357e-08, + "loss": 0.9729, + "step": 13324 + }, + { + "epoch": 1.8864585545409498, + "grad_norm": 8.379081528615389, + "learning_rate": 4.204223992452794e-08, + "loss": 0.9223, + "step": 13325 + }, + { + "epoch": 1.886600127415587, + "grad_norm": 8.240881764907417, + "learning_rate": 4.193762536835871e-08, + "loss": 0.901, + "step": 13326 + }, + { + "epoch": 1.8867417002902243, + "grad_norm": 9.749712426272866, + "learning_rate": 4.1833140030696216e-08, + "loss": 1.0013, + "step": 13327 + }, + { + "epoch": 1.8868832731648615, + "grad_norm": 11.04511614965082, + "learning_rate": 4.172878391703245e-08, + "loss": 1.005, + "step": 13328 + }, + { + "epoch": 1.8870248460394987, + "grad_norm": 11.05504326669136, + "learning_rate": 4.162455703285356e-08, + "loss": 0.9638, + "step": 13329 + }, + { + "epoch": 1.887166418914136, + "grad_norm": 9.028449551620275, + "learning_rate": 4.152045938363852e-08, + "loss": 0.922, + "step": 13330 + }, + { + "epoch": 
1.8873079917887732, + "grad_norm": 8.990171363903123, + "learning_rate": 4.141649097485989e-08, + "loss": 0.9562, + "step": 13331 + }, + { + "epoch": 1.8874495646634104, + "grad_norm": 9.706085294074015, + "learning_rate": 4.131265181198302e-08, + "loss": 0.9813, + "step": 13332 + }, + { + "epoch": 1.8875911375380476, + "grad_norm": 10.115879802069708, + "learning_rate": 4.120894190046687e-08, + "loss": 0.9894, + "step": 13333 + }, + { + "epoch": 1.8877327104126849, + "grad_norm": 9.24253378934413, + "learning_rate": 4.11053612457632e-08, + "loss": 0.866, + "step": 13334 + }, + { + "epoch": 1.887874283287322, + "grad_norm": 8.870302058326821, + "learning_rate": 4.100190985331765e-08, + "loss": 0.8875, + "step": 13335 + }, + { + "epoch": 1.8880158561619593, + "grad_norm": 10.63363260866458, + "learning_rate": 4.0898587728567805e-08, + "loss": 0.9882, + "step": 13336 + }, + { + "epoch": 1.8881574290365966, + "grad_norm": 9.626030408521533, + "learning_rate": 4.0795394876945726e-08, + "loss": 0.9881, + "step": 13337 + }, + { + "epoch": 1.8882990019112338, + "grad_norm": 9.362831050790382, + "learning_rate": 4.0692331303876234e-08, + "loss": 0.9728, + "step": 13338 + }, + { + "epoch": 1.888440574785871, + "grad_norm": 9.423776513783542, + "learning_rate": 4.058939701477693e-08, + "loss": 0.9001, + "step": 13339 + }, + { + "epoch": 1.8885821476605082, + "grad_norm": 10.186580312206411, + "learning_rate": 4.048659201505933e-08, + "loss": 0.9023, + "step": 13340 + }, + { + "epoch": 1.8887237205351455, + "grad_norm": 9.09367197534182, + "learning_rate": 4.038391631012745e-08, + "loss": 0.9923, + "step": 13341 + }, + { + "epoch": 1.8888652934097827, + "grad_norm": 9.229102783900544, + "learning_rate": 4.028136990537945e-08, + "loss": 0.8629, + "step": 13342 + }, + { + "epoch": 1.88900686628442, + "grad_norm": 11.820449334138592, + "learning_rate": 4.0178952806205486e-08, + "loss": 1.1086, + "step": 13343 + }, + { + "epoch": 1.8891484391590572, + "grad_norm": 9.912805355799772, + "learning_rate": 4.0076665017990124e-08, + "loss": 0.9833, + "step": 13344 + }, + { + "epoch": 1.8892900120336944, + "grad_norm": 10.397490609475014, + "learning_rate": 3.997450654611018e-08, + "loss": 1.0017, + "step": 13345 + }, + { + "epoch": 1.8894315849083316, + "grad_norm": 10.316023900762044, + "learning_rate": 3.987247739593636e-08, + "loss": 0.9965, + "step": 13346 + }, + { + "epoch": 1.8895731577829689, + "grad_norm": 10.592827770860124, + "learning_rate": 3.9770577572831594e-08, + "loss": 1.0287, + "step": 13347 + }, + { + "epoch": 1.889714730657606, + "grad_norm": 8.677726242470305, + "learning_rate": 3.966880708215354e-08, + "loss": 0.9679, + "step": 13348 + }, + { + "epoch": 1.8898563035322433, + "grad_norm": 11.224773012172882, + "learning_rate": 3.9567165929251804e-08, + "loss": 1.0072, + "step": 13349 + }, + { + "epoch": 1.8899978764068806, + "grad_norm": 9.441440189223236, + "learning_rate": 3.9465654119469345e-08, + "loss": 0.9587, + "step": 13350 + }, + { + "epoch": 1.8901394492815178, + "grad_norm": 10.030298110624045, + "learning_rate": 3.9364271658142997e-08, + "loss": 0.8941, + "step": 13351 + }, + { + "epoch": 1.890281022156155, + "grad_norm": 12.141419923501084, + "learning_rate": 3.926301855060183e-08, + "loss": 1.0563, + "step": 13352 + }, + { + "epoch": 1.8904225950307922, + "grad_norm": 8.887882822110692, + "learning_rate": 3.916189480216937e-08, + "loss": 0.9966, + "step": 13353 + }, + { + "epoch": 1.8905641679054295, + "grad_norm": 10.01802688816184, + "learning_rate": 3.906090041816107e-08, 
+ "loss": 0.9931, + "step": 13354 + }, + { + "epoch": 1.8907057407800667, + "grad_norm": 9.875217483636556, + "learning_rate": 3.896003540388604e-08, + "loss": 1.0032, + "step": 13355 + }, + { + "epoch": 1.8908473136547037, + "grad_norm": 10.023933380538823, + "learning_rate": 3.885929976464725e-08, + "loss": 0.9948, + "step": 13356 + }, + { + "epoch": 1.890988886529341, + "grad_norm": 9.149410674914652, + "learning_rate": 3.875869350573963e-08, + "loss": 0.9835, + "step": 13357 + }, + { + "epoch": 1.8911304594039782, + "grad_norm": 10.350117455152343, + "learning_rate": 3.865821663245284e-08, + "loss": 0.9881, + "step": 13358 + }, + { + "epoch": 1.8912720322786154, + "grad_norm": 10.747214732744421, + "learning_rate": 3.855786915006793e-08, + "loss": 0.9584, + "step": 13359 + }, + { + "epoch": 1.8914136051532526, + "grad_norm": 10.006125290788685, + "learning_rate": 3.8457651063860954e-08, + "loss": 0.9646, + "step": 13360 + }, + { + "epoch": 1.8915551780278899, + "grad_norm": 9.493191337094988, + "learning_rate": 3.835756237909938e-08, + "loss": 0.9013, + "step": 13361 + }, + { + "epoch": 1.891696750902527, + "grad_norm": 8.807313515613298, + "learning_rate": 3.825760310104537e-08, + "loss": 0.8218, + "step": 13362 + }, + { + "epoch": 1.8918383237771643, + "grad_norm": 8.134579689126538, + "learning_rate": 3.815777323495362e-08, + "loss": 0.9365, + "step": 13363 + }, + { + "epoch": 1.8919798966518016, + "grad_norm": 13.152618124365606, + "learning_rate": 3.805807278607215e-08, + "loss": 1.0082, + "step": 13364 + }, + { + "epoch": 1.8921214695264388, + "grad_norm": 10.136924597500474, + "learning_rate": 3.795850175964205e-08, + "loss": 1.1796, + "step": 13365 + }, + { + "epoch": 1.8922630424010758, + "grad_norm": 9.081924982577215, + "learning_rate": 3.785906016089774e-08, + "loss": 0.9606, + "step": 13366 + }, + { + "epoch": 1.892404615275713, + "grad_norm": 10.561186759615813, + "learning_rate": 3.775974799506699e-08, + "loss": 0.9984, + "step": 13367 + }, + { + "epoch": 1.8925461881503503, + "grad_norm": 10.39465529214789, + "learning_rate": 3.766056526737005e-08, + "loss": 1.0717, + "step": 13368 + }, + { + "epoch": 1.8926877610249875, + "grad_norm": 9.02889614383574, + "learning_rate": 3.756151198302138e-08, + "loss": 0.8931, + "step": 13369 + }, + { + "epoch": 1.8928293338996247, + "grad_norm": 8.794903874828904, + "learning_rate": 3.7462588147228193e-08, + "loss": 0.9827, + "step": 13370 + }, + { + "epoch": 1.892970906774262, + "grad_norm": 9.490154987512676, + "learning_rate": 3.736379376519023e-08, + "loss": 0.9759, + "step": 13371 + }, + { + "epoch": 1.8931124796488992, + "grad_norm": 8.538775244625112, + "learning_rate": 3.726512884210165e-08, + "loss": 1.0444, + "step": 13372 + }, + { + "epoch": 1.8932540525235364, + "grad_norm": 8.785570052643275, + "learning_rate": 3.7166593383148594e-08, + "loss": 1.0032, + "step": 13373 + }, + { + "epoch": 1.8933956253981736, + "grad_norm": 8.348268121439405, + "learning_rate": 3.706818739351164e-08, + "loss": 0.9337, + "step": 13374 + }, + { + "epoch": 1.8935371982728109, + "grad_norm": 7.993705657525811, + "learning_rate": 3.69699108783636e-08, + "loss": 0.9573, + "step": 13375 + }, + { + "epoch": 1.893678771147448, + "grad_norm": 9.333568524621684, + "learning_rate": 3.687176384287089e-08, + "loss": 0.9256, + "step": 13376 + }, + { + "epoch": 1.8938203440220853, + "grad_norm": 9.757253443002112, + "learning_rate": 3.677374629219271e-08, + "loss": 0.9553, + "step": 13377 + }, + { + "epoch": 1.8939619168967226, + "grad_norm": 
9.3030921471592, + "learning_rate": 3.667585823148218e-08, + "loss": 0.9815, + "step": 13378 + }, + { + "epoch": 1.8941034897713598, + "grad_norm": 8.0136954381356, + "learning_rate": 3.657809966588516e-08, + "loss": 1.01, + "step": 13379 + }, + { + "epoch": 1.894245062645997, + "grad_norm": 7.691047656222978, + "learning_rate": 3.6480470600540606e-08, + "loss": 0.8806, + "step": 13380 + }, + { + "epoch": 1.8943866355206342, + "grad_norm": 9.994045010352181, + "learning_rate": 3.638297104058081e-08, + "loss": 0.8653, + "step": 13381 + }, + { + "epoch": 1.8945282083952715, + "grad_norm": 8.020049244260978, + "learning_rate": 3.6285600991131095e-08, + "loss": 0.9492, + "step": 13382 + }, + { + "epoch": 1.8946697812699087, + "grad_norm": 10.33901509595072, + "learning_rate": 3.618836045731072e-08, + "loss": 0.9447, + "step": 13383 + }, + { + "epoch": 1.894811354144546, + "grad_norm": 9.634850947638117, + "learning_rate": 3.609124944423087e-08, + "loss": 1.0634, + "step": 13384 + }, + { + "epoch": 1.8949529270191832, + "grad_norm": 9.635057223377238, + "learning_rate": 3.599426795699662e-08, + "loss": 0.8922, + "step": 13385 + }, + { + "epoch": 1.8950944998938204, + "grad_norm": 8.150998350440682, + "learning_rate": 3.5897416000706956e-08, + "loss": 0.8751, + "step": 13386 + }, + { + "epoch": 1.8952360727684576, + "grad_norm": 9.417267186644352, + "learning_rate": 3.580069358045252e-08, + "loss": 0.9288, + "step": 13387 + }, + { + "epoch": 1.8953776456430949, + "grad_norm": 9.091504635810772, + "learning_rate": 3.570410070131841e-08, + "loss": 0.8924, + "step": 13388 + }, + { + "epoch": 1.895519218517732, + "grad_norm": 9.16706079733899, + "learning_rate": 3.5607637368381965e-08, + "loss": 0.9539, + "step": 13389 + }, + { + "epoch": 1.8956607913923693, + "grad_norm": 8.10870934843291, + "learning_rate": 3.5511303586714676e-08, + "loss": 0.974, + "step": 13390 + }, + { + "epoch": 1.8958023642670065, + "grad_norm": 11.078996502986342, + "learning_rate": 3.541509936138082e-08, + "loss": 0.9742, + "step": 13391 + }, + { + "epoch": 1.8959439371416438, + "grad_norm": 7.956731372064616, + "learning_rate": 3.5319024697437196e-08, + "loss": 0.8735, + "step": 13392 + }, + { + "epoch": 1.896085510016281, + "grad_norm": 11.938408782080083, + "learning_rate": 3.522307959993476e-08, + "loss": 0.9255, + "step": 13393 + }, + { + "epoch": 1.8962270828909182, + "grad_norm": 10.41963137335208, + "learning_rate": 3.5127264073917256e-08, + "loss": 0.9519, + "step": 13394 + }, + { + "epoch": 1.8963686557655555, + "grad_norm": 9.612170896613476, + "learning_rate": 3.503157812442148e-08, + "loss": 0.9151, + "step": 13395 + }, + { + "epoch": 1.8965102286401927, + "grad_norm": 9.96809294158566, + "learning_rate": 3.4936021756477865e-08, + "loss": 1.0055, + "step": 13396 + }, + { + "epoch": 1.8966518015148297, + "grad_norm": 7.6800046527371615, + "learning_rate": 3.4840594975109607e-08, + "loss": 0.9635, + "step": 13397 + }, + { + "epoch": 1.896793374389467, + "grad_norm": 10.317194939205821, + "learning_rate": 3.474529778533298e-08, + "loss": 0.9365, + "step": 13398 + }, + { + "epoch": 1.8969349472641042, + "grad_norm": 10.032948192753542, + "learning_rate": 3.465013019215785e-08, + "loss": 0.9516, + "step": 13399 + }, + { + "epoch": 1.8970765201387414, + "grad_norm": 11.500851806938313, + "learning_rate": 3.455509220058717e-08, + "loss": 0.8813, + "step": 13400 + }, + { + "epoch": 1.8972180930133786, + "grad_norm": 9.955698683633253, + "learning_rate": 3.4460183815617224e-08, + "loss": 1.1118, + "step": 13401 + }, + { 
+ "epoch": 1.8973596658880159, + "grad_norm": 8.555520881923856, + "learning_rate": 3.4365405042236785e-08, + "loss": 0.9273, + "step": 13402 + }, + { + "epoch": 1.897501238762653, + "grad_norm": 9.017501344905423, + "learning_rate": 3.4270755885428555e-08, + "loss": 0.9484, + "step": 13403 + }, + { + "epoch": 1.8976428116372903, + "grad_norm": 9.2968479188156, + "learning_rate": 3.4176236350168255e-08, + "loss": 0.9737, + "step": 13404 + }, + { + "epoch": 1.8977843845119275, + "grad_norm": 9.144766523030656, + "learning_rate": 3.408184644142443e-08, + "loss": 0.9855, + "step": 13405 + }, + { + "epoch": 1.8979259573865648, + "grad_norm": 9.329791714059558, + "learning_rate": 3.398758616415948e-08, + "loss": 0.9728, + "step": 13406 + }, + { + "epoch": 1.898067530261202, + "grad_norm": 10.02258687264066, + "learning_rate": 3.389345552332834e-08, + "loss": 0.9495, + "step": 13407 + }, + { + "epoch": 1.898209103135839, + "grad_norm": 8.39615657616417, + "learning_rate": 3.379945452387928e-08, + "loss": 0.9369, + "step": 13408 + }, + { + "epoch": 1.8983506760104762, + "grad_norm": 12.578124393439426, + "learning_rate": 3.370558317075417e-08, + "loss": 1.0413, + "step": 13409 + }, + { + "epoch": 1.8984922488851135, + "grad_norm": 7.878413111658907, + "learning_rate": 3.3611841468887683e-08, + "loss": 0.9447, + "step": 13410 + }, + { + "epoch": 1.8986338217597507, + "grad_norm": 8.499323649705394, + "learning_rate": 3.351822942320754e-08, + "loss": 0.8958, + "step": 13411 + }, + { + "epoch": 1.898775394634388, + "grad_norm": 7.8879394795724975, + "learning_rate": 3.342474703863508e-08, + "loss": 1.0375, + "step": 13412 + }, + { + "epoch": 1.8989169675090252, + "grad_norm": 8.907440641356914, + "learning_rate": 3.333139432008442e-08, + "loss": 0.9485, + "step": 13413 + }, + { + "epoch": 1.8990585403836624, + "grad_norm": 10.450964369377594, + "learning_rate": 3.3238171272463316e-08, + "loss": 0.9393, + "step": 13414 + }, + { + "epoch": 1.8992001132582996, + "grad_norm": 10.309800413107205, + "learning_rate": 3.314507790067201e-08, + "loss": 0.9853, + "step": 13415 + }, + { + "epoch": 1.8993416861329369, + "grad_norm": 9.7462980627382, + "learning_rate": 3.3052114209604636e-08, + "loss": 0.9594, + "step": 13416 + }, + { + "epoch": 1.899483259007574, + "grad_norm": 9.692497293483227, + "learning_rate": 3.295928020414812e-08, + "loss": 1.0306, + "step": 13417 + }, + { + "epoch": 1.8996248318822113, + "grad_norm": 8.523049014405181, + "learning_rate": 3.286657588918302e-08, + "loss": 0.999, + "step": 13418 + }, + { + "epoch": 1.8997664047568485, + "grad_norm": 8.688285956747219, + "learning_rate": 3.2774001269582354e-08, + "loss": 0.9359, + "step": 13419 + }, + { + "epoch": 1.8999079776314858, + "grad_norm": 10.30667078976304, + "learning_rate": 3.2681556350212805e-08, + "loss": 1.0489, + "step": 13420 + }, + { + "epoch": 1.900049550506123, + "grad_norm": 12.170484039786393, + "learning_rate": 3.2589241135933815e-08, + "loss": 1.0389, + "step": 13421 + }, + { + "epoch": 1.9001911233807602, + "grad_norm": 8.411764498749466, + "learning_rate": 3.2497055631598995e-08, + "loss": 0.8953, + "step": 13422 + }, + { + "epoch": 1.9003326962553975, + "grad_norm": 10.861897259353631, + "learning_rate": 3.2404999842054194e-08, + "loss": 1.0661, + "step": 13423 + }, + { + "epoch": 1.9004742691300347, + "grad_norm": 10.73791569892076, + "learning_rate": 3.231307377213833e-08, + "loss": 1.0241, + "step": 13424 + }, + { + "epoch": 1.900615842004672, + "grad_norm": 9.571757703380637, + "learning_rate": 
3.222127742668446e-08, + "loss": 1.0648, + "step": 13425 + }, + { + "epoch": 1.9007574148793092, + "grad_norm": 10.330235621967734, + "learning_rate": 3.2129610810517633e-08, + "loss": 0.8834, + "step": 13426 + }, + { + "epoch": 1.9008989877539464, + "grad_norm": 10.502178147870534, + "learning_rate": 3.203807392845732e-08, + "loss": 0.9516, + "step": 13427 + }, + { + "epoch": 1.9010405606285836, + "grad_norm": 9.76614178320128, + "learning_rate": 3.1946666785315216e-08, + "loss": 0.9731, + "step": 13428 + }, + { + "epoch": 1.9011821335032209, + "grad_norm": 10.113139800213006, + "learning_rate": 3.1855389385896383e-08, + "loss": 0.9995, + "step": 13429 + }, + { + "epoch": 1.901323706377858, + "grad_norm": 9.299988810214622, + "learning_rate": 3.176424173499976e-08, + "loss": 0.9323, + "step": 13430 + }, + { + "epoch": 1.9014652792524953, + "grad_norm": 9.380289442074261, + "learning_rate": 3.167322383741622e-08, + "loss": 0.9628, + "step": 13431 + }, + { + "epoch": 1.9016068521271325, + "grad_norm": 9.16936753682557, + "learning_rate": 3.158233569793112e-08, + "loss": 0.9525, + "step": 13432 + }, + { + "epoch": 1.9017484250017698, + "grad_norm": 10.687589232312066, + "learning_rate": 3.149157732132202e-08, + "loss": 0.9856, + "step": 13433 + }, + { + "epoch": 1.901889997876407, + "grad_norm": 7.345346090681809, + "learning_rate": 3.14009487123601e-08, + "loss": 0.9087, + "step": 13434 + }, + { + "epoch": 1.9020315707510442, + "grad_norm": 8.375568882993532, + "learning_rate": 3.131044987580961e-08, + "loss": 0.9363, + "step": 13435 + }, + { + "epoch": 1.9021731436256815, + "grad_norm": 8.873277953806463, + "learning_rate": 3.122008081642786e-08, + "loss": 0.9464, + "step": 13436 + }, + { + "epoch": 1.9023147165003187, + "grad_norm": 10.459277430826663, + "learning_rate": 3.112984153896603e-08, + "loss": 1.0055, + "step": 13437 + }, + { + "epoch": 1.902456289374956, + "grad_norm": 10.514638233791024, + "learning_rate": 3.1039732048167295e-08, + "loss": 0.9384, + "step": 13438 + }, + { + "epoch": 1.902597862249593, + "grad_norm": 10.2492042790888, + "learning_rate": 3.0949752348768956e-08, + "loss": 1.0037, + "step": 13439 + }, + { + "epoch": 1.9027394351242302, + "grad_norm": 7.9313859595751675, + "learning_rate": 3.0859902445501136e-08, + "loss": 1.0063, + "step": 13440 + }, + { + "epoch": 1.9028810079988674, + "grad_norm": 9.558512336187817, + "learning_rate": 3.077018234308726e-08, + "loss": 0.9287, + "step": 13441 + }, + { + "epoch": 1.9030225808735046, + "grad_norm": 9.20081760256468, + "learning_rate": 3.0680592046243576e-08, + "loss": 0.9129, + "step": 13442 + }, + { + "epoch": 1.9031641537481419, + "grad_norm": 9.249102987149323, + "learning_rate": 3.059113155968019e-08, + "loss": 0.8579, + "step": 13443 + }, + { + "epoch": 1.903305726622779, + "grad_norm": 9.417917714243718, + "learning_rate": 3.050180088809973e-08, + "loss": 0.9487, + "step": 13444 + }, + { + "epoch": 1.9034472994974163, + "grad_norm": 8.291625005211632, + "learning_rate": 3.041260003619817e-08, + "loss": 0.8918, + "step": 13445 + }, + { + "epoch": 1.9035888723720535, + "grad_norm": 11.299716196462876, + "learning_rate": 3.032352900866481e-08, + "loss": 1.0546, + "step": 13446 + }, + { + "epoch": 1.9037304452466908, + "grad_norm": 10.31118747565978, + "learning_rate": 3.0234587810182014e-08, + "loss": 1.0423, + "step": 13447 + }, + { + "epoch": 1.903872018121328, + "grad_norm": 9.421724384679239, + "learning_rate": 3.014577644542549e-08, + "loss": 0.9485, + "step": 13448 + }, + { + "epoch": 1.904013590995965, + 
"grad_norm": 10.778637024841371, + "learning_rate": 3.0057094919064e-08, + "loss": 0.9392, + "step": 13449 + }, + { + "epoch": 1.9041551638706022, + "grad_norm": 10.037659875872036, + "learning_rate": 2.996854323575937e-08, + "loss": 0.9923, + "step": 13450 + }, + { + "epoch": 1.9042967367452395, + "grad_norm": 8.843368683806775, + "learning_rate": 2.98801214001665e-08, + "loss": 0.9153, + "step": 13451 + }, + { + "epoch": 1.9044383096198767, + "grad_norm": 8.412251538806268, + "learning_rate": 2.9791829416933593e-08, + "loss": 0.8796, + "step": 13452 + }, + { + "epoch": 1.904579882494514, + "grad_norm": 9.533612668228791, + "learning_rate": 2.970366729070279e-08, + "loss": 0.9548, + "step": 13453 + }, + { + "epoch": 1.9047214553691512, + "grad_norm": 9.586814933482739, + "learning_rate": 2.9615635026108426e-08, + "loss": 0.9628, + "step": 13454 + }, + { + "epoch": 1.9048630282437884, + "grad_norm": 10.442315549748884, + "learning_rate": 2.9527732627777915e-08, + "loss": 0.9904, + "step": 13455 + }, + { + "epoch": 1.9050046011184256, + "grad_norm": 10.311384660921552, + "learning_rate": 2.9439960100332288e-08, + "loss": 0.9419, + "step": 13456 + }, + { + "epoch": 1.9051461739930629, + "grad_norm": 8.812790020438246, + "learning_rate": 2.9352317448385902e-08, + "loss": 0.9268, + "step": 13457 + }, + { + "epoch": 1.9052877468677, + "grad_norm": 9.722057503409932, + "learning_rate": 2.926480467654591e-08, + "loss": 1.0103, + "step": 13458 + }, + { + "epoch": 1.9054293197423373, + "grad_norm": 8.022694821701025, + "learning_rate": 2.9177421789412795e-08, + "loss": 0.9526, + "step": 13459 + }, + { + "epoch": 1.9055708926169745, + "grad_norm": 8.972158814302302, + "learning_rate": 2.9090168791580663e-08, + "loss": 0.8934, + "step": 13460 + }, + { + "epoch": 1.9057124654916118, + "grad_norm": 11.46659244542904, + "learning_rate": 2.9003045687635845e-08, + "loss": 1.0629, + "step": 13461 + }, + { + "epoch": 1.905854038366249, + "grad_norm": 9.456416895188445, + "learning_rate": 2.8916052482158284e-08, + "loss": 0.9219, + "step": 13462 + }, + { + "epoch": 1.9059956112408862, + "grad_norm": 7.862880281552141, + "learning_rate": 2.8829189179721552e-08, + "loss": 0.8812, + "step": 13463 + }, + { + "epoch": 1.9061371841155235, + "grad_norm": 8.758839093628014, + "learning_rate": 2.8742455784891708e-08, + "loss": 0.8967, + "step": 13464 + }, + { + "epoch": 1.9062787569901607, + "grad_norm": 9.049436216549754, + "learning_rate": 2.865585230222817e-08, + "loss": 0.8519, + "step": 13465 + }, + { + "epoch": 1.906420329864798, + "grad_norm": 8.606751971818086, + "learning_rate": 2.856937873628396e-08, + "loss": 0.9475, + "step": 13466 + }, + { + "epoch": 1.9065619027394352, + "grad_norm": 10.745886792013833, + "learning_rate": 2.8483035091604604e-08, + "loss": 1.0389, + "step": 13467 + }, + { + "epoch": 1.9067034756140724, + "grad_norm": 8.566105215868875, + "learning_rate": 2.8396821372729257e-08, + "loss": 0.9655, + "step": 13468 + }, + { + "epoch": 1.9068450484887096, + "grad_norm": 10.733680225082, + "learning_rate": 2.8310737584190117e-08, + "loss": 0.9589, + "step": 13469 + }, + { + "epoch": 1.9069866213633468, + "grad_norm": 10.76119199595603, + "learning_rate": 2.822478373051246e-08, + "loss": 0.936, + "step": 13470 + }, + { + "epoch": 1.907128194237984, + "grad_norm": 9.36042853588917, + "learning_rate": 2.8138959816215174e-08, + "loss": 0.9508, + "step": 13471 + }, + { + "epoch": 1.9072697671126213, + "grad_norm": 9.906229205666266, + "learning_rate": 2.8053265845809363e-08, + "loss": 1.0017, + 
"step": 13472 + }, + { + "epoch": 1.9074113399872585, + "grad_norm": 9.970163848969232, + "learning_rate": 2.796770182380032e-08, + "loss": 0.9964, + "step": 13473 + }, + { + "epoch": 1.9075529128618958, + "grad_norm": 8.669436330104643, + "learning_rate": 2.7882267754685832e-08, + "loss": 0.9208, + "step": 13474 + }, + { + "epoch": 1.907694485736533, + "grad_norm": 9.751788244337668, + "learning_rate": 2.7796963642957586e-08, + "loss": 0.9629, + "step": 13475 + }, + { + "epoch": 1.9078360586111702, + "grad_norm": 9.7363283209002, + "learning_rate": 2.7711789493099495e-08, + "loss": 0.9324, + "step": 13476 + }, + { + "epoch": 1.9079776314858075, + "grad_norm": 9.369634695188978, + "learning_rate": 2.7626745309589088e-08, + "loss": 1.1096, + "step": 13477 + }, + { + "epoch": 1.9081192043604447, + "grad_norm": 9.391330584583091, + "learning_rate": 2.7541831096897232e-08, + "loss": 0.9645, + "step": 13478 + }, + { + "epoch": 1.908260777235082, + "grad_norm": 9.887729986436117, + "learning_rate": 2.7457046859487578e-08, + "loss": 1.0036, + "step": 13479 + }, + { + "epoch": 1.908402350109719, + "grad_norm": 7.995703974226224, + "learning_rate": 2.7372392601817678e-08, + "loss": 0.8994, + "step": 13480 + }, + { + "epoch": 1.9085439229843562, + "grad_norm": 9.664088157428552, + "learning_rate": 2.7287868328337297e-08, + "loss": 0.9544, + "step": 13481 + }, + { + "epoch": 1.9086854958589934, + "grad_norm": 8.151237964805512, + "learning_rate": 2.720347404348983e-08, + "loss": 0.8965, + "step": 13482 + }, + { + "epoch": 1.9088270687336306, + "grad_norm": 8.072810241654011, + "learning_rate": 2.7119209751712283e-08, + "loss": 0.9081, + "step": 13483 + }, + { + "epoch": 1.9089686416082678, + "grad_norm": 10.531480011281763, + "learning_rate": 2.7035075457433613e-08, + "loss": 1.0924, + "step": 13484 + }, + { + "epoch": 1.909110214482905, + "grad_norm": 11.068224099502, + "learning_rate": 2.6951071165077504e-08, + "loss": 0.9029, + "step": 13485 + }, + { + "epoch": 1.9092517873575423, + "grad_norm": 8.819364430889776, + "learning_rate": 2.686719687905931e-08, + "loss": 0.9274, + "step": 13486 + }, + { + "epoch": 1.9093933602321795, + "grad_norm": 8.2774700246975, + "learning_rate": 2.678345260378856e-08, + "loss": 0.9829, + "step": 13487 + }, + { + "epoch": 1.9095349331068168, + "grad_norm": 9.08104413318359, + "learning_rate": 2.669983834366785e-08, + "loss": 0.996, + "step": 13488 + }, + { + "epoch": 1.909676505981454, + "grad_norm": 9.672864133084335, + "learning_rate": 2.661635410309199e-08, + "loss": 0.9142, + "step": 13489 + }, + { + "epoch": 1.9098180788560912, + "grad_norm": 9.81485287587632, + "learning_rate": 2.653299988645053e-08, + "loss": 0.931, + "step": 13490 + }, + { + "epoch": 1.9099596517307282, + "grad_norm": 10.18982730797761, + "learning_rate": 2.644977569812496e-08, + "loss": 1.0901, + "step": 13491 + }, + { + "epoch": 1.9101012246053655, + "grad_norm": 9.047751586822853, + "learning_rate": 2.6366681542490114e-08, + "loss": 0.9596, + "step": 13492 + }, + { + "epoch": 1.9102427974800027, + "grad_norm": 7.929456825847438, + "learning_rate": 2.6283717423914445e-08, + "loss": 0.9202, + "step": 13493 + }, + { + "epoch": 1.91038437035464, + "grad_norm": 9.790774257385472, + "learning_rate": 2.6200883346759466e-08, + "loss": 0.9065, + "step": 13494 + }, + { + "epoch": 1.9105259432292772, + "grad_norm": 9.142706009773711, + "learning_rate": 2.6118179315379467e-08, + "loss": 0.9257, + "step": 13495 + }, + { + "epoch": 1.9106675161039144, + "grad_norm": 9.310116866184426, + 
"learning_rate": 2.6035605334122084e-08, + "loss": 0.9458, + "step": 13496 + }, + { + "epoch": 1.9108090889785516, + "grad_norm": 9.207682959142014, + "learning_rate": 2.5953161407328565e-08, + "loss": 0.9008, + "step": 13497 + }, + { + "epoch": 1.9109506618531888, + "grad_norm": 8.682439194501217, + "learning_rate": 2.587084753933211e-08, + "loss": 0.9587, + "step": 13498 + }, + { + "epoch": 1.911092234727826, + "grad_norm": 7.53112970153209, + "learning_rate": 2.578866373446065e-08, + "loss": 0.8894, + "step": 13499 + }, + { + "epoch": 1.9112338076024633, + "grad_norm": 9.195610774022802, + "learning_rate": 2.5706609997034337e-08, + "loss": 0.9316, + "step": 13500 + }, + { + "epoch": 1.9113753804771005, + "grad_norm": 8.710095776317111, + "learning_rate": 2.5624686331366666e-08, + "loss": 0.8548, + "step": 13501 + }, + { + "epoch": 1.9115169533517378, + "grad_norm": 9.753688016931884, + "learning_rate": 2.554289274176419e-08, + "loss": 0.9296, + "step": 13502 + }, + { + "epoch": 1.911658526226375, + "grad_norm": 8.831449871676648, + "learning_rate": 2.546122923252681e-08, + "loss": 1.013, + "step": 13503 + }, + { + "epoch": 1.9118000991010122, + "grad_norm": 7.783480282471151, + "learning_rate": 2.5379695807947467e-08, + "loss": 0.9543, + "step": 13504 + }, + { + "epoch": 1.9119416719756495, + "grad_norm": 8.168520762372616, + "learning_rate": 2.5298292472312192e-08, + "loss": 0.9072, + "step": 13505 + }, + { + "epoch": 1.9120832448502867, + "grad_norm": 9.012006168883168, + "learning_rate": 2.5217019229900607e-08, + "loss": 0.9077, + "step": 13506 + }, + { + "epoch": 1.912224817724924, + "grad_norm": 9.214147006962607, + "learning_rate": 2.513587608498541e-08, + "loss": 1.0346, + "step": 13507 + }, + { + "epoch": 1.9123663905995612, + "grad_norm": 10.087492996459194, + "learning_rate": 2.5054863041831524e-08, + "loss": 0.9347, + "step": 13508 + }, + { + "epoch": 1.9125079634741984, + "grad_norm": 8.759677875466288, + "learning_rate": 2.4973980104698036e-08, + "loss": 0.9788, + "step": 13509 + }, + { + "epoch": 1.9126495363488356, + "grad_norm": 10.179736965056335, + "learning_rate": 2.4893227277837106e-08, + "loss": 1.0745, + "step": 13510 + }, + { + "epoch": 1.9127911092234728, + "grad_norm": 8.858605083939747, + "learning_rate": 2.481260456549367e-08, + "loss": 0.9596, + "step": 13511 + }, + { + "epoch": 1.91293268209811, + "grad_norm": 10.432688443501977, + "learning_rate": 2.4732111971906004e-08, + "loss": 0.9665, + "step": 13512 + }, + { + "epoch": 1.9130742549727473, + "grad_norm": 8.981306796070198, + "learning_rate": 2.4651749501305446e-08, + "loss": 0.9847, + "step": 13513 + }, + { + "epoch": 1.9132158278473845, + "grad_norm": 10.477581250570415, + "learning_rate": 2.4571517157916946e-08, + "loss": 1.0264, + "step": 13514 + }, + { + "epoch": 1.9133574007220218, + "grad_norm": 8.498278275190694, + "learning_rate": 2.449141494595797e-08, + "loss": 0.9464, + "step": 13515 + }, + { + "epoch": 1.913498973596659, + "grad_norm": 10.565009366422618, + "learning_rate": 2.441144286963931e-08, + "loss": 0.9029, + "step": 13516 + }, + { + "epoch": 1.9136405464712962, + "grad_norm": 8.631713023876692, + "learning_rate": 2.433160093316539e-08, + "loss": 0.9823, + "step": 13517 + }, + { + "epoch": 1.9137821193459335, + "grad_norm": 8.811159721882884, + "learning_rate": 2.4251889140733398e-08, + "loss": 0.9887, + "step": 13518 + }, + { + "epoch": 1.9139236922205707, + "grad_norm": 7.840119148438928, + "learning_rate": 2.417230749653332e-08, + "loss": 0.9025, + "step": 13519 + }, + { + "epoch": 
1.914065265095208, + "grad_norm": 8.5149696027893, + "learning_rate": 2.409285600474931e-08, + "loss": 0.8929, + "step": 13520 + }, + { + "epoch": 1.9142068379698451, + "grad_norm": 9.815860634458764, + "learning_rate": 2.401353466955747e-08, + "loss": 1.056, + "step": 13521 + }, + { + "epoch": 1.9143484108444822, + "grad_norm": 8.670836790329481, + "learning_rate": 2.3934343495128075e-08, + "loss": 0.9451, + "step": 13522 + }, + { + "epoch": 1.9144899837191194, + "grad_norm": 9.22511532098905, + "learning_rate": 2.385528248562391e-08, + "loss": 1.0044, + "step": 13523 + }, + { + "epoch": 1.9146315565937566, + "grad_norm": 7.764982692013898, + "learning_rate": 2.3776351645201367e-08, + "loss": 0.9039, + "step": 13524 + }, + { + "epoch": 1.9147731294683938, + "grad_norm": 8.15212054602805, + "learning_rate": 2.3697550978009632e-08, + "loss": 1.0103, + "step": 13525 + }, + { + "epoch": 1.914914702343031, + "grad_norm": 8.724063033808909, + "learning_rate": 2.3618880488190942e-08, + "loss": 0.9494, + "step": 13526 + }, + { + "epoch": 1.9150562752176683, + "grad_norm": 9.891205058030089, + "learning_rate": 2.3540340179881717e-08, + "loss": 0.9424, + "step": 13527 + }, + { + "epoch": 1.9151978480923055, + "grad_norm": 7.768758406446578, + "learning_rate": 2.3461930057210037e-08, + "loss": 0.9529, + "step": 13528 + }, + { + "epoch": 1.9153394209669428, + "grad_norm": 11.215071986422759, + "learning_rate": 2.338365012429816e-08, + "loss": 1.0085, + "step": 13529 + }, + { + "epoch": 1.91548099384158, + "grad_norm": 8.360241996511487, + "learning_rate": 2.3305500385261137e-08, + "loss": 0.8767, + "step": 13530 + }, + { + "epoch": 1.9156225667162172, + "grad_norm": 8.427765380519443, + "learning_rate": 2.322748084420734e-08, + "loss": 1.0135, + "step": 13531 + }, + { + "epoch": 1.9157641395908542, + "grad_norm": 6.944824201584801, + "learning_rate": 2.3149591505237935e-08, + "loss": 0.8237, + "step": 13532 + }, + { + "epoch": 1.9159057124654915, + "grad_norm": 10.549319016593781, + "learning_rate": 2.30718323724477e-08, + "loss": 0.9596, + "step": 13533 + }, + { + "epoch": 1.9160472853401287, + "grad_norm": 12.288853279171015, + "learning_rate": 2.299420344992448e-08, + "loss": 0.9355, + "step": 13534 + }, + { + "epoch": 1.916188858214766, + "grad_norm": 11.593717096904118, + "learning_rate": 2.2916704741748897e-08, + "loss": 0.984, + "step": 13535 + }, + { + "epoch": 1.9163304310894032, + "grad_norm": 7.100053985820284, + "learning_rate": 2.283933625199547e-08, + "loss": 0.8617, + "step": 13536 + }, + { + "epoch": 1.9164720039640404, + "grad_norm": 8.839547280849347, + "learning_rate": 2.2762097984730948e-08, + "loss": 0.8901, + "step": 13537 + }, + { + "epoch": 1.9166135768386776, + "grad_norm": 10.37417084471487, + "learning_rate": 2.268498994401569e-08, + "loss": 1.0135, + "step": 13538 + }, + { + "epoch": 1.9167551497133148, + "grad_norm": 9.5260319668884, + "learning_rate": 2.2608012133903402e-08, + "loss": 0.953, + "step": 13539 + }, + { + "epoch": 1.916896722587952, + "grad_norm": 9.844481232141208, + "learning_rate": 2.2531164558440843e-08, + "loss": 1.0099, + "step": 13540 + }, + { + "epoch": 1.9170382954625893, + "grad_norm": 9.65059676622829, + "learning_rate": 2.2454447221667563e-08, + "loss": 0.8475, + "step": 13541 + }, + { + "epoch": 1.9171798683372265, + "grad_norm": 8.326669648136596, + "learning_rate": 2.2377860127616447e-08, + "loss": 0.9752, + "step": 13542 + }, + { + "epoch": 1.9173214412118638, + "grad_norm": 9.86911955921213, + "learning_rate": 2.230140328031427e-08, + 
"loss": 0.8638, + "step": 13543 + }, + { + "epoch": 1.917463014086501, + "grad_norm": 7.976369528533197, + "learning_rate": 2.222507668377949e-08, + "loss": 0.9395, + "step": 13544 + }, + { + "epoch": 1.9176045869611382, + "grad_norm": 10.425144580597776, + "learning_rate": 2.214888034202528e-08, + "loss": 0.9297, + "step": 13545 + }, + { + "epoch": 1.9177461598357755, + "grad_norm": 8.780680899318916, + "learning_rate": 2.2072814259056496e-08, + "loss": 0.9802, + "step": 13546 + }, + { + "epoch": 1.9178877327104127, + "grad_norm": 8.757851266222184, + "learning_rate": 2.199687843887244e-08, + "loss": 0.9671, + "step": 13547 + }, + { + "epoch": 1.91802930558505, + "grad_norm": 9.389971362833021, + "learning_rate": 2.1921072885464633e-08, + "loss": 0.8864, + "step": 13548 + }, + { + "epoch": 1.9181708784596871, + "grad_norm": 11.221667407162478, + "learning_rate": 2.1845397602818508e-08, + "loss": 0.9445, + "step": 13549 + }, + { + "epoch": 1.9183124513343244, + "grad_norm": 8.983788194151886, + "learning_rate": 2.1769852594912265e-08, + "loss": 0.9098, + "step": 13550 + }, + { + "epoch": 1.9184540242089616, + "grad_norm": 10.973209007303595, + "learning_rate": 2.169443786571662e-08, + "loss": 0.9304, + "step": 13551 + }, + { + "epoch": 1.9185955970835988, + "grad_norm": 9.25946715170985, + "learning_rate": 2.161915341919646e-08, + "loss": 0.9023, + "step": 13552 + }, + { + "epoch": 1.918737169958236, + "grad_norm": 10.403944338581065, + "learning_rate": 2.1543999259309724e-08, + "loss": 1.0713, + "step": 13553 + }, + { + "epoch": 1.9188787428328733, + "grad_norm": 9.748850436788288, + "learning_rate": 2.1468975390006587e-08, + "loss": 0.8966, + "step": 13554 + }, + { + "epoch": 1.9190203157075105, + "grad_norm": 12.352629816314671, + "learning_rate": 2.139408181523167e-08, + "loss": 1.0146, + "step": 13555 + }, + { + "epoch": 1.9191618885821478, + "grad_norm": 9.298379112833405, + "learning_rate": 2.1319318538921552e-08, + "loss": 0.9287, + "step": 13556 + }, + { + "epoch": 1.919303461456785, + "grad_norm": 11.054612618452216, + "learning_rate": 2.1244685565006695e-08, + "loss": 1.0996, + "step": 13557 + }, + { + "epoch": 1.9194450343314222, + "grad_norm": 9.389960394015082, + "learning_rate": 2.1170182897410353e-08, + "loss": 0.9441, + "step": 13558 + }, + { + "epoch": 1.9195866072060594, + "grad_norm": 8.86716340855758, + "learning_rate": 2.109581054004939e-08, + "loss": 0.9244, + "step": 13559 + }, + { + "epoch": 1.9197281800806967, + "grad_norm": 10.903326216481272, + "learning_rate": 2.1021568496833454e-08, + "loss": 0.9612, + "step": 13560 + }, + { + "epoch": 1.919869752955334, + "grad_norm": 9.516307687307705, + "learning_rate": 2.0947456771664987e-08, + "loss": 0.9381, + "step": 13561 + }, + { + "epoch": 1.9200113258299711, + "grad_norm": 9.552953005893649, + "learning_rate": 2.087347536844059e-08, + "loss": 0.8864, + "step": 13562 + }, + { + "epoch": 1.9201528987046081, + "grad_norm": 10.14313681694695, + "learning_rate": 2.0799624291048816e-08, + "loss": 1.0174, + "step": 13563 + }, + { + "epoch": 1.9202944715792454, + "grad_norm": 9.521752199758891, + "learning_rate": 2.0725903543372117e-08, + "loss": 0.9353, + "step": 13564 + }, + { + "epoch": 1.9204360444538826, + "grad_norm": 9.969346175109965, + "learning_rate": 2.0652313129286284e-08, + "loss": 0.9735, + "step": 13565 + }, + { + "epoch": 1.9205776173285198, + "grad_norm": 12.481654017465743, + "learning_rate": 2.057885305265961e-08, + "loss": 1.0113, + "step": 13566 + }, + { + "epoch": 1.920719190203157, + "grad_norm": 
9.387675350496956, + "learning_rate": 2.0505523317353727e-08, + "loss": 0.931, + "step": 13567 + }, + { + "epoch": 1.9208607630777943, + "grad_norm": 9.088182560658614, + "learning_rate": 2.0432323927223883e-08, + "loss": 1.0067, + "step": 13568 + }, + { + "epoch": 1.9210023359524315, + "grad_norm": 9.604503258842755, + "learning_rate": 2.0359254886117842e-08, + "loss": 0.9189, + "step": 13569 + }, + { + "epoch": 1.9211439088270688, + "grad_norm": 12.511807167772194, + "learning_rate": 2.0286316197876964e-08, + "loss": 1.0589, + "step": 13570 + }, + { + "epoch": 1.921285481701706, + "grad_norm": 9.33005338892733, + "learning_rate": 2.0213507866335412e-08, + "loss": 0.9526, + "step": 13571 + }, + { + "epoch": 1.9214270545763432, + "grad_norm": 10.256774896350903, + "learning_rate": 2.0140829895320955e-08, + "loss": 1.1206, + "step": 13572 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 9.631391112564353, + "learning_rate": 2.0068282288653872e-08, + "loss": 0.9872, + "step": 13573 + }, + { + "epoch": 1.9217102003256175, + "grad_norm": 9.431225844770053, + "learning_rate": 1.9995865050147777e-08, + "loss": 1.0159, + "step": 13574 + }, + { + "epoch": 1.9218517732002547, + "grad_norm": 9.014297995927082, + "learning_rate": 1.9923578183610182e-08, + "loss": 0.9618, + "step": 13575 + }, + { + "epoch": 1.921993346074892, + "grad_norm": 8.159490533519943, + "learning_rate": 1.9851421692840822e-08, + "loss": 0.8725, + "step": 13576 + }, + { + "epoch": 1.9221349189495291, + "grad_norm": 9.90755648977812, + "learning_rate": 1.9779395581633055e-08, + "loss": 0.9635, + "step": 13577 + }, + { + "epoch": 1.9222764918241664, + "grad_norm": 9.975791238174379, + "learning_rate": 1.9707499853773016e-08, + "loss": 0.9732, + "step": 13578 + }, + { + "epoch": 1.9224180646988036, + "grad_norm": 8.839476506539738, + "learning_rate": 1.9635734513040182e-08, + "loss": 0.864, + "step": 13579 + }, + { + "epoch": 1.9225596375734408, + "grad_norm": 10.260507919499565, + "learning_rate": 1.956409956320737e-08, + "loss": 0.9208, + "step": 13580 + }, + { + "epoch": 1.922701210448078, + "grad_norm": 10.306949485790485, + "learning_rate": 1.949259500804074e-08, + "loss": 0.9467, + "step": 13581 + }, + { + "epoch": 1.9228427833227153, + "grad_norm": 9.402322246937397, + "learning_rate": 1.942122085129866e-08, + "loss": 0.9597, + "step": 13582 + }, + { + "epoch": 1.9229843561973525, + "grad_norm": 9.206092757509197, + "learning_rate": 1.9349977096733142e-08, + "loss": 1.0211, + "step": 13583 + }, + { + "epoch": 1.9231259290719898, + "grad_norm": 8.628568049478005, + "learning_rate": 1.9278863748089794e-08, + "loss": 0.8718, + "step": 13584 + }, + { + "epoch": 1.923267501946627, + "grad_norm": 8.085260010869122, + "learning_rate": 1.9207880809107014e-08, + "loss": 0.9589, + "step": 13585 + }, + { + "epoch": 1.9234090748212642, + "grad_norm": 9.795137826320827, + "learning_rate": 1.913702828351599e-08, + "loss": 0.82, + "step": 13586 + }, + { + "epoch": 1.9235506476959014, + "grad_norm": 9.899714579465552, + "learning_rate": 1.9066306175041792e-08, + "loss": 0.9791, + "step": 13587 + }, + { + "epoch": 1.9236922205705387, + "grad_norm": 7.442230259085514, + "learning_rate": 1.899571448740173e-08, + "loss": 0.8969, + "step": 13588 + }, + { + "epoch": 1.923833793445176, + "grad_norm": 8.690292677960466, + "learning_rate": 1.892525322430755e-08, + "loss": 1.0189, + "step": 13589 + }, + { + "epoch": 1.9239753663198131, + "grad_norm": 10.618202458500546, + "learning_rate": 1.8854922389462405e-08, + "loss": 0.9264, + "step": 
13590 + }, + { + "epoch": 1.9241169391944504, + "grad_norm": 8.496900105412985, + "learning_rate": 1.8784721986564168e-08, + "loss": 1.0278, + "step": 13591 + }, + { + "epoch": 1.9242585120690876, + "grad_norm": 8.736047137709662, + "learning_rate": 1.871465201930295e-08, + "loss": 0.9615, + "step": 13592 + }, + { + "epoch": 1.9244000849437248, + "grad_norm": 9.683610295215797, + "learning_rate": 1.864471249136218e-08, + "loss": 1.0545, + "step": 13593 + }, + { + "epoch": 1.924541657818362, + "grad_norm": 10.81474783580057, + "learning_rate": 1.8574903406418933e-08, + "loss": 0.9543, + "step": 13594 + }, + { + "epoch": 1.9246832306929993, + "grad_norm": 7.668355285882844, + "learning_rate": 1.850522476814276e-08, + "loss": 0.9342, + "step": 13595 + }, + { + "epoch": 1.9248248035676365, + "grad_norm": 8.640330966685662, + "learning_rate": 1.843567658019657e-08, + "loss": 1.036, + "step": 13596 + }, + { + "epoch": 1.9249663764422738, + "grad_norm": 11.757155816379445, + "learning_rate": 1.8366258846236607e-08, + "loss": 1.1005, + "step": 13597 + }, + { + "epoch": 1.925107949316911, + "grad_norm": 9.418567791893217, + "learning_rate": 1.8296971569911893e-08, + "loss": 0.9845, + "step": 13598 + }, + { + "epoch": 1.9252495221915482, + "grad_norm": 8.455845308734894, + "learning_rate": 1.822781475486507e-08, + "loss": 0.8628, + "step": 13599 + }, + { + "epoch": 1.9253910950661854, + "grad_norm": 11.125256138971132, + "learning_rate": 1.8158788404731565e-08, + "loss": 0.9828, + "step": 13600 + }, + { + "epoch": 1.9255326679408227, + "grad_norm": 10.058811435751243, + "learning_rate": 1.8089892523139864e-08, + "loss": 1.033, + "step": 13601 + }, + { + "epoch": 1.92567424081546, + "grad_norm": 10.272021201091052, + "learning_rate": 1.8021127113712066e-08, + "loss": 1.1036, + "step": 13602 + }, + { + "epoch": 1.9258158136900971, + "grad_norm": 10.202681200268703, + "learning_rate": 1.7952492180063064e-08, + "loss": 0.9851, + "step": 13603 + }, + { + "epoch": 1.9259573865647341, + "grad_norm": 8.687821855689227, + "learning_rate": 1.7883987725800522e-08, + "loss": 0.9394, + "step": 13604 + }, + { + "epoch": 1.9260989594393714, + "grad_norm": 8.623108144955033, + "learning_rate": 1.7815613754526283e-08, + "loss": 0.88, + "step": 13605 + }, + { + "epoch": 1.9262405323140086, + "grad_norm": 9.420156546747744, + "learning_rate": 1.774737026983414e-08, + "loss": 1.0313, + "step": 13606 + }, + { + "epoch": 1.9263821051886458, + "grad_norm": 10.176434270073432, + "learning_rate": 1.7679257275312057e-08, + "loss": 0.9563, + "step": 13607 + }, + { + "epoch": 1.926523678063283, + "grad_norm": 10.774447653081985, + "learning_rate": 1.7611274774540777e-08, + "loss": 0.8907, + "step": 13608 + }, + { + "epoch": 1.9266652509379203, + "grad_norm": 9.36032298386678, + "learning_rate": 1.7543422771093554e-08, + "loss": 0.9141, + "step": 13609 + }, + { + "epoch": 1.9268068238125575, + "grad_norm": 10.161540088621225, + "learning_rate": 1.7475701268537814e-08, + "loss": 0.8986, + "step": 13610 + }, + { + "epoch": 1.9269483966871948, + "grad_norm": 9.83546665994783, + "learning_rate": 1.7408110270432932e-08, + "loss": 0.8716, + "step": 13611 + }, + { + "epoch": 1.927089969561832, + "grad_norm": 9.605669302628778, + "learning_rate": 1.7340649780333007e-08, + "loss": 1.0537, + "step": 13612 + }, + { + "epoch": 1.9272315424364692, + "grad_norm": 12.013680607062751, + "learning_rate": 1.7273319801784094e-08, + "loss": 1.0396, + "step": 13613 + }, + { + "epoch": 1.9273731153111064, + "grad_norm": 9.932054095068315, + 
"learning_rate": 1.7206120338325305e-08, + "loss": 0.9809, + "step": 13614 + }, + { + "epoch": 1.9275146881857435, + "grad_norm": 9.973693961017018, + "learning_rate": 1.7139051393489647e-08, + "loss": 1.0293, + "step": 13615 + }, + { + "epoch": 1.9276562610603807, + "grad_norm": 9.146765265345774, + "learning_rate": 1.7072112970802634e-08, + "loss": 0.9192, + "step": 13616 + }, + { + "epoch": 1.927797833935018, + "grad_norm": 8.232328504437882, + "learning_rate": 1.7005305073783396e-08, + "loss": 0.9257, + "step": 13617 + }, + { + "epoch": 1.9279394068096551, + "grad_norm": 6.6878396776869105, + "learning_rate": 1.6938627705943566e-08, + "loss": 0.8649, + "step": 13618 + }, + { + "epoch": 1.9280809796842924, + "grad_norm": 9.054470109016767, + "learning_rate": 1.6872080870788955e-08, + "loss": 0.8768, + "step": 13619 + }, + { + "epoch": 1.9282225525589296, + "grad_norm": 8.256137761082991, + "learning_rate": 1.6805664571817593e-08, + "loss": 0.9167, + "step": 13620 + }, + { + "epoch": 1.9283641254335668, + "grad_norm": 8.194541640342827, + "learning_rate": 1.6739378812520858e-08, + "loss": 0.8964, + "step": 13621 + }, + { + "epoch": 1.928505698308204, + "grad_norm": 12.178555536918996, + "learning_rate": 1.667322359638318e-08, + "loss": 1.1302, + "step": 13622 + }, + { + "epoch": 1.9286472711828413, + "grad_norm": 8.47970311922737, + "learning_rate": 1.660719892688262e-08, + "loss": 0.8946, + "step": 13623 + }, + { + "epoch": 1.9287888440574785, + "grad_norm": 11.678057604710867, + "learning_rate": 1.6541304807489998e-08, + "loss": 1.023, + "step": 13624 + }, + { + "epoch": 1.9289304169321158, + "grad_norm": 11.161742495884484, + "learning_rate": 1.6475541241669224e-08, + "loss": 0.9851, + "step": 13625 + }, + { + "epoch": 1.929071989806753, + "grad_norm": 8.537938440997664, + "learning_rate": 1.6409908232877246e-08, + "loss": 0.9547, + "step": 13626 + }, + { + "epoch": 1.9292135626813902, + "grad_norm": 8.04411882030685, + "learning_rate": 1.6344405784564642e-08, + "loss": 0.9545, + "step": 13627 + }, + { + "epoch": 1.9293551355560274, + "grad_norm": 8.54940485081989, + "learning_rate": 1.6279033900175047e-08, + "loss": 0.9554, + "step": 13628 + }, + { + "epoch": 1.9294967084306647, + "grad_norm": 10.817373952287719, + "learning_rate": 1.6213792583144318e-08, + "loss": 0.9284, + "step": 13629 + }, + { + "epoch": 1.929638281305302, + "grad_norm": 9.156766596988193, + "learning_rate": 1.614868183690249e-08, + "loss": 0.9814, + "step": 13630 + }, + { + "epoch": 1.9297798541799391, + "grad_norm": 9.561888432113376, + "learning_rate": 1.6083701664872663e-08, + "loss": 0.9981, + "step": 13631 + }, + { + "epoch": 1.9299214270545764, + "grad_norm": 9.674565694361258, + "learning_rate": 1.6018852070470437e-08, + "loss": 0.9134, + "step": 13632 + }, + { + "epoch": 1.9300629999292136, + "grad_norm": 8.445129561935158, + "learning_rate": 1.5954133057105027e-08, + "loss": 0.8493, + "step": 13633 + }, + { + "epoch": 1.9302045728038508, + "grad_norm": 9.994027070575822, + "learning_rate": 1.588954462817871e-08, + "loss": 0.9652, + "step": 13634 + }, + { + "epoch": 1.930346145678488, + "grad_norm": 10.3099210346502, + "learning_rate": 1.582508678708683e-08, + "loss": 0.8786, + "step": 13635 + }, + { + "epoch": 1.9304877185531253, + "grad_norm": 11.286134798241692, + "learning_rate": 1.5760759537217783e-08, + "loss": 1.0369, + "step": 13636 + }, + { + "epoch": 1.9306292914277625, + "grad_norm": 9.469540402499495, + "learning_rate": 1.5696562881953314e-08, + "loss": 0.9587, + "step": 13637 + }, + { + 
"epoch": 1.9307708643023997, + "grad_norm": 10.13174101047398, + "learning_rate": 1.563249682466822e-08, + "loss": 1.0075, + "step": 13638 + }, + { + "epoch": 1.930912437177037, + "grad_norm": 9.18923912974798, + "learning_rate": 1.5568561368730082e-08, + "loss": 0.9456, + "step": 13639 + }, + { + "epoch": 1.9310540100516742, + "grad_norm": 8.781303867161293, + "learning_rate": 1.5504756517500385e-08, + "loss": 1.0188, + "step": 13640 + }, + { + "epoch": 1.9311955829263114, + "grad_norm": 10.0418261334643, + "learning_rate": 1.544108227433311e-08, + "loss": 1.003, + "step": 13641 + }, + { + "epoch": 1.9313371558009487, + "grad_norm": 11.284488287171923, + "learning_rate": 1.5377538642575574e-08, + "loss": 0.9307, + "step": 13642 + }, + { + "epoch": 1.931478728675586, + "grad_norm": 8.939766549768258, + "learning_rate": 1.5314125625568167e-08, + "loss": 0.8804, + "step": 13643 + }, + { + "epoch": 1.9316203015502231, + "grad_norm": 11.13943950236087, + "learning_rate": 1.5250843226644608e-08, + "loss": 0.9515, + "step": 13644 + }, + { + "epoch": 1.9317618744248604, + "grad_norm": 8.443508241299847, + "learning_rate": 1.518769144913168e-08, + "loss": 0.9434, + "step": 13645 + }, + { + "epoch": 1.9319034472994974, + "grad_norm": 10.151217202550606, + "learning_rate": 1.5124670296348676e-08, + "loss": 0.9593, + "step": 13646 + }, + { + "epoch": 1.9320450201741346, + "grad_norm": 10.390291274598422, + "learning_rate": 1.506177977160933e-08, + "loss": 0.9302, + "step": 13647 + }, + { + "epoch": 1.9321865930487718, + "grad_norm": 9.911481853596852, + "learning_rate": 1.4999019878219056e-08, + "loss": 1.1009, + "step": 13648 + }, + { + "epoch": 1.932328165923409, + "grad_norm": 9.282128289277912, + "learning_rate": 1.4936390619477715e-08, + "loss": 0.9039, + "step": 13649 + }, + { + "epoch": 1.9324697387980463, + "grad_norm": 10.488431097626014, + "learning_rate": 1.4873891998677115e-08, + "loss": 0.9798, + "step": 13650 + }, + { + "epoch": 1.9326113116726835, + "grad_norm": 10.00091586687287, + "learning_rate": 1.4811524019103241e-08, + "loss": 1.0018, + "step": 13651 + }, + { + "epoch": 1.9327528845473207, + "grad_norm": 7.832358306372628, + "learning_rate": 1.4749286684034303e-08, + "loss": 0.8955, + "step": 13652 + }, + { + "epoch": 1.932894457421958, + "grad_norm": 10.88984141000495, + "learning_rate": 1.468717999674213e-08, + "loss": 1.0395, + "step": 13653 + }, + { + "epoch": 1.9330360302965952, + "grad_norm": 10.370540901578307, + "learning_rate": 1.4625203960492162e-08, + "loss": 0.961, + "step": 13654 + }, + { + "epoch": 1.9331776031712324, + "grad_norm": 9.069718215754202, + "learning_rate": 1.4563358578542074e-08, + "loss": 1.0121, + "step": 13655 + }, + { + "epoch": 1.9333191760458694, + "grad_norm": 10.243410923427382, + "learning_rate": 1.4501643854142877e-08, + "loss": 1.1012, + "step": 13656 + }, + { + "epoch": 1.9334607489205067, + "grad_norm": 8.143655809515016, + "learning_rate": 1.4440059790538918e-08, + "loss": 0.8827, + "step": 13657 + }, + { + "epoch": 1.933602321795144, + "grad_norm": 10.726687681352583, + "learning_rate": 1.4378606390967609e-08, + "loss": 0.9516, + "step": 13658 + }, + { + "epoch": 1.9337438946697811, + "grad_norm": 12.303515569191113, + "learning_rate": 1.4317283658659698e-08, + "loss": 0.9903, + "step": 13659 + }, + { + "epoch": 1.9338854675444184, + "grad_norm": 9.335125160788051, + "learning_rate": 1.4256091596838717e-08, + "loss": 0.9285, + "step": 13660 + }, + { + "epoch": 1.9340270404190556, + "grad_norm": 10.433409111877133, + "learning_rate": 
1.4195030208721816e-08, + "loss": 0.9128, + "step": 13661 + }, + { + "epoch": 1.9341686132936928, + "grad_norm": 12.326808612584568, + "learning_rate": 1.4134099497518372e-08, + "loss": 1.1485, + "step": 13662 + }, + { + "epoch": 1.93431018616833, + "grad_norm": 9.43546096298344, + "learning_rate": 1.4073299466431933e-08, + "loss": 1.019, + "step": 13663 + }, + { + "epoch": 1.9344517590429673, + "grad_norm": 9.60714055576388, + "learning_rate": 1.4012630118658555e-08, + "loss": 0.9873, + "step": 13664 + }, + { + "epoch": 1.9345933319176045, + "grad_norm": 10.305968649628666, + "learning_rate": 1.395209145738763e-08, + "loss": 0.9453, + "step": 13665 + }, + { + "epoch": 1.9347349047922417, + "grad_norm": 11.340310660384196, + "learning_rate": 1.389168348580161e-08, + "loss": 0.9932, + "step": 13666 + }, + { + "epoch": 1.934876477666879, + "grad_norm": 8.41647817066992, + "learning_rate": 1.3831406207076014e-08, + "loss": 1.0044, + "step": 13667 + }, + { + "epoch": 1.9350180505415162, + "grad_norm": 9.697713476252042, + "learning_rate": 1.3771259624379696e-08, + "loss": 0.9346, + "step": 13668 + }, + { + "epoch": 1.9351596234161534, + "grad_norm": 8.242102802438042, + "learning_rate": 1.3711243740874292e-08, + "loss": 0.8208, + "step": 13669 + }, + { + "epoch": 1.9353011962907907, + "grad_norm": 9.058756719507521, + "learning_rate": 1.3651358559715056e-08, + "loss": 0.9293, + "step": 13670 + }, + { + "epoch": 1.935442769165428, + "grad_norm": 7.951378171451755, + "learning_rate": 1.3591604084049747e-08, + "loss": 0.9419, + "step": 13671 + }, + { + "epoch": 1.9355843420400651, + "grad_norm": 8.467811222759803, + "learning_rate": 1.3531980317020299e-08, + "loss": 0.9163, + "step": 13672 + }, + { + "epoch": 1.9357259149147024, + "grad_norm": 10.653150896492432, + "learning_rate": 1.3472487261760313e-08, + "loss": 0.9726, + "step": 13673 + }, + { + "epoch": 1.9358674877893396, + "grad_norm": 8.118429490973137, + "learning_rate": 1.3413124921397846e-08, + "loss": 0.8981, + "step": 13674 + }, + { + "epoch": 1.9360090606639768, + "grad_norm": 9.095953290995544, + "learning_rate": 1.3353893299053178e-08, + "loss": 0.8776, + "step": 13675 + }, + { + "epoch": 1.936150633538614, + "grad_norm": 10.613565552757272, + "learning_rate": 1.3294792397840206e-08, + "loss": 0.964, + "step": 13676 + }, + { + "epoch": 1.9362922064132513, + "grad_norm": 10.177767920269511, + "learning_rate": 1.323582222086589e-08, + "loss": 0.881, + "step": 13677 + }, + { + "epoch": 1.9364337792878885, + "grad_norm": 10.732929959822757, + "learning_rate": 1.3176982771230252e-08, + "loss": 1.0284, + "step": 13678 + }, + { + "epoch": 1.9365753521625257, + "grad_norm": 10.641606536273157, + "learning_rate": 1.311827405202637e-08, + "loss": 0.9083, + "step": 13679 + }, + { + "epoch": 1.936716925037163, + "grad_norm": 9.347344054936015, + "learning_rate": 1.3059696066340388e-08, + "loss": 0.9217, + "step": 13680 + }, + { + "epoch": 1.9368584979118002, + "grad_norm": 10.202488270355339, + "learning_rate": 1.3001248817251788e-08, + "loss": 1.0251, + "step": 13681 + }, + { + "epoch": 1.9370000707864374, + "grad_norm": 10.131010555612391, + "learning_rate": 1.294293230783339e-08, + "loss": 0.9994, + "step": 13682 + }, + { + "epoch": 1.9371416436610747, + "grad_norm": 9.855712984364743, + "learning_rate": 1.2884746541150516e-08, + "loss": 0.9747, + "step": 13683 + }, + { + "epoch": 1.937283216535712, + "grad_norm": 8.284715611072455, + "learning_rate": 1.2826691520262114e-08, + "loss": 0.9769, + "step": 13684 + }, + { + "epoch": 
1.9374247894103491, + "grad_norm": 9.522612312998564, + "learning_rate": 1.2768767248219903e-08, + "loss": 0.9888, + "step": 13685 + }, + { + "epoch": 1.9375663622849864, + "grad_norm": 8.876925407341215, + "learning_rate": 1.2710973728069231e-08, + "loss": 0.8996, + "step": 13686 + }, + { + "epoch": 1.9377079351596234, + "grad_norm": 10.348796431732978, + "learning_rate": 1.2653310962847943e-08, + "loss": 1.0699, + "step": 13687 + }, + { + "epoch": 1.9378495080342606, + "grad_norm": 9.85432955477546, + "learning_rate": 1.2595778955587501e-08, + "loss": 0.9452, + "step": 13688 + }, + { + "epoch": 1.9379910809088978, + "grad_norm": 8.944010037547912, + "learning_rate": 1.2538377709312155e-08, + "loss": 0.9565, + "step": 13689 + }, + { + "epoch": 1.938132653783535, + "grad_norm": 10.27869473969131, + "learning_rate": 1.248110722703949e-08, + "loss": 1.0607, + "step": 13690 + }, + { + "epoch": 1.9382742266581723, + "grad_norm": 10.312219604379326, + "learning_rate": 1.2423967511780432e-08, + "loss": 0.9558, + "step": 13691 + }, + { + "epoch": 1.9384157995328095, + "grad_norm": 11.230913714011095, + "learning_rate": 1.2366958566538689e-08, + "loss": 1.0498, + "step": 13692 + }, + { + "epoch": 1.9385573724074467, + "grad_norm": 8.69438984954994, + "learning_rate": 1.231008039431103e-08, + "loss": 0.8978, + "step": 13693 + }, + { + "epoch": 1.938698945282084, + "grad_norm": 9.492755737878408, + "learning_rate": 1.2253332998087286e-08, + "loss": 0.9783, + "step": 13694 + }, + { + "epoch": 1.9388405181567212, + "grad_norm": 12.198099857630234, + "learning_rate": 1.2196716380851181e-08, + "loss": 1.0713, + "step": 13695 + }, + { + "epoch": 1.9389820910313584, + "grad_norm": 9.201076311924142, + "learning_rate": 1.214023054557839e-08, + "loss": 0.9476, + "step": 13696 + }, + { + "epoch": 1.9391236639059957, + "grad_norm": 8.98473037680306, + "learning_rate": 1.2083875495238761e-08, + "loss": 0.9134, + "step": 13697 + }, + { + "epoch": 1.9392652367806327, + "grad_norm": 10.284840985720805, + "learning_rate": 1.2027651232794924e-08, + "loss": 1.0095, + "step": 13698 + }, + { + "epoch": 1.93940680965527, + "grad_norm": 10.361701991664425, + "learning_rate": 1.197155776120229e-08, + "loss": 1.0064, + "step": 13699 + }, + { + "epoch": 1.9395483825299071, + "grad_norm": 10.699633754273364, + "learning_rate": 1.1915595083409615e-08, + "loss": 0.9064, + "step": 13700 + }, + { + "epoch": 1.9396899554045444, + "grad_norm": 8.374556800160896, + "learning_rate": 1.1859763202358987e-08, + "loss": 0.978, + "step": 13701 + }, + { + "epoch": 1.9398315282791816, + "grad_norm": 10.109515747788572, + "learning_rate": 1.1804062120985282e-08, + "loss": 0.868, + "step": 13702 + }, + { + "epoch": 1.9399731011538188, + "grad_norm": 9.984750182455977, + "learning_rate": 1.1748491842216714e-08, + "loss": 0.9517, + "step": 13703 + }, + { + "epoch": 1.940114674028456, + "grad_norm": 8.657068317169765, + "learning_rate": 1.1693052368974834e-08, + "loss": 0.8934, + "step": 13704 + }, + { + "epoch": 1.9402562469030933, + "grad_norm": 9.922645839273926, + "learning_rate": 1.1637743704173698e-08, + "loss": 1.0041, + "step": 13705 + }, + { + "epoch": 1.9403978197777305, + "grad_norm": 9.640379273484061, + "learning_rate": 1.1582565850720984e-08, + "loss": 0.9761, + "step": 13706 + }, + { + "epoch": 1.9405393926523677, + "grad_norm": 9.489729298468061, + "learning_rate": 1.1527518811517146e-08, + "loss": 0.8655, + "step": 13707 + }, + { + "epoch": 1.940680965527005, + "grad_norm": 10.92898315167025, + "learning_rate": 
1.1472602589456538e-08, + "loss": 0.9456, + "step": 13708 + }, + { + "epoch": 1.9408225384016422, + "grad_norm": 8.588349688165412, + "learning_rate": 1.1417817187425461e-08, + "loss": 0.9265, + "step": 13709 + }, + { + "epoch": 1.9409641112762794, + "grad_norm": 8.325538448298799, + "learning_rate": 1.1363162608304112e-08, + "loss": 0.9204, + "step": 13710 + }, + { + "epoch": 1.9411056841509167, + "grad_norm": 10.654675856859564, + "learning_rate": 1.1308638854965748e-08, + "loss": 1.0429, + "step": 13711 + }, + { + "epoch": 1.941247257025554, + "grad_norm": 9.155317135312382, + "learning_rate": 1.1254245930276686e-08, + "loss": 0.8282, + "step": 13712 + }, + { + "epoch": 1.9413888299001911, + "grad_norm": 9.053248662238989, + "learning_rate": 1.1199983837096307e-08, + "loss": 0.8914, + "step": 13713 + }, + { + "epoch": 1.9415304027748284, + "grad_norm": 10.804130357788448, + "learning_rate": 1.1145852578276772e-08, + "loss": 0.9621, + "step": 13714 + }, + { + "epoch": 1.9416719756494656, + "grad_norm": 9.584896015524528, + "learning_rate": 1.109185215666414e-08, + "loss": 1.0916, + "step": 13715 + }, + { + "epoch": 1.9418135485241028, + "grad_norm": 10.647490987601477, + "learning_rate": 1.103798257509725e-08, + "loss": 0.9903, + "step": 13716 + }, + { + "epoch": 1.94195512139874, + "grad_norm": 9.885722845311637, + "learning_rate": 1.0984243836407449e-08, + "loss": 0.971, + "step": 13717 + }, + { + "epoch": 1.9420966942733773, + "grad_norm": 9.514282733224526, + "learning_rate": 1.0930635943420254e-08, + "loss": 1.0164, + "step": 13718 + }, + { + "epoch": 1.9422382671480145, + "grad_norm": 6.917131921524657, + "learning_rate": 1.0877158898953411e-08, + "loss": 0.9119, + "step": 13719 + }, + { + "epoch": 1.9423798400226517, + "grad_norm": 8.996816283945689, + "learning_rate": 1.082381270581856e-08, + "loss": 0.959, + "step": 13720 + }, + { + "epoch": 1.942521412897289, + "grad_norm": 8.978729119375474, + "learning_rate": 1.0770597366819847e-08, + "loss": 0.891, + "step": 13721 + }, + { + "epoch": 1.9426629857719262, + "grad_norm": 9.112745024810419, + "learning_rate": 1.0717512884754478e-08, + "loss": 0.9314, + "step": 13722 + }, + { + "epoch": 1.9428045586465634, + "grad_norm": 10.977076486520351, + "learning_rate": 1.0664559262413831e-08, + "loss": 0.8975, + "step": 13723 + }, + { + "epoch": 1.9429461315212007, + "grad_norm": 9.648056751723253, + "learning_rate": 1.061173650258096e-08, + "loss": 1.0029, + "step": 13724 + }, + { + "epoch": 1.943087704395838, + "grad_norm": 10.010176630281485, + "learning_rate": 1.0559044608032809e-08, + "loss": 0.8593, + "step": 13725 + }, + { + "epoch": 1.9432292772704751, + "grad_norm": 10.0039115884894, + "learning_rate": 1.0506483581539662e-08, + "loss": 0.9526, + "step": 13726 + }, + { + "epoch": 1.9433708501451123, + "grad_norm": 11.199740910939322, + "learning_rate": 1.0454053425864308e-08, + "loss": 1.0393, + "step": 13727 + }, + { + "epoch": 1.9435124230197496, + "grad_norm": 9.634139836572075, + "learning_rate": 1.0401754143763154e-08, + "loss": 0.8627, + "step": 13728 + }, + { + "epoch": 1.9436539958943866, + "grad_norm": 8.91179283728628, + "learning_rate": 1.034958573798539e-08, + "loss": 0.8972, + "step": 13729 + }, + { + "epoch": 1.9437955687690238, + "grad_norm": 10.420643975389256, + "learning_rate": 1.0297548211273544e-08, + "loss": 1.0306, + "step": 13730 + }, + { + "epoch": 1.943937141643661, + "grad_norm": 8.93627494472363, + "learning_rate": 1.0245641566363208e-08, + "loss": 0.9522, + "step": 13731 + }, + { + "epoch": 
1.9440787145182983, + "grad_norm": 11.264254884827118, + "learning_rate": 1.0193865805983028e-08, + "loss": 1.0337, + "step": 13732 + }, + { + "epoch": 1.9442202873929355, + "grad_norm": 11.552909874248133, + "learning_rate": 1.0142220932854995e-08, + "loss": 0.9402, + "step": 13733 + }, + { + "epoch": 1.9443618602675727, + "grad_norm": 10.870000281873487, + "learning_rate": 1.0090706949693884e-08, + "loss": 0.9694, + "step": 13734 + }, + { + "epoch": 1.94450343314221, + "grad_norm": 7.858445706041044, + "learning_rate": 1.0039323859207529e-08, + "loss": 0.8285, + "step": 13735 + }, + { + "epoch": 1.9446450060168472, + "grad_norm": 8.104369277840291, + "learning_rate": 9.988071664097376e-09, + "loss": 0.8609, + "step": 13736 + }, + { + "epoch": 1.9447865788914844, + "grad_norm": 10.080878303046806, + "learning_rate": 9.93695036705794e-09, + "loss": 0.8424, + "step": 13737 + }, + { + "epoch": 1.9449281517661217, + "grad_norm": 9.528250600675726, + "learning_rate": 9.885959970775961e-09, + "loss": 0.9862, + "step": 13738 + }, + { + "epoch": 1.9450697246407587, + "grad_norm": 9.602059906257583, + "learning_rate": 9.835100477932624e-09, + "loss": 0.8722, + "step": 13739 + }, + { + "epoch": 1.945211297515396, + "grad_norm": 8.8982351546353, + "learning_rate": 9.784371891201349e-09, + "loss": 0.7944, + "step": 13740 + }, + { + "epoch": 1.9453528703900331, + "grad_norm": 8.990019032221625, + "learning_rate": 9.733774213248615e-09, + "loss": 0.9536, + "step": 13741 + }, + { + "epoch": 1.9454944432646704, + "grad_norm": 8.328475400842667, + "learning_rate": 9.683307446734792e-09, + "loss": 0.8791, + "step": 13742 + }, + { + "epoch": 1.9456360161393076, + "grad_norm": 9.937725712354128, + "learning_rate": 9.632971594312478e-09, + "loss": 1.0701, + "step": 13743 + }, + { + "epoch": 1.9457775890139448, + "grad_norm": 7.810214265236037, + "learning_rate": 9.582766658628173e-09, + "loss": 0.9384, + "step": 13744 + }, + { + "epoch": 1.945919161888582, + "grad_norm": 9.83854106353117, + "learning_rate": 9.532692642320596e-09, + "loss": 0.9371, + "step": 13745 + }, + { + "epoch": 1.9460607347632193, + "grad_norm": 10.469037134588387, + "learning_rate": 9.482749548022641e-09, + "loss": 1.0148, + "step": 13746 + }, + { + "epoch": 1.9462023076378565, + "grad_norm": 10.766990187501515, + "learning_rate": 9.43293737835943e-09, + "loss": 0.9663, + "step": 13747 + }, + { + "epoch": 1.9463438805124937, + "grad_norm": 9.39430751097918, + "learning_rate": 9.383256135949704e-09, + "loss": 0.9559, + "step": 13748 + }, + { + "epoch": 1.946485453387131, + "grad_norm": 8.069044192599822, + "learning_rate": 9.333705823404981e-09, + "loss": 0.9505, + "step": 13749 + }, + { + "epoch": 1.9466270262617682, + "grad_norm": 9.77610336281533, + "learning_rate": 9.284286443330127e-09, + "loss": 1.0065, + "step": 13750 + }, + { + "epoch": 1.9467685991364054, + "grad_norm": 8.388706208093303, + "learning_rate": 9.234997998323613e-09, + "loss": 1.0141, + "step": 13751 + }, + { + "epoch": 1.9469101720110427, + "grad_norm": 10.366708406648494, + "learning_rate": 9.185840490975594e-09, + "loss": 1.0471, + "step": 13752 + }, + { + "epoch": 1.94705174488568, + "grad_norm": 10.572082145842298, + "learning_rate": 9.136813923871224e-09, + "loss": 1.0055, + "step": 13753 + }, + { + "epoch": 1.9471933177603171, + "grad_norm": 11.90420570745443, + "learning_rate": 9.087918299586772e-09, + "loss": 0.8845, + "step": 13754 + }, + { + "epoch": 1.9473348906349544, + "grad_norm": 9.191859035144777, + "learning_rate": 9.039153620693242e-09, + "loss": 
0.9258, + "step": 13755 + }, + { + "epoch": 1.9474764635095916, + "grad_norm": 10.30461420202324, + "learning_rate": 8.990519889754412e-09, + "loss": 0.936, + "step": 13756 + }, + { + "epoch": 1.9476180363842288, + "grad_norm": 9.210896085042602, + "learning_rate": 8.942017109326295e-09, + "loss": 0.8834, + "step": 13757 + }, + { + "epoch": 1.947759609258866, + "grad_norm": 9.470335975495775, + "learning_rate": 8.893645281959073e-09, + "loss": 0.9056, + "step": 13758 + }, + { + "epoch": 1.9479011821335033, + "grad_norm": 9.809262658891994, + "learning_rate": 8.845404410195157e-09, + "loss": 0.9402, + "step": 13759 + }, + { + "epoch": 1.9480427550081405, + "grad_norm": 10.286170222758445, + "learning_rate": 8.79729449657113e-09, + "loss": 0.8577, + "step": 13760 + }, + { + "epoch": 1.9481843278827777, + "grad_norm": 10.255685857475674, + "learning_rate": 8.7493155436158e-09, + "loss": 0.9612, + "step": 13761 + }, + { + "epoch": 1.948325900757415, + "grad_norm": 12.217875183477013, + "learning_rate": 8.701467553851317e-09, + "loss": 1.0659, + "step": 13762 + }, + { + "epoch": 1.9484674736320522, + "grad_norm": 10.68370943721642, + "learning_rate": 8.65375052979317e-09, + "loss": 0.8955, + "step": 13763 + }, + { + "epoch": 1.9486090465066894, + "grad_norm": 8.398897153119085, + "learning_rate": 8.60616447394963e-09, + "loss": 0.838, + "step": 13764 + }, + { + "epoch": 1.9487506193813267, + "grad_norm": 11.086734412424635, + "learning_rate": 8.558709388822584e-09, + "loss": 1.0533, + "step": 13765 + }, + { + "epoch": 1.9488921922559639, + "grad_norm": 10.28119694194611, + "learning_rate": 8.511385276906148e-09, + "loss": 0.817, + "step": 13766 + }, + { + "epoch": 1.9490337651306011, + "grad_norm": 24.162020251962584, + "learning_rate": 8.464192140688888e-09, + "loss": 0.9223, + "step": 13767 + }, + { + "epoch": 1.9491753380052383, + "grad_norm": 7.417064384626397, + "learning_rate": 8.417129982650762e-09, + "loss": 0.9227, + "step": 13768 + }, + { + "epoch": 1.9493169108798756, + "grad_norm": 9.449090774634623, + "learning_rate": 8.370198805266739e-09, + "loss": 0.876, + "step": 13769 + }, + { + "epoch": 1.9494584837545126, + "grad_norm": 9.475861249043952, + "learning_rate": 8.323398611003176e-09, + "loss": 0.9234, + "step": 13770 + }, + { + "epoch": 1.9496000566291498, + "grad_norm": 8.789326384927417, + "learning_rate": 8.27672940232116e-09, + "loss": 0.9823, + "step": 13771 + }, + { + "epoch": 1.949741629503787, + "grad_norm": 8.151625451255633, + "learning_rate": 8.230191181673175e-09, + "loss": 0.951, + "step": 13772 + }, + { + "epoch": 1.9498832023784243, + "grad_norm": 10.561589128393141, + "learning_rate": 8.183783951506152e-09, + "loss": 1.0485, + "step": 13773 + }, + { + "epoch": 1.9500247752530615, + "grad_norm": 10.386156449263328, + "learning_rate": 8.137507714259806e-09, + "loss": 1.0371, + "step": 13774 + }, + { + "epoch": 1.9501663481276987, + "grad_norm": 9.246924636568366, + "learning_rate": 8.09136247236636e-09, + "loss": 0.8736, + "step": 13775 + }, + { + "epoch": 1.950307921002336, + "grad_norm": 10.732412811358776, + "learning_rate": 8.045348228252204e-09, + "loss": 1.0494, + "step": 13776 + }, + { + "epoch": 1.9504494938769732, + "grad_norm": 8.858428958389847, + "learning_rate": 7.999464984335959e-09, + "loss": 1.0392, + "step": 13777 + }, + { + "epoch": 1.9505910667516104, + "grad_norm": 8.878730097515705, + "learning_rate": 7.953712743029585e-09, + "loss": 0.9428, + "step": 13778 + }, + { + "epoch": 1.9507326396262477, + "grad_norm": 9.430983965479365, + 
"learning_rate": 7.908091506738658e-09, + "loss": 0.9507, + "step": 13779 + }, + { + "epoch": 1.9508742125008849, + "grad_norm": 8.568254082795617, + "learning_rate": 7.862601277860982e-09, + "loss": 0.838, + "step": 13780 + }, + { + "epoch": 1.951015785375522, + "grad_norm": 9.992628050540324, + "learning_rate": 7.817242058788255e-09, + "loss": 1.0651, + "step": 13781 + }, + { + "epoch": 1.9511573582501591, + "grad_norm": 9.539485587720142, + "learning_rate": 7.772013851904681e-09, + "loss": 0.9801, + "step": 13782 + }, + { + "epoch": 1.9512989311247964, + "grad_norm": 11.955175605355315, + "learning_rate": 7.72691665958808e-09, + "loss": 1.1557, + "step": 13783 + }, + { + "epoch": 1.9514405039994336, + "grad_norm": 7.9848424845562125, + "learning_rate": 7.681950484209334e-09, + "loss": 0.9984, + "step": 13784 + }, + { + "epoch": 1.9515820768740708, + "grad_norm": 11.321755348224015, + "learning_rate": 7.637115328131828e-09, + "loss": 0.9237, + "step": 13785 + }, + { + "epoch": 1.951723649748708, + "grad_norm": 10.538654927487856, + "learning_rate": 7.592411193713123e-09, + "loss": 1.0087, + "step": 13786 + }, + { + "epoch": 1.9518652226233453, + "grad_norm": 8.751318260071594, + "learning_rate": 7.547838083302728e-09, + "loss": 1.016, + "step": 13787 + }, + { + "epoch": 1.9520067954979825, + "grad_norm": 10.175766254787415, + "learning_rate": 7.503395999244045e-09, + "loss": 1.0605, + "step": 13788 + }, + { + "epoch": 1.9521483683726197, + "grad_norm": 10.168454581050876, + "learning_rate": 7.45908494387354e-09, + "loss": 0.8283, + "step": 13789 + }, + { + "epoch": 1.952289941247257, + "grad_norm": 9.438316865646525, + "learning_rate": 7.414904919520183e-09, + "loss": 0.8882, + "step": 13790 + }, + { + "epoch": 1.9524315141218942, + "grad_norm": 10.898577158043349, + "learning_rate": 7.3708559285068374e-09, + "loss": 0.9557, + "step": 13791 + }, + { + "epoch": 1.9525730869965314, + "grad_norm": 8.977022297025282, + "learning_rate": 7.326937973148873e-09, + "loss": 0.9214, + "step": 13792 + }, + { + "epoch": 1.9527146598711687, + "grad_norm": 10.055503924701512, + "learning_rate": 7.283151055755555e-09, + "loss": 0.9461, + "step": 13793 + }, + { + "epoch": 1.9528562327458059, + "grad_norm": 9.785105959825858, + "learning_rate": 7.23949517862782e-09, + "loss": 0.9942, + "step": 13794 + }, + { + "epoch": 1.9529978056204431, + "grad_norm": 9.965077264757063, + "learning_rate": 7.195970344061609e-09, + "loss": 0.9645, + "step": 13795 + }, + { + "epoch": 1.9531393784950803, + "grad_norm": 9.137408872637895, + "learning_rate": 7.152576554344259e-09, + "loss": 0.8788, + "step": 13796 + }, + { + "epoch": 1.9532809513697176, + "grad_norm": 10.081632822788933, + "learning_rate": 7.109313811757279e-09, + "loss": 0.9177, + "step": 13797 + }, + { + "epoch": 1.9534225242443548, + "grad_norm": 9.227769273721735, + "learning_rate": 7.066182118574683e-09, + "loss": 0.8425, + "step": 13798 + }, + { + "epoch": 1.953564097118992, + "grad_norm": 9.434617972996401, + "learning_rate": 7.023181477064378e-09, + "loss": 0.8584, + "step": 13799 + }, + { + "epoch": 1.9537056699936293, + "grad_norm": 9.109857405751475, + "learning_rate": 6.980311889486502e-09, + "loss": 0.978, + "step": 13800 + }, + { + "epoch": 1.9538472428682665, + "grad_norm": 10.321083149808159, + "learning_rate": 6.937573358094529e-09, + "loss": 1.0533, + "step": 13801 + }, + { + "epoch": 1.9539888157429037, + "grad_norm": 9.717962594961472, + "learning_rate": 6.894965885135829e-09, + "loss": 0.8945, + "step": 13802 + }, + { + "epoch": 
1.954130388617541, + "grad_norm": 8.943355322872526, + "learning_rate": 6.852489472849444e-09, + "loss": 0.9501, + "step": 13803 + }, + { + "epoch": 1.9542719614921782, + "grad_norm": 9.617083972711457, + "learning_rate": 6.810144123469142e-09, + "loss": 0.9254, + "step": 13804 + }, + { + "epoch": 1.9544135343668154, + "grad_norm": 9.826778835172318, + "learning_rate": 6.7679298392200885e-09, + "loss": 0.9469, + "step": 13805 + }, + { + "epoch": 1.9545551072414526, + "grad_norm": 9.16758718781509, + "learning_rate": 6.7258466223221745e-09, + "loss": 0.9272, + "step": 13806 + }, + { + "epoch": 1.9546966801160899, + "grad_norm": 9.260291895045496, + "learning_rate": 6.683894474987518e-09, + "loss": 0.9945, + "step": 13807 + }, + { + "epoch": 1.954838252990727, + "grad_norm": 10.638949583017135, + "learning_rate": 6.6420733994213006e-09, + "loss": 1.0379, + "step": 13808 + }, + { + "epoch": 1.9549798258653643, + "grad_norm": 8.033903284814057, + "learning_rate": 6.600383397822319e-09, + "loss": 0.872, + "step": 13809 + }, + { + "epoch": 1.9551213987400016, + "grad_norm": 10.77882353525649, + "learning_rate": 6.558824472381875e-09, + "loss": 0.8931, + "step": 13810 + }, + { + "epoch": 1.9552629716146388, + "grad_norm": 7.187725030444699, + "learning_rate": 6.5173966252848885e-09, + "loss": 0.7972, + "step": 13811 + }, + { + "epoch": 1.9554045444892758, + "grad_norm": 8.944044584678391, + "learning_rate": 6.476099858709062e-09, + "loss": 0.9765, + "step": 13812 + }, + { + "epoch": 1.955546117363913, + "grad_norm": 8.682859210385875, + "learning_rate": 6.4349341748254354e-09, + "loss": 0.9603, + "step": 13813 + }, + { + "epoch": 1.9556876902385503, + "grad_norm": 8.655735262222818, + "learning_rate": 6.3938995757981125e-09, + "loss": 0.9303, + "step": 13814 + }, + { + "epoch": 1.9558292631131875, + "grad_norm": 10.451945832805226, + "learning_rate": 6.3529960637842555e-09, + "loss": 1.0073, + "step": 13815 + }, + { + "epoch": 1.9559708359878247, + "grad_norm": 8.77120146956656, + "learning_rate": 6.3122236409338125e-09, + "loss": 0.9591, + "step": 13816 + }, + { + "epoch": 1.956112408862462, + "grad_norm": 9.748816393877924, + "learning_rate": 6.271582309390622e-09, + "loss": 1.0316, + "step": 13817 + }, + { + "epoch": 1.9562539817370992, + "grad_norm": 9.084824839169903, + "learning_rate": 6.231072071290756e-09, + "loss": 1.0123, + "step": 13818 + }, + { + "epoch": 1.9563955546117364, + "grad_norm": 8.16595886692485, + "learning_rate": 6.190692928764175e-09, + "loss": 0.9338, + "step": 13819 + }, + { + "epoch": 1.9565371274863737, + "grad_norm": 9.845767634868508, + "learning_rate": 6.150444883933348e-09, + "loss": 0.9778, + "step": 13820 + }, + { + "epoch": 1.9566787003610109, + "grad_norm": 8.271585604826244, + "learning_rate": 6.110327938914085e-09, + "loss": 0.9406, + "step": 13821 + }, + { + "epoch": 1.956820273235648, + "grad_norm": 9.092766672565547, + "learning_rate": 6.070342095815529e-09, + "loss": 0.9238, + "step": 13822 + }, + { + "epoch": 1.9569618461102851, + "grad_norm": 9.040899381881252, + "learning_rate": 6.030487356739334e-09, + "loss": 0.9167, + "step": 13823 + }, + { + "epoch": 1.9571034189849223, + "grad_norm": 9.495610477855964, + "learning_rate": 5.990763723780768e-09, + "loss": 0.9738, + "step": 13824 + }, + { + "epoch": 1.9572449918595596, + "grad_norm": 9.871816773950833, + "learning_rate": 5.951171199028438e-09, + "loss": 0.961, + "step": 13825 + }, + { + "epoch": 1.9573865647341968, + "grad_norm": 10.29073038102264, + "learning_rate": 5.91170978456318e-09, + "loss": 
1.0215, + "step": 13826 + }, + { + "epoch": 1.957528137608834, + "grad_norm": 10.661246909669869, + "learning_rate": 5.8723794824597226e-09, + "loss": 1.0909, + "step": 13827 + }, + { + "epoch": 1.9576697104834713, + "grad_norm": 9.723610401654524, + "learning_rate": 5.833180294785579e-09, + "loss": 1.0063, + "step": 13828 + }, + { + "epoch": 1.9578112833581085, + "grad_norm": 10.891160905259724, + "learning_rate": 5.794112223601322e-09, + "loss": 0.861, + "step": 13829 + }, + { + "epoch": 1.9579528562327457, + "grad_norm": 9.562276600740145, + "learning_rate": 5.755175270961144e-09, + "loss": 1.0269, + "step": 13830 + }, + { + "epoch": 1.958094429107383, + "grad_norm": 7.807187158361429, + "learning_rate": 5.716369438911185e-09, + "loss": 0.8542, + "step": 13831 + }, + { + "epoch": 1.9582360019820202, + "grad_norm": 7.8328241493904, + "learning_rate": 5.6776947294923115e-09, + "loss": 0.7993, + "step": 13832 + }, + { + "epoch": 1.9583775748566574, + "grad_norm": 9.840396564215064, + "learning_rate": 5.639151144736787e-09, + "loss": 0.935, + "step": 13833 + }, + { + "epoch": 1.9585191477312947, + "grad_norm": 9.633851972124171, + "learning_rate": 5.6007386866713255e-09, + "loss": 1.0628, + "step": 13834 + }, + { + "epoch": 1.9586607206059319, + "grad_norm": 8.19446948491238, + "learning_rate": 5.5624573573154205e-09, + "loss": 0.9348, + "step": 13835 + }, + { + "epoch": 1.9588022934805691, + "grad_norm": 9.157673487497426, + "learning_rate": 5.524307158680797e-09, + "loss": 0.8986, + "step": 13836 + }, + { + "epoch": 1.9589438663552063, + "grad_norm": 9.908094360789297, + "learning_rate": 5.486288092773628e-09, + "loss": 1.0263, + "step": 13837 + }, + { + "epoch": 1.9590854392298436, + "grad_norm": 8.20321010000501, + "learning_rate": 5.4484001615920375e-09, + "loss": 0.9121, + "step": 13838 + }, + { + "epoch": 1.9592270121044808, + "grad_norm": 9.911197425721127, + "learning_rate": 5.410643367128321e-09, + "loss": 1.0319, + "step": 13839 + }, + { + "epoch": 1.959368584979118, + "grad_norm": 8.734641410572564, + "learning_rate": 5.373017711367001e-09, + "loss": 1.0163, + "step": 13840 + }, + { + "epoch": 1.9595101578537553, + "grad_norm": 10.011516424628462, + "learning_rate": 5.335523196285941e-09, + "loss": 1.0025, + "step": 13841 + }, + { + "epoch": 1.9596517307283925, + "grad_norm": 10.729673819299201, + "learning_rate": 5.2981598238563415e-09, + "loss": 0.9081, + "step": 13842 + }, + { + "epoch": 1.9597933036030297, + "grad_norm": 10.064005673939086, + "learning_rate": 5.260927596042464e-09, + "loss": 1.0215, + "step": 13843 + }, + { + "epoch": 1.959934876477667, + "grad_norm": 8.797285216068168, + "learning_rate": 5.223826514801356e-09, + "loss": 0.8695, + "step": 13844 + }, + { + "epoch": 1.9600764493523042, + "grad_norm": 10.665126291949955, + "learning_rate": 5.186856582083677e-09, + "loss": 1.032, + "step": 13845 + }, + { + "epoch": 1.9602180222269414, + "grad_norm": 9.221206803191574, + "learning_rate": 5.1500177998325965e-09, + "loss": 0.8942, + "step": 13846 + }, + { + "epoch": 1.9603595951015786, + "grad_norm": 8.230606867315245, + "learning_rate": 5.1133101699848975e-09, + "loss": 0.9032, + "step": 13847 + }, + { + "epoch": 1.9605011679762159, + "grad_norm": 10.884976425881325, + "learning_rate": 5.076733694470149e-09, + "loss": 0.9242, + "step": 13848 + }, + { + "epoch": 1.960642740850853, + "grad_norm": 9.698371152680059, + "learning_rate": 5.040288375211255e-09, + "loss": 0.9878, + "step": 13849 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 9.419161125783791, + 
"learning_rate": 5.003974214124186e-09, + "loss": 0.8995, + "step": 13850 + }, + { + "epoch": 1.9609258866001276, + "grad_norm": 10.36216806356368, + "learning_rate": 4.96779121311769e-09, + "loss": 0.9409, + "step": 13851 + }, + { + "epoch": 1.9610674594747648, + "grad_norm": 8.850665620244994, + "learning_rate": 4.931739374093858e-09, + "loss": 0.9241, + "step": 13852 + }, + { + "epoch": 1.9612090323494018, + "grad_norm": 11.053803727371937, + "learning_rate": 4.895818698948396e-09, + "loss": 0.981, + "step": 13853 + }, + { + "epoch": 1.961350605224039, + "grad_norm": 9.448588141679885, + "learning_rate": 4.860029189569237e-09, + "loss": 0.9918, + "step": 13854 + }, + { + "epoch": 1.9614921780986763, + "grad_norm": 9.552205846590857, + "learning_rate": 4.824370847837933e-09, + "loss": 0.9193, + "step": 13855 + }, + { + "epoch": 1.9616337509733135, + "grad_norm": 9.754506561297426, + "learning_rate": 4.788843675629096e-09, + "loss": 0.9275, + "step": 13856 + }, + { + "epoch": 1.9617753238479507, + "grad_norm": 8.4297775532435, + "learning_rate": 4.7534476748098416e-09, + "loss": 0.9393, + "step": 13857 + }, + { + "epoch": 1.961916896722588, + "grad_norm": 11.92590024097205, + "learning_rate": 4.7181828472417365e-09, + "loss": 0.9861, + "step": 13858 + }, + { + "epoch": 1.9620584695972252, + "grad_norm": 11.138815541605933, + "learning_rate": 4.6830491947777445e-09, + "loss": 0.9698, + "step": 13859 + }, + { + "epoch": 1.9622000424718624, + "grad_norm": 11.023367250949974, + "learning_rate": 4.648046719265553e-09, + "loss": 0.9316, + "step": 13860 + }, + { + "epoch": 1.9623416153464996, + "grad_norm": 8.790932526753997, + "learning_rate": 4.61317542254508e-09, + "loss": 0.973, + "step": 13861 + }, + { + "epoch": 1.9624831882211369, + "grad_norm": 9.946284412201294, + "learning_rate": 4.578435306449025e-09, + "loss": 1.0141, + "step": 13862 + }, + { + "epoch": 1.9626247610957739, + "grad_norm": 8.487072257723066, + "learning_rate": 4.543826372803983e-09, + "loss": 0.9251, + "step": 13863 + }, + { + "epoch": 1.9627663339704111, + "grad_norm": 8.38865846004457, + "learning_rate": 4.50934862342961e-09, + "loss": 0.9073, + "step": 13864 + }, + { + "epoch": 1.9629079068450483, + "grad_norm": 10.889355885186152, + "learning_rate": 4.475002060137789e-09, + "loss": 0.97, + "step": 13865 + }, + { + "epoch": 1.9630494797196856, + "grad_norm": 10.12561899812623, + "learning_rate": 4.440786684734577e-09, + "loss": 1.0321, + "step": 13866 + }, + { + "epoch": 1.9631910525943228, + "grad_norm": 9.60651515180278, + "learning_rate": 4.406702499018256e-09, + "loss": 0.8903, + "step": 13867 + }, + { + "epoch": 1.96333262546896, + "grad_norm": 10.988155924343074, + "learning_rate": 4.372749504780727e-09, + "loss": 0.9591, + "step": 13868 + }, + { + "epoch": 1.9634741983435973, + "grad_norm": 8.2348836491975, + "learning_rate": 4.338927703807227e-09, + "loss": 0.9118, + "step": 13869 + }, + { + "epoch": 1.9636157712182345, + "grad_norm": 8.177189748926057, + "learning_rate": 4.305237097875226e-09, + "loss": 0.8775, + "step": 13870 + }, + { + "epoch": 1.9637573440928717, + "grad_norm": 9.154900877836297, + "learning_rate": 4.271677688756082e-09, + "loss": 0.9028, + "step": 13871 + }, + { + "epoch": 1.963898916967509, + "grad_norm": 8.564947293682238, + "learning_rate": 4.23824947821394e-09, + "loss": 0.982, + "step": 13872 + }, + { + "epoch": 1.9640404898421462, + "grad_norm": 8.751250695352645, + "learning_rate": 4.204952468006007e-09, + "loss": 0.9335, + "step": 13873 + }, + { + "epoch": 1.9641820627167834, + 
"grad_norm": 8.886438010675487, + "learning_rate": 4.171786659882826e-09, + "loss": 0.8991, + "step": 13874 + }, + { + "epoch": 1.9643236355914206, + "grad_norm": 10.025728696752116, + "learning_rate": 4.138752055588002e-09, + "loss": 0.8841, + "step": 13875 + }, + { + "epoch": 1.9644652084660579, + "grad_norm": 11.077379812295838, + "learning_rate": 4.105848656857925e-09, + "loss": 1.0422, + "step": 13876 + }, + { + "epoch": 1.964606781340695, + "grad_norm": 9.957869472367618, + "learning_rate": 4.073076465422321e-09, + "loss": 1.0197, + "step": 13877 + }, + { + "epoch": 1.9647483542153323, + "grad_norm": 8.851531470598745, + "learning_rate": 4.0404354830042566e-09, + "loss": 0.9124, + "step": 13878 + }, + { + "epoch": 1.9648899270899696, + "grad_norm": 13.088989849237317, + "learning_rate": 4.0079257113190275e-09, + "loss": 0.973, + "step": 13879 + }, + { + "epoch": 1.9650314999646068, + "grad_norm": 8.765031383086109, + "learning_rate": 3.9755471520763754e-09, + "loss": 0.9857, + "step": 13880 + }, + { + "epoch": 1.965173072839244, + "grad_norm": 10.099395023772038, + "learning_rate": 3.943299806977996e-09, + "loss": 1.0255, + "step": 13881 + }, + { + "epoch": 1.9653146457138813, + "grad_norm": 10.417015212267724, + "learning_rate": 3.911183677719199e-09, + "loss": 0.874, + "step": 13882 + }, + { + "epoch": 1.9654562185885185, + "grad_norm": 9.590932028371984, + "learning_rate": 3.8791987659883565e-09, + "loss": 0.9618, + "step": 13883 + }, + { + "epoch": 1.9655977914631557, + "grad_norm": 10.391239922476279, + "learning_rate": 3.847345073466624e-09, + "loss": 0.8776, + "step": 13884 + }, + { + "epoch": 1.965739364337793, + "grad_norm": 10.551461314740477, + "learning_rate": 3.81562260182905e-09, + "loss": 0.9431, + "step": 13885 + }, + { + "epoch": 1.9658809372124302, + "grad_norm": 9.254553421597562, + "learning_rate": 3.784031352742912e-09, + "loss": 0.8914, + "step": 13886 + }, + { + "epoch": 1.9660225100870674, + "grad_norm": 9.810342928213093, + "learning_rate": 3.752571327868826e-09, + "loss": 0.9764, + "step": 13887 + }, + { + "epoch": 1.9661640829617046, + "grad_norm": 9.382410401328906, + "learning_rate": 3.721242528861024e-09, + "loss": 0.8723, + "step": 13888 + }, + { + "epoch": 1.9663056558363419, + "grad_norm": 8.474443499543385, + "learning_rate": 3.6900449573659682e-09, + "loss": 0.9809, + "step": 13889 + }, + { + "epoch": 1.966447228710979, + "grad_norm": 7.888031123395883, + "learning_rate": 3.6589786150240112e-09, + "loss": 0.9752, + "step": 13890 + }, + { + "epoch": 1.9665888015856163, + "grad_norm": 11.158437135159941, + "learning_rate": 3.6280435034682927e-09, + "loss": 0.8799, + "step": 13891 + }, + { + "epoch": 1.9667303744602536, + "grad_norm": 10.97045189676961, + "learning_rate": 3.597239624325011e-09, + "loss": 1.0121, + "step": 13892 + }, + { + "epoch": 1.9668719473348908, + "grad_norm": 11.429483908602005, + "learning_rate": 3.5665669792131484e-09, + "loss": 0.9647, + "step": 13893 + }, + { + "epoch": 1.9670135202095278, + "grad_norm": 10.769157904074516, + "learning_rate": 3.5360255697455826e-09, + "loss": 0.9951, + "step": 13894 + }, + { + "epoch": 1.967155093084165, + "grad_norm": 8.873195410998104, + "learning_rate": 3.505615397527695e-09, + "loss": 0.9311, + "step": 13895 + }, + { + "epoch": 1.9672966659588023, + "grad_norm": 8.651866257281531, + "learning_rate": 3.4753364641582076e-09, + "loss": 0.8699, + "step": 13896 + }, + { + "epoch": 1.9674382388334395, + "grad_norm": 10.649040042792956, + "learning_rate": 3.445188771228625e-09, + "loss": 1.1079, 
+ "step": 13897 + }, + { + "epoch": 1.9675798117080767, + "grad_norm": 10.470833733271874, + "learning_rate": 3.4151723203240673e-09, + "loss": 0.9937, + "step": 13898 + }, + { + "epoch": 1.967721384582714, + "grad_norm": 9.04971990836854, + "learning_rate": 3.385287113022717e-09, + "loss": 0.9848, + "step": 13899 + }, + { + "epoch": 1.9678629574573512, + "grad_norm": 8.548466898568302, + "learning_rate": 3.3555331508947076e-09, + "loss": 0.9521, + "step": 13900 + }, + { + "epoch": 1.9680045303319884, + "grad_norm": 9.72188054044462, + "learning_rate": 3.325910435505175e-09, + "loss": 0.9954, + "step": 13901 + }, + { + "epoch": 1.9681461032066256, + "grad_norm": 11.186989884494455, + "learning_rate": 3.296418968410653e-09, + "loss": 1.0185, + "step": 13902 + }, + { + "epoch": 1.9682876760812629, + "grad_norm": 8.413211024987817, + "learning_rate": 3.2670587511618448e-09, + "loss": 0.9541, + "step": 13903 + }, + { + "epoch": 1.9684292489559, + "grad_norm": 8.81890073954267, + "learning_rate": 3.2378297853022377e-09, + "loss": 0.9058, + "step": 13904 + }, + { + "epoch": 1.968570821830537, + "grad_norm": 8.208235945018975, + "learning_rate": 3.208732072368104e-09, + "loss": 1.002, + "step": 13905 + }, + { + "epoch": 1.9687123947051743, + "grad_norm": 8.674799795341551, + "learning_rate": 3.179765613889052e-09, + "loss": 0.8216, + "step": 13906 + }, + { + "epoch": 1.9688539675798116, + "grad_norm": 11.089957263200313, + "learning_rate": 3.150930411388309e-09, + "loss": 0.9744, + "step": 13907 + }, + { + "epoch": 1.9689955404544488, + "grad_norm": 10.246127257617722, + "learning_rate": 3.1222264663813285e-09, + "loss": 1.0275, + "step": 13908 + }, + { + "epoch": 1.969137113329086, + "grad_norm": 9.76963003811009, + "learning_rate": 3.0936537803771814e-09, + "loss": 1.0527, + "step": 13909 + }, + { + "epoch": 1.9692786862037233, + "grad_norm": 9.675072753001857, + "learning_rate": 3.065212354878e-09, + "loss": 1.0166, + "step": 13910 + }, + { + "epoch": 1.9694202590783605, + "grad_norm": 10.993615118070025, + "learning_rate": 3.036902191378699e-09, + "loss": 0.9799, + "step": 13911 + }, + { + "epoch": 1.9695618319529977, + "grad_norm": 10.570399834496934, + "learning_rate": 3.0087232913675325e-09, + "loss": 1.0347, + "step": 13912 + }, + { + "epoch": 1.969703404827635, + "grad_norm": 10.214726860588382, + "learning_rate": 2.980675656326093e-09, + "loss": 1.0828, + "step": 13913 + }, + { + "epoch": 1.9698449777022722, + "grad_norm": 8.17832980688452, + "learning_rate": 2.9527592877284793e-09, + "loss": 0.9624, + "step": 13914 + }, + { + "epoch": 1.9699865505769094, + "grad_norm": 9.055543952835649, + "learning_rate": 2.924974187042684e-09, + "loss": 0.9055, + "step": 13915 + }, + { + "epoch": 1.9701281234515466, + "grad_norm": 10.57999681959483, + "learning_rate": 2.8973203557289274e-09, + "loss": 1.0327, + "step": 13916 + }, + { + "epoch": 1.9702696963261839, + "grad_norm": 11.447457403539852, + "learning_rate": 2.869797795241325e-09, + "loss": 1.0303, + "step": 13917 + }, + { + "epoch": 1.970411269200821, + "grad_norm": 9.351236973500901, + "learning_rate": 2.8424065070262186e-09, + "loss": 0.9229, + "step": 13918 + }, + { + "epoch": 1.9705528420754583, + "grad_norm": 8.598753869906638, + "learning_rate": 2.8151464925241235e-09, + "loss": 0.9216, + "step": 13919 + }, + { + "epoch": 1.9706944149500956, + "grad_norm": 11.4853701205769, + "learning_rate": 2.7880177531677822e-09, + "loss": 0.9128, + "step": 13920 + }, + { + "epoch": 1.9708359878247328, + "grad_norm": 8.765802119840444, + 
"learning_rate": 2.7610202903829986e-09, + "loss": 0.9528, + "step": 13921 + }, + { + "epoch": 1.97097756069937, + "grad_norm": 9.691083479107144, + "learning_rate": 2.734154105589748e-09, + "loss": 0.9595, + "step": 13922 + }, + { + "epoch": 1.9711191335740073, + "grad_norm": 9.500778166371509, + "learning_rate": 2.7074192001996792e-09, + "loss": 0.9791, + "step": 13923 + }, + { + "epoch": 1.9712607064486445, + "grad_norm": 9.292547985298867, + "learning_rate": 2.680815575618889e-09, + "loss": 0.8868, + "step": 13924 + }, + { + "epoch": 1.9714022793232817, + "grad_norm": 10.130296993439677, + "learning_rate": 2.654343233245149e-09, + "loss": 0.9465, + "step": 13925 + }, + { + "epoch": 1.971543852197919, + "grad_norm": 9.832515057499634, + "learning_rate": 2.6280021744706783e-09, + "loss": 1.0062, + "step": 13926 + }, + { + "epoch": 1.9716854250725562, + "grad_norm": 10.447995328496148, + "learning_rate": 2.6017924006799254e-09, + "loss": 0.9874, + "step": 13927 + }, + { + "epoch": 1.9718269979471934, + "grad_norm": 9.833025997553078, + "learning_rate": 2.5757139132509545e-09, + "loss": 1.0468, + "step": 13928 + }, + { + "epoch": 1.9719685708218306, + "grad_norm": 9.938483807311032, + "learning_rate": 2.5497667135546135e-09, + "loss": 0.8859, + "step": 13929 + }, + { + "epoch": 1.9721101436964679, + "grad_norm": 10.559106767041113, + "learning_rate": 2.5239508029545332e-09, + "loss": 0.9626, + "step": 13930 + }, + { + "epoch": 1.972251716571105, + "grad_norm": 9.69171287239149, + "learning_rate": 2.4982661828085175e-09, + "loss": 0.9002, + "step": 13931 + }, + { + "epoch": 1.9723932894457423, + "grad_norm": 10.116733520219885, + "learning_rate": 2.4727128544660415e-09, + "loss": 0.9359, + "step": 13932 + }, + { + "epoch": 1.9725348623203796, + "grad_norm": 11.23063484931517, + "learning_rate": 2.447290819271031e-09, + "loss": 0.9634, + "step": 13933 + }, + { + "epoch": 1.9726764351950168, + "grad_norm": 8.947989756974355, + "learning_rate": 2.4220000785599162e-09, + "loss": 1.0433, + "step": 13934 + }, + { + "epoch": 1.972818008069654, + "grad_norm": 12.025938610474054, + "learning_rate": 2.3968406336616344e-09, + "loss": 1.1168, + "step": 13935 + }, + { + "epoch": 1.972959580944291, + "grad_norm": 9.070209459874457, + "learning_rate": 2.3718124858992943e-09, + "loss": 1.026, + "step": 13936 + }, + { + "epoch": 1.9731011538189283, + "grad_norm": 10.176644936954986, + "learning_rate": 2.3469156365885095e-09, + "loss": 0.9297, + "step": 13937 + }, + { + "epoch": 1.9732427266935655, + "grad_norm": 9.966343891271364, + "learning_rate": 2.3221500870379552e-09, + "loss": 1.0063, + "step": 13938 + }, + { + "epoch": 1.9733842995682027, + "grad_norm": 9.893771329508555, + "learning_rate": 2.2975158385496466e-09, + "loss": 0.928, + "step": 13939 + }, + { + "epoch": 1.97352587244284, + "grad_norm": 9.343549132978893, + "learning_rate": 2.273012892418658e-09, + "loss": 0.914, + "step": 13940 + }, + { + "epoch": 1.9736674453174772, + "grad_norm": 10.759538189362026, + "learning_rate": 2.248641249932848e-09, + "loss": 0.9975, + "step": 13941 + }, + { + "epoch": 1.9738090181921144, + "grad_norm": 8.794294015938377, + "learning_rate": 2.2244009123734145e-09, + "loss": 1.0449, + "step": 13942 + }, + { + "epoch": 1.9739505910667516, + "grad_norm": 7.600852376420985, + "learning_rate": 2.200291881015171e-09, + "loss": 0.8593, + "step": 13943 + }, + { + "epoch": 1.9740921639413889, + "grad_norm": 10.519513843747204, + "learning_rate": 2.1763141571248813e-09, + "loss": 1.0167, + "step": 13944 + }, + { + 
"epoch": 1.974233736816026, + "grad_norm": 9.171049792519666, + "learning_rate": 2.152467741963482e-09, + "loss": 0.9663, + "step": 13945 + }, + { + "epoch": 1.974375309690663, + "grad_norm": 9.481238863189734, + "learning_rate": 2.1287526367844147e-09, + "loss": 0.9002, + "step": 13946 + }, + { + "epoch": 1.9745168825653003, + "grad_norm": 9.161704876423153, + "learning_rate": 2.105168842834182e-09, + "loss": 0.9645, + "step": 13947 + }, + { + "epoch": 1.9746584554399376, + "grad_norm": 9.188942309460952, + "learning_rate": 2.081716361352626e-09, + "loss": 0.9103, + "step": 13948 + }, + { + "epoch": 1.9748000283145748, + "grad_norm": 11.00366323336109, + "learning_rate": 2.058395193572926e-09, + "loss": 1.1364, + "step": 13949 + }, + { + "epoch": 1.974941601189212, + "grad_norm": 11.168397759773235, + "learning_rate": 2.0352053407207696e-09, + "loss": 0.9418, + "step": 13950 + }, + { + "epoch": 1.9750831740638493, + "grad_norm": 7.923587892278972, + "learning_rate": 2.0121468040151803e-09, + "loss": 0.9801, + "step": 13951 + }, + { + "epoch": 1.9752247469384865, + "grad_norm": 10.772649635237109, + "learning_rate": 1.9892195846685227e-09, + "loss": 0.9814, + "step": 13952 + }, + { + "epoch": 1.9753663198131237, + "grad_norm": 7.970676563221486, + "learning_rate": 1.9664236838862204e-09, + "loss": 1.0273, + "step": 13953 + }, + { + "epoch": 1.975507892687761, + "grad_norm": 8.684703788559489, + "learning_rate": 1.9437591028662053e-09, + "loss": 0.935, + "step": 13954 + }, + { + "epoch": 1.9756494655623982, + "grad_norm": 9.900083337298188, + "learning_rate": 1.921225842800023e-09, + "loss": 0.9843, + "step": 13955 + }, + { + "epoch": 1.9757910384370354, + "grad_norm": 8.344774961917132, + "learning_rate": 1.8988239048725598e-09, + "loss": 0.9162, + "step": 13956 + }, + { + "epoch": 1.9759326113116726, + "grad_norm": 9.249192073695587, + "learning_rate": 1.876553290261207e-09, + "loss": 1.0437, + "step": 13957 + }, + { + "epoch": 1.9760741841863099, + "grad_norm": 9.35042514752421, + "learning_rate": 1.854414000136695e-09, + "loss": 0.9327, + "step": 13958 + }, + { + "epoch": 1.976215757060947, + "grad_norm": 9.27804397028548, + "learning_rate": 1.8324060356630925e-09, + "loss": 0.9507, + "step": 13959 + }, + { + "epoch": 1.9763573299355843, + "grad_norm": 9.012726157389114, + "learning_rate": 1.8105293979972516e-09, + "loss": 0.968, + "step": 13960 + }, + { + "epoch": 1.9764989028102216, + "grad_norm": 8.62307540871553, + "learning_rate": 1.7887840882888085e-09, + "loss": 0.9067, + "step": 13961 + }, + { + "epoch": 1.9766404756848588, + "grad_norm": 8.547023625450201, + "learning_rate": 1.7671701076815706e-09, + "loss": 0.9334, + "step": 13962 + }, + { + "epoch": 1.976782048559496, + "grad_norm": 9.83787375761657, + "learning_rate": 1.7456874573112958e-09, + "loss": 0.9523, + "step": 13963 + }, + { + "epoch": 1.9769236214341332, + "grad_norm": 9.910499599885766, + "learning_rate": 1.7243361383076363e-09, + "loss": 1.0004, + "step": 13964 + }, + { + "epoch": 1.9770651943087705, + "grad_norm": 11.16983393450913, + "learning_rate": 1.703116151792472e-09, + "loss": 1.0169, + "step": 13965 + }, + { + "epoch": 1.9772067671834077, + "grad_norm": 9.75306457872385, + "learning_rate": 1.6820274988818552e-09, + "loss": 0.9968, + "step": 13966 + }, + { + "epoch": 1.977348340058045, + "grad_norm": 10.828173616638695, + "learning_rate": 1.6610701806843432e-09, + "loss": 0.9638, + "step": 13967 + }, + { + "epoch": 1.9774899129326822, + "grad_norm": 9.898615545705352, + "learning_rate": 
1.6402441983015548e-09, + "loss": 0.9042, + "step": 13968 + }, + { + "epoch": 1.9776314858073194, + "grad_norm": 9.713250569881371, + "learning_rate": 1.6195495528281701e-09, + "loss": 1.0102, + "step": 13969 + }, + { + "epoch": 1.9777730586819566, + "grad_norm": 9.268443921952326, + "learning_rate": 1.5989862453522075e-09, + "loss": 0.9719, + "step": 13970 + }, + { + "epoch": 1.9779146315565939, + "grad_norm": 8.331816980367243, + "learning_rate": 1.5785542769544692e-09, + "loss": 0.8962, + "step": 13971 + }, + { + "epoch": 1.978056204431231, + "grad_norm": 8.825753276544587, + "learning_rate": 1.5582536487093737e-09, + "loss": 1.0386, + "step": 13972 + }, + { + "epoch": 1.9781977773058683, + "grad_norm": 9.455279245111026, + "learning_rate": 1.5380843616841223e-09, + "loss": 0.7992, + "step": 13973 + }, + { + "epoch": 1.9783393501805056, + "grad_norm": 8.69152563120259, + "learning_rate": 1.518046416938701e-09, + "loss": 0.8825, + "step": 13974 + }, + { + "epoch": 1.9784809230551428, + "grad_norm": 10.708690315757973, + "learning_rate": 1.4981398155267112e-09, + "loss": 0.932, + "step": 13975 + }, + { + "epoch": 1.97862249592978, + "grad_norm": 9.222338168884805, + "learning_rate": 1.4783645584942607e-09, + "loss": 0.9793, + "step": 13976 + }, + { + "epoch": 1.978764068804417, + "grad_norm": 9.207625371953096, + "learning_rate": 1.4587206468816285e-09, + "loss": 0.8506, + "step": 13977 + }, + { + "epoch": 1.9789056416790543, + "grad_norm": 7.837309727648247, + "learning_rate": 1.4392080817207666e-09, + "loss": 0.8996, + "step": 13978 + }, + { + "epoch": 1.9790472145536915, + "grad_norm": 9.450600531333023, + "learning_rate": 1.4198268640377987e-09, + "loss": 0.8772, + "step": 13979 + }, + { + "epoch": 1.9791887874283287, + "grad_norm": 9.44429186030778, + "learning_rate": 1.4005769948516324e-09, + "loss": 0.8713, + "step": 13980 + }, + { + "epoch": 1.979330360302966, + "grad_norm": 10.79476232326227, + "learning_rate": 1.381458475173958e-09, + "loss": 1.0343, + "step": 13981 + }, + { + "epoch": 1.9794719331776032, + "grad_norm": 10.44694594148905, + "learning_rate": 1.3624713060100825e-09, + "loss": 1.0366, + "step": 13982 + }, + { + "epoch": 1.9796135060522404, + "grad_norm": 7.4690697254394784, + "learning_rate": 1.343615488357819e-09, + "loss": 0.9388, + "step": 13983 + }, + { + "epoch": 1.9797550789268776, + "grad_norm": 12.030246762149275, + "learning_rate": 1.324891023208874e-09, + "loss": 1.1146, + "step": 13984 + }, + { + "epoch": 1.9798966518015149, + "grad_norm": 10.559021506781017, + "learning_rate": 1.306297911547183e-09, + "loss": 0.9715, + "step": 13985 + }, + { + "epoch": 1.980038224676152, + "grad_norm": 9.692709820129073, + "learning_rate": 1.287836154350297e-09, + "loss": 0.9135, + "step": 13986 + }, + { + "epoch": 1.9801797975507893, + "grad_norm": 9.254794553581615, + "learning_rate": 1.2695057525888288e-09, + "loss": 0.7929, + "step": 13987 + }, + { + "epoch": 1.9803213704254263, + "grad_norm": 9.374559315814405, + "learning_rate": 1.2513067072261742e-09, + "loss": 0.9053, + "step": 13988 + }, + { + "epoch": 1.9804629433000636, + "grad_norm": 8.804440539737517, + "learning_rate": 1.2332390192193456e-09, + "loss": 1.0364, + "step": 13989 + }, + { + "epoch": 1.9806045161747008, + "grad_norm": 8.718092808172786, + "learning_rate": 1.215302689517861e-09, + "loss": 0.9212, + "step": 13990 + }, + { + "epoch": 1.980746089049338, + "grad_norm": 9.299270552053692, + "learning_rate": 1.1974977190645777e-09, + "loss": 0.9643, + "step": 13991 + }, + { + "epoch": 
1.9808876619239753, + "grad_norm": 9.457992032204828, + "learning_rate": 1.1798241087959684e-09, + "loss": 0.945, + "step": 13992 + }, + { + "epoch": 1.9810292347986125, + "grad_norm": 10.365968013006778, + "learning_rate": 1.1622818596407348e-09, + "loss": 1.0768, + "step": 13993 + }, + { + "epoch": 1.9811708076732497, + "grad_norm": 8.329458276689156, + "learning_rate": 1.1448709725209173e-09, + "loss": 0.8813, + "step": 13994 + }, + { + "epoch": 1.981312380547887, + "grad_norm": 8.947585171179306, + "learning_rate": 1.1275914483521721e-09, + "loss": 1.0332, + "step": 13995 + }, + { + "epoch": 1.9814539534225242, + "grad_norm": 9.60698847715025, + "learning_rate": 1.1104432880429394e-09, + "loss": 0.9802, + "step": 13996 + }, + { + "epoch": 1.9815955262971614, + "grad_norm": 9.721074945483299, + "learning_rate": 1.0934264924941651e-09, + "loss": 1.0361, + "step": 13997 + }, + { + "epoch": 1.9817370991717986, + "grad_norm": 8.280262783874647, + "learning_rate": 1.076541062600689e-09, + "loss": 0.9591, + "step": 13998 + }, + { + "epoch": 1.9818786720464359, + "grad_norm": 7.9795688564308715, + "learning_rate": 1.059786999250134e-09, + "loss": 0.9154, + "step": 13999 + }, + { + "epoch": 1.982020244921073, + "grad_norm": 9.40698644853753, + "learning_rate": 1.0431643033234629e-09, + "loss": 1.0131, + "step": 14000 + }, + { + "epoch": 1.9821618177957103, + "grad_norm": 9.742307709638274, + "learning_rate": 1.0266729756944205e-09, + "loss": 0.9872, + "step": 14001 + }, + { + "epoch": 1.9823033906703476, + "grad_norm": 8.972413487818466, + "learning_rate": 1.0103130172295362e-09, + "loss": 0.8758, + "step": 14002 + }, + { + "epoch": 1.9824449635449848, + "grad_norm": 10.217186111097742, + "learning_rate": 9.940844287895101e-10, + "loss": 1.0029, + "step": 14003 + }, + { + "epoch": 1.982586536419622, + "grad_norm": 9.35798790130844, + "learning_rate": 9.779872112267163e-10, + "loss": 0.9376, + "step": 14004 + }, + { + "epoch": 1.9827281092942592, + "grad_norm": 8.42385416360745, + "learning_rate": 9.62021365388255e-10, + "loss": 0.8877, + "step": 14005 + }, + { + "epoch": 1.9828696821688965, + "grad_norm": 10.830056814958747, + "learning_rate": 9.461868921126216e-10, + "loss": 0.9722, + "step": 14006 + }, + { + "epoch": 1.9830112550435337, + "grad_norm": 8.939988436766107, + "learning_rate": 9.304837922327614e-10, + "loss": 0.8681, + "step": 14007 + }, + { + "epoch": 1.983152827918171, + "grad_norm": 8.393747327723188, + "learning_rate": 9.149120665738476e-10, + "loss": 0.9681, + "step": 14008 + }, + { + "epoch": 1.9832944007928082, + "grad_norm": 11.694356121133417, + "learning_rate": 8.994717159546695e-10, + "loss": 1.0164, + "step": 14009 + }, + { + "epoch": 1.9834359736674454, + "grad_norm": 10.365252592373759, + "learning_rate": 8.841627411870779e-10, + "loss": 1.123, + "step": 14010 + }, + { + "epoch": 1.9835775465420826, + "grad_norm": 10.488184866293633, + "learning_rate": 8.689851430754293e-10, + "loss": 0.9634, + "step": 14011 + }, + { + "epoch": 1.9837191194167199, + "grad_norm": 10.705136387190935, + "learning_rate": 8.539389224176964e-10, + "loss": 0.9358, + "step": 14012 + }, + { + "epoch": 1.983860692291357, + "grad_norm": 8.901929376188756, + "learning_rate": 8.390240800051907e-10, + "loss": 0.8926, + "step": 14013 + }, + { + "epoch": 1.9840022651659943, + "grad_norm": 8.39886990165939, + "learning_rate": 8.242406166214522e-10, + "loss": 0.8348, + "step": 14014 + }, + { + "epoch": 1.9841438380406315, + "grad_norm": 9.018948423406274, + "learning_rate": 8.095885330441921e-10, + 
"loss": 1.0428, + "step": 14015 + }, + { + "epoch": 1.9842854109152688, + "grad_norm": 10.493082947060255, + "learning_rate": 7.950678300430725e-10, + "loss": 1.0039, + "step": 14016 + }, + { + "epoch": 1.984426983789906, + "grad_norm": 11.550248867507886, + "learning_rate": 7.806785083819268e-10, + "loss": 0.999, + "step": 14017 + }, + { + "epoch": 1.9845685566645432, + "grad_norm": 10.760185561724942, + "learning_rate": 7.664205688170945e-10, + "loss": 0.9057, + "step": 14018 + }, + { + "epoch": 1.9847101295391802, + "grad_norm": 9.608465483507443, + "learning_rate": 7.52294012097976e-10, + "loss": 0.9473, + "step": 14019 + }, + { + "epoch": 1.9848517024138175, + "grad_norm": 9.820782124436146, + "learning_rate": 7.382988389673107e-10, + "loss": 0.9244, + "step": 14020 + }, + { + "epoch": 1.9849932752884547, + "grad_norm": 9.620168290599345, + "learning_rate": 7.244350501606212e-10, + "loss": 0.9556, + "step": 14021 + }, + { + "epoch": 1.985134848163092, + "grad_norm": 12.290730551725442, + "learning_rate": 7.10702646406769e-10, + "loss": 0.9837, + "step": 14022 + }, + { + "epoch": 1.9852764210377292, + "grad_norm": 9.247564871612823, + "learning_rate": 6.971016284279541e-10, + "loss": 0.8975, + "step": 14023 + }, + { + "epoch": 1.9854179939123664, + "grad_norm": 10.092975976514559, + "learning_rate": 6.836319969388828e-10, + "loss": 0.9068, + "step": 14024 + }, + { + "epoch": 1.9855595667870036, + "grad_norm": 9.819925208030282, + "learning_rate": 6.702937526475994e-10, + "loss": 0.9037, + "step": 14025 + }, + { + "epoch": 1.9857011396616409, + "grad_norm": 8.894354120299301, + "learning_rate": 6.570868962554877e-10, + "loss": 0.9689, + "step": 14026 + }, + { + "epoch": 1.985842712536278, + "grad_norm": 9.814420670033403, + "learning_rate": 6.440114284567145e-10, + "loss": 0.9736, + "step": 14027 + }, + { + "epoch": 1.9859842854109153, + "grad_norm": 9.140124602355376, + "learning_rate": 6.310673499387854e-10, + "loss": 0.9375, + "step": 14028 + }, + { + "epoch": 1.9861258582855523, + "grad_norm": 11.806328993250823, + "learning_rate": 6.182546613817119e-10, + "loss": 1.0271, + "step": 14029 + }, + { + "epoch": 1.9862674311601896, + "grad_norm": 10.289114034490341, + "learning_rate": 6.055733634596772e-10, + "loss": 1.0454, + "step": 14030 + }, + { + "epoch": 1.9864090040348268, + "grad_norm": 10.499511344074802, + "learning_rate": 5.930234568388149e-10, + "loss": 1.0557, + "step": 14031 + }, + { + "epoch": 1.986550576909464, + "grad_norm": 9.30812820122024, + "learning_rate": 5.806049421791527e-10, + "loss": 0.8842, + "step": 14032 + }, + { + "epoch": 1.9866921497841012, + "grad_norm": 11.978519287307854, + "learning_rate": 5.683178201335015e-10, + "loss": 1.1155, + "step": 14033 + }, + { + "epoch": 1.9868337226587385, + "grad_norm": 9.072371795328705, + "learning_rate": 5.561620913477339e-10, + "loss": 1.0163, + "step": 14034 + }, + { + "epoch": 1.9869752955333757, + "grad_norm": 8.268525103381734, + "learning_rate": 5.44137756460783e-10, + "loss": 0.965, + "step": 14035 + }, + { + "epoch": 1.987116868408013, + "grad_norm": 10.72045244315451, + "learning_rate": 5.322448161049209e-10, + "loss": 1.0096, + "step": 14036 + }, + { + "epoch": 1.9872584412826502, + "grad_norm": 10.159443232982762, + "learning_rate": 5.204832709052032e-10, + "loss": 1.0069, + "step": 14037 + }, + { + "epoch": 1.9874000141572874, + "grad_norm": 9.766401922220263, + "learning_rate": 5.088531214800241e-10, + "loss": 0.9671, + "step": 14038 + }, + { + "epoch": 1.9875415870319246, + "grad_norm": 
10.770727357521338, + "learning_rate": 4.973543684408389e-10, + "loss": 0.9431, + "step": 14039 + }, + { + "epoch": 1.9876831599065619, + "grad_norm": 10.213340517378523, + "learning_rate": 4.859870123918864e-10, + "loss": 1.0877, + "step": 14040 + }, + { + "epoch": 1.987824732781199, + "grad_norm": 10.704574063632288, + "learning_rate": 4.747510539307442e-10, + "loss": 1.0495, + "step": 14041 + }, + { + "epoch": 1.9879663056558363, + "grad_norm": 9.597340660511444, + "learning_rate": 4.636464936483287e-10, + "loss": 0.8816, + "step": 14042 + }, + { + "epoch": 1.9881078785304735, + "grad_norm": 10.459705966879481, + "learning_rate": 4.5267333212833943e-10, + "loss": 1.0029, + "step": 14043 + }, + { + "epoch": 1.9882494514051108, + "grad_norm": 8.389996212036587, + "learning_rate": 4.418315699475373e-10, + "loss": 0.9831, + "step": 14044 + }, + { + "epoch": 1.988391024279748, + "grad_norm": 8.748187504102413, + "learning_rate": 4.311212076760218e-10, + "loss": 0.963, + "step": 14045 + }, + { + "epoch": 1.9885325971543852, + "grad_norm": 10.537162773555615, + "learning_rate": 4.2054224587667615e-10, + "loss": 0.9568, + "step": 14046 + }, + { + "epoch": 1.9886741700290225, + "grad_norm": 9.078241816965699, + "learning_rate": 4.1009468510544434e-10, + "loss": 1.0877, + "step": 14047 + }, + { + "epoch": 1.9888157429036597, + "grad_norm": 11.14841525871555, + "learning_rate": 3.9977852591188694e-10, + "loss": 0.97, + "step": 14048 + }, + { + "epoch": 1.988957315778297, + "grad_norm": 11.62167868800991, + "learning_rate": 3.8959376883834776e-10, + "loss": 1.0277, + "step": 14049 + }, + { + "epoch": 1.9890988886529342, + "grad_norm": 9.66497388904279, + "learning_rate": 3.795404144199544e-10, + "loss": 0.8741, + "step": 14050 + }, + { + "epoch": 1.9892404615275714, + "grad_norm": 9.64908153531193, + "learning_rate": 3.696184631851729e-10, + "loss": 1.0663, + "step": 14051 + }, + { + "epoch": 1.9893820344022086, + "grad_norm": 9.797268654247269, + "learning_rate": 3.5982791565608575e-10, + "loss": 0.9946, + "step": 14052 + }, + { + "epoch": 1.9895236072768459, + "grad_norm": 11.365710814736524, + "learning_rate": 3.501687723467262e-10, + "loss": 0.9605, + "step": 14053 + }, + { + "epoch": 1.989665180151483, + "grad_norm": 9.827415064655982, + "learning_rate": 3.4064103376529876e-10, + "loss": 1.0662, + "step": 14054 + }, + { + "epoch": 1.9898067530261203, + "grad_norm": 10.506409278402064, + "learning_rate": 3.3124470041251413e-10, + "loss": 1.0051, + "step": 14055 + }, + { + "epoch": 1.9899483259007575, + "grad_norm": 10.867001453071344, + "learning_rate": 3.219797727824214e-10, + "loss": 1.0141, + "step": 14056 + }, + { + "epoch": 1.9900898987753948, + "grad_norm": 8.382054332851057, + "learning_rate": 3.128462513618535e-10, + "loss": 0.9268, + "step": 14057 + }, + { + "epoch": 1.990231471650032, + "grad_norm": 10.234006732400957, + "learning_rate": 3.0384413663125944e-10, + "loss": 1.0637, + "step": 14058 + }, + { + "epoch": 1.9903730445246692, + "grad_norm": 9.962880095502669, + "learning_rate": 2.9497342906387173e-10, + "loss": 0.949, + "step": 14059 + }, + { + "epoch": 1.9905146173993062, + "grad_norm": 9.377295454657773, + "learning_rate": 2.862341291257065e-10, + "loss": 0.9723, + "step": 14060 + }, + { + "epoch": 1.9906561902739435, + "grad_norm": 8.355732175750434, + "learning_rate": 2.776262372761185e-10, + "loss": 0.9262, + "step": 14061 + }, + { + "epoch": 1.9907977631485807, + "grad_norm": 9.111438445540584, + "learning_rate": 2.6914975396807873e-10, + "loss": 1.0132, + "step": 14062 
+ }, + { + "epoch": 1.990939336023218, + "grad_norm": 9.803736180815319, + "learning_rate": 2.6080467964706424e-10, + "loss": 0.8982, + "step": 14063 + }, + { + "epoch": 1.9910809088978552, + "grad_norm": 9.478522265542182, + "learning_rate": 2.525910147516131e-10, + "loss": 0.9836, + "step": 14064 + }, + { + "epoch": 1.9912224817724924, + "grad_norm": 9.085043603249751, + "learning_rate": 2.4450875971332445e-10, + "loss": 1.0378, + "step": 14065 + }, + { + "epoch": 1.9913640546471296, + "grad_norm": 11.780454735260912, + "learning_rate": 2.3655791495769134e-10, + "loss": 1.1135, + "step": 14066 + }, + { + "epoch": 1.9915056275217669, + "grad_norm": 8.86473930272478, + "learning_rate": 2.2873848090188e-10, + "loss": 0.912, + "step": 14067 + }, + { + "epoch": 1.991647200396404, + "grad_norm": 8.987107261172982, + "learning_rate": 2.2105045795778323e-10, + "loss": 0.9194, + "step": 14068 + }, + { + "epoch": 1.9917887732710413, + "grad_norm": 10.506981481428156, + "learning_rate": 2.134938465289671e-10, + "loss": 0.962, + "step": 14069 + }, + { + "epoch": 1.9919303461456785, + "grad_norm": 9.224682777140208, + "learning_rate": 2.0606864701289142e-10, + "loss": 0.9546, + "step": 14070 + }, + { + "epoch": 1.9920719190203156, + "grad_norm": 8.660313943365809, + "learning_rate": 1.987748597997996e-10, + "loss": 0.9418, + "step": 14071 + }, + { + "epoch": 1.9922134918949528, + "grad_norm": 11.516909565183587, + "learning_rate": 1.916124852732737e-10, + "loss": 1.0818, + "step": 14072 + }, + { + "epoch": 1.99235506476959, + "grad_norm": 8.699773447879755, + "learning_rate": 1.845815238096793e-10, + "loss": 0.9736, + "step": 14073 + }, + { + "epoch": 1.9924966376442272, + "grad_norm": 7.231468193435274, + "learning_rate": 1.776819757787207e-10, + "loss": 1.0841, + "step": 14074 + }, + { + "epoch": 1.9926382105188645, + "grad_norm": 11.735148032335182, + "learning_rate": 1.7091384154288571e-10, + "loss": 1.0571, + "step": 14075 + }, + { + "epoch": 1.9927797833935017, + "grad_norm": 9.75933821076059, + "learning_rate": 1.6427712145827834e-10, + "loss": 0.9774, + "step": 14076 + }, + { + "epoch": 1.992921356268139, + "grad_norm": 8.190988045178067, + "learning_rate": 1.577718158737862e-10, + "loss": 0.8637, + "step": 14077 + }, + { + "epoch": 1.9930629291427762, + "grad_norm": 10.10490099149178, + "learning_rate": 1.5139792513135799e-10, + "loss": 0.9389, + "step": 14078 + }, + { + "epoch": 1.9932045020174134, + "grad_norm": 8.639374626980388, + "learning_rate": 1.451554495657259e-10, + "loss": 0.9225, + "step": 14079 + }, + { + "epoch": 1.9933460748920506, + "grad_norm": 9.344997973332152, + "learning_rate": 1.39044389505516e-10, + "loss": 0.9441, + "step": 14080 + }, + { + "epoch": 1.9934876477666879, + "grad_norm": 8.03580568291897, + "learning_rate": 1.3306474527158275e-10, + "loss": 0.872, + "step": 14081 + }, + { + "epoch": 1.993629220641325, + "grad_norm": 9.467412369420497, + "learning_rate": 1.2721651717839678e-10, + "loss": 0.9022, + "step": 14082 + }, + { + "epoch": 1.9937707935159623, + "grad_norm": 8.939967955120098, + "learning_rate": 1.2149970553376745e-10, + "loss": 0.8157, + "step": 14083 + }, + { + "epoch": 1.9939123663905995, + "grad_norm": 8.383081892223094, + "learning_rate": 1.1591431063745495e-10, + "loss": 1.0132, + "step": 14084 + }, + { + "epoch": 1.9940539392652368, + "grad_norm": 10.664703864069587, + "learning_rate": 1.1046033278394597e-10, + "loss": 0.9679, + "step": 14085 + }, + { + "epoch": 1.994195512139874, + "grad_norm": 8.761370001836587, + "learning_rate": 
1.0513777225940047e-10, + "loss": 1.0198, + "step": 14086 + }, + { + "epoch": 1.9943370850145112, + "grad_norm": 9.05917318381838, + "learning_rate": 9.994662934387223e-11, + "loss": 0.8921, + "step": 14087 + }, + { + "epoch": 1.9944786578891485, + "grad_norm": 9.996249449256347, + "learning_rate": 9.488690430992098e-11, + "loss": 0.9482, + "step": 14088 + }, + { + "epoch": 1.9946202307637857, + "grad_norm": 9.876373581783605, + "learning_rate": 8.99585974237227e-11, + "loss": 0.9781, + "step": 14089 + }, + { + "epoch": 1.994761803638423, + "grad_norm": 10.231358393687424, + "learning_rate": 8.516170894479203e-11, + "loss": 0.9941, + "step": 14090 + }, + { + "epoch": 1.9949033765130602, + "grad_norm": 9.65139006253297, + "learning_rate": 8.049623912459448e-11, + "loss": 0.9969, + "step": 14091 + }, + { + "epoch": 1.9950449493876974, + "grad_norm": 8.456945997693857, + "learning_rate": 7.596218820876688e-11, + "loss": 0.9453, + "step": 14092 + }, + { + "epoch": 1.9951865222623346, + "grad_norm": 10.373410758127529, + "learning_rate": 7.15595564354521e-11, + "loss": 0.9588, + "step": 14093 + }, + { + "epoch": 1.9953280951369718, + "grad_norm": 9.539160475669041, + "learning_rate": 6.728834403640916e-11, + "loss": 0.9831, + "step": 14094 + }, + { + "epoch": 1.995469668011609, + "grad_norm": 11.539020845600172, + "learning_rate": 6.314855123590313e-11, + "loss": 1.027, + "step": 14095 + }, + { + "epoch": 1.9956112408862463, + "grad_norm": 9.496088126026718, + "learning_rate": 5.914017825153773e-11, + "loss": 0.9072, + "step": 14096 + }, + { + "epoch": 1.9957528137608835, + "grad_norm": 8.727819176385564, + "learning_rate": 5.526322529425532e-11, + "loss": 0.9314, + "step": 14097 + }, + { + "epoch": 1.9958943866355208, + "grad_norm": 9.016879042441216, + "learning_rate": 5.151769256778183e-11, + "loss": 0.9545, + "step": 14098 + }, + { + "epoch": 1.996035959510158, + "grad_norm": 9.274147887043947, + "learning_rate": 4.790358026890429e-11, + "loss": 0.9317, + "step": 14099 + }, + { + "epoch": 1.9961775323847952, + "grad_norm": 7.756148329813097, + "learning_rate": 4.44208885877484e-11, + "loss": 0.8546, + "step": 14100 + }, + { + "epoch": 1.9963191052594325, + "grad_norm": 9.180174937191634, + "learning_rate": 4.1069617707223396e-11, + "loss": 0.9981, + "step": 14101 + }, + { + "epoch": 1.9964606781340695, + "grad_norm": 10.210510104865598, + "learning_rate": 3.7849767803854745e-11, + "loss": 0.9232, + "step": 14102 + }, + { + "epoch": 1.9966022510087067, + "grad_norm": 8.289224952019083, + "learning_rate": 3.4761339046396336e-11, + "loss": 0.9707, + "step": 14103 + }, + { + "epoch": 1.996743823883344, + "grad_norm": 8.788531233943473, + "learning_rate": 3.1804331597773406e-11, + "loss": 1.0367, + "step": 14104 + }, + { + "epoch": 1.9968853967579812, + "grad_norm": 10.15907450207791, + "learning_rate": 2.897874561286207e-11, + "loss": 0.9704, + "step": 14105 + }, + { + "epoch": 1.9970269696326184, + "grad_norm": 8.538672044767027, + "learning_rate": 2.6284581240709762e-11, + "loss": 0.8485, + "step": 14106 + }, + { + "epoch": 1.9971685425072556, + "grad_norm": 8.258352040084556, + "learning_rate": 2.3721838622592362e-11, + "loss": 0.9273, + "step": 14107 + }, + { + "epoch": 1.9973101153818928, + "grad_norm": 8.326705382200853, + "learning_rate": 2.1290517893401974e-11, + "loss": 0.9373, + "step": 14108 + }, + { + "epoch": 1.99745168825653, + "grad_norm": 11.450221590121766, + "learning_rate": 1.899061918081424e-11, + "loss": 1.0987, + "step": 14109 + }, + { + "epoch": 1.9975932611311673, + 
"grad_norm": 9.136704972066354, + "learning_rate": 1.682214260584347e-11, + "loss": 0.9543, + "step": 14110 + }, + { + "epoch": 1.9977348340058045, + "grad_norm": 9.715882298353002, + "learning_rate": 1.4785088282565084e-11, + "loss": 0.9831, + "step": 14111 + }, + { + "epoch": 1.9978764068804415, + "grad_norm": 10.009648817916032, + "learning_rate": 1.2879456318115602e-11, + "loss": 0.9491, + "step": 14112 + }, + { + "epoch": 1.9980179797550788, + "grad_norm": 11.14523532487212, + "learning_rate": 1.1105246812137538e-11, + "loss": 0.8261, + "step": 14113 + }, + { + "epoch": 1.998159552629716, + "grad_norm": 8.472663003541568, + "learning_rate": 9.462459858444739e-12, + "loss": 0.9505, + "step": 14114 + }, + { + "epoch": 1.9983011255043532, + "grad_norm": 8.70827278469032, + "learning_rate": 7.951095543357046e-12, + "loss": 0.9245, + "step": 14115 + }, + { + "epoch": 1.9984426983789905, + "grad_norm": 9.630333945880572, + "learning_rate": 6.5711539462554044e-12, + "loss": 0.9342, + "step": 14116 + }, + { + "epoch": 1.9985842712536277, + "grad_norm": 10.226881798873253, + "learning_rate": 5.322635139304311e-12, + "loss": 0.8568, + "step": 14117 + }, + { + "epoch": 1.998725844128265, + "grad_norm": 10.778458297411692, + "learning_rate": 4.205539188839591e-12, + "loss": 1.0366, + "step": 14118 + }, + { + "epoch": 1.9988674170029022, + "grad_norm": 8.40262976762149, + "learning_rate": 3.219866153147955e-12, + "loss": 0.9487, + "step": 14119 + }, + { + "epoch": 1.9990089898775394, + "grad_norm": 8.21709933221857, + "learning_rate": 2.3656160838547713e-12, + "loss": 0.9008, + "step": 14120 + }, + { + "epoch": 1.9991505627521766, + "grad_norm": 10.678906650084125, + "learning_rate": 1.64278902620163e-12, + "loss": 0.8971, + "step": 14121 + }, + { + "epoch": 1.9992921356268138, + "grad_norm": 8.876277777871838, + "learning_rate": 1.0513850182136687e-12, + "loss": 1.0227, + "step": 14122 + }, + { + "epoch": 1.999433708501451, + "grad_norm": 9.206272590852926, + "learning_rate": 5.914040909771324e-13, + "loss": 0.9777, + "step": 14123 + }, + { + "epoch": 1.9995752813760883, + "grad_norm": 10.335507892148884, + "learning_rate": 2.62846268361816e-13, + "loss": 0.8741, + "step": 14124 + }, + { + "epoch": 1.9997168542507255, + "grad_norm": 9.391732301344476, + "learning_rate": 6.571156785373234e-14, + "loss": 1.0245, + "step": 14125 + }, + { + "epoch": 1.9998584271253628, + "grad_norm": 9.947442988490929, + "learning_rate": 0.0, + "loss": 0.8958, + "step": 14126 + }, + { + "epoch": 1.9998584271253628, + "step": 14126, + "total_flos": 3113833205456896.0, + "train_loss": 1.16577873653365, + "train_runtime": 406995.3187, + "train_samples_per_second": 4.443, + "train_steps_per_second": 0.035 + } + ], + "logging_steps": 1.0, + "max_steps": 14126, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3113833205456896.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}