diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,60382 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9997680083516993, + "eval_steps": 500, + "global_step": 8620, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00023199164830066117, + "grad_norm": 74.39870819089624, + "learning_rate": 0.0, + "loss": 2.1794, + "step": 1 + }, + { + "epoch": 0.00046398329660132234, + "grad_norm": 112.28468562448423, + "learning_rate": 3.1041712554031024e-07, + "loss": 2.3896, + "step": 2 + }, + { + "epoch": 0.0006959749449019835, + "grad_norm": 75.42842213959644, + "learning_rate": 4.919995035630432e-07, + "loss": 2.0454, + "step": 3 + }, + { + "epoch": 0.0009279665932026447, + "grad_norm": 105.0533217529224, + "learning_rate": 6.208342510806205e-07, + "loss": 2.3266, + "step": 4 + }, + { + "epoch": 0.0011599582415033058, + "grad_norm": 148.58005460638825, + "learning_rate": 7.207662449262237e-07, + "loss": 2.6506, + "step": 5 + }, + { + "epoch": 0.001391949889803967, + "grad_norm": 69.77358744845148, + "learning_rate": 8.024166291033534e-07, + "loss": 1.8992, + "step": 6 + }, + { + "epoch": 0.0016239415381046282, + "grad_norm": 114.58129896954524, + "learning_rate": 8.71451045276563e-07, + "loss": 1.137, + "step": 7 + }, + { + "epoch": 0.0018559331864052894, + "grad_norm": 94.46712989635765, + "learning_rate": 9.312513766209306e-07, + "loss": 0.9458, + "step": 8 + }, + { + "epoch": 0.0020879248347059507, + "grad_norm": 137.0968911795173, + "learning_rate": 9.839990071260865e-07, + "loss": 0.7461, + "step": 9 + }, + { + "epoch": 0.0023199164830066117, + "grad_norm": 31.144565059150484, + "learning_rate": 1.031183370466534e-06, + "loss": 0.9004, + "step": 10 + }, + { + "epoch": 0.002551908131307273, + "grad_norm": 32.870107426052726, + "learning_rate": 1.0738668190606527e-06, + "loss": 0.9956, + "step": 11 + }, + { + "epoch": 0.002783899779607934, + "grad_norm": 52.54313022163986, + "learning_rate": 1.1128337546436637e-06, + "loss": 0.7702, + "step": 12 + }, + { + "epoch": 0.0030158914279085954, + "grad_norm": 43.63442634279012, + "learning_rate": 1.1486798605405537e-06, + "loss": 0.8403, + "step": 13 + }, + { + "epoch": 0.0032478830762092564, + "grad_norm": 18.930724925338343, + "learning_rate": 1.1818681708168735e-06, + "loss": 0.9008, + "step": 14 + }, + { + "epoch": 0.0034798747245099178, + "grad_norm": 37.59291122718292, + "learning_rate": 1.212765748489267e-06, + "loss": 0.8522, + "step": 15 + }, + { + "epoch": 0.0037118663728105787, + "grad_norm": 771.0884603354021, + "learning_rate": 1.241668502161241e-06, + "loss": 0.721, + "step": 16 + }, + { + "epoch": 0.00394385802111124, + "grad_norm": 28.570318092553354, + "learning_rate": 1.2688184659337598e-06, + "loss": 0.7248, + "step": 17 + }, + { + "epoch": 0.0041758496694119015, + "grad_norm": 24.426651580387347, + "learning_rate": 1.2944161326663965e-06, + "loss": 0.8323, + "step": 18 + }, + { + "epoch": 0.004407841317712562, + "grad_norm": 19.369835379439408, + "learning_rate": 1.3186294482267552e-06, + "loss": 0.696, + "step": 19 + }, + { + "epoch": 0.004639832966013223, + "grad_norm": 41.14435097126691, + "learning_rate": 1.3416004960068441e-06, + "loss": 0.7574, + "step": 20 + }, + { + "epoch": 0.004871824614313885, + "grad_norm": 13.411501946608672, + "learning_rate": 1.3634505488396065e-06, + "loss": 0.7499, + "step": 21 + }, + { + "epoch": 0.005103816262614546, + "grad_norm": 32.892805474385675, + "learning_rate": 1.384283944600963e-06, + "loss": 0.6824, + "step": 22 + }, + { + "epoch": 0.005335807910915207, + "grad_norm": 39.84552686658576, + "learning_rate": 1.4041910996027212e-06, + "loss": 0.4729, + "step": 23 + }, + { + "epoch": 0.005567799559215868, + "grad_norm": 22.06254779204553, + "learning_rate": 1.423250880183974e-06, + "loss": 0.6192, + "step": 24 + }, + { + "epoch": 0.0057997912075165295, + "grad_norm": 37.344375903194816, + "learning_rate": 1.4415324898524475e-06, + "loss": 0.7357, + "step": 25 + }, + { + "epoch": 0.006031782855817191, + "grad_norm": 32.31772823142612, + "learning_rate": 1.459096986080864e-06, + "loss": 0.5671, + "step": 26 + }, + { + "epoch": 0.006263774504117851, + "grad_norm": 10.92384480057118, + "learning_rate": 1.4759985106891297e-06, + "loss": 0.6269, + "step": 27 + }, + { + "epoch": 0.006495766152418513, + "grad_norm": 35.369564086076316, + "learning_rate": 1.4922852963571837e-06, + "loss": 0.6397, + "step": 28 + }, + { + "epoch": 0.006727757800719174, + "grad_norm": 18.49217287082658, + "learning_rate": 1.5080004964369569e-06, + "loss": 0.628, + "step": 29 + }, + { + "epoch": 0.0069597494490198355, + "grad_norm": 41.13501433825827, + "learning_rate": 1.5231828740295771e-06, + "loss": 0.6695, + "step": 30 + }, + { + "epoch": 0.007191741097320496, + "grad_norm": 54.22576765655488, + "learning_rate": 1.5378673780327045e-06, + "loss": 0.6157, + "step": 31 + }, + { + "epoch": 0.007423732745621157, + "grad_norm": 16.447372540566636, + "learning_rate": 1.5520856277015512e-06, + "loss": 0.5254, + "step": 32 + }, + { + "epoch": 0.007655724393921819, + "grad_norm": 32.63353299704918, + "learning_rate": 1.5658663226236957e-06, + "loss": 0.6434, + "step": 33 + }, + { + "epoch": 0.00788771604222248, + "grad_norm": 24.91733224181655, + "learning_rate": 1.57923559147407e-06, + "loss": 0.5484, + "step": 34 + }, + { + "epoch": 0.00811970769052314, + "grad_norm": 25.808395796187114, + "learning_rate": 1.592217290202787e-06, + "loss": 0.5112, + "step": 35 + }, + { + "epoch": 0.008351699338823803, + "grad_norm": 16.243462702948914, + "learning_rate": 1.6048332582067068e-06, + "loss": 0.4698, + "step": 36 + }, + { + "epoch": 0.008583690987124463, + "grad_norm": 23.97628827091189, + "learning_rate": 1.6171035393948334e-06, + "loss": 0.5316, + "step": 37 + }, + { + "epoch": 0.008815682635425124, + "grad_norm": 27.040198776241745, + "learning_rate": 1.6290465737670655e-06, + "loss": 0.485, + "step": 38 + }, + { + "epoch": 0.009047674283725786, + "grad_norm": 38.2640149689467, + "learning_rate": 1.6406793641035967e-06, + "loss": 0.5918, + "step": 39 + }, + { + "epoch": 0.009279665932026447, + "grad_norm": 22.98109068927263, + "learning_rate": 1.6520176215471544e-06, + "loss": 0.4683, + "step": 40 + }, + { + "epoch": 0.009511657580327109, + "grad_norm": 48.11308667579086, + "learning_rate": 1.6630758932062726e-06, + "loss": 0.625, + "step": 41 + }, + { + "epoch": 0.00974364922862777, + "grad_norm": 33.404519417339436, + "learning_rate": 1.6738676743799167e-06, + "loss": 0.5841, + "step": 42 + }, + { + "epoch": 0.00997564087692843, + "grad_norm": 58.61162129240976, + "learning_rate": 1.6844055075753218e-06, + "loss": 0.4964, + "step": 43 + }, + { + "epoch": 0.010207632525229092, + "grad_norm": 38.349120521370125, + "learning_rate": 1.694701070141273e-06, + "loss": 0.5981, + "step": 44 + }, + { + "epoch": 0.010439624173529753, + "grad_norm": 14.12305267247844, + "learning_rate": 1.7047652520523101e-06, + "loss": 0.3997, + "step": 45 + }, + { + "epoch": 0.010671615821830413, + "grad_norm": 51.54573323050249, + "learning_rate": 1.7146082251430314e-06, + "loss": 0.5963, + "step": 46 + }, + { + "epoch": 0.010903607470131076, + "grad_norm": 39.1682679389499, + "learning_rate": 1.7242395048960248e-06, + "loss": 0.5537, + "step": 47 + }, + { + "epoch": 0.011135599118431736, + "grad_norm": 46.60587716219377, + "learning_rate": 1.7336680057242842e-06, + "loss": 0.6186, + "step": 48 + }, + { + "epoch": 0.011367590766732398, + "grad_norm": 34.28832247434921, + "learning_rate": 1.742902090553126e-06, + "loss": 0.4981, + "step": 49 + }, + { + "epoch": 0.011599582415033059, + "grad_norm": 36.911818320913, + "learning_rate": 1.7519496153927577e-06, + "loss": 0.5478, + "step": 50 + }, + { + "epoch": 0.01183157406333372, + "grad_norm": 19.20406082351278, + "learning_rate": 1.760817969496803e-06, + "loss": 0.4656, + "step": 51 + }, + { + "epoch": 0.012063565711634382, + "grad_norm": 25.02534356933273, + "learning_rate": 1.7695141116211742e-06, + "loss": 0.3916, + "step": 52 + }, + { + "epoch": 0.012295557359935042, + "grad_norm": 18.593964365810773, + "learning_rate": 1.7780446028290557e-06, + "loss": 0.4645, + "step": 53 + }, + { + "epoch": 0.012527549008235703, + "grad_norm": 17.886039648783353, + "learning_rate": 1.7864156362294398e-06, + "loss": 0.4096, + "step": 54 + }, + { + "epoch": 0.012759540656536365, + "grad_norm": 34.88545271696248, + "learning_rate": 1.7946330639868765e-06, + "loss": 0.642, + "step": 55 + }, + { + "epoch": 0.012991532304837025, + "grad_norm": 18.784349156418234, + "learning_rate": 1.802702421897494e-06, + "loss": 0.4415, + "step": 56 + }, + { + "epoch": 0.013223523953137688, + "grad_norm": 37.50634451781366, + "learning_rate": 1.8106289517897987e-06, + "loss": 0.3873, + "step": 57 + }, + { + "epoch": 0.013455515601438348, + "grad_norm": 16.880603630761, + "learning_rate": 1.818417621977267e-06, + "loss": 0.5032, + "step": 58 + }, + { + "epoch": 0.013687507249739009, + "grad_norm": 17.02585172512492, + "learning_rate": 1.8260731459625882e-06, + "loss": 0.4495, + "step": 59 + }, + { + "epoch": 0.013919498898039671, + "grad_norm": 107.35379091818216, + "learning_rate": 1.8335999995698872e-06, + "loss": 0.696, + "step": 60 + }, + { + "epoch": 0.014151490546340332, + "grad_norm": 19.40508069322086, + "learning_rate": 1.8410024366608638e-06, + "loss": 0.4458, + "step": 61 + }, + { + "epoch": 0.014383482194640992, + "grad_norm": 23.044132880786158, + "learning_rate": 1.8482845035730148e-06, + "loss": 0.5393, + "step": 62 + }, + { + "epoch": 0.014615473842941654, + "grad_norm": 28.654552405092105, + "learning_rate": 1.8554500524026495e-06, + "loss": 0.5256, + "step": 63 + }, + { + "epoch": 0.014847465491242315, + "grad_norm": 48.577148536807755, + "learning_rate": 1.8625027532418612e-06, + "loss": 0.5776, + "step": 64 + }, + { + "epoch": 0.015079457139542977, + "grad_norm": 21.245792672313023, + "learning_rate": 1.869446105466777e-06, + "loss": 0.4652, + "step": 65 + }, + { + "epoch": 0.015311448787843638, + "grad_norm": 71.60793690174401, + "learning_rate": 1.8762834481640057e-06, + "loss": 0.5653, + "step": 66 + }, + { + "epoch": 0.015543440436144298, + "grad_norm": 26.128541484591057, + "learning_rate": 1.8830179697730491e-06, + "loss": 0.535, + "step": 67 + }, + { + "epoch": 0.01577543208444496, + "grad_norm": 33.64066732768441, + "learning_rate": 1.8896527170143803e-06, + "loss": 0.5011, + "step": 68 + }, + { + "epoch": 0.01600742373274562, + "grad_norm": 26.88917149800284, + "learning_rate": 1.8961906031657644e-06, + "loss": 0.5581, + "step": 69 + }, + { + "epoch": 0.01623941538104628, + "grad_norm": 25.195762814497893, + "learning_rate": 1.9026344157430974e-06, + "loss": 0.4721, + "step": 70 + }, + { + "epoch": 0.016471407029346942, + "grad_norm": 29.42534723385049, + "learning_rate": 1.908986823636446e-06, + "loss": 0.5184, + "step": 71 + }, + { + "epoch": 0.016703398677647606, + "grad_norm": 16.62919699797701, + "learning_rate": 1.9152503837470172e-06, + "loss": 0.4352, + "step": 72 + }, + { + "epoch": 0.016935390325948266, + "grad_norm": 19.78067075131713, + "learning_rate": 1.921427547166354e-06, + "loss": 0.4264, + "step": 73 + }, + { + "epoch": 0.017167381974248927, + "grad_norm": 26.136176618497988, + "learning_rate": 1.927520664935144e-06, + "loss": 0.4323, + "step": 74 + }, + { + "epoch": 0.017399373622549587, + "grad_norm": 20.42370322133567, + "learning_rate": 1.9335319934154905e-06, + "loss": 0.4796, + "step": 75 + }, + { + "epoch": 0.017631365270850248, + "grad_norm": 16.995153150338616, + "learning_rate": 1.939463699307376e-06, + "loss": 0.3959, + "step": 76 + }, + { + "epoch": 0.017863356919150912, + "grad_norm": 16.339592308878576, + "learning_rate": 1.945317864337216e-06, + "loss": 0.3985, + "step": 77 + }, + { + "epoch": 0.018095348567451573, + "grad_norm": 19.2386041009152, + "learning_rate": 1.951096489643907e-06, + "loss": 0.4068, + "step": 78 + }, + { + "epoch": 0.018327340215752233, + "grad_norm": 23.57524018997069, + "learning_rate": 1.9568014998854824e-06, + "loss": 0.3625, + "step": 79 + }, + { + "epoch": 0.018559331864052894, + "grad_norm": 32.00958631342296, + "learning_rate": 1.9624347470874646e-06, + "loss": 0.4652, + "step": 80 + }, + { + "epoch": 0.018791323512353554, + "grad_norm": 29.277000871097183, + "learning_rate": 1.967998014252173e-06, + "loss": 0.3404, + "step": 81 + }, + { + "epoch": 0.019023315160654218, + "grad_norm": 26.679579757688188, + "learning_rate": 1.973493018746583e-06, + "loss": 0.5614, + "step": 82 + }, + { + "epoch": 0.01925530680895488, + "grad_norm": 34.03519083482874, + "learning_rate": 1.9789214154848464e-06, + "loss": 0.4527, + "step": 83 + }, + { + "epoch": 0.01948729845725554, + "grad_norm": 9.307613839376076, + "learning_rate": 1.984284799920227e-06, + "loss": 0.3238, + "step": 84 + }, + { + "epoch": 0.0197192901055562, + "grad_norm": 28.17577070445086, + "learning_rate": 1.989584710859984e-06, + "loss": 0.584, + "step": 85 + }, + { + "epoch": 0.01995128175385686, + "grad_norm": 32.013482960486684, + "learning_rate": 1.994822633115632e-06, + "loss": 0.4381, + "step": 86 + }, + { + "epoch": 0.02018327340215752, + "grad_norm": 26.285276118944726, + "learning_rate": 2e-06, + "loss": 0.3722, + "step": 87 + }, + { + "epoch": 0.020415265050458185, + "grad_norm": 37.319181949930496, + "learning_rate": 2e-06, + "loss": 0.4557, + "step": 88 + }, + { + "epoch": 0.020647256698758845, + "grad_norm": 22.094760938419988, + "learning_rate": 2e-06, + "loss": 0.3266, + "step": 89 + }, + { + "epoch": 0.020879248347059506, + "grad_norm": 25.245767362643576, + "learning_rate": 2e-06, + "loss": 0.3231, + "step": 90 + }, + { + "epoch": 0.021111239995360166, + "grad_norm": 21.175594320556343, + "learning_rate": 2e-06, + "loss": 0.4793, + "step": 91 + }, + { + "epoch": 0.021343231643660827, + "grad_norm": 34.0650426085687, + "learning_rate": 2e-06, + "loss": 0.3412, + "step": 92 + }, + { + "epoch": 0.02157522329196149, + "grad_norm": 18.241508100008726, + "learning_rate": 2e-06, + "loss": 0.4224, + "step": 93 + }, + { + "epoch": 0.02180721494026215, + "grad_norm": 60.461910401186756, + "learning_rate": 2e-06, + "loss": 0.3686, + "step": 94 + }, + { + "epoch": 0.022039206588562812, + "grad_norm": 30.838743569237803, + "learning_rate": 2e-06, + "loss": 0.4954, + "step": 95 + }, + { + "epoch": 0.022271198236863472, + "grad_norm": 18.870087576100286, + "learning_rate": 2e-06, + "loss": 0.4342, + "step": 96 + }, + { + "epoch": 0.022503189885164133, + "grad_norm": 17.830426935702803, + "learning_rate": 2e-06, + "loss": 0.3998, + "step": 97 + }, + { + "epoch": 0.022735181533464797, + "grad_norm": 22.72931222482498, + "learning_rate": 2e-06, + "loss": 0.4001, + "step": 98 + }, + { + "epoch": 0.022967173181765457, + "grad_norm": 28.15915823308931, + "learning_rate": 2e-06, + "loss": 0.3818, + "step": 99 + }, + { + "epoch": 0.023199164830066118, + "grad_norm": 19.509920544774893, + "learning_rate": 2e-06, + "loss": 0.3897, + "step": 100 + }, + { + "epoch": 0.02343115647836678, + "grad_norm": 22.200505904185203, + "learning_rate": 2e-06, + "loss": 0.3631, + "step": 101 + }, + { + "epoch": 0.02366314812666744, + "grad_norm": 46.8337504969018, + "learning_rate": 2e-06, + "loss": 0.4703, + "step": 102 + }, + { + "epoch": 0.023895139774968103, + "grad_norm": 30.52389628493483, + "learning_rate": 2e-06, + "loss": 0.4831, + "step": 103 + }, + { + "epoch": 0.024127131423268763, + "grad_norm": 33.49480618399563, + "learning_rate": 2e-06, + "loss": 0.4041, + "step": 104 + }, + { + "epoch": 0.024359123071569424, + "grad_norm": 50.81297884729123, + "learning_rate": 2e-06, + "loss": 0.6375, + "step": 105 + }, + { + "epoch": 0.024591114719870084, + "grad_norm": 17.464920058894066, + "learning_rate": 2e-06, + "loss": 0.3552, + "step": 106 + }, + { + "epoch": 0.024823106368170745, + "grad_norm": 24.45022223936699, + "learning_rate": 2e-06, + "loss": 0.4592, + "step": 107 + }, + { + "epoch": 0.025055098016471405, + "grad_norm": 19.966546967508155, + "learning_rate": 2e-06, + "loss": 0.4533, + "step": 108 + }, + { + "epoch": 0.02528708966477207, + "grad_norm": 28.450205806954816, + "learning_rate": 2e-06, + "loss": 0.4727, + "step": 109 + }, + { + "epoch": 0.02551908131307273, + "grad_norm": 41.13916073966549, + "learning_rate": 2e-06, + "loss": 0.497, + "step": 110 + }, + { + "epoch": 0.02575107296137339, + "grad_norm": 16.112754647057017, + "learning_rate": 2e-06, + "loss": 0.3844, + "step": 111 + }, + { + "epoch": 0.02598306460967405, + "grad_norm": 28.162277990599502, + "learning_rate": 2e-06, + "loss": 0.4953, + "step": 112 + }, + { + "epoch": 0.02621505625797471, + "grad_norm": 23.758505464030357, + "learning_rate": 2e-06, + "loss": 0.3308, + "step": 113 + }, + { + "epoch": 0.026447047906275375, + "grad_norm": 32.06452268639564, + "learning_rate": 2e-06, + "loss": 0.4046, + "step": 114 + }, + { + "epoch": 0.026679039554576036, + "grad_norm": 17.30981532939412, + "learning_rate": 2e-06, + "loss": 0.3985, + "step": 115 + }, + { + "epoch": 0.026911031202876697, + "grad_norm": 33.43471126599173, + "learning_rate": 2e-06, + "loss": 0.4519, + "step": 116 + }, + { + "epoch": 0.027143022851177357, + "grad_norm": 27.51227040876881, + "learning_rate": 2e-06, + "loss": 0.4136, + "step": 117 + }, + { + "epoch": 0.027375014499478018, + "grad_norm": 23.974504782175565, + "learning_rate": 2e-06, + "loss": 0.477, + "step": 118 + }, + { + "epoch": 0.02760700614777868, + "grad_norm": 30.741717997138643, + "learning_rate": 2e-06, + "loss": 0.4243, + "step": 119 + }, + { + "epoch": 0.027838997796079342, + "grad_norm": 11.972532620974395, + "learning_rate": 2e-06, + "loss": 0.2537, + "step": 120 + }, + { + "epoch": 0.028070989444380003, + "grad_norm": 11.808199219768877, + "learning_rate": 2e-06, + "loss": 0.3055, + "step": 121 + }, + { + "epoch": 0.028302981092680663, + "grad_norm": 41.16658965347815, + "learning_rate": 2e-06, + "loss": 0.446, + "step": 122 + }, + { + "epoch": 0.028534972740981324, + "grad_norm": 36.30280128025487, + "learning_rate": 2e-06, + "loss": 0.3894, + "step": 123 + }, + { + "epoch": 0.028766964389281984, + "grad_norm": 38.95823329601204, + "learning_rate": 2e-06, + "loss": 0.4648, + "step": 124 + }, + { + "epoch": 0.028998956037582648, + "grad_norm": 22.228812125565018, + "learning_rate": 2e-06, + "loss": 0.4194, + "step": 125 + }, + { + "epoch": 0.02923094768588331, + "grad_norm": 15.979110407781146, + "learning_rate": 2e-06, + "loss": 0.4041, + "step": 126 + }, + { + "epoch": 0.02946293933418397, + "grad_norm": 37.18105571848787, + "learning_rate": 2e-06, + "loss": 0.4889, + "step": 127 + }, + { + "epoch": 0.02969493098248463, + "grad_norm": 30.857636960509943, + "learning_rate": 2e-06, + "loss": 0.4519, + "step": 128 + }, + { + "epoch": 0.02992692263078529, + "grad_norm": 39.61185636984746, + "learning_rate": 2e-06, + "loss": 0.3358, + "step": 129 + }, + { + "epoch": 0.030158914279085954, + "grad_norm": 42.498140804000236, + "learning_rate": 2e-06, + "loss": 0.5417, + "step": 130 + }, + { + "epoch": 0.030390905927386615, + "grad_norm": 10.908948081927463, + "learning_rate": 2e-06, + "loss": 0.3374, + "step": 131 + }, + { + "epoch": 0.030622897575687275, + "grad_norm": 14.806615866444728, + "learning_rate": 2e-06, + "loss": 0.3544, + "step": 132 + }, + { + "epoch": 0.030854889223987936, + "grad_norm": 39.602376233592565, + "learning_rate": 2e-06, + "loss": 0.5983, + "step": 133 + }, + { + "epoch": 0.031086880872288596, + "grad_norm": 25.46432735816376, + "learning_rate": 2e-06, + "loss": 0.3938, + "step": 134 + }, + { + "epoch": 0.03131887252058926, + "grad_norm": 23.611133494637883, + "learning_rate": 2e-06, + "loss": 0.4045, + "step": 135 + }, + { + "epoch": 0.03155086416888992, + "grad_norm": 20.360702542186612, + "learning_rate": 2e-06, + "loss": 0.3803, + "step": 136 + }, + { + "epoch": 0.03178285581719058, + "grad_norm": 18.853473807665043, + "learning_rate": 2e-06, + "loss": 0.3924, + "step": 137 + }, + { + "epoch": 0.03201484746549124, + "grad_norm": 17.706059410717437, + "learning_rate": 2e-06, + "loss": 0.3616, + "step": 138 + }, + { + "epoch": 0.032246839113791906, + "grad_norm": 25.910709281503944, + "learning_rate": 2e-06, + "loss": 0.4097, + "step": 139 + }, + { + "epoch": 0.03247883076209256, + "grad_norm": 18.78408165355993, + "learning_rate": 2e-06, + "loss": 0.4219, + "step": 140 + }, + { + "epoch": 0.03271082241039323, + "grad_norm": 17.592827541949497, + "learning_rate": 2e-06, + "loss": 0.4167, + "step": 141 + }, + { + "epoch": 0.032942814058693884, + "grad_norm": 20.38480421675272, + "learning_rate": 2e-06, + "loss": 0.3778, + "step": 142 + }, + { + "epoch": 0.03317480570699455, + "grad_norm": 26.556214891507235, + "learning_rate": 2e-06, + "loss": 0.5647, + "step": 143 + }, + { + "epoch": 0.03340679735529521, + "grad_norm": 10.280063069710177, + "learning_rate": 2e-06, + "loss": 0.2938, + "step": 144 + }, + { + "epoch": 0.03363878900359587, + "grad_norm": 35.07937942331207, + "learning_rate": 2e-06, + "loss": 0.4143, + "step": 145 + }, + { + "epoch": 0.03387078065189653, + "grad_norm": 28.537691552524954, + "learning_rate": 2e-06, + "loss": 0.4831, + "step": 146 + }, + { + "epoch": 0.03410277230019719, + "grad_norm": 16.60729525087867, + "learning_rate": 2e-06, + "loss": 0.4095, + "step": 147 + }, + { + "epoch": 0.034334763948497854, + "grad_norm": 12.328955527513653, + "learning_rate": 2e-06, + "loss": 0.3769, + "step": 148 + }, + { + "epoch": 0.03456675559679852, + "grad_norm": 18.092020051851453, + "learning_rate": 2e-06, + "loss": 0.3555, + "step": 149 + }, + { + "epoch": 0.034798747245099175, + "grad_norm": 44.18114865674143, + "learning_rate": 2e-06, + "loss": 0.5052, + "step": 150 + }, + { + "epoch": 0.03503073889339984, + "grad_norm": 16.552193439071686, + "learning_rate": 2e-06, + "loss": 0.3389, + "step": 151 + }, + { + "epoch": 0.035262730541700496, + "grad_norm": 28.697477777252256, + "learning_rate": 2e-06, + "loss": 0.3826, + "step": 152 + }, + { + "epoch": 0.03549472219000116, + "grad_norm": 9.627827612017324, + "learning_rate": 2e-06, + "loss": 0.3534, + "step": 153 + }, + { + "epoch": 0.035726713838301824, + "grad_norm": 18.495594499790304, + "learning_rate": 2e-06, + "loss": 0.2883, + "step": 154 + }, + { + "epoch": 0.03595870548660248, + "grad_norm": 14.819731470172004, + "learning_rate": 2e-06, + "loss": 0.3543, + "step": 155 + }, + { + "epoch": 0.036190697134903145, + "grad_norm": 30.084238573946536, + "learning_rate": 2e-06, + "loss": 0.3936, + "step": 156 + }, + { + "epoch": 0.0364226887832038, + "grad_norm": 38.32707194245598, + "learning_rate": 2e-06, + "loss": 0.5518, + "step": 157 + }, + { + "epoch": 0.036654680431504466, + "grad_norm": 30.99662633417196, + "learning_rate": 2e-06, + "loss": 0.3367, + "step": 158 + }, + { + "epoch": 0.03688667207980513, + "grad_norm": 29.788479638237202, + "learning_rate": 2e-06, + "loss": 0.44, + "step": 159 + }, + { + "epoch": 0.03711866372810579, + "grad_norm": 17.494933531167206, + "learning_rate": 2e-06, + "loss": 0.3427, + "step": 160 + }, + { + "epoch": 0.03735065537640645, + "grad_norm": 33.72980029025054, + "learning_rate": 2e-06, + "loss": 0.3467, + "step": 161 + }, + { + "epoch": 0.03758264702470711, + "grad_norm": 13.379171231265387, + "learning_rate": 2e-06, + "loss": 0.3747, + "step": 162 + }, + { + "epoch": 0.03781463867300777, + "grad_norm": 20.70832977530479, + "learning_rate": 2e-06, + "loss": 0.3986, + "step": 163 + }, + { + "epoch": 0.038046630321308436, + "grad_norm": 28.74074583084915, + "learning_rate": 2e-06, + "loss": 0.3806, + "step": 164 + }, + { + "epoch": 0.03827862196960909, + "grad_norm": 52.2052448308457, + "learning_rate": 2e-06, + "loss": 0.4445, + "step": 165 + }, + { + "epoch": 0.03851061361790976, + "grad_norm": 31.048472582638617, + "learning_rate": 2e-06, + "loss": 0.4167, + "step": 166 + }, + { + "epoch": 0.038742605266210414, + "grad_norm": 21.227802893594617, + "learning_rate": 2e-06, + "loss": 0.4363, + "step": 167 + }, + { + "epoch": 0.03897459691451108, + "grad_norm": 22.483410466187955, + "learning_rate": 2e-06, + "loss": 0.3964, + "step": 168 + }, + { + "epoch": 0.03920658856281174, + "grad_norm": 15.724111111556514, + "learning_rate": 2e-06, + "loss": 0.3786, + "step": 169 + }, + { + "epoch": 0.0394385802111124, + "grad_norm": 17.28514355448206, + "learning_rate": 2e-06, + "loss": 0.3596, + "step": 170 + }, + { + "epoch": 0.03967057185941306, + "grad_norm": 13.689374632434935, + "learning_rate": 2e-06, + "loss": 0.283, + "step": 171 + }, + { + "epoch": 0.03990256350771372, + "grad_norm": 14.577661385441322, + "learning_rate": 2e-06, + "loss": 0.3097, + "step": 172 + }, + { + "epoch": 0.040134555156014384, + "grad_norm": 24.861537338293093, + "learning_rate": 2e-06, + "loss": 0.4145, + "step": 173 + }, + { + "epoch": 0.04036654680431504, + "grad_norm": 33.91806599011101, + "learning_rate": 2e-06, + "loss": 0.4137, + "step": 174 + }, + { + "epoch": 0.040598538452615705, + "grad_norm": 37.330205558404835, + "learning_rate": 2e-06, + "loss": 0.4001, + "step": 175 + }, + { + "epoch": 0.04083053010091637, + "grad_norm": 10.75490224381776, + "learning_rate": 2e-06, + "loss": 0.3346, + "step": 176 + }, + { + "epoch": 0.041062521749217026, + "grad_norm": 17.920632030971035, + "learning_rate": 2e-06, + "loss": 0.3957, + "step": 177 + }, + { + "epoch": 0.04129451339751769, + "grad_norm": 35.06077517129045, + "learning_rate": 2e-06, + "loss": 0.5442, + "step": 178 + }, + { + "epoch": 0.04152650504581835, + "grad_norm": 37.373576118845364, + "learning_rate": 2e-06, + "loss": 0.3909, + "step": 179 + }, + { + "epoch": 0.04175849669411901, + "grad_norm": 21.774127565938404, + "learning_rate": 2e-06, + "loss": 0.3612, + "step": 180 + }, + { + "epoch": 0.041990488342419675, + "grad_norm": 22.310572412193864, + "learning_rate": 2e-06, + "loss": 0.3541, + "step": 181 + }, + { + "epoch": 0.04222247999072033, + "grad_norm": 19.91238122930712, + "learning_rate": 2e-06, + "loss": 0.3518, + "step": 182 + }, + { + "epoch": 0.042454471639020996, + "grad_norm": 38.82299720583173, + "learning_rate": 2e-06, + "loss": 0.4455, + "step": 183 + }, + { + "epoch": 0.04268646328732165, + "grad_norm": 81.78612014143567, + "learning_rate": 2e-06, + "loss": 0.4527, + "step": 184 + }, + { + "epoch": 0.04291845493562232, + "grad_norm": 38.671646262718454, + "learning_rate": 2e-06, + "loss": 0.4736, + "step": 185 + }, + { + "epoch": 0.04315044658392298, + "grad_norm": 28.292078981601982, + "learning_rate": 2e-06, + "loss": 0.3465, + "step": 186 + }, + { + "epoch": 0.04338243823222364, + "grad_norm": 28.201025845524587, + "learning_rate": 2e-06, + "loss": 0.3274, + "step": 187 + }, + { + "epoch": 0.0436144298805243, + "grad_norm": 16.502642381982742, + "learning_rate": 2e-06, + "loss": 0.3552, + "step": 188 + }, + { + "epoch": 0.04384642152882496, + "grad_norm": 21.502109491213528, + "learning_rate": 2e-06, + "loss": 0.4031, + "step": 189 + }, + { + "epoch": 0.044078413177125624, + "grad_norm": 45.395445806829784, + "learning_rate": 2e-06, + "loss": 0.3891, + "step": 190 + }, + { + "epoch": 0.04431040482542629, + "grad_norm": 16.772245712643286, + "learning_rate": 2e-06, + "loss": 0.3811, + "step": 191 + }, + { + "epoch": 0.044542396473726945, + "grad_norm": 16.484095337228272, + "learning_rate": 2e-06, + "loss": 0.3893, + "step": 192 + }, + { + "epoch": 0.04477438812202761, + "grad_norm": 19.347435512156647, + "learning_rate": 2e-06, + "loss": 0.3649, + "step": 193 + }, + { + "epoch": 0.045006379770328266, + "grad_norm": 24.383625612454, + "learning_rate": 2e-06, + "loss": 0.3976, + "step": 194 + }, + { + "epoch": 0.04523837141862893, + "grad_norm": 16.780375260685737, + "learning_rate": 2e-06, + "loss": 0.2944, + "step": 195 + }, + { + "epoch": 0.045470363066929594, + "grad_norm": 16.538828083347266, + "learning_rate": 2e-06, + "loss": 0.3967, + "step": 196 + }, + { + "epoch": 0.04570235471523025, + "grad_norm": 25.831467483102088, + "learning_rate": 2e-06, + "loss": 0.4292, + "step": 197 + }, + { + "epoch": 0.045934346363530915, + "grad_norm": 29.933106472676695, + "learning_rate": 2e-06, + "loss": 0.3907, + "step": 198 + }, + { + "epoch": 0.04616633801183157, + "grad_norm": 34.722557394383415, + "learning_rate": 2e-06, + "loss": 0.4854, + "step": 199 + }, + { + "epoch": 0.046398329660132236, + "grad_norm": 23.307072911383692, + "learning_rate": 2e-06, + "loss": 0.3485, + "step": 200 + }, + { + "epoch": 0.0466303213084329, + "grad_norm": 44.83081493918162, + "learning_rate": 2e-06, + "loss": 0.3677, + "step": 201 + }, + { + "epoch": 0.04686231295673356, + "grad_norm": 17.456496077989364, + "learning_rate": 2e-06, + "loss": 0.2696, + "step": 202 + }, + { + "epoch": 0.04709430460503422, + "grad_norm": 47.09280879290812, + "learning_rate": 2e-06, + "loss": 0.4629, + "step": 203 + }, + { + "epoch": 0.04732629625333488, + "grad_norm": 28.03807838243593, + "learning_rate": 2e-06, + "loss": 0.4508, + "step": 204 + }, + { + "epoch": 0.04755828790163554, + "grad_norm": 22.95656727807171, + "learning_rate": 2e-06, + "loss": 0.5087, + "step": 205 + }, + { + "epoch": 0.047790279549936206, + "grad_norm": 19.916633250635453, + "learning_rate": 2e-06, + "loss": 0.3502, + "step": 206 + }, + { + "epoch": 0.04802227119823686, + "grad_norm": 18.429230451297414, + "learning_rate": 2e-06, + "loss": 0.3992, + "step": 207 + }, + { + "epoch": 0.04825426284653753, + "grad_norm": 19.146947435265297, + "learning_rate": 2e-06, + "loss": 0.37, + "step": 208 + }, + { + "epoch": 0.048486254494838184, + "grad_norm": 19.635638653510593, + "learning_rate": 2e-06, + "loss": 0.4428, + "step": 209 + }, + { + "epoch": 0.04871824614313885, + "grad_norm": 16.65678239968762, + "learning_rate": 2e-06, + "loss": 0.3539, + "step": 210 + }, + { + "epoch": 0.048950237791439505, + "grad_norm": 12.811312162663347, + "learning_rate": 2e-06, + "loss": 0.3111, + "step": 211 + }, + { + "epoch": 0.04918222943974017, + "grad_norm": 26.544754694919302, + "learning_rate": 2e-06, + "loss": 0.3996, + "step": 212 + }, + { + "epoch": 0.04941422108804083, + "grad_norm": 27.038312574981394, + "learning_rate": 2e-06, + "loss": 0.4816, + "step": 213 + }, + { + "epoch": 0.04964621273634149, + "grad_norm": 19.110034604846202, + "learning_rate": 2e-06, + "loss": 0.3396, + "step": 214 + }, + { + "epoch": 0.049878204384642154, + "grad_norm": 17.23770447640579, + "learning_rate": 2e-06, + "loss": 0.4118, + "step": 215 + }, + { + "epoch": 0.05011019603294281, + "grad_norm": 25.7082676561769, + "learning_rate": 2e-06, + "loss": 0.3486, + "step": 216 + }, + { + "epoch": 0.050342187681243475, + "grad_norm": 29.31762312050998, + "learning_rate": 2e-06, + "loss": 0.4206, + "step": 217 + }, + { + "epoch": 0.05057417932954414, + "grad_norm": 28.316750519747554, + "learning_rate": 2e-06, + "loss": 0.3658, + "step": 218 + }, + { + "epoch": 0.050806170977844796, + "grad_norm": 18.295835928970263, + "learning_rate": 2e-06, + "loss": 0.3191, + "step": 219 + }, + { + "epoch": 0.05103816262614546, + "grad_norm": 22.15390958004895, + "learning_rate": 2e-06, + "loss": 0.3842, + "step": 220 + }, + { + "epoch": 0.05127015427444612, + "grad_norm": 31.55727431986, + "learning_rate": 2e-06, + "loss": 0.4348, + "step": 221 + }, + { + "epoch": 0.05150214592274678, + "grad_norm": 18.142365032470167, + "learning_rate": 2e-06, + "loss": 0.3251, + "step": 222 + }, + { + "epoch": 0.051734137571047445, + "grad_norm": 14.63675771579077, + "learning_rate": 2e-06, + "loss": 0.3872, + "step": 223 + }, + { + "epoch": 0.0519661292193481, + "grad_norm": 15.783701599136487, + "learning_rate": 2e-06, + "loss": 0.4255, + "step": 224 + }, + { + "epoch": 0.052198120867648766, + "grad_norm": 21.07727010080486, + "learning_rate": 2e-06, + "loss": 0.3722, + "step": 225 + }, + { + "epoch": 0.05243011251594942, + "grad_norm": 19.40775575437732, + "learning_rate": 2e-06, + "loss": 0.3341, + "step": 226 + }, + { + "epoch": 0.05266210416425009, + "grad_norm": 27.236675635773192, + "learning_rate": 2e-06, + "loss": 0.2782, + "step": 227 + }, + { + "epoch": 0.05289409581255075, + "grad_norm": 24.8485428044272, + "learning_rate": 2e-06, + "loss": 0.4178, + "step": 228 + }, + { + "epoch": 0.05312608746085141, + "grad_norm": 25.44002982309577, + "learning_rate": 2e-06, + "loss": 0.4114, + "step": 229 + }, + { + "epoch": 0.05335807910915207, + "grad_norm": 32.39078707431755, + "learning_rate": 2e-06, + "loss": 0.3624, + "step": 230 + }, + { + "epoch": 0.05359007075745273, + "grad_norm": 20.898925225007876, + "learning_rate": 2e-06, + "loss": 0.3823, + "step": 231 + }, + { + "epoch": 0.05382206240575339, + "grad_norm": 14.481012835578381, + "learning_rate": 2e-06, + "loss": 0.2783, + "step": 232 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 33.345472766269374, + "learning_rate": 2e-06, + "loss": 0.4105, + "step": 233 + }, + { + "epoch": 0.054286045702354714, + "grad_norm": 27.313411652481147, + "learning_rate": 2e-06, + "loss": 0.3728, + "step": 234 + }, + { + "epoch": 0.05451803735065538, + "grad_norm": 57.409672982284185, + "learning_rate": 2e-06, + "loss": 0.2871, + "step": 235 + }, + { + "epoch": 0.054750028998956035, + "grad_norm": 16.72238049877021, + "learning_rate": 2e-06, + "loss": 0.3353, + "step": 236 + }, + { + "epoch": 0.0549820206472567, + "grad_norm": 29.3975251341519, + "learning_rate": 2e-06, + "loss": 0.4285, + "step": 237 + }, + { + "epoch": 0.05521401229555736, + "grad_norm": 16.557186007773907, + "learning_rate": 2e-06, + "loss": 0.3779, + "step": 238 + }, + { + "epoch": 0.05544600394385802, + "grad_norm": 14.576894842730137, + "learning_rate": 2e-06, + "loss": 0.2714, + "step": 239 + }, + { + "epoch": 0.055677995592158684, + "grad_norm": 33.24881434167466, + "learning_rate": 2e-06, + "loss": 0.4713, + "step": 240 + }, + { + "epoch": 0.05590998724045934, + "grad_norm": 30.214762606934197, + "learning_rate": 2e-06, + "loss": 0.3491, + "step": 241 + }, + { + "epoch": 0.056141978888760005, + "grad_norm": 24.468541329934542, + "learning_rate": 2e-06, + "loss": 0.3431, + "step": 242 + }, + { + "epoch": 0.05637397053706067, + "grad_norm": 16.08552961180912, + "learning_rate": 2e-06, + "loss": 0.4033, + "step": 243 + }, + { + "epoch": 0.056605962185361326, + "grad_norm": 14.354582913537627, + "learning_rate": 2e-06, + "loss": 0.3266, + "step": 244 + }, + { + "epoch": 0.05683795383366199, + "grad_norm": 19.620651208208127, + "learning_rate": 2e-06, + "loss": 0.3277, + "step": 245 + }, + { + "epoch": 0.05706994548196265, + "grad_norm": 15.691928454576992, + "learning_rate": 2e-06, + "loss": 0.3291, + "step": 246 + }, + { + "epoch": 0.05730193713026331, + "grad_norm": 17.42590199984151, + "learning_rate": 2e-06, + "loss": 0.3284, + "step": 247 + }, + { + "epoch": 0.05753392877856397, + "grad_norm": 11.225459850807889, + "learning_rate": 2e-06, + "loss": 0.2378, + "step": 248 + }, + { + "epoch": 0.05776592042686463, + "grad_norm": 23.956123325800576, + "learning_rate": 2e-06, + "loss": 0.4609, + "step": 249 + }, + { + "epoch": 0.057997912075165296, + "grad_norm": 30.195231126761364, + "learning_rate": 2e-06, + "loss": 0.3564, + "step": 250 + }, + { + "epoch": 0.05822990372346595, + "grad_norm": 16.841385249087196, + "learning_rate": 2e-06, + "loss": 0.4339, + "step": 251 + }, + { + "epoch": 0.05846189537176662, + "grad_norm": 18.80534936760663, + "learning_rate": 2e-06, + "loss": 0.3906, + "step": 252 + }, + { + "epoch": 0.058693887020067274, + "grad_norm": 21.267472826987586, + "learning_rate": 2e-06, + "loss": 0.4437, + "step": 253 + }, + { + "epoch": 0.05892587866836794, + "grad_norm": 16.367714475333194, + "learning_rate": 2e-06, + "loss": 0.3078, + "step": 254 + }, + { + "epoch": 0.0591578703166686, + "grad_norm": 16.808024362000612, + "learning_rate": 2e-06, + "loss": 0.4284, + "step": 255 + }, + { + "epoch": 0.05938986196496926, + "grad_norm": 14.47203839492185, + "learning_rate": 2e-06, + "loss": 0.3487, + "step": 256 + }, + { + "epoch": 0.05962185361326992, + "grad_norm": 14.422808168299367, + "learning_rate": 2e-06, + "loss": 0.3002, + "step": 257 + }, + { + "epoch": 0.05985384526157058, + "grad_norm": 17.241932663553175, + "learning_rate": 2e-06, + "loss": 0.3285, + "step": 258 + }, + { + "epoch": 0.060085836909871244, + "grad_norm": 10.136726922616239, + "learning_rate": 2e-06, + "loss": 0.2544, + "step": 259 + }, + { + "epoch": 0.06031782855817191, + "grad_norm": 31.901966098793842, + "learning_rate": 2e-06, + "loss": 0.3934, + "step": 260 + }, + { + "epoch": 0.060549820206472565, + "grad_norm": 10.203419621588298, + "learning_rate": 2e-06, + "loss": 0.2739, + "step": 261 + }, + { + "epoch": 0.06078181185477323, + "grad_norm": 30.516359662869256, + "learning_rate": 2e-06, + "loss": 0.4177, + "step": 262 + }, + { + "epoch": 0.061013803503073887, + "grad_norm": 23.490412449083408, + "learning_rate": 2e-06, + "loss": 0.3069, + "step": 263 + }, + { + "epoch": 0.06124579515137455, + "grad_norm": 26.962705108860312, + "learning_rate": 2e-06, + "loss": 0.488, + "step": 264 + }, + { + "epoch": 0.061477786799675214, + "grad_norm": 21.8606732946384, + "learning_rate": 2e-06, + "loss": 0.3353, + "step": 265 + }, + { + "epoch": 0.06170977844797587, + "grad_norm": 11.84435585441604, + "learning_rate": 2e-06, + "loss": 0.2634, + "step": 266 + }, + { + "epoch": 0.061941770096276536, + "grad_norm": 18.759386751316498, + "learning_rate": 2e-06, + "loss": 0.335, + "step": 267 + }, + { + "epoch": 0.06217376174457719, + "grad_norm": 10.860341407594653, + "learning_rate": 2e-06, + "loss": 0.269, + "step": 268 + }, + { + "epoch": 0.06240575339287786, + "grad_norm": 22.756283905059107, + "learning_rate": 2e-06, + "loss": 0.4431, + "step": 269 + }, + { + "epoch": 0.06263774504117851, + "grad_norm": 18.857691106467716, + "learning_rate": 2e-06, + "loss": 0.4064, + "step": 270 + }, + { + "epoch": 0.06286973668947918, + "grad_norm": 13.71142172349342, + "learning_rate": 2e-06, + "loss": 0.3452, + "step": 271 + }, + { + "epoch": 0.06310172833777984, + "grad_norm": 28.59068887733619, + "learning_rate": 2e-06, + "loss": 0.4787, + "step": 272 + }, + { + "epoch": 0.0633337199860805, + "grad_norm": 34.9471101130114, + "learning_rate": 2e-06, + "loss": 0.4529, + "step": 273 + }, + { + "epoch": 0.06356571163438116, + "grad_norm": 15.432357523010165, + "learning_rate": 2e-06, + "loss": 0.302, + "step": 274 + }, + { + "epoch": 0.06379770328268182, + "grad_norm": 21.176022226286484, + "learning_rate": 2e-06, + "loss": 0.3524, + "step": 275 + }, + { + "epoch": 0.06402969493098248, + "grad_norm": 20.398745690881963, + "learning_rate": 2e-06, + "loss": 0.3602, + "step": 276 + }, + { + "epoch": 0.06426168657928315, + "grad_norm": 45.61620524073223, + "learning_rate": 2e-06, + "loss": 0.4475, + "step": 277 + }, + { + "epoch": 0.06449367822758381, + "grad_norm": 11.778603437990974, + "learning_rate": 2e-06, + "loss": 0.2865, + "step": 278 + }, + { + "epoch": 0.06472566987588446, + "grad_norm": 16.05408430375991, + "learning_rate": 2e-06, + "loss": 0.3002, + "step": 279 + }, + { + "epoch": 0.06495766152418513, + "grad_norm": 24.413991600204813, + "learning_rate": 2e-06, + "loss": 0.4776, + "step": 280 + }, + { + "epoch": 0.06518965317248579, + "grad_norm": 20.658726900866288, + "learning_rate": 2e-06, + "loss": 0.4699, + "step": 281 + }, + { + "epoch": 0.06542164482078645, + "grad_norm": 20.214349439391693, + "learning_rate": 2e-06, + "loss": 0.3172, + "step": 282 + }, + { + "epoch": 0.06565363646908712, + "grad_norm": 18.253321100048833, + "learning_rate": 2e-06, + "loss": 0.3053, + "step": 283 + }, + { + "epoch": 0.06588562811738777, + "grad_norm": 25.27398761329044, + "learning_rate": 2e-06, + "loss": 0.4358, + "step": 284 + }, + { + "epoch": 0.06611761976568843, + "grad_norm": 23.617098432941468, + "learning_rate": 2e-06, + "loss": 0.3992, + "step": 285 + }, + { + "epoch": 0.0663496114139891, + "grad_norm": 8.904262369286501, + "learning_rate": 2e-06, + "loss": 0.2789, + "step": 286 + }, + { + "epoch": 0.06658160306228976, + "grad_norm": 24.24410486299016, + "learning_rate": 2e-06, + "loss": 0.2762, + "step": 287 + }, + { + "epoch": 0.06681359471059042, + "grad_norm": 18.83345601805634, + "learning_rate": 2e-06, + "loss": 0.3888, + "step": 288 + }, + { + "epoch": 0.06704558635889107, + "grad_norm": 14.660856467155357, + "learning_rate": 2e-06, + "loss": 0.2764, + "step": 289 + }, + { + "epoch": 0.06727757800719174, + "grad_norm": 33.45569066237724, + "learning_rate": 2e-06, + "loss": 0.4414, + "step": 290 + }, + { + "epoch": 0.0675095696554924, + "grad_norm": 21.82220797728714, + "learning_rate": 2e-06, + "loss": 0.3749, + "step": 291 + }, + { + "epoch": 0.06774156130379307, + "grad_norm": 26.735744320549216, + "learning_rate": 2e-06, + "loss": 0.3758, + "step": 292 + }, + { + "epoch": 0.06797355295209373, + "grad_norm": 19.018016510002802, + "learning_rate": 2e-06, + "loss": 0.3474, + "step": 293 + }, + { + "epoch": 0.06820554460039438, + "grad_norm": 20.773279197240708, + "learning_rate": 2e-06, + "loss": 0.292, + "step": 294 + }, + { + "epoch": 0.06843753624869504, + "grad_norm": 21.652882354360884, + "learning_rate": 2e-06, + "loss": 0.4562, + "step": 295 + }, + { + "epoch": 0.06866952789699571, + "grad_norm": 23.18839543070114, + "learning_rate": 2e-06, + "loss": 0.4007, + "step": 296 + }, + { + "epoch": 0.06890151954529637, + "grad_norm": 20.933515687339632, + "learning_rate": 2e-06, + "loss": 0.3702, + "step": 297 + }, + { + "epoch": 0.06913351119359704, + "grad_norm": 17.835747475594196, + "learning_rate": 2e-06, + "loss": 0.2801, + "step": 298 + }, + { + "epoch": 0.06936550284189769, + "grad_norm": 17.448326380657978, + "learning_rate": 2e-06, + "loss": 0.3179, + "step": 299 + }, + { + "epoch": 0.06959749449019835, + "grad_norm": 26.014972961257733, + "learning_rate": 2e-06, + "loss": 0.3853, + "step": 300 + }, + { + "epoch": 0.06982948613849901, + "grad_norm": 25.590774728334065, + "learning_rate": 2e-06, + "loss": 0.3799, + "step": 301 + }, + { + "epoch": 0.07006147778679968, + "grad_norm": 27.64512941308947, + "learning_rate": 2e-06, + "loss": 0.4052, + "step": 302 + }, + { + "epoch": 0.07029346943510034, + "grad_norm": 30.313882980750208, + "learning_rate": 2e-06, + "loss": 0.4532, + "step": 303 + }, + { + "epoch": 0.07052546108340099, + "grad_norm": 6.010401569300229, + "learning_rate": 2e-06, + "loss": 0.265, + "step": 304 + }, + { + "epoch": 0.07075745273170166, + "grad_norm": 13.149785684164643, + "learning_rate": 2e-06, + "loss": 0.3219, + "step": 305 + }, + { + "epoch": 0.07098944438000232, + "grad_norm": 11.675290474453474, + "learning_rate": 2e-06, + "loss": 0.2681, + "step": 306 + }, + { + "epoch": 0.07122143602830298, + "grad_norm": 23.30421897740761, + "learning_rate": 2e-06, + "loss": 0.4489, + "step": 307 + }, + { + "epoch": 0.07145342767660365, + "grad_norm": 17.335889262546875, + "learning_rate": 2e-06, + "loss": 0.3924, + "step": 308 + }, + { + "epoch": 0.0716854193249043, + "grad_norm": 19.63995512405009, + "learning_rate": 2e-06, + "loss": 0.3371, + "step": 309 + }, + { + "epoch": 0.07191741097320496, + "grad_norm": 24.18936339860376, + "learning_rate": 2e-06, + "loss": 0.5017, + "step": 310 + }, + { + "epoch": 0.07214940262150563, + "grad_norm": 19.796051503412293, + "learning_rate": 2e-06, + "loss": 0.3462, + "step": 311 + }, + { + "epoch": 0.07238139426980629, + "grad_norm": 24.245217740356853, + "learning_rate": 2e-06, + "loss": 0.3896, + "step": 312 + }, + { + "epoch": 0.07261338591810695, + "grad_norm": 12.613025856749827, + "learning_rate": 2e-06, + "loss": 0.3432, + "step": 313 + }, + { + "epoch": 0.0728453775664076, + "grad_norm": 13.186064582017874, + "learning_rate": 2e-06, + "loss": 0.3445, + "step": 314 + }, + { + "epoch": 0.07307736921470827, + "grad_norm": 19.879322945226193, + "learning_rate": 2e-06, + "loss": 0.3519, + "step": 315 + }, + { + "epoch": 0.07330936086300893, + "grad_norm": 15.234452095839453, + "learning_rate": 2e-06, + "loss": 0.3882, + "step": 316 + }, + { + "epoch": 0.0735413525113096, + "grad_norm": 19.154427840014012, + "learning_rate": 2e-06, + "loss": 0.4501, + "step": 317 + }, + { + "epoch": 0.07377334415961026, + "grad_norm": 31.599294049661268, + "learning_rate": 2e-06, + "loss": 0.4159, + "step": 318 + }, + { + "epoch": 0.07400533580791091, + "grad_norm": 12.57362027981612, + "learning_rate": 2e-06, + "loss": 0.3952, + "step": 319 + }, + { + "epoch": 0.07423732745621157, + "grad_norm": 16.12107163904157, + "learning_rate": 2e-06, + "loss": 0.3671, + "step": 320 + }, + { + "epoch": 0.07446931910451224, + "grad_norm": 18.572850404526363, + "learning_rate": 2e-06, + "loss": 0.3022, + "step": 321 + }, + { + "epoch": 0.0747013107528129, + "grad_norm": 34.59231978461037, + "learning_rate": 2e-06, + "loss": 0.4849, + "step": 322 + }, + { + "epoch": 0.07493330240111357, + "grad_norm": 13.285785318659888, + "learning_rate": 2e-06, + "loss": 0.2325, + "step": 323 + }, + { + "epoch": 0.07516529404941422, + "grad_norm": 15.365050538063457, + "learning_rate": 2e-06, + "loss": 0.3808, + "step": 324 + }, + { + "epoch": 0.07539728569771488, + "grad_norm": 19.091634443388465, + "learning_rate": 2e-06, + "loss": 0.3394, + "step": 325 + }, + { + "epoch": 0.07562927734601554, + "grad_norm": 18.766369082786838, + "learning_rate": 2e-06, + "loss": 0.4084, + "step": 326 + }, + { + "epoch": 0.07586126899431621, + "grad_norm": 22.93179092378503, + "learning_rate": 2e-06, + "loss": 0.3995, + "step": 327 + }, + { + "epoch": 0.07609326064261687, + "grad_norm": 26.769794964089836, + "learning_rate": 2e-06, + "loss": 0.3826, + "step": 328 + }, + { + "epoch": 0.07632525229091752, + "grad_norm": 17.51489111550706, + "learning_rate": 2e-06, + "loss": 0.416, + "step": 329 + }, + { + "epoch": 0.07655724393921819, + "grad_norm": 16.635578775167737, + "learning_rate": 2e-06, + "loss": 0.4193, + "step": 330 + }, + { + "epoch": 0.07678923558751885, + "grad_norm": 22.91519385563069, + "learning_rate": 2e-06, + "loss": 0.4024, + "step": 331 + }, + { + "epoch": 0.07702122723581951, + "grad_norm": 24.359791173757518, + "learning_rate": 2e-06, + "loss": 0.418, + "step": 332 + }, + { + "epoch": 0.07725321888412018, + "grad_norm": 17.329821601312467, + "learning_rate": 2e-06, + "loss": 0.3709, + "step": 333 + }, + { + "epoch": 0.07748521053242083, + "grad_norm": 27.673581294656348, + "learning_rate": 2e-06, + "loss": 0.5115, + "step": 334 + }, + { + "epoch": 0.07771720218072149, + "grad_norm": 27.328269842064092, + "learning_rate": 2e-06, + "loss": 0.4333, + "step": 335 + }, + { + "epoch": 0.07794919382902216, + "grad_norm": 17.986657057309625, + "learning_rate": 2e-06, + "loss": 0.3317, + "step": 336 + }, + { + "epoch": 0.07818118547732282, + "grad_norm": 15.710760855716062, + "learning_rate": 2e-06, + "loss": 0.3436, + "step": 337 + }, + { + "epoch": 0.07841317712562348, + "grad_norm": 28.54747304199656, + "learning_rate": 2e-06, + "loss": 0.3698, + "step": 338 + }, + { + "epoch": 0.07864516877392413, + "grad_norm": 28.406446425076776, + "learning_rate": 2e-06, + "loss": 0.4491, + "step": 339 + }, + { + "epoch": 0.0788771604222248, + "grad_norm": 13.00998113821747, + "learning_rate": 2e-06, + "loss": 0.302, + "step": 340 + }, + { + "epoch": 0.07910915207052546, + "grad_norm": 24.394710444728887, + "learning_rate": 2e-06, + "loss": 0.3499, + "step": 341 + }, + { + "epoch": 0.07934114371882613, + "grad_norm": 16.848986324981123, + "learning_rate": 2e-06, + "loss": 0.3152, + "step": 342 + }, + { + "epoch": 0.07957313536712678, + "grad_norm": 31.12501067169636, + "learning_rate": 2e-06, + "loss": 0.4136, + "step": 343 + }, + { + "epoch": 0.07980512701542744, + "grad_norm": 33.14948891851159, + "learning_rate": 2e-06, + "loss": 0.4752, + "step": 344 + }, + { + "epoch": 0.0800371186637281, + "grad_norm": 27.33198419147624, + "learning_rate": 2e-06, + "loss": 0.4415, + "step": 345 + }, + { + "epoch": 0.08026911031202877, + "grad_norm": 23.586083476624456, + "learning_rate": 2e-06, + "loss": 0.3476, + "step": 346 + }, + { + "epoch": 0.08050110196032943, + "grad_norm": 20.590998680061357, + "learning_rate": 2e-06, + "loss": 0.2976, + "step": 347 + }, + { + "epoch": 0.08073309360863008, + "grad_norm": 24.375154903581354, + "learning_rate": 2e-06, + "loss": 0.4577, + "step": 348 + }, + { + "epoch": 0.08096508525693075, + "grad_norm": 28.988118815723574, + "learning_rate": 2e-06, + "loss": 0.4254, + "step": 349 + }, + { + "epoch": 0.08119707690523141, + "grad_norm": 16.422024111545927, + "learning_rate": 2e-06, + "loss": 0.3159, + "step": 350 + }, + { + "epoch": 0.08142906855353207, + "grad_norm": 28.466863498914023, + "learning_rate": 2e-06, + "loss": 0.4772, + "step": 351 + }, + { + "epoch": 0.08166106020183274, + "grad_norm": 24.154986228673167, + "learning_rate": 2e-06, + "loss": 0.3228, + "step": 352 + }, + { + "epoch": 0.08189305185013339, + "grad_norm": 10.822885849051195, + "learning_rate": 2e-06, + "loss": 0.3191, + "step": 353 + }, + { + "epoch": 0.08212504349843405, + "grad_norm": 12.170005974274169, + "learning_rate": 2e-06, + "loss": 0.3375, + "step": 354 + }, + { + "epoch": 0.08235703514673472, + "grad_norm": 16.35142729634425, + "learning_rate": 2e-06, + "loss": 0.3974, + "step": 355 + }, + { + "epoch": 0.08258902679503538, + "grad_norm": 20.3269200243023, + "learning_rate": 2e-06, + "loss": 0.3734, + "step": 356 + }, + { + "epoch": 0.08282101844333604, + "grad_norm": 18.34613766392723, + "learning_rate": 2e-06, + "loss": 0.464, + "step": 357 + }, + { + "epoch": 0.0830530100916367, + "grad_norm": 18.712920482784153, + "learning_rate": 2e-06, + "loss": 0.3485, + "step": 358 + }, + { + "epoch": 0.08328500173993736, + "grad_norm": 14.590189583694684, + "learning_rate": 2e-06, + "loss": 0.4094, + "step": 359 + }, + { + "epoch": 0.08351699338823802, + "grad_norm": 16.316678168602866, + "learning_rate": 2e-06, + "loss": 0.3666, + "step": 360 + }, + { + "epoch": 0.08374898503653869, + "grad_norm": 21.600786547305564, + "learning_rate": 2e-06, + "loss": 0.3957, + "step": 361 + }, + { + "epoch": 0.08398097668483935, + "grad_norm": 6.130256185688987, + "learning_rate": 2e-06, + "loss": 0.2955, + "step": 362 + }, + { + "epoch": 0.08421296833314, + "grad_norm": 15.583176633847687, + "learning_rate": 2e-06, + "loss": 0.3424, + "step": 363 + }, + { + "epoch": 0.08444495998144066, + "grad_norm": 10.731580878972554, + "learning_rate": 2e-06, + "loss": 0.3268, + "step": 364 + }, + { + "epoch": 0.08467695162974133, + "grad_norm": 16.475523855583248, + "learning_rate": 2e-06, + "loss": 0.3199, + "step": 365 + }, + { + "epoch": 0.08490894327804199, + "grad_norm": 19.190504550314518, + "learning_rate": 2e-06, + "loss": 0.3784, + "step": 366 + }, + { + "epoch": 0.08514093492634266, + "grad_norm": 22.69383405935886, + "learning_rate": 2e-06, + "loss": 0.3864, + "step": 367 + }, + { + "epoch": 0.0853729265746433, + "grad_norm": 6.1048447030660125, + "learning_rate": 2e-06, + "loss": 0.3483, + "step": 368 + }, + { + "epoch": 0.08560491822294397, + "grad_norm": 25.73071880587071, + "learning_rate": 2e-06, + "loss": 0.4592, + "step": 369 + }, + { + "epoch": 0.08583690987124463, + "grad_norm": 18.351202040668635, + "learning_rate": 2e-06, + "loss": 0.3476, + "step": 370 + }, + { + "epoch": 0.0860689015195453, + "grad_norm": 16.26909477918099, + "learning_rate": 2e-06, + "loss": 0.3124, + "step": 371 + }, + { + "epoch": 0.08630089316784596, + "grad_norm": 14.241063324741525, + "learning_rate": 2e-06, + "loss": 0.3076, + "step": 372 + }, + { + "epoch": 0.08653288481614661, + "grad_norm": 14.506215185214822, + "learning_rate": 2e-06, + "loss": 0.3984, + "step": 373 + }, + { + "epoch": 0.08676487646444728, + "grad_norm": 9.382376932185698, + "learning_rate": 2e-06, + "loss": 0.2976, + "step": 374 + }, + { + "epoch": 0.08699686811274794, + "grad_norm": 50.59490449016764, + "learning_rate": 2e-06, + "loss": 0.3454, + "step": 375 + }, + { + "epoch": 0.0872288597610486, + "grad_norm": 27.547531076602287, + "learning_rate": 2e-06, + "loss": 0.4362, + "step": 376 + }, + { + "epoch": 0.08746085140934927, + "grad_norm": 16.082596257621272, + "learning_rate": 2e-06, + "loss": 0.3319, + "step": 377 + }, + { + "epoch": 0.08769284305764992, + "grad_norm": 31.876537236032785, + "learning_rate": 2e-06, + "loss": 0.4915, + "step": 378 + }, + { + "epoch": 0.08792483470595058, + "grad_norm": 22.54072299553908, + "learning_rate": 2e-06, + "loss": 0.3151, + "step": 379 + }, + { + "epoch": 0.08815682635425125, + "grad_norm": 17.614758213456692, + "learning_rate": 2e-06, + "loss": 0.3949, + "step": 380 + }, + { + "epoch": 0.08838881800255191, + "grad_norm": 14.148275945729827, + "learning_rate": 2e-06, + "loss": 0.3013, + "step": 381 + }, + { + "epoch": 0.08862080965085258, + "grad_norm": 11.314995508339132, + "learning_rate": 2e-06, + "loss": 0.285, + "step": 382 + }, + { + "epoch": 0.08885280129915323, + "grad_norm": 17.241195116605457, + "learning_rate": 2e-06, + "loss": 0.3807, + "step": 383 + }, + { + "epoch": 0.08908479294745389, + "grad_norm": 15.385646501389344, + "learning_rate": 2e-06, + "loss": 0.3809, + "step": 384 + }, + { + "epoch": 0.08931678459575455, + "grad_norm": 17.684525111155942, + "learning_rate": 2e-06, + "loss": 0.1991, + "step": 385 + }, + { + "epoch": 0.08954877624405522, + "grad_norm": 24.501559406718627, + "learning_rate": 2e-06, + "loss": 0.4261, + "step": 386 + }, + { + "epoch": 0.08978076789235588, + "grad_norm": 18.143176673818, + "learning_rate": 2e-06, + "loss": 0.2896, + "step": 387 + }, + { + "epoch": 0.09001275954065653, + "grad_norm": 27.499625471928624, + "learning_rate": 2e-06, + "loss": 0.3139, + "step": 388 + }, + { + "epoch": 0.0902447511889572, + "grad_norm": 13.029164508292824, + "learning_rate": 2e-06, + "loss": 0.3339, + "step": 389 + }, + { + "epoch": 0.09047674283725786, + "grad_norm": 22.06911169822742, + "learning_rate": 2e-06, + "loss": 0.4013, + "step": 390 + }, + { + "epoch": 0.09070873448555852, + "grad_norm": 25.660629113538942, + "learning_rate": 2e-06, + "loss": 0.3766, + "step": 391 + }, + { + "epoch": 0.09094072613385919, + "grad_norm": 12.330854489315392, + "learning_rate": 2e-06, + "loss": 0.3189, + "step": 392 + }, + { + "epoch": 0.09117271778215984, + "grad_norm": 12.26820205350865, + "learning_rate": 2e-06, + "loss": 0.3068, + "step": 393 + }, + { + "epoch": 0.0914047094304605, + "grad_norm": 19.489121042795496, + "learning_rate": 2e-06, + "loss": 0.315, + "step": 394 + }, + { + "epoch": 0.09163670107876117, + "grad_norm": 17.211896639231988, + "learning_rate": 2e-06, + "loss": 0.2837, + "step": 395 + }, + { + "epoch": 0.09186869272706183, + "grad_norm": 12.236343790554217, + "learning_rate": 2e-06, + "loss": 0.2436, + "step": 396 + }, + { + "epoch": 0.0921006843753625, + "grad_norm": 12.287095537549003, + "learning_rate": 2e-06, + "loss": 0.3363, + "step": 397 + }, + { + "epoch": 0.09233267602366314, + "grad_norm": 36.20698132473805, + "learning_rate": 2e-06, + "loss": 0.3322, + "step": 398 + }, + { + "epoch": 0.09256466767196381, + "grad_norm": 30.012469417565942, + "learning_rate": 2e-06, + "loss": 0.3058, + "step": 399 + }, + { + "epoch": 0.09279665932026447, + "grad_norm": 18.70204786894355, + "learning_rate": 2e-06, + "loss": 0.28, + "step": 400 + }, + { + "epoch": 0.09302865096856514, + "grad_norm": 23.040397243965792, + "learning_rate": 2e-06, + "loss": 0.3506, + "step": 401 + }, + { + "epoch": 0.0932606426168658, + "grad_norm": 14.503689982051851, + "learning_rate": 2e-06, + "loss": 0.3534, + "step": 402 + }, + { + "epoch": 0.09349263426516645, + "grad_norm": 19.06995983751695, + "learning_rate": 2e-06, + "loss": 0.3877, + "step": 403 + }, + { + "epoch": 0.09372462591346711, + "grad_norm": 15.528379690183382, + "learning_rate": 2e-06, + "loss": 0.2853, + "step": 404 + }, + { + "epoch": 0.09395661756176778, + "grad_norm": 12.183328931435046, + "learning_rate": 2e-06, + "loss": 0.2328, + "step": 405 + }, + { + "epoch": 0.09418860921006844, + "grad_norm": 10.249081656164702, + "learning_rate": 2e-06, + "loss": 0.3154, + "step": 406 + }, + { + "epoch": 0.0944206008583691, + "grad_norm": 34.79842136298589, + "learning_rate": 2e-06, + "loss": 0.392, + "step": 407 + }, + { + "epoch": 0.09465259250666976, + "grad_norm": 27.944587538361805, + "learning_rate": 2e-06, + "loss": 0.4147, + "step": 408 + }, + { + "epoch": 0.09488458415497042, + "grad_norm": 20.30318182509188, + "learning_rate": 2e-06, + "loss": 0.4244, + "step": 409 + }, + { + "epoch": 0.09511657580327108, + "grad_norm": 43.576920841913044, + "learning_rate": 2e-06, + "loss": 0.49, + "step": 410 + }, + { + "epoch": 0.09534856745157175, + "grad_norm": 25.236131980802266, + "learning_rate": 2e-06, + "loss": 0.4109, + "step": 411 + }, + { + "epoch": 0.09558055909987241, + "grad_norm": 29.170644729481666, + "learning_rate": 2e-06, + "loss": 0.5216, + "step": 412 + }, + { + "epoch": 0.09581255074817306, + "grad_norm": 19.516436049903138, + "learning_rate": 2e-06, + "loss": 0.4077, + "step": 413 + }, + { + "epoch": 0.09604454239647373, + "grad_norm": 17.128714650282106, + "learning_rate": 2e-06, + "loss": 0.3999, + "step": 414 + }, + { + "epoch": 0.09627653404477439, + "grad_norm": 12.006511406067414, + "learning_rate": 2e-06, + "loss": 0.2462, + "step": 415 + }, + { + "epoch": 0.09650852569307505, + "grad_norm": 16.275965696790472, + "learning_rate": 2e-06, + "loss": 0.302, + "step": 416 + }, + { + "epoch": 0.0967405173413757, + "grad_norm": 14.7500730370023, + "learning_rate": 2e-06, + "loss": 0.2836, + "step": 417 + }, + { + "epoch": 0.09697250898967637, + "grad_norm": 18.667197603654824, + "learning_rate": 2e-06, + "loss": 0.3598, + "step": 418 + }, + { + "epoch": 0.09720450063797703, + "grad_norm": 17.955626925134116, + "learning_rate": 2e-06, + "loss": 0.2975, + "step": 419 + }, + { + "epoch": 0.0974364922862777, + "grad_norm": 21.36474071148061, + "learning_rate": 2e-06, + "loss": 0.3349, + "step": 420 + }, + { + "epoch": 0.09766848393457836, + "grad_norm": 37.371825648292855, + "learning_rate": 2e-06, + "loss": 0.3935, + "step": 421 + }, + { + "epoch": 0.09790047558287901, + "grad_norm": 25.24655195690179, + "learning_rate": 2e-06, + "loss": 0.3842, + "step": 422 + }, + { + "epoch": 0.09813246723117967, + "grad_norm": 18.403246533860493, + "learning_rate": 2e-06, + "loss": 0.4077, + "step": 423 + }, + { + "epoch": 0.09836445887948034, + "grad_norm": 23.049463332646905, + "learning_rate": 2e-06, + "loss": 0.4219, + "step": 424 + }, + { + "epoch": 0.098596450527781, + "grad_norm": 22.791123792015163, + "learning_rate": 2e-06, + "loss": 0.386, + "step": 425 + }, + { + "epoch": 0.09882844217608167, + "grad_norm": 20.05570413178967, + "learning_rate": 2e-06, + "loss": 0.4471, + "step": 426 + }, + { + "epoch": 0.09906043382438232, + "grad_norm": 24.374099834079217, + "learning_rate": 2e-06, + "loss": 0.3568, + "step": 427 + }, + { + "epoch": 0.09929242547268298, + "grad_norm": 26.877154115175962, + "learning_rate": 2e-06, + "loss": 0.4576, + "step": 428 + }, + { + "epoch": 0.09952441712098364, + "grad_norm": 24.697932991796872, + "learning_rate": 2e-06, + "loss": 0.2783, + "step": 429 + }, + { + "epoch": 0.09975640876928431, + "grad_norm": 60.2313041091182, + "learning_rate": 2e-06, + "loss": 0.3714, + "step": 430 + }, + { + "epoch": 0.09998840041758497, + "grad_norm": 13.60633605845247, + "learning_rate": 2e-06, + "loss": 0.3332, + "step": 431 + }, + { + "epoch": 0.10022039206588562, + "grad_norm": 42.72261628492271, + "learning_rate": 2e-06, + "loss": 0.4944, + "step": 432 + }, + { + "epoch": 0.10045238371418629, + "grad_norm": 12.503324964724692, + "learning_rate": 2e-06, + "loss": 0.3633, + "step": 433 + }, + { + "epoch": 0.10068437536248695, + "grad_norm": 22.839555311492852, + "learning_rate": 2e-06, + "loss": 0.4408, + "step": 434 + }, + { + "epoch": 0.10091636701078761, + "grad_norm": 28.45242700473296, + "learning_rate": 2e-06, + "loss": 0.4241, + "step": 435 + }, + { + "epoch": 0.10114835865908828, + "grad_norm": 25.2907973397533, + "learning_rate": 2e-06, + "loss": 0.3094, + "step": 436 + }, + { + "epoch": 0.10138035030738893, + "grad_norm": 20.645973630360405, + "learning_rate": 2e-06, + "loss": 0.365, + "step": 437 + }, + { + "epoch": 0.10161234195568959, + "grad_norm": 13.503907309499892, + "learning_rate": 2e-06, + "loss": 0.3956, + "step": 438 + }, + { + "epoch": 0.10184433360399026, + "grad_norm": 26.58537180570347, + "learning_rate": 2e-06, + "loss": 0.3936, + "step": 439 + }, + { + "epoch": 0.10207632525229092, + "grad_norm": 14.061720828202937, + "learning_rate": 2e-06, + "loss": 0.399, + "step": 440 + }, + { + "epoch": 0.10230831690059158, + "grad_norm": 21.802081089998484, + "learning_rate": 2e-06, + "loss": 0.4034, + "step": 441 + }, + { + "epoch": 0.10254030854889223, + "grad_norm": 16.007298216847893, + "learning_rate": 2e-06, + "loss": 0.227, + "step": 442 + }, + { + "epoch": 0.1027723001971929, + "grad_norm": 13.543803794607616, + "learning_rate": 2e-06, + "loss": 0.2848, + "step": 443 + }, + { + "epoch": 0.10300429184549356, + "grad_norm": 11.917643036483089, + "learning_rate": 2e-06, + "loss": 0.3229, + "step": 444 + }, + { + "epoch": 0.10323628349379423, + "grad_norm": 10.540211537982008, + "learning_rate": 2e-06, + "loss": 0.3835, + "step": 445 + }, + { + "epoch": 0.10346827514209489, + "grad_norm": 13.057127763815975, + "learning_rate": 2e-06, + "loss": 0.294, + "step": 446 + }, + { + "epoch": 0.10370026679039554, + "grad_norm": 40.636902358293355, + "learning_rate": 2e-06, + "loss": 0.5286, + "step": 447 + }, + { + "epoch": 0.1039322584386962, + "grad_norm": 9.295471134140202, + "learning_rate": 2e-06, + "loss": 0.2626, + "step": 448 + }, + { + "epoch": 0.10416425008699687, + "grad_norm": 16.71420553457328, + "learning_rate": 2e-06, + "loss": 0.4241, + "step": 449 + }, + { + "epoch": 0.10439624173529753, + "grad_norm": 8.626134436732368, + "learning_rate": 2e-06, + "loss": 0.3212, + "step": 450 + }, + { + "epoch": 0.1046282333835982, + "grad_norm": 30.773166312699107, + "learning_rate": 2e-06, + "loss": 0.4788, + "step": 451 + }, + { + "epoch": 0.10486022503189885, + "grad_norm": 13.090372218449275, + "learning_rate": 2e-06, + "loss": 0.3331, + "step": 452 + }, + { + "epoch": 0.10509221668019951, + "grad_norm": 13.979905174034451, + "learning_rate": 2e-06, + "loss": 0.2949, + "step": 453 + }, + { + "epoch": 0.10532420832850017, + "grad_norm": 42.856080673219175, + "learning_rate": 2e-06, + "loss": 0.4169, + "step": 454 + }, + { + "epoch": 0.10555619997680084, + "grad_norm": 23.273704420042517, + "learning_rate": 2e-06, + "loss": 0.3922, + "step": 455 + }, + { + "epoch": 0.1057881916251015, + "grad_norm": 12.605371417868731, + "learning_rate": 2e-06, + "loss": 0.2395, + "step": 456 + }, + { + "epoch": 0.10602018327340215, + "grad_norm": 11.582289734117069, + "learning_rate": 2e-06, + "loss": 0.3205, + "step": 457 + }, + { + "epoch": 0.10625217492170282, + "grad_norm": 13.586535826870845, + "learning_rate": 2e-06, + "loss": 0.3675, + "step": 458 + }, + { + "epoch": 0.10648416657000348, + "grad_norm": 17.080354404774585, + "learning_rate": 2e-06, + "loss": 0.3486, + "step": 459 + }, + { + "epoch": 0.10671615821830414, + "grad_norm": 17.925684598773802, + "learning_rate": 2e-06, + "loss": 0.3022, + "step": 460 + }, + { + "epoch": 0.10694814986660481, + "grad_norm": 32.39272785312845, + "learning_rate": 2e-06, + "loss": 0.3359, + "step": 461 + }, + { + "epoch": 0.10718014151490546, + "grad_norm": 18.254480374694175, + "learning_rate": 2e-06, + "loss": 0.3263, + "step": 462 + }, + { + "epoch": 0.10741213316320612, + "grad_norm": 23.591719076716867, + "learning_rate": 2e-06, + "loss": 0.4446, + "step": 463 + }, + { + "epoch": 0.10764412481150679, + "grad_norm": 19.865109921119632, + "learning_rate": 2e-06, + "loss": 0.4048, + "step": 464 + }, + { + "epoch": 0.10787611645980745, + "grad_norm": 8.884190876686526, + "learning_rate": 2e-06, + "loss": 0.2853, + "step": 465 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 19.61323282534642, + "learning_rate": 2e-06, + "loss": 0.3167, + "step": 466 + }, + { + "epoch": 0.10834009975640876, + "grad_norm": 12.695565418193748, + "learning_rate": 2e-06, + "loss": 0.369, + "step": 467 + }, + { + "epoch": 0.10857209140470943, + "grad_norm": 10.243463772330834, + "learning_rate": 2e-06, + "loss": 0.3189, + "step": 468 + }, + { + "epoch": 0.10880408305301009, + "grad_norm": 10.293892595757947, + "learning_rate": 2e-06, + "loss": 0.3302, + "step": 469 + }, + { + "epoch": 0.10903607470131076, + "grad_norm": 21.389390468394783, + "learning_rate": 2e-06, + "loss": 0.3741, + "step": 470 + }, + { + "epoch": 0.10926806634961142, + "grad_norm": 11.940062834217704, + "learning_rate": 2e-06, + "loss": 0.3796, + "step": 471 + }, + { + "epoch": 0.10950005799791207, + "grad_norm": 9.774522293402262, + "learning_rate": 2e-06, + "loss": 0.2699, + "step": 472 + }, + { + "epoch": 0.10973204964621273, + "grad_norm": 11.12192006208675, + "learning_rate": 2e-06, + "loss": 0.3056, + "step": 473 + }, + { + "epoch": 0.1099640412945134, + "grad_norm": 17.91288583179564, + "learning_rate": 2e-06, + "loss": 0.3468, + "step": 474 + }, + { + "epoch": 0.11019603294281406, + "grad_norm": 20.480071149104823, + "learning_rate": 2e-06, + "loss": 0.3443, + "step": 475 + }, + { + "epoch": 0.11042802459111473, + "grad_norm": 24.143986404076905, + "learning_rate": 2e-06, + "loss": 0.4692, + "step": 476 + }, + { + "epoch": 0.11066001623941538, + "grad_norm": 14.08740260271415, + "learning_rate": 2e-06, + "loss": 0.3726, + "step": 477 + }, + { + "epoch": 0.11089200788771604, + "grad_norm": 10.616284079876811, + "learning_rate": 2e-06, + "loss": 0.3315, + "step": 478 + }, + { + "epoch": 0.1111239995360167, + "grad_norm": 12.99385356040717, + "learning_rate": 2e-06, + "loss": 0.2818, + "step": 479 + }, + { + "epoch": 0.11135599118431737, + "grad_norm": 6.6027185157360035, + "learning_rate": 2e-06, + "loss": 0.2372, + "step": 480 + }, + { + "epoch": 0.11158798283261803, + "grad_norm": 17.59288186274003, + "learning_rate": 2e-06, + "loss": 0.3019, + "step": 481 + }, + { + "epoch": 0.11181997448091868, + "grad_norm": 18.112113143400485, + "learning_rate": 2e-06, + "loss": 0.3168, + "step": 482 + }, + { + "epoch": 0.11205196612921935, + "grad_norm": 18.58516521806112, + "learning_rate": 2e-06, + "loss": 0.2882, + "step": 483 + }, + { + "epoch": 0.11228395777752001, + "grad_norm": 20.018829935182424, + "learning_rate": 2e-06, + "loss": 0.326, + "step": 484 + }, + { + "epoch": 0.11251594942582067, + "grad_norm": 20.822972874843753, + "learning_rate": 2e-06, + "loss": 0.4031, + "step": 485 + }, + { + "epoch": 0.11274794107412134, + "grad_norm": 21.934178884656355, + "learning_rate": 2e-06, + "loss": 0.3179, + "step": 486 + }, + { + "epoch": 0.11297993272242199, + "grad_norm": 21.66633969901522, + "learning_rate": 2e-06, + "loss": 0.3193, + "step": 487 + }, + { + "epoch": 0.11321192437072265, + "grad_norm": 16.374257943413088, + "learning_rate": 2e-06, + "loss": 0.374, + "step": 488 + }, + { + "epoch": 0.11344391601902332, + "grad_norm": 19.504755329296714, + "learning_rate": 2e-06, + "loss": 0.3048, + "step": 489 + }, + { + "epoch": 0.11367590766732398, + "grad_norm": 14.763736020821334, + "learning_rate": 2e-06, + "loss": 0.3575, + "step": 490 + }, + { + "epoch": 0.11390789931562464, + "grad_norm": 10.60113072500508, + "learning_rate": 2e-06, + "loss": 0.2901, + "step": 491 + }, + { + "epoch": 0.1141398909639253, + "grad_norm": 15.115397184387295, + "learning_rate": 2e-06, + "loss": 0.3871, + "step": 492 + }, + { + "epoch": 0.11437188261222596, + "grad_norm": 17.2191940468864, + "learning_rate": 2e-06, + "loss": 0.4249, + "step": 493 + }, + { + "epoch": 0.11460387426052662, + "grad_norm": 12.074212003251892, + "learning_rate": 2e-06, + "loss": 0.2801, + "step": 494 + }, + { + "epoch": 0.11483586590882729, + "grad_norm": 35.686595454103376, + "learning_rate": 2e-06, + "loss": 0.3694, + "step": 495 + }, + { + "epoch": 0.11506785755712794, + "grad_norm": 25.21616405090084, + "learning_rate": 2e-06, + "loss": 0.3483, + "step": 496 + }, + { + "epoch": 0.1152998492054286, + "grad_norm": 17.78054548198579, + "learning_rate": 2e-06, + "loss": 0.3453, + "step": 497 + }, + { + "epoch": 0.11553184085372926, + "grad_norm": 16.1860008387927, + "learning_rate": 2e-06, + "loss": 0.3795, + "step": 498 + }, + { + "epoch": 0.11576383250202993, + "grad_norm": 10.73184589288529, + "learning_rate": 2e-06, + "loss": 0.2779, + "step": 499 + }, + { + "epoch": 0.11599582415033059, + "grad_norm": 21.798651290466267, + "learning_rate": 2e-06, + "loss": 0.3505, + "step": 500 + }, + { + "epoch": 0.11622781579863124, + "grad_norm": 20.404407291282162, + "learning_rate": 2e-06, + "loss": 0.3475, + "step": 501 + }, + { + "epoch": 0.1164598074469319, + "grad_norm": 21.873321256737448, + "learning_rate": 2e-06, + "loss": 0.4091, + "step": 502 + }, + { + "epoch": 0.11669179909523257, + "grad_norm": 11.859969122962756, + "learning_rate": 2e-06, + "loss": 0.3084, + "step": 503 + }, + { + "epoch": 0.11692379074353323, + "grad_norm": 26.627204119044684, + "learning_rate": 2e-06, + "loss": 0.422, + "step": 504 + }, + { + "epoch": 0.1171557823918339, + "grad_norm": 8.92584627620991, + "learning_rate": 2e-06, + "loss": 0.3135, + "step": 505 + }, + { + "epoch": 0.11738777404013455, + "grad_norm": 11.993139051388196, + "learning_rate": 2e-06, + "loss": 0.3289, + "step": 506 + }, + { + "epoch": 0.11761976568843521, + "grad_norm": 12.897442540293113, + "learning_rate": 2e-06, + "loss": 0.2981, + "step": 507 + }, + { + "epoch": 0.11785175733673588, + "grad_norm": 14.110845460131074, + "learning_rate": 2e-06, + "loss": 0.2998, + "step": 508 + }, + { + "epoch": 0.11808374898503654, + "grad_norm": 25.33181859269643, + "learning_rate": 2e-06, + "loss": 0.4741, + "step": 509 + }, + { + "epoch": 0.1183157406333372, + "grad_norm": 13.834992576536884, + "learning_rate": 2e-06, + "loss": 0.3232, + "step": 510 + }, + { + "epoch": 0.11854773228163785, + "grad_norm": 14.893252752827362, + "learning_rate": 2e-06, + "loss": 0.3444, + "step": 511 + }, + { + "epoch": 0.11877972392993852, + "grad_norm": 15.761119502374642, + "learning_rate": 2e-06, + "loss": 0.3421, + "step": 512 + }, + { + "epoch": 0.11901171557823918, + "grad_norm": 12.35069424882054, + "learning_rate": 2e-06, + "loss": 0.3332, + "step": 513 + }, + { + "epoch": 0.11924370722653985, + "grad_norm": 19.600361404784426, + "learning_rate": 2e-06, + "loss": 0.377, + "step": 514 + }, + { + "epoch": 0.11947569887484051, + "grad_norm": 24.217849992141204, + "learning_rate": 2e-06, + "loss": 0.4162, + "step": 515 + }, + { + "epoch": 0.11970769052314116, + "grad_norm": 17.814790081649537, + "learning_rate": 2e-06, + "loss": 0.3611, + "step": 516 + }, + { + "epoch": 0.11993968217144182, + "grad_norm": 11.464981304657163, + "learning_rate": 2e-06, + "loss": 0.2793, + "step": 517 + }, + { + "epoch": 0.12017167381974249, + "grad_norm": 26.99997443522923, + "learning_rate": 2e-06, + "loss": 0.3731, + "step": 518 + }, + { + "epoch": 0.12040366546804315, + "grad_norm": 17.150784636126104, + "learning_rate": 2e-06, + "loss": 0.4064, + "step": 519 + }, + { + "epoch": 0.12063565711634382, + "grad_norm": 17.853860712732363, + "learning_rate": 2e-06, + "loss": 0.3889, + "step": 520 + }, + { + "epoch": 0.12086764876464447, + "grad_norm": 14.08571749826114, + "learning_rate": 2e-06, + "loss": 0.2729, + "step": 521 + }, + { + "epoch": 0.12109964041294513, + "grad_norm": 20.754236483960852, + "learning_rate": 2e-06, + "loss": 0.4262, + "step": 522 + }, + { + "epoch": 0.1213316320612458, + "grad_norm": 16.55873039914546, + "learning_rate": 2e-06, + "loss": 0.3337, + "step": 523 + }, + { + "epoch": 0.12156362370954646, + "grad_norm": 32.91313558109254, + "learning_rate": 2e-06, + "loss": 0.4844, + "step": 524 + }, + { + "epoch": 0.12179561535784712, + "grad_norm": 11.950163381560618, + "learning_rate": 2e-06, + "loss": 0.3711, + "step": 525 + }, + { + "epoch": 0.12202760700614777, + "grad_norm": 34.77644420590345, + "learning_rate": 2e-06, + "loss": 0.5912, + "step": 526 + }, + { + "epoch": 0.12225959865444844, + "grad_norm": 18.000554624080966, + "learning_rate": 2e-06, + "loss": 0.4028, + "step": 527 + }, + { + "epoch": 0.1224915903027491, + "grad_norm": 12.121073135087734, + "learning_rate": 2e-06, + "loss": 0.3123, + "step": 528 + }, + { + "epoch": 0.12272358195104976, + "grad_norm": 26.90471471621084, + "learning_rate": 2e-06, + "loss": 0.427, + "step": 529 + }, + { + "epoch": 0.12295557359935043, + "grad_norm": 21.990425205897445, + "learning_rate": 2e-06, + "loss": 0.4281, + "step": 530 + }, + { + "epoch": 0.12318756524765108, + "grad_norm": 19.60118632192807, + "learning_rate": 2e-06, + "loss": 0.3642, + "step": 531 + }, + { + "epoch": 0.12341955689595174, + "grad_norm": 11.736726359770271, + "learning_rate": 2e-06, + "loss": 0.2567, + "step": 532 + }, + { + "epoch": 0.12365154854425241, + "grad_norm": 12.09972833385718, + "learning_rate": 2e-06, + "loss": 0.3409, + "step": 533 + }, + { + "epoch": 0.12388354019255307, + "grad_norm": 9.957374867715385, + "learning_rate": 2e-06, + "loss": 0.2945, + "step": 534 + }, + { + "epoch": 0.12411553184085374, + "grad_norm": 10.561712491251779, + "learning_rate": 2e-06, + "loss": 0.3149, + "step": 535 + }, + { + "epoch": 0.12434752348915439, + "grad_norm": 22.962972593305583, + "learning_rate": 2e-06, + "loss": 0.3786, + "step": 536 + }, + { + "epoch": 0.12457951513745505, + "grad_norm": 21.81855834354691, + "learning_rate": 2e-06, + "loss": 0.4408, + "step": 537 + }, + { + "epoch": 0.12481150678575571, + "grad_norm": 43.577518805619206, + "learning_rate": 2e-06, + "loss": 0.5206, + "step": 538 + }, + { + "epoch": 0.12504349843405638, + "grad_norm": 21.43613203738653, + "learning_rate": 2e-06, + "loss": 0.4016, + "step": 539 + }, + { + "epoch": 0.12527549008235703, + "grad_norm": 8.521408142301265, + "learning_rate": 2e-06, + "loss": 0.3127, + "step": 540 + }, + { + "epoch": 0.1255074817306577, + "grad_norm": 21.675160395032297, + "learning_rate": 2e-06, + "loss": 0.4071, + "step": 541 + }, + { + "epoch": 0.12573947337895836, + "grad_norm": 10.01476995805297, + "learning_rate": 2e-06, + "loss": 0.3015, + "step": 542 + }, + { + "epoch": 0.125971465027259, + "grad_norm": 12.745527693576305, + "learning_rate": 2e-06, + "loss": 0.3208, + "step": 543 + }, + { + "epoch": 0.12620345667555968, + "grad_norm": 10.28277249296884, + "learning_rate": 2e-06, + "loss": 0.2952, + "step": 544 + }, + { + "epoch": 0.12643544832386033, + "grad_norm": 24.837971502781365, + "learning_rate": 2e-06, + "loss": 0.3971, + "step": 545 + }, + { + "epoch": 0.126667439972161, + "grad_norm": 10.839451211684377, + "learning_rate": 2e-06, + "loss": 0.336, + "step": 546 + }, + { + "epoch": 0.12689943162046166, + "grad_norm": 15.188607097212989, + "learning_rate": 2e-06, + "loss": 0.2531, + "step": 547 + }, + { + "epoch": 0.1271314232687623, + "grad_norm": 11.852306795226589, + "learning_rate": 2e-06, + "loss": 0.3548, + "step": 548 + }, + { + "epoch": 0.127363414917063, + "grad_norm": 10.728552291466901, + "learning_rate": 2e-06, + "loss": 0.2975, + "step": 549 + }, + { + "epoch": 0.12759540656536364, + "grad_norm": 12.197563885040825, + "learning_rate": 2e-06, + "loss": 0.3412, + "step": 550 + }, + { + "epoch": 0.12782739821366432, + "grad_norm": 11.460186845755205, + "learning_rate": 2e-06, + "loss": 0.3667, + "step": 551 + }, + { + "epoch": 0.12805938986196497, + "grad_norm": 24.191419011715134, + "learning_rate": 2e-06, + "loss": 0.4359, + "step": 552 + }, + { + "epoch": 0.12829138151026562, + "grad_norm": 15.072369818460848, + "learning_rate": 2e-06, + "loss": 0.3481, + "step": 553 + }, + { + "epoch": 0.1285233731585663, + "grad_norm": 15.493554128998317, + "learning_rate": 2e-06, + "loss": 0.3944, + "step": 554 + }, + { + "epoch": 0.12875536480686695, + "grad_norm": 10.299584834939997, + "learning_rate": 2e-06, + "loss": 0.2933, + "step": 555 + }, + { + "epoch": 0.12898735645516762, + "grad_norm": 19.667944578719855, + "learning_rate": 2e-06, + "loss": 0.3843, + "step": 556 + }, + { + "epoch": 0.12921934810346827, + "grad_norm": 13.020604199902435, + "learning_rate": 2e-06, + "loss": 0.3362, + "step": 557 + }, + { + "epoch": 0.12945133975176892, + "grad_norm": 18.494565081255427, + "learning_rate": 2e-06, + "loss": 0.2426, + "step": 558 + }, + { + "epoch": 0.1296833314000696, + "grad_norm": 11.481350296677014, + "learning_rate": 2e-06, + "loss": 0.3884, + "step": 559 + }, + { + "epoch": 0.12991532304837025, + "grad_norm": 14.37603161322684, + "learning_rate": 2e-06, + "loss": 0.3521, + "step": 560 + }, + { + "epoch": 0.13014731469667093, + "grad_norm": 17.945165746050844, + "learning_rate": 2e-06, + "loss": 0.4201, + "step": 561 + }, + { + "epoch": 0.13037930634497158, + "grad_norm": 10.206165197896242, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 562 + }, + { + "epoch": 0.13061129799327223, + "grad_norm": 11.366171051296087, + "learning_rate": 2e-06, + "loss": 0.2697, + "step": 563 + }, + { + "epoch": 0.1308432896415729, + "grad_norm": 17.908668657075456, + "learning_rate": 2e-06, + "loss": 0.3358, + "step": 564 + }, + { + "epoch": 0.13107528128987356, + "grad_norm": 15.699717818446853, + "learning_rate": 2e-06, + "loss": 0.379, + "step": 565 + }, + { + "epoch": 0.13130727293817424, + "grad_norm": 16.386214174399377, + "learning_rate": 2e-06, + "loss": 0.3298, + "step": 566 + }, + { + "epoch": 0.13153926458647489, + "grad_norm": 10.902641798900966, + "learning_rate": 2e-06, + "loss": 0.2858, + "step": 567 + }, + { + "epoch": 0.13177125623477554, + "grad_norm": 19.20552496348048, + "learning_rate": 2e-06, + "loss": 0.3325, + "step": 568 + }, + { + "epoch": 0.1320032478830762, + "grad_norm": 17.225944620309544, + "learning_rate": 2e-06, + "loss": 0.3848, + "step": 569 + }, + { + "epoch": 0.13223523953137686, + "grad_norm": 12.063065237924471, + "learning_rate": 2e-06, + "loss": 0.3181, + "step": 570 + }, + { + "epoch": 0.13246723117967754, + "grad_norm": 15.04183112468636, + "learning_rate": 2e-06, + "loss": 0.3095, + "step": 571 + }, + { + "epoch": 0.1326992228279782, + "grad_norm": 29.59285246867334, + "learning_rate": 2e-06, + "loss": 0.3663, + "step": 572 + }, + { + "epoch": 0.13293121447627884, + "grad_norm": 22.314922985673483, + "learning_rate": 2e-06, + "loss": 0.3433, + "step": 573 + }, + { + "epoch": 0.13316320612457952, + "grad_norm": 20.246625843146674, + "learning_rate": 2e-06, + "loss": 0.3287, + "step": 574 + }, + { + "epoch": 0.13339519777288017, + "grad_norm": 35.745096593143096, + "learning_rate": 2e-06, + "loss": 0.5195, + "step": 575 + }, + { + "epoch": 0.13362718942118085, + "grad_norm": 21.991019971295252, + "learning_rate": 2e-06, + "loss": 0.3267, + "step": 576 + }, + { + "epoch": 0.1338591810694815, + "grad_norm": 11.547842625012965, + "learning_rate": 2e-06, + "loss": 0.3013, + "step": 577 + }, + { + "epoch": 0.13409117271778215, + "grad_norm": 7.575492023294917, + "learning_rate": 2e-06, + "loss": 0.2593, + "step": 578 + }, + { + "epoch": 0.13432316436608283, + "grad_norm": 14.200955942542265, + "learning_rate": 2e-06, + "loss": 0.1945, + "step": 579 + }, + { + "epoch": 0.13455515601438348, + "grad_norm": 17.254167222368046, + "learning_rate": 2e-06, + "loss": 0.3549, + "step": 580 + }, + { + "epoch": 0.13478714766268415, + "grad_norm": 14.858177542842116, + "learning_rate": 2e-06, + "loss": 0.2782, + "step": 581 + }, + { + "epoch": 0.1350191393109848, + "grad_norm": 14.332877132531262, + "learning_rate": 2e-06, + "loss": 0.2892, + "step": 582 + }, + { + "epoch": 0.13525113095928545, + "grad_norm": 11.706174595821098, + "learning_rate": 2e-06, + "loss": 0.263, + "step": 583 + }, + { + "epoch": 0.13548312260758613, + "grad_norm": 13.445854658600583, + "learning_rate": 2e-06, + "loss": 0.3788, + "step": 584 + }, + { + "epoch": 0.13571511425588678, + "grad_norm": 11.033594620835709, + "learning_rate": 2e-06, + "loss": 0.3227, + "step": 585 + }, + { + "epoch": 0.13594710590418746, + "grad_norm": 19.704778968598895, + "learning_rate": 2e-06, + "loss": 0.3464, + "step": 586 + }, + { + "epoch": 0.1361790975524881, + "grad_norm": 23.207724263487698, + "learning_rate": 2e-06, + "loss": 0.4422, + "step": 587 + }, + { + "epoch": 0.13641108920078876, + "grad_norm": 17.446548566313393, + "learning_rate": 2e-06, + "loss": 0.3404, + "step": 588 + }, + { + "epoch": 0.13664308084908944, + "grad_norm": 12.890562334412161, + "learning_rate": 2e-06, + "loss": 0.3314, + "step": 589 + }, + { + "epoch": 0.1368750724973901, + "grad_norm": 9.320297975561534, + "learning_rate": 2e-06, + "loss": 0.2527, + "step": 590 + }, + { + "epoch": 0.13710706414569077, + "grad_norm": 13.408963579965981, + "learning_rate": 2e-06, + "loss": 0.3661, + "step": 591 + }, + { + "epoch": 0.13733905579399142, + "grad_norm": 17.926859871243074, + "learning_rate": 2e-06, + "loss": 0.3791, + "step": 592 + }, + { + "epoch": 0.13757104744229207, + "grad_norm": 6.07597891688596, + "learning_rate": 2e-06, + "loss": 0.2008, + "step": 593 + }, + { + "epoch": 0.13780303909059274, + "grad_norm": 19.555073576535285, + "learning_rate": 2e-06, + "loss": 0.331, + "step": 594 + }, + { + "epoch": 0.1380350307388934, + "grad_norm": 12.512008844378691, + "learning_rate": 2e-06, + "loss": 0.3331, + "step": 595 + }, + { + "epoch": 0.13826702238719407, + "grad_norm": 20.58792756025946, + "learning_rate": 2e-06, + "loss": 0.3247, + "step": 596 + }, + { + "epoch": 0.13849901403549472, + "grad_norm": 13.69854700104819, + "learning_rate": 2e-06, + "loss": 0.3676, + "step": 597 + }, + { + "epoch": 0.13873100568379537, + "grad_norm": 7.247642328163105, + "learning_rate": 2e-06, + "loss": 0.2787, + "step": 598 + }, + { + "epoch": 0.13896299733209605, + "grad_norm": 20.662562349843842, + "learning_rate": 2e-06, + "loss": 0.4137, + "step": 599 + }, + { + "epoch": 0.1391949889803967, + "grad_norm": 23.74211769605697, + "learning_rate": 2e-06, + "loss": 0.4117, + "step": 600 + }, + { + "epoch": 0.13942698062869738, + "grad_norm": 14.54527602752155, + "learning_rate": 2e-06, + "loss": 0.3589, + "step": 601 + }, + { + "epoch": 0.13965897227699803, + "grad_norm": 13.128633707615593, + "learning_rate": 2e-06, + "loss": 0.3302, + "step": 602 + }, + { + "epoch": 0.13989096392529868, + "grad_norm": 13.78506522209295, + "learning_rate": 2e-06, + "loss": 0.3327, + "step": 603 + }, + { + "epoch": 0.14012295557359936, + "grad_norm": 15.841352731539514, + "learning_rate": 2e-06, + "loss": 0.3152, + "step": 604 + }, + { + "epoch": 0.1403549472219, + "grad_norm": 9.421455470531463, + "learning_rate": 2e-06, + "loss": 0.3354, + "step": 605 + }, + { + "epoch": 0.14058693887020068, + "grad_norm": 10.08321808770723, + "learning_rate": 2e-06, + "loss": 0.2326, + "step": 606 + }, + { + "epoch": 0.14081893051850133, + "grad_norm": 18.689274942037507, + "learning_rate": 2e-06, + "loss": 0.4126, + "step": 607 + }, + { + "epoch": 0.14105092216680198, + "grad_norm": 12.358109062169618, + "learning_rate": 2e-06, + "loss": 0.3798, + "step": 608 + }, + { + "epoch": 0.14128291381510266, + "grad_norm": 21.92588737376762, + "learning_rate": 2e-06, + "loss": 0.4711, + "step": 609 + }, + { + "epoch": 0.1415149054634033, + "grad_norm": 19.45991230773491, + "learning_rate": 2e-06, + "loss": 0.4289, + "step": 610 + }, + { + "epoch": 0.141746897111704, + "grad_norm": 13.005478035817163, + "learning_rate": 2e-06, + "loss": 0.3862, + "step": 611 + }, + { + "epoch": 0.14197888876000464, + "grad_norm": 13.562324246271288, + "learning_rate": 2e-06, + "loss": 0.3999, + "step": 612 + }, + { + "epoch": 0.1422108804083053, + "grad_norm": 31.784848826892258, + "learning_rate": 2e-06, + "loss": 0.3502, + "step": 613 + }, + { + "epoch": 0.14244287205660597, + "grad_norm": 10.206030065161611, + "learning_rate": 2e-06, + "loss": 0.2569, + "step": 614 + }, + { + "epoch": 0.14267486370490662, + "grad_norm": 24.276173026549397, + "learning_rate": 2e-06, + "loss": 0.2952, + "step": 615 + }, + { + "epoch": 0.1429068553532073, + "grad_norm": 18.342906382940086, + "learning_rate": 2e-06, + "loss": 0.3026, + "step": 616 + }, + { + "epoch": 0.14313884700150795, + "grad_norm": 8.993574006033441, + "learning_rate": 2e-06, + "loss": 0.2586, + "step": 617 + }, + { + "epoch": 0.1433708386498086, + "grad_norm": 23.936719026871057, + "learning_rate": 2e-06, + "loss": 0.3789, + "step": 618 + }, + { + "epoch": 0.14360283029810927, + "grad_norm": 16.32194373617926, + "learning_rate": 2e-06, + "loss": 0.3839, + "step": 619 + }, + { + "epoch": 0.14383482194640992, + "grad_norm": 12.988145492420317, + "learning_rate": 2e-06, + "loss": 0.3824, + "step": 620 + }, + { + "epoch": 0.1440668135947106, + "grad_norm": 20.669035386443895, + "learning_rate": 2e-06, + "loss": 0.3758, + "step": 621 + }, + { + "epoch": 0.14429880524301125, + "grad_norm": 24.990471580039006, + "learning_rate": 2e-06, + "loss": 0.3434, + "step": 622 + }, + { + "epoch": 0.1445307968913119, + "grad_norm": 17.99722260635703, + "learning_rate": 2e-06, + "loss": 0.3738, + "step": 623 + }, + { + "epoch": 0.14476278853961258, + "grad_norm": 28.862374960031236, + "learning_rate": 2e-06, + "loss": 0.3462, + "step": 624 + }, + { + "epoch": 0.14499478018791323, + "grad_norm": 16.286544229911005, + "learning_rate": 2e-06, + "loss": 0.3677, + "step": 625 + }, + { + "epoch": 0.1452267718362139, + "grad_norm": 13.749182210734784, + "learning_rate": 2e-06, + "loss": 0.3409, + "step": 626 + }, + { + "epoch": 0.14545876348451456, + "grad_norm": 8.781576263071432, + "learning_rate": 2e-06, + "loss": 0.3408, + "step": 627 + }, + { + "epoch": 0.1456907551328152, + "grad_norm": 20.23008968468618, + "learning_rate": 2e-06, + "loss": 0.3334, + "step": 628 + }, + { + "epoch": 0.1459227467811159, + "grad_norm": 21.403961956791857, + "learning_rate": 2e-06, + "loss": 0.3627, + "step": 629 + }, + { + "epoch": 0.14615473842941654, + "grad_norm": 19.072138287773942, + "learning_rate": 2e-06, + "loss": 0.2983, + "step": 630 + }, + { + "epoch": 0.14638673007771721, + "grad_norm": 22.329610088782662, + "learning_rate": 2e-06, + "loss": 0.4157, + "step": 631 + }, + { + "epoch": 0.14661872172601786, + "grad_norm": 11.982533942082721, + "learning_rate": 2e-06, + "loss": 0.2553, + "step": 632 + }, + { + "epoch": 0.14685071337431851, + "grad_norm": 22.78573635381739, + "learning_rate": 2e-06, + "loss": 0.3195, + "step": 633 + }, + { + "epoch": 0.1470827050226192, + "grad_norm": 10.40653341397135, + "learning_rate": 2e-06, + "loss": 0.3246, + "step": 634 + }, + { + "epoch": 0.14731469667091984, + "grad_norm": 14.463244019994837, + "learning_rate": 2e-06, + "loss": 0.3692, + "step": 635 + }, + { + "epoch": 0.14754668831922052, + "grad_norm": 19.800480987537895, + "learning_rate": 2e-06, + "loss": 0.3389, + "step": 636 + }, + { + "epoch": 0.14777867996752117, + "grad_norm": 10.631478775111985, + "learning_rate": 2e-06, + "loss": 0.2379, + "step": 637 + }, + { + "epoch": 0.14801067161582182, + "grad_norm": 19.589467703354284, + "learning_rate": 2e-06, + "loss": 0.3806, + "step": 638 + }, + { + "epoch": 0.1482426632641225, + "grad_norm": 17.235561484760968, + "learning_rate": 2e-06, + "loss": 0.3159, + "step": 639 + }, + { + "epoch": 0.14847465491242315, + "grad_norm": 14.29645254007326, + "learning_rate": 2e-06, + "loss": 0.2914, + "step": 640 + }, + { + "epoch": 0.14870664656072383, + "grad_norm": 15.25682225167021, + "learning_rate": 2e-06, + "loss": 0.2814, + "step": 641 + }, + { + "epoch": 0.14893863820902448, + "grad_norm": 12.72336393966868, + "learning_rate": 2e-06, + "loss": 0.31, + "step": 642 + }, + { + "epoch": 0.14917062985732513, + "grad_norm": 34.56907190484323, + "learning_rate": 2e-06, + "loss": 0.394, + "step": 643 + }, + { + "epoch": 0.1494026215056258, + "grad_norm": 20.092572113695343, + "learning_rate": 2e-06, + "loss": 0.4555, + "step": 644 + }, + { + "epoch": 0.14963461315392645, + "grad_norm": 12.558275124737698, + "learning_rate": 2e-06, + "loss": 0.2319, + "step": 645 + }, + { + "epoch": 0.14986660480222713, + "grad_norm": 10.926979723682406, + "learning_rate": 2e-06, + "loss": 0.3174, + "step": 646 + }, + { + "epoch": 0.15009859645052778, + "grad_norm": 21.759994543709613, + "learning_rate": 2e-06, + "loss": 0.3082, + "step": 647 + }, + { + "epoch": 0.15033058809882843, + "grad_norm": 14.962828288114487, + "learning_rate": 2e-06, + "loss": 0.2977, + "step": 648 + }, + { + "epoch": 0.1505625797471291, + "grad_norm": 14.945826490956852, + "learning_rate": 2e-06, + "loss": 0.3518, + "step": 649 + }, + { + "epoch": 0.15079457139542976, + "grad_norm": 12.677037134321225, + "learning_rate": 2e-06, + "loss": 0.353, + "step": 650 + }, + { + "epoch": 0.15102656304373044, + "grad_norm": 16.329612153370707, + "learning_rate": 2e-06, + "loss": 0.3625, + "step": 651 + }, + { + "epoch": 0.1512585546920311, + "grad_norm": 25.316498260503856, + "learning_rate": 2e-06, + "loss": 0.3509, + "step": 652 + }, + { + "epoch": 0.15149054634033174, + "grad_norm": 9.505821526608154, + "learning_rate": 2e-06, + "loss": 0.3144, + "step": 653 + }, + { + "epoch": 0.15172253798863242, + "grad_norm": 23.54621096323739, + "learning_rate": 2e-06, + "loss": 0.3844, + "step": 654 + }, + { + "epoch": 0.15195452963693307, + "grad_norm": 11.882344502450662, + "learning_rate": 2e-06, + "loss": 0.3435, + "step": 655 + }, + { + "epoch": 0.15218652128523374, + "grad_norm": 21.91198574912016, + "learning_rate": 2e-06, + "loss": 0.3632, + "step": 656 + }, + { + "epoch": 0.1524185129335344, + "grad_norm": 12.393539964259297, + "learning_rate": 2e-06, + "loss": 0.278, + "step": 657 + }, + { + "epoch": 0.15265050458183504, + "grad_norm": 18.168511634243217, + "learning_rate": 2e-06, + "loss": 0.2467, + "step": 658 + }, + { + "epoch": 0.15288249623013572, + "grad_norm": 14.178757384068685, + "learning_rate": 2e-06, + "loss": 0.3257, + "step": 659 + }, + { + "epoch": 0.15311448787843637, + "grad_norm": 32.74130918364026, + "learning_rate": 2e-06, + "loss": 0.3114, + "step": 660 + }, + { + "epoch": 0.15334647952673705, + "grad_norm": 12.222002529077335, + "learning_rate": 2e-06, + "loss": 0.3026, + "step": 661 + }, + { + "epoch": 0.1535784711750377, + "grad_norm": 10.954915709678856, + "learning_rate": 2e-06, + "loss": 0.3513, + "step": 662 + }, + { + "epoch": 0.15381046282333835, + "grad_norm": 20.24850597550675, + "learning_rate": 2e-06, + "loss": 0.4142, + "step": 663 + }, + { + "epoch": 0.15404245447163903, + "grad_norm": 15.214951718141643, + "learning_rate": 2e-06, + "loss": 0.3569, + "step": 664 + }, + { + "epoch": 0.15427444611993968, + "grad_norm": 17.304343638378498, + "learning_rate": 2e-06, + "loss": 0.3982, + "step": 665 + }, + { + "epoch": 0.15450643776824036, + "grad_norm": 10.307606751687013, + "learning_rate": 2e-06, + "loss": 0.3254, + "step": 666 + }, + { + "epoch": 0.154738429416541, + "grad_norm": 15.932744708373177, + "learning_rate": 2e-06, + "loss": 0.3459, + "step": 667 + }, + { + "epoch": 0.15497042106484166, + "grad_norm": 22.80289035080992, + "learning_rate": 2e-06, + "loss": 0.3968, + "step": 668 + }, + { + "epoch": 0.15520241271314233, + "grad_norm": 23.14623782349953, + "learning_rate": 2e-06, + "loss": 0.2405, + "step": 669 + }, + { + "epoch": 0.15543440436144298, + "grad_norm": 18.0912009844887, + "learning_rate": 2e-06, + "loss": 0.338, + "step": 670 + }, + { + "epoch": 0.15566639600974366, + "grad_norm": 18.677691324496845, + "learning_rate": 2e-06, + "loss": 0.3047, + "step": 671 + }, + { + "epoch": 0.1558983876580443, + "grad_norm": 8.998825461022793, + "learning_rate": 2e-06, + "loss": 0.303, + "step": 672 + }, + { + "epoch": 0.15613037930634496, + "grad_norm": 18.045426173107654, + "learning_rate": 2e-06, + "loss": 0.3606, + "step": 673 + }, + { + "epoch": 0.15636237095464564, + "grad_norm": 11.154343481153669, + "learning_rate": 2e-06, + "loss": 0.3413, + "step": 674 + }, + { + "epoch": 0.1565943626029463, + "grad_norm": 13.332750429948154, + "learning_rate": 2e-06, + "loss": 0.3186, + "step": 675 + }, + { + "epoch": 0.15682635425124697, + "grad_norm": 15.74486222469269, + "learning_rate": 2e-06, + "loss": 0.3002, + "step": 676 + }, + { + "epoch": 0.15705834589954762, + "grad_norm": 19.667188570074135, + "learning_rate": 2e-06, + "loss": 0.4405, + "step": 677 + }, + { + "epoch": 0.15729033754784827, + "grad_norm": 21.031839435111806, + "learning_rate": 2e-06, + "loss": 0.3478, + "step": 678 + }, + { + "epoch": 0.15752232919614895, + "grad_norm": 10.39843349546204, + "learning_rate": 2e-06, + "loss": 0.266, + "step": 679 + }, + { + "epoch": 0.1577543208444496, + "grad_norm": 16.454956918604434, + "learning_rate": 2e-06, + "loss": 0.3867, + "step": 680 + }, + { + "epoch": 0.15798631249275025, + "grad_norm": 16.358326166838868, + "learning_rate": 2e-06, + "loss": 0.418, + "step": 681 + }, + { + "epoch": 0.15821830414105092, + "grad_norm": 25.612389596778975, + "learning_rate": 2e-06, + "loss": 0.3624, + "step": 682 + }, + { + "epoch": 0.15845029578935158, + "grad_norm": 10.449260622783445, + "learning_rate": 2e-06, + "loss": 0.2386, + "step": 683 + }, + { + "epoch": 0.15868228743765225, + "grad_norm": 11.170225689154774, + "learning_rate": 2e-06, + "loss": 0.2839, + "step": 684 + }, + { + "epoch": 0.1589142790859529, + "grad_norm": 20.379086082147165, + "learning_rate": 2e-06, + "loss": 0.362, + "step": 685 + }, + { + "epoch": 0.15914627073425355, + "grad_norm": 29.30564423884362, + "learning_rate": 2e-06, + "loss": 0.4104, + "step": 686 + }, + { + "epoch": 0.15937826238255423, + "grad_norm": 12.564718453938301, + "learning_rate": 2e-06, + "loss": 0.3009, + "step": 687 + }, + { + "epoch": 0.15961025403085488, + "grad_norm": 16.60002829086695, + "learning_rate": 2e-06, + "loss": 0.3076, + "step": 688 + }, + { + "epoch": 0.15984224567915556, + "grad_norm": 18.44420873851104, + "learning_rate": 2e-06, + "loss": 0.376, + "step": 689 + }, + { + "epoch": 0.1600742373274562, + "grad_norm": 14.716335866668372, + "learning_rate": 2e-06, + "loss": 0.3314, + "step": 690 + }, + { + "epoch": 0.16030622897575686, + "grad_norm": 11.136643251719736, + "learning_rate": 2e-06, + "loss": 0.3109, + "step": 691 + }, + { + "epoch": 0.16053822062405754, + "grad_norm": 15.005943488342586, + "learning_rate": 2e-06, + "loss": 0.3069, + "step": 692 + }, + { + "epoch": 0.1607702122723582, + "grad_norm": 24.658145570665926, + "learning_rate": 2e-06, + "loss": 0.3951, + "step": 693 + }, + { + "epoch": 0.16100220392065887, + "grad_norm": 10.62390063366631, + "learning_rate": 2e-06, + "loss": 0.2891, + "step": 694 + }, + { + "epoch": 0.16123419556895952, + "grad_norm": 14.095175484680707, + "learning_rate": 2e-06, + "loss": 0.3373, + "step": 695 + }, + { + "epoch": 0.16146618721726017, + "grad_norm": 12.851412497958066, + "learning_rate": 2e-06, + "loss": 0.3065, + "step": 696 + }, + { + "epoch": 0.16169817886556084, + "grad_norm": 8.31681767749006, + "learning_rate": 2e-06, + "loss": 0.3037, + "step": 697 + }, + { + "epoch": 0.1619301705138615, + "grad_norm": 15.879779774741346, + "learning_rate": 2e-06, + "loss": 0.2867, + "step": 698 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 15.024589339052074, + "learning_rate": 2e-06, + "loss": 0.3346, + "step": 699 + }, + { + "epoch": 0.16239415381046282, + "grad_norm": 25.728722695798243, + "learning_rate": 2e-06, + "loss": 0.3263, + "step": 700 + }, + { + "epoch": 0.16262614545876347, + "grad_norm": 13.962249772821057, + "learning_rate": 2e-06, + "loss": 0.4021, + "step": 701 + }, + { + "epoch": 0.16285813710706415, + "grad_norm": 16.6701984573303, + "learning_rate": 2e-06, + "loss": 0.2992, + "step": 702 + }, + { + "epoch": 0.1630901287553648, + "grad_norm": 20.731069653085676, + "learning_rate": 2e-06, + "loss": 0.2982, + "step": 703 + }, + { + "epoch": 0.16332212040366548, + "grad_norm": 22.497997523362585, + "learning_rate": 2e-06, + "loss": 0.334, + "step": 704 + }, + { + "epoch": 0.16355411205196613, + "grad_norm": 13.511717785171559, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 705 + }, + { + "epoch": 0.16378610370026678, + "grad_norm": 14.860067921359914, + "learning_rate": 2e-06, + "loss": 0.3137, + "step": 706 + }, + { + "epoch": 0.16401809534856746, + "grad_norm": 14.477755167374747, + "learning_rate": 2e-06, + "loss": 0.2181, + "step": 707 + }, + { + "epoch": 0.1642500869968681, + "grad_norm": 31.17589495214125, + "learning_rate": 2e-06, + "loss": 0.4513, + "step": 708 + }, + { + "epoch": 0.16448207864516878, + "grad_norm": 30.74207575332384, + "learning_rate": 2e-06, + "loss": 0.3825, + "step": 709 + }, + { + "epoch": 0.16471407029346943, + "grad_norm": 40.584780969108955, + "learning_rate": 2e-06, + "loss": 0.4184, + "step": 710 + }, + { + "epoch": 0.16494606194177008, + "grad_norm": 47.0752374589824, + "learning_rate": 2e-06, + "loss": 0.3444, + "step": 711 + }, + { + "epoch": 0.16517805359007076, + "grad_norm": 24.05198512196974, + "learning_rate": 2e-06, + "loss": 0.2746, + "step": 712 + }, + { + "epoch": 0.1654100452383714, + "grad_norm": 20.96193342212991, + "learning_rate": 2e-06, + "loss": 0.3844, + "step": 713 + }, + { + "epoch": 0.1656420368866721, + "grad_norm": 26.844108345388346, + "learning_rate": 2e-06, + "loss": 0.3303, + "step": 714 + }, + { + "epoch": 0.16587402853497274, + "grad_norm": 18.361559812521087, + "learning_rate": 2e-06, + "loss": 0.2599, + "step": 715 + }, + { + "epoch": 0.1661060201832734, + "grad_norm": 10.334556071049587, + "learning_rate": 2e-06, + "loss": 0.2415, + "step": 716 + }, + { + "epoch": 0.16633801183157407, + "grad_norm": 17.466059203458997, + "learning_rate": 2e-06, + "loss": 0.2934, + "step": 717 + }, + { + "epoch": 0.16657000347987472, + "grad_norm": 13.792496069247843, + "learning_rate": 2e-06, + "loss": 0.2913, + "step": 718 + }, + { + "epoch": 0.1668019951281754, + "grad_norm": 27.344398206578028, + "learning_rate": 2e-06, + "loss": 0.5, + "step": 719 + }, + { + "epoch": 0.16703398677647605, + "grad_norm": 16.570069610376084, + "learning_rate": 2e-06, + "loss": 0.3282, + "step": 720 + }, + { + "epoch": 0.1672659784247767, + "grad_norm": 12.517016004539395, + "learning_rate": 2e-06, + "loss": 0.316, + "step": 721 + }, + { + "epoch": 0.16749797007307737, + "grad_norm": 9.454686300920637, + "learning_rate": 2e-06, + "loss": 0.2546, + "step": 722 + }, + { + "epoch": 0.16772996172137802, + "grad_norm": 17.013538405587067, + "learning_rate": 2e-06, + "loss": 0.3796, + "step": 723 + }, + { + "epoch": 0.1679619533696787, + "grad_norm": 24.06500993181185, + "learning_rate": 2e-06, + "loss": 0.4782, + "step": 724 + }, + { + "epoch": 0.16819394501797935, + "grad_norm": 21.754943523584142, + "learning_rate": 2e-06, + "loss": 0.4722, + "step": 725 + }, + { + "epoch": 0.16842593666628, + "grad_norm": 19.33220164628588, + "learning_rate": 2e-06, + "loss": 0.3475, + "step": 726 + }, + { + "epoch": 0.16865792831458068, + "grad_norm": 14.11612192602318, + "learning_rate": 2e-06, + "loss": 0.3717, + "step": 727 + }, + { + "epoch": 0.16888991996288133, + "grad_norm": 20.07653914337054, + "learning_rate": 2e-06, + "loss": 0.3061, + "step": 728 + }, + { + "epoch": 0.169121911611182, + "grad_norm": 15.694126220394503, + "learning_rate": 2e-06, + "loss": 0.3679, + "step": 729 + }, + { + "epoch": 0.16935390325948266, + "grad_norm": 7.597244637446782, + "learning_rate": 2e-06, + "loss": 0.3023, + "step": 730 + }, + { + "epoch": 0.1695858949077833, + "grad_norm": 9.960347483549528, + "learning_rate": 2e-06, + "loss": 0.3413, + "step": 731 + }, + { + "epoch": 0.16981788655608399, + "grad_norm": 13.00232896611134, + "learning_rate": 2e-06, + "loss": 0.3, + "step": 732 + }, + { + "epoch": 0.17004987820438464, + "grad_norm": 18.64285936948962, + "learning_rate": 2e-06, + "loss": 0.3768, + "step": 733 + }, + { + "epoch": 0.1702818698526853, + "grad_norm": 19.56053782005133, + "learning_rate": 2e-06, + "loss": 0.3985, + "step": 734 + }, + { + "epoch": 0.17051386150098596, + "grad_norm": 10.622675208141215, + "learning_rate": 2e-06, + "loss": 0.3562, + "step": 735 + }, + { + "epoch": 0.1707458531492866, + "grad_norm": 15.804895543467035, + "learning_rate": 2e-06, + "loss": 0.3077, + "step": 736 + }, + { + "epoch": 0.1709778447975873, + "grad_norm": 18.447008280828154, + "learning_rate": 2e-06, + "loss": 0.3354, + "step": 737 + }, + { + "epoch": 0.17120983644588794, + "grad_norm": 14.621557018343863, + "learning_rate": 2e-06, + "loss": 0.459, + "step": 738 + }, + { + "epoch": 0.17144182809418862, + "grad_norm": 8.763421231576094, + "learning_rate": 2e-06, + "loss": 0.2067, + "step": 739 + }, + { + "epoch": 0.17167381974248927, + "grad_norm": 21.564285979581147, + "learning_rate": 2e-06, + "loss": 0.3686, + "step": 740 + }, + { + "epoch": 0.17190581139078992, + "grad_norm": 15.462860742192579, + "learning_rate": 2e-06, + "loss": 0.2838, + "step": 741 + }, + { + "epoch": 0.1721378030390906, + "grad_norm": 18.866472307797476, + "learning_rate": 2e-06, + "loss": 0.3532, + "step": 742 + }, + { + "epoch": 0.17236979468739125, + "grad_norm": 11.27689988835768, + "learning_rate": 2e-06, + "loss": 0.3262, + "step": 743 + }, + { + "epoch": 0.17260178633569193, + "grad_norm": 6.560862378907595, + "learning_rate": 2e-06, + "loss": 0.2374, + "step": 744 + }, + { + "epoch": 0.17283377798399258, + "grad_norm": 17.22733586413075, + "learning_rate": 2e-06, + "loss": 0.3586, + "step": 745 + }, + { + "epoch": 0.17306576963229323, + "grad_norm": 8.816541653445544, + "learning_rate": 2e-06, + "loss": 0.2539, + "step": 746 + }, + { + "epoch": 0.1732977612805939, + "grad_norm": 9.561143157495499, + "learning_rate": 2e-06, + "loss": 0.3186, + "step": 747 + }, + { + "epoch": 0.17352975292889455, + "grad_norm": 12.445921344090433, + "learning_rate": 2e-06, + "loss": 0.3107, + "step": 748 + }, + { + "epoch": 0.17376174457719523, + "grad_norm": 14.75541877744063, + "learning_rate": 2e-06, + "loss": 0.3665, + "step": 749 + }, + { + "epoch": 0.17399373622549588, + "grad_norm": 11.932681911379351, + "learning_rate": 2e-06, + "loss": 0.305, + "step": 750 + }, + { + "epoch": 0.17422572787379653, + "grad_norm": 13.778515110830838, + "learning_rate": 2e-06, + "loss": 0.2846, + "step": 751 + }, + { + "epoch": 0.1744577195220972, + "grad_norm": 17.98604309887815, + "learning_rate": 2e-06, + "loss": 0.3341, + "step": 752 + }, + { + "epoch": 0.17468971117039786, + "grad_norm": 17.334206140348677, + "learning_rate": 2e-06, + "loss": 0.2994, + "step": 753 + }, + { + "epoch": 0.17492170281869854, + "grad_norm": 17.67618987598563, + "learning_rate": 2e-06, + "loss": 0.4629, + "step": 754 + }, + { + "epoch": 0.1751536944669992, + "grad_norm": 13.136167347637274, + "learning_rate": 2e-06, + "loss": 0.3304, + "step": 755 + }, + { + "epoch": 0.17538568611529984, + "grad_norm": 8.952224830050373, + "learning_rate": 2e-06, + "loss": 0.3626, + "step": 756 + }, + { + "epoch": 0.17561767776360052, + "grad_norm": 11.193654263857521, + "learning_rate": 2e-06, + "loss": 0.2494, + "step": 757 + }, + { + "epoch": 0.17584966941190117, + "grad_norm": 22.320651074813423, + "learning_rate": 2e-06, + "loss": 0.3289, + "step": 758 + }, + { + "epoch": 0.17608166106020184, + "grad_norm": 17.535712646014808, + "learning_rate": 2e-06, + "loss": 0.4499, + "step": 759 + }, + { + "epoch": 0.1763136527085025, + "grad_norm": 12.733734517226111, + "learning_rate": 2e-06, + "loss": 0.3214, + "step": 760 + }, + { + "epoch": 0.17654564435680314, + "grad_norm": 13.353201137951201, + "learning_rate": 2e-06, + "loss": 0.3699, + "step": 761 + }, + { + "epoch": 0.17677763600510382, + "grad_norm": 9.201780284194042, + "learning_rate": 2e-06, + "loss": 0.3144, + "step": 762 + }, + { + "epoch": 0.17700962765340447, + "grad_norm": 9.586095433749264, + "learning_rate": 2e-06, + "loss": 0.2671, + "step": 763 + }, + { + "epoch": 0.17724161930170515, + "grad_norm": 18.8866400319037, + "learning_rate": 2e-06, + "loss": 0.3327, + "step": 764 + }, + { + "epoch": 0.1774736109500058, + "grad_norm": 16.97686477233965, + "learning_rate": 2e-06, + "loss": 0.3186, + "step": 765 + }, + { + "epoch": 0.17770560259830645, + "grad_norm": 23.109981923087553, + "learning_rate": 2e-06, + "loss": 0.3106, + "step": 766 + }, + { + "epoch": 0.17793759424660713, + "grad_norm": 16.775430605152785, + "learning_rate": 2e-06, + "loss": 0.3196, + "step": 767 + }, + { + "epoch": 0.17816958589490778, + "grad_norm": 11.87197374426931, + "learning_rate": 2e-06, + "loss": 0.2493, + "step": 768 + }, + { + "epoch": 0.17840157754320846, + "grad_norm": 18.34787640158497, + "learning_rate": 2e-06, + "loss": 0.4318, + "step": 769 + }, + { + "epoch": 0.1786335691915091, + "grad_norm": 12.462233834921067, + "learning_rate": 2e-06, + "loss": 0.3509, + "step": 770 + }, + { + "epoch": 0.17886556083980976, + "grad_norm": 9.523701984608357, + "learning_rate": 2e-06, + "loss": 0.2247, + "step": 771 + }, + { + "epoch": 0.17909755248811043, + "grad_norm": 20.77996685844607, + "learning_rate": 2e-06, + "loss": 0.3783, + "step": 772 + }, + { + "epoch": 0.17932954413641108, + "grad_norm": 14.594333033140241, + "learning_rate": 2e-06, + "loss": 0.2588, + "step": 773 + }, + { + "epoch": 0.17956153578471176, + "grad_norm": 11.82242238113082, + "learning_rate": 2e-06, + "loss": 0.2729, + "step": 774 + }, + { + "epoch": 0.1797935274330124, + "grad_norm": 13.30174298015603, + "learning_rate": 2e-06, + "loss": 0.3338, + "step": 775 + }, + { + "epoch": 0.18002551908131306, + "grad_norm": 21.963362480858713, + "learning_rate": 2e-06, + "loss": 0.4504, + "step": 776 + }, + { + "epoch": 0.18025751072961374, + "grad_norm": 16.089942933076735, + "learning_rate": 2e-06, + "loss": 0.2697, + "step": 777 + }, + { + "epoch": 0.1804895023779144, + "grad_norm": 22.34317850213985, + "learning_rate": 2e-06, + "loss": 0.3904, + "step": 778 + }, + { + "epoch": 0.18072149402621507, + "grad_norm": 23.94242658386868, + "learning_rate": 2e-06, + "loss": 0.38, + "step": 779 + }, + { + "epoch": 0.18095348567451572, + "grad_norm": 12.818302873757448, + "learning_rate": 2e-06, + "loss": 0.3119, + "step": 780 + }, + { + "epoch": 0.18118547732281637, + "grad_norm": 15.137628022957086, + "learning_rate": 2e-06, + "loss": 0.3781, + "step": 781 + }, + { + "epoch": 0.18141746897111705, + "grad_norm": 24.94966829807091, + "learning_rate": 2e-06, + "loss": 0.3605, + "step": 782 + }, + { + "epoch": 0.1816494606194177, + "grad_norm": 13.102831529880358, + "learning_rate": 2e-06, + "loss": 0.2806, + "step": 783 + }, + { + "epoch": 0.18188145226771837, + "grad_norm": 19.645424296337186, + "learning_rate": 2e-06, + "loss": 0.2957, + "step": 784 + }, + { + "epoch": 0.18211344391601902, + "grad_norm": 23.164457029140216, + "learning_rate": 2e-06, + "loss": 0.3374, + "step": 785 + }, + { + "epoch": 0.18234543556431967, + "grad_norm": 20.373224626818722, + "learning_rate": 2e-06, + "loss": 0.3663, + "step": 786 + }, + { + "epoch": 0.18257742721262035, + "grad_norm": 22.147470638665496, + "learning_rate": 2e-06, + "loss": 0.4535, + "step": 787 + }, + { + "epoch": 0.182809418860921, + "grad_norm": 15.781492413550604, + "learning_rate": 2e-06, + "loss": 0.3429, + "step": 788 + }, + { + "epoch": 0.18304141050922168, + "grad_norm": 14.581736302069048, + "learning_rate": 2e-06, + "loss": 0.3106, + "step": 789 + }, + { + "epoch": 0.18327340215752233, + "grad_norm": 20.75205079068296, + "learning_rate": 2e-06, + "loss": 0.2501, + "step": 790 + }, + { + "epoch": 0.18350539380582298, + "grad_norm": 11.99955208209882, + "learning_rate": 2e-06, + "loss": 0.3079, + "step": 791 + }, + { + "epoch": 0.18373738545412366, + "grad_norm": 23.105384860674214, + "learning_rate": 2e-06, + "loss": 0.3447, + "step": 792 + }, + { + "epoch": 0.1839693771024243, + "grad_norm": 15.03266460995024, + "learning_rate": 2e-06, + "loss": 0.3274, + "step": 793 + }, + { + "epoch": 0.184201368750725, + "grad_norm": 16.42107190060086, + "learning_rate": 2e-06, + "loss": 0.3123, + "step": 794 + }, + { + "epoch": 0.18443336039902564, + "grad_norm": 10.80224803928146, + "learning_rate": 2e-06, + "loss": 0.2789, + "step": 795 + }, + { + "epoch": 0.1846653520473263, + "grad_norm": 12.57701768453657, + "learning_rate": 2e-06, + "loss": 0.3366, + "step": 796 + }, + { + "epoch": 0.18489734369562696, + "grad_norm": 18.119585754482213, + "learning_rate": 2e-06, + "loss": 0.3537, + "step": 797 + }, + { + "epoch": 0.18512933534392761, + "grad_norm": 13.742042613570336, + "learning_rate": 2e-06, + "loss": 0.3081, + "step": 798 + }, + { + "epoch": 0.1853613269922283, + "grad_norm": 12.840783424332322, + "learning_rate": 2e-06, + "loss": 0.2905, + "step": 799 + }, + { + "epoch": 0.18559331864052894, + "grad_norm": 22.663177940728435, + "learning_rate": 2e-06, + "loss": 0.3788, + "step": 800 + }, + { + "epoch": 0.1858253102888296, + "grad_norm": 9.773808862135011, + "learning_rate": 2e-06, + "loss": 0.3019, + "step": 801 + }, + { + "epoch": 0.18605730193713027, + "grad_norm": 13.74093840790118, + "learning_rate": 2e-06, + "loss": 0.3438, + "step": 802 + }, + { + "epoch": 0.18628929358543092, + "grad_norm": 10.157630882906007, + "learning_rate": 2e-06, + "loss": 0.3606, + "step": 803 + }, + { + "epoch": 0.1865212852337316, + "grad_norm": 19.94680174631867, + "learning_rate": 2e-06, + "loss": 0.3932, + "step": 804 + }, + { + "epoch": 0.18675327688203225, + "grad_norm": 19.62427920903281, + "learning_rate": 2e-06, + "loss": 0.411, + "step": 805 + }, + { + "epoch": 0.1869852685303329, + "grad_norm": 20.749400909089836, + "learning_rate": 2e-06, + "loss": 0.3252, + "step": 806 + }, + { + "epoch": 0.18721726017863358, + "grad_norm": 16.912302806170956, + "learning_rate": 2e-06, + "loss": 0.3226, + "step": 807 + }, + { + "epoch": 0.18744925182693423, + "grad_norm": 7.836093031268887, + "learning_rate": 2e-06, + "loss": 0.3174, + "step": 808 + }, + { + "epoch": 0.1876812434752349, + "grad_norm": 10.36121215740947, + "learning_rate": 2e-06, + "loss": 0.2763, + "step": 809 + }, + { + "epoch": 0.18791323512353555, + "grad_norm": 11.951105686627765, + "learning_rate": 2e-06, + "loss": 0.2768, + "step": 810 + }, + { + "epoch": 0.1881452267718362, + "grad_norm": 14.9237046497379, + "learning_rate": 2e-06, + "loss": 0.3761, + "step": 811 + }, + { + "epoch": 0.18837721842013688, + "grad_norm": 14.974947243668653, + "learning_rate": 2e-06, + "loss": 0.3652, + "step": 812 + }, + { + "epoch": 0.18860921006843753, + "grad_norm": 21.727343631884633, + "learning_rate": 2e-06, + "loss": 0.288, + "step": 813 + }, + { + "epoch": 0.1888412017167382, + "grad_norm": 20.573896161218798, + "learning_rate": 2e-06, + "loss": 0.4486, + "step": 814 + }, + { + "epoch": 0.18907319336503886, + "grad_norm": 16.25349741628844, + "learning_rate": 2e-06, + "loss": 0.2948, + "step": 815 + }, + { + "epoch": 0.1893051850133395, + "grad_norm": 26.182205854500186, + "learning_rate": 2e-06, + "loss": 0.4274, + "step": 816 + }, + { + "epoch": 0.1895371766616402, + "grad_norm": 17.041520476227614, + "learning_rate": 2e-06, + "loss": 0.2824, + "step": 817 + }, + { + "epoch": 0.18976916830994084, + "grad_norm": 12.614284283681387, + "learning_rate": 2e-06, + "loss": 0.3327, + "step": 818 + }, + { + "epoch": 0.19000115995824152, + "grad_norm": 21.138920185559407, + "learning_rate": 2e-06, + "loss": 0.2783, + "step": 819 + }, + { + "epoch": 0.19023315160654217, + "grad_norm": 21.829436439600233, + "learning_rate": 2e-06, + "loss": 0.4277, + "step": 820 + }, + { + "epoch": 0.19046514325484282, + "grad_norm": 13.577464451034475, + "learning_rate": 2e-06, + "loss": 0.3058, + "step": 821 + }, + { + "epoch": 0.1906971349031435, + "grad_norm": 20.013660716589733, + "learning_rate": 2e-06, + "loss": 0.3382, + "step": 822 + }, + { + "epoch": 0.19092912655144414, + "grad_norm": 22.59776772609933, + "learning_rate": 2e-06, + "loss": 0.365, + "step": 823 + }, + { + "epoch": 0.19116111819974482, + "grad_norm": 14.284326073429842, + "learning_rate": 2e-06, + "loss": 0.2696, + "step": 824 + }, + { + "epoch": 0.19139310984804547, + "grad_norm": 21.38410672676771, + "learning_rate": 2e-06, + "loss": 0.4893, + "step": 825 + }, + { + "epoch": 0.19162510149634612, + "grad_norm": 15.799118469111308, + "learning_rate": 2e-06, + "loss": 0.3119, + "step": 826 + }, + { + "epoch": 0.1918570931446468, + "grad_norm": 8.29295418290862, + "learning_rate": 2e-06, + "loss": 0.353, + "step": 827 + }, + { + "epoch": 0.19208908479294745, + "grad_norm": 11.550631260812212, + "learning_rate": 2e-06, + "loss": 0.2806, + "step": 828 + }, + { + "epoch": 0.19232107644124813, + "grad_norm": 15.316290042899775, + "learning_rate": 2e-06, + "loss": 0.3393, + "step": 829 + }, + { + "epoch": 0.19255306808954878, + "grad_norm": 19.489646646108323, + "learning_rate": 2e-06, + "loss": 0.281, + "step": 830 + }, + { + "epoch": 0.19278505973784943, + "grad_norm": 9.929271193196863, + "learning_rate": 2e-06, + "loss": 0.2407, + "step": 831 + }, + { + "epoch": 0.1930170513861501, + "grad_norm": 14.739191818389179, + "learning_rate": 2e-06, + "loss": 0.4059, + "step": 832 + }, + { + "epoch": 0.19324904303445076, + "grad_norm": 15.977615302695332, + "learning_rate": 2e-06, + "loss": 0.3081, + "step": 833 + }, + { + "epoch": 0.1934810346827514, + "grad_norm": 17.21606409393626, + "learning_rate": 2e-06, + "loss": 0.2522, + "step": 834 + }, + { + "epoch": 0.19371302633105209, + "grad_norm": 10.464523778961931, + "learning_rate": 2e-06, + "loss": 0.3314, + "step": 835 + }, + { + "epoch": 0.19394501797935274, + "grad_norm": 21.76286939880773, + "learning_rate": 2e-06, + "loss": 0.2913, + "step": 836 + }, + { + "epoch": 0.1941770096276534, + "grad_norm": 19.049786854010637, + "learning_rate": 2e-06, + "loss": 0.3758, + "step": 837 + }, + { + "epoch": 0.19440900127595406, + "grad_norm": 11.044066209039185, + "learning_rate": 2e-06, + "loss": 0.2418, + "step": 838 + }, + { + "epoch": 0.1946409929242547, + "grad_norm": 8.866011503268515, + "learning_rate": 2e-06, + "loss": 0.3765, + "step": 839 + }, + { + "epoch": 0.1948729845725554, + "grad_norm": 13.395474628389369, + "learning_rate": 2e-06, + "loss": 0.2689, + "step": 840 + }, + { + "epoch": 0.19510497622085604, + "grad_norm": 23.036725052626185, + "learning_rate": 2e-06, + "loss": 0.32, + "step": 841 + }, + { + "epoch": 0.19533696786915672, + "grad_norm": 15.691738893928159, + "learning_rate": 2e-06, + "loss": 0.4299, + "step": 842 + }, + { + "epoch": 0.19556895951745737, + "grad_norm": 18.60561924491579, + "learning_rate": 2e-06, + "loss": 0.3365, + "step": 843 + }, + { + "epoch": 0.19580095116575802, + "grad_norm": 23.651757254556525, + "learning_rate": 2e-06, + "loss": 0.3196, + "step": 844 + }, + { + "epoch": 0.1960329428140587, + "grad_norm": 11.371947892209338, + "learning_rate": 2e-06, + "loss": 0.288, + "step": 845 + }, + { + "epoch": 0.19626493446235935, + "grad_norm": 10.090779614062537, + "learning_rate": 2e-06, + "loss": 0.3127, + "step": 846 + }, + { + "epoch": 0.19649692611066003, + "grad_norm": 17.306165471527102, + "learning_rate": 2e-06, + "loss": 0.3371, + "step": 847 + }, + { + "epoch": 0.19672891775896068, + "grad_norm": 17.408658612319552, + "learning_rate": 2e-06, + "loss": 0.3374, + "step": 848 + }, + { + "epoch": 0.19696090940726133, + "grad_norm": 31.10483274697328, + "learning_rate": 2e-06, + "loss": 0.419, + "step": 849 + }, + { + "epoch": 0.197192901055562, + "grad_norm": 13.739629867076612, + "learning_rate": 2e-06, + "loss": 0.4226, + "step": 850 + }, + { + "epoch": 0.19742489270386265, + "grad_norm": 18.10650904524103, + "learning_rate": 2e-06, + "loss": 0.4111, + "step": 851 + }, + { + "epoch": 0.19765688435216333, + "grad_norm": 9.724254659000593, + "learning_rate": 2e-06, + "loss": 0.2907, + "step": 852 + }, + { + "epoch": 0.19788887600046398, + "grad_norm": 11.70840979887669, + "learning_rate": 2e-06, + "loss": 0.3338, + "step": 853 + }, + { + "epoch": 0.19812086764876463, + "grad_norm": 18.223636670611263, + "learning_rate": 2e-06, + "loss": 0.3297, + "step": 854 + }, + { + "epoch": 0.1983528592970653, + "grad_norm": 19.969087791024695, + "learning_rate": 2e-06, + "loss": 0.3981, + "step": 855 + }, + { + "epoch": 0.19858485094536596, + "grad_norm": 24.668935395200606, + "learning_rate": 2e-06, + "loss": 0.4325, + "step": 856 + }, + { + "epoch": 0.19881684259366664, + "grad_norm": 8.569207714004078, + "learning_rate": 2e-06, + "loss": 0.2341, + "step": 857 + }, + { + "epoch": 0.1990488342419673, + "grad_norm": 18.141428991522456, + "learning_rate": 2e-06, + "loss": 0.3797, + "step": 858 + }, + { + "epoch": 0.19928082589026794, + "grad_norm": 13.83844732174976, + "learning_rate": 2e-06, + "loss": 0.3004, + "step": 859 + }, + { + "epoch": 0.19951281753856862, + "grad_norm": 7.570402229629171, + "learning_rate": 2e-06, + "loss": 0.2659, + "step": 860 + }, + { + "epoch": 0.19974480918686927, + "grad_norm": 19.151243048179083, + "learning_rate": 2e-06, + "loss": 0.3457, + "step": 861 + }, + { + "epoch": 0.19997680083516994, + "grad_norm": 9.023686026024441, + "learning_rate": 2e-06, + "loss": 0.2003, + "step": 862 + }, + { + "epoch": 0.2002087924834706, + "grad_norm": 10.652610316585804, + "learning_rate": 2e-06, + "loss": 0.2778, + "step": 863 + }, + { + "epoch": 0.20044078413177124, + "grad_norm": 21.104189125544075, + "learning_rate": 2e-06, + "loss": 0.3277, + "step": 864 + }, + { + "epoch": 0.20067277578007192, + "grad_norm": 17.871109485100916, + "learning_rate": 2e-06, + "loss": 0.3817, + "step": 865 + }, + { + "epoch": 0.20090476742837257, + "grad_norm": 10.595302363200277, + "learning_rate": 2e-06, + "loss": 0.3649, + "step": 866 + }, + { + "epoch": 0.20113675907667325, + "grad_norm": 18.58896905743985, + "learning_rate": 2e-06, + "loss": 0.2794, + "step": 867 + }, + { + "epoch": 0.2013687507249739, + "grad_norm": 8.56618420358794, + "learning_rate": 2e-06, + "loss": 0.332, + "step": 868 + }, + { + "epoch": 0.20160074237327455, + "grad_norm": 7.871954271104144, + "learning_rate": 2e-06, + "loss": 0.3256, + "step": 869 + }, + { + "epoch": 0.20183273402157523, + "grad_norm": 14.482295828257122, + "learning_rate": 2e-06, + "loss": 0.3513, + "step": 870 + }, + { + "epoch": 0.20206472566987588, + "grad_norm": 15.59436837133702, + "learning_rate": 2e-06, + "loss": 0.4059, + "step": 871 + }, + { + "epoch": 0.20229671731817656, + "grad_norm": 11.695894552841496, + "learning_rate": 2e-06, + "loss": 0.271, + "step": 872 + }, + { + "epoch": 0.2025287089664772, + "grad_norm": 9.279739517381302, + "learning_rate": 2e-06, + "loss": 0.3572, + "step": 873 + }, + { + "epoch": 0.20276070061477786, + "grad_norm": 9.436327949040212, + "learning_rate": 2e-06, + "loss": 0.3279, + "step": 874 + }, + { + "epoch": 0.20299269226307853, + "grad_norm": 14.37356404419197, + "learning_rate": 2e-06, + "loss": 0.3537, + "step": 875 + }, + { + "epoch": 0.20322468391137918, + "grad_norm": 10.460306167068557, + "learning_rate": 2e-06, + "loss": 0.3193, + "step": 876 + }, + { + "epoch": 0.20345667555967986, + "grad_norm": 9.957388619811127, + "learning_rate": 2e-06, + "loss": 0.2464, + "step": 877 + }, + { + "epoch": 0.2036886672079805, + "grad_norm": 14.043925791536124, + "learning_rate": 2e-06, + "loss": 0.3229, + "step": 878 + }, + { + "epoch": 0.20392065885628116, + "grad_norm": 10.916638475637072, + "learning_rate": 2e-06, + "loss": 0.3094, + "step": 879 + }, + { + "epoch": 0.20415265050458184, + "grad_norm": 14.207690648062343, + "learning_rate": 2e-06, + "loss": 0.2728, + "step": 880 + }, + { + "epoch": 0.2043846421528825, + "grad_norm": 29.774947857721138, + "learning_rate": 2e-06, + "loss": 0.4734, + "step": 881 + }, + { + "epoch": 0.20461663380118317, + "grad_norm": 12.886953112164875, + "learning_rate": 2e-06, + "loss": 0.2808, + "step": 882 + }, + { + "epoch": 0.20484862544948382, + "grad_norm": 54.83652013490153, + "learning_rate": 2e-06, + "loss": 0.4345, + "step": 883 + }, + { + "epoch": 0.20508061709778447, + "grad_norm": 13.836004473480973, + "learning_rate": 2e-06, + "loss": 0.2673, + "step": 884 + }, + { + "epoch": 0.20531260874608515, + "grad_norm": 16.236716597033208, + "learning_rate": 2e-06, + "loss": 0.3773, + "step": 885 + }, + { + "epoch": 0.2055446003943858, + "grad_norm": 9.584331646818612, + "learning_rate": 2e-06, + "loss": 0.2832, + "step": 886 + }, + { + "epoch": 0.20577659204268647, + "grad_norm": 14.986402451734643, + "learning_rate": 2e-06, + "loss": 0.3277, + "step": 887 + }, + { + "epoch": 0.20600858369098712, + "grad_norm": 17.0253000035854, + "learning_rate": 2e-06, + "loss": 0.3905, + "step": 888 + }, + { + "epoch": 0.20624057533928777, + "grad_norm": 16.450631095420082, + "learning_rate": 2e-06, + "loss": 0.4094, + "step": 889 + }, + { + "epoch": 0.20647256698758845, + "grad_norm": 18.750612286588492, + "learning_rate": 2e-06, + "loss": 0.3131, + "step": 890 + }, + { + "epoch": 0.2067045586358891, + "grad_norm": 18.73328993359991, + "learning_rate": 2e-06, + "loss": 0.3061, + "step": 891 + }, + { + "epoch": 0.20693655028418978, + "grad_norm": 12.70523808816334, + "learning_rate": 2e-06, + "loss": 0.282, + "step": 892 + }, + { + "epoch": 0.20716854193249043, + "grad_norm": 19.8824827414663, + "learning_rate": 2e-06, + "loss": 0.2741, + "step": 893 + }, + { + "epoch": 0.20740053358079108, + "grad_norm": 17.324392469861575, + "learning_rate": 2e-06, + "loss": 0.2701, + "step": 894 + }, + { + "epoch": 0.20763252522909176, + "grad_norm": 13.78937251217679, + "learning_rate": 2e-06, + "loss": 0.328, + "step": 895 + }, + { + "epoch": 0.2078645168773924, + "grad_norm": 16.027091968408882, + "learning_rate": 2e-06, + "loss": 0.3655, + "step": 896 + }, + { + "epoch": 0.20809650852569309, + "grad_norm": 7.60913786100639, + "learning_rate": 2e-06, + "loss": 0.1704, + "step": 897 + }, + { + "epoch": 0.20832850017399374, + "grad_norm": 28.677874376982054, + "learning_rate": 2e-06, + "loss": 0.4329, + "step": 898 + }, + { + "epoch": 0.20856049182229439, + "grad_norm": 8.221011139573621, + "learning_rate": 2e-06, + "loss": 0.2507, + "step": 899 + }, + { + "epoch": 0.20879248347059506, + "grad_norm": 11.4744661875519, + "learning_rate": 2e-06, + "loss": 0.3259, + "step": 900 + }, + { + "epoch": 0.20902447511889571, + "grad_norm": 15.294883355532624, + "learning_rate": 2e-06, + "loss": 0.3727, + "step": 901 + }, + { + "epoch": 0.2092564667671964, + "grad_norm": 19.055402797561403, + "learning_rate": 2e-06, + "loss": 0.303, + "step": 902 + }, + { + "epoch": 0.20948845841549704, + "grad_norm": 23.698316035534873, + "learning_rate": 2e-06, + "loss": 0.3667, + "step": 903 + }, + { + "epoch": 0.2097204500637977, + "grad_norm": 22.8054584811822, + "learning_rate": 2e-06, + "loss": 0.3585, + "step": 904 + }, + { + "epoch": 0.20995244171209837, + "grad_norm": 20.083038658083378, + "learning_rate": 2e-06, + "loss": 0.3192, + "step": 905 + }, + { + "epoch": 0.21018443336039902, + "grad_norm": 22.584586110449916, + "learning_rate": 2e-06, + "loss": 0.4078, + "step": 906 + }, + { + "epoch": 0.2104164250086997, + "grad_norm": 15.037422485453048, + "learning_rate": 2e-06, + "loss": 0.2889, + "step": 907 + }, + { + "epoch": 0.21064841665700035, + "grad_norm": 20.203868005835083, + "learning_rate": 2e-06, + "loss": 0.448, + "step": 908 + }, + { + "epoch": 0.210880408305301, + "grad_norm": 15.16465590647367, + "learning_rate": 2e-06, + "loss": 0.3637, + "step": 909 + }, + { + "epoch": 0.21111239995360168, + "grad_norm": 5.2078995129577805, + "learning_rate": 2e-06, + "loss": 0.1984, + "step": 910 + }, + { + "epoch": 0.21134439160190233, + "grad_norm": 17.776999304081468, + "learning_rate": 2e-06, + "loss": 0.4419, + "step": 911 + }, + { + "epoch": 0.211576383250203, + "grad_norm": 20.801335368698737, + "learning_rate": 2e-06, + "loss": 0.3527, + "step": 912 + }, + { + "epoch": 0.21180837489850365, + "grad_norm": 12.89318139470534, + "learning_rate": 2e-06, + "loss": 0.2925, + "step": 913 + }, + { + "epoch": 0.2120403665468043, + "grad_norm": 13.279124604676984, + "learning_rate": 2e-06, + "loss": 0.2692, + "step": 914 + }, + { + "epoch": 0.21227235819510498, + "grad_norm": 31.361102208165416, + "learning_rate": 2e-06, + "loss": 0.4164, + "step": 915 + }, + { + "epoch": 0.21250434984340563, + "grad_norm": 14.926166059180384, + "learning_rate": 2e-06, + "loss": 0.2819, + "step": 916 + }, + { + "epoch": 0.2127363414917063, + "grad_norm": 9.505395111229008, + "learning_rate": 2e-06, + "loss": 0.3, + "step": 917 + }, + { + "epoch": 0.21296833314000696, + "grad_norm": 18.19667325838775, + "learning_rate": 2e-06, + "loss": 0.2777, + "step": 918 + }, + { + "epoch": 0.2132003247883076, + "grad_norm": 34.244764055740525, + "learning_rate": 2e-06, + "loss": 0.2731, + "step": 919 + }, + { + "epoch": 0.2134323164366083, + "grad_norm": 13.827735412000134, + "learning_rate": 2e-06, + "loss": 0.312, + "step": 920 + }, + { + "epoch": 0.21366430808490894, + "grad_norm": 20.705363084487367, + "learning_rate": 2e-06, + "loss": 0.3044, + "step": 921 + }, + { + "epoch": 0.21389629973320962, + "grad_norm": 23.492053633413406, + "learning_rate": 2e-06, + "loss": 0.3717, + "step": 922 + }, + { + "epoch": 0.21412829138151027, + "grad_norm": 17.796209396317078, + "learning_rate": 2e-06, + "loss": 0.3501, + "step": 923 + }, + { + "epoch": 0.21436028302981092, + "grad_norm": 12.432070649047894, + "learning_rate": 2e-06, + "loss": 0.2967, + "step": 924 + }, + { + "epoch": 0.2145922746781116, + "grad_norm": 7.377036669491922, + "learning_rate": 2e-06, + "loss": 0.2317, + "step": 925 + }, + { + "epoch": 0.21482426632641224, + "grad_norm": 9.763829157513072, + "learning_rate": 2e-06, + "loss": 0.3069, + "step": 926 + }, + { + "epoch": 0.21505625797471292, + "grad_norm": 14.067087869685949, + "learning_rate": 2e-06, + "loss": 0.3761, + "step": 927 + }, + { + "epoch": 0.21528824962301357, + "grad_norm": 18.626123583576774, + "learning_rate": 2e-06, + "loss": 0.2611, + "step": 928 + }, + { + "epoch": 0.21552024127131422, + "grad_norm": 13.482003556824175, + "learning_rate": 2e-06, + "loss": 0.3761, + "step": 929 + }, + { + "epoch": 0.2157522329196149, + "grad_norm": 13.154857034380507, + "learning_rate": 2e-06, + "loss": 0.2722, + "step": 930 + }, + { + "epoch": 0.21598422456791555, + "grad_norm": 31.45359846464405, + "learning_rate": 2e-06, + "loss": 0.3893, + "step": 931 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 17.454676536343737, + "learning_rate": 2e-06, + "loss": 0.3429, + "step": 932 + }, + { + "epoch": 0.21644820786451688, + "grad_norm": 13.781514708400756, + "learning_rate": 2e-06, + "loss": 0.3244, + "step": 933 + }, + { + "epoch": 0.21668019951281753, + "grad_norm": 17.4126190084326, + "learning_rate": 2e-06, + "loss": 0.3813, + "step": 934 + }, + { + "epoch": 0.2169121911611182, + "grad_norm": 18.19944774510841, + "learning_rate": 2e-06, + "loss": 0.3342, + "step": 935 + }, + { + "epoch": 0.21714418280941886, + "grad_norm": 18.326170459536367, + "learning_rate": 2e-06, + "loss": 0.3103, + "step": 936 + }, + { + "epoch": 0.21737617445771953, + "grad_norm": 10.250425954162694, + "learning_rate": 2e-06, + "loss": 0.3748, + "step": 937 + }, + { + "epoch": 0.21760816610602018, + "grad_norm": 16.066598951701888, + "learning_rate": 2e-06, + "loss": 0.4068, + "step": 938 + }, + { + "epoch": 0.21784015775432083, + "grad_norm": 10.189922232130474, + "learning_rate": 2e-06, + "loss": 0.3328, + "step": 939 + }, + { + "epoch": 0.2180721494026215, + "grad_norm": 11.083488151842678, + "learning_rate": 2e-06, + "loss": 0.3137, + "step": 940 + }, + { + "epoch": 0.21830414105092216, + "grad_norm": 15.80352256233468, + "learning_rate": 2e-06, + "loss": 0.417, + "step": 941 + }, + { + "epoch": 0.21853613269922284, + "grad_norm": 10.397398946874906, + "learning_rate": 2e-06, + "loss": 0.258, + "step": 942 + }, + { + "epoch": 0.2187681243475235, + "grad_norm": 6.2699124866115605, + "learning_rate": 2e-06, + "loss": 0.2945, + "step": 943 + }, + { + "epoch": 0.21900011599582414, + "grad_norm": 23.96778950909715, + "learning_rate": 2e-06, + "loss": 0.3493, + "step": 944 + }, + { + "epoch": 0.21923210764412482, + "grad_norm": 12.741819491680975, + "learning_rate": 2e-06, + "loss": 0.3068, + "step": 945 + }, + { + "epoch": 0.21946409929242547, + "grad_norm": 22.214441345100916, + "learning_rate": 2e-06, + "loss": 0.306, + "step": 946 + }, + { + "epoch": 0.21969609094072615, + "grad_norm": 12.91661021107639, + "learning_rate": 2e-06, + "loss": 0.2215, + "step": 947 + }, + { + "epoch": 0.2199280825890268, + "grad_norm": 15.890385515844484, + "learning_rate": 2e-06, + "loss": 0.3135, + "step": 948 + }, + { + "epoch": 0.22016007423732745, + "grad_norm": 17.44548027799827, + "learning_rate": 2e-06, + "loss": 0.4428, + "step": 949 + }, + { + "epoch": 0.22039206588562812, + "grad_norm": 29.612385060726385, + "learning_rate": 2e-06, + "loss": 0.4595, + "step": 950 + }, + { + "epoch": 0.22062405753392877, + "grad_norm": 14.192682314656473, + "learning_rate": 2e-06, + "loss": 0.3099, + "step": 951 + }, + { + "epoch": 0.22085604918222945, + "grad_norm": 22.44367441783778, + "learning_rate": 2e-06, + "loss": 0.4008, + "step": 952 + }, + { + "epoch": 0.2210880408305301, + "grad_norm": 16.51988527879832, + "learning_rate": 2e-06, + "loss": 0.3694, + "step": 953 + }, + { + "epoch": 0.22132003247883075, + "grad_norm": 13.145552437440195, + "learning_rate": 2e-06, + "loss": 0.2829, + "step": 954 + }, + { + "epoch": 0.22155202412713143, + "grad_norm": 17.045976637894437, + "learning_rate": 2e-06, + "loss": 0.3363, + "step": 955 + }, + { + "epoch": 0.22178401577543208, + "grad_norm": 7.152755972453679, + "learning_rate": 2e-06, + "loss": 0.2048, + "step": 956 + }, + { + "epoch": 0.22201600742373276, + "grad_norm": 14.476364773777387, + "learning_rate": 2e-06, + "loss": 0.3139, + "step": 957 + }, + { + "epoch": 0.2222479990720334, + "grad_norm": 20.17448656913104, + "learning_rate": 2e-06, + "loss": 0.4056, + "step": 958 + }, + { + "epoch": 0.22247999072033406, + "grad_norm": 17.447954517156088, + "learning_rate": 2e-06, + "loss": 0.3249, + "step": 959 + }, + { + "epoch": 0.22271198236863474, + "grad_norm": 16.47574513287585, + "learning_rate": 2e-06, + "loss": 0.3578, + "step": 960 + }, + { + "epoch": 0.2229439740169354, + "grad_norm": 18.833569055388487, + "learning_rate": 2e-06, + "loss": 0.2698, + "step": 961 + }, + { + "epoch": 0.22317596566523606, + "grad_norm": 12.878810264947404, + "learning_rate": 2e-06, + "loss": 0.322, + "step": 962 + }, + { + "epoch": 0.22340795731353671, + "grad_norm": 9.754848029157317, + "learning_rate": 2e-06, + "loss": 0.3028, + "step": 963 + }, + { + "epoch": 0.22363994896183736, + "grad_norm": 10.458699501656957, + "learning_rate": 2e-06, + "loss": 0.2659, + "step": 964 + }, + { + "epoch": 0.22387194061013804, + "grad_norm": 9.213002126694942, + "learning_rate": 2e-06, + "loss": 0.2413, + "step": 965 + }, + { + "epoch": 0.2241039322584387, + "grad_norm": 22.77852040491952, + "learning_rate": 2e-06, + "loss": 0.3402, + "step": 966 + }, + { + "epoch": 0.22433592390673937, + "grad_norm": 19.97429495238275, + "learning_rate": 2e-06, + "loss": 0.3356, + "step": 967 + }, + { + "epoch": 0.22456791555504002, + "grad_norm": 22.725310459439754, + "learning_rate": 2e-06, + "loss": 0.3688, + "step": 968 + }, + { + "epoch": 0.22479990720334067, + "grad_norm": 15.880456669293403, + "learning_rate": 2e-06, + "loss": 0.2583, + "step": 969 + }, + { + "epoch": 0.22503189885164135, + "grad_norm": 12.562977799213726, + "learning_rate": 2e-06, + "loss": 0.3211, + "step": 970 + }, + { + "epoch": 0.225263890499942, + "grad_norm": 38.52385320084602, + "learning_rate": 2e-06, + "loss": 0.4099, + "step": 971 + }, + { + "epoch": 0.22549588214824268, + "grad_norm": 19.568782203246148, + "learning_rate": 2e-06, + "loss": 0.3392, + "step": 972 + }, + { + "epoch": 0.22572787379654333, + "grad_norm": 22.75231297936888, + "learning_rate": 2e-06, + "loss": 0.3981, + "step": 973 + }, + { + "epoch": 0.22595986544484398, + "grad_norm": 23.093165664624546, + "learning_rate": 2e-06, + "loss": 0.4342, + "step": 974 + }, + { + "epoch": 0.22619185709314465, + "grad_norm": 21.85862109103549, + "learning_rate": 2e-06, + "loss": 0.3366, + "step": 975 + }, + { + "epoch": 0.2264238487414453, + "grad_norm": 15.783198583306087, + "learning_rate": 2e-06, + "loss": 0.3545, + "step": 976 + }, + { + "epoch": 0.22665584038974598, + "grad_norm": 20.00115524370116, + "learning_rate": 2e-06, + "loss": 0.419, + "step": 977 + }, + { + "epoch": 0.22688783203804663, + "grad_norm": 17.13575397262277, + "learning_rate": 2e-06, + "loss": 0.4164, + "step": 978 + }, + { + "epoch": 0.22711982368634728, + "grad_norm": 20.66236442506612, + "learning_rate": 2e-06, + "loss": 0.3921, + "step": 979 + }, + { + "epoch": 0.22735181533464796, + "grad_norm": 26.425130152598022, + "learning_rate": 2e-06, + "loss": 0.4785, + "step": 980 + }, + { + "epoch": 0.2275838069829486, + "grad_norm": 18.382125908922546, + "learning_rate": 2e-06, + "loss": 0.3235, + "step": 981 + }, + { + "epoch": 0.2278157986312493, + "grad_norm": 22.287573999128128, + "learning_rate": 2e-06, + "loss": 0.3582, + "step": 982 + }, + { + "epoch": 0.22804779027954994, + "grad_norm": 15.318127091663474, + "learning_rate": 2e-06, + "loss": 0.3105, + "step": 983 + }, + { + "epoch": 0.2282797819278506, + "grad_norm": 15.2128289752655, + "learning_rate": 2e-06, + "loss": 0.2805, + "step": 984 + }, + { + "epoch": 0.22851177357615127, + "grad_norm": 13.6362169257321, + "learning_rate": 2e-06, + "loss": 0.3603, + "step": 985 + }, + { + "epoch": 0.22874376522445192, + "grad_norm": 19.427830286160788, + "learning_rate": 2e-06, + "loss": 0.3149, + "step": 986 + }, + { + "epoch": 0.22897575687275257, + "grad_norm": 10.236059122445331, + "learning_rate": 2e-06, + "loss": 0.341, + "step": 987 + }, + { + "epoch": 0.22920774852105325, + "grad_norm": 20.821455151022416, + "learning_rate": 2e-06, + "loss": 0.4091, + "step": 988 + }, + { + "epoch": 0.2294397401693539, + "grad_norm": 45.94263705822538, + "learning_rate": 2e-06, + "loss": 0.2858, + "step": 989 + }, + { + "epoch": 0.22967173181765457, + "grad_norm": 10.338890421609332, + "learning_rate": 2e-06, + "loss": 0.378, + "step": 990 + }, + { + "epoch": 0.22990372346595522, + "grad_norm": 16.299094568182728, + "learning_rate": 2e-06, + "loss": 0.3556, + "step": 991 + }, + { + "epoch": 0.23013571511425587, + "grad_norm": 26.428391344147883, + "learning_rate": 2e-06, + "loss": 0.4697, + "step": 992 + }, + { + "epoch": 0.23036770676255655, + "grad_norm": 13.437299905823535, + "learning_rate": 2e-06, + "loss": 0.3016, + "step": 993 + }, + { + "epoch": 0.2305996984108572, + "grad_norm": 12.436162205821939, + "learning_rate": 2e-06, + "loss": 0.3067, + "step": 994 + }, + { + "epoch": 0.23083169005915788, + "grad_norm": 9.291303336441867, + "learning_rate": 2e-06, + "loss": 0.2107, + "step": 995 + }, + { + "epoch": 0.23106368170745853, + "grad_norm": 18.667844626673965, + "learning_rate": 2e-06, + "loss": 0.277, + "step": 996 + }, + { + "epoch": 0.23129567335575918, + "grad_norm": 10.092912796644061, + "learning_rate": 2e-06, + "loss": 0.3074, + "step": 997 + }, + { + "epoch": 0.23152766500405986, + "grad_norm": 9.506478549775954, + "learning_rate": 2e-06, + "loss": 0.213, + "step": 998 + }, + { + "epoch": 0.2317596566523605, + "grad_norm": 9.16061150676762, + "learning_rate": 2e-06, + "loss": 0.2959, + "step": 999 + }, + { + "epoch": 0.23199164830066119, + "grad_norm": 13.513819503869733, + "learning_rate": 2e-06, + "loss": 0.3028, + "step": 1000 + }, + { + "epoch": 0.23222363994896184, + "grad_norm": 12.741939938596376, + "learning_rate": 2e-06, + "loss": 0.3136, + "step": 1001 + }, + { + "epoch": 0.23245563159726249, + "grad_norm": 19.698673117365807, + "learning_rate": 2e-06, + "loss": 0.3715, + "step": 1002 + }, + { + "epoch": 0.23268762324556316, + "grad_norm": 19.48596566945166, + "learning_rate": 2e-06, + "loss": 0.3778, + "step": 1003 + }, + { + "epoch": 0.2329196148938638, + "grad_norm": 13.556562374482986, + "learning_rate": 2e-06, + "loss": 0.3665, + "step": 1004 + }, + { + "epoch": 0.2331516065421645, + "grad_norm": 9.87402518242158, + "learning_rate": 2e-06, + "loss": 0.3285, + "step": 1005 + }, + { + "epoch": 0.23338359819046514, + "grad_norm": 13.973925681529213, + "learning_rate": 2e-06, + "loss": 0.3313, + "step": 1006 + }, + { + "epoch": 0.2336155898387658, + "grad_norm": 12.270413839177959, + "learning_rate": 2e-06, + "loss": 0.2936, + "step": 1007 + }, + { + "epoch": 0.23384758148706647, + "grad_norm": 10.414829485394955, + "learning_rate": 2e-06, + "loss": 0.3295, + "step": 1008 + }, + { + "epoch": 0.23407957313536712, + "grad_norm": 8.359953878679415, + "learning_rate": 2e-06, + "loss": 0.2915, + "step": 1009 + }, + { + "epoch": 0.2343115647836678, + "grad_norm": 21.553315281917737, + "learning_rate": 2e-06, + "loss": 0.244, + "step": 1010 + }, + { + "epoch": 0.23454355643196845, + "grad_norm": 15.282341825830445, + "learning_rate": 2e-06, + "loss": 0.364, + "step": 1011 + }, + { + "epoch": 0.2347755480802691, + "grad_norm": 11.105284562316186, + "learning_rate": 2e-06, + "loss": 0.3318, + "step": 1012 + }, + { + "epoch": 0.23500753972856978, + "grad_norm": 26.70366182684123, + "learning_rate": 2e-06, + "loss": 0.4745, + "step": 1013 + }, + { + "epoch": 0.23523953137687043, + "grad_norm": 13.969126715794628, + "learning_rate": 2e-06, + "loss": 0.3891, + "step": 1014 + }, + { + "epoch": 0.2354715230251711, + "grad_norm": 7.149208235386196, + "learning_rate": 2e-06, + "loss": 0.2255, + "step": 1015 + }, + { + "epoch": 0.23570351467347175, + "grad_norm": 9.282816481833471, + "learning_rate": 2e-06, + "loss": 0.2991, + "step": 1016 + }, + { + "epoch": 0.2359355063217724, + "grad_norm": 11.997722779384231, + "learning_rate": 2e-06, + "loss": 0.3355, + "step": 1017 + }, + { + "epoch": 0.23616749797007308, + "grad_norm": 7.261743927913489, + "learning_rate": 2e-06, + "loss": 0.2568, + "step": 1018 + }, + { + "epoch": 0.23639948961837373, + "grad_norm": 9.285412136288146, + "learning_rate": 2e-06, + "loss": 0.2552, + "step": 1019 + }, + { + "epoch": 0.2366314812666744, + "grad_norm": 14.550212842325665, + "learning_rate": 2e-06, + "loss": 0.2656, + "step": 1020 + }, + { + "epoch": 0.23686347291497506, + "grad_norm": 13.672680558038127, + "learning_rate": 2e-06, + "loss": 0.3041, + "step": 1021 + }, + { + "epoch": 0.2370954645632757, + "grad_norm": 13.175353642289746, + "learning_rate": 2e-06, + "loss": 0.2793, + "step": 1022 + }, + { + "epoch": 0.2373274562115764, + "grad_norm": 14.857712932971321, + "learning_rate": 2e-06, + "loss": 0.2785, + "step": 1023 + }, + { + "epoch": 0.23755944785987704, + "grad_norm": 14.940824776770855, + "learning_rate": 2e-06, + "loss": 0.2517, + "step": 1024 + }, + { + "epoch": 0.23779143950817772, + "grad_norm": 27.551149012916042, + "learning_rate": 2e-06, + "loss": 0.412, + "step": 1025 + }, + { + "epoch": 0.23802343115647837, + "grad_norm": 13.21079485337451, + "learning_rate": 2e-06, + "loss": 0.2841, + "step": 1026 + }, + { + "epoch": 0.23825542280477902, + "grad_norm": 8.073937246007285, + "learning_rate": 2e-06, + "loss": 0.4117, + "step": 1027 + }, + { + "epoch": 0.2384874144530797, + "grad_norm": 15.211564630308953, + "learning_rate": 2e-06, + "loss": 0.3509, + "step": 1028 + }, + { + "epoch": 0.23871940610138034, + "grad_norm": 6.753994429637856, + "learning_rate": 2e-06, + "loss": 0.2745, + "step": 1029 + }, + { + "epoch": 0.23895139774968102, + "grad_norm": 16.579544298722894, + "learning_rate": 2e-06, + "loss": 0.3769, + "step": 1030 + }, + { + "epoch": 0.23918338939798167, + "grad_norm": 12.312689236209662, + "learning_rate": 2e-06, + "loss": 0.2979, + "step": 1031 + }, + { + "epoch": 0.23941538104628232, + "grad_norm": 22.961333721387835, + "learning_rate": 2e-06, + "loss": 0.2667, + "step": 1032 + }, + { + "epoch": 0.239647372694583, + "grad_norm": 26.38181638368504, + "learning_rate": 2e-06, + "loss": 0.2407, + "step": 1033 + }, + { + "epoch": 0.23987936434288365, + "grad_norm": 21.122533742687466, + "learning_rate": 2e-06, + "loss": 0.324, + "step": 1034 + }, + { + "epoch": 0.24011135599118433, + "grad_norm": 9.385543716224886, + "learning_rate": 2e-06, + "loss": 0.2322, + "step": 1035 + }, + { + "epoch": 0.24034334763948498, + "grad_norm": 22.411151234064008, + "learning_rate": 2e-06, + "loss": 0.3298, + "step": 1036 + }, + { + "epoch": 0.24057533928778563, + "grad_norm": 8.712554848424903, + "learning_rate": 2e-06, + "loss": 0.2662, + "step": 1037 + }, + { + "epoch": 0.2408073309360863, + "grad_norm": 23.670062322774925, + "learning_rate": 2e-06, + "loss": 0.3704, + "step": 1038 + }, + { + "epoch": 0.24103932258438696, + "grad_norm": 14.416551486323577, + "learning_rate": 2e-06, + "loss": 0.339, + "step": 1039 + }, + { + "epoch": 0.24127131423268763, + "grad_norm": 11.777740161029039, + "learning_rate": 2e-06, + "loss": 0.307, + "step": 1040 + }, + { + "epoch": 0.24150330588098828, + "grad_norm": 16.333864517062604, + "learning_rate": 2e-06, + "loss": 0.3238, + "step": 1041 + }, + { + "epoch": 0.24173529752928893, + "grad_norm": 16.200709138501136, + "learning_rate": 2e-06, + "loss": 0.3817, + "step": 1042 + }, + { + "epoch": 0.2419672891775896, + "grad_norm": 16.75392028228238, + "learning_rate": 2e-06, + "loss": 0.3657, + "step": 1043 + }, + { + "epoch": 0.24219928082589026, + "grad_norm": 16.7005437223624, + "learning_rate": 2e-06, + "loss": 0.2499, + "step": 1044 + }, + { + "epoch": 0.24243127247419094, + "grad_norm": 8.676317977311099, + "learning_rate": 2e-06, + "loss": 0.3238, + "step": 1045 + }, + { + "epoch": 0.2426632641224916, + "grad_norm": 25.531733213211815, + "learning_rate": 2e-06, + "loss": 0.3677, + "step": 1046 + }, + { + "epoch": 0.24289525577079224, + "grad_norm": 15.53633794877907, + "learning_rate": 2e-06, + "loss": 0.3253, + "step": 1047 + }, + { + "epoch": 0.24312724741909292, + "grad_norm": 19.81381137188102, + "learning_rate": 2e-06, + "loss": 0.3648, + "step": 1048 + }, + { + "epoch": 0.24335923906739357, + "grad_norm": 12.25975324260386, + "learning_rate": 2e-06, + "loss": 0.1973, + "step": 1049 + }, + { + "epoch": 0.24359123071569425, + "grad_norm": 14.00865679850547, + "learning_rate": 2e-06, + "loss": 0.3152, + "step": 1050 + }, + { + "epoch": 0.2438232223639949, + "grad_norm": 12.392977697215564, + "learning_rate": 2e-06, + "loss": 0.3584, + "step": 1051 + }, + { + "epoch": 0.24405521401229555, + "grad_norm": 13.265639482781935, + "learning_rate": 2e-06, + "loss": 0.3614, + "step": 1052 + }, + { + "epoch": 0.24428720566059622, + "grad_norm": 12.65724022381264, + "learning_rate": 2e-06, + "loss": 0.2673, + "step": 1053 + }, + { + "epoch": 0.24451919730889687, + "grad_norm": 19.42837132743988, + "learning_rate": 2e-06, + "loss": 0.3473, + "step": 1054 + }, + { + "epoch": 0.24475118895719755, + "grad_norm": 14.077878346698146, + "learning_rate": 2e-06, + "loss": 0.3495, + "step": 1055 + }, + { + "epoch": 0.2449831806054982, + "grad_norm": 14.184080625576248, + "learning_rate": 2e-06, + "loss": 0.2904, + "step": 1056 + }, + { + "epoch": 0.24521517225379885, + "grad_norm": 14.88015498904286, + "learning_rate": 2e-06, + "loss": 0.2611, + "step": 1057 + }, + { + "epoch": 0.24544716390209953, + "grad_norm": 18.393845134928288, + "learning_rate": 2e-06, + "loss": 0.3404, + "step": 1058 + }, + { + "epoch": 0.24567915555040018, + "grad_norm": 12.814313086491294, + "learning_rate": 2e-06, + "loss": 0.3263, + "step": 1059 + }, + { + "epoch": 0.24591114719870086, + "grad_norm": 10.02474449009755, + "learning_rate": 2e-06, + "loss": 0.1984, + "step": 1060 + }, + { + "epoch": 0.2461431388470015, + "grad_norm": 22.079614637832556, + "learning_rate": 2e-06, + "loss": 0.405, + "step": 1061 + }, + { + "epoch": 0.24637513049530216, + "grad_norm": 20.259832978927108, + "learning_rate": 2e-06, + "loss": 0.3395, + "step": 1062 + }, + { + "epoch": 0.24660712214360284, + "grad_norm": 15.223930734276674, + "learning_rate": 2e-06, + "loss": 0.3289, + "step": 1063 + }, + { + "epoch": 0.24683911379190349, + "grad_norm": 15.643931293073702, + "learning_rate": 2e-06, + "loss": 0.2943, + "step": 1064 + }, + { + "epoch": 0.24707110544020416, + "grad_norm": 10.985354961154403, + "learning_rate": 2e-06, + "loss": 0.2698, + "step": 1065 + }, + { + "epoch": 0.24730309708850481, + "grad_norm": 14.823172015971181, + "learning_rate": 2e-06, + "loss": 0.2952, + "step": 1066 + }, + { + "epoch": 0.24753508873680546, + "grad_norm": 23.05124526115744, + "learning_rate": 2e-06, + "loss": 0.3497, + "step": 1067 + }, + { + "epoch": 0.24776708038510614, + "grad_norm": 20.657372065022596, + "learning_rate": 2e-06, + "loss": 0.3718, + "step": 1068 + }, + { + "epoch": 0.2479990720334068, + "grad_norm": 8.63115021917154, + "learning_rate": 2e-06, + "loss": 0.2653, + "step": 1069 + }, + { + "epoch": 0.24823106368170747, + "grad_norm": 9.151095904091301, + "learning_rate": 2e-06, + "loss": 0.2463, + "step": 1070 + }, + { + "epoch": 0.24846305533000812, + "grad_norm": 17.992692596229116, + "learning_rate": 2e-06, + "loss": 0.2568, + "step": 1071 + }, + { + "epoch": 0.24869504697830877, + "grad_norm": 14.4339784894775, + "learning_rate": 2e-06, + "loss": 0.298, + "step": 1072 + }, + { + "epoch": 0.24892703862660945, + "grad_norm": 12.390189004009237, + "learning_rate": 2e-06, + "loss": 0.2571, + "step": 1073 + }, + { + "epoch": 0.2491590302749101, + "grad_norm": 12.565613467230603, + "learning_rate": 2e-06, + "loss": 0.2154, + "step": 1074 + }, + { + "epoch": 0.24939102192321078, + "grad_norm": 23.2959019541224, + "learning_rate": 2e-06, + "loss": 0.3911, + "step": 1075 + }, + { + "epoch": 0.24962301357151143, + "grad_norm": 14.039259928255964, + "learning_rate": 2e-06, + "loss": 0.2285, + "step": 1076 + }, + { + "epoch": 0.24985500521981208, + "grad_norm": 19.598970977130815, + "learning_rate": 2e-06, + "loss": 0.4487, + "step": 1077 + }, + { + "epoch": 0.25008699686811275, + "grad_norm": 12.73183841773306, + "learning_rate": 2e-06, + "loss": 0.3772, + "step": 1078 + }, + { + "epoch": 0.25031898851641343, + "grad_norm": 17.74810849129701, + "learning_rate": 2e-06, + "loss": 0.3162, + "step": 1079 + }, + { + "epoch": 0.25055098016471405, + "grad_norm": 23.191357988387217, + "learning_rate": 2e-06, + "loss": 0.4153, + "step": 1080 + }, + { + "epoch": 0.25078297181301473, + "grad_norm": 20.19849388943989, + "learning_rate": 2e-06, + "loss": 0.4059, + "step": 1081 + }, + { + "epoch": 0.2510149634613154, + "grad_norm": 11.232999446060697, + "learning_rate": 2e-06, + "loss": 0.3919, + "step": 1082 + }, + { + "epoch": 0.25124695510961603, + "grad_norm": 16.45997289285503, + "learning_rate": 2e-06, + "loss": 0.3498, + "step": 1083 + }, + { + "epoch": 0.2514789467579167, + "grad_norm": 15.723585351231751, + "learning_rate": 2e-06, + "loss": 0.2632, + "step": 1084 + }, + { + "epoch": 0.2517109384062174, + "grad_norm": 7.330028584689176, + "learning_rate": 2e-06, + "loss": 0.2012, + "step": 1085 + }, + { + "epoch": 0.251942930054518, + "grad_norm": 13.88113627267473, + "learning_rate": 2e-06, + "loss": 0.2686, + "step": 1086 + }, + { + "epoch": 0.2521749217028187, + "grad_norm": 19.509094614015467, + "learning_rate": 2e-06, + "loss": 0.3515, + "step": 1087 + }, + { + "epoch": 0.25240691335111937, + "grad_norm": 20.16682881221863, + "learning_rate": 2e-06, + "loss": 0.4119, + "step": 1088 + }, + { + "epoch": 0.25263890499942004, + "grad_norm": 11.957155629814546, + "learning_rate": 2e-06, + "loss": 0.3324, + "step": 1089 + }, + { + "epoch": 0.25287089664772067, + "grad_norm": 21.806967508834347, + "learning_rate": 2e-06, + "loss": 0.4624, + "step": 1090 + }, + { + "epoch": 0.25310288829602134, + "grad_norm": 13.749685076401235, + "learning_rate": 2e-06, + "loss": 0.3831, + "step": 1091 + }, + { + "epoch": 0.253334879944322, + "grad_norm": 16.656854478247116, + "learning_rate": 2e-06, + "loss": 0.2794, + "step": 1092 + }, + { + "epoch": 0.25356687159262264, + "grad_norm": 17.627596146798037, + "learning_rate": 2e-06, + "loss": 0.3494, + "step": 1093 + }, + { + "epoch": 0.2537988632409233, + "grad_norm": 19.745119628195066, + "learning_rate": 2e-06, + "loss": 0.4636, + "step": 1094 + }, + { + "epoch": 0.254030854889224, + "grad_norm": 17.18830926296359, + "learning_rate": 2e-06, + "loss": 0.3584, + "step": 1095 + }, + { + "epoch": 0.2542628465375246, + "grad_norm": 9.4163939741239, + "learning_rate": 2e-06, + "loss": 0.3597, + "step": 1096 + }, + { + "epoch": 0.2544948381858253, + "grad_norm": 16.018761774541133, + "learning_rate": 2e-06, + "loss": 0.3183, + "step": 1097 + }, + { + "epoch": 0.254726829834126, + "grad_norm": 17.99094305205517, + "learning_rate": 2e-06, + "loss": 0.3835, + "step": 1098 + }, + { + "epoch": 0.25495882148242666, + "grad_norm": 19.22211629507197, + "learning_rate": 2e-06, + "loss": 0.2728, + "step": 1099 + }, + { + "epoch": 0.2551908131307273, + "grad_norm": 11.092634757643538, + "learning_rate": 2e-06, + "loss": 0.2726, + "step": 1100 + }, + { + "epoch": 0.25542280477902796, + "grad_norm": 27.83008647407575, + "learning_rate": 2e-06, + "loss": 0.4326, + "step": 1101 + }, + { + "epoch": 0.25565479642732863, + "grad_norm": 15.001540429809646, + "learning_rate": 2e-06, + "loss": 0.3416, + "step": 1102 + }, + { + "epoch": 0.25588678807562926, + "grad_norm": 15.144143645466546, + "learning_rate": 2e-06, + "loss": 0.3476, + "step": 1103 + }, + { + "epoch": 0.25611877972392993, + "grad_norm": 15.645113127825065, + "learning_rate": 2e-06, + "loss": 0.3551, + "step": 1104 + }, + { + "epoch": 0.2563507713722306, + "grad_norm": 10.928220909805345, + "learning_rate": 2e-06, + "loss": 0.2243, + "step": 1105 + }, + { + "epoch": 0.25658276302053123, + "grad_norm": 11.445144928478546, + "learning_rate": 2e-06, + "loss": 0.2368, + "step": 1106 + }, + { + "epoch": 0.2568147546688319, + "grad_norm": 7.806367904222486, + "learning_rate": 2e-06, + "loss": 0.211, + "step": 1107 + }, + { + "epoch": 0.2570467463171326, + "grad_norm": 22.770218588703784, + "learning_rate": 2e-06, + "loss": 0.4157, + "step": 1108 + }, + { + "epoch": 0.25727873796543327, + "grad_norm": 24.118256132266875, + "learning_rate": 2e-06, + "loss": 0.3311, + "step": 1109 + }, + { + "epoch": 0.2575107296137339, + "grad_norm": 14.418260243799653, + "learning_rate": 2e-06, + "loss": 0.2665, + "step": 1110 + }, + { + "epoch": 0.25774272126203457, + "grad_norm": 17.95557610022089, + "learning_rate": 2e-06, + "loss": 0.4597, + "step": 1111 + }, + { + "epoch": 0.25797471291033525, + "grad_norm": 9.998203620016008, + "learning_rate": 2e-06, + "loss": 0.264, + "step": 1112 + }, + { + "epoch": 0.25820670455863587, + "grad_norm": 15.98410256214507, + "learning_rate": 2e-06, + "loss": 0.3399, + "step": 1113 + }, + { + "epoch": 0.25843869620693655, + "grad_norm": 14.65274151237759, + "learning_rate": 2e-06, + "loss": 0.3357, + "step": 1114 + }, + { + "epoch": 0.2586706878552372, + "grad_norm": 15.099128606884662, + "learning_rate": 2e-06, + "loss": 0.254, + "step": 1115 + }, + { + "epoch": 0.25890267950353785, + "grad_norm": 24.82947299465006, + "learning_rate": 2e-06, + "loss": 0.3305, + "step": 1116 + }, + { + "epoch": 0.2591346711518385, + "grad_norm": 13.261084761295198, + "learning_rate": 2e-06, + "loss": 0.3377, + "step": 1117 + }, + { + "epoch": 0.2593666628001392, + "grad_norm": 22.691482458679683, + "learning_rate": 2e-06, + "loss": 0.3698, + "step": 1118 + }, + { + "epoch": 0.2595986544484399, + "grad_norm": 14.41997073811382, + "learning_rate": 2e-06, + "loss": 0.4644, + "step": 1119 + }, + { + "epoch": 0.2598306460967405, + "grad_norm": 15.695126032511784, + "learning_rate": 2e-06, + "loss": 0.3084, + "step": 1120 + }, + { + "epoch": 0.2600626377450412, + "grad_norm": 20.452165103407637, + "learning_rate": 2e-06, + "loss": 0.3405, + "step": 1121 + }, + { + "epoch": 0.26029462939334186, + "grad_norm": 19.152384814044723, + "learning_rate": 2e-06, + "loss": 0.3941, + "step": 1122 + }, + { + "epoch": 0.2605266210416425, + "grad_norm": 10.929796244886692, + "learning_rate": 2e-06, + "loss": 0.3085, + "step": 1123 + }, + { + "epoch": 0.26075861268994316, + "grad_norm": 17.796433072113317, + "learning_rate": 2e-06, + "loss": 0.4109, + "step": 1124 + }, + { + "epoch": 0.26099060433824384, + "grad_norm": 10.425466920495225, + "learning_rate": 2e-06, + "loss": 0.3517, + "step": 1125 + }, + { + "epoch": 0.26122259598654446, + "grad_norm": 6.015552175860235, + "learning_rate": 2e-06, + "loss": 0.2988, + "step": 1126 + }, + { + "epoch": 0.26145458763484514, + "grad_norm": 6.126586375825967, + "learning_rate": 2e-06, + "loss": 0.2415, + "step": 1127 + }, + { + "epoch": 0.2616865792831458, + "grad_norm": 11.863623548309842, + "learning_rate": 2e-06, + "loss": 0.2739, + "step": 1128 + }, + { + "epoch": 0.2619185709314465, + "grad_norm": 14.909953393587152, + "learning_rate": 2e-06, + "loss": 0.3223, + "step": 1129 + }, + { + "epoch": 0.2621505625797471, + "grad_norm": 14.646054099158548, + "learning_rate": 2e-06, + "loss": 0.3307, + "step": 1130 + }, + { + "epoch": 0.2623825542280478, + "grad_norm": 13.629312091258925, + "learning_rate": 2e-06, + "loss": 0.3004, + "step": 1131 + }, + { + "epoch": 0.26261454587634847, + "grad_norm": 24.391131892080814, + "learning_rate": 2e-06, + "loss": 0.3546, + "step": 1132 + }, + { + "epoch": 0.2628465375246491, + "grad_norm": 9.685129053473396, + "learning_rate": 2e-06, + "loss": 0.3143, + "step": 1133 + }, + { + "epoch": 0.26307852917294977, + "grad_norm": 19.920212953026695, + "learning_rate": 2e-06, + "loss": 0.3734, + "step": 1134 + }, + { + "epoch": 0.26331052082125045, + "grad_norm": 18.873355014808933, + "learning_rate": 2e-06, + "loss": 0.2707, + "step": 1135 + }, + { + "epoch": 0.26354251246955107, + "grad_norm": 15.679849289037026, + "learning_rate": 2e-06, + "loss": 0.338, + "step": 1136 + }, + { + "epoch": 0.26377450411785175, + "grad_norm": 13.53924087047476, + "learning_rate": 2e-06, + "loss": 0.3066, + "step": 1137 + }, + { + "epoch": 0.2640064957661524, + "grad_norm": 11.745611010016228, + "learning_rate": 2e-06, + "loss": 0.3399, + "step": 1138 + }, + { + "epoch": 0.2642384874144531, + "grad_norm": 15.225621934010864, + "learning_rate": 2e-06, + "loss": 0.244, + "step": 1139 + }, + { + "epoch": 0.2644704790627537, + "grad_norm": 10.40783059783653, + "learning_rate": 2e-06, + "loss": 0.2386, + "step": 1140 + }, + { + "epoch": 0.2647024707110544, + "grad_norm": 15.851959230703185, + "learning_rate": 2e-06, + "loss": 0.3335, + "step": 1141 + }, + { + "epoch": 0.2649344623593551, + "grad_norm": 22.66210347750043, + "learning_rate": 2e-06, + "loss": 0.3033, + "step": 1142 + }, + { + "epoch": 0.2651664540076557, + "grad_norm": 26.031860194956312, + "learning_rate": 2e-06, + "loss": 0.3799, + "step": 1143 + }, + { + "epoch": 0.2653984456559564, + "grad_norm": 18.188321934019086, + "learning_rate": 2e-06, + "loss": 0.3478, + "step": 1144 + }, + { + "epoch": 0.26563043730425706, + "grad_norm": 9.418783234825963, + "learning_rate": 2e-06, + "loss": 0.2319, + "step": 1145 + }, + { + "epoch": 0.2658624289525577, + "grad_norm": 12.05426823213287, + "learning_rate": 2e-06, + "loss": 0.4227, + "step": 1146 + }, + { + "epoch": 0.26609442060085836, + "grad_norm": 26.72325329812725, + "learning_rate": 2e-06, + "loss": 0.3139, + "step": 1147 + }, + { + "epoch": 0.26632641224915904, + "grad_norm": 16.733829455551838, + "learning_rate": 2e-06, + "loss": 0.3918, + "step": 1148 + }, + { + "epoch": 0.2665584038974597, + "grad_norm": 14.249457828238011, + "learning_rate": 2e-06, + "loss": 0.242, + "step": 1149 + }, + { + "epoch": 0.26679039554576034, + "grad_norm": 19.86506070787913, + "learning_rate": 2e-06, + "loss": 0.3798, + "step": 1150 + }, + { + "epoch": 0.267022387194061, + "grad_norm": 13.956094717202392, + "learning_rate": 2e-06, + "loss": 0.2447, + "step": 1151 + }, + { + "epoch": 0.2672543788423617, + "grad_norm": 10.456531626943017, + "learning_rate": 2e-06, + "loss": 0.3394, + "step": 1152 + }, + { + "epoch": 0.2674863704906623, + "grad_norm": 11.825638890958947, + "learning_rate": 2e-06, + "loss": 0.2495, + "step": 1153 + }, + { + "epoch": 0.267718362138963, + "grad_norm": 16.889888110765746, + "learning_rate": 2e-06, + "loss": 0.3971, + "step": 1154 + }, + { + "epoch": 0.2679503537872637, + "grad_norm": 14.904612459119354, + "learning_rate": 2e-06, + "loss": 0.2268, + "step": 1155 + }, + { + "epoch": 0.2681823454355643, + "grad_norm": 15.751958535325018, + "learning_rate": 2e-06, + "loss": 0.2576, + "step": 1156 + }, + { + "epoch": 0.268414337083865, + "grad_norm": 11.142765466493485, + "learning_rate": 2e-06, + "loss": 0.3464, + "step": 1157 + }, + { + "epoch": 0.26864632873216565, + "grad_norm": 11.463689658847697, + "learning_rate": 2e-06, + "loss": 0.2759, + "step": 1158 + }, + { + "epoch": 0.26887832038046633, + "grad_norm": 15.396711358653583, + "learning_rate": 2e-06, + "loss": 0.2698, + "step": 1159 + }, + { + "epoch": 0.26911031202876695, + "grad_norm": 20.355215766836594, + "learning_rate": 2e-06, + "loss": 0.2896, + "step": 1160 + }, + { + "epoch": 0.26934230367706763, + "grad_norm": 22.16248015841271, + "learning_rate": 2e-06, + "loss": 0.3936, + "step": 1161 + }, + { + "epoch": 0.2695742953253683, + "grad_norm": 25.343187131455633, + "learning_rate": 2e-06, + "loss": 0.3586, + "step": 1162 + }, + { + "epoch": 0.26980628697366893, + "grad_norm": 7.171393164606418, + "learning_rate": 2e-06, + "loss": 0.1983, + "step": 1163 + }, + { + "epoch": 0.2700382786219696, + "grad_norm": 19.05142625020754, + "learning_rate": 2e-06, + "loss": 0.3657, + "step": 1164 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 8.95382335460153, + "learning_rate": 2e-06, + "loss": 0.3213, + "step": 1165 + }, + { + "epoch": 0.2705022619185709, + "grad_norm": 12.81735944131886, + "learning_rate": 2e-06, + "loss": 0.3069, + "step": 1166 + }, + { + "epoch": 0.2707342535668716, + "grad_norm": 17.805420961529475, + "learning_rate": 2e-06, + "loss": 0.2557, + "step": 1167 + }, + { + "epoch": 0.27096624521517226, + "grad_norm": 16.874564931891637, + "learning_rate": 2e-06, + "loss": 0.3154, + "step": 1168 + }, + { + "epoch": 0.27119823686347294, + "grad_norm": 11.799433595683006, + "learning_rate": 2e-06, + "loss": 0.2859, + "step": 1169 + }, + { + "epoch": 0.27143022851177356, + "grad_norm": 9.39700626037502, + "learning_rate": 2e-06, + "loss": 0.2184, + "step": 1170 + }, + { + "epoch": 0.27166222016007424, + "grad_norm": 17.073536459242636, + "learning_rate": 2e-06, + "loss": 0.2849, + "step": 1171 + }, + { + "epoch": 0.2718942118083749, + "grad_norm": 21.22630275472056, + "learning_rate": 2e-06, + "loss": 0.3663, + "step": 1172 + }, + { + "epoch": 0.27212620345667554, + "grad_norm": 12.59115651190941, + "learning_rate": 2e-06, + "loss": 0.3009, + "step": 1173 + }, + { + "epoch": 0.2723581951049762, + "grad_norm": 16.968572807837223, + "learning_rate": 2e-06, + "loss": 0.4332, + "step": 1174 + }, + { + "epoch": 0.2725901867532769, + "grad_norm": 9.655637385536188, + "learning_rate": 2e-06, + "loss": 0.2609, + "step": 1175 + }, + { + "epoch": 0.2728221784015775, + "grad_norm": 13.890112671278628, + "learning_rate": 2e-06, + "loss": 0.2625, + "step": 1176 + }, + { + "epoch": 0.2730541700498782, + "grad_norm": 12.606139803898643, + "learning_rate": 2e-06, + "loss": 0.3785, + "step": 1177 + }, + { + "epoch": 0.2732861616981789, + "grad_norm": 13.28056042143577, + "learning_rate": 2e-06, + "loss": 0.2906, + "step": 1178 + }, + { + "epoch": 0.27351815334647955, + "grad_norm": 30.27887213841802, + "learning_rate": 2e-06, + "loss": 0.4022, + "step": 1179 + }, + { + "epoch": 0.2737501449947802, + "grad_norm": 7.930895909161131, + "learning_rate": 2e-06, + "loss": 0.2247, + "step": 1180 + }, + { + "epoch": 0.27398213664308085, + "grad_norm": 13.533083064817841, + "learning_rate": 2e-06, + "loss": 0.2595, + "step": 1181 + }, + { + "epoch": 0.27421412829138153, + "grad_norm": 10.212297449937365, + "learning_rate": 2e-06, + "loss": 0.275, + "step": 1182 + }, + { + "epoch": 0.27444611993968215, + "grad_norm": 10.988417124060847, + "learning_rate": 2e-06, + "loss": 0.213, + "step": 1183 + }, + { + "epoch": 0.27467811158798283, + "grad_norm": 24.17803575510525, + "learning_rate": 2e-06, + "loss": 0.4077, + "step": 1184 + }, + { + "epoch": 0.2749101032362835, + "grad_norm": 9.680825814303265, + "learning_rate": 2e-06, + "loss": 0.2429, + "step": 1185 + }, + { + "epoch": 0.27514209488458413, + "grad_norm": 6.049044301937683, + "learning_rate": 2e-06, + "loss": 0.2024, + "step": 1186 + }, + { + "epoch": 0.2753740865328848, + "grad_norm": 18.00295476464138, + "learning_rate": 2e-06, + "loss": 0.2559, + "step": 1187 + }, + { + "epoch": 0.2756060781811855, + "grad_norm": 15.30270566627656, + "learning_rate": 2e-06, + "loss": 0.3257, + "step": 1188 + }, + { + "epoch": 0.27583806982948617, + "grad_norm": 21.694395810281325, + "learning_rate": 2e-06, + "loss": 0.3079, + "step": 1189 + }, + { + "epoch": 0.2760700614777868, + "grad_norm": 8.331967188153596, + "learning_rate": 2e-06, + "loss": 0.1824, + "step": 1190 + }, + { + "epoch": 0.27630205312608747, + "grad_norm": 15.415548630144007, + "learning_rate": 2e-06, + "loss": 0.3489, + "step": 1191 + }, + { + "epoch": 0.27653404477438814, + "grad_norm": 14.233338810505089, + "learning_rate": 2e-06, + "loss": 0.1883, + "step": 1192 + }, + { + "epoch": 0.27676603642268877, + "grad_norm": 13.344673584990591, + "learning_rate": 2e-06, + "loss": 0.2581, + "step": 1193 + }, + { + "epoch": 0.27699802807098944, + "grad_norm": 8.892289670198297, + "learning_rate": 2e-06, + "loss": 0.3472, + "step": 1194 + }, + { + "epoch": 0.2772300197192901, + "grad_norm": 7.283026978981145, + "learning_rate": 2e-06, + "loss": 0.2081, + "step": 1195 + }, + { + "epoch": 0.27746201136759074, + "grad_norm": 20.765644495960995, + "learning_rate": 2e-06, + "loss": 0.3821, + "step": 1196 + }, + { + "epoch": 0.2776940030158914, + "grad_norm": 11.812462517867834, + "learning_rate": 2e-06, + "loss": 0.2431, + "step": 1197 + }, + { + "epoch": 0.2779259946641921, + "grad_norm": 21.853687986115002, + "learning_rate": 2e-06, + "loss": 0.4368, + "step": 1198 + }, + { + "epoch": 0.2781579863124928, + "grad_norm": 29.536970035347192, + "learning_rate": 2e-06, + "loss": 0.2561, + "step": 1199 + }, + { + "epoch": 0.2783899779607934, + "grad_norm": 15.492748748291122, + "learning_rate": 2e-06, + "loss": 0.2616, + "step": 1200 + }, + { + "epoch": 0.2786219696090941, + "grad_norm": 11.006289448991424, + "learning_rate": 2e-06, + "loss": 0.2992, + "step": 1201 + }, + { + "epoch": 0.27885396125739476, + "grad_norm": 16.026435056899846, + "learning_rate": 2e-06, + "loss": 0.2834, + "step": 1202 + }, + { + "epoch": 0.2790859529056954, + "grad_norm": 14.183111450349523, + "learning_rate": 2e-06, + "loss": 0.3305, + "step": 1203 + }, + { + "epoch": 0.27931794455399606, + "grad_norm": 12.028101213734947, + "learning_rate": 2e-06, + "loss": 0.4311, + "step": 1204 + }, + { + "epoch": 0.27954993620229673, + "grad_norm": 15.728963078237921, + "learning_rate": 2e-06, + "loss": 0.2617, + "step": 1205 + }, + { + "epoch": 0.27978192785059736, + "grad_norm": 18.74674073643408, + "learning_rate": 2e-06, + "loss": 0.2332, + "step": 1206 + }, + { + "epoch": 0.28001391949889803, + "grad_norm": 7.8767526680463735, + "learning_rate": 2e-06, + "loss": 0.2635, + "step": 1207 + }, + { + "epoch": 0.2802459111471987, + "grad_norm": 19.589274869050506, + "learning_rate": 2e-06, + "loss": 0.3261, + "step": 1208 + }, + { + "epoch": 0.2804779027954994, + "grad_norm": 11.569333600691113, + "learning_rate": 2e-06, + "loss": 0.2491, + "step": 1209 + }, + { + "epoch": 0.2807098944438, + "grad_norm": 20.27838346921668, + "learning_rate": 2e-06, + "loss": 0.365, + "step": 1210 + }, + { + "epoch": 0.2809418860921007, + "grad_norm": 16.174668249143103, + "learning_rate": 2e-06, + "loss": 0.2677, + "step": 1211 + }, + { + "epoch": 0.28117387774040137, + "grad_norm": 9.592205036160628, + "learning_rate": 2e-06, + "loss": 0.4643, + "step": 1212 + }, + { + "epoch": 0.281405869388702, + "grad_norm": 11.243429081806028, + "learning_rate": 2e-06, + "loss": 0.2655, + "step": 1213 + }, + { + "epoch": 0.28163786103700267, + "grad_norm": 22.27363746188085, + "learning_rate": 2e-06, + "loss": 0.4154, + "step": 1214 + }, + { + "epoch": 0.28186985268530335, + "grad_norm": 16.73996394226011, + "learning_rate": 2e-06, + "loss": 0.3184, + "step": 1215 + }, + { + "epoch": 0.28210184433360397, + "grad_norm": 15.017671158539848, + "learning_rate": 2e-06, + "loss": 0.184, + "step": 1216 + }, + { + "epoch": 0.28233383598190465, + "grad_norm": 10.718819280489564, + "learning_rate": 2e-06, + "loss": 0.226, + "step": 1217 + }, + { + "epoch": 0.2825658276302053, + "grad_norm": 16.07308944496508, + "learning_rate": 2e-06, + "loss": 0.4901, + "step": 1218 + }, + { + "epoch": 0.28279781927850595, + "grad_norm": 21.358455166015435, + "learning_rate": 2e-06, + "loss": 0.3208, + "step": 1219 + }, + { + "epoch": 0.2830298109268066, + "grad_norm": 11.131587047899961, + "learning_rate": 2e-06, + "loss": 0.4496, + "step": 1220 + }, + { + "epoch": 0.2832618025751073, + "grad_norm": 13.583352811882996, + "learning_rate": 2e-06, + "loss": 0.2877, + "step": 1221 + }, + { + "epoch": 0.283493794223408, + "grad_norm": 18.275221017841407, + "learning_rate": 2e-06, + "loss": 0.346, + "step": 1222 + }, + { + "epoch": 0.2837257858717086, + "grad_norm": 13.559004500486491, + "learning_rate": 2e-06, + "loss": 0.305, + "step": 1223 + }, + { + "epoch": 0.2839577775200093, + "grad_norm": 12.193009244722145, + "learning_rate": 2e-06, + "loss": 0.3228, + "step": 1224 + }, + { + "epoch": 0.28418976916830996, + "grad_norm": 16.417106957183268, + "learning_rate": 2e-06, + "loss": 0.3148, + "step": 1225 + }, + { + "epoch": 0.2844217608166106, + "grad_norm": 6.873063427629634, + "learning_rate": 2e-06, + "loss": 0.2427, + "step": 1226 + }, + { + "epoch": 0.28465375246491126, + "grad_norm": 14.000907125813223, + "learning_rate": 2e-06, + "loss": 0.266, + "step": 1227 + }, + { + "epoch": 0.28488574411321194, + "grad_norm": 6.750238251953412, + "learning_rate": 2e-06, + "loss": 0.2787, + "step": 1228 + }, + { + "epoch": 0.28511773576151256, + "grad_norm": 9.187628279617533, + "learning_rate": 2e-06, + "loss": 0.3106, + "step": 1229 + }, + { + "epoch": 0.28534972740981324, + "grad_norm": 26.616245578904888, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 1230 + }, + { + "epoch": 0.2855817190581139, + "grad_norm": 14.429684634728005, + "learning_rate": 2e-06, + "loss": 0.3717, + "step": 1231 + }, + { + "epoch": 0.2858137107064146, + "grad_norm": 15.696007478073717, + "learning_rate": 2e-06, + "loss": 0.3244, + "step": 1232 + }, + { + "epoch": 0.2860457023547152, + "grad_norm": 21.349186599737536, + "learning_rate": 2e-06, + "loss": 0.3987, + "step": 1233 + }, + { + "epoch": 0.2862776940030159, + "grad_norm": 8.12123750761895, + "learning_rate": 2e-06, + "loss": 0.2641, + "step": 1234 + }, + { + "epoch": 0.28650968565131657, + "grad_norm": 15.21984604745377, + "learning_rate": 2e-06, + "loss": 0.2997, + "step": 1235 + }, + { + "epoch": 0.2867416772996172, + "grad_norm": 12.726342178155084, + "learning_rate": 2e-06, + "loss": 0.3333, + "step": 1236 + }, + { + "epoch": 0.28697366894791787, + "grad_norm": 16.90525595728918, + "learning_rate": 2e-06, + "loss": 0.3284, + "step": 1237 + }, + { + "epoch": 0.28720566059621855, + "grad_norm": 17.579056592317873, + "learning_rate": 2e-06, + "loss": 0.3973, + "step": 1238 + }, + { + "epoch": 0.28743765224451917, + "grad_norm": 15.35796271778437, + "learning_rate": 2e-06, + "loss": 0.3152, + "step": 1239 + }, + { + "epoch": 0.28766964389281985, + "grad_norm": 21.02104399086055, + "learning_rate": 2e-06, + "loss": 0.3467, + "step": 1240 + }, + { + "epoch": 0.2879016355411205, + "grad_norm": 5.394065636271639, + "learning_rate": 2e-06, + "loss": 0.217, + "step": 1241 + }, + { + "epoch": 0.2881336271894212, + "grad_norm": 18.591886314525397, + "learning_rate": 2e-06, + "loss": 0.3213, + "step": 1242 + }, + { + "epoch": 0.2883656188377218, + "grad_norm": 22.04197488485568, + "learning_rate": 2e-06, + "loss": 0.303, + "step": 1243 + }, + { + "epoch": 0.2885976104860225, + "grad_norm": 12.995736617500656, + "learning_rate": 2e-06, + "loss": 0.2668, + "step": 1244 + }, + { + "epoch": 0.2888296021343232, + "grad_norm": 23.646397067968046, + "learning_rate": 2e-06, + "loss": 0.3176, + "step": 1245 + }, + { + "epoch": 0.2890615937826238, + "grad_norm": 11.409118689634937, + "learning_rate": 2e-06, + "loss": 0.2784, + "step": 1246 + }, + { + "epoch": 0.2892935854309245, + "grad_norm": 11.742027071172174, + "learning_rate": 2e-06, + "loss": 0.4533, + "step": 1247 + }, + { + "epoch": 0.28952557707922516, + "grad_norm": 8.003574151157743, + "learning_rate": 2e-06, + "loss": 0.2324, + "step": 1248 + }, + { + "epoch": 0.2897575687275258, + "grad_norm": 18.4898821926245, + "learning_rate": 2e-06, + "loss": 0.3342, + "step": 1249 + }, + { + "epoch": 0.28998956037582646, + "grad_norm": 19.712215280172444, + "learning_rate": 2e-06, + "loss": 0.3294, + "step": 1250 + }, + { + "epoch": 0.29022155202412714, + "grad_norm": 12.006452342747606, + "learning_rate": 2e-06, + "loss": 0.2672, + "step": 1251 + }, + { + "epoch": 0.2904535436724278, + "grad_norm": 9.992023893613998, + "learning_rate": 2e-06, + "loss": 0.2019, + "step": 1252 + }, + { + "epoch": 0.29068553532072844, + "grad_norm": 13.6861127169211, + "learning_rate": 2e-06, + "loss": 0.255, + "step": 1253 + }, + { + "epoch": 0.2909175269690291, + "grad_norm": 15.02484608301213, + "learning_rate": 2e-06, + "loss": 0.3086, + "step": 1254 + }, + { + "epoch": 0.2911495186173298, + "grad_norm": 9.942533776328188, + "learning_rate": 2e-06, + "loss": 0.2822, + "step": 1255 + }, + { + "epoch": 0.2913815102656304, + "grad_norm": 15.070113001242195, + "learning_rate": 2e-06, + "loss": 0.2637, + "step": 1256 + }, + { + "epoch": 0.2916135019139311, + "grad_norm": 20.127966390681635, + "learning_rate": 2e-06, + "loss": 0.3592, + "step": 1257 + }, + { + "epoch": 0.2918454935622318, + "grad_norm": 16.783537665196086, + "learning_rate": 2e-06, + "loss": 0.4248, + "step": 1258 + }, + { + "epoch": 0.2920774852105324, + "grad_norm": 14.996063668291244, + "learning_rate": 2e-06, + "loss": 0.3356, + "step": 1259 + }, + { + "epoch": 0.2923094768588331, + "grad_norm": 13.280194937801802, + "learning_rate": 2e-06, + "loss": 0.2595, + "step": 1260 + }, + { + "epoch": 0.29254146850713375, + "grad_norm": 12.639332668504503, + "learning_rate": 2e-06, + "loss": 0.3405, + "step": 1261 + }, + { + "epoch": 0.29277346015543443, + "grad_norm": 11.504927643928248, + "learning_rate": 2e-06, + "loss": 0.2966, + "step": 1262 + }, + { + "epoch": 0.29300545180373505, + "grad_norm": 20.752244836231803, + "learning_rate": 2e-06, + "loss": 0.4051, + "step": 1263 + }, + { + "epoch": 0.29323744345203573, + "grad_norm": 11.657319471913775, + "learning_rate": 2e-06, + "loss": 0.2778, + "step": 1264 + }, + { + "epoch": 0.2934694351003364, + "grad_norm": 16.861970554754627, + "learning_rate": 2e-06, + "loss": 0.3725, + "step": 1265 + }, + { + "epoch": 0.29370142674863703, + "grad_norm": 14.885281184009598, + "learning_rate": 2e-06, + "loss": 0.2854, + "step": 1266 + }, + { + "epoch": 0.2939334183969377, + "grad_norm": 9.448485631920036, + "learning_rate": 2e-06, + "loss": 0.3432, + "step": 1267 + }, + { + "epoch": 0.2941654100452384, + "grad_norm": 14.217616728146512, + "learning_rate": 2e-06, + "loss": 0.3417, + "step": 1268 + }, + { + "epoch": 0.294397401693539, + "grad_norm": 9.39865709086706, + "learning_rate": 2e-06, + "loss": 0.2979, + "step": 1269 + }, + { + "epoch": 0.2946293933418397, + "grad_norm": 19.94441583861392, + "learning_rate": 2e-06, + "loss": 0.3479, + "step": 1270 + }, + { + "epoch": 0.29486138499014036, + "grad_norm": 11.9760265364321, + "learning_rate": 2e-06, + "loss": 0.3599, + "step": 1271 + }, + { + "epoch": 0.29509337663844104, + "grad_norm": 11.245365111905581, + "learning_rate": 2e-06, + "loss": 0.2648, + "step": 1272 + }, + { + "epoch": 0.29532536828674166, + "grad_norm": 14.520006307802863, + "learning_rate": 2e-06, + "loss": 0.2594, + "step": 1273 + }, + { + "epoch": 0.29555735993504234, + "grad_norm": 18.75988516752072, + "learning_rate": 2e-06, + "loss": 0.2644, + "step": 1274 + }, + { + "epoch": 0.295789351583343, + "grad_norm": 15.494061198744825, + "learning_rate": 2e-06, + "loss": 0.3706, + "step": 1275 + }, + { + "epoch": 0.29602134323164364, + "grad_norm": 18.800626470937203, + "learning_rate": 2e-06, + "loss": 0.3949, + "step": 1276 + }, + { + "epoch": 0.2962533348799443, + "grad_norm": 21.137356291700847, + "learning_rate": 2e-06, + "loss": 0.4411, + "step": 1277 + }, + { + "epoch": 0.296485326528245, + "grad_norm": 14.69093229850745, + "learning_rate": 2e-06, + "loss": 0.3186, + "step": 1278 + }, + { + "epoch": 0.2967173181765456, + "grad_norm": 19.660974625480087, + "learning_rate": 2e-06, + "loss": 0.3263, + "step": 1279 + }, + { + "epoch": 0.2969493098248463, + "grad_norm": 11.241549838681076, + "learning_rate": 2e-06, + "loss": 0.2473, + "step": 1280 + }, + { + "epoch": 0.297181301473147, + "grad_norm": 13.986806951706525, + "learning_rate": 2e-06, + "loss": 0.2402, + "step": 1281 + }, + { + "epoch": 0.29741329312144765, + "grad_norm": 14.285571512151392, + "learning_rate": 2e-06, + "loss": 0.374, + "step": 1282 + }, + { + "epoch": 0.2976452847697483, + "grad_norm": 10.958162154047578, + "learning_rate": 2e-06, + "loss": 0.2306, + "step": 1283 + }, + { + "epoch": 0.29787727641804895, + "grad_norm": 18.532356058362055, + "learning_rate": 2e-06, + "loss": 0.3142, + "step": 1284 + }, + { + "epoch": 0.29810926806634963, + "grad_norm": 12.891377637696152, + "learning_rate": 2e-06, + "loss": 0.2602, + "step": 1285 + }, + { + "epoch": 0.29834125971465025, + "grad_norm": 16.11181170520358, + "learning_rate": 2e-06, + "loss": 0.3242, + "step": 1286 + }, + { + "epoch": 0.29857325136295093, + "grad_norm": 9.615362986016594, + "learning_rate": 2e-06, + "loss": 0.2893, + "step": 1287 + }, + { + "epoch": 0.2988052430112516, + "grad_norm": 17.945990919432717, + "learning_rate": 2e-06, + "loss": 0.3861, + "step": 1288 + }, + { + "epoch": 0.29903723465955223, + "grad_norm": 17.99362450227029, + "learning_rate": 2e-06, + "loss": 0.3724, + "step": 1289 + }, + { + "epoch": 0.2992692263078529, + "grad_norm": 17.107598158507745, + "learning_rate": 2e-06, + "loss": 0.3102, + "step": 1290 + }, + { + "epoch": 0.2995012179561536, + "grad_norm": 10.520699209722633, + "learning_rate": 2e-06, + "loss": 0.2517, + "step": 1291 + }, + { + "epoch": 0.29973320960445426, + "grad_norm": 22.305669883971103, + "learning_rate": 2e-06, + "loss": 0.3745, + "step": 1292 + }, + { + "epoch": 0.2999652012527549, + "grad_norm": 23.331585901418507, + "learning_rate": 2e-06, + "loss": 0.297, + "step": 1293 + }, + { + "epoch": 0.30019719290105557, + "grad_norm": 9.13037425685583, + "learning_rate": 2e-06, + "loss": 0.2517, + "step": 1294 + }, + { + "epoch": 0.30042918454935624, + "grad_norm": 25.03598967873586, + "learning_rate": 2e-06, + "loss": 0.2891, + "step": 1295 + }, + { + "epoch": 0.30066117619765687, + "grad_norm": 6.227195868422087, + "learning_rate": 2e-06, + "loss": 0.2703, + "step": 1296 + }, + { + "epoch": 0.30089316784595754, + "grad_norm": 10.524631044435568, + "learning_rate": 2e-06, + "loss": 0.2402, + "step": 1297 + }, + { + "epoch": 0.3011251594942582, + "grad_norm": 14.452260998510923, + "learning_rate": 2e-06, + "loss": 0.2147, + "step": 1298 + }, + { + "epoch": 0.30135715114255884, + "grad_norm": 16.572736184416854, + "learning_rate": 2e-06, + "loss": 0.3713, + "step": 1299 + }, + { + "epoch": 0.3015891427908595, + "grad_norm": 12.092715650876997, + "learning_rate": 2e-06, + "loss": 0.4643, + "step": 1300 + }, + { + "epoch": 0.3018211344391602, + "grad_norm": 11.763924015078992, + "learning_rate": 2e-06, + "loss": 0.249, + "step": 1301 + }, + { + "epoch": 0.3020531260874609, + "grad_norm": 11.297415978022148, + "learning_rate": 2e-06, + "loss": 0.3153, + "step": 1302 + }, + { + "epoch": 0.3022851177357615, + "grad_norm": 22.418021514922543, + "learning_rate": 2e-06, + "loss": 0.3138, + "step": 1303 + }, + { + "epoch": 0.3025171093840622, + "grad_norm": 20.535356510555992, + "learning_rate": 2e-06, + "loss": 0.2667, + "step": 1304 + }, + { + "epoch": 0.30274910103236286, + "grad_norm": 15.480983737195427, + "learning_rate": 2e-06, + "loss": 0.3294, + "step": 1305 + }, + { + "epoch": 0.3029810926806635, + "grad_norm": 18.091773010729597, + "learning_rate": 2e-06, + "loss": 0.3949, + "step": 1306 + }, + { + "epoch": 0.30321308432896416, + "grad_norm": 11.142072308484599, + "learning_rate": 2e-06, + "loss": 0.299, + "step": 1307 + }, + { + "epoch": 0.30344507597726483, + "grad_norm": 10.747760068034689, + "learning_rate": 2e-06, + "loss": 0.2483, + "step": 1308 + }, + { + "epoch": 0.30367706762556546, + "grad_norm": 16.711980069760436, + "learning_rate": 2e-06, + "loss": 0.3531, + "step": 1309 + }, + { + "epoch": 0.30390905927386613, + "grad_norm": 17.991188041443273, + "learning_rate": 2e-06, + "loss": 0.2774, + "step": 1310 + }, + { + "epoch": 0.3041410509221668, + "grad_norm": 11.576762335855799, + "learning_rate": 2e-06, + "loss": 0.395, + "step": 1311 + }, + { + "epoch": 0.3043730425704675, + "grad_norm": 10.466971442475415, + "learning_rate": 2e-06, + "loss": 0.3074, + "step": 1312 + }, + { + "epoch": 0.3046050342187681, + "grad_norm": 18.42622648855536, + "learning_rate": 2e-06, + "loss": 0.2287, + "step": 1313 + }, + { + "epoch": 0.3048370258670688, + "grad_norm": 15.540120049867527, + "learning_rate": 2e-06, + "loss": 0.3262, + "step": 1314 + }, + { + "epoch": 0.30506901751536947, + "grad_norm": 17.587207558740374, + "learning_rate": 2e-06, + "loss": 0.3041, + "step": 1315 + }, + { + "epoch": 0.3053010091636701, + "grad_norm": 17.44985967664141, + "learning_rate": 2e-06, + "loss": 0.2743, + "step": 1316 + }, + { + "epoch": 0.30553300081197077, + "grad_norm": 8.54965869254267, + "learning_rate": 2e-06, + "loss": 0.2416, + "step": 1317 + }, + { + "epoch": 0.30576499246027145, + "grad_norm": 8.979997168292295, + "learning_rate": 2e-06, + "loss": 0.348, + "step": 1318 + }, + { + "epoch": 0.30599698410857207, + "grad_norm": 28.59983116861295, + "learning_rate": 2e-06, + "loss": 0.4125, + "step": 1319 + }, + { + "epoch": 0.30622897575687275, + "grad_norm": 8.07380333347449, + "learning_rate": 2e-06, + "loss": 0.2392, + "step": 1320 + }, + { + "epoch": 0.3064609674051734, + "grad_norm": 10.676150094033074, + "learning_rate": 2e-06, + "loss": 0.2199, + "step": 1321 + }, + { + "epoch": 0.3066929590534741, + "grad_norm": 17.299675012306995, + "learning_rate": 2e-06, + "loss": 0.2338, + "step": 1322 + }, + { + "epoch": 0.3069249507017747, + "grad_norm": 11.668388317530653, + "learning_rate": 2e-06, + "loss": 0.3085, + "step": 1323 + }, + { + "epoch": 0.3071569423500754, + "grad_norm": 13.351683945674617, + "learning_rate": 2e-06, + "loss": 0.3607, + "step": 1324 + }, + { + "epoch": 0.3073889339983761, + "grad_norm": 13.197158579360199, + "learning_rate": 2e-06, + "loss": 0.3151, + "step": 1325 + }, + { + "epoch": 0.3076209256466767, + "grad_norm": 7.283939134417037, + "learning_rate": 2e-06, + "loss": 0.1872, + "step": 1326 + }, + { + "epoch": 0.3078529172949774, + "grad_norm": 10.299996868078303, + "learning_rate": 2e-06, + "loss": 0.3248, + "step": 1327 + }, + { + "epoch": 0.30808490894327806, + "grad_norm": 24.67342231320794, + "learning_rate": 2e-06, + "loss": 0.3031, + "step": 1328 + }, + { + "epoch": 0.3083169005915787, + "grad_norm": 9.223137640402232, + "learning_rate": 2e-06, + "loss": 0.3476, + "step": 1329 + }, + { + "epoch": 0.30854889223987936, + "grad_norm": 13.917374776095183, + "learning_rate": 2e-06, + "loss": 0.2151, + "step": 1330 + }, + { + "epoch": 0.30878088388818004, + "grad_norm": 22.274647106743906, + "learning_rate": 2e-06, + "loss": 0.3928, + "step": 1331 + }, + { + "epoch": 0.3090128755364807, + "grad_norm": 14.705277230122983, + "learning_rate": 2e-06, + "loss": 0.3337, + "step": 1332 + }, + { + "epoch": 0.30924486718478134, + "grad_norm": 16.660797472432396, + "learning_rate": 2e-06, + "loss": 0.3067, + "step": 1333 + }, + { + "epoch": 0.309476858833082, + "grad_norm": 10.20320184386022, + "learning_rate": 2e-06, + "loss": 0.2807, + "step": 1334 + }, + { + "epoch": 0.3097088504813827, + "grad_norm": 14.164750183745458, + "learning_rate": 2e-06, + "loss": 0.3187, + "step": 1335 + }, + { + "epoch": 0.3099408421296833, + "grad_norm": 17.276330857778227, + "learning_rate": 2e-06, + "loss": 0.4243, + "step": 1336 + }, + { + "epoch": 0.310172833777984, + "grad_norm": 15.575398625815712, + "learning_rate": 2e-06, + "loss": 0.3158, + "step": 1337 + }, + { + "epoch": 0.31040482542628467, + "grad_norm": 18.940988228755096, + "learning_rate": 2e-06, + "loss": 0.2767, + "step": 1338 + }, + { + "epoch": 0.3106368170745853, + "grad_norm": 15.728923178327832, + "learning_rate": 2e-06, + "loss": 0.2889, + "step": 1339 + }, + { + "epoch": 0.31086880872288597, + "grad_norm": 12.861312651073032, + "learning_rate": 2e-06, + "loss": 0.3422, + "step": 1340 + }, + { + "epoch": 0.31110080037118665, + "grad_norm": 7.844519890976591, + "learning_rate": 2e-06, + "loss": 0.1895, + "step": 1341 + }, + { + "epoch": 0.3113327920194873, + "grad_norm": 16.205539259080346, + "learning_rate": 2e-06, + "loss": 0.319, + "step": 1342 + }, + { + "epoch": 0.31156478366778795, + "grad_norm": 13.541084983572475, + "learning_rate": 2e-06, + "loss": 0.3174, + "step": 1343 + }, + { + "epoch": 0.3117967753160886, + "grad_norm": 15.138498429814257, + "learning_rate": 2e-06, + "loss": 0.2069, + "step": 1344 + }, + { + "epoch": 0.3120287669643893, + "grad_norm": 17.90538146115018, + "learning_rate": 2e-06, + "loss": 0.307, + "step": 1345 + }, + { + "epoch": 0.3122607586126899, + "grad_norm": 16.52864730654945, + "learning_rate": 2e-06, + "loss": 0.2979, + "step": 1346 + }, + { + "epoch": 0.3124927502609906, + "grad_norm": 10.476892047549619, + "learning_rate": 2e-06, + "loss": 0.29, + "step": 1347 + }, + { + "epoch": 0.3127247419092913, + "grad_norm": 22.11804081976467, + "learning_rate": 2e-06, + "loss": 0.2963, + "step": 1348 + }, + { + "epoch": 0.3129567335575919, + "grad_norm": 14.362786675309335, + "learning_rate": 2e-06, + "loss": 0.2286, + "step": 1349 + }, + { + "epoch": 0.3131887252058926, + "grad_norm": 7.571507878073395, + "learning_rate": 2e-06, + "loss": 0.1847, + "step": 1350 + }, + { + "epoch": 0.31342071685419326, + "grad_norm": 18.40040614466039, + "learning_rate": 2e-06, + "loss": 0.3439, + "step": 1351 + }, + { + "epoch": 0.31365270850249394, + "grad_norm": 29.00454963048086, + "learning_rate": 2e-06, + "loss": 0.2897, + "step": 1352 + }, + { + "epoch": 0.31388470015079456, + "grad_norm": 12.1743071611877, + "learning_rate": 2e-06, + "loss": 0.3104, + "step": 1353 + }, + { + "epoch": 0.31411669179909524, + "grad_norm": 13.407021819241502, + "learning_rate": 2e-06, + "loss": 0.2899, + "step": 1354 + }, + { + "epoch": 0.3143486834473959, + "grad_norm": 13.353584074233229, + "learning_rate": 2e-06, + "loss": 0.1983, + "step": 1355 + }, + { + "epoch": 0.31458067509569654, + "grad_norm": 17.248859424682657, + "learning_rate": 2e-06, + "loss": 0.3753, + "step": 1356 + }, + { + "epoch": 0.3148126667439972, + "grad_norm": 10.014448704532423, + "learning_rate": 2e-06, + "loss": 0.1834, + "step": 1357 + }, + { + "epoch": 0.3150446583922979, + "grad_norm": 21.763953176545574, + "learning_rate": 2e-06, + "loss": 0.3596, + "step": 1358 + }, + { + "epoch": 0.3152766500405985, + "grad_norm": 19.940820725887995, + "learning_rate": 2e-06, + "loss": 0.2957, + "step": 1359 + }, + { + "epoch": 0.3155086416888992, + "grad_norm": 18.912097481709182, + "learning_rate": 2e-06, + "loss": 0.3685, + "step": 1360 + }, + { + "epoch": 0.31574063333719987, + "grad_norm": 11.346471266770104, + "learning_rate": 2e-06, + "loss": 0.2816, + "step": 1361 + }, + { + "epoch": 0.3159726249855005, + "grad_norm": 28.193240828763855, + "learning_rate": 2e-06, + "loss": 0.4203, + "step": 1362 + }, + { + "epoch": 0.31620461663380117, + "grad_norm": 16.73102518384111, + "learning_rate": 2e-06, + "loss": 0.3564, + "step": 1363 + }, + { + "epoch": 0.31643660828210185, + "grad_norm": 9.763506994333493, + "learning_rate": 2e-06, + "loss": 0.2947, + "step": 1364 + }, + { + "epoch": 0.31666859993040253, + "grad_norm": 16.116070672609183, + "learning_rate": 2e-06, + "loss": 0.2653, + "step": 1365 + }, + { + "epoch": 0.31690059157870315, + "grad_norm": 16.30539281629766, + "learning_rate": 2e-06, + "loss": 0.2291, + "step": 1366 + }, + { + "epoch": 0.31713258322700383, + "grad_norm": 19.019320340396867, + "learning_rate": 2e-06, + "loss": 0.3052, + "step": 1367 + }, + { + "epoch": 0.3173645748753045, + "grad_norm": 18.228935063012884, + "learning_rate": 2e-06, + "loss": 0.4026, + "step": 1368 + }, + { + "epoch": 0.31759656652360513, + "grad_norm": 18.242836131999766, + "learning_rate": 2e-06, + "loss": 0.2933, + "step": 1369 + }, + { + "epoch": 0.3178285581719058, + "grad_norm": 11.764367942970903, + "learning_rate": 2e-06, + "loss": 0.2875, + "step": 1370 + }, + { + "epoch": 0.3180605498202065, + "grad_norm": 16.85260895830024, + "learning_rate": 2e-06, + "loss": 0.3813, + "step": 1371 + }, + { + "epoch": 0.3182925414685071, + "grad_norm": 15.517837361759963, + "learning_rate": 2e-06, + "loss": 0.3062, + "step": 1372 + }, + { + "epoch": 0.3185245331168078, + "grad_norm": 16.41448843521921, + "learning_rate": 2e-06, + "loss": 0.449, + "step": 1373 + }, + { + "epoch": 0.31875652476510846, + "grad_norm": 21.721345612017643, + "learning_rate": 2e-06, + "loss": 0.3781, + "step": 1374 + }, + { + "epoch": 0.31898851641340914, + "grad_norm": 19.833120601711492, + "learning_rate": 2e-06, + "loss": 0.3795, + "step": 1375 + }, + { + "epoch": 0.31922050806170976, + "grad_norm": 7.1809944456557, + "learning_rate": 2e-06, + "loss": 0.2324, + "step": 1376 + }, + { + "epoch": 0.31945249971001044, + "grad_norm": 27.35532202666317, + "learning_rate": 2e-06, + "loss": 0.4693, + "step": 1377 + }, + { + "epoch": 0.3196844913583111, + "grad_norm": 14.076587114302116, + "learning_rate": 2e-06, + "loss": 0.2823, + "step": 1378 + }, + { + "epoch": 0.31991648300661174, + "grad_norm": 8.780774638050996, + "learning_rate": 2e-06, + "loss": 0.2473, + "step": 1379 + }, + { + "epoch": 0.3201484746549124, + "grad_norm": 11.116062441609776, + "learning_rate": 2e-06, + "loss": 0.2186, + "step": 1380 + }, + { + "epoch": 0.3203804663032131, + "grad_norm": 8.693893969917667, + "learning_rate": 2e-06, + "loss": 0.2627, + "step": 1381 + }, + { + "epoch": 0.3206124579515137, + "grad_norm": 10.604765368834062, + "learning_rate": 2e-06, + "loss": 0.1967, + "step": 1382 + }, + { + "epoch": 0.3208444495998144, + "grad_norm": 17.142486434058437, + "learning_rate": 2e-06, + "loss": 0.3608, + "step": 1383 + }, + { + "epoch": 0.3210764412481151, + "grad_norm": 12.196660226224022, + "learning_rate": 2e-06, + "loss": 0.2737, + "step": 1384 + }, + { + "epoch": 0.32130843289641575, + "grad_norm": 7.812040078901983, + "learning_rate": 2e-06, + "loss": 0.2015, + "step": 1385 + }, + { + "epoch": 0.3215404245447164, + "grad_norm": 12.31031716391756, + "learning_rate": 2e-06, + "loss": 0.3417, + "step": 1386 + }, + { + "epoch": 0.32177241619301705, + "grad_norm": 5.6895948115371455, + "learning_rate": 2e-06, + "loss": 0.2746, + "step": 1387 + }, + { + "epoch": 0.32200440784131773, + "grad_norm": 26.500274825195785, + "learning_rate": 2e-06, + "loss": 0.3603, + "step": 1388 + }, + { + "epoch": 0.32223639948961835, + "grad_norm": 11.59197120918137, + "learning_rate": 2e-06, + "loss": 0.2567, + "step": 1389 + }, + { + "epoch": 0.32246839113791903, + "grad_norm": 21.22159104373813, + "learning_rate": 2e-06, + "loss": 0.2878, + "step": 1390 + }, + { + "epoch": 0.3227003827862197, + "grad_norm": 7.653899247960632, + "learning_rate": 2e-06, + "loss": 0.1782, + "step": 1391 + }, + { + "epoch": 0.32293237443452033, + "grad_norm": 24.21228884650294, + "learning_rate": 2e-06, + "loss": 0.4119, + "step": 1392 + }, + { + "epoch": 0.323164366082821, + "grad_norm": 19.528609223344635, + "learning_rate": 2e-06, + "loss": 0.369, + "step": 1393 + }, + { + "epoch": 0.3233963577311217, + "grad_norm": 30.983200470204057, + "learning_rate": 2e-06, + "loss": 0.5261, + "step": 1394 + }, + { + "epoch": 0.32362834937942236, + "grad_norm": 16.414562055891793, + "learning_rate": 2e-06, + "loss": 0.2037, + "step": 1395 + }, + { + "epoch": 0.323860341027723, + "grad_norm": 13.884976117418837, + "learning_rate": 2e-06, + "loss": 0.3318, + "step": 1396 + }, + { + "epoch": 0.32409233267602366, + "grad_norm": 17.823166997510224, + "learning_rate": 2e-06, + "loss": 0.2978, + "step": 1397 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 16.094709140807357, + "learning_rate": 2e-06, + "loss": 0.3587, + "step": 1398 + }, + { + "epoch": 0.32455631597262496, + "grad_norm": 21.996263140110763, + "learning_rate": 2e-06, + "loss": 0.4132, + "step": 1399 + }, + { + "epoch": 0.32478830762092564, + "grad_norm": 15.17287807041555, + "learning_rate": 2e-06, + "loss": 0.1584, + "step": 1400 + }, + { + "epoch": 0.3250202992692263, + "grad_norm": 9.076859822738712, + "learning_rate": 2e-06, + "loss": 0.1804, + "step": 1401 + }, + { + "epoch": 0.32525229091752694, + "grad_norm": 12.833433028804583, + "learning_rate": 2e-06, + "loss": 0.3237, + "step": 1402 + }, + { + "epoch": 0.3254842825658276, + "grad_norm": 21.081007557262744, + "learning_rate": 2e-06, + "loss": 0.3414, + "step": 1403 + }, + { + "epoch": 0.3257162742141283, + "grad_norm": 24.44170266561206, + "learning_rate": 2e-06, + "loss": 0.4184, + "step": 1404 + }, + { + "epoch": 0.325948265862429, + "grad_norm": 15.761112175024815, + "learning_rate": 2e-06, + "loss": 0.3424, + "step": 1405 + }, + { + "epoch": 0.3261802575107296, + "grad_norm": 26.682059157151674, + "learning_rate": 2e-06, + "loss": 0.3506, + "step": 1406 + }, + { + "epoch": 0.3264122491590303, + "grad_norm": 26.625870041802795, + "learning_rate": 2e-06, + "loss": 0.4061, + "step": 1407 + }, + { + "epoch": 0.32664424080733095, + "grad_norm": 8.524660616405798, + "learning_rate": 2e-06, + "loss": 0.3291, + "step": 1408 + }, + { + "epoch": 0.3268762324556316, + "grad_norm": 13.785566655746681, + "learning_rate": 2e-06, + "loss": 0.2278, + "step": 1409 + }, + { + "epoch": 0.32710822410393225, + "grad_norm": 10.510689880378692, + "learning_rate": 2e-06, + "loss": 0.2374, + "step": 1410 + }, + { + "epoch": 0.32734021575223293, + "grad_norm": 13.282208345671345, + "learning_rate": 2e-06, + "loss": 0.3365, + "step": 1411 + }, + { + "epoch": 0.32757220740053355, + "grad_norm": 7.200604791903009, + "learning_rate": 2e-06, + "loss": 0.3043, + "step": 1412 + }, + { + "epoch": 0.32780419904883423, + "grad_norm": 16.380763960725297, + "learning_rate": 2e-06, + "loss": 0.336, + "step": 1413 + }, + { + "epoch": 0.3280361906971349, + "grad_norm": 7.862667605788348, + "learning_rate": 2e-06, + "loss": 0.2364, + "step": 1414 + }, + { + "epoch": 0.3282681823454356, + "grad_norm": 13.792537373401919, + "learning_rate": 2e-06, + "loss": 0.3635, + "step": 1415 + }, + { + "epoch": 0.3285001739937362, + "grad_norm": 21.034934320250013, + "learning_rate": 2e-06, + "loss": 0.4046, + "step": 1416 + }, + { + "epoch": 0.3287321656420369, + "grad_norm": 22.086878012364966, + "learning_rate": 2e-06, + "loss": 0.2852, + "step": 1417 + }, + { + "epoch": 0.32896415729033757, + "grad_norm": 14.784229950459304, + "learning_rate": 2e-06, + "loss": 0.2574, + "step": 1418 + }, + { + "epoch": 0.3291961489386382, + "grad_norm": 12.373195748036627, + "learning_rate": 2e-06, + "loss": 0.2822, + "step": 1419 + }, + { + "epoch": 0.32942814058693887, + "grad_norm": 38.01272152718663, + "learning_rate": 2e-06, + "loss": 0.3616, + "step": 1420 + }, + { + "epoch": 0.32966013223523954, + "grad_norm": 11.991749136874276, + "learning_rate": 2e-06, + "loss": 0.353, + "step": 1421 + }, + { + "epoch": 0.32989212388354017, + "grad_norm": 26.97583446550163, + "learning_rate": 2e-06, + "loss": 0.3066, + "step": 1422 + }, + { + "epoch": 0.33012411553184084, + "grad_norm": 15.834084651170842, + "learning_rate": 2e-06, + "loss": 0.298, + "step": 1423 + }, + { + "epoch": 0.3303561071801415, + "grad_norm": 18.92977879289124, + "learning_rate": 2e-06, + "loss": 0.392, + "step": 1424 + }, + { + "epoch": 0.3305880988284422, + "grad_norm": 13.530259540842376, + "learning_rate": 2e-06, + "loss": 0.3095, + "step": 1425 + }, + { + "epoch": 0.3308200904767428, + "grad_norm": 10.827914794149137, + "learning_rate": 2e-06, + "loss": 0.3398, + "step": 1426 + }, + { + "epoch": 0.3310520821250435, + "grad_norm": 9.71939624119191, + "learning_rate": 2e-06, + "loss": 0.3422, + "step": 1427 + }, + { + "epoch": 0.3312840737733442, + "grad_norm": 8.993853404795685, + "learning_rate": 2e-06, + "loss": 0.2793, + "step": 1428 + }, + { + "epoch": 0.3315160654216448, + "grad_norm": 11.037296435326578, + "learning_rate": 2e-06, + "loss": 0.3753, + "step": 1429 + }, + { + "epoch": 0.3317480570699455, + "grad_norm": 15.519371017718381, + "learning_rate": 2e-06, + "loss": 0.3152, + "step": 1430 + }, + { + "epoch": 0.33198004871824616, + "grad_norm": 12.947799922606766, + "learning_rate": 2e-06, + "loss": 0.3816, + "step": 1431 + }, + { + "epoch": 0.3322120403665468, + "grad_norm": 43.44503329570567, + "learning_rate": 2e-06, + "loss": 0.4163, + "step": 1432 + }, + { + "epoch": 0.33244403201484746, + "grad_norm": 10.85914909093172, + "learning_rate": 2e-06, + "loss": 0.3539, + "step": 1433 + }, + { + "epoch": 0.33267602366314813, + "grad_norm": 16.45369684618797, + "learning_rate": 2e-06, + "loss": 0.223, + "step": 1434 + }, + { + "epoch": 0.3329080153114488, + "grad_norm": 11.856791453456106, + "learning_rate": 2e-06, + "loss": 0.2651, + "step": 1435 + }, + { + "epoch": 0.33314000695974944, + "grad_norm": 28.227771959540448, + "learning_rate": 2e-06, + "loss": 0.356, + "step": 1436 + }, + { + "epoch": 0.3333719986080501, + "grad_norm": 16.11516272567042, + "learning_rate": 2e-06, + "loss": 0.3581, + "step": 1437 + }, + { + "epoch": 0.3336039902563508, + "grad_norm": 17.205293488598823, + "learning_rate": 2e-06, + "loss": 0.305, + "step": 1438 + }, + { + "epoch": 0.3338359819046514, + "grad_norm": 18.59915835527598, + "learning_rate": 2e-06, + "loss": 0.2897, + "step": 1439 + }, + { + "epoch": 0.3340679735529521, + "grad_norm": 9.665209145110666, + "learning_rate": 2e-06, + "loss": 0.3308, + "step": 1440 + }, + { + "epoch": 0.33429996520125277, + "grad_norm": 9.68040316568584, + "learning_rate": 2e-06, + "loss": 0.1829, + "step": 1441 + }, + { + "epoch": 0.3345319568495534, + "grad_norm": 13.189970236007511, + "learning_rate": 2e-06, + "loss": 0.3357, + "step": 1442 + }, + { + "epoch": 0.33476394849785407, + "grad_norm": 19.650408705244487, + "learning_rate": 2e-06, + "loss": 0.3785, + "step": 1443 + }, + { + "epoch": 0.33499594014615475, + "grad_norm": 28.296732624494975, + "learning_rate": 2e-06, + "loss": 0.335, + "step": 1444 + }, + { + "epoch": 0.3352279317944554, + "grad_norm": 18.467670187497443, + "learning_rate": 2e-06, + "loss": 0.3028, + "step": 1445 + }, + { + "epoch": 0.33545992344275605, + "grad_norm": 18.270227526717235, + "learning_rate": 2e-06, + "loss": 0.2988, + "step": 1446 + }, + { + "epoch": 0.3356919150910567, + "grad_norm": 11.52991657081852, + "learning_rate": 2e-06, + "loss": 0.2774, + "step": 1447 + }, + { + "epoch": 0.3359239067393574, + "grad_norm": 23.744389862640357, + "learning_rate": 2e-06, + "loss": 0.548, + "step": 1448 + }, + { + "epoch": 0.336155898387658, + "grad_norm": 18.275621368601392, + "learning_rate": 2e-06, + "loss": 0.4032, + "step": 1449 + }, + { + "epoch": 0.3363878900359587, + "grad_norm": 14.829177620147869, + "learning_rate": 2e-06, + "loss": 0.3983, + "step": 1450 + }, + { + "epoch": 0.3366198816842594, + "grad_norm": 13.640819060025391, + "learning_rate": 2e-06, + "loss": 0.3402, + "step": 1451 + }, + { + "epoch": 0.33685187333256, + "grad_norm": 20.325335878357638, + "learning_rate": 2e-06, + "loss": 0.2938, + "step": 1452 + }, + { + "epoch": 0.3370838649808607, + "grad_norm": 10.1654491529205, + "learning_rate": 2e-06, + "loss": 0.3499, + "step": 1453 + }, + { + "epoch": 0.33731585662916136, + "grad_norm": 18.11067937334146, + "learning_rate": 2e-06, + "loss": 0.3405, + "step": 1454 + }, + { + "epoch": 0.33754784827746204, + "grad_norm": 7.213728305081726, + "learning_rate": 2e-06, + "loss": 0.3285, + "step": 1455 + }, + { + "epoch": 0.33777983992576266, + "grad_norm": 8.111878159324867, + "learning_rate": 2e-06, + "loss": 0.2244, + "step": 1456 + }, + { + "epoch": 0.33801183157406334, + "grad_norm": 16.26670030048152, + "learning_rate": 2e-06, + "loss": 0.3037, + "step": 1457 + }, + { + "epoch": 0.338243823222364, + "grad_norm": 24.565372997992675, + "learning_rate": 2e-06, + "loss": 0.3137, + "step": 1458 + }, + { + "epoch": 0.33847581487066464, + "grad_norm": 22.886787785735034, + "learning_rate": 2e-06, + "loss": 0.3186, + "step": 1459 + }, + { + "epoch": 0.3387078065189653, + "grad_norm": 21.436646107450535, + "learning_rate": 2e-06, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.338939798167266, + "grad_norm": 12.328393696710915, + "learning_rate": 2e-06, + "loss": 0.3254, + "step": 1461 + }, + { + "epoch": 0.3391717898155666, + "grad_norm": 15.206807262565173, + "learning_rate": 2e-06, + "loss": 0.3559, + "step": 1462 + }, + { + "epoch": 0.3394037814638673, + "grad_norm": 14.31015543426358, + "learning_rate": 2e-06, + "loss": 0.2817, + "step": 1463 + }, + { + "epoch": 0.33963577311216797, + "grad_norm": 21.67014327512649, + "learning_rate": 2e-06, + "loss": 0.3082, + "step": 1464 + }, + { + "epoch": 0.33986776476046865, + "grad_norm": 7.896970994527335, + "learning_rate": 2e-06, + "loss": 0.2985, + "step": 1465 + }, + { + "epoch": 0.34009975640876927, + "grad_norm": 10.908642187522922, + "learning_rate": 2e-06, + "loss": 0.4137, + "step": 1466 + }, + { + "epoch": 0.34033174805706995, + "grad_norm": 14.30417006499471, + "learning_rate": 2e-06, + "loss": 0.3977, + "step": 1467 + }, + { + "epoch": 0.3405637397053706, + "grad_norm": 12.724543568104627, + "learning_rate": 2e-06, + "loss": 0.2614, + "step": 1468 + }, + { + "epoch": 0.34079573135367125, + "grad_norm": 13.860781856965406, + "learning_rate": 2e-06, + "loss": 0.3516, + "step": 1469 + }, + { + "epoch": 0.3410277230019719, + "grad_norm": 8.11891853375226, + "learning_rate": 2e-06, + "loss": 0.2225, + "step": 1470 + }, + { + "epoch": 0.3412597146502726, + "grad_norm": 18.084721461864714, + "learning_rate": 2e-06, + "loss": 0.2586, + "step": 1471 + }, + { + "epoch": 0.3414917062985732, + "grad_norm": 7.181534455328431, + "learning_rate": 2e-06, + "loss": 0.2283, + "step": 1472 + }, + { + "epoch": 0.3417236979468739, + "grad_norm": 10.249481209532288, + "learning_rate": 2e-06, + "loss": 0.2586, + "step": 1473 + }, + { + "epoch": 0.3419556895951746, + "grad_norm": 20.235711545193375, + "learning_rate": 2e-06, + "loss": 0.2793, + "step": 1474 + }, + { + "epoch": 0.34218768124347526, + "grad_norm": 12.615681034876618, + "learning_rate": 2e-06, + "loss": 0.2957, + "step": 1475 + }, + { + "epoch": 0.3424196728917759, + "grad_norm": 20.373845008527187, + "learning_rate": 2e-06, + "loss": 0.5045, + "step": 1476 + }, + { + "epoch": 0.34265166454007656, + "grad_norm": 23.646655211696096, + "learning_rate": 2e-06, + "loss": 0.3433, + "step": 1477 + }, + { + "epoch": 0.34288365618837724, + "grad_norm": 10.779640440881222, + "learning_rate": 2e-06, + "loss": 0.26, + "step": 1478 + }, + { + "epoch": 0.34311564783667786, + "grad_norm": 14.955799866697927, + "learning_rate": 2e-06, + "loss": 0.3575, + "step": 1479 + }, + { + "epoch": 0.34334763948497854, + "grad_norm": 23.34240373648745, + "learning_rate": 2e-06, + "loss": 0.3543, + "step": 1480 + }, + { + "epoch": 0.3435796311332792, + "grad_norm": 27.198043313140015, + "learning_rate": 2e-06, + "loss": 0.3609, + "step": 1481 + }, + { + "epoch": 0.34381162278157984, + "grad_norm": 4.740917917219977, + "learning_rate": 2e-06, + "loss": 0.2064, + "step": 1482 + }, + { + "epoch": 0.3440436144298805, + "grad_norm": 17.595434056508047, + "learning_rate": 2e-06, + "loss": 0.289, + "step": 1483 + }, + { + "epoch": 0.3442756060781812, + "grad_norm": 9.572257970860454, + "learning_rate": 2e-06, + "loss": 0.3235, + "step": 1484 + }, + { + "epoch": 0.3445075977264819, + "grad_norm": 21.185559442915206, + "learning_rate": 2e-06, + "loss": 0.3622, + "step": 1485 + }, + { + "epoch": 0.3447395893747825, + "grad_norm": 19.125381912342608, + "learning_rate": 2e-06, + "loss": 0.3033, + "step": 1486 + }, + { + "epoch": 0.3449715810230832, + "grad_norm": 14.398206313708432, + "learning_rate": 2e-06, + "loss": 0.3011, + "step": 1487 + }, + { + "epoch": 0.34520357267138385, + "grad_norm": 20.4757543531186, + "learning_rate": 2e-06, + "loss": 0.3481, + "step": 1488 + }, + { + "epoch": 0.3454355643196845, + "grad_norm": 13.521107412007973, + "learning_rate": 2e-06, + "loss": 0.3284, + "step": 1489 + }, + { + "epoch": 0.34566755596798515, + "grad_norm": 12.82873921884128, + "learning_rate": 2e-06, + "loss": 0.333, + "step": 1490 + }, + { + "epoch": 0.34589954761628583, + "grad_norm": 16.50102131285493, + "learning_rate": 2e-06, + "loss": 0.3682, + "step": 1491 + }, + { + "epoch": 0.34613153926458645, + "grad_norm": 15.968468514516362, + "learning_rate": 2e-06, + "loss": 0.3645, + "step": 1492 + }, + { + "epoch": 0.34636353091288713, + "grad_norm": 15.411384633795938, + "learning_rate": 2e-06, + "loss": 0.2859, + "step": 1493 + }, + { + "epoch": 0.3465955225611878, + "grad_norm": 10.528121007020545, + "learning_rate": 2e-06, + "loss": 0.3602, + "step": 1494 + }, + { + "epoch": 0.3468275142094885, + "grad_norm": 12.50351889191618, + "learning_rate": 2e-06, + "loss": 0.2303, + "step": 1495 + }, + { + "epoch": 0.3470595058577891, + "grad_norm": 22.53265667953538, + "learning_rate": 2e-06, + "loss": 0.3537, + "step": 1496 + }, + { + "epoch": 0.3472914975060898, + "grad_norm": 13.161514617319222, + "learning_rate": 2e-06, + "loss": 0.3082, + "step": 1497 + }, + { + "epoch": 0.34752348915439046, + "grad_norm": 12.514909309899709, + "learning_rate": 2e-06, + "loss": 0.3135, + "step": 1498 + }, + { + "epoch": 0.3477554808026911, + "grad_norm": 27.200429723788545, + "learning_rate": 2e-06, + "loss": 0.3255, + "step": 1499 + }, + { + "epoch": 0.34798747245099176, + "grad_norm": 14.382791715885393, + "learning_rate": 2e-06, + "loss": 0.2845, + "step": 1500 + }, + { + "epoch": 0.34821946409929244, + "grad_norm": 27.770489340950228, + "learning_rate": 2e-06, + "loss": 0.3061, + "step": 1501 + }, + { + "epoch": 0.34845145574759306, + "grad_norm": 13.103877107565271, + "learning_rate": 2e-06, + "loss": 0.2417, + "step": 1502 + }, + { + "epoch": 0.34868344739589374, + "grad_norm": 22.095341754977923, + "learning_rate": 2e-06, + "loss": 0.3161, + "step": 1503 + }, + { + "epoch": 0.3489154390441944, + "grad_norm": 17.66831432896543, + "learning_rate": 2e-06, + "loss": 0.3733, + "step": 1504 + }, + { + "epoch": 0.3491474306924951, + "grad_norm": 23.243882502598925, + "learning_rate": 2e-06, + "loss": 0.2508, + "step": 1505 + }, + { + "epoch": 0.3493794223407957, + "grad_norm": 14.198012691546005, + "learning_rate": 2e-06, + "loss": 0.2499, + "step": 1506 + }, + { + "epoch": 0.3496114139890964, + "grad_norm": 16.38842201576127, + "learning_rate": 2e-06, + "loss": 0.3034, + "step": 1507 + }, + { + "epoch": 0.3498434056373971, + "grad_norm": 12.030502731601553, + "learning_rate": 2e-06, + "loss": 0.2351, + "step": 1508 + }, + { + "epoch": 0.3500753972856977, + "grad_norm": 12.003382984655511, + "learning_rate": 2e-06, + "loss": 0.2391, + "step": 1509 + }, + { + "epoch": 0.3503073889339984, + "grad_norm": 16.653026981636675, + "learning_rate": 2e-06, + "loss": 0.267, + "step": 1510 + }, + { + "epoch": 0.35053938058229905, + "grad_norm": 26.30925965099654, + "learning_rate": 2e-06, + "loss": 0.2996, + "step": 1511 + }, + { + "epoch": 0.3507713722305997, + "grad_norm": 8.268051727806895, + "learning_rate": 2e-06, + "loss": 0.3707, + "step": 1512 + }, + { + "epoch": 0.35100336387890035, + "grad_norm": 16.492831306112947, + "learning_rate": 2e-06, + "loss": 0.3764, + "step": 1513 + }, + { + "epoch": 0.35123535552720103, + "grad_norm": 12.953009967608708, + "learning_rate": 2e-06, + "loss": 0.3184, + "step": 1514 + }, + { + "epoch": 0.35146734717550165, + "grad_norm": 13.868907702767919, + "learning_rate": 2e-06, + "loss": 0.275, + "step": 1515 + }, + { + "epoch": 0.35169933882380233, + "grad_norm": 14.707904973561801, + "learning_rate": 2e-06, + "loss": 0.3275, + "step": 1516 + }, + { + "epoch": 0.351931330472103, + "grad_norm": 18.927232123905426, + "learning_rate": 2e-06, + "loss": 0.4729, + "step": 1517 + }, + { + "epoch": 0.3521633221204037, + "grad_norm": 26.683198978700762, + "learning_rate": 2e-06, + "loss": 0.3891, + "step": 1518 + }, + { + "epoch": 0.3523953137687043, + "grad_norm": 23.200118728090455, + "learning_rate": 2e-06, + "loss": 0.3123, + "step": 1519 + }, + { + "epoch": 0.352627305417005, + "grad_norm": 12.772671965285351, + "learning_rate": 2e-06, + "loss": 0.2973, + "step": 1520 + }, + { + "epoch": 0.35285929706530567, + "grad_norm": 19.03558091042126, + "learning_rate": 2e-06, + "loss": 0.3802, + "step": 1521 + }, + { + "epoch": 0.3530912887136063, + "grad_norm": 18.97245741876137, + "learning_rate": 2e-06, + "loss": 0.3188, + "step": 1522 + }, + { + "epoch": 0.35332328036190697, + "grad_norm": 14.336059096837559, + "learning_rate": 2e-06, + "loss": 0.3388, + "step": 1523 + }, + { + "epoch": 0.35355527201020764, + "grad_norm": 11.160440481969392, + "learning_rate": 2e-06, + "loss": 0.2824, + "step": 1524 + }, + { + "epoch": 0.35378726365850827, + "grad_norm": 23.929660422992608, + "learning_rate": 2e-06, + "loss": 0.311, + "step": 1525 + }, + { + "epoch": 0.35401925530680894, + "grad_norm": 23.916293169824055, + "learning_rate": 2e-06, + "loss": 0.4163, + "step": 1526 + }, + { + "epoch": 0.3542512469551096, + "grad_norm": 20.043119923355555, + "learning_rate": 2e-06, + "loss": 0.329, + "step": 1527 + }, + { + "epoch": 0.3544832386034103, + "grad_norm": 12.250980750726233, + "learning_rate": 2e-06, + "loss": 0.2934, + "step": 1528 + }, + { + "epoch": 0.3547152302517109, + "grad_norm": 14.48483674657129, + "learning_rate": 2e-06, + "loss": 0.3425, + "step": 1529 + }, + { + "epoch": 0.3549472219000116, + "grad_norm": 11.553582803477648, + "learning_rate": 2e-06, + "loss": 0.3147, + "step": 1530 + }, + { + "epoch": 0.3551792135483123, + "grad_norm": 11.560989065944293, + "learning_rate": 2e-06, + "loss": 0.2615, + "step": 1531 + }, + { + "epoch": 0.3554112051966129, + "grad_norm": 11.17415333216845, + "learning_rate": 2e-06, + "loss": 0.3311, + "step": 1532 + }, + { + "epoch": 0.3556431968449136, + "grad_norm": 12.185737847018876, + "learning_rate": 2e-06, + "loss": 0.2516, + "step": 1533 + }, + { + "epoch": 0.35587518849321426, + "grad_norm": 8.191288874419087, + "learning_rate": 2e-06, + "loss": 0.1495, + "step": 1534 + }, + { + "epoch": 0.3561071801415149, + "grad_norm": 10.949054148249836, + "learning_rate": 2e-06, + "loss": 0.343, + "step": 1535 + }, + { + "epoch": 0.35633917178981556, + "grad_norm": 12.278173030546192, + "learning_rate": 2e-06, + "loss": 0.3576, + "step": 1536 + }, + { + "epoch": 0.35657116343811623, + "grad_norm": 13.977798272440518, + "learning_rate": 2e-06, + "loss": 0.2952, + "step": 1537 + }, + { + "epoch": 0.3568031550864169, + "grad_norm": 11.83754990738849, + "learning_rate": 2e-06, + "loss": 0.2399, + "step": 1538 + }, + { + "epoch": 0.35703514673471753, + "grad_norm": 16.49812571960583, + "learning_rate": 2e-06, + "loss": 0.3304, + "step": 1539 + }, + { + "epoch": 0.3572671383830182, + "grad_norm": 17.13588560605484, + "learning_rate": 2e-06, + "loss": 0.3186, + "step": 1540 + }, + { + "epoch": 0.3574991300313189, + "grad_norm": 18.403814271161163, + "learning_rate": 2e-06, + "loss": 0.2804, + "step": 1541 + }, + { + "epoch": 0.3577311216796195, + "grad_norm": 12.827052629703486, + "learning_rate": 2e-06, + "loss": 0.2458, + "step": 1542 + }, + { + "epoch": 0.3579631133279202, + "grad_norm": 12.760937396611903, + "learning_rate": 2e-06, + "loss": 0.2407, + "step": 1543 + }, + { + "epoch": 0.35819510497622087, + "grad_norm": 16.539137967377687, + "learning_rate": 2e-06, + "loss": 0.383, + "step": 1544 + }, + { + "epoch": 0.3584270966245215, + "grad_norm": 20.039107817355806, + "learning_rate": 2e-06, + "loss": 0.3159, + "step": 1545 + }, + { + "epoch": 0.35865908827282217, + "grad_norm": 13.64428431793954, + "learning_rate": 2e-06, + "loss": 0.2665, + "step": 1546 + }, + { + "epoch": 0.35889107992112285, + "grad_norm": 12.42057044851482, + "learning_rate": 2e-06, + "loss": 0.3102, + "step": 1547 + }, + { + "epoch": 0.3591230715694235, + "grad_norm": 22.523044251326006, + "learning_rate": 2e-06, + "loss": 0.2732, + "step": 1548 + }, + { + "epoch": 0.35935506321772415, + "grad_norm": 16.928365338505053, + "learning_rate": 2e-06, + "loss": 0.3164, + "step": 1549 + }, + { + "epoch": 0.3595870548660248, + "grad_norm": 18.232182331487515, + "learning_rate": 2e-06, + "loss": 0.2678, + "step": 1550 + }, + { + "epoch": 0.3598190465143255, + "grad_norm": 21.978434417091222, + "learning_rate": 2e-06, + "loss": 0.3644, + "step": 1551 + }, + { + "epoch": 0.3600510381626261, + "grad_norm": 7.471359625072732, + "learning_rate": 2e-06, + "loss": 0.273, + "step": 1552 + }, + { + "epoch": 0.3602830298109268, + "grad_norm": 19.050119083208457, + "learning_rate": 2e-06, + "loss": 0.2834, + "step": 1553 + }, + { + "epoch": 0.3605150214592275, + "grad_norm": 8.666454101947402, + "learning_rate": 2e-06, + "loss": 0.2591, + "step": 1554 + }, + { + "epoch": 0.3607470131075281, + "grad_norm": 14.71395010544609, + "learning_rate": 2e-06, + "loss": 0.329, + "step": 1555 + }, + { + "epoch": 0.3609790047558288, + "grad_norm": 15.222939559224029, + "learning_rate": 2e-06, + "loss": 0.2663, + "step": 1556 + }, + { + "epoch": 0.36121099640412946, + "grad_norm": 16.19223554787744, + "learning_rate": 2e-06, + "loss": 0.4093, + "step": 1557 + }, + { + "epoch": 0.36144298805243014, + "grad_norm": 12.171363940214102, + "learning_rate": 2e-06, + "loss": 0.2952, + "step": 1558 + }, + { + "epoch": 0.36167497970073076, + "grad_norm": 15.245125862878577, + "learning_rate": 2e-06, + "loss": 0.4228, + "step": 1559 + }, + { + "epoch": 0.36190697134903144, + "grad_norm": 19.04913330454029, + "learning_rate": 2e-06, + "loss": 0.3924, + "step": 1560 + }, + { + "epoch": 0.3621389629973321, + "grad_norm": 15.994627326757682, + "learning_rate": 2e-06, + "loss": 0.3107, + "step": 1561 + }, + { + "epoch": 0.36237095464563274, + "grad_norm": 21.858725247811957, + "learning_rate": 2e-06, + "loss": 0.4131, + "step": 1562 + }, + { + "epoch": 0.3626029462939334, + "grad_norm": 12.86548452333493, + "learning_rate": 2e-06, + "loss": 0.3945, + "step": 1563 + }, + { + "epoch": 0.3628349379422341, + "grad_norm": 11.10614066767929, + "learning_rate": 2e-06, + "loss": 0.2737, + "step": 1564 + }, + { + "epoch": 0.3630669295905347, + "grad_norm": 7.711392585957257, + "learning_rate": 2e-06, + "loss": 0.2551, + "step": 1565 + }, + { + "epoch": 0.3632989212388354, + "grad_norm": 13.634176534870537, + "learning_rate": 2e-06, + "loss": 0.2395, + "step": 1566 + }, + { + "epoch": 0.36353091288713607, + "grad_norm": 10.852789631053897, + "learning_rate": 2e-06, + "loss": 0.2562, + "step": 1567 + }, + { + "epoch": 0.36376290453543675, + "grad_norm": 10.476346649167414, + "learning_rate": 2e-06, + "loss": 0.2577, + "step": 1568 + }, + { + "epoch": 0.36399489618373737, + "grad_norm": 15.507037512786859, + "learning_rate": 2e-06, + "loss": 0.2622, + "step": 1569 + }, + { + "epoch": 0.36422688783203805, + "grad_norm": 19.351470038814583, + "learning_rate": 2e-06, + "loss": 0.4442, + "step": 1570 + }, + { + "epoch": 0.3644588794803387, + "grad_norm": 27.313074395348565, + "learning_rate": 2e-06, + "loss": 0.4793, + "step": 1571 + }, + { + "epoch": 0.36469087112863935, + "grad_norm": 16.532665789399566, + "learning_rate": 2e-06, + "loss": 0.3345, + "step": 1572 + }, + { + "epoch": 0.36492286277694, + "grad_norm": 13.405927824510398, + "learning_rate": 2e-06, + "loss": 0.3608, + "step": 1573 + }, + { + "epoch": 0.3651548544252407, + "grad_norm": 10.875804546859849, + "learning_rate": 2e-06, + "loss": 0.3128, + "step": 1574 + }, + { + "epoch": 0.3653868460735413, + "grad_norm": 18.823706038628313, + "learning_rate": 2e-06, + "loss": 0.3562, + "step": 1575 + }, + { + "epoch": 0.365618837721842, + "grad_norm": 18.61416232947668, + "learning_rate": 2e-06, + "loss": 0.2618, + "step": 1576 + }, + { + "epoch": 0.3658508293701427, + "grad_norm": 24.02212692496374, + "learning_rate": 2e-06, + "loss": 0.3201, + "step": 1577 + }, + { + "epoch": 0.36608282101844336, + "grad_norm": 10.800817628224939, + "learning_rate": 2e-06, + "loss": 0.2754, + "step": 1578 + }, + { + "epoch": 0.366314812666744, + "grad_norm": 9.361010948025115, + "learning_rate": 2e-06, + "loss": 0.2917, + "step": 1579 + }, + { + "epoch": 0.36654680431504466, + "grad_norm": 24.186205696257296, + "learning_rate": 2e-06, + "loss": 0.3642, + "step": 1580 + }, + { + "epoch": 0.36677879596334534, + "grad_norm": 18.65446560349194, + "learning_rate": 2e-06, + "loss": 0.3754, + "step": 1581 + }, + { + "epoch": 0.36701078761164596, + "grad_norm": 24.572827718997146, + "learning_rate": 2e-06, + "loss": 0.3356, + "step": 1582 + }, + { + "epoch": 0.36724277925994664, + "grad_norm": 25.30961911705546, + "learning_rate": 2e-06, + "loss": 0.3706, + "step": 1583 + }, + { + "epoch": 0.3674747709082473, + "grad_norm": 19.183888435579004, + "learning_rate": 2e-06, + "loss": 0.391, + "step": 1584 + }, + { + "epoch": 0.36770676255654794, + "grad_norm": 16.32160541185497, + "learning_rate": 2e-06, + "loss": 0.2556, + "step": 1585 + }, + { + "epoch": 0.3679387542048486, + "grad_norm": 24.10960800511522, + "learning_rate": 2e-06, + "loss": 0.3104, + "step": 1586 + }, + { + "epoch": 0.3681707458531493, + "grad_norm": 15.480985134797434, + "learning_rate": 2e-06, + "loss": 0.3862, + "step": 1587 + }, + { + "epoch": 0.36840273750145, + "grad_norm": 12.142741471902808, + "learning_rate": 2e-06, + "loss": 0.3515, + "step": 1588 + }, + { + "epoch": 0.3686347291497506, + "grad_norm": 11.702276399480478, + "learning_rate": 2e-06, + "loss": 0.311, + "step": 1589 + }, + { + "epoch": 0.3688667207980513, + "grad_norm": 16.70719405848102, + "learning_rate": 2e-06, + "loss": 0.3269, + "step": 1590 + }, + { + "epoch": 0.36909871244635195, + "grad_norm": 12.12225751120716, + "learning_rate": 2e-06, + "loss": 0.3404, + "step": 1591 + }, + { + "epoch": 0.3693307040946526, + "grad_norm": 11.512749205669706, + "learning_rate": 2e-06, + "loss": 0.2705, + "step": 1592 + }, + { + "epoch": 0.36956269574295325, + "grad_norm": 7.127614749150153, + "learning_rate": 2e-06, + "loss": 0.2072, + "step": 1593 + }, + { + "epoch": 0.36979468739125393, + "grad_norm": 23.513456031877478, + "learning_rate": 2e-06, + "loss": 0.3113, + "step": 1594 + }, + { + "epoch": 0.37002667903955455, + "grad_norm": 8.289546448002808, + "learning_rate": 2e-06, + "loss": 0.3195, + "step": 1595 + }, + { + "epoch": 0.37025867068785523, + "grad_norm": 15.104930483599123, + "learning_rate": 2e-06, + "loss": 0.2434, + "step": 1596 + }, + { + "epoch": 0.3704906623361559, + "grad_norm": 17.009568657067756, + "learning_rate": 2e-06, + "loss": 0.2678, + "step": 1597 + }, + { + "epoch": 0.3707226539844566, + "grad_norm": 17.809610643285815, + "learning_rate": 2e-06, + "loss": 0.387, + "step": 1598 + }, + { + "epoch": 0.3709546456327572, + "grad_norm": 20.129437927118126, + "learning_rate": 2e-06, + "loss": 0.3004, + "step": 1599 + }, + { + "epoch": 0.3711866372810579, + "grad_norm": 15.245701649509975, + "learning_rate": 2e-06, + "loss": 0.3453, + "step": 1600 + }, + { + "epoch": 0.37141862892935856, + "grad_norm": 12.497912883042815, + "learning_rate": 2e-06, + "loss": 0.2064, + "step": 1601 + }, + { + "epoch": 0.3716506205776592, + "grad_norm": 12.763218076540506, + "learning_rate": 2e-06, + "loss": 0.3223, + "step": 1602 + }, + { + "epoch": 0.37188261222595986, + "grad_norm": 14.76310202808672, + "learning_rate": 2e-06, + "loss": 0.4025, + "step": 1603 + }, + { + "epoch": 0.37211460387426054, + "grad_norm": 10.854093941092742, + "learning_rate": 2e-06, + "loss": 0.2103, + "step": 1604 + }, + { + "epoch": 0.37234659552256116, + "grad_norm": 20.010957585816257, + "learning_rate": 2e-06, + "loss": 0.502, + "step": 1605 + }, + { + "epoch": 0.37257858717086184, + "grad_norm": 20.461827465123676, + "learning_rate": 2e-06, + "loss": 0.3678, + "step": 1606 + }, + { + "epoch": 0.3728105788191625, + "grad_norm": 12.35407993340569, + "learning_rate": 2e-06, + "loss": 0.2872, + "step": 1607 + }, + { + "epoch": 0.3730425704674632, + "grad_norm": 16.69775007977456, + "learning_rate": 2e-06, + "loss": 0.3112, + "step": 1608 + }, + { + "epoch": 0.3732745621157638, + "grad_norm": 16.24608996111506, + "learning_rate": 2e-06, + "loss": 0.3128, + "step": 1609 + }, + { + "epoch": 0.3735065537640645, + "grad_norm": 21.652530385792378, + "learning_rate": 2e-06, + "loss": 0.31, + "step": 1610 + }, + { + "epoch": 0.3737385454123652, + "grad_norm": 12.622235889749975, + "learning_rate": 2e-06, + "loss": 0.3509, + "step": 1611 + }, + { + "epoch": 0.3739705370606658, + "grad_norm": 17.157816059926006, + "learning_rate": 2e-06, + "loss": 0.3221, + "step": 1612 + }, + { + "epoch": 0.3742025287089665, + "grad_norm": 9.928656920548773, + "learning_rate": 2e-06, + "loss": 0.2486, + "step": 1613 + }, + { + "epoch": 0.37443452035726715, + "grad_norm": 7.878328610402713, + "learning_rate": 2e-06, + "loss": 0.2231, + "step": 1614 + }, + { + "epoch": 0.3746665120055678, + "grad_norm": 21.179210126353382, + "learning_rate": 2e-06, + "loss": 0.3815, + "step": 1615 + }, + { + "epoch": 0.37489850365386845, + "grad_norm": 19.483475307452156, + "learning_rate": 2e-06, + "loss": 0.2831, + "step": 1616 + }, + { + "epoch": 0.37513049530216913, + "grad_norm": 18.918324334501875, + "learning_rate": 2e-06, + "loss": 0.3439, + "step": 1617 + }, + { + "epoch": 0.3753624869504698, + "grad_norm": 13.974426869543807, + "learning_rate": 2e-06, + "loss": 0.3879, + "step": 1618 + }, + { + "epoch": 0.37559447859877043, + "grad_norm": 26.12250512557313, + "learning_rate": 2e-06, + "loss": 0.4059, + "step": 1619 + }, + { + "epoch": 0.3758264702470711, + "grad_norm": 16.61178037941796, + "learning_rate": 2e-06, + "loss": 0.4071, + "step": 1620 + }, + { + "epoch": 0.3760584618953718, + "grad_norm": 11.416733585531539, + "learning_rate": 2e-06, + "loss": 0.2167, + "step": 1621 + }, + { + "epoch": 0.3762904535436724, + "grad_norm": 13.319214185114532, + "learning_rate": 2e-06, + "loss": 0.3348, + "step": 1622 + }, + { + "epoch": 0.3765224451919731, + "grad_norm": 13.655422759429916, + "learning_rate": 2e-06, + "loss": 0.389, + "step": 1623 + }, + { + "epoch": 0.37675443684027377, + "grad_norm": 12.053182330123681, + "learning_rate": 2e-06, + "loss": 0.2747, + "step": 1624 + }, + { + "epoch": 0.3769864284885744, + "grad_norm": 18.83496077772956, + "learning_rate": 2e-06, + "loss": 0.3359, + "step": 1625 + }, + { + "epoch": 0.37721842013687507, + "grad_norm": 17.905370457467995, + "learning_rate": 2e-06, + "loss": 0.3528, + "step": 1626 + }, + { + "epoch": 0.37745041178517574, + "grad_norm": 17.468656650631704, + "learning_rate": 2e-06, + "loss": 0.2534, + "step": 1627 + }, + { + "epoch": 0.3776824034334764, + "grad_norm": 23.713256640121145, + "learning_rate": 2e-06, + "loss": 0.2424, + "step": 1628 + }, + { + "epoch": 0.37791439508177704, + "grad_norm": 11.843935984726398, + "learning_rate": 2e-06, + "loss": 0.3334, + "step": 1629 + }, + { + "epoch": 0.3781463867300777, + "grad_norm": 16.972846583250554, + "learning_rate": 2e-06, + "loss": 0.3979, + "step": 1630 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 16.37480579398604, + "learning_rate": 2e-06, + "loss": 0.3796, + "step": 1631 + }, + { + "epoch": 0.378610370026679, + "grad_norm": 19.360782354701076, + "learning_rate": 2e-06, + "loss": 0.3681, + "step": 1632 + }, + { + "epoch": 0.3788423616749797, + "grad_norm": 10.154342633825937, + "learning_rate": 2e-06, + "loss": 0.2564, + "step": 1633 + }, + { + "epoch": 0.3790743533232804, + "grad_norm": 22.689267404415812, + "learning_rate": 2e-06, + "loss": 0.3216, + "step": 1634 + }, + { + "epoch": 0.379306344971581, + "grad_norm": 22.7895374079743, + "learning_rate": 2e-06, + "loss": 0.3923, + "step": 1635 + }, + { + "epoch": 0.3795383366198817, + "grad_norm": 23.763833374699864, + "learning_rate": 2e-06, + "loss": 0.3587, + "step": 1636 + }, + { + "epoch": 0.37977032826818236, + "grad_norm": 9.367475197108027, + "learning_rate": 2e-06, + "loss": 0.2917, + "step": 1637 + }, + { + "epoch": 0.38000231991648303, + "grad_norm": 13.235342399140125, + "learning_rate": 2e-06, + "loss": 0.242, + "step": 1638 + }, + { + "epoch": 0.38023431156478366, + "grad_norm": 16.83143948537525, + "learning_rate": 2e-06, + "loss": 0.3111, + "step": 1639 + }, + { + "epoch": 0.38046630321308433, + "grad_norm": 13.590884945633038, + "learning_rate": 2e-06, + "loss": 0.3164, + "step": 1640 + }, + { + "epoch": 0.380698294861385, + "grad_norm": 9.172619097320478, + "learning_rate": 2e-06, + "loss": 0.2266, + "step": 1641 + }, + { + "epoch": 0.38093028650968563, + "grad_norm": 35.29786602689293, + "learning_rate": 2e-06, + "loss": 0.405, + "step": 1642 + }, + { + "epoch": 0.3811622781579863, + "grad_norm": 21.620132837757843, + "learning_rate": 2e-06, + "loss": 0.3185, + "step": 1643 + }, + { + "epoch": 0.381394269806287, + "grad_norm": 15.970781077578724, + "learning_rate": 2e-06, + "loss": 0.2512, + "step": 1644 + }, + { + "epoch": 0.3816262614545876, + "grad_norm": 14.70064658110598, + "learning_rate": 2e-06, + "loss": 0.2931, + "step": 1645 + }, + { + "epoch": 0.3818582531028883, + "grad_norm": 13.887474908567368, + "learning_rate": 2e-06, + "loss": 0.2872, + "step": 1646 + }, + { + "epoch": 0.38209024475118897, + "grad_norm": 19.884616625123037, + "learning_rate": 2e-06, + "loss": 0.4227, + "step": 1647 + }, + { + "epoch": 0.38232223639948965, + "grad_norm": 15.302936516623225, + "learning_rate": 2e-06, + "loss": 0.3547, + "step": 1648 + }, + { + "epoch": 0.38255422804779027, + "grad_norm": 14.32883875962087, + "learning_rate": 2e-06, + "loss": 0.2966, + "step": 1649 + }, + { + "epoch": 0.38278621969609095, + "grad_norm": 11.768538296099742, + "learning_rate": 2e-06, + "loss": 0.2556, + "step": 1650 + }, + { + "epoch": 0.3830182113443916, + "grad_norm": 22.138118983802702, + "learning_rate": 2e-06, + "loss": 0.3406, + "step": 1651 + }, + { + "epoch": 0.38325020299269225, + "grad_norm": 13.11426253348709, + "learning_rate": 2e-06, + "loss": 0.2848, + "step": 1652 + }, + { + "epoch": 0.3834821946409929, + "grad_norm": 21.608807534604892, + "learning_rate": 2e-06, + "loss": 0.2958, + "step": 1653 + }, + { + "epoch": 0.3837141862892936, + "grad_norm": 9.369420646181656, + "learning_rate": 2e-06, + "loss": 0.3086, + "step": 1654 + }, + { + "epoch": 0.3839461779375942, + "grad_norm": 21.073996636387196, + "learning_rate": 2e-06, + "loss": 0.2979, + "step": 1655 + }, + { + "epoch": 0.3841781695858949, + "grad_norm": 20.865881418922612, + "learning_rate": 2e-06, + "loss": 0.2507, + "step": 1656 + }, + { + "epoch": 0.3844101612341956, + "grad_norm": 18.164322650255222, + "learning_rate": 2e-06, + "loss": 0.2604, + "step": 1657 + }, + { + "epoch": 0.38464215288249626, + "grad_norm": 10.166821633179827, + "learning_rate": 2e-06, + "loss": 0.2436, + "step": 1658 + }, + { + "epoch": 0.3848741445307969, + "grad_norm": 20.213626738684884, + "learning_rate": 2e-06, + "loss": 0.3739, + "step": 1659 + }, + { + "epoch": 0.38510613617909756, + "grad_norm": 21.43928377483437, + "learning_rate": 2e-06, + "loss": 0.3391, + "step": 1660 + }, + { + "epoch": 0.38533812782739824, + "grad_norm": 12.13247627608106, + "learning_rate": 2e-06, + "loss": 0.3681, + "step": 1661 + }, + { + "epoch": 0.38557011947569886, + "grad_norm": 15.29816035374616, + "learning_rate": 2e-06, + "loss": 0.2788, + "step": 1662 + }, + { + "epoch": 0.38580211112399954, + "grad_norm": 31.623005766886344, + "learning_rate": 2e-06, + "loss": 0.2836, + "step": 1663 + }, + { + "epoch": 0.3860341027723002, + "grad_norm": 11.67738748278718, + "learning_rate": 2e-06, + "loss": 0.2403, + "step": 1664 + }, + { + "epoch": 0.38626609442060084, + "grad_norm": 13.416501155518167, + "learning_rate": 2e-06, + "loss": 0.2573, + "step": 1665 + }, + { + "epoch": 0.3864980860689015, + "grad_norm": 11.920384086621059, + "learning_rate": 2e-06, + "loss": 0.2362, + "step": 1666 + }, + { + "epoch": 0.3867300777172022, + "grad_norm": 18.72931889505314, + "learning_rate": 2e-06, + "loss": 0.3919, + "step": 1667 + }, + { + "epoch": 0.3869620693655028, + "grad_norm": 8.137213985487316, + "learning_rate": 2e-06, + "loss": 0.2741, + "step": 1668 + }, + { + "epoch": 0.3871940610138035, + "grad_norm": 18.030404210623193, + "learning_rate": 2e-06, + "loss": 0.2867, + "step": 1669 + }, + { + "epoch": 0.38742605266210417, + "grad_norm": 23.792439026114092, + "learning_rate": 2e-06, + "loss": 0.3914, + "step": 1670 + }, + { + "epoch": 0.38765804431040485, + "grad_norm": 16.554147682332868, + "learning_rate": 2e-06, + "loss": 0.398, + "step": 1671 + }, + { + "epoch": 0.38789003595870547, + "grad_norm": 18.15107420377198, + "learning_rate": 2e-06, + "loss": 0.3045, + "step": 1672 + }, + { + "epoch": 0.38812202760700615, + "grad_norm": 16.492763041860865, + "learning_rate": 2e-06, + "loss": 0.3261, + "step": 1673 + }, + { + "epoch": 0.3883540192553068, + "grad_norm": 19.950535800151258, + "learning_rate": 2e-06, + "loss": 0.3572, + "step": 1674 + }, + { + "epoch": 0.38858601090360745, + "grad_norm": 12.332895478452894, + "learning_rate": 2e-06, + "loss": 0.2484, + "step": 1675 + }, + { + "epoch": 0.3888180025519081, + "grad_norm": 18.816090795558736, + "learning_rate": 2e-06, + "loss": 0.4445, + "step": 1676 + }, + { + "epoch": 0.3890499942002088, + "grad_norm": 11.172173233194435, + "learning_rate": 2e-06, + "loss": 0.2278, + "step": 1677 + }, + { + "epoch": 0.3892819858485094, + "grad_norm": 22.84534502367574, + "learning_rate": 2e-06, + "loss": 0.4344, + "step": 1678 + }, + { + "epoch": 0.3895139774968101, + "grad_norm": 9.327583213833936, + "learning_rate": 2e-06, + "loss": 0.1992, + "step": 1679 + }, + { + "epoch": 0.3897459691451108, + "grad_norm": 11.737114391340048, + "learning_rate": 2e-06, + "loss": 0.2423, + "step": 1680 + }, + { + "epoch": 0.38997796079341146, + "grad_norm": 12.417614516132264, + "learning_rate": 2e-06, + "loss": 0.3601, + "step": 1681 + }, + { + "epoch": 0.3902099524417121, + "grad_norm": 20.0655001781096, + "learning_rate": 2e-06, + "loss": 0.3668, + "step": 1682 + }, + { + "epoch": 0.39044194409001276, + "grad_norm": 11.796345412021118, + "learning_rate": 2e-06, + "loss": 0.319, + "step": 1683 + }, + { + "epoch": 0.39067393573831344, + "grad_norm": 17.12049151202, + "learning_rate": 2e-06, + "loss": 0.3473, + "step": 1684 + }, + { + "epoch": 0.39090592738661406, + "grad_norm": 13.345906672000439, + "learning_rate": 2e-06, + "loss": 0.2233, + "step": 1685 + }, + { + "epoch": 0.39113791903491474, + "grad_norm": 13.240562612933, + "learning_rate": 2e-06, + "loss": 0.3004, + "step": 1686 + }, + { + "epoch": 0.3913699106832154, + "grad_norm": 14.035831267305534, + "learning_rate": 2e-06, + "loss": 0.3141, + "step": 1687 + }, + { + "epoch": 0.39160190233151604, + "grad_norm": 13.460145672208174, + "learning_rate": 2e-06, + "loss": 0.2406, + "step": 1688 + }, + { + "epoch": 0.3918338939798167, + "grad_norm": 13.097576569454555, + "learning_rate": 2e-06, + "loss": 0.2964, + "step": 1689 + }, + { + "epoch": 0.3920658856281174, + "grad_norm": 7.234995154351904, + "learning_rate": 2e-06, + "loss": 0.3318, + "step": 1690 + }, + { + "epoch": 0.39229787727641807, + "grad_norm": 21.42498802164779, + "learning_rate": 2e-06, + "loss": 0.3629, + "step": 1691 + }, + { + "epoch": 0.3925298689247187, + "grad_norm": 29.42585971810702, + "learning_rate": 2e-06, + "loss": 0.398, + "step": 1692 + }, + { + "epoch": 0.3927618605730194, + "grad_norm": 14.67197715062144, + "learning_rate": 2e-06, + "loss": 0.2969, + "step": 1693 + }, + { + "epoch": 0.39299385222132005, + "grad_norm": 26.94553900507181, + "learning_rate": 2e-06, + "loss": 0.4015, + "step": 1694 + }, + { + "epoch": 0.3932258438696207, + "grad_norm": 14.144027051794579, + "learning_rate": 2e-06, + "loss": 0.3245, + "step": 1695 + }, + { + "epoch": 0.39345783551792135, + "grad_norm": 13.121990072480953, + "learning_rate": 2e-06, + "loss": 0.3125, + "step": 1696 + }, + { + "epoch": 0.39368982716622203, + "grad_norm": 14.27997838244899, + "learning_rate": 2e-06, + "loss": 0.34, + "step": 1697 + }, + { + "epoch": 0.39392181881452265, + "grad_norm": 13.623194592867334, + "learning_rate": 2e-06, + "loss": 0.2952, + "step": 1698 + }, + { + "epoch": 0.39415381046282333, + "grad_norm": 11.2125716282906, + "learning_rate": 2e-06, + "loss": 0.2761, + "step": 1699 + }, + { + "epoch": 0.394385802111124, + "grad_norm": 12.100281502825506, + "learning_rate": 2e-06, + "loss": 0.3967, + "step": 1700 + }, + { + "epoch": 0.3946177937594247, + "grad_norm": 16.621454147354402, + "learning_rate": 2e-06, + "loss": 0.3108, + "step": 1701 + }, + { + "epoch": 0.3948497854077253, + "grad_norm": 10.874602148419658, + "learning_rate": 2e-06, + "loss": 0.288, + "step": 1702 + }, + { + "epoch": 0.395081777056026, + "grad_norm": 9.863341650478107, + "learning_rate": 2e-06, + "loss": 0.2274, + "step": 1703 + }, + { + "epoch": 0.39531376870432666, + "grad_norm": 12.068471550080465, + "learning_rate": 2e-06, + "loss": 0.2886, + "step": 1704 + }, + { + "epoch": 0.3955457603526273, + "grad_norm": 12.144258760523668, + "learning_rate": 2e-06, + "loss": 0.2762, + "step": 1705 + }, + { + "epoch": 0.39577775200092796, + "grad_norm": 14.432399080933754, + "learning_rate": 2e-06, + "loss": 0.378, + "step": 1706 + }, + { + "epoch": 0.39600974364922864, + "grad_norm": 21.47189822882105, + "learning_rate": 2e-06, + "loss": 0.379, + "step": 1707 + }, + { + "epoch": 0.39624173529752926, + "grad_norm": 14.139723328501157, + "learning_rate": 2e-06, + "loss": 0.2892, + "step": 1708 + }, + { + "epoch": 0.39647372694582994, + "grad_norm": 12.874557329749697, + "learning_rate": 2e-06, + "loss": 0.25, + "step": 1709 + }, + { + "epoch": 0.3967057185941306, + "grad_norm": 19.74261744906016, + "learning_rate": 2e-06, + "loss": 0.2884, + "step": 1710 + }, + { + "epoch": 0.3969377102424313, + "grad_norm": 13.526740825328071, + "learning_rate": 2e-06, + "loss": 0.2469, + "step": 1711 + }, + { + "epoch": 0.3971697018907319, + "grad_norm": 17.29573806918465, + "learning_rate": 2e-06, + "loss": 0.3666, + "step": 1712 + }, + { + "epoch": 0.3974016935390326, + "grad_norm": 22.00074367090561, + "learning_rate": 2e-06, + "loss": 0.3315, + "step": 1713 + }, + { + "epoch": 0.3976336851873333, + "grad_norm": 12.216104847810138, + "learning_rate": 2e-06, + "loss": 0.3578, + "step": 1714 + }, + { + "epoch": 0.3978656768356339, + "grad_norm": 33.55844174882175, + "learning_rate": 2e-06, + "loss": 0.4602, + "step": 1715 + }, + { + "epoch": 0.3980976684839346, + "grad_norm": 6.798634249805131, + "learning_rate": 2e-06, + "loss": 0.178, + "step": 1716 + }, + { + "epoch": 0.39832966013223525, + "grad_norm": 17.31017228664811, + "learning_rate": 2e-06, + "loss": 0.202, + "step": 1717 + }, + { + "epoch": 0.3985616517805359, + "grad_norm": 15.660089205257279, + "learning_rate": 2e-06, + "loss": 0.3996, + "step": 1718 + }, + { + "epoch": 0.39879364342883655, + "grad_norm": 21.208381895451023, + "learning_rate": 2e-06, + "loss": 0.3099, + "step": 1719 + }, + { + "epoch": 0.39902563507713723, + "grad_norm": 14.960757327235939, + "learning_rate": 2e-06, + "loss": 0.2789, + "step": 1720 + }, + { + "epoch": 0.3992576267254379, + "grad_norm": 30.451286730187253, + "learning_rate": 2e-06, + "loss": 0.3973, + "step": 1721 + }, + { + "epoch": 0.39948961837373853, + "grad_norm": 29.620181933264263, + "learning_rate": 2e-06, + "loss": 0.3298, + "step": 1722 + }, + { + "epoch": 0.3997216100220392, + "grad_norm": 13.07179496764708, + "learning_rate": 2e-06, + "loss": 0.3204, + "step": 1723 + }, + { + "epoch": 0.3999536016703399, + "grad_norm": 31.938037556895573, + "learning_rate": 2e-06, + "loss": 0.4647, + "step": 1724 + }, + { + "epoch": 0.4001855933186405, + "grad_norm": 11.521405989449157, + "learning_rate": 2e-06, + "loss": 0.3678, + "step": 1725 + }, + { + "epoch": 0.4004175849669412, + "grad_norm": 13.366109839116676, + "learning_rate": 2e-06, + "loss": 0.4173, + "step": 1726 + }, + { + "epoch": 0.40064957661524186, + "grad_norm": 17.831226552760167, + "learning_rate": 2e-06, + "loss": 0.3475, + "step": 1727 + }, + { + "epoch": 0.4008815682635425, + "grad_norm": 16.687838976166226, + "learning_rate": 2e-06, + "loss": 0.1913, + "step": 1728 + }, + { + "epoch": 0.40111355991184316, + "grad_norm": 8.855862056448904, + "learning_rate": 2e-06, + "loss": 0.2669, + "step": 1729 + }, + { + "epoch": 0.40134555156014384, + "grad_norm": 20.294045592533415, + "learning_rate": 2e-06, + "loss": 0.4138, + "step": 1730 + }, + { + "epoch": 0.4015775432084445, + "grad_norm": 20.282307181816307, + "learning_rate": 2e-06, + "loss": 0.3926, + "step": 1731 + }, + { + "epoch": 0.40180953485674514, + "grad_norm": 17.150362589886832, + "learning_rate": 2e-06, + "loss": 0.1616, + "step": 1732 + }, + { + "epoch": 0.4020415265050458, + "grad_norm": 16.07804665946086, + "learning_rate": 2e-06, + "loss": 0.2544, + "step": 1733 + }, + { + "epoch": 0.4022735181533465, + "grad_norm": 15.330586830512058, + "learning_rate": 2e-06, + "loss": 0.3177, + "step": 1734 + }, + { + "epoch": 0.4025055098016471, + "grad_norm": 12.730645638537759, + "learning_rate": 2e-06, + "loss": 0.303, + "step": 1735 + }, + { + "epoch": 0.4027375014499478, + "grad_norm": 16.595958589744118, + "learning_rate": 2e-06, + "loss": 0.3585, + "step": 1736 + }, + { + "epoch": 0.4029694930982485, + "grad_norm": 8.706544673603664, + "learning_rate": 2e-06, + "loss": 0.311, + "step": 1737 + }, + { + "epoch": 0.4032014847465491, + "grad_norm": 16.090059723709984, + "learning_rate": 2e-06, + "loss": 0.2455, + "step": 1738 + }, + { + "epoch": 0.4034334763948498, + "grad_norm": 8.675772602038379, + "learning_rate": 2e-06, + "loss": 0.1731, + "step": 1739 + }, + { + "epoch": 0.40366546804315045, + "grad_norm": 18.296364172981882, + "learning_rate": 2e-06, + "loss": 0.3908, + "step": 1740 + }, + { + "epoch": 0.40389745969145113, + "grad_norm": 16.115322502756978, + "learning_rate": 2e-06, + "loss": 0.2754, + "step": 1741 + }, + { + "epoch": 0.40412945133975176, + "grad_norm": 20.310432071130496, + "learning_rate": 2e-06, + "loss": 0.3214, + "step": 1742 + }, + { + "epoch": 0.40436144298805243, + "grad_norm": 15.865946919832218, + "learning_rate": 2e-06, + "loss": 0.3414, + "step": 1743 + }, + { + "epoch": 0.4045934346363531, + "grad_norm": 14.325433367552268, + "learning_rate": 2e-06, + "loss": 0.3477, + "step": 1744 + }, + { + "epoch": 0.40482542628465373, + "grad_norm": 16.91522871754405, + "learning_rate": 2e-06, + "loss": 0.2881, + "step": 1745 + }, + { + "epoch": 0.4050574179329544, + "grad_norm": 12.156285411915164, + "learning_rate": 2e-06, + "loss": 0.2757, + "step": 1746 + }, + { + "epoch": 0.4052894095812551, + "grad_norm": 15.700298066784649, + "learning_rate": 2e-06, + "loss": 0.2911, + "step": 1747 + }, + { + "epoch": 0.4055214012295557, + "grad_norm": 14.070074963571189, + "learning_rate": 2e-06, + "loss": 0.2735, + "step": 1748 + }, + { + "epoch": 0.4057533928778564, + "grad_norm": 20.567228387763485, + "learning_rate": 2e-06, + "loss": 0.3966, + "step": 1749 + }, + { + "epoch": 0.40598538452615707, + "grad_norm": 21.568521644903555, + "learning_rate": 2e-06, + "loss": 0.4069, + "step": 1750 + }, + { + "epoch": 0.40621737617445774, + "grad_norm": 24.98305458957807, + "learning_rate": 2e-06, + "loss": 0.3, + "step": 1751 + }, + { + "epoch": 0.40644936782275837, + "grad_norm": 11.182159916834218, + "learning_rate": 2e-06, + "loss": 0.2476, + "step": 1752 + }, + { + "epoch": 0.40668135947105905, + "grad_norm": 17.16773776017756, + "learning_rate": 2e-06, + "loss": 0.2769, + "step": 1753 + }, + { + "epoch": 0.4069133511193597, + "grad_norm": 8.087817209372279, + "learning_rate": 2e-06, + "loss": 0.2333, + "step": 1754 + }, + { + "epoch": 0.40714534276766035, + "grad_norm": 9.737889285495338, + "learning_rate": 2e-06, + "loss": 0.3203, + "step": 1755 + }, + { + "epoch": 0.407377334415961, + "grad_norm": 13.276234387033147, + "learning_rate": 2e-06, + "loss": 0.3975, + "step": 1756 + }, + { + "epoch": 0.4076093260642617, + "grad_norm": 12.833120524941462, + "learning_rate": 2e-06, + "loss": 0.3393, + "step": 1757 + }, + { + "epoch": 0.4078413177125623, + "grad_norm": 10.864122378055994, + "learning_rate": 2e-06, + "loss": 0.213, + "step": 1758 + }, + { + "epoch": 0.408073309360863, + "grad_norm": 17.43651215931162, + "learning_rate": 2e-06, + "loss": 0.2571, + "step": 1759 + }, + { + "epoch": 0.4083053010091637, + "grad_norm": 9.462835101636259, + "learning_rate": 2e-06, + "loss": 0.2363, + "step": 1760 + }, + { + "epoch": 0.40853729265746436, + "grad_norm": 14.60179515190759, + "learning_rate": 2e-06, + "loss": 0.358, + "step": 1761 + }, + { + "epoch": 0.408769284305765, + "grad_norm": 9.35250829968875, + "learning_rate": 2e-06, + "loss": 0.2229, + "step": 1762 + }, + { + "epoch": 0.40900127595406566, + "grad_norm": 16.893224625393078, + "learning_rate": 2e-06, + "loss": 0.2516, + "step": 1763 + }, + { + "epoch": 0.40923326760236634, + "grad_norm": 18.305246064820754, + "learning_rate": 2e-06, + "loss": 0.4163, + "step": 1764 + }, + { + "epoch": 0.40946525925066696, + "grad_norm": 14.924968231349286, + "learning_rate": 2e-06, + "loss": 0.4418, + "step": 1765 + }, + { + "epoch": 0.40969725089896764, + "grad_norm": 19.839080541303428, + "learning_rate": 2e-06, + "loss": 0.298, + "step": 1766 + }, + { + "epoch": 0.4099292425472683, + "grad_norm": 11.601214327833912, + "learning_rate": 2e-06, + "loss": 0.2756, + "step": 1767 + }, + { + "epoch": 0.41016123419556894, + "grad_norm": 10.968524968551193, + "learning_rate": 2e-06, + "loss": 0.276, + "step": 1768 + }, + { + "epoch": 0.4103932258438696, + "grad_norm": 6.826904440619662, + "learning_rate": 2e-06, + "loss": 0.255, + "step": 1769 + }, + { + "epoch": 0.4106252174921703, + "grad_norm": 17.323631423776984, + "learning_rate": 2e-06, + "loss": 0.3229, + "step": 1770 + }, + { + "epoch": 0.41085720914047097, + "grad_norm": 13.28354425099362, + "learning_rate": 2e-06, + "loss": 0.3321, + "step": 1771 + }, + { + "epoch": 0.4110892007887716, + "grad_norm": 10.03252119573095, + "learning_rate": 2e-06, + "loss": 0.308, + "step": 1772 + }, + { + "epoch": 0.41132119243707227, + "grad_norm": 23.554278728776346, + "learning_rate": 2e-06, + "loss": 0.3286, + "step": 1773 + }, + { + "epoch": 0.41155318408537295, + "grad_norm": 13.998688298441905, + "learning_rate": 2e-06, + "loss": 0.2936, + "step": 1774 + }, + { + "epoch": 0.41178517573367357, + "grad_norm": 22.02491607538067, + "learning_rate": 2e-06, + "loss": 0.3387, + "step": 1775 + }, + { + "epoch": 0.41201716738197425, + "grad_norm": 13.99636863261507, + "learning_rate": 2e-06, + "loss": 0.2389, + "step": 1776 + }, + { + "epoch": 0.4122491590302749, + "grad_norm": 14.508439431491093, + "learning_rate": 2e-06, + "loss": 0.2818, + "step": 1777 + }, + { + "epoch": 0.41248115067857555, + "grad_norm": 22.92519999043949, + "learning_rate": 2e-06, + "loss": 0.4278, + "step": 1778 + }, + { + "epoch": 0.4127131423268762, + "grad_norm": 14.599442491978975, + "learning_rate": 2e-06, + "loss": 0.4189, + "step": 1779 + }, + { + "epoch": 0.4129451339751769, + "grad_norm": 20.83978644941764, + "learning_rate": 2e-06, + "loss": 0.2988, + "step": 1780 + }, + { + "epoch": 0.4131771256234776, + "grad_norm": 16.8812419501217, + "learning_rate": 2e-06, + "loss": 0.2913, + "step": 1781 + }, + { + "epoch": 0.4134091172717782, + "grad_norm": 22.735272830579238, + "learning_rate": 2e-06, + "loss": 0.3513, + "step": 1782 + }, + { + "epoch": 0.4136411089200789, + "grad_norm": 9.301251987696757, + "learning_rate": 2e-06, + "loss": 0.305, + "step": 1783 + }, + { + "epoch": 0.41387310056837956, + "grad_norm": 18.72825758685982, + "learning_rate": 2e-06, + "loss": 0.3613, + "step": 1784 + }, + { + "epoch": 0.4141050922166802, + "grad_norm": 33.62199677330242, + "learning_rate": 2e-06, + "loss": 0.3825, + "step": 1785 + }, + { + "epoch": 0.41433708386498086, + "grad_norm": 9.097995766326937, + "learning_rate": 2e-06, + "loss": 0.3069, + "step": 1786 + }, + { + "epoch": 0.41456907551328154, + "grad_norm": 5.898798150314532, + "learning_rate": 2e-06, + "loss": 0.2112, + "step": 1787 + }, + { + "epoch": 0.41480106716158216, + "grad_norm": 16.7077409628407, + "learning_rate": 2e-06, + "loss": 0.2519, + "step": 1788 + }, + { + "epoch": 0.41503305880988284, + "grad_norm": 14.29526419345486, + "learning_rate": 2e-06, + "loss": 0.2502, + "step": 1789 + }, + { + "epoch": 0.4152650504581835, + "grad_norm": 18.288221743113432, + "learning_rate": 2e-06, + "loss": 0.3396, + "step": 1790 + }, + { + "epoch": 0.4154970421064842, + "grad_norm": 8.845939616997287, + "learning_rate": 2e-06, + "loss": 0.2663, + "step": 1791 + }, + { + "epoch": 0.4157290337547848, + "grad_norm": 12.965756776065295, + "learning_rate": 2e-06, + "loss": 0.3467, + "step": 1792 + }, + { + "epoch": 0.4159610254030855, + "grad_norm": 15.914929213419684, + "learning_rate": 2e-06, + "loss": 0.3533, + "step": 1793 + }, + { + "epoch": 0.41619301705138617, + "grad_norm": 14.67947860185235, + "learning_rate": 2e-06, + "loss": 0.337, + "step": 1794 + }, + { + "epoch": 0.4164250086996868, + "grad_norm": 19.295061646529685, + "learning_rate": 2e-06, + "loss": 0.3472, + "step": 1795 + }, + { + "epoch": 0.41665700034798747, + "grad_norm": 8.408293380915646, + "learning_rate": 2e-06, + "loss": 0.2147, + "step": 1796 + }, + { + "epoch": 0.41688899199628815, + "grad_norm": 10.062098294402563, + "learning_rate": 2e-06, + "loss": 0.2418, + "step": 1797 + }, + { + "epoch": 0.41712098364458877, + "grad_norm": 11.463553835251005, + "learning_rate": 2e-06, + "loss": 0.2798, + "step": 1798 + }, + { + "epoch": 0.41735297529288945, + "grad_norm": 18.860146153362425, + "learning_rate": 2e-06, + "loss": 0.3719, + "step": 1799 + }, + { + "epoch": 0.4175849669411901, + "grad_norm": 13.389079107989657, + "learning_rate": 2e-06, + "loss": 0.2659, + "step": 1800 + }, + { + "epoch": 0.4178169585894908, + "grad_norm": 19.569806795295246, + "learning_rate": 2e-06, + "loss": 0.3267, + "step": 1801 + }, + { + "epoch": 0.41804895023779143, + "grad_norm": 12.072125881404723, + "learning_rate": 2e-06, + "loss": 0.2558, + "step": 1802 + }, + { + "epoch": 0.4182809418860921, + "grad_norm": 20.466583975370828, + "learning_rate": 2e-06, + "loss": 0.3409, + "step": 1803 + }, + { + "epoch": 0.4185129335343928, + "grad_norm": 16.68337800654914, + "learning_rate": 2e-06, + "loss": 0.258, + "step": 1804 + }, + { + "epoch": 0.4187449251826934, + "grad_norm": 13.062693644167947, + "learning_rate": 2e-06, + "loss": 0.2986, + "step": 1805 + }, + { + "epoch": 0.4189769168309941, + "grad_norm": 14.091819481275975, + "learning_rate": 2e-06, + "loss": 0.2461, + "step": 1806 + }, + { + "epoch": 0.41920890847929476, + "grad_norm": 18.249361092614027, + "learning_rate": 2e-06, + "loss": 0.3251, + "step": 1807 + }, + { + "epoch": 0.4194409001275954, + "grad_norm": 18.720542222479533, + "learning_rate": 2e-06, + "loss": 0.2487, + "step": 1808 + }, + { + "epoch": 0.41967289177589606, + "grad_norm": 6.835853742556412, + "learning_rate": 2e-06, + "loss": 0.1909, + "step": 1809 + }, + { + "epoch": 0.41990488342419674, + "grad_norm": 21.313152980029592, + "learning_rate": 2e-06, + "loss": 0.2814, + "step": 1810 + }, + { + "epoch": 0.4201368750724974, + "grad_norm": 19.15105976427836, + "learning_rate": 2e-06, + "loss": 0.383, + "step": 1811 + }, + { + "epoch": 0.42036886672079804, + "grad_norm": 11.295461296397983, + "learning_rate": 2e-06, + "loss": 0.2857, + "step": 1812 + }, + { + "epoch": 0.4206008583690987, + "grad_norm": 7.541556412734428, + "learning_rate": 2e-06, + "loss": 0.2447, + "step": 1813 + }, + { + "epoch": 0.4208328500173994, + "grad_norm": 18.291856914356643, + "learning_rate": 2e-06, + "loss": 0.4423, + "step": 1814 + }, + { + "epoch": 0.4210648416657, + "grad_norm": 8.321430630856858, + "learning_rate": 2e-06, + "loss": 0.2377, + "step": 1815 + }, + { + "epoch": 0.4212968333140007, + "grad_norm": 8.615068388386895, + "learning_rate": 2e-06, + "loss": 0.2333, + "step": 1816 + }, + { + "epoch": 0.4215288249623014, + "grad_norm": 18.587239540548637, + "learning_rate": 2e-06, + "loss": 0.4264, + "step": 1817 + }, + { + "epoch": 0.421760816610602, + "grad_norm": 19.258406635069775, + "learning_rate": 2e-06, + "loss": 0.3658, + "step": 1818 + }, + { + "epoch": 0.4219928082589027, + "grad_norm": 4.718414240134861, + "learning_rate": 2e-06, + "loss": 0.1482, + "step": 1819 + }, + { + "epoch": 0.42222479990720335, + "grad_norm": 19.28316315364071, + "learning_rate": 2e-06, + "loss": 0.2905, + "step": 1820 + }, + { + "epoch": 0.422456791555504, + "grad_norm": 12.782558855185856, + "learning_rate": 2e-06, + "loss": 0.3345, + "step": 1821 + }, + { + "epoch": 0.42268878320380465, + "grad_norm": 21.96367145052797, + "learning_rate": 2e-06, + "loss": 0.339, + "step": 1822 + }, + { + "epoch": 0.42292077485210533, + "grad_norm": 15.16140918689698, + "learning_rate": 2e-06, + "loss": 0.3114, + "step": 1823 + }, + { + "epoch": 0.423152766500406, + "grad_norm": 10.192502648774663, + "learning_rate": 2e-06, + "loss": 0.1998, + "step": 1824 + }, + { + "epoch": 0.42338475814870663, + "grad_norm": 16.432028981570053, + "learning_rate": 2e-06, + "loss": 0.2548, + "step": 1825 + }, + { + "epoch": 0.4236167497970073, + "grad_norm": 12.767243541706549, + "learning_rate": 2e-06, + "loss": 0.2338, + "step": 1826 + }, + { + "epoch": 0.423848741445308, + "grad_norm": 23.907500035971236, + "learning_rate": 2e-06, + "loss": 0.3951, + "step": 1827 + }, + { + "epoch": 0.4240807330936086, + "grad_norm": 20.369900827476446, + "learning_rate": 2e-06, + "loss": 0.3829, + "step": 1828 + }, + { + "epoch": 0.4243127247419093, + "grad_norm": 12.49686598283754, + "learning_rate": 2e-06, + "loss": 0.2322, + "step": 1829 + }, + { + "epoch": 0.42454471639020996, + "grad_norm": 17.01708522916691, + "learning_rate": 2e-06, + "loss": 0.291, + "step": 1830 + }, + { + "epoch": 0.4247767080385106, + "grad_norm": 8.448959698676287, + "learning_rate": 2e-06, + "loss": 0.2344, + "step": 1831 + }, + { + "epoch": 0.42500869968681126, + "grad_norm": 13.391004267278555, + "learning_rate": 2e-06, + "loss": 0.2412, + "step": 1832 + }, + { + "epoch": 0.42524069133511194, + "grad_norm": 32.59089985580288, + "learning_rate": 2e-06, + "loss": 0.4774, + "step": 1833 + }, + { + "epoch": 0.4254726829834126, + "grad_norm": 14.407359323922236, + "learning_rate": 2e-06, + "loss": 0.3374, + "step": 1834 + }, + { + "epoch": 0.42570467463171324, + "grad_norm": 12.317732601842495, + "learning_rate": 2e-06, + "loss": 0.2778, + "step": 1835 + }, + { + "epoch": 0.4259366662800139, + "grad_norm": 19.8003091897629, + "learning_rate": 2e-06, + "loss": 0.418, + "step": 1836 + }, + { + "epoch": 0.4261686579283146, + "grad_norm": 11.484951169175416, + "learning_rate": 2e-06, + "loss": 0.217, + "step": 1837 + }, + { + "epoch": 0.4264006495766152, + "grad_norm": 13.45733271265659, + "learning_rate": 2e-06, + "loss": 0.31, + "step": 1838 + }, + { + "epoch": 0.4266326412249159, + "grad_norm": 15.472653333707216, + "learning_rate": 2e-06, + "loss": 0.2737, + "step": 1839 + }, + { + "epoch": 0.4268646328732166, + "grad_norm": 24.25670999804905, + "learning_rate": 2e-06, + "loss": 0.3363, + "step": 1840 + }, + { + "epoch": 0.4270966245215172, + "grad_norm": 19.94460399795094, + "learning_rate": 2e-06, + "loss": 0.3361, + "step": 1841 + }, + { + "epoch": 0.4273286161698179, + "grad_norm": 13.063517439435312, + "learning_rate": 2e-06, + "loss": 0.3786, + "step": 1842 + }, + { + "epoch": 0.42756060781811855, + "grad_norm": 10.281433902687104, + "learning_rate": 2e-06, + "loss": 0.3698, + "step": 1843 + }, + { + "epoch": 0.42779259946641923, + "grad_norm": 14.490270130967572, + "learning_rate": 2e-06, + "loss": 0.3, + "step": 1844 + }, + { + "epoch": 0.42802459111471985, + "grad_norm": 12.412525094210439, + "learning_rate": 2e-06, + "loss": 0.3253, + "step": 1845 + }, + { + "epoch": 0.42825658276302053, + "grad_norm": 20.377294425853872, + "learning_rate": 2e-06, + "loss": 0.3974, + "step": 1846 + }, + { + "epoch": 0.4284885744113212, + "grad_norm": 18.736659411511127, + "learning_rate": 2e-06, + "loss": 0.4384, + "step": 1847 + }, + { + "epoch": 0.42872056605962183, + "grad_norm": 12.415806850795308, + "learning_rate": 2e-06, + "loss": 0.3279, + "step": 1848 + }, + { + "epoch": 0.4289525577079225, + "grad_norm": 23.342252148205862, + "learning_rate": 2e-06, + "loss": 0.2838, + "step": 1849 + }, + { + "epoch": 0.4291845493562232, + "grad_norm": 13.276574562564289, + "learning_rate": 2e-06, + "loss": 0.2406, + "step": 1850 + }, + { + "epoch": 0.4294165410045238, + "grad_norm": 10.567426494535969, + "learning_rate": 2e-06, + "loss": 0.2654, + "step": 1851 + }, + { + "epoch": 0.4296485326528245, + "grad_norm": 8.000618948096822, + "learning_rate": 2e-06, + "loss": 0.2258, + "step": 1852 + }, + { + "epoch": 0.42988052430112517, + "grad_norm": 8.852477043978167, + "learning_rate": 2e-06, + "loss": 0.292, + "step": 1853 + }, + { + "epoch": 0.43011251594942584, + "grad_norm": 8.604882799928584, + "learning_rate": 2e-06, + "loss": 0.2531, + "step": 1854 + }, + { + "epoch": 0.43034450759772647, + "grad_norm": 15.729433265309597, + "learning_rate": 2e-06, + "loss": 0.3744, + "step": 1855 + }, + { + "epoch": 0.43057649924602714, + "grad_norm": 15.22527467684575, + "learning_rate": 2e-06, + "loss": 0.3011, + "step": 1856 + }, + { + "epoch": 0.4308084908943278, + "grad_norm": 14.485139169410346, + "learning_rate": 2e-06, + "loss": 0.2902, + "step": 1857 + }, + { + "epoch": 0.43104048254262844, + "grad_norm": 28.39814798556948, + "learning_rate": 2e-06, + "loss": 0.3888, + "step": 1858 + }, + { + "epoch": 0.4312724741909291, + "grad_norm": 11.340683696031968, + "learning_rate": 2e-06, + "loss": 0.2898, + "step": 1859 + }, + { + "epoch": 0.4315044658392298, + "grad_norm": 9.388704159058799, + "learning_rate": 2e-06, + "loss": 0.2385, + "step": 1860 + }, + { + "epoch": 0.4317364574875304, + "grad_norm": 12.78526627182259, + "learning_rate": 2e-06, + "loss": 0.2516, + "step": 1861 + }, + { + "epoch": 0.4319684491358311, + "grad_norm": 11.221452479221814, + "learning_rate": 2e-06, + "loss": 0.2505, + "step": 1862 + }, + { + "epoch": 0.4322004407841318, + "grad_norm": 8.805058760349361, + "learning_rate": 2e-06, + "loss": 0.3661, + "step": 1863 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 20.94913580794226, + "learning_rate": 2e-06, + "loss": 0.4231, + "step": 1864 + }, + { + "epoch": 0.4326644240807331, + "grad_norm": 18.669289662216194, + "learning_rate": 2e-06, + "loss": 0.3046, + "step": 1865 + }, + { + "epoch": 0.43289641572903376, + "grad_norm": 12.36670710729202, + "learning_rate": 2e-06, + "loss": 0.2493, + "step": 1866 + }, + { + "epoch": 0.43312840737733443, + "grad_norm": 11.374741286569185, + "learning_rate": 2e-06, + "loss": 0.3335, + "step": 1867 + }, + { + "epoch": 0.43336039902563506, + "grad_norm": 33.88185912182654, + "learning_rate": 2e-06, + "loss": 0.3517, + "step": 1868 + }, + { + "epoch": 0.43359239067393573, + "grad_norm": 22.20508173062101, + "learning_rate": 2e-06, + "loss": 0.2955, + "step": 1869 + }, + { + "epoch": 0.4338243823222364, + "grad_norm": 7.340339839461144, + "learning_rate": 2e-06, + "loss": 0.1892, + "step": 1870 + }, + { + "epoch": 0.43405637397053703, + "grad_norm": 20.551325961108073, + "learning_rate": 2e-06, + "loss": 0.3139, + "step": 1871 + }, + { + "epoch": 0.4342883656188377, + "grad_norm": 14.707610072766458, + "learning_rate": 2e-06, + "loss": 0.3444, + "step": 1872 + }, + { + "epoch": 0.4345203572671384, + "grad_norm": 15.411962906532079, + "learning_rate": 2e-06, + "loss": 0.3017, + "step": 1873 + }, + { + "epoch": 0.43475234891543907, + "grad_norm": 13.893129257683169, + "learning_rate": 2e-06, + "loss": 0.3638, + "step": 1874 + }, + { + "epoch": 0.4349843405637397, + "grad_norm": 16.280513594402603, + "learning_rate": 2e-06, + "loss": 0.2568, + "step": 1875 + }, + { + "epoch": 0.43521633221204037, + "grad_norm": 18.16785510645439, + "learning_rate": 2e-06, + "loss": 0.3422, + "step": 1876 + }, + { + "epoch": 0.43544832386034105, + "grad_norm": 9.750925911740046, + "learning_rate": 2e-06, + "loss": 0.2206, + "step": 1877 + }, + { + "epoch": 0.43568031550864167, + "grad_norm": 13.192130318022839, + "learning_rate": 2e-06, + "loss": 0.3196, + "step": 1878 + }, + { + "epoch": 0.43591230715694235, + "grad_norm": 12.223261978383052, + "learning_rate": 2e-06, + "loss": 0.2715, + "step": 1879 + }, + { + "epoch": 0.436144298805243, + "grad_norm": 11.059180241388743, + "learning_rate": 2e-06, + "loss": 0.2609, + "step": 1880 + }, + { + "epoch": 0.43637629045354365, + "grad_norm": 15.662230117945928, + "learning_rate": 2e-06, + "loss": 0.2097, + "step": 1881 + }, + { + "epoch": 0.4366082821018443, + "grad_norm": 7.331387077331177, + "learning_rate": 2e-06, + "loss": 0.2691, + "step": 1882 + }, + { + "epoch": 0.436840273750145, + "grad_norm": 12.61809767481381, + "learning_rate": 2e-06, + "loss": 0.2374, + "step": 1883 + }, + { + "epoch": 0.4370722653984457, + "grad_norm": 12.78740040484834, + "learning_rate": 2e-06, + "loss": 0.2293, + "step": 1884 + }, + { + "epoch": 0.4373042570467463, + "grad_norm": 14.424100789930472, + "learning_rate": 2e-06, + "loss": 0.3617, + "step": 1885 + }, + { + "epoch": 0.437536248695047, + "grad_norm": 80.96212131101774, + "learning_rate": 2e-06, + "loss": 0.4131, + "step": 1886 + }, + { + "epoch": 0.43776824034334766, + "grad_norm": 15.011266652601709, + "learning_rate": 2e-06, + "loss": 0.3044, + "step": 1887 + }, + { + "epoch": 0.4380002319916483, + "grad_norm": 11.718612332506902, + "learning_rate": 2e-06, + "loss": 0.2712, + "step": 1888 + }, + { + "epoch": 0.43823222363994896, + "grad_norm": 9.69556958468024, + "learning_rate": 2e-06, + "loss": 0.2875, + "step": 1889 + }, + { + "epoch": 0.43846421528824964, + "grad_norm": 13.37107182380002, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 1890 + }, + { + "epoch": 0.43869620693655026, + "grad_norm": 25.52509004317602, + "learning_rate": 2e-06, + "loss": 0.2865, + "step": 1891 + }, + { + "epoch": 0.43892819858485094, + "grad_norm": 25.557432614648956, + "learning_rate": 2e-06, + "loss": 0.2095, + "step": 1892 + }, + { + "epoch": 0.4391601902331516, + "grad_norm": 10.227165672394735, + "learning_rate": 2e-06, + "loss": 0.1997, + "step": 1893 + }, + { + "epoch": 0.4393921818814523, + "grad_norm": 10.833971024024654, + "learning_rate": 2e-06, + "loss": 0.3021, + "step": 1894 + }, + { + "epoch": 0.4396241735297529, + "grad_norm": 6.8477809683100395, + "learning_rate": 2e-06, + "loss": 0.2074, + "step": 1895 + }, + { + "epoch": 0.4398561651780536, + "grad_norm": 12.612097699683517, + "learning_rate": 2e-06, + "loss": 0.268, + "step": 1896 + }, + { + "epoch": 0.44008815682635427, + "grad_norm": 14.93894889485441, + "learning_rate": 2e-06, + "loss": 0.295, + "step": 1897 + }, + { + "epoch": 0.4403201484746549, + "grad_norm": 12.675335108037462, + "learning_rate": 2e-06, + "loss": 0.2563, + "step": 1898 + }, + { + "epoch": 0.44055214012295557, + "grad_norm": 14.079067493345839, + "learning_rate": 2e-06, + "loss": 0.252, + "step": 1899 + }, + { + "epoch": 0.44078413177125625, + "grad_norm": 22.713016972391877, + "learning_rate": 2e-06, + "loss": 0.3575, + "step": 1900 + }, + { + "epoch": 0.44101612341955687, + "grad_norm": 16.909479818805984, + "learning_rate": 2e-06, + "loss": 0.2408, + "step": 1901 + }, + { + "epoch": 0.44124811506785755, + "grad_norm": 26.53615495358387, + "learning_rate": 2e-06, + "loss": 0.3782, + "step": 1902 + }, + { + "epoch": 0.4414801067161582, + "grad_norm": 11.637938103347429, + "learning_rate": 2e-06, + "loss": 0.2748, + "step": 1903 + }, + { + "epoch": 0.4417120983644589, + "grad_norm": 13.790174253848365, + "learning_rate": 2e-06, + "loss": 0.3713, + "step": 1904 + }, + { + "epoch": 0.4419440900127595, + "grad_norm": 12.622131028191134, + "learning_rate": 2e-06, + "loss": 0.2575, + "step": 1905 + }, + { + "epoch": 0.4421760816610602, + "grad_norm": 11.39969737402925, + "learning_rate": 2e-06, + "loss": 0.2648, + "step": 1906 + }, + { + "epoch": 0.4424080733093609, + "grad_norm": 16.675642271360555, + "learning_rate": 2e-06, + "loss": 0.4741, + "step": 1907 + }, + { + "epoch": 0.4426400649576615, + "grad_norm": 10.78164946540168, + "learning_rate": 2e-06, + "loss": 0.2563, + "step": 1908 + }, + { + "epoch": 0.4428720566059622, + "grad_norm": 19.68038839531077, + "learning_rate": 2e-06, + "loss": 0.4349, + "step": 1909 + }, + { + "epoch": 0.44310404825426286, + "grad_norm": 14.168099510681532, + "learning_rate": 2e-06, + "loss": 0.3591, + "step": 1910 + }, + { + "epoch": 0.4433360399025635, + "grad_norm": 13.257912219731839, + "learning_rate": 2e-06, + "loss": 0.2278, + "step": 1911 + }, + { + "epoch": 0.44356803155086416, + "grad_norm": 28.111517131517505, + "learning_rate": 2e-06, + "loss": 0.2837, + "step": 1912 + }, + { + "epoch": 0.44380002319916484, + "grad_norm": 25.10965649267154, + "learning_rate": 2e-06, + "loss": 0.3274, + "step": 1913 + }, + { + "epoch": 0.4440320148474655, + "grad_norm": 14.93557483973632, + "learning_rate": 2e-06, + "loss": 0.2515, + "step": 1914 + }, + { + "epoch": 0.44426400649576614, + "grad_norm": 16.642468175359035, + "learning_rate": 2e-06, + "loss": 0.3378, + "step": 1915 + }, + { + "epoch": 0.4444959981440668, + "grad_norm": 6.84438606505179, + "learning_rate": 2e-06, + "loss": 0.3409, + "step": 1916 + }, + { + "epoch": 0.4447279897923675, + "grad_norm": 8.477797064840939, + "learning_rate": 2e-06, + "loss": 0.3412, + "step": 1917 + }, + { + "epoch": 0.4449599814406681, + "grad_norm": 12.99853283907464, + "learning_rate": 2e-06, + "loss": 0.2441, + "step": 1918 + }, + { + "epoch": 0.4451919730889688, + "grad_norm": 10.977481652548592, + "learning_rate": 2e-06, + "loss": 0.3244, + "step": 1919 + }, + { + "epoch": 0.4454239647372695, + "grad_norm": 10.570234822426068, + "learning_rate": 2e-06, + "loss": 0.2877, + "step": 1920 + }, + { + "epoch": 0.4456559563855701, + "grad_norm": 6.860732361741255, + "learning_rate": 2e-06, + "loss": 0.2442, + "step": 1921 + }, + { + "epoch": 0.4458879480338708, + "grad_norm": 19.302410055233295, + "learning_rate": 2e-06, + "loss": 0.4067, + "step": 1922 + }, + { + "epoch": 0.44611993968217145, + "grad_norm": 18.20793820249486, + "learning_rate": 2e-06, + "loss": 0.3678, + "step": 1923 + }, + { + "epoch": 0.44635193133047213, + "grad_norm": 8.97289273170762, + "learning_rate": 2e-06, + "loss": 0.2128, + "step": 1924 + }, + { + "epoch": 0.44658392297877275, + "grad_norm": 6.3471643353963, + "learning_rate": 2e-06, + "loss": 0.1974, + "step": 1925 + }, + { + "epoch": 0.44681591462707343, + "grad_norm": 12.638237055112246, + "learning_rate": 2e-06, + "loss": 0.3043, + "step": 1926 + }, + { + "epoch": 0.4470479062753741, + "grad_norm": 16.3087905268165, + "learning_rate": 2e-06, + "loss": 0.3217, + "step": 1927 + }, + { + "epoch": 0.44727989792367473, + "grad_norm": 6.312405813391441, + "learning_rate": 2e-06, + "loss": 0.2761, + "step": 1928 + }, + { + "epoch": 0.4475118895719754, + "grad_norm": 9.42320757238294, + "learning_rate": 2e-06, + "loss": 0.178, + "step": 1929 + }, + { + "epoch": 0.4477438812202761, + "grad_norm": 14.571508418966776, + "learning_rate": 2e-06, + "loss": 0.3671, + "step": 1930 + }, + { + "epoch": 0.4479758728685767, + "grad_norm": 11.19866672252569, + "learning_rate": 2e-06, + "loss": 0.4218, + "step": 1931 + }, + { + "epoch": 0.4482078645168774, + "grad_norm": 18.094999969369557, + "learning_rate": 2e-06, + "loss": 0.2873, + "step": 1932 + }, + { + "epoch": 0.44843985616517806, + "grad_norm": 17.776408186188583, + "learning_rate": 2e-06, + "loss": 0.212, + "step": 1933 + }, + { + "epoch": 0.44867184781347874, + "grad_norm": 13.319460218034406, + "learning_rate": 2e-06, + "loss": 0.4009, + "step": 1934 + }, + { + "epoch": 0.44890383946177936, + "grad_norm": 20.49299261920701, + "learning_rate": 2e-06, + "loss": 0.3528, + "step": 1935 + }, + { + "epoch": 0.44913583111008004, + "grad_norm": 12.207768363429762, + "learning_rate": 2e-06, + "loss": 0.2797, + "step": 1936 + }, + { + "epoch": 0.4493678227583807, + "grad_norm": 13.241157529714169, + "learning_rate": 2e-06, + "loss": 0.2463, + "step": 1937 + }, + { + "epoch": 0.44959981440668134, + "grad_norm": 11.964061797665712, + "learning_rate": 2e-06, + "loss": 0.3486, + "step": 1938 + }, + { + "epoch": 0.449831806054982, + "grad_norm": 17.760405028259854, + "learning_rate": 2e-06, + "loss": 0.2826, + "step": 1939 + }, + { + "epoch": 0.4500637977032827, + "grad_norm": 18.835286779046086, + "learning_rate": 2e-06, + "loss": 0.2983, + "step": 1940 + }, + { + "epoch": 0.4502957893515833, + "grad_norm": 10.061056247616934, + "learning_rate": 2e-06, + "loss": 0.2066, + "step": 1941 + }, + { + "epoch": 0.450527780999884, + "grad_norm": 15.190416553972092, + "learning_rate": 2e-06, + "loss": 0.2897, + "step": 1942 + }, + { + "epoch": 0.4507597726481847, + "grad_norm": 18.100146573334026, + "learning_rate": 2e-06, + "loss": 0.2557, + "step": 1943 + }, + { + "epoch": 0.45099176429648535, + "grad_norm": 12.145254242892683, + "learning_rate": 2e-06, + "loss": 0.2617, + "step": 1944 + }, + { + "epoch": 0.451223755944786, + "grad_norm": 8.675074169684049, + "learning_rate": 2e-06, + "loss": 0.2641, + "step": 1945 + }, + { + "epoch": 0.45145574759308665, + "grad_norm": 14.379004029242951, + "learning_rate": 2e-06, + "loss": 0.3171, + "step": 1946 + }, + { + "epoch": 0.45168773924138733, + "grad_norm": 16.35450216910894, + "learning_rate": 2e-06, + "loss": 0.2428, + "step": 1947 + }, + { + "epoch": 0.45191973088968795, + "grad_norm": 15.501856211045736, + "learning_rate": 2e-06, + "loss": 0.2962, + "step": 1948 + }, + { + "epoch": 0.45215172253798863, + "grad_norm": 13.519337257061718, + "learning_rate": 2e-06, + "loss": 0.2869, + "step": 1949 + }, + { + "epoch": 0.4523837141862893, + "grad_norm": 22.08178341082319, + "learning_rate": 2e-06, + "loss": 0.3634, + "step": 1950 + }, + { + "epoch": 0.45261570583458993, + "grad_norm": 19.244466854707706, + "learning_rate": 2e-06, + "loss": 0.4142, + "step": 1951 + }, + { + "epoch": 0.4528476974828906, + "grad_norm": 13.45744682479427, + "learning_rate": 2e-06, + "loss": 0.3247, + "step": 1952 + }, + { + "epoch": 0.4530796891311913, + "grad_norm": 19.971997627098528, + "learning_rate": 2e-06, + "loss": 0.2832, + "step": 1953 + }, + { + "epoch": 0.45331168077949197, + "grad_norm": 7.784071208122583, + "learning_rate": 2e-06, + "loss": 0.1705, + "step": 1954 + }, + { + "epoch": 0.4535436724277926, + "grad_norm": 26.175747345649796, + "learning_rate": 2e-06, + "loss": 0.2603, + "step": 1955 + }, + { + "epoch": 0.45377566407609327, + "grad_norm": 9.977944745048129, + "learning_rate": 2e-06, + "loss": 0.3606, + "step": 1956 + }, + { + "epoch": 0.45400765572439394, + "grad_norm": 9.241863633870448, + "learning_rate": 2e-06, + "loss": 0.1683, + "step": 1957 + }, + { + "epoch": 0.45423964737269457, + "grad_norm": 7.854276987996447, + "learning_rate": 2e-06, + "loss": 0.2218, + "step": 1958 + }, + { + "epoch": 0.45447163902099524, + "grad_norm": 17.508549301128976, + "learning_rate": 2e-06, + "loss": 0.265, + "step": 1959 + }, + { + "epoch": 0.4547036306692959, + "grad_norm": 14.495906592291902, + "learning_rate": 2e-06, + "loss": 0.2092, + "step": 1960 + }, + { + "epoch": 0.45493562231759654, + "grad_norm": 11.396050739221131, + "learning_rate": 2e-06, + "loss": 0.2218, + "step": 1961 + }, + { + "epoch": 0.4551676139658972, + "grad_norm": 17.21341794034456, + "learning_rate": 2e-06, + "loss": 0.4191, + "step": 1962 + }, + { + "epoch": 0.4553996056141979, + "grad_norm": 39.71999834351291, + "learning_rate": 2e-06, + "loss": 0.4364, + "step": 1963 + }, + { + "epoch": 0.4556315972624986, + "grad_norm": 14.344933276382463, + "learning_rate": 2e-06, + "loss": 0.2449, + "step": 1964 + }, + { + "epoch": 0.4558635889107992, + "grad_norm": 15.083856831023695, + "learning_rate": 2e-06, + "loss": 0.2171, + "step": 1965 + }, + { + "epoch": 0.4560955805590999, + "grad_norm": 30.1833620615398, + "learning_rate": 2e-06, + "loss": 0.3772, + "step": 1966 + }, + { + "epoch": 0.45632757220740056, + "grad_norm": 5.694038865160228, + "learning_rate": 2e-06, + "loss": 0.1896, + "step": 1967 + }, + { + "epoch": 0.4565595638557012, + "grad_norm": 7.238584483897142, + "learning_rate": 2e-06, + "loss": 0.2347, + "step": 1968 + }, + { + "epoch": 0.45679155550400186, + "grad_norm": 17.40421564491547, + "learning_rate": 2e-06, + "loss": 0.3307, + "step": 1969 + }, + { + "epoch": 0.45702354715230253, + "grad_norm": 11.75504415856113, + "learning_rate": 2e-06, + "loss": 0.2683, + "step": 1970 + }, + { + "epoch": 0.45725553880060316, + "grad_norm": 12.08785488011168, + "learning_rate": 2e-06, + "loss": 0.2822, + "step": 1971 + }, + { + "epoch": 0.45748753044890383, + "grad_norm": 5.746202672849507, + "learning_rate": 2e-06, + "loss": 0.2149, + "step": 1972 + }, + { + "epoch": 0.4577195220972045, + "grad_norm": 10.556481746351935, + "learning_rate": 2e-06, + "loss": 0.2994, + "step": 1973 + }, + { + "epoch": 0.45795151374550513, + "grad_norm": 13.933486628952526, + "learning_rate": 2e-06, + "loss": 0.2687, + "step": 1974 + }, + { + "epoch": 0.4581835053938058, + "grad_norm": 19.368610008099733, + "learning_rate": 2e-06, + "loss": 0.2651, + "step": 1975 + }, + { + "epoch": 0.4584154970421065, + "grad_norm": 7.269647480537984, + "learning_rate": 2e-06, + "loss": 0.2359, + "step": 1976 + }, + { + "epoch": 0.45864748869040717, + "grad_norm": 14.717656516693904, + "learning_rate": 2e-06, + "loss": 0.3163, + "step": 1977 + }, + { + "epoch": 0.4588794803387078, + "grad_norm": 13.554029836416927, + "learning_rate": 2e-06, + "loss": 0.3661, + "step": 1978 + }, + { + "epoch": 0.45911147198700847, + "grad_norm": 14.72655286380002, + "learning_rate": 2e-06, + "loss": 0.2868, + "step": 1979 + }, + { + "epoch": 0.45934346363530915, + "grad_norm": 19.754084297636542, + "learning_rate": 2e-06, + "loss": 0.4112, + "step": 1980 + }, + { + "epoch": 0.45957545528360977, + "grad_norm": 18.040682007952025, + "learning_rate": 2e-06, + "loss": 0.3363, + "step": 1981 + }, + { + "epoch": 0.45980744693191045, + "grad_norm": 23.563968224340655, + "learning_rate": 2e-06, + "loss": 0.2206, + "step": 1982 + }, + { + "epoch": 0.4600394385802111, + "grad_norm": 14.553620166412069, + "learning_rate": 2e-06, + "loss": 0.2806, + "step": 1983 + }, + { + "epoch": 0.46027143022851175, + "grad_norm": 7.424935719456725, + "learning_rate": 2e-06, + "loss": 0.2433, + "step": 1984 + }, + { + "epoch": 0.4605034218768124, + "grad_norm": 12.02707486472308, + "learning_rate": 2e-06, + "loss": 0.1909, + "step": 1985 + }, + { + "epoch": 0.4607354135251131, + "grad_norm": 26.349289928963117, + "learning_rate": 2e-06, + "loss": 0.288, + "step": 1986 + }, + { + "epoch": 0.4609674051734138, + "grad_norm": 11.77000173436258, + "learning_rate": 2e-06, + "loss": 0.2468, + "step": 1987 + }, + { + "epoch": 0.4611993968217144, + "grad_norm": 14.221580423288055, + "learning_rate": 2e-06, + "loss": 0.324, + "step": 1988 + }, + { + "epoch": 0.4614313884700151, + "grad_norm": 19.491108377638728, + "learning_rate": 2e-06, + "loss": 0.413, + "step": 1989 + }, + { + "epoch": 0.46166338011831576, + "grad_norm": 16.717819048175322, + "learning_rate": 2e-06, + "loss": 0.4165, + "step": 1990 + }, + { + "epoch": 0.4618953717666164, + "grad_norm": 16.506916137038182, + "learning_rate": 2e-06, + "loss": 0.371, + "step": 1991 + }, + { + "epoch": 0.46212736341491706, + "grad_norm": 12.255157261779837, + "learning_rate": 2e-06, + "loss": 0.3191, + "step": 1992 + }, + { + "epoch": 0.46235935506321774, + "grad_norm": 10.690498300962876, + "learning_rate": 2e-06, + "loss": 0.2414, + "step": 1993 + }, + { + "epoch": 0.46259134671151836, + "grad_norm": 9.877054825920622, + "learning_rate": 2e-06, + "loss": 0.3728, + "step": 1994 + }, + { + "epoch": 0.46282333835981904, + "grad_norm": 15.340735025865651, + "learning_rate": 2e-06, + "loss": 0.4747, + "step": 1995 + }, + { + "epoch": 0.4630553300081197, + "grad_norm": 15.341221735588649, + "learning_rate": 2e-06, + "loss": 0.3361, + "step": 1996 + }, + { + "epoch": 0.4632873216564204, + "grad_norm": 10.945005995090467, + "learning_rate": 2e-06, + "loss": 0.2797, + "step": 1997 + }, + { + "epoch": 0.463519313304721, + "grad_norm": 5.284853062707283, + "learning_rate": 2e-06, + "loss": 0.1841, + "step": 1998 + }, + { + "epoch": 0.4637513049530217, + "grad_norm": 11.633188950114901, + "learning_rate": 2e-06, + "loss": 0.2856, + "step": 1999 + }, + { + "epoch": 0.46398329660132237, + "grad_norm": 6.859718365049897, + "learning_rate": 2e-06, + "loss": 0.3048, + "step": 2000 + }, + { + "epoch": 0.464215288249623, + "grad_norm": 16.589305902090143, + "learning_rate": 2e-06, + "loss": 0.4069, + "step": 2001 + }, + { + "epoch": 0.46444727989792367, + "grad_norm": 10.342457937117073, + "learning_rate": 2e-06, + "loss": 0.3057, + "step": 2002 + }, + { + "epoch": 0.46467927154622435, + "grad_norm": 12.333270314902068, + "learning_rate": 2e-06, + "loss": 0.2834, + "step": 2003 + }, + { + "epoch": 0.46491126319452497, + "grad_norm": 22.423928853702186, + "learning_rate": 2e-06, + "loss": 0.3894, + "step": 2004 + }, + { + "epoch": 0.46514325484282565, + "grad_norm": 13.766616278689142, + "learning_rate": 2e-06, + "loss": 0.375, + "step": 2005 + }, + { + "epoch": 0.4653752464911263, + "grad_norm": 16.62716850114615, + "learning_rate": 2e-06, + "loss": 0.3018, + "step": 2006 + }, + { + "epoch": 0.465607238139427, + "grad_norm": 18.43425031492219, + "learning_rate": 2e-06, + "loss": 0.3473, + "step": 2007 + }, + { + "epoch": 0.4658392297877276, + "grad_norm": 8.281568359570956, + "learning_rate": 2e-06, + "loss": 0.2622, + "step": 2008 + }, + { + "epoch": 0.4660712214360283, + "grad_norm": 14.635641990408606, + "learning_rate": 2e-06, + "loss": 0.2376, + "step": 2009 + }, + { + "epoch": 0.466303213084329, + "grad_norm": 16.20241517427514, + "learning_rate": 2e-06, + "loss": 0.3306, + "step": 2010 + }, + { + "epoch": 0.4665352047326296, + "grad_norm": 11.51127206202647, + "learning_rate": 2e-06, + "loss": 0.2747, + "step": 2011 + }, + { + "epoch": 0.4667671963809303, + "grad_norm": 58.30128405919469, + "learning_rate": 2e-06, + "loss": 0.3029, + "step": 2012 + }, + { + "epoch": 0.46699918802923096, + "grad_norm": 13.9317037243102, + "learning_rate": 2e-06, + "loss": 0.3309, + "step": 2013 + }, + { + "epoch": 0.4672311796775316, + "grad_norm": 20.31824848079081, + "learning_rate": 2e-06, + "loss": 0.3761, + "step": 2014 + }, + { + "epoch": 0.46746317132583226, + "grad_norm": 9.457890163970369, + "learning_rate": 2e-06, + "loss": 0.3379, + "step": 2015 + }, + { + "epoch": 0.46769516297413294, + "grad_norm": 18.053400087811408, + "learning_rate": 2e-06, + "loss": 0.3155, + "step": 2016 + }, + { + "epoch": 0.4679271546224336, + "grad_norm": 11.634119001512087, + "learning_rate": 2e-06, + "loss": 0.3429, + "step": 2017 + }, + { + "epoch": 0.46815914627073424, + "grad_norm": 13.942355437046755, + "learning_rate": 2e-06, + "loss": 0.3103, + "step": 2018 + }, + { + "epoch": 0.4683911379190349, + "grad_norm": 14.711577692597876, + "learning_rate": 2e-06, + "loss": 0.3242, + "step": 2019 + }, + { + "epoch": 0.4686231295673356, + "grad_norm": 12.822970373558434, + "learning_rate": 2e-06, + "loss": 0.2824, + "step": 2020 + }, + { + "epoch": 0.4688551212156362, + "grad_norm": 7.922564466961149, + "learning_rate": 2e-06, + "loss": 0.3574, + "step": 2021 + }, + { + "epoch": 0.4690871128639369, + "grad_norm": 16.645591544402976, + "learning_rate": 2e-06, + "loss": 0.26, + "step": 2022 + }, + { + "epoch": 0.4693191045122376, + "grad_norm": 17.150991755358596, + "learning_rate": 2e-06, + "loss": 0.3091, + "step": 2023 + }, + { + "epoch": 0.4695510961605382, + "grad_norm": 19.355662866561232, + "learning_rate": 2e-06, + "loss": 0.3248, + "step": 2024 + }, + { + "epoch": 0.4697830878088389, + "grad_norm": 8.98763782114109, + "learning_rate": 2e-06, + "loss": 0.2328, + "step": 2025 + }, + { + "epoch": 0.47001507945713955, + "grad_norm": 10.750289888646794, + "learning_rate": 2e-06, + "loss": 0.2208, + "step": 2026 + }, + { + "epoch": 0.47024707110544023, + "grad_norm": 15.590133390676085, + "learning_rate": 2e-06, + "loss": 0.2389, + "step": 2027 + }, + { + "epoch": 0.47047906275374085, + "grad_norm": 16.63610849979652, + "learning_rate": 2e-06, + "loss": 0.3112, + "step": 2028 + }, + { + "epoch": 0.47071105440204153, + "grad_norm": 10.792337381759852, + "learning_rate": 2e-06, + "loss": 0.3559, + "step": 2029 + }, + { + "epoch": 0.4709430460503422, + "grad_norm": 14.816149884219211, + "learning_rate": 2e-06, + "loss": 0.3003, + "step": 2030 + }, + { + "epoch": 0.47117503769864283, + "grad_norm": 10.341262243880099, + "learning_rate": 2e-06, + "loss": 0.3463, + "step": 2031 + }, + { + "epoch": 0.4714070293469435, + "grad_norm": 17.72305433274037, + "learning_rate": 2e-06, + "loss": 0.4462, + "step": 2032 + }, + { + "epoch": 0.4716390209952442, + "grad_norm": 16.83950046901211, + "learning_rate": 2e-06, + "loss": 0.2417, + "step": 2033 + }, + { + "epoch": 0.4718710126435448, + "grad_norm": 18.10327281457699, + "learning_rate": 2e-06, + "loss": 0.3096, + "step": 2034 + }, + { + "epoch": 0.4721030042918455, + "grad_norm": 14.254882642959036, + "learning_rate": 2e-06, + "loss": 0.3366, + "step": 2035 + }, + { + "epoch": 0.47233499594014616, + "grad_norm": 21.676286798359246, + "learning_rate": 2e-06, + "loss": 0.3546, + "step": 2036 + }, + { + "epoch": 0.47256698758844684, + "grad_norm": 12.720790909738758, + "learning_rate": 2e-06, + "loss": 0.3007, + "step": 2037 + }, + { + "epoch": 0.47279897923674746, + "grad_norm": 11.498119426389403, + "learning_rate": 2e-06, + "loss": 0.2476, + "step": 2038 + }, + { + "epoch": 0.47303097088504814, + "grad_norm": 20.816542225879648, + "learning_rate": 2e-06, + "loss": 0.4009, + "step": 2039 + }, + { + "epoch": 0.4732629625333488, + "grad_norm": 16.82688865312617, + "learning_rate": 2e-06, + "loss": 0.2652, + "step": 2040 + }, + { + "epoch": 0.47349495418164944, + "grad_norm": 9.70477771542857, + "learning_rate": 2e-06, + "loss": 0.2974, + "step": 2041 + }, + { + "epoch": 0.4737269458299501, + "grad_norm": 13.296534871793732, + "learning_rate": 2e-06, + "loss": 0.2746, + "step": 2042 + }, + { + "epoch": 0.4739589374782508, + "grad_norm": 14.065427392790259, + "learning_rate": 2e-06, + "loss": 0.3149, + "step": 2043 + }, + { + "epoch": 0.4741909291265514, + "grad_norm": 22.38889426658264, + "learning_rate": 2e-06, + "loss": 0.433, + "step": 2044 + }, + { + "epoch": 0.4744229207748521, + "grad_norm": 14.376475918423807, + "learning_rate": 2e-06, + "loss": 0.3366, + "step": 2045 + }, + { + "epoch": 0.4746549124231528, + "grad_norm": 9.277440540725683, + "learning_rate": 2e-06, + "loss": 0.337, + "step": 2046 + }, + { + "epoch": 0.47488690407145345, + "grad_norm": 17.340785257332662, + "learning_rate": 2e-06, + "loss": 0.2725, + "step": 2047 + }, + { + "epoch": 0.4751188957197541, + "grad_norm": 9.985084330908725, + "learning_rate": 2e-06, + "loss": 0.2141, + "step": 2048 + }, + { + "epoch": 0.47535088736805475, + "grad_norm": 10.633352809709951, + "learning_rate": 2e-06, + "loss": 0.2789, + "step": 2049 + }, + { + "epoch": 0.47558287901635543, + "grad_norm": 23.121629632798786, + "learning_rate": 2e-06, + "loss": 0.2321, + "step": 2050 + }, + { + "epoch": 0.47581487066465605, + "grad_norm": 10.891572671577103, + "learning_rate": 2e-06, + "loss": 0.3372, + "step": 2051 + }, + { + "epoch": 0.47604686231295673, + "grad_norm": 23.9600752713284, + "learning_rate": 2e-06, + "loss": 0.3471, + "step": 2052 + }, + { + "epoch": 0.4762788539612574, + "grad_norm": 15.048695431676713, + "learning_rate": 2e-06, + "loss": 0.4033, + "step": 2053 + }, + { + "epoch": 0.47651084560955803, + "grad_norm": 19.555167465786873, + "learning_rate": 2e-06, + "loss": 0.3338, + "step": 2054 + }, + { + "epoch": 0.4767428372578587, + "grad_norm": 27.78200124896194, + "learning_rate": 2e-06, + "loss": 0.4486, + "step": 2055 + }, + { + "epoch": 0.4769748289061594, + "grad_norm": 12.272255303881282, + "learning_rate": 2e-06, + "loss": 0.3018, + "step": 2056 + }, + { + "epoch": 0.47720682055446006, + "grad_norm": 15.556624908312893, + "learning_rate": 2e-06, + "loss": 0.5194, + "step": 2057 + }, + { + "epoch": 0.4774388122027607, + "grad_norm": 16.338581145118493, + "learning_rate": 2e-06, + "loss": 0.3547, + "step": 2058 + }, + { + "epoch": 0.47767080385106137, + "grad_norm": 9.074037753398741, + "learning_rate": 2e-06, + "loss": 0.2408, + "step": 2059 + }, + { + "epoch": 0.47790279549936204, + "grad_norm": 10.66378411033167, + "learning_rate": 2e-06, + "loss": 0.3439, + "step": 2060 + }, + { + "epoch": 0.47813478714766267, + "grad_norm": 15.797434529468434, + "learning_rate": 2e-06, + "loss": 0.2944, + "step": 2061 + }, + { + "epoch": 0.47836677879596334, + "grad_norm": 13.227433292179827, + "learning_rate": 2e-06, + "loss": 0.339, + "step": 2062 + }, + { + "epoch": 0.478598770444264, + "grad_norm": 12.03833396979913, + "learning_rate": 2e-06, + "loss": 0.3403, + "step": 2063 + }, + { + "epoch": 0.47883076209256464, + "grad_norm": 24.553130563738854, + "learning_rate": 2e-06, + "loss": 0.3004, + "step": 2064 + }, + { + "epoch": 0.4790627537408653, + "grad_norm": 20.737155578901096, + "learning_rate": 2e-06, + "loss": 0.4112, + "step": 2065 + }, + { + "epoch": 0.479294745389166, + "grad_norm": 27.884478252430842, + "learning_rate": 2e-06, + "loss": 0.3297, + "step": 2066 + }, + { + "epoch": 0.4795267370374667, + "grad_norm": 13.103378328133143, + "learning_rate": 2e-06, + "loss": 0.3001, + "step": 2067 + }, + { + "epoch": 0.4797587286857673, + "grad_norm": 17.80927820776487, + "learning_rate": 2e-06, + "loss": 0.4377, + "step": 2068 + }, + { + "epoch": 0.479990720334068, + "grad_norm": 20.661695218080254, + "learning_rate": 2e-06, + "loss": 0.3473, + "step": 2069 + }, + { + "epoch": 0.48022271198236866, + "grad_norm": 28.217870160429282, + "learning_rate": 2e-06, + "loss": 0.3783, + "step": 2070 + }, + { + "epoch": 0.4804547036306693, + "grad_norm": 17.477461536493266, + "learning_rate": 2e-06, + "loss": 0.3352, + "step": 2071 + }, + { + "epoch": 0.48068669527896996, + "grad_norm": 23.889925987937147, + "learning_rate": 2e-06, + "loss": 0.3111, + "step": 2072 + }, + { + "epoch": 0.48091868692727063, + "grad_norm": 7.9886444116729205, + "learning_rate": 2e-06, + "loss": 0.3253, + "step": 2073 + }, + { + "epoch": 0.48115067857557126, + "grad_norm": 19.73259296094963, + "learning_rate": 2e-06, + "loss": 0.3367, + "step": 2074 + }, + { + "epoch": 0.48138267022387193, + "grad_norm": 9.559685773496563, + "learning_rate": 2e-06, + "loss": 0.234, + "step": 2075 + }, + { + "epoch": 0.4816146618721726, + "grad_norm": 9.743531669983685, + "learning_rate": 2e-06, + "loss": 0.2975, + "step": 2076 + }, + { + "epoch": 0.4818466535204733, + "grad_norm": 16.81635360695747, + "learning_rate": 2e-06, + "loss": 0.2438, + "step": 2077 + }, + { + "epoch": 0.4820786451687739, + "grad_norm": 11.790522180861018, + "learning_rate": 2e-06, + "loss": 0.2424, + "step": 2078 + }, + { + "epoch": 0.4823106368170746, + "grad_norm": 14.632744832313552, + "learning_rate": 2e-06, + "loss": 0.3295, + "step": 2079 + }, + { + "epoch": 0.48254262846537527, + "grad_norm": 11.717947162424096, + "learning_rate": 2e-06, + "loss": 0.3074, + "step": 2080 + }, + { + "epoch": 0.4827746201136759, + "grad_norm": 8.98069286186794, + "learning_rate": 2e-06, + "loss": 0.2319, + "step": 2081 + }, + { + "epoch": 0.48300661176197657, + "grad_norm": 10.350196437648169, + "learning_rate": 2e-06, + "loss": 0.3483, + "step": 2082 + }, + { + "epoch": 0.48323860341027725, + "grad_norm": 18.872461989879593, + "learning_rate": 2e-06, + "loss": 0.3595, + "step": 2083 + }, + { + "epoch": 0.48347059505857787, + "grad_norm": 18.968088081083174, + "learning_rate": 2e-06, + "loss": 0.3471, + "step": 2084 + }, + { + "epoch": 0.48370258670687855, + "grad_norm": 14.399667825466722, + "learning_rate": 2e-06, + "loss": 0.3329, + "step": 2085 + }, + { + "epoch": 0.4839345783551792, + "grad_norm": 13.474511688092948, + "learning_rate": 2e-06, + "loss": 0.3144, + "step": 2086 + }, + { + "epoch": 0.4841665700034799, + "grad_norm": 11.216722839581688, + "learning_rate": 2e-06, + "loss": 0.2157, + "step": 2087 + }, + { + "epoch": 0.4843985616517805, + "grad_norm": 14.86844187814411, + "learning_rate": 2e-06, + "loss": 0.4169, + "step": 2088 + }, + { + "epoch": 0.4846305533000812, + "grad_norm": 7.8447621775911545, + "learning_rate": 2e-06, + "loss": 0.3091, + "step": 2089 + }, + { + "epoch": 0.4848625449483819, + "grad_norm": 10.254262575831111, + "learning_rate": 2e-06, + "loss": 0.359, + "step": 2090 + }, + { + "epoch": 0.4850945365966825, + "grad_norm": 11.876234611995246, + "learning_rate": 2e-06, + "loss": 0.269, + "step": 2091 + }, + { + "epoch": 0.4853265282449832, + "grad_norm": 14.067135704743569, + "learning_rate": 2e-06, + "loss": 0.3312, + "step": 2092 + }, + { + "epoch": 0.48555851989328386, + "grad_norm": 7.128865969322003, + "learning_rate": 2e-06, + "loss": 0.235, + "step": 2093 + }, + { + "epoch": 0.4857905115415845, + "grad_norm": 26.351599690207127, + "learning_rate": 2e-06, + "loss": 0.3259, + "step": 2094 + }, + { + "epoch": 0.48602250318988516, + "grad_norm": 16.241201805268105, + "learning_rate": 2e-06, + "loss": 0.2984, + "step": 2095 + }, + { + "epoch": 0.48625449483818584, + "grad_norm": 14.767969302463698, + "learning_rate": 2e-06, + "loss": 0.3041, + "step": 2096 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 15.7565988823627, + "learning_rate": 2e-06, + "loss": 0.329, + "step": 2097 + }, + { + "epoch": 0.48671847813478714, + "grad_norm": 16.203714255793574, + "learning_rate": 2e-06, + "loss": 0.2604, + "step": 2098 + }, + { + "epoch": 0.4869504697830878, + "grad_norm": 10.42518604548034, + "learning_rate": 2e-06, + "loss": 0.3288, + "step": 2099 + }, + { + "epoch": 0.4871824614313885, + "grad_norm": 22.287666620106165, + "learning_rate": 2e-06, + "loss": 0.3003, + "step": 2100 + }, + { + "epoch": 0.4874144530796891, + "grad_norm": 15.199158506152191, + "learning_rate": 2e-06, + "loss": 0.2466, + "step": 2101 + }, + { + "epoch": 0.4876464447279898, + "grad_norm": 17.558767455624956, + "learning_rate": 2e-06, + "loss": 0.3145, + "step": 2102 + }, + { + "epoch": 0.48787843637629047, + "grad_norm": 18.285953331339602, + "learning_rate": 2e-06, + "loss": 0.4843, + "step": 2103 + }, + { + "epoch": 0.4881104280245911, + "grad_norm": 15.432844223638583, + "learning_rate": 2e-06, + "loss": 0.2451, + "step": 2104 + }, + { + "epoch": 0.48834241967289177, + "grad_norm": 11.767138467996935, + "learning_rate": 2e-06, + "loss": 0.2904, + "step": 2105 + }, + { + "epoch": 0.48857441132119245, + "grad_norm": 14.707804934793018, + "learning_rate": 2e-06, + "loss": 0.3309, + "step": 2106 + }, + { + "epoch": 0.4888064029694931, + "grad_norm": 12.873460858804348, + "learning_rate": 2e-06, + "loss": 0.2538, + "step": 2107 + }, + { + "epoch": 0.48903839461779375, + "grad_norm": 15.551365719566405, + "learning_rate": 2e-06, + "loss": 0.3376, + "step": 2108 + }, + { + "epoch": 0.4892703862660944, + "grad_norm": 22.354249058364896, + "learning_rate": 2e-06, + "loss": 0.3482, + "step": 2109 + }, + { + "epoch": 0.4895023779143951, + "grad_norm": 12.13426092309433, + "learning_rate": 2e-06, + "loss": 0.2767, + "step": 2110 + }, + { + "epoch": 0.4897343695626957, + "grad_norm": 6.378416323954354, + "learning_rate": 2e-06, + "loss": 0.1642, + "step": 2111 + }, + { + "epoch": 0.4899663612109964, + "grad_norm": 12.172120073384693, + "learning_rate": 2e-06, + "loss": 0.2751, + "step": 2112 + }, + { + "epoch": 0.4901983528592971, + "grad_norm": 8.006798240440455, + "learning_rate": 2e-06, + "loss": 0.2359, + "step": 2113 + }, + { + "epoch": 0.4904303445075977, + "grad_norm": 15.544069438253967, + "learning_rate": 2e-06, + "loss": 0.3287, + "step": 2114 + }, + { + "epoch": 0.4906623361558984, + "grad_norm": 7.0112959850423735, + "learning_rate": 2e-06, + "loss": 0.1766, + "step": 2115 + }, + { + "epoch": 0.49089432780419906, + "grad_norm": 15.549511728081491, + "learning_rate": 2e-06, + "loss": 0.3081, + "step": 2116 + }, + { + "epoch": 0.49112631945249974, + "grad_norm": 11.256547217001314, + "learning_rate": 2e-06, + "loss": 0.3567, + "step": 2117 + }, + { + "epoch": 0.49135831110080036, + "grad_norm": 14.704389837873286, + "learning_rate": 2e-06, + "loss": 0.2985, + "step": 2118 + }, + { + "epoch": 0.49159030274910104, + "grad_norm": 17.727161833868298, + "learning_rate": 2e-06, + "loss": 0.3379, + "step": 2119 + }, + { + "epoch": 0.4918222943974017, + "grad_norm": 13.301220933708462, + "learning_rate": 2e-06, + "loss": 0.2558, + "step": 2120 + }, + { + "epoch": 0.49205428604570234, + "grad_norm": 6.108889993757156, + "learning_rate": 2e-06, + "loss": 0.2027, + "step": 2121 + }, + { + "epoch": 0.492286277694003, + "grad_norm": 10.868425584914485, + "learning_rate": 2e-06, + "loss": 0.2438, + "step": 2122 + }, + { + "epoch": 0.4925182693423037, + "grad_norm": 17.635453142218786, + "learning_rate": 2e-06, + "loss": 0.4297, + "step": 2123 + }, + { + "epoch": 0.4927502609906043, + "grad_norm": 17.445941519621602, + "learning_rate": 2e-06, + "loss": 0.436, + "step": 2124 + }, + { + "epoch": 0.492982252638905, + "grad_norm": 16.86257932055074, + "learning_rate": 2e-06, + "loss": 0.334, + "step": 2125 + }, + { + "epoch": 0.49321424428720567, + "grad_norm": 14.557790324443094, + "learning_rate": 2e-06, + "loss": 0.3895, + "step": 2126 + }, + { + "epoch": 0.4934462359355063, + "grad_norm": 15.365709948457615, + "learning_rate": 2e-06, + "loss": 0.2738, + "step": 2127 + }, + { + "epoch": 0.49367822758380697, + "grad_norm": 11.727258104871728, + "learning_rate": 2e-06, + "loss": 0.1987, + "step": 2128 + }, + { + "epoch": 0.49391021923210765, + "grad_norm": 13.763772152669365, + "learning_rate": 2e-06, + "loss": 0.2981, + "step": 2129 + }, + { + "epoch": 0.49414221088040833, + "grad_norm": 12.891289421629347, + "learning_rate": 2e-06, + "loss": 0.241, + "step": 2130 + }, + { + "epoch": 0.49437420252870895, + "grad_norm": 15.264107197755068, + "learning_rate": 2e-06, + "loss": 0.319, + "step": 2131 + }, + { + "epoch": 0.49460619417700963, + "grad_norm": 19.049486690093694, + "learning_rate": 2e-06, + "loss": 0.2844, + "step": 2132 + }, + { + "epoch": 0.4948381858253103, + "grad_norm": 20.289091498324936, + "learning_rate": 2e-06, + "loss": 0.3354, + "step": 2133 + }, + { + "epoch": 0.49507017747361093, + "grad_norm": 8.846363557878725, + "learning_rate": 2e-06, + "loss": 0.2455, + "step": 2134 + }, + { + "epoch": 0.4953021691219116, + "grad_norm": 19.68036789620874, + "learning_rate": 2e-06, + "loss": 0.3254, + "step": 2135 + }, + { + "epoch": 0.4955341607702123, + "grad_norm": 19.01167373777457, + "learning_rate": 2e-06, + "loss": 0.3668, + "step": 2136 + }, + { + "epoch": 0.4957661524185129, + "grad_norm": 20.271394808053454, + "learning_rate": 2e-06, + "loss": 0.3329, + "step": 2137 + }, + { + "epoch": 0.4959981440668136, + "grad_norm": 13.713450589308374, + "learning_rate": 2e-06, + "loss": 0.3172, + "step": 2138 + }, + { + "epoch": 0.49623013571511426, + "grad_norm": 15.681288599458853, + "learning_rate": 2e-06, + "loss": 0.2464, + "step": 2139 + }, + { + "epoch": 0.49646212736341494, + "grad_norm": 22.489577644012517, + "learning_rate": 2e-06, + "loss": 0.466, + "step": 2140 + }, + { + "epoch": 0.49669411901171556, + "grad_norm": 11.929339914832337, + "learning_rate": 2e-06, + "loss": 0.2438, + "step": 2141 + }, + { + "epoch": 0.49692611066001624, + "grad_norm": 6.908041019643261, + "learning_rate": 2e-06, + "loss": 0.1916, + "step": 2142 + }, + { + "epoch": 0.4971581023083169, + "grad_norm": 16.87423395285013, + "learning_rate": 2e-06, + "loss": 0.3377, + "step": 2143 + }, + { + "epoch": 0.49739009395661754, + "grad_norm": 21.484609766510122, + "learning_rate": 2e-06, + "loss": 0.3187, + "step": 2144 + }, + { + "epoch": 0.4976220856049182, + "grad_norm": 10.005933510006772, + "learning_rate": 2e-06, + "loss": 0.2997, + "step": 2145 + }, + { + "epoch": 0.4978540772532189, + "grad_norm": 20.390856830943978, + "learning_rate": 2e-06, + "loss": 0.2964, + "step": 2146 + }, + { + "epoch": 0.4980860689015195, + "grad_norm": 20.54252638356709, + "learning_rate": 2e-06, + "loss": 0.3739, + "step": 2147 + }, + { + "epoch": 0.4983180605498202, + "grad_norm": 6.440030667693158, + "learning_rate": 2e-06, + "loss": 0.213, + "step": 2148 + }, + { + "epoch": 0.4985500521981209, + "grad_norm": 13.909759042917415, + "learning_rate": 2e-06, + "loss": 0.1993, + "step": 2149 + }, + { + "epoch": 0.49878204384642155, + "grad_norm": 13.442729423292715, + "learning_rate": 2e-06, + "loss": 0.1918, + "step": 2150 + }, + { + "epoch": 0.4990140354947222, + "grad_norm": 9.777832167761424, + "learning_rate": 2e-06, + "loss": 0.1691, + "step": 2151 + }, + { + "epoch": 0.49924602714302285, + "grad_norm": 7.528993896065364, + "learning_rate": 2e-06, + "loss": 0.2213, + "step": 2152 + }, + { + "epoch": 0.49947801879132353, + "grad_norm": 25.537326149047804, + "learning_rate": 2e-06, + "loss": 0.4174, + "step": 2153 + }, + { + "epoch": 0.49971001043962415, + "grad_norm": 13.598357491155793, + "learning_rate": 2e-06, + "loss": 0.3848, + "step": 2154 + }, + { + "epoch": 0.49994200208792483, + "grad_norm": 22.388810531659942, + "learning_rate": 2e-06, + "loss": 0.3423, + "step": 2155 + }, + { + "epoch": 0.5001739937362255, + "grad_norm": 13.93373343416442, + "learning_rate": 2e-06, + "loss": 0.2189, + "step": 2156 + }, + { + "epoch": 0.5004059853845262, + "grad_norm": 14.328986795854332, + "learning_rate": 2e-06, + "loss": 0.3242, + "step": 2157 + }, + { + "epoch": 0.5006379770328269, + "grad_norm": 8.904671756440253, + "learning_rate": 2e-06, + "loss": 0.2041, + "step": 2158 + }, + { + "epoch": 0.5008699686811274, + "grad_norm": 23.515968807666702, + "learning_rate": 2e-06, + "loss": 0.4471, + "step": 2159 + }, + { + "epoch": 0.5011019603294281, + "grad_norm": 21.827464695996884, + "learning_rate": 2e-06, + "loss": 0.3769, + "step": 2160 + }, + { + "epoch": 0.5013339519777288, + "grad_norm": 18.861402954271224, + "learning_rate": 2e-06, + "loss": 0.372, + "step": 2161 + }, + { + "epoch": 0.5015659436260295, + "grad_norm": 16.535871849830876, + "learning_rate": 2e-06, + "loss": 0.2858, + "step": 2162 + }, + { + "epoch": 0.5017979352743301, + "grad_norm": 10.303277065169574, + "learning_rate": 2e-06, + "loss": 0.1672, + "step": 2163 + }, + { + "epoch": 0.5020299269226308, + "grad_norm": 15.957852373614422, + "learning_rate": 2e-06, + "loss": 0.2696, + "step": 2164 + }, + { + "epoch": 0.5022619185709315, + "grad_norm": 26.862222528082594, + "learning_rate": 2e-06, + "loss": 0.3697, + "step": 2165 + }, + { + "epoch": 0.5024939102192321, + "grad_norm": 13.53436358948289, + "learning_rate": 2e-06, + "loss": 0.3213, + "step": 2166 + }, + { + "epoch": 0.5027259018675327, + "grad_norm": 29.75890831496249, + "learning_rate": 2e-06, + "loss": 0.3747, + "step": 2167 + }, + { + "epoch": 0.5029578935158334, + "grad_norm": 17.201119948928586, + "learning_rate": 2e-06, + "loss": 0.4157, + "step": 2168 + }, + { + "epoch": 0.5031898851641341, + "grad_norm": 20.322957627453476, + "learning_rate": 2e-06, + "loss": 0.3425, + "step": 2169 + }, + { + "epoch": 0.5034218768124348, + "grad_norm": 7.242148992234226, + "learning_rate": 2e-06, + "loss": 0.2237, + "step": 2170 + }, + { + "epoch": 0.5036538684607355, + "grad_norm": 11.3995466559775, + "learning_rate": 2e-06, + "loss": 0.2041, + "step": 2171 + }, + { + "epoch": 0.503885860109036, + "grad_norm": 13.692661532868897, + "learning_rate": 2e-06, + "loss": 0.2766, + "step": 2172 + }, + { + "epoch": 0.5041178517573367, + "grad_norm": 13.045830152901408, + "learning_rate": 2e-06, + "loss": 0.2802, + "step": 2173 + }, + { + "epoch": 0.5043498434056374, + "grad_norm": 13.19375863975848, + "learning_rate": 2e-06, + "loss": 0.3715, + "step": 2174 + }, + { + "epoch": 0.504581835053938, + "grad_norm": 12.549473738148507, + "learning_rate": 2e-06, + "loss": 0.2536, + "step": 2175 + }, + { + "epoch": 0.5048138267022387, + "grad_norm": 18.270592559002413, + "learning_rate": 2e-06, + "loss": 0.2249, + "step": 2176 + }, + { + "epoch": 0.5050458183505394, + "grad_norm": 22.483451955144304, + "learning_rate": 2e-06, + "loss": 0.3166, + "step": 2177 + }, + { + "epoch": 0.5052778099988401, + "grad_norm": 17.316063660340365, + "learning_rate": 2e-06, + "loss": 0.2714, + "step": 2178 + }, + { + "epoch": 0.5055098016471407, + "grad_norm": 10.947393037556699, + "learning_rate": 2e-06, + "loss": 0.3, + "step": 2179 + }, + { + "epoch": 0.5057417932954413, + "grad_norm": 22.039873971526152, + "learning_rate": 2e-06, + "loss": 0.4411, + "step": 2180 + }, + { + "epoch": 0.505973784943742, + "grad_norm": 10.035311774289827, + "learning_rate": 2e-06, + "loss": 0.2457, + "step": 2181 + }, + { + "epoch": 0.5062057765920427, + "grad_norm": 20.400488855842067, + "learning_rate": 2e-06, + "loss": 0.3309, + "step": 2182 + }, + { + "epoch": 0.5064377682403434, + "grad_norm": 11.139661608372032, + "learning_rate": 2e-06, + "loss": 0.2716, + "step": 2183 + }, + { + "epoch": 0.506669759888644, + "grad_norm": 49.22644449344963, + "learning_rate": 2e-06, + "loss": 0.413, + "step": 2184 + }, + { + "epoch": 0.5069017515369447, + "grad_norm": 19.282249953371107, + "learning_rate": 2e-06, + "loss": 0.2559, + "step": 2185 + }, + { + "epoch": 0.5071337431852453, + "grad_norm": 9.96155712975433, + "learning_rate": 2e-06, + "loss": 0.2553, + "step": 2186 + }, + { + "epoch": 0.507365734833546, + "grad_norm": 11.389788449201504, + "learning_rate": 2e-06, + "loss": 0.2749, + "step": 2187 + }, + { + "epoch": 0.5075977264818466, + "grad_norm": 9.573367203354213, + "learning_rate": 2e-06, + "loss": 0.325, + "step": 2188 + }, + { + "epoch": 0.5078297181301473, + "grad_norm": 17.638556783160844, + "learning_rate": 2e-06, + "loss": 0.3245, + "step": 2189 + }, + { + "epoch": 0.508061709778448, + "grad_norm": 25.24343689451894, + "learning_rate": 2e-06, + "loss": 0.4962, + "step": 2190 + }, + { + "epoch": 0.5082937014267487, + "grad_norm": 15.432898553377086, + "learning_rate": 2e-06, + "loss": 0.2843, + "step": 2191 + }, + { + "epoch": 0.5085256930750492, + "grad_norm": 18.526829636362876, + "learning_rate": 2e-06, + "loss": 0.3193, + "step": 2192 + }, + { + "epoch": 0.5087576847233499, + "grad_norm": 20.829626240692, + "learning_rate": 2e-06, + "loss": 0.2898, + "step": 2193 + }, + { + "epoch": 0.5089896763716506, + "grad_norm": 12.080813626427616, + "learning_rate": 2e-06, + "loss": 0.2743, + "step": 2194 + }, + { + "epoch": 0.5092216680199513, + "grad_norm": 8.755550213350691, + "learning_rate": 2e-06, + "loss": 0.2772, + "step": 2195 + }, + { + "epoch": 0.509453659668252, + "grad_norm": 17.579316445335433, + "learning_rate": 2e-06, + "loss": 0.3266, + "step": 2196 + }, + { + "epoch": 0.5096856513165526, + "grad_norm": 17.325365704374104, + "learning_rate": 2e-06, + "loss": 0.2805, + "step": 2197 + }, + { + "epoch": 0.5099176429648533, + "grad_norm": 14.492615581490334, + "learning_rate": 2e-06, + "loss": 0.3094, + "step": 2198 + }, + { + "epoch": 0.5101496346131539, + "grad_norm": 14.234190346875504, + "learning_rate": 2e-06, + "loss": 0.2348, + "step": 2199 + }, + { + "epoch": 0.5103816262614546, + "grad_norm": 7.5398495123313465, + "learning_rate": 2e-06, + "loss": 0.1718, + "step": 2200 + }, + { + "epoch": 0.5106136179097552, + "grad_norm": 6.97541242627694, + "learning_rate": 2e-06, + "loss": 0.3266, + "step": 2201 + }, + { + "epoch": 0.5108456095580559, + "grad_norm": 12.721195042383458, + "learning_rate": 2e-06, + "loss": 0.253, + "step": 2202 + }, + { + "epoch": 0.5110776012063566, + "grad_norm": 7.105303434910561, + "learning_rate": 2e-06, + "loss": 0.2085, + "step": 2203 + }, + { + "epoch": 0.5113095928546573, + "grad_norm": 16.65842865369534, + "learning_rate": 2e-06, + "loss": 0.326, + "step": 2204 + }, + { + "epoch": 0.511541584502958, + "grad_norm": 24.447691272915243, + "learning_rate": 2e-06, + "loss": 0.3282, + "step": 2205 + }, + { + "epoch": 0.5117735761512585, + "grad_norm": 7.472587814339079, + "learning_rate": 2e-06, + "loss": 0.1978, + "step": 2206 + }, + { + "epoch": 0.5120055677995592, + "grad_norm": 27.452162551642193, + "learning_rate": 2e-06, + "loss": 0.2873, + "step": 2207 + }, + { + "epoch": 0.5122375594478599, + "grad_norm": 12.531644069360812, + "learning_rate": 2e-06, + "loss": 0.2842, + "step": 2208 + }, + { + "epoch": 0.5124695510961605, + "grad_norm": 13.022291960747255, + "learning_rate": 2e-06, + "loss": 0.3008, + "step": 2209 + }, + { + "epoch": 0.5127015427444612, + "grad_norm": 11.436705149111399, + "learning_rate": 2e-06, + "loss": 0.2965, + "step": 2210 + }, + { + "epoch": 0.5129335343927619, + "grad_norm": 9.325082731789037, + "learning_rate": 2e-06, + "loss": 0.2812, + "step": 2211 + }, + { + "epoch": 0.5131655260410625, + "grad_norm": 15.090660618989261, + "learning_rate": 2e-06, + "loss": 0.2271, + "step": 2212 + }, + { + "epoch": 0.5133975176893631, + "grad_norm": 12.092657076627773, + "learning_rate": 2e-06, + "loss": 0.2794, + "step": 2213 + }, + { + "epoch": 0.5136295093376638, + "grad_norm": 16.494372122641952, + "learning_rate": 2e-06, + "loss": 0.3281, + "step": 2214 + }, + { + "epoch": 0.5138615009859645, + "grad_norm": 12.449203842828988, + "learning_rate": 2e-06, + "loss": 0.2594, + "step": 2215 + }, + { + "epoch": 0.5140934926342652, + "grad_norm": 15.52348629468538, + "learning_rate": 2e-06, + "loss": 0.3694, + "step": 2216 + }, + { + "epoch": 0.5143254842825659, + "grad_norm": 17.681642491435536, + "learning_rate": 2e-06, + "loss": 0.3573, + "step": 2217 + }, + { + "epoch": 0.5145574759308665, + "grad_norm": 6.604458338708483, + "learning_rate": 2e-06, + "loss": 0.2758, + "step": 2218 + }, + { + "epoch": 0.5147894675791671, + "grad_norm": 13.732407906659214, + "learning_rate": 2e-06, + "loss": 0.304, + "step": 2219 + }, + { + "epoch": 0.5150214592274678, + "grad_norm": 19.435164902984656, + "learning_rate": 2e-06, + "loss": 0.2704, + "step": 2220 + }, + { + "epoch": 0.5152534508757685, + "grad_norm": 17.545281421409236, + "learning_rate": 2e-06, + "loss": 0.3262, + "step": 2221 + }, + { + "epoch": 0.5154854425240691, + "grad_norm": 9.242074142541009, + "learning_rate": 2e-06, + "loss": 0.2567, + "step": 2222 + }, + { + "epoch": 0.5157174341723698, + "grad_norm": 15.786352449218043, + "learning_rate": 2e-06, + "loss": 0.3375, + "step": 2223 + }, + { + "epoch": 0.5159494258206705, + "grad_norm": 5.905837709522254, + "learning_rate": 2e-06, + "loss": 0.2207, + "step": 2224 + }, + { + "epoch": 0.5161814174689712, + "grad_norm": 14.735766964453777, + "learning_rate": 2e-06, + "loss": 0.3925, + "step": 2225 + }, + { + "epoch": 0.5164134091172717, + "grad_norm": 15.461827147246586, + "learning_rate": 2e-06, + "loss": 0.3894, + "step": 2226 + }, + { + "epoch": 0.5166454007655724, + "grad_norm": 10.058630282691166, + "learning_rate": 2e-06, + "loss": 0.2264, + "step": 2227 + }, + { + "epoch": 0.5168773924138731, + "grad_norm": 6.832754603678109, + "learning_rate": 2e-06, + "loss": 0.281, + "step": 2228 + }, + { + "epoch": 0.5171093840621738, + "grad_norm": 16.511843498691217, + "learning_rate": 2e-06, + "loss": 0.3108, + "step": 2229 + }, + { + "epoch": 0.5173413757104744, + "grad_norm": 19.925078422841, + "learning_rate": 2e-06, + "loss": 0.3226, + "step": 2230 + }, + { + "epoch": 0.5175733673587751, + "grad_norm": 23.279579301825684, + "learning_rate": 2e-06, + "loss": 0.3833, + "step": 2231 + }, + { + "epoch": 0.5178053590070757, + "grad_norm": 10.204446780260694, + "learning_rate": 2e-06, + "loss": 0.2996, + "step": 2232 + }, + { + "epoch": 0.5180373506553764, + "grad_norm": 12.123237316873128, + "learning_rate": 2e-06, + "loss": 0.2906, + "step": 2233 + }, + { + "epoch": 0.518269342303677, + "grad_norm": 11.329122471507537, + "learning_rate": 2e-06, + "loss": 0.3399, + "step": 2234 + }, + { + "epoch": 0.5185013339519777, + "grad_norm": 24.975527522311257, + "learning_rate": 2e-06, + "loss": 0.4095, + "step": 2235 + }, + { + "epoch": 0.5187333256002784, + "grad_norm": 15.940577202895128, + "learning_rate": 2e-06, + "loss": 0.3415, + "step": 2236 + }, + { + "epoch": 0.5189653172485791, + "grad_norm": 20.14661131715556, + "learning_rate": 2e-06, + "loss": 0.3974, + "step": 2237 + }, + { + "epoch": 0.5191973088968798, + "grad_norm": 25.747100784967884, + "learning_rate": 2e-06, + "loss": 0.5037, + "step": 2238 + }, + { + "epoch": 0.5194293005451803, + "grad_norm": 14.208535073812218, + "learning_rate": 2e-06, + "loss": 0.2438, + "step": 2239 + }, + { + "epoch": 0.519661292193481, + "grad_norm": 17.03550596170188, + "learning_rate": 2e-06, + "loss": 0.3653, + "step": 2240 + }, + { + "epoch": 0.5198932838417817, + "grad_norm": 20.93553667412997, + "learning_rate": 2e-06, + "loss": 0.2732, + "step": 2241 + }, + { + "epoch": 0.5201252754900824, + "grad_norm": 10.008111793167586, + "learning_rate": 2e-06, + "loss": 0.2699, + "step": 2242 + }, + { + "epoch": 0.520357267138383, + "grad_norm": 20.26414491317248, + "learning_rate": 2e-06, + "loss": 0.3475, + "step": 2243 + }, + { + "epoch": 0.5205892587866837, + "grad_norm": 24.230542480582827, + "learning_rate": 2e-06, + "loss": 0.3996, + "step": 2244 + }, + { + "epoch": 0.5208212504349844, + "grad_norm": 20.463428916265258, + "learning_rate": 2e-06, + "loss": 0.2895, + "step": 2245 + }, + { + "epoch": 0.521053242083285, + "grad_norm": 12.477672191195685, + "learning_rate": 2e-06, + "loss": 0.2641, + "step": 2246 + }, + { + "epoch": 0.5212852337315856, + "grad_norm": 100.35080063444947, + "learning_rate": 2e-06, + "loss": 0.2666, + "step": 2247 + }, + { + "epoch": 0.5215172253798863, + "grad_norm": 24.628890125223037, + "learning_rate": 2e-06, + "loss": 0.3212, + "step": 2248 + }, + { + "epoch": 0.521749217028187, + "grad_norm": 9.204998287585962, + "learning_rate": 2e-06, + "loss": 0.2658, + "step": 2249 + }, + { + "epoch": 0.5219812086764877, + "grad_norm": 18.863388131065673, + "learning_rate": 2e-06, + "loss": 0.3096, + "step": 2250 + }, + { + "epoch": 0.5222132003247884, + "grad_norm": 15.814258917614072, + "learning_rate": 2e-06, + "loss": 0.2506, + "step": 2251 + }, + { + "epoch": 0.5224451919730889, + "grad_norm": 12.02163164782188, + "learning_rate": 2e-06, + "loss": 0.2721, + "step": 2252 + }, + { + "epoch": 0.5226771836213896, + "grad_norm": 13.910898698958496, + "learning_rate": 2e-06, + "loss": 0.2853, + "step": 2253 + }, + { + "epoch": 0.5229091752696903, + "grad_norm": 17.072635763484605, + "learning_rate": 2e-06, + "loss": 0.2623, + "step": 2254 + }, + { + "epoch": 0.523141166917991, + "grad_norm": 14.665093774873432, + "learning_rate": 2e-06, + "loss": 0.3196, + "step": 2255 + }, + { + "epoch": 0.5233731585662916, + "grad_norm": 17.038607429232925, + "learning_rate": 2e-06, + "loss": 0.3338, + "step": 2256 + }, + { + "epoch": 0.5236051502145923, + "grad_norm": 8.32717226665899, + "learning_rate": 2e-06, + "loss": 0.2404, + "step": 2257 + }, + { + "epoch": 0.523837141862893, + "grad_norm": 16.883098831842254, + "learning_rate": 2e-06, + "loss": 0.2216, + "step": 2258 + }, + { + "epoch": 0.5240691335111936, + "grad_norm": 20.377352713674426, + "learning_rate": 2e-06, + "loss": 0.3238, + "step": 2259 + }, + { + "epoch": 0.5243011251594942, + "grad_norm": 12.458863951225556, + "learning_rate": 2e-06, + "loss": 0.3009, + "step": 2260 + }, + { + "epoch": 0.5245331168077949, + "grad_norm": 11.239890354739732, + "learning_rate": 2e-06, + "loss": 0.3214, + "step": 2261 + }, + { + "epoch": 0.5247651084560956, + "grad_norm": 18.620507924392726, + "learning_rate": 2e-06, + "loss": 0.3723, + "step": 2262 + }, + { + "epoch": 0.5249971001043963, + "grad_norm": 13.254486794790614, + "learning_rate": 2e-06, + "loss": 0.2997, + "step": 2263 + }, + { + "epoch": 0.5252290917526969, + "grad_norm": 13.22770783264009, + "learning_rate": 2e-06, + "loss": 0.3, + "step": 2264 + }, + { + "epoch": 0.5254610834009976, + "grad_norm": 17.009437904290014, + "learning_rate": 2e-06, + "loss": 0.3899, + "step": 2265 + }, + { + "epoch": 0.5256930750492982, + "grad_norm": 20.946786476780414, + "learning_rate": 2e-06, + "loss": 0.3711, + "step": 2266 + }, + { + "epoch": 0.5259250666975989, + "grad_norm": 13.43035616771182, + "learning_rate": 2e-06, + "loss": 0.2795, + "step": 2267 + }, + { + "epoch": 0.5261570583458995, + "grad_norm": 15.701373292627125, + "learning_rate": 2e-06, + "loss": 0.34, + "step": 2268 + }, + { + "epoch": 0.5263890499942002, + "grad_norm": 11.788398579381099, + "learning_rate": 2e-06, + "loss": 0.2095, + "step": 2269 + }, + { + "epoch": 0.5266210416425009, + "grad_norm": 21.228215030045135, + "learning_rate": 2e-06, + "loss": 0.3226, + "step": 2270 + }, + { + "epoch": 0.5268530332908016, + "grad_norm": 13.830399196968724, + "learning_rate": 2e-06, + "loss": 0.2479, + "step": 2271 + }, + { + "epoch": 0.5270850249391021, + "grad_norm": 19.86319616283615, + "learning_rate": 2e-06, + "loss": 0.3438, + "step": 2272 + }, + { + "epoch": 0.5273170165874028, + "grad_norm": 11.956746996575461, + "learning_rate": 2e-06, + "loss": 0.2824, + "step": 2273 + }, + { + "epoch": 0.5275490082357035, + "grad_norm": 11.180178543415238, + "learning_rate": 2e-06, + "loss": 0.2802, + "step": 2274 + }, + { + "epoch": 0.5277809998840042, + "grad_norm": 9.880631130726139, + "learning_rate": 2e-06, + "loss": 0.3052, + "step": 2275 + }, + { + "epoch": 0.5280129915323049, + "grad_norm": 9.766750451583386, + "learning_rate": 2e-06, + "loss": 0.2437, + "step": 2276 + }, + { + "epoch": 0.5282449831806055, + "grad_norm": 18.584827551294715, + "learning_rate": 2e-06, + "loss": 0.3032, + "step": 2277 + }, + { + "epoch": 0.5284769748289062, + "grad_norm": 13.19289878347082, + "learning_rate": 2e-06, + "loss": 0.2042, + "step": 2278 + }, + { + "epoch": 0.5287089664772068, + "grad_norm": 8.841165204840243, + "learning_rate": 2e-06, + "loss": 0.1651, + "step": 2279 + }, + { + "epoch": 0.5289409581255075, + "grad_norm": 12.913411705213157, + "learning_rate": 2e-06, + "loss": 0.2826, + "step": 2280 + }, + { + "epoch": 0.5291729497738081, + "grad_norm": 16.76037343633817, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 2281 + }, + { + "epoch": 0.5294049414221088, + "grad_norm": 15.153622033605663, + "learning_rate": 2e-06, + "loss": 0.2516, + "step": 2282 + }, + { + "epoch": 0.5296369330704095, + "grad_norm": 9.680270585277633, + "learning_rate": 2e-06, + "loss": 0.2359, + "step": 2283 + }, + { + "epoch": 0.5298689247187102, + "grad_norm": 22.396002284150175, + "learning_rate": 2e-06, + "loss": 0.3231, + "step": 2284 + }, + { + "epoch": 0.5301009163670107, + "grad_norm": 16.939145166831942, + "learning_rate": 2e-06, + "loss": 0.4175, + "step": 2285 + }, + { + "epoch": 0.5303329080153114, + "grad_norm": 13.878641540351467, + "learning_rate": 2e-06, + "loss": 0.2965, + "step": 2286 + }, + { + "epoch": 0.5305648996636121, + "grad_norm": 14.818456238172777, + "learning_rate": 2e-06, + "loss": 0.2961, + "step": 2287 + }, + { + "epoch": 0.5307968913119128, + "grad_norm": 12.767136730539981, + "learning_rate": 2e-06, + "loss": 0.258, + "step": 2288 + }, + { + "epoch": 0.5310288829602134, + "grad_norm": 10.171811680648132, + "learning_rate": 2e-06, + "loss": 0.2368, + "step": 2289 + }, + { + "epoch": 0.5312608746085141, + "grad_norm": 16.018420281641706, + "learning_rate": 2e-06, + "loss": 0.3154, + "step": 2290 + }, + { + "epoch": 0.5314928662568148, + "grad_norm": 13.07454574100996, + "learning_rate": 2e-06, + "loss": 0.2373, + "step": 2291 + }, + { + "epoch": 0.5317248579051154, + "grad_norm": 13.353026905275584, + "learning_rate": 2e-06, + "loss": 0.4516, + "step": 2292 + }, + { + "epoch": 0.531956849553416, + "grad_norm": 12.39102054984825, + "learning_rate": 2e-06, + "loss": 0.2753, + "step": 2293 + }, + { + "epoch": 0.5321888412017167, + "grad_norm": 17.34864838714827, + "learning_rate": 2e-06, + "loss": 0.3422, + "step": 2294 + }, + { + "epoch": 0.5324208328500174, + "grad_norm": 16.64109635116588, + "learning_rate": 2e-06, + "loss": 0.1882, + "step": 2295 + }, + { + "epoch": 0.5326528244983181, + "grad_norm": 14.168599271666439, + "learning_rate": 2e-06, + "loss": 0.4041, + "step": 2296 + }, + { + "epoch": 0.5328848161466188, + "grad_norm": 16.024561602681622, + "learning_rate": 2e-06, + "loss": 0.2647, + "step": 2297 + }, + { + "epoch": 0.5331168077949194, + "grad_norm": 15.266324319466408, + "learning_rate": 2e-06, + "loss": 0.3007, + "step": 2298 + }, + { + "epoch": 0.53334879944322, + "grad_norm": 14.756981269977882, + "learning_rate": 2e-06, + "loss": 0.2312, + "step": 2299 + }, + { + "epoch": 0.5335807910915207, + "grad_norm": 20.247224134109825, + "learning_rate": 2e-06, + "loss": 0.302, + "step": 2300 + }, + { + "epoch": 0.5338127827398214, + "grad_norm": 18.21501807524069, + "learning_rate": 2e-06, + "loss": 0.2802, + "step": 2301 + }, + { + "epoch": 0.534044774388122, + "grad_norm": 9.884233780654693, + "learning_rate": 2e-06, + "loss": 0.2565, + "step": 2302 + }, + { + "epoch": 0.5342767660364227, + "grad_norm": 17.423012073490213, + "learning_rate": 2e-06, + "loss": 0.3299, + "step": 2303 + }, + { + "epoch": 0.5345087576847234, + "grad_norm": 18.25976551054708, + "learning_rate": 2e-06, + "loss": 0.4491, + "step": 2304 + }, + { + "epoch": 0.534740749333024, + "grad_norm": 11.64769360043185, + "learning_rate": 2e-06, + "loss": 0.2243, + "step": 2305 + }, + { + "epoch": 0.5349727409813246, + "grad_norm": 14.89009167595676, + "learning_rate": 2e-06, + "loss": 0.2694, + "step": 2306 + }, + { + "epoch": 0.5352047326296253, + "grad_norm": 17.75693597667897, + "learning_rate": 2e-06, + "loss": 0.4001, + "step": 2307 + }, + { + "epoch": 0.535436724277926, + "grad_norm": 17.159122883056842, + "learning_rate": 2e-06, + "loss": 0.3234, + "step": 2308 + }, + { + "epoch": 0.5356687159262267, + "grad_norm": 13.326925118558393, + "learning_rate": 2e-06, + "loss": 0.2136, + "step": 2309 + }, + { + "epoch": 0.5359007075745273, + "grad_norm": 24.72149462549094, + "learning_rate": 2e-06, + "loss": 0.3685, + "step": 2310 + }, + { + "epoch": 0.536132699222828, + "grad_norm": 8.387532478488598, + "learning_rate": 2e-06, + "loss": 0.2289, + "step": 2311 + }, + { + "epoch": 0.5363646908711286, + "grad_norm": 18.357082381108455, + "learning_rate": 2e-06, + "loss": 0.3841, + "step": 2312 + }, + { + "epoch": 0.5365966825194293, + "grad_norm": 12.95410078049241, + "learning_rate": 2e-06, + "loss": 0.3858, + "step": 2313 + }, + { + "epoch": 0.53682867416773, + "grad_norm": 13.089473379625638, + "learning_rate": 2e-06, + "loss": 0.2664, + "step": 2314 + }, + { + "epoch": 0.5370606658160306, + "grad_norm": 19.433935514249765, + "learning_rate": 2e-06, + "loss": 0.2824, + "step": 2315 + }, + { + "epoch": 0.5372926574643313, + "grad_norm": 10.02339829476107, + "learning_rate": 2e-06, + "loss": 0.2437, + "step": 2316 + }, + { + "epoch": 0.537524649112632, + "grad_norm": 12.552941976553708, + "learning_rate": 2e-06, + "loss": 0.3056, + "step": 2317 + }, + { + "epoch": 0.5377566407609327, + "grad_norm": 7.083892332328677, + "learning_rate": 2e-06, + "loss": 0.1957, + "step": 2318 + }, + { + "epoch": 0.5379886324092332, + "grad_norm": 20.42922099874839, + "learning_rate": 2e-06, + "loss": 0.4354, + "step": 2319 + }, + { + "epoch": 0.5382206240575339, + "grad_norm": 20.31467380319267, + "learning_rate": 2e-06, + "loss": 0.3671, + "step": 2320 + }, + { + "epoch": 0.5384526157058346, + "grad_norm": 14.633241874519927, + "learning_rate": 2e-06, + "loss": 0.2382, + "step": 2321 + }, + { + "epoch": 0.5386846073541353, + "grad_norm": 12.98039438440369, + "learning_rate": 2e-06, + "loss": 0.2664, + "step": 2322 + }, + { + "epoch": 0.5389165990024359, + "grad_norm": 16.565699553079266, + "learning_rate": 2e-06, + "loss": 0.2974, + "step": 2323 + }, + { + "epoch": 0.5391485906507366, + "grad_norm": 15.238803129232114, + "learning_rate": 2e-06, + "loss": 0.343, + "step": 2324 + }, + { + "epoch": 0.5393805822990372, + "grad_norm": 6.829254929370642, + "learning_rate": 2e-06, + "loss": 0.1979, + "step": 2325 + }, + { + "epoch": 0.5396125739473379, + "grad_norm": 9.858620413535052, + "learning_rate": 2e-06, + "loss": 0.2957, + "step": 2326 + }, + { + "epoch": 0.5398445655956385, + "grad_norm": 6.932270398065999, + "learning_rate": 2e-06, + "loss": 0.2907, + "step": 2327 + }, + { + "epoch": 0.5400765572439392, + "grad_norm": 12.710146071255126, + "learning_rate": 2e-06, + "loss": 0.2605, + "step": 2328 + }, + { + "epoch": 0.5403085488922399, + "grad_norm": 7.804004164747687, + "learning_rate": 2e-06, + "loss": 0.1733, + "step": 2329 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 21.81106908709857, + "learning_rate": 2e-06, + "loss": 0.3851, + "step": 2330 + }, + { + "epoch": 0.5407725321888412, + "grad_norm": 5.8748588911926145, + "learning_rate": 2e-06, + "loss": 0.2951, + "step": 2331 + }, + { + "epoch": 0.5410045238371418, + "grad_norm": 16.2691670015431, + "learning_rate": 2e-06, + "loss": 0.2875, + "step": 2332 + }, + { + "epoch": 0.5412365154854425, + "grad_norm": 17.236361159889725, + "learning_rate": 2e-06, + "loss": 0.2956, + "step": 2333 + }, + { + "epoch": 0.5414685071337432, + "grad_norm": 10.273390136146526, + "learning_rate": 2e-06, + "loss": 0.3004, + "step": 2334 + }, + { + "epoch": 0.5417004987820438, + "grad_norm": 20.87612590795775, + "learning_rate": 2e-06, + "loss": 0.3434, + "step": 2335 + }, + { + "epoch": 0.5419324904303445, + "grad_norm": 27.08238944500166, + "learning_rate": 2e-06, + "loss": 0.5375, + "step": 2336 + }, + { + "epoch": 0.5421644820786452, + "grad_norm": 6.215206436525487, + "learning_rate": 2e-06, + "loss": 0.1769, + "step": 2337 + }, + { + "epoch": 0.5423964737269459, + "grad_norm": 15.724186262250118, + "learning_rate": 2e-06, + "loss": 0.3055, + "step": 2338 + }, + { + "epoch": 0.5426284653752464, + "grad_norm": 11.368510674204973, + "learning_rate": 2e-06, + "loss": 0.2766, + "step": 2339 + }, + { + "epoch": 0.5428604570235471, + "grad_norm": 18.357210746849432, + "learning_rate": 2e-06, + "loss": 0.3577, + "step": 2340 + }, + { + "epoch": 0.5430924486718478, + "grad_norm": 12.542863291662933, + "learning_rate": 2e-06, + "loss": 0.2229, + "step": 2341 + }, + { + "epoch": 0.5433244403201485, + "grad_norm": 12.147225659096481, + "learning_rate": 2e-06, + "loss": 0.3056, + "step": 2342 + }, + { + "epoch": 0.5435564319684492, + "grad_norm": 12.837094753441027, + "learning_rate": 2e-06, + "loss": 0.3689, + "step": 2343 + }, + { + "epoch": 0.5437884236167498, + "grad_norm": 14.999446526361522, + "learning_rate": 2e-06, + "loss": 0.2976, + "step": 2344 + }, + { + "epoch": 0.5440204152650504, + "grad_norm": 26.954049953958133, + "learning_rate": 2e-06, + "loss": 0.4802, + "step": 2345 + }, + { + "epoch": 0.5442524069133511, + "grad_norm": 15.525353341388612, + "learning_rate": 2e-06, + "loss": 0.3723, + "step": 2346 + }, + { + "epoch": 0.5444843985616518, + "grad_norm": 18.84480370516095, + "learning_rate": 2e-06, + "loss": 0.4144, + "step": 2347 + }, + { + "epoch": 0.5447163902099524, + "grad_norm": 9.673118895122698, + "learning_rate": 2e-06, + "loss": 0.1825, + "step": 2348 + }, + { + "epoch": 0.5449483818582531, + "grad_norm": 6.578330514551692, + "learning_rate": 2e-06, + "loss": 0.228, + "step": 2349 + }, + { + "epoch": 0.5451803735065538, + "grad_norm": 21.222832338767525, + "learning_rate": 2e-06, + "loss": 0.3208, + "step": 2350 + }, + { + "epoch": 0.5454123651548545, + "grad_norm": 19.578665813588188, + "learning_rate": 2e-06, + "loss": 0.3288, + "step": 2351 + }, + { + "epoch": 0.545644356803155, + "grad_norm": 17.952013710643097, + "learning_rate": 2e-06, + "loss": 0.2957, + "step": 2352 + }, + { + "epoch": 0.5458763484514557, + "grad_norm": 16.066508436001325, + "learning_rate": 2e-06, + "loss": 0.3123, + "step": 2353 + }, + { + "epoch": 0.5461083400997564, + "grad_norm": 8.080325358675177, + "learning_rate": 2e-06, + "loss": 0.267, + "step": 2354 + }, + { + "epoch": 0.5463403317480571, + "grad_norm": 16.236401419663363, + "learning_rate": 2e-06, + "loss": 0.3246, + "step": 2355 + }, + { + "epoch": 0.5465723233963578, + "grad_norm": 23.272151572351987, + "learning_rate": 2e-06, + "loss": 0.353, + "step": 2356 + }, + { + "epoch": 0.5468043150446584, + "grad_norm": 13.538462553357316, + "learning_rate": 2e-06, + "loss": 0.2789, + "step": 2357 + }, + { + "epoch": 0.5470363066929591, + "grad_norm": 16.407723112546826, + "learning_rate": 2e-06, + "loss": 0.2905, + "step": 2358 + }, + { + "epoch": 0.5472682983412597, + "grad_norm": 8.124259163275745, + "learning_rate": 2e-06, + "loss": 0.2108, + "step": 2359 + }, + { + "epoch": 0.5475002899895604, + "grad_norm": 20.691277150417193, + "learning_rate": 2e-06, + "loss": 0.3986, + "step": 2360 + }, + { + "epoch": 0.547732281637861, + "grad_norm": 24.34822370897118, + "learning_rate": 2e-06, + "loss": 0.3407, + "step": 2361 + }, + { + "epoch": 0.5479642732861617, + "grad_norm": 14.216553246002624, + "learning_rate": 2e-06, + "loss": 0.3413, + "step": 2362 + }, + { + "epoch": 0.5481962649344624, + "grad_norm": 14.793477667414159, + "learning_rate": 2e-06, + "loss": 0.3285, + "step": 2363 + }, + { + "epoch": 0.5484282565827631, + "grad_norm": 12.641379310724114, + "learning_rate": 2e-06, + "loss": 0.2678, + "step": 2364 + }, + { + "epoch": 0.5486602482310636, + "grad_norm": 13.317895196477977, + "learning_rate": 2e-06, + "loss": 0.325, + "step": 2365 + }, + { + "epoch": 0.5488922398793643, + "grad_norm": 16.161930551381584, + "learning_rate": 2e-06, + "loss": 0.3113, + "step": 2366 + }, + { + "epoch": 0.549124231527665, + "grad_norm": 7.385801035359576, + "learning_rate": 2e-06, + "loss": 0.2321, + "step": 2367 + }, + { + "epoch": 0.5493562231759657, + "grad_norm": 11.702660325886935, + "learning_rate": 2e-06, + "loss": 0.3183, + "step": 2368 + }, + { + "epoch": 0.5495882148242663, + "grad_norm": 14.062392473354258, + "learning_rate": 2e-06, + "loss": 0.254, + "step": 2369 + }, + { + "epoch": 0.549820206472567, + "grad_norm": 18.748060159797518, + "learning_rate": 2e-06, + "loss": 0.3368, + "step": 2370 + }, + { + "epoch": 0.5500521981208677, + "grad_norm": 14.300497850364994, + "learning_rate": 2e-06, + "loss": 0.3072, + "step": 2371 + }, + { + "epoch": 0.5502841897691683, + "grad_norm": 10.418969804626075, + "learning_rate": 2e-06, + "loss": 0.2342, + "step": 2372 + }, + { + "epoch": 0.5505161814174689, + "grad_norm": 20.44987186445462, + "learning_rate": 2e-06, + "loss": 0.3585, + "step": 2373 + }, + { + "epoch": 0.5507481730657696, + "grad_norm": 20.05953857845853, + "learning_rate": 2e-06, + "loss": 0.2701, + "step": 2374 + }, + { + "epoch": 0.5509801647140703, + "grad_norm": 16.11962443192865, + "learning_rate": 2e-06, + "loss": 0.2922, + "step": 2375 + }, + { + "epoch": 0.551212156362371, + "grad_norm": 10.646811964940431, + "learning_rate": 2e-06, + "loss": 0.2887, + "step": 2376 + }, + { + "epoch": 0.5514441480106717, + "grad_norm": 18.017398046246253, + "learning_rate": 2e-06, + "loss": 0.3291, + "step": 2377 + }, + { + "epoch": 0.5516761396589723, + "grad_norm": 9.988883639517411, + "learning_rate": 2e-06, + "loss": 0.2606, + "step": 2378 + }, + { + "epoch": 0.5519081313072729, + "grad_norm": 12.064044391011736, + "learning_rate": 2e-06, + "loss": 0.3694, + "step": 2379 + }, + { + "epoch": 0.5521401229555736, + "grad_norm": 11.821170069247135, + "learning_rate": 2e-06, + "loss": 0.1949, + "step": 2380 + }, + { + "epoch": 0.5523721146038743, + "grad_norm": 9.066163899738006, + "learning_rate": 2e-06, + "loss": 0.1487, + "step": 2381 + }, + { + "epoch": 0.5526041062521749, + "grad_norm": 11.322561241704996, + "learning_rate": 2e-06, + "loss": 0.3068, + "step": 2382 + }, + { + "epoch": 0.5528360979004756, + "grad_norm": 7.512021724930349, + "learning_rate": 2e-06, + "loss": 0.2746, + "step": 2383 + }, + { + "epoch": 0.5530680895487763, + "grad_norm": 9.300021551053998, + "learning_rate": 2e-06, + "loss": 0.2483, + "step": 2384 + }, + { + "epoch": 0.5533000811970769, + "grad_norm": 18.989662991902502, + "learning_rate": 2e-06, + "loss": 0.3323, + "step": 2385 + }, + { + "epoch": 0.5535320728453775, + "grad_norm": 10.986524529354641, + "learning_rate": 2e-06, + "loss": 0.1893, + "step": 2386 + }, + { + "epoch": 0.5537640644936782, + "grad_norm": 10.924830004619258, + "learning_rate": 2e-06, + "loss": 0.311, + "step": 2387 + }, + { + "epoch": 0.5539960561419789, + "grad_norm": 14.54093682002165, + "learning_rate": 2e-06, + "loss": 0.2469, + "step": 2388 + }, + { + "epoch": 0.5542280477902796, + "grad_norm": 23.32331970457638, + "learning_rate": 2e-06, + "loss": 0.3154, + "step": 2389 + }, + { + "epoch": 0.5544600394385802, + "grad_norm": 14.801663169850901, + "learning_rate": 2e-06, + "loss": 0.2924, + "step": 2390 + }, + { + "epoch": 0.5546920310868809, + "grad_norm": 12.918355697187497, + "learning_rate": 2e-06, + "loss": 0.2827, + "step": 2391 + }, + { + "epoch": 0.5549240227351815, + "grad_norm": 17.089038419122126, + "learning_rate": 2e-06, + "loss": 0.321, + "step": 2392 + }, + { + "epoch": 0.5551560143834822, + "grad_norm": 9.67915384286139, + "learning_rate": 2e-06, + "loss": 0.2727, + "step": 2393 + }, + { + "epoch": 0.5553880060317828, + "grad_norm": 7.766141239959042, + "learning_rate": 2e-06, + "loss": 0.191, + "step": 2394 + }, + { + "epoch": 0.5556199976800835, + "grad_norm": 18.090735119556687, + "learning_rate": 2e-06, + "loss": 0.3184, + "step": 2395 + }, + { + "epoch": 0.5558519893283842, + "grad_norm": 8.505508622938232, + "learning_rate": 2e-06, + "loss": 0.2925, + "step": 2396 + }, + { + "epoch": 0.5560839809766849, + "grad_norm": 14.387996447013048, + "learning_rate": 2e-06, + "loss": 0.3296, + "step": 2397 + }, + { + "epoch": 0.5563159726249856, + "grad_norm": 16.06393041702412, + "learning_rate": 2e-06, + "loss": 0.3835, + "step": 2398 + }, + { + "epoch": 0.5565479642732861, + "grad_norm": 13.847206072754162, + "learning_rate": 2e-06, + "loss": 0.2664, + "step": 2399 + }, + { + "epoch": 0.5567799559215868, + "grad_norm": 6.8657066601555625, + "learning_rate": 2e-06, + "loss": 0.2501, + "step": 2400 + }, + { + "epoch": 0.5570119475698875, + "grad_norm": 19.52261317249476, + "learning_rate": 2e-06, + "loss": 0.3469, + "step": 2401 + }, + { + "epoch": 0.5572439392181882, + "grad_norm": 8.778377305253947, + "learning_rate": 2e-06, + "loss": 0.3689, + "step": 2402 + }, + { + "epoch": 0.5574759308664888, + "grad_norm": 16.378735216573, + "learning_rate": 2e-06, + "loss": 0.3538, + "step": 2403 + }, + { + "epoch": 0.5577079225147895, + "grad_norm": 11.787808238531637, + "learning_rate": 2e-06, + "loss": 0.2358, + "step": 2404 + }, + { + "epoch": 0.5579399141630901, + "grad_norm": 9.171065652456917, + "learning_rate": 2e-06, + "loss": 0.2658, + "step": 2405 + }, + { + "epoch": 0.5581719058113908, + "grad_norm": 10.920307577783632, + "learning_rate": 2e-06, + "loss": 0.3377, + "step": 2406 + }, + { + "epoch": 0.5584038974596914, + "grad_norm": 16.384068560006575, + "learning_rate": 2e-06, + "loss": 0.245, + "step": 2407 + }, + { + "epoch": 0.5586358891079921, + "grad_norm": 15.407554795589347, + "learning_rate": 2e-06, + "loss": 0.316, + "step": 2408 + }, + { + "epoch": 0.5588678807562928, + "grad_norm": 8.255536852752579, + "learning_rate": 2e-06, + "loss": 0.3018, + "step": 2409 + }, + { + "epoch": 0.5590998724045935, + "grad_norm": 15.595870272913924, + "learning_rate": 2e-06, + "loss": 0.2373, + "step": 2410 + }, + { + "epoch": 0.5593318640528941, + "grad_norm": 10.573620346477986, + "learning_rate": 2e-06, + "loss": 0.2634, + "step": 2411 + }, + { + "epoch": 0.5595638557011947, + "grad_norm": 13.44498052742146, + "learning_rate": 2e-06, + "loss": 0.2921, + "step": 2412 + }, + { + "epoch": 0.5597958473494954, + "grad_norm": 18.226161572351923, + "learning_rate": 2e-06, + "loss": 0.3855, + "step": 2413 + }, + { + "epoch": 0.5600278389977961, + "grad_norm": 26.04602255991392, + "learning_rate": 2e-06, + "loss": 0.3388, + "step": 2414 + }, + { + "epoch": 0.5602598306460967, + "grad_norm": 11.40825468917101, + "learning_rate": 2e-06, + "loss": 0.2476, + "step": 2415 + }, + { + "epoch": 0.5604918222943974, + "grad_norm": 8.047637255378097, + "learning_rate": 2e-06, + "loss": 0.2142, + "step": 2416 + }, + { + "epoch": 0.5607238139426981, + "grad_norm": 9.763552278815421, + "learning_rate": 2e-06, + "loss": 0.2166, + "step": 2417 + }, + { + "epoch": 0.5609558055909988, + "grad_norm": 15.426847438028386, + "learning_rate": 2e-06, + "loss": 0.3242, + "step": 2418 + }, + { + "epoch": 0.5611877972392993, + "grad_norm": 10.019695584582262, + "learning_rate": 2e-06, + "loss": 0.2315, + "step": 2419 + }, + { + "epoch": 0.5614197888876, + "grad_norm": 11.950391616471022, + "learning_rate": 2e-06, + "loss": 0.3032, + "step": 2420 + }, + { + "epoch": 0.5616517805359007, + "grad_norm": 9.821360624161986, + "learning_rate": 2e-06, + "loss": 0.2753, + "step": 2421 + }, + { + "epoch": 0.5618837721842014, + "grad_norm": 13.779153309516882, + "learning_rate": 2e-06, + "loss": 0.2562, + "step": 2422 + }, + { + "epoch": 0.5621157638325021, + "grad_norm": 20.55957101204331, + "learning_rate": 2e-06, + "loss": 0.348, + "step": 2423 + }, + { + "epoch": 0.5623477554808027, + "grad_norm": 10.717575945681206, + "learning_rate": 2e-06, + "loss": 0.3057, + "step": 2424 + }, + { + "epoch": 0.5625797471291033, + "grad_norm": 9.5656433019114, + "learning_rate": 2e-06, + "loss": 0.2073, + "step": 2425 + }, + { + "epoch": 0.562811738777404, + "grad_norm": 14.943321776155047, + "learning_rate": 2e-06, + "loss": 0.2825, + "step": 2426 + }, + { + "epoch": 0.5630437304257047, + "grad_norm": 14.854031581718427, + "learning_rate": 2e-06, + "loss": 0.4032, + "step": 2427 + }, + { + "epoch": 0.5632757220740053, + "grad_norm": 19.945968462228393, + "learning_rate": 2e-06, + "loss": 0.2607, + "step": 2428 + }, + { + "epoch": 0.563507713722306, + "grad_norm": 21.348685688736698, + "learning_rate": 2e-06, + "loss": 0.2983, + "step": 2429 + }, + { + "epoch": 0.5637397053706067, + "grad_norm": 8.603273140615295, + "learning_rate": 2e-06, + "loss": 0.296, + "step": 2430 + }, + { + "epoch": 0.5639716970189074, + "grad_norm": 18.347215498971828, + "learning_rate": 2e-06, + "loss": 0.3555, + "step": 2431 + }, + { + "epoch": 0.5642036886672079, + "grad_norm": 11.246191700500239, + "learning_rate": 2e-06, + "loss": 0.2004, + "step": 2432 + }, + { + "epoch": 0.5644356803155086, + "grad_norm": 14.682817929588682, + "learning_rate": 2e-06, + "loss": 0.3376, + "step": 2433 + }, + { + "epoch": 0.5646676719638093, + "grad_norm": 20.22286878096074, + "learning_rate": 2e-06, + "loss": 0.2333, + "step": 2434 + }, + { + "epoch": 0.56489966361211, + "grad_norm": 16.49444735633445, + "learning_rate": 2e-06, + "loss": 0.2472, + "step": 2435 + }, + { + "epoch": 0.5651316552604106, + "grad_norm": 6.68551153484024, + "learning_rate": 2e-06, + "loss": 0.225, + "step": 2436 + }, + { + "epoch": 0.5653636469087113, + "grad_norm": 9.686076248914613, + "learning_rate": 2e-06, + "loss": 0.3447, + "step": 2437 + }, + { + "epoch": 0.5655956385570119, + "grad_norm": 17.76599719791514, + "learning_rate": 2e-06, + "loss": 0.2478, + "step": 2438 + }, + { + "epoch": 0.5658276302053126, + "grad_norm": 21.389768804288273, + "learning_rate": 2e-06, + "loss": 0.3174, + "step": 2439 + }, + { + "epoch": 0.5660596218536132, + "grad_norm": 14.049780815178298, + "learning_rate": 2e-06, + "loss": 0.3321, + "step": 2440 + }, + { + "epoch": 0.5662916135019139, + "grad_norm": 14.033113374604143, + "learning_rate": 2e-06, + "loss": 0.3, + "step": 2441 + }, + { + "epoch": 0.5665236051502146, + "grad_norm": 22.448007786902185, + "learning_rate": 2e-06, + "loss": 0.259, + "step": 2442 + }, + { + "epoch": 0.5667555967985153, + "grad_norm": 14.619950092423217, + "learning_rate": 2e-06, + "loss": 0.3202, + "step": 2443 + }, + { + "epoch": 0.566987588446816, + "grad_norm": 19.221172213265596, + "learning_rate": 2e-06, + "loss": 0.4239, + "step": 2444 + }, + { + "epoch": 0.5672195800951165, + "grad_norm": 10.266028401673237, + "learning_rate": 2e-06, + "loss": 0.2292, + "step": 2445 + }, + { + "epoch": 0.5674515717434172, + "grad_norm": 16.665276447968612, + "learning_rate": 2e-06, + "loss": 0.2707, + "step": 2446 + }, + { + "epoch": 0.5676835633917179, + "grad_norm": 10.6274177866923, + "learning_rate": 2e-06, + "loss": 0.3535, + "step": 2447 + }, + { + "epoch": 0.5679155550400186, + "grad_norm": 11.8099997640705, + "learning_rate": 2e-06, + "loss": 0.366, + "step": 2448 + }, + { + "epoch": 0.5681475466883192, + "grad_norm": 12.555980498186655, + "learning_rate": 2e-06, + "loss": 0.3625, + "step": 2449 + }, + { + "epoch": 0.5683795383366199, + "grad_norm": 19.410923811816254, + "learning_rate": 2e-06, + "loss": 0.2926, + "step": 2450 + }, + { + "epoch": 0.5686115299849206, + "grad_norm": 2.8525262146093486, + "learning_rate": 2e-06, + "loss": 0.1367, + "step": 2451 + }, + { + "epoch": 0.5688435216332212, + "grad_norm": 10.782898016379185, + "learning_rate": 2e-06, + "loss": 0.3504, + "step": 2452 + }, + { + "epoch": 0.5690755132815218, + "grad_norm": 16.295736831717836, + "learning_rate": 2e-06, + "loss": 0.3744, + "step": 2453 + }, + { + "epoch": 0.5693075049298225, + "grad_norm": 12.536042109056044, + "learning_rate": 2e-06, + "loss": 0.2732, + "step": 2454 + }, + { + "epoch": 0.5695394965781232, + "grad_norm": 14.381777859735644, + "learning_rate": 2e-06, + "loss": 0.3513, + "step": 2455 + }, + { + "epoch": 0.5697714882264239, + "grad_norm": 18.1258827344621, + "learning_rate": 2e-06, + "loss": 0.2716, + "step": 2456 + }, + { + "epoch": 0.5700034798747246, + "grad_norm": 10.674561459760259, + "learning_rate": 2e-06, + "loss": 0.314, + "step": 2457 + }, + { + "epoch": 0.5702354715230251, + "grad_norm": 16.690016743500063, + "learning_rate": 2e-06, + "loss": 0.3153, + "step": 2458 + }, + { + "epoch": 0.5704674631713258, + "grad_norm": 17.419336891896343, + "learning_rate": 2e-06, + "loss": 0.2838, + "step": 2459 + }, + { + "epoch": 0.5706994548196265, + "grad_norm": 9.199259539316124, + "learning_rate": 2e-06, + "loss": 0.315, + "step": 2460 + }, + { + "epoch": 0.5709314464679272, + "grad_norm": 11.35509051682607, + "learning_rate": 2e-06, + "loss": 0.3286, + "step": 2461 + }, + { + "epoch": 0.5711634381162278, + "grad_norm": 13.829657807396838, + "learning_rate": 2e-06, + "loss": 0.2636, + "step": 2462 + }, + { + "epoch": 0.5713954297645285, + "grad_norm": 13.03723236578468, + "learning_rate": 2e-06, + "loss": 0.3142, + "step": 2463 + }, + { + "epoch": 0.5716274214128292, + "grad_norm": 14.110551716191281, + "learning_rate": 2e-06, + "loss": 0.323, + "step": 2464 + }, + { + "epoch": 0.5718594130611298, + "grad_norm": 9.653588565786267, + "learning_rate": 2e-06, + "loss": 0.2312, + "step": 2465 + }, + { + "epoch": 0.5720914047094304, + "grad_norm": 9.622618204933014, + "learning_rate": 2e-06, + "loss": 0.3371, + "step": 2466 + }, + { + "epoch": 0.5723233963577311, + "grad_norm": 26.944013997098477, + "learning_rate": 2e-06, + "loss": 0.3133, + "step": 2467 + }, + { + "epoch": 0.5725553880060318, + "grad_norm": 22.606898378576506, + "learning_rate": 2e-06, + "loss": 0.4094, + "step": 2468 + }, + { + "epoch": 0.5727873796543325, + "grad_norm": 14.59816805685614, + "learning_rate": 2e-06, + "loss": 0.4009, + "step": 2469 + }, + { + "epoch": 0.5730193713026331, + "grad_norm": 16.673847044715497, + "learning_rate": 2e-06, + "loss": 0.3406, + "step": 2470 + }, + { + "epoch": 0.5732513629509338, + "grad_norm": 14.847720782363652, + "learning_rate": 2e-06, + "loss": 0.2871, + "step": 2471 + }, + { + "epoch": 0.5734833545992344, + "grad_norm": 18.607494659772954, + "learning_rate": 2e-06, + "loss": 0.4925, + "step": 2472 + }, + { + "epoch": 0.5737153462475351, + "grad_norm": 14.499260801159384, + "learning_rate": 2e-06, + "loss": 0.3471, + "step": 2473 + }, + { + "epoch": 0.5739473378958357, + "grad_norm": 10.426182616781508, + "learning_rate": 2e-06, + "loss": 0.2871, + "step": 2474 + }, + { + "epoch": 0.5741793295441364, + "grad_norm": 11.674093773224836, + "learning_rate": 2e-06, + "loss": 0.2291, + "step": 2475 + }, + { + "epoch": 0.5744113211924371, + "grad_norm": 17.47014738833898, + "learning_rate": 2e-06, + "loss": 0.2749, + "step": 2476 + }, + { + "epoch": 0.5746433128407378, + "grad_norm": 11.770554342989735, + "learning_rate": 2e-06, + "loss": 0.264, + "step": 2477 + }, + { + "epoch": 0.5748753044890383, + "grad_norm": 20.94092375824433, + "learning_rate": 2e-06, + "loss": 0.3868, + "step": 2478 + }, + { + "epoch": 0.575107296137339, + "grad_norm": 13.262166143545379, + "learning_rate": 2e-06, + "loss": 0.2915, + "step": 2479 + }, + { + "epoch": 0.5753392877856397, + "grad_norm": 8.214286704099807, + "learning_rate": 2e-06, + "loss": 0.3086, + "step": 2480 + }, + { + "epoch": 0.5755712794339404, + "grad_norm": 8.507786708511688, + "learning_rate": 2e-06, + "loss": 0.2798, + "step": 2481 + }, + { + "epoch": 0.575803271082241, + "grad_norm": 18.642647129706745, + "learning_rate": 2e-06, + "loss": 0.3305, + "step": 2482 + }, + { + "epoch": 0.5760352627305417, + "grad_norm": 15.689231004790807, + "learning_rate": 2e-06, + "loss": 0.2448, + "step": 2483 + }, + { + "epoch": 0.5762672543788424, + "grad_norm": 12.418802098698727, + "learning_rate": 2e-06, + "loss": 0.2637, + "step": 2484 + }, + { + "epoch": 0.576499246027143, + "grad_norm": 11.158407856020904, + "learning_rate": 2e-06, + "loss": 0.3159, + "step": 2485 + }, + { + "epoch": 0.5767312376754437, + "grad_norm": 19.61483635744511, + "learning_rate": 2e-06, + "loss": 0.294, + "step": 2486 + }, + { + "epoch": 0.5769632293237443, + "grad_norm": 14.865542445369957, + "learning_rate": 2e-06, + "loss": 0.2789, + "step": 2487 + }, + { + "epoch": 0.577195220972045, + "grad_norm": 15.83240384681972, + "learning_rate": 2e-06, + "loss": 0.3269, + "step": 2488 + }, + { + "epoch": 0.5774272126203457, + "grad_norm": 15.78410172720427, + "learning_rate": 2e-06, + "loss": 0.3453, + "step": 2489 + }, + { + "epoch": 0.5776592042686464, + "grad_norm": 8.634452953530715, + "learning_rate": 2e-06, + "loss": 0.2287, + "step": 2490 + }, + { + "epoch": 0.577891195916947, + "grad_norm": 12.732603792922383, + "learning_rate": 2e-06, + "loss": 0.2653, + "step": 2491 + }, + { + "epoch": 0.5781231875652476, + "grad_norm": 16.61088777946793, + "learning_rate": 2e-06, + "loss": 0.2511, + "step": 2492 + }, + { + "epoch": 0.5783551792135483, + "grad_norm": 15.248936766635833, + "learning_rate": 2e-06, + "loss": 0.2938, + "step": 2493 + }, + { + "epoch": 0.578587170861849, + "grad_norm": 22.75939149932287, + "learning_rate": 2e-06, + "loss": 0.4973, + "step": 2494 + }, + { + "epoch": 0.5788191625101496, + "grad_norm": 13.169262362496072, + "learning_rate": 2e-06, + "loss": 0.3417, + "step": 2495 + }, + { + "epoch": 0.5790511541584503, + "grad_norm": 14.869556808260299, + "learning_rate": 2e-06, + "loss": 0.3089, + "step": 2496 + }, + { + "epoch": 0.579283145806751, + "grad_norm": 7.219403236004867, + "learning_rate": 2e-06, + "loss": 0.1704, + "step": 2497 + }, + { + "epoch": 0.5795151374550516, + "grad_norm": 20.08537920234524, + "learning_rate": 2e-06, + "loss": 0.2664, + "step": 2498 + }, + { + "epoch": 0.5797471291033522, + "grad_norm": 6.917849407953844, + "learning_rate": 2e-06, + "loss": 0.2856, + "step": 2499 + }, + { + "epoch": 0.5799791207516529, + "grad_norm": 17.85469764329847, + "learning_rate": 2e-06, + "loss": 0.3051, + "step": 2500 + }, + { + "epoch": 0.5802111123999536, + "grad_norm": 9.285602566523718, + "learning_rate": 2e-06, + "loss": 0.1979, + "step": 2501 + }, + { + "epoch": 0.5804431040482543, + "grad_norm": 12.654586179547152, + "learning_rate": 2e-06, + "loss": 0.2821, + "step": 2502 + }, + { + "epoch": 0.580675095696555, + "grad_norm": 20.484107836541714, + "learning_rate": 2e-06, + "loss": 0.4086, + "step": 2503 + }, + { + "epoch": 0.5809070873448556, + "grad_norm": 30.810539020797318, + "learning_rate": 2e-06, + "loss": 0.4463, + "step": 2504 + }, + { + "epoch": 0.5811390789931562, + "grad_norm": 8.239820096389755, + "learning_rate": 2e-06, + "loss": 0.1982, + "step": 2505 + }, + { + "epoch": 0.5813710706414569, + "grad_norm": 6.987799616443373, + "learning_rate": 2e-06, + "loss": 0.2571, + "step": 2506 + }, + { + "epoch": 0.5816030622897576, + "grad_norm": 9.824023287155521, + "learning_rate": 2e-06, + "loss": 0.2848, + "step": 2507 + }, + { + "epoch": 0.5818350539380582, + "grad_norm": 19.77791150461645, + "learning_rate": 2e-06, + "loss": 0.3736, + "step": 2508 + }, + { + "epoch": 0.5820670455863589, + "grad_norm": 35.11127353541142, + "learning_rate": 2e-06, + "loss": 0.4481, + "step": 2509 + }, + { + "epoch": 0.5822990372346596, + "grad_norm": 11.140002417533, + "learning_rate": 2e-06, + "loss": 0.3415, + "step": 2510 + }, + { + "epoch": 0.5825310288829603, + "grad_norm": 17.981016984708337, + "learning_rate": 2e-06, + "loss": 0.3271, + "step": 2511 + }, + { + "epoch": 0.5827630205312608, + "grad_norm": 11.72871904063151, + "learning_rate": 2e-06, + "loss": 0.2438, + "step": 2512 + }, + { + "epoch": 0.5829950121795615, + "grad_norm": 5.025027934513667, + "learning_rate": 2e-06, + "loss": 0.1639, + "step": 2513 + }, + { + "epoch": 0.5832270038278622, + "grad_norm": 11.810865289588344, + "learning_rate": 2e-06, + "loss": 0.2174, + "step": 2514 + }, + { + "epoch": 0.5834589954761629, + "grad_norm": 24.26560656264537, + "learning_rate": 2e-06, + "loss": 0.2419, + "step": 2515 + }, + { + "epoch": 0.5836909871244635, + "grad_norm": 18.46991140854295, + "learning_rate": 2e-06, + "loss": 0.2605, + "step": 2516 + }, + { + "epoch": 0.5839229787727642, + "grad_norm": 15.03403044306349, + "learning_rate": 2e-06, + "loss": 0.3359, + "step": 2517 + }, + { + "epoch": 0.5841549704210648, + "grad_norm": 11.645891200490585, + "learning_rate": 2e-06, + "loss": 0.2115, + "step": 2518 + }, + { + "epoch": 0.5843869620693655, + "grad_norm": 14.340377431021409, + "learning_rate": 2e-06, + "loss": 0.2448, + "step": 2519 + }, + { + "epoch": 0.5846189537176661, + "grad_norm": 14.147283918955994, + "learning_rate": 2e-06, + "loss": 0.2075, + "step": 2520 + }, + { + "epoch": 0.5848509453659668, + "grad_norm": 17.865648426932434, + "learning_rate": 2e-06, + "loss": 0.2673, + "step": 2521 + }, + { + "epoch": 0.5850829370142675, + "grad_norm": 14.55836248361842, + "learning_rate": 2e-06, + "loss": 0.2814, + "step": 2522 + }, + { + "epoch": 0.5853149286625682, + "grad_norm": 16.81136985208734, + "learning_rate": 2e-06, + "loss": 0.3298, + "step": 2523 + }, + { + "epoch": 0.5855469203108689, + "grad_norm": 20.565540320996366, + "learning_rate": 2e-06, + "loss": 0.3327, + "step": 2524 + }, + { + "epoch": 0.5857789119591694, + "grad_norm": 24.14059008951668, + "learning_rate": 2e-06, + "loss": 0.337, + "step": 2525 + }, + { + "epoch": 0.5860109036074701, + "grad_norm": 15.991993948265032, + "learning_rate": 2e-06, + "loss": 0.4677, + "step": 2526 + }, + { + "epoch": 0.5862428952557708, + "grad_norm": 12.974831364373912, + "learning_rate": 2e-06, + "loss": 0.3675, + "step": 2527 + }, + { + "epoch": 0.5864748869040715, + "grad_norm": 15.676183243588898, + "learning_rate": 2e-06, + "loss": 0.3662, + "step": 2528 + }, + { + "epoch": 0.5867068785523721, + "grad_norm": 20.18352831596817, + "learning_rate": 2e-06, + "loss": 0.3298, + "step": 2529 + }, + { + "epoch": 0.5869388702006728, + "grad_norm": 11.742845399116613, + "learning_rate": 2e-06, + "loss": 0.2105, + "step": 2530 + }, + { + "epoch": 0.5871708618489735, + "grad_norm": 15.036929135487856, + "learning_rate": 2e-06, + "loss": 0.3134, + "step": 2531 + }, + { + "epoch": 0.5874028534972741, + "grad_norm": 10.653932848203507, + "learning_rate": 2e-06, + "loss": 0.224, + "step": 2532 + }, + { + "epoch": 0.5876348451455747, + "grad_norm": 16.1086614736084, + "learning_rate": 2e-06, + "loss": 0.3744, + "step": 2533 + }, + { + "epoch": 0.5878668367938754, + "grad_norm": 29.101406200313374, + "learning_rate": 2e-06, + "loss": 0.3329, + "step": 2534 + }, + { + "epoch": 0.5880988284421761, + "grad_norm": 38.350231530496075, + "learning_rate": 2e-06, + "loss": 0.4513, + "step": 2535 + }, + { + "epoch": 0.5883308200904768, + "grad_norm": 23.724065742515208, + "learning_rate": 2e-06, + "loss": 0.3881, + "step": 2536 + }, + { + "epoch": 0.5885628117387774, + "grad_norm": 15.37795572922142, + "learning_rate": 2e-06, + "loss": 0.2551, + "step": 2537 + }, + { + "epoch": 0.588794803387078, + "grad_norm": 16.865307773422217, + "learning_rate": 2e-06, + "loss": 0.315, + "step": 2538 + }, + { + "epoch": 0.5890267950353787, + "grad_norm": 17.793328192027396, + "learning_rate": 2e-06, + "loss": 0.2581, + "step": 2539 + }, + { + "epoch": 0.5892587866836794, + "grad_norm": 17.195738504318005, + "learning_rate": 2e-06, + "loss": 0.2906, + "step": 2540 + }, + { + "epoch": 0.58949077833198, + "grad_norm": 10.37882031859556, + "learning_rate": 2e-06, + "loss": 0.3523, + "step": 2541 + }, + { + "epoch": 0.5897227699802807, + "grad_norm": 8.259590272826024, + "learning_rate": 2e-06, + "loss": 0.2488, + "step": 2542 + }, + { + "epoch": 0.5899547616285814, + "grad_norm": 19.193523289503357, + "learning_rate": 2e-06, + "loss": 0.338, + "step": 2543 + }, + { + "epoch": 0.5901867532768821, + "grad_norm": 14.629504299379278, + "learning_rate": 2e-06, + "loss": 0.254, + "step": 2544 + }, + { + "epoch": 0.5904187449251826, + "grad_norm": 8.224400622005064, + "learning_rate": 2e-06, + "loss": 0.2536, + "step": 2545 + }, + { + "epoch": 0.5906507365734833, + "grad_norm": 15.86256323018292, + "learning_rate": 2e-06, + "loss": 0.2646, + "step": 2546 + }, + { + "epoch": 0.590882728221784, + "grad_norm": 18.072618750754682, + "learning_rate": 2e-06, + "loss": 0.2477, + "step": 2547 + }, + { + "epoch": 0.5911147198700847, + "grad_norm": 13.436673286426196, + "learning_rate": 2e-06, + "loss": 0.299, + "step": 2548 + }, + { + "epoch": 0.5913467115183854, + "grad_norm": 6.801983823427113, + "learning_rate": 2e-06, + "loss": 0.3031, + "step": 2549 + }, + { + "epoch": 0.591578703166686, + "grad_norm": 14.892621067621787, + "learning_rate": 2e-06, + "loss": 0.3393, + "step": 2550 + }, + { + "epoch": 0.5918106948149867, + "grad_norm": 15.883135619779361, + "learning_rate": 2e-06, + "loss": 0.3129, + "step": 2551 + }, + { + "epoch": 0.5920426864632873, + "grad_norm": 17.51304692590423, + "learning_rate": 2e-06, + "loss": 0.3318, + "step": 2552 + }, + { + "epoch": 0.592274678111588, + "grad_norm": 13.23396182628087, + "learning_rate": 2e-06, + "loss": 0.2345, + "step": 2553 + }, + { + "epoch": 0.5925066697598886, + "grad_norm": 16.088956436374215, + "learning_rate": 2e-06, + "loss": 0.3597, + "step": 2554 + }, + { + "epoch": 0.5927386614081893, + "grad_norm": 19.618846840750198, + "learning_rate": 2e-06, + "loss": 0.3431, + "step": 2555 + }, + { + "epoch": 0.59297065305649, + "grad_norm": 16.66909534596031, + "learning_rate": 2e-06, + "loss": 0.2486, + "step": 2556 + }, + { + "epoch": 0.5932026447047907, + "grad_norm": 10.560779229199174, + "learning_rate": 2e-06, + "loss": 0.2887, + "step": 2557 + }, + { + "epoch": 0.5934346363530912, + "grad_norm": 9.21356550066053, + "learning_rate": 2e-06, + "loss": 0.1793, + "step": 2558 + }, + { + "epoch": 0.5936666280013919, + "grad_norm": 14.076285578466297, + "learning_rate": 2e-06, + "loss": 0.3869, + "step": 2559 + }, + { + "epoch": 0.5938986196496926, + "grad_norm": 12.847638105175127, + "learning_rate": 2e-06, + "loss": 0.2315, + "step": 2560 + }, + { + "epoch": 0.5941306112979933, + "grad_norm": 14.765732513424698, + "learning_rate": 2e-06, + "loss": 0.2799, + "step": 2561 + }, + { + "epoch": 0.594362602946294, + "grad_norm": 9.942317670085435, + "learning_rate": 2e-06, + "loss": 0.2714, + "step": 2562 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 6.0178161019588945, + "learning_rate": 2e-06, + "loss": 0.2055, + "step": 2563 + }, + { + "epoch": 0.5948265862428953, + "grad_norm": 16.221366734512934, + "learning_rate": 2e-06, + "loss": 0.3824, + "step": 2564 + }, + { + "epoch": 0.5950585778911959, + "grad_norm": 13.889351920592597, + "learning_rate": 2e-06, + "loss": 0.303, + "step": 2565 + }, + { + "epoch": 0.5952905695394966, + "grad_norm": 23.308845437497098, + "learning_rate": 2e-06, + "loss": 0.3687, + "step": 2566 + }, + { + "epoch": 0.5955225611877972, + "grad_norm": 16.15487734503701, + "learning_rate": 2e-06, + "loss": 0.2385, + "step": 2567 + }, + { + "epoch": 0.5957545528360979, + "grad_norm": 14.492592621249058, + "learning_rate": 2e-06, + "loss": 0.2203, + "step": 2568 + }, + { + "epoch": 0.5959865444843986, + "grad_norm": 14.475746535168714, + "learning_rate": 2e-06, + "loss": 0.3553, + "step": 2569 + }, + { + "epoch": 0.5962185361326993, + "grad_norm": 14.198491742040812, + "learning_rate": 2e-06, + "loss": 0.2305, + "step": 2570 + }, + { + "epoch": 0.5964505277809999, + "grad_norm": 17.549194217988287, + "learning_rate": 2e-06, + "loss": 0.3198, + "step": 2571 + }, + { + "epoch": 0.5966825194293005, + "grad_norm": 13.338985893175025, + "learning_rate": 2e-06, + "loss": 0.3296, + "step": 2572 + }, + { + "epoch": 0.5969145110776012, + "grad_norm": 17.512050505262632, + "learning_rate": 2e-06, + "loss": 0.2729, + "step": 2573 + }, + { + "epoch": 0.5971465027259019, + "grad_norm": 6.628241469541592, + "learning_rate": 2e-06, + "loss": 0.2512, + "step": 2574 + }, + { + "epoch": 0.5973784943742025, + "grad_norm": 21.899371256267624, + "learning_rate": 2e-06, + "loss": 0.418, + "step": 2575 + }, + { + "epoch": 0.5976104860225032, + "grad_norm": 21.141754928031396, + "learning_rate": 2e-06, + "loss": 0.321, + "step": 2576 + }, + { + "epoch": 0.5978424776708039, + "grad_norm": 21.29611023083716, + "learning_rate": 2e-06, + "loss": 0.3016, + "step": 2577 + }, + { + "epoch": 0.5980744693191045, + "grad_norm": 9.439127869148606, + "learning_rate": 2e-06, + "loss": 0.2466, + "step": 2578 + }, + { + "epoch": 0.5983064609674051, + "grad_norm": 12.79725874640436, + "learning_rate": 2e-06, + "loss": 0.2972, + "step": 2579 + }, + { + "epoch": 0.5985384526157058, + "grad_norm": 13.59370150264999, + "learning_rate": 2e-06, + "loss": 0.2991, + "step": 2580 + }, + { + "epoch": 0.5987704442640065, + "grad_norm": 14.290613413702845, + "learning_rate": 2e-06, + "loss": 0.2741, + "step": 2581 + }, + { + "epoch": 0.5990024359123072, + "grad_norm": 14.739111004523165, + "learning_rate": 2e-06, + "loss": 0.299, + "step": 2582 + }, + { + "epoch": 0.5992344275606079, + "grad_norm": 18.43042050151962, + "learning_rate": 2e-06, + "loss": 0.3581, + "step": 2583 + }, + { + "epoch": 0.5994664192089085, + "grad_norm": 16.742572811047022, + "learning_rate": 2e-06, + "loss": 0.405, + "step": 2584 + }, + { + "epoch": 0.5996984108572091, + "grad_norm": 13.803051539755291, + "learning_rate": 2e-06, + "loss": 0.2549, + "step": 2585 + }, + { + "epoch": 0.5999304025055098, + "grad_norm": 18.181730133497524, + "learning_rate": 2e-06, + "loss": 0.3734, + "step": 2586 + }, + { + "epoch": 0.6001623941538105, + "grad_norm": 11.635904384068603, + "learning_rate": 2e-06, + "loss": 0.3005, + "step": 2587 + }, + { + "epoch": 0.6003943858021111, + "grad_norm": 15.347115874466267, + "learning_rate": 2e-06, + "loss": 0.3835, + "step": 2588 + }, + { + "epoch": 0.6006263774504118, + "grad_norm": 8.825379842280917, + "learning_rate": 2e-06, + "loss": 0.2267, + "step": 2589 + }, + { + "epoch": 0.6008583690987125, + "grad_norm": 9.577537283110104, + "learning_rate": 2e-06, + "loss": 0.3363, + "step": 2590 + }, + { + "epoch": 0.601090360747013, + "grad_norm": 27.429490879194134, + "learning_rate": 2e-06, + "loss": 0.3661, + "step": 2591 + }, + { + "epoch": 0.6013223523953137, + "grad_norm": 22.96181063077117, + "learning_rate": 2e-06, + "loss": 0.3991, + "step": 2592 + }, + { + "epoch": 0.6015543440436144, + "grad_norm": 20.831944745474473, + "learning_rate": 2e-06, + "loss": 0.3794, + "step": 2593 + }, + { + "epoch": 0.6017863356919151, + "grad_norm": 24.6377518098682, + "learning_rate": 2e-06, + "loss": 0.3596, + "step": 2594 + }, + { + "epoch": 0.6020183273402158, + "grad_norm": 16.685655535948804, + "learning_rate": 2e-06, + "loss": 0.3078, + "step": 2595 + }, + { + "epoch": 0.6022503189885164, + "grad_norm": 16.949976752704817, + "learning_rate": 2e-06, + "loss": 0.3985, + "step": 2596 + }, + { + "epoch": 0.6024823106368171, + "grad_norm": 10.333889262423241, + "learning_rate": 2e-06, + "loss": 0.2667, + "step": 2597 + }, + { + "epoch": 0.6027143022851177, + "grad_norm": 14.510670341351082, + "learning_rate": 2e-06, + "loss": 0.3442, + "step": 2598 + }, + { + "epoch": 0.6029462939334184, + "grad_norm": 17.69640920886926, + "learning_rate": 2e-06, + "loss": 0.2401, + "step": 2599 + }, + { + "epoch": 0.603178285581719, + "grad_norm": 23.660783062967415, + "learning_rate": 2e-06, + "loss": 0.3295, + "step": 2600 + }, + { + "epoch": 0.6034102772300197, + "grad_norm": 18.057275005324275, + "learning_rate": 2e-06, + "loss": 0.3407, + "step": 2601 + }, + { + "epoch": 0.6036422688783204, + "grad_norm": 24.069099929213795, + "learning_rate": 2e-06, + "loss": 0.3268, + "step": 2602 + }, + { + "epoch": 0.6038742605266211, + "grad_norm": 10.642561428651906, + "learning_rate": 2e-06, + "loss": 0.3963, + "step": 2603 + }, + { + "epoch": 0.6041062521749218, + "grad_norm": 20.174001034192678, + "learning_rate": 2e-06, + "loss": 0.2681, + "step": 2604 + }, + { + "epoch": 0.6043382438232223, + "grad_norm": 9.175946684897225, + "learning_rate": 2e-06, + "loss": 0.2181, + "step": 2605 + }, + { + "epoch": 0.604570235471523, + "grad_norm": 8.166771981839268, + "learning_rate": 2e-06, + "loss": 0.2716, + "step": 2606 + }, + { + "epoch": 0.6048022271198237, + "grad_norm": 16.899229840133128, + "learning_rate": 2e-06, + "loss": 0.2287, + "step": 2607 + }, + { + "epoch": 0.6050342187681244, + "grad_norm": 9.44602089006785, + "learning_rate": 2e-06, + "loss": 0.2403, + "step": 2608 + }, + { + "epoch": 0.605266210416425, + "grad_norm": 13.783306010662567, + "learning_rate": 2e-06, + "loss": 0.3988, + "step": 2609 + }, + { + "epoch": 0.6054982020647257, + "grad_norm": 10.320072678209218, + "learning_rate": 2e-06, + "loss": 0.2224, + "step": 2610 + }, + { + "epoch": 0.6057301937130263, + "grad_norm": 15.997254592009337, + "learning_rate": 2e-06, + "loss": 0.3589, + "step": 2611 + }, + { + "epoch": 0.605962185361327, + "grad_norm": 8.183158067579095, + "learning_rate": 2e-06, + "loss": 0.168, + "step": 2612 + }, + { + "epoch": 0.6061941770096276, + "grad_norm": 10.745161014883454, + "learning_rate": 2e-06, + "loss": 0.2714, + "step": 2613 + }, + { + "epoch": 0.6064261686579283, + "grad_norm": 11.462258671047206, + "learning_rate": 2e-06, + "loss": 0.2523, + "step": 2614 + }, + { + "epoch": 0.606658160306229, + "grad_norm": 12.016709149335806, + "learning_rate": 2e-06, + "loss": 0.314, + "step": 2615 + }, + { + "epoch": 0.6068901519545297, + "grad_norm": 17.155313994573596, + "learning_rate": 2e-06, + "loss": 0.408, + "step": 2616 + }, + { + "epoch": 0.6071221436028303, + "grad_norm": 11.915214369441893, + "learning_rate": 2e-06, + "loss": 0.3494, + "step": 2617 + }, + { + "epoch": 0.6073541352511309, + "grad_norm": 9.603980367813907, + "learning_rate": 2e-06, + "loss": 0.3015, + "step": 2618 + }, + { + "epoch": 0.6075861268994316, + "grad_norm": 14.152671605729422, + "learning_rate": 2e-06, + "loss": 0.2807, + "step": 2619 + }, + { + "epoch": 0.6078181185477323, + "grad_norm": 9.895894107038899, + "learning_rate": 2e-06, + "loss": 0.2248, + "step": 2620 + }, + { + "epoch": 0.608050110196033, + "grad_norm": 21.33095069897468, + "learning_rate": 2e-06, + "loss": 0.3436, + "step": 2621 + }, + { + "epoch": 0.6082821018443336, + "grad_norm": 7.935644587182702, + "learning_rate": 2e-06, + "loss": 0.3164, + "step": 2622 + }, + { + "epoch": 0.6085140934926343, + "grad_norm": 19.81730932480036, + "learning_rate": 2e-06, + "loss": 0.3868, + "step": 2623 + }, + { + "epoch": 0.608746085140935, + "grad_norm": 15.406381840658463, + "learning_rate": 2e-06, + "loss": 0.2037, + "step": 2624 + }, + { + "epoch": 0.6089780767892355, + "grad_norm": 15.622610047217227, + "learning_rate": 2e-06, + "loss": 0.353, + "step": 2625 + }, + { + "epoch": 0.6092100684375362, + "grad_norm": 19.054233656211007, + "learning_rate": 2e-06, + "loss": 0.3748, + "step": 2626 + }, + { + "epoch": 0.6094420600858369, + "grad_norm": 17.01955389711876, + "learning_rate": 2e-06, + "loss": 0.3669, + "step": 2627 + }, + { + "epoch": 0.6096740517341376, + "grad_norm": 13.138403748635863, + "learning_rate": 2e-06, + "loss": 0.3121, + "step": 2628 + }, + { + "epoch": 0.6099060433824383, + "grad_norm": 7.430059477695104, + "learning_rate": 2e-06, + "loss": 0.2589, + "step": 2629 + }, + { + "epoch": 0.6101380350307389, + "grad_norm": 12.017745160492542, + "learning_rate": 2e-06, + "loss": 0.3337, + "step": 2630 + }, + { + "epoch": 0.6103700266790395, + "grad_norm": 21.893420801946824, + "learning_rate": 2e-06, + "loss": 0.3723, + "step": 2631 + }, + { + "epoch": 0.6106020183273402, + "grad_norm": 7.716883732515496, + "learning_rate": 2e-06, + "loss": 0.246, + "step": 2632 + }, + { + "epoch": 0.6108340099756409, + "grad_norm": 12.394263618488978, + "learning_rate": 2e-06, + "loss": 0.2482, + "step": 2633 + }, + { + "epoch": 0.6110660016239415, + "grad_norm": 13.85844630687673, + "learning_rate": 2e-06, + "loss": 0.3278, + "step": 2634 + }, + { + "epoch": 0.6112979932722422, + "grad_norm": 15.23846624738061, + "learning_rate": 2e-06, + "loss": 0.3521, + "step": 2635 + }, + { + "epoch": 0.6115299849205429, + "grad_norm": 17.00492197768118, + "learning_rate": 2e-06, + "loss": 0.2893, + "step": 2636 + }, + { + "epoch": 0.6117619765688436, + "grad_norm": 13.586664525927867, + "learning_rate": 2e-06, + "loss": 0.2788, + "step": 2637 + }, + { + "epoch": 0.6119939682171441, + "grad_norm": 15.014978243660334, + "learning_rate": 2e-06, + "loss": 0.3178, + "step": 2638 + }, + { + "epoch": 0.6122259598654448, + "grad_norm": 13.84349412358929, + "learning_rate": 2e-06, + "loss": 0.2344, + "step": 2639 + }, + { + "epoch": 0.6124579515137455, + "grad_norm": 22.096753673828697, + "learning_rate": 2e-06, + "loss": 0.3522, + "step": 2640 + }, + { + "epoch": 0.6126899431620462, + "grad_norm": 14.059995206076655, + "learning_rate": 2e-06, + "loss": 0.3093, + "step": 2641 + }, + { + "epoch": 0.6129219348103468, + "grad_norm": 8.321542567832232, + "learning_rate": 2e-06, + "loss": 0.3574, + "step": 2642 + }, + { + "epoch": 0.6131539264586475, + "grad_norm": 13.750629896271956, + "learning_rate": 2e-06, + "loss": 0.1864, + "step": 2643 + }, + { + "epoch": 0.6133859181069482, + "grad_norm": 22.7628000844624, + "learning_rate": 2e-06, + "loss": 0.4686, + "step": 2644 + }, + { + "epoch": 0.6136179097552488, + "grad_norm": 9.845866081722727, + "learning_rate": 2e-06, + "loss": 0.2652, + "step": 2645 + }, + { + "epoch": 0.6138499014035494, + "grad_norm": 13.935750749172811, + "learning_rate": 2e-06, + "loss": 0.3633, + "step": 2646 + }, + { + "epoch": 0.6140818930518501, + "grad_norm": 20.494770762962357, + "learning_rate": 2e-06, + "loss": 0.3229, + "step": 2647 + }, + { + "epoch": 0.6143138847001508, + "grad_norm": 16.60093010574271, + "learning_rate": 2e-06, + "loss": 0.3424, + "step": 2648 + }, + { + "epoch": 0.6145458763484515, + "grad_norm": 20.04060203813773, + "learning_rate": 2e-06, + "loss": 0.4419, + "step": 2649 + }, + { + "epoch": 0.6147778679967522, + "grad_norm": 11.737435793683828, + "learning_rate": 2e-06, + "loss": 0.2781, + "step": 2650 + }, + { + "epoch": 0.6150098596450527, + "grad_norm": 13.725621163731567, + "learning_rate": 2e-06, + "loss": 0.2673, + "step": 2651 + }, + { + "epoch": 0.6152418512933534, + "grad_norm": 15.800607780056476, + "learning_rate": 2e-06, + "loss": 0.2469, + "step": 2652 + }, + { + "epoch": 0.6154738429416541, + "grad_norm": 20.152900288357856, + "learning_rate": 2e-06, + "loss": 0.2142, + "step": 2653 + }, + { + "epoch": 0.6157058345899548, + "grad_norm": 16.55339980838737, + "learning_rate": 2e-06, + "loss": 0.3571, + "step": 2654 + }, + { + "epoch": 0.6159378262382554, + "grad_norm": 10.524688730312457, + "learning_rate": 2e-06, + "loss": 0.3035, + "step": 2655 + }, + { + "epoch": 0.6161698178865561, + "grad_norm": 10.31213022805119, + "learning_rate": 2e-06, + "loss": 0.272, + "step": 2656 + }, + { + "epoch": 0.6164018095348568, + "grad_norm": 9.45210475666712, + "learning_rate": 2e-06, + "loss": 0.2183, + "step": 2657 + }, + { + "epoch": 0.6166338011831574, + "grad_norm": 21.582429259346284, + "learning_rate": 2e-06, + "loss": 0.2551, + "step": 2658 + }, + { + "epoch": 0.616865792831458, + "grad_norm": 26.47571112560527, + "learning_rate": 2e-06, + "loss": 0.3598, + "step": 2659 + }, + { + "epoch": 0.6170977844797587, + "grad_norm": 14.710721928558407, + "learning_rate": 2e-06, + "loss": 0.3941, + "step": 2660 + }, + { + "epoch": 0.6173297761280594, + "grad_norm": 10.122986352634644, + "learning_rate": 2e-06, + "loss": 0.2817, + "step": 2661 + }, + { + "epoch": 0.6175617677763601, + "grad_norm": 17.1002915041208, + "learning_rate": 2e-06, + "loss": 0.2976, + "step": 2662 + }, + { + "epoch": 0.6177937594246607, + "grad_norm": 15.228871856863515, + "learning_rate": 2e-06, + "loss": 0.3329, + "step": 2663 + }, + { + "epoch": 0.6180257510729614, + "grad_norm": 10.804501166132074, + "learning_rate": 2e-06, + "loss": 0.2286, + "step": 2664 + }, + { + "epoch": 0.618257742721262, + "grad_norm": 19.23509206707099, + "learning_rate": 2e-06, + "loss": 0.2952, + "step": 2665 + }, + { + "epoch": 0.6184897343695627, + "grad_norm": 19.918949097059865, + "learning_rate": 2e-06, + "loss": 0.3427, + "step": 2666 + }, + { + "epoch": 0.6187217260178633, + "grad_norm": 11.805634939444555, + "learning_rate": 2e-06, + "loss": 0.3066, + "step": 2667 + }, + { + "epoch": 0.618953717666164, + "grad_norm": 15.470640332499393, + "learning_rate": 2e-06, + "loss": 0.1898, + "step": 2668 + }, + { + "epoch": 0.6191857093144647, + "grad_norm": 9.398799386754078, + "learning_rate": 2e-06, + "loss": 0.2322, + "step": 2669 + }, + { + "epoch": 0.6194177009627654, + "grad_norm": 13.784983840873174, + "learning_rate": 2e-06, + "loss": 0.3268, + "step": 2670 + }, + { + "epoch": 0.619649692611066, + "grad_norm": 12.30965234954036, + "learning_rate": 2e-06, + "loss": 0.2877, + "step": 2671 + }, + { + "epoch": 0.6198816842593666, + "grad_norm": 11.794091327712746, + "learning_rate": 2e-06, + "loss": 0.2644, + "step": 2672 + }, + { + "epoch": 0.6201136759076673, + "grad_norm": 21.585226462652304, + "learning_rate": 2e-06, + "loss": 0.4381, + "step": 2673 + }, + { + "epoch": 0.620345667555968, + "grad_norm": 10.362497530620518, + "learning_rate": 2e-06, + "loss": 0.2232, + "step": 2674 + }, + { + "epoch": 0.6205776592042687, + "grad_norm": 12.525941047158943, + "learning_rate": 2e-06, + "loss": 0.2882, + "step": 2675 + }, + { + "epoch": 0.6208096508525693, + "grad_norm": 16.138277764285274, + "learning_rate": 2e-06, + "loss": 0.3669, + "step": 2676 + }, + { + "epoch": 0.62104164250087, + "grad_norm": 15.780768061964427, + "learning_rate": 2e-06, + "loss": 0.3749, + "step": 2677 + }, + { + "epoch": 0.6212736341491706, + "grad_norm": 20.30659677577186, + "learning_rate": 2e-06, + "loss": 0.3917, + "step": 2678 + }, + { + "epoch": 0.6215056257974713, + "grad_norm": 12.25187953805943, + "learning_rate": 2e-06, + "loss": 0.2449, + "step": 2679 + }, + { + "epoch": 0.6217376174457719, + "grad_norm": 12.850671799957505, + "learning_rate": 2e-06, + "loss": 0.2107, + "step": 2680 + }, + { + "epoch": 0.6219696090940726, + "grad_norm": 21.096912590816856, + "learning_rate": 2e-06, + "loss": 0.2952, + "step": 2681 + }, + { + "epoch": 0.6222016007423733, + "grad_norm": 25.320626047184888, + "learning_rate": 2e-06, + "loss": 0.389, + "step": 2682 + }, + { + "epoch": 0.622433592390674, + "grad_norm": 22.577550122053413, + "learning_rate": 2e-06, + "loss": 0.3165, + "step": 2683 + }, + { + "epoch": 0.6226655840389747, + "grad_norm": 12.988787747440956, + "learning_rate": 2e-06, + "loss": 0.2689, + "step": 2684 + }, + { + "epoch": 0.6228975756872752, + "grad_norm": 9.692038170370148, + "learning_rate": 2e-06, + "loss": 0.2945, + "step": 2685 + }, + { + "epoch": 0.6231295673355759, + "grad_norm": 15.284503787530403, + "learning_rate": 2e-06, + "loss": 0.3389, + "step": 2686 + }, + { + "epoch": 0.6233615589838766, + "grad_norm": 23.334498764386655, + "learning_rate": 2e-06, + "loss": 0.3517, + "step": 2687 + }, + { + "epoch": 0.6235935506321773, + "grad_norm": 10.922055258467221, + "learning_rate": 2e-06, + "loss": 0.2828, + "step": 2688 + }, + { + "epoch": 0.6238255422804779, + "grad_norm": 10.065528194947968, + "learning_rate": 2e-06, + "loss": 0.2002, + "step": 2689 + }, + { + "epoch": 0.6240575339287786, + "grad_norm": 18.114588707791558, + "learning_rate": 2e-06, + "loss": 0.199, + "step": 2690 + }, + { + "epoch": 0.6242895255770792, + "grad_norm": 25.19740551477204, + "learning_rate": 2e-06, + "loss": 0.3786, + "step": 2691 + }, + { + "epoch": 0.6245215172253799, + "grad_norm": 12.660265258094135, + "learning_rate": 2e-06, + "loss": 0.2891, + "step": 2692 + }, + { + "epoch": 0.6247535088736805, + "grad_norm": 12.01025971009369, + "learning_rate": 2e-06, + "loss": 0.218, + "step": 2693 + }, + { + "epoch": 0.6249855005219812, + "grad_norm": 19.57290722929422, + "learning_rate": 2e-06, + "loss": 0.4168, + "step": 2694 + }, + { + "epoch": 0.6252174921702819, + "grad_norm": 18.333835413311697, + "learning_rate": 2e-06, + "loss": 0.3802, + "step": 2695 + }, + { + "epoch": 0.6254494838185826, + "grad_norm": 9.452348955554905, + "learning_rate": 2e-06, + "loss": 0.3192, + "step": 2696 + }, + { + "epoch": 0.6256814754668832, + "grad_norm": 19.413813530652142, + "learning_rate": 2e-06, + "loss": 0.3486, + "step": 2697 + }, + { + "epoch": 0.6259134671151838, + "grad_norm": 8.14297216131884, + "learning_rate": 2e-06, + "loss": 0.1745, + "step": 2698 + }, + { + "epoch": 0.6261454587634845, + "grad_norm": 16.460858927880107, + "learning_rate": 2e-06, + "loss": 0.2898, + "step": 2699 + }, + { + "epoch": 0.6263774504117852, + "grad_norm": 23.27528134870079, + "learning_rate": 2e-06, + "loss": 0.3116, + "step": 2700 + }, + { + "epoch": 0.6266094420600858, + "grad_norm": 21.682972803443782, + "learning_rate": 2e-06, + "loss": 0.3774, + "step": 2701 + }, + { + "epoch": 0.6268414337083865, + "grad_norm": 11.274293799473982, + "learning_rate": 2e-06, + "loss": 0.2252, + "step": 2702 + }, + { + "epoch": 0.6270734253566872, + "grad_norm": 14.373437442014854, + "learning_rate": 2e-06, + "loss": 0.2802, + "step": 2703 + }, + { + "epoch": 0.6273054170049879, + "grad_norm": 27.98556682726606, + "learning_rate": 2e-06, + "loss": 0.3984, + "step": 2704 + }, + { + "epoch": 0.6275374086532884, + "grad_norm": 8.125770832169447, + "learning_rate": 2e-06, + "loss": 0.2483, + "step": 2705 + }, + { + "epoch": 0.6277694003015891, + "grad_norm": 14.280645487072892, + "learning_rate": 2e-06, + "loss": 0.3523, + "step": 2706 + }, + { + "epoch": 0.6280013919498898, + "grad_norm": 13.604607657906875, + "learning_rate": 2e-06, + "loss": 0.2502, + "step": 2707 + }, + { + "epoch": 0.6282333835981905, + "grad_norm": 15.215810695283421, + "learning_rate": 2e-06, + "loss": 0.2886, + "step": 2708 + }, + { + "epoch": 0.6284653752464912, + "grad_norm": 10.526388538487529, + "learning_rate": 2e-06, + "loss": 0.2921, + "step": 2709 + }, + { + "epoch": 0.6286973668947918, + "grad_norm": 11.578951665205539, + "learning_rate": 2e-06, + "loss": 0.2166, + "step": 2710 + }, + { + "epoch": 0.6289293585430924, + "grad_norm": 26.76025334672509, + "learning_rate": 2e-06, + "loss": 0.4258, + "step": 2711 + }, + { + "epoch": 0.6291613501913931, + "grad_norm": 5.960315113925062, + "learning_rate": 2e-06, + "loss": 0.203, + "step": 2712 + }, + { + "epoch": 0.6293933418396938, + "grad_norm": 19.098755755319814, + "learning_rate": 2e-06, + "loss": 0.2265, + "step": 2713 + }, + { + "epoch": 0.6296253334879944, + "grad_norm": 11.279047664736554, + "learning_rate": 2e-06, + "loss": 0.2733, + "step": 2714 + }, + { + "epoch": 0.6298573251362951, + "grad_norm": 11.042710355062926, + "learning_rate": 2e-06, + "loss": 0.2478, + "step": 2715 + }, + { + "epoch": 0.6300893167845958, + "grad_norm": 21.186114628597807, + "learning_rate": 2e-06, + "loss": 0.3966, + "step": 2716 + }, + { + "epoch": 0.6303213084328965, + "grad_norm": 12.535985711797327, + "learning_rate": 2e-06, + "loss": 0.3958, + "step": 2717 + }, + { + "epoch": 0.630553300081197, + "grad_norm": 9.076805066853897, + "learning_rate": 2e-06, + "loss": 0.2573, + "step": 2718 + }, + { + "epoch": 0.6307852917294977, + "grad_norm": 19.664137316963803, + "learning_rate": 2e-06, + "loss": 0.2859, + "step": 2719 + }, + { + "epoch": 0.6310172833777984, + "grad_norm": 20.192281362626595, + "learning_rate": 2e-06, + "loss": 0.3911, + "step": 2720 + }, + { + "epoch": 0.6312492750260991, + "grad_norm": 9.34365143008957, + "learning_rate": 2e-06, + "loss": 0.2553, + "step": 2721 + }, + { + "epoch": 0.6314812666743997, + "grad_norm": 24.48111614902375, + "learning_rate": 2e-06, + "loss": 0.4094, + "step": 2722 + }, + { + "epoch": 0.6317132583227004, + "grad_norm": 9.707702530939931, + "learning_rate": 2e-06, + "loss": 0.2276, + "step": 2723 + }, + { + "epoch": 0.631945249971001, + "grad_norm": 10.989348115827445, + "learning_rate": 2e-06, + "loss": 0.3009, + "step": 2724 + }, + { + "epoch": 0.6321772416193017, + "grad_norm": 11.048989765456342, + "learning_rate": 2e-06, + "loss": 0.2741, + "step": 2725 + }, + { + "epoch": 0.6324092332676023, + "grad_norm": 14.44663365989049, + "learning_rate": 2e-06, + "loss": 0.3041, + "step": 2726 + }, + { + "epoch": 0.632641224915903, + "grad_norm": 20.803849251140264, + "learning_rate": 2e-06, + "loss": 0.335, + "step": 2727 + }, + { + "epoch": 0.6328732165642037, + "grad_norm": 37.679247145652994, + "learning_rate": 2e-06, + "loss": 0.415, + "step": 2728 + }, + { + "epoch": 0.6331052082125044, + "grad_norm": 20.184213761179738, + "learning_rate": 2e-06, + "loss": 0.3163, + "step": 2729 + }, + { + "epoch": 0.6333371998608051, + "grad_norm": 11.551312967773589, + "learning_rate": 2e-06, + "loss": 0.3722, + "step": 2730 + }, + { + "epoch": 0.6335691915091056, + "grad_norm": 12.706340951891182, + "learning_rate": 2e-06, + "loss": 0.262, + "step": 2731 + }, + { + "epoch": 0.6338011831574063, + "grad_norm": 7.8792482539519835, + "learning_rate": 2e-06, + "loss": 0.2088, + "step": 2732 + }, + { + "epoch": 0.634033174805707, + "grad_norm": 20.92190277740192, + "learning_rate": 2e-06, + "loss": 0.4063, + "step": 2733 + }, + { + "epoch": 0.6342651664540077, + "grad_norm": 23.1379632657036, + "learning_rate": 2e-06, + "loss": 0.3856, + "step": 2734 + }, + { + "epoch": 0.6344971581023083, + "grad_norm": 10.469166151059296, + "learning_rate": 2e-06, + "loss": 0.3344, + "step": 2735 + }, + { + "epoch": 0.634729149750609, + "grad_norm": 7.87302230405102, + "learning_rate": 2e-06, + "loss": 0.2252, + "step": 2736 + }, + { + "epoch": 0.6349611413989097, + "grad_norm": 10.936525755895664, + "learning_rate": 2e-06, + "loss": 0.2936, + "step": 2737 + }, + { + "epoch": 0.6351931330472103, + "grad_norm": 10.375866159081577, + "learning_rate": 2e-06, + "loss": 0.2014, + "step": 2738 + }, + { + "epoch": 0.6354251246955109, + "grad_norm": 19.10009840905328, + "learning_rate": 2e-06, + "loss": 0.2933, + "step": 2739 + }, + { + "epoch": 0.6356571163438116, + "grad_norm": 10.686041317032329, + "learning_rate": 2e-06, + "loss": 0.1862, + "step": 2740 + }, + { + "epoch": 0.6358891079921123, + "grad_norm": 8.286894335356871, + "learning_rate": 2e-06, + "loss": 0.2741, + "step": 2741 + }, + { + "epoch": 0.636121099640413, + "grad_norm": 14.205426749807465, + "learning_rate": 2e-06, + "loss": 0.2385, + "step": 2742 + }, + { + "epoch": 0.6363530912887136, + "grad_norm": 17.240692772281445, + "learning_rate": 2e-06, + "loss": 0.2993, + "step": 2743 + }, + { + "epoch": 0.6365850829370142, + "grad_norm": 11.042937107655254, + "learning_rate": 2e-06, + "loss": 0.2914, + "step": 2744 + }, + { + "epoch": 0.6368170745853149, + "grad_norm": 11.676379857964525, + "learning_rate": 2e-06, + "loss": 0.2883, + "step": 2745 + }, + { + "epoch": 0.6370490662336156, + "grad_norm": 14.066457991386905, + "learning_rate": 2e-06, + "loss": 0.3843, + "step": 2746 + }, + { + "epoch": 0.6372810578819162, + "grad_norm": 9.493809790836558, + "learning_rate": 2e-06, + "loss": 0.2383, + "step": 2747 + }, + { + "epoch": 0.6375130495302169, + "grad_norm": 18.954488884963048, + "learning_rate": 2e-06, + "loss": 0.3952, + "step": 2748 + }, + { + "epoch": 0.6377450411785176, + "grad_norm": 10.592996727713146, + "learning_rate": 2e-06, + "loss": 0.2398, + "step": 2749 + }, + { + "epoch": 0.6379770328268183, + "grad_norm": 12.26499262139558, + "learning_rate": 2e-06, + "loss": 0.2301, + "step": 2750 + }, + { + "epoch": 0.6382090244751188, + "grad_norm": 16.25474051381953, + "learning_rate": 2e-06, + "loss": 0.27, + "step": 2751 + }, + { + "epoch": 0.6384410161234195, + "grad_norm": 16.255107607009478, + "learning_rate": 2e-06, + "loss": 0.218, + "step": 2752 + }, + { + "epoch": 0.6386730077717202, + "grad_norm": 14.197750750330924, + "learning_rate": 2e-06, + "loss": 0.3227, + "step": 2753 + }, + { + "epoch": 0.6389049994200209, + "grad_norm": 10.311882448604072, + "learning_rate": 2e-06, + "loss": 0.1804, + "step": 2754 + }, + { + "epoch": 0.6391369910683216, + "grad_norm": 19.917667184339937, + "learning_rate": 2e-06, + "loss": 0.2387, + "step": 2755 + }, + { + "epoch": 0.6393689827166222, + "grad_norm": 23.591561126533815, + "learning_rate": 2e-06, + "loss": 0.2956, + "step": 2756 + }, + { + "epoch": 0.6396009743649229, + "grad_norm": 6.848883359950225, + "learning_rate": 2e-06, + "loss": 0.1989, + "step": 2757 + }, + { + "epoch": 0.6398329660132235, + "grad_norm": 30.575828724428405, + "learning_rate": 2e-06, + "loss": 0.447, + "step": 2758 + }, + { + "epoch": 0.6400649576615242, + "grad_norm": 22.80247466822758, + "learning_rate": 2e-06, + "loss": 0.3561, + "step": 2759 + }, + { + "epoch": 0.6402969493098248, + "grad_norm": 16.36344492473047, + "learning_rate": 2e-06, + "loss": 0.231, + "step": 2760 + }, + { + "epoch": 0.6405289409581255, + "grad_norm": 12.587730209883201, + "learning_rate": 2e-06, + "loss": 0.3164, + "step": 2761 + }, + { + "epoch": 0.6407609326064262, + "grad_norm": 10.992985260887487, + "learning_rate": 2e-06, + "loss": 0.3116, + "step": 2762 + }, + { + "epoch": 0.6409929242547269, + "grad_norm": 21.5416745374942, + "learning_rate": 2e-06, + "loss": 0.363, + "step": 2763 + }, + { + "epoch": 0.6412249159030274, + "grad_norm": 10.748047530819289, + "learning_rate": 2e-06, + "loss": 0.2674, + "step": 2764 + }, + { + "epoch": 0.6414569075513281, + "grad_norm": 27.600547677388484, + "learning_rate": 2e-06, + "loss": 0.4441, + "step": 2765 + }, + { + "epoch": 0.6416888991996288, + "grad_norm": 6.163204912631635, + "learning_rate": 2e-06, + "loss": 0.1463, + "step": 2766 + }, + { + "epoch": 0.6419208908479295, + "grad_norm": 21.152287218285384, + "learning_rate": 2e-06, + "loss": 0.3676, + "step": 2767 + }, + { + "epoch": 0.6421528824962301, + "grad_norm": 14.472285140393597, + "learning_rate": 2e-06, + "loss": 0.2761, + "step": 2768 + }, + { + "epoch": 0.6423848741445308, + "grad_norm": 18.049548403993875, + "learning_rate": 2e-06, + "loss": 0.2679, + "step": 2769 + }, + { + "epoch": 0.6426168657928315, + "grad_norm": 18.11985935626689, + "learning_rate": 2e-06, + "loss": 0.3918, + "step": 2770 + }, + { + "epoch": 0.6428488574411321, + "grad_norm": 26.11802174798973, + "learning_rate": 2e-06, + "loss": 0.3145, + "step": 2771 + }, + { + "epoch": 0.6430808490894327, + "grad_norm": 26.13363087877274, + "learning_rate": 2e-06, + "loss": 0.407, + "step": 2772 + }, + { + "epoch": 0.6433128407377334, + "grad_norm": 12.383254109783655, + "learning_rate": 2e-06, + "loss": 0.2272, + "step": 2773 + }, + { + "epoch": 0.6435448323860341, + "grad_norm": 14.884215874916432, + "learning_rate": 2e-06, + "loss": 0.2583, + "step": 2774 + }, + { + "epoch": 0.6437768240343348, + "grad_norm": 18.511544627989096, + "learning_rate": 2e-06, + "loss": 0.2846, + "step": 2775 + }, + { + "epoch": 0.6440088156826355, + "grad_norm": 8.920952494029857, + "learning_rate": 2e-06, + "loss": 0.2197, + "step": 2776 + }, + { + "epoch": 0.6442408073309361, + "grad_norm": 16.965335181403468, + "learning_rate": 2e-06, + "loss": 0.213, + "step": 2777 + }, + { + "epoch": 0.6444727989792367, + "grad_norm": 17.37240735011543, + "learning_rate": 2e-06, + "loss": 0.2977, + "step": 2778 + }, + { + "epoch": 0.6447047906275374, + "grad_norm": 13.824478272806385, + "learning_rate": 2e-06, + "loss": 0.2514, + "step": 2779 + }, + { + "epoch": 0.6449367822758381, + "grad_norm": 12.1812911628817, + "learning_rate": 2e-06, + "loss": 0.2624, + "step": 2780 + }, + { + "epoch": 0.6451687739241387, + "grad_norm": 11.874202270954608, + "learning_rate": 2e-06, + "loss": 0.3416, + "step": 2781 + }, + { + "epoch": 0.6454007655724394, + "grad_norm": 15.154437415027664, + "learning_rate": 2e-06, + "loss": 0.192, + "step": 2782 + }, + { + "epoch": 0.6456327572207401, + "grad_norm": 20.87825469178961, + "learning_rate": 2e-06, + "loss": 0.3041, + "step": 2783 + }, + { + "epoch": 0.6458647488690407, + "grad_norm": 19.52264631822083, + "learning_rate": 2e-06, + "loss": 0.3867, + "step": 2784 + }, + { + "epoch": 0.6460967405173413, + "grad_norm": 9.487212028916371, + "learning_rate": 2e-06, + "loss": 0.2096, + "step": 2785 + }, + { + "epoch": 0.646328732165642, + "grad_norm": 9.733036060848567, + "learning_rate": 2e-06, + "loss": 0.2419, + "step": 2786 + }, + { + "epoch": 0.6465607238139427, + "grad_norm": 14.859036415557348, + "learning_rate": 2e-06, + "loss": 0.2464, + "step": 2787 + }, + { + "epoch": 0.6467927154622434, + "grad_norm": 8.02322631614885, + "learning_rate": 2e-06, + "loss": 0.1726, + "step": 2788 + }, + { + "epoch": 0.647024707110544, + "grad_norm": 16.839870153267437, + "learning_rate": 2e-06, + "loss": 0.2765, + "step": 2789 + }, + { + "epoch": 0.6472566987588447, + "grad_norm": 12.64538932340377, + "learning_rate": 2e-06, + "loss": 0.233, + "step": 2790 + }, + { + "epoch": 0.6474886904071453, + "grad_norm": 8.404400295247754, + "learning_rate": 2e-06, + "loss": 0.2357, + "step": 2791 + }, + { + "epoch": 0.647720682055446, + "grad_norm": 26.048227186813318, + "learning_rate": 2e-06, + "loss": 0.3968, + "step": 2792 + }, + { + "epoch": 0.6479526737037467, + "grad_norm": 21.542478177411446, + "learning_rate": 2e-06, + "loss": 0.3382, + "step": 2793 + }, + { + "epoch": 0.6481846653520473, + "grad_norm": 11.151029842131523, + "learning_rate": 2e-06, + "loss": 0.3172, + "step": 2794 + }, + { + "epoch": 0.648416657000348, + "grad_norm": 11.659646165299003, + "learning_rate": 2e-06, + "loss": 0.2767, + "step": 2795 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 10.997059886039864, + "learning_rate": 2e-06, + "loss": 0.2589, + "step": 2796 + }, + { + "epoch": 0.6488806402969494, + "grad_norm": 18.666221337499135, + "learning_rate": 2e-06, + "loss": 0.387, + "step": 2797 + }, + { + "epoch": 0.6491126319452499, + "grad_norm": 7.395206819823027, + "learning_rate": 2e-06, + "loss": 0.2303, + "step": 2798 + }, + { + "epoch": 0.6493446235935506, + "grad_norm": 11.972980545484056, + "learning_rate": 2e-06, + "loss": 0.2554, + "step": 2799 + }, + { + "epoch": 0.6495766152418513, + "grad_norm": 18.15435582309048, + "learning_rate": 2e-06, + "loss": 0.2819, + "step": 2800 + }, + { + "epoch": 0.649808606890152, + "grad_norm": 14.607467175131468, + "learning_rate": 2e-06, + "loss": 0.3821, + "step": 2801 + }, + { + "epoch": 0.6500405985384526, + "grad_norm": 15.679649143266406, + "learning_rate": 2e-06, + "loss": 0.2644, + "step": 2802 + }, + { + "epoch": 0.6502725901867533, + "grad_norm": 14.006099163322526, + "learning_rate": 2e-06, + "loss": 0.4045, + "step": 2803 + }, + { + "epoch": 0.6505045818350539, + "grad_norm": 18.225064796926716, + "learning_rate": 2e-06, + "loss": 0.286, + "step": 2804 + }, + { + "epoch": 0.6507365734833546, + "grad_norm": 11.06047957203517, + "learning_rate": 2e-06, + "loss": 0.3596, + "step": 2805 + }, + { + "epoch": 0.6509685651316552, + "grad_norm": 16.095596929410195, + "learning_rate": 2e-06, + "loss": 0.255, + "step": 2806 + }, + { + "epoch": 0.6512005567799559, + "grad_norm": 27.95225192803792, + "learning_rate": 2e-06, + "loss": 0.2987, + "step": 2807 + }, + { + "epoch": 0.6514325484282566, + "grad_norm": 19.666700272868894, + "learning_rate": 2e-06, + "loss": 0.3157, + "step": 2808 + }, + { + "epoch": 0.6516645400765573, + "grad_norm": 18.21122294525583, + "learning_rate": 2e-06, + "loss": 0.2825, + "step": 2809 + }, + { + "epoch": 0.651896531724858, + "grad_norm": 8.459447025413976, + "learning_rate": 2e-06, + "loss": 0.208, + "step": 2810 + }, + { + "epoch": 0.6521285233731585, + "grad_norm": 15.6268310899083, + "learning_rate": 2e-06, + "loss": 0.3198, + "step": 2811 + }, + { + "epoch": 0.6523605150214592, + "grad_norm": 28.45664865289904, + "learning_rate": 2e-06, + "loss": 0.4413, + "step": 2812 + }, + { + "epoch": 0.6525925066697599, + "grad_norm": 9.57936608488626, + "learning_rate": 2e-06, + "loss": 0.1847, + "step": 2813 + }, + { + "epoch": 0.6528244983180606, + "grad_norm": 27.51762187086346, + "learning_rate": 2e-06, + "loss": 0.3609, + "step": 2814 + }, + { + "epoch": 0.6530564899663612, + "grad_norm": 10.913821665408513, + "learning_rate": 2e-06, + "loss": 0.3048, + "step": 2815 + }, + { + "epoch": 0.6532884816146619, + "grad_norm": 12.152772036034538, + "learning_rate": 2e-06, + "loss": 0.2951, + "step": 2816 + }, + { + "epoch": 0.6535204732629626, + "grad_norm": 16.723214077755184, + "learning_rate": 2e-06, + "loss": 0.3622, + "step": 2817 + }, + { + "epoch": 0.6537524649112632, + "grad_norm": 16.129821665506423, + "learning_rate": 2e-06, + "loss": 0.3142, + "step": 2818 + }, + { + "epoch": 0.6539844565595638, + "grad_norm": 20.693612400230375, + "learning_rate": 2e-06, + "loss": 0.3765, + "step": 2819 + }, + { + "epoch": 0.6542164482078645, + "grad_norm": 12.490854876318215, + "learning_rate": 2e-06, + "loss": 0.3044, + "step": 2820 + }, + { + "epoch": 0.6544484398561652, + "grad_norm": 14.731393132025028, + "learning_rate": 2e-06, + "loss": 0.3037, + "step": 2821 + }, + { + "epoch": 0.6546804315044659, + "grad_norm": 18.17261097086871, + "learning_rate": 2e-06, + "loss": 0.3278, + "step": 2822 + }, + { + "epoch": 0.6549124231527665, + "grad_norm": 11.349964778383697, + "learning_rate": 2e-06, + "loss": 0.2254, + "step": 2823 + }, + { + "epoch": 0.6551444148010671, + "grad_norm": 11.71263807129237, + "learning_rate": 2e-06, + "loss": 0.2592, + "step": 2824 + }, + { + "epoch": 0.6553764064493678, + "grad_norm": 18.08318611215625, + "learning_rate": 2e-06, + "loss": 0.2325, + "step": 2825 + }, + { + "epoch": 0.6556083980976685, + "grad_norm": 8.138291970407442, + "learning_rate": 2e-06, + "loss": 0.2515, + "step": 2826 + }, + { + "epoch": 0.6558403897459691, + "grad_norm": 19.53165707849859, + "learning_rate": 2e-06, + "loss": 0.4315, + "step": 2827 + }, + { + "epoch": 0.6560723813942698, + "grad_norm": 12.143351381067472, + "learning_rate": 2e-06, + "loss": 0.2126, + "step": 2828 + }, + { + "epoch": 0.6563043730425705, + "grad_norm": 7.720914695591568, + "learning_rate": 2e-06, + "loss": 0.2538, + "step": 2829 + }, + { + "epoch": 0.6565363646908712, + "grad_norm": 21.317140640865798, + "learning_rate": 2e-06, + "loss": 0.2426, + "step": 2830 + }, + { + "epoch": 0.6567683563391717, + "grad_norm": 10.113067962324731, + "learning_rate": 2e-06, + "loss": 0.2468, + "step": 2831 + }, + { + "epoch": 0.6570003479874724, + "grad_norm": 14.102113177194823, + "learning_rate": 2e-06, + "loss": 0.3143, + "step": 2832 + }, + { + "epoch": 0.6572323396357731, + "grad_norm": 13.987038194596758, + "learning_rate": 2e-06, + "loss": 0.2907, + "step": 2833 + }, + { + "epoch": 0.6574643312840738, + "grad_norm": 6.112736996816027, + "learning_rate": 2e-06, + "loss": 0.2774, + "step": 2834 + }, + { + "epoch": 0.6576963229323745, + "grad_norm": 10.449699832658547, + "learning_rate": 2e-06, + "loss": 0.2477, + "step": 2835 + }, + { + "epoch": 0.6579283145806751, + "grad_norm": 11.472365495587425, + "learning_rate": 2e-06, + "loss": 0.1979, + "step": 2836 + }, + { + "epoch": 0.6581603062289758, + "grad_norm": 19.22641768300949, + "learning_rate": 2e-06, + "loss": 0.4411, + "step": 2837 + }, + { + "epoch": 0.6583922978772764, + "grad_norm": 14.055238753422904, + "learning_rate": 2e-06, + "loss": 0.2558, + "step": 2838 + }, + { + "epoch": 0.6586242895255771, + "grad_norm": 10.984608588095922, + "learning_rate": 2e-06, + "loss": 0.1894, + "step": 2839 + }, + { + "epoch": 0.6588562811738777, + "grad_norm": 10.434615531979285, + "learning_rate": 2e-06, + "loss": 0.3205, + "step": 2840 + }, + { + "epoch": 0.6590882728221784, + "grad_norm": 14.644889870932198, + "learning_rate": 2e-06, + "loss": 0.2957, + "step": 2841 + }, + { + "epoch": 0.6593202644704791, + "grad_norm": 14.774480239943642, + "learning_rate": 2e-06, + "loss": 0.302, + "step": 2842 + }, + { + "epoch": 0.6595522561187798, + "grad_norm": 14.71393716092559, + "learning_rate": 2e-06, + "loss": 0.3118, + "step": 2843 + }, + { + "epoch": 0.6597842477670803, + "grad_norm": 11.242429895186634, + "learning_rate": 2e-06, + "loss": 0.2235, + "step": 2844 + }, + { + "epoch": 0.660016239415381, + "grad_norm": 17.995923113107896, + "learning_rate": 2e-06, + "loss": 0.2863, + "step": 2845 + }, + { + "epoch": 0.6602482310636817, + "grad_norm": 17.729162434604135, + "learning_rate": 2e-06, + "loss": 0.3424, + "step": 2846 + }, + { + "epoch": 0.6604802227119824, + "grad_norm": 20.243422288385347, + "learning_rate": 2e-06, + "loss": 0.3789, + "step": 2847 + }, + { + "epoch": 0.660712214360283, + "grad_norm": 18.64732313406746, + "learning_rate": 2e-06, + "loss": 0.387, + "step": 2848 + }, + { + "epoch": 0.6609442060085837, + "grad_norm": 20.458465132834842, + "learning_rate": 2e-06, + "loss": 0.3071, + "step": 2849 + }, + { + "epoch": 0.6611761976568844, + "grad_norm": 11.585734721403155, + "learning_rate": 2e-06, + "loss": 0.2617, + "step": 2850 + }, + { + "epoch": 0.661408189305185, + "grad_norm": 13.461835104531968, + "learning_rate": 2e-06, + "loss": 0.335, + "step": 2851 + }, + { + "epoch": 0.6616401809534856, + "grad_norm": 15.673380657407227, + "learning_rate": 2e-06, + "loss": 0.2673, + "step": 2852 + }, + { + "epoch": 0.6618721726017863, + "grad_norm": 9.921530213879748, + "learning_rate": 2e-06, + "loss": 0.2912, + "step": 2853 + }, + { + "epoch": 0.662104164250087, + "grad_norm": 14.238714967293898, + "learning_rate": 2e-06, + "loss": 0.3132, + "step": 2854 + }, + { + "epoch": 0.6623361558983877, + "grad_norm": 10.974948622348556, + "learning_rate": 2e-06, + "loss": 0.3454, + "step": 2855 + }, + { + "epoch": 0.6625681475466884, + "grad_norm": 14.885775876370817, + "learning_rate": 2e-06, + "loss": 0.2967, + "step": 2856 + }, + { + "epoch": 0.662800139194989, + "grad_norm": 10.72842736132594, + "learning_rate": 2e-06, + "loss": 0.2827, + "step": 2857 + }, + { + "epoch": 0.6630321308432896, + "grad_norm": 14.921435245690917, + "learning_rate": 2e-06, + "loss": 0.2179, + "step": 2858 + }, + { + "epoch": 0.6632641224915903, + "grad_norm": 7.573869874381404, + "learning_rate": 2e-06, + "loss": 0.2186, + "step": 2859 + }, + { + "epoch": 0.663496114139891, + "grad_norm": 20.056549707412877, + "learning_rate": 2e-06, + "loss": 0.2833, + "step": 2860 + }, + { + "epoch": 0.6637281057881916, + "grad_norm": 14.888681037702106, + "learning_rate": 2e-06, + "loss": 0.2875, + "step": 2861 + }, + { + "epoch": 0.6639600974364923, + "grad_norm": 17.421706372364103, + "learning_rate": 2e-06, + "loss": 0.4869, + "step": 2862 + }, + { + "epoch": 0.664192089084793, + "grad_norm": 13.282211399520385, + "learning_rate": 2e-06, + "loss": 0.3432, + "step": 2863 + }, + { + "epoch": 0.6644240807330936, + "grad_norm": 24.679787184942278, + "learning_rate": 2e-06, + "loss": 0.3197, + "step": 2864 + }, + { + "epoch": 0.6646560723813942, + "grad_norm": 19.27889281747182, + "learning_rate": 2e-06, + "loss": 0.3208, + "step": 2865 + }, + { + "epoch": 0.6648880640296949, + "grad_norm": 13.070453246451402, + "learning_rate": 2e-06, + "loss": 0.2411, + "step": 2866 + }, + { + "epoch": 0.6651200556779956, + "grad_norm": 25.468659736645378, + "learning_rate": 2e-06, + "loss": 0.2217, + "step": 2867 + }, + { + "epoch": 0.6653520473262963, + "grad_norm": 13.516564164744764, + "learning_rate": 2e-06, + "loss": 0.2786, + "step": 2868 + }, + { + "epoch": 0.665584038974597, + "grad_norm": 46.16793049596266, + "learning_rate": 2e-06, + "loss": 0.2839, + "step": 2869 + }, + { + "epoch": 0.6658160306228976, + "grad_norm": 15.8472120552767, + "learning_rate": 2e-06, + "loss": 0.2784, + "step": 2870 + }, + { + "epoch": 0.6660480222711982, + "grad_norm": 9.499928057186223, + "learning_rate": 2e-06, + "loss": 0.2202, + "step": 2871 + }, + { + "epoch": 0.6662800139194989, + "grad_norm": 15.961784571679711, + "learning_rate": 2e-06, + "loss": 0.3505, + "step": 2872 + }, + { + "epoch": 0.6665120055677995, + "grad_norm": 14.863888557389888, + "learning_rate": 2e-06, + "loss": 0.2517, + "step": 2873 + }, + { + "epoch": 0.6667439972161002, + "grad_norm": 12.114822736748357, + "learning_rate": 2e-06, + "loss": 0.266, + "step": 2874 + }, + { + "epoch": 0.6669759888644009, + "grad_norm": 18.791475979494606, + "learning_rate": 2e-06, + "loss": 0.2813, + "step": 2875 + }, + { + "epoch": 0.6672079805127016, + "grad_norm": 13.241630128223022, + "learning_rate": 2e-06, + "loss": 0.1811, + "step": 2876 + }, + { + "epoch": 0.6674399721610021, + "grad_norm": 12.684844598060202, + "learning_rate": 2e-06, + "loss": 0.358, + "step": 2877 + }, + { + "epoch": 0.6676719638093028, + "grad_norm": 27.649226549284123, + "learning_rate": 2e-06, + "loss": 0.3875, + "step": 2878 + }, + { + "epoch": 0.6679039554576035, + "grad_norm": 11.450368103403596, + "learning_rate": 2e-06, + "loss": 0.2175, + "step": 2879 + }, + { + "epoch": 0.6681359471059042, + "grad_norm": 15.313099377924349, + "learning_rate": 2e-06, + "loss": 0.3333, + "step": 2880 + }, + { + "epoch": 0.6683679387542049, + "grad_norm": 17.75810877085882, + "learning_rate": 2e-06, + "loss": 0.2723, + "step": 2881 + }, + { + "epoch": 0.6685999304025055, + "grad_norm": 28.161703152994132, + "learning_rate": 2e-06, + "loss": 0.3317, + "step": 2882 + }, + { + "epoch": 0.6688319220508062, + "grad_norm": 9.307605731845863, + "learning_rate": 2e-06, + "loss": 0.2734, + "step": 2883 + }, + { + "epoch": 0.6690639136991068, + "grad_norm": 18.674716530336838, + "learning_rate": 2e-06, + "loss": 0.3547, + "step": 2884 + }, + { + "epoch": 0.6692959053474075, + "grad_norm": 17.731430790847497, + "learning_rate": 2e-06, + "loss": 0.2739, + "step": 2885 + }, + { + "epoch": 0.6695278969957081, + "grad_norm": 15.38332735147208, + "learning_rate": 2e-06, + "loss": 0.3064, + "step": 2886 + }, + { + "epoch": 0.6697598886440088, + "grad_norm": 14.647155898537573, + "learning_rate": 2e-06, + "loss": 0.1938, + "step": 2887 + }, + { + "epoch": 0.6699918802923095, + "grad_norm": 21.958953694103528, + "learning_rate": 2e-06, + "loss": 0.2576, + "step": 2888 + }, + { + "epoch": 0.6702238719406102, + "grad_norm": 19.13698687348818, + "learning_rate": 2e-06, + "loss": 0.2704, + "step": 2889 + }, + { + "epoch": 0.6704558635889108, + "grad_norm": 16.02891183410766, + "learning_rate": 2e-06, + "loss": 0.3191, + "step": 2890 + }, + { + "epoch": 0.6706878552372114, + "grad_norm": 11.89398814623729, + "learning_rate": 2e-06, + "loss": 0.2183, + "step": 2891 + }, + { + "epoch": 0.6709198468855121, + "grad_norm": 27.27437455145165, + "learning_rate": 2e-06, + "loss": 0.3317, + "step": 2892 + }, + { + "epoch": 0.6711518385338128, + "grad_norm": 17.494392318806085, + "learning_rate": 2e-06, + "loss": 0.2895, + "step": 2893 + }, + { + "epoch": 0.6713838301821135, + "grad_norm": 8.532745967737572, + "learning_rate": 2e-06, + "loss": 0.3024, + "step": 2894 + }, + { + "epoch": 0.6716158218304141, + "grad_norm": 15.784287286887945, + "learning_rate": 2e-06, + "loss": 0.3662, + "step": 2895 + }, + { + "epoch": 0.6718478134787148, + "grad_norm": 19.885911995754892, + "learning_rate": 2e-06, + "loss": 0.2912, + "step": 2896 + }, + { + "epoch": 0.6720798051270154, + "grad_norm": 11.502699311807573, + "learning_rate": 2e-06, + "loss": 0.3606, + "step": 2897 + }, + { + "epoch": 0.672311796775316, + "grad_norm": 18.332904766435426, + "learning_rate": 2e-06, + "loss": 0.2782, + "step": 2898 + }, + { + "epoch": 0.6725437884236167, + "grad_norm": 11.192278874423675, + "learning_rate": 2e-06, + "loss": 0.2895, + "step": 2899 + }, + { + "epoch": 0.6727757800719174, + "grad_norm": 10.25836407488176, + "learning_rate": 2e-06, + "loss": 0.2621, + "step": 2900 + }, + { + "epoch": 0.6730077717202181, + "grad_norm": 15.680650605945743, + "learning_rate": 2e-06, + "loss": 0.3704, + "step": 2901 + }, + { + "epoch": 0.6732397633685188, + "grad_norm": 24.10371429779154, + "learning_rate": 2e-06, + "loss": 0.3392, + "step": 2902 + }, + { + "epoch": 0.6734717550168194, + "grad_norm": 8.479237937207667, + "learning_rate": 2e-06, + "loss": 0.198, + "step": 2903 + }, + { + "epoch": 0.67370374666512, + "grad_norm": 14.174717355055936, + "learning_rate": 2e-06, + "loss": 0.3281, + "step": 2904 + }, + { + "epoch": 0.6739357383134207, + "grad_norm": 14.082257018873104, + "learning_rate": 2e-06, + "loss": 0.2516, + "step": 2905 + }, + { + "epoch": 0.6741677299617214, + "grad_norm": 13.382312420174081, + "learning_rate": 2e-06, + "loss": 0.295, + "step": 2906 + }, + { + "epoch": 0.674399721610022, + "grad_norm": 24.6062414245324, + "learning_rate": 2e-06, + "loss": 0.3857, + "step": 2907 + }, + { + "epoch": 0.6746317132583227, + "grad_norm": 27.594212344103287, + "learning_rate": 2e-06, + "loss": 0.3976, + "step": 2908 + }, + { + "epoch": 0.6748637049066234, + "grad_norm": 22.15561527386861, + "learning_rate": 2e-06, + "loss": 0.4427, + "step": 2909 + }, + { + "epoch": 0.6750956965549241, + "grad_norm": 9.280812151914011, + "learning_rate": 2e-06, + "loss": 0.2409, + "step": 2910 + }, + { + "epoch": 0.6753276882032246, + "grad_norm": 14.739297859742182, + "learning_rate": 2e-06, + "loss": 0.2851, + "step": 2911 + }, + { + "epoch": 0.6755596798515253, + "grad_norm": 17.43973773723807, + "learning_rate": 2e-06, + "loss": 0.2358, + "step": 2912 + }, + { + "epoch": 0.675791671499826, + "grad_norm": 21.123065539604433, + "learning_rate": 2e-06, + "loss": 0.2731, + "step": 2913 + }, + { + "epoch": 0.6760236631481267, + "grad_norm": 18.18026849345217, + "learning_rate": 2e-06, + "loss": 0.2914, + "step": 2914 + }, + { + "epoch": 0.6762556547964274, + "grad_norm": 22.75544500204419, + "learning_rate": 2e-06, + "loss": 0.3933, + "step": 2915 + }, + { + "epoch": 0.676487646444728, + "grad_norm": 19.37742185783371, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 2916 + }, + { + "epoch": 0.6767196380930286, + "grad_norm": 16.69819067505101, + "learning_rate": 2e-06, + "loss": 0.2691, + "step": 2917 + }, + { + "epoch": 0.6769516297413293, + "grad_norm": 16.50659193036346, + "learning_rate": 2e-06, + "loss": 0.2146, + "step": 2918 + }, + { + "epoch": 0.67718362138963, + "grad_norm": 7.484957257065774, + "learning_rate": 2e-06, + "loss": 0.2779, + "step": 2919 + }, + { + "epoch": 0.6774156130379306, + "grad_norm": 15.034466055515287, + "learning_rate": 2e-06, + "loss": 0.2166, + "step": 2920 + }, + { + "epoch": 0.6776476046862313, + "grad_norm": 16.417816190692054, + "learning_rate": 2e-06, + "loss": 0.359, + "step": 2921 + }, + { + "epoch": 0.677879596334532, + "grad_norm": 22.66480068571077, + "learning_rate": 2e-06, + "loss": 0.2557, + "step": 2922 + }, + { + "epoch": 0.6781115879828327, + "grad_norm": 9.93097539810531, + "learning_rate": 2e-06, + "loss": 0.2954, + "step": 2923 + }, + { + "epoch": 0.6783435796311332, + "grad_norm": 15.193296735844845, + "learning_rate": 2e-06, + "loss": 0.2506, + "step": 2924 + }, + { + "epoch": 0.6785755712794339, + "grad_norm": 17.52679971654726, + "learning_rate": 2e-06, + "loss": 0.3571, + "step": 2925 + }, + { + "epoch": 0.6788075629277346, + "grad_norm": 7.445586192344902, + "learning_rate": 2e-06, + "loss": 0.1966, + "step": 2926 + }, + { + "epoch": 0.6790395545760353, + "grad_norm": 8.416210239673962, + "learning_rate": 2e-06, + "loss": 0.2133, + "step": 2927 + }, + { + "epoch": 0.6792715462243359, + "grad_norm": 14.313635294079827, + "learning_rate": 2e-06, + "loss": 0.3515, + "step": 2928 + }, + { + "epoch": 0.6795035378726366, + "grad_norm": 24.603948921934705, + "learning_rate": 2e-06, + "loss": 0.3901, + "step": 2929 + }, + { + "epoch": 0.6797355295209373, + "grad_norm": 15.833755249534436, + "learning_rate": 2e-06, + "loss": 0.3374, + "step": 2930 + }, + { + "epoch": 0.6799675211692379, + "grad_norm": 16.04070634218248, + "learning_rate": 2e-06, + "loss": 0.3736, + "step": 2931 + }, + { + "epoch": 0.6801995128175385, + "grad_norm": 20.682720901172534, + "learning_rate": 2e-06, + "loss": 0.3394, + "step": 2932 + }, + { + "epoch": 0.6804315044658392, + "grad_norm": 21.697405182973313, + "learning_rate": 2e-06, + "loss": 0.3276, + "step": 2933 + }, + { + "epoch": 0.6806634961141399, + "grad_norm": 18.865819341538803, + "learning_rate": 2e-06, + "loss": 0.3264, + "step": 2934 + }, + { + "epoch": 0.6808954877624406, + "grad_norm": 10.423027096808157, + "learning_rate": 2e-06, + "loss": 0.3037, + "step": 2935 + }, + { + "epoch": 0.6811274794107413, + "grad_norm": 21.677979146013957, + "learning_rate": 2e-06, + "loss": 0.367, + "step": 2936 + }, + { + "epoch": 0.6813594710590418, + "grad_norm": 26.4302768785951, + "learning_rate": 2e-06, + "loss": 0.3841, + "step": 2937 + }, + { + "epoch": 0.6815914627073425, + "grad_norm": 10.069866494626753, + "learning_rate": 2e-06, + "loss": 0.2875, + "step": 2938 + }, + { + "epoch": 0.6818234543556432, + "grad_norm": 9.805489612875316, + "learning_rate": 2e-06, + "loss": 0.1803, + "step": 2939 + }, + { + "epoch": 0.6820554460039439, + "grad_norm": 13.082756265772101, + "learning_rate": 2e-06, + "loss": 0.2856, + "step": 2940 + }, + { + "epoch": 0.6822874376522445, + "grad_norm": 21.214294775844788, + "learning_rate": 2e-06, + "loss": 0.3278, + "step": 2941 + }, + { + "epoch": 0.6825194293005452, + "grad_norm": 18.98402642369575, + "learning_rate": 2e-06, + "loss": 0.3717, + "step": 2942 + }, + { + "epoch": 0.6827514209488459, + "grad_norm": 18.28594400987843, + "learning_rate": 2e-06, + "loss": 0.2966, + "step": 2943 + }, + { + "epoch": 0.6829834125971465, + "grad_norm": 15.955264733597344, + "learning_rate": 2e-06, + "loss": 0.2649, + "step": 2944 + }, + { + "epoch": 0.6832154042454471, + "grad_norm": 12.143853436389726, + "learning_rate": 2e-06, + "loss": 0.3043, + "step": 2945 + }, + { + "epoch": 0.6834473958937478, + "grad_norm": 33.10313809595552, + "learning_rate": 2e-06, + "loss": 0.4111, + "step": 2946 + }, + { + "epoch": 0.6836793875420485, + "grad_norm": 15.49237897542982, + "learning_rate": 2e-06, + "loss": 0.3257, + "step": 2947 + }, + { + "epoch": 0.6839113791903492, + "grad_norm": 15.234371131142845, + "learning_rate": 2e-06, + "loss": 0.2847, + "step": 2948 + }, + { + "epoch": 0.6841433708386498, + "grad_norm": 14.394745150033899, + "learning_rate": 2e-06, + "loss": 0.2325, + "step": 2949 + }, + { + "epoch": 0.6843753624869505, + "grad_norm": 7.697439348929282, + "learning_rate": 2e-06, + "loss": 0.2083, + "step": 2950 + }, + { + "epoch": 0.6846073541352511, + "grad_norm": 8.347270088398039, + "learning_rate": 2e-06, + "loss": 0.1849, + "step": 2951 + }, + { + "epoch": 0.6848393457835518, + "grad_norm": 8.528135185478353, + "learning_rate": 2e-06, + "loss": 0.2895, + "step": 2952 + }, + { + "epoch": 0.6850713374318524, + "grad_norm": 11.440085523850653, + "learning_rate": 2e-06, + "loss": 0.3013, + "step": 2953 + }, + { + "epoch": 0.6853033290801531, + "grad_norm": 14.240300923564252, + "learning_rate": 2e-06, + "loss": 0.2868, + "step": 2954 + }, + { + "epoch": 0.6855353207284538, + "grad_norm": 15.298243190826206, + "learning_rate": 2e-06, + "loss": 0.2526, + "step": 2955 + }, + { + "epoch": 0.6857673123767545, + "grad_norm": 16.287029088480868, + "learning_rate": 2e-06, + "loss": 0.2477, + "step": 2956 + }, + { + "epoch": 0.685999304025055, + "grad_norm": 16.330072651646685, + "learning_rate": 2e-06, + "loss": 0.2338, + "step": 2957 + }, + { + "epoch": 0.6862312956733557, + "grad_norm": 10.144289299404209, + "learning_rate": 2e-06, + "loss": 0.2366, + "step": 2958 + }, + { + "epoch": 0.6864632873216564, + "grad_norm": 15.970485036072468, + "learning_rate": 2e-06, + "loss": 0.2597, + "step": 2959 + }, + { + "epoch": 0.6866952789699571, + "grad_norm": 20.059300974805478, + "learning_rate": 2e-06, + "loss": 0.2755, + "step": 2960 + }, + { + "epoch": 0.6869272706182578, + "grad_norm": 13.123594011226873, + "learning_rate": 2e-06, + "loss": 0.2808, + "step": 2961 + }, + { + "epoch": 0.6871592622665584, + "grad_norm": 11.967794532068869, + "learning_rate": 2e-06, + "loss": 0.2214, + "step": 2962 + }, + { + "epoch": 0.6873912539148591, + "grad_norm": 13.279811333305938, + "learning_rate": 2e-06, + "loss": 0.2894, + "step": 2963 + }, + { + "epoch": 0.6876232455631597, + "grad_norm": 15.978601053972245, + "learning_rate": 2e-06, + "loss": 0.3233, + "step": 2964 + }, + { + "epoch": 0.6878552372114604, + "grad_norm": 19.482813281061436, + "learning_rate": 2e-06, + "loss": 0.3722, + "step": 2965 + }, + { + "epoch": 0.688087228859761, + "grad_norm": 16.022926928632646, + "learning_rate": 2e-06, + "loss": 0.4637, + "step": 2966 + }, + { + "epoch": 0.6883192205080617, + "grad_norm": 25.90689576178664, + "learning_rate": 2e-06, + "loss": 0.3036, + "step": 2967 + }, + { + "epoch": 0.6885512121563624, + "grad_norm": 15.81243335582368, + "learning_rate": 2e-06, + "loss": 0.257, + "step": 2968 + }, + { + "epoch": 0.6887832038046631, + "grad_norm": 18.661608358707515, + "learning_rate": 2e-06, + "loss": 0.3841, + "step": 2969 + }, + { + "epoch": 0.6890151954529637, + "grad_norm": 20.789753229496526, + "learning_rate": 2e-06, + "loss": 0.3484, + "step": 2970 + }, + { + "epoch": 0.6892471871012643, + "grad_norm": 11.855384758128736, + "learning_rate": 2e-06, + "loss": 0.2881, + "step": 2971 + }, + { + "epoch": 0.689479178749565, + "grad_norm": 17.159193927611167, + "learning_rate": 2e-06, + "loss": 0.2717, + "step": 2972 + }, + { + "epoch": 0.6897111703978657, + "grad_norm": 13.992237879814896, + "learning_rate": 2e-06, + "loss": 0.4286, + "step": 2973 + }, + { + "epoch": 0.6899431620461663, + "grad_norm": 24.35252878929027, + "learning_rate": 2e-06, + "loss": 0.3378, + "step": 2974 + }, + { + "epoch": 0.690175153694467, + "grad_norm": 16.71016085700087, + "learning_rate": 2e-06, + "loss": 0.2689, + "step": 2975 + }, + { + "epoch": 0.6904071453427677, + "grad_norm": 9.784394283410494, + "learning_rate": 2e-06, + "loss": 0.2627, + "step": 2976 + }, + { + "epoch": 0.6906391369910683, + "grad_norm": 8.624407375319688, + "learning_rate": 2e-06, + "loss": 0.2334, + "step": 2977 + }, + { + "epoch": 0.690871128639369, + "grad_norm": 6.348981967459156, + "learning_rate": 2e-06, + "loss": 0.2202, + "step": 2978 + }, + { + "epoch": 0.6911031202876696, + "grad_norm": 19.035352735373152, + "learning_rate": 2e-06, + "loss": 0.3631, + "step": 2979 + }, + { + "epoch": 0.6913351119359703, + "grad_norm": 7.198738788917573, + "learning_rate": 2e-06, + "loss": 0.2179, + "step": 2980 + }, + { + "epoch": 0.691567103584271, + "grad_norm": 18.931518041058506, + "learning_rate": 2e-06, + "loss": 0.3189, + "step": 2981 + }, + { + "epoch": 0.6917990952325717, + "grad_norm": 23.18489676907909, + "learning_rate": 2e-06, + "loss": 0.2713, + "step": 2982 + }, + { + "epoch": 0.6920310868808723, + "grad_norm": 15.316381457085425, + "learning_rate": 2e-06, + "loss": 0.3127, + "step": 2983 + }, + { + "epoch": 0.6922630785291729, + "grad_norm": 11.503904532399071, + "learning_rate": 2e-06, + "loss": 0.2112, + "step": 2984 + }, + { + "epoch": 0.6924950701774736, + "grad_norm": 23.105103132210573, + "learning_rate": 2e-06, + "loss": 0.35, + "step": 2985 + }, + { + "epoch": 0.6927270618257743, + "grad_norm": 8.934461565490222, + "learning_rate": 2e-06, + "loss": 0.1694, + "step": 2986 + }, + { + "epoch": 0.6929590534740749, + "grad_norm": 15.460634718345247, + "learning_rate": 2e-06, + "loss": 0.327, + "step": 2987 + }, + { + "epoch": 0.6931910451223756, + "grad_norm": 10.533426557156625, + "learning_rate": 2e-06, + "loss": 0.2937, + "step": 2988 + }, + { + "epoch": 0.6934230367706763, + "grad_norm": 12.54384265691961, + "learning_rate": 2e-06, + "loss": 0.2716, + "step": 2989 + }, + { + "epoch": 0.693655028418977, + "grad_norm": 17.7973153093463, + "learning_rate": 2e-06, + "loss": 0.2359, + "step": 2990 + }, + { + "epoch": 0.6938870200672775, + "grad_norm": 7.855860871585307, + "learning_rate": 2e-06, + "loss": 0.214, + "step": 2991 + }, + { + "epoch": 0.6941190117155782, + "grad_norm": 23.44117120120443, + "learning_rate": 2e-06, + "loss": 0.3826, + "step": 2992 + }, + { + "epoch": 0.6943510033638789, + "grad_norm": 21.587237505595454, + "learning_rate": 2e-06, + "loss": 0.3726, + "step": 2993 + }, + { + "epoch": 0.6945829950121796, + "grad_norm": 8.966649034172018, + "learning_rate": 2e-06, + "loss": 0.3061, + "step": 2994 + }, + { + "epoch": 0.6948149866604802, + "grad_norm": 10.394999748177568, + "learning_rate": 2e-06, + "loss": 0.2836, + "step": 2995 + }, + { + "epoch": 0.6950469783087809, + "grad_norm": 16.36252544489369, + "learning_rate": 2e-06, + "loss": 0.2719, + "step": 2996 + }, + { + "epoch": 0.6952789699570815, + "grad_norm": 15.73753046229693, + "learning_rate": 2e-06, + "loss": 0.2106, + "step": 2997 + }, + { + "epoch": 0.6955109616053822, + "grad_norm": 10.482261401826955, + "learning_rate": 2e-06, + "loss": 0.2225, + "step": 2998 + }, + { + "epoch": 0.6957429532536828, + "grad_norm": 16.162687708016666, + "learning_rate": 2e-06, + "loss": 0.3063, + "step": 2999 + }, + { + "epoch": 0.6959749449019835, + "grad_norm": 10.163603053519628, + "learning_rate": 2e-06, + "loss": 0.2532, + "step": 3000 + }, + { + "epoch": 0.6962069365502842, + "grad_norm": 13.518454150870886, + "learning_rate": 2e-06, + "loss": 0.3016, + "step": 3001 + }, + { + "epoch": 0.6964389281985849, + "grad_norm": 18.165892745018137, + "learning_rate": 2e-06, + "loss": 0.3698, + "step": 3002 + }, + { + "epoch": 0.6966709198468856, + "grad_norm": 17.90105884966254, + "learning_rate": 2e-06, + "loss": 0.3063, + "step": 3003 + }, + { + "epoch": 0.6969029114951861, + "grad_norm": 12.788468740614176, + "learning_rate": 2e-06, + "loss": 0.3079, + "step": 3004 + }, + { + "epoch": 0.6971349031434868, + "grad_norm": 15.957661767789833, + "learning_rate": 2e-06, + "loss": 0.2346, + "step": 3005 + }, + { + "epoch": 0.6973668947917875, + "grad_norm": 18.210358640908733, + "learning_rate": 2e-06, + "loss": 0.2909, + "step": 3006 + }, + { + "epoch": 0.6975988864400882, + "grad_norm": 13.050628004493788, + "learning_rate": 2e-06, + "loss": 0.3312, + "step": 3007 + }, + { + "epoch": 0.6978308780883888, + "grad_norm": 13.177363294059413, + "learning_rate": 2e-06, + "loss": 0.2819, + "step": 3008 + }, + { + "epoch": 0.6980628697366895, + "grad_norm": 8.645162144450342, + "learning_rate": 2e-06, + "loss": 0.2241, + "step": 3009 + }, + { + "epoch": 0.6982948613849902, + "grad_norm": 12.456599894548935, + "learning_rate": 2e-06, + "loss": 0.2404, + "step": 3010 + }, + { + "epoch": 0.6985268530332908, + "grad_norm": 16.152022904207286, + "learning_rate": 2e-06, + "loss": 0.2267, + "step": 3011 + }, + { + "epoch": 0.6987588446815914, + "grad_norm": 15.475853041190083, + "learning_rate": 2e-06, + "loss": 0.2555, + "step": 3012 + }, + { + "epoch": 0.6989908363298921, + "grad_norm": 17.594340371945172, + "learning_rate": 2e-06, + "loss": 0.3743, + "step": 3013 + }, + { + "epoch": 0.6992228279781928, + "grad_norm": 13.891850067454452, + "learning_rate": 2e-06, + "loss": 0.333, + "step": 3014 + }, + { + "epoch": 0.6994548196264935, + "grad_norm": 9.746005875818053, + "learning_rate": 2e-06, + "loss": 0.2134, + "step": 3015 + }, + { + "epoch": 0.6996868112747942, + "grad_norm": 13.990318327075137, + "learning_rate": 2e-06, + "loss": 0.3343, + "step": 3016 + }, + { + "epoch": 0.6999188029230947, + "grad_norm": 23.025914502524685, + "learning_rate": 2e-06, + "loss": 0.4144, + "step": 3017 + }, + { + "epoch": 0.7001507945713954, + "grad_norm": 22.778332248585592, + "learning_rate": 2e-06, + "loss": 0.1918, + "step": 3018 + }, + { + "epoch": 0.7003827862196961, + "grad_norm": 15.430801616526118, + "learning_rate": 2e-06, + "loss": 0.3679, + "step": 3019 + }, + { + "epoch": 0.7006147778679968, + "grad_norm": 16.207361770057013, + "learning_rate": 2e-06, + "loss": 0.2704, + "step": 3020 + }, + { + "epoch": 0.7008467695162974, + "grad_norm": 16.574744397317918, + "learning_rate": 2e-06, + "loss": 0.3118, + "step": 3021 + }, + { + "epoch": 0.7010787611645981, + "grad_norm": 9.771455601212924, + "learning_rate": 2e-06, + "loss": 0.2042, + "step": 3022 + }, + { + "epoch": 0.7013107528128988, + "grad_norm": 16.352935531849965, + "learning_rate": 2e-06, + "loss": 0.3103, + "step": 3023 + }, + { + "epoch": 0.7015427444611994, + "grad_norm": 16.012828911522448, + "learning_rate": 2e-06, + "loss": 0.3218, + "step": 3024 + }, + { + "epoch": 0.7017747361095, + "grad_norm": 31.336384254208152, + "learning_rate": 2e-06, + "loss": 0.3786, + "step": 3025 + }, + { + "epoch": 0.7020067277578007, + "grad_norm": 11.076684492126956, + "learning_rate": 2e-06, + "loss": 0.2038, + "step": 3026 + }, + { + "epoch": 0.7022387194061014, + "grad_norm": 19.254274788008026, + "learning_rate": 2e-06, + "loss": 0.3597, + "step": 3027 + }, + { + "epoch": 0.7024707110544021, + "grad_norm": 22.845025573712128, + "learning_rate": 2e-06, + "loss": 0.3286, + "step": 3028 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 10.265590830213393, + "learning_rate": 2e-06, + "loss": 0.2768, + "step": 3029 + }, + { + "epoch": 0.7029346943510033, + "grad_norm": 13.635127225963343, + "learning_rate": 2e-06, + "loss": 0.2435, + "step": 3030 + }, + { + "epoch": 0.703166685999304, + "grad_norm": 21.43039852701912, + "learning_rate": 2e-06, + "loss": 0.3317, + "step": 3031 + }, + { + "epoch": 0.7033986776476047, + "grad_norm": 20.249225557418523, + "learning_rate": 2e-06, + "loss": 0.3132, + "step": 3032 + }, + { + "epoch": 0.7036306692959053, + "grad_norm": 10.437849106321048, + "learning_rate": 2e-06, + "loss": 0.2831, + "step": 3033 + }, + { + "epoch": 0.703862660944206, + "grad_norm": 14.921942445808748, + "learning_rate": 2e-06, + "loss": 0.3055, + "step": 3034 + }, + { + "epoch": 0.7040946525925067, + "grad_norm": 13.427974898300857, + "learning_rate": 2e-06, + "loss": 0.2818, + "step": 3035 + }, + { + "epoch": 0.7043266442408074, + "grad_norm": 18.31794009982437, + "learning_rate": 2e-06, + "loss": 0.3405, + "step": 3036 + }, + { + "epoch": 0.7045586358891079, + "grad_norm": 31.05526528803719, + "learning_rate": 2e-06, + "loss": 0.5122, + "step": 3037 + }, + { + "epoch": 0.7047906275374086, + "grad_norm": 12.254876345659827, + "learning_rate": 2e-06, + "loss": 0.3137, + "step": 3038 + }, + { + "epoch": 0.7050226191857093, + "grad_norm": 15.370854458054378, + "learning_rate": 2e-06, + "loss": 0.3237, + "step": 3039 + }, + { + "epoch": 0.70525461083401, + "grad_norm": 18.637823728977246, + "learning_rate": 2e-06, + "loss": 0.2775, + "step": 3040 + }, + { + "epoch": 0.7054866024823107, + "grad_norm": 12.873597948913517, + "learning_rate": 2e-06, + "loss": 0.3299, + "step": 3041 + }, + { + "epoch": 0.7057185941306113, + "grad_norm": 17.18921170254753, + "learning_rate": 2e-06, + "loss": 0.3152, + "step": 3042 + }, + { + "epoch": 0.705950585778912, + "grad_norm": 15.409492736211421, + "learning_rate": 2e-06, + "loss": 0.2607, + "step": 3043 + }, + { + "epoch": 0.7061825774272126, + "grad_norm": 9.975829678915247, + "learning_rate": 2e-06, + "loss": 0.3616, + "step": 3044 + }, + { + "epoch": 0.7064145690755133, + "grad_norm": 15.207969039612548, + "learning_rate": 2e-06, + "loss": 0.3872, + "step": 3045 + }, + { + "epoch": 0.7066465607238139, + "grad_norm": 12.70874179865131, + "learning_rate": 2e-06, + "loss": 0.2803, + "step": 3046 + }, + { + "epoch": 0.7068785523721146, + "grad_norm": 10.777788004616392, + "learning_rate": 2e-06, + "loss": 0.3013, + "step": 3047 + }, + { + "epoch": 0.7071105440204153, + "grad_norm": 9.71067971397703, + "learning_rate": 2e-06, + "loss": 0.231, + "step": 3048 + }, + { + "epoch": 0.707342535668716, + "grad_norm": 16.182320295977373, + "learning_rate": 2e-06, + "loss": 0.2541, + "step": 3049 + }, + { + "epoch": 0.7075745273170165, + "grad_norm": 14.24194784140107, + "learning_rate": 2e-06, + "loss": 0.3133, + "step": 3050 + }, + { + "epoch": 0.7078065189653172, + "grad_norm": 8.116297025441758, + "learning_rate": 2e-06, + "loss": 0.2765, + "step": 3051 + }, + { + "epoch": 0.7080385106136179, + "grad_norm": 15.408449590028416, + "learning_rate": 2e-06, + "loss": 0.3464, + "step": 3052 + }, + { + "epoch": 0.7082705022619186, + "grad_norm": 10.126091809211045, + "learning_rate": 2e-06, + "loss": 0.2563, + "step": 3053 + }, + { + "epoch": 0.7085024939102192, + "grad_norm": 15.76931845699468, + "learning_rate": 2e-06, + "loss": 0.3641, + "step": 3054 + }, + { + "epoch": 0.7087344855585199, + "grad_norm": 14.3181566502542, + "learning_rate": 2e-06, + "loss": 0.2404, + "step": 3055 + }, + { + "epoch": 0.7089664772068206, + "grad_norm": 11.518059207296266, + "learning_rate": 2e-06, + "loss": 0.3092, + "step": 3056 + }, + { + "epoch": 0.7091984688551212, + "grad_norm": 11.075548644625707, + "learning_rate": 2e-06, + "loss": 0.2368, + "step": 3057 + }, + { + "epoch": 0.7094304605034218, + "grad_norm": 8.931630979263996, + "learning_rate": 2e-06, + "loss": 0.2094, + "step": 3058 + }, + { + "epoch": 0.7096624521517225, + "grad_norm": 10.015903169009626, + "learning_rate": 2e-06, + "loss": 0.3026, + "step": 3059 + }, + { + "epoch": 0.7098944438000232, + "grad_norm": 9.001640545817724, + "learning_rate": 2e-06, + "loss": 0.2358, + "step": 3060 + }, + { + "epoch": 0.7101264354483239, + "grad_norm": 9.884731171167504, + "learning_rate": 2e-06, + "loss": 0.2066, + "step": 3061 + }, + { + "epoch": 0.7103584270966246, + "grad_norm": 8.79796649865471, + "learning_rate": 2e-06, + "loss": 0.303, + "step": 3062 + }, + { + "epoch": 0.7105904187449252, + "grad_norm": 12.035283919663504, + "learning_rate": 2e-06, + "loss": 0.3186, + "step": 3063 + }, + { + "epoch": 0.7108224103932258, + "grad_norm": 5.610850717951971, + "learning_rate": 2e-06, + "loss": 0.1643, + "step": 3064 + }, + { + "epoch": 0.7110544020415265, + "grad_norm": 20.315813498268504, + "learning_rate": 2e-06, + "loss": 0.2786, + "step": 3065 + }, + { + "epoch": 0.7112863936898272, + "grad_norm": 15.024208154539762, + "learning_rate": 2e-06, + "loss": 0.358, + "step": 3066 + }, + { + "epoch": 0.7115183853381278, + "grad_norm": 18.763448531490877, + "learning_rate": 2e-06, + "loss": 0.3619, + "step": 3067 + }, + { + "epoch": 0.7117503769864285, + "grad_norm": 8.850772694364398, + "learning_rate": 2e-06, + "loss": 0.3079, + "step": 3068 + }, + { + "epoch": 0.7119823686347292, + "grad_norm": 12.558505567041202, + "learning_rate": 2e-06, + "loss": 0.2539, + "step": 3069 + }, + { + "epoch": 0.7122143602830298, + "grad_norm": 9.413887213263452, + "learning_rate": 2e-06, + "loss": 0.2578, + "step": 3070 + }, + { + "epoch": 0.7124463519313304, + "grad_norm": 8.268247015403908, + "learning_rate": 2e-06, + "loss": 0.3173, + "step": 3071 + }, + { + "epoch": 0.7126783435796311, + "grad_norm": 8.897679021367162, + "learning_rate": 2e-06, + "loss": 0.2035, + "step": 3072 + }, + { + "epoch": 0.7129103352279318, + "grad_norm": 6.774172208125166, + "learning_rate": 2e-06, + "loss": 0.2567, + "step": 3073 + }, + { + "epoch": 0.7131423268762325, + "grad_norm": 15.784809665803815, + "learning_rate": 2e-06, + "loss": 0.3427, + "step": 3074 + }, + { + "epoch": 0.7133743185245331, + "grad_norm": 9.490956131277786, + "learning_rate": 2e-06, + "loss": 0.305, + "step": 3075 + }, + { + "epoch": 0.7136063101728338, + "grad_norm": 12.768478698546604, + "learning_rate": 2e-06, + "loss": 0.2892, + "step": 3076 + }, + { + "epoch": 0.7138383018211344, + "grad_norm": 14.733261060077966, + "learning_rate": 2e-06, + "loss": 0.2683, + "step": 3077 + }, + { + "epoch": 0.7140702934694351, + "grad_norm": 16.361146756468674, + "learning_rate": 2e-06, + "loss": 0.2763, + "step": 3078 + }, + { + "epoch": 0.7143022851177357, + "grad_norm": 20.56834735004541, + "learning_rate": 2e-06, + "loss": 0.3263, + "step": 3079 + }, + { + "epoch": 0.7145342767660364, + "grad_norm": 19.139723749178586, + "learning_rate": 2e-06, + "loss": 0.337, + "step": 3080 + }, + { + "epoch": 0.7147662684143371, + "grad_norm": 6.061627605458326, + "learning_rate": 2e-06, + "loss": 0.1807, + "step": 3081 + }, + { + "epoch": 0.7149982600626378, + "grad_norm": 17.17408580688916, + "learning_rate": 2e-06, + "loss": 0.2639, + "step": 3082 + }, + { + "epoch": 0.7152302517109385, + "grad_norm": 21.46351281558989, + "learning_rate": 2e-06, + "loss": 0.3246, + "step": 3083 + }, + { + "epoch": 0.715462243359239, + "grad_norm": 15.413561020002579, + "learning_rate": 2e-06, + "loss": 0.2781, + "step": 3084 + }, + { + "epoch": 0.7156942350075397, + "grad_norm": 17.441447535031255, + "learning_rate": 2e-06, + "loss": 0.345, + "step": 3085 + }, + { + "epoch": 0.7159262266558404, + "grad_norm": 14.122717322632662, + "learning_rate": 2e-06, + "loss": 0.2305, + "step": 3086 + }, + { + "epoch": 0.7161582183041411, + "grad_norm": 11.784198991709928, + "learning_rate": 2e-06, + "loss": 0.31, + "step": 3087 + }, + { + "epoch": 0.7163902099524417, + "grad_norm": 16.435399537046283, + "learning_rate": 2e-06, + "loss": 0.2253, + "step": 3088 + }, + { + "epoch": 0.7166222016007424, + "grad_norm": 20.087543297417806, + "learning_rate": 2e-06, + "loss": 0.3557, + "step": 3089 + }, + { + "epoch": 0.716854193249043, + "grad_norm": 9.136332789225563, + "learning_rate": 2e-06, + "loss": 0.2596, + "step": 3090 + }, + { + "epoch": 0.7170861848973437, + "grad_norm": 17.3524058908081, + "learning_rate": 2e-06, + "loss": 0.2313, + "step": 3091 + }, + { + "epoch": 0.7173181765456443, + "grad_norm": 17.29438646410513, + "learning_rate": 2e-06, + "loss": 0.2101, + "step": 3092 + }, + { + "epoch": 0.717550168193945, + "grad_norm": 12.382294379053754, + "learning_rate": 2e-06, + "loss": 0.2556, + "step": 3093 + }, + { + "epoch": 0.7177821598422457, + "grad_norm": 13.213939787408052, + "learning_rate": 2e-06, + "loss": 0.2625, + "step": 3094 + }, + { + "epoch": 0.7180141514905464, + "grad_norm": 8.476230812288442, + "learning_rate": 2e-06, + "loss": 0.2527, + "step": 3095 + }, + { + "epoch": 0.718246143138847, + "grad_norm": 19.212254605560727, + "learning_rate": 2e-06, + "loss": 0.2904, + "step": 3096 + }, + { + "epoch": 0.7184781347871476, + "grad_norm": 10.343472922864036, + "learning_rate": 2e-06, + "loss": 0.3471, + "step": 3097 + }, + { + "epoch": 0.7187101264354483, + "grad_norm": 13.725070104455183, + "learning_rate": 2e-06, + "loss": 0.361, + "step": 3098 + }, + { + "epoch": 0.718942118083749, + "grad_norm": 16.651969325920845, + "learning_rate": 2e-06, + "loss": 0.3317, + "step": 3099 + }, + { + "epoch": 0.7191741097320496, + "grad_norm": 9.55939511575531, + "learning_rate": 2e-06, + "loss": 0.3012, + "step": 3100 + }, + { + "epoch": 0.7194061013803503, + "grad_norm": 15.3184086976544, + "learning_rate": 2e-06, + "loss": 0.2286, + "step": 3101 + }, + { + "epoch": 0.719638093028651, + "grad_norm": 12.566094977158508, + "learning_rate": 2e-06, + "loss": 0.2944, + "step": 3102 + }, + { + "epoch": 0.7198700846769517, + "grad_norm": 6.97831787840303, + "learning_rate": 2e-06, + "loss": 0.1917, + "step": 3103 + }, + { + "epoch": 0.7201020763252522, + "grad_norm": 16.899575716149464, + "learning_rate": 2e-06, + "loss": 0.2448, + "step": 3104 + }, + { + "epoch": 0.7203340679735529, + "grad_norm": 17.976110532970768, + "learning_rate": 2e-06, + "loss": 0.3163, + "step": 3105 + }, + { + "epoch": 0.7205660596218536, + "grad_norm": 18.232864963298365, + "learning_rate": 2e-06, + "loss": 0.4656, + "step": 3106 + }, + { + "epoch": 0.7207980512701543, + "grad_norm": 9.588755633507239, + "learning_rate": 2e-06, + "loss": 0.321, + "step": 3107 + }, + { + "epoch": 0.721030042918455, + "grad_norm": 9.184369414779091, + "learning_rate": 2e-06, + "loss": 0.185, + "step": 3108 + }, + { + "epoch": 0.7212620345667556, + "grad_norm": 8.18229606602893, + "learning_rate": 2e-06, + "loss": 0.2636, + "step": 3109 + }, + { + "epoch": 0.7214940262150562, + "grad_norm": 9.565619460509582, + "learning_rate": 2e-06, + "loss": 0.3021, + "step": 3110 + }, + { + "epoch": 0.7217260178633569, + "grad_norm": 8.042176091017213, + "learning_rate": 2e-06, + "loss": 0.1867, + "step": 3111 + }, + { + "epoch": 0.7219580095116576, + "grad_norm": 13.345881386328118, + "learning_rate": 2e-06, + "loss": 0.2084, + "step": 3112 + }, + { + "epoch": 0.7221900011599582, + "grad_norm": 15.26049198288833, + "learning_rate": 2e-06, + "loss": 0.3182, + "step": 3113 + }, + { + "epoch": 0.7224219928082589, + "grad_norm": 12.405214439820826, + "learning_rate": 2e-06, + "loss": 0.3215, + "step": 3114 + }, + { + "epoch": 0.7226539844565596, + "grad_norm": 8.279027687701165, + "learning_rate": 2e-06, + "loss": 0.2029, + "step": 3115 + }, + { + "epoch": 0.7228859761048603, + "grad_norm": 9.23346549546795, + "learning_rate": 2e-06, + "loss": 0.2842, + "step": 3116 + }, + { + "epoch": 0.7231179677531608, + "grad_norm": 13.90968760840073, + "learning_rate": 2e-06, + "loss": 0.2085, + "step": 3117 + }, + { + "epoch": 0.7233499594014615, + "grad_norm": 13.869798608630939, + "learning_rate": 2e-06, + "loss": 0.2016, + "step": 3118 + }, + { + "epoch": 0.7235819510497622, + "grad_norm": 7.716027129845056, + "learning_rate": 2e-06, + "loss": 0.2188, + "step": 3119 + }, + { + "epoch": 0.7238139426980629, + "grad_norm": 10.440026382109156, + "learning_rate": 2e-06, + "loss": 0.2281, + "step": 3120 + }, + { + "epoch": 0.7240459343463636, + "grad_norm": 17.4379176105839, + "learning_rate": 2e-06, + "loss": 0.3783, + "step": 3121 + }, + { + "epoch": 0.7242779259946642, + "grad_norm": 17.53811782802477, + "learning_rate": 2e-06, + "loss": 0.2852, + "step": 3122 + }, + { + "epoch": 0.7245099176429649, + "grad_norm": 12.403149884465117, + "learning_rate": 2e-06, + "loss": 0.2562, + "step": 3123 + }, + { + "epoch": 0.7247419092912655, + "grad_norm": 7.189055185689566, + "learning_rate": 2e-06, + "loss": 0.2395, + "step": 3124 + }, + { + "epoch": 0.7249739009395662, + "grad_norm": 13.366602934084645, + "learning_rate": 2e-06, + "loss": 0.3757, + "step": 3125 + }, + { + "epoch": 0.7252058925878668, + "grad_norm": 20.723002926799648, + "learning_rate": 2e-06, + "loss": 0.3032, + "step": 3126 + }, + { + "epoch": 0.7254378842361675, + "grad_norm": 14.345531287053802, + "learning_rate": 2e-06, + "loss": 0.2848, + "step": 3127 + }, + { + "epoch": 0.7256698758844682, + "grad_norm": 15.277830898750224, + "learning_rate": 2e-06, + "loss": 0.2698, + "step": 3128 + }, + { + "epoch": 0.7259018675327689, + "grad_norm": 14.343649187551526, + "learning_rate": 2e-06, + "loss": 0.2137, + "step": 3129 + }, + { + "epoch": 0.7261338591810694, + "grad_norm": 11.927520687935251, + "learning_rate": 2e-06, + "loss": 0.2584, + "step": 3130 + }, + { + "epoch": 0.7263658508293701, + "grad_norm": 7.569851105062665, + "learning_rate": 2e-06, + "loss": 0.2569, + "step": 3131 + }, + { + "epoch": 0.7265978424776708, + "grad_norm": 15.411236027620829, + "learning_rate": 2e-06, + "loss": 0.2988, + "step": 3132 + }, + { + "epoch": 0.7268298341259715, + "grad_norm": 17.315685638615545, + "learning_rate": 2e-06, + "loss": 0.3518, + "step": 3133 + }, + { + "epoch": 0.7270618257742721, + "grad_norm": 12.187758686052286, + "learning_rate": 2e-06, + "loss": 0.2202, + "step": 3134 + }, + { + "epoch": 0.7272938174225728, + "grad_norm": 9.516509125547941, + "learning_rate": 2e-06, + "loss": 0.2373, + "step": 3135 + }, + { + "epoch": 0.7275258090708735, + "grad_norm": 13.633348693158915, + "learning_rate": 2e-06, + "loss": 0.244, + "step": 3136 + }, + { + "epoch": 0.7277578007191741, + "grad_norm": 18.169940194464743, + "learning_rate": 2e-06, + "loss": 0.295, + "step": 3137 + }, + { + "epoch": 0.7279897923674747, + "grad_norm": 16.600460469573456, + "learning_rate": 2e-06, + "loss": 0.236, + "step": 3138 + }, + { + "epoch": 0.7282217840157754, + "grad_norm": 13.243093105767455, + "learning_rate": 2e-06, + "loss": 0.2635, + "step": 3139 + }, + { + "epoch": 0.7284537756640761, + "grad_norm": 18.433591061729665, + "learning_rate": 2e-06, + "loss": 0.3301, + "step": 3140 + }, + { + "epoch": 0.7286857673123768, + "grad_norm": 9.66065462790196, + "learning_rate": 2e-06, + "loss": 0.2138, + "step": 3141 + }, + { + "epoch": 0.7289177589606775, + "grad_norm": 13.209312050697344, + "learning_rate": 2e-06, + "loss": 0.2658, + "step": 3142 + }, + { + "epoch": 0.7291497506089781, + "grad_norm": 19.082935008032695, + "learning_rate": 2e-06, + "loss": 0.291, + "step": 3143 + }, + { + "epoch": 0.7293817422572787, + "grad_norm": 7.89564746805161, + "learning_rate": 2e-06, + "loss": 0.223, + "step": 3144 + }, + { + "epoch": 0.7296137339055794, + "grad_norm": 16.304437940242224, + "learning_rate": 2e-06, + "loss": 0.2943, + "step": 3145 + }, + { + "epoch": 0.72984572555388, + "grad_norm": 22.612646825330938, + "learning_rate": 2e-06, + "loss": 0.421, + "step": 3146 + }, + { + "epoch": 0.7300777172021807, + "grad_norm": 10.784007437530533, + "learning_rate": 2e-06, + "loss": 0.2252, + "step": 3147 + }, + { + "epoch": 0.7303097088504814, + "grad_norm": 22.407086484125312, + "learning_rate": 2e-06, + "loss": 0.3782, + "step": 3148 + }, + { + "epoch": 0.7305417004987821, + "grad_norm": 22.865257080954436, + "learning_rate": 2e-06, + "loss": 0.3711, + "step": 3149 + }, + { + "epoch": 0.7307736921470827, + "grad_norm": 16.632205181733372, + "learning_rate": 2e-06, + "loss": 0.2656, + "step": 3150 + }, + { + "epoch": 0.7310056837953833, + "grad_norm": 12.773540189237295, + "learning_rate": 2e-06, + "loss": 0.2561, + "step": 3151 + }, + { + "epoch": 0.731237675443684, + "grad_norm": 19.910063057592886, + "learning_rate": 2e-06, + "loss": 0.3787, + "step": 3152 + }, + { + "epoch": 0.7314696670919847, + "grad_norm": 11.60355812504079, + "learning_rate": 2e-06, + "loss": 0.248, + "step": 3153 + }, + { + "epoch": 0.7317016587402854, + "grad_norm": 11.24794631426084, + "learning_rate": 2e-06, + "loss": 0.2551, + "step": 3154 + }, + { + "epoch": 0.731933650388586, + "grad_norm": 17.127338422807483, + "learning_rate": 2e-06, + "loss": 0.3598, + "step": 3155 + }, + { + "epoch": 0.7321656420368867, + "grad_norm": 21.97871303967765, + "learning_rate": 2e-06, + "loss": 0.4009, + "step": 3156 + }, + { + "epoch": 0.7323976336851873, + "grad_norm": 15.615835043718105, + "learning_rate": 2e-06, + "loss": 0.3075, + "step": 3157 + }, + { + "epoch": 0.732629625333488, + "grad_norm": 17.522596850256367, + "learning_rate": 2e-06, + "loss": 0.2784, + "step": 3158 + }, + { + "epoch": 0.7328616169817886, + "grad_norm": 18.739187983179765, + "learning_rate": 2e-06, + "loss": 0.3305, + "step": 3159 + }, + { + "epoch": 0.7330936086300893, + "grad_norm": 6.660399942455831, + "learning_rate": 2e-06, + "loss": 0.2451, + "step": 3160 + }, + { + "epoch": 0.73332560027839, + "grad_norm": 15.690373574466072, + "learning_rate": 2e-06, + "loss": 0.2543, + "step": 3161 + }, + { + "epoch": 0.7335575919266907, + "grad_norm": 10.188035897846122, + "learning_rate": 2e-06, + "loss": 0.2615, + "step": 3162 + }, + { + "epoch": 0.7337895835749914, + "grad_norm": 23.18149272189231, + "learning_rate": 2e-06, + "loss": 0.2897, + "step": 3163 + }, + { + "epoch": 0.7340215752232919, + "grad_norm": 14.122180502228332, + "learning_rate": 2e-06, + "loss": 0.2234, + "step": 3164 + }, + { + "epoch": 0.7342535668715926, + "grad_norm": 7.541936206004345, + "learning_rate": 2e-06, + "loss": 0.2265, + "step": 3165 + }, + { + "epoch": 0.7344855585198933, + "grad_norm": 12.774313254828565, + "learning_rate": 2e-06, + "loss": 0.2341, + "step": 3166 + }, + { + "epoch": 0.734717550168194, + "grad_norm": 17.641684894723078, + "learning_rate": 2e-06, + "loss": 0.2906, + "step": 3167 + }, + { + "epoch": 0.7349495418164946, + "grad_norm": 19.12104028237049, + "learning_rate": 2e-06, + "loss": 0.3829, + "step": 3168 + }, + { + "epoch": 0.7351815334647953, + "grad_norm": 22.529089396316817, + "learning_rate": 2e-06, + "loss": 0.2992, + "step": 3169 + }, + { + "epoch": 0.7354135251130959, + "grad_norm": 19.771390540790247, + "learning_rate": 2e-06, + "loss": 0.276, + "step": 3170 + }, + { + "epoch": 0.7356455167613966, + "grad_norm": 16.61134813442355, + "learning_rate": 2e-06, + "loss": 0.2635, + "step": 3171 + }, + { + "epoch": 0.7358775084096972, + "grad_norm": 18.759166483704387, + "learning_rate": 2e-06, + "loss": 0.3144, + "step": 3172 + }, + { + "epoch": 0.7361095000579979, + "grad_norm": 10.461120170280791, + "learning_rate": 2e-06, + "loss": 0.326, + "step": 3173 + }, + { + "epoch": 0.7363414917062986, + "grad_norm": 11.874710306707069, + "learning_rate": 2e-06, + "loss": 0.3141, + "step": 3174 + }, + { + "epoch": 0.7365734833545993, + "grad_norm": 24.942525916485767, + "learning_rate": 2e-06, + "loss": 0.2926, + "step": 3175 + }, + { + "epoch": 0.7368054750029, + "grad_norm": 15.934880410013454, + "learning_rate": 2e-06, + "loss": 0.3051, + "step": 3176 + }, + { + "epoch": 0.7370374666512005, + "grad_norm": 9.61878494372916, + "learning_rate": 2e-06, + "loss": 0.2273, + "step": 3177 + }, + { + "epoch": 0.7372694582995012, + "grad_norm": 17.32211608727347, + "learning_rate": 2e-06, + "loss": 0.2992, + "step": 3178 + }, + { + "epoch": 0.7375014499478019, + "grad_norm": 19.998321939059572, + "learning_rate": 2e-06, + "loss": 0.1923, + "step": 3179 + }, + { + "epoch": 0.7377334415961025, + "grad_norm": 10.703880883805544, + "learning_rate": 2e-06, + "loss": 0.2353, + "step": 3180 + }, + { + "epoch": 0.7379654332444032, + "grad_norm": 19.34032775586904, + "learning_rate": 2e-06, + "loss": 0.3466, + "step": 3181 + }, + { + "epoch": 0.7381974248927039, + "grad_norm": 9.371863932024695, + "learning_rate": 2e-06, + "loss": 0.2022, + "step": 3182 + }, + { + "epoch": 0.7384294165410045, + "grad_norm": 12.088369944261673, + "learning_rate": 2e-06, + "loss": 0.3356, + "step": 3183 + }, + { + "epoch": 0.7386614081893051, + "grad_norm": 13.208820592992916, + "learning_rate": 2e-06, + "loss": 0.3584, + "step": 3184 + }, + { + "epoch": 0.7388933998376058, + "grad_norm": 14.48019733506012, + "learning_rate": 2e-06, + "loss": 0.2866, + "step": 3185 + }, + { + "epoch": 0.7391253914859065, + "grad_norm": 9.297669852631856, + "learning_rate": 2e-06, + "loss": 0.285, + "step": 3186 + }, + { + "epoch": 0.7393573831342072, + "grad_norm": 13.59255319093113, + "learning_rate": 2e-06, + "loss": 0.2217, + "step": 3187 + }, + { + "epoch": 0.7395893747825079, + "grad_norm": 9.63516912508189, + "learning_rate": 2e-06, + "loss": 0.2913, + "step": 3188 + }, + { + "epoch": 0.7398213664308085, + "grad_norm": 12.207878815290089, + "learning_rate": 2e-06, + "loss": 0.1737, + "step": 3189 + }, + { + "epoch": 0.7400533580791091, + "grad_norm": 7.723048500107733, + "learning_rate": 2e-06, + "loss": 0.1719, + "step": 3190 + }, + { + "epoch": 0.7402853497274098, + "grad_norm": 15.372306937561344, + "learning_rate": 2e-06, + "loss": 0.3438, + "step": 3191 + }, + { + "epoch": 0.7405173413757105, + "grad_norm": 10.693950673258227, + "learning_rate": 2e-06, + "loss": 0.2514, + "step": 3192 + }, + { + "epoch": 0.7407493330240111, + "grad_norm": 14.196738573156889, + "learning_rate": 2e-06, + "loss": 0.3165, + "step": 3193 + }, + { + "epoch": 0.7409813246723118, + "grad_norm": 266.23074189192454, + "learning_rate": 2e-06, + "loss": 0.3074, + "step": 3194 + }, + { + "epoch": 0.7412133163206125, + "grad_norm": 11.658793115573301, + "learning_rate": 2e-06, + "loss": 0.2975, + "step": 3195 + }, + { + "epoch": 0.7414453079689132, + "grad_norm": 18.86350667547308, + "learning_rate": 2e-06, + "loss": 0.4451, + "step": 3196 + }, + { + "epoch": 0.7416772996172137, + "grad_norm": 10.675248689307919, + "learning_rate": 2e-06, + "loss": 0.2638, + "step": 3197 + }, + { + "epoch": 0.7419092912655144, + "grad_norm": 10.46535396830821, + "learning_rate": 2e-06, + "loss": 0.1983, + "step": 3198 + }, + { + "epoch": 0.7421412829138151, + "grad_norm": 14.436103478643547, + "learning_rate": 2e-06, + "loss": 0.2665, + "step": 3199 + }, + { + "epoch": 0.7423732745621158, + "grad_norm": 18.35508453929755, + "learning_rate": 2e-06, + "loss": 0.3326, + "step": 3200 + }, + { + "epoch": 0.7426052662104164, + "grad_norm": 13.92213201139075, + "learning_rate": 2e-06, + "loss": 0.3001, + "step": 3201 + }, + { + "epoch": 0.7428372578587171, + "grad_norm": 24.057651507458285, + "learning_rate": 2e-06, + "loss": 0.4314, + "step": 3202 + }, + { + "epoch": 0.7430692495070177, + "grad_norm": 13.864093476181436, + "learning_rate": 2e-06, + "loss": 0.2738, + "step": 3203 + }, + { + "epoch": 0.7433012411553184, + "grad_norm": 12.054702304883937, + "learning_rate": 2e-06, + "loss": 0.3038, + "step": 3204 + }, + { + "epoch": 0.743533232803619, + "grad_norm": 12.60991002298068, + "learning_rate": 2e-06, + "loss": 0.196, + "step": 3205 + }, + { + "epoch": 0.7437652244519197, + "grad_norm": 12.127707235247618, + "learning_rate": 2e-06, + "loss": 0.1946, + "step": 3206 + }, + { + "epoch": 0.7439972161002204, + "grad_norm": 23.30100209188892, + "learning_rate": 2e-06, + "loss": 0.273, + "step": 3207 + }, + { + "epoch": 0.7442292077485211, + "grad_norm": 13.99161322820209, + "learning_rate": 2e-06, + "loss": 0.2144, + "step": 3208 + }, + { + "epoch": 0.7444611993968218, + "grad_norm": 13.887700930199868, + "learning_rate": 2e-06, + "loss": 0.2683, + "step": 3209 + }, + { + "epoch": 0.7446931910451223, + "grad_norm": 26.819487750849106, + "learning_rate": 2e-06, + "loss": 0.4258, + "step": 3210 + }, + { + "epoch": 0.744925182693423, + "grad_norm": 9.733351998525075, + "learning_rate": 2e-06, + "loss": 0.2155, + "step": 3211 + }, + { + "epoch": 0.7451571743417237, + "grad_norm": 14.340214523192815, + "learning_rate": 2e-06, + "loss": 0.2755, + "step": 3212 + }, + { + "epoch": 0.7453891659900244, + "grad_norm": 11.54245757950112, + "learning_rate": 2e-06, + "loss": 0.3347, + "step": 3213 + }, + { + "epoch": 0.745621157638325, + "grad_norm": 23.06041982558424, + "learning_rate": 2e-06, + "loss": 0.3852, + "step": 3214 + }, + { + "epoch": 0.7458531492866257, + "grad_norm": 15.579761828443129, + "learning_rate": 2e-06, + "loss": 0.1818, + "step": 3215 + }, + { + "epoch": 0.7460851409349264, + "grad_norm": 27.921307898526475, + "learning_rate": 2e-06, + "loss": 0.3919, + "step": 3216 + }, + { + "epoch": 0.746317132583227, + "grad_norm": 10.451501045988298, + "learning_rate": 2e-06, + "loss": 0.2935, + "step": 3217 + }, + { + "epoch": 0.7465491242315276, + "grad_norm": 18.411670307096102, + "learning_rate": 2e-06, + "loss": 0.3829, + "step": 3218 + }, + { + "epoch": 0.7467811158798283, + "grad_norm": 17.096497754766254, + "learning_rate": 2e-06, + "loss": 0.2315, + "step": 3219 + }, + { + "epoch": 0.747013107528129, + "grad_norm": 19.05129410304447, + "learning_rate": 2e-06, + "loss": 0.3125, + "step": 3220 + }, + { + "epoch": 0.7472450991764297, + "grad_norm": 7.424645567567219, + "learning_rate": 2e-06, + "loss": 0.2718, + "step": 3221 + }, + { + "epoch": 0.7474770908247304, + "grad_norm": 11.603587562641335, + "learning_rate": 2e-06, + "loss": 0.2976, + "step": 3222 + }, + { + "epoch": 0.7477090824730309, + "grad_norm": 9.346124673462597, + "learning_rate": 2e-06, + "loss": 0.2669, + "step": 3223 + }, + { + "epoch": 0.7479410741213316, + "grad_norm": 13.97160979948989, + "learning_rate": 2e-06, + "loss": 0.2398, + "step": 3224 + }, + { + "epoch": 0.7481730657696323, + "grad_norm": 25.922000700573506, + "learning_rate": 2e-06, + "loss": 0.3631, + "step": 3225 + }, + { + "epoch": 0.748405057417933, + "grad_norm": 10.308629661553015, + "learning_rate": 2e-06, + "loss": 0.2849, + "step": 3226 + }, + { + "epoch": 0.7486370490662336, + "grad_norm": 16.028103541976417, + "learning_rate": 2e-06, + "loss": 0.2732, + "step": 3227 + }, + { + "epoch": 0.7488690407145343, + "grad_norm": 11.867317699477844, + "learning_rate": 2e-06, + "loss": 0.3506, + "step": 3228 + }, + { + "epoch": 0.749101032362835, + "grad_norm": 27.591524237265336, + "learning_rate": 2e-06, + "loss": 0.4672, + "step": 3229 + }, + { + "epoch": 0.7493330240111356, + "grad_norm": 16.75662670018917, + "learning_rate": 2e-06, + "loss": 0.3969, + "step": 3230 + }, + { + "epoch": 0.7495650156594362, + "grad_norm": 21.443112852719207, + "learning_rate": 2e-06, + "loss": 0.3879, + "step": 3231 + }, + { + "epoch": 0.7497970073077369, + "grad_norm": 18.57519074076872, + "learning_rate": 2e-06, + "loss": 0.3253, + "step": 3232 + }, + { + "epoch": 0.7500289989560376, + "grad_norm": 9.228484999772808, + "learning_rate": 2e-06, + "loss": 0.2678, + "step": 3233 + }, + { + "epoch": 0.7502609906043383, + "grad_norm": 20.858727268557686, + "learning_rate": 2e-06, + "loss": 0.2909, + "step": 3234 + }, + { + "epoch": 0.7504929822526389, + "grad_norm": 26.914560888617398, + "learning_rate": 2e-06, + "loss": 0.3719, + "step": 3235 + }, + { + "epoch": 0.7507249739009396, + "grad_norm": 15.709843888605175, + "learning_rate": 2e-06, + "loss": 0.2176, + "step": 3236 + }, + { + "epoch": 0.7509569655492402, + "grad_norm": 19.16131357159073, + "learning_rate": 2e-06, + "loss": 0.2419, + "step": 3237 + }, + { + "epoch": 0.7511889571975409, + "grad_norm": 9.892834755241338, + "learning_rate": 2e-06, + "loss": 0.2641, + "step": 3238 + }, + { + "epoch": 0.7514209488458415, + "grad_norm": 12.710909928586199, + "learning_rate": 2e-06, + "loss": 0.3617, + "step": 3239 + }, + { + "epoch": 0.7516529404941422, + "grad_norm": 14.861685648929491, + "learning_rate": 2e-06, + "loss": 0.2903, + "step": 3240 + }, + { + "epoch": 0.7518849321424429, + "grad_norm": 10.636791599844129, + "learning_rate": 2e-06, + "loss": 0.2154, + "step": 3241 + }, + { + "epoch": 0.7521169237907436, + "grad_norm": 10.824860251051467, + "learning_rate": 2e-06, + "loss": 0.3095, + "step": 3242 + }, + { + "epoch": 0.7523489154390441, + "grad_norm": 11.258032137291424, + "learning_rate": 2e-06, + "loss": 0.2784, + "step": 3243 + }, + { + "epoch": 0.7525809070873448, + "grad_norm": 12.216580409438913, + "learning_rate": 2e-06, + "loss": 0.3381, + "step": 3244 + }, + { + "epoch": 0.7528128987356455, + "grad_norm": 16.46023841396873, + "learning_rate": 2e-06, + "loss": 0.2966, + "step": 3245 + }, + { + "epoch": 0.7530448903839462, + "grad_norm": 10.773669247612482, + "learning_rate": 2e-06, + "loss": 0.2822, + "step": 3246 + }, + { + "epoch": 0.7532768820322469, + "grad_norm": 15.47427621377556, + "learning_rate": 2e-06, + "loss": 0.3528, + "step": 3247 + }, + { + "epoch": 0.7535088736805475, + "grad_norm": 9.437936233561318, + "learning_rate": 2e-06, + "loss": 0.3515, + "step": 3248 + }, + { + "epoch": 0.7537408653288482, + "grad_norm": 15.060815916367174, + "learning_rate": 2e-06, + "loss": 0.369, + "step": 3249 + }, + { + "epoch": 0.7539728569771488, + "grad_norm": 10.051667383454044, + "learning_rate": 2e-06, + "loss": 0.2607, + "step": 3250 + }, + { + "epoch": 0.7542048486254495, + "grad_norm": 15.210897124764472, + "learning_rate": 2e-06, + "loss": 0.3493, + "step": 3251 + }, + { + "epoch": 0.7544368402737501, + "grad_norm": 16.097661441306023, + "learning_rate": 2e-06, + "loss": 0.2453, + "step": 3252 + }, + { + "epoch": 0.7546688319220508, + "grad_norm": 9.117636788720601, + "learning_rate": 2e-06, + "loss": 0.2432, + "step": 3253 + }, + { + "epoch": 0.7549008235703515, + "grad_norm": 9.98191377832381, + "learning_rate": 2e-06, + "loss": 0.2017, + "step": 3254 + }, + { + "epoch": 0.7551328152186522, + "grad_norm": 7.908774046591722, + "learning_rate": 2e-06, + "loss": 0.2378, + "step": 3255 + }, + { + "epoch": 0.7553648068669528, + "grad_norm": 23.628388777108437, + "learning_rate": 2e-06, + "loss": 0.3264, + "step": 3256 + }, + { + "epoch": 0.7555967985152534, + "grad_norm": 13.1515153464434, + "learning_rate": 2e-06, + "loss": 0.28, + "step": 3257 + }, + { + "epoch": 0.7558287901635541, + "grad_norm": 16.609710434365738, + "learning_rate": 2e-06, + "loss": 0.2632, + "step": 3258 + }, + { + "epoch": 0.7560607818118548, + "grad_norm": 19.25268556355, + "learning_rate": 2e-06, + "loss": 0.2485, + "step": 3259 + }, + { + "epoch": 0.7562927734601554, + "grad_norm": 8.798902479176105, + "learning_rate": 2e-06, + "loss": 0.2291, + "step": 3260 + }, + { + "epoch": 0.7565247651084561, + "grad_norm": 15.102364203693176, + "learning_rate": 2e-06, + "loss": 0.3814, + "step": 3261 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 12.251104054670868, + "learning_rate": 2e-06, + "loss": 0.205, + "step": 3262 + }, + { + "epoch": 0.7569887484050574, + "grad_norm": 17.830770306606684, + "learning_rate": 2e-06, + "loss": 0.3803, + "step": 3263 + }, + { + "epoch": 0.757220740053358, + "grad_norm": 9.898043465896796, + "learning_rate": 2e-06, + "loss": 0.2717, + "step": 3264 + }, + { + "epoch": 0.7574527317016587, + "grad_norm": 10.869198655689024, + "learning_rate": 2e-06, + "loss": 0.2438, + "step": 3265 + }, + { + "epoch": 0.7576847233499594, + "grad_norm": 9.569985886548965, + "learning_rate": 2e-06, + "loss": 0.2298, + "step": 3266 + }, + { + "epoch": 0.7579167149982601, + "grad_norm": 8.895433458214619, + "learning_rate": 2e-06, + "loss": 0.2976, + "step": 3267 + }, + { + "epoch": 0.7581487066465608, + "grad_norm": 10.838387016596009, + "learning_rate": 2e-06, + "loss": 0.1739, + "step": 3268 + }, + { + "epoch": 0.7583806982948614, + "grad_norm": 15.143001781351865, + "learning_rate": 2e-06, + "loss": 0.2297, + "step": 3269 + }, + { + "epoch": 0.758612689943162, + "grad_norm": 15.826963259618264, + "learning_rate": 2e-06, + "loss": 0.2124, + "step": 3270 + }, + { + "epoch": 0.7588446815914627, + "grad_norm": 18.73429903645318, + "learning_rate": 2e-06, + "loss": 0.3329, + "step": 3271 + }, + { + "epoch": 0.7590766732397634, + "grad_norm": 8.054226271696935, + "learning_rate": 2e-06, + "loss": 0.2594, + "step": 3272 + }, + { + "epoch": 0.759308664888064, + "grad_norm": 18.537441641456375, + "learning_rate": 2e-06, + "loss": 0.4262, + "step": 3273 + }, + { + "epoch": 0.7595406565363647, + "grad_norm": 18.060962169878092, + "learning_rate": 2e-06, + "loss": 0.329, + "step": 3274 + }, + { + "epoch": 0.7597726481846654, + "grad_norm": 19.176654897756123, + "learning_rate": 2e-06, + "loss": 0.2127, + "step": 3275 + }, + { + "epoch": 0.7600046398329661, + "grad_norm": 40.19397033267575, + "learning_rate": 2e-06, + "loss": 0.4408, + "step": 3276 + }, + { + "epoch": 0.7602366314812666, + "grad_norm": 14.183460089274515, + "learning_rate": 2e-06, + "loss": 0.2787, + "step": 3277 + }, + { + "epoch": 0.7604686231295673, + "grad_norm": 12.614565362170005, + "learning_rate": 2e-06, + "loss": 0.2865, + "step": 3278 + }, + { + "epoch": 0.760700614777868, + "grad_norm": 15.294760785557783, + "learning_rate": 2e-06, + "loss": 0.3313, + "step": 3279 + }, + { + "epoch": 0.7609326064261687, + "grad_norm": 14.678094198469996, + "learning_rate": 2e-06, + "loss": 0.2752, + "step": 3280 + }, + { + "epoch": 0.7611645980744693, + "grad_norm": 16.512603314388123, + "learning_rate": 2e-06, + "loss": 0.2736, + "step": 3281 + }, + { + "epoch": 0.76139658972277, + "grad_norm": 21.798372206513797, + "learning_rate": 2e-06, + "loss": 0.3324, + "step": 3282 + }, + { + "epoch": 0.7616285813710706, + "grad_norm": 13.556144644099309, + "learning_rate": 2e-06, + "loss": 0.313, + "step": 3283 + }, + { + "epoch": 0.7618605730193713, + "grad_norm": 12.0913929886167, + "learning_rate": 2e-06, + "loss": 0.2542, + "step": 3284 + }, + { + "epoch": 0.762092564667672, + "grad_norm": 8.532666772364651, + "learning_rate": 2e-06, + "loss": 0.2037, + "step": 3285 + }, + { + "epoch": 0.7623245563159726, + "grad_norm": 15.608666752534837, + "learning_rate": 2e-06, + "loss": 0.277, + "step": 3286 + }, + { + "epoch": 0.7625565479642733, + "grad_norm": 12.648901474399997, + "learning_rate": 2e-06, + "loss": 0.403, + "step": 3287 + }, + { + "epoch": 0.762788539612574, + "grad_norm": 15.6219832778702, + "learning_rate": 2e-06, + "loss": 0.3407, + "step": 3288 + }, + { + "epoch": 0.7630205312608747, + "grad_norm": 25.88285292749361, + "learning_rate": 2e-06, + "loss": 0.4021, + "step": 3289 + }, + { + "epoch": 0.7632525229091752, + "grad_norm": 14.51290497078572, + "learning_rate": 2e-06, + "loss": 0.3171, + "step": 3290 + }, + { + "epoch": 0.7634845145574759, + "grad_norm": 15.373419614063504, + "learning_rate": 2e-06, + "loss": 0.3018, + "step": 3291 + }, + { + "epoch": 0.7637165062057766, + "grad_norm": 7.465941023334532, + "learning_rate": 2e-06, + "loss": 0.1827, + "step": 3292 + }, + { + "epoch": 0.7639484978540773, + "grad_norm": 22.68478128905687, + "learning_rate": 2e-06, + "loss": 0.3353, + "step": 3293 + }, + { + "epoch": 0.7641804895023779, + "grad_norm": 12.719013375955988, + "learning_rate": 2e-06, + "loss": 0.2751, + "step": 3294 + }, + { + "epoch": 0.7644124811506786, + "grad_norm": 12.201266141777696, + "learning_rate": 2e-06, + "loss": 0.2502, + "step": 3295 + }, + { + "epoch": 0.7646444727989793, + "grad_norm": 12.401332055108211, + "learning_rate": 2e-06, + "loss": 0.3048, + "step": 3296 + }, + { + "epoch": 0.7648764644472799, + "grad_norm": 9.101908144365112, + "learning_rate": 2e-06, + "loss": 0.2507, + "step": 3297 + }, + { + "epoch": 0.7651084560955805, + "grad_norm": 15.446972836287449, + "learning_rate": 2e-06, + "loss": 0.3506, + "step": 3298 + }, + { + "epoch": 0.7653404477438812, + "grad_norm": 10.270501438242315, + "learning_rate": 2e-06, + "loss": 0.2518, + "step": 3299 + }, + { + "epoch": 0.7655724393921819, + "grad_norm": 10.880237503429663, + "learning_rate": 2e-06, + "loss": 0.2826, + "step": 3300 + }, + { + "epoch": 0.7658044310404826, + "grad_norm": 10.411867222827498, + "learning_rate": 2e-06, + "loss": 0.2021, + "step": 3301 + }, + { + "epoch": 0.7660364226887832, + "grad_norm": 6.641121711480695, + "learning_rate": 2e-06, + "loss": 0.1922, + "step": 3302 + }, + { + "epoch": 0.7662684143370838, + "grad_norm": 16.986816691328375, + "learning_rate": 2e-06, + "loss": 0.3354, + "step": 3303 + }, + { + "epoch": 0.7665004059853845, + "grad_norm": 7.048691651075567, + "learning_rate": 2e-06, + "loss": 0.227, + "step": 3304 + }, + { + "epoch": 0.7667323976336852, + "grad_norm": 10.118835944738485, + "learning_rate": 2e-06, + "loss": 0.1769, + "step": 3305 + }, + { + "epoch": 0.7669643892819858, + "grad_norm": 20.612598747313648, + "learning_rate": 2e-06, + "loss": 0.3865, + "step": 3306 + }, + { + "epoch": 0.7671963809302865, + "grad_norm": 11.739597678261555, + "learning_rate": 2e-06, + "loss": 0.1937, + "step": 3307 + }, + { + "epoch": 0.7674283725785872, + "grad_norm": 13.091498954890323, + "learning_rate": 2e-06, + "loss": 0.3293, + "step": 3308 + }, + { + "epoch": 0.7676603642268879, + "grad_norm": 12.450404437779614, + "learning_rate": 2e-06, + "loss": 0.2431, + "step": 3309 + }, + { + "epoch": 0.7678923558751884, + "grad_norm": 12.381029099949977, + "learning_rate": 2e-06, + "loss": 0.3628, + "step": 3310 + }, + { + "epoch": 0.7681243475234891, + "grad_norm": 12.71570898337696, + "learning_rate": 2e-06, + "loss": 0.1727, + "step": 3311 + }, + { + "epoch": 0.7683563391717898, + "grad_norm": 19.630121385148918, + "learning_rate": 2e-06, + "loss": 0.3437, + "step": 3312 + }, + { + "epoch": 0.7685883308200905, + "grad_norm": 15.42711658110915, + "learning_rate": 2e-06, + "loss": 0.2281, + "step": 3313 + }, + { + "epoch": 0.7688203224683912, + "grad_norm": 6.885955436112426, + "learning_rate": 2e-06, + "loss": 0.1984, + "step": 3314 + }, + { + "epoch": 0.7690523141166918, + "grad_norm": 20.38462034150911, + "learning_rate": 2e-06, + "loss": 0.3107, + "step": 3315 + }, + { + "epoch": 0.7692843057649925, + "grad_norm": 14.158397888232825, + "learning_rate": 2e-06, + "loss": 0.2736, + "step": 3316 + }, + { + "epoch": 0.7695162974132931, + "grad_norm": 18.52903730581829, + "learning_rate": 2e-06, + "loss": 0.2503, + "step": 3317 + }, + { + "epoch": 0.7697482890615938, + "grad_norm": 24.06934932448499, + "learning_rate": 2e-06, + "loss": 0.3461, + "step": 3318 + }, + { + "epoch": 0.7699802807098944, + "grad_norm": 11.039495942718224, + "learning_rate": 2e-06, + "loss": 0.3537, + "step": 3319 + }, + { + "epoch": 0.7702122723581951, + "grad_norm": 17.35805561822558, + "learning_rate": 2e-06, + "loss": 0.3575, + "step": 3320 + }, + { + "epoch": 0.7704442640064958, + "grad_norm": 14.312621737355462, + "learning_rate": 2e-06, + "loss": 0.3117, + "step": 3321 + }, + { + "epoch": 0.7706762556547965, + "grad_norm": 14.30444248042403, + "learning_rate": 2e-06, + "loss": 0.3536, + "step": 3322 + }, + { + "epoch": 0.770908247303097, + "grad_norm": 21.15912940860675, + "learning_rate": 2e-06, + "loss": 0.433, + "step": 3323 + }, + { + "epoch": 0.7711402389513977, + "grad_norm": 12.043006519589039, + "learning_rate": 2e-06, + "loss": 0.2539, + "step": 3324 + }, + { + "epoch": 0.7713722305996984, + "grad_norm": 8.225323237276257, + "learning_rate": 2e-06, + "loss": 0.1483, + "step": 3325 + }, + { + "epoch": 0.7716042222479991, + "grad_norm": 12.721679821297943, + "learning_rate": 2e-06, + "loss": 0.266, + "step": 3326 + }, + { + "epoch": 0.7718362138962997, + "grad_norm": 18.161270535697973, + "learning_rate": 2e-06, + "loss": 0.2539, + "step": 3327 + }, + { + "epoch": 0.7720682055446004, + "grad_norm": 16.406188851303185, + "learning_rate": 2e-06, + "loss": 0.275, + "step": 3328 + }, + { + "epoch": 0.7723001971929011, + "grad_norm": 12.093543887689135, + "learning_rate": 2e-06, + "loss": 0.2418, + "step": 3329 + }, + { + "epoch": 0.7725321888412017, + "grad_norm": 17.715551132036744, + "learning_rate": 2e-06, + "loss": 0.3897, + "step": 3330 + }, + { + "epoch": 0.7727641804895024, + "grad_norm": 12.83841974804794, + "learning_rate": 2e-06, + "loss": 0.3194, + "step": 3331 + }, + { + "epoch": 0.772996172137803, + "grad_norm": 10.728654064148865, + "learning_rate": 2e-06, + "loss": 0.2545, + "step": 3332 + }, + { + "epoch": 0.7732281637861037, + "grad_norm": 10.098533183865884, + "learning_rate": 2e-06, + "loss": 0.354, + "step": 3333 + }, + { + "epoch": 0.7734601554344044, + "grad_norm": 15.94271340760764, + "learning_rate": 2e-06, + "loss": 0.2849, + "step": 3334 + }, + { + "epoch": 0.7736921470827051, + "grad_norm": 15.753903381136727, + "learning_rate": 2e-06, + "loss": 0.3928, + "step": 3335 + }, + { + "epoch": 0.7739241387310056, + "grad_norm": 13.636682401677106, + "learning_rate": 2e-06, + "loss": 0.2834, + "step": 3336 + }, + { + "epoch": 0.7741561303793063, + "grad_norm": 21.367904185410392, + "learning_rate": 2e-06, + "loss": 0.4015, + "step": 3337 + }, + { + "epoch": 0.774388122027607, + "grad_norm": 15.817210585659566, + "learning_rate": 2e-06, + "loss": 0.3041, + "step": 3338 + }, + { + "epoch": 0.7746201136759077, + "grad_norm": 10.298483366955015, + "learning_rate": 2e-06, + "loss": 0.2287, + "step": 3339 + }, + { + "epoch": 0.7748521053242083, + "grad_norm": 19.063689770985306, + "learning_rate": 2e-06, + "loss": 0.4006, + "step": 3340 + }, + { + "epoch": 0.775084096972509, + "grad_norm": 15.485253568225815, + "learning_rate": 2e-06, + "loss": 0.3251, + "step": 3341 + }, + { + "epoch": 0.7753160886208097, + "grad_norm": 8.78843640841125, + "learning_rate": 2e-06, + "loss": 0.4089, + "step": 3342 + }, + { + "epoch": 0.7755480802691103, + "grad_norm": 10.77846730692596, + "learning_rate": 2e-06, + "loss": 0.2857, + "step": 3343 + }, + { + "epoch": 0.7757800719174109, + "grad_norm": 13.253812636990132, + "learning_rate": 2e-06, + "loss": 0.1985, + "step": 3344 + }, + { + "epoch": 0.7760120635657116, + "grad_norm": 10.455512361721096, + "learning_rate": 2e-06, + "loss": 0.3522, + "step": 3345 + }, + { + "epoch": 0.7762440552140123, + "grad_norm": 20.698183046351787, + "learning_rate": 2e-06, + "loss": 0.3496, + "step": 3346 + }, + { + "epoch": 0.776476046862313, + "grad_norm": 7.3385363802913535, + "learning_rate": 2e-06, + "loss": 0.2692, + "step": 3347 + }, + { + "epoch": 0.7767080385106137, + "grad_norm": 14.080623288768594, + "learning_rate": 2e-06, + "loss": 0.2952, + "step": 3348 + }, + { + "epoch": 0.7769400301589143, + "grad_norm": 7.9727325221498155, + "learning_rate": 2e-06, + "loss": 0.2831, + "step": 3349 + }, + { + "epoch": 0.7771720218072149, + "grad_norm": 19.27180239056616, + "learning_rate": 2e-06, + "loss": 0.2993, + "step": 3350 + }, + { + "epoch": 0.7774040134555156, + "grad_norm": 17.39930094442428, + "learning_rate": 2e-06, + "loss": 0.4456, + "step": 3351 + }, + { + "epoch": 0.7776360051038163, + "grad_norm": 8.830695174435625, + "learning_rate": 2e-06, + "loss": 0.2363, + "step": 3352 + }, + { + "epoch": 0.7778679967521169, + "grad_norm": 12.427463571267143, + "learning_rate": 2e-06, + "loss": 0.2699, + "step": 3353 + }, + { + "epoch": 0.7780999884004176, + "grad_norm": 14.715522465947913, + "learning_rate": 2e-06, + "loss": 0.2928, + "step": 3354 + }, + { + "epoch": 0.7783319800487183, + "grad_norm": 9.18348706874619, + "learning_rate": 2e-06, + "loss": 0.333, + "step": 3355 + }, + { + "epoch": 0.7785639716970189, + "grad_norm": 11.394303641825907, + "learning_rate": 2e-06, + "loss": 0.3114, + "step": 3356 + }, + { + "epoch": 0.7787959633453195, + "grad_norm": 8.11159833896116, + "learning_rate": 2e-06, + "loss": 0.2845, + "step": 3357 + }, + { + "epoch": 0.7790279549936202, + "grad_norm": 21.21505875288533, + "learning_rate": 2e-06, + "loss": 0.2917, + "step": 3358 + }, + { + "epoch": 0.7792599466419209, + "grad_norm": 14.613621762182552, + "learning_rate": 2e-06, + "loss": 0.2917, + "step": 3359 + }, + { + "epoch": 0.7794919382902216, + "grad_norm": 8.027194232523648, + "learning_rate": 2e-06, + "loss": 0.1708, + "step": 3360 + }, + { + "epoch": 0.7797239299385222, + "grad_norm": 15.20719004188822, + "learning_rate": 2e-06, + "loss": 0.3193, + "step": 3361 + }, + { + "epoch": 0.7799559215868229, + "grad_norm": 17.25766357716426, + "learning_rate": 2e-06, + "loss": 0.2882, + "step": 3362 + }, + { + "epoch": 0.7801879132351235, + "grad_norm": 11.739053354244797, + "learning_rate": 2e-06, + "loss": 0.1788, + "step": 3363 + }, + { + "epoch": 0.7804199048834242, + "grad_norm": 12.986409445697113, + "learning_rate": 2e-06, + "loss": 0.2961, + "step": 3364 + }, + { + "epoch": 0.7806518965317248, + "grad_norm": 10.988482359288097, + "learning_rate": 2e-06, + "loss": 0.1862, + "step": 3365 + }, + { + "epoch": 0.7808838881800255, + "grad_norm": 15.64859142014504, + "learning_rate": 2e-06, + "loss": 0.2246, + "step": 3366 + }, + { + "epoch": 0.7811158798283262, + "grad_norm": 16.32751338857044, + "learning_rate": 2e-06, + "loss": 0.2849, + "step": 3367 + }, + { + "epoch": 0.7813478714766269, + "grad_norm": 13.76485487728567, + "learning_rate": 2e-06, + "loss": 0.2635, + "step": 3368 + }, + { + "epoch": 0.7815798631249276, + "grad_norm": 10.305451819845539, + "learning_rate": 2e-06, + "loss": 0.2086, + "step": 3369 + }, + { + "epoch": 0.7818118547732281, + "grad_norm": 8.387552087751217, + "learning_rate": 2e-06, + "loss": 0.2816, + "step": 3370 + }, + { + "epoch": 0.7820438464215288, + "grad_norm": 17.442401998483444, + "learning_rate": 2e-06, + "loss": 0.387, + "step": 3371 + }, + { + "epoch": 0.7822758380698295, + "grad_norm": 16.706704163997404, + "learning_rate": 2e-06, + "loss": 0.3759, + "step": 3372 + }, + { + "epoch": 0.7825078297181302, + "grad_norm": 13.969522490290837, + "learning_rate": 2e-06, + "loss": 0.2574, + "step": 3373 + }, + { + "epoch": 0.7827398213664308, + "grad_norm": 7.801153663110549, + "learning_rate": 2e-06, + "loss": 0.2363, + "step": 3374 + }, + { + "epoch": 0.7829718130147315, + "grad_norm": 16.480451323794128, + "learning_rate": 2e-06, + "loss": 0.2805, + "step": 3375 + }, + { + "epoch": 0.7832038046630321, + "grad_norm": 14.186735457312171, + "learning_rate": 2e-06, + "loss": 0.2595, + "step": 3376 + }, + { + "epoch": 0.7834357963113328, + "grad_norm": 11.646776179140373, + "learning_rate": 2e-06, + "loss": 0.2839, + "step": 3377 + }, + { + "epoch": 0.7836677879596334, + "grad_norm": 11.631828438973786, + "learning_rate": 2e-06, + "loss": 0.2361, + "step": 3378 + }, + { + "epoch": 0.7838997796079341, + "grad_norm": 14.183205529064336, + "learning_rate": 2e-06, + "loss": 0.3969, + "step": 3379 + }, + { + "epoch": 0.7841317712562348, + "grad_norm": 10.892634668274297, + "learning_rate": 2e-06, + "loss": 0.2694, + "step": 3380 + }, + { + "epoch": 0.7843637629045355, + "grad_norm": 11.98108815181279, + "learning_rate": 2e-06, + "loss": 0.273, + "step": 3381 + }, + { + "epoch": 0.7845957545528361, + "grad_norm": 8.18435288456861, + "learning_rate": 2e-06, + "loss": 0.2099, + "step": 3382 + }, + { + "epoch": 0.7848277462011367, + "grad_norm": 12.308328485458324, + "learning_rate": 2e-06, + "loss": 0.2194, + "step": 3383 + }, + { + "epoch": 0.7850597378494374, + "grad_norm": 14.88690096912976, + "learning_rate": 2e-06, + "loss": 0.3388, + "step": 3384 + }, + { + "epoch": 0.7852917294977381, + "grad_norm": 23.606043344795978, + "learning_rate": 2e-06, + "loss": 0.3365, + "step": 3385 + }, + { + "epoch": 0.7855237211460387, + "grad_norm": 17.447552640446613, + "learning_rate": 2e-06, + "loss": 0.3042, + "step": 3386 + }, + { + "epoch": 0.7857557127943394, + "grad_norm": 15.865962850844356, + "learning_rate": 2e-06, + "loss": 0.3659, + "step": 3387 + }, + { + "epoch": 0.7859877044426401, + "grad_norm": 11.920441370957631, + "learning_rate": 2e-06, + "loss": 0.2215, + "step": 3388 + }, + { + "epoch": 0.7862196960909408, + "grad_norm": 16.597232700936654, + "learning_rate": 2e-06, + "loss": 0.3375, + "step": 3389 + }, + { + "epoch": 0.7864516877392413, + "grad_norm": 18.77556256589631, + "learning_rate": 2e-06, + "loss": 0.2961, + "step": 3390 + }, + { + "epoch": 0.786683679387542, + "grad_norm": 12.901281129823335, + "learning_rate": 2e-06, + "loss": 0.2619, + "step": 3391 + }, + { + "epoch": 0.7869156710358427, + "grad_norm": 12.44922517671116, + "learning_rate": 2e-06, + "loss": 0.1882, + "step": 3392 + }, + { + "epoch": 0.7871476626841434, + "grad_norm": 14.76842717352895, + "learning_rate": 2e-06, + "loss": 0.2978, + "step": 3393 + }, + { + "epoch": 0.7873796543324441, + "grad_norm": 14.152124730026602, + "learning_rate": 2e-06, + "loss": 0.2882, + "step": 3394 + }, + { + "epoch": 0.7876116459807447, + "grad_norm": 18.40012721570145, + "learning_rate": 2e-06, + "loss": 0.3215, + "step": 3395 + }, + { + "epoch": 0.7878436376290453, + "grad_norm": 11.181603808392303, + "learning_rate": 2e-06, + "loss": 0.2649, + "step": 3396 + }, + { + "epoch": 0.788075629277346, + "grad_norm": 18.509419869703052, + "learning_rate": 2e-06, + "loss": 0.3095, + "step": 3397 + }, + { + "epoch": 0.7883076209256467, + "grad_norm": 14.184333048504568, + "learning_rate": 2e-06, + "loss": 0.2602, + "step": 3398 + }, + { + "epoch": 0.7885396125739473, + "grad_norm": 12.587887359899812, + "learning_rate": 2e-06, + "loss": 0.2427, + "step": 3399 + }, + { + "epoch": 0.788771604222248, + "grad_norm": 24.507560493039886, + "learning_rate": 2e-06, + "loss": 0.335, + "step": 3400 + }, + { + "epoch": 0.7890035958705487, + "grad_norm": 4.819863587109384, + "learning_rate": 2e-06, + "loss": 0.1711, + "step": 3401 + }, + { + "epoch": 0.7892355875188494, + "grad_norm": 22.111328616473372, + "learning_rate": 2e-06, + "loss": 0.3374, + "step": 3402 + }, + { + "epoch": 0.7894675791671499, + "grad_norm": 11.906781767275788, + "learning_rate": 2e-06, + "loss": 0.207, + "step": 3403 + }, + { + "epoch": 0.7896995708154506, + "grad_norm": 18.47361798133603, + "learning_rate": 2e-06, + "loss": 0.3547, + "step": 3404 + }, + { + "epoch": 0.7899315624637513, + "grad_norm": 15.688065948218464, + "learning_rate": 2e-06, + "loss": 0.2639, + "step": 3405 + }, + { + "epoch": 0.790163554112052, + "grad_norm": 26.116619607587307, + "learning_rate": 2e-06, + "loss": 0.3287, + "step": 3406 + }, + { + "epoch": 0.7903955457603526, + "grad_norm": 8.882349560205764, + "learning_rate": 2e-06, + "loss": 0.2585, + "step": 3407 + }, + { + "epoch": 0.7906275374086533, + "grad_norm": 9.446665087382272, + "learning_rate": 2e-06, + "loss": 0.2615, + "step": 3408 + }, + { + "epoch": 0.790859529056954, + "grad_norm": 18.048231253616922, + "learning_rate": 2e-06, + "loss": 0.3177, + "step": 3409 + }, + { + "epoch": 0.7910915207052546, + "grad_norm": 14.29683313119664, + "learning_rate": 2e-06, + "loss": 0.2627, + "step": 3410 + }, + { + "epoch": 0.7913235123535552, + "grad_norm": 10.825488918220556, + "learning_rate": 2e-06, + "loss": 0.2825, + "step": 3411 + }, + { + "epoch": 0.7915555040018559, + "grad_norm": 8.743391123103203, + "learning_rate": 2e-06, + "loss": 0.3564, + "step": 3412 + }, + { + "epoch": 0.7917874956501566, + "grad_norm": 8.390341529464372, + "learning_rate": 2e-06, + "loss": 0.2535, + "step": 3413 + }, + { + "epoch": 0.7920194872984573, + "grad_norm": 11.615387497887594, + "learning_rate": 2e-06, + "loss": 0.2495, + "step": 3414 + }, + { + "epoch": 0.792251478946758, + "grad_norm": 10.339780478199742, + "learning_rate": 2e-06, + "loss": 0.2124, + "step": 3415 + }, + { + "epoch": 0.7924834705950585, + "grad_norm": 19.814645086976675, + "learning_rate": 2e-06, + "loss": 0.3676, + "step": 3416 + }, + { + "epoch": 0.7927154622433592, + "grad_norm": 11.85489858812581, + "learning_rate": 2e-06, + "loss": 0.2406, + "step": 3417 + }, + { + "epoch": 0.7929474538916599, + "grad_norm": 16.299759592564307, + "learning_rate": 2e-06, + "loss": 0.2692, + "step": 3418 + }, + { + "epoch": 0.7931794455399606, + "grad_norm": 13.872791910011749, + "learning_rate": 2e-06, + "loss": 0.305, + "step": 3419 + }, + { + "epoch": 0.7934114371882612, + "grad_norm": 19.750414611149914, + "learning_rate": 2e-06, + "loss": 0.2335, + "step": 3420 + }, + { + "epoch": 0.7936434288365619, + "grad_norm": 19.956545241264873, + "learning_rate": 2e-06, + "loss": 0.3085, + "step": 3421 + }, + { + "epoch": 0.7938754204848626, + "grad_norm": 8.149993746924585, + "learning_rate": 2e-06, + "loss": 0.2059, + "step": 3422 + }, + { + "epoch": 0.7941074121331632, + "grad_norm": 10.028973054761785, + "learning_rate": 2e-06, + "loss": 0.2432, + "step": 3423 + }, + { + "epoch": 0.7943394037814638, + "grad_norm": 18.778972827943914, + "learning_rate": 2e-06, + "loss": 0.3628, + "step": 3424 + }, + { + "epoch": 0.7945713954297645, + "grad_norm": 13.930116536144318, + "learning_rate": 2e-06, + "loss": 0.2445, + "step": 3425 + }, + { + "epoch": 0.7948033870780652, + "grad_norm": 14.63089032989506, + "learning_rate": 2e-06, + "loss": 0.2343, + "step": 3426 + }, + { + "epoch": 0.7950353787263659, + "grad_norm": 15.72743340478428, + "learning_rate": 2e-06, + "loss": 0.3289, + "step": 3427 + }, + { + "epoch": 0.7952673703746665, + "grad_norm": 12.471700775785143, + "learning_rate": 2e-06, + "loss": 0.2463, + "step": 3428 + }, + { + "epoch": 0.7954993620229672, + "grad_norm": 15.784691045568465, + "learning_rate": 2e-06, + "loss": 0.3693, + "step": 3429 + }, + { + "epoch": 0.7957313536712678, + "grad_norm": 11.487372968049451, + "learning_rate": 2e-06, + "loss": 0.2886, + "step": 3430 + }, + { + "epoch": 0.7959633453195685, + "grad_norm": 23.592938069770618, + "learning_rate": 2e-06, + "loss": 0.413, + "step": 3431 + }, + { + "epoch": 0.7961953369678691, + "grad_norm": 13.804563113914236, + "learning_rate": 2e-06, + "loss": 0.2107, + "step": 3432 + }, + { + "epoch": 0.7964273286161698, + "grad_norm": 11.229386196040094, + "learning_rate": 2e-06, + "loss": 0.3498, + "step": 3433 + }, + { + "epoch": 0.7966593202644705, + "grad_norm": 11.510512004956807, + "learning_rate": 2e-06, + "loss": 0.2175, + "step": 3434 + }, + { + "epoch": 0.7968913119127712, + "grad_norm": 16.424219271937943, + "learning_rate": 2e-06, + "loss": 0.2835, + "step": 3435 + }, + { + "epoch": 0.7971233035610717, + "grad_norm": 11.513067182158752, + "learning_rate": 2e-06, + "loss": 0.2623, + "step": 3436 + }, + { + "epoch": 0.7973552952093724, + "grad_norm": 9.572539910406347, + "learning_rate": 2e-06, + "loss": 0.1982, + "step": 3437 + }, + { + "epoch": 0.7975872868576731, + "grad_norm": 17.580798746144982, + "learning_rate": 2e-06, + "loss": 0.2625, + "step": 3438 + }, + { + "epoch": 0.7978192785059738, + "grad_norm": 12.457497591236477, + "learning_rate": 2e-06, + "loss": 0.2578, + "step": 3439 + }, + { + "epoch": 0.7980512701542745, + "grad_norm": 14.722234583713858, + "learning_rate": 2e-06, + "loss": 0.3125, + "step": 3440 + }, + { + "epoch": 0.7982832618025751, + "grad_norm": 8.417471859774857, + "learning_rate": 2e-06, + "loss": 0.279, + "step": 3441 + }, + { + "epoch": 0.7985152534508758, + "grad_norm": 13.0848253183779, + "learning_rate": 2e-06, + "loss": 0.3323, + "step": 3442 + }, + { + "epoch": 0.7987472450991764, + "grad_norm": 13.163029759028204, + "learning_rate": 2e-06, + "loss": 0.2724, + "step": 3443 + }, + { + "epoch": 0.7989792367474771, + "grad_norm": 7.058717519417359, + "learning_rate": 2e-06, + "loss": 0.157, + "step": 3444 + }, + { + "epoch": 0.7992112283957777, + "grad_norm": 15.644201443704182, + "learning_rate": 2e-06, + "loss": 0.3231, + "step": 3445 + }, + { + "epoch": 0.7994432200440784, + "grad_norm": 16.464898783661045, + "learning_rate": 2e-06, + "loss": 0.3016, + "step": 3446 + }, + { + "epoch": 0.7996752116923791, + "grad_norm": 13.152454899489452, + "learning_rate": 2e-06, + "loss": 0.2524, + "step": 3447 + }, + { + "epoch": 0.7999072033406798, + "grad_norm": 16.487316256533344, + "learning_rate": 2e-06, + "loss": 0.3709, + "step": 3448 + }, + { + "epoch": 0.8001391949889805, + "grad_norm": 9.283139468973133, + "learning_rate": 2e-06, + "loss": 0.2256, + "step": 3449 + }, + { + "epoch": 0.800371186637281, + "grad_norm": 18.27553759166784, + "learning_rate": 2e-06, + "loss": 0.309, + "step": 3450 + }, + { + "epoch": 0.8006031782855817, + "grad_norm": 17.61707193996193, + "learning_rate": 2e-06, + "loss": 0.2662, + "step": 3451 + }, + { + "epoch": 0.8008351699338824, + "grad_norm": 13.457161435100732, + "learning_rate": 2e-06, + "loss": 0.2381, + "step": 3452 + }, + { + "epoch": 0.801067161582183, + "grad_norm": 13.991405109514043, + "learning_rate": 2e-06, + "loss": 0.2798, + "step": 3453 + }, + { + "epoch": 0.8012991532304837, + "grad_norm": 18.77836921343787, + "learning_rate": 2e-06, + "loss": 0.3454, + "step": 3454 + }, + { + "epoch": 0.8015311448787844, + "grad_norm": 24.73440836115384, + "learning_rate": 2e-06, + "loss": 0.3345, + "step": 3455 + }, + { + "epoch": 0.801763136527085, + "grad_norm": 19.19064961801089, + "learning_rate": 2e-06, + "loss": 0.2585, + "step": 3456 + }, + { + "epoch": 0.8019951281753857, + "grad_norm": 10.14125178150145, + "learning_rate": 2e-06, + "loss": 0.2333, + "step": 3457 + }, + { + "epoch": 0.8022271198236863, + "grad_norm": 7.075252080603709, + "learning_rate": 2e-06, + "loss": 0.1803, + "step": 3458 + }, + { + "epoch": 0.802459111471987, + "grad_norm": 15.228216202553677, + "learning_rate": 2e-06, + "loss": 0.2749, + "step": 3459 + }, + { + "epoch": 0.8026911031202877, + "grad_norm": 15.90276123435102, + "learning_rate": 2e-06, + "loss": 0.3938, + "step": 3460 + }, + { + "epoch": 0.8029230947685884, + "grad_norm": 12.67302214729661, + "learning_rate": 2e-06, + "loss": 0.2562, + "step": 3461 + }, + { + "epoch": 0.803155086416889, + "grad_norm": 14.535054771013746, + "learning_rate": 2e-06, + "loss": 0.3209, + "step": 3462 + }, + { + "epoch": 0.8033870780651896, + "grad_norm": 10.919756951774275, + "learning_rate": 2e-06, + "loss": 0.2377, + "step": 3463 + }, + { + "epoch": 0.8036190697134903, + "grad_norm": 14.211186237722966, + "learning_rate": 2e-06, + "loss": 0.303, + "step": 3464 + }, + { + "epoch": 0.803851061361791, + "grad_norm": 10.324439146983549, + "learning_rate": 2e-06, + "loss": 0.2724, + "step": 3465 + }, + { + "epoch": 0.8040830530100916, + "grad_norm": 18.212276946809332, + "learning_rate": 2e-06, + "loss": 0.2857, + "step": 3466 + }, + { + "epoch": 0.8043150446583923, + "grad_norm": 9.08491619843411, + "learning_rate": 2e-06, + "loss": 0.3044, + "step": 3467 + }, + { + "epoch": 0.804547036306693, + "grad_norm": 13.555167646674509, + "learning_rate": 2e-06, + "loss": 0.3019, + "step": 3468 + }, + { + "epoch": 0.8047790279549937, + "grad_norm": 14.173546709054419, + "learning_rate": 2e-06, + "loss": 0.2405, + "step": 3469 + }, + { + "epoch": 0.8050110196032942, + "grad_norm": 13.906666800937307, + "learning_rate": 2e-06, + "loss": 0.293, + "step": 3470 + }, + { + "epoch": 0.8052430112515949, + "grad_norm": 20.61968237180272, + "learning_rate": 2e-06, + "loss": 0.3068, + "step": 3471 + }, + { + "epoch": 0.8054750028998956, + "grad_norm": 7.016349103407426, + "learning_rate": 2e-06, + "loss": 0.2895, + "step": 3472 + }, + { + "epoch": 0.8057069945481963, + "grad_norm": 9.27839396191916, + "learning_rate": 2e-06, + "loss": 0.259, + "step": 3473 + }, + { + "epoch": 0.805938986196497, + "grad_norm": 9.333666595921253, + "learning_rate": 2e-06, + "loss": 0.2843, + "step": 3474 + }, + { + "epoch": 0.8061709778447976, + "grad_norm": 12.925200076949773, + "learning_rate": 2e-06, + "loss": 0.2524, + "step": 3475 + }, + { + "epoch": 0.8064029694930982, + "grad_norm": 23.82557788438642, + "learning_rate": 2e-06, + "loss": 0.3742, + "step": 3476 + }, + { + "epoch": 0.8066349611413989, + "grad_norm": 12.30874586445278, + "learning_rate": 2e-06, + "loss": 0.2475, + "step": 3477 + }, + { + "epoch": 0.8068669527896996, + "grad_norm": 17.45344333498659, + "learning_rate": 2e-06, + "loss": 0.2469, + "step": 3478 + }, + { + "epoch": 0.8070989444380002, + "grad_norm": 20.44619451931843, + "learning_rate": 2e-06, + "loss": 0.4079, + "step": 3479 + }, + { + "epoch": 0.8073309360863009, + "grad_norm": 16.5515046014371, + "learning_rate": 2e-06, + "loss": 0.3359, + "step": 3480 + }, + { + "epoch": 0.8075629277346016, + "grad_norm": 24.519338165710717, + "learning_rate": 2e-06, + "loss": 0.3352, + "step": 3481 + }, + { + "epoch": 0.8077949193829023, + "grad_norm": 14.495647774820247, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 3482 + }, + { + "epoch": 0.8080269110312028, + "grad_norm": 35.31943301373369, + "learning_rate": 2e-06, + "loss": 0.5115, + "step": 3483 + }, + { + "epoch": 0.8082589026795035, + "grad_norm": 13.811477132735103, + "learning_rate": 2e-06, + "loss": 0.3339, + "step": 3484 + }, + { + "epoch": 0.8084908943278042, + "grad_norm": 11.395040916060081, + "learning_rate": 2e-06, + "loss": 0.3566, + "step": 3485 + }, + { + "epoch": 0.8087228859761049, + "grad_norm": 6.689994663714095, + "learning_rate": 2e-06, + "loss": 0.2268, + "step": 3486 + }, + { + "epoch": 0.8089548776244055, + "grad_norm": 7.131946165209261, + "learning_rate": 2e-06, + "loss": 0.2644, + "step": 3487 + }, + { + "epoch": 0.8091868692727062, + "grad_norm": 13.33439706490419, + "learning_rate": 2e-06, + "loss": 0.3359, + "step": 3488 + }, + { + "epoch": 0.8094188609210068, + "grad_norm": 7.313970577613089, + "learning_rate": 2e-06, + "loss": 0.2157, + "step": 3489 + }, + { + "epoch": 0.8096508525693075, + "grad_norm": 8.979295973783886, + "learning_rate": 2e-06, + "loss": 0.2594, + "step": 3490 + }, + { + "epoch": 0.8098828442176081, + "grad_norm": 7.810909631076036, + "learning_rate": 2e-06, + "loss": 0.2591, + "step": 3491 + }, + { + "epoch": 0.8101148358659088, + "grad_norm": 12.246774385443148, + "learning_rate": 2e-06, + "loss": 0.3403, + "step": 3492 + }, + { + "epoch": 0.8103468275142095, + "grad_norm": 9.947901848029089, + "learning_rate": 2e-06, + "loss": 0.2852, + "step": 3493 + }, + { + "epoch": 0.8105788191625102, + "grad_norm": 11.094416640136869, + "learning_rate": 2e-06, + "loss": 0.2473, + "step": 3494 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 11.774081114699563, + "learning_rate": 2e-06, + "loss": 0.3085, + "step": 3495 + }, + { + "epoch": 0.8110428024591114, + "grad_norm": 10.698679440962588, + "learning_rate": 2e-06, + "loss": 0.2418, + "step": 3496 + }, + { + "epoch": 0.8112747941074121, + "grad_norm": 15.764409636004656, + "learning_rate": 2e-06, + "loss": 0.2487, + "step": 3497 + }, + { + "epoch": 0.8115067857557128, + "grad_norm": 15.765164065297412, + "learning_rate": 2e-06, + "loss": 0.3738, + "step": 3498 + }, + { + "epoch": 0.8117387774040135, + "grad_norm": 13.187289856086876, + "learning_rate": 2e-06, + "loss": 0.3151, + "step": 3499 + }, + { + "epoch": 0.8119707690523141, + "grad_norm": 13.801830481066148, + "learning_rate": 2e-06, + "loss": 0.2247, + "step": 3500 + }, + { + "epoch": 0.8122027607006148, + "grad_norm": 11.524118770924476, + "learning_rate": 2e-06, + "loss": 0.2742, + "step": 3501 + }, + { + "epoch": 0.8124347523489155, + "grad_norm": 20.522591212945343, + "learning_rate": 2e-06, + "loss": 0.3449, + "step": 3502 + }, + { + "epoch": 0.8126667439972161, + "grad_norm": 11.509315641920494, + "learning_rate": 2e-06, + "loss": 0.2796, + "step": 3503 + }, + { + "epoch": 0.8128987356455167, + "grad_norm": 5.488506283368135, + "learning_rate": 2e-06, + "loss": 0.1703, + "step": 3504 + }, + { + "epoch": 0.8131307272938174, + "grad_norm": 11.409597576924604, + "learning_rate": 2e-06, + "loss": 0.2738, + "step": 3505 + }, + { + "epoch": 0.8133627189421181, + "grad_norm": 14.455097672576871, + "learning_rate": 2e-06, + "loss": 0.3249, + "step": 3506 + }, + { + "epoch": 0.8135947105904188, + "grad_norm": 15.071206169953461, + "learning_rate": 2e-06, + "loss": 0.3254, + "step": 3507 + }, + { + "epoch": 0.8138267022387194, + "grad_norm": 10.482927998182806, + "learning_rate": 2e-06, + "loss": 0.2376, + "step": 3508 + }, + { + "epoch": 0.81405869388702, + "grad_norm": 25.693406081649588, + "learning_rate": 2e-06, + "loss": 0.3781, + "step": 3509 + }, + { + "epoch": 0.8142906855353207, + "grad_norm": 13.608869586175004, + "learning_rate": 2e-06, + "loss": 0.2643, + "step": 3510 + }, + { + "epoch": 0.8145226771836214, + "grad_norm": 12.004444029483697, + "learning_rate": 2e-06, + "loss": 0.2325, + "step": 3511 + }, + { + "epoch": 0.814754668831922, + "grad_norm": 12.940201273993516, + "learning_rate": 2e-06, + "loss": 0.2498, + "step": 3512 + }, + { + "epoch": 0.8149866604802227, + "grad_norm": 18.105886390268846, + "learning_rate": 2e-06, + "loss": 0.2966, + "step": 3513 + }, + { + "epoch": 0.8152186521285234, + "grad_norm": 9.755817402936005, + "learning_rate": 2e-06, + "loss": 0.2896, + "step": 3514 + }, + { + "epoch": 0.8154506437768241, + "grad_norm": 20.685876534208933, + "learning_rate": 2e-06, + "loss": 0.2989, + "step": 3515 + }, + { + "epoch": 0.8156826354251246, + "grad_norm": 10.772642056094304, + "learning_rate": 2e-06, + "loss": 0.2951, + "step": 3516 + }, + { + "epoch": 0.8159146270734253, + "grad_norm": 22.421591893301528, + "learning_rate": 2e-06, + "loss": 0.3378, + "step": 3517 + }, + { + "epoch": 0.816146618721726, + "grad_norm": 15.838181455870886, + "learning_rate": 2e-06, + "loss": 0.3841, + "step": 3518 + }, + { + "epoch": 0.8163786103700267, + "grad_norm": 10.795058033101041, + "learning_rate": 2e-06, + "loss": 0.2946, + "step": 3519 + }, + { + "epoch": 0.8166106020183274, + "grad_norm": 10.703899141780543, + "learning_rate": 2e-06, + "loss": 0.3325, + "step": 3520 + }, + { + "epoch": 0.816842593666628, + "grad_norm": 8.945582349508102, + "learning_rate": 2e-06, + "loss": 0.2288, + "step": 3521 + }, + { + "epoch": 0.8170745853149287, + "grad_norm": 16.457137415765683, + "learning_rate": 2e-06, + "loss": 0.3724, + "step": 3522 + }, + { + "epoch": 0.8173065769632293, + "grad_norm": 14.915024776601184, + "learning_rate": 2e-06, + "loss": 0.2437, + "step": 3523 + }, + { + "epoch": 0.81753856861153, + "grad_norm": 19.130665774689742, + "learning_rate": 2e-06, + "loss": 0.42, + "step": 3524 + }, + { + "epoch": 0.8177705602598306, + "grad_norm": 11.793326568443652, + "learning_rate": 2e-06, + "loss": 0.2295, + "step": 3525 + }, + { + "epoch": 0.8180025519081313, + "grad_norm": 6.473724516177291, + "learning_rate": 2e-06, + "loss": 0.1777, + "step": 3526 + }, + { + "epoch": 0.818234543556432, + "grad_norm": 13.25425715941778, + "learning_rate": 2e-06, + "loss": 0.26, + "step": 3527 + }, + { + "epoch": 0.8184665352047327, + "grad_norm": 10.245529294799168, + "learning_rate": 2e-06, + "loss": 0.3425, + "step": 3528 + }, + { + "epoch": 0.8186985268530332, + "grad_norm": 6.669639477632707, + "learning_rate": 2e-06, + "loss": 0.2318, + "step": 3529 + }, + { + "epoch": 0.8189305185013339, + "grad_norm": 18.227102749489706, + "learning_rate": 2e-06, + "loss": 0.2112, + "step": 3530 + }, + { + "epoch": 0.8191625101496346, + "grad_norm": 9.03996463876371, + "learning_rate": 2e-06, + "loss": 0.296, + "step": 3531 + }, + { + "epoch": 0.8193945017979353, + "grad_norm": 14.128603749034852, + "learning_rate": 2e-06, + "loss": 0.228, + "step": 3532 + }, + { + "epoch": 0.819626493446236, + "grad_norm": 19.581050800333305, + "learning_rate": 2e-06, + "loss": 0.2736, + "step": 3533 + }, + { + "epoch": 0.8198584850945366, + "grad_norm": 10.830024818169084, + "learning_rate": 2e-06, + "loss": 0.4177, + "step": 3534 + }, + { + "epoch": 0.8200904767428373, + "grad_norm": 10.547522500539598, + "learning_rate": 2e-06, + "loss": 0.2547, + "step": 3535 + }, + { + "epoch": 0.8203224683911379, + "grad_norm": 32.3173599240258, + "learning_rate": 2e-06, + "loss": 0.3983, + "step": 3536 + }, + { + "epoch": 0.8205544600394385, + "grad_norm": 13.226067374404431, + "learning_rate": 2e-06, + "loss": 0.2506, + "step": 3537 + }, + { + "epoch": 0.8207864516877392, + "grad_norm": 10.462376223619785, + "learning_rate": 2e-06, + "loss": 0.2232, + "step": 3538 + }, + { + "epoch": 0.8210184433360399, + "grad_norm": 32.82054766950882, + "learning_rate": 2e-06, + "loss": 0.3654, + "step": 3539 + }, + { + "epoch": 0.8212504349843406, + "grad_norm": 10.860846241032947, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 3540 + }, + { + "epoch": 0.8214824266326413, + "grad_norm": 8.995251473292827, + "learning_rate": 2e-06, + "loss": 0.3209, + "step": 3541 + }, + { + "epoch": 0.8217144182809419, + "grad_norm": 15.77422954469678, + "learning_rate": 2e-06, + "loss": 0.3464, + "step": 3542 + }, + { + "epoch": 0.8219464099292425, + "grad_norm": 10.039797919380941, + "learning_rate": 2e-06, + "loss": 0.3015, + "step": 3543 + }, + { + "epoch": 0.8221784015775432, + "grad_norm": 9.594735943914506, + "learning_rate": 2e-06, + "loss": 0.2629, + "step": 3544 + }, + { + "epoch": 0.8224103932258439, + "grad_norm": 13.127890377106954, + "learning_rate": 2e-06, + "loss": 0.2683, + "step": 3545 + }, + { + "epoch": 0.8226423848741445, + "grad_norm": 6.018401443740743, + "learning_rate": 2e-06, + "loss": 0.1822, + "step": 3546 + }, + { + "epoch": 0.8228743765224452, + "grad_norm": 10.239555624879175, + "learning_rate": 2e-06, + "loss": 0.2981, + "step": 3547 + }, + { + "epoch": 0.8231063681707459, + "grad_norm": 8.44416471941474, + "learning_rate": 2e-06, + "loss": 0.1853, + "step": 3548 + }, + { + "epoch": 0.8233383598190465, + "grad_norm": 11.946867911578675, + "learning_rate": 2e-06, + "loss": 0.2552, + "step": 3549 + }, + { + "epoch": 0.8235703514673471, + "grad_norm": 17.157645834778513, + "learning_rate": 2e-06, + "loss": 0.3434, + "step": 3550 + }, + { + "epoch": 0.8238023431156478, + "grad_norm": 15.446833564674439, + "learning_rate": 2e-06, + "loss": 0.2505, + "step": 3551 + }, + { + "epoch": 0.8240343347639485, + "grad_norm": 16.49241567046502, + "learning_rate": 2e-06, + "loss": 0.2732, + "step": 3552 + }, + { + "epoch": 0.8242663264122492, + "grad_norm": 15.670109551955258, + "learning_rate": 2e-06, + "loss": 0.3211, + "step": 3553 + }, + { + "epoch": 0.8244983180605499, + "grad_norm": 16.140530370283813, + "learning_rate": 2e-06, + "loss": 0.2206, + "step": 3554 + }, + { + "epoch": 0.8247303097088505, + "grad_norm": 10.511695486802429, + "learning_rate": 2e-06, + "loss": 0.2428, + "step": 3555 + }, + { + "epoch": 0.8249623013571511, + "grad_norm": 10.306828334456238, + "learning_rate": 2e-06, + "loss": 0.2865, + "step": 3556 + }, + { + "epoch": 0.8251942930054518, + "grad_norm": 15.412563379487153, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 3557 + }, + { + "epoch": 0.8254262846537525, + "grad_norm": 14.469061271601475, + "learning_rate": 2e-06, + "loss": 0.3926, + "step": 3558 + }, + { + "epoch": 0.8256582763020531, + "grad_norm": 12.314557820247007, + "learning_rate": 2e-06, + "loss": 0.285, + "step": 3559 + }, + { + "epoch": 0.8258902679503538, + "grad_norm": 20.344101930227662, + "learning_rate": 2e-06, + "loss": 0.2161, + "step": 3560 + }, + { + "epoch": 0.8261222595986545, + "grad_norm": 14.321188193323712, + "learning_rate": 2e-06, + "loss": 0.282, + "step": 3561 + }, + { + "epoch": 0.8263542512469552, + "grad_norm": 10.369535133540206, + "learning_rate": 2e-06, + "loss": 0.2756, + "step": 3562 + }, + { + "epoch": 0.8265862428952557, + "grad_norm": 14.405162531020206, + "learning_rate": 2e-06, + "loss": 0.2895, + "step": 3563 + }, + { + "epoch": 0.8268182345435564, + "grad_norm": 27.51068067463199, + "learning_rate": 2e-06, + "loss": 0.3162, + "step": 3564 + }, + { + "epoch": 0.8270502261918571, + "grad_norm": 9.47044995543265, + "learning_rate": 2e-06, + "loss": 0.1824, + "step": 3565 + }, + { + "epoch": 0.8272822178401578, + "grad_norm": 13.374539645458363, + "learning_rate": 2e-06, + "loss": 0.2139, + "step": 3566 + }, + { + "epoch": 0.8275142094884584, + "grad_norm": 17.735043818984096, + "learning_rate": 2e-06, + "loss": 0.2542, + "step": 3567 + }, + { + "epoch": 0.8277462011367591, + "grad_norm": 9.242686802461979, + "learning_rate": 2e-06, + "loss": 0.1826, + "step": 3568 + }, + { + "epoch": 0.8279781927850597, + "grad_norm": 5.677185399446114, + "learning_rate": 2e-06, + "loss": 0.1462, + "step": 3569 + }, + { + "epoch": 0.8282101844333604, + "grad_norm": 12.844155490278478, + "learning_rate": 2e-06, + "loss": 0.4044, + "step": 3570 + }, + { + "epoch": 0.828442176081661, + "grad_norm": 15.018609163790327, + "learning_rate": 2e-06, + "loss": 0.4075, + "step": 3571 + }, + { + "epoch": 0.8286741677299617, + "grad_norm": 28.60480917901526, + "learning_rate": 2e-06, + "loss": 0.3256, + "step": 3572 + }, + { + "epoch": 0.8289061593782624, + "grad_norm": 11.797828274897043, + "learning_rate": 2e-06, + "loss": 0.2946, + "step": 3573 + }, + { + "epoch": 0.8291381510265631, + "grad_norm": 12.151158127436506, + "learning_rate": 2e-06, + "loss": 0.3089, + "step": 3574 + }, + { + "epoch": 0.8293701426748638, + "grad_norm": 10.69731287572852, + "learning_rate": 2e-06, + "loss": 0.1956, + "step": 3575 + }, + { + "epoch": 0.8296021343231643, + "grad_norm": 15.209091661657826, + "learning_rate": 2e-06, + "loss": 0.3467, + "step": 3576 + }, + { + "epoch": 0.829834125971465, + "grad_norm": 9.48025969073124, + "learning_rate": 2e-06, + "loss": 0.3034, + "step": 3577 + }, + { + "epoch": 0.8300661176197657, + "grad_norm": 22.156690040220127, + "learning_rate": 2e-06, + "loss": 0.2798, + "step": 3578 + }, + { + "epoch": 0.8302981092680664, + "grad_norm": 17.120131547477015, + "learning_rate": 2e-06, + "loss": 0.4078, + "step": 3579 + }, + { + "epoch": 0.830530100916367, + "grad_norm": 13.727506476081903, + "learning_rate": 2e-06, + "loss": 0.2804, + "step": 3580 + }, + { + "epoch": 0.8307620925646677, + "grad_norm": 9.66740188676226, + "learning_rate": 2e-06, + "loss": 0.2391, + "step": 3581 + }, + { + "epoch": 0.8309940842129684, + "grad_norm": 17.59160175368584, + "learning_rate": 2e-06, + "loss": 0.3679, + "step": 3582 + }, + { + "epoch": 0.831226075861269, + "grad_norm": 17.56972129711204, + "learning_rate": 2e-06, + "loss": 0.3698, + "step": 3583 + }, + { + "epoch": 0.8314580675095696, + "grad_norm": 13.197156971473062, + "learning_rate": 2e-06, + "loss": 0.2521, + "step": 3584 + }, + { + "epoch": 0.8316900591578703, + "grad_norm": 14.553013737894982, + "learning_rate": 2e-06, + "loss": 0.2363, + "step": 3585 + }, + { + "epoch": 0.831922050806171, + "grad_norm": 8.26424690417724, + "learning_rate": 2e-06, + "loss": 0.1732, + "step": 3586 + }, + { + "epoch": 0.8321540424544717, + "grad_norm": 16.971467765915932, + "learning_rate": 2e-06, + "loss": 0.2755, + "step": 3587 + }, + { + "epoch": 0.8323860341027723, + "grad_norm": 9.547199083280193, + "learning_rate": 2e-06, + "loss": 0.2821, + "step": 3588 + }, + { + "epoch": 0.8326180257510729, + "grad_norm": 8.19466444893716, + "learning_rate": 2e-06, + "loss": 0.1939, + "step": 3589 + }, + { + "epoch": 0.8328500173993736, + "grad_norm": 13.773792857129889, + "learning_rate": 2e-06, + "loss": 0.2627, + "step": 3590 + }, + { + "epoch": 0.8330820090476743, + "grad_norm": 25.164178842438982, + "learning_rate": 2e-06, + "loss": 0.1979, + "step": 3591 + }, + { + "epoch": 0.8333140006959749, + "grad_norm": 19.67131566725128, + "learning_rate": 2e-06, + "loss": 0.4116, + "step": 3592 + }, + { + "epoch": 0.8335459923442756, + "grad_norm": 11.797629439099808, + "learning_rate": 2e-06, + "loss": 0.2482, + "step": 3593 + }, + { + "epoch": 0.8337779839925763, + "grad_norm": 13.672341117810163, + "learning_rate": 2e-06, + "loss": 0.26, + "step": 3594 + }, + { + "epoch": 0.834009975640877, + "grad_norm": 13.89021157924789, + "learning_rate": 2e-06, + "loss": 0.3536, + "step": 3595 + }, + { + "epoch": 0.8342419672891775, + "grad_norm": 12.734349467657891, + "learning_rate": 2e-06, + "loss": 0.3681, + "step": 3596 + }, + { + "epoch": 0.8344739589374782, + "grad_norm": 18.167801879618423, + "learning_rate": 2e-06, + "loss": 0.2395, + "step": 3597 + }, + { + "epoch": 0.8347059505857789, + "grad_norm": 15.522534985390315, + "learning_rate": 2e-06, + "loss": 0.3314, + "step": 3598 + }, + { + "epoch": 0.8349379422340796, + "grad_norm": 14.490683555228062, + "learning_rate": 2e-06, + "loss": 0.2978, + "step": 3599 + }, + { + "epoch": 0.8351699338823803, + "grad_norm": 29.838603940900047, + "learning_rate": 2e-06, + "loss": 0.3134, + "step": 3600 + }, + { + "epoch": 0.8354019255306809, + "grad_norm": 13.116967226198804, + "learning_rate": 2e-06, + "loss": 0.2744, + "step": 3601 + }, + { + "epoch": 0.8356339171789816, + "grad_norm": 25.33370211615091, + "learning_rate": 2e-06, + "loss": 0.4356, + "step": 3602 + }, + { + "epoch": 0.8358659088272822, + "grad_norm": 16.302885312128048, + "learning_rate": 2e-06, + "loss": 0.3364, + "step": 3603 + }, + { + "epoch": 0.8360979004755829, + "grad_norm": 12.653664618715865, + "learning_rate": 2e-06, + "loss": 0.2252, + "step": 3604 + }, + { + "epoch": 0.8363298921238835, + "grad_norm": 11.957415500851111, + "learning_rate": 2e-06, + "loss": 0.2408, + "step": 3605 + }, + { + "epoch": 0.8365618837721842, + "grad_norm": 26.471764142537726, + "learning_rate": 2e-06, + "loss": 0.3768, + "step": 3606 + }, + { + "epoch": 0.8367938754204849, + "grad_norm": 17.280954916011552, + "learning_rate": 2e-06, + "loss": 0.3322, + "step": 3607 + }, + { + "epoch": 0.8370258670687856, + "grad_norm": 12.912623624553863, + "learning_rate": 2e-06, + "loss": 0.2971, + "step": 3608 + }, + { + "epoch": 0.8372578587170861, + "grad_norm": 33.409709385748855, + "learning_rate": 2e-06, + "loss": 0.3724, + "step": 3609 + }, + { + "epoch": 0.8374898503653868, + "grad_norm": 13.12016105283151, + "learning_rate": 2e-06, + "loss": 0.3083, + "step": 3610 + }, + { + "epoch": 0.8377218420136875, + "grad_norm": 21.16503577999807, + "learning_rate": 2e-06, + "loss": 0.4303, + "step": 3611 + }, + { + "epoch": 0.8379538336619882, + "grad_norm": 10.40878601387677, + "learning_rate": 2e-06, + "loss": 0.2138, + "step": 3612 + }, + { + "epoch": 0.8381858253102888, + "grad_norm": 19.2833065138601, + "learning_rate": 2e-06, + "loss": 0.269, + "step": 3613 + }, + { + "epoch": 0.8384178169585895, + "grad_norm": 17.850717804608752, + "learning_rate": 2e-06, + "loss": 0.3181, + "step": 3614 + }, + { + "epoch": 0.8386498086068902, + "grad_norm": 13.76177943878717, + "learning_rate": 2e-06, + "loss": 0.2724, + "step": 3615 + }, + { + "epoch": 0.8388818002551908, + "grad_norm": 13.036284766357339, + "learning_rate": 2e-06, + "loss": 0.2476, + "step": 3616 + }, + { + "epoch": 0.8391137919034914, + "grad_norm": 17.38148826319962, + "learning_rate": 2e-06, + "loss": 0.2757, + "step": 3617 + }, + { + "epoch": 0.8393457835517921, + "grad_norm": 10.903695405275533, + "learning_rate": 2e-06, + "loss": 0.2743, + "step": 3618 + }, + { + "epoch": 0.8395777752000928, + "grad_norm": 11.87283401552707, + "learning_rate": 2e-06, + "loss": 0.292, + "step": 3619 + }, + { + "epoch": 0.8398097668483935, + "grad_norm": 16.771758918256218, + "learning_rate": 2e-06, + "loss": 0.3851, + "step": 3620 + }, + { + "epoch": 0.8400417584966942, + "grad_norm": 13.719924046998493, + "learning_rate": 2e-06, + "loss": 0.2639, + "step": 3621 + }, + { + "epoch": 0.8402737501449948, + "grad_norm": 12.03787222608202, + "learning_rate": 2e-06, + "loss": 0.2875, + "step": 3622 + }, + { + "epoch": 0.8405057417932954, + "grad_norm": 12.36045663543364, + "learning_rate": 2e-06, + "loss": 0.199, + "step": 3623 + }, + { + "epoch": 0.8407377334415961, + "grad_norm": 11.3673385044902, + "learning_rate": 2e-06, + "loss": 0.2191, + "step": 3624 + }, + { + "epoch": 0.8409697250898968, + "grad_norm": 12.709416450778644, + "learning_rate": 2e-06, + "loss": 0.1901, + "step": 3625 + }, + { + "epoch": 0.8412017167381974, + "grad_norm": 9.564064452472941, + "learning_rate": 2e-06, + "loss": 0.2829, + "step": 3626 + }, + { + "epoch": 0.8414337083864981, + "grad_norm": 11.667053315828442, + "learning_rate": 2e-06, + "loss": 0.3535, + "step": 3627 + }, + { + "epoch": 0.8416657000347988, + "grad_norm": 18.12528031498164, + "learning_rate": 2e-06, + "loss": 0.2946, + "step": 3628 + }, + { + "epoch": 0.8418976916830994, + "grad_norm": 23.08275506339047, + "learning_rate": 2e-06, + "loss": 0.4012, + "step": 3629 + }, + { + "epoch": 0.8421296833314, + "grad_norm": 9.646394367795933, + "learning_rate": 2e-06, + "loss": 0.2934, + "step": 3630 + }, + { + "epoch": 0.8423616749797007, + "grad_norm": 20.793361796856317, + "learning_rate": 2e-06, + "loss": 0.3444, + "step": 3631 + }, + { + "epoch": 0.8425936666280014, + "grad_norm": 14.113988911611878, + "learning_rate": 2e-06, + "loss": 0.2877, + "step": 3632 + }, + { + "epoch": 0.8428256582763021, + "grad_norm": 8.786341654974642, + "learning_rate": 2e-06, + "loss": 0.2987, + "step": 3633 + }, + { + "epoch": 0.8430576499246027, + "grad_norm": 18.030693899144588, + "learning_rate": 2e-06, + "loss": 0.3672, + "step": 3634 + }, + { + "epoch": 0.8432896415729034, + "grad_norm": 13.25992214773615, + "learning_rate": 2e-06, + "loss": 0.3816, + "step": 3635 + }, + { + "epoch": 0.843521633221204, + "grad_norm": 12.693332638873756, + "learning_rate": 2e-06, + "loss": 0.2074, + "step": 3636 + }, + { + "epoch": 0.8437536248695047, + "grad_norm": 17.1773251818126, + "learning_rate": 2e-06, + "loss": 0.3135, + "step": 3637 + }, + { + "epoch": 0.8439856165178053, + "grad_norm": 16.218314383885915, + "learning_rate": 2e-06, + "loss": 0.288, + "step": 3638 + }, + { + "epoch": 0.844217608166106, + "grad_norm": 16.041706947386142, + "learning_rate": 2e-06, + "loss": 0.2734, + "step": 3639 + }, + { + "epoch": 0.8444495998144067, + "grad_norm": 9.535551188493375, + "learning_rate": 2e-06, + "loss": 0.248, + "step": 3640 + }, + { + "epoch": 0.8446815914627074, + "grad_norm": 18.716928330818007, + "learning_rate": 2e-06, + "loss": 0.3895, + "step": 3641 + }, + { + "epoch": 0.844913583111008, + "grad_norm": 11.504353512182753, + "learning_rate": 2e-06, + "loss": 0.2513, + "step": 3642 + }, + { + "epoch": 0.8451455747593086, + "grad_norm": 18.269519731683634, + "learning_rate": 2e-06, + "loss": 0.303, + "step": 3643 + }, + { + "epoch": 0.8453775664076093, + "grad_norm": 15.2935008617942, + "learning_rate": 2e-06, + "loss": 0.349, + "step": 3644 + }, + { + "epoch": 0.84560955805591, + "grad_norm": 11.958395285026274, + "learning_rate": 2e-06, + "loss": 0.2466, + "step": 3645 + }, + { + "epoch": 0.8458415497042107, + "grad_norm": 11.5462305491056, + "learning_rate": 2e-06, + "loss": 0.2421, + "step": 3646 + }, + { + "epoch": 0.8460735413525113, + "grad_norm": 9.021273192696786, + "learning_rate": 2e-06, + "loss": 0.2478, + "step": 3647 + }, + { + "epoch": 0.846305533000812, + "grad_norm": 18.029921292402488, + "learning_rate": 2e-06, + "loss": 0.3454, + "step": 3648 + }, + { + "epoch": 0.8465375246491126, + "grad_norm": 9.117078760672142, + "learning_rate": 2e-06, + "loss": 0.256, + "step": 3649 + }, + { + "epoch": 0.8467695162974133, + "grad_norm": 8.027104481169884, + "learning_rate": 2e-06, + "loss": 0.2244, + "step": 3650 + }, + { + "epoch": 0.8470015079457139, + "grad_norm": 18.36707474383036, + "learning_rate": 2e-06, + "loss": 0.3189, + "step": 3651 + }, + { + "epoch": 0.8472334995940146, + "grad_norm": 11.373520346001401, + "learning_rate": 2e-06, + "loss": 0.2491, + "step": 3652 + }, + { + "epoch": 0.8474654912423153, + "grad_norm": 9.023667037281498, + "learning_rate": 2e-06, + "loss": 0.236, + "step": 3653 + }, + { + "epoch": 0.847697482890616, + "grad_norm": 12.47557691705421, + "learning_rate": 2e-06, + "loss": 0.3199, + "step": 3654 + }, + { + "epoch": 0.8479294745389166, + "grad_norm": 11.849709139019977, + "learning_rate": 2e-06, + "loss": 0.312, + "step": 3655 + }, + { + "epoch": 0.8481614661872172, + "grad_norm": 16.08275240994812, + "learning_rate": 2e-06, + "loss": 0.2989, + "step": 3656 + }, + { + "epoch": 0.8483934578355179, + "grad_norm": 15.974269786013867, + "learning_rate": 2e-06, + "loss": 0.3295, + "step": 3657 + }, + { + "epoch": 0.8486254494838186, + "grad_norm": 12.325930745153997, + "learning_rate": 2e-06, + "loss": 0.2483, + "step": 3658 + }, + { + "epoch": 0.8488574411321193, + "grad_norm": 15.839887071663428, + "learning_rate": 2e-06, + "loss": 0.31, + "step": 3659 + }, + { + "epoch": 0.8490894327804199, + "grad_norm": 15.718046583011358, + "learning_rate": 2e-06, + "loss": 0.3434, + "step": 3660 + }, + { + "epoch": 0.8493214244287206, + "grad_norm": 21.360025144175662, + "learning_rate": 2e-06, + "loss": 0.3237, + "step": 3661 + }, + { + "epoch": 0.8495534160770212, + "grad_norm": 21.407456834013093, + "learning_rate": 2e-06, + "loss": 0.3511, + "step": 3662 + }, + { + "epoch": 0.8497854077253219, + "grad_norm": 17.516957487751252, + "learning_rate": 2e-06, + "loss": 0.2002, + "step": 3663 + }, + { + "epoch": 0.8500173993736225, + "grad_norm": 10.00063509585583, + "learning_rate": 2e-06, + "loss": 0.3101, + "step": 3664 + }, + { + "epoch": 0.8502493910219232, + "grad_norm": 16.61146615224927, + "learning_rate": 2e-06, + "loss": 0.2991, + "step": 3665 + }, + { + "epoch": 0.8504813826702239, + "grad_norm": 14.226566986205597, + "learning_rate": 2e-06, + "loss": 0.217, + "step": 3666 + }, + { + "epoch": 0.8507133743185246, + "grad_norm": 13.941877038153388, + "learning_rate": 2e-06, + "loss": 0.4054, + "step": 3667 + }, + { + "epoch": 0.8509453659668252, + "grad_norm": 20.923056394448633, + "learning_rate": 2e-06, + "loss": 0.2737, + "step": 3668 + }, + { + "epoch": 0.8511773576151258, + "grad_norm": 11.327595266683401, + "learning_rate": 2e-06, + "loss": 0.2025, + "step": 3669 + }, + { + "epoch": 0.8514093492634265, + "grad_norm": 13.082863132805368, + "learning_rate": 2e-06, + "loss": 0.2532, + "step": 3670 + }, + { + "epoch": 0.8516413409117272, + "grad_norm": 11.390712446630442, + "learning_rate": 2e-06, + "loss": 0.254, + "step": 3671 + }, + { + "epoch": 0.8518733325600278, + "grad_norm": 21.27549243664026, + "learning_rate": 2e-06, + "loss": 0.3924, + "step": 3672 + }, + { + "epoch": 0.8521053242083285, + "grad_norm": 21.3979323864136, + "learning_rate": 2e-06, + "loss": 0.2268, + "step": 3673 + }, + { + "epoch": 0.8523373158566292, + "grad_norm": 8.612003270380857, + "learning_rate": 2e-06, + "loss": 0.1924, + "step": 3674 + }, + { + "epoch": 0.8525693075049299, + "grad_norm": 19.820763966955873, + "learning_rate": 2e-06, + "loss": 0.4264, + "step": 3675 + }, + { + "epoch": 0.8528012991532304, + "grad_norm": 20.045730155740298, + "learning_rate": 2e-06, + "loss": 0.3346, + "step": 3676 + }, + { + "epoch": 0.8530332908015311, + "grad_norm": 13.650897189920228, + "learning_rate": 2e-06, + "loss": 0.3416, + "step": 3677 + }, + { + "epoch": 0.8532652824498318, + "grad_norm": 10.14256766403005, + "learning_rate": 2e-06, + "loss": 0.2986, + "step": 3678 + }, + { + "epoch": 0.8534972740981325, + "grad_norm": 10.037657539343313, + "learning_rate": 2e-06, + "loss": 0.2429, + "step": 3679 + }, + { + "epoch": 0.8537292657464332, + "grad_norm": 12.66315273931505, + "learning_rate": 2e-06, + "loss": 0.2382, + "step": 3680 + }, + { + "epoch": 0.8539612573947338, + "grad_norm": 9.312231073579792, + "learning_rate": 2e-06, + "loss": 0.2034, + "step": 3681 + }, + { + "epoch": 0.8541932490430344, + "grad_norm": 8.761478461055901, + "learning_rate": 2e-06, + "loss": 0.216, + "step": 3682 + }, + { + "epoch": 0.8544252406913351, + "grad_norm": 13.743741778322043, + "learning_rate": 2e-06, + "loss": 0.2603, + "step": 3683 + }, + { + "epoch": 0.8546572323396358, + "grad_norm": 8.56315291962295, + "learning_rate": 2e-06, + "loss": 0.1987, + "step": 3684 + }, + { + "epoch": 0.8548892239879364, + "grad_norm": 9.797116980529367, + "learning_rate": 2e-06, + "loss": 0.3647, + "step": 3685 + }, + { + "epoch": 0.8551212156362371, + "grad_norm": 13.622610342375989, + "learning_rate": 2e-06, + "loss": 0.2278, + "step": 3686 + }, + { + "epoch": 0.8553532072845378, + "grad_norm": 9.724405842358887, + "learning_rate": 2e-06, + "loss": 0.2339, + "step": 3687 + }, + { + "epoch": 0.8555851989328385, + "grad_norm": 9.427475360484458, + "learning_rate": 2e-06, + "loss": 0.1833, + "step": 3688 + }, + { + "epoch": 0.855817190581139, + "grad_norm": 23.056746456756965, + "learning_rate": 2e-06, + "loss": 0.3746, + "step": 3689 + }, + { + "epoch": 0.8560491822294397, + "grad_norm": 8.420394491891004, + "learning_rate": 2e-06, + "loss": 0.2763, + "step": 3690 + }, + { + "epoch": 0.8562811738777404, + "grad_norm": 17.380813276857875, + "learning_rate": 2e-06, + "loss": 0.2846, + "step": 3691 + }, + { + "epoch": 0.8565131655260411, + "grad_norm": 13.749892547660071, + "learning_rate": 2e-06, + "loss": 0.2235, + "step": 3692 + }, + { + "epoch": 0.8567451571743417, + "grad_norm": 13.50948624508527, + "learning_rate": 2e-06, + "loss": 0.3027, + "step": 3693 + }, + { + "epoch": 0.8569771488226424, + "grad_norm": 16.78402828019823, + "learning_rate": 2e-06, + "loss": 0.361, + "step": 3694 + }, + { + "epoch": 0.8572091404709431, + "grad_norm": 10.7775988342208, + "learning_rate": 2e-06, + "loss": 0.2344, + "step": 3695 + }, + { + "epoch": 0.8574411321192437, + "grad_norm": 17.91160614351676, + "learning_rate": 2e-06, + "loss": 0.4287, + "step": 3696 + }, + { + "epoch": 0.8576731237675443, + "grad_norm": 10.208952391466227, + "learning_rate": 2e-06, + "loss": 0.1963, + "step": 3697 + }, + { + "epoch": 0.857905115415845, + "grad_norm": 12.492443606261487, + "learning_rate": 2e-06, + "loss": 0.1823, + "step": 3698 + }, + { + "epoch": 0.8581371070641457, + "grad_norm": 7.964969511028492, + "learning_rate": 2e-06, + "loss": 0.1687, + "step": 3699 + }, + { + "epoch": 0.8583690987124464, + "grad_norm": 7.738033256851267, + "learning_rate": 2e-06, + "loss": 0.1877, + "step": 3700 + }, + { + "epoch": 0.858601090360747, + "grad_norm": 13.255320400154975, + "learning_rate": 2e-06, + "loss": 0.3013, + "step": 3701 + }, + { + "epoch": 0.8588330820090476, + "grad_norm": 12.671403201642306, + "learning_rate": 2e-06, + "loss": 0.2109, + "step": 3702 + }, + { + "epoch": 0.8590650736573483, + "grad_norm": 11.814382103135062, + "learning_rate": 2e-06, + "loss": 0.1372, + "step": 3703 + }, + { + "epoch": 0.859297065305649, + "grad_norm": 13.982761604963033, + "learning_rate": 2e-06, + "loss": 0.32, + "step": 3704 + }, + { + "epoch": 0.8595290569539497, + "grad_norm": 14.133755063373435, + "learning_rate": 2e-06, + "loss": 0.2347, + "step": 3705 + }, + { + "epoch": 0.8597610486022503, + "grad_norm": 8.462385181045686, + "learning_rate": 2e-06, + "loss": 0.2493, + "step": 3706 + }, + { + "epoch": 0.859993040250551, + "grad_norm": 14.085965020035298, + "learning_rate": 2e-06, + "loss": 0.3511, + "step": 3707 + }, + { + "epoch": 0.8602250318988517, + "grad_norm": 58.65048537341041, + "learning_rate": 2e-06, + "loss": 0.3468, + "step": 3708 + }, + { + "epoch": 0.8604570235471523, + "grad_norm": 13.616616688476421, + "learning_rate": 2e-06, + "loss": 0.2393, + "step": 3709 + }, + { + "epoch": 0.8606890151954529, + "grad_norm": 12.441324389887878, + "learning_rate": 2e-06, + "loss": 0.2537, + "step": 3710 + }, + { + "epoch": 0.8609210068437536, + "grad_norm": 14.305380497619064, + "learning_rate": 2e-06, + "loss": 0.2861, + "step": 3711 + }, + { + "epoch": 0.8611529984920543, + "grad_norm": 16.613861586038418, + "learning_rate": 2e-06, + "loss": 0.2142, + "step": 3712 + }, + { + "epoch": 0.861384990140355, + "grad_norm": 10.199542141395073, + "learning_rate": 2e-06, + "loss": 0.31, + "step": 3713 + }, + { + "epoch": 0.8616169817886556, + "grad_norm": 12.55464480155109, + "learning_rate": 2e-06, + "loss": 0.1862, + "step": 3714 + }, + { + "epoch": 0.8618489734369563, + "grad_norm": 16.10525020587473, + "learning_rate": 2e-06, + "loss": 0.3392, + "step": 3715 + }, + { + "epoch": 0.8620809650852569, + "grad_norm": 13.438956564440527, + "learning_rate": 2e-06, + "loss": 0.2419, + "step": 3716 + }, + { + "epoch": 0.8623129567335576, + "grad_norm": 9.808457425512904, + "learning_rate": 2e-06, + "loss": 0.2457, + "step": 3717 + }, + { + "epoch": 0.8625449483818582, + "grad_norm": 14.98954005495591, + "learning_rate": 2e-06, + "loss": 0.2695, + "step": 3718 + }, + { + "epoch": 0.8627769400301589, + "grad_norm": 14.064247216948907, + "learning_rate": 2e-06, + "loss": 0.2397, + "step": 3719 + }, + { + "epoch": 0.8630089316784596, + "grad_norm": 31.850410637664915, + "learning_rate": 2e-06, + "loss": 0.5098, + "step": 3720 + }, + { + "epoch": 0.8632409233267603, + "grad_norm": 27.13004682039094, + "learning_rate": 2e-06, + "loss": 0.2238, + "step": 3721 + }, + { + "epoch": 0.8634729149750608, + "grad_norm": 12.81414870338563, + "learning_rate": 2e-06, + "loss": 0.28, + "step": 3722 + }, + { + "epoch": 0.8637049066233615, + "grad_norm": 14.876898239075308, + "learning_rate": 2e-06, + "loss": 0.3276, + "step": 3723 + }, + { + "epoch": 0.8639368982716622, + "grad_norm": 13.039596946514822, + "learning_rate": 2e-06, + "loss": 0.3539, + "step": 3724 + }, + { + "epoch": 0.8641688899199629, + "grad_norm": 11.520663900550497, + "learning_rate": 2e-06, + "loss": 0.2403, + "step": 3725 + }, + { + "epoch": 0.8644008815682636, + "grad_norm": 20.113972396436466, + "learning_rate": 2e-06, + "loss": 0.2682, + "step": 3726 + }, + { + "epoch": 0.8646328732165642, + "grad_norm": 9.768363407171256, + "learning_rate": 2e-06, + "loss": 0.3367, + "step": 3727 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 9.754462011322826, + "learning_rate": 2e-06, + "loss": 0.1976, + "step": 3728 + }, + { + "epoch": 0.8650968565131655, + "grad_norm": 15.897468817722482, + "learning_rate": 2e-06, + "loss": 0.3516, + "step": 3729 + }, + { + "epoch": 0.8653288481614662, + "grad_norm": 12.539672362894047, + "learning_rate": 2e-06, + "loss": 0.2137, + "step": 3730 + }, + { + "epoch": 0.8655608398097668, + "grad_norm": 22.675675973677986, + "learning_rate": 2e-06, + "loss": 0.3773, + "step": 3731 + }, + { + "epoch": 0.8657928314580675, + "grad_norm": 21.084484439088442, + "learning_rate": 2e-06, + "loss": 0.2725, + "step": 3732 + }, + { + "epoch": 0.8660248231063682, + "grad_norm": 12.945612595748198, + "learning_rate": 2e-06, + "loss": 0.2171, + "step": 3733 + }, + { + "epoch": 0.8662568147546689, + "grad_norm": 10.168197656317433, + "learning_rate": 2e-06, + "loss": 0.2676, + "step": 3734 + }, + { + "epoch": 0.8664888064029695, + "grad_norm": 12.095214911594292, + "learning_rate": 2e-06, + "loss": 0.2778, + "step": 3735 + }, + { + "epoch": 0.8667207980512701, + "grad_norm": 17.38157046620352, + "learning_rate": 2e-06, + "loss": 0.3717, + "step": 3736 + }, + { + "epoch": 0.8669527896995708, + "grad_norm": 21.62919673063253, + "learning_rate": 2e-06, + "loss": 0.2156, + "step": 3737 + }, + { + "epoch": 0.8671847813478715, + "grad_norm": 11.91250814736637, + "learning_rate": 2e-06, + "loss": 0.2699, + "step": 3738 + }, + { + "epoch": 0.8674167729961721, + "grad_norm": 14.675029900401139, + "learning_rate": 2e-06, + "loss": 0.1935, + "step": 3739 + }, + { + "epoch": 0.8676487646444728, + "grad_norm": 15.028454890183673, + "learning_rate": 2e-06, + "loss": 0.2753, + "step": 3740 + }, + { + "epoch": 0.8678807562927735, + "grad_norm": 12.685941107430299, + "learning_rate": 2e-06, + "loss": 0.3368, + "step": 3741 + }, + { + "epoch": 0.8681127479410741, + "grad_norm": 14.086959911298255, + "learning_rate": 2e-06, + "loss": 0.3123, + "step": 3742 + }, + { + "epoch": 0.8683447395893747, + "grad_norm": 29.59069253850969, + "learning_rate": 2e-06, + "loss": 0.3462, + "step": 3743 + }, + { + "epoch": 0.8685767312376754, + "grad_norm": 15.70500076917416, + "learning_rate": 2e-06, + "loss": 0.2672, + "step": 3744 + }, + { + "epoch": 0.8688087228859761, + "grad_norm": 10.885835318229043, + "learning_rate": 2e-06, + "loss": 0.2552, + "step": 3745 + }, + { + "epoch": 0.8690407145342768, + "grad_norm": 36.85881948138301, + "learning_rate": 2e-06, + "loss": 0.3901, + "step": 3746 + }, + { + "epoch": 0.8692727061825775, + "grad_norm": 16.53745340862496, + "learning_rate": 2e-06, + "loss": 0.2888, + "step": 3747 + }, + { + "epoch": 0.8695046978308781, + "grad_norm": 16.261524994880595, + "learning_rate": 2e-06, + "loss": 0.4434, + "step": 3748 + }, + { + "epoch": 0.8697366894791787, + "grad_norm": 15.966881686175501, + "learning_rate": 2e-06, + "loss": 0.2976, + "step": 3749 + }, + { + "epoch": 0.8699686811274794, + "grad_norm": 24.370815496080688, + "learning_rate": 2e-06, + "loss": 0.3686, + "step": 3750 + }, + { + "epoch": 0.8702006727757801, + "grad_norm": 14.66934237706017, + "learning_rate": 2e-06, + "loss": 0.2677, + "step": 3751 + }, + { + "epoch": 0.8704326644240807, + "grad_norm": 13.86784165636144, + "learning_rate": 2e-06, + "loss": 0.2662, + "step": 3752 + }, + { + "epoch": 0.8706646560723814, + "grad_norm": 22.526612910044197, + "learning_rate": 2e-06, + "loss": 0.3681, + "step": 3753 + }, + { + "epoch": 0.8708966477206821, + "grad_norm": 10.304405082274162, + "learning_rate": 2e-06, + "loss": 0.3004, + "step": 3754 + }, + { + "epoch": 0.8711286393689828, + "grad_norm": 19.197447189792538, + "learning_rate": 2e-06, + "loss": 0.3221, + "step": 3755 + }, + { + "epoch": 0.8713606310172833, + "grad_norm": 12.457718761366353, + "learning_rate": 2e-06, + "loss": 0.2205, + "step": 3756 + }, + { + "epoch": 0.871592622665584, + "grad_norm": 12.306067228478893, + "learning_rate": 2e-06, + "loss": 0.3971, + "step": 3757 + }, + { + "epoch": 0.8718246143138847, + "grad_norm": 18.56590997063515, + "learning_rate": 2e-06, + "loss": 0.2905, + "step": 3758 + }, + { + "epoch": 0.8720566059621854, + "grad_norm": 18.02352978933951, + "learning_rate": 2e-06, + "loss": 0.3457, + "step": 3759 + }, + { + "epoch": 0.872288597610486, + "grad_norm": 14.420142710853485, + "learning_rate": 2e-06, + "loss": 0.2826, + "step": 3760 + }, + { + "epoch": 0.8725205892587867, + "grad_norm": 6.153364462060262, + "learning_rate": 2e-06, + "loss": 0.2808, + "step": 3761 + }, + { + "epoch": 0.8727525809070873, + "grad_norm": 7.722426387420128, + "learning_rate": 2e-06, + "loss": 0.2435, + "step": 3762 + }, + { + "epoch": 0.872984572555388, + "grad_norm": 12.084046865494983, + "learning_rate": 2e-06, + "loss": 0.297, + "step": 3763 + }, + { + "epoch": 0.8732165642036886, + "grad_norm": 14.772204290356598, + "learning_rate": 2e-06, + "loss": 0.2052, + "step": 3764 + }, + { + "epoch": 0.8734485558519893, + "grad_norm": 18.298133333640326, + "learning_rate": 2e-06, + "loss": 0.3385, + "step": 3765 + }, + { + "epoch": 0.87368054750029, + "grad_norm": 8.033540187695454, + "learning_rate": 2e-06, + "loss": 0.2035, + "step": 3766 + }, + { + "epoch": 0.8739125391485907, + "grad_norm": 14.501022464289226, + "learning_rate": 2e-06, + "loss": 0.3331, + "step": 3767 + }, + { + "epoch": 0.8741445307968914, + "grad_norm": 8.580235421829364, + "learning_rate": 2e-06, + "loss": 0.2644, + "step": 3768 + }, + { + "epoch": 0.8743765224451919, + "grad_norm": 8.855536509852117, + "learning_rate": 2e-06, + "loss": 0.2153, + "step": 3769 + }, + { + "epoch": 0.8746085140934926, + "grad_norm": 11.36016770749889, + "learning_rate": 2e-06, + "loss": 0.2991, + "step": 3770 + }, + { + "epoch": 0.8748405057417933, + "grad_norm": 13.635951353515276, + "learning_rate": 2e-06, + "loss": 0.3342, + "step": 3771 + }, + { + "epoch": 0.875072497390094, + "grad_norm": 7.944159507447317, + "learning_rate": 2e-06, + "loss": 0.253, + "step": 3772 + }, + { + "epoch": 0.8753044890383946, + "grad_norm": 11.172510325456205, + "learning_rate": 2e-06, + "loss": 0.2876, + "step": 3773 + }, + { + "epoch": 0.8755364806866953, + "grad_norm": 18.73017899852579, + "learning_rate": 2e-06, + "loss": 0.3334, + "step": 3774 + }, + { + "epoch": 0.875768472334996, + "grad_norm": 10.47454924116766, + "learning_rate": 2e-06, + "loss": 0.2581, + "step": 3775 + }, + { + "epoch": 0.8760004639832966, + "grad_norm": 7.591698553448666, + "learning_rate": 2e-06, + "loss": 0.2232, + "step": 3776 + }, + { + "epoch": 0.8762324556315972, + "grad_norm": 16.266482505616455, + "learning_rate": 2e-06, + "loss": 0.3319, + "step": 3777 + }, + { + "epoch": 0.8764644472798979, + "grad_norm": 7.2554270371657354, + "learning_rate": 2e-06, + "loss": 0.1495, + "step": 3778 + }, + { + "epoch": 0.8766964389281986, + "grad_norm": 19.825695220731603, + "learning_rate": 2e-06, + "loss": 0.3339, + "step": 3779 + }, + { + "epoch": 0.8769284305764993, + "grad_norm": 13.50315788016503, + "learning_rate": 2e-06, + "loss": 0.2265, + "step": 3780 + }, + { + "epoch": 0.8771604222248, + "grad_norm": 9.889392871091157, + "learning_rate": 2e-06, + "loss": 0.3026, + "step": 3781 + }, + { + "epoch": 0.8773924138731005, + "grad_norm": 15.48773175623725, + "learning_rate": 2e-06, + "loss": 0.2651, + "step": 3782 + }, + { + "epoch": 0.8776244055214012, + "grad_norm": 14.822191847325213, + "learning_rate": 2e-06, + "loss": 0.2641, + "step": 3783 + }, + { + "epoch": 0.8778563971697019, + "grad_norm": 13.547907543836468, + "learning_rate": 2e-06, + "loss": 0.3234, + "step": 3784 + }, + { + "epoch": 0.8780883888180026, + "grad_norm": 14.875134126943225, + "learning_rate": 2e-06, + "loss": 0.2694, + "step": 3785 + }, + { + "epoch": 0.8783203804663032, + "grad_norm": 20.39451897252101, + "learning_rate": 2e-06, + "loss": 0.2972, + "step": 3786 + }, + { + "epoch": 0.8785523721146039, + "grad_norm": 17.90206206999193, + "learning_rate": 2e-06, + "loss": 0.3046, + "step": 3787 + }, + { + "epoch": 0.8787843637629046, + "grad_norm": 12.566470812398759, + "learning_rate": 2e-06, + "loss": 0.2508, + "step": 3788 + }, + { + "epoch": 0.8790163554112052, + "grad_norm": 8.893414440354205, + "learning_rate": 2e-06, + "loss": 0.294, + "step": 3789 + }, + { + "epoch": 0.8792483470595058, + "grad_norm": 19.14245934946466, + "learning_rate": 2e-06, + "loss": 0.3494, + "step": 3790 + }, + { + "epoch": 0.8794803387078065, + "grad_norm": 15.10092811690474, + "learning_rate": 2e-06, + "loss": 0.3559, + "step": 3791 + }, + { + "epoch": 0.8797123303561072, + "grad_norm": 10.508681437132152, + "learning_rate": 2e-06, + "loss": 0.2268, + "step": 3792 + }, + { + "epoch": 0.8799443220044079, + "grad_norm": 21.07249450103903, + "learning_rate": 2e-06, + "loss": 0.245, + "step": 3793 + }, + { + "epoch": 0.8801763136527085, + "grad_norm": 17.65752075276022, + "learning_rate": 2e-06, + "loss": 0.269, + "step": 3794 + }, + { + "epoch": 0.8804083053010091, + "grad_norm": 17.466559161419354, + "learning_rate": 2e-06, + "loss": 0.2853, + "step": 3795 + }, + { + "epoch": 0.8806402969493098, + "grad_norm": 17.14933191795175, + "learning_rate": 2e-06, + "loss": 0.2767, + "step": 3796 + }, + { + "epoch": 0.8808722885976105, + "grad_norm": 14.78273941840289, + "learning_rate": 2e-06, + "loss": 0.3052, + "step": 3797 + }, + { + "epoch": 0.8811042802459111, + "grad_norm": 12.53725963656686, + "learning_rate": 2e-06, + "loss": 0.2303, + "step": 3798 + }, + { + "epoch": 0.8813362718942118, + "grad_norm": 11.924772982478647, + "learning_rate": 2e-06, + "loss": 0.3213, + "step": 3799 + }, + { + "epoch": 0.8815682635425125, + "grad_norm": 11.76335039639468, + "learning_rate": 2e-06, + "loss": 0.2632, + "step": 3800 + }, + { + "epoch": 0.8818002551908132, + "grad_norm": 19.85232422623424, + "learning_rate": 2e-06, + "loss": 0.3021, + "step": 3801 + }, + { + "epoch": 0.8820322468391137, + "grad_norm": 23.65456243506742, + "learning_rate": 2e-06, + "loss": 0.3246, + "step": 3802 + }, + { + "epoch": 0.8822642384874144, + "grad_norm": 5.716607208165604, + "learning_rate": 2e-06, + "loss": 0.2176, + "step": 3803 + }, + { + "epoch": 0.8824962301357151, + "grad_norm": 6.022036608263883, + "learning_rate": 2e-06, + "loss": 0.1139, + "step": 3804 + }, + { + "epoch": 0.8827282217840158, + "grad_norm": 16.350486842879306, + "learning_rate": 2e-06, + "loss": 0.3059, + "step": 3805 + }, + { + "epoch": 0.8829602134323165, + "grad_norm": 15.069408373253298, + "learning_rate": 2e-06, + "loss": 0.2275, + "step": 3806 + }, + { + "epoch": 0.8831922050806171, + "grad_norm": 10.384160567512737, + "learning_rate": 2e-06, + "loss": 0.2738, + "step": 3807 + }, + { + "epoch": 0.8834241967289178, + "grad_norm": 16.660880271426983, + "learning_rate": 2e-06, + "loss": 0.2554, + "step": 3808 + }, + { + "epoch": 0.8836561883772184, + "grad_norm": 18.374192009215328, + "learning_rate": 2e-06, + "loss": 0.3591, + "step": 3809 + }, + { + "epoch": 0.883888180025519, + "grad_norm": 12.872310982621023, + "learning_rate": 2e-06, + "loss": 0.2772, + "step": 3810 + }, + { + "epoch": 0.8841201716738197, + "grad_norm": 16.019644899210274, + "learning_rate": 2e-06, + "loss": 0.5156, + "step": 3811 + }, + { + "epoch": 0.8843521633221204, + "grad_norm": 12.442141234475963, + "learning_rate": 2e-06, + "loss": 0.2239, + "step": 3812 + }, + { + "epoch": 0.8845841549704211, + "grad_norm": 22.525745572070075, + "learning_rate": 2e-06, + "loss": 0.3765, + "step": 3813 + }, + { + "epoch": 0.8848161466187218, + "grad_norm": 16.304838685551868, + "learning_rate": 2e-06, + "loss": 0.4056, + "step": 3814 + }, + { + "epoch": 0.8850481382670223, + "grad_norm": 7.3709776680881705, + "learning_rate": 2e-06, + "loss": 0.1974, + "step": 3815 + }, + { + "epoch": 0.885280129915323, + "grad_norm": 13.210776449554263, + "learning_rate": 2e-06, + "loss": 0.225, + "step": 3816 + }, + { + "epoch": 0.8855121215636237, + "grad_norm": 16.99200282899661, + "learning_rate": 2e-06, + "loss": 0.2187, + "step": 3817 + }, + { + "epoch": 0.8857441132119244, + "grad_norm": 14.484274148768987, + "learning_rate": 2e-06, + "loss": 0.2841, + "step": 3818 + }, + { + "epoch": 0.885976104860225, + "grad_norm": 13.690872050877267, + "learning_rate": 2e-06, + "loss": 0.3322, + "step": 3819 + }, + { + "epoch": 0.8862080965085257, + "grad_norm": 15.21118667258607, + "learning_rate": 2e-06, + "loss": 0.3085, + "step": 3820 + }, + { + "epoch": 0.8864400881568264, + "grad_norm": 11.234375561403914, + "learning_rate": 2e-06, + "loss": 0.221, + "step": 3821 + }, + { + "epoch": 0.886672079805127, + "grad_norm": 11.309069358024804, + "learning_rate": 2e-06, + "loss": 0.2239, + "step": 3822 + }, + { + "epoch": 0.8869040714534276, + "grad_norm": 9.480559997389527, + "learning_rate": 2e-06, + "loss": 0.4035, + "step": 3823 + }, + { + "epoch": 0.8871360631017283, + "grad_norm": 12.818376026225353, + "learning_rate": 2e-06, + "loss": 0.2866, + "step": 3824 + }, + { + "epoch": 0.887368054750029, + "grad_norm": 13.600698803419629, + "learning_rate": 2e-06, + "loss": 0.2284, + "step": 3825 + }, + { + "epoch": 0.8876000463983297, + "grad_norm": 12.323299185200682, + "learning_rate": 2e-06, + "loss": 0.2643, + "step": 3826 + }, + { + "epoch": 0.8878320380466304, + "grad_norm": 16.262470843816768, + "learning_rate": 2e-06, + "loss": 0.3144, + "step": 3827 + }, + { + "epoch": 0.888064029694931, + "grad_norm": 15.8268984032703, + "learning_rate": 2e-06, + "loss": 0.2297, + "step": 3828 + }, + { + "epoch": 0.8882960213432316, + "grad_norm": 15.451969149107263, + "learning_rate": 2e-06, + "loss": 0.1857, + "step": 3829 + }, + { + "epoch": 0.8885280129915323, + "grad_norm": 13.444047438579203, + "learning_rate": 2e-06, + "loss": 0.2971, + "step": 3830 + }, + { + "epoch": 0.888760004639833, + "grad_norm": 11.468701419479576, + "learning_rate": 2e-06, + "loss": 0.2492, + "step": 3831 + }, + { + "epoch": 0.8889919962881336, + "grad_norm": 21.788790413025257, + "learning_rate": 2e-06, + "loss": 0.2497, + "step": 3832 + }, + { + "epoch": 0.8892239879364343, + "grad_norm": 8.534053601552444, + "learning_rate": 2e-06, + "loss": 0.2748, + "step": 3833 + }, + { + "epoch": 0.889455979584735, + "grad_norm": 17.14812910368447, + "learning_rate": 2e-06, + "loss": 0.3216, + "step": 3834 + }, + { + "epoch": 0.8896879712330356, + "grad_norm": 10.518702926059186, + "learning_rate": 2e-06, + "loss": 0.1831, + "step": 3835 + }, + { + "epoch": 0.8899199628813362, + "grad_norm": 15.381986422849188, + "learning_rate": 2e-06, + "loss": 0.2893, + "step": 3836 + }, + { + "epoch": 0.8901519545296369, + "grad_norm": 21.466599789047734, + "learning_rate": 2e-06, + "loss": 0.4283, + "step": 3837 + }, + { + "epoch": 0.8903839461779376, + "grad_norm": 18.883265852737733, + "learning_rate": 2e-06, + "loss": 0.3159, + "step": 3838 + }, + { + "epoch": 0.8906159378262383, + "grad_norm": 9.604683531122912, + "learning_rate": 2e-06, + "loss": 0.293, + "step": 3839 + }, + { + "epoch": 0.890847929474539, + "grad_norm": 10.466214318640201, + "learning_rate": 2e-06, + "loss": 0.2093, + "step": 3840 + }, + { + "epoch": 0.8910799211228396, + "grad_norm": 30.045014770740092, + "learning_rate": 2e-06, + "loss": 0.3721, + "step": 3841 + }, + { + "epoch": 0.8913119127711402, + "grad_norm": 17.520621297975964, + "learning_rate": 2e-06, + "loss": 0.3136, + "step": 3842 + }, + { + "epoch": 0.8915439044194409, + "grad_norm": 10.430427280788878, + "learning_rate": 2e-06, + "loss": 0.3048, + "step": 3843 + }, + { + "epoch": 0.8917758960677415, + "grad_norm": 16.52127291757037, + "learning_rate": 2e-06, + "loss": 0.2781, + "step": 3844 + }, + { + "epoch": 0.8920078877160422, + "grad_norm": 14.03793238580409, + "learning_rate": 2e-06, + "loss": 0.3499, + "step": 3845 + }, + { + "epoch": 0.8922398793643429, + "grad_norm": 13.465497188671403, + "learning_rate": 2e-06, + "loss": 0.2369, + "step": 3846 + }, + { + "epoch": 0.8924718710126436, + "grad_norm": 16.203513528553657, + "learning_rate": 2e-06, + "loss": 0.3037, + "step": 3847 + }, + { + "epoch": 0.8927038626609443, + "grad_norm": 16.91252227614217, + "learning_rate": 2e-06, + "loss": 0.3807, + "step": 3848 + }, + { + "epoch": 0.8929358543092448, + "grad_norm": 15.13921064541899, + "learning_rate": 2e-06, + "loss": 0.2367, + "step": 3849 + }, + { + "epoch": 0.8931678459575455, + "grad_norm": 17.470809701462834, + "learning_rate": 2e-06, + "loss": 0.4561, + "step": 3850 + }, + { + "epoch": 0.8933998376058462, + "grad_norm": 17.55558022042597, + "learning_rate": 2e-06, + "loss": 0.2008, + "step": 3851 + }, + { + "epoch": 0.8936318292541469, + "grad_norm": 15.181061039277026, + "learning_rate": 2e-06, + "loss": 0.312, + "step": 3852 + }, + { + "epoch": 0.8938638209024475, + "grad_norm": 16.306178080300985, + "learning_rate": 2e-06, + "loss": 0.3254, + "step": 3853 + }, + { + "epoch": 0.8940958125507482, + "grad_norm": 13.315390126795844, + "learning_rate": 2e-06, + "loss": 0.1952, + "step": 3854 + }, + { + "epoch": 0.8943278041990488, + "grad_norm": 14.600753372967226, + "learning_rate": 2e-06, + "loss": 0.2792, + "step": 3855 + }, + { + "epoch": 0.8945597958473495, + "grad_norm": 25.411581518678336, + "learning_rate": 2e-06, + "loss": 0.3914, + "step": 3856 + }, + { + "epoch": 0.8947917874956501, + "grad_norm": 14.153992756140097, + "learning_rate": 2e-06, + "loss": 0.3199, + "step": 3857 + }, + { + "epoch": 0.8950237791439508, + "grad_norm": 30.60125428323226, + "learning_rate": 2e-06, + "loss": 0.3962, + "step": 3858 + }, + { + "epoch": 0.8952557707922515, + "grad_norm": 8.730276553996347, + "learning_rate": 2e-06, + "loss": 0.2931, + "step": 3859 + }, + { + "epoch": 0.8954877624405522, + "grad_norm": 23.977248564892673, + "learning_rate": 2e-06, + "loss": 0.3746, + "step": 3860 + }, + { + "epoch": 0.8957197540888528, + "grad_norm": 12.272894635641682, + "learning_rate": 2e-06, + "loss": 0.1781, + "step": 3861 + }, + { + "epoch": 0.8959517457371534, + "grad_norm": 10.266127690588375, + "learning_rate": 2e-06, + "loss": 0.198, + "step": 3862 + }, + { + "epoch": 0.8961837373854541, + "grad_norm": 28.403463540976567, + "learning_rate": 2e-06, + "loss": 0.3536, + "step": 3863 + }, + { + "epoch": 0.8964157290337548, + "grad_norm": 11.053863738445942, + "learning_rate": 2e-06, + "loss": 0.2466, + "step": 3864 + }, + { + "epoch": 0.8966477206820554, + "grad_norm": 9.177659242675198, + "learning_rate": 2e-06, + "loss": 0.2325, + "step": 3865 + }, + { + "epoch": 0.8968797123303561, + "grad_norm": 9.30944371626104, + "learning_rate": 2e-06, + "loss": 0.2986, + "step": 3866 + }, + { + "epoch": 0.8971117039786568, + "grad_norm": 12.882120895306459, + "learning_rate": 2e-06, + "loss": 0.3005, + "step": 3867 + }, + { + "epoch": 0.8973436956269575, + "grad_norm": 8.934266590413841, + "learning_rate": 2e-06, + "loss": 0.1734, + "step": 3868 + }, + { + "epoch": 0.897575687275258, + "grad_norm": 5.311613108640557, + "learning_rate": 2e-06, + "loss": 0.2151, + "step": 3869 + }, + { + "epoch": 0.8978076789235587, + "grad_norm": 24.19912099032846, + "learning_rate": 2e-06, + "loss": 0.3605, + "step": 3870 + }, + { + "epoch": 0.8980396705718594, + "grad_norm": 17.4060760039238, + "learning_rate": 2e-06, + "loss": 0.36, + "step": 3871 + }, + { + "epoch": 0.8982716622201601, + "grad_norm": 9.24006750676526, + "learning_rate": 2e-06, + "loss": 0.2765, + "step": 3872 + }, + { + "epoch": 0.8985036538684608, + "grad_norm": 12.557047617052072, + "learning_rate": 2e-06, + "loss": 0.3256, + "step": 3873 + }, + { + "epoch": 0.8987356455167614, + "grad_norm": 16.55716231109906, + "learning_rate": 2e-06, + "loss": 0.3307, + "step": 3874 + }, + { + "epoch": 0.898967637165062, + "grad_norm": 14.257984695745476, + "learning_rate": 2e-06, + "loss": 0.2903, + "step": 3875 + }, + { + "epoch": 0.8991996288133627, + "grad_norm": 6.765606224454405, + "learning_rate": 2e-06, + "loss": 0.1147, + "step": 3876 + }, + { + "epoch": 0.8994316204616634, + "grad_norm": 19.732475611334035, + "learning_rate": 2e-06, + "loss": 0.3762, + "step": 3877 + }, + { + "epoch": 0.899663612109964, + "grad_norm": 10.076354836054715, + "learning_rate": 2e-06, + "loss": 0.2457, + "step": 3878 + }, + { + "epoch": 0.8998956037582647, + "grad_norm": 22.692936353918086, + "learning_rate": 2e-06, + "loss": 0.3286, + "step": 3879 + }, + { + "epoch": 0.9001275954065654, + "grad_norm": 18.87531924416284, + "learning_rate": 2e-06, + "loss": 0.3505, + "step": 3880 + }, + { + "epoch": 0.9003595870548661, + "grad_norm": 21.461426922667258, + "learning_rate": 2e-06, + "loss": 0.4371, + "step": 3881 + }, + { + "epoch": 0.9005915787031666, + "grad_norm": 12.672233116120237, + "learning_rate": 2e-06, + "loss": 0.2552, + "step": 3882 + }, + { + "epoch": 0.9008235703514673, + "grad_norm": 14.92757839909907, + "learning_rate": 2e-06, + "loss": 0.2595, + "step": 3883 + }, + { + "epoch": 0.901055561999768, + "grad_norm": 9.864217379715534, + "learning_rate": 2e-06, + "loss": 0.2253, + "step": 3884 + }, + { + "epoch": 0.9012875536480687, + "grad_norm": 22.137721500164403, + "learning_rate": 2e-06, + "loss": 0.316, + "step": 3885 + }, + { + "epoch": 0.9015195452963694, + "grad_norm": 8.633971401134554, + "learning_rate": 2e-06, + "loss": 0.2195, + "step": 3886 + }, + { + "epoch": 0.90175153694467, + "grad_norm": 11.99123929014511, + "learning_rate": 2e-06, + "loss": 0.2543, + "step": 3887 + }, + { + "epoch": 0.9019835285929707, + "grad_norm": 23.05153162087007, + "learning_rate": 2e-06, + "loss": 0.2595, + "step": 3888 + }, + { + "epoch": 0.9022155202412713, + "grad_norm": 15.378579184780568, + "learning_rate": 2e-06, + "loss": 0.2613, + "step": 3889 + }, + { + "epoch": 0.902447511889572, + "grad_norm": 9.358759181548711, + "learning_rate": 2e-06, + "loss": 0.1977, + "step": 3890 + }, + { + "epoch": 0.9026795035378726, + "grad_norm": 14.072218217487547, + "learning_rate": 2e-06, + "loss": 0.314, + "step": 3891 + }, + { + "epoch": 0.9029114951861733, + "grad_norm": 8.650515094354374, + "learning_rate": 2e-06, + "loss": 0.2392, + "step": 3892 + }, + { + "epoch": 0.903143486834474, + "grad_norm": 13.973869182199863, + "learning_rate": 2e-06, + "loss": 0.2385, + "step": 3893 + }, + { + "epoch": 0.9033754784827747, + "grad_norm": 28.02108106068224, + "learning_rate": 2e-06, + "loss": 0.4868, + "step": 3894 + }, + { + "epoch": 0.9036074701310752, + "grad_norm": 10.126726222892133, + "learning_rate": 2e-06, + "loss": 0.2712, + "step": 3895 + }, + { + "epoch": 0.9038394617793759, + "grad_norm": 14.446733796997036, + "learning_rate": 2e-06, + "loss": 0.2759, + "step": 3896 + }, + { + "epoch": 0.9040714534276766, + "grad_norm": 9.142437995803487, + "learning_rate": 2e-06, + "loss": 0.2935, + "step": 3897 + }, + { + "epoch": 0.9043034450759773, + "grad_norm": 16.347152291076053, + "learning_rate": 2e-06, + "loss": 0.3192, + "step": 3898 + }, + { + "epoch": 0.9045354367242779, + "grad_norm": 10.663833545657894, + "learning_rate": 2e-06, + "loss": 0.2734, + "step": 3899 + }, + { + "epoch": 0.9047674283725786, + "grad_norm": 18.334678168407894, + "learning_rate": 2e-06, + "loss": 0.2272, + "step": 3900 + }, + { + "epoch": 0.9049994200208793, + "grad_norm": 9.98864088999583, + "learning_rate": 2e-06, + "loss": 0.2761, + "step": 3901 + }, + { + "epoch": 0.9052314116691799, + "grad_norm": 16.88195086485774, + "learning_rate": 2e-06, + "loss": 0.2828, + "step": 3902 + }, + { + "epoch": 0.9054634033174805, + "grad_norm": 14.193321843242153, + "learning_rate": 2e-06, + "loss": 0.3519, + "step": 3903 + }, + { + "epoch": 0.9056953949657812, + "grad_norm": 24.72235772205411, + "learning_rate": 2e-06, + "loss": 0.3367, + "step": 3904 + }, + { + "epoch": 0.9059273866140819, + "grad_norm": 20.661583880476023, + "learning_rate": 2e-06, + "loss": 0.3861, + "step": 3905 + }, + { + "epoch": 0.9061593782623826, + "grad_norm": 10.309625510352685, + "learning_rate": 2e-06, + "loss": 0.2169, + "step": 3906 + }, + { + "epoch": 0.9063913699106833, + "grad_norm": 15.189679441464554, + "learning_rate": 2e-06, + "loss": 0.1656, + "step": 3907 + }, + { + "epoch": 0.9066233615589839, + "grad_norm": 7.763584174957461, + "learning_rate": 2e-06, + "loss": 0.2384, + "step": 3908 + }, + { + "epoch": 0.9068553532072845, + "grad_norm": 32.802394565514405, + "learning_rate": 2e-06, + "loss": 0.418, + "step": 3909 + }, + { + "epoch": 0.9070873448555852, + "grad_norm": 19.839585714773417, + "learning_rate": 2e-06, + "loss": 0.2461, + "step": 3910 + }, + { + "epoch": 0.9073193365038859, + "grad_norm": 13.767551143679269, + "learning_rate": 2e-06, + "loss": 0.2116, + "step": 3911 + }, + { + "epoch": 0.9075513281521865, + "grad_norm": 20.153545824967566, + "learning_rate": 2e-06, + "loss": 0.3376, + "step": 3912 + }, + { + "epoch": 0.9077833198004872, + "grad_norm": 8.939747154687687, + "learning_rate": 2e-06, + "loss": 0.2546, + "step": 3913 + }, + { + "epoch": 0.9080153114487879, + "grad_norm": 32.43756975476725, + "learning_rate": 2e-06, + "loss": 0.3785, + "step": 3914 + }, + { + "epoch": 0.9082473030970885, + "grad_norm": 11.848078031401903, + "learning_rate": 2e-06, + "loss": 0.2395, + "step": 3915 + }, + { + "epoch": 0.9084792947453891, + "grad_norm": 11.22352972732074, + "learning_rate": 2e-06, + "loss": 0.215, + "step": 3916 + }, + { + "epoch": 0.9087112863936898, + "grad_norm": 17.184953012684105, + "learning_rate": 2e-06, + "loss": 0.3557, + "step": 3917 + }, + { + "epoch": 0.9089432780419905, + "grad_norm": 8.829476518236506, + "learning_rate": 2e-06, + "loss": 0.2368, + "step": 3918 + }, + { + "epoch": 0.9091752696902912, + "grad_norm": 25.986430156365554, + "learning_rate": 2e-06, + "loss": 0.2737, + "step": 3919 + }, + { + "epoch": 0.9094072613385918, + "grad_norm": 8.631894893578531, + "learning_rate": 2e-06, + "loss": 0.2415, + "step": 3920 + }, + { + "epoch": 0.9096392529868925, + "grad_norm": 14.692330088194996, + "learning_rate": 2e-06, + "loss": 0.2917, + "step": 3921 + }, + { + "epoch": 0.9098712446351931, + "grad_norm": 10.057081616113766, + "learning_rate": 2e-06, + "loss": 0.3515, + "step": 3922 + }, + { + "epoch": 0.9101032362834938, + "grad_norm": 18.242861421349204, + "learning_rate": 2e-06, + "loss": 0.3514, + "step": 3923 + }, + { + "epoch": 0.9103352279317944, + "grad_norm": 12.00313556221313, + "learning_rate": 2e-06, + "loss": 0.3548, + "step": 3924 + }, + { + "epoch": 0.9105672195800951, + "grad_norm": 8.759182341190806, + "learning_rate": 2e-06, + "loss": 0.2854, + "step": 3925 + }, + { + "epoch": 0.9107992112283958, + "grad_norm": 21.635245268730696, + "learning_rate": 2e-06, + "loss": 0.261, + "step": 3926 + }, + { + "epoch": 0.9110312028766965, + "grad_norm": 14.39027864566995, + "learning_rate": 2e-06, + "loss": 0.2331, + "step": 3927 + }, + { + "epoch": 0.9112631945249972, + "grad_norm": 17.651616795395558, + "learning_rate": 2e-06, + "loss": 0.3345, + "step": 3928 + }, + { + "epoch": 0.9114951861732977, + "grad_norm": 17.026843525740297, + "learning_rate": 2e-06, + "loss": 0.2664, + "step": 3929 + }, + { + "epoch": 0.9117271778215984, + "grad_norm": 15.703856377194578, + "learning_rate": 2e-06, + "loss": 0.3038, + "step": 3930 + }, + { + "epoch": 0.9119591694698991, + "grad_norm": 19.510509020521962, + "learning_rate": 2e-06, + "loss": 0.3946, + "step": 3931 + }, + { + "epoch": 0.9121911611181998, + "grad_norm": 21.184958616085613, + "learning_rate": 2e-06, + "loss": 0.2226, + "step": 3932 + }, + { + "epoch": 0.9124231527665004, + "grad_norm": 22.910390703702443, + "learning_rate": 2e-06, + "loss": 0.3747, + "step": 3933 + }, + { + "epoch": 0.9126551444148011, + "grad_norm": 21.79356223730189, + "learning_rate": 2e-06, + "loss": 0.2648, + "step": 3934 + }, + { + "epoch": 0.9128871360631017, + "grad_norm": 16.719709200320167, + "learning_rate": 2e-06, + "loss": 0.3129, + "step": 3935 + }, + { + "epoch": 0.9131191277114024, + "grad_norm": 15.75587390438053, + "learning_rate": 2e-06, + "loss": 0.2771, + "step": 3936 + }, + { + "epoch": 0.913351119359703, + "grad_norm": 20.445984418989443, + "learning_rate": 2e-06, + "loss": 0.3287, + "step": 3937 + }, + { + "epoch": 0.9135831110080037, + "grad_norm": 9.295243164429655, + "learning_rate": 2e-06, + "loss": 0.2542, + "step": 3938 + }, + { + "epoch": 0.9138151026563044, + "grad_norm": 16.194199891364974, + "learning_rate": 2e-06, + "loss": 0.2801, + "step": 3939 + }, + { + "epoch": 0.9140470943046051, + "grad_norm": 20.491671606951503, + "learning_rate": 2e-06, + "loss": 0.2353, + "step": 3940 + }, + { + "epoch": 0.9142790859529057, + "grad_norm": 10.452287640229645, + "learning_rate": 2e-06, + "loss": 0.2174, + "step": 3941 + }, + { + "epoch": 0.9145110776012063, + "grad_norm": 15.182715386927697, + "learning_rate": 2e-06, + "loss": 0.2636, + "step": 3942 + }, + { + "epoch": 0.914743069249507, + "grad_norm": 14.727316457709057, + "learning_rate": 2e-06, + "loss": 0.2375, + "step": 3943 + }, + { + "epoch": 0.9149750608978077, + "grad_norm": 14.991906354041783, + "learning_rate": 2e-06, + "loss": 0.2735, + "step": 3944 + }, + { + "epoch": 0.9152070525461083, + "grad_norm": 20.877355785804873, + "learning_rate": 2e-06, + "loss": 0.3779, + "step": 3945 + }, + { + "epoch": 0.915439044194409, + "grad_norm": 15.027199298330427, + "learning_rate": 2e-06, + "loss": 0.2824, + "step": 3946 + }, + { + "epoch": 0.9156710358427097, + "grad_norm": 8.425894451610421, + "learning_rate": 2e-06, + "loss": 0.1924, + "step": 3947 + }, + { + "epoch": 0.9159030274910103, + "grad_norm": 25.349503788018012, + "learning_rate": 2e-06, + "loss": 0.3346, + "step": 3948 + }, + { + "epoch": 0.916135019139311, + "grad_norm": 10.193314799347108, + "learning_rate": 2e-06, + "loss": 0.2686, + "step": 3949 + }, + { + "epoch": 0.9163670107876116, + "grad_norm": 18.574048790419745, + "learning_rate": 2e-06, + "loss": 0.4334, + "step": 3950 + }, + { + "epoch": 0.9165990024359123, + "grad_norm": 6.773484630130486, + "learning_rate": 2e-06, + "loss": 0.1877, + "step": 3951 + }, + { + "epoch": 0.916830994084213, + "grad_norm": 13.643189811756903, + "learning_rate": 2e-06, + "loss": 0.3131, + "step": 3952 + }, + { + "epoch": 0.9170629857325137, + "grad_norm": 19.924213913482326, + "learning_rate": 2e-06, + "loss": 0.3825, + "step": 3953 + }, + { + "epoch": 0.9172949773808143, + "grad_norm": 12.958783220911148, + "learning_rate": 2e-06, + "loss": 0.3044, + "step": 3954 + }, + { + "epoch": 0.9175269690291149, + "grad_norm": 12.0595184813058, + "learning_rate": 2e-06, + "loss": 0.2276, + "step": 3955 + }, + { + "epoch": 0.9177589606774156, + "grad_norm": 19.21193616308311, + "learning_rate": 2e-06, + "loss": 0.2676, + "step": 3956 + }, + { + "epoch": 0.9179909523257163, + "grad_norm": 11.476061295294725, + "learning_rate": 2e-06, + "loss": 0.2979, + "step": 3957 + }, + { + "epoch": 0.9182229439740169, + "grad_norm": 19.28669197764272, + "learning_rate": 2e-06, + "loss": 0.4399, + "step": 3958 + }, + { + "epoch": 0.9184549356223176, + "grad_norm": 7.265364825229757, + "learning_rate": 2e-06, + "loss": 0.2276, + "step": 3959 + }, + { + "epoch": 0.9186869272706183, + "grad_norm": 15.89433824195223, + "learning_rate": 2e-06, + "loss": 0.2828, + "step": 3960 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 9.074347573516029, + "learning_rate": 2e-06, + "loss": 0.2016, + "step": 3961 + }, + { + "epoch": 0.9191509105672195, + "grad_norm": 16.415884335639657, + "learning_rate": 2e-06, + "loss": 0.2457, + "step": 3962 + }, + { + "epoch": 0.9193829022155202, + "grad_norm": 26.616894388131367, + "learning_rate": 2e-06, + "loss": 0.2135, + "step": 3963 + }, + { + "epoch": 0.9196148938638209, + "grad_norm": 6.792574370355875, + "learning_rate": 2e-06, + "loss": 0.215, + "step": 3964 + }, + { + "epoch": 0.9198468855121216, + "grad_norm": 14.404411259491736, + "learning_rate": 2e-06, + "loss": 0.3734, + "step": 3965 + }, + { + "epoch": 0.9200788771604222, + "grad_norm": 28.16018432451622, + "learning_rate": 2e-06, + "loss": 0.4642, + "step": 3966 + }, + { + "epoch": 0.9203108688087229, + "grad_norm": 19.233118360271835, + "learning_rate": 2e-06, + "loss": 0.3211, + "step": 3967 + }, + { + "epoch": 0.9205428604570235, + "grad_norm": 13.944355764448762, + "learning_rate": 2e-06, + "loss": 0.2451, + "step": 3968 + }, + { + "epoch": 0.9207748521053242, + "grad_norm": 12.641243085346089, + "learning_rate": 2e-06, + "loss": 0.2858, + "step": 3969 + }, + { + "epoch": 0.9210068437536248, + "grad_norm": 18.809098112566154, + "learning_rate": 2e-06, + "loss": 0.3246, + "step": 3970 + }, + { + "epoch": 0.9212388354019255, + "grad_norm": 21.782444054054192, + "learning_rate": 2e-06, + "loss": 0.3959, + "step": 3971 + }, + { + "epoch": 0.9214708270502262, + "grad_norm": 13.814108123062622, + "learning_rate": 2e-06, + "loss": 0.4264, + "step": 3972 + }, + { + "epoch": 0.9217028186985269, + "grad_norm": 21.81885611322097, + "learning_rate": 2e-06, + "loss": 0.3552, + "step": 3973 + }, + { + "epoch": 0.9219348103468276, + "grad_norm": 14.101735582385354, + "learning_rate": 2e-06, + "loss": 0.3584, + "step": 3974 + }, + { + "epoch": 0.9221668019951281, + "grad_norm": 11.043363753344781, + "learning_rate": 2e-06, + "loss": 0.2681, + "step": 3975 + }, + { + "epoch": 0.9223987936434288, + "grad_norm": 13.593291215016803, + "learning_rate": 2e-06, + "loss": 0.2515, + "step": 3976 + }, + { + "epoch": 0.9226307852917295, + "grad_norm": 12.362049676839284, + "learning_rate": 2e-06, + "loss": 0.257, + "step": 3977 + }, + { + "epoch": 0.9228627769400302, + "grad_norm": 13.270066216692763, + "learning_rate": 2e-06, + "loss": 0.3117, + "step": 3978 + }, + { + "epoch": 0.9230947685883308, + "grad_norm": 14.579060042594897, + "learning_rate": 2e-06, + "loss": 0.3214, + "step": 3979 + }, + { + "epoch": 0.9233267602366315, + "grad_norm": 14.720997537346065, + "learning_rate": 2e-06, + "loss": 0.2896, + "step": 3980 + }, + { + "epoch": 0.9235587518849322, + "grad_norm": 27.34929272148167, + "learning_rate": 2e-06, + "loss": 0.3681, + "step": 3981 + }, + { + "epoch": 0.9237907435332328, + "grad_norm": 6.8723669925715605, + "learning_rate": 2e-06, + "loss": 0.2386, + "step": 3982 + }, + { + "epoch": 0.9240227351815334, + "grad_norm": 15.46405296753594, + "learning_rate": 2e-06, + "loss": 0.2891, + "step": 3983 + }, + { + "epoch": 0.9242547268298341, + "grad_norm": 21.133738829314755, + "learning_rate": 2e-06, + "loss": 0.4005, + "step": 3984 + }, + { + "epoch": 0.9244867184781348, + "grad_norm": 23.69066601838177, + "learning_rate": 2e-06, + "loss": 0.2624, + "step": 3985 + }, + { + "epoch": 0.9247187101264355, + "grad_norm": 11.928606304200803, + "learning_rate": 2e-06, + "loss": 0.2461, + "step": 3986 + }, + { + "epoch": 0.9249507017747362, + "grad_norm": 7.763964912210666, + "learning_rate": 2e-06, + "loss": 0.299, + "step": 3987 + }, + { + "epoch": 0.9251826934230367, + "grad_norm": 14.448936450722345, + "learning_rate": 2e-06, + "loss": 0.2497, + "step": 3988 + }, + { + "epoch": 0.9254146850713374, + "grad_norm": 6.527725086557093, + "learning_rate": 2e-06, + "loss": 0.231, + "step": 3989 + }, + { + "epoch": 0.9256466767196381, + "grad_norm": 22.37946721451997, + "learning_rate": 2e-06, + "loss": 0.3802, + "step": 3990 + }, + { + "epoch": 0.9258786683679388, + "grad_norm": 12.629177421202952, + "learning_rate": 2e-06, + "loss": 0.3255, + "step": 3991 + }, + { + "epoch": 0.9261106600162394, + "grad_norm": 15.587491757728799, + "learning_rate": 2e-06, + "loss": 0.2531, + "step": 3992 + }, + { + "epoch": 0.9263426516645401, + "grad_norm": 11.847664403920094, + "learning_rate": 2e-06, + "loss": 0.269, + "step": 3993 + }, + { + "epoch": 0.9265746433128408, + "grad_norm": 13.322943271190882, + "learning_rate": 2e-06, + "loss": 0.278, + "step": 3994 + }, + { + "epoch": 0.9268066349611414, + "grad_norm": 14.495165691483393, + "learning_rate": 2e-06, + "loss": 0.3801, + "step": 3995 + }, + { + "epoch": 0.927038626609442, + "grad_norm": 14.3677196682021, + "learning_rate": 2e-06, + "loss": 0.2334, + "step": 3996 + }, + { + "epoch": 0.9272706182577427, + "grad_norm": 10.218329276484413, + "learning_rate": 2e-06, + "loss": 0.2553, + "step": 3997 + }, + { + "epoch": 0.9275026099060434, + "grad_norm": 34.637262702418646, + "learning_rate": 2e-06, + "loss": 0.4718, + "step": 3998 + }, + { + "epoch": 0.9277346015543441, + "grad_norm": 9.237413082217557, + "learning_rate": 2e-06, + "loss": 0.2473, + "step": 3999 + }, + { + "epoch": 0.9279665932026447, + "grad_norm": 16.090490051135355, + "learning_rate": 2e-06, + "loss": 0.2933, + "step": 4000 + }, + { + "epoch": 0.9281985848509454, + "grad_norm": 18.131571394205444, + "learning_rate": 2e-06, + "loss": 0.3476, + "step": 4001 + }, + { + "epoch": 0.928430576499246, + "grad_norm": 13.298876519414382, + "learning_rate": 2e-06, + "loss": 0.2335, + "step": 4002 + }, + { + "epoch": 0.9286625681475467, + "grad_norm": 13.195268495451675, + "learning_rate": 2e-06, + "loss": 0.2911, + "step": 4003 + }, + { + "epoch": 0.9288945597958473, + "grad_norm": 8.619850603564261, + "learning_rate": 2e-06, + "loss": 0.1825, + "step": 4004 + }, + { + "epoch": 0.929126551444148, + "grad_norm": 9.168772392177162, + "learning_rate": 2e-06, + "loss": 0.2824, + "step": 4005 + }, + { + "epoch": 0.9293585430924487, + "grad_norm": 18.2102243935687, + "learning_rate": 2e-06, + "loss": 0.2815, + "step": 4006 + }, + { + "epoch": 0.9295905347407494, + "grad_norm": 10.502444860681994, + "learning_rate": 2e-06, + "loss": 0.3784, + "step": 4007 + }, + { + "epoch": 0.9298225263890499, + "grad_norm": 19.634937250194543, + "learning_rate": 2e-06, + "loss": 0.3284, + "step": 4008 + }, + { + "epoch": 0.9300545180373506, + "grad_norm": 17.777781446560336, + "learning_rate": 2e-06, + "loss": 0.3177, + "step": 4009 + }, + { + "epoch": 0.9302865096856513, + "grad_norm": 15.66459952879304, + "learning_rate": 2e-06, + "loss": 0.3439, + "step": 4010 + }, + { + "epoch": 0.930518501333952, + "grad_norm": 12.296224082963853, + "learning_rate": 2e-06, + "loss": 0.2705, + "step": 4011 + }, + { + "epoch": 0.9307504929822527, + "grad_norm": 13.958049366132693, + "learning_rate": 2e-06, + "loss": 0.2072, + "step": 4012 + }, + { + "epoch": 0.9309824846305533, + "grad_norm": 11.377406989138178, + "learning_rate": 2e-06, + "loss": 0.2086, + "step": 4013 + }, + { + "epoch": 0.931214476278854, + "grad_norm": 18.73659133228136, + "learning_rate": 2e-06, + "loss": 0.3641, + "step": 4014 + }, + { + "epoch": 0.9314464679271546, + "grad_norm": 9.579161039043823, + "learning_rate": 2e-06, + "loss": 0.1496, + "step": 4015 + }, + { + "epoch": 0.9316784595754553, + "grad_norm": 20.95385075541004, + "learning_rate": 2e-06, + "loss": 0.4974, + "step": 4016 + }, + { + "epoch": 0.9319104512237559, + "grad_norm": 15.665155596272864, + "learning_rate": 2e-06, + "loss": 0.3812, + "step": 4017 + }, + { + "epoch": 0.9321424428720566, + "grad_norm": 10.504501557374716, + "learning_rate": 2e-06, + "loss": 0.1418, + "step": 4018 + }, + { + "epoch": 0.9323744345203573, + "grad_norm": 10.85342891308318, + "learning_rate": 2e-06, + "loss": 0.319, + "step": 4019 + }, + { + "epoch": 0.932606426168658, + "grad_norm": 8.10601244212646, + "learning_rate": 2e-06, + "loss": 0.2395, + "step": 4020 + }, + { + "epoch": 0.9328384178169586, + "grad_norm": 13.022680006869349, + "learning_rate": 2e-06, + "loss": 0.2419, + "step": 4021 + }, + { + "epoch": 0.9330704094652592, + "grad_norm": 12.13270254820567, + "learning_rate": 2e-06, + "loss": 0.2383, + "step": 4022 + }, + { + "epoch": 0.9333024011135599, + "grad_norm": 16.32468558517697, + "learning_rate": 2e-06, + "loss": 0.2664, + "step": 4023 + }, + { + "epoch": 0.9335343927618606, + "grad_norm": 12.246989823743979, + "learning_rate": 2e-06, + "loss": 0.3144, + "step": 4024 + }, + { + "epoch": 0.9337663844101612, + "grad_norm": 15.224846029260464, + "learning_rate": 2e-06, + "loss": 0.2838, + "step": 4025 + }, + { + "epoch": 0.9339983760584619, + "grad_norm": 15.018031125518513, + "learning_rate": 2e-06, + "loss": 0.2506, + "step": 4026 + }, + { + "epoch": 0.9342303677067626, + "grad_norm": 7.434044658596272, + "learning_rate": 2e-06, + "loss": 0.2986, + "step": 4027 + }, + { + "epoch": 0.9344623593550632, + "grad_norm": 20.494236364736892, + "learning_rate": 2e-06, + "loss": 0.2571, + "step": 4028 + }, + { + "epoch": 0.9346943510033638, + "grad_norm": 6.418952262961799, + "learning_rate": 2e-06, + "loss": 0.2598, + "step": 4029 + }, + { + "epoch": 0.9349263426516645, + "grad_norm": 21.616245646302445, + "learning_rate": 2e-06, + "loss": 0.2834, + "step": 4030 + }, + { + "epoch": 0.9351583342999652, + "grad_norm": 18.3306232616933, + "learning_rate": 2e-06, + "loss": 0.3145, + "step": 4031 + }, + { + "epoch": 0.9353903259482659, + "grad_norm": 10.352480111360936, + "learning_rate": 2e-06, + "loss": 0.2459, + "step": 4032 + }, + { + "epoch": 0.9356223175965666, + "grad_norm": 11.204629468614156, + "learning_rate": 2e-06, + "loss": 0.2544, + "step": 4033 + }, + { + "epoch": 0.9358543092448672, + "grad_norm": 16.492577310753635, + "learning_rate": 2e-06, + "loss": 0.2445, + "step": 4034 + }, + { + "epoch": 0.9360863008931678, + "grad_norm": 22.371945371319974, + "learning_rate": 2e-06, + "loss": 0.2821, + "step": 4035 + }, + { + "epoch": 0.9363182925414685, + "grad_norm": 8.859132258593608, + "learning_rate": 2e-06, + "loss": 0.3099, + "step": 4036 + }, + { + "epoch": 0.9365502841897692, + "grad_norm": 17.014776481789077, + "learning_rate": 2e-06, + "loss": 0.3262, + "step": 4037 + }, + { + "epoch": 0.9367822758380698, + "grad_norm": 10.625597532800239, + "learning_rate": 2e-06, + "loss": 0.2873, + "step": 4038 + }, + { + "epoch": 0.9370142674863705, + "grad_norm": 14.122822330175744, + "learning_rate": 2e-06, + "loss": 0.38, + "step": 4039 + }, + { + "epoch": 0.9372462591346712, + "grad_norm": 10.328734624379049, + "learning_rate": 2e-06, + "loss": 0.2192, + "step": 4040 + }, + { + "epoch": 0.9374782507829719, + "grad_norm": 9.09386762470707, + "learning_rate": 2e-06, + "loss": 0.2266, + "step": 4041 + }, + { + "epoch": 0.9377102424312724, + "grad_norm": 18.684735772439982, + "learning_rate": 2e-06, + "loss": 0.2231, + "step": 4042 + }, + { + "epoch": 0.9379422340795731, + "grad_norm": 9.957849633530989, + "learning_rate": 2e-06, + "loss": 0.2933, + "step": 4043 + }, + { + "epoch": 0.9381742257278738, + "grad_norm": 13.49504899297702, + "learning_rate": 2e-06, + "loss": 0.1928, + "step": 4044 + }, + { + "epoch": 0.9384062173761745, + "grad_norm": 11.66253981574796, + "learning_rate": 2e-06, + "loss": 0.2601, + "step": 4045 + }, + { + "epoch": 0.9386382090244751, + "grad_norm": 10.106589206845713, + "learning_rate": 2e-06, + "loss": 0.2958, + "step": 4046 + }, + { + "epoch": 0.9388702006727758, + "grad_norm": 12.192365381188443, + "learning_rate": 2e-06, + "loss": 0.2948, + "step": 4047 + }, + { + "epoch": 0.9391021923210764, + "grad_norm": 11.08645100908848, + "learning_rate": 2e-06, + "loss": 0.2181, + "step": 4048 + }, + { + "epoch": 0.9393341839693771, + "grad_norm": 10.461564499933814, + "learning_rate": 2e-06, + "loss": 0.2554, + "step": 4049 + }, + { + "epoch": 0.9395661756176777, + "grad_norm": 12.516633499592912, + "learning_rate": 2e-06, + "loss": 0.3163, + "step": 4050 + }, + { + "epoch": 0.9397981672659784, + "grad_norm": 17.879976322757418, + "learning_rate": 2e-06, + "loss": 0.3279, + "step": 4051 + }, + { + "epoch": 0.9400301589142791, + "grad_norm": 10.951916309028428, + "learning_rate": 2e-06, + "loss": 0.2341, + "step": 4052 + }, + { + "epoch": 0.9402621505625798, + "grad_norm": 8.976366702731776, + "learning_rate": 2e-06, + "loss": 0.1948, + "step": 4053 + }, + { + "epoch": 0.9404941422108805, + "grad_norm": 9.076599604883869, + "learning_rate": 2e-06, + "loss": 0.3378, + "step": 4054 + }, + { + "epoch": 0.940726133859181, + "grad_norm": 15.382474393870892, + "learning_rate": 2e-06, + "loss": 0.2488, + "step": 4055 + }, + { + "epoch": 0.9409581255074817, + "grad_norm": 14.329098715179613, + "learning_rate": 2e-06, + "loss": 0.3046, + "step": 4056 + }, + { + "epoch": 0.9411901171557824, + "grad_norm": 9.195493737277706, + "learning_rate": 2e-06, + "loss": 0.2264, + "step": 4057 + }, + { + "epoch": 0.9414221088040831, + "grad_norm": 10.882441981983053, + "learning_rate": 2e-06, + "loss": 0.2679, + "step": 4058 + }, + { + "epoch": 0.9416541004523837, + "grad_norm": 7.640288467837494, + "learning_rate": 2e-06, + "loss": 0.1782, + "step": 4059 + }, + { + "epoch": 0.9418860921006844, + "grad_norm": 7.9402748004771295, + "learning_rate": 2e-06, + "loss": 0.2199, + "step": 4060 + }, + { + "epoch": 0.9421180837489851, + "grad_norm": 14.183099572940574, + "learning_rate": 2e-06, + "loss": 0.197, + "step": 4061 + }, + { + "epoch": 0.9423500753972857, + "grad_norm": 14.253871053319683, + "learning_rate": 2e-06, + "loss": 0.2358, + "step": 4062 + }, + { + "epoch": 0.9425820670455863, + "grad_norm": 14.87910110838926, + "learning_rate": 2e-06, + "loss": 0.249, + "step": 4063 + }, + { + "epoch": 0.942814058693887, + "grad_norm": 18.414766910385318, + "learning_rate": 2e-06, + "loss": 0.2946, + "step": 4064 + }, + { + "epoch": 0.9430460503421877, + "grad_norm": 19.484331898866966, + "learning_rate": 2e-06, + "loss": 0.3497, + "step": 4065 + }, + { + "epoch": 0.9432780419904884, + "grad_norm": 7.831919172941622, + "learning_rate": 2e-06, + "loss": 0.2108, + "step": 4066 + }, + { + "epoch": 0.943510033638789, + "grad_norm": 13.970992782476834, + "learning_rate": 2e-06, + "loss": 0.2523, + "step": 4067 + }, + { + "epoch": 0.9437420252870896, + "grad_norm": 15.572274581276728, + "learning_rate": 2e-06, + "loss": 0.3366, + "step": 4068 + }, + { + "epoch": 0.9439740169353903, + "grad_norm": 18.73910628418021, + "learning_rate": 2e-06, + "loss": 0.3037, + "step": 4069 + }, + { + "epoch": 0.944206008583691, + "grad_norm": 14.560548536813151, + "learning_rate": 2e-06, + "loss": 0.2343, + "step": 4070 + }, + { + "epoch": 0.9444380002319916, + "grad_norm": 13.014495432933003, + "learning_rate": 2e-06, + "loss": 0.2704, + "step": 4071 + }, + { + "epoch": 0.9446699918802923, + "grad_norm": 16.70216711953269, + "learning_rate": 2e-06, + "loss": 0.2593, + "step": 4072 + }, + { + "epoch": 0.944901983528593, + "grad_norm": 12.274314755806715, + "learning_rate": 2e-06, + "loss": 0.334, + "step": 4073 + }, + { + "epoch": 0.9451339751768937, + "grad_norm": 12.610739079859846, + "learning_rate": 2e-06, + "loss": 0.261, + "step": 4074 + }, + { + "epoch": 0.9453659668251942, + "grad_norm": 7.652569686562883, + "learning_rate": 2e-06, + "loss": 0.253, + "step": 4075 + }, + { + "epoch": 0.9455979584734949, + "grad_norm": 9.82109278599335, + "learning_rate": 2e-06, + "loss": 0.3167, + "step": 4076 + }, + { + "epoch": 0.9458299501217956, + "grad_norm": 15.045805903494685, + "learning_rate": 2e-06, + "loss": 0.391, + "step": 4077 + }, + { + "epoch": 0.9460619417700963, + "grad_norm": 20.756814333126798, + "learning_rate": 2e-06, + "loss": 0.3532, + "step": 4078 + }, + { + "epoch": 0.946293933418397, + "grad_norm": 11.520740221476641, + "learning_rate": 2e-06, + "loss": 0.2224, + "step": 4079 + }, + { + "epoch": 0.9465259250666976, + "grad_norm": 20.332249191152876, + "learning_rate": 2e-06, + "loss": 0.3074, + "step": 4080 + }, + { + "epoch": 0.9467579167149983, + "grad_norm": 9.704537942099549, + "learning_rate": 2e-06, + "loss": 0.2567, + "step": 4081 + }, + { + "epoch": 0.9469899083632989, + "grad_norm": 12.313089663784114, + "learning_rate": 2e-06, + "loss": 0.3028, + "step": 4082 + }, + { + "epoch": 0.9472219000115996, + "grad_norm": 21.2792596586522, + "learning_rate": 2e-06, + "loss": 0.2978, + "step": 4083 + }, + { + "epoch": 0.9474538916599002, + "grad_norm": 10.101328052308142, + "learning_rate": 2e-06, + "loss": 0.2412, + "step": 4084 + }, + { + "epoch": 0.9476858833082009, + "grad_norm": 17.19211419820723, + "learning_rate": 2e-06, + "loss": 0.3151, + "step": 4085 + }, + { + "epoch": 0.9479178749565016, + "grad_norm": 16.302835135561683, + "learning_rate": 2e-06, + "loss": 0.2125, + "step": 4086 + }, + { + "epoch": 0.9481498666048023, + "grad_norm": 11.946190807136137, + "learning_rate": 2e-06, + "loss": 0.3241, + "step": 4087 + }, + { + "epoch": 0.9483818582531028, + "grad_norm": 13.151658952123395, + "learning_rate": 2e-06, + "loss": 0.2848, + "step": 4088 + }, + { + "epoch": 0.9486138499014035, + "grad_norm": 23.02715333790368, + "learning_rate": 2e-06, + "loss": 0.2935, + "step": 4089 + }, + { + "epoch": 0.9488458415497042, + "grad_norm": 8.503461364810859, + "learning_rate": 2e-06, + "loss": 0.1989, + "step": 4090 + }, + { + "epoch": 0.9490778331980049, + "grad_norm": 18.427826665139552, + "learning_rate": 2e-06, + "loss": 0.2229, + "step": 4091 + }, + { + "epoch": 0.9493098248463055, + "grad_norm": 14.303047249938743, + "learning_rate": 2e-06, + "loss": 0.2439, + "step": 4092 + }, + { + "epoch": 0.9495418164946062, + "grad_norm": 19.143796564075636, + "learning_rate": 2e-06, + "loss": 0.2807, + "step": 4093 + }, + { + "epoch": 0.9497738081429069, + "grad_norm": 13.488732980205729, + "learning_rate": 2e-06, + "loss": 0.257, + "step": 4094 + }, + { + "epoch": 0.9500057997912075, + "grad_norm": 8.526817701556933, + "learning_rate": 2e-06, + "loss": 0.193, + "step": 4095 + }, + { + "epoch": 0.9502377914395082, + "grad_norm": 15.559353236992328, + "learning_rate": 2e-06, + "loss": 0.3272, + "step": 4096 + }, + { + "epoch": 0.9504697830878088, + "grad_norm": 24.587515021857655, + "learning_rate": 2e-06, + "loss": 0.3034, + "step": 4097 + }, + { + "epoch": 0.9507017747361095, + "grad_norm": 13.200634807361052, + "learning_rate": 2e-06, + "loss": 0.2902, + "step": 4098 + }, + { + "epoch": 0.9509337663844102, + "grad_norm": 20.434562905867345, + "learning_rate": 2e-06, + "loss": 0.3055, + "step": 4099 + }, + { + "epoch": 0.9511657580327109, + "grad_norm": 18.417630148667733, + "learning_rate": 2e-06, + "loss": 0.3549, + "step": 4100 + }, + { + "epoch": 0.9513977496810114, + "grad_norm": 13.31683770770985, + "learning_rate": 2e-06, + "loss": 0.2553, + "step": 4101 + }, + { + "epoch": 0.9516297413293121, + "grad_norm": 11.843045183770629, + "learning_rate": 2e-06, + "loss": 0.252, + "step": 4102 + }, + { + "epoch": 0.9518617329776128, + "grad_norm": 11.666980315192783, + "learning_rate": 2e-06, + "loss": 0.2539, + "step": 4103 + }, + { + "epoch": 0.9520937246259135, + "grad_norm": 19.913932697088207, + "learning_rate": 2e-06, + "loss": 0.3337, + "step": 4104 + }, + { + "epoch": 0.9523257162742141, + "grad_norm": 13.36391426480302, + "learning_rate": 2e-06, + "loss": 0.2737, + "step": 4105 + }, + { + "epoch": 0.9525577079225148, + "grad_norm": 20.073464429920577, + "learning_rate": 2e-06, + "loss": 0.4001, + "step": 4106 + }, + { + "epoch": 0.9527896995708155, + "grad_norm": 15.891488742906676, + "learning_rate": 2e-06, + "loss": 0.3502, + "step": 4107 + }, + { + "epoch": 0.9530216912191161, + "grad_norm": 21.231285427185863, + "learning_rate": 2e-06, + "loss": 0.3465, + "step": 4108 + }, + { + "epoch": 0.9532536828674167, + "grad_norm": 13.94812781849407, + "learning_rate": 2e-06, + "loss": 0.2747, + "step": 4109 + }, + { + "epoch": 0.9534856745157174, + "grad_norm": 7.340569324720874, + "learning_rate": 2e-06, + "loss": 0.2401, + "step": 4110 + }, + { + "epoch": 0.9537176661640181, + "grad_norm": 15.352599805848575, + "learning_rate": 2e-06, + "loss": 0.338, + "step": 4111 + }, + { + "epoch": 0.9539496578123188, + "grad_norm": 12.305729093870848, + "learning_rate": 2e-06, + "loss": 0.2499, + "step": 4112 + }, + { + "epoch": 0.9541816494606195, + "grad_norm": 9.690251495539805, + "learning_rate": 2e-06, + "loss": 0.2528, + "step": 4113 + }, + { + "epoch": 0.9544136411089201, + "grad_norm": 15.161578406160716, + "learning_rate": 2e-06, + "loss": 0.3333, + "step": 4114 + }, + { + "epoch": 0.9546456327572207, + "grad_norm": 28.285554380523788, + "learning_rate": 2e-06, + "loss": 0.2778, + "step": 4115 + }, + { + "epoch": 0.9548776244055214, + "grad_norm": 16.19964193709801, + "learning_rate": 2e-06, + "loss": 0.2748, + "step": 4116 + }, + { + "epoch": 0.955109616053822, + "grad_norm": 12.83960278879911, + "learning_rate": 2e-06, + "loss": 0.2952, + "step": 4117 + }, + { + "epoch": 0.9553416077021227, + "grad_norm": 8.782977231717306, + "learning_rate": 2e-06, + "loss": 0.2084, + "step": 4118 + }, + { + "epoch": 0.9555735993504234, + "grad_norm": 15.30528731331944, + "learning_rate": 2e-06, + "loss": 0.326, + "step": 4119 + }, + { + "epoch": 0.9558055909987241, + "grad_norm": 13.290100729515132, + "learning_rate": 2e-06, + "loss": 0.272, + "step": 4120 + }, + { + "epoch": 0.9560375826470247, + "grad_norm": 10.406690132651857, + "learning_rate": 2e-06, + "loss": 0.2514, + "step": 4121 + }, + { + "epoch": 0.9562695742953253, + "grad_norm": 26.04862314136393, + "learning_rate": 2e-06, + "loss": 0.3575, + "step": 4122 + }, + { + "epoch": 0.956501565943626, + "grad_norm": 7.815336659949993, + "learning_rate": 2e-06, + "loss": 0.1821, + "step": 4123 + }, + { + "epoch": 0.9567335575919267, + "grad_norm": 9.946114361385957, + "learning_rate": 2e-06, + "loss": 0.2606, + "step": 4124 + }, + { + "epoch": 0.9569655492402274, + "grad_norm": 8.008143895713363, + "learning_rate": 2e-06, + "loss": 0.258, + "step": 4125 + }, + { + "epoch": 0.957197540888528, + "grad_norm": 12.47252896155183, + "learning_rate": 2e-06, + "loss": 0.2618, + "step": 4126 + }, + { + "epoch": 0.9574295325368287, + "grad_norm": 8.102956281530695, + "learning_rate": 2e-06, + "loss": 0.3792, + "step": 4127 + }, + { + "epoch": 0.9576615241851293, + "grad_norm": 8.2663318549362, + "learning_rate": 2e-06, + "loss": 0.3004, + "step": 4128 + }, + { + "epoch": 0.95789351583343, + "grad_norm": 12.246000066279436, + "learning_rate": 2e-06, + "loss": 0.2644, + "step": 4129 + }, + { + "epoch": 0.9581255074817306, + "grad_norm": 11.872467722008956, + "learning_rate": 2e-06, + "loss": 0.2984, + "step": 4130 + }, + { + "epoch": 0.9583574991300313, + "grad_norm": 21.614820482071732, + "learning_rate": 2e-06, + "loss": 0.2752, + "step": 4131 + }, + { + "epoch": 0.958589490778332, + "grad_norm": 14.068048318367484, + "learning_rate": 2e-06, + "loss": 0.3651, + "step": 4132 + }, + { + "epoch": 0.9588214824266327, + "grad_norm": 15.711678066427929, + "learning_rate": 2e-06, + "loss": 0.2752, + "step": 4133 + }, + { + "epoch": 0.9590534740749334, + "grad_norm": 5.923996766497805, + "learning_rate": 2e-06, + "loss": 0.2064, + "step": 4134 + }, + { + "epoch": 0.9592854657232339, + "grad_norm": 9.744245529377244, + "learning_rate": 2e-06, + "loss": 0.3068, + "step": 4135 + }, + { + "epoch": 0.9595174573715346, + "grad_norm": 8.475805972629109, + "learning_rate": 2e-06, + "loss": 0.2627, + "step": 4136 + }, + { + "epoch": 0.9597494490198353, + "grad_norm": 17.447438317238976, + "learning_rate": 2e-06, + "loss": 0.3531, + "step": 4137 + }, + { + "epoch": 0.959981440668136, + "grad_norm": 10.248204605683624, + "learning_rate": 2e-06, + "loss": 0.3174, + "step": 4138 + }, + { + "epoch": 0.9602134323164366, + "grad_norm": 9.349030328082655, + "learning_rate": 2e-06, + "loss": 0.2251, + "step": 4139 + }, + { + "epoch": 0.9604454239647373, + "grad_norm": 10.040785549242996, + "learning_rate": 2e-06, + "loss": 0.3646, + "step": 4140 + }, + { + "epoch": 0.9606774156130379, + "grad_norm": 17.99881429560681, + "learning_rate": 2e-06, + "loss": 0.3179, + "step": 4141 + }, + { + "epoch": 0.9609094072613386, + "grad_norm": 23.15821161884971, + "learning_rate": 2e-06, + "loss": 0.3449, + "step": 4142 + }, + { + "epoch": 0.9611413989096392, + "grad_norm": 11.559981381311555, + "learning_rate": 2e-06, + "loss": 0.2579, + "step": 4143 + }, + { + "epoch": 0.9613733905579399, + "grad_norm": 16.29055174473012, + "learning_rate": 2e-06, + "loss": 0.2503, + "step": 4144 + }, + { + "epoch": 0.9616053822062406, + "grad_norm": 11.7947484697283, + "learning_rate": 2e-06, + "loss": 0.2859, + "step": 4145 + }, + { + "epoch": 0.9618373738545413, + "grad_norm": 19.286142596191475, + "learning_rate": 2e-06, + "loss": 0.3261, + "step": 4146 + }, + { + "epoch": 0.9620693655028419, + "grad_norm": 9.755826226950822, + "learning_rate": 2e-06, + "loss": 0.2265, + "step": 4147 + }, + { + "epoch": 0.9623013571511425, + "grad_norm": 16.999414195147992, + "learning_rate": 2e-06, + "loss": 0.2519, + "step": 4148 + }, + { + "epoch": 0.9625333487994432, + "grad_norm": 9.47299950537098, + "learning_rate": 2e-06, + "loss": 0.1947, + "step": 4149 + }, + { + "epoch": 0.9627653404477439, + "grad_norm": 7.826711210558996, + "learning_rate": 2e-06, + "loss": 0.2812, + "step": 4150 + }, + { + "epoch": 0.9629973320960445, + "grad_norm": 10.426702042786996, + "learning_rate": 2e-06, + "loss": 0.2236, + "step": 4151 + }, + { + "epoch": 0.9632293237443452, + "grad_norm": 15.108961521422483, + "learning_rate": 2e-06, + "loss": 0.3395, + "step": 4152 + }, + { + "epoch": 0.9634613153926459, + "grad_norm": 9.802328672252278, + "learning_rate": 2e-06, + "loss": 0.2706, + "step": 4153 + }, + { + "epoch": 0.9636933070409466, + "grad_norm": 18.037924430831286, + "learning_rate": 2e-06, + "loss": 0.2694, + "step": 4154 + }, + { + "epoch": 0.9639252986892471, + "grad_norm": 8.017284974062889, + "learning_rate": 2e-06, + "loss": 0.2999, + "step": 4155 + }, + { + "epoch": 0.9641572903375478, + "grad_norm": 12.34805634741516, + "learning_rate": 2e-06, + "loss": 0.255, + "step": 4156 + }, + { + "epoch": 0.9643892819858485, + "grad_norm": 16.752460181250132, + "learning_rate": 2e-06, + "loss": 0.2908, + "step": 4157 + }, + { + "epoch": 0.9646212736341492, + "grad_norm": 16.975203492078528, + "learning_rate": 2e-06, + "loss": 0.3314, + "step": 4158 + }, + { + "epoch": 0.9648532652824499, + "grad_norm": 18.677633298719076, + "learning_rate": 2e-06, + "loss": 0.2746, + "step": 4159 + }, + { + "epoch": 0.9650852569307505, + "grad_norm": 16.169374674291078, + "learning_rate": 2e-06, + "loss": 0.3081, + "step": 4160 + }, + { + "epoch": 0.9653172485790511, + "grad_norm": 8.690820570400597, + "learning_rate": 2e-06, + "loss": 0.2812, + "step": 4161 + }, + { + "epoch": 0.9655492402273518, + "grad_norm": 18.24990491642374, + "learning_rate": 2e-06, + "loss": 0.2228, + "step": 4162 + }, + { + "epoch": 0.9657812318756525, + "grad_norm": 7.2511972849280015, + "learning_rate": 2e-06, + "loss": 0.294, + "step": 4163 + }, + { + "epoch": 0.9660132235239531, + "grad_norm": 14.82011859195889, + "learning_rate": 2e-06, + "loss": 0.2779, + "step": 4164 + }, + { + "epoch": 0.9662452151722538, + "grad_norm": 8.815089700662222, + "learning_rate": 2e-06, + "loss": 0.1979, + "step": 4165 + }, + { + "epoch": 0.9664772068205545, + "grad_norm": 11.440596644880184, + "learning_rate": 2e-06, + "loss": 0.2995, + "step": 4166 + }, + { + "epoch": 0.9667091984688552, + "grad_norm": 21.152456518163454, + "learning_rate": 2e-06, + "loss": 0.3035, + "step": 4167 + }, + { + "epoch": 0.9669411901171557, + "grad_norm": 22.36876306470267, + "learning_rate": 2e-06, + "loss": 0.3341, + "step": 4168 + }, + { + "epoch": 0.9671731817654564, + "grad_norm": 9.240470420717966, + "learning_rate": 2e-06, + "loss": 0.2977, + "step": 4169 + }, + { + "epoch": 0.9674051734137571, + "grad_norm": 12.167277516648443, + "learning_rate": 2e-06, + "loss": 0.2749, + "step": 4170 + }, + { + "epoch": 0.9676371650620578, + "grad_norm": 5.054791610066648, + "learning_rate": 2e-06, + "loss": 0.2408, + "step": 4171 + }, + { + "epoch": 0.9678691567103584, + "grad_norm": 6.7646904021890455, + "learning_rate": 2e-06, + "loss": 0.2495, + "step": 4172 + }, + { + "epoch": 0.9681011483586591, + "grad_norm": 16.408927035270644, + "learning_rate": 2e-06, + "loss": 0.2613, + "step": 4173 + }, + { + "epoch": 0.9683331400069598, + "grad_norm": 14.037063486759456, + "learning_rate": 2e-06, + "loss": 0.3029, + "step": 4174 + }, + { + "epoch": 0.9685651316552604, + "grad_norm": 13.446123483781111, + "learning_rate": 2e-06, + "loss": 0.2195, + "step": 4175 + }, + { + "epoch": 0.968797123303561, + "grad_norm": 12.642700280341838, + "learning_rate": 2e-06, + "loss": 0.2894, + "step": 4176 + }, + { + "epoch": 0.9690291149518617, + "grad_norm": 10.132367629156734, + "learning_rate": 2e-06, + "loss": 0.253, + "step": 4177 + }, + { + "epoch": 0.9692611066001624, + "grad_norm": 8.498917805369992, + "learning_rate": 2e-06, + "loss": 0.3635, + "step": 4178 + }, + { + "epoch": 0.9694930982484631, + "grad_norm": 13.209622449831079, + "learning_rate": 2e-06, + "loss": 0.2891, + "step": 4179 + }, + { + "epoch": 0.9697250898967638, + "grad_norm": 12.29051439378758, + "learning_rate": 2e-06, + "loss": 0.3119, + "step": 4180 + }, + { + "epoch": 0.9699570815450643, + "grad_norm": 21.12653821260995, + "learning_rate": 2e-06, + "loss": 0.3552, + "step": 4181 + }, + { + "epoch": 0.970189073193365, + "grad_norm": 5.3595942600030115, + "learning_rate": 2e-06, + "loss": 0.1905, + "step": 4182 + }, + { + "epoch": 0.9704210648416657, + "grad_norm": 10.134794135814035, + "learning_rate": 2e-06, + "loss": 0.2624, + "step": 4183 + }, + { + "epoch": 0.9706530564899664, + "grad_norm": 21.330682910796718, + "learning_rate": 2e-06, + "loss": 0.311, + "step": 4184 + }, + { + "epoch": 0.970885048138267, + "grad_norm": 14.964958740540329, + "learning_rate": 2e-06, + "loss": 0.2662, + "step": 4185 + }, + { + "epoch": 0.9711170397865677, + "grad_norm": 11.493977780236726, + "learning_rate": 2e-06, + "loss": 0.2423, + "step": 4186 + }, + { + "epoch": 0.9713490314348684, + "grad_norm": 16.30578846347033, + "learning_rate": 2e-06, + "loss": 0.2295, + "step": 4187 + }, + { + "epoch": 0.971581023083169, + "grad_norm": 21.62575639230587, + "learning_rate": 2e-06, + "loss": 0.2544, + "step": 4188 + }, + { + "epoch": 0.9718130147314696, + "grad_norm": 14.537087718010312, + "learning_rate": 2e-06, + "loss": 0.2882, + "step": 4189 + }, + { + "epoch": 0.9720450063797703, + "grad_norm": 13.378301810685581, + "learning_rate": 2e-06, + "loss": 0.326, + "step": 4190 + }, + { + "epoch": 0.972276998028071, + "grad_norm": 14.564670167831924, + "learning_rate": 2e-06, + "loss": 0.2357, + "step": 4191 + }, + { + "epoch": 0.9725089896763717, + "grad_norm": 11.895665121896728, + "learning_rate": 2e-06, + "loss": 0.2513, + "step": 4192 + }, + { + "epoch": 0.9727409813246723, + "grad_norm": 29.173504877419095, + "learning_rate": 2e-06, + "loss": 0.2729, + "step": 4193 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 18.29224069723669, + "learning_rate": 2e-06, + "loss": 0.2807, + "step": 4194 + }, + { + "epoch": 0.9732049646212736, + "grad_norm": 10.68192300084348, + "learning_rate": 2e-06, + "loss": 0.2298, + "step": 4195 + }, + { + "epoch": 0.9734369562695743, + "grad_norm": 21.485573797924964, + "learning_rate": 2e-06, + "loss": 0.3444, + "step": 4196 + }, + { + "epoch": 0.973668947917875, + "grad_norm": 7.255065307915454, + "learning_rate": 2e-06, + "loss": 0.2094, + "step": 4197 + }, + { + "epoch": 0.9739009395661756, + "grad_norm": 11.98053928489005, + "learning_rate": 2e-06, + "loss": 0.2398, + "step": 4198 + }, + { + "epoch": 0.9741329312144763, + "grad_norm": 16.306959457810798, + "learning_rate": 2e-06, + "loss": 0.2924, + "step": 4199 + }, + { + "epoch": 0.974364922862777, + "grad_norm": 15.066723005130221, + "learning_rate": 2e-06, + "loss": 0.2981, + "step": 4200 + }, + { + "epoch": 0.9745969145110776, + "grad_norm": 17.8311141003004, + "learning_rate": 2e-06, + "loss": 0.2809, + "step": 4201 + }, + { + "epoch": 0.9748289061593782, + "grad_norm": 17.330070730465206, + "learning_rate": 2e-06, + "loss": 0.3275, + "step": 4202 + }, + { + "epoch": 0.9750608978076789, + "grad_norm": 13.899796439940532, + "learning_rate": 2e-06, + "loss": 0.2049, + "step": 4203 + }, + { + "epoch": 0.9752928894559796, + "grad_norm": 12.515907162025195, + "learning_rate": 2e-06, + "loss": 0.2258, + "step": 4204 + }, + { + "epoch": 0.9755248811042803, + "grad_norm": 14.070388956916347, + "learning_rate": 2e-06, + "loss": 0.3706, + "step": 4205 + }, + { + "epoch": 0.9757568727525809, + "grad_norm": 9.998273054313804, + "learning_rate": 2e-06, + "loss": 0.2585, + "step": 4206 + }, + { + "epoch": 0.9759888644008816, + "grad_norm": 17.59697288259026, + "learning_rate": 2e-06, + "loss": 0.2893, + "step": 4207 + }, + { + "epoch": 0.9762208560491822, + "grad_norm": 10.29582133296254, + "learning_rate": 2e-06, + "loss": 0.1864, + "step": 4208 + }, + { + "epoch": 0.9764528476974829, + "grad_norm": 15.152312481568451, + "learning_rate": 2e-06, + "loss": 0.2662, + "step": 4209 + }, + { + "epoch": 0.9766848393457835, + "grad_norm": 13.491237354588291, + "learning_rate": 2e-06, + "loss": 0.2564, + "step": 4210 + }, + { + "epoch": 0.9769168309940842, + "grad_norm": 22.77241362575545, + "learning_rate": 2e-06, + "loss": 0.3766, + "step": 4211 + }, + { + "epoch": 0.9771488226423849, + "grad_norm": 24.502660636251704, + "learning_rate": 2e-06, + "loss": 0.3272, + "step": 4212 + }, + { + "epoch": 0.9773808142906856, + "grad_norm": 24.914401016942282, + "learning_rate": 2e-06, + "loss": 0.288, + "step": 4213 + }, + { + "epoch": 0.9776128059389863, + "grad_norm": 11.111214277761023, + "learning_rate": 2e-06, + "loss": 0.2549, + "step": 4214 + }, + { + "epoch": 0.9778447975872868, + "grad_norm": 11.49523120194123, + "learning_rate": 2e-06, + "loss": 0.317, + "step": 4215 + }, + { + "epoch": 0.9780767892355875, + "grad_norm": 9.055681111603517, + "learning_rate": 2e-06, + "loss": 0.2275, + "step": 4216 + }, + { + "epoch": 0.9783087808838882, + "grad_norm": 15.174180810545446, + "learning_rate": 2e-06, + "loss": 0.2826, + "step": 4217 + }, + { + "epoch": 0.9785407725321889, + "grad_norm": 13.401080527534248, + "learning_rate": 2e-06, + "loss": 0.2857, + "step": 4218 + }, + { + "epoch": 0.9787727641804895, + "grad_norm": 6.164648447458631, + "learning_rate": 2e-06, + "loss": 0.1799, + "step": 4219 + }, + { + "epoch": 0.9790047558287902, + "grad_norm": 27.775235280358498, + "learning_rate": 2e-06, + "loss": 0.3474, + "step": 4220 + }, + { + "epoch": 0.9792367474770908, + "grad_norm": 18.37992954725875, + "learning_rate": 2e-06, + "loss": 0.229, + "step": 4221 + }, + { + "epoch": 0.9794687391253915, + "grad_norm": 13.19515503304382, + "learning_rate": 2e-06, + "loss": 0.3051, + "step": 4222 + }, + { + "epoch": 0.9797007307736921, + "grad_norm": 11.132635370674684, + "learning_rate": 2e-06, + "loss": 0.2385, + "step": 4223 + }, + { + "epoch": 0.9799327224219928, + "grad_norm": 8.405337802905594, + "learning_rate": 2e-06, + "loss": 0.1608, + "step": 4224 + }, + { + "epoch": 0.9801647140702935, + "grad_norm": 13.917574725560614, + "learning_rate": 2e-06, + "loss": 0.3205, + "step": 4225 + }, + { + "epoch": 0.9803967057185942, + "grad_norm": 6.015763156205712, + "learning_rate": 2e-06, + "loss": 0.1753, + "step": 4226 + }, + { + "epoch": 0.9806286973668948, + "grad_norm": 15.86064516096754, + "learning_rate": 2e-06, + "loss": 0.3311, + "step": 4227 + }, + { + "epoch": 0.9808606890151954, + "grad_norm": 13.07809649383298, + "learning_rate": 2e-06, + "loss": 0.3762, + "step": 4228 + }, + { + "epoch": 0.9810926806634961, + "grad_norm": 22.12385581572482, + "learning_rate": 2e-06, + "loss": 0.3983, + "step": 4229 + }, + { + "epoch": 0.9813246723117968, + "grad_norm": 21.61215045671309, + "learning_rate": 2e-06, + "loss": 0.2543, + "step": 4230 + }, + { + "epoch": 0.9815566639600974, + "grad_norm": 11.469648361559175, + "learning_rate": 2e-06, + "loss": 0.2123, + "step": 4231 + }, + { + "epoch": 0.9817886556083981, + "grad_norm": 22.827779221927173, + "learning_rate": 2e-06, + "loss": 0.4152, + "step": 4232 + }, + { + "epoch": 0.9820206472566988, + "grad_norm": 16.671441093942356, + "learning_rate": 2e-06, + "loss": 0.2653, + "step": 4233 + }, + { + "epoch": 0.9822526389049995, + "grad_norm": 15.742088488552337, + "learning_rate": 2e-06, + "loss": 0.1953, + "step": 4234 + }, + { + "epoch": 0.9824846305533, + "grad_norm": 6.266727832106582, + "learning_rate": 2e-06, + "loss": 0.1748, + "step": 4235 + }, + { + "epoch": 0.9827166222016007, + "grad_norm": 15.052647218778917, + "learning_rate": 2e-06, + "loss": 0.2619, + "step": 4236 + }, + { + "epoch": 0.9829486138499014, + "grad_norm": 11.529712668140277, + "learning_rate": 2e-06, + "loss": 0.2717, + "step": 4237 + }, + { + "epoch": 0.9831806054982021, + "grad_norm": 11.626580721336351, + "learning_rate": 2e-06, + "loss": 0.3329, + "step": 4238 + }, + { + "epoch": 0.9834125971465028, + "grad_norm": 17.82590025916798, + "learning_rate": 2e-06, + "loss": 0.3335, + "step": 4239 + }, + { + "epoch": 0.9836445887948034, + "grad_norm": 13.926504229230394, + "learning_rate": 2e-06, + "loss": 0.3854, + "step": 4240 + }, + { + "epoch": 0.983876580443104, + "grad_norm": 7.309050762390744, + "learning_rate": 2e-06, + "loss": 0.2475, + "step": 4241 + }, + { + "epoch": 0.9841085720914047, + "grad_norm": 10.44914747577233, + "learning_rate": 2e-06, + "loss": 0.1732, + "step": 4242 + }, + { + "epoch": 0.9843405637397054, + "grad_norm": 17.92649994514198, + "learning_rate": 2e-06, + "loss": 0.2721, + "step": 4243 + }, + { + "epoch": 0.984572555388006, + "grad_norm": 12.515934818615193, + "learning_rate": 2e-06, + "loss": 0.2515, + "step": 4244 + }, + { + "epoch": 0.9848045470363067, + "grad_norm": 11.349447658048096, + "learning_rate": 2e-06, + "loss": 0.2519, + "step": 4245 + }, + { + "epoch": 0.9850365386846074, + "grad_norm": 17.278232190175704, + "learning_rate": 2e-06, + "loss": 0.2798, + "step": 4246 + }, + { + "epoch": 0.9852685303329081, + "grad_norm": 19.564700818775094, + "learning_rate": 2e-06, + "loss": 0.3114, + "step": 4247 + }, + { + "epoch": 0.9855005219812086, + "grad_norm": 11.075353127881275, + "learning_rate": 2e-06, + "loss": 0.2909, + "step": 4248 + }, + { + "epoch": 0.9857325136295093, + "grad_norm": 12.89124213988945, + "learning_rate": 2e-06, + "loss": 0.1592, + "step": 4249 + }, + { + "epoch": 0.98596450527781, + "grad_norm": 16.98800196949275, + "learning_rate": 2e-06, + "loss": 0.3969, + "step": 4250 + }, + { + "epoch": 0.9861964969261107, + "grad_norm": 13.578942902506654, + "learning_rate": 2e-06, + "loss": 0.2777, + "step": 4251 + }, + { + "epoch": 0.9864284885744113, + "grad_norm": 8.1871697074597, + "learning_rate": 2e-06, + "loss": 0.203, + "step": 4252 + }, + { + "epoch": 0.986660480222712, + "grad_norm": 7.040253276385697, + "learning_rate": 2e-06, + "loss": 0.224, + "step": 4253 + }, + { + "epoch": 0.9868924718710126, + "grad_norm": 14.405757347393758, + "learning_rate": 2e-06, + "loss": 0.2295, + "step": 4254 + }, + { + "epoch": 0.9871244635193133, + "grad_norm": 9.033447010628546, + "learning_rate": 2e-06, + "loss": 0.1497, + "step": 4255 + }, + { + "epoch": 0.9873564551676139, + "grad_norm": 20.97446599435015, + "learning_rate": 2e-06, + "loss": 0.1794, + "step": 4256 + }, + { + "epoch": 0.9875884468159146, + "grad_norm": 6.904763491483926, + "learning_rate": 2e-06, + "loss": 0.1891, + "step": 4257 + }, + { + "epoch": 0.9878204384642153, + "grad_norm": 18.688009180195195, + "learning_rate": 2e-06, + "loss": 0.3001, + "step": 4258 + }, + { + "epoch": 0.988052430112516, + "grad_norm": 13.364486799681748, + "learning_rate": 2e-06, + "loss": 0.2617, + "step": 4259 + }, + { + "epoch": 0.9882844217608167, + "grad_norm": 15.973372169182674, + "learning_rate": 2e-06, + "loss": 0.2303, + "step": 4260 + }, + { + "epoch": 0.9885164134091172, + "grad_norm": 10.556847505773627, + "learning_rate": 2e-06, + "loss": 0.3017, + "step": 4261 + }, + { + "epoch": 0.9887484050574179, + "grad_norm": 23.634222130250084, + "learning_rate": 2e-06, + "loss": 0.4199, + "step": 4262 + }, + { + "epoch": 0.9889803967057186, + "grad_norm": 16.572984667619835, + "learning_rate": 2e-06, + "loss": 0.3701, + "step": 4263 + }, + { + "epoch": 0.9892123883540193, + "grad_norm": 14.404180757195148, + "learning_rate": 2e-06, + "loss": 0.332, + "step": 4264 + }, + { + "epoch": 0.9894443800023199, + "grad_norm": 13.811304035079404, + "learning_rate": 2e-06, + "loss": 0.2003, + "step": 4265 + }, + { + "epoch": 0.9896763716506206, + "grad_norm": 11.8826651967571, + "learning_rate": 2e-06, + "loss": 0.333, + "step": 4266 + }, + { + "epoch": 0.9899083632989213, + "grad_norm": 14.923848968766313, + "learning_rate": 2e-06, + "loss": 0.4666, + "step": 4267 + }, + { + "epoch": 0.9901403549472219, + "grad_norm": 19.934166664100598, + "learning_rate": 2e-06, + "loss": 0.3621, + "step": 4268 + }, + { + "epoch": 0.9903723465955225, + "grad_norm": 10.548852262130172, + "learning_rate": 2e-06, + "loss": 0.2605, + "step": 4269 + }, + { + "epoch": 0.9906043382438232, + "grad_norm": 17.044978616656284, + "learning_rate": 2e-06, + "loss": 0.2738, + "step": 4270 + }, + { + "epoch": 0.9908363298921239, + "grad_norm": 7.3678955030967, + "learning_rate": 2e-06, + "loss": 0.1474, + "step": 4271 + }, + { + "epoch": 0.9910683215404246, + "grad_norm": 18.214716715347542, + "learning_rate": 2e-06, + "loss": 0.4379, + "step": 4272 + }, + { + "epoch": 0.9913003131887252, + "grad_norm": 12.945968711053286, + "learning_rate": 2e-06, + "loss": 0.2377, + "step": 4273 + }, + { + "epoch": 0.9915323048370258, + "grad_norm": 17.728922143117316, + "learning_rate": 2e-06, + "loss": 0.3382, + "step": 4274 + }, + { + "epoch": 0.9917642964853265, + "grad_norm": 18.646348539918346, + "learning_rate": 2e-06, + "loss": 0.3737, + "step": 4275 + }, + { + "epoch": 0.9919962881336272, + "grad_norm": 17.22778162656779, + "learning_rate": 2e-06, + "loss": 0.281, + "step": 4276 + }, + { + "epoch": 0.9922282797819278, + "grad_norm": 12.540238388456121, + "learning_rate": 2e-06, + "loss": 0.3496, + "step": 4277 + }, + { + "epoch": 0.9924602714302285, + "grad_norm": 9.732713018493623, + "learning_rate": 2e-06, + "loss": 0.2186, + "step": 4278 + }, + { + "epoch": 0.9926922630785292, + "grad_norm": 7.953560031755493, + "learning_rate": 2e-06, + "loss": 0.2391, + "step": 4279 + }, + { + "epoch": 0.9929242547268299, + "grad_norm": 11.722711090296285, + "learning_rate": 2e-06, + "loss": 0.2226, + "step": 4280 + }, + { + "epoch": 0.9931562463751304, + "grad_norm": 18.63265751918661, + "learning_rate": 2e-06, + "loss": 0.2999, + "step": 4281 + }, + { + "epoch": 0.9933882380234311, + "grad_norm": 10.19449522121469, + "learning_rate": 2e-06, + "loss": 0.2203, + "step": 4282 + }, + { + "epoch": 0.9936202296717318, + "grad_norm": 18.11672937369853, + "learning_rate": 2e-06, + "loss": 0.3204, + "step": 4283 + }, + { + "epoch": 0.9938522213200325, + "grad_norm": 16.01830303614881, + "learning_rate": 2e-06, + "loss": 0.3197, + "step": 4284 + }, + { + "epoch": 0.9940842129683332, + "grad_norm": 17.3963087347634, + "learning_rate": 2e-06, + "loss": 0.3376, + "step": 4285 + }, + { + "epoch": 0.9943162046166338, + "grad_norm": 17.667883565032273, + "learning_rate": 2e-06, + "loss": 0.3051, + "step": 4286 + }, + { + "epoch": 0.9945481962649345, + "grad_norm": 8.313509916883348, + "learning_rate": 2e-06, + "loss": 0.2424, + "step": 4287 + }, + { + "epoch": 0.9947801879132351, + "grad_norm": 21.853369792188385, + "learning_rate": 2e-06, + "loss": 0.3898, + "step": 4288 + }, + { + "epoch": 0.9950121795615358, + "grad_norm": 12.909500013417489, + "learning_rate": 2e-06, + "loss": 0.2194, + "step": 4289 + }, + { + "epoch": 0.9952441712098364, + "grad_norm": 15.270128959373558, + "learning_rate": 2e-06, + "loss": 0.2795, + "step": 4290 + }, + { + "epoch": 0.9954761628581371, + "grad_norm": 21.79462981369358, + "learning_rate": 2e-06, + "loss": 0.339, + "step": 4291 + }, + { + "epoch": 0.9957081545064378, + "grad_norm": 19.88564636383995, + "learning_rate": 2e-06, + "loss": 0.3024, + "step": 4292 + }, + { + "epoch": 0.9959401461547385, + "grad_norm": 14.399655142947225, + "learning_rate": 2e-06, + "loss": 0.2895, + "step": 4293 + }, + { + "epoch": 0.996172137803039, + "grad_norm": 19.85949576833494, + "learning_rate": 2e-06, + "loss": 0.3431, + "step": 4294 + }, + { + "epoch": 0.9964041294513397, + "grad_norm": 11.683854205528311, + "learning_rate": 2e-06, + "loss": 0.2807, + "step": 4295 + }, + { + "epoch": 0.9966361210996404, + "grad_norm": 17.951758210842055, + "learning_rate": 2e-06, + "loss": 0.3739, + "step": 4296 + }, + { + "epoch": 0.9968681127479411, + "grad_norm": 21.711123070741547, + "learning_rate": 2e-06, + "loss": 0.2775, + "step": 4297 + }, + { + "epoch": 0.9971001043962417, + "grad_norm": 12.616016808487865, + "learning_rate": 2e-06, + "loss": 0.2993, + "step": 4298 + }, + { + "epoch": 0.9973320960445424, + "grad_norm": 20.280139049312588, + "learning_rate": 2e-06, + "loss": 0.2819, + "step": 4299 + }, + { + "epoch": 0.9975640876928431, + "grad_norm": 12.354640230223774, + "learning_rate": 2e-06, + "loss": 0.264, + "step": 4300 + }, + { + "epoch": 0.9977960793411437, + "grad_norm": 8.04778189340334, + "learning_rate": 2e-06, + "loss": 0.3385, + "step": 4301 + }, + { + "epoch": 0.9980280709894443, + "grad_norm": 13.08075846529473, + "learning_rate": 2e-06, + "loss": 0.1973, + "step": 4302 + }, + { + "epoch": 0.998260062637745, + "grad_norm": 12.520305141646622, + "learning_rate": 2e-06, + "loss": 0.3255, + "step": 4303 + }, + { + "epoch": 0.9984920542860457, + "grad_norm": 11.281683034711, + "learning_rate": 2e-06, + "loss": 0.2641, + "step": 4304 + }, + { + "epoch": 0.9987240459343464, + "grad_norm": 9.467801053894949, + "learning_rate": 2e-06, + "loss": 0.2862, + "step": 4305 + }, + { + "epoch": 0.9989560375826471, + "grad_norm": 10.738481898576978, + "learning_rate": 2e-06, + "loss": 0.2807, + "step": 4306 + }, + { + "epoch": 0.9991880292309477, + "grad_norm": 12.40324963497283, + "learning_rate": 2e-06, + "loss": 0.2784, + "step": 4307 + }, + { + "epoch": 0.9994200208792483, + "grad_norm": 17.279743062019485, + "learning_rate": 2e-06, + "loss": 0.3601, + "step": 4308 + }, + { + "epoch": 0.999652012527549, + "grad_norm": 12.254255842119658, + "learning_rate": 2e-06, + "loss": 0.2333, + "step": 4309 + }, + { + "epoch": 0.9998840041758497, + "grad_norm": 15.644995614708913, + "learning_rate": 2e-06, + "loss": 0.301, + "step": 4310 + }, + { + "epoch": 1.0001159958241503, + "grad_norm": 16.718302350093527, + "learning_rate": 2e-06, + "loss": 0.3431, + "step": 4311 + }, + { + "epoch": 1.000347987472451, + "grad_norm": 5.981456267975129, + "learning_rate": 2e-06, + "loss": 0.278, + "step": 4312 + }, + { + "epoch": 1.0005799791207517, + "grad_norm": 11.01508633788959, + "learning_rate": 2e-06, + "loss": 0.2755, + "step": 4313 + }, + { + "epoch": 1.0008119707690524, + "grad_norm": 10.28879898371332, + "learning_rate": 2e-06, + "loss": 0.2578, + "step": 4314 + }, + { + "epoch": 1.001043962417353, + "grad_norm": 19.773665483140853, + "learning_rate": 2e-06, + "loss": 0.427, + "step": 4315 + }, + { + "epoch": 1.0012759540656537, + "grad_norm": 14.035676093485598, + "learning_rate": 2e-06, + "loss": 0.3006, + "step": 4316 + }, + { + "epoch": 1.0015079457139544, + "grad_norm": 13.1091865357948, + "learning_rate": 2e-06, + "loss": 0.2829, + "step": 4317 + }, + { + "epoch": 1.0017399373622549, + "grad_norm": 19.786693874665463, + "learning_rate": 2e-06, + "loss": 0.319, + "step": 4318 + }, + { + "epoch": 1.0019719290105555, + "grad_norm": 7.270669487731423, + "learning_rate": 2e-06, + "loss": 0.2108, + "step": 4319 + }, + { + "epoch": 1.0022039206588562, + "grad_norm": 20.17286026961784, + "learning_rate": 2e-06, + "loss": 0.3011, + "step": 4320 + }, + { + "epoch": 1.002435912307157, + "grad_norm": 12.587046558200004, + "learning_rate": 2e-06, + "loss": 0.2785, + "step": 4321 + }, + { + "epoch": 1.0026679039554576, + "grad_norm": 8.869916800554623, + "learning_rate": 2e-06, + "loss": 0.2481, + "step": 4322 + }, + { + "epoch": 1.0028998956037583, + "grad_norm": 12.136751295143798, + "learning_rate": 2e-06, + "loss": 0.264, + "step": 4323 + }, + { + "epoch": 1.003131887252059, + "grad_norm": 8.667913267118324, + "learning_rate": 2e-06, + "loss": 0.1941, + "step": 4324 + }, + { + "epoch": 1.0033638789003596, + "grad_norm": 14.403161655669786, + "learning_rate": 2e-06, + "loss": 0.2535, + "step": 4325 + }, + { + "epoch": 1.0035958705486603, + "grad_norm": 18.612318169636975, + "learning_rate": 2e-06, + "loss": 0.2766, + "step": 4326 + }, + { + "epoch": 1.003827862196961, + "grad_norm": 16.99645141292192, + "learning_rate": 2e-06, + "loss": 0.363, + "step": 4327 + }, + { + "epoch": 1.0040598538452616, + "grad_norm": 13.751830433854773, + "learning_rate": 2e-06, + "loss": 0.155, + "step": 4328 + }, + { + "epoch": 1.0042918454935623, + "grad_norm": 13.671771418605278, + "learning_rate": 2e-06, + "loss": 0.2716, + "step": 4329 + }, + { + "epoch": 1.004523837141863, + "grad_norm": 18.752642833061604, + "learning_rate": 2e-06, + "loss": 0.3661, + "step": 4330 + }, + { + "epoch": 1.0047558287901635, + "grad_norm": 14.847783747948847, + "learning_rate": 2e-06, + "loss": 0.2702, + "step": 4331 + }, + { + "epoch": 1.0049878204384641, + "grad_norm": 17.885233787267723, + "learning_rate": 2e-06, + "loss": 0.3698, + "step": 4332 + }, + { + "epoch": 1.0052198120867648, + "grad_norm": 27.13789320974805, + "learning_rate": 2e-06, + "loss": 0.3145, + "step": 4333 + }, + { + "epoch": 1.0054518037350655, + "grad_norm": 13.304508645892248, + "learning_rate": 2e-06, + "loss": 0.2326, + "step": 4334 + }, + { + "epoch": 1.0056837953833662, + "grad_norm": 9.862556153578181, + "learning_rate": 2e-06, + "loss": 0.1547, + "step": 4335 + }, + { + "epoch": 1.0059157870316668, + "grad_norm": 10.683912066124044, + "learning_rate": 2e-06, + "loss": 0.3168, + "step": 4336 + }, + { + "epoch": 1.0061477786799675, + "grad_norm": 12.134907278341815, + "learning_rate": 2e-06, + "loss": 0.2847, + "step": 4337 + }, + { + "epoch": 1.0063797703282682, + "grad_norm": 16.990495829795368, + "learning_rate": 2e-06, + "loss": 0.1884, + "step": 4338 + }, + { + "epoch": 1.0066117619765689, + "grad_norm": 11.49411437654733, + "learning_rate": 2e-06, + "loss": 0.2447, + "step": 4339 + }, + { + "epoch": 1.0068437536248696, + "grad_norm": 14.995630381719709, + "learning_rate": 2e-06, + "loss": 0.2222, + "step": 4340 + }, + { + "epoch": 1.0070757452731702, + "grad_norm": 13.971606657098535, + "learning_rate": 2e-06, + "loss": 0.2872, + "step": 4341 + }, + { + "epoch": 1.007307736921471, + "grad_norm": 20.96371526573696, + "learning_rate": 2e-06, + "loss": 0.2905, + "step": 4342 + }, + { + "epoch": 1.0075397285697716, + "grad_norm": 9.266262013323754, + "learning_rate": 2e-06, + "loss": 0.1939, + "step": 4343 + }, + { + "epoch": 1.007771720218072, + "grad_norm": 19.77717418896224, + "learning_rate": 2e-06, + "loss": 0.3568, + "step": 4344 + }, + { + "epoch": 1.0080037118663727, + "grad_norm": 12.26230275832593, + "learning_rate": 2e-06, + "loss": 0.3573, + "step": 4345 + }, + { + "epoch": 1.0082357035146734, + "grad_norm": 14.19253024029564, + "learning_rate": 2e-06, + "loss": 0.3354, + "step": 4346 + }, + { + "epoch": 1.008467695162974, + "grad_norm": 10.803220157659359, + "learning_rate": 2e-06, + "loss": 0.27, + "step": 4347 + }, + { + "epoch": 1.0086996868112748, + "grad_norm": 23.124982254178747, + "learning_rate": 2e-06, + "loss": 0.3364, + "step": 4348 + }, + { + "epoch": 1.0089316784595754, + "grad_norm": 9.731324037618686, + "learning_rate": 2e-06, + "loss": 0.1988, + "step": 4349 + }, + { + "epoch": 1.009163670107876, + "grad_norm": 7.93294592056329, + "learning_rate": 2e-06, + "loss": 0.2416, + "step": 4350 + }, + { + "epoch": 1.0093956617561768, + "grad_norm": 14.057267162792947, + "learning_rate": 2e-06, + "loss": 0.32, + "step": 4351 + }, + { + "epoch": 1.0096276534044775, + "grad_norm": 13.900748666564924, + "learning_rate": 2e-06, + "loss": 0.2171, + "step": 4352 + }, + { + "epoch": 1.0098596450527781, + "grad_norm": 8.245172013413677, + "learning_rate": 2e-06, + "loss": 0.2147, + "step": 4353 + }, + { + "epoch": 1.0100916367010788, + "grad_norm": 10.271433355787496, + "learning_rate": 2e-06, + "loss": 0.1919, + "step": 4354 + }, + { + "epoch": 1.0103236283493795, + "grad_norm": 8.194394625100914, + "learning_rate": 2e-06, + "loss": 0.1986, + "step": 4355 + }, + { + "epoch": 1.0105556199976802, + "grad_norm": 15.079051609451382, + "learning_rate": 2e-06, + "loss": 0.2572, + "step": 4356 + }, + { + "epoch": 1.0107876116459809, + "grad_norm": 15.291543050990853, + "learning_rate": 2e-06, + "loss": 0.4196, + "step": 4357 + }, + { + "epoch": 1.0110196032942813, + "grad_norm": 17.82830434225328, + "learning_rate": 2e-06, + "loss": 0.2908, + "step": 4358 + }, + { + "epoch": 1.011251594942582, + "grad_norm": 12.837214688662794, + "learning_rate": 2e-06, + "loss": 0.3404, + "step": 4359 + }, + { + "epoch": 1.0114835865908827, + "grad_norm": 11.901047019390361, + "learning_rate": 2e-06, + "loss": 0.2712, + "step": 4360 + }, + { + "epoch": 1.0117155782391833, + "grad_norm": 23.758163883570887, + "learning_rate": 2e-06, + "loss": 0.3621, + "step": 4361 + }, + { + "epoch": 1.011947569887484, + "grad_norm": 11.482949711438708, + "learning_rate": 2e-06, + "loss": 0.147, + "step": 4362 + }, + { + "epoch": 1.0121795615357847, + "grad_norm": 10.940010722924, + "learning_rate": 2e-06, + "loss": 0.317, + "step": 4363 + }, + { + "epoch": 1.0124115531840854, + "grad_norm": 9.369929809574218, + "learning_rate": 2e-06, + "loss": 0.1951, + "step": 4364 + }, + { + "epoch": 1.012643544832386, + "grad_norm": 9.403910759231197, + "learning_rate": 2e-06, + "loss": 0.2646, + "step": 4365 + }, + { + "epoch": 1.0128755364806867, + "grad_norm": 11.168686934520977, + "learning_rate": 2e-06, + "loss": 0.2605, + "step": 4366 + }, + { + "epoch": 1.0131075281289874, + "grad_norm": 11.041500841883662, + "learning_rate": 2e-06, + "loss": 0.3602, + "step": 4367 + }, + { + "epoch": 1.013339519777288, + "grad_norm": 9.369178993030669, + "learning_rate": 2e-06, + "loss": 0.2316, + "step": 4368 + }, + { + "epoch": 1.0135715114255888, + "grad_norm": 12.33230547811149, + "learning_rate": 2e-06, + "loss": 0.1931, + "step": 4369 + }, + { + "epoch": 1.0138035030738894, + "grad_norm": 13.487586445637827, + "learning_rate": 2e-06, + "loss": 0.3596, + "step": 4370 + }, + { + "epoch": 1.01403549472219, + "grad_norm": 15.733338825367728, + "learning_rate": 2e-06, + "loss": 0.214, + "step": 4371 + }, + { + "epoch": 1.0142674863704906, + "grad_norm": 14.042948766828657, + "learning_rate": 2e-06, + "loss": 0.2801, + "step": 4372 + }, + { + "epoch": 1.0144994780187913, + "grad_norm": 10.908660927650285, + "learning_rate": 2e-06, + "loss": 0.2215, + "step": 4373 + }, + { + "epoch": 1.014731469667092, + "grad_norm": 12.053128384428641, + "learning_rate": 2e-06, + "loss": 0.2202, + "step": 4374 + }, + { + "epoch": 1.0149634613153926, + "grad_norm": 15.94150176034634, + "learning_rate": 2e-06, + "loss": 0.34, + "step": 4375 + }, + { + "epoch": 1.0151954529636933, + "grad_norm": 30.488268098804266, + "learning_rate": 2e-06, + "loss": 0.3275, + "step": 4376 + }, + { + "epoch": 1.015427444611994, + "grad_norm": 16.418756536831573, + "learning_rate": 2e-06, + "loss": 0.2529, + "step": 4377 + }, + { + "epoch": 1.0156594362602946, + "grad_norm": 26.929167976037675, + "learning_rate": 2e-06, + "loss": 0.3632, + "step": 4378 + }, + { + "epoch": 1.0158914279085953, + "grad_norm": 13.390221016403952, + "learning_rate": 2e-06, + "loss": 0.1912, + "step": 4379 + }, + { + "epoch": 1.016123419556896, + "grad_norm": 11.176161477331638, + "learning_rate": 2e-06, + "loss": 0.2135, + "step": 4380 + }, + { + "epoch": 1.0163554112051967, + "grad_norm": 19.321719328090147, + "learning_rate": 2e-06, + "loss": 0.38, + "step": 4381 + }, + { + "epoch": 1.0165874028534974, + "grad_norm": 8.73358529268335, + "learning_rate": 2e-06, + "loss": 0.2182, + "step": 4382 + }, + { + "epoch": 1.016819394501798, + "grad_norm": 11.346499255352734, + "learning_rate": 2e-06, + "loss": 0.2417, + "step": 4383 + }, + { + "epoch": 1.0170513861500985, + "grad_norm": 9.501895122254266, + "learning_rate": 2e-06, + "loss": 0.2326, + "step": 4384 + }, + { + "epoch": 1.0172833777983992, + "grad_norm": 12.229807304921541, + "learning_rate": 2e-06, + "loss": 0.2692, + "step": 4385 + }, + { + "epoch": 1.0175153694466998, + "grad_norm": 11.847609278185644, + "learning_rate": 2e-06, + "loss": 0.2968, + "step": 4386 + }, + { + "epoch": 1.0177473610950005, + "grad_norm": 9.549411930106775, + "learning_rate": 2e-06, + "loss": 0.1805, + "step": 4387 + }, + { + "epoch": 1.0179793527433012, + "grad_norm": 9.267107386625788, + "learning_rate": 2e-06, + "loss": 0.2132, + "step": 4388 + }, + { + "epoch": 1.0182113443916019, + "grad_norm": 8.690027982071577, + "learning_rate": 2e-06, + "loss": 0.2782, + "step": 4389 + }, + { + "epoch": 1.0184433360399026, + "grad_norm": 12.795072421966484, + "learning_rate": 2e-06, + "loss": 0.274, + "step": 4390 + }, + { + "epoch": 1.0186753276882032, + "grad_norm": 5.842035700725516, + "learning_rate": 2e-06, + "loss": 0.1685, + "step": 4391 + }, + { + "epoch": 1.018907319336504, + "grad_norm": 14.051283930385816, + "learning_rate": 2e-06, + "loss": 0.254, + "step": 4392 + }, + { + "epoch": 1.0191393109848046, + "grad_norm": 13.348897262207288, + "learning_rate": 2e-06, + "loss": 0.2763, + "step": 4393 + }, + { + "epoch": 1.0193713026331053, + "grad_norm": 22.540889105334582, + "learning_rate": 2e-06, + "loss": 0.3685, + "step": 4394 + }, + { + "epoch": 1.019603294281406, + "grad_norm": 11.525100788667038, + "learning_rate": 2e-06, + "loss": 0.2691, + "step": 4395 + }, + { + "epoch": 1.0198352859297066, + "grad_norm": 18.453868704349077, + "learning_rate": 2e-06, + "loss": 0.3714, + "step": 4396 + }, + { + "epoch": 1.0200672775780073, + "grad_norm": 14.36172001747481, + "learning_rate": 2e-06, + "loss": 0.3594, + "step": 4397 + }, + { + "epoch": 1.0202992692263078, + "grad_norm": 11.360356324324796, + "learning_rate": 2e-06, + "loss": 0.2799, + "step": 4398 + }, + { + "epoch": 1.0205312608746084, + "grad_norm": 15.46130705982928, + "learning_rate": 2e-06, + "loss": 0.2868, + "step": 4399 + }, + { + "epoch": 1.0207632525229091, + "grad_norm": 18.75112773586128, + "learning_rate": 2e-06, + "loss": 0.2558, + "step": 4400 + }, + { + "epoch": 1.0209952441712098, + "grad_norm": 14.047846349720592, + "learning_rate": 2e-06, + "loss": 0.2228, + "step": 4401 + }, + { + "epoch": 1.0212272358195105, + "grad_norm": 8.51175892802764, + "learning_rate": 2e-06, + "loss": 0.2604, + "step": 4402 + }, + { + "epoch": 1.0214592274678111, + "grad_norm": 10.553828364794468, + "learning_rate": 2e-06, + "loss": 0.1464, + "step": 4403 + }, + { + "epoch": 1.0216912191161118, + "grad_norm": 16.277207470334066, + "learning_rate": 2e-06, + "loss": 0.2669, + "step": 4404 + }, + { + "epoch": 1.0219232107644125, + "grad_norm": 14.650722863677906, + "learning_rate": 2e-06, + "loss": 0.4093, + "step": 4405 + }, + { + "epoch": 1.0221552024127132, + "grad_norm": 20.686933586265983, + "learning_rate": 2e-06, + "loss": 0.3219, + "step": 4406 + }, + { + "epoch": 1.0223871940610139, + "grad_norm": 18.149022562966458, + "learning_rate": 2e-06, + "loss": 0.2369, + "step": 4407 + }, + { + "epoch": 1.0226191857093145, + "grad_norm": 13.441727627483365, + "learning_rate": 2e-06, + "loss": 0.2794, + "step": 4408 + }, + { + "epoch": 1.0228511773576152, + "grad_norm": 9.002417763935158, + "learning_rate": 2e-06, + "loss": 0.1919, + "step": 4409 + }, + { + "epoch": 1.023083169005916, + "grad_norm": 9.604553834199644, + "learning_rate": 2e-06, + "loss": 0.1538, + "step": 4410 + }, + { + "epoch": 1.0233151606542163, + "grad_norm": 7.693559744151872, + "learning_rate": 2e-06, + "loss": 0.1894, + "step": 4411 + }, + { + "epoch": 1.023547152302517, + "grad_norm": 13.679783666333623, + "learning_rate": 2e-06, + "loss": 0.2397, + "step": 4412 + }, + { + "epoch": 1.0237791439508177, + "grad_norm": 16.107777090519455, + "learning_rate": 2e-06, + "loss": 0.2707, + "step": 4413 + }, + { + "epoch": 1.0240111355991184, + "grad_norm": 22.498111337105726, + "learning_rate": 2e-06, + "loss": 0.235, + "step": 4414 + }, + { + "epoch": 1.024243127247419, + "grad_norm": 11.17453992197758, + "learning_rate": 2e-06, + "loss": 0.3602, + "step": 4415 + }, + { + "epoch": 1.0244751188957197, + "grad_norm": 13.490648517081917, + "learning_rate": 2e-06, + "loss": 0.2619, + "step": 4416 + }, + { + "epoch": 1.0247071105440204, + "grad_norm": 14.101245578112804, + "learning_rate": 2e-06, + "loss": 0.3315, + "step": 4417 + }, + { + "epoch": 1.024939102192321, + "grad_norm": 14.941819487142078, + "learning_rate": 2e-06, + "loss": 0.2541, + "step": 4418 + }, + { + "epoch": 1.0251710938406218, + "grad_norm": 15.85648435924746, + "learning_rate": 2e-06, + "loss": 0.3405, + "step": 4419 + }, + { + "epoch": 1.0254030854889224, + "grad_norm": 17.142409104258302, + "learning_rate": 2e-06, + "loss": 0.2756, + "step": 4420 + }, + { + "epoch": 1.0256350771372231, + "grad_norm": 14.136208692909497, + "learning_rate": 2e-06, + "loss": 0.3226, + "step": 4421 + }, + { + "epoch": 1.0258670687855238, + "grad_norm": 11.199660197653973, + "learning_rate": 2e-06, + "loss": 0.1979, + "step": 4422 + }, + { + "epoch": 1.0260990604338245, + "grad_norm": 27.70601825277928, + "learning_rate": 2e-06, + "loss": 0.374, + "step": 4423 + }, + { + "epoch": 1.026331052082125, + "grad_norm": 20.308926895318287, + "learning_rate": 2e-06, + "loss": 0.2835, + "step": 4424 + }, + { + "epoch": 1.0265630437304256, + "grad_norm": 11.668644087020201, + "learning_rate": 2e-06, + "loss": 0.159, + "step": 4425 + }, + { + "epoch": 1.0267950353787263, + "grad_norm": 14.05495343536885, + "learning_rate": 2e-06, + "loss": 0.2125, + "step": 4426 + }, + { + "epoch": 1.027027027027027, + "grad_norm": 9.009071699795452, + "learning_rate": 2e-06, + "loss": 0.28, + "step": 4427 + }, + { + "epoch": 1.0272590186753277, + "grad_norm": 24.15482898661228, + "learning_rate": 2e-06, + "loss": 0.2167, + "step": 4428 + }, + { + "epoch": 1.0274910103236283, + "grad_norm": 12.582332412328576, + "learning_rate": 2e-06, + "loss": 0.1751, + "step": 4429 + }, + { + "epoch": 1.027723001971929, + "grad_norm": 21.538185705261743, + "learning_rate": 2e-06, + "loss": 0.3409, + "step": 4430 + }, + { + "epoch": 1.0279549936202297, + "grad_norm": 8.712311921794157, + "learning_rate": 2e-06, + "loss": 0.1466, + "step": 4431 + }, + { + "epoch": 1.0281869852685304, + "grad_norm": 16.378555006915374, + "learning_rate": 2e-06, + "loss": 0.3026, + "step": 4432 + }, + { + "epoch": 1.028418976916831, + "grad_norm": 16.769818815480654, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 4433 + }, + { + "epoch": 1.0286509685651317, + "grad_norm": 7.228007866660501, + "learning_rate": 2e-06, + "loss": 0.2106, + "step": 4434 + }, + { + "epoch": 1.0288829602134324, + "grad_norm": 8.775747363226587, + "learning_rate": 2e-06, + "loss": 0.2117, + "step": 4435 + }, + { + "epoch": 1.029114951861733, + "grad_norm": 9.435564557301497, + "learning_rate": 2e-06, + "loss": 0.2511, + "step": 4436 + }, + { + "epoch": 1.0293469435100335, + "grad_norm": 9.178260857291992, + "learning_rate": 2e-06, + "loss": 0.2053, + "step": 4437 + }, + { + "epoch": 1.0295789351583342, + "grad_norm": 15.10714105935708, + "learning_rate": 2e-06, + "loss": 0.312, + "step": 4438 + }, + { + "epoch": 1.0298109268066349, + "grad_norm": 19.037191167720213, + "learning_rate": 2e-06, + "loss": 0.2424, + "step": 4439 + }, + { + "epoch": 1.0300429184549356, + "grad_norm": 13.052108536860008, + "learning_rate": 2e-06, + "loss": 0.2326, + "step": 4440 + }, + { + "epoch": 1.0302749101032362, + "grad_norm": 7.880553741749966, + "learning_rate": 2e-06, + "loss": 0.1743, + "step": 4441 + }, + { + "epoch": 1.030506901751537, + "grad_norm": 18.99204868125294, + "learning_rate": 2e-06, + "loss": 0.2647, + "step": 4442 + }, + { + "epoch": 1.0307388933998376, + "grad_norm": 41.273427411034746, + "learning_rate": 2e-06, + "loss": 0.2691, + "step": 4443 + }, + { + "epoch": 1.0309708850481383, + "grad_norm": 11.052697660150518, + "learning_rate": 2e-06, + "loss": 0.1936, + "step": 4444 + }, + { + "epoch": 1.031202876696439, + "grad_norm": 13.15387981143392, + "learning_rate": 2e-06, + "loss": 0.3272, + "step": 4445 + }, + { + "epoch": 1.0314348683447396, + "grad_norm": 16.851744488684126, + "learning_rate": 2e-06, + "loss": 0.3239, + "step": 4446 + }, + { + "epoch": 1.0316668599930403, + "grad_norm": 24.65220325923243, + "learning_rate": 2e-06, + "loss": 0.3975, + "step": 4447 + }, + { + "epoch": 1.031898851641341, + "grad_norm": 10.587443256446537, + "learning_rate": 2e-06, + "loss": 0.2074, + "step": 4448 + }, + { + "epoch": 1.0321308432896417, + "grad_norm": 9.47223283101745, + "learning_rate": 2e-06, + "loss": 0.1383, + "step": 4449 + }, + { + "epoch": 1.0323628349379423, + "grad_norm": 19.69363174664466, + "learning_rate": 2e-06, + "loss": 0.348, + "step": 4450 + }, + { + "epoch": 1.0325948265862428, + "grad_norm": 17.418878994857508, + "learning_rate": 2e-06, + "loss": 0.3025, + "step": 4451 + }, + { + "epoch": 1.0328268182345435, + "grad_norm": 17.068050882970642, + "learning_rate": 2e-06, + "loss": 0.1959, + "step": 4452 + }, + { + "epoch": 1.0330588098828442, + "grad_norm": 14.348268274620187, + "learning_rate": 2e-06, + "loss": 0.2605, + "step": 4453 + }, + { + "epoch": 1.0332908015311448, + "grad_norm": 9.200927968182237, + "learning_rate": 2e-06, + "loss": 0.3117, + "step": 4454 + }, + { + "epoch": 1.0335227931794455, + "grad_norm": 11.880208685210507, + "learning_rate": 2e-06, + "loss": 0.3102, + "step": 4455 + }, + { + "epoch": 1.0337547848277462, + "grad_norm": 13.195463192921046, + "learning_rate": 2e-06, + "loss": 0.3442, + "step": 4456 + }, + { + "epoch": 1.0339867764760469, + "grad_norm": 16.39948374883616, + "learning_rate": 2e-06, + "loss": 0.2593, + "step": 4457 + }, + { + "epoch": 1.0342187681243475, + "grad_norm": 21.695075440702873, + "learning_rate": 2e-06, + "loss": 0.2773, + "step": 4458 + }, + { + "epoch": 1.0344507597726482, + "grad_norm": 23.236654747422598, + "learning_rate": 2e-06, + "loss": 0.3134, + "step": 4459 + }, + { + "epoch": 1.034682751420949, + "grad_norm": 12.113159851456805, + "learning_rate": 2e-06, + "loss": 0.2446, + "step": 4460 + }, + { + "epoch": 1.0349147430692496, + "grad_norm": 16.68370613230402, + "learning_rate": 2e-06, + "loss": 0.2297, + "step": 4461 + }, + { + "epoch": 1.0351467347175503, + "grad_norm": 14.844254905487963, + "learning_rate": 2e-06, + "loss": 0.2473, + "step": 4462 + }, + { + "epoch": 1.035378726365851, + "grad_norm": 20.220611429267375, + "learning_rate": 2e-06, + "loss": 0.1801, + "step": 4463 + }, + { + "epoch": 1.0356107180141514, + "grad_norm": 14.251304825494103, + "learning_rate": 2e-06, + "loss": 0.263, + "step": 4464 + }, + { + "epoch": 1.035842709662452, + "grad_norm": 22.99719982100096, + "learning_rate": 2e-06, + "loss": 0.2855, + "step": 4465 + }, + { + "epoch": 1.0360747013107527, + "grad_norm": 8.585584935855628, + "learning_rate": 2e-06, + "loss": 0.2178, + "step": 4466 + }, + { + "epoch": 1.0363066929590534, + "grad_norm": 18.387469782533767, + "learning_rate": 2e-06, + "loss": 0.21, + "step": 4467 + }, + { + "epoch": 1.036538684607354, + "grad_norm": 11.401769727716564, + "learning_rate": 2e-06, + "loss": 0.2416, + "step": 4468 + }, + { + "epoch": 1.0367706762556548, + "grad_norm": 15.94523695286945, + "learning_rate": 2e-06, + "loss": 0.311, + "step": 4469 + }, + { + "epoch": 1.0370026679039555, + "grad_norm": 11.196828766891255, + "learning_rate": 2e-06, + "loss": 0.2191, + "step": 4470 + }, + { + "epoch": 1.0372346595522561, + "grad_norm": 11.540823035696267, + "learning_rate": 2e-06, + "loss": 0.1927, + "step": 4471 + }, + { + "epoch": 1.0374666512005568, + "grad_norm": 14.536796413435079, + "learning_rate": 2e-06, + "loss": 0.3304, + "step": 4472 + }, + { + "epoch": 1.0376986428488575, + "grad_norm": 14.343576899291822, + "learning_rate": 2e-06, + "loss": 0.2313, + "step": 4473 + }, + { + "epoch": 1.0379306344971582, + "grad_norm": 16.922838282153773, + "learning_rate": 2e-06, + "loss": 0.3408, + "step": 4474 + }, + { + "epoch": 1.0381626261454588, + "grad_norm": 11.707108143377337, + "learning_rate": 2e-06, + "loss": 0.2202, + "step": 4475 + }, + { + "epoch": 1.0383946177937595, + "grad_norm": 8.517094184401891, + "learning_rate": 2e-06, + "loss": 0.2071, + "step": 4476 + }, + { + "epoch": 1.0386266094420602, + "grad_norm": 12.389320998949996, + "learning_rate": 2e-06, + "loss": 0.2455, + "step": 4477 + }, + { + "epoch": 1.0388586010903607, + "grad_norm": 13.645512621397236, + "learning_rate": 2e-06, + "loss": 0.2399, + "step": 4478 + }, + { + "epoch": 1.0390905927386613, + "grad_norm": 20.161252864133314, + "learning_rate": 2e-06, + "loss": 0.2882, + "step": 4479 + }, + { + "epoch": 1.039322584386962, + "grad_norm": 19.924490724699734, + "learning_rate": 2e-06, + "loss": 0.2941, + "step": 4480 + }, + { + "epoch": 1.0395545760352627, + "grad_norm": 17.642672690575644, + "learning_rate": 2e-06, + "loss": 0.2028, + "step": 4481 + }, + { + "epoch": 1.0397865676835634, + "grad_norm": 11.183339327538803, + "learning_rate": 2e-06, + "loss": 0.216, + "step": 4482 + }, + { + "epoch": 1.040018559331864, + "grad_norm": 16.449886451041284, + "learning_rate": 2e-06, + "loss": 0.2215, + "step": 4483 + }, + { + "epoch": 1.0402505509801647, + "grad_norm": 23.901687440717772, + "learning_rate": 2e-06, + "loss": 0.3176, + "step": 4484 + }, + { + "epoch": 1.0404825426284654, + "grad_norm": 10.718689618673862, + "learning_rate": 2e-06, + "loss": 0.2306, + "step": 4485 + }, + { + "epoch": 1.040714534276766, + "grad_norm": 13.623712550846152, + "learning_rate": 2e-06, + "loss": 0.2108, + "step": 4486 + }, + { + "epoch": 1.0409465259250668, + "grad_norm": 16.330073437541824, + "learning_rate": 2e-06, + "loss": 0.3593, + "step": 4487 + }, + { + "epoch": 1.0411785175733674, + "grad_norm": 12.825110261820797, + "learning_rate": 2e-06, + "loss": 0.2464, + "step": 4488 + }, + { + "epoch": 1.0414105092216681, + "grad_norm": 7.216701052005836, + "learning_rate": 2e-06, + "loss": 0.1575, + "step": 4489 + }, + { + "epoch": 1.0416425008699686, + "grad_norm": 22.15548889662668, + "learning_rate": 2e-06, + "loss": 0.1809, + "step": 4490 + }, + { + "epoch": 1.0418744925182692, + "grad_norm": 9.994987316571109, + "learning_rate": 2e-06, + "loss": 0.1943, + "step": 4491 + }, + { + "epoch": 1.04210648416657, + "grad_norm": 8.941695765183466, + "learning_rate": 2e-06, + "loss": 0.2319, + "step": 4492 + }, + { + "epoch": 1.0423384758148706, + "grad_norm": 15.189764942546732, + "learning_rate": 2e-06, + "loss": 0.2488, + "step": 4493 + }, + { + "epoch": 1.0425704674631713, + "grad_norm": 13.35000274072905, + "learning_rate": 2e-06, + "loss": 0.2118, + "step": 4494 + }, + { + "epoch": 1.042802459111472, + "grad_norm": 13.057315814446655, + "learning_rate": 2e-06, + "loss": 0.2257, + "step": 4495 + }, + { + "epoch": 1.0430344507597726, + "grad_norm": 12.011177476914142, + "learning_rate": 2e-06, + "loss": 0.2612, + "step": 4496 + }, + { + "epoch": 1.0432664424080733, + "grad_norm": 18.472444083720685, + "learning_rate": 2e-06, + "loss": 0.2278, + "step": 4497 + }, + { + "epoch": 1.043498434056374, + "grad_norm": 11.947161039265346, + "learning_rate": 2e-06, + "loss": 0.2028, + "step": 4498 + }, + { + "epoch": 1.0437304257046747, + "grad_norm": 11.683468527357304, + "learning_rate": 2e-06, + "loss": 0.2638, + "step": 4499 + }, + { + "epoch": 1.0439624173529753, + "grad_norm": 14.343236326278143, + "learning_rate": 2e-06, + "loss": 0.302, + "step": 4500 + }, + { + "epoch": 1.044194409001276, + "grad_norm": 10.710904819129617, + "learning_rate": 2e-06, + "loss": 0.2985, + "step": 4501 + }, + { + "epoch": 1.0444264006495767, + "grad_norm": 12.13213015738718, + "learning_rate": 2e-06, + "loss": 0.242, + "step": 4502 + }, + { + "epoch": 1.0446583922978774, + "grad_norm": 18.94819133437833, + "learning_rate": 2e-06, + "loss": 0.3138, + "step": 4503 + }, + { + "epoch": 1.0448903839461778, + "grad_norm": 12.539901478278948, + "learning_rate": 2e-06, + "loss": 0.2392, + "step": 4504 + }, + { + "epoch": 1.0451223755944785, + "grad_norm": 24.18437820339448, + "learning_rate": 2e-06, + "loss": 0.2755, + "step": 4505 + }, + { + "epoch": 1.0453543672427792, + "grad_norm": 23.383318703382532, + "learning_rate": 2e-06, + "loss": 0.3919, + "step": 4506 + }, + { + "epoch": 1.0455863588910799, + "grad_norm": 9.67869670349899, + "learning_rate": 2e-06, + "loss": 0.2123, + "step": 4507 + }, + { + "epoch": 1.0458183505393805, + "grad_norm": 14.588716396765122, + "learning_rate": 2e-06, + "loss": 0.2996, + "step": 4508 + }, + { + "epoch": 1.0460503421876812, + "grad_norm": 12.840467825576411, + "learning_rate": 2e-06, + "loss": 0.2757, + "step": 4509 + }, + { + "epoch": 1.046282333835982, + "grad_norm": 14.272494811690157, + "learning_rate": 2e-06, + "loss": 0.2401, + "step": 4510 + }, + { + "epoch": 1.0465143254842826, + "grad_norm": 20.01918249140347, + "learning_rate": 2e-06, + "loss": 0.3383, + "step": 4511 + }, + { + "epoch": 1.0467463171325833, + "grad_norm": 7.556423609497603, + "learning_rate": 2e-06, + "loss": 0.1243, + "step": 4512 + }, + { + "epoch": 1.046978308780884, + "grad_norm": 8.6970267586175, + "learning_rate": 2e-06, + "loss": 0.1976, + "step": 4513 + }, + { + "epoch": 1.0472103004291846, + "grad_norm": 24.096318328090813, + "learning_rate": 2e-06, + "loss": 0.2707, + "step": 4514 + }, + { + "epoch": 1.0474422920774853, + "grad_norm": 7.8467329542157565, + "learning_rate": 2e-06, + "loss": 0.1836, + "step": 4515 + }, + { + "epoch": 1.047674283725786, + "grad_norm": 29.928182855435207, + "learning_rate": 2e-06, + "loss": 0.3649, + "step": 4516 + }, + { + "epoch": 1.0479062753740864, + "grad_norm": 13.927439113839654, + "learning_rate": 2e-06, + "loss": 0.3017, + "step": 4517 + }, + { + "epoch": 1.048138267022387, + "grad_norm": 18.88795094602496, + "learning_rate": 2e-06, + "loss": 0.225, + "step": 4518 + }, + { + "epoch": 1.0483702586706878, + "grad_norm": 9.620669829192538, + "learning_rate": 2e-06, + "loss": 0.2321, + "step": 4519 + }, + { + "epoch": 1.0486022503189885, + "grad_norm": 17.223106187454317, + "learning_rate": 2e-06, + "loss": 0.2882, + "step": 4520 + }, + { + "epoch": 1.0488342419672891, + "grad_norm": 8.646155394897107, + "learning_rate": 2e-06, + "loss": 0.3076, + "step": 4521 + }, + { + "epoch": 1.0490662336155898, + "grad_norm": 8.778649493758728, + "learning_rate": 2e-06, + "loss": 0.2392, + "step": 4522 + }, + { + "epoch": 1.0492982252638905, + "grad_norm": 7.918122268225447, + "learning_rate": 2e-06, + "loss": 0.1816, + "step": 4523 + }, + { + "epoch": 1.0495302169121912, + "grad_norm": 11.01843669150523, + "learning_rate": 2e-06, + "loss": 0.2619, + "step": 4524 + }, + { + "epoch": 1.0497622085604918, + "grad_norm": 13.875284633960788, + "learning_rate": 2e-06, + "loss": 0.2297, + "step": 4525 + }, + { + "epoch": 1.0499942002087925, + "grad_norm": 20.84523992516674, + "learning_rate": 2e-06, + "loss": 0.2333, + "step": 4526 + }, + { + "epoch": 1.0502261918570932, + "grad_norm": 7.755997981352427, + "learning_rate": 2e-06, + "loss": 0.1873, + "step": 4527 + }, + { + "epoch": 1.0504581835053939, + "grad_norm": 9.333137819737173, + "learning_rate": 2e-06, + "loss": 0.2753, + "step": 4528 + }, + { + "epoch": 1.0506901751536946, + "grad_norm": 39.160017340477, + "learning_rate": 2e-06, + "loss": 0.2399, + "step": 4529 + }, + { + "epoch": 1.0509221668019952, + "grad_norm": 25.282757342372953, + "learning_rate": 2e-06, + "loss": 0.2774, + "step": 4530 + }, + { + "epoch": 1.0511541584502957, + "grad_norm": 12.916061156883767, + "learning_rate": 2e-06, + "loss": 0.1881, + "step": 4531 + }, + { + "epoch": 1.0513861500985964, + "grad_norm": 17.357923519315666, + "learning_rate": 2e-06, + "loss": 0.3112, + "step": 4532 + }, + { + "epoch": 1.051618141746897, + "grad_norm": 8.057295108120414, + "learning_rate": 2e-06, + "loss": 0.1589, + "step": 4533 + }, + { + "epoch": 1.0518501333951977, + "grad_norm": 11.473880621998514, + "learning_rate": 2e-06, + "loss": 0.1511, + "step": 4534 + }, + { + "epoch": 1.0520821250434984, + "grad_norm": 7.7417751611696835, + "learning_rate": 2e-06, + "loss": 0.1299, + "step": 4535 + }, + { + "epoch": 1.052314116691799, + "grad_norm": 15.442864167699556, + "learning_rate": 2e-06, + "loss": 0.3142, + "step": 4536 + }, + { + "epoch": 1.0525461083400998, + "grad_norm": 17.60484150290361, + "learning_rate": 2e-06, + "loss": 0.2372, + "step": 4537 + }, + { + "epoch": 1.0527780999884004, + "grad_norm": 16.936105884989708, + "learning_rate": 2e-06, + "loss": 0.289, + "step": 4538 + }, + { + "epoch": 1.0530100916367011, + "grad_norm": 11.843293598367202, + "learning_rate": 2e-06, + "loss": 0.2139, + "step": 4539 + }, + { + "epoch": 1.0532420832850018, + "grad_norm": 23.64743039681951, + "learning_rate": 2e-06, + "loss": 0.3383, + "step": 4540 + }, + { + "epoch": 1.0534740749333025, + "grad_norm": 17.546494934902448, + "learning_rate": 2e-06, + "loss": 0.2541, + "step": 4541 + }, + { + "epoch": 1.0537060665816032, + "grad_norm": 15.785905089669802, + "learning_rate": 2e-06, + "loss": 0.2919, + "step": 4542 + }, + { + "epoch": 1.0539380582299038, + "grad_norm": 15.552562721537294, + "learning_rate": 2e-06, + "loss": 0.313, + "step": 4543 + }, + { + "epoch": 1.0541700498782043, + "grad_norm": 13.38274443962515, + "learning_rate": 2e-06, + "loss": 0.2333, + "step": 4544 + }, + { + "epoch": 1.054402041526505, + "grad_norm": 14.364837287011094, + "learning_rate": 2e-06, + "loss": 0.3176, + "step": 4545 + }, + { + "epoch": 1.0546340331748056, + "grad_norm": 10.675992961826456, + "learning_rate": 2e-06, + "loss": 0.1771, + "step": 4546 + }, + { + "epoch": 1.0548660248231063, + "grad_norm": 25.91576359257879, + "learning_rate": 2e-06, + "loss": 0.2628, + "step": 4547 + }, + { + "epoch": 1.055098016471407, + "grad_norm": 10.117754686145902, + "learning_rate": 2e-06, + "loss": 0.2334, + "step": 4548 + }, + { + "epoch": 1.0553300081197077, + "grad_norm": 10.059227091582052, + "learning_rate": 2e-06, + "loss": 0.1506, + "step": 4549 + }, + { + "epoch": 1.0555619997680084, + "grad_norm": 16.908727808786647, + "learning_rate": 2e-06, + "loss": 0.2702, + "step": 4550 + }, + { + "epoch": 1.055793991416309, + "grad_norm": 14.386589699765908, + "learning_rate": 2e-06, + "loss": 0.2534, + "step": 4551 + }, + { + "epoch": 1.0560259830646097, + "grad_norm": 10.248441451994353, + "learning_rate": 2e-06, + "loss": 0.2022, + "step": 4552 + }, + { + "epoch": 1.0562579747129104, + "grad_norm": 7.8765393725949, + "learning_rate": 2e-06, + "loss": 0.1391, + "step": 4553 + }, + { + "epoch": 1.056489966361211, + "grad_norm": 13.745768749508281, + "learning_rate": 2e-06, + "loss": 0.3445, + "step": 4554 + }, + { + "epoch": 1.0567219580095117, + "grad_norm": 20.73572027129585, + "learning_rate": 2e-06, + "loss": 0.2545, + "step": 4555 + }, + { + "epoch": 1.0569539496578124, + "grad_norm": 19.14940776893297, + "learning_rate": 2e-06, + "loss": 0.239, + "step": 4556 + }, + { + "epoch": 1.0571859413061129, + "grad_norm": 12.890430025290328, + "learning_rate": 2e-06, + "loss": 0.29, + "step": 4557 + }, + { + "epoch": 1.0574179329544136, + "grad_norm": 18.20419221939733, + "learning_rate": 2e-06, + "loss": 0.3279, + "step": 4558 + }, + { + "epoch": 1.0576499246027142, + "grad_norm": 12.980910847550414, + "learning_rate": 2e-06, + "loss": 0.2111, + "step": 4559 + }, + { + "epoch": 1.057881916251015, + "grad_norm": 8.458796386653953, + "learning_rate": 2e-06, + "loss": 0.1938, + "step": 4560 + }, + { + "epoch": 1.0581139078993156, + "grad_norm": 21.75891206290162, + "learning_rate": 2e-06, + "loss": 0.2288, + "step": 4561 + }, + { + "epoch": 1.0583458995476163, + "grad_norm": 13.117360758513612, + "learning_rate": 2e-06, + "loss": 0.2496, + "step": 4562 + }, + { + "epoch": 1.058577891195917, + "grad_norm": 18.092807169325763, + "learning_rate": 2e-06, + "loss": 0.2883, + "step": 4563 + }, + { + "epoch": 1.0588098828442176, + "grad_norm": 28.11071353474641, + "learning_rate": 2e-06, + "loss": 0.2404, + "step": 4564 + }, + { + "epoch": 1.0590418744925183, + "grad_norm": 11.093359793460808, + "learning_rate": 2e-06, + "loss": 0.2871, + "step": 4565 + }, + { + "epoch": 1.059273866140819, + "grad_norm": 5.708558446753598, + "learning_rate": 2e-06, + "loss": 0.1646, + "step": 4566 + }, + { + "epoch": 1.0595058577891197, + "grad_norm": 8.007983606895225, + "learning_rate": 2e-06, + "loss": 0.1235, + "step": 4567 + }, + { + "epoch": 1.0597378494374203, + "grad_norm": 23.691691410023708, + "learning_rate": 2e-06, + "loss": 0.4476, + "step": 4568 + }, + { + "epoch": 1.059969841085721, + "grad_norm": 14.342884710913854, + "learning_rate": 2e-06, + "loss": 0.2121, + "step": 4569 + }, + { + "epoch": 1.0602018327340215, + "grad_norm": 20.421082262288795, + "learning_rate": 2e-06, + "loss": 0.2337, + "step": 4570 + }, + { + "epoch": 1.0604338243823221, + "grad_norm": 14.283943578587204, + "learning_rate": 2e-06, + "loss": 0.3167, + "step": 4571 + }, + { + "epoch": 1.0606658160306228, + "grad_norm": 14.32307432911022, + "learning_rate": 2e-06, + "loss": 0.3219, + "step": 4572 + }, + { + "epoch": 1.0608978076789235, + "grad_norm": 23.713879420441952, + "learning_rate": 2e-06, + "loss": 0.42, + "step": 4573 + }, + { + "epoch": 1.0611297993272242, + "grad_norm": 14.388879223470608, + "learning_rate": 2e-06, + "loss": 0.2093, + "step": 4574 + }, + { + "epoch": 1.0613617909755249, + "grad_norm": 14.770522362129077, + "learning_rate": 2e-06, + "loss": 0.2532, + "step": 4575 + }, + { + "epoch": 1.0615937826238255, + "grad_norm": 9.670132297957418, + "learning_rate": 2e-06, + "loss": 0.2032, + "step": 4576 + }, + { + "epoch": 1.0618257742721262, + "grad_norm": 12.174822545988004, + "learning_rate": 2e-06, + "loss": 0.1677, + "step": 4577 + }, + { + "epoch": 1.0620577659204269, + "grad_norm": 16.409913639531826, + "learning_rate": 2e-06, + "loss": 0.305, + "step": 4578 + }, + { + "epoch": 1.0622897575687276, + "grad_norm": 13.679220521114875, + "learning_rate": 2e-06, + "loss": 0.3606, + "step": 4579 + }, + { + "epoch": 1.0625217492170282, + "grad_norm": 15.351767353163767, + "learning_rate": 2e-06, + "loss": 0.2492, + "step": 4580 + }, + { + "epoch": 1.062753740865329, + "grad_norm": 14.55264537341593, + "learning_rate": 2e-06, + "loss": 0.194, + "step": 4581 + }, + { + "epoch": 1.0629857325136296, + "grad_norm": 23.70433697778693, + "learning_rate": 2e-06, + "loss": 0.2432, + "step": 4582 + }, + { + "epoch": 1.0632177241619303, + "grad_norm": 13.494892360681849, + "learning_rate": 2e-06, + "loss": 0.2695, + "step": 4583 + }, + { + "epoch": 1.0634497158102307, + "grad_norm": 14.626691010991978, + "learning_rate": 2e-06, + "loss": 0.2345, + "step": 4584 + }, + { + "epoch": 1.0636817074585314, + "grad_norm": 13.807386499526016, + "learning_rate": 2e-06, + "loss": 0.1773, + "step": 4585 + }, + { + "epoch": 1.063913699106832, + "grad_norm": 14.377411897301746, + "learning_rate": 2e-06, + "loss": 0.206, + "step": 4586 + }, + { + "epoch": 1.0641456907551328, + "grad_norm": 10.717646496233396, + "learning_rate": 2e-06, + "loss": 0.1543, + "step": 4587 + }, + { + "epoch": 1.0643776824034334, + "grad_norm": 12.121576229930133, + "learning_rate": 2e-06, + "loss": 0.1937, + "step": 4588 + }, + { + "epoch": 1.0646096740517341, + "grad_norm": 10.377540319043618, + "learning_rate": 2e-06, + "loss": 0.2044, + "step": 4589 + }, + { + "epoch": 1.0648416657000348, + "grad_norm": 16.025433031691097, + "learning_rate": 2e-06, + "loss": 0.2279, + "step": 4590 + }, + { + "epoch": 1.0650736573483355, + "grad_norm": 28.09748111844974, + "learning_rate": 2e-06, + "loss": 0.271, + "step": 4591 + }, + { + "epoch": 1.0653056489966362, + "grad_norm": 15.233408719058405, + "learning_rate": 2e-06, + "loss": 0.2557, + "step": 4592 + }, + { + "epoch": 1.0655376406449368, + "grad_norm": 11.688531867610594, + "learning_rate": 2e-06, + "loss": 0.2213, + "step": 4593 + }, + { + "epoch": 1.0657696322932375, + "grad_norm": 12.478963044020029, + "learning_rate": 2e-06, + "loss": 0.3794, + "step": 4594 + }, + { + "epoch": 1.0660016239415382, + "grad_norm": 12.255769188520668, + "learning_rate": 2e-06, + "loss": 0.1958, + "step": 4595 + }, + { + "epoch": 1.0662336155898389, + "grad_norm": 14.479545376110917, + "learning_rate": 2e-06, + "loss": 0.2721, + "step": 4596 + }, + { + "epoch": 1.0664656072381393, + "grad_norm": 9.375689405742289, + "learning_rate": 2e-06, + "loss": 0.2932, + "step": 4597 + }, + { + "epoch": 1.06669759888644, + "grad_norm": 10.315314477724502, + "learning_rate": 2e-06, + "loss": 0.2008, + "step": 4598 + }, + { + "epoch": 1.0669295905347407, + "grad_norm": 21.631875638569465, + "learning_rate": 2e-06, + "loss": 0.3366, + "step": 4599 + }, + { + "epoch": 1.0671615821830414, + "grad_norm": 27.067384794430104, + "learning_rate": 2e-06, + "loss": 0.4603, + "step": 4600 + }, + { + "epoch": 1.067393573831342, + "grad_norm": 9.802424258760192, + "learning_rate": 2e-06, + "loss": 0.1782, + "step": 4601 + }, + { + "epoch": 1.0676255654796427, + "grad_norm": 10.320769127337003, + "learning_rate": 2e-06, + "loss": 0.1942, + "step": 4602 + }, + { + "epoch": 1.0678575571279434, + "grad_norm": 12.10330234840912, + "learning_rate": 2e-06, + "loss": 0.2389, + "step": 4603 + }, + { + "epoch": 1.068089548776244, + "grad_norm": 9.482829401573873, + "learning_rate": 2e-06, + "loss": 0.2014, + "step": 4604 + }, + { + "epoch": 1.0683215404245447, + "grad_norm": 13.2444842153338, + "learning_rate": 2e-06, + "loss": 0.275, + "step": 4605 + }, + { + "epoch": 1.0685535320728454, + "grad_norm": 9.059155667239775, + "learning_rate": 2e-06, + "loss": 0.1656, + "step": 4606 + }, + { + "epoch": 1.068785523721146, + "grad_norm": 22.107718337840467, + "learning_rate": 2e-06, + "loss": 0.3723, + "step": 4607 + }, + { + "epoch": 1.0690175153694468, + "grad_norm": 16.721630962015563, + "learning_rate": 2e-06, + "loss": 0.2691, + "step": 4608 + }, + { + "epoch": 1.0692495070177475, + "grad_norm": 14.552520243084661, + "learning_rate": 2e-06, + "loss": 0.2485, + "step": 4609 + }, + { + "epoch": 1.0694814986660481, + "grad_norm": 16.363779480701567, + "learning_rate": 2e-06, + "loss": 0.2091, + "step": 4610 + }, + { + "epoch": 1.0697134903143486, + "grad_norm": 6.404394903208128, + "learning_rate": 2e-06, + "loss": 0.2208, + "step": 4611 + }, + { + "epoch": 1.0699454819626493, + "grad_norm": 20.95385822012562, + "learning_rate": 2e-06, + "loss": 0.2749, + "step": 4612 + }, + { + "epoch": 1.07017747361095, + "grad_norm": 8.509211364750067, + "learning_rate": 2e-06, + "loss": 0.1986, + "step": 4613 + }, + { + "epoch": 1.0704094652592506, + "grad_norm": 9.683183528937615, + "learning_rate": 2e-06, + "loss": 0.1457, + "step": 4614 + }, + { + "epoch": 1.0706414569075513, + "grad_norm": 9.248345401803073, + "learning_rate": 2e-06, + "loss": 0.2679, + "step": 4615 + }, + { + "epoch": 1.070873448555852, + "grad_norm": 10.304306796998121, + "learning_rate": 2e-06, + "loss": 0.2017, + "step": 4616 + }, + { + "epoch": 1.0711054402041527, + "grad_norm": 12.830372489140107, + "learning_rate": 2e-06, + "loss": 0.2048, + "step": 4617 + }, + { + "epoch": 1.0713374318524533, + "grad_norm": 15.16407227138631, + "learning_rate": 2e-06, + "loss": 0.3194, + "step": 4618 + }, + { + "epoch": 1.071569423500754, + "grad_norm": 19.579768580627913, + "learning_rate": 2e-06, + "loss": 0.2052, + "step": 4619 + }, + { + "epoch": 1.0718014151490547, + "grad_norm": 18.4708758331504, + "learning_rate": 2e-06, + "loss": 0.2765, + "step": 4620 + }, + { + "epoch": 1.0720334067973554, + "grad_norm": 16.340610589492915, + "learning_rate": 2e-06, + "loss": 0.1827, + "step": 4621 + }, + { + "epoch": 1.072265398445656, + "grad_norm": 15.476040559442902, + "learning_rate": 2e-06, + "loss": 0.3229, + "step": 4622 + }, + { + "epoch": 1.0724973900939565, + "grad_norm": 11.731130456848682, + "learning_rate": 2e-06, + "loss": 0.2831, + "step": 4623 + }, + { + "epoch": 1.0727293817422572, + "grad_norm": 14.988365226355235, + "learning_rate": 2e-06, + "loss": 0.3421, + "step": 4624 + }, + { + "epoch": 1.0729613733905579, + "grad_norm": 16.246876392051842, + "learning_rate": 2e-06, + "loss": 0.2171, + "step": 4625 + }, + { + "epoch": 1.0731933650388585, + "grad_norm": 16.63092189706823, + "learning_rate": 2e-06, + "loss": 0.2903, + "step": 4626 + }, + { + "epoch": 1.0734253566871592, + "grad_norm": 9.829098076638097, + "learning_rate": 2e-06, + "loss": 0.1791, + "step": 4627 + }, + { + "epoch": 1.07365734833546, + "grad_norm": 12.44972854207715, + "learning_rate": 2e-06, + "loss": 0.1847, + "step": 4628 + }, + { + "epoch": 1.0738893399837606, + "grad_norm": 15.653560086600205, + "learning_rate": 2e-06, + "loss": 0.2672, + "step": 4629 + }, + { + "epoch": 1.0741213316320612, + "grad_norm": 6.496762545985574, + "learning_rate": 2e-06, + "loss": 0.1861, + "step": 4630 + }, + { + "epoch": 1.074353323280362, + "grad_norm": 8.684805225175854, + "learning_rate": 2e-06, + "loss": 0.1961, + "step": 4631 + }, + { + "epoch": 1.0745853149286626, + "grad_norm": 13.618572813623743, + "learning_rate": 2e-06, + "loss": 0.2212, + "step": 4632 + }, + { + "epoch": 1.0748173065769633, + "grad_norm": 11.440261368799872, + "learning_rate": 2e-06, + "loss": 0.2728, + "step": 4633 + }, + { + "epoch": 1.075049298225264, + "grad_norm": 8.376117985507744, + "learning_rate": 2e-06, + "loss": 0.2127, + "step": 4634 + }, + { + "epoch": 1.0752812898735646, + "grad_norm": 10.788704052608766, + "learning_rate": 2e-06, + "loss": 0.2423, + "step": 4635 + }, + { + "epoch": 1.0755132815218653, + "grad_norm": 12.356605240160762, + "learning_rate": 2e-06, + "loss": 0.2149, + "step": 4636 + }, + { + "epoch": 1.0757452731701658, + "grad_norm": 8.646560020577828, + "learning_rate": 2e-06, + "loss": 0.1408, + "step": 4637 + }, + { + "epoch": 1.0759772648184665, + "grad_norm": 22.873044231235614, + "learning_rate": 2e-06, + "loss": 0.3564, + "step": 4638 + }, + { + "epoch": 1.0762092564667671, + "grad_norm": 13.115236584747734, + "learning_rate": 2e-06, + "loss": 0.327, + "step": 4639 + }, + { + "epoch": 1.0764412481150678, + "grad_norm": 16.990624415839985, + "learning_rate": 2e-06, + "loss": 0.2237, + "step": 4640 + }, + { + "epoch": 1.0766732397633685, + "grad_norm": 24.284509006768378, + "learning_rate": 2e-06, + "loss": 0.3058, + "step": 4641 + }, + { + "epoch": 1.0769052314116692, + "grad_norm": 13.264059901096275, + "learning_rate": 2e-06, + "loss": 0.2127, + "step": 4642 + }, + { + "epoch": 1.0771372230599698, + "grad_norm": 17.962152197713916, + "learning_rate": 2e-06, + "loss": 0.4182, + "step": 4643 + }, + { + "epoch": 1.0773692147082705, + "grad_norm": 15.13566581385218, + "learning_rate": 2e-06, + "loss": 0.3004, + "step": 4644 + }, + { + "epoch": 1.0776012063565712, + "grad_norm": 9.107776851142189, + "learning_rate": 2e-06, + "loss": 0.1799, + "step": 4645 + }, + { + "epoch": 1.0778331980048719, + "grad_norm": 15.416609074726267, + "learning_rate": 2e-06, + "loss": 0.2687, + "step": 4646 + }, + { + "epoch": 1.0780651896531726, + "grad_norm": 60.35641994256627, + "learning_rate": 2e-06, + "loss": 0.2461, + "step": 4647 + }, + { + "epoch": 1.0782971813014732, + "grad_norm": 11.96075666327412, + "learning_rate": 2e-06, + "loss": 0.1971, + "step": 4648 + }, + { + "epoch": 1.078529172949774, + "grad_norm": 13.424063532979817, + "learning_rate": 2e-06, + "loss": 0.2106, + "step": 4649 + }, + { + "epoch": 1.0787611645980744, + "grad_norm": 12.773540979848594, + "learning_rate": 2e-06, + "loss": 0.1968, + "step": 4650 + }, + { + "epoch": 1.078993156246375, + "grad_norm": 19.417735202497628, + "learning_rate": 2e-06, + "loss": 0.2292, + "step": 4651 + }, + { + "epoch": 1.0792251478946757, + "grad_norm": 6.308682768305041, + "learning_rate": 2e-06, + "loss": 0.1197, + "step": 4652 + }, + { + "epoch": 1.0794571395429764, + "grad_norm": 7.2959765587506205, + "learning_rate": 2e-06, + "loss": 0.2321, + "step": 4653 + }, + { + "epoch": 1.079689131191277, + "grad_norm": 15.44589618064587, + "learning_rate": 2e-06, + "loss": 0.2507, + "step": 4654 + }, + { + "epoch": 1.0799211228395778, + "grad_norm": 9.29934262830167, + "learning_rate": 2e-06, + "loss": 0.2287, + "step": 4655 + }, + { + "epoch": 1.0801531144878784, + "grad_norm": 15.286437259258648, + "learning_rate": 2e-06, + "loss": 0.3175, + "step": 4656 + }, + { + "epoch": 1.080385106136179, + "grad_norm": 13.27664930644032, + "learning_rate": 2e-06, + "loss": 0.2383, + "step": 4657 + }, + { + "epoch": 1.0806170977844798, + "grad_norm": 12.263893963668028, + "learning_rate": 2e-06, + "loss": 0.2594, + "step": 4658 + }, + { + "epoch": 1.0808490894327805, + "grad_norm": 6.0357490100565006, + "learning_rate": 2e-06, + "loss": 0.1549, + "step": 4659 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 16.18608723157087, + "learning_rate": 2e-06, + "loss": 0.2984, + "step": 4660 + }, + { + "epoch": 1.0813130727293818, + "grad_norm": 9.181713104002375, + "learning_rate": 2e-06, + "loss": 0.1386, + "step": 4661 + }, + { + "epoch": 1.0815450643776825, + "grad_norm": 5.644022388995841, + "learning_rate": 2e-06, + "loss": 0.1688, + "step": 4662 + }, + { + "epoch": 1.0817770560259832, + "grad_norm": 14.739106617725085, + "learning_rate": 2e-06, + "loss": 0.2945, + "step": 4663 + }, + { + "epoch": 1.0820090476742836, + "grad_norm": 11.676188528472048, + "learning_rate": 2e-06, + "loss": 0.1579, + "step": 4664 + }, + { + "epoch": 1.0822410393225843, + "grad_norm": 9.657311316004925, + "learning_rate": 2e-06, + "loss": 0.1834, + "step": 4665 + }, + { + "epoch": 1.082473030970885, + "grad_norm": 31.149815682668027, + "learning_rate": 2e-06, + "loss": 0.5156, + "step": 4666 + }, + { + "epoch": 1.0827050226191857, + "grad_norm": 22.49755979397703, + "learning_rate": 2e-06, + "loss": 0.3144, + "step": 4667 + }, + { + "epoch": 1.0829370142674863, + "grad_norm": 16.400850906073345, + "learning_rate": 2e-06, + "loss": 0.24, + "step": 4668 + }, + { + "epoch": 1.083169005915787, + "grad_norm": 14.898844803167906, + "learning_rate": 2e-06, + "loss": 0.2425, + "step": 4669 + }, + { + "epoch": 1.0834009975640877, + "grad_norm": 15.375770635265186, + "learning_rate": 2e-06, + "loss": 0.235, + "step": 4670 + }, + { + "epoch": 1.0836329892123884, + "grad_norm": 8.903667496388419, + "learning_rate": 2e-06, + "loss": 0.1767, + "step": 4671 + }, + { + "epoch": 1.083864980860689, + "grad_norm": 20.36801862973905, + "learning_rate": 2e-06, + "loss": 0.34, + "step": 4672 + }, + { + "epoch": 1.0840969725089897, + "grad_norm": 16.623785198060162, + "learning_rate": 2e-06, + "loss": 0.3051, + "step": 4673 + }, + { + "epoch": 1.0843289641572904, + "grad_norm": 21.712004714291528, + "learning_rate": 2e-06, + "loss": 0.2214, + "step": 4674 + }, + { + "epoch": 1.084560955805591, + "grad_norm": 18.64164274149171, + "learning_rate": 2e-06, + "loss": 0.3313, + "step": 4675 + }, + { + "epoch": 1.0847929474538915, + "grad_norm": 15.021075091917751, + "learning_rate": 2e-06, + "loss": 0.2372, + "step": 4676 + }, + { + "epoch": 1.0850249391021922, + "grad_norm": 13.172486147361289, + "learning_rate": 2e-06, + "loss": 0.2282, + "step": 4677 + }, + { + "epoch": 1.085256930750493, + "grad_norm": 14.952373836405085, + "learning_rate": 2e-06, + "loss": 0.2649, + "step": 4678 + }, + { + "epoch": 1.0854889223987936, + "grad_norm": 12.057002528751237, + "learning_rate": 2e-06, + "loss": 0.2557, + "step": 4679 + }, + { + "epoch": 1.0857209140470943, + "grad_norm": 14.097720996300128, + "learning_rate": 2e-06, + "loss": 0.1673, + "step": 4680 + }, + { + "epoch": 1.085952905695395, + "grad_norm": 13.424131645799678, + "learning_rate": 2e-06, + "loss": 0.2547, + "step": 4681 + }, + { + "epoch": 1.0861848973436956, + "grad_norm": 26.119273464189042, + "learning_rate": 2e-06, + "loss": 0.2301, + "step": 4682 + }, + { + "epoch": 1.0864168889919963, + "grad_norm": 10.85903450705642, + "learning_rate": 2e-06, + "loss": 0.3203, + "step": 4683 + }, + { + "epoch": 1.086648880640297, + "grad_norm": 12.192959640288787, + "learning_rate": 2e-06, + "loss": 0.2221, + "step": 4684 + }, + { + "epoch": 1.0868808722885976, + "grad_norm": 12.991969981697782, + "learning_rate": 2e-06, + "loss": 0.2164, + "step": 4685 + }, + { + "epoch": 1.0871128639368983, + "grad_norm": 13.438897594700979, + "learning_rate": 2e-06, + "loss": 0.2253, + "step": 4686 + }, + { + "epoch": 1.087344855585199, + "grad_norm": 21.185991926389537, + "learning_rate": 2e-06, + "loss": 0.2621, + "step": 4687 + }, + { + "epoch": 1.0875768472334997, + "grad_norm": 7.49060315068765, + "learning_rate": 2e-06, + "loss": 0.152, + "step": 4688 + }, + { + "epoch": 1.0878088388818004, + "grad_norm": 10.692675079619084, + "learning_rate": 2e-06, + "loss": 0.206, + "step": 4689 + }, + { + "epoch": 1.088040830530101, + "grad_norm": 12.085593295785767, + "learning_rate": 2e-06, + "loss": 0.2123, + "step": 4690 + }, + { + "epoch": 1.0882728221784015, + "grad_norm": 15.658009069301887, + "learning_rate": 2e-06, + "loss": 0.2292, + "step": 4691 + }, + { + "epoch": 1.0885048138267022, + "grad_norm": 21.839791385577954, + "learning_rate": 2e-06, + "loss": 0.2165, + "step": 4692 + }, + { + "epoch": 1.0887368054750028, + "grad_norm": 20.168667031089573, + "learning_rate": 2e-06, + "loss": 0.4257, + "step": 4693 + }, + { + "epoch": 1.0889687971233035, + "grad_norm": 13.532395866046311, + "learning_rate": 2e-06, + "loss": 0.3163, + "step": 4694 + }, + { + "epoch": 1.0892007887716042, + "grad_norm": 7.037441919712227, + "learning_rate": 2e-06, + "loss": 0.1576, + "step": 4695 + }, + { + "epoch": 1.0894327804199049, + "grad_norm": 6.080678326111714, + "learning_rate": 2e-06, + "loss": 0.1554, + "step": 4696 + }, + { + "epoch": 1.0896647720682056, + "grad_norm": 11.62312585625033, + "learning_rate": 2e-06, + "loss": 0.2072, + "step": 4697 + }, + { + "epoch": 1.0898967637165062, + "grad_norm": 25.07140286851281, + "learning_rate": 2e-06, + "loss": 0.3817, + "step": 4698 + }, + { + "epoch": 1.090128755364807, + "grad_norm": 14.454005153102496, + "learning_rate": 2e-06, + "loss": 0.237, + "step": 4699 + }, + { + "epoch": 1.0903607470131076, + "grad_norm": 33.65593280593324, + "learning_rate": 2e-06, + "loss": 0.3603, + "step": 4700 + }, + { + "epoch": 1.0905927386614083, + "grad_norm": 24.21819465836278, + "learning_rate": 2e-06, + "loss": 0.3173, + "step": 4701 + }, + { + "epoch": 1.090824730309709, + "grad_norm": 6.002003352997946, + "learning_rate": 2e-06, + "loss": 0.1649, + "step": 4702 + }, + { + "epoch": 1.0910567219580094, + "grad_norm": 15.363931384625337, + "learning_rate": 2e-06, + "loss": 0.3317, + "step": 4703 + }, + { + "epoch": 1.09128871360631, + "grad_norm": 19.767472015606725, + "learning_rate": 2e-06, + "loss": 0.3107, + "step": 4704 + }, + { + "epoch": 1.0915207052546108, + "grad_norm": 15.38698131756278, + "learning_rate": 2e-06, + "loss": 0.2003, + "step": 4705 + }, + { + "epoch": 1.0917526969029114, + "grad_norm": 12.01919931645781, + "learning_rate": 2e-06, + "loss": 0.2028, + "step": 4706 + }, + { + "epoch": 1.0919846885512121, + "grad_norm": 15.942894539948433, + "learning_rate": 2e-06, + "loss": 0.2649, + "step": 4707 + }, + { + "epoch": 1.0922166801995128, + "grad_norm": 14.38917599038256, + "learning_rate": 2e-06, + "loss": 0.2061, + "step": 4708 + }, + { + "epoch": 1.0924486718478135, + "grad_norm": 11.833878380303931, + "learning_rate": 2e-06, + "loss": 0.2203, + "step": 4709 + }, + { + "epoch": 1.0926806634961141, + "grad_norm": 16.914912934828145, + "learning_rate": 2e-06, + "loss": 0.2783, + "step": 4710 + }, + { + "epoch": 1.0929126551444148, + "grad_norm": 17.266074010543782, + "learning_rate": 2e-06, + "loss": 0.3055, + "step": 4711 + }, + { + "epoch": 1.0931446467927155, + "grad_norm": 18.75284359060675, + "learning_rate": 2e-06, + "loss": 0.283, + "step": 4712 + }, + { + "epoch": 1.0933766384410162, + "grad_norm": 12.43982576036956, + "learning_rate": 2e-06, + "loss": 0.1862, + "step": 4713 + }, + { + "epoch": 1.0936086300893169, + "grad_norm": 14.80054201183015, + "learning_rate": 2e-06, + "loss": 0.2475, + "step": 4714 + }, + { + "epoch": 1.0938406217376175, + "grad_norm": 13.096084263417465, + "learning_rate": 2e-06, + "loss": 0.2297, + "step": 4715 + }, + { + "epoch": 1.0940726133859182, + "grad_norm": 23.74250380776062, + "learning_rate": 2e-06, + "loss": 0.2798, + "step": 4716 + }, + { + "epoch": 1.0943046050342187, + "grad_norm": 8.639370985491647, + "learning_rate": 2e-06, + "loss": 0.1993, + "step": 4717 + }, + { + "epoch": 1.0945365966825193, + "grad_norm": 8.52413219768701, + "learning_rate": 2e-06, + "loss": 0.1727, + "step": 4718 + }, + { + "epoch": 1.09476858833082, + "grad_norm": 11.6044749708807, + "learning_rate": 2e-06, + "loss": 0.2995, + "step": 4719 + }, + { + "epoch": 1.0950005799791207, + "grad_norm": 15.624858689431957, + "learning_rate": 2e-06, + "loss": 0.2698, + "step": 4720 + }, + { + "epoch": 1.0952325716274214, + "grad_norm": 13.945303085396459, + "learning_rate": 2e-06, + "loss": 0.2106, + "step": 4721 + }, + { + "epoch": 1.095464563275722, + "grad_norm": 10.769513824946431, + "learning_rate": 2e-06, + "loss": 0.1963, + "step": 4722 + }, + { + "epoch": 1.0956965549240227, + "grad_norm": 7.428153969726964, + "learning_rate": 2e-06, + "loss": 0.148, + "step": 4723 + }, + { + "epoch": 1.0959285465723234, + "grad_norm": 6.88998868347155, + "learning_rate": 2e-06, + "loss": 0.1802, + "step": 4724 + }, + { + "epoch": 1.096160538220624, + "grad_norm": 14.252947491435332, + "learning_rate": 2e-06, + "loss": 0.2731, + "step": 4725 + }, + { + "epoch": 1.0963925298689248, + "grad_norm": 18.767434337446932, + "learning_rate": 2e-06, + "loss": 0.1984, + "step": 4726 + }, + { + "epoch": 1.0966245215172254, + "grad_norm": 25.407062038419664, + "learning_rate": 2e-06, + "loss": 0.3428, + "step": 4727 + }, + { + "epoch": 1.0968565131655261, + "grad_norm": 19.46092075545016, + "learning_rate": 2e-06, + "loss": 0.2915, + "step": 4728 + }, + { + "epoch": 1.0970885048138268, + "grad_norm": 20.120073541123094, + "learning_rate": 2e-06, + "loss": 0.3822, + "step": 4729 + }, + { + "epoch": 1.0973204964621273, + "grad_norm": 22.039775692665884, + "learning_rate": 2e-06, + "loss": 0.3699, + "step": 4730 + }, + { + "epoch": 1.097552488110428, + "grad_norm": 19.658684197724533, + "learning_rate": 2e-06, + "loss": 0.2764, + "step": 4731 + }, + { + "epoch": 1.0977844797587286, + "grad_norm": 12.168061926618066, + "learning_rate": 2e-06, + "loss": 0.1944, + "step": 4732 + }, + { + "epoch": 1.0980164714070293, + "grad_norm": 24.83924510420318, + "learning_rate": 2e-06, + "loss": 0.3434, + "step": 4733 + }, + { + "epoch": 1.09824846305533, + "grad_norm": 12.746618260356046, + "learning_rate": 2e-06, + "loss": 0.2208, + "step": 4734 + }, + { + "epoch": 1.0984804547036306, + "grad_norm": 13.560829849978433, + "learning_rate": 2e-06, + "loss": 0.2237, + "step": 4735 + }, + { + "epoch": 1.0987124463519313, + "grad_norm": 16.147177241511702, + "learning_rate": 2e-06, + "loss": 0.2865, + "step": 4736 + }, + { + "epoch": 1.098944438000232, + "grad_norm": 15.593195676890463, + "learning_rate": 2e-06, + "loss": 0.1484, + "step": 4737 + }, + { + "epoch": 1.0991764296485327, + "grad_norm": 17.79741115414023, + "learning_rate": 2e-06, + "loss": 0.2794, + "step": 4738 + }, + { + "epoch": 1.0994084212968334, + "grad_norm": 12.267696363765928, + "learning_rate": 2e-06, + "loss": 0.213, + "step": 4739 + }, + { + "epoch": 1.099640412945134, + "grad_norm": 21.270542895126816, + "learning_rate": 2e-06, + "loss": 0.3346, + "step": 4740 + }, + { + "epoch": 1.0998724045934347, + "grad_norm": 13.268440838091314, + "learning_rate": 2e-06, + "loss": 0.2893, + "step": 4741 + }, + { + "epoch": 1.1001043962417354, + "grad_norm": 5.773451254069926, + "learning_rate": 2e-06, + "loss": 0.1248, + "step": 4742 + }, + { + "epoch": 1.100336387890036, + "grad_norm": 13.38291685039027, + "learning_rate": 2e-06, + "loss": 0.2305, + "step": 4743 + }, + { + "epoch": 1.1005683795383365, + "grad_norm": 17.201303310278664, + "learning_rate": 2e-06, + "loss": 0.3608, + "step": 4744 + }, + { + "epoch": 1.1008003711866372, + "grad_norm": 14.377630959205913, + "learning_rate": 2e-06, + "loss": 0.2518, + "step": 4745 + }, + { + "epoch": 1.1010323628349379, + "grad_norm": 14.809347676129226, + "learning_rate": 2e-06, + "loss": 0.2533, + "step": 4746 + }, + { + "epoch": 1.1012643544832386, + "grad_norm": 9.986337062899754, + "learning_rate": 2e-06, + "loss": 0.2014, + "step": 4747 + }, + { + "epoch": 1.1014963461315392, + "grad_norm": 15.922279096775062, + "learning_rate": 2e-06, + "loss": 0.2832, + "step": 4748 + }, + { + "epoch": 1.10172833777984, + "grad_norm": 12.216428482913866, + "learning_rate": 2e-06, + "loss": 0.2412, + "step": 4749 + }, + { + "epoch": 1.1019603294281406, + "grad_norm": 8.778545487508715, + "learning_rate": 2e-06, + "loss": 0.213, + "step": 4750 + }, + { + "epoch": 1.1021923210764413, + "grad_norm": 13.367201916070908, + "learning_rate": 2e-06, + "loss": 0.2322, + "step": 4751 + }, + { + "epoch": 1.102424312724742, + "grad_norm": 17.779014580733584, + "learning_rate": 2e-06, + "loss": 0.2878, + "step": 4752 + }, + { + "epoch": 1.1026563043730426, + "grad_norm": 15.27352344365719, + "learning_rate": 2e-06, + "loss": 0.3188, + "step": 4753 + }, + { + "epoch": 1.1028882960213433, + "grad_norm": 16.461117593032657, + "learning_rate": 2e-06, + "loss": 0.2435, + "step": 4754 + }, + { + "epoch": 1.103120287669644, + "grad_norm": 14.525277461670653, + "learning_rate": 2e-06, + "loss": 0.2675, + "step": 4755 + }, + { + "epoch": 1.1033522793179444, + "grad_norm": 16.976506099386146, + "learning_rate": 2e-06, + "loss": 0.1877, + "step": 4756 + }, + { + "epoch": 1.1035842709662451, + "grad_norm": 10.436412155903202, + "learning_rate": 2e-06, + "loss": 0.2611, + "step": 4757 + }, + { + "epoch": 1.1038162626145458, + "grad_norm": 17.45570996860586, + "learning_rate": 2e-06, + "loss": 0.2223, + "step": 4758 + }, + { + "epoch": 1.1040482542628465, + "grad_norm": 9.11640215787863, + "learning_rate": 2e-06, + "loss": 0.1965, + "step": 4759 + }, + { + "epoch": 1.1042802459111472, + "grad_norm": 18.53955266775229, + "learning_rate": 2e-06, + "loss": 0.2927, + "step": 4760 + }, + { + "epoch": 1.1045122375594478, + "grad_norm": 11.813660934324743, + "learning_rate": 2e-06, + "loss": 0.2288, + "step": 4761 + }, + { + "epoch": 1.1047442292077485, + "grad_norm": 28.113973632222756, + "learning_rate": 2e-06, + "loss": 0.4137, + "step": 4762 + }, + { + "epoch": 1.1049762208560492, + "grad_norm": 13.059036772244967, + "learning_rate": 2e-06, + "loss": 0.2059, + "step": 4763 + }, + { + "epoch": 1.1052082125043499, + "grad_norm": 26.718482609331993, + "learning_rate": 2e-06, + "loss": 0.5283, + "step": 4764 + }, + { + "epoch": 1.1054402041526505, + "grad_norm": 21.388850421340358, + "learning_rate": 2e-06, + "loss": 0.2686, + "step": 4765 + }, + { + "epoch": 1.1056721958009512, + "grad_norm": 13.831933938166646, + "learning_rate": 2e-06, + "loss": 0.2863, + "step": 4766 + }, + { + "epoch": 1.105904187449252, + "grad_norm": 14.716751435725632, + "learning_rate": 2e-06, + "loss": 0.2895, + "step": 4767 + }, + { + "epoch": 1.1061361790975526, + "grad_norm": 28.502863024905714, + "learning_rate": 2e-06, + "loss": 0.2986, + "step": 4768 + }, + { + "epoch": 1.1063681707458533, + "grad_norm": 11.712588324651502, + "learning_rate": 2e-06, + "loss": 0.2583, + "step": 4769 + }, + { + "epoch": 1.1066001623941537, + "grad_norm": 14.2455947813304, + "learning_rate": 2e-06, + "loss": 0.3269, + "step": 4770 + }, + { + "epoch": 1.1068321540424544, + "grad_norm": 20.12904263154396, + "learning_rate": 2e-06, + "loss": 0.3651, + "step": 4771 + }, + { + "epoch": 1.107064145690755, + "grad_norm": 10.495923075726504, + "learning_rate": 2e-06, + "loss": 0.2407, + "step": 4772 + }, + { + "epoch": 1.1072961373390557, + "grad_norm": 13.363417014459092, + "learning_rate": 2e-06, + "loss": 0.3103, + "step": 4773 + }, + { + "epoch": 1.1075281289873564, + "grad_norm": 9.752561848539205, + "learning_rate": 2e-06, + "loss": 0.1802, + "step": 4774 + }, + { + "epoch": 1.107760120635657, + "grad_norm": 10.545529316723636, + "learning_rate": 2e-06, + "loss": 0.2058, + "step": 4775 + }, + { + "epoch": 1.1079921122839578, + "grad_norm": 11.52069645016782, + "learning_rate": 2e-06, + "loss": 0.2783, + "step": 4776 + }, + { + "epoch": 1.1082241039322585, + "grad_norm": 11.148106046786955, + "learning_rate": 2e-06, + "loss": 0.1324, + "step": 4777 + }, + { + "epoch": 1.1084560955805591, + "grad_norm": 11.319629458424366, + "learning_rate": 2e-06, + "loss": 0.3652, + "step": 4778 + }, + { + "epoch": 1.1086880872288598, + "grad_norm": 13.690492530236053, + "learning_rate": 2e-06, + "loss": 0.2366, + "step": 4779 + }, + { + "epoch": 1.1089200788771605, + "grad_norm": 9.632568788720638, + "learning_rate": 2e-06, + "loss": 0.2421, + "step": 4780 + }, + { + "epoch": 1.1091520705254612, + "grad_norm": 12.249528407753822, + "learning_rate": 2e-06, + "loss": 0.2144, + "step": 4781 + }, + { + "epoch": 1.1093840621737618, + "grad_norm": 14.319966268970228, + "learning_rate": 2e-06, + "loss": 0.2755, + "step": 4782 + }, + { + "epoch": 1.1096160538220623, + "grad_norm": 25.04977436131108, + "learning_rate": 2e-06, + "loss": 0.2159, + "step": 4783 + }, + { + "epoch": 1.109848045470363, + "grad_norm": 13.118374923070153, + "learning_rate": 2e-06, + "loss": 0.2431, + "step": 4784 + }, + { + "epoch": 1.1100800371186637, + "grad_norm": 13.918332645947876, + "learning_rate": 2e-06, + "loss": 0.3029, + "step": 4785 + }, + { + "epoch": 1.1103120287669643, + "grad_norm": 17.479860616245222, + "learning_rate": 2e-06, + "loss": 0.3775, + "step": 4786 + }, + { + "epoch": 1.110544020415265, + "grad_norm": 14.610461992797951, + "learning_rate": 2e-06, + "loss": 0.2226, + "step": 4787 + }, + { + "epoch": 1.1107760120635657, + "grad_norm": 18.220638020090497, + "learning_rate": 2e-06, + "loss": 0.2144, + "step": 4788 + }, + { + "epoch": 1.1110080037118664, + "grad_norm": 11.609900531076313, + "learning_rate": 2e-06, + "loss": 0.2249, + "step": 4789 + }, + { + "epoch": 1.111239995360167, + "grad_norm": 11.198809304515796, + "learning_rate": 2e-06, + "loss": 0.2243, + "step": 4790 + }, + { + "epoch": 1.1114719870084677, + "grad_norm": 17.489913518711848, + "learning_rate": 2e-06, + "loss": 0.2504, + "step": 4791 + }, + { + "epoch": 1.1117039786567684, + "grad_norm": 12.78910159612698, + "learning_rate": 2e-06, + "loss": 0.3694, + "step": 4792 + }, + { + "epoch": 1.111935970305069, + "grad_norm": 15.772843023314104, + "learning_rate": 2e-06, + "loss": 0.3384, + "step": 4793 + }, + { + "epoch": 1.1121679619533698, + "grad_norm": 11.195851794747387, + "learning_rate": 2e-06, + "loss": 0.2107, + "step": 4794 + }, + { + "epoch": 1.1123999536016704, + "grad_norm": 9.780183781534575, + "learning_rate": 2e-06, + "loss": 0.1513, + "step": 4795 + }, + { + "epoch": 1.112631945249971, + "grad_norm": 11.903280414707714, + "learning_rate": 2e-06, + "loss": 0.2401, + "step": 4796 + }, + { + "epoch": 1.1128639368982716, + "grad_norm": 12.375824587748216, + "learning_rate": 2e-06, + "loss": 0.3704, + "step": 4797 + }, + { + "epoch": 1.1130959285465722, + "grad_norm": 15.841434537765656, + "learning_rate": 2e-06, + "loss": 0.267, + "step": 4798 + }, + { + "epoch": 1.113327920194873, + "grad_norm": 16.05445964383955, + "learning_rate": 2e-06, + "loss": 0.2876, + "step": 4799 + }, + { + "epoch": 1.1135599118431736, + "grad_norm": 16.170941162145347, + "learning_rate": 2e-06, + "loss": 0.4808, + "step": 4800 + }, + { + "epoch": 1.1137919034914743, + "grad_norm": 7.983958612287163, + "learning_rate": 2e-06, + "loss": 0.2322, + "step": 4801 + }, + { + "epoch": 1.114023895139775, + "grad_norm": 17.947298006751954, + "learning_rate": 2e-06, + "loss": 0.2394, + "step": 4802 + }, + { + "epoch": 1.1142558867880756, + "grad_norm": 14.904696965127243, + "learning_rate": 2e-06, + "loss": 0.2698, + "step": 4803 + }, + { + "epoch": 1.1144878784363763, + "grad_norm": 24.716054456606248, + "learning_rate": 2e-06, + "loss": 0.3391, + "step": 4804 + }, + { + "epoch": 1.114719870084677, + "grad_norm": 17.012778768945804, + "learning_rate": 2e-06, + "loss": 0.2561, + "step": 4805 + }, + { + "epoch": 1.1149518617329777, + "grad_norm": 11.77157842120241, + "learning_rate": 2e-06, + "loss": 0.2345, + "step": 4806 + }, + { + "epoch": 1.1151838533812783, + "grad_norm": 17.869538222418846, + "learning_rate": 2e-06, + "loss": 0.2186, + "step": 4807 + }, + { + "epoch": 1.115415845029579, + "grad_norm": 7.464176480789736, + "learning_rate": 2e-06, + "loss": 0.1725, + "step": 4808 + }, + { + "epoch": 1.1156478366778795, + "grad_norm": 5.421341474728665, + "learning_rate": 2e-06, + "loss": 0.209, + "step": 4809 + }, + { + "epoch": 1.1158798283261802, + "grad_norm": 10.89685000856278, + "learning_rate": 2e-06, + "loss": 0.1944, + "step": 4810 + }, + { + "epoch": 1.1161118199744808, + "grad_norm": 13.971761511853458, + "learning_rate": 2e-06, + "loss": 0.3304, + "step": 4811 + }, + { + "epoch": 1.1163438116227815, + "grad_norm": 34.40954485204369, + "learning_rate": 2e-06, + "loss": 0.394, + "step": 4812 + }, + { + "epoch": 1.1165758032710822, + "grad_norm": 15.488875948943251, + "learning_rate": 2e-06, + "loss": 0.2046, + "step": 4813 + }, + { + "epoch": 1.1168077949193829, + "grad_norm": 12.132601391775486, + "learning_rate": 2e-06, + "loss": 0.2219, + "step": 4814 + }, + { + "epoch": 1.1170397865676835, + "grad_norm": 13.424740585546829, + "learning_rate": 2e-06, + "loss": 0.221, + "step": 4815 + }, + { + "epoch": 1.1172717782159842, + "grad_norm": 8.659678873098684, + "learning_rate": 2e-06, + "loss": 0.2093, + "step": 4816 + }, + { + "epoch": 1.117503769864285, + "grad_norm": 11.50236687791197, + "learning_rate": 2e-06, + "loss": 0.2311, + "step": 4817 + }, + { + "epoch": 1.1177357615125856, + "grad_norm": 18.164120226051143, + "learning_rate": 2e-06, + "loss": 0.2499, + "step": 4818 + }, + { + "epoch": 1.1179677531608863, + "grad_norm": 17.800646370079132, + "learning_rate": 2e-06, + "loss": 0.223, + "step": 4819 + }, + { + "epoch": 1.118199744809187, + "grad_norm": 9.323484868669501, + "learning_rate": 2e-06, + "loss": 0.1765, + "step": 4820 + }, + { + "epoch": 1.1184317364574876, + "grad_norm": 16.824993656613074, + "learning_rate": 2e-06, + "loss": 0.312, + "step": 4821 + }, + { + "epoch": 1.1186637281057883, + "grad_norm": 12.203168704174818, + "learning_rate": 2e-06, + "loss": 0.2344, + "step": 4822 + }, + { + "epoch": 1.118895719754089, + "grad_norm": 31.56245938171965, + "learning_rate": 2e-06, + "loss": 0.3754, + "step": 4823 + }, + { + "epoch": 1.1191277114023894, + "grad_norm": 8.968222747142645, + "learning_rate": 2e-06, + "loss": 0.1932, + "step": 4824 + }, + { + "epoch": 1.11935970305069, + "grad_norm": 16.992866517419166, + "learning_rate": 2e-06, + "loss": 0.2995, + "step": 4825 + }, + { + "epoch": 1.1195916946989908, + "grad_norm": 10.459429594675886, + "learning_rate": 2e-06, + "loss": 0.352, + "step": 4826 + }, + { + "epoch": 1.1198236863472915, + "grad_norm": 13.449766556141963, + "learning_rate": 2e-06, + "loss": 0.2041, + "step": 4827 + }, + { + "epoch": 1.1200556779955921, + "grad_norm": 12.05557560578355, + "learning_rate": 2e-06, + "loss": 0.2017, + "step": 4828 + }, + { + "epoch": 1.1202876696438928, + "grad_norm": 9.367438870463518, + "learning_rate": 2e-06, + "loss": 0.2387, + "step": 4829 + }, + { + "epoch": 1.1205196612921935, + "grad_norm": 17.76365816605677, + "learning_rate": 2e-06, + "loss": 0.1864, + "step": 4830 + }, + { + "epoch": 1.1207516529404942, + "grad_norm": 15.404675931816326, + "learning_rate": 2e-06, + "loss": 0.2811, + "step": 4831 + }, + { + "epoch": 1.1209836445887948, + "grad_norm": 15.29386673652393, + "learning_rate": 2e-06, + "loss": 0.2481, + "step": 4832 + }, + { + "epoch": 1.1212156362370955, + "grad_norm": 8.263621968845886, + "learning_rate": 2e-06, + "loss": 0.2153, + "step": 4833 + }, + { + "epoch": 1.1214476278853962, + "grad_norm": 11.598973711992286, + "learning_rate": 2e-06, + "loss": 0.2205, + "step": 4834 + }, + { + "epoch": 1.1216796195336969, + "grad_norm": 12.121246868598893, + "learning_rate": 2e-06, + "loss": 0.2727, + "step": 4835 + }, + { + "epoch": 1.1219116111819973, + "grad_norm": 21.845578092591122, + "learning_rate": 2e-06, + "loss": 0.247, + "step": 4836 + }, + { + "epoch": 1.122143602830298, + "grad_norm": 26.244677083522305, + "learning_rate": 2e-06, + "loss": 0.4809, + "step": 4837 + }, + { + "epoch": 1.1223755944785987, + "grad_norm": 14.525950272406515, + "learning_rate": 2e-06, + "loss": 0.2579, + "step": 4838 + }, + { + "epoch": 1.1226075861268994, + "grad_norm": 8.921911193753202, + "learning_rate": 2e-06, + "loss": 0.2388, + "step": 4839 + }, + { + "epoch": 1.1228395777752, + "grad_norm": 18.573858193690356, + "learning_rate": 2e-06, + "loss": 0.2852, + "step": 4840 + }, + { + "epoch": 1.1230715694235007, + "grad_norm": 15.231148127881918, + "learning_rate": 2e-06, + "loss": 0.3048, + "step": 4841 + }, + { + "epoch": 1.1233035610718014, + "grad_norm": 14.311189724319316, + "learning_rate": 2e-06, + "loss": 0.174, + "step": 4842 + }, + { + "epoch": 1.123535552720102, + "grad_norm": 9.759725795906759, + "learning_rate": 2e-06, + "loss": 0.2344, + "step": 4843 + }, + { + "epoch": 1.1237675443684028, + "grad_norm": 16.32478232769143, + "learning_rate": 2e-06, + "loss": 0.3097, + "step": 4844 + }, + { + "epoch": 1.1239995360167034, + "grad_norm": 11.181963687751907, + "learning_rate": 2e-06, + "loss": 0.2216, + "step": 4845 + }, + { + "epoch": 1.1242315276650041, + "grad_norm": 10.353225268015038, + "learning_rate": 2e-06, + "loss": 0.2954, + "step": 4846 + }, + { + "epoch": 1.1244635193133048, + "grad_norm": 10.309147818411686, + "learning_rate": 2e-06, + "loss": 0.1935, + "step": 4847 + }, + { + "epoch": 1.1246955109616055, + "grad_norm": 14.911985063186178, + "learning_rate": 2e-06, + "loss": 0.228, + "step": 4848 + }, + { + "epoch": 1.1249275026099061, + "grad_norm": 21.648534920152432, + "learning_rate": 2e-06, + "loss": 0.3459, + "step": 4849 + }, + { + "epoch": 1.1251594942582068, + "grad_norm": 12.10124114368933, + "learning_rate": 2e-06, + "loss": 0.2225, + "step": 4850 + }, + { + "epoch": 1.1253914859065073, + "grad_norm": 15.01947620297207, + "learning_rate": 2e-06, + "loss": 0.2392, + "step": 4851 + }, + { + "epoch": 1.125623477554808, + "grad_norm": 22.524618954163767, + "learning_rate": 2e-06, + "loss": 0.2461, + "step": 4852 + }, + { + "epoch": 1.1258554692031086, + "grad_norm": 20.873088264065554, + "learning_rate": 2e-06, + "loss": 0.3608, + "step": 4853 + }, + { + "epoch": 1.1260874608514093, + "grad_norm": 13.904202955837375, + "learning_rate": 2e-06, + "loss": 0.2254, + "step": 4854 + }, + { + "epoch": 1.12631945249971, + "grad_norm": 26.509242877229937, + "learning_rate": 2e-06, + "loss": 0.3359, + "step": 4855 + }, + { + "epoch": 1.1265514441480107, + "grad_norm": 13.028136016976488, + "learning_rate": 2e-06, + "loss": 0.2455, + "step": 4856 + }, + { + "epoch": 1.1267834357963114, + "grad_norm": 7.319785950351981, + "learning_rate": 2e-06, + "loss": 0.1814, + "step": 4857 + }, + { + "epoch": 1.127015427444612, + "grad_norm": 20.918159000298974, + "learning_rate": 2e-06, + "loss": 0.3011, + "step": 4858 + }, + { + "epoch": 1.1272474190929127, + "grad_norm": 9.220278402493546, + "learning_rate": 2e-06, + "loss": 0.2005, + "step": 4859 + }, + { + "epoch": 1.1274794107412134, + "grad_norm": 7.8435657373328445, + "learning_rate": 2e-06, + "loss": 0.1529, + "step": 4860 + }, + { + "epoch": 1.127711402389514, + "grad_norm": 12.331950357133152, + "learning_rate": 2e-06, + "loss": 0.2644, + "step": 4861 + }, + { + "epoch": 1.1279433940378145, + "grad_norm": 11.468091056813932, + "learning_rate": 2e-06, + "loss": 0.2825, + "step": 4862 + }, + { + "epoch": 1.1281753856861152, + "grad_norm": 9.125446465633535, + "learning_rate": 2e-06, + "loss": 0.1779, + "step": 4863 + }, + { + "epoch": 1.1284073773344159, + "grad_norm": 16.561095932862465, + "learning_rate": 2e-06, + "loss": 0.3371, + "step": 4864 + }, + { + "epoch": 1.1286393689827166, + "grad_norm": 13.373293675830446, + "learning_rate": 2e-06, + "loss": 0.3443, + "step": 4865 + }, + { + "epoch": 1.1288713606310172, + "grad_norm": 20.618005722280156, + "learning_rate": 2e-06, + "loss": 0.302, + "step": 4866 + }, + { + "epoch": 1.129103352279318, + "grad_norm": 20.43437800180757, + "learning_rate": 2e-06, + "loss": 0.4071, + "step": 4867 + }, + { + "epoch": 1.1293353439276186, + "grad_norm": 23.09409807583735, + "learning_rate": 2e-06, + "loss": 0.3352, + "step": 4868 + }, + { + "epoch": 1.1295673355759193, + "grad_norm": 21.548952088035673, + "learning_rate": 2e-06, + "loss": 0.2033, + "step": 4869 + }, + { + "epoch": 1.12979932722422, + "grad_norm": 13.179588125487964, + "learning_rate": 2e-06, + "loss": 0.1926, + "step": 4870 + }, + { + "epoch": 1.1300313188725206, + "grad_norm": 17.560998374400327, + "learning_rate": 2e-06, + "loss": 0.3046, + "step": 4871 + }, + { + "epoch": 1.1302633105208213, + "grad_norm": 15.359237858726829, + "learning_rate": 2e-06, + "loss": 0.2663, + "step": 4872 + }, + { + "epoch": 1.130495302169122, + "grad_norm": 19.264672448088973, + "learning_rate": 2e-06, + "loss": 0.311, + "step": 4873 + }, + { + "epoch": 1.1307272938174227, + "grad_norm": 18.49971199105671, + "learning_rate": 2e-06, + "loss": 0.2726, + "step": 4874 + }, + { + "epoch": 1.1309592854657233, + "grad_norm": 16.811942294775953, + "learning_rate": 2e-06, + "loss": 0.3512, + "step": 4875 + }, + { + "epoch": 1.131191277114024, + "grad_norm": 10.463711626604983, + "learning_rate": 2e-06, + "loss": 0.2304, + "step": 4876 + }, + { + "epoch": 1.1314232687623245, + "grad_norm": 12.918641341385287, + "learning_rate": 2e-06, + "loss": 0.2987, + "step": 4877 + }, + { + "epoch": 1.1316552604106251, + "grad_norm": 7.4286140248356825, + "learning_rate": 2e-06, + "loss": 0.1508, + "step": 4878 + }, + { + "epoch": 1.1318872520589258, + "grad_norm": 20.665785145642293, + "learning_rate": 2e-06, + "loss": 0.365, + "step": 4879 + }, + { + "epoch": 1.1321192437072265, + "grad_norm": 12.166636770416769, + "learning_rate": 2e-06, + "loss": 0.2197, + "step": 4880 + }, + { + "epoch": 1.1323512353555272, + "grad_norm": 15.393645357245871, + "learning_rate": 2e-06, + "loss": 0.3478, + "step": 4881 + }, + { + "epoch": 1.1325832270038279, + "grad_norm": 9.480527037233777, + "learning_rate": 2e-06, + "loss": 0.1504, + "step": 4882 + }, + { + "epoch": 1.1328152186521285, + "grad_norm": 10.986611255263721, + "learning_rate": 2e-06, + "loss": 0.2744, + "step": 4883 + }, + { + "epoch": 1.1330472103004292, + "grad_norm": 8.984061788935852, + "learning_rate": 2e-06, + "loss": 0.2151, + "step": 4884 + }, + { + "epoch": 1.1332792019487299, + "grad_norm": 19.65763052847948, + "learning_rate": 2e-06, + "loss": 0.2741, + "step": 4885 + }, + { + "epoch": 1.1335111935970306, + "grad_norm": 10.736693671979038, + "learning_rate": 2e-06, + "loss": 0.1959, + "step": 4886 + }, + { + "epoch": 1.1337431852453312, + "grad_norm": 14.183338903446597, + "learning_rate": 2e-06, + "loss": 0.281, + "step": 4887 + }, + { + "epoch": 1.133975176893632, + "grad_norm": 9.740995781048545, + "learning_rate": 2e-06, + "loss": 0.1822, + "step": 4888 + }, + { + "epoch": 1.1342071685419324, + "grad_norm": 21.632445425240174, + "learning_rate": 2e-06, + "loss": 0.3031, + "step": 4889 + }, + { + "epoch": 1.134439160190233, + "grad_norm": 24.816556530453955, + "learning_rate": 2e-06, + "loss": 0.3508, + "step": 4890 + }, + { + "epoch": 1.1346711518385337, + "grad_norm": 13.454497821313662, + "learning_rate": 2e-06, + "loss": 0.196, + "step": 4891 + }, + { + "epoch": 1.1349031434868344, + "grad_norm": 6.912301352121711, + "learning_rate": 2e-06, + "loss": 0.18, + "step": 4892 + }, + { + "epoch": 1.135135135135135, + "grad_norm": 20.439880621178403, + "learning_rate": 2e-06, + "loss": 0.2886, + "step": 4893 + }, + { + "epoch": 1.1353671267834358, + "grad_norm": 19.2432781790515, + "learning_rate": 2e-06, + "loss": 0.3268, + "step": 4894 + }, + { + "epoch": 1.1355991184317364, + "grad_norm": 12.769986602500271, + "learning_rate": 2e-06, + "loss": 0.2096, + "step": 4895 + }, + { + "epoch": 1.1358311100800371, + "grad_norm": 11.600290293189488, + "learning_rate": 2e-06, + "loss": 0.1359, + "step": 4896 + }, + { + "epoch": 1.1360631017283378, + "grad_norm": 15.612197475543613, + "learning_rate": 2e-06, + "loss": 0.2858, + "step": 4897 + }, + { + "epoch": 1.1362950933766385, + "grad_norm": 8.466076316561889, + "learning_rate": 2e-06, + "loss": 0.1672, + "step": 4898 + }, + { + "epoch": 1.1365270850249392, + "grad_norm": 14.876673188857252, + "learning_rate": 2e-06, + "loss": 0.3044, + "step": 4899 + }, + { + "epoch": 1.1367590766732398, + "grad_norm": 11.133115228172553, + "learning_rate": 2e-06, + "loss": 0.1983, + "step": 4900 + }, + { + "epoch": 1.1369910683215405, + "grad_norm": 8.143066059932629, + "learning_rate": 2e-06, + "loss": 0.1884, + "step": 4901 + }, + { + "epoch": 1.1372230599698412, + "grad_norm": 17.890975380490254, + "learning_rate": 2e-06, + "loss": 0.2398, + "step": 4902 + }, + { + "epoch": 1.1374550516181419, + "grad_norm": 15.559934780180642, + "learning_rate": 2e-06, + "loss": 0.2069, + "step": 4903 + }, + { + "epoch": 1.1376870432664423, + "grad_norm": 21.19586601560053, + "learning_rate": 2e-06, + "loss": 0.3452, + "step": 4904 + }, + { + "epoch": 1.137919034914743, + "grad_norm": 18.280108012771613, + "learning_rate": 2e-06, + "loss": 0.3406, + "step": 4905 + }, + { + "epoch": 1.1381510265630437, + "grad_norm": 13.982547036255285, + "learning_rate": 2e-06, + "loss": 0.2576, + "step": 4906 + }, + { + "epoch": 1.1383830182113444, + "grad_norm": 11.804514631444446, + "learning_rate": 2e-06, + "loss": 0.229, + "step": 4907 + }, + { + "epoch": 1.138615009859645, + "grad_norm": 19.541278639122623, + "learning_rate": 2e-06, + "loss": 0.2622, + "step": 4908 + }, + { + "epoch": 1.1388470015079457, + "grad_norm": 14.760808289312461, + "learning_rate": 2e-06, + "loss": 0.2904, + "step": 4909 + }, + { + "epoch": 1.1390789931562464, + "grad_norm": 19.011030729462316, + "learning_rate": 2e-06, + "loss": 0.2777, + "step": 4910 + }, + { + "epoch": 1.139310984804547, + "grad_norm": 6.137233877763751, + "learning_rate": 2e-06, + "loss": 0.1433, + "step": 4911 + }, + { + "epoch": 1.1395429764528477, + "grad_norm": 14.918947462083826, + "learning_rate": 2e-06, + "loss": 0.2392, + "step": 4912 + }, + { + "epoch": 1.1397749681011484, + "grad_norm": 14.772137907592853, + "learning_rate": 2e-06, + "loss": 0.1439, + "step": 4913 + }, + { + "epoch": 1.140006959749449, + "grad_norm": 14.608881128373772, + "learning_rate": 2e-06, + "loss": 0.2408, + "step": 4914 + }, + { + "epoch": 1.1402389513977498, + "grad_norm": 9.58689032170824, + "learning_rate": 2e-06, + "loss": 0.1699, + "step": 4915 + }, + { + "epoch": 1.1404709430460502, + "grad_norm": 11.368371089686674, + "learning_rate": 2e-06, + "loss": 0.1777, + "step": 4916 + }, + { + "epoch": 1.140702934694351, + "grad_norm": 14.5766053931017, + "learning_rate": 2e-06, + "loss": 0.1859, + "step": 4917 + }, + { + "epoch": 1.1409349263426516, + "grad_norm": 9.122800332401326, + "learning_rate": 2e-06, + "loss": 0.1833, + "step": 4918 + }, + { + "epoch": 1.1411669179909523, + "grad_norm": 9.61504687863536, + "learning_rate": 2e-06, + "loss": 0.1986, + "step": 4919 + }, + { + "epoch": 1.141398909639253, + "grad_norm": 21.523746206148463, + "learning_rate": 2e-06, + "loss": 0.3443, + "step": 4920 + }, + { + "epoch": 1.1416309012875536, + "grad_norm": 14.444470032653033, + "learning_rate": 2e-06, + "loss": 0.1923, + "step": 4921 + }, + { + "epoch": 1.1418628929358543, + "grad_norm": 18.236309948457443, + "learning_rate": 2e-06, + "loss": 0.2983, + "step": 4922 + }, + { + "epoch": 1.142094884584155, + "grad_norm": 13.92540097571774, + "learning_rate": 2e-06, + "loss": 0.193, + "step": 4923 + }, + { + "epoch": 1.1423268762324557, + "grad_norm": 24.180937619439668, + "learning_rate": 2e-06, + "loss": 0.3859, + "step": 4924 + }, + { + "epoch": 1.1425588678807563, + "grad_norm": 15.437820412405143, + "learning_rate": 2e-06, + "loss": 0.268, + "step": 4925 + }, + { + "epoch": 1.142790859529057, + "grad_norm": 13.582142530612812, + "learning_rate": 2e-06, + "loss": 0.2587, + "step": 4926 + }, + { + "epoch": 1.1430228511773577, + "grad_norm": 15.043145299085626, + "learning_rate": 2e-06, + "loss": 0.2439, + "step": 4927 + }, + { + "epoch": 1.1432548428256584, + "grad_norm": 16.86067192142225, + "learning_rate": 2e-06, + "loss": 0.2741, + "step": 4928 + }, + { + "epoch": 1.143486834473959, + "grad_norm": 18.408278721917398, + "learning_rate": 2e-06, + "loss": 0.2853, + "step": 4929 + }, + { + "epoch": 1.1437188261222595, + "grad_norm": 11.034525892499884, + "learning_rate": 2e-06, + "loss": 0.2126, + "step": 4930 + }, + { + "epoch": 1.1439508177705602, + "grad_norm": 8.690261559663353, + "learning_rate": 2e-06, + "loss": 0.1887, + "step": 4931 + }, + { + "epoch": 1.1441828094188609, + "grad_norm": 10.915617366321268, + "learning_rate": 2e-06, + "loss": 0.2147, + "step": 4932 + }, + { + "epoch": 1.1444148010671615, + "grad_norm": 6.999836918162795, + "learning_rate": 2e-06, + "loss": 0.2213, + "step": 4933 + }, + { + "epoch": 1.1446467927154622, + "grad_norm": 12.861398225532644, + "learning_rate": 2e-06, + "loss": 0.2748, + "step": 4934 + }, + { + "epoch": 1.144878784363763, + "grad_norm": 21.73627815965974, + "learning_rate": 2e-06, + "loss": 0.2468, + "step": 4935 + }, + { + "epoch": 1.1451107760120636, + "grad_norm": 5.840407410752199, + "learning_rate": 2e-06, + "loss": 0.1333, + "step": 4936 + }, + { + "epoch": 1.1453427676603642, + "grad_norm": 13.66647412745888, + "learning_rate": 2e-06, + "loss": 0.254, + "step": 4937 + }, + { + "epoch": 1.145574759308665, + "grad_norm": 10.955800205327387, + "learning_rate": 2e-06, + "loss": 0.2701, + "step": 4938 + }, + { + "epoch": 1.1458067509569656, + "grad_norm": 18.279641821552023, + "learning_rate": 2e-06, + "loss": 0.3168, + "step": 4939 + }, + { + "epoch": 1.1460387426052663, + "grad_norm": 13.678123638896912, + "learning_rate": 2e-06, + "loss": 0.1945, + "step": 4940 + }, + { + "epoch": 1.146270734253567, + "grad_norm": 20.690435466350603, + "learning_rate": 2e-06, + "loss": 0.3321, + "step": 4941 + }, + { + "epoch": 1.1465027259018674, + "grad_norm": 11.352089339007339, + "learning_rate": 2e-06, + "loss": 0.2042, + "step": 4942 + }, + { + "epoch": 1.146734717550168, + "grad_norm": 15.102901486629841, + "learning_rate": 2e-06, + "loss": 0.1913, + "step": 4943 + }, + { + "epoch": 1.1469667091984688, + "grad_norm": 16.465122437217744, + "learning_rate": 2e-06, + "loss": 0.2494, + "step": 4944 + }, + { + "epoch": 1.1471987008467694, + "grad_norm": 7.706423255130726, + "learning_rate": 2e-06, + "loss": 0.1918, + "step": 4945 + }, + { + "epoch": 1.1474306924950701, + "grad_norm": 13.249898609853025, + "learning_rate": 2e-06, + "loss": 0.2462, + "step": 4946 + }, + { + "epoch": 1.1476626841433708, + "grad_norm": 21.2608644464412, + "learning_rate": 2e-06, + "loss": 0.3209, + "step": 4947 + }, + { + "epoch": 1.1478946757916715, + "grad_norm": 8.674472393729499, + "learning_rate": 2e-06, + "loss": 0.1809, + "step": 4948 + }, + { + "epoch": 1.1481266674399722, + "grad_norm": 12.503974947717918, + "learning_rate": 2e-06, + "loss": 0.3836, + "step": 4949 + }, + { + "epoch": 1.1483586590882728, + "grad_norm": 20.12930606562257, + "learning_rate": 2e-06, + "loss": 0.1928, + "step": 4950 + }, + { + "epoch": 1.1485906507365735, + "grad_norm": 20.94522005014584, + "learning_rate": 2e-06, + "loss": 0.2479, + "step": 4951 + }, + { + "epoch": 1.1488226423848742, + "grad_norm": 24.26873660368573, + "learning_rate": 2e-06, + "loss": 0.2137, + "step": 4952 + }, + { + "epoch": 1.1490546340331749, + "grad_norm": 10.406117471367917, + "learning_rate": 2e-06, + "loss": 0.2668, + "step": 4953 + }, + { + "epoch": 1.1492866256814755, + "grad_norm": 11.010379737891737, + "learning_rate": 2e-06, + "loss": 0.1913, + "step": 4954 + }, + { + "epoch": 1.1495186173297762, + "grad_norm": 12.299171427398806, + "learning_rate": 2e-06, + "loss": 0.1321, + "step": 4955 + }, + { + "epoch": 1.149750608978077, + "grad_norm": 17.37082274280718, + "learning_rate": 2e-06, + "loss": 0.3217, + "step": 4956 + }, + { + "epoch": 1.1499826006263774, + "grad_norm": 14.921465025425027, + "learning_rate": 2e-06, + "loss": 0.3425, + "step": 4957 + }, + { + "epoch": 1.150214592274678, + "grad_norm": 17.521596392801534, + "learning_rate": 2e-06, + "loss": 0.3958, + "step": 4958 + }, + { + "epoch": 1.1504465839229787, + "grad_norm": 15.05574840704271, + "learning_rate": 2e-06, + "loss": 0.2477, + "step": 4959 + }, + { + "epoch": 1.1506785755712794, + "grad_norm": 9.486800586659157, + "learning_rate": 2e-06, + "loss": 0.2234, + "step": 4960 + }, + { + "epoch": 1.15091056721958, + "grad_norm": 10.712302390953361, + "learning_rate": 2e-06, + "loss": 0.2072, + "step": 4961 + }, + { + "epoch": 1.1511425588678807, + "grad_norm": 10.90126140152789, + "learning_rate": 2e-06, + "loss": 0.3454, + "step": 4962 + }, + { + "epoch": 1.1513745505161814, + "grad_norm": 20.62003729182546, + "learning_rate": 2e-06, + "loss": 0.3847, + "step": 4963 + }, + { + "epoch": 1.151606542164482, + "grad_norm": 5.952093748397747, + "learning_rate": 2e-06, + "loss": 0.1854, + "step": 4964 + }, + { + "epoch": 1.1518385338127828, + "grad_norm": 21.558466694925713, + "learning_rate": 2e-06, + "loss": 0.3083, + "step": 4965 + }, + { + "epoch": 1.1520705254610835, + "grad_norm": 16.098757853215655, + "learning_rate": 2e-06, + "loss": 0.2644, + "step": 4966 + }, + { + "epoch": 1.1523025171093841, + "grad_norm": 10.083478843015879, + "learning_rate": 2e-06, + "loss": 0.293, + "step": 4967 + }, + { + "epoch": 1.1525345087576848, + "grad_norm": 14.425534089566499, + "learning_rate": 2e-06, + "loss": 0.2881, + "step": 4968 + }, + { + "epoch": 1.1527665004059853, + "grad_norm": 18.679608904856032, + "learning_rate": 2e-06, + "loss": 0.3362, + "step": 4969 + }, + { + "epoch": 1.152998492054286, + "grad_norm": 21.405848462538046, + "learning_rate": 2e-06, + "loss": 0.2952, + "step": 4970 + }, + { + "epoch": 1.1532304837025866, + "grad_norm": 18.534218012077417, + "learning_rate": 2e-06, + "loss": 0.2789, + "step": 4971 + }, + { + "epoch": 1.1534624753508873, + "grad_norm": 16.90363676355196, + "learning_rate": 2e-06, + "loss": 0.2342, + "step": 4972 + }, + { + "epoch": 1.153694466999188, + "grad_norm": 20.269954202409547, + "learning_rate": 2e-06, + "loss": 0.2938, + "step": 4973 + }, + { + "epoch": 1.1539264586474887, + "grad_norm": 14.655589208363546, + "learning_rate": 2e-06, + "loss": 0.2959, + "step": 4974 + }, + { + "epoch": 1.1541584502957893, + "grad_norm": 16.179437731609507, + "learning_rate": 2e-06, + "loss": 0.3986, + "step": 4975 + }, + { + "epoch": 1.15439044194409, + "grad_norm": 17.471580867845255, + "learning_rate": 2e-06, + "loss": 0.3414, + "step": 4976 + }, + { + "epoch": 1.1546224335923907, + "grad_norm": 15.168744371031757, + "learning_rate": 2e-06, + "loss": 0.2526, + "step": 4977 + }, + { + "epoch": 1.1548544252406914, + "grad_norm": 14.814605235511708, + "learning_rate": 2e-06, + "loss": 0.297, + "step": 4978 + }, + { + "epoch": 1.155086416888992, + "grad_norm": 5.064559732869771, + "learning_rate": 2e-06, + "loss": 0.1687, + "step": 4979 + }, + { + "epoch": 1.1553184085372927, + "grad_norm": 22.91198371044645, + "learning_rate": 2e-06, + "loss": 0.3917, + "step": 4980 + }, + { + "epoch": 1.1555504001855934, + "grad_norm": 10.770969937868816, + "learning_rate": 2e-06, + "loss": 0.2501, + "step": 4981 + }, + { + "epoch": 1.155782391833894, + "grad_norm": 15.982737134996356, + "learning_rate": 2e-06, + "loss": 0.2392, + "step": 4982 + }, + { + "epoch": 1.1560143834821948, + "grad_norm": 16.99469144176511, + "learning_rate": 2e-06, + "loss": 0.3181, + "step": 4983 + }, + { + "epoch": 1.1562463751304952, + "grad_norm": 25.361483749720282, + "learning_rate": 2e-06, + "loss": 0.2793, + "step": 4984 + }, + { + "epoch": 1.156478366778796, + "grad_norm": 11.43453219225867, + "learning_rate": 2e-06, + "loss": 0.2394, + "step": 4985 + }, + { + "epoch": 1.1567103584270966, + "grad_norm": 17.527726479771758, + "learning_rate": 2e-06, + "loss": 0.3313, + "step": 4986 + }, + { + "epoch": 1.1569423500753973, + "grad_norm": 23.029281340142415, + "learning_rate": 2e-06, + "loss": 0.2906, + "step": 4987 + }, + { + "epoch": 1.157174341723698, + "grad_norm": 15.859396445352443, + "learning_rate": 2e-06, + "loss": 0.1847, + "step": 4988 + }, + { + "epoch": 1.1574063333719986, + "grad_norm": 18.52207039945924, + "learning_rate": 2e-06, + "loss": 0.232, + "step": 4989 + }, + { + "epoch": 1.1576383250202993, + "grad_norm": 18.5323290413136, + "learning_rate": 2e-06, + "loss": 0.2638, + "step": 4990 + }, + { + "epoch": 1.1578703166686, + "grad_norm": 11.726869560068051, + "learning_rate": 2e-06, + "loss": 0.2262, + "step": 4991 + }, + { + "epoch": 1.1581023083169006, + "grad_norm": 10.325741206233513, + "learning_rate": 2e-06, + "loss": 0.2112, + "step": 4992 + }, + { + "epoch": 1.1583342999652013, + "grad_norm": 18.185472349818276, + "learning_rate": 2e-06, + "loss": 0.2729, + "step": 4993 + }, + { + "epoch": 1.158566291613502, + "grad_norm": 16.719998948739807, + "learning_rate": 2e-06, + "loss": 0.2897, + "step": 4994 + }, + { + "epoch": 1.1587982832618025, + "grad_norm": 16.404411179286353, + "learning_rate": 2e-06, + "loss": 0.2828, + "step": 4995 + }, + { + "epoch": 1.1590302749101031, + "grad_norm": 15.826079148596243, + "learning_rate": 2e-06, + "loss": 0.1987, + "step": 4996 + }, + { + "epoch": 1.1592622665584038, + "grad_norm": 23.52084532680064, + "learning_rate": 2e-06, + "loss": 0.3933, + "step": 4997 + }, + { + "epoch": 1.1594942582067045, + "grad_norm": 19.727342270029716, + "learning_rate": 2e-06, + "loss": 0.3705, + "step": 4998 + }, + { + "epoch": 1.1597262498550052, + "grad_norm": 8.532466128418308, + "learning_rate": 2e-06, + "loss": 0.2316, + "step": 4999 + }, + { + "epoch": 1.1599582415033058, + "grad_norm": 18.545541497081892, + "learning_rate": 2e-06, + "loss": 0.2471, + "step": 5000 + }, + { + "epoch": 1.1601902331516065, + "grad_norm": 7.39760627147255, + "learning_rate": 2e-06, + "loss": 0.1612, + "step": 5001 + }, + { + "epoch": 1.1604222247999072, + "grad_norm": 10.978935659260085, + "learning_rate": 2e-06, + "loss": 0.2346, + "step": 5002 + }, + { + "epoch": 1.1606542164482079, + "grad_norm": 11.515475457268554, + "learning_rate": 2e-06, + "loss": 0.2053, + "step": 5003 + }, + { + "epoch": 1.1608862080965086, + "grad_norm": 6.859243508484112, + "learning_rate": 2e-06, + "loss": 0.1754, + "step": 5004 + }, + { + "epoch": 1.1611181997448092, + "grad_norm": 22.30127623099719, + "learning_rate": 2e-06, + "loss": 0.2655, + "step": 5005 + }, + { + "epoch": 1.16135019139311, + "grad_norm": 18.018850253614218, + "learning_rate": 2e-06, + "loss": 0.2342, + "step": 5006 + }, + { + "epoch": 1.1615821830414106, + "grad_norm": 14.185858142945714, + "learning_rate": 2e-06, + "loss": 0.2357, + "step": 5007 + }, + { + "epoch": 1.1618141746897113, + "grad_norm": 13.850770859563564, + "learning_rate": 2e-06, + "loss": 0.2282, + "step": 5008 + }, + { + "epoch": 1.162046166338012, + "grad_norm": 20.168032983436802, + "learning_rate": 2e-06, + "loss": 0.3488, + "step": 5009 + }, + { + "epoch": 1.1622781579863124, + "grad_norm": 14.213162878346074, + "learning_rate": 2e-06, + "loss": 0.1906, + "step": 5010 + }, + { + "epoch": 1.162510149634613, + "grad_norm": 18.527199878180998, + "learning_rate": 2e-06, + "loss": 0.2876, + "step": 5011 + }, + { + "epoch": 1.1627421412829138, + "grad_norm": 21.948041045089763, + "learning_rate": 2e-06, + "loss": 0.2023, + "step": 5012 + }, + { + "epoch": 1.1629741329312144, + "grad_norm": 18.20419217348716, + "learning_rate": 2e-06, + "loss": 0.2678, + "step": 5013 + }, + { + "epoch": 1.163206124579515, + "grad_norm": 7.939425692141144, + "learning_rate": 2e-06, + "loss": 0.2372, + "step": 5014 + }, + { + "epoch": 1.1634381162278158, + "grad_norm": 12.672603130303628, + "learning_rate": 2e-06, + "loss": 0.2832, + "step": 5015 + }, + { + "epoch": 1.1636701078761165, + "grad_norm": 10.981135126370553, + "learning_rate": 2e-06, + "loss": 0.3249, + "step": 5016 + }, + { + "epoch": 1.1639020995244171, + "grad_norm": 15.315053555166035, + "learning_rate": 2e-06, + "loss": 0.2706, + "step": 5017 + }, + { + "epoch": 1.1641340911727178, + "grad_norm": 7.395346913250289, + "learning_rate": 2e-06, + "loss": 0.1825, + "step": 5018 + }, + { + "epoch": 1.1643660828210185, + "grad_norm": 25.789706744104237, + "learning_rate": 2e-06, + "loss": 0.2779, + "step": 5019 + }, + { + "epoch": 1.1645980744693192, + "grad_norm": 13.929896223441505, + "learning_rate": 2e-06, + "loss": 0.3325, + "step": 5020 + }, + { + "epoch": 1.1648300661176199, + "grad_norm": 20.150725102614853, + "learning_rate": 2e-06, + "loss": 0.2426, + "step": 5021 + }, + { + "epoch": 1.1650620577659203, + "grad_norm": 16.45818500299967, + "learning_rate": 2e-06, + "loss": 0.1771, + "step": 5022 + }, + { + "epoch": 1.165294049414221, + "grad_norm": 15.739444570190075, + "learning_rate": 2e-06, + "loss": 0.2365, + "step": 5023 + }, + { + "epoch": 1.1655260410625217, + "grad_norm": 10.570517956343688, + "learning_rate": 2e-06, + "loss": 0.2052, + "step": 5024 + }, + { + "epoch": 1.1657580327108223, + "grad_norm": 13.3629168062596, + "learning_rate": 2e-06, + "loss": 0.2241, + "step": 5025 + }, + { + "epoch": 1.165990024359123, + "grad_norm": 18.754419599758915, + "learning_rate": 2e-06, + "loss": 0.2735, + "step": 5026 + }, + { + "epoch": 1.1662220160074237, + "grad_norm": 19.664815170672313, + "learning_rate": 2e-06, + "loss": 0.2899, + "step": 5027 + }, + { + "epoch": 1.1664540076557244, + "grad_norm": 9.366801142256714, + "learning_rate": 2e-06, + "loss": 0.23, + "step": 5028 + }, + { + "epoch": 1.166685999304025, + "grad_norm": 14.935637719397329, + "learning_rate": 2e-06, + "loss": 0.2281, + "step": 5029 + }, + { + "epoch": 1.1669179909523257, + "grad_norm": 24.75933529551093, + "learning_rate": 2e-06, + "loss": 0.2572, + "step": 5030 + }, + { + "epoch": 1.1671499826006264, + "grad_norm": 14.730901640082049, + "learning_rate": 2e-06, + "loss": 0.3131, + "step": 5031 + }, + { + "epoch": 1.167381974248927, + "grad_norm": 5.845621968031191, + "learning_rate": 2e-06, + "loss": 0.103, + "step": 5032 + }, + { + "epoch": 1.1676139658972278, + "grad_norm": 15.167261756957625, + "learning_rate": 2e-06, + "loss": 0.2014, + "step": 5033 + }, + { + "epoch": 1.1678459575455284, + "grad_norm": 16.816500981299157, + "learning_rate": 2e-06, + "loss": 0.3123, + "step": 5034 + }, + { + "epoch": 1.1680779491938291, + "grad_norm": 12.555567932873625, + "learning_rate": 2e-06, + "loss": 0.209, + "step": 5035 + }, + { + "epoch": 1.1683099408421298, + "grad_norm": 15.661823316978516, + "learning_rate": 2e-06, + "loss": 0.3003, + "step": 5036 + }, + { + "epoch": 1.1685419324904303, + "grad_norm": 7.488072913478131, + "learning_rate": 2e-06, + "loss": 0.1905, + "step": 5037 + }, + { + "epoch": 1.168773924138731, + "grad_norm": 15.02359869467387, + "learning_rate": 2e-06, + "loss": 0.2309, + "step": 5038 + }, + { + "epoch": 1.1690059157870316, + "grad_norm": 14.78971080478935, + "learning_rate": 2e-06, + "loss": 0.2297, + "step": 5039 + }, + { + "epoch": 1.1692379074353323, + "grad_norm": 20.143819338315392, + "learning_rate": 2e-06, + "loss": 0.3366, + "step": 5040 + }, + { + "epoch": 1.169469899083633, + "grad_norm": 20.31800677163891, + "learning_rate": 2e-06, + "loss": 0.3499, + "step": 5041 + }, + { + "epoch": 1.1697018907319336, + "grad_norm": 18.584130400204625, + "learning_rate": 2e-06, + "loss": 0.2986, + "step": 5042 + }, + { + "epoch": 1.1699338823802343, + "grad_norm": 9.014805971449672, + "learning_rate": 2e-06, + "loss": 0.2106, + "step": 5043 + }, + { + "epoch": 1.170165874028535, + "grad_norm": 15.855667851271633, + "learning_rate": 2e-06, + "loss": 0.3955, + "step": 5044 + }, + { + "epoch": 1.1703978656768357, + "grad_norm": 16.573583773977177, + "learning_rate": 2e-06, + "loss": 0.3397, + "step": 5045 + }, + { + "epoch": 1.1706298573251364, + "grad_norm": 38.35167617624872, + "learning_rate": 2e-06, + "loss": 0.2736, + "step": 5046 + }, + { + "epoch": 1.170861848973437, + "grad_norm": 22.72179482763607, + "learning_rate": 2e-06, + "loss": 0.3405, + "step": 5047 + }, + { + "epoch": 1.1710938406217377, + "grad_norm": 12.249486716090715, + "learning_rate": 2e-06, + "loss": 0.1978, + "step": 5048 + }, + { + "epoch": 1.1713258322700382, + "grad_norm": 9.266429827972644, + "learning_rate": 2e-06, + "loss": 0.2376, + "step": 5049 + }, + { + "epoch": 1.1715578239183388, + "grad_norm": 18.193981352814255, + "learning_rate": 2e-06, + "loss": 0.3892, + "step": 5050 + }, + { + "epoch": 1.1717898155666395, + "grad_norm": 15.126893755477953, + "learning_rate": 2e-06, + "loss": 0.2697, + "step": 5051 + }, + { + "epoch": 1.1720218072149402, + "grad_norm": 11.528059200038399, + "learning_rate": 2e-06, + "loss": 0.2112, + "step": 5052 + }, + { + "epoch": 1.1722537988632409, + "grad_norm": 5.696819551783051, + "learning_rate": 2e-06, + "loss": 0.1733, + "step": 5053 + }, + { + "epoch": 1.1724857905115416, + "grad_norm": 21.68939285127513, + "learning_rate": 2e-06, + "loss": 0.238, + "step": 5054 + }, + { + "epoch": 1.1727177821598422, + "grad_norm": 11.54426577532738, + "learning_rate": 2e-06, + "loss": 0.2835, + "step": 5055 + }, + { + "epoch": 1.172949773808143, + "grad_norm": 16.36253134447032, + "learning_rate": 2e-06, + "loss": 0.3236, + "step": 5056 + }, + { + "epoch": 1.1731817654564436, + "grad_norm": 8.142562005217773, + "learning_rate": 2e-06, + "loss": 0.212, + "step": 5057 + }, + { + "epoch": 1.1734137571047443, + "grad_norm": 12.615320983816575, + "learning_rate": 2e-06, + "loss": 0.2282, + "step": 5058 + }, + { + "epoch": 1.173645748753045, + "grad_norm": 12.514326523956044, + "learning_rate": 2e-06, + "loss": 0.1682, + "step": 5059 + }, + { + "epoch": 1.1738777404013456, + "grad_norm": 12.633760376909049, + "learning_rate": 2e-06, + "loss": 0.2375, + "step": 5060 + }, + { + "epoch": 1.1741097320496463, + "grad_norm": 18.721158363810936, + "learning_rate": 2e-06, + "loss": 0.3598, + "step": 5061 + }, + { + "epoch": 1.174341723697947, + "grad_norm": 19.54568283304831, + "learning_rate": 2e-06, + "loss": 0.3662, + "step": 5062 + }, + { + "epoch": 1.1745737153462477, + "grad_norm": 56.5685419514945, + "learning_rate": 2e-06, + "loss": 0.1762, + "step": 5063 + }, + { + "epoch": 1.1748057069945481, + "grad_norm": 20.023831440292504, + "learning_rate": 2e-06, + "loss": 0.2206, + "step": 5064 + }, + { + "epoch": 1.1750376986428488, + "grad_norm": 11.468361701197614, + "learning_rate": 2e-06, + "loss": 0.2195, + "step": 5065 + }, + { + "epoch": 1.1752696902911495, + "grad_norm": 12.883223148041303, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 5066 + }, + { + "epoch": 1.1755016819394501, + "grad_norm": 12.738172382017016, + "learning_rate": 2e-06, + "loss": 0.3331, + "step": 5067 + }, + { + "epoch": 1.1757336735877508, + "grad_norm": 15.362041684758266, + "learning_rate": 2e-06, + "loss": 0.3555, + "step": 5068 + }, + { + "epoch": 1.1759656652360515, + "grad_norm": 12.414739283145341, + "learning_rate": 2e-06, + "loss": 0.1942, + "step": 5069 + }, + { + "epoch": 1.1761976568843522, + "grad_norm": 17.904270630927922, + "learning_rate": 2e-06, + "loss": 0.2559, + "step": 5070 + }, + { + "epoch": 1.1764296485326529, + "grad_norm": 13.538195283396922, + "learning_rate": 2e-06, + "loss": 0.2464, + "step": 5071 + }, + { + "epoch": 1.1766616401809535, + "grad_norm": 13.422021738421487, + "learning_rate": 2e-06, + "loss": 0.2749, + "step": 5072 + }, + { + "epoch": 1.1768936318292542, + "grad_norm": 22.53580094618496, + "learning_rate": 2e-06, + "loss": 0.243, + "step": 5073 + }, + { + "epoch": 1.177125623477555, + "grad_norm": 11.55881963636086, + "learning_rate": 2e-06, + "loss": 0.2796, + "step": 5074 + }, + { + "epoch": 1.1773576151258554, + "grad_norm": 10.049045185397313, + "learning_rate": 2e-06, + "loss": 0.1981, + "step": 5075 + }, + { + "epoch": 1.177589606774156, + "grad_norm": 22.65546336238449, + "learning_rate": 2e-06, + "loss": 0.2988, + "step": 5076 + }, + { + "epoch": 1.1778215984224567, + "grad_norm": 13.048630873201166, + "learning_rate": 2e-06, + "loss": 0.2681, + "step": 5077 + }, + { + "epoch": 1.1780535900707574, + "grad_norm": 13.900332711139294, + "learning_rate": 2e-06, + "loss": 0.2521, + "step": 5078 + }, + { + "epoch": 1.178285581719058, + "grad_norm": 11.32114169357351, + "learning_rate": 2e-06, + "loss": 0.2488, + "step": 5079 + }, + { + "epoch": 1.1785175733673587, + "grad_norm": 24.06534770278926, + "learning_rate": 2e-06, + "loss": 0.4219, + "step": 5080 + }, + { + "epoch": 1.1787495650156594, + "grad_norm": 20.324166681833166, + "learning_rate": 2e-06, + "loss": 0.249, + "step": 5081 + }, + { + "epoch": 1.17898155666396, + "grad_norm": 10.903753798413435, + "learning_rate": 2e-06, + "loss": 0.2103, + "step": 5082 + }, + { + "epoch": 1.1792135483122608, + "grad_norm": 29.388568864614136, + "learning_rate": 2e-06, + "loss": 0.4449, + "step": 5083 + }, + { + "epoch": 1.1794455399605615, + "grad_norm": 12.80463042132413, + "learning_rate": 2e-06, + "loss": 0.2395, + "step": 5084 + }, + { + "epoch": 1.1796775316088621, + "grad_norm": 21.144388392722167, + "learning_rate": 2e-06, + "loss": 0.3401, + "step": 5085 + }, + { + "epoch": 1.1799095232571628, + "grad_norm": 12.57877912456603, + "learning_rate": 2e-06, + "loss": 0.2803, + "step": 5086 + }, + { + "epoch": 1.1801415149054635, + "grad_norm": 12.110198711063557, + "learning_rate": 2e-06, + "loss": 0.192, + "step": 5087 + }, + { + "epoch": 1.1803735065537642, + "grad_norm": 9.479556009422616, + "learning_rate": 2e-06, + "loss": 0.2175, + "step": 5088 + }, + { + "epoch": 1.1806054982020648, + "grad_norm": 9.087753336527165, + "learning_rate": 2e-06, + "loss": 0.1777, + "step": 5089 + }, + { + "epoch": 1.1808374898503653, + "grad_norm": 16.533214839801566, + "learning_rate": 2e-06, + "loss": 0.2729, + "step": 5090 + }, + { + "epoch": 1.181069481498666, + "grad_norm": 20.021190380675282, + "learning_rate": 2e-06, + "loss": 0.3022, + "step": 5091 + }, + { + "epoch": 1.1813014731469667, + "grad_norm": 19.108612020028357, + "learning_rate": 2e-06, + "loss": 0.3042, + "step": 5092 + }, + { + "epoch": 1.1815334647952673, + "grad_norm": 13.03954712661226, + "learning_rate": 2e-06, + "loss": 0.1961, + "step": 5093 + }, + { + "epoch": 1.181765456443568, + "grad_norm": 13.956540532981451, + "learning_rate": 2e-06, + "loss": 0.277, + "step": 5094 + }, + { + "epoch": 1.1819974480918687, + "grad_norm": 16.498525718064315, + "learning_rate": 2e-06, + "loss": 0.1667, + "step": 5095 + }, + { + "epoch": 1.1822294397401694, + "grad_norm": 24.182252375905538, + "learning_rate": 2e-06, + "loss": 0.3496, + "step": 5096 + }, + { + "epoch": 1.18246143138847, + "grad_norm": 15.170383802626947, + "learning_rate": 2e-06, + "loss": 0.308, + "step": 5097 + }, + { + "epoch": 1.1826934230367707, + "grad_norm": 12.57381961517264, + "learning_rate": 2e-06, + "loss": 0.2234, + "step": 5098 + }, + { + "epoch": 1.1829254146850714, + "grad_norm": 17.774575808972216, + "learning_rate": 2e-06, + "loss": 0.3411, + "step": 5099 + }, + { + "epoch": 1.183157406333372, + "grad_norm": 12.241022832388117, + "learning_rate": 2e-06, + "loss": 0.3423, + "step": 5100 + }, + { + "epoch": 1.1833893979816728, + "grad_norm": 13.921320685454633, + "learning_rate": 2e-06, + "loss": 0.1748, + "step": 5101 + }, + { + "epoch": 1.1836213896299732, + "grad_norm": 11.853467367859825, + "learning_rate": 2e-06, + "loss": 0.17, + "step": 5102 + }, + { + "epoch": 1.1838533812782739, + "grad_norm": 14.718381089627021, + "learning_rate": 2e-06, + "loss": 0.2199, + "step": 5103 + }, + { + "epoch": 1.1840853729265746, + "grad_norm": 14.749166426120494, + "learning_rate": 2e-06, + "loss": 0.277, + "step": 5104 + }, + { + "epoch": 1.1843173645748752, + "grad_norm": 11.982207117697797, + "learning_rate": 2e-06, + "loss": 0.2446, + "step": 5105 + }, + { + "epoch": 1.184549356223176, + "grad_norm": 10.943136557718498, + "learning_rate": 2e-06, + "loss": 0.284, + "step": 5106 + }, + { + "epoch": 1.1847813478714766, + "grad_norm": 12.564714406437187, + "learning_rate": 2e-06, + "loss": 0.1975, + "step": 5107 + }, + { + "epoch": 1.1850133395197773, + "grad_norm": 16.674037184874972, + "learning_rate": 2e-06, + "loss": 0.2302, + "step": 5108 + }, + { + "epoch": 1.185245331168078, + "grad_norm": 21.112893619492684, + "learning_rate": 2e-06, + "loss": 0.4166, + "step": 5109 + }, + { + "epoch": 1.1854773228163786, + "grad_norm": 16.847327105118747, + "learning_rate": 2e-06, + "loss": 0.259, + "step": 5110 + }, + { + "epoch": 1.1857093144646793, + "grad_norm": 15.396429450961502, + "learning_rate": 2e-06, + "loss": 0.2732, + "step": 5111 + }, + { + "epoch": 1.18594130611298, + "grad_norm": 15.041230182302556, + "learning_rate": 2e-06, + "loss": 0.2603, + "step": 5112 + }, + { + "epoch": 1.1861732977612807, + "grad_norm": 11.679276575944929, + "learning_rate": 2e-06, + "loss": 0.2414, + "step": 5113 + }, + { + "epoch": 1.1864052894095813, + "grad_norm": 17.23626301583967, + "learning_rate": 2e-06, + "loss": 0.2465, + "step": 5114 + }, + { + "epoch": 1.186637281057882, + "grad_norm": 10.387629871060861, + "learning_rate": 2e-06, + "loss": 0.2685, + "step": 5115 + }, + { + "epoch": 1.1868692727061827, + "grad_norm": 13.583743595609374, + "learning_rate": 2e-06, + "loss": 0.2684, + "step": 5116 + }, + { + "epoch": 1.1871012643544832, + "grad_norm": 18.0518869085287, + "learning_rate": 2e-06, + "loss": 0.1706, + "step": 5117 + }, + { + "epoch": 1.1873332560027838, + "grad_norm": 8.292611403006696, + "learning_rate": 2e-06, + "loss": 0.1672, + "step": 5118 + }, + { + "epoch": 1.1875652476510845, + "grad_norm": 9.030629924098347, + "learning_rate": 2e-06, + "loss": 0.2098, + "step": 5119 + }, + { + "epoch": 1.1877972392993852, + "grad_norm": 22.190945386452597, + "learning_rate": 2e-06, + "loss": 0.4378, + "step": 5120 + }, + { + "epoch": 1.1880292309476859, + "grad_norm": 14.062678153543576, + "learning_rate": 2e-06, + "loss": 0.2778, + "step": 5121 + }, + { + "epoch": 1.1882612225959865, + "grad_norm": 9.700629503739886, + "learning_rate": 2e-06, + "loss": 0.3183, + "step": 5122 + }, + { + "epoch": 1.1884932142442872, + "grad_norm": 8.273146258477006, + "learning_rate": 2e-06, + "loss": 0.2306, + "step": 5123 + }, + { + "epoch": 1.188725205892588, + "grad_norm": 11.615305784614497, + "learning_rate": 2e-06, + "loss": 0.2241, + "step": 5124 + }, + { + "epoch": 1.1889571975408886, + "grad_norm": 9.89918448079035, + "learning_rate": 2e-06, + "loss": 0.1862, + "step": 5125 + }, + { + "epoch": 1.1891891891891893, + "grad_norm": 16.405532140936124, + "learning_rate": 2e-06, + "loss": 0.2831, + "step": 5126 + }, + { + "epoch": 1.18942118083749, + "grad_norm": 25.228975723271233, + "learning_rate": 2e-06, + "loss": 0.3914, + "step": 5127 + }, + { + "epoch": 1.1896531724857904, + "grad_norm": 18.17180925808904, + "learning_rate": 2e-06, + "loss": 0.2368, + "step": 5128 + }, + { + "epoch": 1.189885164134091, + "grad_norm": 8.534850939568784, + "learning_rate": 2e-06, + "loss": 0.2791, + "step": 5129 + }, + { + "epoch": 1.1901171557823917, + "grad_norm": 20.27568382310745, + "learning_rate": 2e-06, + "loss": 0.3322, + "step": 5130 + }, + { + "epoch": 1.1903491474306924, + "grad_norm": 14.323277855853348, + "learning_rate": 2e-06, + "loss": 0.3306, + "step": 5131 + }, + { + "epoch": 1.190581139078993, + "grad_norm": 7.469000808934431, + "learning_rate": 2e-06, + "loss": 0.1893, + "step": 5132 + }, + { + "epoch": 1.1908131307272938, + "grad_norm": 12.340451919803156, + "learning_rate": 2e-06, + "loss": 0.2664, + "step": 5133 + }, + { + "epoch": 1.1910451223755945, + "grad_norm": 15.508648743070854, + "learning_rate": 2e-06, + "loss": 0.2753, + "step": 5134 + }, + { + "epoch": 1.1912771140238951, + "grad_norm": 23.80622709464657, + "learning_rate": 2e-06, + "loss": 0.3063, + "step": 5135 + }, + { + "epoch": 1.1915091056721958, + "grad_norm": 16.937254153485966, + "learning_rate": 2e-06, + "loss": 0.2666, + "step": 5136 + }, + { + "epoch": 1.1917410973204965, + "grad_norm": 9.110017246897536, + "learning_rate": 2e-06, + "loss": 0.17, + "step": 5137 + }, + { + "epoch": 1.1919730889687972, + "grad_norm": 13.043804133984896, + "learning_rate": 2e-06, + "loss": 0.3281, + "step": 5138 + }, + { + "epoch": 1.1922050806170978, + "grad_norm": 8.90342892280623, + "learning_rate": 2e-06, + "loss": 0.1846, + "step": 5139 + }, + { + "epoch": 1.1924370722653985, + "grad_norm": 7.969223964850742, + "learning_rate": 2e-06, + "loss": 0.1504, + "step": 5140 + }, + { + "epoch": 1.1926690639136992, + "grad_norm": 13.442988145448872, + "learning_rate": 2e-06, + "loss": 0.2254, + "step": 5141 + }, + { + "epoch": 1.1929010555619999, + "grad_norm": 18.190046733000564, + "learning_rate": 2e-06, + "loss": 0.148, + "step": 5142 + }, + { + "epoch": 1.1931330472103003, + "grad_norm": 11.28922782319458, + "learning_rate": 2e-06, + "loss": 0.2086, + "step": 5143 + }, + { + "epoch": 1.193365038858601, + "grad_norm": 22.17112011918985, + "learning_rate": 2e-06, + "loss": 0.3036, + "step": 5144 + }, + { + "epoch": 1.1935970305069017, + "grad_norm": 14.839295424960572, + "learning_rate": 2e-06, + "loss": 0.2228, + "step": 5145 + }, + { + "epoch": 1.1938290221552024, + "grad_norm": 12.869616829651251, + "learning_rate": 2e-06, + "loss": 0.189, + "step": 5146 + }, + { + "epoch": 1.194061013803503, + "grad_norm": 15.443605579905961, + "learning_rate": 2e-06, + "loss": 0.2418, + "step": 5147 + }, + { + "epoch": 1.1942930054518037, + "grad_norm": 21.87299521722901, + "learning_rate": 2e-06, + "loss": 0.2444, + "step": 5148 + }, + { + "epoch": 1.1945249971001044, + "grad_norm": 9.004184331024451, + "learning_rate": 2e-06, + "loss": 0.1728, + "step": 5149 + }, + { + "epoch": 1.194756988748405, + "grad_norm": 12.551599524000297, + "learning_rate": 2e-06, + "loss": 0.2258, + "step": 5150 + }, + { + "epoch": 1.1949889803967058, + "grad_norm": 29.857523497550893, + "learning_rate": 2e-06, + "loss": 0.3028, + "step": 5151 + }, + { + "epoch": 1.1952209720450064, + "grad_norm": 12.085422933856577, + "learning_rate": 2e-06, + "loss": 0.2088, + "step": 5152 + }, + { + "epoch": 1.1954529636933071, + "grad_norm": 17.68757686122845, + "learning_rate": 2e-06, + "loss": 0.3159, + "step": 5153 + }, + { + "epoch": 1.1956849553416078, + "grad_norm": 18.730269953915307, + "learning_rate": 2e-06, + "loss": 0.2206, + "step": 5154 + }, + { + "epoch": 1.1959169469899082, + "grad_norm": 21.30782403888487, + "learning_rate": 2e-06, + "loss": 0.2301, + "step": 5155 + }, + { + "epoch": 1.196148938638209, + "grad_norm": 22.47097003616981, + "learning_rate": 2e-06, + "loss": 0.3451, + "step": 5156 + }, + { + "epoch": 1.1963809302865096, + "grad_norm": 15.510565862769823, + "learning_rate": 2e-06, + "loss": 0.3132, + "step": 5157 + }, + { + "epoch": 1.1966129219348103, + "grad_norm": 15.18217102573386, + "learning_rate": 2e-06, + "loss": 0.2763, + "step": 5158 + }, + { + "epoch": 1.196844913583111, + "grad_norm": 8.803567230647891, + "learning_rate": 2e-06, + "loss": 0.1541, + "step": 5159 + }, + { + "epoch": 1.1970769052314116, + "grad_norm": 13.327744557640425, + "learning_rate": 2e-06, + "loss": 0.1522, + "step": 5160 + }, + { + "epoch": 1.1973088968797123, + "grad_norm": 18.90690605913973, + "learning_rate": 2e-06, + "loss": 0.2968, + "step": 5161 + }, + { + "epoch": 1.197540888528013, + "grad_norm": 12.32255655411487, + "learning_rate": 2e-06, + "loss": 0.2587, + "step": 5162 + }, + { + "epoch": 1.1977728801763137, + "grad_norm": 11.8652432225982, + "learning_rate": 2e-06, + "loss": 0.1902, + "step": 5163 + }, + { + "epoch": 1.1980048718246143, + "grad_norm": 16.73378327149659, + "learning_rate": 2e-06, + "loss": 0.2478, + "step": 5164 + }, + { + "epoch": 1.198236863472915, + "grad_norm": 11.562678414791206, + "learning_rate": 2e-06, + "loss": 0.1989, + "step": 5165 + }, + { + "epoch": 1.1984688551212157, + "grad_norm": 10.397222114298854, + "learning_rate": 2e-06, + "loss": 0.2023, + "step": 5166 + }, + { + "epoch": 1.1987008467695164, + "grad_norm": 42.61136099362814, + "learning_rate": 2e-06, + "loss": 0.3812, + "step": 5167 + }, + { + "epoch": 1.198932838417817, + "grad_norm": 7.762564277411292, + "learning_rate": 2e-06, + "loss": 0.1743, + "step": 5168 + }, + { + "epoch": 1.1991648300661177, + "grad_norm": 19.320174208023595, + "learning_rate": 2e-06, + "loss": 0.2735, + "step": 5169 + }, + { + "epoch": 1.1993968217144182, + "grad_norm": 9.33982545876157, + "learning_rate": 2e-06, + "loss": 0.1582, + "step": 5170 + }, + { + "epoch": 1.1996288133627189, + "grad_norm": 24.12096917597885, + "learning_rate": 2e-06, + "loss": 0.269, + "step": 5171 + }, + { + "epoch": 1.1998608050110195, + "grad_norm": 14.189468767579216, + "learning_rate": 2e-06, + "loss": 0.2206, + "step": 5172 + }, + { + "epoch": 1.2000927966593202, + "grad_norm": 15.64630259236317, + "learning_rate": 2e-06, + "loss": 0.2856, + "step": 5173 + }, + { + "epoch": 1.200324788307621, + "grad_norm": 11.43412902266011, + "learning_rate": 2e-06, + "loss": 0.1606, + "step": 5174 + }, + { + "epoch": 1.2005567799559216, + "grad_norm": 17.885516797052404, + "learning_rate": 2e-06, + "loss": 0.2469, + "step": 5175 + }, + { + "epoch": 1.2007887716042223, + "grad_norm": 13.97343455823514, + "learning_rate": 2e-06, + "loss": 0.2441, + "step": 5176 + }, + { + "epoch": 1.201020763252523, + "grad_norm": 21.132176964097027, + "learning_rate": 2e-06, + "loss": 0.1858, + "step": 5177 + }, + { + "epoch": 1.2012527549008236, + "grad_norm": 16.177779257561756, + "learning_rate": 2e-06, + "loss": 0.2804, + "step": 5178 + }, + { + "epoch": 1.2014847465491243, + "grad_norm": 15.237446241231124, + "learning_rate": 2e-06, + "loss": 0.3138, + "step": 5179 + }, + { + "epoch": 1.201716738197425, + "grad_norm": 15.410025662753815, + "learning_rate": 2e-06, + "loss": 0.272, + "step": 5180 + }, + { + "epoch": 1.2019487298457256, + "grad_norm": 24.874371348199457, + "learning_rate": 2e-06, + "loss": 0.2788, + "step": 5181 + }, + { + "epoch": 1.202180721494026, + "grad_norm": 18.610897292084417, + "learning_rate": 2e-06, + "loss": 0.2966, + "step": 5182 + }, + { + "epoch": 1.2024127131423268, + "grad_norm": 18.36018014980279, + "learning_rate": 2e-06, + "loss": 0.2466, + "step": 5183 + }, + { + "epoch": 1.2026447047906275, + "grad_norm": 20.29533914354666, + "learning_rate": 2e-06, + "loss": 0.279, + "step": 5184 + }, + { + "epoch": 1.2028766964389281, + "grad_norm": 15.975613795013066, + "learning_rate": 2e-06, + "loss": 0.2532, + "step": 5185 + }, + { + "epoch": 1.2031086880872288, + "grad_norm": 10.660371213800708, + "learning_rate": 2e-06, + "loss": 0.2077, + "step": 5186 + }, + { + "epoch": 1.2033406797355295, + "grad_norm": 12.96893264451673, + "learning_rate": 2e-06, + "loss": 0.2392, + "step": 5187 + }, + { + "epoch": 1.2035726713838302, + "grad_norm": 15.938185909799879, + "learning_rate": 2e-06, + "loss": 0.2569, + "step": 5188 + }, + { + "epoch": 1.2038046630321309, + "grad_norm": 15.88553069087415, + "learning_rate": 2e-06, + "loss": 0.2742, + "step": 5189 + }, + { + "epoch": 1.2040366546804315, + "grad_norm": 17.130342950677043, + "learning_rate": 2e-06, + "loss": 0.2267, + "step": 5190 + }, + { + "epoch": 1.2042686463287322, + "grad_norm": 14.692193200428905, + "learning_rate": 2e-06, + "loss": 0.2488, + "step": 5191 + }, + { + "epoch": 1.2045006379770329, + "grad_norm": 20.5376558559371, + "learning_rate": 2e-06, + "loss": 0.2569, + "step": 5192 + }, + { + "epoch": 1.2047326296253336, + "grad_norm": 12.547370979534392, + "learning_rate": 2e-06, + "loss": 0.1669, + "step": 5193 + }, + { + "epoch": 1.2049646212736342, + "grad_norm": 20.732742991471042, + "learning_rate": 2e-06, + "loss": 0.4357, + "step": 5194 + }, + { + "epoch": 1.205196612921935, + "grad_norm": 16.429927193275017, + "learning_rate": 2e-06, + "loss": 0.2503, + "step": 5195 + }, + { + "epoch": 1.2054286045702356, + "grad_norm": 11.049573146865242, + "learning_rate": 2e-06, + "loss": 0.1848, + "step": 5196 + }, + { + "epoch": 1.205660596218536, + "grad_norm": 14.675234242172602, + "learning_rate": 2e-06, + "loss": 0.2755, + "step": 5197 + }, + { + "epoch": 1.2058925878668367, + "grad_norm": 15.300604270336887, + "learning_rate": 2e-06, + "loss": 0.317, + "step": 5198 + }, + { + "epoch": 1.2061245795151374, + "grad_norm": 9.036597914764565, + "learning_rate": 2e-06, + "loss": 0.1548, + "step": 5199 + }, + { + "epoch": 1.206356571163438, + "grad_norm": 12.011136214657867, + "learning_rate": 2e-06, + "loss": 0.1626, + "step": 5200 + }, + { + "epoch": 1.2065885628117388, + "grad_norm": 16.154875873107986, + "learning_rate": 2e-06, + "loss": 0.2903, + "step": 5201 + }, + { + "epoch": 1.2068205544600394, + "grad_norm": 17.69607289312409, + "learning_rate": 2e-06, + "loss": 0.2915, + "step": 5202 + }, + { + "epoch": 1.2070525461083401, + "grad_norm": 17.132895546179085, + "learning_rate": 2e-06, + "loss": 0.2851, + "step": 5203 + }, + { + "epoch": 1.2072845377566408, + "grad_norm": 17.23651791207007, + "learning_rate": 2e-06, + "loss": 0.2503, + "step": 5204 + }, + { + "epoch": 1.2075165294049415, + "grad_norm": 13.061251148229335, + "learning_rate": 2e-06, + "loss": 0.2728, + "step": 5205 + }, + { + "epoch": 1.2077485210532422, + "grad_norm": 9.275794904296339, + "learning_rate": 2e-06, + "loss": 0.1567, + "step": 5206 + }, + { + "epoch": 1.2079805127015428, + "grad_norm": 10.008621925169576, + "learning_rate": 2e-06, + "loss": 0.2422, + "step": 5207 + }, + { + "epoch": 1.2082125043498433, + "grad_norm": 7.5606010156297, + "learning_rate": 2e-06, + "loss": 0.1861, + "step": 5208 + }, + { + "epoch": 1.208444495998144, + "grad_norm": 13.35580662177829, + "learning_rate": 2e-06, + "loss": 0.3143, + "step": 5209 + }, + { + "epoch": 1.2086764876464446, + "grad_norm": 13.10733565592071, + "learning_rate": 2e-06, + "loss": 0.1783, + "step": 5210 + }, + { + "epoch": 1.2089084792947453, + "grad_norm": 7.70315944281134, + "learning_rate": 2e-06, + "loss": 0.131, + "step": 5211 + }, + { + "epoch": 1.209140470943046, + "grad_norm": 11.866743411131484, + "learning_rate": 2e-06, + "loss": 0.2679, + "step": 5212 + }, + { + "epoch": 1.2093724625913467, + "grad_norm": 22.510614560943893, + "learning_rate": 2e-06, + "loss": 0.2429, + "step": 5213 + }, + { + "epoch": 1.2096044542396474, + "grad_norm": 10.310466738956148, + "learning_rate": 2e-06, + "loss": 0.2073, + "step": 5214 + }, + { + "epoch": 1.209836445887948, + "grad_norm": 20.320785351962254, + "learning_rate": 2e-06, + "loss": 0.3057, + "step": 5215 + }, + { + "epoch": 1.2100684375362487, + "grad_norm": 9.40601990023123, + "learning_rate": 2e-06, + "loss": 0.182, + "step": 5216 + }, + { + "epoch": 1.2103004291845494, + "grad_norm": 13.741165780691722, + "learning_rate": 2e-06, + "loss": 0.2182, + "step": 5217 + }, + { + "epoch": 1.21053242083285, + "grad_norm": 7.508991657284485, + "learning_rate": 2e-06, + "loss": 0.1946, + "step": 5218 + }, + { + "epoch": 1.2107644124811507, + "grad_norm": 15.514854424391363, + "learning_rate": 2e-06, + "loss": 0.2248, + "step": 5219 + }, + { + "epoch": 1.2109964041294514, + "grad_norm": 10.895204473041584, + "learning_rate": 2e-06, + "loss": 0.1865, + "step": 5220 + }, + { + "epoch": 1.211228395777752, + "grad_norm": 19.450039982901867, + "learning_rate": 2e-06, + "loss": 0.219, + "step": 5221 + }, + { + "epoch": 1.2114603874260528, + "grad_norm": 13.302386848635217, + "learning_rate": 2e-06, + "loss": 0.1976, + "step": 5222 + }, + { + "epoch": 1.2116923790743532, + "grad_norm": 22.35231992069119, + "learning_rate": 2e-06, + "loss": 0.4383, + "step": 5223 + }, + { + "epoch": 1.211924370722654, + "grad_norm": 16.850637258242692, + "learning_rate": 2e-06, + "loss": 0.1745, + "step": 5224 + }, + { + "epoch": 1.2121563623709546, + "grad_norm": 7.775618491069661, + "learning_rate": 2e-06, + "loss": 0.1796, + "step": 5225 + }, + { + "epoch": 1.2123883540192553, + "grad_norm": 26.089677514709958, + "learning_rate": 2e-06, + "loss": 0.3559, + "step": 5226 + }, + { + "epoch": 1.212620345667556, + "grad_norm": 21.937358674413783, + "learning_rate": 2e-06, + "loss": 0.2087, + "step": 5227 + }, + { + "epoch": 1.2128523373158566, + "grad_norm": 8.702241396578975, + "learning_rate": 2e-06, + "loss": 0.2015, + "step": 5228 + }, + { + "epoch": 1.2130843289641573, + "grad_norm": 7.409840180631241, + "learning_rate": 2e-06, + "loss": 0.1414, + "step": 5229 + }, + { + "epoch": 1.213316320612458, + "grad_norm": 19.95628813984114, + "learning_rate": 2e-06, + "loss": 0.3464, + "step": 5230 + }, + { + "epoch": 1.2135483122607587, + "grad_norm": 17.23062502380466, + "learning_rate": 2e-06, + "loss": 0.2711, + "step": 5231 + }, + { + "epoch": 1.2137803039090593, + "grad_norm": 20.833991635692122, + "learning_rate": 2e-06, + "loss": 0.4338, + "step": 5232 + }, + { + "epoch": 1.21401229555736, + "grad_norm": 15.283066556761923, + "learning_rate": 2e-06, + "loss": 0.3238, + "step": 5233 + }, + { + "epoch": 1.2142442872056607, + "grad_norm": 14.901162875963417, + "learning_rate": 2e-06, + "loss": 0.1931, + "step": 5234 + }, + { + "epoch": 1.2144762788539611, + "grad_norm": 21.469509336173594, + "learning_rate": 2e-06, + "loss": 0.2316, + "step": 5235 + }, + { + "epoch": 1.2147082705022618, + "grad_norm": 17.957005529775234, + "learning_rate": 2e-06, + "loss": 0.3315, + "step": 5236 + }, + { + "epoch": 1.2149402621505625, + "grad_norm": 18.140735392836362, + "learning_rate": 2e-06, + "loss": 0.2248, + "step": 5237 + }, + { + "epoch": 1.2151722537988632, + "grad_norm": 26.17420284310708, + "learning_rate": 2e-06, + "loss": 0.3931, + "step": 5238 + }, + { + "epoch": 1.2154042454471639, + "grad_norm": 7.8289304917804055, + "learning_rate": 2e-06, + "loss": 0.1446, + "step": 5239 + }, + { + "epoch": 1.2156362370954645, + "grad_norm": 18.23081021770153, + "learning_rate": 2e-06, + "loss": 0.1974, + "step": 5240 + }, + { + "epoch": 1.2158682287437652, + "grad_norm": 18.589950781673117, + "learning_rate": 2e-06, + "loss": 0.2152, + "step": 5241 + }, + { + "epoch": 1.216100220392066, + "grad_norm": 15.53438228175382, + "learning_rate": 2e-06, + "loss": 0.2284, + "step": 5242 + }, + { + "epoch": 1.2163322120403666, + "grad_norm": 17.21491317910429, + "learning_rate": 2e-06, + "loss": 0.2799, + "step": 5243 + }, + { + "epoch": 1.2165642036886672, + "grad_norm": 17.67007580606666, + "learning_rate": 2e-06, + "loss": 0.4263, + "step": 5244 + }, + { + "epoch": 1.216796195336968, + "grad_norm": 16.006138323506924, + "learning_rate": 2e-06, + "loss": 0.2022, + "step": 5245 + }, + { + "epoch": 1.2170281869852686, + "grad_norm": 12.50912584709491, + "learning_rate": 2e-06, + "loss": 0.2468, + "step": 5246 + }, + { + "epoch": 1.2172601786335693, + "grad_norm": 19.46136469945236, + "learning_rate": 2e-06, + "loss": 0.3507, + "step": 5247 + }, + { + "epoch": 1.21749217028187, + "grad_norm": 14.635269609314967, + "learning_rate": 2e-06, + "loss": 0.2371, + "step": 5248 + }, + { + "epoch": 1.2177241619301706, + "grad_norm": 13.798822892634078, + "learning_rate": 2e-06, + "loss": 0.218, + "step": 5249 + }, + { + "epoch": 1.217956153578471, + "grad_norm": 27.779776858612273, + "learning_rate": 2e-06, + "loss": 0.5181, + "step": 5250 + }, + { + "epoch": 1.2181881452267718, + "grad_norm": 15.34299875845018, + "learning_rate": 2e-06, + "loss": 0.3343, + "step": 5251 + }, + { + "epoch": 1.2184201368750724, + "grad_norm": 18.417075663930053, + "learning_rate": 2e-06, + "loss": 0.2751, + "step": 5252 + }, + { + "epoch": 1.2186521285233731, + "grad_norm": 18.78961744953273, + "learning_rate": 2e-06, + "loss": 0.3231, + "step": 5253 + }, + { + "epoch": 1.2188841201716738, + "grad_norm": 13.46130141911124, + "learning_rate": 2e-06, + "loss": 0.2129, + "step": 5254 + }, + { + "epoch": 1.2191161118199745, + "grad_norm": 11.833100567305175, + "learning_rate": 2e-06, + "loss": 0.3017, + "step": 5255 + }, + { + "epoch": 1.2193481034682752, + "grad_norm": 7.404813410538326, + "learning_rate": 2e-06, + "loss": 0.2085, + "step": 5256 + }, + { + "epoch": 1.2195800951165758, + "grad_norm": 27.538389014789562, + "learning_rate": 2e-06, + "loss": 0.3444, + "step": 5257 + }, + { + "epoch": 1.2198120867648765, + "grad_norm": 14.48501764302651, + "learning_rate": 2e-06, + "loss": 0.265, + "step": 5258 + }, + { + "epoch": 1.2200440784131772, + "grad_norm": 15.734888066600165, + "learning_rate": 2e-06, + "loss": 0.2839, + "step": 5259 + }, + { + "epoch": 1.2202760700614779, + "grad_norm": 14.022442780422809, + "learning_rate": 2e-06, + "loss": 0.3132, + "step": 5260 + }, + { + "epoch": 1.2205080617097783, + "grad_norm": 15.731696974707647, + "learning_rate": 2e-06, + "loss": 0.1934, + "step": 5261 + }, + { + "epoch": 1.220740053358079, + "grad_norm": 12.422595153554099, + "learning_rate": 2e-06, + "loss": 0.2521, + "step": 5262 + }, + { + "epoch": 1.2209720450063797, + "grad_norm": 15.45177514885588, + "learning_rate": 2e-06, + "loss": 0.2583, + "step": 5263 + }, + { + "epoch": 1.2212040366546804, + "grad_norm": 14.29565801586227, + "learning_rate": 2e-06, + "loss": 0.2793, + "step": 5264 + }, + { + "epoch": 1.221436028302981, + "grad_norm": 15.599363130704816, + "learning_rate": 2e-06, + "loss": 0.2823, + "step": 5265 + }, + { + "epoch": 1.2216680199512817, + "grad_norm": 15.440296781675615, + "learning_rate": 2e-06, + "loss": 0.3655, + "step": 5266 + }, + { + "epoch": 1.2219000115995824, + "grad_norm": 13.407628310415342, + "learning_rate": 2e-06, + "loss": 0.2767, + "step": 5267 + }, + { + "epoch": 1.222132003247883, + "grad_norm": 10.727707106075549, + "learning_rate": 2e-06, + "loss": 0.1716, + "step": 5268 + }, + { + "epoch": 1.2223639948961837, + "grad_norm": 10.130188590250924, + "learning_rate": 2e-06, + "loss": 0.2332, + "step": 5269 + }, + { + "epoch": 1.2225959865444844, + "grad_norm": 14.498875997948055, + "learning_rate": 2e-06, + "loss": 0.278, + "step": 5270 + }, + { + "epoch": 1.222827978192785, + "grad_norm": 8.002068878752675, + "learning_rate": 2e-06, + "loss": 0.2471, + "step": 5271 + }, + { + "epoch": 1.2230599698410858, + "grad_norm": 11.603943189818388, + "learning_rate": 2e-06, + "loss": 0.2822, + "step": 5272 + }, + { + "epoch": 1.2232919614893865, + "grad_norm": 8.812245906502385, + "learning_rate": 2e-06, + "loss": 0.173, + "step": 5273 + }, + { + "epoch": 1.2235239531376871, + "grad_norm": 21.6674746111459, + "learning_rate": 2e-06, + "loss": 0.2749, + "step": 5274 + }, + { + "epoch": 1.2237559447859878, + "grad_norm": 21.08493131129624, + "learning_rate": 2e-06, + "loss": 0.3471, + "step": 5275 + }, + { + "epoch": 1.2239879364342883, + "grad_norm": 16.69822828663652, + "learning_rate": 2e-06, + "loss": 0.2287, + "step": 5276 + }, + { + "epoch": 1.224219928082589, + "grad_norm": 10.561775512795705, + "learning_rate": 2e-06, + "loss": 0.1804, + "step": 5277 + }, + { + "epoch": 1.2244519197308896, + "grad_norm": 28.678018135887456, + "learning_rate": 2e-06, + "loss": 0.3214, + "step": 5278 + }, + { + "epoch": 1.2246839113791903, + "grad_norm": 28.824433534080182, + "learning_rate": 2e-06, + "loss": 0.2695, + "step": 5279 + }, + { + "epoch": 1.224915903027491, + "grad_norm": 14.031124142089942, + "learning_rate": 2e-06, + "loss": 0.2114, + "step": 5280 + }, + { + "epoch": 1.2251478946757917, + "grad_norm": 17.859482295756116, + "learning_rate": 2e-06, + "loss": 0.226, + "step": 5281 + }, + { + "epoch": 1.2253798863240923, + "grad_norm": 16.643741572523084, + "learning_rate": 2e-06, + "loss": 0.1829, + "step": 5282 + }, + { + "epoch": 1.225611877972393, + "grad_norm": 37.07114976670886, + "learning_rate": 2e-06, + "loss": 0.232, + "step": 5283 + }, + { + "epoch": 1.2258438696206937, + "grad_norm": 20.04040061838349, + "learning_rate": 2e-06, + "loss": 0.4911, + "step": 5284 + }, + { + "epoch": 1.2260758612689944, + "grad_norm": 11.54246641036162, + "learning_rate": 2e-06, + "loss": 0.2101, + "step": 5285 + }, + { + "epoch": 1.226307852917295, + "grad_norm": 16.36165634216338, + "learning_rate": 2e-06, + "loss": 0.2945, + "step": 5286 + }, + { + "epoch": 1.2265398445655957, + "grad_norm": 19.920970194021628, + "learning_rate": 2e-06, + "loss": 0.2737, + "step": 5287 + }, + { + "epoch": 1.2267718362138962, + "grad_norm": 17.9952410931923, + "learning_rate": 2e-06, + "loss": 0.3436, + "step": 5288 + }, + { + "epoch": 1.2270038278621969, + "grad_norm": 11.289988956344413, + "learning_rate": 2e-06, + "loss": 0.2206, + "step": 5289 + }, + { + "epoch": 1.2272358195104975, + "grad_norm": 13.383557975275709, + "learning_rate": 2e-06, + "loss": 0.2419, + "step": 5290 + }, + { + "epoch": 1.2274678111587982, + "grad_norm": 22.369959716146592, + "learning_rate": 2e-06, + "loss": 0.2851, + "step": 5291 + }, + { + "epoch": 1.227699802807099, + "grad_norm": 12.843101286087245, + "learning_rate": 2e-06, + "loss": 0.2112, + "step": 5292 + }, + { + "epoch": 1.2279317944553996, + "grad_norm": 9.344779922844397, + "learning_rate": 2e-06, + "loss": 0.267, + "step": 5293 + }, + { + "epoch": 1.2281637861037003, + "grad_norm": 12.59966525810847, + "learning_rate": 2e-06, + "loss": 0.295, + "step": 5294 + }, + { + "epoch": 1.228395777752001, + "grad_norm": 14.793820877515458, + "learning_rate": 2e-06, + "loss": 0.2337, + "step": 5295 + }, + { + "epoch": 1.2286277694003016, + "grad_norm": 9.697869595422336, + "learning_rate": 2e-06, + "loss": 0.1847, + "step": 5296 + }, + { + "epoch": 1.2288597610486023, + "grad_norm": 12.580748125443968, + "learning_rate": 2e-06, + "loss": 0.211, + "step": 5297 + }, + { + "epoch": 1.229091752696903, + "grad_norm": 8.40386513019791, + "learning_rate": 2e-06, + "loss": 0.1698, + "step": 5298 + }, + { + "epoch": 1.2293237443452036, + "grad_norm": 7.596042516068372, + "learning_rate": 2e-06, + "loss": 0.2155, + "step": 5299 + }, + { + "epoch": 1.2295557359935043, + "grad_norm": 17.352822181947214, + "learning_rate": 2e-06, + "loss": 0.3248, + "step": 5300 + }, + { + "epoch": 1.229787727641805, + "grad_norm": 14.722065240431393, + "learning_rate": 2e-06, + "loss": 0.2228, + "step": 5301 + }, + { + "epoch": 1.2300197192901057, + "grad_norm": 8.723387036606349, + "learning_rate": 2e-06, + "loss": 0.1781, + "step": 5302 + }, + { + "epoch": 1.2302517109384061, + "grad_norm": 15.040416722840469, + "learning_rate": 2e-06, + "loss": 0.2241, + "step": 5303 + }, + { + "epoch": 1.2304837025867068, + "grad_norm": 13.198481853039846, + "learning_rate": 2e-06, + "loss": 0.234, + "step": 5304 + }, + { + "epoch": 1.2307156942350075, + "grad_norm": 9.054704316573181, + "learning_rate": 2e-06, + "loss": 0.2446, + "step": 5305 + }, + { + "epoch": 1.2309476858833082, + "grad_norm": 16.080836154342443, + "learning_rate": 2e-06, + "loss": 0.2978, + "step": 5306 + }, + { + "epoch": 1.2311796775316088, + "grad_norm": 9.419831444878099, + "learning_rate": 2e-06, + "loss": 0.1617, + "step": 5307 + }, + { + "epoch": 1.2314116691799095, + "grad_norm": 13.487378036543648, + "learning_rate": 2e-06, + "loss": 0.1953, + "step": 5308 + }, + { + "epoch": 1.2316436608282102, + "grad_norm": 13.178197224965563, + "learning_rate": 2e-06, + "loss": 0.3204, + "step": 5309 + }, + { + "epoch": 1.2318756524765109, + "grad_norm": 15.681660473782781, + "learning_rate": 2e-06, + "loss": 0.3323, + "step": 5310 + }, + { + "epoch": 1.2321076441248116, + "grad_norm": 25.239516257597224, + "learning_rate": 2e-06, + "loss": 0.3799, + "step": 5311 + }, + { + "epoch": 1.2323396357731122, + "grad_norm": 8.244385037922948, + "learning_rate": 2e-06, + "loss": 0.1961, + "step": 5312 + }, + { + "epoch": 1.232571627421413, + "grad_norm": 12.190565719041365, + "learning_rate": 2e-06, + "loss": 0.311, + "step": 5313 + }, + { + "epoch": 1.2328036190697136, + "grad_norm": 9.850539308026018, + "learning_rate": 2e-06, + "loss": 0.2206, + "step": 5314 + }, + { + "epoch": 1.233035610718014, + "grad_norm": 14.153920435151814, + "learning_rate": 2e-06, + "loss": 0.3262, + "step": 5315 + }, + { + "epoch": 1.2332676023663147, + "grad_norm": 13.520522643805842, + "learning_rate": 2e-06, + "loss": 0.1809, + "step": 5316 + }, + { + "epoch": 1.2334995940146154, + "grad_norm": 19.47076432215645, + "learning_rate": 2e-06, + "loss": 0.2154, + "step": 5317 + }, + { + "epoch": 1.233731585662916, + "grad_norm": 11.485832864111378, + "learning_rate": 2e-06, + "loss": 0.2006, + "step": 5318 + }, + { + "epoch": 1.2339635773112168, + "grad_norm": 20.080611120412676, + "learning_rate": 2e-06, + "loss": 0.316, + "step": 5319 + }, + { + "epoch": 1.2341955689595174, + "grad_norm": 8.779411883202771, + "learning_rate": 2e-06, + "loss": 0.1902, + "step": 5320 + }, + { + "epoch": 1.234427560607818, + "grad_norm": 16.488788887028306, + "learning_rate": 2e-06, + "loss": 0.2165, + "step": 5321 + }, + { + "epoch": 1.2346595522561188, + "grad_norm": 11.51692899731653, + "learning_rate": 2e-06, + "loss": 0.258, + "step": 5322 + }, + { + "epoch": 1.2348915439044195, + "grad_norm": 17.678905519432654, + "learning_rate": 2e-06, + "loss": 0.3973, + "step": 5323 + }, + { + "epoch": 1.2351235355527201, + "grad_norm": 11.581841331604895, + "learning_rate": 2e-06, + "loss": 0.2456, + "step": 5324 + }, + { + "epoch": 1.2353555272010208, + "grad_norm": 18.37405604865033, + "learning_rate": 2e-06, + "loss": 0.3861, + "step": 5325 + }, + { + "epoch": 1.2355875188493215, + "grad_norm": 15.130367359648647, + "learning_rate": 2e-06, + "loss": 0.4564, + "step": 5326 + }, + { + "epoch": 1.2358195104976222, + "grad_norm": 14.957660846372246, + "learning_rate": 2e-06, + "loss": 0.2367, + "step": 5327 + }, + { + "epoch": 1.2360515021459229, + "grad_norm": 10.56472444321113, + "learning_rate": 2e-06, + "loss": 0.197, + "step": 5328 + }, + { + "epoch": 1.2362834937942235, + "grad_norm": 13.940692025178311, + "learning_rate": 2e-06, + "loss": 0.1999, + "step": 5329 + }, + { + "epoch": 1.236515485442524, + "grad_norm": 11.270627604067421, + "learning_rate": 2e-06, + "loss": 0.278, + "step": 5330 + }, + { + "epoch": 1.2367474770908247, + "grad_norm": 16.92028277449169, + "learning_rate": 2e-06, + "loss": 0.2766, + "step": 5331 + }, + { + "epoch": 1.2369794687391253, + "grad_norm": 26.307067518093277, + "learning_rate": 2e-06, + "loss": 0.3855, + "step": 5332 + }, + { + "epoch": 1.237211460387426, + "grad_norm": 21.92979348271037, + "learning_rate": 2e-06, + "loss": 0.3515, + "step": 5333 + }, + { + "epoch": 1.2374434520357267, + "grad_norm": 11.581362836366731, + "learning_rate": 2e-06, + "loss": 0.2065, + "step": 5334 + }, + { + "epoch": 1.2376754436840274, + "grad_norm": 11.930440029646041, + "learning_rate": 2e-06, + "loss": 0.1782, + "step": 5335 + }, + { + "epoch": 1.237907435332328, + "grad_norm": 17.013879424895844, + "learning_rate": 2e-06, + "loss": 0.2203, + "step": 5336 + }, + { + "epoch": 1.2381394269806287, + "grad_norm": 7.1496544046508195, + "learning_rate": 2e-06, + "loss": 0.1477, + "step": 5337 + }, + { + "epoch": 1.2383714186289294, + "grad_norm": 15.173156812522874, + "learning_rate": 2e-06, + "loss": 0.253, + "step": 5338 + }, + { + "epoch": 1.23860341027723, + "grad_norm": 13.382184341323773, + "learning_rate": 2e-06, + "loss": 0.2531, + "step": 5339 + }, + { + "epoch": 1.2388354019255308, + "grad_norm": 10.946003794488881, + "learning_rate": 2e-06, + "loss": 0.2282, + "step": 5340 + }, + { + "epoch": 1.2390673935738312, + "grad_norm": 10.11429496594036, + "learning_rate": 2e-06, + "loss": 0.2194, + "step": 5341 + }, + { + "epoch": 1.239299385222132, + "grad_norm": 12.266409181966331, + "learning_rate": 2e-06, + "loss": 0.1854, + "step": 5342 + }, + { + "epoch": 1.2395313768704326, + "grad_norm": 14.34239606199122, + "learning_rate": 2e-06, + "loss": 0.2947, + "step": 5343 + }, + { + "epoch": 1.2397633685187333, + "grad_norm": 17.562564624110667, + "learning_rate": 2e-06, + "loss": 0.1894, + "step": 5344 + }, + { + "epoch": 1.239995360167034, + "grad_norm": 13.432435067887818, + "learning_rate": 2e-06, + "loss": 0.2164, + "step": 5345 + }, + { + "epoch": 1.2402273518153346, + "grad_norm": 22.792988260322883, + "learning_rate": 2e-06, + "loss": 0.3237, + "step": 5346 + }, + { + "epoch": 1.2404593434636353, + "grad_norm": 11.917086324715964, + "learning_rate": 2e-06, + "loss": 0.2028, + "step": 5347 + }, + { + "epoch": 1.240691335111936, + "grad_norm": 18.645434561808916, + "learning_rate": 2e-06, + "loss": 0.3638, + "step": 5348 + }, + { + "epoch": 1.2409233267602366, + "grad_norm": 14.115886033627724, + "learning_rate": 2e-06, + "loss": 0.1957, + "step": 5349 + }, + { + "epoch": 1.2411553184085373, + "grad_norm": 11.39626873072166, + "learning_rate": 2e-06, + "loss": 0.2367, + "step": 5350 + }, + { + "epoch": 1.241387310056838, + "grad_norm": 18.343971576177992, + "learning_rate": 2e-06, + "loss": 0.301, + "step": 5351 + }, + { + "epoch": 1.2416193017051387, + "grad_norm": 14.328369898087471, + "learning_rate": 2e-06, + "loss": 0.196, + "step": 5352 + }, + { + "epoch": 1.2418512933534394, + "grad_norm": 23.31079139215547, + "learning_rate": 2e-06, + "loss": 0.3009, + "step": 5353 + }, + { + "epoch": 1.24208328500174, + "grad_norm": 15.595543149206021, + "learning_rate": 2e-06, + "loss": 0.2458, + "step": 5354 + }, + { + "epoch": 1.2423152766500407, + "grad_norm": 12.699706007266634, + "learning_rate": 2e-06, + "loss": 0.2639, + "step": 5355 + }, + { + "epoch": 1.2425472682983412, + "grad_norm": 17.694766975284093, + "learning_rate": 2e-06, + "loss": 0.3025, + "step": 5356 + }, + { + "epoch": 1.2427792599466418, + "grad_norm": 19.481656997033788, + "learning_rate": 2e-06, + "loss": 0.2393, + "step": 5357 + }, + { + "epoch": 1.2430112515949425, + "grad_norm": 26.897648558409387, + "learning_rate": 2e-06, + "loss": 0.2656, + "step": 5358 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 17.579246825669056, + "learning_rate": 2e-06, + "loss": 0.2459, + "step": 5359 + }, + { + "epoch": 1.2434752348915439, + "grad_norm": 15.492054232320067, + "learning_rate": 2e-06, + "loss": 0.194, + "step": 5360 + }, + { + "epoch": 1.2437072265398446, + "grad_norm": 10.026590173467095, + "learning_rate": 2e-06, + "loss": 0.1695, + "step": 5361 + }, + { + "epoch": 1.2439392181881452, + "grad_norm": 16.692618658838594, + "learning_rate": 2e-06, + "loss": 0.2551, + "step": 5362 + }, + { + "epoch": 1.244171209836446, + "grad_norm": 15.049609606356615, + "learning_rate": 2e-06, + "loss": 0.2791, + "step": 5363 + }, + { + "epoch": 1.2444032014847466, + "grad_norm": 11.389033854712501, + "learning_rate": 2e-06, + "loss": 0.3525, + "step": 5364 + }, + { + "epoch": 1.2446351931330473, + "grad_norm": 6.8271310257402815, + "learning_rate": 2e-06, + "loss": 0.1401, + "step": 5365 + }, + { + "epoch": 1.244867184781348, + "grad_norm": 24.358447016584893, + "learning_rate": 2e-06, + "loss": 0.2946, + "step": 5366 + }, + { + "epoch": 1.2450991764296486, + "grad_norm": 15.490606812265609, + "learning_rate": 2e-06, + "loss": 0.3321, + "step": 5367 + }, + { + "epoch": 1.245331168077949, + "grad_norm": 17.53430950849937, + "learning_rate": 2e-06, + "loss": 0.2033, + "step": 5368 + }, + { + "epoch": 1.2455631597262498, + "grad_norm": 6.766701685141638, + "learning_rate": 2e-06, + "loss": 0.1528, + "step": 5369 + }, + { + "epoch": 1.2457951513745504, + "grad_norm": 21.917208890331153, + "learning_rate": 2e-06, + "loss": 0.3338, + "step": 5370 + }, + { + "epoch": 1.2460271430228511, + "grad_norm": 9.526048106428112, + "learning_rate": 2e-06, + "loss": 0.1639, + "step": 5371 + }, + { + "epoch": 1.2462591346711518, + "grad_norm": 11.603815223248493, + "learning_rate": 2e-06, + "loss": 0.2868, + "step": 5372 + }, + { + "epoch": 1.2464911263194525, + "grad_norm": 13.988597939735435, + "learning_rate": 2e-06, + "loss": 0.259, + "step": 5373 + }, + { + "epoch": 1.2467231179677531, + "grad_norm": 8.844100357062398, + "learning_rate": 2e-06, + "loss": 0.1971, + "step": 5374 + }, + { + "epoch": 1.2469551096160538, + "grad_norm": 15.231375274391446, + "learning_rate": 2e-06, + "loss": 0.2439, + "step": 5375 + }, + { + "epoch": 1.2471871012643545, + "grad_norm": 13.556775542243084, + "learning_rate": 2e-06, + "loss": 0.2513, + "step": 5376 + }, + { + "epoch": 1.2474190929126552, + "grad_norm": 11.837907914182448, + "learning_rate": 2e-06, + "loss": 0.1514, + "step": 5377 + }, + { + "epoch": 1.2476510845609559, + "grad_norm": 11.071337144240893, + "learning_rate": 2e-06, + "loss": 0.1871, + "step": 5378 + }, + { + "epoch": 1.2478830762092565, + "grad_norm": 9.181267980204264, + "learning_rate": 2e-06, + "loss": 0.1558, + "step": 5379 + }, + { + "epoch": 1.2481150678575572, + "grad_norm": 27.662846838543977, + "learning_rate": 2e-06, + "loss": 0.4109, + "step": 5380 + }, + { + "epoch": 1.248347059505858, + "grad_norm": 7.471747957628051, + "learning_rate": 2e-06, + "loss": 0.1225, + "step": 5381 + }, + { + "epoch": 1.2485790511541586, + "grad_norm": 11.818739316598782, + "learning_rate": 2e-06, + "loss": 0.2175, + "step": 5382 + }, + { + "epoch": 1.248811042802459, + "grad_norm": 13.116379005884413, + "learning_rate": 2e-06, + "loss": 0.2719, + "step": 5383 + }, + { + "epoch": 1.2490430344507597, + "grad_norm": 15.287798792169049, + "learning_rate": 2e-06, + "loss": 0.1912, + "step": 5384 + }, + { + "epoch": 1.2492750260990604, + "grad_norm": 21.297823361697418, + "learning_rate": 2e-06, + "loss": 0.3731, + "step": 5385 + }, + { + "epoch": 1.249507017747361, + "grad_norm": 14.772441145934714, + "learning_rate": 2e-06, + "loss": 0.2237, + "step": 5386 + }, + { + "epoch": 1.2497390093956617, + "grad_norm": 22.743960581731905, + "learning_rate": 2e-06, + "loss": 0.2809, + "step": 5387 + }, + { + "epoch": 1.2499710010439624, + "grad_norm": 8.798316896561857, + "learning_rate": 2e-06, + "loss": 0.1979, + "step": 5388 + }, + { + "epoch": 1.250202992692263, + "grad_norm": 17.322801821705692, + "learning_rate": 2e-06, + "loss": 0.2744, + "step": 5389 + }, + { + "epoch": 1.2504349843405638, + "grad_norm": 22.401139532469738, + "learning_rate": 2e-06, + "loss": 0.2902, + "step": 5390 + }, + { + "epoch": 1.2506669759888644, + "grad_norm": 12.33349075041886, + "learning_rate": 2e-06, + "loss": 0.2465, + "step": 5391 + }, + { + "epoch": 1.2508989676371651, + "grad_norm": 12.554184765560295, + "learning_rate": 2e-06, + "loss": 0.2667, + "step": 5392 + }, + { + "epoch": 1.2511309592854658, + "grad_norm": 19.246451335744148, + "learning_rate": 2e-06, + "loss": 0.268, + "step": 5393 + }, + { + "epoch": 1.2513629509337663, + "grad_norm": 15.757899753835336, + "learning_rate": 2e-06, + "loss": 0.3866, + "step": 5394 + }, + { + "epoch": 1.251594942582067, + "grad_norm": 11.682845962868761, + "learning_rate": 2e-06, + "loss": 0.1962, + "step": 5395 + }, + { + "epoch": 1.2518269342303676, + "grad_norm": 8.234530902056399, + "learning_rate": 2e-06, + "loss": 0.1765, + "step": 5396 + }, + { + "epoch": 1.2520589258786683, + "grad_norm": 9.16550352575417, + "learning_rate": 2e-06, + "loss": 0.2197, + "step": 5397 + }, + { + "epoch": 1.252290917526969, + "grad_norm": 13.833352896694109, + "learning_rate": 2e-06, + "loss": 0.2307, + "step": 5398 + }, + { + "epoch": 1.2525229091752696, + "grad_norm": 13.330435337641362, + "learning_rate": 2e-06, + "loss": 0.3172, + "step": 5399 + }, + { + "epoch": 1.2527549008235703, + "grad_norm": 13.575020941419025, + "learning_rate": 2e-06, + "loss": 0.3236, + "step": 5400 + }, + { + "epoch": 1.252986892471871, + "grad_norm": 15.24233503470825, + "learning_rate": 2e-06, + "loss": 0.1454, + "step": 5401 + }, + { + "epoch": 1.2532188841201717, + "grad_norm": 7.776532329048835, + "learning_rate": 2e-06, + "loss": 0.1922, + "step": 5402 + }, + { + "epoch": 1.2534508757684724, + "grad_norm": 10.196711226595042, + "learning_rate": 2e-06, + "loss": 0.2424, + "step": 5403 + }, + { + "epoch": 1.253682867416773, + "grad_norm": 14.61397139989457, + "learning_rate": 2e-06, + "loss": 0.205, + "step": 5404 + }, + { + "epoch": 1.2539148590650737, + "grad_norm": 10.510717889247282, + "learning_rate": 2e-06, + "loss": 0.2287, + "step": 5405 + }, + { + "epoch": 1.2541468507133744, + "grad_norm": 6.766161353966618, + "learning_rate": 2e-06, + "loss": 0.2261, + "step": 5406 + }, + { + "epoch": 1.254378842361675, + "grad_norm": 11.411526589754951, + "learning_rate": 2e-06, + "loss": 0.2431, + "step": 5407 + }, + { + "epoch": 1.2546108340099758, + "grad_norm": 12.28838509585074, + "learning_rate": 2e-06, + "loss": 0.1855, + "step": 5408 + }, + { + "epoch": 1.2548428256582764, + "grad_norm": 7.946790336662799, + "learning_rate": 2e-06, + "loss": 0.1554, + "step": 5409 + }, + { + "epoch": 1.2550748173065769, + "grad_norm": 12.619103129116512, + "learning_rate": 2e-06, + "loss": 0.1923, + "step": 5410 + }, + { + "epoch": 1.2553068089548776, + "grad_norm": 27.705684365921115, + "learning_rate": 2e-06, + "loss": 0.5062, + "step": 5411 + }, + { + "epoch": 1.2555388006031782, + "grad_norm": 4.933290693333517, + "learning_rate": 2e-06, + "loss": 0.1248, + "step": 5412 + }, + { + "epoch": 1.255770792251479, + "grad_norm": 24.902814613229886, + "learning_rate": 2e-06, + "loss": 0.337, + "step": 5413 + }, + { + "epoch": 1.2560027838997796, + "grad_norm": 10.21067892420391, + "learning_rate": 2e-06, + "loss": 0.2146, + "step": 5414 + }, + { + "epoch": 1.2562347755480803, + "grad_norm": 6.780520081375767, + "learning_rate": 2e-06, + "loss": 0.2351, + "step": 5415 + }, + { + "epoch": 1.256466767196381, + "grad_norm": 13.71098978847903, + "learning_rate": 2e-06, + "loss": 0.2876, + "step": 5416 + }, + { + "epoch": 1.2566987588446816, + "grad_norm": 10.855468980185774, + "learning_rate": 2e-06, + "loss": 0.1825, + "step": 5417 + }, + { + "epoch": 1.2569307504929823, + "grad_norm": 21.315008502857644, + "learning_rate": 2e-06, + "loss": 0.3071, + "step": 5418 + }, + { + "epoch": 1.257162742141283, + "grad_norm": 14.134423743002685, + "learning_rate": 2e-06, + "loss": 0.1695, + "step": 5419 + }, + { + "epoch": 1.2573947337895834, + "grad_norm": 15.135285920332297, + "learning_rate": 2e-06, + "loss": 0.1712, + "step": 5420 + }, + { + "epoch": 1.2576267254378841, + "grad_norm": 13.025679352989053, + "learning_rate": 2e-06, + "loss": 0.1629, + "step": 5421 + }, + { + "epoch": 1.2578587170861848, + "grad_norm": 15.418028878233873, + "learning_rate": 2e-06, + "loss": 0.1973, + "step": 5422 + }, + { + "epoch": 1.2580907087344855, + "grad_norm": 14.90544663133779, + "learning_rate": 2e-06, + "loss": 0.2366, + "step": 5423 + }, + { + "epoch": 1.2583227003827862, + "grad_norm": 17.13177952043397, + "learning_rate": 2e-06, + "loss": 0.2224, + "step": 5424 + }, + { + "epoch": 1.2585546920310868, + "grad_norm": 15.768336904910315, + "learning_rate": 2e-06, + "loss": 0.1723, + "step": 5425 + }, + { + "epoch": 1.2587866836793875, + "grad_norm": 16.784910745172407, + "learning_rate": 2e-06, + "loss": 0.2809, + "step": 5426 + }, + { + "epoch": 1.2590186753276882, + "grad_norm": 14.877686438432477, + "learning_rate": 2e-06, + "loss": 0.2759, + "step": 5427 + }, + { + "epoch": 1.2592506669759889, + "grad_norm": 9.201621848621404, + "learning_rate": 2e-06, + "loss": 0.1827, + "step": 5428 + }, + { + "epoch": 1.2594826586242895, + "grad_norm": 14.700888196853745, + "learning_rate": 2e-06, + "loss": 0.1968, + "step": 5429 + }, + { + "epoch": 1.2597146502725902, + "grad_norm": 18.81228873623378, + "learning_rate": 2e-06, + "loss": 0.3326, + "step": 5430 + }, + { + "epoch": 1.259946641920891, + "grad_norm": 12.9065843099499, + "learning_rate": 2e-06, + "loss": 0.2946, + "step": 5431 + }, + { + "epoch": 1.2601786335691916, + "grad_norm": 9.30669535994372, + "learning_rate": 2e-06, + "loss": 0.2333, + "step": 5432 + }, + { + "epoch": 1.2604106252174923, + "grad_norm": 9.267711029265648, + "learning_rate": 2e-06, + "loss": 0.2008, + "step": 5433 + }, + { + "epoch": 1.260642616865793, + "grad_norm": 26.074534924266477, + "learning_rate": 2e-06, + "loss": 0.3929, + "step": 5434 + }, + { + "epoch": 1.2608746085140936, + "grad_norm": 13.773153795926959, + "learning_rate": 2e-06, + "loss": 0.1561, + "step": 5435 + }, + { + "epoch": 1.2611066001623943, + "grad_norm": 20.51695528132908, + "learning_rate": 2e-06, + "loss": 0.1994, + "step": 5436 + }, + { + "epoch": 1.2613385918106947, + "grad_norm": 13.866096902646714, + "learning_rate": 2e-06, + "loss": 0.25, + "step": 5437 + }, + { + "epoch": 1.2615705834589954, + "grad_norm": 15.601652672968376, + "learning_rate": 2e-06, + "loss": 0.1956, + "step": 5438 + }, + { + "epoch": 1.261802575107296, + "grad_norm": 9.407943419133732, + "learning_rate": 2e-06, + "loss": 0.2155, + "step": 5439 + }, + { + "epoch": 1.2620345667555968, + "grad_norm": 21.304630727887947, + "learning_rate": 2e-06, + "loss": 0.2215, + "step": 5440 + }, + { + "epoch": 1.2622665584038975, + "grad_norm": 18.56505166670855, + "learning_rate": 2e-06, + "loss": 0.3319, + "step": 5441 + }, + { + "epoch": 1.2624985500521981, + "grad_norm": 11.90138217449211, + "learning_rate": 2e-06, + "loss": 0.2573, + "step": 5442 + }, + { + "epoch": 1.2627305417004988, + "grad_norm": 15.059252412285858, + "learning_rate": 2e-06, + "loss": 0.2218, + "step": 5443 + }, + { + "epoch": 1.2629625333487995, + "grad_norm": 8.775996304124295, + "learning_rate": 2e-06, + "loss": 0.1364, + "step": 5444 + }, + { + "epoch": 1.2631945249971002, + "grad_norm": 12.641643012310846, + "learning_rate": 2e-06, + "loss": 0.2702, + "step": 5445 + }, + { + "epoch": 1.2634265166454008, + "grad_norm": 20.388734064223414, + "learning_rate": 2e-06, + "loss": 0.3179, + "step": 5446 + }, + { + "epoch": 1.2636585082937013, + "grad_norm": 17.263471978052475, + "learning_rate": 2e-06, + "loss": 0.3227, + "step": 5447 + }, + { + "epoch": 1.263890499942002, + "grad_norm": 13.912072866064792, + "learning_rate": 2e-06, + "loss": 0.2209, + "step": 5448 + }, + { + "epoch": 1.2641224915903027, + "grad_norm": 17.87351186270327, + "learning_rate": 2e-06, + "loss": 0.347, + "step": 5449 + }, + { + "epoch": 1.2643544832386033, + "grad_norm": 20.927462227177482, + "learning_rate": 2e-06, + "loss": 0.3443, + "step": 5450 + }, + { + "epoch": 1.264586474886904, + "grad_norm": 14.92230797263229, + "learning_rate": 2e-06, + "loss": 0.2529, + "step": 5451 + }, + { + "epoch": 1.2648184665352047, + "grad_norm": 11.658954372633668, + "learning_rate": 2e-06, + "loss": 0.1356, + "step": 5452 + }, + { + "epoch": 1.2650504581835054, + "grad_norm": 16.889816040298218, + "learning_rate": 2e-06, + "loss": 0.2544, + "step": 5453 + }, + { + "epoch": 1.265282449831806, + "grad_norm": 17.986590759970184, + "learning_rate": 2e-06, + "loss": 0.2652, + "step": 5454 + }, + { + "epoch": 1.2655144414801067, + "grad_norm": 15.375217352482847, + "learning_rate": 2e-06, + "loss": 0.2162, + "step": 5455 + }, + { + "epoch": 1.2657464331284074, + "grad_norm": 13.563201322950821, + "learning_rate": 2e-06, + "loss": 0.2364, + "step": 5456 + }, + { + "epoch": 1.265978424776708, + "grad_norm": 14.413978406555218, + "learning_rate": 2e-06, + "loss": 0.2424, + "step": 5457 + }, + { + "epoch": 1.2662104164250088, + "grad_norm": 16.31505711897065, + "learning_rate": 2e-06, + "loss": 0.2526, + "step": 5458 + }, + { + "epoch": 1.2664424080733094, + "grad_norm": 8.752232702803143, + "learning_rate": 2e-06, + "loss": 0.1601, + "step": 5459 + }, + { + "epoch": 1.2666743997216101, + "grad_norm": 9.794706568555943, + "learning_rate": 2e-06, + "loss": 0.1836, + "step": 5460 + }, + { + "epoch": 1.2669063913699108, + "grad_norm": 13.647159948468177, + "learning_rate": 2e-06, + "loss": 0.2018, + "step": 5461 + }, + { + "epoch": 1.2671383830182115, + "grad_norm": 14.812271457713734, + "learning_rate": 2e-06, + "loss": 0.2389, + "step": 5462 + }, + { + "epoch": 1.267370374666512, + "grad_norm": 8.905656894633575, + "learning_rate": 2e-06, + "loss": 0.1496, + "step": 5463 + }, + { + "epoch": 1.2676023663148126, + "grad_norm": 9.256200576462955, + "learning_rate": 2e-06, + "loss": 0.1749, + "step": 5464 + }, + { + "epoch": 1.2678343579631133, + "grad_norm": 12.138821016330132, + "learning_rate": 2e-06, + "loss": 0.3476, + "step": 5465 + }, + { + "epoch": 1.268066349611414, + "grad_norm": 14.589430451668205, + "learning_rate": 2e-06, + "loss": 0.1956, + "step": 5466 + }, + { + "epoch": 1.2682983412597146, + "grad_norm": 18.385243783916064, + "learning_rate": 2e-06, + "loss": 0.2236, + "step": 5467 + }, + { + "epoch": 1.2685303329080153, + "grad_norm": 15.912545380757068, + "learning_rate": 2e-06, + "loss": 0.2797, + "step": 5468 + }, + { + "epoch": 1.268762324556316, + "grad_norm": 24.290486722780003, + "learning_rate": 2e-06, + "loss": 0.336, + "step": 5469 + }, + { + "epoch": 1.2689943162046167, + "grad_norm": 17.68733338274814, + "learning_rate": 2e-06, + "loss": 0.3079, + "step": 5470 + }, + { + "epoch": 1.2692263078529173, + "grad_norm": 29.58373573181717, + "learning_rate": 2e-06, + "loss": 0.2578, + "step": 5471 + }, + { + "epoch": 1.269458299501218, + "grad_norm": 21.49698383474083, + "learning_rate": 2e-06, + "loss": 0.2823, + "step": 5472 + }, + { + "epoch": 1.2696902911495185, + "grad_norm": 20.99655313594657, + "learning_rate": 2e-06, + "loss": 0.3464, + "step": 5473 + }, + { + "epoch": 1.2699222827978192, + "grad_norm": 12.60122524250526, + "learning_rate": 2e-06, + "loss": 0.404, + "step": 5474 + }, + { + "epoch": 1.2701542744461198, + "grad_norm": 11.14334254645051, + "learning_rate": 2e-06, + "loss": 0.259, + "step": 5475 + }, + { + "epoch": 1.2703862660944205, + "grad_norm": 6.256460629933859, + "learning_rate": 2e-06, + "loss": 0.1994, + "step": 5476 + }, + { + "epoch": 1.2706182577427212, + "grad_norm": 15.74119024883826, + "learning_rate": 2e-06, + "loss": 0.2372, + "step": 5477 + }, + { + "epoch": 1.2708502493910219, + "grad_norm": 22.71832340845438, + "learning_rate": 2e-06, + "loss": 0.3303, + "step": 5478 + }, + { + "epoch": 1.2710822410393225, + "grad_norm": 9.136667866436637, + "learning_rate": 2e-06, + "loss": 0.214, + "step": 5479 + }, + { + "epoch": 1.2713142326876232, + "grad_norm": 22.00409773731631, + "learning_rate": 2e-06, + "loss": 0.2791, + "step": 5480 + }, + { + "epoch": 1.271546224335924, + "grad_norm": 14.514457099733832, + "learning_rate": 2e-06, + "loss": 0.1993, + "step": 5481 + }, + { + "epoch": 1.2717782159842246, + "grad_norm": 19.24414215082918, + "learning_rate": 2e-06, + "loss": 0.239, + "step": 5482 + }, + { + "epoch": 1.2720102076325253, + "grad_norm": 10.895891454072098, + "learning_rate": 2e-06, + "loss": 0.1715, + "step": 5483 + }, + { + "epoch": 1.272242199280826, + "grad_norm": 9.632069525604589, + "learning_rate": 2e-06, + "loss": 0.328, + "step": 5484 + }, + { + "epoch": 1.2724741909291266, + "grad_norm": 9.230036573343929, + "learning_rate": 2e-06, + "loss": 0.1802, + "step": 5485 + }, + { + "epoch": 1.2727061825774273, + "grad_norm": 8.919987934719751, + "learning_rate": 2e-06, + "loss": 0.1919, + "step": 5486 + }, + { + "epoch": 1.272938174225728, + "grad_norm": 21.874171401796, + "learning_rate": 2e-06, + "loss": 0.4171, + "step": 5487 + }, + { + "epoch": 1.2731701658740286, + "grad_norm": 21.073806899512824, + "learning_rate": 2e-06, + "loss": 0.2203, + "step": 5488 + }, + { + "epoch": 1.2734021575223293, + "grad_norm": 19.802684449859967, + "learning_rate": 2e-06, + "loss": 0.2651, + "step": 5489 + }, + { + "epoch": 1.2736341491706298, + "grad_norm": 8.130642986566825, + "learning_rate": 2e-06, + "loss": 0.2091, + "step": 5490 + }, + { + "epoch": 1.2738661408189305, + "grad_norm": 16.65568102886136, + "learning_rate": 2e-06, + "loss": 0.195, + "step": 5491 + }, + { + "epoch": 1.2740981324672311, + "grad_norm": 46.93369863196454, + "learning_rate": 2e-06, + "loss": 0.2481, + "step": 5492 + }, + { + "epoch": 1.2743301241155318, + "grad_norm": 19.882996928850734, + "learning_rate": 2e-06, + "loss": 0.3797, + "step": 5493 + }, + { + "epoch": 1.2745621157638325, + "grad_norm": 6.786034128081566, + "learning_rate": 2e-06, + "loss": 0.1929, + "step": 5494 + }, + { + "epoch": 1.2747941074121332, + "grad_norm": 10.954183622735066, + "learning_rate": 2e-06, + "loss": 0.1846, + "step": 5495 + }, + { + "epoch": 1.2750260990604338, + "grad_norm": 17.289913825248284, + "learning_rate": 2e-06, + "loss": 0.2575, + "step": 5496 + }, + { + "epoch": 1.2752580907087345, + "grad_norm": 9.900332056828477, + "learning_rate": 2e-06, + "loss": 0.1919, + "step": 5497 + }, + { + "epoch": 1.2754900823570352, + "grad_norm": 11.216172941552085, + "learning_rate": 2e-06, + "loss": 0.2311, + "step": 5498 + }, + { + "epoch": 1.2757220740053359, + "grad_norm": 18.067500250176654, + "learning_rate": 2e-06, + "loss": 0.2861, + "step": 5499 + }, + { + "epoch": 1.2759540656536363, + "grad_norm": 9.544040849535888, + "learning_rate": 2e-06, + "loss": 0.2313, + "step": 5500 + }, + { + "epoch": 1.276186057301937, + "grad_norm": 19.109610262028383, + "learning_rate": 2e-06, + "loss": 0.2947, + "step": 5501 + }, + { + "epoch": 1.2764180489502377, + "grad_norm": 8.074254510862858, + "learning_rate": 2e-06, + "loss": 0.1712, + "step": 5502 + }, + { + "epoch": 1.2766500405985384, + "grad_norm": 16.010980414431298, + "learning_rate": 2e-06, + "loss": 0.3747, + "step": 5503 + }, + { + "epoch": 1.276882032246839, + "grad_norm": 13.51093579441853, + "learning_rate": 2e-06, + "loss": 0.2592, + "step": 5504 + }, + { + "epoch": 1.2771140238951397, + "grad_norm": 14.669785165177656, + "learning_rate": 2e-06, + "loss": 0.2628, + "step": 5505 + }, + { + "epoch": 1.2773460155434404, + "grad_norm": 7.660559763630664, + "learning_rate": 2e-06, + "loss": 0.1481, + "step": 5506 + }, + { + "epoch": 1.277578007191741, + "grad_norm": 20.24682997014625, + "learning_rate": 2e-06, + "loss": 0.3538, + "step": 5507 + }, + { + "epoch": 1.2778099988400418, + "grad_norm": 13.362958359090564, + "learning_rate": 2e-06, + "loss": 0.2271, + "step": 5508 + }, + { + "epoch": 1.2780419904883424, + "grad_norm": 8.164473390361263, + "learning_rate": 2e-06, + "loss": 0.1568, + "step": 5509 + }, + { + "epoch": 1.2782739821366431, + "grad_norm": 6.75118446837882, + "learning_rate": 2e-06, + "loss": 0.1275, + "step": 5510 + }, + { + "epoch": 1.2785059737849438, + "grad_norm": 8.801135729909866, + "learning_rate": 2e-06, + "loss": 0.1721, + "step": 5511 + }, + { + "epoch": 1.2787379654332445, + "grad_norm": 19.006232049977932, + "learning_rate": 2e-06, + "loss": 0.3433, + "step": 5512 + }, + { + "epoch": 1.2789699570815452, + "grad_norm": 8.830534900711758, + "learning_rate": 2e-06, + "loss": 0.1636, + "step": 5513 + }, + { + "epoch": 1.2792019487298458, + "grad_norm": 3.593403289783236, + "learning_rate": 2e-06, + "loss": 0.1181, + "step": 5514 + }, + { + "epoch": 1.2794339403781465, + "grad_norm": 20.464133931736466, + "learning_rate": 2e-06, + "loss": 0.3697, + "step": 5515 + }, + { + "epoch": 1.2796659320264472, + "grad_norm": 22.020806574162382, + "learning_rate": 2e-06, + "loss": 0.2337, + "step": 5516 + }, + { + "epoch": 1.2798979236747476, + "grad_norm": 11.288169129492667, + "learning_rate": 2e-06, + "loss": 0.2503, + "step": 5517 + }, + { + "epoch": 1.2801299153230483, + "grad_norm": 12.874188117641705, + "learning_rate": 2e-06, + "loss": 0.3134, + "step": 5518 + }, + { + "epoch": 1.280361906971349, + "grad_norm": 12.527139165639522, + "learning_rate": 2e-06, + "loss": 0.3227, + "step": 5519 + }, + { + "epoch": 1.2805938986196497, + "grad_norm": 14.050860916922632, + "learning_rate": 2e-06, + "loss": 0.4138, + "step": 5520 + }, + { + "epoch": 1.2808258902679504, + "grad_norm": 17.575926637021528, + "learning_rate": 2e-06, + "loss": 0.4182, + "step": 5521 + }, + { + "epoch": 1.281057881916251, + "grad_norm": 38.30070940939618, + "learning_rate": 2e-06, + "loss": 0.2344, + "step": 5522 + }, + { + "epoch": 1.2812898735645517, + "grad_norm": 15.721424393613805, + "learning_rate": 2e-06, + "loss": 0.2039, + "step": 5523 + }, + { + "epoch": 1.2815218652128524, + "grad_norm": 14.438274765990126, + "learning_rate": 2e-06, + "loss": 0.2811, + "step": 5524 + }, + { + "epoch": 1.281753856861153, + "grad_norm": 11.964371077643447, + "learning_rate": 2e-06, + "loss": 0.1871, + "step": 5525 + }, + { + "epoch": 1.2819858485094537, + "grad_norm": 11.192127859492148, + "learning_rate": 2e-06, + "loss": 0.2232, + "step": 5526 + }, + { + "epoch": 1.2822178401577542, + "grad_norm": 16.55720382812037, + "learning_rate": 2e-06, + "loss": 0.3419, + "step": 5527 + }, + { + "epoch": 1.2824498318060549, + "grad_norm": 11.859714499129243, + "learning_rate": 2e-06, + "loss": 0.3109, + "step": 5528 + }, + { + "epoch": 1.2826818234543556, + "grad_norm": 12.773674037445563, + "learning_rate": 2e-06, + "loss": 0.2546, + "step": 5529 + }, + { + "epoch": 1.2829138151026562, + "grad_norm": 20.39897546810617, + "learning_rate": 2e-06, + "loss": 0.3194, + "step": 5530 + }, + { + "epoch": 1.283145806750957, + "grad_norm": 16.604649378816013, + "learning_rate": 2e-06, + "loss": 0.2979, + "step": 5531 + }, + { + "epoch": 1.2833777983992576, + "grad_norm": 10.145216773558865, + "learning_rate": 2e-06, + "loss": 0.1958, + "step": 5532 + }, + { + "epoch": 1.2836097900475583, + "grad_norm": 12.156091156673625, + "learning_rate": 2e-06, + "loss": 0.1559, + "step": 5533 + }, + { + "epoch": 1.283841781695859, + "grad_norm": 14.025709421254515, + "learning_rate": 2e-06, + "loss": 0.1721, + "step": 5534 + }, + { + "epoch": 1.2840737733441596, + "grad_norm": 11.107260139493441, + "learning_rate": 2e-06, + "loss": 0.266, + "step": 5535 + }, + { + "epoch": 1.2843057649924603, + "grad_norm": 11.576453611507793, + "learning_rate": 2e-06, + "loss": 0.2343, + "step": 5536 + }, + { + "epoch": 1.284537756640761, + "grad_norm": 8.007601321516523, + "learning_rate": 2e-06, + "loss": 0.1732, + "step": 5537 + }, + { + "epoch": 1.2847697482890617, + "grad_norm": 11.928698418916918, + "learning_rate": 2e-06, + "loss": 0.1612, + "step": 5538 + }, + { + "epoch": 1.2850017399373623, + "grad_norm": 17.465166018146665, + "learning_rate": 2e-06, + "loss": 0.3162, + "step": 5539 + }, + { + "epoch": 1.285233731585663, + "grad_norm": 5.783970569015845, + "learning_rate": 2e-06, + "loss": 0.1845, + "step": 5540 + }, + { + "epoch": 1.2854657232339637, + "grad_norm": 9.157428800986871, + "learning_rate": 2e-06, + "loss": 0.2112, + "step": 5541 + }, + { + "epoch": 1.2856977148822644, + "grad_norm": 4.1076278120894, + "learning_rate": 2e-06, + "loss": 0.1123, + "step": 5542 + }, + { + "epoch": 1.2859297065305648, + "grad_norm": 10.11312397969375, + "learning_rate": 2e-06, + "loss": 0.1765, + "step": 5543 + }, + { + "epoch": 1.2861616981788655, + "grad_norm": 11.274897323273564, + "learning_rate": 2e-06, + "loss": 0.1913, + "step": 5544 + }, + { + "epoch": 1.2863936898271662, + "grad_norm": 19.606507289753136, + "learning_rate": 2e-06, + "loss": 0.3079, + "step": 5545 + }, + { + "epoch": 1.2866256814754669, + "grad_norm": 9.684645461882754, + "learning_rate": 2e-06, + "loss": 0.1835, + "step": 5546 + }, + { + "epoch": 1.2868576731237675, + "grad_norm": 9.737675214521724, + "learning_rate": 2e-06, + "loss": 0.2299, + "step": 5547 + }, + { + "epoch": 1.2870896647720682, + "grad_norm": 10.599090403823574, + "learning_rate": 2e-06, + "loss": 0.2373, + "step": 5548 + }, + { + "epoch": 1.2873216564203689, + "grad_norm": 16.630432627012777, + "learning_rate": 2e-06, + "loss": 0.3984, + "step": 5549 + }, + { + "epoch": 1.2875536480686696, + "grad_norm": 17.729293437700733, + "learning_rate": 2e-06, + "loss": 0.2671, + "step": 5550 + }, + { + "epoch": 1.2877856397169702, + "grad_norm": 12.05835176586732, + "learning_rate": 2e-06, + "loss": 0.2906, + "step": 5551 + }, + { + "epoch": 1.288017631365271, + "grad_norm": 14.440985815174287, + "learning_rate": 2e-06, + "loss": 0.2058, + "step": 5552 + }, + { + "epoch": 1.2882496230135714, + "grad_norm": 16.84003641847929, + "learning_rate": 2e-06, + "loss": 0.32, + "step": 5553 + }, + { + "epoch": 1.288481614661872, + "grad_norm": 20.495041375308226, + "learning_rate": 2e-06, + "loss": 0.1994, + "step": 5554 + }, + { + "epoch": 1.2887136063101727, + "grad_norm": 21.208082266804475, + "learning_rate": 2e-06, + "loss": 0.4303, + "step": 5555 + }, + { + "epoch": 1.2889455979584734, + "grad_norm": 10.462591620428805, + "learning_rate": 2e-06, + "loss": 0.1862, + "step": 5556 + }, + { + "epoch": 1.289177589606774, + "grad_norm": 33.57346108492076, + "learning_rate": 2e-06, + "loss": 0.4521, + "step": 5557 + }, + { + "epoch": 1.2894095812550748, + "grad_norm": 9.690303304788598, + "learning_rate": 2e-06, + "loss": 0.1659, + "step": 5558 + }, + { + "epoch": 1.2896415729033754, + "grad_norm": 26.12477597482978, + "learning_rate": 2e-06, + "loss": 0.3697, + "step": 5559 + }, + { + "epoch": 1.2898735645516761, + "grad_norm": 10.753843000487956, + "learning_rate": 2e-06, + "loss": 0.1845, + "step": 5560 + }, + { + "epoch": 1.2901055561999768, + "grad_norm": 11.94320212794658, + "learning_rate": 2e-06, + "loss": 0.3165, + "step": 5561 + }, + { + "epoch": 1.2903375478482775, + "grad_norm": 8.074090848780601, + "learning_rate": 2e-06, + "loss": 0.1782, + "step": 5562 + }, + { + "epoch": 1.2905695394965782, + "grad_norm": 9.986874563274212, + "learning_rate": 2e-06, + "loss": 0.2196, + "step": 5563 + }, + { + "epoch": 1.2908015311448788, + "grad_norm": 13.88506300781901, + "learning_rate": 2e-06, + "loss": 0.2352, + "step": 5564 + }, + { + "epoch": 1.2910335227931795, + "grad_norm": 12.780056004774648, + "learning_rate": 2e-06, + "loss": 0.206, + "step": 5565 + }, + { + "epoch": 1.2912655144414802, + "grad_norm": 17.948578222912435, + "learning_rate": 2e-06, + "loss": 0.3228, + "step": 5566 + }, + { + "epoch": 1.2914975060897809, + "grad_norm": 8.911005690791237, + "learning_rate": 2e-06, + "loss": 0.2329, + "step": 5567 + }, + { + "epoch": 1.2917294977380815, + "grad_norm": 9.354454545488181, + "learning_rate": 2e-06, + "loss": 0.2418, + "step": 5568 + }, + { + "epoch": 1.2919614893863822, + "grad_norm": 21.323098522700068, + "learning_rate": 2e-06, + "loss": 0.2735, + "step": 5569 + }, + { + "epoch": 1.2921934810346827, + "grad_norm": 15.446059925915666, + "learning_rate": 2e-06, + "loss": 0.2856, + "step": 5570 + }, + { + "epoch": 1.2924254726829834, + "grad_norm": 8.774450735495995, + "learning_rate": 2e-06, + "loss": 0.2065, + "step": 5571 + }, + { + "epoch": 1.292657464331284, + "grad_norm": 8.875266415022931, + "learning_rate": 2e-06, + "loss": 0.1516, + "step": 5572 + }, + { + "epoch": 1.2928894559795847, + "grad_norm": 25.699395396889404, + "learning_rate": 2e-06, + "loss": 0.1989, + "step": 5573 + }, + { + "epoch": 1.2931214476278854, + "grad_norm": 13.780637111179585, + "learning_rate": 2e-06, + "loss": 0.1974, + "step": 5574 + }, + { + "epoch": 1.293353439276186, + "grad_norm": 10.596894116088386, + "learning_rate": 2e-06, + "loss": 0.1873, + "step": 5575 + }, + { + "epoch": 1.2935854309244867, + "grad_norm": 15.882696742741262, + "learning_rate": 2e-06, + "loss": 0.275, + "step": 5576 + }, + { + "epoch": 1.2938174225727874, + "grad_norm": 16.749738561496294, + "learning_rate": 2e-06, + "loss": 0.2766, + "step": 5577 + }, + { + "epoch": 1.294049414221088, + "grad_norm": 22.970839761942415, + "learning_rate": 2e-06, + "loss": 0.303, + "step": 5578 + }, + { + "epoch": 1.2942814058693888, + "grad_norm": 10.956263232344353, + "learning_rate": 2e-06, + "loss": 0.2567, + "step": 5579 + }, + { + "epoch": 1.2945133975176892, + "grad_norm": 12.000426780774758, + "learning_rate": 2e-06, + "loss": 0.2221, + "step": 5580 + }, + { + "epoch": 1.29474538916599, + "grad_norm": 11.959888501070827, + "learning_rate": 2e-06, + "loss": 0.2105, + "step": 5581 + }, + { + "epoch": 1.2949773808142906, + "grad_norm": 13.004029224978572, + "learning_rate": 2e-06, + "loss": 0.2816, + "step": 5582 + }, + { + "epoch": 1.2952093724625913, + "grad_norm": 16.552747214762597, + "learning_rate": 2e-06, + "loss": 0.2822, + "step": 5583 + }, + { + "epoch": 1.295441364110892, + "grad_norm": 7.828758165100487, + "learning_rate": 2e-06, + "loss": 0.1445, + "step": 5584 + }, + { + "epoch": 1.2956733557591926, + "grad_norm": 9.69840028310193, + "learning_rate": 2e-06, + "loss": 0.192, + "step": 5585 + }, + { + "epoch": 1.2959053474074933, + "grad_norm": 8.594676831269815, + "learning_rate": 2e-06, + "loss": 0.2646, + "step": 5586 + }, + { + "epoch": 1.296137339055794, + "grad_norm": 19.72568385506288, + "learning_rate": 2e-06, + "loss": 0.4113, + "step": 5587 + }, + { + "epoch": 1.2963693307040947, + "grad_norm": 11.60141821509818, + "learning_rate": 2e-06, + "loss": 0.2368, + "step": 5588 + }, + { + "epoch": 1.2966013223523953, + "grad_norm": 11.360073503368694, + "learning_rate": 2e-06, + "loss": 0.1508, + "step": 5589 + }, + { + "epoch": 1.296833314000696, + "grad_norm": 11.466245032578335, + "learning_rate": 2e-06, + "loss": 0.1911, + "step": 5590 + }, + { + "epoch": 1.2970653056489967, + "grad_norm": 11.352394051290077, + "learning_rate": 2e-06, + "loss": 0.1763, + "step": 5591 + }, + { + "epoch": 1.2972972972972974, + "grad_norm": 16.11533083404958, + "learning_rate": 2e-06, + "loss": 0.2902, + "step": 5592 + }, + { + "epoch": 1.297529288945598, + "grad_norm": 12.36723660219495, + "learning_rate": 2e-06, + "loss": 0.2096, + "step": 5593 + }, + { + "epoch": 1.2977612805938987, + "grad_norm": 30.89625173241812, + "learning_rate": 2e-06, + "loss": 0.2666, + "step": 5594 + }, + { + "epoch": 1.2979932722421994, + "grad_norm": 19.514344172121138, + "learning_rate": 2e-06, + "loss": 0.2418, + "step": 5595 + }, + { + "epoch": 1.2982252638905, + "grad_norm": 10.788689229972393, + "learning_rate": 2e-06, + "loss": 0.1786, + "step": 5596 + }, + { + "epoch": 1.2984572555388005, + "grad_norm": 15.45400603168222, + "learning_rate": 2e-06, + "loss": 0.2733, + "step": 5597 + }, + { + "epoch": 1.2986892471871012, + "grad_norm": 13.722921586738387, + "learning_rate": 2e-06, + "loss": 0.2962, + "step": 5598 + }, + { + "epoch": 1.298921238835402, + "grad_norm": 19.39647911275564, + "learning_rate": 2e-06, + "loss": 0.3514, + "step": 5599 + }, + { + "epoch": 1.2991532304837026, + "grad_norm": 15.274235619440297, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 5600 + }, + { + "epoch": 1.2993852221320032, + "grad_norm": 16.11752315764069, + "learning_rate": 2e-06, + "loss": 0.2394, + "step": 5601 + }, + { + "epoch": 1.299617213780304, + "grad_norm": 24.618352423068448, + "learning_rate": 2e-06, + "loss": 0.3625, + "step": 5602 + }, + { + "epoch": 1.2998492054286046, + "grad_norm": 24.593954311058177, + "learning_rate": 2e-06, + "loss": 0.2486, + "step": 5603 + }, + { + "epoch": 1.3000811970769053, + "grad_norm": 23.962619857204736, + "learning_rate": 2e-06, + "loss": 0.2788, + "step": 5604 + }, + { + "epoch": 1.300313188725206, + "grad_norm": 13.745233766397064, + "learning_rate": 2e-06, + "loss": 0.292, + "step": 5605 + }, + { + "epoch": 1.3005451803735064, + "grad_norm": 184.99953181781865, + "learning_rate": 2e-06, + "loss": 0.2061, + "step": 5606 + }, + { + "epoch": 1.300777172021807, + "grad_norm": 11.009695789207703, + "learning_rate": 2e-06, + "loss": 0.2157, + "step": 5607 + }, + { + "epoch": 1.3010091636701078, + "grad_norm": 15.03713330803401, + "learning_rate": 2e-06, + "loss": 0.2397, + "step": 5608 + }, + { + "epoch": 1.3012411553184084, + "grad_norm": 17.794040596623752, + "learning_rate": 2e-06, + "loss": 0.2294, + "step": 5609 + }, + { + "epoch": 1.3014731469667091, + "grad_norm": 12.91998866491081, + "learning_rate": 2e-06, + "loss": 0.2401, + "step": 5610 + }, + { + "epoch": 1.3017051386150098, + "grad_norm": 15.712347452714706, + "learning_rate": 2e-06, + "loss": 0.1369, + "step": 5611 + }, + { + "epoch": 1.3019371302633105, + "grad_norm": 18.194023250326826, + "learning_rate": 2e-06, + "loss": 0.2016, + "step": 5612 + }, + { + "epoch": 1.3021691219116112, + "grad_norm": 10.73759855968218, + "learning_rate": 2e-06, + "loss": 0.1686, + "step": 5613 + }, + { + "epoch": 1.3024011135599118, + "grad_norm": 12.442952329564145, + "learning_rate": 2e-06, + "loss": 0.2391, + "step": 5614 + }, + { + "epoch": 1.3026331052082125, + "grad_norm": 23.27162006463831, + "learning_rate": 2e-06, + "loss": 0.2731, + "step": 5615 + }, + { + "epoch": 1.3028650968565132, + "grad_norm": 15.22373633972883, + "learning_rate": 2e-06, + "loss": 0.3134, + "step": 5616 + }, + { + "epoch": 1.3030970885048139, + "grad_norm": 8.903182393289645, + "learning_rate": 2e-06, + "loss": 0.1805, + "step": 5617 + }, + { + "epoch": 1.3033290801531145, + "grad_norm": 25.937526008653386, + "learning_rate": 2e-06, + "loss": 0.2313, + "step": 5618 + }, + { + "epoch": 1.3035610718014152, + "grad_norm": 17.577990312401784, + "learning_rate": 2e-06, + "loss": 0.3776, + "step": 5619 + }, + { + "epoch": 1.303793063449716, + "grad_norm": 22.427564565601266, + "learning_rate": 2e-06, + "loss": 0.256, + "step": 5620 + }, + { + "epoch": 1.3040250550980166, + "grad_norm": 15.660251500511174, + "learning_rate": 2e-06, + "loss": 0.2918, + "step": 5621 + }, + { + "epoch": 1.3042570467463173, + "grad_norm": 28.10834557005505, + "learning_rate": 2e-06, + "loss": 0.3871, + "step": 5622 + }, + { + "epoch": 1.3044890383946177, + "grad_norm": 8.673493743204125, + "learning_rate": 2e-06, + "loss": 0.2583, + "step": 5623 + }, + { + "epoch": 1.3047210300429184, + "grad_norm": 21.417289777883838, + "learning_rate": 2e-06, + "loss": 0.3396, + "step": 5624 + }, + { + "epoch": 1.304953021691219, + "grad_norm": 16.89284063484694, + "learning_rate": 2e-06, + "loss": 0.3283, + "step": 5625 + }, + { + "epoch": 1.3051850133395198, + "grad_norm": 29.120207682173884, + "learning_rate": 2e-06, + "loss": 0.3296, + "step": 5626 + }, + { + "epoch": 1.3054170049878204, + "grad_norm": 8.919708053307183, + "learning_rate": 2e-06, + "loss": 0.1461, + "step": 5627 + }, + { + "epoch": 1.305648996636121, + "grad_norm": 19.78253962837575, + "learning_rate": 2e-06, + "loss": 0.2993, + "step": 5628 + }, + { + "epoch": 1.3058809882844218, + "grad_norm": 17.402054742095746, + "learning_rate": 2e-06, + "loss": 0.3022, + "step": 5629 + }, + { + "epoch": 1.3061129799327225, + "grad_norm": 11.4823499978865, + "learning_rate": 2e-06, + "loss": 0.2225, + "step": 5630 + }, + { + "epoch": 1.3063449715810231, + "grad_norm": 7.3361505909982005, + "learning_rate": 2e-06, + "loss": 0.1902, + "step": 5631 + }, + { + "epoch": 1.3065769632293238, + "grad_norm": 27.559188119699623, + "learning_rate": 2e-06, + "loss": 0.2572, + "step": 5632 + }, + { + "epoch": 1.3068089548776243, + "grad_norm": 15.273929151441568, + "learning_rate": 2e-06, + "loss": 0.2725, + "step": 5633 + }, + { + "epoch": 1.307040946525925, + "grad_norm": 22.424000462796233, + "learning_rate": 2e-06, + "loss": 0.3106, + "step": 5634 + }, + { + "epoch": 1.3072729381742256, + "grad_norm": 14.713171143247271, + "learning_rate": 2e-06, + "loss": 0.1986, + "step": 5635 + }, + { + "epoch": 1.3075049298225263, + "grad_norm": 19.197749791692836, + "learning_rate": 2e-06, + "loss": 0.2414, + "step": 5636 + }, + { + "epoch": 1.307736921470827, + "grad_norm": 9.70171890144793, + "learning_rate": 2e-06, + "loss": 0.2229, + "step": 5637 + }, + { + "epoch": 1.3079689131191277, + "grad_norm": 11.136763234840572, + "learning_rate": 2e-06, + "loss": 0.1674, + "step": 5638 + }, + { + "epoch": 1.3082009047674283, + "grad_norm": 26.03212305103045, + "learning_rate": 2e-06, + "loss": 0.3436, + "step": 5639 + }, + { + "epoch": 1.308432896415729, + "grad_norm": 21.30713547042178, + "learning_rate": 2e-06, + "loss": 0.2795, + "step": 5640 + }, + { + "epoch": 1.3086648880640297, + "grad_norm": 10.993965951343093, + "learning_rate": 2e-06, + "loss": 0.1959, + "step": 5641 + }, + { + "epoch": 1.3088968797123304, + "grad_norm": 17.769077676138195, + "learning_rate": 2e-06, + "loss": 0.2873, + "step": 5642 + }, + { + "epoch": 1.309128871360631, + "grad_norm": 18.506312434168304, + "learning_rate": 2e-06, + "loss": 0.255, + "step": 5643 + }, + { + "epoch": 1.3093608630089317, + "grad_norm": 6.793232387244837, + "learning_rate": 2e-06, + "loss": 0.174, + "step": 5644 + }, + { + "epoch": 1.3095928546572324, + "grad_norm": 32.49566834679004, + "learning_rate": 2e-06, + "loss": 0.2574, + "step": 5645 + }, + { + "epoch": 1.309824846305533, + "grad_norm": 17.02251550801001, + "learning_rate": 2e-06, + "loss": 0.3174, + "step": 5646 + }, + { + "epoch": 1.3100568379538338, + "grad_norm": 12.841461665684056, + "learning_rate": 2e-06, + "loss": 0.3524, + "step": 5647 + }, + { + "epoch": 1.3102888296021344, + "grad_norm": 19.076546400110157, + "learning_rate": 2e-06, + "loss": 0.2798, + "step": 5648 + }, + { + "epoch": 1.3105208212504351, + "grad_norm": 13.18433741810789, + "learning_rate": 2e-06, + "loss": 0.3233, + "step": 5649 + }, + { + "epoch": 1.3107528128987356, + "grad_norm": 11.135069470099385, + "learning_rate": 2e-06, + "loss": 0.1642, + "step": 5650 + }, + { + "epoch": 1.3109848045470363, + "grad_norm": 15.315677523471457, + "learning_rate": 2e-06, + "loss": 0.2652, + "step": 5651 + }, + { + "epoch": 1.311216796195337, + "grad_norm": 10.62905135472249, + "learning_rate": 2e-06, + "loss": 0.2165, + "step": 5652 + }, + { + "epoch": 1.3114487878436376, + "grad_norm": 11.231634605332838, + "learning_rate": 2e-06, + "loss": 0.2613, + "step": 5653 + }, + { + "epoch": 1.3116807794919383, + "grad_norm": 9.078649052887393, + "learning_rate": 2e-06, + "loss": 0.2143, + "step": 5654 + }, + { + "epoch": 1.311912771140239, + "grad_norm": 21.39401032753167, + "learning_rate": 2e-06, + "loss": 0.3495, + "step": 5655 + }, + { + "epoch": 1.3121447627885396, + "grad_norm": 17.904213558896597, + "learning_rate": 2e-06, + "loss": 0.2527, + "step": 5656 + }, + { + "epoch": 1.3123767544368403, + "grad_norm": 15.558548274464785, + "learning_rate": 2e-06, + "loss": 0.1924, + "step": 5657 + }, + { + "epoch": 1.312608746085141, + "grad_norm": 12.029762240958295, + "learning_rate": 2e-06, + "loss": 0.2173, + "step": 5658 + }, + { + "epoch": 1.3128407377334417, + "grad_norm": 25.30959589435329, + "learning_rate": 2e-06, + "loss": 0.2702, + "step": 5659 + }, + { + "epoch": 1.3130727293817421, + "grad_norm": 10.092299250446088, + "learning_rate": 2e-06, + "loss": 0.1949, + "step": 5660 + }, + { + "epoch": 1.3133047210300428, + "grad_norm": 13.306741014180796, + "learning_rate": 2e-06, + "loss": 0.2432, + "step": 5661 + }, + { + "epoch": 1.3135367126783435, + "grad_norm": 17.398002407527894, + "learning_rate": 2e-06, + "loss": 0.2507, + "step": 5662 + }, + { + "epoch": 1.3137687043266442, + "grad_norm": 13.958106857172252, + "learning_rate": 2e-06, + "loss": 0.2637, + "step": 5663 + }, + { + "epoch": 1.3140006959749448, + "grad_norm": 11.373230060748023, + "learning_rate": 2e-06, + "loss": 0.2399, + "step": 5664 + }, + { + "epoch": 1.3142326876232455, + "grad_norm": 15.86847179090274, + "learning_rate": 2e-06, + "loss": 0.2329, + "step": 5665 + }, + { + "epoch": 1.3144646792715462, + "grad_norm": 15.508532456260358, + "learning_rate": 2e-06, + "loss": 0.3161, + "step": 5666 + }, + { + "epoch": 1.3146966709198469, + "grad_norm": 15.532683378784451, + "learning_rate": 2e-06, + "loss": 0.3558, + "step": 5667 + }, + { + "epoch": 1.3149286625681476, + "grad_norm": 19.714299042898194, + "learning_rate": 2e-06, + "loss": 0.3513, + "step": 5668 + }, + { + "epoch": 1.3151606542164482, + "grad_norm": 13.958754944833625, + "learning_rate": 2e-06, + "loss": 0.2168, + "step": 5669 + }, + { + "epoch": 1.315392645864749, + "grad_norm": 30.644144100112875, + "learning_rate": 2e-06, + "loss": 0.3604, + "step": 5670 + }, + { + "epoch": 1.3156246375130496, + "grad_norm": 16.352621053156582, + "learning_rate": 2e-06, + "loss": 0.2741, + "step": 5671 + }, + { + "epoch": 1.3158566291613503, + "grad_norm": 19.73007160153037, + "learning_rate": 2e-06, + "loss": 0.261, + "step": 5672 + }, + { + "epoch": 1.316088620809651, + "grad_norm": 14.995894794886874, + "learning_rate": 2e-06, + "loss": 0.2534, + "step": 5673 + }, + { + "epoch": 1.3163206124579516, + "grad_norm": 15.945047875864542, + "learning_rate": 2e-06, + "loss": 0.3808, + "step": 5674 + }, + { + "epoch": 1.3165526041062523, + "grad_norm": 14.018774776451082, + "learning_rate": 2e-06, + "loss": 0.2702, + "step": 5675 + }, + { + "epoch": 1.3167845957545528, + "grad_norm": 12.70859105222736, + "learning_rate": 2e-06, + "loss": 0.24, + "step": 5676 + }, + { + "epoch": 1.3170165874028534, + "grad_norm": 9.678998173041654, + "learning_rate": 2e-06, + "loss": 0.1932, + "step": 5677 + }, + { + "epoch": 1.3172485790511541, + "grad_norm": 15.736690719275137, + "learning_rate": 2e-06, + "loss": 0.2398, + "step": 5678 + }, + { + "epoch": 1.3174805706994548, + "grad_norm": 13.60977667580227, + "learning_rate": 2e-06, + "loss": 0.2472, + "step": 5679 + }, + { + "epoch": 1.3177125623477555, + "grad_norm": 11.102773871126534, + "learning_rate": 2e-06, + "loss": 0.2326, + "step": 5680 + }, + { + "epoch": 1.3179445539960561, + "grad_norm": 17.291796040037262, + "learning_rate": 2e-06, + "loss": 0.346, + "step": 5681 + }, + { + "epoch": 1.3181765456443568, + "grad_norm": 18.166601354990185, + "learning_rate": 2e-06, + "loss": 0.2617, + "step": 5682 + }, + { + "epoch": 1.3184085372926575, + "grad_norm": 10.95806449961031, + "learning_rate": 2e-06, + "loss": 0.237, + "step": 5683 + }, + { + "epoch": 1.3186405289409582, + "grad_norm": 160.91351095779942, + "learning_rate": 2e-06, + "loss": 0.2951, + "step": 5684 + }, + { + "epoch": 1.3188725205892589, + "grad_norm": 10.603170771079542, + "learning_rate": 2e-06, + "loss": 0.1857, + "step": 5685 + }, + { + "epoch": 1.3191045122375593, + "grad_norm": 6.870903833321501, + "learning_rate": 2e-06, + "loss": 0.1642, + "step": 5686 + }, + { + "epoch": 1.31933650388586, + "grad_norm": 13.412277495806073, + "learning_rate": 2e-06, + "loss": 0.2572, + "step": 5687 + }, + { + "epoch": 1.3195684955341607, + "grad_norm": 16.00296720744652, + "learning_rate": 2e-06, + "loss": 0.2616, + "step": 5688 + }, + { + "epoch": 1.3198004871824613, + "grad_norm": 14.523893821884103, + "learning_rate": 2e-06, + "loss": 0.2931, + "step": 5689 + }, + { + "epoch": 1.320032478830762, + "grad_norm": 8.344921740838892, + "learning_rate": 2e-06, + "loss": 0.1381, + "step": 5690 + }, + { + "epoch": 1.3202644704790627, + "grad_norm": 18.55719833842272, + "learning_rate": 2e-06, + "loss": 0.2493, + "step": 5691 + }, + { + "epoch": 1.3204964621273634, + "grad_norm": 14.12605618900697, + "learning_rate": 2e-06, + "loss": 0.1944, + "step": 5692 + }, + { + "epoch": 1.320728453775664, + "grad_norm": 9.998184058061966, + "learning_rate": 2e-06, + "loss": 0.1542, + "step": 5693 + }, + { + "epoch": 1.3209604454239647, + "grad_norm": 15.911168575838987, + "learning_rate": 2e-06, + "loss": 0.4191, + "step": 5694 + }, + { + "epoch": 1.3211924370722654, + "grad_norm": 24.85883116892657, + "learning_rate": 2e-06, + "loss": 0.2957, + "step": 5695 + }, + { + "epoch": 1.321424428720566, + "grad_norm": 17.623611354513486, + "learning_rate": 2e-06, + "loss": 0.3902, + "step": 5696 + }, + { + "epoch": 1.3216564203688668, + "grad_norm": 16.48576591994104, + "learning_rate": 2e-06, + "loss": 0.3015, + "step": 5697 + }, + { + "epoch": 1.3218884120171674, + "grad_norm": 15.575788365937843, + "learning_rate": 2e-06, + "loss": 0.2635, + "step": 5698 + }, + { + "epoch": 1.3221204036654681, + "grad_norm": 20.85661933589363, + "learning_rate": 2e-06, + "loss": 0.2316, + "step": 5699 + }, + { + "epoch": 1.3223523953137688, + "grad_norm": 16.061174354807644, + "learning_rate": 2e-06, + "loss": 0.1946, + "step": 5700 + }, + { + "epoch": 1.3225843869620695, + "grad_norm": 17.84776593809947, + "learning_rate": 2e-06, + "loss": 0.2291, + "step": 5701 + }, + { + "epoch": 1.3228163786103702, + "grad_norm": 18.804374385719715, + "learning_rate": 2e-06, + "loss": 0.3436, + "step": 5702 + }, + { + "epoch": 1.3230483702586706, + "grad_norm": 13.939505997269132, + "learning_rate": 2e-06, + "loss": 0.2215, + "step": 5703 + }, + { + "epoch": 1.3232803619069713, + "grad_norm": 22.54661147857366, + "learning_rate": 2e-06, + "loss": 0.4171, + "step": 5704 + }, + { + "epoch": 1.323512353555272, + "grad_norm": 14.117182775507976, + "learning_rate": 2e-06, + "loss": 0.2152, + "step": 5705 + }, + { + "epoch": 1.3237443452035726, + "grad_norm": 18.175557503429257, + "learning_rate": 2e-06, + "loss": 0.2274, + "step": 5706 + }, + { + "epoch": 1.3239763368518733, + "grad_norm": 11.06563729243645, + "learning_rate": 2e-06, + "loss": 0.1603, + "step": 5707 + }, + { + "epoch": 1.324208328500174, + "grad_norm": 11.94708026064263, + "learning_rate": 2e-06, + "loss": 0.2145, + "step": 5708 + }, + { + "epoch": 1.3244403201484747, + "grad_norm": 19.632307362212533, + "learning_rate": 2e-06, + "loss": 0.3409, + "step": 5709 + }, + { + "epoch": 1.3246723117967754, + "grad_norm": 12.854868599897623, + "learning_rate": 2e-06, + "loss": 0.2245, + "step": 5710 + }, + { + "epoch": 1.324904303445076, + "grad_norm": 23.40658671310776, + "learning_rate": 2e-06, + "loss": 0.2682, + "step": 5711 + }, + { + "epoch": 1.3251362950933767, + "grad_norm": 16.268358378842592, + "learning_rate": 2e-06, + "loss": 0.3602, + "step": 5712 + }, + { + "epoch": 1.3253682867416772, + "grad_norm": 11.210694208149162, + "learning_rate": 2e-06, + "loss": 0.2451, + "step": 5713 + }, + { + "epoch": 1.3256002783899778, + "grad_norm": 13.093366884959643, + "learning_rate": 2e-06, + "loss": 0.1851, + "step": 5714 + }, + { + "epoch": 1.3258322700382785, + "grad_norm": 13.9470039807361, + "learning_rate": 2e-06, + "loss": 0.2528, + "step": 5715 + }, + { + "epoch": 1.3260642616865792, + "grad_norm": 13.826574290698455, + "learning_rate": 2e-06, + "loss": 0.2275, + "step": 5716 + }, + { + "epoch": 1.3262962533348799, + "grad_norm": 19.0989336579408, + "learning_rate": 2e-06, + "loss": 0.4041, + "step": 5717 + }, + { + "epoch": 1.3265282449831806, + "grad_norm": 10.94421028576376, + "learning_rate": 2e-06, + "loss": 0.1777, + "step": 5718 + }, + { + "epoch": 1.3267602366314812, + "grad_norm": 11.10383820485182, + "learning_rate": 2e-06, + "loss": 0.1856, + "step": 5719 + }, + { + "epoch": 1.326992228279782, + "grad_norm": 11.421442448851714, + "learning_rate": 2e-06, + "loss": 0.215, + "step": 5720 + }, + { + "epoch": 1.3272242199280826, + "grad_norm": 16.064869933616933, + "learning_rate": 2e-06, + "loss": 0.311, + "step": 5721 + }, + { + "epoch": 1.3274562115763833, + "grad_norm": 18.75785660722465, + "learning_rate": 2e-06, + "loss": 0.4559, + "step": 5722 + }, + { + "epoch": 1.327688203224684, + "grad_norm": 11.390146013990016, + "learning_rate": 2e-06, + "loss": 0.2577, + "step": 5723 + }, + { + "epoch": 1.3279201948729846, + "grad_norm": 16.85163301480196, + "learning_rate": 2e-06, + "loss": 0.2651, + "step": 5724 + }, + { + "epoch": 1.3281521865212853, + "grad_norm": 13.0317462251221, + "learning_rate": 2e-06, + "loss": 0.2271, + "step": 5725 + }, + { + "epoch": 1.328384178169586, + "grad_norm": 14.351959885689283, + "learning_rate": 2e-06, + "loss": 0.226, + "step": 5726 + }, + { + "epoch": 1.3286161698178867, + "grad_norm": 9.984793704893075, + "learning_rate": 2e-06, + "loss": 0.1668, + "step": 5727 + }, + { + "epoch": 1.3288481614661873, + "grad_norm": 17.625833578442002, + "learning_rate": 2e-06, + "loss": 0.343, + "step": 5728 + }, + { + "epoch": 1.329080153114488, + "grad_norm": 13.273409325079097, + "learning_rate": 2e-06, + "loss": 0.2235, + "step": 5729 + }, + { + "epoch": 1.3293121447627885, + "grad_norm": 8.8052389353841, + "learning_rate": 2e-06, + "loss": 0.223, + "step": 5730 + }, + { + "epoch": 1.3295441364110892, + "grad_norm": 15.961048639991073, + "learning_rate": 2e-06, + "loss": 0.1882, + "step": 5731 + }, + { + "epoch": 1.3297761280593898, + "grad_norm": 15.980771784027198, + "learning_rate": 2e-06, + "loss": 0.2133, + "step": 5732 + }, + { + "epoch": 1.3300081197076905, + "grad_norm": 34.87453938575482, + "learning_rate": 2e-06, + "loss": 0.3107, + "step": 5733 + }, + { + "epoch": 1.3302401113559912, + "grad_norm": 7.971302638747583, + "learning_rate": 2e-06, + "loss": 0.1623, + "step": 5734 + }, + { + "epoch": 1.3304721030042919, + "grad_norm": 13.732619994934568, + "learning_rate": 2e-06, + "loss": 0.3478, + "step": 5735 + }, + { + "epoch": 1.3307040946525925, + "grad_norm": 13.81265807913985, + "learning_rate": 2e-06, + "loss": 0.2507, + "step": 5736 + }, + { + "epoch": 1.3309360863008932, + "grad_norm": 17.587535840998314, + "learning_rate": 2e-06, + "loss": 0.299, + "step": 5737 + }, + { + "epoch": 1.331168077949194, + "grad_norm": 18.227607371744853, + "learning_rate": 2e-06, + "loss": 0.3449, + "step": 5738 + }, + { + "epoch": 1.3314000695974946, + "grad_norm": 12.651673644096768, + "learning_rate": 2e-06, + "loss": 0.2299, + "step": 5739 + }, + { + "epoch": 1.331632061245795, + "grad_norm": 13.107582649881053, + "learning_rate": 2e-06, + "loss": 0.2587, + "step": 5740 + }, + { + "epoch": 1.3318640528940957, + "grad_norm": 19.926768899159622, + "learning_rate": 2e-06, + "loss": 0.2526, + "step": 5741 + }, + { + "epoch": 1.3320960445423964, + "grad_norm": 9.882153373560206, + "learning_rate": 2e-06, + "loss": 0.2351, + "step": 5742 + }, + { + "epoch": 1.332328036190697, + "grad_norm": 15.401606571846399, + "learning_rate": 2e-06, + "loss": 0.2877, + "step": 5743 + }, + { + "epoch": 1.3325600278389977, + "grad_norm": 8.051494076240969, + "learning_rate": 2e-06, + "loss": 0.1099, + "step": 5744 + }, + { + "epoch": 1.3327920194872984, + "grad_norm": 12.485400896606112, + "learning_rate": 2e-06, + "loss": 0.2127, + "step": 5745 + }, + { + "epoch": 1.333024011135599, + "grad_norm": 19.37818023128224, + "learning_rate": 2e-06, + "loss": 0.3585, + "step": 5746 + }, + { + "epoch": 1.3332560027838998, + "grad_norm": 11.50542375491202, + "learning_rate": 2e-06, + "loss": 0.1986, + "step": 5747 + }, + { + "epoch": 1.3334879944322005, + "grad_norm": 9.535853097525685, + "learning_rate": 2e-06, + "loss": 0.1785, + "step": 5748 + }, + { + "epoch": 1.3337199860805011, + "grad_norm": 13.825585578950808, + "learning_rate": 2e-06, + "loss": 0.2251, + "step": 5749 + }, + { + "epoch": 1.3339519777288018, + "grad_norm": 20.489808046635204, + "learning_rate": 2e-06, + "loss": 0.3068, + "step": 5750 + }, + { + "epoch": 1.3341839693771025, + "grad_norm": 7.979473578310454, + "learning_rate": 2e-06, + "loss": 0.219, + "step": 5751 + }, + { + "epoch": 1.3344159610254032, + "grad_norm": 20.741850635447804, + "learning_rate": 2e-06, + "loss": 0.2966, + "step": 5752 + }, + { + "epoch": 1.3346479526737038, + "grad_norm": 14.487809285683062, + "learning_rate": 2e-06, + "loss": 0.2511, + "step": 5753 + }, + { + "epoch": 1.3348799443220045, + "grad_norm": 13.953602449611058, + "learning_rate": 2e-06, + "loss": 0.2804, + "step": 5754 + }, + { + "epoch": 1.3351119359703052, + "grad_norm": 17.240216254771312, + "learning_rate": 2e-06, + "loss": 0.1892, + "step": 5755 + }, + { + "epoch": 1.3353439276186057, + "grad_norm": 10.47617287821396, + "learning_rate": 2e-06, + "loss": 0.1832, + "step": 5756 + }, + { + "epoch": 1.3355759192669063, + "grad_norm": 17.52330908616922, + "learning_rate": 2e-06, + "loss": 0.2934, + "step": 5757 + }, + { + "epoch": 1.335807910915207, + "grad_norm": 13.566061758419231, + "learning_rate": 2e-06, + "loss": 0.2273, + "step": 5758 + }, + { + "epoch": 1.3360399025635077, + "grad_norm": 12.742318765067107, + "learning_rate": 2e-06, + "loss": 0.1997, + "step": 5759 + }, + { + "epoch": 1.3362718942118084, + "grad_norm": 18.812041464643745, + "learning_rate": 2e-06, + "loss": 0.2981, + "step": 5760 + }, + { + "epoch": 1.336503885860109, + "grad_norm": 7.438888234983939, + "learning_rate": 2e-06, + "loss": 0.1601, + "step": 5761 + }, + { + "epoch": 1.3367358775084097, + "grad_norm": 16.70120584326876, + "learning_rate": 2e-06, + "loss": 0.2006, + "step": 5762 + }, + { + "epoch": 1.3369678691567104, + "grad_norm": 18.450037872236187, + "learning_rate": 2e-06, + "loss": 0.2465, + "step": 5763 + }, + { + "epoch": 1.337199860805011, + "grad_norm": 15.59582496370636, + "learning_rate": 2e-06, + "loss": 0.3092, + "step": 5764 + }, + { + "epoch": 1.3374318524533118, + "grad_norm": 24.090622475541675, + "learning_rate": 2e-06, + "loss": 0.3752, + "step": 5765 + }, + { + "epoch": 1.3376638441016122, + "grad_norm": 11.557989090391212, + "learning_rate": 2e-06, + "loss": 0.2061, + "step": 5766 + }, + { + "epoch": 1.3378958357499129, + "grad_norm": 7.013857459223439, + "learning_rate": 2e-06, + "loss": 0.218, + "step": 5767 + }, + { + "epoch": 1.3381278273982136, + "grad_norm": 10.23666384150274, + "learning_rate": 2e-06, + "loss": 0.1466, + "step": 5768 + }, + { + "epoch": 1.3383598190465142, + "grad_norm": 11.148655360805588, + "learning_rate": 2e-06, + "loss": 0.1825, + "step": 5769 + }, + { + "epoch": 1.338591810694815, + "grad_norm": 16.186729561676344, + "learning_rate": 2e-06, + "loss": 0.3968, + "step": 5770 + }, + { + "epoch": 1.3388238023431156, + "grad_norm": 6.943969848029348, + "learning_rate": 2e-06, + "loss": 0.1478, + "step": 5771 + }, + { + "epoch": 1.3390557939914163, + "grad_norm": 17.341373830915476, + "learning_rate": 2e-06, + "loss": 0.1947, + "step": 5772 + }, + { + "epoch": 1.339287785639717, + "grad_norm": 24.482447956744732, + "learning_rate": 2e-06, + "loss": 0.3202, + "step": 5773 + }, + { + "epoch": 1.3395197772880176, + "grad_norm": 21.39557609795782, + "learning_rate": 2e-06, + "loss": 0.3008, + "step": 5774 + }, + { + "epoch": 1.3397517689363183, + "grad_norm": 6.321646081575424, + "learning_rate": 2e-06, + "loss": 0.18, + "step": 5775 + }, + { + "epoch": 1.339983760584619, + "grad_norm": 14.757152685348238, + "learning_rate": 2e-06, + "loss": 0.3238, + "step": 5776 + }, + { + "epoch": 1.3402157522329197, + "grad_norm": 12.278175383917478, + "learning_rate": 2e-06, + "loss": 0.2181, + "step": 5777 + }, + { + "epoch": 1.3404477438812203, + "grad_norm": 12.23761349802958, + "learning_rate": 2e-06, + "loss": 0.3068, + "step": 5778 + }, + { + "epoch": 1.340679735529521, + "grad_norm": 11.403333857518392, + "learning_rate": 2e-06, + "loss": 0.1405, + "step": 5779 + }, + { + "epoch": 1.3409117271778217, + "grad_norm": 13.05117023683827, + "learning_rate": 2e-06, + "loss": 0.2465, + "step": 5780 + }, + { + "epoch": 1.3411437188261224, + "grad_norm": 22.224875632817582, + "learning_rate": 2e-06, + "loss": 0.2594, + "step": 5781 + }, + { + "epoch": 1.341375710474423, + "grad_norm": 20.353481270647716, + "learning_rate": 2e-06, + "loss": 0.3515, + "step": 5782 + }, + { + "epoch": 1.3416077021227235, + "grad_norm": 12.682528250567298, + "learning_rate": 2e-06, + "loss": 0.1926, + "step": 5783 + }, + { + "epoch": 1.3418396937710242, + "grad_norm": 28.57266479023652, + "learning_rate": 2e-06, + "loss": 0.2512, + "step": 5784 + }, + { + "epoch": 1.3420716854193249, + "grad_norm": 16.909366137872915, + "learning_rate": 2e-06, + "loss": 0.2867, + "step": 5785 + }, + { + "epoch": 1.3423036770676255, + "grad_norm": 13.171274976616166, + "learning_rate": 2e-06, + "loss": 0.2623, + "step": 5786 + }, + { + "epoch": 1.3425356687159262, + "grad_norm": 8.080789224669203, + "learning_rate": 2e-06, + "loss": 0.1932, + "step": 5787 + }, + { + "epoch": 1.342767660364227, + "grad_norm": 20.2668518242124, + "learning_rate": 2e-06, + "loss": 0.3434, + "step": 5788 + }, + { + "epoch": 1.3429996520125276, + "grad_norm": 15.16901875665676, + "learning_rate": 2e-06, + "loss": 0.2457, + "step": 5789 + }, + { + "epoch": 1.3432316436608283, + "grad_norm": 11.042566276296274, + "learning_rate": 2e-06, + "loss": 0.1578, + "step": 5790 + }, + { + "epoch": 1.343463635309129, + "grad_norm": 10.584691156901858, + "learning_rate": 2e-06, + "loss": 0.2686, + "step": 5791 + }, + { + "epoch": 1.3436956269574296, + "grad_norm": 13.434105792076489, + "learning_rate": 2e-06, + "loss": 0.3587, + "step": 5792 + }, + { + "epoch": 1.34392761860573, + "grad_norm": 17.34269805708881, + "learning_rate": 2e-06, + "loss": 0.2988, + "step": 5793 + }, + { + "epoch": 1.3441596102540307, + "grad_norm": 6.675450785331753, + "learning_rate": 2e-06, + "loss": 0.1739, + "step": 5794 + }, + { + "epoch": 1.3443916019023314, + "grad_norm": 13.984086856770102, + "learning_rate": 2e-06, + "loss": 0.3053, + "step": 5795 + }, + { + "epoch": 1.344623593550632, + "grad_norm": 19.1634289852142, + "learning_rate": 2e-06, + "loss": 0.3829, + "step": 5796 + }, + { + "epoch": 1.3448555851989328, + "grad_norm": 11.212872422045528, + "learning_rate": 2e-06, + "loss": 0.2311, + "step": 5797 + }, + { + "epoch": 1.3450875768472335, + "grad_norm": 8.339351003547979, + "learning_rate": 2e-06, + "loss": 0.186, + "step": 5798 + }, + { + "epoch": 1.3453195684955341, + "grad_norm": 11.20007845000388, + "learning_rate": 2e-06, + "loss": 0.2126, + "step": 5799 + }, + { + "epoch": 1.3455515601438348, + "grad_norm": 9.150632215768846, + "learning_rate": 2e-06, + "loss": 0.1566, + "step": 5800 + }, + { + "epoch": 1.3457835517921355, + "grad_norm": 10.867733909146118, + "learning_rate": 2e-06, + "loss": 0.2175, + "step": 5801 + }, + { + "epoch": 1.3460155434404362, + "grad_norm": 44.450382387130546, + "learning_rate": 2e-06, + "loss": 0.3324, + "step": 5802 + }, + { + "epoch": 1.3462475350887368, + "grad_norm": 19.048516268670234, + "learning_rate": 2e-06, + "loss": 0.2678, + "step": 5803 + }, + { + "epoch": 1.3464795267370375, + "grad_norm": 12.280476212517785, + "learning_rate": 2e-06, + "loss": 0.2765, + "step": 5804 + }, + { + "epoch": 1.3467115183853382, + "grad_norm": 14.774793325255686, + "learning_rate": 2e-06, + "loss": 0.4255, + "step": 5805 + }, + { + "epoch": 1.3469435100336389, + "grad_norm": 11.714254562758581, + "learning_rate": 2e-06, + "loss": 0.2319, + "step": 5806 + }, + { + "epoch": 1.3471755016819396, + "grad_norm": 12.16881579986135, + "learning_rate": 2e-06, + "loss": 0.2334, + "step": 5807 + }, + { + "epoch": 1.3474074933302402, + "grad_norm": 15.737049960037895, + "learning_rate": 2e-06, + "loss": 0.2389, + "step": 5808 + }, + { + "epoch": 1.3476394849785407, + "grad_norm": 14.653130929599717, + "learning_rate": 2e-06, + "loss": 0.2888, + "step": 5809 + }, + { + "epoch": 1.3478714766268414, + "grad_norm": 16.50927703121179, + "learning_rate": 2e-06, + "loss": 0.2738, + "step": 5810 + }, + { + "epoch": 1.348103468275142, + "grad_norm": 13.745080652083896, + "learning_rate": 2e-06, + "loss": 0.2365, + "step": 5811 + }, + { + "epoch": 1.3483354599234427, + "grad_norm": 18.49258799469895, + "learning_rate": 2e-06, + "loss": 0.3036, + "step": 5812 + }, + { + "epoch": 1.3485674515717434, + "grad_norm": 18.51938083068741, + "learning_rate": 2e-06, + "loss": 0.2746, + "step": 5813 + }, + { + "epoch": 1.348799443220044, + "grad_norm": 12.325727775873908, + "learning_rate": 2e-06, + "loss": 0.2631, + "step": 5814 + }, + { + "epoch": 1.3490314348683448, + "grad_norm": 17.84369574907302, + "learning_rate": 2e-06, + "loss": 0.3388, + "step": 5815 + }, + { + "epoch": 1.3492634265166454, + "grad_norm": 13.687209491713253, + "learning_rate": 2e-06, + "loss": 0.1935, + "step": 5816 + }, + { + "epoch": 1.3494954181649461, + "grad_norm": 16.68203773438771, + "learning_rate": 2e-06, + "loss": 0.2844, + "step": 5817 + }, + { + "epoch": 1.3497274098132468, + "grad_norm": 9.734625692979442, + "learning_rate": 2e-06, + "loss": 0.2316, + "step": 5818 + }, + { + "epoch": 1.3499594014615472, + "grad_norm": 17.018315278088533, + "learning_rate": 2e-06, + "loss": 0.2562, + "step": 5819 + }, + { + "epoch": 1.350191393109848, + "grad_norm": 5.075900093156938, + "learning_rate": 2e-06, + "loss": 0.1322, + "step": 5820 + }, + { + "epoch": 1.3504233847581486, + "grad_norm": 12.15002342495086, + "learning_rate": 2e-06, + "loss": 0.1723, + "step": 5821 + }, + { + "epoch": 1.3506553764064493, + "grad_norm": 13.438205754472456, + "learning_rate": 2e-06, + "loss": 0.2599, + "step": 5822 + }, + { + "epoch": 1.35088736805475, + "grad_norm": 14.590596311024154, + "learning_rate": 2e-06, + "loss": 0.2037, + "step": 5823 + }, + { + "epoch": 1.3511193597030506, + "grad_norm": 11.914815783448612, + "learning_rate": 2e-06, + "loss": 0.1507, + "step": 5824 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 11.406277237176791, + "learning_rate": 2e-06, + "loss": 0.1968, + "step": 5825 + }, + { + "epoch": 1.351583342999652, + "grad_norm": 23.409525749652285, + "learning_rate": 2e-06, + "loss": 0.2491, + "step": 5826 + }, + { + "epoch": 1.3518153346479527, + "grad_norm": 10.095249207393666, + "learning_rate": 2e-06, + "loss": 0.1637, + "step": 5827 + }, + { + "epoch": 1.3520473262962533, + "grad_norm": 11.506175779660916, + "learning_rate": 2e-06, + "loss": 0.2528, + "step": 5828 + }, + { + "epoch": 1.352279317944554, + "grad_norm": 15.188116430508073, + "learning_rate": 2e-06, + "loss": 0.3157, + "step": 5829 + }, + { + "epoch": 1.3525113095928547, + "grad_norm": 10.428369062323641, + "learning_rate": 2e-06, + "loss": 0.1897, + "step": 5830 + }, + { + "epoch": 1.3527433012411554, + "grad_norm": 11.703161185958193, + "learning_rate": 2e-06, + "loss": 0.2059, + "step": 5831 + }, + { + "epoch": 1.352975292889456, + "grad_norm": 18.495688046319522, + "learning_rate": 2e-06, + "loss": 0.2096, + "step": 5832 + }, + { + "epoch": 1.3532072845377567, + "grad_norm": 15.077754673818836, + "learning_rate": 2e-06, + "loss": 0.1848, + "step": 5833 + }, + { + "epoch": 1.3534392761860574, + "grad_norm": 8.695499011423294, + "learning_rate": 2e-06, + "loss": 0.1427, + "step": 5834 + }, + { + "epoch": 1.353671267834358, + "grad_norm": 10.649967194948475, + "learning_rate": 2e-06, + "loss": 0.2084, + "step": 5835 + }, + { + "epoch": 1.3539032594826585, + "grad_norm": 9.647015626245473, + "learning_rate": 2e-06, + "loss": 0.164, + "step": 5836 + }, + { + "epoch": 1.3541352511309592, + "grad_norm": 18.321975887396846, + "learning_rate": 2e-06, + "loss": 0.2208, + "step": 5837 + }, + { + "epoch": 1.35436724277926, + "grad_norm": 12.6264369995146, + "learning_rate": 2e-06, + "loss": 0.2561, + "step": 5838 + }, + { + "epoch": 1.3545992344275606, + "grad_norm": 13.3140753106527, + "learning_rate": 2e-06, + "loss": 0.271, + "step": 5839 + }, + { + "epoch": 1.3548312260758613, + "grad_norm": 18.567865143723125, + "learning_rate": 2e-06, + "loss": 0.3481, + "step": 5840 + }, + { + "epoch": 1.355063217724162, + "grad_norm": 12.737075327613965, + "learning_rate": 2e-06, + "loss": 0.4176, + "step": 5841 + }, + { + "epoch": 1.3552952093724626, + "grad_norm": 11.288649162226255, + "learning_rate": 2e-06, + "loss": 0.2044, + "step": 5842 + }, + { + "epoch": 1.3555272010207633, + "grad_norm": 18.55176280025957, + "learning_rate": 2e-06, + "loss": 0.2101, + "step": 5843 + }, + { + "epoch": 1.355759192669064, + "grad_norm": 17.54094405996478, + "learning_rate": 2e-06, + "loss": 0.2408, + "step": 5844 + }, + { + "epoch": 1.3559911843173647, + "grad_norm": 16.9841331544675, + "learning_rate": 2e-06, + "loss": 0.3381, + "step": 5845 + }, + { + "epoch": 1.356223175965665, + "grad_norm": 19.49676791705497, + "learning_rate": 2e-06, + "loss": 0.2199, + "step": 5846 + }, + { + "epoch": 1.3564551676139658, + "grad_norm": 14.364415926692745, + "learning_rate": 2e-06, + "loss": 0.1998, + "step": 5847 + }, + { + "epoch": 1.3566871592622665, + "grad_norm": 13.325930038440983, + "learning_rate": 2e-06, + "loss": 0.1628, + "step": 5848 + }, + { + "epoch": 1.3569191509105671, + "grad_norm": 23.459217434270393, + "learning_rate": 2e-06, + "loss": 0.2282, + "step": 5849 + }, + { + "epoch": 1.3571511425588678, + "grad_norm": 8.778681303386438, + "learning_rate": 2e-06, + "loss": 0.153, + "step": 5850 + }, + { + "epoch": 1.3573831342071685, + "grad_norm": 18.54260424822745, + "learning_rate": 2e-06, + "loss": 0.309, + "step": 5851 + }, + { + "epoch": 1.3576151258554692, + "grad_norm": 15.49191454628838, + "learning_rate": 2e-06, + "loss": 0.3286, + "step": 5852 + }, + { + "epoch": 1.3578471175037699, + "grad_norm": 12.041053783427783, + "learning_rate": 2e-06, + "loss": 0.2258, + "step": 5853 + }, + { + "epoch": 1.3580791091520705, + "grad_norm": 22.61898634773694, + "learning_rate": 2e-06, + "loss": 0.298, + "step": 5854 + }, + { + "epoch": 1.3583111008003712, + "grad_norm": 4.671569771383828, + "learning_rate": 2e-06, + "loss": 0.1256, + "step": 5855 + }, + { + "epoch": 1.3585430924486719, + "grad_norm": 28.314641736645186, + "learning_rate": 2e-06, + "loss": 0.2476, + "step": 5856 + }, + { + "epoch": 1.3587750840969726, + "grad_norm": 17.632781361738424, + "learning_rate": 2e-06, + "loss": 0.2985, + "step": 5857 + }, + { + "epoch": 1.3590070757452732, + "grad_norm": 8.707190057399295, + "learning_rate": 2e-06, + "loss": 0.2273, + "step": 5858 + }, + { + "epoch": 1.359239067393574, + "grad_norm": 10.346660254268741, + "learning_rate": 2e-06, + "loss": 0.1795, + "step": 5859 + }, + { + "epoch": 1.3594710590418746, + "grad_norm": 23.020950583167213, + "learning_rate": 2e-06, + "loss": 0.3505, + "step": 5860 + }, + { + "epoch": 1.3597030506901753, + "grad_norm": 13.438837661679297, + "learning_rate": 2e-06, + "loss": 0.2115, + "step": 5861 + }, + { + "epoch": 1.359935042338476, + "grad_norm": 11.591111693578771, + "learning_rate": 2e-06, + "loss": 0.1497, + "step": 5862 + }, + { + "epoch": 1.3601670339867764, + "grad_norm": 13.92468760116834, + "learning_rate": 2e-06, + "loss": 0.3266, + "step": 5863 + }, + { + "epoch": 1.360399025635077, + "grad_norm": 11.622340864961435, + "learning_rate": 2e-06, + "loss": 0.2338, + "step": 5864 + }, + { + "epoch": 1.3606310172833778, + "grad_norm": 18.978722409368135, + "learning_rate": 2e-06, + "loss": 0.2449, + "step": 5865 + }, + { + "epoch": 1.3608630089316784, + "grad_norm": 9.71805799882756, + "learning_rate": 2e-06, + "loss": 0.1905, + "step": 5866 + }, + { + "epoch": 1.3610950005799791, + "grad_norm": 16.208738621497222, + "learning_rate": 2e-06, + "loss": 0.233, + "step": 5867 + }, + { + "epoch": 1.3613269922282798, + "grad_norm": 13.70124124125462, + "learning_rate": 2e-06, + "loss": 0.2076, + "step": 5868 + }, + { + "epoch": 1.3615589838765805, + "grad_norm": 13.063859874737823, + "learning_rate": 2e-06, + "loss": 0.2459, + "step": 5869 + }, + { + "epoch": 1.3617909755248812, + "grad_norm": 8.165669539795408, + "learning_rate": 2e-06, + "loss": 0.1647, + "step": 5870 + }, + { + "epoch": 1.3620229671731818, + "grad_norm": 11.781677616521952, + "learning_rate": 2e-06, + "loss": 0.274, + "step": 5871 + }, + { + "epoch": 1.3622549588214825, + "grad_norm": 9.778774987540954, + "learning_rate": 2e-06, + "loss": 0.1572, + "step": 5872 + }, + { + "epoch": 1.362486950469783, + "grad_norm": 20.766841055575046, + "learning_rate": 2e-06, + "loss": 0.2041, + "step": 5873 + }, + { + "epoch": 1.3627189421180836, + "grad_norm": 11.59339540161316, + "learning_rate": 2e-06, + "loss": 0.2524, + "step": 5874 + }, + { + "epoch": 1.3629509337663843, + "grad_norm": 12.47148805393732, + "learning_rate": 2e-06, + "loss": 0.242, + "step": 5875 + }, + { + "epoch": 1.363182925414685, + "grad_norm": 10.960900356092315, + "learning_rate": 2e-06, + "loss": 0.1566, + "step": 5876 + }, + { + "epoch": 1.3634149170629857, + "grad_norm": 14.772461356180653, + "learning_rate": 2e-06, + "loss": 0.2168, + "step": 5877 + }, + { + "epoch": 1.3636469087112864, + "grad_norm": 17.572173076188516, + "learning_rate": 2e-06, + "loss": 0.2988, + "step": 5878 + }, + { + "epoch": 1.363878900359587, + "grad_norm": 9.980374339519635, + "learning_rate": 2e-06, + "loss": 0.2, + "step": 5879 + }, + { + "epoch": 1.3641108920078877, + "grad_norm": 16.310899671151198, + "learning_rate": 2e-06, + "loss": 0.157, + "step": 5880 + }, + { + "epoch": 1.3643428836561884, + "grad_norm": 15.22384696178038, + "learning_rate": 2e-06, + "loss": 0.2674, + "step": 5881 + }, + { + "epoch": 1.364574875304489, + "grad_norm": 113.19316199239812, + "learning_rate": 2e-06, + "loss": 0.3356, + "step": 5882 + }, + { + "epoch": 1.3648068669527897, + "grad_norm": 14.031477531072245, + "learning_rate": 2e-06, + "loss": 0.3136, + "step": 5883 + }, + { + "epoch": 1.3650388586010904, + "grad_norm": 22.121313316789323, + "learning_rate": 2e-06, + "loss": 0.1891, + "step": 5884 + }, + { + "epoch": 1.365270850249391, + "grad_norm": 17.825567248921306, + "learning_rate": 2e-06, + "loss": 0.3541, + "step": 5885 + }, + { + "epoch": 1.3655028418976918, + "grad_norm": 19.069180568099014, + "learning_rate": 2e-06, + "loss": 0.3158, + "step": 5886 + }, + { + "epoch": 1.3657348335459925, + "grad_norm": 25.075437472856258, + "learning_rate": 2e-06, + "loss": 0.4052, + "step": 5887 + }, + { + "epoch": 1.3659668251942931, + "grad_norm": 9.977701700428549, + "learning_rate": 2e-06, + "loss": 0.181, + "step": 5888 + }, + { + "epoch": 1.3661988168425936, + "grad_norm": 17.59684848990075, + "learning_rate": 2e-06, + "loss": 0.2857, + "step": 5889 + }, + { + "epoch": 1.3664308084908943, + "grad_norm": 15.09816100180224, + "learning_rate": 2e-06, + "loss": 0.3539, + "step": 5890 + }, + { + "epoch": 1.366662800139195, + "grad_norm": 10.943892970975487, + "learning_rate": 2e-06, + "loss": 0.1357, + "step": 5891 + }, + { + "epoch": 1.3668947917874956, + "grad_norm": 15.689614080268415, + "learning_rate": 2e-06, + "loss": 0.275, + "step": 5892 + }, + { + "epoch": 1.3671267834357963, + "grad_norm": 17.654429430044164, + "learning_rate": 2e-06, + "loss": 0.2506, + "step": 5893 + }, + { + "epoch": 1.367358775084097, + "grad_norm": 18.689276228543932, + "learning_rate": 2e-06, + "loss": 0.1976, + "step": 5894 + }, + { + "epoch": 1.3675907667323977, + "grad_norm": 18.938131307283825, + "learning_rate": 2e-06, + "loss": 0.3159, + "step": 5895 + }, + { + "epoch": 1.3678227583806983, + "grad_norm": 18.188342876250907, + "learning_rate": 2e-06, + "loss": 0.2705, + "step": 5896 + }, + { + "epoch": 1.368054750028999, + "grad_norm": 19.57321528336182, + "learning_rate": 2e-06, + "loss": 0.263, + "step": 5897 + }, + { + "epoch": 1.3682867416772997, + "grad_norm": 12.626901209943034, + "learning_rate": 2e-06, + "loss": 0.2686, + "step": 5898 + }, + { + "epoch": 1.3685187333256001, + "grad_norm": 16.377096327836597, + "learning_rate": 2e-06, + "loss": 0.2324, + "step": 5899 + }, + { + "epoch": 1.3687507249739008, + "grad_norm": 10.146682101900355, + "learning_rate": 2e-06, + "loss": 0.1951, + "step": 5900 + }, + { + "epoch": 1.3689827166222015, + "grad_norm": 9.085444222802984, + "learning_rate": 2e-06, + "loss": 0.2103, + "step": 5901 + }, + { + "epoch": 1.3692147082705022, + "grad_norm": 13.078879458322199, + "learning_rate": 2e-06, + "loss": 0.1458, + "step": 5902 + }, + { + "epoch": 1.3694466999188029, + "grad_norm": 22.771698063467927, + "learning_rate": 2e-06, + "loss": 0.3165, + "step": 5903 + }, + { + "epoch": 1.3696786915671035, + "grad_norm": 12.684698649225055, + "learning_rate": 2e-06, + "loss": 0.2419, + "step": 5904 + }, + { + "epoch": 1.3699106832154042, + "grad_norm": 18.780053747838103, + "learning_rate": 2e-06, + "loss": 0.3131, + "step": 5905 + }, + { + "epoch": 1.370142674863705, + "grad_norm": 13.218927833905115, + "learning_rate": 2e-06, + "loss": 0.1716, + "step": 5906 + }, + { + "epoch": 1.3703746665120056, + "grad_norm": 12.99323013996737, + "learning_rate": 2e-06, + "loss": 0.3394, + "step": 5907 + }, + { + "epoch": 1.3706066581603062, + "grad_norm": 14.975069838383702, + "learning_rate": 2e-06, + "loss": 0.2689, + "step": 5908 + }, + { + "epoch": 1.370838649808607, + "grad_norm": 13.194842859328135, + "learning_rate": 2e-06, + "loss": 0.255, + "step": 5909 + }, + { + "epoch": 1.3710706414569076, + "grad_norm": 13.572338888548854, + "learning_rate": 2e-06, + "loss": 0.2105, + "step": 5910 + }, + { + "epoch": 1.3713026331052083, + "grad_norm": 9.995146313684502, + "learning_rate": 2e-06, + "loss": 0.2161, + "step": 5911 + }, + { + "epoch": 1.371534624753509, + "grad_norm": 10.08923809195754, + "learning_rate": 2e-06, + "loss": 0.2171, + "step": 5912 + }, + { + "epoch": 1.3717666164018096, + "grad_norm": 26.709924443854867, + "learning_rate": 2e-06, + "loss": 0.3202, + "step": 5913 + }, + { + "epoch": 1.3719986080501103, + "grad_norm": 12.308014395977608, + "learning_rate": 2e-06, + "loss": 0.2215, + "step": 5914 + }, + { + "epoch": 1.372230599698411, + "grad_norm": 13.05156012354374, + "learning_rate": 2e-06, + "loss": 0.2219, + "step": 5915 + }, + { + "epoch": 1.3724625913467114, + "grad_norm": 15.342982310766626, + "learning_rate": 2e-06, + "loss": 0.2504, + "step": 5916 + }, + { + "epoch": 1.3726945829950121, + "grad_norm": 17.12977581566977, + "learning_rate": 2e-06, + "loss": 0.3918, + "step": 5917 + }, + { + "epoch": 1.3729265746433128, + "grad_norm": 10.564058055931197, + "learning_rate": 2e-06, + "loss": 0.2486, + "step": 5918 + }, + { + "epoch": 1.3731585662916135, + "grad_norm": 14.801827927834955, + "learning_rate": 2e-06, + "loss": 0.2071, + "step": 5919 + }, + { + "epoch": 1.3733905579399142, + "grad_norm": 19.601648310112644, + "learning_rate": 2e-06, + "loss": 0.3412, + "step": 5920 + }, + { + "epoch": 1.3736225495882148, + "grad_norm": 14.214701362197474, + "learning_rate": 2e-06, + "loss": 0.1959, + "step": 5921 + }, + { + "epoch": 1.3738545412365155, + "grad_norm": 12.208764144329555, + "learning_rate": 2e-06, + "loss": 0.3126, + "step": 5922 + }, + { + "epoch": 1.3740865328848162, + "grad_norm": 17.16152183037491, + "learning_rate": 2e-06, + "loss": 0.3282, + "step": 5923 + }, + { + "epoch": 1.3743185245331169, + "grad_norm": 15.279736880276335, + "learning_rate": 2e-06, + "loss": 0.2844, + "step": 5924 + }, + { + "epoch": 1.3745505161814175, + "grad_norm": 12.674344871267387, + "learning_rate": 2e-06, + "loss": 0.2018, + "step": 5925 + }, + { + "epoch": 1.374782507829718, + "grad_norm": 10.83384890181781, + "learning_rate": 2e-06, + "loss": 0.2479, + "step": 5926 + }, + { + "epoch": 1.3750144994780187, + "grad_norm": 12.041522272764032, + "learning_rate": 2e-06, + "loss": 0.2226, + "step": 5927 + }, + { + "epoch": 1.3752464911263194, + "grad_norm": 13.700646292901698, + "learning_rate": 2e-06, + "loss": 0.284, + "step": 5928 + }, + { + "epoch": 1.37547848277462, + "grad_norm": 16.9939098026401, + "learning_rate": 2e-06, + "loss": 0.2319, + "step": 5929 + }, + { + "epoch": 1.3757104744229207, + "grad_norm": 12.29850103313542, + "learning_rate": 2e-06, + "loss": 0.2211, + "step": 5930 + }, + { + "epoch": 1.3759424660712214, + "grad_norm": 22.822665379374467, + "learning_rate": 2e-06, + "loss": 0.1982, + "step": 5931 + }, + { + "epoch": 1.376174457719522, + "grad_norm": 13.545598501814286, + "learning_rate": 2e-06, + "loss": 0.187, + "step": 5932 + }, + { + "epoch": 1.3764064493678227, + "grad_norm": 12.914616833005294, + "learning_rate": 2e-06, + "loss": 0.2556, + "step": 5933 + }, + { + "epoch": 1.3766384410161234, + "grad_norm": 21.749768835773786, + "learning_rate": 2e-06, + "loss": 0.2858, + "step": 5934 + }, + { + "epoch": 1.376870432664424, + "grad_norm": 22.098796702958314, + "learning_rate": 2e-06, + "loss": 0.3402, + "step": 5935 + }, + { + "epoch": 1.3771024243127248, + "grad_norm": 21.733880085073434, + "learning_rate": 2e-06, + "loss": 0.3829, + "step": 5936 + }, + { + "epoch": 1.3773344159610255, + "grad_norm": 8.488122175170245, + "learning_rate": 2e-06, + "loss": 0.1605, + "step": 5937 + }, + { + "epoch": 1.3775664076093261, + "grad_norm": 12.911399378010156, + "learning_rate": 2e-06, + "loss": 0.2086, + "step": 5938 + }, + { + "epoch": 1.3777983992576268, + "grad_norm": 28.886112837195807, + "learning_rate": 2e-06, + "loss": 0.4906, + "step": 5939 + }, + { + "epoch": 1.3780303909059275, + "grad_norm": 21.59522435818442, + "learning_rate": 2e-06, + "loss": 0.2664, + "step": 5940 + }, + { + "epoch": 1.3782623825542282, + "grad_norm": 10.076643720998616, + "learning_rate": 2e-06, + "loss": 0.2323, + "step": 5941 + }, + { + "epoch": 1.3784943742025286, + "grad_norm": 13.516919113099314, + "learning_rate": 2e-06, + "loss": 0.3002, + "step": 5942 + }, + { + "epoch": 1.3787263658508293, + "grad_norm": 8.643098284175489, + "learning_rate": 2e-06, + "loss": 0.216, + "step": 5943 + }, + { + "epoch": 1.37895835749913, + "grad_norm": 14.300103934873006, + "learning_rate": 2e-06, + "loss": 0.2819, + "step": 5944 + }, + { + "epoch": 1.3791903491474307, + "grad_norm": 12.466558909522385, + "learning_rate": 2e-06, + "loss": 0.2555, + "step": 5945 + }, + { + "epoch": 1.3794223407957313, + "grad_norm": 8.138712893736352, + "learning_rate": 2e-06, + "loss": 0.2659, + "step": 5946 + }, + { + "epoch": 1.379654332444032, + "grad_norm": 12.649915403183721, + "learning_rate": 2e-06, + "loss": 0.2405, + "step": 5947 + }, + { + "epoch": 1.3798863240923327, + "grad_norm": 9.271014937152602, + "learning_rate": 2e-06, + "loss": 0.1644, + "step": 5948 + }, + { + "epoch": 1.3801183157406334, + "grad_norm": 12.106163458210577, + "learning_rate": 2e-06, + "loss": 0.2074, + "step": 5949 + }, + { + "epoch": 1.380350307388934, + "grad_norm": 14.840048188527057, + "learning_rate": 2e-06, + "loss": 0.227, + "step": 5950 + }, + { + "epoch": 1.3805822990372347, + "grad_norm": 15.092469306811585, + "learning_rate": 2e-06, + "loss": 0.3458, + "step": 5951 + }, + { + "epoch": 1.3808142906855352, + "grad_norm": 18.896819046822152, + "learning_rate": 2e-06, + "loss": 0.3011, + "step": 5952 + }, + { + "epoch": 1.3810462823338359, + "grad_norm": 12.53153675981379, + "learning_rate": 2e-06, + "loss": 0.3175, + "step": 5953 + }, + { + "epoch": 1.3812782739821365, + "grad_norm": 12.148576689161318, + "learning_rate": 2e-06, + "loss": 0.2515, + "step": 5954 + }, + { + "epoch": 1.3815102656304372, + "grad_norm": 17.49484792170981, + "learning_rate": 2e-06, + "loss": 0.2591, + "step": 5955 + }, + { + "epoch": 1.381742257278738, + "grad_norm": 10.394134652681377, + "learning_rate": 2e-06, + "loss": 0.2057, + "step": 5956 + }, + { + "epoch": 1.3819742489270386, + "grad_norm": 29.127601288530386, + "learning_rate": 2e-06, + "loss": 0.4277, + "step": 5957 + }, + { + "epoch": 1.3822062405753393, + "grad_norm": 22.843391511775554, + "learning_rate": 2e-06, + "loss": 0.3193, + "step": 5958 + }, + { + "epoch": 1.38243823222364, + "grad_norm": 5.510142521551222, + "learning_rate": 2e-06, + "loss": 0.1484, + "step": 5959 + }, + { + "epoch": 1.3826702238719406, + "grad_norm": 11.396703033934903, + "learning_rate": 2e-06, + "loss": 0.2945, + "step": 5960 + }, + { + "epoch": 1.3829022155202413, + "grad_norm": 9.600664042933897, + "learning_rate": 2e-06, + "loss": 0.1599, + "step": 5961 + }, + { + "epoch": 1.383134207168542, + "grad_norm": 5.892640207145712, + "learning_rate": 2e-06, + "loss": 0.1359, + "step": 5962 + }, + { + "epoch": 1.3833661988168426, + "grad_norm": 20.277757054717135, + "learning_rate": 2e-06, + "loss": 0.2424, + "step": 5963 + }, + { + "epoch": 1.3835981904651433, + "grad_norm": 10.898832193056212, + "learning_rate": 2e-06, + "loss": 0.1662, + "step": 5964 + }, + { + "epoch": 1.383830182113444, + "grad_norm": 19.88410341780845, + "learning_rate": 2e-06, + "loss": 0.2381, + "step": 5965 + }, + { + "epoch": 1.3840621737617447, + "grad_norm": 20.916354990874595, + "learning_rate": 2e-06, + "loss": 0.4176, + "step": 5966 + }, + { + "epoch": 1.3842941654100454, + "grad_norm": 12.718636226829762, + "learning_rate": 2e-06, + "loss": 0.1831, + "step": 5967 + }, + { + "epoch": 1.384526157058346, + "grad_norm": 15.40648006298018, + "learning_rate": 2e-06, + "loss": 0.2073, + "step": 5968 + }, + { + "epoch": 1.3847581487066465, + "grad_norm": 14.03245463978515, + "learning_rate": 2e-06, + "loss": 0.2025, + "step": 5969 + }, + { + "epoch": 1.3849901403549472, + "grad_norm": 23.08067780388722, + "learning_rate": 2e-06, + "loss": 0.3673, + "step": 5970 + }, + { + "epoch": 1.3852221320032478, + "grad_norm": 8.710463900190048, + "learning_rate": 2e-06, + "loss": 0.1394, + "step": 5971 + }, + { + "epoch": 1.3854541236515485, + "grad_norm": 19.634103965805803, + "learning_rate": 2e-06, + "loss": 0.3047, + "step": 5972 + }, + { + "epoch": 1.3856861152998492, + "grad_norm": 9.507533107547227, + "learning_rate": 2e-06, + "loss": 0.1999, + "step": 5973 + }, + { + "epoch": 1.3859181069481499, + "grad_norm": 17.26108100615538, + "learning_rate": 2e-06, + "loss": 0.3609, + "step": 5974 + }, + { + "epoch": 1.3861500985964506, + "grad_norm": 11.200525491341423, + "learning_rate": 2e-06, + "loss": 0.2384, + "step": 5975 + }, + { + "epoch": 1.3863820902447512, + "grad_norm": 26.638142154059025, + "learning_rate": 2e-06, + "loss": 0.4838, + "step": 5976 + }, + { + "epoch": 1.386614081893052, + "grad_norm": 9.526430529695023, + "learning_rate": 2e-06, + "loss": 0.3479, + "step": 5977 + }, + { + "epoch": 1.3868460735413526, + "grad_norm": 11.526083961069686, + "learning_rate": 2e-06, + "loss": 0.2981, + "step": 5978 + }, + { + "epoch": 1.387078065189653, + "grad_norm": 18.60022661873739, + "learning_rate": 2e-06, + "loss": 0.2312, + "step": 5979 + }, + { + "epoch": 1.3873100568379537, + "grad_norm": 12.776358544049172, + "learning_rate": 2e-06, + "loss": 0.2869, + "step": 5980 + }, + { + "epoch": 1.3875420484862544, + "grad_norm": 6.80341080091959, + "learning_rate": 2e-06, + "loss": 0.1541, + "step": 5981 + }, + { + "epoch": 1.387774040134555, + "grad_norm": 30.41245426392083, + "learning_rate": 2e-06, + "loss": 0.3814, + "step": 5982 + }, + { + "epoch": 1.3880060317828558, + "grad_norm": 8.694938302800548, + "learning_rate": 2e-06, + "loss": 0.2095, + "step": 5983 + }, + { + "epoch": 1.3882380234311564, + "grad_norm": 17.115266860722507, + "learning_rate": 2e-06, + "loss": 0.244, + "step": 5984 + }, + { + "epoch": 1.388470015079457, + "grad_norm": 11.428928951768523, + "learning_rate": 2e-06, + "loss": 0.2025, + "step": 5985 + }, + { + "epoch": 1.3887020067277578, + "grad_norm": 7.287599703203382, + "learning_rate": 2e-06, + "loss": 0.136, + "step": 5986 + }, + { + "epoch": 1.3889339983760585, + "grad_norm": 22.706310929356366, + "learning_rate": 2e-06, + "loss": 0.2294, + "step": 5987 + }, + { + "epoch": 1.3891659900243591, + "grad_norm": 13.099589997732332, + "learning_rate": 2e-06, + "loss": 0.2546, + "step": 5988 + }, + { + "epoch": 1.3893979816726598, + "grad_norm": 7.785644395983943, + "learning_rate": 2e-06, + "loss": 0.191, + "step": 5989 + }, + { + "epoch": 1.3896299733209605, + "grad_norm": 16.68842225821719, + "learning_rate": 2e-06, + "loss": 0.3427, + "step": 5990 + }, + { + "epoch": 1.3898619649692612, + "grad_norm": 14.016036584118243, + "learning_rate": 2e-06, + "loss": 0.2749, + "step": 5991 + }, + { + "epoch": 1.3900939566175619, + "grad_norm": 7.988120441251208, + "learning_rate": 2e-06, + "loss": 0.1942, + "step": 5992 + }, + { + "epoch": 1.3903259482658625, + "grad_norm": 112.7683572465281, + "learning_rate": 2e-06, + "loss": 0.3115, + "step": 5993 + }, + { + "epoch": 1.3905579399141632, + "grad_norm": 42.66441929587144, + "learning_rate": 2e-06, + "loss": 0.2085, + "step": 5994 + }, + { + "epoch": 1.3907899315624639, + "grad_norm": 10.947839024667601, + "learning_rate": 2e-06, + "loss": 0.2289, + "step": 5995 + }, + { + "epoch": 1.3910219232107643, + "grad_norm": 13.246445827515094, + "learning_rate": 2e-06, + "loss": 0.2489, + "step": 5996 + }, + { + "epoch": 1.391253914859065, + "grad_norm": 17.01408986194027, + "learning_rate": 2e-06, + "loss": 0.2695, + "step": 5997 + }, + { + "epoch": 1.3914859065073657, + "grad_norm": 9.757210001235624, + "learning_rate": 2e-06, + "loss": 0.215, + "step": 5998 + }, + { + "epoch": 1.3917178981556664, + "grad_norm": 14.177056956242843, + "learning_rate": 2e-06, + "loss": 0.2778, + "step": 5999 + }, + { + "epoch": 1.391949889803967, + "grad_norm": 5.241533037856061, + "learning_rate": 2e-06, + "loss": 0.149, + "step": 6000 + }, + { + "epoch": 1.3921818814522677, + "grad_norm": 11.535395075466417, + "learning_rate": 2e-06, + "loss": 0.2143, + "step": 6001 + }, + { + "epoch": 1.3924138731005684, + "grad_norm": 10.4332883563328, + "learning_rate": 2e-06, + "loss": 0.183, + "step": 6002 + }, + { + "epoch": 1.392645864748869, + "grad_norm": 6.525121061567562, + "learning_rate": 2e-06, + "loss": 0.1406, + "step": 6003 + }, + { + "epoch": 1.3928778563971698, + "grad_norm": 9.429201567332692, + "learning_rate": 2e-06, + "loss": 0.1821, + "step": 6004 + }, + { + "epoch": 1.3931098480454704, + "grad_norm": 12.268882730046338, + "learning_rate": 2e-06, + "loss": 0.2193, + "step": 6005 + }, + { + "epoch": 1.393341839693771, + "grad_norm": 21.003497758566507, + "learning_rate": 2e-06, + "loss": 0.3852, + "step": 6006 + }, + { + "epoch": 1.3935738313420716, + "grad_norm": 17.83956602871701, + "learning_rate": 2e-06, + "loss": 0.323, + "step": 6007 + }, + { + "epoch": 1.3938058229903723, + "grad_norm": 10.34881103448905, + "learning_rate": 2e-06, + "loss": 0.2825, + "step": 6008 + }, + { + "epoch": 1.394037814638673, + "grad_norm": 14.011766669465839, + "learning_rate": 2e-06, + "loss": 0.2148, + "step": 6009 + }, + { + "epoch": 1.3942698062869736, + "grad_norm": 19.869740415845733, + "learning_rate": 2e-06, + "loss": 0.394, + "step": 6010 + }, + { + "epoch": 1.3945017979352743, + "grad_norm": 18.403769506198863, + "learning_rate": 2e-06, + "loss": 0.3615, + "step": 6011 + }, + { + "epoch": 1.394733789583575, + "grad_norm": 7.7733595799105615, + "learning_rate": 2e-06, + "loss": 0.2009, + "step": 6012 + }, + { + "epoch": 1.3949657812318756, + "grad_norm": 7.321632491742111, + "learning_rate": 2e-06, + "loss": 0.1419, + "step": 6013 + }, + { + "epoch": 1.3951977728801763, + "grad_norm": 9.675069354228755, + "learning_rate": 2e-06, + "loss": 0.1768, + "step": 6014 + }, + { + "epoch": 1.395429764528477, + "grad_norm": 17.28073435681396, + "learning_rate": 2e-06, + "loss": 0.2887, + "step": 6015 + }, + { + "epoch": 1.3956617561767777, + "grad_norm": 9.356232891758404, + "learning_rate": 2e-06, + "loss": 0.2237, + "step": 6016 + }, + { + "epoch": 1.3958937478250784, + "grad_norm": 12.157659270560018, + "learning_rate": 2e-06, + "loss": 0.1606, + "step": 6017 + }, + { + "epoch": 1.396125739473379, + "grad_norm": 22.129004589054688, + "learning_rate": 2e-06, + "loss": 0.307, + "step": 6018 + }, + { + "epoch": 1.3963577311216797, + "grad_norm": 32.36328986876033, + "learning_rate": 2e-06, + "loss": 0.4857, + "step": 6019 + }, + { + "epoch": 1.3965897227699804, + "grad_norm": 19.052967549651004, + "learning_rate": 2e-06, + "loss": 0.3291, + "step": 6020 + }, + { + "epoch": 1.396821714418281, + "grad_norm": 23.30684610747512, + "learning_rate": 2e-06, + "loss": 0.2642, + "step": 6021 + }, + { + "epoch": 1.3970537060665815, + "grad_norm": 13.054309580640513, + "learning_rate": 2e-06, + "loss": 0.2567, + "step": 6022 + }, + { + "epoch": 1.3972856977148822, + "grad_norm": 11.28301974377214, + "learning_rate": 2e-06, + "loss": 0.2084, + "step": 6023 + }, + { + "epoch": 1.3975176893631829, + "grad_norm": 19.094865999095894, + "learning_rate": 2e-06, + "loss": 0.4077, + "step": 6024 + }, + { + "epoch": 1.3977496810114836, + "grad_norm": 20.856946893139874, + "learning_rate": 2e-06, + "loss": 0.2084, + "step": 6025 + }, + { + "epoch": 1.3979816726597842, + "grad_norm": 9.45025107913317, + "learning_rate": 2e-06, + "loss": 0.2486, + "step": 6026 + }, + { + "epoch": 1.398213664308085, + "grad_norm": 15.203941180895525, + "learning_rate": 2e-06, + "loss": 0.1621, + "step": 6027 + }, + { + "epoch": 1.3984456559563856, + "grad_norm": 6.03861772807484, + "learning_rate": 2e-06, + "loss": 0.1304, + "step": 6028 + }, + { + "epoch": 1.3986776476046863, + "grad_norm": 17.018859853107013, + "learning_rate": 2e-06, + "loss": 0.3151, + "step": 6029 + }, + { + "epoch": 1.398909639252987, + "grad_norm": 15.165568168648043, + "learning_rate": 2e-06, + "loss": 0.2561, + "step": 6030 + }, + { + "epoch": 1.3991416309012876, + "grad_norm": 13.53577123662282, + "learning_rate": 2e-06, + "loss": 0.2988, + "step": 6031 + }, + { + "epoch": 1.399373622549588, + "grad_norm": 14.900819174481283, + "learning_rate": 2e-06, + "loss": 0.2946, + "step": 6032 + }, + { + "epoch": 1.3996056141978888, + "grad_norm": 7.993752940333517, + "learning_rate": 2e-06, + "loss": 0.2056, + "step": 6033 + }, + { + "epoch": 1.3998376058461894, + "grad_norm": 22.356467342587003, + "learning_rate": 2e-06, + "loss": 0.1956, + "step": 6034 + }, + { + "epoch": 1.4000695974944901, + "grad_norm": 14.683958717312493, + "learning_rate": 2e-06, + "loss": 0.1932, + "step": 6035 + }, + { + "epoch": 1.4003015891427908, + "grad_norm": 12.143973408945179, + "learning_rate": 2e-06, + "loss": 0.2515, + "step": 6036 + }, + { + "epoch": 1.4005335807910915, + "grad_norm": 11.753610537279906, + "learning_rate": 2e-06, + "loss": 0.2235, + "step": 6037 + }, + { + "epoch": 1.4007655724393921, + "grad_norm": 22.482569340622312, + "learning_rate": 2e-06, + "loss": 0.2757, + "step": 6038 + }, + { + "epoch": 1.4009975640876928, + "grad_norm": 10.124105869305984, + "learning_rate": 2e-06, + "loss": 0.2144, + "step": 6039 + }, + { + "epoch": 1.4012295557359935, + "grad_norm": 9.48123463264121, + "learning_rate": 2e-06, + "loss": 0.1909, + "step": 6040 + }, + { + "epoch": 1.4014615473842942, + "grad_norm": 6.375181245128169, + "learning_rate": 2e-06, + "loss": 0.1122, + "step": 6041 + }, + { + "epoch": 1.4016935390325949, + "grad_norm": 25.529223668853717, + "learning_rate": 2e-06, + "loss": 0.3305, + "step": 6042 + }, + { + "epoch": 1.4019255306808955, + "grad_norm": 15.821112574407275, + "learning_rate": 2e-06, + "loss": 0.3286, + "step": 6043 + }, + { + "epoch": 1.4021575223291962, + "grad_norm": 9.7919902421017, + "learning_rate": 2e-06, + "loss": 0.1983, + "step": 6044 + }, + { + "epoch": 1.402389513977497, + "grad_norm": 47.05440000845356, + "learning_rate": 2e-06, + "loss": 0.2881, + "step": 6045 + }, + { + "epoch": 1.4026215056257976, + "grad_norm": 15.433693071590781, + "learning_rate": 2e-06, + "loss": 0.2866, + "step": 6046 + }, + { + "epoch": 1.4028534972740982, + "grad_norm": 19.2840330067713, + "learning_rate": 2e-06, + "loss": 0.413, + "step": 6047 + }, + { + "epoch": 1.403085488922399, + "grad_norm": 15.715282873660984, + "learning_rate": 2e-06, + "loss": 0.3135, + "step": 6048 + }, + { + "epoch": 1.4033174805706994, + "grad_norm": 12.690754735658464, + "learning_rate": 2e-06, + "loss": 0.1997, + "step": 6049 + }, + { + "epoch": 1.403549472219, + "grad_norm": 17.86836759439439, + "learning_rate": 2e-06, + "loss": 0.3227, + "step": 6050 + }, + { + "epoch": 1.4037814638673007, + "grad_norm": 20.685699119431618, + "learning_rate": 2e-06, + "loss": 0.2731, + "step": 6051 + }, + { + "epoch": 1.4040134555156014, + "grad_norm": 14.641062692101555, + "learning_rate": 2e-06, + "loss": 0.2976, + "step": 6052 + }, + { + "epoch": 1.404245447163902, + "grad_norm": 7.732580108107947, + "learning_rate": 2e-06, + "loss": 0.1796, + "step": 6053 + }, + { + "epoch": 1.4044774388122028, + "grad_norm": 18.032129481542793, + "learning_rate": 2e-06, + "loss": 0.3029, + "step": 6054 + }, + { + "epoch": 1.4047094304605034, + "grad_norm": 14.180763869770052, + "learning_rate": 2e-06, + "loss": 0.2168, + "step": 6055 + }, + { + "epoch": 1.4049414221088041, + "grad_norm": 13.529057984274884, + "learning_rate": 2e-06, + "loss": 0.2202, + "step": 6056 + }, + { + "epoch": 1.4051734137571048, + "grad_norm": 19.91666727951003, + "learning_rate": 2e-06, + "loss": 0.1565, + "step": 6057 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 20.439080791635305, + "learning_rate": 2e-06, + "loss": 0.3029, + "step": 6058 + }, + { + "epoch": 1.405637397053706, + "grad_norm": 8.094859396497109, + "learning_rate": 2e-06, + "loss": 0.159, + "step": 6059 + }, + { + "epoch": 1.4058693887020066, + "grad_norm": 17.211446044937162, + "learning_rate": 2e-06, + "loss": 0.2021, + "step": 6060 + }, + { + "epoch": 1.4061013803503073, + "grad_norm": 16.984237415332583, + "learning_rate": 2e-06, + "loss": 0.3073, + "step": 6061 + }, + { + "epoch": 1.406333371998608, + "grad_norm": 19.736131712976423, + "learning_rate": 2e-06, + "loss": 0.3034, + "step": 6062 + }, + { + "epoch": 1.4065653636469087, + "grad_norm": 8.803289909229532, + "learning_rate": 2e-06, + "loss": 0.2266, + "step": 6063 + }, + { + "epoch": 1.4067973552952093, + "grad_norm": 14.67567093094092, + "learning_rate": 2e-06, + "loss": 0.2873, + "step": 6064 + }, + { + "epoch": 1.40702934694351, + "grad_norm": 21.111374577296598, + "learning_rate": 2e-06, + "loss": 0.3436, + "step": 6065 + }, + { + "epoch": 1.4072613385918107, + "grad_norm": 23.285346279881995, + "learning_rate": 2e-06, + "loss": 0.353, + "step": 6066 + }, + { + "epoch": 1.4074933302401114, + "grad_norm": 30.91209969978155, + "learning_rate": 2e-06, + "loss": 0.3959, + "step": 6067 + }, + { + "epoch": 1.407725321888412, + "grad_norm": 23.50666321730142, + "learning_rate": 2e-06, + "loss": 0.3817, + "step": 6068 + }, + { + "epoch": 1.4079573135367127, + "grad_norm": 24.887261676979207, + "learning_rate": 2e-06, + "loss": 0.265, + "step": 6069 + }, + { + "epoch": 1.4081893051850134, + "grad_norm": 17.371120969303167, + "learning_rate": 2e-06, + "loss": 0.2875, + "step": 6070 + }, + { + "epoch": 1.408421296833314, + "grad_norm": 17.01279131314068, + "learning_rate": 2e-06, + "loss": 0.2949, + "step": 6071 + }, + { + "epoch": 1.4086532884816148, + "grad_norm": 8.158139020970319, + "learning_rate": 2e-06, + "loss": 0.2076, + "step": 6072 + }, + { + "epoch": 1.4088852801299154, + "grad_norm": 7.818186686590393, + "learning_rate": 2e-06, + "loss": 0.1549, + "step": 6073 + }, + { + "epoch": 1.409117271778216, + "grad_norm": 5.121188328909677, + "learning_rate": 2e-06, + "loss": 0.1714, + "step": 6074 + }, + { + "epoch": 1.4093492634265166, + "grad_norm": 11.775903756067791, + "learning_rate": 2e-06, + "loss": 0.2023, + "step": 6075 + }, + { + "epoch": 1.4095812550748172, + "grad_norm": 30.858707507154378, + "learning_rate": 2e-06, + "loss": 0.4582, + "step": 6076 + }, + { + "epoch": 1.409813246723118, + "grad_norm": 15.755543306378247, + "learning_rate": 2e-06, + "loss": 0.2169, + "step": 6077 + }, + { + "epoch": 1.4100452383714186, + "grad_norm": 18.69184492966843, + "learning_rate": 2e-06, + "loss": 0.2139, + "step": 6078 + }, + { + "epoch": 1.4102772300197193, + "grad_norm": 27.398404718078048, + "learning_rate": 2e-06, + "loss": 0.337, + "step": 6079 + }, + { + "epoch": 1.41050922166802, + "grad_norm": 25.75019073595999, + "learning_rate": 2e-06, + "loss": 0.3711, + "step": 6080 + }, + { + "epoch": 1.4107412133163206, + "grad_norm": 9.892592852299064, + "learning_rate": 2e-06, + "loss": 0.2051, + "step": 6081 + }, + { + "epoch": 1.4109732049646213, + "grad_norm": 7.408465030887885, + "learning_rate": 2e-06, + "loss": 0.1045, + "step": 6082 + }, + { + "epoch": 1.411205196612922, + "grad_norm": 14.06067060739889, + "learning_rate": 2e-06, + "loss": 0.1927, + "step": 6083 + }, + { + "epoch": 1.4114371882612227, + "grad_norm": 13.913663213295978, + "learning_rate": 2e-06, + "loss": 0.2588, + "step": 6084 + }, + { + "epoch": 1.4116691799095231, + "grad_norm": 10.085229486493613, + "learning_rate": 2e-06, + "loss": 0.2227, + "step": 6085 + }, + { + "epoch": 1.4119011715578238, + "grad_norm": 13.631124668880567, + "learning_rate": 2e-06, + "loss": 0.2588, + "step": 6086 + }, + { + "epoch": 1.4121331632061245, + "grad_norm": 10.88601808887599, + "learning_rate": 2e-06, + "loss": 0.2068, + "step": 6087 + }, + { + "epoch": 1.4123651548544252, + "grad_norm": 13.403020292657482, + "learning_rate": 2e-06, + "loss": 0.2976, + "step": 6088 + }, + { + "epoch": 1.4125971465027258, + "grad_norm": 10.344911101143142, + "learning_rate": 2e-06, + "loss": 0.2509, + "step": 6089 + }, + { + "epoch": 1.4128291381510265, + "grad_norm": 12.71640337739951, + "learning_rate": 2e-06, + "loss": 0.2407, + "step": 6090 + }, + { + "epoch": 1.4130611297993272, + "grad_norm": 15.261059190133613, + "learning_rate": 2e-06, + "loss": 0.3193, + "step": 6091 + }, + { + "epoch": 1.4132931214476279, + "grad_norm": 17.65399175249053, + "learning_rate": 2e-06, + "loss": 0.2981, + "step": 6092 + }, + { + "epoch": 1.4135251130959285, + "grad_norm": 18.415558010960087, + "learning_rate": 2e-06, + "loss": 0.3171, + "step": 6093 + }, + { + "epoch": 1.4137571047442292, + "grad_norm": 18.71760155254099, + "learning_rate": 2e-06, + "loss": 0.3432, + "step": 6094 + }, + { + "epoch": 1.41398909639253, + "grad_norm": 12.08233981768872, + "learning_rate": 2e-06, + "loss": 0.1896, + "step": 6095 + }, + { + "epoch": 1.4142210880408306, + "grad_norm": 12.924121884049468, + "learning_rate": 2e-06, + "loss": 0.2531, + "step": 6096 + }, + { + "epoch": 1.4144530796891313, + "grad_norm": 15.51268648637336, + "learning_rate": 2e-06, + "loss": 0.2829, + "step": 6097 + }, + { + "epoch": 1.414685071337432, + "grad_norm": 17.961260828460986, + "learning_rate": 2e-06, + "loss": 0.2108, + "step": 6098 + }, + { + "epoch": 1.4149170629857326, + "grad_norm": 17.365521433935196, + "learning_rate": 2e-06, + "loss": 0.2831, + "step": 6099 + }, + { + "epoch": 1.4151490546340333, + "grad_norm": 10.163031122556262, + "learning_rate": 2e-06, + "loss": 0.2289, + "step": 6100 + }, + { + "epoch": 1.415381046282334, + "grad_norm": 7.099404642130162, + "learning_rate": 2e-06, + "loss": 0.1514, + "step": 6101 + }, + { + "epoch": 1.4156130379306344, + "grad_norm": 11.052661536656478, + "learning_rate": 2e-06, + "loss": 0.1811, + "step": 6102 + }, + { + "epoch": 1.415845029578935, + "grad_norm": 12.598462452392125, + "learning_rate": 2e-06, + "loss": 0.1743, + "step": 6103 + }, + { + "epoch": 1.4160770212272358, + "grad_norm": 6.835223930186045, + "learning_rate": 2e-06, + "loss": 0.1597, + "step": 6104 + }, + { + "epoch": 1.4163090128755365, + "grad_norm": 26.35206340861612, + "learning_rate": 2e-06, + "loss": 0.3793, + "step": 6105 + }, + { + "epoch": 1.4165410045238371, + "grad_norm": 25.374028954103416, + "learning_rate": 2e-06, + "loss": 0.3094, + "step": 6106 + }, + { + "epoch": 1.4167729961721378, + "grad_norm": 11.833484379950999, + "learning_rate": 2e-06, + "loss": 0.244, + "step": 6107 + }, + { + "epoch": 1.4170049878204385, + "grad_norm": 20.596598339272045, + "learning_rate": 2e-06, + "loss": 0.3502, + "step": 6108 + }, + { + "epoch": 1.4172369794687392, + "grad_norm": 10.987960713511372, + "learning_rate": 2e-06, + "loss": 0.2556, + "step": 6109 + }, + { + "epoch": 1.4174689711170398, + "grad_norm": 7.404113399619784, + "learning_rate": 2e-06, + "loss": 0.1751, + "step": 6110 + }, + { + "epoch": 1.4177009627653405, + "grad_norm": 8.825514460580765, + "learning_rate": 2e-06, + "loss": 0.1497, + "step": 6111 + }, + { + "epoch": 1.417932954413641, + "grad_norm": 15.713391703876344, + "learning_rate": 2e-06, + "loss": 0.2371, + "step": 6112 + }, + { + "epoch": 1.4181649460619417, + "grad_norm": 9.618159708726328, + "learning_rate": 2e-06, + "loss": 0.2016, + "step": 6113 + }, + { + "epoch": 1.4183969377102423, + "grad_norm": 10.16052102551055, + "learning_rate": 2e-06, + "loss": 0.3071, + "step": 6114 + }, + { + "epoch": 1.418628929358543, + "grad_norm": 6.310866150220136, + "learning_rate": 2e-06, + "loss": 0.1548, + "step": 6115 + }, + { + "epoch": 1.4188609210068437, + "grad_norm": 19.80731411393623, + "learning_rate": 2e-06, + "loss": 0.3295, + "step": 6116 + }, + { + "epoch": 1.4190929126551444, + "grad_norm": 13.3606379253285, + "learning_rate": 2e-06, + "loss": 0.1826, + "step": 6117 + }, + { + "epoch": 1.419324904303445, + "grad_norm": 8.81503462660466, + "learning_rate": 2e-06, + "loss": 0.2251, + "step": 6118 + }, + { + "epoch": 1.4195568959517457, + "grad_norm": 15.069781449360283, + "learning_rate": 2e-06, + "loss": 0.2557, + "step": 6119 + }, + { + "epoch": 1.4197888876000464, + "grad_norm": 18.822472041359575, + "learning_rate": 2e-06, + "loss": 0.3187, + "step": 6120 + }, + { + "epoch": 1.420020879248347, + "grad_norm": 20.897490149361044, + "learning_rate": 2e-06, + "loss": 0.2403, + "step": 6121 + }, + { + "epoch": 1.4202528708966478, + "grad_norm": 6.836425013476632, + "learning_rate": 2e-06, + "loss": 0.1446, + "step": 6122 + }, + { + "epoch": 1.4204848625449484, + "grad_norm": 11.993374434727933, + "learning_rate": 2e-06, + "loss": 0.217, + "step": 6123 + }, + { + "epoch": 1.4207168541932491, + "grad_norm": 11.146419401766341, + "learning_rate": 2e-06, + "loss": 0.158, + "step": 6124 + }, + { + "epoch": 1.4209488458415498, + "grad_norm": 12.804609156327725, + "learning_rate": 2e-06, + "loss": 0.2305, + "step": 6125 + }, + { + "epoch": 1.4211808374898505, + "grad_norm": 20.536425822305404, + "learning_rate": 2e-06, + "loss": 0.2628, + "step": 6126 + }, + { + "epoch": 1.4214128291381511, + "grad_norm": 19.6182767891473, + "learning_rate": 2e-06, + "loss": 0.2312, + "step": 6127 + }, + { + "epoch": 1.4216448207864518, + "grad_norm": 8.67891356781281, + "learning_rate": 2e-06, + "loss": 0.1341, + "step": 6128 + }, + { + "epoch": 1.4218768124347523, + "grad_norm": 9.495972989677902, + "learning_rate": 2e-06, + "loss": 0.194, + "step": 6129 + }, + { + "epoch": 1.422108804083053, + "grad_norm": 16.465910741318215, + "learning_rate": 2e-06, + "loss": 0.2441, + "step": 6130 + }, + { + "epoch": 1.4223407957313536, + "grad_norm": 18.64237556218032, + "learning_rate": 2e-06, + "loss": 0.2429, + "step": 6131 + }, + { + "epoch": 1.4225727873796543, + "grad_norm": 15.131650364743992, + "learning_rate": 2e-06, + "loss": 0.2176, + "step": 6132 + }, + { + "epoch": 1.422804779027955, + "grad_norm": 16.521831690467256, + "learning_rate": 2e-06, + "loss": 0.1971, + "step": 6133 + }, + { + "epoch": 1.4230367706762557, + "grad_norm": 15.255936878428859, + "learning_rate": 2e-06, + "loss": 0.3409, + "step": 6134 + }, + { + "epoch": 1.4232687623245563, + "grad_norm": 19.319660936948484, + "learning_rate": 2e-06, + "loss": 0.1897, + "step": 6135 + }, + { + "epoch": 1.423500753972857, + "grad_norm": 19.973456561284625, + "learning_rate": 2e-06, + "loss": 0.3944, + "step": 6136 + }, + { + "epoch": 1.4237327456211577, + "grad_norm": 21.849948560087277, + "learning_rate": 2e-06, + "loss": 0.3276, + "step": 6137 + }, + { + "epoch": 1.4239647372694584, + "grad_norm": 16.2802313700536, + "learning_rate": 2e-06, + "loss": 0.2491, + "step": 6138 + }, + { + "epoch": 1.4241967289177588, + "grad_norm": 19.909440328810053, + "learning_rate": 2e-06, + "loss": 0.1586, + "step": 6139 + }, + { + "epoch": 1.4244287205660595, + "grad_norm": 14.748574132006137, + "learning_rate": 2e-06, + "loss": 0.2809, + "step": 6140 + }, + { + "epoch": 1.4246607122143602, + "grad_norm": 9.825467024454078, + "learning_rate": 2e-06, + "loss": 0.2456, + "step": 6141 + }, + { + "epoch": 1.4248927038626609, + "grad_norm": 15.769056569854035, + "learning_rate": 2e-06, + "loss": 0.2548, + "step": 6142 + }, + { + "epoch": 1.4251246955109615, + "grad_norm": 9.883953478158336, + "learning_rate": 2e-06, + "loss": 0.2044, + "step": 6143 + }, + { + "epoch": 1.4253566871592622, + "grad_norm": 17.016653772181083, + "learning_rate": 2e-06, + "loss": 0.406, + "step": 6144 + }, + { + "epoch": 1.425588678807563, + "grad_norm": 20.34748285243919, + "learning_rate": 2e-06, + "loss": 0.3685, + "step": 6145 + }, + { + "epoch": 1.4258206704558636, + "grad_norm": 11.292193060995796, + "learning_rate": 2e-06, + "loss": 0.2376, + "step": 6146 + }, + { + "epoch": 1.4260526621041643, + "grad_norm": 9.088756757460795, + "learning_rate": 2e-06, + "loss": 0.1936, + "step": 6147 + }, + { + "epoch": 1.426284653752465, + "grad_norm": 18.792375574742056, + "learning_rate": 2e-06, + "loss": 0.2258, + "step": 6148 + }, + { + "epoch": 1.4265166454007656, + "grad_norm": 25.108023742053295, + "learning_rate": 2e-06, + "loss": 0.2133, + "step": 6149 + }, + { + "epoch": 1.4267486370490663, + "grad_norm": 19.186927719163982, + "learning_rate": 2e-06, + "loss": 0.2004, + "step": 6150 + }, + { + "epoch": 1.426980628697367, + "grad_norm": 17.71675497443564, + "learning_rate": 2e-06, + "loss": 0.2085, + "step": 6151 + }, + { + "epoch": 1.4272126203456676, + "grad_norm": 22.360834089520896, + "learning_rate": 2e-06, + "loss": 0.3317, + "step": 6152 + }, + { + "epoch": 1.4274446119939683, + "grad_norm": 19.971085580506056, + "learning_rate": 2e-06, + "loss": 0.2874, + "step": 6153 + }, + { + "epoch": 1.427676603642269, + "grad_norm": 23.875068987608802, + "learning_rate": 2e-06, + "loss": 0.3523, + "step": 6154 + }, + { + "epoch": 1.4279085952905695, + "grad_norm": 14.028916962582743, + "learning_rate": 2e-06, + "loss": 0.2884, + "step": 6155 + }, + { + "epoch": 1.4281405869388701, + "grad_norm": 18.488646503647313, + "learning_rate": 2e-06, + "loss": 0.2513, + "step": 6156 + }, + { + "epoch": 1.4283725785871708, + "grad_norm": 18.390315378298006, + "learning_rate": 2e-06, + "loss": 0.3414, + "step": 6157 + }, + { + "epoch": 1.4286045702354715, + "grad_norm": 25.069614253868096, + "learning_rate": 2e-06, + "loss": 0.3823, + "step": 6158 + }, + { + "epoch": 1.4288365618837722, + "grad_norm": 16.644875413619268, + "learning_rate": 2e-06, + "loss": 0.2242, + "step": 6159 + }, + { + "epoch": 1.4290685535320728, + "grad_norm": 16.553733055395217, + "learning_rate": 2e-06, + "loss": 0.314, + "step": 6160 + }, + { + "epoch": 1.4293005451803735, + "grad_norm": 11.93423897501811, + "learning_rate": 2e-06, + "loss": 0.2209, + "step": 6161 + }, + { + "epoch": 1.4295325368286742, + "grad_norm": 10.172506414261784, + "learning_rate": 2e-06, + "loss": 0.2577, + "step": 6162 + }, + { + "epoch": 1.4297645284769749, + "grad_norm": 13.74352601131, + "learning_rate": 2e-06, + "loss": 0.1991, + "step": 6163 + }, + { + "epoch": 1.4299965201252756, + "grad_norm": 20.315569158586207, + "learning_rate": 2e-06, + "loss": 0.2665, + "step": 6164 + }, + { + "epoch": 1.430228511773576, + "grad_norm": 9.354263225246887, + "learning_rate": 2e-06, + "loss": 0.2413, + "step": 6165 + }, + { + "epoch": 1.4304605034218767, + "grad_norm": 12.08739139621951, + "learning_rate": 2e-06, + "loss": 0.2746, + "step": 6166 + }, + { + "epoch": 1.4306924950701774, + "grad_norm": 9.65658511342512, + "learning_rate": 2e-06, + "loss": 0.1819, + "step": 6167 + }, + { + "epoch": 1.430924486718478, + "grad_norm": 7.610932909883998, + "learning_rate": 2e-06, + "loss": 0.172, + "step": 6168 + }, + { + "epoch": 1.4311564783667787, + "grad_norm": 17.58117935316524, + "learning_rate": 2e-06, + "loss": 0.278, + "step": 6169 + }, + { + "epoch": 1.4313884700150794, + "grad_norm": 16.990502623789205, + "learning_rate": 2e-06, + "loss": 0.255, + "step": 6170 + }, + { + "epoch": 1.43162046166338, + "grad_norm": 13.475962120240382, + "learning_rate": 2e-06, + "loss": 0.2718, + "step": 6171 + }, + { + "epoch": 1.4318524533116808, + "grad_norm": 24.175582308504648, + "learning_rate": 2e-06, + "loss": 0.2386, + "step": 6172 + }, + { + "epoch": 1.4320844449599814, + "grad_norm": 5.392419668157998, + "learning_rate": 2e-06, + "loss": 0.0997, + "step": 6173 + }, + { + "epoch": 1.4323164366082821, + "grad_norm": 18.51628849044846, + "learning_rate": 2e-06, + "loss": 0.2131, + "step": 6174 + }, + { + "epoch": 1.4325484282565828, + "grad_norm": 14.240146276064726, + "learning_rate": 2e-06, + "loss": 0.1993, + "step": 6175 + }, + { + "epoch": 1.4327804199048835, + "grad_norm": 11.586250671216098, + "learning_rate": 2e-06, + "loss": 0.2414, + "step": 6176 + }, + { + "epoch": 1.4330124115531842, + "grad_norm": 11.519426842901828, + "learning_rate": 2e-06, + "loss": 0.2363, + "step": 6177 + }, + { + "epoch": 1.4332444032014848, + "grad_norm": 14.449292115185402, + "learning_rate": 2e-06, + "loss": 0.2333, + "step": 6178 + }, + { + "epoch": 1.4334763948497855, + "grad_norm": 14.108943556459652, + "learning_rate": 2e-06, + "loss": 0.2261, + "step": 6179 + }, + { + "epoch": 1.4337083864980862, + "grad_norm": 16.064294267055452, + "learning_rate": 2e-06, + "loss": 0.2962, + "step": 6180 + }, + { + "epoch": 1.4339403781463869, + "grad_norm": 10.455267909022457, + "learning_rate": 2e-06, + "loss": 0.1511, + "step": 6181 + }, + { + "epoch": 1.4341723697946873, + "grad_norm": 11.967123031710038, + "learning_rate": 2e-06, + "loss": 0.2125, + "step": 6182 + }, + { + "epoch": 1.434404361442988, + "grad_norm": 15.112844010484544, + "learning_rate": 2e-06, + "loss": 0.1897, + "step": 6183 + }, + { + "epoch": 1.4346363530912887, + "grad_norm": 13.501040212475816, + "learning_rate": 2e-06, + "loss": 0.2221, + "step": 6184 + }, + { + "epoch": 1.4348683447395894, + "grad_norm": 11.403029548215033, + "learning_rate": 2e-06, + "loss": 0.2974, + "step": 6185 + }, + { + "epoch": 1.43510033638789, + "grad_norm": 10.528756579434749, + "learning_rate": 2e-06, + "loss": 0.2717, + "step": 6186 + }, + { + "epoch": 1.4353323280361907, + "grad_norm": 17.4785306549272, + "learning_rate": 2e-06, + "loss": 0.3093, + "step": 6187 + }, + { + "epoch": 1.4355643196844914, + "grad_norm": 16.520461776544106, + "learning_rate": 2e-06, + "loss": 0.2704, + "step": 6188 + }, + { + "epoch": 1.435796311332792, + "grad_norm": 18.529090232095495, + "learning_rate": 2e-06, + "loss": 0.2756, + "step": 6189 + }, + { + "epoch": 1.4360283029810927, + "grad_norm": 19.45665700411441, + "learning_rate": 2e-06, + "loss": 0.1903, + "step": 6190 + }, + { + "epoch": 1.4362602946293934, + "grad_norm": 23.243262921443165, + "learning_rate": 2e-06, + "loss": 0.3617, + "step": 6191 + }, + { + "epoch": 1.4364922862776939, + "grad_norm": 14.006968855607878, + "learning_rate": 2e-06, + "loss": 0.2091, + "step": 6192 + }, + { + "epoch": 1.4367242779259946, + "grad_norm": 21.504718595315833, + "learning_rate": 2e-06, + "loss": 0.2635, + "step": 6193 + }, + { + "epoch": 1.4369562695742952, + "grad_norm": 13.817998624581417, + "learning_rate": 2e-06, + "loss": 0.2324, + "step": 6194 + }, + { + "epoch": 1.437188261222596, + "grad_norm": 7.810015772765436, + "learning_rate": 2e-06, + "loss": 0.1785, + "step": 6195 + }, + { + "epoch": 1.4374202528708966, + "grad_norm": 9.365374008202055, + "learning_rate": 2e-06, + "loss": 0.1485, + "step": 6196 + }, + { + "epoch": 1.4376522445191973, + "grad_norm": 10.895895587927544, + "learning_rate": 2e-06, + "loss": 0.3335, + "step": 6197 + }, + { + "epoch": 1.437884236167498, + "grad_norm": 14.286288695476802, + "learning_rate": 2e-06, + "loss": 0.2615, + "step": 6198 + }, + { + "epoch": 1.4381162278157986, + "grad_norm": 10.872284344699505, + "learning_rate": 2e-06, + "loss": 0.2664, + "step": 6199 + }, + { + "epoch": 1.4383482194640993, + "grad_norm": 10.360378431259461, + "learning_rate": 2e-06, + "loss": 0.223, + "step": 6200 + }, + { + "epoch": 1.4385802111124, + "grad_norm": 13.574591955794803, + "learning_rate": 2e-06, + "loss": 0.2481, + "step": 6201 + }, + { + "epoch": 1.4388122027607007, + "grad_norm": 19.38468404639361, + "learning_rate": 2e-06, + "loss": 0.2538, + "step": 6202 + }, + { + "epoch": 1.4390441944090013, + "grad_norm": 17.673282877067486, + "learning_rate": 2e-06, + "loss": 0.1993, + "step": 6203 + }, + { + "epoch": 1.439276186057302, + "grad_norm": 19.05185072264639, + "learning_rate": 2e-06, + "loss": 0.1937, + "step": 6204 + }, + { + "epoch": 1.4395081777056027, + "grad_norm": 14.081400982151527, + "learning_rate": 2e-06, + "loss": 0.2435, + "step": 6205 + }, + { + "epoch": 1.4397401693539034, + "grad_norm": 15.081745367531159, + "learning_rate": 2e-06, + "loss": 0.187, + "step": 6206 + }, + { + "epoch": 1.439972161002204, + "grad_norm": 7.762727291413856, + "learning_rate": 2e-06, + "loss": 0.1486, + "step": 6207 + }, + { + "epoch": 1.4402041526505045, + "grad_norm": 18.9592087976754, + "learning_rate": 2e-06, + "loss": 0.2228, + "step": 6208 + }, + { + "epoch": 1.4404361442988052, + "grad_norm": 11.651180155359892, + "learning_rate": 2e-06, + "loss": 0.2049, + "step": 6209 + }, + { + "epoch": 1.4406681359471059, + "grad_norm": 21.85022053628526, + "learning_rate": 2e-06, + "loss": 0.3898, + "step": 6210 + }, + { + "epoch": 1.4409001275954065, + "grad_norm": 15.022376335743237, + "learning_rate": 2e-06, + "loss": 0.2378, + "step": 6211 + }, + { + "epoch": 1.4411321192437072, + "grad_norm": 14.013016604280779, + "learning_rate": 2e-06, + "loss": 0.3044, + "step": 6212 + }, + { + "epoch": 1.4413641108920079, + "grad_norm": 15.010439248112638, + "learning_rate": 2e-06, + "loss": 0.25, + "step": 6213 + }, + { + "epoch": 1.4415961025403086, + "grad_norm": 14.021784314225817, + "learning_rate": 2e-06, + "loss": 0.1858, + "step": 6214 + }, + { + "epoch": 1.4418280941886092, + "grad_norm": 7.823186542884315, + "learning_rate": 2e-06, + "loss": 0.1652, + "step": 6215 + }, + { + "epoch": 1.44206008583691, + "grad_norm": 15.567837669828194, + "learning_rate": 2e-06, + "loss": 0.2944, + "step": 6216 + }, + { + "epoch": 1.4422920774852106, + "grad_norm": 14.942780282170178, + "learning_rate": 2e-06, + "loss": 0.3001, + "step": 6217 + }, + { + "epoch": 1.442524069133511, + "grad_norm": 19.08348088709312, + "learning_rate": 2e-06, + "loss": 0.3644, + "step": 6218 + }, + { + "epoch": 1.4427560607818117, + "grad_norm": 11.287593844636518, + "learning_rate": 2e-06, + "loss": 0.2539, + "step": 6219 + }, + { + "epoch": 1.4429880524301124, + "grad_norm": 7.621639203002738, + "learning_rate": 2e-06, + "loss": 0.2016, + "step": 6220 + }, + { + "epoch": 1.443220044078413, + "grad_norm": 18.406599332910428, + "learning_rate": 2e-06, + "loss": 0.2905, + "step": 6221 + }, + { + "epoch": 1.4434520357267138, + "grad_norm": 23.286825030451947, + "learning_rate": 2e-06, + "loss": 0.2663, + "step": 6222 + }, + { + "epoch": 1.4436840273750144, + "grad_norm": 8.222848890709107, + "learning_rate": 2e-06, + "loss": 0.2088, + "step": 6223 + }, + { + "epoch": 1.4439160190233151, + "grad_norm": 15.558196318996108, + "learning_rate": 2e-06, + "loss": 0.2139, + "step": 6224 + }, + { + "epoch": 1.4441480106716158, + "grad_norm": 12.775547720889165, + "learning_rate": 2e-06, + "loss": 0.1868, + "step": 6225 + }, + { + "epoch": 1.4443800023199165, + "grad_norm": 7.512811432134344, + "learning_rate": 2e-06, + "loss": 0.1356, + "step": 6226 + }, + { + "epoch": 1.4446119939682172, + "grad_norm": 13.737685430600592, + "learning_rate": 2e-06, + "loss": 0.2513, + "step": 6227 + }, + { + "epoch": 1.4448439856165178, + "grad_norm": 14.156243226031668, + "learning_rate": 2e-06, + "loss": 0.2744, + "step": 6228 + }, + { + "epoch": 1.4450759772648185, + "grad_norm": 17.606808588301096, + "learning_rate": 2e-06, + "loss": 0.2185, + "step": 6229 + }, + { + "epoch": 1.4453079689131192, + "grad_norm": 11.62668769274412, + "learning_rate": 2e-06, + "loss": 0.2055, + "step": 6230 + }, + { + "epoch": 1.4455399605614199, + "grad_norm": 14.338056904667104, + "learning_rate": 2e-06, + "loss": 0.2705, + "step": 6231 + }, + { + "epoch": 1.4457719522097205, + "grad_norm": 8.648296098897648, + "learning_rate": 2e-06, + "loss": 0.1655, + "step": 6232 + }, + { + "epoch": 1.4460039438580212, + "grad_norm": 13.206173762170703, + "learning_rate": 2e-06, + "loss": 0.2529, + "step": 6233 + }, + { + "epoch": 1.446235935506322, + "grad_norm": 9.84978112659568, + "learning_rate": 2e-06, + "loss": 0.1463, + "step": 6234 + }, + { + "epoch": 1.4464679271546224, + "grad_norm": 5.811228390154432, + "learning_rate": 2e-06, + "loss": 0.1444, + "step": 6235 + }, + { + "epoch": 1.446699918802923, + "grad_norm": 8.78844000550045, + "learning_rate": 2e-06, + "loss": 0.1566, + "step": 6236 + }, + { + "epoch": 1.4469319104512237, + "grad_norm": 14.425942706111064, + "learning_rate": 2e-06, + "loss": 0.2102, + "step": 6237 + }, + { + "epoch": 1.4471639020995244, + "grad_norm": 30.388185582567207, + "learning_rate": 2e-06, + "loss": 0.2826, + "step": 6238 + }, + { + "epoch": 1.447395893747825, + "grad_norm": 10.26658546069454, + "learning_rate": 2e-06, + "loss": 0.2397, + "step": 6239 + }, + { + "epoch": 1.4476278853961257, + "grad_norm": 12.24282778864517, + "learning_rate": 2e-06, + "loss": 0.2435, + "step": 6240 + }, + { + "epoch": 1.4478598770444264, + "grad_norm": 32.10949393944506, + "learning_rate": 2e-06, + "loss": 0.4579, + "step": 6241 + }, + { + "epoch": 1.448091868692727, + "grad_norm": 12.488039092828613, + "learning_rate": 2e-06, + "loss": 0.1936, + "step": 6242 + }, + { + "epoch": 1.4483238603410278, + "grad_norm": 19.789272082035247, + "learning_rate": 2e-06, + "loss": 0.3089, + "step": 6243 + }, + { + "epoch": 1.4485558519893285, + "grad_norm": 10.038105505453792, + "learning_rate": 2e-06, + "loss": 0.2293, + "step": 6244 + }, + { + "epoch": 1.448787843637629, + "grad_norm": 14.92566839643602, + "learning_rate": 2e-06, + "loss": 0.2395, + "step": 6245 + }, + { + "epoch": 1.4490198352859296, + "grad_norm": 13.084583951620527, + "learning_rate": 2e-06, + "loss": 0.2754, + "step": 6246 + }, + { + "epoch": 1.4492518269342303, + "grad_norm": 16.02842816644472, + "learning_rate": 2e-06, + "loss": 0.2676, + "step": 6247 + }, + { + "epoch": 1.449483818582531, + "grad_norm": 19.128811378051246, + "learning_rate": 2e-06, + "loss": 0.2822, + "step": 6248 + }, + { + "epoch": 1.4497158102308316, + "grad_norm": 22.26464674913668, + "learning_rate": 2e-06, + "loss": 0.2977, + "step": 6249 + }, + { + "epoch": 1.4499478018791323, + "grad_norm": 18.209839699130395, + "learning_rate": 2e-06, + "loss": 0.2602, + "step": 6250 + }, + { + "epoch": 1.450179793527433, + "grad_norm": 15.616951580530294, + "learning_rate": 2e-06, + "loss": 0.253, + "step": 6251 + }, + { + "epoch": 1.4504117851757337, + "grad_norm": 11.246926978514672, + "learning_rate": 2e-06, + "loss": 0.1721, + "step": 6252 + }, + { + "epoch": 1.4506437768240343, + "grad_norm": 16.39083591876464, + "learning_rate": 2e-06, + "loss": 0.2644, + "step": 6253 + }, + { + "epoch": 1.450875768472335, + "grad_norm": 20.47143320303876, + "learning_rate": 2e-06, + "loss": 0.2169, + "step": 6254 + }, + { + "epoch": 1.4511077601206357, + "grad_norm": 19.1369428998323, + "learning_rate": 2e-06, + "loss": 0.2878, + "step": 6255 + }, + { + "epoch": 1.4513397517689364, + "grad_norm": 12.488871471556534, + "learning_rate": 2e-06, + "loss": 0.2151, + "step": 6256 + }, + { + "epoch": 1.451571743417237, + "grad_norm": 11.303663814651218, + "learning_rate": 2e-06, + "loss": 0.1897, + "step": 6257 + }, + { + "epoch": 1.4518037350655377, + "grad_norm": 14.441768775753307, + "learning_rate": 2e-06, + "loss": 0.3127, + "step": 6258 + }, + { + "epoch": 1.4520357267138384, + "grad_norm": 16.363244917076656, + "learning_rate": 2e-06, + "loss": 0.2342, + "step": 6259 + }, + { + "epoch": 1.452267718362139, + "grad_norm": 23.380606574314694, + "learning_rate": 2e-06, + "loss": 0.25, + "step": 6260 + }, + { + "epoch": 1.4524997100104398, + "grad_norm": 7.810441383500039, + "learning_rate": 2e-06, + "loss": 0.1686, + "step": 6261 + }, + { + "epoch": 1.4527317016587402, + "grad_norm": 17.524082584791838, + "learning_rate": 2e-06, + "loss": 0.2508, + "step": 6262 + }, + { + "epoch": 1.452963693307041, + "grad_norm": 15.673484460883444, + "learning_rate": 2e-06, + "loss": 0.3772, + "step": 6263 + }, + { + "epoch": 1.4531956849553416, + "grad_norm": 8.920955052046363, + "learning_rate": 2e-06, + "loss": 0.1725, + "step": 6264 + }, + { + "epoch": 1.4534276766036422, + "grad_norm": 17.182446994054157, + "learning_rate": 2e-06, + "loss": 0.2424, + "step": 6265 + }, + { + "epoch": 1.453659668251943, + "grad_norm": 17.68772899689125, + "learning_rate": 2e-06, + "loss": 0.2875, + "step": 6266 + }, + { + "epoch": 1.4538916599002436, + "grad_norm": 19.9172602061717, + "learning_rate": 2e-06, + "loss": 0.3221, + "step": 6267 + }, + { + "epoch": 1.4541236515485443, + "grad_norm": 10.481349481722452, + "learning_rate": 2e-06, + "loss": 0.1947, + "step": 6268 + }, + { + "epoch": 1.454355643196845, + "grad_norm": 10.944112076996793, + "learning_rate": 2e-06, + "loss": 0.2254, + "step": 6269 + }, + { + "epoch": 1.4545876348451456, + "grad_norm": 15.685246629354507, + "learning_rate": 2e-06, + "loss": 0.3094, + "step": 6270 + }, + { + "epoch": 1.4548196264934463, + "grad_norm": 8.918649947722725, + "learning_rate": 2e-06, + "loss": 0.2104, + "step": 6271 + }, + { + "epoch": 1.4550516181417468, + "grad_norm": 11.16163245508632, + "learning_rate": 2e-06, + "loss": 0.1538, + "step": 6272 + }, + { + "epoch": 1.4552836097900474, + "grad_norm": 19.679477959880053, + "learning_rate": 2e-06, + "loss": 0.2507, + "step": 6273 + }, + { + "epoch": 1.4555156014383481, + "grad_norm": 12.987750627077375, + "learning_rate": 2e-06, + "loss": 0.2041, + "step": 6274 + }, + { + "epoch": 1.4557475930866488, + "grad_norm": 17.114922892888984, + "learning_rate": 2e-06, + "loss": 0.3117, + "step": 6275 + }, + { + "epoch": 1.4559795847349495, + "grad_norm": 10.067639029819325, + "learning_rate": 2e-06, + "loss": 0.1696, + "step": 6276 + }, + { + "epoch": 1.4562115763832502, + "grad_norm": 17.685941678809247, + "learning_rate": 2e-06, + "loss": 0.2471, + "step": 6277 + }, + { + "epoch": 1.4564435680315508, + "grad_norm": 20.106708525227468, + "learning_rate": 2e-06, + "loss": 0.2805, + "step": 6278 + }, + { + "epoch": 1.4566755596798515, + "grad_norm": 29.450049625966425, + "learning_rate": 2e-06, + "loss": 0.3727, + "step": 6279 + }, + { + "epoch": 1.4569075513281522, + "grad_norm": 12.063322046943563, + "learning_rate": 2e-06, + "loss": 0.1852, + "step": 6280 + }, + { + "epoch": 1.4571395429764529, + "grad_norm": 23.251493560396554, + "learning_rate": 2e-06, + "loss": 0.2404, + "step": 6281 + }, + { + "epoch": 1.4573715346247536, + "grad_norm": 16.424722310407798, + "learning_rate": 2e-06, + "loss": 0.2657, + "step": 6282 + }, + { + "epoch": 1.4576035262730542, + "grad_norm": 48.52568098535008, + "learning_rate": 2e-06, + "loss": 0.3805, + "step": 6283 + }, + { + "epoch": 1.457835517921355, + "grad_norm": 11.824165675935632, + "learning_rate": 2e-06, + "loss": 0.0905, + "step": 6284 + }, + { + "epoch": 1.4580675095696556, + "grad_norm": 12.360254817926524, + "learning_rate": 2e-06, + "loss": 0.3002, + "step": 6285 + }, + { + "epoch": 1.4582995012179563, + "grad_norm": 11.468732781327246, + "learning_rate": 2e-06, + "loss": 0.2397, + "step": 6286 + }, + { + "epoch": 1.458531492866257, + "grad_norm": 17.519255642282676, + "learning_rate": 2e-06, + "loss": 0.2276, + "step": 6287 + }, + { + "epoch": 1.4587634845145574, + "grad_norm": 16.463553573068534, + "learning_rate": 2e-06, + "loss": 0.2647, + "step": 6288 + }, + { + "epoch": 1.458995476162858, + "grad_norm": 10.964272322350954, + "learning_rate": 2e-06, + "loss": 0.1639, + "step": 6289 + }, + { + "epoch": 1.4592274678111588, + "grad_norm": 22.362764075921604, + "learning_rate": 2e-06, + "loss": 0.2763, + "step": 6290 + }, + { + "epoch": 1.4594594594594594, + "grad_norm": 12.523391116774407, + "learning_rate": 2e-06, + "loss": 0.2957, + "step": 6291 + }, + { + "epoch": 1.45969145110776, + "grad_norm": 11.906953940165494, + "learning_rate": 2e-06, + "loss": 0.2841, + "step": 6292 + }, + { + "epoch": 1.4599234427560608, + "grad_norm": 10.510627828123306, + "learning_rate": 2e-06, + "loss": 0.1849, + "step": 6293 + }, + { + "epoch": 1.4601554344043615, + "grad_norm": 11.924520737964247, + "learning_rate": 2e-06, + "loss": 0.2385, + "step": 6294 + }, + { + "epoch": 1.4603874260526621, + "grad_norm": 17.689641377689316, + "learning_rate": 2e-06, + "loss": 0.4174, + "step": 6295 + }, + { + "epoch": 1.4606194177009628, + "grad_norm": 11.519765885822281, + "learning_rate": 2e-06, + "loss": 0.2632, + "step": 6296 + }, + { + "epoch": 1.4608514093492635, + "grad_norm": 12.189681236199014, + "learning_rate": 2e-06, + "loss": 0.3487, + "step": 6297 + }, + { + "epoch": 1.461083400997564, + "grad_norm": 20.49920570696744, + "learning_rate": 2e-06, + "loss": 0.2388, + "step": 6298 + }, + { + "epoch": 1.4613153926458646, + "grad_norm": 12.667179012508328, + "learning_rate": 2e-06, + "loss": 0.282, + "step": 6299 + }, + { + "epoch": 1.4615473842941653, + "grad_norm": 12.849410460911564, + "learning_rate": 2e-06, + "loss": 0.227, + "step": 6300 + }, + { + "epoch": 1.461779375942466, + "grad_norm": 12.650327888469596, + "learning_rate": 2e-06, + "loss": 0.2, + "step": 6301 + }, + { + "epoch": 1.4620113675907667, + "grad_norm": 10.508182667607313, + "learning_rate": 2e-06, + "loss": 0.227, + "step": 6302 + }, + { + "epoch": 1.4622433592390673, + "grad_norm": 11.683754689306165, + "learning_rate": 2e-06, + "loss": 0.1806, + "step": 6303 + }, + { + "epoch": 1.462475350887368, + "grad_norm": 14.253812881316286, + "learning_rate": 2e-06, + "loss": 0.2678, + "step": 6304 + }, + { + "epoch": 1.4627073425356687, + "grad_norm": 12.658627134992473, + "learning_rate": 2e-06, + "loss": 0.2552, + "step": 6305 + }, + { + "epoch": 1.4629393341839694, + "grad_norm": 31.209899902519314, + "learning_rate": 2e-06, + "loss": 0.3512, + "step": 6306 + }, + { + "epoch": 1.46317132583227, + "grad_norm": 6.515647786907523, + "learning_rate": 2e-06, + "loss": 0.1727, + "step": 6307 + }, + { + "epoch": 1.4634033174805707, + "grad_norm": 14.165600444230142, + "learning_rate": 2e-06, + "loss": 0.2658, + "step": 6308 + }, + { + "epoch": 1.4636353091288714, + "grad_norm": 20.574582915051842, + "learning_rate": 2e-06, + "loss": 0.3432, + "step": 6309 + }, + { + "epoch": 1.463867300777172, + "grad_norm": 9.196302550572256, + "learning_rate": 2e-06, + "loss": 0.2192, + "step": 6310 + }, + { + "epoch": 1.4640992924254728, + "grad_norm": 12.910670650009056, + "learning_rate": 2e-06, + "loss": 0.2553, + "step": 6311 + }, + { + "epoch": 1.4643312840737734, + "grad_norm": 16.611732586870954, + "learning_rate": 2e-06, + "loss": 0.2743, + "step": 6312 + }, + { + "epoch": 1.4645632757220741, + "grad_norm": 11.80635259402948, + "learning_rate": 2e-06, + "loss": 0.2631, + "step": 6313 + }, + { + "epoch": 1.4647952673703748, + "grad_norm": 11.714774501709245, + "learning_rate": 2e-06, + "loss": 0.1788, + "step": 6314 + }, + { + "epoch": 1.4650272590186753, + "grad_norm": 14.071179173593196, + "learning_rate": 2e-06, + "loss": 0.2477, + "step": 6315 + }, + { + "epoch": 1.465259250666976, + "grad_norm": 14.841448731705155, + "learning_rate": 2e-06, + "loss": 0.2876, + "step": 6316 + }, + { + "epoch": 1.4654912423152766, + "grad_norm": 16.19049964702802, + "learning_rate": 2e-06, + "loss": 0.3139, + "step": 6317 + }, + { + "epoch": 1.4657232339635773, + "grad_norm": 11.0637061092165, + "learning_rate": 2e-06, + "loss": 0.2283, + "step": 6318 + }, + { + "epoch": 1.465955225611878, + "grad_norm": 11.873817866467068, + "learning_rate": 2e-06, + "loss": 0.1563, + "step": 6319 + }, + { + "epoch": 1.4661872172601786, + "grad_norm": 9.005565476151371, + "learning_rate": 2e-06, + "loss": 0.1996, + "step": 6320 + }, + { + "epoch": 1.4664192089084793, + "grad_norm": 13.384400838128574, + "learning_rate": 2e-06, + "loss": 0.3289, + "step": 6321 + }, + { + "epoch": 1.46665120055678, + "grad_norm": 20.766319695050672, + "learning_rate": 2e-06, + "loss": 0.2601, + "step": 6322 + }, + { + "epoch": 1.4668831922050807, + "grad_norm": 6.532545552029801, + "learning_rate": 2e-06, + "loss": 0.1765, + "step": 6323 + }, + { + "epoch": 1.4671151838533814, + "grad_norm": 7.998792218137757, + "learning_rate": 2e-06, + "loss": 0.1722, + "step": 6324 + }, + { + "epoch": 1.4673471755016818, + "grad_norm": 12.834191100976502, + "learning_rate": 2e-06, + "loss": 0.2775, + "step": 6325 + }, + { + "epoch": 1.4675791671499825, + "grad_norm": 9.317612115484362, + "learning_rate": 2e-06, + "loss": 0.2043, + "step": 6326 + }, + { + "epoch": 1.4678111587982832, + "grad_norm": 5.091989747213939, + "learning_rate": 2e-06, + "loss": 0.201, + "step": 6327 + }, + { + "epoch": 1.4680431504465838, + "grad_norm": 11.99678074994932, + "learning_rate": 2e-06, + "loss": 0.1815, + "step": 6328 + }, + { + "epoch": 1.4682751420948845, + "grad_norm": 21.94462944847759, + "learning_rate": 2e-06, + "loss": 0.3566, + "step": 6329 + }, + { + "epoch": 1.4685071337431852, + "grad_norm": 12.180969031739162, + "learning_rate": 2e-06, + "loss": 0.245, + "step": 6330 + }, + { + "epoch": 1.4687391253914859, + "grad_norm": 9.778283626208468, + "learning_rate": 2e-06, + "loss": 0.2044, + "step": 6331 + }, + { + "epoch": 1.4689711170397866, + "grad_norm": 16.4203032986916, + "learning_rate": 2e-06, + "loss": 0.2448, + "step": 6332 + }, + { + "epoch": 1.4692031086880872, + "grad_norm": 16.900025226867008, + "learning_rate": 2e-06, + "loss": 0.1976, + "step": 6333 + }, + { + "epoch": 1.469435100336388, + "grad_norm": 10.021115257040865, + "learning_rate": 2e-06, + "loss": 0.1496, + "step": 6334 + }, + { + "epoch": 1.4696670919846886, + "grad_norm": 13.389193715874105, + "learning_rate": 2e-06, + "loss": 0.2155, + "step": 6335 + }, + { + "epoch": 1.4698990836329893, + "grad_norm": 8.596070239371144, + "learning_rate": 2e-06, + "loss": 0.1977, + "step": 6336 + }, + { + "epoch": 1.47013107528129, + "grad_norm": 12.127040615020997, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 6337 + }, + { + "epoch": 1.4703630669295906, + "grad_norm": 16.134833289034738, + "learning_rate": 2e-06, + "loss": 0.215, + "step": 6338 + }, + { + "epoch": 1.4705950585778913, + "grad_norm": 10.771203339462213, + "learning_rate": 2e-06, + "loss": 0.2874, + "step": 6339 + }, + { + "epoch": 1.470827050226192, + "grad_norm": 16.109559074612886, + "learning_rate": 2e-06, + "loss": 0.1802, + "step": 6340 + }, + { + "epoch": 1.4710590418744927, + "grad_norm": 24.667718004035784, + "learning_rate": 2e-06, + "loss": 0.2801, + "step": 6341 + }, + { + "epoch": 1.4712910335227931, + "grad_norm": 11.749541340994458, + "learning_rate": 2e-06, + "loss": 0.1959, + "step": 6342 + }, + { + "epoch": 1.4715230251710938, + "grad_norm": 14.16110476245822, + "learning_rate": 2e-06, + "loss": 0.1827, + "step": 6343 + }, + { + "epoch": 1.4717550168193945, + "grad_norm": 10.151478075496506, + "learning_rate": 2e-06, + "loss": 0.127, + "step": 6344 + }, + { + "epoch": 1.4719870084676951, + "grad_norm": 16.90380252331379, + "learning_rate": 2e-06, + "loss": 0.1837, + "step": 6345 + }, + { + "epoch": 1.4722190001159958, + "grad_norm": 8.733030126409302, + "learning_rate": 2e-06, + "loss": 0.138, + "step": 6346 + }, + { + "epoch": 1.4724509917642965, + "grad_norm": 6.654863266480442, + "learning_rate": 2e-06, + "loss": 0.1539, + "step": 6347 + }, + { + "epoch": 1.4726829834125972, + "grad_norm": 23.177115613442897, + "learning_rate": 2e-06, + "loss": 0.1599, + "step": 6348 + }, + { + "epoch": 1.4729149750608979, + "grad_norm": 14.13503325100472, + "learning_rate": 2e-06, + "loss": 0.3002, + "step": 6349 + }, + { + "epoch": 1.4731469667091985, + "grad_norm": 19.238936368187463, + "learning_rate": 2e-06, + "loss": 0.3622, + "step": 6350 + }, + { + "epoch": 1.4733789583574992, + "grad_norm": 20.46860314621833, + "learning_rate": 2e-06, + "loss": 0.2823, + "step": 6351 + }, + { + "epoch": 1.4736109500057997, + "grad_norm": 10.96071359838017, + "learning_rate": 2e-06, + "loss": 0.178, + "step": 6352 + }, + { + "epoch": 1.4738429416541003, + "grad_norm": 17.243761699595712, + "learning_rate": 2e-06, + "loss": 0.3201, + "step": 6353 + }, + { + "epoch": 1.474074933302401, + "grad_norm": 11.761870335399129, + "learning_rate": 2e-06, + "loss": 0.1588, + "step": 6354 + }, + { + "epoch": 1.4743069249507017, + "grad_norm": 15.063788634244156, + "learning_rate": 2e-06, + "loss": 0.1985, + "step": 6355 + }, + { + "epoch": 1.4745389165990024, + "grad_norm": 10.205439964919952, + "learning_rate": 2e-06, + "loss": 0.1951, + "step": 6356 + }, + { + "epoch": 1.474770908247303, + "grad_norm": 19.755369133580384, + "learning_rate": 2e-06, + "loss": 0.314, + "step": 6357 + }, + { + "epoch": 1.4750028998956037, + "grad_norm": 18.02092442331726, + "learning_rate": 2e-06, + "loss": 0.2532, + "step": 6358 + }, + { + "epoch": 1.4752348915439044, + "grad_norm": 20.776127614591175, + "learning_rate": 2e-06, + "loss": 0.3867, + "step": 6359 + }, + { + "epoch": 1.475466883192205, + "grad_norm": 18.8490296827277, + "learning_rate": 2e-06, + "loss": 0.2271, + "step": 6360 + }, + { + "epoch": 1.4756988748405058, + "grad_norm": 16.243421950387734, + "learning_rate": 2e-06, + "loss": 0.3152, + "step": 6361 + }, + { + "epoch": 1.4759308664888064, + "grad_norm": 12.912182042718188, + "learning_rate": 2e-06, + "loss": 0.2522, + "step": 6362 + }, + { + "epoch": 1.4761628581371071, + "grad_norm": 13.434169048365156, + "learning_rate": 2e-06, + "loss": 0.1501, + "step": 6363 + }, + { + "epoch": 1.4763948497854078, + "grad_norm": 23.64559945466988, + "learning_rate": 2e-06, + "loss": 0.322, + "step": 6364 + }, + { + "epoch": 1.4766268414337085, + "grad_norm": 15.840938473928546, + "learning_rate": 2e-06, + "loss": 0.398, + "step": 6365 + }, + { + "epoch": 1.4768588330820092, + "grad_norm": 13.593669173006043, + "learning_rate": 2e-06, + "loss": 0.2298, + "step": 6366 + }, + { + "epoch": 1.4770908247303098, + "grad_norm": 15.484086423496137, + "learning_rate": 2e-06, + "loss": 0.2102, + "step": 6367 + }, + { + "epoch": 1.4773228163786103, + "grad_norm": 20.566877565115348, + "learning_rate": 2e-06, + "loss": 0.2452, + "step": 6368 + }, + { + "epoch": 1.477554808026911, + "grad_norm": 26.017129239071807, + "learning_rate": 2e-06, + "loss": 0.3534, + "step": 6369 + }, + { + "epoch": 1.4777867996752116, + "grad_norm": 22.452637281185886, + "learning_rate": 2e-06, + "loss": 0.3603, + "step": 6370 + }, + { + "epoch": 1.4780187913235123, + "grad_norm": 16.962058059153215, + "learning_rate": 2e-06, + "loss": 0.2599, + "step": 6371 + }, + { + "epoch": 1.478250782971813, + "grad_norm": 31.44988018042093, + "learning_rate": 2e-06, + "loss": 0.3149, + "step": 6372 + }, + { + "epoch": 1.4784827746201137, + "grad_norm": 17.97818075763738, + "learning_rate": 2e-06, + "loss": 0.2712, + "step": 6373 + }, + { + "epoch": 1.4787147662684144, + "grad_norm": 16.298172657415773, + "learning_rate": 2e-06, + "loss": 0.2514, + "step": 6374 + }, + { + "epoch": 1.478946757916715, + "grad_norm": 18.243457940810273, + "learning_rate": 2e-06, + "loss": 0.2485, + "step": 6375 + }, + { + "epoch": 1.4791787495650157, + "grad_norm": 11.290826885683321, + "learning_rate": 2e-06, + "loss": 0.2569, + "step": 6376 + }, + { + "epoch": 1.4794107412133164, + "grad_norm": 8.751435320787085, + "learning_rate": 2e-06, + "loss": 0.1369, + "step": 6377 + }, + { + "epoch": 1.4796427328616168, + "grad_norm": 16.818687395472264, + "learning_rate": 2e-06, + "loss": 0.209, + "step": 6378 + }, + { + "epoch": 1.4798747245099175, + "grad_norm": 15.913409104761076, + "learning_rate": 2e-06, + "loss": 0.2155, + "step": 6379 + }, + { + "epoch": 1.4801067161582182, + "grad_norm": 17.282055857289045, + "learning_rate": 2e-06, + "loss": 0.3269, + "step": 6380 + }, + { + "epoch": 1.4803387078065189, + "grad_norm": 17.084959173190963, + "learning_rate": 2e-06, + "loss": 0.3279, + "step": 6381 + }, + { + "epoch": 1.4805706994548196, + "grad_norm": 10.158009644371294, + "learning_rate": 2e-06, + "loss": 0.3108, + "step": 6382 + }, + { + "epoch": 1.4808026911031202, + "grad_norm": 13.161002799442894, + "learning_rate": 2e-06, + "loss": 0.2121, + "step": 6383 + }, + { + "epoch": 1.481034682751421, + "grad_norm": 10.16665338585957, + "learning_rate": 2e-06, + "loss": 0.1704, + "step": 6384 + }, + { + "epoch": 1.4812666743997216, + "grad_norm": 11.521240503832262, + "learning_rate": 2e-06, + "loss": 0.2038, + "step": 6385 + }, + { + "epoch": 1.4814986660480223, + "grad_norm": 15.73741550757018, + "learning_rate": 2e-06, + "loss": 0.3129, + "step": 6386 + }, + { + "epoch": 1.481730657696323, + "grad_norm": 12.864819746200281, + "learning_rate": 2e-06, + "loss": 0.2185, + "step": 6387 + }, + { + "epoch": 1.4819626493446236, + "grad_norm": 13.343572206351373, + "learning_rate": 2e-06, + "loss": 0.2572, + "step": 6388 + }, + { + "epoch": 1.4821946409929243, + "grad_norm": 11.726772864515828, + "learning_rate": 2e-06, + "loss": 0.214, + "step": 6389 + }, + { + "epoch": 1.482426632641225, + "grad_norm": 12.530670113616015, + "learning_rate": 2e-06, + "loss": 0.2401, + "step": 6390 + }, + { + "epoch": 1.4826586242895257, + "grad_norm": 19.523033448539547, + "learning_rate": 2e-06, + "loss": 0.3102, + "step": 6391 + }, + { + "epoch": 1.4828906159378263, + "grad_norm": 14.813811087640675, + "learning_rate": 2e-06, + "loss": 0.2472, + "step": 6392 + }, + { + "epoch": 1.483122607586127, + "grad_norm": 11.124681524188954, + "learning_rate": 2e-06, + "loss": 0.1997, + "step": 6393 + }, + { + "epoch": 1.4833545992344277, + "grad_norm": 13.281192962940535, + "learning_rate": 2e-06, + "loss": 0.1957, + "step": 6394 + }, + { + "epoch": 1.4835865908827282, + "grad_norm": 9.593242911666561, + "learning_rate": 2e-06, + "loss": 0.1657, + "step": 6395 + }, + { + "epoch": 1.4838185825310288, + "grad_norm": 14.775396233103777, + "learning_rate": 2e-06, + "loss": 0.2757, + "step": 6396 + }, + { + "epoch": 1.4840505741793295, + "grad_norm": 10.907066557516638, + "learning_rate": 2e-06, + "loss": 0.1952, + "step": 6397 + }, + { + "epoch": 1.4842825658276302, + "grad_norm": 12.904636573491848, + "learning_rate": 2e-06, + "loss": 0.2891, + "step": 6398 + }, + { + "epoch": 1.4845145574759309, + "grad_norm": 16.960841103912564, + "learning_rate": 2e-06, + "loss": 0.3691, + "step": 6399 + }, + { + "epoch": 1.4847465491242315, + "grad_norm": 10.291992699295115, + "learning_rate": 2e-06, + "loss": 0.2032, + "step": 6400 + }, + { + "epoch": 1.4849785407725322, + "grad_norm": 10.330431592292854, + "learning_rate": 2e-06, + "loss": 0.1908, + "step": 6401 + }, + { + "epoch": 1.485210532420833, + "grad_norm": 12.167172292182945, + "learning_rate": 2e-06, + "loss": 0.2551, + "step": 6402 + }, + { + "epoch": 1.4854425240691336, + "grad_norm": 9.095098976169192, + "learning_rate": 2e-06, + "loss": 0.203, + "step": 6403 + }, + { + "epoch": 1.4856745157174343, + "grad_norm": 21.738840895505344, + "learning_rate": 2e-06, + "loss": 0.3902, + "step": 6404 + }, + { + "epoch": 1.4859065073657347, + "grad_norm": 20.273545028134055, + "learning_rate": 2e-06, + "loss": 0.2845, + "step": 6405 + }, + { + "epoch": 1.4861384990140354, + "grad_norm": 15.994132167667901, + "learning_rate": 2e-06, + "loss": 0.2778, + "step": 6406 + }, + { + "epoch": 1.486370490662336, + "grad_norm": 13.177377098709755, + "learning_rate": 2e-06, + "loss": 0.2385, + "step": 6407 + }, + { + "epoch": 1.4866024823106367, + "grad_norm": 18.166082735053617, + "learning_rate": 2e-06, + "loss": 0.3841, + "step": 6408 + }, + { + "epoch": 1.4868344739589374, + "grad_norm": 19.653124970701338, + "learning_rate": 2e-06, + "loss": 0.2373, + "step": 6409 + }, + { + "epoch": 1.487066465607238, + "grad_norm": 20.508541376693575, + "learning_rate": 2e-06, + "loss": 0.3786, + "step": 6410 + }, + { + "epoch": 1.4872984572555388, + "grad_norm": 18.380647360810677, + "learning_rate": 2e-06, + "loss": 0.3239, + "step": 6411 + }, + { + "epoch": 1.4875304489038395, + "grad_norm": 32.4958970734748, + "learning_rate": 2e-06, + "loss": 0.2241, + "step": 6412 + }, + { + "epoch": 1.4877624405521401, + "grad_norm": 29.17435793636348, + "learning_rate": 2e-06, + "loss": 0.2003, + "step": 6413 + }, + { + "epoch": 1.4879944322004408, + "grad_norm": 14.065406253018462, + "learning_rate": 2e-06, + "loss": 0.3424, + "step": 6414 + }, + { + "epoch": 1.4882264238487415, + "grad_norm": 16.456177366838705, + "learning_rate": 2e-06, + "loss": 0.2163, + "step": 6415 + }, + { + "epoch": 1.4884584154970422, + "grad_norm": 18.49614960441739, + "learning_rate": 2e-06, + "loss": 0.3273, + "step": 6416 + }, + { + "epoch": 1.4886904071453428, + "grad_norm": 12.96237766808122, + "learning_rate": 2e-06, + "loss": 0.1475, + "step": 6417 + }, + { + "epoch": 1.4889223987936435, + "grad_norm": 13.498753630694996, + "learning_rate": 2e-06, + "loss": 0.2419, + "step": 6418 + }, + { + "epoch": 1.4891543904419442, + "grad_norm": 19.590928089711877, + "learning_rate": 2e-06, + "loss": 0.4618, + "step": 6419 + }, + { + "epoch": 1.4893863820902449, + "grad_norm": 51.909561474681176, + "learning_rate": 2e-06, + "loss": 0.3855, + "step": 6420 + }, + { + "epoch": 1.4896183737385453, + "grad_norm": 14.388437806863546, + "learning_rate": 2e-06, + "loss": 0.2515, + "step": 6421 + }, + { + "epoch": 1.489850365386846, + "grad_norm": 21.97510015415248, + "learning_rate": 2e-06, + "loss": 0.4647, + "step": 6422 + }, + { + "epoch": 1.4900823570351467, + "grad_norm": 10.430234245754157, + "learning_rate": 2e-06, + "loss": 0.1647, + "step": 6423 + }, + { + "epoch": 1.4903143486834474, + "grad_norm": 11.789116182208888, + "learning_rate": 2e-06, + "loss": 0.2325, + "step": 6424 + }, + { + "epoch": 1.490546340331748, + "grad_norm": 9.155597005212678, + "learning_rate": 2e-06, + "loss": 0.2378, + "step": 6425 + }, + { + "epoch": 1.4907783319800487, + "grad_norm": 11.338288541901479, + "learning_rate": 2e-06, + "loss": 0.3419, + "step": 6426 + }, + { + "epoch": 1.4910103236283494, + "grad_norm": 13.99855524710128, + "learning_rate": 2e-06, + "loss": 0.3002, + "step": 6427 + }, + { + "epoch": 1.49124231527665, + "grad_norm": 19.044828598098015, + "learning_rate": 2e-06, + "loss": 0.3295, + "step": 6428 + }, + { + "epoch": 1.4914743069249508, + "grad_norm": 10.062277970290738, + "learning_rate": 2e-06, + "loss": 0.2077, + "step": 6429 + }, + { + "epoch": 1.4917062985732514, + "grad_norm": 14.284423700285167, + "learning_rate": 2e-06, + "loss": 0.2801, + "step": 6430 + }, + { + "epoch": 1.4919382902215519, + "grad_norm": 35.50237344890928, + "learning_rate": 2e-06, + "loss": 0.2168, + "step": 6431 + }, + { + "epoch": 1.4921702818698526, + "grad_norm": 5.948831486266877, + "learning_rate": 2e-06, + "loss": 0.1457, + "step": 6432 + }, + { + "epoch": 1.4924022735181532, + "grad_norm": 15.893248474505135, + "learning_rate": 2e-06, + "loss": 0.2549, + "step": 6433 + }, + { + "epoch": 1.492634265166454, + "grad_norm": 17.785878418025135, + "learning_rate": 2e-06, + "loss": 0.3139, + "step": 6434 + }, + { + "epoch": 1.4928662568147546, + "grad_norm": 17.70381675074382, + "learning_rate": 2e-06, + "loss": 0.3474, + "step": 6435 + }, + { + "epoch": 1.4930982484630553, + "grad_norm": 12.609051573865418, + "learning_rate": 2e-06, + "loss": 0.2953, + "step": 6436 + }, + { + "epoch": 1.493330240111356, + "grad_norm": 5.811534282114288, + "learning_rate": 2e-06, + "loss": 0.2519, + "step": 6437 + }, + { + "epoch": 1.4935622317596566, + "grad_norm": 10.391498834861888, + "learning_rate": 2e-06, + "loss": 0.1582, + "step": 6438 + }, + { + "epoch": 1.4937942234079573, + "grad_norm": 10.597649958632756, + "learning_rate": 2e-06, + "loss": 0.2445, + "step": 6439 + }, + { + "epoch": 1.494026215056258, + "grad_norm": 9.533945431912754, + "learning_rate": 2e-06, + "loss": 0.1678, + "step": 6440 + }, + { + "epoch": 1.4942582067045587, + "grad_norm": 19.709720547193285, + "learning_rate": 2e-06, + "loss": 0.3057, + "step": 6441 + }, + { + "epoch": 1.4944901983528593, + "grad_norm": 17.841104378481514, + "learning_rate": 2e-06, + "loss": 0.2086, + "step": 6442 + }, + { + "epoch": 1.49472219000116, + "grad_norm": 13.695781339844167, + "learning_rate": 2e-06, + "loss": 0.2651, + "step": 6443 + }, + { + "epoch": 1.4949541816494607, + "grad_norm": 10.329248417507095, + "learning_rate": 2e-06, + "loss": 0.2072, + "step": 6444 + }, + { + "epoch": 1.4951861732977614, + "grad_norm": 11.273298957941135, + "learning_rate": 2e-06, + "loss": 0.3243, + "step": 6445 + }, + { + "epoch": 1.495418164946062, + "grad_norm": 11.123275926165398, + "learning_rate": 2e-06, + "loss": 0.2626, + "step": 6446 + }, + { + "epoch": 1.4956501565943627, + "grad_norm": 19.591634516882863, + "learning_rate": 2e-06, + "loss": 0.3643, + "step": 6447 + }, + { + "epoch": 1.4958821482426632, + "grad_norm": 14.93813363970523, + "learning_rate": 2e-06, + "loss": 0.2281, + "step": 6448 + }, + { + "epoch": 1.4961141398909639, + "grad_norm": 14.059895540139681, + "learning_rate": 2e-06, + "loss": 0.2931, + "step": 6449 + }, + { + "epoch": 1.4963461315392645, + "grad_norm": 13.686524407500817, + "learning_rate": 2e-06, + "loss": 0.2201, + "step": 6450 + }, + { + "epoch": 1.4965781231875652, + "grad_norm": 12.69422427186578, + "learning_rate": 2e-06, + "loss": 0.2263, + "step": 6451 + }, + { + "epoch": 1.496810114835866, + "grad_norm": 6.64847972204828, + "learning_rate": 2e-06, + "loss": 0.1778, + "step": 6452 + }, + { + "epoch": 1.4970421064841666, + "grad_norm": 10.611507182667797, + "learning_rate": 2e-06, + "loss": 0.2653, + "step": 6453 + }, + { + "epoch": 1.4972740981324673, + "grad_norm": 8.206805145949547, + "learning_rate": 2e-06, + "loss": 0.1589, + "step": 6454 + }, + { + "epoch": 1.497506089780768, + "grad_norm": 12.934838866814447, + "learning_rate": 2e-06, + "loss": 0.2327, + "step": 6455 + }, + { + "epoch": 1.4977380814290686, + "grad_norm": 8.605576047699119, + "learning_rate": 2e-06, + "loss": 0.1946, + "step": 6456 + }, + { + "epoch": 1.4979700730773693, + "grad_norm": 13.740337789467292, + "learning_rate": 2e-06, + "loss": 0.1989, + "step": 6457 + }, + { + "epoch": 1.4982020647256697, + "grad_norm": 12.719383705892868, + "learning_rate": 2e-06, + "loss": 0.3019, + "step": 6458 + }, + { + "epoch": 1.4984340563739704, + "grad_norm": 9.414119565019343, + "learning_rate": 2e-06, + "loss": 0.1875, + "step": 6459 + }, + { + "epoch": 1.498666048022271, + "grad_norm": 7.286243777258862, + "learning_rate": 2e-06, + "loss": 0.2029, + "step": 6460 + }, + { + "epoch": 1.4988980396705718, + "grad_norm": 25.087415884774984, + "learning_rate": 2e-06, + "loss": 0.3493, + "step": 6461 + }, + { + "epoch": 1.4991300313188725, + "grad_norm": 19.529758328612694, + "learning_rate": 2e-06, + "loss": 0.2511, + "step": 6462 + }, + { + "epoch": 1.4993620229671731, + "grad_norm": 10.558360675132494, + "learning_rate": 2e-06, + "loss": 0.3066, + "step": 6463 + }, + { + "epoch": 1.4995940146154738, + "grad_norm": 12.929556331585013, + "learning_rate": 2e-06, + "loss": 0.2349, + "step": 6464 + }, + { + "epoch": 1.4998260062637745, + "grad_norm": 13.317649088780495, + "learning_rate": 2e-06, + "loss": 0.2356, + "step": 6465 + }, + { + "epoch": 1.5000579979120752, + "grad_norm": 14.80770359106743, + "learning_rate": 2e-06, + "loss": 0.2201, + "step": 6466 + }, + { + "epoch": 1.5002899895603758, + "grad_norm": 10.099958255530332, + "learning_rate": 2e-06, + "loss": 0.2774, + "step": 6467 + }, + { + "epoch": 1.5005219812086765, + "grad_norm": 10.157809884548199, + "learning_rate": 2e-06, + "loss": 0.2198, + "step": 6468 + }, + { + "epoch": 1.5007539728569772, + "grad_norm": 12.343167575809934, + "learning_rate": 2e-06, + "loss": 0.26, + "step": 6469 + }, + { + "epoch": 1.5009859645052779, + "grad_norm": 16.69967960591968, + "learning_rate": 2e-06, + "loss": 0.3119, + "step": 6470 + }, + { + "epoch": 1.5012179561535786, + "grad_norm": 11.728765200927777, + "learning_rate": 2e-06, + "loss": 0.2118, + "step": 6471 + }, + { + "epoch": 1.5014499478018792, + "grad_norm": 11.541890012505004, + "learning_rate": 2e-06, + "loss": 0.1384, + "step": 6472 + }, + { + "epoch": 1.50168193945018, + "grad_norm": 9.732218961699658, + "learning_rate": 2e-06, + "loss": 0.1718, + "step": 6473 + }, + { + "epoch": 1.5019139310984806, + "grad_norm": 17.88222137496914, + "learning_rate": 2e-06, + "loss": 0.309, + "step": 6474 + }, + { + "epoch": 1.5021459227467813, + "grad_norm": 16.773658318362514, + "learning_rate": 2e-06, + "loss": 0.2618, + "step": 6475 + }, + { + "epoch": 1.5023779143950817, + "grad_norm": 12.08839410124754, + "learning_rate": 2e-06, + "loss": 0.2378, + "step": 6476 + }, + { + "epoch": 1.5026099060433824, + "grad_norm": 14.656155009066419, + "learning_rate": 2e-06, + "loss": 0.2386, + "step": 6477 + }, + { + "epoch": 1.502841897691683, + "grad_norm": 9.973123753284185, + "learning_rate": 2e-06, + "loss": 0.308, + "step": 6478 + }, + { + "epoch": 1.5030738893399838, + "grad_norm": 41.89955411808061, + "learning_rate": 2e-06, + "loss": 0.1906, + "step": 6479 + }, + { + "epoch": 1.5033058809882844, + "grad_norm": 10.945620291360028, + "learning_rate": 2e-06, + "loss": 0.204, + "step": 6480 + }, + { + "epoch": 1.5035378726365851, + "grad_norm": 10.877027582989168, + "learning_rate": 2e-06, + "loss": 0.2483, + "step": 6481 + }, + { + "epoch": 1.5037698642848858, + "grad_norm": 13.071631401641872, + "learning_rate": 2e-06, + "loss": 0.3342, + "step": 6482 + }, + { + "epoch": 1.5040018559331862, + "grad_norm": 13.171544565902238, + "learning_rate": 2e-06, + "loss": 0.3112, + "step": 6483 + }, + { + "epoch": 1.504233847581487, + "grad_norm": 12.170105872800002, + "learning_rate": 2e-06, + "loss": 0.2779, + "step": 6484 + }, + { + "epoch": 1.5044658392297876, + "grad_norm": 13.122499592129397, + "learning_rate": 2e-06, + "loss": 0.1801, + "step": 6485 + }, + { + "epoch": 1.5046978308780883, + "grad_norm": 15.47394430721565, + "learning_rate": 2e-06, + "loss": 0.2778, + "step": 6486 + }, + { + "epoch": 1.504929822526389, + "grad_norm": 11.18579845578175, + "learning_rate": 2e-06, + "loss": 0.3338, + "step": 6487 + }, + { + "epoch": 1.5051618141746896, + "grad_norm": 19.618419994055362, + "learning_rate": 2e-06, + "loss": 0.3429, + "step": 6488 + }, + { + "epoch": 1.5053938058229903, + "grad_norm": 10.515346377508802, + "learning_rate": 2e-06, + "loss": 0.2214, + "step": 6489 + }, + { + "epoch": 1.505625797471291, + "grad_norm": 16.76037884514449, + "learning_rate": 2e-06, + "loss": 0.3002, + "step": 6490 + }, + { + "epoch": 1.5058577891195917, + "grad_norm": 18.413510022031442, + "learning_rate": 2e-06, + "loss": 0.2109, + "step": 6491 + }, + { + "epoch": 1.5060897807678923, + "grad_norm": 45.908922289749256, + "learning_rate": 2e-06, + "loss": 0.2336, + "step": 6492 + }, + { + "epoch": 1.506321772416193, + "grad_norm": 15.561067182649099, + "learning_rate": 2e-06, + "loss": 0.2545, + "step": 6493 + }, + { + "epoch": 1.5065537640644937, + "grad_norm": 16.969709333468, + "learning_rate": 2e-06, + "loss": 0.3423, + "step": 6494 + }, + { + "epoch": 1.5067857557127944, + "grad_norm": 10.3194205977704, + "learning_rate": 2e-06, + "loss": 0.1431, + "step": 6495 + }, + { + "epoch": 1.507017747361095, + "grad_norm": 17.808742052277843, + "learning_rate": 2e-06, + "loss": 0.3041, + "step": 6496 + }, + { + "epoch": 1.5072497390093957, + "grad_norm": 9.31469480252244, + "learning_rate": 2e-06, + "loss": 0.2087, + "step": 6497 + }, + { + "epoch": 1.5074817306576964, + "grad_norm": 17.60457924421259, + "learning_rate": 2e-06, + "loss": 0.2772, + "step": 6498 + }, + { + "epoch": 1.507713722305997, + "grad_norm": 11.324853479445679, + "learning_rate": 2e-06, + "loss": 0.2332, + "step": 6499 + }, + { + "epoch": 1.5079457139542978, + "grad_norm": 5.4358127032744035, + "learning_rate": 2e-06, + "loss": 0.1556, + "step": 6500 + }, + { + "epoch": 1.5081777056025985, + "grad_norm": 10.692291572338746, + "learning_rate": 2e-06, + "loss": 0.264, + "step": 6501 + }, + { + "epoch": 1.5084096972508991, + "grad_norm": 23.722388037868864, + "learning_rate": 2e-06, + "loss": 0.3714, + "step": 6502 + }, + { + "epoch": 1.5086416888991996, + "grad_norm": 14.672151917575201, + "learning_rate": 2e-06, + "loss": 0.2105, + "step": 6503 + }, + { + "epoch": 1.5088736805475003, + "grad_norm": 12.031371405646743, + "learning_rate": 2e-06, + "loss": 0.2954, + "step": 6504 + }, + { + "epoch": 1.509105672195801, + "grad_norm": 3.1784302456047224, + "learning_rate": 2e-06, + "loss": 0.1228, + "step": 6505 + }, + { + "epoch": 1.5093376638441016, + "grad_norm": 14.213306200760728, + "learning_rate": 2e-06, + "loss": 0.2708, + "step": 6506 + }, + { + "epoch": 1.5095696554924023, + "grad_norm": 15.232030296744599, + "learning_rate": 2e-06, + "loss": 0.2593, + "step": 6507 + }, + { + "epoch": 1.509801647140703, + "grad_norm": 14.984208384309996, + "learning_rate": 2e-06, + "loss": 0.2175, + "step": 6508 + }, + { + "epoch": 1.5100336387890034, + "grad_norm": 15.054506568863584, + "learning_rate": 2e-06, + "loss": 0.291, + "step": 6509 + }, + { + "epoch": 1.510265630437304, + "grad_norm": 18.958083066770698, + "learning_rate": 2e-06, + "loss": 0.3165, + "step": 6510 + }, + { + "epoch": 1.5104976220856048, + "grad_norm": 15.20084589197303, + "learning_rate": 2e-06, + "loss": 0.2437, + "step": 6511 + }, + { + "epoch": 1.5107296137339055, + "grad_norm": 10.595426838724423, + "learning_rate": 2e-06, + "loss": 0.1996, + "step": 6512 + }, + { + "epoch": 1.5109616053822061, + "grad_norm": 13.323499389916211, + "learning_rate": 2e-06, + "loss": 0.2852, + "step": 6513 + }, + { + "epoch": 1.5111935970305068, + "grad_norm": 12.349992816548118, + "learning_rate": 2e-06, + "loss": 0.159, + "step": 6514 + }, + { + "epoch": 1.5114255886788075, + "grad_norm": 8.347530066491494, + "learning_rate": 2e-06, + "loss": 0.1881, + "step": 6515 + }, + { + "epoch": 1.5116575803271082, + "grad_norm": 14.539653588005612, + "learning_rate": 2e-06, + "loss": 0.233, + "step": 6516 + }, + { + "epoch": 1.5118895719754089, + "grad_norm": 14.335559702883744, + "learning_rate": 2e-06, + "loss": 0.2032, + "step": 6517 + }, + { + "epoch": 1.5121215636237095, + "grad_norm": 10.128049266081188, + "learning_rate": 2e-06, + "loss": 0.1667, + "step": 6518 + }, + { + "epoch": 1.5123535552720102, + "grad_norm": 8.385680504688892, + "learning_rate": 2e-06, + "loss": 0.2341, + "step": 6519 + }, + { + "epoch": 1.5125855469203109, + "grad_norm": 17.06497822611483, + "learning_rate": 2e-06, + "loss": 0.1913, + "step": 6520 + }, + { + "epoch": 1.5128175385686116, + "grad_norm": 16.332199808682926, + "learning_rate": 2e-06, + "loss": 0.1997, + "step": 6521 + }, + { + "epoch": 1.5130495302169122, + "grad_norm": 19.854169165602478, + "learning_rate": 2e-06, + "loss": 0.3257, + "step": 6522 + }, + { + "epoch": 1.513281521865213, + "grad_norm": 18.77711344263193, + "learning_rate": 2e-06, + "loss": 0.2624, + "step": 6523 + }, + { + "epoch": 1.5135135135135136, + "grad_norm": 13.115808526811055, + "learning_rate": 2e-06, + "loss": 0.2487, + "step": 6524 + }, + { + "epoch": 1.5137455051618143, + "grad_norm": 6.266253493130478, + "learning_rate": 2e-06, + "loss": 0.1387, + "step": 6525 + }, + { + "epoch": 1.513977496810115, + "grad_norm": 30.669033494279304, + "learning_rate": 2e-06, + "loss": 0.3013, + "step": 6526 + }, + { + "epoch": 1.5142094884584156, + "grad_norm": 17.383425126374238, + "learning_rate": 2e-06, + "loss": 0.3154, + "step": 6527 + }, + { + "epoch": 1.5144414801067163, + "grad_norm": 9.12719618391748, + "learning_rate": 2e-06, + "loss": 0.2042, + "step": 6528 + }, + { + "epoch": 1.5146734717550168, + "grad_norm": 16.01351599490197, + "learning_rate": 2e-06, + "loss": 0.2471, + "step": 6529 + }, + { + "epoch": 1.5149054634033174, + "grad_norm": 7.5545651946593155, + "learning_rate": 2e-06, + "loss": 0.1569, + "step": 6530 + }, + { + "epoch": 1.5151374550516181, + "grad_norm": 10.252329088153038, + "learning_rate": 2e-06, + "loss": 0.1647, + "step": 6531 + }, + { + "epoch": 1.5153694466999188, + "grad_norm": 12.471980786928466, + "learning_rate": 2e-06, + "loss": 0.2009, + "step": 6532 + }, + { + "epoch": 1.5156014383482195, + "grad_norm": 11.677249814799206, + "learning_rate": 2e-06, + "loss": 0.273, + "step": 6533 + }, + { + "epoch": 1.5158334299965202, + "grad_norm": 16.203498552868982, + "learning_rate": 2e-06, + "loss": 0.2866, + "step": 6534 + }, + { + "epoch": 1.5160654216448208, + "grad_norm": 16.749259296710072, + "learning_rate": 2e-06, + "loss": 0.2742, + "step": 6535 + }, + { + "epoch": 1.5162974132931213, + "grad_norm": 11.47405388582526, + "learning_rate": 2e-06, + "loss": 0.2707, + "step": 6536 + }, + { + "epoch": 1.516529404941422, + "grad_norm": 25.593684432781764, + "learning_rate": 2e-06, + "loss": 0.2175, + "step": 6537 + }, + { + "epoch": 1.5167613965897226, + "grad_norm": 20.441951801303862, + "learning_rate": 2e-06, + "loss": 0.2418, + "step": 6538 + }, + { + "epoch": 1.5169933882380233, + "grad_norm": 14.458608496737757, + "learning_rate": 2e-06, + "loss": 0.2365, + "step": 6539 + }, + { + "epoch": 1.517225379886324, + "grad_norm": 33.16233465034753, + "learning_rate": 2e-06, + "loss": 0.3357, + "step": 6540 + }, + { + "epoch": 1.5174573715346247, + "grad_norm": 14.1321734243137, + "learning_rate": 2e-06, + "loss": 0.2245, + "step": 6541 + }, + { + "epoch": 1.5176893631829254, + "grad_norm": 14.33484209040663, + "learning_rate": 2e-06, + "loss": 0.2339, + "step": 6542 + }, + { + "epoch": 1.517921354831226, + "grad_norm": 15.328236886901246, + "learning_rate": 2e-06, + "loss": 0.2048, + "step": 6543 + }, + { + "epoch": 1.5181533464795267, + "grad_norm": 10.470249220106636, + "learning_rate": 2e-06, + "loss": 0.2095, + "step": 6544 + }, + { + "epoch": 1.5183853381278274, + "grad_norm": 19.99383171597015, + "learning_rate": 2e-06, + "loss": 0.2835, + "step": 6545 + }, + { + "epoch": 1.518617329776128, + "grad_norm": 19.34033305766493, + "learning_rate": 2e-06, + "loss": 0.2156, + "step": 6546 + }, + { + "epoch": 1.5188493214244287, + "grad_norm": 20.841866636345877, + "learning_rate": 2e-06, + "loss": 0.3316, + "step": 6547 + }, + { + "epoch": 1.5190813130727294, + "grad_norm": 10.383445951544882, + "learning_rate": 2e-06, + "loss": 0.131, + "step": 6548 + }, + { + "epoch": 1.51931330472103, + "grad_norm": 14.532499443762863, + "learning_rate": 2e-06, + "loss": 0.3055, + "step": 6549 + }, + { + "epoch": 1.5195452963693308, + "grad_norm": 15.630033556591604, + "learning_rate": 2e-06, + "loss": 0.4152, + "step": 6550 + }, + { + "epoch": 1.5197772880176315, + "grad_norm": 12.030975197512266, + "learning_rate": 2e-06, + "loss": 0.2482, + "step": 6551 + }, + { + "epoch": 1.5200092796659321, + "grad_norm": 13.340481336968507, + "learning_rate": 2e-06, + "loss": 0.2646, + "step": 6552 + }, + { + "epoch": 1.5202412713142328, + "grad_norm": 14.488802750841987, + "learning_rate": 2e-06, + "loss": 0.3086, + "step": 6553 + }, + { + "epoch": 1.5204732629625335, + "grad_norm": 8.115303601704829, + "learning_rate": 2e-06, + "loss": 0.1744, + "step": 6554 + }, + { + "epoch": 1.5207052546108342, + "grad_norm": 26.135474002000866, + "learning_rate": 2e-06, + "loss": 0.4021, + "step": 6555 + }, + { + "epoch": 1.5209372462591346, + "grad_norm": 25.226920890660292, + "learning_rate": 2e-06, + "loss": 0.3901, + "step": 6556 + }, + { + "epoch": 1.5211692379074353, + "grad_norm": 16.670183683407572, + "learning_rate": 2e-06, + "loss": 0.2408, + "step": 6557 + }, + { + "epoch": 1.521401229555736, + "grad_norm": 15.625375444010585, + "learning_rate": 2e-06, + "loss": 0.2326, + "step": 6558 + }, + { + "epoch": 1.5216332212040367, + "grad_norm": 14.917535152680767, + "learning_rate": 2e-06, + "loss": 0.2647, + "step": 6559 + }, + { + "epoch": 1.5218652128523373, + "grad_norm": 13.332426191345766, + "learning_rate": 2e-06, + "loss": 0.2578, + "step": 6560 + }, + { + "epoch": 1.522097204500638, + "grad_norm": 18.671462189485453, + "learning_rate": 2e-06, + "loss": 0.3071, + "step": 6561 + }, + { + "epoch": 1.5223291961489387, + "grad_norm": 11.246730512085772, + "learning_rate": 2e-06, + "loss": 0.1709, + "step": 6562 + }, + { + "epoch": 1.5225611877972391, + "grad_norm": 19.622214769945217, + "learning_rate": 2e-06, + "loss": 0.2527, + "step": 6563 + }, + { + "epoch": 1.5227931794455398, + "grad_norm": 12.284109364278002, + "learning_rate": 2e-06, + "loss": 0.2795, + "step": 6564 + }, + { + "epoch": 1.5230251710938405, + "grad_norm": 6.678563214595849, + "learning_rate": 2e-06, + "loss": 0.1916, + "step": 6565 + }, + { + "epoch": 1.5232571627421412, + "grad_norm": 10.69967395464174, + "learning_rate": 2e-06, + "loss": 0.183, + "step": 6566 + }, + { + "epoch": 1.5234891543904419, + "grad_norm": 18.99299029345869, + "learning_rate": 2e-06, + "loss": 0.2846, + "step": 6567 + }, + { + "epoch": 1.5237211460387425, + "grad_norm": 9.86938009551779, + "learning_rate": 2e-06, + "loss": 0.2259, + "step": 6568 + }, + { + "epoch": 1.5239531376870432, + "grad_norm": 12.17829700571588, + "learning_rate": 2e-06, + "loss": 0.221, + "step": 6569 + }, + { + "epoch": 1.524185129335344, + "grad_norm": 11.699644060447417, + "learning_rate": 2e-06, + "loss": 0.2575, + "step": 6570 + }, + { + "epoch": 1.5244171209836446, + "grad_norm": 9.703258680204174, + "learning_rate": 2e-06, + "loss": 0.2, + "step": 6571 + }, + { + "epoch": 1.5246491126319452, + "grad_norm": 14.95404804361216, + "learning_rate": 2e-06, + "loss": 0.2595, + "step": 6572 + }, + { + "epoch": 1.524881104280246, + "grad_norm": 13.152934576076827, + "learning_rate": 2e-06, + "loss": 0.1685, + "step": 6573 + }, + { + "epoch": 1.5251130959285466, + "grad_norm": 15.716773941763252, + "learning_rate": 2e-06, + "loss": 0.2364, + "step": 6574 + }, + { + "epoch": 1.5253450875768473, + "grad_norm": 10.98282431021787, + "learning_rate": 2e-06, + "loss": 0.2123, + "step": 6575 + }, + { + "epoch": 1.525577079225148, + "grad_norm": 10.298974653319465, + "learning_rate": 2e-06, + "loss": 0.3139, + "step": 6576 + }, + { + "epoch": 1.5258090708734486, + "grad_norm": 15.407719180498699, + "learning_rate": 2e-06, + "loss": 0.2313, + "step": 6577 + }, + { + "epoch": 1.5260410625217493, + "grad_norm": 13.536063143083295, + "learning_rate": 2e-06, + "loss": 0.2669, + "step": 6578 + }, + { + "epoch": 1.52627305417005, + "grad_norm": 17.303253418761933, + "learning_rate": 2e-06, + "loss": 0.2706, + "step": 6579 + }, + { + "epoch": 1.5265050458183507, + "grad_norm": 8.999912205572087, + "learning_rate": 2e-06, + "loss": 0.1724, + "step": 6580 + }, + { + "epoch": 1.5267370374666513, + "grad_norm": 13.214016604293978, + "learning_rate": 2e-06, + "loss": 0.2278, + "step": 6581 + }, + { + "epoch": 1.526969029114952, + "grad_norm": 19.689311179023186, + "learning_rate": 2e-06, + "loss": 0.4611, + "step": 6582 + }, + { + "epoch": 1.5272010207632525, + "grad_norm": 17.523858510746575, + "learning_rate": 2e-06, + "loss": 0.3864, + "step": 6583 + }, + { + "epoch": 1.5274330124115532, + "grad_norm": 8.710260312691528, + "learning_rate": 2e-06, + "loss": 0.1756, + "step": 6584 + }, + { + "epoch": 1.5276650040598538, + "grad_norm": 10.848455252659218, + "learning_rate": 2e-06, + "loss": 0.224, + "step": 6585 + }, + { + "epoch": 1.5278969957081545, + "grad_norm": 16.842313323134718, + "learning_rate": 2e-06, + "loss": 0.2283, + "step": 6586 + }, + { + "epoch": 1.5281289873564552, + "grad_norm": 14.082538162407813, + "learning_rate": 2e-06, + "loss": 0.2786, + "step": 6587 + }, + { + "epoch": 1.5283609790047559, + "grad_norm": 13.690119420871433, + "learning_rate": 2e-06, + "loss": 0.213, + "step": 6588 + }, + { + "epoch": 1.5285929706530563, + "grad_norm": 8.366710908032633, + "learning_rate": 2e-06, + "loss": 0.2031, + "step": 6589 + }, + { + "epoch": 1.528824962301357, + "grad_norm": 11.625826226867636, + "learning_rate": 2e-06, + "loss": 0.2905, + "step": 6590 + }, + { + "epoch": 1.5290569539496577, + "grad_norm": 9.64366760293919, + "learning_rate": 2e-06, + "loss": 0.1721, + "step": 6591 + }, + { + "epoch": 1.5292889455979584, + "grad_norm": 11.064945785341019, + "learning_rate": 2e-06, + "loss": 0.2017, + "step": 6592 + }, + { + "epoch": 1.529520937246259, + "grad_norm": 12.392594488420682, + "learning_rate": 2e-06, + "loss": 0.3388, + "step": 6593 + }, + { + "epoch": 1.5297529288945597, + "grad_norm": 10.038529036525198, + "learning_rate": 2e-06, + "loss": 0.1767, + "step": 6594 + }, + { + "epoch": 1.5299849205428604, + "grad_norm": 18.155270257757028, + "learning_rate": 2e-06, + "loss": 0.2795, + "step": 6595 + }, + { + "epoch": 1.530216912191161, + "grad_norm": 18.631747785591013, + "learning_rate": 2e-06, + "loss": 0.4423, + "step": 6596 + }, + { + "epoch": 1.5304489038394617, + "grad_norm": 11.650253566150127, + "learning_rate": 2e-06, + "loss": 0.1601, + "step": 6597 + }, + { + "epoch": 1.5306808954877624, + "grad_norm": 11.892720717803435, + "learning_rate": 2e-06, + "loss": 0.2567, + "step": 6598 + }, + { + "epoch": 1.530912887136063, + "grad_norm": 12.76447937851006, + "learning_rate": 2e-06, + "loss": 0.2272, + "step": 6599 + }, + { + "epoch": 1.5311448787843638, + "grad_norm": 19.670009456521562, + "learning_rate": 2e-06, + "loss": 0.191, + "step": 6600 + }, + { + "epoch": 1.5313768704326645, + "grad_norm": 17.911080788040636, + "learning_rate": 2e-06, + "loss": 0.2934, + "step": 6601 + }, + { + "epoch": 1.5316088620809651, + "grad_norm": 12.02701092188653, + "learning_rate": 2e-06, + "loss": 0.2006, + "step": 6602 + }, + { + "epoch": 1.5318408537292658, + "grad_norm": 14.254489915142843, + "learning_rate": 2e-06, + "loss": 0.1326, + "step": 6603 + }, + { + "epoch": 1.5320728453775665, + "grad_norm": 16.942217221710525, + "learning_rate": 2e-06, + "loss": 0.2467, + "step": 6604 + }, + { + "epoch": 1.5323048370258672, + "grad_norm": 12.251395907968385, + "learning_rate": 2e-06, + "loss": 0.1818, + "step": 6605 + }, + { + "epoch": 1.5325368286741679, + "grad_norm": 18.873433824205463, + "learning_rate": 2e-06, + "loss": 0.304, + "step": 6606 + }, + { + "epoch": 1.5327688203224685, + "grad_norm": 19.97499742532436, + "learning_rate": 2e-06, + "loss": 0.3184, + "step": 6607 + }, + { + "epoch": 1.5330008119707692, + "grad_norm": 8.638731047589733, + "learning_rate": 2e-06, + "loss": 0.2164, + "step": 6608 + }, + { + "epoch": 1.5332328036190697, + "grad_norm": 16.66699551716635, + "learning_rate": 2e-06, + "loss": 0.2551, + "step": 6609 + }, + { + "epoch": 1.5334647952673703, + "grad_norm": 5.295961005754983, + "learning_rate": 2e-06, + "loss": 0.1288, + "step": 6610 + }, + { + "epoch": 1.533696786915671, + "grad_norm": 21.756919902176616, + "learning_rate": 2e-06, + "loss": 0.2657, + "step": 6611 + }, + { + "epoch": 1.5339287785639717, + "grad_norm": 16.63173357913042, + "learning_rate": 2e-06, + "loss": 0.3024, + "step": 6612 + }, + { + "epoch": 1.5341607702122724, + "grad_norm": 15.377279500427504, + "learning_rate": 2e-06, + "loss": 0.1583, + "step": 6613 + }, + { + "epoch": 1.534392761860573, + "grad_norm": 13.542730249995518, + "learning_rate": 2e-06, + "loss": 0.2867, + "step": 6614 + }, + { + "epoch": 1.5346247535088737, + "grad_norm": 13.45081937791314, + "learning_rate": 2e-06, + "loss": 0.3402, + "step": 6615 + }, + { + "epoch": 1.5348567451571742, + "grad_norm": 18.73627537803717, + "learning_rate": 2e-06, + "loss": 0.3025, + "step": 6616 + }, + { + "epoch": 1.5350887368054749, + "grad_norm": 17.96909572583018, + "learning_rate": 2e-06, + "loss": 0.25, + "step": 6617 + }, + { + "epoch": 1.5353207284537755, + "grad_norm": 7.296570244559349, + "learning_rate": 2e-06, + "loss": 0.2265, + "step": 6618 + }, + { + "epoch": 1.5355527201020762, + "grad_norm": 11.84219243712135, + "learning_rate": 2e-06, + "loss": 0.2714, + "step": 6619 + }, + { + "epoch": 1.535784711750377, + "grad_norm": 11.616406825047289, + "learning_rate": 2e-06, + "loss": 0.2136, + "step": 6620 + }, + { + "epoch": 1.5360167033986776, + "grad_norm": 8.454661127361028, + "learning_rate": 2e-06, + "loss": 0.2052, + "step": 6621 + }, + { + "epoch": 1.5362486950469783, + "grad_norm": 17.398773472415783, + "learning_rate": 2e-06, + "loss": 0.3175, + "step": 6622 + }, + { + "epoch": 1.536480686695279, + "grad_norm": 14.004436701132663, + "learning_rate": 2e-06, + "loss": 0.1801, + "step": 6623 + }, + { + "epoch": 1.5367126783435796, + "grad_norm": 15.782936325674424, + "learning_rate": 2e-06, + "loss": 0.1985, + "step": 6624 + }, + { + "epoch": 1.5369446699918803, + "grad_norm": 8.437744584794697, + "learning_rate": 2e-06, + "loss": 0.1646, + "step": 6625 + }, + { + "epoch": 1.537176661640181, + "grad_norm": 10.262631524625183, + "learning_rate": 2e-06, + "loss": 0.178, + "step": 6626 + }, + { + "epoch": 1.5374086532884816, + "grad_norm": 6.3501856308833755, + "learning_rate": 2e-06, + "loss": 0.228, + "step": 6627 + }, + { + "epoch": 1.5376406449367823, + "grad_norm": 10.01570986797597, + "learning_rate": 2e-06, + "loss": 0.3387, + "step": 6628 + }, + { + "epoch": 1.537872636585083, + "grad_norm": 30.458849058891083, + "learning_rate": 2e-06, + "loss": 0.4754, + "step": 6629 + }, + { + "epoch": 1.5381046282333837, + "grad_norm": 11.716062302207616, + "learning_rate": 2e-06, + "loss": 0.1712, + "step": 6630 + }, + { + "epoch": 1.5383366198816844, + "grad_norm": 18.175307041258502, + "learning_rate": 2e-06, + "loss": 0.2043, + "step": 6631 + }, + { + "epoch": 1.538568611529985, + "grad_norm": 12.24045972591269, + "learning_rate": 2e-06, + "loss": 0.179, + "step": 6632 + }, + { + "epoch": 1.5388006031782857, + "grad_norm": 11.07667745607954, + "learning_rate": 2e-06, + "loss": 0.273, + "step": 6633 + }, + { + "epoch": 1.5390325948265864, + "grad_norm": 12.272831750483638, + "learning_rate": 2e-06, + "loss": 0.179, + "step": 6634 + }, + { + "epoch": 1.539264586474887, + "grad_norm": 12.611848727210983, + "learning_rate": 2e-06, + "loss": 0.411, + "step": 6635 + }, + { + "epoch": 1.5394965781231875, + "grad_norm": 18.854779245821923, + "learning_rate": 2e-06, + "loss": 0.3932, + "step": 6636 + }, + { + "epoch": 1.5397285697714882, + "grad_norm": 7.172384964319514, + "learning_rate": 2e-06, + "loss": 0.1187, + "step": 6637 + }, + { + "epoch": 1.5399605614197889, + "grad_norm": 6.576975458207418, + "learning_rate": 2e-06, + "loss": 0.1599, + "step": 6638 + }, + { + "epoch": 1.5401925530680896, + "grad_norm": 12.24297995933426, + "learning_rate": 2e-06, + "loss": 0.2164, + "step": 6639 + }, + { + "epoch": 1.5404245447163902, + "grad_norm": 10.647747894323764, + "learning_rate": 2e-06, + "loss": 0.2583, + "step": 6640 + }, + { + "epoch": 1.540656536364691, + "grad_norm": 19.60146067514771, + "learning_rate": 2e-06, + "loss": 0.1466, + "step": 6641 + }, + { + "epoch": 1.5408885280129916, + "grad_norm": 14.684098261137065, + "learning_rate": 2e-06, + "loss": 0.2603, + "step": 6642 + }, + { + "epoch": 1.541120519661292, + "grad_norm": 16.59216010203391, + "learning_rate": 2e-06, + "loss": 0.2821, + "step": 6643 + }, + { + "epoch": 1.5413525113095927, + "grad_norm": 14.913905233436939, + "learning_rate": 2e-06, + "loss": 0.3483, + "step": 6644 + }, + { + "epoch": 1.5415845029578934, + "grad_norm": 7.166202900774562, + "learning_rate": 2e-06, + "loss": 0.1455, + "step": 6645 + }, + { + "epoch": 1.541816494606194, + "grad_norm": 21.821133413069827, + "learning_rate": 2e-06, + "loss": 0.3514, + "step": 6646 + }, + { + "epoch": 1.5420484862544948, + "grad_norm": 12.236283152701525, + "learning_rate": 2e-06, + "loss": 0.2124, + "step": 6647 + }, + { + "epoch": 1.5422804779027954, + "grad_norm": 17.098002615388406, + "learning_rate": 2e-06, + "loss": 0.2716, + "step": 6648 + }, + { + "epoch": 1.542512469551096, + "grad_norm": 14.267657162351332, + "learning_rate": 2e-06, + "loss": 0.2191, + "step": 6649 + }, + { + "epoch": 1.5427444611993968, + "grad_norm": 15.523793251848586, + "learning_rate": 2e-06, + "loss": 0.1445, + "step": 6650 + }, + { + "epoch": 1.5429764528476975, + "grad_norm": 14.559855721210122, + "learning_rate": 2e-06, + "loss": 0.2379, + "step": 6651 + }, + { + "epoch": 1.5432084444959981, + "grad_norm": 23.65012451053669, + "learning_rate": 2e-06, + "loss": 0.4326, + "step": 6652 + }, + { + "epoch": 1.5434404361442988, + "grad_norm": 8.303100619952016, + "learning_rate": 2e-06, + "loss": 0.198, + "step": 6653 + }, + { + "epoch": 1.5436724277925995, + "grad_norm": 15.935548264212265, + "learning_rate": 2e-06, + "loss": 0.1968, + "step": 6654 + }, + { + "epoch": 1.5439044194409002, + "grad_norm": 8.883382058562992, + "learning_rate": 2e-06, + "loss": 0.181, + "step": 6655 + }, + { + "epoch": 1.5441364110892009, + "grad_norm": 13.870904459442297, + "learning_rate": 2e-06, + "loss": 0.2708, + "step": 6656 + }, + { + "epoch": 1.5443684027375015, + "grad_norm": 18.77365934368007, + "learning_rate": 2e-06, + "loss": 0.3606, + "step": 6657 + }, + { + "epoch": 1.5446003943858022, + "grad_norm": 11.231032727883631, + "learning_rate": 2e-06, + "loss": 0.1979, + "step": 6658 + }, + { + "epoch": 1.544832386034103, + "grad_norm": 13.809682217206333, + "learning_rate": 2e-06, + "loss": 0.212, + "step": 6659 + }, + { + "epoch": 1.5450643776824036, + "grad_norm": 9.238173167632462, + "learning_rate": 2e-06, + "loss": 0.2164, + "step": 6660 + }, + { + "epoch": 1.5452963693307042, + "grad_norm": 13.771198076915777, + "learning_rate": 2e-06, + "loss": 0.196, + "step": 6661 + }, + { + "epoch": 1.5455283609790047, + "grad_norm": 15.557560680677668, + "learning_rate": 2e-06, + "loss": 0.1748, + "step": 6662 + }, + { + "epoch": 1.5457603526273054, + "grad_norm": 19.399216680140757, + "learning_rate": 2e-06, + "loss": 0.3062, + "step": 6663 + }, + { + "epoch": 1.545992344275606, + "grad_norm": 12.735134100589814, + "learning_rate": 2e-06, + "loss": 0.1553, + "step": 6664 + }, + { + "epoch": 1.5462243359239067, + "grad_norm": 11.721525953735028, + "learning_rate": 2e-06, + "loss": 0.1754, + "step": 6665 + }, + { + "epoch": 1.5464563275722074, + "grad_norm": 25.85267662972695, + "learning_rate": 2e-06, + "loss": 0.3786, + "step": 6666 + }, + { + "epoch": 1.546688319220508, + "grad_norm": 9.82288963464247, + "learning_rate": 2e-06, + "loss": 0.22, + "step": 6667 + }, + { + "epoch": 1.5469203108688088, + "grad_norm": 14.562619633467646, + "learning_rate": 2e-06, + "loss": 0.1804, + "step": 6668 + }, + { + "epoch": 1.5471523025171092, + "grad_norm": 12.559383643951522, + "learning_rate": 2e-06, + "loss": 0.2596, + "step": 6669 + }, + { + "epoch": 1.54738429416541, + "grad_norm": 23.46206957851582, + "learning_rate": 2e-06, + "loss": 0.2093, + "step": 6670 + }, + { + "epoch": 1.5476162858137106, + "grad_norm": 15.949501067315126, + "learning_rate": 2e-06, + "loss": 0.2572, + "step": 6671 + }, + { + "epoch": 1.5478482774620113, + "grad_norm": 24.868620049788493, + "learning_rate": 2e-06, + "loss": 0.4118, + "step": 6672 + }, + { + "epoch": 1.548080269110312, + "grad_norm": 15.287923367273436, + "learning_rate": 2e-06, + "loss": 0.2223, + "step": 6673 + }, + { + "epoch": 1.5483122607586126, + "grad_norm": 6.52400379250765, + "learning_rate": 2e-06, + "loss": 0.1518, + "step": 6674 + }, + { + "epoch": 1.5485442524069133, + "grad_norm": 14.473581077854373, + "learning_rate": 2e-06, + "loss": 0.2041, + "step": 6675 + }, + { + "epoch": 1.548776244055214, + "grad_norm": 13.152523903148891, + "learning_rate": 2e-06, + "loss": 0.2297, + "step": 6676 + }, + { + "epoch": 1.5490082357035146, + "grad_norm": 15.555084560097228, + "learning_rate": 2e-06, + "loss": 0.2117, + "step": 6677 + }, + { + "epoch": 1.5492402273518153, + "grad_norm": 18.95724262071258, + "learning_rate": 2e-06, + "loss": 0.2694, + "step": 6678 + }, + { + "epoch": 1.549472219000116, + "grad_norm": 23.805287017623378, + "learning_rate": 2e-06, + "loss": 0.2882, + "step": 6679 + }, + { + "epoch": 1.5497042106484167, + "grad_norm": 9.063455098713165, + "learning_rate": 2e-06, + "loss": 0.1562, + "step": 6680 + }, + { + "epoch": 1.5499362022967174, + "grad_norm": 16.767249600071942, + "learning_rate": 2e-06, + "loss": 0.2859, + "step": 6681 + }, + { + "epoch": 1.550168193945018, + "grad_norm": 11.262662463905274, + "learning_rate": 2e-06, + "loss": 0.2708, + "step": 6682 + }, + { + "epoch": 1.5504001855933187, + "grad_norm": 6.92155707769241, + "learning_rate": 2e-06, + "loss": 0.1165, + "step": 6683 + }, + { + "epoch": 1.5506321772416194, + "grad_norm": 13.00332862318548, + "learning_rate": 2e-06, + "loss": 0.225, + "step": 6684 + }, + { + "epoch": 1.55086416888992, + "grad_norm": 16.8149521444566, + "learning_rate": 2e-06, + "loss": 0.2741, + "step": 6685 + }, + { + "epoch": 1.5510961605382207, + "grad_norm": 19.949779449084364, + "learning_rate": 2e-06, + "loss": 0.4181, + "step": 6686 + }, + { + "epoch": 1.5513281521865214, + "grad_norm": 15.470405223572625, + "learning_rate": 2e-06, + "loss": 0.2668, + "step": 6687 + }, + { + "epoch": 1.551560143834822, + "grad_norm": 9.524958830303015, + "learning_rate": 2e-06, + "loss": 0.1999, + "step": 6688 + }, + { + "epoch": 1.5517921354831226, + "grad_norm": 10.401920612753036, + "learning_rate": 2e-06, + "loss": 0.1673, + "step": 6689 + }, + { + "epoch": 1.5520241271314232, + "grad_norm": 14.408570492378052, + "learning_rate": 2e-06, + "loss": 0.2244, + "step": 6690 + }, + { + "epoch": 1.552256118779724, + "grad_norm": 21.054179752792802, + "learning_rate": 2e-06, + "loss": 0.3153, + "step": 6691 + }, + { + "epoch": 1.5524881104280246, + "grad_norm": 12.234752323220984, + "learning_rate": 2e-06, + "loss": 0.233, + "step": 6692 + }, + { + "epoch": 1.5527201020763253, + "grad_norm": 16.21777025907242, + "learning_rate": 2e-06, + "loss": 0.2629, + "step": 6693 + }, + { + "epoch": 1.552952093724626, + "grad_norm": 12.365330556133785, + "learning_rate": 2e-06, + "loss": 0.225, + "step": 6694 + }, + { + "epoch": 1.5531840853729266, + "grad_norm": 13.604646105301338, + "learning_rate": 2e-06, + "loss": 0.1856, + "step": 6695 + }, + { + "epoch": 1.553416077021227, + "grad_norm": 11.565050637758615, + "learning_rate": 2e-06, + "loss": 0.1909, + "step": 6696 + }, + { + "epoch": 1.5536480686695278, + "grad_norm": 10.095837283548882, + "learning_rate": 2e-06, + "loss": 0.144, + "step": 6697 + }, + { + "epoch": 1.5538800603178284, + "grad_norm": 26.34064370188464, + "learning_rate": 2e-06, + "loss": 0.2701, + "step": 6698 + }, + { + "epoch": 1.5541120519661291, + "grad_norm": 15.714451308894228, + "learning_rate": 2e-06, + "loss": 0.2907, + "step": 6699 + }, + { + "epoch": 1.5543440436144298, + "grad_norm": 23.67048393742744, + "learning_rate": 2e-06, + "loss": 0.28, + "step": 6700 + }, + { + "epoch": 1.5545760352627305, + "grad_norm": 19.769761573650925, + "learning_rate": 2e-06, + "loss": 0.4424, + "step": 6701 + }, + { + "epoch": 1.5548080269110311, + "grad_norm": 13.912627564672938, + "learning_rate": 2e-06, + "loss": 0.2099, + "step": 6702 + }, + { + "epoch": 1.5550400185593318, + "grad_norm": 17.84318080318395, + "learning_rate": 2e-06, + "loss": 0.2191, + "step": 6703 + }, + { + "epoch": 1.5552720102076325, + "grad_norm": 17.882379196787138, + "learning_rate": 2e-06, + "loss": 0.2854, + "step": 6704 + }, + { + "epoch": 1.5555040018559332, + "grad_norm": 19.754686577344806, + "learning_rate": 2e-06, + "loss": 0.2216, + "step": 6705 + }, + { + "epoch": 1.5557359935042339, + "grad_norm": 12.30229564320053, + "learning_rate": 2e-06, + "loss": 0.2358, + "step": 6706 + }, + { + "epoch": 1.5559679851525345, + "grad_norm": 11.072988829052486, + "learning_rate": 2e-06, + "loss": 0.3163, + "step": 6707 + }, + { + "epoch": 1.5561999768008352, + "grad_norm": 11.920322064117377, + "learning_rate": 2e-06, + "loss": 0.184, + "step": 6708 + }, + { + "epoch": 1.556431968449136, + "grad_norm": 23.620264848489672, + "learning_rate": 2e-06, + "loss": 0.5399, + "step": 6709 + }, + { + "epoch": 1.5566639600974366, + "grad_norm": 18.340827062399427, + "learning_rate": 2e-06, + "loss": 0.2605, + "step": 6710 + }, + { + "epoch": 1.5568959517457372, + "grad_norm": 15.804678291711879, + "learning_rate": 2e-06, + "loss": 0.2082, + "step": 6711 + }, + { + "epoch": 1.557127943394038, + "grad_norm": 9.287038285372942, + "learning_rate": 2e-06, + "loss": 0.1843, + "step": 6712 + }, + { + "epoch": 1.5573599350423386, + "grad_norm": 13.862382590699733, + "learning_rate": 2e-06, + "loss": 0.2673, + "step": 6713 + }, + { + "epoch": 1.5575919266906393, + "grad_norm": 22.047374843399634, + "learning_rate": 2e-06, + "loss": 0.2695, + "step": 6714 + }, + { + "epoch": 1.55782391833894, + "grad_norm": 8.856948070107391, + "learning_rate": 2e-06, + "loss": 0.1535, + "step": 6715 + }, + { + "epoch": 1.5580559099872404, + "grad_norm": 12.628650705847866, + "learning_rate": 2e-06, + "loss": 0.223, + "step": 6716 + }, + { + "epoch": 1.558287901635541, + "grad_norm": 23.188665968133286, + "learning_rate": 2e-06, + "loss": 0.2675, + "step": 6717 + }, + { + "epoch": 1.5585198932838418, + "grad_norm": 10.686962550001526, + "learning_rate": 2e-06, + "loss": 0.3214, + "step": 6718 + }, + { + "epoch": 1.5587518849321425, + "grad_norm": 17.718995290129744, + "learning_rate": 2e-06, + "loss": 0.2649, + "step": 6719 + }, + { + "epoch": 1.5589838765804431, + "grad_norm": 13.700206729941808, + "learning_rate": 2e-06, + "loss": 0.2116, + "step": 6720 + }, + { + "epoch": 1.5592158682287438, + "grad_norm": 13.688686894063217, + "learning_rate": 2e-06, + "loss": 0.2587, + "step": 6721 + }, + { + "epoch": 1.5594478598770443, + "grad_norm": 12.49552285498753, + "learning_rate": 2e-06, + "loss": 0.2303, + "step": 6722 + }, + { + "epoch": 1.559679851525345, + "grad_norm": 13.62322311967421, + "learning_rate": 2e-06, + "loss": 0.1915, + "step": 6723 + }, + { + "epoch": 1.5599118431736456, + "grad_norm": 7.434462107481903, + "learning_rate": 2e-06, + "loss": 0.1737, + "step": 6724 + }, + { + "epoch": 1.5601438348219463, + "grad_norm": 10.92478845203364, + "learning_rate": 2e-06, + "loss": 0.2139, + "step": 6725 + }, + { + "epoch": 1.560375826470247, + "grad_norm": 12.50872192630918, + "learning_rate": 2e-06, + "loss": 0.2493, + "step": 6726 + }, + { + "epoch": 1.5606078181185477, + "grad_norm": 14.669812825175278, + "learning_rate": 2e-06, + "loss": 0.2171, + "step": 6727 + }, + { + "epoch": 1.5608398097668483, + "grad_norm": 16.867602158489653, + "learning_rate": 2e-06, + "loss": 0.1857, + "step": 6728 + }, + { + "epoch": 1.561071801415149, + "grad_norm": 24.13353406032791, + "learning_rate": 2e-06, + "loss": 0.2582, + "step": 6729 + }, + { + "epoch": 1.5613037930634497, + "grad_norm": 11.22810564138021, + "learning_rate": 2e-06, + "loss": 0.1928, + "step": 6730 + }, + { + "epoch": 1.5615357847117504, + "grad_norm": 5.337300728209669, + "learning_rate": 2e-06, + "loss": 0.1356, + "step": 6731 + }, + { + "epoch": 1.561767776360051, + "grad_norm": 14.689968164162265, + "learning_rate": 2e-06, + "loss": 0.3578, + "step": 6732 + }, + { + "epoch": 1.5619997680083517, + "grad_norm": 11.167047902657862, + "learning_rate": 2e-06, + "loss": 0.1839, + "step": 6733 + }, + { + "epoch": 1.5622317596566524, + "grad_norm": 8.165761931642459, + "learning_rate": 2e-06, + "loss": 0.1836, + "step": 6734 + }, + { + "epoch": 1.562463751304953, + "grad_norm": 10.54991480371774, + "learning_rate": 2e-06, + "loss": 0.1719, + "step": 6735 + }, + { + "epoch": 1.5626957429532538, + "grad_norm": 11.442688510228646, + "learning_rate": 2e-06, + "loss": 0.2653, + "step": 6736 + }, + { + "epoch": 1.5629277346015544, + "grad_norm": 17.070029585225544, + "learning_rate": 2e-06, + "loss": 0.2521, + "step": 6737 + }, + { + "epoch": 1.563159726249855, + "grad_norm": 19.686280230299616, + "learning_rate": 2e-06, + "loss": 0.244, + "step": 6738 + }, + { + "epoch": 1.5633917178981558, + "grad_norm": 9.948167478079785, + "learning_rate": 2e-06, + "loss": 0.2483, + "step": 6739 + }, + { + "epoch": 1.5636237095464565, + "grad_norm": 24.922215627756376, + "learning_rate": 2e-06, + "loss": 0.2743, + "step": 6740 + }, + { + "epoch": 1.5638557011947571, + "grad_norm": 16.741152692172665, + "learning_rate": 2e-06, + "loss": 0.2465, + "step": 6741 + }, + { + "epoch": 1.5640876928430576, + "grad_norm": 10.47916098313398, + "learning_rate": 2e-06, + "loss": 0.2147, + "step": 6742 + }, + { + "epoch": 1.5643196844913583, + "grad_norm": 9.617663992594, + "learning_rate": 2e-06, + "loss": 0.1909, + "step": 6743 + }, + { + "epoch": 1.564551676139659, + "grad_norm": 11.671052205411605, + "learning_rate": 2e-06, + "loss": 0.2597, + "step": 6744 + }, + { + "epoch": 1.5647836677879596, + "grad_norm": 12.058931258856934, + "learning_rate": 2e-06, + "loss": 0.2216, + "step": 6745 + }, + { + "epoch": 1.5650156594362603, + "grad_norm": 21.409795579059104, + "learning_rate": 2e-06, + "loss": 0.3596, + "step": 6746 + }, + { + "epoch": 1.565247651084561, + "grad_norm": 11.764103802660122, + "learning_rate": 2e-06, + "loss": 0.1823, + "step": 6747 + }, + { + "epoch": 1.5654796427328617, + "grad_norm": 9.706993728012272, + "learning_rate": 2e-06, + "loss": 0.169, + "step": 6748 + }, + { + "epoch": 1.5657116343811621, + "grad_norm": 15.599357074918032, + "learning_rate": 2e-06, + "loss": 0.342, + "step": 6749 + }, + { + "epoch": 1.5659436260294628, + "grad_norm": 20.41002666037572, + "learning_rate": 2e-06, + "loss": 0.2836, + "step": 6750 + }, + { + "epoch": 1.5661756176777635, + "grad_norm": 15.460943544867607, + "learning_rate": 2e-06, + "loss": 0.3194, + "step": 6751 + }, + { + "epoch": 1.5664076093260642, + "grad_norm": 18.23070983470633, + "learning_rate": 2e-06, + "loss": 0.2515, + "step": 6752 + }, + { + "epoch": 1.5666396009743648, + "grad_norm": 24.89491573709899, + "learning_rate": 2e-06, + "loss": 0.2149, + "step": 6753 + }, + { + "epoch": 1.5668715926226655, + "grad_norm": 10.42731457408122, + "learning_rate": 2e-06, + "loss": 0.1346, + "step": 6754 + }, + { + "epoch": 1.5671035842709662, + "grad_norm": 26.145121796832107, + "learning_rate": 2e-06, + "loss": 0.3824, + "step": 6755 + }, + { + "epoch": 1.5673355759192669, + "grad_norm": 19.222209564955683, + "learning_rate": 2e-06, + "loss": 0.2588, + "step": 6756 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 10.470769858587463, + "learning_rate": 2e-06, + "loss": 0.2401, + "step": 6757 + }, + { + "epoch": 1.5677995592158682, + "grad_norm": 11.749844689977595, + "learning_rate": 2e-06, + "loss": 0.1865, + "step": 6758 + }, + { + "epoch": 1.568031550864169, + "grad_norm": 24.061754616113973, + "learning_rate": 2e-06, + "loss": 0.3668, + "step": 6759 + }, + { + "epoch": 1.5682635425124696, + "grad_norm": 14.476040215882161, + "learning_rate": 2e-06, + "loss": 0.2639, + "step": 6760 + }, + { + "epoch": 1.5684955341607703, + "grad_norm": 19.156412552986147, + "learning_rate": 2e-06, + "loss": 0.2453, + "step": 6761 + }, + { + "epoch": 1.568727525809071, + "grad_norm": 9.392538890429808, + "learning_rate": 2e-06, + "loss": 0.207, + "step": 6762 + }, + { + "epoch": 1.5689595174573716, + "grad_norm": 6.850823669102928, + "learning_rate": 2e-06, + "loss": 0.145, + "step": 6763 + }, + { + "epoch": 1.5691915091056723, + "grad_norm": 13.466344294898505, + "learning_rate": 2e-06, + "loss": 0.2537, + "step": 6764 + }, + { + "epoch": 1.569423500753973, + "grad_norm": 15.669892855674025, + "learning_rate": 2e-06, + "loss": 0.2211, + "step": 6765 + }, + { + "epoch": 1.5696554924022736, + "grad_norm": 16.398892981894086, + "learning_rate": 2e-06, + "loss": 0.2775, + "step": 6766 + }, + { + "epoch": 1.5698874840505743, + "grad_norm": 15.11369594798255, + "learning_rate": 2e-06, + "loss": 0.2086, + "step": 6767 + }, + { + "epoch": 1.570119475698875, + "grad_norm": 16.09216950836462, + "learning_rate": 2e-06, + "loss": 0.2734, + "step": 6768 + }, + { + "epoch": 1.5703514673471755, + "grad_norm": 11.456504129507978, + "learning_rate": 2e-06, + "loss": 0.2696, + "step": 6769 + }, + { + "epoch": 1.5705834589954761, + "grad_norm": 31.295646221623393, + "learning_rate": 2e-06, + "loss": 0.2382, + "step": 6770 + }, + { + "epoch": 1.5708154506437768, + "grad_norm": 14.91478972462344, + "learning_rate": 2e-06, + "loss": 0.2604, + "step": 6771 + }, + { + "epoch": 1.5710474422920775, + "grad_norm": 14.084946879586926, + "learning_rate": 2e-06, + "loss": 0.2746, + "step": 6772 + }, + { + "epoch": 1.5712794339403782, + "grad_norm": 7.254192390967452, + "learning_rate": 2e-06, + "loss": 0.1817, + "step": 6773 + }, + { + "epoch": 1.5715114255886788, + "grad_norm": 21.57021793936699, + "learning_rate": 2e-06, + "loss": 0.287, + "step": 6774 + }, + { + "epoch": 1.5717434172369795, + "grad_norm": 21.625281378712074, + "learning_rate": 2e-06, + "loss": 0.2452, + "step": 6775 + }, + { + "epoch": 1.57197540888528, + "grad_norm": 14.467275186899649, + "learning_rate": 2e-06, + "loss": 0.2274, + "step": 6776 + }, + { + "epoch": 1.5722074005335807, + "grad_norm": 8.852228895085334, + "learning_rate": 2e-06, + "loss": 0.1996, + "step": 6777 + }, + { + "epoch": 1.5724393921818813, + "grad_norm": 8.286774206475272, + "learning_rate": 2e-06, + "loss": 0.187, + "step": 6778 + }, + { + "epoch": 1.572671383830182, + "grad_norm": 14.316350630272812, + "learning_rate": 2e-06, + "loss": 0.2814, + "step": 6779 + }, + { + "epoch": 1.5729033754784827, + "grad_norm": 12.459751760967073, + "learning_rate": 2e-06, + "loss": 0.2404, + "step": 6780 + }, + { + "epoch": 1.5731353671267834, + "grad_norm": 7.701203511096362, + "learning_rate": 2e-06, + "loss": 0.1813, + "step": 6781 + }, + { + "epoch": 1.573367358775084, + "grad_norm": 15.65163803258445, + "learning_rate": 2e-06, + "loss": 0.3066, + "step": 6782 + }, + { + "epoch": 1.5735993504233847, + "grad_norm": 11.415272637022873, + "learning_rate": 2e-06, + "loss": 0.1425, + "step": 6783 + }, + { + "epoch": 1.5738313420716854, + "grad_norm": 19.358539152020647, + "learning_rate": 2e-06, + "loss": 0.3298, + "step": 6784 + }, + { + "epoch": 1.574063333719986, + "grad_norm": 12.680294495610841, + "learning_rate": 2e-06, + "loss": 0.2396, + "step": 6785 + }, + { + "epoch": 1.5742953253682868, + "grad_norm": 19.26119166215534, + "learning_rate": 2e-06, + "loss": 0.2845, + "step": 6786 + }, + { + "epoch": 1.5745273170165874, + "grad_norm": 43.35671069330195, + "learning_rate": 2e-06, + "loss": 0.3221, + "step": 6787 + }, + { + "epoch": 1.5747593086648881, + "grad_norm": 25.079014100953824, + "learning_rate": 2e-06, + "loss": 0.3231, + "step": 6788 + }, + { + "epoch": 1.5749913003131888, + "grad_norm": 25.518585287352856, + "learning_rate": 2e-06, + "loss": 0.335, + "step": 6789 + }, + { + "epoch": 1.5752232919614895, + "grad_norm": 8.677770134465073, + "learning_rate": 2e-06, + "loss": 0.2009, + "step": 6790 + }, + { + "epoch": 1.5754552836097901, + "grad_norm": 16.41414521651827, + "learning_rate": 2e-06, + "loss": 0.341, + "step": 6791 + }, + { + "epoch": 1.5756872752580908, + "grad_norm": 14.16484596317382, + "learning_rate": 2e-06, + "loss": 0.1712, + "step": 6792 + }, + { + "epoch": 1.5759192669063915, + "grad_norm": 31.22819956446191, + "learning_rate": 2e-06, + "loss": 0.3924, + "step": 6793 + }, + { + "epoch": 1.5761512585546922, + "grad_norm": 10.63781472409317, + "learning_rate": 2e-06, + "loss": 0.1474, + "step": 6794 + }, + { + "epoch": 1.5763832502029929, + "grad_norm": 15.8712959128847, + "learning_rate": 2e-06, + "loss": 0.2747, + "step": 6795 + }, + { + "epoch": 1.5766152418512933, + "grad_norm": 16.81316543278334, + "learning_rate": 2e-06, + "loss": 0.2064, + "step": 6796 + }, + { + "epoch": 1.576847233499594, + "grad_norm": 10.956852812678623, + "learning_rate": 2e-06, + "loss": 0.2405, + "step": 6797 + }, + { + "epoch": 1.5770792251478947, + "grad_norm": 10.47422890616429, + "learning_rate": 2e-06, + "loss": 0.2584, + "step": 6798 + }, + { + "epoch": 1.5773112167961953, + "grad_norm": 11.247698042368608, + "learning_rate": 2e-06, + "loss": 0.2368, + "step": 6799 + }, + { + "epoch": 1.577543208444496, + "grad_norm": 11.458253221758767, + "learning_rate": 2e-06, + "loss": 0.2327, + "step": 6800 + }, + { + "epoch": 1.5777752000927967, + "grad_norm": 8.372281579949735, + "learning_rate": 2e-06, + "loss": 0.2589, + "step": 6801 + }, + { + "epoch": 1.5780071917410972, + "grad_norm": 23.002950702220993, + "learning_rate": 2e-06, + "loss": 0.2453, + "step": 6802 + }, + { + "epoch": 1.5782391833893978, + "grad_norm": 8.10206962011327, + "learning_rate": 2e-06, + "loss": 0.1706, + "step": 6803 + }, + { + "epoch": 1.5784711750376985, + "grad_norm": 5.549188344327498, + "learning_rate": 2e-06, + "loss": 0.1771, + "step": 6804 + }, + { + "epoch": 1.5787031666859992, + "grad_norm": 14.578356313178427, + "learning_rate": 2e-06, + "loss": 0.2663, + "step": 6805 + }, + { + "epoch": 1.5789351583342999, + "grad_norm": 10.42207019577819, + "learning_rate": 2e-06, + "loss": 0.2595, + "step": 6806 + }, + { + "epoch": 1.5791671499826005, + "grad_norm": 6.864112094691692, + "learning_rate": 2e-06, + "loss": 0.1364, + "step": 6807 + }, + { + "epoch": 1.5793991416309012, + "grad_norm": 4.728977241944988, + "learning_rate": 2e-06, + "loss": 0.1352, + "step": 6808 + }, + { + "epoch": 1.579631133279202, + "grad_norm": 13.672761713026164, + "learning_rate": 2e-06, + "loss": 0.2204, + "step": 6809 + }, + { + "epoch": 1.5798631249275026, + "grad_norm": 17.40182794118163, + "learning_rate": 2e-06, + "loss": 0.3978, + "step": 6810 + }, + { + "epoch": 1.5800951165758033, + "grad_norm": 15.13729096710601, + "learning_rate": 2e-06, + "loss": 0.1737, + "step": 6811 + }, + { + "epoch": 1.580327108224104, + "grad_norm": 21.127192170719233, + "learning_rate": 2e-06, + "loss": 0.5297, + "step": 6812 + }, + { + "epoch": 1.5805590998724046, + "grad_norm": 16.860482121408943, + "learning_rate": 2e-06, + "loss": 0.1868, + "step": 6813 + }, + { + "epoch": 1.5807910915207053, + "grad_norm": 10.21009934750211, + "learning_rate": 2e-06, + "loss": 0.2236, + "step": 6814 + }, + { + "epoch": 1.581023083169006, + "grad_norm": 15.657316076566811, + "learning_rate": 2e-06, + "loss": 0.2979, + "step": 6815 + }, + { + "epoch": 1.5812550748173066, + "grad_norm": 15.242342987174924, + "learning_rate": 2e-06, + "loss": 0.3382, + "step": 6816 + }, + { + "epoch": 1.5814870664656073, + "grad_norm": 8.972983513729012, + "learning_rate": 2e-06, + "loss": 0.1847, + "step": 6817 + }, + { + "epoch": 1.581719058113908, + "grad_norm": 8.861874703597897, + "learning_rate": 2e-06, + "loss": 0.2629, + "step": 6818 + }, + { + "epoch": 1.5819510497622087, + "grad_norm": 11.282644462145537, + "learning_rate": 2e-06, + "loss": 0.2448, + "step": 6819 + }, + { + "epoch": 1.5821830414105094, + "grad_norm": 16.643703301291083, + "learning_rate": 2e-06, + "loss": 0.218, + "step": 6820 + }, + { + "epoch": 1.58241503305881, + "grad_norm": 21.338207571831255, + "learning_rate": 2e-06, + "loss": 0.3162, + "step": 6821 + }, + { + "epoch": 1.5826470247071105, + "grad_norm": 10.456117076657767, + "learning_rate": 2e-06, + "loss": 0.1931, + "step": 6822 + }, + { + "epoch": 1.5828790163554112, + "grad_norm": 20.253880071565263, + "learning_rate": 2e-06, + "loss": 0.3495, + "step": 6823 + }, + { + "epoch": 1.5831110080037119, + "grad_norm": 17.609409195262224, + "learning_rate": 2e-06, + "loss": 0.3101, + "step": 6824 + }, + { + "epoch": 1.5833429996520125, + "grad_norm": 14.320650363686498, + "learning_rate": 2e-06, + "loss": 0.2719, + "step": 6825 + }, + { + "epoch": 1.5835749913003132, + "grad_norm": 18.748152428299427, + "learning_rate": 2e-06, + "loss": 0.2974, + "step": 6826 + }, + { + "epoch": 1.5838069829486139, + "grad_norm": 9.52741203825633, + "learning_rate": 2e-06, + "loss": 0.1331, + "step": 6827 + }, + { + "epoch": 1.5840389745969146, + "grad_norm": 18.27852816483889, + "learning_rate": 2e-06, + "loss": 0.3419, + "step": 6828 + }, + { + "epoch": 1.584270966245215, + "grad_norm": 9.530308694977986, + "learning_rate": 2e-06, + "loss": 0.1755, + "step": 6829 + }, + { + "epoch": 1.5845029578935157, + "grad_norm": 16.07479575909688, + "learning_rate": 2e-06, + "loss": 0.2276, + "step": 6830 + }, + { + "epoch": 1.5847349495418164, + "grad_norm": 5.195872495498322, + "learning_rate": 2e-06, + "loss": 0.1703, + "step": 6831 + }, + { + "epoch": 1.584966941190117, + "grad_norm": 9.3011970800669, + "learning_rate": 2e-06, + "loss": 0.2328, + "step": 6832 + }, + { + "epoch": 1.5851989328384177, + "grad_norm": 11.992368937834383, + "learning_rate": 2e-06, + "loss": 0.2196, + "step": 6833 + }, + { + "epoch": 1.5854309244867184, + "grad_norm": 8.603011392573826, + "learning_rate": 2e-06, + "loss": 0.2124, + "step": 6834 + }, + { + "epoch": 1.585662916135019, + "grad_norm": 12.27925572845166, + "learning_rate": 2e-06, + "loss": 0.243, + "step": 6835 + }, + { + "epoch": 1.5858949077833198, + "grad_norm": 17.79861330852824, + "learning_rate": 2e-06, + "loss": 0.2526, + "step": 6836 + }, + { + "epoch": 1.5861268994316204, + "grad_norm": 8.642526433758599, + "learning_rate": 2e-06, + "loss": 0.1874, + "step": 6837 + }, + { + "epoch": 1.5863588910799211, + "grad_norm": 7.819458952164534, + "learning_rate": 2e-06, + "loss": 0.172, + "step": 6838 + }, + { + "epoch": 1.5865908827282218, + "grad_norm": 14.04549995988459, + "learning_rate": 2e-06, + "loss": 0.2023, + "step": 6839 + }, + { + "epoch": 1.5868228743765225, + "grad_norm": 9.58691437742038, + "learning_rate": 2e-06, + "loss": 0.1658, + "step": 6840 + }, + { + "epoch": 1.5870548660248232, + "grad_norm": 7.415634246482783, + "learning_rate": 2e-06, + "loss": 0.1984, + "step": 6841 + }, + { + "epoch": 1.5872868576731238, + "grad_norm": 7.101726327086242, + "learning_rate": 2e-06, + "loss": 0.1825, + "step": 6842 + }, + { + "epoch": 1.5875188493214245, + "grad_norm": 7.409833020702544, + "learning_rate": 2e-06, + "loss": 0.173, + "step": 6843 + }, + { + "epoch": 1.5877508409697252, + "grad_norm": 5.643831335022831, + "learning_rate": 2e-06, + "loss": 0.1592, + "step": 6844 + }, + { + "epoch": 1.5879828326180259, + "grad_norm": 12.266503536628118, + "learning_rate": 2e-06, + "loss": 0.266, + "step": 6845 + }, + { + "epoch": 1.5882148242663265, + "grad_norm": 17.78772424839602, + "learning_rate": 2e-06, + "loss": 0.3874, + "step": 6846 + }, + { + "epoch": 1.5884468159146272, + "grad_norm": 15.623174768233348, + "learning_rate": 2e-06, + "loss": 0.2541, + "step": 6847 + }, + { + "epoch": 1.588678807562928, + "grad_norm": 14.267785946840403, + "learning_rate": 2e-06, + "loss": 0.3315, + "step": 6848 + }, + { + "epoch": 1.5889107992112284, + "grad_norm": 25.090609601792437, + "learning_rate": 2e-06, + "loss": 0.1686, + "step": 6849 + }, + { + "epoch": 1.589142790859529, + "grad_norm": 10.182624698844203, + "learning_rate": 2e-06, + "loss": 0.1973, + "step": 6850 + }, + { + "epoch": 1.5893747825078297, + "grad_norm": 16.37330122299087, + "learning_rate": 2e-06, + "loss": 0.2241, + "step": 6851 + }, + { + "epoch": 1.5896067741561304, + "grad_norm": 13.459030559524912, + "learning_rate": 2e-06, + "loss": 0.2656, + "step": 6852 + }, + { + "epoch": 1.589838765804431, + "grad_norm": 18.64639791565411, + "learning_rate": 2e-06, + "loss": 0.2362, + "step": 6853 + }, + { + "epoch": 1.5900707574527317, + "grad_norm": 8.167161131211438, + "learning_rate": 2e-06, + "loss": 0.2205, + "step": 6854 + }, + { + "epoch": 1.5903027491010322, + "grad_norm": 12.041969699634368, + "learning_rate": 2e-06, + "loss": 0.2306, + "step": 6855 + }, + { + "epoch": 1.5905347407493329, + "grad_norm": 14.858497088147926, + "learning_rate": 2e-06, + "loss": 0.2603, + "step": 6856 + }, + { + "epoch": 1.5907667323976336, + "grad_norm": 12.226650589555664, + "learning_rate": 2e-06, + "loss": 0.2172, + "step": 6857 + }, + { + "epoch": 1.5909987240459342, + "grad_norm": 9.08808966600926, + "learning_rate": 2e-06, + "loss": 0.1842, + "step": 6858 + }, + { + "epoch": 1.591230715694235, + "grad_norm": 13.717394027158436, + "learning_rate": 2e-06, + "loss": 0.2377, + "step": 6859 + }, + { + "epoch": 1.5914627073425356, + "grad_norm": 13.078249717134886, + "learning_rate": 2e-06, + "loss": 0.2417, + "step": 6860 + }, + { + "epoch": 1.5916946989908363, + "grad_norm": 28.056304082891295, + "learning_rate": 2e-06, + "loss": 0.2097, + "step": 6861 + }, + { + "epoch": 1.591926690639137, + "grad_norm": 14.645870930917718, + "learning_rate": 2e-06, + "loss": 0.1665, + "step": 6862 + }, + { + "epoch": 1.5921586822874376, + "grad_norm": 15.310789998713801, + "learning_rate": 2e-06, + "loss": 0.3082, + "step": 6863 + }, + { + "epoch": 1.5923906739357383, + "grad_norm": 20.451507985504687, + "learning_rate": 2e-06, + "loss": 0.2574, + "step": 6864 + }, + { + "epoch": 1.592622665584039, + "grad_norm": 21.807670305901297, + "learning_rate": 2e-06, + "loss": 0.3035, + "step": 6865 + }, + { + "epoch": 1.5928546572323397, + "grad_norm": 19.895841208070944, + "learning_rate": 2e-06, + "loss": 0.2902, + "step": 6866 + }, + { + "epoch": 1.5930866488806403, + "grad_norm": 8.457046599193541, + "learning_rate": 2e-06, + "loss": 0.2041, + "step": 6867 + }, + { + "epoch": 1.593318640528941, + "grad_norm": 22.47335248001497, + "learning_rate": 2e-06, + "loss": 0.4859, + "step": 6868 + }, + { + "epoch": 1.5935506321772417, + "grad_norm": 6.685828572757369, + "learning_rate": 2e-06, + "loss": 0.1665, + "step": 6869 + }, + { + "epoch": 1.5937826238255424, + "grad_norm": 12.58042735255098, + "learning_rate": 2e-06, + "loss": 0.1601, + "step": 6870 + }, + { + "epoch": 1.594014615473843, + "grad_norm": 22.317484002899555, + "learning_rate": 2e-06, + "loss": 0.242, + "step": 6871 + }, + { + "epoch": 1.5942466071221437, + "grad_norm": 18.39607935176384, + "learning_rate": 2e-06, + "loss": 0.3981, + "step": 6872 + }, + { + "epoch": 1.5944785987704444, + "grad_norm": 10.360346744792217, + "learning_rate": 2e-06, + "loss": 0.1472, + "step": 6873 + }, + { + "epoch": 1.594710590418745, + "grad_norm": 11.671580147838984, + "learning_rate": 2e-06, + "loss": 0.2312, + "step": 6874 + }, + { + "epoch": 1.5949425820670455, + "grad_norm": 8.029919131678305, + "learning_rate": 2e-06, + "loss": 0.234, + "step": 6875 + }, + { + "epoch": 1.5951745737153462, + "grad_norm": 14.454042603280314, + "learning_rate": 2e-06, + "loss": 0.2537, + "step": 6876 + }, + { + "epoch": 1.595406565363647, + "grad_norm": 14.486299136318895, + "learning_rate": 2e-06, + "loss": 0.2426, + "step": 6877 + }, + { + "epoch": 1.5956385570119476, + "grad_norm": 14.517852619599177, + "learning_rate": 2e-06, + "loss": 0.2604, + "step": 6878 + }, + { + "epoch": 1.5958705486602482, + "grad_norm": 13.24440098634251, + "learning_rate": 2e-06, + "loss": 0.2915, + "step": 6879 + }, + { + "epoch": 1.596102540308549, + "grad_norm": 12.914902414937012, + "learning_rate": 2e-06, + "loss": 0.3206, + "step": 6880 + }, + { + "epoch": 1.5963345319568496, + "grad_norm": 12.885798355302601, + "learning_rate": 2e-06, + "loss": 0.1833, + "step": 6881 + }, + { + "epoch": 1.59656652360515, + "grad_norm": 15.829499481850467, + "learning_rate": 2e-06, + "loss": 0.2901, + "step": 6882 + }, + { + "epoch": 1.5967985152534507, + "grad_norm": 11.329045094579913, + "learning_rate": 2e-06, + "loss": 0.241, + "step": 6883 + }, + { + "epoch": 1.5970305069017514, + "grad_norm": 14.853956669259542, + "learning_rate": 2e-06, + "loss": 0.2227, + "step": 6884 + }, + { + "epoch": 1.597262498550052, + "grad_norm": 9.383651932545247, + "learning_rate": 2e-06, + "loss": 0.1166, + "step": 6885 + }, + { + "epoch": 1.5974944901983528, + "grad_norm": 29.26111858454847, + "learning_rate": 2e-06, + "loss": 0.3973, + "step": 6886 + }, + { + "epoch": 1.5977264818466534, + "grad_norm": 6.7484712455436116, + "learning_rate": 2e-06, + "loss": 0.1273, + "step": 6887 + }, + { + "epoch": 1.5979584734949541, + "grad_norm": 10.15763033087749, + "learning_rate": 2e-06, + "loss": 0.271, + "step": 6888 + }, + { + "epoch": 1.5981904651432548, + "grad_norm": 17.35175331054969, + "learning_rate": 2e-06, + "loss": 0.4555, + "step": 6889 + }, + { + "epoch": 1.5984224567915555, + "grad_norm": 12.735975895446408, + "learning_rate": 2e-06, + "loss": 0.258, + "step": 6890 + }, + { + "epoch": 1.5986544484398562, + "grad_norm": 12.899543658492155, + "learning_rate": 2e-06, + "loss": 0.1864, + "step": 6891 + }, + { + "epoch": 1.5988864400881568, + "grad_norm": 12.698385568969032, + "learning_rate": 2e-06, + "loss": 0.1718, + "step": 6892 + }, + { + "epoch": 1.5991184317364575, + "grad_norm": 4.28535250927259, + "learning_rate": 2e-06, + "loss": 0.1266, + "step": 6893 + }, + { + "epoch": 1.5993504233847582, + "grad_norm": 15.392604216922926, + "learning_rate": 2e-06, + "loss": 0.2754, + "step": 6894 + }, + { + "epoch": 1.5995824150330589, + "grad_norm": 15.137874358989361, + "learning_rate": 2e-06, + "loss": 0.2634, + "step": 6895 + }, + { + "epoch": 1.5998144066813595, + "grad_norm": 15.254360087513062, + "learning_rate": 2e-06, + "loss": 0.3004, + "step": 6896 + }, + { + "epoch": 1.6000463983296602, + "grad_norm": 18.28465820969029, + "learning_rate": 2e-06, + "loss": 0.4048, + "step": 6897 + }, + { + "epoch": 1.600278389977961, + "grad_norm": 7.541889046152906, + "learning_rate": 2e-06, + "loss": 0.1286, + "step": 6898 + }, + { + "epoch": 1.6005103816262616, + "grad_norm": 14.525111372393546, + "learning_rate": 2e-06, + "loss": 0.1813, + "step": 6899 + }, + { + "epoch": 1.6007423732745623, + "grad_norm": 10.764190597901676, + "learning_rate": 2e-06, + "loss": 0.2426, + "step": 6900 + }, + { + "epoch": 1.600974364922863, + "grad_norm": 16.376203207837985, + "learning_rate": 2e-06, + "loss": 0.3211, + "step": 6901 + }, + { + "epoch": 1.6012063565711634, + "grad_norm": 8.72620445463533, + "learning_rate": 2e-06, + "loss": 0.1751, + "step": 6902 + }, + { + "epoch": 1.601438348219464, + "grad_norm": 15.942772043804196, + "learning_rate": 2e-06, + "loss": 0.2968, + "step": 6903 + }, + { + "epoch": 1.6016703398677647, + "grad_norm": 18.402605174063517, + "learning_rate": 2e-06, + "loss": 0.2, + "step": 6904 + }, + { + "epoch": 1.6019023315160654, + "grad_norm": 12.426769789870608, + "learning_rate": 2e-06, + "loss": 0.2583, + "step": 6905 + }, + { + "epoch": 1.602134323164366, + "grad_norm": 10.17124275196209, + "learning_rate": 2e-06, + "loss": 0.1767, + "step": 6906 + }, + { + "epoch": 1.6023663148126668, + "grad_norm": 20.138982762387695, + "learning_rate": 2e-06, + "loss": 0.301, + "step": 6907 + }, + { + "epoch": 1.6025983064609675, + "grad_norm": 14.742653023011412, + "learning_rate": 2e-06, + "loss": 0.2199, + "step": 6908 + }, + { + "epoch": 1.602830298109268, + "grad_norm": 25.703435637577172, + "learning_rate": 2e-06, + "loss": 0.3056, + "step": 6909 + }, + { + "epoch": 1.6030622897575686, + "grad_norm": 12.570448997393886, + "learning_rate": 2e-06, + "loss": 0.2229, + "step": 6910 + }, + { + "epoch": 1.6032942814058693, + "grad_norm": 7.55682126082453, + "learning_rate": 2e-06, + "loss": 0.1689, + "step": 6911 + }, + { + "epoch": 1.60352627305417, + "grad_norm": 13.806272086112495, + "learning_rate": 2e-06, + "loss": 0.2732, + "step": 6912 + }, + { + "epoch": 1.6037582647024706, + "grad_norm": 22.446697414205737, + "learning_rate": 2e-06, + "loss": 0.4215, + "step": 6913 + }, + { + "epoch": 1.6039902563507713, + "grad_norm": 17.32438262165776, + "learning_rate": 2e-06, + "loss": 0.2283, + "step": 6914 + }, + { + "epoch": 1.604222247999072, + "grad_norm": 23.79328231241564, + "learning_rate": 2e-06, + "loss": 0.3247, + "step": 6915 + }, + { + "epoch": 1.6044542396473727, + "grad_norm": 11.871426031777244, + "learning_rate": 2e-06, + "loss": 0.2369, + "step": 6916 + }, + { + "epoch": 1.6046862312956733, + "grad_norm": 12.60168201994963, + "learning_rate": 2e-06, + "loss": 0.2332, + "step": 6917 + }, + { + "epoch": 1.604918222943974, + "grad_norm": 14.391829705544197, + "learning_rate": 2e-06, + "loss": 0.2458, + "step": 6918 + }, + { + "epoch": 1.6051502145922747, + "grad_norm": 22.365046280164332, + "learning_rate": 2e-06, + "loss": 0.3598, + "step": 6919 + }, + { + "epoch": 1.6053822062405754, + "grad_norm": 9.21861265143294, + "learning_rate": 2e-06, + "loss": 0.1547, + "step": 6920 + }, + { + "epoch": 1.605614197888876, + "grad_norm": 17.501152537868485, + "learning_rate": 2e-06, + "loss": 0.241, + "step": 6921 + }, + { + "epoch": 1.6058461895371767, + "grad_norm": 13.363051800078775, + "learning_rate": 2e-06, + "loss": 0.2323, + "step": 6922 + }, + { + "epoch": 1.6060781811854774, + "grad_norm": 10.522078319379862, + "learning_rate": 2e-06, + "loss": 0.2682, + "step": 6923 + }, + { + "epoch": 1.606310172833778, + "grad_norm": 10.73490836853733, + "learning_rate": 2e-06, + "loss": 0.1414, + "step": 6924 + }, + { + "epoch": 1.6065421644820788, + "grad_norm": 8.652664717579718, + "learning_rate": 2e-06, + "loss": 0.2046, + "step": 6925 + }, + { + "epoch": 1.6067741561303794, + "grad_norm": 10.409928706645774, + "learning_rate": 2e-06, + "loss": 0.1589, + "step": 6926 + }, + { + "epoch": 1.6070061477786801, + "grad_norm": 27.671244480261826, + "learning_rate": 2e-06, + "loss": 0.3348, + "step": 6927 + }, + { + "epoch": 1.6072381394269808, + "grad_norm": 17.329101528036233, + "learning_rate": 2e-06, + "loss": 0.3062, + "step": 6928 + }, + { + "epoch": 1.6074701310752812, + "grad_norm": 14.260022102662832, + "learning_rate": 2e-06, + "loss": 0.2847, + "step": 6929 + }, + { + "epoch": 1.607702122723582, + "grad_norm": 34.96318462390881, + "learning_rate": 2e-06, + "loss": 0.4542, + "step": 6930 + }, + { + "epoch": 1.6079341143718826, + "grad_norm": 10.267362931793937, + "learning_rate": 2e-06, + "loss": 0.2643, + "step": 6931 + }, + { + "epoch": 1.6081661060201833, + "grad_norm": 17.83854286219526, + "learning_rate": 2e-06, + "loss": 0.2844, + "step": 6932 + }, + { + "epoch": 1.608398097668484, + "grad_norm": 15.649285578386333, + "learning_rate": 2e-06, + "loss": 0.2164, + "step": 6933 + }, + { + "epoch": 1.6086300893167846, + "grad_norm": 11.59450320536855, + "learning_rate": 2e-06, + "loss": 0.2121, + "step": 6934 + }, + { + "epoch": 1.608862080965085, + "grad_norm": 14.804743000417668, + "learning_rate": 2e-06, + "loss": 0.2726, + "step": 6935 + }, + { + "epoch": 1.6090940726133858, + "grad_norm": 27.013454990211674, + "learning_rate": 2e-06, + "loss": 0.2249, + "step": 6936 + }, + { + "epoch": 1.6093260642616865, + "grad_norm": 13.469542011658062, + "learning_rate": 2e-06, + "loss": 0.2222, + "step": 6937 + }, + { + "epoch": 1.6095580559099871, + "grad_norm": 6.0198227896653975, + "learning_rate": 2e-06, + "loss": 0.1241, + "step": 6938 + }, + { + "epoch": 1.6097900475582878, + "grad_norm": 15.411189902368585, + "learning_rate": 2e-06, + "loss": 0.2981, + "step": 6939 + }, + { + "epoch": 1.6100220392065885, + "grad_norm": 11.78771720357112, + "learning_rate": 2e-06, + "loss": 0.2158, + "step": 6940 + }, + { + "epoch": 1.6102540308548892, + "grad_norm": 22.74730256036679, + "learning_rate": 2e-06, + "loss": 0.3225, + "step": 6941 + }, + { + "epoch": 1.6104860225031898, + "grad_norm": 10.730879983272139, + "learning_rate": 2e-06, + "loss": 0.248, + "step": 6942 + }, + { + "epoch": 1.6107180141514905, + "grad_norm": 11.91366434944726, + "learning_rate": 2e-06, + "loss": 0.1315, + "step": 6943 + }, + { + "epoch": 1.6109500057997912, + "grad_norm": 13.894807052318972, + "learning_rate": 2e-06, + "loss": 0.2077, + "step": 6944 + }, + { + "epoch": 1.6111819974480919, + "grad_norm": 25.650238814998406, + "learning_rate": 2e-06, + "loss": 0.3684, + "step": 6945 + }, + { + "epoch": 1.6114139890963926, + "grad_norm": 20.577245785155775, + "learning_rate": 2e-06, + "loss": 0.3512, + "step": 6946 + }, + { + "epoch": 1.6116459807446932, + "grad_norm": 15.803228741453447, + "learning_rate": 2e-06, + "loss": 0.2321, + "step": 6947 + }, + { + "epoch": 1.611877972392994, + "grad_norm": 9.749549037933088, + "learning_rate": 2e-06, + "loss": 0.2645, + "step": 6948 + }, + { + "epoch": 1.6121099640412946, + "grad_norm": 19.299638527391338, + "learning_rate": 2e-06, + "loss": 0.2504, + "step": 6949 + }, + { + "epoch": 1.6123419556895953, + "grad_norm": 14.484103820826514, + "learning_rate": 2e-06, + "loss": 0.2341, + "step": 6950 + }, + { + "epoch": 1.612573947337896, + "grad_norm": 18.78291403294643, + "learning_rate": 2e-06, + "loss": 0.2847, + "step": 6951 + }, + { + "epoch": 1.6128059389861966, + "grad_norm": 11.665599286408527, + "learning_rate": 2e-06, + "loss": 0.1852, + "step": 6952 + }, + { + "epoch": 1.6130379306344973, + "grad_norm": 9.265971566495484, + "learning_rate": 2e-06, + "loss": 0.2277, + "step": 6953 + }, + { + "epoch": 1.613269922282798, + "grad_norm": 8.44029894218763, + "learning_rate": 2e-06, + "loss": 0.1743, + "step": 6954 + }, + { + "epoch": 1.6135019139310984, + "grad_norm": 13.715280174043034, + "learning_rate": 2e-06, + "loss": 0.2562, + "step": 6955 + }, + { + "epoch": 1.613733905579399, + "grad_norm": 9.560914543568476, + "learning_rate": 2e-06, + "loss": 0.1785, + "step": 6956 + }, + { + "epoch": 1.6139658972276998, + "grad_norm": 4.536975004892732, + "learning_rate": 2e-06, + "loss": 0.1289, + "step": 6957 + }, + { + "epoch": 1.6141978888760005, + "grad_norm": 9.108618213148045, + "learning_rate": 2e-06, + "loss": 0.2254, + "step": 6958 + }, + { + "epoch": 1.6144298805243011, + "grad_norm": 15.068677329144581, + "learning_rate": 2e-06, + "loss": 0.344, + "step": 6959 + }, + { + "epoch": 1.6146618721726018, + "grad_norm": 10.657346641654241, + "learning_rate": 2e-06, + "loss": 0.2706, + "step": 6960 + }, + { + "epoch": 1.6148938638209025, + "grad_norm": 10.836273054781074, + "learning_rate": 2e-06, + "loss": 0.2135, + "step": 6961 + }, + { + "epoch": 1.615125855469203, + "grad_norm": 15.3784691856773, + "learning_rate": 2e-06, + "loss": 0.2182, + "step": 6962 + }, + { + "epoch": 1.6153578471175036, + "grad_norm": 12.18357080868277, + "learning_rate": 2e-06, + "loss": 0.1148, + "step": 6963 + }, + { + "epoch": 1.6155898387658043, + "grad_norm": 16.523784611503427, + "learning_rate": 2e-06, + "loss": 0.2526, + "step": 6964 + }, + { + "epoch": 1.615821830414105, + "grad_norm": 13.457293895959346, + "learning_rate": 2e-06, + "loss": 0.2961, + "step": 6965 + }, + { + "epoch": 1.6160538220624057, + "grad_norm": 9.172409383148903, + "learning_rate": 2e-06, + "loss": 0.2673, + "step": 6966 + }, + { + "epoch": 1.6162858137107063, + "grad_norm": 10.592448387883387, + "learning_rate": 2e-06, + "loss": 0.2896, + "step": 6967 + }, + { + "epoch": 1.616517805359007, + "grad_norm": 8.043362460788911, + "learning_rate": 2e-06, + "loss": 0.2057, + "step": 6968 + }, + { + "epoch": 1.6167497970073077, + "grad_norm": 13.452763397776307, + "learning_rate": 2e-06, + "loss": 0.2028, + "step": 6969 + }, + { + "epoch": 1.6169817886556084, + "grad_norm": 11.889829935690594, + "learning_rate": 2e-06, + "loss": 0.3731, + "step": 6970 + }, + { + "epoch": 1.617213780303909, + "grad_norm": 15.55403517134958, + "learning_rate": 2e-06, + "loss": 0.2903, + "step": 6971 + }, + { + "epoch": 1.6174457719522097, + "grad_norm": 10.142859304319238, + "learning_rate": 2e-06, + "loss": 0.3078, + "step": 6972 + }, + { + "epoch": 1.6176777636005104, + "grad_norm": 22.015360469237347, + "learning_rate": 2e-06, + "loss": 0.3501, + "step": 6973 + }, + { + "epoch": 1.617909755248811, + "grad_norm": 16.099404708706622, + "learning_rate": 2e-06, + "loss": 0.2518, + "step": 6974 + }, + { + "epoch": 1.6181417468971118, + "grad_norm": 16.155557913431416, + "learning_rate": 2e-06, + "loss": 0.3623, + "step": 6975 + }, + { + "epoch": 1.6183737385454124, + "grad_norm": 12.094711432250437, + "learning_rate": 2e-06, + "loss": 0.228, + "step": 6976 + }, + { + "epoch": 1.6186057301937131, + "grad_norm": 9.624041117699276, + "learning_rate": 2e-06, + "loss": 0.2488, + "step": 6977 + }, + { + "epoch": 1.6188377218420138, + "grad_norm": 11.654084378361608, + "learning_rate": 2e-06, + "loss": 0.1419, + "step": 6978 + }, + { + "epoch": 1.6190697134903145, + "grad_norm": 10.785825490097595, + "learning_rate": 2e-06, + "loss": 0.2813, + "step": 6979 + }, + { + "epoch": 1.6193017051386152, + "grad_norm": 15.670027219809535, + "learning_rate": 2e-06, + "loss": 0.2212, + "step": 6980 + }, + { + "epoch": 1.6195336967869158, + "grad_norm": 10.37649965458519, + "learning_rate": 2e-06, + "loss": 0.3193, + "step": 6981 + }, + { + "epoch": 1.6197656884352163, + "grad_norm": 12.447543343424678, + "learning_rate": 2e-06, + "loss": 0.351, + "step": 6982 + }, + { + "epoch": 1.619997680083517, + "grad_norm": 20.356869900294978, + "learning_rate": 2e-06, + "loss": 0.3829, + "step": 6983 + }, + { + "epoch": 1.6202296717318176, + "grad_norm": 6.864750951398252, + "learning_rate": 2e-06, + "loss": 0.1783, + "step": 6984 + }, + { + "epoch": 1.6204616633801183, + "grad_norm": 12.418089662331026, + "learning_rate": 2e-06, + "loss": 0.3723, + "step": 6985 + }, + { + "epoch": 1.620693655028419, + "grad_norm": 12.540617596943656, + "learning_rate": 2e-06, + "loss": 0.3026, + "step": 6986 + }, + { + "epoch": 1.6209256466767197, + "grad_norm": 19.082797466653933, + "learning_rate": 2e-06, + "loss": 0.4161, + "step": 6987 + }, + { + "epoch": 1.6211576383250201, + "grad_norm": 7.163325569917312, + "learning_rate": 2e-06, + "loss": 0.1496, + "step": 6988 + }, + { + "epoch": 1.6213896299733208, + "grad_norm": 7.111859392755355, + "learning_rate": 2e-06, + "loss": 0.1597, + "step": 6989 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 19.57450811713846, + "learning_rate": 2e-06, + "loss": 0.2889, + "step": 6990 + }, + { + "epoch": 1.6218536132699222, + "grad_norm": 15.036470054451497, + "learning_rate": 2e-06, + "loss": 0.2912, + "step": 6991 + }, + { + "epoch": 1.6220856049182228, + "grad_norm": 15.753907717110106, + "learning_rate": 2e-06, + "loss": 0.1838, + "step": 6992 + }, + { + "epoch": 1.6223175965665235, + "grad_norm": 13.527695512482605, + "learning_rate": 2e-06, + "loss": 0.2222, + "step": 6993 + }, + { + "epoch": 1.6225495882148242, + "grad_norm": 14.951485530049725, + "learning_rate": 2e-06, + "loss": 0.4262, + "step": 6994 + }, + { + "epoch": 1.6227815798631249, + "grad_norm": 10.88914474498736, + "learning_rate": 2e-06, + "loss": 0.2445, + "step": 6995 + }, + { + "epoch": 1.6230135715114256, + "grad_norm": 7.7800678868379265, + "learning_rate": 2e-06, + "loss": 0.1926, + "step": 6996 + }, + { + "epoch": 1.6232455631597262, + "grad_norm": 8.47913440262279, + "learning_rate": 2e-06, + "loss": 0.1662, + "step": 6997 + }, + { + "epoch": 1.623477554808027, + "grad_norm": 17.86686445889657, + "learning_rate": 2e-06, + "loss": 0.3313, + "step": 6998 + }, + { + "epoch": 1.6237095464563276, + "grad_norm": 7.766073380740134, + "learning_rate": 2e-06, + "loss": 0.1742, + "step": 6999 + }, + { + "epoch": 1.6239415381046283, + "grad_norm": 15.802998754377214, + "learning_rate": 2e-06, + "loss": 0.3292, + "step": 7000 + }, + { + "epoch": 1.624173529752929, + "grad_norm": 12.543459151492442, + "learning_rate": 2e-06, + "loss": 0.254, + "step": 7001 + }, + { + "epoch": 1.6244055214012296, + "grad_norm": 13.798966596523735, + "learning_rate": 2e-06, + "loss": 0.2806, + "step": 7002 + }, + { + "epoch": 1.6246375130495303, + "grad_norm": 14.669498102811584, + "learning_rate": 2e-06, + "loss": 0.1382, + "step": 7003 + }, + { + "epoch": 1.624869504697831, + "grad_norm": 18.572808451157982, + "learning_rate": 2e-06, + "loss": 0.2693, + "step": 7004 + }, + { + "epoch": 1.6251014963461317, + "grad_norm": 16.05801422771798, + "learning_rate": 2e-06, + "loss": 0.2689, + "step": 7005 + }, + { + "epoch": 1.6253334879944323, + "grad_norm": 6.454360588304225, + "learning_rate": 2e-06, + "loss": 0.1544, + "step": 7006 + }, + { + "epoch": 1.625565479642733, + "grad_norm": 9.589180560376818, + "learning_rate": 2e-06, + "loss": 0.2748, + "step": 7007 + }, + { + "epoch": 1.6257974712910335, + "grad_norm": 14.671821741588948, + "learning_rate": 2e-06, + "loss": 0.2478, + "step": 7008 + }, + { + "epoch": 1.6260294629393341, + "grad_norm": 22.87668683869851, + "learning_rate": 2e-06, + "loss": 0.2541, + "step": 7009 + }, + { + "epoch": 1.6262614545876348, + "grad_norm": 13.872086745553338, + "learning_rate": 2e-06, + "loss": 0.3123, + "step": 7010 + }, + { + "epoch": 1.6264934462359355, + "grad_norm": 22.842902471747887, + "learning_rate": 2e-06, + "loss": 0.2549, + "step": 7011 + }, + { + "epoch": 1.6267254378842362, + "grad_norm": 15.858015016072699, + "learning_rate": 2e-06, + "loss": 0.3301, + "step": 7012 + }, + { + "epoch": 1.6269574295325369, + "grad_norm": 17.20531113729456, + "learning_rate": 2e-06, + "loss": 0.3237, + "step": 7013 + }, + { + "epoch": 1.6271894211808375, + "grad_norm": 16.205524427264955, + "learning_rate": 2e-06, + "loss": 0.2433, + "step": 7014 + }, + { + "epoch": 1.627421412829138, + "grad_norm": 15.422387385622043, + "learning_rate": 2e-06, + "loss": 0.4239, + "step": 7015 + }, + { + "epoch": 1.6276534044774387, + "grad_norm": 16.60742013346518, + "learning_rate": 2e-06, + "loss": 0.3804, + "step": 7016 + }, + { + "epoch": 1.6278853961257393, + "grad_norm": 13.290142696386331, + "learning_rate": 2e-06, + "loss": 0.2607, + "step": 7017 + }, + { + "epoch": 1.62811738777404, + "grad_norm": 14.580299839319386, + "learning_rate": 2e-06, + "loss": 0.3681, + "step": 7018 + }, + { + "epoch": 1.6283493794223407, + "grad_norm": 11.730455808829811, + "learning_rate": 2e-06, + "loss": 0.2022, + "step": 7019 + }, + { + "epoch": 1.6285813710706414, + "grad_norm": 12.826274722124499, + "learning_rate": 2e-06, + "loss": 0.2255, + "step": 7020 + }, + { + "epoch": 1.628813362718942, + "grad_norm": 12.43214970101332, + "learning_rate": 2e-06, + "loss": 0.2796, + "step": 7021 + }, + { + "epoch": 1.6290453543672427, + "grad_norm": 14.440187541395591, + "learning_rate": 2e-06, + "loss": 0.4325, + "step": 7022 + }, + { + "epoch": 1.6292773460155434, + "grad_norm": 9.727986359501164, + "learning_rate": 2e-06, + "loss": 0.2083, + "step": 7023 + }, + { + "epoch": 1.629509337663844, + "grad_norm": 15.501284501084388, + "learning_rate": 2e-06, + "loss": 0.2339, + "step": 7024 + }, + { + "epoch": 1.6297413293121448, + "grad_norm": 13.762739971342741, + "learning_rate": 2e-06, + "loss": 0.2757, + "step": 7025 + }, + { + "epoch": 1.6299733209604454, + "grad_norm": 12.265472728748575, + "learning_rate": 2e-06, + "loss": 0.3016, + "step": 7026 + }, + { + "epoch": 1.6302053126087461, + "grad_norm": 15.96070196913894, + "learning_rate": 2e-06, + "loss": 0.2472, + "step": 7027 + }, + { + "epoch": 1.6304373042570468, + "grad_norm": 14.095882691910807, + "learning_rate": 2e-06, + "loss": 0.2394, + "step": 7028 + }, + { + "epoch": 1.6306692959053475, + "grad_norm": 16.54000688770342, + "learning_rate": 2e-06, + "loss": 0.2738, + "step": 7029 + }, + { + "epoch": 1.6309012875536482, + "grad_norm": 11.581800441529591, + "learning_rate": 2e-06, + "loss": 0.2248, + "step": 7030 + }, + { + "epoch": 1.6311332792019488, + "grad_norm": 10.634740006896134, + "learning_rate": 2e-06, + "loss": 0.1975, + "step": 7031 + }, + { + "epoch": 1.6313652708502495, + "grad_norm": 5.71974278126696, + "learning_rate": 2e-06, + "loss": 0.1228, + "step": 7032 + }, + { + "epoch": 1.6315972624985502, + "grad_norm": 11.52002459240929, + "learning_rate": 2e-06, + "loss": 0.1986, + "step": 7033 + }, + { + "epoch": 1.6318292541468509, + "grad_norm": 12.300589751455139, + "learning_rate": 2e-06, + "loss": 0.2178, + "step": 7034 + }, + { + "epoch": 1.6320612457951513, + "grad_norm": 11.600142147918511, + "learning_rate": 2e-06, + "loss": 0.2138, + "step": 7035 + }, + { + "epoch": 1.632293237443452, + "grad_norm": 10.474970068593255, + "learning_rate": 2e-06, + "loss": 0.2222, + "step": 7036 + }, + { + "epoch": 1.6325252290917527, + "grad_norm": 19.9060475508325, + "learning_rate": 2e-06, + "loss": 0.2173, + "step": 7037 + }, + { + "epoch": 1.6327572207400534, + "grad_norm": 11.184774219572693, + "learning_rate": 2e-06, + "loss": 0.1366, + "step": 7038 + }, + { + "epoch": 1.632989212388354, + "grad_norm": 11.245052817363465, + "learning_rate": 2e-06, + "loss": 0.1932, + "step": 7039 + }, + { + "epoch": 1.6332212040366547, + "grad_norm": 13.971578345151503, + "learning_rate": 2e-06, + "loss": 0.3935, + "step": 7040 + }, + { + "epoch": 1.6334531956849554, + "grad_norm": 10.115989514407806, + "learning_rate": 2e-06, + "loss": 0.2755, + "step": 7041 + }, + { + "epoch": 1.6336851873332559, + "grad_norm": 9.736157752300581, + "learning_rate": 2e-06, + "loss": 0.1958, + "step": 7042 + }, + { + "epoch": 1.6339171789815565, + "grad_norm": 12.293691083260265, + "learning_rate": 2e-06, + "loss": 0.2329, + "step": 7043 + }, + { + "epoch": 1.6341491706298572, + "grad_norm": 10.391037532897096, + "learning_rate": 2e-06, + "loss": 0.1948, + "step": 7044 + }, + { + "epoch": 1.6343811622781579, + "grad_norm": 17.20398074920175, + "learning_rate": 2e-06, + "loss": 0.3274, + "step": 7045 + }, + { + "epoch": 1.6346131539264586, + "grad_norm": 26.074947278089915, + "learning_rate": 2e-06, + "loss": 0.3111, + "step": 7046 + }, + { + "epoch": 1.6348451455747592, + "grad_norm": 12.340585075621851, + "learning_rate": 2e-06, + "loss": 0.337, + "step": 7047 + }, + { + "epoch": 1.63507713722306, + "grad_norm": 11.670255160442114, + "learning_rate": 2e-06, + "loss": 0.2644, + "step": 7048 + }, + { + "epoch": 1.6353091288713606, + "grad_norm": 10.342650552953074, + "learning_rate": 2e-06, + "loss": 0.2224, + "step": 7049 + }, + { + "epoch": 1.6355411205196613, + "grad_norm": 9.819092085974871, + "learning_rate": 2e-06, + "loss": 0.2415, + "step": 7050 + }, + { + "epoch": 1.635773112167962, + "grad_norm": 11.559798052526743, + "learning_rate": 2e-06, + "loss": 0.2196, + "step": 7051 + }, + { + "epoch": 1.6360051038162626, + "grad_norm": 16.251447881217928, + "learning_rate": 2e-06, + "loss": 0.3706, + "step": 7052 + }, + { + "epoch": 1.6362370954645633, + "grad_norm": 18.046939192717193, + "learning_rate": 2e-06, + "loss": 0.3217, + "step": 7053 + }, + { + "epoch": 1.636469087112864, + "grad_norm": 5.2898743743472645, + "learning_rate": 2e-06, + "loss": 0.1283, + "step": 7054 + }, + { + "epoch": 1.6367010787611647, + "grad_norm": 14.568451000162517, + "learning_rate": 2e-06, + "loss": 0.3354, + "step": 7055 + }, + { + "epoch": 1.6369330704094653, + "grad_norm": 8.994228712771722, + "learning_rate": 2e-06, + "loss": 0.2345, + "step": 7056 + }, + { + "epoch": 1.637165062057766, + "grad_norm": 22.65822871755929, + "learning_rate": 2e-06, + "loss": 0.3173, + "step": 7057 + }, + { + "epoch": 1.6373970537060667, + "grad_norm": 18.544582583755645, + "learning_rate": 2e-06, + "loss": 0.2662, + "step": 7058 + }, + { + "epoch": 1.6376290453543674, + "grad_norm": 12.768144951770473, + "learning_rate": 2e-06, + "loss": 0.3208, + "step": 7059 + }, + { + "epoch": 1.637861037002668, + "grad_norm": 16.23576130431629, + "learning_rate": 2e-06, + "loss": 0.2045, + "step": 7060 + }, + { + "epoch": 1.6380930286509687, + "grad_norm": 10.590738727915014, + "learning_rate": 2e-06, + "loss": 0.278, + "step": 7061 + }, + { + "epoch": 1.6383250202992692, + "grad_norm": 25.543926792585626, + "learning_rate": 2e-06, + "loss": 0.3695, + "step": 7062 + }, + { + "epoch": 1.6385570119475699, + "grad_norm": 13.621056889878547, + "learning_rate": 2e-06, + "loss": 0.3308, + "step": 7063 + }, + { + "epoch": 1.6387890035958705, + "grad_norm": 11.16766633431158, + "learning_rate": 2e-06, + "loss": 0.3386, + "step": 7064 + }, + { + "epoch": 1.6390209952441712, + "grad_norm": 10.051390960951437, + "learning_rate": 2e-06, + "loss": 0.1408, + "step": 7065 + }, + { + "epoch": 1.639252986892472, + "grad_norm": 16.319182344726816, + "learning_rate": 2e-06, + "loss": 0.2849, + "step": 7066 + }, + { + "epoch": 1.6394849785407726, + "grad_norm": 8.323363441312226, + "learning_rate": 2e-06, + "loss": 0.17, + "step": 7067 + }, + { + "epoch": 1.639716970189073, + "grad_norm": 18.082884377601907, + "learning_rate": 2e-06, + "loss": 0.2853, + "step": 7068 + }, + { + "epoch": 1.6399489618373737, + "grad_norm": 17.96858123227917, + "learning_rate": 2e-06, + "loss": 0.3246, + "step": 7069 + }, + { + "epoch": 1.6401809534856744, + "grad_norm": 8.348504342658279, + "learning_rate": 2e-06, + "loss": 0.2122, + "step": 7070 + }, + { + "epoch": 1.640412945133975, + "grad_norm": 14.195859872328015, + "learning_rate": 2e-06, + "loss": 0.2579, + "step": 7071 + }, + { + "epoch": 1.6406449367822757, + "grad_norm": 8.747800547063882, + "learning_rate": 2e-06, + "loss": 0.1936, + "step": 7072 + }, + { + "epoch": 1.6408769284305764, + "grad_norm": 13.625462706249374, + "learning_rate": 2e-06, + "loss": 0.1596, + "step": 7073 + }, + { + "epoch": 1.641108920078877, + "grad_norm": 9.66076283450918, + "learning_rate": 2e-06, + "loss": 0.1452, + "step": 7074 + }, + { + "epoch": 1.6413409117271778, + "grad_norm": 15.258436926230441, + "learning_rate": 2e-06, + "loss": 0.1653, + "step": 7075 + }, + { + "epoch": 1.6415729033754785, + "grad_norm": 16.421172139281012, + "learning_rate": 2e-06, + "loss": 0.2951, + "step": 7076 + }, + { + "epoch": 1.6418048950237791, + "grad_norm": 13.484416764834592, + "learning_rate": 2e-06, + "loss": 0.2514, + "step": 7077 + }, + { + "epoch": 1.6420368866720798, + "grad_norm": 18.197461234111085, + "learning_rate": 2e-06, + "loss": 0.2852, + "step": 7078 + }, + { + "epoch": 1.6422688783203805, + "grad_norm": 21.10162923051286, + "learning_rate": 2e-06, + "loss": 0.2829, + "step": 7079 + }, + { + "epoch": 1.6425008699686812, + "grad_norm": 12.427903176075388, + "learning_rate": 2e-06, + "loss": 0.2173, + "step": 7080 + }, + { + "epoch": 1.6427328616169818, + "grad_norm": 18.285960244290226, + "learning_rate": 2e-06, + "loss": 0.3021, + "step": 7081 + }, + { + "epoch": 1.6429648532652825, + "grad_norm": 8.15178393543725, + "learning_rate": 2e-06, + "loss": 0.1949, + "step": 7082 + }, + { + "epoch": 1.6431968449135832, + "grad_norm": 10.810702543448354, + "learning_rate": 2e-06, + "loss": 0.2177, + "step": 7083 + }, + { + "epoch": 1.6434288365618839, + "grad_norm": 10.53137318843868, + "learning_rate": 2e-06, + "loss": 0.1946, + "step": 7084 + }, + { + "epoch": 1.6436608282101846, + "grad_norm": 8.979276521118402, + "learning_rate": 2e-06, + "loss": 0.2341, + "step": 7085 + }, + { + "epoch": 1.6438928198584852, + "grad_norm": 7.373128298424443, + "learning_rate": 2e-06, + "loss": 0.1703, + "step": 7086 + }, + { + "epoch": 1.644124811506786, + "grad_norm": 9.086247902153618, + "learning_rate": 2e-06, + "loss": 0.1985, + "step": 7087 + }, + { + "epoch": 1.6443568031550864, + "grad_norm": 18.829921655392535, + "learning_rate": 2e-06, + "loss": 0.2776, + "step": 7088 + }, + { + "epoch": 1.644588794803387, + "grad_norm": 16.727235065578363, + "learning_rate": 2e-06, + "loss": 0.2458, + "step": 7089 + }, + { + "epoch": 1.6448207864516877, + "grad_norm": 11.826408708970803, + "learning_rate": 2e-06, + "loss": 0.2563, + "step": 7090 + }, + { + "epoch": 1.6450527780999884, + "grad_norm": 12.812812098521128, + "learning_rate": 2e-06, + "loss": 0.2057, + "step": 7091 + }, + { + "epoch": 1.645284769748289, + "grad_norm": 18.747553139697516, + "learning_rate": 2e-06, + "loss": 0.2797, + "step": 7092 + }, + { + "epoch": 1.6455167613965898, + "grad_norm": 21.37070998648583, + "learning_rate": 2e-06, + "loss": 0.2842, + "step": 7093 + }, + { + "epoch": 1.6457487530448904, + "grad_norm": 16.031144141161292, + "learning_rate": 2e-06, + "loss": 0.2853, + "step": 7094 + }, + { + "epoch": 1.645980744693191, + "grad_norm": 11.355397426862138, + "learning_rate": 2e-06, + "loss": 0.2688, + "step": 7095 + }, + { + "epoch": 1.6462127363414916, + "grad_norm": 10.83029224429112, + "learning_rate": 2e-06, + "loss": 0.211, + "step": 7096 + }, + { + "epoch": 1.6464447279897922, + "grad_norm": 9.158647967175433, + "learning_rate": 2e-06, + "loss": 0.2457, + "step": 7097 + }, + { + "epoch": 1.646676719638093, + "grad_norm": 12.659950260456156, + "learning_rate": 2e-06, + "loss": 0.2327, + "step": 7098 + }, + { + "epoch": 1.6469087112863936, + "grad_norm": 11.799338439161634, + "learning_rate": 2e-06, + "loss": 0.2592, + "step": 7099 + }, + { + "epoch": 1.6471407029346943, + "grad_norm": 13.881708515680108, + "learning_rate": 2e-06, + "loss": 0.2502, + "step": 7100 + }, + { + "epoch": 1.647372694582995, + "grad_norm": 15.506415761028464, + "learning_rate": 2e-06, + "loss": 0.2147, + "step": 7101 + }, + { + "epoch": 1.6476046862312956, + "grad_norm": 14.239460943501205, + "learning_rate": 2e-06, + "loss": 0.2004, + "step": 7102 + }, + { + "epoch": 1.6478366778795963, + "grad_norm": 12.17574573411829, + "learning_rate": 2e-06, + "loss": 0.1786, + "step": 7103 + }, + { + "epoch": 1.648068669527897, + "grad_norm": 17.152879482952468, + "learning_rate": 2e-06, + "loss": 0.1815, + "step": 7104 + }, + { + "epoch": 1.6483006611761977, + "grad_norm": 7.326448041591375, + "learning_rate": 2e-06, + "loss": 0.1246, + "step": 7105 + }, + { + "epoch": 1.6485326528244983, + "grad_norm": 16.79864805443018, + "learning_rate": 2e-06, + "loss": 0.3143, + "step": 7106 + }, + { + "epoch": 1.648764644472799, + "grad_norm": 13.370767304403438, + "learning_rate": 2e-06, + "loss": 0.2865, + "step": 7107 + }, + { + "epoch": 1.6489966361210997, + "grad_norm": 21.51092895572337, + "learning_rate": 2e-06, + "loss": 0.295, + "step": 7108 + }, + { + "epoch": 1.6492286277694004, + "grad_norm": 18.007298325256, + "learning_rate": 2e-06, + "loss": 0.3231, + "step": 7109 + }, + { + "epoch": 1.649460619417701, + "grad_norm": 27.794812087978574, + "learning_rate": 2e-06, + "loss": 0.3555, + "step": 7110 + }, + { + "epoch": 1.6496926110660017, + "grad_norm": 9.828940646642232, + "learning_rate": 2e-06, + "loss": 0.1938, + "step": 7111 + }, + { + "epoch": 1.6499246027143024, + "grad_norm": 19.735335010849287, + "learning_rate": 2e-06, + "loss": 0.2739, + "step": 7112 + }, + { + "epoch": 1.650156594362603, + "grad_norm": 11.14559502802546, + "learning_rate": 2e-06, + "loss": 0.2884, + "step": 7113 + }, + { + "epoch": 1.6503885860109038, + "grad_norm": 12.179689951370753, + "learning_rate": 2e-06, + "loss": 0.194, + "step": 7114 + }, + { + "epoch": 1.6506205776592042, + "grad_norm": 16.684675765332578, + "learning_rate": 2e-06, + "loss": 0.2507, + "step": 7115 + }, + { + "epoch": 1.650852569307505, + "grad_norm": 6.811563807023977, + "learning_rate": 2e-06, + "loss": 0.1291, + "step": 7116 + }, + { + "epoch": 1.6510845609558056, + "grad_norm": 10.669302710321867, + "learning_rate": 2e-06, + "loss": 0.2399, + "step": 7117 + }, + { + "epoch": 1.6513165526041063, + "grad_norm": 11.669049742452437, + "learning_rate": 2e-06, + "loss": 0.2609, + "step": 7118 + }, + { + "epoch": 1.651548544252407, + "grad_norm": 18.476138275697288, + "learning_rate": 2e-06, + "loss": 0.3279, + "step": 7119 + }, + { + "epoch": 1.6517805359007076, + "grad_norm": 16.27903554685498, + "learning_rate": 2e-06, + "loss": 0.2781, + "step": 7120 + }, + { + "epoch": 1.652012527549008, + "grad_norm": 17.359246051089706, + "learning_rate": 2e-06, + "loss": 0.3157, + "step": 7121 + }, + { + "epoch": 1.6522445191973087, + "grad_norm": 12.366129458336568, + "learning_rate": 2e-06, + "loss": 0.2502, + "step": 7122 + }, + { + "epoch": 1.6524765108456094, + "grad_norm": 11.376221833141713, + "learning_rate": 2e-06, + "loss": 0.2164, + "step": 7123 + }, + { + "epoch": 1.65270850249391, + "grad_norm": 23.552326593798647, + "learning_rate": 2e-06, + "loss": 0.2096, + "step": 7124 + }, + { + "epoch": 1.6529404941422108, + "grad_norm": 21.398110960726864, + "learning_rate": 2e-06, + "loss": 0.3372, + "step": 7125 + }, + { + "epoch": 1.6531724857905115, + "grad_norm": 10.320835052153663, + "learning_rate": 2e-06, + "loss": 0.1588, + "step": 7126 + }, + { + "epoch": 1.6534044774388121, + "grad_norm": 43.51247348322184, + "learning_rate": 2e-06, + "loss": 0.3616, + "step": 7127 + }, + { + "epoch": 1.6536364690871128, + "grad_norm": 16.571909321923275, + "learning_rate": 2e-06, + "loss": 0.3267, + "step": 7128 + }, + { + "epoch": 1.6538684607354135, + "grad_norm": 12.319199115679023, + "learning_rate": 2e-06, + "loss": 0.3166, + "step": 7129 + }, + { + "epoch": 1.6541004523837142, + "grad_norm": 17.12975006935626, + "learning_rate": 2e-06, + "loss": 0.2008, + "step": 7130 + }, + { + "epoch": 1.6543324440320148, + "grad_norm": 14.052613143049053, + "learning_rate": 2e-06, + "loss": 0.2825, + "step": 7131 + }, + { + "epoch": 1.6545644356803155, + "grad_norm": 17.40443541616568, + "learning_rate": 2e-06, + "loss": 0.3052, + "step": 7132 + }, + { + "epoch": 1.6547964273286162, + "grad_norm": 27.391161447742995, + "learning_rate": 2e-06, + "loss": 0.3455, + "step": 7133 + }, + { + "epoch": 1.6550284189769169, + "grad_norm": 16.172270537691748, + "learning_rate": 2e-06, + "loss": 0.1898, + "step": 7134 + }, + { + "epoch": 1.6552604106252176, + "grad_norm": 7.745639522105556, + "learning_rate": 2e-06, + "loss": 0.1708, + "step": 7135 + }, + { + "epoch": 1.6554924022735182, + "grad_norm": 15.698002630882499, + "learning_rate": 2e-06, + "loss": 0.2375, + "step": 7136 + }, + { + "epoch": 1.655724393921819, + "grad_norm": 18.27497738405015, + "learning_rate": 2e-06, + "loss": 0.2097, + "step": 7137 + }, + { + "epoch": 1.6559563855701196, + "grad_norm": 19.399303027089903, + "learning_rate": 2e-06, + "loss": 0.2969, + "step": 7138 + }, + { + "epoch": 1.6561883772184203, + "grad_norm": 15.486733537769139, + "learning_rate": 2e-06, + "loss": 0.3188, + "step": 7139 + }, + { + "epoch": 1.656420368866721, + "grad_norm": 9.362904869460838, + "learning_rate": 2e-06, + "loss": 0.2204, + "step": 7140 + }, + { + "epoch": 1.6566523605150214, + "grad_norm": 12.366872072316477, + "learning_rate": 2e-06, + "loss": 0.2335, + "step": 7141 + }, + { + "epoch": 1.656884352163322, + "grad_norm": 12.496802501137957, + "learning_rate": 2e-06, + "loss": 0.236, + "step": 7142 + }, + { + "epoch": 1.6571163438116228, + "grad_norm": 12.254734708977596, + "learning_rate": 2e-06, + "loss": 0.1864, + "step": 7143 + }, + { + "epoch": 1.6573483354599234, + "grad_norm": 9.660063763494001, + "learning_rate": 2e-06, + "loss": 0.2146, + "step": 7144 + }, + { + "epoch": 1.6575803271082241, + "grad_norm": 10.427701070630379, + "learning_rate": 2e-06, + "loss": 0.1668, + "step": 7145 + }, + { + "epoch": 1.6578123187565248, + "grad_norm": 12.60420478831905, + "learning_rate": 2e-06, + "loss": 0.2078, + "step": 7146 + }, + { + "epoch": 1.6580443104048255, + "grad_norm": 7.42629483367214, + "learning_rate": 2e-06, + "loss": 0.1457, + "step": 7147 + }, + { + "epoch": 1.658276302053126, + "grad_norm": 18.688867676781133, + "learning_rate": 2e-06, + "loss": 0.3135, + "step": 7148 + }, + { + "epoch": 1.6585082937014266, + "grad_norm": 18.540251604423794, + "learning_rate": 2e-06, + "loss": 0.3148, + "step": 7149 + }, + { + "epoch": 1.6587402853497273, + "grad_norm": 15.21926828603947, + "learning_rate": 2e-06, + "loss": 0.3285, + "step": 7150 + }, + { + "epoch": 1.658972276998028, + "grad_norm": 17.34134795151519, + "learning_rate": 2e-06, + "loss": 0.3138, + "step": 7151 + }, + { + "epoch": 1.6592042686463286, + "grad_norm": 15.21227762992704, + "learning_rate": 2e-06, + "loss": 0.242, + "step": 7152 + }, + { + "epoch": 1.6594362602946293, + "grad_norm": 12.275563953766433, + "learning_rate": 2e-06, + "loss": 0.3142, + "step": 7153 + }, + { + "epoch": 1.65966825194293, + "grad_norm": 8.203457602806631, + "learning_rate": 2e-06, + "loss": 0.1712, + "step": 7154 + }, + { + "epoch": 1.6599002435912307, + "grad_norm": 12.874196812733794, + "learning_rate": 2e-06, + "loss": 0.2242, + "step": 7155 + }, + { + "epoch": 1.6601322352395314, + "grad_norm": 9.838459330430709, + "learning_rate": 2e-06, + "loss": 0.2479, + "step": 7156 + }, + { + "epoch": 1.660364226887832, + "grad_norm": 12.464409721834336, + "learning_rate": 2e-06, + "loss": 0.2776, + "step": 7157 + }, + { + "epoch": 1.6605962185361327, + "grad_norm": 11.060179146903785, + "learning_rate": 2e-06, + "loss": 0.2777, + "step": 7158 + }, + { + "epoch": 1.6608282101844334, + "grad_norm": 19.965716682293383, + "learning_rate": 2e-06, + "loss": 0.2935, + "step": 7159 + }, + { + "epoch": 1.661060201832734, + "grad_norm": 14.087061956800868, + "learning_rate": 2e-06, + "loss": 0.246, + "step": 7160 + }, + { + "epoch": 1.6612921934810347, + "grad_norm": 17.92066191994181, + "learning_rate": 2e-06, + "loss": 0.2955, + "step": 7161 + }, + { + "epoch": 1.6615241851293354, + "grad_norm": 24.209134033683643, + "learning_rate": 2e-06, + "loss": 0.2748, + "step": 7162 + }, + { + "epoch": 1.661756176777636, + "grad_norm": 11.876774521762682, + "learning_rate": 2e-06, + "loss": 0.249, + "step": 7163 + }, + { + "epoch": 1.6619881684259368, + "grad_norm": 11.534212929724452, + "learning_rate": 2e-06, + "loss": 0.1807, + "step": 7164 + }, + { + "epoch": 1.6622201600742375, + "grad_norm": 15.413596450223995, + "learning_rate": 2e-06, + "loss": 0.2057, + "step": 7165 + }, + { + "epoch": 1.6624521517225381, + "grad_norm": 14.949284064707648, + "learning_rate": 2e-06, + "loss": 0.1989, + "step": 7166 + }, + { + "epoch": 1.6626841433708388, + "grad_norm": 7.451658989662105, + "learning_rate": 2e-06, + "loss": 0.2982, + "step": 7167 + }, + { + "epoch": 1.6629161350191393, + "grad_norm": 17.876488522008227, + "learning_rate": 2e-06, + "loss": 0.3764, + "step": 7168 + }, + { + "epoch": 1.66314812666744, + "grad_norm": 9.601497369310469, + "learning_rate": 2e-06, + "loss": 0.1856, + "step": 7169 + }, + { + "epoch": 1.6633801183157406, + "grad_norm": 26.29643437167827, + "learning_rate": 2e-06, + "loss": 0.2852, + "step": 7170 + }, + { + "epoch": 1.6636121099640413, + "grad_norm": 26.008823115734014, + "learning_rate": 2e-06, + "loss": 0.1835, + "step": 7171 + }, + { + "epoch": 1.663844101612342, + "grad_norm": 17.775698521683385, + "learning_rate": 2e-06, + "loss": 0.3432, + "step": 7172 + }, + { + "epoch": 1.6640760932606427, + "grad_norm": 16.226258624963403, + "learning_rate": 2e-06, + "loss": 0.2569, + "step": 7173 + }, + { + "epoch": 1.6643080849089433, + "grad_norm": 11.083269886897604, + "learning_rate": 2e-06, + "loss": 0.2663, + "step": 7174 + }, + { + "epoch": 1.6645400765572438, + "grad_norm": 16.496353744352007, + "learning_rate": 2e-06, + "loss": 0.2829, + "step": 7175 + }, + { + "epoch": 1.6647720682055445, + "grad_norm": 20.16568187382624, + "learning_rate": 2e-06, + "loss": 0.2756, + "step": 7176 + }, + { + "epoch": 1.6650040598538451, + "grad_norm": 5.926606012468249, + "learning_rate": 2e-06, + "loss": 0.1369, + "step": 7177 + }, + { + "epoch": 1.6652360515021458, + "grad_norm": 12.910944857308234, + "learning_rate": 2e-06, + "loss": 0.2397, + "step": 7178 + }, + { + "epoch": 1.6654680431504465, + "grad_norm": 12.738385571388708, + "learning_rate": 2e-06, + "loss": 0.2917, + "step": 7179 + }, + { + "epoch": 1.6657000347987472, + "grad_norm": 17.232627774810663, + "learning_rate": 2e-06, + "loss": 0.2987, + "step": 7180 + }, + { + "epoch": 1.6659320264470479, + "grad_norm": 10.560890514090739, + "learning_rate": 2e-06, + "loss": 0.2315, + "step": 7181 + }, + { + "epoch": 1.6661640180953485, + "grad_norm": 5.4585419224197445, + "learning_rate": 2e-06, + "loss": 0.145, + "step": 7182 + }, + { + "epoch": 1.6663960097436492, + "grad_norm": 20.192116461853846, + "learning_rate": 2e-06, + "loss": 0.2229, + "step": 7183 + }, + { + "epoch": 1.6666280013919499, + "grad_norm": 18.001526552299513, + "learning_rate": 2e-06, + "loss": 0.2593, + "step": 7184 + }, + { + "epoch": 1.6668599930402506, + "grad_norm": 17.958796196132493, + "learning_rate": 2e-06, + "loss": 0.2876, + "step": 7185 + }, + { + "epoch": 1.6670919846885512, + "grad_norm": 13.977270787710811, + "learning_rate": 2e-06, + "loss": 0.2066, + "step": 7186 + }, + { + "epoch": 1.667323976336852, + "grad_norm": 6.077093067527741, + "learning_rate": 2e-06, + "loss": 0.1958, + "step": 7187 + }, + { + "epoch": 1.6675559679851526, + "grad_norm": 14.162961183279469, + "learning_rate": 2e-06, + "loss": 0.1998, + "step": 7188 + }, + { + "epoch": 1.6677879596334533, + "grad_norm": 15.410769413943282, + "learning_rate": 2e-06, + "loss": 0.2707, + "step": 7189 + }, + { + "epoch": 1.668019951281754, + "grad_norm": 14.002372039832355, + "learning_rate": 2e-06, + "loss": 0.249, + "step": 7190 + }, + { + "epoch": 1.6682519429300546, + "grad_norm": 11.425963241642725, + "learning_rate": 2e-06, + "loss": 0.249, + "step": 7191 + }, + { + "epoch": 1.6684839345783553, + "grad_norm": 10.424330561363904, + "learning_rate": 2e-06, + "loss": 0.1938, + "step": 7192 + }, + { + "epoch": 1.668715926226656, + "grad_norm": 7.77959555923727, + "learning_rate": 2e-06, + "loss": 0.2323, + "step": 7193 + }, + { + "epoch": 1.6689479178749567, + "grad_norm": 12.563718470613516, + "learning_rate": 2e-06, + "loss": 0.2741, + "step": 7194 + }, + { + "epoch": 1.6691799095232571, + "grad_norm": 12.248035271839441, + "learning_rate": 2e-06, + "loss": 0.2678, + "step": 7195 + }, + { + "epoch": 1.6694119011715578, + "grad_norm": 9.725214437497984, + "learning_rate": 2e-06, + "loss": 0.1983, + "step": 7196 + }, + { + "epoch": 1.6696438928198585, + "grad_norm": 83.29799388121215, + "learning_rate": 2e-06, + "loss": 0.3167, + "step": 7197 + }, + { + "epoch": 1.6698758844681592, + "grad_norm": 17.328766071210865, + "learning_rate": 2e-06, + "loss": 0.2559, + "step": 7198 + }, + { + "epoch": 1.6701078761164598, + "grad_norm": 11.736583987785513, + "learning_rate": 2e-06, + "loss": 0.1709, + "step": 7199 + }, + { + "epoch": 1.6703398677647605, + "grad_norm": 5.073238294383731, + "learning_rate": 2e-06, + "loss": 0.1439, + "step": 7200 + }, + { + "epoch": 1.670571859413061, + "grad_norm": 23.945785671567652, + "learning_rate": 2e-06, + "loss": 0.4377, + "step": 7201 + }, + { + "epoch": 1.6708038510613616, + "grad_norm": 17.812827348025948, + "learning_rate": 2e-06, + "loss": 0.2269, + "step": 7202 + }, + { + "epoch": 1.6710358427096623, + "grad_norm": 13.814090695774514, + "learning_rate": 2e-06, + "loss": 0.2267, + "step": 7203 + }, + { + "epoch": 1.671267834357963, + "grad_norm": 11.800236103771432, + "learning_rate": 2e-06, + "loss": 0.1501, + "step": 7204 + }, + { + "epoch": 1.6714998260062637, + "grad_norm": 14.133330969888693, + "learning_rate": 2e-06, + "loss": 0.2594, + "step": 7205 + }, + { + "epoch": 1.6717318176545644, + "grad_norm": 12.403132619329371, + "learning_rate": 2e-06, + "loss": 0.2171, + "step": 7206 + }, + { + "epoch": 1.671963809302865, + "grad_norm": 11.387512003348485, + "learning_rate": 2e-06, + "loss": 0.243, + "step": 7207 + }, + { + "epoch": 1.6721958009511657, + "grad_norm": 29.850020206835577, + "learning_rate": 2e-06, + "loss": 0.3552, + "step": 7208 + }, + { + "epoch": 1.6724277925994664, + "grad_norm": 14.538118662992103, + "learning_rate": 2e-06, + "loss": 0.2018, + "step": 7209 + }, + { + "epoch": 1.672659784247767, + "grad_norm": 17.847018805166996, + "learning_rate": 2e-06, + "loss": 0.3098, + "step": 7210 + }, + { + "epoch": 1.6728917758960677, + "grad_norm": 11.821794128153151, + "learning_rate": 2e-06, + "loss": 0.3643, + "step": 7211 + }, + { + "epoch": 1.6731237675443684, + "grad_norm": 20.07149739617927, + "learning_rate": 2e-06, + "loss": 0.3223, + "step": 7212 + }, + { + "epoch": 1.673355759192669, + "grad_norm": 9.991688195420922, + "learning_rate": 2e-06, + "loss": 0.2121, + "step": 7213 + }, + { + "epoch": 1.6735877508409698, + "grad_norm": 13.829416802908472, + "learning_rate": 2e-06, + "loss": 0.2223, + "step": 7214 + }, + { + "epoch": 1.6738197424892705, + "grad_norm": 5.792930798689378, + "learning_rate": 2e-06, + "loss": 0.1783, + "step": 7215 + }, + { + "epoch": 1.6740517341375711, + "grad_norm": 21.858001437576718, + "learning_rate": 2e-06, + "loss": 0.3587, + "step": 7216 + }, + { + "epoch": 1.6742837257858718, + "grad_norm": 14.953758049208334, + "learning_rate": 2e-06, + "loss": 0.261, + "step": 7217 + }, + { + "epoch": 1.6745157174341725, + "grad_norm": 15.32929982573524, + "learning_rate": 2e-06, + "loss": 0.2438, + "step": 7218 + }, + { + "epoch": 1.6747477090824732, + "grad_norm": 12.461868779926432, + "learning_rate": 2e-06, + "loss": 0.2068, + "step": 7219 + }, + { + "epoch": 1.6749797007307738, + "grad_norm": 15.767175377082367, + "learning_rate": 2e-06, + "loss": 0.2911, + "step": 7220 + }, + { + "epoch": 1.6752116923790743, + "grad_norm": 13.8931709287092, + "learning_rate": 2e-06, + "loss": 0.2982, + "step": 7221 + }, + { + "epoch": 1.675443684027375, + "grad_norm": 14.649828134211061, + "learning_rate": 2e-06, + "loss": 0.2224, + "step": 7222 + }, + { + "epoch": 1.6756756756756757, + "grad_norm": 21.45201926425029, + "learning_rate": 2e-06, + "loss": 0.2641, + "step": 7223 + }, + { + "epoch": 1.6759076673239763, + "grad_norm": 13.8367213243625, + "learning_rate": 2e-06, + "loss": 0.2521, + "step": 7224 + }, + { + "epoch": 1.676139658972277, + "grad_norm": 10.023145113017152, + "learning_rate": 2e-06, + "loss": 0.22, + "step": 7225 + }, + { + "epoch": 1.6763716506205777, + "grad_norm": 29.17788046000198, + "learning_rate": 2e-06, + "loss": 0.2037, + "step": 7226 + }, + { + "epoch": 1.6766036422688784, + "grad_norm": 9.137259677515337, + "learning_rate": 2e-06, + "loss": 0.276, + "step": 7227 + }, + { + "epoch": 1.6768356339171788, + "grad_norm": 13.453485559735219, + "learning_rate": 2e-06, + "loss": 0.2402, + "step": 7228 + }, + { + "epoch": 1.6770676255654795, + "grad_norm": 14.936132272085098, + "learning_rate": 2e-06, + "loss": 0.2982, + "step": 7229 + }, + { + "epoch": 1.6772996172137802, + "grad_norm": 11.989505567263949, + "learning_rate": 2e-06, + "loss": 0.3163, + "step": 7230 + }, + { + "epoch": 1.6775316088620809, + "grad_norm": 16.932666813685692, + "learning_rate": 2e-06, + "loss": 0.3932, + "step": 7231 + }, + { + "epoch": 1.6777636005103815, + "grad_norm": 10.189344381942949, + "learning_rate": 2e-06, + "loss": 0.2055, + "step": 7232 + }, + { + "epoch": 1.6779955921586822, + "grad_norm": 13.877659783625909, + "learning_rate": 2e-06, + "loss": 0.2089, + "step": 7233 + }, + { + "epoch": 1.678227583806983, + "grad_norm": 11.88046757505946, + "learning_rate": 2e-06, + "loss": 0.2989, + "step": 7234 + }, + { + "epoch": 1.6784595754552836, + "grad_norm": 12.790805012245048, + "learning_rate": 2e-06, + "loss": 0.2614, + "step": 7235 + }, + { + "epoch": 1.6786915671035842, + "grad_norm": 13.851701743001856, + "learning_rate": 2e-06, + "loss": 0.1686, + "step": 7236 + }, + { + "epoch": 1.678923558751885, + "grad_norm": 17.59761637474589, + "learning_rate": 2e-06, + "loss": 0.2935, + "step": 7237 + }, + { + "epoch": 1.6791555504001856, + "grad_norm": 13.48792494767899, + "learning_rate": 2e-06, + "loss": 0.3288, + "step": 7238 + }, + { + "epoch": 1.6793875420484863, + "grad_norm": 20.19153728570158, + "learning_rate": 2e-06, + "loss": 0.2486, + "step": 7239 + }, + { + "epoch": 1.679619533696787, + "grad_norm": 8.553678141416055, + "learning_rate": 2e-06, + "loss": 0.2466, + "step": 7240 + }, + { + "epoch": 1.6798515253450876, + "grad_norm": 9.832203832754326, + "learning_rate": 2e-06, + "loss": 0.2081, + "step": 7241 + }, + { + "epoch": 1.6800835169933883, + "grad_norm": 16.72323345948946, + "learning_rate": 2e-06, + "loss": 0.2816, + "step": 7242 + }, + { + "epoch": 1.680315508641689, + "grad_norm": 9.139400513114731, + "learning_rate": 2e-06, + "loss": 0.1794, + "step": 7243 + }, + { + "epoch": 1.6805475002899897, + "grad_norm": 11.173560663571159, + "learning_rate": 2e-06, + "loss": 0.21, + "step": 7244 + }, + { + "epoch": 1.6807794919382903, + "grad_norm": 13.41759061043099, + "learning_rate": 2e-06, + "loss": 0.2704, + "step": 7245 + }, + { + "epoch": 1.681011483586591, + "grad_norm": 11.530722433665698, + "learning_rate": 2e-06, + "loss": 0.2328, + "step": 7246 + }, + { + "epoch": 1.6812434752348917, + "grad_norm": 48.37803041976953, + "learning_rate": 2e-06, + "loss": 0.1837, + "step": 7247 + }, + { + "epoch": 1.6814754668831922, + "grad_norm": 9.805662828423877, + "learning_rate": 2e-06, + "loss": 0.3065, + "step": 7248 + }, + { + "epoch": 1.6817074585314928, + "grad_norm": 14.922040681149298, + "learning_rate": 2e-06, + "loss": 0.2789, + "step": 7249 + }, + { + "epoch": 1.6819394501797935, + "grad_norm": 16.518766137978755, + "learning_rate": 2e-06, + "loss": 0.197, + "step": 7250 + }, + { + "epoch": 1.6821714418280942, + "grad_norm": 9.19056226837705, + "learning_rate": 2e-06, + "loss": 0.2237, + "step": 7251 + }, + { + "epoch": 1.6824034334763949, + "grad_norm": 13.196367330577875, + "learning_rate": 2e-06, + "loss": 0.2455, + "step": 7252 + }, + { + "epoch": 1.6826354251246955, + "grad_norm": 14.036007189990867, + "learning_rate": 2e-06, + "loss": 0.2197, + "step": 7253 + }, + { + "epoch": 1.6828674167729962, + "grad_norm": 14.91431701000174, + "learning_rate": 2e-06, + "loss": 0.2718, + "step": 7254 + }, + { + "epoch": 1.6830994084212967, + "grad_norm": 16.16148852325024, + "learning_rate": 2e-06, + "loss": 0.2549, + "step": 7255 + }, + { + "epoch": 1.6833314000695974, + "grad_norm": 15.585917882713956, + "learning_rate": 2e-06, + "loss": 0.1808, + "step": 7256 + }, + { + "epoch": 1.683563391717898, + "grad_norm": 20.987773075527787, + "learning_rate": 2e-06, + "loss": 0.2464, + "step": 7257 + }, + { + "epoch": 1.6837953833661987, + "grad_norm": 9.67357503129696, + "learning_rate": 2e-06, + "loss": 0.2684, + "step": 7258 + }, + { + "epoch": 1.6840273750144994, + "grad_norm": 12.767117257443124, + "learning_rate": 2e-06, + "loss": 0.2111, + "step": 7259 + }, + { + "epoch": 1.6842593666628, + "grad_norm": 11.685125969676664, + "learning_rate": 2e-06, + "loss": 0.2216, + "step": 7260 + }, + { + "epoch": 1.6844913583111008, + "grad_norm": 15.651428326415914, + "learning_rate": 2e-06, + "loss": 0.2254, + "step": 7261 + }, + { + "epoch": 1.6847233499594014, + "grad_norm": 10.54086061454288, + "learning_rate": 2e-06, + "loss": 0.2496, + "step": 7262 + }, + { + "epoch": 1.684955341607702, + "grad_norm": 15.971847719178118, + "learning_rate": 2e-06, + "loss": 0.189, + "step": 7263 + }, + { + "epoch": 1.6851873332560028, + "grad_norm": 8.066546502110779, + "learning_rate": 2e-06, + "loss": 0.2025, + "step": 7264 + }, + { + "epoch": 1.6854193249043035, + "grad_norm": 8.468046938996284, + "learning_rate": 2e-06, + "loss": 0.2017, + "step": 7265 + }, + { + "epoch": 1.6856513165526041, + "grad_norm": 9.632765936355588, + "learning_rate": 2e-06, + "loss": 0.2203, + "step": 7266 + }, + { + "epoch": 1.6858833082009048, + "grad_norm": 14.954336739425484, + "learning_rate": 2e-06, + "loss": 0.3368, + "step": 7267 + }, + { + "epoch": 1.6861152998492055, + "grad_norm": 11.151590329091023, + "learning_rate": 2e-06, + "loss": 0.1318, + "step": 7268 + }, + { + "epoch": 1.6863472914975062, + "grad_norm": 18.76079227324024, + "learning_rate": 2e-06, + "loss": 0.2761, + "step": 7269 + }, + { + "epoch": 1.6865792831458069, + "grad_norm": 17.605320132885076, + "learning_rate": 2e-06, + "loss": 0.2647, + "step": 7270 + }, + { + "epoch": 1.6868112747941075, + "grad_norm": 12.944711743448268, + "learning_rate": 2e-06, + "loss": 0.2539, + "step": 7271 + }, + { + "epoch": 1.6870432664424082, + "grad_norm": 123.24890746611254, + "learning_rate": 2e-06, + "loss": 0.2955, + "step": 7272 + }, + { + "epoch": 1.6872752580907089, + "grad_norm": 8.159596136022413, + "learning_rate": 2e-06, + "loss": 0.1544, + "step": 7273 + }, + { + "epoch": 1.6875072497390093, + "grad_norm": 6.68692543458059, + "learning_rate": 2e-06, + "loss": 0.2265, + "step": 7274 + }, + { + "epoch": 1.68773924138731, + "grad_norm": 13.505784951239923, + "learning_rate": 2e-06, + "loss": 0.2127, + "step": 7275 + }, + { + "epoch": 1.6879712330356107, + "grad_norm": 12.820996468185626, + "learning_rate": 2e-06, + "loss": 0.322, + "step": 7276 + }, + { + "epoch": 1.6882032246839114, + "grad_norm": 16.991615458156502, + "learning_rate": 2e-06, + "loss": 0.2802, + "step": 7277 + }, + { + "epoch": 1.688435216332212, + "grad_norm": 10.463780024145759, + "learning_rate": 2e-06, + "loss": 0.1726, + "step": 7278 + }, + { + "epoch": 1.6886672079805127, + "grad_norm": 16.589872313730154, + "learning_rate": 2e-06, + "loss": 0.288, + "step": 7279 + }, + { + "epoch": 1.6888991996288134, + "grad_norm": 9.827582196248413, + "learning_rate": 2e-06, + "loss": 0.1659, + "step": 7280 + }, + { + "epoch": 1.6891311912771139, + "grad_norm": 15.061570411083014, + "learning_rate": 2e-06, + "loss": 0.2588, + "step": 7281 + }, + { + "epoch": 1.6893631829254145, + "grad_norm": 14.836146002644297, + "learning_rate": 2e-06, + "loss": 0.1934, + "step": 7282 + }, + { + "epoch": 1.6895951745737152, + "grad_norm": 10.968410038414373, + "learning_rate": 2e-06, + "loss": 0.1331, + "step": 7283 + }, + { + "epoch": 1.689827166222016, + "grad_norm": 9.697378039295229, + "learning_rate": 2e-06, + "loss": 0.1898, + "step": 7284 + }, + { + "epoch": 1.6900591578703166, + "grad_norm": 9.26353684174463, + "learning_rate": 2e-06, + "loss": 0.2501, + "step": 7285 + }, + { + "epoch": 1.6902911495186173, + "grad_norm": 16.511560222732655, + "learning_rate": 2e-06, + "loss": 0.2632, + "step": 7286 + }, + { + "epoch": 1.690523141166918, + "grad_norm": 15.538008963116829, + "learning_rate": 2e-06, + "loss": 0.2322, + "step": 7287 + }, + { + "epoch": 1.6907551328152186, + "grad_norm": 6.702030928521217, + "learning_rate": 2e-06, + "loss": 0.1563, + "step": 7288 + }, + { + "epoch": 1.6909871244635193, + "grad_norm": 18.95424278469588, + "learning_rate": 2e-06, + "loss": 0.412, + "step": 7289 + }, + { + "epoch": 1.69121911611182, + "grad_norm": 19.402775836233555, + "learning_rate": 2e-06, + "loss": 0.3471, + "step": 7290 + }, + { + "epoch": 1.6914511077601206, + "grad_norm": 18.586344150422477, + "learning_rate": 2e-06, + "loss": 0.1725, + "step": 7291 + }, + { + "epoch": 1.6916830994084213, + "grad_norm": 13.803017788235675, + "learning_rate": 2e-06, + "loss": 0.2308, + "step": 7292 + }, + { + "epoch": 1.691915091056722, + "grad_norm": 12.289919019565339, + "learning_rate": 2e-06, + "loss": 0.2321, + "step": 7293 + }, + { + "epoch": 1.6921470827050227, + "grad_norm": 14.677095371271689, + "learning_rate": 2e-06, + "loss": 0.2651, + "step": 7294 + }, + { + "epoch": 1.6923790743533234, + "grad_norm": 15.267058990164466, + "learning_rate": 2e-06, + "loss": 0.2599, + "step": 7295 + }, + { + "epoch": 1.692611066001624, + "grad_norm": 10.819449552891342, + "learning_rate": 2e-06, + "loss": 0.2296, + "step": 7296 + }, + { + "epoch": 1.6928430576499247, + "grad_norm": 7.79293963930981, + "learning_rate": 2e-06, + "loss": 0.1925, + "step": 7297 + }, + { + "epoch": 1.6930750492982254, + "grad_norm": 12.106463229541392, + "learning_rate": 2e-06, + "loss": 0.2504, + "step": 7298 + }, + { + "epoch": 1.693307040946526, + "grad_norm": 20.23700952736958, + "learning_rate": 2e-06, + "loss": 0.3133, + "step": 7299 + }, + { + "epoch": 1.6935390325948267, + "grad_norm": 13.082483003263226, + "learning_rate": 2e-06, + "loss": 0.3206, + "step": 7300 + }, + { + "epoch": 1.6937710242431272, + "grad_norm": 5.561796986154681, + "learning_rate": 2e-06, + "loss": 0.1812, + "step": 7301 + }, + { + "epoch": 1.6940030158914279, + "grad_norm": 18.485603555962598, + "learning_rate": 2e-06, + "loss": 0.2279, + "step": 7302 + }, + { + "epoch": 1.6942350075397286, + "grad_norm": 19.852227953612164, + "learning_rate": 2e-06, + "loss": 0.4063, + "step": 7303 + }, + { + "epoch": 1.6944669991880292, + "grad_norm": 19.68949775248541, + "learning_rate": 2e-06, + "loss": 0.3253, + "step": 7304 + }, + { + "epoch": 1.69469899083633, + "grad_norm": 33.77052968630431, + "learning_rate": 2e-06, + "loss": 0.3797, + "step": 7305 + }, + { + "epoch": 1.6949309824846306, + "grad_norm": 11.94857037264836, + "learning_rate": 2e-06, + "loss": 0.209, + "step": 7306 + }, + { + "epoch": 1.6951629741329313, + "grad_norm": 13.804592380035896, + "learning_rate": 2e-06, + "loss": 0.2286, + "step": 7307 + }, + { + "epoch": 1.6953949657812317, + "grad_norm": 8.742072666215575, + "learning_rate": 2e-06, + "loss": 0.1553, + "step": 7308 + }, + { + "epoch": 1.6956269574295324, + "grad_norm": 11.872405377077234, + "learning_rate": 2e-06, + "loss": 0.2513, + "step": 7309 + }, + { + "epoch": 1.695858949077833, + "grad_norm": 8.341582411825478, + "learning_rate": 2e-06, + "loss": 0.1319, + "step": 7310 + }, + { + "epoch": 1.6960909407261338, + "grad_norm": 10.44053334267485, + "learning_rate": 2e-06, + "loss": 0.2116, + "step": 7311 + }, + { + "epoch": 1.6963229323744344, + "grad_norm": 17.881643539271504, + "learning_rate": 2e-06, + "loss": 0.2348, + "step": 7312 + }, + { + "epoch": 1.6965549240227351, + "grad_norm": 19.973000179858243, + "learning_rate": 2e-06, + "loss": 0.3227, + "step": 7313 + }, + { + "epoch": 1.6967869156710358, + "grad_norm": 25.58664111962267, + "learning_rate": 2e-06, + "loss": 0.3412, + "step": 7314 + }, + { + "epoch": 1.6970189073193365, + "grad_norm": 16.170402376477018, + "learning_rate": 2e-06, + "loss": 0.2891, + "step": 7315 + }, + { + "epoch": 1.6972508989676371, + "grad_norm": 25.391042103179647, + "learning_rate": 2e-06, + "loss": 0.4034, + "step": 7316 + }, + { + "epoch": 1.6974828906159378, + "grad_norm": 15.631652628735425, + "learning_rate": 2e-06, + "loss": 0.2334, + "step": 7317 + }, + { + "epoch": 1.6977148822642385, + "grad_norm": 15.931573085004318, + "learning_rate": 2e-06, + "loss": 0.3106, + "step": 7318 + }, + { + "epoch": 1.6979468739125392, + "grad_norm": 19.12347367874556, + "learning_rate": 2e-06, + "loss": 0.1906, + "step": 7319 + }, + { + "epoch": 1.6981788655608399, + "grad_norm": 15.569423556959888, + "learning_rate": 2e-06, + "loss": 0.264, + "step": 7320 + }, + { + "epoch": 1.6984108572091405, + "grad_norm": 14.068857339247876, + "learning_rate": 2e-06, + "loss": 0.237, + "step": 7321 + }, + { + "epoch": 1.6986428488574412, + "grad_norm": 7.4346690593116, + "learning_rate": 2e-06, + "loss": 0.2656, + "step": 7322 + }, + { + "epoch": 1.698874840505742, + "grad_norm": 13.320835738404085, + "learning_rate": 2e-06, + "loss": 0.2022, + "step": 7323 + }, + { + "epoch": 1.6991068321540426, + "grad_norm": 16.618587408159758, + "learning_rate": 2e-06, + "loss": 0.3168, + "step": 7324 + }, + { + "epoch": 1.6993388238023432, + "grad_norm": 14.77136574060571, + "learning_rate": 2e-06, + "loss": 0.1647, + "step": 7325 + }, + { + "epoch": 1.699570815450644, + "grad_norm": 14.808907885630699, + "learning_rate": 2e-06, + "loss": 0.2321, + "step": 7326 + }, + { + "epoch": 1.6998028070989446, + "grad_norm": 11.36970811881377, + "learning_rate": 2e-06, + "loss": 0.2535, + "step": 7327 + }, + { + "epoch": 1.700034798747245, + "grad_norm": 11.569639321117595, + "learning_rate": 2e-06, + "loss": 0.2157, + "step": 7328 + }, + { + "epoch": 1.7002667903955457, + "grad_norm": 13.531970644424788, + "learning_rate": 2e-06, + "loss": 0.154, + "step": 7329 + }, + { + "epoch": 1.7004987820438464, + "grad_norm": 8.423003530239141, + "learning_rate": 2e-06, + "loss": 0.2078, + "step": 7330 + }, + { + "epoch": 1.700730773692147, + "grad_norm": 16.232057245838032, + "learning_rate": 2e-06, + "loss": 0.1807, + "step": 7331 + }, + { + "epoch": 1.7009627653404478, + "grad_norm": 12.855703913990952, + "learning_rate": 2e-06, + "loss": 0.2202, + "step": 7332 + }, + { + "epoch": 1.7011947569887484, + "grad_norm": 12.417491377191617, + "learning_rate": 2e-06, + "loss": 0.2327, + "step": 7333 + }, + { + "epoch": 1.701426748637049, + "grad_norm": 14.984586298988479, + "learning_rate": 2e-06, + "loss": 0.2119, + "step": 7334 + }, + { + "epoch": 1.7016587402853496, + "grad_norm": 18.66081159209174, + "learning_rate": 2e-06, + "loss": 0.3525, + "step": 7335 + }, + { + "epoch": 1.7018907319336503, + "grad_norm": 9.688077039211272, + "learning_rate": 2e-06, + "loss": 0.1497, + "step": 7336 + }, + { + "epoch": 1.702122723581951, + "grad_norm": 7.287356424854079, + "learning_rate": 2e-06, + "loss": 0.1793, + "step": 7337 + }, + { + "epoch": 1.7023547152302516, + "grad_norm": 8.409087290247287, + "learning_rate": 2e-06, + "loss": 0.2413, + "step": 7338 + }, + { + "epoch": 1.7025867068785523, + "grad_norm": 13.801898488863271, + "learning_rate": 2e-06, + "loss": 0.2279, + "step": 7339 + }, + { + "epoch": 1.702818698526853, + "grad_norm": 10.625966492141034, + "learning_rate": 2e-06, + "loss": 0.1772, + "step": 7340 + }, + { + "epoch": 1.7030506901751536, + "grad_norm": 8.734020582081474, + "learning_rate": 2e-06, + "loss": 0.2377, + "step": 7341 + }, + { + "epoch": 1.7032826818234543, + "grad_norm": 15.367119881801106, + "learning_rate": 2e-06, + "loss": 0.2827, + "step": 7342 + }, + { + "epoch": 1.703514673471755, + "grad_norm": 9.132522203727172, + "learning_rate": 2e-06, + "loss": 0.1569, + "step": 7343 + }, + { + "epoch": 1.7037466651200557, + "grad_norm": 26.578806454786072, + "learning_rate": 2e-06, + "loss": 0.3061, + "step": 7344 + }, + { + "epoch": 1.7039786567683564, + "grad_norm": 13.623761355111489, + "learning_rate": 2e-06, + "loss": 0.2544, + "step": 7345 + }, + { + "epoch": 1.704210648416657, + "grad_norm": 10.402184063489008, + "learning_rate": 2e-06, + "loss": 0.1433, + "step": 7346 + }, + { + "epoch": 1.7044426400649577, + "grad_norm": 14.848757688205776, + "learning_rate": 2e-06, + "loss": 0.2865, + "step": 7347 + }, + { + "epoch": 1.7046746317132584, + "grad_norm": 10.388087104158064, + "learning_rate": 2e-06, + "loss": 0.2016, + "step": 7348 + }, + { + "epoch": 1.704906623361559, + "grad_norm": 10.690069785947735, + "learning_rate": 2e-06, + "loss": 0.251, + "step": 7349 + }, + { + "epoch": 1.7051386150098597, + "grad_norm": 12.286173589369547, + "learning_rate": 2e-06, + "loss": 0.193, + "step": 7350 + }, + { + "epoch": 1.7053706066581604, + "grad_norm": 18.118306711239853, + "learning_rate": 2e-06, + "loss": 0.2965, + "step": 7351 + }, + { + "epoch": 1.705602598306461, + "grad_norm": 17.74489505862405, + "learning_rate": 2e-06, + "loss": 0.4117, + "step": 7352 + }, + { + "epoch": 1.7058345899547618, + "grad_norm": 14.440055676789756, + "learning_rate": 2e-06, + "loss": 0.1778, + "step": 7353 + }, + { + "epoch": 1.7060665816030622, + "grad_norm": 21.136004075754848, + "learning_rate": 2e-06, + "loss": 0.2744, + "step": 7354 + }, + { + "epoch": 1.706298573251363, + "grad_norm": 15.881275091214922, + "learning_rate": 2e-06, + "loss": 0.2271, + "step": 7355 + }, + { + "epoch": 1.7065305648996636, + "grad_norm": 7.7350361481897725, + "learning_rate": 2e-06, + "loss": 0.1523, + "step": 7356 + }, + { + "epoch": 1.7067625565479643, + "grad_norm": 10.3562729510965, + "learning_rate": 2e-06, + "loss": 0.3039, + "step": 7357 + }, + { + "epoch": 1.706994548196265, + "grad_norm": 15.919554632350538, + "learning_rate": 2e-06, + "loss": 0.2601, + "step": 7358 + }, + { + "epoch": 1.7072265398445656, + "grad_norm": 6.907861083753691, + "learning_rate": 2e-06, + "loss": 0.1562, + "step": 7359 + }, + { + "epoch": 1.7074585314928663, + "grad_norm": 13.332375865894466, + "learning_rate": 2e-06, + "loss": 0.1999, + "step": 7360 + }, + { + "epoch": 1.7076905231411668, + "grad_norm": 9.13285577347624, + "learning_rate": 2e-06, + "loss": 0.2235, + "step": 7361 + }, + { + "epoch": 1.7079225147894674, + "grad_norm": 14.410625044515026, + "learning_rate": 2e-06, + "loss": 0.3811, + "step": 7362 + }, + { + "epoch": 1.7081545064377681, + "grad_norm": 15.474633868476383, + "learning_rate": 2e-06, + "loss": 0.399, + "step": 7363 + }, + { + "epoch": 1.7083864980860688, + "grad_norm": 8.161414052092125, + "learning_rate": 2e-06, + "loss": 0.1485, + "step": 7364 + }, + { + "epoch": 1.7086184897343695, + "grad_norm": 7.173334004364632, + "learning_rate": 2e-06, + "loss": 0.2258, + "step": 7365 + }, + { + "epoch": 1.7088504813826702, + "grad_norm": 15.427045346489873, + "learning_rate": 2e-06, + "loss": 0.2198, + "step": 7366 + }, + { + "epoch": 1.7090824730309708, + "grad_norm": 12.765297230763718, + "learning_rate": 2e-06, + "loss": 0.2092, + "step": 7367 + }, + { + "epoch": 1.7093144646792715, + "grad_norm": 10.360717945103982, + "learning_rate": 2e-06, + "loss": 0.2371, + "step": 7368 + }, + { + "epoch": 1.7095464563275722, + "grad_norm": 11.163692771886856, + "learning_rate": 2e-06, + "loss": 0.2144, + "step": 7369 + }, + { + "epoch": 1.7097784479758729, + "grad_norm": 9.414971675169285, + "learning_rate": 2e-06, + "loss": 0.2496, + "step": 7370 + }, + { + "epoch": 1.7100104396241735, + "grad_norm": 12.427623155406073, + "learning_rate": 2e-06, + "loss": 0.2593, + "step": 7371 + }, + { + "epoch": 1.7102424312724742, + "grad_norm": 9.993736030250428, + "learning_rate": 2e-06, + "loss": 0.2196, + "step": 7372 + }, + { + "epoch": 1.710474422920775, + "grad_norm": 12.357927783611151, + "learning_rate": 2e-06, + "loss": 0.2762, + "step": 7373 + }, + { + "epoch": 1.7107064145690756, + "grad_norm": 17.258664349758774, + "learning_rate": 2e-06, + "loss": 0.3045, + "step": 7374 + }, + { + "epoch": 1.7109384062173763, + "grad_norm": 7.514361257025172, + "learning_rate": 2e-06, + "loss": 0.1494, + "step": 7375 + }, + { + "epoch": 1.711170397865677, + "grad_norm": 21.462401133891056, + "learning_rate": 2e-06, + "loss": 0.3879, + "step": 7376 + }, + { + "epoch": 1.7114023895139776, + "grad_norm": 14.017574262579139, + "learning_rate": 2e-06, + "loss": 0.2501, + "step": 7377 + }, + { + "epoch": 1.7116343811622783, + "grad_norm": 59.43227788764268, + "learning_rate": 2e-06, + "loss": 0.1805, + "step": 7378 + }, + { + "epoch": 1.711866372810579, + "grad_norm": 13.789051712709842, + "learning_rate": 2e-06, + "loss": 0.2946, + "step": 7379 + }, + { + "epoch": 1.7120983644588796, + "grad_norm": 11.672587587978251, + "learning_rate": 2e-06, + "loss": 0.2156, + "step": 7380 + }, + { + "epoch": 1.71233035610718, + "grad_norm": 21.34434420615057, + "learning_rate": 2e-06, + "loss": 0.325, + "step": 7381 + }, + { + "epoch": 1.7125623477554808, + "grad_norm": 16.615661247788037, + "learning_rate": 2e-06, + "loss": 0.2408, + "step": 7382 + }, + { + "epoch": 1.7127943394037815, + "grad_norm": 20.75326511437421, + "learning_rate": 2e-06, + "loss": 0.3114, + "step": 7383 + }, + { + "epoch": 1.7130263310520821, + "grad_norm": 12.286557692400283, + "learning_rate": 2e-06, + "loss": 0.2356, + "step": 7384 + }, + { + "epoch": 1.7132583227003828, + "grad_norm": 12.174885841243048, + "learning_rate": 2e-06, + "loss": 0.223, + "step": 7385 + }, + { + "epoch": 1.7134903143486835, + "grad_norm": 14.877687974756407, + "learning_rate": 2e-06, + "loss": 0.3443, + "step": 7386 + }, + { + "epoch": 1.7137223059969842, + "grad_norm": 17.07574828854014, + "learning_rate": 2e-06, + "loss": 0.3792, + "step": 7387 + }, + { + "epoch": 1.7139542976452846, + "grad_norm": 13.713405100948842, + "learning_rate": 2e-06, + "loss": 0.2943, + "step": 7388 + }, + { + "epoch": 1.7141862892935853, + "grad_norm": 10.06477622586005, + "learning_rate": 2e-06, + "loss": 0.2264, + "step": 7389 + }, + { + "epoch": 1.714418280941886, + "grad_norm": 11.263272476317919, + "learning_rate": 2e-06, + "loss": 0.2381, + "step": 7390 + }, + { + "epoch": 1.7146502725901867, + "grad_norm": 19.508196318478443, + "learning_rate": 2e-06, + "loss": 0.3064, + "step": 7391 + }, + { + "epoch": 1.7148822642384873, + "grad_norm": 11.261598830547625, + "learning_rate": 2e-06, + "loss": 0.2087, + "step": 7392 + }, + { + "epoch": 1.715114255886788, + "grad_norm": 12.405569223219322, + "learning_rate": 2e-06, + "loss": 0.3194, + "step": 7393 + }, + { + "epoch": 1.7153462475350887, + "grad_norm": 16.233821077133854, + "learning_rate": 2e-06, + "loss": 0.2638, + "step": 7394 + }, + { + "epoch": 1.7155782391833894, + "grad_norm": 18.979216214316168, + "learning_rate": 2e-06, + "loss": 0.2379, + "step": 7395 + }, + { + "epoch": 1.71581023083169, + "grad_norm": 12.002055700439051, + "learning_rate": 2e-06, + "loss": 0.2695, + "step": 7396 + }, + { + "epoch": 1.7160422224799907, + "grad_norm": 7.190049386433045, + "learning_rate": 2e-06, + "loss": 0.1919, + "step": 7397 + }, + { + "epoch": 1.7162742141282914, + "grad_norm": 9.293922544406554, + "learning_rate": 2e-06, + "loss": 0.1739, + "step": 7398 + }, + { + "epoch": 1.716506205776592, + "grad_norm": 19.113098476282847, + "learning_rate": 2e-06, + "loss": 0.2361, + "step": 7399 + }, + { + "epoch": 1.7167381974248928, + "grad_norm": 21.822492041683304, + "learning_rate": 2e-06, + "loss": 0.3029, + "step": 7400 + }, + { + "epoch": 1.7169701890731934, + "grad_norm": 13.930659725168429, + "learning_rate": 2e-06, + "loss": 0.263, + "step": 7401 + }, + { + "epoch": 1.717202180721494, + "grad_norm": 14.905559993741406, + "learning_rate": 2e-06, + "loss": 0.2522, + "step": 7402 + }, + { + "epoch": 1.7174341723697948, + "grad_norm": 11.164819303552916, + "learning_rate": 2e-06, + "loss": 0.2697, + "step": 7403 + }, + { + "epoch": 1.7176661640180955, + "grad_norm": 8.91142886247981, + "learning_rate": 2e-06, + "loss": 0.115, + "step": 7404 + }, + { + "epoch": 1.7178981556663961, + "grad_norm": 7.949290065036741, + "learning_rate": 2e-06, + "loss": 0.2455, + "step": 7405 + }, + { + "epoch": 1.7181301473146968, + "grad_norm": 10.785661327154305, + "learning_rate": 2e-06, + "loss": 0.1592, + "step": 7406 + }, + { + "epoch": 1.7183621389629975, + "grad_norm": 16.361965541625413, + "learning_rate": 2e-06, + "loss": 0.2323, + "step": 7407 + }, + { + "epoch": 1.718594130611298, + "grad_norm": 19.92901371079172, + "learning_rate": 2e-06, + "loss": 0.3203, + "step": 7408 + }, + { + "epoch": 1.7188261222595986, + "grad_norm": 9.312777707144642, + "learning_rate": 2e-06, + "loss": 0.1555, + "step": 7409 + }, + { + "epoch": 1.7190581139078993, + "grad_norm": 6.883164003465984, + "learning_rate": 2e-06, + "loss": 0.1613, + "step": 7410 + }, + { + "epoch": 1.7192901055562, + "grad_norm": 7.625503436868352, + "learning_rate": 2e-06, + "loss": 0.233, + "step": 7411 + }, + { + "epoch": 1.7195220972045007, + "grad_norm": 8.824659120759403, + "learning_rate": 2e-06, + "loss": 0.2014, + "step": 7412 + }, + { + "epoch": 1.7197540888528013, + "grad_norm": 6.026256965073181, + "learning_rate": 2e-06, + "loss": 0.187, + "step": 7413 + }, + { + "epoch": 1.7199860805011018, + "grad_norm": 13.585052566414436, + "learning_rate": 2e-06, + "loss": 0.2389, + "step": 7414 + }, + { + "epoch": 1.7202180721494025, + "grad_norm": 20.721352563891614, + "learning_rate": 2e-06, + "loss": 0.3369, + "step": 7415 + }, + { + "epoch": 1.7204500637977032, + "grad_norm": 17.802194293608064, + "learning_rate": 2e-06, + "loss": 0.291, + "step": 7416 + }, + { + "epoch": 1.7206820554460038, + "grad_norm": 12.616286867089714, + "learning_rate": 2e-06, + "loss": 0.1882, + "step": 7417 + }, + { + "epoch": 1.7209140470943045, + "grad_norm": 9.911712897105605, + "learning_rate": 2e-06, + "loss": 0.2234, + "step": 7418 + }, + { + "epoch": 1.7211460387426052, + "grad_norm": 16.06779518312088, + "learning_rate": 2e-06, + "loss": 0.2163, + "step": 7419 + }, + { + "epoch": 1.7213780303909059, + "grad_norm": 14.026915838395407, + "learning_rate": 2e-06, + "loss": 0.2335, + "step": 7420 + }, + { + "epoch": 1.7216100220392065, + "grad_norm": 16.03652589750512, + "learning_rate": 2e-06, + "loss": 0.2006, + "step": 7421 + }, + { + "epoch": 1.7218420136875072, + "grad_norm": 14.123340674487144, + "learning_rate": 2e-06, + "loss": 0.2975, + "step": 7422 + }, + { + "epoch": 1.722074005335808, + "grad_norm": 9.61097211651846, + "learning_rate": 2e-06, + "loss": 0.1643, + "step": 7423 + }, + { + "epoch": 1.7223059969841086, + "grad_norm": 6.260522196319805, + "learning_rate": 2e-06, + "loss": 0.1685, + "step": 7424 + }, + { + "epoch": 1.7225379886324093, + "grad_norm": 12.096547938872812, + "learning_rate": 2e-06, + "loss": 0.2311, + "step": 7425 + }, + { + "epoch": 1.72276998028071, + "grad_norm": 10.742362818573552, + "learning_rate": 2e-06, + "loss": 0.1912, + "step": 7426 + }, + { + "epoch": 1.7230019719290106, + "grad_norm": 9.979446739855348, + "learning_rate": 2e-06, + "loss": 0.2561, + "step": 7427 + }, + { + "epoch": 1.7232339635773113, + "grad_norm": 12.982352110270831, + "learning_rate": 2e-06, + "loss": 0.246, + "step": 7428 + }, + { + "epoch": 1.723465955225612, + "grad_norm": 10.51719727420884, + "learning_rate": 2e-06, + "loss": 0.2248, + "step": 7429 + }, + { + "epoch": 1.7236979468739126, + "grad_norm": 19.54035939653835, + "learning_rate": 2e-06, + "loss": 0.2248, + "step": 7430 + }, + { + "epoch": 1.7239299385222133, + "grad_norm": 10.282834840940072, + "learning_rate": 2e-06, + "loss": 0.1643, + "step": 7431 + }, + { + "epoch": 1.724161930170514, + "grad_norm": 15.72589351752963, + "learning_rate": 2e-06, + "loss": 0.3086, + "step": 7432 + }, + { + "epoch": 1.7243939218188147, + "grad_norm": 11.020603461535057, + "learning_rate": 2e-06, + "loss": 0.1795, + "step": 7433 + }, + { + "epoch": 1.7246259134671151, + "grad_norm": 9.55280925000355, + "learning_rate": 2e-06, + "loss": 0.2475, + "step": 7434 + }, + { + "epoch": 1.7248579051154158, + "grad_norm": 12.311849057297254, + "learning_rate": 2e-06, + "loss": 0.1943, + "step": 7435 + }, + { + "epoch": 1.7250898967637165, + "grad_norm": 8.677933835714159, + "learning_rate": 2e-06, + "loss": 0.1278, + "step": 7436 + }, + { + "epoch": 1.7253218884120172, + "grad_norm": 9.135809044634728, + "learning_rate": 2e-06, + "loss": 0.1905, + "step": 7437 + }, + { + "epoch": 1.7255538800603178, + "grad_norm": 12.128952282925505, + "learning_rate": 2e-06, + "loss": 0.2191, + "step": 7438 + }, + { + "epoch": 1.7257858717086185, + "grad_norm": 11.620391971320371, + "learning_rate": 2e-06, + "loss": 0.3023, + "step": 7439 + }, + { + "epoch": 1.7260178633569192, + "grad_norm": 7.742562300737355, + "learning_rate": 2e-06, + "loss": 0.2002, + "step": 7440 + }, + { + "epoch": 1.7262498550052197, + "grad_norm": 11.483373680485649, + "learning_rate": 2e-06, + "loss": 0.3349, + "step": 7441 + }, + { + "epoch": 1.7264818466535203, + "grad_norm": 14.756476462309388, + "learning_rate": 2e-06, + "loss": 0.2754, + "step": 7442 + }, + { + "epoch": 1.726713838301821, + "grad_norm": 24.504072386189904, + "learning_rate": 2e-06, + "loss": 0.3742, + "step": 7443 + }, + { + "epoch": 1.7269458299501217, + "grad_norm": 12.707665192584258, + "learning_rate": 2e-06, + "loss": 0.225, + "step": 7444 + }, + { + "epoch": 1.7271778215984224, + "grad_norm": 25.36059367282093, + "learning_rate": 2e-06, + "loss": 0.2895, + "step": 7445 + }, + { + "epoch": 1.727409813246723, + "grad_norm": 18.671781461183837, + "learning_rate": 2e-06, + "loss": 0.3407, + "step": 7446 + }, + { + "epoch": 1.7276418048950237, + "grad_norm": 15.746357771325753, + "learning_rate": 2e-06, + "loss": 0.2162, + "step": 7447 + }, + { + "epoch": 1.7278737965433244, + "grad_norm": 8.535603643460004, + "learning_rate": 2e-06, + "loss": 0.2024, + "step": 7448 + }, + { + "epoch": 1.728105788191625, + "grad_norm": 16.43628019510447, + "learning_rate": 2e-06, + "loss": 0.351, + "step": 7449 + }, + { + "epoch": 1.7283377798399258, + "grad_norm": 10.001938065434366, + "learning_rate": 2e-06, + "loss": 0.2177, + "step": 7450 + }, + { + "epoch": 1.7285697714882264, + "grad_norm": 12.779681705176559, + "learning_rate": 2e-06, + "loss": 0.1771, + "step": 7451 + }, + { + "epoch": 1.7288017631365271, + "grad_norm": 13.328753843002087, + "learning_rate": 2e-06, + "loss": 0.1639, + "step": 7452 + }, + { + "epoch": 1.7290337547848278, + "grad_norm": 13.858957106187253, + "learning_rate": 2e-06, + "loss": 0.2973, + "step": 7453 + }, + { + "epoch": 1.7292657464331285, + "grad_norm": 14.723632245449247, + "learning_rate": 2e-06, + "loss": 0.2721, + "step": 7454 + }, + { + "epoch": 1.7294977380814291, + "grad_norm": 11.906681071889423, + "learning_rate": 2e-06, + "loss": 0.2087, + "step": 7455 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 7.628253623661381, + "learning_rate": 2e-06, + "loss": 0.1461, + "step": 7456 + }, + { + "epoch": 1.7299617213780305, + "grad_norm": 13.74681104057977, + "learning_rate": 2e-06, + "loss": 0.2663, + "step": 7457 + }, + { + "epoch": 1.7301937130263312, + "grad_norm": 8.654851282645897, + "learning_rate": 2e-06, + "loss": 0.2446, + "step": 7458 + }, + { + "epoch": 1.7304257046746319, + "grad_norm": 11.135571891568727, + "learning_rate": 2e-06, + "loss": 0.2514, + "step": 7459 + }, + { + "epoch": 1.7306576963229325, + "grad_norm": 23.09441910446493, + "learning_rate": 2e-06, + "loss": 0.3025, + "step": 7460 + }, + { + "epoch": 1.730889687971233, + "grad_norm": 7.408623355561432, + "learning_rate": 2e-06, + "loss": 0.2236, + "step": 7461 + }, + { + "epoch": 1.7311216796195337, + "grad_norm": 14.021068585561615, + "learning_rate": 2e-06, + "loss": 0.2338, + "step": 7462 + }, + { + "epoch": 1.7313536712678343, + "grad_norm": 13.492300579390124, + "learning_rate": 2e-06, + "loss": 0.2432, + "step": 7463 + }, + { + "epoch": 1.731585662916135, + "grad_norm": 9.888204097854688, + "learning_rate": 2e-06, + "loss": 0.264, + "step": 7464 + }, + { + "epoch": 1.7318176545644357, + "grad_norm": 15.78820383154411, + "learning_rate": 2e-06, + "loss": 0.2384, + "step": 7465 + }, + { + "epoch": 1.7320496462127364, + "grad_norm": 9.480461865651494, + "learning_rate": 2e-06, + "loss": 0.2361, + "step": 7466 + }, + { + "epoch": 1.7322816378610368, + "grad_norm": 10.41053618967658, + "learning_rate": 2e-06, + "loss": 0.2749, + "step": 7467 + }, + { + "epoch": 1.7325136295093375, + "grad_norm": 12.479664664280824, + "learning_rate": 2e-06, + "loss": 0.2908, + "step": 7468 + }, + { + "epoch": 1.7327456211576382, + "grad_norm": 15.79738817329508, + "learning_rate": 2e-06, + "loss": 0.2914, + "step": 7469 + }, + { + "epoch": 1.7329776128059389, + "grad_norm": 12.33172355967031, + "learning_rate": 2e-06, + "loss": 0.2325, + "step": 7470 + }, + { + "epoch": 1.7332096044542395, + "grad_norm": 24.294313092336242, + "learning_rate": 2e-06, + "loss": 0.319, + "step": 7471 + }, + { + "epoch": 1.7334415961025402, + "grad_norm": 25.220141346783684, + "learning_rate": 2e-06, + "loss": 0.3333, + "step": 7472 + }, + { + "epoch": 1.733673587750841, + "grad_norm": 18.63395457662261, + "learning_rate": 2e-06, + "loss": 0.2178, + "step": 7473 + }, + { + "epoch": 1.7339055793991416, + "grad_norm": 15.239283398729846, + "learning_rate": 2e-06, + "loss": 0.2145, + "step": 7474 + }, + { + "epoch": 1.7341375710474423, + "grad_norm": 11.652366727377803, + "learning_rate": 2e-06, + "loss": 0.2277, + "step": 7475 + }, + { + "epoch": 1.734369562695743, + "grad_norm": 11.646906456604956, + "learning_rate": 2e-06, + "loss": 0.2013, + "step": 7476 + }, + { + "epoch": 1.7346015543440436, + "grad_norm": 8.637249927622804, + "learning_rate": 2e-06, + "loss": 0.1697, + "step": 7477 + }, + { + "epoch": 1.7348335459923443, + "grad_norm": 11.990032300591745, + "learning_rate": 2e-06, + "loss": 0.1589, + "step": 7478 + }, + { + "epoch": 1.735065537640645, + "grad_norm": 10.945905877991185, + "learning_rate": 2e-06, + "loss": 0.2319, + "step": 7479 + }, + { + "epoch": 1.7352975292889457, + "grad_norm": 7.920506695800604, + "learning_rate": 2e-06, + "loss": 0.2008, + "step": 7480 + }, + { + "epoch": 1.7355295209372463, + "grad_norm": 8.339878093891347, + "learning_rate": 2e-06, + "loss": 0.1503, + "step": 7481 + }, + { + "epoch": 1.735761512585547, + "grad_norm": 15.023425770672432, + "learning_rate": 2e-06, + "loss": 0.3408, + "step": 7482 + }, + { + "epoch": 1.7359935042338477, + "grad_norm": 9.377761857893933, + "learning_rate": 2e-06, + "loss": 0.2254, + "step": 7483 + }, + { + "epoch": 1.7362254958821484, + "grad_norm": 10.184995366885822, + "learning_rate": 2e-06, + "loss": 0.1921, + "step": 7484 + }, + { + "epoch": 1.736457487530449, + "grad_norm": 13.578407661309907, + "learning_rate": 2e-06, + "loss": 0.2303, + "step": 7485 + }, + { + "epoch": 1.7366894791787497, + "grad_norm": 7.356389683470666, + "learning_rate": 2e-06, + "loss": 0.2637, + "step": 7486 + }, + { + "epoch": 1.7369214708270502, + "grad_norm": 24.566412216006192, + "learning_rate": 2e-06, + "loss": 0.2539, + "step": 7487 + }, + { + "epoch": 1.7371534624753509, + "grad_norm": 11.865767307057718, + "learning_rate": 2e-06, + "loss": 0.2375, + "step": 7488 + }, + { + "epoch": 1.7373854541236515, + "grad_norm": 9.212238196866018, + "learning_rate": 2e-06, + "loss": 0.1615, + "step": 7489 + }, + { + "epoch": 1.7376174457719522, + "grad_norm": 11.28651869495539, + "learning_rate": 2e-06, + "loss": 0.2427, + "step": 7490 + }, + { + "epoch": 1.7378494374202529, + "grad_norm": 16.932290711659782, + "learning_rate": 2e-06, + "loss": 0.2682, + "step": 7491 + }, + { + "epoch": 1.7380814290685536, + "grad_norm": 7.522671677144873, + "learning_rate": 2e-06, + "loss": 0.1405, + "step": 7492 + }, + { + "epoch": 1.7383134207168542, + "grad_norm": 10.36334726849435, + "learning_rate": 2e-06, + "loss": 0.2022, + "step": 7493 + }, + { + "epoch": 1.7385454123651547, + "grad_norm": 10.670762139411545, + "learning_rate": 2e-06, + "loss": 0.1941, + "step": 7494 + }, + { + "epoch": 1.7387774040134554, + "grad_norm": 13.315131202322673, + "learning_rate": 2e-06, + "loss": 0.1852, + "step": 7495 + }, + { + "epoch": 1.739009395661756, + "grad_norm": 13.907336275113536, + "learning_rate": 2e-06, + "loss": 0.216, + "step": 7496 + }, + { + "epoch": 1.7392413873100567, + "grad_norm": 13.021695949675474, + "learning_rate": 2e-06, + "loss": 0.3078, + "step": 7497 + }, + { + "epoch": 1.7394733789583574, + "grad_norm": 13.732802454687654, + "learning_rate": 2e-06, + "loss": 0.1771, + "step": 7498 + }, + { + "epoch": 1.739705370606658, + "grad_norm": 11.879483859690765, + "learning_rate": 2e-06, + "loss": 0.4304, + "step": 7499 + }, + { + "epoch": 1.7399373622549588, + "grad_norm": 16.22319785403237, + "learning_rate": 2e-06, + "loss": 0.2655, + "step": 7500 + }, + { + "epoch": 1.7401693539032594, + "grad_norm": 15.214716242928388, + "learning_rate": 2e-06, + "loss": 0.2577, + "step": 7501 + }, + { + "epoch": 1.7404013455515601, + "grad_norm": 14.592929299836015, + "learning_rate": 2e-06, + "loss": 0.2414, + "step": 7502 + }, + { + "epoch": 1.7406333371998608, + "grad_norm": 20.926869969586114, + "learning_rate": 2e-06, + "loss": 0.3618, + "step": 7503 + }, + { + "epoch": 1.7408653288481615, + "grad_norm": 11.524230240811791, + "learning_rate": 2e-06, + "loss": 0.3186, + "step": 7504 + }, + { + "epoch": 1.7410973204964622, + "grad_norm": 8.307629840052147, + "learning_rate": 2e-06, + "loss": 0.256, + "step": 7505 + }, + { + "epoch": 1.7413293121447628, + "grad_norm": 14.612464879032128, + "learning_rate": 2e-06, + "loss": 0.3739, + "step": 7506 + }, + { + "epoch": 1.7415613037930635, + "grad_norm": 9.623353081994557, + "learning_rate": 2e-06, + "loss": 0.1218, + "step": 7507 + }, + { + "epoch": 1.7417932954413642, + "grad_norm": 18.58891226199371, + "learning_rate": 2e-06, + "loss": 0.208, + "step": 7508 + }, + { + "epoch": 1.7420252870896649, + "grad_norm": 10.85916321252991, + "learning_rate": 2e-06, + "loss": 0.2402, + "step": 7509 + }, + { + "epoch": 1.7422572787379655, + "grad_norm": 9.232168351677451, + "learning_rate": 2e-06, + "loss": 0.2054, + "step": 7510 + }, + { + "epoch": 1.7424892703862662, + "grad_norm": 11.909383974387465, + "learning_rate": 2e-06, + "loss": 0.1525, + "step": 7511 + }, + { + "epoch": 1.742721262034567, + "grad_norm": 10.25152784065819, + "learning_rate": 2e-06, + "loss": 0.2434, + "step": 7512 + }, + { + "epoch": 1.7429532536828676, + "grad_norm": 9.52767133429135, + "learning_rate": 2e-06, + "loss": 0.1615, + "step": 7513 + }, + { + "epoch": 1.743185245331168, + "grad_norm": 7.889990749074475, + "learning_rate": 2e-06, + "loss": 0.1444, + "step": 7514 + }, + { + "epoch": 1.7434172369794687, + "grad_norm": 11.299170921030075, + "learning_rate": 2e-06, + "loss": 0.1949, + "step": 7515 + }, + { + "epoch": 1.7436492286277694, + "grad_norm": 16.034637698952388, + "learning_rate": 2e-06, + "loss": 0.2648, + "step": 7516 + }, + { + "epoch": 1.74388122027607, + "grad_norm": 16.068494140149642, + "learning_rate": 2e-06, + "loss": 0.2912, + "step": 7517 + }, + { + "epoch": 1.7441132119243707, + "grad_norm": 11.633519815812562, + "learning_rate": 2e-06, + "loss": 0.2694, + "step": 7518 + }, + { + "epoch": 1.7443452035726714, + "grad_norm": 14.184939207792523, + "learning_rate": 2e-06, + "loss": 0.2376, + "step": 7519 + }, + { + "epoch": 1.744577195220972, + "grad_norm": 15.154376906265702, + "learning_rate": 2e-06, + "loss": 0.2754, + "step": 7520 + }, + { + "epoch": 1.7448091868692726, + "grad_norm": 16.188615959115307, + "learning_rate": 2e-06, + "loss": 0.1729, + "step": 7521 + }, + { + "epoch": 1.7450411785175732, + "grad_norm": 12.385480875263635, + "learning_rate": 2e-06, + "loss": 0.2123, + "step": 7522 + }, + { + "epoch": 1.745273170165874, + "grad_norm": 20.89350954693197, + "learning_rate": 2e-06, + "loss": 0.3894, + "step": 7523 + }, + { + "epoch": 1.7455051618141746, + "grad_norm": 12.754250138450933, + "learning_rate": 2e-06, + "loss": 0.3069, + "step": 7524 + }, + { + "epoch": 1.7457371534624753, + "grad_norm": 16.636950981836346, + "learning_rate": 2e-06, + "loss": 0.3309, + "step": 7525 + }, + { + "epoch": 1.745969145110776, + "grad_norm": 9.15472274877276, + "learning_rate": 2e-06, + "loss": 0.2223, + "step": 7526 + }, + { + "epoch": 1.7462011367590766, + "grad_norm": 12.287297688521102, + "learning_rate": 2e-06, + "loss": 0.1625, + "step": 7527 + }, + { + "epoch": 1.7464331284073773, + "grad_norm": 16.256618957006086, + "learning_rate": 2e-06, + "loss": 0.1969, + "step": 7528 + }, + { + "epoch": 1.746665120055678, + "grad_norm": 11.264582784625329, + "learning_rate": 2e-06, + "loss": 0.2765, + "step": 7529 + }, + { + "epoch": 1.7468971117039787, + "grad_norm": 7.349783736438385, + "learning_rate": 2e-06, + "loss": 0.1936, + "step": 7530 + }, + { + "epoch": 1.7471291033522793, + "grad_norm": 13.180748443177167, + "learning_rate": 2e-06, + "loss": 0.3109, + "step": 7531 + }, + { + "epoch": 1.74736109500058, + "grad_norm": 12.550800164137836, + "learning_rate": 2e-06, + "loss": 0.2776, + "step": 7532 + }, + { + "epoch": 1.7475930866488807, + "grad_norm": 18.065154984445197, + "learning_rate": 2e-06, + "loss": 0.3497, + "step": 7533 + }, + { + "epoch": 1.7478250782971814, + "grad_norm": 12.56891058573405, + "learning_rate": 2e-06, + "loss": 0.2562, + "step": 7534 + }, + { + "epoch": 1.748057069945482, + "grad_norm": 13.878529337942904, + "learning_rate": 2e-06, + "loss": 0.286, + "step": 7535 + }, + { + "epoch": 1.7482890615937827, + "grad_norm": 19.637820912174707, + "learning_rate": 2e-06, + "loss": 0.2508, + "step": 7536 + }, + { + "epoch": 1.7485210532420834, + "grad_norm": 11.758718917948533, + "learning_rate": 2e-06, + "loss": 0.2464, + "step": 7537 + }, + { + "epoch": 1.748753044890384, + "grad_norm": 7.703897971156512, + "learning_rate": 2e-06, + "loss": 0.1873, + "step": 7538 + }, + { + "epoch": 1.7489850365386848, + "grad_norm": 22.799078363783156, + "learning_rate": 2e-06, + "loss": 0.3683, + "step": 7539 + }, + { + "epoch": 1.7492170281869854, + "grad_norm": 8.022522116780017, + "learning_rate": 2e-06, + "loss": 0.2211, + "step": 7540 + }, + { + "epoch": 1.749449019835286, + "grad_norm": 9.720216247242394, + "learning_rate": 2e-06, + "loss": 0.1509, + "step": 7541 + }, + { + "epoch": 1.7496810114835866, + "grad_norm": 12.831999318303131, + "learning_rate": 2e-06, + "loss": 0.3326, + "step": 7542 + }, + { + "epoch": 1.7499130031318872, + "grad_norm": 13.350805612457814, + "learning_rate": 2e-06, + "loss": 0.2451, + "step": 7543 + }, + { + "epoch": 1.750144994780188, + "grad_norm": 29.70812030671747, + "learning_rate": 2e-06, + "loss": 0.3109, + "step": 7544 + }, + { + "epoch": 1.7503769864284886, + "grad_norm": 14.927095134968875, + "learning_rate": 2e-06, + "loss": 0.2826, + "step": 7545 + }, + { + "epoch": 1.7506089780767893, + "grad_norm": 15.271269633188103, + "learning_rate": 2e-06, + "loss": 0.2656, + "step": 7546 + }, + { + "epoch": 1.7508409697250897, + "grad_norm": 18.91591537542004, + "learning_rate": 2e-06, + "loss": 0.2331, + "step": 7547 + }, + { + "epoch": 1.7510729613733904, + "grad_norm": 15.809697419634523, + "learning_rate": 2e-06, + "loss": 0.2874, + "step": 7548 + }, + { + "epoch": 1.751304953021691, + "grad_norm": 10.111798022135611, + "learning_rate": 2e-06, + "loss": 0.2326, + "step": 7549 + }, + { + "epoch": 1.7515369446699918, + "grad_norm": 12.66921707938741, + "learning_rate": 2e-06, + "loss": 0.2217, + "step": 7550 + }, + { + "epoch": 1.7517689363182924, + "grad_norm": 7.11185655840169, + "learning_rate": 2e-06, + "loss": 0.1963, + "step": 7551 + }, + { + "epoch": 1.7520009279665931, + "grad_norm": 12.80342476480035, + "learning_rate": 2e-06, + "loss": 0.229, + "step": 7552 + }, + { + "epoch": 1.7522329196148938, + "grad_norm": 10.654623307789162, + "learning_rate": 2e-06, + "loss": 0.2072, + "step": 7553 + }, + { + "epoch": 1.7524649112631945, + "grad_norm": 31.130164040219547, + "learning_rate": 2e-06, + "loss": 0.2552, + "step": 7554 + }, + { + "epoch": 1.7526969029114952, + "grad_norm": 9.521927343087164, + "learning_rate": 2e-06, + "loss": 0.2096, + "step": 7555 + }, + { + "epoch": 1.7529288945597958, + "grad_norm": 17.004948780199133, + "learning_rate": 2e-06, + "loss": 0.2874, + "step": 7556 + }, + { + "epoch": 1.7531608862080965, + "grad_norm": 24.340497532566054, + "learning_rate": 2e-06, + "loss": 0.3665, + "step": 7557 + }, + { + "epoch": 1.7533928778563972, + "grad_norm": 9.246138535580242, + "learning_rate": 2e-06, + "loss": 0.1686, + "step": 7558 + }, + { + "epoch": 1.7536248695046979, + "grad_norm": 10.679323908025648, + "learning_rate": 2e-06, + "loss": 0.1937, + "step": 7559 + }, + { + "epoch": 1.7538568611529985, + "grad_norm": 7.459226203385063, + "learning_rate": 2e-06, + "loss": 0.2052, + "step": 7560 + }, + { + "epoch": 1.7540888528012992, + "grad_norm": 14.642865169587246, + "learning_rate": 2e-06, + "loss": 0.2596, + "step": 7561 + }, + { + "epoch": 1.7543208444496, + "grad_norm": 10.384026009480467, + "learning_rate": 2e-06, + "loss": 0.1944, + "step": 7562 + }, + { + "epoch": 1.7545528360979006, + "grad_norm": 17.544916763886047, + "learning_rate": 2e-06, + "loss": 0.3075, + "step": 7563 + }, + { + "epoch": 1.7547848277462013, + "grad_norm": 20.255564597301053, + "learning_rate": 2e-06, + "loss": 0.2725, + "step": 7564 + }, + { + "epoch": 1.755016819394502, + "grad_norm": 15.313597934383315, + "learning_rate": 2e-06, + "loss": 0.2702, + "step": 7565 + }, + { + "epoch": 1.7552488110428026, + "grad_norm": 11.868105774967077, + "learning_rate": 2e-06, + "loss": 0.1997, + "step": 7566 + }, + { + "epoch": 1.755480802691103, + "grad_norm": 8.132450737749771, + "learning_rate": 2e-06, + "loss": 0.1891, + "step": 7567 + }, + { + "epoch": 1.7557127943394037, + "grad_norm": 9.64320808739205, + "learning_rate": 2e-06, + "loss": 0.237, + "step": 7568 + }, + { + "epoch": 1.7559447859877044, + "grad_norm": 14.945149857562205, + "learning_rate": 2e-06, + "loss": 0.32, + "step": 7569 + }, + { + "epoch": 1.756176777636005, + "grad_norm": 5.6092356114404796, + "learning_rate": 2e-06, + "loss": 0.1472, + "step": 7570 + }, + { + "epoch": 1.7564087692843058, + "grad_norm": 15.171945822518653, + "learning_rate": 2e-06, + "loss": 0.2661, + "step": 7571 + }, + { + "epoch": 1.7566407609326065, + "grad_norm": 12.379076468659813, + "learning_rate": 2e-06, + "loss": 0.2324, + "step": 7572 + }, + { + "epoch": 1.7568727525809071, + "grad_norm": 10.976070574759373, + "learning_rate": 2e-06, + "loss": 0.2262, + "step": 7573 + }, + { + "epoch": 1.7571047442292076, + "grad_norm": 17.711656018486853, + "learning_rate": 2e-06, + "loss": 0.2753, + "step": 7574 + }, + { + "epoch": 1.7573367358775083, + "grad_norm": 9.414455829200904, + "learning_rate": 2e-06, + "loss": 0.2846, + "step": 7575 + }, + { + "epoch": 1.757568727525809, + "grad_norm": 10.143710465167523, + "learning_rate": 2e-06, + "loss": 0.2424, + "step": 7576 + }, + { + "epoch": 1.7578007191741096, + "grad_norm": 14.533736277795931, + "learning_rate": 2e-06, + "loss": 0.2556, + "step": 7577 + }, + { + "epoch": 1.7580327108224103, + "grad_norm": 15.913258956484471, + "learning_rate": 2e-06, + "loss": 0.2865, + "step": 7578 + }, + { + "epoch": 1.758264702470711, + "grad_norm": 12.767660833167866, + "learning_rate": 2e-06, + "loss": 0.271, + "step": 7579 + }, + { + "epoch": 1.7584966941190117, + "grad_norm": 11.294717862094545, + "learning_rate": 2e-06, + "loss": 0.2234, + "step": 7580 + }, + { + "epoch": 1.7587286857673123, + "grad_norm": 11.306623689139046, + "learning_rate": 2e-06, + "loss": 0.2039, + "step": 7581 + }, + { + "epoch": 1.758960677415613, + "grad_norm": 8.535703463216377, + "learning_rate": 2e-06, + "loss": 0.1754, + "step": 7582 + }, + { + "epoch": 1.7591926690639137, + "grad_norm": 15.506198698340077, + "learning_rate": 2e-06, + "loss": 0.1989, + "step": 7583 + }, + { + "epoch": 1.7594246607122144, + "grad_norm": 13.271148442659763, + "learning_rate": 2e-06, + "loss": 0.2016, + "step": 7584 + }, + { + "epoch": 1.759656652360515, + "grad_norm": 12.936342351595748, + "learning_rate": 2e-06, + "loss": 0.2456, + "step": 7585 + }, + { + "epoch": 1.7598886440088157, + "grad_norm": 4.865820394943794, + "learning_rate": 2e-06, + "loss": 0.1095, + "step": 7586 + }, + { + "epoch": 1.7601206356571164, + "grad_norm": 24.73588202755812, + "learning_rate": 2e-06, + "loss": 0.4227, + "step": 7587 + }, + { + "epoch": 1.760352627305417, + "grad_norm": 20.67162051161943, + "learning_rate": 2e-06, + "loss": 0.2526, + "step": 7588 + }, + { + "epoch": 1.7605846189537178, + "grad_norm": 19.6711134518649, + "learning_rate": 2e-06, + "loss": 0.3269, + "step": 7589 + }, + { + "epoch": 1.7608166106020184, + "grad_norm": 14.748934619827882, + "learning_rate": 2e-06, + "loss": 0.2186, + "step": 7590 + }, + { + "epoch": 1.7610486022503191, + "grad_norm": 11.75288363328804, + "learning_rate": 2e-06, + "loss": 0.1792, + "step": 7591 + }, + { + "epoch": 1.7612805938986198, + "grad_norm": 13.161600894829599, + "learning_rate": 2e-06, + "loss": 0.3328, + "step": 7592 + }, + { + "epoch": 1.7615125855469205, + "grad_norm": 24.39174578419128, + "learning_rate": 2e-06, + "loss": 0.276, + "step": 7593 + }, + { + "epoch": 1.761744577195221, + "grad_norm": 12.371452232880623, + "learning_rate": 2e-06, + "loss": 0.2194, + "step": 7594 + }, + { + "epoch": 1.7619765688435216, + "grad_norm": 14.749222617313963, + "learning_rate": 2e-06, + "loss": 0.2265, + "step": 7595 + }, + { + "epoch": 1.7622085604918223, + "grad_norm": 14.394177818624705, + "learning_rate": 2e-06, + "loss": 0.2763, + "step": 7596 + }, + { + "epoch": 1.762440552140123, + "grad_norm": 11.736377858916772, + "learning_rate": 2e-06, + "loss": 0.2413, + "step": 7597 + }, + { + "epoch": 1.7626725437884236, + "grad_norm": 9.37857496441924, + "learning_rate": 2e-06, + "loss": 0.2431, + "step": 7598 + }, + { + "epoch": 1.7629045354367243, + "grad_norm": 7.755353360563734, + "learning_rate": 2e-06, + "loss": 0.1467, + "step": 7599 + }, + { + "epoch": 1.7631365270850248, + "grad_norm": 17.392051182713978, + "learning_rate": 2e-06, + "loss": 0.3205, + "step": 7600 + }, + { + "epoch": 1.7633685187333255, + "grad_norm": 6.479599459078176, + "learning_rate": 2e-06, + "loss": 0.1768, + "step": 7601 + }, + { + "epoch": 1.7636005103816261, + "grad_norm": 12.443300925816132, + "learning_rate": 2e-06, + "loss": 0.2188, + "step": 7602 + }, + { + "epoch": 1.7638325020299268, + "grad_norm": 14.02188400560854, + "learning_rate": 2e-06, + "loss": 0.1731, + "step": 7603 + }, + { + "epoch": 1.7640644936782275, + "grad_norm": 13.579452802841217, + "learning_rate": 2e-06, + "loss": 0.188, + "step": 7604 + }, + { + "epoch": 1.7642964853265282, + "grad_norm": 11.894099852846338, + "learning_rate": 2e-06, + "loss": 0.2135, + "step": 7605 + }, + { + "epoch": 1.7645284769748288, + "grad_norm": 15.44436809606793, + "learning_rate": 2e-06, + "loss": 0.2343, + "step": 7606 + }, + { + "epoch": 1.7647604686231295, + "grad_norm": 25.01790094541462, + "learning_rate": 2e-06, + "loss": 0.2829, + "step": 7607 + }, + { + "epoch": 1.7649924602714302, + "grad_norm": 17.30869963511119, + "learning_rate": 2e-06, + "loss": 0.3645, + "step": 7608 + }, + { + "epoch": 1.7652244519197309, + "grad_norm": 7.544423889631266, + "learning_rate": 2e-06, + "loss": 0.1884, + "step": 7609 + }, + { + "epoch": 1.7654564435680316, + "grad_norm": 16.77983044601465, + "learning_rate": 2e-06, + "loss": 0.3064, + "step": 7610 + }, + { + "epoch": 1.7656884352163322, + "grad_norm": 13.915765880853346, + "learning_rate": 2e-06, + "loss": 0.3585, + "step": 7611 + }, + { + "epoch": 1.765920426864633, + "grad_norm": 14.45783459966466, + "learning_rate": 2e-06, + "loss": 0.234, + "step": 7612 + }, + { + "epoch": 1.7661524185129336, + "grad_norm": 13.257478868195667, + "learning_rate": 2e-06, + "loss": 0.1836, + "step": 7613 + }, + { + "epoch": 1.7663844101612343, + "grad_norm": 24.416811462153376, + "learning_rate": 2e-06, + "loss": 0.3739, + "step": 7614 + }, + { + "epoch": 1.766616401809535, + "grad_norm": 16.41812661132975, + "learning_rate": 2e-06, + "loss": 0.3323, + "step": 7615 + }, + { + "epoch": 1.7668483934578356, + "grad_norm": 13.841333663391419, + "learning_rate": 2e-06, + "loss": 0.2876, + "step": 7616 + }, + { + "epoch": 1.7670803851061363, + "grad_norm": 18.261926281948277, + "learning_rate": 2e-06, + "loss": 0.2349, + "step": 7617 + }, + { + "epoch": 1.767312376754437, + "grad_norm": 12.052038914795572, + "learning_rate": 2e-06, + "loss": 0.267, + "step": 7618 + }, + { + "epoch": 1.7675443684027377, + "grad_norm": 13.682131575458424, + "learning_rate": 2e-06, + "loss": 0.2445, + "step": 7619 + }, + { + "epoch": 1.767776360051038, + "grad_norm": 17.787255720543858, + "learning_rate": 2e-06, + "loss": 0.2742, + "step": 7620 + }, + { + "epoch": 1.7680083516993388, + "grad_norm": 15.57759468719208, + "learning_rate": 2e-06, + "loss": 0.2104, + "step": 7621 + }, + { + "epoch": 1.7682403433476395, + "grad_norm": 10.939546101600278, + "learning_rate": 2e-06, + "loss": 0.2552, + "step": 7622 + }, + { + "epoch": 1.7684723349959401, + "grad_norm": 17.21174522628649, + "learning_rate": 2e-06, + "loss": 0.3564, + "step": 7623 + }, + { + "epoch": 1.7687043266442408, + "grad_norm": 10.751055553207847, + "learning_rate": 2e-06, + "loss": 0.1995, + "step": 7624 + }, + { + "epoch": 1.7689363182925415, + "grad_norm": 39.846053247413224, + "learning_rate": 2e-06, + "loss": 0.3384, + "step": 7625 + }, + { + "epoch": 1.7691683099408422, + "grad_norm": 12.08905876035218, + "learning_rate": 2e-06, + "loss": 0.1989, + "step": 7626 + }, + { + "epoch": 1.7694003015891426, + "grad_norm": 12.793073296425936, + "learning_rate": 2e-06, + "loss": 0.2565, + "step": 7627 + }, + { + "epoch": 1.7696322932374433, + "grad_norm": 15.993395786481747, + "learning_rate": 2e-06, + "loss": 0.3398, + "step": 7628 + }, + { + "epoch": 1.769864284885744, + "grad_norm": 13.706742234967228, + "learning_rate": 2e-06, + "loss": 0.1252, + "step": 7629 + }, + { + "epoch": 1.7700962765340447, + "grad_norm": 11.953171049385569, + "learning_rate": 2e-06, + "loss": 0.2428, + "step": 7630 + }, + { + "epoch": 1.7703282681823453, + "grad_norm": 14.74479617381812, + "learning_rate": 2e-06, + "loss": 0.2547, + "step": 7631 + }, + { + "epoch": 1.770560259830646, + "grad_norm": 14.219514617403146, + "learning_rate": 2e-06, + "loss": 0.2018, + "step": 7632 + }, + { + "epoch": 1.7707922514789467, + "grad_norm": 13.951297658416964, + "learning_rate": 2e-06, + "loss": 0.2766, + "step": 7633 + }, + { + "epoch": 1.7710242431272474, + "grad_norm": 9.491752262170884, + "learning_rate": 2e-06, + "loss": 0.1672, + "step": 7634 + }, + { + "epoch": 1.771256234775548, + "grad_norm": 17.46797158449257, + "learning_rate": 2e-06, + "loss": 0.3137, + "step": 7635 + }, + { + "epoch": 1.7714882264238487, + "grad_norm": 19.06944157828703, + "learning_rate": 2e-06, + "loss": 0.2695, + "step": 7636 + }, + { + "epoch": 1.7717202180721494, + "grad_norm": 15.879264186736425, + "learning_rate": 2e-06, + "loss": 0.2672, + "step": 7637 + }, + { + "epoch": 1.77195220972045, + "grad_norm": 9.32608523384946, + "learning_rate": 2e-06, + "loss": 0.2372, + "step": 7638 + }, + { + "epoch": 1.7721842013687508, + "grad_norm": 18.03702817443358, + "learning_rate": 2e-06, + "loss": 0.2554, + "step": 7639 + }, + { + "epoch": 1.7724161930170514, + "grad_norm": 13.736295330033094, + "learning_rate": 2e-06, + "loss": 0.2727, + "step": 7640 + }, + { + "epoch": 1.7726481846653521, + "grad_norm": 8.515303235720744, + "learning_rate": 2e-06, + "loss": 0.1856, + "step": 7641 + }, + { + "epoch": 1.7728801763136528, + "grad_norm": 23.630669813146312, + "learning_rate": 2e-06, + "loss": 0.2172, + "step": 7642 + }, + { + "epoch": 1.7731121679619535, + "grad_norm": 12.887268646237283, + "learning_rate": 2e-06, + "loss": 0.3188, + "step": 7643 + }, + { + "epoch": 1.7733441596102542, + "grad_norm": 12.540798137644023, + "learning_rate": 2e-06, + "loss": 0.2652, + "step": 7644 + }, + { + "epoch": 1.7735761512585548, + "grad_norm": 14.105494099925721, + "learning_rate": 2e-06, + "loss": 0.2969, + "step": 7645 + }, + { + "epoch": 1.7738081429068555, + "grad_norm": 13.725792982960401, + "learning_rate": 2e-06, + "loss": 0.227, + "step": 7646 + }, + { + "epoch": 1.774040134555156, + "grad_norm": 21.37437022895052, + "learning_rate": 2e-06, + "loss": 0.3201, + "step": 7647 + }, + { + "epoch": 1.7742721262034566, + "grad_norm": 10.309915600828834, + "learning_rate": 2e-06, + "loss": 0.2181, + "step": 7648 + }, + { + "epoch": 1.7745041178517573, + "grad_norm": 12.343204213291852, + "learning_rate": 2e-06, + "loss": 0.1647, + "step": 7649 + }, + { + "epoch": 1.774736109500058, + "grad_norm": 12.04423941594413, + "learning_rate": 2e-06, + "loss": 0.2637, + "step": 7650 + }, + { + "epoch": 1.7749681011483587, + "grad_norm": 18.005224230213113, + "learning_rate": 2e-06, + "loss": 0.3285, + "step": 7651 + }, + { + "epoch": 1.7752000927966594, + "grad_norm": 7.775026327525456, + "learning_rate": 2e-06, + "loss": 0.1205, + "step": 7652 + }, + { + "epoch": 1.77543208444496, + "grad_norm": 15.384233415521097, + "learning_rate": 2e-06, + "loss": 0.3409, + "step": 7653 + }, + { + "epoch": 1.7756640760932605, + "grad_norm": 4.987462215279807, + "learning_rate": 2e-06, + "loss": 0.1379, + "step": 7654 + }, + { + "epoch": 1.7758960677415612, + "grad_norm": 22.727798216734325, + "learning_rate": 2e-06, + "loss": 0.4157, + "step": 7655 + }, + { + "epoch": 1.7761280593898618, + "grad_norm": 8.363343229513351, + "learning_rate": 2e-06, + "loss": 0.167, + "step": 7656 + }, + { + "epoch": 1.7763600510381625, + "grad_norm": 7.1776307695687045, + "learning_rate": 2e-06, + "loss": 0.1454, + "step": 7657 + }, + { + "epoch": 1.7765920426864632, + "grad_norm": 10.413632590231295, + "learning_rate": 2e-06, + "loss": 0.1798, + "step": 7658 + }, + { + "epoch": 1.7768240343347639, + "grad_norm": 17.375429455673192, + "learning_rate": 2e-06, + "loss": 0.3528, + "step": 7659 + }, + { + "epoch": 1.7770560259830646, + "grad_norm": 9.339877282892838, + "learning_rate": 2e-06, + "loss": 0.3373, + "step": 7660 + }, + { + "epoch": 1.7772880176313652, + "grad_norm": 15.76690432141599, + "learning_rate": 2e-06, + "loss": 0.2209, + "step": 7661 + }, + { + "epoch": 1.777520009279666, + "grad_norm": 16.580997351774784, + "learning_rate": 2e-06, + "loss": 0.248, + "step": 7662 + }, + { + "epoch": 1.7777520009279666, + "grad_norm": 8.380956961573576, + "learning_rate": 2e-06, + "loss": 0.1943, + "step": 7663 + }, + { + "epoch": 1.7779839925762673, + "grad_norm": 18.827557808833518, + "learning_rate": 2e-06, + "loss": 0.2473, + "step": 7664 + }, + { + "epoch": 1.778215984224568, + "grad_norm": 13.388074493082893, + "learning_rate": 2e-06, + "loss": 0.2367, + "step": 7665 + }, + { + "epoch": 1.7784479758728686, + "grad_norm": 26.059732419409222, + "learning_rate": 2e-06, + "loss": 0.2773, + "step": 7666 + }, + { + "epoch": 1.7786799675211693, + "grad_norm": 11.879194017751967, + "learning_rate": 2e-06, + "loss": 0.2716, + "step": 7667 + }, + { + "epoch": 1.77891195916947, + "grad_norm": 20.71252153591305, + "learning_rate": 2e-06, + "loss": 0.2782, + "step": 7668 + }, + { + "epoch": 1.7791439508177707, + "grad_norm": 8.655225424649053, + "learning_rate": 2e-06, + "loss": 0.149, + "step": 7669 + }, + { + "epoch": 1.7793759424660713, + "grad_norm": 24.443766643758085, + "learning_rate": 2e-06, + "loss": 0.378, + "step": 7670 + }, + { + "epoch": 1.779607934114372, + "grad_norm": 13.744789244886295, + "learning_rate": 2e-06, + "loss": 0.2104, + "step": 7671 + }, + { + "epoch": 1.7798399257626727, + "grad_norm": 11.431132212646698, + "learning_rate": 2e-06, + "loss": 0.2329, + "step": 7672 + }, + { + "epoch": 1.7800719174109734, + "grad_norm": 12.346610648641189, + "learning_rate": 2e-06, + "loss": 0.3629, + "step": 7673 + }, + { + "epoch": 1.7803039090592738, + "grad_norm": 13.919607998095852, + "learning_rate": 2e-06, + "loss": 0.178, + "step": 7674 + }, + { + "epoch": 1.7805359007075745, + "grad_norm": 17.970739199315016, + "learning_rate": 2e-06, + "loss": 0.3034, + "step": 7675 + }, + { + "epoch": 1.7807678923558752, + "grad_norm": 7.195667958235038, + "learning_rate": 2e-06, + "loss": 0.224, + "step": 7676 + }, + { + "epoch": 1.7809998840041759, + "grad_norm": 8.009797752370751, + "learning_rate": 2e-06, + "loss": 0.1594, + "step": 7677 + }, + { + "epoch": 1.7812318756524765, + "grad_norm": 4.141046293959366, + "learning_rate": 2e-06, + "loss": 0.1238, + "step": 7678 + }, + { + "epoch": 1.7814638673007772, + "grad_norm": 12.84582198199472, + "learning_rate": 2e-06, + "loss": 0.2748, + "step": 7679 + }, + { + "epoch": 1.7816958589490777, + "grad_norm": 15.120203097994304, + "learning_rate": 2e-06, + "loss": 0.2866, + "step": 7680 + }, + { + "epoch": 1.7819278505973783, + "grad_norm": 14.5632055267852, + "learning_rate": 2e-06, + "loss": 0.2852, + "step": 7681 + }, + { + "epoch": 1.782159842245679, + "grad_norm": 16.375652600630268, + "learning_rate": 2e-06, + "loss": 0.3306, + "step": 7682 + }, + { + "epoch": 1.7823918338939797, + "grad_norm": 15.930067366581838, + "learning_rate": 2e-06, + "loss": 0.2648, + "step": 7683 + }, + { + "epoch": 1.7826238255422804, + "grad_norm": 13.939992885928524, + "learning_rate": 2e-06, + "loss": 0.2087, + "step": 7684 + }, + { + "epoch": 1.782855817190581, + "grad_norm": 8.928404635074617, + "learning_rate": 2e-06, + "loss": 0.2156, + "step": 7685 + }, + { + "epoch": 1.7830878088388817, + "grad_norm": 6.86939899315531, + "learning_rate": 2e-06, + "loss": 0.2018, + "step": 7686 + }, + { + "epoch": 1.7833198004871824, + "grad_norm": 14.856892617680042, + "learning_rate": 2e-06, + "loss": 0.2491, + "step": 7687 + }, + { + "epoch": 1.783551792135483, + "grad_norm": 9.050658316312433, + "learning_rate": 2e-06, + "loss": 0.2238, + "step": 7688 + }, + { + "epoch": 1.7837837837837838, + "grad_norm": 10.238929511967964, + "learning_rate": 2e-06, + "loss": 0.261, + "step": 7689 + }, + { + "epoch": 1.7840157754320844, + "grad_norm": 12.373983145047632, + "learning_rate": 2e-06, + "loss": 0.2076, + "step": 7690 + }, + { + "epoch": 1.7842477670803851, + "grad_norm": 18.484115788006253, + "learning_rate": 2e-06, + "loss": 0.3735, + "step": 7691 + }, + { + "epoch": 1.7844797587286858, + "grad_norm": 13.47239908811981, + "learning_rate": 2e-06, + "loss": 0.3014, + "step": 7692 + }, + { + "epoch": 1.7847117503769865, + "grad_norm": 13.809744947458755, + "learning_rate": 2e-06, + "loss": 0.2936, + "step": 7693 + }, + { + "epoch": 1.7849437420252872, + "grad_norm": 12.90404228506961, + "learning_rate": 2e-06, + "loss": 0.2841, + "step": 7694 + }, + { + "epoch": 1.7851757336735878, + "grad_norm": 9.70647010268845, + "learning_rate": 2e-06, + "loss": 0.194, + "step": 7695 + }, + { + "epoch": 1.7854077253218885, + "grad_norm": 10.11452466154601, + "learning_rate": 2e-06, + "loss": 0.2296, + "step": 7696 + }, + { + "epoch": 1.7856397169701892, + "grad_norm": 18.241161735120254, + "learning_rate": 2e-06, + "loss": 0.2553, + "step": 7697 + }, + { + "epoch": 1.7858717086184899, + "grad_norm": 4.363594729824572, + "learning_rate": 2e-06, + "loss": 0.1453, + "step": 7698 + }, + { + "epoch": 1.7861037002667906, + "grad_norm": 19.847002056503342, + "learning_rate": 2e-06, + "loss": 0.2892, + "step": 7699 + }, + { + "epoch": 1.786335691915091, + "grad_norm": 12.037313097036915, + "learning_rate": 2e-06, + "loss": 0.1817, + "step": 7700 + }, + { + "epoch": 1.7865676835633917, + "grad_norm": 9.665541170604797, + "learning_rate": 2e-06, + "loss": 0.2488, + "step": 7701 + }, + { + "epoch": 1.7867996752116924, + "grad_norm": 9.578282900368189, + "learning_rate": 2e-06, + "loss": 0.1973, + "step": 7702 + }, + { + "epoch": 1.787031666859993, + "grad_norm": 12.011774653107265, + "learning_rate": 2e-06, + "loss": 0.1994, + "step": 7703 + }, + { + "epoch": 1.7872636585082937, + "grad_norm": 14.059264981423192, + "learning_rate": 2e-06, + "loss": 0.3322, + "step": 7704 + }, + { + "epoch": 1.7874956501565944, + "grad_norm": 11.308037303258596, + "learning_rate": 2e-06, + "loss": 0.1671, + "step": 7705 + }, + { + "epoch": 1.787727641804895, + "grad_norm": 14.830070083622193, + "learning_rate": 2e-06, + "loss": 0.2383, + "step": 7706 + }, + { + "epoch": 1.7879596334531955, + "grad_norm": 8.970729653365368, + "learning_rate": 2e-06, + "loss": 0.1878, + "step": 7707 + }, + { + "epoch": 1.7881916251014962, + "grad_norm": 15.265954978012068, + "learning_rate": 2e-06, + "loss": 0.1944, + "step": 7708 + }, + { + "epoch": 1.7884236167497969, + "grad_norm": 9.267580147523244, + "learning_rate": 2e-06, + "loss": 0.1827, + "step": 7709 + }, + { + "epoch": 1.7886556083980976, + "grad_norm": 11.816356243554782, + "learning_rate": 2e-06, + "loss": 0.2213, + "step": 7710 + }, + { + "epoch": 1.7888876000463982, + "grad_norm": 14.13892712739281, + "learning_rate": 2e-06, + "loss": 0.3183, + "step": 7711 + }, + { + "epoch": 1.789119591694699, + "grad_norm": 24.37772003449913, + "learning_rate": 2e-06, + "loss": 0.3273, + "step": 7712 + }, + { + "epoch": 1.7893515833429996, + "grad_norm": 18.881919815002593, + "learning_rate": 2e-06, + "loss": 0.2567, + "step": 7713 + }, + { + "epoch": 1.7895835749913003, + "grad_norm": 9.161757211887082, + "learning_rate": 2e-06, + "loss": 0.1735, + "step": 7714 + }, + { + "epoch": 1.789815566639601, + "grad_norm": 17.849954977305433, + "learning_rate": 2e-06, + "loss": 0.3116, + "step": 7715 + }, + { + "epoch": 1.7900475582879016, + "grad_norm": 8.87979005272126, + "learning_rate": 2e-06, + "loss": 0.137, + "step": 7716 + }, + { + "epoch": 1.7902795499362023, + "grad_norm": 17.163684039103828, + "learning_rate": 2e-06, + "loss": 0.3231, + "step": 7717 + }, + { + "epoch": 1.790511541584503, + "grad_norm": 10.957487983245123, + "learning_rate": 2e-06, + "loss": 0.2304, + "step": 7718 + }, + { + "epoch": 1.7907435332328037, + "grad_norm": 7.331413316833315, + "learning_rate": 2e-06, + "loss": 0.2005, + "step": 7719 + }, + { + "epoch": 1.7909755248811043, + "grad_norm": 10.347401603722119, + "learning_rate": 2e-06, + "loss": 0.2613, + "step": 7720 + }, + { + "epoch": 1.791207516529405, + "grad_norm": 14.864802701315973, + "learning_rate": 2e-06, + "loss": 0.2408, + "step": 7721 + }, + { + "epoch": 1.7914395081777057, + "grad_norm": 12.12204936362209, + "learning_rate": 2e-06, + "loss": 0.2537, + "step": 7722 + }, + { + "epoch": 1.7916714998260064, + "grad_norm": 11.61910317439687, + "learning_rate": 2e-06, + "loss": 0.2107, + "step": 7723 + }, + { + "epoch": 1.791903491474307, + "grad_norm": 11.021376022092555, + "learning_rate": 2e-06, + "loss": 0.2689, + "step": 7724 + }, + { + "epoch": 1.7921354831226077, + "grad_norm": 14.823213878454235, + "learning_rate": 2e-06, + "loss": 0.2617, + "step": 7725 + }, + { + "epoch": 1.7923674747709084, + "grad_norm": 13.759950442388043, + "learning_rate": 2e-06, + "loss": 0.284, + "step": 7726 + }, + { + "epoch": 1.7925994664192089, + "grad_norm": 13.73359765140498, + "learning_rate": 2e-06, + "loss": 0.1964, + "step": 7727 + }, + { + "epoch": 1.7928314580675095, + "grad_norm": 21.592029416160536, + "learning_rate": 2e-06, + "loss": 0.2878, + "step": 7728 + }, + { + "epoch": 1.7930634497158102, + "grad_norm": 13.066431196494792, + "learning_rate": 2e-06, + "loss": 0.2148, + "step": 7729 + }, + { + "epoch": 1.793295441364111, + "grad_norm": 23.518466840599984, + "learning_rate": 2e-06, + "loss": 0.3538, + "step": 7730 + }, + { + "epoch": 1.7935274330124116, + "grad_norm": 22.280556048239266, + "learning_rate": 2e-06, + "loss": 0.3328, + "step": 7731 + }, + { + "epoch": 1.7937594246607123, + "grad_norm": 18.70270509152741, + "learning_rate": 2e-06, + "loss": 0.2993, + "step": 7732 + }, + { + "epoch": 1.7939914163090127, + "grad_norm": 14.745009418359984, + "learning_rate": 2e-06, + "loss": 0.2493, + "step": 7733 + }, + { + "epoch": 1.7942234079573134, + "grad_norm": 7.020182056098816, + "learning_rate": 2e-06, + "loss": 0.1825, + "step": 7734 + }, + { + "epoch": 1.794455399605614, + "grad_norm": 8.9041045197384, + "learning_rate": 2e-06, + "loss": 0.1951, + "step": 7735 + }, + { + "epoch": 1.7946873912539147, + "grad_norm": 16.96340376259227, + "learning_rate": 2e-06, + "loss": 0.2135, + "step": 7736 + }, + { + "epoch": 1.7949193829022154, + "grad_norm": 13.063819447973446, + "learning_rate": 2e-06, + "loss": 0.3117, + "step": 7737 + }, + { + "epoch": 1.795151374550516, + "grad_norm": 17.813456129041118, + "learning_rate": 2e-06, + "loss": 0.319, + "step": 7738 + }, + { + "epoch": 1.7953833661988168, + "grad_norm": 14.21559790292514, + "learning_rate": 2e-06, + "loss": 0.3333, + "step": 7739 + }, + { + "epoch": 1.7956153578471175, + "grad_norm": 13.970632816975339, + "learning_rate": 2e-06, + "loss": 0.3119, + "step": 7740 + }, + { + "epoch": 1.7958473494954181, + "grad_norm": 20.13254991016089, + "learning_rate": 2e-06, + "loss": 0.3724, + "step": 7741 + }, + { + "epoch": 1.7960793411437188, + "grad_norm": 12.702450368500116, + "learning_rate": 2e-06, + "loss": 0.1933, + "step": 7742 + }, + { + "epoch": 1.7963113327920195, + "grad_norm": 5.033487482828364, + "learning_rate": 2e-06, + "loss": 0.1609, + "step": 7743 + }, + { + "epoch": 1.7965433244403202, + "grad_norm": 13.226202684968266, + "learning_rate": 2e-06, + "loss": 0.2027, + "step": 7744 + }, + { + "epoch": 1.7967753160886208, + "grad_norm": 11.445493042999509, + "learning_rate": 2e-06, + "loss": 0.2252, + "step": 7745 + }, + { + "epoch": 1.7970073077369215, + "grad_norm": 9.386391237175017, + "learning_rate": 2e-06, + "loss": 0.2463, + "step": 7746 + }, + { + "epoch": 1.7972392993852222, + "grad_norm": 8.137216847310562, + "learning_rate": 2e-06, + "loss": 0.2218, + "step": 7747 + }, + { + "epoch": 1.7974712910335229, + "grad_norm": 14.29784916326407, + "learning_rate": 2e-06, + "loss": 0.2736, + "step": 7748 + }, + { + "epoch": 1.7977032826818236, + "grad_norm": 32.617210845413794, + "learning_rate": 2e-06, + "loss": 0.5112, + "step": 7749 + }, + { + "epoch": 1.7979352743301242, + "grad_norm": 11.9630554774893, + "learning_rate": 2e-06, + "loss": 0.2474, + "step": 7750 + }, + { + "epoch": 1.798167265978425, + "grad_norm": 9.624929990079778, + "learning_rate": 2e-06, + "loss": 0.2495, + "step": 7751 + }, + { + "epoch": 1.7983992576267256, + "grad_norm": 14.670156870012233, + "learning_rate": 2e-06, + "loss": 0.2639, + "step": 7752 + }, + { + "epoch": 1.798631249275026, + "grad_norm": 10.86678132759406, + "learning_rate": 2e-06, + "loss": 0.2205, + "step": 7753 + }, + { + "epoch": 1.7988632409233267, + "grad_norm": 16.638640631879152, + "learning_rate": 2e-06, + "loss": 0.3019, + "step": 7754 + }, + { + "epoch": 1.7990952325716274, + "grad_norm": 10.957011788431164, + "learning_rate": 2e-06, + "loss": 0.2409, + "step": 7755 + }, + { + "epoch": 1.799327224219928, + "grad_norm": 11.232915987644578, + "learning_rate": 2e-06, + "loss": 0.2198, + "step": 7756 + }, + { + "epoch": 1.7995592158682288, + "grad_norm": 7.602008455160166, + "learning_rate": 2e-06, + "loss": 0.1659, + "step": 7757 + }, + { + "epoch": 1.7997912075165294, + "grad_norm": 10.923209403337351, + "learning_rate": 2e-06, + "loss": 0.2615, + "step": 7758 + }, + { + "epoch": 1.8000231991648301, + "grad_norm": 7.540291491452469, + "learning_rate": 2e-06, + "loss": 0.1623, + "step": 7759 + }, + { + "epoch": 1.8002551908131306, + "grad_norm": 5.588197524067868, + "learning_rate": 2e-06, + "loss": 0.1303, + "step": 7760 + }, + { + "epoch": 1.8004871824614312, + "grad_norm": 7.92180637279653, + "learning_rate": 2e-06, + "loss": 0.1813, + "step": 7761 + }, + { + "epoch": 1.800719174109732, + "grad_norm": 10.141586008302626, + "learning_rate": 2e-06, + "loss": 0.2743, + "step": 7762 + }, + { + "epoch": 1.8009511657580326, + "grad_norm": 21.33250743923904, + "learning_rate": 2e-06, + "loss": 0.334, + "step": 7763 + }, + { + "epoch": 1.8011831574063333, + "grad_norm": 10.204261149433293, + "learning_rate": 2e-06, + "loss": 0.1503, + "step": 7764 + }, + { + "epoch": 1.801415149054634, + "grad_norm": 14.724303989684397, + "learning_rate": 2e-06, + "loss": 0.2317, + "step": 7765 + }, + { + "epoch": 1.8016471407029346, + "grad_norm": 11.189179616754403, + "learning_rate": 2e-06, + "loss": 0.2597, + "step": 7766 + }, + { + "epoch": 1.8018791323512353, + "grad_norm": 8.667717048927448, + "learning_rate": 2e-06, + "loss": 0.1805, + "step": 7767 + }, + { + "epoch": 1.802111123999536, + "grad_norm": 10.099312307553099, + "learning_rate": 2e-06, + "loss": 0.2101, + "step": 7768 + }, + { + "epoch": 1.8023431156478367, + "grad_norm": 10.156468261813107, + "learning_rate": 2e-06, + "loss": 0.2434, + "step": 7769 + }, + { + "epoch": 1.8025751072961373, + "grad_norm": 6.768385693132343, + "learning_rate": 2e-06, + "loss": 0.23, + "step": 7770 + }, + { + "epoch": 1.802807098944438, + "grad_norm": 9.443324597411717, + "learning_rate": 2e-06, + "loss": 0.2353, + "step": 7771 + }, + { + "epoch": 1.8030390905927387, + "grad_norm": 10.59913449235798, + "learning_rate": 2e-06, + "loss": 0.2387, + "step": 7772 + }, + { + "epoch": 1.8032710822410394, + "grad_norm": 16.13258187524545, + "learning_rate": 2e-06, + "loss": 0.2608, + "step": 7773 + }, + { + "epoch": 1.80350307388934, + "grad_norm": 15.139015354804348, + "learning_rate": 2e-06, + "loss": 0.376, + "step": 7774 + }, + { + "epoch": 1.8037350655376407, + "grad_norm": 10.451231399468004, + "learning_rate": 2e-06, + "loss": 0.2176, + "step": 7775 + }, + { + "epoch": 1.8039670571859414, + "grad_norm": 17.00309216109801, + "learning_rate": 2e-06, + "loss": 0.2994, + "step": 7776 + }, + { + "epoch": 1.804199048834242, + "grad_norm": 12.648387109888144, + "learning_rate": 2e-06, + "loss": 0.2219, + "step": 7777 + }, + { + "epoch": 1.8044310404825428, + "grad_norm": 17.7688065718772, + "learning_rate": 2e-06, + "loss": 0.2452, + "step": 7778 + }, + { + "epoch": 1.8046630321308434, + "grad_norm": 14.367544047444886, + "learning_rate": 2e-06, + "loss": 0.2233, + "step": 7779 + }, + { + "epoch": 1.804895023779144, + "grad_norm": 9.520236345447163, + "learning_rate": 2e-06, + "loss": 0.1703, + "step": 7780 + }, + { + "epoch": 1.8051270154274446, + "grad_norm": 14.606371152511754, + "learning_rate": 2e-06, + "loss": 0.2248, + "step": 7781 + }, + { + "epoch": 1.8053590070757453, + "grad_norm": 5.579155887762543, + "learning_rate": 2e-06, + "loss": 0.1576, + "step": 7782 + }, + { + "epoch": 1.805590998724046, + "grad_norm": 12.948532567369465, + "learning_rate": 2e-06, + "loss": 0.2833, + "step": 7783 + }, + { + "epoch": 1.8058229903723466, + "grad_norm": 16.693346240562605, + "learning_rate": 2e-06, + "loss": 0.3763, + "step": 7784 + }, + { + "epoch": 1.8060549820206473, + "grad_norm": 15.82863877420163, + "learning_rate": 2e-06, + "loss": 0.2494, + "step": 7785 + }, + { + "epoch": 1.806286973668948, + "grad_norm": 8.62009366577174, + "learning_rate": 2e-06, + "loss": 0.2333, + "step": 7786 + }, + { + "epoch": 1.8065189653172484, + "grad_norm": 15.841950814591993, + "learning_rate": 2e-06, + "loss": 0.2895, + "step": 7787 + }, + { + "epoch": 1.806750956965549, + "grad_norm": 19.67162703794575, + "learning_rate": 2e-06, + "loss": 0.2885, + "step": 7788 + }, + { + "epoch": 1.8069829486138498, + "grad_norm": 9.324527872345765, + "learning_rate": 2e-06, + "loss": 0.2164, + "step": 7789 + }, + { + "epoch": 1.8072149402621505, + "grad_norm": 8.803393444656317, + "learning_rate": 2e-06, + "loss": 0.2439, + "step": 7790 + }, + { + "epoch": 1.8074469319104511, + "grad_norm": 10.688359265610206, + "learning_rate": 2e-06, + "loss": 0.2431, + "step": 7791 + }, + { + "epoch": 1.8076789235587518, + "grad_norm": 20.370846970552368, + "learning_rate": 2e-06, + "loss": 0.2405, + "step": 7792 + }, + { + "epoch": 1.8079109152070525, + "grad_norm": 12.314345405676008, + "learning_rate": 2e-06, + "loss": 0.3558, + "step": 7793 + }, + { + "epoch": 1.8081429068553532, + "grad_norm": 9.02896450167309, + "learning_rate": 2e-06, + "loss": 0.2096, + "step": 7794 + }, + { + "epoch": 1.8083748985036538, + "grad_norm": 11.910756638938176, + "learning_rate": 2e-06, + "loss": 0.1928, + "step": 7795 + }, + { + "epoch": 1.8086068901519545, + "grad_norm": 10.854730659767704, + "learning_rate": 2e-06, + "loss": 0.2081, + "step": 7796 + }, + { + "epoch": 1.8088388818002552, + "grad_norm": 14.871704613286662, + "learning_rate": 2e-06, + "loss": 0.2384, + "step": 7797 + }, + { + "epoch": 1.8090708734485559, + "grad_norm": 10.544675807311968, + "learning_rate": 2e-06, + "loss": 0.1492, + "step": 7798 + }, + { + "epoch": 1.8093028650968566, + "grad_norm": 13.806037913485634, + "learning_rate": 2e-06, + "loss": 0.2252, + "step": 7799 + }, + { + "epoch": 1.8095348567451572, + "grad_norm": 11.072548191035832, + "learning_rate": 2e-06, + "loss": 0.2812, + "step": 7800 + }, + { + "epoch": 1.809766848393458, + "grad_norm": 11.267678733025672, + "learning_rate": 2e-06, + "loss": 0.1959, + "step": 7801 + }, + { + "epoch": 1.8099988400417586, + "grad_norm": 18.377656686975637, + "learning_rate": 2e-06, + "loss": 0.241, + "step": 7802 + }, + { + "epoch": 1.8102308316900593, + "grad_norm": 25.95688759321459, + "learning_rate": 2e-06, + "loss": 0.3239, + "step": 7803 + }, + { + "epoch": 1.81046282333836, + "grad_norm": 13.683329106749722, + "learning_rate": 2e-06, + "loss": 0.1345, + "step": 7804 + }, + { + "epoch": 1.8106948149866606, + "grad_norm": 9.097969627130546, + "learning_rate": 2e-06, + "loss": 0.1568, + "step": 7805 + }, + { + "epoch": 1.8109268066349613, + "grad_norm": 20.28212796053784, + "learning_rate": 2e-06, + "loss": 0.2317, + "step": 7806 + }, + { + "epoch": 1.8111587982832618, + "grad_norm": 13.13116391176701, + "learning_rate": 2e-06, + "loss": 0.2239, + "step": 7807 + }, + { + "epoch": 1.8113907899315624, + "grad_norm": 35.367434165456125, + "learning_rate": 2e-06, + "loss": 0.2811, + "step": 7808 + }, + { + "epoch": 1.8116227815798631, + "grad_norm": 13.284596823413086, + "learning_rate": 2e-06, + "loss": 0.218, + "step": 7809 + }, + { + "epoch": 1.8118547732281638, + "grad_norm": 25.05754023794442, + "learning_rate": 2e-06, + "loss": 0.199, + "step": 7810 + }, + { + "epoch": 1.8120867648764645, + "grad_norm": 11.14006570651281, + "learning_rate": 2e-06, + "loss": 0.1748, + "step": 7811 + }, + { + "epoch": 1.8123187565247652, + "grad_norm": 9.07722743677708, + "learning_rate": 2e-06, + "loss": 0.1929, + "step": 7812 + }, + { + "epoch": 1.8125507481730656, + "grad_norm": 22.349518160097013, + "learning_rate": 2e-06, + "loss": 0.2513, + "step": 7813 + }, + { + "epoch": 1.8127827398213663, + "grad_norm": 9.248339748264183, + "learning_rate": 2e-06, + "loss": 0.1483, + "step": 7814 + }, + { + "epoch": 1.813014731469667, + "grad_norm": 11.764457418025861, + "learning_rate": 2e-06, + "loss": 0.2851, + "step": 7815 + }, + { + "epoch": 1.8132467231179676, + "grad_norm": 12.019207291970828, + "learning_rate": 2e-06, + "loss": 0.2914, + "step": 7816 + }, + { + "epoch": 1.8134787147662683, + "grad_norm": 11.780327657611625, + "learning_rate": 2e-06, + "loss": 0.1862, + "step": 7817 + }, + { + "epoch": 1.813710706414569, + "grad_norm": 7.56300305679845, + "learning_rate": 2e-06, + "loss": 0.2186, + "step": 7818 + }, + { + "epoch": 1.8139426980628697, + "grad_norm": 18.84673033730254, + "learning_rate": 2e-06, + "loss": 0.2622, + "step": 7819 + }, + { + "epoch": 1.8141746897111704, + "grad_norm": 4.401220419426188, + "learning_rate": 2e-06, + "loss": 0.1358, + "step": 7820 + }, + { + "epoch": 1.814406681359471, + "grad_norm": 22.284767666525287, + "learning_rate": 2e-06, + "loss": 0.2526, + "step": 7821 + }, + { + "epoch": 1.8146386730077717, + "grad_norm": 18.64348610947482, + "learning_rate": 2e-06, + "loss": 0.2767, + "step": 7822 + }, + { + "epoch": 1.8148706646560724, + "grad_norm": 10.983716767173828, + "learning_rate": 2e-06, + "loss": 0.1831, + "step": 7823 + }, + { + "epoch": 1.815102656304373, + "grad_norm": 12.724066637808379, + "learning_rate": 2e-06, + "loss": 0.1808, + "step": 7824 + }, + { + "epoch": 1.8153346479526737, + "grad_norm": 14.024835862214248, + "learning_rate": 2e-06, + "loss": 0.2691, + "step": 7825 + }, + { + "epoch": 1.8155666396009744, + "grad_norm": 13.519710827991254, + "learning_rate": 2e-06, + "loss": 0.1943, + "step": 7826 + }, + { + "epoch": 1.815798631249275, + "grad_norm": 14.843148579657452, + "learning_rate": 2e-06, + "loss": 0.2213, + "step": 7827 + }, + { + "epoch": 1.8160306228975758, + "grad_norm": 9.460839761780512, + "learning_rate": 2e-06, + "loss": 0.1931, + "step": 7828 + }, + { + "epoch": 1.8162626145458765, + "grad_norm": 38.541938313911785, + "learning_rate": 2e-06, + "loss": 0.3944, + "step": 7829 + }, + { + "epoch": 1.8164946061941771, + "grad_norm": 13.17265386137172, + "learning_rate": 2e-06, + "loss": 0.2638, + "step": 7830 + }, + { + "epoch": 1.8167265978424778, + "grad_norm": 12.84058855474656, + "learning_rate": 2e-06, + "loss": 0.2178, + "step": 7831 + }, + { + "epoch": 1.8169585894907785, + "grad_norm": 10.788615152278911, + "learning_rate": 2e-06, + "loss": 0.221, + "step": 7832 + }, + { + "epoch": 1.817190581139079, + "grad_norm": 19.497970514954385, + "learning_rate": 2e-06, + "loss": 0.1933, + "step": 7833 + }, + { + "epoch": 1.8174225727873796, + "grad_norm": 11.516461157637663, + "learning_rate": 2e-06, + "loss": 0.1594, + "step": 7834 + }, + { + "epoch": 1.8176545644356803, + "grad_norm": 13.070370027025735, + "learning_rate": 2e-06, + "loss": 0.1997, + "step": 7835 + }, + { + "epoch": 1.817886556083981, + "grad_norm": 11.093947259075945, + "learning_rate": 2e-06, + "loss": 0.2507, + "step": 7836 + }, + { + "epoch": 1.8181185477322817, + "grad_norm": 10.952601265349044, + "learning_rate": 2e-06, + "loss": 0.2291, + "step": 7837 + }, + { + "epoch": 1.8183505393805823, + "grad_norm": 12.583982455505959, + "learning_rate": 2e-06, + "loss": 0.2542, + "step": 7838 + }, + { + "epoch": 1.818582531028883, + "grad_norm": 10.412530305980933, + "learning_rate": 2e-06, + "loss": 0.218, + "step": 7839 + }, + { + "epoch": 1.8188145226771835, + "grad_norm": 20.58663323966539, + "learning_rate": 2e-06, + "loss": 0.3298, + "step": 7840 + }, + { + "epoch": 1.8190465143254841, + "grad_norm": 16.069589171499164, + "learning_rate": 2e-06, + "loss": 0.2718, + "step": 7841 + }, + { + "epoch": 1.8192785059737848, + "grad_norm": 15.718424709122079, + "learning_rate": 2e-06, + "loss": 0.2185, + "step": 7842 + }, + { + "epoch": 1.8195104976220855, + "grad_norm": 13.114479648383073, + "learning_rate": 2e-06, + "loss": 0.2185, + "step": 7843 + }, + { + "epoch": 1.8197424892703862, + "grad_norm": 11.817145160415038, + "learning_rate": 2e-06, + "loss": 0.2357, + "step": 7844 + }, + { + "epoch": 1.8199744809186869, + "grad_norm": 8.646679220264772, + "learning_rate": 2e-06, + "loss": 0.1634, + "step": 7845 + }, + { + "epoch": 1.8202064725669875, + "grad_norm": 12.234848247512957, + "learning_rate": 2e-06, + "loss": 0.1666, + "step": 7846 + }, + { + "epoch": 1.8204384642152882, + "grad_norm": 22.072929455242598, + "learning_rate": 2e-06, + "loss": 0.4095, + "step": 7847 + }, + { + "epoch": 1.8206704558635889, + "grad_norm": 10.690070905760146, + "learning_rate": 2e-06, + "loss": 0.1838, + "step": 7848 + }, + { + "epoch": 1.8209024475118896, + "grad_norm": 11.534125053246434, + "learning_rate": 2e-06, + "loss": 0.2829, + "step": 7849 + }, + { + "epoch": 1.8211344391601902, + "grad_norm": 7.3850766019875955, + "learning_rate": 2e-06, + "loss": 0.1526, + "step": 7850 + }, + { + "epoch": 1.821366430808491, + "grad_norm": 10.193700861814932, + "learning_rate": 2e-06, + "loss": 0.1689, + "step": 7851 + }, + { + "epoch": 1.8215984224567916, + "grad_norm": 7.246342613215927, + "learning_rate": 2e-06, + "loss": 0.1663, + "step": 7852 + }, + { + "epoch": 1.8218304141050923, + "grad_norm": 10.911410337288949, + "learning_rate": 2e-06, + "loss": 0.2319, + "step": 7853 + }, + { + "epoch": 1.822062405753393, + "grad_norm": 28.141488972018617, + "learning_rate": 2e-06, + "loss": 0.281, + "step": 7854 + }, + { + "epoch": 1.8222943974016936, + "grad_norm": 15.119773914996467, + "learning_rate": 2e-06, + "loss": 0.2547, + "step": 7855 + }, + { + "epoch": 1.8225263890499943, + "grad_norm": 8.544803647608756, + "learning_rate": 2e-06, + "loss": 0.1434, + "step": 7856 + }, + { + "epoch": 1.822758380698295, + "grad_norm": 13.881430004783747, + "learning_rate": 2e-06, + "loss": 0.3103, + "step": 7857 + }, + { + "epoch": 1.8229903723465957, + "grad_norm": 20.911031864699904, + "learning_rate": 2e-06, + "loss": 0.3389, + "step": 7858 + }, + { + "epoch": 1.8232223639948963, + "grad_norm": 18.0973394831566, + "learning_rate": 2e-06, + "loss": 0.2147, + "step": 7859 + }, + { + "epoch": 1.8234543556431968, + "grad_norm": 8.356805904776557, + "learning_rate": 2e-06, + "loss": 0.1853, + "step": 7860 + }, + { + "epoch": 1.8236863472914975, + "grad_norm": 9.609173826516205, + "learning_rate": 2e-06, + "loss": 0.1511, + "step": 7861 + }, + { + "epoch": 1.8239183389397982, + "grad_norm": 14.17171157061349, + "learning_rate": 2e-06, + "loss": 0.3058, + "step": 7862 + }, + { + "epoch": 1.8241503305880988, + "grad_norm": 8.80752026693199, + "learning_rate": 2e-06, + "loss": 0.2101, + "step": 7863 + }, + { + "epoch": 1.8243823222363995, + "grad_norm": 7.813485615307996, + "learning_rate": 2e-06, + "loss": 0.2748, + "step": 7864 + }, + { + "epoch": 1.8246143138847002, + "grad_norm": 9.32680535990223, + "learning_rate": 2e-06, + "loss": 0.2522, + "step": 7865 + }, + { + "epoch": 1.8248463055330006, + "grad_norm": 9.896185768039878, + "learning_rate": 2e-06, + "loss": 0.1993, + "step": 7866 + }, + { + "epoch": 1.8250782971813013, + "grad_norm": 16.489932650037563, + "learning_rate": 2e-06, + "loss": 0.4062, + "step": 7867 + }, + { + "epoch": 1.825310288829602, + "grad_norm": 12.21555760793375, + "learning_rate": 2e-06, + "loss": 0.2114, + "step": 7868 + }, + { + "epoch": 1.8255422804779027, + "grad_norm": 10.668708934887045, + "learning_rate": 2e-06, + "loss": 0.2042, + "step": 7869 + }, + { + "epoch": 1.8257742721262034, + "grad_norm": 19.506231923470228, + "learning_rate": 2e-06, + "loss": 0.2417, + "step": 7870 + }, + { + "epoch": 1.826006263774504, + "grad_norm": 15.243980399678323, + "learning_rate": 2e-06, + "loss": 0.3222, + "step": 7871 + }, + { + "epoch": 1.8262382554228047, + "grad_norm": 8.266212958892414, + "learning_rate": 2e-06, + "loss": 0.1827, + "step": 7872 + }, + { + "epoch": 1.8264702470711054, + "grad_norm": 9.228484859678517, + "learning_rate": 2e-06, + "loss": 0.1577, + "step": 7873 + }, + { + "epoch": 1.826702238719406, + "grad_norm": 15.119166628188635, + "learning_rate": 2e-06, + "loss": 0.1612, + "step": 7874 + }, + { + "epoch": 1.8269342303677067, + "grad_norm": 7.614144747643755, + "learning_rate": 2e-06, + "loss": 0.1429, + "step": 7875 + }, + { + "epoch": 1.8271662220160074, + "grad_norm": 13.10715413771453, + "learning_rate": 2e-06, + "loss": 0.2377, + "step": 7876 + }, + { + "epoch": 1.827398213664308, + "grad_norm": 17.644967445732938, + "learning_rate": 2e-06, + "loss": 0.2515, + "step": 7877 + }, + { + "epoch": 1.8276302053126088, + "grad_norm": 19.092950755612446, + "learning_rate": 2e-06, + "loss": 0.3464, + "step": 7878 + }, + { + "epoch": 1.8278621969609095, + "grad_norm": 12.387344155425442, + "learning_rate": 2e-06, + "loss": 0.3138, + "step": 7879 + }, + { + "epoch": 1.8280941886092101, + "grad_norm": 14.011979558604576, + "learning_rate": 2e-06, + "loss": 0.2179, + "step": 7880 + }, + { + "epoch": 1.8283261802575108, + "grad_norm": 15.390569673859822, + "learning_rate": 2e-06, + "loss": 0.2641, + "step": 7881 + }, + { + "epoch": 1.8285581719058115, + "grad_norm": 14.684780900009232, + "learning_rate": 2e-06, + "loss": 0.1935, + "step": 7882 + }, + { + "epoch": 1.8287901635541122, + "grad_norm": 11.430777807918494, + "learning_rate": 2e-06, + "loss": 0.18, + "step": 7883 + }, + { + "epoch": 1.8290221552024128, + "grad_norm": 7.791418141941399, + "learning_rate": 2e-06, + "loss": 0.1834, + "step": 7884 + }, + { + "epoch": 1.8292541468507135, + "grad_norm": 8.64636567131907, + "learning_rate": 2e-06, + "loss": 0.1954, + "step": 7885 + }, + { + "epoch": 1.829486138499014, + "grad_norm": 15.346495917999354, + "learning_rate": 2e-06, + "loss": 0.2054, + "step": 7886 + }, + { + "epoch": 1.8297181301473147, + "grad_norm": 12.296776262277993, + "learning_rate": 2e-06, + "loss": 0.1336, + "step": 7887 + }, + { + "epoch": 1.8299501217956153, + "grad_norm": 19.494222119023746, + "learning_rate": 2e-06, + "loss": 0.2837, + "step": 7888 + }, + { + "epoch": 1.830182113443916, + "grad_norm": 13.25582425439502, + "learning_rate": 2e-06, + "loss": 0.3047, + "step": 7889 + }, + { + "epoch": 1.8304141050922167, + "grad_norm": 9.671077020296584, + "learning_rate": 2e-06, + "loss": 0.1777, + "step": 7890 + }, + { + "epoch": 1.8306460967405174, + "grad_norm": 10.511830121222385, + "learning_rate": 2e-06, + "loss": 0.1911, + "step": 7891 + }, + { + "epoch": 1.830878088388818, + "grad_norm": 12.49834553629901, + "learning_rate": 2e-06, + "loss": 0.2534, + "step": 7892 + }, + { + "epoch": 1.8311100800371185, + "grad_norm": 17.583887619416416, + "learning_rate": 2e-06, + "loss": 0.2884, + "step": 7893 + }, + { + "epoch": 1.8313420716854192, + "grad_norm": 13.841502274963958, + "learning_rate": 2e-06, + "loss": 0.1891, + "step": 7894 + }, + { + "epoch": 1.8315740633337199, + "grad_norm": 12.67870442340447, + "learning_rate": 2e-06, + "loss": 0.2628, + "step": 7895 + }, + { + "epoch": 1.8318060549820205, + "grad_norm": 16.384562089918386, + "learning_rate": 2e-06, + "loss": 0.2482, + "step": 7896 + }, + { + "epoch": 1.8320380466303212, + "grad_norm": 16.046864886905126, + "learning_rate": 2e-06, + "loss": 0.2183, + "step": 7897 + }, + { + "epoch": 1.832270038278622, + "grad_norm": 11.993529864813677, + "learning_rate": 2e-06, + "loss": 0.2744, + "step": 7898 + }, + { + "epoch": 1.8325020299269226, + "grad_norm": 16.457508380663167, + "learning_rate": 2e-06, + "loss": 0.3141, + "step": 7899 + }, + { + "epoch": 1.8327340215752232, + "grad_norm": 11.896321643614053, + "learning_rate": 2e-06, + "loss": 0.1972, + "step": 7900 + }, + { + "epoch": 1.832966013223524, + "grad_norm": 10.634033931621662, + "learning_rate": 2e-06, + "loss": 0.2399, + "step": 7901 + }, + { + "epoch": 1.8331980048718246, + "grad_norm": 17.425657647560143, + "learning_rate": 2e-06, + "loss": 0.305, + "step": 7902 + }, + { + "epoch": 1.8334299965201253, + "grad_norm": 7.202441697727795, + "learning_rate": 2e-06, + "loss": 0.1521, + "step": 7903 + }, + { + "epoch": 1.833661988168426, + "grad_norm": 17.19205030465811, + "learning_rate": 2e-06, + "loss": 0.2845, + "step": 7904 + }, + { + "epoch": 1.8338939798167266, + "grad_norm": 19.90010493299395, + "learning_rate": 2e-06, + "loss": 0.2641, + "step": 7905 + }, + { + "epoch": 1.8341259714650273, + "grad_norm": 19.52854387682745, + "learning_rate": 2e-06, + "loss": 0.3048, + "step": 7906 + }, + { + "epoch": 1.834357963113328, + "grad_norm": 10.902417230263483, + "learning_rate": 2e-06, + "loss": 0.2184, + "step": 7907 + }, + { + "epoch": 1.8345899547616287, + "grad_norm": 17.362599545004098, + "learning_rate": 2e-06, + "loss": 0.2387, + "step": 7908 + }, + { + "epoch": 1.8348219464099293, + "grad_norm": 16.18254568800934, + "learning_rate": 2e-06, + "loss": 0.2198, + "step": 7909 + }, + { + "epoch": 1.83505393805823, + "grad_norm": 27.008496402497844, + "learning_rate": 2e-06, + "loss": 0.5212, + "step": 7910 + }, + { + "epoch": 1.8352859297065307, + "grad_norm": 23.159668358951915, + "learning_rate": 2e-06, + "loss": 0.3255, + "step": 7911 + }, + { + "epoch": 1.8355179213548314, + "grad_norm": 21.345873761286768, + "learning_rate": 2e-06, + "loss": 0.2788, + "step": 7912 + }, + { + "epoch": 1.8357499130031318, + "grad_norm": 16.364693029156975, + "learning_rate": 2e-06, + "loss": 0.2494, + "step": 7913 + }, + { + "epoch": 1.8359819046514325, + "grad_norm": 21.265562874143434, + "learning_rate": 2e-06, + "loss": 0.2334, + "step": 7914 + }, + { + "epoch": 1.8362138962997332, + "grad_norm": 8.42226495039441, + "learning_rate": 2e-06, + "loss": 0.2178, + "step": 7915 + }, + { + "epoch": 1.8364458879480339, + "grad_norm": 14.120536069491699, + "learning_rate": 2e-06, + "loss": 0.2106, + "step": 7916 + }, + { + "epoch": 1.8366778795963346, + "grad_norm": 16.753426930658062, + "learning_rate": 2e-06, + "loss": 0.2631, + "step": 7917 + }, + { + "epoch": 1.8369098712446352, + "grad_norm": 9.596391419187736, + "learning_rate": 2e-06, + "loss": 0.1716, + "step": 7918 + }, + { + "epoch": 1.837141862892936, + "grad_norm": 14.833436969950283, + "learning_rate": 2e-06, + "loss": 0.2554, + "step": 7919 + }, + { + "epoch": 1.8373738545412364, + "grad_norm": 6.294776122050837, + "learning_rate": 2e-06, + "loss": 0.1314, + "step": 7920 + }, + { + "epoch": 1.837605846189537, + "grad_norm": 8.58136538816046, + "learning_rate": 2e-06, + "loss": 0.1775, + "step": 7921 + }, + { + "epoch": 1.8378378378378377, + "grad_norm": 6.839034671602942, + "learning_rate": 2e-06, + "loss": 0.147, + "step": 7922 + }, + { + "epoch": 1.8380698294861384, + "grad_norm": 11.75226477118561, + "learning_rate": 2e-06, + "loss": 0.2727, + "step": 7923 + }, + { + "epoch": 1.838301821134439, + "grad_norm": 16.951382202621378, + "learning_rate": 2e-06, + "loss": 0.387, + "step": 7924 + }, + { + "epoch": 1.8385338127827398, + "grad_norm": 9.669318688045863, + "learning_rate": 2e-06, + "loss": 0.2122, + "step": 7925 + }, + { + "epoch": 1.8387658044310404, + "grad_norm": 10.658857823286702, + "learning_rate": 2e-06, + "loss": 0.1693, + "step": 7926 + }, + { + "epoch": 1.838997796079341, + "grad_norm": 11.164269201204279, + "learning_rate": 2e-06, + "loss": 0.2675, + "step": 7927 + }, + { + "epoch": 1.8392297877276418, + "grad_norm": 11.292803059157091, + "learning_rate": 2e-06, + "loss": 0.3351, + "step": 7928 + }, + { + "epoch": 1.8394617793759425, + "grad_norm": 13.662105944550406, + "learning_rate": 2e-06, + "loss": 0.2295, + "step": 7929 + }, + { + "epoch": 1.8396937710242431, + "grad_norm": 9.984871014211448, + "learning_rate": 2e-06, + "loss": 0.1737, + "step": 7930 + }, + { + "epoch": 1.8399257626725438, + "grad_norm": 15.528006175833218, + "learning_rate": 2e-06, + "loss": 0.2634, + "step": 7931 + }, + { + "epoch": 1.8401577543208445, + "grad_norm": 11.802930981454681, + "learning_rate": 2e-06, + "loss": 0.3148, + "step": 7932 + }, + { + "epoch": 1.8403897459691452, + "grad_norm": 25.823960323458206, + "learning_rate": 2e-06, + "loss": 0.3875, + "step": 7933 + }, + { + "epoch": 1.8406217376174459, + "grad_norm": 5.4731564900955645, + "learning_rate": 2e-06, + "loss": 0.1652, + "step": 7934 + }, + { + "epoch": 1.8408537292657465, + "grad_norm": 12.465598680342927, + "learning_rate": 2e-06, + "loss": 0.2969, + "step": 7935 + }, + { + "epoch": 1.8410857209140472, + "grad_norm": 10.890761811575437, + "learning_rate": 2e-06, + "loss": 0.1851, + "step": 7936 + }, + { + "epoch": 1.8413177125623479, + "grad_norm": 11.075531771245522, + "learning_rate": 2e-06, + "loss": 0.19, + "step": 7937 + }, + { + "epoch": 1.8415497042106486, + "grad_norm": 10.76191718337455, + "learning_rate": 2e-06, + "loss": 0.2794, + "step": 7938 + }, + { + "epoch": 1.8417816958589492, + "grad_norm": 13.447884654447204, + "learning_rate": 2e-06, + "loss": 0.2633, + "step": 7939 + }, + { + "epoch": 1.8420136875072497, + "grad_norm": 8.35862362187622, + "learning_rate": 2e-06, + "loss": 0.2139, + "step": 7940 + }, + { + "epoch": 1.8422456791555504, + "grad_norm": 11.615926552765899, + "learning_rate": 2e-06, + "loss": 0.1836, + "step": 7941 + }, + { + "epoch": 1.842477670803851, + "grad_norm": 16.797906163493213, + "learning_rate": 2e-06, + "loss": 0.2207, + "step": 7942 + }, + { + "epoch": 1.8427096624521517, + "grad_norm": 11.526737998281494, + "learning_rate": 2e-06, + "loss": 0.2684, + "step": 7943 + }, + { + "epoch": 1.8429416541004524, + "grad_norm": 14.813675655160232, + "learning_rate": 2e-06, + "loss": 0.2797, + "step": 7944 + }, + { + "epoch": 1.843173645748753, + "grad_norm": 16.873756448801746, + "learning_rate": 2e-06, + "loss": 0.1525, + "step": 7945 + }, + { + "epoch": 1.8434056373970535, + "grad_norm": 8.490637239098254, + "learning_rate": 2e-06, + "loss": 0.1979, + "step": 7946 + }, + { + "epoch": 1.8436376290453542, + "grad_norm": 8.188933475422282, + "learning_rate": 2e-06, + "loss": 0.1951, + "step": 7947 + }, + { + "epoch": 1.843869620693655, + "grad_norm": 12.989538195546992, + "learning_rate": 2e-06, + "loss": 0.2963, + "step": 7948 + }, + { + "epoch": 1.8441016123419556, + "grad_norm": 25.921803203614267, + "learning_rate": 2e-06, + "loss": 0.2511, + "step": 7949 + }, + { + "epoch": 1.8443336039902563, + "grad_norm": 13.38270787112594, + "learning_rate": 2e-06, + "loss": 0.278, + "step": 7950 + }, + { + "epoch": 1.844565595638557, + "grad_norm": 15.901728624066433, + "learning_rate": 2e-06, + "loss": 0.2949, + "step": 7951 + }, + { + "epoch": 1.8447975872868576, + "grad_norm": 30.953565323139976, + "learning_rate": 2e-06, + "loss": 0.3273, + "step": 7952 + }, + { + "epoch": 1.8450295789351583, + "grad_norm": 12.810216707183498, + "learning_rate": 2e-06, + "loss": 0.1859, + "step": 7953 + }, + { + "epoch": 1.845261570583459, + "grad_norm": 11.592796853621131, + "learning_rate": 2e-06, + "loss": 0.1906, + "step": 7954 + }, + { + "epoch": 1.8454935622317596, + "grad_norm": 8.935925628117879, + "learning_rate": 2e-06, + "loss": 0.2226, + "step": 7955 + }, + { + "epoch": 1.8457255538800603, + "grad_norm": 9.500849218443497, + "learning_rate": 2e-06, + "loss": 0.215, + "step": 7956 + }, + { + "epoch": 1.845957545528361, + "grad_norm": 10.944130618833324, + "learning_rate": 2e-06, + "loss": 0.2248, + "step": 7957 + }, + { + "epoch": 1.8461895371766617, + "grad_norm": 9.18069076395506, + "learning_rate": 2e-06, + "loss": 0.1856, + "step": 7958 + }, + { + "epoch": 1.8464215288249624, + "grad_norm": 9.685371708277902, + "learning_rate": 2e-06, + "loss": 0.1926, + "step": 7959 + }, + { + "epoch": 1.846653520473263, + "grad_norm": 26.997531508708946, + "learning_rate": 2e-06, + "loss": 0.3127, + "step": 7960 + }, + { + "epoch": 1.8468855121215637, + "grad_norm": 12.585809388667009, + "learning_rate": 2e-06, + "loss": 0.1861, + "step": 7961 + }, + { + "epoch": 1.8471175037698644, + "grad_norm": 11.3527907629372, + "learning_rate": 2e-06, + "loss": 0.2552, + "step": 7962 + }, + { + "epoch": 1.847349495418165, + "grad_norm": 23.095925633732367, + "learning_rate": 2e-06, + "loss": 0.3313, + "step": 7963 + }, + { + "epoch": 1.8475814870664657, + "grad_norm": 13.084226012680856, + "learning_rate": 2e-06, + "loss": 0.2949, + "step": 7964 + }, + { + "epoch": 1.8478134787147664, + "grad_norm": 18.085654292620408, + "learning_rate": 2e-06, + "loss": 0.3053, + "step": 7965 + }, + { + "epoch": 1.8480454703630669, + "grad_norm": 14.712791411772923, + "learning_rate": 2e-06, + "loss": 0.3434, + "step": 7966 + }, + { + "epoch": 1.8482774620113676, + "grad_norm": 8.334839560631009, + "learning_rate": 2e-06, + "loss": 0.1873, + "step": 7967 + }, + { + "epoch": 1.8485094536596682, + "grad_norm": 10.518803209871267, + "learning_rate": 2e-06, + "loss": 0.3561, + "step": 7968 + }, + { + "epoch": 1.848741445307969, + "grad_norm": 22.56750349666928, + "learning_rate": 2e-06, + "loss": 0.286, + "step": 7969 + }, + { + "epoch": 1.8489734369562696, + "grad_norm": 18.491907036686882, + "learning_rate": 2e-06, + "loss": 0.321, + "step": 7970 + }, + { + "epoch": 1.8492054286045703, + "grad_norm": 10.100982824992236, + "learning_rate": 2e-06, + "loss": 0.1683, + "step": 7971 + }, + { + "epoch": 1.849437420252871, + "grad_norm": 15.074645252715337, + "learning_rate": 2e-06, + "loss": 0.2232, + "step": 7972 + }, + { + "epoch": 1.8496694119011714, + "grad_norm": 14.046821339662545, + "learning_rate": 2e-06, + "loss": 0.2664, + "step": 7973 + }, + { + "epoch": 1.849901403549472, + "grad_norm": 13.59824054448896, + "learning_rate": 2e-06, + "loss": 0.3291, + "step": 7974 + }, + { + "epoch": 1.8501333951977728, + "grad_norm": 13.790300671692236, + "learning_rate": 2e-06, + "loss": 0.2642, + "step": 7975 + }, + { + "epoch": 1.8503653868460734, + "grad_norm": 10.412428158867494, + "learning_rate": 2e-06, + "loss": 0.1941, + "step": 7976 + }, + { + "epoch": 1.8505973784943741, + "grad_norm": 9.241196274154442, + "learning_rate": 2e-06, + "loss": 0.2431, + "step": 7977 + }, + { + "epoch": 1.8508293701426748, + "grad_norm": 18.006977742518487, + "learning_rate": 2e-06, + "loss": 0.3314, + "step": 7978 + }, + { + "epoch": 1.8510613617909755, + "grad_norm": 8.735578702425071, + "learning_rate": 2e-06, + "loss": 0.1915, + "step": 7979 + }, + { + "epoch": 1.8512933534392761, + "grad_norm": 11.066825745768144, + "learning_rate": 2e-06, + "loss": 0.3201, + "step": 7980 + }, + { + "epoch": 1.8515253450875768, + "grad_norm": 12.934799799849172, + "learning_rate": 2e-06, + "loss": 0.2668, + "step": 7981 + }, + { + "epoch": 1.8517573367358775, + "grad_norm": 7.652593730052824, + "learning_rate": 2e-06, + "loss": 0.2211, + "step": 7982 + }, + { + "epoch": 1.8519893283841782, + "grad_norm": 17.360645095568245, + "learning_rate": 2e-06, + "loss": 0.2471, + "step": 7983 + }, + { + "epoch": 1.8522213200324789, + "grad_norm": 9.342320938908026, + "learning_rate": 2e-06, + "loss": 0.2092, + "step": 7984 + }, + { + "epoch": 1.8524533116807795, + "grad_norm": 28.26538822948439, + "learning_rate": 2e-06, + "loss": 0.3829, + "step": 7985 + }, + { + "epoch": 1.8526853033290802, + "grad_norm": 12.575102199523327, + "learning_rate": 2e-06, + "loss": 0.2662, + "step": 7986 + }, + { + "epoch": 1.852917294977381, + "grad_norm": 12.649445490250741, + "learning_rate": 2e-06, + "loss": 0.2228, + "step": 7987 + }, + { + "epoch": 1.8531492866256816, + "grad_norm": 10.65686234012664, + "learning_rate": 2e-06, + "loss": 0.2862, + "step": 7988 + }, + { + "epoch": 1.8533812782739822, + "grad_norm": 13.377746222563427, + "learning_rate": 2e-06, + "loss": 0.2825, + "step": 7989 + }, + { + "epoch": 1.853613269922283, + "grad_norm": 8.499970384087424, + "learning_rate": 2e-06, + "loss": 0.2493, + "step": 7990 + }, + { + "epoch": 1.8538452615705836, + "grad_norm": 8.641111636956055, + "learning_rate": 2e-06, + "loss": 0.1472, + "step": 7991 + }, + { + "epoch": 1.8540772532188843, + "grad_norm": 18.614890935703087, + "learning_rate": 2e-06, + "loss": 0.2897, + "step": 7992 + }, + { + "epoch": 1.8543092448671847, + "grad_norm": 9.830085780138903, + "learning_rate": 2e-06, + "loss": 0.2636, + "step": 7993 + }, + { + "epoch": 1.8545412365154854, + "grad_norm": 7.283981619222619, + "learning_rate": 2e-06, + "loss": 0.2193, + "step": 7994 + }, + { + "epoch": 1.854773228163786, + "grad_norm": 16.965974276924957, + "learning_rate": 2e-06, + "loss": 0.2347, + "step": 7995 + }, + { + "epoch": 1.8550052198120868, + "grad_norm": 11.141632499056673, + "learning_rate": 2e-06, + "loss": 0.2715, + "step": 7996 + }, + { + "epoch": 1.8552372114603874, + "grad_norm": 13.656532823530375, + "learning_rate": 2e-06, + "loss": 0.2832, + "step": 7997 + }, + { + "epoch": 1.8554692031086881, + "grad_norm": 8.84745272325778, + "learning_rate": 2e-06, + "loss": 0.2077, + "step": 7998 + }, + { + "epoch": 1.8557011947569888, + "grad_norm": 9.806319010310668, + "learning_rate": 2e-06, + "loss": 0.2299, + "step": 7999 + }, + { + "epoch": 1.8559331864052893, + "grad_norm": 8.63698914343859, + "learning_rate": 2e-06, + "loss": 0.2444, + "step": 8000 + }, + { + "epoch": 1.85616517805359, + "grad_norm": 19.746684480722426, + "learning_rate": 2e-06, + "loss": 0.3618, + "step": 8001 + }, + { + "epoch": 1.8563971697018906, + "grad_norm": 10.050691436850114, + "learning_rate": 2e-06, + "loss": 0.2331, + "step": 8002 + }, + { + "epoch": 1.8566291613501913, + "grad_norm": 23.133285568263815, + "learning_rate": 2e-06, + "loss": 0.2847, + "step": 8003 + }, + { + "epoch": 1.856861152998492, + "grad_norm": 9.853651040793197, + "learning_rate": 2e-06, + "loss": 0.1745, + "step": 8004 + }, + { + "epoch": 1.8570931446467926, + "grad_norm": 6.07637328157186, + "learning_rate": 2e-06, + "loss": 0.1538, + "step": 8005 + }, + { + "epoch": 1.8573251362950933, + "grad_norm": 10.665413106483905, + "learning_rate": 2e-06, + "loss": 0.303, + "step": 8006 + }, + { + "epoch": 1.857557127943394, + "grad_norm": 5.973799154259255, + "learning_rate": 2e-06, + "loss": 0.1648, + "step": 8007 + }, + { + "epoch": 1.8577891195916947, + "grad_norm": 11.260741284363519, + "learning_rate": 2e-06, + "loss": 0.2539, + "step": 8008 + }, + { + "epoch": 1.8580211112399954, + "grad_norm": 12.612140361854662, + "learning_rate": 2e-06, + "loss": 0.1837, + "step": 8009 + }, + { + "epoch": 1.858253102888296, + "grad_norm": 6.64501874593489, + "learning_rate": 2e-06, + "loss": 0.127, + "step": 8010 + }, + { + "epoch": 1.8584850945365967, + "grad_norm": 12.641913011016891, + "learning_rate": 2e-06, + "loss": 0.2472, + "step": 8011 + }, + { + "epoch": 1.8587170861848974, + "grad_norm": 16.01004189377708, + "learning_rate": 2e-06, + "loss": 0.2818, + "step": 8012 + }, + { + "epoch": 1.858949077833198, + "grad_norm": 17.650523284827663, + "learning_rate": 2e-06, + "loss": 0.3115, + "step": 8013 + }, + { + "epoch": 1.8591810694814987, + "grad_norm": 67.9849712980046, + "learning_rate": 2e-06, + "loss": 0.2104, + "step": 8014 + }, + { + "epoch": 1.8594130611297994, + "grad_norm": 18.553952981423404, + "learning_rate": 2e-06, + "loss": 0.3201, + "step": 8015 + }, + { + "epoch": 1.8596450527781, + "grad_norm": 19.8359578410062, + "learning_rate": 2e-06, + "loss": 0.2287, + "step": 8016 + }, + { + "epoch": 1.8598770444264008, + "grad_norm": 13.506166327338736, + "learning_rate": 2e-06, + "loss": 0.2029, + "step": 8017 + }, + { + "epoch": 1.8601090360747015, + "grad_norm": 13.173351169007882, + "learning_rate": 2e-06, + "loss": 0.1775, + "step": 8018 + }, + { + "epoch": 1.860341027723002, + "grad_norm": 16.9773971290991, + "learning_rate": 2e-06, + "loss": 0.2437, + "step": 8019 + }, + { + "epoch": 1.8605730193713026, + "grad_norm": 11.166665045313033, + "learning_rate": 2e-06, + "loss": 0.2486, + "step": 8020 + }, + { + "epoch": 1.8608050110196033, + "grad_norm": 17.771633832862648, + "learning_rate": 2e-06, + "loss": 0.3317, + "step": 8021 + }, + { + "epoch": 1.861037002667904, + "grad_norm": 13.650262107790912, + "learning_rate": 2e-06, + "loss": 0.2381, + "step": 8022 + }, + { + "epoch": 1.8612689943162046, + "grad_norm": 14.93861953518762, + "learning_rate": 2e-06, + "loss": 0.2917, + "step": 8023 + }, + { + "epoch": 1.8615009859645053, + "grad_norm": 50.26662400292754, + "learning_rate": 2e-06, + "loss": 0.1945, + "step": 8024 + }, + { + "epoch": 1.861732977612806, + "grad_norm": 11.603664768722052, + "learning_rate": 2e-06, + "loss": 0.3147, + "step": 8025 + }, + { + "epoch": 1.8619649692611064, + "grad_norm": 12.386479170427224, + "learning_rate": 2e-06, + "loss": 0.201, + "step": 8026 + }, + { + "epoch": 1.8621969609094071, + "grad_norm": 20.00899319309306, + "learning_rate": 2e-06, + "loss": 0.2685, + "step": 8027 + }, + { + "epoch": 1.8624289525577078, + "grad_norm": 14.307786611545218, + "learning_rate": 2e-06, + "loss": 0.3113, + "step": 8028 + }, + { + "epoch": 1.8626609442060085, + "grad_norm": 10.419544932315354, + "learning_rate": 2e-06, + "loss": 0.2153, + "step": 8029 + }, + { + "epoch": 1.8628929358543092, + "grad_norm": 14.944128238574825, + "learning_rate": 2e-06, + "loss": 0.2304, + "step": 8030 + }, + { + "epoch": 1.8631249275026098, + "grad_norm": 14.828493347874334, + "learning_rate": 2e-06, + "loss": 0.2339, + "step": 8031 + }, + { + "epoch": 1.8633569191509105, + "grad_norm": 8.43164667293264, + "learning_rate": 2e-06, + "loss": 0.1578, + "step": 8032 + }, + { + "epoch": 1.8635889107992112, + "grad_norm": 8.225180499158036, + "learning_rate": 2e-06, + "loss": 0.2174, + "step": 8033 + }, + { + "epoch": 1.8638209024475119, + "grad_norm": 8.714004111926837, + "learning_rate": 2e-06, + "loss": 0.1848, + "step": 8034 + }, + { + "epoch": 1.8640528940958125, + "grad_norm": 7.209489143883522, + "learning_rate": 2e-06, + "loss": 0.1175, + "step": 8035 + }, + { + "epoch": 1.8642848857441132, + "grad_norm": 19.47233075136568, + "learning_rate": 2e-06, + "loss": 0.3315, + "step": 8036 + }, + { + "epoch": 1.864516877392414, + "grad_norm": 11.761565148017262, + "learning_rate": 2e-06, + "loss": 0.2358, + "step": 8037 + }, + { + "epoch": 1.8647488690407146, + "grad_norm": 22.946883432516643, + "learning_rate": 2e-06, + "loss": 0.2992, + "step": 8038 + }, + { + "epoch": 1.8649808606890153, + "grad_norm": 10.099079023837012, + "learning_rate": 2e-06, + "loss": 0.2294, + "step": 8039 + }, + { + "epoch": 1.865212852337316, + "grad_norm": 11.982340199964142, + "learning_rate": 2e-06, + "loss": 0.225, + "step": 8040 + }, + { + "epoch": 1.8654448439856166, + "grad_norm": 7.414469724974377, + "learning_rate": 2e-06, + "loss": 0.1843, + "step": 8041 + }, + { + "epoch": 1.8656768356339173, + "grad_norm": 14.110442842535285, + "learning_rate": 2e-06, + "loss": 0.3108, + "step": 8042 + }, + { + "epoch": 1.865908827282218, + "grad_norm": 8.306951984448888, + "learning_rate": 2e-06, + "loss": 0.222, + "step": 8043 + }, + { + "epoch": 1.8661408189305186, + "grad_norm": 8.020722441848871, + "learning_rate": 2e-06, + "loss": 0.2296, + "step": 8044 + }, + { + "epoch": 1.8663728105788193, + "grad_norm": 14.329130788383804, + "learning_rate": 2e-06, + "loss": 0.3023, + "step": 8045 + }, + { + "epoch": 1.8666048022271198, + "grad_norm": 12.243991467568708, + "learning_rate": 2e-06, + "loss": 0.2126, + "step": 8046 + }, + { + "epoch": 1.8668367938754205, + "grad_norm": 11.49317180930521, + "learning_rate": 2e-06, + "loss": 0.1405, + "step": 8047 + }, + { + "epoch": 1.8670687855237211, + "grad_norm": 12.517361716203531, + "learning_rate": 2e-06, + "loss": 0.2399, + "step": 8048 + }, + { + "epoch": 1.8673007771720218, + "grad_norm": 14.413158198703666, + "learning_rate": 2e-06, + "loss": 0.2407, + "step": 8049 + }, + { + "epoch": 1.8675327688203225, + "grad_norm": 20.62454734785914, + "learning_rate": 2e-06, + "loss": 0.224, + "step": 8050 + }, + { + "epoch": 1.8677647604686232, + "grad_norm": 11.221516745189199, + "learning_rate": 2e-06, + "loss": 0.2663, + "step": 8051 + }, + { + "epoch": 1.8679967521169238, + "grad_norm": 14.150459004169447, + "learning_rate": 2e-06, + "loss": 0.2292, + "step": 8052 + }, + { + "epoch": 1.8682287437652243, + "grad_norm": 16.522724664226192, + "learning_rate": 2e-06, + "loss": 0.2884, + "step": 8053 + }, + { + "epoch": 1.868460735413525, + "grad_norm": 11.491441717510634, + "learning_rate": 2e-06, + "loss": 0.2876, + "step": 8054 + }, + { + "epoch": 1.8686927270618257, + "grad_norm": 19.27225227412006, + "learning_rate": 2e-06, + "loss": 0.2874, + "step": 8055 + }, + { + "epoch": 1.8689247187101263, + "grad_norm": 7.696650394731279, + "learning_rate": 2e-06, + "loss": 0.1884, + "step": 8056 + }, + { + "epoch": 1.869156710358427, + "grad_norm": 13.935186097930975, + "learning_rate": 2e-06, + "loss": 0.2719, + "step": 8057 + }, + { + "epoch": 1.8693887020067277, + "grad_norm": 7.901974875172806, + "learning_rate": 2e-06, + "loss": 0.2239, + "step": 8058 + }, + { + "epoch": 1.8696206936550284, + "grad_norm": 14.279966452089885, + "learning_rate": 2e-06, + "loss": 0.309, + "step": 8059 + }, + { + "epoch": 1.869852685303329, + "grad_norm": 21.199725575895616, + "learning_rate": 2e-06, + "loss": 0.3386, + "step": 8060 + }, + { + "epoch": 1.8700846769516297, + "grad_norm": 9.775247696139246, + "learning_rate": 2e-06, + "loss": 0.2333, + "step": 8061 + }, + { + "epoch": 1.8703166685999304, + "grad_norm": 17.735314243096184, + "learning_rate": 2e-06, + "loss": 0.274, + "step": 8062 + }, + { + "epoch": 1.870548660248231, + "grad_norm": 9.118166555172795, + "learning_rate": 2e-06, + "loss": 0.2709, + "step": 8063 + }, + { + "epoch": 1.8707806518965318, + "grad_norm": 10.560664539208704, + "learning_rate": 2e-06, + "loss": 0.1477, + "step": 8064 + }, + { + "epoch": 1.8710126435448324, + "grad_norm": 21.11132835177889, + "learning_rate": 2e-06, + "loss": 0.2022, + "step": 8065 + }, + { + "epoch": 1.871244635193133, + "grad_norm": 12.965268707465519, + "learning_rate": 2e-06, + "loss": 0.2166, + "step": 8066 + }, + { + "epoch": 1.8714766268414338, + "grad_norm": 16.22190776075008, + "learning_rate": 2e-06, + "loss": 0.3488, + "step": 8067 + }, + { + "epoch": 1.8717086184897345, + "grad_norm": 18.535484259025015, + "learning_rate": 2e-06, + "loss": 0.2677, + "step": 8068 + }, + { + "epoch": 1.8719406101380351, + "grad_norm": 12.973138242724986, + "learning_rate": 2e-06, + "loss": 0.2302, + "step": 8069 + }, + { + "epoch": 1.8721726017863358, + "grad_norm": 17.321787407067543, + "learning_rate": 2e-06, + "loss": 0.424, + "step": 8070 + }, + { + "epoch": 1.8724045934346365, + "grad_norm": 8.641817252315901, + "learning_rate": 2e-06, + "loss": 0.2172, + "step": 8071 + }, + { + "epoch": 1.8726365850829372, + "grad_norm": 13.667276124603173, + "learning_rate": 2e-06, + "loss": 0.1638, + "step": 8072 + }, + { + "epoch": 1.8728685767312376, + "grad_norm": 12.34392074525113, + "learning_rate": 2e-06, + "loss": 0.1892, + "step": 8073 + }, + { + "epoch": 1.8731005683795383, + "grad_norm": 13.175216729247062, + "learning_rate": 2e-06, + "loss": 0.2071, + "step": 8074 + }, + { + "epoch": 1.873332560027839, + "grad_norm": 10.656296949914095, + "learning_rate": 2e-06, + "loss": 0.2188, + "step": 8075 + }, + { + "epoch": 1.8735645516761397, + "grad_norm": 21.55205836915179, + "learning_rate": 2e-06, + "loss": 0.2832, + "step": 8076 + }, + { + "epoch": 1.8737965433244403, + "grad_norm": 14.915656126608042, + "learning_rate": 2e-06, + "loss": 0.2965, + "step": 8077 + }, + { + "epoch": 1.874028534972741, + "grad_norm": 15.797128126647014, + "learning_rate": 2e-06, + "loss": 0.1939, + "step": 8078 + }, + { + "epoch": 1.8742605266210415, + "grad_norm": 15.63902295953352, + "learning_rate": 2e-06, + "loss": 0.2841, + "step": 8079 + }, + { + "epoch": 1.8744925182693422, + "grad_norm": 7.098117949851355, + "learning_rate": 2e-06, + "loss": 0.1646, + "step": 8080 + }, + { + "epoch": 1.8747245099176428, + "grad_norm": 16.878481122030315, + "learning_rate": 2e-06, + "loss": 0.212, + "step": 8081 + }, + { + "epoch": 1.8749565015659435, + "grad_norm": 14.102839925698367, + "learning_rate": 2e-06, + "loss": 0.259, + "step": 8082 + }, + { + "epoch": 1.8751884932142442, + "grad_norm": 8.015270991643696, + "learning_rate": 2e-06, + "loss": 0.2123, + "step": 8083 + }, + { + "epoch": 1.8754204848625449, + "grad_norm": 10.75137113578418, + "learning_rate": 2e-06, + "loss": 0.1887, + "step": 8084 + }, + { + "epoch": 1.8756524765108455, + "grad_norm": 12.192149049115477, + "learning_rate": 2e-06, + "loss": 0.2144, + "step": 8085 + }, + { + "epoch": 1.8758844681591462, + "grad_norm": 16.200978015518178, + "learning_rate": 2e-06, + "loss": 0.1866, + "step": 8086 + }, + { + "epoch": 1.876116459807447, + "grad_norm": 13.604564928941926, + "learning_rate": 2e-06, + "loss": 0.2224, + "step": 8087 + }, + { + "epoch": 1.8763484514557476, + "grad_norm": 18.27121505748652, + "learning_rate": 2e-06, + "loss": 0.3974, + "step": 8088 + }, + { + "epoch": 1.8765804431040483, + "grad_norm": 20.230308054727587, + "learning_rate": 2e-06, + "loss": 0.2641, + "step": 8089 + }, + { + "epoch": 1.876812434752349, + "grad_norm": 19.98312531455832, + "learning_rate": 2e-06, + "loss": 0.2823, + "step": 8090 + }, + { + "epoch": 1.8770444264006496, + "grad_norm": 13.997583676506107, + "learning_rate": 2e-06, + "loss": 0.3769, + "step": 8091 + }, + { + "epoch": 1.8772764180489503, + "grad_norm": 16.245997317386614, + "learning_rate": 2e-06, + "loss": 0.265, + "step": 8092 + }, + { + "epoch": 1.877508409697251, + "grad_norm": 6.987822813233787, + "learning_rate": 2e-06, + "loss": 0.1468, + "step": 8093 + }, + { + "epoch": 1.8777404013455516, + "grad_norm": 13.866209639245193, + "learning_rate": 2e-06, + "loss": 0.2698, + "step": 8094 + }, + { + "epoch": 1.8779723929938523, + "grad_norm": 9.262533352772289, + "learning_rate": 2e-06, + "loss": 0.2204, + "step": 8095 + }, + { + "epoch": 1.878204384642153, + "grad_norm": 17.625891059497523, + "learning_rate": 2e-06, + "loss": 0.3638, + "step": 8096 + }, + { + "epoch": 1.8784363762904537, + "grad_norm": 15.109355853479892, + "learning_rate": 2e-06, + "loss": 0.1749, + "step": 8097 + }, + { + "epoch": 1.8786683679387544, + "grad_norm": 23.316790726338905, + "learning_rate": 2e-06, + "loss": 0.3017, + "step": 8098 + }, + { + "epoch": 1.8789003595870548, + "grad_norm": 11.417710489306446, + "learning_rate": 2e-06, + "loss": 0.2762, + "step": 8099 + }, + { + "epoch": 1.8791323512353555, + "grad_norm": 7.568858560857904, + "learning_rate": 2e-06, + "loss": 0.1969, + "step": 8100 + }, + { + "epoch": 1.8793643428836562, + "grad_norm": 24.255230697484876, + "learning_rate": 2e-06, + "loss": 0.3469, + "step": 8101 + }, + { + "epoch": 1.8795963345319568, + "grad_norm": 10.709730914448418, + "learning_rate": 2e-06, + "loss": 0.1396, + "step": 8102 + }, + { + "epoch": 1.8798283261802575, + "grad_norm": 15.12566667585955, + "learning_rate": 2e-06, + "loss": 0.2438, + "step": 8103 + }, + { + "epoch": 1.8800603178285582, + "grad_norm": 10.69027851351895, + "learning_rate": 2e-06, + "loss": 0.1836, + "step": 8104 + }, + { + "epoch": 1.8802923094768589, + "grad_norm": 8.275867129578193, + "learning_rate": 2e-06, + "loss": 0.1433, + "step": 8105 + }, + { + "epoch": 1.8805243011251593, + "grad_norm": 18.796644003060116, + "learning_rate": 2e-06, + "loss": 0.3486, + "step": 8106 + }, + { + "epoch": 1.88075629277346, + "grad_norm": 11.441232359659232, + "learning_rate": 2e-06, + "loss": 0.267, + "step": 8107 + }, + { + "epoch": 1.8809882844217607, + "grad_norm": 13.62519582991966, + "learning_rate": 2e-06, + "loss": 0.1969, + "step": 8108 + }, + { + "epoch": 1.8812202760700614, + "grad_norm": 12.018684733601567, + "learning_rate": 2e-06, + "loss": 0.2544, + "step": 8109 + }, + { + "epoch": 1.881452267718362, + "grad_norm": 18.41427434587355, + "learning_rate": 2e-06, + "loss": 0.2961, + "step": 8110 + }, + { + "epoch": 1.8816842593666627, + "grad_norm": 11.72345637527048, + "learning_rate": 2e-06, + "loss": 0.2071, + "step": 8111 + }, + { + "epoch": 1.8819162510149634, + "grad_norm": 9.216262154161301, + "learning_rate": 2e-06, + "loss": 0.2301, + "step": 8112 + }, + { + "epoch": 1.882148242663264, + "grad_norm": 17.853558650966367, + "learning_rate": 2e-06, + "loss": 0.2497, + "step": 8113 + }, + { + "epoch": 1.8823802343115648, + "grad_norm": 15.682849567438378, + "learning_rate": 2e-06, + "loss": 0.2568, + "step": 8114 + }, + { + "epoch": 1.8826122259598654, + "grad_norm": 23.39963649706194, + "learning_rate": 2e-06, + "loss": 0.3241, + "step": 8115 + }, + { + "epoch": 1.8828442176081661, + "grad_norm": 11.397005268434754, + "learning_rate": 2e-06, + "loss": 0.2757, + "step": 8116 + }, + { + "epoch": 1.8830762092564668, + "grad_norm": 11.623011034774825, + "learning_rate": 2e-06, + "loss": 0.2046, + "step": 8117 + }, + { + "epoch": 1.8833082009047675, + "grad_norm": 16.244386942551117, + "learning_rate": 2e-06, + "loss": 0.2464, + "step": 8118 + }, + { + "epoch": 1.8835401925530681, + "grad_norm": 6.50215631748504, + "learning_rate": 2e-06, + "loss": 0.1487, + "step": 8119 + }, + { + "epoch": 1.8837721842013688, + "grad_norm": 13.968814913063218, + "learning_rate": 2e-06, + "loss": 0.24, + "step": 8120 + }, + { + "epoch": 1.8840041758496695, + "grad_norm": 14.418952475658717, + "learning_rate": 2e-06, + "loss": 0.2565, + "step": 8121 + }, + { + "epoch": 1.8842361674979702, + "grad_norm": 12.90293443244331, + "learning_rate": 2e-06, + "loss": 0.2305, + "step": 8122 + }, + { + "epoch": 1.8844681591462709, + "grad_norm": 11.279220611971848, + "learning_rate": 2e-06, + "loss": 0.2261, + "step": 8123 + }, + { + "epoch": 1.8847001507945715, + "grad_norm": 12.477288015553475, + "learning_rate": 2e-06, + "loss": 0.2421, + "step": 8124 + }, + { + "epoch": 1.8849321424428722, + "grad_norm": 12.22566777943852, + "learning_rate": 2e-06, + "loss": 0.2004, + "step": 8125 + }, + { + "epoch": 1.8851641340911727, + "grad_norm": 10.719972656821513, + "learning_rate": 2e-06, + "loss": 0.2487, + "step": 8126 + }, + { + "epoch": 1.8853961257394733, + "grad_norm": 12.691193607855086, + "learning_rate": 2e-06, + "loss": 0.3267, + "step": 8127 + }, + { + "epoch": 1.885628117387774, + "grad_norm": 11.938062303337183, + "learning_rate": 2e-06, + "loss": 0.2, + "step": 8128 + }, + { + "epoch": 1.8858601090360747, + "grad_norm": 16.38343012082749, + "learning_rate": 2e-06, + "loss": 0.3129, + "step": 8129 + }, + { + "epoch": 1.8860921006843754, + "grad_norm": 13.360707433601444, + "learning_rate": 2e-06, + "loss": 0.297, + "step": 8130 + }, + { + "epoch": 1.886324092332676, + "grad_norm": 8.182444282018277, + "learning_rate": 2e-06, + "loss": 0.2032, + "step": 8131 + }, + { + "epoch": 1.8865560839809767, + "grad_norm": 10.785681550556108, + "learning_rate": 2e-06, + "loss": 0.198, + "step": 8132 + }, + { + "epoch": 1.8867880756292772, + "grad_norm": 32.651483206036424, + "learning_rate": 2e-06, + "loss": 0.2223, + "step": 8133 + }, + { + "epoch": 1.8870200672775779, + "grad_norm": 12.78225895745723, + "learning_rate": 2e-06, + "loss": 0.3003, + "step": 8134 + }, + { + "epoch": 1.8872520589258786, + "grad_norm": 16.118136488781317, + "learning_rate": 2e-06, + "loss": 0.3566, + "step": 8135 + }, + { + "epoch": 1.8874840505741792, + "grad_norm": 16.010772026878108, + "learning_rate": 2e-06, + "loss": 0.2491, + "step": 8136 + }, + { + "epoch": 1.88771604222248, + "grad_norm": 19.512267007684628, + "learning_rate": 2e-06, + "loss": 0.298, + "step": 8137 + }, + { + "epoch": 1.8879480338707806, + "grad_norm": 12.91385202920017, + "learning_rate": 2e-06, + "loss": 0.2058, + "step": 8138 + }, + { + "epoch": 1.8881800255190813, + "grad_norm": 13.799231253139101, + "learning_rate": 2e-06, + "loss": 0.1785, + "step": 8139 + }, + { + "epoch": 1.888412017167382, + "grad_norm": 22.71783170409705, + "learning_rate": 2e-06, + "loss": 0.3614, + "step": 8140 + }, + { + "epoch": 1.8886440088156826, + "grad_norm": 17.47576531872985, + "learning_rate": 2e-06, + "loss": 0.2828, + "step": 8141 + }, + { + "epoch": 1.8888760004639833, + "grad_norm": 17.212646341870357, + "learning_rate": 2e-06, + "loss": 0.2752, + "step": 8142 + }, + { + "epoch": 1.889107992112284, + "grad_norm": 11.878432237227706, + "learning_rate": 2e-06, + "loss": 0.216, + "step": 8143 + }, + { + "epoch": 1.8893399837605847, + "grad_norm": 14.344434202109612, + "learning_rate": 2e-06, + "loss": 0.2197, + "step": 8144 + }, + { + "epoch": 1.8895719754088853, + "grad_norm": 24.818920744905732, + "learning_rate": 2e-06, + "loss": 0.2646, + "step": 8145 + }, + { + "epoch": 1.889803967057186, + "grad_norm": 10.651778755421654, + "learning_rate": 2e-06, + "loss": 0.1695, + "step": 8146 + }, + { + "epoch": 1.8900359587054867, + "grad_norm": 14.289848463260581, + "learning_rate": 2e-06, + "loss": 0.2756, + "step": 8147 + }, + { + "epoch": 1.8902679503537874, + "grad_norm": 7.128992669850312, + "learning_rate": 2e-06, + "loss": 0.1521, + "step": 8148 + }, + { + "epoch": 1.890499942002088, + "grad_norm": 17.376190428609913, + "learning_rate": 2e-06, + "loss": 0.2186, + "step": 8149 + }, + { + "epoch": 1.8907319336503887, + "grad_norm": 14.718226807755526, + "learning_rate": 2e-06, + "loss": 0.2132, + "step": 8150 + }, + { + "epoch": 1.8909639252986894, + "grad_norm": 35.42613465223065, + "learning_rate": 2e-06, + "loss": 0.3643, + "step": 8151 + }, + { + "epoch": 1.89119591694699, + "grad_norm": 29.24116725879479, + "learning_rate": 2e-06, + "loss": 0.3953, + "step": 8152 + }, + { + "epoch": 1.8914279085952905, + "grad_norm": 11.504743827237121, + "learning_rate": 2e-06, + "loss": 0.1929, + "step": 8153 + }, + { + "epoch": 1.8916599002435912, + "grad_norm": 15.527277538736046, + "learning_rate": 2e-06, + "loss": 0.2618, + "step": 8154 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 34.41089193580936, + "learning_rate": 2e-06, + "loss": 0.3138, + "step": 8155 + }, + { + "epoch": 1.8921238835401926, + "grad_norm": 17.68286000532179, + "learning_rate": 2e-06, + "loss": 0.2378, + "step": 8156 + }, + { + "epoch": 1.8923558751884932, + "grad_norm": 13.15350512468965, + "learning_rate": 2e-06, + "loss": 0.134, + "step": 8157 + }, + { + "epoch": 1.892587866836794, + "grad_norm": 10.329227674219053, + "learning_rate": 2e-06, + "loss": 0.1932, + "step": 8158 + }, + { + "epoch": 1.8928198584850944, + "grad_norm": 13.006155127117866, + "learning_rate": 2e-06, + "loss": 0.2922, + "step": 8159 + }, + { + "epoch": 1.893051850133395, + "grad_norm": 12.329676024489999, + "learning_rate": 2e-06, + "loss": 0.2295, + "step": 8160 + }, + { + "epoch": 1.8932838417816957, + "grad_norm": 7.363387316454607, + "learning_rate": 2e-06, + "loss": 0.1269, + "step": 8161 + }, + { + "epoch": 1.8935158334299964, + "grad_norm": 17.318169665052793, + "learning_rate": 2e-06, + "loss": 0.2694, + "step": 8162 + }, + { + "epoch": 1.893747825078297, + "grad_norm": 19.869125915460828, + "learning_rate": 2e-06, + "loss": 0.2753, + "step": 8163 + }, + { + "epoch": 1.8939798167265978, + "grad_norm": 7.136062236164506, + "learning_rate": 2e-06, + "loss": 0.2068, + "step": 8164 + }, + { + "epoch": 1.8942118083748984, + "grad_norm": 12.456129038837584, + "learning_rate": 2e-06, + "loss": 0.2313, + "step": 8165 + }, + { + "epoch": 1.8944438000231991, + "grad_norm": 17.244983081129227, + "learning_rate": 2e-06, + "loss": 0.3142, + "step": 8166 + }, + { + "epoch": 1.8946757916714998, + "grad_norm": 16.69022335332699, + "learning_rate": 2e-06, + "loss": 0.2655, + "step": 8167 + }, + { + "epoch": 1.8949077833198005, + "grad_norm": 15.769969245776224, + "learning_rate": 2e-06, + "loss": 0.2282, + "step": 8168 + }, + { + "epoch": 1.8951397749681012, + "grad_norm": 8.289435804973813, + "learning_rate": 2e-06, + "loss": 0.1708, + "step": 8169 + }, + { + "epoch": 1.8953717666164018, + "grad_norm": 10.823837362743815, + "learning_rate": 2e-06, + "loss": 0.234, + "step": 8170 + }, + { + "epoch": 1.8956037582647025, + "grad_norm": 11.207424384436552, + "learning_rate": 2e-06, + "loss": 0.1962, + "step": 8171 + }, + { + "epoch": 1.8958357499130032, + "grad_norm": 9.150322406339379, + "learning_rate": 2e-06, + "loss": 0.2094, + "step": 8172 + }, + { + "epoch": 1.8960677415613039, + "grad_norm": 17.94134879246771, + "learning_rate": 2e-06, + "loss": 0.2231, + "step": 8173 + }, + { + "epoch": 1.8962997332096045, + "grad_norm": 22.93012671108854, + "learning_rate": 2e-06, + "loss": 0.2061, + "step": 8174 + }, + { + "epoch": 1.8965317248579052, + "grad_norm": 16.504892149263032, + "learning_rate": 2e-06, + "loss": 0.2255, + "step": 8175 + }, + { + "epoch": 1.896763716506206, + "grad_norm": 17.36398239932726, + "learning_rate": 2e-06, + "loss": 0.2867, + "step": 8176 + }, + { + "epoch": 1.8969957081545066, + "grad_norm": 22.820471052406855, + "learning_rate": 2e-06, + "loss": 0.1776, + "step": 8177 + }, + { + "epoch": 1.8972276998028073, + "grad_norm": 12.508140962756473, + "learning_rate": 2e-06, + "loss": 0.2118, + "step": 8178 + }, + { + "epoch": 1.8974596914511077, + "grad_norm": 15.703706473722347, + "learning_rate": 2e-06, + "loss": 0.2247, + "step": 8179 + }, + { + "epoch": 1.8976916830994084, + "grad_norm": 34.07236853610495, + "learning_rate": 2e-06, + "loss": 0.4813, + "step": 8180 + }, + { + "epoch": 1.897923674747709, + "grad_norm": 15.111982169520063, + "learning_rate": 2e-06, + "loss": 0.3347, + "step": 8181 + }, + { + "epoch": 1.8981556663960097, + "grad_norm": 18.108042990665616, + "learning_rate": 2e-06, + "loss": 0.3716, + "step": 8182 + }, + { + "epoch": 1.8983876580443104, + "grad_norm": 10.371113738441565, + "learning_rate": 2e-06, + "loss": 0.1723, + "step": 8183 + }, + { + "epoch": 1.898619649692611, + "grad_norm": 11.176901885735523, + "learning_rate": 2e-06, + "loss": 0.2535, + "step": 8184 + }, + { + "epoch": 1.8988516413409118, + "grad_norm": 11.546728228280674, + "learning_rate": 2e-06, + "loss": 0.1787, + "step": 8185 + }, + { + "epoch": 1.8990836329892122, + "grad_norm": 18.031724353604417, + "learning_rate": 2e-06, + "loss": 0.263, + "step": 8186 + }, + { + "epoch": 1.899315624637513, + "grad_norm": 7.97087998375236, + "learning_rate": 2e-06, + "loss": 0.128, + "step": 8187 + }, + { + "epoch": 1.8995476162858136, + "grad_norm": 10.91288450703131, + "learning_rate": 2e-06, + "loss": 0.1763, + "step": 8188 + }, + { + "epoch": 1.8997796079341143, + "grad_norm": 15.287091653761278, + "learning_rate": 2e-06, + "loss": 0.2395, + "step": 8189 + }, + { + "epoch": 1.900011599582415, + "grad_norm": 9.40977049520668, + "learning_rate": 2e-06, + "loss": 0.1625, + "step": 8190 + }, + { + "epoch": 1.9002435912307156, + "grad_norm": 16.861117607116114, + "learning_rate": 2e-06, + "loss": 0.2271, + "step": 8191 + }, + { + "epoch": 1.9004755828790163, + "grad_norm": 11.037214769342043, + "learning_rate": 2e-06, + "loss": 0.1528, + "step": 8192 + }, + { + "epoch": 1.900707574527317, + "grad_norm": 10.195558248624783, + "learning_rate": 2e-06, + "loss": 0.1352, + "step": 8193 + }, + { + "epoch": 1.9009395661756177, + "grad_norm": 19.357736986172732, + "learning_rate": 2e-06, + "loss": 0.2074, + "step": 8194 + }, + { + "epoch": 1.9011715578239183, + "grad_norm": 22.36942699777849, + "learning_rate": 2e-06, + "loss": 0.3187, + "step": 8195 + }, + { + "epoch": 1.901403549472219, + "grad_norm": 8.715362015576734, + "learning_rate": 2e-06, + "loss": 0.2511, + "step": 8196 + }, + { + "epoch": 1.9016355411205197, + "grad_norm": 10.25230107442266, + "learning_rate": 2e-06, + "loss": 0.1778, + "step": 8197 + }, + { + "epoch": 1.9018675327688204, + "grad_norm": 23.561928495390756, + "learning_rate": 2e-06, + "loss": 0.4416, + "step": 8198 + }, + { + "epoch": 1.902099524417121, + "grad_norm": 18.159330177747112, + "learning_rate": 2e-06, + "loss": 0.265, + "step": 8199 + }, + { + "epoch": 1.9023315160654217, + "grad_norm": 18.474659179229178, + "learning_rate": 2e-06, + "loss": 0.2264, + "step": 8200 + }, + { + "epoch": 1.9025635077137224, + "grad_norm": 11.213883376723222, + "learning_rate": 2e-06, + "loss": 0.2187, + "step": 8201 + }, + { + "epoch": 1.902795499362023, + "grad_norm": 11.325475746908815, + "learning_rate": 2e-06, + "loss": 0.1437, + "step": 8202 + }, + { + "epoch": 1.9030274910103238, + "grad_norm": 12.667406137830117, + "learning_rate": 2e-06, + "loss": 0.2505, + "step": 8203 + }, + { + "epoch": 1.9032594826586244, + "grad_norm": 14.219083128107396, + "learning_rate": 2e-06, + "loss": 0.1709, + "step": 8204 + }, + { + "epoch": 1.9034914743069251, + "grad_norm": 21.08465805561743, + "learning_rate": 2e-06, + "loss": 0.4011, + "step": 8205 + }, + { + "epoch": 1.9037234659552256, + "grad_norm": 16.782938445686284, + "learning_rate": 2e-06, + "loss": 0.2381, + "step": 8206 + }, + { + "epoch": 1.9039554576035262, + "grad_norm": 12.519800931564225, + "learning_rate": 2e-06, + "loss": 0.1999, + "step": 8207 + }, + { + "epoch": 1.904187449251827, + "grad_norm": 14.461461522859151, + "learning_rate": 2e-06, + "loss": 0.3346, + "step": 8208 + }, + { + "epoch": 1.9044194409001276, + "grad_norm": 14.345881539762717, + "learning_rate": 2e-06, + "loss": 0.2979, + "step": 8209 + }, + { + "epoch": 1.9046514325484283, + "grad_norm": 21.421155368467556, + "learning_rate": 2e-06, + "loss": 0.3399, + "step": 8210 + }, + { + "epoch": 1.904883424196729, + "grad_norm": 14.75430383047075, + "learning_rate": 2e-06, + "loss": 0.1874, + "step": 8211 + }, + { + "epoch": 1.9051154158450294, + "grad_norm": 16.452560204189812, + "learning_rate": 2e-06, + "loss": 0.2955, + "step": 8212 + }, + { + "epoch": 1.90534740749333, + "grad_norm": 12.58674547104881, + "learning_rate": 2e-06, + "loss": 0.2175, + "step": 8213 + }, + { + "epoch": 1.9055793991416308, + "grad_norm": 32.0433974432042, + "learning_rate": 2e-06, + "loss": 0.3187, + "step": 8214 + }, + { + "epoch": 1.9058113907899314, + "grad_norm": 27.844541614260155, + "learning_rate": 2e-06, + "loss": 0.3755, + "step": 8215 + }, + { + "epoch": 1.9060433824382321, + "grad_norm": 10.550625708302274, + "learning_rate": 2e-06, + "loss": 0.1614, + "step": 8216 + }, + { + "epoch": 1.9062753740865328, + "grad_norm": 20.29109359857516, + "learning_rate": 2e-06, + "loss": 0.3744, + "step": 8217 + }, + { + "epoch": 1.9065073657348335, + "grad_norm": 18.18674145651999, + "learning_rate": 2e-06, + "loss": 0.3038, + "step": 8218 + }, + { + "epoch": 1.9067393573831342, + "grad_norm": 26.554136681725186, + "learning_rate": 2e-06, + "loss": 0.3285, + "step": 8219 + }, + { + "epoch": 1.9069713490314348, + "grad_norm": 17.913901890935545, + "learning_rate": 2e-06, + "loss": 0.2838, + "step": 8220 + }, + { + "epoch": 1.9072033406797355, + "grad_norm": 19.446754092117164, + "learning_rate": 2e-06, + "loss": 0.2213, + "step": 8221 + }, + { + "epoch": 1.9074353323280362, + "grad_norm": 19.892146142585084, + "learning_rate": 2e-06, + "loss": 0.3371, + "step": 8222 + }, + { + "epoch": 1.9076673239763369, + "grad_norm": 13.182404615018127, + "learning_rate": 2e-06, + "loss": 0.2448, + "step": 8223 + }, + { + "epoch": 1.9078993156246375, + "grad_norm": 20.5755363946165, + "learning_rate": 2e-06, + "loss": 0.2963, + "step": 8224 + }, + { + "epoch": 1.9081313072729382, + "grad_norm": 12.559442015884914, + "learning_rate": 2e-06, + "loss": 0.2099, + "step": 8225 + }, + { + "epoch": 1.908363298921239, + "grad_norm": 9.005075480299361, + "learning_rate": 2e-06, + "loss": 0.1641, + "step": 8226 + }, + { + "epoch": 1.9085952905695396, + "grad_norm": 12.662288781435917, + "learning_rate": 2e-06, + "loss": 0.3134, + "step": 8227 + }, + { + "epoch": 1.9088272822178403, + "grad_norm": 12.007012421122173, + "learning_rate": 2e-06, + "loss": 0.2553, + "step": 8228 + }, + { + "epoch": 1.909059273866141, + "grad_norm": 11.816634760711612, + "learning_rate": 2e-06, + "loss": 0.2257, + "step": 8229 + }, + { + "epoch": 1.9092912655144416, + "grad_norm": 12.862882132997411, + "learning_rate": 2e-06, + "loss": 0.1843, + "step": 8230 + }, + { + "epoch": 1.9095232571627423, + "grad_norm": 26.931106829079567, + "learning_rate": 2e-06, + "loss": 0.236, + "step": 8231 + }, + { + "epoch": 1.9097552488110427, + "grad_norm": 9.259853349503151, + "learning_rate": 2e-06, + "loss": 0.2687, + "step": 8232 + }, + { + "epoch": 1.9099872404593434, + "grad_norm": 12.53754130652307, + "learning_rate": 2e-06, + "loss": 0.2398, + "step": 8233 + }, + { + "epoch": 1.910219232107644, + "grad_norm": 18.023157803986198, + "learning_rate": 2e-06, + "loss": 0.3218, + "step": 8234 + }, + { + "epoch": 1.9104512237559448, + "grad_norm": 8.078029810459396, + "learning_rate": 2e-06, + "loss": 0.1691, + "step": 8235 + }, + { + "epoch": 1.9106832154042455, + "grad_norm": 14.604188755436693, + "learning_rate": 2e-06, + "loss": 0.2846, + "step": 8236 + }, + { + "epoch": 1.9109152070525461, + "grad_norm": 9.915303648016163, + "learning_rate": 2e-06, + "loss": 0.1822, + "step": 8237 + }, + { + "epoch": 1.9111471987008468, + "grad_norm": 8.937008541033672, + "learning_rate": 2e-06, + "loss": 0.1915, + "step": 8238 + }, + { + "epoch": 1.9113791903491473, + "grad_norm": 15.495136841108435, + "learning_rate": 2e-06, + "loss": 0.1943, + "step": 8239 + }, + { + "epoch": 1.911611181997448, + "grad_norm": 18.594498427474978, + "learning_rate": 2e-06, + "loss": 0.2526, + "step": 8240 + }, + { + "epoch": 1.9118431736457486, + "grad_norm": 11.037879862277668, + "learning_rate": 2e-06, + "loss": 0.2421, + "step": 8241 + }, + { + "epoch": 1.9120751652940493, + "grad_norm": 24.98641628833053, + "learning_rate": 2e-06, + "loss": 0.2371, + "step": 8242 + }, + { + "epoch": 1.91230715694235, + "grad_norm": 16.29291873274218, + "learning_rate": 2e-06, + "loss": 0.3609, + "step": 8243 + }, + { + "epoch": 1.9125391485906507, + "grad_norm": 21.25534135384721, + "learning_rate": 2e-06, + "loss": 0.3265, + "step": 8244 + }, + { + "epoch": 1.9127711402389513, + "grad_norm": 10.772522409197187, + "learning_rate": 2e-06, + "loss": 0.2059, + "step": 8245 + }, + { + "epoch": 1.913003131887252, + "grad_norm": 16.822319519518206, + "learning_rate": 2e-06, + "loss": 0.2291, + "step": 8246 + }, + { + "epoch": 1.9132351235355527, + "grad_norm": 14.701476350588532, + "learning_rate": 2e-06, + "loss": 0.2077, + "step": 8247 + }, + { + "epoch": 1.9134671151838534, + "grad_norm": 10.151290714699144, + "learning_rate": 2e-06, + "loss": 0.1983, + "step": 8248 + }, + { + "epoch": 1.913699106832154, + "grad_norm": 11.486781285690082, + "learning_rate": 2e-06, + "loss": 0.1778, + "step": 8249 + }, + { + "epoch": 1.9139310984804547, + "grad_norm": 16.169915834829315, + "learning_rate": 2e-06, + "loss": 0.2844, + "step": 8250 + }, + { + "epoch": 1.9141630901287554, + "grad_norm": 18.397321285879713, + "learning_rate": 2e-06, + "loss": 0.2849, + "step": 8251 + }, + { + "epoch": 1.914395081777056, + "grad_norm": 11.317068990602722, + "learning_rate": 2e-06, + "loss": 0.2229, + "step": 8252 + }, + { + "epoch": 1.9146270734253568, + "grad_norm": 19.361350050762294, + "learning_rate": 2e-06, + "loss": 0.2358, + "step": 8253 + }, + { + "epoch": 1.9148590650736574, + "grad_norm": 14.06912297548446, + "learning_rate": 2e-06, + "loss": 0.3069, + "step": 8254 + }, + { + "epoch": 1.9150910567219581, + "grad_norm": 17.87175382915879, + "learning_rate": 2e-06, + "loss": 0.2287, + "step": 8255 + }, + { + "epoch": 1.9153230483702588, + "grad_norm": 15.518948701012697, + "learning_rate": 2e-06, + "loss": 0.2169, + "step": 8256 + }, + { + "epoch": 1.9155550400185595, + "grad_norm": 9.438148701509457, + "learning_rate": 2e-06, + "loss": 0.1725, + "step": 8257 + }, + { + "epoch": 1.9157870316668602, + "grad_norm": 17.408432016873743, + "learning_rate": 2e-06, + "loss": 0.2678, + "step": 8258 + }, + { + "epoch": 1.9160190233151606, + "grad_norm": 8.518150459279795, + "learning_rate": 2e-06, + "loss": 0.1942, + "step": 8259 + }, + { + "epoch": 1.9162510149634613, + "grad_norm": 11.916440509055764, + "learning_rate": 2e-06, + "loss": 0.2316, + "step": 8260 + }, + { + "epoch": 1.916483006611762, + "grad_norm": 19.22244658410548, + "learning_rate": 2e-06, + "loss": 0.2165, + "step": 8261 + }, + { + "epoch": 1.9167149982600626, + "grad_norm": 22.489005047076123, + "learning_rate": 2e-06, + "loss": 0.3285, + "step": 8262 + }, + { + "epoch": 1.9169469899083633, + "grad_norm": 13.18035388952861, + "learning_rate": 2e-06, + "loss": 0.2058, + "step": 8263 + }, + { + "epoch": 1.917178981556664, + "grad_norm": 16.614831732754023, + "learning_rate": 2e-06, + "loss": 0.2375, + "step": 8264 + }, + { + "epoch": 1.9174109732049647, + "grad_norm": 11.545535466753725, + "learning_rate": 2e-06, + "loss": 0.1232, + "step": 8265 + }, + { + "epoch": 1.9176429648532651, + "grad_norm": 12.093689927206457, + "learning_rate": 2e-06, + "loss": 0.2792, + "step": 8266 + }, + { + "epoch": 1.9178749565015658, + "grad_norm": 9.959454490978523, + "learning_rate": 2e-06, + "loss": 0.2145, + "step": 8267 + }, + { + "epoch": 1.9181069481498665, + "grad_norm": 12.2647047989879, + "learning_rate": 2e-06, + "loss": 0.2634, + "step": 8268 + }, + { + "epoch": 1.9183389397981672, + "grad_norm": 16.72304117556915, + "learning_rate": 2e-06, + "loss": 0.3142, + "step": 8269 + }, + { + "epoch": 1.9185709314464678, + "grad_norm": 19.31870618615468, + "learning_rate": 2e-06, + "loss": 0.382, + "step": 8270 + }, + { + "epoch": 1.9188029230947685, + "grad_norm": 6.752555646575119, + "learning_rate": 2e-06, + "loss": 0.1397, + "step": 8271 + }, + { + "epoch": 1.9190349147430692, + "grad_norm": 7.376211731271454, + "learning_rate": 2e-06, + "loss": 0.1096, + "step": 8272 + }, + { + "epoch": 1.9192669063913699, + "grad_norm": 14.1403769816287, + "learning_rate": 2e-06, + "loss": 0.2297, + "step": 8273 + }, + { + "epoch": 1.9194988980396706, + "grad_norm": 18.630262161070675, + "learning_rate": 2e-06, + "loss": 0.2822, + "step": 8274 + }, + { + "epoch": 1.9197308896879712, + "grad_norm": 10.637772181676997, + "learning_rate": 2e-06, + "loss": 0.2134, + "step": 8275 + }, + { + "epoch": 1.919962881336272, + "grad_norm": 10.803230879586474, + "learning_rate": 2e-06, + "loss": 0.2273, + "step": 8276 + }, + { + "epoch": 1.9201948729845726, + "grad_norm": 14.041361424353319, + "learning_rate": 2e-06, + "loss": 0.2323, + "step": 8277 + }, + { + "epoch": 1.9204268646328733, + "grad_norm": 25.059508495132192, + "learning_rate": 2e-06, + "loss": 0.3291, + "step": 8278 + }, + { + "epoch": 1.920658856281174, + "grad_norm": 17.070058774407894, + "learning_rate": 2e-06, + "loss": 0.2736, + "step": 8279 + }, + { + "epoch": 1.9208908479294746, + "grad_norm": 11.351872788066839, + "learning_rate": 2e-06, + "loss": 0.2799, + "step": 8280 + }, + { + "epoch": 1.9211228395777753, + "grad_norm": 18.216056033597024, + "learning_rate": 2e-06, + "loss": 0.2414, + "step": 8281 + }, + { + "epoch": 1.921354831226076, + "grad_norm": 13.080737756337564, + "learning_rate": 2e-06, + "loss": 0.3448, + "step": 8282 + }, + { + "epoch": 1.9215868228743767, + "grad_norm": 20.208896374040844, + "learning_rate": 2e-06, + "loss": 0.259, + "step": 8283 + }, + { + "epoch": 1.9218188145226773, + "grad_norm": 2.866532475964635, + "learning_rate": 2e-06, + "loss": 0.1151, + "step": 8284 + }, + { + "epoch": 1.922050806170978, + "grad_norm": 7.573395237380319, + "learning_rate": 2e-06, + "loss": 0.1573, + "step": 8285 + }, + { + "epoch": 1.9222827978192785, + "grad_norm": 18.308860336279682, + "learning_rate": 2e-06, + "loss": 0.2621, + "step": 8286 + }, + { + "epoch": 1.9225147894675791, + "grad_norm": 11.665796555064631, + "learning_rate": 2e-06, + "loss": 0.1945, + "step": 8287 + }, + { + "epoch": 1.9227467811158798, + "grad_norm": 15.296308819790674, + "learning_rate": 2e-06, + "loss": 0.2331, + "step": 8288 + }, + { + "epoch": 1.9229787727641805, + "grad_norm": 9.646629419972417, + "learning_rate": 2e-06, + "loss": 0.1939, + "step": 8289 + }, + { + "epoch": 1.9232107644124812, + "grad_norm": 103.93129525379511, + "learning_rate": 2e-06, + "loss": 0.2315, + "step": 8290 + }, + { + "epoch": 1.9234427560607819, + "grad_norm": 13.058749888640612, + "learning_rate": 2e-06, + "loss": 0.2627, + "step": 8291 + }, + { + "epoch": 1.9236747477090823, + "grad_norm": 17.24259009477687, + "learning_rate": 2e-06, + "loss": 0.2115, + "step": 8292 + }, + { + "epoch": 1.923906739357383, + "grad_norm": 10.338322982938008, + "learning_rate": 2e-06, + "loss": 0.2494, + "step": 8293 + }, + { + "epoch": 1.9241387310056837, + "grad_norm": 13.010569352737638, + "learning_rate": 2e-06, + "loss": 0.2818, + "step": 8294 + }, + { + "epoch": 1.9243707226539843, + "grad_norm": 10.243548363837144, + "learning_rate": 2e-06, + "loss": 0.2308, + "step": 8295 + }, + { + "epoch": 1.924602714302285, + "grad_norm": 26.38176885433789, + "learning_rate": 2e-06, + "loss": 0.2453, + "step": 8296 + }, + { + "epoch": 1.9248347059505857, + "grad_norm": 18.326231746899325, + "learning_rate": 2e-06, + "loss": 0.2372, + "step": 8297 + }, + { + "epoch": 1.9250666975988864, + "grad_norm": 15.340498799445832, + "learning_rate": 2e-06, + "loss": 0.2787, + "step": 8298 + }, + { + "epoch": 1.925298689247187, + "grad_norm": 11.107298049649948, + "learning_rate": 2e-06, + "loss": 0.2725, + "step": 8299 + }, + { + "epoch": 1.9255306808954877, + "grad_norm": 15.409151955869918, + "learning_rate": 2e-06, + "loss": 0.3218, + "step": 8300 + }, + { + "epoch": 1.9257626725437884, + "grad_norm": 13.18529269749906, + "learning_rate": 2e-06, + "loss": 0.3214, + "step": 8301 + }, + { + "epoch": 1.925994664192089, + "grad_norm": 12.997759326811828, + "learning_rate": 2e-06, + "loss": 0.1669, + "step": 8302 + }, + { + "epoch": 1.9262266558403898, + "grad_norm": 11.75141841418822, + "learning_rate": 2e-06, + "loss": 0.2567, + "step": 8303 + }, + { + "epoch": 1.9264586474886904, + "grad_norm": 13.803993613795765, + "learning_rate": 2e-06, + "loss": 0.3057, + "step": 8304 + }, + { + "epoch": 1.9266906391369911, + "grad_norm": 11.200192090314818, + "learning_rate": 2e-06, + "loss": 0.2454, + "step": 8305 + }, + { + "epoch": 1.9269226307852918, + "grad_norm": 10.956488078705902, + "learning_rate": 2e-06, + "loss": 0.2506, + "step": 8306 + }, + { + "epoch": 1.9271546224335925, + "grad_norm": 38.90968870785902, + "learning_rate": 2e-06, + "loss": 0.26, + "step": 8307 + }, + { + "epoch": 1.9273866140818932, + "grad_norm": 12.874901051121027, + "learning_rate": 2e-06, + "loss": 0.3622, + "step": 8308 + }, + { + "epoch": 1.9276186057301938, + "grad_norm": 18.50058674378736, + "learning_rate": 2e-06, + "loss": 0.2987, + "step": 8309 + }, + { + "epoch": 1.9278505973784945, + "grad_norm": 14.402281423883219, + "learning_rate": 2e-06, + "loss": 0.2354, + "step": 8310 + }, + { + "epoch": 1.9280825890267952, + "grad_norm": 14.094001216596647, + "learning_rate": 2e-06, + "loss": 0.3899, + "step": 8311 + }, + { + "epoch": 1.9283145806750956, + "grad_norm": 18.630838256596093, + "learning_rate": 2e-06, + "loss": 0.2709, + "step": 8312 + }, + { + "epoch": 1.9285465723233963, + "grad_norm": 16.355351861756276, + "learning_rate": 2e-06, + "loss": 0.3076, + "step": 8313 + }, + { + "epoch": 1.928778563971697, + "grad_norm": 13.490106187278478, + "learning_rate": 2e-06, + "loss": 0.2527, + "step": 8314 + }, + { + "epoch": 1.9290105556199977, + "grad_norm": 17.257607194236968, + "learning_rate": 2e-06, + "loss": 0.2722, + "step": 8315 + }, + { + "epoch": 1.9292425472682984, + "grad_norm": 10.354500315551464, + "learning_rate": 2e-06, + "loss": 0.2323, + "step": 8316 + }, + { + "epoch": 1.929474538916599, + "grad_norm": 16.34349615296237, + "learning_rate": 2e-06, + "loss": 0.2311, + "step": 8317 + }, + { + "epoch": 1.9297065305648997, + "grad_norm": 8.077491084678064, + "learning_rate": 2e-06, + "loss": 0.177, + "step": 8318 + }, + { + "epoch": 1.9299385222132002, + "grad_norm": 19.54466387276364, + "learning_rate": 2e-06, + "loss": 0.3311, + "step": 8319 + }, + { + "epoch": 1.9301705138615008, + "grad_norm": 11.75052050959431, + "learning_rate": 2e-06, + "loss": 0.2188, + "step": 8320 + }, + { + "epoch": 1.9304025055098015, + "grad_norm": 15.061184790407024, + "learning_rate": 2e-06, + "loss": 0.2134, + "step": 8321 + }, + { + "epoch": 1.9306344971581022, + "grad_norm": 14.553576441539253, + "learning_rate": 2e-06, + "loss": 0.2815, + "step": 8322 + }, + { + "epoch": 1.9308664888064029, + "grad_norm": 14.718620141267312, + "learning_rate": 2e-06, + "loss": 0.2289, + "step": 8323 + }, + { + "epoch": 1.9310984804547036, + "grad_norm": 20.38486878391961, + "learning_rate": 2e-06, + "loss": 0.2531, + "step": 8324 + }, + { + "epoch": 1.9313304721030042, + "grad_norm": 14.426701058009007, + "learning_rate": 2e-06, + "loss": 0.2395, + "step": 8325 + }, + { + "epoch": 1.931562463751305, + "grad_norm": 7.443348328961412, + "learning_rate": 2e-06, + "loss": 0.1931, + "step": 8326 + }, + { + "epoch": 1.9317944553996056, + "grad_norm": 13.279828797272623, + "learning_rate": 2e-06, + "loss": 0.2471, + "step": 8327 + }, + { + "epoch": 1.9320264470479063, + "grad_norm": 14.691548590843379, + "learning_rate": 2e-06, + "loss": 0.2645, + "step": 8328 + }, + { + "epoch": 1.932258438696207, + "grad_norm": 18.380291854873782, + "learning_rate": 2e-06, + "loss": 0.2756, + "step": 8329 + }, + { + "epoch": 1.9324904303445076, + "grad_norm": 16.434077330676157, + "learning_rate": 2e-06, + "loss": 0.2942, + "step": 8330 + }, + { + "epoch": 1.9327224219928083, + "grad_norm": 23.01022355060371, + "learning_rate": 2e-06, + "loss": 0.337, + "step": 8331 + }, + { + "epoch": 1.932954413641109, + "grad_norm": 15.352569187881347, + "learning_rate": 2e-06, + "loss": 0.2557, + "step": 8332 + }, + { + "epoch": 1.9331864052894097, + "grad_norm": 31.26671018444753, + "learning_rate": 2e-06, + "loss": 0.2989, + "step": 8333 + }, + { + "epoch": 1.9334183969377103, + "grad_norm": 4.16190214180648, + "learning_rate": 2e-06, + "loss": 0.1166, + "step": 8334 + }, + { + "epoch": 1.933650388586011, + "grad_norm": 19.25567276550992, + "learning_rate": 2e-06, + "loss": 0.3374, + "step": 8335 + }, + { + "epoch": 1.9338823802343117, + "grad_norm": 16.127640357750774, + "learning_rate": 2e-06, + "loss": 0.1962, + "step": 8336 + }, + { + "epoch": 1.9341143718826124, + "grad_norm": 9.010495407693012, + "learning_rate": 2e-06, + "loss": 0.1595, + "step": 8337 + }, + { + "epoch": 1.934346363530913, + "grad_norm": 13.093907921328455, + "learning_rate": 2e-06, + "loss": 0.2079, + "step": 8338 + }, + { + "epoch": 1.9345783551792135, + "grad_norm": 11.93257976455428, + "learning_rate": 2e-06, + "loss": 0.1789, + "step": 8339 + }, + { + "epoch": 1.9348103468275142, + "grad_norm": 12.455310510281812, + "learning_rate": 2e-06, + "loss": 0.1926, + "step": 8340 + }, + { + "epoch": 1.9350423384758149, + "grad_norm": 9.872377564603019, + "learning_rate": 2e-06, + "loss": 0.1529, + "step": 8341 + }, + { + "epoch": 1.9352743301241155, + "grad_norm": 16.326098772159725, + "learning_rate": 2e-06, + "loss": 0.1564, + "step": 8342 + }, + { + "epoch": 1.9355063217724162, + "grad_norm": 8.191776245582423, + "learning_rate": 2e-06, + "loss": 0.1847, + "step": 8343 + }, + { + "epoch": 1.935738313420717, + "grad_norm": 15.177847539987345, + "learning_rate": 2e-06, + "loss": 0.2256, + "step": 8344 + }, + { + "epoch": 1.9359703050690173, + "grad_norm": 15.284297069013228, + "learning_rate": 2e-06, + "loss": 0.2495, + "step": 8345 + }, + { + "epoch": 1.936202296717318, + "grad_norm": 10.819774117608612, + "learning_rate": 2e-06, + "loss": 0.1997, + "step": 8346 + }, + { + "epoch": 1.9364342883656187, + "grad_norm": 24.980519277732025, + "learning_rate": 2e-06, + "loss": 0.2839, + "step": 8347 + }, + { + "epoch": 1.9366662800139194, + "grad_norm": 16.686504111394285, + "learning_rate": 2e-06, + "loss": 0.256, + "step": 8348 + }, + { + "epoch": 1.93689827166222, + "grad_norm": 15.473655593524072, + "learning_rate": 2e-06, + "loss": 0.2852, + "step": 8349 + }, + { + "epoch": 1.9371302633105207, + "grad_norm": 13.78303381473132, + "learning_rate": 2e-06, + "loss": 0.3147, + "step": 8350 + }, + { + "epoch": 1.9373622549588214, + "grad_norm": 15.809047183386966, + "learning_rate": 2e-06, + "loss": 0.2181, + "step": 8351 + }, + { + "epoch": 1.937594246607122, + "grad_norm": 14.087075162865043, + "learning_rate": 2e-06, + "loss": 0.2784, + "step": 8352 + }, + { + "epoch": 1.9378262382554228, + "grad_norm": 6.435673804344862, + "learning_rate": 2e-06, + "loss": 0.144, + "step": 8353 + }, + { + "epoch": 1.9380582299037235, + "grad_norm": 12.885130455224658, + "learning_rate": 2e-06, + "loss": 0.3624, + "step": 8354 + }, + { + "epoch": 1.9382902215520241, + "grad_norm": 13.006022998323044, + "learning_rate": 2e-06, + "loss": 0.1781, + "step": 8355 + }, + { + "epoch": 1.9385222132003248, + "grad_norm": 12.62404139588954, + "learning_rate": 2e-06, + "loss": 0.2423, + "step": 8356 + }, + { + "epoch": 1.9387542048486255, + "grad_norm": 9.91038736403853, + "learning_rate": 2e-06, + "loss": 0.2856, + "step": 8357 + }, + { + "epoch": 1.9389861964969262, + "grad_norm": 24.05402448981315, + "learning_rate": 2e-06, + "loss": 0.3787, + "step": 8358 + }, + { + "epoch": 1.9392181881452268, + "grad_norm": 15.102452709625638, + "learning_rate": 2e-06, + "loss": 0.2343, + "step": 8359 + }, + { + "epoch": 1.9394501797935275, + "grad_norm": 13.076728471248122, + "learning_rate": 2e-06, + "loss": 0.1743, + "step": 8360 + }, + { + "epoch": 1.9396821714418282, + "grad_norm": 9.24397385494658, + "learning_rate": 2e-06, + "loss": 0.2097, + "step": 8361 + }, + { + "epoch": 1.9399141630901289, + "grad_norm": 16.266485171871672, + "learning_rate": 2e-06, + "loss": 0.2397, + "step": 8362 + }, + { + "epoch": 1.9401461547384296, + "grad_norm": 12.217629891584592, + "learning_rate": 2e-06, + "loss": 0.16, + "step": 8363 + }, + { + "epoch": 1.9403781463867302, + "grad_norm": 11.591707102973945, + "learning_rate": 2e-06, + "loss": 0.2571, + "step": 8364 + }, + { + "epoch": 1.9406101380350307, + "grad_norm": 8.643280962767939, + "learning_rate": 2e-06, + "loss": 0.1058, + "step": 8365 + }, + { + "epoch": 1.9408421296833314, + "grad_norm": 40.216672592940476, + "learning_rate": 2e-06, + "loss": 0.378, + "step": 8366 + }, + { + "epoch": 1.941074121331632, + "grad_norm": 6.157949903531771, + "learning_rate": 2e-06, + "loss": 0.1596, + "step": 8367 + }, + { + "epoch": 1.9413061129799327, + "grad_norm": 11.693409202021451, + "learning_rate": 2e-06, + "loss": 0.2576, + "step": 8368 + }, + { + "epoch": 1.9415381046282334, + "grad_norm": 10.782026224472514, + "learning_rate": 2e-06, + "loss": 0.1393, + "step": 8369 + }, + { + "epoch": 1.941770096276534, + "grad_norm": 23.328197673210656, + "learning_rate": 2e-06, + "loss": 0.3011, + "step": 8370 + }, + { + "epoch": 1.9420020879248348, + "grad_norm": 20.896594843390798, + "learning_rate": 2e-06, + "loss": 0.2877, + "step": 8371 + }, + { + "epoch": 1.9422340795731352, + "grad_norm": 15.411905183941613, + "learning_rate": 2e-06, + "loss": 0.2703, + "step": 8372 + }, + { + "epoch": 1.9424660712214359, + "grad_norm": 13.43832227814687, + "learning_rate": 2e-06, + "loss": 0.209, + "step": 8373 + }, + { + "epoch": 1.9426980628697366, + "grad_norm": 11.963382454873283, + "learning_rate": 2e-06, + "loss": 0.3074, + "step": 8374 + }, + { + "epoch": 1.9429300545180372, + "grad_norm": 11.95775311491443, + "learning_rate": 2e-06, + "loss": 0.223, + "step": 8375 + }, + { + "epoch": 1.943162046166338, + "grad_norm": 17.428277570749742, + "learning_rate": 2e-06, + "loss": 0.3112, + "step": 8376 + }, + { + "epoch": 1.9433940378146386, + "grad_norm": 15.230805680496852, + "learning_rate": 2e-06, + "loss": 0.2687, + "step": 8377 + }, + { + "epoch": 1.9436260294629393, + "grad_norm": 11.831562166444332, + "learning_rate": 2e-06, + "loss": 0.2156, + "step": 8378 + }, + { + "epoch": 1.94385802111124, + "grad_norm": 11.559861634564646, + "learning_rate": 2e-06, + "loss": 0.205, + "step": 8379 + }, + { + "epoch": 1.9440900127595406, + "grad_norm": 20.769356547787247, + "learning_rate": 2e-06, + "loss": 0.2047, + "step": 8380 + }, + { + "epoch": 1.9443220044078413, + "grad_norm": 14.79081337535743, + "learning_rate": 2e-06, + "loss": 0.3526, + "step": 8381 + }, + { + "epoch": 1.944553996056142, + "grad_norm": 26.84448752847015, + "learning_rate": 2e-06, + "loss": 0.2859, + "step": 8382 + }, + { + "epoch": 1.9447859877044427, + "grad_norm": 24.08944134225375, + "learning_rate": 2e-06, + "loss": 0.2332, + "step": 8383 + }, + { + "epoch": 1.9450179793527433, + "grad_norm": 15.492793474505326, + "learning_rate": 2e-06, + "loss": 0.2011, + "step": 8384 + }, + { + "epoch": 1.945249971001044, + "grad_norm": 13.641540417777914, + "learning_rate": 2e-06, + "loss": 0.2306, + "step": 8385 + }, + { + "epoch": 1.9454819626493447, + "grad_norm": 10.80299252573023, + "learning_rate": 2e-06, + "loss": 0.1511, + "step": 8386 + }, + { + "epoch": 1.9457139542976454, + "grad_norm": 13.673854162605489, + "learning_rate": 2e-06, + "loss": 0.2412, + "step": 8387 + }, + { + "epoch": 1.945945945945946, + "grad_norm": 10.380498854463934, + "learning_rate": 2e-06, + "loss": 0.1457, + "step": 8388 + }, + { + "epoch": 1.9461779375942467, + "grad_norm": 22.70279296127693, + "learning_rate": 2e-06, + "loss": 0.2985, + "step": 8389 + }, + { + "epoch": 1.9464099292425474, + "grad_norm": 10.8828809073517, + "learning_rate": 2e-06, + "loss": 0.2901, + "step": 8390 + }, + { + "epoch": 1.946641920890848, + "grad_norm": 12.740824574439682, + "learning_rate": 2e-06, + "loss": 0.2331, + "step": 8391 + }, + { + "epoch": 1.9468739125391485, + "grad_norm": 10.986567109093164, + "learning_rate": 2e-06, + "loss": 0.2182, + "step": 8392 + }, + { + "epoch": 1.9471059041874492, + "grad_norm": 12.355360192244024, + "learning_rate": 2e-06, + "loss": 0.3446, + "step": 8393 + }, + { + "epoch": 1.94733789583575, + "grad_norm": 21.81122795551603, + "learning_rate": 2e-06, + "loss": 0.3911, + "step": 8394 + }, + { + "epoch": 1.9475698874840506, + "grad_norm": 10.852009445706015, + "learning_rate": 2e-06, + "loss": 0.1806, + "step": 8395 + }, + { + "epoch": 1.9478018791323513, + "grad_norm": 15.858822478149348, + "learning_rate": 2e-06, + "loss": 0.307, + "step": 8396 + }, + { + "epoch": 1.948033870780652, + "grad_norm": 7.640288640596409, + "learning_rate": 2e-06, + "loss": 0.2044, + "step": 8397 + }, + { + "epoch": 1.9482658624289526, + "grad_norm": 12.081191805075523, + "learning_rate": 2e-06, + "loss": 0.2311, + "step": 8398 + }, + { + "epoch": 1.948497854077253, + "grad_norm": 14.423207635176416, + "learning_rate": 2e-06, + "loss": 0.2262, + "step": 8399 + }, + { + "epoch": 1.9487298457255537, + "grad_norm": 11.753273624897368, + "learning_rate": 2e-06, + "loss": 0.1815, + "step": 8400 + }, + { + "epoch": 1.9489618373738544, + "grad_norm": 14.569786005421452, + "learning_rate": 2e-06, + "loss": 0.1847, + "step": 8401 + }, + { + "epoch": 1.949193829022155, + "grad_norm": 12.004180040477387, + "learning_rate": 2e-06, + "loss": 0.2279, + "step": 8402 + }, + { + "epoch": 1.9494258206704558, + "grad_norm": 11.534062119713278, + "learning_rate": 2e-06, + "loss": 0.2343, + "step": 8403 + }, + { + "epoch": 1.9496578123187565, + "grad_norm": 16.491537593130033, + "learning_rate": 2e-06, + "loss": 0.4219, + "step": 8404 + }, + { + "epoch": 1.9498898039670571, + "grad_norm": 20.454712496719754, + "learning_rate": 2e-06, + "loss": 0.3419, + "step": 8405 + }, + { + "epoch": 1.9501217956153578, + "grad_norm": 14.254428634177303, + "learning_rate": 2e-06, + "loss": 0.229, + "step": 8406 + }, + { + "epoch": 1.9503537872636585, + "grad_norm": 17.131492337479326, + "learning_rate": 2e-06, + "loss": 0.2399, + "step": 8407 + }, + { + "epoch": 1.9505857789119592, + "grad_norm": 10.012126423335909, + "learning_rate": 2e-06, + "loss": 0.176, + "step": 8408 + }, + { + "epoch": 1.9508177705602598, + "grad_norm": 15.887064355909393, + "learning_rate": 2e-06, + "loss": 0.2605, + "step": 8409 + }, + { + "epoch": 1.9510497622085605, + "grad_norm": 18.013675269362956, + "learning_rate": 2e-06, + "loss": 0.2112, + "step": 8410 + }, + { + "epoch": 1.9512817538568612, + "grad_norm": 16.777442020004855, + "learning_rate": 2e-06, + "loss": 0.2361, + "step": 8411 + }, + { + "epoch": 1.9515137455051619, + "grad_norm": 14.27637380350311, + "learning_rate": 2e-06, + "loss": 0.1877, + "step": 8412 + }, + { + "epoch": 1.9517457371534626, + "grad_norm": 5.247172375194371, + "learning_rate": 2e-06, + "loss": 0.1402, + "step": 8413 + }, + { + "epoch": 1.9519777288017632, + "grad_norm": 12.012920149131409, + "learning_rate": 2e-06, + "loss": 0.2541, + "step": 8414 + }, + { + "epoch": 1.952209720450064, + "grad_norm": 8.75045047925571, + "learning_rate": 2e-06, + "loss": 0.2358, + "step": 8415 + }, + { + "epoch": 1.9524417120983646, + "grad_norm": 11.665687192552866, + "learning_rate": 2e-06, + "loss": 0.2806, + "step": 8416 + }, + { + "epoch": 1.9526737037466653, + "grad_norm": 11.848604742375645, + "learning_rate": 2e-06, + "loss": 0.2743, + "step": 8417 + }, + { + "epoch": 1.952905695394966, + "grad_norm": 7.23663640660394, + "learning_rate": 2e-06, + "loss": 0.1936, + "step": 8418 + }, + { + "epoch": 1.9531376870432664, + "grad_norm": 12.568935820789614, + "learning_rate": 2e-06, + "loss": 0.3088, + "step": 8419 + }, + { + "epoch": 1.953369678691567, + "grad_norm": 8.327932198506467, + "learning_rate": 2e-06, + "loss": 0.2023, + "step": 8420 + }, + { + "epoch": 1.9536016703398678, + "grad_norm": 7.20468914494589, + "learning_rate": 2e-06, + "loss": 0.164, + "step": 8421 + }, + { + "epoch": 1.9538336619881684, + "grad_norm": 11.390215281587516, + "learning_rate": 2e-06, + "loss": 0.1818, + "step": 8422 + }, + { + "epoch": 1.9540656536364691, + "grad_norm": 7.102489376796252, + "learning_rate": 2e-06, + "loss": 0.1231, + "step": 8423 + }, + { + "epoch": 1.9542976452847698, + "grad_norm": 15.509530220768998, + "learning_rate": 2e-06, + "loss": 0.2076, + "step": 8424 + }, + { + "epoch": 1.9545296369330702, + "grad_norm": 16.151138514300136, + "learning_rate": 2e-06, + "loss": 0.3305, + "step": 8425 + }, + { + "epoch": 1.954761628581371, + "grad_norm": 8.82614805592662, + "learning_rate": 2e-06, + "loss": 0.1806, + "step": 8426 + }, + { + "epoch": 1.9549936202296716, + "grad_norm": 20.327694912149823, + "learning_rate": 2e-06, + "loss": 0.3804, + "step": 8427 + }, + { + "epoch": 1.9552256118779723, + "grad_norm": 13.419014003838033, + "learning_rate": 2e-06, + "loss": 0.25, + "step": 8428 + }, + { + "epoch": 1.955457603526273, + "grad_norm": 24.40155706745644, + "learning_rate": 2e-06, + "loss": 0.4561, + "step": 8429 + }, + { + "epoch": 1.9556895951745736, + "grad_norm": 14.019688656480323, + "learning_rate": 2e-06, + "loss": 0.1941, + "step": 8430 + }, + { + "epoch": 1.9559215868228743, + "grad_norm": 12.53301727015699, + "learning_rate": 2e-06, + "loss": 0.2806, + "step": 8431 + }, + { + "epoch": 1.956153578471175, + "grad_norm": 14.361679193213567, + "learning_rate": 2e-06, + "loss": 0.3081, + "step": 8432 + }, + { + "epoch": 1.9563855701194757, + "grad_norm": 26.510147634656878, + "learning_rate": 2e-06, + "loss": 0.4196, + "step": 8433 + }, + { + "epoch": 1.9566175617677763, + "grad_norm": 13.513129730273098, + "learning_rate": 2e-06, + "loss": 0.2441, + "step": 8434 + }, + { + "epoch": 1.956849553416077, + "grad_norm": 14.561957210594727, + "learning_rate": 2e-06, + "loss": 0.279, + "step": 8435 + }, + { + "epoch": 1.9570815450643777, + "grad_norm": 26.445220538157606, + "learning_rate": 2e-06, + "loss": 0.2931, + "step": 8436 + }, + { + "epoch": 1.9573135367126784, + "grad_norm": 13.783969377146583, + "learning_rate": 2e-06, + "loss": 0.2249, + "step": 8437 + }, + { + "epoch": 1.957545528360979, + "grad_norm": 27.928379555788805, + "learning_rate": 2e-06, + "loss": 0.4553, + "step": 8438 + }, + { + "epoch": 1.9577775200092797, + "grad_norm": 17.986716161532808, + "learning_rate": 2e-06, + "loss": 0.2743, + "step": 8439 + }, + { + "epoch": 1.9580095116575804, + "grad_norm": 14.087517558232287, + "learning_rate": 2e-06, + "loss": 0.3162, + "step": 8440 + }, + { + "epoch": 1.958241503305881, + "grad_norm": 12.892139932310869, + "learning_rate": 2e-06, + "loss": 0.2236, + "step": 8441 + }, + { + "epoch": 1.9584734949541818, + "grad_norm": 11.508804773043272, + "learning_rate": 2e-06, + "loss": 0.2051, + "step": 8442 + }, + { + "epoch": 1.9587054866024824, + "grad_norm": 22.660170295163322, + "learning_rate": 2e-06, + "loss": 0.3131, + "step": 8443 + }, + { + "epoch": 1.9589374782507831, + "grad_norm": 19.71433988724558, + "learning_rate": 2e-06, + "loss": 0.3338, + "step": 8444 + }, + { + "epoch": 1.9591694698990836, + "grad_norm": 13.96157075577866, + "learning_rate": 2e-06, + "loss": 0.2529, + "step": 8445 + }, + { + "epoch": 1.9594014615473843, + "grad_norm": 19.057960436934465, + "learning_rate": 2e-06, + "loss": 0.2632, + "step": 8446 + }, + { + "epoch": 1.959633453195685, + "grad_norm": 11.549432793969682, + "learning_rate": 2e-06, + "loss": 0.275, + "step": 8447 + }, + { + "epoch": 1.9598654448439856, + "grad_norm": 21.336870167414542, + "learning_rate": 2e-06, + "loss": 0.2588, + "step": 8448 + }, + { + "epoch": 1.9600974364922863, + "grad_norm": 11.302086911448749, + "learning_rate": 2e-06, + "loss": 0.2778, + "step": 8449 + }, + { + "epoch": 1.960329428140587, + "grad_norm": 15.244429894793514, + "learning_rate": 2e-06, + "loss": 0.2052, + "step": 8450 + }, + { + "epoch": 1.9605614197888876, + "grad_norm": 8.502244674605915, + "learning_rate": 2e-06, + "loss": 0.2113, + "step": 8451 + }, + { + "epoch": 1.960793411437188, + "grad_norm": 26.1900071729061, + "learning_rate": 2e-06, + "loss": 0.3883, + "step": 8452 + }, + { + "epoch": 1.9610254030854888, + "grad_norm": 14.440024116328479, + "learning_rate": 2e-06, + "loss": 0.2319, + "step": 8453 + }, + { + "epoch": 1.9612573947337895, + "grad_norm": 10.80774147458881, + "learning_rate": 2e-06, + "loss": 0.1606, + "step": 8454 + }, + { + "epoch": 1.9614893863820901, + "grad_norm": 15.094895749753553, + "learning_rate": 2e-06, + "loss": 0.2338, + "step": 8455 + }, + { + "epoch": 1.9617213780303908, + "grad_norm": 15.965997277124025, + "learning_rate": 2e-06, + "loss": 0.1719, + "step": 8456 + }, + { + "epoch": 1.9619533696786915, + "grad_norm": 19.69390806864314, + "learning_rate": 2e-06, + "loss": 0.2806, + "step": 8457 + }, + { + "epoch": 1.9621853613269922, + "grad_norm": 9.577296538084363, + "learning_rate": 2e-06, + "loss": 0.2495, + "step": 8458 + }, + { + "epoch": 1.9624173529752929, + "grad_norm": 26.75682900422842, + "learning_rate": 2e-06, + "loss": 0.2845, + "step": 8459 + }, + { + "epoch": 1.9626493446235935, + "grad_norm": 9.733980937932044, + "learning_rate": 2e-06, + "loss": 0.2037, + "step": 8460 + }, + { + "epoch": 1.9628813362718942, + "grad_norm": 22.218555882854805, + "learning_rate": 2e-06, + "loss": 0.3003, + "step": 8461 + }, + { + "epoch": 1.9631133279201949, + "grad_norm": 7.914416513340461, + "learning_rate": 2e-06, + "loss": 0.137, + "step": 8462 + }, + { + "epoch": 1.9633453195684956, + "grad_norm": 10.2317656645032, + "learning_rate": 2e-06, + "loss": 0.1882, + "step": 8463 + }, + { + "epoch": 1.9635773112167962, + "grad_norm": 10.376146409636112, + "learning_rate": 2e-06, + "loss": 0.1614, + "step": 8464 + }, + { + "epoch": 1.963809302865097, + "grad_norm": 17.186088573393064, + "learning_rate": 2e-06, + "loss": 0.2096, + "step": 8465 + }, + { + "epoch": 1.9640412945133976, + "grad_norm": 7.104689584111022, + "learning_rate": 2e-06, + "loss": 0.1568, + "step": 8466 + }, + { + "epoch": 1.9642732861616983, + "grad_norm": 21.66118189834683, + "learning_rate": 2e-06, + "loss": 0.3, + "step": 8467 + }, + { + "epoch": 1.964505277809999, + "grad_norm": 19.41182700985375, + "learning_rate": 2e-06, + "loss": 0.3131, + "step": 8468 + }, + { + "epoch": 1.9647372694582996, + "grad_norm": 9.70519193662155, + "learning_rate": 2e-06, + "loss": 0.3182, + "step": 8469 + }, + { + "epoch": 1.9649692611066003, + "grad_norm": 15.098456257992314, + "learning_rate": 2e-06, + "loss": 0.2525, + "step": 8470 + }, + { + "epoch": 1.965201252754901, + "grad_norm": 8.604709981076459, + "learning_rate": 2e-06, + "loss": 0.285, + "step": 8471 + }, + { + "epoch": 1.9654332444032014, + "grad_norm": 16.637331812879484, + "learning_rate": 2e-06, + "loss": 0.3224, + "step": 8472 + }, + { + "epoch": 1.9656652360515021, + "grad_norm": 21.586022385819454, + "learning_rate": 2e-06, + "loss": 0.4029, + "step": 8473 + }, + { + "epoch": 1.9658972276998028, + "grad_norm": 7.344537697532603, + "learning_rate": 2e-06, + "loss": 0.1618, + "step": 8474 + }, + { + "epoch": 1.9661292193481035, + "grad_norm": 11.941410369839828, + "learning_rate": 2e-06, + "loss": 0.2365, + "step": 8475 + }, + { + "epoch": 1.9663612109964042, + "grad_norm": 10.13680904838394, + "learning_rate": 2e-06, + "loss": 0.1756, + "step": 8476 + }, + { + "epoch": 1.9665932026447048, + "grad_norm": 9.082398998611101, + "learning_rate": 2e-06, + "loss": 0.2709, + "step": 8477 + }, + { + "epoch": 1.9668251942930053, + "grad_norm": 18.60080880891386, + "learning_rate": 2e-06, + "loss": 0.3045, + "step": 8478 + }, + { + "epoch": 1.967057185941306, + "grad_norm": 16.36407242270446, + "learning_rate": 2e-06, + "loss": 0.2604, + "step": 8479 + }, + { + "epoch": 1.9672891775896066, + "grad_norm": 14.299477048910845, + "learning_rate": 2e-06, + "loss": 0.2473, + "step": 8480 + }, + { + "epoch": 1.9675211692379073, + "grad_norm": 16.17852866015562, + "learning_rate": 2e-06, + "loss": 0.3183, + "step": 8481 + }, + { + "epoch": 1.967753160886208, + "grad_norm": 19.005085835163797, + "learning_rate": 2e-06, + "loss": 0.2687, + "step": 8482 + }, + { + "epoch": 1.9679851525345087, + "grad_norm": 12.339758285163436, + "learning_rate": 2e-06, + "loss": 0.2092, + "step": 8483 + }, + { + "epoch": 1.9682171441828094, + "grad_norm": 17.454199640819382, + "learning_rate": 2e-06, + "loss": 0.2479, + "step": 8484 + }, + { + "epoch": 1.96844913583111, + "grad_norm": 20.698836453320574, + "learning_rate": 2e-06, + "loss": 0.405, + "step": 8485 + }, + { + "epoch": 1.9686811274794107, + "grad_norm": 6.671319954473673, + "learning_rate": 2e-06, + "loss": 0.2192, + "step": 8486 + }, + { + "epoch": 1.9689131191277114, + "grad_norm": 13.887452692343361, + "learning_rate": 2e-06, + "loss": 0.2378, + "step": 8487 + }, + { + "epoch": 1.969145110776012, + "grad_norm": 15.72369330544798, + "learning_rate": 2e-06, + "loss": 0.2271, + "step": 8488 + }, + { + "epoch": 1.9693771024243127, + "grad_norm": 22.1359744407644, + "learning_rate": 2e-06, + "loss": 0.3123, + "step": 8489 + }, + { + "epoch": 1.9696090940726134, + "grad_norm": 9.595794165045705, + "learning_rate": 2e-06, + "loss": 0.2045, + "step": 8490 + }, + { + "epoch": 1.969841085720914, + "grad_norm": 8.637735599591288, + "learning_rate": 2e-06, + "loss": 0.1908, + "step": 8491 + }, + { + "epoch": 1.9700730773692148, + "grad_norm": 18.339224502699604, + "learning_rate": 2e-06, + "loss": 0.2914, + "step": 8492 + }, + { + "epoch": 1.9703050690175155, + "grad_norm": 13.889624100787907, + "learning_rate": 2e-06, + "loss": 0.2646, + "step": 8493 + }, + { + "epoch": 1.9705370606658161, + "grad_norm": 14.503935064188033, + "learning_rate": 2e-06, + "loss": 0.2686, + "step": 8494 + }, + { + "epoch": 1.9707690523141168, + "grad_norm": 7.739563925770556, + "learning_rate": 2e-06, + "loss": 0.1703, + "step": 8495 + }, + { + "epoch": 1.9710010439624175, + "grad_norm": 10.551688065569241, + "learning_rate": 2e-06, + "loss": 0.3515, + "step": 8496 + }, + { + "epoch": 1.9712330356107182, + "grad_norm": 9.902050304949746, + "learning_rate": 2e-06, + "loss": 0.2068, + "step": 8497 + }, + { + "epoch": 1.9714650272590186, + "grad_norm": 6.983275072558769, + "learning_rate": 2e-06, + "loss": 0.1782, + "step": 8498 + }, + { + "epoch": 1.9716970189073193, + "grad_norm": 13.785828158122156, + "learning_rate": 2e-06, + "loss": 0.3464, + "step": 8499 + }, + { + "epoch": 1.97192901055562, + "grad_norm": 14.076067112016535, + "learning_rate": 2e-06, + "loss": 0.3078, + "step": 8500 + }, + { + "epoch": 1.9721610022039207, + "grad_norm": 12.483396942471327, + "learning_rate": 2e-06, + "loss": 0.2082, + "step": 8501 + }, + { + "epoch": 1.9723929938522213, + "grad_norm": 12.186631988394545, + "learning_rate": 2e-06, + "loss": 0.2424, + "step": 8502 + }, + { + "epoch": 1.972624985500522, + "grad_norm": 9.102218431247547, + "learning_rate": 2e-06, + "loss": 0.3024, + "step": 8503 + }, + { + "epoch": 1.9728569771488227, + "grad_norm": 10.886415063131288, + "learning_rate": 2e-06, + "loss": 0.2534, + "step": 8504 + }, + { + "epoch": 1.9730889687971231, + "grad_norm": 12.836982023025246, + "learning_rate": 2e-06, + "loss": 0.2293, + "step": 8505 + }, + { + "epoch": 1.9733209604454238, + "grad_norm": 7.1802546184489255, + "learning_rate": 2e-06, + "loss": 0.1422, + "step": 8506 + }, + { + "epoch": 1.9735529520937245, + "grad_norm": 10.10370150232109, + "learning_rate": 2e-06, + "loss": 0.2613, + "step": 8507 + }, + { + "epoch": 1.9737849437420252, + "grad_norm": 12.471189951292699, + "learning_rate": 2e-06, + "loss": 0.2318, + "step": 8508 + }, + { + "epoch": 1.9740169353903259, + "grad_norm": 21.52531851333419, + "learning_rate": 2e-06, + "loss": 0.3975, + "step": 8509 + }, + { + "epoch": 1.9742489270386265, + "grad_norm": 9.963227113698434, + "learning_rate": 2e-06, + "loss": 0.2103, + "step": 8510 + }, + { + "epoch": 1.9744809186869272, + "grad_norm": 11.232264002401344, + "learning_rate": 2e-06, + "loss": 0.2087, + "step": 8511 + }, + { + "epoch": 1.9747129103352279, + "grad_norm": 16.55607130186788, + "learning_rate": 2e-06, + "loss": 0.3419, + "step": 8512 + }, + { + "epoch": 1.9749449019835286, + "grad_norm": 8.849922497856467, + "learning_rate": 2e-06, + "loss": 0.224, + "step": 8513 + }, + { + "epoch": 1.9751768936318292, + "grad_norm": 21.600280522778597, + "learning_rate": 2e-06, + "loss": 0.2065, + "step": 8514 + }, + { + "epoch": 1.97540888528013, + "grad_norm": 19.806163133323412, + "learning_rate": 2e-06, + "loss": 0.3007, + "step": 8515 + }, + { + "epoch": 1.9756408769284306, + "grad_norm": 20.337850588002727, + "learning_rate": 2e-06, + "loss": 0.237, + "step": 8516 + }, + { + "epoch": 1.9758728685767313, + "grad_norm": 8.776756918468648, + "learning_rate": 2e-06, + "loss": 0.1599, + "step": 8517 + }, + { + "epoch": 1.976104860225032, + "grad_norm": 11.533070577796943, + "learning_rate": 2e-06, + "loss": 0.1877, + "step": 8518 + }, + { + "epoch": 1.9763368518733326, + "grad_norm": 12.84657385589497, + "learning_rate": 2e-06, + "loss": 0.2746, + "step": 8519 + }, + { + "epoch": 1.9765688435216333, + "grad_norm": 16.518796669554234, + "learning_rate": 2e-06, + "loss": 0.295, + "step": 8520 + }, + { + "epoch": 1.976800835169934, + "grad_norm": 16.963351981037178, + "learning_rate": 2e-06, + "loss": 0.2811, + "step": 8521 + }, + { + "epoch": 1.9770328268182347, + "grad_norm": 4.346222300206503, + "learning_rate": 2e-06, + "loss": 0.1426, + "step": 8522 + }, + { + "epoch": 1.9772648184665353, + "grad_norm": 11.419250342857609, + "learning_rate": 2e-06, + "loss": 0.2009, + "step": 8523 + }, + { + "epoch": 1.977496810114836, + "grad_norm": 18.097650604177662, + "learning_rate": 2e-06, + "loss": 0.3862, + "step": 8524 + }, + { + "epoch": 1.9777288017631365, + "grad_norm": 20.249542109447514, + "learning_rate": 2e-06, + "loss": 0.3922, + "step": 8525 + }, + { + "epoch": 1.9779607934114372, + "grad_norm": 13.871707038549962, + "learning_rate": 2e-06, + "loss": 0.1997, + "step": 8526 + }, + { + "epoch": 1.9781927850597378, + "grad_norm": 13.029721724629182, + "learning_rate": 2e-06, + "loss": 0.1359, + "step": 8527 + }, + { + "epoch": 1.9784247767080385, + "grad_norm": 8.589532756585148, + "learning_rate": 2e-06, + "loss": 0.203, + "step": 8528 + }, + { + "epoch": 1.9786567683563392, + "grad_norm": 19.65133151674858, + "learning_rate": 2e-06, + "loss": 0.3163, + "step": 8529 + }, + { + "epoch": 1.9788887600046399, + "grad_norm": 10.528372449802873, + "learning_rate": 2e-06, + "loss": 0.2023, + "step": 8530 + }, + { + "epoch": 1.9791207516529405, + "grad_norm": 13.753512180806643, + "learning_rate": 2e-06, + "loss": 0.2618, + "step": 8531 + }, + { + "epoch": 1.979352743301241, + "grad_norm": 10.418585440923946, + "learning_rate": 2e-06, + "loss": 0.2562, + "step": 8532 + }, + { + "epoch": 1.9795847349495417, + "grad_norm": 9.050020397117102, + "learning_rate": 2e-06, + "loss": 0.2337, + "step": 8533 + }, + { + "epoch": 1.9798167265978424, + "grad_norm": 10.006801325208919, + "learning_rate": 2e-06, + "loss": 0.2385, + "step": 8534 + }, + { + "epoch": 1.980048718246143, + "grad_norm": 11.336402703687698, + "learning_rate": 2e-06, + "loss": 0.205, + "step": 8535 + }, + { + "epoch": 1.9802807098944437, + "grad_norm": 12.825693375037904, + "learning_rate": 2e-06, + "loss": 0.2171, + "step": 8536 + }, + { + "epoch": 1.9805127015427444, + "grad_norm": 5.524550556954387, + "learning_rate": 2e-06, + "loss": 0.1233, + "step": 8537 + }, + { + "epoch": 1.980744693191045, + "grad_norm": 8.083841633609582, + "learning_rate": 2e-06, + "loss": 0.1239, + "step": 8538 + }, + { + "epoch": 1.9809766848393457, + "grad_norm": 11.573832770938761, + "learning_rate": 2e-06, + "loss": 0.2471, + "step": 8539 + }, + { + "epoch": 1.9812086764876464, + "grad_norm": 7.392452023260391, + "learning_rate": 2e-06, + "loss": 0.1823, + "step": 8540 + }, + { + "epoch": 1.981440668135947, + "grad_norm": 21.98841567744905, + "learning_rate": 2e-06, + "loss": 0.3394, + "step": 8541 + }, + { + "epoch": 1.9816726597842478, + "grad_norm": 6.8136512886727445, + "learning_rate": 2e-06, + "loss": 0.1468, + "step": 8542 + }, + { + "epoch": 1.9819046514325485, + "grad_norm": 19.584918378180546, + "learning_rate": 2e-06, + "loss": 0.247, + "step": 8543 + }, + { + "epoch": 1.9821366430808491, + "grad_norm": 13.017497347406255, + "learning_rate": 2e-06, + "loss": 0.2514, + "step": 8544 + }, + { + "epoch": 1.9823686347291498, + "grad_norm": 16.294108863284833, + "learning_rate": 2e-06, + "loss": 0.2779, + "step": 8545 + }, + { + "epoch": 1.9826006263774505, + "grad_norm": 8.147068821027043, + "learning_rate": 2e-06, + "loss": 0.1924, + "step": 8546 + }, + { + "epoch": 1.9828326180257512, + "grad_norm": 9.830621743145237, + "learning_rate": 2e-06, + "loss": 0.24, + "step": 8547 + }, + { + "epoch": 1.9830646096740518, + "grad_norm": 22.846163306008492, + "learning_rate": 2e-06, + "loss": 0.3014, + "step": 8548 + }, + { + "epoch": 1.9832966013223525, + "grad_norm": 18.613538314175056, + "learning_rate": 2e-06, + "loss": 0.4346, + "step": 8549 + }, + { + "epoch": 1.9835285929706532, + "grad_norm": 13.590506060031638, + "learning_rate": 2e-06, + "loss": 0.2421, + "step": 8550 + }, + { + "epoch": 1.9837605846189539, + "grad_norm": 13.04190187877181, + "learning_rate": 2e-06, + "loss": 0.2047, + "step": 8551 + }, + { + "epoch": 1.9839925762672543, + "grad_norm": 10.602516907217499, + "learning_rate": 2e-06, + "loss": 0.2152, + "step": 8552 + }, + { + "epoch": 1.984224567915555, + "grad_norm": 12.780620813300855, + "learning_rate": 2e-06, + "loss": 0.2415, + "step": 8553 + }, + { + "epoch": 1.9844565595638557, + "grad_norm": 12.627158631424301, + "learning_rate": 2e-06, + "loss": 0.1876, + "step": 8554 + }, + { + "epoch": 1.9846885512121564, + "grad_norm": 19.562289971581983, + "learning_rate": 2e-06, + "loss": 0.2719, + "step": 8555 + }, + { + "epoch": 1.984920542860457, + "grad_norm": 19.225652733982006, + "learning_rate": 2e-06, + "loss": 0.3401, + "step": 8556 + }, + { + "epoch": 1.9851525345087577, + "grad_norm": 10.428271595257316, + "learning_rate": 2e-06, + "loss": 0.2353, + "step": 8557 + }, + { + "epoch": 1.9853845261570582, + "grad_norm": 9.14187799266162, + "learning_rate": 2e-06, + "loss": 0.2572, + "step": 8558 + }, + { + "epoch": 1.9856165178053589, + "grad_norm": 12.648061748666258, + "learning_rate": 2e-06, + "loss": 0.2418, + "step": 8559 + }, + { + "epoch": 1.9858485094536595, + "grad_norm": 10.629099645791218, + "learning_rate": 2e-06, + "loss": 0.235, + "step": 8560 + }, + { + "epoch": 1.9860805011019602, + "grad_norm": 4.342336370217446, + "learning_rate": 2e-06, + "loss": 0.1194, + "step": 8561 + }, + { + "epoch": 1.986312492750261, + "grad_norm": 16.143025239136698, + "learning_rate": 2e-06, + "loss": 0.2595, + "step": 8562 + }, + { + "epoch": 1.9865444843985616, + "grad_norm": 9.754089385774627, + "learning_rate": 2e-06, + "loss": 0.2004, + "step": 8563 + }, + { + "epoch": 1.9867764760468622, + "grad_norm": 15.841521291649542, + "learning_rate": 2e-06, + "loss": 0.2853, + "step": 8564 + }, + { + "epoch": 1.987008467695163, + "grad_norm": 14.760371139310362, + "learning_rate": 2e-06, + "loss": 0.2199, + "step": 8565 + }, + { + "epoch": 1.9872404593434636, + "grad_norm": 10.32037228954311, + "learning_rate": 2e-06, + "loss": 0.2619, + "step": 8566 + }, + { + "epoch": 1.9874724509917643, + "grad_norm": 11.920341608033274, + "learning_rate": 2e-06, + "loss": 0.1909, + "step": 8567 + }, + { + "epoch": 1.987704442640065, + "grad_norm": 11.357525599468852, + "learning_rate": 2e-06, + "loss": 0.172, + "step": 8568 + }, + { + "epoch": 1.9879364342883656, + "grad_norm": 12.124003669396531, + "learning_rate": 2e-06, + "loss": 0.229, + "step": 8569 + }, + { + "epoch": 1.9881684259366663, + "grad_norm": 19.62315796808522, + "learning_rate": 2e-06, + "loss": 0.2791, + "step": 8570 + }, + { + "epoch": 1.988400417584967, + "grad_norm": 11.750587502060243, + "learning_rate": 2e-06, + "loss": 0.2483, + "step": 8571 + }, + { + "epoch": 1.9886324092332677, + "grad_norm": 15.404513427589219, + "learning_rate": 2e-06, + "loss": 0.2913, + "step": 8572 + }, + { + "epoch": 1.9888644008815684, + "grad_norm": 5.384890915201693, + "learning_rate": 2e-06, + "loss": 0.149, + "step": 8573 + }, + { + "epoch": 1.989096392529869, + "grad_norm": 11.17698439178024, + "learning_rate": 2e-06, + "loss": 0.236, + "step": 8574 + }, + { + "epoch": 1.9893283841781697, + "grad_norm": 25.955564600183294, + "learning_rate": 2e-06, + "loss": 0.3429, + "step": 8575 + }, + { + "epoch": 1.9895603758264704, + "grad_norm": 9.573922131436106, + "learning_rate": 2e-06, + "loss": 0.2218, + "step": 8576 + }, + { + "epoch": 1.989792367474771, + "grad_norm": 10.128885271798515, + "learning_rate": 2e-06, + "loss": 0.2154, + "step": 8577 + }, + { + "epoch": 1.9900243591230715, + "grad_norm": 12.827613625373983, + "learning_rate": 2e-06, + "loss": 0.2551, + "step": 8578 + }, + { + "epoch": 1.9902563507713722, + "grad_norm": 12.181597847453952, + "learning_rate": 2e-06, + "loss": 0.2036, + "step": 8579 + }, + { + "epoch": 1.9904883424196729, + "grad_norm": 11.204479881120816, + "learning_rate": 2e-06, + "loss": 0.2682, + "step": 8580 + }, + { + "epoch": 1.9907203340679736, + "grad_norm": 14.7149776713658, + "learning_rate": 2e-06, + "loss": 0.1934, + "step": 8581 + }, + { + "epoch": 1.9909523257162742, + "grad_norm": 14.559952872618316, + "learning_rate": 2e-06, + "loss": 0.3198, + "step": 8582 + }, + { + "epoch": 1.991184317364575, + "grad_norm": 17.65883394269422, + "learning_rate": 2e-06, + "loss": 0.2938, + "step": 8583 + }, + { + "epoch": 1.9914163090128756, + "grad_norm": 11.148759808439678, + "learning_rate": 2e-06, + "loss": 0.2647, + "step": 8584 + }, + { + "epoch": 1.991648300661176, + "grad_norm": 11.688280097465956, + "learning_rate": 2e-06, + "loss": 0.2514, + "step": 8585 + }, + { + "epoch": 1.9918802923094767, + "grad_norm": 12.486888404537797, + "learning_rate": 2e-06, + "loss": 0.2763, + "step": 8586 + }, + { + "epoch": 1.9921122839577774, + "grad_norm": 11.592115694074757, + "learning_rate": 2e-06, + "loss": 0.2743, + "step": 8587 + }, + { + "epoch": 1.992344275606078, + "grad_norm": 11.680842392664022, + "learning_rate": 2e-06, + "loss": 0.2208, + "step": 8588 + }, + { + "epoch": 1.9925762672543788, + "grad_norm": 18.036521977119452, + "learning_rate": 2e-06, + "loss": 0.3916, + "step": 8589 + }, + { + "epoch": 1.9928082589026794, + "grad_norm": 8.003475013553198, + "learning_rate": 2e-06, + "loss": 0.1752, + "step": 8590 + }, + { + "epoch": 1.99304025055098, + "grad_norm": 13.081533947310318, + "learning_rate": 2e-06, + "loss": 0.2328, + "step": 8591 + }, + { + "epoch": 1.9932722421992808, + "grad_norm": 12.915462699759964, + "learning_rate": 2e-06, + "loss": 0.2749, + "step": 8592 + }, + { + "epoch": 1.9935042338475815, + "grad_norm": 10.856135360597765, + "learning_rate": 2e-06, + "loss": 0.376, + "step": 8593 + }, + { + "epoch": 1.9937362254958821, + "grad_norm": 7.591018863066772, + "learning_rate": 2e-06, + "loss": 0.1334, + "step": 8594 + }, + { + "epoch": 1.9939682171441828, + "grad_norm": 16.73779734886259, + "learning_rate": 2e-06, + "loss": 0.3323, + "step": 8595 + }, + { + "epoch": 1.9942002087924835, + "grad_norm": 17.095465102448927, + "learning_rate": 2e-06, + "loss": 0.2698, + "step": 8596 + }, + { + "epoch": 1.9944322004407842, + "grad_norm": 16.201500392677026, + "learning_rate": 2e-06, + "loss": 0.3091, + "step": 8597 + }, + { + "epoch": 1.9946641920890849, + "grad_norm": 10.38865379973004, + "learning_rate": 2e-06, + "loss": 0.2829, + "step": 8598 + }, + { + "epoch": 1.9948961837373855, + "grad_norm": 9.53782022055871, + "learning_rate": 2e-06, + "loss": 0.2776, + "step": 8599 + }, + { + "epoch": 1.9951281753856862, + "grad_norm": 8.811020125502003, + "learning_rate": 2e-06, + "loss": 0.1423, + "step": 8600 + }, + { + "epoch": 1.9953601670339869, + "grad_norm": 15.270821714163787, + "learning_rate": 2e-06, + "loss": 0.2136, + "step": 8601 + }, + { + "epoch": 1.9955921586822876, + "grad_norm": 18.901918550352107, + "learning_rate": 2e-06, + "loss": 0.3078, + "step": 8602 + }, + { + "epoch": 1.9958241503305882, + "grad_norm": 9.205250195423652, + "learning_rate": 2e-06, + "loss": 0.1244, + "step": 8603 + }, + { + "epoch": 1.996056141978889, + "grad_norm": 12.846303081127727, + "learning_rate": 2e-06, + "loss": 0.2795, + "step": 8604 + }, + { + "epoch": 1.9962881336271894, + "grad_norm": 15.88210580620789, + "learning_rate": 2e-06, + "loss": 0.2166, + "step": 8605 + }, + { + "epoch": 1.99652012527549, + "grad_norm": 10.421273958167372, + "learning_rate": 2e-06, + "loss": 0.2085, + "step": 8606 + }, + { + "epoch": 1.9967521169237907, + "grad_norm": 11.151099582393117, + "learning_rate": 2e-06, + "loss": 0.3363, + "step": 8607 + }, + { + "epoch": 1.9969841085720914, + "grad_norm": 14.37049931427536, + "learning_rate": 2e-06, + "loss": 0.2388, + "step": 8608 + }, + { + "epoch": 1.997216100220392, + "grad_norm": 14.278416156594247, + "learning_rate": 2e-06, + "loss": 0.1511, + "step": 8609 + }, + { + "epoch": 1.9974480918686928, + "grad_norm": 12.665305808679461, + "learning_rate": 2e-06, + "loss": 0.1878, + "step": 8610 + }, + { + "epoch": 1.9976800835169934, + "grad_norm": 15.377701007457812, + "learning_rate": 2e-06, + "loss": 0.3327, + "step": 8611 + }, + { + "epoch": 1.997912075165294, + "grad_norm": 8.160117309260238, + "learning_rate": 2e-06, + "loss": 0.1728, + "step": 8612 + }, + { + "epoch": 1.9981440668135946, + "grad_norm": 10.501757269939331, + "learning_rate": 2e-06, + "loss": 0.218, + "step": 8613 + }, + { + "epoch": 1.9983760584618953, + "grad_norm": 12.866025684053145, + "learning_rate": 2e-06, + "loss": 0.3282, + "step": 8614 + }, + { + "epoch": 1.998608050110196, + "grad_norm": 13.778437356571668, + "learning_rate": 2e-06, + "loss": 0.2975, + "step": 8615 + }, + { + "epoch": 1.9988400417584966, + "grad_norm": 13.683219081874212, + "learning_rate": 2e-06, + "loss": 0.2543, + "step": 8616 + }, + { + "epoch": 1.9990720334067973, + "grad_norm": 7.564723850571399, + "learning_rate": 2e-06, + "loss": 0.1385, + "step": 8617 + }, + { + "epoch": 1.999304025055098, + "grad_norm": 11.625230524676542, + "learning_rate": 2e-06, + "loss": 0.3084, + "step": 8618 + }, + { + "epoch": 1.9995360167033986, + "grad_norm": 17.15768884295696, + "learning_rate": 2e-06, + "loss": 0.3042, + "step": 8619 + }, + { + "epoch": 1.9997680083516993, + "grad_norm": 9.186683071124046, + "learning_rate": 2e-06, + "loss": 0.1746, + "step": 8620 + }, + { + "epoch": 1.9997680083516993, + "step": 8620, + "total_flos": 2997308663627776.0, + "train_loss": 0.283832880054231, + "train_runtime": 20710.279, + "train_samples_per_second": 6.66, + "train_steps_per_second": 0.416 + } + ], + "logging_steps": 1.0, + "max_steps": 8620, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2997308663627776.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}