diff --git "a/training_logs.json" "b/training_logs.json" new file mode 100644--- /dev/null +++ "b/training_logs.json" @@ -0,0 +1,6861 @@ +[ + { + "loss": 29.9707, + "grad_norm": 0.4777052700519562, + "learning_rate": 0.0009991248796709547, + "epoch": 0.0 + }, + { + "loss": 22.6857, + "grad_norm": 0.7528864741325378, + "learning_rate": 0.0009982497593419095, + "epoch": 0.01 + }, + { + "loss": 23.3032, + "grad_norm": 0.2558889389038086, + "learning_rate": 0.0009973746390128642, + "epoch": 0.01 + }, + { + "loss": 22.7608, + "grad_norm": 0.16549238562583923, + "learning_rate": 0.000996499518683819, + "epoch": 0.01 + }, + { + "loss": 21.8524, + "grad_norm": 0.969261109828949, + "learning_rate": 0.0009956243983547737, + "epoch": 0.01 + }, + { + "loss": 20.1216, + "grad_norm": 1.4401915073394775, + "learning_rate": 0.0009947492780257286, + "epoch": 0.02 + }, + { + "loss": 18.8698, + "grad_norm": 1.2519457340240479, + "learning_rate": 0.0009938741576966832, + "epoch": 0.02 + }, + { + "loss": 17.8051, + "grad_norm": 0.6829971075057983, + "learning_rate": 0.0009929990373676381, + "epoch": 0.02 + }, + { + "loss": 16.7436, + "grad_norm": 0.8918408155441284, + "learning_rate": 0.0009921239170385928, + "epoch": 0.02 + }, + { + "loss": 16.3997, + "grad_norm": 1.1997641324996948, + "learning_rate": 0.0009912487967095476, + "epoch": 0.03 + }, + { + "loss": 16.6555, + "grad_norm": 1.0908863544464111, + "learning_rate": 0.0009903736763805023, + "epoch": 0.03 + }, + { + "loss": 16.168, + "grad_norm": 0.8117638230323792, + "learning_rate": 0.0009894985560514572, + "epoch": 0.03 + }, + { + "loss": 16.4056, + "grad_norm": 0.7367461919784546, + "learning_rate": 0.0009886234357224118, + "epoch": 0.03 + }, + { + "loss": 16.0576, + "grad_norm": 0.827192485332489, + "learning_rate": 0.0009877483153933667, + "epoch": 0.04 + }, + { + "loss": 14.8864, + "grad_norm": 0.6644204258918762, + "learning_rate": 0.0009868731950643213, + "epoch": 0.04 + }, + { + "loss": 15.1702, + "grad_norm": 0.6314308047294617, + "learning_rate": 0.0009859980747352762, + "epoch": 0.04 + }, + { + "loss": 14.8873, + "grad_norm": 0.4996398091316223, + "learning_rate": 0.0009851229544062309, + "epoch": 0.04 + }, + { + "loss": 14.704, + "grad_norm": 0.6396967768669128, + "learning_rate": 0.0009842478340771857, + "epoch": 0.05 + }, + { + "loss": 14.8636, + "grad_norm": 0.5319499373435974, + "learning_rate": 0.0009833727137481404, + "epoch": 0.05 + }, + { + "loss": 14.7236, + "grad_norm": 1.1328645944595337, + "learning_rate": 0.0009824975934190953, + "epoch": 0.05 + }, + { + "loss": 14.2966, + "grad_norm": 0.5435690879821777, + "learning_rate": 0.00098162247309005, + "epoch": 0.06 + }, + { + "loss": 14.8865, + "grad_norm": 0.5260070562362671, + "learning_rate": 0.0009807473527610048, + "epoch": 0.06 + }, + { + "loss": 14.1635, + "grad_norm": 0.5490550994873047, + "learning_rate": 0.0009798722324319594, + "epoch": 0.06 + }, + { + "loss": 14.1756, + "grad_norm": 0.5580148696899414, + "learning_rate": 0.0009789971121029143, + "epoch": 0.06 + }, + { + "loss": 13.3026, + "grad_norm": 0.4862927198410034, + "learning_rate": 0.000978121991773869, + "epoch": 0.07 + }, + { + "loss": 13.9938, + "grad_norm": 0.4365651607513428, + "learning_rate": 0.0009772468714448236, + "epoch": 0.07 + }, + { + "loss": 13.7628, + "grad_norm": 0.5206578373908997, + "learning_rate": 0.0009763717511157785, + "epoch": 0.07 + }, + { + "loss": 13.2932, + "grad_norm": 0.4493275582790375, + "learning_rate": 0.0009754966307867332, + "epoch": 0.07 + }, + { + "loss": 13.4192, + "grad_norm": 0.5717960596084595, + "learning_rate": 0.000974621510457688, + "epoch": 0.08 + }, + { + "loss": 13.2883, + "grad_norm": 0.48513928055763245, + "learning_rate": 0.0009737463901286428, + "epoch": 0.08 + }, + { + "loss": 13.8283, + "grad_norm": 0.7734763622283936, + "learning_rate": 0.0009728712697995975, + "epoch": 0.08 + }, + { + "loss": 12.4766, + "grad_norm": 0.45278435945510864, + "learning_rate": 0.0009719961494705523, + "epoch": 0.08 + }, + { + "loss": 13.41, + "grad_norm": 0.5911663174629211, + "learning_rate": 0.000971121029141507, + "epoch": 0.09 + }, + { + "loss": 12.4475, + "grad_norm": 0.5840547680854797, + "learning_rate": 0.0009702459088124618, + "epoch": 0.09 + }, + { + "loss": 13.9417, + "grad_norm": 0.7008219957351685, + "learning_rate": 0.0009693707884834166, + "epoch": 0.09 + }, + { + "loss": 12.9164, + "grad_norm": 0.7021568417549133, + "learning_rate": 0.0009684956681543713, + "epoch": 0.09 + }, + { + "loss": 12.9824, + "grad_norm": 0.5466001629829407, + "learning_rate": 0.0009676205478253261, + "epoch": 0.1 + }, + { + "loss": 12.5413, + "grad_norm": 0.6215840578079224, + "learning_rate": 0.0009667454274962808, + "epoch": 0.1 + }, + { + "loss": 12.7753, + "grad_norm": 1.5948784351348877, + "learning_rate": 0.0009658703071672355, + "epoch": 0.1 + }, + { + "loss": 12.5837, + "grad_norm": 1.1063404083251953, + "learning_rate": 0.0009649951868381903, + "epoch": 0.1 + }, + { + "loss": 12.7079, + "grad_norm": 0.7521733045578003, + "learning_rate": 0.000964120066509145, + "epoch": 0.11 + }, + { + "loss": 12.1584, + "grad_norm": 0.7596040964126587, + "learning_rate": 0.0009632449461800998, + "epoch": 0.11 + }, + { + "loss": 12.6058, + "grad_norm": 1.1221098899841309, + "learning_rate": 0.0009623698258510546, + "epoch": 0.11 + }, + { + "loss": 12.7003, + "grad_norm": 0.795098602771759, + "learning_rate": 0.0009614947055220093, + "epoch": 0.12 + }, + { + "loss": 12.4519, + "grad_norm": 0.481406569480896, + "learning_rate": 0.0009606195851929641, + "epoch": 0.12 + }, + { + "loss": 12.8483, + "grad_norm": 0.6707068681716919, + "learning_rate": 0.0009597444648639187, + "epoch": 0.12 + }, + { + "loss": 12.593, + "grad_norm": 0.6381434798240662, + "learning_rate": 0.0009588693445348735, + "epoch": 0.12 + }, + { + "loss": 12.1654, + "grad_norm": 0.7791229486465454, + "learning_rate": 0.0009579942242058283, + "epoch": 0.13 + }, + { + "loss": 11.8089, + "grad_norm": 0.8445360660552979, + "learning_rate": 0.000957119103876783, + "epoch": 0.13 + }, + { + "loss": 12.232, + "grad_norm": 0.6427455544471741, + "learning_rate": 0.0009562439835477378, + "epoch": 0.13 + }, + { + "loss": 12.9625, + "grad_norm": 0.5700855255126953, + "learning_rate": 0.0009553688632186925, + "epoch": 0.13 + }, + { + "loss": 12.2628, + "grad_norm": 0.8731588125228882, + "learning_rate": 0.0009544937428896473, + "epoch": 0.14 + }, + { + "loss": 12.0435, + "grad_norm": 0.869883120059967, + "learning_rate": 0.0009536186225606021, + "epoch": 0.14 + }, + { + "loss": 12.2867, + "grad_norm": 0.8802808523178101, + "learning_rate": 0.0009527435022315568, + "epoch": 0.14 + }, + { + "loss": 11.9397, + "grad_norm": 1.0076773166656494, + "learning_rate": 0.0009518683819025116, + "epoch": 0.14 + }, + { + "loss": 11.8392, + "grad_norm": 0.5855250954627991, + "learning_rate": 0.0009509932615734664, + "epoch": 0.15 + }, + { + "loss": 11.3847, + "grad_norm": 0.5606763958930969, + "learning_rate": 0.0009501181412444211, + "epoch": 0.15 + }, + { + "loss": 12.2154, + "grad_norm": 1.1014057397842407, + "learning_rate": 0.0009492430209153759, + "epoch": 0.15 + }, + { + "loss": 11.6247, + "grad_norm": 0.6524838805198669, + "learning_rate": 0.0009483679005863306, + "epoch": 0.15 + }, + { + "loss": 11.5115, + "grad_norm": 1.0140221118927002, + "learning_rate": 0.0009474927802572854, + "epoch": 0.16 + }, + { + "loss": 12.1707, + "grad_norm": 1.4689868688583374, + "learning_rate": 0.0009466176599282402, + "epoch": 0.16 + }, + { + "loss": 11.6165, + "grad_norm": 0.8136260509490967, + "learning_rate": 0.0009457425395991948, + "epoch": 0.16 + }, + { + "loss": 11.8841, + "grad_norm": 2.0376949310302734, + "learning_rate": 0.0009448674192701496, + "epoch": 0.17 + }, + { + "loss": 11.2108, + "grad_norm": 1.1647133827209473, + "learning_rate": 0.0009439922989411043, + "epoch": 0.17 + }, + { + "loss": 11.9281, + "grad_norm": 0.8479063510894775, + "learning_rate": 0.0009431171786120591, + "epoch": 0.17 + }, + { + "loss": 11.0593, + "grad_norm": 0.8340569138526917, + "learning_rate": 0.0009422420582830139, + "epoch": 0.17 + }, + { + "loss": 11.5591, + "grad_norm": 0.9813485145568848, + "learning_rate": 0.0009413669379539686, + "epoch": 0.18 + }, + { + "loss": 11.1773, + "grad_norm": 0.9088229537010193, + "learning_rate": 0.0009404918176249234, + "epoch": 0.18 + }, + { + "loss": 11.6913, + "grad_norm": 0.860917866230011, + "learning_rate": 0.0009396166972958782, + "epoch": 0.18 + }, + { + "loss": 12.3707, + "grad_norm": 0.7795988321304321, + "learning_rate": 0.0009387415769668329, + "epoch": 0.18 + }, + { + "loss": 11.6669, + "grad_norm": 0.914884626865387, + "learning_rate": 0.0009378664566377877, + "epoch": 0.19 + }, + { + "loss": 11.6139, + "grad_norm": 1.7863789796829224, + "learning_rate": 0.0009369913363087424, + "epoch": 0.19 + }, + { + "loss": 11.1885, + "grad_norm": 0.7225568294525146, + "learning_rate": 0.0009361162159796972, + "epoch": 0.19 + }, + { + "loss": 11.7488, + "grad_norm": 0.9028294682502747, + "learning_rate": 0.000935241095650652, + "epoch": 0.19 + }, + { + "loss": 11.227, + "grad_norm": 1.0842101573944092, + "learning_rate": 0.0009343659753216067, + "epoch": 0.2 + }, + { + "loss": 11.4022, + "grad_norm": 0.7042496800422668, + "learning_rate": 0.0009334908549925615, + "epoch": 0.2 + }, + { + "loss": 11.006, + "grad_norm": 0.8355586528778076, + "learning_rate": 0.0009326157346635162, + "epoch": 0.2 + }, + { + "loss": 11.0561, + "grad_norm": 0.9001519083976746, + "learning_rate": 0.000931740614334471, + "epoch": 0.2 + }, + { + "loss": 11.357, + "grad_norm": 0.8695396184921265, + "learning_rate": 0.0009308654940054258, + "epoch": 0.21 + }, + { + "loss": 10.7003, + "grad_norm": 0.8076105117797852, + "learning_rate": 0.0009299903736763805, + "epoch": 0.21 + }, + { + "loss": 11.2661, + "grad_norm": 0.9677106142044067, + "learning_rate": 0.0009291152533473353, + "epoch": 0.21 + }, + { + "loss": 10.8957, + "grad_norm": 0.8753145337104797, + "learning_rate": 0.0009282401330182901, + "epoch": 0.22 + }, + { + "loss": 11.2854, + "grad_norm": 0.7343422770500183, + "learning_rate": 0.0009273650126892448, + "epoch": 0.22 + }, + { + "loss": 10.8205, + "grad_norm": 0.9795741438865662, + "learning_rate": 0.0009264898923601996, + "epoch": 0.22 + }, + { + "loss": 10.6805, + "grad_norm": 0.9723809957504272, + "learning_rate": 0.0009256147720311543, + "epoch": 0.22 + }, + { + "loss": 10.7639, + "grad_norm": 0.6675435900688171, + "learning_rate": 0.0009247396517021091, + "epoch": 0.23 + }, + { + "loss": 11.1119, + "grad_norm": 0.9673445224761963, + "learning_rate": 0.0009238645313730638, + "epoch": 0.23 + }, + { + "loss": 11.293, + "grad_norm": 0.9545767307281494, + "learning_rate": 0.0009229894110440185, + "epoch": 0.23 + }, + { + "loss": 11.4529, + "grad_norm": 0.8443020582199097, + "learning_rate": 0.0009221142907149733, + "epoch": 0.23 + }, + { + "loss": 10.402, + "grad_norm": 0.9980494976043701, + "learning_rate": 0.000921239170385928, + "epoch": 0.24 + }, + { + "loss": 10.8417, + "grad_norm": 1.2651828527450562, + "learning_rate": 0.0009203640500568828, + "epoch": 0.24 + }, + { + "loss": 10.9627, + "grad_norm": 0.7320075035095215, + "learning_rate": 0.0009194889297278376, + "epoch": 0.24 + }, + { + "loss": 10.1427, + "grad_norm": 1.5249311923980713, + "learning_rate": 0.0009186138093987923, + "epoch": 0.24 + }, + { + "loss": 11.0647, + "grad_norm": 0.8371347188949585, + "learning_rate": 0.0009177386890697471, + "epoch": 0.25 + }, + { + "loss": 10.7984, + "grad_norm": 1.0522745847702026, + "learning_rate": 0.0009168635687407019, + "epoch": 0.25 + }, + { + "loss": 10.0289, + "grad_norm": 0.9992939829826355, + "learning_rate": 0.0009159884484116566, + "epoch": 0.25 + }, + { + "loss": 10.6594, + "grad_norm": 1.6465744972229004, + "learning_rate": 0.0009151133280826114, + "epoch": 0.25 + }, + { + "loss": 10.7898, + "grad_norm": 0.8755474090576172, + "learning_rate": 0.0009142382077535661, + "epoch": 0.26 + }, + { + "loss": 10.8566, + "grad_norm": 0.9154648780822754, + "learning_rate": 0.0009133630874245209, + "epoch": 0.26 + }, + { + "loss": 10.3388, + "grad_norm": 0.9557958245277405, + "learning_rate": 0.0009124879670954757, + "epoch": 0.26 + }, + { + "loss": 11.0761, + "grad_norm": 0.9756875038146973, + "learning_rate": 0.0009116128467664304, + "epoch": 0.27 + }, + { + "loss": 10.6927, + "grad_norm": 0.9137876033782959, + "learning_rate": 0.0009107377264373852, + "epoch": 0.27 + }, + { + "loss": 10.4956, + "grad_norm": 1.2811295986175537, + "learning_rate": 0.00090986260610834, + "epoch": 0.27 + }, + { + "loss": 11.13, + "grad_norm": 1.574196696281433, + "learning_rate": 0.0009090749978121991, + "epoch": 0.27 + }, + { + "loss": 10.4299, + "grad_norm": 1.120239019393921, + "learning_rate": 0.0009082873895160585, + "epoch": 0.28 + }, + { + "loss": 10.9432, + "grad_norm": 4.42399263381958, + "learning_rate": 0.0009074122691870133, + "epoch": 0.28 + }, + { + "loss": 10.6758, + "grad_norm": 1.1292444467544556, + "learning_rate": 0.000906537148857968, + "epoch": 0.28 + }, + { + "loss": 9.9808, + "grad_norm": 1.36553156375885, + "learning_rate": 0.0009056620285289227, + "epoch": 0.28 + }, + { + "loss": 10.4376, + "grad_norm": 1.4920979738235474, + "learning_rate": 0.0009047869081998775, + "epoch": 0.29 + }, + { + "loss": 11.5319, + "grad_norm": 1.142583966255188, + "learning_rate": 0.0009039117878708322, + "epoch": 0.29 + }, + { + "loss": 10.8741, + "grad_norm": 1.7269898653030396, + "learning_rate": 0.000903036667541787, + "epoch": 0.29 + }, + { + "loss": 10.6609, + "grad_norm": 1.0620924234390259, + "learning_rate": 0.0009021615472127418, + "epoch": 0.29 + }, + { + "loss": 10.8716, + "grad_norm": 1.0225517749786377, + "learning_rate": 0.0009012864268836965, + "epoch": 0.3 + }, + { + "loss": 10.8629, + "grad_norm": 0.8201847672462463, + "learning_rate": 0.0009004113065546513, + "epoch": 0.3 + }, + { + "loss": 10.2614, + "grad_norm": 0.7885268926620483, + "learning_rate": 0.000899536186225606, + "epoch": 0.3 + }, + { + "loss": 10.1758, + "grad_norm": 0.8671897053718567, + "learning_rate": 0.0008986610658965608, + "epoch": 0.3 + }, + { + "loss": 10.2796, + "grad_norm": 0.8501631617546082, + "learning_rate": 0.0008977859455675156, + "epoch": 0.31 + }, + { + "loss": 10.4376, + "grad_norm": 1.3847661018371582, + "learning_rate": 0.0008969108252384703, + "epoch": 0.31 + }, + { + "loss": 10.6258, + "grad_norm": 1.1267868280410767, + "learning_rate": 0.0008960357049094251, + "epoch": 0.31 + }, + { + "loss": 10.3214, + "grad_norm": 0.9492388963699341, + "learning_rate": 0.0008951605845803799, + "epoch": 0.31 + }, + { + "loss": 10.3126, + "grad_norm": 2.884838819503784, + "learning_rate": 0.0008942854642513346, + "epoch": 0.32 + }, + { + "loss": 9.8104, + "grad_norm": 1.007505178451538, + "learning_rate": 0.0008934103439222894, + "epoch": 0.32 + }, + { + "loss": 10.7341, + "grad_norm": 0.9504636526107788, + "learning_rate": 0.0008925352235932441, + "epoch": 0.32 + }, + { + "loss": 10.3923, + "grad_norm": 1.1075007915496826, + "learning_rate": 0.0008916601032641989, + "epoch": 0.33 + }, + { + "loss": 10.323, + "grad_norm": 1.137343406677246, + "learning_rate": 0.0008907849829351537, + "epoch": 0.33 + }, + { + "loss": 10.2794, + "grad_norm": 0.797771155834198, + "learning_rate": 0.0008899098626061084, + "epoch": 0.33 + }, + { + "loss": 10.6656, + "grad_norm": 1.018343448638916, + "learning_rate": 0.0008890347422770632, + "epoch": 0.33 + }, + { + "loss": 10.2778, + "grad_norm": 1.0548039674758911, + "learning_rate": 0.000888159621948018, + "epoch": 0.34 + }, + { + "loss": 10.1114, + "grad_norm": 3.0174038410186768, + "learning_rate": 0.0008872845016189727, + "epoch": 0.34 + }, + { + "loss": 10.8685, + "grad_norm": 2.50591778755188, + "learning_rate": 0.0008864093812899275, + "epoch": 0.34 + }, + { + "loss": 10.0677, + "grad_norm": 1.2851207256317139, + "learning_rate": 0.0008855342609608822, + "epoch": 0.34 + }, + { + "loss": 10.0311, + "grad_norm": 0.7987344264984131, + "learning_rate": 0.0008846591406318369, + "epoch": 0.35 + }, + { + "loss": 9.7713, + "grad_norm": 1.114479899406433, + "learning_rate": 0.0008837840203027917, + "epoch": 0.35 + }, + { + "loss": 9.9371, + "grad_norm": 1.2233116626739502, + "learning_rate": 0.0008829088999737464, + "epoch": 0.35 + }, + { + "loss": 10.5333, + "grad_norm": 2.0412189960479736, + "learning_rate": 0.0008820337796447012, + "epoch": 0.35 + }, + { + "loss": 10.2577, + "grad_norm": 1.2635306119918823, + "learning_rate": 0.0008811586593156559, + "epoch": 0.36 + }, + { + "loss": 9.8937, + "grad_norm": 12.760872840881348, + "learning_rate": 0.0008802835389866107, + "epoch": 0.36 + }, + { + "loss": 10.6092, + "grad_norm": 1.3580334186553955, + "learning_rate": 0.0008794084186575654, + "epoch": 0.36 + }, + { + "loss": 10.2467, + "grad_norm": 1.250632643699646, + "learning_rate": 0.0008785332983285201, + "epoch": 0.36 + }, + { + "loss": 10.5076, + "grad_norm": 1.458349585533142, + "learning_rate": 0.0008776581779994749, + "epoch": 0.37 + }, + { + "loss": 10.2769, + "grad_norm": 1.9139622449874878, + "learning_rate": 0.0008767830576704296, + "epoch": 0.37 + }, + { + "loss": 10.4452, + "grad_norm": 1.2400761842727661, + "learning_rate": 0.0008759079373413844, + "epoch": 0.37 + }, + { + "loss": 10.01, + "grad_norm": 1.5482594966888428, + "learning_rate": 0.0008750328170123392, + "epoch": 0.38 + }, + { + "loss": 10.2997, + "grad_norm": 1.68232262134552, + "learning_rate": 0.0008741576966832939, + "epoch": 0.38 + }, + { + "loss": 10.0902, + "grad_norm": 1.206350564956665, + "learning_rate": 0.0008732825763542487, + "epoch": 0.38 + }, + { + "loss": 9.6499, + "grad_norm": 1.2805421352386475, + "learning_rate": 0.0008724074560252034, + "epoch": 0.38 + }, + { + "loss": 10.3858, + "grad_norm": 1.1297776699066162, + "learning_rate": 0.0008715323356961582, + "epoch": 0.39 + }, + { + "loss": 10.4059, + "grad_norm": 1.382300853729248, + "learning_rate": 0.000870657215367113, + "epoch": 0.39 + }, + { + "loss": 9.8993, + "grad_norm": 1.1831278800964355, + "learning_rate": 0.0008697820950380677, + "epoch": 0.39 + }, + { + "loss": 10.2277, + "grad_norm": 1.5924201011657715, + "learning_rate": 0.0008689069747090225, + "epoch": 0.39 + }, + { + "loss": 10.2644, + "grad_norm": 0.9275569319725037, + "learning_rate": 0.0008680318543799773, + "epoch": 0.4 + }, + { + "loss": 10.2756, + "grad_norm": 1.220247745513916, + "learning_rate": 0.000867156734050932, + "epoch": 0.4 + }, + { + "loss": 9.873, + "grad_norm": 1.2408357858657837, + "learning_rate": 0.0008662816137218868, + "epoch": 0.4 + }, + { + "loss": 10.4232, + "grad_norm": 2.236565351486206, + "learning_rate": 0.0008654064933928415, + "epoch": 0.4 + }, + { + "loss": 9.8613, + "grad_norm": 1.3093738555908203, + "learning_rate": 0.0008645313730637963, + "epoch": 0.41 + }, + { + "loss": 10.1708, + "grad_norm": 2.232199192047119, + "learning_rate": 0.000863656252734751, + "epoch": 0.41 + }, + { + "loss": 9.9729, + "grad_norm": 1.4281343221664429, + "learning_rate": 0.0008627811324057057, + "epoch": 0.41 + }, + { + "loss": 10.3467, + "grad_norm": 1.7682894468307495, + "learning_rate": 0.0008619060120766605, + "epoch": 0.41 + }, + { + "loss": 9.7119, + "grad_norm": 1.7619984149932861, + "learning_rate": 0.0008610308917476152, + "epoch": 0.42 + }, + { + "loss": 10.4769, + "grad_norm": 1.5372920036315918, + "learning_rate": 0.00086015577141857, + "epoch": 0.42 + }, + { + "loss": 10.691, + "grad_norm": 2.3789474964141846, + "learning_rate": 0.0008592806510895248, + "epoch": 0.42 + }, + { + "loss": 9.8791, + "grad_norm": 2.496776819229126, + "learning_rate": 0.0008584055307604795, + "epoch": 0.43 + }, + { + "loss": 9.7356, + "grad_norm": 4.118072032928467, + "learning_rate": 0.0008575304104314343, + "epoch": 0.43 + }, + { + "loss": 10.3761, + "grad_norm": 1.7359448671340942, + "learning_rate": 0.0008566552901023891, + "epoch": 0.43 + }, + { + "loss": 10.1403, + "grad_norm": 1.8283412456512451, + "learning_rate": 0.0008557801697733438, + "epoch": 0.43 + }, + { + "loss": 10.306, + "grad_norm": 1.9979033470153809, + "learning_rate": 0.0008549050494442986, + "epoch": 0.44 + }, + { + "loss": 9.5832, + "grad_norm": 3.1794967651367188, + "learning_rate": 0.0008540299291152533, + "epoch": 0.44 + }, + { + "loss": 10.1963, + "grad_norm": 3.1991539001464844, + "learning_rate": 0.0008531548087862081, + "epoch": 0.44 + }, + { + "loss": 10.7828, + "grad_norm": 2.5145182609558105, + "learning_rate": 0.0008522796884571629, + "epoch": 0.44 + }, + { + "loss": 10.1017, + "grad_norm": 1.0783337354660034, + "learning_rate": 0.0008514045681281176, + "epoch": 0.45 + }, + { + "loss": 9.4955, + "grad_norm": 6.040937423706055, + "learning_rate": 0.0008505294477990724, + "epoch": 0.45 + }, + { + "loss": 10.3679, + "grad_norm": 1.5212355852127075, + "learning_rate": 0.0008496543274700271, + "epoch": 0.45 + }, + { + "loss": 9.7236, + "grad_norm": 4.30284309387207, + "learning_rate": 0.0008487792071409819, + "epoch": 0.45 + }, + { + "loss": 9.7635, + "grad_norm": 2.9821696281433105, + "learning_rate": 0.0008479040868119367, + "epoch": 0.46 + }, + { + "loss": 9.8438, + "grad_norm": 1.676759958267212, + "learning_rate": 0.0008470289664828914, + "epoch": 0.46 + }, + { + "loss": 9.6693, + "grad_norm": 1.8075122833251953, + "learning_rate": 0.0008461538461538462, + "epoch": 0.46 + }, + { + "loss": 10.4572, + "grad_norm": 2.4182658195495605, + "learning_rate": 0.000845278725824801, + "epoch": 0.46 + }, + { + "loss": 10.3901, + "grad_norm": 1.7208518981933594, + "learning_rate": 0.0008444036054957557, + "epoch": 0.47 + }, + { + "loss": 9.7696, + "grad_norm": 2.4831340312957764, + "learning_rate": 0.0008435284851667105, + "epoch": 0.47 + }, + { + "loss": 10.409, + "grad_norm": 1.3335094451904297, + "learning_rate": 0.0008426533648376652, + "epoch": 0.47 + }, + { + "loss": 10.526, + "grad_norm": 0.9441933035850525, + "learning_rate": 0.0008417782445086199, + "epoch": 0.48 + }, + { + "loss": 10.14, + "grad_norm": 1.1018340587615967, + "learning_rate": 0.0008409031241795747, + "epoch": 0.48 + }, + { + "loss": 10.0298, + "grad_norm": 1.2077239751815796, + "learning_rate": 0.0008400280038505294, + "epoch": 0.48 + }, + { + "loss": 9.7303, + "grad_norm": 2.0401172637939453, + "learning_rate": 0.0008391528835214842, + "epoch": 0.48 + }, + { + "loss": 10.1229, + "grad_norm": 1.9456411600112915, + "learning_rate": 0.0008382777631924389, + "epoch": 0.49 + }, + { + "loss": 9.9805, + "grad_norm": 1.830814003944397, + "learning_rate": 0.0008374026428633937, + "epoch": 0.49 + }, + { + "loss": 10.1328, + "grad_norm": 2.1729185581207275, + "learning_rate": 0.0008365275225343485, + "epoch": 0.49 + }, + { + "loss": 10.4834, + "grad_norm": 1.324315071105957, + "learning_rate": 0.0008356524022053032, + "epoch": 0.49 + }, + { + "loss": 10.349, + "grad_norm": 2.837768077850342, + "learning_rate": 0.000834777281876258, + "epoch": 0.5 + }, + { + "loss": 9.8015, + "grad_norm": 1.1361275911331177, + "learning_rate": 0.0008339021615472128, + "epoch": 0.5 + }, + { + "loss": 9.5739, + "grad_norm": 1.4033498764038086, + "learning_rate": 0.0008330270412181675, + "epoch": 0.5 + }, + { + "loss": 9.5204, + "grad_norm": 1.1027082204818726, + "learning_rate": 0.0008321519208891223, + "epoch": 0.5 + }, + { + "loss": 9.4961, + "grad_norm": 2.2432548999786377, + "learning_rate": 0.000831276800560077, + "epoch": 0.51 + }, + { + "loss": 10.4562, + "grad_norm": 1.3807300329208374, + "learning_rate": 0.0008304016802310318, + "epoch": 0.51 + }, + { + "loss": 9.9888, + "grad_norm": 2.594301462173462, + "learning_rate": 0.0008295265599019866, + "epoch": 0.51 + }, + { + "loss": 9.4501, + "grad_norm": 1.4775426387786865, + "learning_rate": 0.0008286514395729413, + "epoch": 0.51 + }, + { + "loss": 9.9432, + "grad_norm": 1.463850736618042, + "learning_rate": 0.0008277763192438961, + "epoch": 0.52 + }, + { + "loss": 9.7867, + "grad_norm": 1.5370949506759644, + "learning_rate": 0.0008269011989148508, + "epoch": 0.52 + }, + { + "loss": 9.674, + "grad_norm": 1.2858608961105347, + "learning_rate": 0.0008260260785858056, + "epoch": 0.52 + }, + { + "loss": 10.4663, + "grad_norm": 1.2758288383483887, + "learning_rate": 0.0008251509582567604, + "epoch": 0.52 + }, + { + "loss": 9.552, + "grad_norm": 1.181013822555542, + "learning_rate": 0.0008242758379277151, + "epoch": 0.53 + }, + { + "loss": 9.9999, + "grad_norm": 0.9388832449913025, + "learning_rate": 0.0008234007175986699, + "epoch": 0.53 + }, + { + "loss": 10.1529, + "grad_norm": 1.3157830238342285, + "learning_rate": 0.0008225255972696247, + "epoch": 0.53 + }, + { + "loss": 10.3224, + "grad_norm": 1.603309154510498, + "learning_rate": 0.0008216504769405794, + "epoch": 0.54 + }, + { + "loss": 9.2725, + "grad_norm": 1.2987728118896484, + "learning_rate": 0.0008207753566115342, + "epoch": 0.54 + }, + { + "loss": 10.2593, + "grad_norm": 1.398086428642273, + "learning_rate": 0.0008199002362824888, + "epoch": 0.54 + }, + { + "loss": 9.8407, + "grad_norm": 1.3308155536651611, + "learning_rate": 0.0008190251159534436, + "epoch": 0.54 + }, + { + "loss": 10.7467, + "grad_norm": 1.3167645931243896, + "learning_rate": 0.0008181499956243984, + "epoch": 0.55 + }, + { + "loss": 10.1278, + "grad_norm": 1.935791254043579, + "learning_rate": 0.0008172748752953531, + "epoch": 0.55 + }, + { + "loss": 9.9477, + "grad_norm": 1.7790919542312622, + "learning_rate": 0.0008163997549663079, + "epoch": 0.55 + }, + { + "loss": 9.2234, + "grad_norm": 0.8335697650909424, + "learning_rate": 0.0008155246346372626, + "epoch": 0.55 + }, + { + "loss": 9.8562, + "grad_norm": 2.750474691390991, + "learning_rate": 0.0008146495143082174, + "epoch": 0.56 + }, + { + "loss": 10.3218, + "grad_norm": 1.4811447858810425, + "learning_rate": 0.0008137743939791722, + "epoch": 0.56 + }, + { + "loss": 9.6582, + "grad_norm": 1.9921342134475708, + "learning_rate": 0.0008128992736501269, + "epoch": 0.56 + }, + { + "loss": 9.8513, + "grad_norm": 2.635014295578003, + "learning_rate": 0.0008120241533210817, + "epoch": 0.56 + }, + { + "loss": 9.8862, + "grad_norm": 1.5898804664611816, + "learning_rate": 0.0008111490329920365, + "epoch": 0.57 + }, + { + "loss": 9.4721, + "grad_norm": 4.158829689025879, + "learning_rate": 0.0008102739126629912, + "epoch": 0.57 + }, + { + "loss": 10.1474, + "grad_norm": 1.8269054889678955, + "learning_rate": 0.000809398792333946, + "epoch": 0.57 + }, + { + "loss": 9.4288, + "grad_norm": 3.384010076522827, + "learning_rate": 0.0008085236720049007, + "epoch": 0.57 + }, + { + "loss": 10.0144, + "grad_norm": 1.6854453086853027, + "learning_rate": 0.0008076485516758555, + "epoch": 0.58 + }, + { + "loss": 10.229, + "grad_norm": 2.0812976360321045, + "learning_rate": 0.0008067734313468103, + "epoch": 0.58 + }, + { + "loss": 9.7204, + "grad_norm": 1.7673369646072388, + "learning_rate": 0.000805898311017765, + "epoch": 0.58 + }, + { + "loss": 9.6859, + "grad_norm": 2.155963897705078, + "learning_rate": 0.0008050231906887198, + "epoch": 0.59 + }, + { + "loss": 10.029, + "grad_norm": 1.482950210571289, + "learning_rate": 0.0008041480703596745, + "epoch": 0.59 + }, + { + "loss": 9.25, + "grad_norm": 2.6473171710968018, + "learning_rate": 0.0008032729500306293, + "epoch": 0.59 + }, + { + "loss": 10.028, + "grad_norm": 1.3584322929382324, + "learning_rate": 0.0008023978297015841, + "epoch": 0.59 + }, + { + "loss": 9.6924, + "grad_norm": 1.74970543384552, + "learning_rate": 0.0008015227093725388, + "epoch": 0.6 + }, + { + "loss": 10.0445, + "grad_norm": 2.0750019550323486, + "learning_rate": 0.0008006475890434936, + "epoch": 0.6 + }, + { + "loss": 9.7962, + "grad_norm": 7.219356060028076, + "learning_rate": 0.0007997724687144482, + "epoch": 0.6 + }, + { + "loss": 10.215, + "grad_norm": 1.2369924783706665, + "learning_rate": 0.0007988973483854029, + "epoch": 0.6 + }, + { + "loss": 9.538, + "grad_norm": 1.9686328172683716, + "learning_rate": 0.0007980222280563577, + "epoch": 0.61 + }, + { + "loss": 10.2107, + "grad_norm": 1.2081037759780884, + "learning_rate": 0.0007971471077273124, + "epoch": 0.61 + }, + { + "loss": 9.6709, + "grad_norm": 1.7755659818649292, + "learning_rate": 0.0007962719873982672, + "epoch": 0.61 + }, + { + "loss": 9.7973, + "grad_norm": 2.226400375366211, + "learning_rate": 0.000795396867069222, + "epoch": 0.61 + }, + { + "loss": 9.5564, + "grad_norm": 1.2814253568649292, + "learning_rate": 0.0007945217467401767, + "epoch": 0.62 + }, + { + "loss": 9.7987, + "grad_norm": 2.0225868225097656, + "learning_rate": 0.0007936466264111315, + "epoch": 0.62 + }, + { + "loss": 10.0866, + "grad_norm": 2.059910774230957, + "learning_rate": 0.0007927715060820862, + "epoch": 0.62 + }, + { + "loss": 10.031, + "grad_norm": 3.564408779144287, + "learning_rate": 0.000791896385753041, + "epoch": 0.62 + }, + { + "loss": 9.5562, + "grad_norm": 1.6237695217132568, + "learning_rate": 0.0007910212654239958, + "epoch": 0.63 + }, + { + "loss": 10.032, + "grad_norm": 1.8051832914352417, + "learning_rate": 0.0007901461450949505, + "epoch": 0.63 + }, + { + "loss": 9.5223, + "grad_norm": 1.807507872581482, + "learning_rate": 0.0007892710247659053, + "epoch": 0.63 + }, + { + "loss": 9.4476, + "grad_norm": 1.3200876712799072, + "learning_rate": 0.00078839590443686, + "epoch": 0.64 + }, + { + "loss": 9.4836, + "grad_norm": 3.295555353164673, + "learning_rate": 0.0007875207841078148, + "epoch": 0.64 + }, + { + "loss": 9.9695, + "grad_norm": 2.036158561706543, + "learning_rate": 0.0007867331758116741, + "epoch": 0.64 + }, + { + "loss": 9.414, + "grad_norm": 1.6501291990280151, + "learning_rate": 0.0007858580554826289, + "epoch": 0.64 + }, + { + "loss": 10.3832, + "grad_norm": 1.3873107433319092, + "learning_rate": 0.0007849829351535836, + "epoch": 0.65 + }, + { + "loss": 9.6308, + "grad_norm": 1.0633749961853027, + "learning_rate": 0.0007841078148245384, + "epoch": 0.65 + }, + { + "loss": 9.8861, + "grad_norm": 2.238201141357422, + "learning_rate": 0.0007832326944954931, + "epoch": 0.65 + }, + { + "loss": 9.9682, + "grad_norm": 1.2320759296417236, + "learning_rate": 0.0007823575741664479, + "epoch": 0.65 + }, + { + "loss": 9.496, + "grad_norm": 1.8895844221115112, + "learning_rate": 0.0007814824538374027, + "epoch": 0.66 + }, + { + "loss": 9.9117, + "grad_norm": 1.7297803163528442, + "learning_rate": 0.0007806073335083574, + "epoch": 0.66 + }, + { + "loss": 10.0705, + "grad_norm": 1.8089996576309204, + "learning_rate": 0.0007797322131793122, + "epoch": 0.66 + }, + { + "loss": 9.8684, + "grad_norm": 2.4221599102020264, + "learning_rate": 0.000778857092850267, + "epoch": 0.66 + }, + { + "loss": 9.343, + "grad_norm": 1.869035243988037, + "learning_rate": 0.0007779819725212217, + "epoch": 0.67 + }, + { + "loss": 9.395, + "grad_norm": 1.5427782535552979, + "learning_rate": 0.0007771068521921765, + "epoch": 0.67 + }, + { + "loss": 9.3372, + "grad_norm": 1.2343759536743164, + "learning_rate": 0.0007762317318631312, + "epoch": 0.67 + }, + { + "loss": 10.0514, + "grad_norm": 1.057860016822815, + "learning_rate": 0.000775356611534086, + "epoch": 0.67 + }, + { + "loss": 9.8897, + "grad_norm": 4.536896228790283, + "learning_rate": 0.0007744814912050408, + "epoch": 0.68 + }, + { + "loss": 9.7529, + "grad_norm": 2.2841501235961914, + "learning_rate": 0.0007736063708759955, + "epoch": 0.68 + }, + { + "loss": 9.7393, + "grad_norm": 1.4836674928665161, + "learning_rate": 0.0007727312505469503, + "epoch": 0.68 + }, + { + "loss": 9.4403, + "grad_norm": 1.9073762893676758, + "learning_rate": 0.000771856130217905, + "epoch": 0.69 + }, + { + "loss": 9.8424, + "grad_norm": 2.367785930633545, + "learning_rate": 0.0007709810098888598, + "epoch": 0.69 + }, + { + "loss": 9.5098, + "grad_norm": 0.824318528175354, + "learning_rate": 0.0007701058895598146, + "epoch": 0.69 + }, + { + "loss": 9.4785, + "grad_norm": 1.2716361284255981, + "learning_rate": 0.0007692307692307693, + "epoch": 0.69 + }, + { + "loss": 9.8112, + "grad_norm": 2.1307737827301025, + "learning_rate": 0.0007683556489017241, + "epoch": 0.7 + }, + { + "loss": 9.5932, + "grad_norm": 2.0558087825775146, + "learning_rate": 0.0007674805285726788, + "epoch": 0.7 + }, + { + "loss": 9.5525, + "grad_norm": 1.582262396812439, + "learning_rate": 0.0007666054082436335, + "epoch": 0.7 + }, + { + "loss": 9.8359, + "grad_norm": 7.788843154907227, + "learning_rate": 0.0007657302879145883, + "epoch": 0.7 + }, + { + "loss": 10.3724, + "grad_norm": 1.328479528427124, + "learning_rate": 0.000764855167585543, + "epoch": 0.71 + }, + { + "loss": 8.8465, + "grad_norm": 1.6026923656463623, + "learning_rate": 0.0007639800472564978, + "epoch": 0.71 + }, + { + "loss": 9.4257, + "grad_norm": 4.00112247467041, + "learning_rate": 0.0007631049269274526, + "epoch": 0.71 + }, + { + "loss": 9.4006, + "grad_norm": 1.2519035339355469, + "learning_rate": 0.0007622298065984073, + "epoch": 0.71 + }, + { + "loss": 9.2469, + "grad_norm": 1.0302975177764893, + "learning_rate": 0.0007613546862693621, + "epoch": 0.72 + }, + { + "loss": 9.6992, + "grad_norm": 1.066437840461731, + "learning_rate": 0.0007604795659403168, + "epoch": 0.72 + }, + { + "loss": 8.9602, + "grad_norm": 1.232923984527588, + "learning_rate": 0.0007596044456112715, + "epoch": 0.72 + }, + { + "loss": 10.1371, + "grad_norm": 2.129009962081909, + "learning_rate": 0.0007587293252822263, + "epoch": 0.72 + }, + { + "loss": 9.3879, + "grad_norm": 1.385560154914856, + "learning_rate": 0.000757854204953181, + "epoch": 0.73 + }, + { + "loss": 9.898, + "grad_norm": 15.102237701416016, + "learning_rate": 0.0007569790846241358, + "epoch": 0.73 + }, + { + "loss": 9.723, + "grad_norm": 1.5371789932250977, + "learning_rate": 0.0007561039642950905, + "epoch": 0.73 + }, + { + "loss": 9.5436, + "grad_norm": 1.3847825527191162, + "learning_rate": 0.0007552288439660453, + "epoch": 0.73 + }, + { + "loss": 9.4084, + "grad_norm": 2.662229299545288, + "learning_rate": 0.0007543537236370001, + "epoch": 0.74 + }, + { + "loss": 9.6916, + "grad_norm": 1.3952440023422241, + "learning_rate": 0.0007534786033079548, + "epoch": 0.74 + }, + { + "loss": 9.2971, + "grad_norm": 2.79449725151062, + "learning_rate": 0.0007526034829789096, + "epoch": 0.74 + }, + { + "loss": 9.6677, + "grad_norm": 0.959707498550415, + "learning_rate": 0.0007517283626498644, + "epoch": 0.75 + }, + { + "loss": 9.5952, + "grad_norm": 1.7505630254745483, + "learning_rate": 0.0007508532423208191, + "epoch": 0.75 + }, + { + "loss": 9.901, + "grad_norm": 4.176792621612549, + "learning_rate": 0.0007499781219917739, + "epoch": 0.75 + }, + { + "loss": 9.5036, + "grad_norm": 2.338407516479492, + "learning_rate": 0.0007491030016627286, + "epoch": 0.75 + }, + { + "loss": 10.0173, + "grad_norm": 1.4003384113311768, + "learning_rate": 0.0007482278813336834, + "epoch": 0.76 + }, + { + "loss": 9.7204, + "grad_norm": 2.0305333137512207, + "learning_rate": 0.0007473527610046382, + "epoch": 0.76 + }, + { + "loss": 9.7901, + "grad_norm": 2.2396442890167236, + "learning_rate": 0.0007464776406755928, + "epoch": 0.76 + }, + { + "loss": 9.5465, + "grad_norm": 3.230546474456787, + "learning_rate": 0.0007456025203465476, + "epoch": 0.76 + }, + { + "loss": 8.9817, + "grad_norm": 3.14975643157959, + "learning_rate": 0.0007447274000175023, + "epoch": 0.77 + }, + { + "loss": 10.0403, + "grad_norm": 2.1714890003204346, + "learning_rate": 0.0007438522796884571, + "epoch": 0.77 + }, + { + "loss": 9.054, + "grad_norm": 1.8472590446472168, + "learning_rate": 0.0007429771593594119, + "epoch": 0.77 + }, + { + "loss": 9.4847, + "grad_norm": 1.0868862867355347, + "learning_rate": 0.0007421020390303666, + "epoch": 0.77 + }, + { + "loss": 9.5688, + "grad_norm": 0.9088165760040283, + "learning_rate": 0.0007412269187013214, + "epoch": 0.78 + }, + { + "loss": 9.2655, + "grad_norm": 1.2336516380310059, + "learning_rate": 0.0007403517983722762, + "epoch": 0.78 + }, + { + "loss": 9.6194, + "grad_norm": 1.2794588804244995, + "learning_rate": 0.0007394766780432309, + "epoch": 0.78 + }, + { + "loss": 9.4072, + "grad_norm": 1.5056113004684448, + "learning_rate": 0.0007386015577141857, + "epoch": 0.78 + }, + { + "loss": 8.781, + "grad_norm": 1.809520959854126, + "learning_rate": 0.0007377264373851404, + "epoch": 0.79 + }, + { + "loss": 9.3203, + "grad_norm": 3.1000723838806152, + "learning_rate": 0.0007368513170560952, + "epoch": 0.79 + }, + { + "loss": 9.3199, + "grad_norm": 4.879993915557861, + "learning_rate": 0.00073597619672705, + "epoch": 0.79 + }, + { + "loss": 10.2243, + "grad_norm": 1.508380651473999, + "learning_rate": 0.0007351010763980047, + "epoch": 0.8 + }, + { + "loss": 9.3476, + "grad_norm": 1.2379094362258911, + "learning_rate": 0.0007342259560689595, + "epoch": 0.8 + }, + { + "loss": 9.3482, + "grad_norm": 1.3472929000854492, + "learning_rate": 0.0007333508357399142, + "epoch": 0.8 + }, + { + "loss": 9.1645, + "grad_norm": 1.2490941286087036, + "learning_rate": 0.000732475715410869, + "epoch": 0.8 + }, + { + "loss": 9.8443, + "grad_norm": 1.3615162372589111, + "learning_rate": 0.0007316005950818238, + "epoch": 0.81 + }, + { + "loss": 9.1608, + "grad_norm": 1.608033299446106, + "learning_rate": 0.0007307254747527785, + "epoch": 0.81 + }, + { + "loss": 9.5366, + "grad_norm": 1.819758415222168, + "learning_rate": 0.0007298503544237333, + "epoch": 0.81 + }, + { + "loss": 9.3414, + "grad_norm": 1.190049409866333, + "learning_rate": 0.0007289752340946881, + "epoch": 0.81 + }, + { + "loss": 9.3362, + "grad_norm": 1.136693000793457, + "learning_rate": 0.0007281001137656428, + "epoch": 0.82 + }, + { + "loss": 9.4184, + "grad_norm": 1.3066457509994507, + "learning_rate": 0.0007272249934365976, + "epoch": 0.82 + }, + { + "loss": 9.3295, + "grad_norm": 2.193195343017578, + "learning_rate": 0.0007263498731075523, + "epoch": 0.82 + }, + { + "loss": 9.0824, + "grad_norm": 1.2458583116531372, + "learning_rate": 0.0007254747527785071, + "epoch": 0.82 + }, + { + "loss": 9.4671, + "grad_norm": 1.4734137058258057, + "learning_rate": 0.0007245996324494618, + "epoch": 0.83 + }, + { + "loss": 8.8882, + "grad_norm": 1.8609868288040161, + "learning_rate": 0.0007237245121204165, + "epoch": 0.83 + }, + { + "loss": 9.8334, + "grad_norm": 1.2084137201309204, + "learning_rate": 0.0007228493917913713, + "epoch": 0.83 + }, + { + "loss": 9.301, + "grad_norm": 1.3520543575286865, + "learning_rate": 0.000721974271462326, + "epoch": 0.83 + }, + { + "loss": 9.4308, + "grad_norm": 1.7796053886413574, + "learning_rate": 0.0007210991511332808, + "epoch": 0.84 + }, + { + "loss": 9.2915, + "grad_norm": 1.583756685256958, + "learning_rate": 0.0007202240308042356, + "epoch": 0.84 + }, + { + "loss": 9.543, + "grad_norm": 1.3439078330993652, + "learning_rate": 0.0007193489104751903, + "epoch": 0.84 + }, + { + "loss": 9.4767, + "grad_norm": 1.0626850128173828, + "learning_rate": 0.0007184737901461451, + "epoch": 0.85 + }, + { + "loss": 9.6831, + "grad_norm": 1.559846043586731, + "learning_rate": 0.0007175986698170999, + "epoch": 0.85 + }, + { + "loss": 9.3683, + "grad_norm": 1.3399856090545654, + "learning_rate": 0.0007167235494880546, + "epoch": 0.85 + }, + { + "loss": 9.4018, + "grad_norm": 2.0115649700164795, + "learning_rate": 0.0007158484291590094, + "epoch": 0.85 + }, + { + "loss": 9.6007, + "grad_norm": 1.9016413688659668, + "learning_rate": 0.0007149733088299641, + "epoch": 0.86 + }, + { + "loss": 9.7843, + "grad_norm": 9.662792205810547, + "learning_rate": 0.0007140981885009189, + "epoch": 0.86 + }, + { + "loss": 9.4248, + "grad_norm": 0.9219140410423279, + "learning_rate": 0.0007132230681718737, + "epoch": 0.86 + }, + { + "loss": 9.8659, + "grad_norm": 1.0851889848709106, + "learning_rate": 0.0007123479478428284, + "epoch": 0.86 + }, + { + "loss": 9.1677, + "grad_norm": 1.0349225997924805, + "learning_rate": 0.0007114728275137832, + "epoch": 0.87 + }, + { + "loss": 9.1666, + "grad_norm": 1.286309003829956, + "learning_rate": 0.000710597707184738, + "epoch": 0.87 + }, + { + "loss": 9.5514, + "grad_norm": 1.0325031280517578, + "learning_rate": 0.0007097225868556927, + "epoch": 0.87 + }, + { + "loss": 9.2542, + "grad_norm": 1.2344691753387451, + "learning_rate": 0.0007088474665266475, + "epoch": 0.87 + }, + { + "loss": 9.1687, + "grad_norm": 0.9820197224617004, + "learning_rate": 0.0007079723461976022, + "epoch": 0.88 + }, + { + "loss": 8.9295, + "grad_norm": 2.573585033416748, + "learning_rate": 0.000707097225868557, + "epoch": 0.88 + }, + { + "loss": 9.6702, + "grad_norm": 0.8707136511802673, + "learning_rate": 0.0007062221055395118, + "epoch": 0.88 + }, + { + "loss": 8.5564, + "grad_norm": 0.9832028150558472, + "learning_rate": 0.0007053469852104665, + "epoch": 0.88 + }, + { + "loss": 9.426, + "grad_norm": 2.1577107906341553, + "learning_rate": 0.0007044718648814213, + "epoch": 0.89 + }, + { + "loss": 9.2118, + "grad_norm": 1.6314407587051392, + "learning_rate": 0.000703596744552376, + "epoch": 0.89 + }, + { + "loss": 9.4482, + "grad_norm": 1.6563376188278198, + "learning_rate": 0.0007027216242233307, + "epoch": 0.89 + }, + { + "loss": 10.1221, + "grad_norm": 1.3398720026016235, + "learning_rate": 0.0007018465038942855, + "epoch": 0.9 + }, + { + "loss": 9.2569, + "grad_norm": 1.2780015468597412, + "learning_rate": 0.0007009713835652402, + "epoch": 0.9 + }, + { + "loss": 9.1485, + "grad_norm": 1.3434102535247803, + "learning_rate": 0.000700096263236195, + "epoch": 0.9 + }, + { + "loss": 9.3431, + "grad_norm": 2.2103283405303955, + "learning_rate": 0.0006992211429071497, + "epoch": 0.9 + }, + { + "loss": 9.5529, + "grad_norm": 2.479997158050537, + "learning_rate": 0.0006983460225781045, + "epoch": 0.91 + }, + { + "loss": 8.5835, + "grad_norm": 1.3891953229904175, + "learning_rate": 0.0006974709022490593, + "epoch": 0.91 + }, + { + "loss": 8.835, + "grad_norm": 0.9400926828384399, + "learning_rate": 0.000696595781920014, + "epoch": 0.91 + }, + { + "loss": 9.1069, + "grad_norm": 1.2385962009429932, + "learning_rate": 0.0006957206615909688, + "epoch": 0.91 + }, + { + "loss": 9.2235, + "grad_norm": 1.7397691011428833, + "learning_rate": 0.0006948455412619236, + "epoch": 0.92 + }, + { + "loss": 9.2386, + "grad_norm": 1.7163151502609253, + "learning_rate": 0.0006939704209328783, + "epoch": 0.92 + }, + { + "loss": 8.7562, + "grad_norm": 1.5626498460769653, + "learning_rate": 0.0006930953006038331, + "epoch": 0.92 + }, + { + "loss": 8.8432, + "grad_norm": 1.9265193939208984, + "learning_rate": 0.0006922201802747878, + "epoch": 0.92 + }, + { + "loss": 8.8117, + "grad_norm": 1.4459571838378906, + "learning_rate": 0.0006913450599457426, + "epoch": 0.93 + }, + { + "loss": 9.439, + "grad_norm": 0.9559070467948914, + "learning_rate": 0.0006904699396166974, + "epoch": 0.93 + }, + { + "loss": 9.1912, + "grad_norm": 1.9344050884246826, + "learning_rate": 0.0006895948192876521, + "epoch": 0.93 + }, + { + "loss": 9.5571, + "grad_norm": 1.52436101436615, + "learning_rate": 0.0006887196989586069, + "epoch": 0.93 + }, + { + "loss": 8.9898, + "grad_norm": 1.4828134775161743, + "learning_rate": 0.0006878445786295616, + "epoch": 0.94 + }, + { + "loss": 9.1776, + "grad_norm": 1.4312185049057007, + "learning_rate": 0.0006869694583005164, + "epoch": 0.94 + }, + { + "loss": 10.1621, + "grad_norm": 1.2089942693710327, + "learning_rate": 0.0006860943379714712, + "epoch": 0.94 + }, + { + "loss": 8.8634, + "grad_norm": 5.034254550933838, + "learning_rate": 0.0006852192176424259, + "epoch": 0.94 + }, + { + "loss": 9.1892, + "grad_norm": 2.494285821914673, + "learning_rate": 0.0006843440973133807, + "epoch": 0.95 + }, + { + "loss": 8.6028, + "grad_norm": 1.5366199016571045, + "learning_rate": 0.0006834689769843355, + "epoch": 0.95 + }, + { + "loss": 9.0938, + "grad_norm": 1.1272014379501343, + "learning_rate": 0.0006825938566552902, + "epoch": 0.95 + }, + { + "loss": 9.607, + "grad_norm": 3.852747917175293, + "learning_rate": 0.000681718736326245, + "epoch": 0.96 + }, + { + "loss": 9.6214, + "grad_norm": 1.9155749082565308, + "learning_rate": 0.0006808436159971996, + "epoch": 0.96 + }, + { + "loss": 8.6868, + "grad_norm": 1.9045560359954834, + "learning_rate": 0.0006799684956681543, + "epoch": 0.96 + }, + { + "loss": 9.8133, + "grad_norm": 1.4083536863327026, + "learning_rate": 0.000679093375339109, + "epoch": 0.96 + }, + { + "loss": 9.2029, + "grad_norm": 4.824470043182373, + "learning_rate": 0.0006782182550100638, + "epoch": 0.97 + }, + { + "loss": 9.3758, + "grad_norm": 1.2905750274658203, + "learning_rate": 0.0006773431346810186, + "epoch": 0.97 + }, + { + "loss": 9.2105, + "grad_norm": 1.4681618213653564, + "learning_rate": 0.0006764680143519733, + "epoch": 0.97 + }, + { + "loss": 9.1096, + "grad_norm": 1.5041123628616333, + "learning_rate": 0.0006755928940229281, + "epoch": 0.97 + }, + { + "loss": 9.1485, + "grad_norm": 1.7930779457092285, + "learning_rate": 0.0006747177736938829, + "epoch": 0.98 + }, + { + "loss": 9.2587, + "grad_norm": 1.1871591806411743, + "learning_rate": 0.0006738426533648376, + "epoch": 0.98 + }, + { + "loss": 9.2174, + "grad_norm": 1.550445556640625, + "learning_rate": 0.0006729675330357924, + "epoch": 0.98 + }, + { + "loss": 8.8521, + "grad_norm": 1.361382007598877, + "learning_rate": 0.0006720924127067471, + "epoch": 0.98 + }, + { + "loss": 9.0098, + "grad_norm": 1.350142002105713, + "learning_rate": 0.0006712172923777019, + "epoch": 0.99 + }, + { + "loss": 8.6736, + "grad_norm": 1.2662369012832642, + "learning_rate": 0.0006703421720486567, + "epoch": 0.99 + }, + { + "loss": 8.9752, + "grad_norm": 1.474623441696167, + "learning_rate": 0.0006694670517196114, + "epoch": 0.99 + }, + { + "loss": 8.7473, + "grad_norm": 2.676971912384033, + "learning_rate": 0.0006685919313905662, + "epoch": 0.99 + }, + { + "loss": 8.8512, + "grad_norm": 1.114418625831604, + "learning_rate": 0.000667716811061521, + "epoch": 1.0 + }, + { + "loss": 8.1921, + "grad_norm": 2.0294203758239746, + "learning_rate": 0.0006668416907324757, + "epoch": 1.0 + }, + { + "loss": 8.8171, + "grad_norm": 0.9778627157211304, + "learning_rate": 0.0006659665704034305, + "epoch": 1.0 + }, + { + "loss": 8.8809, + "grad_norm": 1.621929407119751, + "learning_rate": 0.0006650914500743852, + "epoch": 1.01 + }, + { + "loss": 8.9527, + "grad_norm": 1.0340059995651245, + "learning_rate": 0.00066421632974534, + "epoch": 1.01 + }, + { + "loss": 8.6295, + "grad_norm": 1.4925633668899536, + "learning_rate": 0.0006633412094162948, + "epoch": 1.01 + }, + { + "loss": 8.7158, + "grad_norm": 1.3651670217514038, + "learning_rate": 0.0006624660890872495, + "epoch": 1.01 + }, + { + "loss": 9.0606, + "grad_norm": 1.1281485557556152, + "learning_rate": 0.0006615909687582043, + "epoch": 1.02 + }, + { + "loss": 8.8925, + "grad_norm": 1.0784941911697388, + "learning_rate": 0.000660715848429159, + "epoch": 1.02 + }, + { + "loss": 9.1237, + "grad_norm": 1.49080228805542, + "learning_rate": 0.0006598407281001137, + "epoch": 1.02 + }, + { + "loss": 8.9093, + "grad_norm": 1.080828309059143, + "learning_rate": 0.0006589656077710685, + "epoch": 1.02 + }, + { + "loss": 8.9275, + "grad_norm": 1.0867069959640503, + "learning_rate": 0.0006580904874420232, + "epoch": 1.03 + }, + { + "loss": 8.5924, + "grad_norm": 1.0178778171539307, + "learning_rate": 0.000657215367112978, + "epoch": 1.03 + }, + { + "loss": 8.8768, + "grad_norm": 0.978421688079834, + "learning_rate": 0.0006563402467839327, + "epoch": 1.03 + }, + { + "loss": 8.8812, + "grad_norm": 1.6234030723571777, + "learning_rate": 0.0006554651264548875, + "epoch": 1.03 + }, + { + "loss": 9.5212, + "grad_norm": 5.744367599487305, + "learning_rate": 0.0006545900061258423, + "epoch": 1.04 + }, + { + "loss": 8.066, + "grad_norm": 3.1010031700134277, + "learning_rate": 0.000653714885796797, + "epoch": 1.04 + }, + { + "loss": 8.8401, + "grad_norm": 1.4084874391555786, + "learning_rate": 0.0006528397654677518, + "epoch": 1.04 + }, + { + "loss": 9.1554, + "grad_norm": 1.4125443696975708, + "learning_rate": 0.0006519646451387066, + "epoch": 1.04 + }, + { + "loss": 8.5098, + "grad_norm": 1.0087417364120483, + "learning_rate": 0.0006510895248096613, + "epoch": 1.05 + }, + { + "loss": 8.6227, + "grad_norm": 1.404480218887329, + "learning_rate": 0.0006502144044806161, + "epoch": 1.05 + }, + { + "loss": 8.7843, + "grad_norm": 1.1295698881149292, + "learning_rate": 0.0006493392841515708, + "epoch": 1.05 + }, + { + "loss": 8.6766, + "grad_norm": 1.0821887254714966, + "learning_rate": 0.0006484641638225256, + "epoch": 1.06 + }, + { + "loss": 8.6414, + "grad_norm": 1.1444706916809082, + "learning_rate": 0.0006475890434934804, + "epoch": 1.06 + }, + { + "loss": 8.457, + "grad_norm": 1.277224063873291, + "learning_rate": 0.0006467139231644351, + "epoch": 1.06 + }, + { + "loss": 9.058, + "grad_norm": 1.4391515254974365, + "learning_rate": 0.0006458388028353899, + "epoch": 1.06 + }, + { + "loss": 9.0137, + "grad_norm": 1.1909124851226807, + "learning_rate": 0.0006449636825063447, + "epoch": 1.07 + }, + { + "loss": 8.95, + "grad_norm": 1.1959373950958252, + "learning_rate": 0.0006440885621772994, + "epoch": 1.07 + }, + { + "loss": 8.7242, + "grad_norm": 1.0742520093917847, + "learning_rate": 0.0006432134418482542, + "epoch": 1.07 + }, + { + "loss": 8.6848, + "grad_norm": 1.1215168237686157, + "learning_rate": 0.0006423383215192089, + "epoch": 1.07 + }, + { + "loss": 8.2356, + "grad_norm": 1.329377293586731, + "learning_rate": 0.0006414632011901637, + "epoch": 1.08 + }, + { + "loss": 9.357, + "grad_norm": 1.2252676486968994, + "learning_rate": 0.0006405880808611185, + "epoch": 1.08 + }, + { + "loss": 8.9564, + "grad_norm": 1.4522862434387207, + "learning_rate": 0.0006397129605320732, + "epoch": 1.08 + }, + { + "loss": 9.315, + "grad_norm": 1.3707520961761475, + "learning_rate": 0.000638837840203028, + "epoch": 1.08 + }, + { + "loss": 8.5879, + "grad_norm": 1.6546357870101929, + "learning_rate": 0.0006379627198739826, + "epoch": 1.09 + }, + { + "loss": 9.4063, + "grad_norm": 0.9310407638549805, + "learning_rate": 0.0006370875995449374, + "epoch": 1.09 + }, + { + "loss": 8.8435, + "grad_norm": 0.9878571629524231, + "learning_rate": 0.0006362124792158922, + "epoch": 1.09 + }, + { + "loss": 9.0975, + "grad_norm": 0.9288727045059204, + "learning_rate": 0.0006353373588868469, + "epoch": 1.09 + }, + { + "loss": 9.219, + "grad_norm": 0.9407894015312195, + "learning_rate": 0.0006344622385578017, + "epoch": 1.1 + }, + { + "loss": 8.6555, + "grad_norm": 0.9899985790252686, + "learning_rate": 0.0006335871182287564, + "epoch": 1.1 + }, + { + "loss": 8.1403, + "grad_norm": 0.8422369360923767, + "learning_rate": 0.0006327119978997112, + "epoch": 1.1 + }, + { + "loss": 8.5879, + "grad_norm": 1.1602038145065308, + "learning_rate": 0.000631836877570666, + "epoch": 1.11 + }, + { + "loss": 8.8147, + "grad_norm": 1.0149036645889282, + "learning_rate": 0.0006309617572416207, + "epoch": 1.11 + }, + { + "loss": 8.6708, + "grad_norm": 1.3015429973602295, + "learning_rate": 0.0006300866369125755, + "epoch": 1.11 + }, + { + "loss": 8.213, + "grad_norm": 1.0710703134536743, + "learning_rate": 0.0006292115165835303, + "epoch": 1.11 + }, + { + "loss": 8.7651, + "grad_norm": 0.9002228379249573, + "learning_rate": 0.000628336396254485, + "epoch": 1.12 + }, + { + "loss": 9.2161, + "grad_norm": 1.2090556621551514, + "learning_rate": 0.0006274612759254398, + "epoch": 1.12 + }, + { + "loss": 8.4087, + "grad_norm": 1.2179570198059082, + "learning_rate": 0.0006265861555963945, + "epoch": 1.12 + }, + { + "loss": 8.5906, + "grad_norm": 1.7626177072525024, + "learning_rate": 0.0006257110352673493, + "epoch": 1.12 + }, + { + "loss": 8.7996, + "grad_norm": 1.2657760381698608, + "learning_rate": 0.0006248359149383041, + "epoch": 1.13 + }, + { + "loss": 8.7193, + "grad_norm": 0.8737196326255798, + "learning_rate": 0.0006239607946092588, + "epoch": 1.13 + }, + { + "loss": 8.5347, + "grad_norm": 1.1074841022491455, + "learning_rate": 0.0006230856742802136, + "epoch": 1.13 + }, + { + "loss": 8.8374, + "grad_norm": 1.264391303062439, + "learning_rate": 0.0006222105539511684, + "epoch": 1.13 + }, + { + "loss": 7.9866, + "grad_norm": 1.0013505220413208, + "learning_rate": 0.0006213354336221231, + "epoch": 1.14 + }, + { + "loss": 8.1635, + "grad_norm": 1.0330276489257812, + "learning_rate": 0.0006204603132930779, + "epoch": 1.14 + }, + { + "loss": 8.1751, + "grad_norm": 1.125343918800354, + "learning_rate": 0.0006195851929640326, + "epoch": 1.14 + }, + { + "loss": 9.082, + "grad_norm": 1.0461503267288208, + "learning_rate": 0.0006187100726349874, + "epoch": 1.14 + }, + { + "loss": 8.4013, + "grad_norm": 1.2671931982040405, + "learning_rate": 0.0006178349523059422, + "epoch": 1.15 + }, + { + "loss": 8.735, + "grad_norm": 1.315640926361084, + "learning_rate": 0.0006169598319768969, + "epoch": 1.15 + }, + { + "loss": 8.3872, + "grad_norm": 1.0746458768844604, + "learning_rate": 0.0006160847116478516, + "epoch": 1.15 + }, + { + "loss": 8.4791, + "grad_norm": 0.9568318724632263, + "learning_rate": 0.0006152095913188063, + "epoch": 1.15 + }, + { + "loss": 8.4284, + "grad_norm": 1.0956138372421265, + "learning_rate": 0.0006143344709897611, + "epoch": 1.16 + }, + { + "loss": 9.1513, + "grad_norm": 1.2635217905044556, + "learning_rate": 0.0006134593506607159, + "epoch": 1.16 + }, + { + "loss": 8.7084, + "grad_norm": 1.242577075958252, + "learning_rate": 0.0006125842303316706, + "epoch": 1.16 + }, + { + "loss": 8.9941, + "grad_norm": 1.0156121253967285, + "learning_rate": 0.0006117091100026254, + "epoch": 1.17 + }, + { + "loss": 8.731, + "grad_norm": 1.3975499868392944, + "learning_rate": 0.0006108339896735801, + "epoch": 1.17 + }, + { + "loss": 8.5287, + "grad_norm": 1.0764504671096802, + "learning_rate": 0.0006099588693445349, + "epoch": 1.17 + }, + { + "loss": 8.2368, + "grad_norm": 1.0151234865188599, + "learning_rate": 0.0006090837490154897, + "epoch": 1.17 + }, + { + "loss": 9.1091, + "grad_norm": 6.751773834228516, + "learning_rate": 0.0006082086286864444, + "epoch": 1.18 + }, + { + "loss": 8.7919, + "grad_norm": 0.95284503698349, + "learning_rate": 0.0006073335083573992, + "epoch": 1.18 + }, + { + "loss": 8.0937, + "grad_norm": 1.131046175956726, + "learning_rate": 0.000606458388028354, + "epoch": 1.18 + }, + { + "loss": 8.4255, + "grad_norm": 0.8307482600212097, + "learning_rate": 0.0006055832676993087, + "epoch": 1.18 + }, + { + "loss": 8.3428, + "grad_norm": 1.1681163311004639, + "learning_rate": 0.0006047081473702635, + "epoch": 1.19 + }, + { + "loss": 8.1699, + "grad_norm": 1.6491031646728516, + "learning_rate": 0.0006038330270412182, + "epoch": 1.19 + }, + { + "loss": 8.3981, + "grad_norm": 0.9328737258911133, + "learning_rate": 0.000602957906712173, + "epoch": 1.19 + }, + { + "loss": 8.5749, + "grad_norm": 1.3434003591537476, + "learning_rate": 0.0006020827863831278, + "epoch": 1.19 + }, + { + "loss": 8.6492, + "grad_norm": 1.1651496887207031, + "learning_rate": 0.0006012076660540825, + "epoch": 1.2 + }, + { + "loss": 8.9343, + "grad_norm": 1.1224288940429688, + "learning_rate": 0.0006003325457250373, + "epoch": 1.2 + }, + { + "loss": 8.4265, + "grad_norm": 1.1075445413589478, + "learning_rate": 0.0005994574253959919, + "epoch": 1.2 + }, + { + "loss": 8.3367, + "grad_norm": 1.0349383354187012, + "learning_rate": 0.0005985823050669467, + "epoch": 1.2 + }, + { + "loss": 8.6752, + "grad_norm": 0.9915909767150879, + "learning_rate": 0.0005977071847379015, + "epoch": 1.21 + }, + { + "loss": 8.2193, + "grad_norm": 1.172624111175537, + "learning_rate": 0.0005968320644088562, + "epoch": 1.21 + }, + { + "loss": 7.7701, + "grad_norm": 1.0810112953186035, + "learning_rate": 0.000595956944079811, + "epoch": 1.21 + }, + { + "loss": 8.9113, + "grad_norm": 1.1411935091018677, + "learning_rate": 0.0005950818237507656, + "epoch": 1.22 + }, + { + "loss": 8.3426, + "grad_norm": 0.9251805543899536, + "learning_rate": 0.0005942067034217204, + "epoch": 1.22 + }, + { + "loss": 8.1973, + "grad_norm": 0.9023226499557495, + "learning_rate": 0.0005933315830926752, + "epoch": 1.22 + }, + { + "loss": 8.8777, + "grad_norm": 0.9467354416847229, + "learning_rate": 0.0005924564627636299, + "epoch": 1.22 + }, + { + "loss": 8.758, + "grad_norm": 0.9941525459289551, + "learning_rate": 0.0005915813424345847, + "epoch": 1.23 + }, + { + "loss": 8.6786, + "grad_norm": 0.7721539735794067, + "learning_rate": 0.0005907062221055395, + "epoch": 1.23 + }, + { + "loss": 8.7063, + "grad_norm": 0.9968111515045166, + "learning_rate": 0.0005898311017764942, + "epoch": 1.23 + }, + { + "loss": 8.4121, + "grad_norm": 0.8019425272941589, + "learning_rate": 0.000588955981447449, + "epoch": 1.23 + }, + { + "loss": 8.8181, + "grad_norm": 1.1664308309555054, + "learning_rate": 0.0005880808611184037, + "epoch": 1.24 + }, + { + "loss": 8.1548, + "grad_norm": 1.008786678314209, + "learning_rate": 0.0005872057407893585, + "epoch": 1.24 + }, + { + "loss": 8.5725, + "grad_norm": 1.2349562644958496, + "learning_rate": 0.0005863306204603133, + "epoch": 1.24 + }, + { + "loss": 8.8339, + "grad_norm": 1.2367397546768188, + "learning_rate": 0.000585455500131268, + "epoch": 1.24 + }, + { + "loss": 8.3184, + "grad_norm": 0.9427123665809631, + "learning_rate": 0.0005845803798022228, + "epoch": 1.25 + }, + { + "loss": 8.2814, + "grad_norm": 0.951808512210846, + "learning_rate": 0.0005837052594731775, + "epoch": 1.25 + }, + { + "loss": 8.1453, + "grad_norm": 1.076816439628601, + "learning_rate": 0.0005828301391441323, + "epoch": 1.25 + }, + { + "loss": 8.5114, + "grad_norm": 1.248741865158081, + "learning_rate": 0.0005819550188150871, + "epoch": 1.25 + }, + { + "loss": 8.7265, + "grad_norm": 1.0166980028152466, + "learning_rate": 0.0005810798984860418, + "epoch": 1.26 + }, + { + "loss": 9.0454, + "grad_norm": 1.273942232131958, + "learning_rate": 0.0005802047781569966, + "epoch": 1.26 + }, + { + "loss": 8.6499, + "grad_norm": 0.8551316857337952, + "learning_rate": 0.0005793296578279514, + "epoch": 1.26 + }, + { + "loss": 8.0282, + "grad_norm": 1.0231510400772095, + "learning_rate": 0.0005784545374989061, + "epoch": 1.27 + }, + { + "loss": 8.5694, + "grad_norm": 0.8138982653617859, + "learning_rate": 0.0005775794171698609, + "epoch": 1.27 + }, + { + "loss": 8.9449, + "grad_norm": 1.151458978652954, + "learning_rate": 0.0005767042968408156, + "epoch": 1.27 + }, + { + "loss": 8.5309, + "grad_norm": 1.311020851135254, + "learning_rate": 0.0005758291765117704, + "epoch": 1.27 + }, + { + "loss": 8.3937, + "grad_norm": 1.0431928634643555, + "learning_rate": 0.0005749540561827252, + "epoch": 1.28 + }, + { + "loss": 8.0121, + "grad_norm": 0.9487342238426208, + "learning_rate": 0.0005740789358536799, + "epoch": 1.28 + }, + { + "loss": 8.9756, + "grad_norm": 0.7705584764480591, + "learning_rate": 0.0005732038155246346, + "epoch": 1.28 + }, + { + "loss": 8.9679, + "grad_norm": 0.9359903335571289, + "learning_rate": 0.0005723286951955893, + "epoch": 1.28 + }, + { + "loss": 8.0724, + "grad_norm": 1.031725525856018, + "learning_rate": 0.0005714535748665441, + "epoch": 1.29 + }, + { + "loss": 8.7014, + "grad_norm": 1.0501611232757568, + "learning_rate": 0.0005705784545374989, + "epoch": 1.29 + }, + { + "loss": 8.2284, + "grad_norm": 0.8158836960792542, + "learning_rate": 0.0005697033342084536, + "epoch": 1.29 + }, + { + "loss": 8.8206, + "grad_norm": 0.8827638030052185, + "learning_rate": 0.0005688282138794084, + "epoch": 1.29 + }, + { + "loss": 8.4189, + "grad_norm": 0.9118880033493042, + "learning_rate": 0.0005679530935503632, + "epoch": 1.3 + }, + { + "loss": 8.5532, + "grad_norm": 1.2081084251403809, + "learning_rate": 0.0005670779732213179, + "epoch": 1.3 + }, + { + "loss": 8.5477, + "grad_norm": 1.3465925455093384, + "learning_rate": 0.0005662028528922727, + "epoch": 1.3 + }, + { + "loss": 9.2068, + "grad_norm": 0.8770077228546143, + "learning_rate": 0.0005653277325632274, + "epoch": 1.3 + }, + { + "loss": 8.6147, + "grad_norm": 1.1257092952728271, + "learning_rate": 0.0005644526122341822, + "epoch": 1.31 + }, + { + "loss": 8.4279, + "grad_norm": 1.0482877492904663, + "learning_rate": 0.000563577491905137, + "epoch": 1.31 + }, + { + "loss": 9.1236, + "grad_norm": 1.0635833740234375, + "learning_rate": 0.0005627023715760917, + "epoch": 1.31 + }, + { + "loss": 8.7325, + "grad_norm": 0.866674542427063, + "learning_rate": 0.0005618272512470465, + "epoch": 1.32 + }, + { + "loss": 8.3691, + "grad_norm": 0.9562137126922607, + "learning_rate": 0.0005609521309180012, + "epoch": 1.32 + }, + { + "loss": 8.3844, + "grad_norm": 1.2593939304351807, + "learning_rate": 0.000560077010588956, + "epoch": 1.32 + }, + { + "loss": 8.7797, + "grad_norm": 0.8865370154380798, + "learning_rate": 0.0005592018902599108, + "epoch": 1.32 + }, + { + "loss": 8.7078, + "grad_norm": 1.0417253971099854, + "learning_rate": 0.0005583267699308655, + "epoch": 1.33 + }, + { + "loss": 8.6024, + "grad_norm": 1.1513303518295288, + "learning_rate": 0.0005574516496018203, + "epoch": 1.33 + }, + { + "loss": 8.4373, + "grad_norm": 0.8727751970291138, + "learning_rate": 0.000556576529272775, + "epoch": 1.33 + }, + { + "loss": 8.2888, + "grad_norm": 1.0075277090072632, + "learning_rate": 0.0005557014089437298, + "epoch": 1.33 + }, + { + "loss": 8.465, + "grad_norm": 0.9511576294898987, + "learning_rate": 0.0005548262886146846, + "epoch": 1.34 + }, + { + "loss": 7.7129, + "grad_norm": 0.9443394541740417, + "learning_rate": 0.0005539511682856393, + "epoch": 1.34 + }, + { + "loss": 8.4521, + "grad_norm": 0.9932364225387573, + "learning_rate": 0.0005530760479565941, + "epoch": 1.34 + }, + { + "loss": 8.2593, + "grad_norm": 0.8069454431533813, + "learning_rate": 0.0005522009276275489, + "epoch": 1.34 + }, + { + "loss": 8.4721, + "grad_norm": 1.1227058172225952, + "learning_rate": 0.0005513258072985035, + "epoch": 1.35 + }, + { + "loss": 8.9954, + "grad_norm": 0.8359375596046448, + "learning_rate": 0.0005504506869694583, + "epoch": 1.35 + }, + { + "loss": 8.6039, + "grad_norm": 1.1721514463424683, + "learning_rate": 0.000549575566640413, + "epoch": 1.35 + }, + { + "loss": 7.8393, + "grad_norm": 1.031473994255066, + "learning_rate": 0.0005487004463113678, + "epoch": 1.35 + }, + { + "loss": 7.8643, + "grad_norm": 0.935614287853241, + "learning_rate": 0.0005478253259823226, + "epoch": 1.36 + }, + { + "loss": 8.4271, + "grad_norm": 0.9366902709007263, + "learning_rate": 0.0005469502056532773, + "epoch": 1.36 + }, + { + "loss": 8.3338, + "grad_norm": 0.9616496562957764, + "learning_rate": 0.0005460750853242321, + "epoch": 1.36 + }, + { + "loss": 8.1388, + "grad_norm": 2.2672061920166016, + "learning_rate": 0.0005451999649951868, + "epoch": 1.36 + }, + { + "loss": 8.879, + "grad_norm": 1.948036789894104, + "learning_rate": 0.0005443248446661416, + "epoch": 1.37 + }, + { + "loss": 8.8816, + "grad_norm": 1.0832654237747192, + "learning_rate": 0.0005434497243370964, + "epoch": 1.37 + }, + { + "loss": 8.5489, + "grad_norm": 0.9174715876579285, + "learning_rate": 0.0005425746040080511, + "epoch": 1.37 + }, + { + "loss": 8.8525, + "grad_norm": 0.8547096252441406, + "learning_rate": 0.0005416994836790059, + "epoch": 1.38 + }, + { + "loss": 8.6111, + "grad_norm": 0.7524705529212952, + "learning_rate": 0.0005408243633499607, + "epoch": 1.38 + }, + { + "loss": 8.0862, + "grad_norm": 0.8433651328086853, + "learning_rate": 0.0005399492430209154, + "epoch": 1.38 + }, + { + "loss": 8.2379, + "grad_norm": 0.8744563460350037, + "learning_rate": 0.0005390741226918702, + "epoch": 1.38 + }, + { + "loss": 8.2883, + "grad_norm": 0.8806482553482056, + "learning_rate": 0.0005381990023628249, + "epoch": 1.39 + }, + { + "loss": 8.6411, + "grad_norm": 0.9276745319366455, + "learning_rate": 0.0005373238820337797, + "epoch": 1.39 + }, + { + "loss": 8.7561, + "grad_norm": 0.9556492567062378, + "learning_rate": 0.0005364487617047345, + "epoch": 1.39 + }, + { + "loss": 9.305, + "grad_norm": 0.8606293797492981, + "learning_rate": 0.0005355736413756892, + "epoch": 1.39 + }, + { + "loss": 8.3839, + "grad_norm": 1.108547329902649, + "learning_rate": 0.000534698521046644, + "epoch": 1.4 + }, + { + "loss": 8.2164, + "grad_norm": 0.9102107882499695, + "learning_rate": 0.0005338234007175988, + "epoch": 1.4 + }, + { + "loss": 8.606, + "grad_norm": 1.0984998941421509, + "learning_rate": 0.0005329482803885535, + "epoch": 1.4 + }, + { + "loss": 8.0491, + "grad_norm": 1.1762152910232544, + "learning_rate": 0.0005320731600595083, + "epoch": 1.4 + }, + { + "loss": 8.7257, + "grad_norm": 0.9669533371925354, + "learning_rate": 0.000531198039730463, + "epoch": 1.41 + }, + { + "loss": 8.4473, + "grad_norm": 1.0668437480926514, + "learning_rate": 0.0005303229194014178, + "epoch": 1.41 + }, + { + "loss": 8.1594, + "grad_norm": 0.8289794921875, + "learning_rate": 0.0005294477990723725, + "epoch": 1.41 + }, + { + "loss": 8.9208, + "grad_norm": 1.0676897764205933, + "learning_rate": 0.0005285726787433272, + "epoch": 1.41 + }, + { + "loss": 8.0344, + "grad_norm": 0.9914399981498718, + "learning_rate": 0.000527697558414282, + "epoch": 1.42 + }, + { + "loss": 7.9721, + "grad_norm": 0.7524304986000061, + "learning_rate": 0.0005268224380852367, + "epoch": 1.42 + }, + { + "loss": 8.5322, + "grad_norm": 0.9521943926811218, + "learning_rate": 0.0005259473177561915, + "epoch": 1.42 + }, + { + "loss": 8.1986, + "grad_norm": 0.9657976627349854, + "learning_rate": 0.0005250721974271463, + "epoch": 1.43 + }, + { + "loss": 8.476, + "grad_norm": 0.9338609576225281, + "learning_rate": 0.000524197077098101, + "epoch": 1.43 + }, + { + "loss": 8.0189, + "grad_norm": 0.8801831007003784, + "learning_rate": 0.0005233219567690558, + "epoch": 1.43 + }, + { + "loss": 8.0839, + "grad_norm": 0.8173283934593201, + "learning_rate": 0.0005224468364400105, + "epoch": 1.43 + }, + { + "loss": 8.3716, + "grad_norm": 0.8624017238616943, + "learning_rate": 0.0005215717161109653, + "epoch": 1.44 + }, + { + "loss": 8.2837, + "grad_norm": 0.8650451302528381, + "learning_rate": 0.0005206965957819201, + "epoch": 1.44 + }, + { + "loss": 7.889, + "grad_norm": 0.8268963098526001, + "learning_rate": 0.0005198214754528747, + "epoch": 1.44 + }, + { + "loss": 8.7807, + "grad_norm": 0.9244619607925415, + "learning_rate": 0.0005189463551238295, + "epoch": 1.44 + }, + { + "loss": 8.5503, + "grad_norm": 0.8533423542976379, + "learning_rate": 0.0005180712347947842, + "epoch": 1.45 + }, + { + "loss": 7.7895, + "grad_norm": 0.885784924030304, + "learning_rate": 0.000517196114465739, + "epoch": 1.45 + }, + { + "loss": 8.9325, + "grad_norm": 1.252569556236267, + "learning_rate": 0.0005163209941366938, + "epoch": 1.45 + }, + { + "loss": 7.6823, + "grad_norm": 0.9340423941612244, + "learning_rate": 0.0005154458738076485, + "epoch": 1.45 + }, + { + "loss": 8.5812, + "grad_norm": 1.1366244554519653, + "learning_rate": 0.0005145707534786033, + "epoch": 1.46 + }, + { + "loss": 8.1907, + "grad_norm": 0.6764490604400635, + "learning_rate": 0.0005136956331495581, + "epoch": 1.46 + }, + { + "loss": 8.7694, + "grad_norm": 0.7598670721054077, + "learning_rate": 0.0005128205128205128, + "epoch": 1.46 + }, + { + "loss": 8.4732, + "grad_norm": 1.1497093439102173, + "learning_rate": 0.0005119453924914676, + "epoch": 1.46 + }, + { + "loss": 7.9224, + "grad_norm": 0.8351478576660156, + "learning_rate": 0.0005110702721624223, + "epoch": 1.47 + }, + { + "loss": 8.253, + "grad_norm": 0.8981735706329346, + "learning_rate": 0.0005101951518333771, + "epoch": 1.47 + }, + { + "loss": 8.442, + "grad_norm": 0.910393238067627, + "learning_rate": 0.0005093200315043319, + "epoch": 1.47 + }, + { + "loss": 8.4128, + "grad_norm": 1.0419617891311646, + "learning_rate": 0.0005084449111752865, + "epoch": 1.48 + }, + { + "loss": 8.5377, + "grad_norm": 1.1774574518203735, + "learning_rate": 0.0005075697908462413, + "epoch": 1.48 + }, + { + "loss": 8.1727, + "grad_norm": 0.8679039478302002, + "learning_rate": 0.000506694670517196, + "epoch": 1.48 + }, + { + "loss": 8.2085, + "grad_norm": 0.8273195028305054, + "learning_rate": 0.0005058195501881508, + "epoch": 1.48 + }, + { + "loss": 9.0157, + "grad_norm": 1.0897700786590576, + "learning_rate": 0.0005049444298591056, + "epoch": 1.49 + }, + { + "loss": 8.5794, + "grad_norm": 1.19176185131073, + "learning_rate": 0.0005040693095300603, + "epoch": 1.49 + }, + { + "loss": 8.4796, + "grad_norm": 0.7944311499595642, + "learning_rate": 0.0005031941892010151, + "epoch": 1.49 + }, + { + "loss": 8.2379, + "grad_norm": 1.1032432317733765, + "learning_rate": 0.0005023190688719699, + "epoch": 1.49 + }, + { + "loss": 7.8506, + "grad_norm": 0.9756267070770264, + "learning_rate": 0.0005014439485429246, + "epoch": 1.5 + }, + { + "loss": 8.4113, + "grad_norm": 0.8557083010673523, + "learning_rate": 0.0005005688282138794, + "epoch": 1.5 + }, + { + "loss": 8.3315, + "grad_norm": 0.9195913672447205, + "learning_rate": 0.0004996937078848341, + "epoch": 1.5 + }, + { + "loss": 8.3911, + "grad_norm": 0.7430265545845032, + "learning_rate": 0.0004988185875557889, + "epoch": 1.5 + }, + { + "loss": 8.3471, + "grad_norm": 0.7685049176216125, + "learning_rate": 0.0004979434672267437, + "epoch": 1.51 + }, + { + "loss": 8.252, + "grad_norm": 0.9667441844940186, + "learning_rate": 0.0004970683468976984, + "epoch": 1.51 + }, + { + "loss": 7.9134, + "grad_norm": 0.878400981426239, + "learning_rate": 0.0004961932265686532, + "epoch": 1.51 + }, + { + "loss": 8.337, + "grad_norm": 0.8655962944030762, + "learning_rate": 0.000495318106239608, + "epoch": 1.51 + }, + { + "loss": 8.2066, + "grad_norm": 0.8063825964927673, + "learning_rate": 0.0004944429859105627, + "epoch": 1.52 + }, + { + "loss": 8.4102, + "grad_norm": 0.7918370962142944, + "learning_rate": 0.0004935678655815175, + "epoch": 1.52 + }, + { + "loss": 8.1297, + "grad_norm": 1.03073251247406, + "learning_rate": 0.0004926927452524722, + "epoch": 1.52 + }, + { + "loss": 8.296, + "grad_norm": 0.9369198679924011, + "learning_rate": 0.000491817624923427, + "epoch": 1.53 + }, + { + "loss": 7.8051, + "grad_norm": 0.9166183471679688, + "learning_rate": 0.0004909425045943818, + "epoch": 1.53 + }, + { + "loss": 8.0258, + "grad_norm": 0.8817450404167175, + "learning_rate": 0.0004900673842653365, + "epoch": 1.53 + }, + { + "loss": 7.9202, + "grad_norm": 1.0320311784744263, + "learning_rate": 0.0004891922639362913, + "epoch": 1.53 + }, + { + "loss": 8.6314, + "grad_norm": 0.9652658700942993, + "learning_rate": 0.000488317143607246, + "epoch": 1.54 + }, + { + "loss": 8.5648, + "grad_norm": 1.0785067081451416, + "learning_rate": 0.00048744202327820075, + "epoch": 1.54 + }, + { + "loss": 7.528, + "grad_norm": 1.0575002431869507, + "learning_rate": 0.0004865669029491555, + "epoch": 1.54 + }, + { + "loss": 7.9019, + "grad_norm": 0.8822360634803772, + "learning_rate": 0.0004856917826201103, + "epoch": 1.54 + }, + { + "loss": 8.2544, + "grad_norm": 0.7296998500823975, + "learning_rate": 0.00048481666229106504, + "epoch": 1.55 + }, + { + "loss": 8.5853, + "grad_norm": 0.925472617149353, + "learning_rate": 0.0004839415419620198, + "epoch": 1.55 + }, + { + "loss": 8.3512, + "grad_norm": 0.8641199469566345, + "learning_rate": 0.00048306642163297456, + "epoch": 1.55 + }, + { + "loss": 8.0277, + "grad_norm": 1.0501607656478882, + "learning_rate": 0.0004821913013039293, + "epoch": 1.55 + }, + { + "loss": 8.0559, + "grad_norm": 0.7827814221382141, + "learning_rate": 0.00048131618097488403, + "epoch": 1.56 + }, + { + "loss": 8.0869, + "grad_norm": 0.929253339767456, + "learning_rate": 0.0004804410606458388, + "epoch": 1.56 + }, + { + "loss": 8.2206, + "grad_norm": 0.9882745742797852, + "learning_rate": 0.00047956594031679355, + "epoch": 1.56 + }, + { + "loss": 8.8141, + "grad_norm": 0.874455988407135, + "learning_rate": 0.0004786908199877483, + "epoch": 1.56 + }, + { + "loss": 8.403, + "grad_norm": 1.1270105838775635, + "learning_rate": 0.0004778156996587031, + "epoch": 1.57 + }, + { + "loss": 8.7545, + "grad_norm": 0.7236598134040833, + "learning_rate": 0.00047694057932965784, + "epoch": 1.57 + }, + { + "loss": 8.3653, + "grad_norm": 0.8243849873542786, + "learning_rate": 0.0004760654590006126, + "epoch": 1.57 + }, + { + "loss": 8.0057, + "grad_norm": 0.9829972386360168, + "learning_rate": 0.00047519033867156736, + "epoch": 1.57 + }, + { + "loss": 7.7738, + "grad_norm": 1.1444923877716064, + "learning_rate": 0.0004743152183425221, + "epoch": 1.58 + }, + { + "loss": 7.9619, + "grad_norm": 1.1846139430999756, + "learning_rate": 0.0004734400980134769, + "epoch": 1.58 + }, + { + "loss": 8.8667, + "grad_norm": 0.9437428712844849, + "learning_rate": 0.00047256497768443165, + "epoch": 1.58 + }, + { + "loss": 8.2367, + "grad_norm": 0.8670662641525269, + "learning_rate": 0.0004716898573553864, + "epoch": 1.59 + }, + { + "loss": 7.5306, + "grad_norm": 0.823538064956665, + "learning_rate": 0.00047081473702634117, + "epoch": 1.59 + }, + { + "loss": 8.0832, + "grad_norm": 0.8938249349594116, + "learning_rate": 0.0004699396166972959, + "epoch": 1.59 + }, + { + "loss": 7.7995, + "grad_norm": 0.8147523999214172, + "learning_rate": 0.00046906449636825064, + "epoch": 1.59 + }, + { + "loss": 8.2207, + "grad_norm": 0.7885489463806152, + "learning_rate": 0.0004681893760392054, + "epoch": 1.6 + }, + { + "loss": 8.3315, + "grad_norm": 0.9256998300552368, + "learning_rate": 0.00046731425571016016, + "epoch": 1.6 + }, + { + "loss": 7.8139, + "grad_norm": 0.7331977486610413, + "learning_rate": 0.0004664391353811149, + "epoch": 1.6 + }, + { + "loss": 8.2015, + "grad_norm": 0.7677296996116638, + "learning_rate": 0.0004655640150520697, + "epoch": 1.6 + }, + { + "loss": 8.114, + "grad_norm": 1.066036343574524, + "learning_rate": 0.00046468889472302445, + "epoch": 1.61 + }, + { + "loss": 8.3314, + "grad_norm": 0.7969563603401184, + "learning_rate": 0.0004638137743939792, + "epoch": 1.61 + }, + { + "loss": 8.4266, + "grad_norm": 0.8454012274742126, + "learning_rate": 0.000462938654064934, + "epoch": 1.61 + }, + { + "loss": 8.0451, + "grad_norm": 1.049949288368225, + "learning_rate": 0.00046206353373588874, + "epoch": 1.61 + }, + { + "loss": 7.8993, + "grad_norm": 0.8960159420967102, + "learning_rate": 0.0004611884134068435, + "epoch": 1.62 + }, + { + "loss": 8.4117, + "grad_norm": 1.0029221773147583, + "learning_rate": 0.00046031329307779826, + "epoch": 1.62 + }, + { + "loss": 7.9899, + "grad_norm": 1.0616450309753418, + "learning_rate": 0.0004594381727487529, + "epoch": 1.62 + }, + { + "loss": 7.9134, + "grad_norm": 0.8082576990127563, + "learning_rate": 0.0004585630524197077, + "epoch": 1.62 + }, + { + "loss": 8.1685, + "grad_norm": 0.9529896974563599, + "learning_rate": 0.00045768793209066244, + "epoch": 1.63 + }, + { + "loss": 8.7919, + "grad_norm": 0.7967125773429871, + "learning_rate": 0.0004568128117616172, + "epoch": 1.63 + }, + { + "loss": 8.4375, + "grad_norm": 0.8775154948234558, + "learning_rate": 0.00045593769143257196, + "epoch": 1.63 + }, + { + "loss": 8.559, + "grad_norm": 0.782707929611206, + "learning_rate": 0.0004550625711035267, + "epoch": 1.64 + }, + { + "loss": 8.4288, + "grad_norm": 0.7907795310020447, + "learning_rate": 0.0004541874507744815, + "epoch": 1.64 + }, + { + "loss": 8.5237, + "grad_norm": 1.0685423612594604, + "learning_rate": 0.00045331233044543625, + "epoch": 1.64 + }, + { + "loss": 8.4464, + "grad_norm": 1.1534669399261475, + "learning_rate": 0.000452437210116391, + "epoch": 1.64 + }, + { + "loss": 7.8577, + "grad_norm": 0.7411785125732422, + "learning_rate": 0.00045156208978734577, + "epoch": 1.65 + }, + { + "loss": 7.8189, + "grad_norm": 0.87079256772995, + "learning_rate": 0.00045068696945830053, + "epoch": 1.65 + }, + { + "loss": 8.1193, + "grad_norm": 0.9850463271141052, + "learning_rate": 0.0004498118491292553, + "epoch": 1.65 + }, + { + "loss": 7.9457, + "grad_norm": 0.8739660978317261, + "learning_rate": 0.00044893672880021, + "epoch": 1.65 + }, + { + "loss": 7.728, + "grad_norm": 0.8551336526870728, + "learning_rate": 0.00044806160847116476, + "epoch": 1.66 + }, + { + "loss": 8.8456, + "grad_norm": 0.8609566688537598, + "learning_rate": 0.0004471864881421195, + "epoch": 1.66 + }, + { + "loss": 8.0812, + "grad_norm": 0.7449157238006592, + "learning_rate": 0.0004463113678130743, + "epoch": 1.66 + }, + { + "loss": 8.0729, + "grad_norm": 0.8253002762794495, + "learning_rate": 0.00044543624748402905, + "epoch": 1.66 + }, + { + "loss": 8.4942, + "grad_norm": 0.8349846601486206, + "learning_rate": 0.0004445611271549838, + "epoch": 1.67 + }, + { + "loss": 8.3446, + "grad_norm": 0.9881287813186646, + "learning_rate": 0.00044368600682593857, + "epoch": 1.67 + }, + { + "loss": 7.3313, + "grad_norm": 0.863059401512146, + "learning_rate": 0.00044281088649689333, + "epoch": 1.67 + }, + { + "loss": 8.4412, + "grad_norm": 0.9246751666069031, + "learning_rate": 0.0004419357661678481, + "epoch": 1.67 + }, + { + "loss": 8.4511, + "grad_norm": 0.7963143587112427, + "learning_rate": 0.00044106064583880286, + "epoch": 1.68 + }, + { + "loss": 7.8743, + "grad_norm": 1.0088573694229126, + "learning_rate": 0.0004401855255097576, + "epoch": 1.68 + }, + { + "loss": 8.0994, + "grad_norm": 0.7680083513259888, + "learning_rate": 0.0004393104051807124, + "epoch": 1.68 + }, + { + "loss": 7.8712, + "grad_norm": 0.8324389457702637, + "learning_rate": 0.00043843528485166714, + "epoch": 1.69 + }, + { + "loss": 7.8454, + "grad_norm": 0.9649554491043091, + "learning_rate": 0.00043756016452262185, + "epoch": 1.69 + }, + { + "loss": 7.925, + "grad_norm": 0.7881239652633667, + "learning_rate": 0.0004366850441935766, + "epoch": 1.69 + }, + { + "loss": 7.9826, + "grad_norm": 1.2129865884780884, + "learning_rate": 0.0004358099238645314, + "epoch": 1.69 + }, + { + "loss": 8.3911, + "grad_norm": 0.7000688910484314, + "learning_rate": 0.00043493480353548614, + "epoch": 1.7 + }, + { + "loss": 7.9635, + "grad_norm": 0.7449495196342468, + "learning_rate": 0.0004340596832064409, + "epoch": 1.7 + }, + { + "loss": 7.8492, + "grad_norm": 0.7399414777755737, + "learning_rate": 0.00043318456287739566, + "epoch": 1.7 + }, + { + "loss": 8.5288, + "grad_norm": 1.0965951681137085, + "learning_rate": 0.0004323094425483504, + "epoch": 1.7 + }, + { + "loss": 8.0104, + "grad_norm": 0.8990981578826904, + "learning_rate": 0.0004314343222193052, + "epoch": 1.71 + }, + { + "loss": 7.8636, + "grad_norm": 0.8695485591888428, + "learning_rate": 0.00043055920189025994, + "epoch": 1.71 + }, + { + "loss": 7.9194, + "grad_norm": 0.7813265919685364, + "learning_rate": 0.0004296840815612147, + "epoch": 1.71 + }, + { + "loss": 8.4535, + "grad_norm": 0.7645956873893738, + "learning_rate": 0.00042880896123216947, + "epoch": 1.71 + }, + { + "loss": 7.8434, + "grad_norm": 1.0397326946258545, + "learning_rate": 0.00042793384090312423, + "epoch": 1.72 + }, + { + "loss": 7.8072, + "grad_norm": 0.9630481004714966, + "learning_rate": 0.00042705872057407894, + "epoch": 1.72 + }, + { + "loss": 8.327, + "grad_norm": 0.7939698696136475, + "learning_rate": 0.0004261836002450337, + "epoch": 1.72 + }, + { + "loss": 8.2467, + "grad_norm": 1.0103453397750854, + "learning_rate": 0.00042530847991598846, + "epoch": 1.72 + }, + { + "loss": 7.63, + "grad_norm": 0.9281976819038391, + "learning_rate": 0.0004244333595869432, + "epoch": 1.73 + }, + { + "loss": 7.7603, + "grad_norm": 0.7895064949989319, + "learning_rate": 0.000423558239257898, + "epoch": 1.73 + }, + { + "loss": 7.6725, + "grad_norm": 0.7491249442100525, + "learning_rate": 0.00042268311892885275, + "epoch": 1.73 + }, + { + "loss": 8.0813, + "grad_norm": 0.7357456088066101, + "learning_rate": 0.0004218079985998075, + "epoch": 1.74 + }, + { + "loss": 8.1603, + "grad_norm": 0.8232001066207886, + "learning_rate": 0.00042093287827076227, + "epoch": 1.74 + }, + { + "loss": 8.172, + "grad_norm": 0.7846309542655945, + "learning_rate": 0.00042005775794171703, + "epoch": 1.74 + }, + { + "loss": 8.2372, + "grad_norm": 0.9100042581558228, + "learning_rate": 0.00041918263761267174, + "epoch": 1.74 + }, + { + "loss": 7.8489, + "grad_norm": 0.9496660828590393, + "learning_rate": 0.0004183075172836265, + "epoch": 1.75 + }, + { + "loss": 7.7246, + "grad_norm": 0.7061757445335388, + "learning_rate": 0.00041743239695458126, + "epoch": 1.75 + }, + { + "loss": 7.988, + "grad_norm": 0.9927607774734497, + "learning_rate": 0.00041655727662553597, + "epoch": 1.75 + }, + { + "loss": 7.9562, + "grad_norm": 0.8585007190704346, + "learning_rate": 0.00041568215629649073, + "epoch": 1.75 + }, + { + "loss": 8.1105, + "grad_norm": 1.0176628828048706, + "learning_rate": 0.0004148070359674455, + "epoch": 1.76 + }, + { + "loss": 7.7869, + "grad_norm": 0.8576889038085938, + "learning_rate": 0.00041393191563840026, + "epoch": 1.76 + }, + { + "loss": 7.7945, + "grad_norm": 0.8359828591346741, + "learning_rate": 0.000413056795309355, + "epoch": 1.76 + }, + { + "loss": 7.9683, + "grad_norm": 0.8636084794998169, + "learning_rate": 0.0004121816749803098, + "epoch": 1.76 + }, + { + "loss": 8.3303, + "grad_norm": 0.9006314873695374, + "learning_rate": 0.00041130655465126454, + "epoch": 1.77 + }, + { + "loss": 8.1457, + "grad_norm": 1.217007040977478, + "learning_rate": 0.0004104314343222193, + "epoch": 1.77 + }, + { + "loss": 8.6171, + "grad_norm": 1.0577572584152222, + "learning_rate": 0.00040955631399317407, + "epoch": 1.77 + }, + { + "loss": 7.9349, + "grad_norm": 0.9530831575393677, + "learning_rate": 0.00040868119366412883, + "epoch": 1.77 + }, + { + "loss": 8.2722, + "grad_norm": 0.9652631282806396, + "learning_rate": 0.0004078060733350836, + "epoch": 1.78 + }, + { + "loss": 8.185, + "grad_norm": 0.7349383234977722, + "learning_rate": 0.00040693095300603835, + "epoch": 1.78 + }, + { + "loss": 7.3944, + "grad_norm": 1.122018814086914, + "learning_rate": 0.0004060558326769931, + "epoch": 1.78 + }, + { + "loss": 7.8828, + "grad_norm": 0.96207195520401, + "learning_rate": 0.0004051807123479478, + "epoch": 1.78 + }, + { + "loss": 8.1287, + "grad_norm": 0.833884060382843, + "learning_rate": 0.0004043055920189026, + "epoch": 1.79 + }, + { + "loss": 8.0382, + "grad_norm": 0.9089711904525757, + "learning_rate": 0.00040343047168985734, + "epoch": 1.79 + }, + { + "loss": 8.1137, + "grad_norm": 0.6977031230926514, + "learning_rate": 0.0004025553513608121, + "epoch": 1.79 + }, + { + "loss": 7.9215, + "grad_norm": 0.9814949631690979, + "learning_rate": 0.00040168023103176687, + "epoch": 1.8 + }, + { + "loss": 8.2266, + "grad_norm": 0.9767114520072937, + "learning_rate": 0.00040080511070272163, + "epoch": 1.8 + }, + { + "loss": 8.3445, + "grad_norm": 1.1093454360961914, + "learning_rate": 0.0003999299903736764, + "epoch": 1.8 + }, + { + "loss": 8.4239, + "grad_norm": 0.93362957239151, + "learning_rate": 0.00039905487004463115, + "epoch": 1.8 + }, + { + "loss": 8.2468, + "grad_norm": 0.9497604370117188, + "learning_rate": 0.0003981797497155859, + "epoch": 1.81 + }, + { + "loss": 7.8793, + "grad_norm": 0.8992236852645874, + "learning_rate": 0.0003973046293865407, + "epoch": 1.81 + }, + { + "loss": 7.8246, + "grad_norm": 0.9486469030380249, + "learning_rate": 0.00039642950905749544, + "epoch": 1.81 + }, + { + "loss": 8.6243, + "grad_norm": 0.970136284828186, + "learning_rate": 0.0003955543887284502, + "epoch": 1.81 + }, + { + "loss": 7.8859, + "grad_norm": 1.0090283155441284, + "learning_rate": 0.0003946792683994049, + "epoch": 1.82 + }, + { + "loss": 8.156, + "grad_norm": 0.9662021994590759, + "learning_rate": 0.00039380414807035967, + "epoch": 1.82 + }, + { + "loss": 7.7991, + "grad_norm": 0.8005274534225464, + "learning_rate": 0.00039292902774131443, + "epoch": 1.82 + }, + { + "loss": 7.8432, + "grad_norm": 0.8537503480911255, + "learning_rate": 0.0003920539074122692, + "epoch": 1.82 + }, + { + "loss": 7.7118, + "grad_norm": 0.8975428342819214, + "learning_rate": 0.00039117878708322396, + "epoch": 1.83 + }, + { + "loss": 8.0563, + "grad_norm": 0.9040714502334595, + "learning_rate": 0.0003903036667541787, + "epoch": 1.83 + }, + { + "loss": 8.005, + "grad_norm": 0.882514476776123, + "learning_rate": 0.0003894285464251335, + "epoch": 1.83 + }, + { + "loss": 7.99, + "grad_norm": 0.9527498483657837, + "learning_rate": 0.00038855342609608824, + "epoch": 1.83 + }, + { + "loss": 7.9497, + "grad_norm": 0.7327905893325806, + "learning_rate": 0.000387678305767043, + "epoch": 1.84 + }, + { + "loss": 8.1346, + "grad_norm": 0.9137473106384277, + "learning_rate": 0.00038680318543799776, + "epoch": 1.84 + }, + { + "loss": 7.266, + "grad_norm": 0.8273423910140991, + "learning_rate": 0.0003859280651089525, + "epoch": 1.84 + }, + { + "loss": 7.525, + "grad_norm": 1.2288787364959717, + "learning_rate": 0.0003850529447799073, + "epoch": 1.85 + }, + { + "loss": 8.5105, + "grad_norm": 0.7940724492073059, + "learning_rate": 0.00038417782445086205, + "epoch": 1.85 + }, + { + "loss": 8.0599, + "grad_norm": 0.9253759384155273, + "learning_rate": 0.00038330270412181676, + "epoch": 1.85 + }, + { + "loss": 7.1757, + "grad_norm": 0.8145419359207153, + "learning_rate": 0.0003824275837927715, + "epoch": 1.85 + }, + { + "loss": 7.6177, + "grad_norm": 1.1738182306289673, + "learning_rate": 0.0003815524634637263, + "epoch": 1.86 + }, + { + "loss": 7.6901, + "grad_norm": 0.9141517877578735, + "learning_rate": 0.00038067734313468104, + "epoch": 1.86 + }, + { + "loss": 7.7036, + "grad_norm": 1.0994611978530884, + "learning_rate": 0.00037980222280563575, + "epoch": 1.86 + }, + { + "loss": 7.9458, + "grad_norm": 0.8445936441421509, + "learning_rate": 0.0003789271024765905, + "epoch": 1.86 + }, + { + "loss": 7.6019, + "grad_norm": 0.8796238899230957, + "learning_rate": 0.0003780519821475453, + "epoch": 1.87 + }, + { + "loss": 7.7582, + "grad_norm": 0.7801417112350464, + "learning_rate": 0.00037717686181850004, + "epoch": 1.87 + }, + { + "loss": 7.8483, + "grad_norm": 1.008893609046936, + "learning_rate": 0.0003763017414894548, + "epoch": 1.87 + }, + { + "loss": 8.047, + "grad_norm": 0.8021620512008667, + "learning_rate": 0.00037542662116040956, + "epoch": 1.87 + }, + { + "loss": 8.2537, + "grad_norm": 0.919774055480957, + "learning_rate": 0.0003745515008313643, + "epoch": 1.88 + }, + { + "loss": 8.1101, + "grad_norm": 1.094642996788025, + "learning_rate": 0.0003736763805023191, + "epoch": 1.88 + }, + { + "loss": 7.9119, + "grad_norm": 1.0133185386657715, + "learning_rate": 0.0003728012601732738, + "epoch": 1.88 + }, + { + "loss": 7.9624, + "grad_norm": 0.7546307444572449, + "learning_rate": 0.00037192613984422855, + "epoch": 1.88 + }, + { + "loss": 7.9547, + "grad_norm": 0.7390889525413513, + "learning_rate": 0.0003710510195151833, + "epoch": 1.89 + }, + { + "loss": 7.7794, + "grad_norm": 0.9140797257423401, + "learning_rate": 0.0003701758991861381, + "epoch": 1.89 + }, + { + "loss": 8.0254, + "grad_norm": 0.8325345516204834, + "learning_rate": 0.00036930077885709284, + "epoch": 1.89 + }, + { + "loss": 7.7692, + "grad_norm": 1.228366732597351, + "learning_rate": 0.0003684256585280476, + "epoch": 1.9 + }, + { + "loss": 7.2768, + "grad_norm": 1.0541235208511353, + "learning_rate": 0.00036755053819900236, + "epoch": 1.9 + }, + { + "loss": 8.1104, + "grad_norm": 1.0765891075134277, + "learning_rate": 0.0003666754178699571, + "epoch": 1.9 + }, + { + "loss": 7.5317, + "grad_norm": 0.9508135914802551, + "learning_rate": 0.0003658002975409119, + "epoch": 1.9 + }, + { + "loss": 7.1908, + "grad_norm": 0.7984021306037903, + "learning_rate": 0.00036492517721186665, + "epoch": 1.91 + }, + { + "loss": 7.8423, + "grad_norm": 1.0381263494491577, + "learning_rate": 0.0003640500568828214, + "epoch": 1.91 + }, + { + "loss": 8.297, + "grad_norm": 0.9509484171867371, + "learning_rate": 0.00036317493655377617, + "epoch": 1.91 + }, + { + "loss": 7.7339, + "grad_norm": 0.8926167488098145, + "learning_rate": 0.0003622998162247309, + "epoch": 1.91 + }, + { + "loss": 7.9, + "grad_norm": 1.0550678968429565, + "learning_rate": 0.00036142469589568564, + "epoch": 1.92 + }, + { + "loss": 7.6175, + "grad_norm": 0.9359092712402344, + "learning_rate": 0.0003605495755666404, + "epoch": 1.92 + }, + { + "loss": 8.0818, + "grad_norm": 0.735281765460968, + "learning_rate": 0.00035967445523759516, + "epoch": 1.92 + }, + { + "loss": 8.1061, + "grad_norm": 0.8289329409599304, + "learning_rate": 0.0003587993349085499, + "epoch": 1.92 + }, + { + "loss": 7.3778, + "grad_norm": 0.7723102569580078, + "learning_rate": 0.0003579242145795047, + "epoch": 1.93 + }, + { + "loss": 7.853, + "grad_norm": 0.7856701612472534, + "learning_rate": 0.00035704909425045945, + "epoch": 1.93 + }, + { + "loss": 8.5133, + "grad_norm": 0.7649736404418945, + "learning_rate": 0.0003561739739214142, + "epoch": 1.93 + }, + { + "loss": 8.4676, + "grad_norm": 0.6755172610282898, + "learning_rate": 0.000355298853592369, + "epoch": 1.93 + }, + { + "loss": 8.2074, + "grad_norm": 0.8537729382514954, + "learning_rate": 0.00035442373326332374, + "epoch": 1.94 + }, + { + "loss": 8.249, + "grad_norm": 0.9827852845191956, + "learning_rate": 0.0003535486129342785, + "epoch": 1.94 + }, + { + "loss": 8.4107, + "grad_norm": 1.2670233249664307, + "learning_rate": 0.00035267349260523326, + "epoch": 1.94 + }, + { + "loss": 8.1578, + "grad_norm": 0.8494543433189392, + "learning_rate": 0.000351798372276188, + "epoch": 1.95 + }, + { + "loss": 7.9296, + "grad_norm": 0.8582159876823425, + "learning_rate": 0.00035092325194714273, + "epoch": 1.95 + }, + { + "loss": 7.4592, + "grad_norm": 0.8539626598358154, + "learning_rate": 0.0003500481316180975, + "epoch": 1.95 + }, + { + "loss": 8.1603, + "grad_norm": 0.9004923701286316, + "learning_rate": 0.00034917301128905225, + "epoch": 1.95 + }, + { + "loss": 8.1319, + "grad_norm": 0.722870945930481, + "learning_rate": 0.000348297890960007, + "epoch": 1.96 + }, + { + "loss": 7.791, + "grad_norm": 0.9422692656517029, + "learning_rate": 0.0003474227706309618, + "epoch": 1.96 + }, + { + "loss": 8.0631, + "grad_norm": 1.2248715162277222, + "learning_rate": 0.00034654765030191654, + "epoch": 1.96 + }, + { + "loss": 8.3269, + "grad_norm": 1.370082974433899, + "learning_rate": 0.0003456725299728713, + "epoch": 1.96 + }, + { + "loss": 7.7562, + "grad_norm": 1.0009835958480835, + "learning_rate": 0.00034479740964382606, + "epoch": 1.97 + }, + { + "loss": 7.4909, + "grad_norm": 0.9207608103752136, + "learning_rate": 0.0003439222893147808, + "epoch": 1.97 + }, + { + "loss": 7.2907, + "grad_norm": 1.0351985692977905, + "learning_rate": 0.0003430471689857356, + "epoch": 1.97 + }, + { + "loss": 7.9972, + "grad_norm": 0.9398946762084961, + "learning_rate": 0.00034217204865669035, + "epoch": 1.97 + }, + { + "loss": 7.6034, + "grad_norm": 0.8558303713798523, + "learning_rate": 0.0003412969283276451, + "epoch": 1.98 + }, + { + "loss": 8.3452, + "grad_norm": 0.8279830813407898, + "learning_rate": 0.0003404218079985998, + "epoch": 1.98 + }, + { + "loss": 8.3979, + "grad_norm": 0.7496762275695801, + "learning_rate": 0.0003395466876695545, + "epoch": 1.98 + }, + { + "loss": 7.5979, + "grad_norm": 0.865039587020874, + "learning_rate": 0.0003386715673405093, + "epoch": 1.98 + }, + { + "loss": 7.7027, + "grad_norm": 0.7518277764320374, + "learning_rate": 0.00033779644701146405, + "epoch": 1.99 + }, + { + "loss": 7.8756, + "grad_norm": 0.8984577059745789, + "learning_rate": 0.0003369213266824188, + "epoch": 1.99 + }, + { + "loss": 7.4597, + "grad_norm": 0.7312489151954651, + "learning_rate": 0.00033604620635337357, + "epoch": 1.99 + }, + { + "loss": 7.8173, + "grad_norm": 0.8688482046127319, + "learning_rate": 0.00033517108602432833, + "epoch": 1.99 + }, + { + "loss": 7.6772, + "grad_norm": 0.9117947816848755, + "learning_rate": 0.0003342959656952831, + "epoch": 2.0 + }, + { + "loss": 7.65, + "grad_norm": 1.044518232345581, + "learning_rate": 0.00033342084536623786, + "epoch": 2.0 + }, + { + "loss": 7.6424, + "grad_norm": 0.8763852119445801, + "learning_rate": 0.0003325457250371926, + "epoch": 2.0 + }, + { + "loss": 8.1303, + "grad_norm": 1.2922908067703247, + "learning_rate": 0.0003316706047081474, + "epoch": 2.01 + }, + { + "loss": 8.3256, + "grad_norm": 0.7980864644050598, + "learning_rate": 0.00033079548437910214, + "epoch": 2.01 + }, + { + "loss": 7.7353, + "grad_norm": 0.8062283396720886, + "learning_rate": 0.00032992036405005685, + "epoch": 2.01 + }, + { + "loss": 8.2314, + "grad_norm": 0.9204174280166626, + "learning_rate": 0.0003290452437210116, + "epoch": 2.01 + }, + { + "loss": 7.5946, + "grad_norm": 0.7235244512557983, + "learning_rate": 0.0003281701233919664, + "epoch": 2.02 + }, + { + "loss": 7.4673, + "grad_norm": 0.8126214146614075, + "learning_rate": 0.00032729500306292114, + "epoch": 2.02 + }, + { + "loss": 7.6391, + "grad_norm": 0.7648585438728333, + "learning_rate": 0.0003264198827338759, + "epoch": 2.02 + }, + { + "loss": 8.005, + "grad_norm": 0.7453392148017883, + "learning_rate": 0.00032554476240483066, + "epoch": 2.02 + }, + { + "loss": 7.8703, + "grad_norm": 0.8830775022506714, + "learning_rate": 0.0003246696420757854, + "epoch": 2.03 + }, + { + "loss": 7.8639, + "grad_norm": 1.2337687015533447, + "learning_rate": 0.0003237945217467402, + "epoch": 2.03 + }, + { + "loss": 7.8224, + "grad_norm": 1.0393247604370117, + "learning_rate": 0.00032291940141769494, + "epoch": 2.03 + }, + { + "loss": 7.7573, + "grad_norm": 0.7463309168815613, + "learning_rate": 0.0003220442810886497, + "epoch": 2.03 + }, + { + "loss": 8.3318, + "grad_norm": 0.8722276091575623, + "learning_rate": 0.00032116916075960447, + "epoch": 2.04 + }, + { + "loss": 8.0517, + "grad_norm": 0.9069348573684692, + "learning_rate": 0.00032029404043055923, + "epoch": 2.04 + }, + { + "loss": 7.9696, + "grad_norm": 0.7715663909912109, + "learning_rate": 0.000319418920101514, + "epoch": 2.04 + }, + { + "loss": 7.7113, + "grad_norm": 0.8788508176803589, + "learning_rate": 0.0003185437997724687, + "epoch": 2.04 + }, + { + "loss": 7.5771, + "grad_norm": 1.057786226272583, + "learning_rate": 0.00031766867944342346, + "epoch": 2.05 + }, + { + "loss": 7.985, + "grad_norm": 1.2888935804367065, + "learning_rate": 0.0003167935591143782, + "epoch": 2.05 + }, + { + "loss": 7.5748, + "grad_norm": 0.8100298047065735, + "learning_rate": 0.000315918438785333, + "epoch": 2.05 + }, + { + "loss": 7.7785, + "grad_norm": 0.9130757451057434, + "learning_rate": 0.00031504331845628775, + "epoch": 2.06 + }, + { + "loss": 7.3718, + "grad_norm": 0.895447313785553, + "learning_rate": 0.0003141681981272425, + "epoch": 2.06 + }, + { + "loss": 8.0138, + "grad_norm": 0.8260514736175537, + "learning_rate": 0.00031329307779819727, + "epoch": 2.06 + }, + { + "loss": 7.6438, + "grad_norm": 0.9353188276290894, + "learning_rate": 0.00031241795746915203, + "epoch": 2.06 + }, + { + "loss": 7.9212, + "grad_norm": 0.8095923066139221, + "learning_rate": 0.0003115428371401068, + "epoch": 2.07 + }, + { + "loss": 8.2193, + "grad_norm": 0.8156134486198425, + "learning_rate": 0.00031066771681106156, + "epoch": 2.07 + }, + { + "loss": 7.6264, + "grad_norm": 0.9613614082336426, + "learning_rate": 0.0003097925964820163, + "epoch": 2.07 + }, + { + "loss": 7.6684, + "grad_norm": 0.8426281809806824, + "learning_rate": 0.0003089174761529711, + "epoch": 2.07 + }, + { + "loss": 7.7356, + "grad_norm": 0.8271446824073792, + "learning_rate": 0.0003080423558239258, + "epoch": 2.08 + }, + { + "loss": 7.8816, + "grad_norm": 0.9108027219772339, + "learning_rate": 0.00030716723549488055, + "epoch": 2.08 + }, + { + "loss": 8.5754, + "grad_norm": 0.8285607099533081, + "learning_rate": 0.0003062921151658353, + "epoch": 2.08 + }, + { + "loss": 7.8875, + "grad_norm": 0.79032963514328, + "learning_rate": 0.00030541699483679007, + "epoch": 2.08 + }, + { + "loss": 7.4168, + "grad_norm": 0.8623600602149963, + "learning_rate": 0.00030454187450774483, + "epoch": 2.09 + }, + { + "loss": 7.546, + "grad_norm": 0.8102550506591797, + "learning_rate": 0.0003036667541786996, + "epoch": 2.09 + }, + { + "loss": 7.9269, + "grad_norm": 1.0298386812210083, + "learning_rate": 0.00030279163384965436, + "epoch": 2.09 + }, + { + "loss": 7.6682, + "grad_norm": 0.8902001976966858, + "learning_rate": 0.0003019165135206091, + "epoch": 2.09 + }, + { + "loss": 8.0309, + "grad_norm": 0.831743597984314, + "learning_rate": 0.0003010413931915639, + "epoch": 2.1 + }, + { + "loss": 8.157, + "grad_norm": 0.8056457042694092, + "learning_rate": 0.00030016627286251864, + "epoch": 2.1 + }, + { + "loss": 7.6514, + "grad_norm": 1.071753978729248, + "learning_rate": 0.00029929115253347335, + "epoch": 2.1 + }, + { + "loss": 7.8337, + "grad_norm": 0.8061104416847229, + "learning_rate": 0.0002984160322044281, + "epoch": 2.11 + }, + { + "loss": 7.8925, + "grad_norm": 1.1958301067352295, + "learning_rate": 0.0002975409118753828, + "epoch": 2.11 + }, + { + "loss": 6.9557, + "grad_norm": 0.7460314631462097, + "learning_rate": 0.0002966657915463376, + "epoch": 2.11 + }, + { + "loss": 7.724, + "grad_norm": 0.8949922323226929, + "learning_rate": 0.00029579067121729234, + "epoch": 2.11 + }, + { + "loss": 8.1209, + "grad_norm": 0.7350090146064758, + "learning_rate": 0.0002949155508882471, + "epoch": 2.12 + }, + { + "loss": 7.7897, + "grad_norm": 0.9530614018440247, + "learning_rate": 0.00029404043055920187, + "epoch": 2.12 + }, + { + "loss": 7.7916, + "grad_norm": 0.7030171155929565, + "learning_rate": 0.00029316531023015663, + "epoch": 2.12 + }, + { + "loss": 7.53, + "grad_norm": 0.8843898177146912, + "learning_rate": 0.0002922901899011114, + "epoch": 2.12 + }, + { + "loss": 7.5228, + "grad_norm": 0.9127951860427856, + "learning_rate": 0.00029141506957206615, + "epoch": 2.13 + }, + { + "loss": 7.423, + "grad_norm": 0.7194523811340332, + "learning_rate": 0.0002905399492430209, + "epoch": 2.13 + }, + { + "loss": 8.3464, + "grad_norm": 0.8251200318336487, + "learning_rate": 0.0002896648289139757, + "epoch": 2.13 + }, + { + "loss": 7.8906, + "grad_norm": 0.9383019804954529, + "learning_rate": 0.00028878970858493044, + "epoch": 2.13 + }, + { + "loss": 6.9917, + "grad_norm": 1.1721993684768677, + "learning_rate": 0.0002879145882558852, + "epoch": 2.14 + }, + { + "loss": 7.7154, + "grad_norm": 0.7905781865119934, + "learning_rate": 0.00028703946792683996, + "epoch": 2.14 + }, + { + "loss": 7.9272, + "grad_norm": 0.9261153936386108, + "learning_rate": 0.00028616434759779467, + "epoch": 2.14 + }, + { + "loss": 7.9141, + "grad_norm": 1.206111192703247, + "learning_rate": 0.00028528922726874943, + "epoch": 2.14 + }, + { + "loss": 7.9561, + "grad_norm": 0.8015759587287903, + "learning_rate": 0.0002844141069397042, + "epoch": 2.15 + }, + { + "loss": 7.6844, + "grad_norm": 0.970389723777771, + "learning_rate": 0.00028353898661065896, + "epoch": 2.15 + }, + { + "loss": 7.7312, + "grad_norm": 1.3079341650009155, + "learning_rate": 0.0002826638662816137, + "epoch": 2.15 + }, + { + "loss": 7.506, + "grad_norm": 0.8393199443817139, + "learning_rate": 0.0002817887459525685, + "epoch": 2.16 + }, + { + "loss": 7.3006, + "grad_norm": 0.9169728755950928, + "learning_rate": 0.00028091362562352324, + "epoch": 2.16 + }, + { + "loss": 7.5924, + "grad_norm": 0.8766190409660339, + "learning_rate": 0.000280038505294478, + "epoch": 2.16 + }, + { + "loss": 8.2074, + "grad_norm": 0.8473224639892578, + "learning_rate": 0.00027916338496543277, + "epoch": 2.16 + }, + { + "loss": 7.2028, + "grad_norm": 0.9415881037712097, + "learning_rate": 0.0002782882646363875, + "epoch": 2.17 + }, + { + "loss": 7.87, + "grad_norm": 0.8043491840362549, + "learning_rate": 0.0002774131443073423, + "epoch": 2.17 + }, + { + "loss": 8.5354, + "grad_norm": 0.9696796536445618, + "learning_rate": 0.00027653802397829705, + "epoch": 2.17 + }, + { + "loss": 8.1185, + "grad_norm": 0.9294397830963135, + "learning_rate": 0.00027566290364925176, + "epoch": 2.17 + }, + { + "loss": 7.8844, + "grad_norm": 1.0350419282913208, + "learning_rate": 0.0002747877833202065, + "epoch": 2.18 + }, + { + "loss": 7.9054, + "grad_norm": 1.086616039276123, + "learning_rate": 0.0002739126629911613, + "epoch": 2.18 + }, + { + "loss": 7.4362, + "grad_norm": 0.865028440952301, + "learning_rate": 0.00027303754266211604, + "epoch": 2.18 + }, + { + "loss": 7.4039, + "grad_norm": 0.8574273586273193, + "learning_rate": 0.0002721624223330708, + "epoch": 2.18 + }, + { + "loss": 8.0095, + "grad_norm": 1.0509589910507202, + "learning_rate": 0.00027128730200402557, + "epoch": 2.19 + }, + { + "loss": 7.6467, + "grad_norm": 0.7813432812690735, + "learning_rate": 0.00027041218167498033, + "epoch": 2.19 + }, + { + "loss": 7.4786, + "grad_norm": 0.855741560459137, + "learning_rate": 0.0002695370613459351, + "epoch": 2.19 + }, + { + "loss": 7.7862, + "grad_norm": 0.8451842069625854, + "learning_rate": 0.00026866194101688985, + "epoch": 2.19 + }, + { + "loss": 7.7616, + "grad_norm": 0.882211446762085, + "learning_rate": 0.0002677868206878446, + "epoch": 2.2 + }, + { + "loss": 8.1508, + "grad_norm": 0.7093100547790527, + "learning_rate": 0.0002669117003587994, + "epoch": 2.2 + }, + { + "loss": 7.8715, + "grad_norm": 0.9282416701316833, + "learning_rate": 0.00026603658002975414, + "epoch": 2.2 + }, + { + "loss": 7.6333, + "grad_norm": 0.8849425911903381, + "learning_rate": 0.0002651614597007089, + "epoch": 2.2 + }, + { + "loss": 7.624, + "grad_norm": 0.8789107203483582, + "learning_rate": 0.0002642863393716636, + "epoch": 2.21 + }, + { + "loss": 7.5042, + "grad_norm": 0.9759025573730469, + "learning_rate": 0.00026341121904261837, + "epoch": 2.21 + }, + { + "loss": 7.7317, + "grad_norm": 0.794627845287323, + "learning_rate": 0.00026253609871357313, + "epoch": 2.21 + }, + { + "loss": 7.4743, + "grad_norm": 1.3992342948913574, + "learning_rate": 0.0002616609783845279, + "epoch": 2.22 + }, + { + "loss": 7.5986, + "grad_norm": 0.8934722542762756, + "learning_rate": 0.00026078585805548266, + "epoch": 2.22 + }, + { + "loss": 7.7515, + "grad_norm": 1.0474205017089844, + "learning_rate": 0.00025991073772643736, + "epoch": 2.22 + }, + { + "loss": 7.0749, + "grad_norm": 0.7677063345909119, + "learning_rate": 0.0002590356173973921, + "epoch": 2.22 + }, + { + "loss": 7.7033, + "grad_norm": 0.8318948149681091, + "learning_rate": 0.0002581604970683469, + "epoch": 2.23 + }, + { + "loss": 7.775, + "grad_norm": 0.7674381136894226, + "learning_rate": 0.00025728537673930165, + "epoch": 2.23 + }, + { + "loss": 7.5289, + "grad_norm": 1.0669969320297241, + "learning_rate": 0.0002564102564102564, + "epoch": 2.23 + }, + { + "loss": 7.3784, + "grad_norm": 1.0004348754882812, + "learning_rate": 0.00025553513608121117, + "epoch": 2.23 + }, + { + "loss": 7.4305, + "grad_norm": 0.7937709093093872, + "learning_rate": 0.00025466001575216593, + "epoch": 2.24 + }, + { + "loss": 7.1845, + "grad_norm": 0.9088554382324219, + "learning_rate": 0.00025378489542312064, + "epoch": 2.24 + }, + { + "loss": 7.9313, + "grad_norm": 1.0221823453903198, + "learning_rate": 0.0002529097750940754, + "epoch": 2.24 + }, + { + "loss": 7.4251, + "grad_norm": 0.7980064153671265, + "learning_rate": 0.00025203465476503016, + "epoch": 2.24 + }, + { + "loss": 8.0494, + "grad_norm": 0.8470319509506226, + "learning_rate": 0.0002511595344359849, + "epoch": 2.25 + }, + { + "loss": 7.7765, + "grad_norm": 1.101785659790039, + "learning_rate": 0.0002502844141069397, + "epoch": 2.25 + }, + { + "loss": 7.8624, + "grad_norm": 0.8655755519866943, + "learning_rate": 0.00024940929377789445, + "epoch": 2.25 + }, + { + "loss": 7.6855, + "grad_norm": 1.0447689294815063, + "learning_rate": 0.0002485341734488492, + "epoch": 2.25 + }, + { + "loss": 7.7653, + "grad_norm": 0.9611648917198181, + "learning_rate": 0.000247659053119804, + "epoch": 2.26 + }, + { + "loss": 8.0705, + "grad_norm": 1.410849928855896, + "learning_rate": 0.00024678393279075874, + "epoch": 2.26 + }, + { + "loss": 7.8147, + "grad_norm": 0.9252009987831116, + "learning_rate": 0.0002459088124617135, + "epoch": 2.26 + }, + { + "loss": 7.9366, + "grad_norm": 0.899348258972168, + "learning_rate": 0.00024503369213266826, + "epoch": 2.27 + }, + { + "loss": 8.089, + "grad_norm": 0.7920341491699219, + "learning_rate": 0.000244158571803623, + "epoch": 2.27 + }, + { + "loss": 7.5066, + "grad_norm": 0.8289885520935059, + "learning_rate": 0.00024328345147457776, + "epoch": 2.27 + }, + { + "loss": 7.4402, + "grad_norm": 0.9304541349411011, + "learning_rate": 0.00024240833114553252, + "epoch": 2.27 + }, + { + "loss": 8.1004, + "grad_norm": 0.8798967003822327, + "learning_rate": 0.00024153321081648728, + "epoch": 2.28 + }, + { + "loss": 7.8528, + "grad_norm": 0.9733609557151794, + "learning_rate": 0.00024065809048744201, + "epoch": 2.28 + }, + { + "loss": 7.1178, + "grad_norm": 1.1248620748519897, + "learning_rate": 0.00023978297015839678, + "epoch": 2.28 + }, + { + "loss": 7.7862, + "grad_norm": 1.2658095359802246, + "learning_rate": 0.00023890784982935154, + "epoch": 2.28 + }, + { + "loss": 7.9395, + "grad_norm": 1.0820565223693848, + "learning_rate": 0.0002380327295003063, + "epoch": 2.29 + }, + { + "loss": 7.4596, + "grad_norm": 0.9462448954582214, + "learning_rate": 0.00023715760917126106, + "epoch": 2.29 + }, + { + "loss": 7.8461, + "grad_norm": 0.8025732636451721, + "learning_rate": 0.00023628248884221582, + "epoch": 2.29 + }, + { + "loss": 7.7102, + "grad_norm": 0.7947144508361816, + "learning_rate": 0.00023540736851317059, + "epoch": 2.29 + }, + { + "loss": 7.8149, + "grad_norm": 0.8819990158081055, + "learning_rate": 0.00023453224818412532, + "epoch": 2.3 + }, + { + "loss": 7.5168, + "grad_norm": 0.9773268103599548, + "learning_rate": 0.00023365712785508008, + "epoch": 2.3 + }, + { + "loss": 7.7338, + "grad_norm": 1.384716510772705, + "learning_rate": 0.00023278200752603484, + "epoch": 2.3 + }, + { + "loss": 6.9549, + "grad_norm": 1.1293810606002808, + "learning_rate": 0.0002319068871969896, + "epoch": 2.3 + }, + { + "loss": 7.8655, + "grad_norm": 0.7238449454307556, + "learning_rate": 0.00023103176686794437, + "epoch": 2.31 + }, + { + "loss": 7.7399, + "grad_norm": 0.8876301646232605, + "learning_rate": 0.00023015664653889913, + "epoch": 2.31 + }, + { + "loss": 7.5196, + "grad_norm": 0.7352742552757263, + "learning_rate": 0.00022928152620985384, + "epoch": 2.31 + }, + { + "loss": 8.0545, + "grad_norm": 1.0614981651306152, + "learning_rate": 0.0002284064058808086, + "epoch": 2.32 + }, + { + "loss": 7.8036, + "grad_norm": 0.999052882194519, + "learning_rate": 0.00022753128555176336, + "epoch": 2.32 + }, + { + "loss": 7.5506, + "grad_norm": 1.084981918334961, + "learning_rate": 0.00022665616522271812, + "epoch": 2.32 + }, + { + "loss": 7.7953, + "grad_norm": 1.110907793045044, + "learning_rate": 0.00022578104489367288, + "epoch": 2.32 + }, + { + "loss": 7.6064, + "grad_norm": 1.29153311252594, + "learning_rate": 0.00022490592456462765, + "epoch": 2.33 + }, + { + "loss": 7.9157, + "grad_norm": 1.5039303302764893, + "learning_rate": 0.00022403080423558238, + "epoch": 2.33 + }, + { + "loss": 7.5924, + "grad_norm": 0.850940465927124, + "learning_rate": 0.00022315568390653714, + "epoch": 2.33 + }, + { + "loss": 7.9425, + "grad_norm": 0.79768967628479, + "learning_rate": 0.0002222805635774919, + "epoch": 2.33 + }, + { + "loss": 8.0374, + "grad_norm": 0.771493673324585, + "learning_rate": 0.00022140544324844667, + "epoch": 2.34 + }, + { + "loss": 7.1645, + "grad_norm": 0.7525059580802917, + "learning_rate": 0.00022053032291940143, + "epoch": 2.34 + }, + { + "loss": 7.5769, + "grad_norm": 0.9684802293777466, + "learning_rate": 0.0002196552025903562, + "epoch": 2.34 + }, + { + "loss": 7.781, + "grad_norm": 1.1203564405441284, + "learning_rate": 0.00021878008226131092, + "epoch": 2.34 + }, + { + "loss": 7.4585, + "grad_norm": 1.0650273561477661, + "learning_rate": 0.0002179049619322657, + "epoch": 2.35 + }, + { + "loss": 7.7015, + "grad_norm": 0.9924284219741821, + "learning_rate": 0.00021702984160322045, + "epoch": 2.35 + }, + { + "loss": 7.572, + "grad_norm": 0.8644096255302429, + "learning_rate": 0.0002161547212741752, + "epoch": 2.35 + }, + { + "loss": 7.8879, + "grad_norm": 0.854030966758728, + "learning_rate": 0.00021527960094512997, + "epoch": 2.35 + }, + { + "loss": 7.842, + "grad_norm": 0.7271285653114319, + "learning_rate": 0.00021440448061608473, + "epoch": 2.36 + }, + { + "loss": 7.652, + "grad_norm": 0.6921567320823669, + "learning_rate": 0.00021352936028703947, + "epoch": 2.36 + }, + { + "loss": 7.8335, + "grad_norm": 1.2016472816467285, + "learning_rate": 0.00021265423995799423, + "epoch": 2.36 + }, + { + "loss": 7.5109, + "grad_norm": 0.79868084192276, + "learning_rate": 0.000211779119628949, + "epoch": 2.37 + }, + { + "loss": 7.3853, + "grad_norm": 0.8064858913421631, + "learning_rate": 0.00021090399929990375, + "epoch": 2.37 + }, + { + "loss": 7.6334, + "grad_norm": 0.9092600345611572, + "learning_rate": 0.00021002887897085852, + "epoch": 2.37 + }, + { + "loss": 7.9536, + "grad_norm": 1.0683679580688477, + "learning_rate": 0.00020915375864181325, + "epoch": 2.37 + }, + { + "loss": 7.7399, + "grad_norm": 1.1141338348388672, + "learning_rate": 0.00020827863831276799, + "epoch": 2.38 + }, + { + "loss": 7.9567, + "grad_norm": 0.9624096751213074, + "learning_rate": 0.00020740351798372275, + "epoch": 2.38 + }, + { + "loss": 8.1788, + "grad_norm": 0.7703258991241455, + "learning_rate": 0.0002065283976546775, + "epoch": 2.38 + }, + { + "loss": 7.8642, + "grad_norm": 0.9297539591789246, + "learning_rate": 0.00020565327732563227, + "epoch": 2.38 + }, + { + "loss": 7.4837, + "grad_norm": 0.7845075130462646, + "learning_rate": 0.00020477815699658703, + "epoch": 2.39 + }, + { + "loss": 7.5431, + "grad_norm": 0.8620021343231201, + "learning_rate": 0.0002039030366675418, + "epoch": 2.39 + }, + { + "loss": 7.7398, + "grad_norm": 0.8532699942588806, + "learning_rate": 0.00020302791633849656, + "epoch": 2.39 + }, + { + "loss": 7.7079, + "grad_norm": 1.1266266107559204, + "learning_rate": 0.0002021527960094513, + "epoch": 2.39 + }, + { + "loss": 7.8789, + "grad_norm": 1.003790020942688, + "learning_rate": 0.00020127767568040605, + "epoch": 2.4 + }, + { + "loss": 7.5108, + "grad_norm": 1.1769237518310547, + "learning_rate": 0.00020040255535136081, + "epoch": 2.4 + }, + { + "loss": 7.8151, + "grad_norm": 0.9078934192657471, + "learning_rate": 0.00019952743502231558, + "epoch": 2.4 + }, + { + "loss": 7.4092, + "grad_norm": 0.8376544713973999, + "learning_rate": 0.00019865231469327034, + "epoch": 2.4 + }, + { + "loss": 7.4658, + "grad_norm": 0.9094048738479614, + "learning_rate": 0.0001977771943642251, + "epoch": 2.41 + }, + { + "loss": 7.6113, + "grad_norm": 1.1345362663269043, + "learning_rate": 0.00019690207403517984, + "epoch": 2.41 + }, + { + "loss": 7.4598, + "grad_norm": 0.8164626955986023, + "learning_rate": 0.0001960269537061346, + "epoch": 2.41 + }, + { + "loss": 7.707, + "grad_norm": 1.125823736190796, + "learning_rate": 0.00019515183337708936, + "epoch": 2.41 + }, + { + "loss": 8.0873, + "grad_norm": 0.8651579022407532, + "learning_rate": 0.00019427671304804412, + "epoch": 2.42 + }, + { + "loss": 7.5421, + "grad_norm": 0.9041004776954651, + "learning_rate": 0.00019340159271899888, + "epoch": 2.42 + }, + { + "loss": 7.9615, + "grad_norm": 0.8012003302574158, + "learning_rate": 0.00019252647238995364, + "epoch": 2.42 + }, + { + "loss": 7.6728, + "grad_norm": 0.8691316246986389, + "learning_rate": 0.00019165135206090838, + "epoch": 2.43 + }, + { + "loss": 7.4882, + "grad_norm": 0.8700850605964661, + "learning_rate": 0.00019077623173186314, + "epoch": 2.43 + }, + { + "loss": 7.4824, + "grad_norm": 1.0540724992752075, + "learning_rate": 0.00018990111140281788, + "epoch": 2.43 + }, + { + "loss": 7.3133, + "grad_norm": 0.9065701365470886, + "learning_rate": 0.00018902599107377264, + "epoch": 2.43 + }, + { + "loss": 8.1036, + "grad_norm": 0.8794527649879456, + "learning_rate": 0.0001881508707447274, + "epoch": 2.44 + }, + { + "loss": 7.3707, + "grad_norm": 0.9155571460723877, + "learning_rate": 0.00018727575041568216, + "epoch": 2.44 + }, + { + "loss": 7.0801, + "grad_norm": 0.7177339792251587, + "learning_rate": 0.0001864006300866369, + "epoch": 2.44 + }, + { + "loss": 7.4368, + "grad_norm": 0.8027993440628052, + "learning_rate": 0.00018552550975759166, + "epoch": 2.44 + }, + { + "loss": 8.2545, + "grad_norm": 0.9770577549934387, + "learning_rate": 0.00018465038942854642, + "epoch": 2.45 + }, + { + "loss": 7.4767, + "grad_norm": 1.0428367853164673, + "learning_rate": 0.00018377526909950118, + "epoch": 2.45 + }, + { + "loss": 8.4641, + "grad_norm": 0.8214976787567139, + "learning_rate": 0.00018290014877045594, + "epoch": 2.45 + }, + { + "loss": 7.054, + "grad_norm": 1.1258653402328491, + "learning_rate": 0.0001820250284414107, + "epoch": 2.45 + }, + { + "loss": 7.5935, + "grad_norm": 1.07210373878479, + "learning_rate": 0.00018114990811236544, + "epoch": 2.46 + }, + { + "loss": 7.8104, + "grad_norm": 1.0441612005233765, + "learning_rate": 0.0001802747877833202, + "epoch": 2.46 + }, + { + "loss": 7.6147, + "grad_norm": 0.9820619821548462, + "learning_rate": 0.00017939966745427496, + "epoch": 2.46 + }, + { + "loss": 8.1347, + "grad_norm": 0.8725702166557312, + "learning_rate": 0.00017852454712522973, + "epoch": 2.46 + }, + { + "loss": 8.227, + "grad_norm": 0.8640567660331726, + "learning_rate": 0.0001776494267961845, + "epoch": 2.47 + }, + { + "loss": 7.392, + "grad_norm": 1.0909335613250732, + "learning_rate": 0.00017677430646713925, + "epoch": 2.47 + }, + { + "loss": 6.7634, + "grad_norm": 0.8133190274238586, + "learning_rate": 0.000175899186138094, + "epoch": 2.47 + }, + { + "loss": 7.802, + "grad_norm": 0.9833294749259949, + "learning_rate": 0.00017502406580904875, + "epoch": 2.48 + }, + { + "loss": 7.2764, + "grad_norm": 0.9594758152961731, + "learning_rate": 0.0001741489454800035, + "epoch": 2.48 + }, + { + "loss": 7.1931, + "grad_norm": 0.9970749616622925, + "learning_rate": 0.00017327382515095827, + "epoch": 2.48 + }, + { + "loss": 7.6539, + "grad_norm": 0.8486274480819702, + "learning_rate": 0.00017239870482191303, + "epoch": 2.48 + }, + { + "loss": 7.3367, + "grad_norm": 0.9591713547706604, + "learning_rate": 0.0001715235844928678, + "epoch": 2.49 + }, + { + "loss": 7.545, + "grad_norm": 1.1163291931152344, + "learning_rate": 0.00017064846416382255, + "epoch": 2.49 + }, + { + "loss": 8.3214, + "grad_norm": 0.8581505417823792, + "learning_rate": 0.00016977334383477726, + "epoch": 2.49 + }, + { + "loss": 7.1871, + "grad_norm": 0.8021834492683411, + "learning_rate": 0.00016889822350573202, + "epoch": 2.49 + }, + { + "loss": 7.8969, + "grad_norm": 0.9090090990066528, + "learning_rate": 0.00016802310317668679, + "epoch": 2.5 + }, + { + "loss": 7.37, + "grad_norm": 0.8283194303512573, + "learning_rate": 0.00016714798284764155, + "epoch": 2.5 + }, + { + "loss": 7.4669, + "grad_norm": 0.8183834552764893, + "learning_rate": 0.0001662728625185963, + "epoch": 2.5 + }, + { + "loss": 7.4362, + "grad_norm": 0.9701572060585022, + "learning_rate": 0.00016539774218955107, + "epoch": 2.5 + }, + { + "loss": 7.2859, + "grad_norm": 1.079610824584961, + "learning_rate": 0.0001645226218605058, + "epoch": 2.51 + }, + { + "loss": 8.0835, + "grad_norm": 0.8598064184188843, + "learning_rate": 0.00016364750153146057, + "epoch": 2.51 + }, + { + "loss": 7.6696, + "grad_norm": 0.8653038740158081, + "learning_rate": 0.00016277238120241533, + "epoch": 2.51 + }, + { + "loss": 7.6096, + "grad_norm": 1.0018919706344604, + "learning_rate": 0.0001618972608733701, + "epoch": 2.51 + }, + { + "loss": 7.7412, + "grad_norm": 0.8919802308082581, + "learning_rate": 0.00016102214054432485, + "epoch": 2.52 + }, + { + "loss": 7.4504, + "grad_norm": 0.8712960481643677, + "learning_rate": 0.00016014702021527962, + "epoch": 2.52 + }, + { + "loss": 8.1, + "grad_norm": 0.8894332647323608, + "learning_rate": 0.00015927189988623435, + "epoch": 2.52 + }, + { + "loss": 8.1017, + "grad_norm": 1.024781584739685, + "learning_rate": 0.0001583967795571891, + "epoch": 2.53 + }, + { + "loss": 7.6484, + "grad_norm": 0.9175984859466553, + "learning_rate": 0.00015752165922814387, + "epoch": 2.53 + }, + { + "loss": 7.3766, + "grad_norm": 0.9064013361930847, + "learning_rate": 0.00015664653889909864, + "epoch": 2.53 + }, + { + "loss": 7.6414, + "grad_norm": 0.9600405097007751, + "learning_rate": 0.0001557714185700534, + "epoch": 2.53 + }, + { + "loss": 7.2811, + "grad_norm": 0.9788243174552917, + "learning_rate": 0.00015489629824100816, + "epoch": 2.54 + }, + { + "loss": 7.3704, + "grad_norm": 0.8740330338478088, + "learning_rate": 0.0001540211779119629, + "epoch": 2.54 + }, + { + "loss": 7.5645, + "grad_norm": 0.8021050095558167, + "learning_rate": 0.00015314605758291766, + "epoch": 2.54 + }, + { + "loss": 7.9078, + "grad_norm": 1.0614405870437622, + "learning_rate": 0.00015227093725387242, + "epoch": 2.54 + }, + { + "loss": 7.3365, + "grad_norm": 0.8063251376152039, + "learning_rate": 0.00015139581692482718, + "epoch": 2.55 + }, + { + "loss": 7.8801, + "grad_norm": 0.8937615752220154, + "learning_rate": 0.00015052069659578194, + "epoch": 2.55 + }, + { + "loss": 8.0013, + "grad_norm": 0.9128641486167908, + "learning_rate": 0.00014964557626673668, + "epoch": 2.55 + }, + { + "loss": 8.1354, + "grad_norm": 0.8519286513328552, + "learning_rate": 0.0001487704559376914, + "epoch": 2.55 + }, + { + "loss": 7.6918, + "grad_norm": 0.9265363812446594, + "learning_rate": 0.00014789533560864617, + "epoch": 2.56 + }, + { + "loss": 7.7237, + "grad_norm": 1.113276720046997, + "learning_rate": 0.00014702021527960093, + "epoch": 2.56 + }, + { + "loss": 7.3281, + "grad_norm": 0.9011558890342712, + "learning_rate": 0.0001461450949505557, + "epoch": 2.56 + }, + { + "loss": 6.577, + "grad_norm": 0.990836501121521, + "learning_rate": 0.00014526997462151046, + "epoch": 2.56 + }, + { + "loss": 7.4641, + "grad_norm": 1.1346269845962524, + "learning_rate": 0.00014439485429246522, + "epoch": 2.57 + }, + { + "loss": 7.6071, + "grad_norm": 1.0057759284973145, + "learning_rate": 0.00014351973396341998, + "epoch": 2.57 + }, + { + "loss": 8.0022, + "grad_norm": 0.8524260520935059, + "learning_rate": 0.00014264461363437472, + "epoch": 2.57 + }, + { + "loss": 7.4056, + "grad_norm": 0.7590330839157104, + "learning_rate": 0.00014176949330532948, + "epoch": 2.58 + }, + { + "loss": 7.9487, + "grad_norm": 1.2074108123779297, + "learning_rate": 0.00014089437297628424, + "epoch": 2.58 + }, + { + "loss": 7.7237, + "grad_norm": 0.9621999263763428, + "learning_rate": 0.000140019252647239, + "epoch": 2.58 + }, + { + "loss": 7.2588, + "grad_norm": 0.843911349773407, + "learning_rate": 0.00013914413231819376, + "epoch": 2.58 + }, + { + "loss": 7.1697, + "grad_norm": 0.7619708180427551, + "learning_rate": 0.00013826901198914853, + "epoch": 2.59 + }, + { + "loss": 7.2903, + "grad_norm": 1.0736790895462036, + "learning_rate": 0.00013739389166010326, + "epoch": 2.59 + }, + { + "loss": 7.9193, + "grad_norm": 1.03206467628479, + "learning_rate": 0.00013651877133105802, + "epoch": 2.59 + }, + { + "loss": 8.182, + "grad_norm": 0.9106431603431702, + "learning_rate": 0.00013564365100201278, + "epoch": 2.59 + }, + { + "loss": 7.5139, + "grad_norm": 0.9506519436836243, + "learning_rate": 0.00013476853067296755, + "epoch": 2.6 + }, + { + "loss": 7.9055, + "grad_norm": 0.859704852104187, + "learning_rate": 0.0001338934103439223, + "epoch": 2.6 + }, + { + "loss": 8.0011, + "grad_norm": 0.9628238677978516, + "learning_rate": 0.00013301829001487707, + "epoch": 2.6 + }, + { + "loss": 7.4412, + "grad_norm": 0.8472156524658203, + "learning_rate": 0.0001321431696858318, + "epoch": 2.6 + }, + { + "loss": 7.6981, + "grad_norm": 0.9454402327537537, + "learning_rate": 0.00013126804935678657, + "epoch": 2.61 + }, + { + "loss": 7.4101, + "grad_norm": 0.8925793766975403, + "learning_rate": 0.00013039292902774133, + "epoch": 2.61 + }, + { + "loss": 7.1784, + "grad_norm": 0.8468560576438904, + "learning_rate": 0.00012951780869869606, + "epoch": 2.61 + }, + { + "loss": 7.6655, + "grad_norm": 0.8432177901268005, + "learning_rate": 0.00012864268836965082, + "epoch": 2.61 + }, + { + "loss": 7.4518, + "grad_norm": 0.813543438911438, + "learning_rate": 0.00012776756804060559, + "epoch": 2.62 + }, + { + "loss": 7.5661, + "grad_norm": 1.134985327720642, + "learning_rate": 0.00012689244771156032, + "epoch": 2.62 + }, + { + "loss": 7.3611, + "grad_norm": 1.05497407913208, + "learning_rate": 0.00012601732738251508, + "epoch": 2.62 + }, + { + "loss": 7.8965, + "grad_norm": 1.0532019138336182, + "learning_rate": 0.00012514220705346984, + "epoch": 2.62 + }, + { + "loss": 8.113, + "grad_norm": 1.0708712339401245, + "learning_rate": 0.0001242670867244246, + "epoch": 2.63 + }, + { + "loss": 7.4782, + "grad_norm": 1.1848175525665283, + "learning_rate": 0.00012339196639537937, + "epoch": 2.63 + }, + { + "loss": 7.761, + "grad_norm": 0.9672744870185852, + "learning_rate": 0.00012251684606633413, + "epoch": 2.63 + }, + { + "loss": 7.8212, + "grad_norm": 1.2713532447814941, + "learning_rate": 0.00012164172573728888, + "epoch": 2.64 + }, + { + "loss": 7.5313, + "grad_norm": 1.026662826538086, + "learning_rate": 0.00012076660540824364, + "epoch": 2.64 + }, + { + "loss": 7.9599, + "grad_norm": 0.8448575139045715, + "learning_rate": 0.00011989148507919839, + "epoch": 2.64 + }, + { + "loss": 7.7506, + "grad_norm": 1.041380524635315, + "learning_rate": 0.00011901636475015315, + "epoch": 2.64 + }, + { + "loss": 7.9023, + "grad_norm": 0.8197987675666809, + "learning_rate": 0.00011814124442110791, + "epoch": 2.65 + }, + { + "loss": 7.4913, + "grad_norm": 0.918388307094574, + "learning_rate": 0.00011726612409206266, + "epoch": 2.65 + }, + { + "loss": 7.8685, + "grad_norm": 0.9161803722381592, + "learning_rate": 0.00011639100376301742, + "epoch": 2.65 + }, + { + "loss": 7.5855, + "grad_norm": 0.8994104266166687, + "learning_rate": 0.00011551588343397218, + "epoch": 2.65 + }, + { + "loss": 7.6488, + "grad_norm": 0.8985808491706848, + "learning_rate": 0.00011464076310492692, + "epoch": 2.66 + }, + { + "loss": 7.5261, + "grad_norm": 0.9975460767745972, + "learning_rate": 0.00011376564277588168, + "epoch": 2.66 + }, + { + "loss": 7.9051, + "grad_norm": 1.051378607749939, + "learning_rate": 0.00011289052244683644, + "epoch": 2.66 + }, + { + "loss": 7.2571, + "grad_norm": 1.017866611480713, + "learning_rate": 0.00011201540211779119, + "epoch": 2.66 + }, + { + "loss": 7.6068, + "grad_norm": 1.1010361909866333, + "learning_rate": 0.00011114028178874595, + "epoch": 2.67 + }, + { + "loss": 7.6306, + "grad_norm": 0.9585467576980591, + "learning_rate": 0.00011026516145970071, + "epoch": 2.67 + }, + { + "loss": 7.3702, + "grad_norm": 0.9484645128250122, + "learning_rate": 0.00010939004113065546, + "epoch": 2.67 + }, + { + "loss": 7.4482, + "grad_norm": 1.0726372003555298, + "learning_rate": 0.00010851492080161022, + "epoch": 2.67 + }, + { + "loss": 7.7554, + "grad_norm": 0.8078585863113403, + "learning_rate": 0.00010763980047256499, + "epoch": 2.68 + }, + { + "loss": 7.3881, + "grad_norm": 0.9488946199417114, + "learning_rate": 0.00010676468014351973, + "epoch": 2.68 + }, + { + "loss": 7.7557, + "grad_norm": 0.8590677976608276, + "learning_rate": 0.0001058895598144745, + "epoch": 2.68 + }, + { + "loss": 7.2128, + "grad_norm": 0.8768866062164307, + "learning_rate": 0.00010501443948542926, + "epoch": 2.69 + }, + { + "loss": 7.6447, + "grad_norm": 1.1127121448516846, + "learning_rate": 0.00010413931915638399, + "epoch": 2.69 + }, + { + "loss": 7.7283, + "grad_norm": 0.7706397771835327, + "learning_rate": 0.00010326419882733875, + "epoch": 2.69 + }, + { + "loss": 7.8187, + "grad_norm": 0.910484254360199, + "learning_rate": 0.00010238907849829352, + "epoch": 2.69 + }, + { + "loss": 6.9677, + "grad_norm": 0.8292771577835083, + "learning_rate": 0.00010151395816924828, + "epoch": 2.7 + }, + { + "loss": 7.7939, + "grad_norm": 1.2936872243881226, + "learning_rate": 0.00010063883784020303, + "epoch": 2.7 + }, + { + "loss": 7.2773, + "grad_norm": 1.050876259803772, + "learning_rate": 9.976371751115779e-05, + "epoch": 2.7 + }, + { + "loss": 7.6461, + "grad_norm": 1.0275306701660156, + "learning_rate": 9.888859718211255e-05, + "epoch": 2.7 + }, + { + "loss": 7.4058, + "grad_norm": 0.9414623379707336, + "learning_rate": 9.80134768530673e-05, + "epoch": 2.71 + }, + { + "loss": 7.4938, + "grad_norm": 0.8367570042610168, + "learning_rate": 9.713835652402206e-05, + "epoch": 2.71 + }, + { + "loss": 7.4702, + "grad_norm": 0.9100292325019836, + "learning_rate": 9.626323619497682e-05, + "epoch": 2.71 + }, + { + "loss": 7.4209, + "grad_norm": 0.881262481212616, + "learning_rate": 9.538811586593157e-05, + "epoch": 2.71 + }, + { + "loss": 7.568, + "grad_norm": 1.0841021537780762, + "learning_rate": 9.451299553688632e-05, + "epoch": 2.72 + }, + { + "loss": 7.4385, + "grad_norm": 0.8553777933120728, + "learning_rate": 9.363787520784108e-05, + "epoch": 2.72 + }, + { + "loss": 7.7745, + "grad_norm": 0.8244187235832214, + "learning_rate": 9.276275487879583e-05, + "epoch": 2.72 + }, + { + "loss": 7.3427, + "grad_norm": 1.0330350399017334, + "learning_rate": 9.188763454975059e-05, + "epoch": 2.72 + }, + { + "loss": 7.4313, + "grad_norm": 0.86846524477005, + "learning_rate": 9.101251422070535e-05, + "epoch": 2.73 + }, + { + "loss": 7.6994, + "grad_norm": 1.0151475667953491, + "learning_rate": 9.01373938916601e-05, + "epoch": 2.73 + }, + { + "loss": 7.6994, + "grad_norm": 0.8053341507911682, + "learning_rate": 8.926227356261486e-05, + "epoch": 2.73 + }, + { + "loss": 7.8782, + "grad_norm": 0.917957067489624, + "learning_rate": 8.838715323356962e-05, + "epoch": 2.74 + }, + { + "loss": 7.5889, + "grad_norm": 1.556181788444519, + "learning_rate": 8.751203290452437e-05, + "epoch": 2.74 + }, + { + "loss": 7.6279, + "grad_norm": 1.043771743774414, + "learning_rate": 8.663691257547913e-05, + "epoch": 2.74 + }, + { + "loss": 7.8682, + "grad_norm": 1.1640032529830933, + "learning_rate": 8.57617922464339e-05, + "epoch": 2.74 + }, + { + "loss": 7.8918, + "grad_norm": 0.8830235600471497, + "learning_rate": 8.488667191738863e-05, + "epoch": 2.75 + }, + { + "loss": 7.5466, + "grad_norm": 0.958690345287323, + "learning_rate": 8.401155158834339e-05, + "epoch": 2.75 + }, + { + "loss": 7.5439, + "grad_norm": 1.1970360279083252, + "learning_rate": 8.313643125929815e-05, + "epoch": 2.75 + }, + { + "loss": 8.1002, + "grad_norm": 0.9388788938522339, + "learning_rate": 8.22613109302529e-05, + "epoch": 2.75 + }, + { + "loss": 7.6892, + "grad_norm": 1.0798841714859009, + "learning_rate": 8.138619060120766e-05, + "epoch": 2.76 + }, + { + "loss": 8.0534, + "grad_norm": 1.2909208536148071, + "learning_rate": 8.051107027216243e-05, + "epoch": 2.76 + }, + { + "loss": 7.3369, + "grad_norm": 1.272641658782959, + "learning_rate": 7.963594994311717e-05, + "epoch": 2.76 + }, + { + "loss": 7.5785, + "grad_norm": 0.9654033780097961, + "learning_rate": 7.876082961407194e-05, + "epoch": 2.76 + }, + { + "loss": 7.8078, + "grad_norm": 0.8423277139663696, + "learning_rate": 7.78857092850267e-05, + "epoch": 2.77 + }, + { + "loss": 7.8086, + "grad_norm": 0.9509181380271912, + "learning_rate": 7.701058895598145e-05, + "epoch": 2.77 + }, + { + "loss": 8.1405, + "grad_norm": 0.9167718291282654, + "learning_rate": 7.613546862693621e-05, + "epoch": 2.77 + }, + { + "loss": 7.7728, + "grad_norm": 0.9845168590545654, + "learning_rate": 7.526034829789097e-05, + "epoch": 2.77 + }, + { + "loss": 7.4146, + "grad_norm": 0.9597529768943787, + "learning_rate": 7.43852279688457e-05, + "epoch": 2.78 + }, + { + "loss": 7.0711, + "grad_norm": 1.0068391561508179, + "learning_rate": 7.351010763980047e-05, + "epoch": 2.78 + }, + { + "loss": 7.1173, + "grad_norm": 0.8510629534721375, + "learning_rate": 7.263498731075523e-05, + "epoch": 2.78 + }, + { + "loss": 7.1843, + "grad_norm": 0.8737899661064148, + "learning_rate": 7.175986698170999e-05, + "epoch": 2.79 + }, + { + "loss": 7.376, + "grad_norm": 0.9045628905296326, + "learning_rate": 7.088474665266474e-05, + "epoch": 2.79 + }, + { + "loss": 7.4447, + "grad_norm": 0.8932380080223083, + "learning_rate": 7.00096263236195e-05, + "epoch": 2.79 + }, + { + "loss": 7.362, + "grad_norm": 0.8961164951324463, + "learning_rate": 6.913450599457426e-05, + "epoch": 2.79 + }, + { + "loss": 7.4237, + "grad_norm": 1.0015422105789185, + "learning_rate": 6.825938566552901e-05, + "epoch": 2.8 + }, + { + "loss": 7.2541, + "grad_norm": 0.9842544198036194, + "learning_rate": 6.738426533648377e-05, + "epoch": 2.8 + }, + { + "loss": 8.0427, + "grad_norm": 1.0375638008117676, + "learning_rate": 6.650914500743853e-05, + "epoch": 2.8 + }, + { + "loss": 7.4801, + "grad_norm": 0.9552834630012512, + "learning_rate": 6.563402467839328e-05, + "epoch": 2.8 + }, + { + "loss": 7.8596, + "grad_norm": 0.8038078546524048, + "learning_rate": 6.475890434934803e-05, + "epoch": 2.81 + }, + { + "loss": 7.2653, + "grad_norm": 0.8008092045783997, + "learning_rate": 6.388378402030279e-05, + "epoch": 2.81 + }, + { + "loss": 7.159, + "grad_norm": 1.087442398071289, + "learning_rate": 6.300866369125754e-05, + "epoch": 2.81 + }, + { + "loss": 7.0556, + "grad_norm": 1.0442233085632324, + "learning_rate": 6.21335433622123e-05, + "epoch": 2.81 + }, + { + "loss": 7.162, + "grad_norm": 1.0271589756011963, + "learning_rate": 6.125842303316706e-05, + "epoch": 2.82 + }, + { + "loss": 7.5864, + "grad_norm": 0.9957409501075745, + "learning_rate": 6.038330270412182e-05, + "epoch": 2.82 + }, + { + "loss": 8.4511, + "grad_norm": 0.870765745639801, + "learning_rate": 5.9508182375076575e-05, + "epoch": 2.82 + }, + { + "loss": 7.9488, + "grad_norm": 0.8632308840751648, + "learning_rate": 5.863306204603133e-05, + "epoch": 2.82 + }, + { + "loss": 8.1216, + "grad_norm": 1.1113914251327515, + "learning_rate": 5.775794171698609e-05, + "epoch": 2.83 + }, + { + "loss": 7.7049, + "grad_norm": 0.9410499334335327, + "learning_rate": 5.688282138794084e-05, + "epoch": 2.83 + }, + { + "loss": 7.6916, + "grad_norm": 0.8908835053443909, + "learning_rate": 5.6007701058895595e-05, + "epoch": 2.83 + }, + { + "loss": 7.5659, + "grad_norm": 0.7924339175224304, + "learning_rate": 5.513258072985036e-05, + "epoch": 2.83 + }, + { + "loss": 7.4359, + "grad_norm": 0.8098507523536682, + "learning_rate": 5.425746040080511e-05, + "epoch": 2.84 + }, + { + "loss": 7.4043, + "grad_norm": 0.8541660904884338, + "learning_rate": 5.338234007175987e-05, + "epoch": 2.84 + }, + { + "loss": 7.5664, + "grad_norm": 0.9474323987960815, + "learning_rate": 5.250721974271463e-05, + "epoch": 2.84 + }, + { + "loss": 7.7903, + "grad_norm": 1.0568387508392334, + "learning_rate": 5.163209941366938e-05, + "epoch": 2.85 + }, + { + "loss": 7.4216, + "grad_norm": 0.9031184315681458, + "learning_rate": 5.075697908462414e-05, + "epoch": 2.85 + }, + { + "loss": 7.5944, + "grad_norm": 0.8136922121047974, + "learning_rate": 4.9881858755578894e-05, + "epoch": 2.85 + }, + { + "loss": 7.6272, + "grad_norm": 1.1002339124679565, + "learning_rate": 4.900673842653365e-05, + "epoch": 2.85 + }, + { + "loss": 7.5846, + "grad_norm": 1.2232916355133057, + "learning_rate": 4.813161809748841e-05, + "epoch": 2.86 + }, + { + "loss": 7.8478, + "grad_norm": 0.8891430497169495, + "learning_rate": 4.725649776844316e-05, + "epoch": 2.86 + }, + { + "loss": 7.3, + "grad_norm": 0.9129414558410645, + "learning_rate": 4.6381377439397914e-05, + "epoch": 2.86 + }, + { + "loss": 7.4529, + "grad_norm": 0.7938532829284668, + "learning_rate": 4.5506257110352676e-05, + "epoch": 2.86 + }, + { + "loss": 7.2803, + "grad_norm": 0.9501358270645142, + "learning_rate": 4.463113678130743e-05, + "epoch": 2.87 + }, + { + "loss": 7.5943, + "grad_norm": 1.0423897504806519, + "learning_rate": 4.3756016452262186e-05, + "epoch": 2.87 + }, + { + "loss": 7.2376, + "grad_norm": 0.9883305430412292, + "learning_rate": 4.288089612321695e-05, + "epoch": 2.87 + }, + { + "loss": 7.6255, + "grad_norm": 0.9974358677864075, + "learning_rate": 4.2005775794171696e-05, + "epoch": 2.87 + }, + { + "loss": 7.2739, + "grad_norm": 0.9481905102729797, + "learning_rate": 4.113065546512645e-05, + "epoch": 2.88 + }, + { + "loss": 8.0077, + "grad_norm": 1.067797064781189, + "learning_rate": 4.025553513608121e-05, + "epoch": 2.88 + }, + { + "loss": 7.5522, + "grad_norm": 0.8410007953643799, + "learning_rate": 3.938041480703597e-05, + "epoch": 2.88 + }, + { + "loss": 7.5854, + "grad_norm": 0.847583532333374, + "learning_rate": 3.8505294477990723e-05, + "epoch": 2.88 + }, + { + "loss": 7.2142, + "grad_norm": 1.0279533863067627, + "learning_rate": 3.7630174148945485e-05, + "epoch": 2.89 + }, + { + "loss": 7.4712, + "grad_norm": 1.1256965398788452, + "learning_rate": 3.6755053819900234e-05, + "epoch": 2.89 + }, + { + "loss": 7.7947, + "grad_norm": 1.0278571844100952, + "learning_rate": 3.5879933490854995e-05, + "epoch": 2.89 + }, + { + "loss": 7.3523, + "grad_norm": 0.9609654545783997, + "learning_rate": 3.500481316180975e-05, + "epoch": 2.9 + }, + { + "loss": 7.3334, + "grad_norm": 0.8453736901283264, + "learning_rate": 3.4129692832764505e-05, + "epoch": 2.9 + }, + { + "loss": 7.177, + "grad_norm": 0.8161653280258179, + "learning_rate": 3.325457250371927e-05, + "epoch": 2.9 + }, + { + "loss": 7.9061, + "grad_norm": 0.9861032366752625, + "learning_rate": 3.2379452174674016e-05, + "epoch": 2.9 + }, + { + "loss": 7.3155, + "grad_norm": 1.1409838199615479, + "learning_rate": 3.150433184562877e-05, + "epoch": 2.91 + }, + { + "loss": 7.7667, + "grad_norm": 0.8848074078559875, + "learning_rate": 3.062921151658353e-05, + "epoch": 2.91 + }, + { + "loss": 7.2722, + "grad_norm": 0.8996227979660034, + "learning_rate": 2.9754091187538288e-05, + "epoch": 2.91 + }, + { + "loss": 7.4819, + "grad_norm": 0.9429714679718018, + "learning_rate": 2.8878970858493046e-05, + "epoch": 2.91 + }, + { + "loss": 7.4154, + "grad_norm": 1.1169899702072144, + "learning_rate": 2.8003850529447798e-05, + "epoch": 2.92 + }, + { + "loss": 7.7691, + "grad_norm": 0.8326570987701416, + "learning_rate": 2.7128730200402556e-05, + "epoch": 2.92 + }, + { + "loss": 7.6196, + "grad_norm": 0.9243487119674683, + "learning_rate": 2.6253609871357314e-05, + "epoch": 2.92 + }, + { + "loss": 7.6529, + "grad_norm": 0.8465039730072021, + "learning_rate": 2.537848954231207e-05, + "epoch": 2.92 + }, + { + "loss": 7.2623, + "grad_norm": 1.0216766595840454, + "learning_rate": 2.4503369213266825e-05, + "epoch": 2.93 + }, + { + "loss": 7.5628, + "grad_norm": 0.9314711689949036, + "learning_rate": 2.362824888422158e-05, + "epoch": 2.93 + }, + { + "loss": 7.9252, + "grad_norm": 0.8769168853759766, + "learning_rate": 2.2753128555176338e-05, + "epoch": 2.93 + }, + { + "loss": 7.2971, + "grad_norm": 0.8925982713699341, + "learning_rate": 2.1878008226131093e-05, + "epoch": 2.93 + }, + { + "loss": 7.1022, + "grad_norm": 0.937786340713501, + "learning_rate": 2.1002887897085848e-05, + "epoch": 2.94 + }, + { + "loss": 7.5253, + "grad_norm": 0.900693416595459, + "learning_rate": 2.0127767568040607e-05, + "epoch": 2.94 + }, + { + "loss": 7.5837, + "grad_norm": 0.9113482236862183, + "learning_rate": 1.9252647238995362e-05, + "epoch": 2.94 + }, + { + "loss": 7.7925, + "grad_norm": 0.8734735250473022, + "learning_rate": 1.8377526909950117e-05, + "epoch": 2.95 + }, + { + "loss": 7.5821, + "grad_norm": 0.8616068959236145, + "learning_rate": 1.7502406580904875e-05, + "epoch": 2.95 + }, + { + "loss": 6.7659, + "grad_norm": 0.8509213328361511, + "learning_rate": 1.6627286251859634e-05, + "epoch": 2.95 + }, + { + "loss": 7.9045, + "grad_norm": 0.8518444895744324, + "learning_rate": 1.5752165922814385e-05, + "epoch": 2.95 + }, + { + "loss": 7.2314, + "grad_norm": 1.1429413557052612, + "learning_rate": 1.4877045593769144e-05, + "epoch": 2.96 + }, + { + "loss": 7.5707, + "grad_norm": 0.825677752494812, + "learning_rate": 1.4001925264723899e-05, + "epoch": 2.96 + }, + { + "loss": 7.2231, + "grad_norm": 0.9227612018585205, + "learning_rate": 1.3126804935678657e-05, + "epoch": 2.96 + }, + { + "loss": 7.3015, + "grad_norm": 0.9745140671730042, + "learning_rate": 1.2251684606633412e-05, + "epoch": 2.96 + }, + { + "loss": 7.5931, + "grad_norm": 0.8096091151237488, + "learning_rate": 1.1376564277588169e-05, + "epoch": 2.97 + }, + { + "loss": 7.2393, + "grad_norm": 0.9233807921409607, + "learning_rate": 1.0501443948542924e-05, + "epoch": 2.97 + }, + { + "loss": 7.2871, + "grad_norm": 0.7690852880477905, + "learning_rate": 9.626323619497681e-06, + "epoch": 2.97 + }, + { + "loss": 7.5845, + "grad_norm": 0.882102370262146, + "learning_rate": 8.751203290452438e-06, + "epoch": 2.97 + }, + { + "loss": 7.2335, + "grad_norm": 0.887958288192749, + "learning_rate": 7.876082961407193e-06, + "epoch": 2.98 + }, + { + "loss": 7.5324, + "grad_norm": 0.8895597457885742, + "learning_rate": 7.000962632361949e-06, + "epoch": 2.98 + }, + { + "loss": 7.1145, + "grad_norm": 0.8137519955635071, + "learning_rate": 6.125842303316706e-06, + "epoch": 2.98 + }, + { + "loss": 7.411, + "grad_norm": 0.9460362195968628, + "learning_rate": 5.250721974271462e-06, + "epoch": 2.98 + }, + { + "loss": 7.6058, + "grad_norm": 0.9842742681503296, + "learning_rate": 4.375601645226219e-06, + "epoch": 2.99 + }, + { + "loss": 7.6927, + "grad_norm": 0.938562273979187, + "learning_rate": 3.5004813161809747e-06, + "epoch": 2.99 + }, + { + "loss": 7.6995, + "grad_norm": 0.9931243658065796, + "learning_rate": 2.625360987135731e-06, + "epoch": 2.99 + }, + { + "loss": 7.5572, + "grad_norm": 0.8916573524475098, + "learning_rate": 1.7502406580904874e-06, + "epoch": 3.0 + }, + { + "loss": 7.2556, + "grad_norm": 0.780832052230835, + "learning_rate": 8.751203290452437e-07, + "epoch": 3.0 + }, + { + "train_runtime": 112786.1501, + "train_samples_per_second": 3.243, + "train_steps_per_second": 0.101, + "train_loss": 8.874524852365107, + "epoch": 3.0 + } +] \ No newline at end of file