diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,7124 +1,13067 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.7499861462585662, + "epoch": 0.24997764863656682, "eval_steps": 500, - "global_step": 20301, + "global_step": 1864, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0007388662098010602, - "grad_norm": 12.062633265062304, - "learning_rate": 4.999999984845559e-07, - "loss": 1.9234, + "epoch": 0.00013410818059901653, + "grad_norm": 6.53010672133618, + "learning_rate": 5.999999760325567e-07, + "loss": 1.7583, + "step": 1 + }, + { + "epoch": 0.00026821636119803307, + "grad_norm": 3.9699106842198337, + "learning_rate": 5.999999041302309e-07, + "loss": 1.6802, + "step": 2 + }, + { + "epoch": 0.0004023245417970496, + "grad_norm": 1.9790778060230643, + "learning_rate": 5.999997842930357e-07, + "loss": 1.7683, + "step": 3 + }, + { + "epoch": 0.0005364327223960661, + "grad_norm": 4.227652802101559, + "learning_rate": 5.999996165209921e-07, + "loss": 1.7059, + "step": 4 + }, + { + "epoch": 0.0006705409029950827, + "grad_norm": 3.569278124536831, + "learning_rate": 5.9999940081413e-07, + "loss": 1.7249, + "step": 5 + }, + { + "epoch": 0.0008046490835940993, + "grad_norm": 4.901107992602518, + "learning_rate": 5.999991371724877e-07, + "loss": 1.7577, + "step": 6 + }, + { + "epoch": 0.0009387572641931158, + "grad_norm": 1.7387118214106754, + "learning_rate": 5.999988255961119e-07, + "loss": 1.7158, + "step": 7 + }, + { + "epoch": 0.0010728654447921323, + "grad_norm": 1.809495324631513, + "learning_rate": 5.99998466085058e-07, + "loss": 1.7287, + "step": 8 + }, + { + "epoch": 0.0012069736253911489, + "grad_norm": 2.596203473961021, + "learning_rate": 5.999980586393898e-07, + "loss": 1.7724, + "step": 9 + }, + { + "epoch": 0.0013410818059901655, + "grad_norm": 2.024872932500911, + "learning_rate": 5.999976032591797e-07, + "loss": 1.7405, + "step": 10 + }, + { + "epoch": 0.001475189986589182, + "grad_norm": 1.8852498791545222, + "learning_rate": 5.999970999445085e-07, + "loss": 1.8083, + "step": 11 + }, + { + "epoch": 0.0016092981671881985, + "grad_norm": 2.789618405199575, + "learning_rate": 5.999965486954655e-07, + "loss": 1.7057, + "step": 12 + }, + { + "epoch": 0.0017434063477872151, + "grad_norm": 5.494954113770268, + "learning_rate": 5.999959495121485e-07, + "loss": 1.7091, + "step": 13 + }, + { + "epoch": 0.0018775145283862315, + "grad_norm": 2.79414084165035, + "learning_rate": 5.999953023946642e-07, + "loss": 1.7631, + "step": 14 + }, + { + "epoch": 0.002011622708985248, + "grad_norm": 4.805471519443609, + "learning_rate": 5.999946073431272e-07, + "loss": 1.8484, + "step": 15 + }, + { + "epoch": 0.0021457308895842645, + "grad_norm": 1.3908444469943815, + "learning_rate": 5.99993864357661e-07, + "loss": 1.7106, + "step": 16 + }, + { + "epoch": 0.002279839070183281, + "grad_norm": 1.9101877569067494, + "learning_rate": 5.999930734383974e-07, + "loss": 1.7213, + "step": 17 + }, + { + "epoch": 0.0024139472507822978, + "grad_norm": 1.547605953954358, + "learning_rate": 5.999922345854771e-07, + "loss": 1.7222, + "step": 18 + }, + { + "epoch": 0.0025480554313813144, + "grad_norm": 1.4545546480798652, + "learning_rate": 5.999913477990486e-07, + "loss": 1.6248, + "step": 19 + }, + { + "epoch": 0.002682163611980331, + "grad_norm": 1.3664919966665414, + "learning_rate": 5.999904130792696e-07, + "loss": 1.7481, "step": 20 }, { - "epoch": 0.0014777324196021205, - "grad_norm": 7.366383713647709, - "learning_rate": 4.999993938226169e-07, - "loss": 1.8338, - "step": 40 + "epoch": 0.002816271792579347, + "grad_norm": 1.2263286406299385, + "learning_rate": 5.999894304263061e-07, + "loss": 1.731, + "step": 21 + }, + { + "epoch": 0.002950379973178364, + "grad_norm": 1.4441675215823284, + "learning_rate": 5.999883998403325e-07, + "loss": 1.7489, + "step": 22 + }, + { + "epoch": 0.0030844881537773804, + "grad_norm": 2.076565725996637, + "learning_rate": 5.999873213215316e-07, + "loss": 1.609, + "step": 23 + }, + { + "epoch": 0.003218596334376397, + "grad_norm": 2.056874040951704, + "learning_rate": 5.999861948700952e-07, + "loss": 1.7387, + "step": 24 + }, + { + "epoch": 0.0033527045149754136, + "grad_norm": 1.196618920130671, + "learning_rate": 5.99985020486223e-07, + "loss": 1.6522, + "step": 25 + }, + { + "epoch": 0.0034868126955744302, + "grad_norm": 1.4295779403436433, + "learning_rate": 5.999837981701236e-07, + "loss": 1.7226, + "step": 26 + }, + { + "epoch": 0.0036209208761734464, + "grad_norm": 1.22926449530156, + "learning_rate": 5.99982527922014e-07, + "loss": 1.699, + "step": 27 + }, + { + "epoch": 0.003755029056772463, + "grad_norm": 1.324357519929758, + "learning_rate": 5.999812097421198e-07, + "loss": 1.784, + "step": 28 + }, + { + "epoch": 0.0038891372373714797, + "grad_norm": 1.351746272995911, + "learning_rate": 5.999798436306748e-07, + "loss": 1.7094, + "step": 29 + }, + { + "epoch": 0.004023245417970496, + "grad_norm": 1.3696717018122837, + "learning_rate": 5.999784295879217e-07, + "loss": 1.8113, + "step": 30 + }, + { + "epoch": 0.0041573535985695124, + "grad_norm": 1.2950751514861556, + "learning_rate": 5.999769676141116e-07, + "loss": 1.7043, + "step": 31 + }, + { + "epoch": 0.004291461779168529, + "grad_norm": 1.510791624383582, + "learning_rate": 5.99975457709504e-07, + "loss": 1.7247, + "step": 32 + }, + { + "epoch": 0.004425569959767546, + "grad_norm": 1.205151919537117, + "learning_rate": 5.999738998743669e-07, + "loss": 1.7102, + "step": 33 + }, + { + "epoch": 0.004559678140366562, + "grad_norm": 1.2313460275237813, + "learning_rate": 5.999722941089769e-07, + "loss": 1.6194, + "step": 34 + }, + { + "epoch": 0.004693786320965579, + "grad_norm": 1.2769810504677248, + "learning_rate": 5.999706404136191e-07, + "loss": 1.6776, + "step": 35 + }, + { + "epoch": 0.0048278945015645955, + "grad_norm": 1.210621301547261, + "learning_rate": 5.99968938788587e-07, + "loss": 1.658, + "step": 36 + }, + { + "epoch": 0.004962002682163612, + "grad_norm": 1.3309399301655989, + "learning_rate": 5.99967189234183e-07, + "loss": 1.598, + "step": 37 + }, + { + "epoch": 0.005096110862762629, + "grad_norm": 1.2698125491932901, + "learning_rate": 5.999653917507173e-07, + "loss": 1.6783, + "step": 38 + }, + { + "epoch": 0.005230219043361645, + "grad_norm": 1.269690475054205, + "learning_rate": 5.999635463385092e-07, + "loss": 1.7118, + "step": 39 + }, + { + "epoch": 0.005364327223960662, + "grad_norm": 1.239195449838068, + "learning_rate": 5.999616529978864e-07, + "loss": 1.7552, + "step": 40 + }, + { + "epoch": 0.005498435404559678, + "grad_norm": 1.213245919091097, + "learning_rate": 5.999597117291851e-07, + "loss": 1.6195, + "step": 41 + }, + { + "epoch": 0.005632543585158694, + "grad_norm": 1.472546911008587, + "learning_rate": 5.999577225327498e-07, + "loss": 1.7151, + "step": 42 + }, + { + "epoch": 0.005766651765757711, + "grad_norm": 1.1739645532291967, + "learning_rate": 5.999556854089335e-07, + "loss": 1.6848, + "step": 43 + }, + { + "epoch": 0.005900759946356728, + "grad_norm": 1.6603998730539062, + "learning_rate": 5.999536003580982e-07, + "loss": 1.7987, + "step": 44 + }, + { + "epoch": 0.006034868126955744, + "grad_norm": 1.2267822489395797, + "learning_rate": 5.999514673806138e-07, + "loss": 1.7743, + "step": 45 + }, + { + "epoch": 0.006168976307554761, + "grad_norm": 1.182672696849382, + "learning_rate": 5.999492864768594e-07, + "loss": 1.7007, + "step": 46 + }, + { + "epoch": 0.006303084488153777, + "grad_norm": 1.3725982814639008, + "learning_rate": 5.999470576472216e-07, + "loss": 1.6453, + "step": 47 + }, + { + "epoch": 0.006437192668752794, + "grad_norm": 1.2302523441661959, + "learning_rate": 5.999447808920965e-07, + "loss": 1.668, + "step": 48 + }, + { + "epoch": 0.006571300849351811, + "grad_norm": 1.5825139985036842, + "learning_rate": 5.999424562118882e-07, + "loss": 1.677, + "step": 49 + }, + { + "epoch": 0.006705409029950827, + "grad_norm": 1.3441000281769755, + "learning_rate": 5.999400836070092e-07, + "loss": 1.7907, + "step": 50 + }, + { + "epoch": 0.006839517210549844, + "grad_norm": 1.2662568205784916, + "learning_rate": 5.999376630778812e-07, + "loss": 1.7948, + "step": 51 + }, + { + "epoch": 0.0069736253911488605, + "grad_norm": 1.6969919156319755, + "learning_rate": 5.999351946249336e-07, + "loss": 1.704, + "step": 52 + }, + { + "epoch": 0.007107733571747876, + "grad_norm": 1.3702701009027687, + "learning_rate": 5.999326782486047e-07, + "loss": 1.7596, + "step": 53 + }, + { + "epoch": 0.007241841752346893, + "grad_norm": 1.2008226357772018, + "learning_rate": 5.999301139493413e-07, + "loss": 1.7446, + "step": 54 + }, + { + "epoch": 0.0073759499329459095, + "grad_norm": 1.1610594693793954, + "learning_rate": 5.999275017275985e-07, + "loss": 1.6545, + "step": 55 + }, + { + "epoch": 0.007510058113544926, + "grad_norm": 1.2318851837588591, + "learning_rate": 5.999248415838404e-07, + "loss": 1.6945, + "step": 56 + }, + { + "epoch": 0.007644166294143943, + "grad_norm": 1.3623097314650943, + "learning_rate": 5.99922133518539e-07, + "loss": 1.7576, + "step": 57 + }, + { + "epoch": 0.007778274474742959, + "grad_norm": 1.263711259426924, + "learning_rate": 5.999193775321749e-07, + "loss": 1.7202, + "step": 58 + }, + { + "epoch": 0.007912382655341976, + "grad_norm": 1.266618530800646, + "learning_rate": 5.999165736252378e-07, + "loss": 1.7277, + "step": 59 + }, + { + "epoch": 0.008046490835940992, + "grad_norm": 1.20263409583272, + "learning_rate": 5.999137217982253e-07, + "loss": 1.7287, + "step": 60 + }, + { + "epoch": 0.00818059901654001, + "grad_norm": 1.3137021476149842, + "learning_rate": 5.999108220516439e-07, + "loss": 1.7524, + "step": 61 + }, + { + "epoch": 0.008314707197139025, + "grad_norm": 1.2381760472328087, + "learning_rate": 5.999078743860079e-07, + "loss": 1.6713, + "step": 62 + }, + { + "epoch": 0.008448815377738042, + "grad_norm": 1.1488246018603008, + "learning_rate": 5.999048788018412e-07, + "loss": 1.61, + "step": 63 + }, + { + "epoch": 0.008582923558337058, + "grad_norm": 1.1657309327731467, + "learning_rate": 5.999018352996753e-07, + "loss": 1.7329, + "step": 64 + }, + { + "epoch": 0.008717031738936076, + "grad_norm": 1.4859993327682761, + "learning_rate": 5.998987438800507e-07, + "loss": 1.7751, + "step": 65 + }, + { + "epoch": 0.008851139919535091, + "grad_norm": 1.2336235778167894, + "learning_rate": 5.99895604543516e-07, + "loss": 1.7698, + "step": 66 + }, + { + "epoch": 0.008985248100134109, + "grad_norm": 1.2063484420298083, + "learning_rate": 5.998924172906287e-07, + "loss": 1.6674, + "step": 67 + }, + { + "epoch": 0.009119356280733125, + "grad_norm": 1.144489164232074, + "learning_rate": 5.998891821219549e-07, + "loss": 1.6727, + "step": 68 + }, + { + "epoch": 0.009253464461332142, + "grad_norm": 1.1661711232482204, + "learning_rate": 5.998858990380685e-07, + "loss": 1.72, + "step": 69 + }, + { + "epoch": 0.009387572641931158, + "grad_norm": 1.8657773898969878, + "learning_rate": 5.998825680395526e-07, + "loss": 1.67, + "step": 70 + }, + { + "epoch": 0.009521680822530174, + "grad_norm": 1.2765420086009807, + "learning_rate": 5.998791891269986e-07, + "loss": 1.7016, + "step": 71 + }, + { + "epoch": 0.009655789003129191, + "grad_norm": 1.1153772140374385, + "learning_rate": 5.998757623010063e-07, + "loss": 1.707, + "step": 72 + }, + { + "epoch": 0.009789897183728207, + "grad_norm": 1.1669261546443137, + "learning_rate": 5.998722875621842e-07, + "loss": 1.6859, + "step": 73 + }, + { + "epoch": 0.009924005364327224, + "grad_norm": 1.1777142907854627, + "learning_rate": 5.99868764911149e-07, + "loss": 1.616, + "step": 74 + }, + { + "epoch": 0.01005811354492624, + "grad_norm": 1.1386560612646601, + "learning_rate": 5.998651943485263e-07, + "loss": 1.7086, + "step": 75 + }, + { + "epoch": 0.010192221725525258, + "grad_norm": 1.1396265347862253, + "learning_rate": 5.998615758749499e-07, + "loss": 1.6094, + "step": 76 + }, + { + "epoch": 0.010326329906124273, + "grad_norm": 1.1418930865866173, + "learning_rate": 5.998579094910623e-07, + "loss": 1.5653, + "step": 77 + }, + { + "epoch": 0.01046043808672329, + "grad_norm": 1.2012675736770206, + "learning_rate": 5.998541951975143e-07, + "loss": 1.749, + "step": 78 + }, + { + "epoch": 0.010594546267322306, + "grad_norm": 1.1829649799589437, + "learning_rate": 5.998504329949654e-07, + "loss": 1.741, + "step": 79 + }, + { + "epoch": 0.010728654447921324, + "grad_norm": 1.1137771771242837, + "learning_rate": 5.998466228840834e-07, + "loss": 1.7467, + "step": 80 + }, + { + "epoch": 0.01086276262852034, + "grad_norm": 1.2213171478733171, + "learning_rate": 5.998427648655449e-07, + "loss": 1.7411, + "step": 81 + }, + { + "epoch": 0.010996870809119355, + "grad_norm": 1.2565644926554131, + "learning_rate": 5.998388589400348e-07, + "loss": 1.5334, + "step": 82 + }, + { + "epoch": 0.011130978989718373, + "grad_norm": 1.1677953640865506, + "learning_rate": 5.998349051082467e-07, + "loss": 1.6292, + "step": 83 + }, + { + "epoch": 0.011265087170317389, + "grad_norm": 1.1735267116017247, + "learning_rate": 5.998309033708821e-07, + "loss": 1.7093, + "step": 84 + }, + { + "epoch": 0.011399195350916406, + "grad_norm": 1.1800930162312424, + "learning_rate": 5.998268537286519e-07, + "loss": 1.6931, + "step": 85 + }, + { + "epoch": 0.011533303531515422, + "grad_norm": 1.267648133239451, + "learning_rate": 5.998227561822748e-07, + "loss": 1.7372, + "step": 86 + }, + { + "epoch": 0.01166741171211444, + "grad_norm": 1.322566507024075, + "learning_rate": 5.998186107324783e-07, + "loss": 1.729, + "step": 87 + }, + { + "epoch": 0.011801519892713455, + "grad_norm": 1.2115409550398644, + "learning_rate": 5.998144173799985e-07, + "loss": 1.8509, + "step": 88 + }, + { + "epoch": 0.011935628073312473, + "grad_norm": 1.2085609394825974, + "learning_rate": 5.998101761255799e-07, + "loss": 1.6913, + "step": 89 + }, + { + "epoch": 0.012069736253911488, + "grad_norm": 1.290801409771777, + "learning_rate": 5.998058869699753e-07, + "loss": 1.7102, + "step": 90 + }, + { + "epoch": 0.012203844434510506, + "grad_norm": 1.1367739383903264, + "learning_rate": 5.998015499139461e-07, + "loss": 1.6836, + "step": 91 + }, + { + "epoch": 0.012337952615109522, + "grad_norm": 1.1670495306196762, + "learning_rate": 5.997971649582626e-07, + "loss": 1.7664, + "step": 92 + }, + { + "epoch": 0.012472060795708539, + "grad_norm": 1.1506012664004979, + "learning_rate": 5.99792732103703e-07, + "loss": 1.6477, + "step": 93 + }, + { + "epoch": 0.012606168976307555, + "grad_norm": 1.1708715291743035, + "learning_rate": 5.997882513510546e-07, + "loss": 1.6524, + "step": 94 + }, + { + "epoch": 0.01274027715690657, + "grad_norm": 1.151933225888518, + "learning_rate": 5.997837227011127e-07, + "loss": 1.7245, + "step": 95 + }, + { + "epoch": 0.012874385337505588, + "grad_norm": 1.1690902149158897, + "learning_rate": 5.997791461546813e-07, + "loss": 1.7276, + "step": 96 + }, + { + "epoch": 0.013008493518104604, + "grad_norm": 1.2091849738704115, + "learning_rate": 5.997745217125728e-07, + "loss": 1.6816, + "step": 97 + }, + { + "epoch": 0.013142601698703621, + "grad_norm": 1.154532285060635, + "learning_rate": 5.997698493756085e-07, + "loss": 1.7065, + "step": 98 + }, + { + "epoch": 0.013276709879302637, + "grad_norm": 1.084556225295123, + "learning_rate": 5.997651291446176e-07, + "loss": 1.6972, + "step": 99 + }, + { + "epoch": 0.013410818059901655, + "grad_norm": 1.0844384684144817, + "learning_rate": 5.997603610204383e-07, + "loss": 1.6011, + "step": 100 + }, + { + "epoch": 0.01354492624050067, + "grad_norm": 1.1349833362519353, + "learning_rate": 5.997555450039173e-07, + "loss": 1.7058, + "step": 101 + }, + { + "epoch": 0.013679034421099688, + "grad_norm": 1.161646012371061, + "learning_rate": 5.997506810959091e-07, + "loss": 1.7284, + "step": 102 + }, + { + "epoch": 0.013813142601698703, + "grad_norm": 1.1931085385755509, + "learning_rate": 5.997457692972776e-07, + "loss": 1.6889, + "step": 103 + }, + { + "epoch": 0.013947250782297721, + "grad_norm": 1.1274496052792788, + "learning_rate": 5.997408096088949e-07, + "loss": 1.6966, + "step": 104 + }, + { + "epoch": 0.014081358962896737, + "grad_norm": 1.181021137421778, + "learning_rate": 5.997358020316412e-07, + "loss": 1.6328, + "step": 105 + }, + { + "epoch": 0.014215467143495752, + "grad_norm": 1.1775178821818613, + "learning_rate": 5.997307465664057e-07, + "loss": 1.776, + "step": 106 + }, + { + "epoch": 0.01434957532409477, + "grad_norm": 1.1589504262285564, + "learning_rate": 5.99725643214086e-07, + "loss": 1.7587, + "step": 107 + }, + { + "epoch": 0.014483683504693786, + "grad_norm": 1.0988787787594243, + "learning_rate": 5.99720491975588e-07, + "loss": 1.6803, + "step": 108 + }, + { + "epoch": 0.014617791685292803, + "grad_norm": 1.1461688756871193, + "learning_rate": 5.997152928518265e-07, + "loss": 1.607, + "step": 109 + }, + { + "epoch": 0.014751899865891819, + "grad_norm": 1.152474644239047, + "learning_rate": 5.99710045843724e-07, + "loss": 1.7633, + "step": 110 + }, + { + "epoch": 0.014886008046490836, + "grad_norm": 1.1059120772328972, + "learning_rate": 5.997047509522127e-07, + "loss": 1.6747, + "step": 111 + }, + { + "epoch": 0.015020116227089852, + "grad_norm": 1.313489457814451, + "learning_rate": 5.996994081782321e-07, + "loss": 1.7596, + "step": 112 + }, + { + "epoch": 0.01515422440768887, + "grad_norm": 1.111253336672023, + "learning_rate": 5.99694017522731e-07, + "loss": 1.6808, + "step": 113 + }, + { + "epoch": 0.015288332588287885, + "grad_norm": 1.1886992881117084, + "learning_rate": 5.996885789866662e-07, + "loss": 1.7115, + "step": 114 + }, + { + "epoch": 0.015422440768886903, + "grad_norm": 1.3209352003575652, + "learning_rate": 5.996830925710036e-07, + "loss": 1.6806, + "step": 115 + }, + { + "epoch": 0.015556548949485919, + "grad_norm": 1.1206995765571244, + "learning_rate": 5.99677558276717e-07, + "loss": 1.7454, + "step": 116 + }, + { + "epoch": 0.015690657130084936, + "grad_norm": 1.5425145699155092, + "learning_rate": 5.996719761047891e-07, + "loss": 1.7396, + "step": 117 + }, + { + "epoch": 0.015824765310683952, + "grad_norm": 1.1362376633432387, + "learning_rate": 5.996663460562107e-07, + "loss": 1.7999, + "step": 118 + }, + { + "epoch": 0.015958873491282968, + "grad_norm": 1.3633221428865825, + "learning_rate": 5.996606681319816e-07, + "loss": 1.7351, + "step": 119 + }, + { + "epoch": 0.016092981671881983, + "grad_norm": 1.3214169620385536, + "learning_rate": 5.996549423331097e-07, + "loss": 1.8187, + "step": 120 + }, + { + "epoch": 0.016227089852481003, + "grad_norm": 1.127623956399482, + "learning_rate": 5.996491686606115e-07, + "loss": 1.7869, + "step": 121 + }, + { + "epoch": 0.01636119803308002, + "grad_norm": 1.1391849506858633, + "learning_rate": 5.996433471155121e-07, + "loss": 1.6692, + "step": 122 + }, + { + "epoch": 0.016495306213679034, + "grad_norm": 1.2689844428227393, + "learning_rate": 5.99637477698845e-07, + "loss": 1.7503, + "step": 123 + }, + { + "epoch": 0.01662941439427805, + "grad_norm": 1.1304906129070134, + "learning_rate": 5.996315604116523e-07, + "loss": 1.7342, + "step": 124 + }, + { + "epoch": 0.01676352257487707, + "grad_norm": 1.114839424591474, + "learning_rate": 5.996255952549846e-07, + "loss": 1.7152, + "step": 125 + }, + { + "epoch": 0.016897630755476085, + "grad_norm": 1.1209913395354725, + "learning_rate": 5.996195822299007e-07, + "loss": 1.7016, + "step": 126 + }, + { + "epoch": 0.0170317389360751, + "grad_norm": 1.2030735367344376, + "learning_rate": 5.996135213374683e-07, + "loss": 1.6916, + "step": 127 + }, + { + "epoch": 0.017165847116674116, + "grad_norm": 1.1576104199692667, + "learning_rate": 5.996074125787635e-07, + "loss": 1.6998, + "step": 128 + }, + { + "epoch": 0.017299955297273132, + "grad_norm": 1.1589080789600115, + "learning_rate": 5.996012559548706e-07, + "loss": 1.7135, + "step": 129 + }, + { + "epoch": 0.01743406347787215, + "grad_norm": 1.1305459345535596, + "learning_rate": 5.995950514668828e-07, + "loss": 1.7388, + "step": 130 + }, + { + "epoch": 0.017568171658471167, + "grad_norm": 1.1845858349294451, + "learning_rate": 5.995887991159015e-07, + "loss": 1.6555, + "step": 131 + }, + { + "epoch": 0.017702279839070183, + "grad_norm": 1.1323032600098655, + "learning_rate": 5.99582498903037e-07, + "loss": 1.7391, + "step": 132 + }, + { + "epoch": 0.0178363880196692, + "grad_norm": 1.1284637339427535, + "learning_rate": 5.995761508294074e-07, + "loss": 1.7362, + "step": 133 + }, + { + "epoch": 0.017970496200268218, + "grad_norm": 1.1056404062639804, + "learning_rate": 5.995697548961401e-07, + "loss": 1.6097, + "step": 134 + }, + { + "epoch": 0.018104604380867233, + "grad_norm": 1.168355336516219, + "learning_rate": 5.995633111043703e-07, + "loss": 1.6254, + "step": 135 + }, + { + "epoch": 0.01823871256146625, + "grad_norm": 1.1077402595262742, + "learning_rate": 5.995568194552422e-07, + "loss": 1.6421, + "step": 136 + }, + { + "epoch": 0.018372820742065265, + "grad_norm": 1.1281814027534607, + "learning_rate": 5.995502799499084e-07, + "loss": 1.6564, + "step": 137 + }, + { + "epoch": 0.018506928922664284, + "grad_norm": 1.1954713277317879, + "learning_rate": 5.995436925895296e-07, + "loss": 1.7595, + "step": 138 + }, + { + "epoch": 0.0186410371032633, + "grad_norm": 1.0981557354609233, + "learning_rate": 5.995370573752754e-07, + "loss": 1.7267, + "step": 139 + }, + { + "epoch": 0.018775145283862316, + "grad_norm": 1.1055449164193234, + "learning_rate": 5.99530374308324e-07, + "loss": 1.7206, + "step": 140 + }, + { + "epoch": 0.01890925346446133, + "grad_norm": 1.1553645507386814, + "learning_rate": 5.995236433898617e-07, + "loss": 1.7575, + "step": 141 + }, + { + "epoch": 0.019043361645060347, + "grad_norm": 1.153673293502894, + "learning_rate": 5.995168646210836e-07, + "loss": 1.6141, + "step": 142 + }, + { + "epoch": 0.019177469825659366, + "grad_norm": 1.2666080786381764, + "learning_rate": 5.995100380031929e-07, + "loss": 1.6959, + "step": 143 + }, + { + "epoch": 0.019311578006258382, + "grad_norm": 1.1798276231576013, + "learning_rate": 5.99503163537402e-07, + "loss": 1.6898, + "step": 144 + }, + { + "epoch": 0.019445686186857398, + "grad_norm": 1.1774834485948251, + "learning_rate": 5.99496241224931e-07, + "loss": 1.6964, + "step": 145 + }, + { + "epoch": 0.019579794367456414, + "grad_norm": 1.1714422777051698, + "learning_rate": 5.994892710670092e-07, + "loss": 1.7554, + "step": 146 + }, + { + "epoch": 0.019713902548055433, + "grad_norm": 1.1423261455221443, + "learning_rate": 5.994822530648737e-07, + "loss": 1.6261, + "step": 147 + }, + { + "epoch": 0.01984801072865445, + "grad_norm": 1.2304343117411098, + "learning_rate": 5.994751872197707e-07, + "loss": 1.6867, + "step": 148 + }, + { + "epoch": 0.019982118909253464, + "grad_norm": 1.6067971505148326, + "learning_rate": 5.994680735329545e-07, + "loss": 1.7063, + "step": 149 + }, + { + "epoch": 0.02011622708985248, + "grad_norm": 1.2298413663973762, + "learning_rate": 5.994609120056881e-07, + "loss": 1.8201, + "step": 150 + }, + { + "epoch": 0.0202503352704515, + "grad_norm": 1.3415909829046686, + "learning_rate": 5.994537026392431e-07, + "loss": 1.7761, + "step": 151 + }, + { + "epoch": 0.020384443451050515, + "grad_norm": 1.092009599817307, + "learning_rate": 5.994464454348991e-07, + "loss": 1.6873, + "step": 152 + }, + { + "epoch": 0.02051855163164953, + "grad_norm": 1.123624476668709, + "learning_rate": 5.994391403939447e-07, + "loss": 1.6261, + "step": 153 + }, + { + "epoch": 0.020652659812248546, + "grad_norm": 1.194551927187871, + "learning_rate": 5.994317875176768e-07, + "loss": 1.6832, + "step": 154 + }, + { + "epoch": 0.020786767992847562, + "grad_norm": 1.2676660386871186, + "learning_rate": 5.99424386807401e-07, + "loss": 1.7296, + "step": 155 + }, + { + "epoch": 0.02092087617344658, + "grad_norm": 1.1316668463703698, + "learning_rate": 5.994169382644308e-07, + "loss": 1.5888, + "step": 156 + }, + { + "epoch": 0.021054984354045597, + "grad_norm": 1.1959893911689907, + "learning_rate": 5.994094418900889e-07, + "loss": 1.75, + "step": 157 + }, + { + "epoch": 0.021189092534644613, + "grad_norm": 1.1591910643371741, + "learning_rate": 5.994018976857061e-07, + "loss": 1.6475, + "step": 158 + }, + { + "epoch": 0.02132320071524363, + "grad_norm": 1.1845979417363135, + "learning_rate": 5.993943056526216e-07, + "loss": 1.6961, + "step": 159 + }, + { + "epoch": 0.021457308895842648, + "grad_norm": 1.1240443814893686, + "learning_rate": 5.993866657921835e-07, + "loss": 1.6806, + "step": 160 + }, + { + "epoch": 0.021591417076441664, + "grad_norm": 1.24112432554422, + "learning_rate": 5.99378978105748e-07, + "loss": 1.7856, + "step": 161 + }, + { + "epoch": 0.02172552525704068, + "grad_norm": 1.1097493699883791, + "learning_rate": 5.993712425946801e-07, + "loss": 1.6526, + "step": 162 + }, + { + "epoch": 0.021859633437639695, + "grad_norm": 1.1808847880790212, + "learning_rate": 5.99363459260353e-07, + "loss": 1.6635, + "step": 163 + }, + { + "epoch": 0.02199374161823871, + "grad_norm": 1.1260977048233447, + "learning_rate": 5.993556281041487e-07, + "loss": 1.6883, + "step": 164 + }, + { + "epoch": 0.02212784979883773, + "grad_norm": 1.1355214317178735, + "learning_rate": 5.993477491274572e-07, + "loss": 1.7197, + "step": 165 + }, + { + "epoch": 0.022261957979436746, + "grad_norm": 1.1667677632223183, + "learning_rate": 5.993398223316776e-07, + "loss": 1.652, + "step": 166 + }, + { + "epoch": 0.02239606616003576, + "grad_norm": 1.2054751712250278, + "learning_rate": 5.993318477182171e-07, + "loss": 1.7181, + "step": 167 + }, + { + "epoch": 0.022530174340634777, + "grad_norm": 1.102112367147099, + "learning_rate": 5.993238252884914e-07, + "loss": 1.7064, + "step": 168 + }, + { + "epoch": 0.022664282521233797, + "grad_norm": 1.1174237172322072, + "learning_rate": 5.99315755043925e-07, + "loss": 1.7088, + "step": 169 + }, + { + "epoch": 0.022798390701832812, + "grad_norm": 1.1526425806154745, + "learning_rate": 5.993076369859505e-07, + "loss": 1.6713, + "step": 170 + }, + { + "epoch": 0.022932498882431828, + "grad_norm": 1.189041831016279, + "learning_rate": 5.992994711160089e-07, + "loss": 1.796, + "step": 171 + }, + { + "epoch": 0.023066607063030844, + "grad_norm": 1.1020745587716836, + "learning_rate": 5.992912574355505e-07, + "loss": 1.7036, + "step": 172 + }, + { + "epoch": 0.023200715243629863, + "grad_norm": 1.1024337259717305, + "learning_rate": 5.992829959460332e-07, + "loss": 1.7183, + "step": 173 + }, + { + "epoch": 0.02333482342422888, + "grad_norm": 1.2334229037351967, + "learning_rate": 5.992746866489237e-07, + "loss": 1.7278, + "step": 174 + }, + { + "epoch": 0.023468931604827895, + "grad_norm": 1.1942148623032185, + "learning_rate": 5.992663295456972e-07, + "loss": 1.7127, + "step": 175 + }, + { + "epoch": 0.02360303978542691, + "grad_norm": 1.154550051904798, + "learning_rate": 5.992579246378375e-07, + "loss": 1.7259, + "step": 176 + }, + { + "epoch": 0.023737147966025926, + "grad_norm": 1.139454652919351, + "learning_rate": 5.992494719268369e-07, + "loss": 1.8202, + "step": 177 + }, + { + "epoch": 0.023871256146624945, + "grad_norm": 1.1334182773252635, + "learning_rate": 5.992409714141957e-07, + "loss": 1.7458, + "step": 178 + }, + { + "epoch": 0.02400536432722396, + "grad_norm": 1.175452351416824, + "learning_rate": 5.992324231014234e-07, + "loss": 1.7343, + "step": 179 + }, + { + "epoch": 0.024139472507822977, + "grad_norm": 1.15495982844933, + "learning_rate": 5.992238269900374e-07, + "loss": 1.6397, + "step": 180 + }, + { + "epoch": 0.024273580688421992, + "grad_norm": 1.222036602203619, + "learning_rate": 5.992151830815639e-07, + "loss": 1.6585, + "step": 181 + }, + { + "epoch": 0.02440768886902101, + "grad_norm": 1.1103145700032067, + "learning_rate": 5.992064913775376e-07, + "loss": 1.6729, + "step": 182 + }, + { + "epoch": 0.024541797049620027, + "grad_norm": 1.1627847561281206, + "learning_rate": 5.991977518795014e-07, + "loss": 1.6693, + "step": 183 + }, + { + "epoch": 0.024675905230219043, + "grad_norm": 1.2021941957895712, + "learning_rate": 5.991889645890071e-07, + "loss": 1.7692, + "step": 184 + }, + { + "epoch": 0.02481001341081806, + "grad_norm": 1.0987338031386753, + "learning_rate": 5.991801295076147e-07, + "loss": 1.7378, + "step": 185 + }, + { + "epoch": 0.024944121591417078, + "grad_norm": 1.1764726234102538, + "learning_rate": 5.991712466368927e-07, + "loss": 1.7519, + "step": 186 + }, + { + "epoch": 0.025078229772016094, + "grad_norm": 1.1211959879636015, + "learning_rate": 5.991623159784181e-07, + "loss": 1.6915, + "step": 187 + }, + { + "epoch": 0.02521233795261511, + "grad_norm": 1.183694924138999, + "learning_rate": 5.991533375337764e-07, + "loss": 1.6992, + "step": 188 + }, + { + "epoch": 0.025346446133214125, + "grad_norm": 1.1093630411034636, + "learning_rate": 5.991443113045618e-07, + "loss": 1.7517, + "step": 189 + }, + { + "epoch": 0.02548055431381314, + "grad_norm": 1.0851009440926755, + "learning_rate": 5.991352372923766e-07, + "loss": 1.6776, + "step": 190 + }, + { + "epoch": 0.02561466249441216, + "grad_norm": 1.1395885180659286, + "learning_rate": 5.99126115498832e-07, + "loss": 1.6924, + "step": 191 + }, + { + "epoch": 0.025748770675011176, + "grad_norm": 1.177643032023232, + "learning_rate": 5.99116945925547e-07, + "loss": 1.8049, + "step": 192 + }, + { + "epoch": 0.025882878855610192, + "grad_norm": 1.2055741329571488, + "learning_rate": 5.9910772857415e-07, + "loss": 1.7318, + "step": 193 + }, + { + "epoch": 0.026016987036209208, + "grad_norm": 1.0540261983643227, + "learning_rate": 5.990984634462772e-07, + "loss": 1.6957, + "step": 194 + }, + { + "epoch": 0.026151095216808227, + "grad_norm": 1.1229012489144132, + "learning_rate": 5.990891505435736e-07, + "loss": 1.6655, + "step": 195 + }, + { + "epoch": 0.026285203397407243, + "grad_norm": 1.244124126818224, + "learning_rate": 5.990797898676924e-07, + "loss": 1.6651, + "step": 196 + }, + { + "epoch": 0.02641931157800626, + "grad_norm": 1.153272959704337, + "learning_rate": 5.990703814202957e-07, + "loss": 1.614, + "step": 197 + }, + { + "epoch": 0.026553419758605274, + "grad_norm": 1.1097663064103196, + "learning_rate": 5.990609252030535e-07, + "loss": 1.6663, + "step": 198 + }, + { + "epoch": 0.02668752793920429, + "grad_norm": 1.1863665422420122, + "learning_rate": 5.990514212176451e-07, + "loss": 1.6996, + "step": 199 + }, + { + "epoch": 0.02682163611980331, + "grad_norm": 1.19855062119957, + "learning_rate": 5.990418694657574e-07, + "loss": 1.6788, + "step": 200 + }, + { + "epoch": 0.026955744300402325, + "grad_norm": 2.240646939516989, + "learning_rate": 5.990322699490864e-07, + "loss": 1.6072, + "step": 201 + }, + { + "epoch": 0.02708985248100134, + "grad_norm": 1.16787040451677, + "learning_rate": 5.990226226693363e-07, + "loss": 1.7495, + "step": 202 + }, + { + "epoch": 0.027223960661600356, + "grad_norm": 1.1801746959435724, + "learning_rate": 5.990129276282199e-07, + "loss": 1.7816, + "step": 203 + }, + { + "epoch": 0.027358068842199375, + "grad_norm": 1.0865741581201773, + "learning_rate": 5.990031848274582e-07, + "loss": 1.6386, + "step": 204 + }, + { + "epoch": 0.02749217702279839, + "grad_norm": 1.1195448058003792, + "learning_rate": 5.989933942687813e-07, + "loss": 1.7666, + "step": 205 + }, + { + "epoch": 0.027626285203397407, + "grad_norm": 1.1595509114049103, + "learning_rate": 5.989835559539271e-07, + "loss": 1.7783, + "step": 206 + }, + { + "epoch": 0.027760393383996423, + "grad_norm": 1.132633530996875, + "learning_rate": 5.989736698846422e-07, + "loss": 1.7369, + "step": 207 + }, + { + "epoch": 0.027894501564595442, + "grad_norm": 1.2238390397270622, + "learning_rate": 5.98963736062682e-07, + "loss": 1.77, + "step": 208 + }, + { + "epoch": 0.028028609745194458, + "grad_norm": 1.1148263262442593, + "learning_rate": 5.989537544898099e-07, + "loss": 1.7091, + "step": 209 + }, + { + "epoch": 0.028162717925793473, + "grad_norm": 1.8988797886120061, + "learning_rate": 5.989437251677981e-07, + "loss": 1.7075, + "step": 210 + }, + { + "epoch": 0.02829682610639249, + "grad_norm": 1.1460869915607401, + "learning_rate": 5.989336480984271e-07, + "loss": 1.7101, + "step": 211 + }, + { + "epoch": 0.028430934286991505, + "grad_norm": 1.1467483507445029, + "learning_rate": 5.989235232834861e-07, + "loss": 1.826, + "step": 212 + }, + { + "epoch": 0.028565042467590524, + "grad_norm": 1.1300279144587981, + "learning_rate": 5.989133507247724e-07, + "loss": 1.6014, + "step": 213 + }, + { + "epoch": 0.02869915064818954, + "grad_norm": 1.1992643920221002, + "learning_rate": 5.989031304240922e-07, + "loss": 1.7145, + "step": 214 + }, + { + "epoch": 0.028833258828788556, + "grad_norm": 1.1299143353929064, + "learning_rate": 5.988928623832598e-07, + "loss": 1.7769, + "step": 215 + }, + { + "epoch": 0.02896736700938757, + "grad_norm": 1.2042592418402756, + "learning_rate": 5.988825466040984e-07, + "loss": 1.7626, + "step": 216 + }, + { + "epoch": 0.02910147518998659, + "grad_norm": 1.0995902853233575, + "learning_rate": 5.988721830884392e-07, + "loss": 1.6348, + "step": 217 + }, + { + "epoch": 0.029235583370585606, + "grad_norm": 1.6143410051222686, + "learning_rate": 5.988617718381222e-07, + "loss": 1.6693, + "step": 218 + }, + { + "epoch": 0.029369691551184622, + "grad_norm": 1.1356912583442442, + "learning_rate": 5.988513128549958e-07, + "loss": 1.8413, + "step": 219 + }, + { + "epoch": 0.029503799731783638, + "grad_norm": 1.0893609511374684, + "learning_rate": 5.988408061409167e-07, + "loss": 1.7344, + "step": 220 + }, + { + "epoch": 0.029637907912382657, + "grad_norm": 1.7248790007955832, + "learning_rate": 5.988302516977504e-07, + "loss": 1.6685, + "step": 221 + }, + { + "epoch": 0.029772016092981673, + "grad_norm": 1.2197670257203657, + "learning_rate": 5.988196495273707e-07, + "loss": 1.7656, + "step": 222 + }, + { + "epoch": 0.02990612427358069, + "grad_norm": 1.0570007929897236, + "learning_rate": 5.988089996316597e-07, + "loss": 1.6939, + "step": 223 + }, + { + "epoch": 0.030040232454179704, + "grad_norm": 1.2787842409441683, + "learning_rate": 5.987983020125083e-07, + "loss": 1.6764, + "step": 224 + }, + { + "epoch": 0.03017434063477872, + "grad_norm": 1.1358825590170436, + "learning_rate": 5.987875566718158e-07, + "loss": 1.6609, + "step": 225 + }, + { + "epoch": 0.03030844881537774, + "grad_norm": 1.118237942922342, + "learning_rate": 5.987767636114897e-07, + "loss": 1.7554, + "step": 226 + }, + { + "epoch": 0.030442556995976755, + "grad_norm": 1.091737931283322, + "learning_rate": 5.987659228334462e-07, + "loss": 1.7449, + "step": 227 + }, + { + "epoch": 0.03057666517657577, + "grad_norm": 1.1839355406865255, + "learning_rate": 5.9875503433961e-07, + "loss": 1.5726, + "step": 228 + }, + { + "epoch": 0.030710773357174787, + "grad_norm": 1.1337421280370006, + "learning_rate": 5.987440981319141e-07, + "loss": 1.7921, + "step": 229 + }, + { + "epoch": 0.030844881537773806, + "grad_norm": 1.1412449749582727, + "learning_rate": 5.987331142123003e-07, + "loss": 1.74, + "step": 230 + }, + { + "epoch": 0.03097898971837282, + "grad_norm": 1.153189714483035, + "learning_rate": 5.987220825827184e-07, + "loss": 1.8381, + "step": 231 + }, + { + "epoch": 0.031113097898971837, + "grad_norm": 1.5918789493838401, + "learning_rate": 5.98711003245127e-07, + "loss": 1.775, + "step": 232 + }, + { + "epoch": 0.031247206079570853, + "grad_norm": 1.1156741804185832, + "learning_rate": 5.986998762014931e-07, + "loss": 1.7849, + "step": 233 + }, + { + "epoch": 0.03138131426016987, + "grad_norm": 1.3525186481687417, + "learning_rate": 5.986887014537923e-07, + "loss": 1.6405, + "step": 234 + }, + { + "epoch": 0.03151542244076889, + "grad_norm": 1.158420443205213, + "learning_rate": 5.986774790040083e-07, + "loss": 1.7375, + "step": 235 + }, + { + "epoch": 0.031649530621367904, + "grad_norm": 1.123395074640784, + "learning_rate": 5.986662088541335e-07, + "loss": 1.7682, + "step": 236 + }, + { + "epoch": 0.03178363880196692, + "grad_norm": 1.1675872323082288, + "learning_rate": 5.98654891006169e-07, + "loss": 1.7364, + "step": 237 + }, + { + "epoch": 0.031917746982565935, + "grad_norm": 1.0814715571489928, + "learning_rate": 5.986435254621239e-07, + "loss": 1.5985, + "step": 238 + }, + { + "epoch": 0.03205185516316495, + "grad_norm": 3.0737070295965427, + "learning_rate": 5.986321122240162e-07, + "loss": 1.7085, + "step": 239 + }, + { + "epoch": 0.03218596334376397, + "grad_norm": 1.1671133111581686, + "learning_rate": 5.986206512938719e-07, + "loss": 1.6533, + "step": 240 + }, + { + "epoch": 0.03232007152436299, + "grad_norm": 1.145018806372248, + "learning_rate": 5.98609142673726e-07, + "loss": 1.7335, + "step": 241 + }, + { + "epoch": 0.032454179704962005, + "grad_norm": 1.159474229307987, + "learning_rate": 5.985975863656216e-07, + "loss": 1.7531, + "step": 242 + }, + { + "epoch": 0.03258828788556102, + "grad_norm": 1.2078048688870913, + "learning_rate": 5.985859823716102e-07, + "loss": 1.7911, + "step": 243 + }, + { + "epoch": 0.03272239606616004, + "grad_norm": 1.123182359654964, + "learning_rate": 5.985743306937522e-07, + "loss": 1.7939, + "step": 244 + }, + { + "epoch": 0.03285650424675905, + "grad_norm": 1.2328138827190458, + "learning_rate": 5.985626313341161e-07, + "loss": 1.7224, + "step": 245 + }, + { + "epoch": 0.03299061242735807, + "grad_norm": 1.148111739587274, + "learning_rate": 5.98550884294779e-07, + "loss": 1.7458, + "step": 246 + }, + { + "epoch": 0.033124720607957084, + "grad_norm": 1.1781302748488391, + "learning_rate": 5.985390895778263e-07, + "loss": 1.7283, + "step": 247 + }, + { + "epoch": 0.0332588287885561, + "grad_norm": 1.1649269851093655, + "learning_rate": 5.985272471853521e-07, + "loss": 1.7535, + "step": 248 + }, + { + "epoch": 0.033392936969155115, + "grad_norm": 1.1003523240939477, + "learning_rate": 5.985153571194589e-07, + "loss": 1.7422, + "step": 249 + }, + { + "epoch": 0.03352704514975414, + "grad_norm": 1.1239095176492149, + "learning_rate": 5.985034193822575e-07, + "loss": 1.7838, + "step": 250 + }, + { + "epoch": 0.033661153330353154, + "grad_norm": 1.1810699355311947, + "learning_rate": 5.984914339758673e-07, + "loss": 1.6863, + "step": 251 + }, + { + "epoch": 0.03379526151095217, + "grad_norm": 1.1136505916452646, + "learning_rate": 5.984794009024162e-07, + "loss": 1.7424, + "step": 252 + }, + { + "epoch": 0.033929369691551185, + "grad_norm": 1.1748644896008424, + "learning_rate": 5.984673201640406e-07, + "loss": 1.7273, + "step": 253 + }, + { + "epoch": 0.0340634778721502, + "grad_norm": 1.1728309803897534, + "learning_rate": 5.98455191762885e-07, + "loss": 1.7322, + "step": 254 + }, + { + "epoch": 0.03419758605274922, + "grad_norm": 1.1617256887218326, + "learning_rate": 5.984430157011031e-07, + "loss": 1.6426, + "step": 255 + }, + { + "epoch": 0.03433169423334823, + "grad_norm": 1.0944959568956085, + "learning_rate": 5.984307919808561e-07, + "loss": 1.6643, + "step": 256 + }, + { + "epoch": 0.03446580241394725, + "grad_norm": 1.1692415951338644, + "learning_rate": 5.984185206043145e-07, + "loss": 1.6584, + "step": 257 + }, + { + "epoch": 0.034599910594546264, + "grad_norm": 4.382957589748632, + "learning_rate": 5.984062015736567e-07, + "loss": 1.7101, + "step": 258 + }, + { + "epoch": 0.03473401877514529, + "grad_norm": 1.1567530728762943, + "learning_rate": 5.983938348910698e-07, + "loss": 1.643, + "step": 259 + }, + { + "epoch": 0.0348681269557443, + "grad_norm": 1.215341418188577, + "learning_rate": 5.983814205587494e-07, + "loss": 1.7239, + "step": 260 + }, + { + "epoch": 0.03500223513634332, + "grad_norm": 1.0746883114524803, + "learning_rate": 5.983689585788997e-07, + "loss": 1.6076, + "step": 261 + }, + { + "epoch": 0.035136343316942334, + "grad_norm": 1.0844612292689275, + "learning_rate": 5.983564489537329e-07, + "loss": 1.6903, + "step": 262 + }, + { + "epoch": 0.03527045149754135, + "grad_norm": 1.2255887165848134, + "learning_rate": 5.983438916854698e-07, + "loss": 1.6497, + "step": 263 + }, + { + "epoch": 0.035404559678140365, + "grad_norm": 1.1308380556818496, + "learning_rate": 5.983312867763402e-07, + "loss": 1.7412, + "step": 264 + }, + { + "epoch": 0.03553866785873938, + "grad_norm": 1.1248240455028355, + "learning_rate": 5.983186342285815e-07, + "loss": 1.6542, + "step": 265 + }, + { + "epoch": 0.0356727760393384, + "grad_norm": 1.127913908764272, + "learning_rate": 5.983059340444401e-07, + "loss": 1.7996, + "step": 266 + }, + { + "epoch": 0.03580688421993742, + "grad_norm": 1.1345562808363212, + "learning_rate": 5.98293186226171e-07, + "loss": 1.7426, + "step": 267 + }, + { + "epoch": 0.035940992400536435, + "grad_norm": 1.1100506727991573, + "learning_rate": 5.982803907760373e-07, + "loss": 1.6947, + "step": 268 + }, + { + "epoch": 0.03607510058113545, + "grad_norm": 1.1397892876092324, + "learning_rate": 5.982675476963105e-07, + "loss": 1.7525, + "step": 269 + }, + { + "epoch": 0.03620920876173447, + "grad_norm": 1.0980888601137475, + "learning_rate": 5.982546569892707e-07, + "loss": 1.6763, + "step": 270 + }, + { + "epoch": 0.03634331694233348, + "grad_norm": 1.1179358157267492, + "learning_rate": 5.982417186572067e-07, + "loss": 1.8195, + "step": 271 + }, + { + "epoch": 0.0364774251229325, + "grad_norm": 1.15212876523653, + "learning_rate": 5.982287327024153e-07, + "loss": 1.7003, + "step": 272 + }, + { + "epoch": 0.036611533303531514, + "grad_norm": 1.0898032141275467, + "learning_rate": 5.982156991272021e-07, + "loss": 1.7347, + "step": 273 + }, + { + "epoch": 0.03674564148413053, + "grad_norm": 1.2234098091068482, + "learning_rate": 5.982026179338812e-07, + "loss": 1.71, + "step": 274 + }, + { + "epoch": 0.036879749664729546, + "grad_norm": 1.2077801818134501, + "learning_rate": 5.981894891247747e-07, + "loss": 1.7966, + "step": 275 + }, + { + "epoch": 0.03701385784532857, + "grad_norm": 1.1190450985953022, + "learning_rate": 5.981763127022135e-07, + "loss": 1.6619, + "step": 276 + }, + { + "epoch": 0.037147966025927584, + "grad_norm": 1.235343710444344, + "learning_rate": 5.981630886685369e-07, + "loss": 1.7484, + "step": 277 + }, + { + "epoch": 0.0372820742065266, + "grad_norm": 1.2266668117138695, + "learning_rate": 5.98149817026093e-07, + "loss": 1.6734, + "step": 278 + }, + { + "epoch": 0.037416182387125616, + "grad_norm": 1.4154140120426957, + "learning_rate": 5.981364977772374e-07, + "loss": 1.7073, + "step": 279 + }, + { + "epoch": 0.03755029056772463, + "grad_norm": 1.2222936436898488, + "learning_rate": 5.981231309243353e-07, + "loss": 1.7837, + "step": 280 + }, + { + "epoch": 0.03768439874832365, + "grad_norm": 1.1519207095634527, + "learning_rate": 5.981097164697594e-07, + "loss": 1.7349, + "step": 281 + }, + { + "epoch": 0.03781850692892266, + "grad_norm": 1.172450505222872, + "learning_rate": 5.980962544158915e-07, + "loss": 1.7005, + "step": 282 + }, + { + "epoch": 0.03795261510952168, + "grad_norm": 1.2857156876454048, + "learning_rate": 5.980827447651216e-07, + "loss": 1.561, + "step": 283 + }, + { + "epoch": 0.038086723290120694, + "grad_norm": 1.2389387482561154, + "learning_rate": 5.98069187519848e-07, + "loss": 1.7068, + "step": 284 + }, + { + "epoch": 0.03822083147071972, + "grad_norm": 1.163985598391861, + "learning_rate": 5.980555826824778e-07, + "loss": 1.7442, + "step": 285 + }, + { + "epoch": 0.03835493965131873, + "grad_norm": 1.1048173896847064, + "learning_rate": 5.980419302554261e-07, + "loss": 1.685, + "step": 286 + }, + { + "epoch": 0.03848904783191775, + "grad_norm": 1.472564099104008, + "learning_rate": 5.98028230241117e-07, + "loss": 1.6997, + "step": 287 + }, + { + "epoch": 0.038623156012516764, + "grad_norm": 1.287728938848147, + "learning_rate": 5.980144826419825e-07, + "loss": 1.7084, + "step": 288 + }, + { + "epoch": 0.03875726419311578, + "grad_norm": 1.124267938500328, + "learning_rate": 5.980006874604635e-07, + "loss": 1.7134, + "step": 289 + }, + { + "epoch": 0.038891372373714796, + "grad_norm": 1.1218572497983328, + "learning_rate": 5.979868446990091e-07, + "loss": 1.6841, + "step": 290 + }, + { + "epoch": 0.03902548055431381, + "grad_norm": 1.1011749075237598, + "learning_rate": 5.979729543600769e-07, + "loss": 1.7323, + "step": 291 + }, + { + "epoch": 0.03915958873491283, + "grad_norm": 1.100745780533083, + "learning_rate": 5.979590164461328e-07, + "loss": 1.6788, + "step": 292 + }, + { + "epoch": 0.03929369691551184, + "grad_norm": 1.1613502217053182, + "learning_rate": 5.979450309596514e-07, + "loss": 1.6776, + "step": 293 + }, + { + "epoch": 0.039427805096110866, + "grad_norm": 1.089657509345998, + "learning_rate": 5.979309979031158e-07, + "loss": 1.7068, + "step": 294 + }, + { + "epoch": 0.03956191327670988, + "grad_norm": 1.1436391576530838, + "learning_rate": 5.97916917279017e-07, + "loss": 1.7388, + "step": 295 + }, + { + "epoch": 0.0396960214573089, + "grad_norm": 1.1145075933124646, + "learning_rate": 5.979027890898551e-07, + "loss": 1.7004, + "step": 296 + }, + { + "epoch": 0.03983012963790791, + "grad_norm": 1.0907272047712597, + "learning_rate": 5.978886133381384e-07, + "loss": 1.679, + "step": 297 + }, + { + "epoch": 0.03996423781850693, + "grad_norm": 1.12558267559901, + "learning_rate": 5.978743900263835e-07, + "loss": 1.6608, + "step": 298 + }, + { + "epoch": 0.040098345999105944, + "grad_norm": 1.136659951867088, + "learning_rate": 5.978601191571155e-07, + "loss": 1.6383, + "step": 299 + }, + { + "epoch": 0.04023245417970496, + "grad_norm": 1.2441133556300974, + "learning_rate": 5.978458007328682e-07, + "loss": 1.7697, + "step": 300 + }, + { + "epoch": 0.040366562360303976, + "grad_norm": 1.216051798039534, + "learning_rate": 5.978314347561835e-07, + "loss": 1.7656, + "step": 301 + }, + { + "epoch": 0.040500670540903, + "grad_norm": 1.1193332609304543, + "learning_rate": 5.978170212296118e-07, + "loss": 1.7034, + "step": 302 + }, + { + "epoch": 0.040634778721502014, + "grad_norm": 1.1450830933525635, + "learning_rate": 5.978025601557124e-07, + "loss": 1.6769, + "step": 303 + }, + { + "epoch": 0.04076888690210103, + "grad_norm": 1.1570981861957024, + "learning_rate": 5.977880515370523e-07, + "loss": 1.7491, + "step": 304 + }, + { + "epoch": 0.040902995082700046, + "grad_norm": 1.103432713835437, + "learning_rate": 5.977734953762075e-07, + "loss": 1.6544, + "step": 305 + }, + { + "epoch": 0.04103710326329906, + "grad_norm": 1.134144784637958, + "learning_rate": 5.97758891675762e-07, + "loss": 1.7084, + "step": 306 + }, + { + "epoch": 0.04117121144389808, + "grad_norm": 1.07738843402297, + "learning_rate": 5.977442404383088e-07, + "loss": 1.7369, + "step": 307 + }, + { + "epoch": 0.04130531962449709, + "grad_norm": 1.1164259724731038, + "learning_rate": 5.977295416664489e-07, + "loss": 1.6785, + "step": 308 + }, + { + "epoch": 0.04143942780509611, + "grad_norm": 1.2001430339127754, + "learning_rate": 5.977147953627918e-07, + "loss": 1.6496, + "step": 309 + }, + { + "epoch": 0.041573535985695124, + "grad_norm": 1.1849867153137015, + "learning_rate": 5.977000015299557e-07, + "loss": 1.6736, + "step": 310 + }, + { + "epoch": 0.04170764416629415, + "grad_norm": 1.1582589308770772, + "learning_rate": 5.976851601705669e-07, + "loss": 1.6775, + "step": 311 + }, + { + "epoch": 0.04184175234689316, + "grad_norm": 1.1033822470615744, + "learning_rate": 5.976702712872603e-07, + "loss": 1.6598, + "step": 312 + }, + { + "epoch": 0.04197586052749218, + "grad_norm": 1.1682634791444901, + "learning_rate": 5.976553348826793e-07, + "loss": 1.7557, + "step": 313 + }, + { + "epoch": 0.042109968708091194, + "grad_norm": 1.0838004153530265, + "learning_rate": 5.976403509594756e-07, + "loss": 1.6741, + "step": 314 + }, + { + "epoch": 0.04224407688869021, + "grad_norm": 1.121835854661048, + "learning_rate": 5.976253195203092e-07, + "loss": 1.7262, + "step": 315 + }, + { + "epoch": 0.042378185069289226, + "grad_norm": 1.1243699312065234, + "learning_rate": 5.976102405678491e-07, + "loss": 1.7902, + "step": 316 + }, + { + "epoch": 0.04251229324988824, + "grad_norm": 1.0991499127058322, + "learning_rate": 5.975951141047721e-07, + "loss": 1.6865, + "step": 317 + }, + { + "epoch": 0.04264640143048726, + "grad_norm": 1.126580502499325, + "learning_rate": 5.975799401337638e-07, + "loss": 1.6798, + "step": 318 + }, + { + "epoch": 0.04278050961108627, + "grad_norm": 1.1221949135632994, + "learning_rate": 5.975647186575182e-07, + "loss": 1.7491, + "step": 319 + }, + { + "epoch": 0.042914617791685296, + "grad_norm": 1.14926550813679, + "learning_rate": 5.975494496787376e-07, + "loss": 1.6549, + "step": 320 + }, + { + "epoch": 0.04304872597228431, + "grad_norm": 1.12638348214928, + "learning_rate": 5.975341332001328e-07, + "loss": 1.5897, + "step": 321 + }, + { + "epoch": 0.04318283415288333, + "grad_norm": 1.1725295960645503, + "learning_rate": 5.97518769224423e-07, + "loss": 1.695, + "step": 322 + }, + { + "epoch": 0.04331694233348234, + "grad_norm": 1.0904790236385375, + "learning_rate": 5.975033577543359e-07, + "loss": 1.6841, + "step": 323 + }, + { + "epoch": 0.04345105051408136, + "grad_norm": 1.1090846497862015, + "learning_rate": 5.974878987926075e-07, + "loss": 1.6075, + "step": 324 + }, + { + "epoch": 0.043585158694680375, + "grad_norm": 1.2329654322486787, + "learning_rate": 5.974723923419827e-07, + "loss": 1.7124, + "step": 325 + }, + { + "epoch": 0.04371926687527939, + "grad_norm": 1.1520738825385615, + "learning_rate": 5.974568384052139e-07, + "loss": 1.7492, + "step": 326 + }, + { + "epoch": 0.043853375055878406, + "grad_norm": 1.107509031801798, + "learning_rate": 5.974412369850631e-07, + "loss": 1.7233, + "step": 327 + }, + { + "epoch": 0.04398748323647742, + "grad_norm": 1.9987713290159552, + "learning_rate": 5.974255880842995e-07, + "loss": 1.7005, + "step": 328 + }, + { + "epoch": 0.044121591417076444, + "grad_norm": 1.1227927295658309, + "learning_rate": 5.974098917057019e-07, + "loss": 1.8204, + "step": 329 + }, + { + "epoch": 0.04425569959767546, + "grad_norm": 1.1208739563830832, + "learning_rate": 5.973941478520565e-07, + "loss": 1.7393, + "step": 330 + }, + { + "epoch": 0.044389807778274476, + "grad_norm": 1.0722310163444908, + "learning_rate": 5.973783565261589e-07, + "loss": 1.6568, + "step": 331 + }, + { + "epoch": 0.04452391595887349, + "grad_norm": 1.1809997483096673, + "learning_rate": 5.973625177308124e-07, + "loss": 1.7233, + "step": 332 + }, + { + "epoch": 0.04465802413947251, + "grad_norm": 1.0854965350422932, + "learning_rate": 5.973466314688289e-07, + "loss": 1.5838, + "step": 333 + }, + { + "epoch": 0.04479213232007152, + "grad_norm": 1.0394749005048125, + "learning_rate": 5.973306977430288e-07, + "loss": 1.6982, + "step": 334 + }, + { + "epoch": 0.04492624050067054, + "grad_norm": 1.1372698128741796, + "learning_rate": 5.973147165562409e-07, + "loss": 1.7363, + "step": 335 + }, + { + "epoch": 0.045060348681269555, + "grad_norm": 1.0872018588712997, + "learning_rate": 5.972986879113027e-07, + "loss": 1.7134, + "step": 336 + }, + { + "epoch": 0.04519445686186858, + "grad_norm": 1.136573181976626, + "learning_rate": 5.972826118110597e-07, + "loss": 1.6747, + "step": 337 + }, + { + "epoch": 0.04532856504246759, + "grad_norm": 1.1438807799337474, + "learning_rate": 5.972664882583659e-07, + "loss": 1.7632, + "step": 338 + }, + { + "epoch": 0.04546267322306661, + "grad_norm": 1.1746151029086915, + "learning_rate": 5.97250317256084e-07, + "loss": 1.5568, + "step": 339 + }, + { + "epoch": 0.045596781403665625, + "grad_norm": 1.067551171735795, + "learning_rate": 5.972340988070848e-07, + "loss": 1.7722, + "step": 340 + }, + { + "epoch": 0.04573088958426464, + "grad_norm": 1.100004825990679, + "learning_rate": 5.972178329142476e-07, + "loss": 1.7111, + "step": 341 + }, + { + "epoch": 0.045864997764863656, + "grad_norm": 1.3130274389549708, + "learning_rate": 5.972015195804604e-07, + "loss": 1.7768, + "step": 342 + }, + { + "epoch": 0.04599910594546267, + "grad_norm": 1.1532781776242376, + "learning_rate": 5.971851588086195e-07, + "loss": 1.7096, + "step": 343 + }, + { + "epoch": 0.04613321412606169, + "grad_norm": 1.1087417118719138, + "learning_rate": 5.971687506016292e-07, + "loss": 1.6085, + "step": 344 + }, + { + "epoch": 0.0462673223066607, + "grad_norm": 1.105566689388399, + "learning_rate": 5.971522949624028e-07, + "loss": 1.6791, + "step": 345 + }, + { + "epoch": 0.046401430487259726, + "grad_norm": 1.090277130406352, + "learning_rate": 5.971357918938616e-07, + "loss": 1.6585, + "step": 346 + }, + { + "epoch": 0.04653553866785874, + "grad_norm": 1.1679080769492398, + "learning_rate": 5.971192413989357e-07, + "loss": 1.6861, + "step": 347 + }, + { + "epoch": 0.04666964684845776, + "grad_norm": 1.1647454348028623, + "learning_rate": 5.971026434805633e-07, + "loss": 1.7167, + "step": 348 + }, + { + "epoch": 0.04680375502905677, + "grad_norm": 1.1324717330275416, + "learning_rate": 5.970859981416911e-07, + "loss": 1.6656, + "step": 349 + }, + { + "epoch": 0.04693786320965579, + "grad_norm": 1.0895090583275637, + "learning_rate": 5.970693053852743e-07, + "loss": 1.7932, + "step": 350 + }, + { + "epoch": 0.047071971390254805, + "grad_norm": 1.0846672521830747, + "learning_rate": 5.970525652142767e-07, + "loss": 1.568, + "step": 351 + }, + { + "epoch": 0.04720607957085382, + "grad_norm": 1.0946401497383844, + "learning_rate": 5.970357776316699e-07, + "loss": 1.6717, + "step": 352 + }, + { + "epoch": 0.047340187751452836, + "grad_norm": 1.203590152178876, + "learning_rate": 5.970189426404346e-07, + "loss": 1.6852, + "step": 353 + }, + { + "epoch": 0.04747429593205185, + "grad_norm": 1.1550529538782315, + "learning_rate": 5.970020602435594e-07, + "loss": 1.7621, + "step": 354 + }, + { + "epoch": 0.047608404112650875, + "grad_norm": 1.096867626156823, + "learning_rate": 5.969851304440418e-07, + "loss": 1.7309, + "step": 355 + }, + { + "epoch": 0.04774251229324989, + "grad_norm": 1.166383772927886, + "learning_rate": 5.969681532448872e-07, + "loss": 1.7181, + "step": 356 + }, + { + "epoch": 0.047876620473848906, + "grad_norm": 1.1239983839028163, + "learning_rate": 5.9695112864911e-07, + "loss": 1.6855, + "step": 357 + }, + { + "epoch": 0.04801072865444792, + "grad_norm": 1.146063042749729, + "learning_rate": 5.969340566597323e-07, + "loss": 1.7481, + "step": 358 + }, + { + "epoch": 0.04814483683504694, + "grad_norm": 1.1888010033263623, + "learning_rate": 5.969169372797852e-07, + "loss": 1.7679, + "step": 359 + }, + { + "epoch": 0.048278945015645953, + "grad_norm": 1.1182477969412692, + "learning_rate": 5.96899770512308e-07, + "loss": 1.703, + "step": 360 + }, + { + "epoch": 0.04841305319624497, + "grad_norm": 1.1404473863138842, + "learning_rate": 5.968825563603486e-07, + "loss": 1.7899, + "step": 361 + }, + { + "epoch": 0.048547161376843985, + "grad_norm": 1.1404415220346715, + "learning_rate": 5.968652948269629e-07, + "loss": 1.6586, + "step": 362 + }, + { + "epoch": 0.048681269557443, + "grad_norm": 1.0188482574967557, + "learning_rate": 5.968479859152155e-07, + "loss": 1.6772, + "step": 363 + }, + { + "epoch": 0.04881537773804202, + "grad_norm": 1.1444032147790508, + "learning_rate": 5.968306296281794e-07, + "loss": 1.7235, + "step": 364 + }, + { + "epoch": 0.04894948591864104, + "grad_norm": 1.147526204803139, + "learning_rate": 5.968132259689361e-07, + "loss": 1.6656, + "step": 365 + }, + { + "epoch": 0.049083594099240055, + "grad_norm": 1.094173771252459, + "learning_rate": 5.967957749405751e-07, + "loss": 1.6133, + "step": 366 + }, + { + "epoch": 0.04921770227983907, + "grad_norm": 1.1560369729609308, + "learning_rate": 5.967782765461948e-07, + "loss": 1.7796, + "step": 367 + }, + { + "epoch": 0.049351810460438086, + "grad_norm": 1.1696121017752343, + "learning_rate": 5.967607307889018e-07, + "loss": 1.65, + "step": 368 + }, + { + "epoch": 0.0494859186410371, + "grad_norm": 1.134918792559745, + "learning_rate": 5.967431376718111e-07, + "loss": 1.717, + "step": 369 + }, + { + "epoch": 0.04962002682163612, + "grad_norm": 1.0765623022573645, + "learning_rate": 5.967254971980461e-07, + "loss": 1.7028, + "step": 370 + }, + { + "epoch": 0.049754135002235134, + "grad_norm": 1.1093533051376567, + "learning_rate": 5.967078093707387e-07, + "loss": 1.687, + "step": 371 + }, + { + "epoch": 0.049888243182834156, + "grad_norm": 1.0724867576763264, + "learning_rate": 5.966900741930289e-07, + "loss": 1.709, + "step": 372 + }, + { + "epoch": 0.05002235136343317, + "grad_norm": 1.1870703976775374, + "learning_rate": 5.966722916680656e-07, + "loss": 1.7623, + "step": 373 + }, + { + "epoch": 0.05015645954403219, + "grad_norm": 1.1118336624167122, + "learning_rate": 5.966544617990058e-07, + "loss": 1.713, + "step": 374 + }, + { + "epoch": 0.050290567724631204, + "grad_norm": 1.1147242423912, + "learning_rate": 5.966365845890149e-07, + "loss": 1.5956, + "step": 375 + }, + { + "epoch": 0.05042467590523022, + "grad_norm": 1.1489546583821737, + "learning_rate": 5.966186600412668e-07, + "loss": 1.7536, + "step": 376 + }, + { + "epoch": 0.050558784085829235, + "grad_norm": 1.0985836995809481, + "learning_rate": 5.966006881589437e-07, + "loss": 1.6415, + "step": 377 + }, + { + "epoch": 0.05069289226642825, + "grad_norm": 1.5210499221056473, + "learning_rate": 5.965826689452363e-07, + "loss": 1.7034, + "step": 378 + }, + { + "epoch": 0.05082700044702727, + "grad_norm": 1.1770747351660449, + "learning_rate": 5.965646024033437e-07, + "loss": 1.7998, + "step": 379 + }, + { + "epoch": 0.05096110862762628, + "grad_norm": 1.103353857870669, + "learning_rate": 5.965464885364734e-07, + "loss": 1.677, + "step": 380 + }, + { + "epoch": 0.051095216808225305, + "grad_norm": 1.1279052370658624, + "learning_rate": 5.965283273478411e-07, + "loss": 1.7125, + "step": 381 + }, + { + "epoch": 0.05122932498882432, + "grad_norm": 1.1260317026536582, + "learning_rate": 5.965101188406713e-07, + "loss": 1.713, + "step": 382 + }, + { + "epoch": 0.051363433169423336, + "grad_norm": 1.1217115939734228, + "learning_rate": 5.964918630181966e-07, + "loss": 1.7513, + "step": 383 + }, + { + "epoch": 0.05149754135002235, + "grad_norm": 1.0938140494838644, + "learning_rate": 5.964735598836581e-07, + "loss": 1.6722, + "step": 384 + }, + { + "epoch": 0.05163164953062137, + "grad_norm": 1.5746119243016816, + "learning_rate": 5.964552094403051e-07, + "loss": 1.7249, + "step": 385 + }, + { + "epoch": 0.051765757711220384, + "grad_norm": 1.1376993855927013, + "learning_rate": 5.964368116913957e-07, + "loss": 1.7292, + "step": 386 + }, + { + "epoch": 0.0518998658918194, + "grad_norm": 1.1288484886032422, + "learning_rate": 5.96418366640196e-07, + "loss": 1.7373, + "step": 387 + }, + { + "epoch": 0.052033974072418415, + "grad_norm": 1.0912837401536597, + "learning_rate": 5.963998742899809e-07, + "loss": 1.6279, + "step": 388 + }, + { + "epoch": 0.05216808225301743, + "grad_norm": 1.080399914264917, + "learning_rate": 5.963813346440332e-07, + "loss": 1.6828, + "step": 389 + }, + { + "epoch": 0.052302190433616454, + "grad_norm": 1.18296526148637, + "learning_rate": 5.963627477056445e-07, + "loss": 1.7037, + "step": 390 + }, + { + "epoch": 0.05243629861421547, + "grad_norm": 1.0700933148095726, + "learning_rate": 5.963441134781147e-07, + "loss": 1.6773, + "step": 391 + }, + { + "epoch": 0.052570406794814485, + "grad_norm": 1.5541605676471624, + "learning_rate": 5.963254319647519e-07, + "loss": 1.5786, + "step": 392 + }, + { + "epoch": 0.0527045149754135, + "grad_norm": 1.154992915725033, + "learning_rate": 5.96306703168873e-07, + "loss": 1.7743, + "step": 393 + }, + { + "epoch": 0.05283862315601252, + "grad_norm": 1.117612338423665, + "learning_rate": 5.962879270938028e-07, + "loss": 1.723, + "step": 394 + }, + { + "epoch": 0.05297273133661153, + "grad_norm": 1.0907791376426386, + "learning_rate": 5.96269103742875e-07, + "loss": 1.73, + "step": 395 + }, + { + "epoch": 0.05310683951721055, + "grad_norm": 1.1325939188472074, + "learning_rate": 5.962502331194311e-07, + "loss": 1.6756, + "step": 396 + }, + { + "epoch": 0.053240947697809564, + "grad_norm": 1.0925915487497773, + "learning_rate": 5.962313152268218e-07, + "loss": 1.7166, + "step": 397 + }, + { + "epoch": 0.05337505587840858, + "grad_norm": 1.1102789558363542, + "learning_rate": 5.96212350068405e-07, + "loss": 1.6697, + "step": 398 + }, + { + "epoch": 0.0535091640590076, + "grad_norm": 1.1054817006563584, + "learning_rate": 5.961933376475485e-07, + "loss": 1.7231, + "step": 399 + }, + { + "epoch": 0.05364327223960662, + "grad_norm": 1.307573555314525, + "learning_rate": 5.961742779676272e-07, + "loss": 1.7651, + "step": 400 + }, + { + "epoch": 0.053777380420205634, + "grad_norm": 1.1445042759796842, + "learning_rate": 5.961551710320251e-07, + "loss": 1.6765, + "step": 401 + }, + { + "epoch": 0.05391148860080465, + "grad_norm": 1.0762583158173675, + "learning_rate": 5.961360168441342e-07, + "loss": 1.6481, + "step": 402 + }, + { + "epoch": 0.054045596781403665, + "grad_norm": 1.1084304546525765, + "learning_rate": 5.961168154073553e-07, + "loss": 1.7338, + "step": 403 + }, + { + "epoch": 0.05417970496200268, + "grad_norm": 1.0982232521403124, + "learning_rate": 5.960975667250972e-07, + "loss": 1.6638, + "step": 404 + }, + { + "epoch": 0.0543138131426017, + "grad_norm": 1.2140530141548174, + "learning_rate": 5.960782708007773e-07, + "loss": 1.7516, + "step": 405 + }, + { + "epoch": 0.05444792132320071, + "grad_norm": 1.5212193377424008, + "learning_rate": 5.960589276378213e-07, + "loss": 1.7427, + "step": 406 + }, + { + "epoch": 0.054582029503799735, + "grad_norm": 1.11412919662803, + "learning_rate": 5.960395372396633e-07, + "loss": 1.6931, + "step": 407 + }, + { + "epoch": 0.05471613768439875, + "grad_norm": 1.0851895981130018, + "learning_rate": 5.960200996097458e-07, + "loss": 1.6913, + "step": 408 + }, + { + "epoch": 0.05485024586499777, + "grad_norm": 1.1246816244588258, + "learning_rate": 5.960006147515199e-07, + "loss": 1.7152, + "step": 409 + }, + { + "epoch": 0.05498435404559678, + "grad_norm": 1.0772018259030958, + "learning_rate": 5.959810826684446e-07, + "loss": 1.7227, + "step": 410 + }, + { + "epoch": 0.0551184622261958, + "grad_norm": 1.1172898063954977, + "learning_rate": 5.959615033639877e-07, + "loss": 1.6459, + "step": 411 + }, + { + "epoch": 0.055252570406794814, + "grad_norm": 1.190430020238442, + "learning_rate": 5.959418768416252e-07, + "loss": 1.7491, + "step": 412 + }, + { + "epoch": 0.05538667858739383, + "grad_norm": 1.0954974858449955, + "learning_rate": 5.959222031048417e-07, + "loss": 1.7136, + "step": 413 + }, + { + "epoch": 0.055520786767992845, + "grad_norm": 1.1287823535303052, + "learning_rate": 5.959024821571296e-07, + "loss": 1.7765, + "step": 414 + }, + { + "epoch": 0.05565489494859186, + "grad_norm": 1.0561812337694518, + "learning_rate": 5.958827140019905e-07, + "loss": 1.6913, + "step": 415 + }, + { + "epoch": 0.055789003129190884, + "grad_norm": 1.1085682708952787, + "learning_rate": 5.958628986429338e-07, + "loss": 1.7022, + "step": 416 + }, + { + "epoch": 0.0559231113097899, + "grad_norm": 1.145351387138441, + "learning_rate": 5.958430360834773e-07, + "loss": 1.7236, + "step": 417 + }, + { + "epoch": 0.056057219490388915, + "grad_norm": 1.0897443627255616, + "learning_rate": 5.958231263271476e-07, + "loss": 1.6012, + "step": 418 + }, + { + "epoch": 0.05619132767098793, + "grad_norm": 1.1200731868604838, + "learning_rate": 5.958031693774794e-07, + "loss": 1.7389, + "step": 419 + }, + { + "epoch": 0.05632543585158695, + "grad_norm": 1.1038585013517133, + "learning_rate": 5.957831652380156e-07, + "loss": 1.583, + "step": 420 + }, + { + "epoch": 0.05645954403218596, + "grad_norm": 1.4548045332193216, + "learning_rate": 5.95763113912308e-07, + "loss": 1.7524, + "step": 421 + }, + { + "epoch": 0.05659365221278498, + "grad_norm": 1.1692222790883888, + "learning_rate": 5.95743015403916e-07, + "loss": 1.6299, + "step": 422 + }, + { + "epoch": 0.056727760393383994, + "grad_norm": 1.1247764368969244, + "learning_rate": 5.95722869716408e-07, + "loss": 1.5839, + "step": 423 + }, + { + "epoch": 0.05686186857398301, + "grad_norm": 1.1555568325620067, + "learning_rate": 5.957026768533605e-07, + "loss": 1.7239, + "step": 424 + }, + { + "epoch": 0.05699597675458203, + "grad_norm": 1.1216899351148046, + "learning_rate": 5.956824368183589e-07, + "loss": 1.7256, + "step": 425 + }, + { + "epoch": 0.05713008493518105, + "grad_norm": 1.145568323616433, + "learning_rate": 5.956621496149961e-07, + "loss": 1.6824, + "step": 426 + }, + { + "epoch": 0.057264193115780064, + "grad_norm": 1.0986327998626733, + "learning_rate": 5.956418152468739e-07, + "loss": 1.6288, + "step": 427 + }, + { + "epoch": 0.05739830129637908, + "grad_norm": 1.107394613480044, + "learning_rate": 5.956214337176026e-07, + "loss": 1.7525, + "step": 428 + }, + { + "epoch": 0.057532409476978096, + "grad_norm": 1.1530636510188206, + "learning_rate": 5.956010050308003e-07, + "loss": 1.6703, + "step": 429 + }, + { + "epoch": 0.05766651765757711, + "grad_norm": 1.2684443748494443, + "learning_rate": 5.955805291900944e-07, + "loss": 1.7255, + "step": 430 + }, + { + "epoch": 0.05780062583817613, + "grad_norm": 1.1216850925610182, + "learning_rate": 5.955600061991196e-07, + "loss": 1.6833, + "step": 431 + }, + { + "epoch": 0.05793473401877514, + "grad_norm": 1.1163294449512198, + "learning_rate": 5.955394360615196e-07, + "loss": 1.6738, + "step": 432 + }, + { + "epoch": 0.05806884219937416, + "grad_norm": 1.0993928108999345, + "learning_rate": 5.955188187809465e-07, + "loss": 1.575, + "step": 433 + }, + { + "epoch": 0.05820295037997318, + "grad_norm": 1.199099074821361, + "learning_rate": 5.954981543610606e-07, + "loss": 1.7117, + "step": 434 + }, + { + "epoch": 0.0583370585605722, + "grad_norm": 1.1208106037393502, + "learning_rate": 5.954774428055305e-07, + "loss": 1.7093, + "step": 435 + }, + { + "epoch": 0.05847116674117121, + "grad_norm": 1.2627670829161222, + "learning_rate": 5.954566841180332e-07, + "loss": 1.6188, + "step": 436 + }, + { + "epoch": 0.05860527492177023, + "grad_norm": 1.0799814850943354, + "learning_rate": 5.954358783022543e-07, + "loss": 1.7059, + "step": 437 + }, + { + "epoch": 0.058739383102369244, + "grad_norm": 1.1341395954441937, + "learning_rate": 5.954150253618875e-07, + "loss": 1.5712, + "step": 438 + }, + { + "epoch": 0.05887349128296826, + "grad_norm": 1.1117856654912641, + "learning_rate": 5.95394125300635e-07, + "loss": 1.6777, + "step": 439 + }, + { + "epoch": 0.059007599463567276, + "grad_norm": 1.0923581672387388, + "learning_rate": 5.953731781222071e-07, + "loss": 1.7159, + "step": 440 + }, + { + "epoch": 0.05914170764416629, + "grad_norm": 1.0600443650637132, + "learning_rate": 5.953521838303231e-07, + "loss": 1.7249, + "step": 441 + }, + { + "epoch": 0.059275815824765314, + "grad_norm": 1.2138612225345329, + "learning_rate": 5.9533114242871e-07, + "loss": 1.7013, + "step": 442 + }, + { + "epoch": 0.05940992400536433, + "grad_norm": 1.0419430689297875, + "learning_rate": 5.953100539211034e-07, + "loss": 1.7552, + "step": 443 + }, + { + "epoch": 0.059544032185963346, + "grad_norm": 1.1237438417872123, + "learning_rate": 5.952889183112474e-07, + "loss": 1.7112, + "step": 444 + }, + { + "epoch": 0.05967814036656236, + "grad_norm": 1.2319625967973615, + "learning_rate": 5.952677356028943e-07, + "loss": 1.7093, + "step": 445 + }, + { + "epoch": 0.05981224854716138, + "grad_norm": 1.086955577183242, + "learning_rate": 5.952465057998049e-07, + "loss": 1.6358, + "step": 446 + }, + { + "epoch": 0.05994635672776039, + "grad_norm": 1.1264500428377913, + "learning_rate": 5.952252289057481e-07, + "loss": 1.7178, + "step": 447 + }, + { + "epoch": 0.06008046490835941, + "grad_norm": 1.128811841099524, + "learning_rate": 5.952039049245012e-07, + "loss": 1.7591, + "step": 448 + }, + { + "epoch": 0.060214573088958424, + "grad_norm": 1.1110504835526924, + "learning_rate": 5.951825338598503e-07, + "loss": 1.6403, + "step": 449 + }, + { + "epoch": 0.06034868126955744, + "grad_norm": 1.2271379194246814, + "learning_rate": 5.951611157155895e-07, + "loss": 1.7213, + "step": 450 + }, + { + "epoch": 0.06048278945015646, + "grad_norm": 1.1228932913870193, + "learning_rate": 5.951396504955212e-07, + "loss": 1.5935, + "step": 451 + }, + { + "epoch": 0.06061689763075548, + "grad_norm": 1.11062455626935, + "learning_rate": 5.951181382034563e-07, + "loss": 1.6998, + "step": 452 + }, + { + "epoch": 0.060751005811354494, + "grad_norm": 1.0990862927657152, + "learning_rate": 5.950965788432139e-07, + "loss": 1.6468, + "step": 453 + }, + { + "epoch": 0.06088511399195351, + "grad_norm": 1.2688756973522501, + "learning_rate": 5.950749724186219e-07, + "loss": 1.741, + "step": 454 + }, + { + "epoch": 0.061019222172552526, + "grad_norm": 1.2895801173515846, + "learning_rate": 5.950533189335158e-07, + "loss": 1.6955, + "step": 455 + }, + { + "epoch": 0.06115333035315154, + "grad_norm": 1.077512840039689, + "learning_rate": 5.950316183917403e-07, + "loss": 1.641, + "step": 456 + }, + { + "epoch": 0.06128743853375056, + "grad_norm": 1.0847961133378894, + "learning_rate": 5.950098707971477e-07, + "loss": 1.83, + "step": 457 + }, + { + "epoch": 0.06142154671434957, + "grad_norm": 1.1936301482363822, + "learning_rate": 5.949880761535992e-07, + "loss": 1.8029, + "step": 458 + }, + { + "epoch": 0.06155565489494859, + "grad_norm": 1.1712115230746196, + "learning_rate": 5.949662344649641e-07, + "loss": 1.7041, + "step": 459 + }, + { + "epoch": 0.06168976307554761, + "grad_norm": 1.1207575353150439, + "learning_rate": 5.9494434573512e-07, + "loss": 1.8268, + "step": 460 + }, + { + "epoch": 0.06182387125614663, + "grad_norm": 1.0875570889732413, + "learning_rate": 5.949224099679532e-07, + "loss": 1.7194, + "step": 461 + }, + { + "epoch": 0.06195797943674564, + "grad_norm": 1.0917010226696162, + "learning_rate": 5.949004271673578e-07, + "loss": 1.7354, + "step": 462 + }, + { + "epoch": 0.06209208761734466, + "grad_norm": 1.0997856156670267, + "learning_rate": 5.948783973372368e-07, + "loss": 1.7529, + "step": 463 + }, + { + "epoch": 0.062226195797943674, + "grad_norm": 1.0621713053596278, + "learning_rate": 5.948563204815011e-07, + "loss": 1.6898, + "step": 464 + }, + { + "epoch": 0.06236030397854269, + "grad_norm": 1.0614544715813865, + "learning_rate": 5.948341966040703e-07, + "loss": 1.7044, + "step": 465 + }, + { + "epoch": 0.062494412159141706, + "grad_norm": 1.154295913834985, + "learning_rate": 5.948120257088721e-07, + "loss": 1.739, + "step": 466 + }, + { + "epoch": 0.06262852033974073, + "grad_norm": 1.6321838989867514, + "learning_rate": 5.947898077998429e-07, + "loss": 1.6571, + "step": 467 + }, + { + "epoch": 0.06276262852033974, + "grad_norm": 1.1020818061209965, + "learning_rate": 5.947675428809268e-07, + "loss": 1.7457, + "step": 468 + }, + { + "epoch": 0.06289673670093876, + "grad_norm": 1.1541190378330166, + "learning_rate": 5.947452309560767e-07, + "loss": 1.7659, + "step": 469 + }, + { + "epoch": 0.06303084488153778, + "grad_norm": 1.084642443791217, + "learning_rate": 5.947228720292541e-07, + "loss": 1.7144, + "step": 470 + }, + { + "epoch": 0.06316495306213679, + "grad_norm": 1.1145594614023564, + "learning_rate": 5.947004661044283e-07, + "loss": 1.6729, + "step": 471 + }, + { + "epoch": 0.06329906124273581, + "grad_norm": 1.115158449397951, + "learning_rate": 5.946780131855772e-07, + "loss": 1.7349, + "step": 472 + }, + { + "epoch": 0.06343316942333482, + "grad_norm": 1.1366035122661107, + "learning_rate": 5.94655513276687e-07, + "loss": 1.7005, + "step": 473 + }, + { + "epoch": 0.06356727760393384, + "grad_norm": 1.1207240569861627, + "learning_rate": 5.946329663817522e-07, + "loss": 1.6988, + "step": 474 + }, + { + "epoch": 0.06370138578453285, + "grad_norm": 1.0633079931171385, + "learning_rate": 5.946103725047759e-07, + "loss": 1.6861, + "step": 475 + }, + { + "epoch": 0.06383549396513187, + "grad_norm": 1.148420369678469, + "learning_rate": 5.945877316497692e-07, + "loss": 1.7186, + "step": 476 + }, + { + "epoch": 0.06396960214573089, + "grad_norm": 1.1296345116481292, + "learning_rate": 5.945650438207517e-07, + "loss": 1.7515, + "step": 477 + }, + { + "epoch": 0.0641037103263299, + "grad_norm": 1.1072132368875205, + "learning_rate": 5.945423090217512e-07, + "loss": 1.7498, + "step": 478 + }, + { + "epoch": 0.06423781850692892, + "grad_norm": 1.0636459120097348, + "learning_rate": 5.945195272568042e-07, + "loss": 1.6705, + "step": 479 + }, + { + "epoch": 0.06437192668752793, + "grad_norm": 1.1184722760153458, + "learning_rate": 5.944966985299551e-07, + "loss": 1.74, + "step": 480 + }, + { + "epoch": 0.06450603486812695, + "grad_norm": 1.09226255206473, + "learning_rate": 5.944738228452569e-07, + "loss": 1.7125, + "step": 481 + }, + { + "epoch": 0.06464014304872598, + "grad_norm": 1.0980507704132523, + "learning_rate": 5.94450900206771e-07, + "loss": 1.7187, + "step": 482 + }, + { + "epoch": 0.064774251229325, + "grad_norm": 1.0944716620001702, + "learning_rate": 5.944279306185668e-07, + "loss": 1.5932, + "step": 483 + }, + { + "epoch": 0.06490835940992401, + "grad_norm": 1.1136224916178525, + "learning_rate": 5.944049140847224e-07, + "loss": 1.6976, + "step": 484 + }, + { + "epoch": 0.06504246759052303, + "grad_norm": 1.1013486929558047, + "learning_rate": 5.943818506093239e-07, + "loss": 1.6864, + "step": 485 + }, + { + "epoch": 0.06517657577112204, + "grad_norm": 1.1430455689049595, + "learning_rate": 5.943587401964661e-07, + "loss": 1.6274, + "step": 486 + }, + { + "epoch": 0.06531068395172106, + "grad_norm": 1.1269355413734778, + "learning_rate": 5.943355828502519e-07, + "loss": 1.7389, + "step": 487 + }, + { + "epoch": 0.06544479213232007, + "grad_norm": 1.1442671190598854, + "learning_rate": 5.943123785747925e-07, + "loss": 1.6724, + "step": 488 + }, + { + "epoch": 0.06557890031291909, + "grad_norm": 1.1006441895975216, + "learning_rate": 5.942891273742075e-07, + "loss": 1.687, + "step": 489 + }, + { + "epoch": 0.0657130084935181, + "grad_norm": 1.1130024103107554, + "learning_rate": 5.94265829252625e-07, + "loss": 1.6774, + "step": 490 + }, + { + "epoch": 0.06584711667411712, + "grad_norm": 1.10665029408129, + "learning_rate": 5.942424842141811e-07, + "loss": 1.7053, + "step": 491 + }, + { + "epoch": 0.06598122485471614, + "grad_norm": 1.0895398255696098, + "learning_rate": 5.942190922630204e-07, + "loss": 1.6816, + "step": 492 + }, + { + "epoch": 0.06611533303531515, + "grad_norm": 1.0952133118391503, + "learning_rate": 5.941956534032961e-07, + "loss": 1.58, + "step": 493 + }, + { + "epoch": 0.06624944121591417, + "grad_norm": 1.104962374424092, + "learning_rate": 5.941721676391691e-07, + "loss": 1.758, + "step": 494 + }, + { + "epoch": 0.06638354939651318, + "grad_norm": 1.1134158734370636, + "learning_rate": 5.941486349748091e-07, + "loss": 1.7508, + "step": 495 + }, + { + "epoch": 0.0665176575771122, + "grad_norm": 1.175784721072215, + "learning_rate": 5.94125055414394e-07, + "loss": 1.7113, + "step": 496 + }, + { + "epoch": 0.06665176575771121, + "grad_norm": 1.0778973456587042, + "learning_rate": 5.941014289621102e-07, + "loss": 1.7558, + "step": 497 + }, + { + "epoch": 0.06678587393831023, + "grad_norm": 1.11982522730228, + "learning_rate": 5.940777556221521e-07, + "loss": 1.6791, + "step": 498 + }, + { + "epoch": 0.06691998211890926, + "grad_norm": 1.1807400353238904, + "learning_rate": 5.940540353987225e-07, + "loss": 1.7484, + "step": 499 + }, + { + "epoch": 0.06705409029950828, + "grad_norm": 1.1987690536433178, + "learning_rate": 5.940302682960328e-07, + "loss": 1.59, + "step": 500 + }, + { + "epoch": 0.06718819848010729, + "grad_norm": 1.1093357389120035, + "learning_rate": 5.940064543183026e-07, + "loss": 1.8238, + "step": 501 + }, + { + "epoch": 0.06732230666070631, + "grad_norm": 1.2404864761664665, + "learning_rate": 5.939825934697594e-07, + "loss": 1.6965, + "step": 502 + }, + { + "epoch": 0.06745641484130532, + "grad_norm": 1.1369155507476978, + "learning_rate": 5.939586857546397e-07, + "loss": 1.7284, + "step": 503 + }, + { + "epoch": 0.06759052302190434, + "grad_norm": 1.0747025812432756, + "learning_rate": 5.939347311771877e-07, + "loss": 1.6029, + "step": 504 + }, + { + "epoch": 0.06772463120250335, + "grad_norm": 1.2065817260719833, + "learning_rate": 5.939107297416566e-07, + "loss": 1.7937, + "step": 505 + }, + { + "epoch": 0.06785873938310237, + "grad_norm": 1.072195510416472, + "learning_rate": 5.938866814523073e-07, + "loss": 1.6844, + "step": 506 + }, + { + "epoch": 0.06799284756370139, + "grad_norm": 1.0788223308291087, + "learning_rate": 5.938625863134092e-07, + "loss": 1.7651, + "step": 507 + }, + { + "epoch": 0.0681269557443004, + "grad_norm": 1.1125709389242076, + "learning_rate": 5.938384443292403e-07, + "loss": 1.6723, + "step": 508 + }, + { + "epoch": 0.06826106392489942, + "grad_norm": 1.2370173408194798, + "learning_rate": 5.938142555040863e-07, + "loss": 1.6491, + "step": 509 + }, + { + "epoch": 0.06839517210549843, + "grad_norm": 1.0646655039063193, + "learning_rate": 5.93790019842242e-07, + "loss": 1.7609, + "step": 510 + }, + { + "epoch": 0.06852928028609745, + "grad_norm": 1.137655615576816, + "learning_rate": 5.9376573734801e-07, + "loss": 1.6971, + "step": 511 + }, + { + "epoch": 0.06866338846669647, + "grad_norm": 1.1610648719854884, + "learning_rate": 5.937414080257011e-07, + "loss": 1.7563, + "step": 512 + }, + { + "epoch": 0.06879749664729548, + "grad_norm": 1.022128030652968, + "learning_rate": 5.93717031879635e-07, + "loss": 1.6585, + "step": 513 + }, + { + "epoch": 0.0689316048278945, + "grad_norm": 1.1094802666159138, + "learning_rate": 5.936926089141391e-07, + "loss": 1.6963, + "step": 514 + }, + { + "epoch": 0.06906571300849351, + "grad_norm": 1.0491463968940271, + "learning_rate": 5.936681391335494e-07, + "loss": 1.653, + "step": 515 + }, + { + "epoch": 0.06919982118909253, + "grad_norm": 1.1153617117594175, + "learning_rate": 5.936436225422104e-07, + "loss": 1.6738, + "step": 516 + }, + { + "epoch": 0.06933392936969156, + "grad_norm": 1.1150239468835819, + "learning_rate": 5.936190591444744e-07, + "loss": 1.726, + "step": 517 + }, + { + "epoch": 0.06946803755029057, + "grad_norm": 1.1299338290201733, + "learning_rate": 5.935944489447026e-07, + "loss": 1.6814, + "step": 518 + }, + { + "epoch": 0.06960214573088959, + "grad_norm": 1.0925086075502406, + "learning_rate": 5.935697919472639e-07, + "loss": 1.6141, + "step": 519 + }, + { + "epoch": 0.0697362539114886, + "grad_norm": 1.1136653572074133, + "learning_rate": 5.93545088156536e-07, + "loss": 1.6752, + "step": 520 + }, + { + "epoch": 0.06987036209208762, + "grad_norm": 1.086968726752448, + "learning_rate": 5.935203375769048e-07, + "loss": 1.6593, + "step": 521 + }, + { + "epoch": 0.07000447027268664, + "grad_norm": 1.0785790431427873, + "learning_rate": 5.934955402127642e-07, + "loss": 1.7806, + "step": 522 + }, + { + "epoch": 0.07013857845328565, + "grad_norm": 1.061202101435773, + "learning_rate": 5.934706960685168e-07, + "loss": 1.6015, + "step": 523 + }, + { + "epoch": 0.07027268663388467, + "grad_norm": 1.1217377555129306, + "learning_rate": 5.934458051485734e-07, + "loss": 1.6836, + "step": 524 + }, + { + "epoch": 0.07040679481448368, + "grad_norm": 1.1634463467399316, + "learning_rate": 5.934208674573529e-07, + "loss": 1.641, + "step": 525 + }, + { + "epoch": 0.0705409029950827, + "grad_norm": 1.1853874452885456, + "learning_rate": 5.933958829992828e-07, + "loss": 1.6501, + "step": 526 + }, + { + "epoch": 0.07067501117568172, + "grad_norm": 1.0827543649368265, + "learning_rate": 5.933708517787985e-07, + "loss": 1.6664, + "step": 527 + }, + { + "epoch": 0.07080911935628073, + "grad_norm": 1.1171619381364966, + "learning_rate": 5.933457738003443e-07, + "loss": 1.6758, + "step": 528 + }, + { + "epoch": 0.07094322753687975, + "grad_norm": 1.2171560054678998, + "learning_rate": 5.933206490683722e-07, + "loss": 1.6914, + "step": 529 + }, + { + "epoch": 0.07107733571747876, + "grad_norm": 1.130266539632813, + "learning_rate": 5.932954775873429e-07, + "loss": 1.6301, + "step": 530 + }, + { + "epoch": 0.07121144389807778, + "grad_norm": 1.1814157624655244, + "learning_rate": 5.932702593617252e-07, + "loss": 1.689, + "step": 531 + }, + { + "epoch": 0.0713455520786768, + "grad_norm": 1.1423293526842793, + "learning_rate": 5.932449943959963e-07, + "loss": 1.7379, + "step": 532 + }, + { + "epoch": 0.07147966025927581, + "grad_norm": 1.0830256450215578, + "learning_rate": 5.932196826946416e-07, + "loss": 1.6752, + "step": 533 + }, + { + "epoch": 0.07161376843987484, + "grad_norm": 1.2254212102036337, + "learning_rate": 5.931943242621548e-07, + "loss": 1.7602, + "step": 534 + }, + { + "epoch": 0.07174787662047385, + "grad_norm": 1.1254407305546181, + "learning_rate": 5.931689191030381e-07, + "loss": 1.7144, + "step": 535 + }, + { + "epoch": 0.07188198480107287, + "grad_norm": 1.7531628186363164, + "learning_rate": 5.931434672218018e-07, + "loss": 1.7868, + "step": 536 + }, + { + "epoch": 0.07201609298167189, + "grad_norm": 1.1530768773395477, + "learning_rate": 5.931179686229645e-07, + "loss": 1.7128, + "step": 537 + }, + { + "epoch": 0.0721502011622709, + "grad_norm": 1.0869645546426585, + "learning_rate": 5.930924233110532e-07, + "loss": 1.626, + "step": 538 + }, + { + "epoch": 0.07228430934286992, + "grad_norm": 1.2196040558075754, + "learning_rate": 5.930668312906031e-07, + "loss": 1.7148, + "step": 539 + }, + { + "epoch": 0.07241841752346893, + "grad_norm": 1.1904076173283444, + "learning_rate": 5.930411925661577e-07, + "loss": 1.6981, + "step": 540 + }, + { + "epoch": 0.07255252570406795, + "grad_norm": 1.5987820485565098, + "learning_rate": 5.930155071422687e-07, + "loss": 1.7351, + "step": 541 + }, + { + "epoch": 0.07268663388466697, + "grad_norm": 1.101070130998752, + "learning_rate": 5.929897750234963e-07, + "loss": 1.6313, + "step": 542 + }, + { + "epoch": 0.07282074206526598, + "grad_norm": 1.0908625387826942, + "learning_rate": 5.929639962144091e-07, + "loss": 1.5891, + "step": 543 + }, + { + "epoch": 0.072954850245865, + "grad_norm": 1.0986511244523132, + "learning_rate": 5.929381707195834e-07, + "loss": 1.6991, + "step": 544 + }, + { + "epoch": 0.07308895842646401, + "grad_norm": 1.055356610594688, + "learning_rate": 5.929122985436045e-07, + "loss": 1.7331, + "step": 545 + }, + { + "epoch": 0.07322306660706303, + "grad_norm": 1.035590332821026, + "learning_rate": 5.928863796910655e-07, + "loss": 1.5682, + "step": 546 + }, + { + "epoch": 0.07335717478766204, + "grad_norm": 1.0783361793793855, + "learning_rate": 5.928604141665679e-07, + "loss": 1.6092, + "step": 547 + }, + { + "epoch": 0.07349128296826106, + "grad_norm": 1.090736305001705, + "learning_rate": 5.928344019747217e-07, + "loss": 1.7072, + "step": 548 + }, + { + "epoch": 0.07362539114886008, + "grad_norm": 1.4276709820636466, + "learning_rate": 5.928083431201449e-07, + "loss": 1.6789, + "step": 549 + }, + { + "epoch": 0.07375949932945909, + "grad_norm": 1.0906054014326296, + "learning_rate": 5.927822376074639e-07, + "loss": 1.7215, + "step": 550 + }, + { + "epoch": 0.0738936075100581, + "grad_norm": 1.364150462787829, + "learning_rate": 5.927560854413134e-07, + "loss": 1.6841, + "step": 551 + }, + { + "epoch": 0.07402771569065714, + "grad_norm": 1.1159870574206099, + "learning_rate": 5.927298866263363e-07, + "loss": 1.7298, + "step": 552 + }, + { + "epoch": 0.07416182387125615, + "grad_norm": 1.1812983592653572, + "learning_rate": 5.92703641167184e-07, + "loss": 1.7091, + "step": 553 + }, + { + "epoch": 0.07429593205185517, + "grad_norm": 1.0688687878186984, + "learning_rate": 5.926773490685159e-07, + "loss": 1.8398, + "step": 554 + }, + { + "epoch": 0.07443004023245418, + "grad_norm": 1.2894858274000411, + "learning_rate": 5.92651010335e-07, + "loss": 1.6902, + "step": 555 + }, + { + "epoch": 0.0745641484130532, + "grad_norm": 1.1464943136824657, + "learning_rate": 5.926246249713121e-07, + "loss": 1.7249, + "step": 556 + }, + { + "epoch": 0.07469825659365222, + "grad_norm": 1.3070568856631266, + "learning_rate": 5.925981929821368e-07, + "loss": 1.6741, + "step": 557 + }, + { + "epoch": 0.07483236477425123, + "grad_norm": 1.1646332582267231, + "learning_rate": 5.925717143721665e-07, + "loss": 1.6975, + "step": 558 + }, + { + "epoch": 0.07496647295485025, + "grad_norm": 1.213733563154542, + "learning_rate": 5.925451891461026e-07, + "loss": 1.6688, + "step": 559 + }, + { + "epoch": 0.07510058113544926, + "grad_norm": 1.1250145434758787, + "learning_rate": 5.925186173086538e-07, + "loss": 1.7044, + "step": 560 + }, + { + "epoch": 0.07523468931604828, + "grad_norm": 1.0865739045197238, + "learning_rate": 5.924919988645377e-07, + "loss": 1.6663, + "step": 561 + }, + { + "epoch": 0.0753687974966473, + "grad_norm": 1.1159580863498637, + "learning_rate": 5.924653338184801e-07, + "loss": 1.5986, + "step": 562 + }, + { + "epoch": 0.07550290567724631, + "grad_norm": 1.0795350956359355, + "learning_rate": 5.924386221752151e-07, + "loss": 1.7059, + "step": 563 + }, + { + "epoch": 0.07563701385784533, + "grad_norm": 1.059523546111381, + "learning_rate": 5.924118639394849e-07, + "loss": 1.6525, + "step": 564 + }, + { + "epoch": 0.07577112203844434, + "grad_norm": 1.0995795687250527, + "learning_rate": 5.923850591160401e-07, + "loss": 1.6524, + "step": 565 + }, + { + "epoch": 0.07590523021904336, + "grad_norm": 1.1092841538303688, + "learning_rate": 5.923582077096395e-07, + "loss": 1.7758, + "step": 566 + }, + { + "epoch": 0.07603933839964237, + "grad_norm": 2.6979584052916503, + "learning_rate": 5.923313097250504e-07, + "loss": 1.6593, + "step": 567 + }, + { + "epoch": 0.07617344658024139, + "grad_norm": 1.0621178435726715, + "learning_rate": 5.923043651670478e-07, + "loss": 1.6983, + "step": 568 + }, + { + "epoch": 0.07630755476084042, + "grad_norm": 1.1573135825405225, + "learning_rate": 5.922773740404157e-07, + "loss": 1.7572, + "step": 569 + }, + { + "epoch": 0.07644166294143943, + "grad_norm": 1.3034930029837637, + "learning_rate": 5.922503363499457e-07, + "loss": 1.7229, + "step": 570 + }, + { + "epoch": 0.07657577112203845, + "grad_norm": 1.063644093194536, + "learning_rate": 5.922232521004384e-07, + "loss": 1.6373, + "step": 571 + }, + { + "epoch": 0.07670987930263747, + "grad_norm": 1.0799490002557715, + "learning_rate": 5.921961212967018e-07, + "loss": 1.7291, + "step": 572 + }, + { + "epoch": 0.07684398748323648, + "grad_norm": 1.1456297613060256, + "learning_rate": 5.921689439435529e-07, + "loss": 1.6715, + "step": 573 + }, + { + "epoch": 0.0769780956638355, + "grad_norm": 1.1064438116765838, + "learning_rate": 5.921417200458166e-07, + "loss": 1.6324, + "step": 574 + }, + { + "epoch": 0.07711220384443451, + "grad_norm": 1.2537502156532783, + "learning_rate": 5.921144496083261e-07, + "loss": 1.6255, + "step": 575 + }, + { + "epoch": 0.07724631202503353, + "grad_norm": 1.1130457826739977, + "learning_rate": 5.920871326359228e-07, + "loss": 1.7305, + "step": 576 + }, + { + "epoch": 0.07738042020563254, + "grad_norm": 1.1106269047087995, + "learning_rate": 5.920597691334568e-07, + "loss": 1.7839, + "step": 577 + }, + { + "epoch": 0.07751452838623156, + "grad_norm": 1.1308110312275523, + "learning_rate": 5.920323591057858e-07, + "loss": 1.702, + "step": 578 + }, + { + "epoch": 0.07764863656683058, + "grad_norm": 1.1274236401107995, + "learning_rate": 5.920049025577762e-07, + "loss": 1.6345, + "step": 579 + }, + { + "epoch": 0.07778274474742959, + "grad_norm": 1.1274894849868589, + "learning_rate": 5.919773994943026e-07, + "loss": 1.6358, + "step": 580 + }, + { + "epoch": 0.07791685292802861, + "grad_norm": 1.203139388656472, + "learning_rate": 5.919498499202476e-07, + "loss": 1.7228, + "step": 581 + }, + { + "epoch": 0.07805096110862762, + "grad_norm": 1.1343472094184475, + "learning_rate": 5.919222538405025e-07, + "loss": 1.5995, + "step": 582 + }, + { + "epoch": 0.07818506928922664, + "grad_norm": 1.1211098856442396, + "learning_rate": 5.918946112599665e-07, + "loss": 1.7545, + "step": 583 + }, + { + "epoch": 0.07831917746982565, + "grad_norm": 1.3590410455725328, + "learning_rate": 5.918669221835472e-07, + "loss": 1.6658, + "step": 584 + }, + { + "epoch": 0.07845328565042467, + "grad_norm": 1.1368973789149184, + "learning_rate": 5.918391866161604e-07, + "loss": 1.6578, + "step": 585 + }, + { + "epoch": 0.07858739383102369, + "grad_norm": 1.144480010176944, + "learning_rate": 5.918114045627301e-07, + "loss": 1.687, + "step": 586 + }, + { + "epoch": 0.07872150201162272, + "grad_norm": 1.1079667555369228, + "learning_rate": 5.91783576028189e-07, + "loss": 1.6571, + "step": 587 + }, + { + "epoch": 0.07885561019222173, + "grad_norm": 1.1172832381186681, + "learning_rate": 5.917557010174771e-07, + "loss": 1.6347, + "step": 588 + }, + { + "epoch": 0.07898971837282075, + "grad_norm": 1.1477730537939723, + "learning_rate": 5.917277795355436e-07, + "loss": 1.696, + "step": 589 + }, + { + "epoch": 0.07912382655341976, + "grad_norm": 1.1124249695741149, + "learning_rate": 5.916998115873455e-07, + "loss": 1.7316, + "step": 590 + }, + { + "epoch": 0.07925793473401878, + "grad_norm": 1.2132332214863524, + "learning_rate": 5.916717971778482e-07, + "loss": 1.7529, + "step": 591 + }, + { + "epoch": 0.0793920429146178, + "grad_norm": 1.1308959961423235, + "learning_rate": 5.916437363120253e-07, + "loss": 1.6713, + "step": 592 + }, + { + "epoch": 0.07952615109521681, + "grad_norm": 1.1204029361778143, + "learning_rate": 5.916156289948584e-07, + "loss": 1.6751, + "step": 593 + }, + { + "epoch": 0.07966025927581583, + "grad_norm": 1.1836584994154395, + "learning_rate": 5.91587475231338e-07, + "loss": 1.7145, + "step": 594 + }, + { + "epoch": 0.07979436745641484, + "grad_norm": 1.0952029272098618, + "learning_rate": 5.91559275026462e-07, + "loss": 1.6849, + "step": 595 + }, + { + "epoch": 0.07992847563701386, + "grad_norm": 1.2564246490346886, + "learning_rate": 5.915310283852372e-07, + "loss": 1.6352, + "step": 596 + }, + { + "epoch": 0.08006258381761287, + "grad_norm": 1.1465710959467506, + "learning_rate": 5.915027353126783e-07, + "loss": 1.6647, + "step": 597 + }, + { + "epoch": 0.08019669199821189, + "grad_norm": 1.1382835508015974, + "learning_rate": 5.914743958138086e-07, + "loss": 1.7106, + "step": 598 + }, + { + "epoch": 0.0803308001788109, + "grad_norm": 1.1192071556571492, + "learning_rate": 5.91446009893659e-07, + "loss": 1.706, + "step": 599 + }, + { + "epoch": 0.08046490835940992, + "grad_norm": 1.1629696564337242, + "learning_rate": 5.914175775572693e-07, + "loss": 1.676, + "step": 600 + }, + { + "epoch": 0.08059901654000894, + "grad_norm": 1.1336751221713581, + "learning_rate": 5.913890988096872e-07, + "loss": 1.7061, + "step": 601 + }, + { + "epoch": 0.08073312472060795, + "grad_norm": 1.063751409329425, + "learning_rate": 5.913605736559689e-07, + "loss": 1.6276, + "step": 602 + }, + { + "epoch": 0.08086723290120697, + "grad_norm": 1.7847493987152905, + "learning_rate": 5.913320021011784e-07, + "loss": 1.7643, + "step": 603 + }, + { + "epoch": 0.081001341081806, + "grad_norm": 1.1752588010758491, + "learning_rate": 5.913033841503882e-07, + "loss": 1.7136, + "step": 604 + }, + { + "epoch": 0.08113544926240501, + "grad_norm": 1.092151629247411, + "learning_rate": 5.912747198086793e-07, + "loss": 1.6921, + "step": 605 + }, + { + "epoch": 0.08126955744300403, + "grad_norm": 1.1813450877374088, + "learning_rate": 5.912460090811404e-07, + "loss": 1.5961, + "step": 606 + }, + { + "epoch": 0.08140366562360304, + "grad_norm": 1.1386503634209713, + "learning_rate": 5.912172519728691e-07, + "loss": 1.6936, + "step": 607 + }, + { + "epoch": 0.08153777380420206, + "grad_norm": 1.1478659529471829, + "learning_rate": 5.911884484889702e-07, + "loss": 1.7133, + "step": 608 + }, + { + "epoch": 0.08167188198480108, + "grad_norm": 1.2776303627444894, + "learning_rate": 5.911595986345579e-07, + "loss": 1.686, + "step": 609 + }, + { + "epoch": 0.08180599016540009, + "grad_norm": 1.0774582052806807, + "learning_rate": 5.91130702414754e-07, + "loss": 1.8028, + "step": 610 + }, + { + "epoch": 0.08194009834599911, + "grad_norm": 1.0810859242279176, + "learning_rate": 5.911017598346885e-07, + "loss": 1.6044, + "step": 611 + }, + { + "epoch": 0.08207420652659812, + "grad_norm": 1.1594727731031893, + "learning_rate": 5.910727708994998e-07, + "loss": 1.7686, + "step": 612 + }, + { + "epoch": 0.08220831470719714, + "grad_norm": 1.1321005040254193, + "learning_rate": 5.910437356143345e-07, + "loss": 1.6522, + "step": 613 + }, + { + "epoch": 0.08234242288779615, + "grad_norm": 1.0653919163589205, + "learning_rate": 5.910146539843476e-07, + "loss": 1.7465, + "step": 614 + }, + { + "epoch": 0.08247653106839517, + "grad_norm": 1.1128916496114905, + "learning_rate": 5.90985526014702e-07, + "loss": 1.6125, + "step": 615 + }, + { + "epoch": 0.08261063924899419, + "grad_norm": 1.4081204838899852, + "learning_rate": 5.90956351710569e-07, + "loss": 1.7639, + "step": 616 + }, + { + "epoch": 0.0827447474295932, + "grad_norm": 1.1683592035720405, + "learning_rate": 5.909271310771279e-07, + "loss": 1.637, + "step": 617 + }, + { + "epoch": 0.08287885561019222, + "grad_norm": 1.115793940661641, + "learning_rate": 5.90897864119567e-07, + "loss": 1.6118, + "step": 618 + }, + { + "epoch": 0.08301296379079123, + "grad_norm": 1.0879479857779484, + "learning_rate": 5.908685508430816e-07, + "loss": 1.6846, + "step": 619 + }, + { + "epoch": 0.08314707197139025, + "grad_norm": 1.1428114800136786, + "learning_rate": 5.908391912528764e-07, + "loss": 1.6949, + "step": 620 + }, + { + "epoch": 0.08328118015198926, + "grad_norm": 1.11661524840305, + "learning_rate": 5.908097853541634e-07, + "loss": 1.754, + "step": 621 + }, + { + "epoch": 0.0834152883325883, + "grad_norm": 1.0762293742420466, + "learning_rate": 5.907803331521635e-07, + "loss": 1.7609, + "step": 622 + }, + { + "epoch": 0.08354939651318731, + "grad_norm": 1.0719203407555025, + "learning_rate": 5.907508346521054e-07, + "loss": 1.6981, + "step": 623 + }, + { + "epoch": 0.08368350469378633, + "grad_norm": 1.1553772926251566, + "learning_rate": 5.907212898592263e-07, + "loss": 1.7024, + "step": 624 + }, + { + "epoch": 0.08381761287438534, + "grad_norm": 1.1270260996688657, + "learning_rate": 5.906916987787713e-07, + "loss": 1.6906, + "step": 625 + }, + { + "epoch": 0.08395172105498436, + "grad_norm": 1.1229658996843206, + "learning_rate": 5.90662061415994e-07, + "loss": 1.694, + "step": 626 + }, + { + "epoch": 0.08408582923558337, + "grad_norm": 1.1277068299424584, + "learning_rate": 5.906323777761561e-07, + "loss": 1.5693, + "step": 627 + }, + { + "epoch": 0.08421993741618239, + "grad_norm": 1.1180105581479995, + "learning_rate": 5.906026478645276e-07, + "loss": 1.7247, + "step": 628 + }, + { + "epoch": 0.0843540455967814, + "grad_norm": 1.2224062872746266, + "learning_rate": 5.905728716863865e-07, + "loss": 1.6829, + "step": 629 + }, + { + "epoch": 0.08448815377738042, + "grad_norm": 1.1085889629398797, + "learning_rate": 5.905430492470195e-07, + "loss": 1.7271, + "step": 630 + }, + { + "epoch": 0.08462226195797944, + "grad_norm": 1.1451977446739299, + "learning_rate": 5.905131805517207e-07, + "loss": 1.5877, + "step": 631 + }, + { + "epoch": 0.08475637013857845, + "grad_norm": 1.1422915014499277, + "learning_rate": 5.904832656057932e-07, + "loss": 1.6977, + "step": 632 + }, + { + "epoch": 0.08489047831917747, + "grad_norm": 1.131510544315339, + "learning_rate": 5.904533044145479e-07, + "loss": 1.5513, + "step": 633 + }, + { + "epoch": 0.08502458649977648, + "grad_norm": 1.2432140035573447, + "learning_rate": 5.904232969833039e-07, + "loss": 1.6835, + "step": 634 + }, + { + "epoch": 0.0851586946803755, + "grad_norm": 1.0744643011300827, + "learning_rate": 5.90393243317389e-07, + "loss": 1.6052, + "step": 635 + }, + { + "epoch": 0.08529280286097451, + "grad_norm": 1.3098823736310086, + "learning_rate": 5.903631434221384e-07, + "loss": 1.7622, + "step": 636 + }, + { + "epoch": 0.08542691104157353, + "grad_norm": 1.1182788647555526, + "learning_rate": 5.903329973028961e-07, + "loss": 1.7497, + "step": 637 + }, + { + "epoch": 0.08556101922217255, + "grad_norm": 1.305543631329334, + "learning_rate": 5.903028049650141e-07, + "loss": 1.6732, + "step": 638 + }, + { + "epoch": 0.08569512740277158, + "grad_norm": 1.1108546390310376, + "learning_rate": 5.902725664138528e-07, + "loss": 1.7271, + "step": 639 + }, + { + "epoch": 0.08582923558337059, + "grad_norm": 1.0769425748182762, + "learning_rate": 5.902422816547804e-07, + "loss": 1.666, + "step": 640 + }, + { + "epoch": 0.08596334376396961, + "grad_norm": 1.0710915573180522, + "learning_rate": 5.902119506931739e-07, + "loss": 1.7208, + "step": 641 + }, + { + "epoch": 0.08609745194456862, + "grad_norm": 1.1265338939849623, + "learning_rate": 5.901815735344178e-07, + "loss": 1.713, + "step": 642 + }, + { + "epoch": 0.08623156012516764, + "grad_norm": 1.1032977967977797, + "learning_rate": 5.901511501839053e-07, + "loss": 1.655, + "step": 643 + }, + { + "epoch": 0.08636566830576665, + "grad_norm": 1.067089553405501, + "learning_rate": 5.901206806470377e-07, + "loss": 1.6794, + "step": 644 + }, + { + "epoch": 0.08649977648636567, + "grad_norm": 1.1924702814140196, + "learning_rate": 5.900901649292243e-07, + "loss": 1.6186, + "step": 645 + }, + { + "epoch": 0.08663388466696469, + "grad_norm": 1.1000064746041005, + "learning_rate": 5.900596030358831e-07, + "loss": 1.7316, + "step": 646 + }, + { + "epoch": 0.0867679928475637, + "grad_norm": 1.16787242186727, + "learning_rate": 5.900289949724397e-07, + "loss": 1.6475, + "step": 647 + }, + { + "epoch": 0.08690210102816272, + "grad_norm": 1.153036807295657, + "learning_rate": 5.899983407443281e-07, + "loss": 1.604, + "step": 648 + }, + { + "epoch": 0.08703620920876173, + "grad_norm": 1.1418227950695776, + "learning_rate": 5.899676403569906e-07, + "loss": 1.7925, + "step": 649 + }, + { + "epoch": 0.08717031738936075, + "grad_norm": 1.1018946533270777, + "learning_rate": 5.899368938158777e-07, + "loss": 1.5998, + "step": 650 + }, + { + "epoch": 0.08730442556995976, + "grad_norm": 1.0898779658636957, + "learning_rate": 5.899061011264481e-07, + "loss": 1.6772, + "step": 651 + }, + { + "epoch": 0.08743853375055878, + "grad_norm": 1.1828085767178107, + "learning_rate": 5.898752622941684e-07, + "loss": 1.6564, + "step": 652 + }, + { + "epoch": 0.0875726419311578, + "grad_norm": 1.123777742875525, + "learning_rate": 5.89844377324514e-07, + "loss": 1.7173, + "step": 653 + }, + { + "epoch": 0.08770675011175681, + "grad_norm": 1.1137884706219183, + "learning_rate": 5.898134462229677e-07, + "loss": 1.705, + "step": 654 + }, + { + "epoch": 0.08784085829235583, + "grad_norm": 1.0736901627301867, + "learning_rate": 5.89782468995021e-07, + "loss": 1.6673, + "step": 655 + }, + { + "epoch": 0.08797496647295484, + "grad_norm": 1.1006296755478988, + "learning_rate": 5.897514456461737e-07, + "loss": 1.662, + "step": 656 + }, + { + "epoch": 0.08810907465355387, + "grad_norm": 1.0993086803454002, + "learning_rate": 5.897203761819334e-07, + "loss": 1.7671, + "step": 657 + }, + { + "epoch": 0.08824318283415289, + "grad_norm": 1.1555576950225783, + "learning_rate": 5.896892606078163e-07, + "loss": 1.6558, + "step": 658 + }, + { + "epoch": 0.0883772910147519, + "grad_norm": 1.1044269950107921, + "learning_rate": 5.896580989293461e-07, + "loss": 1.6538, + "step": 659 + }, + { + "epoch": 0.08851139919535092, + "grad_norm": 1.1293808136662087, + "learning_rate": 5.896268911520556e-07, + "loss": 1.6734, + "step": 660 + }, + { + "epoch": 0.08864550737594994, + "grad_norm": 1.0799327058316142, + "learning_rate": 5.895956372814851e-07, + "loss": 1.7258, + "step": 661 + }, + { + "epoch": 0.08877961555654895, + "grad_norm": 1.2412270489033748, + "learning_rate": 5.895643373231834e-07, + "loss": 1.7033, + "step": 662 + }, + { + "epoch": 0.08891372373714797, + "grad_norm": 1.2660732052099137, + "learning_rate": 5.895329912827074e-07, + "loss": 1.6607, + "step": 663 + }, + { + "epoch": 0.08904783191774698, + "grad_norm": 1.0851423150565935, + "learning_rate": 5.895015991656218e-07, + "loss": 1.7365, + "step": 664 + }, + { + "epoch": 0.089181940098346, + "grad_norm": 1.0926935688632777, + "learning_rate": 5.894701609775004e-07, + "loss": 1.723, + "step": 665 + }, + { + "epoch": 0.08931604827894501, + "grad_norm": 1.1335362217269433, + "learning_rate": 5.894386767239243e-07, + "loss": 1.7482, + "step": 666 + }, + { + "epoch": 0.08945015645954403, + "grad_norm": 1.0690769483519065, + "learning_rate": 5.894071464104832e-07, + "loss": 1.7083, + "step": 667 + }, + { + "epoch": 0.08958426464014305, + "grad_norm": 1.144239086274215, + "learning_rate": 5.893755700427749e-07, + "loss": 1.6672, + "step": 668 + }, + { + "epoch": 0.08971837282074206, + "grad_norm": 1.154969050751237, + "learning_rate": 5.893439476264053e-07, + "loss": 1.5992, + "step": 669 + }, + { + "epoch": 0.08985248100134108, + "grad_norm": 1.1692487930022055, + "learning_rate": 5.893122791669886e-07, + "loss": 1.6895, + "step": 670 + }, + { + "epoch": 0.0899865891819401, + "grad_norm": 1.1445503009803197, + "learning_rate": 5.892805646701471e-07, + "loss": 1.6176, + "step": 671 + }, + { + "epoch": 0.09012069736253911, + "grad_norm": 1.0860602124973238, + "learning_rate": 5.892488041415113e-07, + "loss": 1.7431, + "step": 672 + }, + { + "epoch": 0.09025480554313813, + "grad_norm": 1.1840804859528216, + "learning_rate": 5.892169975867196e-07, + "loss": 1.5377, + "step": 673 + }, + { + "epoch": 0.09038891372373715, + "grad_norm": 1.0925936180668785, + "learning_rate": 5.891851450114193e-07, + "loss": 1.693, + "step": 674 + }, + { + "epoch": 0.09052302190433617, + "grad_norm": 1.1412736395289622, + "learning_rate": 5.891532464212651e-07, + "loss": 1.6782, + "step": 675 + }, + { + "epoch": 0.09065713008493519, + "grad_norm": 1.1014154222006858, + "learning_rate": 5.891213018219203e-07, + "loss": 1.6661, + "step": 676 + }, + { + "epoch": 0.0907912382655342, + "grad_norm": 1.1028682933773437, + "learning_rate": 5.89089311219056e-07, + "loss": 1.6283, + "step": 677 + }, + { + "epoch": 0.09092534644613322, + "grad_norm": 1.0999221111301187, + "learning_rate": 5.89057274618352e-07, + "loss": 1.6288, + "step": 678 + }, + { + "epoch": 0.09105945462673223, + "grad_norm": 1.0929215008817739, + "learning_rate": 5.890251920254958e-07, + "loss": 1.6966, + "step": 679 + }, + { + "epoch": 0.09119356280733125, + "grad_norm": 1.0995793357287673, + "learning_rate": 5.889930634461832e-07, + "loss": 1.7086, + "step": 680 + }, + { + "epoch": 0.09132767098793027, + "grad_norm": 1.0809381415190136, + "learning_rate": 5.889608888861182e-07, + "loss": 1.6829, + "step": 681 + }, + { + "epoch": 0.09146177916852928, + "grad_norm": 1.0548227913499995, + "learning_rate": 5.889286683510132e-07, + "loss": 1.6826, + "step": 682 + }, + { + "epoch": 0.0915958873491283, + "grad_norm": 1.1106859513783915, + "learning_rate": 5.888964018465883e-07, + "loss": 1.6544, + "step": 683 + }, + { + "epoch": 0.09172999552972731, + "grad_norm": 1.0878369148062472, + "learning_rate": 5.88864089378572e-07, + "loss": 1.6342, + "step": 684 + }, + { + "epoch": 0.09186410371032633, + "grad_norm": 1.128955444803477, + "learning_rate": 5.888317309527009e-07, + "loss": 1.6121, + "step": 685 + }, + { + "epoch": 0.09199821189092534, + "grad_norm": 1.246867762194091, + "learning_rate": 5.887993265747201e-07, + "loss": 1.6819, + "step": 686 + }, + { + "epoch": 0.09213232007152436, + "grad_norm": 1.1533855664708184, + "learning_rate": 5.887668762503822e-07, + "loss": 1.7429, + "step": 687 + }, + { + "epoch": 0.09226642825212338, + "grad_norm": 1.0405450268075809, + "learning_rate": 5.887343799854485e-07, + "loss": 1.6759, + "step": 688 + }, + { + "epoch": 0.09240053643272239, + "grad_norm": 1.1507085139636744, + "learning_rate": 5.887018377856884e-07, + "loss": 1.8036, + "step": 689 + }, + { + "epoch": 0.0925346446133214, + "grad_norm": 6.743658343986094, + "learning_rate": 5.886692496568789e-07, + "loss": 1.6027, + "step": 690 + }, + { + "epoch": 0.09266875279392042, + "grad_norm": 1.0641784107760024, + "learning_rate": 5.886366156048061e-07, + "loss": 1.6558, + "step": 691 + }, + { + "epoch": 0.09280286097451945, + "grad_norm": 1.0922990524942957, + "learning_rate": 5.886039356352634e-07, + "loss": 1.7383, + "step": 692 + }, + { + "epoch": 0.09293696915511847, + "grad_norm": 1.1742618579401762, + "learning_rate": 5.885712097540529e-07, + "loss": 1.5927, + "step": 693 + }, + { + "epoch": 0.09307107733571748, + "grad_norm": 1.1075189838987614, + "learning_rate": 5.885384379669844e-07, + "loss": 1.7738, + "step": 694 + }, + { + "epoch": 0.0932051855163165, + "grad_norm": 2.1929813163212093, + "learning_rate": 5.885056202798763e-07, + "loss": 1.7975, + "step": 695 + }, + { + "epoch": 0.09333929369691552, + "grad_norm": 1.0998963175774283, + "learning_rate": 5.88472756698555e-07, + "loss": 1.6156, + "step": 696 + }, + { + "epoch": 0.09347340187751453, + "grad_norm": 1.0824346616111722, + "learning_rate": 5.884398472288546e-07, + "loss": 1.7226, + "step": 697 + }, + { + "epoch": 0.09360751005811355, + "grad_norm": 1.048887980139358, + "learning_rate": 5.884068918766182e-07, + "loss": 1.7065, + "step": 698 + }, + { + "epoch": 0.09374161823871256, + "grad_norm": 1.0293430293240384, + "learning_rate": 5.883738906476963e-07, + "loss": 1.6596, + "step": 699 + }, + { + "epoch": 0.09387572641931158, + "grad_norm": 1.0943419458638883, + "learning_rate": 5.88340843547948e-07, + "loss": 1.7356, + "step": 700 + }, + { + "epoch": 0.0940098345999106, + "grad_norm": 1.0980484739258698, + "learning_rate": 5.883077505832403e-07, + "loss": 1.6039, + "step": 701 + }, + { + "epoch": 0.09414394278050961, + "grad_norm": 1.1455036041824893, + "learning_rate": 5.882746117594482e-07, + "loss": 1.6255, + "step": 702 + }, + { + "epoch": 0.09427805096110863, + "grad_norm": 1.4001837690870673, + "learning_rate": 5.882414270824554e-07, + "loss": 1.6008, + "step": 703 + }, + { + "epoch": 0.09441215914170764, + "grad_norm": 1.1130500383248842, + "learning_rate": 5.882081965581533e-07, + "loss": 1.7358, + "step": 704 + }, + { + "epoch": 0.09454626732230666, + "grad_norm": 1.070694937502845, + "learning_rate": 5.881749201924413e-07, + "loss": 1.6635, + "step": 705 + }, + { + "epoch": 0.09468037550290567, + "grad_norm": 1.1144333495898877, + "learning_rate": 5.881415979912274e-07, + "loss": 1.7066, + "step": 706 + }, + { + "epoch": 0.09481448368350469, + "grad_norm": 1.1422205384748831, + "learning_rate": 5.881082299604276e-07, + "loss": 1.6546, + "step": 707 + }, + { + "epoch": 0.0949485918641037, + "grad_norm": 1.0853098558287595, + "learning_rate": 5.880748161059657e-07, + "loss": 1.6753, + "step": 708 + }, + { + "epoch": 0.09508270004470273, + "grad_norm": 1.198904753001485, + "learning_rate": 5.88041356433774e-07, + "loss": 1.7569, + "step": 709 + }, + { + "epoch": 0.09521680822530175, + "grad_norm": 1.1071829227283936, + "learning_rate": 5.880078509497928e-07, + "loss": 1.6232, + "step": 710 + }, + { + "epoch": 0.09535091640590077, + "grad_norm": 1.0695300790601336, + "learning_rate": 5.879742996599706e-07, + "loss": 1.6413, + "step": 711 + }, + { + "epoch": 0.09548502458649978, + "grad_norm": 3.3268091455655355, + "learning_rate": 5.879407025702638e-07, + "loss": 1.593, + "step": 712 + }, + { + "epoch": 0.0956191327670988, + "grad_norm": 1.0722393433959394, + "learning_rate": 5.879070596866374e-07, + "loss": 1.7546, + "step": 713 + }, + { + "epoch": 0.09575324094769781, + "grad_norm": 1.153579196694916, + "learning_rate": 5.87873371015064e-07, + "loss": 1.657, + "step": 714 + }, + { + "epoch": 0.09588734912829683, + "grad_norm": 1.1213730882230093, + "learning_rate": 5.878396365615248e-07, + "loss": 1.6892, + "step": 715 + }, + { + "epoch": 0.09602145730889584, + "grad_norm": 1.1795757056582914, + "learning_rate": 5.878058563320086e-07, + "loss": 1.6945, + "step": 716 + }, + { + "epoch": 0.09615556548949486, + "grad_norm": 1.075176593983707, + "learning_rate": 5.87772030332513e-07, + "loss": 1.7196, + "step": 717 + }, + { + "epoch": 0.09628967367009388, + "grad_norm": 1.0441316150069637, + "learning_rate": 5.877381585690431e-07, + "loss": 1.6256, + "step": 718 + }, + { + "epoch": 0.09642378185069289, + "grad_norm": 1.1023538045059467, + "learning_rate": 5.877042410476124e-07, + "loss": 1.6537, + "step": 719 + }, + { + "epoch": 0.09655789003129191, + "grad_norm": 1.154659783031204, + "learning_rate": 5.876702777742425e-07, + "loss": 1.75, + "step": 720 + }, + { + "epoch": 0.09669199821189092, + "grad_norm": 1.1756635069685608, + "learning_rate": 5.876362687549632e-07, + "loss": 1.6535, + "step": 721 + }, + { + "epoch": 0.09682610639248994, + "grad_norm": 1.1127957017636008, + "learning_rate": 5.876022139958122e-07, + "loss": 1.6513, + "step": 722 + }, + { + "epoch": 0.09696021457308895, + "grad_norm": 1.1770680572803744, + "learning_rate": 5.875681135028358e-07, + "loss": 1.6897, + "step": 723 + }, + { + "epoch": 0.09709432275368797, + "grad_norm": 1.054488251672258, + "learning_rate": 5.875339672820877e-07, + "loss": 1.7035, + "step": 724 + }, + { + "epoch": 0.09722843093428699, + "grad_norm": 1.1537946876962146, + "learning_rate": 5.874997753396303e-07, + "loss": 1.6564, + "step": 725 + }, + { + "epoch": 0.097362539114886, + "grad_norm": 1.2650547539228134, + "learning_rate": 5.874655376815338e-07, + "loss": 1.7448, + "step": 726 + }, + { + "epoch": 0.09749664729548503, + "grad_norm": 1.0865445919691652, + "learning_rate": 5.874312543138768e-07, + "loss": 1.7492, + "step": 727 + }, + { + "epoch": 0.09763075547608405, + "grad_norm": 1.0635064685924933, + "learning_rate": 5.873969252427457e-07, + "loss": 1.569, + "step": 728 + }, + { + "epoch": 0.09776486365668306, + "grad_norm": 1.1242141873259432, + "learning_rate": 5.873625504742354e-07, + "loss": 1.6972, + "step": 729 + }, + { + "epoch": 0.09789897183728208, + "grad_norm": 1.374622796897752, + "learning_rate": 5.873281300144483e-07, + "loss": 1.66, + "step": 730 + }, + { + "epoch": 0.0980330800178811, + "grad_norm": 1.0742640980921085, + "learning_rate": 5.872936638694958e-07, + "loss": 1.6395, + "step": 731 + }, + { + "epoch": 0.09816718819848011, + "grad_norm": 1.1834566808846507, + "learning_rate": 5.872591520454964e-07, + "loss": 1.6467, + "step": 732 + }, + { + "epoch": 0.09830129637907913, + "grad_norm": 1.1393523410825188, + "learning_rate": 5.872245945485774e-07, + "loss": 1.6715, + "step": 733 + }, + { + "epoch": 0.09843540455967814, + "grad_norm": 1.133914370439065, + "learning_rate": 5.871899913848743e-07, + "loss": 1.6661, + "step": 734 + }, + { + "epoch": 0.09856951274027716, + "grad_norm": 1.1318819144753365, + "learning_rate": 5.871553425605299e-07, + "loss": 1.7463, + "step": 735 + }, + { + "epoch": 0.09870362092087617, + "grad_norm": 1.119126620886235, + "learning_rate": 5.871206480816961e-07, + "loss": 1.681, + "step": 736 + }, + { + "epoch": 0.09883772910147519, + "grad_norm": 1.074480380396243, + "learning_rate": 5.870859079545321e-07, + "loss": 1.6163, + "step": 737 + }, + { + "epoch": 0.0989718372820742, + "grad_norm": 1.1208330921778833, + "learning_rate": 5.870511221852059e-07, + "loss": 1.619, + "step": 738 + }, + { + "epoch": 0.09910594546267322, + "grad_norm": 1.1594847796734538, + "learning_rate": 5.870162907798928e-07, + "loss": 1.6592, + "step": 739 + }, + { + "epoch": 0.09924005364327224, + "grad_norm": 1.058931279874539, + "learning_rate": 5.869814137447771e-07, + "loss": 1.6851, + "step": 740 + }, + { + "epoch": 0.09937416182387125, + "grad_norm": 1.1378546192527486, + "learning_rate": 5.869464910860505e-07, + "loss": 1.7918, + "step": 741 + }, + { + "epoch": 0.09950827000447027, + "grad_norm": 1.1325033016555488, + "learning_rate": 5.869115228099131e-07, + "loss": 1.6834, + "step": 742 + }, + { + "epoch": 0.09964237818506928, + "grad_norm": 1.3421525418201607, + "learning_rate": 5.86876508922573e-07, + "loss": 1.6549, + "step": 743 + }, + { + "epoch": 0.09977648636566831, + "grad_norm": 1.1427938179025248, + "learning_rate": 5.868414494302465e-07, + "loss": 1.6589, + "step": 744 + }, + { + "epoch": 0.09991059454626733, + "grad_norm": 1.1974168236579015, + "learning_rate": 5.86806344339158e-07, + "loss": 1.6378, + "step": 745 + }, + { + "epoch": 0.10004470272686634, + "grad_norm": 1.182005807170805, + "learning_rate": 5.867711936555398e-07, + "loss": 1.6299, + "step": 746 + }, + { + "epoch": 0.10017881090746536, + "grad_norm": 1.1347901749058797, + "learning_rate": 5.867359973856326e-07, + "loss": 1.6285, + "step": 747 + }, + { + "epoch": 0.10031291908806438, + "grad_norm": 1.0865847111724278, + "learning_rate": 5.867007555356848e-07, + "loss": 1.5712, + "step": 748 + }, + { + "epoch": 0.10044702726866339, + "grad_norm": 1.0792499138775284, + "learning_rate": 5.866654681119534e-07, + "loss": 1.6768, + "step": 749 + }, + { + "epoch": 0.10058113544926241, + "grad_norm": 1.1459851366680363, + "learning_rate": 5.866301351207031e-07, + "loss": 1.6162, + "step": 750 + }, + { + "epoch": 0.10071524362986142, + "grad_norm": 1.0878281762208375, + "learning_rate": 5.865947565682066e-07, + "loss": 1.6656, + "step": 751 + }, + { + "epoch": 0.10084935181046044, + "grad_norm": 1.0847043417176385, + "learning_rate": 5.865593324607452e-07, + "loss": 1.6349, + "step": 752 + }, + { + "epoch": 0.10098345999105945, + "grad_norm": 1.07175506702241, + "learning_rate": 5.865238628046077e-07, + "loss": 1.646, + "step": 753 + }, + { + "epoch": 0.10111756817165847, + "grad_norm": 1.1573886829728748, + "learning_rate": 5.864883476060915e-07, + "loss": 1.6585, + "step": 754 + }, + { + "epoch": 0.10125167635225749, + "grad_norm": 1.0662183481503906, + "learning_rate": 5.864527868715017e-07, + "loss": 1.6685, + "step": 755 + }, + { + "epoch": 0.1013857845328565, + "grad_norm": 1.1141344678729455, + "learning_rate": 5.864171806071517e-07, + "loss": 1.7169, + "step": 756 + }, + { + "epoch": 0.10151989271345552, + "grad_norm": 1.100766756813705, + "learning_rate": 5.863815288193628e-07, + "loss": 1.6247, + "step": 757 + }, + { + "epoch": 0.10165400089405453, + "grad_norm": 1.0952255674456979, + "learning_rate": 5.863458315144646e-07, + "loss": 1.6211, + "step": 758 + }, + { + "epoch": 0.10178810907465355, + "grad_norm": 1.1257453114351714, + "learning_rate": 5.863100886987948e-07, + "loss": 1.7725, + "step": 759 + }, + { + "epoch": 0.10192221725525256, + "grad_norm": 1.1540265958163123, + "learning_rate": 5.862743003786989e-07, + "loss": 1.7236, + "step": 760 + }, + { + "epoch": 0.10205632543585158, + "grad_norm": 1.1525383018656805, + "learning_rate": 5.862384665605306e-07, + "loss": 1.6291, + "step": 761 + }, + { + "epoch": 0.10219043361645061, + "grad_norm": 1.0998304145799205, + "learning_rate": 5.862025872506518e-07, + "loss": 1.6707, + "step": 762 + }, + { + "epoch": 0.10232454179704963, + "grad_norm": 1.1328389993712693, + "learning_rate": 5.861666624554323e-07, + "loss": 1.7046, + "step": 763 + }, + { + "epoch": 0.10245864997764864, + "grad_norm": 1.1261717885021774, + "learning_rate": 5.861306921812503e-07, + "loss": 1.7154, + "step": 764 + }, + { + "epoch": 0.10259275815824766, + "grad_norm": 1.1225339366672114, + "learning_rate": 5.860946764344915e-07, + "loss": 1.6906, + "step": 765 + }, + { + "epoch": 0.10272686633884667, + "grad_norm": 1.0705179266385985, + "learning_rate": 5.860586152215504e-07, + "loss": 1.6246, + "step": 766 + }, + { + "epoch": 0.10286097451944569, + "grad_norm": 1.1541152561285446, + "learning_rate": 5.860225085488287e-07, + "loss": 1.7682, + "step": 767 + }, + { + "epoch": 0.1029950827000447, + "grad_norm": 1.0637815973415343, + "learning_rate": 5.859863564227371e-07, + "loss": 1.5644, + "step": 768 + }, + { + "epoch": 0.10312919088064372, + "grad_norm": 1.4548832416501927, + "learning_rate": 5.859501588496937e-07, + "loss": 1.6585, + "step": 769 + }, + { + "epoch": 0.10326329906124274, + "grad_norm": 1.1159025503039528, + "learning_rate": 5.859139158361249e-07, + "loss": 1.7046, + "step": 770 + }, + { + "epoch": 0.10339740724184175, + "grad_norm": 1.1310495005094254, + "learning_rate": 5.858776273884653e-07, + "loss": 1.6818, + "step": 771 + }, + { + "epoch": 0.10353151542244077, + "grad_norm": 1.0517973047871627, + "learning_rate": 5.858412935131574e-07, + "loss": 1.6145, + "step": 772 + }, + { + "epoch": 0.10366562360303978, + "grad_norm": 1.080650360146408, + "learning_rate": 5.858049142166517e-07, + "loss": 1.6628, + "step": 773 + }, + { + "epoch": 0.1037997317836388, + "grad_norm": 1.1586931721545415, + "learning_rate": 5.857684895054069e-07, + "loss": 1.6491, + "step": 774 + }, + { + "epoch": 0.10393383996423781, + "grad_norm": 1.1442490123077105, + "learning_rate": 5.857320193858896e-07, + "loss": 1.701, + "step": 775 + }, + { + "epoch": 0.10406794814483683, + "grad_norm": 1.1690889705843661, + "learning_rate": 5.856955038645748e-07, + "loss": 1.6635, + "step": 776 + }, + { + "epoch": 0.10420205632543585, + "grad_norm": 1.0789106990522987, + "learning_rate": 5.856589429479454e-07, + "loss": 1.7244, + "step": 777 + }, + { + "epoch": 0.10433616450603486, + "grad_norm": 1.1621702061459454, + "learning_rate": 5.856223366424918e-07, + "loss": 1.6577, + "step": 778 + }, + { + "epoch": 0.10447027268663389, + "grad_norm": 1.234518365304015, + "learning_rate": 5.855856849547135e-07, + "loss": 1.628, + "step": 779 + }, + { + "epoch": 0.10460438086723291, + "grad_norm": 1.0985603622430586, + "learning_rate": 5.855489878911173e-07, + "loss": 1.5708, + "step": 780 + }, + { + "epoch": 0.10473848904783192, + "grad_norm": 1.2290143697832727, + "learning_rate": 5.855122454582182e-07, + "loss": 1.6148, + "step": 781 + }, + { + "epoch": 0.10487259722843094, + "grad_norm": 1.0968718099792736, + "learning_rate": 5.854754576625395e-07, + "loss": 1.6741, + "step": 782 + }, + { + "epoch": 0.10500670540902995, + "grad_norm": 1.1287867540808152, + "learning_rate": 5.854386245106123e-07, + "loss": 1.6414, + "step": 783 + }, + { + "epoch": 0.10514081358962897, + "grad_norm": 1.23300063689037, + "learning_rate": 5.854017460089758e-07, + "loss": 1.6692, + "step": 784 + }, + { + "epoch": 0.10527492177022799, + "grad_norm": 1.057896247934459, + "learning_rate": 5.853648221641774e-07, + "loss": 1.5768, + "step": 785 + }, + { + "epoch": 0.105409029950827, + "grad_norm": 1.1246918122007368, + "learning_rate": 5.853278529827722e-07, + "loss": 1.7188, + "step": 786 + }, + { + "epoch": 0.10554313813142602, + "grad_norm": 1.1394479386508116, + "learning_rate": 5.852908384713238e-07, + "loss": 1.6904, + "step": 787 + }, + { + "epoch": 0.10567724631202503, + "grad_norm": 1.111982268532425, + "learning_rate": 5.852537786364036e-07, + "loss": 1.6384, + "step": 788 + }, + { + "epoch": 0.10581135449262405, + "grad_norm": 1.1240815270464448, + "learning_rate": 5.85216673484591e-07, + "loss": 1.7382, + "step": 789 + }, + { + "epoch": 0.10594546267322306, + "grad_norm": 1.103447231107936, + "learning_rate": 5.851795230224736e-07, + "loss": 1.7285, + "step": 790 + }, + { + "epoch": 0.10607957085382208, + "grad_norm": 1.124305841718373, + "learning_rate": 5.851423272566469e-07, + "loss": 1.5874, + "step": 791 + }, + { + "epoch": 0.1062136790344211, + "grad_norm": 1.1424352731892036, + "learning_rate": 5.851050861937145e-07, + "loss": 1.7097, + "step": 792 + }, + { + "epoch": 0.10634778721502011, + "grad_norm": 1.1724771511120693, + "learning_rate": 5.850677998402881e-07, + "loss": 1.6847, + "step": 793 + }, + { + "epoch": 0.10648189539561913, + "grad_norm": 1.1246235851433404, + "learning_rate": 5.850304682029874e-07, + "loss": 1.6735, + "step": 794 + }, + { + "epoch": 0.10661600357621814, + "grad_norm": 1.1044843136711693, + "learning_rate": 5.849930912884402e-07, + "loss": 1.6758, + "step": 795 + }, + { + "epoch": 0.10675011175681716, + "grad_norm": 1.086861760986685, + "learning_rate": 5.849556691032821e-07, + "loss": 1.6564, + "step": 796 + }, + { + "epoch": 0.10688421993741619, + "grad_norm": 1.1156492790718477, + "learning_rate": 5.84918201654157e-07, + "loss": 1.7699, + "step": 797 + }, + { + "epoch": 0.1070183281180152, + "grad_norm": 1.105919104931648, + "learning_rate": 5.848806889477168e-07, + "loss": 1.6673, + "step": 798 + }, + { + "epoch": 0.10715243629861422, + "grad_norm": 1.1197711837565212, + "learning_rate": 5.848431309906213e-07, + "loss": 1.6681, + "step": 799 + }, + { + "epoch": 0.10728654447921324, + "grad_norm": 1.0624511416416331, + "learning_rate": 5.848055277895385e-07, + "loss": 1.6102, + "step": 800 + }, + { + "epoch": 0.10742065265981225, + "grad_norm": 1.2004229748929618, + "learning_rate": 5.847678793511441e-07, + "loss": 1.5863, + "step": 801 + }, + { + "epoch": 0.10755476084041127, + "grad_norm": 1.0858125624618846, + "learning_rate": 5.847301856821225e-07, + "loss": 1.5247, + "step": 802 + }, + { + "epoch": 0.10768886902101028, + "grad_norm": 1.1461866619519925, + "learning_rate": 5.846924467891654e-07, + "loss": 1.6982, + "step": 803 + }, + { + "epoch": 0.1078229772016093, + "grad_norm": 1.072949621974548, + "learning_rate": 5.846546626789727e-07, + "loss": 1.6836, + "step": 804 + }, + { + "epoch": 0.10795708538220831, + "grad_norm": 1.2070245013041887, + "learning_rate": 5.846168333582527e-07, + "loss": 1.6951, + "step": 805 + }, + { + "epoch": 0.10809119356280733, + "grad_norm": 1.1065226823941745, + "learning_rate": 5.845789588337217e-07, + "loss": 1.6581, + "step": 806 + }, + { + "epoch": 0.10822530174340635, + "grad_norm": 1.1493594907559954, + "learning_rate": 5.845410391121034e-07, + "loss": 1.5682, + "step": 807 + }, + { + "epoch": 0.10835940992400536, + "grad_norm": 1.060419028705976, + "learning_rate": 5.845030742001301e-07, + "loss": 1.6098, + "step": 808 + }, + { + "epoch": 0.10849351810460438, + "grad_norm": 1.0986472798667166, + "learning_rate": 5.84465064104542e-07, + "loss": 1.6998, + "step": 809 + }, + { + "epoch": 0.1086276262852034, + "grad_norm": 1.0780015294363108, + "learning_rate": 5.844270088320872e-07, + "loss": 1.6396, + "step": 810 + }, + { + "epoch": 0.10876173446580241, + "grad_norm": 1.1471597573517582, + "learning_rate": 5.843889083895219e-07, + "loss": 1.7247, + "step": 811 + }, + { + "epoch": 0.10889584264640143, + "grad_norm": 1.1383862809473648, + "learning_rate": 5.843507627836106e-07, + "loss": 1.6618, + "step": 812 + }, + { + "epoch": 0.10902995082700044, + "grad_norm": 1.1192741205184784, + "learning_rate": 5.843125720211251e-07, + "loss": 1.6551, + "step": 813 + }, + { + "epoch": 0.10916405900759947, + "grad_norm": 1.137804969239655, + "learning_rate": 5.84274336108846e-07, + "loss": 1.7777, + "step": 814 + }, + { + "epoch": 0.10929816718819849, + "grad_norm": 1.153664414743612, + "learning_rate": 5.842360550535614e-07, + "loss": 1.693, + "step": 815 + }, + { + "epoch": 0.1094322753687975, + "grad_norm": 1.2362947655431056, + "learning_rate": 5.841977288620676e-07, + "loss": 1.7216, + "step": 816 + }, + { + "epoch": 0.10956638354939652, + "grad_norm": 1.0845642638897275, + "learning_rate": 5.84159357541169e-07, + "loss": 1.704, + "step": 817 + }, + { + "epoch": 0.10970049172999553, + "grad_norm": 1.1373055917212407, + "learning_rate": 5.841209410976779e-07, + "loss": 1.7146, + "step": 818 + }, + { + "epoch": 0.10983459991059455, + "grad_norm": 1.071610572427508, + "learning_rate": 5.840824795384146e-07, + "loss": 1.6785, + "step": 819 + }, + { + "epoch": 0.10996870809119356, + "grad_norm": 1.1237115070149213, + "learning_rate": 5.840439728702073e-07, + "loss": 1.7022, + "step": 820 + }, + { + "epoch": 0.11010281627179258, + "grad_norm": 1.1135499435889078, + "learning_rate": 5.840054210998925e-07, + "loss": 1.6762, + "step": 821 + }, + { + "epoch": 0.1102369244523916, + "grad_norm": 1.1412142978650357, + "learning_rate": 5.839668242343147e-07, + "loss": 1.7325, + "step": 822 }, { - "epoch": 0.002216598629403181, - "grad_norm": 1.7977538453913595, - "learning_rate": 4.999975752937336e-07, - "loss": 1.7526, - "step": 60 + "epoch": 0.11037103263299061, + "grad_norm": 1.066696944750096, + "learning_rate": 5.839281822803259e-07, + "loss": 1.7209, + "step": 823 }, { - "epoch": 0.002955464839204241, - "grad_norm": 1.6391792506527756, - "learning_rate": 4.999945444231491e-07, - "loss": 1.7305, - "step": 80 + "epoch": 0.11050514081358963, + "grad_norm": 1.109425591853705, + "learning_rate": 5.838894952447866e-07, + "loss": 1.6248, + "step": 824 }, { - "epoch": 0.0036943310490053015, - "grad_norm": 2.046829887141379, - "learning_rate": 4.999903012271942e-07, - "loss": 1.741, - "step": 100 + "epoch": 0.11063924899418864, + "grad_norm": 1.0738541935378725, + "learning_rate": 5.838507631345652e-07, + "loss": 1.6582, + "step": 825 }, { - "epoch": 0.004433197258806362, - "grad_norm": 1.8506603870837903, - "learning_rate": 4.999848457287324e-07, - "loss": 1.7129, - "step": 120 + "epoch": 0.11077335717478766, + "grad_norm": 1.4358787492291483, + "learning_rate": 5.838119859565381e-07, + "loss": 1.807, + "step": 826 }, { - "epoch": 0.005172063468607422, - "grad_norm": 1.532512758048568, - "learning_rate": 4.999781779571592e-07, - "loss": 1.6774, - "step": 140 + "epoch": 0.11090746535538668, + "grad_norm": 1.1425108913039257, + "learning_rate": 5.837731637175898e-07, + "loss": 1.6146, + "step": 827 }, { - "epoch": 0.005910929678408482, - "grad_norm": 1.5744798799071549, - "learning_rate": 4.999702979484023e-07, - "loss": 1.7007, - "step": 160 + "epoch": 0.11104157353598569, + "grad_norm": 1.0637227390318094, + "learning_rate": 5.837342964246123e-07, + "loss": 1.6954, + "step": 828 }, { - "epoch": 0.006649795888209542, - "grad_norm": 3.3220978847027203, - "learning_rate": 4.999612057449209e-07, - "loss": 1.713, - "step": 180 + "epoch": 0.1111756817165847, + "grad_norm": 1.1694795366123236, + "learning_rate": 5.836953840845062e-07, + "loss": 1.6337, + "step": 829 }, { - "epoch": 0.007388662098010603, - "grad_norm": 1.6192232490789547, - "learning_rate": 4.999509013957061e-07, - "loss": 1.7085, - "step": 200 + "epoch": 0.11130978989718372, + "grad_norm": 1.1776659131207758, + "learning_rate": 5.836564267041799e-07, + "loss": 1.7132, + "step": 830 }, { - "epoch": 0.008127528307811663, - "grad_norm": 1.6102605979486695, - "learning_rate": 4.999393849562803e-07, - "loss": 1.6909, - "step": 220 + "epoch": 0.11144389807778274, + "grad_norm": 1.0835328202264551, + "learning_rate": 5.836174242905497e-07, + "loss": 1.7406, + "step": 831 }, { - "epoch": 0.008866394517612723, - "grad_norm": 1.7856392499507323, - "learning_rate": 4.999266564886968e-07, - "loss": 1.7105, - "step": 240 + "epoch": 0.11157800625838177, + "grad_norm": 1.0933003960120042, + "learning_rate": 5.835783768505399e-07, + "loss": 1.6104, + "step": 832 }, { - "epoch": 0.009605260727413783, - "grad_norm": 1.462292806282488, - "learning_rate": 4.999127160615396e-07, - "loss": 1.7254, - "step": 260 + "epoch": 0.11171211443898078, + "grad_norm": 1.075129502416788, + "learning_rate": 5.835392843910829e-07, + "loss": 1.6599, + "step": 833 }, { - "epoch": 0.010344126937214844, - "grad_norm": 1.6210572507919945, - "learning_rate": 4.998975637499234e-07, - "loss": 1.7228, - "step": 280 + "epoch": 0.1118462226195798, + "grad_norm": 1.1891418452392997, + "learning_rate": 5.835001469191191e-07, + "loss": 1.6589, + "step": 834 }, { - "epoch": 0.011082993147015904, - "grad_norm": 1.5952635257667038, - "learning_rate": 4.998811996354924e-07, - "loss": 1.747, - "step": 300 + "epoch": 0.11198033080017882, + "grad_norm": 1.7726602578762463, + "learning_rate": 5.834609644415967e-07, + "loss": 1.8068, + "step": 835 }, { - "epoch": 0.011821859356816964, - "grad_norm": 1.7849552493640788, - "learning_rate": 4.998636238064202e-07, - "loss": 1.6851, - "step": 320 + "epoch": 0.11211443898077783, + "grad_norm": 1.1160187069875398, + "learning_rate": 5.834217369654723e-07, + "loss": 1.7302, + "step": 836 }, { - "epoch": 0.012560725566618025, - "grad_norm": 1.7748576256781579, - "learning_rate": 4.9984483635741e-07, - "loss": 1.7215, - "step": 340 + "epoch": 0.11224854716137685, + "grad_norm": 1.2586778829179404, + "learning_rate": 5.833824644977098e-07, + "loss": 1.5899, + "step": 837 }, { - "epoch": 0.013299591776419085, - "grad_norm": 1.5217631531152356, - "learning_rate": 4.998248373896929e-07, - "loss": 1.7062, - "step": 360 + "epoch": 0.11238265534197586, + "grad_norm": 1.1096559717797458, + "learning_rate": 5.833431470452818e-07, + "loss": 1.7175, + "step": 838 }, { - "epoch": 0.014038457986220144, - "grad_norm": 1.8070743437219547, - "learning_rate": 4.998036270110284e-07, - "loss": 1.7108, - "step": 380 + "epoch": 0.11251676352257488, + "grad_norm": 1.1754882099239772, + "learning_rate": 5.833037846151686e-07, + "loss": 1.6674, + "step": 839 }, { - "epoch": 0.014777324196021206, - "grad_norm": 1.931037650128842, - "learning_rate": 4.997812053357031e-07, - "loss": 1.6739, - "step": 400 + "epoch": 0.1126508717031739, + "grad_norm": 1.030872040717494, + "learning_rate": 5.832643772143582e-07, + "loss": 1.6117, + "step": 840 }, { - "epoch": 0.015516190405822266, - "grad_norm": 2.3425787137976073, - "learning_rate": 4.997575724845303e-07, - "loss": 1.6501, - "step": 420 + "epoch": 0.11278497988377291, + "grad_norm": 1.1260356355011998, + "learning_rate": 5.832249248498472e-07, + "loss": 1.6813, + "step": 841 }, { - "epoch": 0.016255056615623327, - "grad_norm": 1.5819249160473718, - "learning_rate": 4.997327285848497e-07, - "loss": 1.7295, - "step": 440 + "epoch": 0.11291908806437193, + "grad_norm": 1.0550888868426265, + "learning_rate": 5.831854275286396e-07, + "loss": 1.6859, + "step": 842 }, { - "epoch": 0.016993922825424387, - "grad_norm": 1.780767344095751, - "learning_rate": 4.997066737705263e-07, - "loss": 1.7035, - "step": 460 + "epoch": 0.11305319624497094, + "grad_norm": 1.165191007399385, + "learning_rate": 5.831458852577477e-07, + "loss": 1.6982, + "step": 843 }, { - "epoch": 0.017732789035225446, - "grad_norm": 1.502517772930168, - "learning_rate": 4.996794081819497e-07, - "loss": 1.72, - "step": 480 + "epoch": 0.11318730442556996, + "grad_norm": 1.178851685175072, + "learning_rate": 5.831062980441918e-07, + "loss": 1.6891, + "step": 844 }, { - "epoch": 0.018471655245026506, - "grad_norm": 1.6230104328728192, - "learning_rate": 4.996509319660336e-07, - "loss": 1.7052, - "step": 500 + "epoch": 0.11332141260616897, + "grad_norm": 1.173173669662085, + "learning_rate": 5.830666658949999e-07, + "loss": 1.7388, + "step": 845 }, { - "epoch": 0.019210521454827566, - "grad_norm": 2.5063386321134287, - "learning_rate": 4.996212452762147e-07, - "loss": 1.7111, - "step": 520 + "epoch": 0.11345552078676799, + "grad_norm": 1.1552209879477302, + "learning_rate": 5.830269888172083e-07, + "loss": 1.7383, + "step": 846 }, { - "epoch": 0.01994938766462863, - "grad_norm": 1.508194569170525, - "learning_rate": 4.995903482724523e-07, - "loss": 1.7116, - "step": 540 + "epoch": 0.113589628967367, + "grad_norm": 1.0974766482142095, + "learning_rate": 5.82987266817861e-07, + "loss": 1.7139, + "step": 847 }, { - "epoch": 0.02068825387442969, - "grad_norm": 1.5662589803980058, - "learning_rate": 4.995582411212267e-07, - "loss": 1.6586, - "step": 560 + "epoch": 0.11372373714796602, + "grad_norm": 1.1314238053001549, + "learning_rate": 5.829474999040102e-07, + "loss": 1.6041, + "step": 848 }, { - "epoch": 0.021427120084230748, - "grad_norm": 1.5479171922961865, - "learning_rate": 4.995249239955392e-07, - "loss": 1.6605, - "step": 580 + "epoch": 0.11385784532856505, + "grad_norm": 1.100933720786019, + "learning_rate": 5.829076880827159e-07, + "loss": 1.7101, + "step": 849 }, { - "epoch": 0.022165986294031808, - "grad_norm": 1.4441787150001577, - "learning_rate": 4.994903970749107e-07, - "loss": 1.6952, - "step": 600 + "epoch": 0.11399195350916407, + "grad_norm": 1.1461722995944397, + "learning_rate": 5.828678313610463e-07, + "loss": 1.7009, + "step": 850 }, { - "epoch": 0.022904852503832868, - "grad_norm": 1.8034952536565763, - "learning_rate": 4.994546605453804e-07, - "loss": 1.6928, - "step": 620 + "epoch": 0.11412606168976308, + "grad_norm": 1.2722684302580665, + "learning_rate": 5.828279297460774e-07, + "loss": 1.6484, + "step": 851 }, { - "epoch": 0.023643718713633927, - "grad_norm": 2.0505792045813123, - "learning_rate": 4.994177145995056e-07, - "loss": 1.6979, - "step": 640 + "epoch": 0.1142601698703621, + "grad_norm": 1.1151947943169025, + "learning_rate": 5.82787983244893e-07, + "loss": 1.655, + "step": 852 }, { - "epoch": 0.02438258492343499, - "grad_norm": 1.7345451000474756, - "learning_rate": 4.993795594363599e-07, - "loss": 1.6827, - "step": 660 + "epoch": 0.11439427805096111, + "grad_norm": 1.1184598730723336, + "learning_rate": 5.827479918645852e-07, + "loss": 1.6165, + "step": 853 }, { - "epoch": 0.02512145113323605, - "grad_norm": 1.7265664949693813, - "learning_rate": 4.993401952615327e-07, - "loss": 1.6949, - "step": 680 + "epoch": 0.11452838623156013, + "grad_norm": 1.023276016208069, + "learning_rate": 5.827079556122542e-07, + "loss": 1.4802, + "step": 854 }, { - "epoch": 0.02586031734303711, - "grad_norm": 1.5684431888117931, - "learning_rate": 4.992996222871278e-07, - "loss": 1.6725, - "step": 700 + "epoch": 0.11466249441215914, + "grad_norm": 1.1363089821207286, + "learning_rate": 5.826678744950074e-07, + "loss": 1.7255, + "step": 855 }, { - "epoch": 0.02659918355283817, - "grad_norm": 1.8458741005435486, - "learning_rate": 4.992578407317622e-07, - "loss": 1.6876, - "step": 720 + "epoch": 0.11479660259275816, + "grad_norm": 1.1011868598006873, + "learning_rate": 5.826277485199609e-07, + "loss": 1.6958, + "step": 856 }, { - "epoch": 0.02733804976263923, - "grad_norm": 1.603183220486937, - "learning_rate": 4.992148508205652e-07, - "loss": 1.7001, - "step": 740 + "epoch": 0.11493071077335718, + "grad_norm": 1.1338150939022813, + "learning_rate": 5.825875776942388e-07, + "loss": 1.7061, + "step": 857 }, { - "epoch": 0.02807691597244029, - "grad_norm": 1.4656870216667528, - "learning_rate": 4.991706527851766e-07, - "loss": 1.6743, - "step": 760 + "epoch": 0.11506481895395619, + "grad_norm": 1.130051416794989, + "learning_rate": 5.825473620249724e-07, + "loss": 1.7138, + "step": 858 }, { - "epoch": 0.028815782182241352, - "grad_norm": 1.79341933069724, - "learning_rate": 4.991252468637465e-07, - "loss": 1.6894, - "step": 780 + "epoch": 0.1151989271345552, + "grad_norm": 1.0842663625693372, + "learning_rate": 5.825071015193018e-07, + "loss": 1.6059, + "step": 859 }, { - "epoch": 0.029554648392042412, - "grad_norm": 1.4496770314789245, - "learning_rate": 4.990786333009329e-07, - "loss": 1.7038, - "step": 800 + "epoch": 0.11533303531515422, + "grad_norm": 1.126331708345394, + "learning_rate": 5.824667961843746e-07, + "loss": 1.6874, + "step": 860 }, { - "epoch": 0.03029351460184347, - "grad_norm": 1.757004570982493, - "learning_rate": 4.990308123479012e-07, - "loss": 1.7134, - "step": 820 + "epoch": 0.11546714349575324, + "grad_norm": 1.067788867144983, + "learning_rate": 5.824264460273465e-07, + "loss": 1.7211, + "step": 861 }, { - "epoch": 0.03103238081164453, - "grad_norm": 1.5200364379437228, - "learning_rate": 4.98981784262322e-07, - "loss": 1.6698, - "step": 840 + "epoch": 0.11560125167635225, + "grad_norm": 1.0567680329056464, + "learning_rate": 5.823860510553811e-07, + "loss": 1.5729, + "step": 862 }, { - "epoch": 0.03177124702144559, - "grad_norm": 1.486100216095798, - "learning_rate": 4.989315493083708e-07, - "loss": 1.6896, - "step": 860 + "epoch": 0.11573535985695127, + "grad_norm": 1.088021498471896, + "learning_rate": 5.823456112756498e-07, + "loss": 1.6884, + "step": 863 + }, + { + "epoch": 0.11586946803755029, + "grad_norm": 1.1157283518569765, + "learning_rate": 5.823051266953325e-07, + "loss": 1.6806, + "step": 864 + }, + { + "epoch": 0.1160035762181493, + "grad_norm": 1.0681883774872867, + "learning_rate": 5.822645973216165e-07, + "loss": 1.6397, + "step": 865 + }, + { + "epoch": 0.11613768439874832, + "grad_norm": 1.0861783292304394, + "learning_rate": 5.822240231616973e-07, + "loss": 1.575, + "step": 866 + }, + { + "epoch": 0.11627179257934735, + "grad_norm": 1.068546853668492, + "learning_rate": 5.821834042227783e-07, + "loss": 1.6436, + "step": 867 + }, + { + "epoch": 0.11640590075994636, + "grad_norm": 1.1370891534192904, + "learning_rate": 5.821427405120708e-07, + "loss": 1.7133, + "step": 868 }, { - "epoch": 0.032510113231246654, - "grad_norm": 1.6006604995588511, - "learning_rate": 4.988801077567258e-07, - "loss": 1.6842, + "epoch": 0.11654000894054538, + "grad_norm": 1.0975985479163, + "learning_rate": 5.821020320367942e-07, + "loss": 1.7395, + "step": 869 + }, + { + "epoch": 0.1166741171211444, + "grad_norm": 1.0979310675749658, + "learning_rate": 5.820612788041756e-07, + "loss": 1.733, + "step": 870 + }, + { + "epoch": 0.11680822530174341, + "grad_norm": 1.1290790783874916, + "learning_rate": 5.820204808214503e-07, + "loss": 1.5963, + "step": 871 + }, + { + "epoch": 0.11694233348234243, + "grad_norm": 1.0767125460282738, + "learning_rate": 5.819796380958613e-07, + "loss": 1.7139, + "step": 872 + }, + { + "epoch": 0.11707644166294144, + "grad_norm": 1.242641974109421, + "learning_rate": 5.819387506346598e-07, + "loss": 1.7068, + "step": 873 + }, + { + "epoch": 0.11721054984354046, + "grad_norm": 1.0978061234757794, + "learning_rate": 5.818978184451048e-07, + "loss": 1.625, + "step": 874 + }, + { + "epoch": 0.11734465802413947, + "grad_norm": 1.0887952709463755, + "learning_rate": 5.818568415344633e-07, + "loss": 1.6017, + "step": 875 + }, + { + "epoch": 0.11747876620473849, + "grad_norm": 1.0584442299701264, + "learning_rate": 5.818158199100101e-07, + "loss": 1.7367, + "step": 876 + }, + { + "epoch": 0.1176128743853375, + "grad_norm": 1.0996935525118328, + "learning_rate": 5.817747535790283e-07, + "loss": 1.6186, + "step": 877 + }, + { + "epoch": 0.11774698256593652, + "grad_norm": 1.1314747020843203, + "learning_rate": 5.817336425488082e-07, + "loss": 1.6249, + "step": 878 + }, + { + "epoch": 0.11788109074653554, + "grad_norm": 1.1919795844521832, + "learning_rate": 5.81692486826649e-07, + "loss": 1.6532, + "step": 879 + }, + { + "epoch": 0.11801519892713455, + "grad_norm": 1.305262723197089, + "learning_rate": 5.816512864198571e-07, + "loss": 1.5978, "step": 880 }, { - "epoch": 0.03324897944104771, - "grad_norm": 1.6369118826080298, - "learning_rate": 4.988274598845665e-07, - "loss": 1.7129, - "step": 900 + "epoch": 0.11814930710773357, + "grad_norm": 1.1155976857853542, + "learning_rate": 5.816100413357471e-07, + "loss": 1.6797, + "step": 881 }, { - "epoch": 0.03398784565084877, - "grad_norm": 1.594714153238538, - "learning_rate": 4.987736059755724e-07, - "loss": 1.6812, - "step": 920 + "epoch": 0.11828341528833258, + "grad_norm": 1.123108419027786, + "learning_rate": 5.815687515816415e-07, + "loss": 1.5944, + "step": 882 }, { - "epoch": 0.03472671186064983, - "grad_norm": 1.691349253313934, - "learning_rate": 4.987185463199215e-07, - "loss": 1.7131, - "step": 940 + "epoch": 0.1184175234689316, + "grad_norm": 1.1318300431723485, + "learning_rate": 5.815274171648709e-07, + "loss": 1.6328, + "step": 883 }, { - "epoch": 0.03546557807045089, - "grad_norm": 2.1466962990805385, - "learning_rate": 4.986622812142888e-07, - "loss": 1.7217, - "step": 960 + "epoch": 0.11855163164953063, + "grad_norm": 1.1498251619378483, + "learning_rate": 5.814860380927734e-07, + "loss": 1.6131, + "step": 884 }, { - "epoch": 0.036204444280251956, - "grad_norm": 1.6551865204895997, - "learning_rate": 4.986048109618442e-07, - "loss": 1.7179, - "step": 980 + "epoch": 0.11868573983012964, + "grad_norm": 1.0940645690658886, + "learning_rate": 5.814446143726956e-07, + "loss": 1.6142, + "step": 885 }, { - "epoch": 0.03694331049005301, - "grad_norm": 1.5681769699914139, - "learning_rate": 4.985461358722514e-07, - "loss": 1.6897, - "step": 1000 + "epoch": 0.11881984801072866, + "grad_norm": 1.0820516072736348, + "learning_rate": 5.814031460119914e-07, + "loss": 1.6148, + "step": 886 }, { - "epoch": 0.037682176699854075, - "grad_norm": 1.5623589901869384, - "learning_rate": 4.984862562616661e-07, - "loss": 1.7307, - "step": 1020 + "epoch": 0.11895395619132768, + "grad_norm": 1.1247339726082044, + "learning_rate": 5.813616330180233e-07, + "loss": 1.7608, + "step": 887 }, { - "epoch": 0.03842104290965513, - "grad_norm": 1.710638101923504, - "learning_rate": 4.984251724527342e-07, - "loss": 1.6815, - "step": 1040 + "epoch": 0.11908806437192669, + "grad_norm": 1.3664008359044402, + "learning_rate": 5.813200753981611e-07, + "loss": 1.6969, + "step": 888 }, { - "epoch": 0.039159909119456195, - "grad_norm": 2.613860202511964, - "learning_rate": 4.983628847745904e-07, - "loss": 1.6798, - "step": 1060 + "epoch": 0.11922217255252571, + "grad_norm": 1.1603697359280436, + "learning_rate": 5.812784731597829e-07, + "loss": 1.7402, + "step": 889 }, { - "epoch": 0.03989877532925726, - "grad_norm": 1.9956576024499864, - "learning_rate": 4.982993935628554e-07, - "loss": 1.6715, - "step": 1080 + "epoch": 0.11935628073312472, + "grad_norm": 1.1010475016983683, + "learning_rate": 5.812368263102746e-07, + "loss": 1.759, + "step": 890 }, { - "epoch": 0.040637641539058314, - "grad_norm": 1.833965747583207, - "learning_rate": 4.982346991596356e-07, - "loss": 1.7044, - "step": 1100 + "epoch": 0.11949038891372374, + "grad_norm": 1.1085219941083455, + "learning_rate": 5.811951348570302e-07, + "loss": 1.667, + "step": 891 }, { - "epoch": 0.04137650774885938, - "grad_norm": 1.709065067682895, - "learning_rate": 4.981688019135202e-07, - "loss": 1.6612, - "step": 1120 + "epoch": 0.11962449709432275, + "grad_norm": 1.1139382749577305, + "learning_rate": 5.811533988074512e-07, + "loss": 1.6677, + "step": 892 }, { - "epoch": 0.04211537395866043, - "grad_norm": 2.2904499102757767, - "learning_rate": 4.981017021795794e-07, - "loss": 1.6984, - "step": 1140 + "epoch": 0.11975860527492177, + "grad_norm": 1.1325956159096344, + "learning_rate": 5.811116181689475e-07, + "loss": 1.7068, + "step": 893 }, { - "epoch": 0.042854240168461497, - "grad_norm": 1.5809892282131641, - "learning_rate": 4.980334003193632e-07, - "loss": 1.672, - "step": 1160 + "epoch": 0.11989271345552079, + "grad_norm": 1.0408410504808954, + "learning_rate": 5.810697929489365e-07, + "loss": 1.6708, + "step": 894 }, { - "epoch": 0.04359310637826255, - "grad_norm": 1.4895408854624943, - "learning_rate": 4.979638967008983e-07, - "loss": 1.6637, - "step": 1180 + "epoch": 0.1200268216361198, + "grad_norm": 1.0658514906014669, + "learning_rate": 5.810279231548439e-07, + "loss": 1.6833, + "step": 895 }, { - "epoch": 0.044331972588063616, - "grad_norm": 1.6294048072820626, - "learning_rate": 4.978931916986874e-07, - "loss": 1.6604, - "step": 1200 + "epoch": 0.12016092981671882, + "grad_norm": 1.0840346983956348, + "learning_rate": 5.80986008794103e-07, + "loss": 1.6973, + "step": 896 }, { - "epoch": 0.04507083879786468, - "grad_norm": 2.156396711607377, - "learning_rate": 4.978212856937062e-07, - "loss": 1.678, - "step": 1220 + "epoch": 0.12029503799731783, + "grad_norm": 1.1508325943207491, + "learning_rate": 5.809440498741552e-07, + "loss": 1.7326, + "step": 897 }, { - "epoch": 0.045809705007665735, - "grad_norm": 1.6010232675443634, - "learning_rate": 4.977481790734016e-07, - "loss": 1.6922, - "step": 1240 + "epoch": 0.12042914617791685, + "grad_norm": 1.0629236207923716, + "learning_rate": 5.809020464024496e-07, + "loss": 1.5428, + "step": 898 }, { - "epoch": 0.0465485712174668, - "grad_norm": 1.4024504403885678, - "learning_rate": 4.9767387223169e-07, - "loss": 1.6538, - "step": 1260 + "epoch": 0.12056325435851586, + "grad_norm": 1.112200747649366, + "learning_rate": 5.808599983864435e-07, + "loss": 1.6729, + "step": 899 }, { - "epoch": 0.047287437427267855, - "grad_norm": 1.711902948101267, - "learning_rate": 4.975983655689547e-07, - "loss": 1.6844, - "step": 1280 + "epoch": 0.12069736253911488, + "grad_norm": 1.2078470991285137, + "learning_rate": 5.80817905833602e-07, + "loss": 1.738, + "step": 900 }, { - "epoch": 0.04802630363706892, - "grad_norm": 1.572788133497536, - "learning_rate": 4.975216594920441e-07, - "loss": 1.6773, - "step": 1300 + "epoch": 0.1208314707197139, + "grad_norm": 1.1190068417460075, + "learning_rate": 5.807757687513979e-07, + "loss": 1.6607, + "step": 901 }, { - "epoch": 0.04876516984686998, - "grad_norm": 1.5865129712420638, - "learning_rate": 4.974437544142691e-07, - "loss": 1.6794, - "step": 1320 + "epoch": 0.12096557890031293, + "grad_norm": 1.0450615497760403, + "learning_rate": 5.807335871473122e-07, + "loss": 1.6588, + "step": 902 }, { - "epoch": 0.04950403605667104, - "grad_norm": 1.5690014017001472, - "learning_rate": 4.973646507554012e-07, - "loss": 1.7073, - "step": 1340 + "epoch": 0.12109968708091194, + "grad_norm": 1.121198054415324, + "learning_rate": 5.806913610288336e-07, + "loss": 1.662, + "step": 903 }, { - "epoch": 0.0502429022664721, - "grad_norm": 1.5937164094810738, - "learning_rate": 4.972843489416702e-07, - "loss": 1.6958, - "step": 1360 + "epoch": 0.12123379526151096, + "grad_norm": 1.1054682653267978, + "learning_rate": 5.806490904034589e-07, + "loss": 1.6706, + "step": 904 }, { - "epoch": 0.05098176847627316, - "grad_norm": 1.5264901796499448, - "learning_rate": 4.972028494057619e-07, - "loss": 1.6452, - "step": 1380 + "epoch": 0.12136790344210997, + "grad_norm": 1.113997411395293, + "learning_rate": 5.806067752786926e-07, + "loss": 1.7632, + "step": 905 }, { - "epoch": 0.05172063468607422, - "grad_norm": 1.7268277514753942, - "learning_rate": 4.971201525868155e-07, - "loss": 1.6944, - "step": 1400 + "epoch": 0.12150201162270899, + "grad_norm": 1.1613864633248003, + "learning_rate": 5.805644156620472e-07, + "loss": 1.7098, + "step": 906 }, { - "epoch": 0.052459500895875276, - "grad_norm": 1.6746090286211905, - "learning_rate": 4.970362589304216e-07, - "loss": 1.6621, - "step": 1420 + "epoch": 0.121636119803308, + "grad_norm": 1.1055893873511211, + "learning_rate": 5.805220115610431e-07, + "loss": 1.7946, + "step": 907 }, { - "epoch": 0.05319836710567634, - "grad_norm": 1.5009694467436718, - "learning_rate": 4.969511688886198e-07, - "loss": 1.6797, - "step": 1440 + "epoch": 0.12177022798390702, + "grad_norm": 1.059537639783976, + "learning_rate": 5.804795629832085e-07, + "loss": 1.6377, + "step": 908 }, { - "epoch": 0.0539372333154774, - "grad_norm": 1.4662106712988012, - "learning_rate": 4.968648829198958e-07, - "loss": 1.6664, - "step": 1460 + "epoch": 0.12190433616450604, + "grad_norm": 1.075756870276535, + "learning_rate": 5.804370699360796e-07, + "loss": 1.6709, + "step": 909 }, { - "epoch": 0.05467609952527846, - "grad_norm": 1.5749221565087543, - "learning_rate": 4.967774014891796e-07, - "loss": 1.7086, - "step": 1480 + "epoch": 0.12203844434510505, + "grad_norm": 1.0951662603881447, + "learning_rate": 5.803945324272006e-07, + "loss": 1.6114, + "step": 910 }, { - "epoch": 0.05541496573507952, - "grad_norm": 1.482093097261866, - "learning_rate": 4.966887250678421e-07, - "loss": 1.7089, - "step": 1500 + "epoch": 0.12217255252570407, + "grad_norm": 1.0835170338297386, + "learning_rate": 5.803519504641234e-07, + "loss": 1.6945, + "step": 911 }, { - "epoch": 0.05615383194488058, - "grad_norm": 1.520435320135513, - "learning_rate": 4.965988541336936e-07, - "loss": 1.6734, - "step": 1520 + "epoch": 0.12230666070630308, + "grad_norm": 1.188508933084379, + "learning_rate": 5.803093240544077e-07, + "loss": 1.7176, + "step": 912 }, { - "epoch": 0.05689269815468164, - "grad_norm": 1.5553430296104012, - "learning_rate": 4.965077891709807e-07, - "loss": 1.697, - "step": 1540 + "epoch": 0.1224407688869021, + "grad_norm": 1.0574940351976068, + "learning_rate": 5.802666532056215e-07, + "loss": 1.6449, + "step": 913 }, { - "epoch": 0.057631564364482704, - "grad_norm": 1.3543792401342896, - "learning_rate": 4.964155306703835e-07, - "loss": 1.6997, - "step": 1560 + "epoch": 0.12257487706750111, + "grad_norm": 1.1011954691706793, + "learning_rate": 5.802239379253403e-07, + "loss": 1.7403, + "step": 914 }, { - "epoch": 0.05837043057428376, - "grad_norm": 1.575031153656866, - "learning_rate": 4.963220791290132e-07, - "loss": 1.6916, - "step": 1580 + "epoch": 0.12270898524810013, + "grad_norm": 1.05289982245001, + "learning_rate": 5.801811782211476e-07, + "loss": 1.7121, + "step": 915 }, { - "epoch": 0.059109296784084824, - "grad_norm": 1.669401673230416, - "learning_rate": 4.962274350504096e-07, - "loss": 1.7042, - "step": 1600 + "epoch": 0.12284309342869915, + "grad_norm": 1.1247742251938873, + "learning_rate": 5.801383741006349e-07, + "loss": 1.6904, + "step": 916 }, { - "epoch": 0.05984816299388588, - "grad_norm": 1.5212881661869584, - "learning_rate": 4.961315989445378e-07, - "loss": 1.667, - "step": 1620 + "epoch": 0.12297720160929816, + "grad_norm": 1.1060690034689273, + "learning_rate": 5.800955255714014e-07, + "loss": 1.5423, + "step": 917 }, { - "epoch": 0.06058702920368694, - "grad_norm": 1.5762244565376538, - "learning_rate": 4.960345713277863e-07, - "loss": 1.6342, - "step": 1640 + "epoch": 0.12311130978989718, + "grad_norm": 1.17690980567079, + "learning_rate": 5.800526326410544e-07, + "loss": 1.6638, + "step": 918 }, { - "epoch": 0.061325895413488006, - "grad_norm": 1.5691787785228513, - "learning_rate": 4.959363527229634e-07, - "loss": 1.6525, - "step": 1660 + "epoch": 0.12324541797049621, + "grad_norm": 1.0758724475892376, + "learning_rate": 5.800096953172088e-07, + "loss": 1.7136, + "step": 919 }, { - "epoch": 0.06206476162328906, - "grad_norm": 1.6437209138688083, - "learning_rate": 4.958369436592948e-07, - "loss": 1.6769, - "step": 1680 + "epoch": 0.12337952615109522, + "grad_norm": 1.0847412248840858, + "learning_rate": 5.799667136074878e-07, + "loss": 1.7712, + "step": 920 }, { - "epoch": 0.06280362783309013, - "grad_norm": 1.553888728962609, - "learning_rate": 4.957363446724208e-07, - "loss": 1.6924, - "step": 1700 + "epoch": 0.12351363433169424, + "grad_norm": 1.1331387033738405, + "learning_rate": 5.799236875195219e-07, + "loss": 1.664, + "step": 921 }, { - "epoch": 0.06354249404289118, - "grad_norm": 1.5365170274491486, - "learning_rate": 4.956345563043933e-07, - "loss": 1.6894, - "step": 1720 + "epoch": 0.12364774251229325, + "grad_norm": 1.3262309930515026, + "learning_rate": 5.798806170609502e-07, + "loss": 1.6546, + "step": 922 }, { - "epoch": 0.06428136025269224, - "grad_norm": 1.5197453947387185, - "learning_rate": 4.955315791036727e-07, - "loss": 1.6758, - "step": 1740 + "epoch": 0.12378185069289227, + "grad_norm": 1.1280111604345993, + "learning_rate": 5.79837502239419e-07, + "loss": 1.6623, + "step": 923 }, { - "epoch": 0.06502022646249331, - "grad_norm": 2.464391338240643, - "learning_rate": 4.954274136251251e-07, - "loss": 1.6332, - "step": 1760 + "epoch": 0.12391595887349129, + "grad_norm": 1.1001484560762704, + "learning_rate": 5.797943430625828e-07, + "loss": 1.6743, + "step": 924 }, { - "epoch": 0.06575909267229436, - "grad_norm": 1.6394501426047832, - "learning_rate": 4.953220604300198e-07, - "loss": 1.6879, - "step": 1780 + "epoch": 0.1240500670540903, + "grad_norm": 1.1051963249243846, + "learning_rate": 5.79751139538104e-07, + "loss": 1.6542, + "step": 925 }, { - "epoch": 0.06649795888209542, - "grad_norm": 1.6810218422818062, - "learning_rate": 4.952155200860251e-07, - "loss": 1.6724, - "step": 1800 + "epoch": 0.12418417523468932, + "grad_norm": 2.096743814606382, + "learning_rate": 5.797078916736527e-07, + "loss": 1.7618, + "step": 926 + }, + { + "epoch": 0.12431828341528833, + "grad_norm": 1.1918807746678728, + "learning_rate": 5.79664599476907e-07, + "loss": 1.7489, + "step": 927 + }, + { + "epoch": 0.12445239159588735, + "grad_norm": 1.2255902304289672, + "learning_rate": 5.79621262955553e-07, + "loss": 1.805, + "step": 928 + }, + { + "epoch": 0.12458649977648636, + "grad_norm": 1.1112711388204457, + "learning_rate": 5.795778821172845e-07, + "loss": 1.6535, + "step": 929 + }, + { + "epoch": 0.12472060795708538, + "grad_norm": 1.15632851861526, + "learning_rate": 5.79534456969803e-07, + "loss": 1.7674, + "step": 930 + }, + { + "epoch": 0.1248547161376844, + "grad_norm": 1.1364857063021152, + "learning_rate": 5.794909875208182e-07, + "loss": 1.6668, + "step": 931 + }, + { + "epoch": 0.12498882431828341, + "grad_norm": 1.1554164021245972, + "learning_rate": 5.794474737780474e-07, + "loss": 1.6862, + "step": 932 + }, + { + "epoch": 0.12512293249888243, + "grad_norm": 1.1360253650713825, + "learning_rate": 5.79403915749216e-07, + "loss": 1.6811, + "step": 933 + }, + { + "epoch": 0.12525704067948146, + "grad_norm": 1.066412847829235, + "learning_rate": 5.793603134420571e-07, + "loss": 1.6562, + "step": 934 + }, + { + "epoch": 0.12539114886008046, + "grad_norm": 1.081900817528408, + "learning_rate": 5.793166668643118e-07, + "loss": 1.6319, + "step": 935 + }, + { + "epoch": 0.1255252570406795, + "grad_norm": 1.12430422704736, + "learning_rate": 5.792729760237288e-07, + "loss": 1.6679, + "step": 936 + }, + { + "epoch": 0.1256593652212785, + "grad_norm": 1.1555451362888864, + "learning_rate": 5.79229240928065e-07, + "loss": 1.6272, + "step": 937 + }, + { + "epoch": 0.12579347340187752, + "grad_norm": 1.1120423598959, + "learning_rate": 5.791854615850848e-07, + "loss": 1.7271, + "step": 938 + }, + { + "epoch": 0.12592758158247652, + "grad_norm": 1.099822375040922, + "learning_rate": 5.791416380025607e-07, + "loss": 1.6762, + "step": 939 + }, + { + "epoch": 0.12606168976307555, + "grad_norm": 1.1055384980174303, + "learning_rate": 5.79097770188273e-07, + "loss": 1.6526, + "step": 940 }, { - "epoch": 0.06723682509189649, - "grad_norm": 1.6917107156739108, - "learning_rate": 4.951077931672067e-07, - "loss": 1.6826, - "step": 1820 + "epoch": 0.12619579794367455, + "grad_norm": 1.1135160613742192, + "learning_rate": 5.7905385815001e-07, + "loss": 1.7112, + "step": 941 }, { - "epoch": 0.06797569130169755, - "grad_norm": 4.580444011220939, - "learning_rate": 4.949988802540229e-07, - "loss": 1.6581, - "step": 1840 + "epoch": 0.12632990612427358, + "grad_norm": 1.172524893436665, + "learning_rate": 5.790099018955674e-07, + "loss": 1.6629, + "step": 942 }, { - "epoch": 0.0687145575114986, - "grad_norm": 1.529682024928737, - "learning_rate": 4.948887819333236e-07, - "loss": 1.6649, - "step": 1860 + "epoch": 0.12646401430487259, + "grad_norm": 1.143908651612981, + "learning_rate": 5.789659014327492e-07, + "loss": 1.6004, + "step": 943 }, { - "epoch": 0.06945342372129966, - "grad_norm": 1.4414816212505979, - "learning_rate": 4.947774987983449e-07, - "loss": 1.6691, - "step": 1880 + "epoch": 0.12659812248547161, + "grad_norm": 1.0950798365706262, + "learning_rate": 5.789218567693672e-07, + "loss": 1.6794, + "step": 944 }, { - "epoch": 0.07019228993110073, - "grad_norm": 1.9799877388187868, - "learning_rate": 4.946650314487077e-07, - "loss": 1.6937, - "step": 1900 + "epoch": 0.12673223066607062, + "grad_norm": 1.0865150988933485, + "learning_rate": 5.788777679132408e-07, + "loss": 1.7733, + "step": 945 }, { - "epoch": 0.07093115614090179, - "grad_norm": 1.4952225950122013, - "learning_rate": 4.945513804904131e-07, - "loss": 1.6798, - "step": 1920 + "epoch": 0.12686633884666965, + "grad_norm": 1.081699940619205, + "learning_rate": 5.788336348721972e-07, + "loss": 1.6587, + "step": 946 }, { - "epoch": 0.07167002235070284, - "grad_norm": 1.5642244850204086, - "learning_rate": 4.944365465358407e-07, - "loss": 1.6783, - "step": 1940 + "epoch": 0.12700044702726868, + "grad_norm": 1.0733926398236942, + "learning_rate": 5.787894576540721e-07, + "loss": 1.6461, + "step": 947 }, { - "epoch": 0.07240888856050391, - "grad_norm": 1.5048141774546024, - "learning_rate": 4.943205302037432e-07, - "loss": 1.6486, - "step": 1960 + "epoch": 0.12713455520786768, + "grad_norm": 1.126195585933314, + "learning_rate": 5.787452362667083e-07, + "loss": 1.6838, + "step": 948 }, { - "epoch": 0.07314775477030497, - "grad_norm": 1.4222420311354336, - "learning_rate": 4.942033321192452e-07, - "loss": 1.6868, - "step": 1980 + "epoch": 0.1272686633884667, + "grad_norm": 1.1329864382691732, + "learning_rate": 5.787009707179567e-07, + "loss": 1.6329, + "step": 949 }, { - "epoch": 0.07388662098010602, - "grad_norm": 1.6893784971157513, - "learning_rate": 4.940849529138383e-07, - "loss": 1.6934, - "step": 2000 + "epoch": 0.1274027715690657, + "grad_norm": 1.1004395022968605, + "learning_rate": 5.786566610156759e-07, + "loss": 1.7147, + "step": 950 }, { - "epoch": 0.0746254871899071, - "grad_norm": 1.9605139373755291, - "learning_rate": 4.939653932253786e-07, - "loss": 1.6537, - "step": 2020 + "epoch": 0.12753687974966474, + "grad_norm": 1.0391080576189866, + "learning_rate": 5.78612307167733e-07, + "loss": 1.6315, + "step": 951 }, { - "epoch": 0.07536435339970815, - "grad_norm": 1.6497175696745814, - "learning_rate": 4.938446536980829e-07, - "loss": 1.7022, - "step": 2040 + "epoch": 0.12767098793026374, + "grad_norm": 1.0855474578853979, + "learning_rate": 5.78567909182002e-07, + "loss": 1.7127, + "step": 952 }, { - "epoch": 0.0761032196095092, - "grad_norm": 1.6258237906038047, - "learning_rate": 4.93722734982525e-07, - "loss": 1.6925, - "step": 2060 + "epoch": 0.12780509611086277, + "grad_norm": 1.1433214364150983, + "learning_rate": 5.785234670663652e-07, + "loss": 1.7042, + "step": 953 }, { - "epoch": 0.07684208581931026, - "grad_norm": 1.5236446480879742, - "learning_rate": 4.935996377356326e-07, - "loss": 1.6418, - "step": 2080 + "epoch": 0.12793920429146177, + "grad_norm": 1.0903898099360794, + "learning_rate": 5.784789808287129e-07, + "loss": 1.749, + "step": 954 }, { - "epoch": 0.07758095202911133, - "grad_norm": 1.5958168322212294, - "learning_rate": 4.934753626206837e-07, - "loss": 1.7259, - "step": 2100 + "epoch": 0.1280733124720608, + "grad_norm": 1.1462757739762268, + "learning_rate": 5.784344504769428e-07, + "loss": 1.7118, + "step": 955 }, { - "epoch": 0.07831981823891239, - "grad_norm": 2.45551144657574, - "learning_rate": 4.933499103073029e-07, - "loss": 1.7141, - "step": 2120 + "epoch": 0.1282074206526598, + "grad_norm": 1.0944948131751315, + "learning_rate": 5.783898760189609e-07, + "loss": 1.7308, + "step": 956 }, { - "epoch": 0.07905868444871345, - "grad_norm": 1.519298595383626, - "learning_rate": 4.932232814714576e-07, - "loss": 1.6712, - "step": 2140 + "epoch": 0.12834152883325883, + "grad_norm": 1.0898739853739683, + "learning_rate": 5.783452574626806e-07, + "loss": 1.5947, + "step": 957 }, { - "epoch": 0.07979755065851452, - "grad_norm": 1.6278461520508305, - "learning_rate": 4.930954767954551e-07, - "loss": 1.6784, - "step": 2160 + "epoch": 0.12847563701385784, + "grad_norm": 1.1070871512716438, + "learning_rate": 5.783005948160236e-07, + "loss": 1.7032, + "step": 958 }, { - "epoch": 0.08053641686831557, - "grad_norm": 1.5199514109499472, - "learning_rate": 4.92966496967938e-07, - "loss": 1.6842, - "step": 2180 + "epoch": 0.12860974519445686, + "grad_norm": 1.1173517977218599, + "learning_rate": 5.782558880869187e-07, + "loss": 1.76, + "step": 959 }, { - "epoch": 0.08127528307811663, - "grad_norm": 1.8268617588637115, - "learning_rate": 4.928363426838808e-07, - "loss": 1.714, - "step": 2200 + "epoch": 0.12874385337505587, + "grad_norm": 1.0784753543720036, + "learning_rate": 5.782111372833035e-07, + "loss": 1.6817, + "step": 960 }, { - "epoch": 0.08201414928791768, - "grad_norm": 1.671569089879459, - "learning_rate": 4.927050146445867e-07, - "loss": 1.6693, - "step": 2220 + "epoch": 0.1288779615556549, + "grad_norm": 1.099729300157914, + "learning_rate": 5.781663424131225e-07, + "loss": 1.5885, + "step": 961 }, { - "epoch": 0.08275301549771875, - "grad_norm": 1.4546842764348067, - "learning_rate": 4.92572513557683e-07, - "loss": 1.6724, - "step": 2240 + "epoch": 0.1290120697362539, + "grad_norm": 1.1053155402387764, + "learning_rate": 5.781215034843288e-07, + "loss": 1.649, + "step": 962 }, { - "epoch": 0.08349188170751981, - "grad_norm": 1.5602541654624753, - "learning_rate": 4.924388401371179e-07, - "loss": 1.6715, - "step": 2260 + "epoch": 0.12914617791685293, + "grad_norm": 1.0498243431495933, + "learning_rate": 5.780766205048826e-07, + "loss": 1.6, + "step": 963 }, { - "epoch": 0.08423074791732087, - "grad_norm": 1.6408350929881408, - "learning_rate": 4.923039951031562e-07, - "loss": 1.6538, - "step": 2280 + "epoch": 0.12928028609745196, + "grad_norm": 1.0650679197683777, + "learning_rate": 5.780316934827524e-07, + "loss": 1.7031, + "step": 964 }, { - "epoch": 0.08496961412712194, - "grad_norm": 1.3547990923859226, - "learning_rate": 4.921679791823761e-07, - "loss": 1.6639, - "step": 2300 + "epoch": 0.12941439427805096, + "grad_norm": 1.2041255427364985, + "learning_rate": 5.779867224259144e-07, + "loss": 1.7187, + "step": 965 }, { - "epoch": 0.08570848033692299, - "grad_norm": 1.536988279196407, - "learning_rate": 4.92030793107664e-07, - "loss": 1.6709, - "step": 2320 + "epoch": 0.12954850245865, + "grad_norm": 1.0678692273869028, + "learning_rate": 5.779417073423526e-07, + "loss": 1.6825, + "step": 966 }, { - "epoch": 0.08644734654672405, - "grad_norm": 1.4484585256339257, - "learning_rate": 4.918924376182121e-07, - "loss": 1.6517, - "step": 2340 + "epoch": 0.129682610639249, + "grad_norm": 1.1199711834538628, + "learning_rate": 5.778966482400589e-07, + "loss": 1.6826, + "step": 967 }, { - "epoch": 0.0871862127565251, - "grad_norm": 1.4965077209050879, - "learning_rate": 4.917529134595135e-07, - "loss": 1.6956, - "step": 2360 + "epoch": 0.12981671881984802, + "grad_norm": 1.3086828320370905, + "learning_rate": 5.778515451270329e-07, + "loss": 1.6527, + "step": 968 }, { - "epoch": 0.08792507896632618, - "grad_norm": 1.858590784934109, - "learning_rate": 4.916122213833584e-07, - "loss": 1.6667, - "step": 2380 + "epoch": 0.12995082700044702, + "grad_norm": 1.1283872527591725, + "learning_rate": 5.77806398011282e-07, + "loss": 1.6979, + "step": 969 }, { - "epoch": 0.08866394517612723, - "grad_norm": 1.6845854426029852, - "learning_rate": 4.914703621478297e-07, - "loss": 1.6392, - "step": 2400 + "epoch": 0.13008493518104605, + "grad_norm": 1.6891339086777561, + "learning_rate": 5.777612069008215e-07, + "loss": 1.6052, + "step": 970 }, { - "epoch": 0.08940281138592829, - "grad_norm": 1.479499322660105, - "learning_rate": 4.913273365172998e-07, - "loss": 1.6323, - "step": 2420 + "epoch": 0.13021904336164505, + "grad_norm": 1.0995419197341152, + "learning_rate": 5.777159718036745e-07, + "loss": 1.6741, + "step": 971 }, { - "epoch": 0.09014167759572936, - "grad_norm": 1.4475363138688357, - "learning_rate": 4.911831452624253e-07, - "loss": 1.655, - "step": 2440 + "epoch": 0.13035315154224408, + "grad_norm": 1.0826527648905109, + "learning_rate": 5.776706927278718e-07, + "loss": 1.7414, + "step": 972 }, { - "epoch": 0.09088054380553041, - "grad_norm": 1.4410524177419237, - "learning_rate": 4.910377891601439e-07, - "loss": 1.6977, - "step": 2460 + "epoch": 0.13048725972284309, + "grad_norm": 1.1749450180853513, + "learning_rate": 5.776253696814523e-07, + "loss": 1.7253, + "step": 973 }, { - "epoch": 0.09161941001533147, - "grad_norm": 1.512362377905178, - "learning_rate": 4.908912689936697e-07, - "loss": 1.6716, - "step": 2480 + "epoch": 0.13062136790344211, + "grad_norm": 1.1522644681889058, + "learning_rate": 5.775800026724622e-07, + "loss": 1.7109, + "step": 974 }, { - "epoch": 0.09235827622513254, - "grad_norm": 1.8697344841744916, - "learning_rate": 4.90743585552489e-07, - "loss": 1.6694, - "step": 2500 + "epoch": 0.13075547608404112, + "grad_norm": 1.1287433508002416, + "learning_rate": 5.775345917089561e-07, + "loss": 1.7602, + "step": 975 }, { - "epoch": 0.0930971424349336, - "grad_norm": 1.5406884179833267, - "learning_rate": 4.905947396323561e-07, - "loss": 1.7013, - "step": 2520 + "epoch": 0.13088958426464015, + "grad_norm": 1.1367208391544785, + "learning_rate": 5.77489136798996e-07, + "loss": 1.7096, + "step": 976 }, { - "epoch": 0.09383600864473465, - "grad_norm": 1.883868312926782, - "learning_rate": 4.904447320352891e-07, - "loss": 1.6438, - "step": 2540 + "epoch": 0.13102369244523915, + "grad_norm": 1.093651839491161, + "learning_rate": 5.774436379506516e-07, + "loss": 1.7313, + "step": 977 }, { - "epoch": 0.09457487485453571, - "grad_norm": 1.5063919396389938, - "learning_rate": 4.902935635695655e-07, - "loss": 1.6341, - "step": 2560 + "epoch": 0.13115780062583818, + "grad_norm": 1.1158114646345074, + "learning_rate": 5.773980951720009e-07, + "loss": 1.7152, + "step": 978 }, { - "epoch": 0.09531374106433678, - "grad_norm": 2.5666944465723223, - "learning_rate": 4.901412350497177e-07, - "loss": 1.673, - "step": 2580 + "epoch": 0.13129190880643718, + "grad_norm": 1.1405133501951592, + "learning_rate": 5.773525084711293e-07, + "loss": 1.6721, + "step": 979 }, { - "epoch": 0.09605260727413784, - "grad_norm": 1.6864017944187357, - "learning_rate": 4.899877472965289e-07, - "loss": 1.6532, - "step": 2600 + "epoch": 0.1314260169870362, + "grad_norm": 1.0757304379815442, + "learning_rate": 5.773068778561302e-07, + "loss": 1.64, + "step": 980 }, { - "epoch": 0.09679147348393889, - "grad_norm": 1.489838376943142, - "learning_rate": 4.898331011370282e-07, - "loss": 1.7006, - "step": 2620 + "epoch": 0.13156012516763524, + "grad_norm": 1.0607235063703084, + "learning_rate": 5.772612033351045e-07, + "loss": 1.7254, + "step": 981 }, { - "epoch": 0.09753033969373996, - "grad_norm": 1.516224235445671, - "learning_rate": 4.896772974044871e-07, - "loss": 1.664, - "step": 2640 + "epoch": 0.13169423334823424, + "grad_norm": 1.0583251896426324, + "learning_rate": 5.772154849161613e-07, + "loss": 1.687, + "step": 982 }, { - "epoch": 0.09826920590354102, - "grad_norm": 1.4154240383552321, - "learning_rate": 4.895203369384138e-07, - "loss": 1.6749, - "step": 2660 + "epoch": 0.13182834152883327, + "grad_norm": 1.098628320814992, + "learning_rate": 5.771697226074171e-07, + "loss": 1.635, + "step": 983 }, { - "epoch": 0.09900807211334207, - "grad_norm": 1.8653781111338754, - "learning_rate": 4.893622205845498e-07, - "loss": 1.6255, - "step": 2680 + "epoch": 0.13196244970943227, + "grad_norm": 1.1805474022437217, + "learning_rate": 5.771239164169966e-07, + "loss": 1.6698, + "step": 984 }, { - "epoch": 0.09974693832314313, - "grad_norm": 1.6154619117281779, - "learning_rate": 4.892029491948642e-07, - "loss": 1.7121, - "step": 2700 + "epoch": 0.1320965578900313, + "grad_norm": 1.0875587476789947, + "learning_rate": 5.77078066353032e-07, + "loss": 1.6354, + "step": 985 }, { - "epoch": 0.1004858045329442, - "grad_norm": 1.6240732568528131, - "learning_rate": 4.890425236275502e-07, - "loss": 1.687, - "step": 2720 + "epoch": 0.1322306660706303, + "grad_norm": 1.2112176511625345, + "learning_rate": 5.770321724236633e-07, + "loss": 1.7872, + "step": 986 }, { - "epoch": 0.10122467074274526, - "grad_norm": 1.459326292962488, - "learning_rate": 4.888809447470195e-07, - "loss": 1.5967, - "step": 2740 + "epoch": 0.13236477425122933, + "grad_norm": 1.2350020465740164, + "learning_rate": 5.769862346370384e-07, + "loss": 1.7646, + "step": 987 }, { - "epoch": 0.10196353695254631, - "grad_norm": 1.7582112558661527, - "learning_rate": 4.887182134238989e-07, - "loss": 1.7297, - "step": 2760 + "epoch": 0.13249888243182834, + "grad_norm": 1.1782226253464931, + "learning_rate": 5.769402530013128e-07, + "loss": 1.7215, + "step": 988 }, { - "epoch": 0.10270240316234738, - "grad_norm": 1.7154567295687058, - "learning_rate": 4.885543305350241e-07, - "loss": 1.6881, - "step": 2780 + "epoch": 0.13263299061242736, + "grad_norm": 1.0995226058236465, + "learning_rate": 5.768942275246503e-07, + "loss": 1.6472, + "step": 989 }, { - "epoch": 0.10344126937214844, - "grad_norm": 1.68486225816754, - "learning_rate": 4.88389296963436e-07, - "loss": 1.6351, - "step": 2800 + "epoch": 0.13276709879302637, + "grad_norm": 1.1354276853120844, + "learning_rate": 5.768481582152218e-07, + "loss": 1.7206, + "step": 990 }, { - "epoch": 0.1041801355819495, - "grad_norm": 1.4658940210533413, - "learning_rate": 4.882231135983757e-07, - "loss": 1.6584, - "step": 2820 + "epoch": 0.1329012069736254, + "grad_norm": 1.1299465711204602, + "learning_rate": 5.768020450812064e-07, + "loss": 1.6917, + "step": 991 }, { - "epoch": 0.10491900179175055, - "grad_norm": 1.3967168353938462, - "learning_rate": 4.880557813352796e-07, - "loss": 1.6811, - "step": 2840 + "epoch": 0.1330353151542244, + "grad_norm": 1.0767689418910376, + "learning_rate": 5.767558881307906e-07, + "loss": 1.6643, + "step": 992 }, { - "epoch": 0.10565786800155162, - "grad_norm": 1.6648778148188543, - "learning_rate": 4.878873010757747e-07, - "loss": 1.6447, - "step": 2860 + "epoch": 0.13316942333482343, + "grad_norm": 1.1138902596082148, + "learning_rate": 5.767096873721693e-07, + "loss": 1.7642, + "step": 993 }, { - "epoch": 0.10639673421135268, - "grad_norm": 1.6827360384506134, - "learning_rate": 4.877176737276736e-07, - "loss": 1.6671, - "step": 2880 + "epoch": 0.13330353151542243, + "grad_norm": 1.1056642001660029, + "learning_rate": 5.766634428135447e-07, + "loss": 1.689, + "step": 994 }, { - "epoch": 0.10713560042115373, - "grad_norm": 1.6125148782802161, - "learning_rate": 4.875469002049697e-07, - "loss": 1.6611, - "step": 2900 + "epoch": 0.13343763969602146, + "grad_norm": 1.0482595089911335, + "learning_rate": 5.76617154463127e-07, + "loss": 1.635, + "step": 995 }, { - "epoch": 0.1078744666309548, - "grad_norm": 3.1640996826552925, - "learning_rate": 4.873749814278325e-07, - "loss": 1.6914, - "step": 2920 + "epoch": 0.13357174787662046, + "grad_norm": 1.0936790475077613, + "learning_rate": 5.765708223291338e-07, + "loss": 1.6614, + "step": 996 }, { - "epoch": 0.10861333284075586, - "grad_norm": 1.5756821875718683, - "learning_rate": 4.87201918322602e-07, - "loss": 1.6891, - "step": 2940 + "epoch": 0.1337058560572195, + "grad_norm": 1.1904352264236198, + "learning_rate": 5.765244464197911e-07, + "loss": 1.6631, + "step": 997 }, { - "epoch": 0.10935219905055692, - "grad_norm": 1.508384464413988, - "learning_rate": 4.870277118217844e-07, - "loss": 1.6765, - "step": 2960 + "epoch": 0.13383996423781852, + "grad_norm": 1.1399324270789883, + "learning_rate": 5.76478026743332e-07, + "loss": 1.6956, + "step": 998 }, { - "epoch": 0.11009106526035799, - "grad_norm": 1.8943879400046142, - "learning_rate": 4.868523628640468e-07, - "loss": 1.6718, - "step": 2980 + "epoch": 0.13397407241841752, + "grad_norm": 1.0631541550252919, + "learning_rate": 5.76431563307998e-07, + "loss": 1.6357, + "step": 999 }, { - "epoch": 0.11082993147015904, - "grad_norm": 1.5476264075937183, - "learning_rate": 4.86675872394212e-07, - "loss": 1.6384, - "step": 3000 + "epoch": 0.13410818059901655, + "grad_norm": 2.7939617071812304, + "learning_rate": 5.763850561220378e-07, + "loss": 1.7513, + "step": 1000 }, { - "epoch": 0.1115687976799601, - "grad_norm": 1.7120101891654744, - "learning_rate": 4.864982413632537e-07, - "loss": 1.66, - "step": 3020 + "epoch": 0.13424228877961555, + "grad_norm": 1.1023053650764323, + "learning_rate": 5.763385051937082e-07, + "loss": 1.6986, + "step": 1001 }, { - "epoch": 0.11230766388976116, - "grad_norm": 1.8834789513548644, - "learning_rate": 4.863284363107887e-07, - "loss": 1.6453, - "step": 3040 + "epoch": 0.13437639696021458, + "grad_norm": 1.1134127723095217, + "learning_rate": 5.762919105312739e-07, + "loss": 1.6972, + "step": 1002 }, { - "epoch": 0.11304653009956223, - "grad_norm": 1.6393861847878763, - "learning_rate": 4.861485839441465e-07, - "loss": 1.6914, - "step": 3060 + "epoch": 0.13451050514081359, + "grad_norm": 1.3206325684664686, + "learning_rate": 5.762452721430068e-07, + "loss": 1.6561, + "step": 1003 }, { - "epoch": 0.11378539630936328, - "grad_norm": 1.548505894649462, - "learning_rate": 4.859675938575391e-07, - "loss": 1.6513, - "step": 3080 + "epoch": 0.13464461332141262, + "grad_norm": 1.1017815335316827, + "learning_rate": 5.761985900371871e-07, + "loss": 1.6294, + "step": 1004 }, { - "epoch": 0.11452426251916434, - "grad_norm": 1.7314160899998987, - "learning_rate": 4.857854670261854e-07, - "loss": 1.6652, - "step": 3100 + "epoch": 0.13477872150201162, + "grad_norm": 1.091998126330244, + "learning_rate": 5.761518642221027e-07, + "loss": 1.6645, + "step": 1005 }, { - "epoch": 0.11526312872896541, - "grad_norm": 1.6255645061866926, - "learning_rate": 4.856022044314289e-07, - "loss": 1.6825, - "step": 3120 + "epoch": 0.13491282968261065, + "grad_norm": 1.1390065790034687, + "learning_rate": 5.76105094706049e-07, + "loss": 1.6634, + "step": 1006 }, { - "epoch": 0.11600199493876646, - "grad_norm": 1.7047082936180922, - "learning_rate": 4.854178070607332e-07, - "loss": 1.6571, - "step": 3140 + "epoch": 0.13504693786320965, + "grad_norm": 1.1165938666136697, + "learning_rate": 5.760582814973294e-07, + "loss": 1.6884, + "step": 1007 }, { - "epoch": 0.11674086114856752, - "grad_norm": 1.5937691951508997, - "learning_rate": 4.852322759076762e-07, - "loss": 1.6796, - "step": 3160 + "epoch": 0.13518104604380868, + "grad_norm": 1.1265961333800854, + "learning_rate": 5.760114246042548e-07, + "loss": 1.581, + "step": 1008 }, { - "epoch": 0.11747972735836858, - "grad_norm": 1.5581038553350461, - "learning_rate": 4.850456119719448e-07, - "loss": 1.6237, - "step": 3180 + "epoch": 0.13531515422440768, + "grad_norm": 1.1108402335230954, + "learning_rate": 5.759645240351442e-07, + "loss": 1.6948, + "step": 1009 }, { - "epoch": 0.11821859356816965, - "grad_norm": 1.5319442885899253, - "learning_rate": 4.848578162593298e-07, - "loss": 1.6507, - "step": 3200 + "epoch": 0.1354492624050067, + "grad_norm": 1.1540406201851725, + "learning_rate": 5.75917579798324e-07, + "loss": 1.6816, + "step": 1010 }, { - "epoch": 0.1189574597779707, - "grad_norm": 1.7452294652307094, - "learning_rate": 4.846783629455789e-07, - "loss": 1.6334, - "step": 3220 + "epoch": 0.1355833705856057, + "grad_norm": 1.0776760932575635, + "learning_rate": 5.758705919021285e-07, + "loss": 1.6455, + "step": 1011 }, { - "epoch": 0.11969632598777176, - "grad_norm": 1.6423452527210813, - "learning_rate": 4.844883631840362e-07, - "loss": 1.6591, - "step": 3240 + "epoch": 0.13571747876620474, + "grad_norm": 1.1626622938941558, + "learning_rate": 5.758235603549001e-07, + "loss": 1.7679, + "step": 1012 }, { - "epoch": 0.12043519219757283, - "grad_norm": 2.3138017105742277, - "learning_rate": 4.842972346482019e-07, - "loss": 1.6693, - "step": 3260 + "epoch": 0.13585158694680374, + "grad_norm": 1.187443307470314, + "learning_rate": 5.757764851649882e-07, + "loss": 1.6258, + "step": 1013 }, { - "epoch": 0.12117405840737389, - "grad_norm": 1.5077648756938484, - "learning_rate": 4.841049783679233e-07, - "loss": 1.6486, - "step": 3280 + "epoch": 0.13598569512740277, + "grad_norm": 1.1483737298574974, + "learning_rate": 5.757293663407507e-07, + "loss": 1.7531, + "step": 1014 }, { - "epoch": 0.12191292461717494, - "grad_norm": 1.4711190983794034, - "learning_rate": 4.839115953791238e-07, - "loss": 1.6881, - "step": 3300 + "epoch": 0.13611980330800177, + "grad_norm": 1.108423451892347, + "learning_rate": 5.756822038905527e-07, + "loss": 1.5847, + "step": 1015 }, { - "epoch": 0.12265179082697601, - "grad_norm": 4.058044242916531, - "learning_rate": 4.837170867237982e-07, - "loss": 1.6469, - "step": 3320 + "epoch": 0.1362539114886008, + "grad_norm": 1.056521665647446, + "learning_rate": 5.756349978227674e-07, + "loss": 1.6545, + "step": 1016 }, { - "epoch": 0.12339065703677707, - "grad_norm": 1.8109757223352017, - "learning_rate": 4.835214534500064e-07, - "loss": 1.6912, - "step": 3340 + "epoch": 0.13638801966919983, + "grad_norm": 1.122523040636454, + "learning_rate": 5.755877481457756e-07, + "loss": 1.6762, + "step": 1017 }, { - "epoch": 0.12412952324657812, - "grad_norm": 1.5112894099167034, - "learning_rate": 4.83324696611868e-07, - "loss": 1.6452, - "step": 3360 + "epoch": 0.13652212784979884, + "grad_norm": 1.1104212906292141, + "learning_rate": 5.755404548679657e-07, + "loss": 1.6761, + "step": 1018 }, { - "epoch": 0.12486838945637918, - "grad_norm": 1.7532693818843224, - "learning_rate": 4.83126817269557e-07, - "loss": 1.6158, - "step": 3380 + "epoch": 0.13665623603039787, + "grad_norm": 1.0971062205375117, + "learning_rate": 5.75493117997734e-07, + "loss": 1.6676, + "step": 1019 }, { - "epoch": 0.12560725566618025, - "grad_norm": 1.7433921276878421, - "learning_rate": 4.829278164892951e-07, - "loss": 1.6684, - "step": 3400 + "epoch": 0.13679034421099687, + "grad_norm": 1.1923600261259284, + "learning_rate": 5.754457375434848e-07, + "loss": 1.6966, + "step": 1020 }, { - "epoch": 0.1263461218759813, - "grad_norm": 1.499971805431214, - "learning_rate": 4.827276953433474e-07, - "loss": 1.6596, - "step": 3420 + "epoch": 0.1369244523915959, + "grad_norm": 1.1577052085464195, + "learning_rate": 5.753983135136295e-07, + "loss": 1.7123, + "step": 1021 }, { - "epoch": 0.12708498808578236, - "grad_norm": 1.5392331224579805, - "learning_rate": 4.825264549100149e-07, - "loss": 1.6411, - "step": 3440 + "epoch": 0.1370585605721949, + "grad_norm": 1.1404232349413184, + "learning_rate": 5.753508459165879e-07, + "loss": 1.703, + "step": 1022 }, { - "epoch": 0.12782385429558343, - "grad_norm": 1.5289257318137572, - "learning_rate": 4.823240962736303e-07, - "loss": 1.6759, - "step": 3460 + "epoch": 0.13719266875279393, + "grad_norm": 1.392333260935911, + "learning_rate": 5.75303334760787e-07, + "loss": 1.7096, + "step": 1023 }, { - "epoch": 0.12856272050538448, - "grad_norm": 1.5034439532563377, - "learning_rate": 4.82120620524551e-07, - "loss": 1.6405, - "step": 3480 + "epoch": 0.13732677693339293, + "grad_norm": 1.113423870991827, + "learning_rate": 5.75255780054662e-07, + "loss": 1.7556, + "step": 1024 }, { - "epoch": 0.12930158671518555, - "grad_norm": 1.4978715454221503, - "learning_rate": 4.81916028759154e-07, - "loss": 1.6732, - "step": 3500 + "epoch": 0.13746088511399196, + "grad_norm": 1.0653465618827531, + "learning_rate": 5.752081818066555e-07, + "loss": 1.7324, + "step": 1025 }, { - "epoch": 0.13004045292498662, - "grad_norm": 1.45790640802375, - "learning_rate": 4.817103220798296e-07, - "loss": 1.6649, - "step": 3520 + "epoch": 0.13759499329459096, + "grad_norm": 1.0145309694174296, + "learning_rate": 5.751605400252179e-07, + "loss": 1.684, + "step": 1026 }, { - "epoch": 0.13077931913478766, - "grad_norm": 1.5322708095688835, - "learning_rate": 4.815035015949754e-07, - "loss": 1.6588, - "step": 3540 + "epoch": 0.13772910147519, + "grad_norm": 1.1507242589279925, + "learning_rate": 5.751128547188073e-07, + "loss": 1.7363, + "step": 1027 }, { - "epoch": 0.13151818534458873, - "grad_norm": 1.540513558070265, - "learning_rate": 4.812955684189904e-07, - "loss": 1.6718, - "step": 3560 + "epoch": 0.137863209655789, + "grad_norm": 1.1602441710831857, + "learning_rate": 5.750651258958897e-07, + "loss": 1.6452, + "step": 1028 }, { - "epoch": 0.1322570515543898, - "grad_norm": 1.4880225438470713, - "learning_rate": 4.810865236722692e-07, - "loss": 1.6313, - "step": 3580 + "epoch": 0.13799731783638802, + "grad_norm": 1.0450164574336993, + "learning_rate": 5.750173535649387e-07, + "loss": 1.6581, + "step": 1029 }, { - "epoch": 0.13299591776419084, - "grad_norm": 1.4919528959671158, - "learning_rate": 4.808763684811959e-07, - "loss": 1.62, - "step": 3600 + "epoch": 0.13813142601698702, + "grad_norm": 1.1152601638616617, + "learning_rate": 5.749695377344356e-07, + "loss": 1.7178, + "step": 1030 }, { - "epoch": 0.1337347839739919, - "grad_norm": 1.6101194590431924, - "learning_rate": 4.806651039781377e-07, - "loss": 1.6933, - "step": 3620 + "epoch": 0.13826553419758605, + "grad_norm": 1.1109479531814108, + "learning_rate": 5.749216784128695e-07, + "loss": 1.6318, + "step": 1031 }, { - "epoch": 0.13447365018379298, - "grad_norm": 1.5722737602103793, - "learning_rate": 4.804527313014392e-07, - "loss": 1.6555, - "step": 3640 + "epoch": 0.13839964237818506, + "grad_norm": 1.1171173194344595, + "learning_rate": 5.748737756087372e-07, + "loss": 1.7563, + "step": 1032 }, { - "epoch": 0.13521251639359402, - "grad_norm": 1.647937670204523, - "learning_rate": 4.802392515954161e-07, - "loss": 1.6561, - "step": 3660 + "epoch": 0.13853375055878409, + "grad_norm": 1.1229721774030046, + "learning_rate": 5.74825829330543e-07, + "loss": 1.6557, + "step": 1033 }, { - "epoch": 0.1359513826033951, - "grad_norm": 1.6527027343392149, - "learning_rate": 4.80024666010349e-07, - "loss": 1.6747, - "step": 3680 + "epoch": 0.13866785873938312, + "grad_norm": 1.0610467262170575, + "learning_rate": 5.747778395867995e-07, + "loss": 1.5954, + "step": 1034 }, { - "epoch": 0.13669024881319616, - "grad_norm": 1.596151179002379, - "learning_rate": 4.798089757024773e-07, - "loss": 1.6602, - "step": 3700 + "epoch": 0.13880196691998212, + "grad_norm": 1.057400993985582, + "learning_rate": 5.747298063860264e-07, + "loss": 1.6836, + "step": 1035 }, { - "epoch": 0.1374291150229972, - "grad_norm": 1.6359785367644735, - "learning_rate": 4.795921818339928e-07, - "loss": 1.7041, - "step": 3720 + "epoch": 0.13893607510058115, + "grad_norm": 1.2946727429654457, + "learning_rate": 5.746817297367512e-07, + "loss": 1.7718, + "step": 1036 }, { - "epoch": 0.13816798123279828, - "grad_norm": 1.5303851327334592, - "learning_rate": 4.793742855730337e-07, - "loss": 1.6921, - "step": 3740 + "epoch": 0.13907018328118015, + "grad_norm": 1.0793836410907007, + "learning_rate": 5.746336096475097e-07, + "loss": 1.6192, + "step": 1037 }, { - "epoch": 0.13890684744259932, - "grad_norm": 1.552833624004378, - "learning_rate": 4.79155288093678e-07, - "loss": 1.6646, - "step": 3760 + "epoch": 0.13920429146177918, + "grad_norm": 1.0456487983417475, + "learning_rate": 5.745854461268445e-07, + "loss": 1.6997, + "step": 1038 }, { - "epoch": 0.1396457136524004, - "grad_norm": 1.5328749650552398, - "learning_rate": 4.789351905759377e-07, - "loss": 1.671, - "step": 3780 + "epoch": 0.13933839964237818, + "grad_norm": 1.0783776132275518, + "learning_rate": 5.745372391833066e-07, + "loss": 1.5643, + "step": 1039 }, { - "epoch": 0.14038457986220146, - "grad_norm": 1.4637618775535644, - "learning_rate": 4.787139942057513e-07, - "loss": 1.6826, - "step": 3800 + "epoch": 0.1394725078229772, + "grad_norm": 1.1073544797133057, + "learning_rate": 5.744889888254545e-07, + "loss": 1.7453, + "step": 1040 }, { - "epoch": 0.1411234460720025, - "grad_norm": 1.456698106912096, - "learning_rate": 4.784917001749791e-07, - "loss": 1.7079, - "step": 3820 + "epoch": 0.1396066160035762, + "grad_norm": 1.0897237578625294, + "learning_rate": 5.744406950618546e-07, + "loss": 1.7507, + "step": 1041 }, { - "epoch": 0.14186231228180357, - "grad_norm": 1.4778158837226694, - "learning_rate": 4.782683096813954e-07, - "loss": 1.6673, - "step": 3840 + "epoch": 0.13974072418417524, + "grad_norm": 1.1334242880215313, + "learning_rate": 5.743923579010804e-07, + "loss": 1.5952, + "step": 1042 }, { - "epoch": 0.14260117849160464, - "grad_norm": 1.426517743754919, - "learning_rate": 4.780438239286824e-07, - "loss": 1.6327, - "step": 3860 + "epoch": 0.13987483236477424, + "grad_norm": 1.0794611740077888, + "learning_rate": 5.743439773517138e-07, + "loss": 1.6699, + "step": 1043 }, { - "epoch": 0.14334004470140568, - "grad_norm": 1.7717097070454197, - "learning_rate": 4.77818244126424e-07, - "loss": 1.6577, - "step": 3880 + "epoch": 0.14000894054537327, + "grad_norm": 1.2221425859227393, + "learning_rate": 5.742955534223441e-07, + "loss": 1.6667, + "step": 1044 }, { - "epoch": 0.14407891091120675, - "grad_norm": 1.6008901431845195, - "learning_rate": 4.775915714900992e-07, - "loss": 1.6493, - "step": 3900 + "epoch": 0.14014304872597227, + "grad_norm": 1.0734586645398891, + "learning_rate": 5.742470861215682e-07, + "loss": 1.7595, + "step": 1045 }, { - "epoch": 0.14481777712100782, - "grad_norm": 1.5377457534191892, - "learning_rate": 4.773638072410752e-07, - "loss": 1.6668, - "step": 3920 + "epoch": 0.1402771569065713, + "grad_norm": 1.1044082425274806, + "learning_rate": 5.74198575457991e-07, + "loss": 1.6741, + "step": 1046 }, { - "epoch": 0.14555664333080887, - "grad_norm": 1.9114280227385299, - "learning_rate": 4.771349526066014e-07, - "loss": 1.6925, - "step": 3940 + "epoch": 0.1404112650871703, + "grad_norm": 1.114278005814131, + "learning_rate": 5.741500214402247e-07, + "loss": 1.6869, + "step": 1047 }, { - "epoch": 0.14629550954060994, - "grad_norm": 1.803899924444919, - "learning_rate": 4.769050088198021e-07, - "loss": 1.6775, - "step": 3960 + "epoch": 0.14054537326776934, + "grad_norm": 1.1185672447220645, + "learning_rate": 5.741014240768896e-07, + "loss": 1.7676, + "step": 1048 }, { - "epoch": 0.147034375750411, - "grad_norm": 1.5100721777601815, - "learning_rate": 4.7667397711967037e-07, - "loss": 1.6181, - "step": 3980 + "epoch": 0.14067948144836834, + "grad_norm": 1.1307460519899954, + "learning_rate": 5.740527833766135e-07, + "loss": 1.7232, + "step": 1049 }, { - "epoch": 0.14777324196021205, - "grad_norm": 1.4720945445766893, - "learning_rate": 4.764418587510615e-07, - "loss": 1.6607, - "step": 4000 + "epoch": 0.14081358962896737, + "grad_norm": 1.1013230366573936, + "learning_rate": 5.740040993480318e-07, + "loss": 1.7287, + "step": 1050 }, { - "epoch": 0.14851210817001312, - "grad_norm": 1.569266687535282, - "learning_rate": 4.7620865496468544e-07, - "loss": 1.6829, - "step": 4020 + "epoch": 0.1409476978095664, + "grad_norm": 1.2887563539916567, + "learning_rate": 5.739553719997877e-07, + "loss": 1.6725, + "step": 1051 }, { - "epoch": 0.1492509743798142, - "grad_norm": 1.5799540185979453, - "learning_rate": 4.7597436701710107e-07, - "loss": 1.6483, - "step": 4040 + "epoch": 0.1410818059901654, + "grad_norm": 1.128200473385445, + "learning_rate": 5.739066013405322e-07, + "loss": 1.7193, + "step": 1052 }, { - "epoch": 0.14998984058961523, - "grad_norm": 1.5804308168544465, - "learning_rate": 4.75738996170709e-07, - "loss": 1.6924, - "step": 4060 + "epoch": 0.14121591417076443, + "grad_norm": 1.0948929309224316, + "learning_rate": 5.738577873789237e-07, + "loss": 1.6993, + "step": 1053 }, { - "epoch": 0.1507287067994163, - "grad_norm": 1.523398154876467, - "learning_rate": 4.7550254369374455e-07, - "loss": 1.6519, - "step": 4080 + "epoch": 0.14135002235136343, + "grad_norm": 1.0842896614577642, + "learning_rate": 5.738089301236286e-07, + "loss": 1.7045, + "step": 1054 }, { - "epoch": 0.15146757300921734, - "grad_norm": 1.4233865381689017, - "learning_rate": 4.752650108602712e-07, - "loss": 1.664, - "step": 4100 + "epoch": 0.14148413053196246, + "grad_norm": 1.0699301937780477, + "learning_rate": 5.73760029583321e-07, + "loss": 1.7216, + "step": 1055 }, { - "epoch": 0.1522064392190184, - "grad_norm": 1.512734811893487, - "learning_rate": 4.7502639895017366e-07, - "loss": 1.7103, - "step": 4120 + "epoch": 0.14161823871256146, + "grad_norm": 1.0958889223597748, + "learning_rate": 5.737110857666822e-07, + "loss": 1.6649, + "step": 1056 }, { - "epoch": 0.15294530542881948, - "grad_norm": 1.5630800949377466, - "learning_rate": 4.747867092491511e-07, - "loss": 1.6531, - "step": 4140 + "epoch": 0.1417523468931605, + "grad_norm": 1.0656247406409773, + "learning_rate": 5.736620986824017e-07, + "loss": 1.683, + "step": 1057 }, { - "epoch": 0.15368417163862053, - "grad_norm": 1.470144612554125, - "learning_rate": 4.7454594304870977e-07, - "loss": 1.6725, - "step": 4160 + "epoch": 0.1418864550737595, + "grad_norm": 1.2444649158517036, + "learning_rate": 5.736130683391765e-07, + "loss": 1.6188, + "step": 1058 }, { - "epoch": 0.1544230378484216, - "grad_norm": 1.6569477682445206, - "learning_rate": 4.743041016461567e-07, - "loss": 1.6998, - "step": 4180 + "epoch": 0.14202056325435852, + "grad_norm": 1.0989443966595032, + "learning_rate": 5.735639947457113e-07, + "loss": 1.7038, + "step": 1059 }, { - "epoch": 0.15516190405822267, - "grad_norm": 1.7296103801240361, - "learning_rate": 4.7406118634459223e-07, - "loss": 1.6613, - "step": 4200 + "epoch": 0.14215467143495752, + "grad_norm": 1.142667824771637, + "learning_rate": 5.735148779107184e-07, + "loss": 1.6156, + "step": 1060 }, { - "epoch": 0.1559007702680237, - "grad_norm": 1.6463696442561442, - "learning_rate": 4.738171984529031e-07, - "loss": 1.6575, - "step": 4220 + "epoch": 0.14228877961555655, + "grad_norm": 1.1299828935757683, + "learning_rate": 5.734657178429179e-07, + "loss": 1.6754, + "step": 1061 }, { - "epoch": 0.15663963647782478, - "grad_norm": 1.545869558479261, - "learning_rate": 4.7357213928575546e-07, - "loss": 1.6741, - "step": 4240 + "epoch": 0.14242288779615556, + "grad_norm": 1.0986771884553144, + "learning_rate": 5.734165145510375e-07, + "loss": 1.6201, + "step": 1062 }, { - "epoch": 0.15737850268762585, - "grad_norm": 1.7796493147352748, - "learning_rate": 4.7332601016358773e-07, - "loss": 1.7046, - "step": 4260 + "epoch": 0.14255699597675459, + "grad_norm": 1.0853274840023213, + "learning_rate": 5.733672680438124e-07, + "loss": 1.6885, + "step": 1063 }, { - "epoch": 0.1581173688974269, - "grad_norm": 1.5172414763731175, - "learning_rate": 4.7307881241260365e-07, - "loss": 1.6365, - "step": 4280 + "epoch": 0.1426911041573536, + "grad_norm": 1.0820811488797877, + "learning_rate": 5.73317978329986e-07, + "loss": 1.7995, + "step": 1064 }, { - "epoch": 0.15885623510722796, - "grad_norm": 1.5470321540163943, - "learning_rate": 4.7283054736476474e-07, - "loss": 1.6844, - "step": 4300 + "epoch": 0.14282521233795262, + "grad_norm": 1.1295149364952306, + "learning_rate": 5.732686454183087e-07, + "loss": 1.6925, + "step": 1065 }, { - "epoch": 0.15959510131702903, - "grad_norm": 1.5074962263335083, - "learning_rate": 4.725812163577835e-07, - "loss": 1.6683, - "step": 4320 + "epoch": 0.14295932051855162, + "grad_norm": 1.057888764325057, + "learning_rate": 5.732192693175391e-07, + "loss": 1.6412, + "step": 1066 }, { - "epoch": 0.16033396752683007, - "grad_norm": 1.5931587963454854, - "learning_rate": 4.723308207351162e-07, - "loss": 1.6972, - "step": 4340 + "epoch": 0.14309342869915065, + "grad_norm": 1.098616962497695, + "learning_rate": 5.731698500364434e-07, + "loss": 1.6271, + "step": 1067 }, { - "epoch": 0.16107283373663114, - "grad_norm": 1.4335946997211053, - "learning_rate": 4.720793618459553e-07, - "loss": 1.6182, - "step": 4360 + "epoch": 0.14322753687974968, + "grad_norm": 1.2745609637830848, + "learning_rate": 5.731203875837949e-07, + "loss": 1.671, + "step": 1068 }, { - "epoch": 0.1618116999464322, - "grad_norm": 1.9207877719443267, - "learning_rate": 4.718268410452226e-07, - "loss": 1.6777, - "step": 4380 + "epoch": 0.14336164506034868, + "grad_norm": 1.120730846705753, + "learning_rate": 5.730708819683753e-07, + "loss": 1.7433, + "step": 1069 }, { - "epoch": 0.16255056615623326, - "grad_norm": 1.4490578223410473, - "learning_rate": 4.7157325969356143e-07, - "loss": 1.6911, - "step": 4400 + "epoch": 0.1434957532409477, + "grad_norm": 1.1177693123454027, + "learning_rate": 5.730213331989736e-07, + "loss": 1.7291, + "step": 1070 }, { - "epoch": 0.16328943236603433, - "grad_norm": 1.593847776562296, - "learning_rate": 4.713186191573301e-07, - "loss": 1.6927, - "step": 4420 + "epoch": 0.1436298614215467, + "grad_norm": 1.0910765331643333, + "learning_rate": 5.729717412843866e-07, + "loss": 1.6739, + "step": 1071 }, { - "epoch": 0.16402829857583537, - "grad_norm": 1.4739123126868083, - "learning_rate": 4.7106292080859363e-07, - "loss": 1.6492, - "step": 4440 + "epoch": 0.14376396960214574, + "grad_norm": 1.1741168573690484, + "learning_rate": 5.729221062334186e-07, + "loss": 1.7401, + "step": 1072 }, { - "epoch": 0.16476716478563644, - "grad_norm": 1.424511297941709, - "learning_rate": 4.7080616602511705e-07, - "loss": 1.6847, - "step": 4460 + "epoch": 0.14389807778274474, + "grad_norm": 1.2230565196681809, + "learning_rate": 5.728724280548815e-07, + "loss": 1.6466, + "step": 1073 }, { - "epoch": 0.1655060309954375, - "grad_norm": 1.6007681786366288, - "learning_rate": 4.705483561903576e-07, - "loss": 1.662, - "step": 4480 + "epoch": 0.14403218596334377, + "grad_norm": 1.075125807457348, + "learning_rate": 5.728227067575953e-07, + "loss": 1.6632, + "step": 1074 }, { - "epoch": 0.16624489720523855, - "grad_norm": 1.55690540989863, - "learning_rate": 4.702894926934573e-07, - "loss": 1.6851, - "step": 4500 + "epoch": 0.14416629414394277, + "grad_norm": 1.0629310683077087, + "learning_rate": 5.727729423503871e-07, + "loss": 1.6456, + "step": 1075 }, { - "epoch": 0.16698376341503962, - "grad_norm": 2.0423474735881926, - "learning_rate": 4.700295769292359e-07, - "loss": 1.6604, - "step": 4520 + "epoch": 0.1443004023245418, + "grad_norm": 1.131277162697691, + "learning_rate": 5.72723134842092e-07, + "loss": 1.7069, + "step": 1076 }, { - "epoch": 0.1677226296248407, - "grad_norm": 1.453355289637868, - "learning_rate": 4.6976861029818264e-07, - "loss": 1.6842, - "step": 4540 + "epoch": 0.1444345105051408, + "grad_norm": 1.4319225703993534, + "learning_rate": 5.726732842415527e-07, + "loss": 1.7104, + "step": 1077 }, { - "epoch": 0.16846149583464173, - "grad_norm": 1.5505160972568328, - "learning_rate": 4.695065942064494e-07, - "loss": 1.6804, - "step": 4560 + "epoch": 0.14456861868573984, + "grad_norm": 1.1218543441609072, + "learning_rate": 5.726233905576194e-07, + "loss": 1.8235, + "step": 1078 }, { - "epoch": 0.1692003620444428, - "grad_norm": 1.7608287873846744, - "learning_rate": 4.6924353006584244e-07, - "loss": 1.6595, - "step": 4580 + "epoch": 0.14470272686633884, + "grad_norm": 1.0682688173779038, + "learning_rate": 5.725734537991502e-07, + "loss": 1.7334, + "step": 1079 }, { - "epoch": 0.16993922825424387, - "grad_norm": 1.4685283699391545, - "learning_rate": 4.689794192938156e-07, - "loss": 1.6264, - "step": 4600 + "epoch": 0.14483683504693787, + "grad_norm": 1.0513899411618064, + "learning_rate": 5.725234739750106e-07, + "loss": 1.564, + "step": 1080 }, { - "epoch": 0.17067809446404492, - "grad_norm": 1.7781661683868824, - "learning_rate": 4.687142633134619e-07, - "loss": 1.6875, - "step": 4620 + "epoch": 0.14497094322753687, + "grad_norm": 1.073556864405118, + "learning_rate": 5.724734510940738e-07, + "loss": 1.6191, + "step": 1081 }, { - "epoch": 0.17141696067384599, - "grad_norm": 1.6196809334292608, - "learning_rate": 4.6844806355350623e-07, - "loss": 1.6753, - "step": 4640 + "epoch": 0.1451050514081359, + "grad_norm": 1.1272658425201874, + "learning_rate": 5.724233851652208e-07, + "loss": 1.5812, + "step": 1082 }, { - "epoch": 0.17215582688364706, - "grad_norm": 1.6293152376567321, - "learning_rate": 4.6818082144829787e-07, - "loss": 1.6665, - "step": 4660 + "epoch": 0.1452391595887349, + "grad_norm": 1.1649864304286308, + "learning_rate": 5.723732761973399e-07, + "loss": 1.7974, + "step": 1083 }, { - "epoch": 0.1728946930934481, - "grad_norm": 1.510069163173277, - "learning_rate": 4.6791253843780217e-07, - "loss": 1.6697, - "step": 4680 + "epoch": 0.14537326776933393, + "grad_norm": 1.1842565824330795, + "learning_rate": 5.723231241993277e-07, + "loss": 1.642, + "step": 1084 }, { - "epoch": 0.17363355930324917, - "grad_norm": 1.4471973015401869, - "learning_rate": 4.676432159675933e-07, - "loss": 1.6806, - "step": 4700 + "epoch": 0.14550737594993293, + "grad_norm": 1.1226873500626315, + "learning_rate": 5.722729291800877e-07, + "loss": 1.648, + "step": 1085 }, { - "epoch": 0.1743724255130502, - "grad_norm": 1.7753201121195747, - "learning_rate": 4.6737285548884655e-07, - "loss": 1.6935, - "step": 4720 + "epoch": 0.14564148413053196, + "grad_norm": 1.074175742058312, + "learning_rate": 5.722226911485315e-07, + "loss": 1.6477, + "step": 1086 }, { - "epoch": 0.17511129172285128, - "grad_norm": 1.5222859899502188, - "learning_rate": 4.671014584583296e-07, - "loss": 1.6664, - "step": 4740 + "epoch": 0.145775592311131, + "grad_norm": 1.6414796585857712, + "learning_rate": 5.721724101135781e-07, + "loss": 1.6099, + "step": 1087 }, { - "epoch": 0.17585015793265235, - "grad_norm": 1.4892529478692567, - "learning_rate": 4.668290263383959e-07, - "loss": 1.6669, - "step": 4760 + "epoch": 0.14590970049173, + "grad_norm": 1.1490676419596029, + "learning_rate": 5.721220860841543e-07, + "loss": 1.5671, + "step": 1088 }, { - "epoch": 0.1765890241424534, - "grad_norm": 1.5841443455470228, - "learning_rate": 4.66555560596976e-07, - "loss": 1.6419, - "step": 4780 + "epoch": 0.14604380867232902, + "grad_norm": 1.0434774110585503, + "learning_rate": 5.720717190691943e-07, + "loss": 1.6001, + "step": 1089 }, { - "epoch": 0.17732789035225446, - "grad_norm": 1.5264328160932443, - "learning_rate": 4.6628106270757e-07, - "loss": 1.6642, - "step": 4800 + "epoch": 0.14617791685292802, + "grad_norm": 1.0806260779363936, + "learning_rate": 5.720213090776403e-07, + "loss": 1.7541, + "step": 1090 }, { - "epoch": 0.17806675656205553, - "grad_norm": 1.6887371299004348, - "learning_rate": 4.6600553414923913e-07, - "loss": 1.6387, - "step": 4820 + "epoch": 0.14631202503352705, + "grad_norm": 1.1814630509058974, + "learning_rate": 5.719708561184417e-07, + "loss": 1.6864, + "step": 1091 }, { - "epoch": 0.17880562277185658, - "grad_norm": 1.4594422560615166, - "learning_rate": 4.657289764065985e-07, - "loss": 1.6493, - "step": 4840 + "epoch": 0.14644613321412606, + "grad_norm": 1.0965207690798646, + "learning_rate": 5.719203602005559e-07, + "loss": 1.7179, + "step": 1092 }, { - "epoch": 0.17954448898165765, - "grad_norm": 1.6615232385858325, - "learning_rate": 4.6545139096980846e-07, - "loss": 1.6312, - "step": 4860 + "epoch": 0.14658024139472509, + "grad_norm": 1.187634257937833, + "learning_rate": 5.718698213329479e-07, + "loss": 1.5889, + "step": 1093 }, { - "epoch": 0.18028335519145872, - "grad_norm": 1.4161658999634517, - "learning_rate": 4.651727793345669e-07, - "loss": 1.687, - "step": 4880 + "epoch": 0.1467143495753241, + "grad_norm": 1.151719981823989, + "learning_rate": 5.718192395245899e-07, + "loss": 1.6503, + "step": 1094 }, { - "epoch": 0.18102222140125976, - "grad_norm": 1.4750799503852594, - "learning_rate": 4.6489314300210117e-07, - "loss": 1.6579, - "step": 4900 + "epoch": 0.14684845775592312, + "grad_norm": 1.0407283688373252, + "learning_rate": 5.717686147844622e-07, + "loss": 1.5976, + "step": 1095 }, { - "epoch": 0.18176108761106083, - "grad_norm": 1.5823630581751142, - "learning_rate": 4.646124834791598e-07, - "loss": 1.6974, - "step": 4920 + "epoch": 0.14698256593652212, + "grad_norm": 1.0743575974553181, + "learning_rate": 5.717179471215527e-07, + "loss": 1.7028, + "step": 1096 }, { - "epoch": 0.1824999538208619, - "grad_norm": 1.5953496527857909, - "learning_rate": 4.6433080227800476e-07, - "loss": 1.6349, - "step": 4940 + "epoch": 0.14711667411712115, + "grad_norm": 1.080606301144591, + "learning_rate": 5.716672365448564e-07, + "loss": 1.6827, + "step": 1097 }, { - "epoch": 0.18323882003066294, - "grad_norm": 1.8088958779925088, - "learning_rate": 4.640481009164028e-07, - "loss": 1.7021, - "step": 4960 + "epoch": 0.14725078229772015, + "grad_norm": 1.0807596555370267, + "learning_rate": 5.716164830633764e-07, + "loss": 1.6778, + "step": 1098 }, { - "epoch": 0.183977686240464, - "grad_norm": 1.6985722383661672, - "learning_rate": 4.6376438091761776e-07, - "loss": 1.6835, - "step": 4980 + "epoch": 0.14738489047831918, + "grad_norm": 1.1284745845133346, + "learning_rate": 5.715656866861234e-07, + "loss": 1.6209, + "step": 1099 }, { - "epoch": 0.18471655245026508, - "grad_norm": 1.5740586459999972, - "learning_rate": 4.63479643810402e-07, - "loss": 1.6778, - "step": 5000 + "epoch": 0.14751899865891818, + "grad_norm": 0.989581549531516, + "learning_rate": 5.715148474221156e-07, + "loss": 1.5879, + "step": 1100 }, { - "epoch": 0.18545541866006612, - "grad_norm": 1.5576615822168314, - "learning_rate": 4.631938911289884e-07, - "loss": 1.6432, - "step": 5020 + "epoch": 0.1476531068395172, + "grad_norm": 1.1254043833078187, + "learning_rate": 5.714639652803788e-07, + "loss": 1.6834, + "step": 1101 }, { - "epoch": 0.1861942848698672, - "grad_norm": 1.4882435243374539, - "learning_rate": 4.629071244130818e-07, - "loss": 1.697, - "step": 5040 + "epoch": 0.1477872150201162, + "grad_norm": 1.0789006249002853, + "learning_rate": 5.714130402699465e-07, + "loss": 1.6314, + "step": 1102 }, { - "epoch": 0.18693315107966824, - "grad_norm": 1.7414218611909407, - "learning_rate": 4.6261934520785135e-07, - "loss": 1.6472, - "step": 5060 + "epoch": 0.14792132320071524, + "grad_norm": 1.0792687942782158, + "learning_rate": 5.713620723998597e-07, + "loss": 1.7229, + "step": 1103 }, { - "epoch": 0.1876720172894693, - "grad_norm": 1.5111215790202166, - "learning_rate": 4.623305550639212e-07, - "loss": 1.6814, - "step": 5080 + "epoch": 0.14805543138131427, + "grad_norm": 1.1190452519207015, + "learning_rate": 5.71311061679167e-07, + "loss": 1.6851, + "step": 1104 }, { - "epoch": 0.18841088349927038, - "grad_norm": 1.4998930010938694, - "learning_rate": 4.6204075553736317e-07, - "loss": 1.6965, - "step": 5100 + "epoch": 0.14818953956191327, + "grad_norm": 1.1240598043365235, + "learning_rate": 5.712600081169248e-07, + "loss": 1.6486, + "step": 1105 }, { - "epoch": 0.18914974970907142, - "grad_norm": 1.6166379161449234, - "learning_rate": 4.617499481896874e-07, - "loss": 1.6367, - "step": 5120 + "epoch": 0.1483236477425123, + "grad_norm": 1.110168533453958, + "learning_rate": 5.71208911722197e-07, + "loss": 1.651, + "step": 1106 }, { - "epoch": 0.1898886159188725, - "grad_norm": 1.564060473042759, - "learning_rate": 4.6145813458783484e-07, - "loss": 1.6404, - "step": 5140 + "epoch": 0.1484577559231113, + "grad_norm": 1.0688369448448625, + "learning_rate": 5.71157772504055e-07, + "loss": 1.709, + "step": 1107 }, { - "epoch": 0.19062748212867356, - "grad_norm": 1.5498475055243737, - "learning_rate": 4.611653163041681e-07, - "loss": 1.64, - "step": 5160 + "epoch": 0.14859186410371034, + "grad_norm": 1.1187107525701387, + "learning_rate": 5.711065904715777e-07, + "loss": 1.7167, + "step": 1108 }, { - "epoch": 0.1913663483384746, - "grad_norm": 1.6108037998097682, - "learning_rate": 4.6087149491646343e-07, - "loss": 1.699, - "step": 5180 + "epoch": 0.14872597228430934, + "grad_norm": 1.1397259364080825, + "learning_rate": 5.710553656338521e-07, + "loss": 1.6975, + "step": 1109 }, { - "epoch": 0.19210521454827567, - "grad_norm": 1.6995541712978521, - "learning_rate": 4.6057667200790203e-07, - "loss": 1.6546, - "step": 5200 + "epoch": 0.14886008046490837, + "grad_norm": 1.1590128512082682, + "learning_rate": 5.710040979999723e-07, + "loss": 1.7414, + "step": 1110 }, { - "epoch": 0.19284408075807674, - "grad_norm": 1.499037507366822, - "learning_rate": 4.6028084916706147e-07, - "loss": 1.6083, - "step": 5220 + "epoch": 0.14899418864550737, + "grad_norm": 1.167811852838392, + "learning_rate": 5.709527875790403e-07, + "loss": 1.6626, + "step": 1111 }, { - "epoch": 0.19358294696787778, - "grad_norm": 1.5172594570626625, - "learning_rate": 4.5998402798790704e-07, - "loss": 1.6699, - "step": 5240 + "epoch": 0.1491282968261064, + "grad_norm": 1.0973271552840278, + "learning_rate": 5.709014343801655e-07, + "loss": 1.6324, + "step": 1112 }, { - "epoch": 0.19432181317767885, - "grad_norm": 1.4963740648019974, - "learning_rate": 4.5968621006978373e-07, - "loss": 1.6898, - "step": 5260 + "epoch": 0.1492624050067054, + "grad_norm": 1.3487898998822019, + "learning_rate": 5.708500384124648e-07, + "loss": 1.6641, + "step": 1113 }, { - "epoch": 0.19506067938747992, - "grad_norm": 2.566805183937073, - "learning_rate": 4.5938739701740686e-07, - "loss": 1.6694, - "step": 5280 + "epoch": 0.14939651318730443, + "grad_norm": 1.173261054584497, + "learning_rate": 5.707985996850633e-07, + "loss": 1.6297, + "step": 1114 }, { - "epoch": 0.19579954559728097, - "grad_norm": 1.4540566793967926, - "learning_rate": 4.590875904408539e-07, - "loss": 1.6692, - "step": 5300 + "epoch": 0.14953062136790343, + "grad_norm": 1.056190301936881, + "learning_rate": 5.707471182070929e-07, + "loss": 1.7222, + "step": 1115 }, { - "epoch": 0.19653841180708204, - "grad_norm": 3.9730656922103447, - "learning_rate": 4.587867919555557e-07, - "loss": 1.6625, - "step": 5320 + "epoch": 0.14966472954850246, + "grad_norm": 1.0543304581404804, + "learning_rate": 5.706955939876936e-07, + "loss": 1.6486, + "step": 1116 }, { - "epoch": 0.1972772780168831, - "grad_norm": 1.5142078546698041, - "learning_rate": 4.5848500318228774e-07, - "loss": 1.6654, - "step": 5340 + "epoch": 0.14979883772910146, + "grad_norm": 1.0951287089797115, + "learning_rate": 5.706440270360128e-07, + "loss": 1.6158, + "step": 1117 }, { - "epoch": 0.19801614422668415, - "grad_norm": 1.7032492720795371, - "learning_rate": 4.5818222574716127e-07, - "loss": 1.7022, - "step": 5360 + "epoch": 0.1499329459097005, + "grad_norm": 1.1191851976325244, + "learning_rate": 5.705924173612055e-07, + "loss": 1.7315, + "step": 1118 }, { - "epoch": 0.19875501043648522, - "grad_norm": 1.554191757548726, - "learning_rate": 4.578784612816149e-07, - "loss": 1.6811, - "step": 5380 + "epoch": 0.1500670540902995, + "grad_norm": 1.0577825904689977, + "learning_rate": 5.705407649724343e-07, + "loss": 1.6935, + "step": 1119 }, { - "epoch": 0.19949387664628626, - "grad_norm": 1.4929225978552914, - "learning_rate": 4.5758897229313755e-07, - "loss": 1.6509, - "step": 5400 + "epoch": 0.15020116227089853, + "grad_norm": 1.056299942663864, + "learning_rate": 5.704890698788693e-07, + "loss": 1.628, + "step": 1120 }, { - "epoch": 0.20023274285608733, - "grad_norm": 1.4628893559215694, - "learning_rate": 4.5728328783083036e-07, - "loss": 1.7302, - "step": 5420 + "epoch": 0.15033527045149755, + "grad_norm": 1.1590721147664085, + "learning_rate": 5.704373320896886e-07, + "loss": 1.6249, + "step": 1121 }, { - "epoch": 0.2009716090658884, - "grad_norm": 1.493249123165425, - "learning_rate": 4.5699197781569844e-07, - "loss": 1.6383, - "step": 5440 + "epoch": 0.15046937863209656, + "grad_norm": 1.1117527447235374, + "learning_rate": 5.703855516140773e-07, + "loss": 1.7004, + "step": 1122 }, { - "epoch": 0.20171047527568944, - "grad_norm": 1.600690331893774, - "learning_rate": 4.5668437961972905e-07, - "loss": 1.6189, - "step": 5460 + "epoch": 0.1506034868126956, + "grad_norm": 1.1049104937281078, + "learning_rate": 5.703337284612283e-07, + "loss": 1.6377, + "step": 1123 }, { - "epoch": 0.2024493414854905, - "grad_norm": 1.464802503893095, - "learning_rate": 4.5637580246409934e-07, - "loss": 1.65, - "step": 5480 + "epoch": 0.1507375949932946, + "grad_norm": 1.59710670500923, + "learning_rate": 5.702818626403422e-07, + "loss": 1.6834, + "step": 1124 }, { - "epoch": 0.20318820769529158, - "grad_norm": 1.5375722464094912, - "learning_rate": 4.5606624801149797e-07, - "loss": 1.6546, - "step": 5500 + "epoch": 0.15087170317389362, + "grad_norm": 1.0967048039417424, + "learning_rate": 5.702299541606271e-07, + "loss": 1.7351, + "step": 1125 }, { - "epoch": 0.20392707390509263, - "grad_norm": 1.5967568446324583, - "learning_rate": 4.5575571792987984e-07, - "loss": 1.6286, - "step": 5520 + "epoch": 0.15100581135449262, + "grad_norm": 1.0979605765370022, + "learning_rate": 5.701780030312985e-07, + "loss": 1.6961, + "step": 1126 }, { - "epoch": 0.2046659401148937, - "grad_norm": 1.5568969231756908, - "learning_rate": 4.5544421389245646e-07, - "loss": 1.6278, - "step": 5540 + "epoch": 0.15113991953509165, + "grad_norm": 1.0799636277645253, + "learning_rate": 5.701260092615798e-07, + "loss": 1.6698, + "step": 1127 }, { - "epoch": 0.20540480632469477, - "grad_norm": 1.5499607650206735, - "learning_rate": 4.5513173757768746e-07, - "loss": 1.6755, - "step": 5560 + "epoch": 0.15127402771569065, + "grad_norm": 1.0680391383117414, + "learning_rate": 5.700739728607018e-07, + "loss": 1.6337, + "step": 1128 }, { - "epoch": 0.2061436725344958, - "grad_norm": 1.4823222337131237, - "learning_rate": 4.548182906692714e-07, - "loss": 1.6661, - "step": 5580 + "epoch": 0.15140813589628968, + "grad_norm": 1.1265492196116744, + "learning_rate": 5.700218938379027e-07, + "loss": 1.758, + "step": 1129 }, { - "epoch": 0.20688253874429688, - "grad_norm": 1.507552555113675, - "learning_rate": 4.5450387485613635e-07, - "loss": 1.6659, - "step": 5600 + "epoch": 0.15154224407688868, + "grad_norm": 1.1871181924509882, + "learning_rate": 5.699697722024286e-07, + "loss": 1.7564, + "step": 1130 }, { - "epoch": 0.20762140495409795, - "grad_norm": 1.4811185047336115, - "learning_rate": 4.541884918324313e-07, - "loss": 1.656, - "step": 5620 + "epoch": 0.1516763522574877, + "grad_norm": 1.0181987331367963, + "learning_rate": 5.69917607963533e-07, + "loss": 1.5776, + "step": 1131 }, { - "epoch": 0.208360271163899, - "grad_norm": 1.576191450168426, - "learning_rate": 4.538721432975168e-07, - "loss": 1.6875, - "step": 5640 + "epoch": 0.15181046043808671, + "grad_norm": 1.1284590442586029, + "learning_rate": 5.698654011304768e-07, + "loss": 1.6984, + "step": 1132 }, { - "epoch": 0.20909913737370006, - "grad_norm": 1.7938635395127402, - "learning_rate": 4.535707194370682e-07, - "loss": 1.6646, - "step": 5660 + "epoch": 0.15194456861868574, + "grad_norm": 1.2930521652564555, + "learning_rate": 5.698131517125288e-07, + "loss": 1.6334, + "step": 1133 }, { - "epoch": 0.2098380035835011, - "grad_norm": 1.6552255449585238, - "learning_rate": 4.532524930627744e-07, - "loss": 1.6524, - "step": 5680 + "epoch": 0.15207867679928475, + "grad_norm": 1.117570312123897, + "learning_rate": 5.697608597189651e-07, + "loss": 1.6531, + "step": 1134 }, { - "epoch": 0.21057686979330217, - "grad_norm": 1.7516118506092397, - "learning_rate": 4.5293330622066034e-07, - "loss": 1.6157, - "step": 5700 + "epoch": 0.15221278497988378, + "grad_norm": 1.4856967946676458, + "learning_rate": 5.697085251590694e-07, + "loss": 1.6406, + "step": 1135 }, { - "epoch": 0.21131573600310324, - "grad_norm": 1.4545866638005132, - "learning_rate": 4.526131606305823e-07, - "loss": 1.6476, - "step": 5720 + "epoch": 0.15234689316048278, + "grad_norm": 1.1601905755705224, + "learning_rate": 5.696561480421331e-07, + "loss": 1.6839, + "step": 1136 }, { - "epoch": 0.2120546022129043, - "grad_norm": 1.6248585310317667, - "learning_rate": 4.5229205801756273e-07, - "loss": 1.6573, - "step": 5740 + "epoch": 0.1524810013410818, + "grad_norm": 1.1233822318963709, + "learning_rate": 5.696037283774549e-07, + "loss": 1.6607, + "step": 1137 }, { - "epoch": 0.21279346842270536, - "grad_norm": 1.41925791489552, - "learning_rate": 4.519700001117807e-07, - "loss": 1.6685, - "step": 5760 + "epoch": 0.15261510952168084, + "grad_norm": 1.1742187355064484, + "learning_rate": 5.695512661743415e-07, + "loss": 1.6646, + "step": 1138 }, { - "epoch": 0.21353233463250643, - "grad_norm": 1.7509635950883726, - "learning_rate": 4.5164698864856257e-07, - "loss": 1.6812, - "step": 5780 + "epoch": 0.15274921770227984, + "grad_norm": 1.086363990541314, + "learning_rate": 5.694987614421066e-07, + "loss": 1.6739, + "step": 1139 }, { - "epoch": 0.21427120084230747, - "grad_norm": 1.4694228842841779, - "learning_rate": 4.5132302536837273e-07, - "loss": 1.6556, - "step": 5800 + "epoch": 0.15288332588287887, + "grad_norm": 1.194737878034564, + "learning_rate": 5.694462141900719e-07, + "loss": 1.6835, + "step": 1140 }, { - "epoch": 0.21501006705210854, - "grad_norm": 1.553864895417105, - "learning_rate": 4.5099811201680416e-07, - "loss": 1.6883, - "step": 5820 + "epoch": 0.15301743406347787, + "grad_norm": 1.1598758612040898, + "learning_rate": 5.693936244275662e-07, + "loss": 1.6587, + "step": 1141 }, { - "epoch": 0.2157489332619096, - "grad_norm": 1.491366651426128, - "learning_rate": 4.506722503445691e-07, - "loss": 1.6613, - "step": 5840 + "epoch": 0.1531515422440769, + "grad_norm": 1.1381348609460207, + "learning_rate": 5.693409921639263e-07, + "loss": 1.7111, + "step": 1142 }, { - "epoch": 0.21648779947171065, - "grad_norm": 1.6466798284982602, - "learning_rate": 4.5034544210748953e-07, - "loss": 1.6497, - "step": 5860 + "epoch": 0.1532856504246759, + "grad_norm": 1.0954642701505761, + "learning_rate": 5.692883174084963e-07, + "loss": 1.6453, + "step": 1143 }, { - "epoch": 0.21722666568151172, - "grad_norm": 1.4331846976152014, - "learning_rate": 4.5001768906648783e-07, - "loss": 1.6583, - "step": 5880 + "epoch": 0.15341975860527493, + "grad_norm": 1.181240368838665, + "learning_rate": 5.69235600170628e-07, + "loss": 1.7074, + "step": 1144 }, { - "epoch": 0.2179655318913128, - "grad_norm": 2.4779046528418793, - "learning_rate": 4.496889929875771e-07, - "loss": 1.6456, - "step": 5900 + "epoch": 0.15355386678587393, + "grad_norm": 1.0848362523541808, + "learning_rate": 5.691828404596804e-07, + "loss": 1.7188, + "step": 1145 }, { - "epoch": 0.21870439810111383, - "grad_norm": 1.6613792185698004, - "learning_rate": 4.493593556418519e-07, - "loss": 1.6876, - "step": 5920 + "epoch": 0.15368797496647296, + "grad_norm": 1.0976088776241693, + "learning_rate": 5.691300382850205e-07, + "loss": 1.6133, + "step": 1146 }, { - "epoch": 0.2194432643109149, - "grad_norm": 1.5936970250540041, - "learning_rate": 4.490287788054785e-07, - "loss": 1.6856, - "step": 5940 + "epoch": 0.15382208314707196, + "grad_norm": 1.1535833554516768, + "learning_rate": 5.690771936560228e-07, + "loss": 1.6823, + "step": 1147 }, { - "epoch": 0.22018213052071597, - "grad_norm": 1.7774522510719284, - "learning_rate": 4.486972642596852e-07, - "loss": 1.6574, - "step": 5960 + "epoch": 0.153956191327671, + "grad_norm": 1.1763699702630221, + "learning_rate": 5.690243065820687e-07, + "loss": 1.692, + "step": 1148 }, { - "epoch": 0.22092099673051702, - "grad_norm": 1.5404871158832736, - "learning_rate": 4.483648137907532e-07, - "loss": 1.6637, - "step": 5980 + "epoch": 0.15409029950827, + "grad_norm": 1.0627345607622845, + "learning_rate": 5.689713770725477e-07, + "loss": 1.5961, + "step": 1149 }, { - "epoch": 0.2216598629403181, - "grad_norm": 1.5238762502370415, - "learning_rate": 4.4803142919000645e-07, - "loss": 1.6526, - "step": 6000 + "epoch": 0.15422440768886903, + "grad_norm": 1.0792270716448427, + "learning_rate": 5.689184051368572e-07, + "loss": 1.64, + "step": 1150 }, { - "epoch": 0.22239872915011913, - "grad_norm": 1.4681103098352588, - "learning_rate": 4.4769711225380254e-07, - "loss": 1.6538, - "step": 6020 + "epoch": 0.15435851586946803, + "grad_norm": 1.0247043986886288, + "learning_rate": 5.688653907844009e-07, + "loss": 1.5285, + "step": 1151 }, { - "epoch": 0.2231375953599202, - "grad_norm": 1.406496721553823, - "learning_rate": 4.4736186478352225e-07, - "loss": 1.6593, - "step": 6040 + "epoch": 0.15449262405006706, + "grad_norm": 1.07857428312717, + "learning_rate": 5.688123340245914e-07, + "loss": 1.6444, + "step": 1152 }, { - "epoch": 0.22387646156972127, - "grad_norm": 1.6502790317877305, - "learning_rate": 4.4702568858556063e-07, - "loss": 1.6946, - "step": 6060 + "epoch": 0.15462673223066606, + "grad_norm": 1.0930284133542458, + "learning_rate": 5.687592348668479e-07, + "loss": 1.6882, + "step": 1153 }, { - "epoch": 0.2246153277795223, - "grad_norm": 1.5544958034860874, - "learning_rate": 4.466885854713169e-07, - "loss": 1.6922, - "step": 6080 + "epoch": 0.1547608404112651, + "grad_norm": 1.0484076712069612, + "learning_rate": 5.687060933205976e-07, + "loss": 1.5796, + "step": 1154 }, { - "epoch": 0.22535419398932338, - "grad_norm": 1.35257283259656, - "learning_rate": 4.463505572571847e-07, - "loss": 1.6646, - "step": 6100 + "epoch": 0.1548949485918641, + "grad_norm": 1.1209018475352952, + "learning_rate": 5.686529093952749e-07, + "loss": 1.702, + "step": 1155 }, { - "epoch": 0.22609306019912445, - "grad_norm": 1.624788597950665, - "learning_rate": 4.460116057645422e-07, - "loss": 1.6464, - "step": 6120 + "epoch": 0.15502905677246312, + "grad_norm": 1.084792074670866, + "learning_rate": 5.685996831003221e-07, + "loss": 1.6856, + "step": 1156 }, { - "epoch": 0.2268319264089255, - "grad_norm": 1.5573729356283417, - "learning_rate": 4.4567173281974274e-07, - "loss": 1.6311, - "step": 6140 + "epoch": 0.15516316495306215, + "grad_norm": 1.081652083067762, + "learning_rate": 5.685464144451888e-07, + "loss": 1.6781, + "step": 1157 }, { - "epoch": 0.22757079261872656, - "grad_norm": 1.9342192243430807, - "learning_rate": 4.453309402541044e-07, - "loss": 1.6517, - "step": 6160 + "epoch": 0.15529727313366115, + "grad_norm": 1.2019370572090728, + "learning_rate": 5.684931034393319e-07, + "loss": 1.6854, + "step": 1158 }, { - "epoch": 0.22830965882852763, - "grad_norm": 1.6525422759457808, - "learning_rate": 4.4498922990390044e-07, - "loss": 1.6584, - "step": 6180 + "epoch": 0.15543138131426018, + "grad_norm": 1.1546384235930545, + "learning_rate": 5.684397500922163e-07, + "loss": 1.5995, + "step": 1159 }, { - "epoch": 0.22904852503832868, - "grad_norm": 1.3709737663427297, - "learning_rate": 4.446466036103493e-07, - "loss": 1.6552, - "step": 6200 + "epoch": 0.15556548949485918, + "grad_norm": 1.0806139711906346, + "learning_rate": 5.68386354413314e-07, + "loss": 1.6043, + "step": 1160 }, { - "epoch": 0.22978739124812975, - "grad_norm": 1.7619047090616546, - "learning_rate": 4.44303063219605e-07, - "loss": 1.6515, - "step": 6220 + "epoch": 0.1556995976754582, + "grad_norm": 1.1695139264738694, + "learning_rate": 5.683329164121049e-07, + "loss": 1.6565, + "step": 1161 }, { - "epoch": 0.23052625745793082, - "grad_norm": 1.425527104774275, - "learning_rate": 4.439586105827468e-07, - "loss": 1.7082, - "step": 6240 + "epoch": 0.15583370585605721, + "grad_norm": 1.1082458941671236, + "learning_rate": 5.682794360980761e-07, + "loss": 1.6997, + "step": 1162 }, { - "epoch": 0.23126512366773186, - "grad_norm": 2.183066565667764, - "learning_rate": 4.436132475557693e-07, - "loss": 1.6457, - "step": 6260 + "epoch": 0.15596781403665624, + "grad_norm": 1.171803562739694, + "learning_rate": 5.682259134807222e-07, + "loss": 1.5452, + "step": 1163 }, { - "epoch": 0.23200398987753293, - "grad_norm": 2.5631189419788103, - "learning_rate": 4.432669759995725e-07, - "loss": 1.6441, - "step": 6280 + "epoch": 0.15610192221725525, + "grad_norm": 1.0813601117636722, + "learning_rate": 5.681723485695456e-07, + "loss": 1.6468, + "step": 1164 }, { - "epoch": 0.232742856087334, - "grad_norm": 1.531958854525398, - "learning_rate": 4.4291979777995186e-07, - "loss": 1.6597, - "step": 6300 + "epoch": 0.15623603039785428, + "grad_norm": 1.0850091737441245, + "learning_rate": 5.681187413740558e-07, + "loss": 1.6521, + "step": 1165 }, { - "epoch": 0.23348172229713504, - "grad_norm": 1.7334807358971334, - "learning_rate": 4.4257171476758813e-07, - "loss": 1.6189, - "step": 6320 + "epoch": 0.15637013857845328, + "grad_norm": 1.0888617126493352, + "learning_rate": 5.680650919037703e-07, + "loss": 1.6318, + "step": 1166 }, { - "epoch": 0.2342205885069361, - "grad_norm": 1.606688663391079, - "learning_rate": 4.422227288380374e-07, - "loss": 1.6635, - "step": 6340 + "epoch": 0.1565042467590523, + "grad_norm": 1.0832051131221956, + "learning_rate": 5.680114001682137e-07, + "loss": 1.6244, + "step": 1167 }, { - "epoch": 0.23495945471673715, - "grad_norm": 1.5504111994528522, - "learning_rate": 4.418728418717207e-07, - "loss": 1.6619, - "step": 6360 + "epoch": 0.1566383549396513, + "grad_norm": 1.1345011329722676, + "learning_rate": 5.679576661769184e-07, + "loss": 1.6903, + "step": 1168 }, { - "epoch": 0.23569832092653822, - "grad_norm": 1.7059923161913078, - "learning_rate": 4.415220557539142e-07, - "loss": 1.6518, - "step": 6380 + "epoch": 0.15677246312025034, + "grad_norm": 1.0989237696533585, + "learning_rate": 5.679038899394239e-07, + "loss": 1.748, + "step": 1169 }, { - "epoch": 0.2364371871363393, - "grad_norm": 1.5282124634083587, - "learning_rate": 4.411703723747389e-07, - "loss": 1.6281, - "step": 6400 + "epoch": 0.15690657130084934, + "grad_norm": 1.0586060818560636, + "learning_rate": 5.678500714652776e-07, + "loss": 1.7243, + "step": 1170 }, { - "epoch": 0.23717605334614034, - "grad_norm": 1.817029524914551, - "learning_rate": 4.4081779362915033e-07, - "loss": 1.6196, - "step": 6420 + "epoch": 0.15704067948144837, + "grad_norm": 1.1184535612835667, + "learning_rate": 5.677962107640342e-07, + "loss": 1.6538, + "step": 1171 }, { - "epoch": 0.2379149195559414, - "grad_norm": 1.4287258918617316, - "learning_rate": 4.404643214169288e-07, - "loss": 1.6552, - "step": 6440 + "epoch": 0.15717478766204737, + "grad_norm": 1.0607792312898765, + "learning_rate": 5.677423078452561e-07, + "loss": 1.6324, + "step": 1172 }, { - "epoch": 0.23865378576574248, - "grad_norm": 1.4874633967888828, - "learning_rate": 4.4010995764266845e-07, - "loss": 1.6398, - "step": 6460 + "epoch": 0.1573088958426464, + "grad_norm": 1.0442851907949064, + "learning_rate": 5.676883627185129e-07, + "loss": 1.6818, + "step": 1173 }, { - "epoch": 0.23939265197554352, - "grad_norm": 1.721122957795877, - "learning_rate": 4.3975470421576764e-07, - "loss": 1.6512, - "step": 6480 + "epoch": 0.15744300402324543, + "grad_norm": 1.0805916545031482, + "learning_rate": 5.676343753933818e-07, + "loss": 1.6477, + "step": 1174 }, { - "epoch": 0.2401315181853446, - "grad_norm": 1.523301573082442, - "learning_rate": 4.393985630504183e-07, - "loss": 1.6782, - "step": 6500 + "epoch": 0.15757711220384443, + "grad_norm": 1.055305047370012, + "learning_rate": 5.675803458794477e-07, + "loss": 1.675, + "step": 1175 }, { - "epoch": 0.24087038439514566, - "grad_norm": 1.4599906341858953, - "learning_rate": 4.390415360655957e-07, - "loss": 1.6396, - "step": 6520 + "epoch": 0.15771122038444346, + "grad_norm": 1.1317344965112557, + "learning_rate": 5.675262741863026e-07, + "loss": 1.6195, + "step": 1176 }, { - "epoch": 0.2416092506049467, - "grad_norm": 1.5009190844531946, - "learning_rate": 4.386836251850481e-07, - "loss": 1.648, - "step": 6540 + "epoch": 0.15784532856504246, + "grad_norm": 1.0677408822746999, + "learning_rate": 5.674721603235462e-07, + "loss": 1.673, + "step": 1177 }, { - "epoch": 0.24234811681474777, - "grad_norm": 1.3512220497620588, - "learning_rate": 4.3832483233728654e-07, - "loss": 1.6712, - "step": 6560 + "epoch": 0.1579794367456415, + "grad_norm": 1.1173608676015656, + "learning_rate": 5.67418004300786e-07, + "loss": 1.704, + "step": 1178 }, { - "epoch": 0.24308698302454884, - "grad_norm": 1.6590943419842232, - "learning_rate": 4.379651594555741e-07, - "loss": 1.6174, - "step": 6580 + "epoch": 0.1581135449262405, + "grad_norm": 1.056889330893961, + "learning_rate": 5.673638061276364e-07, + "loss": 1.6232, + "step": 1179 }, { - "epoch": 0.24382584923434988, - "grad_norm": 1.3956181675020793, - "learning_rate": 4.376046084779159e-07, - "loss": 1.6173, - "step": 6600 + "epoch": 0.15824765310683953, + "grad_norm": 1.1175288057488566, + "learning_rate": 5.673095658137197e-07, + "loss": 1.7439, + "step": 1180 }, { - "epoch": 0.24456471544415095, - "grad_norm": 1.5798276517321244, - "learning_rate": 4.3724318134704826e-07, - "loss": 1.6419, - "step": 6620 + "epoch": 0.15838176128743853, + "grad_norm": 1.1363903828654547, + "learning_rate": 5.672552833686654e-07, + "loss": 1.6943, + "step": 1181 }, { - "epoch": 0.24530358165395202, - "grad_norm": 1.4769865046542814, - "learning_rate": 4.3688088001042866e-07, - "loss": 1.6631, - "step": 6640 + "epoch": 0.15851586946803756, + "grad_norm": 1.0761526122635945, + "learning_rate": 5.672009588021108e-07, + "loss": 1.6178, + "step": 1182 }, { - "epoch": 0.24604244786375307, - "grad_norm": 1.7571296905735259, - "learning_rate": 4.3651770642022483e-07, - "loss": 1.6615, - "step": 6660 + "epoch": 0.15864997764863656, + "grad_norm": 1.0868039624863182, + "learning_rate": 5.671465921237003e-07, + "loss": 1.7295, + "step": 1183 }, { - "epoch": 0.24678131407355414, - "grad_norm": 10.261084539724488, - "learning_rate": 4.361536625333045e-07, - "loss": 1.6515, - "step": 6680 + "epoch": 0.1587840858292356, + "grad_norm": 1.5375983527794888, + "learning_rate": 5.670921833430861e-07, + "loss": 1.5868, + "step": 1184 }, { - "epoch": 0.24752018028335518, - "grad_norm": 2.7070070654149956, - "learning_rate": 4.3578875031122466e-07, - "loss": 1.6584, - "step": 6700 + "epoch": 0.1589181940098346, + "grad_norm": 1.1761526374271758, + "learning_rate": 5.670377324699277e-07, + "loss": 1.6585, + "step": 1185 }, { - "epoch": 0.24825904649315625, - "grad_norm": 1.54607876926978, - "learning_rate": 4.3542297172022126e-07, - "loss": 1.6517, - "step": 6720 + "epoch": 0.15905230219043362, + "grad_norm": 1.0911545993652647, + "learning_rate": 5.669832395138923e-07, + "loss": 1.6849, + "step": 1186 }, { - "epoch": 0.24899791270295732, - "grad_norm": 1.3861037085930092, - "learning_rate": 4.3505632873119844e-07, - "loss": 1.6686, - "step": 6740 + "epoch": 0.15918641037103262, + "grad_norm": 1.0517360680747312, + "learning_rate": 5.669287044846542e-07, + "loss": 1.7081, + "step": 1187 }, { - "epoch": 0.24973677891275836, - "grad_norm": 1.4161848471548175, - "learning_rate": 4.346888233197178e-07, - "loss": 1.6449, - "step": 6760 + "epoch": 0.15932051855163165, + "grad_norm": 1.0460736006845528, + "learning_rate": 5.668741273918952e-07, + "loss": 1.6946, + "step": 1188 }, { - "epoch": 0.25047564512255943, - "grad_norm": 1.9634719417599906, - "learning_rate": 4.343204574659878e-07, - "loss": 1.6586, - "step": 6780 + "epoch": 0.15945462673223065, + "grad_norm": 1.1057544457050006, + "learning_rate": 5.668195082453052e-07, + "loss": 1.6648, + "step": 1189 }, { - "epoch": 0.2512145113323605, - "grad_norm": 2.2362709149394835, - "learning_rate": 4.339512331548535e-07, - "loss": 1.6481, - "step": 6800 + "epoch": 0.15958873491282968, + "grad_norm": 1.290894867238456, + "learning_rate": 5.667648470545808e-07, + "loss": 1.6921, + "step": 1190 }, { - "epoch": 0.25195337754216157, - "grad_norm": 2.435262162446439, - "learning_rate": 4.335811523757855e-07, - "loss": 1.6751, - "step": 6820 + "epoch": 0.1597228430934287, + "grad_norm": 1.2497492674256703, + "learning_rate": 5.667101438294264e-07, + "loss": 1.7095, + "step": 1191 }, { - "epoch": 0.2526922437519626, - "grad_norm": 1.4440630152259213, - "learning_rate": 4.3321021712286874e-07, - "loss": 1.6865, - "step": 6840 + "epoch": 0.15985695127402771, + "grad_norm": 1.1080523067750003, + "learning_rate": 5.666553985795538e-07, + "loss": 1.6313, + "step": 1192 }, { - "epoch": 0.25343110996176366, - "grad_norm": 1.6572017188801809, - "learning_rate": 4.3283842939479297e-07, - "loss": 1.6874, - "step": 6860 + "epoch": 0.15999105945462674, + "grad_norm": 1.0983444417697228, + "learning_rate": 5.666006113146823e-07, + "loss": 1.6836, + "step": 1193 }, { - "epoch": 0.2541699761715647, - "grad_norm": 1.6358091879473202, - "learning_rate": 4.3246579119484086e-07, - "loss": 1.6442, - "step": 6880 + "epoch": 0.16012516763522575, + "grad_norm": 1.1242609644362185, + "learning_rate": 5.665457820445387e-07, + "loss": 1.6522, + "step": 1194 }, { - "epoch": 0.2549088423813658, - "grad_norm": 1.861949731594006, - "learning_rate": 4.3209230453087763e-07, - "loss": 1.6596, - "step": 6900 + "epoch": 0.16025927581582478, + "grad_norm": 1.1033082182518592, + "learning_rate": 5.664909107788571e-07, + "loss": 1.6958, + "step": 1195 }, { - "epoch": 0.25564770859116687, - "grad_norm": 1.576364259347636, - "learning_rate": 4.317179714153405e-07, - "loss": 1.6409, - "step": 6920 + "epoch": 0.16039338399642378, + "grad_norm": 1.1353654965954614, + "learning_rate": 5.664359975273792e-07, + "loss": 1.6604, + "step": 1196 }, { - "epoch": 0.25638657480096794, - "grad_norm": 1.6344350623705748, - "learning_rate": 4.3134279386522734e-07, - "loss": 1.6634, - "step": 6940 + "epoch": 0.1605274921770228, + "grad_norm": 1.1259316457840236, + "learning_rate": 5.663810422998543e-07, + "loss": 1.7241, + "step": 1197 }, { - "epoch": 0.25712544101076895, - "grad_norm": 2.4484913186668056, - "learning_rate": 4.3096677390208606e-07, - "loss": 1.6635, - "step": 6960 + "epoch": 0.1606616003576218, + "grad_norm": 1.0922411903046598, + "learning_rate": 5.663260451060388e-07, + "loss": 1.6432, + "step": 1198 }, { - "epoch": 0.25786430722057, - "grad_norm": 1.459583448230627, - "learning_rate": 4.3058991355200385e-07, - "loss": 1.6437, - "step": 6980 + "epoch": 0.16079570853822084, + "grad_norm": 1.0707962447880088, + "learning_rate": 5.662710059556966e-07, + "loss": 1.6666, + "step": 1199 }, { - "epoch": 0.2586031734303711, - "grad_norm": 2.0774440428993426, - "learning_rate": 4.302122148455959e-07, - "loss": 1.6807, - "step": 7000 + "epoch": 0.16092981671881984, + "grad_norm": 1.0837296784325723, + "learning_rate": 5.662159248585993e-07, + "loss": 1.6965, + "step": 1200 }, { - "epoch": 0.25934203964017216, - "grad_norm": 1.4906050741171306, - "learning_rate": 4.2983367981799484e-07, - "loss": 1.6477, - "step": 7020 + "epoch": 0.16106392489941887, + "grad_norm": 1.0703824186490674, + "learning_rate": 5.66160801824526e-07, + "loss": 1.7293, + "step": 1201 }, { - "epoch": 0.26008090584997323, - "grad_norm": 1.6727105507446454, - "learning_rate": 4.294543105088395e-07, - "loss": 1.617, - "step": 7040 + "epoch": 0.16119803308001787, + "grad_norm": 1.095076268284643, + "learning_rate": 5.661056368632625e-07, + "loss": 1.6433, + "step": 1202 }, { - "epoch": 0.2608197720597743, - "grad_norm": 1.4754199269220696, - "learning_rate": 4.2907410896226415e-07, - "loss": 1.6391, - "step": 7060 + "epoch": 0.1613321412606169, + "grad_norm": 1.0622058510882262, + "learning_rate": 5.660504299846032e-07, + "loss": 1.6237, + "step": 1203 }, { - "epoch": 0.2615586382695753, - "grad_norm": 1.5380802874413815, - "learning_rate": 4.2869307722688715e-07, - "loss": 1.687, - "step": 7080 + "epoch": 0.1614662494412159, + "grad_norm": 1.0981636682859879, + "learning_rate": 5.65995181198349e-07, + "loss": 1.8076, + "step": 1204 }, { - "epoch": 0.2622975044793764, - "grad_norm": 1.6040883755814137, - "learning_rate": 4.283112173558003e-07, - "loss": 1.7171, - "step": 7100 + "epoch": 0.16160035762181493, + "grad_norm": 1.1393139443072446, + "learning_rate": 5.659398905143088e-07, + "loss": 1.7572, + "step": 1205 }, { - "epoch": 0.26303637068917746, - "grad_norm": 2.822094109735399, - "learning_rate": 4.279285314065575e-07, - "loss": 1.6671, - "step": 7120 + "epoch": 0.16173446580241393, + "grad_norm": 1.0960864805053374, + "learning_rate": 5.658845579422985e-07, + "loss": 1.6836, + "step": 1206 }, { - "epoch": 0.2637752368989785, - "grad_norm": 1.4328096068889253, - "learning_rate": 4.275450214411638e-07, - "loss": 1.6475, - "step": 7140 + "epoch": 0.16186857398301296, + "grad_norm": 1.0536699550048987, + "learning_rate": 5.658291834921417e-07, + "loss": 1.6933, + "step": 1207 }, { - "epoch": 0.2645141031087796, - "grad_norm": 1.624272809516238, - "learning_rate": 4.2716068952606424e-07, - "loss": 1.693, - "step": 7160 + "epoch": 0.162002682163612, + "grad_norm": 1.1996669047917732, + "learning_rate": 5.657737671736696e-07, + "loss": 1.6405, + "step": 1208 }, { - "epoch": 0.2652529693185806, - "grad_norm": 1.502383886350249, - "learning_rate": 4.267755377321327e-07, - "loss": 1.6592, - "step": 7180 + "epoch": 0.162136790344211, + "grad_norm": 1.10569454454835, + "learning_rate": 5.657183089967204e-07, + "loss": 1.5797, + "step": 1209 }, { - "epoch": 0.2659918355283817, - "grad_norm": 1.4780327874669796, - "learning_rate": 4.2638956813466094e-07, - "loss": 1.6273, - "step": 7200 + "epoch": 0.16227089852481003, + "grad_norm": 1.2803251145710948, + "learning_rate": 5.6566280897114e-07, + "loss": 1.6207, + "step": 1210 }, { - "epoch": 0.26673070173818275, - "grad_norm": 1.647788340317037, - "learning_rate": 4.2600278281334683e-07, - "loss": 1.7177, - "step": 7220 + "epoch": 0.16240500670540903, + "grad_norm": 1.048684333970024, + "learning_rate": 5.656072671067818e-07, + "loss": 1.5924, + "step": 1211 }, { - "epoch": 0.2674695679479838, - "grad_norm": 1.4249175729696602, - "learning_rate": 4.256151838522842e-07, - "loss": 1.6134, - "step": 7240 + "epoch": 0.16253911488600806, + "grad_norm": 1.0612522875516415, + "learning_rate": 5.655516834135063e-07, + "loss": 1.5299, + "step": 1212 }, { - "epoch": 0.2682084341577849, - "grad_norm": 1.525640467280493, - "learning_rate": 4.252267733399502e-07, - "loss": 1.6279, - "step": 7260 + "epoch": 0.16267322306660706, + "grad_norm": 1.0932249588392913, + "learning_rate": 5.65496057901182e-07, + "loss": 1.6653, + "step": 1213 }, { - "epoch": 0.26894730036758596, - "grad_norm": 1.5643231773087998, - "learning_rate": 4.2483755336919546e-07, - "loss": 1.6319, - "step": 7280 + "epoch": 0.1628073312472061, + "grad_norm": 1.0734042304698213, + "learning_rate": 5.65440390579684e-07, + "loss": 1.5442, + "step": 1214 }, { - "epoch": 0.269686166577387, - "grad_norm": 1.5088025290660787, - "learning_rate": 4.2444752603723185e-07, - "loss": 1.6465, - "step": 7300 + "epoch": 0.1629414394278051, + "grad_norm": 1.1189271058187575, + "learning_rate": 5.653846814588957e-07, + "loss": 1.6881, + "step": 1215 }, { - "epoch": 0.27042503278718805, - "grad_norm": 1.690559249481047, - "learning_rate": 4.2405669344562157e-07, - "loss": 1.6597, - "step": 7320 + "epoch": 0.16307554760840412, + "grad_norm": 1.1589238023336688, + "learning_rate": 5.653289305487072e-07, + "loss": 1.7461, + "step": 1216 }, { - "epoch": 0.2711638989969891, - "grad_norm": 1.4158777914075165, - "learning_rate": 4.236650577002658e-07, - "loss": 1.6498, - "step": 7340 + "epoch": 0.16320965578900312, + "grad_norm": 1.02665461506197, + "learning_rate": 5.652731378590166e-07, + "loss": 1.6576, + "step": 1217 }, { - "epoch": 0.2719027652067902, - "grad_norm": 1.4954788634515361, - "learning_rate": 4.232726209113931e-07, - "loss": 1.7073, - "step": 7360 + "epoch": 0.16334376396960215, + "grad_norm": 1.1444702149064363, + "learning_rate": 5.65217303399729e-07, + "loss": 1.6162, + "step": 1218 }, { - "epoch": 0.27264163141659126, - "grad_norm": 1.96245857269846, - "learning_rate": 4.228793851935486e-07, - "loss": 1.6559, - "step": 7380 + "epoch": 0.16347787215020115, + "grad_norm": 1.1311619335366723, + "learning_rate": 5.65161427180757e-07, + "loss": 1.6957, + "step": 1219 }, { - "epoch": 0.2733804976263923, - "grad_norm": 1.5534874631194424, - "learning_rate": 4.22485352665582e-07, - "loss": 1.6795, - "step": 7400 + "epoch": 0.16361198033080018, + "grad_norm": 1.0555386995041562, + "learning_rate": 5.651055092120208e-07, + "loss": 1.7145, + "step": 1220 }, { - "epoch": 0.27411936383619334, - "grad_norm": 1.513478614204036, - "learning_rate": 4.2209052545063645e-07, - "loss": 1.6598, - "step": 7420 + "epoch": 0.16374608851139918, + "grad_norm": 1.189321876945114, + "learning_rate": 5.650495495034477e-07, + "loss": 1.698, + "step": 1221 }, { - "epoch": 0.2748582300459944, - "grad_norm": 1.4981685008613979, - "learning_rate": 4.216949056761371e-07, - "loss": 1.6796, - "step": 7440 + "epoch": 0.16388019669199821, + "grad_norm": 1.084782331393969, + "learning_rate": 5.649935480649729e-07, + "loss": 1.6739, + "step": 1222 }, { - "epoch": 0.2755970962557955, - "grad_norm": 1.453166525310124, - "learning_rate": 4.212984954737796e-07, - "loss": 1.6547, - "step": 7460 + "epoch": 0.16401430487259722, + "grad_norm": 1.1283603135723947, + "learning_rate": 5.649375049065386e-07, + "loss": 1.752, + "step": 1223 }, { - "epoch": 0.27633596246559655, - "grad_norm": 1.4590359213340498, - "learning_rate": 4.2090129697951865e-07, - "loss": 1.668, - "step": 7480 + "epoch": 0.16414841305319625, + "grad_norm": 1.11896193815645, + "learning_rate": 5.648814200380943e-07, + "loss": 1.6303, + "step": 1224 }, { - "epoch": 0.2770748286753976, - "grad_norm": 1.5012030999873756, - "learning_rate": 4.205033123335563e-07, - "loss": 1.6253, - "step": 7500 + "epoch": 0.16428252123379525, + "grad_norm": 1.067115391566694, + "learning_rate": 5.648252934695973e-07, + "loss": 1.6735, + "step": 1225 }, { - "epoch": 0.27781369488519864, - "grad_norm": 1.605863135582104, - "learning_rate": 4.2010454368033075e-07, - "loss": 1.6684, - "step": 7520 + "epoch": 0.16441662941439428, + "grad_norm": 1.0804557718519556, + "learning_rate": 5.64769125211012e-07, + "loss": 1.6247, + "step": 1226 }, { - "epoch": 0.2785525610949997, - "grad_norm": 1.9991749625802369, - "learning_rate": 4.197049931685046e-07, - "loss": 1.6403, - "step": 7540 + "epoch": 0.1645507375949933, + "grad_norm": 1.0059736180266399, + "learning_rate": 5.647129152723106e-07, + "loss": 1.5354, + "step": 1227 }, { - "epoch": 0.2792914273048008, - "grad_norm": 1.5084206750440898, - "learning_rate": 4.193046629509533e-07, - "loss": 1.6673, - "step": 7560 + "epoch": 0.1646848457755923, + "grad_norm": 1.0770670756683223, + "learning_rate": 5.646566636634721e-07, + "loss": 1.6768, + "step": 1228 }, { - "epoch": 0.28003029351460185, - "grad_norm": 1.6013334792913052, - "learning_rate": 4.1890355518475335e-07, - "loss": 1.6483, - "step": 7580 + "epoch": 0.16481895395619134, + "grad_norm": 1.0638623481159848, + "learning_rate": 5.646003703944834e-07, + "loss": 1.6413, + "step": 1229 }, { - "epoch": 0.2807691597244029, - "grad_norm": 1.798812837038986, - "learning_rate": 4.185016720311712e-07, - "loss": 1.6795, - "step": 7600 + "epoch": 0.16495306213679034, + "grad_norm": 1.0839631787802386, + "learning_rate": 5.645440354753386e-07, + "loss": 1.6411, + "step": 1230 }, { - "epoch": 0.281508025934204, - "grad_norm": 1.4900500600235345, - "learning_rate": 4.18099015655651e-07, - "loss": 1.6807, - "step": 7620 + "epoch": 0.16508717031738937, + "grad_norm": 1.1589896172936287, + "learning_rate": 5.644876589160391e-07, + "loss": 1.6042, + "step": 1231 }, { - "epoch": 0.282246892144005, - "grad_norm": 1.6028189719479609, - "learning_rate": 4.176955882278033e-07, - "loss": 1.6596, - "step": 7640 + "epoch": 0.16522127849798837, + "grad_norm": 1.1160410996742565, + "learning_rate": 5.644312407265939e-07, + "loss": 1.6573, + "step": 1232 }, { - "epoch": 0.28298575835380607, - "grad_norm": 1.9939881516366833, - "learning_rate": 4.1729139192139335e-07, - "loss": 1.6695, - "step": 7660 + "epoch": 0.1653553866785874, + "grad_norm": 1.4171454379604909, + "learning_rate": 5.643747809170193e-07, + "loss": 1.6332, + "step": 1233 }, { - "epoch": 0.28372462456360714, - "grad_norm": 1.5127346940191255, - "learning_rate": 4.168864289143291e-07, - "loss": 1.7078, - "step": 7680 + "epoch": 0.1654894948591864, + "grad_norm": 1.0531642470485152, + "learning_rate": 5.643182794973391e-07, + "loss": 1.6602, + "step": 1234 }, { - "epoch": 0.2844634907734082, - "grad_norm": 1.5284950240291668, - "learning_rate": 4.1648070138864993e-07, - "loss": 1.7175, - "step": 7700 + "epoch": 0.16562360303978543, + "grad_norm": 1.1086706049405617, + "learning_rate": 5.64261736477584e-07, + "loss": 1.7038, + "step": 1235 }, { - "epoch": 0.2852023569832093, - "grad_norm": 1.5249438102092971, - "learning_rate": 4.1607421153051454e-07, - "loss": 1.6753, - "step": 7720 + "epoch": 0.16575771122038443, + "grad_norm": 1.0944161073367153, + "learning_rate": 5.642051518677929e-07, + "loss": 1.6386, + "step": 1236 }, { - "epoch": 0.28594122319301035, - "grad_norm": 1.6281345917446086, - "learning_rate": 4.156669615301891e-07, - "loss": 1.6455, - "step": 7740 + "epoch": 0.16589181940098346, + "grad_norm": 1.0383994077860026, + "learning_rate": 5.641485256780112e-07, + "loss": 1.6683, + "step": 1237 }, { - "epoch": 0.28668008940281137, - "grad_norm": 1.7327391694790744, - "learning_rate": 4.152589535820358e-07, - "loss": 1.6115, - "step": 7760 + "epoch": 0.16602592758158247, + "grad_norm": 1.110409441026267, + "learning_rate": 5.640918579182926e-07, + "loss": 1.7666, + "step": 1238 }, { - "epoch": 0.28741895561261244, - "grad_norm": 1.8046545180697087, - "learning_rate": 4.148501898845008e-07, - "loss": 1.6752, - "step": 7780 + "epoch": 0.1661600357621815, + "grad_norm": 1.062864948914823, + "learning_rate": 5.640351485986973e-07, + "loss": 1.6995, + "step": 1239 }, { - "epoch": 0.2881578218224135, - "grad_norm": 1.4479684507284691, - "learning_rate": 4.144406726401024e-07, - "loss": 1.7095, - "step": 7800 + "epoch": 0.1662941439427805, + "grad_norm": 1.1144719375181737, + "learning_rate": 5.639783977292936e-07, + "loss": 1.6904, + "step": 1240 }, { - "epoch": 0.2888966880322146, - "grad_norm": 1.5133767331728856, - "learning_rate": 4.140304040554192e-07, - "loss": 1.6637, - "step": 7820 + "epoch": 0.16642825212337953, + "grad_norm": 1.090081045271864, + "learning_rate": 5.639216053201565e-07, + "loss": 1.696, + "step": 1241 }, { - "epoch": 0.28963555424201565, - "grad_norm": 1.69526484807945, - "learning_rate": 4.1361938634107795e-07, - "loss": 1.6604, - "step": 7840 + "epoch": 0.16656236030397853, + "grad_norm": 1.0630959169468894, + "learning_rate": 5.638647713813691e-07, + "loss": 1.6521, + "step": 1242 }, { - "epoch": 0.29037442045181666, - "grad_norm": 1.5901137640996412, - "learning_rate": 4.132076217117425e-07, - "loss": 1.7023, - "step": 7860 + "epoch": 0.16669646848457756, + "grad_norm": 2.998931919925447, + "learning_rate": 5.638078959230211e-07, + "loss": 1.706, + "step": 1243 }, { - "epoch": 0.29111328666161773, - "grad_norm": 1.423118541107655, - "learning_rate": 4.1279511238610075e-07, - "loss": 1.6251, - "step": 7880 + "epoch": 0.1668305766651766, + "grad_norm": 1.2341388992185853, + "learning_rate": 5.637509789552104e-07, + "loss": 1.5942, + "step": 1244 }, { - "epoch": 0.2918521528714188, - "grad_norm": 1.3770610046698395, - "learning_rate": 4.123818605868533e-07, - "loss": 1.6859, - "step": 7900 + "epoch": 0.1669646848457756, + "grad_norm": 1.1027382262588608, + "learning_rate": 5.636940204880415e-07, + "loss": 1.6176, + "step": 1245 }, { - "epoch": 0.29259101908121987, - "grad_norm": 1.5512042035926865, - "learning_rate": 4.1196786854070147e-07, - "loss": 1.6682, - "step": 7920 + "epoch": 0.16709879302637462, + "grad_norm": 1.1453532005308322, + "learning_rate": 5.636370205316269e-07, + "loss": 1.7051, + "step": 1246 }, { - "epoch": 0.29332988529102094, - "grad_norm": 1.5657764052019774, - "learning_rate": 4.115531384783352e-07, - "loss": 1.6373, - "step": 7940 + "epoch": 0.16723290120697362, + "grad_norm": 1.1774692080993565, + "learning_rate": 5.63579979096086e-07, + "loss": 1.7089, + "step": 1247 }, { - "epoch": 0.294068751500822, - "grad_norm": 1.3977001410170469, - "learning_rate": 4.11137672634421e-07, - "loss": 1.623, - "step": 7960 + "epoch": 0.16736700938757265, + "grad_norm": 1.05810539274269, + "learning_rate": 5.635228961915458e-07, + "loss": 1.6353, + "step": 1248 }, { - "epoch": 0.294807617710623, - "grad_norm": 1.5471885506840533, - "learning_rate": 4.1072147324759007e-07, - "loss": 1.6359, - "step": 7980 + "epoch": 0.16750111756817165, + "grad_norm": 1.1450836955803443, + "learning_rate": 5.634657718281407e-07, + "loss": 1.7418, + "step": 1249 }, { - "epoch": 0.2955464839204241, - "grad_norm": 1.9646501043093372, - "learning_rate": 4.103045425604257e-07, - "loss": 1.6575, - "step": 8000 + "epoch": 0.16763522574877068, + "grad_norm": 1.125948952992154, + "learning_rate": 5.634086060160121e-07, + "loss": 1.7343, + "step": 1250 }, { - "epoch": 0.29628535013022517, - "grad_norm": 2.4554925260754192, - "learning_rate": 4.098868828194523e-07, - "loss": 1.6505, - "step": 8020 + "epoch": 0.16776933392936969, + "grad_norm": 1.069728820008434, + "learning_rate": 5.633513987653094e-07, + "loss": 1.4826, + "step": 1251 }, { - "epoch": 0.29702421634002624, - "grad_norm": 1.5764440647794176, - "learning_rate": 4.0946849627512194e-07, - "loss": 1.6537, - "step": 8040 + "epoch": 0.16790344210996871, + "grad_norm": 1.0401896130830024, + "learning_rate": 5.632941500861885e-07, + "loss": 1.7211, + "step": 1252 }, { - "epoch": 0.2977630825498273, - "grad_norm": 1.5679031999275903, - "learning_rate": 4.090493851818032e-07, - "loss": 1.6678, - "step": 8060 + "epoch": 0.16803755029056772, + "grad_norm": 1.09563187676157, + "learning_rate": 5.632368599888135e-07, + "loss": 1.7378, + "step": 1253 }, { - "epoch": 0.2985019487596284, - "grad_norm": 1.5427978270277976, - "learning_rate": 4.086295517977688e-07, - "loss": 1.646, - "step": 8080 + "epoch": 0.16817165847116675, + "grad_norm": 1.0701481214906692, + "learning_rate": 5.631795284833555e-07, + "loss": 1.7191, + "step": 1254 }, { - "epoch": 0.2992408149694294, - "grad_norm": 1.6159758168642673, - "learning_rate": 4.082089983851831e-07, - "loss": 1.6543, - "step": 8100 + "epoch": 0.16830576665176575, + "grad_norm": 1.2554327805183711, + "learning_rate": 5.631221555799927e-07, + "loss": 1.6476, + "step": 1255 }, { - "epoch": 0.29997968117923046, - "grad_norm": 1.4061897285537437, - "learning_rate": 4.0778772721009036e-07, - "loss": 1.6285, - "step": 8120 + "epoch": 0.16843987483236478, + "grad_norm": 1.0867457009428256, + "learning_rate": 5.63064741288911e-07, + "loss": 1.6594, + "step": 1256 }, { - "epoch": 0.30071854738903153, - "grad_norm": 1.3965741494953192, - "learning_rate": 4.073657405424019e-07, - "loss": 1.6656, - "step": 8140 + "epoch": 0.16857398301296378, + "grad_norm": 1.0587419661389497, + "learning_rate": 5.630072856203037e-07, + "loss": 1.7365, + "step": 1257 }, { - "epoch": 0.3014574135988326, - "grad_norm": 1.5484468689064121, - "learning_rate": 4.06943040655885e-07, - "loss": 1.661, - "step": 8160 + "epoch": 0.1687080911935628, + "grad_norm": 1.0437016123668459, + "learning_rate": 5.629497885843712e-07, + "loss": 1.6223, + "step": 1258 }, { - "epoch": 0.30219627980863367, - "grad_norm": 1.5843927161871971, - "learning_rate": 4.065196298281493e-07, - "loss": 1.6622, - "step": 8180 + "epoch": 0.1688421993741618, + "grad_norm": 1.093304989043814, + "learning_rate": 5.628922501913211e-07, + "loss": 1.7281, + "step": 1259 }, { - "epoch": 0.3029351460184347, - "grad_norm": 1.6553065392619284, - "learning_rate": 4.0609551034063555e-07, - "loss": 1.6989, - "step": 8200 + "epoch": 0.16897630755476084, + "grad_norm": 1.0787876584693192, + "learning_rate": 5.628346704513689e-07, + "loss": 1.7033, + "step": 1260 }, { - "epoch": 0.30367401222823576, - "grad_norm": 1.6004229625484228, - "learning_rate": 4.056706844786025e-07, - "loss": 1.6673, - "step": 8220 + "epoch": 0.16911041573535987, + "grad_norm": 1.119310868984826, + "learning_rate": 5.627770493747369e-07, + "loss": 1.6785, + "step": 1261 }, { - "epoch": 0.3044128784380368, - "grad_norm": 1.7218496726083523, - "learning_rate": 4.052451545311157e-07, - "loss": 1.7071, - "step": 8240 + "epoch": 0.16924452391595887, + "grad_norm": 1.0543862123255383, + "learning_rate": 5.62719386971655e-07, + "loss": 1.6329, + "step": 1262 }, { - "epoch": 0.3051517446478379, - "grad_norm": 1.4453612541643919, - "learning_rate": 4.0481892279103375e-07, - "loss": 1.6418, - "step": 8260 + "epoch": 0.1693786320965579, + "grad_norm": 1.1801974734059986, + "learning_rate": 5.626616832523605e-07, + "loss": 1.6647, + "step": 1263 }, { - "epoch": 0.30589061085763897, - "grad_norm": 2.0343056912272415, - "learning_rate": 4.043919915549972e-07, - "loss": 1.6406, - "step": 8280 + "epoch": 0.1695127402771569, + "grad_norm": 1.0966012840078587, + "learning_rate": 5.626039382270977e-07, + "loss": 1.7489, + "step": 1264 }, { - "epoch": 0.30662947706744004, - "grad_norm": 1.4141851056827188, - "learning_rate": 4.0396436312341537e-07, - "loss": 1.6697, - "step": 8300 + "epoch": 0.16964684845775593, + "grad_norm": 1.0464685107772078, + "learning_rate": 5.625461519061187e-07, + "loss": 1.613, + "step": 1265 }, { - "epoch": 0.30736834327724105, - "grad_norm": 1.7030187367387806, - "learning_rate": 4.0353603980045434e-07, - "loss": 1.648, - "step": 8320 + "epoch": 0.16978095663835494, + "grad_norm": 1.1162999981242707, + "learning_rate": 5.624883242996825e-07, + "loss": 1.6777, + "step": 1266 }, { - "epoch": 0.3081072094870421, - "grad_norm": 1.4580931131013146, - "learning_rate": 4.0310702389402455e-07, - "loss": 1.6738, - "step": 8340 + "epoch": 0.16991506481895396, + "grad_norm": 1.0848332959906992, + "learning_rate": 5.624304554180556e-07, + "loss": 1.6708, + "step": 1267 }, { - "epoch": 0.3088460756968432, - "grad_norm": 1.6315260212867364, - "learning_rate": 4.0267731771576795e-07, - "loss": 1.6568, - "step": 8360 + "epoch": 0.17004917299955297, + "grad_norm": 1.0397576875295036, + "learning_rate": 5.623725452715121e-07, + "loss": 1.6809, + "step": 1268 }, { - "epoch": 0.30958494190664426, - "grad_norm": 1.760277165218215, - "learning_rate": 4.022469235810462e-07, - "loss": 1.7044, - "step": 8380 + "epoch": 0.170183281180152, + "grad_norm": 1.0775743863836376, + "learning_rate": 5.62314593870333e-07, + "loss": 1.7068, + "step": 1269 }, { - "epoch": 0.31032380811644533, - "grad_norm": 1.5247483148379708, - "learning_rate": 4.0181584380892747e-07, - "loss": 1.625, - "step": 8400 + "epoch": 0.170317389360751, + "grad_norm": 1.1030270698791587, + "learning_rate": 5.622566012248068e-07, + "loss": 1.7731, + "step": 1270 }, { - "epoch": 0.3110626743262464, - "grad_norm": 1.6055425468824278, - "learning_rate": 4.0138408072217467e-07, - "loss": 1.6332, - "step": 8420 + "epoch": 0.17045149754135003, + "grad_norm": 1.0632600433435002, + "learning_rate": 5.621985673452292e-07, + "loss": 1.6944, + "step": 1271 }, { - "epoch": 0.3118015405360474, - "grad_norm": 2.522263277058951, - "learning_rate": 4.009516366472323e-07, - "loss": 1.6795, - "step": 8440 + "epoch": 0.17058560572194903, + "grad_norm": 2.354964154428233, + "learning_rate": 5.621404922419036e-07, + "loss": 1.5583, + "step": 1272 }, { - "epoch": 0.3125404067458485, - "grad_norm": 1.4776229994815417, - "learning_rate": 4.005185139142143e-07, - "loss": 1.6675, - "step": 8460 + "epoch": 0.17071971390254806, + "grad_norm": 1.0841684512277456, + "learning_rate": 5.620823759251403e-07, + "loss": 1.6523, + "step": 1273 }, { - "epoch": 0.31327927295564956, - "grad_norm": 1.458660936186841, - "learning_rate": 4.000847148568915e-07, - "loss": 1.661, - "step": 8480 + "epoch": 0.17085382208314706, + "grad_norm": 1.1343004749820542, + "learning_rate": 5.62024218405257e-07, + "loss": 1.6026, + "step": 1274 }, { - "epoch": 0.3140181391654506, - "grad_norm": 1.5895551714359692, - "learning_rate": 3.9965024181267865e-07, - "loss": 1.6474, - "step": 8500 + "epoch": 0.1709879302637461, + "grad_norm": 1.3571816054618184, + "learning_rate": 5.619660196925789e-07, + "loss": 1.6434, + "step": 1275 }, { - "epoch": 0.3147570053752517, - "grad_norm": 1.6027764846949324, - "learning_rate": 3.9921509712262237e-07, - "loss": 1.7055, - "step": 8520 + "epoch": 0.1711220384443451, + "grad_norm": 1.058572028264877, + "learning_rate": 5.619077797974385e-07, + "loss": 1.6225, + "step": 1276 }, { - "epoch": 0.3154958715850527, - "grad_norm": 1.4709407841933115, - "learning_rate": 3.9877928313138807e-07, - "loss": 1.6721, - "step": 8540 + "epoch": 0.17125614662494412, + "grad_norm": 1.068136194752418, + "learning_rate": 5.618494987301753e-07, + "loss": 1.6629, + "step": 1277 }, { - "epoch": 0.3162347377948538, - "grad_norm": 1.4461242455876133, - "learning_rate": 3.983428021872477e-07, - "loss": 1.6496, - "step": 8560 + "epoch": 0.17139025480554315, + "grad_norm": 1.2779625791938292, + "learning_rate": 5.617911765011364e-07, + "loss": 1.6295, + "step": 1278 }, { - "epoch": 0.31697360400465485, - "grad_norm": 1.4524171700785795, - "learning_rate": 3.979056566420668e-07, - "loss": 1.6553, - "step": 8580 + "epoch": 0.17152436298614215, + "grad_norm": 1.09073380795014, + "learning_rate": 5.617328131206761e-07, + "loss": 1.6544, + "step": 1279 }, { - "epoch": 0.3177124702144559, - "grad_norm": 1.5057325136067627, - "learning_rate": 3.974678488512921e-07, - "loss": 1.6723, - "step": 8600 + "epoch": 0.17165847116674118, + "grad_norm": 1.0808553452465872, + "learning_rate": 5.616744085991562e-07, + "loss": 1.6671, + "step": 1280 }, { - "epoch": 0.318451336424257, - "grad_norm": 1.4293777770249827, - "learning_rate": 3.9702938117393825e-07, - "loss": 1.6586, - "step": 8620 + "epoch": 0.17179257934734019, + "grad_norm": 1.1043939527890692, + "learning_rate": 5.616159629469456e-07, + "loss": 1.6977, + "step": 1281 }, { - "epoch": 0.31919020263405806, - "grad_norm": 1.4212368243075615, - "learning_rate": 3.965902559725761e-07, - "loss": 1.6458, - "step": 8640 + "epoch": 0.17192668752793921, + "grad_norm": 1.0969178723829076, + "learning_rate": 5.615574761744202e-07, + "loss": 1.7814, + "step": 1282 }, { - "epoch": 0.3199290688438591, - "grad_norm": 1.4727420961415922, - "learning_rate": 3.961504756133189e-07, - "loss": 1.6481, - "step": 8660 + "epoch": 0.17206079570853822, + "grad_norm": 1.0619478458391556, + "learning_rate": 5.614989482919641e-07, + "loss": 1.6899, + "step": 1283 }, { - "epoch": 0.32066793505366015, - "grad_norm": 2.5900548552419895, - "learning_rate": 3.9573207959028544e-07, - "loss": 1.621, - "step": 8680 + "epoch": 0.17219490388913725, + "grad_norm": 1.1116637641823053, + "learning_rate": 5.614403793099678e-07, + "loss": 1.6795, + "step": 1284 }, { - "epoch": 0.3214068012634612, - "grad_norm": 1.5430259080799726, - "learning_rate": 3.952910284920244e-07, - "loss": 1.6812, - "step": 8700 + "epoch": 0.17232901206973625, + "grad_norm": 1.1188139751673378, + "learning_rate": 5.613817692388295e-07, + "loss": 1.6586, + "step": 1285 }, { - "epoch": 0.3221456674732623, - "grad_norm": 1.4794345694793534, - "learning_rate": 3.948493292364224e-07, - "loss": 1.6585, - "step": 8720 + "epoch": 0.17246312025033528, + "grad_norm": 1.1092151541540025, + "learning_rate": 5.613231180889545e-07, + "loss": 1.731, + "step": 1286 }, { - "epoch": 0.32288453368306336, - "grad_norm": 1.4614630552620829, - "learning_rate": 3.9440698420346246e-07, - "loss": 1.6466, - "step": 8740 + "epoch": 0.17259722843093428, + "grad_norm": 1.0776307968053882, + "learning_rate": 5.612644258707557e-07, + "loss": 1.639, + "step": 1287 }, { - "epoch": 0.3236233998928644, - "grad_norm": 1.4393288175430394, - "learning_rate": 3.939639957766073e-07, - "loss": 1.6215, - "step": 8760 + "epoch": 0.1727313366115333, + "grad_norm": 1.1568418405932983, + "learning_rate": 5.612056925946532e-07, + "loss": 1.6265, + "step": 1288 }, { - "epoch": 0.32436226610266544, - "grad_norm": 2.1230018342791532, - "learning_rate": 3.9352036634278634e-07, - "loss": 1.6803, - "step": 8780 + "epoch": 0.1728654447921323, + "grad_norm": 1.1686914549112786, + "learning_rate": 5.611469182710741e-07, + "loss": 1.5635, + "step": 1289 }, { - "epoch": 0.3251011323124665, - "grad_norm": 1.6164570568462948, - "learning_rate": 3.9307609829238297e-07, - "loss": 1.6766, - "step": 8800 + "epoch": 0.17299955297273134, + "grad_norm": 1.0798126174498692, + "learning_rate": 5.61088102910453e-07, + "loss": 1.6009, + "step": 1290 }, { - "epoch": 0.3258399985222676, - "grad_norm": 1.4370335980422504, - "learning_rate": 3.9263119401922175e-07, - "loss": 1.6822, - "step": 8820 + "epoch": 0.17313366115333034, + "grad_norm": 1.0565094574884266, + "learning_rate": 5.61029246523232e-07, + "loss": 1.6236, + "step": 1291 }, { - "epoch": 0.32657886473206865, - "grad_norm": 1.644081010299245, - "learning_rate": 3.9218565592055486e-07, - "loss": 1.6633, - "step": 8840 + "epoch": 0.17326776933392937, + "grad_norm": 1.1580137951907012, + "learning_rate": 5.609703491198601e-07, + "loss": 1.6664, + "step": 1292 }, { - "epoch": 0.3273177309418697, - "grad_norm": 2.1011988058241173, - "learning_rate": 3.9173948639705027e-07, - "loss": 1.6765, - "step": 8860 + "epoch": 0.17340187751452837, + "grad_norm": 1.0812242416939941, + "learning_rate": 5.609114107107936e-07, + "loss": 1.5541, + "step": 1293 }, { - "epoch": 0.32805659715167074, - "grad_norm": 2.151384135030328, - "learning_rate": 3.9129268785277796e-07, - "loss": 1.6465, - "step": 8880 + "epoch": 0.1735359856951274, + "grad_norm": 1.0926652109752668, + "learning_rate": 5.608524313064966e-07, + "loss": 1.6495, + "step": 1294 }, { - "epoch": 0.3287954633614718, - "grad_norm": 1.4309025880636768, - "learning_rate": 3.908452626951972e-07, - "loss": 1.6543, - "step": 8900 + "epoch": 0.1736700938757264, + "grad_norm": 1.116001777343314, + "learning_rate": 5.607934109174398e-07, + "loss": 1.568, + "step": 1295 }, { - "epoch": 0.3295343295712729, - "grad_norm": 1.8849999578121595, - "learning_rate": 3.903972133351436e-07, - "loss": 1.6514, - "step": 8920 + "epoch": 0.17380420205632544, + "grad_norm": 1.0742848470460207, + "learning_rate": 5.607343495541017e-07, + "loss": 1.6815, + "step": 1296 }, { - "epoch": 0.33027319578107395, - "grad_norm": 1.7164685196230511, - "learning_rate": 3.8994854218681627e-07, - "loss": 1.7006, - "step": 8940 + "epoch": 0.17393831023692446, + "grad_norm": 1.1104040571093063, + "learning_rate": 5.606752472269675e-07, + "loss": 1.7855, + "step": 1297 }, { - "epoch": 0.331012061990875, - "grad_norm": 1.4964402365248954, - "learning_rate": 3.8949925166776454e-07, - "loss": 1.6995, - "step": 8960 + "epoch": 0.17407241841752347, + "grad_norm": 1.1082815736136737, + "learning_rate": 5.606161039465304e-07, + "loss": 1.5563, + "step": 1298 }, { - "epoch": 0.3317509282006761, - "grad_norm": 1.9725561956682367, - "learning_rate": 3.8904934419887493e-07, - "loss": 1.634, - "step": 8980 + "epoch": 0.1742065265981225, + "grad_norm": 1.3426693471935263, + "learning_rate": 5.605569197232904e-07, + "loss": 1.6382, + "step": 1299 }, { - "epoch": 0.3324897944104771, - "grad_norm": 1.604770043849599, - "learning_rate": 3.885988222043586e-07, - "loss": 1.6307, - "step": 9000 + "epoch": 0.1743406347787215, + "grad_norm": 1.1018630739261308, + "learning_rate": 5.604976945677547e-07, + "loss": 1.5862, + "step": 1300 }, { - "epoch": 0.33322866062027817, - "grad_norm": 1.4014528232679808, - "learning_rate": 3.881476881117376e-07, - "loss": 1.6384, - "step": 9020 + "epoch": 0.17447474295932053, + "grad_norm": 1.08258660371521, + "learning_rate": 5.604384284904382e-07, + "loss": 1.7377, + "step": 1301 }, { - "epoch": 0.33396752683007924, - "grad_norm": 1.5592294550988919, - "learning_rate": 3.876959443518323e-07, - "loss": 1.6893, - "step": 9040 + "epoch": 0.17460885113991953, + "grad_norm": 1.0416433850048736, + "learning_rate": 5.603791215018626e-07, + "loss": 1.6654, + "step": 1302 }, { - "epoch": 0.3347063930398803, - "grad_norm": 1.512028885113723, - "learning_rate": 3.872662252925764e-07, - "loss": 1.6126, - "step": 9060 + "epoch": 0.17474295932051856, + "grad_norm": 1.0585227638311847, + "learning_rate": 5.603197736125572e-07, + "loss": 1.6259, + "step": 1303 }, { - "epoch": 0.3354452592496814, - "grad_norm": 1.5167336039874841, - "learning_rate": 3.868132996855423e-07, - "loss": 1.6438, - "step": 9080 + "epoch": 0.17487706750111756, + "grad_norm": 1.800828493151873, + "learning_rate": 5.602603848330582e-07, + "loss": 1.6681, + "step": 1304 }, { - "epoch": 0.3361841254594824, - "grad_norm": 1.5732905269770532, - "learning_rate": 3.8635977160123356e-07, - "loss": 1.6129, - "step": 9100 + "epoch": 0.1750111756817166, + "grad_norm": 1.2442322404337642, + "learning_rate": 5.602009551739095e-07, + "loss": 1.7388, + "step": 1305 }, { - "epoch": 0.33692299166928347, - "grad_norm": 1.6825164459147328, - "learning_rate": 3.859056434833698e-07, - "loss": 1.611, - "step": 9120 + "epoch": 0.1751452838623156, + "grad_norm": 1.0650536278693077, + "learning_rate": 5.60141484645662e-07, + "loss": 1.6913, + "step": 1306 }, { - "epoch": 0.33766185787908454, - "grad_norm": 2.3767246380889095, - "learning_rate": 3.854509177789039e-07, - "loss": 1.6473, - "step": 9140 + "epoch": 0.17527939204291462, + "grad_norm": 1.0715066374394453, + "learning_rate": 5.600819732588738e-07, + "loss": 1.7508, + "step": 1307 }, { - "epoch": 0.3384007240888856, - "grad_norm": 1.51475900965411, - "learning_rate": 3.8499559693800866e-07, - "loss": 1.6696, - "step": 9160 + "epoch": 0.17541350022351362, + "grad_norm": 1.2154515219706747, + "learning_rate": 5.600224210241104e-07, + "loss": 1.6431, + "step": 1308 }, { - "epoch": 0.3391395902986867, - "grad_norm": 2.1798994146623496, - "learning_rate": 3.845396834140635e-07, - "loss": 1.6272, - "step": 9180 + "epoch": 0.17554760840411265, + "grad_norm": 1.0580023010334576, + "learning_rate": 5.599628279519445e-07, + "loss": 1.7028, + "step": 1309 }, { - "epoch": 0.33987845650848775, - "grad_norm": 5.503662773520221, - "learning_rate": 3.8408317966364155e-07, - "loss": 1.6598, - "step": 9200 + "epoch": 0.17568171658471166, + "grad_norm": 1.0649573978054163, + "learning_rate": 5.599031940529562e-07, + "loss": 1.7045, + "step": 1310 }, { - "epoch": 0.34061732271828876, - "grad_norm": 1.4387011677124582, - "learning_rate": 3.836260881464961e-07, - "loss": 1.6327, - "step": 9220 + "epoch": 0.17581582476531069, + "grad_norm": 1.066600801218827, + "learning_rate": 5.598435193377324e-07, + "loss": 1.6888, + "step": 1311 }, { - "epoch": 0.34135618892808983, - "grad_norm": 1.8647315334479582, - "learning_rate": 3.831684113255475e-07, - "loss": 1.6511, - "step": 9240 + "epoch": 0.1759499329459097, + "grad_norm": 1.2123022138020687, + "learning_rate": 5.597838038168678e-07, + "loss": 1.7297, + "step": 1312 }, { - "epoch": 0.3420950551378909, - "grad_norm": 1.4777808537198769, - "learning_rate": 3.8271015166686987e-07, - "loss": 1.6361, - "step": 9260 + "epoch": 0.17608404112650872, + "grad_norm": 1.0436067677488805, + "learning_rate": 5.59724047500964e-07, + "loss": 1.652, + "step": 1313 }, { - "epoch": 0.34283392134769197, - "grad_norm": 2.045197276055339, - "learning_rate": 3.822513116396778e-07, - "loss": 1.6659, - "step": 9280 + "epoch": 0.17621814930710775, + "grad_norm": 1.0487601395222634, + "learning_rate": 5.5966425040063e-07, + "loss": 1.7444, + "step": 1314 }, { - "epoch": 0.34357278755749304, - "grad_norm": 1.7790240681877276, - "learning_rate": 3.8179189371631307e-07, - "loss": 1.617, - "step": 9300 + "epoch": 0.17635225748770675, + "grad_norm": 1.117082389094809, + "learning_rate": 5.596044125264818e-07, + "loss": 1.64, + "step": 1315 }, { - "epoch": 0.3443116537672941, - "grad_norm": 1.6594283041904447, - "learning_rate": 3.813319003722312e-07, - "loss": 1.6798, - "step": 9320 + "epoch": 0.17648636566830578, + "grad_norm": 1.0558238043899169, + "learning_rate": 5.595445338891431e-07, + "loss": 1.6659, + "step": 1316 }, { - "epoch": 0.3450505199770951, - "grad_norm": 1.5722518111489987, - "learning_rate": 3.8087133408598837e-07, - "loss": 1.6448, - "step": 9340 + "epoch": 0.17662047384890478, + "grad_norm": 1.0478981037852866, + "learning_rate": 5.594846144992443e-07, + "loss": 1.52, + "step": 1317 }, { - "epoch": 0.3457893861868962, - "grad_norm": 1.3834190123625751, - "learning_rate": 3.804101973392278e-07, - "loss": 1.6937, - "step": 9360 + "epoch": 0.1767545820295038, + "grad_norm": 1.257918943849832, + "learning_rate": 5.594246543674234e-07, + "loss": 1.7601, + "step": 1318 }, { - "epoch": 0.34652825239669727, - "grad_norm": 2.860970712860898, - "learning_rate": 3.799484926166665e-07, - "loss": 1.6803, - "step": 9380 + "epoch": 0.1768886902101028, + "grad_norm": 1.4225322949034613, + "learning_rate": 5.593646535043253e-07, + "loss": 1.7307, + "step": 1319 }, { - "epoch": 0.34726711860649834, - "grad_norm": 1.7303789413551895, - "learning_rate": 3.794862224060819e-07, - "loss": 1.6652, - "step": 9400 + "epoch": 0.17702279839070184, + "grad_norm": 1.1490395041861463, + "learning_rate": 5.593046119206027e-07, + "loss": 1.7181, + "step": 1320 }, { - "epoch": 0.3480059848162994, - "grad_norm": 1.5722357665247504, - "learning_rate": 3.7902338919829854e-07, - "loss": 1.6824, - "step": 9420 + "epoch": 0.17715690657130084, + "grad_norm": 1.0611730445421508, + "learning_rate": 5.59244529626915e-07, + "loss": 1.6528, + "step": 1321 }, { - "epoch": 0.3487448510261004, - "grad_norm": 1.4942909416069685, - "learning_rate": 3.785599954871741e-07, - "loss": 1.6334, - "step": 9440 + "epoch": 0.17729101475189987, + "grad_norm": 1.204549135410644, + "learning_rate": 5.591844066339289e-07, + "loss": 1.7908, + "step": 1322 }, { - "epoch": 0.3494837172359015, - "grad_norm": 1.5407701751336818, - "learning_rate": 3.7809604376958705e-07, - "loss": 1.6147, - "step": 9460 + "epoch": 0.17742512293249887, + "grad_norm": 1.1001829655239295, + "learning_rate": 5.591242429523187e-07, + "loss": 1.6403, + "step": 1323 }, { - "epoch": 0.35022258344570256, - "grad_norm": 1.5151800327591411, - "learning_rate": 3.7763153654542187e-07, - "loss": 1.6591, - "step": 9480 + "epoch": 0.1775592311130979, + "grad_norm": 1.1251080236723472, + "learning_rate": 5.590640385927655e-07, + "loss": 1.6476, + "step": 1324 }, { - "epoch": 0.35096144965550363, - "grad_norm": 1.5820720313790753, - "learning_rate": 3.7716647631755684e-07, - "loss": 1.6267, - "step": 9500 + "epoch": 0.1776933392936969, + "grad_norm": 1.0879047909659794, + "learning_rate": 5.590037935659577e-07, + "loss": 1.7197, + "step": 1325 }, { - "epoch": 0.3517003158653047, - "grad_norm": 1.7136185539713005, - "learning_rate": 3.7670086559184944e-07, - "loss": 1.6443, - "step": 9520 + "epoch": 0.17782744747429594, + "grad_norm": 1.0406989517811054, + "learning_rate": 5.589435078825912e-07, + "loss": 1.5898, + "step": 1326 }, { - "epoch": 0.3524391820751058, - "grad_norm": 1.6610072999142345, - "learning_rate": 3.7623470687712363e-07, - "loss": 1.6391, - "step": 9540 + "epoch": 0.17796155565489494, + "grad_norm": 1.055284942749228, + "learning_rate": 5.588831815533688e-07, + "loss": 1.6537, + "step": 1327 }, { - "epoch": 0.3531780482849068, - "grad_norm": 1.7561532016780041, - "learning_rate": 3.7576800268515615e-07, - "loss": 1.6403, - "step": 9560 + "epoch": 0.17809566383549397, + "grad_norm": 1.1132782384590842, + "learning_rate": 5.588228145890006e-07, + "loss": 1.6304, + "step": 1328 }, { - "epoch": 0.35391691449470786, - "grad_norm": 1.6534365111706855, - "learning_rate": 3.7530075553066256e-07, - "loss": 1.6604, - "step": 9580 + "epoch": 0.17822977201609297, + "grad_norm": 1.1856096238614278, + "learning_rate": 5.587624070002039e-07, + "loss": 1.6901, + "step": 1329 }, { - "epoch": 0.3546557807045089, - "grad_norm": 1.5197922636545014, - "learning_rate": 3.748329679312845e-07, - "loss": 1.6005, - "step": 9600 + "epoch": 0.178363880196692, + "grad_norm": 1.0716839423819353, + "learning_rate": 5.587019587977035e-07, + "loss": 1.6256, + "step": 1330 }, { - "epoch": 0.35539464691431, - "grad_norm": 2.1221364447575635, - "learning_rate": 3.743646424075753e-07, - "loss": 1.6302, - "step": 9620 + "epoch": 0.17849798837729103, + "grad_norm": 1.0832321039520167, + "learning_rate": 5.586414699922309e-07, + "loss": 1.6811, + "step": 1331 }, { - "epoch": 0.35613351312411107, - "grad_norm": 1.520654127135304, - "learning_rate": 3.738957814829868e-07, - "loss": 1.7174, - "step": 9640 + "epoch": 0.17863209655789003, + "grad_norm": 1.0997046830321784, + "learning_rate": 5.585809405945252e-07, + "loss": 1.5625, + "step": 1332 }, { - "epoch": 0.35687237933391214, - "grad_norm": 1.5099869797232601, - "learning_rate": 3.7342638768385597e-07, - "loss": 1.6592, - "step": 9660 + "epoch": 0.17876620473848906, + "grad_norm": 1.0713255103444261, + "learning_rate": 5.585203706153326e-07, + "loss": 1.6532, + "step": 1333 }, { - "epoch": 0.35761124554371315, - "grad_norm": 1.8304484700278734, - "learning_rate": 3.729564635393907e-07, - "loss": 1.6745, - "step": 9680 + "epoch": 0.17890031291908806, + "grad_norm": 1.097655546141729, + "learning_rate": 5.584597600654066e-07, + "loss": 1.561, + "step": 1334 }, { - "epoch": 0.3583501117535142, - "grad_norm": 1.778696114508267, - "learning_rate": 3.7248601158165674e-07, - "loss": 1.6592, - "step": 9700 + "epoch": 0.1790344210996871, + "grad_norm": 1.118524842313588, + "learning_rate": 5.583991089555074e-07, + "loss": 1.6562, + "step": 1335 }, { - "epoch": 0.3590889779633153, - "grad_norm": 1.4183327236752137, - "learning_rate": 3.720150343455638e-07, - "loss": 1.6637, - "step": 9720 + "epoch": 0.1791685292802861, + "grad_norm": 1.143484492621255, + "learning_rate": 5.583384172964032e-07, + "loss": 1.6106, + "step": 1336 }, { - "epoch": 0.35982784417311636, - "grad_norm": 1.559240346976758, - "learning_rate": 3.715435343688517e-07, - "loss": 1.6862, - "step": 9740 + "epoch": 0.17930263746088512, + "grad_norm": 1.1214046342101587, + "learning_rate": 5.582776850988688e-07, + "loss": 1.6307, + "step": 1337 }, { - "epoch": 0.36056671038291743, - "grad_norm": 1.5461740842164586, - "learning_rate": 3.710715141920772e-07, - "loss": 1.6276, - "step": 9760 + "epoch": 0.17943674564148412, + "grad_norm": 1.1213846092161437, + "learning_rate": 5.582169123736864e-07, + "loss": 1.7581, + "step": 1338 }, { - "epoch": 0.36130557659271845, - "grad_norm": 1.541024781373399, - "learning_rate": 3.705989763585998e-07, - "loss": 1.6519, - "step": 9780 + "epoch": 0.17957085382208315, + "grad_norm": 1.1045643310044297, + "learning_rate": 5.581560991316455e-07, + "loss": 1.7356, + "step": 1339 }, { - "epoch": 0.3620444428025195, - "grad_norm": 1.568073509021964, - "learning_rate": 3.7012592341456855e-07, - "loss": 1.644, - "step": 9800 + "epoch": 0.17970496200268216, + "grad_norm": 1.1684585589911254, + "learning_rate": 5.580952453835426e-07, + "loss": 1.7319, + "step": 1340 }, { - "epoch": 0.3627833090123206, - "grad_norm": 7.164278419276029, - "learning_rate": 3.6965235790890776e-07, - "loss": 1.6649, - "step": 9820 + "epoch": 0.17983907018328119, + "grad_norm": 1.3021764184252913, + "learning_rate": 5.580343511401813e-07, + "loss": 1.7263, + "step": 1341 }, { - "epoch": 0.36352217522212166, - "grad_norm": 1.6290047071156604, - "learning_rate": 3.6917828239330364e-07, - "loss": 1.6321, - "step": 9840 + "epoch": 0.1799731783638802, + "grad_norm": 1.113861073703856, + "learning_rate": 5.579734164123729e-07, + "loss": 1.6896, + "step": 1342 }, { - "epoch": 0.3642610414319227, - "grad_norm": 2.2138525137520078, - "learning_rate": 3.6870369942219043e-07, - "loss": 1.6623, - "step": 9860 + "epoch": 0.18010728654447922, + "grad_norm": 1.081482477946928, + "learning_rate": 5.579124412109352e-07, + "loss": 1.7272, + "step": 1343 }, { - "epoch": 0.3649999076417238, - "grad_norm": 1.4780745550505248, - "learning_rate": 3.6822861155273664e-07, - "loss": 1.6303, - "step": 9880 + "epoch": 0.18024139472507822, + "grad_norm": 1.2066355523363086, + "learning_rate": 5.578514255466939e-07, + "loss": 1.7111, + "step": 1344 }, { - "epoch": 0.3657387738515248, - "grad_norm": 1.6513433655082623, - "learning_rate": 3.677530213448315e-07, - "loss": 1.6678, - "step": 9900 + "epoch": 0.18037550290567725, + "grad_norm": 1.0985468030112344, + "learning_rate": 5.577903694304811e-07, + "loss": 1.6341, + "step": 1345 }, { - "epoch": 0.3664776400613259, - "grad_norm": 1.4330452468765504, - "learning_rate": 3.6727693136107074e-07, - "loss": 1.6411, - "step": 9920 + "epoch": 0.18050961108627625, + "grad_norm": 1.171300719246094, + "learning_rate": 5.577292728731368e-07, + "loss": 1.7271, + "step": 1346 }, { - "epoch": 0.36721650627112695, - "grad_norm": 2.1041910204234773, - "learning_rate": 3.668241852955783e-07, - "loss": 1.6638, - "step": 9940 + "epoch": 0.18064371926687528, + "grad_norm": 1.0938624509126613, + "learning_rate": 5.576681358855078e-07, + "loss": 1.6505, + "step": 1347 }, { - "epoch": 0.367955372480928, - "grad_norm": 1.579705325259841, - "learning_rate": 3.66347128129751e-07, - "loss": 1.6245, - "step": 9960 + "epoch": 0.1807778274474743, + "grad_norm": 1.1376662655489747, + "learning_rate": 5.57606958478448e-07, + "loss": 1.6729, + "step": 1348 }, { - "epoch": 0.3686942386907291, - "grad_norm": 2.2840341365356185, - "learning_rate": 3.65869578763363e-07, - "loss": 1.6621, - "step": 9980 + "epoch": 0.1809119356280733, + "grad_norm": 1.1248050141842243, + "learning_rate": 5.575457406628189e-07, + "loss": 1.6139, + "step": 1349 }, { - "epoch": 0.36943310490053016, - "grad_norm": 1.4886178225841975, - "learning_rate": 3.6539153976956643e-07, - "loss": 1.6815, - "step": 10000 + "epoch": 0.18104604380867234, + "grad_norm": 1.0939373053874768, + "learning_rate": 5.574844824494888e-07, + "loss": 1.6295, + "step": 1350 }, { - "epoch": 0.3701719711103312, - "grad_norm": 2.0581153395070952, - "learning_rate": 3.6491301372415173e-07, - "loss": 1.6911, - "step": 10020 + "epoch": 0.18118015198927134, + "grad_norm": 1.0842961883880395, + "learning_rate": 5.574231838493333e-07, + "loss": 1.5905, + "step": 1351 }, { - "epoch": 0.37091083732013225, - "grad_norm": 1.5433010278052928, - "learning_rate": 3.6443400320553387e-07, - "loss": 1.6726, - "step": 10040 + "epoch": 0.18131426016987037, + "grad_norm": 1.1099129964326464, + "learning_rate": 5.573618448732349e-07, + "loss": 1.5986, + "step": 1352 }, { - "epoch": 0.3716497035299333, - "grad_norm": 1.3650733078052242, - "learning_rate": 3.6395451079473785e-07, - "loss": 1.6808, - "step": 10060 + "epoch": 0.18144836835046937, + "grad_norm": 1.1232448273106495, + "learning_rate": 5.573004655320838e-07, + "loss": 1.7579, + "step": 1353 }, { - "epoch": 0.3723885697397344, - "grad_norm": 1.4829849508478619, - "learning_rate": 3.634745390753857e-07, - "loss": 1.638, - "step": 10080 + "epoch": 0.1815824765310684, + "grad_norm": 1.1666528664724998, + "learning_rate": 5.57239045836777e-07, + "loss": 1.6152, + "step": 1354 }, { - "epoch": 0.37312743594953546, - "grad_norm": 1.4843368467181628, - "learning_rate": 3.6299409063368177e-07, - "loss": 1.6608, - "step": 10100 + "epoch": 0.1817165847116674, + "grad_norm": 1.1370227967293582, + "learning_rate": 5.571775857982186e-07, + "loss": 1.7261, + "step": 1355 }, { - "epoch": 0.37386630215933647, - "grad_norm": 1.7135290138411319, - "learning_rate": 3.6251316805839925e-07, - "loss": 1.6201, - "step": 10120 + "epoch": 0.18185069289226644, + "grad_norm": 1.1281838118145104, + "learning_rate": 5.571160854273203e-07, + "loss": 1.7791, + "step": 1356 }, { - "epoch": 0.37460516836913754, - "grad_norm": 1.4665338261705847, - "learning_rate": 3.6203177394086603e-07, - "loss": 1.6576, - "step": 10140 + "epoch": 0.18198480107286544, + "grad_norm": 1.1128745175377743, + "learning_rate": 5.570545447350004e-07, + "loss": 1.6613, + "step": 1357 }, { - "epoch": 0.3753440345789386, - "grad_norm": 1.523807524784342, - "learning_rate": 3.615499108749508e-07, - "loss": 1.6531, - "step": 10160 + "epoch": 0.18211890925346447, + "grad_norm": 1.0867439824153309, + "learning_rate": 5.569929637321848e-07, + "loss": 1.7577, + "step": 1358 }, { - "epoch": 0.3760829007887397, - "grad_norm": 1.4605532197043567, - "learning_rate": 3.6106758145704903e-07, - "loss": 1.6351, - "step": 10180 + "epoch": 0.18225301743406347, + "grad_norm": 1.1168304669263995, + "learning_rate": 5.569313424298063e-07, + "loss": 1.6313, + "step": 1359 }, { - "epoch": 0.37682176699854075, - "grad_norm": 1.4767414919395185, - "learning_rate": 3.6058478828606904e-07, - "loss": 1.6816, - "step": 10200 + "epoch": 0.1823871256146625, + "grad_norm": 1.0783686555511454, + "learning_rate": 5.56869680838805e-07, + "loss": 1.6155, + "step": 1360 }, { - "epoch": 0.3775606332083418, - "grad_norm": 3.319352148345807, - "learning_rate": 3.601015339634179e-07, - "loss": 1.646, - "step": 10220 + "epoch": 0.1825212337952615, + "grad_norm": 1.1849330577729977, + "learning_rate": 5.568079789701281e-07, + "loss": 1.7919, + "step": 1361 }, { - "epoch": 0.37829949941814284, - "grad_norm": 1.6462705304843952, - "learning_rate": 3.5961782109298767e-07, - "loss": 1.6572, - "step": 10240 + "epoch": 0.18265534197586053, + "grad_norm": 1.0642283339220127, + "learning_rate": 5.567462368347296e-07, + "loss": 1.6483, + "step": 1362 }, { - "epoch": 0.3790383656279439, - "grad_norm": 1.987828688877245, - "learning_rate": 3.5913365228114085e-07, - "loss": 1.6272, - "step": 10260 + "epoch": 0.18278945015645953, + "grad_norm": 1.0762888034859384, + "learning_rate": 5.566844544435715e-07, + "loss": 1.6447, + "step": 1363 }, { - "epoch": 0.379777231837745, - "grad_norm": 1.5685525483250444, - "learning_rate": 3.5864903013669696e-07, - "loss": 1.629, - "step": 10280 + "epoch": 0.18292355833705856, + "grad_norm": 1.1102699057236556, + "learning_rate": 5.566226318076221e-07, + "loss": 1.6753, + "step": 1364 }, { - "epoch": 0.38051609804754605, - "grad_norm": 1.454531386924792, - "learning_rate": 3.58163957270918e-07, - "loss": 1.6391, - "step": 10300 + "epoch": 0.18305766651765756, + "grad_norm": 1.0900024036456375, + "learning_rate": 5.565607689378574e-07, + "loss": 1.6932, + "step": 1365 }, { - "epoch": 0.3812549642573471, - "grad_norm": 1.5741474691311197, - "learning_rate": 3.5767843629749465e-07, - "loss": 1.6497, - "step": 10320 + "epoch": 0.1831917746982566, + "grad_norm": 1.170525713074084, + "learning_rate": 5.564988658452601e-07, + "loss": 1.6378, + "step": 1366 }, { - "epoch": 0.3819938304671482, - "grad_norm": 1.494255550534897, - "learning_rate": 3.5719246983253227e-07, - "loss": 1.6584, - "step": 10340 + "epoch": 0.18332588287885562, + "grad_norm": 1.1252580693238932, + "learning_rate": 5.564369225408206e-07, + "loss": 1.7611, + "step": 1367 }, { - "epoch": 0.3827326966769492, - "grad_norm": 1.5743114630725665, - "learning_rate": 3.5670606049453624e-07, - "loss": 1.6333, - "step": 10360 + "epoch": 0.18345999105945462, + "grad_norm": 1.0779299976202001, + "learning_rate": 5.563749390355356e-07, + "loss": 1.6517, + "step": 1368 }, { - "epoch": 0.3834715628867503, - "grad_norm": 1.5229234435536247, - "learning_rate": 3.5621921090439856e-07, - "loss": 1.651, - "step": 10380 + "epoch": 0.18359409924005365, + "grad_norm": 1.0810638342875853, + "learning_rate": 5.563129153404099e-07, + "loss": 1.5525, + "step": 1369 }, { - "epoch": 0.38421042909655134, - "grad_norm": 1.5784429804907898, - "learning_rate": 3.557319236853833e-07, - "loss": 1.6922, - "step": 10400 + "epoch": 0.18372820742065266, + "grad_norm": 1.061240323219775, + "learning_rate": 5.562508514664548e-07, + "loss": 1.7482, + "step": 1370 }, { - "epoch": 0.3849492953063524, - "grad_norm": 1.581472732564025, - "learning_rate": 3.552442014631125e-07, - "loss": 1.6725, - "step": 10420 + "epoch": 0.18386231560125169, + "grad_norm": 1.1362519090350038, + "learning_rate": 5.561887474246889e-07, + "loss": 1.5771, + "step": 1371 }, { - "epoch": 0.3856881615161535, - "grad_norm": 1.5126802451542531, - "learning_rate": 3.5475604686555246e-07, - "loss": 1.6944, - "step": 10440 + "epoch": 0.1839964237818507, + "grad_norm": 1.7306083620793078, + "learning_rate": 5.561266032261379e-07, + "loss": 1.6738, + "step": 1372 }, { - "epoch": 0.3864270277259545, - "grad_norm": 1.5957042160618131, - "learning_rate": 3.5426746252299876e-07, - "loss": 1.6474, - "step": 10460 + "epoch": 0.18413053196244972, + "grad_norm": 1.1266147426655102, + "learning_rate": 5.560644188818348e-07, + "loss": 1.6809, + "step": 1373 }, { - "epoch": 0.38716589393575557, - "grad_norm": 1.5167798574452542, - "learning_rate": 3.537784510680629e-07, - "loss": 1.6269, - "step": 10480 + "epoch": 0.18426464014304872, + "grad_norm": 1.4560506903910069, + "learning_rate": 5.560021944028195e-07, + "loss": 1.7862, + "step": 1374 }, { - "epoch": 0.38790476014555664, - "grad_norm": 1.4073803006779033, - "learning_rate": 3.5328901513565755e-07, - "loss": 1.667, - "step": 10500 + "epoch": 0.18439874832364775, + "grad_norm": 1.1339717685703572, + "learning_rate": 5.559399298001391e-07, + "loss": 1.7362, + "step": 1375 }, { - "epoch": 0.3886436263553577, - "grad_norm": 1.5025049762633182, - "learning_rate": 3.527991573629826e-07, - "loss": 1.6685, - "step": 10520 + "epoch": 0.18453285650424675, + "grad_norm": 1.0605805880234964, + "learning_rate": 5.55877625084848e-07, + "loss": 1.6264, + "step": 1376 }, { - "epoch": 0.3893824925651588, - "grad_norm": 1.498817940042482, - "learning_rate": 3.523088803895111e-07, - "loss": 1.6693, - "step": 10540 + "epoch": 0.18466696468484578, + "grad_norm": 1.1589703072777433, + "learning_rate": 5.558152802680075e-07, + "loss": 1.6524, + "step": 1377 }, { - "epoch": 0.39012135877495985, - "grad_norm": 1.5375475807699233, - "learning_rate": 3.5181818685697454e-07, - "loss": 1.6257, - "step": 10560 + "epoch": 0.18480107286544478, + "grad_norm": 1.0842230894260985, + "learning_rate": 5.557528953606858e-07, + "loss": 1.8047, + "step": 1378 }, { - "epoch": 0.39086022498476086, - "grad_norm": 1.4788669954107543, - "learning_rate": 3.513270794093493e-07, - "loss": 1.6396, - "step": 10580 + "epoch": 0.1849351810460438, + "grad_norm": 1.1794280210617787, + "learning_rate": 5.55690470373959e-07, + "loss": 1.6757, + "step": 1379 }, { - "epoch": 0.39159909119456193, - "grad_norm": 1.8280175785471986, - "learning_rate": 3.508355606928417e-07, - "loss": 1.6708, - "step": 10600 + "epoch": 0.1850692892266428, + "grad_norm": 1.097631119847551, + "learning_rate": 5.556280053189095e-07, + "loss": 1.6108, + "step": 1380 }, { - "epoch": 0.392337957404363, - "grad_norm": 1.657327382022486, - "learning_rate": 3.503436333558744e-07, - "loss": 1.6344, - "step": 10620 + "epoch": 0.18520339740724184, + "grad_norm": 1.1017129023282082, + "learning_rate": 5.555655002066273e-07, + "loss": 1.7577, + "step": 1381 }, { - "epoch": 0.3930768236141641, - "grad_norm": 3.2933368891799772, - "learning_rate": 3.498513000490713e-07, - "loss": 1.6233, - "step": 10640 + "epoch": 0.18533750558784085, + "grad_norm": 1.1361790282178577, + "learning_rate": 5.555029550482091e-07, + "loss": 1.7294, + "step": 1382 }, { - "epoch": 0.39381568982396514, - "grad_norm": 1.5787521448516106, - "learning_rate": 3.4935856342524445e-07, - "loss": 1.6504, - "step": 10660 + "epoch": 0.18547161376843987, + "grad_norm": 1.055142090473337, + "learning_rate": 5.554403698547593e-07, + "loss": 1.6388, + "step": 1383 }, { - "epoch": 0.3945545560337662, - "grad_norm": 1.7273082957996757, - "learning_rate": 3.488654261393786e-07, - "loss": 1.6501, - "step": 10680 + "epoch": 0.1856057219490389, + "grad_norm": 7.061910083877572, + "learning_rate": 5.553777446373886e-07, + "loss": 1.6087, + "step": 1384 }, { - "epoch": 0.3952934222435672, - "grad_norm": 1.5427159019633168, - "learning_rate": 3.483718908486173e-07, - "loss": 1.6213, - "step": 10700 + "epoch": 0.1857398301296379, + "grad_norm": 1.1547867916367462, + "learning_rate": 5.553150794072159e-07, + "loss": 1.6509, + "step": 1385 }, { - "epoch": 0.3960322884533683, - "grad_norm": 2.4791279004019944, - "learning_rate": 3.478779602122491e-07, - "loss": 1.6341, - "step": 10720 + "epoch": 0.18587393831023694, + "grad_norm": 1.193219273609135, + "learning_rate": 5.552523741753659e-07, + "loss": 1.8231, + "step": 1386 }, { - "epoch": 0.39677115466316937, - "grad_norm": 1.5057908958686839, - "learning_rate": 3.4738363689169227e-07, - "loss": 1.6344, - "step": 10740 + "epoch": 0.18600804649083594, + "grad_norm": 1.0693060290055107, + "learning_rate": 5.551896289529716e-07, + "loss": 1.656, + "step": 1387 }, { - "epoch": 0.39751002087297044, - "grad_norm": 1.6211537727930727, - "learning_rate": 3.4688892355048133e-07, - "loss": 1.6684, - "step": 10760 + "epoch": 0.18614215467143497, + "grad_norm": 1.1745807906563366, + "learning_rate": 5.551268437511724e-07, + "loss": 1.6985, + "step": 1388 }, { - "epoch": 0.3982488870827715, - "grad_norm": 1.7112433425010558, - "learning_rate": 3.4639382285425217e-07, - "loss": 1.6742, - "step": 10780 + "epoch": 0.18627626285203397, + "grad_norm": 1.099307648055397, + "learning_rate": 5.550640185811148e-07, + "loss": 1.6393, + "step": 1389 }, { - "epoch": 0.3989877532925725, - "grad_norm": 1.7626819549867558, - "learning_rate": 3.4589833747072765e-07, - "loss": 1.6497, - "step": 10800 + "epoch": 0.186410371032633, + "grad_norm": 1.1139438125947954, + "learning_rate": 5.550011534539527e-07, + "loss": 1.6638, + "step": 1390 }, { - "epoch": 0.3997266195023736, - "grad_norm": 1.536514259186305, - "learning_rate": 3.4540247006970395e-07, - "loss": 1.6533, - "step": 10820 + "epoch": 0.186544479213232, + "grad_norm": 1.0670126218487324, + "learning_rate": 5.549382483808472e-07, + "loss": 1.6649, + "step": 1391 }, { - "epoch": 0.40046548571217466, - "grad_norm": 1.4352156142464503, - "learning_rate": 3.449062233230351e-07, - "loss": 1.6423, - "step": 10840 + "epoch": 0.18667858739383103, + "grad_norm": 1.1017328082824618, + "learning_rate": 5.548753033729658e-07, + "loss": 1.6979, + "step": 1392 }, { - "epoch": 0.40120435192197573, - "grad_norm": 1.517870844401341, - "learning_rate": 3.4440959990461936e-07, - "loss": 1.6888, - "step": 10860 + "epoch": 0.18681269557443003, + "grad_norm": 1.1113229457677472, + "learning_rate": 5.548123184414838e-07, + "loss": 1.6629, + "step": 1393 }, { - "epoch": 0.4019432181317768, - "grad_norm": 1.6903764999597104, - "learning_rate": 3.4391260249038467e-07, - "loss": 1.6242, - "step": 10880 + "epoch": 0.18694680375502906, + "grad_norm": 1.061154042048288, + "learning_rate": 5.547492935975834e-07, + "loss": 1.6141, + "step": 1394 }, { - "epoch": 0.4026820843415779, - "grad_norm": 1.9353070894961153, - "learning_rate": 3.4341523375827407e-07, - "loss": 1.6219, - "step": 10900 + "epoch": 0.18708091193562806, + "grad_norm": 1.1037785149371337, + "learning_rate": 5.546862288524536e-07, + "loss": 1.619, + "step": 1395 }, { - "epoch": 0.4034209505513789, - "grad_norm": 1.70733565978221, - "learning_rate": 3.4291749638823144e-07, - "loss": 1.6524, - "step": 10920 + "epoch": 0.1872150201162271, + "grad_norm": 1.042211773070437, + "learning_rate": 5.546231242172909e-07, + "loss": 1.6314, + "step": 1396 }, { - "epoch": 0.40415981676117996, - "grad_norm": 1.3794756923120337, - "learning_rate": 3.4241939306218655e-07, - "loss": 1.647, - "step": 10940 + "epoch": 0.1873491282968261, + "grad_norm": 1.0991209850271397, + "learning_rate": 5.545599797032986e-07, + "loss": 1.6851, + "step": 1397 }, { - "epoch": 0.404898682970981, - "grad_norm": 1.4536895089620647, - "learning_rate": 3.4192092646404166e-07, - "loss": 1.6697, - "step": 10960 + "epoch": 0.18748323647742512, + "grad_norm": 1.0820523032730966, + "learning_rate": 5.544967953216872e-07, + "loss": 1.614, + "step": 1398 }, { - "epoch": 0.4056375491807821, - "grad_norm": 1.4185925084451405, - "learning_rate": 3.41422099279656e-07, - "loss": 1.6916, - "step": 10980 + "epoch": 0.18761734465802413, + "grad_norm": 1.0852415180954882, + "learning_rate": 5.544335710836741e-07, + "loss": 1.7069, + "step": 1399 }, { - "epoch": 0.40637641539058317, - "grad_norm": 1.5516883391882288, - "learning_rate": 3.40922914196832e-07, - "loss": 1.6702, - "step": 11000 + "epoch": 0.18775145283862316, + "grad_norm": 1.1386169714316008, + "learning_rate": 5.543703070004842e-07, + "loss": 1.7039, + "step": 1400 }, { - "epoch": 0.40711528160038424, - "grad_norm": 1.500896700694977, - "learning_rate": 3.4042337390530027e-07, - "loss": 1.6379, - "step": 11020 + "epoch": 0.18788556101922219, + "grad_norm": 1.1108755081549002, + "learning_rate": 5.543070030833488e-07, + "loss": 1.5328, + "step": 1401 }, { - "epoch": 0.40785414781018525, - "grad_norm": 1.4488842610705819, - "learning_rate": 3.399234810967055e-07, - "loss": 1.6322, - "step": 11040 + "epoch": 0.1880196691998212, + "grad_norm": 1.1300665980334217, + "learning_rate": 5.542436593435071e-07, + "loss": 1.5492, + "step": 1402 }, { - "epoch": 0.4085930140199863, - "grad_norm": 1.5363179452812292, - "learning_rate": 3.394232384645918e-07, - "loss": 1.7085, - "step": 11060 + "epoch": 0.18815377738042022, + "grad_norm": 1.1648000097455284, + "learning_rate": 5.541802757922047e-07, + "loss": 1.7602, + "step": 1403 }, { - "epoch": 0.4093318802297874, - "grad_norm": 1.6587795154693055, - "learning_rate": 3.389226487043883e-07, - "loss": 1.6212, - "step": 11080 + "epoch": 0.18828788556101922, + "grad_norm": 1.2562290482462215, + "learning_rate": 5.541168524406944e-07, + "loss": 1.7935, + "step": 1404 }, { - "epoch": 0.41007074643958846, - "grad_norm": 2.185811847037595, - "learning_rate": 3.3842171451339446e-07, - "loss": 1.653, - "step": 11100 + "epoch": 0.18842199374161825, + "grad_norm": 1.0533823510103384, + "learning_rate": 5.540533893002363e-07, + "loss": 1.6259, + "step": 1405 }, { - "epoch": 0.41080961264938953, - "grad_norm": 1.4930598472252423, - "learning_rate": 3.3792043859076556e-07, - "loss": 1.6401, - "step": 11120 + "epoch": 0.18855610192221725, + "grad_norm": 1.1167925185725207, + "learning_rate": 5.539898863820975e-07, + "loss": 1.6887, + "step": 1406 }, { - "epoch": 0.41154847885919055, - "grad_norm": 1.585267885050689, - "learning_rate": 3.3741882363749836e-07, - "loss": 1.6081, - "step": 11140 + "epoch": 0.18869021010281628, + "grad_norm": 1.1001250134186094, + "learning_rate": 5.539263436975518e-07, + "loss": 1.6111, + "step": 1407 }, { - "epoch": 0.4122873450689916, - "grad_norm": 1.5745770350836434, - "learning_rate": 3.3691687235641633e-07, - "loss": 1.6657, - "step": 11160 + "epoch": 0.18882431828341528, + "grad_norm": 1.0625193817660576, + "learning_rate": 5.538627612578808e-07, + "loss": 1.6671, + "step": 1408 }, { - "epoch": 0.4130262112787927, - "grad_norm": 1.638169374979827, - "learning_rate": 3.364145874521552e-07, - "loss": 1.6439, - "step": 11180 + "epoch": 0.1889584264640143, + "grad_norm": 1.080439840869442, + "learning_rate": 5.537991390743723e-07, + "loss": 1.6131, + "step": 1409 }, { - "epoch": 0.41376507748859376, - "grad_norm": 1.5771694576157802, - "learning_rate": 3.3591197163114807e-07, - "loss": 1.6344, - "step": 11200 + "epoch": 0.1890925346446133, + "grad_norm": 1.0665498302900025, + "learning_rate": 5.537354771583218e-07, + "loss": 1.6202, + "step": 1410 }, { - "epoch": 0.41450394369839483, - "grad_norm": 1.507624879108444, - "learning_rate": 3.3540902760161153e-07, - "loss": 1.6414, - "step": 11220 + "epoch": 0.18922664282521234, + "grad_norm": 1.062235350568671, + "learning_rate": 5.536717755210317e-07, + "loss": 1.7539, + "step": 1411 }, { - "epoch": 0.4152428099081959, - "grad_norm": 1.5517359392564993, - "learning_rate": 3.349057580735304e-07, - "loss": 1.6103, - "step": 11240 + "epoch": 0.18936075100581135, + "grad_norm": 1.1086535902273902, + "learning_rate": 5.536080341738112e-07, + "loss": 1.6395, + "step": 1412 }, { - "epoch": 0.4159816761179969, - "grad_norm": 1.6910189529581492, - "learning_rate": 3.3440216575864336e-07, - "loss": 1.6097, - "step": 11260 + "epoch": 0.18949485918641037, + "grad_norm": 1.0667252815429409, + "learning_rate": 5.535442531279765e-07, + "loss": 1.6353, + "step": 1413 }, { - "epoch": 0.416720542327798, - "grad_norm": 1.4817048826322234, - "learning_rate": 3.338982533704284e-07, - "loss": 1.6322, - "step": 11280 + "epoch": 0.18962896736700938, + "grad_norm": 1.046416301897227, + "learning_rate": 5.534804323948516e-07, + "loss": 1.6511, + "step": 1414 }, { - "epoch": 0.41745940853759905, - "grad_norm": 2.4572073331823843, - "learning_rate": 3.3339402362408803e-07, - "loss": 1.6818, - "step": 11300 + "epoch": 0.1897630755476084, + "grad_norm": 1.075372549798005, + "learning_rate": 5.534165719857666e-07, + "loss": 1.7723, + "step": 1415 }, { - "epoch": 0.4181982747474001, - "grad_norm": 1.4690103698141457, - "learning_rate": 3.32889479236535e-07, - "loss": 1.6734, - "step": 11320 + "epoch": 0.1898971837282074, + "grad_norm": 1.104299289930867, + "learning_rate": 5.533526719120594e-07, + "loss": 1.6641, + "step": 1416 }, { - "epoch": 0.4189371409572012, - "grad_norm": 1.4525562290767953, - "learning_rate": 3.323846229263772e-07, - "loss": 1.6777, - "step": 11340 + "epoch": 0.19003129190880644, + "grad_norm": 1.0927891670744394, + "learning_rate": 5.532887321850742e-07, + "loss": 1.5863, + "step": 1417 }, { - "epoch": 0.4196760071670022, - "grad_norm": 1.6088576080590102, - "learning_rate": 3.318794574139033e-07, - "loss": 1.6815, - "step": 11360 + "epoch": 0.19016540008940547, + "grad_norm": 1.1166521049854372, + "learning_rate": 5.532247528161629e-07, + "loss": 1.6574, + "step": 1418 }, { - "epoch": 0.4204148733768033, - "grad_norm": 1.658735344378412, - "learning_rate": 3.3137398542106816e-07, - "loss": 1.7156, - "step": 11380 + "epoch": 0.19029950827000447, + "grad_norm": 1.1320778263461202, + "learning_rate": 5.531607338166842e-07, + "loss": 1.6688, + "step": 1419 }, { - "epoch": 0.42115373958660435, - "grad_norm": 1.856711421074202, - "learning_rate": 3.308682096714777e-07, - "loss": 1.6056, - "step": 11400 + "epoch": 0.1904336164506035, + "grad_norm": 1.1471242711580207, + "learning_rate": 5.530966751980036e-07, + "loss": 1.6654, + "step": 1420 }, { - "epoch": 0.4218926057964054, - "grad_norm": 1.524820866790581, - "learning_rate": 3.3036213289037494e-07, - "loss": 1.653, - "step": 11420 + "epoch": 0.1905677246312025, + "grad_norm": 1.0867888184745689, + "learning_rate": 5.530325769714941e-07, + "loss": 1.5906, + "step": 1421 }, { - "epoch": 0.4226314720062065, - "grad_norm": 2.091225075765613, - "learning_rate": 3.298557578046248e-07, - "loss": 1.6344, - "step": 11440 + "epoch": 0.19070183281180153, + "grad_norm": 1.4483826712692085, + "learning_rate": 5.529684391485354e-07, + "loss": 1.5822, + "step": 1422 }, { - "epoch": 0.42337033821600756, - "grad_norm": 1.5873899994137428, - "learning_rate": 3.2934908714269926e-07, - "loss": 1.7056, - "step": 11460 + "epoch": 0.19083594099240053, + "grad_norm": 1.255407468844781, + "learning_rate": 5.529042617405144e-07, + "loss": 1.7131, + "step": 1423 }, { - "epoch": 0.4241092044258086, - "grad_norm": 1.530785170405434, - "learning_rate": 3.2884212363466336e-07, - "loss": 1.6592, - "step": 11480 + "epoch": 0.19097004917299956, + "grad_norm": 1.158464569939825, + "learning_rate": 5.528400447588247e-07, + "loss": 1.7756, + "step": 1424 }, { - "epoch": 0.42484807063560964, - "grad_norm": 1.4187769683759475, - "learning_rate": 3.283348700121599e-07, - "loss": 1.6155, - "step": 11500 + "epoch": 0.19110415735359856, + "grad_norm": 1.0950885308074678, + "learning_rate": 5.527757882148672e-07, + "loss": 1.5582, + "step": 1425 }, { - "epoch": 0.4255869368454107, - "grad_norm": 1.7098484503844666, - "learning_rate": 3.278273290083948e-07, - "loss": 1.6145, - "step": 11520 + "epoch": 0.1912382655341976, + "grad_norm": 1.1070256742947473, + "learning_rate": 5.527114921200501e-07, + "loss": 1.6467, + "step": 1426 }, { - "epoch": 0.4263258030552118, - "grad_norm": 1.6337855300981592, - "learning_rate": 3.2731950335812245e-07, - "loss": 1.6718, - "step": 11540 + "epoch": 0.1913723737147966, + "grad_norm": 1.0928498062976033, + "learning_rate": 5.52647156485788e-07, + "loss": 1.7125, + "step": 1427 }, { - "epoch": 0.42706466926501285, - "grad_norm": 1.562376692174843, - "learning_rate": 3.2681139579763116e-07, - "loss": 1.6299, - "step": 11560 + "epoch": 0.19150648189539562, + "grad_norm": 1.1327469366060336, + "learning_rate": 5.525827813235029e-07, + "loss": 1.6743, + "step": 1428 }, { - "epoch": 0.4278035354748139, - "grad_norm": 1.7830680336877842, - "learning_rate": 3.263030090647282e-07, - "loss": 1.6427, - "step": 11580 + "epoch": 0.19164059007599463, + "grad_norm": 1.0882012662709442, + "learning_rate": 5.525183666446239e-07, + "loss": 1.6799, + "step": 1429 }, { - "epoch": 0.42854240168461494, - "grad_norm": 1.67004917671626, - "learning_rate": 3.2579434589872487e-07, - "loss": 1.6645, - "step": 11600 + "epoch": 0.19177469825659366, + "grad_norm": 1.1709943857735898, + "learning_rate": 5.524539124605868e-07, + "loss": 1.766, + "step": 1430 }, { - "epoch": 0.429281267894416, - "grad_norm": 1.6704228734275928, - "learning_rate": 3.2528540904042226e-07, - "loss": 1.6427, - "step": 11620 + "epoch": 0.19190880643719266, + "grad_norm": 1.0839291014706198, + "learning_rate": 5.523894187828345e-07, + "loss": 1.6322, + "step": 1431 }, { - "epoch": 0.4300201341042171, - "grad_norm": 1.4195450351330696, - "learning_rate": 3.24776201232096e-07, - "loss": 1.62, - "step": 11640 + "epoch": 0.1920429146177917, + "grad_norm": 1.0975188778434444, + "learning_rate": 5.523248856228172e-07, + "loss": 1.7589, + "step": 1432 }, { - "epoch": 0.43075900031401815, - "grad_norm": 1.5285023969215334, - "learning_rate": 3.242667252174816e-07, - "loss": 1.6654, - "step": 11660 + "epoch": 0.1921770227983907, + "grad_norm": 1.1022611138397802, + "learning_rate": 5.522603129919919e-07, + "loss": 1.6493, + "step": 1433 }, { - "epoch": 0.4314978665238192, - "grad_norm": 1.4602369388272751, - "learning_rate": 3.2375698374176e-07, - "loss": 1.6073, - "step": 11680 + "epoch": 0.19231113097898972, + "grad_norm": 1.0944356638645014, + "learning_rate": 5.521957009018224e-07, + "loss": 1.6845, + "step": 1434 }, { - "epoch": 0.43223673273362023, - "grad_norm": 1.4791375841387864, - "learning_rate": 3.232469795515423e-07, - "loss": 1.6277, - "step": 11700 + "epoch": 0.19244523915958872, + "grad_norm": 1.1206597827966063, + "learning_rate": 5.521310493637798e-07, + "loss": 1.6926, + "step": 1435 }, { - "epoch": 0.4329755989434213, - "grad_norm": 1.4365509577307647, - "learning_rate": 3.227367153948551e-07, - "loss": 1.6678, - "step": 11720 + "epoch": 0.19257934734018775, + "grad_norm": 1.0956992634305383, + "learning_rate": 5.520663583893422e-07, + "loss": 1.6463, + "step": 1436 }, { - "epoch": 0.4337144651532224, - "grad_norm": 1.4925933032216425, - "learning_rate": 3.22226194021126e-07, - "loss": 1.6138, - "step": 11740 + "epoch": 0.19271345552078678, + "grad_norm": 1.0831083719944854, + "learning_rate": 5.520016279899947e-07, + "loss": 1.599, + "step": 1437 }, { - "epoch": 0.43445333136302344, - "grad_norm": 1.5965165214882902, - "learning_rate": 3.2171541818116844e-07, - "loss": 1.682, - "step": 11760 + "epoch": 0.19284756370138578, + "grad_norm": 1.391549260981187, + "learning_rate": 5.51936858177229e-07, + "loss": 1.6344, + "step": 1438 }, { - "epoch": 0.4351921975728245, - "grad_norm": 1.622561586319955, - "learning_rate": 3.2120439062716673e-07, - "loss": 1.6685, - "step": 11780 + "epoch": 0.1929816718819848, + "grad_norm": 1.1524973265055787, + "learning_rate": 5.518720489625443e-07, + "loss": 1.7242, + "step": 1439 }, { - "epoch": 0.4359310637826256, - "grad_norm": 1.5068996818021825, - "learning_rate": 3.206931141126622e-07, - "loss": 1.6353, - "step": 11800 + "epoch": 0.19311578006258381, + "grad_norm": 1.1802426876707486, + "learning_rate": 5.518072003574467e-07, + "loss": 1.6515, + "step": 1440 }, { - "epoch": 0.4366699299924266, - "grad_norm": 1.5980487695346257, - "learning_rate": 3.2018159139253667e-07, - "loss": 1.6442, - "step": 11820 + "epoch": 0.19324988824318284, + "grad_norm": 1.1402824833918361, + "learning_rate": 5.51742312373449e-07, + "loss": 1.8068, + "step": 1441 }, { - "epoch": 0.43740879620222767, - "grad_norm": 1.9446682447819341, - "learning_rate": 3.1966982522299927e-07, - "loss": 1.6215, - "step": 11840 + "epoch": 0.19338399642378185, + "grad_norm": 1.3034827789380141, + "learning_rate": 5.516773850220713e-07, + "loss": 1.5961, + "step": 1442 }, { - "epoch": 0.43814766241202874, - "grad_norm": 1.3911283325778476, - "learning_rate": 3.1915781836157076e-07, - "loss": 1.6237, - "step": 11860 + "epoch": 0.19351810460438088, + "grad_norm": 1.0690564805797904, + "learning_rate": 5.516124183148406e-07, + "loss": 1.6845, + "step": 1443 }, { - "epoch": 0.4388865286218298, - "grad_norm": 1.7379788181506113, - "learning_rate": 3.1864557356706854e-07, - "loss": 1.6311, - "step": 11880 + "epoch": 0.19365221278497988, + "grad_norm": 1.0643025118264189, + "learning_rate": 5.515474122632908e-07, + "loss": 1.6856, + "step": 1444 }, { - "epoch": 0.4396253948316309, - "grad_norm": 1.5960691894661032, - "learning_rate": 3.181330935995925e-07, - "loss": 1.6967, - "step": 11900 + "epoch": 0.1937863209655789, + "grad_norm": 1.1264779418191524, + "learning_rate": 5.51482366878963e-07, + "loss": 1.6055, + "step": 1445 }, { - "epoch": 0.44036426104143195, - "grad_norm": 1.334622875404918, - "learning_rate": 3.176203812205092e-07, - "loss": 1.7151, - "step": 11920 + "epoch": 0.1939204291461779, + "grad_norm": 1.024225937952105, + "learning_rate": 5.51417282173405e-07, + "loss": 1.6615, + "step": 1446 }, { - "epoch": 0.44110312725123296, - "grad_norm": 2.3408851593313287, - "learning_rate": 3.171074391924379e-07, - "loss": 1.6204, - "step": 11940 + "epoch": 0.19405453732677694, + "grad_norm": 1.161971897328525, + "learning_rate": 5.513521581581719e-07, + "loss": 1.6043, + "step": 1447 }, { - "epoch": 0.44184199346103403, - "grad_norm": 1.517416691835459, - "learning_rate": 3.16594270279235e-07, - "loss": 1.647, - "step": 11960 + "epoch": 0.19418864550737594, + "grad_norm": 1.0885797045193277, + "learning_rate": 5.512869948448252e-07, + "loss": 1.701, + "step": 1448 }, { - "epoch": 0.4425808596708351, - "grad_norm": 1.732092967222855, - "learning_rate": 3.160808772459796e-07, - "loss": 1.6246, - "step": 11980 + "epoch": 0.19432275368797497, + "grad_norm": 1.1421314031719336, + "learning_rate": 5.512217922449342e-07, + "loss": 1.6471, + "step": 1449 }, { - "epoch": 0.4433197258806362, - "grad_norm": 1.4748895033828555, - "learning_rate": 3.155672628589582e-07, - "loss": 1.6559, - "step": 12000 + "epoch": 0.19445686186857397, + "grad_norm": 1.077561352558914, + "learning_rate": 5.511565503700745e-07, + "loss": 1.7467, + "step": 1450 }, { - "epoch": 0.44405859209043724, - "grad_norm": 1.466688995230755, - "learning_rate": 3.1505342988565024e-07, - "loss": 1.6631, - "step": 12020 + "epoch": 0.194590970049173, + "grad_norm": 1.1713587803273386, + "learning_rate": 5.51091269231829e-07, + "loss": 1.833, + "step": 1451 }, { - "epoch": 0.44479745830023826, - "grad_norm": 1.5762348950247518, - "learning_rate": 3.145393810947129e-07, - "loss": 1.6507, - "step": 12040 + "epoch": 0.194725078229772, + "grad_norm": 1.1325441610620945, + "learning_rate": 5.510259488417875e-07, + "loss": 1.6516, + "step": 1452 }, { - "epoch": 0.4455363245100393, - "grad_norm": 1.5705066014221254, - "learning_rate": 3.1402511925596604e-07, - "loss": 1.6218, - "step": 12060 + "epoch": 0.19485918641037103, + "grad_norm": 1.105302401232543, + "learning_rate": 5.509605892115468e-07, + "loss": 1.6555, + "step": 1453 }, { - "epoch": 0.4462751907198404, - "grad_norm": 1.5033544192166477, - "learning_rate": 3.135106471403778e-07, - "loss": 1.6645, - "step": 12080 + "epoch": 0.19499329459097006, + "grad_norm": 1.1082502770943088, + "learning_rate": 5.508951903527105e-07, + "loss": 1.6901, + "step": 1454 }, { - "epoch": 0.44701405692964147, - "grad_norm": 1.8660368037827004, - "learning_rate": 3.1299596752004884e-07, - "loss": 1.6617, - "step": 12100 + "epoch": 0.19512740277156906, + "grad_norm": 1.2100417158092283, + "learning_rate": 5.508297522768895e-07, + "loss": 1.7645, + "step": 1455 }, { - "epoch": 0.44775292313944254, - "grad_norm": 1.6278625709035912, - "learning_rate": 3.124810831681987e-07, - "loss": 1.6383, - "step": 12120 + "epoch": 0.1952615109521681, + "grad_norm": 1.054087647701517, + "learning_rate": 5.507642749957011e-07, + "loss": 1.714, + "step": 1456 }, { - "epoch": 0.4484917893492436, - "grad_norm": 1.6698134882051106, - "learning_rate": 3.1196599685914916e-07, - "loss": 1.6691, - "step": 12140 + "epoch": 0.1953956191327671, + "grad_norm": 1.0560637240698765, + "learning_rate": 5.506987585207703e-07, + "loss": 1.6332, + "step": 1457 }, { - "epoch": 0.4492306555590446, - "grad_norm": 1.5877476217951574, - "learning_rate": 3.114507113683109e-07, - "loss": 1.6091, - "step": 12160 + "epoch": 0.19552972731336613, + "grad_norm": 1.110689185152269, + "learning_rate": 5.506332028637285e-07, + "loss": 1.6175, + "step": 1458 }, { - "epoch": 0.4499695217688457, - "grad_norm": 1.533714449161249, - "learning_rate": 3.109352294721674e-07, - "loss": 1.6721, - "step": 12180 + "epoch": 0.19566383549396513, + "grad_norm": 1.0676099046686827, + "learning_rate": 5.505676080362142e-07, + "loss": 1.753, + "step": 1459 }, { - "epoch": 0.45070838797864676, - "grad_norm": 1.415779061176635, - "learning_rate": 3.104195539482607e-07, - "loss": 1.606, - "step": 12200 + "epoch": 0.19579794367456416, + "grad_norm": 1.0306885085920625, + "learning_rate": 5.505019740498731e-07, + "loss": 1.5685, + "step": 1460 }, { - "epoch": 0.45144725418844783, - "grad_norm": 1.4338589085273825, - "learning_rate": 3.0990368757517605e-07, - "loss": 1.6661, - "step": 12220 + "epoch": 0.19593205185516316, + "grad_norm": 1.0775372740576943, + "learning_rate": 5.504363009163573e-07, + "loss": 1.6199, + "step": 1461 }, { - "epoch": 0.4521861203982489, - "grad_norm": 1.8998339669584823, - "learning_rate": 3.093876331325269e-07, - "loss": 1.609, - "step": 12240 + "epoch": 0.1960661600357622, + "grad_norm": 1.0643274573728114, + "learning_rate": 5.503705886473264e-07, + "loss": 1.6547, + "step": 1462 }, { - "epoch": 0.45292498660805, - "grad_norm": 1.384458068102408, - "learning_rate": 3.0889720974519455e-07, - "loss": 1.6454, - "step": 12260 + "epoch": 0.1962002682163612, + "grad_norm": 1.0711004226035805, + "learning_rate": 5.503048372544466e-07, + "loss": 1.7047, + "step": 1463 }, { - "epoch": 0.453663852817851, - "grad_norm": 1.4452081009096462, - "learning_rate": 3.083807965655827e-07, - "loss": 1.6452, - "step": 12280 + "epoch": 0.19633437639696022, + "grad_norm": 1.123667947934815, + "learning_rate": 5.502390467493915e-07, + "loss": 1.7008, + "step": 1464 }, { - "epoch": 0.45440271902765206, - "grad_norm": 1.5698647385968285, - "learning_rate": 3.0786420352211376e-07, - "loss": 1.6741, - "step": 12300 + "epoch": 0.19646848457755922, + "grad_norm": 1.0844329149084733, + "learning_rate": 5.501732171438408e-07, + "loss": 1.6279, + "step": 1465 }, { - "epoch": 0.45514158523745313, - "grad_norm": 1.9552580205602894, - "learning_rate": 3.0734743339831694e-07, - "loss": 1.6845, - "step": 12320 + "epoch": 0.19660259275815825, + "grad_norm": 1.436970815874584, + "learning_rate": 5.501073484494822e-07, + "loss": 1.6543, + "step": 1466 }, { - "epoch": 0.4558804514472542, - "grad_norm": 1.3583889408096808, - "learning_rate": 3.068304889786754e-07, - "loss": 1.6744, - "step": 12340 + "epoch": 0.19673670093875725, + "grad_norm": 1.1579140829195231, + "learning_rate": 5.500414406780093e-07, + "loss": 1.6149, + "step": 1467 }, { - "epoch": 0.45661931765705527, - "grad_norm": 1.6780668319449847, - "learning_rate": 3.063133730486116e-07, - "loss": 1.6258, - "step": 12360 + "epoch": 0.19687080911935628, + "grad_norm": 1.1219759034001007, + "learning_rate": 5.499754938411235e-07, + "loss": 1.6853, + "step": 1468 }, { - "epoch": 0.4573581838668563, - "grad_norm": 1.627173946323959, - "learning_rate": 3.057960883944719e-07, - "loss": 1.6198, - "step": 12380 + "epoch": 0.19700491729995528, + "grad_norm": 1.1456958318708046, + "learning_rate": 5.499095079505327e-07, + "loss": 1.6056, + "step": 1469 }, { - "epoch": 0.45809705007665735, - "grad_norm": 1.3800453841054778, - "learning_rate": 3.0527863780351194e-07, - "loss": 1.6268, - "step": 12400 + "epoch": 0.19713902548055431, + "grad_norm": 1.135367963109951, + "learning_rate": 5.498434830179519e-07, + "loss": 1.6775, + "step": 1470 }, { - "epoch": 0.4588359162864584, - "grad_norm": 1.5516028071383072, - "learning_rate": 3.047610240638816e-07, - "loss": 1.679, - "step": 12420 + "epoch": 0.19727313366115334, + "grad_norm": 1.0665068451667536, + "learning_rate": 5.497774190551028e-07, + "loss": 1.6953, + "step": 1471 }, { - "epoch": 0.4595747824962595, - "grad_norm": 1.546230302013408, - "learning_rate": 3.0424324996460955e-07, - "loss": 1.6234, - "step": 12440 + "epoch": 0.19740724184175235, + "grad_norm": 1.0531212330423794, + "learning_rate": 5.497113160737142e-07, + "loss": 1.6531, + "step": 1472 }, { - "epoch": 0.46031364870606056, - "grad_norm": 1.5739393391599368, - "learning_rate": 3.037253182955887e-07, - "loss": 1.703, - "step": 12460 + "epoch": 0.19754135002235138, + "grad_norm": 1.1454744923401645, + "learning_rate": 5.496451740855217e-07, + "loss": 1.7061, + "step": 1473 }, { - "epoch": 0.46105251491586163, - "grad_norm": 1.5792552039289542, - "learning_rate": 3.0320723184756095e-07, - "loss": 1.6453, - "step": 12480 + "epoch": 0.19767545820295038, + "grad_norm": 1.1044037302229577, + "learning_rate": 5.49578993102268e-07, + "loss": 1.6111, + "step": 1474 }, { - "epoch": 0.46179138112566265, - "grad_norm": 1.5239329095833032, - "learning_rate": 3.026889934121023e-07, - "loss": 1.6553, - "step": 12500 + "epoch": 0.1978095663835494, + "grad_norm": 1.0685087974547518, + "learning_rate": 5.495127731357029e-07, + "loss": 1.572, + "step": 1475 }, { - "epoch": 0.4625302473354637, - "grad_norm": 1.4558048272931619, - "learning_rate": 3.021706057816074e-07, - "loss": 1.6563, - "step": 12520 + "epoch": 0.1979436745641484, + "grad_norm": 1.0974414948618096, + "learning_rate": 5.494465141975826e-07, + "loss": 1.6854, + "step": 1476 }, { - "epoch": 0.4632691135452648, - "grad_norm": 1.5801820167249694, - "learning_rate": 3.0165207174927513e-07, - "loss": 1.6645, - "step": 12540 + "epoch": 0.19807778274474744, + "grad_norm": 1.0834578832501205, + "learning_rate": 5.493802162996703e-07, + "loss": 1.6889, + "step": 1477 }, { - "epoch": 0.46400797975506586, - "grad_norm": 1.5560547577828236, - "learning_rate": 3.01133394109093e-07, - "loss": 1.6596, - "step": 12560 + "epoch": 0.19821189092534644, + "grad_norm": 1.070274290599906, + "learning_rate": 5.493138794537367e-07, + "loss": 1.6939, + "step": 1478 }, { - "epoch": 0.46474684596486693, - "grad_norm": 1.6818881647492323, - "learning_rate": 3.006145756558223e-07, - "loss": 1.6335, - "step": 12580 + "epoch": 0.19834599910594547, + "grad_norm": 1.115057911105637, + "learning_rate": 5.49247503671559e-07, + "loss": 1.6584, + "step": 1479 }, { - "epoch": 0.465485712174668, - "grad_norm": 1.6120666995517767, - "learning_rate": 3.0009561918498335e-07, - "loss": 1.6685, - "step": 12600 + "epoch": 0.19848010728654447, + "grad_norm": 1.1561061527897827, + "learning_rate": 5.491810889649211e-07, + "loss": 1.7095, + "step": 1480 }, { - "epoch": 0.466224578384469, - "grad_norm": 1.4949729602626867, - "learning_rate": 2.995765274928398e-07, - "loss": 1.6753, - "step": 12620 + "epoch": 0.1986142154671435, + "grad_norm": 1.1456838684818837, + "learning_rate": 5.491146353456139e-07, + "loss": 1.5911, + "step": 1481 }, { - "epoch": 0.4669634445942701, - "grad_norm": 1.5289962949889762, - "learning_rate": 2.9905730337638395e-07, - "loss": 1.6548, - "step": 12640 + "epoch": 0.1987483236477425, + "grad_norm": 1.0828440940723576, + "learning_rate": 5.490481428254358e-07, + "loss": 1.6674, + "step": 1482 }, { - "epoch": 0.46770231080407115, - "grad_norm": 1.8299373423521412, - "learning_rate": 2.98537949633322e-07, - "loss": 1.5999, - "step": 12660 + "epoch": 0.19888243182834153, + "grad_norm": 1.1636923332921367, + "learning_rate": 5.489816114161914e-07, + "loss": 1.7205, + "step": 1483 }, { - "epoch": 0.4684411770138722, - "grad_norm": 1.5948007806430553, - "learning_rate": 2.9801846906205794e-07, - "loss": 1.6638, - "step": 12680 + "epoch": 0.19901654000894053, + "grad_norm": 1.197166180009061, + "learning_rate": 5.489150411296926e-07, + "loss": 1.5965, + "step": 1484 }, { - "epoch": 0.4691800432236733, - "grad_norm": 1.418583561219425, - "learning_rate": 2.974988644616799e-07, - "loss": 1.6782, - "step": 12700 + "epoch": 0.19915064818953956, + "grad_norm": 1.9827106666547534, + "learning_rate": 5.488484319777578e-07, + "loss": 1.7469, + "step": 1485 }, { - "epoch": 0.4699189094334743, - "grad_norm": 1.461318006445296, - "learning_rate": 2.9700512775939907e-07, - "loss": 1.6528, - "step": 12720 + "epoch": 0.19928475637013857, + "grad_norm": 1.140838885188788, + "learning_rate": 5.487817839722128e-07, + "loss": 1.7168, + "step": 1486 }, { - "epoch": 0.4706577756432754, - "grad_norm": 1.5468327583259127, - "learning_rate": 2.964852893556419e-07, - "loss": 1.6685, - "step": 12740 + "epoch": 0.1994188645507376, + "grad_norm": 1.0817633006855307, + "learning_rate": 5.487150971248901e-07, + "loss": 1.5428, + "step": 1487 }, { - "epoch": 0.47139664185307645, - "grad_norm": 1.6470459204833447, - "learning_rate": 2.9596533518391615e-07, - "loss": 1.6733, - "step": 12760 + "epoch": 0.19955297273133663, + "grad_norm": 1.076792423128002, + "learning_rate": 5.486483714476288e-07, + "loss": 1.788, + "step": 1488 }, { - "epoch": 0.4721355080628775, - "grad_norm": 1.624503313092944, - "learning_rate": 2.954452680458612e-07, - "loss": 1.6737, - "step": 12780 + "epoch": 0.19968708091193563, + "grad_norm": 1.1267981548935038, + "learning_rate": 5.485816069522754e-07, + "loss": 1.692, + "step": 1489 }, { - "epoch": 0.4728743742726786, - "grad_norm": 1.5728828027087576, - "learning_rate": 2.949250907437256e-07, - "loss": 1.6671, - "step": 12800 + "epoch": 0.19982118909253466, + "grad_norm": 1.0735390096180335, + "learning_rate": 5.485148036506829e-07, + "loss": 1.6896, + "step": 1490 }, { - "epoch": 0.47361324048247966, - "grad_norm": 1.679151732155206, - "learning_rate": 2.944048060803512e-07, - "loss": 1.656, - "step": 12820 + "epoch": 0.19995529727313366, + "grad_norm": 1.067799342487284, + "learning_rate": 5.484479615547114e-07, + "loss": 1.5558, + "step": 1491 }, { - "epoch": 0.4743521066922807, - "grad_norm": 1.4259988112675113, - "learning_rate": 2.938844168591584e-07, - "loss": 1.6088, - "step": 12840 + "epoch": 0.2000894054537327, + "grad_norm": 1.134188777380917, + "learning_rate": 5.483810806762278e-07, + "loss": 1.6667, + "step": 1492 }, { - "epoch": 0.47509097290208174, - "grad_norm": 2.10422922646524, - "learning_rate": 2.933639258841309e-07, - "loss": 1.6411, - "step": 12860 + "epoch": 0.2002235136343317, + "grad_norm": 1.0312169428441251, + "learning_rate": 5.483141610271059e-07, + "loss": 1.5311, + "step": 1493 }, { - "epoch": 0.4758298391118828, - "grad_norm": 1.809412517307293, - "learning_rate": 2.92843335959801e-07, - "loss": 1.654, - "step": 12880 + "epoch": 0.20035762181493072, + "grad_norm": 1.113434318828811, + "learning_rate": 5.482472026192263e-07, + "loss": 1.662, + "step": 1494 }, { - "epoch": 0.4765687053216839, - "grad_norm": 1.6010915209622532, - "learning_rate": 2.923226498912336e-07, - "loss": 1.6653, - "step": 12900 + "epoch": 0.20049172999552972, + "grad_norm": 1.0830554984993648, + "learning_rate": 5.481802054644767e-07, + "loss": 1.6549, + "step": 1495 }, { - "epoch": 0.47730757153148495, - "grad_norm": 1.7399335136485357, - "learning_rate": 2.918018704840123e-07, - "loss": 1.6839, - "step": 12920 + "epoch": 0.20062583817612875, + "grad_norm": 1.1263172768039542, + "learning_rate": 5.481131695747516e-07, + "loss": 1.7273, + "step": 1496 }, { - "epoch": 0.478046437741286, - "grad_norm": 1.9845153410774579, - "learning_rate": 2.912810005442231e-07, - "loss": 1.6308, - "step": 12940 + "epoch": 0.20075994635672775, + "grad_norm": 1.0175973585933547, + "learning_rate": 5.480460949619521e-07, + "loss": 1.6573, + "step": 1497 }, { - "epoch": 0.47878530395108704, - "grad_norm": 1.4672730941447367, - "learning_rate": 2.9076004287844007e-07, - "loss": 1.7158, - "step": 12960 + "epoch": 0.20089405453732678, + "grad_norm": 1.0684638665771677, + "learning_rate": 5.479789816379866e-07, + "loss": 1.5783, + "step": 1498 }, { - "epoch": 0.4795241701608881, - "grad_norm": 1.3537458462825016, - "learning_rate": 2.9023900029371e-07, - "loss": 1.5888, - "step": 12980 + "epoch": 0.20102816271792578, + "grad_norm": 1.100911731230959, + "learning_rate": 5.479118296147701e-07, + "loss": 1.7139, + "step": 1499 }, { - "epoch": 0.4802630363706892, - "grad_norm": 1.585460577335508, - "learning_rate": 2.8971787559753695e-07, - "loss": 1.6476, - "step": 13000 + "epoch": 0.20116227089852481, + "grad_norm": 1.0645364314712737, + "learning_rate": 5.478446389042245e-07, + "loss": 1.6684, + "step": 1500 }, { - "epoch": 0.48100190258049025, - "grad_norm": 1.561928549919643, - "learning_rate": 2.891966715978679e-07, - "loss": 1.6339, - "step": 13020 + "epoch": 0.20129637907912382, + "grad_norm": 1.0556389823591241, + "learning_rate": 5.477774095182787e-07, + "loss": 1.5132, + "step": 1501 }, { - "epoch": 0.4817407687902913, - "grad_norm": 1.439464952580829, - "learning_rate": 2.886753911030767e-07, - "loss": 1.6619, - "step": 13040 + "epoch": 0.20143048725972285, + "grad_norm": 1.2334210157237786, + "learning_rate": 5.477101414688683e-07, + "loss": 1.6951, + "step": 1502 }, { - "epoch": 0.48247963500009233, - "grad_norm": 1.5693967956885457, - "learning_rate": 2.8815403692194954e-07, - "loss": 1.6443, - "step": 13060 + "epoch": 0.20156459544032185, + "grad_norm": 1.058485353217571, + "learning_rate": 5.47642834767936e-07, + "loss": 1.6295, + "step": 1503 }, { - "epoch": 0.4832185012098934, - "grad_norm": 1.8445144793183739, - "learning_rate": 2.8763261186366977e-07, - "loss": 1.6395, - "step": 13080 + "epoch": 0.20169870362092088, + "grad_norm": 1.0445219837933504, + "learning_rate": 5.475754894274309e-07, + "loss": 1.6173, + "step": 1504 }, { - "epoch": 0.4839573674196945, - "grad_norm": 1.4215590880054088, - "learning_rate": 2.8711111873780224e-07, - "loss": 1.6583, - "step": 13100 + "epoch": 0.20183281180151988, + "grad_norm": 1.1004187774444296, + "learning_rate": 5.475081054593096e-07, + "loss": 1.739, + "step": 1505 }, { - "epoch": 0.48469623362949554, - "grad_norm": 1.6129407222161285, - "learning_rate": 2.8658956035427917e-07, - "loss": 1.6579, - "step": 13120 + "epoch": 0.2019669199821189, + "grad_norm": 1.1602467124536924, + "learning_rate": 5.47440682875535e-07, + "loss": 1.6625, + "step": 1506 }, { - "epoch": 0.4854350998392966, - "grad_norm": 1.7787904262576621, - "learning_rate": 2.8606793952338394e-07, - "loss": 1.6387, - "step": 13140 + "epoch": 0.20210102816271794, + "grad_norm": 1.0567141838600442, + "learning_rate": 5.47373221688077e-07, + "loss": 1.7637, + "step": 1507 }, { - "epoch": 0.4861739660490977, - "grad_norm": 2.9317837538381384, - "learning_rate": 2.8554625905573646e-07, - "loss": 1.6258, - "step": 13160 + "epoch": 0.20223513634331694, + "grad_norm": 1.1231422155189525, + "learning_rate": 5.473057219089128e-07, + "loss": 1.6322, + "step": 1508 }, { - "epoch": 0.4869128322588987, - "grad_norm": 1.6449106895888608, - "learning_rate": 2.850245217622784e-07, - "loss": 1.6492, - "step": 13180 + "epoch": 0.20236924452391597, + "grad_norm": 1.090099414447627, + "learning_rate": 5.472381835500258e-07, + "loss": 1.7463, + "step": 1509 }, { - "epoch": 0.48765169846869977, - "grad_norm": 1.5321621721627146, - "learning_rate": 2.8450273045425677e-07, - "loss": 1.6456, - "step": 13200 + "epoch": 0.20250335270451497, + "grad_norm": 1.036240114395212, + "learning_rate": 5.471706066234064e-07, + "loss": 1.5938, + "step": 1510 }, { - "epoch": 0.48839056467850084, - "grad_norm": 1.5327848701302575, - "learning_rate": 2.8398088794321054e-07, - "loss": 1.6299, - "step": 13220 + "epoch": 0.202637460885114, + "grad_norm": 1.0971271814274632, + "learning_rate": 5.471029911410524e-07, + "loss": 1.729, + "step": 1511 }, { - "epoch": 0.4891294308883019, - "grad_norm": 1.5262317315528862, - "learning_rate": 2.8345899704095424e-07, - "loss": 1.6815, - "step": 13240 + "epoch": 0.202771569065713, + "grad_norm": 1.0884227452009132, + "learning_rate": 5.470353371149678e-07, + "loss": 1.6752, + "step": 1512 }, { - "epoch": 0.489868297098103, - "grad_norm": 8.056093277940944, - "learning_rate": 2.8293706055956266e-07, - "loss": 1.6196, - "step": 13260 + "epoch": 0.20290567724631203, + "grad_norm": 1.0387697751366196, + "learning_rate": 5.469676445571636e-07, + "loss": 1.6329, + "step": 1513 }, { - "epoch": 0.49060716330790405, - "grad_norm": 1.7903474479157373, - "learning_rate": 2.8241508131135704e-07, - "loss": 1.6748, - "step": 13280 + "epoch": 0.20303978542691103, + "grad_norm": 1.0513306520797294, + "learning_rate": 5.468999134796577e-07, + "loss": 1.7112, + "step": 1514 }, { - "epoch": 0.49134602951770506, - "grad_norm": 2.3280755640085857, - "learning_rate": 2.818930621088883e-07, - "loss": 1.674, - "step": 13300 + "epoch": 0.20317389360751006, + "grad_norm": 1.0894137924530085, + "learning_rate": 5.46832143894475e-07, + "loss": 1.6982, + "step": 1515 }, { - "epoch": 0.49208489572750613, - "grad_norm": 1.7132266058410768, - "learning_rate": 2.8137100576492324e-07, - "loss": 1.6407, - "step": 13320 + "epoch": 0.20330800178810907, + "grad_norm": 1.0770824339698073, + "learning_rate": 5.467643358136469e-07, + "loss": 1.7484, + "step": 1516 }, { - "epoch": 0.4928237619373072, - "grad_norm": 1.652779406776925, - "learning_rate": 2.808489150924283e-07, - "loss": 1.6672, - "step": 13340 + "epoch": 0.2034421099687081, + "grad_norm": 1.095801657453924, + "learning_rate": 5.466964892492119e-07, + "loss": 1.6417, + "step": 1517 }, { - "epoch": 0.4935626281471083, - "grad_norm": 1.597072673714322, - "learning_rate": 2.8032679290455525e-07, - "loss": 1.6326, - "step": 13360 + "epoch": 0.2035762181493071, + "grad_norm": 1.0796491311299437, + "learning_rate": 5.466286042132154e-07, + "loss": 1.701, + "step": 1518 }, { - "epoch": 0.49430149435690934, - "grad_norm": 1.483890002284729, - "learning_rate": 2.798046420146254e-07, - "loss": 1.6953, - "step": 13380 + "epoch": 0.20371032632990613, + "grad_norm": 1.1233329399203666, + "learning_rate": 5.465606807177093e-07, + "loss": 1.7951, + "step": 1519 }, { - "epoch": 0.49504036056671036, - "grad_norm": 1.5673926854706393, - "learning_rate": 2.792824652361149e-07, - "loss": 1.6348, - "step": 13400 + "epoch": 0.20384443451050513, + "grad_norm": 1.1327885765244115, + "learning_rate": 5.464927187747525e-07, + "loss": 1.7971, + "step": 1520 }, { - "epoch": 0.49577922677651143, - "grad_norm": 1.3752789014048936, - "learning_rate": 2.7876026538263935e-07, - "loss": 1.6333, - "step": 13420 + "epoch": 0.20397854269110416, + "grad_norm": 1.088717235432573, + "learning_rate": 5.464247183964108e-07, + "loss": 1.7474, + "step": 1521 }, { - "epoch": 0.4965180929863125, - "grad_norm": 1.439519752453901, - "learning_rate": 2.7823804526793863e-07, - "loss": 1.6322, - "step": 13440 + "epoch": 0.20411265087170316, + "grad_norm": 1.1850510030757087, + "learning_rate": 5.463566795947566e-07, + "loss": 1.755, + "step": 1522 }, { - "epoch": 0.49725695919611357, - "grad_norm": 1.6858659909371638, - "learning_rate": 2.777158077058619e-07, - "loss": 1.6087, - "step": 13460 + "epoch": 0.2042467590523022, + "grad_norm": 1.0812752508540497, + "learning_rate": 5.462886023818697e-07, + "loss": 1.7443, + "step": 1523 }, { - "epoch": 0.49799582540591464, - "grad_norm": 1.475020677300443, - "learning_rate": 2.771935555103521e-07, - "loss": 1.6085, - "step": 13480 + "epoch": 0.20438086723290122, + "grad_norm": 1.1119200217165637, + "learning_rate": 5.462204867698359e-07, + "loss": 1.7364, + "step": 1524 }, { - "epoch": 0.4987346916157157, - "grad_norm": 1.5498271971579036, - "learning_rate": 2.766712914954314e-07, - "loss": 1.6546, - "step": 13500 + "epoch": 0.20451497541350022, + "grad_norm": 1.0637313799778825, + "learning_rate": 5.461523327707483e-07, + "loss": 1.6503, + "step": 1525 }, { - "epoch": 0.4994735578255167, - "grad_norm": 2.096090843883931, - "learning_rate": 2.7614901847518525e-07, - "loss": 1.6812, - "step": 13520 + "epoch": 0.20464908359409925, + "grad_norm": 1.0673393108107518, + "learning_rate": 5.460841403967067e-07, + "loss": 1.7131, + "step": 1526 }, { - "epoch": 0.5002124240353178, - "grad_norm": 1.4457832913454574, - "learning_rate": 2.756267392637479e-07, - "loss": 1.6581, - "step": 13540 + "epoch": 0.20478319177469825, + "grad_norm": 1.1295826465075736, + "learning_rate": 5.46015909659818e-07, + "loss": 1.6669, + "step": 1527 }, { - "epoch": 0.5009512902451189, - "grad_norm": 2.01817520318154, - "learning_rate": 2.751044566752869e-07, - "loss": 1.6615, - "step": 13560 + "epoch": 0.20491729995529728, + "grad_norm": 1.037795106149209, + "learning_rate": 5.459476405721954e-07, + "loss": 1.7402, + "step": 1528 }, { - "epoch": 0.5016901564549199, - "grad_norm": 1.4227402127659055, - "learning_rate": 2.745821735239878e-07, - "loss": 1.6324, - "step": 13580 + "epoch": 0.20505140813589628, + "grad_norm": 1.0645070431850514, + "learning_rate": 5.458793331459591e-07, + "loss": 1.5445, + "step": 1529 }, { - "epoch": 0.502429022664721, - "grad_norm": 1.8405513240063371, - "learning_rate": 2.7405989262403955e-07, - "loss": 1.6698, - "step": 13600 + "epoch": 0.20518551631649531, + "grad_norm": 1.128995508468257, + "learning_rate": 5.458109873932364e-07, + "loss": 1.648, + "step": 1530 }, { - "epoch": 0.503167888874522, - "grad_norm": 1.4788179775173926, - "learning_rate": 2.7353761678961865e-07, - "loss": 1.6359, - "step": 13620 + "epoch": 0.20531962449709432, + "grad_norm": 1.1073104845376167, + "learning_rate": 5.45742603326161e-07, + "loss": 1.6629, + "step": 1531 }, { - "epoch": 0.5039067550843231, - "grad_norm": 1.7223731354636942, - "learning_rate": 2.730153488348744e-07, - "loss": 1.6306, - "step": 13640 + "epoch": 0.20545373267769335, + "grad_norm": 1.0389720964404514, + "learning_rate": 5.456741809568737e-07, + "loss": 1.6007, + "step": 1532 }, { - "epoch": 0.5046456212941242, - "grad_norm": 2.5321925077821406, - "learning_rate": 2.724930915739137e-07, - "loss": 1.6752, - "step": 13660 + "epoch": 0.20558784085829235, + "grad_norm": 1.0874355308974621, + "learning_rate": 5.456057202975218e-07, + "loss": 1.7692, + "step": 1533 }, { - "epoch": 0.5053844875039252, - "grad_norm": 1.5208216957527443, - "learning_rate": 2.7197084782078585e-07, - "loss": 1.6439, - "step": 13680 + "epoch": 0.20572194903889138, + "grad_norm": 1.1762066099415274, + "learning_rate": 5.455372213602598e-07, + "loss": 1.7199, + "step": 1534 }, { - "epoch": 0.5061233537137263, - "grad_norm": 1.4323741561095633, - "learning_rate": 2.7144862038946716e-07, - "loss": 1.644, - "step": 13700 + "epoch": 0.20585605721949038, + "grad_norm": 1.1248545879023728, + "learning_rate": 5.454686841572487e-07, + "loss": 1.6949, + "step": 1535 }, { - "epoch": 0.5068622199235273, - "grad_norm": 1.426194444263622, - "learning_rate": 2.709264120938464e-07, - "loss": 1.6383, - "step": 13720 + "epoch": 0.2059901654000894, + "grad_norm": 1.1062297817819333, + "learning_rate": 5.454001087006563e-07, + "loss": 1.6879, + "step": 1536 }, { - "epoch": 0.5076010861333284, - "grad_norm": 1.9190094996790648, - "learning_rate": 2.7040422574770866e-07, - "loss": 1.6015, - "step": 13740 + "epoch": 0.2061242735806884, + "grad_norm": 1.5278212260735322, + "learning_rate": 5.453314950026572e-07, + "loss": 1.6452, + "step": 1537 }, { - "epoch": 0.5083399523431295, - "grad_norm": 1.5070566631142777, - "learning_rate": 2.698820641647212e-07, - "loss": 1.6841, - "step": 13760 + "epoch": 0.20625838176128744, + "grad_norm": 1.1382568321141864, + "learning_rate": 5.452628430754329e-07, + "loss": 1.6296, + "step": 1538 }, { - "epoch": 0.5090788185529306, - "grad_norm": 1.9970969408548236, - "learning_rate": 2.693599301584179e-07, - "loss": 1.6346, - "step": 13780 + "epoch": 0.20639248994188644, + "grad_norm": 1.0827447066590228, + "learning_rate": 5.451941529311719e-07, + "loss": 1.6213, + "step": 1539 }, { - "epoch": 0.5098176847627316, - "grad_norm": 1.683784538174349, - "learning_rate": 2.688378265421837e-07, - "loss": 1.6829, - "step": 13800 + "epoch": 0.20652659812248547, + "grad_norm": 1.090225177526994, + "learning_rate": 5.451254245820687e-07, + "loss": 1.7525, + "step": 1540 }, { - "epoch": 0.5105565509725326, - "grad_norm": 1.7421711729558282, - "learning_rate": 2.683157561292399e-07, - "loss": 1.626, - "step": 13820 + "epoch": 0.2066607063030845, + "grad_norm": 1.1632282700056857, + "learning_rate": 5.450566580403255e-07, + "loss": 1.7183, + "step": 1541 }, { - "epoch": 0.5112954171823337, - "grad_norm": 1.6638975974760875, - "learning_rate": 2.6779372173262917e-07, - "loss": 1.6847, - "step": 13840 + "epoch": 0.2067948144836835, + "grad_norm": 1.0773895407601781, + "learning_rate": 5.449878533181507e-07, + "loss": 1.5786, + "step": 1542 }, { - "epoch": 0.5120342833921347, - "grad_norm": 1.7300243765637946, - "learning_rate": 2.672717261651998e-07, - "loss": 1.6635, - "step": 13860 + "epoch": 0.20692892266428253, + "grad_norm": 1.1177081269020515, + "learning_rate": 5.449190104277597e-07, + "loss": 1.6153, + "step": 1543 }, { - "epoch": 0.5127731496019359, - "grad_norm": 1.7350443481000342, - "learning_rate": 2.667497722395909e-07, - "loss": 1.6648, - "step": 13880 + "epoch": 0.20706303084488153, + "grad_norm": 1.0715060717734257, + "learning_rate": 5.448501293813747e-07, + "loss": 1.6768, + "step": 1544 }, { - "epoch": 0.5135120158117369, - "grad_norm": 1.8257677624748465, - "learning_rate": 2.662278627682172e-07, - "loss": 1.642, - "step": 13900 + "epoch": 0.20719713902548056, + "grad_norm": 1.0810287574993174, + "learning_rate": 5.447812101912244e-07, + "loss": 1.6401, + "step": 1545 }, { - "epoch": 0.5142508820215379, - "grad_norm": 1.7828372493231617, - "learning_rate": 2.657060005632543e-07, - "loss": 1.6354, - "step": 13920 + "epoch": 0.20733124720607957, + "grad_norm": 1.130608952204106, + "learning_rate": 5.447122528695449e-07, + "loss": 1.6824, + "step": 1546 }, { - "epoch": 0.514989748231339, - "grad_norm": 1.4463498826235905, - "learning_rate": 2.6518418843662256e-07, - "loss": 1.6342, - "step": 13940 + "epoch": 0.2074653553866786, + "grad_norm": 1.0467682842596422, + "learning_rate": 5.446432574285782e-07, + "loss": 1.6087, + "step": 1547 }, { - "epoch": 0.51572861444114, - "grad_norm": 1.5876083742799603, - "learning_rate": 2.6466242919997263e-07, - "loss": 1.6541, - "step": 13960 + "epoch": 0.2075994635672776, + "grad_norm": 1.139618228642282, + "learning_rate": 5.445742238805737e-07, + "loss": 1.7645, + "step": 1548 }, { - "epoch": 0.5164674806509412, - "grad_norm": 1.4658443332943762, - "learning_rate": 2.641407256646705e-07, - "loss": 1.6865, - "step": 13980 + "epoch": 0.20773357174787663, + "grad_norm": 1.1216742451759847, + "learning_rate": 5.445051522377873e-07, + "loss": 1.7316, + "step": 1549 }, { - "epoch": 0.5172063468607422, - "grad_norm": 1.3991873689568013, - "learning_rate": 2.636190806417817e-07, + "epoch": 0.20786767992847563, + "grad_norm": 1.0899102977167905, + "learning_rate": 5.44436042512482e-07, "loss": 1.6322, - "step": 14000 - }, - { - "epoch": 0.5179452130705432, - "grad_norm": 2.1443694620412823, - "learning_rate": 2.6309749694205643e-07, - "loss": 1.6337, - "step": 14020 + "step": 1550 }, { - "epoch": 0.5186840792803443, - "grad_norm": 1.8812922050974208, - "learning_rate": 2.6257597737591484e-07, - "loss": 1.6003, - "step": 14040 + "epoch": 0.20800178810907466, + "grad_norm": 1.0497718485142342, + "learning_rate": 5.44366894716927e-07, + "loss": 1.6566, + "step": 1551 }, { - "epoch": 0.5194229454901453, - "grad_norm": 1.4849904179267404, - "learning_rate": 2.6205452475343135e-07, - "loss": 1.6554, - "step": 14060 + "epoch": 0.20813589628967366, + "grad_norm": 1.0712432967566454, + "learning_rate": 5.442977088633988e-07, + "loss": 1.6461, + "step": 1552 }, { - "epoch": 0.5201618116999465, - "grad_norm": 1.5710794059095268, - "learning_rate": 2.6153314188431934e-07, - "loss": 1.6585, - "step": 14080 + "epoch": 0.2082700044702727, + "grad_norm": 1.1933916778735016, + "learning_rate": 5.442284849641803e-07, + "loss": 1.7043, + "step": 1553 }, { - "epoch": 0.5209006779097475, - "grad_norm": 1.4300979250373247, - "learning_rate": 2.6101183157791687e-07, - "loss": 1.6266, - "step": 14100 + "epoch": 0.2084041126508717, + "grad_norm": 1.0126599257311222, + "learning_rate": 5.441592230315611e-07, + "loss": 1.6054, + "step": 1554 }, { - "epoch": 0.5216395441195486, - "grad_norm": 1.4201641845366786, - "learning_rate": 2.604905966431707e-07, - "loss": 1.6278, - "step": 14120 + "epoch": 0.20853822083147072, + "grad_norm": 1.3982183722799013, + "learning_rate": 5.440899230778381e-07, + "loss": 1.6898, + "step": 1555 }, { - "epoch": 0.5223784103293496, - "grad_norm": 1.4634294685934828, - "learning_rate": 2.5996943988862136e-07, - "loss": 1.6575, - "step": 14140 + "epoch": 0.20867232901206972, + "grad_norm": 1.056858598949215, + "learning_rate": 5.440205851153145e-07, + "loss": 1.6916, + "step": 1556 }, { - "epoch": 0.5231172765391506, - "grad_norm": 1.5428372121996694, - "learning_rate": 2.594483641223885e-07, - "loss": 1.6751, - "step": 14160 + "epoch": 0.20880643719266875, + "grad_norm": 1.176924033372761, + "learning_rate": 5.439512091563e-07, + "loss": 1.7511, + "step": 1557 }, { - "epoch": 0.5238561427489518, - "grad_norm": 1.738164845435304, - "learning_rate": 2.5892737215215507e-07, - "loss": 1.6492, - "step": 14180 + "epoch": 0.20894054537326778, + "grad_norm": 1.057297882595847, + "learning_rate": 5.438817952131117e-07, + "loss": 1.6588, + "step": 1558 }, { - "epoch": 0.5245950089587528, - "grad_norm": 1.5256411770058975, - "learning_rate": 2.584064667851527e-07, - "loss": 1.6491, - "step": 14200 + "epoch": 0.20907465355386678, + "grad_norm": 1.0967767040598801, + "learning_rate": 5.43812343298073e-07, + "loss": 1.6058, + "step": 1559 }, { - "epoch": 0.5253338751685539, - "grad_norm": 2.0408240630415513, - "learning_rate": 2.578856508281461e-07, - "loss": 1.6424, - "step": 14220 + "epoch": 0.20920876173446581, + "grad_norm": 1.0935764349197725, + "learning_rate": 5.437428534235142e-07, + "loss": 1.7097, + "step": 1560 }, { - "epoch": 0.5260727413783549, - "grad_norm": 1.5107852579348091, - "learning_rate": 2.573649270874187e-07, - "loss": 1.6575, - "step": 14240 + "epoch": 0.20934286991506482, + "grad_norm": 1.1271250900157348, + "learning_rate": 5.436733256017723e-07, + "loss": 1.6236, + "step": 1561 }, { - "epoch": 0.5268116075881559, - "grad_norm": 1.606923866961281, - "learning_rate": 2.568442983687567e-07, - "loss": 1.6678, - "step": 14260 + "epoch": 0.20947697809566385, + "grad_norm": 1.1745352200541934, + "learning_rate": 5.43603759845191e-07, + "loss": 1.6031, + "step": 1562 }, { - "epoch": 0.527550473797957, - "grad_norm": 1.86036331527246, - "learning_rate": 2.5632376747743416e-07, - "loss": 1.6611, - "step": 14280 + "epoch": 0.20961108627626285, + "grad_norm": 1.110585453023522, + "learning_rate": 5.435341561661208e-07, + "loss": 1.6934, + "step": 1563 }, { - "epoch": 0.5282893400077581, - "grad_norm": 1.6282520348397496, - "learning_rate": 2.5580333721819837e-07, - "loss": 1.6887, - "step": 14300 + "epoch": 0.20974519445686188, + "grad_norm": 1.1030892366405238, + "learning_rate": 5.434645145769189e-07, + "loss": 1.6745, + "step": 1564 }, { - "epoch": 0.5290282062175592, - "grad_norm": 1.4902965967534727, - "learning_rate": 2.5528301039525427e-07, - "loss": 1.673, - "step": 14320 + "epoch": 0.20987930263746088, + "grad_norm": 1.0681781865728208, + "learning_rate": 5.433948350899491e-07, + "loss": 1.6327, + "step": 1565 }, { - "epoch": 0.5297670724273602, - "grad_norm": 2.9289521410401607, - "learning_rate": 2.547627898122493e-07, - "loss": 1.618, - "step": 14340 + "epoch": 0.2100134108180599, + "grad_norm": 1.1588290451716836, + "learning_rate": 5.433251177175822e-07, + "loss": 1.6737, + "step": 1566 }, { - "epoch": 0.5305059386371612, - "grad_norm": 1.5801255890460382, - "learning_rate": 2.5424267827225884e-07, - "loss": 1.6478, - "step": 14360 + "epoch": 0.2101475189986589, + "grad_norm": 1.055357765245883, + "learning_rate": 5.432553624721957e-07, + "loss": 1.6018, + "step": 1567 }, { - "epoch": 0.5312448048469623, - "grad_norm": 1.904222753922445, - "learning_rate": 2.5372267857777017e-07, - "loss": 1.6543, - "step": 14380 + "epoch": 0.21028162717925794, + "grad_norm": 1.2241168862848832, + "learning_rate": 5.431855693661734e-07, + "loss": 1.6702, + "step": 1568 }, { - "epoch": 0.5319836710567634, - "grad_norm": 1.5136725876022765, - "learning_rate": 2.532027935306684e-07, - "loss": 1.658, - "step": 14400 + "epoch": 0.21041573535985694, + "grad_norm": 1.0592720322600389, + "learning_rate": 5.431157384119064e-07, + "loss": 1.6243, + "step": 1569 }, { - "epoch": 0.5327225372665645, - "grad_norm": 1.8648484080963088, - "learning_rate": 2.5268302593222056e-07, - "loss": 1.6279, - "step": 14420 + "epoch": 0.21054984354045597, + "grad_norm": 1.0780860574912356, + "learning_rate": 5.43045869621792e-07, + "loss": 1.5921, + "step": 1570 }, { - "epoch": 0.5334614034763655, - "grad_norm": 1.4732933175166334, - "learning_rate": 2.521633785830612e-07, - "loss": 1.6535, - "step": 14440 + "epoch": 0.21068395172105497, + "grad_norm": 1.0964102584808006, + "learning_rate": 5.429759630082348e-07, + "loss": 1.6461, + "step": 1571 }, { - "epoch": 0.5342002696861666, - "grad_norm": 1.7964137810644547, - "learning_rate": 2.5164385428317656e-07, - "loss": 1.6291, - "step": 14460 + "epoch": 0.210818059901654, + "grad_norm": 1.135891674611892, + "learning_rate": 5.429060185836456e-07, + "loss": 1.6602, + "step": 1572 }, { - "epoch": 0.5349391358959676, - "grad_norm": 1.7384258178088878, - "learning_rate": 2.5112445583189e-07, - "loss": 1.6484, - "step": 14480 + "epoch": 0.210952168082253, + "grad_norm": 1.104678715415077, + "learning_rate": 5.42836036360442e-07, + "loss": 1.5908, + "step": 1573 }, { - "epoch": 0.5356780021057687, - "grad_norm": 1.6118844731600752, - "learning_rate": 2.506051860278469e-07, - "loss": 1.6461, - "step": 14500 + "epoch": 0.21108627626285204, + "grad_norm": 1.1405223716065391, + "learning_rate": 5.427660163510486e-07, + "loss": 1.6062, + "step": 1574 }, { - "epoch": 0.5364168683155698, - "grad_norm": 1.612441861147252, - "learning_rate": 2.500860476689993e-07, - "loss": 1.6368, - "step": 14520 + "epoch": 0.21122038444345104, + "grad_norm": 1.055115497261772, + "learning_rate": 5.426959585678964e-07, + "loss": 1.614, + "step": 1575 }, { - "epoch": 0.5371557345253708, - "grad_norm": 1.4719276982885592, - "learning_rate": 2.4956704355259106e-07, - "loss": 1.616, - "step": 14540 + "epoch": 0.21135449262405007, + "grad_norm": 1.0866284593737212, + "learning_rate": 5.426258630234232e-07, + "loss": 1.623, + "step": 1576 }, { - "epoch": 0.5378946007351719, - "grad_norm": 1.4849285106056183, - "learning_rate": 2.4904817647514273e-07, - "loss": 1.6467, - "step": 14560 + "epoch": 0.2114886008046491, + "grad_norm": 1.1082738074471385, + "learning_rate": 5.425557297300736e-07, + "loss": 1.6905, + "step": 1577 }, { - "epoch": 0.5386334669449729, - "grad_norm": 2.0929018106610533, - "learning_rate": 2.485294492324364e-07, - "loss": 1.6517, - "step": 14580 + "epoch": 0.2116227089852481, + "grad_norm": 1.0561977130172522, + "learning_rate": 5.424855587002988e-07, + "loss": 1.7265, + "step": 1578 }, { - "epoch": 0.539372333154774, - "grad_norm": 1.3910097740422103, - "learning_rate": 2.480108646195006e-07, - "loss": 1.6319, - "step": 14600 + "epoch": 0.21175681716584713, + "grad_norm": 1.111034072593952, + "learning_rate": 5.424153499465566e-07, + "loss": 1.5797, + "step": 1579 }, { - "epoch": 0.5401111993645751, - "grad_norm": 1.8158803135234147, - "learning_rate": 2.474924254305956e-07, - "loss": 1.6902, - "step": 14620 + "epoch": 0.21189092534644613, + "grad_norm": 1.110485425151033, + "learning_rate": 5.42345103481312e-07, + "loss": 1.7321, + "step": 1580 }, { - "epoch": 0.5408500655743761, - "grad_norm": 1.6514040636762424, - "learning_rate": 2.4697413445919785e-07, - "loss": 1.6479, - "step": 14640 + "epoch": 0.21202503352704516, + "grad_norm": 1.057458554660141, + "learning_rate": 5.42274819317036e-07, + "loss": 1.6052, + "step": 1581 }, { - "epoch": 0.5415889317841772, - "grad_norm": 1.5739603939688216, - "learning_rate": 2.4645599449798536e-07, - "loss": 1.639, - "step": 14660 + "epoch": 0.21215914170764416, + "grad_norm": 1.0759547522338926, + "learning_rate": 5.422044974662066e-07, + "loss": 1.5403, + "step": 1582 }, { - "epoch": 0.5423277979939782, - "grad_norm": 1.5178753830207266, - "learning_rate": 2.459380083388221e-07, - "loss": 1.6235, - "step": 14680 + "epoch": 0.2122932498882432, + "grad_norm": 1.09889881778652, + "learning_rate": 5.421341379413087e-07, + "loss": 1.6477, + "step": 1583 }, { - "epoch": 0.5430666642037792, - "grad_norm": 1.52558838171546, - "learning_rate": 2.4542017877274397e-07, - "loss": 1.6835, - "step": 14700 + "epoch": 0.2124273580688422, + "grad_norm": 1.0824182868909191, + "learning_rate": 5.420637407548336e-07, + "loss": 1.6666, + "step": 1584 }, { - "epoch": 0.5438055304135804, - "grad_norm": 2.2408509501139533, - "learning_rate": 2.4490250858994243e-07, - "loss": 1.5869, - "step": 14720 + "epoch": 0.21256146624944122, + "grad_norm": 1.1246790227619754, + "learning_rate": 5.419933059192792e-07, + "loss": 1.7284, + "step": 1585 }, { - "epoch": 0.5445443966233814, - "grad_norm": 1.6053244248684069, - "learning_rate": 2.4438500057975043e-07, - "loss": 1.6698, - "step": 14740 + "epoch": 0.21269557443004022, + "grad_norm": 1.1784965009347046, + "learning_rate": 5.419228334471505e-07, + "loss": 1.6751, + "step": 1586 }, { - "epoch": 0.5452832628331825, - "grad_norm": 1.4975811830975623, - "learning_rate": 2.4386765753062733e-07, - "loss": 1.6337, - "step": 14760 + "epoch": 0.21282968261063925, + "grad_norm": 1.0981401155317758, + "learning_rate": 5.418523233509588e-07, + "loss": 1.5569, + "step": 1587 }, { - "epoch": 0.5460221290429835, - "grad_norm": 1.4849817547603397, - "learning_rate": 2.4335048223014316e-07, - "loss": 1.6095, - "step": 14780 + "epoch": 0.21296379079123826, + "grad_norm": 1.059671249600233, + "learning_rate": 5.417817756432223e-07, + "loss": 1.6094, + "step": 1588 }, { - "epoch": 0.5467609952527847, - "grad_norm": 1.8454272427613772, - "learning_rate": 2.4283347746496436e-07, - "loss": 1.6191, - "step": 14800 + "epoch": 0.21309789897183729, + "grad_norm": 1.0850751309161322, + "learning_rate": 5.417111903364658e-07, + "loss": 1.6205, + "step": 1589 }, { - "epoch": 0.5474998614625857, - "grad_norm": 1.484721990845683, - "learning_rate": 2.4231664602083857e-07, - "loss": 1.6156, - "step": 14820 + "epoch": 0.2132320071524363, + "grad_norm": 1.1513764671534936, + "learning_rate": 5.416405674432208e-07, + "loss": 1.6778, + "step": 1590 }, { - "epoch": 0.5482387276723867, - "grad_norm": 1.4970531164331227, - "learning_rate": 2.4179999068257935e-07, - "loss": 1.6903, - "step": 14840 + "epoch": 0.21336611533303532, + "grad_norm": 1.0380273585127677, + "learning_rate": 5.415699069760254e-07, + "loss": 1.6195, + "step": 1591 }, { - "epoch": 0.5489775938821878, - "grad_norm": 1.60919652354879, - "learning_rate": 2.412835142340513e-07, - "loss": 1.6813, - "step": 14860 + "epoch": 0.21350022351363432, + "grad_norm": 1.166702747823365, + "learning_rate": 5.414992089474245e-07, + "loss": 1.6814, + "step": 1592 }, { - "epoch": 0.5497164600919888, - "grad_norm": 1.3606018353206684, - "learning_rate": 2.4076721945815544e-07, - "loss": 1.6769, - "step": 14880 + "epoch": 0.21363433169423335, + "grad_norm": 1.1893324397979834, + "learning_rate": 5.414284733699695e-07, + "loss": 1.773, + "step": 1593 }, { - "epoch": 0.55045532630179, - "grad_norm": 1.458693168765768, - "learning_rate": 2.4025110913681355e-07, - "loss": 1.6373, - "step": 14900 + "epoch": 0.21376843987483238, + "grad_norm": 1.1127641897298384, + "learning_rate": 5.413577002562186e-07, + "loss": 1.7076, + "step": 1594 }, { - "epoch": 0.551194192511591, - "grad_norm": 1.547291419668359, - "learning_rate": 2.397351860509537e-07, - "loss": 1.6525, - "step": 14920 + "epoch": 0.21390254805543138, + "grad_norm": 1.080383382708094, + "learning_rate": 5.412868896187365e-07, + "loss": 1.7324, + "step": 1595 }, { - "epoch": 0.551933058721392, - "grad_norm": 1.7224542921095407, - "learning_rate": 2.392194529804951e-07, - "loss": 1.6761, - "step": 14940 + "epoch": 0.2140366562360304, + "grad_norm": 1.0952540724207267, + "learning_rate": 5.412160414700948e-07, + "loss": 1.7437, + "step": 1596 }, { - "epoch": 0.5526719249311931, - "grad_norm": 1.6677249547234672, - "learning_rate": 2.38703912704333e-07, - "loss": 1.625, - "step": 14960 + "epoch": 0.2141707644166294, + "grad_norm": 1.153542257175551, + "learning_rate": 5.411451558228716e-07, + "loss": 1.7386, + "step": 1597 }, { - "epoch": 0.5534107911409941, - "grad_norm": 1.4519952098563818, - "learning_rate": 2.3818856800032395e-07, - "loss": 1.6244, - "step": 14980 + "epoch": 0.21430487259722844, + "grad_norm": 1.111562609679836, + "learning_rate": 5.410742326896519e-07, + "loss": 1.6339, + "step": 1598 }, { - "epoch": 0.5541496573507952, - "grad_norm": 1.7967122495859562, - "learning_rate": 2.3767342164527055e-07, - "loss": 1.6719, - "step": 15000 + "epoch": 0.21443898077782744, + "grad_norm": 1.0752256282606487, + "learning_rate": 5.410032720830268e-07, + "loss": 1.6502, + "step": 1599 }, { - "epoch": 0.5548885235605963, - "grad_norm": 1.3751693238795433, - "learning_rate": 2.3715847641490688e-07, - "loss": 1.6397, - "step": 15020 + "epoch": 0.21457308895842647, + "grad_norm": 1.1124138961511616, + "learning_rate": 5.409322740155947e-07, + "loss": 1.6977, + "step": 1600 }, { - "epoch": 0.5556273897703973, - "grad_norm": 1.5461207825297583, - "learning_rate": 2.3664373508388318e-07, - "loss": 1.6871, - "step": 15040 + "epoch": 0.21470719713902547, + "grad_norm": 1.1079557958778445, + "learning_rate": 5.408612384999601e-07, + "loss": 1.752, + "step": 1601 }, { - "epoch": 0.5563662559801984, - "grad_norm": 1.3729095610665938, - "learning_rate": 2.3612920042575091e-07, - "loss": 1.6568, - "step": 15060 + "epoch": 0.2148413053196245, + "grad_norm": 1.0753628455770323, + "learning_rate": 5.407901655487346e-07, + "loss": 1.6314, + "step": 1602 }, { - "epoch": 0.5571051221899994, - "grad_norm": 1.5955595428086877, - "learning_rate": 2.3561487521294814e-07, - "loss": 1.6439, - "step": 15080 + "epoch": 0.2149754135002235, + "grad_norm": 1.083459999091914, + "learning_rate": 5.407190551745362e-07, + "loss": 1.6034, + "step": 1603 }, { - "epoch": 0.5578439883998005, - "grad_norm": 1.505255489966295, - "learning_rate": 2.351007622167843e-07, - "loss": 1.6114, - "step": 15100 + "epoch": 0.21510952168082254, + "grad_norm": 1.0970151998487565, + "learning_rate": 5.406479073899896e-07, + "loss": 1.6246, + "step": 1604 }, { - "epoch": 0.5585828546096016, - "grad_norm": 1.4629681148522744, - "learning_rate": 2.3458686420742528e-07, - "loss": 1.6114, - "step": 15120 + "epoch": 0.21524362986142154, + "grad_norm": 1.0937201976032398, + "learning_rate": 5.405767222077262e-07, + "loss": 1.7172, + "step": 1605 }, { - "epoch": 0.5593217208194027, - "grad_norm": 1.7359961722060924, - "learning_rate": 2.3407318395387875e-07, - "loss": 1.6416, - "step": 15140 + "epoch": 0.21537773804202057, + "grad_norm": 1.0450933325728613, + "learning_rate": 5.405054996403838e-07, + "loss": 1.6418, + "step": 1606 }, { - "epoch": 0.5600605870292037, - "grad_norm": 1.6390324621472498, - "learning_rate": 2.3355972422397895e-07, - "loss": 1.6625, - "step": 15160 + "epoch": 0.21551184622261957, + "grad_norm": 1.1080460169200497, + "learning_rate": 5.40434239700607e-07, + "loss": 1.5472, + "step": 1607 }, { - "epoch": 0.5607994532390047, - "grad_norm": 1.7925619507510513, - "learning_rate": 2.3304648778437175e-07, - "loss": 1.6822, - "step": 15180 + "epoch": 0.2156459544032186, + "grad_norm": 1.1272243483080113, + "learning_rate": 5.403629424010473e-07, + "loss": 1.6365, + "step": 1608 }, { - "epoch": 0.5615383194488058, - "grad_norm": 1.6256712121515025, - "learning_rate": 2.3253347740050012e-07, - "loss": 1.6793, - "step": 15200 + "epoch": 0.2157800625838176, + "grad_norm": 1.0764797457941864, + "learning_rate": 5.402916077543625e-07, + "loss": 1.6407, + "step": 1609 }, { - "epoch": 0.5622771856586068, - "grad_norm": 1.6887168187109596, - "learning_rate": 2.3202069583658883e-07, - "loss": 1.6403, - "step": 15220 + "epoch": 0.21591417076441663, + "grad_norm": 1.113524889126991, + "learning_rate": 5.402202357732169e-07, + "loss": 1.6827, + "step": 1610 }, { - "epoch": 0.563016051868408, - "grad_norm": 1.4622893380793243, - "learning_rate": 2.3150814585562984e-07, - "loss": 1.6256, - "step": 15240 + "epoch": 0.21604827894501566, + "grad_norm": 1.0108430825355625, + "learning_rate": 5.40148826470282e-07, + "loss": 1.6089, + "step": 1611 }, { - "epoch": 0.563754918078209, - "grad_norm": 1.720681049824639, - "learning_rate": 2.3099583021936703e-07, - "loss": 1.6331, - "step": 15260 + "epoch": 0.21618238712561466, + "grad_norm": 1.0591615486944377, + "learning_rate": 5.400773798582352e-07, + "loss": 1.6503, + "step": 1612 }, { - "epoch": 0.56449378428801, - "grad_norm": 1.6844323896773028, - "learning_rate": 2.3048375168828194e-07, - "loss": 1.6249, - "step": 15280 + "epoch": 0.2163164953062137, + "grad_norm": 1.0340063487662052, + "learning_rate": 5.400058959497611e-07, + "loss": 1.6383, + "step": 1613 }, { - "epoch": 0.5652326504978111, - "grad_norm": 1.4304416297000766, - "learning_rate": 2.2997191302157831e-07, - "loss": 1.6476, - "step": 15300 + "epoch": 0.2164506034868127, + "grad_norm": 1.1516572358715267, + "learning_rate": 5.399343747575507e-07, + "loss": 1.6974, + "step": 1614 }, { - "epoch": 0.5659715167076121, - "grad_norm": 2.6747036703519966, - "learning_rate": 2.2946031697716728e-07, - "loss": 1.6704, - "step": 15320 + "epoch": 0.21658471166741172, + "grad_norm": 1.0592103543406746, + "learning_rate": 5.398628162943016e-07, + "loss": 1.6353, + "step": 1615 }, { - "epoch": 0.5667103829174133, - "grad_norm": 1.8934913018327109, - "learning_rate": 2.2894896631165312e-07, - "loss": 1.6557, - "step": 15340 + "epoch": 0.21671881984801072, + "grad_norm": 1.0385313908985447, + "learning_rate": 5.39791220572718e-07, + "loss": 1.6162, + "step": 1616 }, { - "epoch": 0.5674492491272143, - "grad_norm": 1.5864443521535418, - "learning_rate": 2.2843786378031749e-07, - "loss": 1.6111, - "step": 15360 + "epoch": 0.21685292802860975, + "grad_norm": 1.2744072569777416, + "learning_rate": 5.397195876055107e-07, + "loss": 1.6091, + "step": 1617 }, { - "epoch": 0.5681881153370153, - "grad_norm": 1.6147764207744268, - "learning_rate": 2.279270121371053e-07, - "loss": 1.6617, - "step": 15380 + "epoch": 0.21698703620920876, + "grad_norm": 1.1238614219371639, + "learning_rate": 5.396479174053974e-07, + "loss": 1.6806, + "step": 1618 }, { - "epoch": 0.5689269815468164, - "grad_norm": 1.5889401903281988, - "learning_rate": 2.274164141346096e-07, - "loss": 1.6472, - "step": 15400 + "epoch": 0.21712114438980779, + "grad_norm": 1.1243988511377025, + "learning_rate": 5.39576209985102e-07, + "loss": 1.6404, + "step": 1619 }, { - "epoch": 0.5696658477566174, - "grad_norm": 1.8322046948313095, - "learning_rate": 2.2690607252405664e-07, - "loss": 1.681, - "step": 15420 + "epoch": 0.2172552525704068, + "grad_norm": 1.110274303539327, + "learning_rate": 5.395044653573553e-07, + "loss": 1.7572, + "step": 1620 }, { - "epoch": 0.5704047139664186, - "grad_norm": 1.319095874026253, - "learning_rate": 2.2639599005529124e-07, - "loss": 1.6339, - "step": 15440 + "epoch": 0.21738936075100582, + "grad_norm": 1.485784445158895, + "learning_rate": 5.394326835348946e-07, + "loss": 1.6521, + "step": 1621 }, { - "epoch": 0.5711435801762196, - "grad_norm": 1.568413450074265, - "learning_rate": 2.258861694767619e-07, - "loss": 1.6385, - "step": 15460 + "epoch": 0.21752346893160482, + "grad_norm": 1.1075544133593012, + "learning_rate": 5.393608645304638e-07, + "loss": 1.6241, + "step": 1622 }, { - "epoch": 0.5718824463860207, - "grad_norm": 1.659163649600049, - "learning_rate": 2.2537661353550603e-07, - "loss": 1.6292, - "step": 15480 + "epoch": 0.21765757711220385, + "grad_norm": 1.1036354518105045, + "learning_rate": 5.392890083568133e-07, + "loss": 1.7734, + "step": 1623 }, { - "epoch": 0.5726213125958217, - "grad_norm": 1.484851792665619, - "learning_rate": 2.2486732497713507e-07, - "loss": 1.6887, - "step": 15500 + "epoch": 0.21779168529280285, + "grad_norm": 1.1528361438777202, + "learning_rate": 5.392171150267002e-07, + "loss": 1.6317, + "step": 1624 }, { - "epoch": 0.5733601788056227, - "grad_norm": 1.609907878598695, - "learning_rate": 2.2435830654581962e-07, - "loss": 1.6266, - "step": 15520 + "epoch": 0.21792579347340188, + "grad_norm": 1.093945976907915, + "learning_rate": 5.391451845528883e-07, + "loss": 1.6645, + "step": 1625 }, { - "epoch": 0.5740990450154239, - "grad_norm": 1.4453575034227937, - "learning_rate": 2.2387499173937125e-07, - "loss": 1.6537, - "step": 15540 + "epoch": 0.21805990165400088, + "grad_norm": 1.0725853841774324, + "learning_rate": 5.390732169481478e-07, + "loss": 1.6491, + "step": 1626 }, { - "epoch": 0.5748379112252249, - "grad_norm": 1.7710876217433056, - "learning_rate": 2.2336650794320994e-07, - "loss": 1.6588, - "step": 15560 + "epoch": 0.2181940098345999, + "grad_norm": 1.1106862604843828, + "learning_rate": 5.390012122252557e-07, + "loss": 1.6931, + "step": 1627 }, { - "epoch": 0.575576777435026, - "grad_norm": 1.4085011499137292, - "learning_rate": 2.2285830236087167e-07, - "loss": 1.6293, - "step": 15580 + "epoch": 0.21832811801519894, + "grad_norm": 1.2277327010437984, + "learning_rate": 5.389291703969954e-07, + "loss": 1.6584, + "step": 1628 }, { - "epoch": 0.576315643644827, - "grad_norm": 1.4053148152524308, - "learning_rate": 2.2235037773069188e-07, - "loss": 1.629, - "step": 15600 + "epoch": 0.21846222619579794, + "grad_norm": 1.1082783806832028, + "learning_rate": 5.388570914761571e-07, + "loss": 1.6083, + "step": 1629 }, { - "epoch": 0.577054509854628, - "grad_norm": 1.456136317052379, - "learning_rate": 2.2184273678949212e-07, - "loss": 1.6448, - "step": 15620 + "epoch": 0.21859633437639697, + "grad_norm": 1.0835070473943422, + "learning_rate": 5.387849754755371e-07, + "loss": 1.6693, + "step": 1630 }, { - "epoch": 0.5777933760644292, - "grad_norm": 1.5709035364905237, - "learning_rate": 2.213353822725652e-07, - "loss": 1.6556, - "step": 15640 + "epoch": 0.21873044255699597, + "grad_norm": 1.0984810480873552, + "learning_rate": 5.38712822407939e-07, + "loss": 1.7465, + "step": 1631 }, { - "epoch": 0.5785322422742302, - "grad_norm": 2.381482655936729, - "learning_rate": 2.2082831691366104e-07, - "loss": 1.6298, - "step": 15660 + "epoch": 0.218864550737595, + "grad_norm": 1.0824052521651053, + "learning_rate": 5.386406322861723e-07, + "loss": 1.6514, + "step": 1632 }, { - "epoch": 0.5792711084840313, - "grad_norm": 1.510088899026219, - "learning_rate": 2.2032154344497096e-07, - "loss": 1.69, - "step": 15680 + "epoch": 0.218998658918194, + "grad_norm": 1.1359714482507233, + "learning_rate": 5.385684051230537e-07, + "loss": 1.7069, + "step": 1633 }, { - "epoch": 0.5800099746938323, - "grad_norm": 1.4208293328335637, - "learning_rate": 2.198150645971138e-07, - "loss": 1.6533, - "step": 15700 + "epoch": 0.21913276709879304, + "grad_norm": 1.1071556040519455, + "learning_rate": 5.384961409314061e-07, + "loss": 1.7147, + "step": 1634 }, { - "epoch": 0.5807488409036333, - "grad_norm": 1.5394108559637645, - "learning_rate": 2.1930888309912098e-07, - "loss": 1.6145, - "step": 15720 + "epoch": 0.21926687527939204, + "grad_norm": 1.2083127255075479, + "learning_rate": 5.384238397240588e-07, + "loss": 1.6825, + "step": 1635 }, { - "epoch": 0.5814877071134344, - "grad_norm": 1.8494498268185677, - "learning_rate": 2.188030016784216e-07, - "loss": 1.6262, - "step": 15740 + "epoch": 0.21940098345999107, + "grad_norm": 1.090487031491975, + "learning_rate": 5.383515015138481e-07, + "loss": 1.6754, + "step": 1636 }, { - "epoch": 0.5822265733232355, - "grad_norm": 2.390942191221342, - "learning_rate": 2.1829742306082778e-07, - "loss": 1.612, - "step": 15760 + "epoch": 0.21953509164059007, + "grad_norm": 1.1766814612885304, + "learning_rate": 5.382791263136168e-07, + "loss": 1.6694, + "step": 1637 }, { - "epoch": 0.5829654395330366, - "grad_norm": 2.4364332149226446, - "learning_rate": 2.1779214997052025e-07, - "loss": 1.6548, - "step": 15780 + "epoch": 0.2196691998211891, + "grad_norm": 1.122843389486521, + "learning_rate": 5.382067141362139e-07, + "loss": 1.6044, + "step": 1638 }, { - "epoch": 0.5837043057428376, - "grad_norm": 1.7161768355514782, - "learning_rate": 2.1728718513003342e-07, - "loss": 1.6822, - "step": 15800 + "epoch": 0.2198033080017881, + "grad_norm": 1.223339411577744, + "learning_rate": 5.381342649944952e-07, + "loss": 1.6101, + "step": 1639 }, { - "epoch": 0.5844431719526387, - "grad_norm": 1.6209379371159418, - "learning_rate": 2.1678253126024072e-07, - "loss": 1.6068, - "step": 15820 + "epoch": 0.21993741618238713, + "grad_norm": 1.0694591790206647, + "learning_rate": 5.380617789013233e-07, + "loss": 1.6867, + "step": 1640 }, { - "epoch": 0.5851820381624397, - "grad_norm": 2.1623351366291725, - "learning_rate": 2.1627819108034002e-07, - "loss": 1.6138, - "step": 15840 + "epoch": 0.22007152436298613, + "grad_norm": 1.2184481374104812, + "learning_rate": 5.379892558695671e-07, + "loss": 1.8251, + "step": 1641 }, { - "epoch": 0.5859209043722408, - "grad_norm": 1.3848518910214123, - "learning_rate": 2.1577416730783904e-07, - "loss": 1.6315, - "step": 15860 + "epoch": 0.22020563254358516, + "grad_norm": 1.144903181431307, + "learning_rate": 5.37916695912102e-07, + "loss": 1.6531, + "step": 1642 }, { - "epoch": 0.5866597705820419, - "grad_norm": 1.377598599479366, - "learning_rate": 2.1527046265854049e-07, - "loss": 1.6263, - "step": 15880 + "epoch": 0.22033974072418416, + "grad_norm": 1.0887276474568761, + "learning_rate": 5.378440990418099e-07, + "loss": 1.6042, + "step": 1643 }, { - "epoch": 0.5873986367918429, - "grad_norm": 1.5951258889353628, - "learning_rate": 2.1476707984652764e-07, - "loss": 1.6442, - "step": 15900 + "epoch": 0.2204738489047832, + "grad_norm": 1.0674234053275629, + "learning_rate": 5.377714652715797e-07, + "loss": 1.6711, + "step": 1644 }, { - "epoch": 0.588137503001644, - "grad_norm": 1.4119428291190372, - "learning_rate": 2.1426402158414964e-07, - "loss": 1.6776, - "step": 15920 + "epoch": 0.2206079570853822, + "grad_norm": 1.0790696186946844, + "learning_rate": 5.376987946143065e-07, + "loss": 1.6381, + "step": 1645 }, { - "epoch": 0.588876369211445, - "grad_norm": 1.5401792838637114, - "learning_rate": 2.1376129058200687e-07, - "loss": 1.6489, - "step": 15940 + "epoch": 0.22074206526598122, + "grad_norm": 1.1045544089627806, + "learning_rate": 5.376260870828918e-07, + "loss": 1.6532, + "step": 1646 }, { - "epoch": 0.589615235421246, - "grad_norm": 1.603780373356476, - "learning_rate": 2.1325888954893618e-07, - "loss": 1.6525, - "step": 15960 + "epoch": 0.22087617344658025, + "grad_norm": 1.1325732851922752, + "learning_rate": 5.375533426902441e-07, + "loss": 1.698, + "step": 1647 }, { - "epoch": 0.5903541016310472, - "grad_norm": 1.5200619012123444, - "learning_rate": 2.1275682119199674e-07, - "loss": 1.6103, - "step": 15980 + "epoch": 0.22101028162717926, + "grad_norm": 1.1364383071296065, + "learning_rate": 5.37480561449278e-07, + "loss": 1.6822, + "step": 1648 }, { - "epoch": 0.5910929678408482, - "grad_norm": 2.1303907208230637, - "learning_rate": 2.122550882164552e-07, - "loss": 1.6515, - "step": 16000 + "epoch": 0.22114438980777829, + "grad_norm": 1.2662493806229793, + "learning_rate": 5.374077433729149e-07, + "loss": 1.6811, + "step": 1649 }, { - "epoch": 0.5918318340506493, - "grad_norm": 1.4309458414094776, - "learning_rate": 2.1175369332577075e-07, - "loss": 1.6476, - "step": 16020 + "epoch": 0.2212784979883773, + "grad_norm": 1.0631367908379292, + "learning_rate": 5.373348884740827e-07, + "loss": 1.6659, + "step": 1650 }, { - "epoch": 0.5925707002604503, - "grad_norm": 1.3885096209200305, - "learning_rate": 2.112526392215811e-07, - "loss": 1.6161, - "step": 16040 + "epoch": 0.22141260616897632, + "grad_norm": 1.041940858543604, + "learning_rate": 5.372619967657157e-07, + "loss": 1.6331, + "step": 1651 }, { - "epoch": 0.5933095664702513, - "grad_norm": 1.4639170589501997, - "learning_rate": 2.107519286036879e-07, - "loss": 1.6626, - "step": 16060 + "epoch": 0.22154671434957532, + "grad_norm": 1.1280546628953805, + "learning_rate": 5.37189068260755e-07, + "loss": 1.56, + "step": 1652 }, { - "epoch": 0.5940484326800525, - "grad_norm": 1.5413296048888148, - "learning_rate": 2.102515641700417e-07, - "loss": 1.7111, - "step": 16080 + "epoch": 0.22168082253017435, + "grad_norm": 1.1849258060825412, + "learning_rate": 5.371161029721481e-07, + "loss": 1.7092, + "step": 1653 }, { - "epoch": 0.5947872988898535, - "grad_norm": 1.477261253181655, - "learning_rate": 2.0975154861672782e-07, - "loss": 1.6606, - "step": 16100 + "epoch": 0.22181493071077335, + "grad_norm": 1.049528339776241, + "learning_rate": 5.370431009128489e-07, + "loss": 1.6428, + "step": 1654 }, { - "epoch": 0.5955261650996546, - "grad_norm": 1.484117052461405, - "learning_rate": 2.0925188463795195e-07, - "loss": 1.6587, - "step": 16120 + "epoch": 0.22194903889137238, + "grad_norm": 1.0820046738092695, + "learning_rate": 5.36970062095818e-07, + "loss": 1.7025, + "step": 1655 }, { - "epoch": 0.5962650313094556, - "grad_norm": 1.492261770923395, - "learning_rate": 2.0875257492602505e-07, - "loss": 1.629, - "step": 16140 + "epoch": 0.22208314707197138, + "grad_norm": 1.154353230216256, + "learning_rate": 5.368969865340224e-07, + "loss": 1.6826, + "step": 1656 }, { - "epoch": 0.5970038975192568, - "grad_norm": 1.4469424063226348, - "learning_rate": 2.082536221713494e-07, - "loss": 1.6496, - "step": 16160 + "epoch": 0.2222172552525704, + "grad_norm": 1.053650977152218, + "learning_rate": 5.368238742404357e-07, + "loss": 1.6172, + "step": 1657 }, { - "epoch": 0.5977427637290578, - "grad_norm": 1.6092362505845061, - "learning_rate": 2.07755029062404e-07, - "loss": 1.6664, - "step": 16180 + "epoch": 0.2223513634331694, + "grad_norm": 1.1279575224119966, + "learning_rate": 5.367507252280381e-07, + "loss": 1.6856, + "step": 1658 }, { - "epoch": 0.5984816299388588, - "grad_norm": 1.779958420465131, - "learning_rate": 2.0725679828572983e-07, - "loss": 1.6212, - "step": 16200 + "epoch": 0.22248547161376844, + "grad_norm": 1.084009451627439, + "learning_rate": 5.36677539509816e-07, + "loss": 1.7398, + "step": 1659 }, { - "epoch": 0.5992204961486599, - "grad_norm": 2.256981377181274, - "learning_rate": 2.0675893252591558e-07, - "loss": 1.6603, - "step": 16220 + "epoch": 0.22261957979436744, + "grad_norm": 1.1545149862581074, + "learning_rate": 5.366043170987628e-07, + "loss": 1.7321, + "step": 1660 }, { - "epoch": 0.5999593623584609, - "grad_norm": 1.4438145967369689, - "learning_rate": 2.0626143446558313e-07, - "loss": 1.7086, - "step": 16240 + "epoch": 0.22275368797496647, + "grad_norm": 1.1304140083027916, + "learning_rate": 5.365310580078781e-07, + "loss": 1.773, + "step": 1661 }, { - "epoch": 0.600698228568262, - "grad_norm": 1.4523681015745287, - "learning_rate": 2.0576430678537314e-07, - "loss": 1.6363, - "step": 16260 + "epoch": 0.22288779615556548, + "grad_norm": 1.0642630051886424, + "learning_rate": 5.364577622501681e-07, + "loss": 1.711, + "step": 1662 }, { - "epoch": 0.6014370947780631, - "grad_norm": 2.081965836536827, - "learning_rate": 2.052675521639306e-07, - "loss": 1.6525, - "step": 16280 + "epoch": 0.2230219043361645, + "grad_norm": 1.040347865228387, + "learning_rate": 5.363844298386453e-07, + "loss": 1.631, + "step": 1663 }, { - "epoch": 0.6021759609878641, - "grad_norm": 1.641105539346371, - "learning_rate": 2.0477117327789017e-07, - "loss": 1.7219, - "step": 16300 + "epoch": 0.22315601251676354, + "grad_norm": 1.0625862966142028, + "learning_rate": 5.36311060786329e-07, + "loss": 1.7056, + "step": 1664 }, { - "epoch": 0.6029148271976652, - "grad_norm": 2.1960028742429887, - "learning_rate": 2.0427517280186225e-07, - "loss": 1.7079, - "step": 16320 + "epoch": 0.22329012069736254, + "grad_norm": 1.051398453698011, + "learning_rate": 5.36237655106245e-07, + "loss": 1.5779, + "step": 1665 }, { - "epoch": 0.6036536934074662, - "grad_norm": 1.421358868551972, - "learning_rate": 2.0377955340841817e-07, - "loss": 1.6494, - "step": 16340 + "epoch": 0.22342422887796157, + "grad_norm": 1.0373708741511485, + "learning_rate": 5.361642128114253e-07, + "loss": 1.6937, + "step": 1666 }, { - "epoch": 0.6043925596172673, - "grad_norm": 1.4519180712299584, - "learning_rate": 2.032843177680757e-07, - "loss": 1.6497, - "step": 16360 + "epoch": 0.22355833705856057, + "grad_norm": 1.0970775365230832, + "learning_rate": 5.360907339149088e-07, + "loss": 1.7652, + "step": 1667 }, { - "epoch": 0.6051314258270684, - "grad_norm": 1.4554186364319244, - "learning_rate": 2.0278946854928512e-07, - "loss": 1.6623, - "step": 16380 + "epoch": 0.2236924452391596, + "grad_norm": 1.0939499626158076, + "learning_rate": 5.360172184297405e-07, + "loss": 1.7164, + "step": 1668 }, { - "epoch": 0.6058702920368694, - "grad_norm": 1.453630709571824, - "learning_rate": 2.022950084184145e-07, - "loss": 1.6481, - "step": 16400 + "epoch": 0.2238265534197586, + "grad_norm": 1.2815989841015132, + "learning_rate": 5.359436663689721e-07, + "loss": 1.6641, + "step": 1669 }, { - "epoch": 0.6066091582466705, - "grad_norm": 1.504491667770329, - "learning_rate": 2.018009400397353e-07, - "loss": 1.677, - "step": 16420 + "epoch": 0.22396066160035763, + "grad_norm": 1.143698149806719, + "learning_rate": 5.358700777456621e-07, + "loss": 1.6344, + "step": 1670 }, { - "epoch": 0.6073480244564715, - "grad_norm": 1.388924417705384, - "learning_rate": 2.0130726607540828e-07, - "loss": 1.6496, - "step": 16440 + "epoch": 0.22409476978095663, + "grad_norm": 1.1716879090974532, + "learning_rate": 5.357964525728747e-07, + "loss": 1.6979, + "step": 1671 }, { - "epoch": 0.6080868906662726, - "grad_norm": 1.464940095501643, - "learning_rate": 2.0081398918546882e-07, - "loss": 1.6999, - "step": 16460 + "epoch": 0.22422887796155566, + "grad_norm": 1.063819709741502, + "learning_rate": 5.357227908636814e-07, + "loss": 1.624, + "step": 1672 }, { - "epoch": 0.6088257568760737, - "grad_norm": 1.7055463049168984, - "learning_rate": 2.0032111202781282e-07, - "loss": 1.6249, - "step": 16480 + "epoch": 0.22436298614215466, + "grad_norm": 1.2013467122145707, + "learning_rate": 5.356490926311598e-07, + "loss": 1.6952, + "step": 1673 }, { - "epoch": 0.6095646230858748, - "grad_norm": 1.6279220224411552, - "learning_rate": 1.9982863725818267e-07, - "loss": 1.6285, - "step": 16500 + "epoch": 0.2244970943227537, + "grad_norm": 1.0555387980604758, + "learning_rate": 5.355753578883939e-07, + "loss": 1.6313, + "step": 1674 }, { - "epoch": 0.6103034892956758, - "grad_norm": 2.0351245502127404, - "learning_rate": 1.9933656753015204e-07, - "loss": 1.6595, - "step": 16520 + "epoch": 0.2246312025033527, + "grad_norm": 1.0893242689976388, + "learning_rate": 5.355015866484744e-07, + "loss": 1.6749, + "step": 1675 }, { - "epoch": 0.6110423555054768, - "grad_norm": 2.018723900559302, - "learning_rate": 1.9884490549511252e-07, - "loss": 1.7325, - "step": 16540 + "epoch": 0.22476531068395172, + "grad_norm": 1.1013312078930966, + "learning_rate": 5.354277789244984e-07, + "loss": 1.6346, + "step": 1676 }, { - "epoch": 0.6117812217152779, - "grad_norm": 1.4930972850593807, - "learning_rate": 1.983782066004026e-07, - "loss": 1.6739, - "step": 16560 + "epoch": 0.22489941886455073, + "grad_norm": 1.0396725082524636, + "learning_rate": 5.353539347295696e-07, + "loss": 1.6516, + "step": 1677 }, { - "epoch": 0.612520087925079, - "grad_norm": 1.6719536221986355, - "learning_rate": 1.9788734718442834e-07, - "loss": 1.6453, - "step": 16580 + "epoch": 0.22503352704514976, + "grad_norm": 1.1068093515212976, + "learning_rate": 5.352800540767978e-07, + "loss": 1.6229, + "step": 1678 }, { - "epoch": 0.6132589541348801, - "grad_norm": 1.5901664783269642, - "learning_rate": 1.9739690327019692e-07, - "loss": 1.6688, - "step": 16600 + "epoch": 0.22516763522574876, + "grad_norm": 1.0984721962823492, + "learning_rate": 5.352061369792997e-07, + "loss": 1.6208, + "step": 1679 }, { - "epoch": 0.6139978203446811, - "grad_norm": 1.5005389488409309, - "learning_rate": 1.9693136881713379e-07, - "loss": 1.6697, - "step": 16620 + "epoch": 0.2253017434063478, + "grad_norm": 1.0826869933413177, + "learning_rate": 5.351321834501981e-07, + "loss": 1.677, + "step": 1680 }, { - "epoch": 0.6147366865544821, - "grad_norm": 1.5857034959363703, - "learning_rate": 1.9644174273011738e-07, - "loss": 1.6639, - "step": 16640 + "epoch": 0.22543585158694682, + "grad_norm": 1.084000373067938, + "learning_rate": 5.350581935026227e-07, + "loss": 1.7401, + "step": 1681 }, { - "epoch": 0.6154755527642832, - "grad_norm": 1.8800052700521002, - "learning_rate": 1.959525399341126e-07, - "loss": 1.6406, - "step": 16660 + "epoch": 0.22556995976754582, + "grad_norm": 1.0851285225408938, + "learning_rate": 5.349841671497093e-07, + "loss": 1.7231, + "step": 1682 }, { - "epoch": 0.6162144189740842, - "grad_norm": 1.5463318718925796, - "learning_rate": 1.954637630650633e-07, - "loss": 1.6456, - "step": 16680 + "epoch": 0.22570406794814485, + "grad_norm": 1.1364065037848023, + "learning_rate": 5.349101044046004e-07, + "loss": 1.6977, + "step": 1683 }, { - "epoch": 0.6169532851838854, - "grad_norm": 1.7265411721417883, - "learning_rate": 1.9497541475661822e-07, - "loss": 1.6396, - "step": 16700 + "epoch": 0.22583817612874385, + "grad_norm": 1.1009000528239055, + "learning_rate": 5.348360052804447e-07, + "loss": 1.7396, + "step": 1684 }, { - "epoch": 0.6176921513936864, - "grad_norm": 1.6019332231293413, - "learning_rate": 1.9448749764011674e-07, - "loss": 1.6319, - "step": 16720 + "epoch": 0.22597228430934288, + "grad_norm": 1.0627127199486133, + "learning_rate": 5.347618697903976e-07, + "loss": 1.6, + "step": 1685 }, { - "epoch": 0.6184310176034874, - "grad_norm": 1.6078339500202126, - "learning_rate": 1.940000143445753e-07, - "loss": 1.6287, - "step": 16740 + "epoch": 0.22610639248994188, + "grad_norm": 1.0936508465446555, + "learning_rate": 5.346876979476206e-07, + "loss": 1.6898, + "step": 1686 }, { - "epoch": 0.6191698838132885, - "grad_norm": 1.5200063311449286, - "learning_rate": 1.9351296749667239e-07, - "loss": 1.6556, - "step": 16760 + "epoch": 0.2262405006705409, + "grad_norm": 1.158039404421018, + "learning_rate": 5.346134897652824e-07, + "loss": 1.6173, + "step": 1687 }, { - "epoch": 0.6199087500230895, - "grad_norm": 1.5605900758303721, - "learning_rate": 1.9302635972073504e-07, - "loss": 1.6709, - "step": 16780 + "epoch": 0.2263746088511399, + "grad_norm": 1.1476901068480616, + "learning_rate": 5.345392452565574e-07, + "loss": 1.6939, + "step": 1688 }, { - "epoch": 0.6206476162328907, - "grad_norm": 1.5245501861602075, - "learning_rate": 1.9254019363872432e-07, - "loss": 1.6744, - "step": 16800 + "epoch": 0.22650871703173894, + "grad_norm": 1.1331738396979525, + "learning_rate": 5.344649644346266e-07, + "loss": 1.7156, + "step": 1689 }, { - "epoch": 0.6213864824426917, - "grad_norm": 1.4527294863239084, - "learning_rate": 1.9205447187022145e-07, - "loss": 1.6564, - "step": 16820 + "epoch": 0.22664282521233794, + "grad_norm": 1.0799876163240634, + "learning_rate": 5.343906473126778e-07, + "loss": 1.716, + "step": 1690 }, { - "epoch": 0.6221253486524928, - "grad_norm": 2.0368137299260276, - "learning_rate": 1.915691970324137e-07, - "loss": 1.6289, - "step": 16840 + "epoch": 0.22677693339293697, + "grad_norm": 1.082964627665107, + "learning_rate": 5.343162939039048e-07, + "loss": 1.7274, + "step": 1691 }, { - "epoch": 0.6228642148622938, - "grad_norm": 2.2640348268112147, - "learning_rate": 1.9108437174007967e-07, - "loss": 1.667, - "step": 16860 + "epoch": 0.22691104157353598, + "grad_norm": 1.0606670008679837, + "learning_rate": 5.342419042215082e-07, + "loss": 1.6872, + "step": 1692 }, { - "epoch": 0.6236030810720948, - "grad_norm": 1.4879411305430876, - "learning_rate": 1.9059999860557635e-07, - "loss": 1.6516, - "step": 16880 + "epoch": 0.227045149754135, + "grad_norm": 1.2139606651511192, + "learning_rate": 5.341674782786949e-07, + "loss": 1.6144, + "step": 1693 }, { - "epoch": 0.624341947281896, - "grad_norm": 1.99321589038771, - "learning_rate": 1.9011608023882396e-07, - "loss": 1.6617, - "step": 16900 + "epoch": 0.227179257934734, + "grad_norm": 1.1259721685135795, + "learning_rate": 5.340930160886783e-07, + "loss": 1.682, + "step": 1694 }, { - "epoch": 0.625080813491697, - "grad_norm": 1.4486992732108148, - "learning_rate": 1.8963261924729247e-07, - "loss": 1.6477, - "step": 16920 + "epoch": 0.22731336611533304, + "grad_norm": 1.1971458828681856, + "learning_rate": 5.340185176646779e-07, + "loss": 1.666, + "step": 1695 }, { - "epoch": 0.6258196797014981, - "grad_norm": 1.4436779823541692, - "learning_rate": 1.8914961823598742e-07, - "loss": 1.6276, - "step": 16940 + "epoch": 0.22744747429593204, + "grad_norm": 1.0623938370168757, + "learning_rate": 5.339439830199201e-07, + "loss": 1.6716, + "step": 1696 }, { - "epoch": 0.6265585459112991, - "grad_norm": 1.7823515681610929, - "learning_rate": 1.886670798074358e-07, - "loss": 1.6722, - "step": 16960 + "epoch": 0.22758158247653107, + "grad_norm": 1.0291752731398527, + "learning_rate": 5.338694121676374e-07, + "loss": 1.5643, + "step": 1697 }, { - "epoch": 0.6272974121211001, - "grad_norm": 1.4559994514082784, - "learning_rate": 1.8818500656167198e-07, - "loss": 1.6721, - "step": 16980 + "epoch": 0.2277156906571301, + "grad_norm": 1.073415400659899, + "learning_rate": 5.33794805121069e-07, + "loss": 1.7113, + "step": 1698 }, { - "epoch": 0.6280362783309013, - "grad_norm": 1.5502170823927217, - "learning_rate": 1.8770340109622418e-07, - "loss": 1.6468, - "step": 17000 + "epoch": 0.2278497988377291, + "grad_norm": 1.0719841904118037, + "learning_rate": 5.337201618934604e-07, + "loss": 1.6904, + "step": 1699 }, { - "epoch": 0.6287751445407023, - "grad_norm": 1.3693032988758314, - "learning_rate": 1.8722226600609974e-07, - "loss": 1.6503, - "step": 17020 + "epoch": 0.22798390701832813, + "grad_norm": 1.0589482779303245, + "learning_rate": 5.336454824980633e-07, + "loss": 1.6258, + "step": 1700 }, { - "epoch": 0.6295140107505034, - "grad_norm": 1.8228163395950472, - "learning_rate": 1.8674160388377174e-07, - "loss": 1.6691, - "step": 17040 + "epoch": 0.22811801519892713, + "grad_norm": 1.1032497481356218, + "learning_rate": 5.335707669481362e-07, + "loss": 1.6656, + "step": 1701 }, { - "epoch": 0.6302528769603044, - "grad_norm": 1.607512275964286, - "learning_rate": 1.8626141731916446e-07, - "loss": 1.6381, - "step": 17060 + "epoch": 0.22825212337952616, + "grad_norm": 1.0840451749643811, + "learning_rate": 5.334960152569437e-07, + "loss": 1.5383, + "step": 1702 }, { - "epoch": 0.6309917431701054, - "grad_norm": 1.6555733853411483, - "learning_rate": 1.8578170889964022e-07, - "loss": 1.624, - "step": 17080 + "epoch": 0.22838623156012516, + "grad_norm": 1.2721911706046112, + "learning_rate": 5.334212274377572e-07, + "loss": 1.6877, + "step": 1703 }, { - "epoch": 0.6317306093799065, - "grad_norm": 1.4667357369050853, - "learning_rate": 1.853024812099847e-07, - "loss": 1.6233, - "step": 17100 + "epoch": 0.2285203397407242, + "grad_norm": 1.113467373081235, + "learning_rate": 5.333464035038541e-07, + "loss": 1.7795, + "step": 1704 }, { - "epoch": 0.6324694755897076, - "grad_norm": 1.555065221242107, - "learning_rate": 1.8482373683239316e-07, - "loss": 1.6372, - "step": 17120 + "epoch": 0.2286544479213232, + "grad_norm": 1.0985371740747398, + "learning_rate": 5.332715434685184e-07, + "loss": 1.646, + "step": 1705 }, { - "epoch": 0.6332083417995087, - "grad_norm": 1.5169327799558363, - "learning_rate": 1.8434547834645714e-07, - "loss": 1.6738, - "step": 17140 + "epoch": 0.22878855610192222, + "grad_norm": 1.0986088766126445, + "learning_rate": 5.331966473450405e-07, + "loss": 1.7123, + "step": 1706 }, { - "epoch": 0.6339472080093097, - "grad_norm": 1.419410682586359, - "learning_rate": 1.8386770832914955e-07, - "loss": 1.6677, - "step": 17160 + "epoch": 0.22892266428252123, + "grad_norm": 1.0916765886457365, + "learning_rate": 5.331217151467172e-07, + "loss": 1.6558, + "step": 1707 }, { - "epoch": 0.6346860742191108, - "grad_norm": 1.6719841699284368, - "learning_rate": 1.833904293548116e-07, - "loss": 1.6821, - "step": 17180 + "epoch": 0.22905677246312026, + "grad_norm": 1.1105626967058537, + "learning_rate": 5.330467468868518e-07, + "loss": 1.6464, + "step": 1708 }, { - "epoch": 0.6354249404289118, - "grad_norm": 1.5798183541162123, - "learning_rate": 1.8291364399513864e-07, - "loss": 1.7092, - "step": 17200 + "epoch": 0.22919088064371926, + "grad_norm": 1.060186115294533, + "learning_rate": 5.329717425787539e-07, + "loss": 1.7554, + "step": 1709 }, { - "epoch": 0.6361638066387129, - "grad_norm": 1.4604030691233605, - "learning_rate": 1.8243735481916611e-07, - "loss": 1.662, - "step": 17220 + "epoch": 0.2293249888243183, + "grad_norm": 1.1194774279858801, + "learning_rate": 5.328967022357393e-07, + "loss": 1.6726, + "step": 1710 }, { - "epoch": 0.636902672848514, - "grad_norm": 1.7774575653306484, - "learning_rate": 1.8196156439325604e-07, - "loss": 1.655, - "step": 17240 + "epoch": 0.2294590970049173, + "grad_norm": 1.04897630046238, + "learning_rate": 5.328216258711307e-07, + "loss": 1.658, + "step": 1711 }, { - "epoch": 0.637641539058315, - "grad_norm": 2.062948052538768, - "learning_rate": 1.8148627528108323e-07, - "loss": 1.65, - "step": 17260 + "epoch": 0.22959320518551632, + "grad_norm": 1.0978402523327002, + "learning_rate": 5.327465134982568e-07, + "loss": 1.7228, + "step": 1712 }, { - "epoch": 0.6383804052681161, - "grad_norm": 1.7560243016328074, - "learning_rate": 1.8101149004362088e-07, - "loss": 1.6068, - "step": 17280 + "epoch": 0.22972731336611532, + "grad_norm": 1.0849254385283391, + "learning_rate": 5.326713651304527e-07, + "loss": 1.5941, + "step": 1713 }, { - "epoch": 0.6391192714779171, - "grad_norm": 1.589922555292764, - "learning_rate": 1.8053721123912764e-07, - "loss": 1.6432, - "step": 17300 + "epoch": 0.22986142154671435, + "grad_norm": 1.1076316095810992, + "learning_rate": 5.3259618078106e-07, + "loss": 1.6087, + "step": 1714 }, { - "epoch": 0.6398581376877182, - "grad_norm": 1.7855781248038047, - "learning_rate": 1.8006344142313285e-07, - "loss": 1.6444, - "step": 17320 + "epoch": 0.22999552972731335, + "grad_norm": 1.173053113513891, + "learning_rate": 5.325209604634268e-07, + "loss": 1.6916, + "step": 1715 }, { - "epoch": 0.6405970038975193, - "grad_norm": 1.462859488532895, - "learning_rate": 1.7959018314842395e-07, - "loss": 1.6225, - "step": 17340 + "epoch": 0.23012963790791238, + "grad_norm": 1.0524457049873044, + "learning_rate": 5.324457041909073e-07, + "loss": 1.7742, + "step": 1716 }, { - "epoch": 0.6413358701073203, - "grad_norm": 1.5201929263554286, - "learning_rate": 1.7911743896503144e-07, - "loss": 1.6216, - "step": 17360 + "epoch": 0.2302637460885114, + "grad_norm": 1.0634034874984304, + "learning_rate": 5.323704119768625e-07, + "loss": 1.676, + "step": 1717 }, { - "epoch": 0.6420747363171214, - "grad_norm": 1.5039545520824391, - "learning_rate": 1.7864521142021616e-07, - "loss": 1.597, - "step": 17380 + "epoch": 0.2303978542691104, + "grad_norm": 1.1156008079132087, + "learning_rate": 5.322950838346592e-07, + "loss": 1.7271, + "step": 1718 }, { - "epoch": 0.6428136025269224, - "grad_norm": 2.1198882531068106, - "learning_rate": 1.7817350305845503e-07, - "loss": 1.6762, - "step": 17400 + "epoch": 0.23053196244970944, + "grad_norm": 1.1047727328230366, + "learning_rate": 5.322197197776711e-07, + "loss": 1.7865, + "step": 1719 }, { - "epoch": 0.6435524687367234, - "grad_norm": 1.5052045132821683, - "learning_rate": 1.7770231642142758e-07, - "loss": 1.6459, - "step": 17420 + "epoch": 0.23066607063030845, + "grad_norm": 1.027356701503526, + "learning_rate": 5.321443198192781e-07, + "loss": 1.709, + "step": 1720 }, { - "epoch": 0.6442913349465246, - "grad_norm": 1.5702310750127326, - "learning_rate": 1.77231654048002e-07, - "loss": 1.5676, - "step": 17440 + "epoch": 0.23080017881090747, + "grad_norm": 1.136877539749875, + "learning_rate": 5.320688839728663e-07, + "loss": 1.6582, + "step": 1721 }, { - "epoch": 0.6450302011563256, - "grad_norm": 1.49975631121171, - "learning_rate": 1.7676151847422188e-07, - "loss": 1.6558, - "step": 17460 + "epoch": 0.23093428699150648, + "grad_norm": 1.0127690499338695, + "learning_rate": 5.319934122518285e-07, + "loss": 1.7492, + "step": 1722 }, { - "epoch": 0.6457690673661267, - "grad_norm": 1.8852376014336283, - "learning_rate": 1.7629191223329188e-07, - "loss": 1.6598, - "step": 17480 + "epoch": 0.2310683951721055, + "grad_norm": 1.0939228317341436, + "learning_rate": 5.319179046695635e-07, + "loss": 1.5875, + "step": 1723 }, { - "epoch": 0.6465079335759277, - "grad_norm": 1.5809036111526213, - "learning_rate": 1.7582283785556494e-07, - "loss": 1.6148, - "step": 17500 + "epoch": 0.2312025033527045, + "grad_norm": 1.1310800565403134, + "learning_rate": 5.318423612394769e-07, + "loss": 1.6674, + "step": 1724 }, { - "epoch": 0.6472467997857289, - "grad_norm": 1.4247569077843545, - "learning_rate": 1.75354297868528e-07, - "loss": 1.6318, - "step": 17520 + "epoch": 0.23133661153330354, + "grad_norm": 1.1687734972345458, + "learning_rate": 5.317667819749803e-07, + "loss": 1.6984, + "step": 1725 }, { - "epoch": 0.6479856659955299, - "grad_norm": 1.6577683592238937, - "learning_rate": 1.748862947967885e-07, - "loss": 1.6551, - "step": 17540 + "epoch": 0.23147071971390254, + "grad_norm": 1.3079097416665406, + "learning_rate": 5.316911668894917e-07, + "loss": 1.7021, + "step": 1726 }, { - "epoch": 0.6487245322053309, - "grad_norm": 7.300032033927882, - "learning_rate": 1.744188311620608e-07, - "loss": 1.6892, - "step": 17560 + "epoch": 0.23160482789450157, + "grad_norm": 1.121551582881909, + "learning_rate": 5.316155159964357e-07, + "loss": 1.6389, + "step": 1727 }, { - "epoch": 0.649463398415132, - "grad_norm": 1.4132601163703873, - "learning_rate": 1.7395190948315282e-07, - "loss": 1.6817, - "step": 17580 + "epoch": 0.23173893607510057, + "grad_norm": 1.110653445896344, + "learning_rate": 5.31539829309243e-07, + "loss": 1.6069, + "step": 1728 }, { - "epoch": 0.650202264624933, - "grad_norm": 1.5063433467194194, - "learning_rate": 1.7348553227595218e-07, - "loss": 1.6158, - "step": 17600 + "epoch": 0.2318730442556996, + "grad_norm": 1.0532131317248028, + "learning_rate": 5.314641068413509e-07, + "loss": 1.6365, + "step": 1729 }, { - "epoch": 0.6509411308347341, - "grad_norm": 1.5169596981657725, - "learning_rate": 1.7301970205341292e-07, - "loss": 1.6779, - "step": 17620 + "epoch": 0.2320071524362986, + "grad_norm": 1.0606458320174244, + "learning_rate": 5.313883486062026e-07, + "loss": 1.7264, + "step": 1730 }, { - "epoch": 0.6516799970445352, - "grad_norm": 1.6068564294026548, - "learning_rate": 1.725544213255415e-07, - "loss": 1.6179, - "step": 17640 + "epoch": 0.23214126061689763, + "grad_norm": 1.341898889664279, + "learning_rate": 5.313125546172484e-07, + "loss": 1.6649, + "step": 1731 }, { - "epoch": 0.6524188632543362, - "grad_norm": 1.401533779590892, - "learning_rate": 1.7208969259938396e-07, - "loss": 1.6992, - "step": 17660 + "epoch": 0.23227536879749663, + "grad_norm": 1.1400544409976623, + "learning_rate": 5.312367248879441e-07, + "loss": 1.7331, + "step": 1732 }, { - "epoch": 0.6531577294641373, - "grad_norm": 1.7940271180903984, - "learning_rate": 1.7162551837901149e-07, - "loss": 1.6343, - "step": 17680 + "epoch": 0.23240947697809566, + "grad_norm": 1.0680650695769265, + "learning_rate": 5.311608594317525e-07, + "loss": 1.6919, + "step": 1733 }, { - "epoch": 0.6538965956739383, - "grad_norm": 1.4503762459176361, - "learning_rate": 1.7116190116550798e-07, - "loss": 1.6241, - "step": 17700 + "epoch": 0.2325435851586947, + "grad_norm": 1.1255461157368476, + "learning_rate": 5.310849582621425e-07, + "loss": 1.6049, + "step": 1734 }, { - "epoch": 0.6546354618837394, - "grad_norm": 1.9129744363614924, - "learning_rate": 1.7069884345695585e-07, - "loss": 1.6242, - "step": 17720 + "epoch": 0.2326776933392937, + "grad_norm": 1.1072444623083968, + "learning_rate": 5.310090213925891e-07, + "loss": 1.5269, + "step": 1735 }, { - "epoch": 0.6553743280935405, - "grad_norm": 1.4592502547252286, - "learning_rate": 1.7023634774842265e-07, - "loss": 1.6433, - "step": 17740 + "epoch": 0.23281180151989272, + "grad_norm": 1.0710603367422178, + "learning_rate": 5.309330488365741e-07, + "loss": 1.5994, + "step": 1736 }, { - "epoch": 0.6561131943033415, - "grad_norm": 2.3740218695344026, - "learning_rate": 1.6977441653194778e-07, - "loss": 1.6407, - "step": 17760 + "epoch": 0.23294590970049173, + "grad_norm": 1.0644784872053028, + "learning_rate": 5.308570406075853e-07, + "loss": 1.7374, + "step": 1737 }, { - "epoch": 0.6568520605131426, - "grad_norm": 1.652867656549423, - "learning_rate": 1.6931305229652911e-07, - "loss": 1.6571, - "step": 17780 + "epoch": 0.23308001788109076, + "grad_norm": 1.1498695736382247, + "learning_rate": 5.307809967191172e-07, + "loss": 1.7718, + "step": 1738 }, { - "epoch": 0.6575909267229436, - "grad_norm": 1.8510532804043571, - "learning_rate": 1.688522575281096e-07, - "loss": 1.6393, - "step": 17800 + "epoch": 0.23321412606168976, + "grad_norm": 1.1460626302338928, + "learning_rate": 5.307049171846698e-07, + "loss": 1.7527, + "step": 1739 }, { - "epoch": 0.6583297929327447, - "grad_norm": 1.5330852891820108, - "learning_rate": 1.6839203470956348e-07, - "loss": 1.6181, - "step": 17820 + "epoch": 0.2333482342422888, + "grad_norm": 1.0375010028149447, + "learning_rate": 5.306288020177507e-07, + "loss": 1.6096, + "step": 1740 }, { - "epoch": 0.6590686591425458, - "grad_norm": 2.179872107638406, - "learning_rate": 1.6793238632068323e-07, - "loss": 1.6467, - "step": 17840 + "epoch": 0.2334823424228878, + "grad_norm": 1.0840298111802271, + "learning_rate": 5.305526512318727e-07, + "loss": 1.6765, + "step": 1741 }, { - "epoch": 0.6598075253523468, - "grad_norm": 1.5709625450812563, - "learning_rate": 1.6747331483816645e-07, - "loss": 1.6931, - "step": 17860 + "epoch": 0.23361645060348682, + "grad_norm": 1.175481103771977, + "learning_rate": 5.304764648405554e-07, + "loss": 1.6737, + "step": 1742 }, { - "epoch": 0.6605463915621479, - "grad_norm": 1.7454282483475967, - "learning_rate": 1.6701482273560185e-07, - "loss": 1.6292, - "step": 17880 + "epoch": 0.23375055878408582, + "grad_norm": 1.0760963915335215, + "learning_rate": 5.304002428573248e-07, + "loss": 1.6407, + "step": 1743 }, { - "epoch": 0.6612852577719489, - "grad_norm": 1.7594994883208979, - "learning_rate": 1.6655691248345655e-07, - "loss": 1.6171, - "step": 17900 + "epoch": 0.23388466696468485, + "grad_norm": 1.0391117459687709, + "learning_rate": 5.303239852957129e-07, + "loss": 1.7296, + "step": 1744 }, { - "epoch": 0.66202412398175, - "grad_norm": 1.5140697252908892, - "learning_rate": 1.6609958654906255e-07, - "loss": 1.6319, - "step": 17920 + "epoch": 0.23401877514528385, + "grad_norm": 1.2433142693729942, + "learning_rate": 5.302476921692584e-07, + "loss": 1.6453, + "step": 1745 }, { - "epoch": 0.662762990191551, - "grad_norm": 2.248352984954327, - "learning_rate": 1.6564284739660316e-07, - "loss": 1.6363, - "step": 17940 + "epoch": 0.23415288332588288, + "grad_norm": 1.1097947586973798, + "learning_rate": 5.30171363491506e-07, + "loss": 1.6873, + "step": 1746 }, { - "epoch": 0.6635018564013522, - "grad_norm": 2.0596192177611368, - "learning_rate": 1.6518669748710013e-07, - "loss": 1.6264, - "step": 17960 + "epoch": 0.23428699150648188, + "grad_norm": 1.044700396070487, + "learning_rate": 5.30094999276007e-07, + "loss": 1.5877, + "step": 1747 }, { - "epoch": 0.6642407226111532, - "grad_norm": 1.4805518708471208, - "learning_rate": 1.647311392784002e-07, - "loss": 1.6559, - "step": 17980 + "epoch": 0.2344210996870809, + "grad_norm": 1.1166075784138738, + "learning_rate": 5.300185995363186e-07, + "loss": 1.6547, + "step": 1748 }, { - "epoch": 0.6649795888209542, - "grad_norm": 1.5620227618208977, - "learning_rate": 1.6427617522516196e-07, - "loss": 1.6528, - "step": 18000 + "epoch": 0.23455520786767992, + "grad_norm": 1.1455525392590689, + "learning_rate": 5.299421642860049e-07, + "loss": 1.6328, + "step": 1749 }, { - "epoch": 0.6657184550307553, - "grad_norm": 1.5698059903501222, - "learning_rate": 1.6382180777884236e-07, - "loss": 1.68, - "step": 18020 + "epoch": 0.23468931604827895, + "grad_norm": 1.0432073116091243, + "learning_rate": 5.298656935386355e-07, + "loss": 1.6934, + "step": 1750 }, { - "epoch": 0.6664573212405563, - "grad_norm": 1.525456023190327, - "learning_rate": 1.6336803938768396e-07, - "loss": 1.6129, - "step": 18040 + "epoch": 0.23482342422887797, + "grad_norm": 1.301933185584584, + "learning_rate": 5.297891873077872e-07, + "loss": 1.6322, + "step": 1751 }, { - "epoch": 0.6671961874503575, - "grad_norm": 1.9244616810959143, - "learning_rate": 1.6291487249670116e-07, - "loss": 1.6074, - "step": 18060 + "epoch": 0.23495753240947698, + "grad_norm": 1.1184463227985266, + "learning_rate": 5.297126456070423e-07, + "loss": 1.5901, + "step": 1752 }, { - "epoch": 0.6679350536601585, - "grad_norm": 1.5470316335951617, - "learning_rate": 1.6246230954766744e-07, - "loss": 1.6174, - "step": 18080 + "epoch": 0.235091640590076, + "grad_norm": 1.0894760385328393, + "learning_rate": 5.296360684499899e-07, + "loss": 1.6307, + "step": 1753 }, { - "epoch": 0.6686739198699595, - "grad_norm": 1.460047028189958, - "learning_rate": 1.6201035297910215e-07, - "loss": 1.6387, - "step": 18100 + "epoch": 0.235225748770675, + "grad_norm": 1.0810964826554634, + "learning_rate": 5.295594558502254e-07, + "loss": 1.671, + "step": 1754 }, { - "epoch": 0.6694127860797606, - "grad_norm": 1.849597715575099, - "learning_rate": 1.6155900522625744e-07, - "loss": 1.6357, - "step": 18120 + "epoch": 0.23535985695127404, + "grad_norm": 1.0867830593910155, + "learning_rate": 5.2948280782135e-07, + "loss": 1.5898, + "step": 1755 }, { - "epoch": 0.6701516522895616, - "grad_norm": 1.595432962229376, - "learning_rate": 1.6110826872110478e-07, - "loss": 1.6175, - "step": 18140 + "epoch": 0.23549396513187304, + "grad_norm": 1.0826732184990124, + "learning_rate": 5.29406124376972e-07, + "loss": 1.6753, + "step": 1756 }, { - "epoch": 0.6708905184993628, - "grad_norm": 1.5318757576478021, - "learning_rate": 1.6065814589232206e-07, - "loss": 1.6235, - "step": 18160 + "epoch": 0.23562807331247207, + "grad_norm": 1.1750857610640004, + "learning_rate": 5.29329405530705e-07, + "loss": 1.6238, + "step": 1757 }, { - "epoch": 0.6716293847091638, - "grad_norm": 1.4152502346247018, - "learning_rate": 1.602086391652807e-07, - "loss": 1.6287, - "step": 18180 + "epoch": 0.23576218149307107, + "grad_norm": 1.145244574282678, + "learning_rate": 5.292526512961698e-07, + "loss": 1.7374, + "step": 1758 }, { - "epoch": 0.6723682509189648, - "grad_norm": 1.730605954821045, - "learning_rate": 1.5975975096203248e-07, - "loss": 1.6297, - "step": 18200 + "epoch": 0.2358962896736701, + "grad_norm": 1.0998728885819122, + "learning_rate": 5.291758616869928e-07, + "loss": 1.7178, + "step": 1759 }, { - "epoch": 0.6731071171287659, - "grad_norm": 1.641811600664541, - "learning_rate": 1.5931148370129613e-07, - "loss": 1.6575, - "step": 18220 + "epoch": 0.2360303978542691, + "grad_norm": 1.122069140362572, + "learning_rate": 5.290990367168073e-07, + "loss": 1.634, + "step": 1760 }, { - "epoch": 0.6738459833385669, - "grad_norm": 1.4446876896322507, - "learning_rate": 1.5886383979844492e-07, - "loss": 1.6488, - "step": 18240 + "epoch": 0.23616450603486813, + "grad_norm": 1.1231670039812451, + "learning_rate": 5.290221763992522e-07, + "loss": 1.6238, + "step": 1761 }, { - "epoch": 0.6745848495483681, - "grad_norm": 1.6489416268912538, - "learning_rate": 1.5841682166549308e-07, - "loss": 1.6466, - "step": 18260 + "epoch": 0.23629861421546713, + "grad_norm": 1.0647516707650018, + "learning_rate": 5.289452807479734e-07, + "loss": 1.6579, + "step": 1762 }, { - "epoch": 0.6753237157581691, - "grad_norm": 1.6240331247999147, - "learning_rate": 1.5797043171108297e-07, - "loss": 1.6693, - "step": 18280 + "epoch": 0.23643272239606616, + "grad_norm": 1.2107894163734518, + "learning_rate": 5.288683497766222e-07, + "loss": 1.7207, + "step": 1763 }, { - "epoch": 0.6760625819679702, - "grad_norm": 2.2147991050957, - "learning_rate": 1.5752467234047263e-07, - "loss": 1.6051, - "step": 18300 + "epoch": 0.23656683057666517, + "grad_norm": 1.1025744988730661, + "learning_rate": 5.287913834988569e-07, + "loss": 1.7006, + "step": 1764 }, { - "epoch": 0.6768014481777712, - "grad_norm": 1.5203059720344088, - "learning_rate": 1.5707954595552187e-07, - "loss": 1.653, - "step": 18320 + "epoch": 0.2367009387572642, + "grad_norm": 1.0797524236014637, + "learning_rate": 5.287143819283421e-07, + "loss": 1.7584, + "step": 1765 }, { - "epoch": 0.6775403143875722, - "grad_norm": 1.5328417599383586, - "learning_rate": 1.5663505495468e-07, - "loss": 1.6381, - "step": 18340 + "epoch": 0.2368350469378632, + "grad_norm": 1.0751286199968113, + "learning_rate": 5.286373450787481e-07, + "loss": 1.5611, + "step": 1766 }, { - "epoch": 0.6782791805973734, - "grad_norm": 1.5445956099646183, - "learning_rate": 1.5619120173297267e-07, - "loss": 1.6037, - "step": 18360 + "epoch": 0.23696915511846223, + "grad_norm": 1.0636517626500344, + "learning_rate": 5.285602729637518e-07, + "loss": 1.6433, + "step": 1767 }, { - "epoch": 0.6790180468071744, - "grad_norm": 1.479872310550016, - "learning_rate": 1.5574798868198912e-07, - "loss": 1.6353, - "step": 18380 + "epoch": 0.23710326329906126, + "grad_norm": 1.048651758235017, + "learning_rate": 5.284831655970363e-07, + "loss": 1.6267, + "step": 1768 }, { - "epoch": 0.6797569130169755, - "grad_norm": 1.7841436633262773, - "learning_rate": 1.5530541818986927e-07, - "loss": 1.7364, - "step": 18400 + "epoch": 0.23723737147966026, + "grad_norm": 1.0862538156700035, + "learning_rate": 5.28406022992291e-07, + "loss": 1.591, + "step": 1769 }, { - "epoch": 0.6804957792267765, - "grad_norm": 1.529508435392583, - "learning_rate": 1.5486349264129046e-07, - "loss": 1.6181, - "step": 18420 + "epoch": 0.2373714796602593, + "grad_norm": 1.112560210549691, + "learning_rate": 5.283288451632116e-07, + "loss": 1.6387, + "step": 1770 }, { - "epoch": 0.6812346454365775, - "grad_norm": 1.6539396952625665, - "learning_rate": 1.5442221441745533e-07, - "loss": 1.6985, - "step": 18440 + "epoch": 0.2375055878408583, + "grad_norm": 1.163175696596488, + "learning_rate": 5.282516321235001e-07, + "loss": 1.8051, + "step": 1771 }, { - "epoch": 0.6819735116463786, - "grad_norm": 1.5860780535239207, - "learning_rate": 1.5398158589607813e-07, - "loss": 1.6636, - "step": 18460 + "epoch": 0.23763969602145732, + "grad_norm": 1.112481677106296, + "learning_rate": 5.281743838868644e-07, + "loss": 1.5411, + "step": 1772 }, { - "epoch": 0.6827123778561797, - "grad_norm": 1.9353694955508953, - "learning_rate": 1.5354160945137268e-07, - "loss": 1.6277, - "step": 18480 + "epoch": 0.23777380420205632, + "grad_norm": 1.1911416700291582, + "learning_rate": 5.28097100467019e-07, + "loss": 1.6194, + "step": 1773 }, { - "epoch": 0.6834512440659808, - "grad_norm": 1.4060414431962835, - "learning_rate": 1.5310228745403925e-07, - "loss": 1.6348, - "step": 18500 + "epoch": 0.23790791238265535, + "grad_norm": 1.0990682965946412, + "learning_rate": 5.280197818776845e-07, + "loss": 1.6605, + "step": 1774 }, { - "epoch": 0.6841901102757818, - "grad_norm": 1.9510007446700244, - "learning_rate": 1.5266362227125164e-07, - "loss": 1.666, - "step": 18520 + "epoch": 0.23804202056325435, + "grad_norm": 1.0591136451690275, + "learning_rate": 5.279424281325878e-07, + "loss": 1.6389, + "step": 1775 }, { - "epoch": 0.6849289764855828, - "grad_norm": 2.5976331102164694, - "learning_rate": 1.5222561626664448e-07, - "loss": 1.6437, - "step": 18540 + "epoch": 0.23817612874385338, + "grad_norm": 1.0683888995182673, + "learning_rate": 5.278650392454621e-07, + "loss": 1.6092, + "step": 1776 }, { - "epoch": 0.6856678426953839, - "grad_norm": 1.635565277090673, - "learning_rate": 1.51788271800301e-07, - "loss": 1.6367, - "step": 18560 + "epoch": 0.23831023692445238, + "grad_norm": 1.1224739302408693, + "learning_rate": 5.277876152300467e-07, + "loss": 1.6494, + "step": 1777 }, { - "epoch": 0.686406708905185, - "grad_norm": 1.6414633412876904, - "learning_rate": 1.5135159122873936e-07, - "loss": 1.6239, - "step": 18580 + "epoch": 0.23844434510505141, + "grad_norm": 1.0723497695462585, + "learning_rate": 5.27710156100087e-07, + "loss": 1.7937, + "step": 1778 }, { - "epoch": 0.6871455751149861, - "grad_norm": 1.972663651970077, - "learning_rate": 1.5091557690490104e-07, - "loss": 1.6551, - "step": 18600 + "epoch": 0.23857845328565042, + "grad_norm": 1.1351190756385903, + "learning_rate": 5.276326618693352e-07, + "loss": 1.7266, + "step": 1779 }, { - "epoch": 0.6878844413247871, - "grad_norm": 1.376913066395765, - "learning_rate": 1.504802311781371e-07, - "loss": 1.6494, - "step": 18620 + "epoch": 0.23871256146624945, + "grad_norm": 1.0579576318516895, + "learning_rate": 5.275551325515491e-07, + "loss": 1.6662, + "step": 1780 }, { - "epoch": 0.6886233075345882, - "grad_norm": 1.441207784040776, - "learning_rate": 1.5004555639419648e-07, - "loss": 1.6697, - "step": 18640 + "epoch": 0.23884666964684845, + "grad_norm": 1.1337655082128173, + "learning_rate": 5.27477568160493e-07, + "loss": 1.6656, + "step": 1781 }, { - "epoch": 0.6893621737443892, - "grad_norm": 2.5475644652288514, - "learning_rate": 1.4961155489521253e-07, - "loss": 1.6449, - "step": 18660 + "epoch": 0.23898077782744748, + "grad_norm": 1.3625169955042795, + "learning_rate": 5.273999687099377e-07, + "loss": 1.6154, + "step": 1782 }, { - "epoch": 0.6901010399541903, - "grad_norm": 1.4330764200962958, - "learning_rate": 1.4917822901969108e-07, - "loss": 1.5962, - "step": 18680 + "epoch": 0.23911488600804648, + "grad_norm": 1.0606076186008175, + "learning_rate": 5.273223342136596e-07, + "loss": 1.6295, + "step": 1783 }, { - "epoch": 0.6908399061639914, - "grad_norm": 1.5535375552432238, - "learning_rate": 1.487455811024975e-07, - "loss": 1.6682, - "step": 18700 + "epoch": 0.2392489941886455, + "grad_norm": 3.7952746706102753, + "learning_rate": 5.27244664685442e-07, + "loss": 1.593, + "step": 1784 }, { - "epoch": 0.6915787723737924, - "grad_norm": 1.5430558472764233, - "learning_rate": 1.4831361347484396e-07, - "loss": 1.6646, - "step": 18720 + "epoch": 0.2393831023692445, + "grad_norm": 1.1015598004917457, + "learning_rate": 5.271669601390737e-07, + "loss": 1.659, + "step": 1785 }, { - "epoch": 0.6923176385835935, - "grad_norm": 1.5354124537032656, - "learning_rate": 1.4788232846427718e-07, - "loss": 1.6569, - "step": 18740 + "epoch": 0.23951721054984354, + "grad_norm": 1.1429465431928834, + "learning_rate": 5.270892205883503e-07, + "loss": 1.7055, + "step": 1786 }, { - "epoch": 0.6930565047933945, - "grad_norm": 1.723896126450271, - "learning_rate": 1.474517283946658e-07, - "loss": 1.6694, - "step": 18760 + "epoch": 0.23965131873044257, + "grad_norm": 1.1572569512743107, + "learning_rate": 5.270114460470735e-07, + "loss": 1.75, + "step": 1787 }, { - "epoch": 0.6937953710031955, - "grad_norm": 1.4743738549149994, - "learning_rate": 1.4702181558618777e-07, - "loss": 1.6161, - "step": 18780 + "epoch": 0.23978542691104157, + "grad_norm": 1.1342505841464177, + "learning_rate": 5.269336365290511e-07, + "loss": 1.692, + "step": 1788 }, { - "epoch": 0.6945342372129967, - "grad_norm": 1.675747008439809, - "learning_rate": 1.4659259235531796e-07, - "loss": 1.6558, - "step": 18800 + "epoch": 0.2399195350916406, + "grad_norm": 1.1491667363729234, + "learning_rate": 5.268557920480969e-07, + "loss": 1.6956, + "step": 1789 }, { - "epoch": 0.6952731034227977, - "grad_norm": 1.760786257067446, - "learning_rate": 1.4616406101481574e-07, - "loss": 1.5887, - "step": 18820 + "epoch": 0.2400536432722396, + "grad_norm": 1.1290663441601718, + "learning_rate": 5.267779126180313e-07, + "loss": 1.7194, + "step": 1790 }, { - "epoch": 0.6960119696325988, - "grad_norm": 2.8049367365120608, - "learning_rate": 1.4573622387371217e-07, - "loss": 1.6649, - "step": 18840 + "epoch": 0.24018775145283863, + "grad_norm": 1.1068721597891535, + "learning_rate": 5.26699998252681e-07, + "loss": 1.6775, + "step": 1791 }, { - "epoch": 0.6967508358423998, - "grad_norm": 1.496529351669967, - "learning_rate": 1.4530908323729782e-07, - "loss": 1.6433, - "step": 18860 + "epoch": 0.24032185963343763, + "grad_norm": 1.0965127649518425, + "learning_rate": 5.266220489658783e-07, + "loss": 1.7381, + "step": 1792 }, { - "epoch": 0.6974897020522008, - "grad_norm": 1.4994802420062043, - "learning_rate": 1.448826414071105e-07, - "loss": 1.6841, - "step": 18880 + "epoch": 0.24045596781403666, + "grad_norm": 1.0539192312552248, + "learning_rate": 5.265440647714622e-07, + "loss": 1.6916, + "step": 1793 }, { - "epoch": 0.698228568262002, - "grad_norm": 1.420851366464802, - "learning_rate": 1.4445690068092265e-07, - "loss": 1.6504, - "step": 18900 + "epoch": 0.24059007599463567, + "grad_norm": 1.3925405964228643, + "learning_rate": 5.264660456832777e-07, + "loss": 1.6934, + "step": 1794 }, { - "epoch": 0.698967434471803, - "grad_norm": 1.7411191806669424, - "learning_rate": 1.4403186335272888e-07, - "loss": 1.6298, - "step": 18920 + "epoch": 0.2407241841752347, + "grad_norm": 1.0796598245896871, + "learning_rate": 5.263879917151761e-07, + "loss": 1.6891, + "step": 1795 }, { - "epoch": 0.6997063006816041, - "grad_norm": 1.628227507992112, - "learning_rate": 1.4360753171273364e-07, - "loss": 1.673, - "step": 18940 + "epoch": 0.2408582923558337, + "grad_norm": 1.0549168383726284, + "learning_rate": 5.263099028810148e-07, + "loss": 1.6417, + "step": 1796 }, { - "epoch": 0.7004451668914051, - "grad_norm": 1.7368645634603777, - "learning_rate": 1.4318390804733927e-07, - "loss": 1.6198, - "step": 18960 + "epoch": 0.24099240053643273, + "grad_norm": 1.0854208022859217, + "learning_rate": 5.262317791946574e-07, + "loss": 1.6132, + "step": 1797 }, { - "epoch": 0.7011840331012062, - "grad_norm": 1.4616447916754742, - "learning_rate": 1.4276099463913315e-07, - "loss": 1.6096, - "step": 18980 + "epoch": 0.24112650871703173, + "grad_norm": 1.1038896542176981, + "learning_rate": 5.261536206699738e-07, + "loss": 1.6074, + "step": 1798 }, { - "epoch": 0.7019228993110073, - "grad_norm": 1.517480098110094, - "learning_rate": 1.4233879376687563e-07, - "loss": 1.6345, - "step": 19000 + "epoch": 0.24126061689763076, + "grad_norm": 1.0646960968846464, + "learning_rate": 5.2607542732084e-07, + "loss": 1.601, + "step": 1799 }, { - "epoch": 0.7026617655208083, - "grad_norm": 1.636195025828432, - "learning_rate": 1.419173077054878e-07, - "loss": 1.6119, - "step": 19020 + "epoch": 0.24139472507822976, + "grad_norm": 1.1557060399556212, + "learning_rate": 5.259971991611381e-07, + "loss": 1.7684, + "step": 1800 }, { - "epoch": 0.7034006317306094, - "grad_norm": 1.5039586339840252, - "learning_rate": 1.4149653872603917e-07, - "loss": 1.7208, - "step": 19040 + "epoch": 0.2415288332588288, + "grad_norm": 1.0313305926934546, + "learning_rate": 5.259189362047565e-07, + "loss": 1.6322, + "step": 1801 }, { - "epoch": 0.7041394979404104, - "grad_norm": 1.4728764699529369, - "learning_rate": 1.410764890957353e-07, - "loss": 1.6572, - "step": 19060 + "epoch": 0.2416629414394278, + "grad_norm": 1.0974406411588324, + "learning_rate": 5.258406384655897e-07, + "loss": 1.6857, + "step": 1802 }, { - "epoch": 0.7048783641502115, - "grad_norm": 1.9218697223400836, - "learning_rate": 1.406571610779059e-07, - "loss": 1.6514, - "step": 19080 + "epoch": 0.24179704962002682, + "grad_norm": 1.1146673930740303, + "learning_rate": 5.257623059575385e-07, + "loss": 1.6456, + "step": 1803 }, { - "epoch": 0.7056172303600126, - "grad_norm": 1.5761294021476189, - "learning_rate": 1.4023855693199254e-07, - "loss": 1.6381, - "step": 19100 + "epoch": 0.24193115780062585, + "grad_norm": 1.0970256705246042, + "learning_rate": 5.256839386945097e-07, + "loss": 1.7583, + "step": 1804 }, { - "epoch": 0.7063560965698136, - "grad_norm": 1.435908518604352, - "learning_rate": 1.398206789135361e-07, - "loss": 1.6126, - "step": 19120 + "epoch": 0.24206526598122485, + "grad_norm": 1.107274760930789, + "learning_rate": 5.256055366904164e-07, + "loss": 1.6586, + "step": 1805 }, { - "epoch": 0.7070949627796147, - "grad_norm": 4.717577212666518, - "learning_rate": 1.3940352927416504e-07, - "loss": 1.6647, - "step": 19140 + "epoch": 0.24219937416182388, + "grad_norm": 1.1073843937392611, + "learning_rate": 5.255270999591779e-07, + "loss": 1.7062, + "step": 1806 }, { - "epoch": 0.7078338289894157, - "grad_norm": 2.1188904047967245, - "learning_rate": 1.3898711026158323e-07, - "loss": 1.6794, - "step": 19160 + "epoch": 0.24233348234242288, + "grad_norm": 1.0566525499472572, + "learning_rate": 5.254486285147196e-07, + "loss": 1.6526, + "step": 1807 }, { - "epoch": 0.7085726951992168, - "grad_norm": 1.5687418673344722, - "learning_rate": 1.3857142411955767e-07, - "loss": 1.6474, - "step": 19180 + "epoch": 0.24246759052302191, + "grad_norm": 1.1537228290096582, + "learning_rate": 5.253701223709729e-07, + "loss": 1.6933, + "step": 1808 }, { - "epoch": 0.7093115614090179, - "grad_norm": 1.6271449022527302, - "learning_rate": 1.381564730879064e-07, - "loss": 1.6347, - "step": 19200 + "epoch": 0.24260169870362092, + "grad_norm": 1.0990727257935735, + "learning_rate": 5.252915815418755e-07, + "loss": 1.7125, + "step": 1809 }, { - "epoch": 0.7100504276188189, - "grad_norm": 1.4693942372788273, - "learning_rate": 1.377422594024867e-07, - "loss": 1.6474, - "step": 19220 + "epoch": 0.24273580688421995, + "grad_norm": 1.244262115292612, + "learning_rate": 5.252130060413716e-07, + "loss": 1.6264, + "step": 1810 }, { - "epoch": 0.71078929382862, - "grad_norm": 1.488154969512232, - "learning_rate": 1.373287852951826e-07, - "loss": 1.6128, - "step": 19240 + "epoch": 0.24286991506481895, + "grad_norm": 1.1688493530359219, + "learning_rate": 5.251343958834107e-07, + "loss": 1.6785, + "step": 1811 }, { - "epoch": 0.711528160038421, - "grad_norm": 1.5779135188256272, - "learning_rate": 1.3691605299389328e-07, - "loss": 1.7183, - "step": 19260 + "epoch": 0.24300402324541798, + "grad_norm": 1.2285366933673156, + "learning_rate": 5.250557510819494e-07, + "loss": 1.572, + "step": 1812 }, { - "epoch": 0.7122670262482221, - "grad_norm": 1.6650630460525442, - "learning_rate": 1.3650406472252083e-07, - "loss": 1.6683, - "step": 19280 + "epoch": 0.24313813142601698, + "grad_norm": 1.1296607396854323, + "learning_rate": 5.249770716509499e-07, + "loss": 1.6761, + "step": 1813 }, { - "epoch": 0.7130058924580231, - "grad_norm": 1.4154650117196357, - "learning_rate": 1.360928227009584e-07, - "loss": 1.6717, - "step": 19300 + "epoch": 0.243272239606616, + "grad_norm": 1.1537668172261726, + "learning_rate": 5.248983576043808e-07, + "loss": 1.6839, + "step": 1814 }, { - "epoch": 0.7137447586678243, - "grad_norm": 1.6468623503038222, - "learning_rate": 1.3568232914507802e-07, - "loss": 1.6348, - "step": 19320 + "epoch": 0.243406347787215, + "grad_norm": 1.2774536095786413, + "learning_rate": 5.248196089562165e-07, + "loss": 1.6752, + "step": 1815 }, { - "epoch": 0.7144836248776253, - "grad_norm": 1.5015397491680238, - "learning_rate": 1.3527258626671898e-07, - "loss": 1.6112, - "step": 19340 + "epoch": 0.24354045596781404, + "grad_norm": 1.0391234761075887, + "learning_rate": 5.247408257204379e-07, + "loss": 1.713, + "step": 1816 }, { - "epoch": 0.7152224910874263, - "grad_norm": 3.3400095996186865, - "learning_rate": 1.348635962736755e-07, - "loss": 1.6523, - "step": 19360 + "epoch": 0.24367456414841304, + "grad_norm": 1.1351662284778345, + "learning_rate": 5.24662007911032e-07, + "loss": 1.741, + "step": 1817 }, { - "epoch": 0.7159613572972274, - "grad_norm": 1.5139103946143873, - "learning_rate": 1.344553613696854e-07, - "loss": 1.6941, - "step": 19380 + "epoch": 0.24380867232901207, + "grad_norm": 1.101327635041692, + "learning_rate": 5.245831555419915e-07, + "loss": 1.6196, + "step": 1818 }, { - "epoch": 0.7167002235070284, - "grad_norm": 1.4051928644539238, - "learning_rate": 1.340478837544175e-07, - "loss": 1.6237, - "step": 19400 + "epoch": 0.24394278050961107, + "grad_norm": 1.0713266982503056, + "learning_rate": 5.24504268627316e-07, + "loss": 1.6454, + "step": 1819 }, { - "epoch": 0.7174390897168296, - "grad_norm": 1.5234389161550645, - "learning_rate": 1.3364116562346055e-07, - "loss": 1.6559, - "step": 19420 + "epoch": 0.2440768886902101, + "grad_norm": 1.1530834766346107, + "learning_rate": 5.244253471810106e-07, + "loss": 1.7217, + "step": 1820 }, { - "epoch": 0.7181779559266306, - "grad_norm": 1.4205504198026582, - "learning_rate": 1.3323520916831077e-07, - "loss": 1.6478, - "step": 19440 + "epoch": 0.24421099687080913, + "grad_norm": 1.121128499361746, + "learning_rate": 5.243463912170868e-07, + "loss": 1.635, + "step": 1821 }, { - "epoch": 0.7189168221364316, - "grad_norm": 1.5989862880917087, - "learning_rate": 1.328300165763602e-07, - "loss": 1.6123, - "step": 19460 + "epoch": 0.24434510505140813, + "grad_norm": 1.1890728819475802, + "learning_rate": 5.242674007495621e-07, + "loss": 1.6498, + "step": 1822 }, { - "epoch": 0.7196556883462327, - "grad_norm": 1.6443108557654487, - "learning_rate": 1.3242559003088546e-07, - "loss": 1.6832, - "step": 19480 + "epoch": 0.24447921323200716, + "grad_norm": 1.0869958269746995, + "learning_rate": 5.241883757924604e-07, + "loss": 1.6685, + "step": 1823 }, { - "epoch": 0.7203945545560337, - "grad_norm": 1.3202697054517272, - "learning_rate": 1.3202193171103506e-07, - "loss": 1.6339, - "step": 19500 + "epoch": 0.24461332141260617, + "grad_norm": 1.072161128457571, + "learning_rate": 5.241093163598111e-07, + "loss": 1.613, + "step": 1824 }, { - "epoch": 0.7211334207658349, - "grad_norm": 1.5006943077767945, - "learning_rate": 1.316190437918182e-07, - "loss": 1.6469, - "step": 19520 + "epoch": 0.2447474295932052, + "grad_norm": 1.0697959147126053, + "learning_rate": 5.240302224656507e-07, + "loss": 1.7839, + "step": 1825 }, { - "epoch": 0.7218722869756359, - "grad_norm": 1.7379534891877164, - "learning_rate": 1.3121692844409321e-07, - "loss": 1.6797, - "step": 19540 + "epoch": 0.2448815377738042, + "grad_norm": 1.0447563021570512, + "learning_rate": 5.239510941240209e-07, + "loss": 1.553, + "step": 1826 }, { - "epoch": 0.7226111531854369, - "grad_norm": 1.526373724090785, - "learning_rate": 1.308155878345553e-07, - "loss": 1.6636, - "step": 19560 + "epoch": 0.24501564595440323, + "grad_norm": 1.1246283994835846, + "learning_rate": 5.2387193134897e-07, + "loss": 1.7167, + "step": 1827 }, { - "epoch": 0.723350019395238, - "grad_norm": 2.0046685771285793, - "learning_rate": 1.3041502412572542e-07, - "loss": 1.6748, - "step": 19580 + "epoch": 0.24514975413500223, + "grad_norm": 1.0539923982868098, + "learning_rate": 5.237927341545521e-07, + "loss": 1.6228, + "step": 1828 }, { - "epoch": 0.724088885605039, - "grad_norm": 1.4955882650728989, - "learning_rate": 1.3001523947593845e-07, - "loss": 1.6293, - "step": 19600 + "epoch": 0.24528386231560126, + "grad_norm": 1.1056807313462267, + "learning_rate": 5.23713502554828e-07, + "loss": 1.6631, + "step": 1829 }, { - "epoch": 0.7248277518148402, - "grad_norm": 2.4302511767713324, - "learning_rate": 1.2961623603933134e-07, - "loss": 1.6004, - "step": 19620 + "epoch": 0.24541797049620026, + "grad_norm": 1.1081084022345968, + "learning_rate": 5.236342365638638e-07, + "loss": 1.7182, + "step": 1830 }, { - "epoch": 0.7255666180246412, - "grad_norm": 1.6494154347871601, - "learning_rate": 1.2921801596583153e-07, - "loss": 1.6136, - "step": 19640 + "epoch": 0.2455520786767993, + "grad_norm": 1.1259734401016548, + "learning_rate": 5.235549361957323e-07, + "loss": 1.6281, + "step": 1831 }, { - "epoch": 0.7263054842344423, - "grad_norm": 1.4459727786023948, - "learning_rate": 1.2882058140114594e-07, - "loss": 1.6435, - "step": 19660 + "epoch": 0.2456861868573983, + "grad_norm": 1.073575909581403, + "learning_rate": 5.234756014645123e-07, + "loss": 1.7089, + "step": 1832 }, { - "epoch": 0.7270443504442433, - "grad_norm": 1.4490955525578755, - "learning_rate": 1.2842393448674869e-07, - "loss": 1.6508, - "step": 19680 + "epoch": 0.24582029503799732, + "grad_norm": 1.182395764700481, + "learning_rate": 5.233962323842885e-07, + "loss": 1.6138, + "step": 1833 }, { - "epoch": 0.7277832166540443, - "grad_norm": 1.7167939191812815, - "learning_rate": 1.280280773598699e-07, - "loss": 1.6299, - "step": 19700 + "epoch": 0.24595440321859632, + "grad_norm": 1.067652195605279, + "learning_rate": 5.233168289691518e-07, + "loss": 1.6409, + "step": 1834 }, { - "epoch": 0.7285220828638455, - "grad_norm": 1.8412228497101617, - "learning_rate": 1.2763301215348402e-07, - "loss": 1.6758, - "step": 19720 + "epoch": 0.24608851139919535, + "grad_norm": 1.0539945315127641, + "learning_rate": 5.232373912331994e-07, + "loss": 1.6632, + "step": 1835 }, { - "epoch": 0.7292609490736465, - "grad_norm": 1.6407591339864582, - "learning_rate": 1.2723874099629866e-07, - "loss": 1.6443, - "step": 19740 + "epoch": 0.24622261957979436, + "grad_norm": 1.1353497557175543, + "learning_rate": 5.231579191905341e-07, + "loss": 1.6481, + "step": 1836 }, { - "epoch": 0.7299998152834476, - "grad_norm": 2.010243920808459, - "learning_rate": 1.268452660127427e-07, - "loss": 1.6317, - "step": 19760 + "epoch": 0.24635672776039338, + "grad_norm": 1.0518079931176558, + "learning_rate": 5.230784128552653e-07, + "loss": 1.641, + "step": 1837 }, { - "epoch": 0.7307386814932486, - "grad_norm": 1.521357800662826, - "learning_rate": 1.2645258932295518e-07, - "loss": 1.6162, - "step": 19780 + "epoch": 0.24649083594099241, + "grad_norm": 1.068415705515305, + "learning_rate": 5.229988722415082e-07, + "loss": 1.706, + "step": 1838 }, { - "epoch": 0.7314775477030496, - "grad_norm": 1.5657714545631887, - "learning_rate": 1.260607130427737e-07, - "loss": 1.6134, - "step": 19800 + "epoch": 0.24662494412159142, + "grad_norm": 1.128403860172621, + "learning_rate": 5.229192973633844e-07, + "loss": 1.6095, + "step": 1839 }, { - "epoch": 0.7322164139128507, - "grad_norm": 1.7902489767561236, - "learning_rate": 1.2566963928372308e-07, - "loss": 1.6633, - "step": 19820 + "epoch": 0.24675905230219045, + "grad_norm": 1.069414952826673, + "learning_rate": 5.22839688235021e-07, + "loss": 1.6543, + "step": 1840 }, { - "epoch": 0.7329552801226518, - "grad_norm": 2.0435731294538466, - "learning_rate": 1.2527937015300378e-07, - "loss": 1.6505, - "step": 19840 + "epoch": 0.24689316048278945, + "grad_norm": 1.0821194973907244, + "learning_rate": 5.227600448705517e-07, + "loss": 1.556, + "step": 1841 }, { - "epoch": 0.7336941463324529, - "grad_norm": 5.207754525218824, - "learning_rate": 1.2488990775348092e-07, - "loss": 1.6453, - "step": 19860 + "epoch": 0.24702726866338848, + "grad_norm": 1.084344318240152, + "learning_rate": 5.226803672841162e-07, + "loss": 1.6034, + "step": 1842 }, { - "epoch": 0.7344330125422539, - "grad_norm": 1.6112840529464336, - "learning_rate": 1.245012541836728e-07, - "loss": 1.6082, - "step": 19880 + "epoch": 0.24716137684398748, + "grad_norm": 1.1202391548493928, + "learning_rate": 5.226006554898601e-07, + "loss": 1.6966, + "step": 1843 }, { - "epoch": 0.7351718787520549, - "grad_norm": 1.4827262765821532, - "learning_rate": 1.241134115377394e-07, - "loss": 1.6161, - "step": 19900 + "epoch": 0.2472954850245865, + "grad_norm": 1.0911354590278528, + "learning_rate": 5.225209095019351e-07, + "loss": 1.6948, + "step": 1844 }, { - "epoch": 0.735910744961856, - "grad_norm": 1.7219801506968755, - "learning_rate": 1.2372638190547122e-07, - "loss": 1.6305, - "step": 19920 + "epoch": 0.2474295932051855, + "grad_norm": 1.1062195036954834, + "learning_rate": 5.224411293344992e-07, + "loss": 1.5054, + "step": 1845 }, { - "epoch": 0.7366496111716571, - "grad_norm": 1.3720219936046893, - "learning_rate": 1.233401673722782e-07, - "loss": 1.6099, - "step": 19940 + "epoch": 0.24756370138578454, + "grad_norm": 1.0581940583028457, + "learning_rate": 5.223613150017162e-07, + "loss": 1.6027, + "step": 1846 }, { - "epoch": 0.7373884773814582, - "grad_norm": 1.7432612385035637, - "learning_rate": 1.229547700191783e-07, - "loss": 1.6372, - "step": 19960 + "epoch": 0.24769780956638354, + "grad_norm": 1.0564622037081781, + "learning_rate": 5.22281466517756e-07, + "loss": 1.6139, + "step": 1847 }, { - "epoch": 0.7381273435912592, - "grad_norm": 3.2034872788326925, - "learning_rate": 1.2257019192278617e-07, - "loss": 1.6147, - "step": 19980 + "epoch": 0.24783191774698257, + "grad_norm": 1.0965905968449954, + "learning_rate": 5.222015838967948e-07, + "loss": 1.6531, + "step": 1848 }, { - "epoch": 0.7388662098010603, - "grad_norm": 1.6442745462596664, - "learning_rate": 1.2218643515530227e-07, - "loss": 1.6344, - "step": 20000 + "epoch": 0.24796602592758157, + "grad_norm": 1.1162216415234159, + "learning_rate": 5.221216671530146e-07, + "loss": 1.6434, + "step": 1849 }, { - "epoch": 0.7396050760108613, - "grad_norm": 1.959102806007239, - "learning_rate": 1.218035017845015e-07, - "loss": 1.6451, - "step": 20020 + "epoch": 0.2481001341081806, + "grad_norm": 1.0760593930698765, + "learning_rate": 5.220417163006035e-07, + "loss": 1.7068, + "step": 1850 }, { - "epoch": 0.7403439422206624, - "grad_norm": 1.6408059937998853, - "learning_rate": 1.214213938737219e-07, - "loss": 1.6757, - "step": 20040 + "epoch": 0.2482342422887796, + "grad_norm": 1.3461498868058117, + "learning_rate": 5.219617313537557e-07, + "loss": 1.6895, + "step": 1851 }, { - "epoch": 0.7410828084304635, - "grad_norm": 1.5657243128524525, - "learning_rate": 1.210591578161399e-07, - "loss": 1.6359, - "step": 20060 + "epoch": 0.24836835046937863, + "grad_norm": 1.116707873551399, + "learning_rate": 5.218817123266716e-07, + "loss": 1.6986, + "step": 1852 }, { - "epoch": 0.7418216746402645, - "grad_norm": 1.4736673628427441, - "learning_rate": 1.2067866547022443e-07, - "loss": 1.6603, - "step": 20080 + "epoch": 0.24850245864997764, + "grad_norm": 1.0874229858859366, + "learning_rate": 5.218016592335574e-07, + "loss": 1.696, + "step": 1853 }, { - "epoch": 0.7425605408500656, - "grad_norm": 1.4833315219916223, - "learning_rate": 1.2029900464522203e-07, - "loss": 1.6342, - "step": 20100 + "epoch": 0.24863656683057667, + "grad_norm": 1.2149675834773461, + "learning_rate": 5.217215720886254e-07, + "loss": 1.6334, + "step": 1854 }, { - "epoch": 0.7432994070598666, - "grad_norm": 1.9340686772443259, - "learning_rate": 1.1992017738683768e-07, - "loss": 1.6416, - "step": 20120 + "epoch": 0.24877067501117567, + "grad_norm": 1.0673684982385807, + "learning_rate": 5.21641450906094e-07, + "loss": 1.6445, + "step": 1855 }, { - "epoch": 0.7440382732696676, - "grad_norm": 1.6355654798248513, - "learning_rate": 1.1954218573628499e-07, - "loss": 1.6678, - "step": 20140 + "epoch": 0.2489047831917747, + "grad_norm": 1.0639747826797143, + "learning_rate": 5.215612957001879e-07, + "loss": 1.7352, + "step": 1856 }, { - "epoch": 0.7447771394794688, - "grad_norm": 1.5624481100138734, - "learning_rate": 1.1916503173027475e-07, - "loss": 1.614, - "step": 20160 + "epoch": 0.24903889137237373, + "grad_norm": 1.1955320747693832, + "learning_rate": 5.214811064851373e-07, + "loss": 1.6991, + "step": 1857 }, { - "epoch": 0.7455160056892698, - "grad_norm": 1.5029974061648055, - "learning_rate": 1.1878871740100476e-07, - "loss": 1.639, - "step": 20180 + "epoch": 0.24917299955297273, + "grad_norm": 1.1925934103789766, + "learning_rate": 5.214008832751788e-07, + "loss": 1.6421, + "step": 1858 }, { - "epoch": 0.7462548718990709, - "grad_norm": 1.4683397727523646, - "learning_rate": 1.1841324477614812e-07, - "loss": 1.6516, - "step": 20200 + "epoch": 0.24930710773357176, + "grad_norm": 1.152167600482823, + "learning_rate": 5.21320626084555e-07, + "loss": 1.6614, + "step": 1859 }, { - "epoch": 0.7469937381088719, - "grad_norm": 1.478703041295488, - "learning_rate": 1.1803861587884268e-07, - "loss": 1.7247, - "step": 20220 + "epoch": 0.24944121591417076, + "grad_norm": 1.115689117193753, + "learning_rate": 5.212403349275145e-07, + "loss": 1.67, + "step": 1860 }, { - "epoch": 0.7477326043186729, - "grad_norm": 1.4765169074470068, - "learning_rate": 1.1766483272768017e-07, - "loss": 1.6786, - "step": 20240 + "epoch": 0.2495753240947698, + "grad_norm": 1.0409261709804483, + "learning_rate": 5.211600098183119e-07, + "loss": 1.5712, + "step": 1861 }, { - "epoch": 0.7484714705284741, - "grad_norm": 1.3861683142566674, - "learning_rate": 1.1729189733669528e-07, - "loss": 1.6242, - "step": 20260 + "epoch": 0.2497094322753688, + "grad_norm": 1.1645359690711583, + "learning_rate": 5.210796507712078e-07, + "loss": 1.6747, + "step": 1862 }, { - "epoch": 0.7492103367382751, - "grad_norm": 1.5107470749741048, - "learning_rate": 1.1691981171535459e-07, - "loss": 1.6476, - "step": 20280 + "epoch": 0.24984354045596782, + "grad_norm": 1.1220835902669124, + "learning_rate": 5.209992578004688e-07, + "loss": 1.6994, + "step": 1863 }, { - "epoch": 0.7499492029480762, - "grad_norm": 1.5696138247640767, - "learning_rate": 1.1654857786854591e-07, - "loss": 1.6691, - "step": 20300 + "epoch": 0.24997764863656682, + "grad_norm": 1.0574464835321717, + "learning_rate": 5.209188309203678e-07, + "loss": 1.6434, + "step": 1864 } ], - "logging_steps": 20, - "max_steps": 27068, + "logging_steps": 1, + "max_steps": 7456, "num_input_tokens_seen": 0, "num_train_epochs": 1, - "save_steps": 6767, + "save_steps": 1864, "stateful_callbacks": { "TrainerControl": { "args": { @@ -7131,8 +13074,8 @@ "attributes": {} } }, - "total_flos": 3859009492746240.0, - "train_batch_size": 1, + "total_flos": 498613144780800.0, + "train_batch_size": 3, "trial_name": null, "trial_params": null }