{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9938366718027734, "eval_steps": 500, "global_step": 324, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006158583525789068, "grad_norm": 1.4914512634277344, "learning_rate": 2e-05, "loss": 2.3404, "step": 1 }, { "epoch": 0.012317167051578136, "grad_norm": 1.486335039138794, "learning_rate": 4e-05, "loss": 2.3428, "step": 2 }, { "epoch": 0.018475750577367205, "grad_norm": 1.4762102365493774, "learning_rate": 6e-05, "loss": 2.3374, "step": 3 }, { "epoch": 0.024634334103156273, "grad_norm": 1.5677003860473633, "learning_rate": 8e-05, "loss": 2.3111, "step": 4 }, { "epoch": 0.030792917628945343, "grad_norm": 1.5895426273345947, "learning_rate": 0.0001, "loss": 2.2003, "step": 5 }, { "epoch": 0.03695150115473441, "grad_norm": 1.4371826648712158, "learning_rate": 0.00012, "loss": 2.0329, "step": 6 }, { "epoch": 0.04311008468052348, "grad_norm": 1.3726378679275513, "learning_rate": 0.00014, "loss": 1.8007, "step": 7 }, { "epoch": 0.049268668206312545, "grad_norm": 1.4886671304702759, "learning_rate": 0.00016, "loss": 1.5998, "step": 8 }, { "epoch": 0.05542725173210162, "grad_norm": 1.3068833351135254, "learning_rate": 0.00018, "loss": 1.3961, "step": 9 }, { "epoch": 0.061585835257890686, "grad_norm": 3.4257965087890625, "learning_rate": 0.0002, "loss": 1.3012, "step": 10 }, { "epoch": 0.06774441878367975, "grad_norm": 1.5885640382766724, "learning_rate": 0.00019999782201809226, "loss": 1.1308, "step": 11 }, { "epoch": 0.07390300230946882, "grad_norm": 0.9890074729919434, "learning_rate": 0.00019999128816724108, "loss": 1.0712, "step": 12 }, { "epoch": 0.0800615858352579, "grad_norm": 0.9384965300559998, "learning_rate": 0.00019998039873205868, "loss": 1.0627, "step": 13 }, { "epoch": 0.08622016936104696, "grad_norm": 3.847111225128174, "learning_rate": 0.0001999651541868849, "loss": 1.01, "step": 14 }, { "epoch": 0.09237875288683603, "grad_norm": 1.0700613260269165, "learning_rate": 0.00019994555519576662, "loss": 1.0198, "step": 15 }, { "epoch": 0.09853733641262509, "grad_norm": 3.9324963092803955, "learning_rate": 0.00019992160261242877, "loss": 1.0316, "step": 16 }, { "epoch": 0.10469591993841416, "grad_norm": 0.8849812150001526, "learning_rate": 0.00019989329748023725, "loss": 0.9896, "step": 17 }, { "epoch": 0.11085450346420324, "grad_norm": 0.5666069984436035, "learning_rate": 0.00019986064103215339, "loss": 0.9734, "step": 18 }, { "epoch": 0.1170130869899923, "grad_norm": 0.4113065302371979, "learning_rate": 0.0001998236346906802, "loss": 0.9253, "step": 19 }, { "epoch": 0.12317167051578137, "grad_norm": 0.3489767909049988, "learning_rate": 0.00019978228006780054, "loss": 0.9113, "step": 20 }, { "epoch": 0.12933025404157045, "grad_norm": 1.2077497243881226, "learning_rate": 0.00019973657896490686, "loss": 0.9194, "step": 21 }, { "epoch": 0.1354888375673595, "grad_norm": 0.4487021863460541, "learning_rate": 0.00019968653337272261, "loss": 0.9058, "step": 22 }, { "epoch": 0.14164742109314857, "grad_norm": 0.35114091634750366, "learning_rate": 0.0001996321454712157, "loss": 0.9066, "step": 23 }, { "epoch": 0.14780600461893764, "grad_norm": 0.3634772300720215, "learning_rate": 0.00019957341762950344, "loss": 0.913, "step": 24 }, { "epoch": 0.15396458814472672, "grad_norm": 0.4948749244213104, "learning_rate": 0.0001995103524057494, "loss": 0.8944, "step": 25 }, { "epoch": 0.1601231716705158, "grad_norm": 0.3144727051258087, "learning_rate": 0.00019944295254705185, "loss": 0.8973, "step": 26 }, { "epoch": 0.16628175519630484, "grad_norm": 0.26469510793685913, "learning_rate": 0.00019937122098932428, "loss": 0.8776, "step": 27 }, { "epoch": 0.1724403387220939, "grad_norm": 0.29824352264404297, "learning_rate": 0.00019929516085716734, "loss": 0.8521, "step": 28 }, { "epoch": 0.17859892224788299, "grad_norm": 0.2820914387702942, "learning_rate": 0.00019921477546373296, "loss": 0.862, "step": 29 }, { "epoch": 0.18475750577367206, "grad_norm": 0.3102569282054901, "learning_rate": 0.00019913006831057969, "loss": 0.8586, "step": 30 }, { "epoch": 0.19091608929946113, "grad_norm": 0.30348193645477295, "learning_rate": 0.0001990410430875205, "loss": 0.854, "step": 31 }, { "epoch": 0.19707467282525018, "grad_norm": 0.31126317381858826, "learning_rate": 0.00019894770367246195, "loss": 0.8209, "step": 32 }, { "epoch": 0.20323325635103925, "grad_norm": 0.3367297351360321, "learning_rate": 0.00019885005413123515, "loss": 0.8545, "step": 33 }, { "epoch": 0.20939183987682833, "grad_norm": 0.28940874338150024, "learning_rate": 0.00019874809871741876, "loss": 0.8561, "step": 34 }, { "epoch": 0.2155504234026174, "grad_norm": 0.3074447512626648, "learning_rate": 0.00019864184187215372, "loss": 0.8495, "step": 35 }, { "epoch": 0.22170900692840648, "grad_norm": 0.29862889647483826, "learning_rate": 0.00019853128822394975, "loss": 0.8447, "step": 36 }, { "epoch": 0.22786759045419552, "grad_norm": 0.3074705898761749, "learning_rate": 0.0001984164425884838, "loss": 0.8199, "step": 37 }, { "epoch": 0.2340261739799846, "grad_norm": 0.2626888155937195, "learning_rate": 0.0001982973099683902, "loss": 0.8289, "step": 38 }, { "epoch": 0.24018475750577367, "grad_norm": 0.26036959886550903, "learning_rate": 0.00019817389555304272, "loss": 0.8244, "step": 39 }, { "epoch": 0.24634334103156275, "grad_norm": 0.2532216012477875, "learning_rate": 0.0001980462047183287, "loss": 0.8236, "step": 40 }, { "epoch": 0.2525019245573518, "grad_norm": 0.30278533697128296, "learning_rate": 0.0001979142430264146, "loss": 0.8207, "step": 41 }, { "epoch": 0.2586605080831409, "grad_norm": 0.29070666432380676, "learning_rate": 0.00019777801622550408, "loss": 0.823, "step": 42 }, { "epoch": 0.26481909160892997, "grad_norm": 0.2718726694583893, "learning_rate": 0.00019763753024958723, "loss": 0.8054, "step": 43 }, { "epoch": 0.270977675134719, "grad_norm": 0.25870487093925476, "learning_rate": 0.00019749279121818235, "loss": 0.8118, "step": 44 }, { "epoch": 0.27713625866050806, "grad_norm": 0.2909790873527527, "learning_rate": 0.0001973438054360693, "loss": 0.8101, "step": 45 }, { "epoch": 0.28329484218629714, "grad_norm": 0.30481499433517456, "learning_rate": 0.00019719057939301477, "loss": 0.8394, "step": 46 }, { "epoch": 0.2894534257120862, "grad_norm": 0.2736552059650421, "learning_rate": 0.0001970331197634898, "loss": 0.7932, "step": 47 }, { "epoch": 0.2956120092378753, "grad_norm": 0.250683456659317, "learning_rate": 0.00019687143340637887, "loss": 0.7907, "step": 48 }, { "epoch": 0.30177059276366436, "grad_norm": 0.32575681805610657, "learning_rate": 0.00019670552736468118, "loss": 0.8022, "step": 49 }, { "epoch": 0.30792917628945343, "grad_norm": 0.3301404118537903, "learning_rate": 0.00019653540886520386, "loss": 0.8163, "step": 50 }, { "epoch": 0.3140877598152425, "grad_norm": 0.31461504101753235, "learning_rate": 0.00019636108531824724, "loss": 0.8111, "step": 51 }, { "epoch": 0.3202463433410316, "grad_norm": 0.2696295380592346, "learning_rate": 0.00019618256431728194, "loss": 0.7953, "step": 52 }, { "epoch": 0.32640492686682065, "grad_norm": 0.29559096693992615, "learning_rate": 0.0001959998536386181, "loss": 0.7759, "step": 53 }, { "epoch": 0.3325635103926097, "grad_norm": 0.2746593654155731, "learning_rate": 0.0001958129612410668, "loss": 0.7817, "step": 54 }, { "epoch": 0.33872209391839875, "grad_norm": 0.30077093839645386, "learning_rate": 0.0001956218952655933, "loss": 0.7965, "step": 55 }, { "epoch": 0.3448806774441878, "grad_norm": 0.2864063084125519, "learning_rate": 0.00019542666403496233, "loss": 0.812, "step": 56 }, { "epoch": 0.3510392609699769, "grad_norm": 0.3612823784351349, "learning_rate": 0.0001952272760533756, "loss": 0.7652, "step": 57 }, { "epoch": 0.35719784449576597, "grad_norm": 0.28117987513542175, "learning_rate": 0.00019502374000610151, "loss": 0.7982, "step": 58 }, { "epoch": 0.36335642802155504, "grad_norm": 0.3067573010921478, "learning_rate": 0.0001948160647590966, "loss": 0.7703, "step": 59 }, { "epoch": 0.3695150115473441, "grad_norm": 0.27148452401161194, "learning_rate": 0.00019460425935861948, "loss": 0.7795, "step": 60 }, { "epoch": 0.3756735950731332, "grad_norm": 0.35128867626190186, "learning_rate": 0.00019438833303083678, "loss": 0.7637, "step": 61 }, { "epoch": 0.38183217859892227, "grad_norm": 0.3009471297264099, "learning_rate": 0.00019416829518142118, "loss": 0.7838, "step": 62 }, { "epoch": 0.38799076212471134, "grad_norm": 0.3329865038394928, "learning_rate": 0.00019394415539514178, "loss": 0.7949, "step": 63 }, { "epoch": 0.39414934565050036, "grad_norm": 0.2731780707836151, "learning_rate": 0.00019371592343544656, "loss": 0.779, "step": 64 }, { "epoch": 0.40030792917628943, "grad_norm": 0.3036772608757019, "learning_rate": 0.00019348360924403713, "loss": 0.7665, "step": 65 }, { "epoch": 0.4064665127020785, "grad_norm": 0.28768160939216614, "learning_rate": 0.00019324722294043558, "loss": 0.7916, "step": 66 }, { "epoch": 0.4126250962278676, "grad_norm": 0.30529218912124634, "learning_rate": 0.0001930067748215438, "loss": 0.7848, "step": 67 }, { "epoch": 0.41878367975365666, "grad_norm": 0.2631685733795166, "learning_rate": 0.0001927622753611948, "loss": 0.785, "step": 68 }, { "epoch": 0.42494226327944573, "grad_norm": 0.33626529574394226, "learning_rate": 0.0001925137352096966, "loss": 0.7696, "step": 69 }, { "epoch": 0.4311008468052348, "grad_norm": 0.30560940504074097, "learning_rate": 0.0001922611651933683, "loss": 0.7623, "step": 70 }, { "epoch": 0.4372594303310239, "grad_norm": 0.31474098563194275, "learning_rate": 0.0001920045763140684, "loss": 0.7909, "step": 71 }, { "epoch": 0.44341801385681295, "grad_norm": 0.2986692488193512, "learning_rate": 0.00019174397974871564, "loss": 0.7837, "step": 72 }, { "epoch": 0.44957659738260203, "grad_norm": 0.2956785261631012, "learning_rate": 0.0001914793868488021, "loss": 0.7721, "step": 73 }, { "epoch": 0.45573518090839105, "grad_norm": 0.34299513697624207, "learning_rate": 0.0001912108091398988, "loss": 0.7595, "step": 74 }, { "epoch": 0.4618937644341801, "grad_norm": 0.2858146131038666, "learning_rate": 0.0001909382583211535, "loss": 0.77, "step": 75 }, { "epoch": 0.4680523479599692, "grad_norm": 0.2714357078075409, "learning_rate": 0.0001906617462647813, "loss": 0.755, "step": 76 }, { "epoch": 0.47421093148575827, "grad_norm": 0.2805250287055969, "learning_rate": 0.0001903812850155472, "loss": 0.7572, "step": 77 }, { "epoch": 0.48036951501154734, "grad_norm": 0.321464866399765, "learning_rate": 0.0001900968867902419, "loss": 0.7259, "step": 78 }, { "epoch": 0.4865280985373364, "grad_norm": 0.27994561195373535, "learning_rate": 0.00018980856397714913, "loss": 0.7779, "step": 79 }, { "epoch": 0.4926866820631255, "grad_norm": 0.3206578195095062, "learning_rate": 0.00018951632913550626, "loss": 0.7599, "step": 80 }, { "epoch": 0.49884526558891457, "grad_norm": 0.30579233169555664, "learning_rate": 0.00018922019499495725, "loss": 0.763, "step": 81 }, { "epoch": 0.5050038491147036, "grad_norm": 0.2705940902233124, "learning_rate": 0.0001889201744549981, "loss": 0.7436, "step": 82 }, { "epoch": 0.5111624326404927, "grad_norm": 0.36793988943099976, "learning_rate": 0.00018861628058441506, "loss": 0.7489, "step": 83 }, { "epoch": 0.5173210161662818, "grad_norm": 0.30018600821495056, "learning_rate": 0.00018830852662071507, "loss": 0.7479, "step": 84 }, { "epoch": 0.5234795996920708, "grad_norm": 0.2857201397418976, "learning_rate": 0.00018799692596954947, "loss": 0.7584, "step": 85 }, { "epoch": 0.5296381832178599, "grad_norm": 0.3345021903514862, "learning_rate": 0.0001876814922041299, "loss": 0.7589, "step": 86 }, { "epoch": 0.535796766743649, "grad_norm": 0.2717898488044739, "learning_rate": 0.00018736223906463696, "loss": 0.7658, "step": 87 }, { "epoch": 0.541955350269438, "grad_norm": 0.30215781927108765, "learning_rate": 0.00018703918045762197, "loss": 0.7836, "step": 88 }, { "epoch": 0.5481139337952271, "grad_norm": 0.2765495479106903, "learning_rate": 0.0001867123304554009, "loss": 0.7556, "step": 89 }, { "epoch": 0.5542725173210161, "grad_norm": 0.31692612171173096, "learning_rate": 0.00018638170329544164, "loss": 0.7684, "step": 90 }, { "epoch": 0.5604311008468053, "grad_norm": 0.273282915353775, "learning_rate": 0.00018604731337974357, "loss": 0.7544, "step": 91 }, { "epoch": 0.5665896843725943, "grad_norm": 0.2583194971084595, "learning_rate": 0.00018570917527421048, "loss": 0.7469, "step": 92 }, { "epoch": 0.5727482678983834, "grad_norm": 0.287936270236969, "learning_rate": 0.00018536730370801585, "loss": 0.7417, "step": 93 }, { "epoch": 0.5789068514241724, "grad_norm": 0.31012046337127686, "learning_rate": 0.00018502171357296144, "loss": 0.7405, "step": 94 }, { "epoch": 0.5850654349499615, "grad_norm": 0.30453142523765564, "learning_rate": 0.00018467241992282843, "loss": 0.7688, "step": 95 }, { "epoch": 0.5912240184757506, "grad_norm": 0.2779741585254669, "learning_rate": 0.00018431943797272187, "loss": 0.7396, "step": 96 }, { "epoch": 0.5973826020015397, "grad_norm": 0.29110845923423767, "learning_rate": 0.00018396278309840779, "loss": 0.7556, "step": 97 }, { "epoch": 0.6035411855273287, "grad_norm": 0.315677285194397, "learning_rate": 0.00018360247083564342, "loss": 0.749, "step": 98 }, { "epoch": 0.6096997690531177, "grad_norm": 0.3191000521183014, "learning_rate": 0.00018323851687950055, "loss": 0.7548, "step": 99 }, { "epoch": 0.6158583525789069, "grad_norm": 0.28777754306793213, "learning_rate": 0.00018287093708368188, "loss": 0.7625, "step": 100 }, { "epoch": 0.6220169361046959, "grad_norm": 0.30834269523620605, "learning_rate": 0.00018249974745983023, "loss": 0.7605, "step": 101 }, { "epoch": 0.628175519630485, "grad_norm": 0.31846028566360474, "learning_rate": 0.00018212496417683137, "loss": 0.7715, "step": 102 }, { "epoch": 0.634334103156274, "grad_norm": 0.3188404142856598, "learning_rate": 0.00018174660356010943, "loss": 0.7661, "step": 103 }, { "epoch": 0.6404926866820632, "grad_norm": 0.2989709973335266, "learning_rate": 0.00018136468209091602, "loss": 0.728, "step": 104 }, { "epoch": 0.6466512702078522, "grad_norm": 0.31583070755004883, "learning_rate": 0.0001809792164056121, "loss": 0.7751, "step": 105 }, { "epoch": 0.6528098537336413, "grad_norm": 0.2666497528553009, "learning_rate": 0.0001805902232949435, "loss": 0.7407, "step": 106 }, { "epoch": 0.6589684372594303, "grad_norm": 0.32868844270706177, "learning_rate": 0.0001801977197033093, "loss": 0.7429, "step": 107 }, { "epoch": 0.6651270207852193, "grad_norm": 0.3197931945323944, "learning_rate": 0.000179801722728024, "loss": 0.746, "step": 108 }, { "epoch": 0.6712856043110085, "grad_norm": 0.29100263118743896, "learning_rate": 0.00017940224961857242, "loss": 0.7483, "step": 109 }, { "epoch": 0.6774441878367975, "grad_norm": 0.3278297185897827, "learning_rate": 0.00017899931777585882, "loss": 0.7619, "step": 110 }, { "epoch": 0.6836027713625866, "grad_norm": 0.3008161783218384, "learning_rate": 0.00017859294475144837, "loss": 0.7464, "step": 111 }, { "epoch": 0.6897613548883756, "grad_norm": 0.26044347882270813, "learning_rate": 0.000178183148246803, "loss": 0.7408, "step": 112 }, { "epoch": 0.6959199384141648, "grad_norm": 0.3036176562309265, "learning_rate": 0.00017776994611251015, "loss": 0.7614, "step": 113 }, { "epoch": 0.7020785219399538, "grad_norm": 0.3001931309700012, "learning_rate": 0.00017735335634750532, "loss": 0.7308, "step": 114 }, { "epoch": 0.7082371054657429, "grad_norm": 0.27744966745376587, "learning_rate": 0.00017693339709828792, "loss": 0.7456, "step": 115 }, { "epoch": 0.7143956889915319, "grad_norm": 0.2697376012802124, "learning_rate": 0.00017651008665813081, "loss": 0.7456, "step": 116 }, { "epoch": 0.7205542725173211, "grad_norm": 0.2659561336040497, "learning_rate": 0.0001760834434662837, "loss": 0.7262, "step": 117 }, { "epoch": 0.7267128560431101, "grad_norm": 0.29246559739112854, "learning_rate": 0.0001756534861071696, "loss": 0.7433, "step": 118 }, { "epoch": 0.7328714395688991, "grad_norm": 0.2959323525428772, "learning_rate": 0.00017522023330957548, "loss": 0.7512, "step": 119 }, { "epoch": 0.7390300230946882, "grad_norm": 0.26138290762901306, "learning_rate": 0.00017478370394583646, "loss": 0.7503, "step": 120 }, { "epoch": 0.7451886066204773, "grad_norm": 0.26741594076156616, "learning_rate": 0.00017434391703101363, "loss": 0.7582, "step": 121 }, { "epoch": 0.7513471901462664, "grad_norm": 0.28922754526138306, "learning_rate": 0.00017390089172206592, "loss": 0.7405, "step": 122 }, { "epoch": 0.7575057736720554, "grad_norm": 0.2668907344341278, "learning_rate": 0.00017345464731701547, "loss": 0.7381, "step": 123 }, { "epoch": 0.7636643571978445, "grad_norm": 0.27399054169654846, "learning_rate": 0.00017300520325410701, "loss": 0.7413, "step": 124 }, { "epoch": 0.7698229407236336, "grad_norm": 0.29532185196876526, "learning_rate": 0.0001725525791109614, "loss": 0.7709, "step": 125 }, { "epoch": 0.7759815242494227, "grad_norm": 0.3103969097137451, "learning_rate": 0.0001720967946037225, "loss": 0.7406, "step": 126 }, { "epoch": 0.7821401077752117, "grad_norm": 0.309108704328537, "learning_rate": 0.0001716378695861985, "loss": 0.7698, "step": 127 }, { "epoch": 0.7882986913010007, "grad_norm": 0.2896479070186615, "learning_rate": 0.00017117582404899712, "loss": 0.7417, "step": 128 }, { "epoch": 0.7944572748267898, "grad_norm": 0.28942593932151794, "learning_rate": 0.00017071067811865476, "loss": 0.7234, "step": 129 }, { "epoch": 0.8006158583525789, "grad_norm": 0.2783251702785492, "learning_rate": 0.00017024245205675986, "loss": 0.7441, "step": 130 }, { "epoch": 0.806774441878368, "grad_norm": 0.3195393979549408, "learning_rate": 0.00016977116625907024, "loss": 0.7407, "step": 131 }, { "epoch": 0.812933025404157, "grad_norm": 0.27995389699935913, "learning_rate": 0.0001692968412546247, "loss": 0.7616, "step": 132 }, { "epoch": 0.8190916089299461, "grad_norm": 0.26950138807296753, "learning_rate": 0.0001688194977048488, "loss": 0.7261, "step": 133 }, { "epoch": 0.8252501924557352, "grad_norm": 0.28609132766723633, "learning_rate": 0.00016833915640265484, "loss": 0.7596, "step": 134 }, { "epoch": 0.8314087759815243, "grad_norm": 0.29152774810791016, "learning_rate": 0.00016785583827153618, "loss": 0.7488, "step": 135 }, { "epoch": 0.8375673595073133, "grad_norm": 0.24516189098358154, "learning_rate": 0.00016736956436465573, "loss": 0.7213, "step": 136 }, { "epoch": 0.8437259430331023, "grad_norm": 0.2636944055557251, "learning_rate": 0.00016688035586392885, "loss": 0.7124, "step": 137 }, { "epoch": 0.8498845265588915, "grad_norm": 0.2662704288959503, "learning_rate": 0.00016638823407910084, "loss": 0.7208, "step": 138 }, { "epoch": 0.8560431100846805, "grad_norm": 0.28026285767555237, "learning_rate": 0.00016589322044681861, "loss": 0.7362, "step": 139 }, { "epoch": 0.8622016936104696, "grad_norm": 0.312272846698761, "learning_rate": 0.00016539533652969683, "loss": 0.7353, "step": 140 }, { "epoch": 0.8683602771362586, "grad_norm": 0.28158485889434814, "learning_rate": 0.00016489460401537874, "loss": 0.74, "step": 141 }, { "epoch": 0.8745188606620478, "grad_norm": 0.26379963755607605, "learning_rate": 0.00016439104471559156, "loss": 0.7342, "step": 142 }, { "epoch": 0.8806774441878368, "grad_norm": 0.3109482228755951, "learning_rate": 0.00016388468056519612, "loss": 0.7649, "step": 143 }, { "epoch": 0.8868360277136259, "grad_norm": 0.3082488477230072, "learning_rate": 0.00016337553362123165, "loss": 0.7502, "step": 144 }, { "epoch": 0.8929946112394149, "grad_norm": 0.27953189611434937, "learning_rate": 0.00016286362606195468, "loss": 0.7321, "step": 145 }, { "epoch": 0.8991531947652041, "grad_norm": 0.27005666494369507, "learning_rate": 0.00016234898018587337, "loss": 0.7447, "step": 146 }, { "epoch": 0.9053117782909931, "grad_norm": 0.29707372188568115, "learning_rate": 0.0001618316184107758, "loss": 0.727, "step": 147 }, { "epoch": 0.9114703618167821, "grad_norm": 0.3249203860759735, "learning_rate": 0.00016131156327275372, "loss": 0.7508, "step": 148 }, { "epoch": 0.9176289453425712, "grad_norm": 0.2724114954471588, "learning_rate": 0.00016078883742522075, "loss": 0.709, "step": 149 }, { "epoch": 0.9237875288683602, "grad_norm": 0.27933382987976074, "learning_rate": 0.00016026346363792567, "loss": 0.7318, "step": 150 }, { "epoch": 0.9299461123941494, "grad_norm": 0.27181556820869446, "learning_rate": 0.00015973546479596052, "loss": 0.7686, "step": 151 }, { "epoch": 0.9361046959199384, "grad_norm": 0.27142593264579773, "learning_rate": 0.00015920486389876383, "loss": 0.7485, "step": 152 }, { "epoch": 0.9422632794457275, "grad_norm": 0.26695144176483154, "learning_rate": 0.0001586716840591187, "loss": 0.7426, "step": 153 }, { "epoch": 0.9484218629715165, "grad_norm": 0.2876656949520111, "learning_rate": 0.000158135948502146, "loss": 0.7442, "step": 154 }, { "epoch": 0.9545804464973057, "grad_norm": 0.27336934208869934, "learning_rate": 0.00015759768056429274, "loss": 0.7353, "step": 155 }, { "epoch": 0.9607390300230947, "grad_norm": 0.2817447781562805, "learning_rate": 0.00015705690369231551, "loss": 0.7552, "step": 156 }, { "epoch": 0.9668976135488837, "grad_norm": 0.284213662147522, "learning_rate": 0.0001565136414422592, "loss": 0.7398, "step": 157 }, { "epoch": 0.9730561970746728, "grad_norm": 0.2847895622253418, "learning_rate": 0.0001559679174784308, "loss": 0.7364, "step": 158 }, { "epoch": 0.9792147806004619, "grad_norm": 0.2839486598968506, "learning_rate": 0.00015541975557236882, "loss": 0.754, "step": 159 }, { "epoch": 0.985373364126251, "grad_norm": 0.2721126973628998, "learning_rate": 0.0001548691796018074, "loss": 0.7448, "step": 160 }, { "epoch": 0.99153194765204, "grad_norm": 0.2735673785209656, "learning_rate": 0.00015431621354963668, "loss": 0.7308, "step": 161 }, { "epoch": 0.9976905311778291, "grad_norm": 0.31629157066345215, "learning_rate": 0.00015376088150285773, "loss": 0.7456, "step": 162 }, { "epoch": 1.0015408320493067, "grad_norm": 0.2917494475841522, "learning_rate": 0.00015320320765153367, "loss": 0.7408, "step": 163 }, { "epoch": 1.007704160246533, "grad_norm": 0.28891703486442566, "learning_rate": 0.0001526432162877356, "loss": 0.7162, "step": 164 }, { "epoch": 1.0138674884437597, "grad_norm": 0.27484121918678284, "learning_rate": 0.0001520809318044847, "loss": 0.7032, "step": 165 }, { "epoch": 1.0200308166409862, "grad_norm": 0.28564733266830444, "learning_rate": 0.0001515163786946896, "loss": 0.7112, "step": 166 }, { "epoch": 1.0261941448382126, "grad_norm": 0.2875756025314331, "learning_rate": 0.00015094958155007952, "loss": 0.7148, "step": 167 }, { "epoch": 1.0323574730354392, "grad_norm": 0.353564590215683, "learning_rate": 0.00015038056506013297, "loss": 0.7166, "step": 168 }, { "epoch": 1.0385208012326657, "grad_norm": 0.282805472612381, "learning_rate": 0.00014980935401100233, "loss": 0.6975, "step": 169 }, { "epoch": 1.044684129429892, "grad_norm": 0.27754154801368713, "learning_rate": 0.00014923597328443422, "loss": 0.7313, "step": 170 }, { "epoch": 1.0508474576271187, "grad_norm": 0.27703657746315, "learning_rate": 0.00014866044785668563, "loss": 0.7406, "step": 171 }, { "epoch": 1.0570107858243452, "grad_norm": 0.29809364676475525, "learning_rate": 0.00014808280279743593, "loss": 0.7316, "step": 172 }, { "epoch": 1.0631741140215716, "grad_norm": 0.30768731236457825, "learning_rate": 0.00014750306326869492, "loss": 0.6826, "step": 173 }, { "epoch": 1.0693374422187982, "grad_norm": 0.2725447416305542, "learning_rate": 0.00014692125452370663, "loss": 0.6971, "step": 174 }, { "epoch": 1.0755007704160247, "grad_norm": 0.2886168956756592, "learning_rate": 0.00014633740190584952, "loss": 0.7192, "step": 175 }, { "epoch": 1.081664098613251, "grad_norm": 0.3197912573814392, "learning_rate": 0.00014575153084753233, "loss": 0.7266, "step": 176 }, { "epoch": 1.0878274268104777, "grad_norm": 0.27032333612442017, "learning_rate": 0.00014516366686908637, "loss": 0.6884, "step": 177 }, { "epoch": 1.0939907550077042, "grad_norm": 0.2735288739204407, "learning_rate": 0.00014457383557765386, "loss": 0.6962, "step": 178 }, { "epoch": 1.1001540832049306, "grad_norm": 0.279834121465683, "learning_rate": 0.00014398206266607236, "loss": 0.6876, "step": 179 }, { "epoch": 1.1063174114021572, "grad_norm": 0.27184656262397766, "learning_rate": 0.00014338837391175582, "loss": 0.7232, "step": 180 }, { "epoch": 1.1124807395993837, "grad_norm": 0.2871675491333008, "learning_rate": 0.00014279279517557156, "loss": 0.7223, "step": 181 }, { "epoch": 1.11864406779661, "grad_norm": 0.27277106046676636, "learning_rate": 0.00014219535240071377, "loss": 0.7021, "step": 182 }, { "epoch": 1.1248073959938367, "grad_norm": 0.2617622911930084, "learning_rate": 0.00014159607161157362, "loss": 0.6881, "step": 183 }, { "epoch": 1.1309707241910631, "grad_norm": 0.2872948944568634, "learning_rate": 0.00014099497891260538, "loss": 0.705, "step": 184 }, { "epoch": 1.1371340523882898, "grad_norm": 0.3010096848011017, "learning_rate": 0.00014039210048718949, "loss": 0.7006, "step": 185 }, { "epoch": 1.1432973805855162, "grad_norm": 0.30869531631469727, "learning_rate": 0.00013978746259649209, "loss": 0.711, "step": 186 }, { "epoch": 1.1494607087827426, "grad_norm": 0.28331878781318665, "learning_rate": 0.00013918109157832088, "loss": 0.7035, "step": 187 }, { "epoch": 1.1556240369799693, "grad_norm": 0.29280489683151245, "learning_rate": 0.00013857301384597796, "loss": 0.7084, "step": 188 }, { "epoch": 1.1617873651771957, "grad_norm": 0.2873375415802002, "learning_rate": 0.0001379632558871094, "loss": 0.7207, "step": 189 }, { "epoch": 1.1679506933744221, "grad_norm": 0.31296560168266296, "learning_rate": 0.00013735184426255117, "loss": 0.7223, "step": 190 }, { "epoch": 1.1741140215716488, "grad_norm": 0.32619667053222656, "learning_rate": 0.00013673880560517246, "loss": 0.7098, "step": 191 }, { "epoch": 1.1802773497688752, "grad_norm": 0.2920374274253845, "learning_rate": 0.00013612416661871533, "loss": 0.699, "step": 192 }, { "epoch": 1.1864406779661016, "grad_norm": 0.33378660678863525, "learning_rate": 0.00013550795407663157, "loss": 0.7068, "step": 193 }, { "epoch": 1.1926040061633283, "grad_norm": 0.3474499583244324, "learning_rate": 0.0001348901948209167, "loss": 0.7054, "step": 194 }, { "epoch": 1.1987673343605547, "grad_norm": 0.3073098957538605, "learning_rate": 0.00013427091576094022, "loss": 0.7139, "step": 195 }, { "epoch": 1.2049306625577811, "grad_norm": 0.33176594972610474, "learning_rate": 0.00013365014387227393, "loss": 0.7353, "step": 196 }, { "epoch": 1.2110939907550078, "grad_norm": 0.31728485226631165, "learning_rate": 0.00013302790619551674, "loss": 0.6911, "step": 197 }, { "epoch": 1.2172573189522342, "grad_norm": 0.2931523025035858, "learning_rate": 0.0001324042298351166, "loss": 0.7192, "step": 198 }, { "epoch": 1.2234206471494606, "grad_norm": 0.31170010566711426, "learning_rate": 0.00013177914195819016, "loss": 0.7368, "step": 199 }, { "epoch": 1.2295839753466873, "grad_norm": 0.30348193645477295, "learning_rate": 0.00013115266979333917, "loss": 0.6952, "step": 200 }, { "epoch": 1.2357473035439137, "grad_norm": 0.2996613085269928, "learning_rate": 0.0001305248406294644, "loss": 0.702, "step": 201 }, { "epoch": 1.2419106317411401, "grad_norm": 0.29154476523399353, "learning_rate": 0.00012989568181457704, "loss": 0.7182, "step": 202 }, { "epoch": 1.2480739599383668, "grad_norm": 0.31373095512390137, "learning_rate": 0.00012926522075460745, "loss": 0.7316, "step": 203 }, { "epoch": 1.2542372881355932, "grad_norm": 0.30474114418029785, "learning_rate": 0.00012863348491221128, "loss": 0.7052, "step": 204 }, { "epoch": 1.2604006163328196, "grad_norm": 0.31707093119621277, "learning_rate": 0.00012800050180557322, "loss": 0.6927, "step": 205 }, { "epoch": 1.2665639445300463, "grad_norm": 0.2982027232646942, "learning_rate": 0.0001273662990072083, "loss": 0.6991, "step": 206 }, { "epoch": 1.2727272727272727, "grad_norm": 0.31432804465293884, "learning_rate": 0.00012673090414276101, "loss": 0.7145, "step": 207 }, { "epoch": 1.2788906009244991, "grad_norm": 0.30092254281044006, "learning_rate": 0.00012609434488980168, "loss": 0.6993, "step": 208 }, { "epoch": 1.2850539291217258, "grad_norm": 0.29248011112213135, "learning_rate": 0.00012545664897662109, "loss": 0.6892, "step": 209 }, { "epoch": 1.2912172573189522, "grad_norm": 0.3298072814941406, "learning_rate": 0.00012481784418102242, "loss": 0.7039, "step": 210 }, { "epoch": 1.2973805855161786, "grad_norm": 0.2912119925022125, "learning_rate": 0.0001241779583291114, "loss": 0.7027, "step": 211 }, { "epoch": 1.3035439137134053, "grad_norm": 0.3143533766269684, "learning_rate": 0.00012353701929408427, "loss": 0.6955, "step": 212 }, { "epoch": 1.3097072419106317, "grad_norm": 0.31738749146461487, "learning_rate": 0.0001228950549950134, "loss": 0.714, "step": 213 }, { "epoch": 1.3158705701078581, "grad_norm": 0.3286758065223694, "learning_rate": 0.00012225209339563145, "loss": 0.6951, "step": 214 }, { "epoch": 1.3220338983050848, "grad_norm": 0.28571856021881104, "learning_rate": 0.00012160816250311298, "loss": 0.7079, "step": 215 }, { "epoch": 1.3281972265023112, "grad_norm": 0.2884030044078827, "learning_rate": 0.00012096329036685468, "loss": 0.7054, "step": 216 }, { "epoch": 1.3343605546995376, "grad_norm": 0.29154953360557556, "learning_rate": 0.00012031750507725344, "loss": 0.6997, "step": 217 }, { "epoch": 1.3405238828967643, "grad_norm": 0.29759618639945984, "learning_rate": 0.00011967083476448282, "loss": 0.7108, "step": 218 }, { "epoch": 1.3466872110939907, "grad_norm": 0.31861481070518494, "learning_rate": 0.00011902330759726765, "loss": 0.7262, "step": 219 }, { "epoch": 1.3528505392912171, "grad_norm": 0.28898975253105164, "learning_rate": 0.00011837495178165706, "loss": 0.6913, "step": 220 }, { "epoch": 1.3590138674884438, "grad_norm": 0.3104959726333618, "learning_rate": 0.00011772579555979572, "loss": 0.7171, "step": 221 }, { "epoch": 1.3651771956856702, "grad_norm": 0.31155380606651306, "learning_rate": 0.00011707586720869374, "loss": 0.7108, "step": 222 }, { "epoch": 1.3713405238828968, "grad_norm": 0.3034297823905945, "learning_rate": 0.000116425195038995, "loss": 0.695, "step": 223 }, { "epoch": 1.3775038520801233, "grad_norm": 0.3030366003513336, "learning_rate": 0.00011577380739374375, "loss": 0.7105, "step": 224 }, { "epoch": 1.3836671802773497, "grad_norm": 0.32835301756858826, "learning_rate": 0.00011512173264715011, "loss": 0.704, "step": 225 }, { "epoch": 1.3898305084745763, "grad_norm": 0.3224051892757416, "learning_rate": 0.00011446899920335405, "loss": 0.707, "step": 226 }, { "epoch": 1.3959938366718028, "grad_norm": 0.29188138246536255, "learning_rate": 0.00011381563549518823, "loss": 0.6834, "step": 227 }, { "epoch": 1.4021571648690292, "grad_norm": 0.2895198166370392, "learning_rate": 0.00011316166998293935, "loss": 0.6835, "step": 228 }, { "epoch": 1.4083204930662558, "grad_norm": 0.290546178817749, "learning_rate": 0.00011250713115310851, "loss": 0.7032, "step": 229 }, { "epoch": 1.4144838212634823, "grad_norm": 0.31281739473342896, "learning_rate": 0.00011185204751717029, "loss": 0.686, "step": 230 }, { "epoch": 1.420647149460709, "grad_norm": 0.3190004825592041, "learning_rate": 0.00011119644761033078, "loss": 0.7173, "step": 231 }, { "epoch": 1.4268104776579353, "grad_norm": 0.28831568360328674, "learning_rate": 0.00011054035999028478, "loss": 0.7009, "step": 232 }, { "epoch": 1.4329738058551618, "grad_norm": 0.28771984577178955, "learning_rate": 0.00010988381323597157, "loss": 0.7114, "step": 233 }, { "epoch": 1.4391371340523884, "grad_norm": 0.30700981616973877, "learning_rate": 0.00010922683594633021, "loss": 0.7084, "step": 234 }, { "epoch": 1.4453004622496148, "grad_norm": 0.3292597532272339, "learning_rate": 0.00010856945673905369, "loss": 0.6937, "step": 235 }, { "epoch": 1.4514637904468413, "grad_norm": 0.31772634387016296, "learning_rate": 0.00010791170424934247, "loss": 0.7193, "step": 236 }, { "epoch": 1.457627118644068, "grad_norm": 0.2986114025115967, "learning_rate": 0.00010725360712865693, "loss": 0.7132, "step": 237 }, { "epoch": 1.4637904468412943, "grad_norm": 0.3067531883716583, "learning_rate": 0.00010659519404346954, "loss": 0.7152, "step": 238 }, { "epoch": 1.4699537750385208, "grad_norm": 0.34523364901542664, "learning_rate": 0.00010593649367401605, "loss": 0.6991, "step": 239 }, { "epoch": 1.4761171032357474, "grad_norm": 0.29430335760116577, "learning_rate": 0.00010527753471304625, "loss": 0.692, "step": 240 }, { "epoch": 1.4822804314329738, "grad_norm": 0.3257347643375397, "learning_rate": 0.00010461834586457398, "loss": 0.7137, "step": 241 }, { "epoch": 1.4884437596302003, "grad_norm": 0.3314994275569916, "learning_rate": 0.00010395895584262696, "loss": 0.707, "step": 242 }, { "epoch": 1.494607087827427, "grad_norm": 0.3335106372833252, "learning_rate": 0.00010329939336999596, "loss": 0.7173, "step": 243 }, { "epoch": 1.5007704160246533, "grad_norm": 0.31246572732925415, "learning_rate": 0.00010263968717698364, "loss": 0.6886, "step": 244 }, { "epoch": 1.5069337442218798, "grad_norm": 0.3151553273200989, "learning_rate": 0.00010197986600015305, "loss": 0.7264, "step": 245 }, { "epoch": 1.5130970724191064, "grad_norm": 0.3316774368286133, "learning_rate": 0.00010131995858107591, "loss": 0.7008, "step": 246 }, { "epoch": 1.5192604006163328, "grad_norm": 0.2832745909690857, "learning_rate": 0.00010065999366508057, "loss": 0.6898, "step": 247 }, { "epoch": 1.5254237288135593, "grad_norm": 0.28495272994041443, "learning_rate": 0.0001, "loss": 0.6953, "step": 248 }, { "epoch": 1.531587057010786, "grad_norm": 0.33213871717453003, "learning_rate": 9.934000633491944e-05, "loss": 0.7176, "step": 249 }, { "epoch": 1.5377503852080123, "grad_norm": 0.33437585830688477, "learning_rate": 9.868004141892411e-05, "loss": 0.7023, "step": 250 }, { "epoch": 1.5439137134052388, "grad_norm": 0.28734612464904785, "learning_rate": 9.802013399984696e-05, "loss": 0.6878, "step": 251 }, { "epoch": 1.5500770416024654, "grad_norm": 0.28802672028541565, "learning_rate": 9.73603128230164e-05, "loss": 0.6957, "step": 252 }, { "epoch": 1.5562403697996918, "grad_norm": 0.3412761688232422, "learning_rate": 9.670060663000408e-05, "loss": 0.7167, "step": 253 }, { "epoch": 1.5624036979969183, "grad_norm": 0.3285284638404846, "learning_rate": 9.604104415737308e-05, "loss": 0.6969, "step": 254 }, { "epoch": 1.568567026194145, "grad_norm": 0.31626519560813904, "learning_rate": 9.538165413542607e-05, "loss": 0.6938, "step": 255 }, { "epoch": 1.5747303543913713, "grad_norm": 0.3252091109752655, "learning_rate": 9.472246528695376e-05, "loss": 0.6961, "step": 256 }, { "epoch": 1.5808936825885977, "grad_norm": 0.36647289991378784, "learning_rate": 9.406350632598393e-05, "loss": 0.7062, "step": 257 }, { "epoch": 1.5870570107858244, "grad_norm": 0.2924617826938629, "learning_rate": 9.340480595653047e-05, "loss": 0.722, "step": 258 }, { "epoch": 1.5932203389830508, "grad_norm": 0.3311356008052826, "learning_rate": 9.274639287134308e-05, "loss": 0.7087, "step": 259 }, { "epoch": 1.5993836671802772, "grad_norm": 0.3344646692276001, "learning_rate": 9.208829575065754e-05, "loss": 0.6879, "step": 260 }, { "epoch": 1.605546995377504, "grad_norm": 0.31932997703552246, "learning_rate": 9.143054326094632e-05, "loss": 0.7035, "step": 261 }, { "epoch": 1.6117103235747303, "grad_norm": 0.28748106956481934, "learning_rate": 9.077316405366981e-05, "loss": 0.695, "step": 262 }, { "epoch": 1.6178736517719567, "grad_norm": 0.33069753646850586, "learning_rate": 9.011618676402845e-05, "loss": 0.6812, "step": 263 }, { "epoch": 1.6240369799691834, "grad_norm": 0.3465948700904846, "learning_rate": 8.945964000971524e-05, "loss": 0.6822, "step": 264 }, { "epoch": 1.6302003081664098, "grad_norm": 0.2904537320137024, "learning_rate": 8.880355238966923e-05, "loss": 0.6849, "step": 265 }, { "epoch": 1.6363636363636362, "grad_norm": 0.2855393886566162, "learning_rate": 8.814795248282974e-05, "loss": 0.6769, "step": 266 }, { "epoch": 1.642526964560863, "grad_norm": 0.31568413972854614, "learning_rate": 8.749286884689152e-05, "loss": 0.681, "step": 267 }, { "epoch": 1.6486902927580893, "grad_norm": 0.350299209356308, "learning_rate": 8.683833001706067e-05, "loss": 0.7055, "step": 268 }, { "epoch": 1.6548536209553157, "grad_norm": 0.29535943269729614, "learning_rate": 8.61843645048118e-05, "loss": 0.6867, "step": 269 }, { "epoch": 1.6610169491525424, "grad_norm": 0.30539003014564514, "learning_rate": 8.553100079664598e-05, "loss": 0.7039, "step": 270 }, { "epoch": 1.667180277349769, "grad_norm": 0.3268585801124573, "learning_rate": 8.487826735284991e-05, "loss": 0.7145, "step": 271 }, { "epoch": 1.6733436055469952, "grad_norm": 0.35204389691352844, "learning_rate": 8.422619260625625e-05, "loss": 0.6869, "step": 272 }, { "epoch": 1.6795069337442219, "grad_norm": 0.30481913685798645, "learning_rate": 8.357480496100498e-05, "loss": 0.6789, "step": 273 }, { "epoch": 1.6856702619414485, "grad_norm": 0.3037097454071045, "learning_rate": 8.292413279130624e-05, "loss": 0.7007, "step": 274 }, { "epoch": 1.6918335901386747, "grad_norm": 0.29266032576560974, "learning_rate": 8.22742044402043e-05, "loss": 0.689, "step": 275 }, { "epoch": 1.6979969183359014, "grad_norm": 0.30263689160346985, "learning_rate": 8.162504821834295e-05, "loss": 0.6918, "step": 276 }, { "epoch": 1.704160246533128, "grad_norm": 0.3087637424468994, "learning_rate": 8.097669240273236e-05, "loss": 0.6934, "step": 277 }, { "epoch": 1.7103235747303542, "grad_norm": 0.29714614152908325, "learning_rate": 8.03291652355172e-05, "loss": 0.6979, "step": 278 }, { "epoch": 1.7164869029275809, "grad_norm": 0.29517361521720886, "learning_rate": 7.96824949227466e-05, "loss": 0.706, "step": 279 }, { "epoch": 1.7226502311248075, "grad_norm": 0.3192397952079773, "learning_rate": 7.903670963314536e-05, "loss": 0.7056, "step": 280 }, { "epoch": 1.7288135593220337, "grad_norm": 0.3339553773403168, "learning_rate": 7.839183749688704e-05, "loss": 0.6903, "step": 281 }, { "epoch": 1.7349768875192604, "grad_norm": 0.32658275961875916, "learning_rate": 7.774790660436858e-05, "loss": 0.7115, "step": 282 }, { "epoch": 1.741140215716487, "grad_norm": 0.29387855529785156, "learning_rate": 7.710494500498662e-05, "loss": 0.6877, "step": 283 }, { "epoch": 1.7473035439137135, "grad_norm": 0.29822373390197754, "learning_rate": 7.646298070591578e-05, "loss": 0.6949, "step": 284 }, { "epoch": 1.7534668721109399, "grad_norm": 0.31081193685531616, "learning_rate": 7.582204167088864e-05, "loss": 0.6917, "step": 285 }, { "epoch": 1.7596302003081665, "grad_norm": 0.2900318503379822, "learning_rate": 7.518215581897763e-05, "loss": 0.6935, "step": 286 }, { "epoch": 1.765793528505393, "grad_norm": 0.31811362504959106, "learning_rate": 7.454335102337895e-05, "loss": 0.711, "step": 287 }, { "epoch": 1.7719568567026194, "grad_norm": 0.2821851074695587, "learning_rate": 7.390565511019834e-05, "loss": 0.7083, "step": 288 }, { "epoch": 1.778120184899846, "grad_norm": 0.3173108994960785, "learning_rate": 7.326909585723901e-05, "loss": 0.712, "step": 289 }, { "epoch": 1.7842835130970724, "grad_norm": 0.29498183727264404, "learning_rate": 7.263370099279172e-05, "loss": 0.6876, "step": 290 }, { "epoch": 1.7904468412942989, "grad_norm": 0.30469003319740295, "learning_rate": 7.199949819442682e-05, "loss": 0.6954, "step": 291 }, { "epoch": 1.7966101694915255, "grad_norm": 0.2901192307472229, "learning_rate": 7.136651508778875e-05, "loss": 0.7086, "step": 292 }, { "epoch": 1.802773497688752, "grad_norm": 0.2790692150592804, "learning_rate": 7.073477924539255e-05, "loss": 0.6667, "step": 293 }, { "epoch": 1.8089368258859784, "grad_norm": 0.30207961797714233, "learning_rate": 7.010431818542297e-05, "loss": 0.7192, "step": 294 }, { "epoch": 1.815100154083205, "grad_norm": 0.290354460477829, "learning_rate": 6.947515937053563e-05, "loss": 0.6741, "step": 295 }, { "epoch": 1.8212634822804314, "grad_norm": 0.2848537862300873, "learning_rate": 6.884733020666086e-05, "loss": 0.6809, "step": 296 }, { "epoch": 1.8274268104776579, "grad_norm": 0.28692835569381714, "learning_rate": 6.822085804180984e-05, "loss": 0.694, "step": 297 }, { "epoch": 1.8335901386748845, "grad_norm": 0.28540992736816406, "learning_rate": 6.759577016488343e-05, "loss": 0.6825, "step": 298 }, { "epoch": 1.839753466872111, "grad_norm": 0.31507408618927, "learning_rate": 6.697209380448333e-05, "loss": 0.6826, "step": 299 }, { "epoch": 1.8459167950693374, "grad_norm": 0.3058791160583496, "learning_rate": 6.634985612772611e-05, "loss": 0.7011, "step": 300 }, { "epoch": 1.852080123266564, "grad_norm": 0.30382823944091797, "learning_rate": 6.572908423905979e-05, "loss": 0.6994, "step": 301 }, { "epoch": 1.8582434514637904, "grad_norm": 0.30683571100234985, "learning_rate": 6.510980517908334e-05, "loss": 0.7012, "step": 302 }, { "epoch": 1.8644067796610169, "grad_norm": 0.2885062098503113, "learning_rate": 6.449204592336841e-05, "loss": 0.7007, "step": 303 }, { "epoch": 1.8705701078582435, "grad_norm": 0.3002052307128906, "learning_rate": 6.387583338128471e-05, "loss": 0.7048, "step": 304 }, { "epoch": 1.87673343605547, "grad_norm": 0.3367098271846771, "learning_rate": 6.326119439482756e-05, "loss": 0.7044, "step": 305 }, { "epoch": 1.8828967642526964, "grad_norm": 0.3124644160270691, "learning_rate": 6.264815573744884e-05, "loss": 0.6954, "step": 306 }, { "epoch": 1.889060092449923, "grad_norm": 0.30981218814849854, "learning_rate": 6.203674411289062e-05, "loss": 0.692, "step": 307 }, { "epoch": 1.8952234206471494, "grad_norm": 0.3078441619873047, "learning_rate": 6.142698615402205e-05, "loss": 0.6872, "step": 308 }, { "epoch": 1.9013867488443759, "grad_norm": 0.3080589473247528, "learning_rate": 6.0818908421679154e-05, "loss": 0.6902, "step": 309 }, { "epoch": 1.9075500770416025, "grad_norm": 0.30964910984039307, "learning_rate": 6.021253740350793e-05, "loss": 0.7077, "step": 310 }, { "epoch": 1.913713405238829, "grad_norm": 0.3046227693557739, "learning_rate": 5.960789951281052e-05, "loss": 0.7082, "step": 311 }, { "epoch": 1.9198767334360554, "grad_norm": 0.31337085366249084, "learning_rate": 5.900502108739465e-05, "loss": 0.6838, "step": 312 }, { "epoch": 1.926040061633282, "grad_norm": 0.3050321042537689, "learning_rate": 5.840392838842641e-05, "loss": 0.6967, "step": 313 }, { "epoch": 1.9322033898305084, "grad_norm": 0.2953138053417206, "learning_rate": 5.780464759928623e-05, "loss": 0.6813, "step": 314 }, { "epoch": 1.9383667180277349, "grad_norm": 0.295775443315506, "learning_rate": 5.720720482442845e-05, "loss": 0.6733, "step": 315 }, { "epoch": 1.9445300462249615, "grad_norm": 0.3145037591457367, "learning_rate": 5.6611626088244194e-05, "loss": 0.7047, "step": 316 }, { "epoch": 1.950693374422188, "grad_norm": 0.3275890648365021, "learning_rate": 5.601793733392764e-05, "loss": 0.6938, "step": 317 }, { "epoch": 1.9568567026194144, "grad_norm": 0.30370888113975525, "learning_rate": 5.542616442234618e-05, "loss": 0.6818, "step": 318 }, { "epoch": 1.963020030816641, "grad_norm": 0.31202757358551025, "learning_rate": 5.483633313091363e-05, "loss": 0.725, "step": 319 }, { "epoch": 1.9691833590138677, "grad_norm": 0.2959240972995758, "learning_rate": 5.4248469152467695e-05, "loss": 0.6918, "step": 320 }, { "epoch": 1.9753466872110939, "grad_norm": 0.29921895265579224, "learning_rate": 5.366259809415053e-05, "loss": 0.671, "step": 321 }, { "epoch": 1.9815100154083205, "grad_norm": 0.3147589862346649, "learning_rate": 5.307874547629339e-05, "loss": 0.7122, "step": 322 }, { "epoch": 1.9876733436055471, "grad_norm": 0.31646057963371277, "learning_rate": 5.249693673130511e-05, "loss": 0.6999, "step": 323 }, { "epoch": 1.9938366718027734, "grad_norm": 0.29936257004737854, "learning_rate": 5.191719720256407e-05, "loss": 0.709, "step": 324 } ], "logging_steps": 1, "max_steps": 486, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 162, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9174432561535386e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }