|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9884526558891453, |
|
"eval_steps": 500, |
|
"global_step": 486, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006158583525789068, |
|
"grad_norm": 1.4914512634277344, |
|
"learning_rate": 2e-05, |
|
"loss": 2.3404, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.012317167051578136, |
|
"grad_norm": 1.486335039138794, |
|
"learning_rate": 4e-05, |
|
"loss": 2.3428, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.018475750577367205, |
|
"grad_norm": 1.4762102365493774, |
|
"learning_rate": 6e-05, |
|
"loss": 2.3374, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.024634334103156273, |
|
"grad_norm": 1.5677003860473633, |
|
"learning_rate": 8e-05, |
|
"loss": 2.3111, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.030792917628945343, |
|
"grad_norm": 1.5895426273345947, |
|
"learning_rate": 0.0001, |
|
"loss": 2.2003, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03695150115473441, |
|
"grad_norm": 1.4371826648712158, |
|
"learning_rate": 0.00012, |
|
"loss": 2.0329, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.04311008468052348, |
|
"grad_norm": 1.3726378679275513, |
|
"learning_rate": 0.00014, |
|
"loss": 1.8007, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.049268668206312545, |
|
"grad_norm": 1.4886671304702759, |
|
"learning_rate": 0.00016, |
|
"loss": 1.5998, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.05542725173210162, |
|
"grad_norm": 1.3068833351135254, |
|
"learning_rate": 0.00018, |
|
"loss": 1.3961, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.061585835257890686, |
|
"grad_norm": 3.4257965087890625, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3012, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06774441878367975, |
|
"grad_norm": 1.5885640382766724, |
|
"learning_rate": 0.00019999782201809226, |
|
"loss": 1.1308, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07390300230946882, |
|
"grad_norm": 0.9890074729919434, |
|
"learning_rate": 0.00019999128816724108, |
|
"loss": 1.0712, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0800615858352579, |
|
"grad_norm": 0.9384965300559998, |
|
"learning_rate": 0.00019998039873205868, |
|
"loss": 1.0627, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.08622016936104696, |
|
"grad_norm": 3.847111225128174, |
|
"learning_rate": 0.0001999651541868849, |
|
"loss": 1.01, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.09237875288683603, |
|
"grad_norm": 1.0700613260269165, |
|
"learning_rate": 0.00019994555519576662, |
|
"loss": 1.0198, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.09853733641262509, |
|
"grad_norm": 3.9324963092803955, |
|
"learning_rate": 0.00019992160261242877, |
|
"loss": 1.0316, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.10469591993841416, |
|
"grad_norm": 0.8849812150001526, |
|
"learning_rate": 0.00019989329748023725, |
|
"loss": 0.9896, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.11085450346420324, |
|
"grad_norm": 0.5666069984436035, |
|
"learning_rate": 0.00019986064103215339, |
|
"loss": 0.9734, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1170130869899923, |
|
"grad_norm": 0.4113065302371979, |
|
"learning_rate": 0.0001998236346906802, |
|
"loss": 0.9253, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.12317167051578137, |
|
"grad_norm": 0.3489767909049988, |
|
"learning_rate": 0.00019978228006780054, |
|
"loss": 0.9113, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12933025404157045, |
|
"grad_norm": 1.2077497243881226, |
|
"learning_rate": 0.00019973657896490686, |
|
"loss": 0.9194, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.1354888375673595, |
|
"grad_norm": 0.4487021863460541, |
|
"learning_rate": 0.00019968653337272261, |
|
"loss": 0.9058, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.14164742109314857, |
|
"grad_norm": 0.35114091634750366, |
|
"learning_rate": 0.0001996321454712157, |
|
"loss": 0.9066, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.14780600461893764, |
|
"grad_norm": 0.3634772300720215, |
|
"learning_rate": 0.00019957341762950344, |
|
"loss": 0.913, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.15396458814472672, |
|
"grad_norm": 0.4948749244213104, |
|
"learning_rate": 0.0001995103524057494, |
|
"loss": 0.8944, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.1601231716705158, |
|
"grad_norm": 0.3144727051258087, |
|
"learning_rate": 0.00019944295254705185, |
|
"loss": 0.8973, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.16628175519630484, |
|
"grad_norm": 0.26469510793685913, |
|
"learning_rate": 0.00019937122098932428, |
|
"loss": 0.8776, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.1724403387220939, |
|
"grad_norm": 0.29824352264404297, |
|
"learning_rate": 0.00019929516085716734, |
|
"loss": 0.8521, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.17859892224788299, |
|
"grad_norm": 0.2820914387702942, |
|
"learning_rate": 0.00019921477546373296, |
|
"loss": 0.862, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.18475750577367206, |
|
"grad_norm": 0.3102569282054901, |
|
"learning_rate": 0.00019913006831057969, |
|
"loss": 0.8586, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.19091608929946113, |
|
"grad_norm": 0.30348193645477295, |
|
"learning_rate": 0.0001990410430875205, |
|
"loss": 0.854, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.19707467282525018, |
|
"grad_norm": 0.31126317381858826, |
|
"learning_rate": 0.00019894770367246195, |
|
"loss": 0.8209, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.20323325635103925, |
|
"grad_norm": 0.3367297351360321, |
|
"learning_rate": 0.00019885005413123515, |
|
"loss": 0.8545, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.20939183987682833, |
|
"grad_norm": 0.28940874338150024, |
|
"learning_rate": 0.00019874809871741876, |
|
"loss": 0.8561, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.2155504234026174, |
|
"grad_norm": 0.3074447512626648, |
|
"learning_rate": 0.00019864184187215372, |
|
"loss": 0.8495, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.22170900692840648, |
|
"grad_norm": 0.29862889647483826, |
|
"learning_rate": 0.00019853128822394975, |
|
"loss": 0.8447, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.22786759045419552, |
|
"grad_norm": 0.3074705898761749, |
|
"learning_rate": 0.0001984164425884838, |
|
"loss": 0.8199, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2340261739799846, |
|
"grad_norm": 0.2626888155937195, |
|
"learning_rate": 0.0001982973099683902, |
|
"loss": 0.8289, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.24018475750577367, |
|
"grad_norm": 0.26036959886550903, |
|
"learning_rate": 0.00019817389555304272, |
|
"loss": 0.8244, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.24634334103156275, |
|
"grad_norm": 0.2532216012477875, |
|
"learning_rate": 0.0001980462047183287, |
|
"loss": 0.8236, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2525019245573518, |
|
"grad_norm": 0.30278533697128296, |
|
"learning_rate": 0.0001979142430264146, |
|
"loss": 0.8207, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.2586605080831409, |
|
"grad_norm": 0.29070666432380676, |
|
"learning_rate": 0.00019777801622550408, |
|
"loss": 0.823, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.26481909160892997, |
|
"grad_norm": 0.2718726694583893, |
|
"learning_rate": 0.00019763753024958723, |
|
"loss": 0.8054, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.270977675134719, |
|
"grad_norm": 0.25870487093925476, |
|
"learning_rate": 0.00019749279121818235, |
|
"loss": 0.8118, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.27713625866050806, |
|
"grad_norm": 0.2909790873527527, |
|
"learning_rate": 0.0001973438054360693, |
|
"loss": 0.8101, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.28329484218629714, |
|
"grad_norm": 0.30481499433517456, |
|
"learning_rate": 0.00019719057939301477, |
|
"loss": 0.8394, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.2894534257120862, |
|
"grad_norm": 0.2736552059650421, |
|
"learning_rate": 0.0001970331197634898, |
|
"loss": 0.7932, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.2956120092378753, |
|
"grad_norm": 0.250683456659317, |
|
"learning_rate": 0.00019687143340637887, |
|
"loss": 0.7907, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.30177059276366436, |
|
"grad_norm": 0.32575681805610657, |
|
"learning_rate": 0.00019670552736468118, |
|
"loss": 0.8022, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.30792917628945343, |
|
"grad_norm": 0.3301404118537903, |
|
"learning_rate": 0.00019653540886520386, |
|
"loss": 0.8163, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3140877598152425, |
|
"grad_norm": 0.31461504101753235, |
|
"learning_rate": 0.00019636108531824724, |
|
"loss": 0.8111, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.3202463433410316, |
|
"grad_norm": 0.2696295380592346, |
|
"learning_rate": 0.00019618256431728194, |
|
"loss": 0.7953, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.32640492686682065, |
|
"grad_norm": 0.29559096693992615, |
|
"learning_rate": 0.0001959998536386181, |
|
"loss": 0.7759, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.3325635103926097, |
|
"grad_norm": 0.2746593654155731, |
|
"learning_rate": 0.0001958129612410668, |
|
"loss": 0.7817, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.33872209391839875, |
|
"grad_norm": 0.30077093839645386, |
|
"learning_rate": 0.0001956218952655933, |
|
"loss": 0.7965, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.3448806774441878, |
|
"grad_norm": 0.2864063084125519, |
|
"learning_rate": 0.00019542666403496233, |
|
"loss": 0.812, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.3510392609699769, |
|
"grad_norm": 0.3612823784351349, |
|
"learning_rate": 0.0001952272760533756, |
|
"loss": 0.7652, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.35719784449576597, |
|
"grad_norm": 0.28117987513542175, |
|
"learning_rate": 0.00019502374000610151, |
|
"loss": 0.7982, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.36335642802155504, |
|
"grad_norm": 0.3067573010921478, |
|
"learning_rate": 0.0001948160647590966, |
|
"loss": 0.7703, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.3695150115473441, |
|
"grad_norm": 0.27148452401161194, |
|
"learning_rate": 0.00019460425935861948, |
|
"loss": 0.7795, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3756735950731332, |
|
"grad_norm": 0.35128867626190186, |
|
"learning_rate": 0.00019438833303083678, |
|
"loss": 0.7637, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.38183217859892227, |
|
"grad_norm": 0.3009471297264099, |
|
"learning_rate": 0.00019416829518142118, |
|
"loss": 0.7838, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.38799076212471134, |
|
"grad_norm": 0.3329865038394928, |
|
"learning_rate": 0.00019394415539514178, |
|
"loss": 0.7949, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.39414934565050036, |
|
"grad_norm": 0.2731780707836151, |
|
"learning_rate": 0.00019371592343544656, |
|
"loss": 0.779, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.40030792917628943, |
|
"grad_norm": 0.3036772608757019, |
|
"learning_rate": 0.00019348360924403713, |
|
"loss": 0.7665, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4064665127020785, |
|
"grad_norm": 0.28768160939216614, |
|
"learning_rate": 0.00019324722294043558, |
|
"loss": 0.7916, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.4126250962278676, |
|
"grad_norm": 0.30529218912124634, |
|
"learning_rate": 0.0001930067748215438, |
|
"loss": 0.7848, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.41878367975365666, |
|
"grad_norm": 0.2631685733795166, |
|
"learning_rate": 0.0001927622753611948, |
|
"loss": 0.785, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.42494226327944573, |
|
"grad_norm": 0.33626529574394226, |
|
"learning_rate": 0.0001925137352096966, |
|
"loss": 0.7696, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.4311008468052348, |
|
"grad_norm": 0.30560940504074097, |
|
"learning_rate": 0.0001922611651933683, |
|
"loss": 0.7623, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4372594303310239, |
|
"grad_norm": 0.31474098563194275, |
|
"learning_rate": 0.0001920045763140684, |
|
"loss": 0.7909, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.44341801385681295, |
|
"grad_norm": 0.2986692488193512, |
|
"learning_rate": 0.00019174397974871564, |
|
"loss": 0.7837, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.44957659738260203, |
|
"grad_norm": 0.2956785261631012, |
|
"learning_rate": 0.0001914793868488021, |
|
"loss": 0.7721, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.45573518090839105, |
|
"grad_norm": 0.34299513697624207, |
|
"learning_rate": 0.0001912108091398988, |
|
"loss": 0.7595, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.4618937644341801, |
|
"grad_norm": 0.2858146131038666, |
|
"learning_rate": 0.0001909382583211535, |
|
"loss": 0.77, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.4680523479599692, |
|
"grad_norm": 0.2714357078075409, |
|
"learning_rate": 0.0001906617462647813, |
|
"loss": 0.755, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.47421093148575827, |
|
"grad_norm": 0.2805250287055969, |
|
"learning_rate": 0.0001903812850155472, |
|
"loss": 0.7572, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.48036951501154734, |
|
"grad_norm": 0.321464866399765, |
|
"learning_rate": 0.0001900968867902419, |
|
"loss": 0.7259, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.4865280985373364, |
|
"grad_norm": 0.27994561195373535, |
|
"learning_rate": 0.00018980856397714913, |
|
"loss": 0.7779, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.4926866820631255, |
|
"grad_norm": 0.3206578195095062, |
|
"learning_rate": 0.00018951632913550626, |
|
"loss": 0.7599, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.49884526558891457, |
|
"grad_norm": 0.30579233169555664, |
|
"learning_rate": 0.00018922019499495725, |
|
"loss": 0.763, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.5050038491147036, |
|
"grad_norm": 0.2705940902233124, |
|
"learning_rate": 0.0001889201744549981, |
|
"loss": 0.7436, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.5111624326404927, |
|
"grad_norm": 0.36793988943099976, |
|
"learning_rate": 0.00018861628058441506, |
|
"loss": 0.7489, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.5173210161662818, |
|
"grad_norm": 0.30018600821495056, |
|
"learning_rate": 0.00018830852662071507, |
|
"loss": 0.7479, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5234795996920708, |
|
"grad_norm": 0.2857201397418976, |
|
"learning_rate": 0.00018799692596954947, |
|
"loss": 0.7584, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5296381832178599, |
|
"grad_norm": 0.3345021903514862, |
|
"learning_rate": 0.0001876814922041299, |
|
"loss": 0.7589, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.535796766743649, |
|
"grad_norm": 0.2717898488044739, |
|
"learning_rate": 0.00018736223906463696, |
|
"loss": 0.7658, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.541955350269438, |
|
"grad_norm": 0.30215781927108765, |
|
"learning_rate": 0.00018703918045762197, |
|
"loss": 0.7836, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5481139337952271, |
|
"grad_norm": 0.2765495479106903, |
|
"learning_rate": 0.0001867123304554009, |
|
"loss": 0.7556, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5542725173210161, |
|
"grad_norm": 0.31692612171173096, |
|
"learning_rate": 0.00018638170329544164, |
|
"loss": 0.7684, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5604311008468053, |
|
"grad_norm": 0.273282915353775, |
|
"learning_rate": 0.00018604731337974357, |
|
"loss": 0.7544, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5665896843725943, |
|
"grad_norm": 0.2583194971084595, |
|
"learning_rate": 0.00018570917527421048, |
|
"loss": 0.7469, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.5727482678983834, |
|
"grad_norm": 0.287936270236969, |
|
"learning_rate": 0.00018536730370801585, |
|
"loss": 0.7417, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.5789068514241724, |
|
"grad_norm": 0.31012046337127686, |
|
"learning_rate": 0.00018502171357296144, |
|
"loss": 0.7405, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.5850654349499615, |
|
"grad_norm": 0.30453142523765564, |
|
"learning_rate": 0.00018467241992282843, |
|
"loss": 0.7688, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.5912240184757506, |
|
"grad_norm": 0.2779741585254669, |
|
"learning_rate": 0.00018431943797272187, |
|
"loss": 0.7396, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.5973826020015397, |
|
"grad_norm": 0.29110845923423767, |
|
"learning_rate": 0.00018396278309840779, |
|
"loss": 0.7556, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.6035411855273287, |
|
"grad_norm": 0.315677285194397, |
|
"learning_rate": 0.00018360247083564342, |
|
"loss": 0.749, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6096997690531177, |
|
"grad_norm": 0.3191000521183014, |
|
"learning_rate": 0.00018323851687950055, |
|
"loss": 0.7548, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.6158583525789069, |
|
"grad_norm": 0.28777754306793213, |
|
"learning_rate": 0.00018287093708368188, |
|
"loss": 0.7625, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6220169361046959, |
|
"grad_norm": 0.30834269523620605, |
|
"learning_rate": 0.00018249974745983023, |
|
"loss": 0.7605, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.628175519630485, |
|
"grad_norm": 0.31846028566360474, |
|
"learning_rate": 0.00018212496417683137, |
|
"loss": 0.7715, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.634334103156274, |
|
"grad_norm": 0.3188404142856598, |
|
"learning_rate": 0.00018174660356010943, |
|
"loss": 0.7661, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.6404926866820632, |
|
"grad_norm": 0.2989709973335266, |
|
"learning_rate": 0.00018136468209091602, |
|
"loss": 0.728, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.6466512702078522, |
|
"grad_norm": 0.31583070755004883, |
|
"learning_rate": 0.0001809792164056121, |
|
"loss": 0.7751, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6528098537336413, |
|
"grad_norm": 0.2666497528553009, |
|
"learning_rate": 0.0001805902232949435, |
|
"loss": 0.7407, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.6589684372594303, |
|
"grad_norm": 0.32868844270706177, |
|
"learning_rate": 0.0001801977197033093, |
|
"loss": 0.7429, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.6651270207852193, |
|
"grad_norm": 0.3197931945323944, |
|
"learning_rate": 0.000179801722728024, |
|
"loss": 0.746, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.6712856043110085, |
|
"grad_norm": 0.29100263118743896, |
|
"learning_rate": 0.00017940224961857242, |
|
"loss": 0.7483, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.6774441878367975, |
|
"grad_norm": 0.3278297185897827, |
|
"learning_rate": 0.00017899931777585882, |
|
"loss": 0.7619, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6836027713625866, |
|
"grad_norm": 0.3008161783218384, |
|
"learning_rate": 0.00017859294475144837, |
|
"loss": 0.7464, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.6897613548883756, |
|
"grad_norm": 0.26044347882270813, |
|
"learning_rate": 0.000178183148246803, |
|
"loss": 0.7408, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.6959199384141648, |
|
"grad_norm": 0.3036176562309265, |
|
"learning_rate": 0.00017776994611251015, |
|
"loss": 0.7614, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.7020785219399538, |
|
"grad_norm": 0.3001931309700012, |
|
"learning_rate": 0.00017735335634750532, |
|
"loss": 0.7308, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.7082371054657429, |
|
"grad_norm": 0.27744966745376587, |
|
"learning_rate": 0.00017693339709828792, |
|
"loss": 0.7456, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7143956889915319, |
|
"grad_norm": 0.2697376012802124, |
|
"learning_rate": 0.00017651008665813081, |
|
"loss": 0.7456, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.7205542725173211, |
|
"grad_norm": 0.2659561336040497, |
|
"learning_rate": 0.0001760834434662837, |
|
"loss": 0.7262, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.7267128560431101, |
|
"grad_norm": 0.29246559739112854, |
|
"learning_rate": 0.0001756534861071696, |
|
"loss": 0.7433, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7328714395688991, |
|
"grad_norm": 0.2959323525428772, |
|
"learning_rate": 0.00017522023330957548, |
|
"loss": 0.7512, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.7390300230946882, |
|
"grad_norm": 0.26138290762901306, |
|
"learning_rate": 0.00017478370394583646, |
|
"loss": 0.7503, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7451886066204773, |
|
"grad_norm": 0.26741594076156616, |
|
"learning_rate": 0.00017434391703101363, |
|
"loss": 0.7582, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7513471901462664, |
|
"grad_norm": 0.28922754526138306, |
|
"learning_rate": 0.00017390089172206592, |
|
"loss": 0.7405, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.7575057736720554, |
|
"grad_norm": 0.2668907344341278, |
|
"learning_rate": 0.00017345464731701547, |
|
"loss": 0.7381, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.7636643571978445, |
|
"grad_norm": 0.27399054169654846, |
|
"learning_rate": 0.00017300520325410701, |
|
"loss": 0.7413, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.7698229407236336, |
|
"grad_norm": 0.29532185196876526, |
|
"learning_rate": 0.0001725525791109614, |
|
"loss": 0.7709, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7759815242494227, |
|
"grad_norm": 0.3103969097137451, |
|
"learning_rate": 0.0001720967946037225, |
|
"loss": 0.7406, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.7821401077752117, |
|
"grad_norm": 0.309108704328537, |
|
"learning_rate": 0.0001716378695861985, |
|
"loss": 0.7698, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.7882986913010007, |
|
"grad_norm": 0.2896479070186615, |
|
"learning_rate": 0.00017117582404899712, |
|
"loss": 0.7417, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.7944572748267898, |
|
"grad_norm": 0.28942593932151794, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 0.7234, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.8006158583525789, |
|
"grad_norm": 0.2783251702785492, |
|
"learning_rate": 0.00017024245205675986, |
|
"loss": 0.7441, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.806774441878368, |
|
"grad_norm": 0.3195393979549408, |
|
"learning_rate": 0.00016977116625907024, |
|
"loss": 0.7407, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.812933025404157, |
|
"grad_norm": 0.27995389699935913, |
|
"learning_rate": 0.0001692968412546247, |
|
"loss": 0.7616, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.8190916089299461, |
|
"grad_norm": 0.26950138807296753, |
|
"learning_rate": 0.0001688194977048488, |
|
"loss": 0.7261, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.8252501924557352, |
|
"grad_norm": 0.28609132766723633, |
|
"learning_rate": 0.00016833915640265484, |
|
"loss": 0.7596, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.8314087759815243, |
|
"grad_norm": 0.29152774810791016, |
|
"learning_rate": 0.00016785583827153618, |
|
"loss": 0.7488, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8375673595073133, |
|
"grad_norm": 0.24516189098358154, |
|
"learning_rate": 0.00016736956436465573, |
|
"loss": 0.7213, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8437259430331023, |
|
"grad_norm": 0.2636944055557251, |
|
"learning_rate": 0.00016688035586392885, |
|
"loss": 0.7124, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.8498845265588915, |
|
"grad_norm": 0.2662704288959503, |
|
"learning_rate": 0.00016638823407910084, |
|
"loss": 0.7208, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.8560431100846805, |
|
"grad_norm": 0.28026285767555237, |
|
"learning_rate": 0.00016589322044681861, |
|
"loss": 0.7362, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.8622016936104696, |
|
"grad_norm": 0.312272846698761, |
|
"learning_rate": 0.00016539533652969683, |
|
"loss": 0.7353, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8683602771362586, |
|
"grad_norm": 0.28158485889434814, |
|
"learning_rate": 0.00016489460401537874, |
|
"loss": 0.74, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.8745188606620478, |
|
"grad_norm": 0.26379963755607605, |
|
"learning_rate": 0.00016439104471559156, |
|
"loss": 0.7342, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.8806774441878368, |
|
"grad_norm": 0.3109482228755951, |
|
"learning_rate": 0.00016388468056519612, |
|
"loss": 0.7649, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.8868360277136259, |
|
"grad_norm": 0.3082488477230072, |
|
"learning_rate": 0.00016337553362123165, |
|
"loss": 0.7502, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.8929946112394149, |
|
"grad_norm": 0.27953189611434937, |
|
"learning_rate": 0.00016286362606195468, |
|
"loss": 0.7321, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.8991531947652041, |
|
"grad_norm": 0.27005666494369507, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 0.7447, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.9053117782909931, |
|
"grad_norm": 0.29707372188568115, |
|
"learning_rate": 0.0001618316184107758, |
|
"loss": 0.727, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.9114703618167821, |
|
"grad_norm": 0.3249203860759735, |
|
"learning_rate": 0.00016131156327275372, |
|
"loss": 0.7508, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.9176289453425712, |
|
"grad_norm": 0.2724114954471588, |
|
"learning_rate": 0.00016078883742522075, |
|
"loss": 0.709, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.9237875288683602, |
|
"grad_norm": 0.27933382987976074, |
|
"learning_rate": 0.00016026346363792567, |
|
"loss": 0.7318, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9299461123941494, |
|
"grad_norm": 0.27181556820869446, |
|
"learning_rate": 0.00015973546479596052, |
|
"loss": 0.7686, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.9361046959199384, |
|
"grad_norm": 0.27142593264579773, |
|
"learning_rate": 0.00015920486389876383, |
|
"loss": 0.7485, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9422632794457275, |
|
"grad_norm": 0.26695144176483154, |
|
"learning_rate": 0.0001586716840591187, |
|
"loss": 0.7426, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.9484218629715165, |
|
"grad_norm": 0.2876656949520111, |
|
"learning_rate": 0.000158135948502146, |
|
"loss": 0.7442, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.9545804464973057, |
|
"grad_norm": 0.27336934208869934, |
|
"learning_rate": 0.00015759768056429274, |
|
"loss": 0.7353, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.9607390300230947, |
|
"grad_norm": 0.2817447781562805, |
|
"learning_rate": 0.00015705690369231551, |
|
"loss": 0.7552, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.9668976135488837, |
|
"grad_norm": 0.284213662147522, |
|
"learning_rate": 0.0001565136414422592, |
|
"loss": 0.7398, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.9730561970746728, |
|
"grad_norm": 0.2847895622253418, |
|
"learning_rate": 0.0001559679174784308, |
|
"loss": 0.7364, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.9792147806004619, |
|
"grad_norm": 0.2839486598968506, |
|
"learning_rate": 0.00015541975557236882, |
|
"loss": 0.754, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.985373364126251, |
|
"grad_norm": 0.2721126973628998, |
|
"learning_rate": 0.0001548691796018074, |
|
"loss": 0.7448, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.99153194765204, |
|
"grad_norm": 0.2735673785209656, |
|
"learning_rate": 0.00015431621354963668, |
|
"loss": 0.7308, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.9976905311778291, |
|
"grad_norm": 0.31629157066345215, |
|
"learning_rate": 0.00015376088150285773, |
|
"loss": 0.7456, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.0015408320493067, |
|
"grad_norm": 0.2917494475841522, |
|
"learning_rate": 0.00015320320765153367, |
|
"loss": 0.7408, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.007704160246533, |
|
"grad_norm": 0.28891703486442566, |
|
"learning_rate": 0.0001526432162877356, |
|
"loss": 0.7162, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.0138674884437597, |
|
"grad_norm": 0.27484121918678284, |
|
"learning_rate": 0.0001520809318044847, |
|
"loss": 0.7032, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.0200308166409862, |
|
"grad_norm": 0.28564733266830444, |
|
"learning_rate": 0.0001515163786946896, |
|
"loss": 0.7112, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.0261941448382126, |
|
"grad_norm": 0.2875756025314331, |
|
"learning_rate": 0.00015094958155007952, |
|
"loss": 0.7148, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.0323574730354392, |
|
"grad_norm": 0.353564590215683, |
|
"learning_rate": 0.00015038056506013297, |
|
"loss": 0.7166, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.0385208012326657, |
|
"grad_norm": 0.282805472612381, |
|
"learning_rate": 0.00014980935401100233, |
|
"loss": 0.6975, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.044684129429892, |
|
"grad_norm": 0.27754154801368713, |
|
"learning_rate": 0.00014923597328443422, |
|
"loss": 0.7313, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0508474576271187, |
|
"grad_norm": 0.27703657746315, |
|
"learning_rate": 0.00014866044785668563, |
|
"loss": 0.7406, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.0570107858243452, |
|
"grad_norm": 0.29809364676475525, |
|
"learning_rate": 0.00014808280279743593, |
|
"loss": 0.7316, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.0631741140215716, |
|
"grad_norm": 0.30768731236457825, |
|
"learning_rate": 0.00014750306326869492, |
|
"loss": 0.6826, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.0693374422187982, |
|
"grad_norm": 0.2725447416305542, |
|
"learning_rate": 0.00014692125452370663, |
|
"loss": 0.6971, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.0755007704160247, |
|
"grad_norm": 0.2886168956756592, |
|
"learning_rate": 0.00014633740190584952, |
|
"loss": 0.7192, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.081664098613251, |
|
"grad_norm": 0.3197912573814392, |
|
"learning_rate": 0.00014575153084753233, |
|
"loss": 0.7266, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.0878274268104777, |
|
"grad_norm": 0.27032333612442017, |
|
"learning_rate": 0.00014516366686908637, |
|
"loss": 0.6884, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.0939907550077042, |
|
"grad_norm": 0.2735288739204407, |
|
"learning_rate": 0.00014457383557765386, |
|
"loss": 0.6962, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.1001540832049306, |
|
"grad_norm": 0.279834121465683, |
|
"learning_rate": 0.00014398206266607236, |
|
"loss": 0.6876, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.1063174114021572, |
|
"grad_norm": 0.27184656262397766, |
|
"learning_rate": 0.00014338837391175582, |
|
"loss": 0.7232, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.1124807395993837, |
|
"grad_norm": 0.2871675491333008, |
|
"learning_rate": 0.00014279279517557156, |
|
"loss": 0.7223, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.11864406779661, |
|
"grad_norm": 0.27277106046676636, |
|
"learning_rate": 0.00014219535240071377, |
|
"loss": 0.7021, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.1248073959938367, |
|
"grad_norm": 0.2617622911930084, |
|
"learning_rate": 0.00014159607161157362, |
|
"loss": 0.6881, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.1309707241910631, |
|
"grad_norm": 0.2872948944568634, |
|
"learning_rate": 0.00014099497891260538, |
|
"loss": 0.705, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.1371340523882898, |
|
"grad_norm": 0.3010096848011017, |
|
"learning_rate": 0.00014039210048718949, |
|
"loss": 0.7006, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.1432973805855162, |
|
"grad_norm": 0.30869531631469727, |
|
"learning_rate": 0.00013978746259649209, |
|
"loss": 0.711, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.1494607087827426, |
|
"grad_norm": 0.28331878781318665, |
|
"learning_rate": 0.00013918109157832088, |
|
"loss": 0.7035, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.1556240369799693, |
|
"grad_norm": 0.29280489683151245, |
|
"learning_rate": 0.00013857301384597796, |
|
"loss": 0.7084, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.1617873651771957, |
|
"grad_norm": 0.2873375415802002, |
|
"learning_rate": 0.0001379632558871094, |
|
"loss": 0.7207, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.1679506933744221, |
|
"grad_norm": 0.31296560168266296, |
|
"learning_rate": 0.00013735184426255117, |
|
"loss": 0.7223, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.1741140215716488, |
|
"grad_norm": 0.32619667053222656, |
|
"learning_rate": 0.00013673880560517246, |
|
"loss": 0.7098, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.1802773497688752, |
|
"grad_norm": 0.2920374274253845, |
|
"learning_rate": 0.00013612416661871533, |
|
"loss": 0.699, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.1864406779661016, |
|
"grad_norm": 0.33378660678863525, |
|
"learning_rate": 0.00013550795407663157, |
|
"loss": 0.7068, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.1926040061633283, |
|
"grad_norm": 0.3474499583244324, |
|
"learning_rate": 0.0001348901948209167, |
|
"loss": 0.7054, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.1987673343605547, |
|
"grad_norm": 0.3073098957538605, |
|
"learning_rate": 0.00013427091576094022, |
|
"loss": 0.7139, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.2049306625577811, |
|
"grad_norm": 0.33176594972610474, |
|
"learning_rate": 0.00013365014387227393, |
|
"loss": 0.7353, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.2110939907550078, |
|
"grad_norm": 0.31728485226631165, |
|
"learning_rate": 0.00013302790619551674, |
|
"loss": 0.6911, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.2172573189522342, |
|
"grad_norm": 0.2931523025035858, |
|
"learning_rate": 0.0001324042298351166, |
|
"loss": 0.7192, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.2234206471494606, |
|
"grad_norm": 0.31170010566711426, |
|
"learning_rate": 0.00013177914195819016, |
|
"loss": 0.7368, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.2295839753466873, |
|
"grad_norm": 0.30348193645477295, |
|
"learning_rate": 0.00013115266979333917, |
|
"loss": 0.6952, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.2357473035439137, |
|
"grad_norm": 0.2996613085269928, |
|
"learning_rate": 0.0001305248406294644, |
|
"loss": 0.702, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.2419106317411401, |
|
"grad_norm": 0.29154476523399353, |
|
"learning_rate": 0.00012989568181457704, |
|
"loss": 0.7182, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.2480739599383668, |
|
"grad_norm": 0.31373095512390137, |
|
"learning_rate": 0.00012926522075460745, |
|
"loss": 0.7316, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.2542372881355932, |
|
"grad_norm": 0.30474114418029785, |
|
"learning_rate": 0.00012863348491221128, |
|
"loss": 0.7052, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.2604006163328196, |
|
"grad_norm": 0.31707093119621277, |
|
"learning_rate": 0.00012800050180557322, |
|
"loss": 0.6927, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.2665639445300463, |
|
"grad_norm": 0.2982027232646942, |
|
"learning_rate": 0.0001273662990072083, |
|
"loss": 0.6991, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.2727272727272727, |
|
"grad_norm": 0.31432804465293884, |
|
"learning_rate": 0.00012673090414276101, |
|
"loss": 0.7145, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.2788906009244991, |
|
"grad_norm": 0.30092254281044006, |
|
"learning_rate": 0.00012609434488980168, |
|
"loss": 0.6993, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.2850539291217258, |
|
"grad_norm": 0.29248011112213135, |
|
"learning_rate": 0.00012545664897662109, |
|
"loss": 0.6892, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.2912172573189522, |
|
"grad_norm": 0.3298072814941406, |
|
"learning_rate": 0.00012481784418102242, |
|
"loss": 0.7039, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.2973805855161786, |
|
"grad_norm": 0.2912119925022125, |
|
"learning_rate": 0.0001241779583291114, |
|
"loss": 0.7027, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.3035439137134053, |
|
"grad_norm": 0.3143533766269684, |
|
"learning_rate": 0.00012353701929408427, |
|
"loss": 0.6955, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.3097072419106317, |
|
"grad_norm": 0.31738749146461487, |
|
"learning_rate": 0.0001228950549950134, |
|
"loss": 0.714, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.3158705701078581, |
|
"grad_norm": 0.3286758065223694, |
|
"learning_rate": 0.00012225209339563145, |
|
"loss": 0.6951, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.3220338983050848, |
|
"grad_norm": 0.28571856021881104, |
|
"learning_rate": 0.00012160816250311298, |
|
"loss": 0.7079, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.3281972265023112, |
|
"grad_norm": 0.2884030044078827, |
|
"learning_rate": 0.00012096329036685468, |
|
"loss": 0.7054, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.3343605546995376, |
|
"grad_norm": 0.29154953360557556, |
|
"learning_rate": 0.00012031750507725344, |
|
"loss": 0.6997, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.3405238828967643, |
|
"grad_norm": 0.29759618639945984, |
|
"learning_rate": 0.00011967083476448282, |
|
"loss": 0.7108, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.3466872110939907, |
|
"grad_norm": 0.31861481070518494, |
|
"learning_rate": 0.00011902330759726765, |
|
"loss": 0.7262, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.3528505392912171, |
|
"grad_norm": 0.28898975253105164, |
|
"learning_rate": 0.00011837495178165706, |
|
"loss": 0.6913, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.3590138674884438, |
|
"grad_norm": 0.3104959726333618, |
|
"learning_rate": 0.00011772579555979572, |
|
"loss": 0.7171, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.3651771956856702, |
|
"grad_norm": 0.31155380606651306, |
|
"learning_rate": 0.00011707586720869374, |
|
"loss": 0.7108, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.3713405238828968, |
|
"grad_norm": 0.3034297823905945, |
|
"learning_rate": 0.000116425195038995, |
|
"loss": 0.695, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.3775038520801233, |
|
"grad_norm": 0.3030366003513336, |
|
"learning_rate": 0.00011577380739374375, |
|
"loss": 0.7105, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.3836671802773497, |
|
"grad_norm": 0.32835301756858826, |
|
"learning_rate": 0.00011512173264715011, |
|
"loss": 0.704, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.3898305084745763, |
|
"grad_norm": 0.3224051892757416, |
|
"learning_rate": 0.00011446899920335405, |
|
"loss": 0.707, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.3959938366718028, |
|
"grad_norm": 0.29188138246536255, |
|
"learning_rate": 0.00011381563549518823, |
|
"loss": 0.6834, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.4021571648690292, |
|
"grad_norm": 0.2895198166370392, |
|
"learning_rate": 0.00011316166998293935, |
|
"loss": 0.6835, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.4083204930662558, |
|
"grad_norm": 0.290546178817749, |
|
"learning_rate": 0.00011250713115310851, |
|
"loss": 0.7032, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.4144838212634823, |
|
"grad_norm": 0.31281739473342896, |
|
"learning_rate": 0.00011185204751717029, |
|
"loss": 0.686, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.420647149460709, |
|
"grad_norm": 0.3190004825592041, |
|
"learning_rate": 0.00011119644761033078, |
|
"loss": 0.7173, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.4268104776579353, |
|
"grad_norm": 0.28831568360328674, |
|
"learning_rate": 0.00011054035999028478, |
|
"loss": 0.7009, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.4329738058551618, |
|
"grad_norm": 0.28771984577178955, |
|
"learning_rate": 0.00010988381323597157, |
|
"loss": 0.7114, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.4391371340523884, |
|
"grad_norm": 0.30700981616973877, |
|
"learning_rate": 0.00010922683594633021, |
|
"loss": 0.7084, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.4453004622496148, |
|
"grad_norm": 0.3292597532272339, |
|
"learning_rate": 0.00010856945673905369, |
|
"loss": 0.6937, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.4514637904468413, |
|
"grad_norm": 0.31772634387016296, |
|
"learning_rate": 0.00010791170424934247, |
|
"loss": 0.7193, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.457627118644068, |
|
"grad_norm": 0.2986114025115967, |
|
"learning_rate": 0.00010725360712865693, |
|
"loss": 0.7132, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.4637904468412943, |
|
"grad_norm": 0.3067531883716583, |
|
"learning_rate": 0.00010659519404346954, |
|
"loss": 0.7152, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.4699537750385208, |
|
"grad_norm": 0.34523364901542664, |
|
"learning_rate": 0.00010593649367401605, |
|
"loss": 0.6991, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.4761171032357474, |
|
"grad_norm": 0.29430335760116577, |
|
"learning_rate": 0.00010527753471304625, |
|
"loss": 0.692, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.4822804314329738, |
|
"grad_norm": 0.3257347643375397, |
|
"learning_rate": 0.00010461834586457398, |
|
"loss": 0.7137, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.4884437596302003, |
|
"grad_norm": 0.3314994275569916, |
|
"learning_rate": 0.00010395895584262696, |
|
"loss": 0.707, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.494607087827427, |
|
"grad_norm": 0.3335106372833252, |
|
"learning_rate": 0.00010329939336999596, |
|
"loss": 0.7173, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.5007704160246533, |
|
"grad_norm": 0.31246572732925415, |
|
"learning_rate": 0.00010263968717698364, |
|
"loss": 0.6886, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.5069337442218798, |
|
"grad_norm": 0.3151553273200989, |
|
"learning_rate": 0.00010197986600015305, |
|
"loss": 0.7264, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.5130970724191064, |
|
"grad_norm": 0.3316774368286133, |
|
"learning_rate": 0.00010131995858107591, |
|
"loss": 0.7008, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.5192604006163328, |
|
"grad_norm": 0.2832745909690857, |
|
"learning_rate": 0.00010065999366508057, |
|
"loss": 0.6898, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.5254237288135593, |
|
"grad_norm": 0.28495272994041443, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6953, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.531587057010786, |
|
"grad_norm": 0.33213871717453003, |
|
"learning_rate": 9.934000633491944e-05, |
|
"loss": 0.7176, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.5377503852080123, |
|
"grad_norm": 0.33437585830688477, |
|
"learning_rate": 9.868004141892411e-05, |
|
"loss": 0.7023, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5439137134052388, |
|
"grad_norm": 0.28734612464904785, |
|
"learning_rate": 9.802013399984696e-05, |
|
"loss": 0.6878, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.5500770416024654, |
|
"grad_norm": 0.28802672028541565, |
|
"learning_rate": 9.73603128230164e-05, |
|
"loss": 0.6957, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.5562403697996918, |
|
"grad_norm": 0.3412761688232422, |
|
"learning_rate": 9.670060663000408e-05, |
|
"loss": 0.7167, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.5624036979969183, |
|
"grad_norm": 0.3285284638404846, |
|
"learning_rate": 9.604104415737308e-05, |
|
"loss": 0.6969, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.568567026194145, |
|
"grad_norm": 0.31626519560813904, |
|
"learning_rate": 9.538165413542607e-05, |
|
"loss": 0.6938, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.5747303543913713, |
|
"grad_norm": 0.3252091109752655, |
|
"learning_rate": 9.472246528695376e-05, |
|
"loss": 0.6961, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.5808936825885977, |
|
"grad_norm": 0.36647289991378784, |
|
"learning_rate": 9.406350632598393e-05, |
|
"loss": 0.7062, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.5870570107858244, |
|
"grad_norm": 0.2924617826938629, |
|
"learning_rate": 9.340480595653047e-05, |
|
"loss": 0.722, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.5932203389830508, |
|
"grad_norm": 0.3311356008052826, |
|
"learning_rate": 9.274639287134308e-05, |
|
"loss": 0.7087, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.5993836671802772, |
|
"grad_norm": 0.3344646692276001, |
|
"learning_rate": 9.208829575065754e-05, |
|
"loss": 0.6879, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.605546995377504, |
|
"grad_norm": 0.31932997703552246, |
|
"learning_rate": 9.143054326094632e-05, |
|
"loss": 0.7035, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.6117103235747303, |
|
"grad_norm": 0.28748106956481934, |
|
"learning_rate": 9.077316405366981e-05, |
|
"loss": 0.695, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.6178736517719567, |
|
"grad_norm": 0.33069753646850586, |
|
"learning_rate": 9.011618676402845e-05, |
|
"loss": 0.6812, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.6240369799691834, |
|
"grad_norm": 0.3465948700904846, |
|
"learning_rate": 8.945964000971524e-05, |
|
"loss": 0.6822, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.6302003081664098, |
|
"grad_norm": 0.2904537320137024, |
|
"learning_rate": 8.880355238966923e-05, |
|
"loss": 0.6849, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.6363636363636362, |
|
"grad_norm": 0.2855393886566162, |
|
"learning_rate": 8.814795248282974e-05, |
|
"loss": 0.6769, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.642526964560863, |
|
"grad_norm": 0.31568413972854614, |
|
"learning_rate": 8.749286884689152e-05, |
|
"loss": 0.681, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.6486902927580893, |
|
"grad_norm": 0.350299209356308, |
|
"learning_rate": 8.683833001706067e-05, |
|
"loss": 0.7055, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.6548536209553157, |
|
"grad_norm": 0.29535943269729614, |
|
"learning_rate": 8.61843645048118e-05, |
|
"loss": 0.6867, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.6610169491525424, |
|
"grad_norm": 0.30539003014564514, |
|
"learning_rate": 8.553100079664598e-05, |
|
"loss": 0.7039, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.667180277349769, |
|
"grad_norm": 0.3268585801124573, |
|
"learning_rate": 8.487826735284991e-05, |
|
"loss": 0.7145, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.6733436055469952, |
|
"grad_norm": 0.35204389691352844, |
|
"learning_rate": 8.422619260625625e-05, |
|
"loss": 0.6869, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.6795069337442219, |
|
"grad_norm": 0.30481913685798645, |
|
"learning_rate": 8.357480496100498e-05, |
|
"loss": 0.6789, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.6856702619414485, |
|
"grad_norm": 0.3037097454071045, |
|
"learning_rate": 8.292413279130624e-05, |
|
"loss": 0.7007, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.6918335901386747, |
|
"grad_norm": 0.29266032576560974, |
|
"learning_rate": 8.22742044402043e-05, |
|
"loss": 0.689, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.6979969183359014, |
|
"grad_norm": 0.30263689160346985, |
|
"learning_rate": 8.162504821834295e-05, |
|
"loss": 0.6918, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.704160246533128, |
|
"grad_norm": 0.3087637424468994, |
|
"learning_rate": 8.097669240273236e-05, |
|
"loss": 0.6934, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.7103235747303542, |
|
"grad_norm": 0.29714614152908325, |
|
"learning_rate": 8.03291652355172e-05, |
|
"loss": 0.6979, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.7164869029275809, |
|
"grad_norm": 0.29517361521720886, |
|
"learning_rate": 7.96824949227466e-05, |
|
"loss": 0.706, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.7226502311248075, |
|
"grad_norm": 0.3192397952079773, |
|
"learning_rate": 7.903670963314536e-05, |
|
"loss": 0.7056, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.7288135593220337, |
|
"grad_norm": 0.3339553773403168, |
|
"learning_rate": 7.839183749688704e-05, |
|
"loss": 0.6903, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.7349768875192604, |
|
"grad_norm": 0.32658275961875916, |
|
"learning_rate": 7.774790660436858e-05, |
|
"loss": 0.7115, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.741140215716487, |
|
"grad_norm": 0.29387855529785156, |
|
"learning_rate": 7.710494500498662e-05, |
|
"loss": 0.6877, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.7473035439137135, |
|
"grad_norm": 0.29822373390197754, |
|
"learning_rate": 7.646298070591578e-05, |
|
"loss": 0.6949, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.7534668721109399, |
|
"grad_norm": 0.31081193685531616, |
|
"learning_rate": 7.582204167088864e-05, |
|
"loss": 0.6917, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.7596302003081665, |
|
"grad_norm": 0.2900318503379822, |
|
"learning_rate": 7.518215581897763e-05, |
|
"loss": 0.6935, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.765793528505393, |
|
"grad_norm": 0.31811362504959106, |
|
"learning_rate": 7.454335102337895e-05, |
|
"loss": 0.711, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.7719568567026194, |
|
"grad_norm": 0.2821851074695587, |
|
"learning_rate": 7.390565511019834e-05, |
|
"loss": 0.7083, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.778120184899846, |
|
"grad_norm": 0.3173108994960785, |
|
"learning_rate": 7.326909585723901e-05, |
|
"loss": 0.712, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.7842835130970724, |
|
"grad_norm": 0.29498183727264404, |
|
"learning_rate": 7.263370099279172e-05, |
|
"loss": 0.6876, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.7904468412942989, |
|
"grad_norm": 0.30469003319740295, |
|
"learning_rate": 7.199949819442682e-05, |
|
"loss": 0.6954, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.7966101694915255, |
|
"grad_norm": 0.2901192307472229, |
|
"learning_rate": 7.136651508778875e-05, |
|
"loss": 0.7086, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.802773497688752, |
|
"grad_norm": 0.2790692150592804, |
|
"learning_rate": 7.073477924539255e-05, |
|
"loss": 0.6667, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.8089368258859784, |
|
"grad_norm": 0.30207961797714233, |
|
"learning_rate": 7.010431818542297e-05, |
|
"loss": 0.7192, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.815100154083205, |
|
"grad_norm": 0.290354460477829, |
|
"learning_rate": 6.947515937053563e-05, |
|
"loss": 0.6741, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.8212634822804314, |
|
"grad_norm": 0.2848537862300873, |
|
"learning_rate": 6.884733020666086e-05, |
|
"loss": 0.6809, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.8274268104776579, |
|
"grad_norm": 0.28692835569381714, |
|
"learning_rate": 6.822085804180984e-05, |
|
"loss": 0.694, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.8335901386748845, |
|
"grad_norm": 0.28540992736816406, |
|
"learning_rate": 6.759577016488343e-05, |
|
"loss": 0.6825, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.839753466872111, |
|
"grad_norm": 0.31507408618927, |
|
"learning_rate": 6.697209380448333e-05, |
|
"loss": 0.6826, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.8459167950693374, |
|
"grad_norm": 0.3058791160583496, |
|
"learning_rate": 6.634985612772611e-05, |
|
"loss": 0.7011, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.852080123266564, |
|
"grad_norm": 0.30382823944091797, |
|
"learning_rate": 6.572908423905979e-05, |
|
"loss": 0.6994, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.8582434514637904, |
|
"grad_norm": 0.30683571100234985, |
|
"learning_rate": 6.510980517908334e-05, |
|
"loss": 0.7012, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.8644067796610169, |
|
"grad_norm": 0.2885062098503113, |
|
"learning_rate": 6.449204592336841e-05, |
|
"loss": 0.7007, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.8705701078582435, |
|
"grad_norm": 0.3002052307128906, |
|
"learning_rate": 6.387583338128471e-05, |
|
"loss": 0.7048, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.87673343605547, |
|
"grad_norm": 0.3367098271846771, |
|
"learning_rate": 6.326119439482756e-05, |
|
"loss": 0.7044, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.8828967642526964, |
|
"grad_norm": 0.3124644160270691, |
|
"learning_rate": 6.264815573744884e-05, |
|
"loss": 0.6954, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.889060092449923, |
|
"grad_norm": 0.30981218814849854, |
|
"learning_rate": 6.203674411289062e-05, |
|
"loss": 0.692, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.8952234206471494, |
|
"grad_norm": 0.3078441619873047, |
|
"learning_rate": 6.142698615402205e-05, |
|
"loss": 0.6872, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.9013867488443759, |
|
"grad_norm": 0.3080589473247528, |
|
"learning_rate": 6.0818908421679154e-05, |
|
"loss": 0.6902, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.9075500770416025, |
|
"grad_norm": 0.30964910984039307, |
|
"learning_rate": 6.021253740350793e-05, |
|
"loss": 0.7077, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.913713405238829, |
|
"grad_norm": 0.3046227693557739, |
|
"learning_rate": 5.960789951281052e-05, |
|
"loss": 0.7082, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.9198767334360554, |
|
"grad_norm": 0.31337085366249084, |
|
"learning_rate": 5.900502108739465e-05, |
|
"loss": 0.6838, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.926040061633282, |
|
"grad_norm": 0.3050321042537689, |
|
"learning_rate": 5.840392838842641e-05, |
|
"loss": 0.6967, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.9322033898305084, |
|
"grad_norm": 0.2953138053417206, |
|
"learning_rate": 5.780464759928623e-05, |
|
"loss": 0.6813, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.9383667180277349, |
|
"grad_norm": 0.295775443315506, |
|
"learning_rate": 5.720720482442845e-05, |
|
"loss": 0.6733, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.9445300462249615, |
|
"grad_norm": 0.3145037591457367, |
|
"learning_rate": 5.6611626088244194e-05, |
|
"loss": 0.7047, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.950693374422188, |
|
"grad_norm": 0.3275890648365021, |
|
"learning_rate": 5.601793733392764e-05, |
|
"loss": 0.6938, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.9568567026194144, |
|
"grad_norm": 0.30370888113975525, |
|
"learning_rate": 5.542616442234618e-05, |
|
"loss": 0.6818, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.963020030816641, |
|
"grad_norm": 0.31202757358551025, |
|
"learning_rate": 5.483633313091363e-05, |
|
"loss": 0.725, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.9691833590138677, |
|
"grad_norm": 0.2959240972995758, |
|
"learning_rate": 5.4248469152467695e-05, |
|
"loss": 0.6918, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.9753466872110939, |
|
"grad_norm": 0.29921895265579224, |
|
"learning_rate": 5.366259809415053e-05, |
|
"loss": 0.671, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.9815100154083205, |
|
"grad_norm": 0.3147589862346649, |
|
"learning_rate": 5.307874547629339e-05, |
|
"loss": 0.7122, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.9876733436055471, |
|
"grad_norm": 0.31646057963371277, |
|
"learning_rate": 5.249693673130511e-05, |
|
"loss": 0.6999, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.9938366718027734, |
|
"grad_norm": 0.29936257004737854, |
|
"learning_rate": 5.191719720256407e-05, |
|
"loss": 0.709, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.31907781958580017, |
|
"learning_rate": 5.1339552143314384e-05, |
|
"loss": 0.692, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.0030792917628943, |
|
"grad_norm": 0.2952061891555786, |
|
"learning_rate": 5.0764026715565785e-05, |
|
"loss": 0.7051, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.0092378752886835, |
|
"grad_norm": 0.29321756958961487, |
|
"learning_rate": 5.01906459889977e-05, |
|
"loss": 0.6786, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.0153964588144726, |
|
"grad_norm": 0.30473142862319946, |
|
"learning_rate": 4.961943493986708e-05, |
|
"loss": 0.6471, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.0215550423402617, |
|
"grad_norm": 0.33245131373405457, |
|
"learning_rate": 4.90504184499205e-05, |
|
"loss": 0.6572, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.027713625866051, |
|
"grad_norm": 0.3262290060520172, |
|
"learning_rate": 4.848362130531039e-05, |
|
"loss": 0.6661, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.03387220939184, |
|
"grad_norm": 0.31137773394584656, |
|
"learning_rate": 4.791906819551533e-05, |
|
"loss": 0.6809, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.040030792917629, |
|
"grad_norm": 0.3372393846511841, |
|
"learning_rate": 4.735678371226441e-05, |
|
"loss": 0.6769, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.046189376443418, |
|
"grad_norm": 0.3040129542350769, |
|
"learning_rate": 4.6796792348466356e-05, |
|
"loss": 0.668, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.052347959969207, |
|
"grad_norm": 0.33269986510276794, |
|
"learning_rate": 4.6239118497142256e-05, |
|
"loss": 0.6663, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.058506543494996, |
|
"grad_norm": 0.3195638656616211, |
|
"learning_rate": 4.568378645036335e-05, |
|
"loss": 0.6654, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.064665127020785, |
|
"grad_norm": 0.3389524519443512, |
|
"learning_rate": 4.513082039819264e-05, |
|
"loss": 0.7097, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.0708237105465743, |
|
"grad_norm": 0.32742780447006226, |
|
"learning_rate": 4.4580244427631215e-05, |
|
"loss": 0.6729, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.0769822940723635, |
|
"grad_norm": 0.3140937089920044, |
|
"learning_rate": 4.403208252156921e-05, |
|
"loss": 0.6645, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.0831408775981526, |
|
"grad_norm": 0.2982357144355774, |
|
"learning_rate": 4.3486358557740814e-05, |
|
"loss": 0.6527, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.0892994611239413, |
|
"grad_norm": 0.31070762872695923, |
|
"learning_rate": 4.2943096307684516e-05, |
|
"loss": 0.6329, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.0954580446497304, |
|
"grad_norm": 0.32711467146873474, |
|
"learning_rate": 4.2402319435707274e-05, |
|
"loss": 0.6531, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.1016166281755195, |
|
"grad_norm": 0.3270188570022583, |
|
"learning_rate": 4.186405149785403e-05, |
|
"loss": 0.6543, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.1077752117013087, |
|
"grad_norm": 0.3132490813732147, |
|
"learning_rate": 4.132831594088135e-05, |
|
"loss": 0.6646, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.113933795227098, |
|
"grad_norm": 0.3081105351448059, |
|
"learning_rate": 4.079513610123619e-05, |
|
"loss": 0.6564, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.120092378752887, |
|
"grad_norm": 0.3046637177467346, |
|
"learning_rate": 4.0264535204039486e-05, |
|
"loss": 0.6379, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.126250962278676, |
|
"grad_norm": 0.3062226474285126, |
|
"learning_rate": 3.973653636207437e-05, |
|
"loss": 0.6678, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.132409545804465, |
|
"grad_norm": 0.32641440629959106, |
|
"learning_rate": 3.921116257477927e-05, |
|
"loss": 0.6743, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.138568129330254, |
|
"grad_norm": 0.31709715723991394, |
|
"learning_rate": 3.86884367272463e-05, |
|
"loss": 0.6587, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.144726712856043, |
|
"grad_norm": 0.30693697929382324, |
|
"learning_rate": 3.81683815892242e-05, |
|
"loss": 0.6596, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.150885296381832, |
|
"grad_norm": 0.31892386078834534, |
|
"learning_rate": 3.7651019814126654e-05, |
|
"loss": 0.6471, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.1570438799076213, |
|
"grad_norm": 0.33241894841194153, |
|
"learning_rate": 3.713637393804531e-05, |
|
"loss": 0.6665, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.1632024634334104, |
|
"grad_norm": 0.3056892156600952, |
|
"learning_rate": 3.662446637876838e-05, |
|
"loss": 0.6595, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.1693610469591995, |
|
"grad_norm": 0.31584927439689636, |
|
"learning_rate": 3.6115319434803894e-05, |
|
"loss": 0.6726, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.1755196304849886, |
|
"grad_norm": 0.32377269864082336, |
|
"learning_rate": 3.5608955284408443e-05, |
|
"loss": 0.6869, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.1816782140107773, |
|
"grad_norm": 0.33342280983924866, |
|
"learning_rate": 3.510539598462127e-05, |
|
"loss": 0.6589, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.1878367975365665, |
|
"grad_norm": 0.3189857602119446, |
|
"learning_rate": 3.460466347030319e-05, |
|
"loss": 0.6841, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.1939953810623556, |
|
"grad_norm": 0.3210230767726898, |
|
"learning_rate": 3.410677955318142e-05, |
|
"loss": 0.6727, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.2001539645881447, |
|
"grad_norm": 0.3273890018463135, |
|
"learning_rate": 3.361176592089919e-05, |
|
"loss": 0.6617, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.206312548113934, |
|
"grad_norm": 0.3325875401496887, |
|
"learning_rate": 3.311964413607117e-05, |
|
"loss": 0.7081, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.212471131639723, |
|
"grad_norm": 0.3347218632698059, |
|
"learning_rate": 3.263043563534428e-05, |
|
"loss": 0.669, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.218629715165512, |
|
"grad_norm": 0.31812214851379395, |
|
"learning_rate": 3.214416172846381e-05, |
|
"loss": 0.6686, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.2247882986913012, |
|
"grad_norm": 0.32867950201034546, |
|
"learning_rate": 3.1660843597345135e-05, |
|
"loss": 0.6562, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.23094688221709, |
|
"grad_norm": 0.32618239521980286, |
|
"learning_rate": 3.1180502295151215e-05, |
|
"loss": 0.6604, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.237105465742879, |
|
"grad_norm": 0.34001123905181885, |
|
"learning_rate": 3.070315874537532e-05, |
|
"loss": 0.6585, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.243264049268668, |
|
"grad_norm": 0.32346731424331665, |
|
"learning_rate": 3.0228833740929797e-05, |
|
"loss": 0.6568, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.2494226327944573, |
|
"grad_norm": 0.3185584247112274, |
|
"learning_rate": 2.975754794324015e-05, |
|
"loss": 0.6535, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.2555812163202464, |
|
"grad_norm": 0.3232629895210266, |
|
"learning_rate": 2.9289321881345254e-05, |
|
"loss": 0.6685, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.2617397998460356, |
|
"grad_norm": 0.31890401244163513, |
|
"learning_rate": 2.8824175951002917e-05, |
|
"loss": 0.6771, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.2678983833718247, |
|
"grad_norm": 0.3079417943954468, |
|
"learning_rate": 2.8362130413801524e-05, |
|
"loss": 0.6678, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.2740569668976134, |
|
"grad_norm": 0.33039936423301697, |
|
"learning_rate": 2.7903205396277542e-05, |
|
"loss": 0.6727, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.2802155504234025, |
|
"grad_norm": 0.31055954098701477, |
|
"learning_rate": 2.744742088903861e-05, |
|
"loss": 0.6649, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.2863741339491916, |
|
"grad_norm": 0.33042651414871216, |
|
"learning_rate": 2.6994796745893002e-05, |
|
"loss": 0.6769, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.2925327174749808, |
|
"grad_norm": 0.3143700361251831, |
|
"learning_rate": 2.654535268298457e-05, |
|
"loss": 0.6776, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.29869130100077, |
|
"grad_norm": 0.32236433029174805, |
|
"learning_rate": 2.6099108277934103e-05, |
|
"loss": 0.6713, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.304849884526559, |
|
"grad_norm": 0.3301021456718445, |
|
"learning_rate": 2.5656082968986373e-05, |
|
"loss": 0.6623, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.311008468052348, |
|
"grad_norm": 0.314859002828598, |
|
"learning_rate": 2.5216296054163546e-05, |
|
"loss": 0.6703, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.317167051578137, |
|
"grad_norm": 0.3108798861503601, |
|
"learning_rate": 2.477976669042452e-05, |
|
"loss": 0.6665, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.323325635103926, |
|
"grad_norm": 0.33090952038764954, |
|
"learning_rate": 2.4346513892830423e-05, |
|
"loss": 0.6775, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.329484218629715, |
|
"grad_norm": 0.3476983606815338, |
|
"learning_rate": 2.3916556533716294e-05, |
|
"loss": 0.69, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.3356428021555042, |
|
"grad_norm": 0.32023751735687256, |
|
"learning_rate": 2.3489913341869195e-05, |
|
"loss": 0.6659, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.3418013856812934, |
|
"grad_norm": 0.32467585802078247, |
|
"learning_rate": 2.3066602901712108e-05, |
|
"loss": 0.6679, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.3479599692070825, |
|
"grad_norm": 0.31382596492767334, |
|
"learning_rate": 2.2646643652494692e-05, |
|
"loss": 0.6669, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.3541185527328716, |
|
"grad_norm": 0.33904537558555603, |
|
"learning_rate": 2.2230053887489867e-05, |
|
"loss": 0.6636, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.3602771362586603, |
|
"grad_norm": 0.3278086185455322, |
|
"learning_rate": 2.181685175319702e-05, |
|
"loss": 0.6557, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.3664357197844494, |
|
"grad_norm": 0.3350560665130615, |
|
"learning_rate": 2.1407055248551665e-05, |
|
"loss": 0.6698, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.3725943033102386, |
|
"grad_norm": 0.3336027264595032, |
|
"learning_rate": 2.100068222414121e-05, |
|
"loss": 0.6756, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.3787528868360277, |
|
"grad_norm": 0.32608169317245483, |
|
"learning_rate": 2.0597750381427604e-05, |
|
"loss": 0.662, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.384911470361817, |
|
"grad_norm": 0.323307603597641, |
|
"learning_rate": 2.0198277271976052e-05, |
|
"loss": 0.67, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.391070053887606, |
|
"grad_norm": 0.3259533941745758, |
|
"learning_rate": 1.9802280296690722e-05, |
|
"loss": 0.6599, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.397228637413395, |
|
"grad_norm": 0.3259793519973755, |
|
"learning_rate": 1.9409776705056516e-05, |
|
"loss": 0.6713, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.4033872209391838, |
|
"grad_norm": 0.3325396478176117, |
|
"learning_rate": 1.902078359438788e-05, |
|
"loss": 0.667, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.409545804464973, |
|
"grad_norm": 0.3348793685436249, |
|
"learning_rate": 1.8635317909083983e-05, |
|
"loss": 0.6745, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.415704387990762, |
|
"grad_norm": 0.33007267117500305, |
|
"learning_rate": 1.825339643989058e-05, |
|
"loss": 0.678, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.421862971516551, |
|
"grad_norm": 0.3420184850692749, |
|
"learning_rate": 1.787503582316864e-05, |
|
"loss": 0.6888, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.4280215550423403, |
|
"grad_norm": 0.34831422567367554, |
|
"learning_rate": 1.750025254016978e-05, |
|
"loss": 0.6722, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.4341801385681294, |
|
"grad_norm": 0.3232448995113373, |
|
"learning_rate": 1.712906291631814e-05, |
|
"loss": 0.6306, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.4403387220939186, |
|
"grad_norm": 0.32886627316474915, |
|
"learning_rate": 1.6761483120499455e-05, |
|
"loss": 0.6809, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.4464973056197072, |
|
"grad_norm": 0.3261054456233978, |
|
"learning_rate": 1.6397529164356606e-05, |
|
"loss": 0.6402, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.4526558891454964, |
|
"grad_norm": 0.3413579761981964, |
|
"learning_rate": 1.6037216901592243e-05, |
|
"loss": 0.6654, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.4588144726712855, |
|
"grad_norm": 0.31918439269065857, |
|
"learning_rate": 1.5680562027278157e-05, |
|
"loss": 0.6782, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.4649730561970746, |
|
"grad_norm": 0.33338531851768494, |
|
"learning_rate": 1.5327580077171587e-05, |
|
"loss": 0.6605, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.4711316397228638, |
|
"grad_norm": 0.33805474638938904, |
|
"learning_rate": 1.4978286427038601e-05, |
|
"loss": 0.6483, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.477290223248653, |
|
"grad_norm": 0.3180122971534729, |
|
"learning_rate": 1.463269629198416e-05, |
|
"loss": 0.6636, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.483448806774442, |
|
"grad_norm": 0.3195035755634308, |
|
"learning_rate": 1.4290824725789542e-05, |
|
"loss": 0.6603, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.4896073903002307, |
|
"grad_norm": 0.310829222202301, |
|
"learning_rate": 1.3952686620256428e-05, |
|
"loss": 0.6484, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.49576597382602, |
|
"grad_norm": 0.3324812054634094, |
|
"learning_rate": 1.3618296704558364e-05, |
|
"loss": 0.6706, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.501924557351809, |
|
"grad_norm": 0.33571305871009827, |
|
"learning_rate": 1.328766954459909e-05, |
|
"loss": 0.6631, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.508083140877598, |
|
"grad_norm": 0.36481767892837524, |
|
"learning_rate": 1.2960819542378056e-05, |
|
"loss": 0.6737, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.5142417244033872, |
|
"grad_norm": 0.34252485632896423, |
|
"learning_rate": 1.2637760935363053e-05, |
|
"loss": 0.6736, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.5204003079291764, |
|
"grad_norm": 0.340250700712204, |
|
"learning_rate": 1.2318507795870138e-05, |
|
"loss": 0.6653, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.5265588914549655, |
|
"grad_norm": 0.3391686677932739, |
|
"learning_rate": 1.2003074030450534e-05, |
|
"loss": 0.668, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.532717474980754, |
|
"grad_norm": 0.3234756290912628, |
|
"learning_rate": 1.1691473379284944e-05, |
|
"loss": 0.6512, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.5388760585065437, |
|
"grad_norm": 0.3238365948200226, |
|
"learning_rate": 1.1383719415584948e-05, |
|
"loss": 0.6581, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.5450346420323324, |
|
"grad_norm": 0.34081512689590454, |
|
"learning_rate": 1.1079825545001888e-05, |
|
"loss": 0.6705, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.5511932255581216, |
|
"grad_norm": 0.3313857913017273, |
|
"learning_rate": 1.0779805005042787e-05, |
|
"loss": 0.6544, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.5573518090839107, |
|
"grad_norm": 0.31094086170196533, |
|
"learning_rate": 1.0483670864493778e-05, |
|
"loss": 0.6555, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.5635103926097, |
|
"grad_norm": 0.3293236494064331, |
|
"learning_rate": 1.0191436022850908e-05, |
|
"loss": 0.6713, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.569668976135489, |
|
"grad_norm": 0.3299398720264435, |
|
"learning_rate": 9.903113209758096e-06, |
|
"loss": 0.6724, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.5758275596612776, |
|
"grad_norm": 0.3232874572277069, |
|
"learning_rate": 9.618714984452793e-06, |
|
"loss": 0.6528, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.581986143187067, |
|
"grad_norm": 0.33690372109413147, |
|
"learning_rate": 9.33825373521875e-06, |
|
"loss": 0.7024, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.588144726712856, |
|
"grad_norm": 0.3221462070941925, |
|
"learning_rate": 9.061741678846514e-06, |
|
"loss": 0.6745, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.594303310238645, |
|
"grad_norm": 0.34095361828804016, |
|
"learning_rate": 8.789190860101225e-06, |
|
"loss": 0.6825, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.600461893764434, |
|
"grad_norm": 0.3275180757045746, |
|
"learning_rate": 8.520613151197898e-06, |
|
"loss": 0.6655, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.6066204772902233, |
|
"grad_norm": 0.335509330034256, |
|
"learning_rate": 8.25602025128438e-06, |
|
"loss": 0.6625, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.6127790608160124, |
|
"grad_norm": 0.34106114506721497, |
|
"learning_rate": 7.995423685931625e-06, |
|
"loss": 0.6565, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.6189376443418015, |
|
"grad_norm": 0.3306207060813904, |
|
"learning_rate": 7.738834806631711e-06, |
|
"loss": 0.6482, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.6250962278675907, |
|
"grad_norm": 0.3112262487411499, |
|
"learning_rate": 7.48626479030341e-06, |
|
"loss": 0.6613, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.6312548113933794, |
|
"grad_norm": 0.3088861405849457, |
|
"learning_rate": 7.237724638805221e-06, |
|
"loss": 0.6738, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.6374133949191685, |
|
"grad_norm": 0.3252825438976288, |
|
"learning_rate": 6.9932251784562194e-06, |
|
"loss": 0.6636, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.6435719784449576, |
|
"grad_norm": 0.32623544335365295, |
|
"learning_rate": 6.75277705956443e-06, |
|
"loss": 0.6706, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.6497305619707467, |
|
"grad_norm": 0.31198734045028687, |
|
"learning_rate": 6.516390755962886e-06, |
|
"loss": 0.6358, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.655889145496536, |
|
"grad_norm": 0.33825692534446716, |
|
"learning_rate": 6.284076564553465e-06, |
|
"loss": 0.6768, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.662047729022325, |
|
"grad_norm": 0.3198145627975464, |
|
"learning_rate": 6.055844604858252e-06, |
|
"loss": 0.6766, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.668206312548114, |
|
"grad_norm": 0.3122054636478424, |
|
"learning_rate": 5.831704818578843e-06, |
|
"loss": 0.6697, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.674364896073903, |
|
"grad_norm": 0.330606609582901, |
|
"learning_rate": 5.611666969163243e-06, |
|
"loss": 0.6424, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.680523479599692, |
|
"grad_norm": 0.3488421142101288, |
|
"learning_rate": 5.3957406413805315e-06, |
|
"loss": 0.6735, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.686682063125481, |
|
"grad_norm": 0.32194870710372925, |
|
"learning_rate": 5.183935240903414e-06, |
|
"loss": 0.6689, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.69284064665127, |
|
"grad_norm": 0.3430643379688263, |
|
"learning_rate": 4.976259993898502e-06, |
|
"loss": 0.6592, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.6989992301770593, |
|
"grad_norm": 0.31703972816467285, |
|
"learning_rate": 4.7727239466244135e-06, |
|
"loss": 0.6738, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.7051578137028485, |
|
"grad_norm": 0.3227265775203705, |
|
"learning_rate": 4.573335965037706e-06, |
|
"loss": 0.6883, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.7113163972286376, |
|
"grad_norm": 0.3276483118534088, |
|
"learning_rate": 4.378104734406707e-06, |
|
"loss": 0.6686, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.7174749807544263, |
|
"grad_norm": 0.34250396490097046, |
|
"learning_rate": 4.187038758933204e-06, |
|
"loss": 0.6833, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.7236335642802154, |
|
"grad_norm": 0.33254727721214294, |
|
"learning_rate": 4.000146361381918e-06, |
|
"loss": 0.6639, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 2.7297921478060045, |
|
"grad_norm": 0.31365904211997986, |
|
"learning_rate": 3.817435682718096e-06, |
|
"loss": 0.6624, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.7359507313317937, |
|
"grad_norm": 0.3160648047924042, |
|
"learning_rate": 3.638914681752759e-06, |
|
"loss": 0.6489, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.742109314857583, |
|
"grad_norm": 0.3108401298522949, |
|
"learning_rate": 3.4645911347961357e-06, |
|
"loss": 0.6621, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.748267898383372, |
|
"grad_norm": 0.3212301433086395, |
|
"learning_rate": 3.294472635318846e-06, |
|
"loss": 0.663, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.754426481909161, |
|
"grad_norm": 0.3138129711151123, |
|
"learning_rate": 3.1285665936211515e-06, |
|
"loss": 0.6578, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.7605850654349497, |
|
"grad_norm": 0.32291486859321594, |
|
"learning_rate": 2.9668802365102054e-06, |
|
"loss": 0.6557, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.766743648960739, |
|
"grad_norm": 0.32492905855178833, |
|
"learning_rate": 2.809420606985236e-06, |
|
"loss": 0.6436, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.772902232486528, |
|
"grad_norm": 0.31086674332618713, |
|
"learning_rate": 2.656194563930714e-06, |
|
"loss": 0.6628, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.779060816012317, |
|
"grad_norm": 0.3261342942714691, |
|
"learning_rate": 2.5072087818176382e-06, |
|
"loss": 0.6647, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.7852193995381063, |
|
"grad_norm": 0.3263491690158844, |
|
"learning_rate": 2.3624697504127545e-06, |
|
"loss": 0.6857, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.7913779830638954, |
|
"grad_norm": 0.32167235016822815, |
|
"learning_rate": 2.2219837744959283e-06, |
|
"loss": 0.6613, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.7975365665896845, |
|
"grad_norm": 0.31256523728370667, |
|
"learning_rate": 2.0857569735853956e-06, |
|
"loss": 0.6582, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.803695150115473, |
|
"grad_norm": 0.32856857776641846, |
|
"learning_rate": 1.9537952816713333e-06, |
|
"loss": 0.6712, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.809853733641263, |
|
"grad_norm": 0.3099411725997925, |
|
"learning_rate": 1.8261044469572997e-06, |
|
"loss": 0.6513, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.8160123171670515, |
|
"grad_norm": 0.3154350519180298, |
|
"learning_rate": 1.7026900316098215e-06, |
|
"loss": 0.6485, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.8221709006928406, |
|
"grad_norm": 0.3081769049167633, |
|
"learning_rate": 1.5835574115162121e-06, |
|
"loss": 0.6518, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.8283294842186297, |
|
"grad_norm": 0.3187597095966339, |
|
"learning_rate": 1.4687117760502578e-06, |
|
"loss": 0.6723, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.834488067744419, |
|
"grad_norm": 0.3135562241077423, |
|
"learning_rate": 1.3581581278463096e-06, |
|
"loss": 0.6535, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.840646651270208, |
|
"grad_norm": 0.32838648557662964, |
|
"learning_rate": 1.2519012825812804e-06, |
|
"loss": 0.6634, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.8468052347959967, |
|
"grad_norm": 0.32376208901405334, |
|
"learning_rate": 1.149945868764879e-06, |
|
"loss": 0.6459, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 2.8529638183217862, |
|
"grad_norm": 0.33817723393440247, |
|
"learning_rate": 1.0522963275380493e-06, |
|
"loss": 0.6745, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 2.859122401847575, |
|
"grad_norm": 0.31837162375450134, |
|
"learning_rate": 9.589569124794916e-07, |
|
"loss": 0.6384, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.865280985373364, |
|
"grad_norm": 0.3298625946044922, |
|
"learning_rate": 8.699316894203224e-07, |
|
"loss": 0.6444, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 2.871439568899153, |
|
"grad_norm": 0.3114056885242462, |
|
"learning_rate": 7.852245362670707e-07, |
|
"loss": 0.6536, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 2.8775981524249423, |
|
"grad_norm": 0.32438334822654724, |
|
"learning_rate": 7.048391428326584e-07, |
|
"loss": 0.6767, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 2.8837567359507315, |
|
"grad_norm": 0.3234598636627197, |
|
"learning_rate": 6.287790106757396e-07, |
|
"loss": 0.6613, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 2.88991531947652, |
|
"grad_norm": 0.31476038694381714, |
|
"learning_rate": 5.570474529481562e-07, |
|
"loss": 0.6693, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.8960739030023097, |
|
"grad_norm": 0.31584304571151733, |
|
"learning_rate": 4.896475942506085e-07, |
|
"loss": 0.6585, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 2.9022324865280984, |
|
"grad_norm": 0.3189273178577423, |
|
"learning_rate": 4.2658237049655323e-07, |
|
"loss": 0.6599, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 2.9083910700538875, |
|
"grad_norm": 0.34343400597572327, |
|
"learning_rate": 3.6785452878429493e-07, |
|
"loss": 0.6678, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 2.9145496535796767, |
|
"grad_norm": 0.31915152072906494, |
|
"learning_rate": 3.134666272774034e-07, |
|
"loss": 0.6743, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 2.920708237105466, |
|
"grad_norm": 0.32635965943336487, |
|
"learning_rate": 2.6342103509315876e-07, |
|
"loss": 0.6463, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.926866820631255, |
|
"grad_norm": 0.350659042596817, |
|
"learning_rate": 2.177199321994672e-07, |
|
"loss": 0.6762, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 2.9330254041570436, |
|
"grad_norm": 0.31696727871894836, |
|
"learning_rate": 1.7636530931982586e-07, |
|
"loss": 0.6755, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 2.939183987682833, |
|
"grad_norm": 0.33761581778526306, |
|
"learning_rate": 1.393589678466367e-07, |
|
"loss": 0.6655, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 2.945342571208622, |
|
"grad_norm": 0.3336792290210724, |
|
"learning_rate": 1.0670251976275803e-07, |
|
"loss": 0.6818, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 2.951501154734411, |
|
"grad_norm": 0.33801573514938354, |
|
"learning_rate": 7.839738757123849e-08, |
|
"loss": 0.6653, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.9576597382602, |
|
"grad_norm": 0.319459468126297, |
|
"learning_rate": 5.444480423341114e-08, |
|
"loss": 0.6577, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 2.9638183217859893, |
|
"grad_norm": 0.3176517188549042, |
|
"learning_rate": 3.484581311511414e-08, |
|
"loss": 0.67, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 2.9699769053117784, |
|
"grad_norm": 0.3248700499534607, |
|
"learning_rate": 1.9601267941338208e-08, |
|
"loss": 0.6551, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 2.9761354888375675, |
|
"grad_norm": 0.3170982897281647, |
|
"learning_rate": 8.711832758934169e-09, |
|
"loss": 0.6735, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 2.9822940723633566, |
|
"grad_norm": 0.32592684030532837, |
|
"learning_rate": 2.1779819077583087e-09, |
|
"loss": 0.6479, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.9884526558891453, |
|
"grad_norm": 0.32434290647506714, |
|
"learning_rate": 0.0, |
|
"loss": 0.6599, |
|
"step": 486 |
|
} |
|
  ],
  "logging_steps": 1,
  "max_steps": 486,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 162,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
"total_flos": 2.8762573623923835e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |