{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9976905311778291,
  "eval_steps": 500,
  "global_step": 162,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006158583525789068,
      "grad_norm": 1.4914512634277344,
      "learning_rate": 2e-05,
      "loss": 2.3404,
      "step": 1
    },
    {
      "epoch": 0.012317167051578136,
      "grad_norm": 1.486335039138794,
      "learning_rate": 4e-05,
      "loss": 2.3428,
      "step": 2
    },
    {
      "epoch": 0.018475750577367205,
      "grad_norm": 1.4762102365493774,
      "learning_rate": 6e-05,
      "loss": 2.3374,
      "step": 3
    },
    {
      "epoch": 0.024634334103156273,
      "grad_norm": 1.5677003860473633,
      "learning_rate": 8e-05,
      "loss": 2.3111,
      "step": 4
    },
    {
      "epoch": 0.030792917628945343,
      "grad_norm": 1.5895426273345947,
      "learning_rate": 0.0001,
      "loss": 2.2003,
      "step": 5
    },
    {
      "epoch": 0.03695150115473441,
      "grad_norm": 1.4371826648712158,
      "learning_rate": 0.00012,
      "loss": 2.0329,
      "step": 6
    },
    {
      "epoch": 0.04311008468052348,
      "grad_norm": 1.3726378679275513,
      "learning_rate": 0.00014,
      "loss": 1.8007,
      "step": 7
    },
    {
      "epoch": 0.049268668206312545,
      "grad_norm": 1.4886671304702759,
      "learning_rate": 0.00016,
      "loss": 1.5998,
      "step": 8
    },
    {
      "epoch": 0.05542725173210162,
      "grad_norm": 1.3068833351135254,
      "learning_rate": 0.00018,
      "loss": 1.3961,
      "step": 9
    },
    {
      "epoch": 0.061585835257890686,
      "grad_norm": 3.4257965087890625,
      "learning_rate": 0.0002,
      "loss": 1.3012,
      "step": 10
    },
    {
      "epoch": 0.06774441878367975,
      "grad_norm": 1.5885640382766724,
      "learning_rate": 0.00019999782201809226,
      "loss": 1.1308,
      "step": 11
    },
    {
      "epoch": 0.07390300230946882,
      "grad_norm": 0.9890074729919434,
      "learning_rate": 0.00019999128816724108,
      "loss": 1.0712,
      "step": 12
    },
    {
      "epoch": 0.0800615858352579,
      "grad_norm": 0.9384965300559998,
      "learning_rate": 0.00019998039873205868,
      "loss": 1.0627,
      "step": 13
    },
    {
      "epoch": 0.08622016936104696,
      "grad_norm": 3.847111225128174,
      "learning_rate": 0.0001999651541868849,
      "loss": 1.01,
      "step": 14
    },
    {
      "epoch": 0.09237875288683603,
      "grad_norm": 1.0700613260269165,
      "learning_rate": 0.00019994555519576662,
      "loss": 1.0198,
      "step": 15
    },
    {
      "epoch": 0.09853733641262509,
      "grad_norm": 3.9324963092803955,
      "learning_rate": 0.00019992160261242877,
      "loss": 1.0316,
      "step": 16
    },
    {
      "epoch": 0.10469591993841416,
      "grad_norm": 0.8849812150001526,
      "learning_rate": 0.00019989329748023725,
      "loss": 0.9896,
      "step": 17
    },
    {
      "epoch": 0.11085450346420324,
      "grad_norm": 0.5666069984436035,
      "learning_rate": 0.00019986064103215339,
      "loss": 0.9734,
      "step": 18
    },
    {
      "epoch": 0.1170130869899923,
      "grad_norm": 0.4113065302371979,
      "learning_rate": 0.0001998236346906802,
      "loss": 0.9253,
      "step": 19
    },
    {
      "epoch": 0.12317167051578137,
      "grad_norm": 0.3489767909049988,
      "learning_rate": 0.00019978228006780054,
      "loss": 0.9113,
      "step": 20
    },
    {
      "epoch": 0.12933025404157045,
      "grad_norm": 1.2077497243881226,
      "learning_rate": 0.00019973657896490686,
      "loss": 0.9194,
      "step": 21
    },
    {
      "epoch": 0.1354888375673595,
      "grad_norm": 0.4487021863460541,
      "learning_rate": 0.00019968653337272261,
      "loss": 0.9058,
      "step": 22
    },
    {
      "epoch": 0.14164742109314857,
      "grad_norm": 0.35114091634750366,
      "learning_rate": 0.0001996321454712157,
      "loss": 0.9066,
      "step": 23
    },
    {
      "epoch": 0.14780600461893764,
      "grad_norm": 0.3634772300720215,
      "learning_rate": 0.00019957341762950344,
      "loss": 0.913,
      "step": 24
    },
    {
      "epoch": 0.15396458814472672,
      "grad_norm": 0.4948749244213104,
      "learning_rate": 0.0001995103524057494,
      "loss": 0.8944,
      "step": 25
    },
    {
      "epoch": 0.1601231716705158,
      "grad_norm": 0.3144727051258087,
      "learning_rate": 0.00019944295254705185,
      "loss": 0.8973,
      "step": 26
    },
    {
      "epoch": 0.16628175519630484,
      "grad_norm": 0.26469510793685913,
      "learning_rate": 0.00019937122098932428,
      "loss": 0.8776,
      "step": 27
    },
    {
      "epoch": 0.1724403387220939,
      "grad_norm": 0.29824352264404297,
      "learning_rate": 0.00019929516085716734,
      "loss": 0.8521,
      "step": 28
    },
    {
      "epoch": 0.17859892224788299,
      "grad_norm": 0.2820914387702942,
      "learning_rate": 0.00019921477546373296,
      "loss": 0.862,
      "step": 29
    },
    {
      "epoch": 0.18475750577367206,
      "grad_norm": 0.3102569282054901,
      "learning_rate": 0.00019913006831057969,
      "loss": 0.8586,
      "step": 30
    },
    {
      "epoch": 0.19091608929946113,
      "grad_norm": 0.30348193645477295,
      "learning_rate": 0.0001990410430875205,
      "loss": 0.854,
      "step": 31
    },
    {
      "epoch": 0.19707467282525018,
      "grad_norm": 0.31126317381858826,
      "learning_rate": 0.00019894770367246195,
      "loss": 0.8209,
      "step": 32
    },
    {
      "epoch": 0.20323325635103925,
      "grad_norm": 0.3367297351360321,
      "learning_rate": 0.00019885005413123515,
      "loss": 0.8545,
      "step": 33
    },
    {
      "epoch": 0.20939183987682833,
      "grad_norm": 0.28940874338150024,
      "learning_rate": 0.00019874809871741876,
      "loss": 0.8561,
      "step": 34
    },
    {
      "epoch": 0.2155504234026174,
      "grad_norm": 0.3074447512626648,
      "learning_rate": 0.00019864184187215372,
      "loss": 0.8495,
      "step": 35
    },
    {
      "epoch": 0.22170900692840648,
      "grad_norm": 0.29862889647483826,
      "learning_rate": 0.00019853128822394975,
      "loss": 0.8447,
      "step": 36
    },
    {
      "epoch": 0.22786759045419552,
      "grad_norm": 0.3074705898761749,
      "learning_rate": 0.0001984164425884838,
      "loss": 0.8199,
      "step": 37
    },
    {
      "epoch": 0.2340261739799846,
      "grad_norm": 0.2626888155937195,
      "learning_rate": 0.0001982973099683902,
      "loss": 0.8289,
      "step": 38
    },
    {
      "epoch": 0.24018475750577367,
      "grad_norm": 0.26036959886550903,
      "learning_rate": 0.00019817389555304272,
      "loss": 0.8244,
      "step": 39
    },
    {
      "epoch": 0.24634334103156275,
      "grad_norm": 0.2532216012477875,
      "learning_rate": 0.0001980462047183287,
      "loss": 0.8236,
      "step": 40
    },
    {
      "epoch": 0.2525019245573518,
      "grad_norm": 0.30278533697128296,
      "learning_rate": 0.0001979142430264146,
      "loss": 0.8207,
      "step": 41
    },
    {
      "epoch": 0.2586605080831409,
      "grad_norm": 0.29070666432380676,
      "learning_rate": 0.00019777801622550408,
      "loss": 0.823,
      "step": 42
    },
    {
      "epoch": 0.26481909160892997,
      "grad_norm": 0.2718726694583893,
      "learning_rate": 0.00019763753024958723,
      "loss": 0.8054,
      "step": 43
    },
    {
      "epoch": 0.270977675134719,
      "grad_norm": 0.25870487093925476,
      "learning_rate": 0.00019749279121818235,
      "loss": 0.8118,
      "step": 44
    },
    {
      "epoch": 0.27713625866050806,
      "grad_norm": 0.2909790873527527,
      "learning_rate": 0.0001973438054360693,
      "loss": 0.8101,
      "step": 45
    },
    {
      "epoch": 0.28329484218629714,
      "grad_norm": 0.30481499433517456,
      "learning_rate": 0.00019719057939301477,
      "loss": 0.8394,
      "step": 46
    },
    {
      "epoch": 0.2894534257120862,
      "grad_norm": 0.2736552059650421,
      "learning_rate": 0.0001970331197634898,
      "loss": 0.7932,
      "step": 47
    },
    {
      "epoch": 0.2956120092378753,
      "grad_norm": 0.250683456659317,
      "learning_rate": 0.00019687143340637887,
      "loss": 0.7907,
      "step": 48
    },
    {
      "epoch": 0.30177059276366436,
      "grad_norm": 0.32575681805610657,
      "learning_rate": 0.00019670552736468118,
      "loss": 0.8022,
      "step": 49
    },
    {
      "epoch": 0.30792917628945343,
      "grad_norm": 0.3301404118537903,
      "learning_rate": 0.00019653540886520386,
      "loss": 0.8163,
      "step": 50
    },
    {
      "epoch": 0.3140877598152425,
      "grad_norm": 0.31461504101753235,
      "learning_rate": 0.00019636108531824724,
      "loss": 0.8111,
      "step": 51
    },
    {
      "epoch": 0.3202463433410316,
      "grad_norm": 0.2696295380592346,
      "learning_rate": 0.00019618256431728194,
      "loss": 0.7953,
      "step": 52
    },
    {
      "epoch": 0.32640492686682065,
      "grad_norm": 0.29559096693992615,
      "learning_rate": 0.0001959998536386181,
      "loss": 0.7759,
      "step": 53
    },
    {
      "epoch": 0.3325635103926097,
      "grad_norm": 0.2746593654155731,
      "learning_rate": 0.0001958129612410668,
      "loss": 0.7817,
      "step": 54
    },
    {
      "epoch": 0.33872209391839875,
      "grad_norm": 0.30077093839645386,
      "learning_rate": 0.0001956218952655933,
      "loss": 0.7965,
      "step": 55
    },
    {
      "epoch": 0.3448806774441878,
      "grad_norm": 0.2864063084125519,
      "learning_rate": 0.00019542666403496233,
      "loss": 0.812,
      "step": 56
    },
    {
      "epoch": 0.3510392609699769,
      "grad_norm": 0.3612823784351349,
      "learning_rate": 0.0001952272760533756,
      "loss": 0.7652,
      "step": 57
    },
    {
      "epoch": 0.35719784449576597,
      "grad_norm": 0.28117987513542175,
      "learning_rate": 0.00019502374000610151,
      "loss": 0.7982,
      "step": 58
    },
    {
      "epoch": 0.36335642802155504,
      "grad_norm": 0.3067573010921478,
      "learning_rate": 0.0001948160647590966,
      "loss": 0.7703,
      "step": 59
    },
    {
      "epoch": 0.3695150115473441,
      "grad_norm": 0.27148452401161194,
      "learning_rate": 0.00019460425935861948,
      "loss": 0.7795,
      "step": 60
    },
    {
      "epoch": 0.3756735950731332,
      "grad_norm": 0.35128867626190186,
      "learning_rate": 0.00019438833303083678,
      "loss": 0.7637,
      "step": 61
    },
    {
      "epoch": 0.38183217859892227,
      "grad_norm": 0.3009471297264099,
      "learning_rate": 0.00019416829518142118,
      "loss": 0.7838,
      "step": 62
    },
    {
      "epoch": 0.38799076212471134,
      "grad_norm": 0.3329865038394928,
      "learning_rate": 0.00019394415539514178,
      "loss": 0.7949,
      "step": 63
    },
    {
      "epoch": 0.39414934565050036,
      "grad_norm": 0.2731780707836151,
      "learning_rate": 0.00019371592343544656,
      "loss": 0.779,
      "step": 64
    },
    {
      "epoch": 0.40030792917628943,
      "grad_norm": 0.3036772608757019,
      "learning_rate": 0.00019348360924403713,
      "loss": 0.7665,
      "step": 65
    },
    {
      "epoch": 0.4064665127020785,
      "grad_norm": 0.28768160939216614,
      "learning_rate": 0.00019324722294043558,
      "loss": 0.7916,
      "step": 66
    },
    {
      "epoch": 0.4126250962278676,
      "grad_norm": 0.30529218912124634,
      "learning_rate": 0.0001930067748215438,
      "loss": 0.7848,
      "step": 67
    },
    {
      "epoch": 0.41878367975365666,
      "grad_norm": 0.2631685733795166,
      "learning_rate": 0.0001927622753611948,
      "loss": 0.785,
      "step": 68
    },
    {
      "epoch": 0.42494226327944573,
      "grad_norm": 0.33626529574394226,
      "learning_rate": 0.0001925137352096966,
      "loss": 0.7696,
      "step": 69
    },
    {
      "epoch": 0.4311008468052348,
      "grad_norm": 0.30560940504074097,
      "learning_rate": 0.0001922611651933683,
      "loss": 0.7623,
      "step": 70
    },
    {
      "epoch": 0.4372594303310239,
      "grad_norm": 0.31474098563194275,
      "learning_rate": 0.0001920045763140684,
      "loss": 0.7909,
      "step": 71
    },
    {
      "epoch": 0.44341801385681295,
      "grad_norm": 0.2986692488193512,
      "learning_rate": 0.00019174397974871564,
      "loss": 0.7837,
      "step": 72
    },
    {
      "epoch": 0.44957659738260203,
      "grad_norm": 0.2956785261631012,
      "learning_rate": 0.0001914793868488021,
      "loss": 0.7721,
      "step": 73
    },
    {
      "epoch": 0.45573518090839105,
      "grad_norm": 0.34299513697624207,
      "learning_rate": 0.0001912108091398988,
      "loss": 0.7595,
      "step": 74
    },
    {
      "epoch": 0.4618937644341801,
      "grad_norm": 0.2858146131038666,
      "learning_rate": 0.0001909382583211535,
      "loss": 0.77,
      "step": 75
    },
    {
      "epoch": 0.4680523479599692,
      "grad_norm": 0.2714357078075409,
      "learning_rate": 0.0001906617462647813,
      "loss": 0.755,
      "step": 76
    },
    {
      "epoch": 0.47421093148575827,
      "grad_norm": 0.2805250287055969,
      "learning_rate": 0.0001903812850155472,
      "loss": 0.7572,
      "step": 77
    },
    {
      "epoch": 0.48036951501154734,
      "grad_norm": 0.321464866399765,
      "learning_rate": 0.0001900968867902419,
      "loss": 0.7259,
      "step": 78
    },
    {
      "epoch": 0.4865280985373364,
      "grad_norm": 0.27994561195373535,
      "learning_rate": 0.00018980856397714913,
      "loss": 0.7779,
      "step": 79
    },
    {
      "epoch": 0.4926866820631255,
      "grad_norm": 0.3206578195095062,
      "learning_rate": 0.00018951632913550626,
      "loss": 0.7599,
      "step": 80
    },
    {
      "epoch": 0.49884526558891457,
      "grad_norm": 0.30579233169555664,
      "learning_rate": 0.00018922019499495725,
      "loss": 0.763,
      "step": 81
    },
    {
      "epoch": 0.5050038491147036,
      "grad_norm": 0.2705940902233124,
      "learning_rate": 0.0001889201744549981,
      "loss": 0.7436,
      "step": 82
    },
    {
      "epoch": 0.5111624326404927,
      "grad_norm": 0.36793988943099976,
      "learning_rate": 0.00018861628058441506,
      "loss": 0.7489,
      "step": 83
    },
    {
      "epoch": 0.5173210161662818,
      "grad_norm": 0.30018600821495056,
      "learning_rate": 0.00018830852662071507,
      "loss": 0.7479,
      "step": 84
    },
    {
      "epoch": 0.5234795996920708,
      "grad_norm": 0.2857201397418976,
      "learning_rate": 0.00018799692596954947,
      "loss": 0.7584,
      "step": 85
    },
    {
      "epoch": 0.5296381832178599,
      "grad_norm": 0.3345021903514862,
      "learning_rate": 0.0001876814922041299,
      "loss": 0.7589,
      "step": 86
    },
    {
      "epoch": 0.535796766743649,
      "grad_norm": 0.2717898488044739,
      "learning_rate": 0.00018736223906463696,
      "loss": 0.7658,
      "step": 87
    },
    {
      "epoch": 0.541955350269438,
      "grad_norm": 0.30215781927108765,
      "learning_rate": 0.00018703918045762197,
      "loss": 0.7836,
      "step": 88
    },
    {
      "epoch": 0.5481139337952271,
      "grad_norm": 0.2765495479106903,
      "learning_rate": 0.0001867123304554009,
      "loss": 0.7556,
      "step": 89
    },
    {
      "epoch": 0.5542725173210161,
      "grad_norm": 0.31692612171173096,
      "learning_rate": 0.00018638170329544164,
      "loss": 0.7684,
      "step": 90
    },
    {
      "epoch": 0.5604311008468053,
      "grad_norm": 0.273282915353775,
      "learning_rate": 0.00018604731337974357,
      "loss": 0.7544,
      "step": 91
    },
    {
      "epoch": 0.5665896843725943,
      "grad_norm": 0.2583194971084595,
      "learning_rate": 0.00018570917527421048,
      "loss": 0.7469,
      "step": 92
    },
    {
      "epoch": 0.5727482678983834,
      "grad_norm": 0.287936270236969,
      "learning_rate": 0.00018536730370801585,
      "loss": 0.7417,
      "step": 93
    },
    {
      "epoch": 0.5789068514241724,
      "grad_norm": 0.31012046337127686,
      "learning_rate": 0.00018502171357296144,
      "loss": 0.7405,
      "step": 94
    },
    {
      "epoch": 0.5850654349499615,
      "grad_norm": 0.30453142523765564,
      "learning_rate": 0.00018467241992282843,
      "loss": 0.7688,
      "step": 95
    },
    {
      "epoch": 0.5912240184757506,
      "grad_norm": 0.2779741585254669,
      "learning_rate": 0.00018431943797272187,
      "loss": 0.7396,
      "step": 96
    },
    {
      "epoch": 0.5973826020015397,
      "grad_norm": 0.29110845923423767,
      "learning_rate": 0.00018396278309840779,
      "loss": 0.7556,
      "step": 97
    },
    {
      "epoch": 0.6035411855273287,
      "grad_norm": 0.315677285194397,
      "learning_rate": 0.00018360247083564342,
      "loss": 0.749,
      "step": 98
    },
    {
      "epoch": 0.6096997690531177,
      "grad_norm": 0.3191000521183014,
      "learning_rate": 0.00018323851687950055,
      "loss": 0.7548,
      "step": 99
    },
    {
      "epoch": 0.6158583525789069,
      "grad_norm": 0.28777754306793213,
      "learning_rate": 0.00018287093708368188,
      "loss": 0.7625,
      "step": 100
    },
    {
      "epoch": 0.6220169361046959,
      "grad_norm": 0.30834269523620605,
      "learning_rate": 0.00018249974745983023,
      "loss": 0.7605,
      "step": 101
    },
    {
      "epoch": 0.628175519630485,
      "grad_norm": 0.31846028566360474,
      "learning_rate": 0.00018212496417683137,
      "loss": 0.7715,
      "step": 102
    },
    {
      "epoch": 0.634334103156274,
      "grad_norm": 0.3188404142856598,
      "learning_rate": 0.00018174660356010943,
      "loss": 0.7661,
      "step": 103
    },
    {
      "epoch": 0.6404926866820632,
      "grad_norm": 0.2989709973335266,
      "learning_rate": 0.00018136468209091602,
      "loss": 0.728,
      "step": 104
    },
    {
      "epoch": 0.6466512702078522,
      "grad_norm": 0.31583070755004883,
      "learning_rate": 0.0001809792164056121,
      "loss": 0.7751,
      "step": 105
    },
    {
      "epoch": 0.6528098537336413,
      "grad_norm": 0.2666497528553009,
      "learning_rate": 0.0001805902232949435,
      "loss": 0.7407,
      "step": 106
    },
    {
      "epoch": 0.6589684372594303,
      "grad_norm": 0.32868844270706177,
      "learning_rate": 0.0001801977197033093,
      "loss": 0.7429,
      "step": 107
    },
    {
      "epoch": 0.6651270207852193,
      "grad_norm": 0.3197931945323944,
      "learning_rate": 0.000179801722728024,
      "loss": 0.746,
      "step": 108
    },
    {
      "epoch": 0.6712856043110085,
      "grad_norm": 0.29100263118743896,
      "learning_rate": 0.00017940224961857242,
      "loss": 0.7483,
      "step": 109
    },
    {
      "epoch": 0.6774441878367975,
      "grad_norm": 0.3278297185897827,
      "learning_rate": 0.00017899931777585882,
      "loss": 0.7619,
      "step": 110
    },
    {
      "epoch": 0.6836027713625866,
      "grad_norm": 0.3008161783218384,
      "learning_rate": 0.00017859294475144837,
      "loss": 0.7464,
      "step": 111
    },
    {
      "epoch": 0.6897613548883756,
      "grad_norm": 0.26044347882270813,
      "learning_rate": 0.000178183148246803,
      "loss": 0.7408,
      "step": 112
    },
    {
      "epoch": 0.6959199384141648,
      "grad_norm": 0.3036176562309265,
      "learning_rate": 0.00017776994611251015,
      "loss": 0.7614,
      "step": 113
    },
    {
      "epoch": 0.7020785219399538,
      "grad_norm": 0.3001931309700012,
      "learning_rate": 0.00017735335634750532,
      "loss": 0.7308,
      "step": 114
    },
    {
      "epoch": 0.7082371054657429,
      "grad_norm": 0.27744966745376587,
      "learning_rate": 0.00017693339709828792,
      "loss": 0.7456,
      "step": 115
    },
    {
      "epoch": 0.7143956889915319,
      "grad_norm": 0.2697376012802124,
      "learning_rate": 0.00017651008665813081,
      "loss": 0.7456,
      "step": 116
    },
    {
      "epoch": 0.7205542725173211,
      "grad_norm": 0.2659561336040497,
      "learning_rate": 0.0001760834434662837,
      "loss": 0.7262,
      "step": 117
    },
    {
      "epoch": 0.7267128560431101,
      "grad_norm": 0.29246559739112854,
      "learning_rate": 0.0001756534861071696,
      "loss": 0.7433,
      "step": 118
    },
    {
      "epoch": 0.7328714395688991,
      "grad_norm": 0.2959323525428772,
      "learning_rate": 0.00017522023330957548,
      "loss": 0.7512,
      "step": 119
    },
    {
      "epoch": 0.7390300230946882,
      "grad_norm": 0.26138290762901306,
      "learning_rate": 0.00017478370394583646,
      "loss": 0.7503,
      "step": 120
    },
    {
      "epoch": 0.7451886066204773,
      "grad_norm": 0.26741594076156616,
      "learning_rate": 0.00017434391703101363,
      "loss": 0.7582,
      "step": 121
    },
    {
      "epoch": 0.7513471901462664,
      "grad_norm": 0.28922754526138306,
      "learning_rate": 0.00017390089172206592,
      "loss": 0.7405,
      "step": 122
    },
    {
      "epoch": 0.7575057736720554,
      "grad_norm": 0.2668907344341278,
      "learning_rate": 0.00017345464731701547,
      "loss": 0.7381,
      "step": 123
    },
    {
      "epoch": 0.7636643571978445,
      "grad_norm": 0.27399054169654846,
      "learning_rate": 0.00017300520325410701,
      "loss": 0.7413,
      "step": 124
    },
    {
      "epoch": 0.7698229407236336,
      "grad_norm": 0.29532185196876526,
      "learning_rate": 0.0001725525791109614,
      "loss": 0.7709,
      "step": 125
    },
    {
      "epoch": 0.7759815242494227,
      "grad_norm": 0.3103969097137451,
      "learning_rate": 0.0001720967946037225,
      "loss": 0.7406,
      "step": 126
    },
    {
      "epoch": 0.7821401077752117,
      "grad_norm": 0.309108704328537,
      "learning_rate": 0.0001716378695861985,
      "loss": 0.7698,
      "step": 127
    },
    {
      "epoch": 0.7882986913010007,
      "grad_norm": 0.2896479070186615,
      "learning_rate": 0.00017117582404899712,
      "loss": 0.7417,
      "step": 128
    },
    {
      "epoch": 0.7944572748267898,
      "grad_norm": 0.28942593932151794,
      "learning_rate": 0.00017071067811865476,
      "loss": 0.7234,
      "step": 129
    },
    {
      "epoch": 0.8006158583525789,
      "grad_norm": 0.2783251702785492,
      "learning_rate": 0.00017024245205675986,
      "loss": 0.7441,
      "step": 130
    },
    {
      "epoch": 0.806774441878368,
      "grad_norm": 0.3195393979549408,
      "learning_rate": 0.00016977116625907024,
      "loss": 0.7407,
      "step": 131
    },
    {
      "epoch": 0.812933025404157,
      "grad_norm": 0.27995389699935913,
      "learning_rate": 0.0001692968412546247,
      "loss": 0.7616,
      "step": 132
    },
    {
      "epoch": 0.8190916089299461,
      "grad_norm": 0.26950138807296753,
      "learning_rate": 0.0001688194977048488,
      "loss": 0.7261,
      "step": 133
    },
    {
      "epoch": 0.8252501924557352,
      "grad_norm": 0.28609132766723633,
      "learning_rate": 0.00016833915640265484,
      "loss": 0.7596,
      "step": 134
    },
    {
      "epoch": 0.8314087759815243,
      "grad_norm": 0.29152774810791016,
      "learning_rate": 0.00016785583827153618,
      "loss": 0.7488,
      "step": 135
    },
    {
      "epoch": 0.8375673595073133,
      "grad_norm": 0.24516189098358154,
      "learning_rate": 0.00016736956436465573,
      "loss": 0.7213,
      "step": 136
    },
    {
      "epoch": 0.8437259430331023,
      "grad_norm": 0.2636944055557251,
      "learning_rate": 0.00016688035586392885,
      "loss": 0.7124,
      "step": 137
    },
    {
      "epoch": 0.8498845265588915,
      "grad_norm": 0.2662704288959503,
      "learning_rate": 0.00016638823407910084,
      "loss": 0.7208,
      "step": 138
    },
    {
      "epoch": 0.8560431100846805,
      "grad_norm": 0.28026285767555237,
      "learning_rate": 0.00016589322044681861,
      "loss": 0.7362,
      "step": 139
    },
    {
      "epoch": 0.8622016936104696,
      "grad_norm": 0.312272846698761,
      "learning_rate": 0.00016539533652969683,
      "loss": 0.7353,
      "step": 140
    },
    {
      "epoch": 0.8683602771362586,
      "grad_norm": 0.28158485889434814,
      "learning_rate": 0.00016489460401537874,
      "loss": 0.74,
      "step": 141
    },
    {
      "epoch": 0.8745188606620478,
      "grad_norm": 0.26379963755607605,
      "learning_rate": 0.00016439104471559156,
      "loss": 0.7342,
      "step": 142
    },
    {
      "epoch": 0.8806774441878368,
      "grad_norm": 0.3109482228755951,
      "learning_rate": 0.00016388468056519612,
      "loss": 0.7649,
      "step": 143
    },
    {
      "epoch": 0.8868360277136259,
      "grad_norm": 0.3082488477230072,
      "learning_rate": 0.00016337553362123165,
      "loss": 0.7502,
      "step": 144
    },
    {
      "epoch": 0.8929946112394149,
      "grad_norm": 0.27953189611434937,
      "learning_rate": 0.00016286362606195468,
      "loss": 0.7321,
      "step": 145
    },
    {
      "epoch": 0.8991531947652041,
      "grad_norm": 0.27005666494369507,
      "learning_rate": 0.00016234898018587337,
      "loss": 0.7447,
      "step": 146
    },
    {
      "epoch": 0.9053117782909931,
      "grad_norm": 0.29707372188568115,
      "learning_rate": 0.0001618316184107758,
      "loss": 0.727,
      "step": 147
    },
    {
      "epoch": 0.9114703618167821,
      "grad_norm": 0.3249203860759735,
      "learning_rate": 0.00016131156327275372,
      "loss": 0.7508,
      "step": 148
    },
    {
      "epoch": 0.9176289453425712,
      "grad_norm": 0.2724114954471588,
      "learning_rate": 0.00016078883742522075,
      "loss": 0.709,
      "step": 149
    },
    {
      "epoch": 0.9237875288683602,
      "grad_norm": 0.27933382987976074,
      "learning_rate": 0.00016026346363792567,
      "loss": 0.7318,
      "step": 150
    },
    {
      "epoch": 0.9299461123941494,
      "grad_norm": 0.27181556820869446,
      "learning_rate": 0.00015973546479596052,
      "loss": 0.7686,
      "step": 151
    },
    {
      "epoch": 0.9361046959199384,
      "grad_norm": 0.27142593264579773,
      "learning_rate": 0.00015920486389876383,
      "loss": 0.7485,
      "step": 152
    },
    {
      "epoch": 0.9422632794457275,
      "grad_norm": 0.26695144176483154,
      "learning_rate": 0.0001586716840591187,
      "loss": 0.7426,
      "step": 153
    },
    {
      "epoch": 0.9484218629715165,
      "grad_norm": 0.2876656949520111,
      "learning_rate": 0.000158135948502146,
      "loss": 0.7442,
      "step": 154
    },
    {
      "epoch": 0.9545804464973057,
      "grad_norm": 0.27336934208869934,
      "learning_rate": 0.00015759768056429274,
      "loss": 0.7353,
      "step": 155
    },
    {
      "epoch": 0.9607390300230947,
      "grad_norm": 0.2817447781562805,
      "learning_rate": 0.00015705690369231551,
      "loss": 0.7552,
      "step": 156
    },
    {
      "epoch": 0.9668976135488837,
      "grad_norm": 0.284213662147522,
      "learning_rate": 0.0001565136414422592,
      "loss": 0.7398,
      "step": 157
    },
    {
      "epoch": 0.9730561970746728,
      "grad_norm": 0.2847895622253418,
      "learning_rate": 0.0001559679174784308,
      "loss": 0.7364,
      "step": 158
    },
    {
      "epoch": 0.9792147806004619,
      "grad_norm": 0.2839486598968506,
      "learning_rate": 0.00015541975557236882,
      "loss": 0.754,
      "step": 159
    },
    {
      "epoch": 0.985373364126251,
      "grad_norm": 0.2721126973628998,
      "learning_rate": 0.0001548691796018074,
      "loss": 0.7448,
      "step": 160
    },
    {
      "epoch": 0.99153194765204,
      "grad_norm": 0.2735673785209656,
      "learning_rate": 0.00015431621354963668,
      "loss": 0.7308,
      "step": 161
    },
    {
      "epoch": 0.9976905311778291,
      "grad_norm": 0.31629157066345215,
      "learning_rate": 0.00015376088150285773,
      "loss": 0.7456,
      "step": 162
    }
  ],
  "logging_steps": 1,
  "max_steps": 486,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 162,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 9.588141062388449e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}