{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 174828,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0028599537831468643,
      "grad_norm": 8.294376373291016,
      "learning_rate": 4.985700231084266e-05,
      "loss": 6.665,
      "step": 500
    },
    {
      "epoch": 0.005719907566293729,
      "grad_norm": 8.308354377746582,
      "learning_rate": 4.971400462168532e-05,
      "loss": 5.1044,
      "step": 1000
    },
    {
      "epoch": 0.008579861349440594,
      "grad_norm": 7.017335891723633,
      "learning_rate": 4.9571006932527974e-05,
      "loss": 4.6249,
      "step": 1500
    },
    {
      "epoch": 0.011439815132587457,
      "grad_norm": 7.528384685516357,
      "learning_rate": 4.942800924337063e-05,
      "loss": 4.3456,
      "step": 2000
    },
    {
      "epoch": 0.014299768915734323,
      "grad_norm": 7.852795600891113,
      "learning_rate": 4.928501155421328e-05,
      "loss": 4.0772,
      "step": 2500
    },
    {
      "epoch": 0.017159722698881188,
      "grad_norm": 7.606760025024414,
      "learning_rate": 4.914201386505594e-05,
      "loss": 3.974,
      "step": 3000
    },
    {
      "epoch": 0.02001967648202805,
      "grad_norm": 7.45611572265625,
      "learning_rate": 4.8999016175898596e-05,
      "loss": 3.8115,
      "step": 3500
    },
    {
      "epoch": 0.022879630265174915,
      "grad_norm": 7.1126861572265625,
      "learning_rate": 4.885601848674126e-05,
      "loss": 3.7529,
      "step": 4000
    },
    {
      "epoch": 0.02573958404832178,
      "grad_norm": 7.052072525024414,
      "learning_rate": 4.871302079758392e-05,
      "loss": 3.654,
      "step": 4500
    },
    {
      "epoch": 0.028599537831468645,
      "grad_norm": 7.367290019989014,
      "learning_rate": 4.8570023108426574e-05,
      "loss": 3.5671,
      "step": 5000
    },
    {
      "epoch": 0.03145949161461551,
      "grad_norm": 8.534124374389648,
      "learning_rate": 4.842702541926923e-05,
      "loss": 3.5032,
      "step": 5500
    },
    {
      "epoch": 0.034319445397762376,
      "grad_norm": 7.498807907104492,
      "learning_rate": 4.828402773011189e-05,
      "loss": 3.5244,
      "step": 6000
    },
    {
      "epoch": 0.037179399180909234,
      "grad_norm": 6.6923298835754395,
      "learning_rate": 4.814103004095454e-05,
      "loss": 3.4814,
      "step": 6500
    },
    {
      "epoch": 0.0400393529640561,
      "grad_norm": 6.853496551513672,
      "learning_rate": 4.7998032351797196e-05,
      "loss": 3.3781,
      "step": 7000
    },
    {
      "epoch": 0.042899306747202964,
      "grad_norm": 7.179901599884033,
      "learning_rate": 4.785503466263985e-05,
      "loss": 3.3461,
      "step": 7500
    },
    {
      "epoch": 0.04575926053034983,
      "grad_norm": 7.566349506378174,
      "learning_rate": 4.771203697348251e-05,
      "loss": 3.3024,
      "step": 8000
    },
    {
      "epoch": 0.048619214313496695,
      "grad_norm": 7.144839763641357,
      "learning_rate": 4.756903928432517e-05,
      "loss": 3.2582,
      "step": 8500
    },
    {
      "epoch": 0.05147916809664356,
      "grad_norm": 6.929514408111572,
      "learning_rate": 4.7426041595167824e-05,
      "loss": 3.2371,
      "step": 9000
    },
    {
      "epoch": 0.054339121879790425,
      "grad_norm": 6.699374198913574,
      "learning_rate": 4.728304390601048e-05,
      "loss": 3.201,
      "step": 9500
    },
    {
      "epoch": 0.05719907566293729,
      "grad_norm": 5.757383346557617,
      "learning_rate": 4.714004621685314e-05,
      "loss": 3.1877,
      "step": 10000
    },
    {
      "epoch": 0.06005902944608415,
      "grad_norm": 7.66983699798584,
      "learning_rate": 4.6997048527695796e-05,
      "loss": 3.1514,
      "step": 10500
    },
    {
      "epoch": 0.06291898322923102,
      "grad_norm": 7.166614532470703,
      "learning_rate": 4.6854050838538446e-05,
      "loss": 3.1203,
      "step": 11000
    },
    {
      "epoch": 0.06577893701237789,
      "grad_norm": 8.80114459991455,
      "learning_rate": 4.671105314938111e-05,
      "loss": 3.1312,
      "step": 11500
    },
    {
      "epoch": 0.06863889079552475,
      "grad_norm": 29.41587257385254,
      "learning_rate": 4.656805546022377e-05,
      "loss": 3.1019,
      "step": 12000
    },
    {
      "epoch": 0.0714988445786716,
      "grad_norm": 6.1705145835876465,
      "learning_rate": 4.6425057771066424e-05,
      "loss": 3.0878,
      "step": 12500
    },
    {
      "epoch": 0.07435879836181847,
      "grad_norm": 7.218475341796875,
      "learning_rate": 4.628206008190908e-05,
      "loss": 3.0758,
      "step": 13000
    },
    {
      "epoch": 0.07721875214496533,
      "grad_norm": 6.435647964477539,
      "learning_rate": 4.613906239275174e-05,
      "loss": 3.0396,
      "step": 13500
    },
    {
      "epoch": 0.0800787059281122,
      "grad_norm": 7.471750736236572,
      "learning_rate": 4.5996064703594396e-05,
      "loss": 3.0454,
      "step": 14000
    },
    {
      "epoch": 0.08293865971125906,
      "grad_norm": 6.561801910400391,
      "learning_rate": 4.585306701443705e-05,
      "loss": 3.0095,
      "step": 14500
    },
    {
      "epoch": 0.08579861349440593,
      "grad_norm": 7.1273369789123535,
      "learning_rate": 4.57100693252797e-05,
      "loss": 3.0249,
      "step": 15000
    },
    {
      "epoch": 0.0886585672775528,
      "grad_norm": 6.540430545806885,
      "learning_rate": 4.556707163612236e-05,
      "loss": 3.0156,
      "step": 15500
    },
    {
      "epoch": 0.09151852106069966,
      "grad_norm": 6.394286632537842,
      "learning_rate": 4.542407394696502e-05,
      "loss": 2.9634,
      "step": 16000
    },
    {
      "epoch": 0.09437847484384652,
      "grad_norm": 7.856606960296631,
      "learning_rate": 4.5281076257807674e-05,
      "loss": 2.9866,
      "step": 16500
    },
    {
      "epoch": 0.09723842862699339,
      "grad_norm": 7.8352861404418945,
      "learning_rate": 4.513807856865033e-05,
      "loss": 2.9864,
      "step": 17000
    },
    {
      "epoch": 0.10009838241014025,
      "grad_norm": 6.253101348876953,
      "learning_rate": 4.499508087949299e-05,
      "loss": 2.9592,
      "step": 17500
    },
    {
      "epoch": 0.10295833619328712,
      "grad_norm": 6.485994815826416,
      "learning_rate": 4.4852083190335646e-05,
      "loss": 2.9405,
      "step": 18000
    },
    {
      "epoch": 0.10581828997643399,
      "grad_norm": 6.409724712371826,
      "learning_rate": 4.47090855011783e-05,
      "loss": 2.9567,
      "step": 18500
    },
    {
      "epoch": 0.10867824375958085,
      "grad_norm": 7.388598918914795,
      "learning_rate": 4.456608781202096e-05,
      "loss": 2.9123,
      "step": 19000
    },
    {
      "epoch": 0.11153819754272772,
      "grad_norm": 7.503371715545654,
      "learning_rate": 4.442309012286362e-05,
      "loss": 2.922,
      "step": 19500
    },
    {
      "epoch": 0.11439815132587458,
      "grad_norm": 6.702953338623047,
      "learning_rate": 4.4280092433706274e-05,
      "loss": 2.9321,
      "step": 20000
    },
    {
      "epoch": 0.11725810510902143,
      "grad_norm": 7.328106880187988,
      "learning_rate": 4.413709474454893e-05,
      "loss": 2.8965,
      "step": 20500
    },
    {
      "epoch": 0.1201180588921683,
      "grad_norm": 6.787193775177002,
      "learning_rate": 4.399409705539159e-05,
      "loss": 2.9204,
      "step": 21000
    },
    {
      "epoch": 0.12297801267531516,
      "grad_norm": 5.832542896270752,
      "learning_rate": 4.3851099366234246e-05,
      "loss": 2.8724,
      "step": 21500
    },
    {
      "epoch": 0.12583796645846204,
      "grad_norm": 6.784033298492432,
      "learning_rate": 4.37081016770769e-05,
      "loss": 2.889,
      "step": 22000
    },
    {
      "epoch": 0.1286979202416089,
      "grad_norm": 7.457705497741699,
      "learning_rate": 4.356510398791956e-05,
      "loss": 2.8845,
      "step": 22500
    },
    {
      "epoch": 0.13155787402475577,
      "grad_norm": 7.377457141876221,
      "learning_rate": 4.342210629876222e-05,
      "loss": 2.876,
      "step": 23000
    },
    {
      "epoch": 0.13441782780790262,
      "grad_norm": 6.810230731964111,
      "learning_rate": 4.327910860960487e-05,
      "loss": 2.8881,
      "step": 23500
    },
    {
      "epoch": 0.1372777815910495,
      "grad_norm": 6.137091636657715,
      "learning_rate": 4.3136110920447525e-05,
      "loss": 2.8599,
      "step": 24000
    },
    {
      "epoch": 0.14013773537419635,
      "grad_norm": 27.535808563232422,
      "learning_rate": 4.299311323129018e-05,
      "loss": 2.8432,
      "step": 24500
    },
    {
      "epoch": 0.1429976891573432,
      "grad_norm": 6.044827461242676,
      "learning_rate": 4.285011554213284e-05,
      "loss": 2.8644,
      "step": 25000
    },
    {
      "epoch": 0.14585764294049008,
      "grad_norm": 6.300295829772949,
      "learning_rate": 4.2707117852975496e-05,
      "loss": 2.8269,
      "step": 25500
    },
    {
      "epoch": 0.14871759672363694,
      "grad_norm": 5.811293125152588,
      "learning_rate": 4.256412016381815e-05,
      "loss": 2.8308,
      "step": 26000
    },
    {
      "epoch": 0.15157755050678381,
      "grad_norm": 6.52765417098999,
      "learning_rate": 4.242112247466081e-05,
      "loss": 2.8309,
      "step": 26500
    },
    {
      "epoch": 0.15443750428993067,
      "grad_norm": 6.731512546539307,
      "learning_rate": 4.227812478550347e-05,
      "loss": 2.8066,
      "step": 27000
    },
    {
      "epoch": 0.15729745807307755,
      "grad_norm": 6.837157249450684,
      "learning_rate": 4.2135127096346125e-05,
      "loss": 2.8282,
      "step": 27500
    },
    {
      "epoch": 0.1601574118562244,
      "grad_norm": 5.657121181488037,
      "learning_rate": 4.199212940718878e-05,
      "loss": 2.7904,
      "step": 28000
    },
    {
      "epoch": 0.16301736563937128,
      "grad_norm": 8.501928329467773,
      "learning_rate": 4.184913171803144e-05,
      "loss": 2.8224,
      "step": 28500
    },
    {
      "epoch": 0.16587731942251813,
      "grad_norm": 6.447242736816406,
      "learning_rate": 4.1706134028874096e-05,
      "loss": 2.7731,
      "step": 29000
    },
    {
      "epoch": 0.168737273205665,
      "grad_norm": 6.049993991851807,
      "learning_rate": 4.156313633971675e-05,
      "loss": 2.7726,
      "step": 29500
    },
    {
      "epoch": 0.17159722698881186,
      "grad_norm": 5.747082710266113,
      "learning_rate": 4.142013865055941e-05,
      "loss": 2.799,
      "step": 30000
    },
    {
      "epoch": 0.17445718077195874,
      "grad_norm": 6.7925615310668945,
      "learning_rate": 4.127714096140207e-05,
      "loss": 2.7704,
      "step": 30500
    },
    {
      "epoch": 0.1773171345551056,
      "grad_norm": 7.164943218231201,
      "learning_rate": 4.1134143272244725e-05,
      "loss": 2.7611,
      "step": 31000
    },
    {
      "epoch": 0.18017708833825247,
      "grad_norm": 6.813632011413574,
      "learning_rate": 4.099114558308738e-05,
      "loss": 2.7637,
      "step": 31500
    },
    {
      "epoch": 0.18303704212139932,
      "grad_norm": 5.981168270111084,
      "learning_rate": 4.084814789393003e-05,
      "loss": 2.7495,
      "step": 32000
    },
    {
      "epoch": 0.18589699590454617,
      "grad_norm": 6.125492095947266,
      "learning_rate": 4.070515020477269e-05,
      "loss": 2.7592,
      "step": 32500
    },
    {
      "epoch": 0.18875694968769305,
      "grad_norm": 44.21103286743164,
      "learning_rate": 4.0562152515615347e-05,
      "loss": 2.7711,
      "step": 33000
    },
    {
      "epoch": 0.1916169034708399,
      "grad_norm": 5.714451789855957,
      "learning_rate": 4.0419154826458004e-05,
      "loss": 2.7456,
      "step": 33500
    },
    {
      "epoch": 0.19447685725398678,
      "grad_norm": 5.732424736022949,
      "learning_rate": 4.027615713730066e-05,
      "loss": 2.7412,
      "step": 34000
    },
    {
      "epoch": 0.19733681103713363,
      "grad_norm": 7.989277362823486,
      "learning_rate": 4.013315944814332e-05,
      "loss": 2.7429,
      "step": 34500
    },
    {
      "epoch": 0.2001967648202805,
      "grad_norm": 6.200708389282227,
      "learning_rate": 3.9990161758985975e-05,
      "loss": 2.7497,
      "step": 35000
    },
    {
      "epoch": 0.20305671860342736,
      "grad_norm": 6.867748260498047,
      "learning_rate": 3.984716406982863e-05,
      "loss": 2.7387,
      "step": 35500
    },
    {
      "epoch": 0.20591667238657424,
      "grad_norm": 5.795921325683594,
      "learning_rate": 3.970416638067129e-05,
      "loss": 2.7262,
      "step": 36000
    },
    {
      "epoch": 0.2087766261697211,
      "grad_norm": 6.110116958618164,
      "learning_rate": 3.9561168691513947e-05,
      "loss": 2.7416,
      "step": 36500
    },
    {
      "epoch": 0.21163657995286797,
      "grad_norm": 6.253924369812012,
      "learning_rate": 3.9418171002356604e-05,
      "loss": 2.7215,
      "step": 37000
    },
    {
      "epoch": 0.21449653373601482,
      "grad_norm": 6.117007732391357,
      "learning_rate": 3.927517331319926e-05,
      "loss": 2.7231,
      "step": 37500
    },
    {
      "epoch": 0.2173564875191617,
      "grad_norm": 8.227131843566895,
      "learning_rate": 3.913217562404192e-05,
      "loss": 2.71,
      "step": 38000
    },
    {
      "epoch": 0.22021644130230855,
      "grad_norm": 6.146326541900635,
      "learning_rate": 3.8989177934884575e-05,
      "loss": 2.7076,
      "step": 38500
    },
    {
      "epoch": 0.22307639508545543,
      "grad_norm": 6.7277398109436035,
      "learning_rate": 3.884618024572723e-05,
      "loss": 2.7226,
      "step": 39000
    },
    {
      "epoch": 0.22593634886860228,
      "grad_norm": 6.300662994384766,
      "learning_rate": 3.870318255656989e-05,
      "loss": 2.7025,
      "step": 39500
    },
    {
      "epoch": 0.22879630265174916,
      "grad_norm": 5.755123138427734,
      "learning_rate": 3.8560184867412547e-05,
      "loss": 2.7025,
      "step": 40000
    },
    {
      "epoch": 0.231656256434896,
      "grad_norm": 6.393768310546875,
      "learning_rate": 3.8417187178255204e-05,
      "loss": 2.7113,
      "step": 40500
    },
    {
      "epoch": 0.23451621021804286,
      "grad_norm": 5.855433464050293,
      "learning_rate": 3.8274189489097854e-05,
      "loss": 2.721,
      "step": 41000
    },
    {
      "epoch": 0.23737616400118974,
      "grad_norm": 4.719547271728516,
      "learning_rate": 3.813119179994051e-05,
      "loss": 2.6774,
      "step": 41500
    },
    {
      "epoch": 0.2402361177843366,
      "grad_norm": 5.75437068939209,
      "learning_rate": 3.798819411078317e-05,
      "loss": 2.6922,
      "step": 42000
    },
    {
      "epoch": 0.24309607156748347,
      "grad_norm": 6.258277416229248,
      "learning_rate": 3.7845196421625825e-05,
      "loss": 2.701,
      "step": 42500
    },
    {
      "epoch": 0.24595602535063033,
      "grad_norm": 5.8440165519714355,
      "learning_rate": 3.770219873246848e-05,
      "loss": 2.697,
      "step": 43000
    },
    {
      "epoch": 0.2488159791337772,
      "grad_norm": 5.4940009117126465,
      "learning_rate": 3.755920104331114e-05,
      "loss": 2.6826,
      "step": 43500
    },
    {
      "epoch": 0.2516759329169241,
      "grad_norm": 8.00302791595459,
      "learning_rate": 3.7416203354153804e-05,
      "loss": 2.6782,
      "step": 44000
    },
    {
      "epoch": 0.25453588670007093,
      "grad_norm": 6.31597375869751,
      "learning_rate": 3.727320566499646e-05,
      "loss": 2.7113,
      "step": 44500
    },
    {
      "epoch": 0.2573958404832178,
      "grad_norm": 6.734432697296143,
      "learning_rate": 3.713020797583911e-05,
      "loss": 2.6883,
      "step": 45000
    },
    {
      "epoch": 0.26025579426636464,
      "grad_norm": 8.607872009277344,
      "learning_rate": 3.698721028668177e-05,
      "loss": 2.675,
      "step": 45500
    },
    {
      "epoch": 0.26311574804951154,
      "grad_norm": 6.785426139831543,
      "learning_rate": 3.6844212597524425e-05,
      "loss": 2.6662,
      "step": 46000
    },
    {
      "epoch": 0.2659757018326584,
      "grad_norm": 5.7255072593688965,
      "learning_rate": 3.670121490836708e-05,
      "loss": 2.6566,
      "step": 46500
    },
    {
      "epoch": 0.26883565561580525,
      "grad_norm": 5.778408527374268,
      "learning_rate": 3.655821721920974e-05,
      "loss": 2.6869,
      "step": 47000
    },
    {
      "epoch": 0.2716956093989521,
      "grad_norm": 7.3644490242004395,
      "learning_rate": 3.64152195300524e-05,
      "loss": 2.6548,
      "step": 47500
    },
    {
      "epoch": 0.274555563182099,
      "grad_norm": 9.922218322753906,
      "learning_rate": 3.6272221840895054e-05,
      "loss": 2.6548,
      "step": 48000
    },
    {
      "epoch": 0.27741551696524586,
      "grad_norm": 6.6563944816589355,
      "learning_rate": 3.612922415173771e-05,
      "loss": 2.6466,
      "step": 48500
    },
    {
      "epoch": 0.2802754707483927,
      "grad_norm": 5.308610439300537,
      "learning_rate": 3.598622646258037e-05,
      "loss": 2.6744,
      "step": 49000
    },
    {
      "epoch": 0.28313542453153956,
      "grad_norm": 6.213603973388672,
      "learning_rate": 3.584322877342302e-05,
      "loss": 2.6484,
      "step": 49500
    },
    {
      "epoch": 0.2859953783146864,
      "grad_norm": 5.715392589569092,
      "learning_rate": 3.5700231084265676e-05,
      "loss": 2.6573,
      "step": 50000
    },
    {
      "epoch": 0.2888553320978333,
      "grad_norm": 6.067576885223389,
      "learning_rate": 3.555723339510833e-05,
      "loss": 2.6487,
      "step": 50500
    },
    {
      "epoch": 0.29171528588098017,
      "grad_norm": 6.300750255584717,
      "learning_rate": 3.541423570595099e-05,
      "loss": 2.6445,
      "step": 51000
    },
    {
      "epoch": 0.294575239664127,
      "grad_norm": 6.036895275115967,
      "learning_rate": 3.5271238016793654e-05,
      "loss": 2.6756,
      "step": 51500
    },
    {
      "epoch": 0.29743519344727387,
      "grad_norm": 5.856159687042236,
      "learning_rate": 3.512824032763631e-05,
      "loss": 2.6415,
      "step": 52000
    },
    {
      "epoch": 0.3002951472304208,
      "grad_norm": 12.173583984375,
      "learning_rate": 3.498524263847897e-05,
      "loss": 2.6386,
      "step": 52500
    },
    {
      "epoch": 0.30315510101356763,
      "grad_norm": 5.493927478790283,
      "learning_rate": 3.4842244949321625e-05,
      "loss": 2.6515,
      "step": 53000
    },
    {
      "epoch": 0.3060150547967145,
      "grad_norm": 5.786694526672363,
      "learning_rate": 3.4699247260164276e-05,
      "loss": 2.6449,
      "step": 53500
    },
    {
      "epoch": 0.30887500857986133,
      "grad_norm": 5.755667686462402,
      "learning_rate": 3.455624957100693e-05,
      "loss": 2.6357,
      "step": 54000
    },
    {
      "epoch": 0.31173496236300824,
      "grad_norm": 5.9297027587890625,
      "learning_rate": 3.441325188184959e-05,
      "loss": 2.6493,
      "step": 54500
    },
    {
      "epoch": 0.3145949161461551,
      "grad_norm": 6.182466983795166,
      "learning_rate": 3.427025419269225e-05,
      "loss": 2.6298,
      "step": 55000
    },
    {
      "epoch": 0.31745486992930194,
      "grad_norm": 6.565801620483398,
      "learning_rate": 3.4127256503534904e-05,
      "loss": 2.6611,
      "step": 55500
    },
    {
      "epoch": 0.3203148237124488,
      "grad_norm": 5.94129753112793,
      "learning_rate": 3.398425881437756e-05,
      "loss": 2.6201,
      "step": 56000
    },
    {
      "epoch": 0.32317477749559564,
      "grad_norm": 6.72519063949585,
      "learning_rate": 3.384126112522022e-05,
      "loss": 2.5961,
      "step": 56500
    },
    {
      "epoch": 0.32603473127874255,
      "grad_norm": 6.440931797027588,
      "learning_rate": 3.3698263436062876e-05,
      "loss": 2.6322,
      "step": 57000
    },
    {
      "epoch": 0.3288946850618894,
      "grad_norm": 6.059328079223633,
      "learning_rate": 3.355526574690553e-05,
      "loss": 2.615,
      "step": 57500
    },
    {
      "epoch": 0.33175463884503625,
      "grad_norm": 6.007944107055664,
      "learning_rate": 3.341226805774818e-05,
      "loss": 2.5992,
      "step": 58000
    },
    {
      "epoch": 0.3346145926281831,
      "grad_norm": 6.9386982917785645,
      "learning_rate": 3.326927036859084e-05,
      "loss": 2.6317,
      "step": 58500
    },
    {
      "epoch": 0.33747454641133,
      "grad_norm": 5.493308067321777,
      "learning_rate": 3.3126272679433504e-05,
      "loss": 2.5975,
      "step": 59000
    },
    {
      "epoch": 0.34033450019447686,
      "grad_norm": 7.026157855987549,
      "learning_rate": 3.298327499027616e-05,
      "loss": 2.6139,
      "step": 59500
    },
    {
      "epoch": 0.3431944539776237,
      "grad_norm": 5.790646553039551,
      "learning_rate": 3.284027730111882e-05,
      "loss": 2.5916,
      "step": 60000
    },
    {
      "epoch": 0.34605440776077057,
      "grad_norm": 5.980741024017334,
      "learning_rate": 3.2697279611961476e-05,
      "loss": 2.5811,
      "step": 60500
    },
    {
      "epoch": 0.3489143615439175,
      "grad_norm": 6.555883407592773,
      "learning_rate": 3.255428192280413e-05,
      "loss": 2.5909,
      "step": 61000
    },
    {
      "epoch": 0.3517743153270643,
      "grad_norm": 5.8480706214904785,
      "learning_rate": 3.241128423364679e-05,
      "loss": 2.6127,
      "step": 61500
    },
    {
      "epoch": 0.3546342691102112,
      "grad_norm": 6.341095924377441,
      "learning_rate": 3.226828654448944e-05,
      "loss": 2.5959,
      "step": 62000
    },
    {
      "epoch": 0.357494222893358,
      "grad_norm": 5.832342147827148,
      "learning_rate": 3.21252888553321e-05,
      "loss": 2.5946,
      "step": 62500
    },
    {
      "epoch": 0.36035417667650493,
      "grad_norm": 6.495291709899902,
      "learning_rate": 3.1982291166174755e-05,
      "loss": 2.6122,
      "step": 63000
    },
    {
      "epoch": 0.3632141304596518,
      "grad_norm": 6.527446746826172,
      "learning_rate": 3.183929347701741e-05,
      "loss": 2.573,
      "step": 63500
    },
    {
      "epoch": 0.36607408424279864,
      "grad_norm": 6.4324951171875,
      "learning_rate": 3.169629578786007e-05,
      "loss": 2.6119,
      "step": 64000
    },
    {
      "epoch": 0.3689340380259455,
      "grad_norm": 7.166018009185791,
      "learning_rate": 3.1553298098702726e-05,
      "loss": 2.6124,
      "step": 64500
    },
    {
      "epoch": 0.37179399180909234,
      "grad_norm": 6.462119102478027,
      "learning_rate": 3.141030040954538e-05,
      "loss": 2.5552,
      "step": 65000
    },
    {
      "epoch": 0.37465394559223925,
      "grad_norm": 6.0564703941345215,
      "learning_rate": 3.126730272038804e-05,
      "loss": 2.5672,
      "step": 65500
    },
    {
      "epoch": 0.3775138993753861,
      "grad_norm": 5.307662487030029,
      "learning_rate": 3.11243050312307e-05,
      "loss": 2.5611,
      "step": 66000
    },
    {
      "epoch": 0.38037385315853295,
      "grad_norm": 5.18694543838501,
      "learning_rate": 3.0981307342073355e-05,
      "loss": 2.5691,
      "step": 66500
    },
    {
      "epoch": 0.3832338069416798,
      "grad_norm": 5.568657398223877,
      "learning_rate": 3.083830965291601e-05,
      "loss": 2.575,
      "step": 67000
    },
    {
      "epoch": 0.3860937607248267,
      "grad_norm": 10.616528511047363,
      "learning_rate": 3.069531196375867e-05,
      "loss": 2.5886,
      "step": 67500
    },
    {
      "epoch": 0.38895371450797356,
      "grad_norm": 6.7568206787109375,
      "learning_rate": 3.0552314274601326e-05,
      "loss": 2.5822,
      "step": 68000
    },
    {
      "epoch": 0.3918136682911204,
      "grad_norm": 6.087740421295166,
      "learning_rate": 3.040931658544398e-05,
      "loss": 2.5472,
      "step": 68500
    },
    {
      "epoch": 0.39467362207426726,
      "grad_norm": 6.702504634857178,
      "learning_rate": 3.0266318896286637e-05,
      "loss": 2.5897,
      "step": 69000
    },
    {
      "epoch": 0.39753357585741417,
      "grad_norm": 6.2178053855896,
      "learning_rate": 3.0123321207129297e-05,
      "loss": 2.5698,
      "step": 69500
    },
    {
      "epoch": 0.400393529640561,
      "grad_norm": 6.559543609619141,
      "learning_rate": 2.9980323517971955e-05,
      "loss": 2.5725,
      "step": 70000
    },
    {
      "epoch": 0.40325348342370787,
      "grad_norm": 5.918066501617432,
      "learning_rate": 2.9837325828814605e-05,
      "loss": 2.5847,
      "step": 70500
    },
    {
      "epoch": 0.4061134372068547,
      "grad_norm": 5.602575778961182,
      "learning_rate": 2.9694328139657262e-05,
      "loss": 2.583,
      "step": 71000
    },
    {
      "epoch": 0.40897339099000163,
      "grad_norm": 5.304308891296387,
      "learning_rate": 2.955133045049992e-05,
      "loss": 2.5632,
      "step": 71500
    },
    {
      "epoch": 0.4118333447731485,
      "grad_norm": 5.540666103363037,
      "learning_rate": 2.9408332761342576e-05,
      "loss": 2.5756,
      "step": 72000
    },
    {
      "epoch": 0.41469329855629533,
      "grad_norm": 6.2000861167907715,
      "learning_rate": 2.9265335072185234e-05,
      "loss": 2.5357,
      "step": 72500
    },
    {
      "epoch": 0.4175532523394422,
      "grad_norm": 5.1564459800720215,
      "learning_rate": 2.912233738302789e-05,
      "loss": 2.5516,
      "step": 73000
    },
    {
      "epoch": 0.42041320612258903,
      "grad_norm": 6.008329391479492,
      "learning_rate": 2.897933969387055e-05,
      "loss": 2.5738,
      "step": 73500
    },
    {
      "epoch": 0.42327315990573594,
      "grad_norm": 6.52450704574585,
      "learning_rate": 2.883634200471321e-05,
      "loss": 2.578,
      "step": 74000
    },
    {
      "epoch": 0.4261331136888828,
      "grad_norm": 5.788220405578613,
      "learning_rate": 2.8693344315555866e-05,
      "loss": 2.5578,
      "step": 74500
    },
    {
      "epoch": 0.42899306747202964,
      "grad_norm": 5.5810112953186035,
      "learning_rate": 2.8550346626398516e-05,
      "loss": 2.5643,
      "step": 75000
    },
    {
      "epoch": 0.4318530212551765,
      "grad_norm": 5.334226608276367,
      "learning_rate": 2.8407348937241173e-05,
      "loss": 2.5376,
      "step": 75500
    },
    {
      "epoch": 0.4347129750383234,
      "grad_norm": 5.804100513458252,
      "learning_rate": 2.826435124808383e-05,
      "loss": 2.541,
      "step": 76000
    },
    {
      "epoch": 0.43757292882147025,
      "grad_norm": 5.555410385131836,
      "learning_rate": 2.8121353558926487e-05,
      "loss": 2.5364,
      "step": 76500
    },
    {
      "epoch": 0.4404328826046171,
      "grad_norm": 5.454427719116211,
      "learning_rate": 2.7978355869769148e-05,
      "loss": 2.5602,
      "step": 77000
    },
    {
      "epoch": 0.44329283638776396,
      "grad_norm": 16.772747039794922,
      "learning_rate": 2.7835358180611805e-05,
      "loss": 2.5674,
      "step": 77500
    },
    {
      "epoch": 0.44615279017091086,
      "grad_norm": 8.047761917114258,
      "learning_rate": 2.7692360491454462e-05,
      "loss": 2.5334,
      "step": 78000
    },
    {
      "epoch": 0.4490127439540577,
      "grad_norm": 6.612277507781982,
      "learning_rate": 2.754936280229712e-05,
      "loss": 2.5525,
      "step": 78500
    },
    {
      "epoch": 0.45187269773720457,
      "grad_norm": 6.439370632171631,
      "learning_rate": 2.740636511313977e-05,
      "loss": 2.5349,
      "step": 79000
    },
    {
      "epoch": 0.4547326515203514,
      "grad_norm": 6.890873908996582,
      "learning_rate": 2.7263367423982427e-05,
      "loss": 2.5145,
      "step": 79500
    },
    {
      "epoch": 0.4575926053034983,
      "grad_norm": 5.4768500328063965,
      "learning_rate": 2.7120369734825084e-05,
      "loss": 2.5277,
      "step": 80000
    },
    {
      "epoch": 0.4604525590866452,
      "grad_norm": 5.825018405914307,
      "learning_rate": 2.697737204566774e-05,
      "loss": 2.5505,
      "step": 80500
    },
    {
      "epoch": 0.463312512869792,
      "grad_norm": 6.583479881286621,
      "learning_rate": 2.68343743565104e-05,
      "loss": 2.5562,
      "step": 81000
    },
    {
      "epoch": 0.4661724666529389,
      "grad_norm": 6.420114040374756,
      "learning_rate": 2.669137666735306e-05,
      "loss": 2.5094,
      "step": 81500
    },
    {
      "epoch": 0.46903242043608573,
      "grad_norm": 6.8168110847473145,
      "learning_rate": 2.6548378978195716e-05,
      "loss": 2.5347,
      "step": 82000
    },
    {
      "epoch": 0.47189237421923264,
      "grad_norm": 6.224096298217773,
      "learning_rate": 2.6405381289038373e-05,
      "loss": 2.5154,
      "step": 82500
    },
    {
      "epoch": 0.4747523280023795,
      "grad_norm": 6.240240097045898,
      "learning_rate": 2.626238359988103e-05,
      "loss": 2.535,
      "step": 83000
    },
    {
      "epoch": 0.47761228178552634,
      "grad_norm": 6.053983211517334,
      "learning_rate": 2.611938591072368e-05,
      "loss": 2.5275,
      "step": 83500
    },
    {
      "epoch": 0.4804722355686732,
      "grad_norm": 5.546879768371582,
      "learning_rate": 2.5976388221566338e-05,
      "loss": 2.5329,
      "step": 84000
    },
    {
      "epoch": 0.4833321893518201,
      "grad_norm": 6.190423011779785,
      "learning_rate": 2.5833390532408995e-05,
      "loss": 2.5174,
      "step": 84500
    },
    {
      "epoch": 0.48619214313496695,
      "grad_norm": 5.437402248382568,
      "learning_rate": 2.5690392843251655e-05,
      "loss": 2.49,
      "step": 85000
    },
    {
      "epoch": 0.4890520969181138,
      "grad_norm": 6.8163557052612305,
      "learning_rate": 2.5547395154094312e-05,
      "loss": 2.524,
      "step": 85500
    },
    {
      "epoch": 0.49191205070126065,
      "grad_norm": 6.754604816436768,
      "learning_rate": 2.540439746493697e-05,
      "loss": 2.5041,
      "step": 86000
    },
    {
      "epoch": 0.49477200448440756,
      "grad_norm": 5.496472358703613,
      "learning_rate": 2.5261399775779627e-05,
      "loss": 2.5277,
      "step": 86500
    },
    {
      "epoch": 0.4976319582675544,
      "grad_norm": 5.616280555725098,
      "learning_rate": 2.5118402086622284e-05,
      "loss": 2.5061,
      "step": 87000
    },
    {
      "epoch": 0.5004919120507013,
      "grad_norm": 6.141283988952637,
      "learning_rate": 2.4975404397464938e-05,
      "loss": 2.5214,
      "step": 87500
    },
    {
      "epoch": 0.5033518658338482,
      "grad_norm": 6.124631404876709,
      "learning_rate": 2.4832406708307595e-05,
      "loss": 2.4854,
      "step": 88000
    },
    {
      "epoch": 0.506211819616995,
      "grad_norm": 6.740499496459961,
      "learning_rate": 2.4689409019150252e-05,
      "loss": 2.5054,
      "step": 88500
    },
    {
      "epoch": 0.5090717734001419,
      "grad_norm": 6.040327548980713,
      "learning_rate": 2.454641132999291e-05,
      "loss": 2.5042,
      "step": 89000
    },
    {
      "epoch": 0.5119317271832887,
      "grad_norm": 5.564330577850342,
      "learning_rate": 2.4403413640835566e-05,
      "loss": 2.5021,
      "step": 89500
    },
    {
      "epoch": 0.5147916809664356,
      "grad_norm": 6.915059566497803,
      "learning_rate": 2.4260415951678223e-05,
      "loss": 2.5227,
      "step": 90000
    },
    {
      "epoch": 0.5176516347495824,
      "grad_norm": 6.181910991668701,
      "learning_rate": 2.411741826252088e-05,
      "loss": 2.5098,
      "step": 90500
    },
    {
      "epoch": 0.5205115885327293,
      "grad_norm": 5.829164505004883,
      "learning_rate": 2.3974420573363534e-05,
      "loss": 2.5133,
      "step": 91000
    },
    {
      "epoch": 0.5233715423158761,
      "grad_norm": 14.621573448181152,
      "learning_rate": 2.383142288420619e-05,
      "loss": 2.503,
      "step": 91500
    },
    {
      "epoch": 0.5262314960990231,
      "grad_norm": 6.3930511474609375,
      "learning_rate": 2.368842519504885e-05,
      "loss": 2.5124,
      "step": 92000
    },
    {
      "epoch": 0.5290914498821699,
      "grad_norm": 5.840575695037842,
      "learning_rate": 2.3545427505891506e-05,
      "loss": 2.5177,
      "step": 92500
    },
    {
      "epoch": 0.5319514036653168,
      "grad_norm": 6.612518787384033,
      "learning_rate": 2.3402429816734163e-05,
      "loss": 2.4881,
      "step": 93000
    },
    {
      "epoch": 0.5348113574484636,
      "grad_norm": 6.505732536315918,
      "learning_rate": 2.325943212757682e-05,
      "loss": 2.4872,
      "step": 93500
    },
    {
      "epoch": 0.5376713112316105,
      "grad_norm": 7.19988489151001,
      "learning_rate": 2.3116434438419477e-05,
      "loss": 2.4958,
      "step": 94000
    },
    {
      "epoch": 0.5405312650147573,
      "grad_norm": 5.988187789916992,
      "learning_rate": 2.2973436749262134e-05,
      "loss": 2.5094,
      "step": 94500
    },
    {
      "epoch": 0.5433912187979042,
      "grad_norm": 5.709506511688232,
      "learning_rate": 2.2830439060104788e-05,
      "loss": 2.4882,
      "step": 95000
    },
    {
      "epoch": 0.546251172581051,
      "grad_norm": 5.567132949829102,
      "learning_rate": 2.2687441370947445e-05,
      "loss": 2.4909,
      "step": 95500
    },
    {
      "epoch": 0.549111126364198,
      "grad_norm": 11.825920104980469,
      "learning_rate": 2.2544443681790102e-05,
      "loss": 2.4944,
      "step": 96000
    },
    {
      "epoch": 0.5519710801473449,
      "grad_norm": 5.969587802886963,
      "learning_rate": 2.240144599263276e-05,
      "loss": 2.4912,
      "step": 96500
    },
    {
      "epoch": 0.5548310339304917,
      "grad_norm": 6.31153678894043,
      "learning_rate": 2.225844830347542e-05,
      "loss": 2.4901,
      "step": 97000
    },
    {
      "epoch": 0.5576909877136386,
      "grad_norm": 7.130558013916016,
      "learning_rate": 2.2115450614318074e-05,
      "loss": 2.4768,
      "step": 97500
    },
    {
      "epoch": 0.5605509414967854,
      "grad_norm": 5.947187900543213,
      "learning_rate": 2.197245292516073e-05,
      "loss": 2.4971,
      "step": 98000
    },
    {
      "epoch": 0.5634108952799323,
      "grad_norm": 6.830575466156006,
      "learning_rate": 2.1829455236003388e-05,
      "loss": 2.4901,
      "step": 98500
    },
    {
      "epoch": 0.5662708490630791,
      "grad_norm": 5.682921409606934,
      "learning_rate": 2.1686457546846045e-05,
      "loss": 2.4946,
      "step": 99000
    },
    {
      "epoch": 0.569130802846226,
      "grad_norm": 5.174154758453369,
      "learning_rate": 2.15434598576887e-05,
      "loss": 2.4813,
      "step": 99500
    },
    {
      "epoch": 0.5719907566293728,
      "grad_norm": 5.400365352630615,
      "learning_rate": 2.1400462168531356e-05,
      "loss": 2.4498,
      "step": 100000
    },
    {
      "epoch": 0.5748507104125198,
      "grad_norm": 5.433869361877441,
      "learning_rate": 2.1257464479374013e-05,
      "loss": 2.523,
      "step": 100500
    },
    {
      "epoch": 0.5777106641956666,
      "grad_norm": 6.321377754211426,
      "learning_rate": 2.1114466790216674e-05,
      "loss": 2.4731,
      "step": 101000
    },
    {
      "epoch": 0.5805706179788135,
      "grad_norm": 6.643988609313965,
      "learning_rate": 2.0971469101059327e-05,
      "loss": 2.4837,
      "step": 101500
    },
    {
      "epoch": 0.5834305717619603,
      "grad_norm": 6.258885383605957,
      "learning_rate": 2.0828471411901985e-05,
      "loss": 2.4735,
      "step": 102000
    },
    {
      "epoch": 0.5862905255451072,
      "grad_norm": 5.747689723968506,
      "learning_rate": 2.068547372274464e-05,
      "loss": 2.4742,
      "step": 102500
    },
    {
      "epoch": 0.589150479328254,
      "grad_norm": 6.016144275665283,
      "learning_rate": 2.05424760335873e-05,
      "loss": 2.4633,
      "step": 103000
    },
    {
      "epoch": 0.5920104331114009,
      "grad_norm": 5.250337600708008,
      "learning_rate": 2.0399478344429953e-05,
      "loss": 2.467,
      "step": 103500
    },
    {
      "epoch": 0.5948703868945477,
      "grad_norm": 5.667397975921631,
      "learning_rate": 2.025648065527261e-05,
      "loss": 2.4709,
      "step": 104000
    },
    {
      "epoch": 0.5977303406776946,
      "grad_norm": 6.414941310882568,
      "learning_rate": 2.0113482966115267e-05,
      "loss": 2.4805,
      "step": 104500
    },
    {
      "epoch": 0.6005902944608416,
      "grad_norm": 6.118762493133545,
      "learning_rate": 1.9970485276957927e-05,
      "loss": 2.46,
      "step": 105000
    },
    {
      "epoch": 0.6034502482439884,
      "grad_norm": 7.456865310668945,
      "learning_rate": 1.9827487587800584e-05,
      "loss": 2.4863,
      "step": 105500
    },
    {
      "epoch": 0.6063102020271353,
      "grad_norm": 7.2666096687316895,
      "learning_rate": 1.9684489898643238e-05,
      "loss": 2.431,
      "step": 106000
    },
    {
      "epoch": 0.6091701558102821,
      "grad_norm": 6.135725975036621,
      "learning_rate": 1.9541492209485895e-05,
      "loss": 2.4833,
      "step": 106500
    },
    {
      "epoch": 0.612030109593429,
      "grad_norm": 6.930655002593994,
      "learning_rate": 1.9398494520328553e-05,
      "loss": 2.4791,
      "step": 107000
    },
    {
      "epoch": 0.6148900633765758,
      "grad_norm": 5.848691940307617,
      "learning_rate": 1.925549683117121e-05,
      "loss": 2.4744,
      "step": 107500
    },
    {
      "epoch": 0.6177500171597227,
      "grad_norm": 6.593609809875488,
      "learning_rate": 1.9112499142013863e-05,
      "loss": 2.4818,
      "step": 108000
    },
    {
      "epoch": 0.6206099709428695,
      "grad_norm": 5.148362636566162,
      "learning_rate": 1.8969501452856524e-05,
      "loss": 2.4863,
      "step": 108500
    },
    {
      "epoch": 0.6234699247260165,
      "grad_norm": 6.264626979827881,
      "learning_rate": 1.882650376369918e-05,
      "loss": 2.4896,
      "step": 109000
    },
    {
      "epoch": 0.6263298785091633,
      "grad_norm": 7.046905040740967,
      "learning_rate": 1.8683506074541838e-05,
      "loss": 2.4746,
      "step": 109500
    },
    {
      "epoch": 0.6291898322923102,
      "grad_norm": 6.274538993835449,
      "learning_rate": 1.8540508385384492e-05,
      "loss": 2.4395,
      "step": 110000
    },
    {
      "epoch": 0.632049786075457,
      "grad_norm": 5.889391899108887,
      "learning_rate": 1.839751069622715e-05,
      "loss": 2.4307,
      "step": 110500
    },
    {
      "epoch": 0.6349097398586039,
      "grad_norm": 5.6989030838012695,
      "learning_rate": 1.8254513007069806e-05,
      "loss": 2.4297,
      "step": 111000
    },
    {
      "epoch": 0.6377696936417507,
      "grad_norm": 6.275044918060303,
      "learning_rate": 1.8111515317912463e-05,
      "loss": 2.4504,
      "step": 111500
    },
    {
      "epoch": 0.6406296474248976,
      "grad_norm": 6.444321155548096,
      "learning_rate": 1.7968517628755117e-05,
      "loss": 2.4286,
      "step": 112000
    },
    {
      "epoch": 0.6434896012080444,
      "grad_norm": 6.624863147735596,
      "learning_rate": 1.7825519939597778e-05,
      "loss": 2.463,
      "step": 112500
    },
    {
      "epoch": 0.6463495549911913,
      "grad_norm": 7.994183540344238,
      "learning_rate": 1.7682522250440435e-05,
      "loss": 2.4362,
      "step": 113000
    },
    {
      "epoch": 0.6492095087743383,
      "grad_norm": 5.6794257164001465,
      "learning_rate": 1.7539524561283092e-05,
      "loss": 2.4355,
      "step": 113500
    },
    {
      "epoch": 0.6520694625574851,
      "grad_norm": 5.606757164001465,
      "learning_rate": 1.739652687212575e-05,
      "loss": 2.4525,
      "step": 114000
    },
    {
      "epoch": 0.654929416340632,
      "grad_norm": 6.253554344177246,
      "learning_rate": 1.7253529182968403e-05,
      "loss": 2.4511,
      "step": 114500
    },
    {
      "epoch": 0.6577893701237788,
      "grad_norm": 6.014497756958008,
      "learning_rate": 1.711053149381106e-05,
      "loss": 2.4571,
      "step": 115000
    },
    {
      "epoch": 0.6606493239069257,
      "grad_norm": 6.601302146911621,
      "learning_rate": 1.6967533804653717e-05,
      "loss": 2.4505,
      "step": 115500
    },
    {
      "epoch": 0.6635092776900725,
      "grad_norm": 7.215948104858398,
      "learning_rate": 1.6824536115496374e-05,
      "loss": 2.4351,
      "step": 116000
    },
    {
      "epoch": 0.6663692314732194,
      "grad_norm": 5.974714279174805,
      "learning_rate": 1.668153842633903e-05,
      "loss": 2.4435,
      "step": 116500
    },
    {
      "epoch": 0.6692291852563662,
      "grad_norm": 6.903178691864014,
      "learning_rate": 1.653854073718169e-05,
      "loss": 2.4388,
      "step": 117000
    },
    {
      "epoch": 0.6720891390395132,
      "grad_norm": 6.214517116546631,
      "learning_rate": 1.6395543048024346e-05,
      "loss": 2.4405,
      "step": 117500
    },
    {
      "epoch": 0.67494909282266,
      "grad_norm": 6.263461589813232,
      "learning_rate": 1.6252545358867003e-05,
      "loss": 2.4496,
      "step": 118000
    },
    {
      "epoch": 0.6778090466058069,
      "grad_norm": 8.066364288330078,
      "learning_rate": 1.6109547669709657e-05,
      "loss": 2.4368,
      "step": 118500
    },
    {
      "epoch": 0.6806690003889537,
      "grad_norm": 5.834959506988525,
      "learning_rate": 1.5966549980552314e-05,
      "loss": 2.4481,
      "step": 119000
    },
    {
      "epoch": 0.6835289541721006,
      "grad_norm": 6.710206031799316,
      "learning_rate": 1.582355229139497e-05,
      "loss": 2.4325,
      "step": 119500
    },
    {
      "epoch": 0.6863889079552474,
      "grad_norm": 5.984834671020508,
      "learning_rate": 1.5680554602237628e-05,
      "loss": 2.4454,
      "step": 120000
    },
    {
      "epoch": 0.6892488617383943,
      "grad_norm": 5.370354652404785,
      "learning_rate": 1.5537556913080285e-05,
      "loss": 2.4279,
      "step": 120500
    },
    {
      "epoch": 0.6921088155215411,
      "grad_norm": 6.09434175491333,
      "learning_rate": 1.5394559223922942e-05,
      "loss": 2.4314,
      "step": 121000
    },
    {
      "epoch": 0.694968769304688,
      "grad_norm": 6.878710746765137,
      "learning_rate": 1.52515615347656e-05,
      "loss": 2.4191,
      "step": 121500
    },
    {
      "epoch": 0.697828723087835,
      "grad_norm": 5.660272121429443,
      "learning_rate": 1.5108563845608257e-05,
      "loss": 2.433,
      "step": 122000
    },
    {
      "epoch": 0.7006886768709818,
      "grad_norm": 6.489835739135742,
      "learning_rate": 1.4965566156450914e-05,
      "loss": 2.4491,
      "step": 122500
    },
    {
      "epoch": 0.7035486306541286,
      "grad_norm": 5.600217819213867,
      "learning_rate": 1.4822568467293567e-05,
      "loss": 2.4235,
      "step": 123000
    },
    {
      "epoch": 0.7064085844372755,
      "grad_norm": 5.281232833862305,
      "learning_rate": 1.4679570778136226e-05,
      "loss": 2.4219,
      "step": 123500
    },
    {
      "epoch": 0.7092685382204224,
      "grad_norm": 5.651204586029053,
      "learning_rate": 1.4536573088978883e-05,
      "loss": 2.448,
      "step": 124000
    },
    {
      "epoch": 0.7121284920035692,
      "grad_norm": 5.520606994628906,
      "learning_rate": 1.439357539982154e-05,
      "loss": 2.4118,
      "step": 124500
    },
    {
      "epoch": 0.714988445786716,
      "grad_norm": 6.359561920166016,
      "learning_rate": 1.4250577710664196e-05,
      "loss": 2.4502,
      "step": 125000
    },
    {
      "epoch": 0.7178483995698629,
      "grad_norm": 6.264361381530762,
      "learning_rate": 1.4107580021506853e-05,
      "loss": 2.4214,
      "step": 125500
    },
    {
      "epoch": 0.7207083533530099,
      "grad_norm": 15.211498260498047,
      "learning_rate": 1.396458233234951e-05,
      "loss": 2.4476,
      "step": 126000
    },
    {
      "epoch": 0.7235683071361567,
      "grad_norm": 6.165014266967773,
      "learning_rate": 1.3821584643192167e-05,
      "loss": 2.4255,
      "step": 126500
    },
    {
      "epoch": 0.7264282609193036,
      "grad_norm": 5.279512882232666,
      "learning_rate": 1.3678586954034823e-05,
      "loss": 2.4458,
      "step": 127000
    },
    {
      "epoch": 0.7292882147024504,
      "grad_norm": 6.13384485244751,
      "learning_rate": 1.353558926487748e-05,
      "loss": 2.4022,
      "step": 127500
    },
    {
      "epoch": 0.7321481684855973,
      "grad_norm": 5.577615261077881,
      "learning_rate": 1.3392591575720137e-05,
      "loss": 2.4174,
      "step": 128000
    },
    {
      "epoch": 0.7350081222687441,
      "grad_norm": 5.860058784484863,
      "learning_rate": 1.3249593886562794e-05,
      "loss": 2.4043,
      "step": 128500
    },
    {
      "epoch": 0.737868076051891,
      "grad_norm": 6.8798065185546875,
      "learning_rate": 1.3106596197405451e-05,
      "loss": 2.3858,
      "step": 129000
    },
    {
      "epoch": 0.7407280298350378,
      "grad_norm": 7.996329307556152,
      "learning_rate": 1.2963598508248107e-05,
      "loss": 2.3993,
      "step": 129500
    },
    {
      "epoch": 0.7435879836181847,
      "grad_norm": 6.488850116729736,
      "learning_rate": 1.2820600819090764e-05,
      "loss": 2.4204,
      "step": 130000
    },
    {
      "epoch": 0.7464479374013316,
      "grad_norm": 5.177313804626465,
      "learning_rate": 1.2677603129933421e-05,
      "loss": 2.433,
      "step": 130500
    },
    {
      "epoch": 0.7493078911844785,
      "grad_norm": 6.9536895751953125,
      "learning_rate": 1.2534605440776078e-05,
      "loss": 2.4145,
      "step": 131000
    },
    {
      "epoch": 0.7521678449676253,
      "grad_norm": 5.639203071594238,
      "learning_rate": 1.2391607751618735e-05,
      "loss": 2.3906,
      "step": 131500
    },
    {
      "epoch": 0.7550277987507722,
      "grad_norm": 5.76200532913208,
      "learning_rate": 1.2248610062461391e-05,
      "loss": 2.4065,
      "step": 132000
    },
    {
      "epoch": 0.757887752533919,
      "grad_norm": 7.033239364624023,
      "learning_rate": 1.2105612373304048e-05,
      "loss": 2.4045,
      "step": 132500
    },
    {
      "epoch": 0.7607477063170659,
      "grad_norm": 6.319807529449463,
      "learning_rate": 1.1962614684146704e-05,
      "loss": 2.3646,
      "step": 133000
    },
    {
      "epoch": 0.7636076601002127,
      "grad_norm": 6.506091117858887,
      "learning_rate": 1.1819616994989362e-05,
      "loss": 2.4247,
      "step": 133500
    },
    {
      "epoch": 0.7664676138833596,
      "grad_norm": 6.245853424072266,
      "learning_rate": 1.1676619305832018e-05,
      "loss": 2.3998,
      "step": 134000
    },
    {
      "epoch": 0.7693275676665066,
      "grad_norm": 6.403684616088867,
      "learning_rate": 1.1533621616674675e-05,
      "loss": 2.4072,
      "step": 134500
    },
    {
      "epoch": 0.7721875214496534,
      "grad_norm": 6.385560035705566,
      "learning_rate": 1.1390623927517332e-05,
      "loss": 2.4078,
      "step": 135000
    },
    {
      "epoch": 0.7750474752328003,
      "grad_norm": 6.857175350189209,
      "learning_rate": 1.124762623835999e-05,
      "loss": 2.4167,
      "step": 135500
    },
    {
      "epoch": 0.7779074290159471,
      "grad_norm": 5.734222888946533,
      "learning_rate": 1.1104628549202645e-05,
      "loss": 2.411,
      "step": 136000
    },
    {
      "epoch": 0.780767382799094,
      "grad_norm": 6.311659812927246,
      "learning_rate": 1.0961630860045302e-05,
      "loss": 2.4232,
      "step": 136500
    },
    {
      "epoch": 0.7836273365822408,
      "grad_norm": 6.344162940979004,
      "learning_rate": 1.0818633170887959e-05,
      "loss": 2.3997,
      "step": 137000
    },
    {
      "epoch": 0.7864872903653877,
      "grad_norm": 5.971358776092529,
      "learning_rate": 1.0675635481730616e-05,
      "loss": 2.4181,
      "step": 137500
    },
    {
      "epoch": 0.7893472441485345,
      "grad_norm": 5.663905620574951,
      "learning_rate": 1.0532637792573273e-05,
      "loss": 2.3939,
      "step": 138000
    },
    {
      "epoch": 0.7922071979316814,
      "grad_norm": 5.739428520202637,
      "learning_rate": 1.0389640103415929e-05,
      "loss": 2.3803,
      "step": 138500
    },
    {
      "epoch": 0.7950671517148283,
      "grad_norm": 6.558109760284424,
      "learning_rate": 1.0246642414258586e-05,
      "loss": 2.3794,
      "step": 139000
    },
    {
      "epoch": 0.7979271054979752,
      "grad_norm": 7.577678203582764,
      "learning_rate": 1.0103644725101243e-05,
      "loss": 2.4035,
      "step": 139500
    },
    {
      "epoch": 0.800787059281122,
      "grad_norm": 6.890414237976074,
      "learning_rate": 9.9606470359439e-06,
      "loss": 2.3791,
      "step": 140000
    },
    {
      "epoch": 0.8036470130642689,
      "grad_norm": 6.212318420410156,
      "learning_rate": 9.817649346786556e-06,
      "loss": 2.363,
      "step": 140500
    },
    {
      "epoch": 0.8065069668474157,
      "grad_norm": 6.501023292541504,
      "learning_rate": 9.674651657629213e-06,
      "loss": 2.3794,
      "step": 141000
    },
    {
      "epoch": 0.8093669206305626,
      "grad_norm": 6.136830806732178,
      "learning_rate": 9.53165396847187e-06,
      "loss": 2.3835,
      "step": 141500
    },
    {
      "epoch": 0.8122268744137094,
      "grad_norm": 6.386491298675537,
      "learning_rate": 9.388656279314527e-06,
      "loss": 2.3836,
      "step": 142000
    },
    {
      "epoch": 0.8150868281968563,
      "grad_norm": 6.060532093048096,
      "learning_rate": 9.245658590157182e-06,
      "loss": 2.3714,
      "step": 142500
    },
    {
      "epoch": 0.8179467819800033,
      "grad_norm": 6.481443405151367,
      "learning_rate": 9.10266090099984e-06,
      "loss": 2.3842,
      "step": 143000
    },
    {
      "epoch": 0.8208067357631501,
      "grad_norm": 6.378634929656982,
      "learning_rate": 8.959663211842497e-06,
      "loss": 2.4011,
      "step": 143500
    },
    {
      "epoch": 0.823666689546297,
      "grad_norm": 7.321898937225342,
      "learning_rate": 8.816665522685154e-06,
      "loss": 2.3874,
      "step": 144000
    },
    {
      "epoch": 0.8265266433294438,
      "grad_norm": 5.878232479095459,
      "learning_rate": 8.673667833527811e-06,
      "loss": 2.3747,
      "step": 144500
    },
    {
      "epoch": 0.8293865971125907,
      "grad_norm": 6.182088375091553,
      "learning_rate": 8.530670144370468e-06,
      "loss": 2.3928,
      "step": 145000
    },
    {
      "epoch": 0.8322465508957375,
      "grad_norm": 6.2058258056640625,
      "learning_rate": 8.387672455213125e-06,
      "loss": 2.3784,
      "step": 145500
    },
    {
      "epoch": 0.8351065046788844,
      "grad_norm": 6.231584072113037,
      "learning_rate": 8.24467476605578e-06,
      "loss": 2.3715,
      "step": 146000
    },
    {
      "epoch": 0.8379664584620312,
      "grad_norm": 6.14652156829834,
      "learning_rate": 8.101677076898438e-06,
      "loss": 2.3789,
      "step": 146500
    },
    {
      "epoch": 0.8408264122451781,
      "grad_norm": 6.431158065795898,
      "learning_rate": 7.958679387741095e-06,
      "loss": 2.3792,
      "step": 147000
    },
    {
      "epoch": 0.843686366028325,
      "grad_norm": 5.822235584259033,
      "learning_rate": 7.815681698583752e-06,
      "loss": 2.4062,
      "step": 147500
    },
    {
      "epoch": 0.8465463198114719,
      "grad_norm": 5.64607048034668,
      "learning_rate": 7.672684009426408e-06,
      "loss": 2.368,
      "step": 148000
    },
    {
      "epoch": 0.8494062735946187,
      "grad_norm": 6.182931900024414,
      "learning_rate": 7.5296863202690655e-06,
      "loss": 2.3877,
      "step": 148500
    },
    {
      "epoch": 0.8522662273777656,
      "grad_norm": 6.151760578155518,
      "learning_rate": 7.386688631111721e-06,
      "loss": 2.3915,
      "step": 149000
    },
    {
      "epoch": 0.8551261811609124,
      "grad_norm": 6.303664684295654,
      "learning_rate": 7.243690941954379e-06,
      "loss": 2.3565,
      "step": 149500
    },
    {
      "epoch": 0.8579861349440593,
      "grad_norm": 6.381216526031494,
      "learning_rate": 7.100693252797034e-06,
      "loss": 2.3697,
      "step": 150000
    },
    {
      "epoch": 0.8608460887272061,
      "grad_norm": 5.706302165985107,
      "learning_rate": 6.957695563639692e-06,
      "loss": 2.4026,
      "step": 150500
    },
    {
      "epoch": 0.863706042510353,
      "grad_norm": 7.22359561920166,
      "learning_rate": 6.814697874482348e-06,
      "loss": 2.3759,
      "step": 151000
    },
    {
      "epoch": 0.8665659962935,
      "grad_norm": 5.458381652832031,
      "learning_rate": 6.671700185325006e-06,
      "loss": 2.3836,
      "step": 151500
    },
    {
      "epoch": 0.8694259500766468,
      "grad_norm": 5.785479545593262,
      "learning_rate": 6.528702496167661e-06,
      "loss": 2.3655,
      "step": 152000
    },
    {
      "epoch": 0.8722859038597937,
      "grad_norm": 5.856048583984375,
      "learning_rate": 6.385704807010319e-06,
      "loss": 2.3669,
      "step": 152500
    },
    {
      "epoch": 0.8751458576429405,
      "grad_norm": 5.491500377655029,
      "learning_rate": 6.2427071178529756e-06,
      "loss": 2.4154,
      "step": 153000
    },
    {
      "epoch": 0.8780058114260874,
      "grad_norm": 5.936758518218994,
      "learning_rate": 6.099709428695633e-06,
      "loss": 2.3702,
      "step": 153500
    },
    {
      "epoch": 0.8808657652092342,
      "grad_norm": 7.138918399810791,
      "learning_rate": 5.956711739538289e-06,
      "loss": 2.3582,
      "step": 154000
    },
    {
      "epoch": 0.8837257189923811,
      "grad_norm": 6.457569122314453,
      "learning_rate": 5.813714050380946e-06,
      "loss": 2.381,
      "step": 154500
    },
    {
      "epoch": 0.8865856727755279,
      "grad_norm": 6.026115894317627,
      "learning_rate": 5.6707163612236024e-06,
      "loss": 2.385,
      "step": 155000
    },
    {
      "epoch": 0.8894456265586748,
      "grad_norm": 6.851065158843994,
      "learning_rate": 5.52771867206626e-06,
      "loss": 2.3664,
      "step": 155500
    },
    {
      "epoch": 0.8923055803418217,
      "grad_norm": 6.16819953918457,
      "learning_rate": 5.384720982908916e-06,
      "loss": 2.3814,
      "step": 156000
    },
    {
      "epoch": 0.8951655341249686,
      "grad_norm": 5.917440891265869,
      "learning_rate": 5.241723293751574e-06,
      "loss": 2.3701,
      "step": 156500
    },
    {
      "epoch": 0.8980254879081154,
      "grad_norm": 10.217552185058594,
      "learning_rate": 5.09872560459423e-06,
      "loss": 2.3516,
      "step": 157000
    },
    {
      "epoch": 0.9008854416912623,
      "grad_norm": 7.088205814361572,
      "learning_rate": 4.955727915436887e-06,
      "loss": 2.3936,
      "step": 157500
    },
    {
      "epoch": 0.9037453954744091,
      "grad_norm": 6.357458591461182,
      "learning_rate": 4.812730226279544e-06,
      "loss": 2.3672,
      "step": 158000
    },
    {
      "epoch": 0.906605349257556,
      "grad_norm": 6.871440887451172,
      "learning_rate": 4.669732537122201e-06,
      "loss": 2.3691,
      "step": 158500
    },
    {
      "epoch": 0.9094653030407028,
      "grad_norm": 6.192137718200684,
      "learning_rate": 4.526734847964857e-06,
      "loss": 2.3608,
      "step": 159000
    },
    {
      "epoch": 0.9123252568238497,
      "grad_norm": 6.265544414520264,
      "learning_rate": 4.383737158807514e-06,
      "loss": 2.3682,
      "step": 159500
    },
    {
      "epoch": 0.9151852106069966,
      "grad_norm": 5.907118320465088,
      "learning_rate": 4.2407394696501705e-06,
      "loss": 2.3423,
      "step": 160000
    },
    {
      "epoch": 0.9180451643901435,
      "grad_norm": 6.204267501831055,
      "learning_rate": 4.097741780492828e-06,
      "loss": 2.3605,
      "step": 160500
    },
    {
      "epoch": 0.9209051181732903,
      "grad_norm": 6.978556156158447,
      "learning_rate": 3.954744091335484e-06,
      "loss": 2.3594,
      "step": 161000
    },
    {
      "epoch": 0.9237650719564372,
      "grad_norm": 6.3842082023620605,
      "learning_rate": 3.811746402178141e-06,
      "loss": 2.3677,
      "step": 161500
    },
    {
      "epoch": 0.926625025739584,
      "grad_norm": 6.20996618270874,
      "learning_rate": 3.6687487130207977e-06,
      "loss": 2.3538,
      "step": 162000
    },
    {
      "epoch": 0.9294849795227309,
      "grad_norm": 6.184482574462891,
      "learning_rate": 3.5257510238634545e-06,
      "loss": 2.3787,
      "step": 162500
    },
    {
      "epoch": 0.9323449333058778,
      "grad_norm": 6.219623565673828,
      "learning_rate": 3.382753334706111e-06,
      "loss": 2.3774,
      "step": 163000
    },
    {
      "epoch": 0.9352048870890246,
      "grad_norm": 6.634711742401123,
      "learning_rate": 3.239755645548768e-06,
      "loss": 2.3671,
      "step": 163500
    },
    {
      "epoch": 0.9380648408721715,
      "grad_norm": 7.119485855102539,
      "learning_rate": 3.096757956391425e-06,
      "loss": 2.356,
      "step": 164000
    },
    {
      "epoch": 0.9409247946553184,
      "grad_norm": 6.833123207092285,
      "learning_rate": 2.9537602672340818e-06,
      "loss": 2.3451,
      "step": 164500
    },
    {
      "epoch": 0.9437847484384653,
      "grad_norm": 6.631540298461914,
      "learning_rate": 2.8107625780767385e-06,
      "loss": 2.3324,
      "step": 165000
    },
    {
      "epoch": 0.9466447022216121,
      "grad_norm": 6.187737941741943,
      "learning_rate": 2.667764888919395e-06,
      "loss": 2.3573,
      "step": 165500
    },
    {
      "epoch": 0.949504656004759,
      "grad_norm": 5.523457050323486,
      "learning_rate": 2.524767199762052e-06,
      "loss": 2.3468,
      "step": 166000
    },
    {
      "epoch": 0.9523646097879058,
      "grad_norm": 6.898806095123291,
      "learning_rate": 2.381769510604709e-06,
      "loss": 2.3534,
      "step": 166500
    },
    {
      "epoch": 0.9552245635710527,
      "grad_norm": 6.348108291625977,
      "learning_rate": 2.2387718214473658e-06,
      "loss": 2.3588,
      "step": 167000
    },
    {
      "epoch": 0.9580845173541995,
      "grad_norm": 6.188412189483643,
      "learning_rate": 2.0957741322900225e-06,
      "loss": 2.3607,
      "step": 167500
    },
    {
      "epoch": 0.9609444711373464,
      "grad_norm": 6.769163608551025,
      "learning_rate": 1.952776443132679e-06,
      "loss": 2.3721,
      "step": 168000
    },
    {
      "epoch": 0.9638044249204932,
      "grad_norm": 6.389153957366943,
      "learning_rate": 1.8097787539753357e-06,
      "loss": 2.381,
      "step": 168500
    },
    {
      "epoch": 0.9666643787036402,
      "grad_norm": 5.625518798828125,
      "learning_rate": 1.6667810648179926e-06,
      "loss": 2.3656,
      "step": 169000
    },
    {
      "epoch": 0.969524332486787,
      "grad_norm": 6.03477144241333,
      "learning_rate": 1.5237833756606493e-06,
      "loss": 2.3796,
      "step": 169500
    },
    {
      "epoch": 0.9723842862699339,
      "grad_norm": 6.034476280212402,
      "learning_rate": 1.3807856865033063e-06,
      "loss": 2.3407,
      "step": 170000
    },
    {
      "epoch": 0.9752442400530807,
      "grad_norm": 6.318973541259766,
      "learning_rate": 1.237787997345963e-06,
      "loss": 2.3537,
      "step": 170500
    },
    {
      "epoch": 0.9781041938362276,
      "grad_norm": 6.3570237159729,
      "learning_rate": 1.0947903081886197e-06,
      "loss": 2.3744,
      "step": 171000
    },
    {
      "epoch": 0.9809641476193744,
      "grad_norm": 5.440378189086914,
      "learning_rate": 9.517926190312765e-07,
      "loss": 2.3775,
      "step": 171500
    },
    {
      "epoch": 0.9838241014025213,
      "grad_norm": 7.5823655128479,
      "learning_rate": 8.087949298739332e-07,
      "loss": 2.3301,
      "step": 172000
    },
    {
      "epoch": 0.9866840551856682,
      "grad_norm": 6.07295560836792,
      "learning_rate": 6.6579724071659e-07,
      "loss": 2.3347,
      "step": 172500
    },
    {
      "epoch": 0.9895440089688151,
      "grad_norm": 7.158942222595215,
      "learning_rate": 5.227995515592468e-07,
      "loss": 2.3567,
      "step": 173000
    },
    {
      "epoch": 0.992403962751962,
      "grad_norm": 6.406834125518799,
      "learning_rate": 3.798018624019036e-07,
      "loss": 2.3204,
      "step": 173500
    },
    {
      "epoch": 0.9952639165351088,
      "grad_norm": 5.863027572631836,
      "learning_rate": 2.3680417324456038e-07,
      "loss": 2.3569,
      "step": 174000
    },
    {
      "epoch": 0.9981238703182557,
      "grad_norm": 6.552116394042969,
      "learning_rate": 9.380648408721716e-08,
      "loss": 2.3332,
      "step": 174500
    },
    {
      "epoch": 1.0,
      "step": 174828,
      "total_flos": 1.8427441878551347e+17,
      "train_loss": 1.5726176189089465,
      "train_runtime": 27622.4465,
      "train_samples_per_second": 25.317,
      "train_steps_per_second": 6.329
    }
  ],
  "logging_steps": 500,
  "max_steps": 174828,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8427441878551347e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}