|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9989310529128808, |
|
"eval_steps": 500, |
|
"global_step": 1870, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0010689470871191875, |
|
"grad_norm": 56.09147115973681, |
|
"learning_rate": 1.0695187165775401e-07, |
|
"loss": 1.554, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.010689470871191877, |
|
"grad_norm": 416.2698691138927, |
|
"learning_rate": 1.0695187165775401e-06, |
|
"loss": 1.5405, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.021378941742383754, |
|
"grad_norm": 667.8842682438211, |
|
"learning_rate": 2.1390374331550802e-06, |
|
"loss": 1.4648, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.032068412613575625, |
|
"grad_norm": 2.9422589389154457, |
|
"learning_rate": 3.2085561497326205e-06, |
|
"loss": 1.3258, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04275788348476751, |
|
"grad_norm": 52.87526964166794, |
|
"learning_rate": 4.2780748663101604e-06, |
|
"loss": 1.2952, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05344735435595938, |
|
"grad_norm": 2.8923791457722072, |
|
"learning_rate": 5.347593582887702e-06, |
|
"loss": 1.2588, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06413682522715125, |
|
"grad_norm": 2.656284320194568, |
|
"learning_rate": 6.417112299465241e-06, |
|
"loss": 1.2247, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07482629609834313, |
|
"grad_norm": 4.4735196432994915, |
|
"learning_rate": 7.486631016042781e-06, |
|
"loss": 1.1873, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08551576696953501, |
|
"grad_norm": 2.828665120905719, |
|
"learning_rate": 8.556149732620321e-06, |
|
"loss": 1.1654, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09620523784072689, |
|
"grad_norm": 27.50225041150734, |
|
"learning_rate": 9.625668449197861e-06, |
|
"loss": 1.1473, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.10689470871191876, |
|
"grad_norm": 1.1135780282016265, |
|
"learning_rate": 1.0695187165775403e-05, |
|
"loss": 1.1594, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11758417958311064, |
|
"grad_norm": 1.5471458694252325, |
|
"learning_rate": 1.1764705882352942e-05, |
|
"loss": 1.1471, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1282736504543025, |
|
"grad_norm": 2.9925400086464946, |
|
"learning_rate": 1.2834224598930482e-05, |
|
"loss": 1.1446, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13896312132549438, |
|
"grad_norm": 1.3582655233383645, |
|
"learning_rate": 1.3903743315508022e-05, |
|
"loss": 1.1231, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14965259219668625, |
|
"grad_norm": 4.301858568361503, |
|
"learning_rate": 1.4973262032085563e-05, |
|
"loss": 1.121, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.16034206306787813, |
|
"grad_norm": 1.7516618532476456, |
|
"learning_rate": 1.60427807486631e-05, |
|
"loss": 1.1375, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.17103153393907003, |
|
"grad_norm": 2.05276098281325, |
|
"learning_rate": 1.7112299465240642e-05, |
|
"loss": 1.1317, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1817210048102619, |
|
"grad_norm": 1.936189800352967, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 1.1261, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.19241047568145378, |
|
"grad_norm": 2.8285205229802317, |
|
"learning_rate": 1.9251336898395722e-05, |
|
"loss": 1.1194, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.20309994655264565, |
|
"grad_norm": 1.2316486726277835, |
|
"learning_rate": 1.99998432011431e-05, |
|
"loss": 1.1127, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.21378941742383753, |
|
"grad_norm": 1.1894590702071168, |
|
"learning_rate": 1.9997055802697737e-05, |
|
"loss": 1.1251, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2244788882950294, |
|
"grad_norm": 0.9254235249331663, |
|
"learning_rate": 1.99907851031346e-05, |
|
"loss": 1.1281, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.23516835916622128, |
|
"grad_norm": 0.8840386606822348, |
|
"learning_rate": 1.9981033287370443e-05, |
|
"loss": 1.114, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.24585783003741316, |
|
"grad_norm": 1.2054591384545625, |
|
"learning_rate": 1.9967803753256737e-05, |
|
"loss": 1.1071, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.256547300908605, |
|
"grad_norm": 1.1840624995699496, |
|
"learning_rate": 1.995110111039574e-05, |
|
"loss": 1.1194, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2672367717797969, |
|
"grad_norm": 1.0998971951669942, |
|
"learning_rate": 1.9930931178534353e-05, |
|
"loss": 1.1207, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.27792624265098875, |
|
"grad_norm": 1.0298199126134084, |
|
"learning_rate": 1.9907300985536334e-05, |
|
"loss": 1.111, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.28861571352218063, |
|
"grad_norm": 1.1181488992425574, |
|
"learning_rate": 1.988021876493357e-05, |
|
"loss": 1.1111, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2993051843933725, |
|
"grad_norm": 1.379189172394145, |
|
"learning_rate": 1.9849693953057235e-05, |
|
"loss": 1.1203, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3099946552645644, |
|
"grad_norm": 1.3727219764638852, |
|
"learning_rate": 1.9815737185749855e-05, |
|
"loss": 1.1124, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.32068412613575625, |
|
"grad_norm": 0.9557824074089444, |
|
"learning_rate": 1.9778360294659463e-05, |
|
"loss": 1.1075, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.33137359700694813, |
|
"grad_norm": 1.1644352715278528, |
|
"learning_rate": 1.9737576303117025e-05, |
|
"loss": 1.119, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.34206306787814006, |
|
"grad_norm": 1.0990077051440315, |
|
"learning_rate": 1.969339942159873e-05, |
|
"loss": 1.1029, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.35275253874933193, |
|
"grad_norm": 1.0573662138914466, |
|
"learning_rate": 1.9645845042774555e-05, |
|
"loss": 1.0974, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.3634420096205238, |
|
"grad_norm": 2.3340677840799313, |
|
"learning_rate": 1.9594929736144978e-05, |
|
"loss": 1.1153, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3741314804917157, |
|
"grad_norm": 2.3221634783528686, |
|
"learning_rate": 1.9540671242267616e-05, |
|
"loss": 1.107, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.38482095136290756, |
|
"grad_norm": 0.9109466500529232, |
|
"learning_rate": 1.9483088466575848e-05, |
|
"loss": 1.1043, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.39551042223409943, |
|
"grad_norm": 1.0011924259988965, |
|
"learning_rate": 1.9422201472791515e-05, |
|
"loss": 1.1121, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.4061998931052913, |
|
"grad_norm": 0.9440180335807369, |
|
"learning_rate": 1.9358031475934093e-05, |
|
"loss": 1.1227, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4168893639764832, |
|
"grad_norm": 1.1307195146615314, |
|
"learning_rate": 1.9290600834928665e-05, |
|
"loss": 1.1219, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.42757883484767506, |
|
"grad_norm": 0.9621091161912891, |
|
"learning_rate": 1.9219933044815357e-05, |
|
"loss": 1.1048, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.43826830571886694, |
|
"grad_norm": 2.48708331314638, |
|
"learning_rate": 1.9146052728562882e-05, |
|
"loss": 1.1108, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.4489577765900588, |
|
"grad_norm": 1.3248966545586405, |
|
"learning_rate": 1.9068985628489105e-05, |
|
"loss": 1.1006, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.4596472474612507, |
|
"grad_norm": 0.8657688159326627, |
|
"learning_rate": 1.8988758597291577e-05, |
|
"loss": 1.1104, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.47033671833244256, |
|
"grad_norm": 0.935936780857328, |
|
"learning_rate": 1.8905399588691165e-05, |
|
"loss": 1.1056, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.48102618920363444, |
|
"grad_norm": 1.2429779878387484, |
|
"learning_rate": 1.8818937647692076e-05, |
|
"loss": 1.1074, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4917156600748263, |
|
"grad_norm": 1.0378059855689485, |
|
"learning_rate": 1.8729402900461627e-05, |
|
"loss": 1.1008, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5024051309460181, |
|
"grad_norm": 1.0652615925406739, |
|
"learning_rate": 1.863682654383328e-05, |
|
"loss": 1.1087, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.51309460181721, |
|
"grad_norm": 1.1862267155083708, |
|
"learning_rate": 1.8541240834436687e-05, |
|
"loss": 1.1057, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5237840726884019, |
|
"grad_norm": 0.8664081720983386, |
|
"learning_rate": 1.8442679077458383e-05, |
|
"loss": 1.0964, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5344735435595938, |
|
"grad_norm": 0.8315173201885296, |
|
"learning_rate": 1.83411756150372e-05, |
|
"loss": 1.0907, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5451630144307856, |
|
"grad_norm": 0.9387766421901986, |
|
"learning_rate": 1.8236765814298328e-05, |
|
"loss": 1.094, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5558524853019775, |
|
"grad_norm": 0.7690516313506294, |
|
"learning_rate": 1.8129486055030255e-05, |
|
"loss": 1.0948, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5665419561731694, |
|
"grad_norm": 1.2145466612540963, |
|
"learning_rate": 1.801937371700887e-05, |
|
"loss": 1.0857, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5772314270443613, |
|
"grad_norm": 0.9040800382389166, |
|
"learning_rate": 1.7906467166973096e-05, |
|
"loss": 1.1107, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5879208979155531, |
|
"grad_norm": 0.7980679102324686, |
|
"learning_rate": 1.7790805745256703e-05, |
|
"loss": 1.0901, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.598610368786745, |
|
"grad_norm": 1.0614775069350189, |
|
"learning_rate": 1.767242975208083e-05, |
|
"loss": 1.1017, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6092998396579369, |
|
"grad_norm": 0.888082998520413, |
|
"learning_rate": 1.755138043351207e-05, |
|
"loss": 1.0899, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6199893105291288, |
|
"grad_norm": 0.9317009226983197, |
|
"learning_rate": 1.742769996709098e-05, |
|
"loss": 1.1001, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6306787814003206, |
|
"grad_norm": 0.8729974876830693, |
|
"learning_rate": 1.7301431447136077e-05, |
|
"loss": 1.0828, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6413682522715125, |
|
"grad_norm": 1.2895448854179192, |
|
"learning_rate": 1.7172618869728346e-05, |
|
"loss": 1.0897, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6520577231427044, |
|
"grad_norm": 0.8660951578681755, |
|
"learning_rate": 1.704130711738157e-05, |
|
"loss": 1.0809, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6627471940138963, |
|
"grad_norm": 1.2186690464245094, |
|
"learning_rate": 1.6907541943403798e-05, |
|
"loss": 1.0921, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6734366648850882, |
|
"grad_norm": 1.4778826133569847, |
|
"learning_rate": 1.6771369955955396e-05, |
|
"loss": 1.1037, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6841261357562801, |
|
"grad_norm": 0.801944528331015, |
|
"learning_rate": 1.6632838601809243e-05, |
|
"loss": 1.0943, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.694815606627472, |
|
"grad_norm": 0.7830148390541238, |
|
"learning_rate": 1.649199614981871e-05, |
|
"loss": 1.0918, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7055050774986639, |
|
"grad_norm": 1.0185586437824388, |
|
"learning_rate": 1.634889167409923e-05, |
|
"loss": 1.0844, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7161945483698557, |
|
"grad_norm": 0.8838705526707066, |
|
"learning_rate": 1.6203575036929268e-05, |
|
"loss": 1.0853, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.7268840192410476, |
|
"grad_norm": 1.096045173824452, |
|
"learning_rate": 1.6056096871376667e-05, |
|
"loss": 1.0802, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.7375734901122395, |
|
"grad_norm": 1.19736597555692, |
|
"learning_rate": 1.5906508563656434e-05, |
|
"loss": 1.0861, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.7482629609834314, |
|
"grad_norm": 2.268133688603997, |
|
"learning_rate": 1.57548622352261e-05, |
|
"loss": 1.0792, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7589524318546232, |
|
"grad_norm": 1.155157995630093, |
|
"learning_rate": 1.5601210724624912e-05, |
|
"loss": 1.0745, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.7696419027258151, |
|
"grad_norm": 0.7705769101041595, |
|
"learning_rate": 1.5445607569063144e-05, |
|
"loss": 1.0794, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.780331373597007, |
|
"grad_norm": 1.0736892593265372, |
|
"learning_rate": 1.528810698576798e-05, |
|
"loss": 1.0895, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.7910208444681989, |
|
"grad_norm": 1.1337775203934415, |
|
"learning_rate": 1.5128763853092476e-05, |
|
"loss": 1.0754, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.8017103153393907, |
|
"grad_norm": 0.8438005844283198, |
|
"learning_rate": 1.4967633691394139e-05, |
|
"loss": 1.0833, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8123997862105826, |
|
"grad_norm": 0.8559009738146083, |
|
"learning_rate": 1.480477264368982e-05, |
|
"loss": 1.0791, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.8230892570817745, |
|
"grad_norm": 0.8637829399117554, |
|
"learning_rate": 1.4640237456093636e-05, |
|
"loss": 1.0952, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.8337787279529664, |
|
"grad_norm": 0.9219591233164315, |
|
"learning_rate": 1.44740854580448e-05, |
|
"loss": 1.0849, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.8444681988241582, |
|
"grad_norm": 0.9489856259273626, |
|
"learning_rate": 1.4306374542332141e-05, |
|
"loss": 1.0845, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.8551576696953501, |
|
"grad_norm": 0.726086087500721, |
|
"learning_rate": 1.4137163144922377e-05, |
|
"loss": 1.0757, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.865847140566542, |
|
"grad_norm": 0.7147881127002116, |
|
"learning_rate": 1.3966510224599109e-05, |
|
"loss": 1.0744, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.8765366114377339, |
|
"grad_norm": 0.9658809100864133, |
|
"learning_rate": 1.3794475242419662e-05, |
|
"loss": 1.0779, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.8872260823089257, |
|
"grad_norm": 0.8176502290457607, |
|
"learning_rate": 1.3621118140996893e-05, |
|
"loss": 1.0713, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.8979155531801176, |
|
"grad_norm": 1.3753581886211053, |
|
"learning_rate": 1.3446499323613233e-05, |
|
"loss": 1.0741, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.9086050240513095, |
|
"grad_norm": 0.8886361584497087, |
|
"learning_rate": 1.3270679633174219e-05, |
|
"loss": 1.068, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.9192944949225014, |
|
"grad_norm": 0.7532737963851583, |
|
"learning_rate": 1.3093720331008812e-05, |
|
"loss": 1.0651, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.9299839657936932, |
|
"grad_norm": 1.0561855303366106, |
|
"learning_rate": 1.291568307552397e-05, |
|
"loss": 1.0638, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.9406734366648851, |
|
"grad_norm": 1.0975528887378465, |
|
"learning_rate": 1.2736629900720832e-05, |
|
"loss": 1.0708, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.951362907536077, |
|
"grad_norm": 1.0367264540308103, |
|
"learning_rate": 1.2556623194580038e-05, |
|
"loss": 1.0697, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.9620523784072689, |
|
"grad_norm": 0.7496159960346996, |
|
"learning_rate": 1.2375725677323737e-05, |
|
"loss": 1.0658, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9727418492784607, |
|
"grad_norm": 0.7165830493866471, |
|
"learning_rate": 1.2194000379561786e-05, |
|
"loss": 1.066, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.9834313201496526, |
|
"grad_norm": 0.8117062083163005, |
|
"learning_rate": 1.2011510620329838e-05, |
|
"loss": 1.0559, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.9941207910208445, |
|
"grad_norm": 0.9671101728818755, |
|
"learning_rate": 1.1828319985026929e-05, |
|
"loss": 1.0719, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.9994655264564404, |
|
"eval_loss": 1.077652096748352, |
|
"eval_runtime": 645.1277, |
|
"eval_samples_per_second": 20.532, |
|
"eval_steps_per_second": 2.567, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.0048102618920363, |
|
"grad_norm": 1.1087788044463114, |
|
"learning_rate": 1.1644492303260218e-05, |
|
"loss": 1.012, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.0154997327632282, |
|
"grad_norm": 0.886022399302217, |
|
"learning_rate": 1.1460091626604694e-05, |
|
"loss": 0.9152, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.02618920363442, |
|
"grad_norm": 1.1706937749180708, |
|
"learning_rate": 1.12751822062855e-05, |
|
"loss": 0.9167, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.036878674505612, |
|
"grad_norm": 1.7532601167315451, |
|
"learning_rate": 1.1089828470790694e-05, |
|
"loss": 0.9232, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.0475681453768038, |
|
"grad_norm": 0.7576611900072645, |
|
"learning_rate": 1.0904095003422288e-05, |
|
"loss": 0.9135, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.0582576162479957, |
|
"grad_norm": 1.406066479915882, |
|
"learning_rate": 1.0718046519793276e-05, |
|
"loss": 0.911, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.0689470871191875, |
|
"grad_norm": 0.8276407306306659, |
|
"learning_rate": 1.053174784527863e-05, |
|
"loss": 0.9118, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0796365579903795, |
|
"grad_norm": 1.5786140198155785, |
|
"learning_rate": 1.0345263892428006e-05, |
|
"loss": 0.9143, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.0903260288615713, |
|
"grad_norm": 0.7600031883291228, |
|
"learning_rate": 1.015865963834808e-05, |
|
"loss": 0.9109, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.1010154997327632, |
|
"grad_norm": 0.9360779030634315, |
|
"learning_rate": 9.972000102062424e-06, |
|
"loss": 0.9053, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.111704970603955, |
|
"grad_norm": 0.7119953722649014, |
|
"learning_rate": 9.785350321856727e-06, |
|
"loss": 0.908, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.122394441475147, |
|
"grad_norm": 1.2298264228966356, |
|
"learning_rate": 9.59877533261735e-06, |
|
"loss": 0.9101, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.1330839123463388, |
|
"grad_norm": 0.8613400924950422, |
|
"learning_rate": 9.412340143171025e-06, |
|
"loss": 0.9084, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.1437733832175307, |
|
"grad_norm": 0.7397211579098093, |
|
"learning_rate": 9.226109713633673e-06, |
|
"loss": 0.9177, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.1544628540887225, |
|
"grad_norm": 0.8896654098285914, |
|
"learning_rate": 9.040148932776171e-06, |
|
"loss": 0.9099, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.1651523249599145, |
|
"grad_norm": 0.8849999970569379, |
|
"learning_rate": 8.854522595414964e-06, |
|
"loss": 0.9037, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.1758417958311065, |
|
"grad_norm": 0.8584268485223988, |
|
"learning_rate": 8.669295379835467e-06, |
|
"loss": 0.9191, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.1865312667022982, |
|
"grad_norm": 1.7832046863381712, |
|
"learning_rate": 8.484531825256e-06, |
|
"loss": 0.9147, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.19722073757349, |
|
"grad_norm": 0.943125758208895, |
|
"learning_rate": 8.300296309340237e-06, |
|
"loss": 0.9006, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.207910208444682, |
|
"grad_norm": 1.027773392254089, |
|
"learning_rate": 8.11665302576592e-06, |
|
"loss": 0.9075, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.218599679315874, |
|
"grad_norm": 0.8100086271002062, |
|
"learning_rate": 7.933665961857668e-06, |
|
"loss": 0.9103, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.2292891501870657, |
|
"grad_norm": 0.9544459170071978, |
|
"learning_rate": 7.751398876291725e-06, |
|
"loss": 0.9176, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.2399786210582575, |
|
"grad_norm": 2.225340528053797, |
|
"learning_rate": 7.56991527688033e-06, |
|
"loss": 0.9156, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.2506680919294495, |
|
"grad_norm": 0.9023151298540013, |
|
"learning_rate": 7.389278398443528e-06, |
|
"loss": 0.9119, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.2613575628006415, |
|
"grad_norm": 1.4863083286254688, |
|
"learning_rate": 7.2095511807760955e-06, |
|
"loss": 0.9132, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.2720470336718332, |
|
"grad_norm": 0.8183011216223799, |
|
"learning_rate": 7.0307962467172555e-06, |
|
"loss": 0.9075, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.282736504543025, |
|
"grad_norm": 1.5386124533111987, |
|
"learning_rate": 6.853075880330819e-06, |
|
"loss": 0.9027, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.293425975414217, |
|
"grad_norm": 0.7528676159107465, |
|
"learning_rate": 6.6764520052034054e-06, |
|
"loss": 0.9064, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.304115446285409, |
|
"grad_norm": 0.8967065438707555, |
|
"learning_rate": 6.500986162868224e-06, |
|
"loss": 0.9059, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.3148049171566007, |
|
"grad_norm": 1.0341197275879177, |
|
"learning_rate": 6.3267394913619864e-06, |
|
"loss": 0.8956, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.3254943880277925, |
|
"grad_norm": 0.7557659052059883, |
|
"learning_rate": 6.153772703922434e-06, |
|
"loss": 0.8995, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.3361838588989845, |
|
"grad_norm": 1.0298457610722649, |
|
"learning_rate": 5.982146067833849e-06, |
|
"loss": 0.9111, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.3468733297701765, |
|
"grad_norm": 0.6755040857479608, |
|
"learning_rate": 5.811919383427961e-06, |
|
"loss": 0.9124, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.3575628006413683, |
|
"grad_norm": 0.8917854927658709, |
|
"learning_rate": 5.6431519632475496e-06, |
|
"loss": 0.9172, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.36825227151256, |
|
"grad_norm": 0.7626134459542782, |
|
"learning_rate": 5.475902611380051e-06, |
|
"loss": 0.893, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.378941742383752, |
|
"grad_norm": 1.0706548113801684, |
|
"learning_rate": 5.31022960296824e-06, |
|
"loss": 0.9012, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.389631213254944, |
|
"grad_norm": 0.9177131965846325, |
|
"learning_rate": 5.146190663905292e-06, |
|
"loss": 0.8949, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.4003206841261358, |
|
"grad_norm": 2.534008416771525, |
|
"learning_rate": 4.9838429507212085e-06, |
|
"loss": 0.9141, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.4110101549973275, |
|
"grad_norm": 1.0276887869821825, |
|
"learning_rate": 4.823243030667576e-06, |
|
"loss": 0.8951, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.4216996258685195, |
|
"grad_norm": 0.7439623469240491, |
|
"learning_rate": 4.664446862007718e-06, |
|
"loss": 0.8957, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.4323890967397115, |
|
"grad_norm": 1.0892357195069147, |
|
"learning_rate": 4.507509774518987e-06, |
|
"loss": 0.9007, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.4430785676109033, |
|
"grad_norm": 0.7332808674148251, |
|
"learning_rate": 4.352486450214081e-06, |
|
"loss": 0.8971, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.453768038482095, |
|
"grad_norm": 1.0981109067365051, |
|
"learning_rate": 4.19943090428802e-06, |
|
"loss": 0.9038, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.464457509353287, |
|
"grad_norm": 1.0348847081193606, |
|
"learning_rate": 4.048396466297493e-06, |
|
"loss": 0.9026, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.475146980224479, |
|
"grad_norm": 0.7848138958466574, |
|
"learning_rate": 3.899435761579114e-06, |
|
"loss": 0.8992, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.4858364510956708, |
|
"grad_norm": 0.7599216516004884, |
|
"learning_rate": 3.7526006929130044e-06, |
|
"loss": 0.9067, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.4965259219668625, |
|
"grad_norm": 0.7304865400667054, |
|
"learning_rate": 3.6079424224381877e-06, |
|
"loss": 0.9002, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.5072153928380545, |
|
"grad_norm": 0.7480908189249247, |
|
"learning_rate": 3.46551135382603e-06, |
|
"loss": 0.8884, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.5179048637092465, |
|
"grad_norm": 0.7177286196799881, |
|
"learning_rate": 3.3253571147179333e-06, |
|
"loss": 0.8985, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.5285943345804383, |
|
"grad_norm": 0.867580603053855, |
|
"learning_rate": 3.1875285394334575e-06, |
|
"loss": 0.8923, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.53928380545163, |
|
"grad_norm": 0.9881425126168384, |
|
"learning_rate": 3.052073651954852e-06, |
|
"loss": 0.8993, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.549973276322822, |
|
"grad_norm": 0.9051127099020313, |
|
"learning_rate": 2.919039649193912e-06, |
|
"loss": 0.899, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.560662747194014, |
|
"grad_norm": 0.7445995785573707, |
|
"learning_rate": 2.788472884547041e-06, |
|
"loss": 0.8955, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.5713522180652058, |
|
"grad_norm": 0.8797867158896838, |
|
"learning_rate": 2.66041885174422e-06, |
|
"loss": 0.8865, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.5820416889363975, |
|
"grad_norm": 0.7305375315664138, |
|
"learning_rate": 2.534922168997488e-06, |
|
"loss": 0.8916, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.5927311598075895, |
|
"grad_norm": 2.534643027146753, |
|
"learning_rate": 2.412026563454506e-06, |
|
"loss": 0.8949, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.6034206306787815, |
|
"grad_norm": 1.2859411614634395, |
|
"learning_rate": 2.2917748559625985e-06, |
|
"loss": 0.9056, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.6141101015499733, |
|
"grad_norm": 0.9710242041721586, |
|
"learning_rate": 2.1742089461485504e-06, |
|
"loss": 0.8981, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.624799572421165, |
|
"grad_norm": 0.769368245004695, |
|
"learning_rate": 2.0593697978194207e-06, |
|
"loss": 0.9007, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.635489043292357, |
|
"grad_norm": 0.8015397618995402, |
|
"learning_rate": 1.947297424689414e-06, |
|
"loss": 0.8886, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.646178514163549, |
|
"grad_norm": 0.8275628360204516, |
|
"learning_rate": 1.8380308764377841e-06, |
|
"loss": 0.8925, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.6568679850347408, |
|
"grad_norm": 0.7756796925943065, |
|
"learning_rate": 1.7316082251026534e-06, |
|
"loss": 0.9021, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.6675574559059325, |
|
"grad_norm": 0.8141069583254018, |
|
"learning_rate": 1.6280665518154793e-06, |
|
"loss": 0.9012, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.6782469267771245, |
|
"grad_norm": 1.0082290712791837, |
|
"learning_rate": 1.5274419338807577e-06, |
|
"loss": 0.8966, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.6889363976483165, |
|
"grad_norm": 0.7257612696025059, |
|
"learning_rate": 1.4297694322055244e-06, |
|
"loss": 0.8845, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.6996258685195083, |
|
"grad_norm": 0.8013586352610749, |
|
"learning_rate": 1.3350830790829883e-06, |
|
"loss": 0.8899, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.7103153393907, |
|
"grad_norm": 2.796682716354161, |
|
"learning_rate": 1.2434158663345553e-06, |
|
"loss": 0.8822, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.721004810261892, |
|
"grad_norm": 0.8220972733831542, |
|
"learning_rate": 1.1547997338144113e-06, |
|
"loss": 0.8957, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.731694281133084, |
|
"grad_norm": 0.7615550972821201, |
|
"learning_rate": 1.0692655582806333e-06, |
|
"loss": 0.8958, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.7423837520042758, |
|
"grad_norm": 0.6868208233977947, |
|
"learning_rate": 9.868431426367054e-07, |
|
"loss": 0.8893, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.7530732228754675, |
|
"grad_norm": 0.7642536246851884, |
|
"learning_rate": 9.075612055472228e-07, |
|
"loss": 0.8904, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.7637626937466595, |
|
"grad_norm": 0.7619842486287282, |
|
"learning_rate": 8.31447371431372e-07, |
|
"loss": 0.8987, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.7744521646178515, |
|
"grad_norm": 0.7244947582854459, |
|
"learning_rate": 7.585281608376671e-07, |
|
"loss": 0.8964, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.7851416354890433, |
|
"grad_norm": 0.8227237122805585, |
|
"learning_rate": 6.888289812033355e-07, |
|
"loss": 0.8926, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.795831106360235, |
|
"grad_norm": 0.7083179730078351, |
|
"learning_rate": 6.223741180015364e-07, |
|
"loss": 0.8868, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.806520577231427, |
|
"grad_norm": 1.016704872651831, |
|
"learning_rate": 5.591867262794969e-07, |
|
"loss": 0.8893, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.817210048102619, |
|
"grad_norm": 0.7541296642350999, |
|
"learning_rate": 4.992888225905467e-07, |
|
"loss": 0.8873, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.8278995189738108, |
|
"grad_norm": 1.1828510824703147, |
|
"learning_rate": 4.4270127732282674e-07, |
|
"loss": 0.8888, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.8385889898450025, |
|
"grad_norm": 0.8854813180131454, |
|
"learning_rate": 3.894438074273654e-07, |
|
"loss": 0.8931, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.8492784607161945, |
|
"grad_norm": 0.6824883127136027, |
|
"learning_rate": 3.395349695480477e-07, |
|
"loss": 0.8919, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.8599679315873865, |
|
"grad_norm": 0.6963594352156394, |
|
"learning_rate": 2.9299215355586776e-07, |
|
"loss": 0.8918, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.8706574024585783, |
|
"grad_norm": 0.8163050203075269, |
|
"learning_rate": 2.498315764897441e-07, |
|
"loss": 0.9054, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.88134687332977, |
|
"grad_norm": 0.744317394175376, |
|
"learning_rate": 2.1006827690595478e-07, |
|
"loss": 0.8757, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.892036344200962, |
|
"grad_norm": 0.6963502727814512, |
|
"learning_rate": 1.737161096382256e-07, |
|
"loss": 0.8836, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.902725815072154, |
|
"grad_norm": 0.7735187846239215, |
|
"learning_rate": 1.407877409702496e-07, |
|
"loss": 0.8913, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.9134152859433458, |
|
"grad_norm": 0.9790540762915396, |
|
"learning_rate": 1.1129464422233615e-07, |
|
"loss": 0.8918, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.9241047568145375, |
|
"grad_norm": 0.6906743386051164, |
|
"learning_rate": 8.524709575373436e-08, |
|
"loss": 0.8968, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.9347942276857295, |
|
"grad_norm": 0.948337666554192, |
|
"learning_rate": 6.265417138201391e-08, |
|
"loss": 0.8798, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.9454836985569215, |
|
"grad_norm": 1.1478744196499828, |
|
"learning_rate": 4.352374322075359e-08, |
|
"loss": 0.8874, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.9561731694281133, |
|
"grad_norm": 0.7815098293930887, |
|
"learning_rate": 2.786247693663646e-08, |
|
"loss": 0.8874, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.966862640299305, |
|
"grad_norm": 1.3945629640091628, |
|
"learning_rate": 1.567582942691437e-08, |
|
"loss": 0.8939, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.977552111170497, |
|
"grad_norm": 0.7412508100956116, |
|
"learning_rate": 6.968046918052196e-09, |
|
"loss": 0.8891, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.988241582041689, |
|
"grad_norm": 0.720933817131335, |
|
"learning_rate": 1.7421634861936043e-09, |
|
"loss": 0.8888, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.9989310529128808, |
|
"grad_norm": 0.9923707428720276, |
|
"learning_rate": 0.0, |
|
"loss": 0.883, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.9989310529128808, |
|
"eval_loss": 1.0620321035385132, |
|
"eval_runtime": 645.1871, |
|
"eval_samples_per_second": 20.53, |
|
"eval_steps_per_second": 2.567, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.9989310529128808, |
|
"step": 1870, |
|
"total_flos": 783053739786240.0, |
|
"train_loss": 1.0084666737260666, |
|
"train_runtime": 48199.6175, |
|
"train_samples_per_second": 4.968, |
|
"train_steps_per_second": 0.039 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1870, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"total_flos": 783053739786240.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|