{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.3522493384298735,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.011761246692149369, "grad_norm": 0.25049906969070435, "learning_rate": 0.0002, "loss": 2.1846, "step": 10 },
    { "epoch": 0.023522493384298737, "grad_norm": 0.28385764360427856, "learning_rate": 0.00019952830188679245, "loss": 1.9496, "step": 20 },
    { "epoch": 0.035283740076448106, "grad_norm": 0.27383577823638916, "learning_rate": 0.0001990566037735849, "loss": 2.003, "step": 30 },
    { "epoch": 0.047044986768597474, "grad_norm": 0.3645761013031006, "learning_rate": 0.00019858490566037736, "loss": 1.9349, "step": 40 },
    { "epoch": 0.058806233460746836, "grad_norm": 0.6317451596260071, "learning_rate": 0.00019811320754716983, "loss": 1.7642, "step": 50 },
    { "epoch": 0.07056748015289621, "grad_norm": 0.18650729954242706, "learning_rate": 0.00019764150943396227, "loss": 1.9489, "step": 60 },
    { "epoch": 0.08232872684504558, "grad_norm": 0.2123485505580902, "learning_rate": 0.00019716981132075472, "loss": 1.9835, "step": 70 },
    { "epoch": 0.09408997353719495, "grad_norm": 0.2508566379547119, "learning_rate": 0.00019669811320754718, "loss": 1.8866, "step": 80 },
    { "epoch": 0.10585122022934432, "grad_norm": 0.31404730677604675, "learning_rate": 0.00019622641509433963, "loss": 1.8954, "step": 90 },
    { "epoch": 0.11761246692149367, "grad_norm": 0.5218461155891418, "learning_rate": 0.00019575471698113207, "loss": 1.7887, "step": 100 },
    { "epoch": 0.12937371361364305, "grad_norm": 0.2055450826883316, "learning_rate": 0.00019528301886792454, "loss": 1.8669, "step": 110 },
    { "epoch": 0.14113496030579242, "grad_norm": 0.22005008161067963, "learning_rate": 0.000194811320754717, "loss": 1.8794, "step": 120 },
    { "epoch": 0.1528962069979418, "grad_norm": 0.2914157807826996, "learning_rate": 0.00019433962264150945, "loss": 1.8953, "step": 130 },
    { "epoch": 0.16465745369009116, "grad_norm": 0.303595632314682, "learning_rate": 0.0001938679245283019, "loss": 1.8921, "step": 140 },
    { "epoch": 0.17641870038224053, "grad_norm": 0.6398317813873291, "learning_rate": 0.00019339622641509433, "loss": 1.8473, "step": 150 },
    { "epoch": 0.1881799470743899, "grad_norm": 0.20719175040721893, "learning_rate": 0.0001929245283018868, "loss": 1.8451, "step": 160 },
    { "epoch": 0.19994119376653927, "grad_norm": 0.21924979984760284, "learning_rate": 0.00019245283018867927, "loss": 1.9988, "step": 170 },
    { "epoch": 0.21170244045868863, "grad_norm": 0.3456704914569855, "learning_rate": 0.0001919811320754717, "loss": 1.8011, "step": 180 },
    { "epoch": 0.22346368715083798, "grad_norm": 0.3223501741886139, "learning_rate": 0.00019150943396226415, "loss": 1.8616, "step": 190 },
    { "epoch": 0.23522493384298734, "grad_norm": 0.6237074136734009, "learning_rate": 0.00019103773584905662, "loss": 1.7474, "step": 200 },
    { "epoch": 0.2469861805351367, "grad_norm": 0.19962406158447266, "learning_rate": 0.00019056603773584906, "loss": 1.9342, "step": 210 },
    { "epoch": 0.2587474272272861, "grad_norm": 0.23922637104988098, "learning_rate": 0.0001900943396226415, "loss": 1.8467, "step": 220 },
    { "epoch": 0.27050867391943545, "grad_norm": 0.28077131509780884, "learning_rate": 0.00018962264150943397, "loss": 1.8587, "step": 230 },
    { "epoch": 0.28226992061158485, "grad_norm": 0.3607043921947479, "learning_rate": 0.00018915094339622644, "loss": 1.7562, "step": 240 },
    { "epoch": 0.2940311673037342, "grad_norm": 0.5218331217765808, "learning_rate": 0.00018867924528301889, "loss": 1.6972, "step": 250 },
    { "epoch": 0.3057924139958836, "grad_norm": 0.22418326139450073, "learning_rate": 0.00018820754716981133, "loss": 1.9769, "step": 260 },
    { "epoch": 0.3175536606880329, "grad_norm": 0.2506936192512512, "learning_rate": 0.00018773584905660377, "loss": 1.9306, "step": 270 },
    { "epoch": 0.3293149073801823, "grad_norm": 0.2900485396385193, "learning_rate": 0.00018726415094339624, "loss": 1.8749, "step": 280 },
    { "epoch": 0.34107615407233166, "grad_norm": 0.36592498421669006, "learning_rate": 0.00018679245283018868, "loss": 1.8406, "step": 290 },
    { "epoch": 0.35283740076448106, "grad_norm": 0.7308420538902283, "learning_rate": 0.00018632075471698115, "loss": 1.7093, "step": 300 },
    { "epoch": 0.3645986474566304, "grad_norm": 0.2798251509666443, "learning_rate": 0.0001858490566037736, "loss": 1.9505, "step": 310 },
    { "epoch": 0.3763598941487798, "grad_norm": 0.26333555579185486, "learning_rate": 0.00018537735849056606, "loss": 1.9265, "step": 320 },
    { "epoch": 0.38812114084092914, "grad_norm": 0.34414273500442505, "learning_rate": 0.0001849056603773585, "loss": 1.8182, "step": 330 },
    { "epoch": 0.39988238753307853, "grad_norm": 0.36221399903297424, "learning_rate": 0.00018443396226415094, "loss": 1.8312, "step": 340 },
    { "epoch": 0.4116436342252279, "grad_norm": 0.7047480344772339, "learning_rate": 0.00018396226415094339, "loss": 1.743, "step": 350 },
    { "epoch": 0.42340488091737727, "grad_norm": 0.2410486787557602, "learning_rate": 0.00018349056603773585, "loss": 1.9485, "step": 360 },
    { "epoch": 0.4351661276095266, "grad_norm": 0.30608782172203064, "learning_rate": 0.00018301886792452832, "loss": 1.8891, "step": 370 },
    { "epoch": 0.44692737430167595, "grad_norm": 0.32783588767051697, "learning_rate": 0.00018254716981132077, "loss": 1.8253, "step": 380 },
    { "epoch": 0.45868862099382535, "grad_norm": 0.37461912631988525, "learning_rate": 0.0001820754716981132, "loss": 1.8013, "step": 390 },
    { "epoch": 0.4704498676859747, "grad_norm": 0.7036715149879456, "learning_rate": 0.00018160377358490568, "loss": 1.7548, "step": 400 },
    { "epoch": 0.4822111143781241, "grad_norm": 0.23341324925422668, "learning_rate": 0.00018113207547169812, "loss": 1.8426, "step": 410 },
    { "epoch": 0.4939723610702734, "grad_norm": 0.29215207695961, "learning_rate": 0.00018066037735849056, "loss": 1.8704, "step": 420 },
    { "epoch": 0.5057336077624228, "grad_norm": 0.37499144673347473, "learning_rate": 0.00018018867924528303, "loss": 1.7602, "step": 430 },
    { "epoch": 0.5174948544545722, "grad_norm": 0.41657859086990356, "learning_rate": 0.0001797169811320755, "loss": 1.8011, "step": 440 },
    { "epoch": 0.5292561011467215, "grad_norm": 0.6587756872177124, "learning_rate": 0.00017924528301886794, "loss": 1.6643, "step": 450 },
    { "epoch": 0.5410173478388709, "grad_norm": 0.28515782952308655, "learning_rate": 0.00017877358490566038, "loss": 1.8037, "step": 460 },
    { "epoch": 0.5527785945310203, "grad_norm": 0.2742769420146942, "learning_rate": 0.00017830188679245282, "loss": 1.8544, "step": 470 },
    { "epoch": 0.5645398412231697, "grad_norm": 0.34683799743652344, "learning_rate": 0.0001778301886792453, "loss": 1.7819, "step": 480 },
    { "epoch": 0.576301087915319, "grad_norm": 0.47388383746147156, "learning_rate": 0.00017735849056603776, "loss": 1.786, "step": 490 },
    { "epoch": 0.5880623346074684, "grad_norm": 0.617415726184845, "learning_rate": 0.0001768867924528302, "loss": 1.7053, "step": 500 },
    { "epoch": 0.5998235812996178, "grad_norm": 0.26782867312431335, "learning_rate": 0.00017641509433962265, "loss": 1.8774, "step": 510 },
    { "epoch": 0.6115848279917672, "grad_norm": 0.3381577134132385, "learning_rate": 0.00017594339622641511, "loss": 1.8537, "step": 520 },
    { "epoch": 0.6233460746839165, "grad_norm": 0.3665984272956848, "learning_rate": 0.00017547169811320756, "loss": 1.7465, "step": 530 },
    { "epoch": 0.6351073213760658, "grad_norm": 0.46630290150642395, "learning_rate": 0.000175, "loss": 1.7545, "step": 540 },
    { "epoch": 0.6468685680682152, "grad_norm": 0.7455469369888306, "learning_rate": 0.00017452830188679247, "loss": 1.6776, "step": 550 },
    { "epoch": 0.6586298147603646, "grad_norm": 0.27579784393310547, "learning_rate": 0.0001740566037735849, "loss": 1.8787, "step": 560 },
    { "epoch": 0.6703910614525139, "grad_norm": 0.3148879110813141, "learning_rate": 0.00017358490566037738, "loss": 1.8989, "step": 570 },
    { "epoch": 0.6821523081446633, "grad_norm": 0.3903751075267792, "learning_rate": 0.00017311320754716982, "loss": 1.7702, "step": 580 },
    { "epoch": 0.6939135548368127, "grad_norm": 0.4537353217601776, "learning_rate": 0.00017264150943396226, "loss": 1.758, "step": 590 },
    { "epoch": 0.7056748015289621, "grad_norm": 0.7169495224952698, "learning_rate": 0.0001721698113207547, "loss": 1.5495, "step": 600 },
    { "epoch": 0.7174360482211114, "grad_norm": 0.2942892909049988, "learning_rate": 0.00017169811320754717, "loss": 1.7981, "step": 610 },
    { "epoch": 0.7291972949132608, "grad_norm": 0.39550286531448364, "learning_rate": 0.00017122641509433964, "loss": 1.7919, "step": 620 },
    { "epoch": 0.7409585416054102, "grad_norm": 0.3948846459388733, "learning_rate": 0.00017075471698113208, "loss": 1.7793, "step": 630 },
    { "epoch": 0.7527197882975596, "grad_norm": 0.4996489882469177, "learning_rate": 0.00017028301886792453, "loss": 1.6956, "step": 640 },
    { "epoch": 0.7644810349897089, "grad_norm": 0.7511508464813232, "learning_rate": 0.000169811320754717, "loss": 1.6399, "step": 650 },
    { "epoch": 0.7762422816818583, "grad_norm": 0.3312196433544159, "learning_rate": 0.00016933962264150944, "loss": 1.7876, "step": 660 },
    { "epoch": 0.7880035283740077, "grad_norm": 0.40000253915786743, "learning_rate": 0.00016886792452830188, "loss": 1.8278, "step": 670 },
    { "epoch": 0.7997647750661571, "grad_norm": 0.4055274724960327, "learning_rate": 0.00016839622641509435, "loss": 1.6638, "step": 680 },
    { "epoch": 0.8115260217583063, "grad_norm": 0.48130497336387634, "learning_rate": 0.00016792452830188682, "loss": 1.7075, "step": 690 },
    { "epoch": 0.8232872684504557, "grad_norm": 1.0582154989242554, "learning_rate": 0.00016745283018867926, "loss": 1.5322, "step": 700 },
    { "epoch": 0.8350485151426051, "grad_norm": 0.31292250752449036, "learning_rate": 0.0001669811320754717, "loss": 1.8269, "step": 710 },
    { "epoch": 0.8468097618347545, "grad_norm": 0.3395911157131195, "learning_rate": 0.00016650943396226414, "loss": 1.7563, "step": 720 },
    { "epoch": 0.8585710085269038, "grad_norm": 0.4362980127334595, "learning_rate": 0.0001660377358490566, "loss": 1.6545, "step": 730 },
    { "epoch": 0.8703322552190532, "grad_norm": 0.5648341774940491, "learning_rate": 0.00016556603773584908, "loss": 1.5902, "step": 740 },
    { "epoch": 0.8820935019112026, "grad_norm": 0.8163714408874512, "learning_rate": 0.00016509433962264152, "loss": 1.6889, "step": 750 },
    { "epoch": 0.8938547486033519, "grad_norm": 0.3610120117664337, "learning_rate": 0.00016462264150943396, "loss": 1.8061, "step": 760 },
    { "epoch": 0.9056159952955013, "grad_norm": 0.40071502327919006, "learning_rate": 0.00016415094339622643, "loss": 1.7412, "step": 770 },
    { "epoch": 0.9173772419876507, "grad_norm": 0.4744262993335724, "learning_rate": 0.00016367924528301887, "loss": 1.7553, "step": 780 },
    { "epoch": 0.9291384886798001, "grad_norm": 0.5387608408927917, "learning_rate": 0.00016320754716981132, "loss": 1.6238, "step": 790 },
    { "epoch": 0.9408997353719494, "grad_norm": 0.9463699460029602, "learning_rate": 0.00016273584905660379, "loss": 1.4965, "step": 800 },
    { "epoch": 0.9526609820640988, "grad_norm": 0.39017385244369507, "learning_rate": 0.00016226415094339625, "loss": 1.7494, "step": 810 },
    { "epoch": 0.9644222287562482, "grad_norm": 0.39241862297058105, "learning_rate": 0.0001617924528301887, "loss": 1.745, "step": 820 },
    { "epoch": 0.9761834754483976, "grad_norm": 0.4188750982284546, "learning_rate": 0.00016132075471698114, "loss": 1.7072, "step": 830 },
    { "epoch": 0.9879447221405468, "grad_norm": 0.54363614320755, "learning_rate": 0.00016084905660377358, "loss": 1.6571, "step": 840 },
    { "epoch": 0.9997059688326962, "grad_norm": 0.8282334804534912, "learning_rate": 0.00016037735849056605, "loss": 1.5609, "step": 850 },
    { "epoch": 1.0114672155248456, "grad_norm": 0.4861317574977875, "learning_rate": 0.0001599056603773585, "loss": 1.7427, "step": 860 },
    { "epoch": 1.023228462216995, "grad_norm": 0.47034987807273865, "learning_rate": 0.00015943396226415096, "loss": 1.4911, "step": 870 },
    { "epoch": 1.0349897089091444, "grad_norm": 0.8243444561958313, "learning_rate": 0.0001589622641509434, "loss": 1.242, "step": 880 },
    { "epoch": 1.0467509556012937, "grad_norm": 0.8107286095619202, "learning_rate": 0.00015849056603773587, "loss": 1.1067, "step": 890 },
    { "epoch": 1.058512202293443, "grad_norm": 0.9792178869247437, "learning_rate": 0.0001580188679245283, "loss": 0.9078, "step": 900 },
    { "epoch": 1.0702734489855925, "grad_norm": 0.4514322280883789, "learning_rate": 0.00015754716981132075, "loss": 1.5328, "step": 910 },
    { "epoch": 1.0820346956777418, "grad_norm": 0.5203831791877747, "learning_rate": 0.0001570754716981132, "loss": 1.4851, "step": 920 },
    { "epoch": 1.0937959423698913, "grad_norm": 0.7015544176101685, "learning_rate": 0.00015660377358490567, "loss": 1.2215, "step": 930 },
    { "epoch": 1.1055571890620406, "grad_norm": 0.7290483117103577, "learning_rate": 0.00015613207547169813, "loss": 1.0831, "step": 940 },
    { "epoch": 1.1173184357541899, "grad_norm": 1.0971975326538086, "learning_rate": 0.00015566037735849058, "loss": 0.8673, "step": 950 },
    { "epoch": 1.1290796824463394, "grad_norm": 0.5123384594917297, "learning_rate": 0.00015518867924528302, "loss": 1.5301, "step": 960 },
    { "epoch": 1.1408409291384887, "grad_norm": 0.6260602474212646, "learning_rate": 0.0001547169811320755, "loss": 1.4956, "step": 970 },
    { "epoch": 1.152602175830638, "grad_norm": 0.6829984188079834, "learning_rate": 0.00015424528301886793, "loss": 1.3128, "step": 980 },
    { "epoch": 1.1643634225227875, "grad_norm": 0.7748053073883057, "learning_rate": 0.00015377358490566037, "loss": 1.1702, "step": 990 },
    { "epoch": 1.1761246692149367, "grad_norm": 1.001291036605835, "learning_rate": 0.00015330188679245284, "loss": 0.8918, "step": 1000 },
    { "epoch": 1.1878859159070863, "grad_norm": 0.517902135848999, "learning_rate": 0.0001528301886792453, "loss": 1.4376, "step": 1010 },
    { "epoch": 1.1996471625992355, "grad_norm": 0.6000102758407593, "learning_rate": 0.00015235849056603775, "loss": 1.4512, "step": 1020 },
    { "epoch": 1.2114084092913848, "grad_norm": 0.762768566608429, "learning_rate": 0.0001518867924528302, "loss": 1.3248, "step": 1030 },
    { "epoch": 1.2231696559835343, "grad_norm": 0.9720354676246643, "learning_rate": 0.00015141509433962263, "loss": 1.0415, "step": 1040 },
    { "epoch": 1.2349309026756836, "grad_norm": 0.902864396572113, "learning_rate": 0.0001509433962264151, "loss": 0.8803, "step": 1050 },
    { "epoch": 1.246692149367833, "grad_norm": 0.5235794186592102, "learning_rate": 0.00015047169811320757, "loss": 1.4755, "step": 1060 },
    { "epoch": 1.2584533960599824, "grad_norm": 0.5898970365524292, "learning_rate": 0.00015000000000000001, "loss": 1.4036, "step": 1070 },
    { "epoch": 1.2702146427521317, "grad_norm": 0.9541133642196655, "learning_rate": 0.00014952830188679246, "loss": 1.197, "step": 1080 },
    { "epoch": 1.281975889444281, "grad_norm": 0.9920721054077148, "learning_rate": 0.0001490566037735849, "loss": 1.0743, "step": 1090 },
    { "epoch": 1.2937371361364305, "grad_norm": 1.3523385524749756, "learning_rate": 0.00014858490566037737, "loss": 0.8989, "step": 1100 },
    { "epoch": 1.3054983828285798, "grad_norm": 0.5665034055709839, "learning_rate": 0.0001481132075471698, "loss": 1.4432, "step": 1110 },
    { "epoch": 1.3172596295207293, "grad_norm": 0.6107054352760315, "learning_rate": 0.00014764150943396228, "loss": 1.356, "step": 1120 },
    { "epoch": 1.3290208762128786, "grad_norm": 0.7833155393600464, "learning_rate": 0.00014716981132075472, "loss": 1.2708, "step": 1130 },
    { "epoch": 1.3407821229050279, "grad_norm": 0.9629625082015991, "learning_rate": 0.0001466981132075472, "loss": 0.9813, "step": 1140 },
    { "epoch": 1.3525433695971774, "grad_norm": 1.0938910245895386, "learning_rate": 0.00014622641509433963, "loss": 0.8002, "step": 1150 },
    { "epoch": 1.3643046162893266, "grad_norm": 0.5895722508430481, "learning_rate": 0.00014575471698113207, "loss": 1.469, "step": 1160 },
    { "epoch": 1.3760658629814762, "grad_norm": 0.6274592280387878, "learning_rate": 0.00014528301886792451, "loss": 1.2954, "step": 1170 },
    { "epoch": 1.3878271096736254, "grad_norm": 0.748171329498291, "learning_rate": 0.00014481132075471698, "loss": 1.1003, "step": 1180 },
    { "epoch": 1.3995883563657747, "grad_norm": 1.0281026363372803, "learning_rate": 0.00014433962264150945, "loss": 1.0006, "step": 1190 },
    { "epoch": 1.411349603057924, "grad_norm": 1.0714832544326782, "learning_rate": 0.0001438679245283019, "loss": 0.8439, "step": 1200 },
    { "epoch": 1.4231108497500735, "grad_norm": 0.6404314637184143, "learning_rate": 0.00014339622641509434, "loss": 1.4463, "step": 1210 },
    { "epoch": 1.4348720964422228, "grad_norm": 0.6800934672355652, "learning_rate": 0.0001429245283018868, "loss": 1.2484, "step": 1220 },
    { "epoch": 1.4466333431343723, "grad_norm": 0.8627371191978455, "learning_rate": 0.00014245283018867925, "loss": 1.1863, "step": 1230 },
    { "epoch": 1.4583945898265216, "grad_norm": 1.0996595621109009, "learning_rate": 0.0001419811320754717, "loss": 0.9519, "step": 1240 },
    { "epoch": 1.4701558365186709, "grad_norm": 1.1529676914215088, "learning_rate": 0.00014150943396226416, "loss": 0.8407, "step": 1250 },
    { "epoch": 1.4819170832108204, "grad_norm": 0.611027717590332, "learning_rate": 0.00014103773584905663, "loss": 1.3786, "step": 1260 },
    { "epoch": 1.4936783299029697, "grad_norm": 0.7889626026153564, "learning_rate": 0.00014056603773584907, "loss": 1.2603, "step": 1270 },
    { "epoch": 1.5054395765951192, "grad_norm": 0.8136641979217529, "learning_rate": 0.0001400943396226415, "loss": 1.0535, "step": 1280 },
    { "epoch": 1.5172008232872685, "grad_norm": 1.0993061065673828, "learning_rate": 0.00013962264150943395, "loss": 0.9192, "step": 1290 },
    { "epoch": 1.5289620699794177, "grad_norm": 1.2532891035079956, "learning_rate": 0.00013915094339622642, "loss": 0.8772, "step": 1300 },
    { "epoch": 1.540723316671567, "grad_norm": 0.6979594826698303, "learning_rate": 0.0001386792452830189, "loss": 1.3564, "step": 1310 },
    { "epoch": 1.5524845633637165, "grad_norm": 0.6345073580741882, "learning_rate": 0.00013820754716981133, "loss": 1.235, "step": 1320 },
    { "epoch": 1.564245810055866, "grad_norm": 1.0022692680358887, "learning_rate": 0.00013773584905660377, "loss": 1.0533, "step": 1330 },
    { "epoch": 1.5760070567480153, "grad_norm": 1.0487345457077026, "learning_rate": 0.00013726415094339624, "loss": 0.9879, "step": 1340 },
    { "epoch": 1.5877683034401646, "grad_norm": 1.3332520723342896, "learning_rate": 0.00013679245283018868, "loss": 0.8568, "step": 1350 },
    { "epoch": 1.599529550132314, "grad_norm": 0.6801854968070984, "learning_rate": 0.00013632075471698113, "loss": 1.3149, "step": 1360 },
    { "epoch": 1.6112907968244634, "grad_norm": 0.7094405293464661, "learning_rate": 0.0001358490566037736, "loss": 1.2843, "step": 1370 },
    { "epoch": 1.623052043516613, "grad_norm": 0.7568113803863525, "learning_rate": 0.00013537735849056606, "loss": 1.1169, "step": 1380 },
    { "epoch": 1.6348132902087622, "grad_norm": 1.1939420700073242, "learning_rate": 0.0001349056603773585, "loss": 0.8441, "step": 1390 },
    { "epoch": 1.6465745369009115, "grad_norm": 1.4502966403961182, "learning_rate": 0.00013443396226415095, "loss": 0.801, "step": 1400 },
    { "epoch": 1.6583357835930608, "grad_norm": 0.6542213559150696, "learning_rate": 0.0001339622641509434, "loss": 1.4153, "step": 1410 },
    { "epoch": 1.6700970302852103, "grad_norm": 0.7604705691337585, "learning_rate": 0.00013349056603773586, "loss": 1.25, "step": 1420 },
    { "epoch": 1.6818582769773596, "grad_norm": 0.8076483607292175, "learning_rate": 0.0001330188679245283, "loss": 1.1673, "step": 1430 },
    { "epoch": 1.693619523669509, "grad_norm": 0.9957693815231323, "learning_rate": 0.00013254716981132077, "loss": 0.9437, "step": 1440 },
    { "epoch": 1.7053807703616584, "grad_norm": 1.2569739818572998, "learning_rate": 0.0001320754716981132, "loss": 0.7948, "step": 1450 },
    { "epoch": 1.7171420170538076, "grad_norm": 0.8244763016700745, "learning_rate": 0.00013160377358490568, "loss": 1.3546, "step": 1460 },
    { "epoch": 1.728903263745957, "grad_norm": 0.8371909856796265, "learning_rate": 0.00013113207547169812, "loss": 1.2031, "step": 1470 },
    { "epoch": 1.7406645104381064, "grad_norm": 1.203465223312378, "learning_rate": 0.00013066037735849056, "loss": 1.0183, "step": 1480 },
    { "epoch": 1.752425757130256, "grad_norm": 1.2281197309494019, "learning_rate": 0.000130188679245283, "loss": 0.9311, "step": 1490 },
    { "epoch": 1.7641870038224052, "grad_norm": 1.3259741067886353, "learning_rate": 0.00012971698113207548, "loss": 0.872, "step": 1500 },
    { "epoch": 1.7759482505145545, "grad_norm": 0.7928496599197388, "learning_rate": 0.00012924528301886794, "loss": 1.3436, "step": 1510 },
    { "epoch": 1.7877094972067038, "grad_norm": 0.8125369548797607, "learning_rate": 0.00012877358490566039, "loss": 1.0189, "step": 1520 },
    { "epoch": 1.7994707438988533, "grad_norm": 1.0345025062561035, "learning_rate": 0.00012830188679245283, "loss": 1.0006, "step": 1530 },
    { "epoch": 1.8112319905910026, "grad_norm": 0.8656748533248901, "learning_rate": 0.0001278301886792453, "loss": 0.8927, "step": 1540 },
    { "epoch": 1.822993237283152, "grad_norm": 1.12923264503479, "learning_rate": 0.00012735849056603774, "loss": 0.7717, "step": 1550 },
    { "epoch": 1.8347544839753014, "grad_norm": 0.898140549659729, "learning_rate": 0.00012688679245283018, "loss": 1.2768, "step": 1560 },
    { "epoch": 1.8465157306674507, "grad_norm": 0.748009204864502, "learning_rate": 0.00012641509433962265, "loss": 1.1579, "step": 1570 },
    { "epoch": 1.8582769773596, "grad_norm": 1.3326165676116943, "learning_rate": 0.00012594339622641512, "loss": 0.973, "step": 1580 },
    { "epoch": 1.8700382240517495, "grad_norm": 0.9244058132171631, "learning_rate": 0.00012547169811320756, "loss": 0.929, "step": 1590 },
    { "epoch": 1.881799470743899, "grad_norm": 1.3473211526870728, "learning_rate": 0.000125, "loss": 0.7777, "step": 1600 },
    { "epoch": 1.8935607174360483, "grad_norm": 0.8593601584434509, "learning_rate": 0.00012452830188679244, "loss": 1.327, "step": 1610 },
    { "epoch": 1.9053219641281975, "grad_norm": 0.8441507816314697, "learning_rate": 0.0001240566037735849, "loss": 1.1585, "step": 1620 },
    { "epoch": 1.9170832108203468, "grad_norm": 0.908469557762146, "learning_rate": 0.00012358490566037738, "loss": 0.9916, "step": 1630 },
    { "epoch": 1.9288444575124963, "grad_norm": 1.1003684997558594, "learning_rate": 0.00012311320754716982, "loss": 0.7808, "step": 1640 },
    { "epoch": 1.9406057042046458, "grad_norm": 1.2000435590744019, "learning_rate": 0.00012264150943396227, "loss": 0.847, "step": 1650 },
    { "epoch": 1.9523669508967951, "grad_norm": 0.7908065915107727, "learning_rate": 0.0001221698113207547, "loss": 1.2671, "step": 1660 },
    { "epoch": 1.9641281975889444, "grad_norm": 0.8809382319450378, "learning_rate": 0.00012169811320754718, "loss": 1.1279, "step": 1670 },
    { "epoch": 1.9758894442810937, "grad_norm": 1.1937824487686157, "learning_rate": 0.00012122641509433963, "loss": 0.8854, "step": 1680 },
    { "epoch": 1.9876506909732432, "grad_norm": 1.0509068965911865, "learning_rate": 0.00012075471698113207, "loss": 0.7986, "step": 1690 },
    { "epoch": 1.9994119376653925, "grad_norm": 1.2940934896469116, "learning_rate": 0.00012028301886792453, "loss": 0.8177, "step": 1700 },
    { "epoch": 2.011173184357542, "grad_norm": 1.00706148147583, "learning_rate": 0.000119811320754717, "loss": 0.9009, "step": 1710 },
    { "epoch": 2.0229344310496913, "grad_norm": 0.7884982824325562, "learning_rate": 0.00011933962264150944, "loss": 0.5944, "step": 1720 },
    { "epoch": 2.0346956777418406, "grad_norm": 0.8072102069854736, "learning_rate": 0.00011886792452830188, "loss": 0.4579, "step": 1730 },
    { "epoch": 2.04645692443399, "grad_norm": 1.0868206024169922, "learning_rate": 0.00011839622641509434, "loss": 0.3714, "step": 1740 },
    { "epoch": 2.0582181711261396, "grad_norm": 1.2487127780914307, "learning_rate": 0.00011792452830188681, "loss": 0.2964, "step": 1750 },
    { "epoch": 2.069979417818289, "grad_norm": 0.8450261354446411, "learning_rate": 0.00011745283018867925, "loss": 0.8774, "step": 1760 },
    { "epoch": 2.081740664510438, "grad_norm": 0.8103846311569214, "learning_rate": 0.0001169811320754717, "loss": 0.6733, "step": 1770 },
    { "epoch": 2.0935019112025874, "grad_norm": 0.7691318392753601, "learning_rate": 0.00011650943396226415, "loss": 0.5036, "step": 1780 },
    { "epoch": 2.1052631578947367, "grad_norm": 0.8625450134277344, "learning_rate": 0.00011603773584905662, "loss": 0.33, "step": 1790 },
    { "epoch": 2.117024404586886, "grad_norm": 1.036942481994629, "learning_rate": 0.00011556603773584907, "loss": 0.3417, "step": 1800 },
    { "epoch": 2.1287856512790357, "grad_norm": 0.7786136269569397, "learning_rate": 0.00011509433962264151, "loss": 0.7165, "step": 1810 },
    { "epoch": 2.140546897971185, "grad_norm": 0.7121214866638184, "learning_rate": 0.00011462264150943395, "loss": 0.5809, "step": 1820 },
    { "epoch": 2.1523081446633343, "grad_norm": 0.8065999150276184, "learning_rate": 0.00011415094339622642, "loss": 0.4644, "step": 1830 },
    { "epoch": 2.1640693913554836, "grad_norm": 1.0368797779083252, "learning_rate": 0.00011367924528301888, "loss": 0.3668, "step": 1840 },
    { "epoch": 2.175830638047633, "grad_norm": 0.7784335613250732, "learning_rate": 0.00011320754716981132, "loss": 0.2946, "step": 1850 },
    { "epoch": 2.1875918847397826, "grad_norm": 1.0500203371047974, "learning_rate": 0.00011273584905660378, "loss": 0.8046, "step": 1860 },
    { "epoch": 2.199353131431932, "grad_norm": 0.8676533699035645, "learning_rate": 0.00011226415094339624, "loss": 0.6096, "step": 1870 },
    { "epoch": 2.211114378124081, "grad_norm": 0.8250516057014465, "learning_rate": 0.00011179245283018869, "loss": 0.4479, "step": 1880 },
    { "epoch": 2.2228756248162305, "grad_norm": 1.0254476070404053, "learning_rate": 0.00011132075471698113, "loss": 0.3672, "step": 1890 },
    { "epoch": 2.2346368715083798, "grad_norm": 0.9448462128639221, "learning_rate": 0.00011084905660377358, "loss": 0.2956, "step": 1900 },
    { "epoch": 2.2463981182005295, "grad_norm": 0.9094712138175964, "learning_rate": 0.00011037735849056605, "loss": 0.8546, "step": 1910 },
    { "epoch": 2.2581593648926788, "grad_norm": 0.7168066501617432, "learning_rate": 0.0001099056603773585, "loss": 0.6113, "step": 1920 },
    { "epoch": 2.269920611584828, "grad_norm": 0.9491825699806213, "learning_rate": 0.00010943396226415095, "loss": 0.4896, "step": 1930 },
    { "epoch": 2.2816818582769773, "grad_norm": 0.9781097173690796, "learning_rate": 0.00010896226415094339, "loss": 0.3119, "step": 1940 },
    { "epoch": 2.2934431049691266, "grad_norm": 1.6303428411483765, "learning_rate": 0.00010849056603773586, "loss": 0.3224, "step": 1950 },
    { "epoch": 2.305204351661276, "grad_norm": 0.9339887499809265, "learning_rate": 0.00010801886792452832, "loss": 0.7914, "step": 1960 },
    { "epoch": 2.3169655983534256, "grad_norm": 0.8690701127052307, "learning_rate": 0.00010754716981132076, "loss": 0.5327, "step": 1970 },
    { "epoch": 2.328726845045575, "grad_norm": 0.8797821998596191, "learning_rate": 0.0001070754716981132, "loss": 0.3846, "step": 1980 },
    { "epoch": 2.340488091737724, "grad_norm": 1.1787986755371094, "learning_rate": 0.00010660377358490567, "loss": 0.3308, "step": 1990 },
    { "epoch": 2.3522493384298735, "grad_norm": 1.682032585144043, "learning_rate": 0.00010613207547169812, "loss": 0.3173, "step": 2000 }
  ],
  "logging_steps": 10,
  "max_steps": 4250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.9950155919985664e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}