{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.3522493384298735, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011761246692149369, "grad_norm": 0.25049906969070435, "learning_rate": 0.0002, "loss": 2.1846, "step": 10 }, { "epoch": 0.023522493384298737, "grad_norm": 0.28385764360427856, "learning_rate": 0.00019952830188679245, "loss": 1.9496, "step": 20 }, { "epoch": 0.035283740076448106, "grad_norm": 0.27383577823638916, "learning_rate": 0.0001990566037735849, "loss": 2.003, "step": 30 }, { "epoch": 0.047044986768597474, "grad_norm": 0.3645761013031006, "learning_rate": 0.00019858490566037736, "loss": 1.9349, "step": 40 }, { "epoch": 0.058806233460746836, "grad_norm": 0.6317451596260071, "learning_rate": 0.00019811320754716983, "loss": 1.7642, "step": 50 }, { "epoch": 0.07056748015289621, "grad_norm": 0.18650729954242706, "learning_rate": 0.00019764150943396227, "loss": 1.9489, "step": 60 }, { "epoch": 0.08232872684504558, "grad_norm": 0.2123485505580902, "learning_rate": 0.00019716981132075472, "loss": 1.9835, "step": 70 }, { "epoch": 0.09408997353719495, "grad_norm": 0.2508566379547119, "learning_rate": 0.00019669811320754718, "loss": 1.8866, "step": 80 }, { "epoch": 0.10585122022934432, "grad_norm": 0.31404730677604675, "learning_rate": 0.00019622641509433963, "loss": 1.8954, "step": 90 }, { "epoch": 0.11761246692149367, "grad_norm": 0.5218461155891418, "learning_rate": 0.00019575471698113207, "loss": 1.7887, "step": 100 }, { "epoch": 0.12937371361364305, "grad_norm": 0.2055450826883316, "learning_rate": 0.00019528301886792454, "loss": 1.8669, "step": 110 }, { "epoch": 0.14113496030579242, "grad_norm": 0.22005008161067963, "learning_rate": 0.000194811320754717, "loss": 1.8794, "step": 120 }, { "epoch": 0.1528962069979418, "grad_norm": 0.2914157807826996, "learning_rate": 0.00019433962264150945, "loss": 1.8953, "step": 130 }, { "epoch": 0.16465745369009116, "grad_norm": 0.303595632314682, "learning_rate": 0.0001938679245283019, "loss": 1.8921, "step": 140 }, { "epoch": 0.17641870038224053, "grad_norm": 0.6398317813873291, "learning_rate": 0.00019339622641509433, "loss": 1.8473, "step": 150 }, { "epoch": 0.1881799470743899, "grad_norm": 0.20719175040721893, "learning_rate": 0.0001929245283018868, "loss": 1.8451, "step": 160 }, { "epoch": 0.19994119376653927, "grad_norm": 0.21924979984760284, "learning_rate": 0.00019245283018867927, "loss": 1.9988, "step": 170 }, { "epoch": 0.21170244045868863, "grad_norm": 0.3456704914569855, "learning_rate": 0.0001919811320754717, "loss": 1.8011, "step": 180 }, { "epoch": 0.22346368715083798, "grad_norm": 0.3223501741886139, "learning_rate": 0.00019150943396226415, "loss": 1.8616, "step": 190 }, { "epoch": 0.23522493384298734, "grad_norm": 0.6237074136734009, "learning_rate": 0.00019103773584905662, "loss": 1.7474, "step": 200 }, { "epoch": 0.2469861805351367, "grad_norm": 0.19962406158447266, "learning_rate": 0.00019056603773584906, "loss": 1.9342, "step": 210 }, { "epoch": 0.2587474272272861, "grad_norm": 0.23922637104988098, "learning_rate": 0.0001900943396226415, "loss": 1.8467, "step": 220 }, { "epoch": 0.27050867391943545, "grad_norm": 0.28077131509780884, "learning_rate": 0.00018962264150943397, "loss": 1.8587, "step": 230 }, { "epoch": 0.28226992061158485, "grad_norm": 0.3607043921947479, "learning_rate": 0.00018915094339622644, "loss": 1.7562, "step": 240 }, { "epoch": 0.2940311673037342, "grad_norm": 0.5218331217765808, "learning_rate": 0.00018867924528301889, "loss": 1.6972, "step": 250 }, { "epoch": 0.3057924139958836, "grad_norm": 0.22418326139450073, "learning_rate": 0.00018820754716981133, "loss": 1.9769, "step": 260 }, { "epoch": 0.3175536606880329, "grad_norm": 0.2506936192512512, "learning_rate": 0.00018773584905660377, "loss": 1.9306, "step": 270 }, { "epoch": 0.3293149073801823, "grad_norm": 0.2900485396385193, "learning_rate": 0.00018726415094339624, "loss": 1.8749, "step": 280 }, { "epoch": 0.34107615407233166, "grad_norm": 0.36592498421669006, "learning_rate": 0.00018679245283018868, "loss": 1.8406, "step": 290 }, { "epoch": 0.35283740076448106, "grad_norm": 0.7308420538902283, "learning_rate": 0.00018632075471698115, "loss": 1.7093, "step": 300 }, { "epoch": 0.3645986474566304, "grad_norm": 0.2798251509666443, "learning_rate": 0.0001858490566037736, "loss": 1.9505, "step": 310 }, { "epoch": 0.3763598941487798, "grad_norm": 0.26333555579185486, "learning_rate": 0.00018537735849056606, "loss": 1.9265, "step": 320 }, { "epoch": 0.38812114084092914, "grad_norm": 0.34414273500442505, "learning_rate": 0.0001849056603773585, "loss": 1.8182, "step": 330 }, { "epoch": 0.39988238753307853, "grad_norm": 0.36221399903297424, "learning_rate": 0.00018443396226415094, "loss": 1.8312, "step": 340 }, { "epoch": 0.4116436342252279, "grad_norm": 0.7047480344772339, "learning_rate": 0.00018396226415094339, "loss": 1.743, "step": 350 }, { "epoch": 0.42340488091737727, "grad_norm": 0.2410486787557602, "learning_rate": 0.00018349056603773585, "loss": 1.9485, "step": 360 }, { "epoch": 0.4351661276095266, "grad_norm": 0.30608782172203064, "learning_rate": 0.00018301886792452832, "loss": 1.8891, "step": 370 }, { "epoch": 0.44692737430167595, "grad_norm": 0.32783588767051697, "learning_rate": 0.00018254716981132077, "loss": 1.8253, "step": 380 }, { "epoch": 0.45868862099382535, "grad_norm": 0.37461912631988525, "learning_rate": 0.0001820754716981132, "loss": 1.8013, "step": 390 }, { "epoch": 0.4704498676859747, "grad_norm": 0.7036715149879456, "learning_rate": 0.00018160377358490568, "loss": 1.7548, "step": 400 }, { "epoch": 0.4822111143781241, "grad_norm": 0.23341324925422668, "learning_rate": 0.00018113207547169812, "loss": 1.8426, "step": 410 }, { "epoch": 0.4939723610702734, "grad_norm": 0.29215207695961, "learning_rate": 0.00018066037735849056, "loss": 1.8704, "step": 420 }, { "epoch": 0.5057336077624228, "grad_norm": 0.37499144673347473, "learning_rate": 0.00018018867924528303, "loss": 1.7602, "step": 430 }, { "epoch": 0.5174948544545722, "grad_norm": 0.41657859086990356, "learning_rate": 0.0001797169811320755, "loss": 1.8011, "step": 440 }, { "epoch": 0.5292561011467215, "grad_norm": 0.6587756872177124, "learning_rate": 0.00017924528301886794, "loss": 1.6643, "step": 450 }, { "epoch": 0.5410173478388709, "grad_norm": 0.28515782952308655, "learning_rate": 0.00017877358490566038, "loss": 1.8037, "step": 460 }, { "epoch": 0.5527785945310203, "grad_norm": 0.2742769420146942, "learning_rate": 0.00017830188679245282, "loss": 1.8544, "step": 470 }, { "epoch": 0.5645398412231697, "grad_norm": 0.34683799743652344, "learning_rate": 0.0001778301886792453, "loss": 1.7819, "step": 480 }, { "epoch": 0.576301087915319, "grad_norm": 0.47388383746147156, "learning_rate": 0.00017735849056603776, "loss": 1.786, "step": 490 }, { "epoch": 0.5880623346074684, "grad_norm": 0.617415726184845, "learning_rate": 0.0001768867924528302, "loss": 1.7053, "step": 500 }, { "epoch": 0.5998235812996178, "grad_norm": 0.26782867312431335, "learning_rate": 0.00017641509433962265, "loss": 1.8774, "step": 510 }, { "epoch": 0.6115848279917672, "grad_norm": 0.3381577134132385, "learning_rate": 0.00017594339622641511, "loss": 1.8537, "step": 520 }, { "epoch": 0.6233460746839165, "grad_norm": 0.3665984272956848, "learning_rate": 0.00017547169811320756, "loss": 1.7465, "step": 530 }, { "epoch": 0.6351073213760658, "grad_norm": 0.46630290150642395, "learning_rate": 0.000175, "loss": 1.7545, "step": 540 }, { "epoch": 0.6468685680682152, "grad_norm": 0.7455469369888306, "learning_rate": 0.00017452830188679247, "loss": 1.6776, "step": 550 }, { "epoch": 0.6586298147603646, "grad_norm": 0.27579784393310547, "learning_rate": 0.0001740566037735849, "loss": 1.8787, "step": 560 }, { "epoch": 0.6703910614525139, "grad_norm": 0.3148879110813141, "learning_rate": 0.00017358490566037738, "loss": 1.8989, "step": 570 }, { "epoch": 0.6821523081446633, "grad_norm": 0.3903751075267792, "learning_rate": 0.00017311320754716982, "loss": 1.7702, "step": 580 }, { "epoch": 0.6939135548368127, "grad_norm": 0.4537353217601776, "learning_rate": 0.00017264150943396226, "loss": 1.758, "step": 590 }, { "epoch": 0.7056748015289621, "grad_norm": 0.7169495224952698, "learning_rate": 0.0001721698113207547, "loss": 1.5495, "step": 600 }, { "epoch": 0.7174360482211114, "grad_norm": 0.2942892909049988, "learning_rate": 0.00017169811320754717, "loss": 1.7981, "step": 610 }, { "epoch": 0.7291972949132608, "grad_norm": 0.39550286531448364, "learning_rate": 0.00017122641509433964, "loss": 1.7919, "step": 620 }, { "epoch": 0.7409585416054102, "grad_norm": 0.3948846459388733, "learning_rate": 0.00017075471698113208, "loss": 1.7793, "step": 630 }, { "epoch": 0.7527197882975596, "grad_norm": 0.4996489882469177, "learning_rate": 0.00017028301886792453, "loss": 1.6956, "step": 640 }, { "epoch": 0.7644810349897089, "grad_norm": 0.7511508464813232, "learning_rate": 0.000169811320754717, "loss": 1.6399, "step": 650 }, { "epoch": 0.7762422816818583, "grad_norm": 0.3312196433544159, "learning_rate": 0.00016933962264150944, "loss": 1.7876, "step": 660 }, { "epoch": 0.7880035283740077, "grad_norm": 0.40000253915786743, "learning_rate": 0.00016886792452830188, "loss": 1.8278, "step": 670 }, { "epoch": 0.7997647750661571, "grad_norm": 0.4055274724960327, "learning_rate": 0.00016839622641509435, "loss": 1.6638, "step": 680 }, { "epoch": 0.8115260217583063, "grad_norm": 0.48130497336387634, "learning_rate": 0.00016792452830188682, "loss": 1.7075, "step": 690 }, { "epoch": 0.8232872684504557, "grad_norm": 1.0582154989242554, "learning_rate": 0.00016745283018867926, "loss": 1.5322, "step": 700 }, { "epoch": 0.8350485151426051, "grad_norm": 0.31292250752449036, "learning_rate": 0.0001669811320754717, "loss": 1.8269, "step": 710 }, { "epoch": 0.8468097618347545, "grad_norm": 0.3395911157131195, "learning_rate": 0.00016650943396226414, "loss": 1.7563, "step": 720 }, { "epoch": 0.8585710085269038, "grad_norm": 0.4362980127334595, "learning_rate": 0.0001660377358490566, "loss": 1.6545, "step": 730 }, { "epoch": 0.8703322552190532, "grad_norm": 0.5648341774940491, "learning_rate": 0.00016556603773584908, "loss": 1.5902, "step": 740 }, { "epoch": 0.8820935019112026, "grad_norm": 0.8163714408874512, "learning_rate": 0.00016509433962264152, "loss": 1.6889, "step": 750 }, { "epoch": 0.8938547486033519, "grad_norm": 0.3610120117664337, "learning_rate": 0.00016462264150943396, "loss": 1.8061, "step": 760 }, { "epoch": 0.9056159952955013, "grad_norm": 0.40071502327919006, "learning_rate": 0.00016415094339622643, "loss": 1.7412, "step": 770 }, { "epoch": 0.9173772419876507, "grad_norm": 0.4744262993335724, "learning_rate": 0.00016367924528301887, "loss": 1.7553, "step": 780 }, { "epoch": 0.9291384886798001, "grad_norm": 0.5387608408927917, "learning_rate": 0.00016320754716981132, "loss": 1.6238, "step": 790 }, { "epoch": 0.9408997353719494, "grad_norm": 0.9463699460029602, "learning_rate": 0.00016273584905660379, "loss": 1.4965, "step": 800 }, { "epoch": 0.9526609820640988, "grad_norm": 0.39017385244369507, "learning_rate": 0.00016226415094339625, "loss": 1.7494, "step": 810 }, { "epoch": 0.9644222287562482, "grad_norm": 0.39241862297058105, "learning_rate": 0.0001617924528301887, "loss": 1.745, "step": 820 }, { "epoch": 0.9761834754483976, "grad_norm": 0.4188750982284546, "learning_rate": 0.00016132075471698114, "loss": 1.7072, "step": 830 }, { "epoch": 0.9879447221405468, "grad_norm": 0.54363614320755, "learning_rate": 0.00016084905660377358, "loss": 1.6571, "step": 840 }, { "epoch": 0.9997059688326962, "grad_norm": 0.8282334804534912, "learning_rate": 0.00016037735849056605, "loss": 1.5609, "step": 850 }, { "epoch": 1.0114672155248456, "grad_norm": 0.4861317574977875, "learning_rate": 0.0001599056603773585, "loss": 1.7427, "step": 860 }, { "epoch": 1.023228462216995, "grad_norm": 0.47034987807273865, "learning_rate": 0.00015943396226415096, "loss": 1.4911, "step": 870 }, { "epoch": 1.0349897089091444, "grad_norm": 0.8243444561958313, "learning_rate": 0.0001589622641509434, "loss": 1.242, "step": 880 }, { "epoch": 1.0467509556012937, "grad_norm": 0.8107286095619202, "learning_rate": 0.00015849056603773587, "loss": 1.1067, "step": 890 }, { "epoch": 1.058512202293443, "grad_norm": 0.9792178869247437, "learning_rate": 0.0001580188679245283, "loss": 0.9078, "step": 900 }, { "epoch": 1.0702734489855925, "grad_norm": 0.4514322280883789, "learning_rate": 0.00015754716981132075, "loss": 1.5328, "step": 910 }, { "epoch": 1.0820346956777418, "grad_norm": 0.5203831791877747, "learning_rate": 0.0001570754716981132, "loss": 1.4851, "step": 920 }, { "epoch": 1.0937959423698913, "grad_norm": 0.7015544176101685, "learning_rate": 0.00015660377358490567, "loss": 1.2215, "step": 930 }, { "epoch": 1.1055571890620406, "grad_norm": 0.7290483117103577, "learning_rate": 0.00015613207547169813, "loss": 1.0831, "step": 940 }, { "epoch": 1.1173184357541899, "grad_norm": 1.0971975326538086, "learning_rate": 0.00015566037735849058, "loss": 0.8673, "step": 950 }, { "epoch": 1.1290796824463394, "grad_norm": 0.5123384594917297, "learning_rate": 0.00015518867924528302, "loss": 1.5301, "step": 960 }, { "epoch": 1.1408409291384887, "grad_norm": 0.6260602474212646, "learning_rate": 0.0001547169811320755, "loss": 1.4956, "step": 970 }, { "epoch": 1.152602175830638, "grad_norm": 0.6829984188079834, "learning_rate": 0.00015424528301886793, "loss": 1.3128, "step": 980 }, { "epoch": 1.1643634225227875, "grad_norm": 0.7748053073883057, "learning_rate": 0.00015377358490566037, "loss": 1.1702, "step": 990 }, { "epoch": 1.1761246692149367, "grad_norm": 1.001291036605835, "learning_rate": 0.00015330188679245284, "loss": 0.8918, "step": 1000 }, { "epoch": 1.1878859159070863, "grad_norm": 0.517902135848999, "learning_rate": 0.0001528301886792453, "loss": 1.4376, "step": 1010 }, { "epoch": 1.1996471625992355, "grad_norm": 0.6000102758407593, "learning_rate": 0.00015235849056603775, "loss": 1.4512, "step": 1020 }, { "epoch": 1.2114084092913848, "grad_norm": 0.762768566608429, "learning_rate": 0.0001518867924528302, "loss": 1.3248, "step": 1030 }, { "epoch": 1.2231696559835343, "grad_norm": 0.9720354676246643, "learning_rate": 0.00015141509433962263, "loss": 1.0415, "step": 1040 }, { "epoch": 1.2349309026756836, "grad_norm": 0.902864396572113, "learning_rate": 0.0001509433962264151, "loss": 0.8803, "step": 1050 }, { "epoch": 1.246692149367833, "grad_norm": 0.5235794186592102, "learning_rate": 0.00015047169811320757, "loss": 1.4755, "step": 1060 }, { "epoch": 1.2584533960599824, "grad_norm": 0.5898970365524292, "learning_rate": 0.00015000000000000001, "loss": 1.4036, "step": 1070 }, { "epoch": 1.2702146427521317, "grad_norm": 0.9541133642196655, "learning_rate": 0.00014952830188679246, "loss": 1.197, "step": 1080 }, { "epoch": 1.281975889444281, "grad_norm": 0.9920721054077148, "learning_rate": 0.0001490566037735849, "loss": 1.0743, "step": 1090 }, { "epoch": 1.2937371361364305, "grad_norm": 1.3523385524749756, "learning_rate": 0.00014858490566037737, "loss": 0.8989, "step": 1100 }, { "epoch": 1.3054983828285798, "grad_norm": 0.5665034055709839, "learning_rate": 0.0001481132075471698, "loss": 1.4432, "step": 1110 }, { "epoch": 1.3172596295207293, "grad_norm": 0.6107054352760315, "learning_rate": 0.00014764150943396228, "loss": 1.356, "step": 1120 }, { "epoch": 1.3290208762128786, "grad_norm": 0.7833155393600464, "learning_rate": 0.00014716981132075472, "loss": 1.2708, "step": 1130 }, { "epoch": 1.3407821229050279, "grad_norm": 0.9629625082015991, "learning_rate": 0.0001466981132075472, "loss": 0.9813, "step": 1140 }, { "epoch": 1.3525433695971774, "grad_norm": 1.0938910245895386, "learning_rate": 0.00014622641509433963, "loss": 0.8002, "step": 1150 }, { "epoch": 1.3643046162893266, "grad_norm": 0.5895722508430481, "learning_rate": 0.00014575471698113207, "loss": 1.469, "step": 1160 }, { "epoch": 1.3760658629814762, "grad_norm": 0.6274592280387878, "learning_rate": 0.00014528301886792451, "loss": 1.2954, "step": 1170 }, { "epoch": 1.3878271096736254, "grad_norm": 0.748171329498291, "learning_rate": 0.00014481132075471698, "loss": 1.1003, "step": 1180 }, { "epoch": 1.3995883563657747, "grad_norm": 1.0281026363372803, "learning_rate": 0.00014433962264150945, "loss": 1.0006, "step": 1190 }, { "epoch": 1.411349603057924, "grad_norm": 1.0714832544326782, "learning_rate": 0.0001438679245283019, "loss": 0.8439, "step": 1200 }, { "epoch": 1.4231108497500735, "grad_norm": 0.6404314637184143, "learning_rate": 0.00014339622641509434, "loss": 1.4463, "step": 1210 }, { "epoch": 1.4348720964422228, "grad_norm": 0.6800934672355652, "learning_rate": 0.0001429245283018868, "loss": 1.2484, "step": 1220 }, { "epoch": 1.4466333431343723, "grad_norm": 0.8627371191978455, "learning_rate": 0.00014245283018867925, "loss": 1.1863, "step": 1230 }, { "epoch": 1.4583945898265216, "grad_norm": 1.0996595621109009, "learning_rate": 0.0001419811320754717, "loss": 0.9519, "step": 1240 }, { "epoch": 1.4701558365186709, "grad_norm": 1.1529676914215088, "learning_rate": 0.00014150943396226416, "loss": 0.8407, "step": 1250 }, { "epoch": 1.4819170832108204, "grad_norm": 0.611027717590332, "learning_rate": 0.00014103773584905663, "loss": 1.3786, "step": 1260 }, { "epoch": 1.4936783299029697, "grad_norm": 0.7889626026153564, "learning_rate": 0.00014056603773584907, "loss": 1.2603, "step": 1270 }, { "epoch": 1.5054395765951192, "grad_norm": 0.8136641979217529, "learning_rate": 0.0001400943396226415, "loss": 1.0535, "step": 1280 }, { "epoch": 1.5172008232872685, "grad_norm": 1.0993061065673828, "learning_rate": 0.00013962264150943395, "loss": 0.9192, "step": 1290 }, { "epoch": 1.5289620699794177, "grad_norm": 1.2532891035079956, "learning_rate": 0.00013915094339622642, "loss": 0.8772, "step": 1300 }, { "epoch": 1.540723316671567, "grad_norm": 0.6979594826698303, "learning_rate": 0.0001386792452830189, "loss": 1.3564, "step": 1310 }, { "epoch": 1.5524845633637165, "grad_norm": 0.6345073580741882, "learning_rate": 0.00013820754716981133, "loss": 1.235, "step": 1320 }, { "epoch": 1.564245810055866, "grad_norm": 1.0022692680358887, "learning_rate": 0.00013773584905660377, "loss": 1.0533, "step": 1330 }, { "epoch": 1.5760070567480153, "grad_norm": 1.0487345457077026, "learning_rate": 0.00013726415094339624, "loss": 0.9879, "step": 1340 }, { "epoch": 1.5877683034401646, "grad_norm": 1.3332520723342896, "learning_rate": 0.00013679245283018868, "loss": 0.8568, "step": 1350 }, { "epoch": 1.599529550132314, "grad_norm": 0.6801854968070984, "learning_rate": 0.00013632075471698113, "loss": 1.3149, "step": 1360 }, { "epoch": 1.6112907968244634, "grad_norm": 0.7094405293464661, "learning_rate": 0.0001358490566037736, "loss": 1.2843, "step": 1370 }, { "epoch": 1.623052043516613, "grad_norm": 0.7568113803863525, "learning_rate": 0.00013537735849056606, "loss": 1.1169, "step": 1380 }, { "epoch": 1.6348132902087622, "grad_norm": 1.1939420700073242, "learning_rate": 0.0001349056603773585, "loss": 0.8441, "step": 1390 }, { "epoch": 1.6465745369009115, "grad_norm": 1.4502966403961182, "learning_rate": 0.00013443396226415095, "loss": 0.801, "step": 1400 }, { "epoch": 1.6583357835930608, "grad_norm": 0.6542213559150696, "learning_rate": 0.0001339622641509434, "loss": 1.4153, "step": 1410 }, { "epoch": 1.6700970302852103, "grad_norm": 0.7604705691337585, "learning_rate": 0.00013349056603773586, "loss": 1.25, "step": 1420 }, { "epoch": 1.6818582769773596, "grad_norm": 0.8076483607292175, "learning_rate": 0.0001330188679245283, "loss": 1.1673, "step": 1430 }, { "epoch": 1.693619523669509, "grad_norm": 0.9957693815231323, "learning_rate": 0.00013254716981132077, "loss": 0.9437, "step": 1440 }, { "epoch": 1.7053807703616584, "grad_norm": 1.2569739818572998, "learning_rate": 0.0001320754716981132, "loss": 0.7948, "step": 1450 }, { "epoch": 1.7171420170538076, "grad_norm": 0.8244763016700745, "learning_rate": 0.00013160377358490568, "loss": 1.3546, "step": 1460 }, { "epoch": 1.728903263745957, "grad_norm": 0.8371909856796265, "learning_rate": 0.00013113207547169812, "loss": 1.2031, "step": 1470 }, { "epoch": 1.7406645104381064, "grad_norm": 1.203465223312378, "learning_rate": 0.00013066037735849056, "loss": 1.0183, "step": 1480 }, { "epoch": 1.752425757130256, "grad_norm": 1.2281197309494019, "learning_rate": 0.000130188679245283, "loss": 0.9311, "step": 1490 }, { "epoch": 1.7641870038224052, "grad_norm": 1.3259741067886353, "learning_rate": 0.00012971698113207548, "loss": 0.872, "step": 1500 }, { "epoch": 1.7759482505145545, "grad_norm": 0.7928496599197388, "learning_rate": 0.00012924528301886794, "loss": 1.3436, "step": 1510 }, { "epoch": 1.7877094972067038, "grad_norm": 0.8125369548797607, "learning_rate": 0.00012877358490566039, "loss": 1.0189, "step": 1520 }, { "epoch": 1.7994707438988533, "grad_norm": 1.0345025062561035, "learning_rate": 0.00012830188679245283, "loss": 1.0006, "step": 1530 }, { "epoch": 1.8112319905910026, "grad_norm": 0.8656748533248901, "learning_rate": 0.0001278301886792453, "loss": 0.8927, "step": 1540 }, { "epoch": 1.822993237283152, "grad_norm": 1.12923264503479, "learning_rate": 0.00012735849056603774, "loss": 0.7717, "step": 1550 }, { "epoch": 1.8347544839753014, "grad_norm": 0.898140549659729, "learning_rate": 0.00012688679245283018, "loss": 1.2768, "step": 1560 }, { "epoch": 1.8465157306674507, "grad_norm": 0.748009204864502, "learning_rate": 0.00012641509433962265, "loss": 1.1579, "step": 1570 }, { "epoch": 1.8582769773596, "grad_norm": 1.3326165676116943, "learning_rate": 0.00012594339622641512, "loss": 0.973, "step": 1580 }, { "epoch": 1.8700382240517495, "grad_norm": 0.9244058132171631, "learning_rate": 0.00012547169811320756, "loss": 0.929, "step": 1590 }, { "epoch": 1.881799470743899, "grad_norm": 1.3473211526870728, "learning_rate": 0.000125, "loss": 0.7777, "step": 1600 }, { "epoch": 1.8935607174360483, "grad_norm": 0.8593601584434509, "learning_rate": 0.00012452830188679244, "loss": 1.327, "step": 1610 }, { "epoch": 1.9053219641281975, "grad_norm": 0.8441507816314697, "learning_rate": 0.0001240566037735849, "loss": 1.1585, "step": 1620 }, { "epoch": 1.9170832108203468, "grad_norm": 0.908469557762146, "learning_rate": 0.00012358490566037738, "loss": 0.9916, "step": 1630 }, { "epoch": 1.9288444575124963, "grad_norm": 1.1003684997558594, "learning_rate": 0.00012311320754716982, "loss": 0.7808, "step": 1640 }, { "epoch": 1.9406057042046458, "grad_norm": 1.2000435590744019, "learning_rate": 0.00012264150943396227, "loss": 0.847, "step": 1650 }, { "epoch": 1.9523669508967951, "grad_norm": 0.7908065915107727, "learning_rate": 0.0001221698113207547, "loss": 1.2671, "step": 1660 }, { "epoch": 1.9641281975889444, "grad_norm": 0.8809382319450378, "learning_rate": 0.00012169811320754718, "loss": 1.1279, "step": 1670 }, { "epoch": 1.9758894442810937, "grad_norm": 1.1937824487686157, "learning_rate": 0.00012122641509433963, "loss": 0.8854, "step": 1680 }, { "epoch": 1.9876506909732432, "grad_norm": 1.0509068965911865, "learning_rate": 0.00012075471698113207, "loss": 0.7986, "step": 1690 }, { "epoch": 1.9994119376653925, "grad_norm": 1.2940934896469116, "learning_rate": 0.00012028301886792453, "loss": 0.8177, "step": 1700 }, { "epoch": 2.011173184357542, "grad_norm": 1.00706148147583, "learning_rate": 0.000119811320754717, "loss": 0.9009, "step": 1710 }, { "epoch": 2.0229344310496913, "grad_norm": 0.7884982824325562, "learning_rate": 0.00011933962264150944, "loss": 0.5944, "step": 1720 }, { "epoch": 2.0346956777418406, "grad_norm": 0.8072102069854736, "learning_rate": 0.00011886792452830188, "loss": 0.4579, "step": 1730 }, { "epoch": 2.04645692443399, "grad_norm": 1.0868206024169922, "learning_rate": 0.00011839622641509434, "loss": 0.3714, "step": 1740 }, { "epoch": 2.0582181711261396, "grad_norm": 1.2487127780914307, "learning_rate": 0.00011792452830188681, "loss": 0.2964, "step": 1750 }, { "epoch": 2.069979417818289, "grad_norm": 0.8450261354446411, "learning_rate": 0.00011745283018867925, "loss": 0.8774, "step": 1760 }, { "epoch": 2.081740664510438, "grad_norm": 0.8103846311569214, "learning_rate": 0.0001169811320754717, "loss": 0.6733, "step": 1770 }, { "epoch": 2.0935019112025874, "grad_norm": 0.7691318392753601, "learning_rate": 0.00011650943396226415, "loss": 0.5036, "step": 1780 }, { "epoch": 2.1052631578947367, "grad_norm": 0.8625450134277344, "learning_rate": 0.00011603773584905662, "loss": 0.33, "step": 1790 }, { "epoch": 2.117024404586886, "grad_norm": 1.036942481994629, "learning_rate": 0.00011556603773584907, "loss": 0.3417, "step": 1800 }, { "epoch": 2.1287856512790357, "grad_norm": 0.7786136269569397, "learning_rate": 0.00011509433962264151, "loss": 0.7165, "step": 1810 }, { "epoch": 2.140546897971185, "grad_norm": 0.7121214866638184, "learning_rate": 0.00011462264150943395, "loss": 0.5809, "step": 1820 }, { "epoch": 2.1523081446633343, "grad_norm": 0.8065999150276184, "learning_rate": 0.00011415094339622642, "loss": 0.4644, "step": 1830 }, { "epoch": 2.1640693913554836, "grad_norm": 1.0368797779083252, "learning_rate": 0.00011367924528301888, "loss": 0.3668, "step": 1840 }, { "epoch": 2.175830638047633, "grad_norm": 0.7784335613250732, "learning_rate": 0.00011320754716981132, "loss": 0.2946, "step": 1850 }, { "epoch": 2.1875918847397826, "grad_norm": 1.0500203371047974, "learning_rate": 0.00011273584905660378, "loss": 0.8046, "step": 1860 }, { "epoch": 2.199353131431932, "grad_norm": 0.8676533699035645, "learning_rate": 0.00011226415094339624, "loss": 0.6096, "step": 1870 }, { "epoch": 2.211114378124081, "grad_norm": 0.8250516057014465, "learning_rate": 0.00011179245283018869, "loss": 0.4479, "step": 1880 }, { "epoch": 2.2228756248162305, "grad_norm": 1.0254476070404053, "learning_rate": 0.00011132075471698113, "loss": 0.3672, "step": 1890 }, { "epoch": 2.2346368715083798, "grad_norm": 0.9448462128639221, "learning_rate": 0.00011084905660377358, "loss": 0.2956, "step": 1900 }, { "epoch": 2.2463981182005295, "grad_norm": 0.9094712138175964, "learning_rate": 0.00011037735849056605, "loss": 0.8546, "step": 1910 }, { "epoch": 2.2581593648926788, "grad_norm": 0.7168066501617432, "learning_rate": 0.0001099056603773585, "loss": 0.6113, "step": 1920 }, { "epoch": 2.269920611584828, "grad_norm": 0.9491825699806213, "learning_rate": 0.00010943396226415095, "loss": 0.4896, "step": 1930 }, { "epoch": 2.2816818582769773, "grad_norm": 0.9781097173690796, "learning_rate": 0.00010896226415094339, "loss": 0.3119, "step": 1940 }, { "epoch": 2.2934431049691266, "grad_norm": 1.6303428411483765, "learning_rate": 0.00010849056603773586, "loss": 0.3224, "step": 1950 }, { "epoch": 2.305204351661276, "grad_norm": 0.9339887499809265, "learning_rate": 0.00010801886792452832, "loss": 0.7914, "step": 1960 }, { "epoch": 2.3169655983534256, "grad_norm": 0.8690701127052307, "learning_rate": 0.00010754716981132076, "loss": 0.5327, "step": 1970 }, { "epoch": 2.328726845045575, "grad_norm": 0.8797821998596191, "learning_rate": 0.0001070754716981132, "loss": 0.3846, "step": 1980 }, { "epoch": 2.340488091737724, "grad_norm": 1.1787986755371094, "learning_rate": 0.00010660377358490567, "loss": 0.3308, "step": 1990 }, { "epoch": 2.3522493384298735, "grad_norm": 1.682032585144043, "learning_rate": 0.00010613207547169812, "loss": 0.3173, "step": 2000 } ], "logging_steps": 10, "max_steps": 4250, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.9950155919985664e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }