{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 5900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0211864406779661, "grad_norm": 0.3481608033180237, "learning_rate": 2.11864406779661e-05, "loss": 1.0844, "step": 25 }, { "epoch": 0.0423728813559322, "grad_norm": 0.4737679064273834, "learning_rate": 4.23728813559322e-05, "loss": 0.968, "step": 50 }, { "epoch": 0.0635593220338983, "grad_norm": 0.24385006725788116, "learning_rate": 6.355932203389829e-05, "loss": 0.7097, "step": 75 }, { "epoch": 0.0847457627118644, "grad_norm": 0.32397332787513733, "learning_rate": 8.47457627118644e-05, "loss": 0.6995, "step": 100 }, { "epoch": 0.1059322033898305, "grad_norm": 0.28883692622184753, "learning_rate": 0.0001059322033898305, "loss": 0.6362, "step": 125 }, { "epoch": 0.1271186440677966, "grad_norm": 0.2939394414424896, "learning_rate": 0.00012711864406779658, "loss": 0.6646, "step": 150 }, { "epoch": 0.1483050847457627, "grad_norm": 0.21506226062774658, "learning_rate": 0.0001483050847457627, "loss": 0.67, "step": 175 }, { "epoch": 0.1694915254237288, "grad_norm": 0.24949543178081512, "learning_rate": 0.00014999402230951556, "loss": 0.6328, "step": 200 }, { "epoch": 0.1906779661016949, "grad_norm": 0.1662125438451767, "learning_rate": 0.00014997396600188487, "loss": 0.6365, "step": 225 }, { "epoch": 0.211864406779661, "grad_norm": 0.18493060767650604, "learning_rate": 0.00014993978965384007, "loss": 0.6661, "step": 250 }, { "epoch": 0.2330508474576271, "grad_norm": 0.1724727302789688, "learning_rate": 0.00014989149970190098, "loss": 0.6453, "step": 275 }, { "epoch": 0.2542372881355932, "grad_norm": 0.2265160232782364, "learning_rate": 0.00014982910524063883, "loss": 0.5802, "step": 300 }, { "epoch": 0.2754237288135593, "grad_norm": 0.14229296147823334, "learning_rate": 0.00014975261802096344, "loss": 0.6559, "step": 325 }, { "epoch": 0.2966101694915254, "grad_norm": 0.19628387689590454, "learning_rate": 0.0001496620524479102, "loss": 0.6181, "step": 350 }, { "epoch": 0.3177966101694915, "grad_norm": 0.19808532297611237, "learning_rate": 0.00014955742557792704, "loss": 0.6363, "step": 375 }, { "epoch": 0.3389830508474576, "grad_norm": 0.2479950338602066, "learning_rate": 0.00014943875711566237, "loss": 0.601, "step": 400 }, { "epoch": 0.3601694915254237, "grad_norm": 0.18844148516654968, "learning_rate": 0.0001493060694102537, "loss": 0.6406, "step": 425 }, { "epoch": 0.3813559322033898, "grad_norm": 0.21692270040512085, "learning_rate": 0.00014915938745111896, "loss": 0.674, "step": 450 }, { "epoch": 0.4025423728813559, "grad_norm": 0.18362776935100555, "learning_rate": 0.0001489987388632498, "loss": 0.6326, "step": 475 }, { "epoch": 0.423728813559322, "grad_norm": 0.1860133409500122, "learning_rate": 0.0001488241539020092, "loss": 0.6539, "step": 500 }, { "epoch": 0.4449152542372881, "grad_norm": 0.16509853303432465, "learning_rate": 0.00014863566544743326, "loss": 0.6649, "step": 525 }, { "epoch": 0.4661016949152542, "grad_norm": 0.17422816157341003, "learning_rate": 0.0001484333089980388, "loss": 0.6365, "step": 550 }, { "epoch": 0.4872881355932203, "grad_norm": 0.16881784796714783, "learning_rate": 0.000148217122664138, "loss": 0.6014, "step": 575 }, { "epoch": 0.5084745762711864, "grad_norm": 0.24150097370147705, "learning_rate": 0.00014798714716066072, "loss": 0.6225, "step": 600 }, { "epoch": 0.5296610169491526, "grad_norm": 0.183096244931221, "learning_rate": 0.00014774342579948675, "loss": 0.628, "step": 625 }, { "epoch": 0.5508474576271186, "grad_norm": 0.2092808037996292, "learning_rate": 0.00014748600448128877, "loss": 0.6196, "step": 650 }, { "epoch": 0.5720338983050848, "grad_norm": 0.1650499850511551, "learning_rate": 0.00014721493168688764, "loss": 0.6617, "step": 675 }, { "epoch": 0.5932203389830508, "grad_norm": 0.2336203157901764, "learning_rate": 0.00014693025846812194, "loss": 0.5995, "step": 700 }, { "epoch": 0.614406779661017, "grad_norm": 0.1635483205318451, "learning_rate": 0.0001466320384382333, "loss": 0.6225, "step": 725 }, { "epoch": 0.635593220338983, "grad_norm": 0.24543817341327667, "learning_rate": 0.00014632032776176924, "loss": 0.6208, "step": 750 }, { "epoch": 0.6567796610169492, "grad_norm": 0.156394824385643, "learning_rate": 0.0001459951851440055, "loss": 0.6234, "step": 775 }, { "epoch": 0.6779661016949152, "grad_norm": 0.21179354190826416, "learning_rate": 0.00014565667181988995, "loss": 0.6101, "step": 800 }, { "epoch": 0.6991525423728814, "grad_norm": 0.1816495805978775, "learning_rate": 0.00014530485154251021, "loss": 0.6212, "step": 825 }, { "epoch": 0.7203389830508474, "grad_norm": 0.18615126609802246, "learning_rate": 0.0001449397905710866, "loss": 0.6019, "step": 850 }, { "epoch": 0.7415254237288136, "grad_norm": 0.13972151279449463, "learning_rate": 0.00014456155765849355, "loss": 0.6804, "step": 875 }, { "epoch": 0.7627118644067796, "grad_norm": 0.19166871905326843, "learning_rate": 0.00014417022403831117, "loss": 0.6265, "step": 900 }, { "epoch": 0.7838983050847458, "grad_norm": 0.1559162586927414, "learning_rate": 0.00014376586341140955, "loss": 0.5893, "step": 925 }, { "epoch": 0.8050847457627118, "grad_norm": 0.17139187455177307, "learning_rate": 0.0001433485519320687, "loss": 0.6192, "step": 950 }, { "epoch": 0.826271186440678, "grad_norm": 0.19588051736354828, "learning_rate": 0.0001429183681936359, "loss": 0.6545, "step": 975 }, { "epoch": 0.847457627118644, "grad_norm": 0.17011399567127228, "learning_rate": 0.0001424753932137243, "loss": 0.6274, "step": 1000 }, { "epoch": 0.8686440677966102, "grad_norm": 0.13620993494987488, "learning_rate": 0.00014201971041895455, "loss": 0.6185, "step": 1025 }, { "epoch": 0.8898305084745762, "grad_norm": 0.19832104444503784, "learning_rate": 0.00014155140562924286, "loss": 0.5788, "step": 1050 }, { "epoch": 0.9110169491525424, "grad_norm": 0.15580902993679047, "learning_rate": 0.00014107056704163823, "loss": 0.6756, "step": 1075 }, { "epoch": 0.9322033898305084, "grad_norm": 0.2072034329175949, "learning_rate": 0.00014057728521371218, "loss": 0.6347, "step": 1100 }, { "epoch": 0.9533898305084746, "grad_norm": 0.13679395616054535, "learning_rate": 0.00014007165304650386, "loss": 0.6419, "step": 1125 }, { "epoch": 0.9745762711864406, "grad_norm": 0.20975461602210999, "learning_rate": 0.00013955376576702357, "loss": 0.5929, "step": 1150 }, { "epoch": 0.9957627118644068, "grad_norm": 0.18808843195438385, "learning_rate": 0.00013902372091031856, "loss": 0.6327, "step": 1175 }, { "epoch": 1.0169491525423728, "grad_norm": 0.12700864672660828, "learning_rate": 0.00013848161830110395, "loss": 0.6166, "step": 1200 }, { "epoch": 1.0381355932203389, "grad_norm": 0.17502394318580627, "learning_rate": 0.0001379275600349625, "loss": 0.542, "step": 1225 }, { "epoch": 1.0593220338983051, "grad_norm": 0.17643524706363678, "learning_rate": 0.0001373616504591167, "loss": 0.6077, "step": 1250 }, { "epoch": 1.0805084745762712, "grad_norm": 0.21401630342006683, "learning_rate": 0.00013678399615277674, "loss": 0.5856, "step": 1275 }, { "epoch": 1.1016949152542372, "grad_norm": 0.1577410101890564, "learning_rate": 0.00013619470590706814, "loss": 0.5882, "step": 1300 }, { "epoch": 1.1228813559322033, "grad_norm": 0.2284272313117981, "learning_rate": 0.00013559389070454304, "loss": 0.5842, "step": 1325 }, { "epoch": 1.1440677966101696, "grad_norm": 0.2204512506723404, "learning_rate": 0.00013498166369827833, "loss": 0.5911, "step": 1350 }, { "epoch": 1.1652542372881356, "grad_norm": 0.21209457516670227, "learning_rate": 0.00013435814019056535, "loss": 0.602, "step": 1375 }, { "epoch": 1.1864406779661016, "grad_norm": 0.16774219274520874, "learning_rate": 0.00013372343761119466, "loss": 0.5746, "step": 1400 }, { "epoch": 1.207627118644068, "grad_norm": 0.23171478509902954, "learning_rate": 0.00013307767549534033, "loss": 0.6046, "step": 1425 }, { "epoch": 1.228813559322034, "grad_norm": 0.17449446022510529, "learning_rate": 0.00013242097546104734, "loss": 0.5969, "step": 1450 }, { "epoch": 1.25, "grad_norm": 0.21454857289791107, "learning_rate": 0.00013175346118632713, "loss": 0.5927, "step": 1475 }, { "epoch": 1.271186440677966, "grad_norm": 0.17533324658870697, "learning_rate": 0.00013107525838586495, "loss": 0.5806, "step": 1500 }, { "epoch": 1.292372881355932, "grad_norm": 0.2303514927625656, "learning_rate": 0.00013038649478734363, "loss": 0.6269, "step": 1525 }, { "epoch": 1.3135593220338984, "grad_norm": 0.2209363877773285, "learning_rate": 0.00012968730010738837, "loss": 0.5699, "step": 1550 }, { "epoch": 1.3347457627118644, "grad_norm": 0.2777274250984192, "learning_rate": 0.0001289778060271368, "loss": 0.5583, "step": 1575 }, { "epoch": 1.3559322033898304, "grad_norm": 0.19397616386413574, "learning_rate": 0.00012825814616743928, "loss": 0.5785, "step": 1600 }, { "epoch": 1.3771186440677967, "grad_norm": 0.24071291089057922, "learning_rate": 0.0001275284560636935, "loss": 0.5793, "step": 1625 }, { "epoch": 1.3983050847457628, "grad_norm": 0.16364933550357819, "learning_rate": 0.000126788873140319, "loss": 0.5591, "step": 1650 }, { "epoch": 1.4194915254237288, "grad_norm": 0.2222534716129303, "learning_rate": 0.00012603953668487546, "loss": 0.5649, "step": 1675 }, { "epoch": 1.4406779661016949, "grad_norm": 0.18990883231163025, "learning_rate": 0.00012528058782183048, "loss": 0.5732, "step": 1700 }, { "epoch": 1.461864406779661, "grad_norm": 0.23255659639835358, "learning_rate": 0.00012451216948598117, "loss": 0.55, "step": 1725 }, { "epoch": 1.4830508474576272, "grad_norm": 0.19624237716197968, "learning_rate": 0.00012373442639553487, "loss": 0.5793, "step": 1750 }, { "epoch": 1.5042372881355932, "grad_norm": 0.24238888919353485, "learning_rate": 0.00012294750502485398, "loss": 0.5823, "step": 1775 }, { "epoch": 1.5254237288135593, "grad_norm": 0.2002212405204773, "learning_rate": 0.00012215155357687017, "loss": 0.571, "step": 1800 }, { "epoch": 1.5466101694915255, "grad_norm": 0.21096192300319672, "learning_rate": 0.0001213467219551728, "loss": 0.588, "step": 1825 }, { "epoch": 1.5677966101694916, "grad_norm": 0.20380620658397675, "learning_rate": 0.00012053316173577726, "loss": 0.5869, "step": 1850 }, { "epoch": 1.5889830508474576, "grad_norm": 0.25443893671035767, "learning_rate": 0.00011971102613857823, "loss": 0.5659, "step": 1875 }, { "epoch": 1.6101694915254239, "grad_norm": 0.22190341353416443, "learning_rate": 0.0001188804699984935, "loss": 0.5835, "step": 1900 }, { "epoch": 1.6313559322033897, "grad_norm": 0.24329130351543427, "learning_rate": 0.00011804164973630335, "loss": 0.5639, "step": 1925 }, { "epoch": 1.652542372881356, "grad_norm": 0.2349741905927658, "learning_rate": 0.00011719472332919148, "loss": 0.5726, "step": 1950 }, { "epoch": 1.673728813559322, "grad_norm": 0.20963279902935028, "learning_rate": 0.00011633985028099284, "loss": 0.5612, "step": 1975 }, { "epoch": 1.694915254237288, "grad_norm": 0.27600300312042236, "learning_rate": 0.00011547719159215378, "loss": 0.5943, "step": 2000 }, { "epoch": 1.7161016949152543, "grad_norm": 0.21020427346229553, "learning_rate": 0.00011460690972941037, "loss": 0.5802, "step": 2025 }, { "epoch": 1.7372881355932204, "grad_norm": 0.20670145750045776, "learning_rate": 0.00011372916859519075, "loss": 0.5766, "step": 2050 }, { "epoch": 1.7584745762711864, "grad_norm": 0.2435368299484253, "learning_rate": 0.0001128441334967469, "loss": 0.6128, "step": 2075 }, { "epoch": 1.7796610169491527, "grad_norm": 0.21429473161697388, "learning_rate": 0.00011195197111502184, "loss": 0.5844, "step": 2100 }, { "epoch": 1.8008474576271185, "grad_norm": 0.21995683014392853, "learning_rate": 0.0001110528494732583, "loss": 0.5532, "step": 2125 }, { "epoch": 1.8220338983050848, "grad_norm": 0.19685518741607666, "learning_rate": 0.00011014693790535437, "loss": 0.5569, "step": 2150 }, { "epoch": 1.8432203389830508, "grad_norm": 0.20260564982891083, "learning_rate": 0.00010923440702397243, "loss": 0.5792, "step": 2175 }, { "epoch": 1.8644067796610169, "grad_norm": 0.19778716564178467, "learning_rate": 0.00010831542868840729, "loss": 0.5978, "step": 2200 }, { "epoch": 1.8855932203389831, "grad_norm": 0.22923052310943604, "learning_rate": 0.00010739017597221942, "loss": 0.5572, "step": 2225 }, { "epoch": 1.9067796610169492, "grad_norm": 0.21343784034252167, "learning_rate": 0.00010645882313063953, "loss": 0.5643, "step": 2250 }, { "epoch": 1.9279661016949152, "grad_norm": 0.2053421288728714, "learning_rate": 0.00010552154556775076, "loss": 0.5806, "step": 2275 }, { "epoch": 1.9491525423728815, "grad_norm": 0.22164656221866608, "learning_rate": 0.00010457851980345423, "loss": 0.6011, "step": 2300 }, { "epoch": 1.9703389830508473, "grad_norm": 0.284758985042572, "learning_rate": 0.00010362992344022468, "loss": 0.5374, "step": 2325 }, { "epoch": 1.9915254237288136, "grad_norm": 0.2642022371292114, "learning_rate": 0.00010267593512966216, "loss": 0.5892, "step": 2350 }, { "epoch": 2.01271186440678, "grad_norm": 0.19165368378162384, "learning_rate": 0.00010171673453884601, "loss": 0.5175, "step": 2375 }, { "epoch": 2.0338983050847457, "grad_norm": 0.2643072307109833, "learning_rate": 0.00010075250231649775, "loss": 0.5204, "step": 2400 }, { "epoch": 2.055084745762712, "grad_norm": 0.2326943427324295, "learning_rate": 9.978342005895911e-05, "loss": 0.4847, "step": 2425 }, { "epoch": 2.0762711864406778, "grad_norm": 0.2779608368873596, "learning_rate": 9.880967027599139e-05, "loss": 0.52, "step": 2450 }, { "epoch": 2.097457627118644, "grad_norm": 0.22342316806316376, "learning_rate": 9.783143635640304e-05, "loss": 0.5124, "step": 2475 }, { "epoch": 2.1186440677966103, "grad_norm": 0.26453691720962524, "learning_rate": 9.684890253351153e-05, "loss": 0.4954, "step": 2500 }, { "epoch": 2.139830508474576, "grad_norm": 0.26683682203292847, "learning_rate": 9.586225385044615e-05, "loss": 0.519, "step": 2525 }, { "epoch": 2.1610169491525424, "grad_norm": 0.27656257152557373, "learning_rate": 9.487167612529851e-05, "loss": 0.5409, "step": 2550 }, { "epoch": 2.1822033898305087, "grad_norm": 0.27244171500205994, "learning_rate": 9.387735591612677e-05, "loss": 0.4976, "step": 2575 }, { "epoch": 2.2033898305084745, "grad_norm": 0.29296210408210754, "learning_rate": 9.28794804858208e-05, "loss": 0.4964, "step": 2600 }, { "epoch": 2.2245762711864407, "grad_norm": 0.28374531865119934, "learning_rate": 9.187823776683444e-05, "loss": 0.4936, "step": 2625 }, { "epoch": 2.2457627118644066, "grad_norm": 0.25039607286453247, "learning_rate": 9.087381632579165e-05, "loss": 0.4548, "step": 2650 }, { "epoch": 2.266949152542373, "grad_norm": 0.2839612662792206, "learning_rate": 8.986640532797341e-05, "loss": 0.521, "step": 2675 }, { "epoch": 2.288135593220339, "grad_norm": 0.26817333698272705, "learning_rate": 8.885619450169154e-05, "loss": 0.4813, "step": 2700 }, { "epoch": 2.309322033898305, "grad_norm": 0.2513103187084198, "learning_rate": 8.78433741025568e-05, "loss": 0.4964, "step": 2725 }, { "epoch": 2.330508474576271, "grad_norm": 0.2661533057689667, "learning_rate": 8.682813487764759e-05, "loss": 0.5267, "step": 2750 }, { "epoch": 2.3516949152542375, "grad_norm": 0.31996023654937744, "learning_rate": 8.581066802958593e-05, "loss": 0.4877, "step": 2775 }, { "epoch": 2.3728813559322033, "grad_norm": 0.3120092749595642, "learning_rate": 8.479116518052793e-05, "loss": 0.5025, "step": 2800 }, { "epoch": 2.3940677966101696, "grad_norm": 0.25984951853752136, "learning_rate": 8.376981833607496e-05, "loss": 0.5184, "step": 2825 }, { "epoch": 2.415254237288136, "grad_norm": 0.28586438298225403, "learning_rate": 8.274681984911279e-05, "loss": 0.5128, "step": 2850 }, { "epoch": 2.4364406779661016, "grad_norm": 0.23898103833198547, "learning_rate": 8.172236238358537e-05, "loss": 0.4968, "step": 2875 }, { "epoch": 2.457627118644068, "grad_norm": 0.2596363127231598, "learning_rate": 8.069663887820978e-05, "loss": 0.5338, "step": 2900 }, { "epoch": 2.4788135593220337, "grad_norm": 0.2569097578525543, "learning_rate": 7.966984251013964e-05, "loss": 0.5186, "step": 2925 }, { "epoch": 2.5, "grad_norm": 0.23606939613819122, "learning_rate": 7.864216665858362e-05, "loss": 0.5087, "step": 2950 }, { "epoch": 2.5211864406779663, "grad_norm": 0.24160584807395935, "learning_rate": 7.761380486838573e-05, "loss": 0.5164, "step": 2975 }, { "epoch": 2.542372881355932, "grad_norm": 0.3212146461009979, "learning_rate": 7.658495081357461e-05, "loss": 0.5173, "step": 3000 }, { "epoch": 2.5635593220338984, "grad_norm": 0.22904744744300842, "learning_rate": 7.555579826088837e-05, "loss": 0.5345, "step": 3025 }, { "epoch": 2.584745762711864, "grad_norm": 0.31355756521224976, "learning_rate": 7.452654103328196e-05, "loss": 0.4683, "step": 3050 }, { "epoch": 2.6059322033898304, "grad_norm": 0.31533321738243103, "learning_rate": 7.349737297342404e-05, "loss": 0.5259, "step": 3075 }, { "epoch": 2.6271186440677967, "grad_norm": 0.2956444025039673, "learning_rate": 7.24684879071901e-05, "loss": 0.497, "step": 3100 }, { "epoch": 2.648305084745763, "grad_norm": 0.2766103446483612, "learning_rate": 7.14400796071587e-05, "loss": 0.5166, "step": 3125 }, { "epoch": 2.669491525423729, "grad_norm": 0.3354440927505493, "learning_rate": 7.041234175611775e-05, "loss": 0.5233, "step": 3150 }, { "epoch": 2.690677966101695, "grad_norm": 0.2812809348106384, "learning_rate": 6.938546791058785e-05, "loss": 0.5155, "step": 3175 }, { "epoch": 2.711864406779661, "grad_norm": 0.39217862486839294, "learning_rate": 6.835965146436916e-05, "loss": 0.4926, "step": 3200 }, { "epoch": 2.733050847457627, "grad_norm": 0.3037302494049072, "learning_rate": 6.73350856121191e-05, "loss": 0.5098, "step": 3225 }, { "epoch": 2.7542372881355934, "grad_norm": 0.2784561514854431, "learning_rate": 6.63119633129675e-05, "loss": 0.5371, "step": 3250 }, { "epoch": 2.7754237288135593, "grad_norm": 0.2815192639827728, "learning_rate": 6.529047725417618e-05, "loss": 0.4839, "step": 3275 }, { "epoch": 2.7966101694915255, "grad_norm": 0.26870056986808777, "learning_rate": 6.427081981484946e-05, "loss": 0.4981, "step": 3300 }, { "epoch": 2.8177966101694913, "grad_norm": 0.28585174679756165, "learning_rate": 6.325318302970318e-05, "loss": 0.4841, "step": 3325 }, { "epoch": 2.8389830508474576, "grad_norm": 0.2712132930755615, "learning_rate": 6.22377585528981e-05, "loss": 0.4833, "step": 3350 }, { "epoch": 2.860169491525424, "grad_norm": 0.28583309054374695, "learning_rate": 6.12247376219452e-05, "loss": 0.5043, "step": 3375 }, { "epoch": 2.8813559322033897, "grad_norm": 0.29179123044013977, "learning_rate": 6.021431102168954e-05, "loss": 0.5343, "step": 3400 }, { "epoch": 2.902542372881356, "grad_norm": 0.29638585448265076, "learning_rate": 5.92066690483792e-05, "loss": 0.501, "step": 3425 }, { "epoch": 2.923728813559322, "grad_norm": 0.2945152521133423, "learning_rate": 5.820200147382617e-05, "loss": 0.5149, "step": 3450 }, { "epoch": 2.944915254237288, "grad_norm": 0.24451757967472076, "learning_rate": 5.720049750966638e-05, "loss": 0.501, "step": 3475 }, { "epoch": 2.9661016949152543, "grad_norm": 0.33959802985191345, "learning_rate": 5.6202345771724785e-05, "loss": 0.5202, "step": 3500 }, { "epoch": 2.9872881355932206, "grad_norm": 0.40264537930488586, "learning_rate": 5.520773424449299e-05, "loss": 0.5004, "step": 3525 }, { "epoch": 3.0084745762711864, "grad_norm": 0.23446495831012726, "learning_rate": 5.421685024572547e-05, "loss": 0.4788, "step": 3550 }, { "epoch": 3.0296610169491527, "grad_norm": 0.29302000999450684, "learning_rate": 5.322988039116176e-05, "loss": 0.4302, "step": 3575 }, { "epoch": 3.0508474576271185, "grad_norm": 0.28345516324043274, "learning_rate": 5.224701055938047e-05, "loss": 0.4195, "step": 3600 }, { "epoch": 3.0720338983050848, "grad_norm": 0.3563604950904846, "learning_rate": 5.126842585679235e-05, "loss": 0.4302, "step": 3625 }, { "epoch": 3.093220338983051, "grad_norm": 0.2989650070667267, "learning_rate": 5.0294310582778717e-05, "loss": 0.4082, "step": 3650 }, { "epoch": 3.114406779661017, "grad_norm": 0.3035448491573334, "learning_rate": 4.9324848194981906e-05, "loss": 0.4294, "step": 3675 }, { "epoch": 3.135593220338983, "grad_norm": 0.3060661256313324, "learning_rate": 4.83602212747541e-05, "loss": 0.4243, "step": 3700 }, { "epoch": 3.156779661016949, "grad_norm": 0.3512302041053772, "learning_rate": 4.7400611492771505e-05, "loss": 0.4558, "step": 3725 }, { "epoch": 3.1779661016949152, "grad_norm": 0.3085233271121979, "learning_rate": 4.644619957481972e-05, "loss": 0.4405, "step": 3750 }, { "epoch": 3.1991525423728815, "grad_norm": 0.37406814098358154, "learning_rate": 4.549716526775711e-05, "loss": 0.4394, "step": 3775 }, { "epoch": 3.2203389830508473, "grad_norm": 0.28444594144821167, "learning_rate": 4.455368730566282e-05, "loss": 0.4356, "step": 3800 }, { "epoch": 3.2415254237288136, "grad_norm": 0.3252512812614441, "learning_rate": 4.361594337617518e-05, "loss": 0.4422, "step": 3825 }, { "epoch": 3.26271186440678, "grad_norm": 0.34911468625068665, "learning_rate": 4.2684110087027364e-05, "loss": 0.42, "step": 3850 }, { "epoch": 3.2838983050847457, "grad_norm": 0.31359365582466125, "learning_rate": 4.175836293278635e-05, "loss": 0.4229, "step": 3875 }, { "epoch": 3.305084745762712, "grad_norm": 0.332359254360199, "learning_rate": 4.083887626180175e-05, "loss": 0.4428, "step": 3900 }, { "epoch": 3.326271186440678, "grad_norm": 0.3841429054737091, "learning_rate": 3.992582324337009e-05, "loss": 0.4643, "step": 3925 }, { "epoch": 3.347457627118644, "grad_norm": 0.3356688618659973, "learning_rate": 3.901937583512158e-05, "loss": 0.4169, "step": 3950 }, { "epoch": 3.3686440677966103, "grad_norm": 0.39436978101730347, "learning_rate": 3.811970475063486e-05, "loss": 0.4564, "step": 3975 }, { "epoch": 3.389830508474576, "grad_norm": 0.29478755593299866, "learning_rate": 3.7226979427285943e-05, "loss": 0.3858, "step": 4000 }, { "epoch": 3.4110169491525424, "grad_norm": 0.4711458086967468, "learning_rate": 3.6341367994337784e-05, "loss": 0.4547, "step": 4025 }, { "epoch": 3.4322033898305087, "grad_norm": 0.38489460945129395, "learning_rate": 3.546303724127603e-05, "loss": 0.4235, "step": 4050 }, { "epoch": 3.4533898305084745, "grad_norm": 0.41311007738113403, "learning_rate": 3.459215258639708e-05, "loss": 0.4589, "step": 4075 }, { "epoch": 3.4745762711864407, "grad_norm": 0.3139210641384125, "learning_rate": 3.372887804565442e-05, "loss": 0.4163, "step": 4100 }, { "epoch": 3.4957627118644066, "grad_norm": 0.43436604738235474, "learning_rate": 3.2873376201769154e-05, "loss": 0.4465, "step": 4125 }, { "epoch": 3.516949152542373, "grad_norm": 0.37427470088005066, "learning_rate": 3.202580817361037e-05, "loss": 0.4106, "step": 4150 }, { "epoch": 3.538135593220339, "grad_norm": 0.3729758560657501, "learning_rate": 3.1186333585851056e-05, "loss": 0.47, "step": 4175 }, { "epoch": 3.559322033898305, "grad_norm": 0.3862791955471039, "learning_rate": 3.0355110538905815e-05, "loss": 0.3975, "step": 4200 }, { "epoch": 3.580508474576271, "grad_norm": 0.35095420479774475, "learning_rate": 2.953229557915525e-05, "loss": 0.4422, "step": 4225 }, { "epoch": 3.601694915254237, "grad_norm": 0.34636810421943665, "learning_rate": 2.871804366946315e-05, "loss": 0.428, "step": 4250 }, { "epoch": 3.6228813559322033, "grad_norm": 0.3737597167491913, "learning_rate": 2.791250815999207e-05, "loss": 0.4544, "step": 4275 }, { "epoch": 3.6440677966101696, "grad_norm": 0.3554207384586334, "learning_rate": 2.7115840759322436e-05, "loss": 0.4167, "step": 4300 }, { "epoch": 3.665254237288136, "grad_norm": 0.369305819272995, "learning_rate": 2.6359522461221096e-05, "loss": 0.4456, "step": 4325 }, { "epoch": 3.6864406779661016, "grad_norm": 0.40377670526504517, "learning_rate": 2.5580670208969884e-05, "loss": 0.4465, "step": 4350 }, { "epoch": 3.707627118644068, "grad_norm": 0.4016803801059723, "learning_rate": 2.4811125226576454e-05, "loss": 0.4395, "step": 4375 }, { "epoch": 3.7288135593220337, "grad_norm": 0.3124406337738037, "learning_rate": 2.405103244443235e-05, "loss": 0.4154, "step": 4400 }, { "epoch": 3.75, "grad_norm": 0.44163626432418823, "learning_rate": 2.330053501277194e-05, "loss": 0.4607, "step": 4425 }, { "epoch": 3.7711864406779663, "grad_norm": 0.33251988887786865, "learning_rate": 2.2559774274712466e-05, "loss": 0.4114, "step": 4450 }, { "epoch": 3.792372881355932, "grad_norm": 0.4052109718322754, "learning_rate": 2.1828889739634496e-05, "loss": 0.4123, "step": 4475 }, { "epoch": 3.8135593220338984, "grad_norm": 0.3507472276687622, "learning_rate": 2.110801905690787e-05, "loss": 0.4199, "step": 4500 }, { "epoch": 3.834745762711864, "grad_norm": 0.4040756821632385, "learning_rate": 2.03972979899678e-05, "loss": 0.4526, "step": 4525 }, { "epoch": 3.8559322033898304, "grad_norm": 0.30861154198646545, "learning_rate": 1.9696860390746082e-05, "loss": 0.4152, "step": 4550 }, { "epoch": 3.8771186440677967, "grad_norm": 0.4708113670349121, "learning_rate": 1.900683817446263e-05, "loss": 0.4477, "step": 4575 }, { "epoch": 3.898305084745763, "grad_norm": 0.3677612543106079, "learning_rate": 1.832736129478131e-05, "loss": 0.4279, "step": 4600 }, { "epoch": 3.919491525423729, "grad_norm": 0.3834724724292755, "learning_rate": 1.7658557719335652e-05, "loss": 0.4235, "step": 4625 }, { "epoch": 3.940677966101695, "grad_norm": 0.3320079445838928, "learning_rate": 1.7000553405628164e-05, "loss": 0.4103, "step": 4650 }, { "epoch": 3.961864406779661, "grad_norm": 0.4474587142467499, "learning_rate": 1.6353472277308618e-05, "loss": 0.4422, "step": 4675 }, { "epoch": 3.983050847457627, "grad_norm": 0.3154617249965668, "learning_rate": 1.571743620083504e-05, "loss": 0.4343, "step": 4700 }, { "epoch": 4.004237288135593, "grad_norm": 0.32371950149536133, "learning_rate": 1.5092564962522388e-05, "loss": 0.452, "step": 4725 }, { "epoch": 4.02542372881356, "grad_norm": 0.3051236867904663, "learning_rate": 1.447897624598286e-05, "loss": 0.4164, "step": 4750 }, { "epoch": 4.046610169491525, "grad_norm": 0.3187614679336548, "learning_rate": 1.3876785609962218e-05, "loss": 0.3446, "step": 4775 }, { "epoch": 4.067796610169491, "grad_norm": 0.48843175172805786, "learning_rate": 1.3286106466576264e-05, "loss": 0.4296, "step": 4800 }, { "epoch": 4.088983050847458, "grad_norm": 0.3983837068080902, "learning_rate": 1.2707050059951763e-05, "loss": 0.344, "step": 4825 }, { "epoch": 4.110169491525424, "grad_norm": 0.300611674785614, "learning_rate": 1.2139725445275481e-05, "loss": 0.4169, "step": 4850 }, { "epoch": 4.13135593220339, "grad_norm": 0.4292912781238556, "learning_rate": 1.158423946825549e-05, "loss": 0.3689, "step": 4875 }, { "epoch": 4.1525423728813555, "grad_norm": 0.3964712917804718, "learning_rate": 1.1040696744998754e-05, "loss": 0.4404, "step": 4900 }, { "epoch": 4.173728813559322, "grad_norm": 0.6776478886604309, "learning_rate": 1.0509199642308436e-05, "loss": 0.3979, "step": 4925 }, { "epoch": 4.194915254237288, "grad_norm": 0.396267831325531, "learning_rate": 9.98984825840486e-06, "loss": 0.4182, "step": 4950 }, { "epoch": 4.216101694915254, "grad_norm": 0.28718650341033936, "learning_rate": 9.482740404073851e-06, "loss": 0.3736, "step": 4975 }, { "epoch": 4.237288135593221, "grad_norm": 0.3323756158351898, "learning_rate": 8.987971584245729e-06, "loss": 0.4113, "step": 5000 }, { "epoch": 4.258474576271187, "grad_norm": 0.33957767486572266, "learning_rate": 8.50563498000856e-06, "loss": 0.3925, "step": 5025 }, { "epoch": 4.279661016949152, "grad_norm": 0.4178178906440735, "learning_rate": 8.035821431059244e-06, "loss": 0.3973, "step": 5050 }, { "epoch": 4.3008474576271185, "grad_norm": 0.3192192614078522, "learning_rate": 7.578619418595358e-06, "loss": 0.3605, "step": 5075 }, { "epoch": 4.322033898305085, "grad_norm": 0.4187626540660858, "learning_rate": 7.1341150486512374e-06, "loss": 0.4199, "step": 5100 }, { "epoch": 4.343220338983051, "grad_norm": 0.3863602578639984, "learning_rate": 6.702392035881507e-06, "loss": 0.3568, "step": 5125 }, { "epoch": 4.364406779661017, "grad_norm": 0.4073178172111511, "learning_rate": 6.28353168779481e-06, "loss": 0.4327, "step": 5150 }, { "epoch": 4.385593220338983, "grad_norm": 0.31056177616119385, "learning_rate": 5.8776128894409305e-06, "loss": 0.372, "step": 5175 }, { "epoch": 4.406779661016949, "grad_norm": 0.3671024739742279, "learning_rate": 5.484712088554253e-06, "loss": 0.4078, "step": 5200 }, { "epoch": 4.427966101694915, "grad_norm": 0.2966119349002838, "learning_rate": 5.1049032811561196e-06, "loss": 0.3529, "step": 5225 }, { "epoch": 4.4491525423728815, "grad_norm": 0.3545999526977539, "learning_rate": 4.7382579976189244e-06, "loss": 0.3864, "step": 5250 }, { "epoch": 4.470338983050848, "grad_norm": 0.3902367651462555, "learning_rate": 4.384845289194699e-06, "loss": 0.3434, "step": 5275 }, { "epoch": 4.491525423728813, "grad_norm": 0.4561343193054199, "learning_rate": 4.044731715010463e-06, "loss": 0.371, "step": 5300 }, { "epoch": 4.512711864406779, "grad_norm": 0.29569247364997864, "learning_rate": 3.717981329532979e-06, "loss": 0.3957, "step": 5325 }, { "epoch": 4.533898305084746, "grad_norm": 0.3961041271686554, "learning_rate": 3.4046556705051744e-06, "loss": 0.3938, "step": 5350 }, { "epoch": 4.555084745762712, "grad_norm": 0.35009700059890747, "learning_rate": 3.104813747356674e-06, "loss": 0.3829, "step": 5375 }, { "epoch": 4.576271186440678, "grad_norm": 0.404491662979126, "learning_rate": 2.8185120300902865e-06, "loss": 0.3916, "step": 5400 }, { "epoch": 4.597457627118644, "grad_norm": 0.3277469277381897, "learning_rate": 2.5458044386469727e-06, "loss": 0.3681, "step": 5425 }, { "epoch": 4.61864406779661, "grad_norm": 0.4005700349807739, "learning_rate": 2.2867423327508654e-06, "loss": 0.4249, "step": 5450 }, { "epoch": 4.639830508474576, "grad_norm": 0.30087345838546753, "learning_rate": 2.0413745022366285e-06, "loss": 0.3493, "step": 5475 }, { "epoch": 4.661016949152542, "grad_norm": 0.37881365418434143, "learning_rate": 1.8097471578607164e-06, "loss": 0.4209, "step": 5500 }, { "epoch": 4.682203389830509, "grad_norm": 0.40850409865379333, "learning_rate": 1.5919039225983782e-06, "loss": 0.378, "step": 5525 }, { "epoch": 4.703389830508475, "grad_norm": 0.43627145886421204, "learning_rate": 1.3878858234280532e-06, "loss": 0.4131, "step": 5550 }, { "epoch": 4.72457627118644, "grad_norm": 0.3629254400730133, "learning_rate": 1.1977312836046194e-06, "loss": 0.3555, "step": 5575 }, { "epoch": 4.745762711864407, "grad_norm": 0.4231952428817749, "learning_rate": 1.0214761154230643e-06, "loss": 0.4459, "step": 5600 }, { "epoch": 4.766949152542373, "grad_norm": 0.32848870754241943, "learning_rate": 8.591535134738814e-07, "loss": 0.3753, "step": 5625 }, { "epoch": 4.788135593220339, "grad_norm": 0.49593624472618103, "learning_rate": 7.107940483913943e-07, "loss": 0.4109, "step": 5650 }, { "epoch": 4.809322033898305, "grad_norm": 0.3230677545070648, "learning_rate": 5.764256610963636e-07, "loss": 0.3534, "step": 5675 }, { "epoch": 4.830508474576272, "grad_norm": 0.4409547746181488, "learning_rate": 4.560736575337787e-07, "loss": 0.4389, "step": 5700 }, { "epoch": 4.851694915254237, "grad_norm": 0.3816058039665222, "learning_rate": 3.4976070390692054e-07, "loss": 0.369, "step": 5725 }, { "epoch": 4.872881355932203, "grad_norm": 0.3902296721935272, "learning_rate": 2.5750682240857634e-07, "loss": 0.4134, "step": 5750 }, { "epoch": 4.8940677966101696, "grad_norm": 0.3721590042114258, "learning_rate": 1.7932938745022218e-07, "loss": 0.3509, "step": 5775 }, { "epoch": 4.915254237288136, "grad_norm": 0.3812599778175354, "learning_rate": 1.1524312238984923e-07, "loss": 0.4109, "step": 5800 }, { "epoch": 4.936440677966102, "grad_norm": 0.39883601665496826, "learning_rate": 6.526009675905663e-08, "loss": 0.3768, "step": 5825 }, { "epoch": 4.9576271186440675, "grad_norm": 0.38190439343452454, "learning_rate": 2.9389723990011495e-08, "loss": 0.4262, "step": 5850 }, { "epoch": 4.978813559322034, "grad_norm": 0.2927350699901581, "learning_rate": 7.638759642525361e-09, "loss": 0.3631, "step": 5875 }, { "epoch": 5.0, "grad_norm": 0.7745693922042847, "learning_rate": 1.1300131838587468e-11, "loss": 0.3756, "step": 5900 } ], "logging_steps": 25, "max_steps": 5900, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0720875463474176e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }