{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 5900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0211864406779661,
      "grad_norm": 0.3481608033180237,
      "learning_rate": 2.11864406779661e-05,
      "loss": 1.0844,
      "step": 25
    },
    {
      "epoch": 0.0423728813559322,
      "grad_norm": 0.4737679064273834,
      "learning_rate": 4.23728813559322e-05,
      "loss": 0.968,
      "step": 50
    },
    {
      "epoch": 0.0635593220338983,
      "grad_norm": 0.24385006725788116,
      "learning_rate": 6.355932203389829e-05,
      "loss": 0.7097,
      "step": 75
    },
    {
      "epoch": 0.0847457627118644,
      "grad_norm": 0.32397332787513733,
      "learning_rate": 8.47457627118644e-05,
      "loss": 0.6995,
      "step": 100
    },
    {
      "epoch": 0.1059322033898305,
      "grad_norm": 0.28883692622184753,
      "learning_rate": 0.0001059322033898305,
      "loss": 0.6362,
      "step": 125
    },
    {
      "epoch": 0.1271186440677966,
      "grad_norm": 0.2939394414424896,
      "learning_rate": 0.00012711864406779658,
      "loss": 0.6646,
      "step": 150
    },
    {
      "epoch": 0.1483050847457627,
      "grad_norm": 0.21506226062774658,
      "learning_rate": 0.0001483050847457627,
      "loss": 0.67,
      "step": 175
    },
    {
      "epoch": 0.1694915254237288,
      "grad_norm": 0.24949543178081512,
      "learning_rate": 0.00014999402230951556,
      "loss": 0.6328,
      "step": 200
    },
    {
      "epoch": 0.1906779661016949,
      "grad_norm": 0.1662125438451767,
      "learning_rate": 0.00014997396600188487,
      "loss": 0.6365,
      "step": 225
    },
    {
      "epoch": 0.211864406779661,
      "grad_norm": 0.18493060767650604,
      "learning_rate": 0.00014993978965384007,
      "loss": 0.6661,
      "step": 250
    },
    {
      "epoch": 0.2330508474576271,
      "grad_norm": 0.1724727302789688,
      "learning_rate": 0.00014989149970190098,
      "loss": 0.6453,
      "step": 275
    },
    {
      "epoch": 0.2542372881355932,
      "grad_norm": 0.2265160232782364,
      "learning_rate": 0.00014982910524063883,
      "loss": 0.5802,
      "step": 300
    },
    {
      "epoch": 0.2754237288135593,
      "grad_norm": 0.14229296147823334,
      "learning_rate": 0.00014975261802096344,
      "loss": 0.6559,
      "step": 325
    },
    {
      "epoch": 0.2966101694915254,
      "grad_norm": 0.19628387689590454,
      "learning_rate": 0.0001496620524479102,
      "loss": 0.6181,
      "step": 350
    },
    {
      "epoch": 0.3177966101694915,
      "grad_norm": 0.19808532297611237,
      "learning_rate": 0.00014955742557792704,
      "loss": 0.6363,
      "step": 375
    },
    {
      "epoch": 0.3389830508474576,
      "grad_norm": 0.2479950338602066,
      "learning_rate": 0.00014943875711566237,
      "loss": 0.601,
      "step": 400
    },
    {
      "epoch": 0.3601694915254237,
      "grad_norm": 0.18844148516654968,
      "learning_rate": 0.0001493060694102537,
      "loss": 0.6406,
      "step": 425
    },
    {
      "epoch": 0.3813559322033898,
      "grad_norm": 0.21692270040512085,
      "learning_rate": 0.00014915938745111896,
      "loss": 0.674,
      "step": 450
    },
    {
      "epoch": 0.4025423728813559,
      "grad_norm": 0.18362776935100555,
      "learning_rate": 0.0001489987388632498,
      "loss": 0.6326,
      "step": 475
    },
    {
      "epoch": 0.423728813559322,
      "grad_norm": 0.1860133409500122,
      "learning_rate": 0.0001488241539020092,
      "loss": 0.6539,
      "step": 500
    },
    {
      "epoch": 0.4449152542372881,
      "grad_norm": 0.16509853303432465,
      "learning_rate": 0.00014863566544743326,
      "loss": 0.6649,
      "step": 525
    },
    {
      "epoch": 0.4661016949152542,
      "grad_norm": 0.17422816157341003,
      "learning_rate": 0.0001484333089980388,
      "loss": 0.6365,
      "step": 550
    },
    {
      "epoch": 0.4872881355932203,
      "grad_norm": 0.16881784796714783,
      "learning_rate": 0.000148217122664138,
      "loss": 0.6014,
      "step": 575
    },
    {
      "epoch": 0.5084745762711864,
      "grad_norm": 0.24150097370147705,
      "learning_rate": 0.00014798714716066072,
      "loss": 0.6225,
      "step": 600
    },
    {
      "epoch": 0.5296610169491526,
      "grad_norm": 0.183096244931221,
      "learning_rate": 0.00014774342579948675,
      "loss": 0.628,
      "step": 625
    },
    {
      "epoch": 0.5508474576271186,
      "grad_norm": 0.2092808037996292,
      "learning_rate": 0.00014748600448128877,
      "loss": 0.6196,
      "step": 650
    },
    {
      "epoch": 0.5720338983050848,
      "grad_norm": 0.1650499850511551,
      "learning_rate": 0.00014721493168688764,
      "loss": 0.6617,
      "step": 675
    },
    {
      "epoch": 0.5932203389830508,
      "grad_norm": 0.2336203157901764,
      "learning_rate": 0.00014693025846812194,
      "loss": 0.5995,
      "step": 700
    },
    {
      "epoch": 0.614406779661017,
      "grad_norm": 0.1635483205318451,
      "learning_rate": 0.0001466320384382333,
      "loss": 0.6225,
      "step": 725
    },
    {
      "epoch": 0.635593220338983,
      "grad_norm": 0.24543817341327667,
      "learning_rate": 0.00014632032776176924,
      "loss": 0.6208,
      "step": 750
    },
    {
      "epoch": 0.6567796610169492,
      "grad_norm": 0.156394824385643,
      "learning_rate": 0.0001459951851440055,
      "loss": 0.6234,
      "step": 775
    },
    {
      "epoch": 0.6779661016949152,
      "grad_norm": 0.21179354190826416,
      "learning_rate": 0.00014565667181988995,
      "loss": 0.6101,
      "step": 800
    },
    {
      "epoch": 0.6991525423728814,
      "grad_norm": 0.1816495805978775,
      "learning_rate": 0.00014530485154251021,
      "loss": 0.6212,
      "step": 825
    },
    {
      "epoch": 0.7203389830508474,
      "grad_norm": 0.18615126609802246,
      "learning_rate": 0.0001449397905710866,
      "loss": 0.6019,
      "step": 850
    },
    {
      "epoch": 0.7415254237288136,
      "grad_norm": 0.13972151279449463,
      "learning_rate": 0.00014456155765849355,
      "loss": 0.6804,
      "step": 875
    },
    {
      "epoch": 0.7627118644067796,
      "grad_norm": 0.19166871905326843,
      "learning_rate": 0.00014417022403831117,
      "loss": 0.6265,
      "step": 900
    },
    {
      "epoch": 0.7838983050847458,
      "grad_norm": 0.1559162586927414,
      "learning_rate": 0.00014376586341140955,
      "loss": 0.5893,
      "step": 925
    },
    {
      "epoch": 0.8050847457627118,
      "grad_norm": 0.17139187455177307,
      "learning_rate": 0.0001433485519320687,
      "loss": 0.6192,
      "step": 950
    },
    {
      "epoch": 0.826271186440678,
      "grad_norm": 0.19588051736354828,
      "learning_rate": 0.0001429183681936359,
      "loss": 0.6545,
      "step": 975
    },
    {
      "epoch": 0.847457627118644,
      "grad_norm": 0.17011399567127228,
      "learning_rate": 0.0001424753932137243,
      "loss": 0.6274,
      "step": 1000
    },
    {
      "epoch": 0.8686440677966102,
      "grad_norm": 0.13620993494987488,
      "learning_rate": 0.00014201971041895455,
      "loss": 0.6185,
      "step": 1025
    },
    {
      "epoch": 0.8898305084745762,
      "grad_norm": 0.19832104444503784,
      "learning_rate": 0.00014155140562924286,
      "loss": 0.5788,
      "step": 1050
    },
    {
      "epoch": 0.9110169491525424,
      "grad_norm": 0.15580902993679047,
      "learning_rate": 0.00014107056704163823,
      "loss": 0.6756,
      "step": 1075
    },
    {
      "epoch": 0.9322033898305084,
      "grad_norm": 0.2072034329175949,
      "learning_rate": 0.00014057728521371218,
      "loss": 0.6347,
      "step": 1100
    },
    {
      "epoch": 0.9533898305084746,
      "grad_norm": 0.13679395616054535,
      "learning_rate": 0.00014007165304650386,
      "loss": 0.6419,
      "step": 1125
    },
    {
      "epoch": 0.9745762711864406,
      "grad_norm": 0.20975461602210999,
      "learning_rate": 0.00013955376576702357,
      "loss": 0.5929,
      "step": 1150
    },
    {
      "epoch": 0.9957627118644068,
      "grad_norm": 0.18808843195438385,
      "learning_rate": 0.00013902372091031856,
      "loss": 0.6327,
      "step": 1175
    },
    {
      "epoch": 1.0169491525423728,
      "grad_norm": 0.12700864672660828,
      "learning_rate": 0.00013848161830110395,
      "loss": 0.6166,
      "step": 1200
    },
    {
      "epoch": 1.0381355932203389,
      "grad_norm": 0.17502394318580627,
      "learning_rate": 0.0001379275600349625,
      "loss": 0.542,
      "step": 1225
    },
    {
      "epoch": 1.0593220338983051,
      "grad_norm": 0.17643524706363678,
      "learning_rate": 0.0001373616504591167,
      "loss": 0.6077,
      "step": 1250
    },
    {
      "epoch": 1.0805084745762712,
      "grad_norm": 0.21401630342006683,
      "learning_rate": 0.00013678399615277674,
      "loss": 0.5856,
      "step": 1275
    },
    {
      "epoch": 1.1016949152542372,
      "grad_norm": 0.1577410101890564,
      "learning_rate": 0.00013619470590706814,
      "loss": 0.5882,
      "step": 1300
    },
    {
      "epoch": 1.1228813559322033,
      "grad_norm": 0.2284272313117981,
      "learning_rate": 0.00013559389070454304,
      "loss": 0.5842,
      "step": 1325
    },
    {
      "epoch": 1.1440677966101696,
      "grad_norm": 0.2204512506723404,
      "learning_rate": 0.00013498166369827833,
      "loss": 0.5911,
      "step": 1350
    },
    {
      "epoch": 1.1652542372881356,
      "grad_norm": 0.21209457516670227,
      "learning_rate": 0.00013435814019056535,
      "loss": 0.602,
      "step": 1375
    },
    {
      "epoch": 1.1864406779661016,
      "grad_norm": 0.16774219274520874,
      "learning_rate": 0.00013372343761119466,
      "loss": 0.5746,
      "step": 1400
    },
    {
      "epoch": 1.207627118644068,
      "grad_norm": 0.23171478509902954,
      "learning_rate": 0.00013307767549534033,
      "loss": 0.6046,
      "step": 1425
    },
    {
      "epoch": 1.228813559322034,
      "grad_norm": 0.17449446022510529,
      "learning_rate": 0.00013242097546104734,
      "loss": 0.5969,
      "step": 1450
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.21454857289791107,
      "learning_rate": 0.00013175346118632713,
      "loss": 0.5927,
      "step": 1475
    },
    {
      "epoch": 1.271186440677966,
      "grad_norm": 0.17533324658870697,
      "learning_rate": 0.00013107525838586495,
      "loss": 0.5806,
      "step": 1500
    },
    {
      "epoch": 1.292372881355932,
      "grad_norm": 0.2303514927625656,
      "learning_rate": 0.00013038649478734363,
      "loss": 0.6269,
      "step": 1525
    },
    {
      "epoch": 1.3135593220338984,
      "grad_norm": 0.2209363877773285,
      "learning_rate": 0.00012968730010738837,
      "loss": 0.5699,
      "step": 1550
    },
    {
      "epoch": 1.3347457627118644,
      "grad_norm": 0.2777274250984192,
      "learning_rate": 0.0001289778060271368,
      "loss": 0.5583,
      "step": 1575
    },
    {
      "epoch": 1.3559322033898304,
      "grad_norm": 0.19397616386413574,
      "learning_rate": 0.00012825814616743928,
      "loss": 0.5785,
      "step": 1600
    },
    {
      "epoch": 1.3771186440677967,
      "grad_norm": 0.24071291089057922,
      "learning_rate": 0.0001275284560636935,
      "loss": 0.5793,
      "step": 1625
    },
    {
      "epoch": 1.3983050847457628,
      "grad_norm": 0.16364933550357819,
      "learning_rate": 0.000126788873140319,
      "loss": 0.5591,
      "step": 1650
    },
    {
      "epoch": 1.4194915254237288,
      "grad_norm": 0.2222534716129303,
      "learning_rate": 0.00012603953668487546,
      "loss": 0.5649,
      "step": 1675
    },
    {
      "epoch": 1.4406779661016949,
      "grad_norm": 0.18990883231163025,
      "learning_rate": 0.00012528058782183048,
      "loss": 0.5732,
      "step": 1700
    },
    {
      "epoch": 1.461864406779661,
      "grad_norm": 0.23255659639835358,
      "learning_rate": 0.00012451216948598117,
      "loss": 0.55,
      "step": 1725
    },
    {
      "epoch": 1.4830508474576272,
      "grad_norm": 0.19624237716197968,
      "learning_rate": 0.00012373442639553487,
      "loss": 0.5793,
      "step": 1750
    },
    {
      "epoch": 1.5042372881355932,
      "grad_norm": 0.24238888919353485,
      "learning_rate": 0.00012294750502485398,
      "loss": 0.5823,
      "step": 1775
    },
    {
      "epoch": 1.5254237288135593,
      "grad_norm": 0.2002212405204773,
      "learning_rate": 0.00012215155357687017,
      "loss": 0.571,
      "step": 1800
    },
    {
      "epoch": 1.5466101694915255,
      "grad_norm": 0.21096192300319672,
      "learning_rate": 0.0001213467219551728,
      "loss": 0.588,
      "step": 1825
    },
    {
      "epoch": 1.5677966101694916,
      "grad_norm": 0.20380620658397675,
      "learning_rate": 0.00012053316173577726,
      "loss": 0.5869,
      "step": 1850
    },
    {
      "epoch": 1.5889830508474576,
      "grad_norm": 0.25443893671035767,
      "learning_rate": 0.00011971102613857823,
      "loss": 0.5659,
      "step": 1875
    },
    {
      "epoch": 1.6101694915254239,
      "grad_norm": 0.22190341353416443,
      "learning_rate": 0.0001188804699984935,
      "loss": 0.5835,
      "step": 1900
    },
    {
      "epoch": 1.6313559322033897,
      "grad_norm": 0.24329130351543427,
      "learning_rate": 0.00011804164973630335,
      "loss": 0.5639,
      "step": 1925
    },
    {
      "epoch": 1.652542372881356,
      "grad_norm": 0.2349741905927658,
      "learning_rate": 0.00011719472332919148,
      "loss": 0.5726,
      "step": 1950
    },
    {
      "epoch": 1.673728813559322,
      "grad_norm": 0.20963279902935028,
      "learning_rate": 0.00011633985028099284,
      "loss": 0.5612,
      "step": 1975
    },
    {
      "epoch": 1.694915254237288,
      "grad_norm": 0.27600300312042236,
      "learning_rate": 0.00011547719159215378,
      "loss": 0.5943,
      "step": 2000
    },
    {
      "epoch": 1.7161016949152543,
      "grad_norm": 0.21020427346229553,
      "learning_rate": 0.00011460690972941037,
      "loss": 0.5802,
      "step": 2025
    },
    {
      "epoch": 1.7372881355932204,
      "grad_norm": 0.20670145750045776,
      "learning_rate": 0.00011372916859519075,
      "loss": 0.5766,
      "step": 2050
    },
    {
      "epoch": 1.7584745762711864,
      "grad_norm": 0.2435368299484253,
      "learning_rate": 0.0001128441334967469,
      "loss": 0.6128,
      "step": 2075
    },
    {
      "epoch": 1.7796610169491527,
      "grad_norm": 0.21429473161697388,
      "learning_rate": 0.00011195197111502184,
      "loss": 0.5844,
      "step": 2100
    },
    {
      "epoch": 1.8008474576271185,
      "grad_norm": 0.21995683014392853,
      "learning_rate": 0.0001110528494732583,
      "loss": 0.5532,
      "step": 2125
    },
    {
      "epoch": 1.8220338983050848,
      "grad_norm": 0.19685518741607666,
      "learning_rate": 0.00011014693790535437,
      "loss": 0.5569,
      "step": 2150
    },
    {
      "epoch": 1.8432203389830508,
      "grad_norm": 0.20260564982891083,
      "learning_rate": 0.00010923440702397243,
      "loss": 0.5792,
      "step": 2175
    },
    {
      "epoch": 1.8644067796610169,
      "grad_norm": 0.19778716564178467,
      "learning_rate": 0.00010831542868840729,
      "loss": 0.5978,
      "step": 2200
    },
    {
      "epoch": 1.8855932203389831,
      "grad_norm": 0.22923052310943604,
      "learning_rate": 0.00010739017597221942,
      "loss": 0.5572,
      "step": 2225
    },
    {
      "epoch": 1.9067796610169492,
      "grad_norm": 0.21343784034252167,
      "learning_rate": 0.00010645882313063953,
      "loss": 0.5643,
      "step": 2250
    },
    {
      "epoch": 1.9279661016949152,
      "grad_norm": 0.2053421288728714,
      "learning_rate": 0.00010552154556775076,
      "loss": 0.5806,
      "step": 2275
    },
    {
      "epoch": 1.9491525423728815,
      "grad_norm": 0.22164656221866608,
      "learning_rate": 0.00010457851980345423,
      "loss": 0.6011,
      "step": 2300
    },
    {
      "epoch": 1.9703389830508473,
      "grad_norm": 0.284758985042572,
      "learning_rate": 0.00010362992344022468,
      "loss": 0.5374,
      "step": 2325
    },
    {
      "epoch": 1.9915254237288136,
      "grad_norm": 0.2642022371292114,
      "learning_rate": 0.00010267593512966216,
      "loss": 0.5892,
      "step": 2350
    },
    {
      "epoch": 2.01271186440678,
      "grad_norm": 0.19165368378162384,
      "learning_rate": 0.00010171673453884601,
      "loss": 0.5175,
      "step": 2375
    },
    {
      "epoch": 2.0338983050847457,
      "grad_norm": 0.2643072307109833,
      "learning_rate": 0.00010075250231649775,
      "loss": 0.5204,
      "step": 2400
    },
    {
      "epoch": 2.055084745762712,
      "grad_norm": 0.2326943427324295,
      "learning_rate": 9.978342005895911e-05,
      "loss": 0.4847,
      "step": 2425
    },
    {
      "epoch": 2.0762711864406778,
      "grad_norm": 0.2779608368873596,
      "learning_rate": 9.880967027599139e-05,
      "loss": 0.52,
      "step": 2450
    },
    {
      "epoch": 2.097457627118644,
      "grad_norm": 0.22342316806316376,
      "learning_rate": 9.783143635640304e-05,
      "loss": 0.5124,
      "step": 2475
    },
    {
      "epoch": 2.1186440677966103,
      "grad_norm": 0.26453691720962524,
      "learning_rate": 9.684890253351153e-05,
      "loss": 0.4954,
      "step": 2500
    },
    {
      "epoch": 2.139830508474576,
      "grad_norm": 0.26683682203292847,
      "learning_rate": 9.586225385044615e-05,
      "loss": 0.519,
      "step": 2525
    },
    {
      "epoch": 2.1610169491525424,
      "grad_norm": 0.27656257152557373,
      "learning_rate": 9.487167612529851e-05,
      "loss": 0.5409,
      "step": 2550
    },
    {
      "epoch": 2.1822033898305087,
      "grad_norm": 0.27244171500205994,
      "learning_rate": 9.387735591612677e-05,
      "loss": 0.4976,
      "step": 2575
    },
    {
      "epoch": 2.2033898305084745,
      "grad_norm": 0.29296210408210754,
      "learning_rate": 9.28794804858208e-05,
      "loss": 0.4964,
      "step": 2600
    },
    {
      "epoch": 2.2245762711864407,
      "grad_norm": 0.28374531865119934,
      "learning_rate": 9.187823776683444e-05,
      "loss": 0.4936,
      "step": 2625
    },
    {
      "epoch": 2.2457627118644066,
      "grad_norm": 0.25039607286453247,
      "learning_rate": 9.087381632579165e-05,
      "loss": 0.4548,
      "step": 2650
    },
    {
      "epoch": 2.266949152542373,
      "grad_norm": 0.2839612662792206,
      "learning_rate": 8.986640532797341e-05,
      "loss": 0.521,
      "step": 2675
    },
    {
      "epoch": 2.288135593220339,
      "grad_norm": 0.26817333698272705,
      "learning_rate": 8.885619450169154e-05,
      "loss": 0.4813,
      "step": 2700
    },
    {
      "epoch": 2.309322033898305,
      "grad_norm": 0.2513103187084198,
      "learning_rate": 8.78433741025568e-05,
      "loss": 0.4964,
      "step": 2725
    },
    {
      "epoch": 2.330508474576271,
      "grad_norm": 0.2661533057689667,
      "learning_rate": 8.682813487764759e-05,
      "loss": 0.5267,
      "step": 2750
    },
    {
      "epoch": 2.3516949152542375,
      "grad_norm": 0.31996023654937744,
      "learning_rate": 8.581066802958593e-05,
      "loss": 0.4877,
      "step": 2775
    },
    {
      "epoch": 2.3728813559322033,
      "grad_norm": 0.3120092749595642,
      "learning_rate": 8.479116518052793e-05,
      "loss": 0.5025,
      "step": 2800
    },
    {
      "epoch": 2.3940677966101696,
      "grad_norm": 0.25984951853752136,
      "learning_rate": 8.376981833607496e-05,
      "loss": 0.5184,
      "step": 2825
    },
    {
      "epoch": 2.415254237288136,
      "grad_norm": 0.28586438298225403,
      "learning_rate": 8.274681984911279e-05,
      "loss": 0.5128,
      "step": 2850
    },
    {
      "epoch": 2.4364406779661016,
      "grad_norm": 0.23898103833198547,
      "learning_rate": 8.172236238358537e-05,
      "loss": 0.4968,
      "step": 2875
    },
    {
      "epoch": 2.457627118644068,
      "grad_norm": 0.2596363127231598,
      "learning_rate": 8.069663887820978e-05,
      "loss": 0.5338,
      "step": 2900
    },
    {
      "epoch": 2.4788135593220337,
      "grad_norm": 0.2569097578525543,
      "learning_rate": 7.966984251013964e-05,
      "loss": 0.5186,
      "step": 2925
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.23606939613819122,
      "learning_rate": 7.864216665858362e-05,
      "loss": 0.5087,
      "step": 2950
    },
    {
      "epoch": 2.5211864406779663,
      "grad_norm": 0.24160584807395935,
      "learning_rate": 7.761380486838573e-05,
      "loss": 0.5164,
      "step": 2975
    },
    {
      "epoch": 2.542372881355932,
      "grad_norm": 0.3212146461009979,
      "learning_rate": 7.658495081357461e-05,
      "loss": 0.5173,
      "step": 3000
    },
    {
      "epoch": 2.5635593220338984,
      "grad_norm": 0.22904744744300842,
      "learning_rate": 7.555579826088837e-05,
      "loss": 0.5345,
      "step": 3025
    },
    {
      "epoch": 2.584745762711864,
      "grad_norm": 0.31355756521224976,
      "learning_rate": 7.452654103328196e-05,
      "loss": 0.4683,
      "step": 3050
    },
    {
      "epoch": 2.6059322033898304,
      "grad_norm": 0.31533321738243103,
      "learning_rate": 7.349737297342404e-05,
      "loss": 0.5259,
      "step": 3075
    },
    {
      "epoch": 2.6271186440677967,
      "grad_norm": 0.2956444025039673,
      "learning_rate": 7.24684879071901e-05,
      "loss": 0.497,
      "step": 3100
    },
    {
      "epoch": 2.648305084745763,
      "grad_norm": 0.2766103446483612,
      "learning_rate": 7.14400796071587e-05,
      "loss": 0.5166,
      "step": 3125
    },
    {
      "epoch": 2.669491525423729,
      "grad_norm": 0.3354440927505493,
      "learning_rate": 7.041234175611775e-05,
      "loss": 0.5233,
      "step": 3150
    },
    {
      "epoch": 2.690677966101695,
      "grad_norm": 0.2812809348106384,
      "learning_rate": 6.938546791058785e-05,
      "loss": 0.5155,
      "step": 3175
    },
    {
      "epoch": 2.711864406779661,
      "grad_norm": 0.39217862486839294,
      "learning_rate": 6.835965146436916e-05,
      "loss": 0.4926,
      "step": 3200
    },
    {
      "epoch": 2.733050847457627,
      "grad_norm": 0.3037302494049072,
      "learning_rate": 6.73350856121191e-05,
      "loss": 0.5098,
      "step": 3225
    },
    {
      "epoch": 2.7542372881355934,
      "grad_norm": 0.2784561514854431,
      "learning_rate": 6.63119633129675e-05,
      "loss": 0.5371,
      "step": 3250
    },
    {
      "epoch": 2.7754237288135593,
      "grad_norm": 0.2815192639827728,
      "learning_rate": 6.529047725417618e-05,
      "loss": 0.4839,
      "step": 3275
    },
    {
      "epoch": 2.7966101694915255,
      "grad_norm": 0.26870056986808777,
      "learning_rate": 6.427081981484946e-05,
      "loss": 0.4981,
      "step": 3300
    },
    {
      "epoch": 2.8177966101694913,
      "grad_norm": 0.28585174679756165,
      "learning_rate": 6.325318302970318e-05,
      "loss": 0.4841,
      "step": 3325
    },
    {
      "epoch": 2.8389830508474576,
      "grad_norm": 0.2712132930755615,
      "learning_rate": 6.22377585528981e-05,
      "loss": 0.4833,
      "step": 3350
    },
    {
      "epoch": 2.860169491525424,
      "grad_norm": 0.28583309054374695,
      "learning_rate": 6.12247376219452e-05,
      "loss": 0.5043,
      "step": 3375
    },
    {
      "epoch": 2.8813559322033897,
      "grad_norm": 0.29179123044013977,
      "learning_rate": 6.021431102168954e-05,
      "loss": 0.5343,
      "step": 3400
    },
    {
      "epoch": 2.902542372881356,
      "grad_norm": 0.29638585448265076,
      "learning_rate": 5.92066690483792e-05,
      "loss": 0.501,
      "step": 3425
    },
    {
      "epoch": 2.923728813559322,
      "grad_norm": 0.2945152521133423,
      "learning_rate": 5.820200147382617e-05,
      "loss": 0.5149,
      "step": 3450
    },
    {
      "epoch": 2.944915254237288,
      "grad_norm": 0.24451757967472076,
      "learning_rate": 5.720049750966638e-05,
      "loss": 0.501,
      "step": 3475
    },
    {
      "epoch": 2.9661016949152543,
      "grad_norm": 0.33959802985191345,
      "learning_rate": 5.6202345771724785e-05,
      "loss": 0.5202,
      "step": 3500
    },
    {
      "epoch": 2.9872881355932206,
      "grad_norm": 0.40264537930488586,
      "learning_rate": 5.520773424449299e-05,
      "loss": 0.5004,
      "step": 3525
    },
    {
      "epoch": 3.0084745762711864,
      "grad_norm": 0.23446495831012726,
      "learning_rate": 5.421685024572547e-05,
      "loss": 0.4788,
      "step": 3550
    },
    {
      "epoch": 3.0296610169491527,
      "grad_norm": 0.29302000999450684,
      "learning_rate": 5.322988039116176e-05,
      "loss": 0.4302,
      "step": 3575
    },
    {
      "epoch": 3.0508474576271185,
      "grad_norm": 0.28345516324043274,
      "learning_rate": 5.224701055938047e-05,
      "loss": 0.4195,
      "step": 3600
    },
    {
      "epoch": 3.0720338983050848,
      "grad_norm": 0.3563604950904846,
      "learning_rate": 5.126842585679235e-05,
      "loss": 0.4302,
      "step": 3625
    },
    {
      "epoch": 3.093220338983051,
      "grad_norm": 0.2989650070667267,
      "learning_rate": 5.0294310582778717e-05,
      "loss": 0.4082,
      "step": 3650
    },
    {
      "epoch": 3.114406779661017,
      "grad_norm": 0.3035448491573334,
      "learning_rate": 4.9324848194981906e-05,
      "loss": 0.4294,
      "step": 3675
    },
    {
      "epoch": 3.135593220338983,
      "grad_norm": 0.3060661256313324,
      "learning_rate": 4.83602212747541e-05,
      "loss": 0.4243,
      "step": 3700
    },
    {
      "epoch": 3.156779661016949,
      "grad_norm": 0.3512302041053772,
      "learning_rate": 4.7400611492771505e-05,
      "loss": 0.4558,
      "step": 3725
    },
    {
      "epoch": 3.1779661016949152,
      "grad_norm": 0.3085233271121979,
      "learning_rate": 4.644619957481972e-05,
      "loss": 0.4405,
      "step": 3750
    },
    {
      "epoch": 3.1991525423728815,
      "grad_norm": 0.37406814098358154,
      "learning_rate": 4.549716526775711e-05,
      "loss": 0.4394,
      "step": 3775
    },
    {
      "epoch": 3.2203389830508473,
      "grad_norm": 0.28444594144821167,
      "learning_rate": 4.455368730566282e-05,
      "loss": 0.4356,
      "step": 3800
    },
    {
      "epoch": 3.2415254237288136,
      "grad_norm": 0.3252512812614441,
      "learning_rate": 4.361594337617518e-05,
      "loss": 0.4422,
      "step": 3825
    },
    {
      "epoch": 3.26271186440678,
      "grad_norm": 0.34911468625068665,
      "learning_rate": 4.2684110087027364e-05,
      "loss": 0.42,
      "step": 3850
    },
    {
      "epoch": 3.2838983050847457,
      "grad_norm": 0.31359365582466125,
      "learning_rate": 4.175836293278635e-05,
      "loss": 0.4229,
      "step": 3875
    },
    {
      "epoch": 3.305084745762712,
      "grad_norm": 0.332359254360199,
      "learning_rate": 4.083887626180175e-05,
      "loss": 0.4428,
      "step": 3900
    },
    {
      "epoch": 3.326271186440678,
      "grad_norm": 0.3841429054737091,
      "learning_rate": 3.992582324337009e-05,
      "loss": 0.4643,
      "step": 3925
    },
    {
      "epoch": 3.347457627118644,
      "grad_norm": 0.3356688618659973,
      "learning_rate": 3.901937583512158e-05,
      "loss": 0.4169,
      "step": 3950
    },
    {
      "epoch": 3.3686440677966103,
      "grad_norm": 0.39436978101730347,
      "learning_rate": 3.811970475063486e-05,
      "loss": 0.4564,
      "step": 3975
    },
    {
      "epoch": 3.389830508474576,
      "grad_norm": 0.29478755593299866,
      "learning_rate": 3.7226979427285943e-05,
      "loss": 0.3858,
      "step": 4000
    },
    {
      "epoch": 3.4110169491525424,
      "grad_norm": 0.4711458086967468,
      "learning_rate": 3.6341367994337784e-05,
      "loss": 0.4547,
      "step": 4025
    },
    {
      "epoch": 3.4322033898305087,
      "grad_norm": 0.38489460945129395,
      "learning_rate": 3.546303724127603e-05,
      "loss": 0.4235,
      "step": 4050
    },
    {
      "epoch": 3.4533898305084745,
      "grad_norm": 0.41311007738113403,
      "learning_rate": 3.459215258639708e-05,
      "loss": 0.4589,
      "step": 4075
    },
    {
      "epoch": 3.4745762711864407,
      "grad_norm": 0.3139210641384125,
      "learning_rate": 3.372887804565442e-05,
      "loss": 0.4163,
      "step": 4100
    },
    {
      "epoch": 3.4957627118644066,
      "grad_norm": 0.43436604738235474,
      "learning_rate": 3.2873376201769154e-05,
      "loss": 0.4465,
      "step": 4125
    },
    {
      "epoch": 3.516949152542373,
      "grad_norm": 0.37427470088005066,
      "learning_rate": 3.202580817361037e-05,
      "loss": 0.4106,
      "step": 4150
    },
    {
      "epoch": 3.538135593220339,
      "grad_norm": 0.3729758560657501,
      "learning_rate": 3.1186333585851056e-05,
      "loss": 0.47,
      "step": 4175
    },
    {
      "epoch": 3.559322033898305,
      "grad_norm": 0.3862791955471039,
      "learning_rate": 3.0355110538905815e-05,
      "loss": 0.3975,
      "step": 4200
    },
    {
      "epoch": 3.580508474576271,
      "grad_norm": 0.35095420479774475,
      "learning_rate": 2.953229557915525e-05,
      "loss": 0.4422,
      "step": 4225
    },
    {
      "epoch": 3.601694915254237,
      "grad_norm": 0.34636810421943665,
      "learning_rate": 2.871804366946315e-05,
      "loss": 0.428,
      "step": 4250
    },
    {
      "epoch": 3.6228813559322033,
      "grad_norm": 0.3737597167491913,
      "learning_rate": 2.791250815999207e-05,
      "loss": 0.4544,
      "step": 4275
    },
    {
      "epoch": 3.6440677966101696,
      "grad_norm": 0.3554207384586334,
      "learning_rate": 2.7115840759322436e-05,
      "loss": 0.4167,
      "step": 4300
    },
    {
      "epoch": 3.665254237288136,
      "grad_norm": 0.369305819272995,
      "learning_rate": 2.6359522461221096e-05,
      "loss": 0.4456,
      "step": 4325
    },
    {
      "epoch": 3.6864406779661016,
      "grad_norm": 0.40377670526504517,
      "learning_rate": 2.5580670208969884e-05,
      "loss": 0.4465,
      "step": 4350
    },
    {
      "epoch": 3.707627118644068,
      "grad_norm": 0.4016803801059723,
      "learning_rate": 2.4811125226576454e-05,
      "loss": 0.4395,
      "step": 4375
    },
    {
      "epoch": 3.7288135593220337,
      "grad_norm": 0.3124406337738037,
      "learning_rate": 2.405103244443235e-05,
      "loss": 0.4154,
      "step": 4400
    },
    {
      "epoch": 3.75,
      "grad_norm": 0.44163626432418823,
      "learning_rate": 2.330053501277194e-05,
      "loss": 0.4607,
      "step": 4425
    },
    {
      "epoch": 3.7711864406779663,
      "grad_norm": 0.33251988887786865,
      "learning_rate": 2.2559774274712466e-05,
      "loss": 0.4114,
      "step": 4450
    },
    {
      "epoch": 3.792372881355932,
      "grad_norm": 0.4052109718322754,
      "learning_rate": 2.1828889739634496e-05,
      "loss": 0.4123,
      "step": 4475
    },
    {
      "epoch": 3.8135593220338984,
      "grad_norm": 0.3507472276687622,
      "learning_rate": 2.110801905690787e-05,
      "loss": 0.4199,
      "step": 4500
    },
    {
      "epoch": 3.834745762711864,
      "grad_norm": 0.4040756821632385,
      "learning_rate": 2.03972979899678e-05,
      "loss": 0.4526,
      "step": 4525
    },
    {
      "epoch": 3.8559322033898304,
      "grad_norm": 0.30861154198646545,
      "learning_rate": 1.9696860390746082e-05,
      "loss": 0.4152,
      "step": 4550
    },
    {
      "epoch": 3.8771186440677967,
      "grad_norm": 0.4708113670349121,
      "learning_rate": 1.900683817446263e-05,
      "loss": 0.4477,
      "step": 4575
    },
    {
      "epoch": 3.898305084745763,
      "grad_norm": 0.3677612543106079,
      "learning_rate": 1.832736129478131e-05,
      "loss": 0.4279,
      "step": 4600
    },
    {
      "epoch": 3.919491525423729,
      "grad_norm": 0.3834724724292755,
      "learning_rate": 1.7658557719335652e-05,
      "loss": 0.4235,
      "step": 4625
    },
    {
      "epoch": 3.940677966101695,
      "grad_norm": 0.3320079445838928,
      "learning_rate": 1.7000553405628164e-05,
      "loss": 0.4103,
      "step": 4650
    },
    {
      "epoch": 3.961864406779661,
      "grad_norm": 0.4474587142467499,
      "learning_rate": 1.6353472277308618e-05,
      "loss": 0.4422,
      "step": 4675
    },
    {
      "epoch": 3.983050847457627,
      "grad_norm": 0.3154617249965668,
      "learning_rate": 1.571743620083504e-05,
      "loss": 0.4343,
      "step": 4700
    },
    {
      "epoch": 4.004237288135593,
      "grad_norm": 0.32371950149536133,
      "learning_rate": 1.5092564962522388e-05,
      "loss": 0.452,
      "step": 4725
    },
    {
      "epoch": 4.02542372881356,
      "grad_norm": 0.3051236867904663,
      "learning_rate": 1.447897624598286e-05,
      "loss": 0.4164,
      "step": 4750
    },
    {
      "epoch": 4.046610169491525,
      "grad_norm": 0.3187614679336548,
      "learning_rate": 1.3876785609962218e-05,
      "loss": 0.3446,
      "step": 4775
    },
    {
      "epoch": 4.067796610169491,
      "grad_norm": 0.48843175172805786,
      "learning_rate": 1.3286106466576264e-05,
      "loss": 0.4296,
      "step": 4800
    },
    {
      "epoch": 4.088983050847458,
      "grad_norm": 0.3983837068080902,
      "learning_rate": 1.2707050059951763e-05,
      "loss": 0.344,
      "step": 4825
    },
    {
      "epoch": 4.110169491525424,
      "grad_norm": 0.300611674785614,
      "learning_rate": 1.2139725445275481e-05,
      "loss": 0.4169,
      "step": 4850
    },
    {
      "epoch": 4.13135593220339,
      "grad_norm": 0.4292912781238556,
      "learning_rate": 1.158423946825549e-05,
      "loss": 0.3689,
      "step": 4875
    },
    {
      "epoch": 4.1525423728813555,
      "grad_norm": 0.3964712917804718,
      "learning_rate": 1.1040696744998754e-05,
      "loss": 0.4404,
      "step": 4900
    },
    {
      "epoch": 4.173728813559322,
      "grad_norm": 0.6776478886604309,
      "learning_rate": 1.0509199642308436e-05,
      "loss": 0.3979,
      "step": 4925
    },
    {
      "epoch": 4.194915254237288,
      "grad_norm": 0.396267831325531,
      "learning_rate": 9.98984825840486e-06,
      "loss": 0.4182,
      "step": 4950
    },
    {
      "epoch": 4.216101694915254,
      "grad_norm": 0.28718650341033936,
      "learning_rate": 9.482740404073851e-06,
      "loss": 0.3736,
      "step": 4975
    },
    {
      "epoch": 4.237288135593221,
      "grad_norm": 0.3323756158351898,
      "learning_rate": 8.987971584245729e-06,
      "loss": 0.4113,
      "step": 5000
    },
    {
      "epoch": 4.258474576271187,
      "grad_norm": 0.33957767486572266,
      "learning_rate": 8.50563498000856e-06,
      "loss": 0.3925,
      "step": 5025
    },
    {
      "epoch": 4.279661016949152,
      "grad_norm": 0.4178178906440735,
      "learning_rate": 8.035821431059244e-06,
      "loss": 0.3973,
      "step": 5050
    },
    {
      "epoch": 4.3008474576271185,
      "grad_norm": 0.3192192614078522,
      "learning_rate": 7.578619418595358e-06,
      "loss": 0.3605,
      "step": 5075
    },
    {
      "epoch": 4.322033898305085,
      "grad_norm": 0.4187626540660858,
      "learning_rate": 7.1341150486512374e-06,
      "loss": 0.4199,
      "step": 5100
    },
    {
      "epoch": 4.343220338983051,
      "grad_norm": 0.3863602578639984,
      "learning_rate": 6.702392035881507e-06,
      "loss": 0.3568,
      "step": 5125
    },
    {
      "epoch": 4.364406779661017,
      "grad_norm": 0.4073178172111511,
      "learning_rate": 6.28353168779481e-06,
      "loss": 0.4327,
      "step": 5150
    },
    {
      "epoch": 4.385593220338983,
      "grad_norm": 0.31056177616119385,
      "learning_rate": 5.8776128894409305e-06,
      "loss": 0.372,
      "step": 5175
    },
    {
      "epoch": 4.406779661016949,
      "grad_norm": 0.3671024739742279,
      "learning_rate": 5.484712088554253e-06,
      "loss": 0.4078,
      "step": 5200
    },
    {
      "epoch": 4.427966101694915,
      "grad_norm": 0.2966119349002838,
      "learning_rate": 5.1049032811561196e-06,
      "loss": 0.3529,
      "step": 5225
    },
    {
      "epoch": 4.4491525423728815,
      "grad_norm": 0.3545999526977539,
      "learning_rate": 4.7382579976189244e-06,
      "loss": 0.3864,
      "step": 5250
    },
    {
      "epoch": 4.470338983050848,
      "grad_norm": 0.3902367651462555,
      "learning_rate": 4.384845289194699e-06,
      "loss": 0.3434,
      "step": 5275
    },
    {
      "epoch": 4.491525423728813,
      "grad_norm": 0.4561343193054199,
      "learning_rate": 4.044731715010463e-06,
      "loss": 0.371,
      "step": 5300
    },
    {
      "epoch": 4.512711864406779,
      "grad_norm": 0.29569247364997864,
      "learning_rate": 3.717981329532979e-06,
      "loss": 0.3957,
      "step": 5325
    },
    {
      "epoch": 4.533898305084746,
      "grad_norm": 0.3961041271686554,
      "learning_rate": 3.4046556705051744e-06,
      "loss": 0.3938,
      "step": 5350
    },
    {
      "epoch": 4.555084745762712,
      "grad_norm": 0.35009700059890747,
      "learning_rate": 3.104813747356674e-06,
      "loss": 0.3829,
      "step": 5375
    },
    {
      "epoch": 4.576271186440678,
      "grad_norm": 0.404491662979126,
      "learning_rate": 2.8185120300902865e-06,
      "loss": 0.3916,
      "step": 5400
    },
    {
      "epoch": 4.597457627118644,
      "grad_norm": 0.3277469277381897,
      "learning_rate": 2.5458044386469727e-06,
      "loss": 0.3681,
      "step": 5425
    },
    {
      "epoch": 4.61864406779661,
      "grad_norm": 0.4005700349807739,
      "learning_rate": 2.2867423327508654e-06,
      "loss": 0.4249,
      "step": 5450
    },
    {
      "epoch": 4.639830508474576,
      "grad_norm": 0.30087345838546753,
      "learning_rate": 2.0413745022366285e-06,
      "loss": 0.3493,
      "step": 5475
    },
    {
      "epoch": 4.661016949152542,
      "grad_norm": 0.37881365418434143,
      "learning_rate": 1.8097471578607164e-06,
      "loss": 0.4209,
      "step": 5500
    },
    {
      "epoch": 4.682203389830509,
      "grad_norm": 0.40850409865379333,
      "learning_rate": 1.5919039225983782e-06,
      "loss": 0.378,
      "step": 5525
    },
    {
      "epoch": 4.703389830508475,
      "grad_norm": 0.43627145886421204,
      "learning_rate": 1.3878858234280532e-06,
      "loss": 0.4131,
      "step": 5550
    },
    {
      "epoch": 4.72457627118644,
      "grad_norm": 0.3629254400730133,
      "learning_rate": 1.1977312836046194e-06,
      "loss": 0.3555,
      "step": 5575
    },
    {
      "epoch": 4.745762711864407,
      "grad_norm": 0.4231952428817749,
      "learning_rate": 1.0214761154230643e-06,
      "loss": 0.4459,
      "step": 5600
    },
    {
      "epoch": 4.766949152542373,
      "grad_norm": 0.32848870754241943,
      "learning_rate": 8.591535134738814e-07,
      "loss": 0.3753,
      "step": 5625
    },
    {
      "epoch": 4.788135593220339,
      "grad_norm": 0.49593624472618103,
      "learning_rate": 7.107940483913943e-07,
      "loss": 0.4109,
      "step": 5650
    },
    {
      "epoch": 4.809322033898305,
      "grad_norm": 0.3230677545070648,
      "learning_rate": 5.764256610963636e-07,
      "loss": 0.3534,
      "step": 5675
    },
    {
      "epoch": 4.830508474576272,
      "grad_norm": 0.4409547746181488,
      "learning_rate": 4.560736575337787e-07,
      "loss": 0.4389,
      "step": 5700
    },
    {
      "epoch": 4.851694915254237,
      "grad_norm": 0.3816058039665222,
      "learning_rate": 3.4976070390692054e-07,
      "loss": 0.369,
      "step": 5725
    },
    {
      "epoch": 4.872881355932203,
      "grad_norm": 0.3902296721935272,
      "learning_rate": 2.5750682240857634e-07,
      "loss": 0.4134,
      "step": 5750
    },
    {
      "epoch": 4.8940677966101696,
      "grad_norm": 0.3721590042114258,
      "learning_rate": 1.7932938745022218e-07,
      "loss": 0.3509,
      "step": 5775
    },
    {
      "epoch": 4.915254237288136,
      "grad_norm": 0.3812599778175354,
      "learning_rate": 1.1524312238984923e-07,
      "loss": 0.4109,
      "step": 5800
    },
    {
      "epoch": 4.936440677966102,
      "grad_norm": 0.39883601665496826,
      "learning_rate": 6.526009675905663e-08,
      "loss": 0.3768,
      "step": 5825
    },
    {
      "epoch": 4.9576271186440675,
      "grad_norm": 0.38190439343452454,
      "learning_rate": 2.9389723990011495e-08,
      "loss": 0.4262,
      "step": 5850
    },
    {
      "epoch": 4.978813559322034,
      "grad_norm": 0.2927350699901581,
      "learning_rate": 7.638759642525361e-09,
      "loss": 0.3631,
      "step": 5875
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.7745693922042847,
      "learning_rate": 1.1300131838587468e-11,
      "loss": 0.3756,
      "step": 5900
    }
  ],
  "logging_steps": 25,
  "max_steps": 5900,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0720875463474176e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}