abanm's picture
Initial commit: Uploading project files
23d7c37
raw
history blame
41.8 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 5900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0211864406779661,
"grad_norm": 0.3481608033180237,
"learning_rate": 2.11864406779661e-05,
"loss": 1.0844,
"step": 25
},
{
"epoch": 0.0423728813559322,
"grad_norm": 0.4737679064273834,
"learning_rate": 4.23728813559322e-05,
"loss": 0.968,
"step": 50
},
{
"epoch": 0.0635593220338983,
"grad_norm": 0.24385006725788116,
"learning_rate": 6.355932203389829e-05,
"loss": 0.7097,
"step": 75
},
{
"epoch": 0.0847457627118644,
"grad_norm": 0.32397332787513733,
"learning_rate": 8.47457627118644e-05,
"loss": 0.6995,
"step": 100
},
{
"epoch": 0.1059322033898305,
"grad_norm": 0.28883692622184753,
"learning_rate": 0.0001059322033898305,
"loss": 0.6362,
"step": 125
},
{
"epoch": 0.1271186440677966,
"grad_norm": 0.2939394414424896,
"learning_rate": 0.00012711864406779658,
"loss": 0.6646,
"step": 150
},
{
"epoch": 0.1483050847457627,
"grad_norm": 0.21506226062774658,
"learning_rate": 0.0001483050847457627,
"loss": 0.67,
"step": 175
},
{
"epoch": 0.1694915254237288,
"grad_norm": 0.24949543178081512,
"learning_rate": 0.00014999402230951556,
"loss": 0.6328,
"step": 200
},
{
"epoch": 0.1906779661016949,
"grad_norm": 0.1662125438451767,
"learning_rate": 0.00014997396600188487,
"loss": 0.6365,
"step": 225
},
{
"epoch": 0.211864406779661,
"grad_norm": 0.18493060767650604,
"learning_rate": 0.00014993978965384007,
"loss": 0.6661,
"step": 250
},
{
"epoch": 0.2330508474576271,
"grad_norm": 0.1724727302789688,
"learning_rate": 0.00014989149970190098,
"loss": 0.6453,
"step": 275
},
{
"epoch": 0.2542372881355932,
"grad_norm": 0.2265160232782364,
"learning_rate": 0.00014982910524063883,
"loss": 0.5802,
"step": 300
},
{
"epoch": 0.2754237288135593,
"grad_norm": 0.14229296147823334,
"learning_rate": 0.00014975261802096344,
"loss": 0.6559,
"step": 325
},
{
"epoch": 0.2966101694915254,
"grad_norm": 0.19628387689590454,
"learning_rate": 0.0001496620524479102,
"loss": 0.6181,
"step": 350
},
{
"epoch": 0.3177966101694915,
"grad_norm": 0.19808532297611237,
"learning_rate": 0.00014955742557792704,
"loss": 0.6363,
"step": 375
},
{
"epoch": 0.3389830508474576,
"grad_norm": 0.2479950338602066,
"learning_rate": 0.00014943875711566237,
"loss": 0.601,
"step": 400
},
{
"epoch": 0.3601694915254237,
"grad_norm": 0.18844148516654968,
"learning_rate": 0.0001493060694102537,
"loss": 0.6406,
"step": 425
},
{
"epoch": 0.3813559322033898,
"grad_norm": 0.21692270040512085,
"learning_rate": 0.00014915938745111896,
"loss": 0.674,
"step": 450
},
{
"epoch": 0.4025423728813559,
"grad_norm": 0.18362776935100555,
"learning_rate": 0.0001489987388632498,
"loss": 0.6326,
"step": 475
},
{
"epoch": 0.423728813559322,
"grad_norm": 0.1860133409500122,
"learning_rate": 0.0001488241539020092,
"loss": 0.6539,
"step": 500
},
{
"epoch": 0.4449152542372881,
"grad_norm": 0.16509853303432465,
"learning_rate": 0.00014863566544743326,
"loss": 0.6649,
"step": 525
},
{
"epoch": 0.4661016949152542,
"grad_norm": 0.17422816157341003,
"learning_rate": 0.0001484333089980388,
"loss": 0.6365,
"step": 550
},
{
"epoch": 0.4872881355932203,
"grad_norm": 0.16881784796714783,
"learning_rate": 0.000148217122664138,
"loss": 0.6014,
"step": 575
},
{
"epoch": 0.5084745762711864,
"grad_norm": 0.24150097370147705,
"learning_rate": 0.00014798714716066072,
"loss": 0.6225,
"step": 600
},
{
"epoch": 0.5296610169491526,
"grad_norm": 0.183096244931221,
"learning_rate": 0.00014774342579948675,
"loss": 0.628,
"step": 625
},
{
"epoch": 0.5508474576271186,
"grad_norm": 0.2092808037996292,
"learning_rate": 0.00014748600448128877,
"loss": 0.6196,
"step": 650
},
{
"epoch": 0.5720338983050848,
"grad_norm": 0.1650499850511551,
"learning_rate": 0.00014721493168688764,
"loss": 0.6617,
"step": 675
},
{
"epoch": 0.5932203389830508,
"grad_norm": 0.2336203157901764,
"learning_rate": 0.00014693025846812194,
"loss": 0.5995,
"step": 700
},
{
"epoch": 0.614406779661017,
"grad_norm": 0.1635483205318451,
"learning_rate": 0.0001466320384382333,
"loss": 0.6225,
"step": 725
},
{
"epoch": 0.635593220338983,
"grad_norm": 0.24543817341327667,
"learning_rate": 0.00014632032776176924,
"loss": 0.6208,
"step": 750
},
{
"epoch": 0.6567796610169492,
"grad_norm": 0.156394824385643,
"learning_rate": 0.0001459951851440055,
"loss": 0.6234,
"step": 775
},
{
"epoch": 0.6779661016949152,
"grad_norm": 0.21179354190826416,
"learning_rate": 0.00014565667181988995,
"loss": 0.6101,
"step": 800
},
{
"epoch": 0.6991525423728814,
"grad_norm": 0.1816495805978775,
"learning_rate": 0.00014530485154251021,
"loss": 0.6212,
"step": 825
},
{
"epoch": 0.7203389830508474,
"grad_norm": 0.18615126609802246,
"learning_rate": 0.0001449397905710866,
"loss": 0.6019,
"step": 850
},
{
"epoch": 0.7415254237288136,
"grad_norm": 0.13972151279449463,
"learning_rate": 0.00014456155765849355,
"loss": 0.6804,
"step": 875
},
{
"epoch": 0.7627118644067796,
"grad_norm": 0.19166871905326843,
"learning_rate": 0.00014417022403831117,
"loss": 0.6265,
"step": 900
},
{
"epoch": 0.7838983050847458,
"grad_norm": 0.1559162586927414,
"learning_rate": 0.00014376586341140955,
"loss": 0.5893,
"step": 925
},
{
"epoch": 0.8050847457627118,
"grad_norm": 0.17139187455177307,
"learning_rate": 0.0001433485519320687,
"loss": 0.6192,
"step": 950
},
{
"epoch": 0.826271186440678,
"grad_norm": 0.19588051736354828,
"learning_rate": 0.0001429183681936359,
"loss": 0.6545,
"step": 975
},
{
"epoch": 0.847457627118644,
"grad_norm": 0.17011399567127228,
"learning_rate": 0.0001424753932137243,
"loss": 0.6274,
"step": 1000
},
{
"epoch": 0.8686440677966102,
"grad_norm": 0.13620993494987488,
"learning_rate": 0.00014201971041895455,
"loss": 0.6185,
"step": 1025
},
{
"epoch": 0.8898305084745762,
"grad_norm": 0.19832104444503784,
"learning_rate": 0.00014155140562924286,
"loss": 0.5788,
"step": 1050
},
{
"epoch": 0.9110169491525424,
"grad_norm": 0.15580902993679047,
"learning_rate": 0.00014107056704163823,
"loss": 0.6756,
"step": 1075
},
{
"epoch": 0.9322033898305084,
"grad_norm": 0.2072034329175949,
"learning_rate": 0.00014057728521371218,
"loss": 0.6347,
"step": 1100
},
{
"epoch": 0.9533898305084746,
"grad_norm": 0.13679395616054535,
"learning_rate": 0.00014007165304650386,
"loss": 0.6419,
"step": 1125
},
{
"epoch": 0.9745762711864406,
"grad_norm": 0.20975461602210999,
"learning_rate": 0.00013955376576702357,
"loss": 0.5929,
"step": 1150
},
{
"epoch": 0.9957627118644068,
"grad_norm": 0.18808843195438385,
"learning_rate": 0.00013902372091031856,
"loss": 0.6327,
"step": 1175
},
{
"epoch": 1.0169491525423728,
"grad_norm": 0.12700864672660828,
"learning_rate": 0.00013848161830110395,
"loss": 0.6166,
"step": 1200
},
{
"epoch": 1.0381355932203389,
"grad_norm": 0.17502394318580627,
"learning_rate": 0.0001379275600349625,
"loss": 0.542,
"step": 1225
},
{
"epoch": 1.0593220338983051,
"grad_norm": 0.17643524706363678,
"learning_rate": 0.0001373616504591167,
"loss": 0.6077,
"step": 1250
},
{
"epoch": 1.0805084745762712,
"grad_norm": 0.21401630342006683,
"learning_rate": 0.00013678399615277674,
"loss": 0.5856,
"step": 1275
},
{
"epoch": 1.1016949152542372,
"grad_norm": 0.1577410101890564,
"learning_rate": 0.00013619470590706814,
"loss": 0.5882,
"step": 1300
},
{
"epoch": 1.1228813559322033,
"grad_norm": 0.2284272313117981,
"learning_rate": 0.00013559389070454304,
"loss": 0.5842,
"step": 1325
},
{
"epoch": 1.1440677966101696,
"grad_norm": 0.2204512506723404,
"learning_rate": 0.00013498166369827833,
"loss": 0.5911,
"step": 1350
},
{
"epoch": 1.1652542372881356,
"grad_norm": 0.21209457516670227,
"learning_rate": 0.00013435814019056535,
"loss": 0.602,
"step": 1375
},
{
"epoch": 1.1864406779661016,
"grad_norm": 0.16774219274520874,
"learning_rate": 0.00013372343761119466,
"loss": 0.5746,
"step": 1400
},
{
"epoch": 1.207627118644068,
"grad_norm": 0.23171478509902954,
"learning_rate": 0.00013307767549534033,
"loss": 0.6046,
"step": 1425
},
{
"epoch": 1.228813559322034,
"grad_norm": 0.17449446022510529,
"learning_rate": 0.00013242097546104734,
"loss": 0.5969,
"step": 1450
},
{
"epoch": 1.25,
"grad_norm": 0.21454857289791107,
"learning_rate": 0.00013175346118632713,
"loss": 0.5927,
"step": 1475
},
{
"epoch": 1.271186440677966,
"grad_norm": 0.17533324658870697,
"learning_rate": 0.00013107525838586495,
"loss": 0.5806,
"step": 1500
},
{
"epoch": 1.292372881355932,
"grad_norm": 0.2303514927625656,
"learning_rate": 0.00013038649478734363,
"loss": 0.6269,
"step": 1525
},
{
"epoch": 1.3135593220338984,
"grad_norm": 0.2209363877773285,
"learning_rate": 0.00012968730010738837,
"loss": 0.5699,
"step": 1550
},
{
"epoch": 1.3347457627118644,
"grad_norm": 0.2777274250984192,
"learning_rate": 0.0001289778060271368,
"loss": 0.5583,
"step": 1575
},
{
"epoch": 1.3559322033898304,
"grad_norm": 0.19397616386413574,
"learning_rate": 0.00012825814616743928,
"loss": 0.5785,
"step": 1600
},
{
"epoch": 1.3771186440677967,
"grad_norm": 0.24071291089057922,
"learning_rate": 0.0001275284560636935,
"loss": 0.5793,
"step": 1625
},
{
"epoch": 1.3983050847457628,
"grad_norm": 0.16364933550357819,
"learning_rate": 0.000126788873140319,
"loss": 0.5591,
"step": 1650
},
{
"epoch": 1.4194915254237288,
"grad_norm": 0.2222534716129303,
"learning_rate": 0.00012603953668487546,
"loss": 0.5649,
"step": 1675
},
{
"epoch": 1.4406779661016949,
"grad_norm": 0.18990883231163025,
"learning_rate": 0.00012528058782183048,
"loss": 0.5732,
"step": 1700
},
{
"epoch": 1.461864406779661,
"grad_norm": 0.23255659639835358,
"learning_rate": 0.00012451216948598117,
"loss": 0.55,
"step": 1725
},
{
"epoch": 1.4830508474576272,
"grad_norm": 0.19624237716197968,
"learning_rate": 0.00012373442639553487,
"loss": 0.5793,
"step": 1750
},
{
"epoch": 1.5042372881355932,
"grad_norm": 0.24238888919353485,
"learning_rate": 0.00012294750502485398,
"loss": 0.5823,
"step": 1775
},
{
"epoch": 1.5254237288135593,
"grad_norm": 0.2002212405204773,
"learning_rate": 0.00012215155357687017,
"loss": 0.571,
"step": 1800
},
{
"epoch": 1.5466101694915255,
"grad_norm": 0.21096192300319672,
"learning_rate": 0.0001213467219551728,
"loss": 0.588,
"step": 1825
},
{
"epoch": 1.5677966101694916,
"grad_norm": 0.20380620658397675,
"learning_rate": 0.00012053316173577726,
"loss": 0.5869,
"step": 1850
},
{
"epoch": 1.5889830508474576,
"grad_norm": 0.25443893671035767,
"learning_rate": 0.00011971102613857823,
"loss": 0.5659,
"step": 1875
},
{
"epoch": 1.6101694915254239,
"grad_norm": 0.22190341353416443,
"learning_rate": 0.0001188804699984935,
"loss": 0.5835,
"step": 1900
},
{
"epoch": 1.6313559322033897,
"grad_norm": 0.24329130351543427,
"learning_rate": 0.00011804164973630335,
"loss": 0.5639,
"step": 1925
},
{
"epoch": 1.652542372881356,
"grad_norm": 0.2349741905927658,
"learning_rate": 0.00011719472332919148,
"loss": 0.5726,
"step": 1950
},
{
"epoch": 1.673728813559322,
"grad_norm": 0.20963279902935028,
"learning_rate": 0.00011633985028099284,
"loss": 0.5612,
"step": 1975
},
{
"epoch": 1.694915254237288,
"grad_norm": 0.27600300312042236,
"learning_rate": 0.00011547719159215378,
"loss": 0.5943,
"step": 2000
},
{
"epoch": 1.7161016949152543,
"grad_norm": 0.21020427346229553,
"learning_rate": 0.00011460690972941037,
"loss": 0.5802,
"step": 2025
},
{
"epoch": 1.7372881355932204,
"grad_norm": 0.20670145750045776,
"learning_rate": 0.00011372916859519075,
"loss": 0.5766,
"step": 2050
},
{
"epoch": 1.7584745762711864,
"grad_norm": 0.2435368299484253,
"learning_rate": 0.0001128441334967469,
"loss": 0.6128,
"step": 2075
},
{
"epoch": 1.7796610169491527,
"grad_norm": 0.21429473161697388,
"learning_rate": 0.00011195197111502184,
"loss": 0.5844,
"step": 2100
},
{
"epoch": 1.8008474576271185,
"grad_norm": 0.21995683014392853,
"learning_rate": 0.0001110528494732583,
"loss": 0.5532,
"step": 2125
},
{
"epoch": 1.8220338983050848,
"grad_norm": 0.19685518741607666,
"learning_rate": 0.00011014693790535437,
"loss": 0.5569,
"step": 2150
},
{
"epoch": 1.8432203389830508,
"grad_norm": 0.20260564982891083,
"learning_rate": 0.00010923440702397243,
"loss": 0.5792,
"step": 2175
},
{
"epoch": 1.8644067796610169,
"grad_norm": 0.19778716564178467,
"learning_rate": 0.00010831542868840729,
"loss": 0.5978,
"step": 2200
},
{
"epoch": 1.8855932203389831,
"grad_norm": 0.22923052310943604,
"learning_rate": 0.00010739017597221942,
"loss": 0.5572,
"step": 2225
},
{
"epoch": 1.9067796610169492,
"grad_norm": 0.21343784034252167,
"learning_rate": 0.00010645882313063953,
"loss": 0.5643,
"step": 2250
},
{
"epoch": 1.9279661016949152,
"grad_norm": 0.2053421288728714,
"learning_rate": 0.00010552154556775076,
"loss": 0.5806,
"step": 2275
},
{
"epoch": 1.9491525423728815,
"grad_norm": 0.22164656221866608,
"learning_rate": 0.00010457851980345423,
"loss": 0.6011,
"step": 2300
},
{
"epoch": 1.9703389830508473,
"grad_norm": 0.284758985042572,
"learning_rate": 0.00010362992344022468,
"loss": 0.5374,
"step": 2325
},
{
"epoch": 1.9915254237288136,
"grad_norm": 0.2642022371292114,
"learning_rate": 0.00010267593512966216,
"loss": 0.5892,
"step": 2350
},
{
"epoch": 2.01271186440678,
"grad_norm": 0.19165368378162384,
"learning_rate": 0.00010171673453884601,
"loss": 0.5175,
"step": 2375
},
{
"epoch": 2.0338983050847457,
"grad_norm": 0.2643072307109833,
"learning_rate": 0.00010075250231649775,
"loss": 0.5204,
"step": 2400
},
{
"epoch": 2.055084745762712,
"grad_norm": 0.2326943427324295,
"learning_rate": 9.978342005895911e-05,
"loss": 0.4847,
"step": 2425
},
{
"epoch": 2.0762711864406778,
"grad_norm": 0.2779608368873596,
"learning_rate": 9.880967027599139e-05,
"loss": 0.52,
"step": 2450
},
{
"epoch": 2.097457627118644,
"grad_norm": 0.22342316806316376,
"learning_rate": 9.783143635640304e-05,
"loss": 0.5124,
"step": 2475
},
{
"epoch": 2.1186440677966103,
"grad_norm": 0.26453691720962524,
"learning_rate": 9.684890253351153e-05,
"loss": 0.4954,
"step": 2500
},
{
"epoch": 2.139830508474576,
"grad_norm": 0.26683682203292847,
"learning_rate": 9.586225385044615e-05,
"loss": 0.519,
"step": 2525
},
{
"epoch": 2.1610169491525424,
"grad_norm": 0.27656257152557373,
"learning_rate": 9.487167612529851e-05,
"loss": 0.5409,
"step": 2550
},
{
"epoch": 2.1822033898305087,
"grad_norm": 0.27244171500205994,
"learning_rate": 9.387735591612677e-05,
"loss": 0.4976,
"step": 2575
},
{
"epoch": 2.2033898305084745,
"grad_norm": 0.29296210408210754,
"learning_rate": 9.28794804858208e-05,
"loss": 0.4964,
"step": 2600
},
{
"epoch": 2.2245762711864407,
"grad_norm": 0.28374531865119934,
"learning_rate": 9.187823776683444e-05,
"loss": 0.4936,
"step": 2625
},
{
"epoch": 2.2457627118644066,
"grad_norm": 0.25039607286453247,
"learning_rate": 9.087381632579165e-05,
"loss": 0.4548,
"step": 2650
},
{
"epoch": 2.266949152542373,
"grad_norm": 0.2839612662792206,
"learning_rate": 8.986640532797341e-05,
"loss": 0.521,
"step": 2675
},
{
"epoch": 2.288135593220339,
"grad_norm": 0.26817333698272705,
"learning_rate": 8.885619450169154e-05,
"loss": 0.4813,
"step": 2700
},
{
"epoch": 2.309322033898305,
"grad_norm": 0.2513103187084198,
"learning_rate": 8.78433741025568e-05,
"loss": 0.4964,
"step": 2725
},
{
"epoch": 2.330508474576271,
"grad_norm": 0.2661533057689667,
"learning_rate": 8.682813487764759e-05,
"loss": 0.5267,
"step": 2750
},
{
"epoch": 2.3516949152542375,
"grad_norm": 0.31996023654937744,
"learning_rate": 8.581066802958593e-05,
"loss": 0.4877,
"step": 2775
},
{
"epoch": 2.3728813559322033,
"grad_norm": 0.3120092749595642,
"learning_rate": 8.479116518052793e-05,
"loss": 0.5025,
"step": 2800
},
{
"epoch": 2.3940677966101696,
"grad_norm": 0.25984951853752136,
"learning_rate": 8.376981833607496e-05,
"loss": 0.5184,
"step": 2825
},
{
"epoch": 2.415254237288136,
"grad_norm": 0.28586438298225403,
"learning_rate": 8.274681984911279e-05,
"loss": 0.5128,
"step": 2850
},
{
"epoch": 2.4364406779661016,
"grad_norm": 0.23898103833198547,
"learning_rate": 8.172236238358537e-05,
"loss": 0.4968,
"step": 2875
},
{
"epoch": 2.457627118644068,
"grad_norm": 0.2596363127231598,
"learning_rate": 8.069663887820978e-05,
"loss": 0.5338,
"step": 2900
},
{
"epoch": 2.4788135593220337,
"grad_norm": 0.2569097578525543,
"learning_rate": 7.966984251013964e-05,
"loss": 0.5186,
"step": 2925
},
{
"epoch": 2.5,
"grad_norm": 0.23606939613819122,
"learning_rate": 7.864216665858362e-05,
"loss": 0.5087,
"step": 2950
},
{
"epoch": 2.5211864406779663,
"grad_norm": 0.24160584807395935,
"learning_rate": 7.761380486838573e-05,
"loss": 0.5164,
"step": 2975
},
{
"epoch": 2.542372881355932,
"grad_norm": 0.3212146461009979,
"learning_rate": 7.658495081357461e-05,
"loss": 0.5173,
"step": 3000
},
{
"epoch": 2.5635593220338984,
"grad_norm": 0.22904744744300842,
"learning_rate": 7.555579826088837e-05,
"loss": 0.5345,
"step": 3025
},
{
"epoch": 2.584745762711864,
"grad_norm": 0.31355756521224976,
"learning_rate": 7.452654103328196e-05,
"loss": 0.4683,
"step": 3050
},
{
"epoch": 2.6059322033898304,
"grad_norm": 0.31533321738243103,
"learning_rate": 7.349737297342404e-05,
"loss": 0.5259,
"step": 3075
},
{
"epoch": 2.6271186440677967,
"grad_norm": 0.2956444025039673,
"learning_rate": 7.24684879071901e-05,
"loss": 0.497,
"step": 3100
},
{
"epoch": 2.648305084745763,
"grad_norm": 0.2766103446483612,
"learning_rate": 7.14400796071587e-05,
"loss": 0.5166,
"step": 3125
},
{
"epoch": 2.669491525423729,
"grad_norm": 0.3354440927505493,
"learning_rate": 7.041234175611775e-05,
"loss": 0.5233,
"step": 3150
},
{
"epoch": 2.690677966101695,
"grad_norm": 0.2812809348106384,
"learning_rate": 6.938546791058785e-05,
"loss": 0.5155,
"step": 3175
},
{
"epoch": 2.711864406779661,
"grad_norm": 0.39217862486839294,
"learning_rate": 6.835965146436916e-05,
"loss": 0.4926,
"step": 3200
},
{
"epoch": 2.733050847457627,
"grad_norm": 0.3037302494049072,
"learning_rate": 6.73350856121191e-05,
"loss": 0.5098,
"step": 3225
},
{
"epoch": 2.7542372881355934,
"grad_norm": 0.2784561514854431,
"learning_rate": 6.63119633129675e-05,
"loss": 0.5371,
"step": 3250
},
{
"epoch": 2.7754237288135593,
"grad_norm": 0.2815192639827728,
"learning_rate": 6.529047725417618e-05,
"loss": 0.4839,
"step": 3275
},
{
"epoch": 2.7966101694915255,
"grad_norm": 0.26870056986808777,
"learning_rate": 6.427081981484946e-05,
"loss": 0.4981,
"step": 3300
},
{
"epoch": 2.8177966101694913,
"grad_norm": 0.28585174679756165,
"learning_rate": 6.325318302970318e-05,
"loss": 0.4841,
"step": 3325
},
{
"epoch": 2.8389830508474576,
"grad_norm": 0.2712132930755615,
"learning_rate": 6.22377585528981e-05,
"loss": 0.4833,
"step": 3350
},
{
"epoch": 2.860169491525424,
"grad_norm": 0.28583309054374695,
"learning_rate": 6.12247376219452e-05,
"loss": 0.5043,
"step": 3375
},
{
"epoch": 2.8813559322033897,
"grad_norm": 0.29179123044013977,
"learning_rate": 6.021431102168954e-05,
"loss": 0.5343,
"step": 3400
},
{
"epoch": 2.902542372881356,
"grad_norm": 0.29638585448265076,
"learning_rate": 5.92066690483792e-05,
"loss": 0.501,
"step": 3425
},
{
"epoch": 2.923728813559322,
"grad_norm": 0.2945152521133423,
"learning_rate": 5.820200147382617e-05,
"loss": 0.5149,
"step": 3450
},
{
"epoch": 2.944915254237288,
"grad_norm": 0.24451757967472076,
"learning_rate": 5.720049750966638e-05,
"loss": 0.501,
"step": 3475
},
{
"epoch": 2.9661016949152543,
"grad_norm": 0.33959802985191345,
"learning_rate": 5.6202345771724785e-05,
"loss": 0.5202,
"step": 3500
},
{
"epoch": 2.9872881355932206,
"grad_norm": 0.40264537930488586,
"learning_rate": 5.520773424449299e-05,
"loss": 0.5004,
"step": 3525
},
{
"epoch": 3.0084745762711864,
"grad_norm": 0.23446495831012726,
"learning_rate": 5.421685024572547e-05,
"loss": 0.4788,
"step": 3550
},
{
"epoch": 3.0296610169491527,
"grad_norm": 0.29302000999450684,
"learning_rate": 5.322988039116176e-05,
"loss": 0.4302,
"step": 3575
},
{
"epoch": 3.0508474576271185,
"grad_norm": 0.28345516324043274,
"learning_rate": 5.224701055938047e-05,
"loss": 0.4195,
"step": 3600
},
{
"epoch": 3.0720338983050848,
"grad_norm": 0.3563604950904846,
"learning_rate": 5.126842585679235e-05,
"loss": 0.4302,
"step": 3625
},
{
"epoch": 3.093220338983051,
"grad_norm": 0.2989650070667267,
"learning_rate": 5.0294310582778717e-05,
"loss": 0.4082,
"step": 3650
},
{
"epoch": 3.114406779661017,
"grad_norm": 0.3035448491573334,
"learning_rate": 4.9324848194981906e-05,
"loss": 0.4294,
"step": 3675
},
{
"epoch": 3.135593220338983,
"grad_norm": 0.3060661256313324,
"learning_rate": 4.83602212747541e-05,
"loss": 0.4243,
"step": 3700
},
{
"epoch": 3.156779661016949,
"grad_norm": 0.3512302041053772,
"learning_rate": 4.7400611492771505e-05,
"loss": 0.4558,
"step": 3725
},
{
"epoch": 3.1779661016949152,
"grad_norm": 0.3085233271121979,
"learning_rate": 4.644619957481972e-05,
"loss": 0.4405,
"step": 3750
},
{
"epoch": 3.1991525423728815,
"grad_norm": 0.37406814098358154,
"learning_rate": 4.549716526775711e-05,
"loss": 0.4394,
"step": 3775
},
{
"epoch": 3.2203389830508473,
"grad_norm": 0.28444594144821167,
"learning_rate": 4.455368730566282e-05,
"loss": 0.4356,
"step": 3800
},
{
"epoch": 3.2415254237288136,
"grad_norm": 0.3252512812614441,
"learning_rate": 4.361594337617518e-05,
"loss": 0.4422,
"step": 3825
},
{
"epoch": 3.26271186440678,
"grad_norm": 0.34911468625068665,
"learning_rate": 4.2684110087027364e-05,
"loss": 0.42,
"step": 3850
},
{
"epoch": 3.2838983050847457,
"grad_norm": 0.31359365582466125,
"learning_rate": 4.175836293278635e-05,
"loss": 0.4229,
"step": 3875
},
{
"epoch": 3.305084745762712,
"grad_norm": 0.332359254360199,
"learning_rate": 4.083887626180175e-05,
"loss": 0.4428,
"step": 3900
},
{
"epoch": 3.326271186440678,
"grad_norm": 0.3841429054737091,
"learning_rate": 3.992582324337009e-05,
"loss": 0.4643,
"step": 3925
},
{
"epoch": 3.347457627118644,
"grad_norm": 0.3356688618659973,
"learning_rate": 3.901937583512158e-05,
"loss": 0.4169,
"step": 3950
},
{
"epoch": 3.3686440677966103,
"grad_norm": 0.39436978101730347,
"learning_rate": 3.811970475063486e-05,
"loss": 0.4564,
"step": 3975
},
{
"epoch": 3.389830508474576,
"grad_norm": 0.29478755593299866,
"learning_rate": 3.7226979427285943e-05,
"loss": 0.3858,
"step": 4000
},
{
"epoch": 3.4110169491525424,
"grad_norm": 0.4711458086967468,
"learning_rate": 3.6341367994337784e-05,
"loss": 0.4547,
"step": 4025
},
{
"epoch": 3.4322033898305087,
"grad_norm": 0.38489460945129395,
"learning_rate": 3.546303724127603e-05,
"loss": 0.4235,
"step": 4050
},
{
"epoch": 3.4533898305084745,
"grad_norm": 0.41311007738113403,
"learning_rate": 3.459215258639708e-05,
"loss": 0.4589,
"step": 4075
},
{
"epoch": 3.4745762711864407,
"grad_norm": 0.3139210641384125,
"learning_rate": 3.372887804565442e-05,
"loss": 0.4163,
"step": 4100
},
{
"epoch": 3.4957627118644066,
"grad_norm": 0.43436604738235474,
"learning_rate": 3.2873376201769154e-05,
"loss": 0.4465,
"step": 4125
},
{
"epoch": 3.516949152542373,
"grad_norm": 0.37427470088005066,
"learning_rate": 3.202580817361037e-05,
"loss": 0.4106,
"step": 4150
},
{
"epoch": 3.538135593220339,
"grad_norm": 0.3729758560657501,
"learning_rate": 3.1186333585851056e-05,
"loss": 0.47,
"step": 4175
},
{
"epoch": 3.559322033898305,
"grad_norm": 0.3862791955471039,
"learning_rate": 3.0355110538905815e-05,
"loss": 0.3975,
"step": 4200
},
{
"epoch": 3.580508474576271,
"grad_norm": 0.35095420479774475,
"learning_rate": 2.953229557915525e-05,
"loss": 0.4422,
"step": 4225
},
{
"epoch": 3.601694915254237,
"grad_norm": 0.34636810421943665,
"learning_rate": 2.871804366946315e-05,
"loss": 0.428,
"step": 4250
},
{
"epoch": 3.6228813559322033,
"grad_norm": 0.3737597167491913,
"learning_rate": 2.791250815999207e-05,
"loss": 0.4544,
"step": 4275
},
{
"epoch": 3.6440677966101696,
"grad_norm": 0.3554207384586334,
"learning_rate": 2.7115840759322436e-05,
"loss": 0.4167,
"step": 4300
},
{
"epoch": 3.665254237288136,
"grad_norm": 0.369305819272995,
"learning_rate": 2.6359522461221096e-05,
"loss": 0.4456,
"step": 4325
},
{
"epoch": 3.6864406779661016,
"grad_norm": 0.40377670526504517,
"learning_rate": 2.5580670208969884e-05,
"loss": 0.4465,
"step": 4350
},
{
"epoch": 3.707627118644068,
"grad_norm": 0.4016803801059723,
"learning_rate": 2.4811125226576454e-05,
"loss": 0.4395,
"step": 4375
},
{
"epoch": 3.7288135593220337,
"grad_norm": 0.3124406337738037,
"learning_rate": 2.405103244443235e-05,
"loss": 0.4154,
"step": 4400
},
{
"epoch": 3.75,
"grad_norm": 0.44163626432418823,
"learning_rate": 2.330053501277194e-05,
"loss": 0.4607,
"step": 4425
},
{
"epoch": 3.7711864406779663,
"grad_norm": 0.33251988887786865,
"learning_rate": 2.2559774274712466e-05,
"loss": 0.4114,
"step": 4450
},
{
"epoch": 3.792372881355932,
"grad_norm": 0.4052109718322754,
"learning_rate": 2.1828889739634496e-05,
"loss": 0.4123,
"step": 4475
},
{
"epoch": 3.8135593220338984,
"grad_norm": 0.3507472276687622,
"learning_rate": 2.110801905690787e-05,
"loss": 0.4199,
"step": 4500
},
{
"epoch": 3.834745762711864,
"grad_norm": 0.4040756821632385,
"learning_rate": 2.03972979899678e-05,
"loss": 0.4526,
"step": 4525
},
{
"epoch": 3.8559322033898304,
"grad_norm": 0.30861154198646545,
"learning_rate": 1.9696860390746082e-05,
"loss": 0.4152,
"step": 4550
},
{
"epoch": 3.8771186440677967,
"grad_norm": 0.4708113670349121,
"learning_rate": 1.900683817446263e-05,
"loss": 0.4477,
"step": 4575
},
{
"epoch": 3.898305084745763,
"grad_norm": 0.3677612543106079,
"learning_rate": 1.832736129478131e-05,
"loss": 0.4279,
"step": 4600
},
{
"epoch": 3.919491525423729,
"grad_norm": 0.3834724724292755,
"learning_rate": 1.7658557719335652e-05,
"loss": 0.4235,
"step": 4625
},
{
"epoch": 3.940677966101695,
"grad_norm": 0.3320079445838928,
"learning_rate": 1.7000553405628164e-05,
"loss": 0.4103,
"step": 4650
},
{
"epoch": 3.961864406779661,
"grad_norm": 0.4474587142467499,
"learning_rate": 1.6353472277308618e-05,
"loss": 0.4422,
"step": 4675
},
{
"epoch": 3.983050847457627,
"grad_norm": 0.3154617249965668,
"learning_rate": 1.571743620083504e-05,
"loss": 0.4343,
"step": 4700
},
{
"epoch": 4.004237288135593,
"grad_norm": 0.32371950149536133,
"learning_rate": 1.5092564962522388e-05,
"loss": 0.452,
"step": 4725
},
{
"epoch": 4.02542372881356,
"grad_norm": 0.3051236867904663,
"learning_rate": 1.447897624598286e-05,
"loss": 0.4164,
"step": 4750
},
{
"epoch": 4.046610169491525,
"grad_norm": 0.3187614679336548,
"learning_rate": 1.3876785609962218e-05,
"loss": 0.3446,
"step": 4775
},
{
"epoch": 4.067796610169491,
"grad_norm": 0.48843175172805786,
"learning_rate": 1.3286106466576264e-05,
"loss": 0.4296,
"step": 4800
},
{
"epoch": 4.088983050847458,
"grad_norm": 0.3983837068080902,
"learning_rate": 1.2707050059951763e-05,
"loss": 0.344,
"step": 4825
},
{
"epoch": 4.110169491525424,
"grad_norm": 0.300611674785614,
"learning_rate": 1.2139725445275481e-05,
"loss": 0.4169,
"step": 4850
},
{
"epoch": 4.13135593220339,
"grad_norm": 0.4292912781238556,
"learning_rate": 1.158423946825549e-05,
"loss": 0.3689,
"step": 4875
},
{
"epoch": 4.1525423728813555,
"grad_norm": 0.3964712917804718,
"learning_rate": 1.1040696744998754e-05,
"loss": 0.4404,
"step": 4900
},
{
"epoch": 4.173728813559322,
"grad_norm": 0.6776478886604309,
"learning_rate": 1.0509199642308436e-05,
"loss": 0.3979,
"step": 4925
},
{
"epoch": 4.194915254237288,
"grad_norm": 0.396267831325531,
"learning_rate": 9.98984825840486e-06,
"loss": 0.4182,
"step": 4950
},
{
"epoch": 4.216101694915254,
"grad_norm": 0.28718650341033936,
"learning_rate": 9.482740404073851e-06,
"loss": 0.3736,
"step": 4975
},
{
"epoch": 4.237288135593221,
"grad_norm": 0.3323756158351898,
"learning_rate": 8.987971584245729e-06,
"loss": 0.4113,
"step": 5000
},
{
"epoch": 4.258474576271187,
"grad_norm": 0.33957767486572266,
"learning_rate": 8.50563498000856e-06,
"loss": 0.3925,
"step": 5025
},
{
"epoch": 4.279661016949152,
"grad_norm": 0.4178178906440735,
"learning_rate": 8.035821431059244e-06,
"loss": 0.3973,
"step": 5050
},
{
"epoch": 4.3008474576271185,
"grad_norm": 0.3192192614078522,
"learning_rate": 7.578619418595358e-06,
"loss": 0.3605,
"step": 5075
},
{
"epoch": 4.322033898305085,
"grad_norm": 0.4187626540660858,
"learning_rate": 7.1341150486512374e-06,
"loss": 0.4199,
"step": 5100
},
{
"epoch": 4.343220338983051,
"grad_norm": 0.3863602578639984,
"learning_rate": 6.702392035881507e-06,
"loss": 0.3568,
"step": 5125
},
{
"epoch": 4.364406779661017,
"grad_norm": 0.4073178172111511,
"learning_rate": 6.28353168779481e-06,
"loss": 0.4327,
"step": 5150
},
{
"epoch": 4.385593220338983,
"grad_norm": 0.31056177616119385,
"learning_rate": 5.8776128894409305e-06,
"loss": 0.372,
"step": 5175
},
{
"epoch": 4.406779661016949,
"grad_norm": 0.3671024739742279,
"learning_rate": 5.484712088554253e-06,
"loss": 0.4078,
"step": 5200
},
{
"epoch": 4.427966101694915,
"grad_norm": 0.2966119349002838,
"learning_rate": 5.1049032811561196e-06,
"loss": 0.3529,
"step": 5225
},
{
"epoch": 4.4491525423728815,
"grad_norm": 0.3545999526977539,
"learning_rate": 4.7382579976189244e-06,
"loss": 0.3864,
"step": 5250
},
{
"epoch": 4.470338983050848,
"grad_norm": 0.3902367651462555,
"learning_rate": 4.384845289194699e-06,
"loss": 0.3434,
"step": 5275
},
{
"epoch": 4.491525423728813,
"grad_norm": 0.4561343193054199,
"learning_rate": 4.044731715010463e-06,
"loss": 0.371,
"step": 5300
},
{
"epoch": 4.512711864406779,
"grad_norm": 0.29569247364997864,
"learning_rate": 3.717981329532979e-06,
"loss": 0.3957,
"step": 5325
},
{
"epoch": 4.533898305084746,
"grad_norm": 0.3961041271686554,
"learning_rate": 3.4046556705051744e-06,
"loss": 0.3938,
"step": 5350
},
{
"epoch": 4.555084745762712,
"grad_norm": 0.35009700059890747,
"learning_rate": 3.104813747356674e-06,
"loss": 0.3829,
"step": 5375
},
{
"epoch": 4.576271186440678,
"grad_norm": 0.404491662979126,
"learning_rate": 2.8185120300902865e-06,
"loss": 0.3916,
"step": 5400
},
{
"epoch": 4.597457627118644,
"grad_norm": 0.3277469277381897,
"learning_rate": 2.5458044386469727e-06,
"loss": 0.3681,
"step": 5425
},
{
"epoch": 4.61864406779661,
"grad_norm": 0.4005700349807739,
"learning_rate": 2.2867423327508654e-06,
"loss": 0.4249,
"step": 5450
},
{
"epoch": 4.639830508474576,
"grad_norm": 0.30087345838546753,
"learning_rate": 2.0413745022366285e-06,
"loss": 0.3493,
"step": 5475
},
{
"epoch": 4.661016949152542,
"grad_norm": 0.37881365418434143,
"learning_rate": 1.8097471578607164e-06,
"loss": 0.4209,
"step": 5500
},
{
"epoch": 4.682203389830509,
"grad_norm": 0.40850409865379333,
"learning_rate": 1.5919039225983782e-06,
"loss": 0.378,
"step": 5525
},
{
"epoch": 4.703389830508475,
"grad_norm": 0.43627145886421204,
"learning_rate": 1.3878858234280532e-06,
"loss": 0.4131,
"step": 5550
},
{
"epoch": 4.72457627118644,
"grad_norm": 0.3629254400730133,
"learning_rate": 1.1977312836046194e-06,
"loss": 0.3555,
"step": 5575
},
{
"epoch": 4.745762711864407,
"grad_norm": 0.4231952428817749,
"learning_rate": 1.0214761154230643e-06,
"loss": 0.4459,
"step": 5600
},
{
"epoch": 4.766949152542373,
"grad_norm": 0.32848870754241943,
"learning_rate": 8.591535134738814e-07,
"loss": 0.3753,
"step": 5625
},
{
"epoch": 4.788135593220339,
"grad_norm": 0.49593624472618103,
"learning_rate": 7.107940483913943e-07,
"loss": 0.4109,
"step": 5650
},
{
"epoch": 4.809322033898305,
"grad_norm": 0.3230677545070648,
"learning_rate": 5.764256610963636e-07,
"loss": 0.3534,
"step": 5675
},
{
"epoch": 4.830508474576272,
"grad_norm": 0.4409547746181488,
"learning_rate": 4.560736575337787e-07,
"loss": 0.4389,
"step": 5700
},
{
"epoch": 4.851694915254237,
"grad_norm": 0.3816058039665222,
"learning_rate": 3.4976070390692054e-07,
"loss": 0.369,
"step": 5725
},
{
"epoch": 4.872881355932203,
"grad_norm": 0.3902296721935272,
"learning_rate": 2.5750682240857634e-07,
"loss": 0.4134,
"step": 5750
},
{
"epoch": 4.8940677966101696,
"grad_norm": 0.3721590042114258,
"learning_rate": 1.7932938745022218e-07,
"loss": 0.3509,
"step": 5775
},
{
"epoch": 4.915254237288136,
"grad_norm": 0.3812599778175354,
"learning_rate": 1.1524312238984923e-07,
"loss": 0.4109,
"step": 5800
},
{
"epoch": 4.936440677966102,
"grad_norm": 0.39883601665496826,
"learning_rate": 6.526009675905663e-08,
"loss": 0.3768,
"step": 5825
},
{
"epoch": 4.9576271186440675,
"grad_norm": 0.38190439343452454,
"learning_rate": 2.9389723990011495e-08,
"loss": 0.4262,
"step": 5850
},
{
"epoch": 4.978813559322034,
"grad_norm": 0.2927350699901581,
"learning_rate": 7.638759642525361e-09,
"loss": 0.3631,
"step": 5875
},
{
"epoch": 5.0,
"grad_norm": 0.7745693922042847,
"learning_rate": 1.1300131838587468e-11,
"loss": 0.3756,
"step": 5900
}
],
"logging_steps": 25,
"max_steps": 5900,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0720875463474176e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}