DebertaV2-Base-10M_babylm-A__mnli / trainer_state.json
Ar4l's picture
Upload folder using huggingface_hub
baf7106 verified
{
"best_metric": 0.7337000966072083,
"best_model_checkpoint": "/home/ubuntu/utah/babylm-24/src/evaluation/results/finetune/DebertaV2-Base-10M_babylm-A/mnli/checkpoint-147264",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 245440,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010185788787483703,
"grad_norm": 3.972818613052368,
"learning_rate": 2.9938885267275102e-05,
"loss": 1.1058,
"step": 500
},
{
"epoch": 0.020371577574967405,
"grad_norm": 3.13022780418396,
"learning_rate": 2.9877770534550196e-05,
"loss": 1.0388,
"step": 1000
},
{
"epoch": 0.03055736636245111,
"grad_norm": 5.194879055023193,
"learning_rate": 2.9816655801825297e-05,
"loss": 0.999,
"step": 1500
},
{
"epoch": 0.04074315514993481,
"grad_norm": 6.858974933624268,
"learning_rate": 2.975554106910039e-05,
"loss": 0.9637,
"step": 2000
},
{
"epoch": 0.050928943937418515,
"grad_norm": 7.074331760406494,
"learning_rate": 2.9694426336375492e-05,
"loss": 0.9697,
"step": 2500
},
{
"epoch": 0.06111473272490222,
"grad_norm": 5.386518478393555,
"learning_rate": 2.9633311603650586e-05,
"loss": 0.9472,
"step": 3000
},
{
"epoch": 0.07130052151238592,
"grad_norm": 5.582085609436035,
"learning_rate": 2.9572196870925687e-05,
"loss": 0.9273,
"step": 3500
},
{
"epoch": 0.08148631029986962,
"grad_norm": 10.966410636901855,
"learning_rate": 2.951108213820078e-05,
"loss": 0.9316,
"step": 4000
},
{
"epoch": 0.09167209908735333,
"grad_norm": 7.840672492980957,
"learning_rate": 2.9449967405475882e-05,
"loss": 0.9237,
"step": 4500
},
{
"epoch": 0.10185788787483703,
"grad_norm": 7.442521095275879,
"learning_rate": 2.9388852672750976e-05,
"loss": 0.918,
"step": 5000
},
{
"epoch": 0.11204367666232073,
"grad_norm": 8.379261016845703,
"learning_rate": 2.9327737940026077e-05,
"loss": 0.8798,
"step": 5500
},
{
"epoch": 0.12222946544980444,
"grad_norm": 6.880769729614258,
"learning_rate": 2.926662320730117e-05,
"loss": 0.9045,
"step": 6000
},
{
"epoch": 0.13241525423728814,
"grad_norm": 5.965817451477051,
"learning_rate": 2.9205508474576272e-05,
"loss": 0.8941,
"step": 6500
},
{
"epoch": 0.14260104302477183,
"grad_norm": 9.67165756225586,
"learning_rate": 2.914439374185137e-05,
"loss": 0.878,
"step": 7000
},
{
"epoch": 0.15278683181225555,
"grad_norm": 6.4934892654418945,
"learning_rate": 2.9083279009126467e-05,
"loss": 0.8723,
"step": 7500
},
{
"epoch": 0.16297262059973924,
"grad_norm": 11.987072944641113,
"learning_rate": 2.9022164276401565e-05,
"loss": 0.8913,
"step": 8000
},
{
"epoch": 0.17315840938722293,
"grad_norm": 9.218280792236328,
"learning_rate": 2.8961049543676662e-05,
"loss": 0.868,
"step": 8500
},
{
"epoch": 0.18334419817470665,
"grad_norm": 6.697582244873047,
"learning_rate": 2.8899934810951763e-05,
"loss": 0.8618,
"step": 9000
},
{
"epoch": 0.19352998696219034,
"grad_norm": 8.035455703735352,
"learning_rate": 2.8838820078226858e-05,
"loss": 0.8641,
"step": 9500
},
{
"epoch": 0.20371577574967406,
"grad_norm": 6.890414237976074,
"learning_rate": 2.877770534550196e-05,
"loss": 0.8574,
"step": 10000
},
{
"epoch": 0.21390156453715775,
"grad_norm": 9.001678466796875,
"learning_rate": 2.8716590612777053e-05,
"loss": 0.8547,
"step": 10500
},
{
"epoch": 0.22408735332464147,
"grad_norm": 7.271191120147705,
"learning_rate": 2.8655475880052154e-05,
"loss": 0.8332,
"step": 11000
},
{
"epoch": 0.23427314211212516,
"grad_norm": 9.849822998046875,
"learning_rate": 2.8594361147327248e-05,
"loss": 0.8447,
"step": 11500
},
{
"epoch": 0.24445893089960888,
"grad_norm": 15.07829761505127,
"learning_rate": 2.853324641460235e-05,
"loss": 0.8251,
"step": 12000
},
{
"epoch": 0.25464471968709257,
"grad_norm": 6.886602878570557,
"learning_rate": 2.8472131681877446e-05,
"loss": 0.8444,
"step": 12500
},
{
"epoch": 0.2648305084745763,
"grad_norm": 9.94825267791748,
"learning_rate": 2.8411016949152544e-05,
"loss": 0.8504,
"step": 13000
},
{
"epoch": 0.27501629726205995,
"grad_norm": 8.0740966796875,
"learning_rate": 2.834990221642764e-05,
"loss": 0.8182,
"step": 13500
},
{
"epoch": 0.28520208604954367,
"grad_norm": 10.770129203796387,
"learning_rate": 2.828878748370274e-05,
"loss": 0.8373,
"step": 14000
},
{
"epoch": 0.2953878748370274,
"grad_norm": 9.214078903198242,
"learning_rate": 2.8227672750977836e-05,
"loss": 0.8089,
"step": 14500
},
{
"epoch": 0.3055736636245111,
"grad_norm": 14.231884956359863,
"learning_rate": 2.8166558018252934e-05,
"loss": 0.8337,
"step": 15000
},
{
"epoch": 0.31575945241199477,
"grad_norm": 9.379185676574707,
"learning_rate": 2.810544328552803e-05,
"loss": 0.8315,
"step": 15500
},
{
"epoch": 0.3259452411994785,
"grad_norm": 8.370843887329102,
"learning_rate": 2.804432855280313e-05,
"loss": 0.808,
"step": 16000
},
{
"epoch": 0.3361310299869622,
"grad_norm": 7.201444149017334,
"learning_rate": 2.7983213820078227e-05,
"loss": 0.808,
"step": 16500
},
{
"epoch": 0.34631681877444587,
"grad_norm": 10.696725845336914,
"learning_rate": 2.7922099087353324e-05,
"loss": 0.813,
"step": 17000
},
{
"epoch": 0.3565026075619296,
"grad_norm": 6.316204071044922,
"learning_rate": 2.7860984354628422e-05,
"loss": 0.8213,
"step": 17500
},
{
"epoch": 0.3666883963494133,
"grad_norm": 14.780086517333984,
"learning_rate": 2.779986962190352e-05,
"loss": 0.8165,
"step": 18000
},
{
"epoch": 0.376874185136897,
"grad_norm": 11.34945011138916,
"learning_rate": 2.773875488917862e-05,
"loss": 0.7943,
"step": 18500
},
{
"epoch": 0.3870599739243807,
"grad_norm": 7.136452674865723,
"learning_rate": 2.7677640156453718e-05,
"loss": 0.8017,
"step": 19000
},
{
"epoch": 0.3972457627118644,
"grad_norm": 6.209742069244385,
"learning_rate": 2.7616525423728815e-05,
"loss": 0.7941,
"step": 19500
},
{
"epoch": 0.4074315514993481,
"grad_norm": 8.923270225524902,
"learning_rate": 2.7555410691003913e-05,
"loss": 0.7826,
"step": 20000
},
{
"epoch": 0.41761734028683184,
"grad_norm": 6.371203899383545,
"learning_rate": 2.749429595827901e-05,
"loss": 0.7938,
"step": 20500
},
{
"epoch": 0.4278031290743155,
"grad_norm": 8.679354667663574,
"learning_rate": 2.7433181225554108e-05,
"loss": 0.7936,
"step": 21000
},
{
"epoch": 0.4379889178617992,
"grad_norm": 5.962934970855713,
"learning_rate": 2.7372066492829205e-05,
"loss": 0.7924,
"step": 21500
},
{
"epoch": 0.44817470664928294,
"grad_norm": 11.835200309753418,
"learning_rate": 2.7310951760104303e-05,
"loss": 0.7894,
"step": 22000
},
{
"epoch": 0.4583604954367666,
"grad_norm": 9.62806510925293,
"learning_rate": 2.72498370273794e-05,
"loss": 0.7963,
"step": 22500
},
{
"epoch": 0.4685462842242503,
"grad_norm": 7.6390509605407715,
"learning_rate": 2.7188722294654498e-05,
"loss": 0.783,
"step": 23000
},
{
"epoch": 0.47873207301173404,
"grad_norm": 5.826275825500488,
"learning_rate": 2.7127607561929596e-05,
"loss": 0.7859,
"step": 23500
},
{
"epoch": 0.48891786179921776,
"grad_norm": 5.199416637420654,
"learning_rate": 2.7066492829204693e-05,
"loss": 0.7989,
"step": 24000
},
{
"epoch": 0.4991036505867014,
"grad_norm": 12.38355541229248,
"learning_rate": 2.7005378096479794e-05,
"loss": 0.7864,
"step": 24500
},
{
"epoch": 0.5092894393741851,
"grad_norm": 7.700937271118164,
"learning_rate": 2.694426336375489e-05,
"loss": 0.7747,
"step": 25000
},
{
"epoch": 0.5194752281616688,
"grad_norm": 10.8720703125,
"learning_rate": 2.688314863102999e-05,
"loss": 0.7974,
"step": 25500
},
{
"epoch": 0.5296610169491526,
"grad_norm": 6.260867595672607,
"learning_rate": 2.6822033898305083e-05,
"loss": 0.7914,
"step": 26000
},
{
"epoch": 0.5398468057366362,
"grad_norm": 11.113471984863281,
"learning_rate": 2.6760919165580184e-05,
"loss": 0.782,
"step": 26500
},
{
"epoch": 0.5500325945241199,
"grad_norm": 7.497796058654785,
"learning_rate": 2.6699804432855282e-05,
"loss": 0.7642,
"step": 27000
},
{
"epoch": 0.5602183833116037,
"grad_norm": 6.1969170570373535,
"learning_rate": 2.663868970013038e-05,
"loss": 0.7693,
"step": 27500
},
{
"epoch": 0.5704041720990873,
"grad_norm": 9.965928077697754,
"learning_rate": 2.6577574967405477e-05,
"loss": 0.7681,
"step": 28000
},
{
"epoch": 0.5805899608865711,
"grad_norm": 5.052292346954346,
"learning_rate": 2.6516460234680575e-05,
"loss": 0.7664,
"step": 28500
},
{
"epoch": 0.5907757496740548,
"grad_norm": 6.7289276123046875,
"learning_rate": 2.6455345501955672e-05,
"loss": 0.7445,
"step": 29000
},
{
"epoch": 0.6009615384615384,
"grad_norm": 8.478078842163086,
"learning_rate": 2.639423076923077e-05,
"loss": 0.7859,
"step": 29500
},
{
"epoch": 0.6111473272490222,
"grad_norm": 6.2336106300354,
"learning_rate": 2.6333116036505867e-05,
"loss": 0.7598,
"step": 30000
},
{
"epoch": 0.6213331160365059,
"grad_norm": 11.82295036315918,
"learning_rate": 2.6272001303780965e-05,
"loss": 0.7507,
"step": 30500
},
{
"epoch": 0.6315189048239895,
"grad_norm": 8.545003890991211,
"learning_rate": 2.6210886571056066e-05,
"loss": 0.7769,
"step": 31000
},
{
"epoch": 0.6417046936114733,
"grad_norm": 9.849065780639648,
"learning_rate": 2.614977183833116e-05,
"loss": 0.7781,
"step": 31500
},
{
"epoch": 0.651890482398957,
"grad_norm": 7.641623020172119,
"learning_rate": 2.608865710560626e-05,
"loss": 0.7789,
"step": 32000
},
{
"epoch": 0.6620762711864406,
"grad_norm": 16.397785186767578,
"learning_rate": 2.6027542372881355e-05,
"loss": 0.7542,
"step": 32500
},
{
"epoch": 0.6722620599739244,
"grad_norm": 11.119671821594238,
"learning_rate": 2.5966427640156456e-05,
"loss": 0.7716,
"step": 33000
},
{
"epoch": 0.6824478487614081,
"grad_norm": 12.466675758361816,
"learning_rate": 2.590531290743155e-05,
"loss": 0.7743,
"step": 33500
},
{
"epoch": 0.6926336375488917,
"grad_norm": 8.151625633239746,
"learning_rate": 2.584419817470665e-05,
"loss": 0.7581,
"step": 34000
},
{
"epoch": 0.7028194263363755,
"grad_norm": 7.743143558502197,
"learning_rate": 2.5783083441981745e-05,
"loss": 0.7599,
"step": 34500
},
{
"epoch": 0.7130052151238592,
"grad_norm": 5.249680995941162,
"learning_rate": 2.5721968709256846e-05,
"loss": 0.7483,
"step": 35000
},
{
"epoch": 0.7231910039113429,
"grad_norm": 6.738178730010986,
"learning_rate": 2.566085397653194e-05,
"loss": 0.7781,
"step": 35500
},
{
"epoch": 0.7333767926988266,
"grad_norm": 6.7923102378845215,
"learning_rate": 2.559973924380704e-05,
"loss": 0.7464,
"step": 36000
},
{
"epoch": 0.7435625814863103,
"grad_norm": 15.575695991516113,
"learning_rate": 2.553862451108214e-05,
"loss": 0.7571,
"step": 36500
},
{
"epoch": 0.753748370273794,
"grad_norm": 13.128719329833984,
"learning_rate": 2.5477509778357236e-05,
"loss": 0.7625,
"step": 37000
},
{
"epoch": 0.7639341590612777,
"grad_norm": 4.938621997833252,
"learning_rate": 2.5416395045632337e-05,
"loss": 0.7477,
"step": 37500
},
{
"epoch": 0.7741199478487614,
"grad_norm": 8.46985912322998,
"learning_rate": 2.535528031290743e-05,
"loss": 0.7419,
"step": 38000
},
{
"epoch": 0.7843057366362451,
"grad_norm": 18.881208419799805,
"learning_rate": 2.5294165580182532e-05,
"loss": 0.7403,
"step": 38500
},
{
"epoch": 0.7944915254237288,
"grad_norm": 9.986156463623047,
"learning_rate": 2.5233050847457626e-05,
"loss": 0.7508,
"step": 39000
},
{
"epoch": 0.8046773142112125,
"grad_norm": 11.625401496887207,
"learning_rate": 2.5171936114732727e-05,
"loss": 0.7426,
"step": 39500
},
{
"epoch": 0.8148631029986962,
"grad_norm": 9.926522254943848,
"learning_rate": 2.511082138200782e-05,
"loss": 0.7365,
"step": 40000
},
{
"epoch": 0.8250488917861799,
"grad_norm": 8.104277610778809,
"learning_rate": 2.5049706649282922e-05,
"loss": 0.7532,
"step": 40500
},
{
"epoch": 0.8352346805736637,
"grad_norm": 5.898177623748779,
"learning_rate": 2.4988591916558017e-05,
"loss": 0.7657,
"step": 41000
},
{
"epoch": 0.8454204693611473,
"grad_norm": 6.856070041656494,
"learning_rate": 2.4927477183833118e-05,
"loss": 0.7438,
"step": 41500
},
{
"epoch": 0.855606258148631,
"grad_norm": 11.806628227233887,
"learning_rate": 2.4866362451108212e-05,
"loss": 0.7348,
"step": 42000
},
{
"epoch": 0.8657920469361148,
"grad_norm": 12.08478832244873,
"learning_rate": 2.4805247718383313e-05,
"loss": 0.7277,
"step": 42500
},
{
"epoch": 0.8759778357235984,
"grad_norm": 5.707648277282715,
"learning_rate": 2.474413298565841e-05,
"loss": 0.7458,
"step": 43000
},
{
"epoch": 0.8861636245110821,
"grad_norm": 5.36006498336792,
"learning_rate": 2.4683018252933508e-05,
"loss": 0.7446,
"step": 43500
},
{
"epoch": 0.8963494132985659,
"grad_norm": 6.603504180908203,
"learning_rate": 2.4621903520208605e-05,
"loss": 0.7278,
"step": 44000
},
{
"epoch": 0.9065352020860495,
"grad_norm": 7.789953231811523,
"learning_rate": 2.4560788787483703e-05,
"loss": 0.7375,
"step": 44500
},
{
"epoch": 0.9167209908735332,
"grad_norm": 11.424116134643555,
"learning_rate": 2.4499674054758804e-05,
"loss": 0.7297,
"step": 45000
},
{
"epoch": 0.926906779661017,
"grad_norm": 6.7079620361328125,
"learning_rate": 2.4438559322033898e-05,
"loss": 0.7411,
"step": 45500
},
{
"epoch": 0.9370925684485006,
"grad_norm": 11.931846618652344,
"learning_rate": 2.4377444589309e-05,
"loss": 0.749,
"step": 46000
},
{
"epoch": 0.9472783572359843,
"grad_norm": 8.87076473236084,
"learning_rate": 2.4316329856584093e-05,
"loss": 0.7312,
"step": 46500
},
{
"epoch": 0.9574641460234681,
"grad_norm": 12.787999153137207,
"learning_rate": 2.4255215123859194e-05,
"loss": 0.7244,
"step": 47000
},
{
"epoch": 0.9676499348109517,
"grad_norm": 6.917387962341309,
"learning_rate": 2.4194100391134288e-05,
"loss": 0.733,
"step": 47500
},
{
"epoch": 0.9778357235984355,
"grad_norm": 7.564793109893799,
"learning_rate": 2.413298565840939e-05,
"loss": 0.7253,
"step": 48000
},
{
"epoch": 0.9880215123859192,
"grad_norm": 7.665174961090088,
"learning_rate": 2.4071870925684483e-05,
"loss": 0.7191,
"step": 48500
},
{
"epoch": 0.9982073011734028,
"grad_norm": 9.235818862915039,
"learning_rate": 2.4010756192959584e-05,
"loss": 0.7288,
"step": 49000
},
{
"epoch": 1.0,
"eval_accuracy": 0.6841890811920166,
"eval_loss": 0.7356013059616089,
"eval_runtime": 8.0037,
"eval_samples_per_second": 613.213,
"eval_steps_per_second": 76.714,
"step": 49088
},
{
"epoch": 1.0083930899608866,
"grad_norm": 9.232441902160645,
"learning_rate": 2.3949641460234682e-05,
"loss": 0.666,
"step": 49500
},
{
"epoch": 1.0185788787483703,
"grad_norm": 5.098124980926514,
"learning_rate": 2.388852672750978e-05,
"loss": 0.6762,
"step": 50000
},
{
"epoch": 1.028764667535854,
"grad_norm": 6.507536888122559,
"learning_rate": 2.3827411994784877e-05,
"loss": 0.6509,
"step": 50500
},
{
"epoch": 1.0389504563233376,
"grad_norm": 12.516643524169922,
"learning_rate": 2.3766297262059974e-05,
"loss": 0.6569,
"step": 51000
},
{
"epoch": 1.0491362451108215,
"grad_norm": 11.644853591918945,
"learning_rate": 2.3705182529335072e-05,
"loss": 0.6498,
"step": 51500
},
{
"epoch": 1.0593220338983051,
"grad_norm": 15.303723335266113,
"learning_rate": 2.364406779661017e-05,
"loss": 0.6565,
"step": 52000
},
{
"epoch": 1.0695078226857888,
"grad_norm": 11.212811470031738,
"learning_rate": 2.3582953063885267e-05,
"loss": 0.6492,
"step": 52500
},
{
"epoch": 1.0796936114732725,
"grad_norm": 7.695071220397949,
"learning_rate": 2.3521838331160365e-05,
"loss": 0.6618,
"step": 53000
},
{
"epoch": 1.0898794002607561,
"grad_norm": 8.234328269958496,
"learning_rate": 2.3460723598435462e-05,
"loss": 0.6709,
"step": 53500
},
{
"epoch": 1.1000651890482398,
"grad_norm": 12.574545860290527,
"learning_rate": 2.339960886571056e-05,
"loss": 0.6518,
"step": 54000
},
{
"epoch": 1.1102509778357237,
"grad_norm": 5.914109706878662,
"learning_rate": 2.333849413298566e-05,
"loss": 0.6494,
"step": 54500
},
{
"epoch": 1.1204367666232073,
"grad_norm": 11.49263858795166,
"learning_rate": 2.3277379400260758e-05,
"loss": 0.6723,
"step": 55000
},
{
"epoch": 1.130622555410691,
"grad_norm": 11.228019714355469,
"learning_rate": 2.3216264667535856e-05,
"loss": 0.6328,
"step": 55500
},
{
"epoch": 1.1408083441981747,
"grad_norm": 14.478355407714844,
"learning_rate": 2.3155149934810953e-05,
"loss": 0.667,
"step": 56000
},
{
"epoch": 1.1509941329856583,
"grad_norm": 18.817468643188477,
"learning_rate": 2.309403520208605e-05,
"loss": 0.6674,
"step": 56500
},
{
"epoch": 1.161179921773142,
"grad_norm": 12.483678817749023,
"learning_rate": 2.303292046936115e-05,
"loss": 0.6677,
"step": 57000
},
{
"epoch": 1.1713657105606259,
"grad_norm": 7.133495807647705,
"learning_rate": 2.2971805736636246e-05,
"loss": 0.642,
"step": 57500
},
{
"epoch": 1.1815514993481095,
"grad_norm": 7.099282741546631,
"learning_rate": 2.2910691003911343e-05,
"loss": 0.664,
"step": 58000
},
{
"epoch": 1.1917372881355932,
"grad_norm": 9.188867568969727,
"learning_rate": 2.284957627118644e-05,
"loss": 0.6703,
"step": 58500
},
{
"epoch": 1.2019230769230769,
"grad_norm": 5.470687389373779,
"learning_rate": 2.278846153846154e-05,
"loss": 0.6575,
"step": 59000
},
{
"epoch": 1.2121088657105605,
"grad_norm": 8.013532638549805,
"learning_rate": 2.2727346805736636e-05,
"loss": 0.6465,
"step": 59500
},
{
"epoch": 1.2222946544980444,
"grad_norm": 8.846539497375488,
"learning_rate": 2.2666232073011734e-05,
"loss": 0.6682,
"step": 60000
},
{
"epoch": 1.232480443285528,
"grad_norm": 11.2225341796875,
"learning_rate": 2.260511734028683e-05,
"loss": 0.653,
"step": 60500
},
{
"epoch": 1.2426662320730117,
"grad_norm": 12.363032341003418,
"learning_rate": 2.254400260756193e-05,
"loss": 0.6807,
"step": 61000
},
{
"epoch": 1.2528520208604954,
"grad_norm": 8.01014232635498,
"learning_rate": 2.248288787483703e-05,
"loss": 0.6625,
"step": 61500
},
{
"epoch": 1.263037809647979,
"grad_norm": 20.960845947265625,
"learning_rate": 2.2421773142112124e-05,
"loss": 0.6524,
"step": 62000
},
{
"epoch": 1.2732235984354627,
"grad_norm": 13.620718002319336,
"learning_rate": 2.2360658409387225e-05,
"loss": 0.6625,
"step": 62500
},
{
"epoch": 1.2834093872229466,
"grad_norm": 18.55965232849121,
"learning_rate": 2.2299543676662322e-05,
"loss": 0.6434,
"step": 63000
},
{
"epoch": 1.2935951760104303,
"grad_norm": 12.919173240661621,
"learning_rate": 2.223842894393742e-05,
"loss": 0.652,
"step": 63500
},
{
"epoch": 1.303780964797914,
"grad_norm": 19.561386108398438,
"learning_rate": 2.2177314211212517e-05,
"loss": 0.6498,
"step": 64000
},
{
"epoch": 1.3139667535853976,
"grad_norm": 10.529995918273926,
"learning_rate": 2.2116199478487615e-05,
"loss": 0.666,
"step": 64500
},
{
"epoch": 1.3241525423728815,
"grad_norm": 13.268050193786621,
"learning_rate": 2.2055084745762713e-05,
"loss": 0.6499,
"step": 65000
},
{
"epoch": 1.3343383311603652,
"grad_norm": 6.3072285652160645,
"learning_rate": 2.199397001303781e-05,
"loss": 0.6798,
"step": 65500
},
{
"epoch": 1.3445241199478488,
"grad_norm": 5.304599761962891,
"learning_rate": 2.1932855280312908e-05,
"loss": 0.6559,
"step": 66000
},
{
"epoch": 1.3547099087353325,
"grad_norm": 13.698792457580566,
"learning_rate": 2.1871740547588005e-05,
"loss": 0.6441,
"step": 66500
},
{
"epoch": 1.3648956975228161,
"grad_norm": 8.951632499694824,
"learning_rate": 2.1810625814863103e-05,
"loss": 0.6564,
"step": 67000
},
{
"epoch": 1.3750814863102998,
"grad_norm": 5.84067440032959,
"learning_rate": 2.17495110821382e-05,
"loss": 0.6686,
"step": 67500
},
{
"epoch": 1.3852672750977835,
"grad_norm": 6.442080497741699,
"learning_rate": 2.16883963494133e-05,
"loss": 0.6517,
"step": 68000
},
{
"epoch": 1.3954530638852674,
"grad_norm": 5.514212608337402,
"learning_rate": 2.1627281616688395e-05,
"loss": 0.6664,
"step": 68500
},
{
"epoch": 1.405638852672751,
"grad_norm": 14.150158882141113,
"learning_rate": 2.1566166883963496e-05,
"loss": 0.6561,
"step": 69000
},
{
"epoch": 1.4158246414602347,
"grad_norm": 17.196884155273438,
"learning_rate": 2.150505215123859e-05,
"loss": 0.6513,
"step": 69500
},
{
"epoch": 1.4260104302477183,
"grad_norm": 6.181870937347412,
"learning_rate": 2.144393741851369e-05,
"loss": 0.65,
"step": 70000
},
{
"epoch": 1.436196219035202,
"grad_norm": 14.865707397460938,
"learning_rate": 2.1382822685788786e-05,
"loss": 0.6596,
"step": 70500
},
{
"epoch": 1.4463820078226859,
"grad_norm": 15.808574676513672,
"learning_rate": 2.1321707953063886e-05,
"loss": 0.6559,
"step": 71000
},
{
"epoch": 1.4565677966101696,
"grad_norm": 16.76003074645996,
"learning_rate": 2.1260593220338984e-05,
"loss": 0.6467,
"step": 71500
},
{
"epoch": 1.4667535853976532,
"grad_norm": 21.466825485229492,
"learning_rate": 2.119947848761408e-05,
"loss": 0.6579,
"step": 72000
},
{
"epoch": 1.4769393741851369,
"grad_norm": 18.81052589416504,
"learning_rate": 2.113836375488918e-05,
"loss": 0.6699,
"step": 72500
},
{
"epoch": 1.4871251629726205,
"grad_norm": 10.334957122802734,
"learning_rate": 2.1077249022164277e-05,
"loss": 0.655,
"step": 73000
},
{
"epoch": 1.4973109517601042,
"grad_norm": 14.836852073669434,
"learning_rate": 2.1016134289439378e-05,
"loss": 0.66,
"step": 73500
},
{
"epoch": 1.5074967405475879,
"grad_norm": 21.439233779907227,
"learning_rate": 2.0955019556714472e-05,
"loss": 0.6486,
"step": 74000
},
{
"epoch": 1.5176825293350718,
"grad_norm": 21.09993553161621,
"learning_rate": 2.0893904823989573e-05,
"loss": 0.6432,
"step": 74500
},
{
"epoch": 1.5278683181225554,
"grad_norm": 8.926955223083496,
"learning_rate": 2.0832790091264667e-05,
"loss": 0.6418,
"step": 75000
},
{
"epoch": 1.538054106910039,
"grad_norm": 3.551163911819458,
"learning_rate": 2.0771675358539768e-05,
"loss": 0.6418,
"step": 75500
},
{
"epoch": 1.548239895697523,
"grad_norm": 10.46932315826416,
"learning_rate": 2.0710560625814862e-05,
"loss": 0.657,
"step": 76000
},
{
"epoch": 1.5584256844850066,
"grad_norm": 10.022995948791504,
"learning_rate": 2.0649445893089963e-05,
"loss": 0.6564,
"step": 76500
},
{
"epoch": 1.5686114732724903,
"grad_norm": 15.299884796142578,
"learning_rate": 2.0588331160365057e-05,
"loss": 0.6535,
"step": 77000
},
{
"epoch": 1.578797262059974,
"grad_norm": 11.149444580078125,
"learning_rate": 2.0527216427640158e-05,
"loss": 0.6409,
"step": 77500
},
{
"epoch": 1.5889830508474576,
"grad_norm": 4.011179447174072,
"learning_rate": 2.0466101694915252e-05,
"loss": 0.6299,
"step": 78000
},
{
"epoch": 1.5991688396349413,
"grad_norm": 13.744772911071777,
"learning_rate": 2.0404986962190353e-05,
"loss": 0.6511,
"step": 78500
},
{
"epoch": 1.609354628422425,
"grad_norm": 11.129698753356934,
"learning_rate": 2.0343872229465447e-05,
"loss": 0.6349,
"step": 79000
},
{
"epoch": 1.6195404172099086,
"grad_norm": 16.331953048706055,
"learning_rate": 2.0282757496740548e-05,
"loss": 0.6517,
"step": 79500
},
{
"epoch": 1.6297262059973925,
"grad_norm": 12.816121101379395,
"learning_rate": 2.0221642764015646e-05,
"loss": 0.6409,
"step": 80000
},
{
"epoch": 1.6399119947848761,
"grad_norm": 13.881726264953613,
"learning_rate": 2.0160528031290743e-05,
"loss": 0.6606,
"step": 80500
},
{
"epoch": 1.6500977835723598,
"grad_norm": 14.215683937072754,
"learning_rate": 2.0099413298565844e-05,
"loss": 0.657,
"step": 81000
},
{
"epoch": 1.6602835723598437,
"grad_norm": 19.557342529296875,
"learning_rate": 2.003829856584094e-05,
"loss": 0.6451,
"step": 81500
},
{
"epoch": 1.6704693611473274,
"grad_norm": 17.83816909790039,
"learning_rate": 1.997718383311604e-05,
"loss": 0.6397,
"step": 82000
},
{
"epoch": 1.680655149934811,
"grad_norm": 8.789198875427246,
"learning_rate": 1.9916069100391133e-05,
"loss": 0.6514,
"step": 82500
},
{
"epoch": 1.6908409387222947,
"grad_norm": 11.361543655395508,
"learning_rate": 1.9854954367666234e-05,
"loss": 0.6466,
"step": 83000
},
{
"epoch": 1.7010267275097783,
"grad_norm": 15.630517959594727,
"learning_rate": 1.979383963494133e-05,
"loss": 0.6316,
"step": 83500
},
{
"epoch": 1.711212516297262,
"grad_norm": 10.55432415008545,
"learning_rate": 1.973272490221643e-05,
"loss": 0.6533,
"step": 84000
},
{
"epoch": 1.7213983050847457,
"grad_norm": 8.428609848022461,
"learning_rate": 1.9671610169491524e-05,
"loss": 0.6478,
"step": 84500
},
{
"epoch": 1.7315840938722293,
"grad_norm": 4.304576873779297,
"learning_rate": 1.9610495436766625e-05,
"loss": 0.6449,
"step": 85000
},
{
"epoch": 1.7417698826597132,
"grad_norm": 21.92937469482422,
"learning_rate": 1.9549380704041722e-05,
"loss": 0.6425,
"step": 85500
},
{
"epoch": 1.7519556714471969,
"grad_norm": 19.813392639160156,
"learning_rate": 1.948826597131682e-05,
"loss": 0.6238,
"step": 86000
},
{
"epoch": 1.7621414602346805,
"grad_norm": 6.831646919250488,
"learning_rate": 1.9427151238591917e-05,
"loss": 0.6553,
"step": 86500
},
{
"epoch": 1.7723272490221644,
"grad_norm": 11.058158874511719,
"learning_rate": 1.9366036505867015e-05,
"loss": 0.6528,
"step": 87000
},
{
"epoch": 1.782513037809648,
"grad_norm": 7.01440954208374,
"learning_rate": 1.9304921773142112e-05,
"loss": 0.6506,
"step": 87500
},
{
"epoch": 1.7926988265971318,
"grad_norm": 4.963765621185303,
"learning_rate": 1.924380704041721e-05,
"loss": 0.6412,
"step": 88000
},
{
"epoch": 1.8028846153846154,
"grad_norm": 38.63767623901367,
"learning_rate": 1.9182692307692307e-05,
"loss": 0.6629,
"step": 88500
},
{
"epoch": 1.813070404172099,
"grad_norm": 11.423843383789062,
"learning_rate": 1.9121577574967405e-05,
"loss": 0.673,
"step": 89000
},
{
"epoch": 1.8232561929595827,
"grad_norm": 11.73025894165039,
"learning_rate": 1.9060462842242506e-05,
"loss": 0.625,
"step": 89500
},
{
"epoch": 1.8334419817470664,
"grad_norm": 11.493837356567383,
"learning_rate": 1.89993481095176e-05,
"loss": 0.6658,
"step": 90000
},
{
"epoch": 1.84362777053455,
"grad_norm": 12.196702003479004,
"learning_rate": 1.89382333767927e-05,
"loss": 0.6451,
"step": 90500
},
{
"epoch": 1.8538135593220337,
"grad_norm": 9.690689086914062,
"learning_rate": 1.8877118644067795e-05,
"loss": 0.6282,
"step": 91000
},
{
"epoch": 1.8639993481095176,
"grad_norm": 13.679101943969727,
"learning_rate": 1.8816003911342896e-05,
"loss": 0.6328,
"step": 91500
},
{
"epoch": 1.8741851368970013,
"grad_norm": 14.264899253845215,
"learning_rate": 1.8754889178617994e-05,
"loss": 0.6355,
"step": 92000
},
{
"epoch": 1.8843709256844852,
"grad_norm": 9.039682388305664,
"learning_rate": 1.869377444589309e-05,
"loss": 0.6471,
"step": 92500
},
{
"epoch": 1.8945567144719688,
"grad_norm": 9.861899375915527,
"learning_rate": 1.863265971316819e-05,
"loss": 0.6549,
"step": 93000
},
{
"epoch": 1.9047425032594525,
"grad_norm": 14.948457717895508,
"learning_rate": 1.8571544980443286e-05,
"loss": 0.6436,
"step": 93500
},
{
"epoch": 1.9149282920469362,
"grad_norm": 15.613487243652344,
"learning_rate": 1.8510430247718384e-05,
"loss": 0.6355,
"step": 94000
},
{
"epoch": 1.9251140808344198,
"grad_norm": 14.105792045593262,
"learning_rate": 1.844931551499348e-05,
"loss": 0.6345,
"step": 94500
},
{
"epoch": 1.9352998696219035,
"grad_norm": 7.004807472229004,
"learning_rate": 1.838820078226858e-05,
"loss": 0.6456,
"step": 95000
},
{
"epoch": 1.9454856584093871,
"grad_norm": 10.506879806518555,
"learning_rate": 1.8327086049543677e-05,
"loss": 0.6303,
"step": 95500
},
{
"epoch": 1.9556714471968708,
"grad_norm": 11.595491409301758,
"learning_rate": 1.8265971316818774e-05,
"loss": 0.6509,
"step": 96000
},
{
"epoch": 1.9658572359843545,
"grad_norm": 11.29542064666748,
"learning_rate": 1.820485658409387e-05,
"loss": 0.6273,
"step": 96500
},
{
"epoch": 1.9760430247718384,
"grad_norm": 4.132607460021973,
"learning_rate": 1.814374185136897e-05,
"loss": 0.643,
"step": 97000
},
{
"epoch": 1.986228813559322,
"grad_norm": 9.745738983154297,
"learning_rate": 1.8082627118644067e-05,
"loss": 0.6656,
"step": 97500
},
{
"epoch": 1.996414602346806,
"grad_norm": 4.289452075958252,
"learning_rate": 1.8021512385919164e-05,
"loss": 0.6367,
"step": 98000
},
{
"epoch": 2.0,
"eval_accuracy": 0.7218826413154602,
"eval_loss": 0.6872764825820923,
"eval_runtime": 7.6175,
"eval_samples_per_second": 644.307,
"eval_steps_per_second": 80.604,
"step": 98176
},
{
"epoch": 2.0066003911342896,
"grad_norm": 19.260536193847656,
"learning_rate": 1.7960397653194265e-05,
"loss": 0.5568,
"step": 98500
},
{
"epoch": 2.0167861799217732,
"grad_norm": 8.347018241882324,
"learning_rate": 1.7899282920469363e-05,
"loss": 0.5344,
"step": 99000
},
{
"epoch": 2.026971968709257,
"grad_norm": 12.238682746887207,
"learning_rate": 1.783816818774446e-05,
"loss": 0.5685,
"step": 99500
},
{
"epoch": 2.0371577574967406,
"grad_norm": 18.42786979675293,
"learning_rate": 1.7777053455019558e-05,
"loss": 0.5539,
"step": 100000
},
{
"epoch": 2.047343546284224,
"grad_norm": 25.627717971801758,
"learning_rate": 1.7715938722294655e-05,
"loss": 0.5393,
"step": 100500
},
{
"epoch": 2.057529335071708,
"grad_norm": 15.942806243896484,
"learning_rate": 1.7654823989569753e-05,
"loss": 0.5191,
"step": 101000
},
{
"epoch": 2.0677151238591915,
"grad_norm": 7.591663360595703,
"learning_rate": 1.759370925684485e-05,
"loss": 0.5428,
"step": 101500
},
{
"epoch": 2.077900912646675,
"grad_norm": 7.183356285095215,
"learning_rate": 1.7532594524119948e-05,
"loss": 0.5475,
"step": 102000
},
{
"epoch": 2.088086701434159,
"grad_norm": 19.916288375854492,
"learning_rate": 1.7471479791395046e-05,
"loss": 0.5586,
"step": 102500
},
{
"epoch": 2.098272490221643,
"grad_norm": 9.07480239868164,
"learning_rate": 1.7410365058670143e-05,
"loss": 0.5564,
"step": 103000
},
{
"epoch": 2.1084582790091266,
"grad_norm": 7.647058010101318,
"learning_rate": 1.734925032594524e-05,
"loss": 0.543,
"step": 103500
},
{
"epoch": 2.1186440677966103,
"grad_norm": 8.871644973754883,
"learning_rate": 1.728813559322034e-05,
"loss": 0.5623,
"step": 104000
},
{
"epoch": 2.128829856584094,
"grad_norm": 9.516389846801758,
"learning_rate": 1.7227020860495436e-05,
"loss": 0.5464,
"step": 104500
},
{
"epoch": 2.1390156453715776,
"grad_norm": 13.605746269226074,
"learning_rate": 1.7165906127770537e-05,
"loss": 0.5425,
"step": 105000
},
{
"epoch": 2.1492014341590613,
"grad_norm": 14.016572952270508,
"learning_rate": 1.710479139504563e-05,
"loss": 0.5496,
"step": 105500
},
{
"epoch": 2.159387222946545,
"grad_norm": 14.719120979309082,
"learning_rate": 1.7043676662320732e-05,
"loss": 0.5591,
"step": 106000
},
{
"epoch": 2.1695730117340286,
"grad_norm": 14.922130584716797,
"learning_rate": 1.6982561929595826e-05,
"loss": 0.5462,
"step": 106500
},
{
"epoch": 2.1797588005215123,
"grad_norm": 10.74787425994873,
"learning_rate": 1.6921447196870927e-05,
"loss": 0.5699,
"step": 107000
},
{
"epoch": 2.189944589308996,
"grad_norm": 7.978420257568359,
"learning_rate": 1.6860332464146024e-05,
"loss": 0.5449,
"step": 107500
},
{
"epoch": 2.2001303780964796,
"grad_norm": 15.361347198486328,
"learning_rate": 1.6799217731421122e-05,
"loss": 0.5443,
"step": 108000
},
{
"epoch": 2.2103161668839633,
"grad_norm": 6.552661418914795,
"learning_rate": 1.673810299869622e-05,
"loss": 0.5663,
"step": 108500
},
{
"epoch": 2.2205019556714474,
"grad_norm": 6.750521659851074,
"learning_rate": 1.6676988265971317e-05,
"loss": 0.5488,
"step": 109000
},
{
"epoch": 2.230687744458931,
"grad_norm": 11.149799346923828,
"learning_rate": 1.6615873533246415e-05,
"loss": 0.5412,
"step": 109500
},
{
"epoch": 2.2408735332464147,
"grad_norm": 19.362638473510742,
"learning_rate": 1.6554758800521512e-05,
"loss": 0.5505,
"step": 110000
},
{
"epoch": 2.2510593220338984,
"grad_norm": 11.263615608215332,
"learning_rate": 1.6493644067796613e-05,
"loss": 0.5591,
"step": 110500
},
{
"epoch": 2.261245110821382,
"grad_norm": 8.90456771850586,
"learning_rate": 1.6432529335071707e-05,
"loss": 0.5491,
"step": 111000
},
{
"epoch": 2.2714308996088657,
"grad_norm": 11.962569236755371,
"learning_rate": 1.6371414602346808e-05,
"loss": 0.556,
"step": 111500
},
{
"epoch": 2.2816166883963493,
"grad_norm": 8.397544860839844,
"learning_rate": 1.6310299869621902e-05,
"loss": 0.5561,
"step": 112000
},
{
"epoch": 2.291802477183833,
"grad_norm": 16.376155853271484,
"learning_rate": 1.6249185136897003e-05,
"loss": 0.5435,
"step": 112500
},
{
"epoch": 2.3019882659713167,
"grad_norm": 9.14609432220459,
"learning_rate": 1.6188070404172097e-05,
"loss": 0.5457,
"step": 113000
},
{
"epoch": 2.3121740547588003,
"grad_norm": 26.53936767578125,
"learning_rate": 1.61269556714472e-05,
"loss": 0.5295,
"step": 113500
},
{
"epoch": 2.322359843546284,
"grad_norm": 17.222721099853516,
"learning_rate": 1.6065840938722293e-05,
"loss": 0.5177,
"step": 114000
},
{
"epoch": 2.332545632333768,
"grad_norm": 9.585100173950195,
"learning_rate": 1.6004726205997394e-05,
"loss": 0.5419,
"step": 114500
},
{
"epoch": 2.3427314211212518,
"grad_norm": 9.392489433288574,
"learning_rate": 1.5943611473272488e-05,
"loss": 0.5296,
"step": 115000
},
{
"epoch": 2.3529172099087354,
"grad_norm": 15.146162986755371,
"learning_rate": 1.588249674054759e-05,
"loss": 0.5518,
"step": 115500
},
{
"epoch": 2.363102998696219,
"grad_norm": 15.631641387939453,
"learning_rate": 1.582138200782269e-05,
"loss": 0.5584,
"step": 116000
},
{
"epoch": 2.3732887874837028,
"grad_norm": 17.875492095947266,
"learning_rate": 1.5760267275097784e-05,
"loss": 0.5601,
"step": 116500
},
{
"epoch": 2.3834745762711864,
"grad_norm": 7.7471184730529785,
"learning_rate": 1.5699152542372885e-05,
"loss": 0.549,
"step": 117000
},
{
"epoch": 2.39366036505867,
"grad_norm": 14.712841987609863,
"learning_rate": 1.563803780964798e-05,
"loss": 0.5293,
"step": 117500
},
{
"epoch": 2.4038461538461537,
"grad_norm": 9.229011535644531,
"learning_rate": 1.557692307692308e-05,
"loss": 0.5483,
"step": 118000
},
{
"epoch": 2.4140319426336374,
"grad_norm": 11.47548770904541,
"learning_rate": 1.5515808344198174e-05,
"loss": 0.5529,
"step": 118500
},
{
"epoch": 2.424217731421121,
"grad_norm": 29.980873107910156,
"learning_rate": 1.5454693611473275e-05,
"loss": 0.5489,
"step": 119000
},
{
"epoch": 2.4344035202086047,
"grad_norm": 10.478185653686523,
"learning_rate": 1.539357887874837e-05,
"loss": 0.5595,
"step": 119500
},
{
"epoch": 2.444589308996089,
"grad_norm": 12.978096008300781,
"learning_rate": 1.533246414602347e-05,
"loss": 0.5612,
"step": 120000
},
{
"epoch": 2.4547750977835725,
"grad_norm": 17.834806442260742,
"learning_rate": 1.5271349413298564e-05,
"loss": 0.5412,
"step": 120500
},
{
"epoch": 2.464960886571056,
"grad_norm": 25.001754760742188,
"learning_rate": 1.5210234680573665e-05,
"loss": 0.5673,
"step": 121000
},
{
"epoch": 2.47514667535854,
"grad_norm": 27.953767776489258,
"learning_rate": 1.5149119947848761e-05,
"loss": 0.5497,
"step": 121500
},
{
"epoch": 2.4853324641460235,
"grad_norm": 9.370855331420898,
"learning_rate": 1.508800521512386e-05,
"loss": 0.5344,
"step": 122000
},
{
"epoch": 2.495518252933507,
"grad_norm": 18.109821319580078,
"learning_rate": 1.5026890482398956e-05,
"loss": 0.5572,
"step": 122500
},
{
"epoch": 2.505704041720991,
"grad_norm": 20.435270309448242,
"learning_rate": 1.4965775749674055e-05,
"loss": 0.5631,
"step": 123000
},
{
"epoch": 2.5158898305084745,
"grad_norm": 10.656272888183594,
"learning_rate": 1.4904661016949153e-05,
"loss": 0.5344,
"step": 123500
},
{
"epoch": 2.526075619295958,
"grad_norm": 18.796552658081055,
"learning_rate": 1.484354628422425e-05,
"loss": 0.5464,
"step": 124000
},
{
"epoch": 2.5362614080834422,
"grad_norm": 9.630306243896484,
"learning_rate": 1.4782431551499348e-05,
"loss": 0.547,
"step": 124500
},
{
"epoch": 2.5464471968709255,
"grad_norm": 6.3599467277526855,
"learning_rate": 1.4721316818774445e-05,
"loss": 0.5577,
"step": 125000
},
{
"epoch": 2.5566329856584096,
"grad_norm": 8.967144966125488,
"learning_rate": 1.4660202086049545e-05,
"loss": 0.5524,
"step": 125500
},
{
"epoch": 2.5668187744458932,
"grad_norm": 16.690263748168945,
"learning_rate": 1.4599087353324642e-05,
"loss": 0.5546,
"step": 126000
},
{
"epoch": 2.577004563233377,
"grad_norm": 17.713790893554688,
"learning_rate": 1.453797262059974e-05,
"loss": 0.5686,
"step": 126500
},
{
"epoch": 2.5871903520208606,
"grad_norm": 7.970760822296143,
"learning_rate": 1.4476857887874837e-05,
"loss": 0.5273,
"step": 127000
},
{
"epoch": 2.5973761408083442,
"grad_norm": 8.869507789611816,
"learning_rate": 1.4415743155149935e-05,
"loss": 0.5472,
"step": 127500
},
{
"epoch": 2.607561929595828,
"grad_norm": 17.247589111328125,
"learning_rate": 1.4354628422425032e-05,
"loss": 0.5444,
"step": 128000
},
{
"epoch": 2.6177477183833116,
"grad_norm": 16.415138244628906,
"learning_rate": 1.429351368970013e-05,
"loss": 0.5445,
"step": 128500
},
{
"epoch": 2.627933507170795,
"grad_norm": 10.408729553222656,
"learning_rate": 1.4232398956975227e-05,
"loss": 0.5667,
"step": 129000
},
{
"epoch": 2.638119295958279,
"grad_norm": 53.77888488769531,
"learning_rate": 1.4171284224250327e-05,
"loss": 0.547,
"step": 129500
},
{
"epoch": 2.648305084745763,
"grad_norm": 12.244620323181152,
"learning_rate": 1.4110169491525424e-05,
"loss": 0.5428,
"step": 130000
},
{
"epoch": 2.658490873533246,
"grad_norm": 11.80124568939209,
"learning_rate": 1.4049054758800522e-05,
"loss": 0.5565,
"step": 130500
},
{
"epoch": 2.6686766623207303,
"grad_norm": 13.250449180603027,
"learning_rate": 1.398794002607562e-05,
"loss": 0.5423,
"step": 131000
},
{
"epoch": 2.678862451108214,
"grad_norm": 15.81285572052002,
"learning_rate": 1.3926825293350719e-05,
"loss": 0.5621,
"step": 131500
},
{
"epoch": 2.6890482398956976,
"grad_norm": 22.584035873413086,
"learning_rate": 1.3865710560625816e-05,
"loss": 0.5521,
"step": 132000
},
{
"epoch": 2.6992340286831813,
"grad_norm": 23.865680694580078,
"learning_rate": 1.3804595827900914e-05,
"loss": 0.551,
"step": 132500
},
{
"epoch": 2.709419817470665,
"grad_norm": 18.28876304626465,
"learning_rate": 1.3743481095176011e-05,
"loss": 0.5451,
"step": 133000
},
{
"epoch": 2.7196056062581486,
"grad_norm": 16.23969078063965,
"learning_rate": 1.3682366362451109e-05,
"loss": 0.5392,
"step": 133500
},
{
"epoch": 2.7297913950456323,
"grad_norm": 14.673959732055664,
"learning_rate": 1.3621251629726206e-05,
"loss": 0.5451,
"step": 134000
},
{
"epoch": 2.739977183833116,
"grad_norm": 8.019514083862305,
"learning_rate": 1.3560136897001304e-05,
"loss": 0.5568,
"step": 134500
},
{
"epoch": 2.7501629726205996,
"grad_norm": 13.495898246765137,
"learning_rate": 1.3499022164276401e-05,
"loss": 0.5734,
"step": 135000
},
{
"epoch": 2.7603487614080837,
"grad_norm": 7.548976421356201,
"learning_rate": 1.3437907431551499e-05,
"loss": 0.557,
"step": 135500
},
{
"epoch": 2.770534550195567,
"grad_norm": 20.049760818481445,
"learning_rate": 1.3376792698826597e-05,
"loss": 0.5658,
"step": 136000
},
{
"epoch": 2.780720338983051,
"grad_norm": 9.346122741699219,
"learning_rate": 1.3315677966101694e-05,
"loss": 0.535,
"step": 136500
},
{
"epoch": 2.7909061277705347,
"grad_norm": 15.080660820007324,
"learning_rate": 1.3254563233376792e-05,
"loss": 0.5538,
"step": 137000
},
{
"epoch": 2.8010919165580184,
"grad_norm": 11.485374450683594,
"learning_rate": 1.3193448500651891e-05,
"loss": 0.5419,
"step": 137500
},
{
"epoch": 2.811277705345502,
"grad_norm": 12.089446067810059,
"learning_rate": 1.3132333767926988e-05,
"loss": 0.5651,
"step": 138000
},
{
"epoch": 2.8214634941329857,
"grad_norm": 5.466490268707275,
"learning_rate": 1.3071219035202088e-05,
"loss": 0.5217,
"step": 138500
},
{
"epoch": 2.8316492829204694,
"grad_norm": 12.89148235321045,
"learning_rate": 1.3010104302477185e-05,
"loss": 0.5477,
"step": 139000
},
{
"epoch": 2.841835071707953,
"grad_norm": 5.610709190368652,
"learning_rate": 1.2948989569752283e-05,
"loss": 0.5377,
"step": 139500
},
{
"epoch": 2.8520208604954367,
"grad_norm": 26.2186222076416,
"learning_rate": 1.288787483702738e-05,
"loss": 0.5281,
"step": 140000
},
{
"epoch": 2.8622066492829203,
"grad_norm": 2.1066462993621826,
"learning_rate": 1.2826760104302478e-05,
"loss": 0.5387,
"step": 140500
},
{
"epoch": 2.872392438070404,
"grad_norm": 13.6646728515625,
"learning_rate": 1.2765645371577575e-05,
"loss": 0.5416,
"step": 141000
},
{
"epoch": 2.8825782268578877,
"grad_norm": 14.357284545898438,
"learning_rate": 1.2704530638852673e-05,
"loss": 0.5504,
"step": 141500
},
{
"epoch": 2.8927640156453718,
"grad_norm": 14.08674144744873,
"learning_rate": 1.264341590612777e-05,
"loss": 0.5691,
"step": 142000
},
{
"epoch": 2.9029498044328554,
"grad_norm": 20.460561752319336,
"learning_rate": 1.2582301173402868e-05,
"loss": 0.5624,
"step": 142500
},
{
"epoch": 2.913135593220339,
"grad_norm": 12.731225967407227,
"learning_rate": 1.2521186440677966e-05,
"loss": 0.5439,
"step": 143000
},
{
"epoch": 2.9233213820078228,
"grad_norm": 3.781522274017334,
"learning_rate": 1.2460071707953065e-05,
"loss": 0.5348,
"step": 143500
},
{
"epoch": 2.9335071707953064,
"grad_norm": 14.624204635620117,
"learning_rate": 1.2398956975228162e-05,
"loss": 0.5589,
"step": 144000
},
{
"epoch": 2.94369295958279,
"grad_norm": 18.551095962524414,
"learning_rate": 1.233784224250326e-05,
"loss": 0.532,
"step": 144500
},
{
"epoch": 2.9538787483702738,
"grad_norm": 17.094831466674805,
"learning_rate": 1.2276727509778357e-05,
"loss": 0.558,
"step": 145000
},
{
"epoch": 2.9640645371577574,
"grad_norm": 12.306907653808594,
"learning_rate": 1.2215612777053455e-05,
"loss": 0.542,
"step": 145500
},
{
"epoch": 2.974250325945241,
"grad_norm": 12.134025573730469,
"learning_rate": 1.2154498044328553e-05,
"loss": 0.5429,
"step": 146000
},
{
"epoch": 2.9844361147327247,
"grad_norm": 22.295795440673828,
"learning_rate": 1.209338331160365e-05,
"loss": 0.5632,
"step": 146500
},
{
"epoch": 2.9946219035202084,
"grad_norm": 17.749858856201172,
"learning_rate": 1.2032268578878748e-05,
"loss": 0.5402,
"step": 147000
},
{
"epoch": 3.0,
"eval_accuracy": 0.7337000966072083,
"eval_loss": 0.7322831153869629,
"eval_runtime": 8.3496,
"eval_samples_per_second": 587.813,
"eval_steps_per_second": 73.536,
"step": 147264
},
{
"epoch": 3.0048076923076925,
"grad_norm": 22.746931076049805,
"learning_rate": 1.1971153846153847e-05,
"loss": 0.5024,
"step": 147500
},
{
"epoch": 3.014993481095176,
"grad_norm": 19.812463760375977,
"learning_rate": 1.1910039113428944e-05,
"loss": 0.455,
"step": 148000
},
{
"epoch": 3.02517926988266,
"grad_norm": 8.608039855957031,
"learning_rate": 1.1848924380704042e-05,
"loss": 0.4529,
"step": 148500
},
{
"epoch": 3.0353650586701435,
"grad_norm": 21.67936897277832,
"learning_rate": 1.178780964797914e-05,
"loss": 0.467,
"step": 149000
},
{
"epoch": 3.045550847457627,
"grad_norm": 23.777921676635742,
"learning_rate": 1.1726694915254239e-05,
"loss": 0.4399,
"step": 149500
},
{
"epoch": 3.055736636245111,
"grad_norm": 5.8297600746154785,
"learning_rate": 1.1665580182529336e-05,
"loss": 0.4605,
"step": 150000
},
{
"epoch": 3.0659224250325945,
"grad_norm": 1.6593589782714844,
"learning_rate": 1.1604465449804434e-05,
"loss": 0.4668,
"step": 150500
},
{
"epoch": 3.076108213820078,
"grad_norm": 41.140750885009766,
"learning_rate": 1.1543350717079531e-05,
"loss": 0.4558,
"step": 151000
},
{
"epoch": 3.086294002607562,
"grad_norm": 14.093750953674316,
"learning_rate": 1.1482235984354629e-05,
"loss": 0.4645,
"step": 151500
},
{
"epoch": 3.0964797913950455,
"grad_norm": 25.050161361694336,
"learning_rate": 1.1421121251629727e-05,
"loss": 0.4549,
"step": 152000
},
{
"epoch": 3.106665580182529,
"grad_norm": 12.942317962646484,
"learning_rate": 1.1360006518904824e-05,
"loss": 0.4675,
"step": 152500
},
{
"epoch": 3.1168513689700132,
"grad_norm": 26.615070343017578,
"learning_rate": 1.1298891786179922e-05,
"loss": 0.4606,
"step": 153000
},
{
"epoch": 3.127037157757497,
"grad_norm": 35.78953552246094,
"learning_rate": 1.123777705345502e-05,
"loss": 0.4754,
"step": 153500
},
{
"epoch": 3.1372229465449806,
"grad_norm": 17.106658935546875,
"learning_rate": 1.1176662320730117e-05,
"loss": 0.46,
"step": 154000
},
{
"epoch": 3.1474087353324642,
"grad_norm": 5.81476354598999,
"learning_rate": 1.1115547588005214e-05,
"loss": 0.455,
"step": 154500
},
{
"epoch": 3.157594524119948,
"grad_norm": 13.205123901367188,
"learning_rate": 1.1054432855280312e-05,
"loss": 0.4443,
"step": 155000
},
{
"epoch": 3.1677803129074316,
"grad_norm": 44.934471130371094,
"learning_rate": 1.0993318122555411e-05,
"loss": 0.447,
"step": 155500
},
{
"epoch": 3.1779661016949152,
"grad_norm": 8.879769325256348,
"learning_rate": 1.0932203389830509e-05,
"loss": 0.4536,
"step": 156000
},
{
"epoch": 3.188151890482399,
"grad_norm": 22.281354904174805,
"learning_rate": 1.0871088657105608e-05,
"loss": 0.4541,
"step": 156500
},
{
"epoch": 3.1983376792698825,
"grad_norm": 46.504112243652344,
"learning_rate": 1.0809973924380705e-05,
"loss": 0.4467,
"step": 157000
},
{
"epoch": 3.208523468057366,
"grad_norm": 1.4721815586090088,
"learning_rate": 1.0748859191655803e-05,
"loss": 0.4474,
"step": 157500
},
{
"epoch": 3.21870925684485,
"grad_norm": 18.72897720336914,
"learning_rate": 1.06877444589309e-05,
"loss": 0.4465,
"step": 158000
},
{
"epoch": 3.228895045632334,
"grad_norm": 13.687529563903809,
"learning_rate": 1.0626629726205998e-05,
"loss": 0.4674,
"step": 158500
},
{
"epoch": 3.2390808344198176,
"grad_norm": 2.0822532176971436,
"learning_rate": 1.0565514993481096e-05,
"loss": 0.4516,
"step": 159000
},
{
"epoch": 3.2492666232073013,
"grad_norm": 15.965363502502441,
"learning_rate": 1.0504400260756193e-05,
"loss": 0.4582,
"step": 159500
},
{
"epoch": 3.259452411994785,
"grad_norm": 19.683805465698242,
"learning_rate": 1.044328552803129e-05,
"loss": 0.452,
"step": 160000
},
{
"epoch": 3.2696382007822686,
"grad_norm": 30.53873062133789,
"learning_rate": 1.0382170795306388e-05,
"loss": 0.4532,
"step": 160500
},
{
"epoch": 3.2798239895697523,
"grad_norm": 14.192473411560059,
"learning_rate": 1.0321056062581486e-05,
"loss": 0.4576,
"step": 161000
},
{
"epoch": 3.290009778357236,
"grad_norm": 19.519214630126953,
"learning_rate": 1.0259941329856583e-05,
"loss": 0.4531,
"step": 161500
},
{
"epoch": 3.3001955671447196,
"grad_norm": 11.308916091918945,
"learning_rate": 1.0198826597131683e-05,
"loss": 0.4374,
"step": 162000
},
{
"epoch": 3.3103813559322033,
"grad_norm": 22.54949378967285,
"learning_rate": 1.013771186440678e-05,
"loss": 0.4641,
"step": 162500
},
{
"epoch": 3.320567144719687,
"grad_norm": 27.339025497436523,
"learning_rate": 1.0076597131681878e-05,
"loss": 0.4603,
"step": 163000
},
{
"epoch": 3.3307529335071706,
"grad_norm": 10.614120483398438,
"learning_rate": 1.0015482398956975e-05,
"loss": 0.4482,
"step": 163500
},
{
"epoch": 3.3409387222946547,
"grad_norm": 19.92940902709961,
"learning_rate": 9.954367666232073e-06,
"loss": 0.4503,
"step": 164000
},
{
"epoch": 3.3511245110821384,
"grad_norm": 21.356348037719727,
"learning_rate": 9.89325293350717e-06,
"loss": 0.4542,
"step": 164500
},
{
"epoch": 3.361310299869622,
"grad_norm": 19.11351203918457,
"learning_rate": 9.83213820078227e-06,
"loss": 0.4653,
"step": 165000
},
{
"epoch": 3.3714960886571057,
"grad_norm": 15.949396133422852,
"learning_rate": 9.771023468057367e-06,
"loss": 0.4583,
"step": 165500
},
{
"epoch": 3.3816818774445894,
"grad_norm": 10.171916961669922,
"learning_rate": 9.709908735332465e-06,
"loss": 0.4747,
"step": 166000
},
{
"epoch": 3.391867666232073,
"grad_norm": 17.548404693603516,
"learning_rate": 9.648794002607562e-06,
"loss": 0.4252,
"step": 166500
},
{
"epoch": 3.4020534550195567,
"grad_norm": 26.116491317749023,
"learning_rate": 9.58767926988266e-06,
"loss": 0.4429,
"step": 167000
},
{
"epoch": 3.4122392438070404,
"grad_norm": 8.717327117919922,
"learning_rate": 9.526564537157757e-06,
"loss": 0.4752,
"step": 167500
},
{
"epoch": 3.422425032594524,
"grad_norm": 5.2715535163879395,
"learning_rate": 9.465449804432857e-06,
"loss": 0.4689,
"step": 168000
},
{
"epoch": 3.4326108213820077,
"grad_norm": 15.369921684265137,
"learning_rate": 9.404335071707954e-06,
"loss": 0.474,
"step": 168500
},
{
"epoch": 3.4427966101694913,
"grad_norm": 23.541234970092773,
"learning_rate": 9.343220338983052e-06,
"loss": 0.4717,
"step": 169000
},
{
"epoch": 3.4529823989569755,
"grad_norm": 25.710681915283203,
"learning_rate": 9.28210560625815e-06,
"loss": 0.459,
"step": 169500
},
{
"epoch": 3.463168187744459,
"grad_norm": 36.22392272949219,
"learning_rate": 9.220990873533247e-06,
"loss": 0.4374,
"step": 170000
},
{
"epoch": 3.4733539765319428,
"grad_norm": 17.486045837402344,
"learning_rate": 9.159876140808344e-06,
"loss": 0.4682,
"step": 170500
},
{
"epoch": 3.4835397653194264,
"grad_norm": 21.079090118408203,
"learning_rate": 9.098761408083442e-06,
"loss": 0.461,
"step": 171000
},
{
"epoch": 3.49372555410691,
"grad_norm": 47.44207763671875,
"learning_rate": 9.03764667535854e-06,
"loss": 0.428,
"step": 171500
},
{
"epoch": 3.5039113428943938,
"grad_norm": 10.158509254455566,
"learning_rate": 8.976531942633637e-06,
"loss": 0.4705,
"step": 172000
},
{
"epoch": 3.5140971316818774,
"grad_norm": 2.3025388717651367,
"learning_rate": 8.915417209908735e-06,
"loss": 0.4794,
"step": 172500
},
{
"epoch": 3.524282920469361,
"grad_norm": 7.106923580169678,
"learning_rate": 8.854302477183832e-06,
"loss": 0.4596,
"step": 173000
},
{
"epoch": 3.5344687092568448,
"grad_norm": 11.846363067626953,
"learning_rate": 8.79318774445893e-06,
"loss": 0.4548,
"step": 173500
},
{
"epoch": 3.5446544980443284,
"grad_norm": 23.35795783996582,
"learning_rate": 8.73207301173403e-06,
"loss": 0.4627,
"step": 174000
},
{
"epoch": 3.554840286831812,
"grad_norm": 51.494873046875,
"learning_rate": 8.670958279009128e-06,
"loss": 0.443,
"step": 174500
},
{
"epoch": 3.565026075619296,
"grad_norm": 17.30136489868164,
"learning_rate": 8.609843546284226e-06,
"loss": 0.4431,
"step": 175000
},
{
"epoch": 3.5752118644067794,
"grad_norm": 6.228327751159668,
"learning_rate": 8.548728813559323e-06,
"loss": 0.4494,
"step": 175500
},
{
"epoch": 3.5853976531942635,
"grad_norm": 7.742433071136475,
"learning_rate": 8.48761408083442e-06,
"loss": 0.4591,
"step": 176000
},
{
"epoch": 3.595583441981747,
"grad_norm": 17.392818450927734,
"learning_rate": 8.426499348109518e-06,
"loss": 0.4572,
"step": 176500
},
{
"epoch": 3.605769230769231,
"grad_norm": 9.273465156555176,
"learning_rate": 8.365384615384616e-06,
"loss": 0.4592,
"step": 177000
},
{
"epoch": 3.6159550195567145,
"grad_norm": 10.859349250793457,
"learning_rate": 8.304269882659713e-06,
"loss": 0.4507,
"step": 177500
},
{
"epoch": 3.626140808344198,
"grad_norm": 16.180192947387695,
"learning_rate": 8.243155149934811e-06,
"loss": 0.4615,
"step": 178000
},
{
"epoch": 3.636326597131682,
"grad_norm": 7.720341682434082,
"learning_rate": 8.182040417209908e-06,
"loss": 0.4614,
"step": 178500
},
{
"epoch": 3.6465123859191655,
"grad_norm": 10.102255821228027,
"learning_rate": 8.120925684485006e-06,
"loss": 0.4504,
"step": 179000
},
{
"epoch": 3.656698174706649,
"grad_norm": 9.698884010314941,
"learning_rate": 8.059810951760104e-06,
"loss": 0.4791,
"step": 179500
},
{
"epoch": 3.666883963494133,
"grad_norm": 13.645587921142578,
"learning_rate": 7.998696219035203e-06,
"loss": 0.4655,
"step": 180000
},
{
"epoch": 3.677069752281617,
"grad_norm": 6.0963358879089355,
"learning_rate": 7.9375814863103e-06,
"loss": 0.4522,
"step": 180500
},
{
"epoch": 3.6872555410691,
"grad_norm": 19.082395553588867,
"learning_rate": 7.876466753585398e-06,
"loss": 0.4606,
"step": 181000
},
{
"epoch": 3.6974413298565842,
"grad_norm": 21.68135643005371,
"learning_rate": 7.815352020860495e-06,
"loss": 0.475,
"step": 181500
},
{
"epoch": 3.707627118644068,
"grad_norm": 22.7216796875,
"learning_rate": 7.754237288135593e-06,
"loss": 0.4561,
"step": 182000
},
{
"epoch": 3.7178129074315516,
"grad_norm": 5.14304780960083,
"learning_rate": 7.69312255541069e-06,
"loss": 0.4561,
"step": 182500
},
{
"epoch": 3.7279986962190352,
"grad_norm": 12.879047393798828,
"learning_rate": 7.63200782268579e-06,
"loss": 0.4619,
"step": 183000
},
{
"epoch": 3.738184485006519,
"grad_norm": 20.963592529296875,
"learning_rate": 7.570893089960887e-06,
"loss": 0.4488,
"step": 183500
},
{
"epoch": 3.7483702737940026,
"grad_norm": 15.685653686523438,
"learning_rate": 7.509778357235985e-06,
"loss": 0.4515,
"step": 184000
},
{
"epoch": 3.7585560625814862,
"grad_norm": 14.296714782714844,
"learning_rate": 7.4486636245110824e-06,
"loss": 0.4461,
"step": 184500
},
{
"epoch": 3.76874185136897,
"grad_norm": 8.592365264892578,
"learning_rate": 7.38754889178618e-06,
"loss": 0.4571,
"step": 185000
},
{
"epoch": 3.7789276401564535,
"grad_norm": 32.55515670776367,
"learning_rate": 7.326434159061278e-06,
"loss": 0.4601,
"step": 185500
},
{
"epoch": 3.7891134289439377,
"grad_norm": 19.179519653320312,
"learning_rate": 7.265319426336376e-06,
"loss": 0.4591,
"step": 186000
},
{
"epoch": 3.799299217731421,
"grad_norm": 11.681869506835938,
"learning_rate": 7.2042046936114735e-06,
"loss": 0.4638,
"step": 186500
},
{
"epoch": 3.809485006518905,
"grad_norm": 17.548564910888672,
"learning_rate": 7.143089960886571e-06,
"loss": 0.4595,
"step": 187000
},
{
"epoch": 3.8196707953063886,
"grad_norm": 34.2673225402832,
"learning_rate": 7.0819752281616686e-06,
"loss": 0.4622,
"step": 187500
},
{
"epoch": 3.8298565840938723,
"grad_norm": 25.60137939453125,
"learning_rate": 7.020860495436767e-06,
"loss": 0.4653,
"step": 188000
},
{
"epoch": 3.840042372881356,
"grad_norm": 14.652885437011719,
"learning_rate": 6.9597457627118645e-06,
"loss": 0.4484,
"step": 188500
},
{
"epoch": 3.8502281616688396,
"grad_norm": 20.71872901916504,
"learning_rate": 6.898631029986962e-06,
"loss": 0.4717,
"step": 189000
},
{
"epoch": 3.8604139504563233,
"grad_norm": 6.4770989418029785,
"learning_rate": 6.83751629726206e-06,
"loss": 0.462,
"step": 189500
},
{
"epoch": 3.870599739243807,
"grad_norm": 21.029338836669922,
"learning_rate": 6.776401564537158e-06,
"loss": 0.4312,
"step": 190000
},
{
"epoch": 3.8807855280312906,
"grad_norm": 27.267345428466797,
"learning_rate": 6.7152868318122556e-06,
"loss": 0.4623,
"step": 190500
},
{
"epoch": 3.8909713168187743,
"grad_norm": 16.270328521728516,
"learning_rate": 6.654172099087354e-06,
"loss": 0.4594,
"step": 191000
},
{
"epoch": 3.9011571056062584,
"grad_norm": 15.952796936035156,
"learning_rate": 6.5930573663624515e-06,
"loss": 0.4478,
"step": 191500
},
{
"epoch": 3.9113428943937416,
"grad_norm": 24.082805633544922,
"learning_rate": 6.531942633637549e-06,
"loss": 0.4572,
"step": 192000
},
{
"epoch": 3.9215286831812257,
"grad_norm": 17.784955978393555,
"learning_rate": 6.470827900912647e-06,
"loss": 0.453,
"step": 192500
},
{
"epoch": 3.9317144719687094,
"grad_norm": 15.306330680847168,
"learning_rate": 6.409713168187744e-06,
"loss": 0.4516,
"step": 193000
},
{
"epoch": 3.941900260756193,
"grad_norm": 17.07221031188965,
"learning_rate": 6.348598435462842e-06,
"loss": 0.4513,
"step": 193500
},
{
"epoch": 3.9520860495436767,
"grad_norm": 1.9630801677703857,
"learning_rate": 6.287483702737941e-06,
"loss": 0.4339,
"step": 194000
},
{
"epoch": 3.9622718383311604,
"grad_norm": 23.103532791137695,
"learning_rate": 6.2263689700130385e-06,
"loss": 0.475,
"step": 194500
},
{
"epoch": 3.972457627118644,
"grad_norm": 9.09752082824707,
"learning_rate": 6.165254237288136e-06,
"loss": 0.4422,
"step": 195000
},
{
"epoch": 3.9826434159061277,
"grad_norm": 23.36123275756836,
"learning_rate": 6.104139504563234e-06,
"loss": 0.4578,
"step": 195500
},
{
"epoch": 3.9928292046936114,
"grad_norm": 15.44927978515625,
"learning_rate": 6.043024771838331e-06,
"loss": 0.4815,
"step": 196000
},
{
"epoch": 4.0,
"eval_accuracy": 0.7300326228141785,
"eval_loss": 0.8482908010482788,
"eval_runtime": 7.2526,
"eval_samples_per_second": 676.722,
"eval_steps_per_second": 84.659,
"step": 196352
},
{
"epoch": 4.003014993481095,
"grad_norm": 18.638896942138672,
"learning_rate": 5.981910039113429e-06,
"loss": 0.4162,
"step": 196500
},
{
"epoch": 4.013200782268579,
"grad_norm": 23.482778549194336,
"learning_rate": 5.920795306388527e-06,
"loss": 0.3857,
"step": 197000
},
{
"epoch": 4.023386571056062,
"grad_norm": 19.941246032714844,
"learning_rate": 5.859680573663625e-06,
"loss": 0.4055,
"step": 197500
},
{
"epoch": 4.0335723598435465,
"grad_norm": 55.201602935791016,
"learning_rate": 5.798565840938722e-06,
"loss": 0.4054,
"step": 198000
},
{
"epoch": 4.04375814863103,
"grad_norm": 9.683618545532227,
"learning_rate": 5.7374511082138206e-06,
"loss": 0.3772,
"step": 198500
},
{
"epoch": 4.053943937418514,
"grad_norm": 14.284856796264648,
"learning_rate": 5.676336375488918e-06,
"loss": 0.3761,
"step": 199000
},
{
"epoch": 4.064129726205997,
"grad_norm": 33.600894927978516,
"learning_rate": 5.615221642764016e-06,
"loss": 0.394,
"step": 199500
},
{
"epoch": 4.074315514993481,
"grad_norm": 4.879152774810791,
"learning_rate": 5.554106910039114e-06,
"loss": 0.3754,
"step": 200000
},
{
"epoch": 4.084501303780965,
"grad_norm": 4.289336681365967,
"learning_rate": 5.492992177314212e-06,
"loss": 0.3822,
"step": 200500
},
{
"epoch": 4.094687092568448,
"grad_norm": 13.225927352905273,
"learning_rate": 5.431877444589309e-06,
"loss": 0.385,
"step": 201000
},
{
"epoch": 4.1048728813559325,
"grad_norm": 30.11479949951172,
"learning_rate": 5.370762711864407e-06,
"loss": 0.4043,
"step": 201500
},
{
"epoch": 4.115058670143416,
"grad_norm": 30.226152420043945,
"learning_rate": 5.309647979139504e-06,
"loss": 0.3785,
"step": 202000
},
{
"epoch": 4.1252444589309,
"grad_norm": 57.69206619262695,
"learning_rate": 5.248533246414602e-06,
"loss": 0.3783,
"step": 202500
},
{
"epoch": 4.135430247718383,
"grad_norm": 0.5498570203781128,
"learning_rate": 5.187418513689701e-06,
"loss": 0.387,
"step": 203000
},
{
"epoch": 4.145616036505867,
"grad_norm": 1.2860363721847534,
"learning_rate": 5.126303780964799e-06,
"loss": 0.3788,
"step": 203500
},
{
"epoch": 4.15580182529335,
"grad_norm": 24.312036514282227,
"learning_rate": 5.065189048239896e-06,
"loss": 0.3958,
"step": 204000
},
{
"epoch": 4.1659876140808345,
"grad_norm": 31.0595645904541,
"learning_rate": 5.004074315514994e-06,
"loss": 0.368,
"step": 204500
},
{
"epoch": 4.176173402868318,
"grad_norm": 30.00829315185547,
"learning_rate": 4.942959582790091e-06,
"loss": 0.387,
"step": 205000
},
{
"epoch": 4.186359191655802,
"grad_norm": 5.656859874725342,
"learning_rate": 4.881844850065189e-06,
"loss": 0.3876,
"step": 205500
},
{
"epoch": 4.196544980443286,
"grad_norm": 28.364885330200195,
"learning_rate": 4.820730117340287e-06,
"loss": 0.3974,
"step": 206000
},
{
"epoch": 4.206730769230769,
"grad_norm": 21.450817108154297,
"learning_rate": 4.759615384615385e-06,
"loss": 0.4304,
"step": 206500
},
{
"epoch": 4.216916558018253,
"grad_norm": 26.55447769165039,
"learning_rate": 4.698500651890482e-06,
"loss": 0.3598,
"step": 207000
},
{
"epoch": 4.2271023468057365,
"grad_norm": 63.503299713134766,
"learning_rate": 4.637385919165581e-06,
"loss": 0.3972,
"step": 207500
},
{
"epoch": 4.237288135593221,
"grad_norm": 6.910649299621582,
"learning_rate": 4.576271186440678e-06,
"loss": 0.3788,
"step": 208000
},
{
"epoch": 4.247473924380704,
"grad_norm": 6.5545172691345215,
"learning_rate": 4.515156453715776e-06,
"loss": 0.3978,
"step": 208500
},
{
"epoch": 4.257659713168188,
"grad_norm": 17.815195083618164,
"learning_rate": 4.454041720990874e-06,
"loss": 0.4014,
"step": 209000
},
{
"epoch": 4.267845501955671,
"grad_norm": 33.210113525390625,
"learning_rate": 4.392926988265972e-06,
"loss": 0.3891,
"step": 209500
},
{
"epoch": 4.278031290743155,
"grad_norm": 33.96702194213867,
"learning_rate": 4.331812255541069e-06,
"loss": 0.3891,
"step": 210000
},
{
"epoch": 4.2882170795306385,
"grad_norm": 13.674240112304688,
"learning_rate": 4.270697522816167e-06,
"loss": 0.3743,
"step": 210500
},
{
"epoch": 4.298402868318123,
"grad_norm": 1.694655179977417,
"learning_rate": 4.209582790091264e-06,
"loss": 0.4072,
"step": 211000
},
{
"epoch": 4.308588657105606,
"grad_norm": 10.407938957214355,
"learning_rate": 4.148468057366362e-06,
"loss": 0.3788,
"step": 211500
},
{
"epoch": 4.31877444589309,
"grad_norm": 28.451736450195312,
"learning_rate": 4.087353324641461e-06,
"loss": 0.44,
"step": 212000
},
{
"epoch": 4.328960234680574,
"grad_norm": 40.655616760253906,
"learning_rate": 4.026238591916559e-06,
"loss": 0.3846,
"step": 212500
},
{
"epoch": 4.339146023468057,
"grad_norm": 33.640682220458984,
"learning_rate": 3.965123859191656e-06,
"loss": 0.3967,
"step": 213000
},
{
"epoch": 4.349331812255541,
"grad_norm": 77.51473236083984,
"learning_rate": 3.904009126466754e-06,
"loss": 0.384,
"step": 213500
},
{
"epoch": 4.3595176010430245,
"grad_norm": 15.64946174621582,
"learning_rate": 3.842894393741851e-06,
"loss": 0.3935,
"step": 214000
},
{
"epoch": 4.369703389830509,
"grad_norm": 38.03608703613281,
"learning_rate": 3.7817796610169493e-06,
"loss": 0.3933,
"step": 214500
},
{
"epoch": 4.379889178617992,
"grad_norm": 24.385404586791992,
"learning_rate": 3.720664928292047e-06,
"loss": 0.3879,
"step": 215000
},
{
"epoch": 4.390074967405476,
"grad_norm": 19.359683990478516,
"learning_rate": 3.659550195567145e-06,
"loss": 0.4027,
"step": 215500
},
{
"epoch": 4.400260756192959,
"grad_norm": 20.058656692504883,
"learning_rate": 3.598435462842243e-06,
"loss": 0.3947,
"step": 216000
},
{
"epoch": 4.410446544980443,
"grad_norm": 22.407522201538086,
"learning_rate": 3.5373207301173403e-06,
"loss": 0.3692,
"step": 216500
},
{
"epoch": 4.4206323337679265,
"grad_norm": 0.43725308775901794,
"learning_rate": 3.476205997392438e-06,
"loss": 0.3876,
"step": 217000
},
{
"epoch": 4.430818122555411,
"grad_norm": 6.114492416381836,
"learning_rate": 3.4150912646675363e-06,
"loss": 0.3812,
"step": 217500
},
{
"epoch": 4.441003911342895,
"grad_norm": 8.737237930297852,
"learning_rate": 3.353976531942634e-06,
"loss": 0.4218,
"step": 218000
},
{
"epoch": 4.451189700130378,
"grad_norm": 21.46657371520996,
"learning_rate": 3.2928617992177314e-06,
"loss": 0.3878,
"step": 218500
},
{
"epoch": 4.461375488917862,
"grad_norm": 28.781583786010742,
"learning_rate": 3.2317470664928294e-06,
"loss": 0.4014,
"step": 219000
},
{
"epoch": 4.471561277705345,
"grad_norm": 65.7254867553711,
"learning_rate": 3.170632333767927e-06,
"loss": 0.3846,
"step": 219500
},
{
"epoch": 4.481747066492829,
"grad_norm": 42.237693786621094,
"learning_rate": 3.109517601043025e-06,
"loss": 0.3945,
"step": 220000
},
{
"epoch": 4.491932855280313,
"grad_norm": 22.627384185791016,
"learning_rate": 3.048402868318123e-06,
"loss": 0.3846,
"step": 220500
},
{
"epoch": 4.502118644067797,
"grad_norm": 5.28376579284668,
"learning_rate": 2.9872881355932204e-06,
"loss": 0.3951,
"step": 221000
},
{
"epoch": 4.51230443285528,
"grad_norm": 29.31270980834961,
"learning_rate": 2.926173402868318e-06,
"loss": 0.387,
"step": 221500
},
{
"epoch": 4.522490221642764,
"grad_norm": 28.06836700439453,
"learning_rate": 2.8650586701434163e-06,
"loss": 0.3758,
"step": 222000
},
{
"epoch": 4.532676010430247,
"grad_norm": 19.28433609008789,
"learning_rate": 2.803943937418514e-06,
"loss": 0.4011,
"step": 222500
},
{
"epoch": 4.542861799217731,
"grad_norm": 28.01453399658203,
"learning_rate": 2.7428292046936114e-06,
"loss": 0.4123,
"step": 223000
},
{
"epoch": 4.5530475880052155,
"grad_norm": 22.563831329345703,
"learning_rate": 2.6817144719687094e-06,
"loss": 0.3901,
"step": 223500
},
{
"epoch": 4.563233376792699,
"grad_norm": 6.881279945373535,
"learning_rate": 2.620599739243807e-06,
"loss": 0.3641,
"step": 224000
},
{
"epoch": 4.573419165580183,
"grad_norm": 10.799131393432617,
"learning_rate": 2.559485006518905e-06,
"loss": 0.388,
"step": 224500
},
{
"epoch": 4.583604954367666,
"grad_norm": 8.748978614807129,
"learning_rate": 2.498370273794003e-06,
"loss": 0.3855,
"step": 225000
},
{
"epoch": 4.59379074315515,
"grad_norm": 30.88077163696289,
"learning_rate": 2.4372555410691004e-06,
"loss": 0.3846,
"step": 225500
},
{
"epoch": 4.603976531942633,
"grad_norm": 9.857905387878418,
"learning_rate": 2.376140808344198e-06,
"loss": 0.3887,
"step": 226000
},
{
"epoch": 4.6141623207301175,
"grad_norm": 18.84724235534668,
"learning_rate": 2.3150260756192964e-06,
"loss": 0.3981,
"step": 226500
},
{
"epoch": 4.624348109517601,
"grad_norm": 1.5743709802627563,
"learning_rate": 2.253911342894394e-06,
"loss": 0.4198,
"step": 227000
},
{
"epoch": 4.634533898305085,
"grad_norm": 1.2036515474319458,
"learning_rate": 2.1927966101694915e-06,
"loss": 0.3883,
"step": 227500
},
{
"epoch": 4.644719687092568,
"grad_norm": 25.730968475341797,
"learning_rate": 2.1316818774445895e-06,
"loss": 0.3791,
"step": 228000
},
{
"epoch": 4.654905475880052,
"grad_norm": 32.65802764892578,
"learning_rate": 2.070567144719687e-06,
"loss": 0.3772,
"step": 228500
},
{
"epoch": 4.665091264667536,
"grad_norm": 3.2796003818511963,
"learning_rate": 2.009452411994785e-06,
"loss": 0.3994,
"step": 229000
},
{
"epoch": 4.675277053455019,
"grad_norm": 12.903874397277832,
"learning_rate": 1.948337679269883e-06,
"loss": 0.3725,
"step": 229500
},
{
"epoch": 4.6854628422425035,
"grad_norm": 1.0021297931671143,
"learning_rate": 1.8872229465449805e-06,
"loss": 0.393,
"step": 230000
},
{
"epoch": 4.695648631029987,
"grad_norm": 5.547977447509766,
"learning_rate": 1.8261082138200783e-06,
"loss": 0.3907,
"step": 230500
},
{
"epoch": 4.705834419817471,
"grad_norm": 5.375828742980957,
"learning_rate": 1.764993481095176e-06,
"loss": 0.4023,
"step": 231000
},
{
"epoch": 4.716020208604954,
"grad_norm": 20.087383270263672,
"learning_rate": 1.703878748370274e-06,
"loss": 0.3652,
"step": 231500
},
{
"epoch": 4.726205997392438,
"grad_norm": 19.617931365966797,
"learning_rate": 1.6427640156453715e-06,
"loss": 0.3842,
"step": 232000
},
{
"epoch": 4.736391786179921,
"grad_norm": 22.911376953125,
"learning_rate": 1.5816492829204695e-06,
"loss": 0.3871,
"step": 232500
},
{
"epoch": 4.7465775749674055,
"grad_norm": 31.00206184387207,
"learning_rate": 1.5205345501955673e-06,
"loss": 0.3918,
"step": 233000
},
{
"epoch": 4.756763363754889,
"grad_norm": 39.066619873046875,
"learning_rate": 1.4594198174706648e-06,
"loss": 0.3743,
"step": 233500
},
{
"epoch": 4.766949152542373,
"grad_norm": 9.460115432739258,
"learning_rate": 1.3983050847457628e-06,
"loss": 0.388,
"step": 234000
},
{
"epoch": 4.777134941329857,
"grad_norm": 10.770241737365723,
"learning_rate": 1.3371903520208605e-06,
"loss": 0.3986,
"step": 234500
},
{
"epoch": 4.78732073011734,
"grad_norm": 29.36595916748047,
"learning_rate": 1.2760756192959583e-06,
"loss": 0.3836,
"step": 235000
},
{
"epoch": 4.797506518904824,
"grad_norm": 21.498857498168945,
"learning_rate": 1.214960886571056e-06,
"loss": 0.3839,
"step": 235500
},
{
"epoch": 4.8076923076923075,
"grad_norm": 2.3236119747161865,
"learning_rate": 1.153846153846154e-06,
"loss": 0.3983,
"step": 236000
},
{
"epoch": 4.817878096479792,
"grad_norm": 10.451170921325684,
"learning_rate": 1.0927314211212516e-06,
"loss": 0.3853,
"step": 236500
},
{
"epoch": 4.828063885267275,
"grad_norm": 18.35541343688965,
"learning_rate": 1.0316166883963496e-06,
"loss": 0.3779,
"step": 237000
},
{
"epoch": 4.838249674054759,
"grad_norm": 56.08652877807617,
"learning_rate": 9.705019556714473e-07,
"loss": 0.3608,
"step": 237500
},
{
"epoch": 4.848435462842242,
"grad_norm": 30.668987274169922,
"learning_rate": 9.09387222946545e-07,
"loss": 0.3901,
"step": 238000
},
{
"epoch": 4.858621251629726,
"grad_norm": 44.03152084350586,
"learning_rate": 8.482724902216428e-07,
"loss": 0.3742,
"step": 238500
},
{
"epoch": 4.8688070404172095,
"grad_norm": 46.421485900878906,
"learning_rate": 7.871577574967406e-07,
"loss": 0.4039,
"step": 239000
},
{
"epoch": 4.878992829204694,
"grad_norm": 41.14004898071289,
"learning_rate": 7.260430247718384e-07,
"loss": 0.4113,
"step": 239500
},
{
"epoch": 4.889178617992178,
"grad_norm": 3.9679319858551025,
"learning_rate": 6.649282920469362e-07,
"loss": 0.3856,
"step": 240000
},
{
"epoch": 4.899364406779661,
"grad_norm": 30.101577758789062,
"learning_rate": 6.038135593220339e-07,
"loss": 0.3742,
"step": 240500
},
{
"epoch": 4.909550195567145,
"grad_norm": 12.514345169067383,
"learning_rate": 5.426988265971316e-07,
"loss": 0.3795,
"step": 241000
},
{
"epoch": 4.919735984354628,
"grad_norm": 33.0556526184082,
"learning_rate": 4.815840938722295e-07,
"loss": 0.3865,
"step": 241500
},
{
"epoch": 4.929921773142112,
"grad_norm": 36.99669647216797,
"learning_rate": 4.2046936114732726e-07,
"loss": 0.4271,
"step": 242000
},
{
"epoch": 4.9401075619295955,
"grad_norm": 2.8075144290924072,
"learning_rate": 3.59354628422425e-07,
"loss": 0.3867,
"step": 242500
},
{
"epoch": 4.95029335071708,
"grad_norm": 0.9676657319068909,
"learning_rate": 2.9823989569752284e-07,
"loss": 0.3994,
"step": 243000
},
{
"epoch": 4.960479139504563,
"grad_norm": 26.419252395629883,
"learning_rate": 2.3712516297262062e-07,
"loss": 0.3956,
"step": 243500
},
{
"epoch": 4.970664928292047,
"grad_norm": 37.89094543457031,
"learning_rate": 1.7601043024771838e-07,
"loss": 0.3856,
"step": 244000
},
{
"epoch": 4.98085071707953,
"grad_norm": 20.763473510742188,
"learning_rate": 1.1489569752281617e-07,
"loss": 0.4161,
"step": 244500
},
{
"epoch": 4.991036505867014,
"grad_norm": 10.952351570129395,
"learning_rate": 5.378096479791395e-08,
"loss": 0.4074,
"step": 245000
},
{
"epoch": 5.0,
"eval_accuracy": 0.7310513257980347,
"eval_loss": 1.0102216005325317,
"eval_runtime": 7.6909,
"eval_samples_per_second": 638.153,
"eval_steps_per_second": 79.834,
"step": 245440
},
{
"epoch": 5.0,
"step": 245440,
"total_flos": 1.158449687808768e+17,
"train_loss": 0.5704954994082295,
"train_runtime": 14864.7666,
"train_samples_per_second": 132.092,
"train_steps_per_second": 16.512
}
],
"logging_steps": 500,
"max_steps": 245440,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.001
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.158449687808768e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}