{
"best_metric": 0.9182948490230906,
"best_model_checkpoint": "./results/checkpoint-29910",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 29910,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 4.954967021942139,
"learning_rate": 4.9832831828819794e-05,
"loss": 4.7403,
"step": 100
},
{
"epoch": 0.03,
"grad_norm": 9.918214797973633,
"learning_rate": 4.9665663657639585e-05,
"loss": 4.7281,
"step": 200
},
{
"epoch": 0.05,
"grad_norm": 6.391179084777832,
"learning_rate": 4.949849548645938e-05,
"loss": 4.6786,
"step": 300
},
{
"epoch": 0.07,
"grad_norm": 6.756315231323242,
"learning_rate": 4.9331327315279175e-05,
"loss": 4.6128,
"step": 400
},
{
"epoch": 0.08,
"grad_norm": 8.407713890075684,
"learning_rate": 4.916415914409897e-05,
"loss": 4.4836,
"step": 500
},
{
"epoch": 0.1,
"grad_norm": 8.354033470153809,
"learning_rate": 4.899699097291876e-05,
"loss": 4.3776,
"step": 600
},
{
"epoch": 0.12,
"grad_norm": 7.996518611907959,
"learning_rate": 4.882982280173855e-05,
"loss": 4.2701,
"step": 700
},
{
"epoch": 0.13,
"grad_norm": 14.100532531738281,
"learning_rate": 4.866265463055835e-05,
"loss": 4.1032,
"step": 800
},
{
"epoch": 0.15,
"grad_norm": 10.907315254211426,
"learning_rate": 4.849548645937814e-05,
"loss": 3.952,
"step": 900
},
{
"epoch": 0.17,
"grad_norm": 9.731605529785156,
"learning_rate": 4.8328318288197924e-05,
"loss": 3.732,
"step": 1000
},
{
"epoch": 0.18,
"grad_norm": 9.989665985107422,
"learning_rate": 4.816115011701772e-05,
"loss": 3.5489,
"step": 1100
},
{
"epoch": 0.2,
"grad_norm": 9.542133331298828,
"learning_rate": 4.7993981945837514e-05,
"loss": 3.3949,
"step": 1200
},
{
"epoch": 0.22,
"grad_norm": 11.988595008850098,
"learning_rate": 4.7826813774657305e-05,
"loss": 3.216,
"step": 1300
},
{
"epoch": 0.23,
"grad_norm": 13.553967475891113,
"learning_rate": 4.76596456034771e-05,
"loss": 2.9855,
"step": 1400
},
{
"epoch": 0.25,
"grad_norm": 42.20621109008789,
"learning_rate": 4.7492477432296895e-05,
"loss": 2.7659,
"step": 1500
},
{
"epoch": 0.27,
"grad_norm": 18.790130615234375,
"learning_rate": 4.732530926111669e-05,
"loss": 2.5604,
"step": 1600
},
{
"epoch": 0.28,
"grad_norm": 20.554113388061523,
"learning_rate": 4.715814108993648e-05,
"loss": 2.4376,
"step": 1700
},
{
"epoch": 0.3,
"grad_norm": 18.882707595825195,
"learning_rate": 4.699097291875627e-05,
"loss": 2.3501,
"step": 1800
},
{
"epoch": 0.32,
"grad_norm": 14.733109474182129,
"learning_rate": 4.682380474757606e-05,
"loss": 2.168,
"step": 1900
},
{
"epoch": 0.33,
"grad_norm": 17.430740356445312,
"learning_rate": 4.665663657639586e-05,
"loss": 2.0081,
"step": 2000
},
{
"epoch": 0.35,
"grad_norm": 21.797836303710938,
"learning_rate": 4.6489468405215644e-05,
"loss": 1.9554,
"step": 2100
},
{
"epoch": 0.37,
"grad_norm": 13.148958206176758,
"learning_rate": 4.632230023403544e-05,
"loss": 1.8524,
"step": 2200
},
{
"epoch": 0.38,
"grad_norm": 14.161394119262695,
"learning_rate": 4.6155132062855234e-05,
"loss": 1.793,
"step": 2300
},
{
"epoch": 0.4,
"grad_norm": 20.908519744873047,
"learning_rate": 4.5987963891675026e-05,
"loss": 1.6493,
"step": 2400
},
{
"epoch": 0.42,
"grad_norm": 15.107952117919922,
"learning_rate": 4.582079572049482e-05,
"loss": 1.5724,
"step": 2500
},
{
"epoch": 0.43,
"grad_norm": 18.561201095581055,
"learning_rate": 4.5653627549314615e-05,
"loss": 1.4915,
"step": 2600
},
{
"epoch": 0.45,
"grad_norm": 15.365275382995605,
"learning_rate": 4.548645937813441e-05,
"loss": 1.488,
"step": 2700
},
{
"epoch": 0.47,
"grad_norm": 16.04875946044922,
"learning_rate": 4.53192912069542e-05,
"loss": 1.4316,
"step": 2800
},
{
"epoch": 0.48,
"grad_norm": 13.593673706054688,
"learning_rate": 4.515212303577399e-05,
"loss": 1.456,
"step": 2900
},
{
"epoch": 0.5,
"grad_norm": 16.379798889160156,
"learning_rate": 4.498495486459378e-05,
"loss": 1.3006,
"step": 3000
},
{
"epoch": 0.52,
"grad_norm": 13.564205169677734,
"learning_rate": 4.481778669341358e-05,
"loss": 1.2661,
"step": 3100
},
{
"epoch": 0.53,
"grad_norm": 15.44586181640625,
"learning_rate": 4.4650618522233364e-05,
"loss": 1.2917,
"step": 3200
},
{
"epoch": 0.55,
"grad_norm": 12.80644416809082,
"learning_rate": 4.448345035105316e-05,
"loss": 1.1765,
"step": 3300
},
{
"epoch": 0.57,
"grad_norm": 19.545106887817383,
"learning_rate": 4.4316282179872954e-05,
"loss": 1.1622,
"step": 3400
},
{
"epoch": 0.59,
"grad_norm": 14.377379417419434,
"learning_rate": 4.414911400869275e-05,
"loss": 1.1047,
"step": 3500
},
{
"epoch": 0.6,
"grad_norm": 21.595245361328125,
"learning_rate": 4.398194583751254e-05,
"loss": 1.1384,
"step": 3600
},
{
"epoch": 0.62,
"grad_norm": 14.641448020935059,
"learning_rate": 4.3814777666332335e-05,
"loss": 1.0872,
"step": 3700
},
{
"epoch": 0.64,
"grad_norm": 13.082781791687012,
"learning_rate": 4.364760949515213e-05,
"loss": 1.0366,
"step": 3800
},
{
"epoch": 0.65,
"grad_norm": 18.576641082763672,
"learning_rate": 4.348044132397192e-05,
"loss": 1.0953,
"step": 3900
},
{
"epoch": 0.67,
"grad_norm": 9.915220260620117,
"learning_rate": 4.331327315279171e-05,
"loss": 1.001,
"step": 4000
},
{
"epoch": 0.69,
"grad_norm": 12.059024810791016,
"learning_rate": 4.31461049816115e-05,
"loss": 1.0585,
"step": 4100
},
{
"epoch": 0.7,
"grad_norm": 17.607337951660156,
"learning_rate": 4.29789368104313e-05,
"loss": 1.0179,
"step": 4200
},
{
"epoch": 0.72,
"grad_norm": 16.324430465698242,
"learning_rate": 4.2811768639251084e-05,
"loss": 0.9491,
"step": 4300
},
{
"epoch": 0.74,
"grad_norm": 19.5161075592041,
"learning_rate": 4.264460046807088e-05,
"loss": 0.9374,
"step": 4400
},
{
"epoch": 0.75,
"grad_norm": 20.448488235473633,
"learning_rate": 4.2477432296890674e-05,
"loss": 0.9146,
"step": 4500
},
{
"epoch": 0.77,
"grad_norm": 10.544804573059082,
"learning_rate": 4.231026412571047e-05,
"loss": 0.9187,
"step": 4600
},
{
"epoch": 0.79,
"grad_norm": 17.095731735229492,
"learning_rate": 4.214309595453026e-05,
"loss": 0.8732,
"step": 4700
},
{
"epoch": 0.8,
"grad_norm": 18.1314754486084,
"learning_rate": 4.197592778335005e-05,
"loss": 0.9072,
"step": 4800
},
{
"epoch": 0.82,
"grad_norm": 8.516233444213867,
"learning_rate": 4.180875961216985e-05,
"loss": 0.8264,
"step": 4900
},
{
"epoch": 0.84,
"grad_norm": 12.620676040649414,
"learning_rate": 4.164159144098964e-05,
"loss": 0.8425,
"step": 5000
},
{
"epoch": 0.85,
"grad_norm": 23.544219970703125,
"learning_rate": 4.147442326980943e-05,
"loss": 0.8371,
"step": 5100
},
{
"epoch": 0.87,
"grad_norm": 15.980536460876465,
"learning_rate": 4.130725509862922e-05,
"loss": 0.8257,
"step": 5200
},
{
"epoch": 0.89,
"grad_norm": 16.621524810791016,
"learning_rate": 4.114008692744902e-05,
"loss": 0.7705,
"step": 5300
},
{
"epoch": 0.9,
"grad_norm": 25.2496280670166,
"learning_rate": 4.0972918756268804e-05,
"loss": 0.7741,
"step": 5400
},
{
"epoch": 0.92,
"grad_norm": 12.541385650634766,
"learning_rate": 4.08057505850886e-05,
"loss": 0.7408,
"step": 5500
},
{
"epoch": 0.94,
"grad_norm": 30.975236892700195,
"learning_rate": 4.0638582413908394e-05,
"loss": 0.7417,
"step": 5600
},
{
"epoch": 0.95,
"grad_norm": 16.33625030517578,
"learning_rate": 4.0471414242728186e-05,
"loss": 0.766,
"step": 5700
},
{
"epoch": 0.97,
"grad_norm": 17.48399543762207,
"learning_rate": 4.030424607154798e-05,
"loss": 0.8336,
"step": 5800
},
{
"epoch": 0.99,
"grad_norm": 19.421096801757812,
"learning_rate": 4.013707790036777e-05,
"loss": 0.7135,
"step": 5900
},
{
"epoch": 1.0,
"eval_accuracy": 0.7995263469508584,
"eval_f1": 0.7955612032049123,
"eval_loss": 0.7165877223014832,
"eval_precision": 0.805591523931921,
"eval_recall": 0.7995263469508584,
"eval_runtime": 64.1068,
"eval_samples_per_second": 131.733,
"eval_steps_per_second": 8.236,
"step": 5982
},
{
"epoch": 1.0,
"grad_norm": 10.486939430236816,
"learning_rate": 3.996990972918757e-05,
"loss": 0.685,
"step": 6000
},
{
"epoch": 1.02,
"grad_norm": 19.489837646484375,
"learning_rate": 3.980274155800736e-05,
"loss": 0.6431,
"step": 6100
},
{
"epoch": 1.04,
"grad_norm": 8.935369491577148,
"learning_rate": 3.963557338682715e-05,
"loss": 0.6402,
"step": 6200
},
{
"epoch": 1.05,
"grad_norm": 10.298083305358887,
"learning_rate": 3.946840521564694e-05,
"loss": 0.6261,
"step": 6300
},
{
"epoch": 1.07,
"grad_norm": 18.606569290161133,
"learning_rate": 3.930123704446674e-05,
"loss": 0.5874,
"step": 6400
},
{
"epoch": 1.09,
"grad_norm": 12.412484169006348,
"learning_rate": 3.913406887328653e-05,
"loss": 0.5923,
"step": 6500
},
{
"epoch": 1.1,
"grad_norm": 9.3939847946167,
"learning_rate": 3.8966900702106316e-05,
"loss": 0.6091,
"step": 6600
},
{
"epoch": 1.12,
"grad_norm": 14.168825149536133,
"learning_rate": 3.8799732530926114e-05,
"loss": 0.6259,
"step": 6700
},
{
"epoch": 1.14,
"grad_norm": 18.846487045288086,
"learning_rate": 3.8632564359745906e-05,
"loss": 0.5543,
"step": 6800
},
{
"epoch": 1.15,
"grad_norm": 7.268430709838867,
"learning_rate": 3.84653961885657e-05,
"loss": 0.5615,
"step": 6900
},
{
"epoch": 1.17,
"grad_norm": 6.565930366516113,
"learning_rate": 3.829822801738549e-05,
"loss": 0.5725,
"step": 7000
},
{
"epoch": 1.19,
"grad_norm": 11.122172355651855,
"learning_rate": 3.813105984620529e-05,
"loss": 0.543,
"step": 7100
},
{
"epoch": 1.2,
"grad_norm": 15.909794807434082,
"learning_rate": 3.796389167502508e-05,
"loss": 0.5053,
"step": 7200
},
{
"epoch": 1.22,
"grad_norm": 17.935998916625977,
"learning_rate": 3.779672350384487e-05,
"loss": 0.5866,
"step": 7300
},
{
"epoch": 1.24,
"grad_norm": 7.46903657913208,
"learning_rate": 3.762955533266466e-05,
"loss": 0.5573,
"step": 7400
},
{
"epoch": 1.25,
"grad_norm": 10.208723068237305,
"learning_rate": 3.746238716148446e-05,
"loss": 0.511,
"step": 7500
},
{
"epoch": 1.27,
"grad_norm": 15.062224388122559,
"learning_rate": 3.729521899030425e-05,
"loss": 0.5211,
"step": 7600
},
{
"epoch": 1.29,
"grad_norm": 11.787239074707031,
"learning_rate": 3.7128050819124036e-05,
"loss": 0.5687,
"step": 7700
},
{
"epoch": 1.3,
"grad_norm": 20.22210693359375,
"learning_rate": 3.6960882647943834e-05,
"loss": 0.544,
"step": 7800
},
{
"epoch": 1.32,
"grad_norm": 22.17251205444336,
"learning_rate": 3.6793714476763626e-05,
"loss": 0.5223,
"step": 7900
},
{
"epoch": 1.34,
"grad_norm": 16.83318519592285,
"learning_rate": 3.662654630558342e-05,
"loss": 0.5043,
"step": 8000
},
{
"epoch": 1.35,
"grad_norm": 10.143548965454102,
"learning_rate": 3.645937813440321e-05,
"loss": 0.5181,
"step": 8100
},
{
"epoch": 1.37,
"grad_norm": 20.629831314086914,
"learning_rate": 3.629220996322301e-05,
"loss": 0.4886,
"step": 8200
},
{
"epoch": 1.39,
"grad_norm": 12.14686107635498,
"learning_rate": 3.61250417920428e-05,
"loss": 0.5667,
"step": 8300
},
{
"epoch": 1.4,
"grad_norm": 17.1881160736084,
"learning_rate": 3.595787362086259e-05,
"loss": 0.5211,
"step": 8400
},
{
"epoch": 1.42,
"grad_norm": 7.506267070770264,
"learning_rate": 3.579070544968238e-05,
"loss": 0.5356,
"step": 8500
},
{
"epoch": 1.44,
"grad_norm": 23.122560501098633,
"learning_rate": 3.562353727850217e-05,
"loss": 0.5044,
"step": 8600
},
{
"epoch": 1.45,
"grad_norm": 21.808191299438477,
"learning_rate": 3.545636910732197e-05,
"loss": 0.5059,
"step": 8700
},
{
"epoch": 1.47,
"grad_norm": 12.899435997009277,
"learning_rate": 3.5289200936141756e-05,
"loss": 0.5082,
"step": 8800
},
{
"epoch": 1.49,
"grad_norm": 11.228046417236328,
"learning_rate": 3.5122032764961554e-05,
"loss": 0.4466,
"step": 8900
},
{
"epoch": 1.5,
"grad_norm": 15.656624794006348,
"learning_rate": 3.4954864593781346e-05,
"loss": 0.4877,
"step": 9000
},
{
"epoch": 1.52,
"grad_norm": 14.958187103271484,
"learning_rate": 3.478769642260114e-05,
"loss": 0.4283,
"step": 9100
},
{
"epoch": 1.54,
"grad_norm": 27.727924346923828,
"learning_rate": 3.462052825142093e-05,
"loss": 0.504,
"step": 9200
},
{
"epoch": 1.55,
"grad_norm": 21.103147506713867,
"learning_rate": 3.445336008024073e-05,
"loss": 0.5081,
"step": 9300
},
{
"epoch": 1.57,
"grad_norm": 14.884688377380371,
"learning_rate": 3.428619190906052e-05,
"loss": 0.47,
"step": 9400
},
{
"epoch": 1.59,
"grad_norm": 26.825908660888672,
"learning_rate": 3.411902373788031e-05,
"loss": 0.4587,
"step": 9500
},
{
"epoch": 1.6,
"grad_norm": 23.39227867126465,
"learning_rate": 3.39518555667001e-05,
"loss": 0.4621,
"step": 9600
},
{
"epoch": 1.62,
"grad_norm": 15.503640174865723,
"learning_rate": 3.378468739551989e-05,
"loss": 0.5122,
"step": 9700
},
{
"epoch": 1.64,
"grad_norm": 13.298539161682129,
"learning_rate": 3.361751922433969e-05,
"loss": 0.4846,
"step": 9800
},
{
"epoch": 1.65,
"grad_norm": 17.961261749267578,
"learning_rate": 3.3450351053159476e-05,
"loss": 0.4576,
"step": 9900
},
{
"epoch": 1.67,
"grad_norm": 15.622933387756348,
"learning_rate": 3.3283182881979274e-05,
"loss": 0.4239,
"step": 10000
},
{
"epoch": 1.69,
"grad_norm": 15.286486625671387,
"learning_rate": 3.3116014710799066e-05,
"loss": 0.4478,
"step": 10100
},
{
"epoch": 1.71,
"grad_norm": 28.045799255371094,
"learning_rate": 3.294884653961886e-05,
"loss": 0.4457,
"step": 10200
},
{
"epoch": 1.72,
"grad_norm": 23.578136444091797,
"learning_rate": 3.278167836843865e-05,
"loss": 0.464,
"step": 10300
},
{
"epoch": 1.74,
"grad_norm": 12.858305931091309,
"learning_rate": 3.261451019725844e-05,
"loss": 0.4507,
"step": 10400
},
{
"epoch": 1.76,
"grad_norm": 18.197952270507812,
"learning_rate": 3.244734202607824e-05,
"loss": 0.4158,
"step": 10500
},
{
"epoch": 1.77,
"grad_norm": 5.134513854980469,
"learning_rate": 3.228017385489803e-05,
"loss": 0.4088,
"step": 10600
},
{
"epoch": 1.79,
"grad_norm": 2.1014363765716553,
"learning_rate": 3.211300568371782e-05,
"loss": 0.4524,
"step": 10700
},
{
"epoch": 1.81,
"grad_norm": 14.459040641784668,
"learning_rate": 3.194583751253761e-05,
"loss": 0.4637,
"step": 10800
},
{
"epoch": 1.82,
"grad_norm": 29.922468185424805,
"learning_rate": 3.177866934135741e-05,
"loss": 0.4302,
"step": 10900
},
{
"epoch": 1.84,
"grad_norm": 23.523460388183594,
"learning_rate": 3.1611501170177196e-05,
"loss": 0.4155,
"step": 11000
},
{
"epoch": 1.86,
"grad_norm": 11.668371200561523,
"learning_rate": 3.1444332998996994e-05,
"loss": 0.4238,
"step": 11100
},
{
"epoch": 1.87,
"grad_norm": 15.930005073547363,
"learning_rate": 3.1277164827816786e-05,
"loss": 0.4072,
"step": 11200
},
{
"epoch": 1.89,
"grad_norm": 18.61160659790039,
"learning_rate": 3.110999665663658e-05,
"loss": 0.4348,
"step": 11300
},
{
"epoch": 1.91,
"grad_norm": 27.475053787231445,
"learning_rate": 3.094282848545637e-05,
"loss": 0.4648,
"step": 11400
},
{
"epoch": 1.92,
"grad_norm": 6.477468013763428,
"learning_rate": 3.077566031427616e-05,
"loss": 0.4241,
"step": 11500
},
{
"epoch": 1.94,
"grad_norm": 26.99014663696289,
"learning_rate": 3.060849214309596e-05,
"loss": 0.4243,
"step": 11600
},
{
"epoch": 1.96,
"grad_norm": 16.152755737304688,
"learning_rate": 3.0441323971915747e-05,
"loss": 0.4186,
"step": 11700
},
{
"epoch": 1.97,
"grad_norm": 15.536150932312012,
"learning_rate": 3.0274155800735542e-05,
"loss": 0.3808,
"step": 11800
},
{
"epoch": 1.99,
"grad_norm": 23.708145141601562,
"learning_rate": 3.0106987629555333e-05,
"loss": 0.4365,
"step": 11900
},
{
"epoch": 2.0,
"eval_accuracy": 0.8680876258140912,
"eval_f1": 0.8628914936078326,
"eval_loss": 0.4633374810218811,
"eval_precision": 0.8684864554322808,
"eval_recall": 0.8680876258140912,
"eval_runtime": 64.0052,
"eval_samples_per_second": 131.942,
"eval_steps_per_second": 8.249,
"step": 11964
},
{
"epoch": 2.01,
"grad_norm": 10.474257469177246,
"learning_rate": 2.9939819458375128e-05,
"loss": 0.3853,
"step": 12000
},
{
"epoch": 2.02,
"grad_norm": 15.668170928955078,
"learning_rate": 2.977265128719492e-05,
"loss": 0.2858,
"step": 12100
},
{
"epoch": 2.04,
"grad_norm": 10.29902172088623,
"learning_rate": 2.960548311601471e-05,
"loss": 0.2803,
"step": 12200
},
{
"epoch": 2.06,
"grad_norm": 33.27579116821289,
"learning_rate": 2.9438314944834506e-05,
"loss": 0.2858,
"step": 12300
},
{
"epoch": 2.07,
"grad_norm": 13.799466133117676,
"learning_rate": 2.9271146773654294e-05,
"loss": 0.2793,
"step": 12400
},
{
"epoch": 2.09,
"grad_norm": 4.722692489624023,
"learning_rate": 2.9103978602474092e-05,
"loss": 0.2935,
"step": 12500
},
{
"epoch": 2.11,
"grad_norm": 8.643231391906738,
"learning_rate": 2.893681043129388e-05,
"loss": 0.2825,
"step": 12600
},
{
"epoch": 2.12,
"grad_norm": 10.378469467163086,
"learning_rate": 2.876964226011368e-05,
"loss": 0.2845,
"step": 12700
},
{
"epoch": 2.14,
"grad_norm": 9.1376953125,
"learning_rate": 2.8602474088933467e-05,
"loss": 0.2725,
"step": 12800
},
{
"epoch": 2.16,
"grad_norm": 10.372312545776367,
"learning_rate": 2.8435305917753262e-05,
"loss": 0.3067,
"step": 12900
},
{
"epoch": 2.17,
"grad_norm": 23.952699661254883,
"learning_rate": 2.8268137746573053e-05,
"loss": 0.2934,
"step": 13000
},
{
"epoch": 2.19,
"grad_norm": 2.125562906265259,
"learning_rate": 2.8100969575392848e-05,
"loss": 0.2535,
"step": 13100
},
{
"epoch": 2.21,
"grad_norm": 8.090828895568848,
"learning_rate": 2.793380140421264e-05,
"loss": 0.295,
"step": 13200
},
{
"epoch": 2.22,
"grad_norm": 13.274210929870605,
"learning_rate": 2.776663323303243e-05,
"loss": 0.2851,
"step": 13300
},
{
"epoch": 2.24,
"grad_norm": 5.6807732582092285,
"learning_rate": 2.7599465061852226e-05,
"loss": 0.2662,
"step": 13400
},
{
"epoch": 2.26,
"grad_norm": 11.885269165039062,
"learning_rate": 2.7432296890672014e-05,
"loss": 0.2969,
"step": 13500
},
{
"epoch": 2.27,
"grad_norm": 21.52318000793457,
"learning_rate": 2.7265128719491812e-05,
"loss": 0.2706,
"step": 13600
},
{
"epoch": 2.29,
"grad_norm": 21.661279678344727,
"learning_rate": 2.70979605483116e-05,
"loss": 0.2715,
"step": 13700
},
{
"epoch": 2.31,
"grad_norm": 27.985078811645508,
"learning_rate": 2.69307923771314e-05,
"loss": 0.3016,
"step": 13800
},
{
"epoch": 2.32,
"grad_norm": 11.431729316711426,
"learning_rate": 2.6763624205951187e-05,
"loss": 0.2501,
"step": 13900
},
{
"epoch": 2.34,
"grad_norm": 5.3406901359558105,
"learning_rate": 2.6596456034770982e-05,
"loss": 0.2762,
"step": 14000
},
{
"epoch": 2.36,
"grad_norm": 11.077746391296387,
"learning_rate": 2.6429287863590773e-05,
"loss": 0.2819,
"step": 14100
},
{
"epoch": 2.37,
"grad_norm": 17.451330184936523,
"learning_rate": 2.6262119692410565e-05,
"loss": 0.3074,
"step": 14200
},
{
"epoch": 2.39,
"grad_norm": 7.353370189666748,
"learning_rate": 2.609495152123036e-05,
"loss": 0.3068,
"step": 14300
},
{
"epoch": 2.41,
"grad_norm": 12.055102348327637,
"learning_rate": 2.592778335005015e-05,
"loss": 0.2779,
"step": 14400
},
{
"epoch": 2.42,
"grad_norm": 17.555917739868164,
"learning_rate": 2.5760615178869946e-05,
"loss": 0.2421,
"step": 14500
},
{
"epoch": 2.44,
"grad_norm": 22.887771606445312,
"learning_rate": 2.5593447007689734e-05,
"loss": 0.3016,
"step": 14600
},
{
"epoch": 2.46,
"grad_norm": 1.915899395942688,
"learning_rate": 2.5426278836509533e-05,
"loss": 0.2638,
"step": 14700
},
{
"epoch": 2.47,
"grad_norm": 13.446496963500977,
"learning_rate": 2.525911066532932e-05,
"loss": 0.293,
"step": 14800
},
{
"epoch": 2.49,
"grad_norm": 12.734638214111328,
"learning_rate": 2.509194249414912e-05,
"loss": 0.2668,
"step": 14900
},
{
"epoch": 2.51,
"grad_norm": 15.557112693786621,
"learning_rate": 2.4924774322968907e-05,
"loss": 0.2691,
"step": 15000
},
{
"epoch": 2.52,
"grad_norm": 10.383445739746094,
"learning_rate": 2.4757606151788702e-05,
"loss": 0.2204,
"step": 15100
},
{
"epoch": 2.54,
"grad_norm": 7.19666862487793,
"learning_rate": 2.4590437980608493e-05,
"loss": 0.2447,
"step": 15200
},
{
"epoch": 2.56,
"grad_norm": 17.903339385986328,
"learning_rate": 2.442326980942829e-05,
"loss": 0.2504,
"step": 15300
},
{
"epoch": 2.57,
"grad_norm": 10.492616653442383,
"learning_rate": 2.425610163824808e-05,
"loss": 0.2256,
"step": 15400
},
{
"epoch": 2.59,
"grad_norm": 11.051074028015137,
"learning_rate": 2.408893346706787e-05,
"loss": 0.259,
"step": 15500
},
{
"epoch": 2.61,
"grad_norm": 23.400402069091797,
"learning_rate": 2.3921765295887663e-05,
"loss": 0.2487,
"step": 15600
},
{
"epoch": 2.62,
"grad_norm": 20.601686477661133,
"learning_rate": 2.3754597124707458e-05,
"loss": 0.2338,
"step": 15700
},
{
"epoch": 2.64,
"grad_norm": 12.519159317016602,
"learning_rate": 2.358742895352725e-05,
"loss": 0.2652,
"step": 15800
},
{
"epoch": 2.66,
"grad_norm": 21.95683479309082,
"learning_rate": 2.342026078234704e-05,
"loss": 0.2306,
"step": 15900
},
{
"epoch": 2.67,
"grad_norm": 24.98236656188965,
"learning_rate": 2.3253092611166836e-05,
"loss": 0.2475,
"step": 16000
},
{
"epoch": 2.69,
"grad_norm": 6.362200736999512,
"learning_rate": 2.3085924439986627e-05,
"loss": 0.2646,
"step": 16100
},
{
"epoch": 2.71,
"grad_norm": 14.293391227722168,
"learning_rate": 2.2918756268806422e-05,
"loss": 0.2404,
"step": 16200
},
{
"epoch": 2.72,
"grad_norm": 11.405878067016602,
"learning_rate": 2.2751588097626213e-05,
"loss": 0.2651,
"step": 16300
},
{
"epoch": 2.74,
"grad_norm": 15.082180976867676,
"learning_rate": 2.258441992644601e-05,
"loss": 0.281,
"step": 16400
},
{
"epoch": 2.76,
"grad_norm": 27.33397674560547,
"learning_rate": 2.2417251755265796e-05,
"loss": 0.2492,
"step": 16500
},
{
"epoch": 2.77,
"grad_norm": 10.052102088928223,
"learning_rate": 2.225008358408559e-05,
"loss": 0.2382,
"step": 16600
},
{
"epoch": 2.79,
"grad_norm": 15.405964851379395,
"learning_rate": 2.2082915412905383e-05,
"loss": 0.2496,
"step": 16700
},
{
"epoch": 2.81,
"grad_norm": 7.162382125854492,
"learning_rate": 2.1915747241725178e-05,
"loss": 0.2343,
"step": 16800
},
{
"epoch": 2.83,
"grad_norm": 11.130888938903809,
"learning_rate": 2.174857907054497e-05,
"loss": 0.2474,
"step": 16900
},
{
"epoch": 2.84,
"grad_norm": 8.277360916137695,
"learning_rate": 2.158141089936476e-05,
"loss": 0.2687,
"step": 17000
},
{
"epoch": 2.86,
"grad_norm": 31.100744247436523,
"learning_rate": 2.1414242728184556e-05,
"loss": 0.2422,
"step": 17100
},
{
"epoch": 2.88,
"grad_norm": 12.757442474365234,
"learning_rate": 2.1247074557004347e-05,
"loss": 0.2275,
"step": 17200
},
{
"epoch": 2.89,
"grad_norm": 4.860738277435303,
"learning_rate": 2.1079906385824142e-05,
"loss": 0.2252,
"step": 17300
},
{
"epoch": 2.91,
"grad_norm": 10.574835777282715,
"learning_rate": 2.091273821464393e-05,
"loss": 0.2114,
"step": 17400
},
{
"epoch": 2.93,
"grad_norm": 13.01117992401123,
"learning_rate": 2.0745570043463725e-05,
"loss": 0.2407,
"step": 17500
},
{
"epoch": 2.94,
"grad_norm": 4.970390319824219,
"learning_rate": 2.0578401872283517e-05,
"loss": 0.2509,
"step": 17600
},
{
"epoch": 2.96,
"grad_norm": 18.95350456237793,
"learning_rate": 2.041123370110331e-05,
"loss": 0.2814,
"step": 17700
},
{
"epoch": 2.98,
"grad_norm": 1.5296308994293213,
"learning_rate": 2.0244065529923103e-05,
"loss": 0.235,
"step": 17800
},
{
"epoch": 2.99,
"grad_norm": 12.501904487609863,
"learning_rate": 2.0076897358742898e-05,
"loss": 0.2479,
"step": 17900
},
{
"epoch": 3.0,
"eval_accuracy": 0.8965068087625814,
"eval_f1": 0.8930257247589533,
"eval_loss": 0.36622655391693115,
"eval_precision": 0.8950199629292306,
"eval_recall": 0.8965068087625814,
"eval_runtime": 64.0862,
"eval_samples_per_second": 131.776,
"eval_steps_per_second": 8.239,
"step": 17946
},
{
"epoch": 3.01,
"grad_norm": 19.13836097717285,
"learning_rate": 1.990972918756269e-05,
"loss": 0.2272,
"step": 18000
},
{
"epoch": 3.03,
"grad_norm": 8.622084617614746,
"learning_rate": 1.9742561016382484e-05,
"loss": 0.131,
"step": 18100
},
{
"epoch": 3.04,
"grad_norm": 32.99411392211914,
"learning_rate": 1.9575392845202276e-05,
"loss": 0.1477,
"step": 18200
},
{
"epoch": 3.06,
"grad_norm": 5.467390060424805,
"learning_rate": 1.9408224674022067e-05,
"loss": 0.1439,
"step": 18300
},
{
"epoch": 3.08,
"grad_norm": 2.5153982639312744,
"learning_rate": 1.924105650284186e-05,
"loss": 0.1405,
"step": 18400
},
{
"epoch": 3.09,
"grad_norm": 20.424579620361328,
"learning_rate": 1.907388833166165e-05,
"loss": 0.1594,
"step": 18500
},
{
"epoch": 3.11,
"grad_norm": 5.207544803619385,
"learning_rate": 1.8906720160481445e-05,
"loss": 0.1323,
"step": 18600
},
{
"epoch": 3.13,
"grad_norm": 8.750362396240234,
"learning_rate": 1.8739551989301237e-05,
"loss": 0.1683,
"step": 18700
},
{
"epoch": 3.14,
"grad_norm": 2.464329481124878,
"learning_rate": 1.857238381812103e-05,
"loss": 0.1388,
"step": 18800
},
{
"epoch": 3.16,
"grad_norm": 3.784031867980957,
"learning_rate": 1.8405215646940823e-05,
"loss": 0.149,
"step": 18900
},
{
"epoch": 3.18,
"grad_norm": 2.632542610168457,
"learning_rate": 1.8238047475760618e-05,
"loss": 0.1284,
"step": 19000
},
{
"epoch": 3.19,
"grad_norm": 11.050533294677734,
"learning_rate": 1.807087930458041e-05,
"loss": 0.1525,
"step": 19100
},
{
"epoch": 3.21,
"grad_norm": 7.363661766052246,
"learning_rate": 1.7903711133400204e-05,
"loss": 0.1481,
"step": 19200
},
{
"epoch": 3.23,
"grad_norm": 9.882287979125977,
"learning_rate": 1.7736542962219992e-05,
"loss": 0.1231,
"step": 19300
},
{
"epoch": 3.24,
"grad_norm": 24.93657684326172,
"learning_rate": 1.7569374791039787e-05,
"loss": 0.1332,
"step": 19400
},
{
"epoch": 3.26,
"grad_norm": 2.2802133560180664,
"learning_rate": 1.740220661985958e-05,
"loss": 0.1425,
"step": 19500
},
{
"epoch": 3.28,
"grad_norm": 1.5991661548614502,
"learning_rate": 1.7235038448679374e-05,
"loss": 0.1283,
"step": 19600
},
{
"epoch": 3.29,
"grad_norm": 8.344457626342773,
"learning_rate": 1.7067870277499165e-05,
"loss": 0.1502,
"step": 19700
},
{
"epoch": 3.31,
"grad_norm": 12.95904541015625,
"learning_rate": 1.6900702106318957e-05,
"loss": 0.1287,
"step": 19800
},
{
"epoch": 3.33,
"grad_norm": 20.562625885009766,
"learning_rate": 1.673353393513875e-05,
"loss": 0.1422,
"step": 19900
},
{
"epoch": 3.34,
"grad_norm": 4.20346736907959,
"learning_rate": 1.6566365763958543e-05,
"loss": 0.1082,
"step": 20000
},
{
"epoch": 3.36,
"grad_norm": 25.636775970458984,
"learning_rate": 1.6399197592778338e-05,
"loss": 0.1416,
"step": 20100
},
{
"epoch": 3.38,
"grad_norm": 23.23301887512207,
"learning_rate": 1.6232029421598126e-05,
"loss": 0.1497,
"step": 20200
},
{
"epoch": 3.39,
"grad_norm": 22.21303939819336,
"learning_rate": 1.606486125041792e-05,
"loss": 0.1568,
"step": 20300
},
{
"epoch": 3.41,
"grad_norm": 21.14128303527832,
"learning_rate": 1.5897693079237712e-05,
"loss": 0.139,
"step": 20400
},
{
"epoch": 3.43,
"grad_norm": 22.63404083251953,
"learning_rate": 1.5730524908057507e-05,
"loss": 0.1518,
"step": 20500
},
{
"epoch": 3.44,
"grad_norm": 13.030010223388672,
"learning_rate": 1.55633567368773e-05,
"loss": 0.1319,
"step": 20600
},
{
"epoch": 3.46,
"grad_norm": 18.308670043945312,
"learning_rate": 1.5396188565697094e-05,
"loss": 0.1494,
"step": 20700
},
{
"epoch": 3.48,
"grad_norm": 24.907419204711914,
"learning_rate": 1.5229020394516885e-05,
"loss": 0.1425,
"step": 20800
},
{
"epoch": 3.49,
"grad_norm": 19.32282066345215,
"learning_rate": 1.5061852223336678e-05,
"loss": 0.1264,
"step": 20900
},
{
"epoch": 3.51,
"grad_norm": 17.444271087646484,
"learning_rate": 1.4894684052156472e-05,
"loss": 0.14,
"step": 21000
},
{
"epoch": 3.53,
"grad_norm": 1.832461953163147,
"learning_rate": 1.4727515880976261e-05,
"loss": 0.1438,
"step": 21100
},
{
"epoch": 3.54,
"grad_norm": 10.410861015319824,
"learning_rate": 1.4560347709796055e-05,
"loss": 0.1393,
"step": 21200
},
{
"epoch": 3.56,
"grad_norm": 3.6459202766418457,
"learning_rate": 1.4393179538615848e-05,
"loss": 0.1077,
"step": 21300
},
{
"epoch": 3.58,
"grad_norm": 3.216399669647217,
"learning_rate": 1.4226011367435641e-05,
"loss": 0.1154,
"step": 21400
},
{
"epoch": 3.59,
"grad_norm": 5.621729373931885,
"learning_rate": 1.4058843196255434e-05,
"loss": 0.1208,
"step": 21500
},
{
"epoch": 3.61,
"grad_norm": 5.559453010559082,
"learning_rate": 1.3891675025075226e-05,
"loss": 0.1441,
"step": 21600
},
{
"epoch": 3.63,
"grad_norm": 22.32745933532715,
"learning_rate": 1.3724506853895019e-05,
"loss": 0.1176,
"step": 21700
},
{
"epoch": 3.64,
"grad_norm": 4.509443759918213,
"learning_rate": 1.3557338682714812e-05,
"loss": 0.1382,
"step": 21800
},
{
"epoch": 3.66,
"grad_norm": 15.154895782470703,
"learning_rate": 1.3390170511534605e-05,
"loss": 0.1475,
"step": 21900
},
{
"epoch": 3.68,
"grad_norm": 0.8804099559783936,
"learning_rate": 1.3223002340354398e-05,
"loss": 0.1325,
"step": 22000
},
{
"epoch": 3.69,
"grad_norm": 1.9917913675308228,
"learning_rate": 1.3055834169174188e-05,
"loss": 0.1255,
"step": 22100
},
{
"epoch": 3.71,
"grad_norm": 16.314374923706055,
"learning_rate": 1.2888665997993981e-05,
"loss": 0.1275,
"step": 22200
},
{
"epoch": 3.73,
"grad_norm": 5.355242729187012,
"learning_rate": 1.2721497826813775e-05,
"loss": 0.1185,
"step": 22300
},
{
"epoch": 3.74,
"grad_norm": 20.218473434448242,
"learning_rate": 1.2554329655633568e-05,
"loss": 0.1203,
"step": 22400
},
{
"epoch": 3.76,
"grad_norm": 1.39955735206604,
"learning_rate": 1.2387161484453361e-05,
"loss": 0.1636,
"step": 22500
},
{
"epoch": 3.78,
"grad_norm": 17.855899810791016,
"learning_rate": 1.2219993313273154e-05,
"loss": 0.1369,
"step": 22600
},
{
"epoch": 3.79,
"grad_norm": 14.41054630279541,
"learning_rate": 1.2052825142092947e-05,
"loss": 0.1245,
"step": 22700
},
{
"epoch": 3.81,
"grad_norm": 11.451350212097168,
"learning_rate": 1.1885656970912739e-05,
"loss": 0.1508,
"step": 22800
},
{
"epoch": 3.83,
"grad_norm": 9.41112995147705,
"learning_rate": 1.171848879973253e-05,
"loss": 0.125,
"step": 22900
},
{
"epoch": 3.84,
"grad_norm": 29.826963424682617,
"learning_rate": 1.1551320628552324e-05,
"loss": 0.1545,
"step": 23000
},
{
"epoch": 3.86,
"grad_norm": 11.454690933227539,
"learning_rate": 1.1384152457372117e-05,
"loss": 0.1353,
"step": 23100
},
{
"epoch": 3.88,
"grad_norm": 12.364923477172852,
"learning_rate": 1.121698428619191e-05,
"loss": 0.1346,
"step": 23200
},
{
"epoch": 3.9,
"grad_norm": 1.8181456327438354,
"learning_rate": 1.1049816115011702e-05,
"loss": 0.1092,
"step": 23300
},
{
"epoch": 3.91,
"grad_norm": 30.87436866760254,
"learning_rate": 1.0882647943831495e-05,
"loss": 0.1059,
"step": 23400
},
{
"epoch": 3.93,
"grad_norm": 16.423452377319336,
"learning_rate": 1.0715479772651288e-05,
"loss": 0.1157,
"step": 23500
},
{
"epoch": 3.95,
"grad_norm": 27.86665153503418,
"learning_rate": 1.0548311601471081e-05,
"loss": 0.1317,
"step": 23600
},
{
"epoch": 3.96,
"grad_norm": 24.479764938354492,
"learning_rate": 1.0381143430290873e-05,
"loss": 0.1184,
"step": 23700
},
{
"epoch": 3.98,
"grad_norm": 1.4079170227050781,
"learning_rate": 1.0213975259110666e-05,
"loss": 0.1303,
"step": 23800
},
{
"epoch": 4.0,
"grad_norm": 4.259897232055664,
"learning_rate": 1.0046807087930459e-05,
"loss": 0.1322,
"step": 23900
},
{
"epoch": 4.0,
"eval_accuracy": 0.9113084665482534,
"eval_f1": 0.9092055511030135,
"eval_loss": 0.3260073661804199,
"eval_precision": 0.9099757491171729,
"eval_recall": 0.9113084665482534,
"eval_runtime": 64.1166,
"eval_samples_per_second": 131.713,
"eval_steps_per_second": 8.235,
"step": 23928
},
{
"epoch": 4.01,
"grad_norm": 13.925552368164062,
"learning_rate": 9.879638916750252e-06,
"loss": 0.0687,
"step": 24000
},
{
"epoch": 4.03,
"grad_norm": 0.18495211005210876,
"learning_rate": 9.712470745570044e-06,
"loss": 0.066,
"step": 24100
},
{
"epoch": 4.05,
"grad_norm": 1.0808857679367065,
"learning_rate": 9.545302574389837e-06,
"loss": 0.0648,
"step": 24200
},
{
"epoch": 4.06,
"grad_norm": 1.0073552131652832,
"learning_rate": 9.378134403209628e-06,
"loss": 0.071,
"step": 24300
},
{
"epoch": 4.08,
"grad_norm": 15.166232109069824,
"learning_rate": 9.210966232029422e-06,
"loss": 0.0666,
"step": 24400
},
{
"epoch": 4.1,
"grad_norm": 18.000640869140625,
"learning_rate": 9.043798060849215e-06,
"loss": 0.0778,
"step": 24500
},
{
"epoch": 4.11,
"grad_norm": 1.214728593826294,
"learning_rate": 8.876629889669008e-06,
"loss": 0.07,
"step": 24600
},
{
"epoch": 4.13,
"grad_norm": 1.982407808303833,
"learning_rate": 8.7094617184888e-06,
"loss": 0.0752,
"step": 24700
},
{
"epoch": 4.15,
"grad_norm": 20.929153442382812,
"learning_rate": 8.542293547308593e-06,
"loss": 0.0785,
"step": 24800
},
{
"epoch": 4.16,
"grad_norm": 0.8963820934295654,
"learning_rate": 8.375125376128386e-06,
"loss": 0.0524,
"step": 24900
},
{
"epoch": 4.18,
"grad_norm": 3.5774483680725098,
"learning_rate": 8.207957204948179e-06,
"loss": 0.0692,
"step": 25000
},
{
"epoch": 4.2,
"grad_norm": 3.7253074645996094,
"learning_rate": 8.04078903376797e-06,
"loss": 0.0641,
"step": 25100
},
{
"epoch": 4.21,
"grad_norm": 1.2855291366577148,
"learning_rate": 7.873620862587764e-06,
"loss": 0.0699,
"step": 25200
},
{
"epoch": 4.23,
"grad_norm": 1.9972455501556396,
"learning_rate": 7.706452691407557e-06,
"loss": 0.062,
"step": 25300
},
{
"epoch": 4.25,
"grad_norm": 1.0809322595596313,
"learning_rate": 7.539284520227349e-06,
"loss": 0.058,
"step": 25400
},
{
"epoch": 4.26,
"grad_norm": 3.876232862472534,
"learning_rate": 7.3721163490471425e-06,
"loss": 0.0693,
"step": 25500
},
{
"epoch": 4.28,
"grad_norm": 6.069151878356934,
"learning_rate": 7.204948177866934e-06,
"loss": 0.0617,
"step": 25600
},
{
"epoch": 4.3,
"grad_norm": 0.895815372467041,
"learning_rate": 7.037780006686727e-06,
"loss": 0.0623,
"step": 25700
},
{
"epoch": 4.31,
"grad_norm": 0.4176822602748871,
"learning_rate": 6.8706118355065195e-06,
"loss": 0.0833,
"step": 25800
},
{
"epoch": 4.33,
"grad_norm": 0.6760619878768921,
"learning_rate": 6.703443664326313e-06,
"loss": 0.0567,
"step": 25900
},
{
"epoch": 4.35,
"grad_norm": 14.889734268188477,
"learning_rate": 6.536275493146106e-06,
"loss": 0.053,
"step": 26000
},
{
"epoch": 4.36,
"grad_norm": 0.5385121703147888,
"learning_rate": 6.369107321965897e-06,
"loss": 0.0703,
"step": 26100
},
{
"epoch": 4.38,
"grad_norm": 6.336006164550781,
"learning_rate": 6.201939150785691e-06,
"loss": 0.063,
"step": 26200
},
{
"epoch": 4.4,
"grad_norm": 0.20758749544620514,
"learning_rate": 6.034770979605484e-06,
"loss": 0.0753,
"step": 26300
},
{
"epoch": 4.41,
"grad_norm": 11.717066764831543,
"learning_rate": 5.867602808425276e-06,
"loss": 0.0598,
"step": 26400
},
{
"epoch": 4.43,
"grad_norm": 26.475128173828125,
"learning_rate": 5.7004346372450685e-06,
"loss": 0.064,
"step": 26500
},
{
"epoch": 4.45,
"grad_norm": 20.872194290161133,
"learning_rate": 5.533266466064862e-06,
"loss": 0.0708,
"step": 26600
},
{
"epoch": 4.46,
"grad_norm": 1.2749828100204468,
"learning_rate": 5.366098294884654e-06,
"loss": 0.0705,
"step": 26700
},
{
"epoch": 4.48,
"grad_norm": 6.7912702560424805,
"learning_rate": 5.198930123704447e-06,
"loss": 0.0742,
"step": 26800
},
{
"epoch": 4.5,
"grad_norm": 10.904654502868652,
"learning_rate": 5.03176195252424e-06,
"loss": 0.0665,
"step": 26900
},
{
"epoch": 4.51,
"grad_norm": 6.191511154174805,
"learning_rate": 4.864593781344033e-06,
"loss": 0.0549,
"step": 27000
},
{
"epoch": 4.53,
"grad_norm": 2.479524850845337,
"learning_rate": 4.697425610163825e-06,
"loss": 0.0539,
"step": 27100
},
{
"epoch": 4.55,
"grad_norm": 0.7285805940628052,
"learning_rate": 4.5302574389836175e-06,
"loss": 0.0662,
"step": 27200
},
{
"epoch": 4.56,
"grad_norm": 4.313304901123047,
"learning_rate": 4.363089267803411e-06,
"loss": 0.0571,
"step": 27300
},
{
"epoch": 4.58,
"grad_norm": 17.61699867248535,
"learning_rate": 4.195921096623203e-06,
"loss": 0.0634,
"step": 27400
},
{
"epoch": 4.6,
"grad_norm": 1.3776081800460815,
"learning_rate": 4.028752925442996e-06,
"loss": 0.0526,
"step": 27500
},
{
"epoch": 4.61,
"grad_norm": 0.36369597911834717,
"learning_rate": 3.8615847542627886e-06,
"loss": 0.0669,
"step": 27600
},
{
"epoch": 4.63,
"grad_norm": 4.591643333435059,
"learning_rate": 3.6944165830825813e-06,
"loss": 0.0578,
"step": 27700
},
{
"epoch": 4.65,
"grad_norm": 0.930225670337677,
"learning_rate": 3.5272484119023737e-06,
"loss": 0.0456,
"step": 27800
},
{
"epoch": 4.66,
"grad_norm": 1.136043906211853,
"learning_rate": 3.360080240722167e-06,
"loss": 0.0617,
"step": 27900
},
{
"epoch": 4.68,
"grad_norm": 0.6426201462745667,
"learning_rate": 3.1929120695419596e-06,
"loss": 0.0568,
"step": 28000
},
{
"epoch": 4.7,
"grad_norm": 2.6884241104125977,
"learning_rate": 3.025743898361752e-06,
"loss": 0.0606,
"step": 28100
},
{
"epoch": 4.71,
"grad_norm": 0.4525424838066101,
"learning_rate": 2.8585757271815448e-06,
"loss": 0.066,
"step": 28200
},
{
"epoch": 4.73,
"grad_norm": 1.0276681184768677,
"learning_rate": 2.6914075560013375e-06,
"loss": 0.0444,
"step": 28300
},
{
"epoch": 4.75,
"grad_norm": 7.886939525604248,
"learning_rate": 2.5242393848211303e-06,
"loss": 0.065,
"step": 28400
},
{
"epoch": 4.76,
"grad_norm": 0.37203583121299744,
"learning_rate": 2.357071213640923e-06,
"loss": 0.0559,
"step": 28500
},
{
"epoch": 4.78,
"grad_norm": 6.219501495361328,
"learning_rate": 2.1899030424607154e-06,
"loss": 0.07,
"step": 28600
},
{
"epoch": 4.8,
"grad_norm": 8.10631275177002,
"learning_rate": 2.022734871280508e-06,
"loss": 0.0623,
"step": 28700
},
{
"epoch": 4.81,
"grad_norm": 24.999059677124023,
"learning_rate": 1.855566700100301e-06,
"loss": 0.0701,
"step": 28800
},
{
"epoch": 4.83,
"grad_norm": 3.5445597171783447,
"learning_rate": 1.6883985289200935e-06,
"loss": 0.0561,
"step": 28900
},
{
"epoch": 4.85,
"grad_norm": 11.693018913269043,
"learning_rate": 1.5212303577398863e-06,
"loss": 0.062,
"step": 29000
},
{
"epoch": 4.86,
"grad_norm": 17.059640884399414,
"learning_rate": 1.354062186559679e-06,
"loss": 0.0663,
"step": 29100
},
{
"epoch": 4.88,
"grad_norm": 3.2128794193267822,
"learning_rate": 1.1868940153794718e-06,
"loss": 0.0541,
"step": 29200
},
{
"epoch": 4.9,
"grad_norm": 1.6803439855575562,
"learning_rate": 1.0197258441992646e-06,
"loss": 0.0619,
"step": 29300
},
{
"epoch": 4.91,
"grad_norm": 7.980160236358643,
"learning_rate": 8.525576730190572e-07,
"loss": 0.0649,
"step": 29400
},
{
"epoch": 4.93,
"grad_norm": 0.3919593393802643,
"learning_rate": 6.853895018388499e-07,
"loss": 0.0753,
"step": 29500
},
{
"epoch": 4.95,
"grad_norm": 2.870180368423462,
"learning_rate": 5.182213306586426e-07,
"loss": 0.0461,
"step": 29600
},
{
"epoch": 4.96,
"grad_norm": 0.5204899907112122,
"learning_rate": 3.510531594784353e-07,
"loss": 0.0446,
"step": 29700
},
{
"epoch": 4.98,
"grad_norm": 2.318403482437134,
"learning_rate": 1.8388498829822804e-07,
"loss": 0.0588,
"step": 29800
},
{
"epoch": 5.0,
"grad_norm": 1.1591626405715942,
"learning_rate": 1.6716817118020728e-08,
"loss": 0.0589,
"step": 29900
},
{
"epoch": 5.0,
"eval_accuracy": 0.9182948490230906,
"eval_f1": 0.9165254517429693,
"eval_loss": 0.3342040479183197,
"eval_precision": 0.9170562701684628,
"eval_recall": 0.9182948490230906,
"eval_runtime": 63.9141,
"eval_samples_per_second": 132.131,
"eval_steps_per_second": 8.261,
"step": 29910
}
],
"logging_steps": 100,
"max_steps": 29910,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 1.15579279766016e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}