{ "best_metric": 0.9182948490230906, "best_model_checkpoint": "./results/checkpoint-29910", "epoch": 5.0, "eval_steps": 500, "global_step": 29910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 4.954967021942139, "learning_rate": 4.9832831828819794e-05, "loss": 4.7403, "step": 100 }, { "epoch": 0.03, "grad_norm": 9.918214797973633, "learning_rate": 4.9665663657639585e-05, "loss": 4.7281, "step": 200 }, { "epoch": 0.05, "grad_norm": 6.391179084777832, "learning_rate": 4.949849548645938e-05, "loss": 4.6786, "step": 300 }, { "epoch": 0.07, "grad_norm": 6.756315231323242, "learning_rate": 4.9331327315279175e-05, "loss": 4.6128, "step": 400 }, { "epoch": 0.08, "grad_norm": 8.407713890075684, "learning_rate": 4.916415914409897e-05, "loss": 4.4836, "step": 500 }, { "epoch": 0.1, "grad_norm": 8.354033470153809, "learning_rate": 4.899699097291876e-05, "loss": 4.3776, "step": 600 }, { "epoch": 0.12, "grad_norm": 7.996518611907959, "learning_rate": 4.882982280173855e-05, "loss": 4.2701, "step": 700 }, { "epoch": 0.13, "grad_norm": 14.100532531738281, "learning_rate": 4.866265463055835e-05, "loss": 4.1032, "step": 800 }, { "epoch": 0.15, "grad_norm": 10.907315254211426, "learning_rate": 4.849548645937814e-05, "loss": 3.952, "step": 900 }, { "epoch": 0.17, "grad_norm": 9.731605529785156, "learning_rate": 4.8328318288197924e-05, "loss": 3.732, "step": 1000 }, { "epoch": 0.18, "grad_norm": 9.989665985107422, "learning_rate": 4.816115011701772e-05, "loss": 3.5489, "step": 1100 }, { "epoch": 0.2, "grad_norm": 9.542133331298828, "learning_rate": 4.7993981945837514e-05, "loss": 3.3949, "step": 1200 }, { "epoch": 0.22, "grad_norm": 11.988595008850098, "learning_rate": 4.7826813774657305e-05, "loss": 3.216, "step": 1300 }, { "epoch": 0.23, "grad_norm": 13.553967475891113, "learning_rate": 4.76596456034771e-05, "loss": 2.9855, "step": 1400 }, { "epoch": 0.25, "grad_norm": 42.20621109008789, "learning_rate": 4.7492477432296895e-05, "loss": 2.7659, "step": 1500 }, { "epoch": 0.27, "grad_norm": 18.790130615234375, "learning_rate": 4.732530926111669e-05, "loss": 2.5604, "step": 1600 }, { "epoch": 0.28, "grad_norm": 20.554113388061523, "learning_rate": 4.715814108993648e-05, "loss": 2.4376, "step": 1700 }, { "epoch": 0.3, "grad_norm": 18.882707595825195, "learning_rate": 4.699097291875627e-05, "loss": 2.3501, "step": 1800 }, { "epoch": 0.32, "grad_norm": 14.733109474182129, "learning_rate": 4.682380474757606e-05, "loss": 2.168, "step": 1900 }, { "epoch": 0.33, "grad_norm": 17.430740356445312, "learning_rate": 4.665663657639586e-05, "loss": 2.0081, "step": 2000 }, { "epoch": 0.35, "grad_norm": 21.797836303710938, "learning_rate": 4.6489468405215644e-05, "loss": 1.9554, "step": 2100 }, { "epoch": 0.37, "grad_norm": 13.148958206176758, "learning_rate": 4.632230023403544e-05, "loss": 1.8524, "step": 2200 }, { "epoch": 0.38, "grad_norm": 14.161394119262695, "learning_rate": 4.6155132062855234e-05, "loss": 1.793, "step": 2300 }, { "epoch": 0.4, "grad_norm": 20.908519744873047, "learning_rate": 4.5987963891675026e-05, "loss": 1.6493, "step": 2400 }, { "epoch": 0.42, "grad_norm": 15.107952117919922, "learning_rate": 4.582079572049482e-05, "loss": 1.5724, "step": 2500 }, { "epoch": 0.43, "grad_norm": 18.561201095581055, "learning_rate": 4.5653627549314615e-05, "loss": 1.4915, "step": 2600 }, { "epoch": 0.45, "grad_norm": 15.365275382995605, "learning_rate": 4.548645937813441e-05, "loss": 1.488, "step": 2700 }, { "epoch": 0.47, "grad_norm": 16.04875946044922, "learning_rate": 4.53192912069542e-05, "loss": 1.4316, "step": 2800 }, { "epoch": 0.48, "grad_norm": 13.593673706054688, "learning_rate": 4.515212303577399e-05, "loss": 1.456, "step": 2900 }, { "epoch": 0.5, "grad_norm": 16.379798889160156, "learning_rate": 4.498495486459378e-05, "loss": 1.3006, "step": 3000 }, { "epoch": 0.52, "grad_norm": 13.564205169677734, "learning_rate": 4.481778669341358e-05, "loss": 1.2661, "step": 3100 }, { "epoch": 0.53, "grad_norm": 15.44586181640625, "learning_rate": 4.4650618522233364e-05, "loss": 1.2917, "step": 3200 }, { "epoch": 0.55, "grad_norm": 12.80644416809082, "learning_rate": 4.448345035105316e-05, "loss": 1.1765, "step": 3300 }, { "epoch": 0.57, "grad_norm": 19.545106887817383, "learning_rate": 4.4316282179872954e-05, "loss": 1.1622, "step": 3400 }, { "epoch": 0.59, "grad_norm": 14.377379417419434, "learning_rate": 4.414911400869275e-05, "loss": 1.1047, "step": 3500 }, { "epoch": 0.6, "grad_norm": 21.595245361328125, "learning_rate": 4.398194583751254e-05, "loss": 1.1384, "step": 3600 }, { "epoch": 0.62, "grad_norm": 14.641448020935059, "learning_rate": 4.3814777666332335e-05, "loss": 1.0872, "step": 3700 }, { "epoch": 0.64, "grad_norm": 13.082781791687012, "learning_rate": 4.364760949515213e-05, "loss": 1.0366, "step": 3800 }, { "epoch": 0.65, "grad_norm": 18.576641082763672, "learning_rate": 4.348044132397192e-05, "loss": 1.0953, "step": 3900 }, { "epoch": 0.67, "grad_norm": 9.915220260620117, "learning_rate": 4.331327315279171e-05, "loss": 1.001, "step": 4000 }, { "epoch": 0.69, "grad_norm": 12.059024810791016, "learning_rate": 4.31461049816115e-05, "loss": 1.0585, "step": 4100 }, { "epoch": 0.7, "grad_norm": 17.607337951660156, "learning_rate": 4.29789368104313e-05, "loss": 1.0179, "step": 4200 }, { "epoch": 0.72, "grad_norm": 16.324430465698242, "learning_rate": 4.2811768639251084e-05, "loss": 0.9491, "step": 4300 }, { "epoch": 0.74, "grad_norm": 19.5161075592041, "learning_rate": 4.264460046807088e-05, "loss": 0.9374, "step": 4400 }, { "epoch": 0.75, "grad_norm": 20.448488235473633, "learning_rate": 4.2477432296890674e-05, "loss": 0.9146, "step": 4500 }, { "epoch": 0.77, "grad_norm": 10.544804573059082, "learning_rate": 4.231026412571047e-05, "loss": 0.9187, "step": 4600 }, { "epoch": 0.79, "grad_norm": 17.095731735229492, "learning_rate": 4.214309595453026e-05, "loss": 0.8732, "step": 4700 }, { "epoch": 0.8, "grad_norm": 18.1314754486084, "learning_rate": 4.197592778335005e-05, "loss": 0.9072, "step": 4800 }, { "epoch": 0.82, "grad_norm": 8.516233444213867, "learning_rate": 4.180875961216985e-05, "loss": 0.8264, "step": 4900 }, { "epoch": 0.84, "grad_norm": 12.620676040649414, "learning_rate": 4.164159144098964e-05, "loss": 0.8425, "step": 5000 }, { "epoch": 0.85, "grad_norm": 23.544219970703125, "learning_rate": 4.147442326980943e-05, "loss": 0.8371, "step": 5100 }, { "epoch": 0.87, "grad_norm": 15.980536460876465, "learning_rate": 4.130725509862922e-05, "loss": 0.8257, "step": 5200 }, { "epoch": 0.89, "grad_norm": 16.621524810791016, "learning_rate": 4.114008692744902e-05, "loss": 0.7705, "step": 5300 }, { "epoch": 0.9, "grad_norm": 25.2496280670166, "learning_rate": 4.0972918756268804e-05, "loss": 0.7741, "step": 5400 }, { "epoch": 0.92, "grad_norm": 12.541385650634766, "learning_rate": 4.08057505850886e-05, "loss": 0.7408, "step": 5500 }, { "epoch": 0.94, "grad_norm": 30.975236892700195, "learning_rate": 4.0638582413908394e-05, "loss": 0.7417, "step": 5600 }, { "epoch": 0.95, "grad_norm": 16.33625030517578, "learning_rate": 4.0471414242728186e-05, "loss": 0.766, "step": 5700 }, { "epoch": 0.97, "grad_norm": 17.48399543762207, "learning_rate": 4.030424607154798e-05, "loss": 0.8336, "step": 5800 }, { "epoch": 0.99, "grad_norm": 19.421096801757812, "learning_rate": 4.013707790036777e-05, "loss": 0.7135, "step": 5900 }, { "epoch": 1.0, "eval_accuracy": 0.7995263469508584, "eval_f1": 0.7955612032049123, "eval_loss": 0.7165877223014832, "eval_precision": 0.805591523931921, "eval_recall": 0.7995263469508584, "eval_runtime": 64.1068, "eval_samples_per_second": 131.733, "eval_steps_per_second": 8.236, "step": 5982 }, { "epoch": 1.0, "grad_norm": 10.486939430236816, "learning_rate": 3.996990972918757e-05, "loss": 0.685, "step": 6000 }, { "epoch": 1.02, "grad_norm": 19.489837646484375, "learning_rate": 3.980274155800736e-05, "loss": 0.6431, "step": 6100 }, { "epoch": 1.04, "grad_norm": 8.935369491577148, "learning_rate": 3.963557338682715e-05, "loss": 0.6402, "step": 6200 }, { "epoch": 1.05, "grad_norm": 10.298083305358887, "learning_rate": 3.946840521564694e-05, "loss": 0.6261, "step": 6300 }, { "epoch": 1.07, "grad_norm": 18.606569290161133, "learning_rate": 3.930123704446674e-05, "loss": 0.5874, "step": 6400 }, { "epoch": 1.09, "grad_norm": 12.412484169006348, "learning_rate": 3.913406887328653e-05, "loss": 0.5923, "step": 6500 }, { "epoch": 1.1, "grad_norm": 9.3939847946167, "learning_rate": 3.8966900702106316e-05, "loss": 0.6091, "step": 6600 }, { "epoch": 1.12, "grad_norm": 14.168825149536133, "learning_rate": 3.8799732530926114e-05, "loss": 0.6259, "step": 6700 }, { "epoch": 1.14, "grad_norm": 18.846487045288086, "learning_rate": 3.8632564359745906e-05, "loss": 0.5543, "step": 6800 }, { "epoch": 1.15, "grad_norm": 7.268430709838867, "learning_rate": 3.84653961885657e-05, "loss": 0.5615, "step": 6900 }, { "epoch": 1.17, "grad_norm": 6.565930366516113, "learning_rate": 3.829822801738549e-05, "loss": 0.5725, "step": 7000 }, { "epoch": 1.19, "grad_norm": 11.122172355651855, "learning_rate": 3.813105984620529e-05, "loss": 0.543, "step": 7100 }, { "epoch": 1.2, "grad_norm": 15.909794807434082, "learning_rate": 3.796389167502508e-05, "loss": 0.5053, "step": 7200 }, { "epoch": 1.22, "grad_norm": 17.935998916625977, "learning_rate": 3.779672350384487e-05, "loss": 0.5866, "step": 7300 }, { "epoch": 1.24, "grad_norm": 7.46903657913208, "learning_rate": 3.762955533266466e-05, "loss": 0.5573, "step": 7400 }, { "epoch": 1.25, "grad_norm": 10.208723068237305, "learning_rate": 3.746238716148446e-05, "loss": 0.511, "step": 7500 }, { "epoch": 1.27, "grad_norm": 15.062224388122559, "learning_rate": 3.729521899030425e-05, "loss": 0.5211, "step": 7600 }, { "epoch": 1.29, "grad_norm": 11.787239074707031, "learning_rate": 3.7128050819124036e-05, "loss": 0.5687, "step": 7700 }, { "epoch": 1.3, "grad_norm": 20.22210693359375, "learning_rate": 3.6960882647943834e-05, "loss": 0.544, "step": 7800 }, { "epoch": 1.32, "grad_norm": 22.17251205444336, "learning_rate": 3.6793714476763626e-05, "loss": 0.5223, "step": 7900 }, { "epoch": 1.34, "grad_norm": 16.83318519592285, "learning_rate": 3.662654630558342e-05, "loss": 0.5043, "step": 8000 }, { "epoch": 1.35, "grad_norm": 10.143548965454102, "learning_rate": 3.645937813440321e-05, "loss": 0.5181, "step": 8100 }, { "epoch": 1.37, "grad_norm": 20.629831314086914, "learning_rate": 3.629220996322301e-05, "loss": 0.4886, "step": 8200 }, { "epoch": 1.39, "grad_norm": 12.14686107635498, "learning_rate": 3.61250417920428e-05, "loss": 0.5667, "step": 8300 }, { "epoch": 1.4, "grad_norm": 17.1881160736084, "learning_rate": 3.595787362086259e-05, "loss": 0.5211, "step": 8400 }, { "epoch": 1.42, "grad_norm": 7.506267070770264, "learning_rate": 3.579070544968238e-05, "loss": 0.5356, "step": 8500 }, { "epoch": 1.44, "grad_norm": 23.122560501098633, "learning_rate": 3.562353727850217e-05, "loss": 0.5044, "step": 8600 }, { "epoch": 1.45, "grad_norm": 21.808191299438477, "learning_rate": 3.545636910732197e-05, "loss": 0.5059, "step": 8700 }, { "epoch": 1.47, "grad_norm": 12.899435997009277, "learning_rate": 3.5289200936141756e-05, "loss": 0.5082, "step": 8800 }, { "epoch": 1.49, "grad_norm": 11.228046417236328, "learning_rate": 3.5122032764961554e-05, "loss": 0.4466, "step": 8900 }, { "epoch": 1.5, "grad_norm": 15.656624794006348, "learning_rate": 3.4954864593781346e-05, "loss": 0.4877, "step": 9000 }, { "epoch": 1.52, "grad_norm": 14.958187103271484, "learning_rate": 3.478769642260114e-05, "loss": 0.4283, "step": 9100 }, { "epoch": 1.54, "grad_norm": 27.727924346923828, "learning_rate": 3.462052825142093e-05, "loss": 0.504, "step": 9200 }, { "epoch": 1.55, "grad_norm": 21.103147506713867, "learning_rate": 3.445336008024073e-05, "loss": 0.5081, "step": 9300 }, { "epoch": 1.57, "grad_norm": 14.884688377380371, "learning_rate": 3.428619190906052e-05, "loss": 0.47, "step": 9400 }, { "epoch": 1.59, "grad_norm": 26.825908660888672, "learning_rate": 3.411902373788031e-05, "loss": 0.4587, "step": 9500 }, { "epoch": 1.6, "grad_norm": 23.39227867126465, "learning_rate": 3.39518555667001e-05, "loss": 0.4621, "step": 9600 }, { "epoch": 1.62, "grad_norm": 15.503640174865723, "learning_rate": 3.378468739551989e-05, "loss": 0.5122, "step": 9700 }, { "epoch": 1.64, "grad_norm": 13.298539161682129, "learning_rate": 3.361751922433969e-05, "loss": 0.4846, "step": 9800 }, { "epoch": 1.65, "grad_norm": 17.961261749267578, "learning_rate": 3.3450351053159476e-05, "loss": 0.4576, "step": 9900 }, { "epoch": 1.67, "grad_norm": 15.622933387756348, "learning_rate": 3.3283182881979274e-05, "loss": 0.4239, "step": 10000 }, { "epoch": 1.69, "grad_norm": 15.286486625671387, "learning_rate": 3.3116014710799066e-05, "loss": 0.4478, "step": 10100 }, { "epoch": 1.71, "grad_norm": 28.045799255371094, "learning_rate": 3.294884653961886e-05, "loss": 0.4457, "step": 10200 }, { "epoch": 1.72, "grad_norm": 23.578136444091797, "learning_rate": 3.278167836843865e-05, "loss": 0.464, "step": 10300 }, { "epoch": 1.74, "grad_norm": 12.858305931091309, "learning_rate": 3.261451019725844e-05, "loss": 0.4507, "step": 10400 }, { "epoch": 1.76, "grad_norm": 18.197952270507812, "learning_rate": 3.244734202607824e-05, "loss": 0.4158, "step": 10500 }, { "epoch": 1.77, "grad_norm": 5.134513854980469, "learning_rate": 3.228017385489803e-05, "loss": 0.4088, "step": 10600 }, { "epoch": 1.79, "grad_norm": 2.1014363765716553, "learning_rate": 3.211300568371782e-05, "loss": 0.4524, "step": 10700 }, { "epoch": 1.81, "grad_norm": 14.459040641784668, "learning_rate": 3.194583751253761e-05, "loss": 0.4637, "step": 10800 }, { "epoch": 1.82, "grad_norm": 29.922468185424805, "learning_rate": 3.177866934135741e-05, "loss": 0.4302, "step": 10900 }, { "epoch": 1.84, "grad_norm": 23.523460388183594, "learning_rate": 3.1611501170177196e-05, "loss": 0.4155, "step": 11000 }, { "epoch": 1.86, "grad_norm": 11.668371200561523, "learning_rate": 3.1444332998996994e-05, "loss": 0.4238, "step": 11100 }, { "epoch": 1.87, "grad_norm": 15.930005073547363, "learning_rate": 3.1277164827816786e-05, "loss": 0.4072, "step": 11200 }, { "epoch": 1.89, "grad_norm": 18.61160659790039, "learning_rate": 3.110999665663658e-05, "loss": 0.4348, "step": 11300 }, { "epoch": 1.91, "grad_norm": 27.475053787231445, "learning_rate": 3.094282848545637e-05, "loss": 0.4648, "step": 11400 }, { "epoch": 1.92, "grad_norm": 6.477468013763428, "learning_rate": 3.077566031427616e-05, "loss": 0.4241, "step": 11500 }, { "epoch": 1.94, "grad_norm": 26.99014663696289, "learning_rate": 3.060849214309596e-05, "loss": 0.4243, "step": 11600 }, { "epoch": 1.96, "grad_norm": 16.152755737304688, "learning_rate": 3.0441323971915747e-05, "loss": 0.4186, "step": 11700 }, { "epoch": 1.97, "grad_norm": 15.536150932312012, "learning_rate": 3.0274155800735542e-05, "loss": 0.3808, "step": 11800 }, { "epoch": 1.99, "grad_norm": 23.708145141601562, "learning_rate": 3.0106987629555333e-05, "loss": 0.4365, "step": 11900 }, { "epoch": 2.0, "eval_accuracy": 0.8680876258140912, "eval_f1": 0.8628914936078326, "eval_loss": 0.4633374810218811, "eval_precision": 0.8684864554322808, "eval_recall": 0.8680876258140912, "eval_runtime": 64.0052, "eval_samples_per_second": 131.942, "eval_steps_per_second": 8.249, "step": 11964 }, { "epoch": 2.01, "grad_norm": 10.474257469177246, "learning_rate": 2.9939819458375128e-05, "loss": 0.3853, "step": 12000 }, { "epoch": 2.02, "grad_norm": 15.668170928955078, "learning_rate": 2.977265128719492e-05, "loss": 0.2858, "step": 12100 }, { "epoch": 2.04, "grad_norm": 10.29902172088623, "learning_rate": 2.960548311601471e-05, "loss": 0.2803, "step": 12200 }, { "epoch": 2.06, "grad_norm": 33.27579116821289, "learning_rate": 2.9438314944834506e-05, "loss": 0.2858, "step": 12300 }, { "epoch": 2.07, "grad_norm": 13.799466133117676, "learning_rate": 2.9271146773654294e-05, "loss": 0.2793, "step": 12400 }, { "epoch": 2.09, "grad_norm": 4.722692489624023, "learning_rate": 2.9103978602474092e-05, "loss": 0.2935, "step": 12500 }, { "epoch": 2.11, "grad_norm": 8.643231391906738, "learning_rate": 2.893681043129388e-05, "loss": 0.2825, "step": 12600 }, { "epoch": 2.12, "grad_norm": 10.378469467163086, "learning_rate": 2.876964226011368e-05, "loss": 0.2845, "step": 12700 }, { "epoch": 2.14, "grad_norm": 9.1376953125, "learning_rate": 2.8602474088933467e-05, "loss": 0.2725, "step": 12800 }, { "epoch": 2.16, "grad_norm": 10.372312545776367, "learning_rate": 2.8435305917753262e-05, "loss": 0.3067, "step": 12900 }, { "epoch": 2.17, "grad_norm": 23.952699661254883, "learning_rate": 2.8268137746573053e-05, "loss": 0.2934, "step": 13000 }, { "epoch": 2.19, "grad_norm": 2.125562906265259, "learning_rate": 2.8100969575392848e-05, "loss": 0.2535, "step": 13100 }, { "epoch": 2.21, "grad_norm": 8.090828895568848, "learning_rate": 2.793380140421264e-05, "loss": 0.295, "step": 13200 }, { "epoch": 2.22, "grad_norm": 13.274210929870605, "learning_rate": 2.776663323303243e-05, "loss": 0.2851, "step": 13300 }, { "epoch": 2.24, "grad_norm": 5.6807732582092285, "learning_rate": 2.7599465061852226e-05, "loss": 0.2662, "step": 13400 }, { "epoch": 2.26, "grad_norm": 11.885269165039062, "learning_rate": 2.7432296890672014e-05, "loss": 0.2969, "step": 13500 }, { "epoch": 2.27, "grad_norm": 21.52318000793457, "learning_rate": 2.7265128719491812e-05, "loss": 0.2706, "step": 13600 }, { "epoch": 2.29, "grad_norm": 21.661279678344727, "learning_rate": 2.70979605483116e-05, "loss": 0.2715, "step": 13700 }, { "epoch": 2.31, "grad_norm": 27.985078811645508, "learning_rate": 2.69307923771314e-05, "loss": 0.3016, "step": 13800 }, { "epoch": 2.32, "grad_norm": 11.431729316711426, "learning_rate": 2.6763624205951187e-05, "loss": 0.2501, "step": 13900 }, { "epoch": 2.34, "grad_norm": 5.3406901359558105, "learning_rate": 2.6596456034770982e-05, "loss": 0.2762, "step": 14000 }, { "epoch": 2.36, "grad_norm": 11.077746391296387, "learning_rate": 2.6429287863590773e-05, "loss": 0.2819, "step": 14100 }, { "epoch": 2.37, "grad_norm": 17.451330184936523, "learning_rate": 2.6262119692410565e-05, "loss": 0.3074, "step": 14200 }, { "epoch": 2.39, "grad_norm": 7.353370189666748, "learning_rate": 2.609495152123036e-05, "loss": 0.3068, "step": 14300 }, { "epoch": 2.41, "grad_norm": 12.055102348327637, "learning_rate": 2.592778335005015e-05, "loss": 0.2779, "step": 14400 }, { "epoch": 2.42, "grad_norm": 17.555917739868164, "learning_rate": 2.5760615178869946e-05, "loss": 0.2421, "step": 14500 }, { "epoch": 2.44, "grad_norm": 22.887771606445312, "learning_rate": 2.5593447007689734e-05, "loss": 0.3016, "step": 14600 }, { "epoch": 2.46, "grad_norm": 1.915899395942688, "learning_rate": 2.5426278836509533e-05, "loss": 0.2638, "step": 14700 }, { "epoch": 2.47, "grad_norm": 13.446496963500977, "learning_rate": 2.525911066532932e-05, "loss": 0.293, "step": 14800 }, { "epoch": 2.49, "grad_norm": 12.734638214111328, "learning_rate": 2.509194249414912e-05, "loss": 0.2668, "step": 14900 }, { "epoch": 2.51, "grad_norm": 15.557112693786621, "learning_rate": 2.4924774322968907e-05, "loss": 0.2691, "step": 15000 }, { "epoch": 2.52, "grad_norm": 10.383445739746094, "learning_rate": 2.4757606151788702e-05, "loss": 0.2204, "step": 15100 }, { "epoch": 2.54, "grad_norm": 7.19666862487793, "learning_rate": 2.4590437980608493e-05, "loss": 0.2447, "step": 15200 }, { "epoch": 2.56, "grad_norm": 17.903339385986328, "learning_rate": 2.442326980942829e-05, "loss": 0.2504, "step": 15300 }, { "epoch": 2.57, "grad_norm": 10.492616653442383, "learning_rate": 2.425610163824808e-05, "loss": 0.2256, "step": 15400 }, { "epoch": 2.59, "grad_norm": 11.051074028015137, "learning_rate": 2.408893346706787e-05, "loss": 0.259, "step": 15500 }, { "epoch": 2.61, "grad_norm": 23.400402069091797, "learning_rate": 2.3921765295887663e-05, "loss": 0.2487, "step": 15600 }, { "epoch": 2.62, "grad_norm": 20.601686477661133, "learning_rate": 2.3754597124707458e-05, "loss": 0.2338, "step": 15700 }, { "epoch": 2.64, "grad_norm": 12.519159317016602, "learning_rate": 2.358742895352725e-05, "loss": 0.2652, "step": 15800 }, { "epoch": 2.66, "grad_norm": 21.95683479309082, "learning_rate": 2.342026078234704e-05, "loss": 0.2306, "step": 15900 }, { "epoch": 2.67, "grad_norm": 24.98236656188965, "learning_rate": 2.3253092611166836e-05, "loss": 0.2475, "step": 16000 }, { "epoch": 2.69, "grad_norm": 6.362200736999512, "learning_rate": 2.3085924439986627e-05, "loss": 0.2646, "step": 16100 }, { "epoch": 2.71, "grad_norm": 14.293391227722168, "learning_rate": 2.2918756268806422e-05, "loss": 0.2404, "step": 16200 }, { "epoch": 2.72, "grad_norm": 11.405878067016602, "learning_rate": 2.2751588097626213e-05, "loss": 0.2651, "step": 16300 }, { "epoch": 2.74, "grad_norm": 15.082180976867676, "learning_rate": 2.258441992644601e-05, "loss": 0.281, "step": 16400 }, { "epoch": 2.76, "grad_norm": 27.33397674560547, "learning_rate": 2.2417251755265796e-05, "loss": 0.2492, "step": 16500 }, { "epoch": 2.77, "grad_norm": 10.052102088928223, "learning_rate": 2.225008358408559e-05, "loss": 0.2382, "step": 16600 }, { "epoch": 2.79, "grad_norm": 15.405964851379395, "learning_rate": 2.2082915412905383e-05, "loss": 0.2496, "step": 16700 }, { "epoch": 2.81, "grad_norm": 7.162382125854492, "learning_rate": 2.1915747241725178e-05, "loss": 0.2343, "step": 16800 }, { "epoch": 2.83, "grad_norm": 11.130888938903809, "learning_rate": 2.174857907054497e-05, "loss": 0.2474, "step": 16900 }, { "epoch": 2.84, "grad_norm": 8.277360916137695, "learning_rate": 2.158141089936476e-05, "loss": 0.2687, "step": 17000 }, { "epoch": 2.86, "grad_norm": 31.100744247436523, "learning_rate": 2.1414242728184556e-05, "loss": 0.2422, "step": 17100 }, { "epoch": 2.88, "grad_norm": 12.757442474365234, "learning_rate": 2.1247074557004347e-05, "loss": 0.2275, "step": 17200 }, { "epoch": 2.89, "grad_norm": 4.860738277435303, "learning_rate": 2.1079906385824142e-05, "loss": 0.2252, "step": 17300 }, { "epoch": 2.91, "grad_norm": 10.574835777282715, "learning_rate": 2.091273821464393e-05, "loss": 0.2114, "step": 17400 }, { "epoch": 2.93, "grad_norm": 13.01117992401123, "learning_rate": 2.0745570043463725e-05, "loss": 0.2407, "step": 17500 }, { "epoch": 2.94, "grad_norm": 4.970390319824219, "learning_rate": 2.0578401872283517e-05, "loss": 0.2509, "step": 17600 }, { "epoch": 2.96, "grad_norm": 18.95350456237793, "learning_rate": 2.041123370110331e-05, "loss": 0.2814, "step": 17700 }, { "epoch": 2.98, "grad_norm": 1.5296308994293213, "learning_rate": 2.0244065529923103e-05, "loss": 0.235, "step": 17800 }, { "epoch": 2.99, "grad_norm": 12.501904487609863, "learning_rate": 2.0076897358742898e-05, "loss": 0.2479, "step": 17900 }, { "epoch": 3.0, "eval_accuracy": 0.8965068087625814, "eval_f1": 0.8930257247589533, "eval_loss": 0.36622655391693115, "eval_precision": 0.8950199629292306, "eval_recall": 0.8965068087625814, "eval_runtime": 64.0862, "eval_samples_per_second": 131.776, "eval_steps_per_second": 8.239, "step": 17946 }, { "epoch": 3.01, "grad_norm": 19.13836097717285, "learning_rate": 1.990972918756269e-05, "loss": 0.2272, "step": 18000 }, { "epoch": 3.03, "grad_norm": 8.622084617614746, "learning_rate": 1.9742561016382484e-05, "loss": 0.131, "step": 18100 }, { "epoch": 3.04, "grad_norm": 32.99411392211914, "learning_rate": 1.9575392845202276e-05, "loss": 0.1477, "step": 18200 }, { "epoch": 3.06, "grad_norm": 5.467390060424805, "learning_rate": 1.9408224674022067e-05, "loss": 0.1439, "step": 18300 }, { "epoch": 3.08, "grad_norm": 2.5153982639312744, "learning_rate": 1.924105650284186e-05, "loss": 0.1405, "step": 18400 }, { "epoch": 3.09, "grad_norm": 20.424579620361328, "learning_rate": 1.907388833166165e-05, "loss": 0.1594, "step": 18500 }, { "epoch": 3.11, "grad_norm": 5.207544803619385, "learning_rate": 1.8906720160481445e-05, "loss": 0.1323, "step": 18600 }, { "epoch": 3.13, "grad_norm": 8.750362396240234, "learning_rate": 1.8739551989301237e-05, "loss": 0.1683, "step": 18700 }, { "epoch": 3.14, "grad_norm": 2.464329481124878, "learning_rate": 1.857238381812103e-05, "loss": 0.1388, "step": 18800 }, { "epoch": 3.16, "grad_norm": 3.784031867980957, "learning_rate": 1.8405215646940823e-05, "loss": 0.149, "step": 18900 }, { "epoch": 3.18, "grad_norm": 2.632542610168457, "learning_rate": 1.8238047475760618e-05, "loss": 0.1284, "step": 19000 }, { "epoch": 3.19, "grad_norm": 11.050533294677734, "learning_rate": 1.807087930458041e-05, "loss": 0.1525, "step": 19100 }, { "epoch": 3.21, "grad_norm": 7.363661766052246, "learning_rate": 1.7903711133400204e-05, "loss": 0.1481, "step": 19200 }, { "epoch": 3.23, "grad_norm": 9.882287979125977, "learning_rate": 1.7736542962219992e-05, "loss": 0.1231, "step": 19300 }, { "epoch": 3.24, "grad_norm": 24.93657684326172, "learning_rate": 1.7569374791039787e-05, "loss": 0.1332, "step": 19400 }, { "epoch": 3.26, "grad_norm": 2.2802133560180664, "learning_rate": 1.740220661985958e-05, "loss": 0.1425, "step": 19500 }, { "epoch": 3.28, "grad_norm": 1.5991661548614502, "learning_rate": 1.7235038448679374e-05, "loss": 0.1283, "step": 19600 }, { "epoch": 3.29, "grad_norm": 8.344457626342773, "learning_rate": 1.7067870277499165e-05, "loss": 0.1502, "step": 19700 }, { "epoch": 3.31, "grad_norm": 12.95904541015625, "learning_rate": 1.6900702106318957e-05, "loss": 0.1287, "step": 19800 }, { "epoch": 3.33, "grad_norm": 20.562625885009766, "learning_rate": 1.673353393513875e-05, "loss": 0.1422, "step": 19900 }, { "epoch": 3.34, "grad_norm": 4.20346736907959, "learning_rate": 1.6566365763958543e-05, "loss": 0.1082, "step": 20000 }, { "epoch": 3.36, "grad_norm": 25.636775970458984, "learning_rate": 1.6399197592778338e-05, "loss": 0.1416, "step": 20100 }, { "epoch": 3.38, "grad_norm": 23.23301887512207, "learning_rate": 1.6232029421598126e-05, "loss": 0.1497, "step": 20200 }, { "epoch": 3.39, "grad_norm": 22.21303939819336, "learning_rate": 1.606486125041792e-05, "loss": 0.1568, "step": 20300 }, { "epoch": 3.41, "grad_norm": 21.14128303527832, "learning_rate": 1.5897693079237712e-05, "loss": 0.139, "step": 20400 }, { "epoch": 3.43, "grad_norm": 22.63404083251953, "learning_rate": 1.5730524908057507e-05, "loss": 0.1518, "step": 20500 }, { "epoch": 3.44, "grad_norm": 13.030010223388672, "learning_rate": 1.55633567368773e-05, "loss": 0.1319, "step": 20600 }, { "epoch": 3.46, "grad_norm": 18.308670043945312, "learning_rate": 1.5396188565697094e-05, "loss": 0.1494, "step": 20700 }, { "epoch": 3.48, "grad_norm": 24.907419204711914, "learning_rate": 1.5229020394516885e-05, "loss": 0.1425, "step": 20800 }, { "epoch": 3.49, "grad_norm": 19.32282066345215, "learning_rate": 1.5061852223336678e-05, "loss": 0.1264, "step": 20900 }, { "epoch": 3.51, "grad_norm": 17.444271087646484, "learning_rate": 1.4894684052156472e-05, "loss": 0.14, "step": 21000 }, { "epoch": 3.53, "grad_norm": 1.832461953163147, "learning_rate": 1.4727515880976261e-05, "loss": 0.1438, "step": 21100 }, { "epoch": 3.54, "grad_norm": 10.410861015319824, "learning_rate": 1.4560347709796055e-05, "loss": 0.1393, "step": 21200 }, { "epoch": 3.56, "grad_norm": 3.6459202766418457, "learning_rate": 1.4393179538615848e-05, "loss": 0.1077, "step": 21300 }, { "epoch": 3.58, "grad_norm": 3.216399669647217, "learning_rate": 1.4226011367435641e-05, "loss": 0.1154, "step": 21400 }, { "epoch": 3.59, "grad_norm": 5.621729373931885, "learning_rate": 1.4058843196255434e-05, "loss": 0.1208, "step": 21500 }, { "epoch": 3.61, "grad_norm": 5.559453010559082, "learning_rate": 1.3891675025075226e-05, "loss": 0.1441, "step": 21600 }, { "epoch": 3.63, "grad_norm": 22.32745933532715, "learning_rate": 1.3724506853895019e-05, "loss": 0.1176, "step": 21700 }, { "epoch": 3.64, "grad_norm": 4.509443759918213, "learning_rate": 1.3557338682714812e-05, "loss": 0.1382, "step": 21800 }, { "epoch": 3.66, "grad_norm": 15.154895782470703, "learning_rate": 1.3390170511534605e-05, "loss": 0.1475, "step": 21900 }, { "epoch": 3.68, "grad_norm": 0.8804099559783936, "learning_rate": 1.3223002340354398e-05, "loss": 0.1325, "step": 22000 }, { "epoch": 3.69, "grad_norm": 1.9917913675308228, "learning_rate": 1.3055834169174188e-05, "loss": 0.1255, "step": 22100 }, { "epoch": 3.71, "grad_norm": 16.314374923706055, "learning_rate": 1.2888665997993981e-05, "loss": 0.1275, "step": 22200 }, { "epoch": 3.73, "grad_norm": 5.355242729187012, "learning_rate": 1.2721497826813775e-05, "loss": 0.1185, "step": 22300 }, { "epoch": 3.74, "grad_norm": 20.218473434448242, "learning_rate": 1.2554329655633568e-05, "loss": 0.1203, "step": 22400 }, { "epoch": 3.76, "grad_norm": 1.39955735206604, "learning_rate": 1.2387161484453361e-05, "loss": 0.1636, "step": 22500 }, { "epoch": 3.78, "grad_norm": 17.855899810791016, "learning_rate": 1.2219993313273154e-05, "loss": 0.1369, "step": 22600 }, { "epoch": 3.79, "grad_norm": 14.41054630279541, "learning_rate": 1.2052825142092947e-05, "loss": 0.1245, "step": 22700 }, { "epoch": 3.81, "grad_norm": 11.451350212097168, "learning_rate": 1.1885656970912739e-05, "loss": 0.1508, "step": 22800 }, { "epoch": 3.83, "grad_norm": 9.41112995147705, "learning_rate": 1.171848879973253e-05, "loss": 0.125, "step": 22900 }, { "epoch": 3.84, "grad_norm": 29.826963424682617, "learning_rate": 1.1551320628552324e-05, "loss": 0.1545, "step": 23000 }, { "epoch": 3.86, "grad_norm": 11.454690933227539, "learning_rate": 1.1384152457372117e-05, "loss": 0.1353, "step": 23100 }, { "epoch": 3.88, "grad_norm": 12.364923477172852, "learning_rate": 1.121698428619191e-05, "loss": 0.1346, "step": 23200 }, { "epoch": 3.9, "grad_norm": 1.8181456327438354, "learning_rate": 1.1049816115011702e-05, "loss": 0.1092, "step": 23300 }, { "epoch": 3.91, "grad_norm": 30.87436866760254, "learning_rate": 1.0882647943831495e-05, "loss": 0.1059, "step": 23400 }, { "epoch": 3.93, "grad_norm": 16.423452377319336, "learning_rate": 1.0715479772651288e-05, "loss": 0.1157, "step": 23500 }, { "epoch": 3.95, "grad_norm": 27.86665153503418, "learning_rate": 1.0548311601471081e-05, "loss": 0.1317, "step": 23600 }, { "epoch": 3.96, "grad_norm": 24.479764938354492, "learning_rate": 1.0381143430290873e-05, "loss": 0.1184, "step": 23700 }, { "epoch": 3.98, "grad_norm": 1.4079170227050781, "learning_rate": 1.0213975259110666e-05, "loss": 0.1303, "step": 23800 }, { "epoch": 4.0, "grad_norm": 4.259897232055664, "learning_rate": 1.0046807087930459e-05, "loss": 0.1322, "step": 23900 }, { "epoch": 4.0, "eval_accuracy": 0.9113084665482534, "eval_f1": 0.9092055511030135, "eval_loss": 0.3260073661804199, "eval_precision": 0.9099757491171729, "eval_recall": 0.9113084665482534, "eval_runtime": 64.1166, "eval_samples_per_second": 131.713, "eval_steps_per_second": 8.235, "step": 23928 }, { "epoch": 4.01, "grad_norm": 13.925552368164062, "learning_rate": 9.879638916750252e-06, "loss": 0.0687, "step": 24000 }, { "epoch": 4.03, "grad_norm": 0.18495211005210876, "learning_rate": 9.712470745570044e-06, "loss": 0.066, "step": 24100 }, { "epoch": 4.05, "grad_norm": 1.0808857679367065, "learning_rate": 9.545302574389837e-06, "loss": 0.0648, "step": 24200 }, { "epoch": 4.06, "grad_norm": 1.0073552131652832, "learning_rate": 9.378134403209628e-06, "loss": 0.071, "step": 24300 }, { "epoch": 4.08, "grad_norm": 15.166232109069824, "learning_rate": 9.210966232029422e-06, "loss": 0.0666, "step": 24400 }, { "epoch": 4.1, "grad_norm": 18.000640869140625, "learning_rate": 9.043798060849215e-06, "loss": 0.0778, "step": 24500 }, { "epoch": 4.11, "grad_norm": 1.214728593826294, "learning_rate": 8.876629889669008e-06, "loss": 0.07, "step": 24600 }, { "epoch": 4.13, "grad_norm": 1.982407808303833, "learning_rate": 8.7094617184888e-06, "loss": 0.0752, "step": 24700 }, { "epoch": 4.15, "grad_norm": 20.929153442382812, "learning_rate": 8.542293547308593e-06, "loss": 0.0785, "step": 24800 }, { "epoch": 4.16, "grad_norm": 0.8963820934295654, "learning_rate": 8.375125376128386e-06, "loss": 0.0524, "step": 24900 }, { "epoch": 4.18, "grad_norm": 3.5774483680725098, "learning_rate": 8.207957204948179e-06, "loss": 0.0692, "step": 25000 }, { "epoch": 4.2, "grad_norm": 3.7253074645996094, "learning_rate": 8.04078903376797e-06, "loss": 0.0641, "step": 25100 }, { "epoch": 4.21, "grad_norm": 1.2855291366577148, "learning_rate": 7.873620862587764e-06, "loss": 0.0699, "step": 25200 }, { "epoch": 4.23, "grad_norm": 1.9972455501556396, "learning_rate": 7.706452691407557e-06, "loss": 0.062, "step": 25300 }, { "epoch": 4.25, "grad_norm": 1.0809322595596313, "learning_rate": 7.539284520227349e-06, "loss": 0.058, "step": 25400 }, { "epoch": 4.26, "grad_norm": 3.876232862472534, "learning_rate": 7.3721163490471425e-06, "loss": 0.0693, "step": 25500 }, { "epoch": 4.28, "grad_norm": 6.069151878356934, "learning_rate": 7.204948177866934e-06, "loss": 0.0617, "step": 25600 }, { "epoch": 4.3, "grad_norm": 0.895815372467041, "learning_rate": 7.037780006686727e-06, "loss": 0.0623, "step": 25700 }, { "epoch": 4.31, "grad_norm": 0.4176822602748871, "learning_rate": 6.8706118355065195e-06, "loss": 0.0833, "step": 25800 }, { "epoch": 4.33, "grad_norm": 0.6760619878768921, "learning_rate": 6.703443664326313e-06, "loss": 0.0567, "step": 25900 }, { "epoch": 4.35, "grad_norm": 14.889734268188477, "learning_rate": 6.536275493146106e-06, "loss": 0.053, "step": 26000 }, { "epoch": 4.36, "grad_norm": 0.5385121703147888, "learning_rate": 6.369107321965897e-06, "loss": 0.0703, "step": 26100 }, { "epoch": 4.38, "grad_norm": 6.336006164550781, "learning_rate": 6.201939150785691e-06, "loss": 0.063, "step": 26200 }, { "epoch": 4.4, "grad_norm": 0.20758749544620514, "learning_rate": 6.034770979605484e-06, "loss": 0.0753, "step": 26300 }, { "epoch": 4.41, "grad_norm": 11.717066764831543, "learning_rate": 5.867602808425276e-06, "loss": 0.0598, "step": 26400 }, { "epoch": 4.43, "grad_norm": 26.475128173828125, "learning_rate": 5.7004346372450685e-06, "loss": 0.064, "step": 26500 }, { "epoch": 4.45, "grad_norm": 20.872194290161133, "learning_rate": 5.533266466064862e-06, "loss": 0.0708, "step": 26600 }, { "epoch": 4.46, "grad_norm": 1.2749828100204468, "learning_rate": 5.366098294884654e-06, "loss": 0.0705, "step": 26700 }, { "epoch": 4.48, "grad_norm": 6.7912702560424805, "learning_rate": 5.198930123704447e-06, "loss": 0.0742, "step": 26800 }, { "epoch": 4.5, "grad_norm": 10.904654502868652, "learning_rate": 5.03176195252424e-06, "loss": 0.0665, "step": 26900 }, { "epoch": 4.51, "grad_norm": 6.191511154174805, "learning_rate": 4.864593781344033e-06, "loss": 0.0549, "step": 27000 }, { "epoch": 4.53, "grad_norm": 2.479524850845337, "learning_rate": 4.697425610163825e-06, "loss": 0.0539, "step": 27100 }, { "epoch": 4.55, "grad_norm": 0.7285805940628052, "learning_rate": 4.5302574389836175e-06, "loss": 0.0662, "step": 27200 }, { "epoch": 4.56, "grad_norm": 4.313304901123047, "learning_rate": 4.363089267803411e-06, "loss": 0.0571, "step": 27300 }, { "epoch": 4.58, "grad_norm": 17.61699867248535, "learning_rate": 4.195921096623203e-06, "loss": 0.0634, "step": 27400 }, { "epoch": 4.6, "grad_norm": 1.3776081800460815, "learning_rate": 4.028752925442996e-06, "loss": 0.0526, "step": 27500 }, { "epoch": 4.61, "grad_norm": 0.36369597911834717, "learning_rate": 3.8615847542627886e-06, "loss": 0.0669, "step": 27600 }, { "epoch": 4.63, "grad_norm": 4.591643333435059, "learning_rate": 3.6944165830825813e-06, "loss": 0.0578, "step": 27700 }, { "epoch": 4.65, "grad_norm": 0.930225670337677, "learning_rate": 3.5272484119023737e-06, "loss": 0.0456, "step": 27800 }, { "epoch": 4.66, "grad_norm": 1.136043906211853, "learning_rate": 3.360080240722167e-06, "loss": 0.0617, "step": 27900 }, { "epoch": 4.68, "grad_norm": 0.6426201462745667, "learning_rate": 3.1929120695419596e-06, "loss": 0.0568, "step": 28000 }, { "epoch": 4.7, "grad_norm": 2.6884241104125977, "learning_rate": 3.025743898361752e-06, "loss": 0.0606, "step": 28100 }, { "epoch": 4.71, "grad_norm": 0.4525424838066101, "learning_rate": 2.8585757271815448e-06, "loss": 0.066, "step": 28200 }, { "epoch": 4.73, "grad_norm": 1.0276681184768677, "learning_rate": 2.6914075560013375e-06, "loss": 0.0444, "step": 28300 }, { "epoch": 4.75, "grad_norm": 7.886939525604248, "learning_rate": 2.5242393848211303e-06, "loss": 0.065, "step": 28400 }, { "epoch": 4.76, "grad_norm": 0.37203583121299744, "learning_rate": 2.357071213640923e-06, "loss": 0.0559, "step": 28500 }, { "epoch": 4.78, "grad_norm": 6.219501495361328, "learning_rate": 2.1899030424607154e-06, "loss": 0.07, "step": 28600 }, { "epoch": 4.8, "grad_norm": 8.10631275177002, "learning_rate": 2.022734871280508e-06, "loss": 0.0623, "step": 28700 }, { "epoch": 4.81, "grad_norm": 24.999059677124023, "learning_rate": 1.855566700100301e-06, "loss": 0.0701, "step": 28800 }, { "epoch": 4.83, "grad_norm": 3.5445597171783447, "learning_rate": 1.6883985289200935e-06, "loss": 0.0561, "step": 28900 }, { "epoch": 4.85, "grad_norm": 11.693018913269043, "learning_rate": 1.5212303577398863e-06, "loss": 0.062, "step": 29000 }, { "epoch": 4.86, "grad_norm": 17.059640884399414, "learning_rate": 1.354062186559679e-06, "loss": 0.0663, "step": 29100 }, { "epoch": 4.88, "grad_norm": 3.2128794193267822, "learning_rate": 1.1868940153794718e-06, "loss": 0.0541, "step": 29200 }, { "epoch": 4.9, "grad_norm": 1.6803439855575562, "learning_rate": 1.0197258441992646e-06, "loss": 0.0619, "step": 29300 }, { "epoch": 4.91, "grad_norm": 7.980160236358643, "learning_rate": 8.525576730190572e-07, "loss": 0.0649, "step": 29400 }, { "epoch": 4.93, "grad_norm": 0.3919593393802643, "learning_rate": 6.853895018388499e-07, "loss": 0.0753, "step": 29500 }, { "epoch": 4.95, "grad_norm": 2.870180368423462, "learning_rate": 5.182213306586426e-07, "loss": 0.0461, "step": 29600 }, { "epoch": 4.96, "grad_norm": 0.5204899907112122, "learning_rate": 3.510531594784353e-07, "loss": 0.0446, "step": 29700 }, { "epoch": 4.98, "grad_norm": 2.318403482437134, "learning_rate": 1.8388498829822804e-07, "loss": 0.0588, "step": 29800 }, { "epoch": 5.0, "grad_norm": 1.1591626405715942, "learning_rate": 1.6716817118020728e-08, "loss": 0.0589, "step": 29900 }, { "epoch": 5.0, "eval_accuracy": 0.9182948490230906, "eval_f1": 0.9165254517429693, "eval_loss": 0.3342040479183197, "eval_precision": 0.9170562701684628, "eval_recall": 0.9182948490230906, "eval_runtime": 63.9141, "eval_samples_per_second": 132.131, "eval_steps_per_second": 8.261, "step": 29910 } ], "logging_steps": 100, "max_steps": 29910, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.15579279766016e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }