{ "best_metric": null, "best_model_checkpoint": null, "epoch": 18.0, "global_step": 94050, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5e-09, "loss": 10.4865, "step": 1 }, { "epoch": 0.1, "learning_rate": 2.5e-06, "loss": 9.4439, "step": 500 }, { "epoch": 0.19, "learning_rate": 5e-06, "loss": 7.6179, "step": 1000 }, { "epoch": 0.29, "learning_rate": 7.5e-06, "loss": 6.3619, "step": 1500 }, { "epoch": 0.38, "learning_rate": 1e-05, "loss": 6.0809, "step": 2000 }, { "epoch": 0.48, "learning_rate": 1.25e-05, "loss": 5.957, "step": 2500 }, { "epoch": 0.57, "learning_rate": 1.5e-05, "loss": 5.8758, "step": 3000 }, { "epoch": 0.67, "learning_rate": 1.75e-05, "loss": 5.8102, "step": 3500 }, { "epoch": 0.77, "learning_rate": 2e-05, "loss": 5.7625, "step": 4000 }, { "epoch": 0.86, "learning_rate": 2.25e-05, "loss": 5.7195, "step": 4500 }, { "epoch": 0.96, "learning_rate": 2.5e-05, "loss": 5.6801, "step": 5000 }, { "epoch": 1.05, "learning_rate": 2.7500000000000004e-05, "loss": 5.6449, "step": 5500 }, { "epoch": 1.15, "learning_rate": 3e-05, "loss": 5.6134, "step": 6000 }, { "epoch": 1.24, "learning_rate": 3.2500000000000004e-05, "loss": 5.591, "step": 6500 }, { "epoch": 1.34, "learning_rate": 3.5e-05, "loss": 5.5683, "step": 7000 }, { "epoch": 1.44, "learning_rate": 3.7500000000000003e-05, "loss": 5.5419, "step": 7500 }, { "epoch": 1.53, "learning_rate": 4e-05, "loss": 5.5231, "step": 8000 }, { "epoch": 1.63, "learning_rate": 4.25e-05, "loss": 5.506, "step": 8500 }, { "epoch": 1.72, "learning_rate": 4.4995000000000005e-05, "loss": 5.4871, "step": 9000 }, { "epoch": 1.82, "learning_rate": 4.7495e-05, "loss": 5.4763, "step": 9500 }, { "epoch": 1.91, "learning_rate": 4.9995000000000005e-05, "loss": 5.4615, "step": 10000 }, { "epoch": 2.01, "learning_rate": 4.998800480769231e-05, "loss": 5.4468, "step": 10500 }, { "epoch": 2.11, "learning_rate": 4.997600961538462e-05, "loss": 5.4341, "step": 11000 }, { "epoch": 2.2, "learning_rate": 4.996399038461539e-05, "loss": 5.4224, "step": 11500 }, { "epoch": 2.3, "learning_rate": 4.995197115384615e-05, "loss": 5.4099, "step": 12000 }, { "epoch": 2.39, "learning_rate": 4.993995192307693e-05, "loss": 5.3978, "step": 12500 }, { "epoch": 2.49, "learning_rate": 4.992795673076923e-05, "loss": 5.3897, "step": 13000 }, { "epoch": 2.58, "learning_rate": 4.991593750000001e-05, "loss": 5.3836, "step": 13500 }, { "epoch": 2.68, "learning_rate": 4.990391826923077e-05, "loss": 5.3737, "step": 14000 }, { "epoch": 2.78, "learning_rate": 4.989189903846154e-05, "loss": 5.3668, "step": 14500 }, { "epoch": 2.87, "learning_rate": 4.987990384615385e-05, "loss": 5.3597, "step": 15000 }, { "epoch": 2.97, "learning_rate": 4.986788461538462e-05, "loss": 5.3485, "step": 15500 }, { "epoch": 3.06, "learning_rate": 4.9855889423076926e-05, "loss": 5.3413, "step": 16000 }, { "epoch": 3.16, "learning_rate": 4.9843870192307694e-05, "loss": 5.338, "step": 16500 }, { "epoch": 3.25, "learning_rate": 4.983185096153846e-05, "loss": 5.3304, "step": 17000 }, { "epoch": 3.35, "learning_rate": 4.981983173076924e-05, "loss": 5.3258, "step": 17500 }, { "epoch": 3.44, "learning_rate": 4.98078125e-05, "loss": 5.317, "step": 18000 }, { "epoch": 3.54, "learning_rate": 4.9795793269230774e-05, "loss": 5.3134, "step": 18500 }, { "epoch": 3.64, "learning_rate": 4.978377403846154e-05, "loss": 5.3097, "step": 19000 }, { "epoch": 3.73, "learning_rate": 4.977175480769231e-05, "loss": 5.3019, "step": 19500 }, { "epoch": 3.83, "learning_rate": 4.9759759615384614e-05, "loss": 5.2985, "step": 20000 }, { "epoch": 3.92, "learning_rate": 4.974774038461539e-05, "loss": 5.2942, "step": 20500 }, { "epoch": 4.02, "learning_rate": 4.973572115384616e-05, "loss": 5.2893, "step": 21000 }, { "epoch": 4.11, "learning_rate": 4.9723701923076925e-05, "loss": 5.2843, "step": 21500 }, { "epoch": 4.21, "learning_rate": 4.971170673076923e-05, "loss": 5.2784, "step": 22000 }, { "epoch": 4.31, "learning_rate": 4.9699687500000004e-05, "loss": 5.2732, "step": 22500 }, { "epoch": 4.4, "learning_rate": 4.968766826923077e-05, "loss": 5.2701, "step": 23000 }, { "epoch": 4.5, "learning_rate": 4.967564903846154e-05, "loss": 5.2677, "step": 23500 }, { "epoch": 4.59, "learning_rate": 4.9663653846153844e-05, "loss": 5.2644, "step": 24000 }, { "epoch": 4.69, "learning_rate": 4.965163461538462e-05, "loss": 5.2562, "step": 24500 }, { "epoch": 4.78, "learning_rate": 4.963963942307693e-05, "loss": 5.2557, "step": 25000 }, { "epoch": 4.88, "learning_rate": 4.96276201923077e-05, "loss": 5.2529, "step": 25500 }, { "epoch": 4.98, "learning_rate": 4.9615600961538466e-05, "loss": 5.2504, "step": 26000 }, { "epoch": 5.07, "learning_rate": 4.9603581730769234e-05, "loss": 5.1431, "step": 26500 }, { "epoch": 5.17, "learning_rate": 4.95915625e-05, "loss": 4.8144, "step": 27000 }, { "epoch": 5.26, "learning_rate": 4.957954326923077e-05, "loss": 4.5379, "step": 27500 }, { "epoch": 5.36, "learning_rate": 4.9567524038461545e-05, "loss": 4.282, "step": 28000 }, { "epoch": 5.45, "learning_rate": 4.955550480769231e-05, "loss": 3.7642, "step": 28500 }, { "epoch": 5.55, "learning_rate": 4.9543485576923075e-05, "loss": 3.0854, "step": 29000 }, { "epoch": 5.65, "learning_rate": 4.9531490384615385e-05, "loss": 2.6674, "step": 29500 }, { "epoch": 5.74, "learning_rate": 4.951947115384616e-05, "loss": 2.2254, "step": 30000 }, { "epoch": 5.84, "learning_rate": 4.950745192307693e-05, "loss": 1.9446, "step": 30500 }, { "epoch": 5.93, "learning_rate": 4.94954326923077e-05, "loss": 1.7693, "step": 31000 }, { "epoch": 6.03, "learning_rate": 4.94834375e-05, "loss": 1.6527, "step": 31500 }, { "epoch": 6.12, "learning_rate": 4.9471418269230775e-05, "loss": 1.5696, "step": 32000 }, { "epoch": 6.22, "learning_rate": 4.9459399038461544e-05, "loss": 1.5054, "step": 32500 }, { "epoch": 6.32, "learning_rate": 4.944737980769231e-05, "loss": 1.4447, "step": 33000 }, { "epoch": 6.41, "learning_rate": 4.9435384615384616e-05, "loss": 1.3901, "step": 33500 }, { "epoch": 6.51, "learning_rate": 4.942336538461539e-05, "loss": 1.3332, "step": 34000 }, { "epoch": 6.6, "learning_rate": 4.941134615384615e-05, "loss": 1.2692, "step": 34500 }, { "epoch": 6.7, "learning_rate": 4.939932692307693e-05, "loss": 1.217, "step": 35000 }, { "epoch": 6.79, "learning_rate": 4.938733173076923e-05, "loss": 1.178, "step": 35500 }, { "epoch": 6.89, "learning_rate": 4.9375312500000006e-05, "loss": 1.143, "step": 36000 }, { "epoch": 6.99, "learning_rate": 4.936329326923077e-05, "loss": 1.1109, "step": 36500 }, { "epoch": 7.08, "learning_rate": 4.935129807692308e-05, "loss": 1.0859, "step": 37000 }, { "epoch": 7.18, "learning_rate": 4.9339278846153846e-05, "loss": 1.0619, "step": 37500 }, { "epoch": 7.27, "learning_rate": 4.932725961538462e-05, "loss": 1.0387, "step": 38000 }, { "epoch": 7.37, "learning_rate": 4.931524038461538e-05, "loss": 1.0205, "step": 38500 }, { "epoch": 7.46, "learning_rate": 4.930322115384616e-05, "loss": 1.0017, "step": 39000 }, { "epoch": 7.56, "learning_rate": 4.929122596153846e-05, "loss": 0.9856, "step": 39500 }, { "epoch": 7.66, "learning_rate": 4.9279206730769236e-05, "loss": 0.9707, "step": 40000 }, { "epoch": 7.75, "learning_rate": 4.92671875e-05, "loss": 0.9574, "step": 40500 }, { "epoch": 7.85, "learning_rate": 4.925516826923077e-05, "loss": 0.9455, "step": 41000 }, { "epoch": 7.94, "learning_rate": 4.924314903846154e-05, "loss": 0.9323, "step": 41500 }, { "epoch": 8.04, "learning_rate": 4.923112980769231e-05, "loss": 0.9199, "step": 42000 }, { "epoch": 8.13, "learning_rate": 4.921913461538461e-05, "loss": 0.9113, "step": 42500 }, { "epoch": 8.23, "learning_rate": 4.920713942307692e-05, "loss": 0.9012, "step": 43000 }, { "epoch": 8.33, "learning_rate": 4.919512019230769e-05, "loss": 0.8939, "step": 43500 }, { "epoch": 8.42, "learning_rate": 4.9183100961538466e-05, "loss": 0.8851, "step": 44000 }, { "epoch": 8.52, "learning_rate": 4.917108173076923e-05, "loss": 0.8745, "step": 44500 }, { "epoch": 8.61, "learning_rate": 4.91590625e-05, "loss": 0.8651, "step": 45000 }, { "epoch": 8.71, "learning_rate": 4.914704326923077e-05, "loss": 0.8578, "step": 45500 }, { "epoch": 8.8, "learning_rate": 4.913502403846154e-05, "loss": 0.8519, "step": 46000 }, { "epoch": 8.9, "learning_rate": 4.912300480769231e-05, "loss": 0.8457, "step": 46500 }, { "epoch": 9.0, "learning_rate": 4.911098557692308e-05, "loss": 0.8389, "step": 47000 }, { "epoch": 9.09, "learning_rate": 4.9098990384615386e-05, "loss": 0.8305, "step": 47500 }, { "epoch": 9.19, "learning_rate": 4.9086971153846154e-05, "loss": 0.8233, "step": 48000 }, { "epoch": 9.28, "learning_rate": 4.907495192307692e-05, "loss": 0.8189, "step": 48500 }, { "epoch": 9.38, "learning_rate": 4.90629326923077e-05, "loss": 0.8129, "step": 49000 }, { "epoch": 9.47, "learning_rate": 4.905093750000001e-05, "loss": 0.8076, "step": 49500 }, { "epoch": 9.57, "learning_rate": 4.903894230769231e-05, "loss": 0.8019, "step": 50000 }, { "epoch": 9.67, "learning_rate": 4.902692307692308e-05, "loss": 0.7962, "step": 50500 }, { "epoch": 9.76, "learning_rate": 4.901490384615385e-05, "loss": 0.7904, "step": 51000 }, { "epoch": 9.86, "learning_rate": 4.900288461538462e-05, "loss": 0.7879, "step": 51500 }, { "epoch": 9.95, "learning_rate": 4.8990865384615384e-05, "loss": 0.7811, "step": 52000 }, { "epoch": 10.05, "learning_rate": 4.897884615384616e-05, "loss": 0.7781, "step": 52500 }, { "epoch": 10.14, "learning_rate": 4.896682692307693e-05, "loss": 0.7724, "step": 53000 }, { "epoch": 10.24, "learning_rate": 4.8954807692307695e-05, "loss": 0.7682, "step": 53500 }, { "epoch": 10.33, "learning_rate": 4.89428125e-05, "loss": 0.7637, "step": 54000 }, { "epoch": 10.43, "learning_rate": 4.893081730769231e-05, "loss": 0.7592, "step": 54500 }, { "epoch": 10.53, "learning_rate": 4.891879807692308e-05, "loss": 0.7541, "step": 55000 }, { "epoch": 10.62, "learning_rate": 4.890677884615385e-05, "loss": 0.75, "step": 55500 }, { "epoch": 10.72, "learning_rate": 4.8894759615384614e-05, "loss": 0.749, "step": 56000 }, { "epoch": 10.81, "learning_rate": 4.888274038461539e-05, "loss": 0.7434, "step": 56500 }, { "epoch": 10.91, "learning_rate": 4.887074519230769e-05, "loss": 0.7407, "step": 57000 }, { "epoch": 11.0, "learning_rate": 4.885872596153847e-05, "loss": 0.7381, "step": 57500 }, { "epoch": 11.1, "learning_rate": 4.884670673076923e-05, "loss": 0.7344, "step": 58000 }, { "epoch": 11.2, "learning_rate": 4.8834687500000004e-05, "loss": 0.7282, "step": 58500 }, { "epoch": 11.29, "learning_rate": 4.882269230769231e-05, "loss": 0.7272, "step": 59000 }, { "epoch": 11.39, "learning_rate": 4.881067307692308e-05, "loss": 0.7236, "step": 59500 }, { "epoch": 11.48, "learning_rate": 4.8798653846153845e-05, "loss": 0.7196, "step": 60000 }, { "epoch": 11.58, "learning_rate": 4.878663461538462e-05, "loss": 0.7164, "step": 60500 }, { "epoch": 11.67, "learning_rate": 4.877461538461539e-05, "loss": 0.7129, "step": 61000 }, { "epoch": 11.77, "learning_rate": 4.87626201923077e-05, "loss": 0.71, "step": 61500 }, { "epoch": 11.87, "learning_rate": 4.875060096153846e-05, "loss": 0.7088, "step": 62000 }, { "epoch": 11.96, "learning_rate": 4.8738581730769235e-05, "loss": 0.7057, "step": 62500 }, { "epoch": 12.06, "learning_rate": 4.87265625e-05, "loss": 0.7022, "step": 63000 }, { "epoch": 12.15, "learning_rate": 4.8714567307692313e-05, "loss": 0.6977, "step": 63500 }, { "epoch": 12.25, "learning_rate": 4.8702548076923075e-05, "loss": 0.6988, "step": 64000 }, { "epoch": 12.34, "learning_rate": 4.869052884615385e-05, "loss": 0.6943, "step": 64500 }, { "epoch": 12.44, "learning_rate": 4.867850961538462e-05, "loss": 0.6919, "step": 65000 }, { "epoch": 12.54, "learning_rate": 4.8666490384615386e-05, "loss": 0.6888, "step": 65500 }, { "epoch": 12.63, "learning_rate": 4.865449519230769e-05, "loss": 0.686, "step": 66000 }, { "epoch": 12.73, "learning_rate": 4.8642475961538465e-05, "loss": 0.6843, "step": 66500 }, { "epoch": 12.82, "learning_rate": 4.863045673076923e-05, "loss": 0.681, "step": 67000 }, { "epoch": 12.92, "learning_rate": 4.86184375e-05, "loss": 0.68, "step": 67500 }, { "epoch": 13.01, "learning_rate": 4.860644230769231e-05, "loss": 0.6775, "step": 68000 }, { "epoch": 13.11, "learning_rate": 4.859442307692308e-05, "loss": 0.6745, "step": 68500 }, { "epoch": 13.21, "learning_rate": 4.858240384615385e-05, "loss": 0.6726, "step": 69000 }, { "epoch": 13.3, "learning_rate": 4.8570384615384616e-05, "loss": 0.6716, "step": 69500 }, { "epoch": 13.4, "learning_rate": 4.855836538461539e-05, "loss": 0.6691, "step": 70000 }, { "epoch": 13.49, "learning_rate": 4.8546370192307695e-05, "loss": 0.6665, "step": 70500 }, { "epoch": 13.59, "learning_rate": 4.853435096153846e-05, "loss": 0.6625, "step": 71000 }, { "epoch": 13.68, "learning_rate": 4.852233173076923e-05, "loss": 0.6609, "step": 71500 }, { "epoch": 13.78, "learning_rate": 4.8510312500000006e-05, "loss": 0.66, "step": 72000 }, { "epoch": 13.88, "learning_rate": 4.8498293269230774e-05, "loss": 0.6566, "step": 72500 }, { "epoch": 13.97, "learning_rate": 4.848627403846154e-05, "loss": 0.6561, "step": 73000 }, { "epoch": 14.07, "learning_rate": 4.8474278846153847e-05, "loss": 0.6536, "step": 73500 }, { "epoch": 14.16, "learning_rate": 4.846225961538462e-05, "loss": 0.6509, "step": 74000 }, { "epoch": 14.26, "learning_rate": 4.845024038461539e-05, "loss": 0.6509, "step": 74500 }, { "epoch": 14.35, "learning_rate": 4.843822115384616e-05, "loss": 0.6471, "step": 75000 }, { "epoch": 14.45, "learning_rate": 4.8426201923076926e-05, "loss": 0.6459, "step": 75500 }, { "epoch": 14.55, "learning_rate": 4.8414206730769237e-05, "loss": 0.6452, "step": 76000 }, { "epoch": 14.64, "learning_rate": 4.840221153846154e-05, "loss": 0.6425, "step": 76500 }, { "epoch": 14.74, "learning_rate": 4.839019230769231e-05, "loss": 0.6406, "step": 77000 }, { "epoch": 14.83, "learning_rate": 4.837817307692308e-05, "loss": 0.6393, "step": 77500 }, { "epoch": 14.93, "learning_rate": 4.836615384615385e-05, "loss": 0.6362, "step": 78000 }, { "epoch": 15.02, "learning_rate": 4.835413461538461e-05, "loss": 0.6368, "step": 78500 }, { "epoch": 15.12, "learning_rate": 4.834211538461539e-05, "loss": 0.634, "step": 79000 }, { "epoch": 15.22, "learning_rate": 4.8330096153846156e-05, "loss": 0.6324, "step": 79500 }, { "epoch": 15.31, "learning_rate": 4.831810096153847e-05, "loss": 0.6287, "step": 80000 }, { "epoch": 15.41, "learning_rate": 4.830608173076923e-05, "loss": 0.6286, "step": 80500 }, { "epoch": 15.5, "learning_rate": 4.82940625e-05, "loss": 0.6267, "step": 81000 }, { "epoch": 15.6, "learning_rate": 4.828204326923077e-05, "loss": 0.6257, "step": 81500 }, { "epoch": 15.69, "learning_rate": 4.827002403846154e-05, "loss": 0.6245, "step": 82000 }, { "epoch": 15.79, "learning_rate": 4.825800480769231e-05, "loss": 0.6216, "step": 82500 }, { "epoch": 15.89, "learning_rate": 4.824598557692308e-05, "loss": 0.6208, "step": 83000 }, { "epoch": 15.98, "learning_rate": 4.8233966346153844e-05, "loss": 0.621, "step": 83500 }, { "epoch": 16.08, "learning_rate": 4.822194711538462e-05, "loss": 0.617, "step": 84000 }, { "epoch": 16.17, "learning_rate": 4.820995192307692e-05, "loss": 0.6158, "step": 84500 }, { "epoch": 16.27, "learning_rate": 4.819795673076923e-05, "loss": 0.617, "step": 85000 }, { "epoch": 16.36, "learning_rate": 4.81859375e-05, "loss": 0.6149, "step": 85500 }, { "epoch": 16.46, "learning_rate": 4.817391826923077e-05, "loss": 0.6128, "step": 86000 }, { "epoch": 16.56, "learning_rate": 4.816189903846154e-05, "loss": 0.6119, "step": 86500 }, { "epoch": 16.65, "learning_rate": 4.814987980769231e-05, "loss": 0.6104, "step": 87000 }, { "epoch": 16.75, "learning_rate": 4.8137860576923074e-05, "loss": 0.6082, "step": 87500 }, { "epoch": 16.84, "learning_rate": 4.812584134615385e-05, "loss": 0.6077, "step": 88000 }, { "epoch": 16.94, "learning_rate": 4.811382211538462e-05, "loss": 0.6066, "step": 88500 }, { "epoch": 17.03, "learning_rate": 4.810182692307693e-05, "loss": 0.6051, "step": 89000 }, { "epoch": 17.13, "learning_rate": 4.808980769230769e-05, "loss": 0.6035, "step": 89500 }, { "epoch": 17.22, "learning_rate": 4.8077788461538464e-05, "loss": 0.6032, "step": 90000 }, { "epoch": 17.32, "learning_rate": 4.806576923076923e-05, "loss": 0.6001, "step": 90500 }, { "epoch": 17.42, "learning_rate": 4.805375e-05, "loss": 0.598, "step": 91000 }, { "epoch": 17.51, "learning_rate": 4.8041754807692304e-05, "loss": 0.5996, "step": 91500 }, { "epoch": 17.61, "learning_rate": 4.802973557692308e-05, "loss": 0.5987, "step": 92000 }, { "epoch": 17.7, "learning_rate": 4.801771634615385e-05, "loss": 0.5973, "step": 92500 }, { "epoch": 17.8, "learning_rate": 4.800572115384616e-05, "loss": 0.5957, "step": 93000 }, { "epoch": 17.89, "learning_rate": 4.7993701923076926e-05, "loss": 0.5938, "step": 93500 }, { "epoch": 17.99, "learning_rate": 4.7981682692307694e-05, "loss": 0.593, "step": 94000 } ], "max_steps": 2090000, "num_train_epochs": 400, "total_flos": 2.5346531711380357e+19, "trial_name": null, "trial_params": null }