plt5-base-poquad2 / trainer_state.json
mzasada's picture
Upload 12 files
6847bf0
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 400,
"global_step": 21234,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 2.992935857586889e-05,
"loss": 14.6395,
"step": 50
},
{
"epoch": 0.01,
"learning_rate": 2.985871715173778e-05,
"loss": 8.6304,
"step": 100
},
{
"epoch": 0.02,
"learning_rate": 2.978807572760667e-05,
"loss": 6.1691,
"step": 150
},
{
"epoch": 0.03,
"learning_rate": 2.9717434303475558e-05,
"loss": 5.4462,
"step": 200
},
{
"epoch": 0.04,
"learning_rate": 2.964679287934445e-05,
"loss": 4.827,
"step": 250
},
{
"epoch": 0.04,
"learning_rate": 2.9576151455213337e-05,
"loss": 4.2751,
"step": 300
},
{
"epoch": 0.05,
"learning_rate": 2.9505510031082228e-05,
"loss": 3.9951,
"step": 350
},
{
"epoch": 0.06,
"learning_rate": 2.9434868606951115e-05,
"loss": 3.8793,
"step": 400
},
{
"epoch": 0.06,
"eval_loss": 2.5431909561157227,
"eval_runtime": 146.3919,
"eval_samples_per_second": 51.499,
"eval_steps_per_second": 6.442,
"step": 400
},
{
"epoch": 0.06,
"learning_rate": 2.9364227182820006e-05,
"loss": 3.5368,
"step": 450
},
{
"epoch": 0.07,
"learning_rate": 2.9293585758688894e-05,
"loss": 3.243,
"step": 500
},
{
"epoch": 0.08,
"learning_rate": 2.9222944334557785e-05,
"loss": 3.3564,
"step": 550
},
{
"epoch": 0.08,
"learning_rate": 2.9152302910426676e-05,
"loss": 3.1593,
"step": 600
},
{
"epoch": 0.09,
"learning_rate": 2.9081661486295564e-05,
"loss": 2.9504,
"step": 650
},
{
"epoch": 0.1,
"learning_rate": 2.901102006216445e-05,
"loss": 2.6833,
"step": 700
},
{
"epoch": 0.11,
"learning_rate": 2.8940378638033346e-05,
"loss": 2.5516,
"step": 750
},
{
"epoch": 0.11,
"learning_rate": 2.8869737213902233e-05,
"loss": 2.3584,
"step": 800
},
{
"epoch": 0.11,
"eval_loss": 2.064560651779175,
"eval_runtime": 146.3705,
"eval_samples_per_second": 51.506,
"eval_steps_per_second": 6.443,
"step": 800
},
{
"epoch": 0.12,
"learning_rate": 2.879909578977112e-05,
"loss": 2.3706,
"step": 850
},
{
"epoch": 0.13,
"learning_rate": 2.8728454365640012e-05,
"loss": 2.3712,
"step": 900
},
{
"epoch": 0.13,
"learning_rate": 2.8657812941508903e-05,
"loss": 2.454,
"step": 950
},
{
"epoch": 0.14,
"learning_rate": 2.858717151737779e-05,
"loss": 2.3754,
"step": 1000
},
{
"epoch": 0.15,
"learning_rate": 2.851653009324668e-05,
"loss": 2.2086,
"step": 1050
},
{
"epoch": 0.16,
"learning_rate": 2.844588866911557e-05,
"loss": 2.4654,
"step": 1100
},
{
"epoch": 0.16,
"learning_rate": 2.837524724498446e-05,
"loss": 2.2728,
"step": 1150
},
{
"epoch": 0.17,
"learning_rate": 2.8304605820853348e-05,
"loss": 2.1175,
"step": 1200
},
{
"epoch": 0.17,
"eval_loss": 1.9099233150482178,
"eval_runtime": 146.2849,
"eval_samples_per_second": 51.536,
"eval_steps_per_second": 6.446,
"step": 1200
},
{
"epoch": 0.18,
"learning_rate": 2.823396439672224e-05,
"loss": 1.9796,
"step": 1250
},
{
"epoch": 0.18,
"learning_rate": 2.8163322972591127e-05,
"loss": 2.1218,
"step": 1300
},
{
"epoch": 0.19,
"learning_rate": 2.8092681548460018e-05,
"loss": 2.0264,
"step": 1350
},
{
"epoch": 0.2,
"learning_rate": 2.802204012432891e-05,
"loss": 2.0893,
"step": 1400
},
{
"epoch": 0.2,
"learning_rate": 2.7951398700197796e-05,
"loss": 2.0299,
"step": 1450
},
{
"epoch": 0.21,
"learning_rate": 2.7880757276066684e-05,
"loss": 1.8765,
"step": 1500
},
{
"epoch": 0.22,
"learning_rate": 2.781011585193558e-05,
"loss": 1.8572,
"step": 1550
},
{
"epoch": 0.23,
"learning_rate": 2.7739474427804466e-05,
"loss": 1.7761,
"step": 1600
},
{
"epoch": 0.23,
"eval_loss": 1.8229914903640747,
"eval_runtime": 146.5454,
"eval_samples_per_second": 51.445,
"eval_steps_per_second": 6.435,
"step": 1600
},
{
"epoch": 0.23,
"learning_rate": 2.7668833003673354e-05,
"loss": 1.7773,
"step": 1650
},
{
"epoch": 0.24,
"learning_rate": 2.7598191579542245e-05,
"loss": 1.8014,
"step": 1700
},
{
"epoch": 0.25,
"learning_rate": 2.7527550155411136e-05,
"loss": 1.8682,
"step": 1750
},
{
"epoch": 0.25,
"learning_rate": 2.7456908731280023e-05,
"loss": 1.9171,
"step": 1800
},
{
"epoch": 0.26,
"learning_rate": 2.7386267307148914e-05,
"loss": 1.7615,
"step": 1850
},
{
"epoch": 0.27,
"learning_rate": 2.7315625883017802e-05,
"loss": 1.7201,
"step": 1900
},
{
"epoch": 0.28,
"learning_rate": 2.7244984458886693e-05,
"loss": 1.6485,
"step": 1950
},
{
"epoch": 0.28,
"learning_rate": 2.717434303475558e-05,
"loss": 1.6427,
"step": 2000
},
{
"epoch": 0.28,
"eval_loss": 1.7565761804580688,
"eval_runtime": 146.4911,
"eval_samples_per_second": 51.464,
"eval_steps_per_second": 6.437,
"step": 2000
},
{
"epoch": 0.29,
"learning_rate": 2.710370161062447e-05,
"loss": 1.9137,
"step": 2050
},
{
"epoch": 0.3,
"learning_rate": 2.703306018649336e-05,
"loss": 1.8964,
"step": 2100
},
{
"epoch": 0.3,
"learning_rate": 2.696241876236225e-05,
"loss": 1.7006,
"step": 2150
},
{
"epoch": 0.31,
"learning_rate": 2.689177733823114e-05,
"loss": 1.7578,
"step": 2200
},
{
"epoch": 0.32,
"learning_rate": 2.682113591410003e-05,
"loss": 1.7194,
"step": 2250
},
{
"epoch": 0.32,
"learning_rate": 2.6750494489968917e-05,
"loss": 1.8679,
"step": 2300
},
{
"epoch": 0.33,
"learning_rate": 2.667985306583781e-05,
"loss": 1.5503,
"step": 2350
},
{
"epoch": 0.34,
"learning_rate": 2.66092116417067e-05,
"loss": 1.5331,
"step": 2400
},
{
"epoch": 0.34,
"eval_loss": 1.6002424955368042,
"eval_runtime": 146.4497,
"eval_samples_per_second": 51.478,
"eval_steps_per_second": 6.439,
"step": 2400
},
{
"epoch": 0.35,
"learning_rate": 2.6538570217575586e-05,
"loss": 1.5046,
"step": 2450
},
{
"epoch": 0.35,
"learning_rate": 2.6467928793444477e-05,
"loss": 1.4923,
"step": 2500
},
{
"epoch": 0.36,
"learning_rate": 2.6397287369313365e-05,
"loss": 1.4884,
"step": 2550
},
{
"epoch": 0.37,
"learning_rate": 2.6326645945182256e-05,
"loss": 1.4889,
"step": 2600
},
{
"epoch": 0.37,
"learning_rate": 2.6256004521051143e-05,
"loss": 1.4259,
"step": 2650
},
{
"epoch": 0.38,
"learning_rate": 2.6185363096920034e-05,
"loss": 1.4368,
"step": 2700
},
{
"epoch": 0.39,
"learning_rate": 2.6114721672788922e-05,
"loss": 1.3874,
"step": 2750
},
{
"epoch": 0.4,
"learning_rate": 2.6044080248657813e-05,
"loss": 1.2631,
"step": 2800
},
{
"epoch": 0.4,
"eval_loss": 1.2792493104934692,
"eval_runtime": 146.3888,
"eval_samples_per_second": 51.5,
"eval_steps_per_second": 6.442,
"step": 2800
},
{
"epoch": 0.4,
"learning_rate": 2.5973438824526704e-05,
"loss": 1.3781,
"step": 2850
},
{
"epoch": 0.41,
"learning_rate": 2.5902797400395592e-05,
"loss": 1.2868,
"step": 2900
},
{
"epoch": 0.42,
"learning_rate": 2.583215597626448e-05,
"loss": 1.3965,
"step": 2950
},
{
"epoch": 0.42,
"learning_rate": 2.5761514552133374e-05,
"loss": 1.1789,
"step": 3000
},
{
"epoch": 0.43,
"learning_rate": 2.569087312800226e-05,
"loss": 1.1915,
"step": 3050
},
{
"epoch": 0.44,
"learning_rate": 2.562023170387115e-05,
"loss": 1.1582,
"step": 3100
},
{
"epoch": 0.45,
"learning_rate": 2.554959027974004e-05,
"loss": 1.3444,
"step": 3150
},
{
"epoch": 0.45,
"learning_rate": 2.547894885560893e-05,
"loss": 1.3403,
"step": 3200
},
{
"epoch": 0.45,
"eval_loss": 1.1806565523147583,
"eval_runtime": 146.4657,
"eval_samples_per_second": 51.473,
"eval_steps_per_second": 6.438,
"step": 3200
},
{
"epoch": 0.46,
"learning_rate": 2.540830743147782e-05,
"loss": 1.2903,
"step": 3250
},
{
"epoch": 0.47,
"learning_rate": 2.533766600734671e-05,
"loss": 1.2086,
"step": 3300
},
{
"epoch": 0.47,
"learning_rate": 2.5267024583215597e-05,
"loss": 1.2047,
"step": 3350
},
{
"epoch": 0.48,
"learning_rate": 2.519638315908449e-05,
"loss": 1.1538,
"step": 3400
},
{
"epoch": 0.49,
"learning_rate": 2.5125741734953376e-05,
"loss": 1.184,
"step": 3450
},
{
"epoch": 0.49,
"learning_rate": 2.5055100310822267e-05,
"loss": 1.0577,
"step": 3500
},
{
"epoch": 0.5,
"learning_rate": 2.4984458886691155e-05,
"loss": 1.1838,
"step": 3550
},
{
"epoch": 0.51,
"learning_rate": 2.4913817462560046e-05,
"loss": 1.1105,
"step": 3600
},
{
"epoch": 0.51,
"eval_loss": 1.1069550514221191,
"eval_runtime": 146.67,
"eval_samples_per_second": 51.401,
"eval_steps_per_second": 6.429,
"step": 3600
},
{
"epoch": 0.52,
"learning_rate": 2.4843176038428937e-05,
"loss": 1.0968,
"step": 3650
},
{
"epoch": 0.52,
"learning_rate": 2.4772534614297824e-05,
"loss": 1.0911,
"step": 3700
},
{
"epoch": 0.53,
"learning_rate": 2.4701893190166712e-05,
"loss": 0.9316,
"step": 3750
},
{
"epoch": 0.54,
"learning_rate": 2.4631251766035606e-05,
"loss": 1.1932,
"step": 3800
},
{
"epoch": 0.54,
"learning_rate": 2.4560610341904494e-05,
"loss": 1.1092,
"step": 3850
},
{
"epoch": 0.55,
"learning_rate": 2.448996891777338e-05,
"loss": 0.9679,
"step": 3900
},
{
"epoch": 0.56,
"learning_rate": 2.4419327493642273e-05,
"loss": 1.0059,
"step": 3950
},
{
"epoch": 0.57,
"learning_rate": 2.4348686069511164e-05,
"loss": 1.1187,
"step": 4000
},
{
"epoch": 0.57,
"eval_loss": 1.0644315481185913,
"eval_runtime": 146.3933,
"eval_samples_per_second": 51.498,
"eval_steps_per_second": 6.442,
"step": 4000
},
{
"epoch": 0.57,
"learning_rate": 2.427804464538005e-05,
"loss": 1.0207,
"step": 4050
},
{
"epoch": 0.58,
"learning_rate": 2.420740322124894e-05,
"loss": 1.074,
"step": 4100
},
{
"epoch": 0.59,
"learning_rate": 2.413676179711783e-05,
"loss": 1.0159,
"step": 4150
},
{
"epoch": 0.59,
"learning_rate": 2.406612037298672e-05,
"loss": 1.0128,
"step": 4200
},
{
"epoch": 0.6,
"learning_rate": 2.399547894885561e-05,
"loss": 1.0398,
"step": 4250
},
{
"epoch": 0.61,
"learning_rate": 2.39248375247245e-05,
"loss": 1.0713,
"step": 4300
},
{
"epoch": 0.61,
"learning_rate": 2.3854196100593387e-05,
"loss": 0.9827,
"step": 4350
},
{
"epoch": 0.62,
"learning_rate": 2.378355467646228e-05,
"loss": 1.0307,
"step": 4400
},
{
"epoch": 0.62,
"eval_loss": 1.053775668144226,
"eval_runtime": 146.3869,
"eval_samples_per_second": 51.501,
"eval_steps_per_second": 6.442,
"step": 4400
},
{
"epoch": 0.63,
"learning_rate": 2.371291325233117e-05,
"loss": 0.9596,
"step": 4450
},
{
"epoch": 0.64,
"learning_rate": 2.3642271828200057e-05,
"loss": 0.9598,
"step": 4500
},
{
"epoch": 0.64,
"learning_rate": 2.3571630404068945e-05,
"loss": 0.9912,
"step": 4550
},
{
"epoch": 0.65,
"learning_rate": 2.350098897993784e-05,
"loss": 0.9629,
"step": 4600
},
{
"epoch": 0.66,
"learning_rate": 2.3430347555806727e-05,
"loss": 0.9623,
"step": 4650
},
{
"epoch": 0.66,
"learning_rate": 2.3359706131675614e-05,
"loss": 0.9315,
"step": 4700
},
{
"epoch": 0.67,
"learning_rate": 2.3289064707544505e-05,
"loss": 0.9403,
"step": 4750
},
{
"epoch": 0.68,
"learning_rate": 2.3218423283413396e-05,
"loss": 0.9285,
"step": 4800
},
{
"epoch": 0.68,
"eval_loss": 1.0219874382019043,
"eval_runtime": 146.3894,
"eval_samples_per_second": 51.5,
"eval_steps_per_second": 6.442,
"step": 4800
},
{
"epoch": 0.69,
"learning_rate": 2.3147781859282284e-05,
"loss": 0.8947,
"step": 4850
},
{
"epoch": 0.69,
"learning_rate": 2.307714043515117e-05,
"loss": 0.9009,
"step": 4900
},
{
"epoch": 0.7,
"learning_rate": 2.3006499011020063e-05,
"loss": 0.888,
"step": 4950
},
{
"epoch": 0.71,
"learning_rate": 2.2935857586888954e-05,
"loss": 0.9096,
"step": 5000
},
{
"epoch": 0.71,
"learning_rate": 2.286521616275784e-05,
"loss": 0.926,
"step": 5050
},
{
"epoch": 0.72,
"learning_rate": 2.2794574738626732e-05,
"loss": 0.9217,
"step": 5100
},
{
"epoch": 0.73,
"learning_rate": 2.272393331449562e-05,
"loss": 0.9105,
"step": 5150
},
{
"epoch": 0.73,
"learning_rate": 2.2653291890364508e-05,
"loss": 0.8917,
"step": 5200
},
{
"epoch": 0.73,
"eval_loss": 0.9944532513618469,
"eval_runtime": 146.4135,
"eval_samples_per_second": 51.491,
"eval_steps_per_second": 6.441,
"step": 5200
},
{
"epoch": 0.74,
"learning_rate": 2.2582650466233402e-05,
"loss": 0.8959,
"step": 5250
},
{
"epoch": 0.75,
"learning_rate": 2.251200904210229e-05,
"loss": 0.8332,
"step": 5300
},
{
"epoch": 0.76,
"learning_rate": 2.2441367617971177e-05,
"loss": 0.9235,
"step": 5350
},
{
"epoch": 0.76,
"learning_rate": 2.2370726193840068e-05,
"loss": 0.7935,
"step": 5400
},
{
"epoch": 0.77,
"learning_rate": 2.230008476970896e-05,
"loss": 0.9101,
"step": 5450
},
{
"epoch": 0.78,
"learning_rate": 2.2229443345577847e-05,
"loss": 0.9317,
"step": 5500
},
{
"epoch": 0.78,
"learning_rate": 2.2158801921446738e-05,
"loss": 0.9412,
"step": 5550
},
{
"epoch": 0.79,
"learning_rate": 2.2088160497315626e-05,
"loss": 0.9018,
"step": 5600
},
{
"epoch": 0.79,
"eval_loss": 0.961943507194519,
"eval_runtime": 146.3492,
"eval_samples_per_second": 51.514,
"eval_steps_per_second": 6.443,
"step": 5600
},
{
"epoch": 0.8,
"learning_rate": 2.2017519073184517e-05,
"loss": 0.9469,
"step": 5650
},
{
"epoch": 0.81,
"learning_rate": 2.1946877649053404e-05,
"loss": 0.8957,
"step": 5700
},
{
"epoch": 0.81,
"learning_rate": 2.1876236224922295e-05,
"loss": 0.9212,
"step": 5750
},
{
"epoch": 0.82,
"learning_rate": 2.1805594800791183e-05,
"loss": 0.9815,
"step": 5800
},
{
"epoch": 0.83,
"learning_rate": 2.1734953376660074e-05,
"loss": 0.8364,
"step": 5850
},
{
"epoch": 0.83,
"learning_rate": 2.1664311952528965e-05,
"loss": 0.9239,
"step": 5900
},
{
"epoch": 0.84,
"learning_rate": 2.1593670528397852e-05,
"loss": 0.8507,
"step": 5950
},
{
"epoch": 0.85,
"learning_rate": 2.152302910426674e-05,
"loss": 0.8455,
"step": 6000
},
{
"epoch": 0.85,
"eval_loss": 0.9640973210334778,
"eval_runtime": 146.216,
"eval_samples_per_second": 51.561,
"eval_steps_per_second": 6.449,
"step": 6000
},
{
"epoch": 0.85,
"learning_rate": 2.1452387680135635e-05,
"loss": 0.873,
"step": 6050
},
{
"epoch": 0.86,
"learning_rate": 2.1381746256004522e-05,
"loss": 0.9022,
"step": 6100
},
{
"epoch": 0.87,
"learning_rate": 2.131110483187341e-05,
"loss": 0.9067,
"step": 6150
},
{
"epoch": 0.88,
"learning_rate": 2.12404634077423e-05,
"loss": 0.8156,
"step": 6200
},
{
"epoch": 0.88,
"learning_rate": 2.1169821983611192e-05,
"loss": 0.7626,
"step": 6250
},
{
"epoch": 0.89,
"learning_rate": 2.109918055948008e-05,
"loss": 0.9551,
"step": 6300
},
{
"epoch": 0.9,
"learning_rate": 2.1028539135348967e-05,
"loss": 0.7957,
"step": 6350
},
{
"epoch": 0.9,
"learning_rate": 2.0957897711217858e-05,
"loss": 0.8259,
"step": 6400
},
{
"epoch": 0.9,
"eval_loss": 0.9384291768074036,
"eval_runtime": 146.2207,
"eval_samples_per_second": 51.559,
"eval_steps_per_second": 6.449,
"step": 6400
},
{
"epoch": 0.91,
"learning_rate": 2.088725628708675e-05,
"loss": 0.7219,
"step": 6450
},
{
"epoch": 0.92,
"learning_rate": 2.0816614862955637e-05,
"loss": 0.8212,
"step": 6500
},
{
"epoch": 0.93,
"learning_rate": 2.0745973438824528e-05,
"loss": 0.7422,
"step": 6550
},
{
"epoch": 0.93,
"learning_rate": 2.0675332014693415e-05,
"loss": 0.8335,
"step": 6600
},
{
"epoch": 0.94,
"learning_rate": 2.0604690590562306e-05,
"loss": 0.8139,
"step": 6650
},
{
"epoch": 0.95,
"learning_rate": 2.0534049166431197e-05,
"loss": 0.8703,
"step": 6700
},
{
"epoch": 0.95,
"learning_rate": 2.0463407742300085e-05,
"loss": 0.9146,
"step": 6750
},
{
"epoch": 0.96,
"learning_rate": 2.0392766318168973e-05,
"loss": 0.7593,
"step": 6800
},
{
"epoch": 0.96,
"eval_loss": 0.9210972785949707,
"eval_runtime": 146.1977,
"eval_samples_per_second": 51.567,
"eval_steps_per_second": 6.45,
"step": 6800
},
{
"epoch": 0.97,
"learning_rate": 2.0322124894037867e-05,
"loss": 0.8349,
"step": 6850
},
{
"epoch": 0.97,
"learning_rate": 2.0251483469906755e-05,
"loss": 0.8791,
"step": 6900
},
{
"epoch": 0.98,
"learning_rate": 2.0180842045775642e-05,
"loss": 0.9861,
"step": 6950
},
{
"epoch": 0.99,
"learning_rate": 2.0110200621644533e-05,
"loss": 0.7273,
"step": 7000
},
{
"epoch": 1.0,
"learning_rate": 2.0039559197513424e-05,
"loss": 0.8075,
"step": 7050
},
{
"epoch": 1.0,
"learning_rate": 1.9968917773382312e-05,
"loss": 0.7553,
"step": 7100
},
{
"epoch": 1.01,
"learning_rate": 1.98982763492512e-05,
"loss": 0.7873,
"step": 7150
},
{
"epoch": 1.02,
"learning_rate": 1.982763492512009e-05,
"loss": 0.8176,
"step": 7200
},
{
"epoch": 1.02,
"eval_loss": 0.910588264465332,
"eval_runtime": 146.2211,
"eval_samples_per_second": 51.559,
"eval_steps_per_second": 6.449,
"step": 7200
},
{
"epoch": 1.02,
"learning_rate": 1.9756993500988982e-05,
"loss": 0.7748,
"step": 7250
},
{
"epoch": 1.03,
"learning_rate": 1.968635207685787e-05,
"loss": 0.8337,
"step": 7300
},
{
"epoch": 1.04,
"learning_rate": 1.961571065272676e-05,
"loss": 0.7968,
"step": 7350
},
{
"epoch": 1.05,
"learning_rate": 1.9545069228595648e-05,
"loss": 0.7117,
"step": 7400
},
{
"epoch": 1.05,
"learning_rate": 1.947442780446454e-05,
"loss": 0.7323,
"step": 7450
},
{
"epoch": 1.06,
"learning_rate": 1.940378638033343e-05,
"loss": 0.7593,
"step": 7500
},
{
"epoch": 1.07,
"learning_rate": 1.9333144956202318e-05,
"loss": 0.7826,
"step": 7550
},
{
"epoch": 1.07,
"learning_rate": 1.9262503532071205e-05,
"loss": 0.881,
"step": 7600
},
{
"epoch": 1.07,
"eval_loss": 0.8992499709129333,
"eval_runtime": 146.4953,
"eval_samples_per_second": 51.462,
"eval_steps_per_second": 6.437,
"step": 7600
},
{
"epoch": 1.08,
"learning_rate": 1.9191862107940096e-05,
"loss": 0.6995,
"step": 7650
},
{
"epoch": 1.09,
"learning_rate": 1.9121220683808987e-05,
"loss": 0.6882,
"step": 7700
},
{
"epoch": 1.09,
"learning_rate": 1.9050579259677875e-05,
"loss": 0.7669,
"step": 7750
},
{
"epoch": 1.1,
"learning_rate": 1.8979937835546766e-05,
"loss": 0.7485,
"step": 7800
},
{
"epoch": 1.11,
"learning_rate": 1.8909296411415654e-05,
"loss": 0.8179,
"step": 7850
},
{
"epoch": 1.12,
"learning_rate": 1.8838654987284545e-05,
"loss": 0.6935,
"step": 7900
},
{
"epoch": 1.12,
"learning_rate": 1.8768013563153432e-05,
"loss": 0.7565,
"step": 7950
},
{
"epoch": 1.13,
"learning_rate": 1.8697372139022323e-05,
"loss": 0.7842,
"step": 8000
},
{
"epoch": 1.13,
"eval_loss": 0.8948126435279846,
"eval_runtime": 146.2227,
"eval_samples_per_second": 51.558,
"eval_steps_per_second": 6.449,
"step": 8000
},
{
"epoch": 1.14,
"learning_rate": 1.862673071489121e-05,
"loss": 0.7082,
"step": 8050
},
{
"epoch": 1.14,
"learning_rate": 1.8556089290760102e-05,
"loss": 0.712,
"step": 8100
},
{
"epoch": 1.15,
"learning_rate": 1.8485447866628993e-05,
"loss": 0.7024,
"step": 8150
},
{
"epoch": 1.16,
"learning_rate": 1.841480644249788e-05,
"loss": 0.7146,
"step": 8200
},
{
"epoch": 1.17,
"learning_rate": 1.8344165018366768e-05,
"loss": 0.7073,
"step": 8250
},
{
"epoch": 1.17,
"learning_rate": 1.8273523594235663e-05,
"loss": 0.72,
"step": 8300
},
{
"epoch": 1.18,
"learning_rate": 1.820288217010455e-05,
"loss": 0.7678,
"step": 8350
},
{
"epoch": 1.19,
"learning_rate": 1.8132240745973438e-05,
"loss": 0.698,
"step": 8400
},
{
"epoch": 1.19,
"eval_loss": 0.8967130780220032,
"eval_runtime": 146.58,
"eval_samples_per_second": 51.433,
"eval_steps_per_second": 6.433,
"step": 8400
},
{
"epoch": 1.19,
"learning_rate": 1.806159932184233e-05,
"loss": 0.6976,
"step": 8450
},
{
"epoch": 1.2,
"learning_rate": 1.799095789771122e-05,
"loss": 0.74,
"step": 8500
},
{
"epoch": 1.21,
"learning_rate": 1.7920316473580108e-05,
"loss": 0.7858,
"step": 8550
},
{
"epoch": 1.22,
"learning_rate": 1.7849675049448995e-05,
"loss": 0.6905,
"step": 8600
},
{
"epoch": 1.22,
"learning_rate": 1.7779033625317886e-05,
"loss": 0.7195,
"step": 8650
},
{
"epoch": 1.23,
"learning_rate": 1.7708392201186777e-05,
"loss": 0.6889,
"step": 8700
},
{
"epoch": 1.24,
"learning_rate": 1.7637750777055665e-05,
"loss": 0.7499,
"step": 8750
},
{
"epoch": 1.24,
"learning_rate": 1.7567109352924556e-05,
"loss": 0.649,
"step": 8800
},
{
"epoch": 1.24,
"eval_loss": 0.8673287630081177,
"eval_runtime": 146.2981,
"eval_samples_per_second": 51.532,
"eval_steps_per_second": 6.446,
"step": 8800
},
{
"epoch": 1.25,
"learning_rate": 1.7496467928793444e-05,
"loss": 0.6689,
"step": 8850
},
{
"epoch": 1.26,
"learning_rate": 1.7425826504662335e-05,
"loss": 0.7401,
"step": 8900
},
{
"epoch": 1.26,
"learning_rate": 1.7355185080531226e-05,
"loss": 0.6371,
"step": 8950
},
{
"epoch": 1.27,
"learning_rate": 1.7284543656400113e-05,
"loss": 0.6743,
"step": 9000
},
{
"epoch": 1.28,
"learning_rate": 1.7213902232269e-05,
"loss": 0.7858,
"step": 9050
},
{
"epoch": 1.29,
"learning_rate": 1.7143260808137895e-05,
"loss": 0.6682,
"step": 9100
},
{
"epoch": 1.29,
"learning_rate": 1.7072619384006783e-05,
"loss": 0.6313,
"step": 9150
},
{
"epoch": 1.3,
"learning_rate": 1.700197795987567e-05,
"loss": 0.7243,
"step": 9200
},
{
"epoch": 1.3,
"eval_loss": 0.8753476738929749,
"eval_runtime": 146.7694,
"eval_samples_per_second": 51.366,
"eval_steps_per_second": 6.425,
"step": 9200
},
{
"epoch": 1.31,
"learning_rate": 1.693133653574456e-05,
"loss": 0.723,
"step": 9250
},
{
"epoch": 1.31,
"learning_rate": 1.6860695111613453e-05,
"loss": 0.6731,
"step": 9300
},
{
"epoch": 1.32,
"learning_rate": 1.679005368748234e-05,
"loss": 0.7232,
"step": 9350
},
{
"epoch": 1.33,
"learning_rate": 1.6719412263351228e-05,
"loss": 0.802,
"step": 9400
},
{
"epoch": 1.34,
"learning_rate": 1.664877083922012e-05,
"loss": 0.7172,
"step": 9450
},
{
"epoch": 1.34,
"learning_rate": 1.657812941508901e-05,
"loss": 0.6294,
"step": 9500
},
{
"epoch": 1.35,
"learning_rate": 1.6507487990957897e-05,
"loss": 0.6319,
"step": 9550
},
{
"epoch": 1.36,
"learning_rate": 1.643684656682679e-05,
"loss": 0.7411,
"step": 9600
},
{
"epoch": 1.36,
"eval_loss": 0.8756875991821289,
"eval_runtime": 146.4905,
"eval_samples_per_second": 51.464,
"eval_steps_per_second": 6.437,
"step": 9600
},
{
"epoch": 1.36,
"learning_rate": 1.6366205142695676e-05,
"loss": 0.6805,
"step": 9650
},
{
"epoch": 1.37,
"learning_rate": 1.6295563718564567e-05,
"loss": 0.7114,
"step": 9700
},
{
"epoch": 1.38,
"learning_rate": 1.6224922294433458e-05,
"loss": 0.6841,
"step": 9750
},
{
"epoch": 1.38,
"learning_rate": 1.6154280870302346e-05,
"loss": 0.6689,
"step": 9800
},
{
"epoch": 1.39,
"learning_rate": 1.6083639446171233e-05,
"loss": 0.7589,
"step": 9850
},
{
"epoch": 1.4,
"learning_rate": 1.6012998022040128e-05,
"loss": 0.7229,
"step": 9900
},
{
"epoch": 1.41,
"learning_rate": 1.5942356597909015e-05,
"loss": 0.6673,
"step": 9950
},
{
"epoch": 1.41,
"learning_rate": 1.5871715173777903e-05,
"loss": 0.702,
"step": 10000
},
{
"epoch": 1.41,
"eval_loss": 0.8556445837020874,
"eval_runtime": 146.5096,
"eval_samples_per_second": 51.457,
"eval_steps_per_second": 6.436,
"step": 10000
},
{
"epoch": 1.42,
"learning_rate": 1.580107374964679e-05,
"loss": 0.7646,
"step": 10050
},
{
"epoch": 1.43,
"learning_rate": 1.5730432325515685e-05,
"loss": 0.7534,
"step": 10100
},
{
"epoch": 1.43,
"learning_rate": 1.5659790901384573e-05,
"loss": 0.6328,
"step": 10150
},
{
"epoch": 1.44,
"learning_rate": 1.558914947725346e-05,
"loss": 0.62,
"step": 10200
},
{
"epoch": 1.45,
"learning_rate": 1.551850805312235e-05,
"loss": 0.6525,
"step": 10250
},
{
"epoch": 1.46,
"learning_rate": 1.544786662899124e-05,
"loss": 0.7223,
"step": 10300
},
{
"epoch": 1.46,
"learning_rate": 1.537722520486013e-05,
"loss": 0.6904,
"step": 10350
},
{
"epoch": 1.47,
"learning_rate": 1.530658378072902e-05,
"loss": 0.6655,
"step": 10400
},
{
"epoch": 1.47,
"eval_loss": 0.8510493040084839,
"eval_runtime": 146.5188,
"eval_samples_per_second": 51.454,
"eval_steps_per_second": 6.436,
"step": 10400
},
{
"epoch": 1.48,
"learning_rate": 1.5235942356597909e-05,
"loss": 0.6835,
"step": 10450
},
{
"epoch": 1.48,
"learning_rate": 1.5165300932466798e-05,
"loss": 0.6766,
"step": 10500
},
{
"epoch": 1.49,
"learning_rate": 1.5094659508335689e-05,
"loss": 0.6127,
"step": 10550
},
{
"epoch": 1.5,
"learning_rate": 1.5024018084204578e-05,
"loss": 0.6755,
"step": 10600
},
{
"epoch": 1.5,
"learning_rate": 1.4953376660073466e-05,
"loss": 0.749,
"step": 10650
},
{
"epoch": 1.51,
"learning_rate": 1.4882735235942357e-05,
"loss": 0.6164,
"step": 10700
},
{
"epoch": 1.52,
"learning_rate": 1.4812093811811246e-05,
"loss": 0.6659,
"step": 10750
},
{
"epoch": 1.53,
"learning_rate": 1.4741452387680136e-05,
"loss": 0.6577,
"step": 10800
},
{
"epoch": 1.53,
"eval_loss": 0.8637193441390991,
"eval_runtime": 146.2948,
"eval_samples_per_second": 51.533,
"eval_steps_per_second": 6.446,
"step": 10800
},
{
"epoch": 1.53,
"learning_rate": 1.4670810963549025e-05,
"loss": 0.7381,
"step": 10850
},
{
"epoch": 1.54,
"learning_rate": 1.4600169539417916e-05,
"loss": 0.6131,
"step": 10900
},
{
"epoch": 1.55,
"learning_rate": 1.4529528115286804e-05,
"loss": 0.6697,
"step": 10950
},
{
"epoch": 1.55,
"learning_rate": 1.4458886691155695e-05,
"loss": 0.6415,
"step": 11000
},
{
"epoch": 1.56,
"learning_rate": 1.4388245267024582e-05,
"loss": 0.6193,
"step": 11050
},
{
"epoch": 1.57,
"learning_rate": 1.4317603842893473e-05,
"loss": 0.6082,
"step": 11100
},
{
"epoch": 1.58,
"learning_rate": 1.4246962418762363e-05,
"loss": 0.7267,
"step": 11150
},
{
"epoch": 1.58,
"learning_rate": 1.4176320994631252e-05,
"loss": 0.5725,
"step": 11200
},
{
"epoch": 1.58,
"eval_loss": 0.8456605076789856,
"eval_runtime": 146.5267,
"eval_samples_per_second": 51.451,
"eval_steps_per_second": 6.436,
"step": 11200
},
{
"epoch": 1.59,
"learning_rate": 1.4105679570500141e-05,
"loss": 0.6639,
"step": 11250
},
{
"epoch": 1.6,
"learning_rate": 1.4035038146369032e-05,
"loss": 0.6161,
"step": 11300
},
{
"epoch": 1.6,
"learning_rate": 1.396439672223792e-05,
"loss": 0.6083,
"step": 11350
},
{
"epoch": 1.61,
"learning_rate": 1.3893755298106811e-05,
"loss": 0.5692,
"step": 11400
},
{
"epoch": 1.62,
"learning_rate": 1.3823113873975699e-05,
"loss": 0.6962,
"step": 11450
},
{
"epoch": 1.62,
"learning_rate": 1.375247244984459e-05,
"loss": 0.6135,
"step": 11500
},
{
"epoch": 1.63,
"learning_rate": 1.3681831025713479e-05,
"loss": 0.6544,
"step": 11550
},
{
"epoch": 1.64,
"learning_rate": 1.3611189601582368e-05,
"loss": 0.6225,
"step": 11600
},
{
"epoch": 1.64,
"eval_loss": 0.8483865857124329,
"eval_runtime": 146.5858,
"eval_samples_per_second": 51.431,
"eval_steps_per_second": 6.433,
"step": 11600
},
{
"epoch": 1.65,
"learning_rate": 1.3540548177451258e-05,
"loss": 0.6596,
"step": 11650
},
{
"epoch": 1.65,
"learning_rate": 1.3469906753320149e-05,
"loss": 0.624,
"step": 11700
},
{
"epoch": 1.66,
"learning_rate": 1.3399265329189036e-05,
"loss": 0.6327,
"step": 11750
},
{
"epoch": 1.67,
"learning_rate": 1.3328623905057927e-05,
"loss": 0.6095,
"step": 11800
},
{
"epoch": 1.67,
"learning_rate": 1.3257982480926815e-05,
"loss": 0.6864,
"step": 11850
},
{
"epoch": 1.68,
"learning_rate": 1.3187341056795706e-05,
"loss": 0.7451,
"step": 11900
},
{
"epoch": 1.69,
"learning_rate": 1.3116699632664595e-05,
"loss": 0.6118,
"step": 11950
},
{
"epoch": 1.7,
"learning_rate": 1.3046058208533485e-05,
"loss": 0.6571,
"step": 12000
},
{
"epoch": 1.7,
"eval_loss": 0.8351559638977051,
"eval_runtime": 146.3219,
"eval_samples_per_second": 51.523,
"eval_steps_per_second": 6.445,
"step": 12000
},
{
"epoch": 1.7,
"learning_rate": 1.2975416784402374e-05,
"loss": 0.6203,
"step": 12050
},
{
"epoch": 1.71,
"learning_rate": 1.2904775360271263e-05,
"loss": 0.7048,
"step": 12100
},
{
"epoch": 1.72,
"learning_rate": 1.2834133936140153e-05,
"loss": 0.6577,
"step": 12150
},
{
"epoch": 1.72,
"learning_rate": 1.2763492512009044e-05,
"loss": 0.5403,
"step": 12200
},
{
"epoch": 1.73,
"learning_rate": 1.2692851087877931e-05,
"loss": 0.6941,
"step": 12250
},
{
"epoch": 1.74,
"learning_rate": 1.2622209663746822e-05,
"loss": 0.6098,
"step": 12300
},
{
"epoch": 1.74,
"learning_rate": 1.2551568239615712e-05,
"loss": 0.5796,
"step": 12350
},
{
"epoch": 1.75,
"learning_rate": 1.2480926815484601e-05,
"loss": 0.6302,
"step": 12400
},
{
"epoch": 1.75,
"eval_loss": 0.8377495408058167,
"eval_runtime": 146.3733,
"eval_samples_per_second": 51.505,
"eval_steps_per_second": 6.442,
"step": 12400
},
{
"epoch": 1.76,
"learning_rate": 1.241028539135349e-05,
"loss": 0.6653,
"step": 12450
},
{
"epoch": 1.77,
"learning_rate": 1.233964396722238e-05,
"loss": 0.5894,
"step": 12500
},
{
"epoch": 1.77,
"learning_rate": 1.2269002543091269e-05,
"loss": 0.6507,
"step": 12550
},
{
"epoch": 1.78,
"learning_rate": 1.219836111896016e-05,
"loss": 0.6557,
"step": 12600
},
{
"epoch": 1.79,
"learning_rate": 1.2127719694829047e-05,
"loss": 0.6185,
"step": 12650
},
{
"epoch": 1.79,
"learning_rate": 1.2057078270697937e-05,
"loss": 0.5889,
"step": 12700
},
{
"epoch": 1.8,
"learning_rate": 1.1986436846566828e-05,
"loss": 0.6514,
"step": 12750
},
{
"epoch": 1.81,
"learning_rate": 1.1915795422435715e-05,
"loss": 0.6231,
"step": 12800
},
{
"epoch": 1.81,
"eval_loss": 0.8342207670211792,
"eval_runtime": 146.3255,
"eval_samples_per_second": 51.522,
"eval_steps_per_second": 6.445,
"step": 12800
},
{
"epoch": 1.82,
"learning_rate": 1.1845153998304606e-05,
"loss": 0.7848,
"step": 12850
},
{
"epoch": 1.82,
"learning_rate": 1.1774512574173494e-05,
"loss": 0.762,
"step": 12900
},
{
"epoch": 1.83,
"learning_rate": 1.1703871150042385e-05,
"loss": 0.586,
"step": 12950
},
{
"epoch": 1.84,
"learning_rate": 1.1633229725911274e-05,
"loss": 0.6209,
"step": 13000
},
{
"epoch": 1.84,
"learning_rate": 1.1562588301780164e-05,
"loss": 0.6537,
"step": 13050
},
{
"epoch": 1.85,
"learning_rate": 1.1491946877649053e-05,
"loss": 0.5856,
"step": 13100
},
{
"epoch": 1.86,
"learning_rate": 1.1421305453517944e-05,
"loss": 0.7796,
"step": 13150
},
{
"epoch": 1.86,
"learning_rate": 1.1350664029386832e-05,
"loss": 0.5718,
"step": 13200
},
{
"epoch": 1.86,
"eval_loss": 0.8108465671539307,
"eval_runtime": 146.303,
"eval_samples_per_second": 51.53,
"eval_steps_per_second": 6.446,
"step": 13200
},
{
"epoch": 1.87,
"learning_rate": 1.1280022605255723e-05,
"loss": 0.6347,
"step": 13250
},
{
"epoch": 1.88,
"learning_rate": 1.120938118112461e-05,
"loss": 0.6508,
"step": 13300
},
{
"epoch": 1.89,
"learning_rate": 1.1138739756993501e-05,
"loss": 0.6238,
"step": 13350
},
{
"epoch": 1.89,
"learning_rate": 1.106809833286239e-05,
"loss": 0.6448,
"step": 13400
},
{
"epoch": 1.9,
"learning_rate": 1.099745690873128e-05,
"loss": 0.6704,
"step": 13450
},
{
"epoch": 1.91,
"learning_rate": 1.092681548460017e-05,
"loss": 0.574,
"step": 13500
},
{
"epoch": 1.91,
"learning_rate": 1.085617406046906e-05,
"loss": 0.7448,
"step": 13550
},
{
"epoch": 1.92,
"learning_rate": 1.0785532636337948e-05,
"loss": 0.6936,
"step": 13600
},
{
"epoch": 1.92,
"eval_loss": 0.8107577562332153,
"eval_runtime": 146.3517,
"eval_samples_per_second": 51.513,
"eval_steps_per_second": 6.443,
"step": 13600
},
{
"epoch": 1.93,
"learning_rate": 1.0714891212206839e-05,
"loss": 0.6312,
"step": 13650
},
{
"epoch": 1.94,
"learning_rate": 1.0644249788075727e-05,
"loss": 0.62,
"step": 13700
},
{
"epoch": 1.94,
"learning_rate": 1.0573608363944618e-05,
"loss": 0.686,
"step": 13750
},
{
"epoch": 1.95,
"learning_rate": 1.0502966939813507e-05,
"loss": 0.7155,
"step": 13800
},
{
"epoch": 1.96,
"learning_rate": 1.0432325515682396e-05,
"loss": 0.6248,
"step": 13850
},
{
"epoch": 1.96,
"learning_rate": 1.0361684091551286e-05,
"loss": 0.6356,
"step": 13900
},
{
"epoch": 1.97,
"learning_rate": 1.0291042667420175e-05,
"loss": 0.6264,
"step": 13950
},
{
"epoch": 1.98,
"learning_rate": 1.0220401243289064e-05,
"loss": 0.6367,
"step": 14000
},
{
"epoch": 1.98,
"eval_loss": 0.8052195906639099,
"eval_runtime": 146.3569,
"eval_samples_per_second": 51.511,
"eval_steps_per_second": 6.443,
"step": 14000
},
{
"epoch": 1.99,
"learning_rate": 1.0149759819157955e-05,
"loss": 0.5832,
"step": 14050
},
{
"epoch": 1.99,
"learning_rate": 1.0079118395026843e-05,
"loss": 0.7692,
"step": 14100
},
{
"epoch": 2.0,
"learning_rate": 1.0008476970895734e-05,
"loss": 0.6246,
"step": 14150
},
{
"epoch": 2.01,
"learning_rate": 9.937835546764623e-06,
"loss": 0.562,
"step": 14200
},
{
"epoch": 2.01,
"learning_rate": 9.867194122633513e-06,
"loss": 0.6346,
"step": 14250
},
{
"epoch": 2.02,
"learning_rate": 9.796552698502402e-06,
"loss": 0.6442,
"step": 14300
},
{
"epoch": 2.03,
"learning_rate": 9.725911274371291e-06,
"loss": 0.534,
"step": 14350
},
{
"epoch": 2.03,
"learning_rate": 9.65526985024018e-06,
"loss": 0.5806,
"step": 14400
},
{
"epoch": 2.03,
"eval_loss": 0.815026044845581,
"eval_runtime": 146.3828,
"eval_samples_per_second": 51.502,
"eval_steps_per_second": 6.442,
"step": 14400
},
{
"epoch": 2.04,
"learning_rate": 9.584628426109072e-06,
"loss": 0.5669,
"step": 14450
},
{
"epoch": 2.05,
"learning_rate": 9.51398700197796e-06,
"loss": 0.5238,
"step": 14500
},
{
"epoch": 2.06,
"learning_rate": 9.44334557784685e-06,
"loss": 0.6376,
"step": 14550
},
{
"epoch": 2.06,
"learning_rate": 9.37270415371574e-06,
"loss": 0.6301,
"step": 14600
},
{
"epoch": 2.07,
"learning_rate": 9.302062729584629e-06,
"loss": 0.5822,
"step": 14650
},
{
"epoch": 2.08,
"learning_rate": 9.231421305453518e-06,
"loss": 0.533,
"step": 14700
},
{
"epoch": 2.08,
"learning_rate": 9.160779881322408e-06,
"loss": 0.5156,
"step": 14750
},
{
"epoch": 2.09,
"learning_rate": 9.090138457191297e-06,
"loss": 0.6329,
"step": 14800
},
{
"epoch": 2.09,
"eval_loss": 0.8057652711868286,
"eval_runtime": 146.3678,
"eval_samples_per_second": 51.507,
"eval_steps_per_second": 6.443,
"step": 14800
},
{
"epoch": 2.1,
"learning_rate": 9.019497033060188e-06,
"loss": 0.555,
"step": 14850
},
{
"epoch": 2.11,
"learning_rate": 8.948855608929076e-06,
"loss": 0.546,
"step": 14900
},
{
"epoch": 2.11,
"learning_rate": 8.878214184797967e-06,
"loss": 0.6075,
"step": 14950
},
{
"epoch": 2.12,
"learning_rate": 8.807572760666856e-06,
"loss": 0.5868,
"step": 15000
},
{
"epoch": 2.13,
"learning_rate": 8.736931336535745e-06,
"loss": 0.6193,
"step": 15050
},
{
"epoch": 2.13,
"learning_rate": 8.666289912404635e-06,
"loss": 0.6052,
"step": 15100
},
{
"epoch": 2.14,
"learning_rate": 8.595648488273524e-06,
"loss": 0.6021,
"step": 15150
},
{
"epoch": 2.15,
"learning_rate": 8.525007064142413e-06,
"loss": 0.5913,
"step": 15200
},
{
"epoch": 2.15,
"eval_loss": 0.8025009036064148,
"eval_runtime": 146.3582,
"eval_samples_per_second": 51.511,
"eval_steps_per_second": 6.443,
"step": 15200
},
{
"epoch": 2.15,
"learning_rate": 8.454365640011303e-06,
"loss": 0.587,
"step": 15250
},
{
"epoch": 2.16,
"learning_rate": 8.383724215880192e-06,
"loss": 0.5106,
"step": 15300
},
{
"epoch": 2.17,
"learning_rate": 8.313082791749081e-06,
"loss": 0.5606,
"step": 15350
},
{
"epoch": 2.18,
"learning_rate": 8.242441367617972e-06,
"loss": 0.5843,
"step": 15400
},
{
"epoch": 2.18,
"learning_rate": 8.17179994348686e-06,
"loss": 0.5879,
"step": 15450
},
{
"epoch": 2.19,
"learning_rate": 8.101158519355751e-06,
"loss": 0.5804,
"step": 15500
},
{
"epoch": 2.2,
"learning_rate": 8.030517095224639e-06,
"loss": 0.5332,
"step": 15550
},
{
"epoch": 2.2,
"learning_rate": 7.95987567109353e-06,
"loss": 0.5891,
"step": 15600
},
{
"epoch": 2.2,
"eval_loss": 0.8068730235099792,
"eval_runtime": 146.6368,
"eval_samples_per_second": 51.413,
"eval_steps_per_second": 6.431,
"step": 15600
},
{
"epoch": 2.21,
"learning_rate": 7.889234246962419e-06,
"loss": 0.6205,
"step": 15650
},
{
"epoch": 2.22,
"learning_rate": 7.818592822831308e-06,
"loss": 0.5775,
"step": 15700
},
{
"epoch": 2.23,
"learning_rate": 7.747951398700198e-06,
"loss": 0.5515,
"step": 15750
},
{
"epoch": 2.23,
"learning_rate": 7.677309974569087e-06,
"loss": 0.6581,
"step": 15800
},
{
"epoch": 2.24,
"learning_rate": 7.606668550437977e-06,
"loss": 0.5393,
"step": 15850
},
{
"epoch": 2.25,
"learning_rate": 7.536027126306867e-06,
"loss": 0.6666,
"step": 15900
},
{
"epoch": 2.25,
"learning_rate": 7.4653857021757565e-06,
"loss": 0.6101,
"step": 15950
},
{
"epoch": 2.26,
"learning_rate": 7.394744278044646e-06,
"loss": 0.5265,
"step": 16000
},
{
"epoch": 2.26,
"eval_loss": 0.7969786524772644,
"eval_runtime": 146.4757,
"eval_samples_per_second": 51.469,
"eval_steps_per_second": 6.438,
"step": 16000
},
{
"epoch": 2.27,
"learning_rate": 7.324102853913535e-06,
"loss": 0.5849,
"step": 16050
},
{
"epoch": 2.27,
"learning_rate": 7.2534614297824245e-06,
"loss": 0.4989,
"step": 16100
},
{
"epoch": 2.28,
"learning_rate": 7.182820005651315e-06,
"loss": 0.5835,
"step": 16150
},
{
"epoch": 2.29,
"learning_rate": 7.112178581520204e-06,
"loss": 0.598,
"step": 16200
},
{
"epoch": 2.3,
"learning_rate": 7.041537157389093e-06,
"loss": 0.5479,
"step": 16250
},
{
"epoch": 2.3,
"learning_rate": 6.970895733257983e-06,
"loss": 0.6026,
"step": 16300
},
{
"epoch": 2.31,
"learning_rate": 6.900254309126873e-06,
"loss": 0.4896,
"step": 16350
},
{
"epoch": 2.32,
"learning_rate": 6.829612884995762e-06,
"loss": 0.6014,
"step": 16400
},
{
"epoch": 2.32,
"eval_loss": 0.7993112802505493,
"eval_runtime": 146.4005,
"eval_samples_per_second": 51.496,
"eval_steps_per_second": 6.441,
"step": 16400
},
{
"epoch": 2.32,
"learning_rate": 6.7589714608646515e-06,
"loss": 0.6562,
"step": 16450
},
{
"epoch": 2.33,
"learning_rate": 6.688330036733541e-06,
"loss": 0.5241,
"step": 16500
},
{
"epoch": 2.34,
"learning_rate": 6.61768861260243e-06,
"loss": 0.6455,
"step": 16550
},
{
"epoch": 2.35,
"learning_rate": 6.5470471884713194e-06,
"loss": 0.5524,
"step": 16600
},
{
"epoch": 2.35,
"learning_rate": 6.476405764340209e-06,
"loss": 0.5883,
"step": 16650
},
{
"epoch": 2.36,
"learning_rate": 6.405764340209098e-06,
"loss": 0.6005,
"step": 16700
},
{
"epoch": 2.37,
"learning_rate": 6.335122916077988e-06,
"loss": 0.6611,
"step": 16750
},
{
"epoch": 2.37,
"learning_rate": 6.264481491946878e-06,
"loss": 0.5392,
"step": 16800
},
{
"epoch": 2.37,
"eval_loss": 0.8024877905845642,
"eval_runtime": 146.2872,
"eval_samples_per_second": 51.536,
"eval_steps_per_second": 6.446,
"step": 16800
},
{
"epoch": 2.38,
"learning_rate": 6.193840067815767e-06,
"loss": 0.5831,
"step": 16850
},
{
"epoch": 2.39,
"learning_rate": 6.123198643684656e-06,
"loss": 0.5662,
"step": 16900
},
{
"epoch": 2.39,
"learning_rate": 6.052557219553546e-06,
"loss": 0.5643,
"step": 16950
},
{
"epoch": 2.4,
"learning_rate": 5.981915795422436e-06,
"loss": 0.6554,
"step": 17000
},
{
"epoch": 2.41,
"learning_rate": 5.911274371291325e-06,
"loss": 0.5664,
"step": 17050
},
{
"epoch": 2.42,
"learning_rate": 5.840632947160214e-06,
"loss": 0.586,
"step": 17100
},
{
"epoch": 2.42,
"learning_rate": 5.7699915230291046e-06,
"loss": 0.53,
"step": 17150
},
{
"epoch": 2.43,
"learning_rate": 5.699350098897994e-06,
"loss": 0.6073,
"step": 17200
},
{
"epoch": 2.43,
"eval_loss": 0.7982571125030518,
"eval_runtime": 146.4741,
"eval_samples_per_second": 51.47,
"eval_steps_per_second": 6.438,
"step": 17200
},
{
"epoch": 2.44,
"learning_rate": 5.628708674766883e-06,
"loss": 0.5988,
"step": 17250
},
{
"epoch": 2.44,
"learning_rate": 5.5580672506357725e-06,
"loss": 0.5899,
"step": 17300
},
{
"epoch": 2.45,
"learning_rate": 5.487425826504663e-06,
"loss": 0.5709,
"step": 17350
},
{
"epoch": 2.46,
"learning_rate": 5.416784402373552e-06,
"loss": 0.6798,
"step": 17400
},
{
"epoch": 2.47,
"learning_rate": 5.346142978242441e-06,
"loss": 0.6366,
"step": 17450
},
{
"epoch": 2.47,
"learning_rate": 5.275501554111331e-06,
"loss": 0.678,
"step": 17500
},
{
"epoch": 2.48,
"learning_rate": 5.204860129980221e-06,
"loss": 0.5061,
"step": 17550
},
{
"epoch": 2.49,
"learning_rate": 5.13421870584911e-06,
"loss": 0.5536,
"step": 17600
},
{
"epoch": 2.49,
"eval_loss": 0.7959641814231873,
"eval_runtime": 146.5594,
"eval_samples_per_second": 51.44,
"eval_steps_per_second": 6.434,
"step": 17600
},
{
"epoch": 2.49,
"learning_rate": 5.0635772817179995e-06,
"loss": 0.6453,
"step": 17650
},
{
"epoch": 2.5,
"learning_rate": 4.992935857586889e-06,
"loss": 0.5388,
"step": 17700
},
{
"epoch": 2.51,
"learning_rate": 4.922294433455779e-06,
"loss": 0.5864,
"step": 17750
},
{
"epoch": 2.51,
"learning_rate": 4.851653009324668e-06,
"loss": 0.5365,
"step": 17800
},
{
"epoch": 2.52,
"learning_rate": 4.781011585193558e-06,
"loss": 0.5472,
"step": 17850
},
{
"epoch": 2.53,
"learning_rate": 4.710370161062447e-06,
"loss": 0.6661,
"step": 17900
},
{
"epoch": 2.54,
"learning_rate": 4.639728736931336e-06,
"loss": 0.6971,
"step": 17950
},
{
"epoch": 2.54,
"learning_rate": 4.5690873128002265e-06,
"loss": 0.5196,
"step": 18000
},
{
"epoch": 2.54,
"eval_loss": 0.7836597561836243,
"eval_runtime": 146.6328,
"eval_samples_per_second": 51.414,
"eval_steps_per_second": 6.431,
"step": 18000
},
{
"epoch": 2.55,
"learning_rate": 4.498445888669116e-06,
"loss": 0.6394,
"step": 18050
},
{
"epoch": 2.56,
"learning_rate": 4.427804464538005e-06,
"loss": 0.4884,
"step": 18100
},
{
"epoch": 2.56,
"learning_rate": 4.3571630404068945e-06,
"loss": 0.514,
"step": 18150
},
{
"epoch": 2.57,
"learning_rate": 4.286521616275785e-06,
"loss": 0.5209,
"step": 18200
},
{
"epoch": 2.58,
"learning_rate": 4.215880192144674e-06,
"loss": 0.5669,
"step": 18250
},
{
"epoch": 2.59,
"learning_rate": 4.145238768013563e-06,
"loss": 0.5224,
"step": 18300
},
{
"epoch": 2.59,
"learning_rate": 4.074597343882453e-06,
"loss": 0.5607,
"step": 18350
},
{
"epoch": 2.6,
"learning_rate": 4.003955919751343e-06,
"loss": 0.5776,
"step": 18400
},
{
"epoch": 2.6,
"eval_loss": 0.7881141901016235,
"eval_runtime": 146.3758,
"eval_samples_per_second": 51.504,
"eval_steps_per_second": 6.442,
"step": 18400
},
{
"epoch": 2.61,
"learning_rate": 3.933314495620232e-06,
"loss": 0.5759,
"step": 18450
},
{
"epoch": 2.61,
"learning_rate": 3.8626730714891214e-06,
"loss": 0.5871,
"step": 18500
},
{
"epoch": 2.62,
"learning_rate": 3.7920316473580103e-06,
"loss": 0.5992,
"step": 18550
},
{
"epoch": 2.63,
"learning_rate": 3.7213902232269e-06,
"loss": 0.5106,
"step": 18600
},
{
"epoch": 2.63,
"learning_rate": 3.65074879909579e-06,
"loss": 0.5211,
"step": 18650
},
{
"epoch": 2.64,
"learning_rate": 3.580107374964679e-06,
"loss": 0.5794,
"step": 18700
},
{
"epoch": 2.65,
"learning_rate": 3.509465950833569e-06,
"loss": 0.6017,
"step": 18750
},
{
"epoch": 2.66,
"learning_rate": 3.4388245267024582e-06,
"loss": 0.5981,
"step": 18800
},
{
"epoch": 2.66,
"eval_loss": 0.7939748167991638,
"eval_runtime": 146.5246,
"eval_samples_per_second": 51.452,
"eval_steps_per_second": 6.436,
"step": 18800
},
{
"epoch": 2.66,
"learning_rate": 3.368183102571348e-06,
"loss": 0.6937,
"step": 18850
},
{
"epoch": 2.67,
"learning_rate": 3.2975416784402373e-06,
"loss": 0.6696,
"step": 18900
},
{
"epoch": 2.68,
"learning_rate": 3.226900254309127e-06,
"loss": 0.639,
"step": 18950
},
{
"epoch": 2.68,
"learning_rate": 3.1562588301780164e-06,
"loss": 0.5565,
"step": 19000
},
{
"epoch": 2.69,
"learning_rate": 3.085617406046906e-06,
"loss": 0.4692,
"step": 19050
},
{
"epoch": 2.7,
"learning_rate": 3.0149759819157954e-06,
"loss": 0.5703,
"step": 19100
},
{
"epoch": 2.71,
"learning_rate": 2.944334557784685e-06,
"loss": 0.4944,
"step": 19150
},
{
"epoch": 2.71,
"learning_rate": 2.8736931336535745e-06,
"loss": 0.6101,
"step": 19200
},
{
"epoch": 2.71,
"eval_loss": 0.7930362224578857,
"eval_runtime": 146.3443,
"eval_samples_per_second": 51.515,
"eval_steps_per_second": 6.444,
"step": 19200
},
{
"epoch": 2.72,
"learning_rate": 2.8030517095224643e-06,
"loss": 0.59,
"step": 19250
},
{
"epoch": 2.73,
"learning_rate": 2.7324102853913536e-06,
"loss": 0.5799,
"step": 19300
},
{
"epoch": 2.73,
"learning_rate": 2.6617688612602433e-06,
"loss": 0.5569,
"step": 19350
},
{
"epoch": 2.74,
"learning_rate": 2.5911274371291327e-06,
"loss": 0.5548,
"step": 19400
},
{
"epoch": 2.75,
"learning_rate": 2.5204860129980224e-06,
"loss": 0.5544,
"step": 19450
},
{
"epoch": 2.76,
"learning_rate": 2.4498445888669117e-06,
"loss": 0.6427,
"step": 19500
},
{
"epoch": 2.76,
"learning_rate": 2.3792031647358015e-06,
"loss": 0.5027,
"step": 19550
},
{
"epoch": 2.77,
"learning_rate": 2.308561740604691e-06,
"loss": 0.5696,
"step": 19600
},
{
"epoch": 2.77,
"eval_loss": 0.7890626192092896,
"eval_runtime": 146.5527,
"eval_samples_per_second": 51.442,
"eval_steps_per_second": 6.435,
"step": 19600
},
{
"epoch": 2.78,
"learning_rate": 2.2379203164735806e-06,
"loss": 0.509,
"step": 19650
},
{
"epoch": 2.78,
"learning_rate": 2.1672788923424695e-06,
"loss": 0.5078,
"step": 19700
},
{
"epoch": 2.79,
"learning_rate": 2.096637468211359e-06,
"loss": 0.563,
"step": 19750
},
{
"epoch": 2.8,
"learning_rate": 2.0259960440802485e-06,
"loss": 0.6039,
"step": 19800
},
{
"epoch": 2.8,
"learning_rate": 1.955354619949138e-06,
"loss": 0.5289,
"step": 19850
},
{
"epoch": 2.81,
"learning_rate": 1.8847131958180278e-06,
"loss": 0.5394,
"step": 19900
},
{
"epoch": 2.82,
"learning_rate": 1.8140717716869174e-06,
"loss": 0.6539,
"step": 19950
},
{
"epoch": 2.83,
"learning_rate": 1.743430347555807e-06,
"loss": 0.563,
"step": 20000
},
{
"epoch": 2.83,
"eval_loss": 0.7903943657875061,
"eval_runtime": 146.7425,
"eval_samples_per_second": 51.376,
"eval_steps_per_second": 6.426,
"step": 20000
},
{
"epoch": 2.83,
"learning_rate": 1.6727889234246962e-06,
"loss": 0.5043,
"step": 20050
},
{
"epoch": 2.84,
"learning_rate": 1.6021474992935858e-06,
"loss": 0.5243,
"step": 20100
},
{
"epoch": 2.85,
"learning_rate": 1.5315060751624753e-06,
"loss": 0.6144,
"step": 20150
},
{
"epoch": 2.85,
"learning_rate": 1.4608646510313648e-06,
"loss": 0.5324,
"step": 20200
},
{
"epoch": 2.86,
"learning_rate": 1.3902232269002544e-06,
"loss": 0.5494,
"step": 20250
},
{
"epoch": 2.87,
"learning_rate": 1.319581802769144e-06,
"loss": 0.5302,
"step": 20300
},
{
"epoch": 2.88,
"learning_rate": 1.2489403786380335e-06,
"loss": 0.6458,
"step": 20350
},
{
"epoch": 2.88,
"learning_rate": 1.178298954506923e-06,
"loss": 0.5361,
"step": 20400
},
{
"epoch": 2.88,
"eval_loss": 0.7929331660270691,
"eval_runtime": 146.7924,
"eval_samples_per_second": 51.358,
"eval_steps_per_second": 6.424,
"step": 20400
},
{
"epoch": 2.89,
"learning_rate": 1.1076575303758125e-06,
"loss": 0.5857,
"step": 20450
},
{
"epoch": 2.9,
"learning_rate": 1.037016106244702e-06,
"loss": 0.5542,
"step": 20500
},
{
"epoch": 2.9,
"learning_rate": 9.663746821135916e-07,
"loss": 0.6382,
"step": 20550
},
{
"epoch": 2.91,
"learning_rate": 8.957332579824809e-07,
"loss": 0.5708,
"step": 20600
},
{
"epoch": 2.92,
"learning_rate": 8.250918338513705e-07,
"loss": 0.5726,
"step": 20650
},
{
"epoch": 2.92,
"learning_rate": 7.5445040972026e-07,
"loss": 0.5438,
"step": 20700
},
{
"epoch": 2.93,
"learning_rate": 6.838089855891495e-07,
"loss": 0.5833,
"step": 20750
},
{
"epoch": 2.94,
"learning_rate": 6.13167561458039e-07,
"loss": 0.5654,
"step": 20800
},
{
"epoch": 2.94,
"eval_loss": 0.7940900325775146,
"eval_runtime": 146.3661,
"eval_samples_per_second": 51.508,
"eval_steps_per_second": 6.443,
"step": 20800
},
{
"epoch": 2.95,
"learning_rate": 5.425261373269285e-07,
"loss": 0.5341,
"step": 20850
},
{
"epoch": 2.95,
"learning_rate": 4.7188471319581804e-07,
"loss": 0.5934,
"step": 20900
},
{
"epoch": 2.96,
"learning_rate": 4.012432890647076e-07,
"loss": 0.6293,
"step": 20950
},
{
"epoch": 2.97,
"learning_rate": 3.3060186493359706e-07,
"loss": 0.6117,
"step": 21000
},
{
"epoch": 2.97,
"learning_rate": 2.5996044080248655e-07,
"loss": 0.602,
"step": 21050
},
{
"epoch": 2.98,
"learning_rate": 1.8931901667137608e-07,
"loss": 0.538,
"step": 21100
},
{
"epoch": 2.99,
"learning_rate": 1.1867759254026561e-07,
"loss": 0.5409,
"step": 21150
},
{
"epoch": 3.0,
"learning_rate": 4.803616840915513e-08,
"loss": 0.5808,
"step": 21200
},
{
"epoch": 3.0,
"eval_loss": 0.7935702800750732,
"eval_runtime": 146.4206,
"eval_samples_per_second": 51.489,
"eval_steps_per_second": 6.44,
"step": 21200
},
{
"epoch": 3.0,
"step": 21234,
"total_flos": 9.26174715028439e+16,
"train_loss": 0.9690902242330864,
"train_runtime": 18755.4157,
"train_samples_per_second": 9.056,
"train_steps_per_second": 1.132
}
],
"logging_steps": 50,
"max_steps": 21234,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 400,
"total_flos": 9.26174715028439e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}