zephyr-sft-timedial / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998272884283247,
"eval_steps": 500,
"global_step": 289,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.46875,
"learning_rate": 6.896551724137932e-06,
"loss": 1.9268,
"step": 1
},
{
"epoch": 0.02,
"grad_norm": 2.078125,
"learning_rate": 3.4482758620689657e-05,
"loss": 1.9116,
"step": 5
},
{
"epoch": 0.03,
"grad_norm": 1.390625,
"learning_rate": 6.896551724137931e-05,
"loss": 1.8301,
"step": 10
},
{
"epoch": 0.05,
"grad_norm": 1.421875,
"learning_rate": 0.00010344827586206898,
"loss": 1.5573,
"step": 15
},
{
"epoch": 0.07,
"grad_norm": 1.21875,
"learning_rate": 0.00013793103448275863,
"loss": 1.4594,
"step": 20
},
{
"epoch": 0.09,
"grad_norm": 1.2734375,
"learning_rate": 0.00017241379310344826,
"loss": 1.2235,
"step": 25
},
{
"epoch": 0.1,
"grad_norm": 1.4140625,
"learning_rate": 0.00019999270008556108,
"loss": 1.2842,
"step": 30
},
{
"epoch": 0.12,
"grad_norm": 1.03125,
"learning_rate": 0.00019973731496914914,
"loss": 1.3727,
"step": 35
},
{
"epoch": 0.14,
"grad_norm": 1.328125,
"learning_rate": 0.00019911799920659093,
"loss": 1.3235,
"step": 40
},
{
"epoch": 0.16,
"grad_norm": 1.2421875,
"learning_rate": 0.00019813701261394136,
"loss": 1.3308,
"step": 45
},
{
"epoch": 0.17,
"grad_norm": 1.0,
"learning_rate": 0.00019679793470489228,
"loss": 1.3697,
"step": 50
},
{
"epoch": 0.19,
"grad_norm": 1.1328125,
"learning_rate": 0.00019510565162951537,
"loss": 1.323,
"step": 55
},
{
"epoch": 0.21,
"grad_norm": 1.1796875,
"learning_rate": 0.00019306633834523024,
"loss": 1.3125,
"step": 60
},
{
"epoch": 0.22,
"grad_norm": 0.98046875,
"learning_rate": 0.00019068743608505455,
"loss": 1.3512,
"step": 65
},
{
"epoch": 0.24,
"grad_norm": 1.0,
"learning_rate": 0.00018797762520535177,
"loss": 1.334,
"step": 70
},
{
"epoch": 0.26,
"grad_norm": 1.1171875,
"learning_rate": 0.0001849467935121521,
"loss": 1.0914,
"step": 75
},
{
"epoch": 0.28,
"grad_norm": 1.1015625,
"learning_rate": 0.0001816060001816205,
"loss": 1.2314,
"step": 80
},
{
"epoch": 0.29,
"grad_norm": 1.015625,
"learning_rate": 0.00017796743540632223,
"loss": 1.1808,
"step": 85
},
{
"epoch": 0.31,
"grad_norm": 1.09375,
"learning_rate": 0.00017404437591453235,
"loss": 1.2491,
"step": 90
},
{
"epoch": 0.33,
"grad_norm": 1.1796875,
"learning_rate": 0.00016985113652489374,
"loss": 1.2451,
"step": 95
},
{
"epoch": 0.35,
"grad_norm": 1.3515625,
"learning_rate": 0.00016540301791319645,
"loss": 1.1333,
"step": 100
},
{
"epoch": 0.36,
"grad_norm": 1.484375,
"learning_rate": 0.00016071625078187114,
"loss": 1.1685,
"step": 105
},
{
"epoch": 0.38,
"grad_norm": 1.1171875,
"learning_rate": 0.00015580793663591585,
"loss": 1.1954,
"step": 110
},
{
"epoch": 0.4,
"grad_norm": 1.015625,
"learning_rate": 0.00015069598538135906,
"loss": 1.2303,
"step": 115
},
{
"epoch": 0.41,
"grad_norm": 1.1171875,
"learning_rate": 0.00014539904997395468,
"loss": 1.3181,
"step": 120
},
{
"epoch": 0.43,
"grad_norm": 1.125,
"learning_rate": 0.00013993645835656953,
"loss": 1.1347,
"step": 125
},
{
"epoch": 0.45,
"grad_norm": 1.140625,
"learning_rate": 0.00013432814293361584,
"loss": 1.2001,
"step": 130
},
{
"epoch": 0.47,
"grad_norm": 0.97265625,
"learning_rate": 0.00012859456783986893,
"loss": 1.2733,
"step": 135
},
{
"epoch": 0.48,
"grad_norm": 1.1015625,
"learning_rate": 0.000122756654269059,
"loss": 1.2653,
"step": 140
},
{
"epoch": 0.5,
"grad_norm": 1.1015625,
"learning_rate": 0.00011683570413470383,
"loss": 1.2328,
"step": 145
},
{
"epoch": 0.52,
"grad_norm": 0.98046875,
"learning_rate": 0.00011085332234173664,
"loss": 1.1522,
"step": 150
},
{
"epoch": 0.54,
"grad_norm": 1.2265625,
"learning_rate": 0.00010483133795255071,
"loss": 1.1529,
"step": 155
},
{
"epoch": 0.55,
"grad_norm": 1.34375,
"learning_rate": 9.879172453511827e-05,
"loss": 1.0412,
"step": 160
},
{
"epoch": 0.57,
"grad_norm": 1.2421875,
"learning_rate": 9.275651998382377e-05,
"loss": 1.1489,
"step": 165
},
{
"epoch": 0.59,
"grad_norm": 1.015625,
"learning_rate": 8.674774610557728e-05,
"loss": 1.2629,
"step": 170
},
{
"epoch": 0.6,
"grad_norm": 1.2421875,
"learning_rate": 8.078732826462915e-05,
"loss": 1.0591,
"step": 175
},
{
"epoch": 0.62,
"grad_norm": 1.421875,
"learning_rate": 7.489701537929384e-05,
"loss": 1.2508,
"step": 180
},
{
"epoch": 0.64,
"grad_norm": 1.2734375,
"learning_rate": 6.909830056250527e-05,
"loss": 1.2073,
"step": 185
},
{
"epoch": 0.66,
"grad_norm": 1.375,
"learning_rate": 6.341234269577879e-05,
"loss": 1.09,
"step": 190
},
{
"epoch": 0.67,
"grad_norm": 1.4921875,
"learning_rate": 5.785988922274711e-05,
"loss": 1.1575,
"step": 195
},
{
"epoch": 0.69,
"grad_norm": 1.34375,
"learning_rate": 5.246120044398839e-05,
"loss": 1.1168,
"step": 200
},
{
"epoch": 0.71,
"grad_norm": 1.2421875,
"learning_rate": 4.723597558938672e-05,
"loss": 1.1412,
"step": 205
},
{
"epoch": 0.73,
"grad_norm": 1.1875,
"learning_rate": 4.220328093777851e-05,
"loss": 1.1943,
"step": 210
},
{
"epoch": 0.74,
"grad_norm": 1.1171875,
"learning_rate": 3.738148024616863e-05,
"loss": 1.1862,
"step": 215
},
{
"epoch": 0.76,
"grad_norm": 1.0859375,
"learning_rate": 3.2788167742372725e-05,
"loss": 1.0806,
"step": 220
},
{
"epoch": 0.78,
"grad_norm": 1.203125,
"learning_rate": 2.84401039255879e-05,
"loss": 1.2544,
"step": 225
},
{
"epoch": 0.79,
"grad_norm": 1.3984375,
"learning_rate": 2.4353154409148637e-05,
"loss": 1.1946,
"step": 230
},
{
"epoch": 0.81,
"grad_norm": 1.3984375,
"learning_rate": 2.0542232028624586e-05,
"loss": 1.1809,
"step": 235
},
{
"epoch": 0.83,
"grad_norm": 1.3515625,
"learning_rate": 1.7021242426500493e-05,
"loss": 1.2461,
"step": 240
},
{
"epoch": 0.85,
"grad_norm": 1.203125,
"learning_rate": 1.3803033311995072e-05,
"loss": 0.9889,
"step": 245
},
{
"epoch": 0.86,
"grad_norm": 1.2109375,
"learning_rate": 1.0899347581163221e-05,
"loss": 1.2301,
"step": 250
},
{
"epoch": 0.88,
"grad_norm": 1.5078125,
"learning_rate": 8.32078046834176e-06,
"loss": 1.0813,
"step": 255
},
{
"epoch": 0.9,
"grad_norm": 1.1953125,
"learning_rate": 6.076740885288479e-06,
"loss": 1.0638,
"step": 260
},
{
"epoch": 0.92,
"grad_norm": 1.3359375,
"learning_rate": 4.175417089083378e-06,
"loss": 1.1524,
"step": 265
},
{
"epoch": 0.93,
"grad_norm": 1.0859375,
"learning_rate": 2.6237468040666512e-06,
"loss": 1.2414,
"step": 270
},
{
"epoch": 0.95,
"grad_norm": 1.3359375,
"learning_rate": 1.4273919068349184e-06,
"loss": 1.1618,
"step": 275
},
{
"epoch": 0.97,
"grad_norm": 1.421875,
"learning_rate": 5.907177666674812e-07,
"loss": 1.1204,
"step": 280
},
{
"epoch": 0.98,
"grad_norm": 1.2421875,
"learning_rate": 1.1677731676733584e-07,
"loss": 1.0432,
"step": 285
},
{
"epoch": 1.0,
"eval_loss": 1.1134740114212036,
"eval_runtime": 21.1932,
"eval_samples_per_second": 13.636,
"eval_steps_per_second": 1.746,
"step": 289
},
{
"epoch": 1.0,
"step": 289,
"total_flos": 1.887089628230451e+16,
"train_loss": 1.2325792506491848,
"train_runtime": 253.7515,
"train_samples_per_second": 4.56,
"train_steps_per_second": 1.139
}
],
"logging_steps": 5,
"max_steps": 289,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 1.887089628230451e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}