{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 390,
"global_step": 390,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0128,
"grad_norm": 182.65389468504085,
"learning_rate": 5.000000000000001e-07,
"loss": 3.0348,
"step": 5
},
{
"epoch": 0.0256,
"grad_norm": 139.49962686394534,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.3592,
"step": 10
},
{
"epoch": 0.0384,
"grad_norm": 73.38280389729819,
"learning_rate": 1.5e-06,
"loss": 0.8854,
"step": 15
},
{
"epoch": 0.0512,
"grad_norm": 10.581257887011294,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.437,
"step": 20
},
{
"epoch": 0.064,
"grad_norm": 19.483047427550837,
"learning_rate": 2.5e-06,
"loss": 0.2943,
"step": 25
},
{
"epoch": 0.0768,
"grad_norm": 11.913519176859392,
"learning_rate": 3e-06,
"loss": 0.2626,
"step": 30
},
{
"epoch": 0.0896,
"grad_norm": 11.341258695250447,
"learning_rate": 3.5e-06,
"loss": 0.234,
"step": 35
},
{
"epoch": 0.1024,
"grad_norm": 7.3736324646601865,
"learning_rate": 4.000000000000001e-06,
"loss": 0.2023,
"step": 40
},
{
"epoch": 0.1152,
"grad_norm": 8.384878179412805,
"learning_rate": 4.5e-06,
"loss": 0.1772,
"step": 45
},
{
"epoch": 0.128,
"grad_norm": 4.321321712106481,
"learning_rate": 5e-06,
"loss": 0.1648,
"step": 50
},
{
"epoch": 0.1408,
"grad_norm": 8.223137316430781,
"learning_rate": 5.500000000000001e-06,
"loss": 0.1712,
"step": 55
},
{
"epoch": 0.1536,
"grad_norm": 5.267777408485924,
"learning_rate": 6e-06,
"loss": 0.1414,
"step": 60
},
{
"epoch": 0.1664,
"grad_norm": 6.820714261557992,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.1356,
"step": 65
},
{
"epoch": 0.1792,
"grad_norm": 10.169624218399154,
"learning_rate": 7e-06,
"loss": 0.1307,
"step": 70
},
{
"epoch": 0.192,
"grad_norm": 4.648811703272445,
"learning_rate": 7.500000000000001e-06,
"loss": 0.1272,
"step": 75
},
{
"epoch": 0.2048,
"grad_norm": 7.8160709736421365,
"learning_rate": 8.000000000000001e-06,
"loss": 0.1238,
"step": 80
},
{
"epoch": 0.2176,
"grad_norm": 4.6578435324125005,
"learning_rate": 8.5e-06,
"loss": 0.1188,
"step": 85
},
{
"epoch": 0.2304,
"grad_norm": 4.334284814503559,
"learning_rate": 9e-06,
"loss": 0.1102,
"step": 90
},
{
"epoch": 0.2432,
"grad_norm": 5.185815667957919,
"learning_rate": 9.5e-06,
"loss": 0.1321,
"step": 95
},
{
"epoch": 0.256,
"grad_norm": 5.055357962960289,
"learning_rate": 1e-05,
"loss": 0.1152,
"step": 100
},
{
"epoch": 0.2688,
"grad_norm": 5.024069288121602,
"learning_rate": 1.0500000000000001e-05,
"loss": 0.1226,
"step": 105
},
{
"epoch": 0.2816,
"grad_norm": 7.4125001237167805,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.1178,
"step": 110
},
{
"epoch": 0.2944,
"grad_norm": 6.3618508498418835,
"learning_rate": 1.15e-05,
"loss": 0.1149,
"step": 115
},
{
"epoch": 0.3072,
"grad_norm": 6.469859734653322,
"learning_rate": 1.2e-05,
"loss": 0.1246,
"step": 120
},
{
"epoch": 0.32,
"grad_norm": 5.006561942877804,
"learning_rate": 1.25e-05,
"loss": 0.2708,
"step": 125
},
{
"epoch": 0.3328,
"grad_norm": 3.986948414918879,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.1547,
"step": 130
},
{
"epoch": 0.3456,
"grad_norm": 9.389803413840562,
"learning_rate": 1.3500000000000001e-05,
"loss": 0.1285,
"step": 135
},
{
"epoch": 0.3584,
"grad_norm": 11.72955750893183,
"learning_rate": 1.4e-05,
"loss": 0.1346,
"step": 140
},
{
"epoch": 0.3712,
"grad_norm": 5.134775487996449,
"learning_rate": 1.45e-05,
"loss": 0.1382,
"step": 145
},
{
"epoch": 0.384,
"grad_norm": 4.474102477655389,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.127,
"step": 150
},
{
"epoch": 0.3968,
"grad_norm": 2.9780337882613246,
"learning_rate": 1.55e-05,
"loss": 0.1232,
"step": 155
},
{
"epoch": 0.4096,
"grad_norm": 4.373151697292325,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.1251,
"step": 160
},
{
"epoch": 0.4224,
"grad_norm": 9.660073045187783,
"learning_rate": 1.65e-05,
"loss": 0.1256,
"step": 165
},
{
"epoch": 0.4352,
"grad_norm": 6.853478399209637,
"learning_rate": 1.7e-05,
"loss": 0.1414,
"step": 170
},
{
"epoch": 0.448,
"grad_norm": 8.677517234598955,
"learning_rate": 1.7500000000000002e-05,
"loss": 0.206,
"step": 175
},
{
"epoch": 0.4608,
"grad_norm": 8.651004571305064,
"learning_rate": 1.8e-05,
"loss": 0.1425,
"step": 180
},
{
"epoch": 0.4736,
"grad_norm": 5.901558970919691,
"learning_rate": 1.8500000000000002e-05,
"loss": 0.1207,
"step": 185
},
{
"epoch": 0.4864,
"grad_norm": 3.882452425752908,
"learning_rate": 1.9e-05,
"loss": 0.1184,
"step": 190
},
{
"epoch": 0.4992,
"grad_norm": 10.279050422228723,
"learning_rate": 1.95e-05,
"loss": 0.1266,
"step": 195
},
{
"epoch": 0.512,
"grad_norm": 156.76899002893614,
"learning_rate": 2e-05,
"loss": 0.2276,
"step": 200
},
{
"epoch": 0.5248,
"grad_norm": 5.748361151055656,
"learning_rate": 1.9473684210526318e-05,
"loss": 0.2735,
"step": 205
},
{
"epoch": 0.5376,
"grad_norm": 6.707674047117148,
"learning_rate": 1.894736842105263e-05,
"loss": 0.2056,
"step": 210
},
{
"epoch": 0.5504,
"grad_norm": 4.28932948719806,
"learning_rate": 1.8421052631578947e-05,
"loss": 0.1361,
"step": 215
},
{
"epoch": 0.5632,
"grad_norm": 3.9017098971521986,
"learning_rate": 1.7894736842105264e-05,
"loss": 0.1207,
"step": 220
},
{
"epoch": 0.576,
"grad_norm": 6.2162092103600814,
"learning_rate": 1.736842105263158e-05,
"loss": 0.1317,
"step": 225
},
{
"epoch": 0.5888,
"grad_norm": 5.854326003876861,
"learning_rate": 1.6842105263157896e-05,
"loss": 0.1445,
"step": 230
},
{
"epoch": 0.6016,
"grad_norm": 6.246048863269034,
"learning_rate": 1.6315789473684213e-05,
"loss": 0.1171,
"step": 235
},
{
"epoch": 0.6144,
"grad_norm": 3.7109059300429044,
"learning_rate": 1.578947368421053e-05,
"loss": 0.1141,
"step": 240
},
{
"epoch": 0.6272,
"grad_norm": 3.5668255484420706,
"learning_rate": 1.5263157894736846e-05,
"loss": 0.1037,
"step": 245
},
{
"epoch": 0.64,
"grad_norm": 4.782059977111824,
"learning_rate": 1.4736842105263159e-05,
"loss": 0.1106,
"step": 250
},
{
"epoch": 0.6528,
"grad_norm": 10.5984194077511,
"learning_rate": 1.4210526315789475e-05,
"loss": 0.1117,
"step": 255
},
{
"epoch": 0.6656,
"grad_norm": 3.779958670519677,
"learning_rate": 1.3684210526315791e-05,
"loss": 0.1117,
"step": 260
},
{
"epoch": 0.6784,
"grad_norm": 5.911614916259078,
"learning_rate": 1.3157894736842108e-05,
"loss": 0.1354,
"step": 265
},
{
"epoch": 0.6912,
"grad_norm": 5.80946707419217,
"learning_rate": 1.263157894736842e-05,
"loss": 0.1169,
"step": 270
},
{
"epoch": 0.704,
"grad_norm": 3.2969117252565776,
"learning_rate": 1.2105263157894737e-05,
"loss": 0.106,
"step": 275
},
{
"epoch": 0.7168,
"grad_norm": 5.038146795065825,
"learning_rate": 1.1578947368421053e-05,
"loss": 0.1133,
"step": 280
},
{
"epoch": 0.7296,
"grad_norm": 6.738857671001296,
"learning_rate": 1.105263157894737e-05,
"loss": 0.1077,
"step": 285
},
{
"epoch": 0.7424,
"grad_norm": 4.382581837531665,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.1176,
"step": 290
},
{
"epoch": 0.7552,
"grad_norm": 4.792805538486606,
"learning_rate": 1e-05,
"loss": 0.1035,
"step": 295
},
{
"epoch": 0.768,
"grad_norm": 4.972924462435217,
"learning_rate": 9.473684210526315e-06,
"loss": 0.1087,
"step": 300
},
{
"epoch": 0.7808,
"grad_norm": 15.861511630609764,
"learning_rate": 8.947368421052632e-06,
"loss": 0.1009,
"step": 305
},
{
"epoch": 0.7936,
"grad_norm": 7.880322698758499,
"learning_rate": 8.421052631578948e-06,
"loss": 0.1,
"step": 310
},
{
"epoch": 0.8064,
"grad_norm": 3.4154322142376197,
"learning_rate": 7.894736842105265e-06,
"loss": 0.0992,
"step": 315
},
{
"epoch": 0.8192,
"grad_norm": 4.078488845111544,
"learning_rate": 7.368421052631579e-06,
"loss": 0.1054,
"step": 320
},
{
"epoch": 0.832,
"grad_norm": 3.5309378515758643,
"learning_rate": 6.842105263157896e-06,
"loss": 0.0974,
"step": 325
},
{
"epoch": 0.8448,
"grad_norm": 6.343576179177006,
"learning_rate": 6.31578947368421e-06,
"loss": 0.1005,
"step": 330
},
{
"epoch": 0.8576,
"grad_norm": 3.801584889448848,
"learning_rate": 5.789473684210527e-06,
"loss": 0.089,
"step": 335
},
{
"epoch": 0.8704,
"grad_norm": 6.129470969543439,
"learning_rate": 5.263157894736842e-06,
"loss": 0.0909,
"step": 340
},
{
"epoch": 0.8832,
"grad_norm": 5.965460894326214,
"learning_rate": 4.736842105263158e-06,
"loss": 0.0901,
"step": 345
},
{
"epoch": 0.896,
"grad_norm": 2.642223202809459,
"learning_rate": 4.210526315789474e-06,
"loss": 0.0932,
"step": 350
},
{
"epoch": 0.9088,
"grad_norm": 3.9893969211639755,
"learning_rate": 3.6842105263157896e-06,
"loss": 0.0821,
"step": 355
},
{
"epoch": 0.9216,
"grad_norm": 4.071378981195404,
"learning_rate": 3.157894736842105e-06,
"loss": 0.0908,
"step": 360
},
{
"epoch": 0.9344,
"grad_norm": 3.7877745428251592,
"learning_rate": 2.631578947368421e-06,
"loss": 0.0835,
"step": 365
},
{
"epoch": 0.9472,
"grad_norm": 5.175552389302117,
"learning_rate": 2.105263157894737e-06,
"loss": 0.0812,
"step": 370
},
{
"epoch": 0.96,
"grad_norm": 3.966653575974842,
"learning_rate": 1.5789473684210526e-06,
"loss": 0.0713,
"step": 375
},
{
"epoch": 0.9728,
"grad_norm": 4.135076374239712,
"learning_rate": 1.0526315789473685e-06,
"loss": 0.0751,
"step": 380
},
{
"epoch": 0.9856,
"grad_norm": 2.978106249425422,
"learning_rate": 5.263157894736843e-07,
"loss": 0.0753,
"step": 385
},
{
"epoch": 0.9984,
"grad_norm": 3.437868997060425,
"learning_rate": 0.0,
"loss": 0.0659,
"step": 390
},
{
"epoch": 0.9984,
"eval_accuracy": 0.007717635564240352,
"eval_loss": 0.0636017844080925,
"eval_runtime": 11.9633,
"eval_samples_per_second": 83.589,
"eval_steps_per_second": 2.675,
"step": 390
},
{
"epoch": 0.9984,
"logic_eval_extr_stps.D-0.answer_accuracy": 0.1111111111111111,
"logic_eval_extr_stps.D-0.proof_accuracy.zero_one": 0.1111111111111111,
"logic_eval_extr_stps.D-0.rouge1": 15.764555555555555,
"logic_eval_extr_stps.D-0.rouge2": 1.5705666666666667,
"logic_eval_extr_stps.D-0.rougeL": 15.764555555555555,
"logic_eval_extr_stps.D-0.rougeLsum": 15.764555555555555,
"logic_eval_extr_stps.D-1.answer_accuracy": 0.11764705882352941,
"logic_eval_extr_stps.D-1.proof_accuracy.zero_one": 0.0,
"logic_eval_extr_stps.D-1.rouge1": 7.244564705882353,
"logic_eval_extr_stps.D-1.rouge2": 0.6124411764705883,
"logic_eval_extr_stps.D-1.rougeL": 6.963717647058823,
"logic_eval_extr_stps.D-1.rougeLsum": 7.244564705882353,
"logic_eval_extr_stps.D-2.answer_accuracy": 0.45454545454545453,
"logic_eval_extr_stps.D-2.proof_accuracy.zero_one": 0.22727272727272727,
"logic_eval_extr_stps.D-2.rouge1": 21.115345454545455,
"logic_eval_extr_stps.D-2.rouge2": 7.12185,
"logic_eval_extr_stps.D-2.rougeL": 20.353104545454546,
"logic_eval_extr_stps.D-2.rougeLsum": 21.03720909090909,
"logic_eval_extr_stps.D-3.answer_accuracy": 0.23809523809523808,
"logic_eval_extr_stps.D-3.proof_accuracy.zero_one": 0.047619047619047616,
"logic_eval_extr_stps.D-3.rouge1": 10.493052380952381,
"logic_eval_extr_stps.D-3.rouge2": 2.243357142857143,
"logic_eval_extr_stps.D-3.rougeL": 9.783571428571431,
"logic_eval_extr_stps.D-3.rougeLsum": 10.493052380952381,
"logic_eval_extr_stps.D-None.answer_accuracy": 0.15625,
"logic_eval_extr_stps.D-None.proof_accuracy.zero_one": 0.15625,
"logic_eval_extr_stps.D-None.rouge1": 16.7680875,
"logic_eval_extr_stps.D-None.rouge2": 0.0,
"logic_eval_extr_stps.D-None.rougeL": 16.7680875,
"logic_eval_extr_stps.D-None.rougeLsum": 16.7680875,
"logic_eval_extr_stps.D-all.answer_accuracy": 0.22772277227722773,
"logic_eval_extr_stps.D-all.proof_accuracy.zero_one": 0.1188118811881188,
"logic_eval_extr_stps.D-all.rouge1": 14.71791188118812,
"logic_eval_extr_stps.D-all.rouge2": 2.260770297029703,
"logic_eval_extr_stps.D-all.rougeL": 14.357092079207925,
"logic_eval_extr_stps.D-all.rougeLsum": 14.700892079207922,
"logic_eval_gen_len": 354.359375,
"logic_eval_runtime": 335.0776,
"logic_eval_samples_per_second": 0.301,
"logic_eval_steps_per_second": 0.012,
"logic_eval_strct.D-0.answer_accuracy": 0.1111111111111111,
"logic_eval_strct.D-0.proof_accuracy.zero_one": 0.1111111111111111,
"logic_eval_strct.D-0.rouge1": 15.764555555555555,
"logic_eval_strct.D-0.rouge2": 1.5705666666666667,
"logic_eval_strct.D-0.rougeL": 15.764555555555555,
"logic_eval_strct.D-0.rougeLsum": 15.764555555555555,
"logic_eval_strct.D-1.answer_accuracy": 0.11764705882352941,
"logic_eval_strct.D-1.proof_accuracy.zero_one": 0.0,
"logic_eval_strct.D-1.rouge1": 7.244564705882353,
"logic_eval_strct.D-1.rouge2": 0.6124411764705883,
"logic_eval_strct.D-1.rougeL": 6.963717647058823,
"logic_eval_strct.D-1.rougeLsum": 7.244564705882353,
"logic_eval_strct.D-2.answer_accuracy": 0.45454545454545453,
"logic_eval_strct.D-2.proof_accuracy.zero_one": 0.13636363636363635,
"logic_eval_strct.D-2.rouge1": 21.115345454545455,
"logic_eval_strct.D-2.rouge2": 7.12185,
"logic_eval_strct.D-2.rougeL": 20.353104545454546,
"logic_eval_strct.D-2.rougeLsum": 21.03720909090909,
"logic_eval_strct.D-3.answer_accuracy": 0.23809523809523808,
"logic_eval_strct.D-3.proof_accuracy.zero_one": 0.047619047619047616,
"logic_eval_strct.D-3.rouge1": 10.493052380952381,
"logic_eval_strct.D-3.rouge2": 2.243357142857143,
"logic_eval_strct.D-3.rougeL": 9.783571428571431,
"logic_eval_strct.D-3.rougeLsum": 10.493052380952381,
"logic_eval_strct.D-None.answer_accuracy": 0.15625,
"logic_eval_strct.D-None.proof_accuracy.zero_one": 0.15625,
"logic_eval_strct.D-None.rouge1": 16.7680875,
"logic_eval_strct.D-None.rouge2": 0.0,
"logic_eval_strct.D-None.rougeL": 16.7680875,
"logic_eval_strct.D-None.rougeLsum": 16.7680875,
"logic_eval_strct.D-all.answer_accuracy": 0.22772277227722773,
"logic_eval_strct.D-all.proof_accuracy.zero_one": 0.09900990099009901,
"logic_eval_strct.D-all.rouge1": 14.71791188118812,
"logic_eval_strct.D-all.rouge2": 2.260770297029703,
"logic_eval_strct.D-all.rougeL": 14.357092079207925,
"logic_eval_strct.D-all.rougeLsum": 14.700892079207922,
"step": 390
}
],
"logging_steps": 5,
"max_steps": 390,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 390,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 326527573032960.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}