{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.8989898989899,
"eval_steps": 500,
"global_step": 490,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.020202020202020204,
"grad_norm": 1.1391378054420909,
"learning_rate": 4.081632653061224e-06,
"loss": 2.5995,
"step": 1
},
{
"epoch": 0.10101010101010101,
"grad_norm": 1.006731604503432,
"learning_rate": 2.0408163265306123e-05,
"loss": 2.5925,
"step": 5
},
{
"epoch": 0.20202020202020202,
"grad_norm": 1.3965084950072466,
"learning_rate": 4.0816326530612245e-05,
"loss": 2.546,
"step": 10
},
{
"epoch": 0.30303030303030304,
"grad_norm": 0.59095362852847,
"learning_rate": 6.122448979591838e-05,
"loss": 2.396,
"step": 15
},
{
"epoch": 0.40404040404040403,
"grad_norm": 0.33359551466461584,
"learning_rate": 8.163265306122449e-05,
"loss": 2.2744,
"step": 20
},
{
"epoch": 0.5050505050505051,
"grad_norm": 0.3767673243983956,
"learning_rate": 0.00010204081632653062,
"loss": 2.1608,
"step": 25
},
{
"epoch": 0.6060606060606061,
"grad_norm": 0.3530777092096336,
"learning_rate": 0.00012244897959183676,
"loss": 2.0261,
"step": 30
},
{
"epoch": 0.7070707070707071,
"grad_norm": 0.36168305575388426,
"learning_rate": 0.00014285714285714287,
"loss": 2.0091,
"step": 35
},
{
"epoch": 0.8080808080808081,
"grad_norm": 0.2764545304686734,
"learning_rate": 0.00016326530612244898,
"loss": 1.9434,
"step": 40
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.2653033039849191,
"learning_rate": 0.00018367346938775512,
"loss": 1.8735,
"step": 45
},
{
"epoch": 0.98989898989899,
"eval_loss": 1.8325327634811401,
"eval_runtime": 177.2293,
"eval_samples_per_second": 35.666,
"eval_steps_per_second": 2.234,
"step": 49
},
{
"epoch": 1.0101010101010102,
"grad_norm": 0.2170406034599688,
"learning_rate": 0.00019999746258949147,
"loss": 1.8679,
"step": 50
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.2414932638214735,
"learning_rate": 0.00019990866674170983,
"loss": 1.8705,
"step": 55
},
{
"epoch": 1.2121212121212122,
"grad_norm": 0.280526285390802,
"learning_rate": 0.00019969312910817183,
"loss": 1.8428,
"step": 60
},
{
"epoch": 1.3131313131313131,
"grad_norm": 0.5716516166032067,
"learning_rate": 0.000199351123114852,
"loss": 1.8267,
"step": 65
},
{
"epoch": 1.4141414141414141,
"grad_norm": 0.22548657696350605,
"learning_rate": 0.00019888308262251285,
"loss": 1.7959,
"step": 70
},
{
"epoch": 1.5151515151515151,
"grad_norm": 0.2400943754848952,
"learning_rate": 0.00019828960137631928,
"loss": 1.8328,
"step": 75
},
{
"epoch": 1.6161616161616161,
"grad_norm": 0.24338845667140263,
"learning_rate": 0.00019757143225262728,
"loss": 1.8287,
"step": 80
},
{
"epoch": 1.7171717171717171,
"grad_norm": 0.23391106368008455,
"learning_rate": 0.00019672948630390294,
"loss": 1.8345,
"step": 85
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.27223367547235244,
"learning_rate": 0.00019576483160298246,
"loss": 1.7731,
"step": 90
},
{
"epoch": 1.9191919191919191,
"grad_norm": 0.25522514087403636,
"learning_rate": 0.00019467869188814023,
"loss": 1.8231,
"step": 95
},
{
"epoch": 2.0,
"eval_loss": 1.7238675355911255,
"eval_runtime": 175.3209,
"eval_samples_per_second": 36.054,
"eval_steps_per_second": 2.259,
"step": 99
},
{
"epoch": 2.0202020202020203,
"grad_norm": 0.24156163537353126,
"learning_rate": 0.00019347244501068312,
"loss": 1.8199,
"step": 100
},
{
"epoch": 2.121212121212121,
"grad_norm": 0.26738911600323706,
"learning_rate": 0.00019214762118704076,
"loss": 1.7554,
"step": 105
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.2974142479913874,
"learning_rate": 0.000190705901057569,
"loss": 1.7693,
"step": 110
},
{
"epoch": 2.323232323232323,
"grad_norm": 0.32489025521491743,
"learning_rate": 0.00018914911355452895,
"loss": 1.7036,
"step": 115
},
{
"epoch": 2.4242424242424243,
"grad_norm": 0.34564309759210704,
"learning_rate": 0.00018747923358194662,
"loss": 1.7449,
"step": 120
},
{
"epoch": 2.525252525252525,
"grad_norm": 0.33251849318291843,
"learning_rate": 0.00018569837951029595,
"loss": 1.7556,
"step": 125
},
{
"epoch": 2.6262626262626263,
"grad_norm": 0.33565250510381917,
"learning_rate": 0.00018380881048918405,
"loss": 1.744,
"step": 130
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.3427001935365706,
"learning_rate": 0.00018181292358144703,
"loss": 1.7234,
"step": 135
},
{
"epoch": 2.8282828282828283,
"grad_norm": 0.32846296076937864,
"learning_rate": 0.00017971325072229226,
"loss": 1.7274,
"step": 140
},
{
"epoch": 2.929292929292929,
"grad_norm": 0.3446715050022125,
"learning_rate": 0.0001775124555073452,
"loss": 1.7516,
"step": 145
},
{
"epoch": 2.98989898989899,
"eval_loss": 1.6329888105392456,
"eval_runtime": 174.9738,
"eval_samples_per_second": 36.125,
"eval_steps_per_second": 2.263,
"step": 148
},
{
"epoch": 3.0303030303030303,
"grad_norm": 0.34753378836184123,
"learning_rate": 0.0001752133298136744,
"loss": 1.7442,
"step": 150
},
{
"epoch": 3.1313131313131315,
"grad_norm": 0.3899145091665638,
"learning_rate": 0.0001728187902580819,
"loss": 1.6414,
"step": 155
},
{
"epoch": 3.2323232323232323,
"grad_norm": 0.3969944429695798,
"learning_rate": 0.00017033187449715196,
"loss": 1.6411,
"step": 160
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.4463802224093316,
"learning_rate": 0.00016775573737375096,
"loss": 1.6955,
"step": 165
},
{
"epoch": 3.4343434343434343,
"grad_norm": 0.4873799041554826,
"learning_rate": 0.0001650936469148681,
"loss": 1.6812,
"step": 170
},
{
"epoch": 3.5353535353535355,
"grad_norm": 0.5138644486787001,
"learning_rate": 0.00016234898018587337,
"loss": 1.6455,
"step": 175
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.4441989179255284,
"learning_rate": 0.00015952521900645144,
"loss": 1.6537,
"step": 180
},
{
"epoch": 3.7373737373737375,
"grad_norm": 0.45397642246696135,
"learning_rate": 0.0001566259455336474,
"loss": 1.6384,
"step": 185
},
{
"epoch": 3.8383838383838382,
"grad_norm": 0.48522658034874977,
"learning_rate": 0.0001536548377176263,
"loss": 1.6292,
"step": 190
},
{
"epoch": 3.9393939393939394,
"grad_norm": 0.43244857762556244,
"learning_rate": 0.0001506156646359123,
"loss": 1.6586,
"step": 195
},
{
"epoch": 4.0,
"eval_loss": 1.5279655456542969,
"eval_runtime": 175.1053,
"eval_samples_per_second": 36.098,
"eval_steps_per_second": 2.261,
"step": 198
},
{
"epoch": 4.040404040404041,
"grad_norm": 0.48231588190167396,
"learning_rate": 0.0001475122817120253,
"loss": 1.6137,
"step": 200
},
{
"epoch": 4.141414141414141,
"grad_norm": 0.5842989060014684,
"learning_rate": 0.00014434862582458135,
"loss": 1.5082,
"step": 205
},
{
"epoch": 4.242424242424242,
"grad_norm": 0.5870996767325578,
"learning_rate": 0.00014112871031306119,
"loss": 1.5382,
"step": 210
},
{
"epoch": 4.343434343434343,
"grad_norm": 0.6294490520638103,
"learning_rate": 0.0001378566198865818,
"loss": 1.5738,
"step": 215
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.6361554344671604,
"learning_rate": 0.00013453650544213076,
"loss": 1.5609,
"step": 220
},
{
"epoch": 4.545454545454545,
"grad_norm": 0.5845910737228225,
"learning_rate": 0.00013117257879883583,
"loss": 1.5832,
"step": 225
},
{
"epoch": 4.646464646464646,
"grad_norm": 0.6362570401491278,
"learning_rate": 0.00012776910735495003,
"loss": 1.5386,
"step": 230
},
{
"epoch": 4.747474747474747,
"grad_norm": 0.6079381787055775,
"learning_rate": 0.0001243304086743309,
"loss": 1.5408,
"step": 235
},
{
"epoch": 4.848484848484849,
"grad_norm": 0.5955494164961348,
"learning_rate": 0.0001208608450092801,
"loss": 1.5767,
"step": 240
},
{
"epoch": 4.94949494949495,
"grad_norm": 0.5941973746172844,
"learning_rate": 0.00011736481776669306,
"loss": 1.5571,
"step": 245
},
{
"epoch": 4.98989898989899,
"eval_loss": 1.4166467189788818,
"eval_runtime": 174.6193,
"eval_samples_per_second": 36.199,
"eval_steps_per_second": 2.268,
"step": 247
},
{
"epoch": 5.05050505050505,
"grad_norm": 0.6955112160268645,
"learning_rate": 0.0001138467619245374,
"loss": 1.5011,
"step": 250
},
{
"epoch": 5.151515151515151,
"grad_norm": 0.7116916227562953,
"learning_rate": 0.00011031114040574437,
"loss": 1.4537,
"step": 255
},
{
"epoch": 5.252525252525253,
"grad_norm": 0.8295579161716972,
"learning_rate": 0.0001067624384166495,
"loss": 1.398,
"step": 260
},
{
"epoch": 5.353535353535354,
"grad_norm": 0.7415551092257379,
"learning_rate": 0.00010320515775716555,
"loss": 1.4474,
"step": 265
},
{
"epoch": 5.454545454545454,
"grad_norm": 0.7957507416152227,
"learning_rate": 9.96438111099047e-05,
"loss": 1.4459,
"step": 270
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.8098108632452509,
"learning_rate": 9.608291631549574e-05,
"loss": 1.4266,
"step": 275
},
{
"epoch": 5.656565656565657,
"grad_norm": 0.8498743613190896,
"learning_rate": 9.252699064135758e-05,
"loss": 1.3931,
"step": 280
},
{
"epoch": 5.757575757575758,
"grad_norm": 0.7968761297367668,
"learning_rate": 8.898054505119989e-05,
"loss": 1.4628,
"step": 285
},
{
"epoch": 5.858585858585858,
"grad_norm": 0.8166566096084199,
"learning_rate": 8.54480784825207e-05,
"loss": 1.4777,
"step": 290
},
{
"epoch": 5.959595959595959,
"grad_norm": 0.7564583944169918,
"learning_rate": 8.193407213936012e-05,
"loss": 1.4677,
"step": 295
},
{
"epoch": 6.0,
"eval_loss": 1.3067700862884521,
"eval_runtime": 175.1357,
"eval_samples_per_second": 36.092,
"eval_steps_per_second": 2.261,
"step": 297
},
{
"epoch": 6.0606060606060606,
"grad_norm": 0.8369827825158762,
"learning_rate": 7.844298380755003e-05,
"loss": 1.375,
"step": 300
},
{
"epoch": 6.161616161616162,
"grad_norm": 0.9204188282340791,
"learning_rate": 7.497924219967209e-05,
"loss": 1.2999,
"step": 305
},
{
"epoch": 6.262626262626263,
"grad_norm": 0.9559880600184892,
"learning_rate": 7.154724133689677e-05,
"loss": 1.3084,
"step": 310
},
{
"epoch": 6.363636363636363,
"grad_norm": 0.9272296702059781,
"learning_rate": 6.815133497483157e-05,
"loss": 1.3405,
"step": 315
},
{
"epoch": 6.4646464646464645,
"grad_norm": 1.0203421193696094,
"learning_rate": 6.479583108044899e-05,
"loss": 1.3165,
"step": 320
},
{
"epoch": 6.565656565656566,
"grad_norm": 0.8932381508297077,
"learning_rate": 6.148498636710092e-05,
"loss": 1.3641,
"step": 325
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.9527012454845684,
"learning_rate": 5.822300089455211e-05,
"loss": 1.3179,
"step": 330
},
{
"epoch": 6.767676767676767,
"grad_norm": 0.9644270167383292,
"learning_rate": 5.5014012740883115e-05,
"loss": 1.3295,
"step": 335
},
{
"epoch": 6.8686868686868685,
"grad_norm": 0.9489303492473159,
"learning_rate": 5.1862092753021754e-05,
"loss": 1.3482,
"step": 340
},
{
"epoch": 6.96969696969697,
"grad_norm": 0.9417366559193787,
"learning_rate": 4.8771239382562287e-05,
"loss": 1.3422,
"step": 345
},
{
"epoch": 6.98989898989899,
"eval_loss": 1.2082042694091797,
"eval_runtime": 174.4886,
"eval_samples_per_second": 36.226,
"eval_steps_per_second": 2.269,
"step": 346
},
{
"epoch": 7.070707070707071,
"grad_norm": 1.0189885797060951,
"learning_rate": 4.574537361342407e-05,
"loss": 1.2447,
"step": 350
},
{
"epoch": 7.171717171717171,
"grad_norm": 1.0456878645941505,
"learning_rate": 4.278833398778306e-05,
"loss": 1.2438,
"step": 355
},
{
"epoch": 7.2727272727272725,
"grad_norm": 1.0906515200546398,
"learning_rate": 3.990387173658774e-05,
"loss": 1.2135,
"step": 360
},
{
"epoch": 7.373737373737374,
"grad_norm": 1.1138045736907602,
"learning_rate": 3.7095646020835754e-05,
"loss": 1.2152,
"step": 365
},
{
"epoch": 7.474747474747475,
"grad_norm": 1.1333935442018617,
"learning_rate": 3.436721928964819e-05,
"loss": 1.2004,
"step": 370
},
{
"epoch": 7.575757575757576,
"grad_norm": 1.0135992096066218,
"learning_rate": 3.172205276103033e-05,
"loss": 1.1904,
"step": 375
},
{
"epoch": 7.6767676767676765,
"grad_norm": 1.0811091792166911,
"learning_rate": 2.916350203105207e-05,
"loss": 1.2475,
"step": 380
},
{
"epoch": 7.777777777777778,
"grad_norm": 1.1722915377984628,
"learning_rate": 2.669481281701739e-05,
"loss": 1.2273,
"step": 385
},
{
"epoch": 7.878787878787879,
"grad_norm": 1.0151820296117031,
"learning_rate": 2.4319116840023813e-05,
"loss": 1.2462,
"step": 390
},
{
"epoch": 7.97979797979798,
"grad_norm": 1.0359433658999384,
"learning_rate": 2.2039427852134788e-05,
"loss": 1.2609,
"step": 395
},
{
"epoch": 8.0,
"eval_loss": 1.137781023979187,
"eval_runtime": 163.6859,
"eval_samples_per_second": 38.617,
"eval_steps_per_second": 2.419,
"step": 396
},
{
"epoch": 8.080808080808081,
"grad_norm": 1.0914414519615843,
"learning_rate": 1.985863781320435e-05,
"loss": 1.1457,
"step": 400
},
{
"epoch": 8.181818181818182,
"grad_norm": 1.2849174669504693,
"learning_rate": 1.777951322220508e-05,
"loss": 1.1925,
"step": 405
},
{
"epoch": 8.282828282828282,
"grad_norm": 1.0562037397284274,
"learning_rate": 1.580469160771253e-05,
"loss": 1.1653,
"step": 410
},
{
"epoch": 8.383838383838384,
"grad_norm": 1.1942325172166053,
"learning_rate": 1.3936678181998374e-05,
"loss": 1.1451,
"step": 415
},
{
"epoch": 8.484848484848484,
"grad_norm": 1.2292184186394104,
"learning_rate": 1.2177842662977135e-05,
"loss": 1.1432,
"step": 420
},
{
"epoch": 8.585858585858587,
"grad_norm": 1.1449254109310076,
"learning_rate": 1.0530416268037702e-05,
"loss": 1.1459,
"step": 425
},
{
"epoch": 8.686868686868687,
"grad_norm": 1.1159137674762092,
"learning_rate": 8.99648888357335e-06,
"loss": 1.1889,
"step": 430
},
{
"epoch": 8.787878787878787,
"grad_norm": 1.1893818134430183,
"learning_rate": 7.578006413801075e-06,
"loss": 1.1809,
"step": 435
},
{
"epoch": 8.88888888888889,
"grad_norm": 1.131890459862098,
"learning_rate": 6.276768312233228e-06,
"loss": 1.1806,
"step": 440
},
{
"epoch": 8.98989898989899,
"grad_norm": 1.0926795618787801,
"learning_rate": 5.094425298933136e-06,
"loss": 1.1647,
"step": 445
},
{
"epoch": 8.98989898989899,
"eval_loss": 1.107386827468872,
"eval_runtime": 163.223,
"eval_samples_per_second": 38.726,
"eval_steps_per_second": 2.426,
"step": 445
},
{
"epoch": 9.090909090909092,
"grad_norm": 1.1051037332814697,
"learning_rate": 4.0324772664503296e-06,
"loss": 1.1438,
"step": 450
},
{
"epoch": 9.191919191919192,
"grad_norm": 1.164376038164282,
"learning_rate": 3.092271377092215e-06,
"loss": 1.1481,
"step": 455
},
{
"epoch": 9.292929292929292,
"grad_norm": 1.2303081966513765,
"learning_rate": 2.2750003539455998e-06,
"loss": 1.1202,
"step": 460
},
{
"epoch": 9.393939393939394,
"grad_norm": 1.2010643624794166,
"learning_rate": 1.5817009678162685e-06,
"loss": 1.142,
"step": 465
},
{
"epoch": 9.494949494949495,
"grad_norm": 1.2423558300638782,
"learning_rate": 1.013252722005842e-06,
"loss": 1.1842,
"step": 470
},
{
"epoch": 9.595959595959595,
"grad_norm": 1.1980002179676799,
"learning_rate": 5.703767365946466e-07,
"loss": 1.1236,
"step": 475
},
{
"epoch": 9.696969696969697,
"grad_norm": 1.2034300162155251,
"learning_rate": 2.536348336456551e-07,
"loss": 1.1168,
"step": 480
},
{
"epoch": 9.797979797979798,
"grad_norm": 1.2022297984086214,
"learning_rate": 6.342882449029696e-08,
"loss": 1.1133,
"step": 485
},
{
"epoch": 9.8989898989899,
"grad_norm": 1.0699752057421905,
"learning_rate": 0.0,
"loss": 1.1571,
"step": 490
},
{
"epoch": 9.8989898989899,
"eval_loss": 1.1029597520828247,
"eval_runtime": 163.4283,
"eval_samples_per_second": 38.678,
"eval_steps_per_second": 2.423,
"step": 490
},
{
"epoch": 9.8989898989899,
"step": 490,
"total_flos": 2344635780825088.0,
"train_loss": 1.5290005391957808,
"train_runtime": 5413.3076,
"train_samples_per_second": 11.677,
"train_steps_per_second": 0.091
}
],
"logging_steps": 5,
"max_steps": 490,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2344635780825088.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}