|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.8989898989899, |
|
"eval_steps": 500, |
|
"global_step": 490, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.020202020202020204, |
|
"grad_norm": 1.1391378054420909, |
|
"learning_rate": 4.081632653061224e-06, |
|
"loss": 2.5995, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.10101010101010101, |
|
"grad_norm": 1.006731604503432, |
|
"learning_rate": 2.0408163265306123e-05, |
|
"loss": 2.5925, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.20202020202020202, |
|
"grad_norm": 1.3965084950072466, |
|
"learning_rate": 4.0816326530612245e-05, |
|
"loss": 2.546, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.30303030303030304, |
|
"grad_norm": 0.59095362852847, |
|
"learning_rate": 6.122448979591838e-05, |
|
"loss": 2.396, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.40404040404040403, |
|
"grad_norm": 0.33359551466461584, |
|
"learning_rate": 8.163265306122449e-05, |
|
"loss": 2.2744, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.5050505050505051, |
|
"grad_norm": 0.3767673243983956, |
|
"learning_rate": 0.00010204081632653062, |
|
"loss": 2.1608, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.6060606060606061, |
|
"grad_norm": 0.3530777092096336, |
|
"learning_rate": 0.00012244897959183676, |
|
"loss": 2.0261, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.7070707070707071, |
|
"grad_norm": 0.36168305575388426, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 2.0091, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.8080808080808081, |
|
"grad_norm": 0.2764545304686734, |
|
"learning_rate": 0.00016326530612244898, |
|
"loss": 1.9434, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.2653033039849191, |
|
"learning_rate": 0.00018367346938775512, |
|
"loss": 1.8735, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.98989898989899, |
|
"eval_loss": 1.8325327634811401, |
|
"eval_runtime": 177.2293, |
|
"eval_samples_per_second": 35.666, |
|
"eval_steps_per_second": 2.234, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.0101010101010102, |
|
"grad_norm": 0.2170406034599688, |
|
"learning_rate": 0.00019999746258949147, |
|
"loss": 1.8679, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 0.2414932638214735, |
|
"learning_rate": 0.00019990866674170983, |
|
"loss": 1.8705, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.2121212121212122, |
|
"grad_norm": 0.280526285390802, |
|
"learning_rate": 0.00019969312910817183, |
|
"loss": 1.8428, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.3131313131313131, |
|
"grad_norm": 0.5716516166032067, |
|
"learning_rate": 0.000199351123114852, |
|
"loss": 1.8267, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.4141414141414141, |
|
"grad_norm": 0.22548657696350605, |
|
"learning_rate": 0.00019888308262251285, |
|
"loss": 1.7959, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.5151515151515151, |
|
"grad_norm": 0.2400943754848952, |
|
"learning_rate": 0.00019828960137631928, |
|
"loss": 1.8328, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.6161616161616161, |
|
"grad_norm": 0.24338845667140263, |
|
"learning_rate": 0.00019757143225262728, |
|
"loss": 1.8287, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.7171717171717171, |
|
"grad_norm": 0.23391106368008455, |
|
"learning_rate": 0.00019672948630390294, |
|
"loss": 1.8345, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 0.27223367547235244, |
|
"learning_rate": 0.00019576483160298246, |
|
"loss": 1.7731, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.9191919191919191, |
|
"grad_norm": 0.25522514087403636, |
|
"learning_rate": 0.00019467869188814023, |
|
"loss": 1.8231, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.7238675355911255, |
|
"eval_runtime": 175.3209, |
|
"eval_samples_per_second": 36.054, |
|
"eval_steps_per_second": 2.259, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 2.0202020202020203, |
|
"grad_norm": 0.24156163537353126, |
|
"learning_rate": 0.00019347244501068312, |
|
"loss": 1.8199, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.121212121212121, |
|
"grad_norm": 0.26738911600323706, |
|
"learning_rate": 0.00019214762118704076, |
|
"loss": 1.7554, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 0.2974142479913874, |
|
"learning_rate": 0.000190705901057569, |
|
"loss": 1.7693, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.323232323232323, |
|
"grad_norm": 0.32489025521491743, |
|
"learning_rate": 0.00018914911355452895, |
|
"loss": 1.7036, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 2.4242424242424243, |
|
"grad_norm": 0.34564309759210704, |
|
"learning_rate": 0.00018747923358194662, |
|
"loss": 1.7449, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.525252525252525, |
|
"grad_norm": 0.33251849318291843, |
|
"learning_rate": 0.00018569837951029595, |
|
"loss": 1.7556, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.6262626262626263, |
|
"grad_norm": 0.33565250510381917, |
|
"learning_rate": 0.00018380881048918405, |
|
"loss": 1.744, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 0.3427001935365706, |
|
"learning_rate": 0.00018181292358144703, |
|
"loss": 1.7234, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.8282828282828283, |
|
"grad_norm": 0.32846296076937864, |
|
"learning_rate": 0.00017971325072229226, |
|
"loss": 1.7274, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.929292929292929, |
|
"grad_norm": 0.3446715050022125, |
|
"learning_rate": 0.0001775124555073452, |
|
"loss": 1.7516, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.98989898989899, |
|
"eval_loss": 1.6329888105392456, |
|
"eval_runtime": 174.9738, |
|
"eval_samples_per_second": 36.125, |
|
"eval_steps_per_second": 2.263, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 3.0303030303030303, |
|
"grad_norm": 0.34753378836184123, |
|
"learning_rate": 0.0001752133298136744, |
|
"loss": 1.7442, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.1313131313131315, |
|
"grad_norm": 0.3899145091665638, |
|
"learning_rate": 0.0001728187902580819, |
|
"loss": 1.6414, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 3.2323232323232323, |
|
"grad_norm": 0.3969944429695798, |
|
"learning_rate": 0.00017033187449715196, |
|
"loss": 1.6411, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.4463802224093316, |
|
"learning_rate": 0.00016775573737375096, |
|
"loss": 1.6955, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 3.4343434343434343, |
|
"grad_norm": 0.4873799041554826, |
|
"learning_rate": 0.0001650936469148681, |
|
"loss": 1.6812, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.5353535353535355, |
|
"grad_norm": 0.5138644486787001, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 1.6455, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 0.4441989179255284, |
|
"learning_rate": 0.00015952521900645144, |
|
"loss": 1.6537, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.7373737373737375, |
|
"grad_norm": 0.45397642246696135, |
|
"learning_rate": 0.0001566259455336474, |
|
"loss": 1.6384, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 3.8383838383838382, |
|
"grad_norm": 0.48522658034874977, |
|
"learning_rate": 0.0001536548377176263, |
|
"loss": 1.6292, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.9393939393939394, |
|
"grad_norm": 0.43244857762556244, |
|
"learning_rate": 0.0001506156646359123, |
|
"loss": 1.6586, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.5279655456542969, |
|
"eval_runtime": 175.1053, |
|
"eval_samples_per_second": 36.098, |
|
"eval_steps_per_second": 2.261, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 4.040404040404041, |
|
"grad_norm": 0.48231588190167396, |
|
"learning_rate": 0.0001475122817120253, |
|
"loss": 1.6137, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.141414141414141, |
|
"grad_norm": 0.5842989060014684, |
|
"learning_rate": 0.00014434862582458135, |
|
"loss": 1.5082, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 4.242424242424242, |
|
"grad_norm": 0.5870996767325578, |
|
"learning_rate": 0.00014112871031306119, |
|
"loss": 1.5382, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.343434343434343, |
|
"grad_norm": 0.6294490520638103, |
|
"learning_rate": 0.0001378566198865818, |
|
"loss": 1.5738, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 0.6361554344671604, |
|
"learning_rate": 0.00013453650544213076, |
|
"loss": 1.5609, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.545454545454545, |
|
"grad_norm": 0.5845910737228225, |
|
"learning_rate": 0.00013117257879883583, |
|
"loss": 1.5832, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 4.646464646464646, |
|
"grad_norm": 0.6362570401491278, |
|
"learning_rate": 0.00012776910735495003, |
|
"loss": 1.5386, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 4.747474747474747, |
|
"grad_norm": 0.6079381787055775, |
|
"learning_rate": 0.0001243304086743309, |
|
"loss": 1.5408, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 4.848484848484849, |
|
"grad_norm": 0.5955494164961348, |
|
"learning_rate": 0.0001208608450092801, |
|
"loss": 1.5767, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.94949494949495, |
|
"grad_norm": 0.5941973746172844, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 1.5571, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 4.98989898989899, |
|
"eval_loss": 1.4166467189788818, |
|
"eval_runtime": 174.6193, |
|
"eval_samples_per_second": 36.199, |
|
"eval_steps_per_second": 2.268, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 5.05050505050505, |
|
"grad_norm": 0.6955112160268645, |
|
"learning_rate": 0.0001138467619245374, |
|
"loss": 1.5011, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 5.151515151515151, |
|
"grad_norm": 0.7116916227562953, |
|
"learning_rate": 0.00011031114040574437, |
|
"loss": 1.4537, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 5.252525252525253, |
|
"grad_norm": 0.8295579161716972, |
|
"learning_rate": 0.0001067624384166495, |
|
"loss": 1.398, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.353535353535354, |
|
"grad_norm": 0.7415551092257379, |
|
"learning_rate": 0.00010320515775716555, |
|
"loss": 1.4474, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 5.454545454545454, |
|
"grad_norm": 0.7957507416152227, |
|
"learning_rate": 9.96438111099047e-05, |
|
"loss": 1.4459, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 5.555555555555555, |
|
"grad_norm": 0.8098108632452509, |
|
"learning_rate": 9.608291631549574e-05, |
|
"loss": 1.4266, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 5.656565656565657, |
|
"grad_norm": 0.8498743613190896, |
|
"learning_rate": 9.252699064135758e-05, |
|
"loss": 1.3931, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.757575757575758, |
|
"grad_norm": 0.7968761297367668, |
|
"learning_rate": 8.898054505119989e-05, |
|
"loss": 1.4628, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 5.858585858585858, |
|
"grad_norm": 0.8166566096084199, |
|
"learning_rate": 8.54480784825207e-05, |
|
"loss": 1.4777, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 5.959595959595959, |
|
"grad_norm": 0.7564583944169918, |
|
"learning_rate": 8.193407213936012e-05, |
|
"loss": 1.4677, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 1.3067700862884521, |
|
"eval_runtime": 175.1357, |
|
"eval_samples_per_second": 36.092, |
|
"eval_steps_per_second": 2.261, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 6.0606060606060606, |
|
"grad_norm": 0.8369827825158762, |
|
"learning_rate": 7.844298380755003e-05, |
|
"loss": 1.375, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.161616161616162, |
|
"grad_norm": 0.9204188282340791, |
|
"learning_rate": 7.497924219967209e-05, |
|
"loss": 1.2999, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 6.262626262626263, |
|
"grad_norm": 0.9559880600184892, |
|
"learning_rate": 7.154724133689677e-05, |
|
"loss": 1.3084, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 6.363636363636363, |
|
"grad_norm": 0.9272296702059781, |
|
"learning_rate": 6.815133497483157e-05, |
|
"loss": 1.3405, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 6.4646464646464645, |
|
"grad_norm": 1.0203421193696094, |
|
"learning_rate": 6.479583108044899e-05, |
|
"loss": 1.3165, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.565656565656566, |
|
"grad_norm": 0.8932381508297077, |
|
"learning_rate": 6.148498636710092e-05, |
|
"loss": 1.3641, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 0.9527012454845684, |
|
"learning_rate": 5.822300089455211e-05, |
|
"loss": 1.3179, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 6.767676767676767, |
|
"grad_norm": 0.9644270167383292, |
|
"learning_rate": 5.5014012740883115e-05, |
|
"loss": 1.3295, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 6.8686868686868685, |
|
"grad_norm": 0.9489303492473159, |
|
"learning_rate": 5.1862092753021754e-05, |
|
"loss": 1.3482, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 6.96969696969697, |
|
"grad_norm": 0.9417366559193787, |
|
"learning_rate": 4.8771239382562287e-05, |
|
"loss": 1.3422, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 6.98989898989899, |
|
"eval_loss": 1.2082042694091797, |
|
"eval_runtime": 174.4886, |
|
"eval_samples_per_second": 36.226, |
|
"eval_steps_per_second": 2.269, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 7.070707070707071, |
|
"grad_norm": 1.0189885797060951, |
|
"learning_rate": 4.574537361342407e-05, |
|
"loss": 1.2447, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 7.171717171717171, |
|
"grad_norm": 1.0456878645941505, |
|
"learning_rate": 4.278833398778306e-05, |
|
"loss": 1.2438, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 7.2727272727272725, |
|
"grad_norm": 1.0906515200546398, |
|
"learning_rate": 3.990387173658774e-05, |
|
"loss": 1.2135, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.373737373737374, |
|
"grad_norm": 1.1138045736907602, |
|
"learning_rate": 3.7095646020835754e-05, |
|
"loss": 1.2152, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 7.474747474747475, |
|
"grad_norm": 1.1333935442018617, |
|
"learning_rate": 3.436721928964819e-05, |
|
"loss": 1.2004, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 7.575757575757576, |
|
"grad_norm": 1.0135992096066218, |
|
"learning_rate": 3.172205276103033e-05, |
|
"loss": 1.1904, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 7.6767676767676765, |
|
"grad_norm": 1.0811091792166911, |
|
"learning_rate": 2.916350203105207e-05, |
|
"loss": 1.2475, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 7.777777777777778, |
|
"grad_norm": 1.1722915377984628, |
|
"learning_rate": 2.669481281701739e-05, |
|
"loss": 1.2273, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 7.878787878787879, |
|
"grad_norm": 1.0151820296117031, |
|
"learning_rate": 2.4319116840023813e-05, |
|
"loss": 1.2462, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 7.97979797979798, |
|
"grad_norm": 1.0359433658999384, |
|
"learning_rate": 2.2039427852134788e-05, |
|
"loss": 1.2609, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 1.137781023979187, |
|
"eval_runtime": 163.6859, |
|
"eval_samples_per_second": 38.617, |
|
"eval_steps_per_second": 2.419, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 8.080808080808081, |
|
"grad_norm": 1.0914414519615843, |
|
"learning_rate": 1.985863781320435e-05, |
|
"loss": 1.1457, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 8.181818181818182, |
|
"grad_norm": 1.2849174669504693, |
|
"learning_rate": 1.777951322220508e-05, |
|
"loss": 1.1925, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 8.282828282828282, |
|
"grad_norm": 1.0562037397284274, |
|
"learning_rate": 1.580469160771253e-05, |
|
"loss": 1.1653, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 8.383838383838384, |
|
"grad_norm": 1.1942325172166053, |
|
"learning_rate": 1.3936678181998374e-05, |
|
"loss": 1.1451, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 8.484848484848484, |
|
"grad_norm": 1.2292184186394104, |
|
"learning_rate": 1.2177842662977135e-05, |
|
"loss": 1.1432, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 8.585858585858587, |
|
"grad_norm": 1.1449254109310076, |
|
"learning_rate": 1.0530416268037702e-05, |
|
"loss": 1.1459, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 8.686868686868687, |
|
"grad_norm": 1.1159137674762092, |
|
"learning_rate": 8.99648888357335e-06, |
|
"loss": 1.1889, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 8.787878787878787, |
|
"grad_norm": 1.1893818134430183, |
|
"learning_rate": 7.578006413801075e-06, |
|
"loss": 1.1809, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 8.88888888888889, |
|
"grad_norm": 1.131890459862098, |
|
"learning_rate": 6.276768312233228e-06, |
|
"loss": 1.1806, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 8.98989898989899, |
|
"grad_norm": 1.0926795618787801, |
|
"learning_rate": 5.094425298933136e-06, |
|
"loss": 1.1647, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 8.98989898989899, |
|
"eval_loss": 1.107386827468872, |
|
"eval_runtime": 163.223, |
|
"eval_samples_per_second": 38.726, |
|
"eval_steps_per_second": 2.426, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 9.090909090909092, |
|
"grad_norm": 1.1051037332814697, |
|
"learning_rate": 4.0324772664503296e-06, |
|
"loss": 1.1438, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 9.191919191919192, |
|
"grad_norm": 1.164376038164282, |
|
"learning_rate": 3.092271377092215e-06, |
|
"loss": 1.1481, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 9.292929292929292, |
|
"grad_norm": 1.2303081966513765, |
|
"learning_rate": 2.2750003539455998e-06, |
|
"loss": 1.1202, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 9.393939393939394, |
|
"grad_norm": 1.2010643624794166, |
|
"learning_rate": 1.5817009678162685e-06, |
|
"loss": 1.142, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 9.494949494949495, |
|
"grad_norm": 1.2423558300638782, |
|
"learning_rate": 1.013252722005842e-06, |
|
"loss": 1.1842, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 9.595959595959595, |
|
"grad_norm": 1.1980002179676799, |
|
"learning_rate": 5.703767365946466e-07, |
|
"loss": 1.1236, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 9.696969696969697, |
|
"grad_norm": 1.2034300162155251, |
|
"learning_rate": 2.536348336456551e-07, |
|
"loss": 1.1168, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 9.797979797979798, |
|
"grad_norm": 1.2022297984086214, |
|
"learning_rate": 6.342882449029696e-08, |
|
"loss": 1.1133, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 9.8989898989899, |
|
"grad_norm": 1.0699752057421905, |
|
"learning_rate": 0.0, |
|
"loss": 1.1571, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 9.8989898989899, |
|
"eval_loss": 1.1029597520828247, |
|
"eval_runtime": 163.4283, |
|
"eval_samples_per_second": 38.678, |
|
"eval_steps_per_second": 2.423, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 9.8989898989899, |
|
"step": 490, |
|
"total_flos": 2344635780825088.0, |
|
"train_loss": 1.5290005391957808, |
|
"train_runtime": 5413.3076, |
|
"train_samples_per_second": 11.677, |
|
"train_steps_per_second": 0.091 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 490, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2344635780825088.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|