{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.8989898989899, "eval_steps": 500, "global_step": 490, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020202020202020204, "grad_norm": 1.1391378054420909, "learning_rate": 4.081632653061224e-06, "loss": 2.5995, "step": 1 }, { "epoch": 0.10101010101010101, "grad_norm": 1.006731604503432, "learning_rate": 2.0408163265306123e-05, "loss": 2.5925, "step": 5 }, { "epoch": 0.20202020202020202, "grad_norm": 1.3965084950072466, "learning_rate": 4.0816326530612245e-05, "loss": 2.546, "step": 10 }, { "epoch": 0.30303030303030304, "grad_norm": 0.59095362852847, "learning_rate": 6.122448979591838e-05, "loss": 2.396, "step": 15 }, { "epoch": 0.40404040404040403, "grad_norm": 0.33359551466461584, "learning_rate": 8.163265306122449e-05, "loss": 2.2744, "step": 20 }, { "epoch": 0.5050505050505051, "grad_norm": 0.3767673243983956, "learning_rate": 0.00010204081632653062, "loss": 2.1608, "step": 25 }, { "epoch": 0.6060606060606061, "grad_norm": 0.3530777092096336, "learning_rate": 0.00012244897959183676, "loss": 2.0261, "step": 30 }, { "epoch": 0.7070707070707071, "grad_norm": 0.36168305575388426, "learning_rate": 0.00014285714285714287, "loss": 2.0091, "step": 35 }, { "epoch": 0.8080808080808081, "grad_norm": 0.2764545304686734, "learning_rate": 0.00016326530612244898, "loss": 1.9434, "step": 40 }, { "epoch": 0.9090909090909091, "grad_norm": 0.2653033039849191, "learning_rate": 0.00018367346938775512, "loss": 1.8735, "step": 45 }, { "epoch": 0.98989898989899, "eval_loss": 1.8325327634811401, "eval_runtime": 177.2293, "eval_samples_per_second": 35.666, "eval_steps_per_second": 2.234, "step": 49 }, { "epoch": 1.0101010101010102, "grad_norm": 0.2170406034599688, "learning_rate": 0.00019999746258949147, "loss": 1.8679, "step": 50 }, { "epoch": 1.1111111111111112, "grad_norm": 0.2414932638214735, "learning_rate": 0.00019990866674170983, "loss": 1.8705, "step": 55 }, { "epoch": 1.2121212121212122, "grad_norm": 0.280526285390802, "learning_rate": 0.00019969312910817183, "loss": 1.8428, "step": 60 }, { "epoch": 1.3131313131313131, "grad_norm": 0.5716516166032067, "learning_rate": 0.000199351123114852, "loss": 1.8267, "step": 65 }, { "epoch": 1.4141414141414141, "grad_norm": 0.22548657696350605, "learning_rate": 0.00019888308262251285, "loss": 1.7959, "step": 70 }, { "epoch": 1.5151515151515151, "grad_norm": 0.2400943754848952, "learning_rate": 0.00019828960137631928, "loss": 1.8328, "step": 75 }, { "epoch": 1.6161616161616161, "grad_norm": 0.24338845667140263, "learning_rate": 0.00019757143225262728, "loss": 1.8287, "step": 80 }, { "epoch": 1.7171717171717171, "grad_norm": 0.23391106368008455, "learning_rate": 0.00019672948630390294, "loss": 1.8345, "step": 85 }, { "epoch": 1.8181818181818183, "grad_norm": 0.27223367547235244, "learning_rate": 0.00019576483160298246, "loss": 1.7731, "step": 90 }, { "epoch": 1.9191919191919191, "grad_norm": 0.25522514087403636, "learning_rate": 0.00019467869188814023, "loss": 1.8231, "step": 95 }, { "epoch": 2.0, "eval_loss": 1.7238675355911255, "eval_runtime": 175.3209, "eval_samples_per_second": 36.054, "eval_steps_per_second": 2.259, "step": 99 }, { "epoch": 2.0202020202020203, "grad_norm": 0.24156163537353126, "learning_rate": 0.00019347244501068312, "loss": 1.8199, "step": 100 }, { "epoch": 2.121212121212121, "grad_norm": 0.26738911600323706, "learning_rate": 0.00019214762118704076, "loss": 1.7554, "step": 105 }, { "epoch": 2.2222222222222223, "grad_norm": 0.2974142479913874, "learning_rate": 0.000190705901057569, "loss": 1.7693, "step": 110 }, { "epoch": 2.323232323232323, "grad_norm": 0.32489025521491743, "learning_rate": 0.00018914911355452895, "loss": 1.7036, "step": 115 }, { "epoch": 2.4242424242424243, "grad_norm": 0.34564309759210704, "learning_rate": 0.00018747923358194662, "loss": 1.7449, "step": 120 }, { "epoch": 2.525252525252525, "grad_norm": 0.33251849318291843, "learning_rate": 0.00018569837951029595, "loss": 1.7556, "step": 125 }, { "epoch": 2.6262626262626263, "grad_norm": 0.33565250510381917, "learning_rate": 0.00018380881048918405, "loss": 1.744, "step": 130 }, { "epoch": 2.7272727272727275, "grad_norm": 0.3427001935365706, "learning_rate": 0.00018181292358144703, "loss": 1.7234, "step": 135 }, { "epoch": 2.8282828282828283, "grad_norm": 0.32846296076937864, "learning_rate": 0.00017971325072229226, "loss": 1.7274, "step": 140 }, { "epoch": 2.929292929292929, "grad_norm": 0.3446715050022125, "learning_rate": 0.0001775124555073452, "loss": 1.7516, "step": 145 }, { "epoch": 2.98989898989899, "eval_loss": 1.6329888105392456, "eval_runtime": 174.9738, "eval_samples_per_second": 36.125, "eval_steps_per_second": 2.263, "step": 148 }, { "epoch": 3.0303030303030303, "grad_norm": 0.34753378836184123, "learning_rate": 0.0001752133298136744, "loss": 1.7442, "step": 150 }, { "epoch": 3.1313131313131315, "grad_norm": 0.3899145091665638, "learning_rate": 0.0001728187902580819, "loss": 1.6414, "step": 155 }, { "epoch": 3.2323232323232323, "grad_norm": 0.3969944429695798, "learning_rate": 0.00017033187449715196, "loss": 1.6411, "step": 160 }, { "epoch": 3.3333333333333335, "grad_norm": 0.4463802224093316, "learning_rate": 0.00016775573737375096, "loss": 1.6955, "step": 165 }, { "epoch": 3.4343434343434343, "grad_norm": 0.4873799041554826, "learning_rate": 0.0001650936469148681, "loss": 1.6812, "step": 170 }, { "epoch": 3.5353535353535355, "grad_norm": 0.5138644486787001, "learning_rate": 0.00016234898018587337, "loss": 1.6455, "step": 175 }, { "epoch": 3.6363636363636362, "grad_norm": 0.4441989179255284, "learning_rate": 0.00015952521900645144, "loss": 1.6537, "step": 180 }, { "epoch": 3.7373737373737375, "grad_norm": 0.45397642246696135, "learning_rate": 0.0001566259455336474, "loss": 1.6384, "step": 185 }, { "epoch": 3.8383838383838382, "grad_norm": 0.48522658034874977, "learning_rate": 0.0001536548377176263, "loss": 1.6292, "step": 190 }, { "epoch": 3.9393939393939394, "grad_norm": 0.43244857762556244, "learning_rate": 0.0001506156646359123, "loss": 1.6586, "step": 195 }, { "epoch": 4.0, "eval_loss": 1.5279655456542969, "eval_runtime": 175.1053, "eval_samples_per_second": 36.098, "eval_steps_per_second": 2.261, "step": 198 }, { "epoch": 4.040404040404041, "grad_norm": 0.48231588190167396, "learning_rate": 0.0001475122817120253, "loss": 1.6137, "step": 200 }, { "epoch": 4.141414141414141, "grad_norm": 0.5842989060014684, "learning_rate": 0.00014434862582458135, "loss": 1.5082, "step": 205 }, { "epoch": 4.242424242424242, "grad_norm": 0.5870996767325578, "learning_rate": 0.00014112871031306119, "loss": 1.5382, "step": 210 }, { "epoch": 4.343434343434343, "grad_norm": 0.6294490520638103, "learning_rate": 0.0001378566198865818, "loss": 1.5738, "step": 215 }, { "epoch": 4.444444444444445, "grad_norm": 0.6361554344671604, "learning_rate": 0.00013453650544213076, "loss": 1.5609, "step": 220 }, { "epoch": 4.545454545454545, "grad_norm": 0.5845910737228225, "learning_rate": 0.00013117257879883583, "loss": 1.5832, "step": 225 }, { "epoch": 4.646464646464646, "grad_norm": 0.6362570401491278, "learning_rate": 0.00012776910735495003, "loss": 1.5386, "step": 230 }, { "epoch": 4.747474747474747, "grad_norm": 0.6079381787055775, "learning_rate": 0.0001243304086743309, "loss": 1.5408, "step": 235 }, { "epoch": 4.848484848484849, "grad_norm": 0.5955494164961348, "learning_rate": 0.0001208608450092801, "loss": 1.5767, "step": 240 }, { "epoch": 4.94949494949495, "grad_norm": 0.5941973746172844, "learning_rate": 0.00011736481776669306, "loss": 1.5571, "step": 245 }, { "epoch": 4.98989898989899, "eval_loss": 1.4166467189788818, "eval_runtime": 174.6193, "eval_samples_per_second": 36.199, "eval_steps_per_second": 2.268, "step": 247 }, { "epoch": 5.05050505050505, "grad_norm": 0.6955112160268645, "learning_rate": 0.0001138467619245374, "loss": 1.5011, "step": 250 }, { "epoch": 5.151515151515151, "grad_norm": 0.7116916227562953, "learning_rate": 0.00011031114040574437, "loss": 1.4537, "step": 255 }, { "epoch": 5.252525252525253, "grad_norm": 0.8295579161716972, "learning_rate": 0.0001067624384166495, "loss": 1.398, "step": 260 }, { "epoch": 5.353535353535354, "grad_norm": 0.7415551092257379, "learning_rate": 0.00010320515775716555, "loss": 1.4474, "step": 265 }, { "epoch": 5.454545454545454, "grad_norm": 0.7957507416152227, "learning_rate": 9.96438111099047e-05, "loss": 1.4459, "step": 270 }, { "epoch": 5.555555555555555, "grad_norm": 0.8098108632452509, "learning_rate": 9.608291631549574e-05, "loss": 1.4266, "step": 275 }, { "epoch": 5.656565656565657, "grad_norm": 0.8498743613190896, "learning_rate": 9.252699064135758e-05, "loss": 1.3931, "step": 280 }, { "epoch": 5.757575757575758, "grad_norm": 0.7968761297367668, "learning_rate": 8.898054505119989e-05, "loss": 1.4628, "step": 285 }, { "epoch": 5.858585858585858, "grad_norm": 0.8166566096084199, "learning_rate": 8.54480784825207e-05, "loss": 1.4777, "step": 290 }, { "epoch": 5.959595959595959, "grad_norm": 0.7564583944169918, "learning_rate": 8.193407213936012e-05, "loss": 1.4677, "step": 295 }, { "epoch": 6.0, "eval_loss": 1.3067700862884521, "eval_runtime": 175.1357, "eval_samples_per_second": 36.092, "eval_steps_per_second": 2.261, "step": 297 }, { "epoch": 6.0606060606060606, "grad_norm": 0.8369827825158762, "learning_rate": 7.844298380755003e-05, "loss": 1.375, "step": 300 }, { "epoch": 6.161616161616162, "grad_norm": 0.9204188282340791, "learning_rate": 7.497924219967209e-05, "loss": 1.2999, "step": 305 }, { "epoch": 6.262626262626263, "grad_norm": 0.9559880600184892, "learning_rate": 7.154724133689677e-05, "loss": 1.3084, "step": 310 }, { "epoch": 6.363636363636363, "grad_norm": 0.9272296702059781, "learning_rate": 6.815133497483157e-05, "loss": 1.3405, "step": 315 }, { "epoch": 6.4646464646464645, "grad_norm": 1.0203421193696094, "learning_rate": 6.479583108044899e-05, "loss": 1.3165, "step": 320 }, { "epoch": 6.565656565656566, "grad_norm": 0.8932381508297077, "learning_rate": 6.148498636710092e-05, "loss": 1.3641, "step": 325 }, { "epoch": 6.666666666666667, "grad_norm": 0.9527012454845684, "learning_rate": 5.822300089455211e-05, "loss": 1.3179, "step": 330 }, { "epoch": 6.767676767676767, "grad_norm": 0.9644270167383292, "learning_rate": 5.5014012740883115e-05, "loss": 1.3295, "step": 335 }, { "epoch": 6.8686868686868685, "grad_norm": 0.9489303492473159, "learning_rate": 5.1862092753021754e-05, "loss": 1.3482, "step": 340 }, { "epoch": 6.96969696969697, "grad_norm": 0.9417366559193787, "learning_rate": 4.8771239382562287e-05, "loss": 1.3422, "step": 345 }, { "epoch": 6.98989898989899, "eval_loss": 1.2082042694091797, "eval_runtime": 174.4886, "eval_samples_per_second": 36.226, "eval_steps_per_second": 2.269, "step": 346 }, { "epoch": 7.070707070707071, "grad_norm": 1.0189885797060951, "learning_rate": 4.574537361342407e-05, "loss": 1.2447, "step": 350 }, { "epoch": 7.171717171717171, "grad_norm": 1.0456878645941505, "learning_rate": 4.278833398778306e-05, "loss": 1.2438, "step": 355 }, { "epoch": 7.2727272727272725, "grad_norm": 1.0906515200546398, "learning_rate": 3.990387173658774e-05, "loss": 1.2135, "step": 360 }, { "epoch": 7.373737373737374, "grad_norm": 1.1138045736907602, "learning_rate": 3.7095646020835754e-05, "loss": 1.2152, "step": 365 }, { "epoch": 7.474747474747475, "grad_norm": 1.1333935442018617, "learning_rate": 3.436721928964819e-05, "loss": 1.2004, "step": 370 }, { "epoch": 7.575757575757576, "grad_norm": 1.0135992096066218, "learning_rate": 3.172205276103033e-05, "loss": 1.1904, "step": 375 }, { "epoch": 7.6767676767676765, "grad_norm": 1.0811091792166911, "learning_rate": 2.916350203105207e-05, "loss": 1.2475, "step": 380 }, { "epoch": 7.777777777777778, "grad_norm": 1.1722915377984628, "learning_rate": 2.669481281701739e-05, "loss": 1.2273, "step": 385 }, { "epoch": 7.878787878787879, "grad_norm": 1.0151820296117031, "learning_rate": 2.4319116840023813e-05, "loss": 1.2462, "step": 390 }, { "epoch": 7.97979797979798, "grad_norm": 1.0359433658999384, "learning_rate": 2.2039427852134788e-05, "loss": 1.2609, "step": 395 }, { "epoch": 8.0, "eval_loss": 1.137781023979187, "eval_runtime": 163.6859, "eval_samples_per_second": 38.617, "eval_steps_per_second": 2.419, "step": 396 }, { "epoch": 8.080808080808081, "grad_norm": 1.0914414519615843, "learning_rate": 1.985863781320435e-05, "loss": 1.1457, "step": 400 }, { "epoch": 8.181818181818182, "grad_norm": 1.2849174669504693, "learning_rate": 1.777951322220508e-05, "loss": 1.1925, "step": 405 }, { "epoch": 8.282828282828282, "grad_norm": 1.0562037397284274, "learning_rate": 1.580469160771253e-05, "loss": 1.1653, "step": 410 }, { "epoch": 8.383838383838384, "grad_norm": 1.1942325172166053, "learning_rate": 1.3936678181998374e-05, "loss": 1.1451, "step": 415 }, { "epoch": 8.484848484848484, "grad_norm": 1.2292184186394104, "learning_rate": 1.2177842662977135e-05, "loss": 1.1432, "step": 420 }, { "epoch": 8.585858585858587, "grad_norm": 1.1449254109310076, "learning_rate": 1.0530416268037702e-05, "loss": 1.1459, "step": 425 }, { "epoch": 8.686868686868687, "grad_norm": 1.1159137674762092, "learning_rate": 8.99648888357335e-06, "loss": 1.1889, "step": 430 }, { "epoch": 8.787878787878787, "grad_norm": 1.1893818134430183, "learning_rate": 7.578006413801075e-06, "loss": 1.1809, "step": 435 }, { "epoch": 8.88888888888889, "grad_norm": 1.131890459862098, "learning_rate": 6.276768312233228e-06, "loss": 1.1806, "step": 440 }, { "epoch": 8.98989898989899, "grad_norm": 1.0926795618787801, "learning_rate": 5.094425298933136e-06, "loss": 1.1647, "step": 445 }, { "epoch": 8.98989898989899, "eval_loss": 1.107386827468872, "eval_runtime": 163.223, "eval_samples_per_second": 38.726, "eval_steps_per_second": 2.426, "step": 445 }, { "epoch": 9.090909090909092, "grad_norm": 1.1051037332814697, "learning_rate": 4.0324772664503296e-06, "loss": 1.1438, "step": 450 }, { "epoch": 9.191919191919192, "grad_norm": 1.164376038164282, "learning_rate": 3.092271377092215e-06, "loss": 1.1481, "step": 455 }, { "epoch": 9.292929292929292, "grad_norm": 1.2303081966513765, "learning_rate": 2.2750003539455998e-06, "loss": 1.1202, "step": 460 }, { "epoch": 9.393939393939394, "grad_norm": 1.2010643624794166, "learning_rate": 1.5817009678162685e-06, "loss": 1.142, "step": 465 }, { "epoch": 9.494949494949495, "grad_norm": 1.2423558300638782, "learning_rate": 1.013252722005842e-06, "loss": 1.1842, "step": 470 }, { "epoch": 9.595959595959595, "grad_norm": 1.1980002179676799, "learning_rate": 5.703767365946466e-07, "loss": 1.1236, "step": 475 }, { "epoch": 9.696969696969697, "grad_norm": 1.2034300162155251, "learning_rate": 2.536348336456551e-07, "loss": 1.1168, "step": 480 }, { "epoch": 9.797979797979798, "grad_norm": 1.2022297984086214, "learning_rate": 6.342882449029696e-08, "loss": 1.1133, "step": 485 }, { "epoch": 9.8989898989899, "grad_norm": 1.0699752057421905, "learning_rate": 0.0, "loss": 1.1571, "step": 490 }, { "epoch": 9.8989898989899, "eval_loss": 1.1029597520828247, "eval_runtime": 163.4283, "eval_samples_per_second": 38.678, "eval_steps_per_second": 2.423, "step": 490 }, { "epoch": 9.8989898989899, "step": 490, "total_flos": 2344635780825088.0, "train_loss": 1.5290005391957808, "train_runtime": 5413.3076, "train_samples_per_second": 11.677, "train_steps_per_second": 0.091 } ], "logging_steps": 5, "max_steps": 490, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2344635780825088.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }