|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 1920, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.015625, |
|
"grad_norm": 10.444775818294296, |
|
"learning_rate": 5.208333333333334e-07, |
|
"loss": 0.8468, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03125, |
|
"grad_norm": 3.3931220262488844, |
|
"learning_rate": 1.0416666666666667e-06, |
|
"loss": 0.7571, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.046875, |
|
"grad_norm": 1.725977917785408, |
|
"learning_rate": 1.5625e-06, |
|
"loss": 0.6887, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 1.5579846393216565, |
|
"learning_rate": 2.0833333333333334e-06, |
|
"loss": 0.6419, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.078125, |
|
"grad_norm": 1.7208975007176577, |
|
"learning_rate": 2.604166666666667e-06, |
|
"loss": 0.6117, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"grad_norm": 1.7694674612194612, |
|
"learning_rate": 3.125e-06, |
|
"loss": 0.5912, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.109375, |
|
"grad_norm": 2.038979400755539, |
|
"learning_rate": 3.6458333333333333e-06, |
|
"loss": 0.5743, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 1.758888705110013, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 0.5807, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.140625, |
|
"grad_norm": 1.747787322446748, |
|
"learning_rate": 4.6875000000000004e-06, |
|
"loss": 0.561, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 2.023155826824772, |
|
"learning_rate": 4.999946602507342e-06, |
|
"loss": 0.5596, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.171875, |
|
"grad_norm": 2.7065268999883747, |
|
"learning_rate": 4.999345909821495e-06, |
|
"loss": 0.5426, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 3.0442036812906323, |
|
"learning_rate": 4.998077956371428e-06, |
|
"loss": 0.5514, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.203125, |
|
"grad_norm": 2.569524644395966, |
|
"learning_rate": 4.996143118291191e-06, |
|
"loss": 0.5499, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"grad_norm": 2.4274788708639226, |
|
"learning_rate": 4.993541969543877e-06, |
|
"loss": 0.535, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.234375, |
|
"grad_norm": 2.242979877814027, |
|
"learning_rate": 4.990275281751359e-06, |
|
"loss": 0.5295, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.085717253543815, |
|
"learning_rate": 4.9863440239653865e-06, |
|
"loss": 0.5394, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.265625, |
|
"grad_norm": 1.980124498533203, |
|
"learning_rate": 4.981749362380126e-06, |
|
"loss": 0.5318, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"grad_norm": 1.4291512505399062, |
|
"learning_rate": 4.976492659986207e-06, |
|
"loss": 0.5273, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.296875, |
|
"grad_norm": 2.1782127342393456, |
|
"learning_rate": 4.9705754761663995e-06, |
|
"loss": 0.5262, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 1.8726553071551566, |
|
"learning_rate": 4.9639995662330295e-06, |
|
"loss": 0.5156, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.328125, |
|
"grad_norm": 1.7397260041367604, |
|
"learning_rate": 4.956766880907269e-06, |
|
"loss": 0.5138, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"grad_norm": 1.9152193087219476, |
|
"learning_rate": 4.948879565740459e-06, |
|
"loss": 0.5117, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.359375, |
|
"grad_norm": 1.909800181545602, |
|
"learning_rate": 4.940339960477644e-06, |
|
"loss": 0.5158, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 2.077990164198304, |
|
"learning_rate": 4.931150598363494e-06, |
|
"loss": 0.5056, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.390625, |
|
"grad_norm": 1.5538564815058695, |
|
"learning_rate": 4.921314205390822e-06, |
|
"loss": 0.5059, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"grad_norm": 1.8861482326610306, |
|
"learning_rate": 4.910833699491932e-06, |
|
"loss": 0.4997, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.421875, |
|
"grad_norm": 2.22030396642517, |
|
"learning_rate": 4.899712189673023e-06, |
|
"loss": 0.5034, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 2.1138662624316003, |
|
"learning_rate": 4.8879529750919105e-06, |
|
"loss": 0.5035, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.453125, |
|
"grad_norm": 3.704875355592209, |
|
"learning_rate": 4.875559544079348e-06, |
|
"loss": 0.5024, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 2.4124963571250917, |
|
"learning_rate": 4.862535573104217e-06, |
|
"loss": 0.49, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.484375, |
|
"grad_norm": 2.6383521452329703, |
|
"learning_rate": 4.848884925682926e-06, |
|
"loss": 0.494, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 2.7798577923861036, |
|
"learning_rate": 4.8346116512333045e-06, |
|
"loss": 0.4978, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.515625, |
|
"grad_norm": 2.5016539728327105, |
|
"learning_rate": 4.819719983873357e-06, |
|
"loss": 0.4959, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"grad_norm": 2.3486985533520395, |
|
"learning_rate": 4.804214341165228e-06, |
|
"loss": 0.4933, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.546875, |
|
"grad_norm": 2.1224863685900113, |
|
"learning_rate": 4.788099322804749e-06, |
|
"loss": 0.4835, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 2.1137649633214473, |
|
"learning_rate": 4.771379709256953e-06, |
|
"loss": 0.4899, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.578125, |
|
"grad_norm": 1.8070271600662309, |
|
"learning_rate": 4.754060460337963e-06, |
|
"loss": 0.4904, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"grad_norm": 2.229961084900482, |
|
"learning_rate": 4.7361467137436846e-06, |
|
"loss": 0.4909, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.609375, |
|
"grad_norm": 2.5450078236200255, |
|
"learning_rate": 4.7176437835257225e-06, |
|
"loss": 0.4878, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 1.6490727233459694, |
|
"learning_rate": 4.698557158514988e-06, |
|
"loss": 0.5006, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.640625, |
|
"grad_norm": 1.5461965981797419, |
|
"learning_rate": 4.678892500693451e-06, |
|
"loss": 0.493, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"grad_norm": 1.5503545160303023, |
|
"learning_rate": 4.6586556435145404e-06, |
|
"loss": 0.4815, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.671875, |
|
"grad_norm": 1.4158769400928066, |
|
"learning_rate": 4.637852590172665e-06, |
|
"loss": 0.4936, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 1.4477782106737869, |
|
"learning_rate": 4.616489511822384e-06, |
|
"loss": 0.4885, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.703125, |
|
"grad_norm": 1.6879117554810668, |
|
"learning_rate": 4.59457274574776e-06, |
|
"loss": 0.4937, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"grad_norm": 1.5561703988248663, |
|
"learning_rate": 4.572108793482425e-06, |
|
"loss": 0.4834, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.734375, |
|
"grad_norm": 1.4814626689757864, |
|
"learning_rate": 4.549104318880919e-06, |
|
"loss": 0.4955, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.3493469423874893, |
|
"learning_rate": 4.525566146141886e-06, |
|
"loss": 0.4839, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.765625, |
|
"grad_norm": 1.5247306592093048, |
|
"learning_rate": 4.501501257783692e-06, |
|
"loss": 0.482, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 1.8021809188964066, |
|
"learning_rate": 4.47691679257309e-06, |
|
"loss": 0.4758, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.796875, |
|
"grad_norm": 1.3217160484942474, |
|
"learning_rate": 4.451820043407527e-06, |
|
"loss": 0.4788, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 1.5947481734921138, |
|
"learning_rate": 4.426218455151733e-06, |
|
"loss": 0.4722, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.828125, |
|
"grad_norm": 1.3218456009113562, |
|
"learning_rate": 4.400119622429226e-06, |
|
"loss": 0.4792, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"grad_norm": 1.4119902047840658, |
|
"learning_rate": 4.3735312873693905e-06, |
|
"loss": 0.4836, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.859375, |
|
"grad_norm": 1.2667791953694767, |
|
"learning_rate": 4.346461337310805e-06, |
|
"loss": 0.4827, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 1.9380616838070583, |
|
"learning_rate": 4.31891780246149e-06, |
|
"loss": 0.4798, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.890625, |
|
"grad_norm": 1.906273428173042, |
|
"learning_rate": 4.290908853516771e-06, |
|
"loss": 0.4764, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"grad_norm": 2.1397738699102526, |
|
"learning_rate": 4.262442799235474e-06, |
|
"loss": 0.4698, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.921875, |
|
"grad_norm": 1.4439475222238531, |
|
"learning_rate": 4.233528083975155e-06, |
|
"loss": 0.4702, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 1.7063789221261367, |
|
"learning_rate": 4.204173285187117e-06, |
|
"loss": 0.466, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.953125, |
|
"grad_norm": 1.5514623726199628, |
|
"learning_rate": 4.1743871108719334e-06, |
|
"loss": 0.4696, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"grad_norm": 1.4916007625639955, |
|
"learning_rate": 4.1441783969962506e-06, |
|
"loss": 0.4708, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.984375, |
|
"grad_norm": 2.36511645668308, |
|
"learning_rate": 4.113556104871631e-06, |
|
"loss": 0.4684, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.6210452179231138, |
|
"learning_rate": 4.082529318496206e-06, |
|
"loss": 0.4706, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.4656927287578583, |
|
"eval_runtime": 59.1544, |
|
"eval_samples_per_second": 291.441, |
|
"eval_steps_per_second": 1.15, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.015625, |
|
"grad_norm": 1.8958635048318864, |
|
"learning_rate": 4.051107241859944e-06, |
|
"loss": 0.3837, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"grad_norm": 1.4711121127809, |
|
"learning_rate": 4.019299196214315e-06, |
|
"loss": 0.3803, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.046875, |
|
"grad_norm": 1.389098603274721, |
|
"learning_rate": 3.987114617307176e-06, |
|
"loss": 0.3742, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"grad_norm": 1.4934816227614662, |
|
"learning_rate": 3.954563052583687e-06, |
|
"loss": 0.3877, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.078125, |
|
"grad_norm": 2.191093614414856, |
|
"learning_rate": 3.92165415835409e-06, |
|
"loss": 0.3805, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"grad_norm": 2.0750682076780134, |
|
"learning_rate": 3.888397696929208e-06, |
|
"loss": 0.377, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.109375, |
|
"grad_norm": 2.0456715879045366, |
|
"learning_rate": 3.854803533724477e-06, |
|
"loss": 0.378, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 1.9908156741899143, |
|
"learning_rate": 3.820881634333416e-06, |
|
"loss": 0.3815, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.140625, |
|
"grad_norm": 1.434003143563507, |
|
"learning_rate": 3.786642061571356e-06, |
|
"loss": 0.3801, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"grad_norm": 1.922816409585947, |
|
"learning_rate": 3.752094972490348e-06, |
|
"loss": 0.3774, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.171875, |
|
"grad_norm": 1.2664235758845404, |
|
"learning_rate": 3.717250615366108e-06, |
|
"loss": 0.3821, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"grad_norm": 1.4529409619288622, |
|
"learning_rate": 3.6821193266578976e-06, |
|
"loss": 0.3806, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.203125, |
|
"grad_norm": 1.4048451043641628, |
|
"learning_rate": 3.6467115279422523e-06, |
|
"loss": 0.3768, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"grad_norm": 1.5780951905511307, |
|
"learning_rate": 3.611037722821452e-06, |
|
"loss": 0.3849, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.234375, |
|
"grad_norm": 1.3754990978690818, |
|
"learning_rate": 3.5751084938076697e-06, |
|
"loss": 0.3819, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.7832805601394666, |
|
"learning_rate": 3.5389344991836977e-06, |
|
"loss": 0.3815, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.265625, |
|
"grad_norm": 1.5731803619002744, |
|
"learning_rate": 3.5025264698412127e-06, |
|
"loss": 0.3769, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"grad_norm": 1.4508731895019338, |
|
"learning_rate": 3.4658952060974858e-06, |
|
"loss": 0.386, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.296875, |
|
"grad_norm": 1.3310110347445678, |
|
"learning_rate": 3.4290515744915135e-06, |
|
"loss": 0.3793, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"grad_norm": 1.5870629040014994, |
|
"learning_rate": 3.3920065045604874e-06, |
|
"loss": 0.3815, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.328125, |
|
"grad_norm": 1.4428244899236522, |
|
"learning_rate": 3.3547709855975908e-06, |
|
"loss": 0.3812, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"grad_norm": 1.4623661323810098, |
|
"learning_rate": 3.317356063392059e-06, |
|
"loss": 0.3759, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.359375, |
|
"grad_norm": 1.4541598774189346, |
|
"learning_rate": 3.2797728369524878e-06, |
|
"loss": 0.3863, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 1.7531641958321784, |
|
"learning_rate": 3.242032455214346e-06, |
|
"loss": 0.3825, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.390625, |
|
"grad_norm": 2.0248369297536164, |
|
"learning_rate": 3.2041461137326825e-06, |
|
"loss": 0.3808, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"grad_norm": 1.7114859453687477, |
|
"learning_rate": 3.166125051361007e-06, |
|
"loss": 0.3813, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.421875, |
|
"grad_norm": 1.5182934800219041, |
|
"learning_rate": 3.127980546917318e-06, |
|
"loss": 0.3749, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"grad_norm": 1.2810157768825081, |
|
"learning_rate": 3.089723915838283e-06, |
|
"loss": 0.3798, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.453125, |
|
"grad_norm": 1.3056749068330433, |
|
"learning_rate": 3.051366506822554e-06, |
|
"loss": 0.3824, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"grad_norm": 1.339776576173714, |
|
"learning_rate": 3.0129196984642084e-06, |
|
"loss": 0.3819, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.484375, |
|
"grad_norm": 1.3275514822294423, |
|
"learning_rate": 2.9743948958773373e-06, |
|
"loss": 0.3804, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.3861961679351784, |
|
"learning_rate": 2.9358035273127484e-06, |
|
"loss": 0.3876, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.515625, |
|
"grad_norm": 1.4583205842845002, |
|
"learning_rate": 2.8971570407678222e-06, |
|
"loss": 0.3811, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"grad_norm": 1.5523942903859895, |
|
"learning_rate": 2.8584669005904993e-06, |
|
"loss": 0.3764, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.546875, |
|
"grad_norm": 1.365188486045966, |
|
"learning_rate": 2.8197445840784173e-06, |
|
"loss": 0.3706, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 1.5063105750252836, |
|
"learning_rate": 2.781001578074217e-06, |
|
"loss": 0.3806, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.578125, |
|
"grad_norm": 1.4218424948694481, |
|
"learning_rate": 2.7422493755580052e-06, |
|
"loss": 0.3787, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"grad_norm": 1.263189980212751, |
|
"learning_rate": 2.703499472238004e-06, |
|
"loss": 0.3726, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.609375, |
|
"grad_norm": 1.3846905506410119, |
|
"learning_rate": 2.664763363140382e-06, |
|
"loss": 0.3814, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 1.2445425042616887, |
|
"learning_rate": 2.6260525391993027e-06, |
|
"loss": 0.3759, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.640625, |
|
"grad_norm": 1.3055007938596355, |
|
"learning_rate": 2.5873784838481764e-06, |
|
"loss": 0.3762, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"grad_norm": 1.3144625349610841, |
|
"learning_rate": 2.548752669613132e-06, |
|
"loss": 0.3796, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.671875, |
|
"grad_norm": 1.3350371737116375, |
|
"learning_rate": 2.510186554709741e-06, |
|
"loss": 0.3815, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"grad_norm": 1.566780577726475, |
|
"learning_rate": 2.471691579643968e-06, |
|
"loss": 0.3708, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.703125, |
|
"grad_norm": 1.2771965688411218, |
|
"learning_rate": 2.4332791638183935e-06, |
|
"loss": 0.3741, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"grad_norm": 1.3207787835503493, |
|
"learning_rate": 2.3949607021446774e-06, |
|
"loss": 0.3775, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.734375, |
|
"grad_norm": 1.263046733536951, |
|
"learning_rate": 2.3567475616633046e-06, |
|
"loss": 0.3742, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.3161682150839322, |
|
"learning_rate": 2.318651078171589e-06, |
|
"loss": 0.3771, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.765625, |
|
"grad_norm": 1.2308341651874004, |
|
"learning_rate": 2.2806825528609457e-06, |
|
"loss": 0.3788, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"grad_norm": 1.3859163680872322, |
|
"learning_rate": 2.2428532489644368e-06, |
|
"loss": 0.3718, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.796875, |
|
"grad_norm": 1.389479577560998, |
|
"learning_rate": 2.2051743884155636e-06, |
|
"loss": 0.3811, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"grad_norm": 1.179457462552155, |
|
"learning_rate": 2.167657148519328e-06, |
|
"loss": 0.3709, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.828125, |
|
"grad_norm": 1.1984315597042852, |
|
"learning_rate": 2.1303126586365174e-06, |
|
"loss": 0.3746, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"grad_norm": 1.3778930599081922, |
|
"learning_rate": 2.093151996882217e-06, |
|
"loss": 0.3701, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.859375, |
|
"grad_norm": 1.393001774481534, |
|
"learning_rate": 2.0561861868395303e-06, |
|
"loss": 0.3777, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 1.255228122839463, |
|
"learning_rate": 2.0194261942894625e-06, |
|
"loss": 0.3723, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.890625, |
|
"grad_norm": 1.3414771695904, |
|
"learning_rate": 1.982882923957969e-06, |
|
"loss": 0.3788, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"grad_norm": 1.3315196766376038, |
|
"learning_rate": 1.9465672162811004e-06, |
|
"loss": 0.3749, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.921875, |
|
"grad_norm": 1.196593458985567, |
|
"learning_rate": 1.9104898441892223e-06, |
|
"loss": 0.3699, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"grad_norm": 1.2207357045658285, |
|
"learning_rate": 1.8746615099112667e-06, |
|
"loss": 0.3675, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.953125, |
|
"grad_norm": 1.2040449871674417, |
|
"learning_rate": 1.8390928417999492e-06, |
|
"loss": 0.3738, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"grad_norm": 1.506663538304499, |
|
"learning_rate": 1.803794391178908e-06, |
|
"loss": 0.3733, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.984375, |
|
"grad_norm": 1.3210143508850016, |
|
"learning_rate": 1.7687766292126865e-06, |
|
"loss": 0.3742, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.3003812635061929, |
|
"learning_rate": 1.7340499438004995e-06, |
|
"loss": 0.369, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.4540603756904602, |
|
"eval_runtime": 68.5046, |
|
"eval_samples_per_second": 251.662, |
|
"eval_steps_per_second": 0.993, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.015625, |
|
"grad_norm": 1.799529367757516, |
|
"learning_rate": 1.6996246364946986e-06, |
|
"loss": 0.2851, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"grad_norm": 1.4376838261754665, |
|
"learning_rate": 1.665510919444851e-06, |
|
"loss": 0.2857, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.046875, |
|
"grad_norm": 1.3710510542531367, |
|
"learning_rate": 1.6317189123683428e-06, |
|
"loss": 0.2876, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"grad_norm": 1.4736988163726463, |
|
"learning_rate": 1.5982586395483984e-06, |
|
"loss": 0.2851, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.078125, |
|
"grad_norm": 1.3865134466434497, |
|
"learning_rate": 1.5651400268604063e-06, |
|
"loss": 0.2825, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"grad_norm": 1.3685250396007145, |
|
"learning_rate": 1.5323728988274513e-06, |
|
"loss": 0.2823, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.109375, |
|
"grad_norm": 1.3255774882469817, |
|
"learning_rate": 1.4999669757058956e-06, |
|
"loss": 0.2829, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 1.3462861294111625, |
|
"learning_rate": 1.4679318706019011e-06, |
|
"loss": 0.2861, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.140625, |
|
"grad_norm": 1.2761171662988255, |
|
"learning_rate": 1.4362770866197365e-06, |
|
"loss": 0.2798, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"grad_norm": 1.2996766419653487, |
|
"learning_rate": 1.405012014042708e-06, |
|
"loss": 0.2831, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.171875, |
|
"grad_norm": 1.2756611803725677, |
|
"learning_rate": 1.3741459275475718e-06, |
|
"loss": 0.2842, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 1.3788917408268033, |
|
"learning_rate": 1.3436879834532237e-06, |
|
"loss": 0.2883, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.203125, |
|
"grad_norm": 1.3457810994006327, |
|
"learning_rate": 1.3136472170045173e-06, |
|
"loss": 0.2796, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"grad_norm": 1.3527803102732727, |
|
"learning_rate": 1.2840325396919851e-06, |
|
"loss": 0.2809, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.234375, |
|
"grad_norm": 1.3139656273448967, |
|
"learning_rate": 1.2548527366082746e-06, |
|
"loss": 0.2899, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 1.3171500244547871, |
|
"learning_rate": 1.226116463842083e-06, |
|
"loss": 0.2816, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.265625, |
|
"grad_norm": 1.321106017997469, |
|
"learning_rate": 1.1978322459103558e-06, |
|
"loss": 0.2866, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"grad_norm": 1.3880168746621624, |
|
"learning_rate": 1.1700084732295222e-06, |
|
"loss": 0.2789, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.296875, |
|
"grad_norm": 1.297165181118468, |
|
"learning_rate": 1.1426533996265006e-06, |
|
"loss": 0.2818, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"grad_norm": 1.2914280928037412, |
|
"learning_rate": 1.1157751398902338e-06, |
|
"loss": 0.2856, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.328125, |
|
"grad_norm": 1.374705202612009, |
|
"learning_rate": 1.0893816673644643e-06, |
|
"loss": 0.2836, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"grad_norm": 1.3661463800498177, |
|
"learning_rate": 1.0634808115824667e-06, |
|
"loss": 0.2822, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.359375, |
|
"grad_norm": 1.285521847136074, |
|
"learning_rate": 1.0380802559444444e-06, |
|
"loss": 0.2788, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 1.3609559873792854, |
|
"learning_rate": 1.013187535438278e-06, |
|
"loss": 0.2815, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.390625, |
|
"grad_norm": 1.359498367844339, |
|
"learning_rate": 9.888100344042926e-07, |
|
"loss": 0.279, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"grad_norm": 1.2578896443023726, |
|
"learning_rate": 9.649549843447213e-07, |
|
"loss": 0.2837, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.421875, |
|
"grad_norm": 1.476169547857346, |
|
"learning_rate": 9.416294617785033e-07, |
|
"loss": 0.2846, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"grad_norm": 1.3700496792517187, |
|
"learning_rate": 9.188403861420614e-07, |
|
"loss": 0.2841, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.453125, |
|
"grad_norm": 1.278136583382333, |
|
"learning_rate": 8.965945177366716e-07, |
|
"loss": 0.2802, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"grad_norm": 1.321201564334253, |
|
"learning_rate": 8.74898455723044e-07, |
|
"loss": 0.2854, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.484375, |
|
"grad_norm": 1.3469922385355766, |
|
"learning_rate": 8.537586361637059e-07, |
|
"loss": 0.2815, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.3453576727439425, |
|
"learning_rate": 8.331813301137644e-07, |
|
"loss": 0.2864, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.515625, |
|
"grad_norm": 1.2960156031598227, |
|
"learning_rate": 8.131726417606181e-07, |
|
"loss": 0.2739, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"grad_norm": 1.3651173071468319, |
|
"learning_rate": 7.937385066131745e-07, |
|
"loss": 0.2795, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.546875, |
|
"grad_norm": 1.3217889886043823, |
|
"learning_rate": 7.748846897410985e-07, |
|
"loss": 0.2734, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"grad_norm": 1.3037202509770986, |
|
"learning_rate": 7.566167840646245e-07, |
|
"loss": 0.2847, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.578125, |
|
"grad_norm": 1.2986387865278517, |
|
"learning_rate": 7.389402086954368e-07, |
|
"loss": 0.2948, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"grad_norm": 1.2549602691888244, |
|
"learning_rate": 7.218602073291095e-07, |
|
"loss": 0.2842, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.609375, |
|
"grad_norm": 1.2411990452005732, |
|
"learning_rate": 7.053818466895791e-07, |
|
"loss": 0.279, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 1.2694405064326646, |
|
"learning_rate": 6.895100150261206e-07, |
|
"loss": 0.2868, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.640625, |
|
"grad_norm": 1.3041289401671792, |
|
"learning_rate": 6.742494206632659e-07, |
|
"loss": 0.285, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 1.2477084403906091, |
|
"learning_rate": 6.596045906040921e-07, |
|
"loss": 0.2823, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.671875, |
|
"grad_norm": 1.2407685713907555, |
|
"learning_rate": 6.455798691873041e-07, |
|
"loss": 0.2845, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"grad_norm": 1.2726899580839428, |
|
"learning_rate": 6.32179416798501e-07, |
|
"loss": 0.2891, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.703125, |
|
"grad_norm": 1.2629501555738192, |
|
"learning_rate": 6.194072086360125e-07, |
|
"loss": 0.2779, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"grad_norm": 1.3031032470418795, |
|
"learning_rate": 6.072670335316676e-07, |
|
"loss": 0.2793, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.734375, |
|
"grad_norm": 1.2682201497592243, |
|
"learning_rate": 5.957624928268528e-07, |
|
"loss": 0.2855, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 1.3301507903535867, |
|
"learning_rate": 5.848969993041866e-07, |
|
"loss": 0.2788, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.765625, |
|
"grad_norm": 1.2869688012601592, |
|
"learning_rate": 5.746737761751279e-07, |
|
"loss": 0.2843, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"grad_norm": 1.304345949854902, |
|
"learning_rate": 5.650958561238212e-07, |
|
"loss": 0.2848, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.796875, |
|
"grad_norm": 1.2612206521119516, |
|
"learning_rate": 5.561660804074635e-07, |
|
"loss": 0.2784, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 1.21963172307497, |
|
"learning_rate": 5.478870980134524e-07, |
|
"loss": 0.2787, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.828125, |
|
"grad_norm": 1.2820291216583481, |
|
"learning_rate": 5.402613648735749e-07, |
|
"loss": 0.2827, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"grad_norm": 1.270150906269018, |
|
"learning_rate": 5.332911431354621e-07, |
|
"loss": 0.2753, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.859375, |
|
"grad_norm": 1.2373120076036523, |
|
"learning_rate": 5.269785004915327e-07, |
|
"loss": 0.2842, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 1.3257054430640673, |
|
"learning_rate": 5.213253095656177e-07, |
|
"loss": 0.2794, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.890625, |
|
"grad_norm": 1.2939668010311147, |
|
"learning_rate": 5.16333247357453e-07, |
|
"loss": 0.285, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"grad_norm": 1.2762831271442217, |
|
"learning_rate": 5.120037947452043e-07, |
|
"loss": 0.2808, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.921875, |
|
"grad_norm": 1.2672944555649501, |
|
"learning_rate": 5.083382360461682e-07, |
|
"loss": 0.2828, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"grad_norm": 1.253289043826509, |
|
"learning_rate": 5.05337658635785e-07, |
|
"loss": 0.2859, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.953125, |
|
"grad_norm": 1.3369028630930568, |
|
"learning_rate": 5.030029526250719e-07, |
|
"loss": 0.2758, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"grad_norm": 1.2821831872418814, |
|
"learning_rate": 5.013348105965748e-07, |
|
"loss": 0.2853, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.984375, |
|
"grad_norm": 1.350766870220701, |
|
"learning_rate": 5.003337273989165e-07, |
|
"loss": 0.2841, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.2850731256398582, |
|
"learning_rate": 5e-07, |
|
"loss": 0.287, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.4800361096858978, |
|
"eval_runtime": 67.4827, |
|
"eval_samples_per_second": 255.473, |
|
"eval_steps_per_second": 1.008, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 1920, |
|
"total_flos": 3216071511244800.0, |
|
"train_loss": 0.3934497516602278, |
|
"train_runtime": 11864.3784, |
|
"train_samples_per_second": 82.821, |
|
"train_steps_per_second": 0.162 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1920, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3216071511244800.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|