|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9997407311381903, |
|
"eval_steps": 50, |
|
"global_step": 964, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010370754472387866, |
|
"grad_norm": 2.4068312644958496, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 4.4724, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.020741508944775732, |
|
"grad_norm": 2.3241941928863525, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 4.5137, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0311122634171636, |
|
"grad_norm": 2.4529693126678467, |
|
"learning_rate": 1.5e-06, |
|
"loss": 4.431, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.041483017889551464, |
|
"grad_norm": 2.5506527423858643, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 4.4615, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05185377236193933, |
|
"grad_norm": 2.6286089420318604, |
|
"learning_rate": 2.5e-06, |
|
"loss": 4.4173, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05185377236193933, |
|
"eval_loss": 4.529317855834961, |
|
"eval_runtime": 43.0684, |
|
"eval_samples_per_second": 79.594, |
|
"eval_steps_per_second": 9.961, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0622245268343272, |
|
"grad_norm": 2.2027931213378906, |
|
"learning_rate": 3e-06, |
|
"loss": 4.3936, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07259528130671507, |
|
"grad_norm": 2.632085084915161, |
|
"learning_rate": 3.5e-06, |
|
"loss": 4.4038, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08296603577910293, |
|
"grad_norm": 2.330366849899292, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 4.3844, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09333679025149079, |
|
"grad_norm": 2.4520134925842285, |
|
"learning_rate": 4.5e-06, |
|
"loss": 4.3325, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.10370754472387866, |
|
"grad_norm": 2.727679491043091, |
|
"learning_rate": 5e-06, |
|
"loss": 4.2768, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.10370754472387866, |
|
"eval_loss": 4.328857898712158, |
|
"eval_runtime": 43.0263, |
|
"eval_samples_per_second": 79.672, |
|
"eval_steps_per_second": 9.971, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11407829919626652, |
|
"grad_norm": 2.3905959129333496, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 4.1575, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1244490536686544, |
|
"grad_norm": 2.3810746669769287, |
|
"learning_rate": 6e-06, |
|
"loss": 4.1188, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13481980814104227, |
|
"grad_norm": 2.3154499530792236, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 4.0751, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14519056261343014, |
|
"grad_norm": 2.404163360595703, |
|
"learning_rate": 7e-06, |
|
"loss": 3.9433, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.155561317085818, |
|
"grad_norm": 2.620729446411133, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 3.9579, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.155561317085818, |
|
"eval_loss": 3.87788724899292, |
|
"eval_runtime": 43.1648, |
|
"eval_samples_per_second": 79.417, |
|
"eval_steps_per_second": 9.939, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16593207155820586, |
|
"grad_norm": 2.6772756576538086, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 3.7662, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.17630282603059372, |
|
"grad_norm": 2.6104724407196045, |
|
"learning_rate": 8.5e-06, |
|
"loss": 3.6483, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.18667358050298158, |
|
"grad_norm": 2.636183023452759, |
|
"learning_rate": 9e-06, |
|
"loss": 3.4924, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.19704433497536947, |
|
"grad_norm": 2.9193673133850098, |
|
"learning_rate": 9.5e-06, |
|
"loss": 3.33, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.20741508944775733, |
|
"grad_norm": 2.378948926925659, |
|
"learning_rate": 1e-05, |
|
"loss": 3.1667, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.20741508944775733, |
|
"eval_loss": 3.0792782306671143, |
|
"eval_runtime": 43.0269, |
|
"eval_samples_per_second": 79.671, |
|
"eval_steps_per_second": 9.971, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2177858439201452, |
|
"grad_norm": 3.6606717109680176, |
|
"learning_rate": 9.86910994764398e-06, |
|
"loss": 2.9038, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.22815659839253305, |
|
"grad_norm": 4.638175964355469, |
|
"learning_rate": 9.73821989528796e-06, |
|
"loss": 2.7723, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2385273528649209, |
|
"grad_norm": 5.681021690368652, |
|
"learning_rate": 9.607329842931939e-06, |
|
"loss": 2.4375, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2488981073373088, |
|
"grad_norm": 3.9302401542663574, |
|
"learning_rate": 9.476439790575916e-06, |
|
"loss": 2.2828, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.25926886180969666, |
|
"grad_norm": 3.4748728275299072, |
|
"learning_rate": 9.345549738219896e-06, |
|
"loss": 2.1372, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.25926886180969666, |
|
"eval_loss": 2.027852773666382, |
|
"eval_runtime": 42.9733, |
|
"eval_samples_per_second": 79.77, |
|
"eval_steps_per_second": 9.983, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.26963961628208455, |
|
"grad_norm": 2.240591049194336, |
|
"learning_rate": 9.214659685863875e-06, |
|
"loss": 2.0402, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2800103707544724, |
|
"grad_norm": 2.12862229347229, |
|
"learning_rate": 9.083769633507853e-06, |
|
"loss": 1.8311, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.29038112522686027, |
|
"grad_norm": 1.9156771898269653, |
|
"learning_rate": 8.952879581151834e-06, |
|
"loss": 1.7948, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3007518796992481, |
|
"grad_norm": 1.2717920541763306, |
|
"learning_rate": 8.821989528795813e-06, |
|
"loss": 1.7745, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.311122634171636, |
|
"grad_norm": 0.9778507947921753, |
|
"learning_rate": 8.691099476439791e-06, |
|
"loss": 1.6066, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.311122634171636, |
|
"eval_loss": 1.6197232007980347, |
|
"eval_runtime": 43.0004, |
|
"eval_samples_per_second": 79.72, |
|
"eval_steps_per_second": 9.977, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3214933886440238, |
|
"grad_norm": 0.966334879398346, |
|
"learning_rate": 8.56020942408377e-06, |
|
"loss": 1.6183, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3318641431164117, |
|
"grad_norm": 0.8336134552955627, |
|
"learning_rate": 8.429319371727749e-06, |
|
"loss": 1.543, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.3422348975887996, |
|
"grad_norm": 0.7293752431869507, |
|
"learning_rate": 8.298429319371727e-06, |
|
"loss": 1.5888, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.35260565206118744, |
|
"grad_norm": 0.7492266297340393, |
|
"learning_rate": 8.167539267015708e-06, |
|
"loss": 1.5612, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3629764065335753, |
|
"grad_norm": 0.8373680710792542, |
|
"learning_rate": 8.036649214659686e-06, |
|
"loss": 1.547, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3629764065335753, |
|
"eval_loss": 1.571603536605835, |
|
"eval_runtime": 43.1239, |
|
"eval_samples_per_second": 79.492, |
|
"eval_steps_per_second": 9.948, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.37334716100596316, |
|
"grad_norm": 0.9682691097259521, |
|
"learning_rate": 7.905759162303665e-06, |
|
"loss": 1.6005, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.38371791547835105, |
|
"grad_norm": 0.6970401406288147, |
|
"learning_rate": 7.774869109947646e-06, |
|
"loss": 1.6102, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.39408866995073893, |
|
"grad_norm": 0.8149111866950989, |
|
"learning_rate": 7.643979057591624e-06, |
|
"loss": 1.5331, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.40445942442312677, |
|
"grad_norm": 0.6417681574821472, |
|
"learning_rate": 7.513089005235603e-06, |
|
"loss": 1.5559, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.41483017889551466, |
|
"grad_norm": 0.669866144657135, |
|
"learning_rate": 7.382198952879581e-06, |
|
"loss": 1.6237, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.41483017889551466, |
|
"eval_loss": 1.5569473505020142, |
|
"eval_runtime": 43.0517, |
|
"eval_samples_per_second": 79.625, |
|
"eval_steps_per_second": 9.965, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4252009333679025, |
|
"grad_norm": 0.7108224630355835, |
|
"learning_rate": 7.25130890052356e-06, |
|
"loss": 1.5205, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.4355716878402904, |
|
"grad_norm": 0.772306501865387, |
|
"learning_rate": 7.12041884816754e-06, |
|
"loss": 1.4833, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.44594244231267827, |
|
"grad_norm": 0.8170768618583679, |
|
"learning_rate": 6.989528795811519e-06, |
|
"loss": 1.506, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4563131967850661, |
|
"grad_norm": 0.7127036452293396, |
|
"learning_rate": 6.858638743455498e-06, |
|
"loss": 1.642, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.466683951257454, |
|
"grad_norm": 1.1019853353500366, |
|
"learning_rate": 6.727748691099477e-06, |
|
"loss": 1.5815, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.466683951257454, |
|
"eval_loss": 1.5491901636123657, |
|
"eval_runtime": 42.974, |
|
"eval_samples_per_second": 79.769, |
|
"eval_steps_per_second": 9.983, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4770547057298418, |
|
"grad_norm": 0.7836682200431824, |
|
"learning_rate": 6.5968586387434565e-06, |
|
"loss": 1.479, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.4874254602022297, |
|
"grad_norm": 0.8299842476844788, |
|
"learning_rate": 6.465968586387435e-06, |
|
"loss": 1.4768, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.4977962146746176, |
|
"grad_norm": 0.7423719763755798, |
|
"learning_rate": 6.335078534031414e-06, |
|
"loss": 1.5919, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5081669691470054, |
|
"grad_norm": 0.7347830533981323, |
|
"learning_rate": 6.204188481675393e-06, |
|
"loss": 1.4697, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5185377236193933, |
|
"grad_norm": 0.8458806276321411, |
|
"learning_rate": 6.073298429319372e-06, |
|
"loss": 1.5822, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5185377236193933, |
|
"eval_loss": 1.5439085960388184, |
|
"eval_runtime": 43.1032, |
|
"eval_samples_per_second": 79.53, |
|
"eval_steps_per_second": 9.953, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5289084780917812, |
|
"grad_norm": 0.8292895555496216, |
|
"learning_rate": 5.942408376963351e-06, |
|
"loss": 1.5543, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5392792325641691, |
|
"grad_norm": 0.7892965078353882, |
|
"learning_rate": 5.81151832460733e-06, |
|
"loss": 1.6241, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5496499870365569, |
|
"grad_norm": 0.8499513268470764, |
|
"learning_rate": 5.680628272251309e-06, |
|
"loss": 1.4915, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5600207415089448, |
|
"grad_norm": 0.8531098365783691, |
|
"learning_rate": 5.549738219895289e-06, |
|
"loss": 1.5094, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5703914959813327, |
|
"grad_norm": 0.7012779116630554, |
|
"learning_rate": 5.418848167539268e-06, |
|
"loss": 1.5539, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5703914959813327, |
|
"eval_loss": 1.5399216413497925, |
|
"eval_runtime": 43.067, |
|
"eval_samples_per_second": 79.597, |
|
"eval_steps_per_second": 9.961, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5807622504537205, |
|
"grad_norm": 0.7626951336860657, |
|
"learning_rate": 5.287958115183246e-06, |
|
"loss": 1.5038, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.5911330049261084, |
|
"grad_norm": 0.8458223938941956, |
|
"learning_rate": 5.157068062827225e-06, |
|
"loss": 1.5217, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6015037593984962, |
|
"grad_norm": 0.8810559511184692, |
|
"learning_rate": 5.026178010471204e-06, |
|
"loss": 1.6896, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6118745138708841, |
|
"grad_norm": 0.9249419569969177, |
|
"learning_rate": 4.895287958115184e-06, |
|
"loss": 1.5184, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.622245268343272, |
|
"grad_norm": 0.7158748507499695, |
|
"learning_rate": 4.764397905759163e-06, |
|
"loss": 1.5405, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.622245268343272, |
|
"eval_loss": 1.5371109247207642, |
|
"eval_runtime": 43.1456, |
|
"eval_samples_per_second": 79.452, |
|
"eval_steps_per_second": 9.943, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6326160228156599, |
|
"grad_norm": 0.8123712539672852, |
|
"learning_rate": 4.633507853403142e-06, |
|
"loss": 1.4703, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6429867772880477, |
|
"grad_norm": 0.8977182507514954, |
|
"learning_rate": 4.502617801047121e-06, |
|
"loss": 1.5568, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6533575317604355, |
|
"grad_norm": 0.8391156792640686, |
|
"learning_rate": 4.3717277486910996e-06, |
|
"loss": 1.5993, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6637282862328234, |
|
"grad_norm": 0.7252123355865479, |
|
"learning_rate": 4.240837696335079e-06, |
|
"loss": 1.5162, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6740990407052113, |
|
"grad_norm": 0.7567150592803955, |
|
"learning_rate": 4.109947643979058e-06, |
|
"loss": 1.5821, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6740990407052113, |
|
"eval_loss": 1.5346648693084717, |
|
"eval_runtime": 43.0456, |
|
"eval_samples_per_second": 79.637, |
|
"eval_steps_per_second": 9.966, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6844697951775992, |
|
"grad_norm": 0.6526748538017273, |
|
"learning_rate": 3.9790575916230365e-06, |
|
"loss": 1.5429, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.694840549649987, |
|
"grad_norm": 0.7770061492919922, |
|
"learning_rate": 3.848167539267016e-06, |
|
"loss": 1.497, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.7052113041223749, |
|
"grad_norm": 0.6573889255523682, |
|
"learning_rate": 3.717277486910995e-06, |
|
"loss": 1.6247, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.7155820585947628, |
|
"grad_norm": 0.9382066130638123, |
|
"learning_rate": 3.5863874345549743e-06, |
|
"loss": 1.5577, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.7259528130671506, |
|
"grad_norm": 0.9911208748817444, |
|
"learning_rate": 3.455497382198953e-06, |
|
"loss": 1.4734, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7259528130671506, |
|
"eval_loss": 1.5329481363296509, |
|
"eval_runtime": 43.1201, |
|
"eval_samples_per_second": 79.499, |
|
"eval_steps_per_second": 9.949, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7363235675395385, |
|
"grad_norm": 0.8948063850402832, |
|
"learning_rate": 3.324607329842932e-06, |
|
"loss": 1.5257, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.7466943220119263, |
|
"grad_norm": 1.0471000671386719, |
|
"learning_rate": 3.1937172774869113e-06, |
|
"loss": 1.5289, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.7570650764843142, |
|
"grad_norm": 0.7089968323707581, |
|
"learning_rate": 3.0628272251308904e-06, |
|
"loss": 1.5721, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.7674358309567021, |
|
"grad_norm": 0.9314925074577332, |
|
"learning_rate": 2.931937172774869e-06, |
|
"loss": 1.4879, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.77780658542909, |
|
"grad_norm": 0.8222401142120361, |
|
"learning_rate": 2.8010471204188483e-06, |
|
"loss": 1.5909, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.77780658542909, |
|
"eval_loss": 1.5315285921096802, |
|
"eval_runtime": 43.1263, |
|
"eval_samples_per_second": 79.487, |
|
"eval_steps_per_second": 9.948, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.7881773399014779, |
|
"grad_norm": 0.7002791166305542, |
|
"learning_rate": 2.6701570680628274e-06, |
|
"loss": 1.5853, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.7985480943738656, |
|
"grad_norm": 0.7302571535110474, |
|
"learning_rate": 2.5392670157068065e-06, |
|
"loss": 1.4632, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.8089188488462535, |
|
"grad_norm": 0.785142719745636, |
|
"learning_rate": 2.4083769633507856e-06, |
|
"loss": 1.505, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.8192896033186414, |
|
"grad_norm": 0.6490882039070129, |
|
"learning_rate": 2.2774869109947643e-06, |
|
"loss": 1.4813, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.8296603577910293, |
|
"grad_norm": 0.7147834897041321, |
|
"learning_rate": 2.1465968586387435e-06, |
|
"loss": 1.4852, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8296603577910293, |
|
"eval_loss": 1.5305155515670776, |
|
"eval_runtime": 43.0366, |
|
"eval_samples_per_second": 79.653, |
|
"eval_steps_per_second": 9.968, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8400311122634172, |
|
"grad_norm": 0.742734432220459, |
|
"learning_rate": 2.0157068062827226e-06, |
|
"loss": 1.4627, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.850401866735805, |
|
"grad_norm": 0.7220650315284729, |
|
"learning_rate": 1.8848167539267017e-06, |
|
"loss": 1.4692, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.8607726212081929, |
|
"grad_norm": 0.8684506416320801, |
|
"learning_rate": 1.7539267015706806e-06, |
|
"loss": 1.56, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.8711433756805808, |
|
"grad_norm": 0.7521070241928101, |
|
"learning_rate": 1.6230366492146598e-06, |
|
"loss": 1.5089, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.8815141301529686, |
|
"grad_norm": 0.9445785284042358, |
|
"learning_rate": 1.4921465968586387e-06, |
|
"loss": 1.6033, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.8815141301529686, |
|
"eval_loss": 1.5298349857330322, |
|
"eval_runtime": 42.9802, |
|
"eval_samples_per_second": 79.758, |
|
"eval_steps_per_second": 9.981, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.8918848846253565, |
|
"grad_norm": 0.7844976186752319, |
|
"learning_rate": 1.361256544502618e-06, |
|
"loss": 1.5412, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.9022556390977443, |
|
"grad_norm": 0.9173896312713623, |
|
"learning_rate": 1.230366492146597e-06, |
|
"loss": 1.585, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.9126263935701322, |
|
"grad_norm": 0.7674463391304016, |
|
"learning_rate": 1.099476439790576e-06, |
|
"loss": 1.533, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.9229971480425201, |
|
"grad_norm": 0.901545524597168, |
|
"learning_rate": 9.685863874345552e-07, |
|
"loss": 1.6416, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.933367902514908, |
|
"grad_norm": 0.760588526725769, |
|
"learning_rate": 8.376963350785341e-07, |
|
"loss": 1.6217, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.933367902514908, |
|
"eval_loss": 1.529255747795105, |
|
"eval_runtime": 42.9676, |
|
"eval_samples_per_second": 79.781, |
|
"eval_steps_per_second": 9.984, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9437386569872959, |
|
"grad_norm": 0.780006468296051, |
|
"learning_rate": 7.068062827225131e-07, |
|
"loss": 1.5711, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.9541094114596836, |
|
"grad_norm": 0.6572290062904358, |
|
"learning_rate": 5.759162303664922e-07, |
|
"loss": 1.5525, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.9644801659320715, |
|
"grad_norm": 0.7653405666351318, |
|
"learning_rate": 4.4502617801047125e-07, |
|
"loss": 1.5585, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.9748509204044594, |
|
"grad_norm": 0.9417358636856079, |
|
"learning_rate": 3.1413612565445027e-07, |
|
"loss": 1.5995, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.9852216748768473, |
|
"grad_norm": 0.752137303352356, |
|
"learning_rate": 1.8324607329842932e-07, |
|
"loss": 1.6332, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.9852216748768473, |
|
"eval_loss": 1.5290166139602661, |
|
"eval_runtime": 43.1488, |
|
"eval_samples_per_second": 79.446, |
|
"eval_steps_per_second": 9.942, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.9955924293492352, |
|
"grad_norm": 0.7827558517456055, |
|
"learning_rate": 5.235602094240838e-08, |
|
"loss": 1.5047, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.9997407311381903, |
|
"step": 964, |
|
"total_flos": 9.238171939032269e+16, |
|
"train_loss": 2.1392775007303326, |
|
"train_runtime": 1823.6358, |
|
"train_samples_per_second": 16.917, |
|
"train_steps_per_second": 0.529 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 964, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.238171939032269e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|