{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997407311381903, "eval_steps": 50, "global_step": 964, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010370754472387866, "grad_norm": 2.4068312644958496, "learning_rate": 5.000000000000001e-07, "loss": 4.4724, "step": 10 }, { "epoch": 0.020741508944775732, "grad_norm": 2.3241941928863525, "learning_rate": 1.0000000000000002e-06, "loss": 4.5137, "step": 20 }, { "epoch": 0.0311122634171636, "grad_norm": 2.4529693126678467, "learning_rate": 1.5e-06, "loss": 4.431, "step": 30 }, { "epoch": 0.041483017889551464, "grad_norm": 2.5506527423858643, "learning_rate": 2.0000000000000003e-06, "loss": 4.4615, "step": 40 }, { "epoch": 0.05185377236193933, "grad_norm": 2.6286089420318604, "learning_rate": 2.5e-06, "loss": 4.4173, "step": 50 }, { "epoch": 0.05185377236193933, "eval_loss": 4.529317855834961, "eval_runtime": 43.0684, "eval_samples_per_second": 79.594, "eval_steps_per_second": 9.961, "step": 50 }, { "epoch": 0.0622245268343272, "grad_norm": 2.2027931213378906, "learning_rate": 3e-06, "loss": 4.3936, "step": 60 }, { "epoch": 0.07259528130671507, "grad_norm": 2.632085084915161, "learning_rate": 3.5e-06, "loss": 4.4038, "step": 70 }, { "epoch": 0.08296603577910293, "grad_norm": 2.330366849899292, "learning_rate": 4.000000000000001e-06, "loss": 4.3844, "step": 80 }, { "epoch": 0.09333679025149079, "grad_norm": 2.4520134925842285, "learning_rate": 4.5e-06, "loss": 4.3325, "step": 90 }, { "epoch": 0.10370754472387866, "grad_norm": 2.727679491043091, "learning_rate": 5e-06, "loss": 4.2768, "step": 100 }, { "epoch": 0.10370754472387866, "eval_loss": 4.328857898712158, "eval_runtime": 43.0263, "eval_samples_per_second": 79.672, "eval_steps_per_second": 9.971, "step": 100 }, { "epoch": 0.11407829919626652, "grad_norm": 2.3905959129333496, "learning_rate": 5.500000000000001e-06, "loss": 4.1575, "step": 110 }, { "epoch": 0.1244490536686544, "grad_norm": 2.3810746669769287, "learning_rate": 6e-06, "loss": 4.1188, "step": 120 }, { "epoch": 0.13481980814104227, "grad_norm": 2.3154499530792236, "learning_rate": 6.5000000000000004e-06, "loss": 4.0751, "step": 130 }, { "epoch": 0.14519056261343014, "grad_norm": 2.404163360595703, "learning_rate": 7e-06, "loss": 3.9433, "step": 140 }, { "epoch": 0.155561317085818, "grad_norm": 2.620729446411133, "learning_rate": 7.500000000000001e-06, "loss": 3.9579, "step": 150 }, { "epoch": 0.155561317085818, "eval_loss": 3.87788724899292, "eval_runtime": 43.1648, "eval_samples_per_second": 79.417, "eval_steps_per_second": 9.939, "step": 150 }, { "epoch": 0.16593207155820586, "grad_norm": 2.6772756576538086, "learning_rate": 8.000000000000001e-06, "loss": 3.7662, "step": 160 }, { "epoch": 0.17630282603059372, "grad_norm": 2.6104724407196045, "learning_rate": 8.5e-06, "loss": 3.6483, "step": 170 }, { "epoch": 0.18667358050298158, "grad_norm": 2.636183023452759, "learning_rate": 9e-06, "loss": 3.4924, "step": 180 }, { "epoch": 0.19704433497536947, "grad_norm": 2.9193673133850098, "learning_rate": 9.5e-06, "loss": 3.33, "step": 190 }, { "epoch": 0.20741508944775733, "grad_norm": 2.378948926925659, "learning_rate": 1e-05, "loss": 3.1667, "step": 200 }, { "epoch": 0.20741508944775733, "eval_loss": 3.0792782306671143, "eval_runtime": 43.0269, "eval_samples_per_second": 79.671, "eval_steps_per_second": 9.971, "step": 200 }, { "epoch": 0.2177858439201452, "grad_norm": 3.6606717109680176, "learning_rate": 9.86910994764398e-06, "loss": 2.9038, "step": 210 }, { "epoch": 0.22815659839253305, "grad_norm": 4.638175964355469, "learning_rate": 9.73821989528796e-06, "loss": 2.7723, "step": 220 }, { "epoch": 0.2385273528649209, "grad_norm": 5.681021690368652, "learning_rate": 9.607329842931939e-06, "loss": 2.4375, "step": 230 }, { "epoch": 0.2488981073373088, "grad_norm": 3.9302401542663574, "learning_rate": 9.476439790575916e-06, "loss": 2.2828, "step": 240 }, { "epoch": 0.25926886180969666, "grad_norm": 3.4748728275299072, "learning_rate": 9.345549738219896e-06, "loss": 2.1372, "step": 250 }, { "epoch": 0.25926886180969666, "eval_loss": 2.027852773666382, "eval_runtime": 42.9733, "eval_samples_per_second": 79.77, "eval_steps_per_second": 9.983, "step": 250 }, { "epoch": 0.26963961628208455, "grad_norm": 2.240591049194336, "learning_rate": 9.214659685863875e-06, "loss": 2.0402, "step": 260 }, { "epoch": 0.2800103707544724, "grad_norm": 2.12862229347229, "learning_rate": 9.083769633507853e-06, "loss": 1.8311, "step": 270 }, { "epoch": 0.29038112522686027, "grad_norm": 1.9156771898269653, "learning_rate": 8.952879581151834e-06, "loss": 1.7948, "step": 280 }, { "epoch": 0.3007518796992481, "grad_norm": 1.2717920541763306, "learning_rate": 8.821989528795813e-06, "loss": 1.7745, "step": 290 }, { "epoch": 0.311122634171636, "grad_norm": 0.9778507947921753, "learning_rate": 8.691099476439791e-06, "loss": 1.6066, "step": 300 }, { "epoch": 0.311122634171636, "eval_loss": 1.6197232007980347, "eval_runtime": 43.0004, "eval_samples_per_second": 79.72, "eval_steps_per_second": 9.977, "step": 300 }, { "epoch": 0.3214933886440238, "grad_norm": 0.966334879398346, "learning_rate": 8.56020942408377e-06, "loss": 1.6183, "step": 310 }, { "epoch": 0.3318641431164117, "grad_norm": 0.8336134552955627, "learning_rate": 8.429319371727749e-06, "loss": 1.543, "step": 320 }, { "epoch": 0.3422348975887996, "grad_norm": 0.7293752431869507, "learning_rate": 8.298429319371727e-06, "loss": 1.5888, "step": 330 }, { "epoch": 0.35260565206118744, "grad_norm": 0.7492266297340393, "learning_rate": 8.167539267015708e-06, "loss": 1.5612, "step": 340 }, { "epoch": 0.3629764065335753, "grad_norm": 0.8373680710792542, "learning_rate": 8.036649214659686e-06, "loss": 1.547, "step": 350 }, { "epoch": 0.3629764065335753, "eval_loss": 1.571603536605835, "eval_runtime": 43.1239, "eval_samples_per_second": 79.492, "eval_steps_per_second": 9.948, "step": 350 }, { "epoch": 0.37334716100596316, "grad_norm": 0.9682691097259521, "learning_rate": 7.905759162303665e-06, "loss": 1.6005, "step": 360 }, { "epoch": 0.38371791547835105, "grad_norm": 0.6970401406288147, "learning_rate": 7.774869109947646e-06, "loss": 1.6102, "step": 370 }, { "epoch": 0.39408866995073893, "grad_norm": 0.8149111866950989, "learning_rate": 7.643979057591624e-06, "loss": 1.5331, "step": 380 }, { "epoch": 0.40445942442312677, "grad_norm": 0.6417681574821472, "learning_rate": 7.513089005235603e-06, "loss": 1.5559, "step": 390 }, { "epoch": 0.41483017889551466, "grad_norm": 0.669866144657135, "learning_rate": 7.382198952879581e-06, "loss": 1.6237, "step": 400 }, { "epoch": 0.41483017889551466, "eval_loss": 1.5569473505020142, "eval_runtime": 43.0517, "eval_samples_per_second": 79.625, "eval_steps_per_second": 9.965, "step": 400 }, { "epoch": 0.4252009333679025, "grad_norm": 0.7108224630355835, "learning_rate": 7.25130890052356e-06, "loss": 1.5205, "step": 410 }, { "epoch": 0.4355716878402904, "grad_norm": 0.772306501865387, "learning_rate": 7.12041884816754e-06, "loss": 1.4833, "step": 420 }, { "epoch": 0.44594244231267827, "grad_norm": 0.8170768618583679, "learning_rate": 6.989528795811519e-06, "loss": 1.506, "step": 430 }, { "epoch": 0.4563131967850661, "grad_norm": 0.7127036452293396, "learning_rate": 6.858638743455498e-06, "loss": 1.642, "step": 440 }, { "epoch": 0.466683951257454, "grad_norm": 1.1019853353500366, "learning_rate": 6.727748691099477e-06, "loss": 1.5815, "step": 450 }, { "epoch": 0.466683951257454, "eval_loss": 1.5491901636123657, "eval_runtime": 42.974, "eval_samples_per_second": 79.769, "eval_steps_per_second": 9.983, "step": 450 }, { "epoch": 0.4770547057298418, "grad_norm": 0.7836682200431824, "learning_rate": 6.5968586387434565e-06, "loss": 1.479, "step": 460 }, { "epoch": 0.4874254602022297, "grad_norm": 0.8299842476844788, "learning_rate": 6.465968586387435e-06, "loss": 1.4768, "step": 470 }, { "epoch": 0.4977962146746176, "grad_norm": 0.7423719763755798, "learning_rate": 6.335078534031414e-06, "loss": 1.5919, "step": 480 }, { "epoch": 0.5081669691470054, "grad_norm": 0.7347830533981323, "learning_rate": 6.204188481675393e-06, "loss": 1.4697, "step": 490 }, { "epoch": 0.5185377236193933, "grad_norm": 0.8458806276321411, "learning_rate": 6.073298429319372e-06, "loss": 1.5822, "step": 500 }, { "epoch": 0.5185377236193933, "eval_loss": 1.5439085960388184, "eval_runtime": 43.1032, "eval_samples_per_second": 79.53, "eval_steps_per_second": 9.953, "step": 500 }, { "epoch": 0.5289084780917812, "grad_norm": 0.8292895555496216, "learning_rate": 5.942408376963351e-06, "loss": 1.5543, "step": 510 }, { "epoch": 0.5392792325641691, "grad_norm": 0.7892965078353882, "learning_rate": 5.81151832460733e-06, "loss": 1.6241, "step": 520 }, { "epoch": 0.5496499870365569, "grad_norm": 0.8499513268470764, "learning_rate": 5.680628272251309e-06, "loss": 1.4915, "step": 530 }, { "epoch": 0.5600207415089448, "grad_norm": 0.8531098365783691, "learning_rate": 5.549738219895289e-06, "loss": 1.5094, "step": 540 }, { "epoch": 0.5703914959813327, "grad_norm": 0.7012779116630554, "learning_rate": 5.418848167539268e-06, "loss": 1.5539, "step": 550 }, { "epoch": 0.5703914959813327, "eval_loss": 1.5399216413497925, "eval_runtime": 43.067, "eval_samples_per_second": 79.597, "eval_steps_per_second": 9.961, "step": 550 }, { "epoch": 0.5807622504537205, "grad_norm": 0.7626951336860657, "learning_rate": 5.287958115183246e-06, "loss": 1.5038, "step": 560 }, { "epoch": 0.5911330049261084, "grad_norm": 0.8458223938941956, "learning_rate": 5.157068062827225e-06, "loss": 1.5217, "step": 570 }, { "epoch": 0.6015037593984962, "grad_norm": 0.8810559511184692, "learning_rate": 5.026178010471204e-06, "loss": 1.6896, "step": 580 }, { "epoch": 0.6118745138708841, "grad_norm": 0.9249419569969177, "learning_rate": 4.895287958115184e-06, "loss": 1.5184, "step": 590 }, { "epoch": 0.622245268343272, "grad_norm": 0.7158748507499695, "learning_rate": 4.764397905759163e-06, "loss": 1.5405, "step": 600 }, { "epoch": 0.622245268343272, "eval_loss": 1.5371109247207642, "eval_runtime": 43.1456, "eval_samples_per_second": 79.452, "eval_steps_per_second": 9.943, "step": 600 }, { "epoch": 0.6326160228156599, "grad_norm": 0.8123712539672852, "learning_rate": 4.633507853403142e-06, "loss": 1.4703, "step": 610 }, { "epoch": 0.6429867772880477, "grad_norm": 0.8977182507514954, "learning_rate": 4.502617801047121e-06, "loss": 1.5568, "step": 620 }, { "epoch": 0.6533575317604355, "grad_norm": 0.8391156792640686, "learning_rate": 4.3717277486910996e-06, "loss": 1.5993, "step": 630 }, { "epoch": 0.6637282862328234, "grad_norm": 0.7252123355865479, "learning_rate": 4.240837696335079e-06, "loss": 1.5162, "step": 640 }, { "epoch": 0.6740990407052113, "grad_norm": 0.7567150592803955, "learning_rate": 4.109947643979058e-06, "loss": 1.5821, "step": 650 }, { "epoch": 0.6740990407052113, "eval_loss": 1.5346648693084717, "eval_runtime": 43.0456, "eval_samples_per_second": 79.637, "eval_steps_per_second": 9.966, "step": 650 }, { "epoch": 0.6844697951775992, "grad_norm": 0.6526748538017273, "learning_rate": 3.9790575916230365e-06, "loss": 1.5429, "step": 660 }, { "epoch": 0.694840549649987, "grad_norm": 0.7770061492919922, "learning_rate": 3.848167539267016e-06, "loss": 1.497, "step": 670 }, { "epoch": 0.7052113041223749, "grad_norm": 0.6573889255523682, "learning_rate": 3.717277486910995e-06, "loss": 1.6247, "step": 680 }, { "epoch": 0.7155820585947628, "grad_norm": 0.9382066130638123, "learning_rate": 3.5863874345549743e-06, "loss": 1.5577, "step": 690 }, { "epoch": 0.7259528130671506, "grad_norm": 0.9911208748817444, "learning_rate": 3.455497382198953e-06, "loss": 1.4734, "step": 700 }, { "epoch": 0.7259528130671506, "eval_loss": 1.5329481363296509, "eval_runtime": 43.1201, "eval_samples_per_second": 79.499, "eval_steps_per_second": 9.949, "step": 700 }, { "epoch": 0.7363235675395385, "grad_norm": 0.8948063850402832, "learning_rate": 3.324607329842932e-06, "loss": 1.5257, "step": 710 }, { "epoch": 0.7466943220119263, "grad_norm": 1.0471000671386719, "learning_rate": 3.1937172774869113e-06, "loss": 1.5289, "step": 720 }, { "epoch": 0.7570650764843142, "grad_norm": 0.7089968323707581, "learning_rate": 3.0628272251308904e-06, "loss": 1.5721, "step": 730 }, { "epoch": 0.7674358309567021, "grad_norm": 0.9314925074577332, "learning_rate": 2.931937172774869e-06, "loss": 1.4879, "step": 740 }, { "epoch": 0.77780658542909, "grad_norm": 0.8222401142120361, "learning_rate": 2.8010471204188483e-06, "loss": 1.5909, "step": 750 }, { "epoch": 0.77780658542909, "eval_loss": 1.5315285921096802, "eval_runtime": 43.1263, "eval_samples_per_second": 79.487, "eval_steps_per_second": 9.948, "step": 750 }, { "epoch": 0.7881773399014779, "grad_norm": 0.7002791166305542, "learning_rate": 2.6701570680628274e-06, "loss": 1.5853, "step": 760 }, { "epoch": 0.7985480943738656, "grad_norm": 0.7302571535110474, "learning_rate": 2.5392670157068065e-06, "loss": 1.4632, "step": 770 }, { "epoch": 0.8089188488462535, "grad_norm": 0.785142719745636, "learning_rate": 2.4083769633507856e-06, "loss": 1.505, "step": 780 }, { "epoch": 0.8192896033186414, "grad_norm": 0.6490882039070129, "learning_rate": 2.2774869109947643e-06, "loss": 1.4813, "step": 790 }, { "epoch": 0.8296603577910293, "grad_norm": 0.7147834897041321, "learning_rate": 2.1465968586387435e-06, "loss": 1.4852, "step": 800 }, { "epoch": 0.8296603577910293, "eval_loss": 1.5305155515670776, "eval_runtime": 43.0366, "eval_samples_per_second": 79.653, "eval_steps_per_second": 9.968, "step": 800 }, { "epoch": 0.8400311122634172, "grad_norm": 0.742734432220459, "learning_rate": 2.0157068062827226e-06, "loss": 1.4627, "step": 810 }, { "epoch": 0.850401866735805, "grad_norm": 0.7220650315284729, "learning_rate": 1.8848167539267017e-06, "loss": 1.4692, "step": 820 }, { "epoch": 0.8607726212081929, "grad_norm": 0.8684506416320801, "learning_rate": 1.7539267015706806e-06, "loss": 1.56, "step": 830 }, { "epoch": 0.8711433756805808, "grad_norm": 0.7521070241928101, "learning_rate": 1.6230366492146598e-06, "loss": 1.5089, "step": 840 }, { "epoch": 0.8815141301529686, "grad_norm": 0.9445785284042358, "learning_rate": 1.4921465968586387e-06, "loss": 1.6033, "step": 850 }, { "epoch": 0.8815141301529686, "eval_loss": 1.5298349857330322, "eval_runtime": 42.9802, "eval_samples_per_second": 79.758, "eval_steps_per_second": 9.981, "step": 850 }, { "epoch": 0.8918848846253565, "grad_norm": 0.7844976186752319, "learning_rate": 1.361256544502618e-06, "loss": 1.5412, "step": 860 }, { "epoch": 0.9022556390977443, "grad_norm": 0.9173896312713623, "learning_rate": 1.230366492146597e-06, "loss": 1.585, "step": 870 }, { "epoch": 0.9126263935701322, "grad_norm": 0.7674463391304016, "learning_rate": 1.099476439790576e-06, "loss": 1.533, "step": 880 }, { "epoch": 0.9229971480425201, "grad_norm": 0.901545524597168, "learning_rate": 9.685863874345552e-07, "loss": 1.6416, "step": 890 }, { "epoch": 0.933367902514908, "grad_norm": 0.760588526725769, "learning_rate": 8.376963350785341e-07, "loss": 1.6217, "step": 900 }, { "epoch": 0.933367902514908, "eval_loss": 1.529255747795105, "eval_runtime": 42.9676, "eval_samples_per_second": 79.781, "eval_steps_per_second": 9.984, "step": 900 }, { "epoch": 0.9437386569872959, "grad_norm": 0.780006468296051, "learning_rate": 7.068062827225131e-07, "loss": 1.5711, "step": 910 }, { "epoch": 0.9541094114596836, "grad_norm": 0.6572290062904358, "learning_rate": 5.759162303664922e-07, "loss": 1.5525, "step": 920 }, { "epoch": 0.9644801659320715, "grad_norm": 0.7653405666351318, "learning_rate": 4.4502617801047125e-07, "loss": 1.5585, "step": 930 }, { "epoch": 0.9748509204044594, "grad_norm": 0.9417358636856079, "learning_rate": 3.1413612565445027e-07, "loss": 1.5995, "step": 940 }, { "epoch": 0.9852216748768473, "grad_norm": 0.752137303352356, "learning_rate": 1.8324607329842932e-07, "loss": 1.6332, "step": 950 }, { "epoch": 0.9852216748768473, "eval_loss": 1.5290166139602661, "eval_runtime": 43.1488, "eval_samples_per_second": 79.446, "eval_steps_per_second": 9.942, "step": 950 }, { "epoch": 0.9955924293492352, "grad_norm": 0.7827558517456055, "learning_rate": 5.235602094240838e-08, "loss": 1.5047, "step": 960 }, { "epoch": 0.9997407311381903, "step": 964, "total_flos": 9.238171939032269e+16, "train_loss": 2.1392775007303326, "train_runtime": 1823.6358, "train_samples_per_second": 16.917, "train_steps_per_second": 0.529 } ], "logging_steps": 10, "max_steps": 964, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.238171939032269e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }