{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9986738753442823, "eval_steps": 200, "global_step": 445, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02244210955829848, "grad_norm": 19.15482521057129, "learning_rate": 2.222222222222222e-06, "loss": 3.8476, "step": 10 }, { "epoch": 0.04488421911659696, "grad_norm": 5.446422576904297, "learning_rate": 4.444444444444444e-06, "loss": 3.2218, "step": 20 }, { "epoch": 0.06732632867489544, "grad_norm": 1.8523049354553223, "learning_rate": 6.666666666666667e-06, "loss": 2.9756, "step": 30 }, { "epoch": 0.08976843823319391, "grad_norm": 2.139192581176758, "learning_rate": 8.888888888888888e-06, "loss": 2.88, "step": 40 }, { "epoch": 0.11221054779149241, "grad_norm": 1.853474497795105, "learning_rate": 9.996145181203616e-06, "loss": 2.8198, "step": 50 }, { "epoch": 0.13465265734979087, "grad_norm": 1.501637578010559, "learning_rate": 9.965342284774633e-06, "loss": 2.8122, "step": 60 }, { "epoch": 0.15709476690808935, "grad_norm": 2.0072269439697266, "learning_rate": 9.903926402016153e-06, "loss": 2.7806, "step": 70 }, { "epoch": 0.17953687646638783, "grad_norm": 1.7332258224487305, "learning_rate": 9.812276182268236e-06, "loss": 2.7422, "step": 80 }, { "epoch": 0.2019789860246863, "grad_norm": 1.3256088495254517, "learning_rate": 9.690956679612422e-06, "loss": 2.736, "step": 90 }, { "epoch": 0.22442109558298481, "grad_norm": 1.6238477230072021, "learning_rate": 9.540715869125407e-06, "loss": 2.7361, "step": 100 }, { "epoch": 0.2468632051412833, "grad_norm": 1.326378583908081, "learning_rate": 9.362480035363987e-06, "loss": 2.7135, "step": 110 }, { "epoch": 0.26930531469958174, "grad_norm": 1.3376497030258179, "learning_rate": 9.157348061512728e-06, "loss": 2.7064, "step": 120 }, { "epoch": 0.29174742425788025, "grad_norm": 1.2815560102462769, "learning_rate": 8.926584654403725e-06, "loss": 2.7018, "step": 130 }, { "epoch": 0.3141895338161787, "grad_norm": 1.5868873596191406, "learning_rate": 8.671612547178428e-06, "loss": 2.6961, "step": 140 }, { "epoch": 0.3366316433744772, "grad_norm": 1.366570234298706, "learning_rate": 8.39400372766471e-06, "loss": 2.6968, "step": 150 }, { "epoch": 0.35907375293277566, "grad_norm": 1.6603009700775146, "learning_rate": 8.095469746549172e-06, "loss": 2.6879, "step": 160 }, { "epoch": 0.38151586249107416, "grad_norm": 1.4688373804092407, "learning_rate": 7.777851165098012e-06, "loss": 2.6686, "step": 170 }, { "epoch": 0.4039579720493726, "grad_norm": 1.2386434078216553, "learning_rate": 7.443106207484776e-06, "loss": 2.6497, "step": 180 }, { "epoch": 0.4264000816076711, "grad_norm": 1.3002716302871704, "learning_rate": 7.093298687687141e-06, "loss": 2.6413, "step": 190 }, { "epoch": 0.44884219116596963, "grad_norm": 1.2603603601455688, "learning_rate": 6.730585285387465e-06, "loss": 2.6472, "step": 200 }, { "epoch": 0.44884219116596963, "eval_loss": 2.642993450164795, "eval_runtime": 1294.6794, "eval_samples_per_second": 148.064, "eval_steps_per_second": 0.842, "step": 200 }, { "epoch": 0.4712843007242681, "grad_norm": 1.2680917978286743, "learning_rate": 6.3572022493253715e-06, "loss": 2.6369, "step": 210 }, { "epoch": 0.4937264102825666, "grad_norm": 1.3160443305969238, "learning_rate": 5.975451610080643e-06, "loss": 2.63, "step": 220 }, { "epoch": 0.5161685198408651, "grad_norm": 1.2467771768569946, "learning_rate": 5.587686987289189e-06, "loss": 2.6209, "step": 230 }, { "epoch": 0.5386106293991635, "grad_norm": 1.208018183708191, "learning_rate": 5.1962990787953436e-06, "loss": 2.6318, "step": 240 }, { "epoch": 0.561052738957462, "grad_norm": 1.2416397333145142, "learning_rate": 4.803700921204659e-06, "loss": 2.621, "step": 250 }, { "epoch": 0.5834948485157605, "grad_norm": 1.1826361417770386, "learning_rate": 4.4123130127108125e-06, "loss": 2.6161, "step": 260 }, { "epoch": 0.605936958074059, "grad_norm": 1.2550407648086548, "learning_rate": 4.02454838991936e-06, "loss": 2.6164, "step": 270 }, { "epoch": 0.6283790676323574, "grad_norm": 1.2681384086608887, "learning_rate": 3.6427977506746293e-06, "loss": 2.6091, "step": 280 }, { "epoch": 0.6508211771906559, "grad_norm": 1.2637056112289429, "learning_rate": 3.269414714612534e-06, "loss": 2.5967, "step": 290 }, { "epoch": 0.6732632867489544, "grad_norm": 1.211774468421936, "learning_rate": 2.906701312312861e-06, "loss": 2.6031, "step": 300 }, { "epoch": 0.6957053963072529, "grad_norm": 1.1411036252975464, "learning_rate": 2.5568937925152272e-06, "loss": 2.6014, "step": 310 }, { "epoch": 0.7181475058655513, "grad_norm": 1.1422080993652344, "learning_rate": 2.2221488349019903e-06, "loss": 2.5978, "step": 320 }, { "epoch": 0.7405896154238498, "grad_norm": 1.172059416770935, "learning_rate": 1.9045302534508298e-06, "loss": 2.5911, "step": 330 }, { "epoch": 0.7630317249821483, "grad_norm": 1.1655080318450928, "learning_rate": 1.6059962723352912e-06, "loss": 2.5913, "step": 340 }, { "epoch": 0.7854738345404468, "grad_norm": 1.1286932229995728, "learning_rate": 1.3283874528215735e-06, "loss": 2.5819, "step": 350 }, { "epoch": 0.8079159440987452, "grad_norm": 1.1322216987609863, "learning_rate": 1.0734153455962765e-06, "loss": 2.5833, "step": 360 }, { "epoch": 0.8303580536570437, "grad_norm": 1.1392606496810913, "learning_rate": 8.426519384872733e-07, "loss": 2.5851, "step": 370 }, { "epoch": 0.8528001632153422, "grad_norm": 1.1811796426773071, "learning_rate": 6.375199646360142e-07, "loss": 2.5853, "step": 380 }, { "epoch": 0.8752422727736408, "grad_norm": 1.1267277002334595, "learning_rate": 4.5928413087459325e-07, "loss": 2.5832, "step": 390 }, { "epoch": 0.8976843823319393, "grad_norm": 1.116821527481079, "learning_rate": 3.0904332038757977e-07, "loss": 2.5779, "step": 400 }, { "epoch": 0.8976843823319393, "eval_loss": 2.582942485809326, "eval_runtime": 1274.0632, "eval_samples_per_second": 150.46, "eval_steps_per_second": 0.856, "step": 400 }, { "epoch": 0.9201264918902377, "grad_norm": 1.1507278680801392, "learning_rate": 1.8772381773176417e-07, "loss": 2.5833, "step": 410 }, { "epoch": 0.9425686014485362, "grad_norm": 1.0935174226760864, "learning_rate": 9.607359798384785e-08, "loss": 2.597, "step": 420 }, { "epoch": 0.9650107110068347, "grad_norm": 1.1115341186523438, "learning_rate": 3.465771522536854e-08, "loss": 2.5725, "step": 430 }, { "epoch": 0.9874528205651332, "grad_norm": 1.131402611732483, "learning_rate": 3.854818796385495e-09, "loss": 2.575, "step": 440 }, { "epoch": 0.9986738753442823, "step": 445, "total_flos": 7.860958022007259e+18, "train_loss": 2.698132219207421, "train_runtime": 39941.2631, "train_samples_per_second": 43.195, "train_steps_per_second": 0.011 } ], "logging_steps": 10, "max_steps": 445, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.860958022007259e+18, "train_batch_size": 22, "trial_name": null, "trial_params": null }