{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 134, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 594.4356994628906, "epoch": 0.03731343283582089, "grad_norm": 0.1971098631620407, "kl": 0.00011806488037109375, "learning_rate": 1.0714285714285716e-06, "loss": 0.0, "reward": 0.6617346815764904, "reward_std": 0.3125086955726147, "rewards/accuracy_reward": 0.6617346815764904, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 630.1530487060547, "epoch": 0.07462686567164178, "grad_norm": 0.18470077216625214, "kl": 0.0004942655563354492, "learning_rate": 2.142857142857143e-06, "loss": 0.0, "reward": 0.6520408019423485, "reward_std": 0.2983996603637934, "rewards/accuracy_reward": 0.651530598104, "rewards/format_reward": 0.0005102040711790323, "step": 10 }, { "completion_length": 610.2959053039551, "epoch": 0.11194029850746269, "grad_norm": 0.052106305956840515, "kl": 0.0017307758331298827, "learning_rate": 2.999485987463336e-06, "loss": 0.0001, "reward": 0.7372448824346065, "reward_std": 0.2600953433662653, "rewards/accuracy_reward": 0.7372448824346065, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 611.4326400756836, "epoch": 0.14925373134328357, "grad_norm": 0.025398777797818184, "kl": 0.0024486541748046874, "learning_rate": 2.981532510892707e-06, "loss": 0.0001, "reward": 0.7372448846697808, "reward_std": 0.2582925198599696, "rewards/accuracy_reward": 0.7372448846697808, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 613.5106994628907, "epoch": 0.1865671641791045, "grad_norm": 0.012594266794621944, "kl": 0.002337074279785156, "learning_rate": 2.9382296023022897e-06, "loss": 0.0001, "reward": 0.735204067081213, "reward_std": 0.20146227926015853, "rewards/accuracy_reward": 0.735204067081213, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 565.3295829772949, "epoch": 0.22388059701492538, "grad_norm": 0.013191360048949718, "kl": 0.0028142929077148438, "learning_rate": 2.8703181864639013e-06, "loss": 0.0001, "reward": 0.7913265138864517, "reward_std": 0.18436302840709687, "rewards/accuracy_reward": 0.7913265138864517, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 572.9270294189453, "epoch": 0.26119402985074625, "grad_norm": 0.011080138385295868, "kl": 0.0028873443603515624, "learning_rate": 2.7789602465311384e-06, "loss": 0.0001, "reward": 0.7826530456542968, "reward_std": 0.17215402722358703, "rewards/accuracy_reward": 0.7826530456542968, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 585.7760055541992, "epoch": 0.29850746268656714, "grad_norm": 0.015958011150360107, "kl": 0.003158760070800781, "learning_rate": 2.6657189421854562e-06, "loss": 0.0001, "reward": 0.7872448831796646, "reward_std": 0.1621189709752798, "rewards/accuracy_reward": 0.7872448831796646, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 570.9520301818848, "epoch": 0.3358208955223881, "grad_norm": 0.02447451651096344, "kl": 0.00324554443359375, "learning_rate": 2.532531863540631e-06, "loss": 0.0001, "reward": 0.7755101852118969, "reward_std": 0.16486475374549628, "rewards/accuracy_reward": 0.7755101852118969, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 578.0943778991699, "epoch": 0.373134328358209, "grad_norm": 0.013620010577142239, "kl": 0.003280830383300781, "learning_rate": 2.3816778784387097e-06, "loss": 0.0001, "reward": 0.8040816187858582, "reward_std": 0.1476132795214653, "rewards/accuracy_reward": 0.8040816187858582, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 578.3994766235352, "epoch": 0.41044776119402987, "grad_norm": 0.016489438712596893, "kl": 0.003299522399902344, "learning_rate": 2.2157381403894125e-06, "loss": 0.0001, "reward": 0.7852040603756905, "reward_std": 0.15987344700843095, "rewards/accuracy_reward": 0.7852040603756905, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 570.6836616516114, "epoch": 0.44776119402985076, "grad_norm": 0.014856858178973198, "kl": 0.003414154052734375, "learning_rate": 2.03755192431795e-06, "loss": 0.0001, "reward": 0.7525510065257549, "reward_std": 0.16357502806931734, "rewards/accuracy_reward": 0.7525510065257549, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 557.9943771362305, "epoch": 0.48507462686567165, "grad_norm": 0.013422299176454544, "kl": 0.00412750244140625, "learning_rate": 1.8501680457838584e-06, "loss": 0.0002, "reward": 0.7938775330781936, "reward_std": 0.15019273720681667, "rewards/accuracy_reward": 0.7938775330781936, "rewards/format_reward": 0.0, "step": 65 }, { "completion_length": 597.3453926086426, "epoch": 0.5223880597014925, "grad_norm": 0.014206411316990852, "kl": 0.0033687591552734376, "learning_rate": 1.6567926949014804e-06, "loss": 0.0001, "reward": 0.7632652923464776, "reward_std": 0.17450573313981294, "rewards/accuracy_reward": 0.7632652923464776, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 553.4872329711914, "epoch": 0.5597014925373134, "grad_norm": 0.01648498699069023, "kl": 0.0038028717041015624, "learning_rate": 1.4607345775381906e-06, "loss": 0.0002, "reward": 0.7770408011972905, "reward_std": 0.158019458130002, "rewards/accuracy_reward": 0.7770408011972905, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 561.41937789917, "epoch": 0.5970149253731343, "grad_norm": 0.014115195721387863, "kl": 0.0034923553466796875, "learning_rate": 1.2653483024396534e-06, "loss": 0.0001, "reward": 0.7872448809444904, "reward_std": 0.14782564975321294, "rewards/accuracy_reward": 0.7872448809444904, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 531.1652923583985, "epoch": 0.6343283582089553, "grad_norm": 0.01905824802815914, "kl": 0.004101181030273437, "learning_rate": 1.073976982944116e-06, "loss": 0.0002, "reward": 0.7877550825476647, "reward_std": 0.16267810724675655, "rewards/accuracy_reward": 0.7877550825476647, "rewards/format_reward": 0.0, "step": 85 }, { "completion_length": 545.3045799255372, "epoch": 0.6716417910447762, "grad_norm": 0.021177947521209717, "kl": 0.004157257080078125, "learning_rate": 8.898950353863e-07, "loss": 0.0002, "reward": 0.7816326349973679, "reward_std": 0.1574961107224226, "rewards/accuracy_reward": 0.7816326349973679, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 568.895905303955, "epoch": 0.7089552238805971, "grad_norm": 0.02039457857608795, "kl": 0.0038570404052734376, "learning_rate": 7.162521529260768e-07, "loss": 0.0002, "reward": 0.7908163070678711, "reward_std": 0.17509947922080754, "rewards/accuracy_reward": 0.7903061032295227, "rewards/format_reward": 0.0005102040711790323, "step": 95 }, { "completion_length": 588.9494773864747, "epoch": 0.746268656716418, "grad_norm": 0.013423638418316841, "kl": 0.003804779052734375, "learning_rate": 5.560194134252441e-07, "loss": 0.0002, "reward": 0.7403061062097549, "reward_std": 0.17368433568626643, "rewards/accuracy_reward": 0.7403061062097549, "rewards/format_reward": 0.0, "step": 100 }, { "completion_length": 560.371418762207, "epoch": 0.7835820895522388, "grad_norm": 0.0290207602083683, "kl": 0.0039691925048828125, "learning_rate": 4.1193844348156887e-07, "loss": 0.0002, "reward": 0.7642856985330582, "reward_std": 0.18369458429515362, "rewards/accuracy_reward": 0.7637754946947097, "rewards/format_reward": 0.0005102040711790323, "step": 105 }, { "completion_length": 560.294888305664, "epoch": 0.8208955223880597, "grad_norm": 0.02158834971487522, "kl": 0.004729461669921875, "learning_rate": 2.86474508437579e-07, "loss": 0.0002, "reward": 0.7780612073838711, "reward_std": 0.1638113146647811, "rewards/accuracy_reward": 0.7724489636719227, "rewards/format_reward": 0.005612244782969356, "step": 110 }, { "completion_length": 550.3765190124511, "epoch": 0.8582089552238806, "grad_norm": 0.1028282567858696, "kl": 0.01652069091796875, "learning_rate": 1.8177433100705209e-07, "loss": 0.0007, "reward": 1.107142834365368, "reward_std": 0.38318478502333164, "rewards/accuracy_reward": 0.7530612118542195, "rewards/format_reward": 0.3540816267952323, "step": 115 }, { "completion_length": 556.8137657165528, "epoch": 0.8955223880597015, "grad_norm": 0.05659456178545952, "kl": 0.0318084716796875, "learning_rate": 9.962936025419756e-08, "loss": 0.0013, "reward": 1.6938775151968002, "reward_std": 0.296724752895534, "rewards/accuracy_reward": 0.7999999821186066, "rewards/format_reward": 0.8938775390386582, "step": 120 }, { "completion_length": 569.5673370361328, "epoch": 0.9328358208955224, "grad_norm": 0.03255928307771683, "kl": 0.0380767822265625, "learning_rate": 4.144511940348516e-08, "loss": 0.0015, "reward": 1.7311224043369293, "reward_std": 0.26802414935082197, "rewards/accuracy_reward": 0.7811224296689033, "rewards/format_reward": 0.9499999895691872, "step": 125 }, { "completion_length": 524.7413139343262, "epoch": 0.9701492537313433, "grad_norm": 0.22196683287620544, "kl": 0.0370025634765625, "learning_rate": 8.217156947590065e-09, "loss": 0.0015, "reward": 1.7397958785295486, "reward_std": 0.26053950041532514, "rewards/accuracy_reward": 0.7872448846697807, "rewards/format_reward": 0.9525510028004647, "step": 130 }, { "completion_length": 524.1900415420532, "epoch": 1.0, "kl": 0.03165435791015625, "reward": 1.7072703689336777, "reward_std": 0.254364542895928, "rewards/accuracy_reward": 0.7646683501079679, "rewards/format_reward": 0.9426020290702581, "step": 134, "total_flos": 0.0, "train_loss": 0.00029843695958687075, "train_runtime": 20680.6754, "train_samples_per_second": 0.363, "train_steps_per_second": 0.006 } ], "logging_steps": 5, "max_steps": 134, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }