{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 67, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 354.5310432434082, "epoch": 0.14925373134328357, "grad_norm": 26.582666397094727, "kl": 0.02456921637058258, "learning_rate": 2.981532510892707e-06, "loss": 0.001, "reward": 0.38080358956940474, "reward_std": 0.3429007441736758, "rewards/accuracy_reward": 0.12935268479632214, "rewards/format_reward": 0.2514509041327983, "step": 10 }, { "completion_length": 82.56049494743347, "epoch": 0.29850746268656714, "grad_norm": 6.207046031951904, "kl": 0.192266845703125, "learning_rate": 2.6657189421854562e-06, "loss": 0.0077, "reward": 1.1937500540167094, "reward_std": 0.2890436253976077, "rewards/accuracy_reward": 0.2324776900582947, "rewards/format_reward": 0.9612723555415869, "step": 20 }, { "completion_length": 163.46440453529357, "epoch": 0.44776119402985076, "grad_norm": 0.9332170486450195, "kl": 0.28681182861328125, "learning_rate": 2.03755192431795e-06, "loss": 0.0115, "reward": 1.377120592445135, "reward_std": 0.35549838868901135, "rewards/accuracy_reward": 0.4252232332248241, "rewards/format_reward": 0.9518973540514708, "step": 30 }, { "completion_length": 289.2572672843933, "epoch": 0.5970149253731343, "grad_norm": 0.30436766147613525, "kl": 0.11348419189453125, "learning_rate": 1.2653483024396534e-06, "loss": 0.0045, "reward": 1.5142857864499093, "reward_std": 0.3293194776400924, "rewards/accuracy_reward": 0.5575893112458289, "rewards/format_reward": 0.9566964589059352, "step": 40 }, { "completion_length": 328.3682068824768, "epoch": 0.746268656716418, "grad_norm": 0.4712439179420471, "kl": 0.0519775390625, "learning_rate": 5.560194134252441e-07, "loss": 0.0021, "reward": 1.4938616767525672, "reward_std": 0.352480823546648, "rewards/accuracy_reward": 0.5599330620840192, "rewards/format_reward": 0.9339286085218191, "step": 50 }, { "completion_length": 326.48863105773927, "epoch": 0.8955223880597015, "grad_norm": 1.6945326328277588, "kl": 0.05061798095703125, "learning_rate": 9.962936025419756e-08, "loss": 0.002, "reward": 1.528236673772335, "reward_std": 0.33247090512886646, "rewards/accuracy_reward": 0.574107170663774, "rewards/format_reward": 0.9541294977068902, "step": 60 }, { "completion_length": 317.54422964368547, "epoch": 1.0, "kl": 0.05271693638392857, "reward": 1.535608057464872, "reward_std": 0.3223346844315529, "rewards/accuracy_reward": 0.5788690721882241, "rewards/format_reward": 0.9567389748990536, "step": 67, "total_flos": 0.0, "train_loss": 0.004519763499943178, "train_runtime": 8988.1744, "train_samples_per_second": 0.834, "train_steps_per_second": 0.007 } ], "logging_steps": 10, "max_steps": 67, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }