{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3929273084479371, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09823182711198428, "grad_norm": 4.68359375, "learning_rate": 5e-07, "logits/chosen": -1.882665991783142, "logits/rejected": -2.022334098815918, "logps/chosen": -629.2000122070312, "logps/rejected": -886.400146484375, "loss": 0.6197, "rewards/accuracies": 0.39500001072883606, "rewards/chosen": 1.2083027362823486, "rewards/margins": 0.816202700138092, "rewards/rejected": 0.392100065946579, "step": 50 }, { "epoch": 0.09823182711198428, "eval_logits/chosen": -1.8351960182189941, "eval_logits/rejected": -1.8982521295547485, "eval_logps/chosen": -827.6643676757812, "eval_logps/rejected": -769.7268676757812, "eval_loss": 0.21161487698554993, "eval_rewards/accuracies": 0.7060185074806213, "eval_rewards/chosen": 4.600923538208008, "eval_rewards/margins": 3.7983312606811523, "eval_rewards/rejected": 0.8025919795036316, "eval_runtime": 376.5489, "eval_samples_per_second": 1.145, "eval_steps_per_second": 0.574, "step": 50 }, { "epoch": 0.19646365422396855, "grad_norm": 1.431640625, "learning_rate": 1e-06, "logits/chosen": -1.9131250381469727, "logits/rejected": -2.0282812118530273, "logps/chosen": -584.5037231445312, "logps/rejected": -919.989990234375, "loss": 0.3185, "rewards/accuracies": 0.5550000071525574, "rewards/chosen": 4.304075241088867, "rewards/margins": 5.650895595550537, "rewards/rejected": -1.3468197584152222, "step": 100 }, { "epoch": 0.19646365422396855, "eval_logits/chosen": -1.8304036855697632, "eval_logits/rejected": -1.8951867818832397, "eval_logps/chosen": -808.5324096679688, "eval_logps/rejected": -775.2742919921875, "eval_loss": 0.2037331759929657, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.5165486335754395, "eval_rewards/margins": 6.269077777862549, "eval_rewards/rejected": 
0.24747034907341003, "eval_runtime": 376.7007, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.573, "step": 100 }, { "epoch": 0.29469548133595286, "grad_norm": 0.00038909912109375, "learning_rate": 5e-07, "logits/chosen": -1.900234341621399, "logits/rejected": -1.9972655773162842, "logps/chosen": -539.0875244140625, "logps/rejected": -859.8375244140625, "loss": 0.334, "rewards/accuracies": 0.5274999737739563, "rewards/chosen": 4.29106330871582, "rewards/margins": 8.129581451416016, "rewards/rejected": -3.838517427444458, "step": 150 }, { "epoch": 0.29469548133595286, "eval_logits/chosen": -1.8221435546875, "eval_logits/rejected": -1.8842185735702515, "eval_logps/chosen": -818.3194580078125, "eval_logps/rejected": -790.0972290039062, "eval_loss": 0.20332251489162445, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 5.53895902633667, "eval_rewards/margins": 6.775155067443848, "eval_rewards/rejected": -1.236195683479309, "eval_runtime": 376.4567, "eval_samples_per_second": 1.145, "eval_steps_per_second": 0.574, "step": 150 }, { "epoch": 0.3929273084479371, "grad_norm": 0.80126953125, "learning_rate": 0.0, "logits/chosen": -1.889101505279541, "logits/rejected": null, "logps/chosen": -537.0787353515625, "logps/rejected": -937.9553833007812, "loss": 0.3, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 4.2822651863098145, "rewards/margins": 9.660238265991211, "rewards/rejected": -5.377972602844238, "step": 200 }, { "epoch": 0.3929273084479371, "eval_logits/chosen": -1.8241690397262573, "eval_logits/rejected": -1.8862168788909912, "eval_logps/chosen": -818.3449096679688, "eval_logps/rejected": -792.08447265625, "eval_loss": 0.20316672325134277, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 5.535775661468506, "eval_rewards/margins": 6.967732906341553, "eval_rewards/rejected": -1.4319567680358887, "eval_runtime": 376.1581, "eval_samples_per_second": 1.146, "eval_steps_per_second": 0.574, "step": 200 }, { 
"epoch": 0.3929273084479371, "step": 200, "total_flos": 0.0, "train_loss": 0.39305298328399657, "train_runtime": 44907.6338, "train_samples_per_second": 0.036, "train_steps_per_second": 0.004 } ], "logging_steps": 50, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }