{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992429977289932, "eval_steps": 100, "global_step": 165, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 2.9411764705882356e-07, "logits/chosen": -2.62508487701416, "logits/rejected": -2.638840436935425, "logps/chosen": -313.21063232421875, "logps/rejected": -286.36663818359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "learning_rate": 2.9411764705882355e-06, "logits/chosen": -2.7004079818725586, "logits/rejected": -2.6217572689056396, "logps/chosen": -292.9493408203125, "logps/rejected": -278.7856140136719, "loss": 0.6926, "rewards/accuracies": 0.5069444179534912, "rewards/chosen": 0.0015960136661306024, "rewards/margins": 0.0010866459924727678, "rewards/rejected": 0.0005093678482808173, "step": 10 }, { "epoch": 0.12, "learning_rate": 4.994932636402032e-06, "logits/chosen": -2.690582752227783, "logits/rejected": -2.671006917953491, "logps/chosen": -273.6416931152344, "logps/rejected": -290.06622314453125, "loss": 0.6854, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.04393266513943672, "rewards/margins": 0.014766323380172253, "rewards/rejected": 0.029166344553232193, "step": 20 }, { "epoch": 0.18, "learning_rate": 4.905416503522124e-06, "logits/chosen": -2.6617255210876465, "logits/rejected": -2.585472345352173, "logps/chosen": -288.24456787109375, "logps/rejected": -275.30908203125, "loss": 0.6639, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.08098463714122772, "rewards/margins": 0.06556984782218933, "rewards/rejected": 0.015414801426231861, "step": 30 }, { "epoch": 0.24, "learning_rate": 4.707922373336524e-06, "logits/chosen": -2.5689034461975098, "logits/rejected": -2.5172557830810547, "logps/chosen": -297.8088684082031, "logps/rejected": -299.01019287109375, "loss": 0.6496, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.03176301717758179, "rewards/margins": 0.09769946336746216, "rewards/rejected": -0.12946248054504395, "step": 40 }, { "epoch": 0.3, "learning_rate": 4.411315662967732e-06, "logits/chosen": -2.543713331222534, "logits/rejected": -2.471020221710205, "logps/chosen": -278.70068359375, "logps/rejected": -281.05767822265625, "loss": 0.645, "rewards/accuracies": 0.640625, "rewards/chosen": 0.058358293026685715, "rewards/margins": 0.13286305963993073, "rewards/rejected": -0.07450475543737411, "step": 50 }, { "epoch": 0.36, "learning_rate": 4.028910905897229e-06, "logits/chosen": -2.5148937702178955, "logits/rejected": -2.403398036956787, "logps/chosen": -313.97503662109375, "logps/rejected": -300.5794677734375, "loss": 0.6317, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1467473804950714, "rewards/margins": 0.1627379208803177, "rewards/rejected": -0.3094852566719055, "step": 60 }, { "epoch": 0.42, "learning_rate": 3.577874068920446e-06, "logits/chosen": -2.4615416526794434, "logits/rejected": -2.3834948539733887, "logps/chosen": -288.96875, "logps/rejected": -298.4138488769531, "loss": 0.6272, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0141445966437459, "rewards/margins": 0.17550477385520935, "rewards/rejected": -0.16136017441749573, "step": 70 }, { "epoch": 0.48, "learning_rate": 3.0784519801008546e-06, "logits/chosen": -2.386679172515869, "logits/rejected": -2.308007001876831, "logps/chosen": -303.1172180175781, "logps/rejected": -317.23577880859375, "loss": 0.6276, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.03143765777349472, "rewards/margins": 0.20620755851268768, "rewards/rejected": -0.237645223736763, "step": 80 }, { "epoch": 0.55, "learning_rate": 2.553063458334059e-06, "logits/chosen": -2.4485552310943604, "logits/rejected": -2.3585047721862793, "logps/chosen": -294.64202880859375, "logps/rejected": -314.85906982421875, "loss": 0.6264, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.18389078974723816, "rewards/margins": 0.19124503433704376, "rewards/rejected": -0.3751358091831207, "step": 90 }, { "epoch": 0.61, "learning_rate": 2.025292943281429e-06, "logits/chosen": -2.4612982273101807, "logits/rejected": -2.3962552547454834, "logps/chosen": -300.9443359375, "logps/rejected": -299.62554931640625, "loss": 0.6237, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.03618014603853226, "rewards/margins": 0.2104295939207077, "rewards/rejected": -0.24660976231098175, "step": 100 }, { "epoch": 0.61, "eval_logits/chosen": -2.440356492996216, "eval_logits/rejected": -2.3331212997436523, "eval_logps/chosen": -314.80450439453125, "eval_logps/rejected": -316.8028259277344, "eval_loss": 0.6047022938728333, "eval_rewards/accuracies": 0.6980000138282776, "eval_rewards/chosen": -0.14116904139518738, "eval_rewards/margins": 0.23404958844184875, "eval_rewards/rejected": -0.37521862983703613, "eval_runtime": 384.1798, "eval_samples_per_second": 5.206, "eval_steps_per_second": 0.651, "step": 100 }, { "epoch": 0.67, "learning_rate": 1.5188318011445907e-06, "logits/chosen": -2.4451894760131836, "logits/rejected": -2.3738484382629395, "logps/chosen": -297.38006591796875, "logps/rejected": -310.2391662597656, "loss": 0.6156, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.10568475723266602, "rewards/margins": 0.23291108012199402, "rewards/rejected": -0.33859583735466003, "step": 110 }, { "epoch": 0.73, "learning_rate": 1.0564148305586296e-06, "logits/chosen": -2.5074470043182373, "logits/rejected": -2.3541178703308105, "logps/chosen": -313.4942932128906, "logps/rejected": -304.71240234375, "loss": 0.6023, "rewards/accuracies": 0.671875, "rewards/chosen": -0.05554385855793953, "rewards/margins": 0.2575618028640747, "rewards/rejected": -0.31310564279556274, "step": 120 }, { "epoch": 0.79, "learning_rate": 6.587997083462197e-07, "logits/chosen": -2.472149133682251, "logits/rejected": -2.410820960998535, "logps/chosen": -306.9402770996094, "logps/rejected": -340.479736328125, "loss": 0.6055, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.10773856937885284, "rewards/margins": 0.24569562077522278, "rewards/rejected": -0.35343414545059204, "step": 130 }, { "epoch": 0.85, "learning_rate": 3.438351873250492e-07, "logits/chosen": -2.4470582008361816, "logits/rejected": -2.354292392730713, "logps/chosen": -300.5553283691406, "logps/rejected": -334.2596435546875, "loss": 0.6132, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.13135434687137604, "rewards/margins": 0.2554669976234436, "rewards/rejected": -0.38682132959365845, "step": 140 }, { "epoch": 0.91, "learning_rate": 1.2565987432367032e-07, "logits/chosen": -2.466301679611206, "logits/rejected": -2.3889572620391846, "logps/chosen": -304.37078857421875, "logps/rejected": -318.76507568359375, "loss": 0.6197, "rewards/accuracies": 0.671875, "rewards/chosen": -0.13493719696998596, "rewards/margins": 0.24466891586780548, "rewards/rejected": -0.379606157541275, "step": 150 }, { "epoch": 0.97, "learning_rate": 1.4067554877743861e-08, "logits/chosen": -2.437718152999878, "logits/rejected": -2.32914662361145, "logps/chosen": -297.7317810058594, "logps/rejected": -310.4150085449219, "loss": 0.5989, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.10253523290157318, "rewards/margins": 0.2659505009651184, "rewards/rejected": -0.3684857487678528, "step": 160 }, { "epoch": 1.0, "step": 165, "total_flos": 0.0, "train_loss": 0.6320372126319191, "train_runtime": 7509.7506, "train_samples_per_second": 2.814, "train_steps_per_second": 0.022 } ], "logging_steps": 10, "max_steps": 165, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }