{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "eta": 0.0010000000474974513, "grad_norm": 18.85951805804989, "learning_rate": 3.125e-08, "logits/chosen": -2.2437264919281006, "logits/rejected": -2.1319897174835205, "logps/chosen": -136.11781311035156, "logps/pi_response": -276.34149169921875, "logps/ref_response": -276.34149169921875, "logps/rejected": -134.32876586914062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "eta": 0.0010000000474974513, "grad_norm": 15.166671167458636, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.3832309246063232, "logits/rejected": -2.3636457920074463, "logps/chosen": -155.76785278320312, "logps/pi_response": -274.42333984375, "logps/ref_response": -272.425048828125, "logps/rejected": -158.64793395996094, "loss": 0.6926, "rewards/accuracies": 0.4270833432674408, "rewards/chosen": -0.00683738524094224, "rewards/margins": -0.00022508477559313178, "rewards/rejected": -0.006612300407141447, "step": 10 }, { "epoch": 0.13, "eta": 0.0010000000474974513, "grad_norm": 15.006144425101914, "learning_rate": 4.989935734988097e-07, "logits/chosen": -2.3142848014831543, "logits/rejected": -2.337123394012451, "logps/chosen": -169.91624450683594, "logps/pi_response": -305.30267333984375, "logps/ref_response": -275.4255065917969, "logps/rejected": -177.8936767578125, "loss": 0.692, "rewards/accuracies": 0.515625, "rewards/chosen": -0.154428631067276, "rewards/margins": 0.013679690659046173, "rewards/rejected": -0.16810832917690277, "step": 20 }, { "epoch": 0.19, "eta": 0.0010000000474974513, "grad_norm": 25.3042204309977, "learning_rate": 4.877641290737883e-07, "logits/chosen": -2.33975887298584, "logits/rejected": -2.3151369094848633, "logps/chosen": -189.64102172851562, "logps/pi_response": -321.45294189453125, "logps/ref_response": -261.0726013183594, "logps/rejected": -194.24017333984375, "loss": 0.6936, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -0.33687421679496765, "rewards/margins": 0.005444393027573824, "rewards/rejected": -0.3423186242580414, "step": 30 }, { "epoch": 0.26, "eta": 0.0010000000474974513, "grad_norm": 14.197120755569808, "learning_rate": 4.646121984004665e-07, "logits/chosen": -2.4677834510803223, "logits/rejected": -2.4844822883605957, "logps/chosen": -176.27413940429688, "logps/pi_response": -294.74114990234375, "logps/ref_response": -256.48724365234375, "logps/rejected": -179.17926025390625, "loss": 0.6905, "rewards/accuracies": 0.578125, "rewards/chosen": -0.2434801161289215, "rewards/margins": -0.00023287050134968013, "rewards/rejected": -0.24324722588062286, "step": 40 }, { "epoch": 0.32, "eta": 0.0010000000474974513, "grad_norm": 15.243074464797877, "learning_rate": 4.3069871595684787e-07, "logits/chosen": -2.445664882659912, "logits/rejected": -2.4546258449554443, "logps/chosen": -189.74288940429688, "logps/pi_response": -307.9224548339844, "logps/ref_response": -267.48931884765625, "logps/rejected": -200.6833038330078, "loss": 0.6929, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.2953701615333557, "rewards/margins": 0.01801210641860962, "rewards/rejected": -0.3133822977542877, "step": 50 }, { "epoch": 0.38, "eta": 0.0010000000474974513, "grad_norm": 16.31963304577925, "learning_rate": 3.877242453630256e-07, "logits/chosen": -2.477487087249756, "logits/rejected": -2.4785385131835938, "logps/chosen": -185.6737518310547, "logps/pi_response": -291.3546447753906, "logps/ref_response": -254.33984375, "logps/rejected": -188.54415893554688, "loss": 0.6887, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.26406130194664, "rewards/margins": 0.008015439845621586, "rewards/rejected": -0.2720767557621002, "step": 60 }, { "epoch": 0.45, "eta": 0.0010000000474974513, "grad_norm": 16.071293511713314, "learning_rate": 3.378437060203357e-07, "logits/chosen": -2.359812021255493, "logits/rejected": -2.355583429336548, "logps/chosen": -209.6289520263672, "logps/pi_response": -323.3484802246094, "logps/ref_response": -260.3892517089844, "logps/rejected": -210.88949584960938, "loss": 0.6913, "rewards/accuracies": 0.515625, "rewards/chosen": -0.5319920778274536, "rewards/margins": 0.012827059254050255, "rewards/rejected": -0.5448191165924072, "step": 70 }, { "epoch": 0.51, "eta": 0.0010000000474974513, "grad_norm": 16.843259909148866, "learning_rate": 2.8355831645441387e-07, "logits/chosen": -2.4202404022216797, "logits/rejected": -2.362644910812378, "logps/chosen": -196.7368927001953, "logps/pi_response": -324.1815490722656, "logps/ref_response": -271.9532470703125, "logps/rejected": -200.54006958007812, "loss": 0.6866, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.402200847864151, "rewards/margins": 0.04481234401464462, "rewards/rejected": -0.4470131993293762, "step": 80 }, { "epoch": 0.58, "eta": 0.0010000000474974513, "grad_norm": 20.3973840448787, "learning_rate": 2.2759017277414164e-07, "logits/chosen": -2.416743278503418, "logits/rejected": -2.436403751373291, "logps/chosen": -202.49813842773438, "logps/pi_response": -311.49896240234375, "logps/ref_response": -254.697509765625, "logps/rejected": -198.43460083007812, "loss": 0.6881, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.3861594796180725, "rewards/margins": 0.029975295066833496, "rewards/rejected": -0.4161347448825836, "step": 90 }, { "epoch": 0.64, "eta": 0.0010000000474974513, "grad_norm": 19.031326931010696, "learning_rate": 1.7274575140626315e-07, "logits/chosen": -2.3818562030792236, "logits/rejected": -2.393977403640747, "logps/chosen": -200.36917114257812, "logps/pi_response": -338.3829040527344, "logps/ref_response": -261.9479064941406, "logps/rejected": -211.229736328125, "loss": 0.6832, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.5124812722206116, "rewards/margins": 0.037171028554439545, "rewards/rejected": -0.5496522188186646, "step": 100 }, { "epoch": 0.7, "eta": 0.0010000000474974513, "grad_norm": 15.411774484824894, "learning_rate": 1.2177518064852348e-07, "logits/chosen": -2.4000236988067627, "logits/rejected": -2.2974681854248047, "logps/chosen": -204.61512756347656, "logps/pi_response": -330.70831298828125, "logps/ref_response": -250.0836639404297, "logps/rejected": -208.4372100830078, "loss": 0.6858, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.5626230239868164, "rewards/margins": 0.039384625852108, "rewards/rejected": -0.6020076274871826, "step": 110 }, { "epoch": 0.77, "eta": 0.0010000000474974513, "grad_norm": 15.901638390226227, "learning_rate": 7.723433775328384e-08, "logits/chosen": -2.2554521560668945, "logits/rejected": -2.3232483863830566, "logps/chosen": -223.6305694580078, "logps/pi_response": -362.30291748046875, "logps/ref_response": -276.22747802734375, "logps/rejected": -226.0797119140625, "loss": 0.6871, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.6587863564491272, "rewards/margins": 0.01464476902037859, "rewards/rejected": -0.6734310984611511, "step": 120 }, { "epoch": 0.83, "eta": 0.0010000000474974513, "grad_norm": 15.415238088814565, "learning_rate": 4.1356686569674335e-08, "logits/chosen": -2.2723705768585205, "logits/rejected": -2.223027229309082, "logps/chosen": -215.4403839111328, "logps/pi_response": -351.7759094238281, "logps/ref_response": -266.7939453125, "logps/rejected": -221.5651397705078, "loss": 0.6829, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.6349462270736694, "rewards/margins": 0.02343112602829933, "rewards/rejected": -0.6583773493766785, "step": 130 }, { "epoch": 0.9, "eta": 0.0010000000474974513, "grad_norm": 17.415730450012774, "learning_rate": 1.5941282340065697e-08, "logits/chosen": -2.319291591644287, "logits/rejected": -2.372563600540161, "logps/chosen": -211.4011688232422, "logps/pi_response": -335.5188903808594, "logps/ref_response": -254.0056915283203, "logps/rejected": -212.92123413085938, "loss": 0.6817, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.5842172503471375, "rewards/margins": 0.024492263793945312, "rewards/rejected": -0.6087095141410828, "step": 140 }, { "epoch": 0.96, "eta": 0.0010000000474974513, "grad_norm": 16.173628365681967, "learning_rate": 2.2625595580163247e-09, "logits/chosen": -2.2677788734436035, "logits/rejected": -2.2873096466064453, "logps/chosen": -209.3054656982422, "logps/pi_response": -347.63079833984375, "logps/ref_response": -265.3609313964844, "logps/rejected": -218.402587890625, "loss": 0.6852, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5842490196228027, "rewards/margins": 0.023630866780877113, "rewards/rejected": -0.6078798770904541, "step": 150 }, { "epoch": 1.0, "step": 156, "total_flos": 0.0, "train_loss": 0.688222443828216, "train_runtime": 31827.1935, "train_samples_per_second": 0.628, "train_steps_per_second": 0.005 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }