{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9945, "eval_steps": 500, "global_step": 153, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "eta": 0.0010000000474974513, "grad_norm": 18.06976070111927, "learning_rate": 3.125e-08, "logits/chosen": -2.1194543838500977, "logits/rejected": -2.2610020637512207, "logps/chosen": -254.6973419189453, "logps/pi_response": -318.5512390136719, "logps/ref_response": -318.5512390136719, "logps/rejected": -224.19918823242188, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.07, "eta": 0.0010000000474974513, "grad_norm": 16.01968550655723, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.385725975036621, "logits/rejected": -2.3076765537261963, "logps/chosen": -218.1858673095703, "logps/pi_response": -268.26239013671875, "logps/ref_response": -266.3502197265625, "logps/rejected": -224.54605102539062, "loss": 0.6927, "rewards/accuracies": 0.44871795177459717, "rewards/chosen": -0.006226478144526482, "rewards/margins": 0.0014922022819519043, "rewards/rejected": -0.007718680426478386, "step": 10 }, { "epoch": 0.13, "eta": 0.0010000000474974513, "grad_norm": 16.02312687332099, "learning_rate": 4.989490450759331e-07, "logits/chosen": -2.397501230239868, "logits/rejected": -2.3145182132720947, "logps/chosen": -228.54135131835938, "logps/pi_response": -300.3511047363281, "logps/ref_response": -260.5257873535156, "logps/rejected": -246.08518981933594, "loss": 0.6874, "rewards/accuracies": 0.5230769515037537, "rewards/chosen": -0.22692929208278656, "rewards/margins": 0.027445880696177483, "rewards/rejected": -0.254375159740448, "step": 20 }, { "epoch": 0.2, "eta": 0.0010000000474974513, "grad_norm": 18.496071851642174, "learning_rate": 4.872270441827174e-07, "logits/chosen": -2.2651801109313965, "logits/rejected": -2.206923007965088, "logps/chosen": -264.01336669921875, "logps/pi_response": -342.78717041015625, "logps/ref_response": -260.0426940917969, "logps/rejected": -266.82000732421875, "loss": 0.696, "rewards/accuracies": 0.4730769097805023, "rewards/chosen": -0.6173264980316162, "rewards/margins": 0.023560278117656708, "rewards/rejected": -0.6408867835998535, "step": 30 }, { "epoch": 0.26, "eta": 0.0010000000474974513, "grad_norm": 17.411127458960003, "learning_rate": 4.6308512113530063e-07, "logits/chosen": -2.4821219444274902, "logits/rejected": -2.384908437728882, "logps/chosen": -243.66514587402344, "logps/pi_response": -308.2298583984375, "logps/ref_response": -255.49522399902344, "logps/rejected": -256.4287109375, "loss": 0.6903, "rewards/accuracies": 0.5538461804389954, "rewards/chosen": -0.3519066572189331, "rewards/margins": 0.04566844180226326, "rewards/rejected": -0.3975750505924225, "step": 40 }, { "epoch": 0.33, "eta": 0.0010000000474974513, "grad_norm": 14.517352316678341, "learning_rate": 4.277872161641681e-07, "logits/chosen": -2.545145273208618, "logits/rejected": -2.5242159366607666, "logps/chosen": -247.29806518554688, "logps/pi_response": -293.09326171875, "logps/ref_response": -275.4074401855469, "logps/rejected": -244.6110076904297, "loss": 0.6891, "rewards/accuracies": 0.557692289352417, "rewards/chosen": -0.1450691968202591, "rewards/margins": 0.017832614481449127, "rewards/rejected": -0.16290180385112762, "step": 50 }, { "epoch": 0.39, "eta": 0.0010000000474974513, "grad_norm": 17.626546770334993, "learning_rate": 3.8318133624280046e-07, "logits/chosen": -2.511488676071167, "logits/rejected": -2.480903148651123, "logps/chosen": -257.10791015625, "logps/pi_response": -302.22210693359375, "logps/ref_response": -270.9198303222656, "logps/rejected": -252.54173278808594, "loss": 0.6912, "rewards/accuracies": 0.48846152424812317, "rewards/chosen": -0.23286302387714386, "rewards/margins": 0.02372700721025467, "rewards/rejected": -0.2565900385379791, "step": 60 }, { "epoch": 0.46, "eta": 0.0010000000474974513, "grad_norm": 15.787453453048895, "learning_rate": 3.316028034595861e-07, "logits/chosen": -2.3820903301239014, "logits/rejected": -2.3555104732513428, "logps/chosen": -250.5824432373047, "logps/pi_response": -322.5419921875, "logps/ref_response": -270.929931640625, "logps/rejected": -260.3626403808594, "loss": 0.695, "rewards/accuracies": 0.557692289352417, "rewards/chosen": -0.37925985455513, "rewards/margins": 0.030029216781258583, "rewards/rejected": -0.40928906202316284, "step": 70 }, { "epoch": 0.52, "eta": 0.0010000000474974513, "grad_norm": 14.876878099596093, "learning_rate": 2.7575199021178855e-07, "logits/chosen": -2.463655948638916, "logits/rejected": -2.3770523071289062, "logps/chosen": -270.34197998046875, "logps/pi_response": -317.0186767578125, "logps/ref_response": -278.0060119628906, "logps/rejected": -269.89398193359375, "loss": 0.682, "rewards/accuracies": 0.5538461804389954, "rewards/chosen": -0.3352661728858948, "rewards/margins": 0.037235379219055176, "rewards/rejected": -0.37250155210494995, "step": 80 }, { "epoch": 0.58, "eta": 0.0010000000474974513, "grad_norm": 18.475251229694116, "learning_rate": 2.1855294234408068e-07, "logits/chosen": -2.37835693359375, "logits/rejected": -2.3291523456573486, "logps/chosen": -244.36837768554688, "logps/pi_response": -331.8450012207031, "logps/ref_response": -267.61846923828125, "logps/rejected": -280.1258544921875, "loss": 0.6786, "rewards/accuracies": 0.5692307949066162, "rewards/chosen": -0.39472243189811707, "rewards/margins": 0.06756081432104111, "rewards/rejected": -0.46228325366973877, "step": 90 }, { "epoch": 0.65, "eta": 0.0010000000474974513, "grad_norm": 21.467183952290178, "learning_rate": 1.6300029195778453e-07, "logits/chosen": -2.353746175765991, "logits/rejected": -2.2994236946105957, "logps/chosen": -259.2685546875, "logps/pi_response": -356.6775817871094, "logps/ref_response": -270.3107604980469, "logps/rejected": -271.2638854980469, "loss": 0.6687, "rewards/accuracies": 0.5769230723381042, "rewards/chosen": -0.4701143503189087, "rewards/margins": 0.08206918090581894, "rewards/rejected": -0.5521835088729858, "step": 100 }, { "epoch": 0.71, "eta": 0.0010000000474974513, "grad_norm": 22.52617676998604, "learning_rate": 1.1200247470632392e-07, "logits/chosen": -2.292710542678833, "logits/rejected": -2.357100486755371, "logps/chosen": -274.7545471191406, "logps/pi_response": -387.603515625, "logps/ref_response": -285.7787780761719, "logps/rejected": -272.741943359375, "loss": 0.6847, "rewards/accuracies": 0.4923076927661896, "rewards/chosen": -0.5492157936096191, "rewards/margins": 0.018456529825925827, "rewards/rejected": -0.5676723718643188, "step": 110 }, { "epoch": 0.78, "eta": 0.0010000000474974513, "grad_norm": 23.08131985031319, "learning_rate": 6.822945986946385e-08, "logits/chosen": -2.2610812187194824, "logits/rejected": -2.1860787868499756, "logps/chosen": -282.33782958984375, "logps/pi_response": -372.6535949707031, "logps/ref_response": -265.4132080078125, "logps/rejected": -290.5935363769531, "loss": 0.6808, "rewards/accuracies": 0.5461538434028625, "rewards/chosen": -0.5871608257293701, "rewards/margins": 0.05175128951668739, "rewards/rejected": -0.6389120817184448, "step": 120 }, { "epoch": 0.84, "eta": 0.0010000000474974513, "grad_norm": 22.961865819374246, "learning_rate": 3.397296523427806e-08, "logits/chosen": -2.2791786193847656, "logits/rejected": -2.31884503364563, "logps/chosen": -277.06085205078125, "logps/pi_response": -365.2495422363281, "logps/ref_response": -260.3804626464844, "logps/rejected": -295.7681884765625, "loss": 0.672, "rewards/accuracies": 0.5653846263885498, "rewards/chosen": -0.6707223057746887, "rewards/margins": 0.08594530820846558, "rewards/rejected": -0.7566676139831543, "step": 130 }, { "epoch": 0.91, "eta": 0.0010000000474974513, "grad_norm": 22.470744853394315, "learning_rate": 1.1026475173977978e-08, "logits/chosen": -2.3486456871032715, "logits/rejected": -2.201983690261841, "logps/chosen": -294.6012268066406, "logps/pi_response": -374.5638732910156, "logps/ref_response": -269.6314697265625, "logps/rejected": -284.3193359375, "loss": 0.6824, "rewards/accuracies": 0.5653846263885498, "rewards/chosen": -0.683001697063446, "rewards/margins": 0.06894499808549881, "rewards/rejected": -0.751946747303009, "step": 140 }, { "epoch": 0.97, "eta": 0.0010000000474974513, "grad_norm": 26.50699233707175, "learning_rate": 5.913435276374834e-10, "logits/chosen": -2.3918538093566895, "logits/rejected": -2.34391450881958, "logps/chosen": -279.7143859863281, "logps/pi_response": -387.8644104003906, "logps/ref_response": -273.84423828125, "logps/rejected": -304.1023254394531, "loss": 0.6713, "rewards/accuracies": 0.6153846383094788, "rewards/chosen": -0.6463515758514404, "rewards/margins": 0.1319323182106018, "rewards/rejected": -0.7782838940620422, "step": 150 }, { "epoch": 0.99, "step": 153, "total_flos": 0.0, "train_loss": 0.6840069897813734, "train_runtime": 41065.6381, "train_samples_per_second": 0.487, "train_steps_per_second": 0.004 } ], "logging_steps": 10, "max_steps": 153, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }