{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 40.489505054410465, "learning_rate": 1.5625e-08, "logits/chosen": -3.247814178466797, "logits/rejected": -3.1977505683898926, "logps/chosen": -851.484375, "logps/rejected": -1405.7332763671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 41.92175459391116, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -3.16192626953125, "logits/rejected": -3.160381317138672, "logps/chosen": -1020.866455078125, "logps/rejected": -1326.50732421875, "loss": 0.6919, "rewards/accuracies": 0.5069444179534912, "rewards/chosen": 0.0007796759600751102, "rewards/margins": 0.0031654490157961845, "rewards/rejected": -0.0023857729975134134, "step": 10 }, { "epoch": 0.06, "grad_norm": 32.15596425398863, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -3.18839430809021, "logits/rejected": -3.2000014781951904, "logps/chosen": -1116.934814453125, "logps/rejected": -1346.7293701171875, "loss": 0.6678, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.00040334780351258814, "rewards/margins": 0.04154806584119797, "rewards/rejected": -0.0411447174847126, "step": 20 }, { "epoch": 0.09, "grad_norm": 28.544604595173517, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -3.2953286170959473, "logits/rejected": -3.260005474090576, "logps/chosen": -985.6617431640625, "logps/rejected": -1335.411865234375, "loss": 0.5866, "rewards/accuracies": 0.8125, "rewards/chosen": -0.00017990582273341715, "rewards/margins": 0.25064677000045776, "rewards/rejected": -0.2508266866207123, "step": 30 }, { "epoch": 0.13, "grad_norm": 29.54151669834925, "learning_rate": 4.990217055187362e-07, "logits/chosen": -3.398481845855713, "logits/rejected": -3.3821473121643066, "logps/chosen": -1097.4017333984375, "logps/rejected": -1417.0760498046875, "loss": 0.5098, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.25089818239212036, "rewards/margins": 0.7165254950523376, "rewards/rejected": -0.9674237966537476, "step": 40 }, { "epoch": 0.16, "grad_norm": 31.42053941657465, "learning_rate": 4.950605027404507e-07, "logits/chosen": -3.4295551776885986, "logits/rejected": -3.376582622528076, "logps/chosen": -997.1471557617188, "logps/rejected": -1500.8634033203125, "loss": 0.4474, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.2491927146911621, "rewards/margins": 1.4835565090179443, "rewards/rejected": -1.732749342918396, "step": 50 }, { "epoch": 0.19, "grad_norm": 27.99757166459867, "learning_rate": 4.881036333395328e-07, "logits/chosen": -3.3594136238098145, "logits/rejected": -3.2863781452178955, "logps/chosen": -932.15380859375, "logps/rejected": -1515.422119140625, "loss": 0.3719, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.15043269097805023, "rewards/margins": 1.8649994134902954, "rewards/rejected": -2.0154318809509277, "step": 60 }, { "epoch": 0.22, "grad_norm": 32.17353809931757, "learning_rate": 4.782361394228472e-07, "logits/chosen": -3.284620761871338, "logits/rejected": -3.173419237136841, "logps/chosen": -1055.072509765625, "logps/rejected": -1732.321044921875, "loss": 0.3704, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.42683038115501404, "rewards/margins": 3.9190266132354736, "rewards/rejected": -4.3458571434021, "step": 70 }, { "epoch": 0.25, "grad_norm": 26.635087786755154, "learning_rate": 4.655786431300069e-07, "logits/chosen": -3.247112274169922, "logits/rejected": -3.1357181072235107, "logps/chosen": -1113.9583740234375, "logps/rejected": -1907.239013671875, "loss": 0.3278, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.7213847041130066, "rewards/margins": 5.185682773590088, "rewards/rejected": -5.90706729888916, "step": 80 }, { "epoch": 0.28, "grad_norm": 27.10930625424201, "learning_rate": 4.5028587212518697e-07, "logits/chosen": -3.1901683807373047, "logits/rejected": -3.089111804962158, "logps/chosen": -1073.7132568359375, "logps/rejected": -1941.2095947265625, "loss": 0.3083, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.0008630752563477, "rewards/margins": 4.947809219360352, "rewards/rejected": -5.948672294616699, "step": 90 }, { "epoch": 0.32, "grad_norm": 32.773699746936735, "learning_rate": 4.325447681764586e-07, "logits/chosen": -3.198819637298584, "logits/rejected": -3.0583558082580566, "logps/chosen": -1223.125, "logps/rejected": -2312.65771484375, "loss": 0.2735, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3628978729248047, "rewards/margins": 8.73902702331543, "rewards/rejected": -10.10192584991455, "step": 100 }, { "epoch": 0.32, "eval_logits/chosen": -2.5374886989593506, "eval_logits/rejected": -2.7456984519958496, "eval_logps/chosen": -296.0259704589844, "eval_logps/rejected": -1420.5509033203125, "eval_loss": 0.05290684849023819, "eval_rewards/accuracies": 0.9700000286102295, "eval_rewards/chosen": -1.359223484992981, "eval_rewards/margins": 6.826464653015137, "eval_rewards/rejected": -8.185688018798828, "eval_runtime": 86.3896, "eval_samples_per_second": 9.121, "eval_steps_per_second": 0.289, "step": 100 }, { "epoch": 0.35, "grad_norm": 33.108583700602985, "learning_rate": 4.1257220194373424e-07, "logits/chosen": -3.1395342350006104, "logits/rejected": -3.039159059524536, "logps/chosen": -1132.5010986328125, "logps/rejected": -2260.93505859375, "loss": 0.2697, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.6764516830444336, "rewards/margins": 7.895078182220459, "rewards/rejected": -9.571529388427734, "step": 110 }, { "epoch": 0.38, "grad_norm": 25.606473754809414, "learning_rate": 3.9061232191019517e-07, "logits/chosen": -3.089616060256958, "logits/rejected": -2.9387471675872803, "logps/chosen": -1187.3896484375, "logps/rejected": -2645.472900390625, "loss": 0.1938, "rewards/accuracies": 0.9375, "rewards/chosen": -2.218280792236328, "rewards/margins": 10.56298828125, "rewards/rejected": -12.781268119812012, "step": 120 }, { "epoch": 0.41, "grad_norm": 43.148378021441694, "learning_rate": 3.669335698643704e-07, "logits/chosen": -3.076484203338623, "logits/rejected": -2.9197285175323486, "logps/chosen": -1269.959716796875, "logps/rejected": -2458.047119140625, "loss": 0.2392, "rewards/accuracies": 0.875, "rewards/chosen": -2.427656650543213, "rewards/margins": 9.262014389038086, "rewards/rejected": -11.689671516418457, "step": 130 }, { "epoch": 0.44, "grad_norm": 47.774750613943326, "learning_rate": 3.418253994161892e-07, "logits/chosen": -3.0531423091888428, "logits/rejected": -2.9170706272125244, "logps/chosen": -1267.736328125, "logps/rejected": -2635.373291015625, "loss": 0.2115, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.909527063369751, "rewards/margins": 10.727846145629883, "rewards/rejected": -12.637372970581055, "step": 140 }, { "epoch": 0.47, "grad_norm": 33.40461914715703, "learning_rate": 3.1559473766049476e-07, "logits/chosen": -3.0771565437316895, "logits/rejected": -2.832980155944824, "logps/chosen": -1112.315185546875, "logps/rejected": -2428.1845703125, "loss": 0.1728, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.8454099893569946, "rewards/margins": 8.90544319152832, "rewards/rejected": -10.750852584838867, "step": 150 }, { "epoch": 0.51, "grad_norm": 33.41575003336433, "learning_rate": 2.8856223324132555e-07, "logits/chosen": -2.995957612991333, "logits/rejected": -2.7771031856536865, "logps/chosen": -1257.4671630859375, "logps/rejected": -2754.3642578125, "loss": 0.1709, "rewards/accuracies": 0.9375, "rewards/chosen": -2.696488857269287, "rewards/margins": 11.541966438293457, "rewards/rejected": -14.238454818725586, "step": 160 }, { "epoch": 0.54, "grad_norm": 45.83321300032113, "learning_rate": 2.610583366813447e-07, "logits/chosen": -2.977898359298706, "logits/rejected": -2.7190890312194824, "logps/chosen": -1289.2559814453125, "logps/rejected": -2733.30712890625, "loss": 0.1563, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.5896034240722656, "rewards/margins": 11.3201322555542, "rewards/rejected": -13.909736633300781, "step": 170 }, { "epoch": 0.57, "grad_norm": 51.91540831418988, "learning_rate": 2.3341926089122408e-07, "logits/chosen": -2.9528422355651855, "logits/rejected": -2.688262462615967, "logps/chosen": -1250.198486328125, "logps/rejected": -2825.1044921875, "loss": 0.1255, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.937950611114502, "rewards/margins": 11.848261833190918, "rewards/rejected": -14.786211967468262, "step": 180 }, { "epoch": 0.6, "grad_norm": 28.5647358282054, "learning_rate": 2.0598287123849092e-07, "logits/chosen": -2.933013439178467, "logits/rejected": -2.6865992546081543, "logps/chosen": -1182.699462890625, "logps/rejected": -2749.19189453125, "loss": 0.1565, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.322864055633545, "rewards/margins": 11.878046989440918, "rewards/rejected": -14.200910568237305, "step": 190 }, { "epoch": 0.63, "grad_norm": 39.43959748344799, "learning_rate": 1.7908455541642582e-07, "logits/chosen": -2.9264111518859863, "logits/rejected": -2.7152259349823, "logps/chosen": -1279.89599609375, "logps/rejected": -2508.899169921875, "loss": 0.1321, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.4204678535461426, "rewards/margins": 9.535171508789062, "rewards/rejected": -11.95563793182373, "step": 200 }, { "epoch": 0.63, "eval_logits/chosen": -2.334916591644287, "eval_logits/rejected": -2.2518062591552734, "eval_logps/chosen": -364.15570068359375, "eval_logps/rejected": -2287.0966796875, "eval_loss": 0.05069316178560257, "eval_rewards/accuracies": 0.9599999785423279, "eval_rewards/chosen": -2.0405211448669434, "eval_rewards/margins": 14.810622215270996, "eval_rewards/rejected": -16.851144790649414, "eval_runtime": 86.3525, "eval_samples_per_second": 9.125, "eval_steps_per_second": 0.29, "step": 200 }, { "epoch": 0.66, "grad_norm": 34.20192130304297, "learning_rate": 1.5305312360052443e-07, "logits/chosen": -2.9040026664733887, "logits/rejected": -2.588223457336426, "logps/chosen": -1291.372314453125, "logps/rejected": -3483.51806640625, "loss": 0.1085, "rewards/accuracies": 0.96875, "rewards/chosen": -2.889706611633301, "rewards/margins": 18.249797821044922, "rewards/rejected": -21.13950538635254, "step": 210 }, { "epoch": 0.7, "grad_norm": 45.248909836244636, "learning_rate": 1.2820678900980092e-07, "logits/chosen": -2.8523659706115723, "logits/rejected": -2.5748908519744873, "logps/chosen": -1335.823974609375, "logps/rejected": -2864.441162109375, "loss": 0.1242, "rewards/accuracies": 0.96875, "rewards/chosen": -3.108398199081421, "rewards/margins": 11.872503280639648, "rewards/rejected": -14.980901718139648, "step": 220 }, { "epoch": 0.73, "grad_norm": 45.218726366714044, "learning_rate": 1.0484927800731982e-07, "logits/chosen": -2.782780885696411, "logits/rejected": -2.5187795162200928, "logps/chosen": -1413.355712890625, "logps/rejected": -2988.111328125, "loss": 0.1138, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.4694035053253174, "rewards/margins": 13.392831802368164, "rewards/rejected": -16.862234115600586, "step": 230 }, { "epoch": 0.76, "grad_norm": 52.31812516539656, "learning_rate": 8.32661172908373e-08, "logits/chosen": -2.7922840118408203, "logits/rejected": -2.494626760482788, "logps/chosen": -1353.203369140625, "logps/rejected": -2993.57421875, "loss": 0.1011, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.4644787311553955, "rewards/margins": 12.617826461791992, "rewards/rejected": -16.082305908203125, "step": 240 }, { "epoch": 0.79, "grad_norm": 45.34393981077748, "learning_rate": 6.372114355964292e-08, "logits/chosen": -2.7275874614715576, "logits/rejected": -2.5371663570404053, "logps/chosen": -1380.716064453125, "logps/rejected": -2507.10888671875, "loss": 0.1239, "rewards/accuracies": 0.9375, "rewards/chosen": -3.474669933319092, "rewards/margins": 8.190107345581055, "rewards/rejected": -11.664777755737305, "step": 250 }, { "epoch": 0.82, "grad_norm": 35.090078905846326, "learning_rate": 4.645327832410648e-08, "logits/chosen": -2.822172164916992, "logits/rejected": -2.5299789905548096, "logps/chosen": -1333.1666259765625, "logps/rejected": -2887.5546875, "loss": 0.0846, "rewards/accuracies": 0.96875, "rewards/chosen": -3.1791491508483887, "rewards/margins": 12.44713020324707, "rewards/rejected": -15.6262788772583, "step": 260 }, { "epoch": 0.85, "grad_norm": 46.38264565730283, "learning_rate": 3.167360728327681e-08, "logits/chosen": -2.803342342376709, "logits/rejected": -2.5070111751556396, "logps/chosen": -1231.8974609375, "logps/rejected": -3075.5439453125, "loss": 0.0982, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.3780341148376465, "rewards/margins": 13.437856674194336, "rewards/rejected": -16.815893173217773, "step": 270 }, { "epoch": 0.89, "grad_norm": 28.779386715268174, "learning_rate": 1.956279997278043e-08, "logits/chosen": -2.7971150875091553, "logits/rejected": -2.4917969703674316, "logps/chosen": -1336.4207763671875, "logps/rejected": -2981.490478515625, "loss": 0.0993, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.5398058891296387, "rewards/margins": 13.135106086730957, "rewards/rejected": -16.674911499023438, "step": 280 }, { "epoch": 0.92, "grad_norm": 28.988427287230724, "learning_rate": 1.0268901225739979e-08, "logits/chosen": -2.8333706855773926, "logits/rejected": -2.480327844619751, "logps/chosen": -1320.4166259765625, "logps/rejected": -3045.81005859375, "loss": 0.103, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3708198070526123, "rewards/margins": 13.572955131530762, "rewards/rejected": -16.943775177001953, "step": 290 }, { "epoch": 0.95, "grad_norm": 57.304982547139346, "learning_rate": 3.905521444318604e-09, "logits/chosen": -2.7884676456451416, "logits/rejected": -2.496267795562744, "logps/chosen": -1435.158935546875, "logps/rejected": -2914.93310546875, "loss": 0.117, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5666072368621826, "rewards/margins": 11.863222122192383, "rewards/rejected": -15.429827690124512, "step": 300 }, { "epoch": 0.95, "eval_logits/chosen": -2.227323532104492, "eval_logits/rejected": -2.0210442543029785, "eval_logps/chosen": -408.6503601074219, "eval_logps/rejected": -2715.43310546875, "eval_loss": 0.053101107478141785, "eval_rewards/accuracies": 0.9700000286102295, "eval_rewards/chosen": -2.4854671955108643, "eval_rewards/margins": 18.64904022216797, "eval_rewards/rejected": -21.134506225585938, "eval_runtime": 86.3324, "eval_samples_per_second": 9.128, "eval_steps_per_second": 0.29, "step": 300 }, { "epoch": 0.98, "grad_norm": 17.748959579724655, "learning_rate": 5.504478043572291e-10, "logits/chosen": -2.781341791152954, "logits/rejected": -2.5292139053344727, "logps/chosen": -1375.798095703125, "logps/rejected": -2808.2548828125, "loss": 0.1158, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.3524460792541504, "rewards/margins": 11.515007972717285, "rewards/rejected": -14.867452621459961, "step": 310 }, { "epoch": 1.0, "step": 316, "total_flos": 0.0, "train_loss": 0.24172405860846555, "train_runtime": 5168.5497, "train_samples_per_second": 3.909, "train_steps_per_second": 0.061 } ], "logging_steps": 10, "max_steps": 316, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }