{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 20, "global_step": 206, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 8.9375, "learning_rate": 2.3809523809523808e-06, "logits/chosen": -2.7700600624084473, "logits/rejected": -2.8606302738189697, "logps/chosen": -421.64996337890625, "logps/rejected": -531.4378662109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.1, "grad_norm": 7.375, "learning_rate": 2.380952380952381e-05, "logits/chosen": -2.7684054374694824, "logits/rejected": -2.7337145805358887, "logps/chosen": -333.7870178222656, "logps/rejected": -312.4859313964844, "loss": 0.6852, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.030088074505329132, "rewards/margins": 0.01666567102074623, "rewards/rejected": 0.013422403484582901, "step": 10 }, { "epoch": 0.19, "grad_norm": 6.125, "learning_rate": 4.761904761904762e-05, "logits/chosen": -2.8010494709014893, "logits/rejected": -2.79127836227417, "logps/chosen": -331.8260498046875, "logps/rejected": -332.01409912109375, "loss": 0.6028, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.41256317496299744, "rewards/margins": 0.21184520423412323, "rewards/rejected": 0.2007180005311966, "step": 20 }, { "epoch": 0.19, "eval_logits/chosen": -2.6401147842407227, "eval_logits/rejected": -2.614283800125122, "eval_logps/chosen": -324.9909973144531, "eval_logps/rejected": -327.9555969238281, "eval_loss": 0.5285959243774414, "eval_rewards/accuracies": 0.8125, "eval_rewards/chosen": 0.878866970539093, "eval_rewards/margins": 0.43177998065948486, "eval_rewards/rejected": 0.44708704948425293, "eval_runtime": 114.9886, "eval_samples_per_second": 1.6, "eval_steps_per_second": 0.104, "step": 20 }, { "epoch": 0.29, "grad_norm": 4.84375, "learning_rate": 4.9708589101037306e-05, "logits/chosen": -2.6763195991516113, "logits/rejected": -2.651015043258667, "logps/chosen": -356.8539733886719, "logps/rejected": -363.6021423339844, "loss": 0.4643, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.943565845489502, "rewards/margins": 0.708370566368103, "rewards/rejected": 0.23519524931907654, "step": 30 }, { "epoch": 0.39, "grad_norm": 5.75, "learning_rate": 4.870996167038154e-05, "logits/chosen": -2.655568838119507, "logits/rejected": -2.6175591945648193, "logps/chosen": -353.34619140625, "logps/rejected": -359.96832275390625, "loss": 0.3363, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6268302798271179, "rewards/margins": 1.4118897914886475, "rewards/rejected": -0.7850595712661743, "step": 40 }, { "epoch": 0.39, "eval_logits/chosen": -2.535161256790161, "eval_logits/rejected": -2.5076351165771484, "eval_logps/chosen": -328.5650939941406, "eval_logps/rejected": -343.5235900878906, "eval_loss": 0.32319265604019165, "eval_rewards/accuracies": 0.859375, "eval_rewards/chosen": 0.5214586853981018, "eval_rewards/margins": 1.6311697959899902, "eval_rewards/rejected": -1.1097110509872437, "eval_runtime": 114.9563, "eval_samples_per_second": 1.601, "eval_steps_per_second": 0.104, "step": 40 }, { "epoch": 0.49, "grad_norm": 4.4375, "learning_rate": 4.7029241811087457e-05, "logits/chosen": -2.682722806930542, "logits/rejected": -2.627808094024658, "logps/chosen": -382.26690673828125, "logps/rejected": -376.25689697265625, "loss": 0.3043, "rewards/accuracies": 0.875, "rewards/chosen": 0.3637928366661072, "rewards/margins": 1.71381413936615, "rewards/rejected": -1.3500211238861084, "step": 50 }, { "epoch": 0.58, "grad_norm": 5.875, "learning_rate": 4.471478077342798e-05, "logits/chosen": -2.6791253089904785, "logits/rejected": -2.641322374343872, "logps/chosen": -344.8480529785156, "logps/rejected": -372.0831298828125, "loss": 0.2458, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.6151469349861145, "rewards/margins": 2.2607688903808594, "rewards/rejected": -1.6456218957901, "step": 60 }, { "epoch": 0.58, "eval_logits/chosen": -2.592442274093628, "eval_logits/rejected": -2.560177803039551, "eval_logps/chosen": -328.04132080078125, "eval_logps/rejected": -351.1114196777344, "eval_loss": 0.2501268982887268, "eval_rewards/accuracies": 0.9114583134651184, "eval_rewards/chosen": 0.5738345980644226, "eval_rewards/margins": 2.4423279762268066, "eval_rewards/rejected": -1.8684934377670288, "eval_runtime": 115.0094, "eval_samples_per_second": 1.6, "eval_steps_per_second": 0.104, "step": 60 }, { "epoch": 0.68, "grad_norm": 4.3125, "learning_rate": 4.1833161387527986e-05, "logits/chosen": -2.6796765327453613, "logits/rejected": -2.6534857749938965, "logps/chosen": -371.1224670410156, "logps/rejected": -358.3480529785156, "loss": 0.2487, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.5859086513519287, "rewards/margins": 2.399681329727173, "rewards/rejected": -3.9855899810791016, "step": 70 }, { "epoch": 0.78, "grad_norm": 4.4375, "learning_rate": 3.84672825965686e-05, "logits/chosen": -2.568530559539795, "logits/rejected": -2.5246570110321045, "logps/chosen": -354.64984130859375, "logps/rejected": -360.8916931152344, "loss": 0.2116, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.9132896661758423, "rewards/margins": 2.766739845275879, "rewards/rejected": -3.680030107498169, "step": 80 }, { "epoch": 0.78, "eval_logits/chosen": -2.3427236080169678, "eval_logits/rejected": -2.3129446506500244, "eval_logps/chosen": -340.5351257324219, "eval_logps/rejected": -370.7005920410156, "eval_loss": 0.19913233816623688, "eval_rewards/accuracies": 0.9166666865348816, "eval_rewards/chosen": -0.6755423545837402, "eval_rewards/margins": 3.1518704891204834, "eval_rewards/rejected": -3.8274126052856445, "eval_runtime": 114.9725, "eval_samples_per_second": 1.6, "eval_steps_per_second": 0.104, "step": 80 }, { "epoch": 0.87, "grad_norm": 2.0, "learning_rate": 3.471397460512563e-05, "logits/chosen": -2.428431272506714, "logits/rejected": -2.410618782043457, "logps/chosen": -361.0326232910156, "logps/rejected": -401.57269287109375, "loss": 0.1841, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7255961298942566, "rewards/margins": 3.4665279388427734, "rewards/rejected": -4.192124366760254, "step": 90 }, { "epoch": 0.97, "grad_norm": 5.4375, "learning_rate": 3.0681213250482255e-05, "logits/chosen": -2.3709776401519775, "logits/rejected": -2.353501796722412, "logps/chosen": -331.63623046875, "logps/rejected": -377.9563903808594, "loss": 0.1386, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6168140172958374, "rewards/margins": 3.699888229370117, "rewards/rejected": -4.316702365875244, "step": 100 }, { "epoch": 0.97, "eval_logits/chosen": -2.3535118103027344, "eval_logits/rejected": -2.3131775856018066, "eval_logps/chosen": -330.8599548339844, "eval_logps/rejected": -362.6181335449219, "eval_loss": 0.20019526779651642, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": 0.29197368025779724, "eval_rewards/margins": 3.311133623123169, "eval_rewards/rejected": -3.019160032272339, "eval_runtime": 114.9013, "eval_samples_per_second": 1.601, "eval_steps_per_second": 0.104, "step": 100 }, { "epoch": 1.07, "grad_norm": 0.76953125, "learning_rate": 2.648501373438142e-05, "logits/chosen": -2.4437708854675293, "logits/rejected": -2.4320626258850098, "logps/chosen": -344.7204895019531, "logps/rejected": -410.47601318359375, "loss": 0.0711, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.051882706582546234, "rewards/margins": 5.166382789611816, "rewards/rejected": -5.218265533447266, "step": 110 }, { "epoch": 1.17, "grad_norm": 1.4375, "learning_rate": 2.2246093076900144e-05, "logits/chosen": -2.430386781692505, "logits/rejected": -2.34106183052063, "logps/chosen": -400.32452392578125, "logps/rejected": -425.37457275390625, "loss": 0.0458, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0012158155441284, "rewards/margins": 6.266473293304443, "rewards/rejected": -7.2676897048950195, "step": 120 }, { "epoch": 1.17, "eval_logits/chosen": -2.2717294692993164, "eval_logits/rejected": -2.2290468215942383, "eval_logps/chosen": -347.5820007324219, "eval_logps/rejected": -391.1982727050781, "eval_loss": 0.17477566003799438, "eval_rewards/accuracies": 0.9479166865348816, "eval_rewards/chosen": -1.3802350759506226, "eval_rewards/margins": 4.496945858001709, "eval_rewards/rejected": -5.877180576324463, "eval_runtime": 114.9627, "eval_samples_per_second": 1.601, "eval_steps_per_second": 0.104, "step": 120 }, { "epoch": 1.26, "grad_norm": 1.0546875, "learning_rate": 1.8086397307570723e-05, "logits/chosen": -2.376091957092285, "logits/rejected": -2.3415114879608154, "logps/chosen": -337.0244140625, "logps/rejected": -408.39263916015625, "loss": 0.0283, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.006544408388435841, "rewards/margins": 6.594322204589844, "rewards/rejected": -6.600866794586182, "step": 130 }, { "epoch": 1.36, "grad_norm": 1.3515625, "learning_rate": 1.4125593300137766e-05, "logits/chosen": -2.364224672317505, "logits/rejected": -2.310997724533081, "logps/chosen": -343.6619567871094, "logps/rejected": -398.0953674316406, "loss": 0.0426, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.7415364384651184, "rewards/margins": 6.200386047363281, "rewards/rejected": -5.4588494300842285, "step": 140 }, { "epoch": 1.36, "eval_logits/chosen": -2.240306854248047, "eval_logits/rejected": -2.195923328399658, "eval_logps/chosen": -334.4142761230469, "eval_logps/rejected": -375.5160217285156, "eval_loss": 0.17553412914276123, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": -0.06346017122268677, "eval_rewards/margins": 4.2454915046691895, "eval_rewards/rejected": -4.3089518547058105, "eval_runtime": 115.021, "eval_samples_per_second": 1.6, "eval_steps_per_second": 0.104, "step": 140 }, { "epoch": 1.46, "grad_norm": 0.8515625, "learning_rate": 1.0477626174477404e-05, "logits/chosen": -2.3424103260040283, "logits/rejected": -2.2978971004486084, "logps/chosen": -347.8079528808594, "logps/rejected": -375.98260498046875, "loss": 0.0508, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.3227364122867584, "rewards/margins": 5.873551845550537, "rewards/rejected": -5.550815582275391, "step": 150 }, { "epoch": 1.55, "grad_norm": 1.96875, "learning_rate": 7.247441302957858e-06, "logits/chosen": -2.3495125770568848, "logits/rejected": -2.307555675506592, "logps/chosen": -331.29718017578125, "logps/rejected": -415.452392578125, "loss": 0.029, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.16269809007644653, "rewards/margins": 6.475255012512207, "rewards/rejected": -6.312556266784668, "step": 160 }, { "epoch": 1.55, "eval_logits/chosen": -2.232851266860962, "eval_logits/rejected": -2.1893069744110107, "eval_logps/chosen": -341.7696533203125, "eval_logps/rejected": -387.3076171875, "eval_loss": 0.16915130615234375, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": -0.7989979386329651, "eval_rewards/margins": 4.689115524291992, "eval_rewards/rejected": -5.4881134033203125, "eval_runtime": 114.9918, "eval_samples_per_second": 1.6, "eval_steps_per_second": 0.104, "step": 160 }, { "epoch": 1.65, "grad_norm": 0.345703125, "learning_rate": 4.527965223149957e-06, "logits/chosen": -2.407200336456299, "logits/rejected": -2.3430123329162598, "logps/chosen": -387.9550476074219, "logps/rejected": -445.9234313964844, "loss": 0.0175, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.19500017166137695, "rewards/margins": 7.179248809814453, "rewards/rejected": -6.984248161315918, "step": 170 }, { "epoch": 1.75, "grad_norm": 3.015625, "learning_rate": 2.397432310532133e-06, "logits/chosen": -2.3570303916931152, "logits/rejected": -2.300320863723755, "logps/chosen": -367.35577392578125, "logps/rejected": -424.9029235839844, "loss": 0.0676, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.1322220414876938, "rewards/margins": 6.755249977111816, "rewards/rejected": -6.623027801513672, "step": 180 }, { "epoch": 1.75, "eval_logits/chosen": -2.2314395904541016, "eval_logits/rejected": -2.1864326000213623, "eval_logps/chosen": -340.7237854003906, "eval_logps/rejected": -386.9397277832031, "eval_loss": 0.16764594614505768, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": -0.6944115161895752, "eval_rewards/margins": 4.756911754608154, "eval_rewards/rejected": -5.45132303237915, "eval_runtime": 114.865, "eval_samples_per_second": 1.602, "eval_steps_per_second": 0.104, "step": 180 }, { "epoch": 1.84, "grad_norm": 0.56640625, "learning_rate": 9.171341179489034e-07, "logits/chosen": -2.3660504817962646, "logits/rejected": -2.2959539890289307, "logps/chosen": -335.60052490234375, "logps/rejected": -383.60040283203125, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": 0.19160650670528412, "rewards/margins": 6.720318794250488, "rewards/rejected": -6.528712272644043, "step": 190 }, { "epoch": 1.94, "grad_norm": 3.9375, "learning_rate": 1.296561292287446e-07, "logits/chosen": -2.3115243911743164, "logits/rejected": -2.281430959701538, "logps/chosen": -323.0104675292969, "logps/rejected": -385.94757080078125, "loss": 0.0517, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.006203270051628351, "rewards/margins": 6.517538547515869, "rewards/rejected": -6.523741722106934, "step": 200 }, { "epoch": 1.94, "eval_logits/chosen": -2.231421709060669, "eval_logits/rejected": -2.186391830444336, "eval_logps/chosen": -341.20733642578125, "eval_logps/rejected": -387.5655517578125, "eval_loss": 0.16659200191497803, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": -0.7427660822868347, "eval_rewards/margins": 4.771137237548828, "eval_rewards/rejected": -5.513904094696045, "eval_runtime": 114.34, "eval_samples_per_second": 1.609, "eval_steps_per_second": 0.105, "step": 200 }, { "epoch": 2.0, "step": 206, "total_flos": 0.0, "train_loss": 0.1882365908726905, "train_runtime": 5068.0756, "train_samples_per_second": 0.65, "train_steps_per_second": 0.041 } ], "logging_steps": 10, "max_steps": 206, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }