{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 915, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003278688524590164, "grad_norm": 1944.5952519636162, "learning_rate": 5.4347826086956515e-09, "logits/chosen": 280.0, "logits/rejected": 284.0, "logps/chosen": -580.0, "logps/rejected": -964.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03278688524590164, "grad_norm": 353.48089479021377, "learning_rate": 5.434782608695652e-08, "logits/chosen": 282.0, "logits/rejected": 284.0, "logps/chosen": -712.0, "logps/rejected": -852.0, "loss": 0.514, "rewards/accuracies": 0.6805555820465088, "rewards/chosen": 0.369140625, "rewards/margins": 0.953125, "rewards/rejected": -0.58203125, "step": 10 }, { "epoch": 0.06557377049180328, "grad_norm": 0.05922237347268083, "learning_rate": 1.0869565217391303e-07, "logits/chosen": 280.0, "logits/rejected": 284.0, "logps/chosen": -776.0, "logps/rejected": -936.0, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 4.71875, "rewards/margins": 14.6875, "rewards/rejected": -10.0, "step": 20 }, { "epoch": 0.09836065573770492, "grad_norm": 1.0595969361055456e-05, "learning_rate": 1.6304347826086955e-07, "logits/chosen": 278.0, "logits/rejected": 282.0, "logps/chosen": -680.0, "logps/rejected": -976.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0, "rewards/margins": 26.125, "rewards/rejected": -17.125, "step": 30 }, { "epoch": 0.13114754098360656, "grad_norm": 0.00013924554592107253, "learning_rate": 2.1739130434782607e-07, "logits/chosen": 276.0, "logits/rejected": 280.0, "logps/chosen": -728.0, "logps/rejected": -1008.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.9375, "rewards/margins": 30.0, "rewards/rejected": -19.0, "step": 40 }, { "epoch": 0.16393442622950818, "grad_norm": 6.145153542200693e-05, "learning_rate": 2.717391304347826e-07, "logits/chosen": 276.0, "logits/rejected": 280.0, "logps/chosen": -764.0, "logps/rejected": -1072.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.125, "rewards/margins": 32.0, "rewards/rejected": -21.0, "step": 50 }, { "epoch": 0.19672131147540983, "grad_norm": 3.57838878593481e-08, "learning_rate": 3.260869565217391e-07, "logits/chosen": 274.0, "logits/rejected": 278.0, "logps/chosen": -684.0, "logps/rejected": -1120.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.3125, "rewards/margins": 33.25, "rewards/rejected": -24.0, "step": 60 }, { "epoch": 0.22950819672131148, "grad_norm": 1.3784860235392943e-06, "learning_rate": 3.8043478260869567e-07, "logits/chosen": 274.0, "logits/rejected": 278.0, "logps/chosen": -672.0, "logps/rejected": -1120.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5625, "rewards/margins": 33.25, "rewards/rejected": -23.625, "step": 70 }, { "epoch": 0.26229508196721313, "grad_norm": 0.00011995391278542603, "learning_rate": 4.3478260869565214e-07, "logits/chosen": 272.0, "logits/rejected": 276.0, "logps/chosen": -680.0, "logps/rejected": -1104.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.375, "rewards/margins": 35.0, "rewards/rejected": -24.625, "step": 80 }, { "epoch": 0.29508196721311475, "grad_norm": 8.89521797658848e-09, "learning_rate": 4.891304347826087e-07, "logits/chosen": 272.0, "logits/rejected": 276.0, "logps/chosen": -588.0, "logps/rejected": -1144.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 36.25, "rewards/rejected": -27.25, "step": 90 }, { "epoch": 0.32786885245901637, "grad_norm": 5.492227123210883e-05, "learning_rate": 4.951397326852977e-07, "logits/chosen": 270.0, "logits/rejected": 274.0, "logps/chosen": -508.0, "logps/rejected": -1120.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.1875, "rewards/margins": 35.25, "rewards/rejected": -27.25, "step": 100 }, { "epoch": 0.36065573770491804, "grad_norm": 2.8148164313308533e-06, "learning_rate": 4.890643985419197e-07, "logits/chosen": 272.0, "logits/rejected": 274.0, "logps/chosen": -720.0, "logps/rejected": -1136.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.3125, "rewards/margins": 39.75, "rewards/rejected": -28.375, "step": 110 }, { "epoch": 0.39344262295081966, "grad_norm": 0.00015404284602656474, "learning_rate": 4.829890643985419e-07, "logits/chosen": 270.0, "logits/rejected": 274.0, "logps/chosen": -728.0, "logps/rejected": -1168.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.75, "rewards/margins": 39.0, "rewards/rejected": -28.25, "step": 120 }, { "epoch": 0.4262295081967213, "grad_norm": 3.8202535759803833e-07, "learning_rate": 4.76913730255164e-07, "logits/chosen": 270.0, "logits/rejected": 274.0, "logps/chosen": -688.0, "logps/rejected": -1120.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.75, "rewards/margins": 38.25, "rewards/rejected": -27.625, "step": 130 }, { "epoch": 0.45901639344262296, "grad_norm": 1.4759444353931545e-05, "learning_rate": 4.708383961117861e-07, "logits/chosen": 268.0, "logits/rejected": 274.0, "logps/chosen": -532.0, "logps/rejected": -1080.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.1875, "rewards/margins": 35.25, "rewards/rejected": -27.0, "step": 140 }, { "epoch": 0.4918032786885246, "grad_norm": 1.7198688973706707e-06, "learning_rate": 4.6476306196840824e-07, "logits/chosen": 270.0, "logits/rejected": 274.0, "logps/chosen": -664.0, "logps/rejected": -1088.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.6875, "rewards/margins": 37.25, "rewards/rejected": -25.5, "step": 150 }, { "epoch": 0.5245901639344263, "grad_norm": 0.00012554430568885295, "learning_rate": 4.5868772782503037e-07, "logits/chosen": 270.0, "logits/rejected": 274.0, "logps/chosen": -616.0, "logps/rejected": -1040.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.125, "rewards/margins": 33.0, "rewards/rejected": -21.75, "step": 160 }, { "epoch": 0.5573770491803278, "grad_norm": 1.192603700066448e-07, "learning_rate": 4.526123936816525e-07, "logits/chosen": 270.0, "logits/rejected": 274.0, "logps/chosen": -608.0, "logps/rejected": -1112.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 12.5625, "rewards/margins": 37.0, "rewards/rejected": -24.375, "step": 170 }, { "epoch": 0.5901639344262295, "grad_norm": 2.5070601445648987e-07, "learning_rate": 4.4653705953827456e-07, "logits/chosen": 272.0, "logits/rejected": 274.0, "logps/chosen": -808.0, "logps/rejected": -1072.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.4375, "rewards/margins": 38.75, "rewards/rejected": -23.25, "step": 180 }, { "epoch": 0.6229508196721312, "grad_norm": 9.00648819542586e-09, "learning_rate": 4.404617253948967e-07, "logits/chosen": 270.0, "logits/rejected": 274.0, "logps/chosen": -612.0, "logps/rejected": -1096.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 12.5625, "rewards/margins": 36.75, "rewards/rejected": -24.25, "step": 190 }, { "epoch": 0.6557377049180327, "grad_norm": 1.9715953568317824e-05, "learning_rate": 4.343863912515188e-07, "logits/chosen": 270.0, "logits/rejected": 274.0, "logps/chosen": -660.0, "logps/rejected": -1104.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 12.25, "rewards/margins": 36.5, "rewards/rejected": -24.25, "step": 200 }, { "epoch": 0.6885245901639344, "grad_norm": 2.0756472839256032e-08, "learning_rate": 4.2831105710814093e-07, "logits/chosen": 270.0, "logits/rejected": 274.0, "logps/chosen": -672.0, "logps/rejected": -1048.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.5, "rewards/margins": 37.0, "rewards/rejected": -23.5, "step": 210 }, { "epoch": 0.7213114754098361, "grad_norm": 1.0170794910731339e-06, "learning_rate": 4.2223572296476306e-07, "logits/chosen": 268.0, "logits/rejected": 274.0, "logps/chosen": -692.0, "logps/rejected": -1376.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 6.34375, "rewards/margins": 55.25, "rewards/rejected": -48.75, "step": 220 }, { "epoch": 0.7540983606557377, "grad_norm": 0.0011738426295712775, "learning_rate": 4.161603888213852e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -556.0, "logps/rejected": -1128.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 41.75, "rewards/rejected": -32.75, "step": 230 }, { "epoch": 0.7868852459016393, "grad_norm": 0.001209330911785121, "learning_rate": 4.1008505467800725e-07, "logits/chosen": 280.0, "logits/rejected": 284.0, "logps/chosen": -712.0, "logps/rejected": -1120.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.8125, "rewards/margins": 43.0, "rewards/rejected": -28.25, "step": 240 }, { "epoch": 0.819672131147541, "grad_norm": 1.4220638140820001e-05, "learning_rate": 4.040097205346294e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -612.0, "logps/rejected": -1184.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.5625, "rewards/margins": 45.25, "rewards/rejected": -30.625, "step": 250 }, { "epoch": 0.8524590163934426, "grad_norm": 6.847471825850411e-06, "learning_rate": 3.979343863912515e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -716.0, "logps/rejected": -1200.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.375, "rewards/margins": 49.25, "rewards/rejected": -31.75, "step": 260 }, { "epoch": 0.8852459016393442, "grad_norm": 8.453916078398106e-06, "learning_rate": 3.918590522478736e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -572.0, "logps/rejected": -1144.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.625, "rewards/margins": 43.25, "rewards/rejected": -28.625, "step": 270 }, { "epoch": 0.9180327868852459, "grad_norm": 0.0011565007130793768, "learning_rate": 3.8578371810449575e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -468.0, "logps/rejected": -1192.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.9375, "rewards/margins": 42.25, "rewards/rejected": -31.25, "step": 280 }, { "epoch": 0.9508196721311475, "grad_norm": 1.8563332734782312e-09, "learning_rate": 3.797083839611178e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -772.0, "logps/rejected": -1120.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.625, "rewards/margins": 48.0, "rewards/rejected": -29.5, "step": 290 }, { "epoch": 0.9836065573770492, "grad_norm": 3.511321873939038e-05, "learning_rate": 3.7363304981773994e-07, "logits/chosen": 280.0, "logits/rejected": 284.0, "logps/chosen": -660.0, "logps/rejected": -1128.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.5, "rewards/margins": 46.0, "rewards/rejected": -29.5, "step": 300 }, { "epoch": 1.0, "eval_logits/chosen": 278.0, "eval_logits/rejected": 284.0, "eval_logps/chosen": -672.0, "eval_logps/rejected": -1184.0, "eval_loss": 8.327839964294981e-07, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 17.75, "eval_rewards/margins": 49.0, "eval_rewards/rejected": -31.25, "eval_runtime": 14.8375, "eval_samples_per_second": 13.21, "eval_steps_per_second": 0.472, "step": 305 }, { "epoch": 1.0163934426229508, "grad_norm": 9.227952810041053e-05, "learning_rate": 3.6755771567436206e-07, "logits/chosen": 280.0, "logits/rejected": 284.0, "logps/chosen": -696.0, "logps/rejected": -1104.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.25, "rewards/margins": 45.25, "rewards/rejected": -30.125, "step": 310 }, { "epoch": 1.0491803278688525, "grad_norm": 9.321655255254792e-07, "learning_rate": 3.614823815309842e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -616.0, "logps/rejected": -1152.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.625, "rewards/margins": 46.0, "rewards/rejected": -31.375, "step": 320 }, { "epoch": 1.0819672131147542, "grad_norm": 4.0394238203048e-08, "learning_rate": 3.554070473876063e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -628.0, "logps/rejected": -1144.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.4375, "rewards/margins": 44.5, "rewards/rejected": -30.25, "step": 330 }, { "epoch": 1.1147540983606556, "grad_norm": 0.0002089902658501527, "learning_rate": 3.4933171324422844e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -604.0, "logps/rejected": -1192.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.25, "rewards/margins": 47.5, "rewards/rejected": -32.25, "step": 340 }, { "epoch": 1.1475409836065573, "grad_norm": 1.0875677726982335e-08, "learning_rate": 3.4325637910085056e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -712.0, "logps/rejected": -1184.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.25, "rewards/margins": 49.5, "rewards/rejected": -32.25, "step": 350 }, { "epoch": 1.180327868852459, "grad_norm": 5.396258960818274e-07, "learning_rate": 3.3718104495747263e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -560.0, "logps/rejected": -1184.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.3125, "rewards/margins": 47.75, "rewards/rejected": -32.5, "step": 360 }, { "epoch": 1.2131147540983607, "grad_norm": 1.6489526589616384e-08, "learning_rate": 3.3110571081409475e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -596.0, "logps/rejected": -1192.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.6875, "rewards/margins": 47.5, "rewards/rejected": -33.0, "step": 370 }, { "epoch": 1.2459016393442623, "grad_norm": 0.00047632107879453094, "learning_rate": 3.250303766707169e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -500.0, "logps/rejected": -1208.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 12.875, "rewards/margins": 45.5, "rewards/rejected": -32.5, "step": 380 }, { "epoch": 1.278688524590164, "grad_norm": 1.915454389486218e-09, "learning_rate": 3.18955042527339e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -644.0, "logps/rejected": -1160.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.625, "rewards/margins": 48.75, "rewards/rejected": -32.0, "step": 390 }, { "epoch": 1.3114754098360657, "grad_norm": 2.759821564745437e-08, "learning_rate": 3.128797083839611e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -528.0, "logps/rejected": -1256.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 12.8125, "rewards/margins": 48.25, "rewards/rejected": -35.5, "step": 400 }, { "epoch": 1.3442622950819672, "grad_norm": 0.0006591066467453379, "learning_rate": 3.068043742405832e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -612.0, "logps/rejected": -1184.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.6875, "rewards/margins": 48.25, "rewards/rejected": -32.5, "step": 410 }, { "epoch": 1.3770491803278688, "grad_norm": 2.1138523529030532e-10, "learning_rate": 3.007290400972053e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -596.0, "logps/rejected": -1176.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.1875, "rewards/margins": 48.5, "rewards/rejected": -33.25, "step": 420 }, { "epoch": 1.4098360655737705, "grad_norm": 2.0709906762220468e-09, "learning_rate": 2.9465370595382744e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -736.0, "logps/rejected": -1160.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.75, "rewards/margins": 50.25, "rewards/rejected": -32.5, "step": 430 }, { "epoch": 1.4426229508196722, "grad_norm": 4.1493170278759146e-07, "learning_rate": 2.8857837181044957e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -584.0, "logps/rejected": -1168.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.375, "rewards/margins": 47.5, "rewards/rejected": -33.0, "step": 440 }, { "epoch": 1.4754098360655736, "grad_norm": 2.1188762497389552e-10, "learning_rate": 2.825030376670717e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -596.0, "logps/rejected": -1232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.1875, "rewards/margins": 49.5, "rewards/rejected": -35.25, "step": 450 }, { "epoch": 1.5081967213114753, "grad_norm": 1.5190378422688663e-06, "learning_rate": 2.7642770352369376e-07, "logits/chosen": 276.0, "logits/rejected": 284.0, "logps/chosen": -510.0, "logps/rejected": -1264.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.625, "rewards/margins": 49.5, "rewards/rejected": -35.75, "step": 460 }, { "epoch": 1.540983606557377, "grad_norm": 8.189302463496647e-06, "learning_rate": 2.7035236938031594e-07, "logits/chosen": 276.0, "logits/rejected": 284.0, "logps/chosen": -608.0, "logps/rejected": -1232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.125, "rewards/margins": 51.0, "rewards/rejected": -34.75, "step": 470 }, { "epoch": 1.5737704918032787, "grad_norm": 2.344382277701634e-05, "learning_rate": 2.64277035236938e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -712.0, "logps/rejected": -1216.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.875, "rewards/margins": 52.0, "rewards/rejected": -35.0, "step": 480 }, { "epoch": 1.6065573770491803, "grad_norm": 8.2745995590636e-05, "learning_rate": 2.5820170109356013e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -652.0, "logps/rejected": -1176.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.625, "rewards/margins": 50.25, "rewards/rejected": -33.5, "step": 490 }, { "epoch": 1.639344262295082, "grad_norm": 3.555854867834636e-08, "learning_rate": 2.5212636695018226e-07, "logits/chosen": 278.0, "logits/rejected": 284.0, "logps/chosen": -560.0, "logps/rejected": -1216.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.3125, "rewards/margins": 50.5, "rewards/rejected": -35.25, "step": 500 }, { "epoch": 1.6721311475409837, "grad_norm": 4.780185368460246e-15, "learning_rate": 2.460510328068044e-07, "logits/chosen": 278.0, "logits/rejected": 282.0, "logps/chosen": -620.0, "logps/rejected": -1192.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.8125, "rewards/margins": 49.75, "rewards/rejected": -35.0, "step": 510 }, { "epoch": 1.7049180327868854, "grad_norm": 9.70803846604915e-07, "learning_rate": 2.399756986634265e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -616.0, "logps/rejected": -1208.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.3125, "rewards/margins": 50.0, "rewards/rejected": -34.75, "step": 520 }, { "epoch": 1.737704918032787, "grad_norm": 3.2732243136120943e-10, "learning_rate": 2.339003645200486e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -616.0, "logps/rejected": -1216.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.4375, "rewards/margins": 51.5, "rewards/rejected": -36.0, "step": 530 }, { "epoch": 1.7704918032786885, "grad_norm": 2.4254222160922874e-08, "learning_rate": 2.278250303766707e-07, "logits/chosen": 278.0, "logits/rejected": 282.0, "logps/chosen": -660.0, "logps/rejected": -1272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.0, "rewards/margins": 51.0, "rewards/rejected": -37.0, "step": 540 }, { "epoch": 1.8032786885245902, "grad_norm": 2.798631872280391e-05, "learning_rate": 2.2174969623329282e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -660.0, "logps/rejected": -1136.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.375, "rewards/margins": 50.25, "rewards/rejected": -34.0, "step": 550 }, { "epoch": 1.8360655737704918, "grad_norm": 2.5154816697336087e-08, "learning_rate": 2.1567436208991492e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -636.0, "logps/rejected": -1104.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.0, "rewards/margins": 48.5, "rewards/rejected": -32.5, "step": 560 }, { "epoch": 1.8688524590163933, "grad_norm": 0.00022540890793216486, "learning_rate": 2.0959902794653705e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -624.0, "logps/rejected": -1248.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.8125, "rewards/margins": 53.0, "rewards/rejected": -37.25, "step": 570 }, { "epoch": 1.901639344262295, "grad_norm": 2.9160826422966057e-08, "learning_rate": 2.0352369380315917e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -536.0, "logps/rejected": -1240.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.375, "rewards/margins": 50.0, "rewards/rejected": -36.5, "step": 580 }, { "epoch": 1.9344262295081966, "grad_norm": 0.003381969572186117, "learning_rate": 1.9744835965978127e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -592.0, "logps/rejected": -1224.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.0, "rewards/margins": 50.5, "rewards/rejected": -36.25, "step": 590 }, { "epoch": 1.9672131147540983, "grad_norm": 2.1628911768614984e-06, "learning_rate": 1.913730255164034e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -588.0, "logps/rejected": -1232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.75, "rewards/margins": 52.0, "rewards/rejected": -37.25, "step": 600 }, { "epoch": 2.0, "grad_norm": 1.4812098804468674e-11, "learning_rate": 1.8529769137302551e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -632.0, "logps/rejected": -1224.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.625, "rewards/margins": 52.25, "rewards/rejected": -36.5, "step": 610 }, { "epoch": 2.0, "eval_logits/chosen": 276.0, "eval_logits/rejected": 282.0, "eval_logps/chosen": -676.0, "eval_logps/rejected": -1248.0, "eval_loss": 1.7821403730522434e-07, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 17.625, "eval_rewards/margins": 55.25, "eval_rewards/rejected": -37.5, "eval_runtime": 15.1712, "eval_samples_per_second": 12.919, "eval_steps_per_second": 0.461, "step": 610 }, { "epoch": 2.0327868852459017, "grad_norm": 1.7713226584924222e-05, "learning_rate": 1.792223572296476e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -620.0, "logps/rejected": -1240.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.5, "rewards/margins": 53.75, "rewards/rejected": -37.25, "step": 620 }, { "epoch": 2.0655737704918034, "grad_norm": 1.0249551855925774e-06, "learning_rate": 1.7314702308626974e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -616.0, "logps/rejected": -1176.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.25, "rewards/margins": 52.25, "rewards/rejected": -36.0, "step": 630 }, { "epoch": 2.098360655737705, "grad_norm": 4.2707376166338526e-05, "learning_rate": 1.6707168894289186e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -656.0, "logps/rejected": -1192.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.625, "rewards/margins": 53.0, "rewards/rejected": -36.5, "step": 640 }, { "epoch": 2.1311475409836067, "grad_norm": 0.00034207508221282683, "learning_rate": 1.6099635479951396e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -544.0, "logps/rejected": -1240.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.8125, "rewards/margins": 51.5, "rewards/rejected": -37.75, "step": 650 }, { "epoch": 2.1639344262295084, "grad_norm": 5.419595774676902e-09, "learning_rate": 1.5492102065613608e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -482.0, "logps/rejected": -1200.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 12.1875, "rewards/margins": 49.0, "rewards/rejected": -37.0, "step": 660 }, { "epoch": 2.19672131147541, "grad_norm": 4.851068352220634e-11, "learning_rate": 1.488456865127582e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -752.0, "logps/rejected": -1288.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.0, "rewards/margins": 58.5, "rewards/rejected": -39.25, "step": 670 }, { "epoch": 2.2295081967213113, "grad_norm": 1.7612628984927503e-10, "learning_rate": 1.427703523693803e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -576.0, "logps/rejected": -1192.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.5625, "rewards/margins": 53.0, "rewards/rejected": -37.25, "step": 680 }, { "epoch": 2.262295081967213, "grad_norm": 3.275622407668139e-08, "learning_rate": 1.3669501822600243e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -612.0, "logps/rejected": -1264.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.125, "rewards/margins": 54.75, "rewards/rejected": -38.5, "step": 690 }, { "epoch": 2.2950819672131146, "grad_norm": 0.0013215912863667734, "learning_rate": 1.3061968408262452e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -580.0, "logps/rejected": -1232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.25, "rewards/margins": 51.0, "rewards/rejected": -36.75, "step": 700 }, { "epoch": 2.3278688524590163, "grad_norm": 0.00041899600497659085, "learning_rate": 1.2454434993924665e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -516.0, "logps/rejected": -1256.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.625, "rewards/margins": 52.5, "rewards/rejected": -38.75, "step": 710 }, { "epoch": 2.360655737704918, "grad_norm": 7.351246552401219e-09, "learning_rate": 1.1846901579586877e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -584.0, "logps/rejected": -1208.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.5, "rewards/margins": 51.5, "rewards/rejected": -37.0, "step": 720 }, { "epoch": 2.3934426229508197, "grad_norm": 1.2823458114586286e-12, "learning_rate": 1.1239368165249088e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -628.0, "logps/rejected": -1256.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.8125, "rewards/margins": 53.5, "rewards/rejected": -38.75, "step": 730 }, { "epoch": 2.4262295081967213, "grad_norm": 2.2183065144506676e-05, "learning_rate": 1.0631834750911299e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -672.0, "logps/rejected": -1272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.25, "rewards/margins": 54.5, "rewards/rejected": -39.0, "step": 740 }, { "epoch": 2.459016393442623, "grad_norm": 9.496741561479611e-10, "learning_rate": 1.0024301336573512e-07, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -540.0, "logps/rejected": -1216.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.4375, "rewards/margins": 52.0, "rewards/rejected": -38.5, "step": 750 }, { "epoch": 2.4918032786885247, "grad_norm": 6.428681857091136e-08, "learning_rate": 9.416767922235723e-08, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -676.0, "logps/rejected": -1232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.0, "rewards/margins": 54.5, "rewards/rejected": -37.75, "step": 760 }, { "epoch": 2.5245901639344264, "grad_norm": 2.0982264237925454e-08, "learning_rate": 8.809234507897934e-08, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.25, "rewards/margins": 53.25, "rewards/rejected": -37.0, "step": 770 }, { "epoch": 2.557377049180328, "grad_norm": 9.460545250549385e-16, "learning_rate": 8.201701093560146e-08, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -676.0, "logps/rejected": -1208.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.9375, "rewards/margins": 52.5, "rewards/rejected": -37.75, "step": 780 }, { "epoch": 2.5901639344262293, "grad_norm": 4.793838989333602e-09, "learning_rate": 7.594167679222357e-08, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -556.0, "logps/rejected": -1272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.25, "rewards/margins": 53.75, "rewards/rejected": -39.5, "step": 790 }, { "epoch": 2.6229508196721314, "grad_norm": 6.169679361669359e-11, "learning_rate": 6.986634264884568e-08, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -612.0, "logps/rejected": -1200.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.3125, "rewards/margins": 53.0, "rewards/rejected": -37.75, "step": 800 }, { "epoch": 2.6557377049180326, "grad_norm": 8.064196022250346e-08, "learning_rate": 6.37910085054678e-08, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -656.0, "logps/rejected": -1216.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.875, "rewards/margins": 54.75, "rewards/rejected": -37.75, "step": 810 }, { "epoch": 2.6885245901639343, "grad_norm": 1.1139956025424588e-07, "learning_rate": 5.771567436208991e-08, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -580.0, "logps/rejected": -1224.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.3125, "rewards/margins": 52.75, "rewards/rejected": -37.5, "step": 820 }, { "epoch": 2.721311475409836, "grad_norm": 1.8351283154166078e-11, "learning_rate": 5.164034021871203e-08, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -648.0, "logps/rejected": -1216.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.25, "rewards/margins": 52.5, "rewards/rejected": -37.25, "step": 830 }, { "epoch": 2.7540983606557377, "grad_norm": 5.336889215608692e-08, "learning_rate": 4.5565006075334144e-08, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -664.0, "logps/rejected": -1240.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.25, "rewards/margins": 54.75, "rewards/rejected": -38.5, "step": 840 }, { "epoch": 2.7868852459016393, "grad_norm": 1.3681049107441818e-10, "learning_rate": 3.9489671931956255e-08, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -680.0, "logps/rejected": -1240.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.125, "rewards/margins": 55.25, "rewards/rejected": -39.25, "step": 850 }, { "epoch": 2.819672131147541, "grad_norm": 2.7039086242151e-05, "learning_rate": 3.341433778857837e-08, "logits/chosen": 274.0, "logits/rejected": 282.0, "logps/chosen": -512.0, "logps/rejected": -1240.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.0625, "rewards/margins": 51.5, "rewards/rejected": -38.5, "step": 860 }, { "epoch": 2.8524590163934427, "grad_norm": 4.1661882432785415e-06, "learning_rate": 2.7339003645200486e-08, "logits/chosen": 276.0, "logits/rejected": 280.0, "logps/chosen": -588.0, "logps/rejected": -1240.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.5, "rewards/margins": 52.0, "rewards/rejected": -38.5, "step": 870 }, { "epoch": 2.8852459016393444, "grad_norm": 4.794762374205389e-10, "learning_rate": 2.12636695018226e-08, "logits/chosen": 274.0, "logits/rejected": 282.0, "logps/chosen": -608.0, "logps/rejected": -1208.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.3125, "rewards/margins": 53.0, "rewards/rejected": -37.75, "step": 880 }, { "epoch": 2.918032786885246, "grad_norm": 1.9558841209760312e-05, "learning_rate": 1.5188335358444714e-08, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -560.0, "logps/rejected": -1280.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.0, "rewards/margins": 54.0, "rewards/rejected": -40.0, "step": 890 }, { "epoch": 2.9508196721311473, "grad_norm": 0.00011639924338885354, "learning_rate": 9.113001215066828e-09, "logits/chosen": 274.0, "logits/rejected": 282.0, "logps/chosen": -492.0, "logps/rejected": -1232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.375, "rewards/margins": 52.0, "rewards/rejected": -38.5, "step": 900 }, { "epoch": 2.9836065573770494, "grad_norm": 9.409851507715962e-11, "learning_rate": 3.0376670716889426e-09, "logits/chosen": 276.0, "logits/rejected": 282.0, "logps/chosen": -536.0, "logps/rejected": -1296.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.375, "rewards/margins": 54.0, "rewards/rejected": -40.75, "step": 910 }, { "epoch": 3.0, "eval_logits/chosen": 276.0, "eval_logits/rejected": 282.0, "eval_logps/chosen": -676.0, "eval_logps/rejected": -1264.0, "eval_loss": 1.2766672341513186e-07, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 17.5, "eval_rewards/margins": 56.5, "eval_rewards/rejected": -39.0, "eval_runtime": 17.7894, "eval_samples_per_second": 11.018, "eval_steps_per_second": 0.393, "step": 915 }, { "epoch": 3.0, "step": 915, "total_flos": 0.0, "train_loss": 0.005914865085090486, "train_runtime": 5897.037, "train_samples_per_second": 4.954, "train_steps_per_second": 0.155 } ], "logging_steps": 10, "max_steps": 915, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }