{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990186457311089, "eval_steps": 100, "global_step": 509, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 9.803921568627451e-09, "logits/chosen": -2.7483465671539307, "logits/rejected": -2.739339828491211, "logps/chosen": -287.5325927734375, "logps/rejected": -235.635986328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 9.80392156862745e-08, "logits/chosen": -2.709578037261963, "logits/rejected": -2.7113540172576904, "logps/chosen": -260.56292724609375, "logps/rejected": -256.438232421875, "loss": 0.6932, "rewards/accuracies": 0.4194444417953491, "rewards/chosen": 0.00014394157915376127, "rewards/margins": 1.0432106591906631e-06, "rewards/rejected": 0.00014289839600678533, "step": 10 }, { "epoch": 0.04, "learning_rate": 1.96078431372549e-07, "logits/chosen": -2.728665828704834, "logits/rejected": -2.7061820030212402, "logps/chosen": -280.0662536621094, "logps/rejected": -254.76626586914062, "loss": 0.6926, "rewards/accuracies": 0.5724999904632568, "rewards/chosen": -4.974007424607407e-06, "rewards/margins": 0.0005589541979134083, "rewards/rejected": -0.0005639282753691077, "step": 20 }, { "epoch": 0.06, "learning_rate": 2.941176470588235e-07, "logits/chosen": -2.7290821075439453, "logits/rejected": -2.742999315261841, "logps/chosen": -279.2391357421875, "logps/rejected": -253.37265014648438, "loss": 0.6895, "rewards/accuracies": 0.6349999904632568, "rewards/chosen": 0.0049138437025249004, "rewards/margins": 0.007674422115087509, "rewards/rejected": -0.002760578179731965, "step": 30 }, { "epoch": 0.08, "learning_rate": 3.92156862745098e-07, "logits/chosen": -2.7134017944335938, "logits/rejected": -2.698641777038574, "logps/chosen": -274.20147705078125, "logps/rejected": -255.8253936767578, "loss": 0.6782, "rewards/accuracies": 0.6924999952316284, "rewards/chosen": 0.0260241087526083, "rewards/margins": 0.026919733732938766, "rewards/rejected": -0.0008956241654232144, "step": 40 }, { "epoch": 0.1, "learning_rate": 4.901960784313725e-07, "logits/chosen": -2.6435346603393555, "logits/rejected": -2.6110424995422363, "logps/chosen": -302.06768798828125, "logps/rejected": -261.10919189453125, "loss": 0.6612, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.023571131750941277, "rewards/margins": 0.07649616152048111, "rewards/rejected": -0.05292503535747528, "step": 50 }, { "epoch": 0.12, "learning_rate": 4.995237599803335e-07, "logits/chosen": -2.6205055713653564, "logits/rejected": -2.5843255519866943, "logps/chosen": -300.914306640625, "logps/rejected": -286.0216064453125, "loss": 0.6451, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.05583832785487175, "rewards/margins": 0.11994686722755432, "rewards/rejected": -0.17578519880771637, "step": 60 }, { "epoch": 0.14, "learning_rate": 4.978798275112142e-07, "logits/chosen": -2.607668161392212, "logits/rejected": -2.568187952041626, "logps/chosen": -308.4685974121094, "logps/rejected": -305.6259460449219, "loss": 0.6212, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1777888685464859, "rewards/margins": 0.19118839502334595, "rewards/rejected": -0.3689771890640259, "step": 70 }, { "epoch": 0.16, "learning_rate": 4.950700530747689e-07, "logits/chosen": -2.6067116260528564, "logits/rejected": -2.5767879486083984, "logps/chosen": -300.19488525390625, "logps/rejected": -295.8065185546875, "loss": 0.6196, "rewards/accuracies": 0.6850000023841858, "rewards/chosen": -0.13195012509822845, "rewards/margins": 0.25833892822265625, "rewards/rejected": -0.3902890384197235, "step": 80 }, { "epoch": 0.18, "learning_rate": 4.911076517558622e-07, "logits/chosen": -2.5809831619262695, "logits/rejected": -2.555103302001953, "logps/chosen": -325.28692626953125, "logps/rejected": -330.8323974609375, "loss": 0.5844, "rewards/accuracies": 0.7300000190734863, "rewards/chosen": -0.21861158311367035, "rewards/margins": 0.3220059275627136, "rewards/rejected": -0.5406175851821899, "step": 90 }, { "epoch": 0.2, "learning_rate": 4.860112597371772e-07, "logits/chosen": -2.5413742065429688, "logits/rejected": -2.5363407135009766, "logps/chosen": -295.8542175292969, "logps/rejected": -310.6338195800781, "loss": 0.5764, "rewards/accuracies": 0.6675000190734863, "rewards/chosen": -0.26630619168281555, "rewards/margins": 0.3358945846557617, "rewards/rejected": -0.6022006869316101, "step": 100 }, { "epoch": 0.2, "eval_logits/chosen": -2.4791219234466553, "eval_logits/rejected": -2.4360005855560303, "eval_logps/chosen": -313.6502990722656, "eval_logps/rejected": -340.86053466796875, "eval_loss": 0.5828901529312134, "eval_rewards/accuracies": 0.6931137442588806, "eval_rewards/chosen": -0.3592246174812317, "eval_rewards/margins": 0.40203189849853516, "eval_rewards/rejected": -0.7612565159797668, "eval_runtime": 494.2516, "eval_samples_per_second": 4.047, "eval_steps_per_second": 0.338, "step": 100 }, { "epoch": 0.22, "learning_rate": 4.798048466485017e-07, "logits/chosen": -2.0916123390197754, "logits/rejected": -2.1291110515594482, "logps/chosen": -337.0193786621094, "logps/rejected": -372.4815368652344, "loss": 0.5665, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6119796633720398, "rewards/margins": 0.5584384799003601, "rewards/rejected": -1.1704181432724, "step": 110 }, { "epoch": 0.24, "learning_rate": 4.725176028314541e-07, "logits/chosen": -1.8370585441589355, "logits/rejected": -1.7712280750274658, "logps/chosen": -370.1864318847656, "logps/rejected": -398.8289794921875, "loss": 0.56, "rewards/accuracies": 0.7350000143051147, "rewards/chosen": -0.8116917610168457, "rewards/margins": 0.6380540728569031, "rewards/rejected": -1.449745774269104, "step": 120 }, { "epoch": 0.26, "learning_rate": 4.641838020498713e-07, "logits/chosen": -1.7485500574111938, "logits/rejected": -1.5671393871307373, "logps/chosen": -380.29913330078125, "logps/rejected": -424.1035461425781, "loss": 0.5461, "rewards/accuracies": 0.7200000286102295, "rewards/chosen": -0.8717474937438965, "rewards/margins": 0.6444628834724426, "rewards/rejected": -1.5162103176116943, "step": 130 }, { "epoch": 0.27, "learning_rate": 4.5484264029156733e-07, "logits/chosen": -1.9667887687683105, "logits/rejected": -1.6983026266098022, "logps/chosen": -322.9972839355469, "logps/rejected": -379.5963134765625, "loss": 0.5416, "rewards/accuracies": 0.7149999737739563, "rewards/chosen": -0.6348860263824463, "rewards/margins": 0.6040786504745483, "rewards/rejected": -1.2389646768569946, "step": 140 }, { "epoch": 0.29, "learning_rate": 4.445380514196192e-07, "logits/chosen": -1.2058897018432617, "logits/rejected": -0.9969528317451477, "logps/chosen": -379.3441467285156, "logps/rejected": -449.9009704589844, "loss": 0.5485, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9173200726509094, "rewards/margins": 0.7758927941322327, "rewards/rejected": -1.6932127475738525, "step": 150 }, { "epoch": 0.31, "learning_rate": 4.33318500540218e-07, "logits/chosen": -1.7521625757217407, "logits/rejected": -1.4877443313598633, "logps/chosen": -356.1580810546875, "logps/rejected": -389.0058288574219, "loss": 0.5183, "rewards/accuracies": 0.7850000262260437, "rewards/chosen": -0.6841800212860107, "rewards/margins": 0.7851129174232483, "rewards/rejected": -1.4692928791046143, "step": 160 }, { "epoch": 0.33, "learning_rate": 4.2123675605892985e-07, "logits/chosen": -1.6861900091171265, "logits/rejected": -1.4684306383132935, "logps/chosen": -379.7774658203125, "logps/rejected": -437.3900451660156, "loss": 0.5146, "rewards/accuracies": 0.7300000190734863, "rewards/chosen": -0.8159699440002441, "rewards/margins": 0.7220683097839355, "rewards/rejected": -1.5380383729934692, "step": 170 }, { "epoch": 0.35, "learning_rate": 4.0834964149744333e-07, "logits/chosen": -1.3343206644058228, "logits/rejected": -1.0179518461227417, "logps/chosen": -358.3331298828125, "logps/rejected": -399.9204406738281, "loss": 0.5536, "rewards/accuracies": 0.7074999809265137, "rewards/chosen": -0.8257815837860107, "rewards/margins": 0.7000215649604797, "rewards/rejected": -1.5258032083511353, "step": 180 }, { "epoch": 0.37, "learning_rate": 3.947177682380738e-07, "logits/chosen": -1.2010215520858765, "logits/rejected": -0.8926857709884644, "logps/chosen": -375.1010437011719, "logps/rejected": -433.2417297363281, "loss": 0.5309, "rewards/accuracies": 0.7425000071525574, "rewards/chosen": -0.7876387238502502, "rewards/margins": 0.7681831121444702, "rewards/rejected": -1.5558221340179443, "step": 190 }, { "epoch": 0.39, "learning_rate": 3.804052504529933e-07, "logits/chosen": -1.1186742782592773, "logits/rejected": -0.7032889723777771, "logps/chosen": -351.2778625488281, "logps/rejected": -416.71820068359375, "loss": 0.5169, "rewards/accuracies": 0.7475000023841858, "rewards/chosen": -0.7259469032287598, "rewards/margins": 0.874809741973877, "rewards/rejected": -1.6007568836212158, "step": 200 }, { "epoch": 0.39, "eval_logits/chosen": -1.201006293296814, "eval_logits/rejected": -0.8443379402160645, "eval_logps/chosen": -366.2012023925781, "eval_logps/rejected": -426.77203369140625, "eval_loss": 0.531209409236908, "eval_rewards/accuracies": 0.7065868377685547, "eval_rewards/chosen": -0.8847335577011108, "eval_rewards/margins": 0.7356376647949219, "eval_rewards/rejected": -1.6203712224960327, "eval_runtime": 494.1792, "eval_samples_per_second": 4.047, "eval_steps_per_second": 0.338, "step": 200 }, { "epoch": 0.41, "learning_rate": 3.654794035589483e-07, "logits/chosen": -0.9955520629882812, "logits/rejected": -0.5436328649520874, "logps/chosen": -402.7477722167969, "logps/rejected": -444.9473876953125, "loss": 0.5126, "rewards/accuracies": 0.7225000262260437, "rewards/chosen": -1.0243951082229614, "rewards/margins": 0.7689486742019653, "rewards/rejected": -1.7933436632156372, "step": 210 }, { "epoch": 0.43, "learning_rate": 3.5001042761570826e-07, "logits/chosen": -0.7878814935684204, "logits/rejected": -0.33438754081726074, "logps/chosen": -379.41448974609375, "logps/rejected": -452.28009033203125, "loss": 0.5159, "rewards/accuracies": 0.7475000023841858, "rewards/chosen": -1.0701900720596313, "rewards/margins": 0.8491780161857605, "rewards/rejected": -1.919368028640747, "step": 220 }, { "epoch": 0.45, "learning_rate": 3.34071077157304e-07, "logits/chosen": -0.6851831078529358, "logits/rejected": -0.29147180914878845, "logps/chosen": -360.47869873046875, "logps/rejected": -406.3958740234375, "loss": 0.5399, "rewards/accuracies": 0.7149999737739563, "rewards/chosen": -0.9100778698921204, "rewards/margins": 0.7056692242622375, "rewards/rejected": -1.6157469749450684, "step": 230 }, { "epoch": 0.47, "learning_rate": 3.1773631900892204e-07, "logits/chosen": -0.6293848752975464, "logits/rejected": -0.2972988784313202, "logps/chosen": -364.2557067871094, "logps/rejected": -426.8414306640625, "loss": 0.5184, "rewards/accuracies": 0.75, "rewards/chosen": -0.945137083530426, "rewards/margins": 0.7834777235984802, "rewards/rejected": -1.7286149263381958, "step": 240 }, { "epoch": 0.49, "learning_rate": 3.0108297969883103e-07, "logits/chosen": -0.6830095052719116, "logits/rejected": -0.20727473497390747, "logps/chosen": -377.15960693359375, "logps/rejected": -440.8514709472656, "loss": 0.5199, "rewards/accuracies": 0.7475000023841858, "rewards/chosen": -0.9253360033035278, "rewards/margins": 0.7137148380279541, "rewards/rejected": -1.6390507221221924, "step": 250 }, { "epoch": 0.51, "learning_rate": 2.8418938412365013e-07, "logits/chosen": -0.595008909702301, "logits/rejected": -0.22117982804775238, "logps/chosen": -378.3102722167969, "logps/rejected": -421.2056884765625, "loss": 0.5259, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0280470848083496, "rewards/margins": 0.6548060774803162, "rewards/rejected": -1.682853102684021, "step": 260 }, { "epoch": 0.53, "learning_rate": 2.671349871664101e-07, "logits/chosen": -0.4738517105579376, "logits/rejected": -0.06301561743021011, "logps/chosen": -391.0889892578125, "logps/rejected": -433.60174560546875, "loss": 0.4996, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.925932765007019, "rewards/margins": 0.8979344367980957, "rewards/rejected": -1.8238672018051147, "step": 270 }, { "epoch": 0.55, "learning_rate": 2.5e-07, "logits/chosen": -0.29330724477767944, "logits/rejected": 0.11182761192321777, "logps/chosen": -400.1533203125, "logps/rejected": -453.4571228027344, "loss": 0.5108, "rewards/accuracies": 0.7174999713897705, "rewards/chosen": -1.1598564386367798, "rewards/margins": 0.7635893821716309, "rewards/rejected": -1.9234455823898315, "step": 280 }, { "epoch": 0.57, "learning_rate": 2.3286501283358982e-07, "logits/chosen": -0.049084682017564774, "logits/rejected": 0.32071781158447266, "logps/chosen": -421.474853515625, "logps/rejected": -480.5507507324219, "loss": 0.5107, "rewards/accuracies": 0.75, "rewards/chosen": -1.2823936939239502, "rewards/margins": 0.920534610748291, "rewards/rejected": -2.202928304672241, "step": 290 }, { "epoch": 0.59, "learning_rate": 2.1581061587634987e-07, "logits/chosen": -0.3210409879684448, "logits/rejected": 0.13426151871681213, "logps/chosen": -392.66351318359375, "logps/rejected": -457.4385681152344, "loss": 0.5133, "rewards/accuracies": 0.7825000286102295, "rewards/chosen": -1.2225959300994873, "rewards/margins": 0.9219253659248352, "rewards/rejected": -2.1445212364196777, "step": 300 }, { "epoch": 0.59, "eval_logits/chosen": -0.38526856899261475, "eval_logits/rejected": 0.0459565594792366, "eval_logps/chosen": -396.590576171875, "eval_logps/rejected": -460.7764892578125, "eval_loss": 0.5159304141998291, "eval_rewards/accuracies": 0.7245509028434753, "eval_rewards/chosen": -1.1886271238327026, "eval_rewards/margins": 0.7717891931533813, "eval_rewards/rejected": -1.9604166746139526, "eval_runtime": 494.4328, "eval_samples_per_second": 4.045, "eval_steps_per_second": 0.338, "step": 300 }, { "epoch": 0.61, "learning_rate": 1.9891702030116897e-07, "logits/chosen": -0.6406633257865906, "logits/rejected": 0.15507885813713074, "logps/chosen": -384.56219482421875, "logps/rejected": -443.3284912109375, "loss": 0.5192, "rewards/accuracies": 0.7599999904632568, "rewards/chosen": -1.066334843635559, "rewards/margins": 0.8297566175460815, "rewards/rejected": -1.8960914611816406, "step": 310 }, { "epoch": 0.63, "learning_rate": 1.8226368099107792e-07, "logits/chosen": -0.6926136016845703, "logits/rejected": -0.09604160487651825, "logps/chosen": -414.7826232910156, "logps/rejected": -454.5480041503906, "loss": 0.5065, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0457278490066528, "rewards/margins": 0.7350744605064392, "rewards/rejected": -1.7808022499084473, "step": 320 }, { "epoch": 0.65, "learning_rate": 1.6592892284269594e-07, "logits/chosen": -0.5141594409942627, "logits/rejected": 0.11050853878259659, "logps/chosen": -402.63348388671875, "logps/rejected": -431.8319091796875, "loss": 0.5093, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0640606880187988, "rewards/margins": 0.7925867438316345, "rewards/rejected": -1.8566473722457886, "step": 330 }, { "epoch": 0.67, "learning_rate": 1.4998957238429172e-07, "logits/chosen": -0.08297364413738251, "logits/rejected": 0.21859808266162872, "logps/chosen": -390.8412170410156, "logps/rejected": -461.3310546875, "loss": 0.505, "rewards/accuracies": 0.7275000214576721, "rewards/chosen": -1.190333604812622, "rewards/margins": 0.8922053575515747, "rewards/rejected": -2.0825393199920654, "step": 340 }, { "epoch": 0.69, "learning_rate": 1.345205964410517e-07, "logits/chosen": -0.539190948009491, "logits/rejected": -0.053236301988363266, "logps/chosen": -392.14385986328125, "logps/rejected": -447.09844970703125, "loss": 0.5125, "rewards/accuracies": 0.7774999737739563, "rewards/chosen": -0.9940242767333984, "rewards/margins": 0.9291434288024902, "rewards/rejected": -1.9231675863265991, "step": 350 }, { "epoch": 0.71, "learning_rate": 1.1959474954700665e-07, "logits/chosen": -0.6150873303413391, "logits/rejected": -0.08470536023378372, "logps/chosen": -377.5425109863281, "logps/rejected": -434.1069030761719, "loss": 0.5266, "rewards/accuracies": 0.7599999904632568, "rewards/chosen": -1.0171641111373901, "rewards/margins": 0.7864332795143127, "rewards/rejected": -1.803597092628479, "step": 360 }, { "epoch": 0.73, "learning_rate": 1.0528223176192615e-07, "logits/chosen": -0.464309424161911, "logits/rejected": 0.11655576527118683, "logps/chosen": -397.9951477050781, "logps/rejected": -446.141845703125, "loss": 0.4885, "rewards/accuracies": 0.7350000143051147, "rewards/chosen": -1.1220192909240723, "rewards/margins": 0.7690063714981079, "rewards/rejected": -1.8910256624221802, "step": 370 }, { "epoch": 0.75, "learning_rate": 9.16503585025567e-08, "logits/chosen": -0.3131292462348938, "logits/rejected": 0.1059599220752716, "logps/chosen": -398.6189880371094, "logps/rejected": -455.5489807128906, "loss": 0.4785, "rewards/accuracies": 0.7774999737739563, "rewards/chosen": -1.180424451828003, "rewards/margins": 0.9602058529853821, "rewards/rejected": -2.1406302452087402, "step": 380 }, { "epoch": 0.77, "learning_rate": 7.876324394107017e-08, "logits/chosen": -0.06371825933456421, "logits/rejected": 0.4222162663936615, "logps/chosen": -408.15203857421875, "logps/rejected": -469.3525085449219, "loss": 0.4945, "rewards/accuracies": 0.7774999737739563, "rewards/chosen": -1.2744272947311401, "rewards/margins": 0.8693990111351013, "rewards/rejected": -2.1438262462615967, "step": 390 }, { "epoch": 0.79, "learning_rate": 6.668149945978201e-08, "logits/chosen": -0.4337286353111267, "logits/rejected": 0.11450805515050888, "logps/chosen": -406.1577453613281, "logps/rejected": -468.1871337890625, "loss": 0.4968, "rewards/accuracies": 0.7574999928474426, "rewards/chosen": -1.204884648323059, "rewards/margins": 0.9240193367004395, "rewards/rejected": -2.128904104232788, "step": 400 }, { "epoch": 0.79, "eval_logits/chosen": -0.2552393972873688, "eval_logits/rejected": 0.20138485729694366, "eval_logps/chosen": -402.1766357421875, "eval_logps/rejected": -475.3639221191406, "eval_loss": 0.5057728290557861, "eval_rewards/accuracies": 0.7140718698501587, "eval_rewards/chosen": -1.2444883584976196, "eval_rewards/margins": 0.8618020415306091, "eval_rewards/rejected": -2.106290578842163, "eval_runtime": 493.9837, "eval_samples_per_second": 4.049, "eval_steps_per_second": 0.338, "step": 400 }, { "epoch": 0.8, "learning_rate": 5.546194858038072e-08, "logits/chosen": -0.3444100618362427, "logits/rejected": 0.08428356051445007, "logps/chosen": -419.0089111328125, "logps/rejected": -482.5577392578125, "loss": 0.488, "rewards/accuracies": 0.7325000166893005, "rewards/chosen": -1.1570134162902832, "rewards/margins": 0.9088660478591919, "rewards/rejected": -2.0658795833587646, "step": 410 }, { "epoch": 0.82, "learning_rate": 4.5157359708432626e-08, "logits/chosen": -0.3363034129142761, "logits/rejected": 0.1421819031238556, "logps/chosen": -417.26116943359375, "logps/rejected": -475.9188537597656, "loss": 0.5012, "rewards/accuracies": 0.7549999952316284, "rewards/chosen": -1.1876376867294312, "rewards/margins": 0.9119570255279541, "rewards/rejected": -2.0995945930480957, "step": 420 }, { "epoch": 0.84, "learning_rate": 3.581619795012874e-08, "logits/chosen": -0.4450594186782837, "logits/rejected": 0.03785795345902443, "logps/chosen": -404.95281982421875, "logps/rejected": -467.25531005859375, "loss": 0.4861, "rewards/accuracies": 0.7724999785423279, "rewards/chosen": -1.1584584712982178, "rewards/margins": 0.9622448086738586, "rewards/rejected": -2.1207032203674316, "step": 430 }, { "epoch": 0.86, "learning_rate": 2.748239716854589e-08, "logits/chosen": -0.31011733412742615, "logits/rejected": 0.310569167137146, "logps/chosen": -389.67132568359375, "logps/rejected": -470.01104736328125, "loss": 0.5105, "rewards/accuracies": 0.7350000143051147, "rewards/chosen": -1.1304560899734497, "rewards/margins": 0.8861461877822876, "rewards/rejected": -2.016602039337158, "step": 440 }, { "epoch": 0.88, "learning_rate": 2.0195153351498323e-08, "logits/chosen": -0.3003827631473541, "logits/rejected": 0.046957388520240784, "logps/chosen": -412.5171203613281, "logps/rejected": -481.26898193359375, "loss": 0.5128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1812173128128052, "rewards/margins": 0.8305546641349792, "rewards/rejected": -2.0117719173431396, "step": 450 }, { "epoch": 0.9, "learning_rate": 1.3988740262822846e-08, "logits/chosen": -0.47582343220710754, "logits/rejected": -0.11152289062738419, "logps/chosen": -410.2917175292969, "logps/rejected": -457.774658203125, "loss": 0.5044, "rewards/accuracies": 0.7649999856948853, "rewards/chosen": -1.1460288763046265, "rewards/margins": 0.8546761870384216, "rewards/rejected": -2.0007050037384033, "step": 460 }, { "epoch": 0.92, "learning_rate": 8.892348244137788e-09, "logits/chosen": -0.5770422220230103, "logits/rejected": -0.025662722066044807, "logps/chosen": -372.98187255859375, "logps/rejected": -467.86199951171875, "loss": 0.4973, "rewards/accuracies": 0.7200000286102295, "rewards/chosen": -1.0886142253875732, "rewards/margins": 0.8808639049530029, "rewards/rejected": -1.9694780111312866, "step": 470 }, { "epoch": 0.94, "learning_rate": 4.929946925231076e-09, "logits/chosen": -0.5876446962356567, "logits/rejected": -0.16365936398506165, "logps/chosen": -400.3377685546875, "logps/rejected": -455.9208068847656, "loss": 0.5072, "rewards/accuracies": 0.7024999856948853, "rewards/chosen": -1.1451067924499512, "rewards/margins": 0.7030719518661499, "rewards/rejected": -1.848178744316101, "step": 480 }, { "epoch": 0.96, "learning_rate": 2.1201724887858484e-09, "logits/chosen": -0.4430970847606659, "logits/rejected": 0.12594802677631378, "logps/chosen": -409.6846008300781, "logps/rejected": -458.5526428222656, "loss": 0.4887, "rewards/accuracies": 0.7574999928474426, "rewards/chosen": -1.0775573253631592, "rewards/margins": 0.9305427074432373, "rewards/rejected": -2.0081000328063965, "step": 490 }, { "epoch": 0.98, "learning_rate": 4.762400196664518e-10, "logits/chosen": -0.41937455534935, "logits/rejected": -0.08660510927438736, "logps/chosen": -385.8563232421875, "logps/rejected": -454.9473571777344, "loss": 0.4833, "rewards/accuracies": 0.7699999809265137, "rewards/chosen": -1.093034267425537, "rewards/margins": 0.9196186661720276, "rewards/rejected": -2.012652635574341, "step": 500 }, { "epoch": 0.98, "eval_logits/chosen": -0.4496035575866699, "eval_logits/rejected": 0.04359949380159378, "eval_logps/chosen": -395.9374084472656, "eval_logps/rejected": -470.5448303222656, "eval_loss": 0.5045374631881714, "eval_rewards/accuracies": 0.726047933101654, "eval_rewards/chosen": -1.182096004486084, "eval_rewards/margins": 0.876003086566925, "eval_rewards/rejected": -2.0580990314483643, "eval_runtime": 494.2334, "eval_samples_per_second": 4.047, "eval_steps_per_second": 0.338, "step": 500 }, { "epoch": 1.0, "step": 509, "total_flos": 0.0, "train_loss": 0.5401819272219315, "train_runtime": 34352.758, "train_samples_per_second": 1.78, "train_steps_per_second": 0.015 } ], "logging_steps": 10, "max_steps": 509, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }