diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24138 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100, + "global_step": 15453, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 3.234152652005175e-10, + "logits/chosen": -3.2907767295837402, + "logits/rejected": -3.217514991760254, + "logps/chosen": -159.67581176757812, + "logps/rejected": -734.8052368164062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 3.2341526520051748e-09, + "logits/chosen": -3.2340452671051025, + "logits/rejected": -3.225147008895874, + "logps/chosen": -250.78839111328125, + "logps/rejected": -257.5876770019531, + "loss": 0.693, + "rewards/accuracies": 0.3333333432674408, + "rewards/chosen": -0.0012301643146201968, + "rewards/margins": -0.001831890782341361, + "rewards/rejected": 0.000601727282628417, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 6.4683053040103496e-09, + "logits/chosen": -3.000033140182495, + "logits/rejected": -3.021458148956299, + "logps/chosen": -180.18971252441406, + "logps/rejected": -220.72903442382812, + "loss": 0.6903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.007337198592722416, + "rewards/margins": 0.005708443932235241, + "rewards/rejected": 0.0016287544276565313, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 9.702457956015523e-09, + "logits/chosen": -3.0931828022003174, + "logits/rejected": -3.086947202682495, + "logps/chosen": -230.19833374023438, + "logps/rejected": -267.9596252441406, + "loss": 0.6973, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0053744749166071415, + "rewards/margins": -0.0014988690381869674, + "rewards/rejected": 0.0068733422085642815, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 1.2936610608020699e-08, + "logits/chosen": -3.153712034225464, + "logits/rejected": -3.165261745452881, + "logps/chosen": -220.7694854736328, + "logps/rejected": -196.19557189941406, + "loss": 0.6918, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.002314257901161909, + "rewards/margins": -0.00820563267916441, + "rewards/rejected": 0.005891374312341213, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 1.6170763260025874e-08, + "logits/chosen": -3.139439344406128, + "logits/rejected": -3.1224019527435303, + "logps/chosen": -210.93881225585938, + "logps/rejected": -199.3134765625, + "loss": 0.6901, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004287729039788246, + "rewards/margins": -0.0009997839806601405, + "rewards/rejected": 0.005287514068186283, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 1.9404915912031046e-08, + "logits/chosen": -2.9774956703186035, + "logits/rejected": -3.070859909057617, + "logps/chosen": -174.82815551757812, + "logps/rejected": -222.3321990966797, + "loss": 0.6911, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.0032317829318344593, + "rewards/margins": 0.0030282307416200638, + "rewards/rejected": 0.0002035518700722605, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 2.2639068564036222e-08, + "logits/chosen": -3.0662150382995605, + "logits/rejected": -3.1663196086883545, + "logps/chosen": -166.78848266601562, + "logps/rejected": -136.50570678710938, + "loss": 0.6892, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.005116489715874195, + "rewards/margins": 0.0016806584317237139, + "rewards/rejected": -0.006797147449105978, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 2.5873221216041398e-08, + "logits/chosen": -3.105792999267578, + "logits/rejected": -3.087473154067993, + "logps/chosen": -250.6028289794922, + "logps/rejected": -221.30960083007812, + "loss": 0.6877, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0002625319757498801, + "rewards/margins": -0.008208994753658772, + "rewards/rejected": 0.00794646143913269, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 2.910737386804657e-08, + "logits/chosen": -3.0313832759857178, + "logits/rejected": -3.011046886444092, + "logps/chosen": -209.96633911132812, + "logps/rejected": -251.78164672851562, + "loss": 0.6922, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.007732807192951441, + "rewards/margins": 0.009743442758917809, + "rewards/rejected": -0.017476249486207962, + "step": 90 + }, + { + "epoch": 0.02, + "learning_rate": 3.234152652005175e-08, + "logits/chosen": -3.208557605743408, + "logits/rejected": -3.138239860534668, + "logps/chosen": -216.93978881835938, + "logps/rejected": -125.2887954711914, + "loss": 0.6896, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.014565570279955864, + "rewards/margins": -0.0040927669033408165, + "rewards/rejected": -0.010472802445292473, + "step": 100 + }, + { + "epoch": 0.02, + "eval_logits/chosen": -3.167656660079956, + "eval_logits/rejected": -3.177053928375244, + "eval_logps/chosen": -194.15455627441406, + "eval_logps/rejected": -171.9473419189453, + "eval_loss": 0.6889778971672058, + "eval_rewards/accuracies": 0.5475000143051147, + "eval_rewards/chosen": 0.008353026583790779, + "eval_rewards/margins": 0.011491414159536362, + "eval_rewards/rejected": -0.0031383878085762262, + "eval_runtime": 139.3488, + "eval_samples_per_second": 22.648, + "eval_steps_per_second": 0.359, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 3.557567917205692e-08, + "logits/chosen": -3.1216812133789062, + "logits/rejected": -3.1043143272399902, + "logps/chosen": -209.23745727539062, + "logps/rejected": -168.80552673339844, + "loss": 0.6885, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.009929950349032879, + "rewards/margins": 0.009177559986710548, + "rewards/rejected": 0.0007523916428908706, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 3.880983182406209e-08, + "logits/chosen": -3.1498494148254395, + "logits/rejected": -3.1368408203125, + "logps/chosen": -223.6138916015625, + "logps/rejected": -166.53366088867188, + "loss": 0.6838, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.023775454610586166, + "rewards/margins": 0.03654768317937851, + "rewards/rejected": -0.012772229500114918, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 4.204398447606727e-08, + "logits/chosen": -3.053675413131714, + "logits/rejected": -3.118798017501831, + "logps/chosen": -163.9273681640625, + "logps/rejected": -126.46878814697266, + "loss": 0.6842, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0008082082495093346, + "rewards/margins": 0.03312928229570389, + "rewards/rejected": -0.03232107311487198, + "step": 130 + }, + { + "epoch": 0.03, + "learning_rate": 4.5278137128072445e-08, + "logits/chosen": -3.198451519012451, + "logits/rejected": -3.1912155151367188, + "logps/chosen": -217.3875732421875, + "logps/rejected": -197.75624084472656, + "loss": 0.6766, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.018968766555190086, + "rewards/margins": 0.06026030331850052, + "rewards/rejected": -0.041291531175374985, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 4.851228978007762e-08, + "logits/chosen": -2.8457720279693604, + "logits/rejected": -2.8702220916748047, + "logps/chosen": -192.63583374023438, + "logps/rejected": -160.9226531982422, + "loss": 0.6765, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03224840387701988, + "rewards/margins": 0.04415305703878403, + "rewards/rejected": -0.011904651299118996, + "step": 150 + }, + { + "epoch": 0.03, + "learning_rate": 5.1746442432082797e-08, + "logits/chosen": -3.1837120056152344, + "logits/rejected": -3.167569637298584, + "logps/chosen": -187.67776489257812, + "logps/rejected": -217.16650390625, + "loss": 0.6781, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.045844174921512604, + "rewards/margins": 0.0020677223801612854, + "rewards/rejected": 0.04377645626664162, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 5.4980595084087966e-08, + "logits/chosen": -3.1629438400268555, + "logits/rejected": -3.1647531986236572, + "logps/chosen": -239.3665771484375, + "logps/rejected": -206.7849884033203, + "loss": 0.6701, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07763100415468216, + "rewards/margins": 0.07111285626888275, + "rewards/rejected": 0.006518153008073568, + "step": 170 + }, + { + "epoch": 0.03, + "learning_rate": 5.821474773609314e-08, + "logits/chosen": -2.9683380126953125, + "logits/rejected": -3.0374159812927246, + "logps/chosen": -143.70809936523438, + "logps/rejected": -158.6737823486328, + "loss": 0.6728, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03261490538716316, + "rewards/margins": 0.06522291898727417, + "rewards/rejected": -0.03260800987482071, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 6.144890038809831e-08, + "logits/chosen": -3.1599783897399902, + "logits/rejected": -3.188602924346924, + "logps/chosen": -266.9081726074219, + "logps/rejected": -293.92669677734375, + "loss": 0.668, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1383255124092102, + "rewards/margins": 0.06030426546931267, + "rewards/rejected": 0.07802124321460724, + "step": 190 + }, + { + "epoch": 0.04, + "learning_rate": 6.46830530401035e-08, + "logits/chosen": -2.9277074337005615, + "logits/rejected": -2.9044947624206543, + "logps/chosen": -159.93588256835938, + "logps/rejected": -129.9291534423828, + "loss": 0.6665, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03977008908987045, + "rewards/margins": 0.04497218504548073, + "rewards/rejected": -0.005202095955610275, + "step": 200 + }, + { + "epoch": 0.04, + "eval_logits/chosen": -3.1655266284942627, + "eval_logits/rejected": -3.1750502586364746, + "eval_logps/chosen": -193.48023986816406, + "eval_logps/rejected": -171.9678192138672, + "eval_loss": 0.6631521582603455, + "eval_rewards/accuracies": 0.6449999809265137, + "eval_rewards/chosen": 0.07578270137310028, + "eval_rewards/margins": 0.08096777647733688, + "eval_rewards/rejected": -0.0051850746385753155, + "eval_runtime": 138.4167, + "eval_samples_per_second": 22.801, + "eval_steps_per_second": 0.361, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 6.791720569210866e-08, + "logits/chosen": -3.181269645690918, + "logits/rejected": -3.1761131286621094, + "logps/chosen": -280.2482604980469, + "logps/rejected": -180.41729736328125, + "loss": 0.6558, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10838142782449722, + "rewards/margins": 0.07865364849567413, + "rewards/rejected": 0.029727783054113388, + "step": 210 + }, + { + "epoch": 0.04, + "learning_rate": 7.115135834411385e-08, + "logits/chosen": -3.204672336578369, + "logits/rejected": -3.2558085918426514, + "logps/chosen": -200.743408203125, + "logps/rejected": -233.82485961914062, + "loss": 0.6478, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.12597759068012238, + "rewards/margins": 0.132663294672966, + "rewards/rejected": -0.006685702595859766, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 7.438551099611902e-08, + "logits/chosen": -3.0432682037353516, + "logits/rejected": -3.0207479000091553, + "logps/chosen": -202.6326904296875, + "logps/rejected": -212.5495147705078, + "loss": 0.6383, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.09857180714607239, + "rewards/margins": 0.14784620702266693, + "rewards/rejected": -0.049274396151304245, + "step": 230 + }, + { + "epoch": 0.05, + "learning_rate": 7.761966364812419e-08, + "logits/chosen": -3.234316349029541, + "logits/rejected": -3.1052303314208984, + "logps/chosen": -259.6878356933594, + "logps/rejected": -223.2771453857422, + "loss": 0.6323, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.16739621758460999, + "rewards/margins": 0.13369934260845184, + "rewards/rejected": 0.03369685262441635, + "step": 240 + }, + { + "epoch": 0.05, + "learning_rate": 8.085381630012935e-08, + "logits/chosen": -3.020998477935791, + "logits/rejected": -2.997157096862793, + "logps/chosen": -296.86737060546875, + "logps/rejected": -222.3542022705078, + "loss": 0.6641, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.12462540715932846, + "rewards/margins": 0.07443695515394211, + "rewards/rejected": 0.05018845200538635, + "step": 250 + }, + { + "epoch": 0.05, + "learning_rate": 8.408796895213454e-08, + "logits/chosen": -3.225865602493286, + "logits/rejected": -3.2103190422058105, + "logps/chosen": -281.2466735839844, + "logps/rejected": -253.050537109375, + "loss": 0.6305, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.17998719215393066, + "rewards/margins": 0.23333589732646942, + "rewards/rejected": -0.05334869772195816, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 8.73221216041397e-08, + "logits/chosen": -3.1258041858673096, + "logits/rejected": -3.1957387924194336, + "logps/chosen": -274.235107421875, + "logps/rejected": -250.7463836669922, + "loss": 0.6346, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.11405845731496811, + "rewards/margins": 0.09639565646648407, + "rewards/rejected": 0.017662782222032547, + "step": 270 + }, + { + "epoch": 0.05, + "learning_rate": 9.055627425614489e-08, + "logits/chosen": -2.947895050048828, + "logits/rejected": -3.0267863273620605, + "logps/chosen": -190.09231567382812, + "logps/rejected": -167.23037719726562, + "loss": 0.6349, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03284968063235283, + "rewards/margins": 0.18831506371498108, + "rewards/rejected": -0.15546536445617676, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 9.379042690815006e-08, + "logits/chosen": -3.157709836959839, + "logits/rejected": -3.1099419593811035, + "logps/chosen": -237.16323852539062, + "logps/rejected": -219.634765625, + "loss": 0.6211, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.12987887859344482, + "rewards/margins": 0.12882006168365479, + "rewards/rejected": 0.0010588064324110746, + "step": 290 + }, + { + "epoch": 0.06, + "learning_rate": 9.702457956015524e-08, + "logits/chosen": -3.1737518310546875, + "logits/rejected": -3.1884818077087402, + "logps/chosen": -173.21974182128906, + "logps/rejected": -213.5424346923828, + "loss": 0.6381, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1705276370048523, + "rewards/margins": 0.17234186828136444, + "rewards/rejected": -0.0018142417538911104, + "step": 300 + }, + { + "epoch": 0.06, + "eval_logits/chosen": -3.1614997386932373, + "eval_logits/rejected": -3.171653985977173, + "eval_logps/chosen": -192.6167755126953, + "eval_logps/rejected": -172.62965393066406, + "eval_loss": 0.6209953427314758, + "eval_rewards/accuracies": 0.6875, + "eval_rewards/chosen": 0.16212961077690125, + "eval_rewards/margins": 0.23349855840206146, + "eval_rewards/rejected": -0.07136894017457962, + "eval_runtime": 139.5791, + "eval_samples_per_second": 22.611, + "eval_steps_per_second": 0.358, + "step": 300 + }, + { + "epoch": 0.06, + "learning_rate": 1.0025873221216041e-07, + "logits/chosen": -3.153163194656372, + "logits/rejected": -3.08953857421875, + "logps/chosen": -255.1444091796875, + "logps/rejected": -282.613037109375, + "loss": 0.5999, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.19228589534759521, + "rewards/margins": 0.24539165198802948, + "rewards/rejected": -0.053105778992176056, + "step": 310 + }, + { + "epoch": 0.06, + "learning_rate": 1.0349288486416559e-07, + "logits/chosen": -3.199058771133423, + "logits/rejected": -3.18471622467041, + "logps/chosen": -179.1886444091797, + "logps/rejected": -257.89703369140625, + "loss": 0.6142, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.21753160655498505, + "rewards/margins": 0.11381890624761581, + "rewards/rejected": 0.10371267795562744, + "step": 320 + }, + { + "epoch": 0.06, + "learning_rate": 1.0672703751617076e-07, + "logits/chosen": -3.153376817703247, + "logits/rejected": -3.18345308303833, + "logps/chosen": -207.62661743164062, + "logps/rejected": -196.59873962402344, + "loss": 0.5991, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.28966066241264343, + "rewards/margins": 0.43374133110046387, + "rewards/rejected": -0.14408066868782043, + "step": 330 + }, + { + "epoch": 0.07, + "learning_rate": 1.0996119016817593e-07, + "logits/chosen": -3.0663774013519287, + "logits/rejected": -3.0867068767547607, + "logps/chosen": -176.10557556152344, + "logps/rejected": -202.84716796875, + "loss": 0.6079, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.241808220744133, + "rewards/margins": 0.007571871392428875, + "rewards/rejected": 0.23423632979393005, + "step": 340 + }, + { + "epoch": 0.07, + "learning_rate": 1.131953428201811e-07, + "logits/chosen": -3.060717821121216, + "logits/rejected": -3.0316948890686035, + "logps/chosen": -163.25799560546875, + "logps/rejected": -189.8742218017578, + "loss": 0.6059, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.31728318333625793, + "rewards/margins": 0.29380735754966736, + "rewards/rejected": 0.02347579039633274, + "step": 350 + }, + { + "epoch": 0.07, + "learning_rate": 1.1642949547218628e-07, + "logits/chosen": -3.1615848541259766, + "logits/rejected": -3.150390386581421, + "logps/chosen": -255.5399932861328, + "logps/rejected": -171.8251190185547, + "loss": 0.5856, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.21890723705291748, + "rewards/margins": 0.14798519015312195, + "rewards/rejected": 0.07092205435037613, + "step": 360 + }, + { + "epoch": 0.07, + "learning_rate": 1.1966364812419147e-07, + "logits/chosen": -2.9652140140533447, + "logits/rejected": -3.0450243949890137, + "logps/chosen": -245.3134765625, + "logps/rejected": -197.13406372070312, + "loss": 0.6142, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.28407177329063416, + "rewards/margins": 0.3457433581352234, + "rewards/rejected": -0.061671603471040726, + "step": 370 + }, + { + "epoch": 0.07, + "learning_rate": 1.2289780077619662e-07, + "logits/chosen": -2.9582393169403076, + "logits/rejected": -2.9958596229553223, + "logps/chosen": -214.2012939453125, + "logps/rejected": -178.59019470214844, + "loss": 0.6159, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14877629280090332, + "rewards/margins": 0.22745585441589355, + "rewards/rejected": -0.07867956161499023, + "step": 380 + }, + { + "epoch": 0.08, + "learning_rate": 1.261319534282018e-07, + "logits/chosen": -3.0242486000061035, + "logits/rejected": -3.0409722328186035, + "logps/chosen": -326.0379638671875, + "logps/rejected": -293.76422119140625, + "loss": 0.5815, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.2768373489379883, + "rewards/margins": 0.2867100238800049, + "rewards/rejected": -0.009872669354081154, + "step": 390 + }, + { + "epoch": 0.08, + "learning_rate": 1.29366106080207e-07, + "logits/chosen": -3.0544285774230957, + "logits/rejected": -3.0994880199432373, + "logps/chosen": -191.39834594726562, + "logps/rejected": -174.6272430419922, + "loss": 0.5753, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.24598166346549988, + "rewards/margins": 0.2981668710708618, + "rewards/rejected": -0.052185166627168655, + "step": 400 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -3.144436836242676, + "eval_logits/rejected": -3.1545228958129883, + "eval_logps/chosen": -193.4642791748047, + "eval_logps/rejected": -175.2998504638672, + "eval_loss": 0.5865161418914795, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": 0.0773763582110405, + "eval_rewards/margins": 0.4157639741897583, + "eval_rewards/rejected": -0.3383876383304596, + "eval_runtime": 139.377, + "eval_samples_per_second": 22.644, + "eval_steps_per_second": 0.359, + "step": 400 + }, + { + "epoch": 0.08, + "learning_rate": 1.3260025873221214e-07, + "logits/chosen": -3.0932059288024902, + "logits/rejected": -3.1315836906433105, + "logps/chosen": -232.5525360107422, + "logps/rejected": -199.24893188476562, + "loss": 0.6045, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21984687447547913, + "rewards/margins": 0.43035203218460083, + "rewards/rejected": -0.2105051577091217, + "step": 410 + }, + { + "epoch": 0.08, + "learning_rate": 1.3583441138421733e-07, + "logits/chosen": -2.9779956340789795, + "logits/rejected": -2.9564549922943115, + "logps/chosen": -174.45761108398438, + "logps/rejected": -140.55642700195312, + "loss": 0.5481, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.006199514959007502, + "rewards/margins": 0.4953877329826355, + "rewards/rejected": -0.5015872716903687, + "step": 420 + }, + { + "epoch": 0.08, + "learning_rate": 1.390685640362225e-07, + "logits/chosen": -2.9969263076782227, + "logits/rejected": -3.0024189949035645, + "logps/chosen": -169.60955810546875, + "logps/rejected": -213.49417114257812, + "loss": 0.5877, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11862262338399887, + "rewards/margins": 0.32721397280693054, + "rewards/rejected": -0.20859134197235107, + "step": 430 + }, + { + "epoch": 0.09, + "learning_rate": 1.423027166882277e-07, + "logits/chosen": -3.1354074478149414, + "logits/rejected": -3.133338212966919, + "logps/chosen": -316.2953796386719, + "logps/rejected": -312.092041015625, + "loss": 0.5342, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5428065657615662, + "rewards/margins": 0.8526216745376587, + "rewards/rejected": -0.309814989566803, + "step": 440 + }, + { + "epoch": 0.09, + "learning_rate": 1.4553686934023285e-07, + "logits/chosen": -3.1289236545562744, + "logits/rejected": -3.0934605598449707, + "logps/chosen": -204.59451293945312, + "logps/rejected": -230.4581298828125, + "loss": 0.5932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.044186461716890335, + "rewards/margins": 0.3902572989463806, + "rewards/rejected": -0.43444371223449707, + "step": 450 + }, + { + "epoch": 0.09, + "learning_rate": 1.4877102199223803e-07, + "logits/chosen": -2.936959743499756, + "logits/rejected": -2.8640313148498535, + "logps/chosen": -121.1145248413086, + "logps/rejected": -152.80404663085938, + "loss": 0.5942, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.16668501496315002, + "rewards/margins": 0.2552078664302826, + "rewards/rejected": -0.4218928813934326, + "step": 460 + }, + { + "epoch": 0.09, + "learning_rate": 1.520051746442432e-07, + "logits/chosen": -2.9685120582580566, + "logits/rejected": -2.995673418045044, + "logps/chosen": -170.37899780273438, + "logps/rejected": -216.64419555664062, + "loss": 0.566, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07195017486810684, + "rewards/margins": 0.38806548714637756, + "rewards/rejected": -0.3161153197288513, + "step": 470 + }, + { + "epoch": 0.09, + "learning_rate": 1.5523932729624837e-07, + "logits/chosen": -3.1304144859313965, + "logits/rejected": -3.1346306800842285, + "logps/chosen": -200.91786193847656, + "logps/rejected": -193.68020629882812, + "loss": 0.5686, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.19373224675655365, + "rewards/margins": 0.5450137257575989, + "rewards/rejected": -0.3512814939022064, + "step": 480 + }, + { + "epoch": 0.1, + "learning_rate": 1.5847347994825355e-07, + "logits/chosen": -3.076622486114502, + "logits/rejected": -3.0705759525299072, + "logps/chosen": -238.1417694091797, + "logps/rejected": -184.45993041992188, + "loss": 0.5602, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07782919704914093, + "rewards/margins": 0.4706133306026459, + "rewards/rejected": -0.3927842080593109, + "step": 490 + }, + { + "epoch": 0.1, + "learning_rate": 1.617076326002587e-07, + "logits/chosen": -3.022279739379883, + "logits/rejected": -2.996781826019287, + "logps/chosen": -333.6932373046875, + "logps/rejected": -252.64376831054688, + "loss": 0.5092, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.2657441198825836, + "rewards/margins": 1.2168611288070679, + "rewards/rejected": -0.9511170387268066, + "step": 500 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -3.1387369632720947, + "eval_logits/rejected": -3.1475753784179688, + "eval_logps/chosen": -194.74359130859375, + "eval_logps/rejected": -178.61302185058594, + "eval_loss": 0.5518030524253845, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -0.05055353045463562, + "eval_rewards/margins": 0.6191545724868774, + "eval_rewards/rejected": -0.6697080135345459, + "eval_runtime": 138.877, + "eval_samples_per_second": 22.725, + "eval_steps_per_second": 0.36, + "step": 500 + }, + { + "epoch": 0.1, + "learning_rate": 1.649417852522639e-07, + "logits/chosen": -3.1498000621795654, + "logits/rejected": -3.124570846557617, + "logps/chosen": -157.44468688964844, + "logps/rejected": -174.67886352539062, + "loss": 0.584, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.46559691429138184, + "rewards/margins": 0.264739990234375, + "rewards/rejected": -0.7303369641304016, + "step": 510 + }, + { + "epoch": 0.1, + "learning_rate": 1.6817593790426907e-07, + "logits/chosen": -3.18113374710083, + "logits/rejected": -3.159060001373291, + "logps/chosen": -217.8606719970703, + "logps/rejected": -211.8629150390625, + "loss": 0.5402, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.25132066011428833, + "rewards/margins": 0.5133770704269409, + "rewards/rejected": -0.7646977305412292, + "step": 520 + }, + { + "epoch": 0.1, + "learning_rate": 1.7141009055627426e-07, + "logits/chosen": -3.1162872314453125, + "logits/rejected": -3.1024136543273926, + "logps/chosen": -163.42092895507812, + "logps/rejected": -174.1112823486328, + "loss": 0.5486, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.26286181807518005, + "rewards/margins": 0.5959106683731079, + "rewards/rejected": -0.8587724566459656, + "step": 530 + }, + { + "epoch": 0.1, + "learning_rate": 1.746442432082794e-07, + "logits/chosen": -3.080259084701538, + "logits/rejected": -3.0933754444122314, + "logps/chosen": -200.79342651367188, + "logps/rejected": -217.738037109375, + "loss": 0.5488, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05293453857302666, + "rewards/margins": 0.29576388001441956, + "rewards/rejected": -0.3486984074115753, + "step": 540 + }, + { + "epoch": 0.11, + "learning_rate": 1.778783958602846e-07, + "logits/chosen": -3.1378228664398193, + "logits/rejected": -3.135917901992798, + "logps/chosen": -205.9120330810547, + "logps/rejected": -194.05776977539062, + "loss": 0.5737, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09442628920078278, + "rewards/margins": 0.6792758703231812, + "rewards/rejected": -0.7737022042274475, + "step": 550 + }, + { + "epoch": 0.11, + "learning_rate": 1.8111254851228978e-07, + "logits/chosen": -2.992809772491455, + "logits/rejected": -3.0085959434509277, + "logps/chosen": -181.65798950195312, + "logps/rejected": -188.4614715576172, + "loss": 0.52, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11401055008172989, + "rewards/margins": 0.7531822919845581, + "rewards/rejected": -0.6391717195510864, + "step": 560 + }, + { + "epoch": 0.11, + "learning_rate": 1.8434670116429496e-07, + "logits/chosen": -3.00810170173645, + "logits/rejected": -3.033761978149414, + "logps/chosen": -230.50656127929688, + "logps/rejected": -231.4688262939453, + "loss": 0.5615, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.27678605914115906, + "rewards/margins": 0.7314552068710327, + "rewards/rejected": -0.45466917753219604, + "step": 570 + }, + { + "epoch": 0.11, + "learning_rate": 1.8758085381630012e-07, + "logits/chosen": -3.113915205001831, + "logits/rejected": -3.156710147857666, + "logps/chosen": -342.560302734375, + "logps/rejected": -283.44403076171875, + "loss": 0.5778, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4650874137878418, + "rewards/margins": 0.941075325012207, + "rewards/rejected": -0.47598785161972046, + "step": 580 + }, + { + "epoch": 0.11, + "learning_rate": 1.9081500646830527e-07, + "logits/chosen": -3.1269617080688477, + "logits/rejected": -3.1666359901428223, + "logps/chosen": -234.28176879882812, + "logps/rejected": -181.70407104492188, + "loss": 0.6159, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.21833765506744385, + "rewards/margins": 0.8614624738693237, + "rewards/rejected": -0.6431248784065247, + "step": 590 + }, + { + "epoch": 0.12, + "learning_rate": 1.9404915912031048e-07, + "logits/chosen": -3.076352596282959, + "logits/rejected": -2.9966773986816406, + "logps/chosen": -139.66424560546875, + "logps/rejected": -213.1605987548828, + "loss": 0.5374, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4512442648410797, + "rewards/margins": 0.5663000345230103, + "rewards/rejected": -1.0175443887710571, + "step": 600 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -3.1174840927124023, + "eval_logits/rejected": -3.1267504692077637, + "eval_logps/chosen": -197.49191284179688, + "eval_logps/rejected": -183.15155029296875, + "eval_loss": 0.5301532745361328, + "eval_rewards/accuracies": 0.7149999737739563, + "eval_rewards/chosen": -0.32538560032844543, + "eval_rewards/margins": 0.7981722354888916, + "eval_rewards/rejected": -1.1235578060150146, + "eval_runtime": 148.0938, + "eval_samples_per_second": 21.311, + "eval_steps_per_second": 0.338, + "step": 600 + }, + { + "epoch": 0.12, + "learning_rate": 1.9728331177231564e-07, + "logits/chosen": -3.1216952800750732, + "logits/rejected": -3.145416498184204, + "logps/chosen": -248.3773956298828, + "logps/rejected": -221.88284301757812, + "loss": 0.5019, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03421555832028389, + "rewards/margins": 1.1549384593963623, + "rewards/rejected": -1.1891541481018066, + "step": 610 + }, + { + "epoch": 0.12, + "learning_rate": 2.0051746442432082e-07, + "logits/chosen": -3.0570244789123535, + "logits/rejected": -3.067242383956909, + "logps/chosen": -274.6044616699219, + "logps/rejected": -183.0326385498047, + "loss": 0.5109, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2071150541305542, + "rewards/margins": 0.6557462811470032, + "rewards/rejected": -0.8628614544868469, + "step": 620 + }, + { + "epoch": 0.12, + "learning_rate": 2.0375161707632598e-07, + "logits/chosen": -3.0616676807403564, + "logits/rejected": -3.0857839584350586, + "logps/chosen": -228.970947265625, + "logps/rejected": -189.21237182617188, + "loss": 0.5213, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5329365730285645, + "rewards/margins": 0.4307224154472351, + "rewards/rejected": -0.9636589288711548, + "step": 630 + }, + { + "epoch": 0.12, + "learning_rate": 2.0698576972833119e-07, + "logits/chosen": -2.9985756874084473, + "logits/rejected": -2.9745564460754395, + "logps/chosen": -220.9756622314453, + "logps/rejected": -221.86508178710938, + "loss": 0.56, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.365543931722641, + "rewards/margins": 0.93255615234375, + "rewards/rejected": -1.298100233078003, + "step": 640 + }, + { + "epoch": 0.13, + "learning_rate": 2.1021992238033634e-07, + "logits/chosen": -2.8438754081726074, + "logits/rejected": -2.9238531589508057, + "logps/chosen": -228.2821807861328, + "logps/rejected": -217.6581573486328, + "loss": 0.5303, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.10670392215251923, + "rewards/margins": 0.9380793571472168, + "rewards/rejected": -0.8313754200935364, + "step": 650 + }, + { + "epoch": 0.13, + "learning_rate": 2.1345407503234153e-07, + "logits/chosen": -3.0386316776275635, + "logits/rejected": -3.099316120147705, + "logps/chosen": -157.04953002929688, + "logps/rejected": -198.0177001953125, + "loss": 0.5428, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.14069390296936035, + "rewards/margins": 1.6727489233016968, + "rewards/rejected": -1.532055139541626, + "step": 660 + }, + { + "epoch": 0.13, + "learning_rate": 2.1668822768434668e-07, + "logits/chosen": -3.1025519371032715, + "logits/rejected": -3.0931642055511475, + "logps/chosen": -182.56222534179688, + "logps/rejected": -317.12481689453125, + "loss": 0.5359, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.14728078246116638, + "rewards/margins": 0.41670504212379456, + "rewards/rejected": -0.2694242000579834, + "step": 670 + }, + { + "epoch": 0.13, + "learning_rate": 2.1992238033635186e-07, + "logits/chosen": -3.0557260513305664, + "logits/rejected": -3.0878586769104004, + "logps/chosen": -179.29005432128906, + "logps/rejected": -232.91567993164062, + "loss": 0.5284, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07645130902528763, + "rewards/margins": 0.4627855718135834, + "rewards/rejected": -0.5392369031906128, + "step": 680 + }, + { + "epoch": 0.13, + "learning_rate": 2.2315653298835705e-07, + "logits/chosen": -2.7653260231018066, + "logits/rejected": -2.883758068084717, + "logps/chosen": -247.56021118164062, + "logps/rejected": -213.8865509033203, + "loss": 0.5972, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.6671286821365356, + "rewards/margins": 1.2729079723358154, + "rewards/rejected": -0.6057791709899902, + "step": 690 + }, + { + "epoch": 0.14, + "learning_rate": 2.263906856403622e-07, + "logits/chosen": -2.958416223526001, + "logits/rejected": -2.972041606903076, + "logps/chosen": -167.88119506835938, + "logps/rejected": -189.9696807861328, + "loss": 0.4719, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6142417192459106, + "rewards/margins": 1.2729040384292603, + "rewards/rejected": -1.8871456384658813, + "step": 700 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -3.1009063720703125, + "eval_logits/rejected": -3.1106061935424805, + "eval_logps/chosen": -199.7174835205078, + "eval_logps/rejected": -187.49134826660156, + "eval_loss": 0.5122122168540955, + "eval_rewards/accuracies": 0.7225000262260437, + "eval_rewards/chosen": -0.5479406118392944, + "eval_rewards/margins": 1.009599208831787, + "eval_rewards/rejected": -1.5575398206710815, + "eval_runtime": 138.7439, + "eval_samples_per_second": 22.747, + "eval_steps_per_second": 0.36, + "step": 700 + }, + { + "epoch": 0.14, + "learning_rate": 2.2962483829236739e-07, + "logits/chosen": -2.9802756309509277, + "logits/rejected": -3.100799560546875, + "logps/chosen": -172.6888427734375, + "logps/rejected": -204.21240234375, + "loss": 0.5192, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5314847230911255, + "rewards/margins": 1.2006410360336304, + "rewards/rejected": -1.7321258783340454, + "step": 710 + }, + { + "epoch": 0.14, + "learning_rate": 2.3285899094437257e-07, + "logits/chosen": -3.094202756881714, + "logits/rejected": -3.1324687004089355, + "logps/chosen": -238.36331176757812, + "logps/rejected": -250.6588592529297, + "loss": 0.5113, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.7062119245529175, + "rewards/margins": 0.8887389898300171, + "rewards/rejected": -1.5949509143829346, + "step": 720 + }, + { + "epoch": 0.14, + "learning_rate": 2.3609314359637775e-07, + "logits/chosen": -3.112473964691162, + "logits/rejected": -3.1348583698272705, + "logps/chosen": -223.6353302001953, + "logps/rejected": -282.94036865234375, + "loss": 0.509, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3504326045513153, + "rewards/margins": 1.08292555809021, + "rewards/rejected": -1.4333581924438477, + "step": 730 + }, + { + "epoch": 0.14, + "learning_rate": 2.3932729624838293e-07, + "logits/chosen": -2.929335594177246, + "logits/rejected": -3.038344621658325, + "logps/chosen": -256.2840576171875, + "logps/rejected": -270.9310607910156, + "loss": 0.4907, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7100075483322144, + "rewards/margins": 1.0054277181625366, + "rewards/rejected": -1.7154353857040405, + "step": 740 + }, + { + "epoch": 0.15, + "learning_rate": 2.425614489003881e-07, + "logits/chosen": -3.071770668029785, + "logits/rejected": -3.0335705280303955, + "logps/chosen": -184.1170196533203, + "logps/rejected": -190.11817932128906, + "loss": 0.5403, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.0880534648895264, + "rewards/margins": 0.9439334869384766, + "rewards/rejected": -2.031986951828003, + "step": 750 + }, + { + "epoch": 0.15, + "learning_rate": 2.4579560155239325e-07, + "logits/chosen": -3.0905098915100098, + "logits/rejected": -3.1044254302978516, + "logps/chosen": -206.65673828125, + "logps/rejected": -212.1765594482422, + "loss": 0.5203, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6541646718978882, + "rewards/margins": 1.224506139755249, + "rewards/rejected": -1.8786706924438477, + "step": 760 + }, + { + "epoch": 0.15, + "learning_rate": 2.4902975420439843e-07, + "logits/chosen": -3.105860710144043, + "logits/rejected": -3.193061351776123, + "logps/chosen": -248.20712280273438, + "logps/rejected": -259.50555419921875, + "loss": 0.4783, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09263571351766586, + "rewards/margins": 1.649441123008728, + "rewards/rejected": -1.742077112197876, + "step": 770 + }, + { + "epoch": 0.15, + "learning_rate": 2.522639068564036e-07, + "logits/chosen": -3.066476583480835, + "logits/rejected": -3.076321840286255, + "logps/chosen": -231.3736572265625, + "logps/rejected": -201.08914184570312, + "loss": 0.4991, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7740650773048401, + "rewards/margins": 0.49084991216659546, + "rewards/rejected": -1.2649149894714355, + "step": 780 + }, + { + "epoch": 0.15, + "learning_rate": 2.554980595084088e-07, + "logits/chosen": -2.982896327972412, + "logits/rejected": -2.9362235069274902, + "logps/chosen": -188.5688934326172, + "logps/rejected": -235.40676879882812, + "loss": 0.5174, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.06574110686779022, + "rewards/margins": 1.3378245830535889, + "rewards/rejected": -1.2720834016799927, + "step": 790 + }, + { + "epoch": 0.16, + "learning_rate": 2.58732212160414e-07, + "logits/chosen": -2.9199814796447754, + "logits/rejected": -2.8367676734924316, + "logps/chosen": -203.72073364257812, + "logps/rejected": -298.4263610839844, + "loss": 0.5036, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9280701875686646, + "rewards/margins": 0.9835942983627319, + "rewards/rejected": -1.911664605140686, + "step": 800 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -3.106929063796997, + "eval_logits/rejected": -3.1154816150665283, + "eval_logps/chosen": -197.77255249023438, + "eval_logps/rejected": -187.2395477294922, + "eval_loss": 0.5092905163764954, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -0.3534494638442993, + "eval_rewards/margins": 1.1789119243621826, + "eval_rewards/rejected": -1.5323612689971924, + "eval_runtime": 140.2502, + "eval_samples_per_second": 22.503, + "eval_steps_per_second": 0.357, + "step": 800 + }, + { + "epoch": 0.16, + "learning_rate": 2.619663648124191e-07, + "logits/chosen": -2.895864963531494, + "logits/rejected": -2.9698562622070312, + "logps/chosen": -253.4523162841797, + "logps/rejected": -179.3986358642578, + "loss": 0.6193, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8969345092773438, + "rewards/margins": 0.17108377814292908, + "rewards/rejected": -1.0680183172225952, + "step": 810 + }, + { + "epoch": 0.16, + "learning_rate": 2.652005174644243e-07, + "logits/chosen": -3.0189733505249023, + "logits/rejected": -3.091141700744629, + "logps/chosen": -145.2667694091797, + "logps/rejected": -208.92019653320312, + "loss": 0.6153, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9238840341567993, + "rewards/margins": 0.9646676182746887, + "rewards/rejected": -1.8885517120361328, + "step": 820 + }, + { + "epoch": 0.16, + "learning_rate": 2.684346701164295e-07, + "logits/chosen": -3.007552146911621, + "logits/rejected": -3.0918264389038086, + "logps/chosen": -149.43923950195312, + "logps/rejected": -190.80862426757812, + "loss": 0.5495, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3854085206985474, + "rewards/margins": 0.9688920974731445, + "rewards/rejected": -2.3543004989624023, + "step": 830 + }, + { + "epoch": 0.16, + "learning_rate": 2.7166882276843465e-07, + "logits/chosen": -2.995171308517456, + "logits/rejected": -3.0511744022369385, + "logps/chosen": -307.744140625, + "logps/rejected": -257.0872497558594, + "loss": 0.4775, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.41892343759536743, + "rewards/margins": 1.1574362516403198, + "rewards/rejected": -1.5763596296310425, + "step": 840 + }, + { + "epoch": 0.17, + "learning_rate": 2.7490297542043984e-07, + "logits/chosen": -3.008758068084717, + "logits/rejected": -3.0798912048339844, + "logps/chosen": -206.8883819580078, + "logps/rejected": -165.884765625, + "loss": 0.5413, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.737189769744873, + "rewards/margins": 1.2928497791290283, + "rewards/rejected": -2.0300393104553223, + "step": 850 + }, + { + "epoch": 0.17, + "learning_rate": 2.78137128072445e-07, + "logits/chosen": -3.051088809967041, + "logits/rejected": -3.110631227493286, + "logps/chosen": -113.84765625, + "logps/rejected": -138.62364196777344, + "loss": 0.5105, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2797080278396606, + "rewards/margins": 1.187454104423523, + "rewards/rejected": -2.4671621322631836, + "step": 860 + }, + { + "epoch": 0.17, + "learning_rate": 2.8137128072445015e-07, + "logits/chosen": -3.1219208240509033, + "logits/rejected": -3.159938335418701, + "logps/chosen": -246.7979278564453, + "logps/rejected": -236.80239868164062, + "loss": 0.4727, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4427107274532318, + "rewards/margins": 1.177958607673645, + "rewards/rejected": -1.6206691265106201, + "step": 870 + }, + { + "epoch": 0.17, + "learning_rate": 2.846054333764554e-07, + "logits/chosen": -3.003744125366211, + "logits/rejected": -3.043114423751831, + "logps/chosen": -206.8845977783203, + "logps/rejected": -206.0985107421875, + "loss": 0.5488, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19859448075294495, + "rewards/margins": 0.7293460965156555, + "rewards/rejected": -0.9279405474662781, + "step": 880 + }, + { + "epoch": 0.17, + "learning_rate": 2.878395860284605e-07, + "logits/chosen": -2.8752822875976562, + "logits/rejected": -2.9653193950653076, + "logps/chosen": -259.47442626953125, + "logps/rejected": -168.39279174804688, + "loss": 0.5839, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8495208621025085, + "rewards/margins": 1.1083202362060547, + "rewards/rejected": -1.957841157913208, + "step": 890 + }, + { + "epoch": 0.17, + "learning_rate": 2.910737386804657e-07, + "logits/chosen": -2.9832606315612793, + "logits/rejected": -2.9729461669921875, + "logps/chosen": -190.07797241210938, + "logps/rejected": -201.74734497070312, + "loss": 0.456, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.213139533996582, + "rewards/margins": 1.0091941356658936, + "rewards/rejected": -2.2223336696624756, + "step": 900 + }, + { + "epoch": 0.17, + "eval_logits/chosen": -3.0858521461486816, + "eval_logits/rejected": -3.094005823135376, + "eval_logps/chosen": -205.53961181640625, + "eval_logps/rejected": -196.5068817138672, + "eval_loss": 0.5017659068107605, + "eval_rewards/accuracies": 0.7250000238418579, + "eval_rewards/chosen": -1.1301543712615967, + "eval_rewards/margins": 1.328935980796814, + "eval_rewards/rejected": -2.459090232849121, + "eval_runtime": 138.2537, + "eval_samples_per_second": 22.828, + "eval_steps_per_second": 0.362, + "step": 900 + }, + { + "epoch": 0.18, + "learning_rate": 2.943078913324709e-07, + "logits/chosen": -2.930833578109741, + "logits/rejected": -2.976238489151001, + "logps/chosen": -155.70570373535156, + "logps/rejected": -178.11851501464844, + "loss": 0.5571, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5093227624893188, + "rewards/margins": 1.6325773000717163, + "rewards/rejected": -3.141899824142456, + "step": 910 + }, + { + "epoch": 0.18, + "learning_rate": 2.9754204398447606e-07, + "logits/chosen": -3.0615293979644775, + "logits/rejected": -2.998253107070923, + "logps/chosen": -251.2194366455078, + "logps/rejected": -176.75877380371094, + "loss": 0.4612, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29622069001197815, + "rewards/margins": 1.41635262966156, + "rewards/rejected": -1.7125732898712158, + "step": 920 + }, + { + "epoch": 0.18, + "learning_rate": 3.0077619663648125e-07, + "logits/chosen": -3.065725088119507, + "logits/rejected": -3.0299696922302246, + "logps/chosen": -259.5385437011719, + "logps/rejected": -201.85426330566406, + "loss": 0.4777, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1391627788543701, + "rewards/margins": 1.818529486656189, + "rewards/rejected": -2.9576923847198486, + "step": 930 + }, + { + "epoch": 0.18, + "learning_rate": 3.040103492884864e-07, + "logits/chosen": -3.085815191268921, + "logits/rejected": -3.064685583114624, + "logps/chosen": -288.67962646484375, + "logps/rejected": -278.7095642089844, + "loss": 0.4442, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.728425145149231, + "rewards/margins": 0.8075621724128723, + "rewards/rejected": -1.535987138748169, + "step": 940 + }, + { + "epoch": 0.18, + "learning_rate": 3.0724450194049156e-07, + "logits/chosen": -3.160470962524414, + "logits/rejected": -3.116166114807129, + "logps/chosen": -230.3232879638672, + "logps/rejected": -316.96710205078125, + "loss": 0.5392, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8914705514907837, + "rewards/margins": 0.8523221015930176, + "rewards/rejected": -1.7437927722930908, + "step": 950 + }, + { + "epoch": 0.19, + "learning_rate": 3.1047865459249674e-07, + "logits/chosen": -2.935952663421631, + "logits/rejected": -3.020907402038574, + "logps/chosen": -243.97781372070312, + "logps/rejected": -287.1600646972656, + "loss": 0.4915, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.079117774963379, + "rewards/margins": 0.7942657470703125, + "rewards/rejected": -1.8733835220336914, + "step": 960 + }, + { + "epoch": 0.19, + "learning_rate": 3.137128072445019e-07, + "logits/chosen": -3.046394109725952, + "logits/rejected": -3.1229748725891113, + "logps/chosen": -280.108154296875, + "logps/rejected": -250.4733123779297, + "loss": 0.4657, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0350096225738525, + "rewards/margins": 1.2288711071014404, + "rewards/rejected": -2.263880729675293, + "step": 970 + }, + { + "epoch": 0.19, + "learning_rate": 3.169469598965071e-07, + "logits/chosen": -3.0382816791534424, + "logits/rejected": -2.9998793601989746, + "logps/chosen": -237.20614624023438, + "logps/rejected": -326.0390930175781, + "loss": 0.5528, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1391876935958862, + "rewards/margins": 1.0271110534667969, + "rewards/rejected": -2.1662986278533936, + "step": 980 + }, + { + "epoch": 0.19, + "learning_rate": 3.201811125485123e-07, + "logits/chosen": -3.0846056938171387, + "logits/rejected": -3.1346778869628906, + "logps/chosen": -307.43695068359375, + "logps/rejected": -284.30010986328125, + "loss": 0.5158, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.24250833690166473, + "rewards/margins": 1.4640954732894897, + "rewards/rejected": -1.70660400390625, + "step": 990 + }, + { + "epoch": 0.19, + "learning_rate": 3.234152652005174e-07, + "logits/chosen": -2.766576051712036, + "logits/rejected": -2.9066169261932373, + "logps/chosen": -189.30099487304688, + "logps/rejected": -197.08670043945312, + "loss": 0.574, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1831011772155762, + "rewards/margins": 1.3984429836273193, + "rewards/rejected": -2.5815436840057373, + "step": 1000 + }, + { + "epoch": 0.19, + "eval_logits/chosen": -3.072593927383423, + "eval_logits/rejected": -3.078813076019287, + "eval_logps/chosen": -206.5470733642578, + "eval_logps/rejected": -198.46461486816406, + "eval_loss": 0.5005720853805542, + "eval_rewards/accuracies": 0.7200000286102295, + "eval_rewards/chosen": -1.230900764465332, + "eval_rewards/margins": 1.4239643812179565, + "eval_rewards/rejected": -2.654865026473999, + "eval_runtime": 154.2497, + "eval_samples_per_second": 20.46, + "eval_steps_per_second": 0.324, + "step": 1000 + }, + { + "epoch": 0.2, + "learning_rate": 3.2664941785252265e-07, + "logits/chosen": -3.0678982734680176, + "logits/rejected": -3.144016742706299, + "logps/chosen": -210.421142578125, + "logps/rejected": -202.7620391845703, + "loss": 0.4927, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.0432560443878174, + "rewards/margins": 0.8585975766181946, + "rewards/rejected": -1.901853322982788, + "step": 1010 + }, + { + "epoch": 0.2, + "learning_rate": 3.298835705045278e-07, + "logits/chosen": -2.9162240028381348, + "logits/rejected": -2.935126781463623, + "logps/chosen": -222.281982421875, + "logps/rejected": -233.9991455078125, + "loss": 0.4584, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5654414296150208, + "rewards/margins": 1.6391537189483643, + "rewards/rejected": -2.2045950889587402, + "step": 1020 + }, + { + "epoch": 0.2, + "learning_rate": 3.3311772315653297e-07, + "logits/chosen": -3.133784770965576, + "logits/rejected": -3.15869402885437, + "logps/chosen": -320.0884094238281, + "logps/rejected": -216.3438720703125, + "loss": 0.5601, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.407951295375824, + "rewards/margins": 1.659472107887268, + "rewards/rejected": -2.0674235820770264, + "step": 1030 + }, + { + "epoch": 0.2, + "learning_rate": 3.3635187580853815e-07, + "logits/chosen": -3.0063021183013916, + "logits/rejected": -2.9411933422088623, + "logps/chosen": -222.69235229492188, + "logps/rejected": -260.7970275878906, + "loss": 0.4548, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.258476197719574, + "rewards/margins": 1.0244916677474976, + "rewards/rejected": -1.2829679250717163, + "step": 1040 + }, + { + "epoch": 0.2, + "learning_rate": 3.395860284605433e-07, + "logits/chosen": -3.1085867881774902, + "logits/rejected": -3.031299114227295, + "logps/chosen": -192.4764404296875, + "logps/rejected": -182.42041015625, + "loss": 0.5538, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.026580810546875, + "rewards/margins": 0.8679073452949524, + "rewards/rejected": -1.894487977027893, + "step": 1050 + }, + { + "epoch": 0.21, + "learning_rate": 3.428201811125485e-07, + "logits/chosen": -3.115882635116577, + "logits/rejected": -3.1893692016601562, + "logps/chosen": -317.40435791015625, + "logps/rejected": -279.87200927734375, + "loss": 0.5181, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6270907521247864, + "rewards/margins": 0.9389120936393738, + "rewards/rejected": -1.5660028457641602, + "step": 1060 + }, + { + "epoch": 0.21, + "learning_rate": 3.460543337645537e-07, + "logits/chosen": -3.1071622371673584, + "logits/rejected": -3.1591129302978516, + "logps/chosen": -238.4996337890625, + "logps/rejected": -222.28890991210938, + "loss": 0.4633, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7945259809494019, + "rewards/margins": 2.2058539390563965, + "rewards/rejected": -3.000380039215088, + "step": 1070 + }, + { + "epoch": 0.21, + "learning_rate": 3.492884864165588e-07, + "logits/chosen": -2.865248918533325, + "logits/rejected": -2.824042320251465, + "logps/chosen": -296.9127197265625, + "logps/rejected": -319.47479248046875, + "loss": 0.6099, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8548253178596497, + "rewards/margins": 1.1957032680511475, + "rewards/rejected": -2.0505287647247314, + "step": 1080 + }, + { + "epoch": 0.21, + "learning_rate": 3.52522639068564e-07, + "logits/chosen": -3.009572744369507, + "logits/rejected": -3.0641026496887207, + "logps/chosen": -259.3932189941406, + "logps/rejected": -245.3793487548828, + "loss": 0.4895, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5809619426727295, + "rewards/margins": 2.1275665760040283, + "rewards/rejected": -3.708528518676758, + "step": 1090 + }, + { + "epoch": 0.21, + "learning_rate": 3.557567917205692e-07, + "logits/chosen": -3.025712728500366, + "logits/rejected": -2.99841570854187, + "logps/chosen": -237.5346221923828, + "logps/rejected": -261.982421875, + "loss": 0.5162, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5719404220581055, + "rewards/margins": 1.089235544204712, + "rewards/rejected": -2.6611759662628174, + "step": 1100 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -3.022839307785034, + "eval_logits/rejected": -3.031045436859131, + "eval_logps/chosen": -213.15330505371094, + "eval_logps/rejected": -205.95526123046875, + "eval_loss": 0.5014147758483887, + "eval_rewards/accuracies": 0.7124999761581421, + "eval_rewards/chosen": -1.8915215730667114, + "eval_rewards/margins": 1.512406349182129, + "eval_rewards/rejected": -3.403928279876709, + "eval_runtime": 147.5344, + "eval_samples_per_second": 21.392, + "eval_steps_per_second": 0.339, + "step": 1100 + }, + { + "epoch": 0.22, + "learning_rate": 3.589909443725744e-07, + "logits/chosen": -2.9536046981811523, + "logits/rejected": -2.9645209312438965, + "logps/chosen": -183.07235717773438, + "logps/rejected": -205.9368896484375, + "loss": 0.6317, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.184206485748291, + "rewards/margins": 2.3128418922424316, + "rewards/rejected": -4.497048377990723, + "step": 1110 + }, + { + "epoch": 0.22, + "learning_rate": 3.6222509702457956e-07, + "logits/chosen": -3.090672492980957, + "logits/rejected": -3.0842747688293457, + "logps/chosen": -314.849609375, + "logps/rejected": -263.3332824707031, + "loss": 0.4117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4791661500930786, + "rewards/margins": 1.4494885206222534, + "rewards/rejected": -2.928654193878174, + "step": 1120 + }, + { + "epoch": 0.22, + "learning_rate": 3.654592496765847e-07, + "logits/chosen": -2.967991352081299, + "logits/rejected": -2.962629795074463, + "logps/chosen": -202.8747100830078, + "logps/rejected": -180.35372924804688, + "loss": 0.4885, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.424861192703247, + "rewards/margins": 2.009540557861328, + "rewards/rejected": -3.434401750564575, + "step": 1130 + }, + { + "epoch": 0.22, + "learning_rate": 3.686934023285899e-07, + "logits/chosen": -2.9189953804016113, + "logits/rejected": -2.9806225299835205, + "logps/chosen": -251.71377563476562, + "logps/rejected": -224.7860565185547, + "loss": 0.5033, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2403509616851807, + "rewards/margins": 2.269951343536377, + "rewards/rejected": -3.5103023052215576, + "step": 1140 + }, + { + "epoch": 0.22, + "learning_rate": 3.719275549805951e-07, + "logits/chosen": -2.754948377609253, + "logits/rejected": -2.7144274711608887, + "logps/chosen": -208.32113647460938, + "logps/rejected": -235.2569580078125, + "loss": 0.7725, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8944647312164307, + "rewards/margins": 1.3893457651138306, + "rewards/rejected": -3.2838103771209717, + "step": 1150 + }, + { + "epoch": 0.23, + "learning_rate": 3.7516170763260023e-07, + "logits/chosen": -3.0110602378845215, + "logits/rejected": -3.023695945739746, + "logps/chosen": -219.28292846679688, + "logps/rejected": -301.84722900390625, + "loss": 0.5097, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.392371416091919, + "rewards/margins": 1.1131460666656494, + "rewards/rejected": -3.5055174827575684, + "step": 1160 + }, + { + "epoch": 0.23, + "learning_rate": 3.783958602846054e-07, + "logits/chosen": -2.7806992530822754, + "logits/rejected": -2.7770588397979736, + "logps/chosen": -143.1715850830078, + "logps/rejected": -159.3408203125, + "loss": 0.4813, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.19997239112854, + "rewards/margins": 2.889619827270508, + "rewards/rejected": -5.089591979980469, + "step": 1170 + }, + { + "epoch": 0.23, + "learning_rate": 3.8163001293661055e-07, + "logits/chosen": -3.1141510009765625, + "logits/rejected": -3.122486114501953, + "logps/chosen": -220.572998046875, + "logps/rejected": -235.4295654296875, + "loss": 0.4881, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.143183469772339, + "rewards/margins": 1.379930853843689, + "rewards/rejected": -4.5231146812438965, + "step": 1180 + }, + { + "epoch": 0.23, + "learning_rate": 3.848641655886158e-07, + "logits/chosen": -2.950125217437744, + "logits/rejected": -2.998990535736084, + "logps/chosen": -259.8975830078125, + "logps/rejected": -214.5314483642578, + "loss": 0.64, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8857457637786865, + "rewards/margins": 2.058138132095337, + "rewards/rejected": -3.9438838958740234, + "step": 1190 + }, + { + "epoch": 0.23, + "learning_rate": 3.8809831824062096e-07, + "logits/chosen": -3.035386323928833, + "logits/rejected": -3.0723254680633545, + "logps/chosen": -212.2264404296875, + "logps/rejected": -229.48904418945312, + "loss": 0.5772, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.52046537399292, + "rewards/margins": 1.2834186553955078, + "rewards/rejected": -3.8038837909698486, + "step": 1200 + }, + { + "epoch": 0.23, + "eval_logits/chosen": -3.0409305095672607, + "eval_logits/rejected": -3.0463762283325195, + "eval_logps/chosen": -222.2004852294922, + "eval_logps/rejected": -216.12571716308594, + "eval_loss": 0.49300292134284973, + "eval_rewards/accuracies": 0.7149999737739563, + "eval_rewards/chosen": -2.796241521835327, + "eval_rewards/margins": 1.6247340440750122, + "eval_rewards/rejected": -4.420976161956787, + "eval_runtime": 139.6999, + "eval_samples_per_second": 22.591, + "eval_steps_per_second": 0.358, + "step": 1200 + }, + { + "epoch": 0.23, + "learning_rate": 3.913324708926261e-07, + "logits/chosen": -3.11126446723938, + "logits/rejected": -3.046506881713867, + "logps/chosen": -295.7327575683594, + "logps/rejected": -236.0751190185547, + "loss": 0.5102, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2411413192749023, + "rewards/margins": 1.0197899341583252, + "rewards/rejected": -3.2609314918518066, + "step": 1210 + }, + { + "epoch": 0.24, + "learning_rate": 3.945666235446313e-07, + "logits/chosen": -2.947561740875244, + "logits/rejected": -3.013680934906006, + "logps/chosen": -287.5589294433594, + "logps/rejected": -237.54067993164062, + "loss": 0.4763, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.672600507736206, + "rewards/margins": 2.017294406890869, + "rewards/rejected": -3.689894914627075, + "step": 1220 + }, + { + "epoch": 0.24, + "learning_rate": 3.978007761966365e-07, + "logits/chosen": -3.070868730545044, + "logits/rejected": -3.0533483028411865, + "logps/chosen": -291.6087951660156, + "logps/rejected": -220.195068359375, + "loss": 0.4539, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2438287734985352, + "rewards/margins": 1.924587607383728, + "rewards/rejected": -3.1684165000915527, + "step": 1230 + }, + { + "epoch": 0.24, + "learning_rate": 4.0103492884864164e-07, + "logits/chosen": -3.076491117477417, + "logits/rejected": -3.0687084197998047, + "logps/chosen": -254.6431121826172, + "logps/rejected": -230.42984008789062, + "loss": 0.5613, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1087321043014526, + "rewards/margins": 2.1011736392974854, + "rewards/rejected": -3.2099056243896484, + "step": 1240 + }, + { + "epoch": 0.24, + "learning_rate": 4.042690815006468e-07, + "logits/chosen": -2.8690333366394043, + "logits/rejected": -2.970384120941162, + "logps/chosen": -320.2853088378906, + "logps/rejected": -204.7520294189453, + "loss": 0.6193, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2358639240264893, + "rewards/margins": 0.32780444622039795, + "rewards/rejected": -2.5636682510375977, + "step": 1250 + }, + { + "epoch": 0.24, + "learning_rate": 4.0750323415265195e-07, + "logits/chosen": -2.976606845855713, + "logits/rejected": -2.9068028926849365, + "logps/chosen": -249.2131805419922, + "logps/rejected": -293.21380615234375, + "loss": 0.493, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3098329305648804, + "rewards/margins": 0.5287295579910278, + "rewards/rejected": -1.8385623693466187, + "step": 1260 + }, + { + "epoch": 0.25, + "learning_rate": 4.1073738680465714e-07, + "logits/chosen": -3.063732624053955, + "logits/rejected": -3.0288805961608887, + "logps/chosen": -262.306640625, + "logps/rejected": -191.3130340576172, + "loss": 0.5405, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.9771369695663452, + "rewards/margins": 0.6444782018661499, + "rewards/rejected": -2.621614933013916, + "step": 1270 + }, + { + "epoch": 0.25, + "learning_rate": 4.1397153945666237e-07, + "logits/chosen": -3.0463337898254395, + "logits/rejected": -3.0803537368774414, + "logps/chosen": -298.9761047363281, + "logps/rejected": -262.38958740234375, + "loss": 0.5537, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.137474775314331, + "rewards/margins": 1.1659232378005981, + "rewards/rejected": -3.3033981323242188, + "step": 1280 + }, + { + "epoch": 0.25, + "learning_rate": 4.172056921086675e-07, + "logits/chosen": -2.61924409866333, + "logits/rejected": -2.7202091217041016, + "logps/chosen": -310.4773864746094, + "logps/rejected": -193.84881591796875, + "loss": 0.5249, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.0326943397521973, + "rewards/margins": 0.865719199180603, + "rewards/rejected": -2.8984134197235107, + "step": 1290 + }, + { + "epoch": 0.25, + "learning_rate": 4.204398447606727e-07, + "logits/chosen": -3.0022222995758057, + "logits/rejected": -2.9504170417785645, + "logps/chosen": -208.32400512695312, + "logps/rejected": -234.04812622070312, + "loss": 0.5046, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.1894853115081787, + "rewards/margins": 1.4778894186019897, + "rewards/rejected": -3.6673743724823, + "step": 1300 + }, + { + "epoch": 0.25, + "eval_logits/chosen": -2.949617385864258, + "eval_logits/rejected": -2.9546737670898438, + "eval_logps/chosen": -214.5135498046875, + "eval_logps/rejected": -210.51483154296875, + "eval_loss": 0.49645331501960754, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -2.0275466442108154, + "eval_rewards/margins": 1.8323402404785156, + "eval_rewards/rejected": -3.85988712310791, + "eval_runtime": 140.3326, + "eval_samples_per_second": 22.489, + "eval_steps_per_second": 0.356, + "step": 1300 + }, + { + "epoch": 0.25, + "learning_rate": 4.236739974126778e-07, + "logits/chosen": -2.9534194469451904, + "logits/rejected": -3.0005733966827393, + "logps/chosen": -203.74639892578125, + "logps/rejected": -176.9345703125, + "loss": 0.5916, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.8876439929008484, + "rewards/margins": 2.431450366973877, + "rewards/rejected": -3.31909441947937, + "step": 1310 + }, + { + "epoch": 0.26, + "learning_rate": 4.2690815006468305e-07, + "logits/chosen": -3.058112621307373, + "logits/rejected": -3.0043575763702393, + "logps/chosen": -282.26806640625, + "logps/rejected": -223.65750122070312, + "loss": 0.4585, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.8393690586090088, + "rewards/margins": 1.286932110786438, + "rewards/rejected": -3.1263010501861572, + "step": 1320 + }, + { + "epoch": 0.26, + "learning_rate": 4.3014230271668823e-07, + "logits/chosen": -2.8832192420959473, + "logits/rejected": -2.9397988319396973, + "logps/chosen": -245.3288116455078, + "logps/rejected": -270.9195556640625, + "loss": 0.4679, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5440532565116882, + "rewards/margins": 2.73748517036438, + "rewards/rejected": -3.281538486480713, + "step": 1330 + }, + { + "epoch": 0.26, + "learning_rate": 4.3337645536869336e-07, + "logits/chosen": -3.019270658493042, + "logits/rejected": -2.962163209915161, + "logps/chosen": -228.564208984375, + "logps/rejected": -220.4651336669922, + "loss": 0.3911, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.6882941722869873, + "rewards/margins": 0.9210535287857056, + "rewards/rejected": -2.609347343444824, + "step": 1340 + }, + { + "epoch": 0.26, + "learning_rate": 4.3661060802069855e-07, + "logits/chosen": -2.8609261512756348, + "logits/rejected": -2.9113025665283203, + "logps/chosen": -264.30047607421875, + "logps/rejected": -216.177978515625, + "loss": 0.4809, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8004733324050903, + "rewards/margins": 2.5520710945129395, + "rewards/rejected": -3.3525443077087402, + "step": 1350 + }, + { + "epoch": 0.26, + "learning_rate": 4.3984476067270373e-07, + "logits/chosen": -2.969022035598755, + "logits/rejected": -3.0133910179138184, + "logps/chosen": -230.45748901367188, + "logps/rejected": -254.520751953125, + "loss": 0.618, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5217835903167725, + "rewards/margins": 1.3354570865631104, + "rewards/rejected": -2.857240676879883, + "step": 1360 + }, + { + "epoch": 0.27, + "learning_rate": 4.430789133247089e-07, + "logits/chosen": -3.0336718559265137, + "logits/rejected": -3.0175960063934326, + "logps/chosen": -258.44073486328125, + "logps/rejected": -279.9928894042969, + "loss": 0.5308, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4808266758918762, + "rewards/margins": 0.8624935150146484, + "rewards/rejected": -1.3433201313018799, + "step": 1370 + }, + { + "epoch": 0.27, + "learning_rate": 4.463130659767141e-07, + "logits/chosen": -2.9941906929016113, + "logits/rejected": -2.9364662170410156, + "logps/chosen": -265.09515380859375, + "logps/rejected": -274.08929443359375, + "loss": 0.481, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6047403216362, + "rewards/margins": 1.2752878665924072, + "rewards/rejected": -1.8800283670425415, + "step": 1380 + }, + { + "epoch": 0.27, + "learning_rate": 4.495472186287192e-07, + "logits/chosen": -2.9048352241516113, + "logits/rejected": -2.934239387512207, + "logps/chosen": -265.1805114746094, + "logps/rejected": -246.13937377929688, + "loss": 0.6225, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3658626079559326, + "rewards/margins": 0.4025394320487976, + "rewards/rejected": -1.768401861190796, + "step": 1390 + }, + { + "epoch": 0.27, + "learning_rate": 4.527813712807244e-07, + "logits/chosen": -2.8936638832092285, + "logits/rejected": -2.8805880546569824, + "logps/chosen": -163.13661193847656, + "logps/rejected": -157.44528198242188, + "loss": 0.4987, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.523904800415039, + "rewards/margins": 1.9959348440170288, + "rewards/rejected": -5.519840240478516, + "step": 1400 + }, + { + "epoch": 0.27, + "eval_logits/chosen": -2.938807487487793, + "eval_logits/rejected": -2.9467782974243164, + "eval_logps/chosen": -216.0721893310547, + "eval_logps/rejected": -213.0635528564453, + "eval_loss": 0.4858356714248657, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -2.1834118366241455, + "eval_rewards/margins": 1.9313461780548096, + "eval_rewards/rejected": -4.114758014678955, + "eval_runtime": 139.712, + "eval_samples_per_second": 22.589, + "eval_steps_per_second": 0.358, + "step": 1400 + }, + { + "epoch": 0.27, + "learning_rate": 4.5601552393272964e-07, + "logits/chosen": -2.8405141830444336, + "logits/rejected": -2.8250839710235596, + "logps/chosen": -249.561767578125, + "logps/rejected": -227.09109497070312, + "loss": 0.5119, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.8244565725326538, + "rewards/margins": 1.4919013977050781, + "rewards/rejected": -3.3163580894470215, + "step": 1410 + }, + { + "epoch": 0.28, + "learning_rate": 4.5924967658473477e-07, + "logits/chosen": -2.8767993450164795, + "logits/rejected": -2.8210933208465576, + "logps/chosen": -200.073486328125, + "logps/rejected": -180.85694885253906, + "loss": 0.506, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6324551105499268, + "rewards/margins": 0.8869991302490234, + "rewards/rejected": -2.51945424079895, + "step": 1420 + }, + { + "epoch": 0.28, + "learning_rate": 4.6248382923673995e-07, + "logits/chosen": -2.8442671298980713, + "logits/rejected": -2.865152359008789, + "logps/chosen": -187.21571350097656, + "logps/rejected": -225.83041381835938, + "loss": 0.6236, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -1.5626729726791382, + "rewards/margins": 0.7404478192329407, + "rewards/rejected": -2.3031206130981445, + "step": 1430 + }, + { + "epoch": 0.28, + "learning_rate": 4.6571798188874514e-07, + "logits/chosen": -2.970651388168335, + "logits/rejected": -2.9583518505096436, + "logps/chosen": -154.30422973632812, + "logps/rejected": -188.7027587890625, + "loss": 0.5871, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.115335702896118, + "rewards/margins": 1.2574704885482788, + "rewards/rejected": -3.3728058338165283, + "step": 1440 + }, + { + "epoch": 0.28, + "learning_rate": 4.6895213454075027e-07, + "logits/chosen": -2.991255521774292, + "logits/rejected": -3.045728921890259, + "logps/chosen": -324.1858215332031, + "logps/rejected": -251.70266723632812, + "loss": 0.3887, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4335097372531891, + "rewards/margins": 1.7088518142700195, + "rewards/rejected": -2.1423614025115967, + "step": 1450 + }, + { + "epoch": 0.28, + "learning_rate": 4.721862871927555e-07, + "logits/chosen": -2.7233452796936035, + "logits/rejected": -2.8554649353027344, + "logps/chosen": -361.3069152832031, + "logps/rejected": -300.8002014160156, + "loss": 0.5711, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6226775646209717, + "rewards/margins": 2.504540205001831, + "rewards/rejected": -4.127217769622803, + "step": 1460 + }, + { + "epoch": 0.29, + "learning_rate": 4.7542043984476063e-07, + "logits/chosen": -2.905463695526123, + "logits/rejected": -2.9778549671173096, + "logps/chosen": -302.7702941894531, + "logps/rejected": -264.3681945800781, + "loss": 0.6624, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.3320813179016113, + "rewards/margins": 0.3840333819389343, + "rewards/rejected": -3.7161145210266113, + "step": 1470 + }, + { + "epoch": 0.29, + "learning_rate": 4.786545924967659e-07, + "logits/chosen": -2.9825057983398438, + "logits/rejected": -2.9804251194000244, + "logps/chosen": -223.176025390625, + "logps/rejected": -210.6410369873047, + "loss": 0.4368, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.547914981842041, + "rewards/margins": 1.49990975856781, + "rewards/rejected": -4.047824859619141, + "step": 1480 + }, + { + "epoch": 0.29, + "learning_rate": 4.81888745148771e-07, + "logits/chosen": -2.8775105476379395, + "logits/rejected": -2.9274189472198486, + "logps/chosen": -187.3961944580078, + "logps/rejected": -218.3221893310547, + "loss": 0.4395, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8952897787094116, + "rewards/margins": 1.8903576135635376, + "rewards/rejected": -3.78564715385437, + "step": 1490 + }, + { + "epoch": 0.29, + "learning_rate": 4.851228978007762e-07, + "logits/chosen": -2.7787222862243652, + "logits/rejected": -2.817753314971924, + "logps/chosen": -257.91339111328125, + "logps/rejected": -225.732666015625, + "loss": 0.4808, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.018573522567749, + "rewards/margins": 2.3152785301208496, + "rewards/rejected": -3.3338520526885986, + "step": 1500 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -2.825873374938965, + "eval_logits/rejected": -2.8318793773651123, + "eval_logps/chosen": -216.43861389160156, + "eval_logps/rejected": -213.95115661621094, + "eval_loss": 0.4956172704696655, + "eval_rewards/accuracies": 0.7225000262260437, + "eval_rewards/chosen": -2.2200560569763184, + "eval_rewards/margins": 1.983464002609253, + "eval_rewards/rejected": -4.20352029800415, + "eval_runtime": 139.7185, + "eval_samples_per_second": 22.588, + "eval_steps_per_second": 0.358, + "step": 1500 + }, + { + "epoch": 0.29, + "learning_rate": 4.883570504527814e-07, + "logits/chosen": -2.8890321254730225, + "logits/rejected": -2.9338107109069824, + "logps/chosen": -236.35006713867188, + "logps/rejected": -220.2532196044922, + "loss": 0.6088, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2814784049987793, + "rewards/margins": 2.0463881492614746, + "rewards/rejected": -4.327866554260254, + "step": 1510 + }, + { + "epoch": 0.3, + "learning_rate": 4.915912031047865e-07, + "logits/chosen": -2.895902156829834, + "logits/rejected": -2.918703556060791, + "logps/chosen": -220.9754638671875, + "logps/rejected": -253.6642608642578, + "loss": 0.4491, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6117613315582275, + "rewards/margins": 2.7955708503723145, + "rewards/rejected": -4.407332420349121, + "step": 1520 + }, + { + "epoch": 0.3, + "learning_rate": 4.948253557567917e-07, + "logits/chosen": -2.901484727859497, + "logits/rejected": -2.8929615020751953, + "logps/chosen": -219.70401000976562, + "logps/rejected": -210.36886596679688, + "loss": 0.6195, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6066604852676392, + "rewards/margins": 1.9129937887191772, + "rewards/rejected": -3.5196540355682373, + "step": 1530 + }, + { + "epoch": 0.3, + "learning_rate": 4.980595084087969e-07, + "logits/chosen": -2.829221248626709, + "logits/rejected": -2.8153421878814697, + "logps/chosen": -197.49081420898438, + "logps/rejected": -162.01693725585938, + "loss": 0.5914, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.097716808319092, + "rewards/margins": 1.5696308612823486, + "rewards/rejected": -3.6673481464385986, + "step": 1540 + }, + { + "epoch": 0.3, + "learning_rate": 4.998561875314589e-07, + "logits/chosen": -2.9436748027801514, + "logits/rejected": -3.01617169380188, + "logps/chosen": -219.0644989013672, + "logps/rejected": -252.2754669189453, + "loss": 0.4484, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1532790660858154, + "rewards/margins": 3.3547072410583496, + "rewards/rejected": -5.507986545562744, + "step": 1550 + }, + { + "epoch": 0.3, + "learning_rate": 4.994966563601064e-07, + "logits/chosen": -2.804248809814453, + "logits/rejected": -2.8312249183654785, + "logps/chosen": -217.8917999267578, + "logps/rejected": -174.00515747070312, + "loss": 0.4917, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.792263984680176, + "rewards/margins": 2.13944935798645, + "rewards/rejected": -4.931713104248047, + "step": 1560 + }, + { + "epoch": 0.3, + "learning_rate": 4.991371251887539e-07, + "logits/chosen": -2.9192967414855957, + "logits/rejected": -2.874433755874634, + "logps/chosen": -138.56375122070312, + "logps/rejected": -183.21981811523438, + "loss": 0.5351, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.108397960662842, + "rewards/margins": 1.698441505432129, + "rewards/rejected": -4.806839466094971, + "step": 1570 + }, + { + "epoch": 0.31, + "learning_rate": 4.987775940174013e-07, + "logits/chosen": -2.9292962551116943, + "logits/rejected": -2.916050434112549, + "logps/chosen": -225.0854949951172, + "logps/rejected": -209.73532104492188, + "loss": 0.6209, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.45169734954834, + "rewards/margins": 1.9957078695297241, + "rewards/rejected": -4.4474053382873535, + "step": 1580 + }, + { + "epoch": 0.31, + "learning_rate": 4.984180628460487e-07, + "logits/chosen": -2.7843105792999268, + "logits/rejected": -2.7825684547424316, + "logps/chosen": -339.08001708984375, + "logps/rejected": -276.44769287109375, + "loss": 0.7123, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.71315860748291, + "rewards/margins": 1.8002784252166748, + "rewards/rejected": -4.513437271118164, + "step": 1590 + }, + { + "epoch": 0.31, + "learning_rate": 4.980585316746962e-07, + "logits/chosen": -2.683852195739746, + "logits/rejected": -2.6745524406433105, + "logps/chosen": -230.86965942382812, + "logps/rejected": -236.5127410888672, + "loss": 0.5445, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4864879846572876, + "rewards/margins": 1.105987548828125, + "rewards/rejected": -2.592475414276123, + "step": 1600 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.8415186405181885, + "eval_logits/rejected": -2.842726230621338, + "eval_logps/chosen": -221.4375762939453, + "eval_logps/rejected": -218.34251403808594, + "eval_loss": 0.49167120456695557, + "eval_rewards/accuracies": 0.7149999737739563, + "eval_rewards/chosen": -2.7199511528015137, + "eval_rewards/margins": 1.9227066040039062, + "eval_rewards/rejected": -4.64265775680542, + "eval_runtime": 139.2638, + "eval_samples_per_second": 22.662, + "eval_steps_per_second": 0.359, + "step": 1600 + }, + { + "epoch": 0.31, + "learning_rate": 4.976990005033436e-07, + "logits/chosen": -2.9191858768463135, + "logits/rejected": -2.845217227935791, + "logps/chosen": -305.9738464355469, + "logps/rejected": -309.0461730957031, + "loss": 0.55, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.8031907081604004, + "rewards/margins": 2.6041455268859863, + "rewards/rejected": -5.407336235046387, + "step": 1610 + }, + { + "epoch": 0.31, + "learning_rate": 4.973394693319911e-07, + "logits/chosen": -3.0033366680145264, + "logits/rejected": -2.983060359954834, + "logps/chosen": -212.1135711669922, + "logps/rejected": -198.12948608398438, + "loss": 0.6844, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3254258632659912, + "rewards/margins": 1.4193804264068604, + "rewards/rejected": -2.7448062896728516, + "step": 1620 + }, + { + "epoch": 0.32, + "learning_rate": 4.969799381606385e-07, + "logits/chosen": -2.794769287109375, + "logits/rejected": -2.6991593837738037, + "logps/chosen": -214.34506225585938, + "logps/rejected": -204.58248901367188, + "loss": 0.4522, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0607855319976807, + "rewards/margins": 0.582096517086029, + "rewards/rejected": -2.6428821086883545, + "step": 1630 + }, + { + "epoch": 0.32, + "learning_rate": 4.966204069892859e-07, + "logits/chosen": -2.9268391132354736, + "logits/rejected": -2.9547667503356934, + "logps/chosen": -297.62213134765625, + "logps/rejected": -330.30694580078125, + "loss": 0.5049, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.506831645965576, + "rewards/margins": 3.2207348346710205, + "rewards/rejected": -5.727566719055176, + "step": 1640 + }, + { + "epoch": 0.32, + "learning_rate": 4.962608758179334e-07, + "logits/chosen": -2.937668800354004, + "logits/rejected": -2.9284844398498535, + "logps/chosen": -274.85357666015625, + "logps/rejected": -247.8715057373047, + "loss": 0.5662, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9010640978813171, + "rewards/margins": 1.7168909311294556, + "rewards/rejected": -2.617955446243286, + "step": 1650 + }, + { + "epoch": 0.32, + "learning_rate": 4.959013446465808e-07, + "logits/chosen": -2.8963003158569336, + "logits/rejected": -2.8931543827056885, + "logps/chosen": -225.38064575195312, + "logps/rejected": -260.68023681640625, + "loss": 0.776, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7325546741485596, + "rewards/margins": 1.7724930047988892, + "rewards/rejected": -3.505047559738159, + "step": 1660 + }, + { + "epoch": 0.32, + "learning_rate": 4.955418134752283e-07, + "logits/chosen": -2.7276527881622314, + "logits/rejected": -2.7366204261779785, + "logps/chosen": -215.1775360107422, + "logps/rejected": -203.43844604492188, + "loss": 0.5383, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.350088357925415, + "rewards/margins": 2.6437973976135254, + "rewards/rejected": -3.9938862323760986, + "step": 1670 + }, + { + "epoch": 0.33, + "learning_rate": 4.951822823038758e-07, + "logits/chosen": -2.9176158905029297, + "logits/rejected": -2.9260013103485107, + "logps/chosen": -251.2834930419922, + "logps/rejected": -241.283203125, + "loss": 0.5999, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1413800716400146, + "rewards/margins": 0.9033109545707703, + "rewards/rejected": -3.044691324234009, + "step": 1680 + }, + { + "epoch": 0.33, + "learning_rate": 4.948227511325231e-07, + "logits/chosen": -2.9102745056152344, + "logits/rejected": -2.992983341217041, + "logps/chosen": -263.1497802734375, + "logps/rejected": -233.3609619140625, + "loss": 0.7618, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4968585968017578, + "rewards/margins": 1.3661136627197266, + "rewards/rejected": -2.8629722595214844, + "step": 1690 + }, + { + "epoch": 0.33, + "learning_rate": 4.944632199611706e-07, + "logits/chosen": -2.932598829269409, + "logits/rejected": -2.862515687942505, + "logps/chosen": -226.80776977539062, + "logps/rejected": -212.4121856689453, + "loss": 0.5903, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6572970151901245, + "rewards/margins": 1.0310018062591553, + "rewards/rejected": -1.6882988214492798, + "step": 1700 + }, + { + "epoch": 0.33, + "eval_logits/chosen": -2.920361042022705, + "eval_logits/rejected": -2.9215283393859863, + "eval_logps/chosen": -218.91456604003906, + "eval_logps/rejected": -217.71157836914062, + "eval_loss": 0.5077618956565857, + "eval_rewards/accuracies": 0.6850000023841858, + "eval_rewards/chosen": -2.4676513671875, + "eval_rewards/margins": 2.111912965774536, + "eval_rewards/rejected": -4.579564571380615, + "eval_runtime": 140.9157, + "eval_samples_per_second": 22.396, + "eval_steps_per_second": 0.355, + "step": 1700 + }, + { + "epoch": 0.33, + "learning_rate": 4.941036887898181e-07, + "logits/chosen": -2.853813648223877, + "logits/rejected": -2.8657195568084717, + "logps/chosen": -181.84080505371094, + "logps/rejected": -254.567626953125, + "loss": 0.5093, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.878755807876587, + "rewards/margins": 3.014904499053955, + "rewards/rejected": -4.893660545349121, + "step": 1710 + }, + { + "epoch": 0.33, + "learning_rate": 4.937441576184655e-07, + "logits/chosen": -3.083876132965088, + "logits/rejected": -3.0712332725524902, + "logps/chosen": -339.6617126464844, + "logps/rejected": -338.98876953125, + "loss": 0.6498, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8318314552307129, + "rewards/margins": 1.4427533149719238, + "rewards/rejected": -2.274585008621216, + "step": 1720 + }, + { + "epoch": 0.34, + "learning_rate": 4.933846264471129e-07, + "logits/chosen": -2.831230640411377, + "logits/rejected": -2.7701363563537598, + "logps/chosen": -202.59078979492188, + "logps/rejected": -283.06396484375, + "loss": 0.5234, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.7622671127319336, + "rewards/margins": 0.09787784516811371, + "rewards/rejected": -1.8601449728012085, + "step": 1730 + }, + { + "epoch": 0.34, + "learning_rate": 4.930250952757603e-07, + "logits/chosen": -2.901546001434326, + "logits/rejected": -2.9206769466400146, + "logps/chosen": -212.34158325195312, + "logps/rejected": -215.5955352783203, + "loss": 0.5993, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1232070922851562, + "rewards/margins": 2.2180099487304688, + "rewards/rejected": -4.341217517852783, + "step": 1740 + }, + { + "epoch": 0.34, + "learning_rate": 4.926655641044078e-07, + "logits/chosen": -2.845414400100708, + "logits/rejected": -2.9428486824035645, + "logps/chosen": -191.5043487548828, + "logps/rejected": -227.24710083007812, + "loss": 0.6429, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3682129383087158, + "rewards/margins": 1.3480457067489624, + "rewards/rejected": -2.7162585258483887, + "step": 1750 + }, + { + "epoch": 0.34, + "learning_rate": 4.923060329330553e-07, + "logits/chosen": -2.5704612731933594, + "logits/rejected": -2.6995291709899902, + "logps/chosen": -338.72064208984375, + "logps/rejected": -299.72930908203125, + "loss": 0.496, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.260344982147217, + "rewards/margins": 1.2866657972335815, + "rewards/rejected": -3.547010898590088, + "step": 1760 + }, + { + "epoch": 0.34, + "learning_rate": 4.919465017617028e-07, + "logits/chosen": -2.8508810997009277, + "logits/rejected": -2.7702724933624268, + "logps/chosen": -280.30621337890625, + "logps/rejected": -322.1275329589844, + "loss": 0.4883, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7750566005706787, + "rewards/margins": 1.663395643234253, + "rewards/rejected": -3.4384522438049316, + "step": 1770 + }, + { + "epoch": 0.35, + "learning_rate": 4.915869705903501e-07, + "logits/chosen": -2.8308699131011963, + "logits/rejected": -2.8825507164001465, + "logps/chosen": -170.27957153320312, + "logps/rejected": -243.4982147216797, + "loss": 0.5161, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5050110816955566, + "rewards/margins": 3.1635780334472656, + "rewards/rejected": -6.668588161468506, + "step": 1780 + }, + { + "epoch": 0.35, + "learning_rate": 4.912274394189976e-07, + "logits/chosen": -2.798985719680786, + "logits/rejected": -2.7850592136383057, + "logps/chosen": -193.65354919433594, + "logps/rejected": -254.942626953125, + "loss": 0.4444, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9649609327316284, + "rewards/margins": 1.072882056236267, + "rewards/rejected": -2.0378429889678955, + "step": 1790 + }, + { + "epoch": 0.35, + "learning_rate": 4.908679082476451e-07, + "logits/chosen": -2.748511552810669, + "logits/rejected": -2.6923654079437256, + "logps/chosen": -224.71987915039062, + "logps/rejected": -220.9110565185547, + "loss": 0.4285, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0711779594421387, + "rewards/margins": 2.0031611919403076, + "rewards/rejected": -4.074339389801025, + "step": 1800 + }, + { + "epoch": 0.35, + "eval_logits/chosen": -2.7308030128479004, + "eval_logits/rejected": -2.7299492359161377, + "eval_logps/chosen": -222.18240356445312, + "eval_logps/rejected": -223.67172241210938, + "eval_loss": 0.4976809620857239, + "eval_rewards/accuracies": 0.6825000047683716, + "eval_rewards/chosen": -2.794431209564209, + "eval_rewards/margins": 2.381145715713501, + "eval_rewards/rejected": -5.175577163696289, + "eval_runtime": 140.2956, + "eval_samples_per_second": 22.495, + "eval_steps_per_second": 0.356, + "step": 1800 + }, + { + "epoch": 0.35, + "learning_rate": 4.905083770762925e-07, + "logits/chosen": -2.883899450302124, + "logits/rejected": -2.8011999130249023, + "logps/chosen": -220.16506958007812, + "logps/rejected": -258.29681396484375, + "loss": 0.5381, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.469589948654175, + "rewards/margins": 1.1768501996994019, + "rewards/rejected": -3.646440029144287, + "step": 1810 + }, + { + "epoch": 0.35, + "learning_rate": 4.9014884590494e-07, + "logits/chosen": -2.8269143104553223, + "logits/rejected": -2.83699107170105, + "logps/chosen": -187.8958282470703, + "logps/rejected": -156.596923828125, + "loss": 0.5313, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9329996109008789, + "rewards/margins": 3.001521110534668, + "rewards/rejected": -3.934520721435547, + "step": 1820 + }, + { + "epoch": 0.36, + "learning_rate": 4.897893147335873e-07, + "logits/chosen": -2.6217234134674072, + "logits/rejected": -2.7124900817871094, + "logps/chosen": -193.1573486328125, + "logps/rejected": -256.3013916015625, + "loss": 0.4575, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.2268829345703125, + "rewards/margins": 3.0330677032470703, + "rewards/rejected": -7.259950160980225, + "step": 1830 + }, + { + "epoch": 0.36, + "learning_rate": 4.894297835622348e-07, + "logits/chosen": -2.751868724822998, + "logits/rejected": -2.791961908340454, + "logps/chosen": -191.41848754882812, + "logps/rejected": -178.33953857421875, + "loss": 0.6224, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.943002223968506, + "rewards/margins": 2.345608949661255, + "rewards/rejected": -5.288610935211182, + "step": 1840 + }, + { + "epoch": 0.36, + "learning_rate": 4.890702523908823e-07, + "logits/chosen": -2.8733506202697754, + "logits/rejected": -2.8486733436584473, + "logps/chosen": -300.8080139160156, + "logps/rejected": -216.562744140625, + "loss": 0.38, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.5406880378723145, + "rewards/margins": 3.0873942375183105, + "rewards/rejected": -5.628082275390625, + "step": 1850 + }, + { + "epoch": 0.36, + "learning_rate": 4.887107212195298e-07, + "logits/chosen": -2.660714864730835, + "logits/rejected": -2.7268805503845215, + "logps/chosen": -192.49978637695312, + "logps/rejected": -242.5849151611328, + "loss": 0.5311, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8447067737579346, + "rewards/margins": 1.0581676959991455, + "rewards/rejected": -2.90287446975708, + "step": 1860 + }, + { + "epoch": 0.36, + "learning_rate": 4.883511900481771e-07, + "logits/chosen": -2.889845132827759, + "logits/rejected": -2.840237855911255, + "logps/chosen": -275.62872314453125, + "logps/rejected": -264.7300109863281, + "loss": 0.413, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.390221357345581, + "rewards/margins": 1.194690465927124, + "rewards/rejected": -2.584911823272705, + "step": 1870 + }, + { + "epoch": 0.36, + "learning_rate": 4.879916588768246e-07, + "logits/chosen": -2.8796274662017822, + "logits/rejected": -2.825623035430908, + "logps/chosen": -430.91375732421875, + "logps/rejected": -309.5884094238281, + "loss": 0.5651, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.307588577270508, + "rewards/margins": 1.254784345626831, + "rewards/rejected": -3.5623726844787598, + "step": 1880 + }, + { + "epoch": 0.37, + "learning_rate": 4.87632127705472e-07, + "logits/chosen": -2.8152003288269043, + "logits/rejected": -2.853797435760498, + "logps/chosen": -211.9172821044922, + "logps/rejected": -221.3629913330078, + "loss": 0.4878, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.062591552734375, + "rewards/margins": 1.4559752941131592, + "rewards/rejected": -5.518566131591797, + "step": 1890 + }, + { + "epoch": 0.37, + "learning_rate": 4.872725965341195e-07, + "logits/chosen": -2.8143889904022217, + "logits/rejected": -2.8226265907287598, + "logps/chosen": -171.64285278320312, + "logps/rejected": -147.26547241210938, + "loss": 0.5443, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.339956760406494, + "rewards/margins": 2.346651554107666, + "rewards/rejected": -5.68660831451416, + "step": 1900 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -2.8397412300109863, + "eval_logits/rejected": -2.8391599655151367, + "eval_logps/chosen": -225.46945190429688, + "eval_logps/rejected": -227.2286376953125, + "eval_loss": 0.48742884397506714, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": -3.1231377124786377, + "eval_rewards/margins": 2.408130168914795, + "eval_rewards/rejected": -5.5312676429748535, + "eval_runtime": 153.399, + "eval_samples_per_second": 20.574, + "eval_steps_per_second": 0.326, + "step": 1900 + }, + { + "epoch": 0.37, + "learning_rate": 4.86913065362767e-07, + "logits/chosen": -2.885929822921753, + "logits/rejected": -2.8270092010498047, + "logps/chosen": -309.38775634765625, + "logps/rejected": -254.2549591064453, + "loss": 0.6235, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.1622366905212402, + "rewards/margins": 0.6124019622802734, + "rewards/rejected": -2.7746386528015137, + "step": 1910 + }, + { + "epoch": 0.37, + "learning_rate": 4.865535341914143e-07, + "logits/chosen": -2.769245147705078, + "logits/rejected": -2.821042537689209, + "logps/chosen": -438.802001953125, + "logps/rejected": -267.00140380859375, + "loss": 0.4921, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7255628108978271, + "rewards/margins": 3.751121997833252, + "rewards/rejected": -5.4766845703125, + "step": 1920 + }, + { + "epoch": 0.37, + "learning_rate": 4.861940030200618e-07, + "logits/chosen": -2.9151031970977783, + "logits/rejected": -2.804572105407715, + "logps/chosen": -251.7982635498047, + "logps/rejected": -223.19631958007812, + "loss": 0.4968, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3003602027893066, + "rewards/margins": 2.039172410964966, + "rewards/rejected": -3.3395328521728516, + "step": 1930 + }, + { + "epoch": 0.38, + "learning_rate": 4.858344718487092e-07, + "logits/chosen": -2.8644089698791504, + "logits/rejected": -2.88712739944458, + "logps/chosen": -262.6106262207031, + "logps/rejected": -238.1800994873047, + "loss": 0.53, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4858415126800537, + "rewards/margins": 2.418762445449829, + "rewards/rejected": -3.90460467338562, + "step": 1940 + }, + { + "epoch": 0.38, + "learning_rate": 4.854749406773567e-07, + "logits/chosen": -2.8518614768981934, + "logits/rejected": -2.8204755783081055, + "logps/chosen": -239.5098419189453, + "logps/rejected": -214.1254119873047, + "loss": 0.4628, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1469717025756836, + "rewards/margins": 1.0126450061798096, + "rewards/rejected": -2.1596169471740723, + "step": 1950 + }, + { + "epoch": 0.38, + "learning_rate": 4.851154095060042e-07, + "logits/chosen": -2.7540643215179443, + "logits/rejected": -2.7091991901397705, + "logps/chosen": -176.52734375, + "logps/rejected": -208.6387176513672, + "loss": 0.4863, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2932955026626587, + "rewards/margins": 1.7724775075912476, + "rewards/rejected": -3.0657732486724854, + "step": 1960 + }, + { + "epoch": 0.38, + "learning_rate": 4.847558783346516e-07, + "logits/chosen": -2.6126275062561035, + "logits/rejected": -2.620729446411133, + "logps/chosen": -273.9920654296875, + "logps/rejected": -332.6848449707031, + "loss": 0.5044, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4300267696380615, + "rewards/margins": 0.959136962890625, + "rewards/rejected": -3.3891634941101074, + "step": 1970 + }, + { + "epoch": 0.38, + "learning_rate": 4.84396347163299e-07, + "logits/chosen": -2.8108973503112793, + "logits/rejected": -2.713869333267212, + "logps/chosen": -255.5907440185547, + "logps/rejected": -264.1590881347656, + "loss": 0.55, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9435021877288818, + "rewards/margins": 1.8012924194335938, + "rewards/rejected": -3.7447943687438965, + "step": 1980 + }, + { + "epoch": 0.39, + "learning_rate": 4.840368159919465e-07, + "logits/chosen": -2.767923355102539, + "logits/rejected": -2.7916712760925293, + "logps/chosen": -229.8169403076172, + "logps/rejected": -198.69619750976562, + "loss": 0.4926, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.690882921218872, + "rewards/margins": 1.0796245336532593, + "rewards/rejected": -3.7705070972442627, + "step": 1990 + }, + { + "epoch": 0.39, + "learning_rate": 4.83677284820594e-07, + "logits/chosen": -2.8908398151397705, + "logits/rejected": -2.837955951690674, + "logps/chosen": -252.1120147705078, + "logps/rejected": -246.74606323242188, + "loss": 0.4776, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1896395683288574, + "rewards/margins": 1.070448398590088, + "rewards/rejected": -3.2600879669189453, + "step": 2000 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -2.667562484741211, + "eval_logits/rejected": -2.6599512100219727, + "eval_logps/chosen": -228.1953125, + "eval_logps/rejected": -230.640625, + "eval_loss": 0.4850805401802063, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -3.395721435546875, + "eval_rewards/margins": 2.476745843887329, + "eval_rewards/rejected": -5.872467517852783, + "eval_runtime": 140.6915, + "eval_samples_per_second": 22.432, + "eval_steps_per_second": 0.355, + "step": 2000 + }, + { + "epoch": 0.39, + "learning_rate": 4.833177536492413e-07, + "logits/chosen": -2.7589898109436035, + "logits/rejected": -2.8150837421417236, + "logps/chosen": -219.28005981445312, + "logps/rejected": -301.12664794921875, + "loss": 0.5235, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9610111713409424, + "rewards/margins": 2.9139275550842285, + "rewards/rejected": -5.87493896484375, + "step": 2010 + }, + { + "epoch": 0.39, + "learning_rate": 4.829582224778888e-07, + "logits/chosen": -2.8580589294433594, + "logits/rejected": -2.7701237201690674, + "logps/chosen": -185.91903686523438, + "logps/rejected": -203.38905334472656, + "loss": 0.459, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -5.219147682189941, + "rewards/margins": 1.6592937707901, + "rewards/rejected": -6.878440856933594, + "step": 2020 + }, + { + "epoch": 0.39, + "learning_rate": 4.825986913065362e-07, + "logits/chosen": -2.6564595699310303, + "logits/rejected": -2.6738805770874023, + "logps/chosen": -240.1299591064453, + "logps/rejected": -339.5009765625, + "loss": 0.4661, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4794435501098633, + "rewards/margins": 2.6146645545959473, + "rewards/rejected": -5.094107627868652, + "step": 2030 + }, + { + "epoch": 0.4, + "learning_rate": 4.822391601351837e-07, + "logits/chosen": -2.725947141647339, + "logits/rejected": -2.588447093963623, + "logps/chosen": -211.72341918945312, + "logps/rejected": -216.319580078125, + "loss": 0.4058, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6720824241638184, + "rewards/margins": 2.822852611541748, + "rewards/rejected": -5.494935035705566, + "step": 2040 + }, + { + "epoch": 0.4, + "learning_rate": 4.818796289638312e-07, + "logits/chosen": -2.711277961730957, + "logits/rejected": -2.6957826614379883, + "logps/chosen": -162.187744140625, + "logps/rejected": -182.9951171875, + "loss": 0.5772, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -5.238260746002197, + "rewards/margins": 1.753045678138733, + "rewards/rejected": -6.991306304931641, + "step": 2050 + }, + { + "epoch": 0.4, + "learning_rate": 4.815200977924786e-07, + "logits/chosen": -2.830294609069824, + "logits/rejected": -2.7810616493225098, + "logps/chosen": -225.2228546142578, + "logps/rejected": -205.3040313720703, + "loss": 0.5436, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.932889461517334, + "rewards/margins": 1.7703787088394165, + "rewards/rejected": -5.703269004821777, + "step": 2060 + }, + { + "epoch": 0.4, + "learning_rate": 4.81160566621126e-07, + "logits/chosen": -2.7850966453552246, + "logits/rejected": -2.7620739936828613, + "logps/chosen": -315.70684814453125, + "logps/rejected": -245.2522430419922, + "loss": 0.5516, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.3822126388549805, + "rewards/margins": 0.6999850273132324, + "rewards/rejected": -5.082197666168213, + "step": 2070 + }, + { + "epoch": 0.4, + "learning_rate": 4.808010354497735e-07, + "logits/chosen": -2.870511770248413, + "logits/rejected": -2.805070161819458, + "logps/chosen": -252.13681030273438, + "logps/rejected": -197.99697875976562, + "loss": 0.5594, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.5869674682617188, + "rewards/margins": 2.221545457839966, + "rewards/rejected": -5.808512210845947, + "step": 2080 + }, + { + "epoch": 0.41, + "learning_rate": 4.804415042784209e-07, + "logits/chosen": -2.6756701469421387, + "logits/rejected": -2.7388083934783936, + "logps/chosen": -336.222900390625, + "logps/rejected": -217.15963745117188, + "loss": 0.4252, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.3528850078582764, + "rewards/margins": 1.6979725360870361, + "rewards/rejected": -4.050858497619629, + "step": 2090 + }, + { + "epoch": 0.41, + "learning_rate": 4.800819731070684e-07, + "logits/chosen": -2.8770880699157715, + "logits/rejected": -2.9065544605255127, + "logps/chosen": -259.0509033203125, + "logps/rejected": -251.63516235351562, + "loss": 0.5387, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.9471192359924316, + "rewards/margins": 2.0885226726531982, + "rewards/rejected": -5.035641670227051, + "step": 2100 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.8139474391937256, + "eval_logits/rejected": -2.803814172744751, + "eval_logps/chosen": -232.3311004638672, + "eval_logps/rejected": -233.79930114746094, + "eval_loss": 0.5210939645767212, + "eval_rewards/accuracies": 0.7200000286102295, + "eval_rewards/chosen": -3.8093035221099854, + "eval_rewards/margins": 2.3790290355682373, + "eval_rewards/rejected": -6.1883320808410645, + "eval_runtime": 140.4932, + "eval_samples_per_second": 22.464, + "eval_steps_per_second": 0.356, + "step": 2100 + }, + { + "epoch": 0.41, + "learning_rate": 4.797224419357158e-07, + "logits/chosen": -2.919070243835449, + "logits/rejected": -2.864745616912842, + "logps/chosen": -266.123291015625, + "logps/rejected": -245.8166046142578, + "loss": 0.5069, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7676305770874023, + "rewards/margins": 2.387216567993164, + "rewards/rejected": -5.154847145080566, + "step": 2110 + }, + { + "epoch": 0.41, + "learning_rate": 4.793629107643632e-07, + "logits/chosen": -2.8027493953704834, + "logits/rejected": -2.8309144973754883, + "logps/chosen": -130.6552734375, + "logps/rejected": -140.10147094726562, + "loss": 0.4968, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.0654122829437256, + "rewards/margins": 1.6262986660003662, + "rewards/rejected": -4.69171142578125, + "step": 2120 + }, + { + "epoch": 0.41, + "learning_rate": 4.790033795930107e-07, + "logits/chosen": -2.765474319458008, + "logits/rejected": -2.8888039588928223, + "logps/chosen": -235.1878204345703, + "logps/rejected": -321.06414794921875, + "loss": 0.4361, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.94917893409729, + "rewards/margins": 3.3031139373779297, + "rewards/rejected": -6.252293109893799, + "step": 2130 + }, + { + "epoch": 0.42, + "learning_rate": 4.786438484216581e-07, + "logits/chosen": -2.7906861305236816, + "logits/rejected": -2.766310930252075, + "logps/chosen": -255.572509765625, + "logps/rejected": -258.65008544921875, + "loss": 0.7067, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1167337894439697, + "rewards/margins": 1.4146376848220825, + "rewards/rejected": -3.5313713550567627, + "step": 2140 + }, + { + "epoch": 0.42, + "learning_rate": 4.782843172503055e-07, + "logits/chosen": -2.9073071479797363, + "logits/rejected": -2.876718521118164, + "logps/chosen": -236.02719116210938, + "logps/rejected": -187.4551544189453, + "loss": 0.6016, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -3.2433650493621826, + "rewards/margins": 0.7124207615852356, + "rewards/rejected": -3.9557862281799316, + "step": 2150 + }, + { + "epoch": 0.42, + "learning_rate": 4.77924786078953e-07, + "logits/chosen": -2.8217930793762207, + "logits/rejected": -2.8676421642303467, + "logps/chosen": -148.67970275878906, + "logps/rejected": -188.03147888183594, + "loss": 0.5687, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.420441150665283, + "rewards/margins": 2.5906240940093994, + "rewards/rejected": -5.0110650062561035, + "step": 2160 + }, + { + "epoch": 0.42, + "learning_rate": 4.775652549076005e-07, + "logits/chosen": -2.799152374267578, + "logits/rejected": -2.774202585220337, + "logps/chosen": -283.7826232910156, + "logps/rejected": -225.2607879638672, + "loss": 0.6151, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1398122310638428, + "rewards/margins": 0.8396533131599426, + "rewards/rejected": -2.9794657230377197, + "step": 2170 + }, + { + "epoch": 0.42, + "learning_rate": 4.772057237362479e-07, + "logits/chosen": -2.9737019538879395, + "logits/rejected": -3.0729072093963623, + "logps/chosen": -221.1050262451172, + "logps/rejected": -283.1679992675781, + "loss": 0.5477, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.210036039352417, + "rewards/margins": 2.726353883743286, + "rewards/rejected": -5.936389923095703, + "step": 2180 + }, + { + "epoch": 0.43, + "learning_rate": 4.768461925648954e-07, + "logits/chosen": -2.972775936126709, + "logits/rejected": -2.9946017265319824, + "logps/chosen": -260.2768859863281, + "logps/rejected": -259.5898742675781, + "loss": 0.4473, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1959757804870605, + "rewards/margins": 2.791408061981201, + "rewards/rejected": -4.987383842468262, + "step": 2190 + }, + { + "epoch": 0.43, + "learning_rate": 4.7648666139354285e-07, + "logits/chosen": -2.7998204231262207, + "logits/rejected": -2.903319835662842, + "logps/chosen": -275.22467041015625, + "logps/rejected": -235.7041473388672, + "loss": 0.5673, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.17318058013916, + "rewards/margins": 2.1841416358947754, + "rewards/rejected": -4.357321739196777, + "step": 2200 + }, + { + "epoch": 0.43, + "eval_logits/chosen": -2.813796281814575, + "eval_logits/rejected": -2.803727149963379, + "eval_logps/chosen": -230.1213836669922, + "eval_logps/rejected": -231.0912322998047, + "eval_loss": 0.502297580242157, + "eval_rewards/accuracies": 0.7149999737739563, + "eval_rewards/chosen": -3.5883309841156006, + "eval_rewards/margins": 2.3291962146759033, + "eval_rewards/rejected": -5.91752815246582, + "eval_runtime": 140.7946, + "eval_samples_per_second": 22.416, + "eval_steps_per_second": 0.355, + "step": 2200 + }, + { + "epoch": 0.43, + "learning_rate": 4.761271302221903e-07, + "logits/chosen": -2.8988993167877197, + "logits/rejected": -2.9329185485839844, + "logps/chosen": -250.9599609375, + "logps/rejected": -239.0100860595703, + "loss": 0.4661, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.50346040725708, + "rewards/margins": 1.4525728225708008, + "rewards/rejected": -3.9560329914093018, + "step": 2210 + }, + { + "epoch": 0.43, + "learning_rate": 4.757675990508377e-07, + "logits/chosen": -2.6291983127593994, + "logits/rejected": -2.742318630218506, + "logps/chosen": -326.2305603027344, + "logps/rejected": -226.09603881835938, + "loss": 0.676, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.527320861816406, + "rewards/margins": 0.19092464447021484, + "rewards/rejected": -4.718245506286621, + "step": 2220 + }, + { + "epoch": 0.43, + "learning_rate": 4.7540806787948513e-07, + "logits/chosen": -3.0049118995666504, + "logits/rejected": -2.921246290206909, + "logps/chosen": -259.56256103515625, + "logps/rejected": -237.5496826171875, + "loss": 0.6143, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.86800479888916, + "rewards/margins": 1.2542797327041626, + "rewards/rejected": -6.12228536605835, + "step": 2230 + }, + { + "epoch": 0.43, + "learning_rate": 4.7504853670813256e-07, + "logits/chosen": -2.847571611404419, + "logits/rejected": -2.8926985263824463, + "logps/chosen": -237.22921752929688, + "logps/rejected": -195.4996337890625, + "loss": 0.6923, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -5.171439170837402, + "rewards/margins": 1.057605504989624, + "rewards/rejected": -6.2290449142456055, + "step": 2240 + }, + { + "epoch": 0.44, + "learning_rate": 4.7468900553678004e-07, + "logits/chosen": -2.865610122680664, + "logits/rejected": -2.872493267059326, + "logps/chosen": -233.55728149414062, + "logps/rejected": -200.44448852539062, + "loss": 0.5375, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.8468353748321533, + "rewards/margins": 1.921197533607483, + "rewards/rejected": -4.768033027648926, + "step": 2250 + }, + { + "epoch": 0.44, + "learning_rate": 4.7432947436542747e-07, + "logits/chosen": -2.8956828117370605, + "logits/rejected": -2.912025213241577, + "logps/chosen": -274.8690185546875, + "logps/rejected": -275.8160095214844, + "loss": 0.4763, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6738688945770264, + "rewards/margins": 3.5564217567443848, + "rewards/rejected": -6.230290412902832, + "step": 2260 + }, + { + "epoch": 0.44, + "learning_rate": 4.739699431940749e-07, + "logits/chosen": -2.9155755043029785, + "logits/rejected": -2.9084272384643555, + "logps/chosen": -305.519287109375, + "logps/rejected": -258.97882080078125, + "loss": 0.5134, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.920853853225708, + "rewards/margins": 3.540989637374878, + "rewards/rejected": -5.461844444274902, + "step": 2270 + }, + { + "epoch": 0.44, + "learning_rate": 4.736104120227223e-07, + "logits/chosen": -2.7148208618164062, + "logits/rejected": -2.7656939029693604, + "logps/chosen": -334.14935302734375, + "logps/rejected": -321.5052490234375, + "loss": 0.6024, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.147124767303467, + "rewards/margins": 0.8551236987113953, + "rewards/rejected": -5.0022478103637695, + "step": 2280 + }, + { + "epoch": 0.44, + "learning_rate": 4.7325088085136975e-07, + "logits/chosen": -2.9230995178222656, + "logits/rejected": -2.8051400184631348, + "logps/chosen": -346.5646667480469, + "logps/rejected": -439.0077209472656, + "loss": 0.587, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.265329360961914, + "rewards/margins": 2.3481392860412598, + "rewards/rejected": -6.613468170166016, + "step": 2290 + }, + { + "epoch": 0.45, + "learning_rate": 4.728913496800173e-07, + "logits/chosen": -2.9406020641326904, + "logits/rejected": -2.9966301918029785, + "logps/chosen": -299.4635009765625, + "logps/rejected": -303.520263671875, + "loss": 0.5005, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.865657091140747, + "rewards/margins": 1.8061805963516235, + "rewards/rejected": -5.671838283538818, + "step": 2300 + }, + { + "epoch": 0.45, + "eval_logits/chosen": -2.8331551551818848, + "eval_logits/rejected": -2.829441785812378, + "eval_logps/chosen": -235.6737060546875, + "eval_logps/rejected": -235.0966339111328, + "eval_loss": 0.4871974587440491, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -4.143564224243164, + "eval_rewards/margins": 2.1745026111602783, + "eval_rewards/rejected": -6.31806755065918, + "eval_runtime": 138.6189, + "eval_samples_per_second": 22.767, + "eval_steps_per_second": 0.361, + "step": 2300 + }, + { + "epoch": 0.45, + "learning_rate": 4.725318185086647e-07, + "logits/chosen": -2.6729812622070312, + "logits/rejected": -2.745810031890869, + "logps/chosen": -246.0074005126953, + "logps/rejected": -261.0729675292969, + "loss": 0.4538, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.4698410034179688, + "rewards/margins": 3.8048255443573, + "rewards/rejected": -7.274666786193848, + "step": 2310 + }, + { + "epoch": 0.45, + "learning_rate": 4.7217228733731214e-07, + "logits/chosen": -2.6849420070648193, + "logits/rejected": -2.6646246910095215, + "logps/chosen": -220.2881622314453, + "logps/rejected": -269.70782470703125, + "loss": 0.6294, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.955125331878662, + "rewards/margins": 2.6565656661987305, + "rewards/rejected": -5.611690998077393, + "step": 2320 + }, + { + "epoch": 0.45, + "learning_rate": 4.7181275616595957e-07, + "logits/chosen": -2.7159416675567627, + "logits/rejected": -2.761934280395508, + "logps/chosen": -257.1372985839844, + "logps/rejected": -216.0843963623047, + "loss": 0.4954, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.3044304847717285, + "rewards/margins": 1.3167331218719482, + "rewards/rejected": -3.6211636066436768, + "step": 2330 + }, + { + "epoch": 0.45, + "learning_rate": 4.71453224994607e-07, + "logits/chosen": -2.934607744216919, + "logits/rejected": -2.943761110305786, + "logps/chosen": -281.1361999511719, + "logps/rejected": -308.1634521484375, + "loss": 0.6032, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.7284035682678223, + "rewards/margins": 1.6770026683807373, + "rewards/rejected": -5.405405521392822, + "step": 2340 + }, + { + "epoch": 0.46, + "learning_rate": 4.710936938232545e-07, + "logits/chosen": -2.8049893379211426, + "logits/rejected": -2.830340623855591, + "logps/chosen": -286.1492614746094, + "logps/rejected": -263.43572998046875, + "loss": 0.4673, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1562047004699707, + "rewards/margins": 2.307713747024536, + "rewards/rejected": -4.4639177322387695, + "step": 2350 + }, + { + "epoch": 0.46, + "learning_rate": 4.707341626519019e-07, + "logits/chosen": -2.7437050342559814, + "logits/rejected": -2.8308017253875732, + "logps/chosen": -230.4921875, + "logps/rejected": -253.0805206298828, + "loss": 0.5056, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.857966184616089, + "rewards/margins": 2.8505640029907227, + "rewards/rejected": -6.708531379699707, + "step": 2360 + }, + { + "epoch": 0.46, + "learning_rate": 4.7037463148054933e-07, + "logits/chosen": -2.866947889328003, + "logits/rejected": -2.7586913108825684, + "logps/chosen": -289.22930908203125, + "logps/rejected": -241.8726806640625, + "loss": 0.5151, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4683752059936523, + "rewards/margins": 1.7269830703735352, + "rewards/rejected": -3.1953585147857666, + "step": 2370 + }, + { + "epoch": 0.46, + "learning_rate": 4.7001510030919676e-07, + "logits/chosen": -2.7631888389587402, + "logits/rejected": -2.7680182456970215, + "logps/chosen": -125.8613052368164, + "logps/rejected": -168.28732299804688, + "loss": 0.4811, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -3.2075114250183105, + "rewards/margins": 1.0803308486938477, + "rewards/rejected": -4.287842273712158, + "step": 2380 + }, + { + "epoch": 0.46, + "learning_rate": 4.696555691378442e-07, + "logits/chosen": -2.788008689880371, + "logits/rejected": -2.773190975189209, + "logps/chosen": -205.5693359375, + "logps/rejected": -164.92813110351562, + "loss": 0.5413, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.432602643966675, + "rewards/margins": 2.795102596282959, + "rewards/rejected": -6.227705955505371, + "step": 2390 + }, + { + "epoch": 0.47, + "learning_rate": 4.692960379664917e-07, + "logits/chosen": -2.9483280181884766, + "logits/rejected": -2.9471282958984375, + "logps/chosen": -247.19863891601562, + "logps/rejected": -231.531005859375, + "loss": 0.6603, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3705947399139404, + "rewards/margins": 2.1876771450042725, + "rewards/rejected": -4.558271408081055, + "step": 2400 + }, + { + "epoch": 0.47, + "eval_logits/chosen": -2.865147113800049, + "eval_logits/rejected": -2.862717390060425, + "eval_logps/chosen": -227.82696533203125, + "eval_logps/rejected": -227.18824768066406, + "eval_loss": 0.5267188549041748, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -3.3588902950286865, + "eval_rewards/margins": 2.1683359146118164, + "eval_rewards/rejected": -5.527226448059082, + "eval_runtime": 140.1009, + "eval_samples_per_second": 22.527, + "eval_steps_per_second": 0.357, + "step": 2400 + }, + { + "epoch": 0.47, + "learning_rate": 4.6893650679513915e-07, + "logits/chosen": -2.711394786834717, + "logits/rejected": -2.835629463195801, + "logps/chosen": -205.76205444335938, + "logps/rejected": -204.39146423339844, + "loss": 0.8134, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1112122535705566, + "rewards/margins": 2.999711751937866, + "rewards/rejected": -6.11092472076416, + "step": 2410 + }, + { + "epoch": 0.47, + "learning_rate": 4.685769756237866e-07, + "logits/chosen": -2.7515971660614014, + "logits/rejected": -2.804112672805786, + "logps/chosen": -225.8956756591797, + "logps/rejected": -270.7162170410156, + "loss": 0.6108, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4729607105255127, + "rewards/margins": 1.2283196449279785, + "rewards/rejected": -3.7012805938720703, + "step": 2420 + }, + { + "epoch": 0.47, + "learning_rate": 4.68217444452434e-07, + "logits/chosen": -2.9887964725494385, + "logits/rejected": -2.903700590133667, + "logps/chosen": -344.06658935546875, + "logps/rejected": -273.3992004394531, + "loss": 0.4948, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.804466962814331, + "rewards/margins": 0.6572948694229126, + "rewards/rejected": -4.461761951446533, + "step": 2430 + }, + { + "epoch": 0.47, + "learning_rate": 4.6785791328108143e-07, + "logits/chosen": -2.9037537574768066, + "logits/rejected": -2.8705358505249023, + "logps/chosen": -237.170654296875, + "logps/rejected": -201.48524475097656, + "loss": 0.4903, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4012081623077393, + "rewards/margins": 2.492006778717041, + "rewards/rejected": -3.893214702606201, + "step": 2440 + }, + { + "epoch": 0.48, + "learning_rate": 4.674983821097289e-07, + "logits/chosen": -2.896740674972534, + "logits/rejected": -2.9393861293792725, + "logps/chosen": -279.9737243652344, + "logps/rejected": -331.5505676269531, + "loss": 0.562, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9780767560005188, + "rewards/margins": 2.16288423538208, + "rewards/rejected": -3.140961170196533, + "step": 2450 + }, + { + "epoch": 0.48, + "learning_rate": 4.6713885093837634e-07, + "logits/chosen": -2.800800085067749, + "logits/rejected": -2.810918092727661, + "logps/chosen": -186.5795440673828, + "logps/rejected": -224.8197784423828, + "loss": 0.4998, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.2283473014831543, + "rewards/margins": 2.912879467010498, + "rewards/rejected": -5.141226291656494, + "step": 2460 + }, + { + "epoch": 0.48, + "learning_rate": 4.6677931976702377e-07, + "logits/chosen": -2.8430376052856445, + "logits/rejected": -2.8135600090026855, + "logps/chosen": -279.0056457519531, + "logps/rejected": -211.0347137451172, + "loss": 0.5782, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6672947406768799, + "rewards/margins": 1.0928544998168945, + "rewards/rejected": -2.7601494789123535, + "step": 2470 + }, + { + "epoch": 0.48, + "learning_rate": 4.664197885956712e-07, + "logits/chosen": -2.9663290977478027, + "logits/rejected": -3.0165181159973145, + "logps/chosen": -282.0238342285156, + "logps/rejected": -326.161376953125, + "loss": 0.4105, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1331288814544678, + "rewards/margins": 1.9243627786636353, + "rewards/rejected": -4.057491779327393, + "step": 2480 + }, + { + "epoch": 0.48, + "learning_rate": 4.660602574243186e-07, + "logits/chosen": -2.8429200649261475, + "logits/rejected": -2.7810652256011963, + "logps/chosen": -322.28619384765625, + "logps/rejected": -222.6088409423828, + "loss": 0.5032, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.170711040496826, + "rewards/margins": 1.5483611822128296, + "rewards/rejected": -4.719071865081787, + "step": 2490 + }, + { + "epoch": 0.49, + "learning_rate": 4.6570072625296616e-07, + "logits/chosen": -2.7781026363372803, + "logits/rejected": -2.8628244400024414, + "logps/chosen": -261.4495849609375, + "logps/rejected": -212.68807983398438, + "loss": 0.5727, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.713273763656616, + "rewards/margins": 3.496837615966797, + "rewards/rejected": -6.210111618041992, + "step": 2500 + }, + { + "epoch": 0.49, + "eval_logits/chosen": -2.8481392860412598, + "eval_logits/rejected": -2.8475637435913086, + "eval_logps/chosen": -227.86355590820312, + "eval_logps/rejected": -228.5321807861328, + "eval_loss": 0.49507883191108704, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -3.362549066543579, + "eval_rewards/margins": 2.299072265625, + "eval_rewards/rejected": -5.66162109375, + "eval_runtime": 140.7809, + "eval_samples_per_second": 22.418, + "eval_steps_per_second": 0.355, + "step": 2500 + }, + { + "epoch": 0.49, + "learning_rate": 4.653411950816136e-07, + "logits/chosen": -2.882237672805786, + "logits/rejected": -2.8170576095581055, + "logps/chosen": -225.1431121826172, + "logps/rejected": -244.21932983398438, + "loss": 0.4277, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.2390036582946777, + "rewards/margins": 2.326955795288086, + "rewards/rejected": -5.565959930419922, + "step": 2510 + }, + { + "epoch": 0.49, + "learning_rate": 4.64981663910261e-07, + "logits/chosen": -2.8670458793640137, + "logits/rejected": -2.7911815643310547, + "logps/chosen": -197.6697235107422, + "logps/rejected": -185.67788696289062, + "loss": 0.5231, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.919440984725952, + "rewards/margins": 0.77399080991745, + "rewards/rejected": -3.6934313774108887, + "step": 2520 + }, + { + "epoch": 0.49, + "learning_rate": 4.6462213273890844e-07, + "logits/chosen": -2.9022250175476074, + "logits/rejected": -2.893629550933838, + "logps/chosen": -273.0534362792969, + "logps/rejected": -279.77777099609375, + "loss": 0.4312, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.160531520843506, + "rewards/margins": 2.096802234649658, + "rewards/rejected": -4.257333755493164, + "step": 2530 + }, + { + "epoch": 0.49, + "learning_rate": 4.6426260156755587e-07, + "logits/chosen": -2.8681180477142334, + "logits/rejected": -2.824857711791992, + "logps/chosen": -233.68972778320312, + "logps/rejected": -296.8993225097656, + "loss": 0.5388, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.006774425506592, + "rewards/margins": 1.4732333421707153, + "rewards/rejected": -3.4800078868865967, + "step": 2540 + }, + { + "epoch": 0.5, + "learning_rate": 4.6390307039620335e-07, + "logits/chosen": -2.6878414154052734, + "logits/rejected": -2.652742862701416, + "logps/chosen": -116.12736511230469, + "logps/rejected": -166.33688354492188, + "loss": 0.6971, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.250061273574829, + "rewards/margins": 2.4174466133117676, + "rewards/rejected": -4.667508125305176, + "step": 2550 + }, + { + "epoch": 0.5, + "learning_rate": 4.635435392248508e-07, + "logits/chosen": -2.791367530822754, + "logits/rejected": -2.7953476905822754, + "logps/chosen": -249.4175567626953, + "logps/rejected": -228.1199188232422, + "loss": 0.5551, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.463749647140503, + "rewards/margins": 1.6222326755523682, + "rewards/rejected": -4.085981845855713, + "step": 2560 + }, + { + "epoch": 0.5, + "learning_rate": 4.631840080534982e-07, + "logits/chosen": -2.8546769618988037, + "logits/rejected": -2.9030673503875732, + "logps/chosen": -214.65234375, + "logps/rejected": -260.1628112792969, + "loss": 0.4563, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3015438318252563, + "rewards/margins": 1.6741079092025757, + "rewards/rejected": -2.975651741027832, + "step": 2570 + }, + { + "epoch": 0.5, + "learning_rate": 4.6282447688214563e-07, + "logits/chosen": -2.805986166000366, + "logits/rejected": -2.772294759750366, + "logps/chosen": -174.05673217773438, + "logps/rejected": -267.9089050292969, + "loss": 0.5648, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.704420804977417, + "rewards/margins": 2.961665391921997, + "rewards/rejected": -6.666086673736572, + "step": 2580 + }, + { + "epoch": 0.5, + "learning_rate": 4.6246494571079306e-07, + "logits/chosen": -2.9575438499450684, + "logits/rejected": -2.956247091293335, + "logps/chosen": -341.7652893066406, + "logps/rejected": -297.8833923339844, + "loss": 0.5548, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.6619125604629517, + "rewards/margins": 0.7100633978843689, + "rewards/rejected": -2.371976137161255, + "step": 2590 + }, + { + "epoch": 0.5, + "learning_rate": 4.621054145394406e-07, + "logits/chosen": -2.711975336074829, + "logits/rejected": -2.7088356018066406, + "logps/chosen": -171.69789123535156, + "logps/rejected": -199.38491821289062, + "loss": 0.5962, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -5.308825969696045, + "rewards/margins": 1.6770817041397095, + "rewards/rejected": -6.985907554626465, + "step": 2600 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -2.795304536819458, + "eval_logits/rejected": -2.794396162033081, + "eval_logps/chosen": -225.79542541503906, + "eval_logps/rejected": -227.5615234375, + "eval_loss": 0.4849202036857605, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -3.155735969543457, + "eval_rewards/margins": 2.408820152282715, + "eval_rewards/rejected": -5.564556121826172, + "eval_runtime": 149.6298, + "eval_samples_per_second": 21.092, + "eval_steps_per_second": 0.334, + "step": 2600 + }, + { + "epoch": 0.51, + "learning_rate": 4.61745883368088e-07, + "logits/chosen": -2.8558313846588135, + "logits/rejected": -2.840514659881592, + "logps/chosen": -286.47772216796875, + "logps/rejected": -210.8869171142578, + "loss": 0.6161, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.449849843978882, + "rewards/margins": 1.0376472473144531, + "rewards/rejected": -3.487496852874756, + "step": 2610 + }, + { + "epoch": 0.51, + "learning_rate": 4.6138635219673545e-07, + "logits/chosen": -2.8377602100372314, + "logits/rejected": -2.850285291671753, + "logps/chosen": -229.0254364013672, + "logps/rejected": -214.21279907226562, + "loss": 0.5154, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -5.181450843811035, + "rewards/margins": 1.4099786281585693, + "rewards/rejected": -6.591429710388184, + "step": 2620 + }, + { + "epoch": 0.51, + "learning_rate": 4.610268210253829e-07, + "logits/chosen": -2.809192419052124, + "logits/rejected": -2.9091243743896484, + "logps/chosen": -254.6504669189453, + "logps/rejected": -327.51019287109375, + "loss": 0.4769, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7079193592071533, + "rewards/margins": 1.4167802333831787, + "rewards/rejected": -3.124699354171753, + "step": 2630 + }, + { + "epoch": 0.51, + "learning_rate": 4.606672898540303e-07, + "logits/chosen": -2.7210512161254883, + "logits/rejected": -2.763578414916992, + "logps/chosen": -179.32467651367188, + "logps/rejected": -191.49557495117188, + "loss": 0.5819, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.314612865447998, + "rewards/margins": 3.099726438522339, + "rewards/rejected": -5.4143385887146, + "step": 2640 + }, + { + "epoch": 0.51, + "learning_rate": 4.603077586826778e-07, + "logits/chosen": -2.784083843231201, + "logits/rejected": -2.7739856243133545, + "logps/chosen": -180.47994995117188, + "logps/rejected": -230.2683563232422, + "loss": 0.469, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.519798755645752, + "rewards/margins": 0.8459455370903015, + "rewards/rejected": -3.3657443523406982, + "step": 2650 + }, + { + "epoch": 0.52, + "learning_rate": 4.599482275113252e-07, + "logits/chosen": -2.6934545040130615, + "logits/rejected": -2.6930344104766846, + "logps/chosen": -173.08389282226562, + "logps/rejected": -185.22975158691406, + "loss": 0.4295, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2992825508117676, + "rewards/margins": 1.4708733558654785, + "rewards/rejected": -3.770156145095825, + "step": 2660 + }, + { + "epoch": 0.52, + "learning_rate": 4.5958869633997264e-07, + "logits/chosen": -2.882859706878662, + "logits/rejected": -2.8013839721679688, + "logps/chosen": -236.16506958007812, + "logps/rejected": -265.9134826660156, + "loss": 0.5289, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8265107870101929, + "rewards/margins": 1.7285740375518799, + "rewards/rejected": -3.555084705352783, + "step": 2670 + }, + { + "epoch": 0.52, + "learning_rate": 4.5922916516862007e-07, + "logits/chosen": -2.7338805198669434, + "logits/rejected": -2.8017101287841797, + "logps/chosen": -167.84872436523438, + "logps/rejected": -158.44155883789062, + "loss": 0.4123, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3285470008850098, + "rewards/margins": 3.0958738327026367, + "rewards/rejected": -5.424420356750488, + "step": 2680 + }, + { + "epoch": 0.52, + "learning_rate": 4.588696339972675e-07, + "logits/chosen": -2.811453342437744, + "logits/rejected": -2.855027437210083, + "logps/chosen": -264.64385986328125, + "logps/rejected": -217.11593627929688, + "loss": 0.5148, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.4200408458709717, + "rewards/margins": 3.363882064819336, + "rewards/rejected": -6.783924102783203, + "step": 2690 + }, + { + "epoch": 0.52, + "learning_rate": 4.5851010282591503e-07, + "logits/chosen": -2.838355302810669, + "logits/rejected": -2.8200697898864746, + "logps/chosen": -304.71844482421875, + "logps/rejected": -251.6042022705078, + "loss": 0.5934, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.910211563110352, + "rewards/margins": 2.4452691078186035, + "rewards/rejected": -7.355480194091797, + "step": 2700 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.7893569469451904, + "eval_logits/rejected": -2.7884714603424072, + "eval_logps/chosen": -232.76983642578125, + "eval_logps/rejected": -239.4730224609375, + "eval_loss": 0.48597389459609985, + "eval_rewards/accuracies": 0.7124999761581421, + "eval_rewards/chosen": -3.8531787395477295, + "eval_rewards/margins": 2.9025280475616455, + "eval_rewards/rejected": -6.755706310272217, + "eval_runtime": 140.5224, + "eval_samples_per_second": 22.459, + "eval_steps_per_second": 0.356, + "step": 2700 + }, + { + "epoch": 0.53, + "learning_rate": 4.5815057165456246e-07, + "logits/chosen": -2.8093137741088867, + "logits/rejected": -2.7831215858459473, + "logps/chosen": -266.3493347167969, + "logps/rejected": -321.4234313964844, + "loss": 0.4197, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.4172821044921875, + "rewards/margins": 3.3637890815734863, + "rewards/rejected": -6.781070709228516, + "step": 2710 + }, + { + "epoch": 0.53, + "learning_rate": 4.577910404832099e-07, + "logits/chosen": -2.69020414352417, + "logits/rejected": -2.688347816467285, + "logps/chosen": -196.66305541992188, + "logps/rejected": -200.4778594970703, + "loss": 0.5553, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.270603656768799, + "rewards/margins": 2.629127025604248, + "rewards/rejected": -6.899730682373047, + "step": 2720 + }, + { + "epoch": 0.53, + "learning_rate": 4.574315093118573e-07, + "logits/chosen": -2.7629377841949463, + "logits/rejected": -2.724412679672241, + "logps/chosen": -370.864501953125, + "logps/rejected": -272.51239013671875, + "loss": 0.5046, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.3427891731262207, + "rewards/margins": 2.89741587638855, + "rewards/rejected": -6.240204811096191, + "step": 2730 + }, + { + "epoch": 0.53, + "learning_rate": 4.5707197814050474e-07, + "logits/chosen": -2.664853572845459, + "logits/rejected": -2.6672279834747314, + "logps/chosen": -222.2160186767578, + "logps/rejected": -236.78158569335938, + "loss": 0.7191, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.9496960639953613, + "rewards/margins": 2.4258008003234863, + "rewards/rejected": -5.375496864318848, + "step": 2740 + }, + { + "epoch": 0.53, + "learning_rate": 4.567124469691522e-07, + "logits/chosen": -2.5228238105773926, + "logits/rejected": -2.6249096393585205, + "logps/chosen": -229.5448760986328, + "logps/rejected": -212.86734008789062, + "loss": 0.5635, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.750124454498291, + "rewards/margins": 3.2047972679138184, + "rewards/rejected": -5.954921722412109, + "step": 2750 + }, + { + "epoch": 0.54, + "learning_rate": 4.5635291579779965e-07, + "logits/chosen": -2.764838695526123, + "logits/rejected": -2.788780927658081, + "logps/chosen": -279.43170166015625, + "logps/rejected": -226.8254852294922, + "loss": 0.4389, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6918768882751465, + "rewards/margins": 4.540162563323975, + "rewards/rejected": -7.232039451599121, + "step": 2760 + }, + { + "epoch": 0.54, + "learning_rate": 4.559933846264471e-07, + "logits/chosen": -2.6748318672180176, + "logits/rejected": -2.7573904991149902, + "logps/chosen": -294.40020751953125, + "logps/rejected": -274.6376647949219, + "loss": 0.4928, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.328364133834839, + "rewards/margins": 1.6496822834014893, + "rewards/rejected": -4.97804594039917, + "step": 2770 + }, + { + "epoch": 0.54, + "learning_rate": 4.556338534550945e-07, + "logits/chosen": -2.860028028488159, + "logits/rejected": -2.9034276008605957, + "logps/chosen": -256.19110107421875, + "logps/rejected": -258.70513916015625, + "loss": 0.8263, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.013097286224365, + "rewards/margins": 3.11262845993042, + "rewards/rejected": -7.125726222991943, + "step": 2780 + }, + { + "epoch": 0.54, + "learning_rate": 4.5527432228374194e-07, + "logits/chosen": -2.962338924407959, + "logits/rejected": -2.9946417808532715, + "logps/chosen": -381.32305908203125, + "logps/rejected": -331.4267272949219, + "loss": 0.498, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.3522491455078125, + "rewards/margins": 0.34079620242118835, + "rewards/rejected": -4.693045616149902, + "step": 2790 + }, + { + "epoch": 0.54, + "learning_rate": 4.5491479111238947e-07, + "logits/chosen": -2.9137508869171143, + "logits/rejected": -2.922184705734253, + "logps/chosen": -221.0836181640625, + "logps/rejected": -227.008544921875, + "loss": 0.5091, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.47516393661499, + "rewards/margins": 2.3096227645874023, + "rewards/rejected": -6.784787178039551, + "step": 2800 + }, + { + "epoch": 0.54, + "eval_logits/chosen": -2.789396286010742, + "eval_logits/rejected": -2.7867987155914307, + "eval_logps/chosen": -241.6216278076172, + "eval_logps/rejected": -248.03701782226562, + "eval_loss": 0.48178786039352417, + "eval_rewards/accuracies": 0.7225000262260437, + "eval_rewards/chosen": -4.7383551597595215, + "eval_rewards/margins": 2.873750925064087, + "eval_rewards/rejected": -7.6121063232421875, + "eval_runtime": 156.9523, + "eval_samples_per_second": 20.108, + "eval_steps_per_second": 0.319, + "step": 2800 + }, + { + "epoch": 0.55, + "learning_rate": 4.545552599410369e-07, + "logits/chosen": -2.815260171890259, + "logits/rejected": -2.8004562854766846, + "logps/chosen": -265.31866455078125, + "logps/rejected": -286.4771423339844, + "loss": 0.4594, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.1919310092926025, + "rewards/margins": 1.9255412817001343, + "rewards/rejected": -5.1174726486206055, + "step": 2810 + }, + { + "epoch": 0.55, + "learning_rate": 4.541957287696843e-07, + "logits/chosen": -2.8050880432128906, + "logits/rejected": -2.830878734588623, + "logps/chosen": -279.0443420410156, + "logps/rejected": -254.31881713867188, + "loss": 0.5846, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.6159064769744873, + "rewards/margins": 3.61749267578125, + "rewards/rejected": -7.233399391174316, + "step": 2820 + }, + { + "epoch": 0.55, + "learning_rate": 4.5383619759833175e-07, + "logits/chosen": -2.8187646865844727, + "logits/rejected": -2.8197734355926514, + "logps/chosen": -270.74810791015625, + "logps/rejected": -321.5408020019531, + "loss": 0.4978, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.90576434135437, + "rewards/margins": 4.879975318908691, + "rewards/rejected": -7.785739898681641, + "step": 2830 + }, + { + "epoch": 0.55, + "learning_rate": 4.534766664269792e-07, + "logits/chosen": -2.8680903911590576, + "logits/rejected": -2.8194046020507812, + "logps/chosen": -213.48660278320312, + "logps/rejected": -207.0009765625, + "loss": 0.4644, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.814753293991089, + "rewards/margins": 2.791482448577881, + "rewards/rejected": -5.606235504150391, + "step": 2840 + }, + { + "epoch": 0.55, + "learning_rate": 4.531171352556266e-07, + "logits/chosen": -2.751530408859253, + "logits/rejected": -2.72860050201416, + "logps/chosen": -231.2147979736328, + "logps/rejected": -203.83409118652344, + "loss": 0.4614, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.988515853881836, + "rewards/margins": 4.011109352111816, + "rewards/rejected": -7.999625205993652, + "step": 2850 + }, + { + "epoch": 0.56, + "learning_rate": 4.527576040842741e-07, + "logits/chosen": -2.666177988052368, + "logits/rejected": -2.6831955909729004, + "logps/chosen": -237.4653778076172, + "logps/rejected": -269.43695068359375, + "loss": 0.4596, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.882408618927002, + "rewards/margins": 2.584329605102539, + "rewards/rejected": -5.466738224029541, + "step": 2860 + }, + { + "epoch": 0.56, + "learning_rate": 4.523980729129215e-07, + "logits/chosen": -2.932311534881592, + "logits/rejected": -2.8739829063415527, + "logps/chosen": -198.46334838867188, + "logps/rejected": -215.8825225830078, + "loss": 0.5902, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.875717878341675, + "rewards/margins": 1.6331195831298828, + "rewards/rejected": -4.508837699890137, + "step": 2870 + }, + { + "epoch": 0.56, + "learning_rate": 4.5203854174156895e-07, + "logits/chosen": -2.7299952507019043, + "logits/rejected": -2.773221254348755, + "logps/chosen": -160.0872039794922, + "logps/rejected": -212.7479705810547, + "loss": 0.4727, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -5.80568790435791, + "rewards/margins": 4.347495079040527, + "rewards/rejected": -10.153182983398438, + "step": 2880 + }, + { + "epoch": 0.56, + "learning_rate": 4.5167901057021643e-07, + "logits/chosen": -2.733940601348877, + "logits/rejected": -2.7792134284973145, + "logps/chosen": -213.64645385742188, + "logps/rejected": -215.48654174804688, + "loss": 0.4715, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3354275226593018, + "rewards/margins": 2.1211249828338623, + "rewards/rejected": -5.456552028656006, + "step": 2890 + }, + { + "epoch": 0.56, + "learning_rate": 4.5131947939886385e-07, + "logits/chosen": -2.6658873558044434, + "logits/rejected": -2.7218375205993652, + "logps/chosen": -200.94448852539062, + "logps/rejected": -262.49053955078125, + "loss": 0.4864, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.2886252403259277, + "rewards/margins": 2.6156206130981445, + "rewards/rejected": -5.904245376586914, + "step": 2900 + }, + { + "epoch": 0.56, + "eval_logits/chosen": -2.7678444385528564, + "eval_logits/rejected": -2.7678475379943848, + "eval_logps/chosen": -235.4826202392578, + "eval_logps/rejected": -241.34597778320312, + "eval_loss": 0.4803493916988373, + "eval_rewards/accuracies": 0.7174999713897705, + "eval_rewards/chosen": -4.124456882476807, + "eval_rewards/margins": 2.818544626235962, + "eval_rewards/rejected": -6.943002223968506, + "eval_runtime": 141.1188, + "eval_samples_per_second": 22.364, + "eval_steps_per_second": 0.354, + "step": 2900 + }, + { + "epoch": 0.56, + "learning_rate": 4.5095994822751134e-07, + "logits/chosen": -2.833460569381714, + "logits/rejected": -2.802799701690674, + "logps/chosen": -214.0464324951172, + "logps/rejected": -240.76315307617188, + "loss": 0.4484, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0033795833587646, + "rewards/margins": 1.78948175907135, + "rewards/rejected": -3.7928614616394043, + "step": 2910 + }, + { + "epoch": 0.57, + "learning_rate": 4.5060041705615876e-07, + "logits/chosen": -2.7049596309661865, + "logits/rejected": -2.7685043811798096, + "logps/chosen": -196.03549194335938, + "logps/rejected": -220.3227081298828, + "loss": 0.5213, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.256979942321777, + "rewards/margins": 3.0013604164123535, + "rewards/rejected": -7.258340358734131, + "step": 2920 + }, + { + "epoch": 0.57, + "learning_rate": 4.502408858848062e-07, + "logits/chosen": -2.6681554317474365, + "logits/rejected": -2.6551687717437744, + "logps/chosen": -214.7572784423828, + "logps/rejected": -269.0203552246094, + "loss": 0.5363, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.369581699371338, + "rewards/margins": 2.0056557655334473, + "rewards/rejected": -5.375237464904785, + "step": 2930 + }, + { + "epoch": 0.57, + "learning_rate": 4.498813547134536e-07, + "logits/chosen": -2.7057762145996094, + "logits/rejected": -2.6808526515960693, + "logps/chosen": -169.85397338867188, + "logps/rejected": -280.1517639160156, + "loss": 0.6084, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.6998391151428223, + "rewards/margins": 2.238490343093872, + "rewards/rejected": -4.938329219818115, + "step": 2940 + }, + { + "epoch": 0.57, + "learning_rate": 4.4952182354210105e-07, + "logits/chosen": -2.5502073764801025, + "logits/rejected": -2.642277956008911, + "logps/chosen": -221.54415893554688, + "logps/rejected": -201.84996032714844, + "loss": 0.4849, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1933398246765137, + "rewards/margins": 3.202204465866089, + "rewards/rejected": -5.395545482635498, + "step": 2950 + }, + { + "epoch": 0.57, + "learning_rate": 4.4916229237074853e-07, + "logits/chosen": -2.723092555999756, + "logits/rejected": -2.7366278171539307, + "logps/chosen": -283.14678955078125, + "logps/rejected": -196.46414184570312, + "loss": 0.4182, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.1323070526123047, + "rewards/margins": 1.384583592414856, + "rewards/rejected": -4.516890525817871, + "step": 2960 + }, + { + "epoch": 0.58, + "learning_rate": 4.4880276119939596e-07, + "logits/chosen": -2.5838587284088135, + "logits/rejected": -2.7283897399902344, + "logps/chosen": -209.7035369873047, + "logps/rejected": -230.57687377929688, + "loss": 0.5242, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.3927102088928223, + "rewards/margins": 5.4158172607421875, + "rewards/rejected": -8.808526992797852, + "step": 2970 + }, + { + "epoch": 0.58, + "learning_rate": 4.484432300280434e-07, + "logits/chosen": -2.910597562789917, + "logits/rejected": -2.785182237625122, + "logps/chosen": -210.0435028076172, + "logps/rejected": -222.11618041992188, + "loss": 0.5054, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.7645599842071533, + "rewards/margins": 1.3658928871154785, + "rewards/rejected": -5.130453586578369, + "step": 2980 + }, + { + "epoch": 0.58, + "learning_rate": 4.4808369885669086e-07, + "logits/chosen": -2.771735668182373, + "logits/rejected": -2.8467822074890137, + "logps/chosen": -349.49066162109375, + "logps/rejected": -267.9779052734375, + "loss": 0.6024, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.8369460105895996, + "rewards/margins": 2.648116111755371, + "rewards/rejected": -5.485062599182129, + "step": 2990 + }, + { + "epoch": 0.58, + "learning_rate": 4.477241676853383e-07, + "logits/chosen": -2.7642741203308105, + "logits/rejected": -2.7960777282714844, + "logps/chosen": -254.3138427734375, + "logps/rejected": -278.13262939453125, + "loss": 0.4882, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0275261402130127, + "rewards/margins": 1.6648391485214233, + "rewards/rejected": -4.6923651695251465, + "step": 3000 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -2.789898157119751, + "eval_logits/rejected": -2.7911148071289062, + "eval_logps/chosen": -229.87535095214844, + "eval_logps/rejected": -232.6624755859375, + "eval_loss": 0.4967539310455322, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -3.5637285709381104, + "eval_rewards/margins": 2.5109221935272217, + "eval_rewards/rejected": -6.074651718139648, + "eval_runtime": 140.5322, + "eval_samples_per_second": 22.457, + "eval_steps_per_second": 0.356, + "step": 3000 + }, + { + "epoch": 0.58, + "learning_rate": 4.4736463651398577e-07, + "logits/chosen": -2.8905491828918457, + "logits/rejected": -2.8863720893859863, + "logps/chosen": -239.4340362548828, + "logps/rejected": -264.43817138671875, + "loss": 0.5881, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.0071334838867188, + "rewards/margins": 2.091095447540283, + "rewards/rejected": -5.09822940826416, + "step": 3010 + }, + { + "epoch": 0.59, + "learning_rate": 4.470051053426332e-07, + "logits/chosen": -2.899305820465088, + "logits/rejected": -2.8747293949127197, + "logps/chosen": -255.5482635498047, + "logps/rejected": -254.17111206054688, + "loss": 0.5337, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.560997724533081, + "rewards/margins": 4.4746809005737305, + "rewards/rejected": -8.03567886352539, + "step": 3020 + }, + { + "epoch": 0.59, + "learning_rate": 4.4664557417128063e-07, + "logits/chosen": -2.721799612045288, + "logits/rejected": -2.6572134494781494, + "logps/chosen": -186.32833862304688, + "logps/rejected": -239.40750122070312, + "loss": 0.5977, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5370707511901855, + "rewards/margins": 1.5322608947753906, + "rewards/rejected": -5.069332122802734, + "step": 3030 + }, + { + "epoch": 0.59, + "learning_rate": 4.4628604299992806e-07, + "logits/chosen": -2.6710963249206543, + "logits/rejected": -2.6934525966644287, + "logps/chosen": -203.04034423828125, + "logps/rejected": -218.6429443359375, + "loss": 0.5089, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.743546485900879, + "rewards/margins": 2.842298984527588, + "rewards/rejected": -5.585845947265625, + "step": 3040 + }, + { + "epoch": 0.59, + "learning_rate": 4.459265118285755e-07, + "logits/chosen": -2.7489089965820312, + "logits/rejected": -2.7831459045410156, + "logps/chosen": -177.79653930664062, + "logps/rejected": -220.6035614013672, + "loss": 0.4905, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.629119873046875, + "rewards/margins": 4.655067443847656, + "rewards/rejected": -8.284186363220215, + "step": 3050 + }, + { + "epoch": 0.59, + "learning_rate": 4.4556698065722296e-07, + "logits/chosen": -2.800358533859253, + "logits/rejected": -2.8105666637420654, + "logps/chosen": -211.05191040039062, + "logps/rejected": -238.6355438232422, + "loss": 0.5352, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4860329627990723, + "rewards/margins": 3.4360384941101074, + "rewards/rejected": -6.922071933746338, + "step": 3060 + }, + { + "epoch": 0.6, + "learning_rate": 4.452074494858704e-07, + "logits/chosen": -2.7737698554992676, + "logits/rejected": -2.850177526473999, + "logps/chosen": -188.44302368164062, + "logps/rejected": -232.519775390625, + "loss": 0.4941, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.984412670135498, + "rewards/margins": 3.328932285308838, + "rewards/rejected": -7.313345432281494, + "step": 3070 + }, + { + "epoch": 0.6, + "learning_rate": 4.448479183145178e-07, + "logits/chosen": -2.72746205329895, + "logits/rejected": -2.7201554775238037, + "logps/chosen": -206.14730834960938, + "logps/rejected": -231.61087036132812, + "loss": 0.4867, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.876461982727051, + "rewards/margins": 1.740944266319275, + "rewards/rejected": -6.617405891418457, + "step": 3080 + }, + { + "epoch": 0.6, + "learning_rate": 4.444883871431653e-07, + "logits/chosen": -2.839428424835205, + "logits/rejected": -2.82271146774292, + "logps/chosen": -203.8380889892578, + "logps/rejected": -207.5266571044922, + "loss": 0.5487, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.013213634490967, + "rewards/margins": 1.5793131589889526, + "rewards/rejected": -4.592526912689209, + "step": 3090 + }, + { + "epoch": 0.6, + "learning_rate": 4.4412885597181273e-07, + "logits/chosen": -2.8839659690856934, + "logits/rejected": -2.841780424118042, + "logps/chosen": -278.1788024902344, + "logps/rejected": -258.2974548339844, + "loss": 0.4958, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1666135787963867, + "rewards/margins": 1.1531345844268799, + "rewards/rejected": -4.319748401641846, + "step": 3100 + }, + { + "epoch": 0.6, + "eval_logits/chosen": -2.8040521144866943, + "eval_logits/rejected": -2.8052425384521484, + "eval_logps/chosen": -234.44879150390625, + "eval_logps/rejected": -239.80535888671875, + "eval_loss": 0.48301056027412415, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -4.021074295043945, + "eval_rewards/margins": 2.7678658962249756, + "eval_rewards/rejected": -6.788939952850342, + "eval_runtime": 155.9274, + "eval_samples_per_second": 20.24, + "eval_steps_per_second": 0.321, + "step": 3100 + }, + { + "epoch": 0.6, + "learning_rate": 4.437693248004602e-07, + "logits/chosen": -2.811739444732666, + "logits/rejected": -2.844913959503174, + "logps/chosen": -237.08291625976562, + "logps/rejected": -264.9893493652344, + "loss": 0.461, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9083025455474854, + "rewards/margins": 2.6540277004241943, + "rewards/rejected": -4.56233024597168, + "step": 3110 + }, + { + "epoch": 0.61, + "learning_rate": 4.4340979362910764e-07, + "logits/chosen": -2.6549086570739746, + "logits/rejected": -2.6251420974731445, + "logps/chosen": -238.0406494140625, + "logps/rejected": -226.7935333251953, + "loss": 0.6591, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8329887390136719, + "rewards/margins": 1.1235915422439575, + "rewards/rejected": -2.956580400466919, + "step": 3120 + }, + { + "epoch": 0.61, + "learning_rate": 4.4305026245775506e-07, + "logits/chosen": -2.838721990585327, + "logits/rejected": -2.798975944519043, + "logps/chosen": -204.61141967773438, + "logps/rejected": -245.9084930419922, + "loss": 0.5192, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.60957670211792, + "rewards/margins": 1.4932568073272705, + "rewards/rejected": -6.1028337478637695, + "step": 3130 + }, + { + "epoch": 0.61, + "learning_rate": 4.426907312864025e-07, + "logits/chosen": -2.8392152786254883, + "logits/rejected": -2.7306416034698486, + "logps/chosen": -324.04840087890625, + "logps/rejected": -252.14224243164062, + "loss": 0.6379, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.778172969818115, + "rewards/margins": 0.22122666239738464, + "rewards/rejected": -4.999399662017822, + "step": 3140 + }, + { + "epoch": 0.61, + "learning_rate": 4.423312001150499e-07, + "logits/chosen": -2.8068060874938965, + "logits/rejected": -2.824324369430542, + "logps/chosen": -191.25625610351562, + "logps/rejected": -219.1925811767578, + "loss": 0.604, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.532156944274902, + "rewards/margins": 3.1442408561706543, + "rewards/rejected": -7.676396369934082, + "step": 3150 + }, + { + "epoch": 0.61, + "learning_rate": 4.419716689436974e-07, + "logits/chosen": -2.7903690338134766, + "logits/rejected": -2.797542095184326, + "logps/chosen": -269.3061218261719, + "logps/rejected": -242.23208618164062, + "loss": 0.5406, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.1046996116638184, + "rewards/margins": 1.600785255432129, + "rewards/rejected": -4.7054853439331055, + "step": 3160 + }, + { + "epoch": 0.62, + "learning_rate": 4.4161213777234483e-07, + "logits/chosen": -2.6806042194366455, + "logits/rejected": -2.651494026184082, + "logps/chosen": -293.6893615722656, + "logps/rejected": -300.1238098144531, + "loss": 0.5415, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.8252601623535156, + "rewards/margins": 2.5960474014282227, + "rewards/rejected": -6.4213080406188965, + "step": 3170 + }, + { + "epoch": 0.62, + "learning_rate": 4.4125260660099226e-07, + "logits/chosen": -2.7799265384674072, + "logits/rejected": -2.844611883163452, + "logps/chosen": -235.3821563720703, + "logps/rejected": -241.40951538085938, + "loss": 0.6426, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.239741563796997, + "rewards/margins": 2.4194602966308594, + "rewards/rejected": -5.659201622009277, + "step": 3180 + }, + { + "epoch": 0.62, + "learning_rate": 4.4089307542963974e-07, + "logits/chosen": -2.8540241718292236, + "logits/rejected": -2.844773292541504, + "logps/chosen": -214.5902099609375, + "logps/rejected": -210.88577270507812, + "loss": 0.4608, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6627594232559204, + "rewards/margins": 4.641172885894775, + "rewards/rejected": -6.303932189941406, + "step": 3190 + }, + { + "epoch": 0.62, + "learning_rate": 4.4053354425828717e-07, + "logits/chosen": -2.9225311279296875, + "logits/rejected": -2.949389934539795, + "logps/chosen": -277.113525390625, + "logps/rejected": -199.2532958984375, + "loss": 0.6056, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.612243175506592, + "rewards/margins": 2.2372536659240723, + "rewards/rejected": -4.849497318267822, + "step": 3200 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -2.841383695602417, + "eval_logits/rejected": -2.843325138092041, + "eval_logps/chosen": -227.9439239501953, + "eval_logps/rejected": -232.52822875976562, + "eval_loss": 0.4876376986503601, + "eval_rewards/accuracies": 0.7124999761581421, + "eval_rewards/chosen": -3.3705859184265137, + "eval_rewards/margins": 2.690638780593872, + "eval_rewards/rejected": -6.061224460601807, + "eval_runtime": 138.7046, + "eval_samples_per_second": 22.753, + "eval_steps_per_second": 0.36, + "step": 3200 + }, + { + "epoch": 0.62, + "learning_rate": 4.4017401308693465e-07, + "logits/chosen": -2.752577781677246, + "logits/rejected": -2.7847416400909424, + "logps/chosen": -283.6220397949219, + "logps/rejected": -236.8374481201172, + "loss": 0.6678, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.9232075214385986, + "rewards/margins": 1.3011012077331543, + "rewards/rejected": -4.224308490753174, + "step": 3210 + }, + { + "epoch": 0.63, + "learning_rate": 4.398144819155821e-07, + "logits/chosen": -3.000760793685913, + "logits/rejected": -2.914386034011841, + "logps/chosen": -139.03904724121094, + "logps/rejected": -159.8165283203125, + "loss": 0.5968, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.078878402709961, + "rewards/margins": 2.0962741374969482, + "rewards/rejected": -5.17515230178833, + "step": 3220 + }, + { + "epoch": 0.63, + "learning_rate": 4.394549507442295e-07, + "logits/chosen": -2.4832091331481934, + "logits/rejected": -2.565969944000244, + "logps/chosen": -233.667236328125, + "logps/rejected": -211.1376953125, + "loss": 0.5122, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.009371280670166, + "rewards/margins": 1.7042487859725952, + "rewards/rejected": -5.713620185852051, + "step": 3230 + }, + { + "epoch": 0.63, + "learning_rate": 4.3909541957287693e-07, + "logits/chosen": -2.793034076690674, + "logits/rejected": -2.841033697128296, + "logps/chosen": -248.4766082763672, + "logps/rejected": -204.9600830078125, + "loss": 0.5375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.73274827003479, + "rewards/margins": 1.598253846168518, + "rewards/rejected": -3.3310019969940186, + "step": 3240 + }, + { + "epoch": 0.63, + "learning_rate": 4.3873588840152436e-07, + "logits/chosen": -2.860628843307495, + "logits/rejected": -2.8502299785614014, + "logps/chosen": -277.3612365722656, + "logps/rejected": -240.89620971679688, + "loss": 0.5313, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.135129451751709, + "rewards/margins": 1.0863913297653198, + "rewards/rejected": -3.2215206623077393, + "step": 3250 + }, + { + "epoch": 0.63, + "learning_rate": 4.3837635723017184e-07, + "logits/chosen": -2.6584460735321045, + "logits/rejected": -2.7102010250091553, + "logps/chosen": -210.11239624023438, + "logps/rejected": -256.6304626464844, + "loss": 0.51, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1810548305511475, + "rewards/margins": 3.3691158294677734, + "rewards/rejected": -5.5501708984375, + "step": 3260 + }, + { + "epoch": 0.63, + "learning_rate": 4.3801682605881927e-07, + "logits/chosen": -2.8054721355438232, + "logits/rejected": -2.8320729732513428, + "logps/chosen": -287.7835693359375, + "logps/rejected": -356.615234375, + "loss": 0.4803, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2600693702697754, + "rewards/margins": 2.852206230163574, + "rewards/rejected": -5.11227560043335, + "step": 3270 + }, + { + "epoch": 0.64, + "learning_rate": 4.3765729488746675e-07, + "logits/chosen": -2.8953349590301514, + "logits/rejected": -2.884124279022217, + "logps/chosen": -260.77508544921875, + "logps/rejected": -234.0574188232422, + "loss": 0.4614, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.122786045074463, + "rewards/margins": 1.525109052658081, + "rewards/rejected": -3.647895097732544, + "step": 3280 + }, + { + "epoch": 0.64, + "learning_rate": 4.372977637161142e-07, + "logits/chosen": -2.9647879600524902, + "logits/rejected": -2.901134967803955, + "logps/chosen": -288.5679626464844, + "logps/rejected": -263.0459899902344, + "loss": 0.4579, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3843274116516113, + "rewards/margins": 2.8646128177642822, + "rewards/rejected": -6.248939514160156, + "step": 3290 + }, + { + "epoch": 0.64, + "learning_rate": 4.369382325447616e-07, + "logits/chosen": -2.634999990463257, + "logits/rejected": -2.5048158168792725, + "logps/chosen": -292.83612060546875, + "logps/rejected": -297.54010009765625, + "loss": 0.6339, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.710498809814453, + "rewards/margins": 3.350341796875, + "rewards/rejected": -8.06084156036377, + "step": 3300 + }, + { + "epoch": 0.64, + "eval_logits/chosen": -2.8006222248077393, + "eval_logits/rejected": -2.7996485233306885, + "eval_logps/chosen": -229.91427612304688, + "eval_logps/rejected": -236.0455322265625, + "eval_loss": 0.5043264031410217, + "eval_rewards/accuracies": 0.7149999737739563, + "eval_rewards/chosen": -3.5676214694976807, + "eval_rewards/margins": 2.845334529876709, + "eval_rewards/rejected": -6.412956714630127, + "eval_runtime": 140.5374, + "eval_samples_per_second": 22.457, + "eval_steps_per_second": 0.356, + "step": 3300 + }, + { + "epoch": 0.64, + "learning_rate": 4.365787013734091e-07, + "logits/chosen": -2.8656392097473145, + "logits/rejected": -2.8673369884490967, + "logps/chosen": -260.9368896484375, + "logps/rejected": -278.66778564453125, + "loss": 0.6329, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8062299489974976, + "rewards/margins": 2.270146608352661, + "rewards/rejected": -4.076376438140869, + "step": 3310 + }, + { + "epoch": 0.64, + "learning_rate": 4.362191702020565e-07, + "logits/chosen": -2.8520846366882324, + "logits/rejected": -2.815701961517334, + "logps/chosen": -333.2210998535156, + "logps/rejected": -343.0906677246094, + "loss": 0.4729, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.563772439956665, + "rewards/margins": 3.101661205291748, + "rewards/rejected": -5.665433406829834, + "step": 3320 + }, + { + "epoch": 0.65, + "learning_rate": 4.3585963903070394e-07, + "logits/chosen": -2.9428510665893555, + "logits/rejected": -2.939283847808838, + "logps/chosen": -258.36517333984375, + "logps/rejected": -252.941162109375, + "loss": 0.5681, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.794334888458252, + "rewards/margins": 3.352727174758911, + "rewards/rejected": -7.147061347961426, + "step": 3330 + }, + { + "epoch": 0.65, + "learning_rate": 4.3550010785935137e-07, + "logits/chosen": -2.8750064373016357, + "logits/rejected": -2.809535503387451, + "logps/chosen": -236.16696166992188, + "logps/rejected": -272.6056213378906, + "loss": 0.576, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -3.8540070056915283, + "rewards/margins": 1.4348869323730469, + "rewards/rejected": -5.288894176483154, + "step": 3340 + }, + { + "epoch": 0.65, + "learning_rate": 4.351405766879988e-07, + "logits/chosen": -2.998915910720825, + "logits/rejected": -2.934418201446533, + "logps/chosen": -273.8209533691406, + "logps/rejected": -249.93777465820312, + "loss": 0.4891, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.418905735015869, + "rewards/margins": 0.8809320330619812, + "rewards/rejected": -4.299838066101074, + "step": 3350 + }, + { + "epoch": 0.65, + "learning_rate": 4.347810455166463e-07, + "logits/chosen": -2.7779600620269775, + "logits/rejected": -2.866751194000244, + "logps/chosen": -153.2954864501953, + "logps/rejected": -205.49044799804688, + "loss": 0.6631, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.80474591255188, + "rewards/margins": 4.289844512939453, + "rewards/rejected": -7.0945892333984375, + "step": 3360 + }, + { + "epoch": 0.65, + "learning_rate": 4.344215143452937e-07, + "logits/chosen": -2.795165538787842, + "logits/rejected": -2.8436975479125977, + "logps/chosen": -262.58203125, + "logps/rejected": -248.729248046875, + "loss": 0.5235, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3129239082336426, + "rewards/margins": 1.4233911037445068, + "rewards/rejected": -3.7363152503967285, + "step": 3370 + }, + { + "epoch": 0.66, + "learning_rate": 4.340619831739412e-07, + "logits/chosen": -2.8676393032073975, + "logits/rejected": -2.7913284301757812, + "logps/chosen": -217.6074676513672, + "logps/rejected": -229.5713653564453, + "loss": 0.7444, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.564275026321411, + "rewards/margins": 1.6173843145370483, + "rewards/rejected": -5.18165922164917, + "step": 3380 + }, + { + "epoch": 0.66, + "learning_rate": 4.337024520025886e-07, + "logits/chosen": -2.7991833686828613, + "logits/rejected": -2.822204351425171, + "logps/chosen": -246.2049560546875, + "logps/rejected": -234.36013793945312, + "loss": 0.5077, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.762002468109131, + "rewards/margins": 3.062960147857666, + "rewards/rejected": -6.824962615966797, + "step": 3390 + }, + { + "epoch": 0.66, + "learning_rate": 4.3334292083123604e-07, + "logits/chosen": -2.604262590408325, + "logits/rejected": -2.6684818267822266, + "logps/chosen": -227.061767578125, + "logps/rejected": -227.0947723388672, + "loss": 0.5974, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.399252414703369, + "rewards/margins": 0.9407544136047363, + "rewards/rejected": -4.340006351470947, + "step": 3400 + }, + { + "epoch": 0.66, + "eval_logits/chosen": -2.640723466873169, + "eval_logits/rejected": -2.638162136077881, + "eval_logps/chosen": -237.52603149414062, + "eval_logps/rejected": -240.6396484375, + "eval_loss": 0.5700723528862, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -4.328795909881592, + "eval_rewards/margins": 2.543574571609497, + "eval_rewards/rejected": -6.872370719909668, + "eval_runtime": 139.5192, + "eval_samples_per_second": 22.621, + "eval_steps_per_second": 0.358, + "step": 3400 + }, + { + "epoch": 0.66, + "learning_rate": 4.329833896598835e-07, + "logits/chosen": -2.6771297454833984, + "logits/rejected": -2.7599921226501465, + "logps/chosen": -299.83087158203125, + "logps/rejected": -277.57061767578125, + "loss": 0.8204, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.88619065284729, + "rewards/margins": 1.7270328998565674, + "rewards/rejected": -4.613223075866699, + "step": 3410 + }, + { + "epoch": 0.66, + "learning_rate": 4.3262385848853095e-07, + "logits/chosen": -2.709689140319824, + "logits/rejected": -2.6539626121520996, + "logps/chosen": -270.373046875, + "logps/rejected": -239.0745086669922, + "loss": 0.6056, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.7626500129699707, + "rewards/margins": 2.2940726280212402, + "rewards/rejected": -5.056723117828369, + "step": 3420 + }, + { + "epoch": 0.67, + "learning_rate": 4.322643273171784e-07, + "logits/chosen": -2.7730724811553955, + "logits/rejected": -2.7567896842956543, + "logps/chosen": -243.9121551513672, + "logps/rejected": -292.22088623046875, + "loss": 0.5059, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": -4.396458625793457, + "rewards/margins": 1.893843412399292, + "rewards/rejected": -6.290301322937012, + "step": 3430 + }, + { + "epoch": 0.67, + "learning_rate": 4.319047961458258e-07, + "logits/chosen": -2.739290952682495, + "logits/rejected": -2.7570600509643555, + "logps/chosen": -301.94866943359375, + "logps/rejected": -290.7872314453125, + "loss": 0.5992, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.739505290985107, + "rewards/margins": 2.964844226837158, + "rewards/rejected": -8.704350471496582, + "step": 3440 + }, + { + "epoch": 0.67, + "learning_rate": 4.3154526497447323e-07, + "logits/chosen": -2.8693032264709473, + "logits/rejected": -2.6837515830993652, + "logps/chosen": -338.7242736816406, + "logps/rejected": -213.7662811279297, + "loss": 0.6146, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.858262300491333, + "rewards/margins": 1.727285623550415, + "rewards/rejected": -5.58554744720459, + "step": 3450 + }, + { + "epoch": 0.67, + "learning_rate": 4.311857338031207e-07, + "logits/chosen": -2.767155647277832, + "logits/rejected": -2.7514164447784424, + "logps/chosen": -261.1998291015625, + "logps/rejected": -338.3866882324219, + "loss": 0.4737, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.036892890930176, + "rewards/margins": 1.918280839920044, + "rewards/rejected": -5.955173492431641, + "step": 3460 + }, + { + "epoch": 0.67, + "learning_rate": 4.3082620263176814e-07, + "logits/chosen": -2.621947765350342, + "logits/rejected": -2.5978214740753174, + "logps/chosen": -309.0357971191406, + "logps/rejected": -294.66632080078125, + "loss": 0.5747, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.7080745697021484, + "rewards/margins": 0.551271378993988, + "rewards/rejected": -4.259346008300781, + "step": 3470 + }, + { + "epoch": 0.68, + "learning_rate": 4.304666714604156e-07, + "logits/chosen": -2.422496795654297, + "logits/rejected": -2.31830096244812, + "logps/chosen": -285.1286315917969, + "logps/rejected": -286.30584716796875, + "loss": 0.4846, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.4158012866973877, + "rewards/margins": 5.011656761169434, + "rewards/rejected": -8.427458763122559, + "step": 3480 + }, + { + "epoch": 0.68, + "learning_rate": 4.3010714028906305e-07, + "logits/chosen": -2.6212034225463867, + "logits/rejected": -2.631873846054077, + "logps/chosen": -240.05062866210938, + "logps/rejected": -281.8507080078125, + "loss": 0.5102, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.7735342979431152, + "rewards/margins": 3.5039703845977783, + "rewards/rejected": -7.277504920959473, + "step": 3490 + }, + { + "epoch": 0.68, + "learning_rate": 4.297476091177105e-07, + "logits/chosen": -2.7259914875030518, + "logits/rejected": -2.664835214614868, + "logps/chosen": -263.580078125, + "logps/rejected": -235.29190063476562, + "loss": 0.4836, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.454415321350098, + "rewards/margins": 3.3928513526916504, + "rewards/rejected": -7.84726619720459, + "step": 3500 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -2.5692861080169678, + "eval_logits/rejected": -2.563093662261963, + "eval_logps/chosen": -249.60523986816406, + "eval_logps/rejected": -257.0225830078125, + "eval_loss": 0.5171152949333191, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -5.536716938018799, + "eval_rewards/margins": 2.9739480018615723, + "eval_rewards/rejected": -8.510666847229004, + "eval_runtime": 141.1265, + "eval_samples_per_second": 22.363, + "eval_steps_per_second": 0.354, + "step": 3500 + }, + { + "epoch": 0.68, + "learning_rate": 4.2938807794635796e-07, + "logits/chosen": -2.737623929977417, + "logits/rejected": -2.6793720722198486, + "logps/chosen": -238.3832244873047, + "logps/rejected": -244.93896484375, + "loss": 0.7299, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.398845672607422, + "rewards/margins": 2.3521039485931396, + "rewards/rejected": -5.750948905944824, + "step": 3510 + }, + { + "epoch": 0.68, + "learning_rate": 4.290285467750054e-07, + "logits/chosen": -2.508549690246582, + "logits/rejected": -2.5834248065948486, + "logps/chosen": -190.38037109375, + "logps/rejected": -196.329345703125, + "loss": 0.471, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.651684761047363, + "rewards/margins": 1.127396821975708, + "rewards/rejected": -6.77908182144165, + "step": 3520 + }, + { + "epoch": 0.69, + "learning_rate": 4.286690156036528e-07, + "logits/chosen": -2.5850329399108887, + "logits/rejected": -2.5376124382019043, + "logps/chosen": -266.3369140625, + "logps/rejected": -243.0791473388672, + "loss": 0.5134, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.392764091491699, + "rewards/margins": 2.744372844696045, + "rewards/rejected": -7.137136936187744, + "step": 3530 + }, + { + "epoch": 0.69, + "learning_rate": 4.2830948443230024e-07, + "logits/chosen": -2.67210054397583, + "logits/rejected": -2.6390540599823, + "logps/chosen": -220.0902862548828, + "logps/rejected": -242.9448699951172, + "loss": 0.4862, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -5.911955833435059, + "rewards/margins": 1.756532073020935, + "rewards/rejected": -7.668488502502441, + "step": 3540 + }, + { + "epoch": 0.69, + "learning_rate": 4.2794995326094767e-07, + "logits/chosen": -2.703794002532959, + "logits/rejected": -2.648158311843872, + "logps/chosen": -278.8760986328125, + "logps/rejected": -257.67010498046875, + "loss": 0.5159, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.091516971588135, + "rewards/margins": 1.6587636470794678, + "rewards/rejected": -6.750279903411865, + "step": 3550 + }, + { + "epoch": 0.69, + "learning_rate": 4.2759042208959515e-07, + "logits/chosen": -2.6188552379608154, + "logits/rejected": -2.64038348197937, + "logps/chosen": -290.19842529296875, + "logps/rejected": -269.04217529296875, + "loss": 0.5042, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.3531312942504883, + "rewards/margins": 0.8913325071334839, + "rewards/rejected": -3.2444636821746826, + "step": 3560 + }, + { + "epoch": 0.69, + "learning_rate": 4.272308909182426e-07, + "logits/chosen": -2.6543381214141846, + "logits/rejected": -2.639191150665283, + "logps/chosen": -240.03231811523438, + "logps/rejected": -231.0758819580078, + "loss": 0.462, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.280223846435547, + "rewards/margins": 2.169583559036255, + "rewards/rejected": -5.449807643890381, + "step": 3570 + }, + { + "epoch": 0.7, + "learning_rate": 4.2687135974689006e-07, + "logits/chosen": -2.6419341564178467, + "logits/rejected": -2.5609917640686035, + "logps/chosen": -237.6129150390625, + "logps/rejected": -345.57720947265625, + "loss": 0.596, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.28879714012146, + "rewards/margins": 1.1028800010681152, + "rewards/rejected": -3.391677141189575, + "step": 3580 + }, + { + "epoch": 0.7, + "learning_rate": 4.265118285755375e-07, + "logits/chosen": -2.7299187183380127, + "logits/rejected": -2.736818552017212, + "logps/chosen": -177.31857299804688, + "logps/rejected": -240.1614532470703, + "loss": 0.4835, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.7482101917266846, + "rewards/margins": 1.666076421737671, + "rewards/rejected": -5.4142866134643555, + "step": 3590 + }, + { + "epoch": 0.7, + "learning_rate": 4.261522974041849e-07, + "logits/chosen": -2.653881549835205, + "logits/rejected": -2.653212308883667, + "logps/chosen": -320.09625244140625, + "logps/rejected": -292.1993408203125, + "loss": 0.6342, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.1678853034973145, + "rewards/margins": 2.4087586402893066, + "rewards/rejected": -5.576643943786621, + "step": 3600 + }, + { + "epoch": 0.7, + "eval_logits/chosen": -2.596027135848999, + "eval_logits/rejected": -2.5903661251068115, + "eval_logps/chosen": -241.9811553955078, + "eval_logps/rejected": -249.30532836914062, + "eval_loss": 0.5059713125228882, + "eval_rewards/accuracies": 0.7124999761581421, + "eval_rewards/chosen": -4.774308681488037, + "eval_rewards/margins": 2.9646286964416504, + "eval_rewards/rejected": -7.738936901092529, + "eval_runtime": 139.8491, + "eval_samples_per_second": 22.567, + "eval_steps_per_second": 0.358, + "step": 3600 + }, + { + "epoch": 0.7, + "learning_rate": 4.257927662328324e-07, + "logits/chosen": -2.674074649810791, + "logits/rejected": -2.708157777786255, + "logps/chosen": -262.8844299316406, + "logps/rejected": -241.1731414794922, + "loss": 0.4843, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.978820562362671, + "rewards/margins": 1.0564634799957275, + "rewards/rejected": -3.0352840423583984, + "step": 3610 + }, + { + "epoch": 0.7, + "learning_rate": 4.254332350614798e-07, + "logits/chosen": -2.7725424766540527, + "logits/rejected": -2.7608630657196045, + "logps/chosen": -287.75006103515625, + "logps/rejected": -240.6949005126953, + "loss": 0.53, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.6862740516662598, + "rewards/margins": 1.710336685180664, + "rewards/rejected": -5.396610736846924, + "step": 3620 + }, + { + "epoch": 0.7, + "learning_rate": 4.2507370389012725e-07, + "logits/chosen": -2.5927350521087646, + "logits/rejected": -2.6337664127349854, + "logps/chosen": -232.65597534179688, + "logps/rejected": -211.85104370117188, + "loss": 0.5745, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.9736328125, + "rewards/margins": 1.2653578519821167, + "rewards/rejected": -5.238990306854248, + "step": 3630 + }, + { + "epoch": 0.71, + "learning_rate": 4.247141727187747e-07, + "logits/chosen": -2.7251429557800293, + "logits/rejected": -2.6122653484344482, + "logps/chosen": -270.0220947265625, + "logps/rejected": -203.34213256835938, + "loss": 0.5236, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.7760396003723145, + "rewards/margins": 1.1251728534698486, + "rewards/rejected": -6.901212215423584, + "step": 3640 + }, + { + "epoch": 0.71, + "learning_rate": 4.243546415474221e-07, + "logits/chosen": -2.7624127864837646, + "logits/rejected": -2.6737847328186035, + "logps/chosen": -296.1536865234375, + "logps/rejected": -256.7353515625, + "loss": 0.566, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.056868076324463, + "rewards/margins": 1.9823532104492188, + "rewards/rejected": -5.039221286773682, + "step": 3650 + }, + { + "epoch": 0.71, + "learning_rate": 4.239951103760696e-07, + "logits/chosen": -2.7661476135253906, + "logits/rejected": -2.7064614295959473, + "logps/chosen": -230.06900024414062, + "logps/rejected": -245.69528198242188, + "loss": 0.6072, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.373167037963867, + "rewards/margins": 3.0974011421203613, + "rewards/rejected": -7.4705681800842285, + "step": 3660 + }, + { + "epoch": 0.71, + "learning_rate": 4.2363557920471707e-07, + "logits/chosen": -2.7427279949188232, + "logits/rejected": -2.8481740951538086, + "logps/chosen": -217.91909790039062, + "logps/rejected": -234.4322052001953, + "loss": 0.5565, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.3290581703186035, + "rewards/margins": 2.53106951713562, + "rewards/rejected": -5.860127925872803, + "step": 3670 + }, + { + "epoch": 0.71, + "learning_rate": 4.232760480333645e-07, + "logits/chosen": -2.7182562351226807, + "logits/rejected": -2.6017279624938965, + "logps/chosen": -206.9663543701172, + "logps/rejected": -209.35269165039062, + "loss": 0.4904, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.4840569496154785, + "rewards/margins": 1.010908842086792, + "rewards/rejected": -3.4949657917022705, + "step": 3680 + }, + { + "epoch": 0.72, + "learning_rate": 4.229165168620119e-07, + "logits/chosen": -2.6685986518859863, + "logits/rejected": -2.644609212875366, + "logps/chosen": -251.0902862548828, + "logps/rejected": -220.2162322998047, + "loss": 0.6079, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8104891777038574, + "rewards/margins": 3.4027793407440186, + "rewards/rejected": -6.213269233703613, + "step": 3690 + }, + { + "epoch": 0.72, + "learning_rate": 4.2255698569065935e-07, + "logits/chosen": -2.7443270683288574, + "logits/rejected": -2.808504104614258, + "logps/chosen": -247.8759765625, + "logps/rejected": -227.86941528320312, + "loss": 0.5143, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.108855247497559, + "rewards/margins": 2.29363751411438, + "rewards/rejected": -6.402493476867676, + "step": 3700 + }, + { + "epoch": 0.72, + "eval_logits/chosen": -2.6517856121063232, + "eval_logits/rejected": -2.6496589183807373, + "eval_logps/chosen": -226.3973388671875, + "eval_logps/rejected": -230.3889617919922, + "eval_loss": 0.483525812625885, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -3.215925931930542, + "eval_rewards/margins": 2.631373882293701, + "eval_rewards/rejected": -5.847299575805664, + "eval_runtime": 141.5761, + "eval_samples_per_second": 22.292, + "eval_steps_per_second": 0.353, + "step": 3700 + }, + { + "epoch": 0.72, + "learning_rate": 4.2219745451930683e-07, + "logits/chosen": -2.857578992843628, + "logits/rejected": -2.903554677963257, + "logps/chosen": -228.97799682617188, + "logps/rejected": -262.11004638671875, + "loss": 0.5322, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.296816349029541, + "rewards/margins": 2.0335192680358887, + "rewards/rejected": -6.3303351402282715, + "step": 3710 + }, + { + "epoch": 0.72, + "learning_rate": 4.2183792334795426e-07, + "logits/chosen": -2.7694320678710938, + "logits/rejected": -2.8727636337280273, + "logps/chosen": -233.75820922851562, + "logps/rejected": -247.53536987304688, + "loss": 0.4928, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.6068358421325684, + "rewards/margins": 1.658368706703186, + "rewards/rejected": -4.265204906463623, + "step": 3720 + }, + { + "epoch": 0.72, + "learning_rate": 4.214783921766017e-07, + "logits/chosen": -2.8039932250976562, + "logits/rejected": -2.767218828201294, + "logps/chosen": -278.52325439453125, + "logps/rejected": -220.7368621826172, + "loss": 0.4337, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.668403148651123, + "rewards/margins": 2.4974465370178223, + "rewards/rejected": -5.165849685668945, + "step": 3730 + }, + { + "epoch": 0.73, + "learning_rate": 4.211188610052491e-07, + "logits/chosen": -2.956364393234253, + "logits/rejected": -2.916250705718994, + "logps/chosen": -281.52325439453125, + "logps/rejected": -325.3456115722656, + "loss": 0.5699, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.436584234237671, + "rewards/margins": 1.5769962072372437, + "rewards/rejected": -4.013580322265625, + "step": 3740 + }, + { + "epoch": 0.73, + "learning_rate": 4.2075932983389654e-07, + "logits/chosen": -2.8661975860595703, + "logits/rejected": -2.788851261138916, + "logps/chosen": -298.71319580078125, + "logps/rejected": -246.48599243164062, + "loss": 0.4317, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9997756481170654, + "rewards/margins": 2.1456754207611084, + "rewards/rejected": -5.145451545715332, + "step": 3750 + }, + { + "epoch": 0.73, + "learning_rate": 4.20399798662544e-07, + "logits/chosen": -2.8595833778381348, + "logits/rejected": -2.888211727142334, + "logps/chosen": -279.5419921875, + "logps/rejected": -271.6037902832031, + "loss": 0.5257, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.302333354949951, + "rewards/margins": 1.0954910516738892, + "rewards/rejected": -4.397824287414551, + "step": 3760 + }, + { + "epoch": 0.73, + "learning_rate": 4.200402674911915e-07, + "logits/chosen": -2.9476447105407715, + "logits/rejected": -2.9642484188079834, + "logps/chosen": -287.8567810058594, + "logps/rejected": -283.38482666015625, + "loss": 0.5795, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0977120399475098, + "rewards/margins": 1.7883121967315674, + "rewards/rejected": -3.886023998260498, + "step": 3770 + }, + { + "epoch": 0.73, + "learning_rate": 4.1968073631983893e-07, + "logits/chosen": -2.6992695331573486, + "logits/rejected": -2.7363626956939697, + "logps/chosen": -258.24932861328125, + "logps/rejected": -219.3705596923828, + "loss": 0.5153, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.7454285621643066, + "rewards/margins": 1.2108144760131836, + "rewards/rejected": -4.956243515014648, + "step": 3780 + }, + { + "epoch": 0.74, + "learning_rate": 4.1932120514848636e-07, + "logits/chosen": -2.751964807510376, + "logits/rejected": -2.729923725128174, + "logps/chosen": -159.1203155517578, + "logps/rejected": -277.16534423828125, + "loss": 0.5861, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.315541744232178, + "rewards/margins": 4.209917068481445, + "rewards/rejected": -8.525461196899414, + "step": 3790 + }, + { + "epoch": 0.74, + "learning_rate": 4.189616739771338e-07, + "logits/chosen": -2.803990125656128, + "logits/rejected": -2.8793039321899414, + "logps/chosen": -198.7175750732422, + "logps/rejected": -244.63943481445312, + "loss": 0.5471, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.9009945392608643, + "rewards/margins": 2.1119015216827393, + "rewards/rejected": -5.012895584106445, + "step": 3800 + }, + { + "epoch": 0.74, + "eval_logits/chosen": -2.7517590522766113, + "eval_logits/rejected": -2.7507576942443848, + "eval_logps/chosen": -236.92933654785156, + "eval_logps/rejected": -242.65428161621094, + "eval_loss": 0.5059856176376343, + "eval_rewards/accuracies": 0.6924999952316284, + "eval_rewards/chosen": -4.2691264152526855, + "eval_rewards/margins": 2.80470609664917, + "eval_rewards/rejected": -7.0738325119018555, + "eval_runtime": 140.944, + "eval_samples_per_second": 22.392, + "eval_steps_per_second": 0.355, + "step": 3800 + }, + { + "epoch": 0.74, + "learning_rate": 4.1860214280578127e-07, + "logits/chosen": -2.780768871307373, + "logits/rejected": -2.80261492729187, + "logps/chosen": -264.9028625488281, + "logps/rejected": -270.338623046875, + "loss": 0.4883, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.935929536819458, + "rewards/margins": 4.281942844390869, + "rewards/rejected": -8.217870712280273, + "step": 3810 + }, + { + "epoch": 0.74, + "learning_rate": 4.182426116344287e-07, + "logits/chosen": -2.8779873847961426, + "logits/rejected": -2.8326969146728516, + "logps/chosen": -210.17507934570312, + "logps/rejected": -217.43331909179688, + "loss": 0.5727, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -5.283583641052246, + "rewards/margins": 0.8070454597473145, + "rewards/rejected": -6.0906291007995605, + "step": 3820 + }, + { + "epoch": 0.74, + "learning_rate": 4.178830804630761e-07, + "logits/chosen": -2.61614727973938, + "logits/rejected": -2.53159499168396, + "logps/chosen": -311.3333740234375, + "logps/rejected": -306.41455078125, + "loss": 0.4351, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.734816789627075, + "rewards/margins": 3.9072043895721436, + "rewards/rejected": -6.642021179199219, + "step": 3830 + }, + { + "epoch": 0.75, + "learning_rate": 4.1752354929172355e-07, + "logits/chosen": -2.7780563831329346, + "logits/rejected": -2.859438419342041, + "logps/chosen": -243.5393524169922, + "logps/rejected": -316.92047119140625, + "loss": 0.5028, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.958086013793945, + "rewards/margins": 3.1509499549865723, + "rewards/rejected": -8.109037399291992, + "step": 3840 + }, + { + "epoch": 0.75, + "learning_rate": 4.17164018120371e-07, + "logits/chosen": -2.66560697555542, + "logits/rejected": -2.6807377338409424, + "logps/chosen": -221.40267944335938, + "logps/rejected": -293.7818298339844, + "loss": 0.6466, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.929117202758789, + "rewards/margins": 3.0511293411254883, + "rewards/rejected": -6.980246067047119, + "step": 3850 + }, + { + "epoch": 0.75, + "learning_rate": 4.1680448694901846e-07, + "logits/chosen": -2.6415555477142334, + "logits/rejected": -2.7434072494506836, + "logps/chosen": -256.9652099609375, + "logps/rejected": -278.50341796875, + "loss": 0.5411, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.1472325325012207, + "rewards/margins": 3.2073540687561035, + "rewards/rejected": -5.354586601257324, + "step": 3860 + }, + { + "epoch": 0.75, + "learning_rate": 4.1644495577766594e-07, + "logits/chosen": -2.9617741107940674, + "logits/rejected": -2.9699528217315674, + "logps/chosen": -283.10894775390625, + "logps/rejected": -255.62863159179688, + "loss": 0.5307, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.438523769378662, + "rewards/margins": 0.3908306956291199, + "rewards/rejected": -3.829354763031006, + "step": 3870 + }, + { + "epoch": 0.75, + "learning_rate": 4.1608542460631337e-07, + "logits/chosen": -2.7873966693878174, + "logits/rejected": -2.8923511505126953, + "logps/chosen": -238.02096557617188, + "logps/rejected": -347.6580505371094, + "loss": 0.524, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9851253032684326, + "rewards/margins": 1.3712899684906006, + "rewards/rejected": -3.356415271759033, + "step": 3880 + }, + { + "epoch": 0.76, + "learning_rate": 4.157258934349608e-07, + "logits/chosen": -2.694671630859375, + "logits/rejected": -2.7630417346954346, + "logps/chosen": -121.61210632324219, + "logps/rejected": -173.63577270507812, + "loss": 0.5658, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.5720744132995605, + "rewards/margins": 3.7524161338806152, + "rewards/rejected": -7.324490547180176, + "step": 3890 + }, + { + "epoch": 0.76, + "learning_rate": 4.153663622636082e-07, + "logits/chosen": -2.8237593173980713, + "logits/rejected": -2.826791286468506, + "logps/chosen": -256.9156799316406, + "logps/rejected": -270.22845458984375, + "loss": 0.4817, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.2757785320281982, + "rewards/margins": 4.210750579833984, + "rewards/rejected": -7.486529350280762, + "step": 3900 + }, + { + "epoch": 0.76, + "eval_logits/chosen": -2.6443426609039307, + "eval_logits/rejected": -2.6394639015197754, + "eval_logps/chosen": -238.49986267089844, + "eval_logps/rejected": -244.88394165039062, + "eval_loss": 0.529410183429718, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -4.426179885864258, + "eval_rewards/margins": 2.8706185817718506, + "eval_rewards/rejected": -7.2967987060546875, + "eval_runtime": 140.6835, + "eval_samples_per_second": 22.433, + "eval_steps_per_second": 0.355, + "step": 3900 + }, + { + "epoch": 0.76, + "learning_rate": 4.150068310922557e-07, + "logits/chosen": -2.673715114593506, + "logits/rejected": -2.738027334213257, + "logps/chosen": -203.73446655273438, + "logps/rejected": -292.6114501953125, + "loss": 0.8045, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.32696533203125, + "rewards/margins": 3.306959629058838, + "rewards/rejected": -7.633924961090088, + "step": 3910 + }, + { + "epoch": 0.76, + "learning_rate": 4.1464729992090313e-07, + "logits/chosen": -2.661303997039795, + "logits/rejected": -2.6372158527374268, + "logps/chosen": -257.3746337890625, + "logps/rejected": -233.3134002685547, + "loss": 0.5652, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.130716323852539, + "rewards/margins": 1.7002136707305908, + "rewards/rejected": -4.830929756164551, + "step": 3920 + }, + { + "epoch": 0.76, + "learning_rate": 4.1428776874955056e-07, + "logits/chosen": -2.7555060386657715, + "logits/rejected": -2.7300703525543213, + "logps/chosen": -184.47970581054688, + "logps/rejected": -221.07754516601562, + "loss": 0.6103, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.9298088550567627, + "rewards/margins": 2.627885341644287, + "rewards/rejected": -5.557694435119629, + "step": 3930 + }, + { + "epoch": 0.76, + "learning_rate": 4.13928237578198e-07, + "logits/chosen": -2.7176475524902344, + "logits/rejected": -2.719181537628174, + "logps/chosen": -233.90646362304688, + "logps/rejected": -222.2284698486328, + "loss": 0.4804, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0175254344940186, + "rewards/margins": 2.646049737930298, + "rewards/rejected": -5.663575172424316, + "step": 3940 + }, + { + "epoch": 0.77, + "learning_rate": 4.135687064068454e-07, + "logits/chosen": -2.4261183738708496, + "logits/rejected": -2.5859925746917725, + "logps/chosen": -261.8021240234375, + "logps/rejected": -308.89837646484375, + "loss": 0.591, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.808897614479065, + "rewards/margins": 2.689765691757202, + "rewards/rejected": -4.498663425445557, + "step": 3950 + }, + { + "epoch": 0.77, + "learning_rate": 4.132091752354929e-07, + "logits/chosen": -2.450559139251709, + "logits/rejected": -2.55533504486084, + "logps/chosen": -239.26602172851562, + "logps/rejected": -274.4757995605469, + "loss": 0.5925, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.073310375213623, + "rewards/margins": 3.1609668731689453, + "rewards/rejected": -6.234277248382568, + "step": 3960 + }, + { + "epoch": 0.77, + "learning_rate": 4.128496440641404e-07, + "logits/chosen": -2.6075682640075684, + "logits/rejected": -2.7368927001953125, + "logps/chosen": -317.2792053222656, + "logps/rejected": -312.6597900390625, + "loss": 0.3885, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.5682709217071533, + "rewards/margins": 2.961430072784424, + "rewards/rejected": -5.529700756072998, + "step": 3970 + }, + { + "epoch": 0.77, + "learning_rate": 4.124901128927878e-07, + "logits/chosen": -2.7510499954223633, + "logits/rejected": -2.742461681365967, + "logps/chosen": -364.917724609375, + "logps/rejected": -312.2942199707031, + "loss": 0.4847, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.139216661453247, + "rewards/margins": 3.680332899093628, + "rewards/rejected": -6.819549560546875, + "step": 3980 + }, + { + "epoch": 0.77, + "learning_rate": 4.1213058172143523e-07, + "logits/chosen": -2.6703734397888184, + "logits/rejected": -2.690394639968872, + "logps/chosen": -292.0287170410156, + "logps/rejected": -308.9948425292969, + "loss": 0.6375, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -5.002602577209473, + "rewards/margins": 2.602440118789673, + "rewards/rejected": -7.605042457580566, + "step": 3990 + }, + { + "epoch": 0.78, + "learning_rate": 4.1177105055008266e-07, + "logits/chosen": -2.6643238067626953, + "logits/rejected": -2.732318878173828, + "logps/chosen": -183.9958038330078, + "logps/rejected": -279.9193115234375, + "loss": 0.4616, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.369785308837891, + "rewards/margins": 3.856417417526245, + "rewards/rejected": -8.226203918457031, + "step": 4000 + }, + { + "epoch": 0.78, + "eval_logits/chosen": -2.611358642578125, + "eval_logits/rejected": -2.605623245239258, + "eval_logps/chosen": -239.37242126464844, + "eval_logps/rejected": -248.78367614746094, + "eval_loss": 0.501867949962616, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -4.513437747955322, + "eval_rewards/margins": 3.1733336448669434, + "eval_rewards/rejected": -7.686771869659424, + "eval_runtime": 140.6175, + "eval_samples_per_second": 22.444, + "eval_steps_per_second": 0.356, + "step": 4000 + }, + { + "epoch": 0.78, + "learning_rate": 4.1141151937873014e-07, + "logits/chosen": -2.7722015380859375, + "logits/rejected": -2.5845284461975098, + "logps/chosen": -261.0437927246094, + "logps/rejected": -201.52496337890625, + "loss": 0.4222, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.034439563751221, + "rewards/margins": 1.531874418258667, + "rewards/rejected": -5.566313743591309, + "step": 4010 + }, + { + "epoch": 0.78, + "learning_rate": 4.1105198820737757e-07, + "logits/chosen": -2.679579257965088, + "logits/rejected": -2.6912286281585693, + "logps/chosen": -177.82229614257812, + "logps/rejected": -212.9039764404297, + "loss": 0.4357, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.626319408416748, + "rewards/margins": 2.4286320209503174, + "rewards/rejected": -6.0549516677856445, + "step": 4020 + }, + { + "epoch": 0.78, + "learning_rate": 4.10692457036025e-07, + "logits/chosen": -2.7863266468048096, + "logits/rejected": -2.7110161781311035, + "logps/chosen": -206.7767333984375, + "logps/rejected": -229.57070922851562, + "loss": 0.4667, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.5775718688964844, + "rewards/margins": 3.1784205436706543, + "rewards/rejected": -6.755992889404297, + "step": 4030 + }, + { + "epoch": 0.78, + "learning_rate": 4.1033292586467243e-07, + "logits/chosen": -2.575326919555664, + "logits/rejected": -2.6323258876800537, + "logps/chosen": -232.1804656982422, + "logps/rejected": -306.1264953613281, + "loss": 0.6106, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.040480613708496, + "rewards/margins": 3.736806869506836, + "rewards/rejected": -8.777287483215332, + "step": 4040 + }, + { + "epoch": 0.79, + "learning_rate": 4.0997339469331985e-07, + "logits/chosen": -2.7365784645080566, + "logits/rejected": -2.7570254802703857, + "logps/chosen": -198.88853454589844, + "logps/rejected": -304.9981994628906, + "loss": 0.4671, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.8167319297790527, + "rewards/margins": 2.4269371032714844, + "rewards/rejected": -6.2436699867248535, + "step": 4050 + }, + { + "epoch": 0.79, + "learning_rate": 4.096138635219674e-07, + "logits/chosen": -2.8507704734802246, + "logits/rejected": -2.8511385917663574, + "logps/chosen": -299.0123596191406, + "logps/rejected": -364.85321044921875, + "loss": 0.4188, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.197487831115723, + "rewards/margins": 2.4825453758239746, + "rewards/rejected": -6.680032253265381, + "step": 4060 + }, + { + "epoch": 0.79, + "learning_rate": 4.092543323506148e-07, + "logits/chosen": -2.709254503250122, + "logits/rejected": -2.6763699054718018, + "logps/chosen": -238.7041473388672, + "logps/rejected": -234.9014892578125, + "loss": 0.4831, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.505122184753418, + "rewards/margins": 1.6862595081329346, + "rewards/rejected": -6.191380977630615, + "step": 4070 + }, + { + "epoch": 0.79, + "learning_rate": 4.0889480117926224e-07, + "logits/chosen": -2.572201728820801, + "logits/rejected": -2.569436550140381, + "logps/chosen": -255.99551391601562, + "logps/rejected": -219.61172485351562, + "loss": 0.6034, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.145529747009277, + "rewards/margins": 2.2147066593170166, + "rewards/rejected": -6.360236167907715, + "step": 4080 + }, + { + "epoch": 0.79, + "learning_rate": 4.0853527000790967e-07, + "logits/chosen": -2.7863736152648926, + "logits/rejected": -2.7736449241638184, + "logps/chosen": -249.95947265625, + "logps/rejected": -232.7467803955078, + "loss": 0.4329, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.905479907989502, + "rewards/margins": 1.3330347537994385, + "rewards/rejected": -4.238514423370361, + "step": 4090 + }, + { + "epoch": 0.8, + "learning_rate": 4.081757388365571e-07, + "logits/chosen": -2.7727138996124268, + "logits/rejected": -2.7523348331451416, + "logps/chosen": -290.93927001953125, + "logps/rejected": -312.92694091796875, + "loss": 0.5042, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.644190549850464, + "rewards/margins": 3.644242763519287, + "rewards/rejected": -7.2884345054626465, + "step": 4100 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.5811851024627686, + "eval_logits/rejected": -2.568887710571289, + "eval_logps/chosen": -236.53573608398438, + "eval_logps/rejected": -244.0291748046875, + "eval_loss": 0.5084269642829895, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -4.229771137237549, + "eval_rewards/margins": 2.981550931930542, + "eval_rewards/rejected": -7.211321830749512, + "eval_runtime": 141.2083, + "eval_samples_per_second": 22.35, + "eval_steps_per_second": 0.354, + "step": 4100 + }, + { + "epoch": 0.8, + "learning_rate": 4.078162076652046e-07, + "logits/chosen": -2.714445114135742, + "logits/rejected": -2.7384421825408936, + "logps/chosen": -236.87255859375, + "logps/rejected": -246.306640625, + "loss": 0.5583, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.930661678314209, + "rewards/margins": 2.5478663444519043, + "rewards/rejected": -5.478527545928955, + "step": 4110 + }, + { + "epoch": 0.8, + "learning_rate": 4.07456676493852e-07, + "logits/chosen": -2.539716958999634, + "logits/rejected": -2.547043561935425, + "logps/chosen": -247.4441680908203, + "logps/rejected": -227.5391082763672, + "loss": 0.5371, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.151979446411133, + "rewards/margins": 2.5288708209991455, + "rewards/rejected": -6.680850028991699, + "step": 4120 + }, + { + "epoch": 0.8, + "learning_rate": 4.0709714532249944e-07, + "logits/chosen": -2.676037311553955, + "logits/rejected": -2.602576494216919, + "logps/chosen": -270.4694519042969, + "logps/rejected": -240.75308227539062, + "loss": 0.4446, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -5.0017523765563965, + "rewards/margins": 2.5888845920562744, + "rewards/rejected": -7.590636253356934, + "step": 4130 + }, + { + "epoch": 0.8, + "learning_rate": 4.0673761415114686e-07, + "logits/chosen": -2.719240665435791, + "logits/rejected": -2.7139101028442383, + "logps/chosen": -270.01556396484375, + "logps/rejected": -280.0762939453125, + "loss": 0.4548, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.841872215270996, + "rewards/margins": 3.2279715538024902, + "rewards/rejected": -8.069845199584961, + "step": 4140 + }, + { + "epoch": 0.81, + "learning_rate": 4.063780829797943e-07, + "logits/chosen": -2.706204891204834, + "logits/rejected": -2.7004897594451904, + "logps/chosen": -243.80465698242188, + "logps/rejected": -284.74468994140625, + "loss": 0.6013, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.9999115467071533, + "rewards/margins": 2.851935863494873, + "rewards/rejected": -6.851847171783447, + "step": 4150 + }, + { + "epoch": 0.81, + "learning_rate": 4.060185518084418e-07, + "logits/chosen": -2.638167142868042, + "logits/rejected": -2.601055383682251, + "logps/chosen": -239.7976531982422, + "logps/rejected": -308.3616638183594, + "loss": 0.5205, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.221074104309082, + "rewards/margins": 4.369997978210449, + "rewards/rejected": -8.591072082519531, + "step": 4160 + }, + { + "epoch": 0.81, + "learning_rate": 4.0565902063708925e-07, + "logits/chosen": -2.630215644836426, + "logits/rejected": -2.6398167610168457, + "logps/chosen": -235.03231811523438, + "logps/rejected": -266.66522216796875, + "loss": 0.5691, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.963937282562256, + "rewards/margins": 2.846355438232422, + "rewards/rejected": -5.8102922439575195, + "step": 4170 + }, + { + "epoch": 0.81, + "learning_rate": 4.052994894657367e-07, + "logits/chosen": -2.666567087173462, + "logits/rejected": -2.657777786254883, + "logps/chosen": -228.3162841796875, + "logps/rejected": -261.142822265625, + "loss": 0.4964, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.731809139251709, + "rewards/margins": 3.2146503925323486, + "rewards/rejected": -6.946459770202637, + "step": 4180 + }, + { + "epoch": 0.81, + "learning_rate": 4.049399582943841e-07, + "logits/chosen": -2.7319045066833496, + "logits/rejected": -2.7684199810028076, + "logps/chosen": -270.67095947265625, + "logps/rejected": -268.091064453125, + "loss": 0.4525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.5088818073272705, + "rewards/margins": 2.3630523681640625, + "rewards/rejected": -5.871934413909912, + "step": 4190 + }, + { + "epoch": 0.82, + "learning_rate": 4.0458042712303154e-07, + "logits/chosen": -2.6466877460479736, + "logits/rejected": -2.584412097930908, + "logps/chosen": -259.6782531738281, + "logps/rejected": -246.9607391357422, + "loss": 0.5486, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.682548999786377, + "rewards/margins": 3.6152548789978027, + "rewards/rejected": -7.2978034019470215, + "step": 4200 + }, + { + "epoch": 0.82, + "eval_logits/chosen": -2.612272262573242, + "eval_logits/rejected": -2.6021535396575928, + "eval_logps/chosen": -244.8979034423828, + "eval_logps/rejected": -253.74057006835938, + "eval_loss": 0.50364089012146, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -5.065984725952148, + "eval_rewards/margins": 3.1164772510528564, + "eval_rewards/rejected": -8.182461738586426, + "eval_runtime": 140.589, + "eval_samples_per_second": 22.448, + "eval_steps_per_second": 0.356, + "step": 4200 + }, + { + "epoch": 0.82, + "learning_rate": 4.04220895951679e-07, + "logits/chosen": -2.439812183380127, + "logits/rejected": -2.43867826461792, + "logps/chosen": -285.74713134765625, + "logps/rejected": -331.9280700683594, + "loss": 0.6775, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.17250919342041, + "rewards/margins": 2.6503026485443115, + "rewards/rejected": -7.822811126708984, + "step": 4210 + }, + { + "epoch": 0.82, + "learning_rate": 4.0386136478032645e-07, + "logits/chosen": -2.5404868125915527, + "logits/rejected": -2.577773094177246, + "logps/chosen": -373.8901672363281, + "logps/rejected": -390.03424072265625, + "loss": 0.5626, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.9468588829040527, + "rewards/margins": 3.0632483959198, + "rewards/rejected": -6.01010799407959, + "step": 4220 + }, + { + "epoch": 0.82, + "learning_rate": 4.0350183360897387e-07, + "logits/chosen": -2.7753472328186035, + "logits/rejected": -2.746166706085205, + "logps/chosen": -290.0711975097656, + "logps/rejected": -211.5422821044922, + "loss": 0.5276, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.921571254730225, + "rewards/margins": 3.443016767501831, + "rewards/rejected": -8.364588737487793, + "step": 4230 + }, + { + "epoch": 0.82, + "learning_rate": 4.031423024376213e-07, + "logits/chosen": -2.8585479259490967, + "logits/rejected": -2.8426735401153564, + "logps/chosen": -298.3712463378906, + "logps/rejected": -245.10330200195312, + "loss": 0.4384, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.26866340637207, + "rewards/margins": 2.2625441551208496, + "rewards/rejected": -8.531207084655762, + "step": 4240 + }, + { + "epoch": 0.83, + "learning_rate": 4.0278277126626873e-07, + "logits/chosen": -2.639944314956665, + "logits/rejected": -2.6482975482940674, + "logps/chosen": -360.97119140625, + "logps/rejected": -359.6124267578125, + "loss": 0.5181, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.2597737312316895, + "rewards/margins": 3.0593974590301514, + "rewards/rejected": -8.319170951843262, + "step": 4250 + }, + { + "epoch": 0.83, + "learning_rate": 4.0242324009491626e-07, + "logits/chosen": -2.6124892234802246, + "logits/rejected": -2.673614025115967, + "logps/chosen": -211.0795135498047, + "logps/rejected": -296.2552490234375, + "loss": 0.4993, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": -5.773487091064453, + "rewards/margins": 0.6954655647277832, + "rewards/rejected": -6.4689531326293945, + "step": 4260 + }, + { + "epoch": 0.83, + "learning_rate": 4.020637089235637e-07, + "logits/chosen": -2.7454047203063965, + "logits/rejected": -2.6642699241638184, + "logps/chosen": -254.54605102539062, + "logps/rejected": -295.25604248046875, + "loss": 0.4963, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.68234920501709, + "rewards/margins": 2.8548038005828857, + "rewards/rejected": -10.537153244018555, + "step": 4270 + }, + { + "epoch": 0.83, + "learning_rate": 4.017041777522111e-07, + "logits/chosen": -2.8814287185668945, + "logits/rejected": -2.809417486190796, + "logps/chosen": -261.25286865234375, + "logps/rejected": -224.7142333984375, + "loss": 0.4591, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.4783172607421875, + "rewards/margins": 2.9234070777893066, + "rewards/rejected": -8.401723861694336, + "step": 4280 + }, + { + "epoch": 0.83, + "learning_rate": 4.0134464658085855e-07, + "logits/chosen": -2.738354206085205, + "logits/rejected": -2.752901077270508, + "logps/chosen": -296.1322021484375, + "logps/rejected": -322.6684265136719, + "loss": 0.4916, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.142353057861328, + "rewards/margins": 2.718418836593628, + "rewards/rejected": -6.860772132873535, + "step": 4290 + }, + { + "epoch": 0.83, + "learning_rate": 4.00985115409506e-07, + "logits/chosen": -2.760732889175415, + "logits/rejected": -2.6976306438446045, + "logps/chosen": -237.74111938476562, + "logps/rejected": -277.74078369140625, + "loss": 0.4509, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -5.939502716064453, + "rewards/margins": 2.4747462272644043, + "rewards/rejected": -8.414249420166016, + "step": 4300 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -2.6864702701568604, + "eval_logits/rejected": -2.674971580505371, + "eval_logps/chosen": -247.89425659179688, + "eval_logps/rejected": -258.3559875488281, + "eval_loss": 0.49765992164611816, + "eval_rewards/accuracies": 0.7200000286102295, + "eval_rewards/chosen": -5.365617752075195, + "eval_rewards/margins": 3.278383493423462, + "eval_rewards/rejected": -8.644001007080078, + "eval_runtime": 140.4778, + "eval_samples_per_second": 22.466, + "eval_steps_per_second": 0.356, + "step": 4300 + }, + { + "epoch": 0.84, + "learning_rate": 4.006255842381534e-07, + "logits/chosen": -2.765807867050171, + "logits/rejected": -2.749279499053955, + "logps/chosen": -310.3785095214844, + "logps/rejected": -269.0523681640625, + "loss": 0.547, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.8490264415740967, + "rewards/margins": 1.1632335186004639, + "rewards/rejected": -5.012260437011719, + "step": 4310 + }, + { + "epoch": 0.84, + "learning_rate": 4.002660530668009e-07, + "logits/chosen": -2.8433854579925537, + "logits/rejected": -2.819653034210205, + "logps/chosen": -234.0947265625, + "logps/rejected": -288.65911865234375, + "loss": 0.4926, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -5.022232532501221, + "rewards/margins": 1.3328921794891357, + "rewards/rejected": -6.355125427246094, + "step": 4320 + }, + { + "epoch": 0.84, + "learning_rate": 3.999065218954483e-07, + "logits/chosen": -2.9038548469543457, + "logits/rejected": -2.806032419204712, + "logps/chosen": -260.6301574707031, + "logps/rejected": -245.8707275390625, + "loss": 0.5484, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -5.149960517883301, + "rewards/margins": 1.059818983078003, + "rewards/rejected": -6.209778785705566, + "step": 4330 + }, + { + "epoch": 0.84, + "learning_rate": 3.9954699072409574e-07, + "logits/chosen": -2.7402634620666504, + "logits/rejected": -2.740111827850342, + "logps/chosen": -231.1941375732422, + "logps/rejected": -323.57861328125, + "loss": 0.5001, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.067840099334717, + "rewards/margins": 3.637691020965576, + "rewards/rejected": -5.705531120300293, + "step": 4340 + }, + { + "epoch": 0.84, + "learning_rate": 3.9918745955274317e-07, + "logits/chosen": -2.5543370246887207, + "logits/rejected": -2.6464576721191406, + "logps/chosen": -221.44461059570312, + "logps/rejected": -240.70834350585938, + "loss": 0.5037, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -5.656668663024902, + "rewards/margins": 2.4383351802825928, + "rewards/rejected": -8.095004081726074, + "step": 4350 + }, + { + "epoch": 0.85, + "learning_rate": 3.9882792838139065e-07, + "logits/chosen": -2.8474435806274414, + "logits/rejected": -2.7763171195983887, + "logps/chosen": -260.658935546875, + "logps/rejected": -247.59326171875, + "loss": 0.5254, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.6842408180236816, + "rewards/margins": 2.3221702575683594, + "rewards/rejected": -6.006411552429199, + "step": 4360 + }, + { + "epoch": 0.85, + "learning_rate": 3.9846839721003813e-07, + "logits/chosen": -2.4345052242279053, + "logits/rejected": -2.4207301139831543, + "logps/chosen": -309.62835693359375, + "logps/rejected": -412.50634765625, + "loss": 0.608, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -5.484894752502441, + "rewards/margins": 2.8796939849853516, + "rewards/rejected": -8.364588737487793, + "step": 4370 + }, + { + "epoch": 0.85, + "learning_rate": 3.9810886603868555e-07, + "logits/chosen": -2.634835958480835, + "logits/rejected": -2.5531816482543945, + "logps/chosen": -181.68931579589844, + "logps/rejected": -310.19842529296875, + "loss": 0.4233, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.333464622497559, + "rewards/margins": 3.435229778289795, + "rewards/rejected": -10.768695831298828, + "step": 4380 + }, + { + "epoch": 0.85, + "learning_rate": 3.97749334867333e-07, + "logits/chosen": -2.484170436859131, + "logits/rejected": -2.5105714797973633, + "logps/chosen": -210.3848419189453, + "logps/rejected": -203.63528442382812, + "loss": 0.6081, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.691134452819824, + "rewards/margins": 3.9965405464172363, + "rewards/rejected": -10.687674522399902, + "step": 4390 + }, + { + "epoch": 0.85, + "learning_rate": 3.973898036959804e-07, + "logits/chosen": -2.55855131149292, + "logits/rejected": -2.5305016040802, + "logps/chosen": -221.122314453125, + "logps/rejected": -277.065673828125, + "loss": 0.4964, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.6367926597595215, + "rewards/margins": 3.261251449584961, + "rewards/rejected": -7.898043632507324, + "step": 4400 + }, + { + "epoch": 0.85, + "eval_logits/chosen": -2.6917290687561035, + "eval_logits/rejected": -2.6843764781951904, + "eval_logps/chosen": -235.93966674804688, + "eval_logps/rejected": -246.0229949951172, + "eval_loss": 0.5051913857460022, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -4.170158863067627, + "eval_rewards/margins": 3.2405447959899902, + "eval_rewards/rejected": -7.410703182220459, + "eval_runtime": 140.8315, + "eval_samples_per_second": 22.41, + "eval_steps_per_second": 0.355, + "step": 4400 + }, + { + "epoch": 0.86, + "learning_rate": 3.9703027252462784e-07, + "logits/chosen": -2.79225754737854, + "logits/rejected": -2.832019090652466, + "logps/chosen": -209.0641632080078, + "logps/rejected": -259.63214111328125, + "loss": 0.4418, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.651867389678955, + "rewards/margins": 2.5974977016448975, + "rewards/rejected": -5.249365329742432, + "step": 4410 + }, + { + "epoch": 0.86, + "learning_rate": 3.966707413532753e-07, + "logits/chosen": -2.7867469787597656, + "logits/rejected": -2.7578327655792236, + "logps/chosen": -310.88336181640625, + "logps/rejected": -309.07525634765625, + "loss": 0.4097, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.954159140586853, + "rewards/margins": 3.5042128562927246, + "rewards/rejected": -5.458372592926025, + "step": 4420 + }, + { + "epoch": 0.86, + "learning_rate": 3.9631121018192275e-07, + "logits/chosen": -2.650923728942871, + "logits/rejected": -2.6128299236297607, + "logps/chosen": -158.23658752441406, + "logps/rejected": -221.2763671875, + "loss": 0.6934, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.7482573986053467, + "rewards/margins": 5.173362731933594, + "rewards/rejected": -7.921620845794678, + "step": 4430 + }, + { + "epoch": 0.86, + "learning_rate": 3.959516790105702e-07, + "logits/chosen": -2.666069507598877, + "logits/rejected": -2.6895437240600586, + "logps/chosen": -241.5887451171875, + "logps/rejected": -231.3760986328125, + "loss": 0.5564, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.32515287399292, + "rewards/margins": 1.9329948425292969, + "rewards/rejected": -5.258147239685059, + "step": 4440 + }, + { + "epoch": 0.86, + "learning_rate": 3.955921478392176e-07, + "logits/chosen": -2.2363100051879883, + "logits/rejected": -2.2927143573760986, + "logps/chosen": -368.7262878417969, + "logps/rejected": -312.715576171875, + "loss": 0.5752, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -5.293475151062012, + "rewards/margins": 2.2721190452575684, + "rewards/rejected": -7.5655927658081055, + "step": 4450 + }, + { + "epoch": 0.87, + "learning_rate": 3.952326166678651e-07, + "logits/chosen": -2.5693564414978027, + "logits/rejected": -2.518404722213745, + "logps/chosen": -209.572509765625, + "logps/rejected": -303.83441162109375, + "loss": 0.443, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.315739631652832, + "rewards/margins": 4.045080184936523, + "rewards/rejected": -8.360819816589355, + "step": 4460 + }, + { + "epoch": 0.87, + "learning_rate": 3.9487308549651256e-07, + "logits/chosen": -2.597867250442505, + "logits/rejected": -2.702716827392578, + "logps/chosen": -201.34945678710938, + "logps/rejected": -250.2091827392578, + "loss": 0.5413, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.714076042175293, + "rewards/margins": 5.3684258460998535, + "rewards/rejected": -11.082501411437988, + "step": 4470 + }, + { + "epoch": 0.87, + "learning_rate": 3.9451355432516e-07, + "logits/chosen": -2.6887593269348145, + "logits/rejected": -2.723721981048584, + "logps/chosen": -206.8312225341797, + "logps/rejected": -259.43182373046875, + "loss": 0.4648, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.482999324798584, + "rewards/margins": 4.759879112243652, + "rewards/rejected": -9.242877960205078, + "step": 4480 + }, + { + "epoch": 0.87, + "learning_rate": 3.941540231538074e-07, + "logits/chosen": -2.500793933868408, + "logits/rejected": -2.525344133377075, + "logps/chosen": -163.62831115722656, + "logps/rejected": -188.51388549804688, + "loss": 0.5248, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.077842712402344, + "rewards/margins": 2.760449171066284, + "rewards/rejected": -6.838292121887207, + "step": 4490 + }, + { + "epoch": 0.87, + "learning_rate": 3.9379449198245485e-07, + "logits/chosen": -2.723928928375244, + "logits/rejected": -2.648629665374756, + "logps/chosen": -233.9381866455078, + "logps/rejected": -261.3736572265625, + "loss": 0.5711, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.1998491287231445, + "rewards/margins": 0.7085366249084473, + "rewards/rejected": -4.90838623046875, + "step": 4500 + }, + { + "epoch": 0.87, + "eval_logits/chosen": -2.5880494117736816, + "eval_logits/rejected": -2.577402353286743, + "eval_logps/chosen": -242.3307647705078, + "eval_logps/rejected": -256.31182861328125, + "eval_loss": 0.4862401783466339, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -4.809268951416016, + "eval_rewards/margins": 3.6303162574768066, + "eval_rewards/rejected": -8.439584732055664, + "eval_runtime": 139.5327, + "eval_samples_per_second": 22.618, + "eval_steps_per_second": 0.358, + "step": 4500 + }, + { + "epoch": 0.88, + "learning_rate": 3.934349608111023e-07, + "logits/chosen": -2.7325246334075928, + "logits/rejected": -2.665409564971924, + "logps/chosen": -215.60775756835938, + "logps/rejected": -234.34130859375, + "loss": 0.6417, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.686572074890137, + "rewards/margins": 1.9754047393798828, + "rewards/rejected": -6.661977291107178, + "step": 4510 + }, + { + "epoch": 0.88, + "learning_rate": 3.9307542963974976e-07, + "logits/chosen": -2.7185959815979004, + "logits/rejected": -2.7288401126861572, + "logps/chosen": -230.36654663085938, + "logps/rejected": -230.0455322265625, + "loss": 0.488, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6887428760528564, + "rewards/margins": 1.8191553354263306, + "rewards/rejected": -3.5078983306884766, + "step": 4520 + }, + { + "epoch": 0.88, + "learning_rate": 3.927158984683972e-07, + "logits/chosen": -2.570035219192505, + "logits/rejected": -2.6241276264190674, + "logps/chosen": -196.22262573242188, + "logps/rejected": -198.39151000976562, + "loss": 0.5186, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.495260715484619, + "rewards/margins": 4.2800469398498535, + "rewards/rejected": -7.77530574798584, + "step": 4530 + }, + { + "epoch": 0.88, + "learning_rate": 3.923563672970446e-07, + "logits/chosen": -2.627640962600708, + "logits/rejected": -2.6806159019470215, + "logps/chosen": -315.4527587890625, + "logps/rejected": -282.22412109375, + "loss": 0.5285, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.5280659198760986, + "rewards/margins": 1.445022702217102, + "rewards/rejected": -4.973088264465332, + "step": 4540 + }, + { + "epoch": 0.88, + "learning_rate": 3.9199683612569204e-07, + "logits/chosen": -2.863534688949585, + "logits/rejected": -2.8257272243499756, + "logps/chosen": -309.8386535644531, + "logps/rejected": -236.9312286376953, + "loss": 0.5975, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1541249752044678, + "rewards/margins": 1.7545160055160522, + "rewards/rejected": -3.9086406230926514, + "step": 4550 + }, + { + "epoch": 0.89, + "learning_rate": 3.916373049543395e-07, + "logits/chosen": -2.670228958129883, + "logits/rejected": -2.6037065982818604, + "logps/chosen": -251.37203979492188, + "logps/rejected": -233.73489379882812, + "loss": 0.4691, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9182348251342773, + "rewards/margins": 3.068732261657715, + "rewards/rejected": -5.98696756362915, + "step": 4560 + }, + { + "epoch": 0.89, + "learning_rate": 3.91277773782987e-07, + "logits/chosen": -2.6178832054138184, + "logits/rejected": -2.6455113887786865, + "logps/chosen": -217.88308715820312, + "logps/rejected": -218.3448486328125, + "loss": 0.5759, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.60750675201416, + "rewards/margins": 2.5334486961364746, + "rewards/rejected": -7.140954494476318, + "step": 4570 + }, + { + "epoch": 0.89, + "learning_rate": 3.9091824261163443e-07, + "logits/chosen": -2.7345991134643555, + "logits/rejected": -2.7396881580352783, + "logps/chosen": -248.0063934326172, + "logps/rejected": -280.45037841796875, + "loss": 0.3442, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.391216278076172, + "rewards/margins": 2.4888811111450195, + "rewards/rejected": -5.880097389221191, + "step": 4580 + }, + { + "epoch": 0.89, + "learning_rate": 3.9055871144028186e-07, + "logits/chosen": -2.704272508621216, + "logits/rejected": -2.6848812103271484, + "logps/chosen": -225.310791015625, + "logps/rejected": -242.7181396484375, + "loss": 0.5143, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.301420211791992, + "rewards/margins": 3.005526065826416, + "rewards/rejected": -6.306946277618408, + "step": 4590 + }, + { + "epoch": 0.89, + "learning_rate": 3.901991802689293e-07, + "logits/chosen": -2.7596051692962646, + "logits/rejected": -2.7071454524993896, + "logps/chosen": -262.65264892578125, + "logps/rejected": -317.663818359375, + "loss": 0.5481, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.147545576095581, + "rewards/margins": 2.1675028800964355, + "rewards/rejected": -4.315048694610596, + "step": 4600 + }, + { + "epoch": 0.89, + "eval_logits/chosen": -2.633481502532959, + "eval_logits/rejected": -2.62675142288208, + "eval_logps/chosen": -228.23257446289062, + "eval_logps/rejected": -236.8095703125, + "eval_loss": 0.49345776438713074, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -3.399451971054077, + "eval_rewards/margins": 3.0899109840393066, + "eval_rewards/rejected": -6.489363193511963, + "eval_runtime": 144.96, + "eval_samples_per_second": 21.772, + "eval_steps_per_second": 0.345, + "step": 4600 + }, + { + "epoch": 0.89, + "learning_rate": 3.898396490975767e-07, + "logits/chosen": -2.6459014415740967, + "logits/rejected": -2.6414332389831543, + "logps/chosen": -184.276123046875, + "logps/rejected": -161.65939331054688, + "loss": 0.5431, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.9952762126922607, + "rewards/margins": 1.3783482313156128, + "rewards/rejected": -4.373624324798584, + "step": 4610 + }, + { + "epoch": 0.9, + "learning_rate": 3.894801179262242e-07, + "logits/chosen": -2.7866525650024414, + "logits/rejected": -2.7403743267059326, + "logps/chosen": -282.7216491699219, + "logps/rejected": -275.5063781738281, + "loss": 0.6045, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6962759494781494, + "rewards/margins": 1.5308458805084229, + "rewards/rejected": -3.2271218299865723, + "step": 4620 + }, + { + "epoch": 0.9, + "learning_rate": 3.891205867548716e-07, + "logits/chosen": -2.6336541175842285, + "logits/rejected": -2.582383155822754, + "logps/chosen": -200.53636169433594, + "logps/rejected": -205.555908203125, + "loss": 0.5376, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.658719778060913, + "rewards/margins": 3.1007637977600098, + "rewards/rejected": -4.759483337402344, + "step": 4630 + }, + { + "epoch": 0.9, + "learning_rate": 3.8876105558351905e-07, + "logits/chosen": -2.7869327068328857, + "logits/rejected": -2.701145887374878, + "logps/chosen": -231.5999298095703, + "logps/rejected": -231.6761474609375, + "loss": 0.3675, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3242385387420654, + "rewards/margins": 3.2975971698760986, + "rewards/rejected": -6.621836185455322, + "step": 4640 + }, + { + "epoch": 0.9, + "learning_rate": 3.8840152441216653e-07, + "logits/chosen": -2.629610538482666, + "logits/rejected": -2.6264729499816895, + "logps/chosen": -254.6874237060547, + "logps/rejected": -226.0127716064453, + "loss": 0.5204, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.649791240692139, + "rewards/margins": 1.5615746974945068, + "rewards/rejected": -6.211366176605225, + "step": 4650 + }, + { + "epoch": 0.9, + "learning_rate": 3.8804199324081396e-07, + "logits/chosen": -2.779430866241455, + "logits/rejected": -2.7456820011138916, + "logps/chosen": -252.6644287109375, + "logps/rejected": -230.5699005126953, + "loss": 0.4624, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.6407313346862793, + "rewards/margins": 1.3123255968093872, + "rewards/rejected": -3.953057050704956, + "step": 4660 + }, + { + "epoch": 0.91, + "learning_rate": 3.8768246206946144e-07, + "logits/chosen": -2.743870496749878, + "logits/rejected": -2.747746706008911, + "logps/chosen": -241.813720703125, + "logps/rejected": -283.9878845214844, + "loss": 0.5385, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.751024007797241, + "rewards/margins": 3.5924434661865234, + "rewards/rejected": -6.343467712402344, + "step": 4670 + }, + { + "epoch": 0.91, + "learning_rate": 3.8732293089810887e-07, + "logits/chosen": -2.7051734924316406, + "logits/rejected": -2.7314159870147705, + "logps/chosen": -260.3672790527344, + "logps/rejected": -315.00445556640625, + "loss": 0.4973, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.504934310913086, + "rewards/margins": 2.647252321243286, + "rewards/rejected": -5.152186393737793, + "step": 4680 + }, + { + "epoch": 0.91, + "learning_rate": 3.869633997267563e-07, + "logits/chosen": -2.6802845001220703, + "logits/rejected": -2.632026195526123, + "logps/chosen": -186.29025268554688, + "logps/rejected": -173.17556762695312, + "loss": 0.5464, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.9858808517456055, + "rewards/margins": 2.548701047897339, + "rewards/rejected": -7.534582614898682, + "step": 4690 + }, + { + "epoch": 0.91, + "learning_rate": 3.866038685554037e-07, + "logits/chosen": -2.7980446815490723, + "logits/rejected": -2.851337432861328, + "logps/chosen": -254.64938354492188, + "logps/rejected": -238.0263671875, + "loss": 0.4468, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.0403826236724854, + "rewards/margins": 2.884485960006714, + "rewards/rejected": -5.924868583679199, + "step": 4700 + }, + { + "epoch": 0.91, + "eval_logits/chosen": -2.7352101802825928, + "eval_logits/rejected": -2.7280097007751465, + "eval_logps/chosen": -231.85617065429688, + "eval_logps/rejected": -240.11097717285156, + "eval_loss": 0.4904622733592987, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -3.761807918548584, + "eval_rewards/margins": 3.0576953887939453, + "eval_rewards/rejected": -6.819503307342529, + "eval_runtime": 139.9939, + "eval_samples_per_second": 22.544, + "eval_steps_per_second": 0.357, + "step": 4700 + }, + { + "epoch": 0.91, + "learning_rate": 3.8624433738405115e-07, + "logits/chosen": -2.713387966156006, + "logits/rejected": -2.6253886222839355, + "logps/chosen": -260.95648193359375, + "logps/rejected": -266.45062255859375, + "loss": 0.495, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.654179096221924, + "rewards/margins": 2.9133238792419434, + "rewards/rejected": -5.567502975463867, + "step": 4710 + }, + { + "epoch": 0.92, + "learning_rate": 3.8588480621269863e-07, + "logits/chosen": -2.713736057281494, + "logits/rejected": -2.695307970046997, + "logps/chosen": -284.30584716796875, + "logps/rejected": -359.34014892578125, + "loss": 0.5401, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.404874324798584, + "rewards/margins": 2.228503465652466, + "rewards/rejected": -4.633377552032471, + "step": 4720 + }, + { + "epoch": 0.92, + "learning_rate": 3.8552527504134606e-07, + "logits/chosen": -2.799454927444458, + "logits/rejected": -2.8190231323242188, + "logps/chosen": -210.095458984375, + "logps/rejected": -219.76797485351562, + "loss": 0.5772, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.435279846191406, + "rewards/margins": 1.6794917583465576, + "rewards/rejected": -6.114771842956543, + "step": 4730 + }, + { + "epoch": 0.92, + "learning_rate": 3.851657438699935e-07, + "logits/chosen": -2.6745429039001465, + "logits/rejected": -2.686805248260498, + "logps/chosen": -253.4486541748047, + "logps/rejected": -346.912841796875, + "loss": 0.5649, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -5.1384596824646, + "rewards/margins": 5.357385158538818, + "rewards/rejected": -10.495844841003418, + "step": 4740 + }, + { + "epoch": 0.92, + "learning_rate": 3.8480621269864097e-07, + "logits/chosen": -2.7019081115722656, + "logits/rejected": -2.7100443840026855, + "logps/chosen": -223.4260711669922, + "logps/rejected": -174.53768920898438, + "loss": 0.5382, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.346592426300049, + "rewards/margins": 2.5449345111846924, + "rewards/rejected": -6.891526699066162, + "step": 4750 + }, + { + "epoch": 0.92, + "learning_rate": 3.844466815272884e-07, + "logits/chosen": -2.7890267372131348, + "logits/rejected": -2.806248188018799, + "logps/chosen": -254.773681640625, + "logps/rejected": -342.19476318359375, + "loss": 0.4703, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.618661880493164, + "rewards/margins": 2.3927292823791504, + "rewards/rejected": -5.0113911628723145, + "step": 4760 + }, + { + "epoch": 0.93, + "learning_rate": 3.840871503559359e-07, + "logits/chosen": -2.8749566078186035, + "logits/rejected": -2.8356144428253174, + "logps/chosen": -296.539306640625, + "logps/rejected": -263.4466857910156, + "loss": 0.5366, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.6513047218322754, + "rewards/margins": 0.9896014332771301, + "rewards/rejected": -3.6409058570861816, + "step": 4770 + }, + { + "epoch": 0.93, + "learning_rate": 3.837276191845833e-07, + "logits/chosen": -2.923330783843994, + "logits/rejected": -2.879647731781006, + "logps/chosen": -230.6038818359375, + "logps/rejected": -209.9868621826172, + "loss": 0.5326, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.6262035369873047, + "rewards/margins": 1.0854170322418213, + "rewards/rejected": -4.711619853973389, + "step": 4780 + }, + { + "epoch": 0.93, + "learning_rate": 3.8336808801323073e-07, + "logits/chosen": -2.9384472370147705, + "logits/rejected": -2.8726000785827637, + "logps/chosen": -249.5596160888672, + "logps/rejected": -310.5499267578125, + "loss": 0.5193, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.7462658882141113, + "rewards/margins": 3.243990421295166, + "rewards/rejected": -6.990255832672119, + "step": 4790 + }, + { + "epoch": 0.93, + "learning_rate": 3.8300855684187816e-07, + "logits/chosen": -2.7498159408569336, + "logits/rejected": -2.7551639080047607, + "logps/chosen": -280.60565185546875, + "logps/rejected": -251.5581512451172, + "loss": 0.5001, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7587432861328125, + "rewards/margins": 3.414700746536255, + "rewards/rejected": -5.173443794250488, + "step": 4800 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -2.7782437801361084, + "eval_logits/rejected": -2.7686476707458496, + "eval_logps/chosen": -239.80943298339844, + "eval_logps/rejected": -255.1630096435547, + "eval_loss": 0.48671188950538635, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -4.557135581970215, + "eval_rewards/margins": 3.7675697803497314, + "eval_rewards/rejected": -8.324706077575684, + "eval_runtime": 146.5225, + "eval_samples_per_second": 21.539, + "eval_steps_per_second": 0.341, + "step": 4800 + }, + { + "epoch": 0.93, + "learning_rate": 3.826490256705256e-07, + "logits/chosen": -2.7762343883514404, + "logits/rejected": -2.8220748901367188, + "logps/chosen": -250.0959930419922, + "logps/rejected": -290.9184875488281, + "loss": 0.4251, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.777438163757324, + "rewards/margins": 3.731992721557617, + "rewards/rejected": -8.509431838989258, + "step": 4810 + }, + { + "epoch": 0.94, + "learning_rate": 3.8228949449917307e-07, + "logits/chosen": -2.8603081703186035, + "logits/rejected": -2.8561787605285645, + "logps/chosen": -290.33416748046875, + "logps/rejected": -265.3094177246094, + "loss": 0.4681, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -5.198728084564209, + "rewards/margins": 3.6126930713653564, + "rewards/rejected": -8.811421394348145, + "step": 4820 + }, + { + "epoch": 0.94, + "learning_rate": 3.819299633278205e-07, + "logits/chosen": -2.7717111110687256, + "logits/rejected": -2.7392990589141846, + "logps/chosen": -314.5079040527344, + "logps/rejected": -277.3616943359375, + "loss": 0.5301, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.3043341636657715, + "rewards/margins": 4.1167521476745605, + "rewards/rejected": -7.42108678817749, + "step": 4830 + }, + { + "epoch": 0.94, + "learning_rate": 3.815704321564679e-07, + "logits/chosen": -2.6337149143218994, + "logits/rejected": -2.6978964805603027, + "logps/chosen": -224.68130493164062, + "logps/rejected": -257.2038269042969, + "loss": 0.5576, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9031434059143066, + "rewards/margins": 4.151448726654053, + "rewards/rejected": -7.054592132568359, + "step": 4840 + }, + { + "epoch": 0.94, + "learning_rate": 3.812109009851154e-07, + "logits/chosen": -2.9030470848083496, + "logits/rejected": -2.921684741973877, + "logps/chosen": -443.34747314453125, + "logps/rejected": -304.0016784667969, + "loss": 0.4318, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.6353559494018555, + "rewards/margins": 3.056591510772705, + "rewards/rejected": -7.691947937011719, + "step": 4850 + }, + { + "epoch": 0.94, + "learning_rate": 3.8085136981376283e-07, + "logits/chosen": -2.907698392868042, + "logits/rejected": -2.957947254180908, + "logps/chosen": -245.33419799804688, + "logps/rejected": -293.4639587402344, + "loss": 0.4891, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.125678062438965, + "rewards/margins": 4.333249568939209, + "rewards/rejected": -8.458927154541016, + "step": 4860 + }, + { + "epoch": 0.95, + "learning_rate": 3.804918386424103e-07, + "logits/chosen": -2.820817470550537, + "logits/rejected": -2.7914021015167236, + "logps/chosen": -200.84523010253906, + "logps/rejected": -241.79531860351562, + "loss": 0.4316, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -6.529050350189209, + "rewards/margins": 1.5810530185699463, + "rewards/rejected": -8.110102653503418, + "step": 4870 + }, + { + "epoch": 0.95, + "learning_rate": 3.8013230747105774e-07, + "logits/chosen": -2.6274003982543945, + "logits/rejected": -2.726317882537842, + "logps/chosen": -245.4355010986328, + "logps/rejected": -259.23809814453125, + "loss": 0.5073, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.172598838806152, + "rewards/margins": 3.381195545196533, + "rewards/rejected": -7.553793907165527, + "step": 4880 + }, + { + "epoch": 0.95, + "learning_rate": 3.7977277629970517e-07, + "logits/chosen": -2.805297374725342, + "logits/rejected": -2.8723697662353516, + "logps/chosen": -237.6421661376953, + "logps/rejected": -267.11004638671875, + "loss": 0.4277, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3887939453125, + "rewards/margins": 4.594298839569092, + "rewards/rejected": -7.983092308044434, + "step": 4890 + }, + { + "epoch": 0.95, + "learning_rate": 3.794132451283526e-07, + "logits/chosen": -2.6064090728759766, + "logits/rejected": -2.6015784740448, + "logps/chosen": -194.22018432617188, + "logps/rejected": -298.46380615234375, + "loss": 0.4342, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.538807392120361, + "rewards/margins": 3.978961229324341, + "rewards/rejected": -9.517767906188965, + "step": 4900 + }, + { + "epoch": 0.95, + "eval_logits/chosen": -2.7979583740234375, + "eval_logits/rejected": -2.7917184829711914, + "eval_logps/chosen": -240.02420043945312, + "eval_logps/rejected": -251.78773498535156, + "eval_loss": 0.49478423595428467, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -4.578611373901367, + "eval_rewards/margins": 3.408565044403076, + "eval_rewards/rejected": -7.987176418304443, + "eval_runtime": 154.9453, + "eval_samples_per_second": 20.368, + "eval_steps_per_second": 0.323, + "step": 4900 + }, + { + "epoch": 0.95, + "learning_rate": 3.79053713957e-07, + "logits/chosen": -2.7656638622283936, + "logits/rejected": -2.751854658126831, + "logps/chosen": -158.113037109375, + "logps/rejected": -208.19003295898438, + "loss": 0.4107, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.385377407073975, + "rewards/margins": 3.694204330444336, + "rewards/rejected": -8.079580307006836, + "step": 4910 + }, + { + "epoch": 0.96, + "learning_rate": 3.786941827856475e-07, + "logits/chosen": -2.855672597885132, + "logits/rejected": -2.856933832168579, + "logps/chosen": -179.34011840820312, + "logps/rejected": -202.17086791992188, + "loss": 0.5642, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.467204570770264, + "rewards/margins": 3.3610007762908936, + "rewards/rejected": -7.828205108642578, + "step": 4920 + }, + { + "epoch": 0.96, + "learning_rate": 3.7833465161429493e-07, + "logits/chosen": -2.8508670330047607, + "logits/rejected": -2.9047322273254395, + "logps/chosen": -292.36590576171875, + "logps/rejected": -293.3150329589844, + "loss": 0.5436, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.8637614250183105, + "rewards/margins": 2.5137455463409424, + "rewards/rejected": -5.377506732940674, + "step": 4930 + }, + { + "epoch": 0.96, + "learning_rate": 3.7797512044294236e-07, + "logits/chosen": -2.896697998046875, + "logits/rejected": -2.8970844745635986, + "logps/chosen": -282.89447021484375, + "logps/rejected": -278.5943298339844, + "loss": 0.5348, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.333247184753418, + "rewards/margins": 1.6520404815673828, + "rewards/rejected": -5.985286712646484, + "step": 4940 + }, + { + "epoch": 0.96, + "learning_rate": 3.7761558927158984e-07, + "logits/chosen": -2.8573505878448486, + "logits/rejected": -2.811361074447632, + "logps/chosen": -277.64776611328125, + "logps/rejected": -275.1065368652344, + "loss": 0.4906, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.285165309906006, + "rewards/margins": 3.0737969875335693, + "rewards/rejected": -7.358962059020996, + "step": 4950 + }, + { + "epoch": 0.96, + "learning_rate": 3.7725605810023727e-07, + "logits/chosen": -2.856657028198242, + "logits/rejected": -2.924285411834717, + "logps/chosen": -289.0182800292969, + "logps/rejected": -280.6921691894531, + "loss": 0.5948, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.451930284500122, + "rewards/margins": 2.1042654514312744, + "rewards/rejected": -5.5561957359313965, + "step": 4960 + }, + { + "epoch": 0.96, + "learning_rate": 3.7689652692888475e-07, + "logits/chosen": -2.9017937183380127, + "logits/rejected": -2.838625431060791, + "logps/chosen": -263.3382568359375, + "logps/rejected": -249.8639678955078, + "loss": 0.4445, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -7.6466474533081055, + "rewards/margins": 2.6505770683288574, + "rewards/rejected": -10.297224044799805, + "step": 4970 + }, + { + "epoch": 0.97, + "learning_rate": 3.765369957575322e-07, + "logits/chosen": -2.7935118675231934, + "logits/rejected": -2.7818892002105713, + "logps/chosen": -200.6241912841797, + "logps/rejected": -195.07611083984375, + "loss": 0.5492, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2856605052948, + "rewards/margins": 2.976768732070923, + "rewards/rejected": -6.262429237365723, + "step": 4980 + }, + { + "epoch": 0.97, + "learning_rate": 3.761774645861796e-07, + "logits/chosen": -2.8862531185150146, + "logits/rejected": -2.846205234527588, + "logps/chosen": -241.02072143554688, + "logps/rejected": -326.8516540527344, + "loss": 0.3793, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.1322147846221924, + "rewards/margins": 4.976840019226074, + "rewards/rejected": -8.109055519104004, + "step": 4990 + }, + { + "epoch": 0.97, + "learning_rate": 3.7581793341482703e-07, + "logits/chosen": -2.556763172149658, + "logits/rejected": -2.5429344177246094, + "logps/chosen": -323.6518859863281, + "logps/rejected": -297.3819274902344, + "loss": 0.5148, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.514235734939575, + "rewards/margins": 0.252679705619812, + "rewards/rejected": -3.7669150829315186, + "step": 5000 + }, + { + "epoch": 0.97, + "eval_logits/chosen": -2.8058435916900635, + "eval_logits/rejected": -2.8001227378845215, + "eval_logps/chosen": -245.33412170410156, + "eval_logps/rejected": -256.4447937011719, + "eval_loss": 0.4876927137374878, + "eval_rewards/accuracies": 0.6924999952316284, + "eval_rewards/chosen": -5.109607219696045, + "eval_rewards/margins": 3.3432767391204834, + "eval_rewards/rejected": -8.452884674072266, + "eval_runtime": 139.968, + "eval_samples_per_second": 22.548, + "eval_steps_per_second": 0.357, + "step": 5000 + }, + { + "epoch": 0.97, + "learning_rate": 3.7545840224347446e-07, + "logits/chosen": -2.8039920330047607, + "logits/rejected": -2.8128957748413086, + "logps/chosen": -287.8877258300781, + "logps/rejected": -318.81439208984375, + "loss": 0.4314, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -6.142752647399902, + "rewards/margins": 3.390345335006714, + "rewards/rejected": -9.533100128173828, + "step": 5010 + }, + { + "epoch": 0.97, + "learning_rate": 3.7509887107212194e-07, + "logits/chosen": -2.816234588623047, + "logits/rejected": -2.8018856048583984, + "logps/chosen": -319.5215148925781, + "logps/rejected": -235.34506225585938, + "loss": 0.5346, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.539577484130859, + "rewards/margins": 0.9457670450210571, + "rewards/rejected": -5.485343933105469, + "step": 5020 + }, + { + "epoch": 0.98, + "learning_rate": 3.7473933990076937e-07, + "logits/chosen": -2.927044153213501, + "logits/rejected": -2.916829824447632, + "logps/chosen": -262.3309020996094, + "logps/rejected": -327.40826416015625, + "loss": 0.5341, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -4.339417457580566, + "rewards/margins": 0.5237800478935242, + "rewards/rejected": -4.863197326660156, + "step": 5030 + }, + { + "epoch": 0.98, + "learning_rate": 3.7437980872941685e-07, + "logits/chosen": -2.864976406097412, + "logits/rejected": -2.893390655517578, + "logps/chosen": -260.3367004394531, + "logps/rejected": -250.24819946289062, + "loss": 0.5778, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -5.055976390838623, + "rewards/margins": 1.819750428199768, + "rewards/rejected": -6.875726222991943, + "step": 5040 + }, + { + "epoch": 0.98, + "learning_rate": 3.740202775580643e-07, + "logits/chosen": -2.8373379707336426, + "logits/rejected": -2.9002954959869385, + "logps/chosen": -280.1398010253906, + "logps/rejected": -261.08917236328125, + "loss": 0.4816, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2306816577911377, + "rewards/margins": 2.560249090194702, + "rewards/rejected": -5.79093074798584, + "step": 5050 + }, + { + "epoch": 0.98, + "learning_rate": 3.736607463867117e-07, + "logits/chosen": -2.8642070293426514, + "logits/rejected": -2.850830078125, + "logps/chosen": -265.0941162109375, + "logps/rejected": -274.12091064453125, + "loss": 0.5481, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -6.264739036560059, + "rewards/margins": 1.2267714738845825, + "rewards/rejected": -7.49151086807251, + "step": 5060 + }, + { + "epoch": 0.98, + "learning_rate": 3.733012152153592e-07, + "logits/chosen": -2.8552756309509277, + "logits/rejected": -2.7445178031921387, + "logps/chosen": -282.53375244140625, + "logps/rejected": -237.4908447265625, + "loss": 0.5685, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.214513063430786, + "rewards/margins": 1.601584792137146, + "rewards/rejected": -4.816098213195801, + "step": 5070 + }, + { + "epoch": 0.99, + "learning_rate": 3.729416840440066e-07, + "logits/chosen": -2.7386107444763184, + "logits/rejected": -2.662231206893921, + "logps/chosen": -281.3005065917969, + "logps/rejected": -332.75091552734375, + "loss": 0.5924, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.301259994506836, + "rewards/margins": 1.7015777826309204, + "rewards/rejected": -6.002837181091309, + "step": 5080 + }, + { + "epoch": 0.99, + "learning_rate": 3.7258215287265404e-07, + "logits/chosen": -2.769857883453369, + "logits/rejected": -2.7492454051971436, + "logps/chosen": -202.14151000976562, + "logps/rejected": -204.52354431152344, + "loss": 0.567, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.8968968391418457, + "rewards/margins": 1.3117891550064087, + "rewards/rejected": -5.208685874938965, + "step": 5090 + }, + { + "epoch": 0.99, + "learning_rate": 3.7222262170130147e-07, + "logits/chosen": -2.6590256690979004, + "logits/rejected": -2.704235553741455, + "logps/chosen": -207.9397735595703, + "logps/rejected": -251.90444946289062, + "loss": 0.456, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.3292739391326904, + "rewards/margins": 2.6150341033935547, + "rewards/rejected": -5.944308280944824, + "step": 5100 + }, + { + "epoch": 0.99, + "eval_logits/chosen": -2.699265718460083, + "eval_logits/rejected": -2.695173740386963, + "eval_logps/chosen": -237.08937072753906, + "eval_logps/rejected": -246.49073791503906, + "eval_loss": 0.4937092065811157, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": -4.28513240814209, + "eval_rewards/margins": 3.1723451614379883, + "eval_rewards/rejected": -7.457478046417236, + "eval_runtime": 141.0846, + "eval_samples_per_second": 22.37, + "eval_steps_per_second": 0.354, + "step": 5100 + }, + { + "epoch": 0.99, + "learning_rate": 3.718630905299489e-07, + "logits/chosen": -2.7389750480651855, + "logits/rejected": -2.814399242401123, + "logps/chosen": -208.7837371826172, + "logps/rejected": -269.72052001953125, + "loss": 0.4895, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -5.171353340148926, + "rewards/margins": 3.0589101314544678, + "rewards/rejected": -8.230263710021973, + "step": 5110 + }, + { + "epoch": 0.99, + "learning_rate": 3.715035593585964e-07, + "logits/chosen": -2.74831485748291, + "logits/rejected": -2.7159152030944824, + "logps/chosen": -200.48020935058594, + "logps/rejected": -218.4559783935547, + "loss": 0.4423, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.618405818939209, + "rewards/margins": 1.5190508365631104, + "rewards/rejected": -4.137456893920898, + "step": 5120 + }, + { + "epoch": 1.0, + "learning_rate": 3.711440281872438e-07, + "logits/chosen": -2.725182056427002, + "logits/rejected": -2.7014055252075195, + "logps/chosen": -237.24484252929688, + "logps/rejected": -207.36181640625, + "loss": 0.5052, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.4843099117279053, + "rewards/margins": 3.431879758834839, + "rewards/rejected": -5.916189670562744, + "step": 5130 + }, + { + "epoch": 1.0, + "learning_rate": 3.707844970158913e-07, + "logits/chosen": -2.784498929977417, + "logits/rejected": -2.7601518630981445, + "logps/chosen": -226.90219116210938, + "logps/rejected": -237.20742797851562, + "loss": 0.457, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.148807525634766, + "rewards/margins": 4.220892906188965, + "rewards/rejected": -8.369699478149414, + "step": 5140 + }, + { + "epoch": 1.0, + "learning_rate": 3.704249658445387e-07, + "logits/chosen": -2.5374457836151123, + "logits/rejected": -2.507659435272217, + "logps/chosen": -161.14242553710938, + "logps/rejected": -155.3887939453125, + "loss": 0.5048, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.364739418029785, + "rewards/margins": 0.5551323890686035, + "rewards/rejected": -4.9198713302612305, + "step": 5150 + }, + { + "epoch": 1.0, + "learning_rate": 3.7006543467318614e-07, + "logits/chosen": -2.706430196762085, + "logits/rejected": -2.731123924255371, + "logps/chosen": -188.61871337890625, + "logps/rejected": -239.47265625, + "loss": 0.1459, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8703367710113525, + "rewards/margins": 6.202843189239502, + "rewards/rejected": -8.073180198669434, + "step": 5160 + }, + { + "epoch": 1.0, + "learning_rate": 3.697059035018336e-07, + "logits/chosen": -2.6322195529937744, + "logits/rejected": -2.6461081504821777, + "logps/chosen": -226.6240692138672, + "logps/rejected": -246.842041015625, + "loss": 0.1756, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.154054164886475, + "rewards/margins": 4.039252758026123, + "rewards/rejected": -8.193307876586914, + "step": 5170 + }, + { + "epoch": 1.01, + "learning_rate": 3.6934637233048105e-07, + "logits/chosen": -2.731140613555908, + "logits/rejected": -2.6016902923583984, + "logps/chosen": -212.1609344482422, + "logps/rejected": -240.11758422851562, + "loss": 0.1153, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.578425884246826, + "rewards/margins": 5.413627624511719, + "rewards/rejected": -7.992053031921387, + "step": 5180 + }, + { + "epoch": 1.01, + "learning_rate": 3.689868411591285e-07, + "logits/chosen": -2.5545010566711426, + "logits/rejected": -2.479830265045166, + "logps/chosen": -180.43634033203125, + "logps/rejected": -308.0595703125, + "loss": 0.0967, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7720845937728882, + "rewards/margins": 8.198678970336914, + "rewards/rejected": -8.970763206481934, + "step": 5190 + }, + { + "epoch": 1.01, + "learning_rate": 3.686273099877759e-07, + "logits/chosen": -2.825456142425537, + "logits/rejected": -2.849297285079956, + "logps/chosen": -224.1262969970703, + "logps/rejected": -252.75198364257812, + "loss": 0.1524, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.472179889678955, + "rewards/margins": 4.5035929679870605, + "rewards/rejected": -6.975772857666016, + "step": 5200 + }, + { + "epoch": 1.01, + "eval_logits/chosen": -2.6616714000701904, + "eval_logits/rejected": -2.654366970062256, + "eval_logps/chosen": -239.63279724121094, + "eval_logps/rejected": -255.0330047607422, + "eval_loss": 0.4891924560070038, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -4.539472579956055, + "eval_rewards/margins": 3.772231340408325, + "eval_rewards/rejected": -8.311702728271484, + "eval_runtime": 139.642, + "eval_samples_per_second": 22.601, + "eval_steps_per_second": 0.358, + "step": 5200 + }, + { + "epoch": 1.01, + "learning_rate": 3.6826777881642334e-07, + "logits/chosen": -2.652801990509033, + "logits/rejected": -2.6216378211975098, + "logps/chosen": -254.4090576171875, + "logps/rejected": -257.03851318359375, + "loss": 0.14, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48289814591407776, + "rewards/margins": 7.643038749694824, + "rewards/rejected": -8.125936508178711, + "step": 5210 + }, + { + "epoch": 1.01, + "learning_rate": 3.679082476450708e-07, + "logits/chosen": -2.626802444458008, + "logits/rejected": -2.6811165809631348, + "logps/chosen": -239.24526977539062, + "logps/rejected": -310.8146057128906, + "loss": 0.1104, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9944814443588257, + "rewards/margins": 7.890805244445801, + "rewards/rejected": -9.885286331176758, + "step": 5220 + }, + { + "epoch": 1.02, + "learning_rate": 3.6754871647371824e-07, + "logits/chosen": -2.8209762573242188, + "logits/rejected": -2.722492218017578, + "logps/chosen": -300.58831787109375, + "logps/rejected": -349.81500244140625, + "loss": 0.1045, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.164588451385498, + "rewards/margins": 11.860956192016602, + "rewards/rejected": -10.696367263793945, + "step": 5230 + }, + { + "epoch": 1.02, + "learning_rate": 3.671891853023657e-07, + "logits/chosen": -2.7256627082824707, + "logits/rejected": -2.8017637729644775, + "logps/chosen": -235.8293914794922, + "logps/rejected": -365.16839599609375, + "loss": 0.1211, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6058365106582642, + "rewards/margins": 9.512593269348145, + "rewards/rejected": -11.118429183959961, + "step": 5240 + }, + { + "epoch": 1.02, + "learning_rate": 3.6682965413101315e-07, + "logits/chosen": -2.6234757900238037, + "logits/rejected": -2.639665126800537, + "logps/chosen": -295.0428466796875, + "logps/rejected": -347.30670166015625, + "loss": 0.1882, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.5189539194107056, + "rewards/margins": 8.475537300109863, + "rewards/rejected": -7.956583499908447, + "step": 5250 + }, + { + "epoch": 1.02, + "learning_rate": 3.664701229596606e-07, + "logits/chosen": -2.682283878326416, + "logits/rejected": -2.735970973968506, + "logps/chosen": -250.5039520263672, + "logps/rejected": -295.7886047363281, + "loss": 0.1059, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.94063401222229, + "rewards/margins": 11.262201309204102, + "rewards/rejected": -13.202835083007812, + "step": 5260 + }, + { + "epoch": 1.02, + "learning_rate": 3.6611059178830806e-07, + "logits/chosen": -2.609736680984497, + "logits/rejected": -2.630481243133545, + "logps/chosen": -219.67385864257812, + "logps/rejected": -265.46978759765625, + "loss": 0.1221, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.2120184898376465, + "rewards/margins": 7.351651668548584, + "rewards/rejected": -10.563669204711914, + "step": 5270 + }, + { + "epoch": 1.03, + "learning_rate": 3.657510606169555e-07, + "logits/chosen": -2.79672908782959, + "logits/rejected": -2.80228853225708, + "logps/chosen": -423.10894775390625, + "logps/rejected": -475.14849853515625, + "loss": 0.1257, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.531555414199829, + "rewards/margins": 8.50876522064209, + "rewards/rejected": -11.040319442749023, + "step": 5280 + }, + { + "epoch": 1.03, + "learning_rate": 3.653915294456029e-07, + "logits/chosen": -2.7790000438690186, + "logits/rejected": -2.7992444038391113, + "logps/chosen": -302.49298095703125, + "logps/rejected": -327.6779479980469, + "loss": 0.1831, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6189241409301758, + "rewards/margins": 6.2512640953063965, + "rewards/rejected": -7.870188236236572, + "step": 5290 + }, + { + "epoch": 1.03, + "learning_rate": 3.6503199827425034e-07, + "logits/chosen": -2.6647748947143555, + "logits/rejected": -2.7105484008789062, + "logps/chosen": -203.89161682128906, + "logps/rejected": -380.47930908203125, + "loss": 0.1647, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8709390759468079, + "rewards/margins": 9.922861099243164, + "rewards/rejected": -10.793800354003906, + "step": 5300 + }, + { + "epoch": 1.03, + "eval_logits/chosen": -2.624562978744507, + "eval_logits/rejected": -2.6140475273132324, + "eval_logps/chosen": -246.80006408691406, + "eval_logps/rejected": -267.1991271972656, + "eval_loss": 0.5094810128211975, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -5.256199359893799, + "eval_rewards/margins": 4.272119522094727, + "eval_rewards/rejected": -9.528318405151367, + "eval_runtime": 140.6563, + "eval_samples_per_second": 22.438, + "eval_steps_per_second": 0.355, + "step": 5300 + }, + { + "epoch": 1.03, + "learning_rate": 3.6467246710289777e-07, + "logits/chosen": -2.6462931632995605, + "logits/rejected": -2.597566604614258, + "logps/chosen": -309.98675537109375, + "logps/rejected": -370.7598571777344, + "loss": 0.0859, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.09952565282583237, + "rewards/margins": 10.9370698928833, + "rewards/rejected": -11.036596298217773, + "step": 5310 + }, + { + "epoch": 1.03, + "learning_rate": 3.6431293593154525e-07, + "logits/chosen": -2.8438780307769775, + "logits/rejected": -2.727034568786621, + "logps/chosen": -281.74365234375, + "logps/rejected": -366.4187316894531, + "loss": 0.1493, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0003354430082254112, + "rewards/margins": 9.595807075500488, + "rewards/rejected": -9.595470428466797, + "step": 5320 + }, + { + "epoch": 1.03, + "learning_rate": 3.639534047601927e-07, + "logits/chosen": -2.6891815662384033, + "logits/rejected": -2.644469738006592, + "logps/chosen": -254.2372283935547, + "logps/rejected": -270.01397705078125, + "loss": 0.1254, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1999330520629883, + "rewards/margins": 7.129768371582031, + "rewards/rejected": -9.32970142364502, + "step": 5330 + }, + { + "epoch": 1.04, + "learning_rate": 3.6359387358884016e-07, + "logits/chosen": -2.681487560272217, + "logits/rejected": -2.723895311355591, + "logps/chosen": -242.2496795654297, + "logps/rejected": -302.38616943359375, + "loss": 0.0976, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.672675609588623, + "rewards/margins": 8.488563537597656, + "rewards/rejected": -11.161238670349121, + "step": 5340 + }, + { + "epoch": 1.04, + "learning_rate": 3.632343424174876e-07, + "logits/chosen": -2.6785712242126465, + "logits/rejected": -2.6607227325439453, + "logps/chosen": -208.8882598876953, + "logps/rejected": -338.19293212890625, + "loss": 0.1597, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.5223472118377686, + "rewards/margins": 10.683622360229492, + "rewards/rejected": -14.205968856811523, + "step": 5350 + }, + { + "epoch": 1.04, + "learning_rate": 3.62874811246135e-07, + "logits/chosen": -2.6800436973571777, + "logits/rejected": -2.621644973754883, + "logps/chosen": -261.87939453125, + "logps/rejected": -285.1792297363281, + "loss": 0.1575, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.27321195602417, + "rewards/margins": 9.290997505187988, + "rewards/rejected": -11.564209938049316, + "step": 5360 + }, + { + "epoch": 1.04, + "learning_rate": 3.625152800747825e-07, + "logits/chosen": -2.797255754470825, + "logits/rejected": -2.804145097732544, + "logps/chosen": -273.0203857421875, + "logps/rejected": -368.51812744140625, + "loss": 0.1136, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.1146254539489746, + "rewards/margins": 8.267000198364258, + "rewards/rejected": -11.381625175476074, + "step": 5370 + }, + { + "epoch": 1.04, + "learning_rate": 3.621557489034299e-07, + "logits/chosen": -2.7852702140808105, + "logits/rejected": -2.77970027923584, + "logps/chosen": -245.98532104492188, + "logps/rejected": -254.83218383789062, + "loss": 0.1252, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.4917871057987213, + "rewards/margins": 7.187398433685303, + "rewards/rejected": -7.679185390472412, + "step": 5380 + }, + { + "epoch": 1.05, + "learning_rate": 3.6179621773207735e-07, + "logits/chosen": -2.769925355911255, + "logits/rejected": -2.785240650177002, + "logps/chosen": -269.5412292480469, + "logps/rejected": -322.44061279296875, + "loss": 0.0918, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.9030981063842773, + "rewards/margins": 6.19405460357666, + "rewards/rejected": -9.097153663635254, + "step": 5390 + }, + { + "epoch": 1.05, + "learning_rate": 3.614366865607248e-07, + "logits/chosen": -2.684915542602539, + "logits/rejected": -2.6567978858947754, + "logps/chosen": -277.91278076171875, + "logps/rejected": -273.9879150390625, + "loss": 0.1757, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.428370863199234, + "rewards/margins": 6.694431304931641, + "rewards/rejected": -7.122802734375, + "step": 5400 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -2.5418596267700195, + "eval_logits/rejected": -2.5278024673461914, + "eval_logps/chosen": -239.91001892089844, + "eval_logps/rejected": -258.1794738769531, + "eval_loss": 0.54659104347229, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -4.567195415496826, + "eval_rewards/margins": 4.059156894683838, + "eval_rewards/rejected": -8.626352310180664, + "eval_runtime": 140.799, + "eval_samples_per_second": 22.415, + "eval_steps_per_second": 0.355, + "step": 5400 + }, + { + "epoch": 1.05, + "learning_rate": 3.610771553893722e-07, + "logits/chosen": -2.7176060676574707, + "logits/rejected": -2.6798465251922607, + "logps/chosen": -231.7126007080078, + "logps/rejected": -279.63458251953125, + "loss": 0.2079, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6557159423828125, + "rewards/margins": 8.253232955932617, + "rewards/rejected": -9.908949851989746, + "step": 5410 + }, + { + "epoch": 1.05, + "learning_rate": 3.607176242180197e-07, + "logits/chosen": -2.7627878189086914, + "logits/rejected": -2.677417755126953, + "logps/chosen": -243.6680908203125, + "logps/rejected": -277.702880859375, + "loss": 0.102, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0694854259490967, + "rewards/margins": 6.533364772796631, + "rewards/rejected": -8.602849960327148, + "step": 5420 + }, + { + "epoch": 1.05, + "learning_rate": 3.603580930466671e-07, + "logits/chosen": -2.671948194503784, + "logits/rejected": -2.625758647918701, + "logps/chosen": -236.7624969482422, + "logps/rejected": -297.90087890625, + "loss": 0.1159, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.3368608951568604, + "rewards/margins": 7.368115425109863, + "rewards/rejected": -9.704974174499512, + "step": 5430 + }, + { + "epoch": 1.06, + "learning_rate": 3.599985618753146e-07, + "logits/chosen": -2.578261613845825, + "logits/rejected": -2.6791250705718994, + "logps/chosen": -328.80523681640625, + "logps/rejected": -417.37908935546875, + "loss": 0.1223, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.867095708847046, + "rewards/margins": 7.185909271240234, + "rewards/rejected": -9.053004264831543, + "step": 5440 + }, + { + "epoch": 1.06, + "learning_rate": 3.59639030703962e-07, + "logits/chosen": -2.7734498977661133, + "logits/rejected": -2.783108711242676, + "logps/chosen": -228.15652465820312, + "logps/rejected": -352.22479248046875, + "loss": 0.1249, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5476136207580566, + "rewards/margins": 11.574209213256836, + "rewards/rejected": -13.121824264526367, + "step": 5450 + }, + { + "epoch": 1.06, + "learning_rate": 3.5927949953260945e-07, + "logits/chosen": -2.6990790367126465, + "logits/rejected": -2.6678977012634277, + "logps/chosen": -218.618408203125, + "logps/rejected": -311.88580322265625, + "loss": 0.0929, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.4440550804138184, + "rewards/margins": 9.342641830444336, + "rewards/rejected": -11.786697387695312, + "step": 5460 + }, + { + "epoch": 1.06, + "learning_rate": 3.5891996836125694e-07, + "logits/chosen": -2.507127046585083, + "logits/rejected": -2.499525785446167, + "logps/chosen": -216.7612762451172, + "logps/rejected": -246.66152954101562, + "loss": 0.1814, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.382702827453613, + "rewards/margins": 4.94468355178833, + "rewards/rejected": -9.327385902404785, + "step": 5470 + }, + { + "epoch": 1.06, + "learning_rate": 3.5856043718990436e-07, + "logits/chosen": -2.820582389831543, + "logits/rejected": -2.7196497917175293, + "logps/chosen": -229.8854522705078, + "logps/rejected": -280.36907958984375, + "loss": 0.0991, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7096996307373047, + "rewards/margins": 9.916669845581055, + "rewards/rejected": -11.62636947631836, + "step": 5480 + }, + { + "epoch": 1.07, + "learning_rate": 3.582009060185518e-07, + "logits/chosen": -2.6646568775177, + "logits/rejected": -2.610426902770996, + "logps/chosen": -206.9543914794922, + "logps/rejected": -324.7890319824219, + "loss": 0.1196, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6991324424743652, + "rewards/margins": 7.090356349945068, + "rewards/rejected": -9.789487838745117, + "step": 5490 + }, + { + "epoch": 1.07, + "learning_rate": 3.578413748471992e-07, + "logits/chosen": -2.678091526031494, + "logits/rejected": -2.647273540496826, + "logps/chosen": -161.38650512695312, + "logps/rejected": -272.1451721191406, + "loss": 0.1386, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.1487624645233154, + "rewards/margins": 8.97457218170166, + "rewards/rejected": -11.123334884643555, + "step": 5500 + }, + { + "epoch": 1.07, + "eval_logits/chosen": -2.6286401748657227, + "eval_logits/rejected": -2.6168596744537354, + "eval_logps/chosen": -250.84083557128906, + "eval_logps/rejected": -274.1166687011719, + "eval_loss": 0.5161064267158508, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -5.660276412963867, + "eval_rewards/margins": 4.559793949127197, + "eval_rewards/rejected": -10.220069885253906, + "eval_runtime": 159.5045, + "eval_samples_per_second": 19.786, + "eval_steps_per_second": 0.313, + "step": 5500 + }, + { + "epoch": 1.07, + "learning_rate": 3.5748184367584665e-07, + "logits/chosen": -2.623798370361328, + "logits/rejected": -2.6287975311279297, + "logps/chosen": -245.2073974609375, + "logps/rejected": -422.5575256347656, + "loss": 0.1087, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.859710693359375, + "rewards/margins": 13.329292297363281, + "rewards/rejected": -17.189002990722656, + "step": 5510 + }, + { + "epoch": 1.07, + "learning_rate": 3.5712231250449413e-07, + "logits/chosen": -2.6746225357055664, + "logits/rejected": -2.600458860397339, + "logps/chosen": -234.039794921875, + "logps/rejected": -247.18466186523438, + "loss": 0.183, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.6866891384124756, + "rewards/margins": 8.559911727905273, + "rewards/rejected": -11.246599197387695, + "step": 5520 + }, + { + "epoch": 1.07, + "learning_rate": 3.567627813331416e-07, + "logits/chosen": -2.7399849891662598, + "logits/rejected": -2.861042022705078, + "logps/chosen": -294.953857421875, + "logps/rejected": -363.92376708984375, + "loss": 0.1597, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7596518993377686, + "rewards/margins": 12.088563919067383, + "rewards/rejected": -13.848217964172363, + "step": 5530 + }, + { + "epoch": 1.08, + "learning_rate": 3.5640325016178904e-07, + "logits/chosen": -2.8429012298583984, + "logits/rejected": -2.8388876914978027, + "logps/chosen": -271.3568115234375, + "logps/rejected": -305.71978759765625, + "loss": 0.151, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8082034587860107, + "rewards/margins": 5.346377849578857, + "rewards/rejected": -7.154581546783447, + "step": 5540 + }, + { + "epoch": 1.08, + "learning_rate": 3.5604371899043646e-07, + "logits/chosen": -2.6036064624786377, + "logits/rejected": -2.6070005893707275, + "logps/chosen": -214.0321044921875, + "logps/rejected": -242.52236938476562, + "loss": 0.1333, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6123268604278564, + "rewards/margins": 5.286976337432861, + "rewards/rejected": -7.8993024826049805, + "step": 5550 + }, + { + "epoch": 1.08, + "learning_rate": 3.556841878190839e-07, + "logits/chosen": -2.5977420806884766, + "logits/rejected": -2.471993923187256, + "logps/chosen": -239.48806762695312, + "logps/rejected": -417.43603515625, + "loss": 0.1051, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.8278324604034424, + "rewards/margins": 12.061902046203613, + "rewards/rejected": -14.889734268188477, + "step": 5560 + }, + { + "epoch": 1.08, + "learning_rate": 3.5532465664773137e-07, + "logits/chosen": -2.7525136470794678, + "logits/rejected": -2.697791814804077, + "logps/chosen": -288.59124755859375, + "logps/rejected": -317.3720703125, + "loss": 0.1616, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.814112901687622, + "rewards/margins": 6.11906623840332, + "rewards/rejected": -9.93317985534668, + "step": 5570 + }, + { + "epoch": 1.08, + "learning_rate": 3.549651254763788e-07, + "logits/chosen": -2.740294933319092, + "logits/rejected": -2.7485148906707764, + "logps/chosen": -217.1917266845703, + "logps/rejected": -359.91680908203125, + "loss": 0.161, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.977596282958984, + "rewards/margins": 10.685625076293945, + "rewards/rejected": -15.66322135925293, + "step": 5580 + }, + { + "epoch": 1.09, + "learning_rate": 3.5460559430502623e-07, + "logits/chosen": -2.4666714668273926, + "logits/rejected": -2.530104160308838, + "logps/chosen": -289.1488342285156, + "logps/rejected": -310.63861083984375, + "loss": 0.1859, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.899319648742676, + "rewards/margins": 8.151875495910645, + "rewards/rejected": -16.05119514465332, + "step": 5590 + }, + { + "epoch": 1.09, + "learning_rate": 3.5424606313367366e-07, + "logits/chosen": -2.7731902599334717, + "logits/rejected": -2.767548084259033, + "logps/chosen": -284.44293212890625, + "logps/rejected": -321.66888427734375, + "loss": 0.0945, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.389395236968994, + "rewards/margins": 8.4964017868042, + "rewards/rejected": -13.885797500610352, + "step": 5600 + }, + { + "epoch": 1.09, + "eval_logits/chosen": -2.684382677078247, + "eval_logits/rejected": -2.673522710800171, + "eval_logps/chosen": -258.7541809082031, + "eval_logps/rejected": -280.08514404296875, + "eval_loss": 0.5456523895263672, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -6.45161247253418, + "eval_rewards/margins": 4.365309238433838, + "eval_rewards/rejected": -10.81692123413086, + "eval_runtime": 154.9489, + "eval_samples_per_second": 20.368, + "eval_steps_per_second": 0.323, + "step": 5600 + }, + { + "epoch": 1.09, + "learning_rate": 3.538865319623211e-07, + "logits/chosen": -2.825899839401245, + "logits/rejected": -2.8352699279785156, + "logps/chosen": -218.1634979248047, + "logps/rejected": -232.28466796875, + "loss": 0.1343, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0848653316497803, + "rewards/margins": 5.6649088859558105, + "rewards/rejected": -8.749773979187012, + "step": 5610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5352700079096856e-07, + "logits/chosen": -2.830157995223999, + "logits/rejected": -2.7892985343933105, + "logps/chosen": -236.2996368408203, + "logps/rejected": -301.45672607421875, + "loss": 0.1166, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.197899341583252, + "rewards/margins": 6.826905727386475, + "rewards/rejected": -11.024805068969727, + "step": 5620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5316746961961604e-07, + "logits/chosen": -2.744631290435791, + "logits/rejected": -2.6552014350891113, + "logps/chosen": -238.23037719726562, + "logps/rejected": -312.98663330078125, + "loss": 0.1357, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.341963291168213, + "rewards/margins": 9.205650329589844, + "rewards/rejected": -14.547612190246582, + "step": 5630 + }, + { + "epoch": 1.09, + "learning_rate": 3.5280793844826347e-07, + "logits/chosen": -2.7977890968322754, + "logits/rejected": -2.785160541534424, + "logps/chosen": -314.02178955078125, + "logps/rejected": -372.805908203125, + "loss": 0.1376, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.443086862564087, + "rewards/margins": 10.402233123779297, + "rewards/rejected": -11.845319747924805, + "step": 5640 + }, + { + "epoch": 1.1, + "learning_rate": 3.524484072769109e-07, + "logits/chosen": -2.7562460899353027, + "logits/rejected": -2.6300792694091797, + "logps/chosen": -293.38128662109375, + "logps/rejected": -294.54876708984375, + "loss": 0.1768, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.166192054748535, + "rewards/margins": 4.648423194885254, + "rewards/rejected": -9.814615249633789, + "step": 5650 + }, + { + "epoch": 1.1, + "learning_rate": 3.5208887610555833e-07, + "logits/chosen": -2.7785027027130127, + "logits/rejected": -2.7121129035949707, + "logps/chosen": -203.39974975585938, + "logps/rejected": -292.22882080078125, + "loss": 0.1521, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.449343681335449, + "rewards/margins": 8.1329927444458, + "rewards/rejected": -12.58233642578125, + "step": 5660 + }, + { + "epoch": 1.1, + "learning_rate": 3.517293449342058e-07, + "logits/chosen": -2.847832202911377, + "logits/rejected": -2.827631950378418, + "logps/chosen": -286.88092041015625, + "logps/rejected": -437.13336181640625, + "loss": 0.1473, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8340246081352234, + "rewards/margins": 12.683631896972656, + "rewards/rejected": -13.517656326293945, + "step": 5670 + }, + { + "epoch": 1.1, + "learning_rate": 3.5136981376285324e-07, + "logits/chosen": -2.5934131145477295, + "logits/rejected": -2.6727213859558105, + "logps/chosen": -212.5731964111328, + "logps/rejected": -298.6236267089844, + "loss": 0.1356, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.224434852600098, + "rewards/margins": 9.610193252563477, + "rewards/rejected": -13.834628105163574, + "step": 5680 + }, + { + "epoch": 1.1, + "learning_rate": 3.5101028259150066e-07, + "logits/chosen": -2.627270221710205, + "logits/rejected": -2.6261956691741943, + "logps/chosen": -235.8941650390625, + "logps/rejected": -323.42633056640625, + "loss": 0.1991, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.8237156867980957, + "rewards/margins": 7.967219352722168, + "rewards/rejected": -11.790935516357422, + "step": 5690 + }, + { + "epoch": 1.11, + "learning_rate": 3.506507514201481e-07, + "logits/chosen": -2.7374954223632812, + "logits/rejected": -2.672309160232544, + "logps/chosen": -186.88565063476562, + "logps/rejected": -241.9154815673828, + "loss": 0.1396, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6796095371246338, + "rewards/margins": 5.4361772537231445, + "rewards/rejected": -7.115787506103516, + "step": 5700 + }, + { + "epoch": 1.11, + "eval_logits/chosen": -2.6859819889068604, + "eval_logits/rejected": -2.6761467456817627, + "eval_logps/chosen": -252.7005615234375, + "eval_logps/rejected": -265.2137756347656, + "eval_loss": 0.5312688946723938, + "eval_rewards/accuracies": 0.6875, + "eval_rewards/chosen": -5.846251010894775, + "eval_rewards/margins": 3.483530282974243, + "eval_rewards/rejected": -9.329782485961914, + "eval_runtime": 154.0816, + "eval_samples_per_second": 20.483, + "eval_steps_per_second": 0.325, + "step": 5700 + }, + { + "epoch": 1.11, + "learning_rate": 3.502912202487955e-07, + "logits/chosen": -2.7474465370178223, + "logits/rejected": -2.621155023574829, + "logps/chosen": -196.67938232421875, + "logps/rejected": -284.1141662597656, + "loss": 0.1468, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0635247230529785, + "rewards/margins": 7.5743865966796875, + "rewards/rejected": -10.637911796569824, + "step": 5710 + }, + { + "epoch": 1.11, + "learning_rate": 3.4993168907744295e-07, + "logits/chosen": -2.782020092010498, + "logits/rejected": -2.882028102874756, + "logps/chosen": -191.17454528808594, + "logps/rejected": -237.24221801757812, + "loss": 0.1619, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9687166213989258, + "rewards/margins": 5.077371120452881, + "rewards/rejected": -7.046088218688965, + "step": 5720 + }, + { + "epoch": 1.11, + "learning_rate": 3.495721579060905e-07, + "logits/chosen": -2.7573437690734863, + "logits/rejected": -2.7375659942626953, + "logps/chosen": -250.4689483642578, + "logps/rejected": -251.02511596679688, + "loss": 0.1015, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.799938201904297, + "rewards/margins": 8.151664733886719, + "rewards/rejected": -11.951601028442383, + "step": 5730 + }, + { + "epoch": 1.11, + "learning_rate": 3.492126267347379e-07, + "logits/chosen": -2.698831558227539, + "logits/rejected": -2.7048041820526123, + "logps/chosen": -170.64048767089844, + "logps/rejected": -225.00619506835938, + "loss": 0.1125, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.595546007156372, + "rewards/margins": 5.730384349822998, + "rewards/rejected": -8.32593059539795, + "step": 5740 + }, + { + "epoch": 1.12, + "learning_rate": 3.4885309556338534e-07, + "logits/chosen": -2.66579008102417, + "logits/rejected": -2.749051570892334, + "logps/chosen": -169.31460571289062, + "logps/rejected": -291.7335510253906, + "loss": 0.1228, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.8864710330963135, + "rewards/margins": 10.655224800109863, + "rewards/rejected": -13.541696548461914, + "step": 5750 + }, + { + "epoch": 1.12, + "learning_rate": 3.4849356439203277e-07, + "logits/chosen": -2.6012725830078125, + "logits/rejected": -2.5726571083068848, + "logps/chosen": -219.74740600585938, + "logps/rejected": -252.9861602783203, + "loss": 0.1032, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.485055923461914, + "rewards/margins": 7.809021949768066, + "rewards/rejected": -11.29407787322998, + "step": 5760 + }, + { + "epoch": 1.12, + "learning_rate": 3.481340332206802e-07, + "logits/chosen": -2.696786880493164, + "logits/rejected": -2.65468168258667, + "logps/chosen": -244.3468475341797, + "logps/rejected": -299.8504333496094, + "loss": 0.1175, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8251352310180664, + "rewards/margins": 7.654874324798584, + "rewards/rejected": -9.480011940002441, + "step": 5770 + }, + { + "epoch": 1.12, + "learning_rate": 3.477745020493277e-07, + "logits/chosen": -2.800767421722412, + "logits/rejected": -2.798677444458008, + "logps/chosen": -206.37026977539062, + "logps/rejected": -287.0042724609375, + "loss": 0.1902, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6202633380889893, + "rewards/margins": 6.435457706451416, + "rewards/rejected": -8.055720329284668, + "step": 5780 + }, + { + "epoch": 1.12, + "learning_rate": 3.474149708779751e-07, + "logits/chosen": -2.7777695655822754, + "logits/rejected": -2.772519588470459, + "logps/chosen": -231.9873809814453, + "logps/rejected": -320.47271728515625, + "loss": 0.0867, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.735586404800415, + "rewards/margins": 8.665799140930176, + "rewards/rejected": -11.401385307312012, + "step": 5790 + }, + { + "epoch": 1.13, + "learning_rate": 3.4705543970662253e-07, + "logits/chosen": -2.915607213973999, + "logits/rejected": -2.9438180923461914, + "logps/chosen": -270.58135986328125, + "logps/rejected": -388.30206298828125, + "loss": 0.0672, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.8011308908462524, + "rewards/margins": 10.29234790802002, + "rewards/rejected": -11.093478202819824, + "step": 5800 + }, + { + "epoch": 1.13, + "eval_logits/chosen": -2.7117955684661865, + "eval_logits/rejected": -2.701897382736206, + "eval_logps/chosen": -242.89669799804688, + "eval_logps/rejected": -253.0458984375, + "eval_loss": 0.5428944230079651, + "eval_rewards/accuracies": 0.6825000047683716, + "eval_rewards/chosen": -4.865864276885986, + "eval_rewards/margins": 3.2471323013305664, + "eval_rewards/rejected": -8.112995147705078, + "eval_runtime": 154.459, + "eval_samples_per_second": 20.433, + "eval_steps_per_second": 0.324, + "step": 5800 + }, + { + "epoch": 1.13, + "learning_rate": 3.4669590853526996e-07, + "logits/chosen": -2.9234933853149414, + "logits/rejected": -2.8937723636627197, + "logps/chosen": -286.6083068847656, + "logps/rejected": -358.56915283203125, + "loss": 0.1309, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.522188425064087, + "rewards/margins": 8.710290908813477, + "rewards/rejected": -10.232478141784668, + "step": 5810 + }, + { + "epoch": 1.13, + "learning_rate": 3.463363773639174e-07, + "logits/chosen": -2.7857749462127686, + "logits/rejected": -2.742250919342041, + "logps/chosen": -222.6065673828125, + "logps/rejected": -343.77618408203125, + "loss": 0.1403, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.9214558601379395, + "rewards/margins": 7.006571292877197, + "rewards/rejected": -9.92802619934082, + "step": 5820 + }, + { + "epoch": 1.13, + "learning_rate": 3.459768461925649e-07, + "logits/chosen": -2.7955751419067383, + "logits/rejected": -2.8130767345428467, + "logps/chosen": -191.033203125, + "logps/rejected": -269.64898681640625, + "loss": 0.1452, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.88899302482605, + "rewards/margins": 6.952218055725098, + "rewards/rejected": -10.841211318969727, + "step": 5830 + }, + { + "epoch": 1.13, + "learning_rate": 3.4561731502121235e-07, + "logits/chosen": -2.664130210876465, + "logits/rejected": -2.6832947731018066, + "logps/chosen": -249.15432739257812, + "logps/rejected": -322.24700927734375, + "loss": 0.137, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.849069118499756, + "rewards/margins": 8.027547836303711, + "rewards/rejected": -12.876615524291992, + "step": 5840 + }, + { + "epoch": 1.14, + "learning_rate": 3.452577838498598e-07, + "logits/chosen": -2.802269458770752, + "logits/rejected": -2.750352144241333, + "logps/chosen": -254.30221557617188, + "logps/rejected": -295.14373779296875, + "loss": 0.1311, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9655396938323975, + "rewards/margins": 7.998236179351807, + "rewards/rejected": -10.963777542114258, + "step": 5850 + }, + { + "epoch": 1.14, + "learning_rate": 3.448982526785072e-07, + "logits/chosen": -2.6600778102874756, + "logits/rejected": -2.669320583343506, + "logps/chosen": -323.6750793457031, + "logps/rejected": -399.33575439453125, + "loss": 0.1515, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.116106629371643, + "rewards/margins": 10.888641357421875, + "rewards/rejected": -12.004746437072754, + "step": 5860 + }, + { + "epoch": 1.14, + "learning_rate": 3.4453872150715463e-07, + "logits/chosen": -2.7690088748931885, + "logits/rejected": -2.690573215484619, + "logps/chosen": -266.77801513671875, + "logps/rejected": -295.81988525390625, + "loss": 0.1066, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.111680030822754, + "rewards/margins": 6.277945041656494, + "rewards/rejected": -15.389625549316406, + "step": 5870 + }, + { + "epoch": 1.14, + "learning_rate": 3.441791903358021e-07, + "logits/chosen": -2.710983991622925, + "logits/rejected": -2.7271265983581543, + "logps/chosen": -142.81861877441406, + "logps/rejected": -266.3145751953125, + "loss": 0.0928, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9935505390167236, + "rewards/margins": 7.031019687652588, + "rewards/rejected": -10.024569511413574, + "step": 5880 + }, + { + "epoch": 1.14, + "learning_rate": 3.4381965916444954e-07, + "logits/chosen": -2.831089496612549, + "logits/rejected": -2.7688608169555664, + "logps/chosen": -243.38113403320312, + "logps/rejected": -328.47723388671875, + "loss": 0.149, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.787606716156006, + "rewards/margins": 6.9904022216796875, + "rewards/rejected": -9.778009414672852, + "step": 5890 + }, + { + "epoch": 1.15, + "learning_rate": 3.4346012799309697e-07, + "logits/chosen": -2.618217945098877, + "logits/rejected": -2.5108141899108887, + "logps/chosen": -156.2168426513672, + "logps/rejected": -288.3763427734375, + "loss": 0.1091, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7162649631500244, + "rewards/margins": 9.442224502563477, + "rewards/rejected": -11.158490180969238, + "step": 5900 + }, + { + "epoch": 1.15, + "eval_logits/chosen": -2.6338837146759033, + "eval_logits/rejected": -2.6196024417877197, + "eval_logps/chosen": -258.26812744140625, + "eval_logps/rejected": -276.4388427734375, + "eval_loss": 0.5826197862625122, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": -6.403004169464111, + "eval_rewards/margins": 4.049283027648926, + "eval_rewards/rejected": -10.452287673950195, + "eval_runtime": 140.158, + "eval_samples_per_second": 22.517, + "eval_steps_per_second": 0.357, + "step": 5900 + }, + { + "epoch": 1.15, + "learning_rate": 3.431005968217444e-07, + "logits/chosen": -2.7534470558166504, + "logits/rejected": -2.795793294906616, + "logps/chosen": -180.9386444091797, + "logps/rejected": -271.31573486328125, + "loss": 0.102, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5734479427337646, + "rewards/margins": 8.237764358520508, + "rewards/rejected": -10.811212539672852, + "step": 5910 + }, + { + "epoch": 1.15, + "learning_rate": 3.427410656503918e-07, + "logits/chosen": -2.782384157180786, + "logits/rejected": -2.6864218711853027, + "logps/chosen": -208.7670135498047, + "logps/rejected": -257.57965087890625, + "loss": 0.1953, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2141883373260498, + "rewards/margins": 7.653440952301025, + "rewards/rejected": -8.86762809753418, + "step": 5920 + }, + { + "epoch": 1.15, + "learning_rate": 3.4238153447903936e-07, + "logits/chosen": -2.686586618423462, + "logits/rejected": -2.7302565574645996, + "logps/chosen": -162.2934112548828, + "logps/rejected": -286.556884765625, + "loss": 0.1371, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.947363376617432, + "rewards/margins": 9.68288803100586, + "rewards/rejected": -14.6302490234375, + "step": 5930 + }, + { + "epoch": 1.15, + "learning_rate": 3.420220033076868e-07, + "logits/chosen": -2.8979978561401367, + "logits/rejected": -2.8668251037597656, + "logps/chosen": -358.2300720214844, + "logps/rejected": -470.30316162109375, + "loss": 0.1297, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8053625226020813, + "rewards/margins": 10.31999397277832, + "rewards/rejected": -11.125356674194336, + "step": 5940 + }, + { + "epoch": 1.16, + "learning_rate": 3.416624721363342e-07, + "logits/chosen": -2.5310025215148926, + "logits/rejected": -2.5251810550689697, + "logps/chosen": -212.5767822265625, + "logps/rejected": -287.3901672363281, + "loss": 0.1589, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.835158586502075, + "rewards/margins": 7.662600517272949, + "rewards/rejected": -11.497759819030762, + "step": 5950 + }, + { + "epoch": 1.16, + "learning_rate": 3.4130294096498164e-07, + "logits/chosen": -2.6718804836273193, + "logits/rejected": -2.584641695022583, + "logps/chosen": -257.1517639160156, + "logps/rejected": -317.1195983886719, + "loss": 0.1741, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.7986114025115967, + "rewards/margins": 7.601963996887207, + "rewards/rejected": -10.400575637817383, + "step": 5960 + }, + { + "epoch": 1.16, + "learning_rate": 3.4094340979362907e-07, + "logits/chosen": -2.7640786170959473, + "logits/rejected": -2.762349843978882, + "logps/chosen": -351.98883056640625, + "logps/rejected": -444.14581298828125, + "loss": 0.1052, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.045039176940918, + "rewards/margins": 11.323704719543457, + "rewards/rejected": -13.368743896484375, + "step": 5970 + }, + { + "epoch": 1.16, + "learning_rate": 3.4058387862227655e-07, + "logits/chosen": -2.4686408042907715, + "logits/rejected": -2.4912800788879395, + "logps/chosen": -251.7490234375, + "logps/rejected": -287.3038330078125, + "loss": 0.1281, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.2519376277923584, + "rewards/margins": 7.740379333496094, + "rewards/rejected": -9.992318153381348, + "step": 5980 + }, + { + "epoch": 1.16, + "learning_rate": 3.40224347450924e-07, + "logits/chosen": -2.5413877964019775, + "logits/rejected": -2.5480809211730957, + "logps/chosen": -283.52972412109375, + "logps/rejected": -291.5387878417969, + "loss": 0.126, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6137404441833496, + "rewards/margins": 10.562993049621582, + "rewards/rejected": -14.176733016967773, + "step": 5990 + }, + { + "epoch": 1.16, + "learning_rate": 3.398648162795714e-07, + "logits/chosen": -2.7131056785583496, + "logits/rejected": -2.7502658367156982, + "logps/chosen": -225.12423706054688, + "logps/rejected": -282.3991394042969, + "loss": 0.1643, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4087367057800293, + "rewards/margins": 6.267477035522461, + "rewards/rejected": -8.676214218139648, + "step": 6000 + }, + { + "epoch": 1.16, + "eval_logits/chosen": -2.5909934043884277, + "eval_logits/rejected": -2.5798749923706055, + "eval_logps/chosen": -262.0378112792969, + "eval_logps/rejected": -283.4436950683594, + "eval_loss": 0.5502873063087463, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -6.779973983764648, + "eval_rewards/margins": 4.372798442840576, + "eval_rewards/rejected": -11.15277099609375, + "eval_runtime": 141.5414, + "eval_samples_per_second": 22.297, + "eval_steps_per_second": 0.353, + "step": 6000 + }, + { + "epoch": 1.17, + "learning_rate": 3.3950528510821883e-07, + "logits/chosen": -2.6086387634277344, + "logits/rejected": -2.444122791290283, + "logps/chosen": -221.9136962890625, + "logps/rejected": -288.4536437988281, + "loss": 0.0703, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.792720794677734, + "rewards/margins": 8.3062162399292, + "rewards/rejected": -15.098939895629883, + "step": 6010 + }, + { + "epoch": 1.17, + "learning_rate": 3.3914575393686626e-07, + "logits/chosen": -2.4521384239196777, + "logits/rejected": -2.5658693313598633, + "logps/chosen": -188.38824462890625, + "logps/rejected": -299.87847900390625, + "loss": 0.1233, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.8679680824279785, + "rewards/margins": 6.399819374084473, + "rewards/rejected": -9.267788887023926, + "step": 6020 + }, + { + "epoch": 1.17, + "learning_rate": 3.387862227655138e-07, + "logits/chosen": -2.721592903137207, + "logits/rejected": -2.8075852394104004, + "logps/chosen": -207.4969940185547, + "logps/rejected": -340.68035888671875, + "loss": 0.1924, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3890742063522339, + "rewards/margins": 13.937342643737793, + "rewards/rejected": -15.326417922973633, + "step": 6030 + }, + { + "epoch": 1.17, + "learning_rate": 3.384266915941612e-07, + "logits/chosen": -2.860846996307373, + "logits/rejected": -2.8077187538146973, + "logps/chosen": -197.4476776123047, + "logps/rejected": -238.8955841064453, + "loss": 0.182, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0824860334396362, + "rewards/margins": 6.919004917144775, + "rewards/rejected": -8.001490592956543, + "step": 6040 + }, + { + "epoch": 1.17, + "learning_rate": 3.3806716042280865e-07, + "logits/chosen": -2.6932692527770996, + "logits/rejected": -2.7947869300842285, + "logps/chosen": -201.7335968017578, + "logps/rejected": -232.18515014648438, + "loss": 0.1437, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2536561489105225, + "rewards/margins": 6.48312520980835, + "rewards/rejected": -7.736780643463135, + "step": 6050 + }, + { + "epoch": 1.18, + "learning_rate": 3.377076292514561e-07, + "logits/chosen": -2.4198594093322754, + "logits/rejected": -2.6124606132507324, + "logps/chosen": -222.1281280517578, + "logps/rejected": -239.62393188476562, + "loss": 0.1581, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.317704677581787, + "rewards/margins": 7.412570953369141, + "rewards/rejected": -10.730276107788086, + "step": 6060 + }, + { + "epoch": 1.18, + "learning_rate": 3.373480980801035e-07, + "logits/chosen": -2.9850125312805176, + "logits/rejected": -3.0179200172424316, + "logps/chosen": -232.9512939453125, + "logps/rejected": -307.1102294921875, + "loss": 0.1774, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4660871028900146, + "rewards/margins": 7.160256862640381, + "rewards/rejected": -9.626344680786133, + "step": 6070 + }, + { + "epoch": 1.18, + "learning_rate": 3.36988566908751e-07, + "logits/chosen": -2.798388957977295, + "logits/rejected": -2.8144421577453613, + "logps/chosen": -159.11264038085938, + "logps/rejected": -215.4800567626953, + "loss": 0.1828, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.0232648849487305, + "rewards/margins": 5.241990089416504, + "rewards/rejected": -10.265254020690918, + "step": 6080 + }, + { + "epoch": 1.18, + "learning_rate": 3.366290357373984e-07, + "logits/chosen": -2.8459601402282715, + "logits/rejected": -2.795527935028076, + "logps/chosen": -232.2338104248047, + "logps/rejected": -303.9999084472656, + "loss": 0.1249, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.959782600402832, + "rewards/margins": 7.116204738616943, + "rewards/rejected": -13.075986862182617, + "step": 6090 + }, + { + "epoch": 1.18, + "learning_rate": 3.3626950456604584e-07, + "logits/chosen": -2.9556031227111816, + "logits/rejected": -2.8668324947357178, + "logps/chosen": -358.3072204589844, + "logps/rejected": -479.006103515625, + "loss": 0.1091, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.777050733566284, + "rewards/margins": 11.381028175354004, + "rewards/rejected": -14.158079147338867, + "step": 6100 + }, + { + "epoch": 1.18, + "eval_logits/chosen": -2.7025187015533447, + "eval_logits/rejected": -2.690358877182007, + "eval_logps/chosen": -257.2952880859375, + "eval_logps/rejected": -274.37188720703125, + "eval_loss": 0.5208981037139893, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -6.305721282958984, + "eval_rewards/margins": 3.939871311187744, + "eval_rewards/rejected": -10.245593070983887, + "eval_runtime": 140.409, + "eval_samples_per_second": 22.477, + "eval_steps_per_second": 0.356, + "step": 6100 + }, + { + "epoch": 1.19, + "learning_rate": 3.3590997339469327e-07, + "logits/chosen": -2.724202871322632, + "logits/rejected": -2.7696611881256104, + "logps/chosen": -278.71099853515625, + "logps/rejected": -414.0077209472656, + "loss": 0.1199, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.10096763074398041, + "rewards/margins": 12.203642845153809, + "rewards/rejected": -12.304609298706055, + "step": 6110 + }, + { + "epoch": 1.19, + "learning_rate": 3.3555044222334075e-07, + "logits/chosen": -2.831987142562866, + "logits/rejected": -2.799257755279541, + "logps/chosen": -340.9770812988281, + "logps/rejected": -303.50927734375, + "loss": 0.1199, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.7819552421569824, + "rewards/margins": 8.611495018005371, + "rewards/rejected": -11.393450736999512, + "step": 6120 + }, + { + "epoch": 1.19, + "learning_rate": 3.3519091105198823e-07, + "logits/chosen": -2.8285770416259766, + "logits/rejected": -2.9155449867248535, + "logps/chosen": -222.5007781982422, + "logps/rejected": -323.23028564453125, + "loss": 0.1012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6057395935058594, + "rewards/margins": 8.996025085449219, + "rewards/rejected": -11.601765632629395, + "step": 6130 + }, + { + "epoch": 1.19, + "learning_rate": 3.3483137988063566e-07, + "logits/chosen": -2.8326056003570557, + "logits/rejected": -2.7920801639556885, + "logps/chosen": -160.5250701904297, + "logps/rejected": -212.5648956298828, + "loss": 0.1224, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.710567831993103, + "rewards/margins": 6.801335334777832, + "rewards/rejected": -8.511902809143066, + "step": 6140 + }, + { + "epoch": 1.19, + "learning_rate": 3.344718487092831e-07, + "logits/chosen": -2.9053919315338135, + "logits/rejected": -2.9459848403930664, + "logps/chosen": -260.80120849609375, + "logps/rejected": -353.389892578125, + "loss": 0.1474, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7903811931610107, + "rewards/margins": 8.168710708618164, + "rewards/rejected": -9.959092140197754, + "step": 6150 + }, + { + "epoch": 1.2, + "learning_rate": 3.341123175379305e-07, + "logits/chosen": -2.778244972229004, + "logits/rejected": -2.8497090339660645, + "logps/chosen": -294.40057373046875, + "logps/rejected": -343.45721435546875, + "loss": 0.1231, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.6665167808532715, + "rewards/margins": 8.31949234008789, + "rewards/rejected": -11.986010551452637, + "step": 6160 + }, + { + "epoch": 1.2, + "learning_rate": 3.3375278636657794e-07, + "logits/chosen": -2.754607915878296, + "logits/rejected": -2.6639506816864014, + "logps/chosen": -241.40841674804688, + "logps/rejected": -327.3476867675781, + "loss": 0.3996, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.393301486968994, + "rewards/margins": 10.620524406433105, + "rewards/rejected": -15.013826370239258, + "step": 6170 + }, + { + "epoch": 1.2, + "learning_rate": 3.333932551952254e-07, + "logits/chosen": -2.794943332672119, + "logits/rejected": -2.726440906524658, + "logps/chosen": -239.61221313476562, + "logps/rejected": -288.391357421875, + "loss": 0.1154, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0061144828796387, + "rewards/margins": 7.411135196685791, + "rewards/rejected": -8.417248725891113, + "step": 6180 + }, + { + "epoch": 1.2, + "learning_rate": 3.3303372402387285e-07, + "logits/chosen": -2.672645092010498, + "logits/rejected": -2.7147326469421387, + "logps/chosen": -184.77056884765625, + "logps/rejected": -273.83319091796875, + "loss": 0.1324, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.274429798126221, + "rewards/margins": 6.249919414520264, + "rewards/rejected": -10.524349212646484, + "step": 6190 + }, + { + "epoch": 1.2, + "learning_rate": 3.326741928525203e-07, + "logits/chosen": -2.8108317852020264, + "logits/rejected": -2.8210465908050537, + "logps/chosen": -237.4455108642578, + "logps/rejected": -306.970947265625, + "loss": 0.1128, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9643885493278503, + "rewards/margins": 7.76919412612915, + "rewards/rejected": -8.73358154296875, + "step": 6200 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -2.628884792327881, + "eval_logits/rejected": -2.6117124557495117, + "eval_logps/chosen": -260.33367919921875, + "eval_logps/rejected": -282.7896728515625, + "eval_loss": 0.5365757346153259, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -6.609562873840332, + "eval_rewards/margins": 4.477807521820068, + "eval_rewards/rejected": -11.087370872497559, + "eval_runtime": 153.0609, + "eval_samples_per_second": 20.619, + "eval_steps_per_second": 0.327, + "step": 6200 + }, + { + "epoch": 1.21, + "learning_rate": 3.323146616811677e-07, + "logits/chosen": -2.976378917694092, + "logits/rejected": -2.9297187328338623, + "logps/chosen": -285.6937561035156, + "logps/rejected": -311.7433776855469, + "loss": 0.1185, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0504415035247803, + "rewards/margins": 8.310696601867676, + "rewards/rejected": -11.361138343811035, + "step": 6210 + }, + { + "epoch": 1.21, + "learning_rate": 3.319551305098152e-07, + "logits/chosen": -2.9990527629852295, + "logits/rejected": -2.865429401397705, + "logps/chosen": -217.62661743164062, + "logps/rejected": -297.47747802734375, + "loss": 0.1025, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0945897102355957, + "rewards/margins": 7.703749179840088, + "rewards/rejected": -10.798337936401367, + "step": 6220 + }, + { + "epoch": 1.21, + "learning_rate": 3.3159559933846267e-07, + "logits/chosen": -2.7989859580993652, + "logits/rejected": -2.833512306213379, + "logps/chosen": -265.3959045410156, + "logps/rejected": -336.2202453613281, + "loss": 0.1391, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.1913533210754395, + "rewards/margins": 7.045161247253418, + "rewards/rejected": -13.236515998840332, + "step": 6230 + }, + { + "epoch": 1.21, + "learning_rate": 3.312360681671101e-07, + "logits/chosen": -2.6157264709472656, + "logits/rejected": -2.702986240386963, + "logps/chosen": -287.74078369140625, + "logps/rejected": -360.38238525390625, + "loss": 0.1243, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.966263771057129, + "rewards/margins": 7.23165225982666, + "rewards/rejected": -13.197916030883789, + "step": 6240 + }, + { + "epoch": 1.21, + "learning_rate": 3.308765369957575e-07, + "logits/chosen": -2.8906943798065186, + "logits/rejected": -2.8710110187530518, + "logps/chosen": -215.97463989257812, + "logps/rejected": -289.1537780761719, + "loss": 0.1138, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.455352783203125, + "rewards/margins": 8.1903715133667, + "rewards/rejected": -10.645724296569824, + "step": 6250 + }, + { + "epoch": 1.22, + "learning_rate": 3.3051700582440495e-07, + "logits/chosen": -2.873359203338623, + "logits/rejected": -2.771235942840576, + "logps/chosen": -279.60052490234375, + "logps/rejected": -353.4242248535156, + "loss": 0.0998, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.851736545562744, + "rewards/margins": 7.721796989440918, + "rewards/rejected": -10.57353401184082, + "step": 6260 + }, + { + "epoch": 1.22, + "learning_rate": 3.301574746530524e-07, + "logits/chosen": -2.8997817039489746, + "logits/rejected": -2.9244163036346436, + "logps/chosen": -290.8459167480469, + "logps/rejected": -302.6660461425781, + "loss": 0.1221, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.964266300201416, + "rewards/margins": 8.241120338439941, + "rewards/rejected": -13.2053861618042, + "step": 6270 + }, + { + "epoch": 1.22, + "learning_rate": 3.2979794348169986e-07, + "logits/chosen": -2.8869094848632812, + "logits/rejected": -2.841811180114746, + "logps/chosen": -318.13165283203125, + "logps/rejected": -286.990966796875, + "loss": 0.1752, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.606424808502197, + "rewards/margins": 8.159226417541504, + "rewards/rejected": -14.765650749206543, + "step": 6280 + }, + { + "epoch": 1.22, + "learning_rate": 3.294384123103473e-07, + "logits/chosen": -2.8572590351104736, + "logits/rejected": -2.9467549324035645, + "logps/chosen": -174.5487060546875, + "logps/rejected": -381.8702697753906, + "loss": 0.1166, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.804342031478882, + "rewards/margins": 12.296274185180664, + "rewards/rejected": -15.100613594055176, + "step": 6290 + }, + { + "epoch": 1.22, + "learning_rate": 3.290788811389947e-07, + "logits/chosen": -2.7387213706970215, + "logits/rejected": -2.757601261138916, + "logps/chosen": -283.9013671875, + "logps/rejected": -395.25177001953125, + "loss": 0.2009, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.1901421546936035, + "rewards/margins": 6.319852352142334, + "rewards/rejected": -10.509993553161621, + "step": 6300 + }, + { + "epoch": 1.22, + "eval_logits/chosen": -2.7317395210266113, + "eval_logits/rejected": -2.7132482528686523, + "eval_logps/chosen": -273.7660217285156, + "eval_logps/rejected": -298.4337463378906, + "eval_loss": 0.5346037745475769, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -7.952797889709473, + "eval_rewards/margins": 4.6989850997924805, + "eval_rewards/rejected": -12.651782035827637, + "eval_runtime": 153.294, + "eval_samples_per_second": 20.588, + "eval_steps_per_second": 0.326, + "step": 6300 + }, + { + "epoch": 1.23, + "learning_rate": 3.2871934996764214e-07, + "logits/chosen": -2.9453299045562744, + "logits/rejected": -2.8158774375915527, + "logps/chosen": -271.2457580566406, + "logps/rejected": -371.6581115722656, + "loss": 0.1343, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.781925201416016, + "rewards/margins": 7.037691593170166, + "rewards/rejected": -11.819616317749023, + "step": 6310 + }, + { + "epoch": 1.23, + "learning_rate": 3.283598187962896e-07, + "logits/chosen": -2.7703518867492676, + "logits/rejected": -2.811502695083618, + "logps/chosen": -256.62591552734375, + "logps/rejected": -290.80560302734375, + "loss": 0.0975, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04703056812286377, + "rewards/margins": 8.074069023132324, + "rewards/rejected": -8.121099472045898, + "step": 6320 + }, + { + "epoch": 1.23, + "learning_rate": 3.280002876249371e-07, + "logits/chosen": -2.912069320678711, + "logits/rejected": -2.840228319168091, + "logps/chosen": -263.3592834472656, + "logps/rejected": -323.1920166015625, + "loss": 0.1397, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.208311557769775, + "rewards/margins": 7.382091522216797, + "rewards/rejected": -12.590402603149414, + "step": 6330 + }, + { + "epoch": 1.23, + "learning_rate": 3.2764075645358453e-07, + "logits/chosen": -2.7262308597564697, + "logits/rejected": -2.7747886180877686, + "logps/chosen": -282.7395935058594, + "logps/rejected": -277.0335693359375, + "loss": 0.1317, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.310909748077393, + "rewards/margins": 8.081521987915039, + "rewards/rejected": -12.392431259155273, + "step": 6340 + }, + { + "epoch": 1.23, + "learning_rate": 3.2728122528223196e-07, + "logits/chosen": -2.680694103240967, + "logits/rejected": -2.773026704788208, + "logps/chosen": -326.7906494140625, + "logps/rejected": -489.86309814453125, + "loss": 0.1268, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.660059928894043, + "rewards/margins": 12.254980087280273, + "rewards/rejected": -17.9150390625, + "step": 6350 + }, + { + "epoch": 1.23, + "learning_rate": 3.269216941108794e-07, + "logits/chosen": -2.919232130050659, + "logits/rejected": -2.8801798820495605, + "logps/chosen": -232.4424285888672, + "logps/rejected": -277.77850341796875, + "loss": 0.1022, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.817774772644043, + "rewards/margins": 4.973958492279053, + "rewards/rejected": -13.791735649108887, + "step": 6360 + }, + { + "epoch": 1.24, + "learning_rate": 3.265621629395268e-07, + "logits/chosen": -2.8190665245056152, + "logits/rejected": -2.726118564605713, + "logps/chosen": -242.07022094726562, + "logps/rejected": -326.2730407714844, + "loss": 0.142, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.421662330627441, + "rewards/margins": 7.453909873962402, + "rewards/rejected": -14.875570297241211, + "step": 6370 + }, + { + "epoch": 1.24, + "learning_rate": 3.262026317681743e-07, + "logits/chosen": -2.7926979064941406, + "logits/rejected": -2.78657865524292, + "logps/chosen": -240.29647827148438, + "logps/rejected": -347.209716796875, + "loss": 0.135, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.99425745010376, + "rewards/margins": 10.459344863891602, + "rewards/rejected": -15.45360279083252, + "step": 6380 + }, + { + "epoch": 1.24, + "learning_rate": 3.258431005968217e-07, + "logits/chosen": -2.7790284156799316, + "logits/rejected": -2.9192051887512207, + "logps/chosen": -236.951171875, + "logps/rejected": -307.77569580078125, + "loss": 0.1336, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.672762393951416, + "rewards/margins": 8.64220905303955, + "rewards/rejected": -16.314971923828125, + "step": 6390 + }, + { + "epoch": 1.24, + "learning_rate": 3.2548356942546915e-07, + "logits/chosen": -2.873333215713501, + "logits/rejected": -2.9237146377563477, + "logps/chosen": -221.26004028320312, + "logps/rejected": -385.15484619140625, + "loss": 0.1862, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -5.8607025146484375, + "rewards/margins": 7.9548821449279785, + "rewards/rejected": -13.815584182739258, + "step": 6400 + }, + { + "epoch": 1.24, + "eval_logits/chosen": -2.6900055408477783, + "eval_logits/rejected": -2.674002170562744, + "eval_logps/chosen": -279.8787841796875, + "eval_logps/rejected": -304.4410400390625, + "eval_loss": 0.5409572720527649, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -8.56407356262207, + "eval_rewards/margins": 4.688436031341553, + "eval_rewards/rejected": -13.252508163452148, + "eval_runtime": 151.2682, + "eval_samples_per_second": 20.864, + "eval_steps_per_second": 0.331, + "step": 6400 + }, + { + "epoch": 1.24, + "learning_rate": 3.251240382541166e-07, + "logits/chosen": -2.801133632659912, + "logits/rejected": -2.825709342956543, + "logps/chosen": -246.2862548828125, + "logps/rejected": -319.25164794921875, + "loss": 0.1005, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.7105536460876465, + "rewards/margins": 8.40849781036377, + "rewards/rejected": -16.119050979614258, + "step": 6410 + }, + { + "epoch": 1.25, + "learning_rate": 3.2476450708276406e-07, + "logits/chosen": -2.862619400024414, + "logits/rejected": -2.7658121585845947, + "logps/chosen": -333.66162109375, + "logps/rejected": -423.8462829589844, + "loss": 0.1263, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.267934322357178, + "rewards/margins": 12.18339729309082, + "rewards/rejected": -17.451330184936523, + "step": 6420 + }, + { + "epoch": 1.25, + "learning_rate": 3.2440497591141154e-07, + "logits/chosen": -2.6920571327209473, + "logits/rejected": -2.769468307495117, + "logps/chosen": -226.3386688232422, + "logps/rejected": -324.83135986328125, + "loss": 0.1015, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.818525314331055, + "rewards/margins": 8.952909469604492, + "rewards/rejected": -17.771434783935547, + "step": 6430 + }, + { + "epoch": 1.25, + "learning_rate": 3.2404544474005897e-07, + "logits/chosen": -2.8283748626708984, + "logits/rejected": -2.7240800857543945, + "logps/chosen": -202.03224182128906, + "logps/rejected": -329.03814697265625, + "loss": 0.0861, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.734574317932129, + "rewards/margins": 12.262895584106445, + "rewards/rejected": -16.997468948364258, + "step": 6440 + }, + { + "epoch": 1.25, + "learning_rate": 3.236859135687064e-07, + "logits/chosen": -2.907532215118408, + "logits/rejected": -2.916665554046631, + "logps/chosen": -274.86962890625, + "logps/rejected": -438.1475524902344, + "loss": 0.1614, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.988499402999878, + "rewards/margins": 10.746040344238281, + "rewards/rejected": -14.734539985656738, + "step": 6450 + }, + { + "epoch": 1.25, + "learning_rate": 3.233263823973538e-07, + "logits/chosen": -2.909522294998169, + "logits/rejected": -2.8612873554229736, + "logps/chosen": -207.0958251953125, + "logps/rejected": -260.4202575683594, + "loss": 0.1597, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.873379707336426, + "rewards/margins": 6.500428676605225, + "rewards/rejected": -14.373807907104492, + "step": 6460 + }, + { + "epoch": 1.26, + "learning_rate": 3.2296685122600125e-07, + "logits/chosen": -2.760603427886963, + "logits/rejected": -2.8077752590179443, + "logps/chosen": -241.1212158203125, + "logps/rejected": -393.91729736328125, + "loss": 0.1835, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.09355092048645, + "rewards/margins": 10.064352035522461, + "rewards/rejected": -12.157902717590332, + "step": 6470 + }, + { + "epoch": 1.26, + "learning_rate": 3.2260732005464873e-07, + "logits/chosen": -2.8883023262023926, + "logits/rejected": -2.870262622833252, + "logps/chosen": -241.0962371826172, + "logps/rejected": -357.7778015136719, + "loss": 0.0931, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.14364352822303772, + "rewards/margins": 9.347768783569336, + "rewards/rejected": -9.491412162780762, + "step": 6480 + }, + { + "epoch": 1.26, + "learning_rate": 3.2224778888329616e-07, + "logits/chosen": -2.3451931476593018, + "logits/rejected": -2.444458484649658, + "logps/chosen": -228.77822875976562, + "logps/rejected": -273.86859130859375, + "loss": 0.2252, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.470632553100586, + "rewards/margins": 8.284172058105469, + "rewards/rejected": -11.754804611206055, + "step": 6490 + }, + { + "epoch": 1.26, + "learning_rate": 3.218882577119436e-07, + "logits/chosen": -2.957517147064209, + "logits/rejected": -2.899106502532959, + "logps/chosen": -221.6002655029297, + "logps/rejected": -266.9825134277344, + "loss": 0.137, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1074684858322144, + "rewards/margins": 7.099783420562744, + "rewards/rejected": -8.207250595092773, + "step": 6500 + }, + { + "epoch": 1.26, + "eval_logits/chosen": -2.7445404529571533, + "eval_logits/rejected": -2.728905439376831, + "eval_logps/chosen": -246.21923828125, + "eval_logps/rejected": -263.984130859375, + "eval_loss": 0.6052098274230957, + "eval_rewards/accuracies": 0.6850000023841858, + "eval_rewards/chosen": -5.198116302490234, + "eval_rewards/margins": 4.008700370788574, + "eval_rewards/rejected": -9.206816673278809, + "eval_runtime": 154.6629, + "eval_samples_per_second": 20.406, + "eval_steps_per_second": 0.323, + "step": 6500 + }, + { + "epoch": 1.26, + "learning_rate": 3.2152872654059107e-07, + "logits/chosen": -2.788585662841797, + "logits/rejected": -2.778353214263916, + "logps/chosen": -207.0595703125, + "logps/rejected": -314.166259765625, + "loss": 0.4933, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.369551420211792, + "rewards/margins": 6.188743591308594, + "rewards/rejected": -9.558295249938965, + "step": 6510 + }, + { + "epoch": 1.27, + "learning_rate": 3.211691953692385e-07, + "logits/chosen": -2.822997570037842, + "logits/rejected": -2.736123561859131, + "logps/chosen": -171.63253784179688, + "logps/rejected": -229.5925750732422, + "loss": 0.1272, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.5562565326690674, + "rewards/margins": 5.356114387512207, + "rewards/rejected": -7.9123711585998535, + "step": 6520 + }, + { + "epoch": 1.27, + "learning_rate": 3.20809664197886e-07, + "logits/chosen": -2.656463861465454, + "logits/rejected": -2.6478307247161865, + "logps/chosen": -233.30224609375, + "logps/rejected": -297.5663146972656, + "loss": 0.1435, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.6849629878997803, + "rewards/margins": 9.462038040161133, + "rewards/rejected": -12.147003173828125, + "step": 6530 + }, + { + "epoch": 1.27, + "learning_rate": 3.204501330265334e-07, + "logits/chosen": -2.8856730461120605, + "logits/rejected": -2.8542838096618652, + "logps/chosen": -289.2203674316406, + "logps/rejected": -255.3348846435547, + "loss": 0.1301, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.026634216308594, + "rewards/margins": 5.3799614906311035, + "rewards/rejected": -9.406596183776855, + "step": 6540 + }, + { + "epoch": 1.27, + "learning_rate": 3.2009060185518083e-07, + "logits/chosen": -2.7601351737976074, + "logits/rejected": -2.740595817565918, + "logps/chosen": -225.09317016601562, + "logps/rejected": -336.46881103515625, + "loss": 0.1418, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.083618402481079, + "rewards/margins": 6.715296268463135, + "rewards/rejected": -8.798913955688477, + "step": 6550 + }, + { + "epoch": 1.27, + "learning_rate": 3.1973107068382826e-07, + "logits/chosen": -2.764744997024536, + "logits/rejected": -2.72230863571167, + "logps/chosen": -238.1551971435547, + "logps/rejected": -305.114990234375, + "loss": 0.249, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.148994445800781, + "rewards/margins": 5.854582786560059, + "rewards/rejected": -10.00357723236084, + "step": 6560 + }, + { + "epoch": 1.28, + "learning_rate": 3.193715395124757e-07, + "logits/chosen": -2.6915459632873535, + "logits/rejected": -2.642277240753174, + "logps/chosen": -354.4752502441406, + "logps/rejected": -352.70574951171875, + "loss": 0.1459, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.953001499176025, + "rewards/margins": 6.105034351348877, + "rewards/rejected": -11.058036804199219, + "step": 6570 + }, + { + "epoch": 1.28, + "learning_rate": 3.1901200834112317e-07, + "logits/chosen": -2.693657636642456, + "logits/rejected": -2.691105842590332, + "logps/chosen": -147.98080444335938, + "logps/rejected": -253.9555206298828, + "loss": 0.203, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7674566507339478, + "rewards/margins": 6.51474666595459, + "rewards/rejected": -8.282205581665039, + "step": 6580 + }, + { + "epoch": 1.28, + "learning_rate": 3.186524771697706e-07, + "logits/chosen": -2.834761381149292, + "logits/rejected": -2.7642300128936768, + "logps/chosen": -329.1448974609375, + "logps/rejected": -369.2736511230469, + "loss": 0.1137, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.5362348556518555, + "rewards/margins": 9.899274826049805, + "rewards/rejected": -15.435510635375977, + "step": 6590 + }, + { + "epoch": 1.28, + "learning_rate": 3.18292945998418e-07, + "logits/chosen": -2.750600814819336, + "logits/rejected": -2.6373486518859863, + "logps/chosen": -234.7852020263672, + "logps/rejected": -313.743408203125, + "loss": 0.2336, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.088565826416016, + "rewards/margins": 8.900773048400879, + "rewards/rejected": -12.989338874816895, + "step": 6600 + }, + { + "epoch": 1.28, + "eval_logits/chosen": -2.6337947845458984, + "eval_logits/rejected": -2.618746042251587, + "eval_logps/chosen": -256.7079162597656, + "eval_logps/rejected": -277.7032775878906, + "eval_loss": 0.5167534947395325, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": -6.246983051300049, + "eval_rewards/margins": 4.331745624542236, + "eval_rewards/rejected": -10.578729629516602, + "eval_runtime": 153.9631, + "eval_samples_per_second": 20.498, + "eval_steps_per_second": 0.325, + "step": 6600 + }, + { + "epoch": 1.28, + "learning_rate": 3.179334148270655e-07, + "logits/chosen": -2.6361284255981445, + "logits/rejected": -2.5865797996520996, + "logps/chosen": -252.5102996826172, + "logps/rejected": -332.4734191894531, + "loss": 0.1181, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4548401832580566, + "rewards/margins": 9.476263046264648, + "rewards/rejected": -11.931102752685547, + "step": 6610 + }, + { + "epoch": 1.29, + "learning_rate": 3.1757388365571294e-07, + "logits/chosen": -2.4050495624542236, + "logits/rejected": -2.5422911643981934, + "logps/chosen": -159.18374633789062, + "logps/rejected": -296.66363525390625, + "loss": 0.1392, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -5.344342231750488, + "rewards/margins": 6.840249538421631, + "rewards/rejected": -12.184591293334961, + "step": 6620 + }, + { + "epoch": 1.29, + "learning_rate": 3.172143524843604e-07, + "logits/chosen": -2.795285701751709, + "logits/rejected": -2.7838597297668457, + "logps/chosen": -211.2665557861328, + "logps/rejected": -280.879638671875, + "loss": 0.1005, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.351258754730225, + "rewards/margins": 5.330613136291504, + "rewards/rejected": -11.681873321533203, + "step": 6630 + }, + { + "epoch": 1.29, + "learning_rate": 3.1685482131300784e-07, + "logits/chosen": -2.852144956588745, + "logits/rejected": -2.7763545513153076, + "logps/chosen": -213.666259765625, + "logps/rejected": -256.70977783203125, + "loss": 0.1375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.642565727233887, + "rewards/margins": 6.521868705749512, + "rewards/rejected": -11.164435386657715, + "step": 6640 + }, + { + "epoch": 1.29, + "learning_rate": 3.1649529014165527e-07, + "logits/chosen": -2.8511900901794434, + "logits/rejected": -2.7841453552246094, + "logps/chosen": -233.0352325439453, + "logps/rejected": -331.7823486328125, + "loss": 0.1123, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6883093118667603, + "rewards/margins": 9.369901657104492, + "rewards/rejected": -11.058210372924805, + "step": 6650 + }, + { + "epoch": 1.29, + "learning_rate": 3.161357589703027e-07, + "logits/chosen": -2.709275007247925, + "logits/rejected": -2.7078492641448975, + "logps/chosen": -295.0395812988281, + "logps/rejected": -360.08660888671875, + "loss": 0.1436, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.444880962371826, + "rewards/margins": 8.707773208618164, + "rewards/rejected": -11.152654647827148, + "step": 6660 + }, + { + "epoch": 1.29, + "learning_rate": 3.1577622779895013e-07, + "logits/chosen": -2.689150333404541, + "logits/rejected": -2.7489943504333496, + "logps/chosen": -132.4023895263672, + "logps/rejected": -272.9388122558594, + "loss": 0.1077, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.5702412128448486, + "rewards/margins": 9.820098876953125, + "rewards/rejected": -13.390339851379395, + "step": 6670 + }, + { + "epoch": 1.3, + "learning_rate": 3.154166966275976e-07, + "logits/chosen": -2.8572585582733154, + "logits/rejected": -2.8273632526397705, + "logps/chosen": -232.9200439453125, + "logps/rejected": -354.3174743652344, + "loss": 0.122, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9304556846618652, + "rewards/margins": 8.08757209777832, + "rewards/rejected": -11.018026351928711, + "step": 6680 + }, + { + "epoch": 1.3, + "learning_rate": 3.1505716545624504e-07, + "logits/chosen": -2.671022415161133, + "logits/rejected": -2.7371838092803955, + "logps/chosen": -218.63693237304688, + "logps/rejected": -334.6982727050781, + "loss": 0.1513, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0609169006347656, + "rewards/margins": 7.168398380279541, + "rewards/rejected": -10.229315757751465, + "step": 6690 + }, + { + "epoch": 1.3, + "learning_rate": 3.1469763428489246e-07, + "logits/chosen": -2.7181735038757324, + "logits/rejected": -2.721872329711914, + "logps/chosen": -299.78741455078125, + "logps/rejected": -276.9378967285156, + "loss": 0.1341, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6885970830917358, + "rewards/margins": 7.510015964508057, + "rewards/rejected": -9.198613166809082, + "step": 6700 + }, + { + "epoch": 1.3, + "eval_logits/chosen": -2.7110984325408936, + "eval_logits/rejected": -2.7003793716430664, + "eval_logps/chosen": -255.26902770996094, + "eval_logps/rejected": -278.4936828613281, + "eval_loss": 0.5187221765518188, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -6.103095531463623, + "eval_rewards/margins": 4.554676532745361, + "eval_rewards/rejected": -10.657772064208984, + "eval_runtime": 140.0492, + "eval_samples_per_second": 22.535, + "eval_steps_per_second": 0.357, + "step": 6700 + }, + { + "epoch": 1.3, + "learning_rate": 3.1433810311353994e-07, + "logits/chosen": -2.7828149795532227, + "logits/rejected": -2.755964756011963, + "logps/chosen": -223.8802032470703, + "logps/rejected": -281.24676513671875, + "loss": 0.078, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.5590885877609253, + "rewards/margins": 12.588213920593262, + "rewards/rejected": -12.029123306274414, + "step": 6710 + }, + { + "epoch": 1.3, + "learning_rate": 3.1397857194218737e-07, + "logits/chosen": -2.7480688095092773, + "logits/rejected": -2.656557559967041, + "logps/chosen": -174.9902801513672, + "logps/rejected": -338.25262451171875, + "loss": 0.1178, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.34710431098938, + "rewards/margins": 9.372222900390625, + "rewards/rejected": -11.719327926635742, + "step": 6720 + }, + { + "epoch": 1.31, + "learning_rate": 3.1361904077083485e-07, + "logits/chosen": -2.763228178024292, + "logits/rejected": -2.7415266036987305, + "logps/chosen": -296.6454162597656, + "logps/rejected": -340.6575927734375, + "loss": 0.0915, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6722679138183594, + "rewards/margins": 8.62702465057373, + "rewards/rejected": -10.299293518066406, + "step": 6730 + }, + { + "epoch": 1.31, + "learning_rate": 3.132595095994823e-07, + "logits/chosen": -2.5516011714935303, + "logits/rejected": -2.680459499359131, + "logps/chosen": -262.86090087890625, + "logps/rejected": -273.52545166015625, + "loss": 0.1223, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9065345525741577, + "rewards/margins": 5.898622989654541, + "rewards/rejected": -7.805157661437988, + "step": 6740 + }, + { + "epoch": 1.31, + "learning_rate": 3.128999784281297e-07, + "logits/chosen": -2.7771332263946533, + "logits/rejected": -2.7409467697143555, + "logps/chosen": -193.655517578125, + "logps/rejected": -327.9877624511719, + "loss": 0.1128, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9151012897491455, + "rewards/margins": 7.28466796875, + "rewards/rejected": -10.199769973754883, + "step": 6750 + }, + { + "epoch": 1.31, + "learning_rate": 3.1254044725677714e-07, + "logits/chosen": -2.7811279296875, + "logits/rejected": -2.8143246173858643, + "logps/chosen": -252.03726196289062, + "logps/rejected": -291.01361083984375, + "loss": 0.0802, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.3618974685668945, + "rewards/margins": 7.68868350982666, + "rewards/rejected": -10.050580978393555, + "step": 6760 + }, + { + "epoch": 1.31, + "learning_rate": 3.1218091608542456e-07, + "logits/chosen": -2.7577600479125977, + "logits/rejected": -2.7990260124206543, + "logps/chosen": -322.322265625, + "logps/rejected": -342.3220520019531, + "loss": 0.1192, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.24971342086792, + "rewards/margins": 9.047224044799805, + "rewards/rejected": -11.2969388961792, + "step": 6770 + }, + { + "epoch": 1.32, + "learning_rate": 3.1182138491407204e-07, + "logits/chosen": -2.6567625999450684, + "logits/rejected": -2.65261173248291, + "logps/chosen": -270.6991271972656, + "logps/rejected": -307.8583068847656, + "loss": 0.1087, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9129661321640015, + "rewards/margins": 11.324995040893555, + "rewards/rejected": -13.237958908081055, + "step": 6780 + }, + { + "epoch": 1.32, + "learning_rate": 3.1146185374271947e-07, + "logits/chosen": -2.7333474159240723, + "logits/rejected": -2.825624465942383, + "logps/chosen": -301.9677734375, + "logps/rejected": -447.29510498046875, + "loss": 0.1645, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.511021614074707, + "rewards/margins": 14.370841979980469, + "rewards/rejected": -14.881861686706543, + "step": 6790 + }, + { + "epoch": 1.32, + "learning_rate": 3.111023225713669e-07, + "logits/chosen": -2.7338008880615234, + "logits/rejected": -2.703036308288574, + "logps/chosen": -205.5673370361328, + "logps/rejected": -247.8675537109375, + "loss": 0.0945, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.1913504600524902, + "rewards/margins": 6.280646800994873, + "rewards/rejected": -9.471997261047363, + "step": 6800 + }, + { + "epoch": 1.32, + "eval_logits/chosen": -2.5996198654174805, + "eval_logits/rejected": -2.5874640941619873, + "eval_logps/chosen": -262.0834655761719, + "eval_logps/rejected": -285.2012023925781, + "eval_loss": 0.5339722633361816, + "eval_rewards/accuracies": 0.7174999713897705, + "eval_rewards/chosen": -6.784541606903076, + "eval_rewards/margins": 4.543982982635498, + "eval_rewards/rejected": -11.32852554321289, + "eval_runtime": 140.1899, + "eval_samples_per_second": 22.512, + "eval_steps_per_second": 0.357, + "step": 6800 + }, + { + "epoch": 1.32, + "learning_rate": 3.107427914000144e-07, + "logits/chosen": -2.6865274906158447, + "logits/rejected": -2.637275218963623, + "logps/chosen": -201.00270080566406, + "logps/rejected": -310.13604736328125, + "loss": 0.1135, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.912355899810791, + "rewards/margins": 10.1017484664917, + "rewards/rejected": -14.014103889465332, + "step": 6810 + }, + { + "epoch": 1.32, + "learning_rate": 3.103832602286618e-07, + "logits/chosen": -2.5592846870422363, + "logits/rejected": -2.534046173095703, + "logps/chosen": -326.1671142578125, + "logps/rejected": -284.32733154296875, + "loss": 0.1286, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1637768745422363, + "rewards/margins": 8.271170616149902, + "rewards/rejected": -11.43494701385498, + "step": 6820 + }, + { + "epoch": 1.33, + "learning_rate": 3.100237290573093e-07, + "logits/chosen": -2.6639950275421143, + "logits/rejected": -2.6628472805023193, + "logps/chosen": -233.7548828125, + "logps/rejected": -330.5860900878906, + "loss": 0.1161, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.3829774856567383, + "rewards/margins": 9.43483829498291, + "rewards/rejected": -11.817815780639648, + "step": 6830 + }, + { + "epoch": 1.33, + "learning_rate": 3.096641978859567e-07, + "logits/chosen": -2.7246289253234863, + "logits/rejected": -2.724916458129883, + "logps/chosen": -226.059814453125, + "logps/rejected": -303.9591064453125, + "loss": 0.0975, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.16279935836792, + "rewards/margins": 9.367891311645508, + "rewards/rejected": -12.530691146850586, + "step": 6840 + }, + { + "epoch": 1.33, + "learning_rate": 3.0930466671460415e-07, + "logits/chosen": -2.529158353805542, + "logits/rejected": -2.5818915367126465, + "logps/chosen": -226.40988159179688, + "logps/rejected": -280.0759582519531, + "loss": 0.1649, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.696617603302002, + "rewards/margins": 9.469199180603027, + "rewards/rejected": -14.165815353393555, + "step": 6850 + }, + { + "epoch": 1.33, + "learning_rate": 3.0894513554325157e-07, + "logits/chosen": -2.5131049156188965, + "logits/rejected": -2.50006103515625, + "logps/chosen": -283.16619873046875, + "logps/rejected": -292.90997314453125, + "loss": 0.2001, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4522463381290436, + "rewards/margins": 8.661517143249512, + "rewards/rejected": -9.113763809204102, + "step": 6860 + }, + { + "epoch": 1.33, + "learning_rate": 3.08585604371899e-07, + "logits/chosen": -2.4603896141052246, + "logits/rejected": -2.475236415863037, + "logps/chosen": -200.2270050048828, + "logps/rejected": -319.63165283203125, + "loss": 0.1584, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5711714029312134, + "rewards/margins": 9.169591903686523, + "rewards/rejected": -10.740763664245605, + "step": 6870 + }, + { + "epoch": 1.34, + "learning_rate": 3.082260732005465e-07, + "logits/chosen": -2.5062026977539062, + "logits/rejected": -2.481663465499878, + "logps/chosen": -270.3084411621094, + "logps/rejected": -277.72552490234375, + "loss": 0.1108, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.8045730590820312, + "rewards/margins": 6.856584072113037, + "rewards/rejected": -10.661157608032227, + "step": 6880 + }, + { + "epoch": 1.34, + "learning_rate": 3.078665420291939e-07, + "logits/chosen": -2.692714214324951, + "logits/rejected": -2.6422486305236816, + "logps/chosen": -224.814697265625, + "logps/rejected": -320.9744873046875, + "loss": 0.2283, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.366394519805908, + "rewards/margins": 6.598013877868652, + "rewards/rejected": -9.964409828186035, + "step": 6890 + }, + { + "epoch": 1.34, + "learning_rate": 3.075070108578414e-07, + "logits/chosen": -2.54801869392395, + "logits/rejected": -2.611740827560425, + "logps/chosen": -201.72970581054688, + "logps/rejected": -318.2071838378906, + "loss": 0.1569, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.590427398681641, + "rewards/margins": 7.745883941650391, + "rewards/rejected": -13.336311340332031, + "step": 6900 + }, + { + "epoch": 1.34, + "eval_logits/chosen": -2.509418249130249, + "eval_logits/rejected": -2.4990177154541016, + "eval_logps/chosen": -265.41961669921875, + "eval_logps/rejected": -287.7729797363281, + "eval_loss": 0.5556238889694214, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -7.118157863616943, + "eval_rewards/margins": 4.467545986175537, + "eval_rewards/rejected": -11.585704803466797, + "eval_runtime": 141.0041, + "eval_samples_per_second": 22.382, + "eval_steps_per_second": 0.355, + "step": 6900 + }, + { + "epoch": 1.34, + "learning_rate": 3.071474796864888e-07, + "logits/chosen": -2.705265760421753, + "logits/rejected": -2.708603620529175, + "logps/chosen": -364.18621826171875, + "logps/rejected": -366.87908935546875, + "loss": 0.1044, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.7980449199676514, + "rewards/margins": 7.81343936920166, + "rewards/rejected": -11.611483573913574, + "step": 6910 + }, + { + "epoch": 1.34, + "learning_rate": 3.0678794851513625e-07, + "logits/chosen": -2.68180513381958, + "logits/rejected": -2.6304049491882324, + "logps/chosen": -282.3155822753906, + "logps/rejected": -310.7538146972656, + "loss": 0.1837, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.944119453430176, + "rewards/margins": 5.915678024291992, + "rewards/rejected": -13.859797477722168, + "step": 6920 + }, + { + "epoch": 1.35, + "learning_rate": 3.0642841734378373e-07, + "logits/chosen": -2.583775758743286, + "logits/rejected": -2.6555397510528564, + "logps/chosen": -290.5704345703125, + "logps/rejected": -368.8967590332031, + "loss": 0.1213, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.195664405822754, + "rewards/margins": 11.082324981689453, + "rewards/rejected": -16.27798843383789, + "step": 6930 + }, + { + "epoch": 1.35, + "learning_rate": 3.0606888617243115e-07, + "logits/chosen": -2.667675733566284, + "logits/rejected": -2.628983974456787, + "logps/chosen": -198.3682403564453, + "logps/rejected": -331.04241943359375, + "loss": 0.2169, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.4205238819122314, + "rewards/margins": 10.031224250793457, + "rewards/rejected": -13.451748847961426, + "step": 6940 + }, + { + "epoch": 1.35, + "learning_rate": 3.057093550010786e-07, + "logits/chosen": -2.6205906867980957, + "logits/rejected": -2.509148120880127, + "logps/chosen": -294.91363525390625, + "logps/rejected": -373.4155578613281, + "loss": 0.1427, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.280693054199219, + "rewards/margins": 6.832987308502197, + "rewards/rejected": -11.113679885864258, + "step": 6950 + }, + { + "epoch": 1.35, + "learning_rate": 3.05349823829726e-07, + "logits/chosen": -2.627901077270508, + "logits/rejected": -2.7255802154541016, + "logps/chosen": -243.75668334960938, + "logps/rejected": -333.1106872558594, + "loss": 0.1414, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9257257580757141, + "rewards/margins": 12.062132835388184, + "rewards/rejected": -12.987858772277832, + "step": 6960 + }, + { + "epoch": 1.35, + "learning_rate": 3.0499029265837344e-07, + "logits/chosen": -2.5732569694519043, + "logits/rejected": -2.576958417892456, + "logps/chosen": -217.84695434570312, + "logps/rejected": -285.22607421875, + "loss": 0.1911, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.424662113189697, + "rewards/margins": 5.363595485687256, + "rewards/rejected": -9.78825855255127, + "step": 6970 + }, + { + "epoch": 1.36, + "learning_rate": 3.046307614870209e-07, + "logits/chosen": -2.7168760299682617, + "logits/rejected": -2.7221908569335938, + "logps/chosen": -204.42691040039062, + "logps/rejected": -288.11749267578125, + "loss": 0.1641, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.712026119232178, + "rewards/margins": 8.34032917022705, + "rewards/rejected": -13.052355766296387, + "step": 6980 + }, + { + "epoch": 1.36, + "learning_rate": 3.0427123031566835e-07, + "logits/chosen": -2.665600299835205, + "logits/rejected": -2.6365535259246826, + "logps/chosen": -235.10409545898438, + "logps/rejected": -309.0330505371094, + "loss": 0.1178, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.550424575805664, + "rewards/margins": 8.813916206359863, + "rewards/rejected": -11.364341735839844, + "step": 6990 + }, + { + "epoch": 1.36, + "learning_rate": 3.0391169914431583e-07, + "logits/chosen": -2.651092767715454, + "logits/rejected": -2.572640895843506, + "logps/chosen": -178.3667755126953, + "logps/rejected": -302.5531311035156, + "loss": 0.1122, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.638754367828369, + "rewards/margins": 9.221355438232422, + "rewards/rejected": -11.860109329223633, + "step": 7000 + }, + { + "epoch": 1.36, + "eval_logits/chosen": -2.581740140914917, + "eval_logits/rejected": -2.568486213684082, + "eval_logps/chosen": -261.2301330566406, + "eval_logps/rejected": -286.8914794921875, + "eval_loss": 0.5235300660133362, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -6.699207782745361, + "eval_rewards/margins": 4.798343658447266, + "eval_rewards/rejected": -11.497550964355469, + "eval_runtime": 141.1976, + "eval_samples_per_second": 22.352, + "eval_steps_per_second": 0.354, + "step": 7000 + }, + { + "epoch": 1.36, + "learning_rate": 3.0355216797296326e-07, + "logits/chosen": -2.5472564697265625, + "logits/rejected": -2.543069839477539, + "logps/chosen": -183.33639526367188, + "logps/rejected": -266.994140625, + "loss": 0.1101, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.847899436950684, + "rewards/margins": 6.370082855224609, + "rewards/rejected": -11.217982292175293, + "step": 7010 + }, + { + "epoch": 1.36, + "learning_rate": 3.031926368016107e-07, + "logits/chosen": -2.5794944763183594, + "logits/rejected": -2.567682981491089, + "logps/chosen": -251.6931610107422, + "logps/rejected": -317.95477294921875, + "loss": 0.1137, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.406409740447998, + "rewards/margins": 7.383990287780762, + "rewards/rejected": -12.790399551391602, + "step": 7020 + }, + { + "epoch": 1.36, + "learning_rate": 3.0283310563025816e-07, + "logits/chosen": -2.6800646781921387, + "logits/rejected": -2.548715591430664, + "logps/chosen": -267.2879333496094, + "logps/rejected": -375.4579162597656, + "loss": 0.1377, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9697425365447998, + "rewards/margins": 11.063411712646484, + "rewards/rejected": -13.03315258026123, + "step": 7030 + }, + { + "epoch": 1.37, + "learning_rate": 3.024735744589056e-07, + "logits/chosen": -2.625786542892456, + "logits/rejected": -2.5683741569519043, + "logps/chosen": -331.9104919433594, + "logps/rejected": -320.0903625488281, + "loss": 0.1019, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.854667901992798, + "rewards/margins": 8.84476375579834, + "rewards/rejected": -11.699432373046875, + "step": 7040 + }, + { + "epoch": 1.37, + "learning_rate": 3.02114043287553e-07, + "logits/chosen": -2.708660364151001, + "logits/rejected": -2.8068530559539795, + "logps/chosen": -181.35525512695312, + "logps/rejected": -280.81390380859375, + "loss": 0.1748, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.215568542480469, + "rewards/margins": 9.792646408081055, + "rewards/rejected": -15.008213996887207, + "step": 7050 + }, + { + "epoch": 1.37, + "learning_rate": 3.0175451211620045e-07, + "logits/chosen": -2.633408784866333, + "logits/rejected": -2.612058162689209, + "logps/chosen": -200.0987091064453, + "logps/rejected": -283.28515625, + "loss": 0.1443, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.08492374420166, + "rewards/margins": 8.34225082397461, + "rewards/rejected": -13.427175521850586, + "step": 7060 + }, + { + "epoch": 1.37, + "learning_rate": 3.013949809448479e-07, + "logits/chosen": -2.665736675262451, + "logits/rejected": -2.673962116241455, + "logps/chosen": -204.17137145996094, + "logps/rejected": -252.1431427001953, + "loss": 0.1596, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.014439105987549, + "rewards/margins": 8.255058288574219, + "rewards/rejected": -13.269497871398926, + "step": 7070 + }, + { + "epoch": 1.37, + "learning_rate": 3.0103544977349536e-07, + "logits/chosen": -2.763153076171875, + "logits/rejected": -2.7834317684173584, + "logps/chosen": -260.197265625, + "logps/rejected": -312.7044677734375, + "loss": 0.1843, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0015342235565186, + "rewards/margins": 7.654010772705078, + "rewards/rejected": -9.655545234680176, + "step": 7080 + }, + { + "epoch": 1.38, + "learning_rate": 3.006759186021428e-07, + "logits/chosen": -2.7696967124938965, + "logits/rejected": -2.7554268836975098, + "logps/chosen": -234.03488159179688, + "logps/rejected": -288.6385803222656, + "loss": 0.1456, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.310217380523682, + "rewards/margins": 5.257205009460449, + "rewards/rejected": -10.567422866821289, + "step": 7090 + }, + { + "epoch": 1.38, + "learning_rate": 3.0031638743079026e-07, + "logits/chosen": -2.649148464202881, + "logits/rejected": -2.725419521331787, + "logps/chosen": -270.505615234375, + "logps/rejected": -361.8536682128906, + "loss": 0.126, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.899606704711914, + "rewards/margins": 11.054985046386719, + "rewards/rejected": -13.954591751098633, + "step": 7100 + }, + { + "epoch": 1.38, + "eval_logits/chosen": -2.5971784591674805, + "eval_logits/rejected": -2.5856800079345703, + "eval_logps/chosen": -270.76007080078125, + "eval_logps/rejected": -297.9208679199219, + "eval_loss": 0.5673274993896484, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -7.652198314666748, + "eval_rewards/margins": 4.948291301727295, + "eval_rewards/rejected": -12.600488662719727, + "eval_runtime": 140.5425, + "eval_samples_per_second": 22.456, + "eval_steps_per_second": 0.356, + "step": 7100 + }, + { + "epoch": 1.38, + "learning_rate": 2.999568562594377e-07, + "logits/chosen": -2.6537704467773438, + "logits/rejected": -2.701878786087036, + "logps/chosen": -251.71798706054688, + "logps/rejected": -315.9606018066406, + "loss": 0.1307, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.501950263977051, + "rewards/margins": 8.076028823852539, + "rewards/rejected": -12.577978134155273, + "step": 7110 + }, + { + "epoch": 1.38, + "learning_rate": 2.995973250880851e-07, + "logits/chosen": -2.7427194118499756, + "logits/rejected": -2.7168116569519043, + "logps/chosen": -269.60906982421875, + "logps/rejected": -329.2308654785156, + "loss": 0.1276, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.565219879150391, + "rewards/margins": 12.049905776977539, + "rewards/rejected": -16.61512565612793, + "step": 7120 + }, + { + "epoch": 1.38, + "learning_rate": 2.992377939167326e-07, + "logits/chosen": -2.8124959468841553, + "logits/rejected": -2.732012987136841, + "logps/chosen": -268.4694519042969, + "logps/rejected": -373.57611083984375, + "loss": 0.1229, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.161271095275879, + "rewards/margins": 10.258711814880371, + "rewards/rejected": -15.41998291015625, + "step": 7130 + }, + { + "epoch": 1.39, + "learning_rate": 2.9887826274538003e-07, + "logits/chosen": -2.771953821182251, + "logits/rejected": -2.658160448074341, + "logps/chosen": -215.19906616210938, + "logps/rejected": -219.28829956054688, + "loss": 0.2375, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.545722007751465, + "rewards/margins": 7.849542140960693, + "rewards/rejected": -10.395264625549316, + "step": 7140 + }, + { + "epoch": 1.39, + "learning_rate": 2.9851873157402746e-07, + "logits/chosen": -2.7215499877929688, + "logits/rejected": -2.8429300785064697, + "logps/chosen": -207.01937866210938, + "logps/rejected": -366.0595703125, + "loss": 0.1678, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.246459484100342, + "rewards/margins": 7.551443576812744, + "rewards/rejected": -11.797903060913086, + "step": 7150 + }, + { + "epoch": 1.39, + "learning_rate": 2.981592004026749e-07, + "logits/chosen": -2.7126569747924805, + "logits/rejected": -2.68514084815979, + "logps/chosen": -179.6477813720703, + "logps/rejected": -238.47119140625, + "loss": 0.1196, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.4221978187561035, + "rewards/margins": 7.249853610992432, + "rewards/rejected": -14.672050476074219, + "step": 7160 + }, + { + "epoch": 1.39, + "learning_rate": 2.977996692313223e-07, + "logits/chosen": -2.563110113143921, + "logits/rejected": -2.6195552349090576, + "logps/chosen": -218.6953125, + "logps/rejected": -283.00616455078125, + "loss": 0.143, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.0791234970092773, + "rewards/margins": 10.290868759155273, + "rewards/rejected": -13.369993209838867, + "step": 7170 + }, + { + "epoch": 1.39, + "learning_rate": 2.9744013805996974e-07, + "logits/chosen": -2.6006455421447754, + "logits/rejected": -2.6729652881622314, + "logps/chosen": -254.58407592773438, + "logps/rejected": -345.2496337890625, + "loss": 0.0951, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.587131500244141, + "rewards/margins": 10.4163236618042, + "rewards/rejected": -15.003454208374023, + "step": 7180 + }, + { + "epoch": 1.4, + "learning_rate": 2.970806068886172e-07, + "logits/chosen": -2.7501912117004395, + "logits/rejected": -2.6658809185028076, + "logps/chosen": -328.89031982421875, + "logps/rejected": -379.03094482421875, + "loss": 0.1222, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.939349889755249, + "rewards/margins": 10.54344654083252, + "rewards/rejected": -13.482797622680664, + "step": 7190 + }, + { + "epoch": 1.4, + "learning_rate": 2.967210757172647e-07, + "logits/chosen": -2.450362205505371, + "logits/rejected": -2.339787483215332, + "logps/chosen": -223.5878448486328, + "logps/rejected": -388.7928161621094, + "loss": 0.0913, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.88235342502594, + "rewards/margins": 15.814065933227539, + "rewards/rejected": -17.69641876220703, + "step": 7200 + }, + { + "epoch": 1.4, + "eval_logits/chosen": -2.529186487197876, + "eval_logits/rejected": -2.5161945819854736, + "eval_logps/chosen": -275.12677001953125, + "eval_logps/rejected": -306.8511047363281, + "eval_loss": 0.5452268123626709, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -8.088868141174316, + "eval_rewards/margins": 5.404646396636963, + "eval_rewards/rejected": -13.493513107299805, + "eval_runtime": 141.3306, + "eval_samples_per_second": 22.331, + "eval_steps_per_second": 0.354, + "step": 7200 + }, + { + "epoch": 1.4, + "learning_rate": 2.9636154454591213e-07, + "logits/chosen": -2.620148181915283, + "logits/rejected": -2.566028594970703, + "logps/chosen": -175.0458221435547, + "logps/rejected": -224.8864288330078, + "loss": 0.1564, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.470210552215576, + "rewards/margins": 6.311079978942871, + "rewards/rejected": -11.781290054321289, + "step": 7210 + }, + { + "epoch": 1.4, + "learning_rate": 2.9600201337455956e-07, + "logits/chosen": -2.7207438945770264, + "logits/rejected": -2.718306303024292, + "logps/chosen": -244.40902709960938, + "logps/rejected": -366.9908142089844, + "loss": 0.1573, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.883308410644531, + "rewards/margins": 12.144978523254395, + "rewards/rejected": -17.02828598022461, + "step": 7220 + }, + { + "epoch": 1.4, + "learning_rate": 2.95642482203207e-07, + "logits/chosen": -2.7251694202423096, + "logits/rejected": -2.7464382648468018, + "logps/chosen": -266.37652587890625, + "logps/rejected": -318.9719543457031, + "loss": 0.0837, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4313480854034424, + "rewards/margins": 9.35649585723877, + "rewards/rejected": -9.787843704223633, + "step": 7230 + }, + { + "epoch": 1.41, + "learning_rate": 2.9528295103185447e-07, + "logits/chosen": -2.6239540576934814, + "logits/rejected": -2.568068742752075, + "logps/chosen": -244.37234497070312, + "logps/rejected": -347.47613525390625, + "loss": 0.2529, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.2521071434020996, + "rewards/margins": 8.381429672241211, + "rewards/rejected": -11.633537292480469, + "step": 7240 + }, + { + "epoch": 1.41, + "learning_rate": 2.949234198605019e-07, + "logits/chosen": -2.8597843647003174, + "logits/rejected": -2.7766873836517334, + "logps/chosen": -218.0390625, + "logps/rejected": -291.564453125, + "loss": 0.1086, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.036818504333496, + "rewards/margins": 7.55121374130249, + "rewards/rejected": -12.588032722473145, + "step": 7250 + }, + { + "epoch": 1.41, + "learning_rate": 2.945638886891493e-07, + "logits/chosen": -2.6912407875061035, + "logits/rejected": -2.67124605178833, + "logps/chosen": -220.19381713867188, + "logps/rejected": -280.46551513671875, + "loss": 0.1727, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.568626403808594, + "rewards/margins": 8.104110717773438, + "rewards/rejected": -12.672737121582031, + "step": 7260 + }, + { + "epoch": 1.41, + "learning_rate": 2.9420435751779675e-07, + "logits/chosen": -2.817361831665039, + "logits/rejected": -2.7352194786071777, + "logps/chosen": -266.4664611816406, + "logps/rejected": -310.5345458984375, + "loss": 0.1487, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.841556549072266, + "rewards/margins": 8.549221992492676, + "rewards/rejected": -13.390779495239258, + "step": 7270 + }, + { + "epoch": 1.41, + "learning_rate": 2.938448263464442e-07, + "logits/chosen": -2.6565136909484863, + "logits/rejected": -2.727008581161499, + "logps/chosen": -248.929443359375, + "logps/rejected": -312.4493103027344, + "loss": 0.2064, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.9427595138549805, + "rewards/margins": 7.866678714752197, + "rewards/rejected": -12.809438705444336, + "step": 7280 + }, + { + "epoch": 1.42, + "learning_rate": 2.934852951750917e-07, + "logits/chosen": -2.6423702239990234, + "logits/rejected": -2.687544584274292, + "logps/chosen": -250.701904296875, + "logps/rejected": -401.62786865234375, + "loss": 0.0906, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.986860752105713, + "rewards/margins": 12.676633834838867, + "rewards/rejected": -15.663493156433105, + "step": 7290 + }, + { + "epoch": 1.42, + "learning_rate": 2.9312576400373914e-07, + "logits/chosen": -2.683901786804199, + "logits/rejected": -2.5897936820983887, + "logps/chosen": -281.7994689941406, + "logps/rejected": -361.5560302734375, + "loss": 0.1582, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.576489448547363, + "rewards/margins": 7.160782814025879, + "rewards/rejected": -15.737272262573242, + "step": 7300 + }, + { + "epoch": 1.42, + "eval_logits/chosen": -2.6350250244140625, + "eval_logits/rejected": -2.625715970993042, + "eval_logps/chosen": -275.5716552734375, + "eval_logps/rejected": -300.4671936035156, + "eval_loss": 0.5486189126968384, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -8.1333589553833, + "eval_rewards/margins": 4.721765995025635, + "eval_rewards/rejected": -12.855124473571777, + "eval_runtime": 140.814, + "eval_samples_per_second": 22.413, + "eval_steps_per_second": 0.355, + "step": 7300 + }, + { + "epoch": 1.42, + "learning_rate": 2.9276623283238657e-07, + "logits/chosen": -2.690355062484741, + "logits/rejected": -2.8258864879608154, + "logps/chosen": -329.5840759277344, + "logps/rejected": -354.81182861328125, + "loss": 0.1519, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.988795280456543, + "rewards/margins": 7.291092872619629, + "rewards/rejected": -13.279887199401855, + "step": 7310 + }, + { + "epoch": 1.42, + "learning_rate": 2.92406701661034e-07, + "logits/chosen": -2.669297695159912, + "logits/rejected": -2.6820120811462402, + "logps/chosen": -176.64915466308594, + "logps/rejected": -262.0193786621094, + "loss": 0.1338, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.852190971374512, + "rewards/margins": 6.8422746658325195, + "rewards/rejected": -11.694464683532715, + "step": 7320 + }, + { + "epoch": 1.42, + "learning_rate": 2.920471704896814e-07, + "logits/chosen": -2.710866689682007, + "logits/rejected": -2.6943893432617188, + "logps/chosen": -185.72616577148438, + "logps/rejected": -240.77197265625, + "loss": 0.1008, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.496728897094727, + "rewards/margins": 8.53285026550293, + "rewards/rejected": -13.029577255249023, + "step": 7330 + }, + { + "epoch": 1.42, + "learning_rate": 2.916876393183289e-07, + "logits/chosen": -2.611412763595581, + "logits/rejected": -2.60302472114563, + "logps/chosen": -246.72314453125, + "logps/rejected": -309.4311828613281, + "loss": 0.1393, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.08695387840271, + "rewards/margins": 8.461469650268555, + "rewards/rejected": -9.548425674438477, + "step": 7340 + }, + { + "epoch": 1.43, + "learning_rate": 2.9132810814697633e-07, + "logits/chosen": -2.5691325664520264, + "logits/rejected": -2.631394147872925, + "logps/chosen": -240.9157257080078, + "logps/rejected": -262.32012939453125, + "loss": 0.1522, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.1954829692840576, + "rewards/margins": 7.731309413909912, + "rewards/rejected": -9.92679214477539, + "step": 7350 + }, + { + "epoch": 1.43, + "learning_rate": 2.9096857697562376e-07, + "logits/chosen": -2.765463352203369, + "logits/rejected": -2.708284616470337, + "logps/chosen": -228.95162963867188, + "logps/rejected": -380.7508544921875, + "loss": 0.1379, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.617364883422852, + "rewards/margins": 10.537843704223633, + "rewards/rejected": -16.155208587646484, + "step": 7360 + }, + { + "epoch": 1.43, + "learning_rate": 2.906090458042712e-07, + "logits/chosen": -2.7099769115448, + "logits/rejected": -2.7386481761932373, + "logps/chosen": -242.02255249023438, + "logps/rejected": -284.34649658203125, + "loss": 0.1483, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.5198564529418945, + "rewards/margins": 6.369032859802246, + "rewards/rejected": -11.88888931274414, + "step": 7370 + }, + { + "epoch": 1.43, + "learning_rate": 2.902495146329186e-07, + "logits/chosen": -2.676896572113037, + "logits/rejected": -2.720167875289917, + "logps/chosen": -289.1672668457031, + "logps/rejected": -313.4903564453125, + "loss": 0.1071, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.613086700439453, + "rewards/margins": 9.373674392700195, + "rewards/rejected": -13.986761093139648, + "step": 7380 + }, + { + "epoch": 1.43, + "learning_rate": 2.8988998346156615e-07, + "logits/chosen": -2.6812024116516113, + "logits/rejected": -2.674198865890503, + "logps/chosen": -228.72802734375, + "logps/rejected": -373.49493408203125, + "loss": 0.1222, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.594104766845703, + "rewards/margins": 7.436854362487793, + "rewards/rejected": -13.030960083007812, + "step": 7390 + }, + { + "epoch": 1.44, + "learning_rate": 2.895304522902136e-07, + "logits/chosen": -2.6344497203826904, + "logits/rejected": -2.599367380142212, + "logps/chosen": -307.51458740234375, + "logps/rejected": -377.29486083984375, + "loss": 0.1205, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.841690540313721, + "rewards/margins": 10.016744613647461, + "rewards/rejected": -14.858434677124023, + "step": 7400 + }, + { + "epoch": 1.44, + "eval_logits/chosen": -2.5095021724700928, + "eval_logits/rejected": -2.4955389499664307, + "eval_logps/chosen": -270.70867919921875, + "eval_logps/rejected": -297.9638977050781, + "eval_loss": 0.5640743970870972, + "eval_rewards/accuracies": 0.6924999952316284, + "eval_rewards/chosen": -7.647061347961426, + "eval_rewards/margins": 4.95773458480835, + "eval_rewards/rejected": -12.604796409606934, + "eval_runtime": 140.8209, + "eval_samples_per_second": 22.411, + "eval_steps_per_second": 0.355, + "step": 7400 + }, + { + "epoch": 1.44, + "learning_rate": 2.89170921118861e-07, + "logits/chosen": -2.646491765975952, + "logits/rejected": -2.695291519165039, + "logps/chosen": -271.927490234375, + "logps/rejected": -369.4830627441406, + "loss": 0.1511, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.7775983810424805, + "rewards/margins": 13.019407272338867, + "rewards/rejected": -17.797006607055664, + "step": 7410 + }, + { + "epoch": 1.44, + "learning_rate": 2.8881138994750843e-07, + "logits/chosen": -2.6583352088928223, + "logits/rejected": -2.6142797470092773, + "logps/chosen": -267.32098388671875, + "logps/rejected": -321.2505187988281, + "loss": 0.1269, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.502383232116699, + "rewards/margins": 9.251312255859375, + "rewards/rejected": -13.753695487976074, + "step": 7420 + }, + { + "epoch": 1.44, + "learning_rate": 2.8845185877615586e-07, + "logits/chosen": -2.571791172027588, + "logits/rejected": -2.591951370239258, + "logps/chosen": -245.6603240966797, + "logps/rejected": -329.0169372558594, + "loss": 0.1461, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9595108032226562, + "rewards/margins": 11.194948196411133, + "rewards/rejected": -15.154459953308105, + "step": 7430 + }, + { + "epoch": 1.44, + "learning_rate": 2.8809232760480334e-07, + "logits/chosen": -2.655647039413452, + "logits/rejected": -2.649632215499878, + "logps/chosen": -283.72235107421875, + "logps/rejected": -334.8876037597656, + "loss": 0.1205, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.935078144073486, + "rewards/margins": 8.224316596984863, + "rewards/rejected": -14.159395217895508, + "step": 7440 + }, + { + "epoch": 1.45, + "learning_rate": 2.8773279643345077e-07, + "logits/chosen": -2.301175117492676, + "logits/rejected": -2.3566908836364746, + "logps/chosen": -226.89065551757812, + "logps/rejected": -293.74261474609375, + "loss": 0.1226, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.175057411193848, + "rewards/margins": 8.48443603515625, + "rewards/rejected": -13.659494400024414, + "step": 7450 + }, + { + "epoch": 1.45, + "learning_rate": 2.873732652620982e-07, + "logits/chosen": -2.650192975997925, + "logits/rejected": -2.5119731426239014, + "logps/chosen": -252.47445678710938, + "logps/rejected": -339.8155517578125, + "loss": 0.1635, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.845350742340088, + "rewards/margins": 9.868247985839844, + "rewards/rejected": -16.71360206604004, + "step": 7460 + }, + { + "epoch": 1.45, + "learning_rate": 2.870137340907456e-07, + "logits/chosen": -2.5948119163513184, + "logits/rejected": -2.5240237712860107, + "logps/chosen": -235.8869171142578, + "logps/rejected": -338.0869445800781, + "loss": 0.1414, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -9.814234733581543, + "rewards/margins": 7.834403038024902, + "rewards/rejected": -17.648639678955078, + "step": 7470 + }, + { + "epoch": 1.45, + "learning_rate": 2.8665420291939305e-07, + "logits/chosen": -2.5564475059509277, + "logits/rejected": -2.521803617477417, + "logps/chosen": -214.2293243408203, + "logps/rejected": -279.4332275390625, + "loss": 0.2919, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.876753330230713, + "rewards/margins": 7.654786109924316, + "rewards/rejected": -15.531538009643555, + "step": 7480 + }, + { + "epoch": 1.45, + "learning_rate": 2.862946717480406e-07, + "logits/chosen": -2.6138367652893066, + "logits/rejected": -2.494513750076294, + "logps/chosen": -288.03094482421875, + "logps/rejected": -279.65606689453125, + "loss": 0.1171, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.514867782592773, + "rewards/margins": 5.313235282897949, + "rewards/rejected": -9.828103065490723, + "step": 7490 + }, + { + "epoch": 1.46, + "learning_rate": 2.85935140576688e-07, + "logits/chosen": -2.5605311393737793, + "logits/rejected": -2.5970845222473145, + "logps/chosen": -251.8067169189453, + "logps/rejected": -333.286376953125, + "loss": 0.1483, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8011680841445923, + "rewards/margins": 10.87164306640625, + "rewards/rejected": -12.672809600830078, + "step": 7500 + }, + { + "epoch": 1.46, + "eval_logits/chosen": -2.462235927581787, + "eval_logits/rejected": -2.4456515312194824, + "eval_logps/chosen": -262.43505859375, + "eval_logps/rejected": -291.4525451660156, + "eval_loss": 0.535338282585144, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -6.819699764251709, + "eval_rewards/margins": 5.133960723876953, + "eval_rewards/rejected": -11.95366096496582, + "eval_runtime": 140.6131, + "eval_samples_per_second": 22.445, + "eval_steps_per_second": 0.356, + "step": 7500 + }, + { + "epoch": 1.46, + "learning_rate": 2.8557560940533544e-07, + "logits/chosen": -2.6730306148529053, + "logits/rejected": -2.639164447784424, + "logps/chosen": -315.77532958984375, + "logps/rejected": -356.12261962890625, + "loss": 0.1234, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.206526279449463, + "rewards/margins": 10.395485877990723, + "rewards/rejected": -15.602012634277344, + "step": 7510 + }, + { + "epoch": 1.46, + "learning_rate": 2.8521607823398287e-07, + "logits/chosen": -2.6434378623962402, + "logits/rejected": -2.595853805541992, + "logps/chosen": -245.8604736328125, + "logps/rejected": -305.4363708496094, + "loss": 0.1407, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.256904602050781, + "rewards/margins": 7.4478936195373535, + "rewards/rejected": -12.704797744750977, + "step": 7520 + }, + { + "epoch": 1.46, + "learning_rate": 2.848565470626303e-07, + "logits/chosen": -2.3837356567382812, + "logits/rejected": -2.3848493099212646, + "logps/chosen": -251.31204223632812, + "logps/rejected": -386.92474365234375, + "loss": 0.1449, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.964468479156494, + "rewards/margins": 11.46526050567627, + "rewards/rejected": -14.429728507995605, + "step": 7530 + }, + { + "epoch": 1.46, + "learning_rate": 2.844970158912778e-07, + "logits/chosen": -2.4391016960144043, + "logits/rejected": -2.4242305755615234, + "logps/chosen": -184.51016235351562, + "logps/rejected": -272.45770263671875, + "loss": 0.1833, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.969717502593994, + "rewards/margins": 7.345050811767578, + "rewards/rejected": -11.314767837524414, + "step": 7540 + }, + { + "epoch": 1.47, + "learning_rate": 2.841374847199252e-07, + "logits/chosen": -2.539172410964966, + "logits/rejected": -2.4916083812713623, + "logps/chosen": -238.360107421875, + "logps/rejected": -321.63275146484375, + "loss": 0.1164, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.93917179107666, + "rewards/margins": 8.108965873718262, + "rewards/rejected": -14.048138618469238, + "step": 7550 + }, + { + "epoch": 1.47, + "learning_rate": 2.8377795354857263e-07, + "logits/chosen": -2.5334014892578125, + "logits/rejected": -2.5618481636047363, + "logps/chosen": -217.45339965820312, + "logps/rejected": -319.08648681640625, + "loss": 0.1339, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.835770130157471, + "rewards/margins": 8.362272262573242, + "rewards/rejected": -14.198040962219238, + "step": 7560 + }, + { + "epoch": 1.47, + "learning_rate": 2.8341842237722006e-07, + "logits/chosen": -2.2875418663024902, + "logits/rejected": -2.2655181884765625, + "logps/chosen": -260.1848449707031, + "logps/rejected": -293.3078918457031, + "loss": 0.1478, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.657872200012207, + "rewards/margins": 6.696013450622559, + "rewards/rejected": -12.353886604309082, + "step": 7570 + }, + { + "epoch": 1.47, + "learning_rate": 2.830588912058675e-07, + "logits/chosen": -2.603400707244873, + "logits/rejected": -2.6000728607177734, + "logps/chosen": -292.7062072753906, + "logps/rejected": -389.6017761230469, + "loss": 0.0906, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4953200817108154, + "rewards/margins": 12.588823318481445, + "rewards/rejected": -15.084144592285156, + "step": 7580 + }, + { + "epoch": 1.47, + "learning_rate": 2.82699360034515e-07, + "logits/chosen": -2.6736364364624023, + "logits/rejected": -2.634866952896118, + "logps/chosen": -314.75213623046875, + "logps/rejected": -339.9332580566406, + "loss": 0.0948, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.965799570083618, + "rewards/margins": 8.406060218811035, + "rewards/rejected": -11.371858596801758, + "step": 7590 + }, + { + "epoch": 1.48, + "learning_rate": 2.8233982886316245e-07, + "logits/chosen": -2.6928837299346924, + "logits/rejected": -2.6834189891815186, + "logps/chosen": -249.1254119873047, + "logps/rejected": -344.33282470703125, + "loss": 0.1431, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.108962297439575, + "rewards/margins": 8.896110534667969, + "rewards/rejected": -12.005071640014648, + "step": 7600 + }, + { + "epoch": 1.48, + "eval_logits/chosen": -2.4902963638305664, + "eval_logits/rejected": -2.4739882946014404, + "eval_logps/chosen": -266.635498046875, + "eval_logps/rejected": -295.59075927734375, + "eval_loss": 0.5330983996391296, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -7.239742279052734, + "eval_rewards/margins": 5.127737522125244, + "eval_rewards/rejected": -12.36747932434082, + "eval_runtime": 158.6779, + "eval_samples_per_second": 19.889, + "eval_steps_per_second": 0.315, + "step": 7600 + }, + { + "epoch": 1.48, + "learning_rate": 2.819802976918099e-07, + "logits/chosen": -2.430002450942993, + "logits/rejected": -2.4719510078430176, + "logps/chosen": -211.90463256835938, + "logps/rejected": -397.2557067871094, + "loss": 0.1348, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.344503402709961, + "rewards/margins": 11.521432876586914, + "rewards/rejected": -19.865936279296875, + "step": 7610 + }, + { + "epoch": 1.48, + "learning_rate": 2.816207665204573e-07, + "logits/chosen": -2.663891315460205, + "logits/rejected": -2.665191650390625, + "logps/chosen": -232.7655029296875, + "logps/rejected": -342.55133056640625, + "loss": 0.1071, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.3341264724731445, + "rewards/margins": 8.600602149963379, + "rewards/rejected": -12.934728622436523, + "step": 7620 + }, + { + "epoch": 1.48, + "learning_rate": 2.8126123534910473e-07, + "logits/chosen": -2.6779227256774902, + "logits/rejected": -2.7119317054748535, + "logps/chosen": -260.8107604980469, + "logps/rejected": -391.2445373535156, + "loss": 0.1438, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.915870428085327, + "rewards/margins": 9.063987731933594, + "rewards/rejected": -11.979857444763184, + "step": 7630 + }, + { + "epoch": 1.48, + "learning_rate": 2.809017041777522e-07, + "logits/chosen": -2.4341931343078613, + "logits/rejected": -2.387016773223877, + "logps/chosen": -203.7797393798828, + "logps/rejected": -288.8929443359375, + "loss": 0.0977, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.0777482986450195, + "rewards/margins": 8.294588088989258, + "rewards/rejected": -12.372335433959961, + "step": 7640 + }, + { + "epoch": 1.49, + "learning_rate": 2.8054217300639964e-07, + "logits/chosen": -2.632253408432007, + "logits/rejected": -2.4459805488586426, + "logps/chosen": -194.9683380126953, + "logps/rejected": -177.0109100341797, + "loss": 0.0911, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.925438404083252, + "rewards/margins": 5.247866153717041, + "rewards/rejected": -8.173304557800293, + "step": 7650 + }, + { + "epoch": 1.49, + "learning_rate": 2.8018264183504707e-07, + "logits/chosen": -2.7020745277404785, + "logits/rejected": -2.551351547241211, + "logps/chosen": -270.93890380859375, + "logps/rejected": -298.1747741699219, + "loss": 0.0941, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.2506473064422607, + "rewards/margins": 9.88005256652832, + "rewards/rejected": -12.13070011138916, + "step": 7660 + }, + { + "epoch": 1.49, + "learning_rate": 2.798231106636945e-07, + "logits/chosen": -2.626739978790283, + "logits/rejected": -2.601685047149658, + "logps/chosen": -298.44775390625, + "logps/rejected": -300.03656005859375, + "loss": 0.175, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.278904438018799, + "rewards/margins": 9.179478645324707, + "rewards/rejected": -11.45838451385498, + "step": 7670 + }, + { + "epoch": 1.49, + "learning_rate": 2.794635794923419e-07, + "logits/chosen": -2.74287486076355, + "logits/rejected": -2.7336511611938477, + "logps/chosen": -273.65911865234375, + "logps/rejected": -331.85638427734375, + "loss": 0.1228, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.677824020385742, + "rewards/margins": 7.765066623687744, + "rewards/rejected": -12.442890167236328, + "step": 7680 + }, + { + "epoch": 1.49, + "learning_rate": 2.7910404832098946e-07, + "logits/chosen": -2.404188871383667, + "logits/rejected": -2.394561767578125, + "logps/chosen": -238.877685546875, + "logps/rejected": -365.3401184082031, + "loss": 0.173, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.243638515472412, + "rewards/margins": 9.225210189819336, + "rewards/rejected": -14.468849182128906, + "step": 7690 + }, + { + "epoch": 1.49, + "learning_rate": 2.787445171496369e-07, + "logits/chosen": -2.6277623176574707, + "logits/rejected": -2.616374969482422, + "logps/chosen": -238.05709838867188, + "logps/rejected": -358.3810729980469, + "loss": 0.1604, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.534859657287598, + "rewards/margins": 10.644804954528809, + "rewards/rejected": -17.179664611816406, + "step": 7700 + }, + { + "epoch": 1.49, + "eval_logits/chosen": -2.5511996746063232, + "eval_logits/rejected": -2.5380501747131348, + "eval_logps/chosen": -264.64892578125, + "eval_logps/rejected": -292.4844665527344, + "eval_loss": 0.5209183692932129, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -7.0410847663879395, + "eval_rewards/margins": 5.015763759613037, + "eval_rewards/rejected": -12.056848526000977, + "eval_runtime": 141.3306, + "eval_samples_per_second": 22.331, + "eval_steps_per_second": 0.354, + "step": 7700 + }, + { + "epoch": 1.5, + "learning_rate": 2.783849859782843e-07, + "logits/chosen": -2.5951569080352783, + "logits/rejected": -2.6269772052764893, + "logps/chosen": -203.84194946289062, + "logps/rejected": -286.40325927734375, + "loss": 0.126, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.021936416625977, + "rewards/margins": 8.942475318908691, + "rewards/rejected": -17.964412689208984, + "step": 7710 + }, + { + "epoch": 1.5, + "learning_rate": 2.7802545480693174e-07, + "logits/chosen": -2.656064987182617, + "logits/rejected": -2.6841704845428467, + "logps/chosen": -289.58514404296875, + "logps/rejected": -373.8680419921875, + "loss": 0.1295, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8527544140815735, + "rewards/margins": 12.222578048706055, + "rewards/rejected": -11.369823455810547, + "step": 7720 + }, + { + "epoch": 1.5, + "learning_rate": 2.7766592363557917e-07, + "logits/chosen": -2.679734468460083, + "logits/rejected": -2.6492819786071777, + "logps/chosen": -276.2290954589844, + "logps/rejected": -336.97503662109375, + "loss": 0.1019, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.463384628295898, + "rewards/margins": 9.683026313781738, + "rewards/rejected": -18.14640998840332, + "step": 7730 + }, + { + "epoch": 1.5, + "learning_rate": 2.7730639246422665e-07, + "logits/chosen": -2.754636526107788, + "logits/rejected": -2.7663750648498535, + "logps/chosen": -310.114501953125, + "logps/rejected": -347.15887451171875, + "loss": 0.2118, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.472342610359192, + "rewards/margins": 9.652831077575684, + "rewards/rejected": -11.12517261505127, + "step": 7740 + }, + { + "epoch": 1.5, + "learning_rate": 2.769468612928741e-07, + "logits/chosen": -2.741122245788574, + "logits/rejected": -2.635406970977783, + "logps/chosen": -265.55364990234375, + "logps/rejected": -269.2779235839844, + "loss": 0.1222, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.486469268798828, + "rewards/margins": 7.999451637268066, + "rewards/rejected": -14.485920906066895, + "step": 7750 + }, + { + "epoch": 1.51, + "learning_rate": 2.765873301215215e-07, + "logits/chosen": -2.7480459213256836, + "logits/rejected": -2.663301706314087, + "logps/chosen": -339.1629638671875, + "logps/rejected": -291.66278076171875, + "loss": 0.1391, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.851433753967285, + "rewards/margins": 6.955389499664307, + "rewards/rejected": -13.8068208694458, + "step": 7760 + }, + { + "epoch": 1.51, + "learning_rate": 2.7622779895016893e-07, + "logits/chosen": -2.654982805252075, + "logits/rejected": -2.707353115081787, + "logps/chosen": -193.4783935546875, + "logps/rejected": -280.4853820800781, + "loss": 0.1822, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.415597915649414, + "rewards/margins": 8.169103622436523, + "rewards/rejected": -10.584702491760254, + "step": 7770 + }, + { + "epoch": 1.51, + "learning_rate": 2.7586826777881636e-07, + "logits/chosen": -2.7164063453674316, + "logits/rejected": -2.644347906112671, + "logps/chosen": -262.46246337890625, + "logps/rejected": -223.9054412841797, + "loss": 0.1171, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.2464256286621094, + "rewards/margins": 6.026411533355713, + "rewards/rejected": -8.272836685180664, + "step": 7780 + }, + { + "epoch": 1.51, + "learning_rate": 2.755087366074639e-07, + "logits/chosen": -2.819343090057373, + "logits/rejected": -2.8072986602783203, + "logps/chosen": -224.7086181640625, + "logps/rejected": -403.16693115234375, + "loss": 0.1208, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5812429189682007, + "rewards/margins": 16.40489959716797, + "rewards/rejected": -16.986141204833984, + "step": 7790 + }, + { + "epoch": 1.51, + "learning_rate": 2.751492054361113e-07, + "logits/chosen": -2.7332189083099365, + "logits/rejected": -2.720179319381714, + "logps/chosen": -201.06503295898438, + "logps/rejected": -240.810791015625, + "loss": 0.1578, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.0317578315734863, + "rewards/margins": 7.611621856689453, + "rewards/rejected": -10.643381118774414, + "step": 7800 + }, + { + "epoch": 1.51, + "eval_logits/chosen": -2.571337938308716, + "eval_logits/rejected": -2.5551180839538574, + "eval_logps/chosen": -263.78594970703125, + "eval_logps/rejected": -290.19305419921875, + "eval_loss": 0.5121142864227295, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": -6.9547882080078125, + "eval_rewards/margins": 4.872920513153076, + "eval_rewards/rejected": -11.827710151672363, + "eval_runtime": 140.9926, + "eval_samples_per_second": 22.384, + "eval_steps_per_second": 0.355, + "step": 7800 + }, + { + "epoch": 1.52, + "learning_rate": 2.7478967426475875e-07, + "logits/chosen": -2.654966115951538, + "logits/rejected": -2.6363489627838135, + "logps/chosen": -257.8814697265625, + "logps/rejected": -331.0078125, + "loss": 0.105, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.393416881561279, + "rewards/margins": 8.878604888916016, + "rewards/rejected": -15.272021293640137, + "step": 7810 + }, + { + "epoch": 1.52, + "learning_rate": 2.744301430934062e-07, + "logits/chosen": -2.589625120162964, + "logits/rejected": -2.6775403022766113, + "logps/chosen": -200.9984893798828, + "logps/rejected": -332.18609619140625, + "loss": 0.1582, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.500709533691406, + "rewards/margins": 7.889809608459473, + "rewards/rejected": -14.390518188476562, + "step": 7820 + }, + { + "epoch": 1.52, + "learning_rate": 2.740706119220536e-07, + "logits/chosen": -2.665151357650757, + "logits/rejected": -2.6970107555389404, + "logps/chosen": -218.695068359375, + "logps/rejected": -386.66424560546875, + "loss": 0.173, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.225282669067383, + "rewards/margins": 12.516640663146973, + "rewards/rejected": -15.741923332214355, + "step": 7830 + }, + { + "epoch": 1.52, + "learning_rate": 2.737110807507011e-07, + "logits/chosen": -2.839881658554077, + "logits/rejected": -2.7552831172943115, + "logps/chosen": -276.20489501953125, + "logps/rejected": -325.8049011230469, + "loss": 0.1292, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9895131587982178, + "rewards/margins": 8.880070686340332, + "rewards/rejected": -11.869585037231445, + "step": 7840 + }, + { + "epoch": 1.52, + "learning_rate": 2.733515495793485e-07, + "logits/chosen": -2.5036187171936035, + "logits/rejected": -2.5023043155670166, + "logps/chosen": -213.8859100341797, + "logps/rejected": -355.48638916015625, + "loss": 0.1894, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.731394290924072, + "rewards/margins": 11.786921501159668, + "rewards/rejected": -17.5183162689209, + "step": 7850 + }, + { + "epoch": 1.53, + "learning_rate": 2.7299201840799594e-07, + "logits/chosen": -2.7792916297912598, + "logits/rejected": -2.7273244857788086, + "logps/chosen": -249.0773162841797, + "logps/rejected": -347.36083984375, + "loss": 0.0867, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6855994462966919, + "rewards/margins": 9.428860664367676, + "rewards/rejected": -10.114459037780762, + "step": 7860 + }, + { + "epoch": 1.53, + "learning_rate": 2.7263248723664337e-07, + "logits/chosen": -2.7454593181610107, + "logits/rejected": -2.8052077293395996, + "logps/chosen": -206.2342071533203, + "logps/rejected": -292.8019104003906, + "loss": 0.1154, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.633772373199463, + "rewards/margins": 8.582466125488281, + "rewards/rejected": -13.216238021850586, + "step": 7870 + }, + { + "epoch": 1.53, + "learning_rate": 2.7227295606529085e-07, + "logits/chosen": -2.675715208053589, + "logits/rejected": -2.57414174079895, + "logps/chosen": -209.45803833007812, + "logps/rejected": -211.00537109375, + "loss": 0.3294, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.574831008911133, + "rewards/margins": 5.802220344543457, + "rewards/rejected": -10.37705135345459, + "step": 7880 + }, + { + "epoch": 1.53, + "learning_rate": 2.7191342489393833e-07, + "logits/chosen": -2.737308979034424, + "logits/rejected": -2.671957492828369, + "logps/chosen": -282.55810546875, + "logps/rejected": -256.31121826171875, + "loss": 0.1393, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1064114570617676, + "rewards/margins": 8.2166748046875, + "rewards/rejected": -11.323084831237793, + "step": 7890 + }, + { + "epoch": 1.53, + "learning_rate": 2.7155389372258576e-07, + "logits/chosen": -2.7994513511657715, + "logits/rejected": -2.774040699005127, + "logps/chosen": -218.21682739257812, + "logps/rejected": -323.3345642089844, + "loss": 0.1548, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.622430324554443, + "rewards/margins": 9.895807266235352, + "rewards/rejected": -16.518238067626953, + "step": 7900 + }, + { + "epoch": 1.53, + "eval_logits/chosen": -2.567811965942383, + "eval_logits/rejected": -2.546403408050537, + "eval_logps/chosen": -265.3227844238281, + "eval_logps/rejected": -289.8968811035156, + "eval_loss": 0.5030146837234497, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": -7.1084675788879395, + "eval_rewards/margins": 4.689626693725586, + "eval_rewards/rejected": -11.79809284210205, + "eval_runtime": 145.8975, + "eval_samples_per_second": 21.632, + "eval_steps_per_second": 0.343, + "step": 7900 + }, + { + "epoch": 1.54, + "learning_rate": 2.711943625512332e-07, + "logits/chosen": -2.6100189685821533, + "logits/rejected": -2.482381582260132, + "logps/chosen": -267.57415771484375, + "logps/rejected": -330.57696533203125, + "loss": 0.1203, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0282962322235107, + "rewards/margins": 9.497358322143555, + "rewards/rejected": -12.525655746459961, + "step": 7910 + }, + { + "epoch": 1.54, + "learning_rate": 2.708348313798806e-07, + "logits/chosen": -2.601951837539673, + "logits/rejected": -2.559113025665283, + "logps/chosen": -210.42172241210938, + "logps/rejected": -413.45367431640625, + "loss": 0.076, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.5408718585968018, + "rewards/margins": 8.901400566101074, + "rewards/rejected": -12.44227409362793, + "step": 7920 + }, + { + "epoch": 1.54, + "learning_rate": 2.7047530020852804e-07, + "logits/chosen": -2.795612335205078, + "logits/rejected": -2.6182150840759277, + "logps/chosen": -252.82345581054688, + "logps/rejected": -254.4090576171875, + "loss": 0.0967, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6061296463012695, + "rewards/margins": 8.885503768920898, + "rewards/rejected": -10.491633415222168, + "step": 7930 + }, + { + "epoch": 1.54, + "learning_rate": 2.701157690371755e-07, + "logits/chosen": -2.82684326171875, + "logits/rejected": -2.7789080142974854, + "logps/chosen": -250.04013061523438, + "logps/rejected": -273.026611328125, + "loss": 0.1309, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7443382740020752, + "rewards/margins": 7.579935550689697, + "rewards/rejected": -9.324274063110352, + "step": 7940 + }, + { + "epoch": 1.54, + "learning_rate": 2.6975623786582295e-07, + "logits/chosen": -2.643972396850586, + "logits/rejected": -2.5632810592651367, + "logps/chosen": -314.1190185546875, + "logps/rejected": -333.59759521484375, + "loss": 0.088, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.784397125244141, + "rewards/margins": 8.821067810058594, + "rewards/rejected": -13.60546588897705, + "step": 7950 + }, + { + "epoch": 1.55, + "learning_rate": 2.693967066944704e-07, + "logits/chosen": -2.6901941299438477, + "logits/rejected": -2.7379889488220215, + "logps/chosen": -240.01431274414062, + "logps/rejected": -331.6454772949219, + "loss": 0.0889, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.543796539306641, + "rewards/margins": 4.876248836517334, + "rewards/rejected": -9.420045852661133, + "step": 7960 + }, + { + "epoch": 1.55, + "learning_rate": 2.690371755231178e-07, + "logits/chosen": -2.680809497833252, + "logits/rejected": -2.5582900047302246, + "logps/chosen": -350.76336669921875, + "logps/rejected": -402.9486083984375, + "loss": 0.3298, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.949441432952881, + "rewards/margins": 7.70681095123291, + "rewards/rejected": -10.656251907348633, + "step": 7970 + }, + { + "epoch": 1.55, + "learning_rate": 2.686776443517653e-07, + "logits/chosen": -2.6802871227264404, + "logits/rejected": -2.533712387084961, + "logps/chosen": -192.17262268066406, + "logps/rejected": -205.2220916748047, + "loss": 0.1524, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.809290885925293, + "rewards/margins": 6.305342674255371, + "rewards/rejected": -9.114633560180664, + "step": 7980 + }, + { + "epoch": 1.55, + "learning_rate": 2.6831811318041277e-07, + "logits/chosen": -2.82965350151062, + "logits/rejected": -2.797982692718506, + "logps/chosen": -317.3335876464844, + "logps/rejected": -405.9150695800781, + "loss": 0.112, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.4481029510498047, + "rewards/margins": 8.178030967712402, + "rewards/rejected": -11.626134872436523, + "step": 7990 + }, + { + "epoch": 1.55, + "learning_rate": 2.679585820090602e-07, + "logits/chosen": -2.75278377532959, + "logits/rejected": -2.604698896408081, + "logps/chosen": -298.3667907714844, + "logps/rejected": -364.8375549316406, + "loss": 0.114, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.107235908508301, + "rewards/margins": 9.927998542785645, + "rewards/rejected": -15.035234451293945, + "step": 8000 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -2.5889623165130615, + "eval_logits/rejected": -2.5693280696868896, + "eval_logps/chosen": -266.796142578125, + "eval_logps/rejected": -293.58087158203125, + "eval_loss": 0.522428035736084, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -7.255805492401123, + "eval_rewards/margins": 4.9106855392456055, + "eval_rewards/rejected": -12.166491508483887, + "eval_runtime": 148.4036, + "eval_samples_per_second": 21.266, + "eval_steps_per_second": 0.337, + "step": 8000 + }, + { + "epoch": 1.56, + "learning_rate": 2.675990508377076e-07, + "logits/chosen": -2.720656394958496, + "logits/rejected": -2.5840041637420654, + "logps/chosen": -236.41552734375, + "logps/rejected": -271.75250244140625, + "loss": 0.1299, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.823799133300781, + "rewards/margins": 7.9150800704956055, + "rewards/rejected": -14.738879203796387, + "step": 8010 + }, + { + "epoch": 1.56, + "learning_rate": 2.6723951966635505e-07, + "logits/chosen": -2.767911434173584, + "logits/rejected": -2.777944803237915, + "logps/chosen": -225.2845916748047, + "logps/rejected": -319.44366455078125, + "loss": 0.1138, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.285029649734497, + "rewards/margins": 7.553208351135254, + "rewards/rejected": -10.838237762451172, + "step": 8020 + }, + { + "epoch": 1.56, + "learning_rate": 2.668799884950025e-07, + "logits/chosen": -2.7186245918273926, + "logits/rejected": -2.7040085792541504, + "logps/chosen": -219.1761016845703, + "logps/rejected": -356.8329772949219, + "loss": 0.1186, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.899411201477051, + "rewards/margins": 8.89518928527832, + "rewards/rejected": -14.794601440429688, + "step": 8030 + }, + { + "epoch": 1.56, + "learning_rate": 2.6652045732364996e-07, + "logits/chosen": -2.751316547393799, + "logits/rejected": -2.8012309074401855, + "logps/chosen": -217.4230194091797, + "logps/rejected": -340.3138122558594, + "loss": 0.1767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2986190915107727, + "rewards/margins": 9.464986801147461, + "rewards/rejected": -9.763606071472168, + "step": 8040 + }, + { + "epoch": 1.56, + "learning_rate": 2.661609261522974e-07, + "logits/chosen": -2.6710333824157715, + "logits/rejected": -2.5617194175720215, + "logps/chosen": -324.3326110839844, + "logps/rejected": -319.7010192871094, + "loss": 0.4444, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9465482234954834, + "rewards/margins": 7.789758205413818, + "rewards/rejected": -10.736307144165039, + "step": 8050 + }, + { + "epoch": 1.56, + "learning_rate": 2.658013949809448e-07, + "logits/chosen": -2.7540974617004395, + "logits/rejected": -2.7263998985290527, + "logps/chosen": -237.75772094726562, + "logps/rejected": -243.87646484375, + "loss": 0.2051, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.662503480911255, + "rewards/margins": 5.3351731300354, + "rewards/rejected": -8.997676849365234, + "step": 8060 + }, + { + "epoch": 1.57, + "learning_rate": 2.6544186380959225e-07, + "logits/chosen": -2.76902437210083, + "logits/rejected": -2.8533883094787598, + "logps/chosen": -188.13333129882812, + "logps/rejected": -302.8412780761719, + "loss": 0.1427, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.508274793624878, + "rewards/margins": 7.940915584564209, + "rewards/rejected": -11.449190139770508, + "step": 8070 + }, + { + "epoch": 1.57, + "learning_rate": 2.6508233263823973e-07, + "logits/chosen": -2.795078754425049, + "logits/rejected": -2.6513450145721436, + "logps/chosen": -288.3661193847656, + "logps/rejected": -371.65814208984375, + "loss": 0.1688, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.276179552078247, + "rewards/margins": 9.625847816467285, + "rewards/rejected": -11.902026176452637, + "step": 8080 + }, + { + "epoch": 1.57, + "learning_rate": 2.647228014668872e-07, + "logits/chosen": -2.5098202228546143, + "logits/rejected": -2.582827091217041, + "logps/chosen": -249.5102996826172, + "logps/rejected": -304.91265869140625, + "loss": 0.1628, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.1279616355896, + "rewards/margins": 9.646819114685059, + "rewards/rejected": -13.7747802734375, + "step": 8090 + }, + { + "epoch": 1.57, + "learning_rate": 2.6436327029553464e-07, + "logits/chosen": -2.7641568183898926, + "logits/rejected": -2.911437511444092, + "logps/chosen": -225.6170654296875, + "logps/rejected": -346.4241638183594, + "loss": 0.112, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.264681816101074, + "rewards/margins": 10.263120651245117, + "rewards/rejected": -12.527801513671875, + "step": 8100 + }, + { + "epoch": 1.57, + "eval_logits/chosen": -2.593346357345581, + "eval_logits/rejected": -2.5735323429107666, + "eval_logps/chosen": -254.83860778808594, + "eval_logps/rejected": -277.5395202636719, + "eval_loss": 0.5374084115028381, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -6.060052871704102, + "eval_rewards/margins": 4.502304553985596, + "eval_rewards/rejected": -10.562356948852539, + "eval_runtime": 140.2574, + "eval_samples_per_second": 22.501, + "eval_steps_per_second": 0.356, + "step": 8100 + }, + { + "epoch": 1.57, + "learning_rate": 2.6400373912418206e-07, + "logits/chosen": -2.844630718231201, + "logits/rejected": -2.7642016410827637, + "logps/chosen": -283.64593505859375, + "logps/rejected": -229.9371795654297, + "loss": 0.1063, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.2396063804626465, + "rewards/margins": 6.239510536193848, + "rewards/rejected": -8.479116439819336, + "step": 8110 + }, + { + "epoch": 1.58, + "learning_rate": 2.636442079528295e-07, + "logits/chosen": -2.7353687286376953, + "logits/rejected": -2.7575581073760986, + "logps/chosen": -215.2127685546875, + "logps/rejected": -360.39569091796875, + "loss": 0.1807, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.294805526733398, + "rewards/margins": 11.46848201751709, + "rewards/rejected": -15.763287544250488, + "step": 8120 + }, + { + "epoch": 1.58, + "learning_rate": 2.632846767814769e-07, + "logits/chosen": -2.775391101837158, + "logits/rejected": -2.7092642784118652, + "logps/chosen": -336.4795227050781, + "logps/rejected": -380.7021179199219, + "loss": 0.2285, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.323225736618042, + "rewards/margins": 7.921922206878662, + "rewards/rejected": -11.245149612426758, + "step": 8130 + }, + { + "epoch": 1.58, + "learning_rate": 2.629251456101244e-07, + "logits/chosen": -2.8665738105773926, + "logits/rejected": -2.80812668800354, + "logps/chosen": -241.3499298095703, + "logps/rejected": -332.4806213378906, + "loss": 0.1102, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1491594314575195, + "rewards/margins": 9.4281644821167, + "rewards/rejected": -12.577322959899902, + "step": 8140 + }, + { + "epoch": 1.58, + "learning_rate": 2.6256561443877183e-07, + "logits/chosen": -2.7424569129943848, + "logits/rejected": -2.8395683765411377, + "logps/chosen": -209.18978881835938, + "logps/rejected": -324.82208251953125, + "loss": 0.0997, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.445475101470947, + "rewards/margins": 10.062944412231445, + "rewards/rejected": -14.508418083190918, + "step": 8150 + }, + { + "epoch": 1.58, + "learning_rate": 2.6220608326741926e-07, + "logits/chosen": -2.7943625450134277, + "logits/rejected": -2.7624077796936035, + "logps/chosen": -268.23248291015625, + "logps/rejected": -392.8574523925781, + "loss": 0.1461, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.481450080871582, + "rewards/margins": 9.155346870422363, + "rewards/rejected": -15.636796951293945, + "step": 8160 + }, + { + "epoch": 1.59, + "learning_rate": 2.618465520960667e-07, + "logits/chosen": -2.827127456665039, + "logits/rejected": -2.750373363494873, + "logps/chosen": -223.2224578857422, + "logps/rejected": -334.0792541503906, + "loss": 0.1133, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.39715576171875, + "rewards/margins": 7.0858330726623535, + "rewards/rejected": -10.482988357543945, + "step": 8170 + }, + { + "epoch": 1.59, + "learning_rate": 2.6148702092471416e-07, + "logits/chosen": -2.728161334991455, + "logits/rejected": -2.774329662322998, + "logps/chosen": -158.14730834960938, + "logps/rejected": -279.3450927734375, + "loss": 0.1076, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.5900421142578125, + "rewards/margins": 8.114730834960938, + "rewards/rejected": -13.70477294921875, + "step": 8180 + }, + { + "epoch": 1.59, + "learning_rate": 2.6112748975336164e-07, + "logits/chosen": -2.5704948902130127, + "logits/rejected": -2.5647311210632324, + "logps/chosen": -217.3551025390625, + "logps/rejected": -352.66357421875, + "loss": 0.1391, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.492300033569336, + "rewards/margins": 9.363241195678711, + "rewards/rejected": -14.855539321899414, + "step": 8190 + }, + { + "epoch": 1.59, + "learning_rate": 2.6076795858200907e-07, + "logits/chosen": -2.7108607292175293, + "logits/rejected": -2.745941638946533, + "logps/chosen": -234.5678253173828, + "logps/rejected": -322.69244384765625, + "loss": 0.1436, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.202203273773193, + "rewards/margins": 10.338296890258789, + "rewards/rejected": -14.540501594543457, + "step": 8200 + }, + { + "epoch": 1.59, + "eval_logits/chosen": -2.593104839324951, + "eval_logits/rejected": -2.5737414360046387, + "eval_logps/chosen": -264.7280578613281, + "eval_logps/rejected": -291.8731384277344, + "eval_loss": 0.5275627970695496, + "eval_rewards/accuracies": 0.7174999713897705, + "eval_rewards/chosen": -7.049000263214111, + "eval_rewards/margins": 4.946714401245117, + "eval_rewards/rejected": -11.995715141296387, + "eval_runtime": 152.9375, + "eval_samples_per_second": 20.636, + "eval_steps_per_second": 0.327, + "step": 8200 + }, + { + "epoch": 1.59, + "learning_rate": 2.604084274106565e-07, + "logits/chosen": -2.6715493202209473, + "logits/rejected": -2.7347023487091064, + "logps/chosen": -257.12225341796875, + "logps/rejected": -414.55670166015625, + "loss": 0.1169, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3295891284942627, + "rewards/margins": 11.935256004333496, + "rewards/rejected": -13.264846801757812, + "step": 8210 + }, + { + "epoch": 1.6, + "learning_rate": 2.6004889623930393e-07, + "logits/chosen": -2.9021527767181396, + "logits/rejected": -2.8916983604431152, + "logps/chosen": -337.2508239746094, + "logps/rejected": -341.51507568359375, + "loss": 0.1411, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.747110366821289, + "rewards/margins": 6.220602989196777, + "rewards/rejected": -8.967713356018066, + "step": 8220 + }, + { + "epoch": 1.6, + "learning_rate": 2.5968936506795136e-07, + "logits/chosen": -2.638962507247925, + "logits/rejected": -2.660064220428467, + "logps/chosen": -251.115966796875, + "logps/rejected": -266.664306640625, + "loss": 0.1566, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.338654518127441, + "rewards/margins": 6.121777534484863, + "rewards/rejected": -13.460432052612305, + "step": 8230 + }, + { + "epoch": 1.6, + "learning_rate": 2.5932983389659884e-07, + "logits/chosen": -2.718865394592285, + "logits/rejected": -2.6547913551330566, + "logps/chosen": -332.82476806640625, + "logps/rejected": -387.24530029296875, + "loss": 0.0852, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.32920503616333, + "rewards/margins": 11.49052619934082, + "rewards/rejected": -12.819730758666992, + "step": 8240 + }, + { + "epoch": 1.6, + "learning_rate": 2.5897030272524626e-07, + "logits/chosen": -2.678675413131714, + "logits/rejected": -2.5542550086975098, + "logps/chosen": -291.0780944824219, + "logps/rejected": -351.25396728515625, + "loss": 0.154, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.5418224334716797, + "rewards/margins": 8.347354888916016, + "rewards/rejected": -11.889177322387695, + "step": 8250 + }, + { + "epoch": 1.6, + "learning_rate": 2.586107715538937e-07, + "logits/chosen": -2.674690008163452, + "logits/rejected": -2.646212577819824, + "logps/chosen": -220.5702362060547, + "logps/rejected": -262.0445861816406, + "loss": 0.1372, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.464944362640381, + "rewards/margins": 7.086409091949463, + "rewards/rejected": -10.551352500915527, + "step": 8260 + }, + { + "epoch": 1.61, + "learning_rate": 2.582512403825411e-07, + "logits/chosen": -2.6054418087005615, + "logits/rejected": -2.6012120246887207, + "logps/chosen": -216.8031768798828, + "logps/rejected": -303.86370849609375, + "loss": 0.1155, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.4664530754089355, + "rewards/margins": 7.803128719329834, + "rewards/rejected": -12.269582748413086, + "step": 8270 + }, + { + "epoch": 1.61, + "learning_rate": 2.578917092111886e-07, + "logits/chosen": -2.7718937397003174, + "logits/rejected": -2.740553617477417, + "logps/chosen": -302.05499267578125, + "logps/rejected": -382.68084716796875, + "loss": 0.1213, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.131575107574463, + "rewards/margins": 7.782123565673828, + "rewards/rejected": -10.913698196411133, + "step": 8280 + }, + { + "epoch": 1.61, + "learning_rate": 2.575321780398361e-07, + "logits/chosen": -2.762749433517456, + "logits/rejected": -2.6545376777648926, + "logps/chosen": -209.6224365234375, + "logps/rejected": -259.521240234375, + "loss": 0.1442, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.175038814544678, + "rewards/margins": 6.265427112579346, + "rewards/rejected": -12.440465927124023, + "step": 8290 + }, + { + "epoch": 1.61, + "learning_rate": 2.571726468684835e-07, + "logits/chosen": -2.8032710552215576, + "logits/rejected": -2.654263973236084, + "logps/chosen": -298.76776123046875, + "logps/rejected": -423.67340087890625, + "loss": 0.1369, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.403448581695557, + "rewards/margins": 9.833967208862305, + "rewards/rejected": -14.237414360046387, + "step": 8300 + }, + { + "epoch": 1.61, + "eval_logits/chosen": -2.596484661102295, + "eval_logits/rejected": -2.576378583908081, + "eval_logps/chosen": -261.24853515625, + "eval_logps/rejected": -285.3045959472656, + "eval_loss": 0.5190584659576416, + "eval_rewards/accuracies": 0.6875, + "eval_rewards/chosen": -6.701047420501709, + "eval_rewards/margins": 4.637817859649658, + "eval_rewards/rejected": -11.338866233825684, + "eval_runtime": 140.2488, + "eval_samples_per_second": 22.503, + "eval_steps_per_second": 0.357, + "step": 8300 + }, + { + "epoch": 1.61, + "learning_rate": 2.5681311569713094e-07, + "logits/chosen": -2.4661483764648438, + "logits/rejected": -2.4401745796203613, + "logps/chosen": -273.05242919921875, + "logps/rejected": -268.7724609375, + "loss": 0.1018, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.7230803966522217, + "rewards/margins": 8.528326988220215, + "rewards/rejected": -12.251407623291016, + "step": 8310 + }, + { + "epoch": 1.62, + "learning_rate": 2.5645358452577837e-07, + "logits/chosen": -2.7269883155822754, + "logits/rejected": -2.721303701400757, + "logps/chosen": -209.5245361328125, + "logps/rejected": -365.2374267578125, + "loss": 0.0911, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.265542984008789, + "rewards/margins": 8.736278533935547, + "rewards/rejected": -12.001821517944336, + "step": 8320 + }, + { + "epoch": 1.62, + "learning_rate": 2.560940533544258e-07, + "logits/chosen": -2.780273199081421, + "logits/rejected": -2.7520592212677, + "logps/chosen": -231.3140411376953, + "logps/rejected": -347.9571533203125, + "loss": 0.1173, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.4555554389953613, + "rewards/margins": 6.264697074890137, + "rewards/rejected": -9.720252990722656, + "step": 8330 + }, + { + "epoch": 1.62, + "learning_rate": 2.557345221830733e-07, + "logits/chosen": -2.7330617904663086, + "logits/rejected": -2.5406532287597656, + "logps/chosen": -267.46856689453125, + "logps/rejected": -313.81842041015625, + "loss": 0.1582, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.8921918869018555, + "rewards/margins": 7.403140068054199, + "rewards/rejected": -13.295331954956055, + "step": 8340 + }, + { + "epoch": 1.62, + "learning_rate": 2.553749910117207e-07, + "logits/chosen": -2.6633763313293457, + "logits/rejected": -2.5974395275115967, + "logps/chosen": -225.1736297607422, + "logps/rejected": -315.27972412109375, + "loss": 0.1217, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.175999402999878, + "rewards/margins": 9.365495681762695, + "rewards/rejected": -12.541496276855469, + "step": 8350 + }, + { + "epoch": 1.62, + "learning_rate": 2.5501545984036813e-07, + "logits/chosen": -2.697948694229126, + "logits/rejected": -2.769273281097412, + "logps/chosen": -217.4983673095703, + "logps/rejected": -340.5929260253906, + "loss": 0.1365, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.359081745147705, + "rewards/margins": 9.063468933105469, + "rewards/rejected": -11.422552108764648, + "step": 8360 + }, + { + "epoch": 1.62, + "learning_rate": 2.546559286690156e-07, + "logits/chosen": -2.5433297157287598, + "logits/rejected": -2.5445127487182617, + "logps/chosen": -286.13427734375, + "logps/rejected": -404.03656005859375, + "loss": 0.4144, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.150385856628418, + "rewards/margins": 7.654179573059082, + "rewards/rejected": -14.8045654296875, + "step": 8370 + }, + { + "epoch": 1.63, + "learning_rate": 2.5429639749766304e-07, + "logits/chosen": -2.6273345947265625, + "logits/rejected": -2.657918930053711, + "logps/chosen": -183.72579956054688, + "logps/rejected": -360.6436462402344, + "loss": 0.0865, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.842744827270508, + "rewards/margins": 7.5178422927856445, + "rewards/rejected": -12.360587120056152, + "step": 8380 + }, + { + "epoch": 1.63, + "learning_rate": 2.539368663263105e-07, + "logits/chosen": -2.592080593109131, + "logits/rejected": -2.5851070880889893, + "logps/chosen": -281.6587219238281, + "logps/rejected": -329.4895324707031, + "loss": 0.1224, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.482011318206787, + "rewards/margins": 11.097574234008789, + "rewards/rejected": -15.579585075378418, + "step": 8390 + }, + { + "epoch": 1.63, + "learning_rate": 2.5357733515495795e-07, + "logits/chosen": -2.502882957458496, + "logits/rejected": -2.366497755050659, + "logps/chosen": -233.24508666992188, + "logps/rejected": -336.41009521484375, + "loss": 0.1545, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.115644931793213, + "rewards/margins": 8.525575637817383, + "rewards/rejected": -13.64122200012207, + "step": 8400 + }, + { + "epoch": 1.63, + "eval_logits/chosen": -2.4826722145080566, + "eval_logits/rejected": -2.4635934829711914, + "eval_logps/chosen": -269.8938903808594, + "eval_logps/rejected": -299.3194580078125, + "eval_loss": 0.5305549502372742, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -7.565580368041992, + "eval_rewards/margins": 5.174770355224609, + "eval_rewards/rejected": -12.740350723266602, + "eval_runtime": 139.887, + "eval_samples_per_second": 22.561, + "eval_steps_per_second": 0.357, + "step": 8400 + }, + { + "epoch": 1.63, + "learning_rate": 2.532178039836054e-07, + "logits/chosen": -2.603243589401245, + "logits/rejected": -2.623236894607544, + "logps/chosen": -236.5806427001953, + "logps/rejected": -352.7832336425781, + "loss": 0.1467, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.015347480773926, + "rewards/margins": 10.90376091003418, + "rewards/rejected": -15.919107437133789, + "step": 8410 + }, + { + "epoch": 1.63, + "learning_rate": 2.528582728122528e-07, + "logits/chosen": -2.7527453899383545, + "logits/rejected": -2.450197696685791, + "logps/chosen": -339.81585693359375, + "logps/rejected": -272.74737548828125, + "loss": 0.1032, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.699526309967041, + "rewards/margins": 7.571126461029053, + "rewards/rejected": -12.270652770996094, + "step": 8420 + }, + { + "epoch": 1.64, + "learning_rate": 2.5249874164090023e-07, + "logits/chosen": -2.668938636779785, + "logits/rejected": -2.758716583251953, + "logps/chosen": -250.9716796875, + "logps/rejected": -392.6880187988281, + "loss": 0.1451, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.294054627418518, + "rewards/margins": 12.130877494812012, + "rewards/rejected": -13.424932479858398, + "step": 8430 + }, + { + "epoch": 1.64, + "learning_rate": 2.521392104695477e-07, + "logits/chosen": -2.587261199951172, + "logits/rejected": -2.597693681716919, + "logps/chosen": -298.6204833984375, + "logps/rejected": -461.45892333984375, + "loss": 0.1138, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.775132179260254, + "rewards/margins": 11.153104782104492, + "rewards/rejected": -16.928237915039062, + "step": 8440 + }, + { + "epoch": 1.64, + "learning_rate": 2.5177967929819514e-07, + "logits/chosen": -2.4586944580078125, + "logits/rejected": -2.5723679065704346, + "logps/chosen": -217.3318328857422, + "logps/rejected": -385.0566101074219, + "loss": 0.1325, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.816227912902832, + "rewards/margins": 9.050592422485352, + "rewards/rejected": -10.866819381713867, + "step": 8450 + }, + { + "epoch": 1.64, + "learning_rate": 2.5142014812684257e-07, + "logits/chosen": -2.317434549331665, + "logits/rejected": -2.284421443939209, + "logps/chosen": -255.93148803710938, + "logps/rejected": -367.16607666015625, + "loss": 0.1027, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.184701919555664, + "rewards/margins": 9.185425758361816, + "rewards/rejected": -13.37012767791748, + "step": 8460 + }, + { + "epoch": 1.64, + "learning_rate": 2.5106061695549005e-07, + "logits/chosen": -2.5617194175720215, + "logits/rejected": -2.551328659057617, + "logps/chosen": -260.8974609375, + "logps/rejected": -385.60650634765625, + "loss": 0.1505, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.0798234939575195, + "rewards/margins": 7.473949432373047, + "rewards/rejected": -11.553773880004883, + "step": 8470 + }, + { + "epoch": 1.65, + "learning_rate": 2.507010857841375e-07, + "logits/chosen": -2.66874361038208, + "logits/rejected": -2.5429797172546387, + "logps/chosen": -307.7286376953125, + "logps/rejected": -308.6953430175781, + "loss": 0.1053, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.031424045562744, + "rewards/margins": 8.793882369995117, + "rewards/rejected": -14.825304985046387, + "step": 8480 + }, + { + "epoch": 1.65, + "learning_rate": 2.5034155461278496e-07, + "logits/chosen": -2.5712246894836426, + "logits/rejected": -2.59190034866333, + "logps/chosen": -274.2309265136719, + "logps/rejected": -383.289306640625, + "loss": 0.1168, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.81260871887207, + "rewards/margins": 5.698575019836426, + "rewards/rejected": -11.511183738708496, + "step": 8490 + }, + { + "epoch": 1.65, + "learning_rate": 2.4998202344143233e-07, + "logits/chosen": -2.5047717094421387, + "logits/rejected": -2.4825732707977295, + "logps/chosen": -236.83578491210938, + "logps/rejected": -373.082763671875, + "loss": 0.1052, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.311113357543945, + "rewards/margins": 11.240933418273926, + "rewards/rejected": -19.552045822143555, + "step": 8500 + }, + { + "epoch": 1.65, + "eval_logits/chosen": -2.35133957862854, + "eval_logits/rejected": -2.327308416366577, + "eval_logps/chosen": -285.02752685546875, + "eval_logps/rejected": -317.7987365722656, + "eval_loss": 0.524840235710144, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -9.07894515991211, + "eval_rewards/margins": 5.509334564208984, + "eval_rewards/rejected": -14.588278770446777, + "eval_runtime": 140.7123, + "eval_samples_per_second": 22.429, + "eval_steps_per_second": 0.355, + "step": 8500 + }, + { + "epoch": 1.65, + "learning_rate": 2.496224922700798e-07, + "logits/chosen": -2.419813871383667, + "logits/rejected": -2.417640209197998, + "logps/chosen": -299.6729736328125, + "logps/rejected": -333.9921569824219, + "loss": 0.1381, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1953866481781006, + "rewards/margins": 9.779828071594238, + "rewards/rejected": -12.975214004516602, + "step": 8510 + }, + { + "epoch": 1.65, + "learning_rate": 2.4926296109872724e-07, + "logits/chosen": -2.5586345195770264, + "logits/rejected": -2.5299503803253174, + "logps/chosen": -187.5476837158203, + "logps/rejected": -240.4314422607422, + "loss": 0.0784, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.080376148223877, + "rewards/margins": 5.857470989227295, + "rewards/rejected": -8.937847137451172, + "step": 8520 + }, + { + "epoch": 1.66, + "learning_rate": 2.489034299273747e-07, + "logits/chosen": -2.5710880756378174, + "logits/rejected": -2.5085082054138184, + "logps/chosen": -285.5210266113281, + "logps/rejected": -348.65240478515625, + "loss": 0.1039, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.544827938079834, + "rewards/margins": 9.803266525268555, + "rewards/rejected": -14.348093032836914, + "step": 8530 + }, + { + "epoch": 1.66, + "learning_rate": 2.4854389875602215e-07, + "logits/chosen": -2.580644130706787, + "logits/rejected": -2.543076753616333, + "logps/chosen": -281.333984375, + "logps/rejected": -367.89495849609375, + "loss": 0.1258, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.154938220977783, + "rewards/margins": 11.225164413452148, + "rewards/rejected": -13.380102157592773, + "step": 8540 + }, + { + "epoch": 1.66, + "learning_rate": 2.481843675846696e-07, + "logits/chosen": -2.522185802459717, + "logits/rejected": -2.502575635910034, + "logps/chosen": -231.32388305664062, + "logps/rejected": -331.98504638671875, + "loss": 0.1144, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.6895172595977783, + "rewards/margins": 10.420839309692383, + "rewards/rejected": -14.110356330871582, + "step": 8550 + }, + { + "epoch": 1.66, + "learning_rate": 2.47824836413317e-07, + "logits/chosen": -2.552109956741333, + "logits/rejected": -2.436415433883667, + "logps/chosen": -293.69000244140625, + "logps/rejected": -320.4071350097656, + "loss": 0.1453, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.968686580657959, + "rewards/margins": 10.455901145935059, + "rewards/rejected": -18.42458724975586, + "step": 8560 + }, + { + "epoch": 1.66, + "learning_rate": 2.474653052419645e-07, + "logits/chosen": -2.4954328536987305, + "logits/rejected": -2.5189099311828613, + "logps/chosen": -264.4991149902344, + "logps/rejected": -334.30804443359375, + "loss": 0.1144, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.1249985694885254, + "rewards/margins": 8.98199462890625, + "rewards/rejected": -11.106993675231934, + "step": 8570 + }, + { + "epoch": 1.67, + "learning_rate": 2.471057740706119e-07, + "logits/chosen": -2.5667037963867188, + "logits/rejected": -2.458103895187378, + "logps/chosen": -246.67794799804688, + "logps/rejected": -301.4194030761719, + "loss": 0.1354, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.499109268188477, + "rewards/margins": 6.898039817810059, + "rewards/rejected": -11.397150039672852, + "step": 8580 + }, + { + "epoch": 1.67, + "learning_rate": 2.4674624289925934e-07, + "logits/chosen": -2.5735199451446533, + "logits/rejected": -2.528298854827881, + "logps/chosen": -211.7207794189453, + "logps/rejected": -347.9034423828125, + "loss": 0.1651, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.928305625915527, + "rewards/margins": 9.669815063476562, + "rewards/rejected": -15.598121643066406, + "step": 8590 + }, + { + "epoch": 1.67, + "learning_rate": 2.4638671172790677e-07, + "logits/chosen": -2.5571844577789307, + "logits/rejected": -2.4557769298553467, + "logps/chosen": -248.6903839111328, + "logps/rejected": -430.58935546875, + "loss": 0.1193, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.2300491333007812, + "rewards/margins": 7.920162200927734, + "rewards/rejected": -11.150211334228516, + "step": 8600 + }, + { + "epoch": 1.67, + "eval_logits/chosen": -2.3431856632232666, + "eval_logits/rejected": -2.319796085357666, + "eval_logps/chosen": -277.3157958984375, + "eval_logps/rejected": -308.3280944824219, + "eval_loss": 0.5251381397247314, + "eval_rewards/accuracies": 0.6924999952316284, + "eval_rewards/chosen": -8.30777359008789, + "eval_rewards/margins": 5.333440780639648, + "eval_rewards/rejected": -13.641214370727539, + "eval_runtime": 139.8038, + "eval_samples_per_second": 22.574, + "eval_steps_per_second": 0.358, + "step": 8600 + }, + { + "epoch": 1.67, + "learning_rate": 2.4602718055655425e-07, + "logits/chosen": -2.423828363418579, + "logits/rejected": -2.4125685691833496, + "logps/chosen": -289.61383056640625, + "logps/rejected": -350.7088928222656, + "loss": 0.0768, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.05233097076416, + "rewards/margins": 8.507286071777344, + "rewards/rejected": -15.559616088867188, + "step": 8610 + }, + { + "epoch": 1.67, + "learning_rate": 2.456676493852017e-07, + "logits/chosen": -2.4309775829315186, + "logits/rejected": -2.33675217628479, + "logps/chosen": -245.5044708251953, + "logps/rejected": -277.96368408203125, + "loss": 0.1245, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.733599662780762, + "rewards/margins": 9.675500869750977, + "rewards/rejected": -15.409098625183105, + "step": 8620 + }, + { + "epoch": 1.68, + "learning_rate": 2.4530811821384916e-07, + "logits/chosen": -2.4593870639801025, + "logits/rejected": -2.5098235607147217, + "logps/chosen": -233.9044189453125, + "logps/rejected": -321.5685119628906, + "loss": 0.1215, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.319704532623291, + "rewards/margins": 9.51965045928955, + "rewards/rejected": -15.83935546875, + "step": 8630 + }, + { + "epoch": 1.68, + "learning_rate": 2.449485870424966e-07, + "logits/chosen": -2.391671895980835, + "logits/rejected": -2.434969663619995, + "logps/chosen": -234.9580841064453, + "logps/rejected": -363.7305603027344, + "loss": 0.1435, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2724900245666504, + "rewards/margins": 10.682934761047363, + "rewards/rejected": -13.955424308776855, + "step": 8640 + }, + { + "epoch": 1.68, + "learning_rate": 2.44589055871144e-07, + "logits/chosen": -2.547769784927368, + "logits/rejected": -2.5287559032440186, + "logps/chosen": -245.45480346679688, + "logps/rejected": -263.2743835449219, + "loss": 0.1296, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.414554595947266, + "rewards/margins": 7.894643306732178, + "rewards/rejected": -14.309199333190918, + "step": 8650 + }, + { + "epoch": 1.68, + "learning_rate": 2.4422952469979144e-07, + "logits/chosen": -2.5728485584259033, + "logits/rejected": -2.563326835632324, + "logps/chosen": -254.01171875, + "logps/rejected": -323.68011474609375, + "loss": 0.0999, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.464980602264404, + "rewards/margins": 9.061437606811523, + "rewards/rejected": -14.526418685913086, + "step": 8660 + }, + { + "epoch": 1.68, + "learning_rate": 2.438699935284389e-07, + "logits/chosen": -2.5206618309020996, + "logits/rejected": -2.633418560028076, + "logps/chosen": -234.8514404296875, + "logps/rejected": -381.8434143066406, + "loss": 0.1121, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.4790866374969482, + "rewards/margins": 8.549965858459473, + "rewards/rejected": -12.029050827026367, + "step": 8670 + }, + { + "epoch": 1.69, + "learning_rate": 2.4351046235708635e-07, + "logits/chosen": -2.5908291339874268, + "logits/rejected": -2.5766384601593018, + "logps/chosen": -264.9714050292969, + "logps/rejected": -282.068359375, + "loss": 0.1595, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.4073896408081055, + "rewards/margins": 6.865915775299072, + "rewards/rejected": -13.27330493927002, + "step": 8680 + }, + { + "epoch": 1.69, + "learning_rate": 2.431509311857338e-07, + "logits/chosen": -2.554011344909668, + "logits/rejected": -2.550485849380493, + "logps/chosen": -200.98663330078125, + "logps/rejected": -279.0447692871094, + "loss": 0.0848, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.101694583892822, + "rewards/margins": 6.140630722045898, + "rewards/rejected": -12.242324829101562, + "step": 8690 + }, + { + "epoch": 1.69, + "learning_rate": 2.427914000143812e-07, + "logits/chosen": -2.650585889816284, + "logits/rejected": -2.61324143409729, + "logps/chosen": -297.4610595703125, + "logps/rejected": -302.3990783691406, + "loss": 0.143, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.797205924987793, + "rewards/margins": 10.048017501831055, + "rewards/rejected": -14.845222473144531, + "step": 8700 + }, + { + "epoch": 1.69, + "eval_logits/chosen": -2.4667084217071533, + "eval_logits/rejected": -2.452279567718506, + "eval_logps/chosen": -264.91510009765625, + "eval_logps/rejected": -290.2835693359375, + "eval_loss": 0.5170483589172363, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -7.06770133972168, + "eval_rewards/margins": 4.76905632019043, + "eval_rewards/rejected": -11.83675765991211, + "eval_runtime": 140.4727, + "eval_samples_per_second": 22.467, + "eval_steps_per_second": 0.356, + "step": 8700 + }, + { + "epoch": 1.69, + "learning_rate": 2.424318688430287e-07, + "logits/chosen": -2.446254253387451, + "logits/rejected": -2.467956781387329, + "logps/chosen": -239.3897247314453, + "logps/rejected": -371.78204345703125, + "loss": 0.1045, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7158796787261963, + "rewards/margins": 11.193151473999023, + "rewards/rejected": -13.909029960632324, + "step": 8710 + }, + { + "epoch": 1.69, + "learning_rate": 2.420723376716761e-07, + "logits/chosen": -2.509413242340088, + "logits/rejected": -2.5538456439971924, + "logps/chosen": -225.9101104736328, + "logps/rejected": -290.42205810546875, + "loss": 0.1323, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.5373482704162598, + "rewards/margins": 9.28911018371582, + "rewards/rejected": -12.826458930969238, + "step": 8720 + }, + { + "epoch": 1.69, + "learning_rate": 2.417128065003236e-07, + "logits/chosen": -2.6275618076324463, + "logits/rejected": -2.5535731315612793, + "logps/chosen": -319.5244140625, + "logps/rejected": -357.25787353515625, + "loss": 0.107, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.244515895843506, + "rewards/margins": 8.635760307312012, + "rewards/rejected": -15.880276679992676, + "step": 8730 + }, + { + "epoch": 1.7, + "learning_rate": 2.41353275328971e-07, + "logits/chosen": -2.63443660736084, + "logits/rejected": -2.674879550933838, + "logps/chosen": -318.2145080566406, + "logps/rejected": -370.57366943359375, + "loss": 0.0849, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.588968753814697, + "rewards/margins": 8.581594467163086, + "rewards/rejected": -13.170560836791992, + "step": 8740 + }, + { + "epoch": 1.7, + "learning_rate": 2.4099374415761845e-07, + "logits/chosen": -2.6127374172210693, + "logits/rejected": -2.562065601348877, + "logps/chosen": -290.62213134765625, + "logps/rejected": -313.4164733886719, + "loss": 0.1102, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.564499855041504, + "rewards/margins": 7.381847381591797, + "rewards/rejected": -12.9463472366333, + "step": 8750 + }, + { + "epoch": 1.7, + "learning_rate": 2.4063421298626593e-07, + "logits/chosen": -2.541891574859619, + "logits/rejected": -2.513563632965088, + "logps/chosen": -224.95968627929688, + "logps/rejected": -294.7865295410156, + "loss": 0.1926, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.619781494140625, + "rewards/margins": 7.007911682128906, + "rewards/rejected": -12.627693176269531, + "step": 8760 + }, + { + "epoch": 1.7, + "learning_rate": 2.4027468181491336e-07, + "logits/chosen": -2.646176338195801, + "logits/rejected": -2.579622268676758, + "logps/chosen": -337.343017578125, + "logps/rejected": -337.2195129394531, + "loss": 0.1192, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.850675106048584, + "rewards/margins": 8.994073867797852, + "rewards/rejected": -13.844747543334961, + "step": 8770 + }, + { + "epoch": 1.7, + "learning_rate": 2.399151506435608e-07, + "logits/chosen": -2.61106276512146, + "logits/rejected": -2.5591437816619873, + "logps/chosen": -253.7194366455078, + "logps/rejected": -304.6730041503906, + "loss": 0.2013, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.998692989349365, + "rewards/margins": 7.946354866027832, + "rewards/rejected": -14.945048332214355, + "step": 8780 + }, + { + "epoch": 1.71, + "learning_rate": 2.395556194722082e-07, + "logits/chosen": -2.667248487472534, + "logits/rejected": -2.6080033779144287, + "logps/chosen": -222.0634002685547, + "logps/rejected": -280.4680480957031, + "loss": 0.1192, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.22488784790039, + "rewards/margins": 6.8585405349731445, + "rewards/rejected": -15.083427429199219, + "step": 8790 + }, + { + "epoch": 1.71, + "learning_rate": 2.3919608830085564e-07, + "logits/chosen": -2.721423625946045, + "logits/rejected": -2.6756691932678223, + "logps/chosen": -273.72686767578125, + "logps/rejected": -351.1053466796875, + "loss": 0.0811, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.528761386871338, + "rewards/margins": 7.336087703704834, + "rewards/rejected": -13.864850044250488, + "step": 8800 + }, + { + "epoch": 1.71, + "eval_logits/chosen": -2.504269599914551, + "eval_logits/rejected": -2.4859654903411865, + "eval_logps/chosen": -292.2650451660156, + "eval_logps/rejected": -321.0940246582031, + "eval_loss": 0.5283924341201782, + "eval_rewards/accuracies": 0.6924999952316284, + "eval_rewards/chosen": -9.802698135375977, + "eval_rewards/margins": 5.115107536315918, + "eval_rewards/rejected": -14.917806625366211, + "eval_runtime": 140.2752, + "eval_samples_per_second": 22.499, + "eval_steps_per_second": 0.356, + "step": 8800 + }, + { + "epoch": 1.71, + "learning_rate": 2.388365571295031e-07, + "logits/chosen": -2.6268365383148193, + "logits/rejected": -2.6185123920440674, + "logps/chosen": -288.2203369140625, + "logps/rejected": -363.9134216308594, + "loss": 0.2844, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.239719867706299, + "rewards/margins": 12.082732200622559, + "rewards/rejected": -15.3224515914917, + "step": 8810 + }, + { + "epoch": 1.71, + "learning_rate": 2.3847702595815055e-07, + "logits/chosen": -2.707850694656372, + "logits/rejected": -2.728778600692749, + "logps/chosen": -280.7693176269531, + "logps/rejected": -307.51300048828125, + "loss": 0.1226, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3982996940612793, + "rewards/margins": 6.893430233001709, + "rewards/rejected": -10.291729927062988, + "step": 8820 + }, + { + "epoch": 1.71, + "learning_rate": 2.38117494786798e-07, + "logits/chosen": -2.6748437881469727, + "logits/rejected": -2.6657848358154297, + "logps/chosen": -264.40545654296875, + "logps/rejected": -331.6806640625, + "loss": 0.0864, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.236111640930176, + "rewards/margins": 10.806436538696289, + "rewards/rejected": -17.04254722595215, + "step": 8830 + }, + { + "epoch": 1.72, + "learning_rate": 2.3775796361544546e-07, + "logits/chosen": -2.6756479740142822, + "logits/rejected": -2.5985846519470215, + "logps/chosen": -303.21099853515625, + "logps/rejected": -344.70684814453125, + "loss": 0.1581, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.999774932861328, + "rewards/margins": 6.793562412261963, + "rewards/rejected": -15.79333782196045, + "step": 8840 + }, + { + "epoch": 1.72, + "learning_rate": 2.373984324440929e-07, + "logits/chosen": -2.760925769805908, + "logits/rejected": -2.647653102874756, + "logps/chosen": -256.6393127441406, + "logps/rejected": -352.3446044921875, + "loss": 0.112, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.275694847106934, + "rewards/margins": 12.48613166809082, + "rewards/rejected": -17.761825561523438, + "step": 8850 + }, + { + "epoch": 1.72, + "learning_rate": 2.3703890127274034e-07, + "logits/chosen": -2.7134649753570557, + "logits/rejected": -2.698920965194702, + "logps/chosen": -300.4090270996094, + "logps/rejected": -319.36651611328125, + "loss": 0.1112, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.171463966369629, + "rewards/margins": 9.919939994812012, + "rewards/rejected": -13.091404914855957, + "step": 8860 + }, + { + "epoch": 1.72, + "learning_rate": 2.3667937010138777e-07, + "logits/chosen": -2.5016653537750244, + "logits/rejected": -2.4401469230651855, + "logps/chosen": -236.34375, + "logps/rejected": -323.60693359375, + "loss": 0.133, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.820054054260254, + "rewards/margins": 8.825299263000488, + "rewards/rejected": -14.645350456237793, + "step": 8870 + }, + { + "epoch": 1.72, + "learning_rate": 2.3631983893003522e-07, + "logits/chosen": -2.625457525253296, + "logits/rejected": -2.563734531402588, + "logps/chosen": -250.7583770751953, + "logps/rejected": -246.5982208251953, + "loss": 0.1607, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.669973850250244, + "rewards/margins": 4.5254669189453125, + "rewards/rejected": -10.195440292358398, + "step": 8880 + }, + { + "epoch": 1.73, + "learning_rate": 2.3596030775868268e-07, + "logits/chosen": -2.6635663509368896, + "logits/rejected": -2.6659483909606934, + "logps/chosen": -194.7909393310547, + "logps/rejected": -316.8825988769531, + "loss": 0.1038, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.9440155029296875, + "rewards/margins": 9.397473335266113, + "rewards/rejected": -14.341486930847168, + "step": 8890 + }, + { + "epoch": 1.73, + "learning_rate": 2.356007765873301e-07, + "logits/chosen": -2.380208730697632, + "logits/rejected": -2.390423059463501, + "logps/chosen": -193.83676147460938, + "logps/rejected": -259.2057800292969, + "loss": 0.1453, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.619728088378906, + "rewards/margins": 6.362164497375488, + "rewards/rejected": -12.981892585754395, + "step": 8900 + }, + { + "epoch": 1.73, + "eval_logits/chosen": -2.482853651046753, + "eval_logits/rejected": -2.468625068664551, + "eval_logps/chosen": -285.21710205078125, + "eval_logps/rejected": -311.3193054199219, + "eval_loss": 0.5207270979881287, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": -9.097904205322266, + "eval_rewards/margins": 4.842432022094727, + "eval_rewards/rejected": -13.940337181091309, + "eval_runtime": 153.9923, + "eval_samples_per_second": 20.495, + "eval_steps_per_second": 0.325, + "step": 8900 + }, + { + "epoch": 1.73, + "learning_rate": 2.3524124541597756e-07, + "logits/chosen": -2.5679116249084473, + "logits/rejected": -2.516472101211548, + "logps/chosen": -352.97833251953125, + "logps/rejected": -322.7197265625, + "loss": 0.144, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5343475341796875, + "rewards/margins": 6.903378486633301, + "rewards/rejected": -9.437726020812988, + "step": 8910 + }, + { + "epoch": 1.73, + "learning_rate": 2.34881714244625e-07, + "logits/chosen": -2.4197185039520264, + "logits/rejected": -2.461596965789795, + "logps/chosen": -305.5779113769531, + "logps/rejected": -395.81695556640625, + "loss": 0.1581, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.8303463459014893, + "rewards/margins": 12.583988189697266, + "rewards/rejected": -16.414335250854492, + "step": 8920 + }, + { + "epoch": 1.73, + "learning_rate": 2.3452218307327242e-07, + "logits/chosen": -2.723632335662842, + "logits/rejected": -2.62469220161438, + "logps/chosen": -285.1393127441406, + "logps/rejected": -288.421630859375, + "loss": 0.1197, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.8004610538482666, + "rewards/margins": 7.442433834075928, + "rewards/rejected": -10.242895126342773, + "step": 8930 + }, + { + "epoch": 1.74, + "learning_rate": 2.341626519019199e-07, + "logits/chosen": -2.506471633911133, + "logits/rejected": -2.493252992630005, + "logps/chosen": -277.4583435058594, + "logps/rejected": -288.23663330078125, + "loss": 0.1401, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.454425811767578, + "rewards/margins": 7.775014400482178, + "rewards/rejected": -13.229438781738281, + "step": 8940 + }, + { + "epoch": 1.74, + "learning_rate": 2.3380312073056732e-07, + "logits/chosen": -2.4613587856292725, + "logits/rejected": -2.4588119983673096, + "logps/chosen": -264.9566650390625, + "logps/rejected": -396.5501403808594, + "loss": 0.2337, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.629644393920898, + "rewards/margins": 7.023741722106934, + "rewards/rejected": -15.653387069702148, + "step": 8950 + }, + { + "epoch": 1.74, + "learning_rate": 2.3344358955921478e-07, + "logits/chosen": -2.557526111602783, + "logits/rejected": -2.6252875328063965, + "logps/chosen": -323.6629943847656, + "logps/rejected": -381.2836608886719, + "loss": 0.1001, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.549731731414795, + "rewards/margins": 8.749812126159668, + "rewards/rejected": -12.299544334411621, + "step": 8960 + }, + { + "epoch": 1.74, + "learning_rate": 2.330840583878622e-07, + "logits/chosen": -2.613024950027466, + "logits/rejected": -2.554842233657837, + "logps/chosen": -225.9105224609375, + "logps/rejected": -357.39404296875, + "loss": 0.0843, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.603348731994629, + "rewards/margins": 10.769676208496094, + "rewards/rejected": -18.373023986816406, + "step": 8970 + }, + { + "epoch": 1.74, + "learning_rate": 2.3272452721650963e-07, + "logits/chosen": -2.527435064315796, + "logits/rejected": -2.551858425140381, + "logps/chosen": -284.74737548828125, + "logps/rejected": -255.0273895263672, + "loss": 0.1495, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.3346686363220215, + "rewards/margins": 6.159907341003418, + "rewards/rejected": -10.494577407836914, + "step": 8980 + }, + { + "epoch": 1.75, + "learning_rate": 2.3236499604515712e-07, + "logits/chosen": -2.518888235092163, + "logits/rejected": -2.5484490394592285, + "logps/chosen": -253.7728271484375, + "logps/rejected": -293.1611633300781, + "loss": 0.1164, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.769311428070068, + "rewards/margins": 6.031195640563965, + "rewards/rejected": -13.800506591796875, + "step": 8990 + }, + { + "epoch": 1.75, + "learning_rate": 2.3200546487380454e-07, + "logits/chosen": -2.5659279823303223, + "logits/rejected": -2.503180980682373, + "logps/chosen": -260.3642883300781, + "logps/rejected": -317.8871154785156, + "loss": 0.1157, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4114089012145996, + "rewards/margins": 8.762961387634277, + "rewards/rejected": -12.174372673034668, + "step": 9000 + }, + { + "epoch": 1.75, + "eval_logits/chosen": -2.459453582763672, + "eval_logits/rejected": -2.444923162460327, + "eval_logps/chosen": -277.1577453613281, + "eval_logps/rejected": -306.0013122558594, + "eval_loss": 0.5219169855117798, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": -8.291966438293457, + "eval_rewards/margins": 5.116571426391602, + "eval_rewards/rejected": -13.408538818359375, + "eval_runtime": 140.7558, + "eval_samples_per_second": 22.422, + "eval_steps_per_second": 0.355, + "step": 9000 + }, + { + "epoch": 1.75, + "learning_rate": 2.31645933702452e-07, + "logits/chosen": -2.5288453102111816, + "logits/rejected": -2.3973660469055176, + "logps/chosen": -316.841796875, + "logps/rejected": -333.89495849609375, + "loss": 0.3495, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.4621787071228027, + "rewards/margins": 8.068937301635742, + "rewards/rejected": -11.531115531921387, + "step": 9010 + }, + { + "epoch": 1.75, + "learning_rate": 2.3128640253109942e-07, + "logits/chosen": -2.419325351715088, + "logits/rejected": -2.4747190475463867, + "logps/chosen": -333.5755310058594, + "logps/rejected": -325.43975830078125, + "loss": 0.1293, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.0763964653015137, + "rewards/margins": 8.46824836730957, + "rewards/rejected": -10.54464340209961, + "step": 9020 + }, + { + "epoch": 1.75, + "learning_rate": 2.3092687135974688e-07, + "logits/chosen": -2.6578516960144043, + "logits/rejected": -2.6606476306915283, + "logps/chosen": -289.87298583984375, + "logps/rejected": -370.78326416015625, + "loss": 0.1542, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.908090114593506, + "rewards/margins": 7.784598350524902, + "rewards/rejected": -10.692689895629883, + "step": 9030 + }, + { + "epoch": 1.75, + "learning_rate": 2.3056734018839433e-07, + "logits/chosen": -2.6555745601654053, + "logits/rejected": -2.5985846519470215, + "logps/chosen": -253.746337890625, + "logps/rejected": -312.53558349609375, + "loss": 0.1139, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.292423963546753, + "rewards/margins": 8.211528778076172, + "rewards/rejected": -11.503952980041504, + "step": 9040 + }, + { + "epoch": 1.76, + "learning_rate": 2.3020780901704176e-07, + "logits/chosen": -2.5273990631103516, + "logits/rejected": -2.5261635780334473, + "logps/chosen": -223.68917846679688, + "logps/rejected": -300.7638854980469, + "loss": 0.1014, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.830761671066284, + "rewards/margins": 7.406530857086182, + "rewards/rejected": -10.23729133605957, + "step": 9050 + }, + { + "epoch": 1.76, + "learning_rate": 2.2984827784568922e-07, + "logits/chosen": -2.6696174144744873, + "logits/rejected": -2.667482852935791, + "logps/chosen": -297.14703369140625, + "logps/rejected": -329.94586181640625, + "loss": 0.1323, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.836264133453369, + "rewards/margins": 7.174398899078369, + "rewards/rejected": -11.010663032531738, + "step": 9060 + }, + { + "epoch": 1.76, + "learning_rate": 2.2948874667433664e-07, + "logits/chosen": -2.443032741546631, + "logits/rejected": -2.4770193099975586, + "logps/chosen": -253.74658203125, + "logps/rejected": -328.1571044921875, + "loss": 0.0914, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.8514180183410645, + "rewards/margins": 10.583091735839844, + "rewards/rejected": -14.43450927734375, + "step": 9070 + }, + { + "epoch": 1.76, + "learning_rate": 2.291292155029841e-07, + "logits/chosen": -2.50797438621521, + "logits/rejected": -2.493114471435547, + "logps/chosen": -247.9369659423828, + "logps/rejected": -365.7491149902344, + "loss": 0.1325, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.651138305664062, + "rewards/margins": 11.686362266540527, + "rewards/rejected": -22.33749771118164, + "step": 9080 + }, + { + "epoch": 1.76, + "learning_rate": 2.2876968433163155e-07, + "logits/chosen": -2.4330861568450928, + "logits/rejected": -2.5481677055358887, + "logps/chosen": -366.48602294921875, + "logps/rejected": -407.94866943359375, + "loss": 0.1276, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.245977878570557, + "rewards/margins": 11.331197738647461, + "rewards/rejected": -15.577176094055176, + "step": 9090 + }, + { + "epoch": 1.77, + "learning_rate": 2.2841015316027898e-07, + "logits/chosen": -2.631187915802002, + "logits/rejected": -2.624697208404541, + "logps/chosen": -227.9707794189453, + "logps/rejected": -370.03076171875, + "loss": 0.127, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4985663890838623, + "rewards/margins": 10.457371711730957, + "rewards/rejected": -13.955938339233398, + "step": 9100 + }, + { + "epoch": 1.77, + "eval_logits/chosen": -2.4831197261810303, + "eval_logits/rejected": -2.4681098461151123, + "eval_logps/chosen": -264.125244140625, + "eval_logps/rejected": -287.5068054199219, + "eval_loss": 0.5275867581367493, + "eval_rewards/accuracies": 0.6825000047683716, + "eval_rewards/chosen": -6.988717079162598, + "eval_rewards/margins": 4.57036828994751, + "eval_rewards/rejected": -11.55908489227295, + "eval_runtime": 141.2338, + "eval_samples_per_second": 22.346, + "eval_steps_per_second": 0.354, + "step": 9100 + }, + { + "epoch": 1.77, + "learning_rate": 2.2805062198892643e-07, + "logits/chosen": -2.7433934211730957, + "logits/rejected": -2.7377352714538574, + "logps/chosen": -342.8365783691406, + "logps/rejected": -389.03778076171875, + "loss": 0.1139, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9598255157470703, + "rewards/margins": 7.6363725662231445, + "rewards/rejected": -9.596197128295898, + "step": 9110 + }, + { + "epoch": 1.77, + "learning_rate": 2.2769109081757386e-07, + "logits/chosen": -2.4912314414978027, + "logits/rejected": -2.461418628692627, + "logps/chosen": -225.4840850830078, + "logps/rejected": -245.8949737548828, + "loss": 0.1185, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.032782554626465, + "rewards/margins": 7.881567478179932, + "rewards/rejected": -11.914348602294922, + "step": 9120 + }, + { + "epoch": 1.77, + "learning_rate": 2.2733155964622132e-07, + "logits/chosen": -2.5779995918273926, + "logits/rejected": -2.5387330055236816, + "logps/chosen": -267.99346923828125, + "logps/rejected": -311.9544677734375, + "loss": 0.1166, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.670339345932007, + "rewards/margins": 7.5002593994140625, + "rewards/rejected": -10.170599937438965, + "step": 9130 + }, + { + "epoch": 1.77, + "learning_rate": 2.2697202847486877e-07, + "logits/chosen": -2.5345051288604736, + "logits/rejected": -2.4705216884613037, + "logps/chosen": -236.589111328125, + "logps/rejected": -355.2351989746094, + "loss": 0.2438, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7309606075286865, + "rewards/margins": 8.962685585021973, + "rewards/rejected": -10.693646430969238, + "step": 9140 + }, + { + "epoch": 1.78, + "learning_rate": 2.266124973035162e-07, + "logits/chosen": -2.480380058288574, + "logits/rejected": -2.5399022102355957, + "logps/chosen": -207.9787139892578, + "logps/rejected": -307.7817687988281, + "loss": 0.1411, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.452308654785156, + "rewards/margins": 9.30238151550293, + "rewards/rejected": -15.754690170288086, + "step": 9150 + }, + { + "epoch": 1.78, + "learning_rate": 2.2625296613216365e-07, + "logits/chosen": -2.696650981903076, + "logits/rejected": -2.574568510055542, + "logps/chosen": -278.69677734375, + "logps/rejected": -227.86477661132812, + "loss": 0.4146, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0961384773254395, + "rewards/margins": 6.557276725769043, + "rewards/rejected": -8.653414726257324, + "step": 9160 + }, + { + "epoch": 1.78, + "learning_rate": 2.2589343496081108e-07, + "logits/chosen": -2.5836429595947266, + "logits/rejected": -2.53010892868042, + "logps/chosen": -231.33139038085938, + "logps/rejected": -253.4978790283203, + "loss": 0.1333, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.0301594734191895, + "rewards/margins": 5.411094665527344, + "rewards/rejected": -9.441255569458008, + "step": 9170 + }, + { + "epoch": 1.78, + "learning_rate": 2.2553390378945853e-07, + "logits/chosen": -2.489360809326172, + "logits/rejected": -2.5844199657440186, + "logps/chosen": -304.05560302734375, + "logps/rejected": -350.4225769042969, + "loss": 0.0862, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2749789953231812, + "rewards/margins": 8.532282829284668, + "rewards/rejected": -9.807263374328613, + "step": 9180 + }, + { + "epoch": 1.78, + "learning_rate": 2.25174372618106e-07, + "logits/chosen": -2.764516592025757, + "logits/rejected": -2.6930012702941895, + "logps/chosen": -301.46575927734375, + "logps/rejected": -315.3195495605469, + "loss": 0.1086, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.600867748260498, + "rewards/margins": 6.2614426612854, + "rewards/rejected": -11.862309455871582, + "step": 9190 + }, + { + "epoch": 1.79, + "learning_rate": 2.2481484144675342e-07, + "logits/chosen": -2.718442916870117, + "logits/rejected": -2.6158175468444824, + "logps/chosen": -254.6276092529297, + "logps/rejected": -318.2427062988281, + "loss": 0.0787, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.4615302085876465, + "rewards/margins": 7.8550310134887695, + "rewards/rejected": -11.316560745239258, + "step": 9200 + }, + { + "epoch": 1.79, + "eval_logits/chosen": -2.4761788845062256, + "eval_logits/rejected": -2.4595890045166016, + "eval_logps/chosen": -261.3131103515625, + "eval_logps/rejected": -284.68475341796875, + "eval_loss": 0.536875307559967, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -6.707505702972412, + "eval_rewards/margins": 4.569375514984131, + "eval_rewards/rejected": -11.276881217956543, + "eval_runtime": 140.9062, + "eval_samples_per_second": 22.398, + "eval_steps_per_second": 0.355, + "step": 9200 + }, + { + "epoch": 1.79, + "learning_rate": 2.2445531027540087e-07, + "logits/chosen": -2.5662033557891846, + "logits/rejected": -2.4942879676818848, + "logps/chosen": -229.77230834960938, + "logps/rejected": -343.25115966796875, + "loss": 0.1329, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7950007915496826, + "rewards/margins": 10.938156127929688, + "rewards/rejected": -13.73315715789795, + "step": 9210 + }, + { + "epoch": 1.79, + "learning_rate": 2.240957791040483e-07, + "logits/chosen": -2.6636500358581543, + "logits/rejected": -2.5954031944274902, + "logps/chosen": -263.62969970703125, + "logps/rejected": -307.91949462890625, + "loss": 0.1208, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.333632707595825, + "rewards/margins": 10.105422019958496, + "rewards/rejected": -12.439054489135742, + "step": 9220 + }, + { + "epoch": 1.79, + "learning_rate": 2.2373624793269575e-07, + "logits/chosen": -2.6744582653045654, + "logits/rejected": -2.6738028526306152, + "logps/chosen": -230.3915252685547, + "logps/rejected": -297.47552490234375, + "loss": 0.1278, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.508617401123047, + "rewards/margins": 9.609359741210938, + "rewards/rejected": -13.117976188659668, + "step": 9230 + }, + { + "epoch": 1.79, + "learning_rate": 2.233767167613432e-07, + "logits/chosen": -2.5466504096984863, + "logits/rejected": -2.588721990585327, + "logps/chosen": -353.01593017578125, + "logps/rejected": -485.99517822265625, + "loss": 0.1068, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1640651226043701, + "rewards/margins": 10.018106460571289, + "rewards/rejected": -11.182169914245605, + "step": 9240 + }, + { + "epoch": 1.8, + "learning_rate": 2.2301718558999064e-07, + "logits/chosen": -2.6759397983551025, + "logits/rejected": -2.6947224140167236, + "logps/chosen": -233.6916961669922, + "logps/rejected": -310.08245849609375, + "loss": 0.2103, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.124452114105225, + "rewards/margins": 6.477786064147949, + "rewards/rejected": -11.602239608764648, + "step": 9250 + }, + { + "epoch": 1.8, + "learning_rate": 2.226576544186381e-07, + "logits/chosen": -2.5456278324127197, + "logits/rejected": -2.5684640407562256, + "logps/chosen": -268.0722351074219, + "logps/rejected": -321.04791259765625, + "loss": 0.2316, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.562939643859863, + "rewards/margins": 5.708805561065674, + "rewards/rejected": -13.271745681762695, + "step": 9260 + }, + { + "epoch": 1.8, + "learning_rate": 2.2229812324728552e-07, + "logits/chosen": -2.2930455207824707, + "logits/rejected": -2.3209643363952637, + "logps/chosen": -314.3650207519531, + "logps/rejected": -317.35711669921875, + "loss": 0.1218, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.544978141784668, + "rewards/margins": 7.295835971832275, + "rewards/rejected": -11.840815544128418, + "step": 9270 + }, + { + "epoch": 1.8, + "learning_rate": 2.2193859207593297e-07, + "logits/chosen": -2.383695125579834, + "logits/rejected": -2.3838589191436768, + "logps/chosen": -195.61062622070312, + "logps/rejected": -287.2439270019531, + "loss": 0.1056, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.162716388702393, + "rewards/margins": 8.17154312133789, + "rewards/rejected": -12.334260940551758, + "step": 9280 + }, + { + "epoch": 1.8, + "learning_rate": 2.2157906090458043e-07, + "logits/chosen": -2.4942522048950195, + "logits/rejected": -2.4862399101257324, + "logps/chosen": -200.2251739501953, + "logps/rejected": -267.40716552734375, + "loss": 0.0855, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.627869606018066, + "rewards/margins": 9.27051067352295, + "rewards/rejected": -13.8983793258667, + "step": 9290 + }, + { + "epoch": 1.81, + "learning_rate": 2.2121952973322785e-07, + "logits/chosen": -2.615295886993408, + "logits/rejected": -2.617884874343872, + "logps/chosen": -365.4449157714844, + "logps/rejected": -340.91302490234375, + "loss": 0.1575, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.824907302856445, + "rewards/margins": 9.447416305541992, + "rewards/rejected": -14.272321701049805, + "step": 9300 + }, + { + "epoch": 1.81, + "eval_logits/chosen": -2.454586982727051, + "eval_logits/rejected": -2.4321353435516357, + "eval_logps/chosen": -279.1459655761719, + "eval_logps/rejected": -309.0434265136719, + "eval_loss": 0.5330539345741272, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -8.490789413452148, + "eval_rewards/margins": 5.221959114074707, + "eval_rewards/rejected": -13.712749481201172, + "eval_runtime": 141.0777, + "eval_samples_per_second": 22.371, + "eval_steps_per_second": 0.354, + "step": 9300 + }, + { + "epoch": 1.81, + "learning_rate": 2.208599985618753e-07, + "logits/chosen": -2.5942773818969727, + "logits/rejected": -2.5127804279327393, + "logps/chosen": -290.8653869628906, + "logps/rejected": -421.36181640625, + "loss": 0.1396, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.269815921783447, + "rewards/margins": 9.438082695007324, + "rewards/rejected": -13.70789909362793, + "step": 9310 + }, + { + "epoch": 1.81, + "learning_rate": 2.2050046739052274e-07, + "logits/chosen": -2.6629788875579834, + "logits/rejected": -2.5936431884765625, + "logps/chosen": -304.45794677734375, + "logps/rejected": -352.243408203125, + "loss": 0.1006, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.734714031219482, + "rewards/margins": 6.613863945007324, + "rewards/rejected": -12.348577499389648, + "step": 9320 + }, + { + "epoch": 1.81, + "learning_rate": 2.201409362191702e-07, + "logits/chosen": -2.6832408905029297, + "logits/rejected": -2.598360538482666, + "logps/chosen": -201.95150756835938, + "logps/rejected": -251.3197784423828, + "loss": 0.1827, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.0557456016540527, + "rewards/margins": 7.982450008392334, + "rewards/rejected": -10.03819465637207, + "step": 9330 + }, + { + "epoch": 1.81, + "learning_rate": 2.1978140504781764e-07, + "logits/chosen": -2.6649320125579834, + "logits/rejected": -2.561836004257202, + "logps/chosen": -268.05938720703125, + "logps/rejected": -355.09478759765625, + "loss": 0.1038, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.3565566539764404, + "rewards/margins": 10.691267013549805, + "rewards/rejected": -14.047823905944824, + "step": 9340 + }, + { + "epoch": 1.82, + "learning_rate": 2.1942187387646507e-07, + "logits/chosen": -2.7730791568756104, + "logits/rejected": -2.7454609870910645, + "logps/chosen": -305.49542236328125, + "logps/rejected": -362.4857482910156, + "loss": 0.1374, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.476459503173828, + "rewards/margins": 6.851313591003418, + "rewards/rejected": -14.327774047851562, + "step": 9350 + }, + { + "epoch": 1.82, + "learning_rate": 2.1906234270511253e-07, + "logits/chosen": -2.673074960708618, + "logits/rejected": -2.6720142364501953, + "logps/chosen": -225.03109741210938, + "logps/rejected": -293.6507873535156, + "loss": 0.1846, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.60236120223999, + "rewards/margins": 8.857532501220703, + "rewards/rejected": -13.459895133972168, + "step": 9360 + }, + { + "epoch": 1.82, + "learning_rate": 2.1870281153375995e-07, + "logits/chosen": -2.6925947666168213, + "logits/rejected": -2.74192476272583, + "logps/chosen": -205.503662109375, + "logps/rejected": -337.00311279296875, + "loss": 0.1988, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.798457145690918, + "rewards/margins": 9.184468269348145, + "rewards/rejected": -14.982925415039062, + "step": 9370 + }, + { + "epoch": 1.82, + "learning_rate": 2.183432803624074e-07, + "logits/chosen": -2.6186959743499756, + "logits/rejected": -2.630443811416626, + "logps/chosen": -176.66262817382812, + "logps/rejected": -272.9822082519531, + "loss": 0.136, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.631664752960205, + "rewards/margins": 7.2577362060546875, + "rewards/rejected": -11.889402389526367, + "step": 9380 + }, + { + "epoch": 1.82, + "learning_rate": 2.1798374919105486e-07, + "logits/chosen": -2.7789523601531982, + "logits/rejected": -2.6312663555145264, + "logps/chosen": -265.19976806640625, + "logps/rejected": -324.28240966796875, + "loss": 0.1166, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.802310943603516, + "rewards/margins": 10.411230087280273, + "rewards/rejected": -16.213542938232422, + "step": 9390 + }, + { + "epoch": 1.82, + "learning_rate": 2.176242180197023e-07, + "logits/chosen": -2.690073013305664, + "logits/rejected": -2.593142032623291, + "logps/chosen": -289.4481506347656, + "logps/rejected": -367.05633544921875, + "loss": 0.1627, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.602388381958008, + "rewards/margins": 9.026971817016602, + "rewards/rejected": -11.62936019897461, + "step": 9400 + }, + { + "epoch": 1.82, + "eval_logits/chosen": -2.5830817222595215, + "eval_logits/rejected": -2.5688726902008057, + "eval_logps/chosen": -262.60369873046875, + "eval_logps/rejected": -280.9705505371094, + "eval_loss": 0.5199735760688782, + "eval_rewards/accuracies": 0.7124999761581421, + "eval_rewards/chosen": -6.836562633514404, + "eval_rewards/margins": 4.068894863128662, + "eval_rewards/rejected": -10.90545654296875, + "eval_runtime": 141.1429, + "eval_samples_per_second": 22.36, + "eval_steps_per_second": 0.354, + "step": 9400 + }, + { + "epoch": 1.83, + "learning_rate": 2.1726468684834975e-07, + "logits/chosen": -2.6631975173950195, + "logits/rejected": -2.687513828277588, + "logps/chosen": -282.0251770019531, + "logps/rejected": -295.54705810546875, + "loss": 0.1117, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.567460536956787, + "rewards/margins": 6.210967063903809, + "rewards/rejected": -10.778428077697754, + "step": 9410 + }, + { + "epoch": 1.83, + "learning_rate": 2.169051556769972e-07, + "logits/chosen": -2.784729480743408, + "logits/rejected": -2.7750000953674316, + "logps/chosen": -250.9359588623047, + "logps/rejected": -264.49383544921875, + "loss": 0.1826, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.9784743785858154, + "rewards/margins": 4.458198070526123, + "rewards/rejected": -8.43667221069336, + "step": 9420 + }, + { + "epoch": 1.83, + "learning_rate": 2.1654562450564463e-07, + "logits/chosen": -2.7793118953704834, + "logits/rejected": -2.763577699661255, + "logps/chosen": -273.861328125, + "logps/rejected": -327.62371826171875, + "loss": 0.1557, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.666604995727539, + "rewards/margins": 9.303274154663086, + "rewards/rejected": -14.969879150390625, + "step": 9430 + }, + { + "epoch": 1.83, + "learning_rate": 2.1618609333429208e-07, + "logits/chosen": -2.69822096824646, + "logits/rejected": -2.668501615524292, + "logps/chosen": -205.9537353515625, + "logps/rejected": -339.89825439453125, + "loss": 0.1057, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.004425287246704, + "rewards/margins": 8.063764572143555, + "rewards/rejected": -11.06818962097168, + "step": 9440 + }, + { + "epoch": 1.83, + "learning_rate": 2.158265621629395e-07, + "logits/chosen": -2.7364261150360107, + "logits/rejected": -2.682142734527588, + "logps/chosen": -221.7036895751953, + "logps/rejected": -276.8183288574219, + "loss": 0.1132, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9641478657722473, + "rewards/margins": 8.006423950195312, + "rewards/rejected": -8.970571517944336, + "step": 9450 + }, + { + "epoch": 1.84, + "learning_rate": 2.1546703099158696e-07, + "logits/chosen": -2.700155735015869, + "logits/rejected": -2.825791120529175, + "logps/chosen": -209.17172241210938, + "logps/rejected": -375.6164855957031, + "loss": 0.1104, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.204814910888672, + "rewards/margins": 10.476545333862305, + "rewards/rejected": -14.681361198425293, + "step": 9460 + }, + { + "epoch": 1.84, + "learning_rate": 2.1510749982023442e-07, + "logits/chosen": -2.7599854469299316, + "logits/rejected": -2.822727680206299, + "logps/chosen": -255.29598999023438, + "logps/rejected": -259.87677001953125, + "loss": 0.1404, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.822667121887207, + "rewards/margins": 4.506711006164551, + "rewards/rejected": -12.329377174377441, + "step": 9470 + }, + { + "epoch": 1.84, + "learning_rate": 2.1474796864888185e-07, + "logits/chosen": -2.6040995121002197, + "logits/rejected": -2.551703453063965, + "logps/chosen": -250.2332763671875, + "logps/rejected": -292.47357177734375, + "loss": 0.1522, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.464228868484497, + "rewards/margins": 8.481760025024414, + "rewards/rejected": -10.945989608764648, + "step": 9480 + }, + { + "epoch": 1.84, + "learning_rate": 2.143884374775293e-07, + "logits/chosen": -2.8416645526885986, + "logits/rejected": -2.829864025115967, + "logps/chosen": -235.0521240234375, + "logps/rejected": -323.4149475097656, + "loss": 0.1556, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0735909938812256, + "rewards/margins": 7.334481239318848, + "rewards/rejected": -9.408071517944336, + "step": 9490 + }, + { + "epoch": 1.84, + "learning_rate": 2.1402890630617673e-07, + "logits/chosen": -2.5075650215148926, + "logits/rejected": -2.4215004444122314, + "logps/chosen": -199.2357177734375, + "logps/rejected": -283.7659606933594, + "loss": 0.1334, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.8971378803253174, + "rewards/margins": 7.249490261077881, + "rewards/rejected": -11.146627426147461, + "step": 9500 + }, + { + "epoch": 1.84, + "eval_logits/chosen": -2.616497755050659, + "eval_logits/rejected": -2.602832555770874, + "eval_logps/chosen": -269.4985046386719, + "eval_logps/rejected": -290.15087890625, + "eval_loss": 0.514388382434845, + "eval_rewards/accuracies": 0.7149999737739563, + "eval_rewards/chosen": -7.526042461395264, + "eval_rewards/margins": 4.297450542449951, + "eval_rewards/rejected": -11.823493957519531, + "eval_runtime": 139.5543, + "eval_samples_per_second": 22.615, + "eval_steps_per_second": 0.358, + "step": 9500 + }, + { + "epoch": 1.85, + "learning_rate": 2.1366937513482418e-07, + "logits/chosen": -2.7919929027557373, + "logits/rejected": -2.637633800506592, + "logps/chosen": -305.61639404296875, + "logps/rejected": -302.77288818359375, + "loss": 0.0932, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.2936458587646484, + "rewards/margins": 10.195398330688477, + "rewards/rejected": -13.489044189453125, + "step": 9510 + }, + { + "epoch": 1.85, + "learning_rate": 2.1330984396347164e-07, + "logits/chosen": -2.773404598236084, + "logits/rejected": -2.8854422569274902, + "logps/chosen": -225.18533325195312, + "logps/rejected": -267.5545959472656, + "loss": 0.1145, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.022705554962158, + "rewards/margins": 7.393181800842285, + "rewards/rejected": -10.415887832641602, + "step": 9520 + }, + { + "epoch": 1.85, + "learning_rate": 2.1295031279211906e-07, + "logits/chosen": -2.7786660194396973, + "logits/rejected": -2.717477798461914, + "logps/chosen": -301.073974609375, + "logps/rejected": -346.15179443359375, + "loss": 0.0983, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.636516571044922, + "rewards/margins": 9.894160270690918, + "rewards/rejected": -13.530677795410156, + "step": 9530 + }, + { + "epoch": 1.85, + "learning_rate": 2.1259078162076652e-07, + "logits/chosen": -2.829789400100708, + "logits/rejected": -2.7225544452667236, + "logps/chosen": -307.1126708984375, + "logps/rejected": -363.5691833496094, + "loss": 0.2078, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.9343395233154297, + "rewards/margins": 7.367140293121338, + "rewards/rejected": -11.30147933959961, + "step": 9540 + }, + { + "epoch": 1.85, + "learning_rate": 2.1223125044941395e-07, + "logits/chosen": -2.7844507694244385, + "logits/rejected": -2.661411762237549, + "logps/chosen": -244.58200073242188, + "logps/rejected": -230.42041015625, + "loss": 0.1608, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.329751491546631, + "rewards/margins": 6.7721452713012695, + "rewards/rejected": -12.101898193359375, + "step": 9550 + }, + { + "epoch": 1.86, + "learning_rate": 2.118717192780614e-07, + "logits/chosen": -2.7518794536590576, + "logits/rejected": -2.81085467338562, + "logps/chosen": -258.4638366699219, + "logps/rejected": -396.66937255859375, + "loss": 0.1225, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.041602611541748, + "rewards/margins": 10.739147186279297, + "rewards/rejected": -14.780749320983887, + "step": 9560 + }, + { + "epoch": 1.86, + "learning_rate": 2.1151218810670886e-07, + "logits/chosen": -2.723557472229004, + "logits/rejected": -2.7834010124206543, + "logps/chosen": -274.72003173828125, + "logps/rejected": -354.28778076171875, + "loss": 0.121, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.4575653076171875, + "rewards/margins": 8.722177505493164, + "rewards/rejected": -12.179742813110352, + "step": 9570 + }, + { + "epoch": 1.86, + "learning_rate": 2.1115265693535628e-07, + "logits/chosen": -2.670217514038086, + "logits/rejected": -2.5735363960266113, + "logps/chosen": -358.3692321777344, + "logps/rejected": -330.4654235839844, + "loss": 0.1248, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.361757278442383, + "rewards/margins": 8.804426193237305, + "rewards/rejected": -12.16618537902832, + "step": 9580 + }, + { + "epoch": 1.86, + "learning_rate": 2.1079312576400374e-07, + "logits/chosen": -2.7584645748138428, + "logits/rejected": -2.708832263946533, + "logps/chosen": -234.52932739257812, + "logps/rejected": -283.8543395996094, + "loss": 0.085, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4932403564453125, + "rewards/margins": 5.7663373947143555, + "rewards/rejected": -9.259577751159668, + "step": 9590 + }, + { + "epoch": 1.86, + "learning_rate": 2.1043359459265117e-07, + "logits/chosen": -2.7580108642578125, + "logits/rejected": -2.601715564727783, + "logps/chosen": -247.44192504882812, + "logps/rejected": -321.3501892089844, + "loss": 0.1662, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.507093906402588, + "rewards/margins": 6.213970184326172, + "rewards/rejected": -12.721063613891602, + "step": 9600 + }, + { + "epoch": 1.86, + "eval_logits/chosen": -2.5207648277282715, + "eval_logits/rejected": -2.5049188137054443, + "eval_logps/chosen": -266.2056884765625, + "eval_logps/rejected": -289.34429931640625, + "eval_loss": 0.5175375938415527, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -7.196760177612305, + "eval_rewards/margins": 4.546074390411377, + "eval_rewards/rejected": -11.742834091186523, + "eval_runtime": 145.8561, + "eval_samples_per_second": 21.638, + "eval_steps_per_second": 0.343, + "step": 9600 + }, + { + "epoch": 1.87, + "learning_rate": 2.1007406342129862e-07, + "logits/chosen": -2.7143349647521973, + "logits/rejected": -2.6954402923583984, + "logps/chosen": -256.6372985839844, + "logps/rejected": -364.6605529785156, + "loss": 0.1722, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.440862655639648, + "rewards/margins": 10.010429382324219, + "rewards/rejected": -14.45129108428955, + "step": 9610 + }, + { + "epoch": 1.87, + "learning_rate": 2.0971453224994607e-07, + "logits/chosen": -2.80859375, + "logits/rejected": -2.6646108627319336, + "logps/chosen": -251.14077758789062, + "logps/rejected": -280.90655517578125, + "loss": 0.1657, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.744521141052246, + "rewards/margins": 8.630102157592773, + "rewards/rejected": -13.37462329864502, + "step": 9620 + }, + { + "epoch": 1.87, + "learning_rate": 2.093550010785935e-07, + "logits/chosen": -2.5912880897521973, + "logits/rejected": -2.615015983581543, + "logps/chosen": -284.47113037109375, + "logps/rejected": -373.15863037109375, + "loss": 0.1266, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.701432943344116, + "rewards/margins": 9.166840553283691, + "rewards/rejected": -11.86827278137207, + "step": 9630 + }, + { + "epoch": 1.87, + "learning_rate": 2.0899546990724096e-07, + "logits/chosen": -2.627544641494751, + "logits/rejected": -2.6564548015594482, + "logps/chosen": -192.7626495361328, + "logps/rejected": -253.3898162841797, + "loss": 0.1787, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.189593315124512, + "rewards/margins": 5.3219685554504395, + "rewards/rejected": -9.51156234741211, + "step": 9640 + }, + { + "epoch": 1.87, + "learning_rate": 2.0863593873588838e-07, + "logits/chosen": -2.688269853591919, + "logits/rejected": -2.636615037918091, + "logps/chosen": -240.20712280273438, + "logps/rejected": -249.9957733154297, + "loss": 0.111, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.549140453338623, + "rewards/margins": 5.735888481140137, + "rewards/rejected": -11.285029411315918, + "step": 9650 + }, + { + "epoch": 1.88, + "learning_rate": 2.082764075645358e-07, + "logits/chosen": -2.5320065021514893, + "logits/rejected": -2.51509690284729, + "logps/chosen": -236.28836059570312, + "logps/rejected": -365.29620361328125, + "loss": 0.0996, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9913103580474854, + "rewards/margins": 10.07435417175293, + "rewards/rejected": -12.065665245056152, + "step": 9660 + }, + { + "epoch": 1.88, + "learning_rate": 2.079168763931833e-07, + "logits/chosen": -2.7490692138671875, + "logits/rejected": -2.6080615520477295, + "logps/chosen": -337.1051330566406, + "logps/rejected": -296.54193115234375, + "loss": 0.1606, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.419654369354248, + "rewards/margins": 7.501638889312744, + "rewards/rejected": -9.921293258666992, + "step": 9670 + }, + { + "epoch": 1.88, + "learning_rate": 2.0755734522183072e-07, + "logits/chosen": -2.6276113986968994, + "logits/rejected": -2.5117077827453613, + "logps/chosen": -306.5704650878906, + "logps/rejected": -345.6265869140625, + "loss": 0.1183, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0178134441375732, + "rewards/margins": 8.583242416381836, + "rewards/rejected": -11.601057052612305, + "step": 9680 + }, + { + "epoch": 1.88, + "learning_rate": 2.0719781405047817e-07, + "logits/chosen": -2.627833843231201, + "logits/rejected": -2.522871732711792, + "logps/chosen": -219.50454711914062, + "logps/rejected": -250.96414184570312, + "loss": 0.0716, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.722193717956543, + "rewards/margins": 7.623091220855713, + "rewards/rejected": -12.345284461975098, + "step": 9690 + }, + { + "epoch": 1.88, + "learning_rate": 2.068382828791256e-07, + "logits/chosen": -2.5679221153259277, + "logits/rejected": -2.5713205337524414, + "logps/chosen": -263.98077392578125, + "logps/rejected": -374.9137268066406, + "loss": 0.1138, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.59222149848938, + "rewards/margins": 9.256675720214844, + "rewards/rejected": -12.848896980285645, + "step": 9700 + }, + { + "epoch": 1.88, + "eval_logits/chosen": -2.4926064014434814, + "eval_logits/rejected": -2.478003740310669, + "eval_logps/chosen": -269.9750061035156, + "eval_logps/rejected": -294.9535827636719, + "eval_loss": 0.5252137780189514, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -7.573695659637451, + "eval_rewards/margins": 4.730066299438477, + "eval_rewards/rejected": -12.303762435913086, + "eval_runtime": 141.196, + "eval_samples_per_second": 22.352, + "eval_steps_per_second": 0.354, + "step": 9700 + }, + { + "epoch": 1.89, + "learning_rate": 2.0647875170777303e-07, + "logits/chosen": -2.561051368713379, + "logits/rejected": -2.642465829849243, + "logps/chosen": -235.7049560546875, + "logps/rejected": -321.3508605957031, + "loss": 0.1579, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.9679055213928223, + "rewards/margins": 7.88568639755249, + "rewards/rejected": -11.853592872619629, + "step": 9710 + }, + { + "epoch": 1.89, + "learning_rate": 2.061192205364205e-07, + "logits/chosen": -2.6106936931610107, + "logits/rejected": -2.5502424240112305, + "logps/chosen": -272.18902587890625, + "logps/rejected": -273.1693115234375, + "loss": 0.0999, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.936622142791748, + "rewards/margins": 5.830315589904785, + "rewards/rejected": -9.766938209533691, + "step": 9720 + }, + { + "epoch": 1.89, + "learning_rate": 2.0575968936506794e-07, + "logits/chosen": -2.5287885665893555, + "logits/rejected": -2.488464832305908, + "logps/chosen": -226.1278839111328, + "logps/rejected": -201.50521850585938, + "loss": 0.1004, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.362948417663574, + "rewards/margins": 3.775364637374878, + "rewards/rejected": -10.138312339782715, + "step": 9730 + }, + { + "epoch": 1.89, + "learning_rate": 2.054001581937154e-07, + "logits/chosen": -2.667818546295166, + "logits/rejected": -2.535402536392212, + "logps/chosen": -252.14712524414062, + "logps/rejected": -361.03997802734375, + "loss": 0.1493, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.449309825897217, + "rewards/margins": 11.48906135559082, + "rewards/rejected": -13.938371658325195, + "step": 9740 + }, + { + "epoch": 1.89, + "learning_rate": 2.0504062702236282e-07, + "logits/chosen": -2.575338363647461, + "logits/rejected": -2.5556836128234863, + "logps/chosen": -261.7845458984375, + "logps/rejected": -363.62738037109375, + "loss": 0.1067, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.278941631317139, + "rewards/margins": 9.476760864257812, + "rewards/rejected": -13.755702018737793, + "step": 9750 + }, + { + "epoch": 1.89, + "learning_rate": 2.0468109585101025e-07, + "logits/chosen": -2.6099021434783936, + "logits/rejected": -2.6040093898773193, + "logps/chosen": -181.74221801757812, + "logps/rejected": -355.05535888671875, + "loss": 0.1183, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5663819313049316, + "rewards/margins": 13.304658889770508, + "rewards/rejected": -16.87103843688965, + "step": 9760 + }, + { + "epoch": 1.9, + "learning_rate": 2.0432156467965773e-07, + "logits/chosen": -2.4464845657348633, + "logits/rejected": -2.457374095916748, + "logps/chosen": -210.5225830078125, + "logps/rejected": -306.7838439941406, + "loss": 0.1358, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.1162004470825195, + "rewards/margins": 10.571629524230957, + "rewards/rejected": -15.687829971313477, + "step": 9770 + }, + { + "epoch": 1.9, + "learning_rate": 2.0396203350830516e-07, + "logits/chosen": -2.5081634521484375, + "logits/rejected": -2.4628183841705322, + "logps/chosen": -326.0071105957031, + "logps/rejected": -349.7266845703125, + "loss": 0.131, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.235438346862793, + "rewards/margins": 7.810906887054443, + "rewards/rejected": -12.046346664428711, + "step": 9780 + }, + { + "epoch": 1.9, + "learning_rate": 2.036025023369526e-07, + "logits/chosen": -2.8011951446533203, + "logits/rejected": -2.6630489826202393, + "logps/chosen": -264.27154541015625, + "logps/rejected": -272.40679931640625, + "loss": 0.1293, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.549420118331909, + "rewards/margins": 6.654759407043457, + "rewards/rejected": -10.204178810119629, + "step": 9790 + }, + { + "epoch": 1.9, + "learning_rate": 2.0324297116560004e-07, + "logits/chosen": -2.625323534011841, + "logits/rejected": -2.506072998046875, + "logps/chosen": -310.63018798828125, + "logps/rejected": -398.94744873046875, + "loss": 0.2393, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8594512939453125, + "rewards/margins": 13.28419017791748, + "rewards/rejected": -15.143640518188477, + "step": 9800 + }, + { + "epoch": 1.9, + "eval_logits/chosen": -2.5731213092803955, + "eval_logits/rejected": -2.5587193965911865, + "eval_logps/chosen": -269.15802001953125, + "eval_logps/rejected": -292.7436218261719, + "eval_loss": 0.5220938324928284, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -7.491992950439453, + "eval_rewards/margins": 4.590774059295654, + "eval_rewards/rejected": -12.08276653289795, + "eval_runtime": 140.9621, + "eval_samples_per_second": 22.389, + "eval_steps_per_second": 0.355, + "step": 9800 + }, + { + "epoch": 1.9, + "learning_rate": 2.0288343999424747e-07, + "logits/chosen": -2.7835259437561035, + "logits/rejected": -2.7077555656433105, + "logps/chosen": -338.28070068359375, + "logps/rejected": -358.1758117675781, + "loss": 0.1416, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.9490952491760254, + "rewards/margins": 6.300492763519287, + "rewards/rejected": -9.249588012695312, + "step": 9810 + }, + { + "epoch": 1.91, + "learning_rate": 2.0252390882289495e-07, + "logits/chosen": -2.6914010047912598, + "logits/rejected": -2.6057381629943848, + "logps/chosen": -298.5309143066406, + "logps/rejected": -325.1488037109375, + "loss": 0.1239, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.406166076660156, + "rewards/margins": 9.358588218688965, + "rewards/rejected": -13.764753341674805, + "step": 9820 + }, + { + "epoch": 1.91, + "learning_rate": 2.0216437765154238e-07, + "logits/chosen": -2.612276792526245, + "logits/rejected": -2.43575119972229, + "logps/chosen": -290.4168701171875, + "logps/rejected": -339.95703125, + "loss": 0.1555, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.402797698974609, + "rewards/margins": 6.995247840881348, + "rewards/rejected": -11.398045539855957, + "step": 9830 + }, + { + "epoch": 1.91, + "learning_rate": 2.0180484648018983e-07, + "logits/chosen": -2.614217758178711, + "logits/rejected": -2.6707980632781982, + "logps/chosen": -286.5018005371094, + "logps/rejected": -294.0039978027344, + "loss": 0.0688, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.310466766357422, + "rewards/margins": 5.712352275848389, + "rewards/rejected": -13.022821426391602, + "step": 9840 + }, + { + "epoch": 1.91, + "learning_rate": 2.0144531530883726e-07, + "logits/chosen": -2.549112319946289, + "logits/rejected": -2.592756986618042, + "logps/chosen": -260.30780029296875, + "logps/rejected": -391.18450927734375, + "loss": 0.1242, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.136299133300781, + "rewards/margins": 14.734067916870117, + "rewards/rejected": -18.8703670501709, + "step": 9850 + }, + { + "epoch": 1.91, + "learning_rate": 2.0108578413748469e-07, + "logits/chosen": -2.6412360668182373, + "logits/rejected": -2.532777786254883, + "logps/chosen": -223.5194091796875, + "logps/rejected": -286.7159118652344, + "loss": 0.1378, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.2553603649139404, + "rewards/margins": 7.387794494628906, + "rewards/rejected": -9.643155097961426, + "step": 9860 + }, + { + "epoch": 1.92, + "learning_rate": 2.0072625296613217e-07, + "logits/chosen": -2.7499048709869385, + "logits/rejected": -2.619619607925415, + "logps/chosen": -291.58477783203125, + "logps/rejected": -303.89544677734375, + "loss": 0.138, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7958405017852783, + "rewards/margins": 8.317672729492188, + "rewards/rejected": -10.113512992858887, + "step": 9870 + }, + { + "epoch": 1.92, + "learning_rate": 2.003667217947796e-07, + "logits/chosen": -2.6006357669830322, + "logits/rejected": -2.5927653312683105, + "logps/chosen": -297.0106506347656, + "logps/rejected": -304.1988525390625, + "loss": 0.1104, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.417545318603516, + "rewards/margins": 6.6790571212768555, + "rewards/rejected": -12.096602439880371, + "step": 9880 + }, + { + "epoch": 1.92, + "learning_rate": 2.0000719062342705e-07, + "logits/chosen": -2.7935850620269775, + "logits/rejected": -2.7453925609588623, + "logps/chosen": -265.3235168457031, + "logps/rejected": -388.5957946777344, + "loss": 0.1092, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.464507579803467, + "rewards/margins": 11.08370304107666, + "rewards/rejected": -13.548210144042969, + "step": 9890 + }, + { + "epoch": 1.92, + "learning_rate": 1.9964765945207448e-07, + "logits/chosen": -2.7507729530334473, + "logits/rejected": -2.661170482635498, + "logps/chosen": -284.39337158203125, + "logps/rejected": -402.4375915527344, + "loss": 0.1172, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.298614025115967, + "rewards/margins": 11.880010604858398, + "rewards/rejected": -18.178625106811523, + "step": 9900 + }, + { + "epoch": 1.92, + "eval_logits/chosen": -2.6177239418029785, + "eval_logits/rejected": -2.6024889945983887, + "eval_logps/chosen": -271.6432800292969, + "eval_logps/rejected": -297.585205078125, + "eval_loss": 0.530979335308075, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -7.740522384643555, + "eval_rewards/margins": 4.82640266418457, + "eval_rewards/rejected": -12.566925048828125, + "eval_runtime": 140.1556, + "eval_samples_per_second": 22.518, + "eval_steps_per_second": 0.357, + "step": 9900 + }, + { + "epoch": 1.92, + "learning_rate": 1.9928812828072193e-07, + "logits/chosen": -2.881540060043335, + "logits/rejected": -2.794807195663452, + "logps/chosen": -409.90179443359375, + "logps/rejected": -364.76580810546875, + "loss": 0.1559, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1918411254882812, + "rewards/margins": 8.817240715026855, + "rewards/rejected": -12.009081840515137, + "step": 9910 + }, + { + "epoch": 1.93, + "learning_rate": 1.9892859710936939e-07, + "logits/chosen": -2.580712080001831, + "logits/rejected": -2.5410375595092773, + "logps/chosen": -272.1631774902344, + "logps/rejected": -401.469482421875, + "loss": 0.1505, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.087647914886475, + "rewards/margins": 12.025777816772461, + "rewards/rejected": -16.11342430114746, + "step": 9920 + }, + { + "epoch": 1.93, + "learning_rate": 1.985690659380168e-07, + "logits/chosen": -2.774130344390869, + "logits/rejected": -2.8216567039489746, + "logps/chosen": -284.11737060546875, + "logps/rejected": -380.0205993652344, + "loss": 0.1207, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.726789951324463, + "rewards/margins": 10.794561386108398, + "rewards/rejected": -17.521350860595703, + "step": 9930 + }, + { + "epoch": 1.93, + "learning_rate": 1.9820953476666427e-07, + "logits/chosen": -2.786043405532837, + "logits/rejected": -2.8093819618225098, + "logps/chosen": -257.4915466308594, + "logps/rejected": -313.9781188964844, + "loss": 0.1823, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.9452264308929443, + "rewards/margins": 9.208467483520508, + "rewards/rejected": -12.153692245483398, + "step": 9940 + }, + { + "epoch": 1.93, + "learning_rate": 1.978500035953117e-07, + "logits/chosen": -2.6762661933898926, + "logits/rejected": -2.7098498344421387, + "logps/chosen": -208.21359252929688, + "logps/rejected": -345.08111572265625, + "loss": 0.1405, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.444887638092041, + "rewards/margins": 9.521523475646973, + "rewards/rejected": -16.966411590576172, + "step": 9950 + }, + { + "epoch": 1.93, + "learning_rate": 1.9749047242395915e-07, + "logits/chosen": -2.692342758178711, + "logits/rejected": -2.689415693283081, + "logps/chosen": -208.0183563232422, + "logps/rejected": -357.1488037109375, + "loss": 0.1157, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.413248538970947, + "rewards/margins": 11.793384552001953, + "rewards/rejected": -17.206634521484375, + "step": 9960 + }, + { + "epoch": 1.94, + "learning_rate": 1.971309412526066e-07, + "logits/chosen": -2.810288906097412, + "logits/rejected": -2.7897398471832275, + "logps/chosen": -277.0812072753906, + "logps/rejected": -281.8963623046875, + "loss": 0.1659, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.449187278747559, + "rewards/margins": 7.528085231781006, + "rewards/rejected": -11.97727108001709, + "step": 9970 + }, + { + "epoch": 1.94, + "learning_rate": 1.9677141008125403e-07, + "logits/chosen": -2.614064931869507, + "logits/rejected": -2.6760191917419434, + "logps/chosen": -278.1968688964844, + "logps/rejected": -380.0922546386719, + "loss": 0.1253, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.580084800720215, + "rewards/margins": 9.989953994750977, + "rewards/rejected": -14.570037841796875, + "step": 9980 + }, + { + "epoch": 1.94, + "learning_rate": 1.9641187890990149e-07, + "logits/chosen": -2.818615436553955, + "logits/rejected": -2.7288951873779297, + "logps/chosen": -203.84352111816406, + "logps/rejected": -282.2690124511719, + "loss": 0.145, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.7939720153808594, + "rewards/margins": 8.794477462768555, + "rewards/rejected": -12.588449478149414, + "step": 9990 + }, + { + "epoch": 1.94, + "learning_rate": 1.9605234773854891e-07, + "logits/chosen": -2.753340482711792, + "logits/rejected": -2.7678110599517822, + "logps/chosen": -188.6497802734375, + "logps/rejected": -354.73345947265625, + "loss": 0.0687, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.572816371917725, + "rewards/margins": 10.632683753967285, + "rewards/rejected": -17.20549774169922, + "step": 10000 + }, + { + "epoch": 1.94, + "eval_logits/chosen": -2.624098539352417, + "eval_logits/rejected": -2.611208438873291, + "eval_logps/chosen": -268.8094482421875, + "eval_logps/rejected": -292.87554931640625, + "eval_loss": 0.5245481133460999, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -7.457136154174805, + "eval_rewards/margins": 4.638821601867676, + "eval_rewards/rejected": -12.095958709716797, + "eval_runtime": 155.7237, + "eval_samples_per_second": 20.267, + "eval_steps_per_second": 0.321, + "step": 10000 + }, + { + "epoch": 1.94, + "learning_rate": 1.9569281656719637e-07, + "logits/chosen": -2.6966569423675537, + "logits/rejected": -2.6875321865081787, + "logps/chosen": -222.75955200195312, + "logps/rejected": -289.21026611328125, + "loss": 0.151, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.303269386291504, + "rewards/margins": 8.195225715637207, + "rewards/rejected": -12.498494148254395, + "step": 10010 + }, + { + "epoch": 1.95, + "learning_rate": 1.9533328539584382e-07, + "logits/chosen": -2.7097420692443848, + "logits/rejected": -2.784785032272339, + "logps/chosen": -181.155517578125, + "logps/rejected": -239.69155883789062, + "loss": 0.1093, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.9080886840820312, + "rewards/margins": 7.344522953033447, + "rewards/rejected": -11.25261116027832, + "step": 10020 + }, + { + "epoch": 1.95, + "learning_rate": 1.9497375422449125e-07, + "logits/chosen": -2.7030367851257324, + "logits/rejected": -2.5604372024536133, + "logps/chosen": -224.58895874023438, + "logps/rejected": -244.3118133544922, + "loss": 0.1166, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -5.914610862731934, + "rewards/margins": 4.591408729553223, + "rewards/rejected": -10.50601863861084, + "step": 10030 + }, + { + "epoch": 1.95, + "learning_rate": 1.946142230531387e-07, + "logits/chosen": -2.695406436920166, + "logits/rejected": -2.7164549827575684, + "logps/chosen": -256.2817077636719, + "logps/rejected": -243.5300750732422, + "loss": 0.1062, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.227022647857666, + "rewards/margins": 5.592148780822754, + "rewards/rejected": -9.819170951843262, + "step": 10040 + }, + { + "epoch": 1.95, + "learning_rate": 1.9425469188178613e-07, + "logits/chosen": -2.517131805419922, + "logits/rejected": -2.587303400039673, + "logps/chosen": -218.4390869140625, + "logps/rejected": -313.0886535644531, + "loss": 0.1024, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9561208486557007, + "rewards/margins": 11.146007537841797, + "rewards/rejected": -13.102127075195312, + "step": 10050 + }, + { + "epoch": 1.95, + "learning_rate": 1.9389516071043359e-07, + "logits/chosen": -2.6962180137634277, + "logits/rejected": -2.784198522567749, + "logps/chosen": -253.6819305419922, + "logps/rejected": -318.7190246582031, + "loss": 0.1264, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.7917160987854004, + "rewards/margins": 6.036627769470215, + "rewards/rejected": -9.828343391418457, + "step": 10060 + }, + { + "epoch": 1.95, + "learning_rate": 1.9353562953908104e-07, + "logits/chosen": -2.6028361320495605, + "logits/rejected": -2.7085378170013428, + "logps/chosen": -231.95217895507812, + "logps/rejected": -287.3670349121094, + "loss": 0.12, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.8892903327941895, + "rewards/margins": 6.986242771148682, + "rewards/rejected": -10.875532150268555, + "step": 10070 + }, + { + "epoch": 1.96, + "learning_rate": 1.9317609836772847e-07, + "logits/chosen": -2.868354082107544, + "logits/rejected": -2.792537212371826, + "logps/chosen": -304.06536865234375, + "logps/rejected": -323.65972900390625, + "loss": 0.113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8082060813903809, + "rewards/margins": 9.34862995147705, + "rewards/rejected": -10.156837463378906, + "step": 10080 + }, + { + "epoch": 1.96, + "learning_rate": 1.9281656719637592e-07, + "logits/chosen": -2.729243516921997, + "logits/rejected": -2.7317256927490234, + "logps/chosen": -255.93667602539062, + "logps/rejected": -410.2506408691406, + "loss": 0.1016, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3462402820587158, + "rewards/margins": 9.17154598236084, + "rewards/rejected": -10.517786026000977, + "step": 10090 + }, + { + "epoch": 1.96, + "learning_rate": 1.9245703602502335e-07, + "logits/chosen": -2.758331537246704, + "logits/rejected": -2.755667209625244, + "logps/chosen": -265.77618408203125, + "logps/rejected": -362.4887390136719, + "loss": 0.1132, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3049278259277344, + "rewards/margins": 9.093594551086426, + "rewards/rejected": -12.39852237701416, + "step": 10100 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -2.6079976558685303, + "eval_logits/rejected": -2.595292568206787, + "eval_logps/chosen": -261.605712890625, + "eval_logps/rejected": -288.4120788574219, + "eval_loss": 0.5272236466407776, + "eval_rewards/accuracies": 0.7124999761581421, + "eval_rewards/chosen": -6.736766338348389, + "eval_rewards/margins": 4.912847995758057, + "eval_rewards/rejected": -11.649614334106445, + "eval_runtime": 155.9617, + "eval_samples_per_second": 20.236, + "eval_steps_per_second": 0.321, + "step": 10100 + }, + { + "epoch": 1.96, + "learning_rate": 1.920975048536708e-07, + "logits/chosen": -2.5196948051452637, + "logits/rejected": -2.5852513313293457, + "logps/chosen": -229.99343872070312, + "logps/rejected": -265.3790283203125, + "loss": 0.1412, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.587240695953369, + "rewards/margins": 7.187707424163818, + "rewards/rejected": -13.774948120117188, + "step": 10110 + }, + { + "epoch": 1.96, + "learning_rate": 1.9173797368231826e-07, + "logits/chosen": -2.6614365577697754, + "logits/rejected": -2.7115848064422607, + "logps/chosen": -197.1228790283203, + "logps/rejected": -306.74212646484375, + "loss": 0.1299, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.5633678436279297, + "rewards/margins": 8.410367965698242, + "rewards/rejected": -11.973734855651855, + "step": 10120 + }, + { + "epoch": 1.97, + "learning_rate": 1.913784425109657e-07, + "logits/chosen": -2.8587021827697754, + "logits/rejected": -2.8292853832244873, + "logps/chosen": -286.17694091796875, + "logps/rejected": -324.4984436035156, + "loss": 0.1221, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.473060131072998, + "rewards/margins": 10.909645080566406, + "rewards/rejected": -14.382707595825195, + "step": 10130 + }, + { + "epoch": 1.97, + "learning_rate": 1.9101891133961314e-07, + "logits/chosen": -2.716796636581421, + "logits/rejected": -2.649395704269409, + "logps/chosen": -178.61146545410156, + "logps/rejected": -303.76446533203125, + "loss": 0.1284, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.529031276702881, + "rewards/margins": 7.286018371582031, + "rewards/rejected": -13.81505012512207, + "step": 10140 + }, + { + "epoch": 1.97, + "learning_rate": 1.9065938016826057e-07, + "logits/chosen": -2.6412036418914795, + "logits/rejected": -2.6497039794921875, + "logps/chosen": -185.38690185546875, + "logps/rejected": -320.8858642578125, + "loss": 0.1548, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.786194801330566, + "rewards/margins": 8.789294242858887, + "rewards/rejected": -14.57548713684082, + "step": 10150 + }, + { + "epoch": 1.97, + "learning_rate": 1.9029984899690802e-07, + "logits/chosen": -2.769472360610962, + "logits/rejected": -2.743699789047241, + "logps/chosen": -273.3786315917969, + "logps/rejected": -366.3103332519531, + "loss": 0.1805, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7011394500732422, + "rewards/margins": 9.303980827331543, + "rewards/rejected": -11.005121231079102, + "step": 10160 + }, + { + "epoch": 1.97, + "learning_rate": 1.8994031782555548e-07, + "logits/chosen": -2.7782976627349854, + "logits/rejected": -2.6834959983825684, + "logps/chosen": -300.3525390625, + "logps/rejected": -316.2628173828125, + "loss": 0.1285, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.587084770202637, + "rewards/margins": 8.565958976745605, + "rewards/rejected": -13.153043746948242, + "step": 10170 + }, + { + "epoch": 1.98, + "learning_rate": 1.895807866542029e-07, + "logits/chosen": -2.8544564247131348, + "logits/rejected": -2.818455219268799, + "logps/chosen": -287.4271545410156, + "logps/rejected": -367.23004150390625, + "loss": 0.0872, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.183004856109619, + "rewards/margins": 7.94314432144165, + "rewards/rejected": -11.12614917755127, + "step": 10180 + }, + { + "epoch": 1.98, + "learning_rate": 1.8922125548285036e-07, + "logits/chosen": -2.8023242950439453, + "logits/rejected": -2.7717652320861816, + "logps/chosen": -236.1044158935547, + "logps/rejected": -303.81414794921875, + "loss": 0.0971, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0204492807388306, + "rewards/margins": 10.077367782592773, + "rewards/rejected": -11.097817420959473, + "step": 10190 + }, + { + "epoch": 1.98, + "learning_rate": 1.888617243114978e-07, + "logits/chosen": -2.7105748653411865, + "logits/rejected": -2.742722749710083, + "logps/chosen": -236.2729034423828, + "logps/rejected": -309.09527587890625, + "loss": 0.1348, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.21351432800293, + "rewards/margins": 7.772862434387207, + "rewards/rejected": -11.98637580871582, + "step": 10200 + }, + { + "epoch": 1.98, + "eval_logits/chosen": -2.6400763988494873, + "eval_logits/rejected": -2.6271843910217285, + "eval_logps/chosen": -271.88494873046875, + "eval_logps/rejected": -299.5146179199219, + "eval_loss": 0.521010160446167, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -7.764688014984131, + "eval_rewards/margins": 4.995177745819092, + "eval_rewards/rejected": -12.759865760803223, + "eval_runtime": 151.7927, + "eval_samples_per_second": 20.792, + "eval_steps_per_second": 0.329, + "step": 10200 + }, + { + "epoch": 1.98, + "learning_rate": 1.8850219314014524e-07, + "logits/chosen": -2.7185475826263428, + "logits/rejected": -2.6648640632629395, + "logps/chosen": -320.8039245605469, + "logps/rejected": -432.5547790527344, + "loss": 0.1744, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.132582664489746, + "rewards/margins": 10.938291549682617, + "rewards/rejected": -15.070874214172363, + "step": 10210 + }, + { + "epoch": 1.98, + "learning_rate": 1.881426619687927e-07, + "logits/chosen": -2.6703097820281982, + "logits/rejected": -2.595444917678833, + "logps/chosen": -218.0316162109375, + "logps/rejected": -336.1557312011719, + "loss": 0.1209, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.0630340576171875, + "rewards/margins": 7.4914870262146, + "rewards/rejected": -10.554521560668945, + "step": 10220 + }, + { + "epoch": 1.99, + "learning_rate": 1.8778313079744012e-07, + "logits/chosen": -2.706373691558838, + "logits/rejected": -2.6765666007995605, + "logps/chosen": -244.51779174804688, + "logps/rejected": -333.5496826171875, + "loss": 0.1057, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.8427653312683105, + "rewards/margins": 12.442415237426758, + "rewards/rejected": -16.285181045532227, + "step": 10230 + }, + { + "epoch": 1.99, + "learning_rate": 1.8742359962608758e-07, + "logits/chosen": -2.571765422821045, + "logits/rejected": -2.728299617767334, + "logps/chosen": -318.3007507324219, + "logps/rejected": -394.8019104003906, + "loss": 0.1453, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.5207507610321045, + "rewards/margins": 7.949289798736572, + "rewards/rejected": -10.470041275024414, + "step": 10240 + }, + { + "epoch": 1.99, + "learning_rate": 1.87064068454735e-07, + "logits/chosen": -2.7412946224212646, + "logits/rejected": -2.670288562774658, + "logps/chosen": -317.1426696777344, + "logps/rejected": -364.4071350097656, + "loss": 0.1457, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.740424394607544, + "rewards/margins": 10.290372848510742, + "rewards/rejected": -14.030797004699707, + "step": 10250 + }, + { + "epoch": 1.99, + "learning_rate": 1.8670453728338246e-07, + "logits/chosen": -2.6800644397735596, + "logits/rejected": -2.735670804977417, + "logps/chosen": -246.10568237304688, + "logps/rejected": -356.8966369628906, + "loss": 0.0944, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.824759006500244, + "rewards/margins": 11.115058898925781, + "rewards/rejected": -14.939817428588867, + "step": 10260 + }, + { + "epoch": 1.99, + "learning_rate": 1.8634500611202991e-07, + "logits/chosen": -2.7291271686553955, + "logits/rejected": -2.670860767364502, + "logps/chosen": -263.6656188964844, + "logps/rejected": -385.16375732421875, + "loss": 0.1313, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.1319451332092285, + "rewards/margins": 8.994240760803223, + "rewards/rejected": -15.126185417175293, + "step": 10270 + }, + { + "epoch": 2.0, + "learning_rate": 1.8598547494067734e-07, + "logits/chosen": -2.9449660778045654, + "logits/rejected": -2.628852605819702, + "logps/chosen": -303.976806640625, + "logps/rejected": -327.79473876953125, + "loss": 0.1037, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.624514102935791, + "rewards/margins": 8.703771591186523, + "rewards/rejected": -12.328285217285156, + "step": 10280 + }, + { + "epoch": 2.0, + "learning_rate": 1.856259437693248e-07, + "logits/chosen": -2.7790870666503906, + "logits/rejected": -2.678490161895752, + "logps/chosen": -214.7358856201172, + "logps/rejected": -333.129150390625, + "loss": 0.1185, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.161875009536743, + "rewards/margins": 11.135395050048828, + "rewards/rejected": -13.297269821166992, + "step": 10290 + }, + { + "epoch": 2.0, + "learning_rate": 1.8526641259797222e-07, + "logits/chosen": -2.696047067642212, + "logits/rejected": -2.70910382270813, + "logps/chosen": -295.90472412109375, + "logps/rejected": -389.22943115234375, + "loss": 0.1342, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.419425010681152, + "rewards/margins": 11.907146453857422, + "rewards/rejected": -18.32657241821289, + "step": 10300 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -2.6298301219940186, + "eval_logits/rejected": -2.617715835571289, + "eval_logps/chosen": -268.9454650878906, + "eval_logps/rejected": -296.80413818359375, + "eval_loss": 0.5257573127746582, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -7.470740795135498, + "eval_rewards/margins": 5.01807975769043, + "eval_rewards/rejected": -12.488821029663086, + "eval_runtime": 140.8767, + "eval_samples_per_second": 22.403, + "eval_steps_per_second": 0.355, + "step": 10300 + }, + { + "epoch": 2.0, + "learning_rate": 1.8490688142661968e-07, + "logits/chosen": -2.7037014961242676, + "logits/rejected": -2.7269883155822754, + "logps/chosen": -182.2980194091797, + "logps/rejected": -340.1834716796875, + "loss": 0.0958, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.161952018737793, + "rewards/margins": 12.578010559082031, + "rewards/rejected": -16.73996353149414, + "step": 10310 + }, + { + "epoch": 2.0, + "learning_rate": 1.8454735025526713e-07, + "logits/chosen": -2.660433292388916, + "logits/rejected": -2.654448986053467, + "logps/chosen": -244.2759552001953, + "logps/rejected": -284.2115478515625, + "loss": 0.1099, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.508305549621582, + "rewards/margins": 8.608351707458496, + "rewards/rejected": -15.116656303405762, + "step": 10320 + }, + { + "epoch": 2.01, + "learning_rate": 1.8418781908391456e-07, + "logits/chosen": -2.7890124320983887, + "logits/rejected": -2.805551052093506, + "logps/chosen": -232.19677734375, + "logps/rejected": -354.23663330078125, + "loss": 0.0728, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.5977463722229, + "rewards/margins": 11.178841590881348, + "rewards/rejected": -17.776588439941406, + "step": 10330 + }, + { + "epoch": 2.01, + "learning_rate": 1.8382828791256202e-07, + "logits/chosen": -2.815319776535034, + "logits/rejected": -2.7543816566467285, + "logps/chosen": -298.68267822265625, + "logps/rejected": -327.13983154296875, + "loss": 0.0633, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8585197925567627, + "rewards/margins": 10.603599548339844, + "rewards/rejected": -12.46212100982666, + "step": 10340 + }, + { + "epoch": 2.01, + "learning_rate": 1.8346875674120947e-07, + "logits/chosen": -2.7649192810058594, + "logits/rejected": -2.7012877464294434, + "logps/chosen": -275.35687255859375, + "logps/rejected": -317.98406982421875, + "loss": 0.0976, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.20779275894165, + "rewards/margins": 8.998144149780273, + "rewards/rejected": -13.205937385559082, + "step": 10350 + }, + { + "epoch": 2.01, + "learning_rate": 1.831092255698569e-07, + "logits/chosen": -2.7852184772491455, + "logits/rejected": -2.641512870788574, + "logps/chosen": -283.23480224609375, + "logps/rejected": -338.6064453125, + "loss": 0.0527, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.593654215335846, + "rewards/margins": 13.130376815795898, + "rewards/rejected": -13.724031448364258, + "step": 10360 + }, + { + "epoch": 2.01, + "learning_rate": 1.8274969439850435e-07, + "logits/chosen": -2.596500873565674, + "logits/rejected": -2.595038652420044, + "logps/chosen": -166.22879028320312, + "logps/rejected": -293.2983703613281, + "loss": 0.0621, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.355771064758301, + "rewards/margins": 10.234164237976074, + "rewards/rejected": -14.589935302734375, + "step": 10370 + }, + { + "epoch": 2.02, + "learning_rate": 1.8239016322715178e-07, + "logits/chosen": -2.841305732727051, + "logits/rejected": -2.784738063812256, + "logps/chosen": -266.9625244140625, + "logps/rejected": -399.4165954589844, + "loss": 0.0684, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.224963426589966, + "rewards/margins": 13.512248039245605, + "rewards/rejected": -15.737211227416992, + "step": 10380 + }, + { + "epoch": 2.02, + "learning_rate": 1.820306320557992e-07, + "logits/chosen": -2.6798746585845947, + "logits/rejected": -2.6772398948669434, + "logps/chosen": -328.0395812988281, + "logps/rejected": -380.20672607421875, + "loss": 0.0887, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.209399700164795, + "rewards/margins": 9.099278450012207, + "rewards/rejected": -13.308677673339844, + "step": 10390 + }, + { + "epoch": 2.02, + "learning_rate": 1.816711008844467e-07, + "logits/chosen": -2.7173755168914795, + "logits/rejected": -2.7604899406433105, + "logps/chosen": -224.87344360351562, + "logps/rejected": -269.5205993652344, + "loss": 0.0845, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.270158529281616, + "rewards/margins": 8.58400821685791, + "rewards/rejected": -10.854167938232422, + "step": 10400 + }, + { + "epoch": 2.02, + "eval_logits/chosen": -2.610612392425537, + "eval_logits/rejected": -2.5950675010681152, + "eval_logps/chosen": -276.90740966796875, + "eval_logps/rejected": -309.80352783203125, + "eval_loss": 0.5396497845649719, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -8.26693344116211, + "eval_rewards/margins": 5.521821975708008, + "eval_rewards/rejected": -13.7887544631958, + "eval_runtime": 140.6042, + "eval_samples_per_second": 22.446, + "eval_steps_per_second": 0.356, + "step": 10400 + }, + { + "epoch": 2.02, + "learning_rate": 1.8131156971309412e-07, + "logits/chosen": -2.760927200317383, + "logits/rejected": -2.7007737159729004, + "logps/chosen": -231.59725952148438, + "logps/rejected": -325.73968505859375, + "loss": 0.0697, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4801700115203857, + "rewards/margins": 15.018013000488281, + "rewards/rejected": -17.49818229675293, + "step": 10410 + }, + { + "epoch": 2.02, + "learning_rate": 1.8095203854174157e-07, + "logits/chosen": -2.6685779094696045, + "logits/rejected": -2.6751503944396973, + "logps/chosen": -152.3220672607422, + "logps/rejected": -256.0708312988281, + "loss": 0.0759, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9181880950927734, + "rewards/margins": 9.342168807983398, + "rewards/rejected": -12.260355949401855, + "step": 10420 + }, + { + "epoch": 2.02, + "learning_rate": 1.80592507370389e-07, + "logits/chosen": -2.7395193576812744, + "logits/rejected": -2.6276068687438965, + "logps/chosen": -304.21624755859375, + "logps/rejected": -361.04632568359375, + "loss": 0.0897, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.1980133056640625, + "rewards/margins": 9.919008255004883, + "rewards/rejected": -17.117023468017578, + "step": 10430 + }, + { + "epoch": 2.03, + "learning_rate": 1.8023297619903643e-07, + "logits/chosen": -2.6000497341156006, + "logits/rejected": -2.5789811611175537, + "logps/chosen": -256.3652648925781, + "logps/rejected": -319.67852783203125, + "loss": 0.0431, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.087789058685303, + "rewards/margins": 9.831320762634277, + "rewards/rejected": -13.919108390808105, + "step": 10440 + }, + { + "epoch": 2.03, + "learning_rate": 1.798734450276839e-07, + "logits/chosen": -2.401099443435669, + "logits/rejected": -2.354226589202881, + "logps/chosen": -300.84490966796875, + "logps/rejected": -397.1323547363281, + "loss": 0.0753, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.470708847045898, + "rewards/margins": 11.59446907043457, + "rewards/rejected": -17.065176010131836, + "step": 10450 + }, + { + "epoch": 2.03, + "learning_rate": 1.7951391385633133e-07, + "logits/chosen": -2.6439056396484375, + "logits/rejected": -2.7550628185272217, + "logps/chosen": -313.40277099609375, + "logps/rejected": -378.4320373535156, + "loss": 0.0771, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.915426731109619, + "rewards/margins": 7.226849555969238, + "rewards/rejected": -15.142277717590332, + "step": 10460 + }, + { + "epoch": 2.03, + "learning_rate": 1.791543826849788e-07, + "logits/chosen": -2.7960293292999268, + "logits/rejected": -2.7683236598968506, + "logps/chosen": -244.3646697998047, + "logps/rejected": -401.85089111328125, + "loss": 0.079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9456754922866821, + "rewards/margins": 12.354697227478027, + "rewards/rejected": -14.300374031066895, + "step": 10470 + }, + { + "epoch": 2.03, + "learning_rate": 1.7879485151362622e-07, + "logits/chosen": -2.6908793449401855, + "logits/rejected": -2.7233099937438965, + "logps/chosen": -271.5380554199219, + "logps/rejected": -321.399658203125, + "loss": 0.0692, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.02646541595459, + "rewards/margins": 10.237837791442871, + "rewards/rejected": -15.264302253723145, + "step": 10480 + }, + { + "epoch": 2.04, + "learning_rate": 1.7843532034227364e-07, + "logits/chosen": -2.6730997562408447, + "logits/rejected": -2.7802577018737793, + "logps/chosen": -202.44168090820312, + "logps/rejected": -337.1480407714844, + "loss": 0.0712, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.281571388244629, + "rewards/margins": 11.65519905090332, + "rewards/rejected": -15.93677043914795, + "step": 10490 + }, + { + "epoch": 2.04, + "learning_rate": 1.7807578917092113e-07, + "logits/chosen": -2.6552672386169434, + "logits/rejected": -2.575317859649658, + "logps/chosen": -291.5713195800781, + "logps/rejected": -450.67803955078125, + "loss": 0.0723, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9118088483810425, + "rewards/margins": 12.37339973449707, + "rewards/rejected": -13.285209655761719, + "step": 10500 + }, + { + "epoch": 2.04, + "eval_logits/chosen": -2.5996899604797363, + "eval_logits/rejected": -2.582946538925171, + "eval_logps/chosen": -279.7846374511719, + "eval_logps/rejected": -316.4410400390625, + "eval_loss": 0.5641571283340454, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -8.554656982421875, + "eval_rewards/margins": 5.897851943969727, + "eval_rewards/rejected": -14.452508926391602, + "eval_runtime": 139.233, + "eval_samples_per_second": 22.667, + "eval_steps_per_second": 0.359, + "step": 10500 + }, + { + "epoch": 2.04, + "learning_rate": 1.7771625799956855e-07, + "logits/chosen": -2.730332374572754, + "logits/rejected": -2.6540560722351074, + "logps/chosen": -322.80621337890625, + "logps/rejected": -362.97430419921875, + "loss": 0.0572, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.85236930847168, + "rewards/margins": 10.666932106018066, + "rewards/rejected": -16.51930046081543, + "step": 10510 + }, + { + "epoch": 2.04, + "learning_rate": 1.77356726828216e-07, + "logits/chosen": -2.607438564300537, + "logits/rejected": -2.577183246612549, + "logps/chosen": -232.3873748779297, + "logps/rejected": -308.77490234375, + "loss": 0.0899, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.431411743164062, + "rewards/margins": 7.742515563964844, + "rewards/rejected": -17.173925399780273, + "step": 10520 + }, + { + "epoch": 2.04, + "learning_rate": 1.7699719565686344e-07, + "logits/chosen": -2.565056562423706, + "logits/rejected": -2.5193850994110107, + "logps/chosen": -219.9849090576172, + "logps/rejected": -334.90277099609375, + "loss": 0.0808, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.017915725708008, + "rewards/margins": 8.877421379089355, + "rewards/rejected": -16.89533805847168, + "step": 10530 + }, + { + "epoch": 2.05, + "learning_rate": 1.7663766448551086e-07, + "logits/chosen": -2.7871925830841064, + "logits/rejected": -2.740809202194214, + "logps/chosen": -365.58013916015625, + "logps/rejected": -334.5350036621094, + "loss": 0.0714, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.8105227947235107, + "rewards/margins": 11.399116516113281, + "rewards/rejected": -15.209640502929688, + "step": 10540 + }, + { + "epoch": 2.05, + "learning_rate": 1.7627813331415834e-07, + "logits/chosen": -2.6606345176696777, + "logits/rejected": -2.625703811645508, + "logps/chosen": -215.5009307861328, + "logps/rejected": -243.1717529296875, + "loss": 0.1273, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.260490417480469, + "rewards/margins": 7.008008003234863, + "rewards/rejected": -14.2684965133667, + "step": 10550 + }, + { + "epoch": 2.05, + "learning_rate": 1.7591860214280577e-07, + "logits/chosen": -2.8269846439361572, + "logits/rejected": -2.650883674621582, + "logps/chosen": -243.93905639648438, + "logps/rejected": -306.7244567871094, + "loss": 0.0729, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3997321128845215, + "rewards/margins": 8.941568374633789, + "rewards/rejected": -10.341299057006836, + "step": 10560 + }, + { + "epoch": 2.05, + "learning_rate": 1.7555907097145323e-07, + "logits/chosen": -2.5474016666412354, + "logits/rejected": -2.551736354827881, + "logps/chosen": -319.91668701171875, + "logps/rejected": -381.8224792480469, + "loss": 0.0715, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.944074273109436, + "rewards/margins": 15.506091117858887, + "rewards/rejected": -17.450164794921875, + "step": 10570 + }, + { + "epoch": 2.05, + "learning_rate": 1.7519953980010065e-07, + "logits/chosen": -2.663861036300659, + "logits/rejected": -2.769958972930908, + "logps/chosen": -251.05911254882812, + "logps/rejected": -313.08709716796875, + "loss": 0.0678, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.601755142211914, + "rewards/margins": 9.459638595581055, + "rewards/rejected": -14.061393737792969, + "step": 10580 + }, + { + "epoch": 2.06, + "learning_rate": 1.7484000862874808e-07, + "logits/chosen": -2.6212453842163086, + "logits/rejected": -2.555856227874756, + "logps/chosen": -306.3880310058594, + "logps/rejected": -354.94879150390625, + "loss": 0.0792, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.706746101379395, + "rewards/margins": 11.236501693725586, + "rewards/rejected": -19.943248748779297, + "step": 10590 + }, + { + "epoch": 2.06, + "learning_rate": 1.7448047745739556e-07, + "logits/chosen": -2.7406089305877686, + "logits/rejected": -2.688994884490967, + "logps/chosen": -298.58282470703125, + "logps/rejected": -376.921875, + "loss": 0.0411, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.63175892829895, + "rewards/margins": 11.818182945251465, + "rewards/rejected": -14.449941635131836, + "step": 10600 + }, + { + "epoch": 2.06, + "eval_logits/chosen": -2.558769464492798, + "eval_logits/rejected": -2.538564920425415, + "eval_logps/chosen": -297.4822692871094, + "eval_logps/rejected": -336.77093505859375, + "eval_loss": 0.5768638253211975, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -10.324420928955078, + "eval_rewards/margins": 6.16107177734375, + "eval_rewards/rejected": -16.485496520996094, + "eval_runtime": 140.844, + "eval_samples_per_second": 22.408, + "eval_steps_per_second": 0.355, + "step": 10600 + }, + { + "epoch": 2.06, + "learning_rate": 1.74120946286043e-07, + "logits/chosen": -2.896561861038208, + "logits/rejected": -2.8578813076019287, + "logps/chosen": -297.384521484375, + "logps/rejected": -331.8183898925781, + "loss": 0.0645, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.844348907470703, + "rewards/margins": 9.64069938659668, + "rewards/rejected": -13.485048294067383, + "step": 10610 + }, + { + "epoch": 2.06, + "learning_rate": 1.7376141511469044e-07, + "logits/chosen": -2.603555679321289, + "logits/rejected": -2.540269613265991, + "logps/chosen": -181.19918823242188, + "logps/rejected": -327.0639953613281, + "loss": 0.0679, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.039401054382324, + "rewards/margins": 12.538655281066895, + "rewards/rejected": -16.578054428100586, + "step": 10620 + }, + { + "epoch": 2.06, + "learning_rate": 1.7340188394333787e-07, + "logits/chosen": -2.6098814010620117, + "logits/rejected": -2.6087749004364014, + "logps/chosen": -221.781494140625, + "logps/rejected": -306.01324462890625, + "loss": 0.047, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.989348411560059, + "rewards/margins": 9.410599708557129, + "rewards/rejected": -16.399948120117188, + "step": 10630 + }, + { + "epoch": 2.07, + "learning_rate": 1.730423527719853e-07, + "logits/chosen": -2.7155346870422363, + "logits/rejected": -2.4269816875457764, + "logps/chosen": -239.442626953125, + "logps/rejected": -327.3274230957031, + "loss": 0.0819, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.456811904907227, + "rewards/margins": 12.340845108032227, + "rewards/rejected": -17.797657012939453, + "step": 10640 + }, + { + "epoch": 2.07, + "learning_rate": 1.7268282160063278e-07, + "logits/chosen": -2.54668927192688, + "logits/rejected": -2.483928680419922, + "logps/chosen": -241.74331665039062, + "logps/rejected": -301.92828369140625, + "loss": 0.0707, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.61628532409668, + "rewards/margins": 13.098543167114258, + "rewards/rejected": -17.714828491210938, + "step": 10650 + }, + { + "epoch": 2.07, + "learning_rate": 1.723232904292802e-07, + "logits/chosen": -2.475114107131958, + "logits/rejected": -2.5202317237854004, + "logps/chosen": -297.5644226074219, + "logps/rejected": -468.6898498535156, + "loss": 0.0504, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.696624279022217, + "rewards/margins": 12.959909439086914, + "rewards/rejected": -17.656536102294922, + "step": 10660 + }, + { + "epoch": 2.07, + "learning_rate": 1.7196375925792766e-07, + "logits/chosen": -2.719630718231201, + "logits/rejected": -2.7059099674224854, + "logps/chosen": -252.2794647216797, + "logps/rejected": -373.1482849121094, + "loss": 0.0544, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.261512756347656, + "rewards/margins": 11.596822738647461, + "rewards/rejected": -16.858333587646484, + "step": 10670 + }, + { + "epoch": 2.07, + "learning_rate": 1.716042280865751e-07, + "logits/chosen": -2.4799282550811768, + "logits/rejected": -2.416187047958374, + "logps/chosen": -255.59725952148438, + "logps/rejected": -332.4320068359375, + "loss": 0.0899, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.2090559005737305, + "rewards/margins": 11.93217945098877, + "rewards/rejected": -19.141237258911133, + "step": 10680 + }, + { + "epoch": 2.08, + "learning_rate": 1.7124469691522252e-07, + "logits/chosen": -2.6054298877716064, + "logits/rejected": -2.5862808227539062, + "logps/chosen": -216.68701171875, + "logps/rejected": -321.9322814941406, + "loss": 0.0674, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.772042274475098, + "rewards/margins": 9.14348316192627, + "rewards/rejected": -15.915525436401367, + "step": 10690 + }, + { + "epoch": 2.08, + "learning_rate": 1.7088516574387e-07, + "logits/chosen": -2.624765396118164, + "logits/rejected": -2.569063186645508, + "logps/chosen": -245.7399139404297, + "logps/rejected": -294.5632629394531, + "loss": 0.0459, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.5335798263549805, + "rewards/margins": 10.03137493133545, + "rewards/rejected": -17.56495475769043, + "step": 10700 + }, + { + "epoch": 2.08, + "eval_logits/chosen": -2.544043779373169, + "eval_logits/rejected": -2.523214340209961, + "eval_logps/chosen": -295.04119873046875, + "eval_logps/rejected": -336.9667053222656, + "eval_loss": 0.5941163897514343, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -10.080312728881836, + "eval_rewards/margins": 6.424767017364502, + "eval_rewards/rejected": -16.505081176757812, + "eval_runtime": 141.339, + "eval_samples_per_second": 22.329, + "eval_steps_per_second": 0.354, + "step": 10700 + }, + { + "epoch": 2.08, + "learning_rate": 1.7052563457251743e-07, + "logits/chosen": -2.653287887573242, + "logits/rejected": -2.660998821258545, + "logps/chosen": -278.9805603027344, + "logps/rejected": -360.039794921875, + "loss": 0.0912, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.7693047523498535, + "rewards/margins": 12.69771671295166, + "rewards/rejected": -20.467021942138672, + "step": 10710 + }, + { + "epoch": 2.08, + "learning_rate": 1.7016610340116488e-07, + "logits/chosen": -2.692852735519409, + "logits/rejected": -2.5415070056915283, + "logps/chosen": -252.43838500976562, + "logps/rejected": -375.9709777832031, + "loss": 0.0884, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5314717292785645, + "rewards/margins": 12.215380668640137, + "rewards/rejected": -15.746851921081543, + "step": 10720 + }, + { + "epoch": 2.08, + "learning_rate": 1.698065722298123e-07, + "logits/chosen": -2.6087474822998047, + "logits/rejected": -2.59074068069458, + "logps/chosen": -276.2684631347656, + "logps/rejected": -426.1739196777344, + "loss": 0.0764, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.9938807487487793, + "rewards/margins": 12.584334373474121, + "rewards/rejected": -16.57821273803711, + "step": 10730 + }, + { + "epoch": 2.09, + "learning_rate": 1.6944704105845974e-07, + "logits/chosen": -2.7049241065979004, + "logits/rejected": -2.605888843536377, + "logps/chosen": -297.2758483886719, + "logps/rejected": -457.2428283691406, + "loss": 0.0371, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.898677349090576, + "rewards/margins": 14.518112182617188, + "rewards/rejected": -17.416790008544922, + "step": 10740 + }, + { + "epoch": 2.09, + "learning_rate": 1.6908750988710722e-07, + "logits/chosen": -2.5800156593322754, + "logits/rejected": -2.4820423126220703, + "logps/chosen": -329.87335205078125, + "logps/rejected": -347.7347106933594, + "loss": 0.056, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.187057495117188, + "rewards/margins": 8.66640853881836, + "rewards/rejected": -16.853466033935547, + "step": 10750 + }, + { + "epoch": 2.09, + "learning_rate": 1.6872797871575465e-07, + "logits/chosen": -2.5678439140319824, + "logits/rejected": -2.672456741333008, + "logps/chosen": -306.06353759765625, + "logps/rejected": -465.56439208984375, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.193748474121094, + "rewards/margins": 14.256443977355957, + "rewards/rejected": -18.450191497802734, + "step": 10760 + }, + { + "epoch": 2.09, + "learning_rate": 1.683684475444021e-07, + "logits/chosen": -2.463825225830078, + "logits/rejected": -2.4699854850769043, + "logps/chosen": -351.6150817871094, + "logps/rejected": -374.30078125, + "loss": 0.1096, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.511889934539795, + "rewards/margins": 11.097128868103027, + "rewards/rejected": -13.60901927947998, + "step": 10770 + }, + { + "epoch": 2.09, + "learning_rate": 1.6800891637304953e-07, + "logits/chosen": -2.4313137531280518, + "logits/rejected": -2.417144298553467, + "logps/chosen": -268.6635437011719, + "logps/rejected": -355.5901794433594, + "loss": 0.0738, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.694070816040039, + "rewards/margins": 10.495762825012207, + "rewards/rejected": -16.18983268737793, + "step": 10780 + }, + { + "epoch": 2.09, + "learning_rate": 1.6764938520169696e-07, + "logits/chosen": -2.7539901733398438, + "logits/rejected": -2.6599209308624268, + "logps/chosen": -214.2755889892578, + "logps/rejected": -298.9658508300781, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8701863288879395, + "rewards/margins": 12.36173152923584, + "rewards/rejected": -16.231918334960938, + "step": 10790 + }, + { + "epoch": 2.1, + "learning_rate": 1.6728985403034444e-07, + "logits/chosen": -2.532379150390625, + "logits/rejected": -2.454911470413208, + "logps/chosen": -246.7219696044922, + "logps/rejected": -326.46514892578125, + "loss": 0.0586, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.285130262374878, + "rewards/margins": 11.530749320983887, + "rewards/rejected": -14.815881729125977, + "step": 10800 + }, + { + "epoch": 2.1, + "eval_logits/chosen": -2.5395236015319824, + "eval_logits/rejected": -2.516683578491211, + "eval_logps/chosen": -296.6443176269531, + "eval_logps/rejected": -339.0528564453125, + "eval_loss": 0.5881070494651794, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -10.240626335144043, + "eval_rewards/margins": 6.4730634689331055, + "eval_rewards/rejected": -16.71368980407715, + "eval_runtime": 140.6279, + "eval_samples_per_second": 22.442, + "eval_steps_per_second": 0.356, + "step": 10800 + }, + { + "epoch": 2.1, + "learning_rate": 1.6693032285899186e-07, + "logits/chosen": -2.4681193828582764, + "logits/rejected": -2.490391969680786, + "logps/chosen": -182.4458465576172, + "logps/rejected": -254.2109375, + "loss": 0.0707, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.3664960861206055, + "rewards/margins": 7.3670454025268555, + "rewards/rejected": -11.733543395996094, + "step": 10810 + }, + { + "epoch": 2.1, + "learning_rate": 1.6657079168763932e-07, + "logits/chosen": -2.5093283653259277, + "logits/rejected": -2.5589137077331543, + "logps/chosen": -321.19989013671875, + "logps/rejected": -306.82843017578125, + "loss": 0.095, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.453250885009766, + "rewards/margins": 8.917437553405762, + "rewards/rejected": -13.370686531066895, + "step": 10820 + }, + { + "epoch": 2.1, + "learning_rate": 1.6621126051628675e-07, + "logits/chosen": -2.6584315299987793, + "logits/rejected": -2.6464643478393555, + "logps/chosen": -274.248779296875, + "logps/rejected": -381.333984375, + "loss": 0.0556, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.8905997276306152, + "rewards/margins": 12.009408950805664, + "rewards/rejected": -14.900009155273438, + "step": 10830 + }, + { + "epoch": 2.1, + "learning_rate": 1.658517293449342e-07, + "logits/chosen": -2.649141788482666, + "logits/rejected": -2.5465428829193115, + "logps/chosen": -394.0592041015625, + "logps/rejected": -392.80303955078125, + "loss": 0.0488, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.281259536743164, + "rewards/margins": 13.165651321411133, + "rewards/rejected": -21.446908950805664, + "step": 10840 + }, + { + "epoch": 2.11, + "learning_rate": 1.6549219817358166e-07, + "logits/chosen": -2.739020824432373, + "logits/rejected": -2.6592862606048584, + "logps/chosen": -332.251953125, + "logps/rejected": -469.31219482421875, + "loss": 0.098, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.956437587738037, + "rewards/margins": 12.968274116516113, + "rewards/rejected": -19.924711227416992, + "step": 10850 + }, + { + "epoch": 2.11, + "learning_rate": 1.6513266700222908e-07, + "logits/chosen": -2.4472410678863525, + "logits/rejected": -2.449275255203247, + "logps/chosen": -301.61260986328125, + "logps/rejected": -461.7283630371094, + "loss": 0.086, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.039148330688477, + "rewards/margins": 14.28356647491455, + "rewards/rejected": -20.32271385192871, + "step": 10860 + }, + { + "epoch": 2.11, + "learning_rate": 1.6477313583087654e-07, + "logits/chosen": -2.6414430141448975, + "logits/rejected": -2.567702054977417, + "logps/chosen": -313.7510070800781, + "logps/rejected": -385.27813720703125, + "loss": 0.0872, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.331001281738281, + "rewards/margins": 8.52501106262207, + "rewards/rejected": -13.856013298034668, + "step": 10870 + }, + { + "epoch": 2.11, + "learning_rate": 1.6441360465952397e-07, + "logits/chosen": -2.6752846240997314, + "logits/rejected": -2.635814666748047, + "logps/chosen": -284.8486633300781, + "logps/rejected": -546.738037109375, + "loss": 0.071, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9140230417251587, + "rewards/margins": 19.600696563720703, + "rewards/rejected": -21.514720916748047, + "step": 10880 + }, + { + "epoch": 2.11, + "learning_rate": 1.6405407348817142e-07, + "logits/chosen": -2.592179536819458, + "logits/rejected": -2.7527787685394287, + "logps/chosen": -332.42327880859375, + "logps/rejected": -542.1893310546875, + "loss": 0.0717, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.809802055358887, + "rewards/margins": 17.367122650146484, + "rewards/rejected": -24.176923751831055, + "step": 10890 + }, + { + "epoch": 2.12, + "learning_rate": 1.6369454231681887e-07, + "logits/chosen": -2.4406590461730957, + "logits/rejected": -2.5358266830444336, + "logps/chosen": -213.36709594726562, + "logps/rejected": -317.38775634765625, + "loss": 0.0599, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6232361793518066, + "rewards/margins": 13.348731994628906, + "rewards/rejected": -15.971966743469238, + "step": 10900 + }, + { + "epoch": 2.12, + "eval_logits/chosen": -2.5247130393981934, + "eval_logits/rejected": -2.4991767406463623, + "eval_logps/chosen": -313.14306640625, + "eval_logps/rejected": -359.2173156738281, + "eval_loss": 0.614883542060852, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -11.890498161315918, + "eval_rewards/margins": 6.839634895324707, + "eval_rewards/rejected": -18.730134963989258, + "eval_runtime": 140.4936, + "eval_samples_per_second": 22.464, + "eval_steps_per_second": 0.356, + "step": 10900 + }, + { + "epoch": 2.12, + "learning_rate": 1.633350111454663e-07, + "logits/chosen": -2.639094829559326, + "logits/rejected": -2.6810290813446045, + "logps/chosen": -278.84307861328125, + "logps/rejected": -461.92364501953125, + "loss": 0.0427, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.86960220336914, + "rewards/margins": 13.029779434204102, + "rewards/rejected": -21.899381637573242, + "step": 10910 + }, + { + "epoch": 2.12, + "learning_rate": 1.6297547997411376e-07, + "logits/chosen": -2.2820096015930176, + "logits/rejected": -2.2630152702331543, + "logps/chosen": -302.6440734863281, + "logps/rejected": -329.61541748046875, + "loss": 0.0937, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.195849418640137, + "rewards/margins": 10.124932289123535, + "rewards/rejected": -16.320781707763672, + "step": 10920 + }, + { + "epoch": 2.12, + "learning_rate": 1.6261594880276118e-07, + "logits/chosen": -2.647876501083374, + "logits/rejected": -2.4846560955047607, + "logps/chosen": -274.51971435546875, + "logps/rejected": -401.75225830078125, + "loss": 0.0565, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.075141906738281, + "rewards/margins": 15.784754753112793, + "rewards/rejected": -25.85989761352539, + "step": 10930 + }, + { + "epoch": 2.12, + "learning_rate": 1.6225641763140864e-07, + "logits/chosen": -2.8466548919677734, + "logits/rejected": -2.6956911087036133, + "logps/chosen": -332.37286376953125, + "logps/rejected": -427.3744201660156, + "loss": 0.0622, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.177321910858154, + "rewards/margins": 15.13109016418457, + "rewards/rejected": -21.308412551879883, + "step": 10940 + }, + { + "epoch": 2.13, + "learning_rate": 1.618968864600561e-07, + "logits/chosen": -2.668459415435791, + "logits/rejected": -2.5558042526245117, + "logps/chosen": -268.5563049316406, + "logps/rejected": -342.63177490234375, + "loss": 0.0955, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.891250610351562, + "rewards/margins": 10.969114303588867, + "rewards/rejected": -20.86036491394043, + "step": 10950 + }, + { + "epoch": 2.13, + "learning_rate": 1.6153735528870352e-07, + "logits/chosen": -2.714474678039551, + "logits/rejected": -2.6775665283203125, + "logps/chosen": -343.44305419921875, + "logps/rejected": -320.18634033203125, + "loss": 0.0629, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.7504987716674805, + "rewards/margins": 10.8442964553833, + "rewards/rejected": -16.59479522705078, + "step": 10960 + }, + { + "epoch": 2.13, + "learning_rate": 1.6117782411735097e-07, + "logits/chosen": -2.4388394355773926, + "logits/rejected": -2.5422651767730713, + "logps/chosen": -258.8954162597656, + "logps/rejected": -359.7748718261719, + "loss": 0.0807, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.019707679748535, + "rewards/margins": 7.9827775955200195, + "rewards/rejected": -17.002483367919922, + "step": 10970 + }, + { + "epoch": 2.13, + "learning_rate": 1.608182929459984e-07, + "logits/chosen": -2.6410956382751465, + "logits/rejected": -2.683004856109619, + "logps/chosen": -475.34954833984375, + "logps/rejected": -489.85003662109375, + "loss": 0.0584, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.680830478668213, + "rewards/margins": 15.518026351928711, + "rewards/rejected": -18.198856353759766, + "step": 10980 + }, + { + "epoch": 2.13, + "learning_rate": 1.6045876177464586e-07, + "logits/chosen": -2.6654322147369385, + "logits/rejected": -2.5960605144500732, + "logps/chosen": -278.5632019042969, + "logps/rejected": -429.90130615234375, + "loss": 0.0675, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.198668956756592, + "rewards/margins": 12.306130409240723, + "rewards/rejected": -18.50480079650879, + "step": 10990 + }, + { + "epoch": 2.14, + "learning_rate": 1.600992306032933e-07, + "logits/chosen": -2.6287841796875, + "logits/rejected": -2.690498113632202, + "logps/chosen": -323.66571044921875, + "logps/rejected": -383.86419677734375, + "loss": 0.0518, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.227775573730469, + "rewards/margins": 13.197230339050293, + "rewards/rejected": -19.425006866455078, + "step": 11000 + }, + { + "epoch": 2.14, + "eval_logits/chosen": -2.5589678287506104, + "eval_logits/rejected": -2.5353012084960938, + "eval_logps/chosen": -313.0390930175781, + "eval_logps/rejected": -360.3355712890625, + "eval_loss": 0.6385772824287415, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -11.880102157592773, + "eval_rewards/margins": 6.9618611335754395, + "eval_rewards/rejected": -18.841962814331055, + "eval_runtime": 156.4122, + "eval_samples_per_second": 20.177, + "eval_steps_per_second": 0.32, + "step": 11000 + }, + { + "epoch": 2.14, + "learning_rate": 1.5973969943194074e-07, + "logits/chosen": -2.667440891265869, + "logits/rejected": -2.384535312652588, + "logps/chosen": -244.1143341064453, + "logps/rejected": -304.1304626464844, + "loss": 0.0741, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.142891883850098, + "rewards/margins": 10.961563110351562, + "rewards/rejected": -16.104454040527344, + "step": 11010 + }, + { + "epoch": 2.14, + "learning_rate": 1.593801682605882e-07, + "logits/chosen": -2.6622557640075684, + "logits/rejected": -2.608793258666992, + "logps/chosen": -297.91717529296875, + "logps/rejected": -365.6161804199219, + "loss": 0.0821, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.57074499130249, + "rewards/margins": 13.855226516723633, + "rewards/rejected": -20.42597007751465, + "step": 11020 + }, + { + "epoch": 2.14, + "learning_rate": 1.5902063708923562e-07, + "logits/chosen": -2.718367338180542, + "logits/rejected": -2.597449779510498, + "logps/chosen": -279.3199768066406, + "logps/rejected": -388.0484313964844, + "loss": 0.0788, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.699678421020508, + "rewards/margins": 11.566597938537598, + "rewards/rejected": -22.266277313232422, + "step": 11030 + }, + { + "epoch": 2.14, + "learning_rate": 1.5866110591788308e-07, + "logits/chosen": -2.8459181785583496, + "logits/rejected": -2.811908006668091, + "logps/chosen": -329.0658874511719, + "logps/rejected": -457.80810546875, + "loss": 0.0624, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6768605709075928, + "rewards/margins": 14.439372062683105, + "rewards/rejected": -17.11623191833496, + "step": 11040 + }, + { + "epoch": 2.15, + "learning_rate": 1.5830157474653053e-07, + "logits/chosen": -2.5981099605560303, + "logits/rejected": -2.575413703918457, + "logps/chosen": -371.9594421386719, + "logps/rejected": -435.58349609375, + "loss": 0.0739, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.415132522583008, + "rewards/margins": 11.580273628234863, + "rewards/rejected": -20.99540901184082, + "step": 11050 + }, + { + "epoch": 2.15, + "learning_rate": 1.5794204357517796e-07, + "logits/chosen": -2.4940452575683594, + "logits/rejected": -2.682844638824463, + "logps/chosen": -217.81845092773438, + "logps/rejected": -368.53680419921875, + "loss": 0.0621, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.1467790603637695, + "rewards/margins": 12.292379379272461, + "rewards/rejected": -19.439159393310547, + "step": 11060 + }, + { + "epoch": 2.15, + "learning_rate": 1.575825124038254e-07, + "logits/chosen": -2.670027256011963, + "logits/rejected": -2.7122721672058105, + "logps/chosen": -258.63818359375, + "logps/rejected": -381.0811462402344, + "loss": 0.0697, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.69087028503418, + "rewards/margins": 11.159737586975098, + "rewards/rejected": -15.850606918334961, + "step": 11070 + }, + { + "epoch": 2.15, + "learning_rate": 1.5722298123247284e-07, + "logits/chosen": -2.6194968223571777, + "logits/rejected": -2.646165370941162, + "logps/chosen": -317.9933776855469, + "logps/rejected": -354.77020263671875, + "loss": 0.0882, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.147225379943848, + "rewards/margins": 8.7410306930542, + "rewards/rejected": -16.88825798034668, + "step": 11080 + }, + { + "epoch": 2.15, + "learning_rate": 1.568634500611203e-07, + "logits/chosen": -2.602839946746826, + "logits/rejected": -2.6435656547546387, + "logps/chosen": -314.6427001953125, + "logps/rejected": -511.84307861328125, + "loss": 0.0947, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.209485054016113, + "rewards/margins": 16.623762130737305, + "rewards/rejected": -22.83324432373047, + "step": 11090 + }, + { + "epoch": 2.15, + "learning_rate": 1.5650391888976775e-07, + "logits/chosen": -2.8791985511779785, + "logits/rejected": -2.71051025390625, + "logps/chosen": -263.18023681640625, + "logps/rejected": -334.80242919921875, + "loss": 0.0668, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.914602041244507, + "rewards/margins": 10.570642471313477, + "rewards/rejected": -13.485244750976562, + "step": 11100 + }, + { + "epoch": 2.15, + "eval_logits/chosen": -2.534025192260742, + "eval_logits/rejected": -2.5089762210845947, + "eval_logps/chosen": -311.0261535644531, + "eval_logps/rejected": -360.55535888671875, + "eval_loss": 0.6274450421333313, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -11.678807258605957, + "eval_rewards/margins": 7.185128688812256, + "eval_rewards/rejected": -18.863937377929688, + "eval_runtime": 154.558, + "eval_samples_per_second": 20.42, + "eval_steps_per_second": 0.324, + "step": 11100 + }, + { + "epoch": 2.16, + "learning_rate": 1.5614438771841518e-07, + "logits/chosen": -2.553730010986328, + "logits/rejected": -2.4233312606811523, + "logps/chosen": -297.3955383300781, + "logps/rejected": -408.5569152832031, + "loss": 0.0643, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.867355346679688, + "rewards/margins": 10.925642013549805, + "rewards/rejected": -20.792999267578125, + "step": 11110 + }, + { + "epoch": 2.16, + "learning_rate": 1.557848565470626e-07, + "logits/chosen": -2.7405340671539307, + "logits/rejected": -2.722381830215454, + "logps/chosen": -213.37008666992188, + "logps/rejected": -380.01898193359375, + "loss": 0.0895, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.469528675079346, + "rewards/margins": 13.943641662597656, + "rewards/rejected": -18.413171768188477, + "step": 11120 + }, + { + "epoch": 2.16, + "learning_rate": 1.5542532537571006e-07, + "logits/chosen": -2.604018449783325, + "logits/rejected": -2.575108051300049, + "logps/chosen": -248.8677978515625, + "logps/rejected": -353.578369140625, + "loss": 0.1341, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.432516098022461, + "rewards/margins": 8.367327690124512, + "rewards/rejected": -16.79984474182129, + "step": 11130 + }, + { + "epoch": 2.16, + "learning_rate": 1.550657942043575e-07, + "logits/chosen": -2.7750678062438965, + "logits/rejected": -2.6267189979553223, + "logps/chosen": -290.4456787109375, + "logps/rejected": -415.94647216796875, + "loss": 0.0588, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.543251991271973, + "rewards/margins": 12.352669715881348, + "rewards/rejected": -19.895919799804688, + "step": 11140 + }, + { + "epoch": 2.16, + "learning_rate": 1.5470626303300497e-07, + "logits/chosen": -2.464034080505371, + "logits/rejected": -2.460228681564331, + "logps/chosen": -296.686279296875, + "logps/rejected": -447.08184814453125, + "loss": 0.0575, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.144915580749512, + "rewards/margins": 15.27160358428955, + "rewards/rejected": -24.416519165039062, + "step": 11150 + }, + { + "epoch": 2.17, + "learning_rate": 1.543467318616524e-07, + "logits/chosen": -2.453579902648926, + "logits/rejected": -2.556788682937622, + "logps/chosen": -257.7510986328125, + "logps/rejected": -426.12823486328125, + "loss": 0.1015, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.701089859008789, + "rewards/margins": 14.420515060424805, + "rewards/rejected": -24.121604919433594, + "step": 11160 + }, + { + "epoch": 2.17, + "learning_rate": 1.5398720069029982e-07, + "logits/chosen": -2.607083797454834, + "logits/rejected": -2.6189160346984863, + "logps/chosen": -361.08050537109375, + "logps/rejected": -591.3878173828125, + "loss": 0.063, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.01377010345459, + "rewards/margins": 13.404559135437012, + "rewards/rejected": -22.418331146240234, + "step": 11170 + }, + { + "epoch": 2.17, + "learning_rate": 1.5362766951894728e-07, + "logits/chosen": -2.4953904151916504, + "logits/rejected": -2.5536980628967285, + "logps/chosen": -264.725830078125, + "logps/rejected": -472.8086853027344, + "loss": 0.092, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.30578899383545, + "rewards/margins": 11.997278213500977, + "rewards/rejected": -27.303064346313477, + "step": 11180 + }, + { + "epoch": 2.17, + "learning_rate": 1.5326813834759473e-07, + "logits/chosen": -2.753215789794922, + "logits/rejected": -2.685314893722534, + "logps/chosen": -293.8022155761719, + "logps/rejected": -406.1915588378906, + "loss": 0.0626, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0069998502731323, + "rewards/margins": 14.971635818481445, + "rewards/rejected": -15.978635787963867, + "step": 11190 + }, + { + "epoch": 2.17, + "learning_rate": 1.5290860717624219e-07, + "logits/chosen": -2.5073513984680176, + "logits/rejected": -2.4092020988464355, + "logps/chosen": -179.8335723876953, + "logps/rejected": -279.8216247558594, + "loss": 0.1038, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.35915756225586, + "rewards/margins": 10.612787246704102, + "rewards/rejected": -18.97194480895996, + "step": 11200 + }, + { + "epoch": 2.17, + "eval_logits/chosen": -2.5274198055267334, + "eval_logits/rejected": -2.5016088485717773, + "eval_logps/chosen": -311.4629211425781, + "eval_logps/rejected": -362.78240966796875, + "eval_loss": 0.6328377723693848, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -11.722484588623047, + "eval_rewards/margins": 7.364159107208252, + "eval_rewards/rejected": -19.086645126342773, + "eval_runtime": 139.807, + "eval_samples_per_second": 22.574, + "eval_steps_per_second": 0.358, + "step": 11200 + }, + { + "epoch": 2.18, + "learning_rate": 1.525490760048896e-07, + "logits/chosen": -2.626120090484619, + "logits/rejected": -2.6258702278137207, + "logps/chosen": -225.55581665039062, + "logps/rejected": -447.088134765625, + "loss": 0.0403, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.508953094482422, + "rewards/margins": 16.111501693725586, + "rewards/rejected": -19.620454788208008, + "step": 11210 + }, + { + "epoch": 2.18, + "learning_rate": 1.5218954483353704e-07, + "logits/chosen": -2.7070865631103516, + "logits/rejected": -2.6965084075927734, + "logps/chosen": -276.8108825683594, + "logps/rejected": -474.0812072753906, + "loss": 0.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2777347564697266, + "rewards/margins": 17.953739166259766, + "rewards/rejected": -19.231473922729492, + "step": 11220 + }, + { + "epoch": 2.18, + "learning_rate": 1.5183001366218452e-07, + "logits/chosen": -2.6148335933685303, + "logits/rejected": -2.7021570205688477, + "logps/chosen": -232.45120239257812, + "logps/rejected": -314.4829406738281, + "loss": 0.0619, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.771601676940918, + "rewards/margins": 10.580767631530762, + "rewards/rejected": -13.35236930847168, + "step": 11230 + }, + { + "epoch": 2.18, + "learning_rate": 1.5147048249083195e-07, + "logits/chosen": -2.48647403717041, + "logits/rejected": -2.547441005706787, + "logps/chosen": -325.8612365722656, + "logps/rejected": -394.05877685546875, + "loss": 0.0582, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.092509269714355, + "rewards/margins": 10.890533447265625, + "rewards/rejected": -23.983041763305664, + "step": 11240 + }, + { + "epoch": 2.18, + "learning_rate": 1.511109513194794e-07, + "logits/chosen": -2.523958206176758, + "logits/rejected": -2.5374932289123535, + "logps/chosen": -317.35626220703125, + "logps/rejected": -418.19427490234375, + "loss": 0.0474, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.8556671142578125, + "rewards/margins": 11.438713073730469, + "rewards/rejected": -19.29437828063965, + "step": 11250 + }, + { + "epoch": 2.19, + "learning_rate": 1.5075142014812683e-07, + "logits/chosen": -2.656561851501465, + "logits/rejected": -2.5346570014953613, + "logps/chosen": -212.903076171875, + "logps/rejected": -320.86029052734375, + "loss": 0.073, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.8994927406311035, + "rewards/margins": 12.7178373336792, + "rewards/rejected": -17.617328643798828, + "step": 11260 + }, + { + "epoch": 2.19, + "learning_rate": 1.5039188897677426e-07, + "logits/chosen": -2.640470027923584, + "logits/rejected": -2.475165843963623, + "logps/chosen": -318.3912658691406, + "logps/rejected": -448.56268310546875, + "loss": 0.0763, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.485833168029785, + "rewards/margins": 11.998087882995605, + "rewards/rejected": -18.48392105102539, + "step": 11270 + }, + { + "epoch": 2.19, + "learning_rate": 1.5003235780542174e-07, + "logits/chosen": -2.808537006378174, + "logits/rejected": -2.6735212802886963, + "logps/chosen": -324.00677490234375, + "logps/rejected": -353.6200256347656, + "loss": 0.0868, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3140597343444824, + "rewards/margins": 11.747946739196777, + "rewards/rejected": -15.06200885772705, + "step": 11280 + }, + { + "epoch": 2.19, + "learning_rate": 1.4967282663406917e-07, + "logits/chosen": -2.7259631156921387, + "logits/rejected": -2.7184345722198486, + "logps/chosen": -307.0538024902344, + "logps/rejected": -416.4231872558594, + "loss": 0.0713, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.9838473796844482, + "rewards/margins": 12.451475143432617, + "rewards/rejected": -16.435321807861328, + "step": 11290 + }, + { + "epoch": 2.19, + "learning_rate": 1.4931329546271662e-07, + "logits/chosen": -2.4986824989318848, + "logits/rejected": -2.492968797683716, + "logps/chosen": -327.5531921386719, + "logps/rejected": -383.15606689453125, + "loss": 0.0684, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.82669734954834, + "rewards/margins": 13.673391342163086, + "rewards/rejected": -20.50008773803711, + "step": 11300 + }, + { + "epoch": 2.19, + "eval_logits/chosen": -2.5490097999572754, + "eval_logits/rejected": -2.528714418411255, + "eval_logps/chosen": -305.3045959472656, + "eval_logps/rejected": -352.1844482421875, + "eval_loss": 0.6159024238586426, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -11.106654167175293, + "eval_rewards/margins": 6.920196533203125, + "eval_rewards/rejected": -18.0268497467041, + "eval_runtime": 140.5121, + "eval_samples_per_second": 22.461, + "eval_steps_per_second": 0.356, + "step": 11300 + }, + { + "epoch": 2.2, + "learning_rate": 1.4895376429136405e-07, + "logits/chosen": -2.5363929271698, + "logits/rejected": -2.599428415298462, + "logps/chosen": -284.7268981933594, + "logps/rejected": -372.39935302734375, + "loss": 0.0858, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.038068771362305, + "rewards/margins": 9.73109245300293, + "rewards/rejected": -20.769161224365234, + "step": 11310 + }, + { + "epoch": 2.2, + "learning_rate": 1.4859423312001148e-07, + "logits/chosen": -2.5442938804626465, + "logits/rejected": -2.484290599822998, + "logps/chosen": -228.50784301757812, + "logps/rejected": -322.25811767578125, + "loss": 0.0613, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.360391616821289, + "rewards/margins": 13.330389022827148, + "rewards/rejected": -16.69078254699707, + "step": 11320 + }, + { + "epoch": 2.2, + "learning_rate": 1.4823470194865896e-07, + "logits/chosen": -2.5713067054748535, + "logits/rejected": -2.541801691055298, + "logps/chosen": -290.6044921875, + "logps/rejected": -360.5171813964844, + "loss": 0.0716, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.318646430969238, + "rewards/margins": 10.339444160461426, + "rewards/rejected": -17.658090591430664, + "step": 11330 + }, + { + "epoch": 2.2, + "learning_rate": 1.4787517077730639e-07, + "logits/chosen": -2.5785257816314697, + "logits/rejected": -2.57765531539917, + "logps/chosen": -262.76776123046875, + "logps/rejected": -473.09429931640625, + "loss": 0.1116, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.607316970825195, + "rewards/margins": 17.369888305664062, + "rewards/rejected": -26.977203369140625, + "step": 11340 + }, + { + "epoch": 2.2, + "learning_rate": 1.4751563960595384e-07, + "logits/chosen": -2.5037636756896973, + "logits/rejected": -2.5612921714782715, + "logps/chosen": -239.2677764892578, + "logps/rejected": -393.05511474609375, + "loss": 0.081, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.62736988067627, + "rewards/margins": 11.542470932006836, + "rewards/rejected": -21.169841766357422, + "step": 11350 + }, + { + "epoch": 2.21, + "learning_rate": 1.4715610843460127e-07, + "logits/chosen": -2.627143383026123, + "logits/rejected": -2.6791281700134277, + "logps/chosen": -336.3082275390625, + "logps/rejected": -490.94036865234375, + "loss": 0.0631, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.407607078552246, + "rewards/margins": 11.873138427734375, + "rewards/rejected": -19.280746459960938, + "step": 11360 + }, + { + "epoch": 2.21, + "learning_rate": 1.467965772632487e-07, + "logits/chosen": -2.705996036529541, + "logits/rejected": -2.693732738494873, + "logps/chosen": -287.5931091308594, + "logps/rejected": -437.357177734375, + "loss": 0.0525, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.194832801818848, + "rewards/margins": 11.717456817626953, + "rewards/rejected": -17.912288665771484, + "step": 11370 + }, + { + "epoch": 2.21, + "learning_rate": 1.4643704609189618e-07, + "logits/chosen": -2.4713706970214844, + "logits/rejected": -2.4754528999328613, + "logps/chosen": -204.80233764648438, + "logps/rejected": -354.646240234375, + "loss": 0.0896, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.231915473937988, + "rewards/margins": 10.809212684631348, + "rewards/rejected": -20.041126251220703, + "step": 11380 + }, + { + "epoch": 2.21, + "learning_rate": 1.460775149205436e-07, + "logits/chosen": -2.6862220764160156, + "logits/rejected": -2.713573932647705, + "logps/chosen": -274.80499267578125, + "logps/rejected": -307.76190185546875, + "loss": 0.0738, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.292464256286621, + "rewards/margins": 8.338617324829102, + "rewards/rejected": -16.631084442138672, + "step": 11390 + }, + { + "epoch": 2.21, + "learning_rate": 1.4571798374919106e-07, + "logits/chosen": -2.600187301635742, + "logits/rejected": -2.5780961513519287, + "logps/chosen": -236.43203735351562, + "logps/rejected": -311.9661560058594, + "loss": 0.1067, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.995016574859619, + "rewards/margins": 9.66476058959961, + "rewards/rejected": -17.65977668762207, + "step": 11400 + }, + { + "epoch": 2.21, + "eval_logits/chosen": -2.597402572631836, + "eval_logits/rejected": -2.5786943435668945, + "eval_logps/chosen": -296.12762451171875, + "eval_logps/rejected": -338.4790344238281, + "eval_loss": 0.6008053421974182, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -10.188957214355469, + "eval_rewards/margins": 6.467350006103516, + "eval_rewards/rejected": -16.65630531311035, + "eval_runtime": 153.6646, + "eval_samples_per_second": 20.538, + "eval_steps_per_second": 0.325, + "step": 11400 + }, + { + "epoch": 2.22, + "learning_rate": 1.453584525778385e-07, + "logits/chosen": -2.750739812850952, + "logits/rejected": -2.723726511001587, + "logps/chosen": -234.849853515625, + "logps/rejected": -380.14990234375, + "loss": 0.0701, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.196296691894531, + "rewards/margins": 10.99039363861084, + "rewards/rejected": -19.186689376831055, + "step": 11410 + }, + { + "epoch": 2.22, + "learning_rate": 1.4499892140648591e-07, + "logits/chosen": -2.583066701889038, + "logits/rejected": -2.6566250324249268, + "logps/chosen": -310.9710388183594, + "logps/rejected": -354.7814025878906, + "loss": 0.0621, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.802247524261475, + "rewards/margins": 13.766270637512207, + "rewards/rejected": -20.568519592285156, + "step": 11420 + }, + { + "epoch": 2.22, + "learning_rate": 1.446393902351334e-07, + "logits/chosen": -2.6055569648742676, + "logits/rejected": -2.6457715034484863, + "logps/chosen": -286.761962890625, + "logps/rejected": -376.4789123535156, + "loss": 0.0615, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.760054111480713, + "rewards/margins": 11.66706371307373, + "rewards/rejected": -19.4271183013916, + "step": 11430 + }, + { + "epoch": 2.22, + "learning_rate": 1.4427985906378082e-07, + "logits/chosen": -2.3500683307647705, + "logits/rejected": -2.5180647373199463, + "logps/chosen": -298.7884216308594, + "logps/rejected": -370.7210388183594, + "loss": 0.1296, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.273965835571289, + "rewards/margins": 13.301396369934082, + "rewards/rejected": -18.575363159179688, + "step": 11440 + }, + { + "epoch": 2.22, + "learning_rate": 1.4392032789242828e-07, + "logits/chosen": -2.5943427085876465, + "logits/rejected": -2.5318379402160645, + "logps/chosen": -165.84658813476562, + "logps/rejected": -344.3070373535156, + "loss": 0.0928, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.968050003051758, + "rewards/margins": 11.761285781860352, + "rewards/rejected": -17.72933578491211, + "step": 11450 + }, + { + "epoch": 2.22, + "learning_rate": 1.435607967210757e-07, + "logits/chosen": -2.745082378387451, + "logits/rejected": -2.668109178543091, + "logps/chosen": -312.0365905761719, + "logps/rejected": -352.36077880859375, + "loss": 0.1067, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.082812309265137, + "rewards/margins": 9.763988494873047, + "rewards/rejected": -17.8468017578125, + "step": 11460 + }, + { + "epoch": 2.23, + "learning_rate": 1.4320126554972313e-07, + "logits/chosen": -2.496685028076172, + "logits/rejected": -2.6075363159179688, + "logps/chosen": -258.70440673828125, + "logps/rejected": -411.5464782714844, + "loss": 0.0474, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.412840843200684, + "rewards/margins": 13.252779960632324, + "rewards/rejected": -19.665620803833008, + "step": 11470 + }, + { + "epoch": 2.23, + "learning_rate": 1.4284173437837061e-07, + "logits/chosen": -2.6697323322296143, + "logits/rejected": -2.569648504257202, + "logps/chosen": -273.3839111328125, + "logps/rejected": -304.33648681640625, + "loss": 0.0733, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.097462177276611, + "rewards/margins": 8.689809799194336, + "rewards/rejected": -13.787272453308105, + "step": 11480 + }, + { + "epoch": 2.23, + "learning_rate": 1.4248220320701804e-07, + "logits/chosen": -2.6180739402770996, + "logits/rejected": -2.5678303241729736, + "logps/chosen": -257.52386474609375, + "logps/rejected": -337.18829345703125, + "loss": 0.0823, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.232558250427246, + "rewards/margins": 12.40059757232666, + "rewards/rejected": -16.633155822753906, + "step": 11490 + }, + { + "epoch": 2.23, + "learning_rate": 1.421226720356655e-07, + "logits/chosen": -2.77288818359375, + "logits/rejected": -2.778804063796997, + "logps/chosen": -344.33233642578125, + "logps/rejected": -400.1312561035156, + "loss": 0.076, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.488837480545044, + "rewards/margins": 13.25555419921875, + "rewards/rejected": -16.74439239501953, + "step": 11500 + }, + { + "epoch": 2.23, + "eval_logits/chosen": -2.581376314163208, + "eval_logits/rejected": -2.564934015274048, + "eval_logps/chosen": -286.0017395019531, + "eval_logps/rejected": -327.9375305175781, + "eval_loss": 0.6069397926330566, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -9.176368713378906, + "eval_rewards/margins": 6.425786972045898, + "eval_rewards/rejected": -15.602155685424805, + "eval_runtime": 141.032, + "eval_samples_per_second": 22.378, + "eval_steps_per_second": 0.355, + "step": 11500 + }, + { + "epoch": 2.23, + "learning_rate": 1.4176314086431292e-07, + "logits/chosen": -2.5592751502990723, + "logits/rejected": -2.426851749420166, + "logps/chosen": -259.74176025390625, + "logps/rejected": -386.64373779296875, + "loss": 0.076, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.105155944824219, + "rewards/margins": 12.326289176940918, + "rewards/rejected": -16.431446075439453, + "step": 11510 + }, + { + "epoch": 2.24, + "learning_rate": 1.4140360969296035e-07, + "logits/chosen": -2.7349648475646973, + "logits/rejected": -2.6921682357788086, + "logps/chosen": -247.73861694335938, + "logps/rejected": -351.2485046386719, + "loss": 0.0761, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.348642349243164, + "rewards/margins": 14.42656421661377, + "rewards/rejected": -17.775205612182617, + "step": 11520 + }, + { + "epoch": 2.24, + "learning_rate": 1.4104407852160783e-07, + "logits/chosen": -2.6718761920928955, + "logits/rejected": -2.6378886699676514, + "logps/chosen": -203.54443359375, + "logps/rejected": -397.050048828125, + "loss": 0.0525, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.050256252288818, + "rewards/margins": 15.7075834274292, + "rewards/rejected": -19.75783920288086, + "step": 11530 + }, + { + "epoch": 2.24, + "learning_rate": 1.4068454735025526e-07, + "logits/chosen": -2.718578338623047, + "logits/rejected": -2.7763664722442627, + "logps/chosen": -348.1041259765625, + "logps/rejected": -305.02215576171875, + "loss": 0.0627, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.355883598327637, + "rewards/margins": 7.554298400878906, + "rewards/rejected": -11.910181999206543, + "step": 11540 + }, + { + "epoch": 2.24, + "learning_rate": 1.4032501617890271e-07, + "logits/chosen": -2.6585307121276855, + "logits/rejected": -2.7362751960754395, + "logps/chosen": -245.41000366210938, + "logps/rejected": -302.5990295410156, + "loss": 0.0722, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.522536516189575, + "rewards/margins": 9.066805839538574, + "rewards/rejected": -12.589343070983887, + "step": 11550 + }, + { + "epoch": 2.24, + "learning_rate": 1.3996548500755014e-07, + "logits/chosen": -2.839871644973755, + "logits/rejected": -2.7600152492523193, + "logps/chosen": -272.6055603027344, + "logps/rejected": -296.7951965332031, + "loss": 0.0681, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.299689292907715, + "rewards/margins": 9.61026668548584, + "rewards/rejected": -13.909955978393555, + "step": 11560 + }, + { + "epoch": 2.25, + "learning_rate": 1.3960595383619757e-07, + "logits/chosen": -2.465066432952881, + "logits/rejected": -2.4901702404022217, + "logps/chosen": -203.64425659179688, + "logps/rejected": -324.4049987792969, + "loss": 0.0766, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.24183464050293, + "rewards/margins": 13.511833190917969, + "rewards/rejected": -19.753665924072266, + "step": 11570 + }, + { + "epoch": 2.25, + "learning_rate": 1.3924642266484505e-07, + "logits/chosen": -2.646562099456787, + "logits/rejected": -2.6387124061584473, + "logps/chosen": -207.3817596435547, + "logps/rejected": -311.0987243652344, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9210293292999268, + "rewards/margins": 10.693174362182617, + "rewards/rejected": -14.614204406738281, + "step": 11580 + }, + { + "epoch": 2.25, + "learning_rate": 1.3888689149349248e-07, + "logits/chosen": -2.6290736198425293, + "logits/rejected": -2.6175715923309326, + "logps/chosen": -269.1551818847656, + "logps/rejected": -309.23492431640625, + "loss": 0.0723, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.581655025482178, + "rewards/margins": 9.738439559936523, + "rewards/rejected": -17.32009506225586, + "step": 11590 + }, + { + "epoch": 2.25, + "learning_rate": 1.3852736032213993e-07, + "logits/chosen": -2.6184613704681396, + "logits/rejected": -2.687415599822998, + "logps/chosen": -295.8892517089844, + "logps/rejected": -422.18292236328125, + "loss": 0.0831, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.389686584472656, + "rewards/margins": 7.802104949951172, + "rewards/rejected": -16.191789627075195, + "step": 11600 + }, + { + "epoch": 2.25, + "eval_logits/chosen": -2.553870677947998, + "eval_logits/rejected": -2.5353219509124756, + "eval_logps/chosen": -289.2669982910156, + "eval_logps/rejected": -333.825439453125, + "eval_loss": 0.6081392765045166, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -9.502893447875977, + "eval_rewards/margins": 6.688055515289307, + "eval_rewards/rejected": -16.190948486328125, + "eval_runtime": 140.1865, + "eval_samples_per_second": 22.513, + "eval_steps_per_second": 0.357, + "step": 11600 + }, + { + "epoch": 2.25, + "learning_rate": 1.3816782915078736e-07, + "logits/chosen": -2.7735273838043213, + "logits/rejected": -2.730475664138794, + "logps/chosen": -303.290283203125, + "logps/rejected": -428.74853515625, + "loss": 0.0674, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.033844470977783, + "rewards/margins": 16.606611251831055, + "rewards/rejected": -19.640457153320312, + "step": 11610 + }, + { + "epoch": 2.26, + "learning_rate": 1.378082979794348e-07, + "logits/chosen": -2.522797107696533, + "logits/rejected": -2.4746179580688477, + "logps/chosen": -251.85885620117188, + "logps/rejected": -412.51373291015625, + "loss": 0.0631, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.951503753662109, + "rewards/margins": 13.889203071594238, + "rewards/rejected": -20.84070587158203, + "step": 11620 + }, + { + "epoch": 2.26, + "learning_rate": 1.3744876680808227e-07, + "logits/chosen": -2.443213939666748, + "logits/rejected": -2.334507703781128, + "logps/chosen": -233.6834716796875, + "logps/rejected": -317.0559387207031, + "loss": 0.0901, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.278505325317383, + "rewards/margins": 8.525633811950684, + "rewards/rejected": -18.80413818359375, + "step": 11630 + }, + { + "epoch": 2.26, + "learning_rate": 1.370892356367297e-07, + "logits/chosen": -2.6557250022888184, + "logits/rejected": -2.6149024963378906, + "logps/chosen": -268.98187255859375, + "logps/rejected": -391.93408203125, + "loss": 0.0717, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.671351432800293, + "rewards/margins": 12.258050918579102, + "rewards/rejected": -16.929401397705078, + "step": 11640 + }, + { + "epoch": 2.26, + "learning_rate": 1.3672970446537715e-07, + "logits/chosen": -2.730659246444702, + "logits/rejected": -2.815887928009033, + "logps/chosen": -343.34759521484375, + "logps/rejected": -400.55322265625, + "loss": 0.055, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.423094749450684, + "rewards/margins": 12.474505424499512, + "rewards/rejected": -17.897602081298828, + "step": 11650 + }, + { + "epoch": 2.26, + "learning_rate": 1.3637017329402458e-07, + "logits/chosen": -2.6932754516601562, + "logits/rejected": -2.586027145385742, + "logps/chosen": -314.05908203125, + "logps/rejected": -355.5710144042969, + "loss": 0.0658, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.076383590698242, + "rewards/margins": 10.35023307800293, + "rewards/rejected": -19.426616668701172, + "step": 11660 + }, + { + "epoch": 2.27, + "learning_rate": 1.36010642122672e-07, + "logits/chosen": -2.618831157684326, + "logits/rejected": -2.618896961212158, + "logps/chosen": -279.1488952636719, + "logps/rejected": -306.9048156738281, + "loss": 0.0833, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.087740898132324, + "rewards/margins": 7.7223076820373535, + "rewards/rejected": -14.81004810333252, + "step": 11670 + }, + { + "epoch": 2.27, + "learning_rate": 1.356511109513195e-07, + "logits/chosen": -2.6757123470306396, + "logits/rejected": -2.6962084770202637, + "logps/chosen": -198.5136260986328, + "logps/rejected": -373.1137390136719, + "loss": 0.1785, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.5840020179748535, + "rewards/margins": 12.190035820007324, + "rewards/rejected": -18.774036407470703, + "step": 11680 + }, + { + "epoch": 2.27, + "learning_rate": 1.3529157977996692e-07, + "logits/chosen": -2.469268321990967, + "logits/rejected": -2.5300540924072266, + "logps/chosen": -308.2252502441406, + "logps/rejected": -345.7593688964844, + "loss": 0.0637, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.705275058746338, + "rewards/margins": 12.153297424316406, + "rewards/rejected": -17.858570098876953, + "step": 11690 + }, + { + "epoch": 2.27, + "learning_rate": 1.3493204860861437e-07, + "logits/chosen": -2.606558084487915, + "logits/rejected": -2.5242538452148438, + "logps/chosen": -265.8565979003906, + "logps/rejected": -314.40380859375, + "loss": 0.0767, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.574146270751953, + "rewards/margins": 13.617681503295898, + "rewards/rejected": -20.19182777404785, + "step": 11700 + }, + { + "epoch": 2.27, + "eval_logits/chosen": -2.5127735137939453, + "eval_logits/rejected": -2.491795301437378, + "eval_logps/chosen": -292.9400634765625, + "eval_logps/rejected": -346.1355895996094, + "eval_loss": 0.623223066329956, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -9.870200157165527, + "eval_rewards/margins": 7.551764011383057, + "eval_rewards/rejected": -17.421964645385742, + "eval_runtime": 140.0904, + "eval_samples_per_second": 22.528, + "eval_steps_per_second": 0.357, + "step": 11700 + }, + { + "epoch": 2.27, + "learning_rate": 1.345725174372618e-07, + "logits/chosen": -2.685258388519287, + "logits/rejected": -2.705810785293579, + "logps/chosen": -368.918212890625, + "logps/rejected": -513.9830322265625, + "loss": 0.0669, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.687708377838135, + "rewards/margins": 16.229557037353516, + "rewards/rejected": -23.917266845703125, + "step": 11710 + }, + { + "epoch": 2.28, + "learning_rate": 1.3421298626590923e-07, + "logits/chosen": -2.593352794647217, + "logits/rejected": -2.737347364425659, + "logps/chosen": -225.0635223388672, + "logps/rejected": -421.9320373535156, + "loss": 0.0805, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.335352659225464, + "rewards/margins": 16.914093017578125, + "rewards/rejected": -19.24944496154785, + "step": 11720 + }, + { + "epoch": 2.28, + "learning_rate": 1.338534550945567e-07, + "logits/chosen": -2.8126816749572754, + "logits/rejected": -2.750828981399536, + "logps/chosen": -241.482421875, + "logps/rejected": -405.61962890625, + "loss": 0.0895, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.5754666328430176, + "rewards/margins": 11.979042053222656, + "rewards/rejected": -15.5545072555542, + "step": 11730 + }, + { + "epoch": 2.28, + "learning_rate": 1.3349392392320413e-07, + "logits/chosen": -2.60490083694458, + "logits/rejected": -2.6065385341644287, + "logps/chosen": -240.9623565673828, + "logps/rejected": -403.73687744140625, + "loss": 0.0506, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.312406063079834, + "rewards/margins": 18.081600189208984, + "rewards/rejected": -21.394006729125977, + "step": 11740 + }, + { + "epoch": 2.28, + "learning_rate": 1.331343927518516e-07, + "logits/chosen": -2.4395904541015625, + "logits/rejected": -2.447916269302368, + "logps/chosen": -260.7299499511719, + "logps/rejected": -365.1597595214844, + "loss": 0.0712, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.795269966125488, + "rewards/margins": 15.1010103225708, + "rewards/rejected": -20.896282196044922, + "step": 11750 + }, + { + "epoch": 2.28, + "learning_rate": 1.3277486158049902e-07, + "logits/chosen": -2.660329818725586, + "logits/rejected": -2.648406505584717, + "logps/chosen": -234.0547332763672, + "logps/rejected": -327.6827392578125, + "loss": 0.0858, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.183368682861328, + "rewards/margins": 11.445791244506836, + "rewards/rejected": -16.629159927368164, + "step": 11760 + }, + { + "epoch": 2.28, + "learning_rate": 1.3241533040914647e-07, + "logits/chosen": -2.544922351837158, + "logits/rejected": -2.5972018241882324, + "logps/chosen": -269.39300537109375, + "logps/rejected": -366.1263122558594, + "loss": 0.0885, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.4013748168945312, + "rewards/margins": 14.740961074829102, + "rewards/rejected": -18.142335891723633, + "step": 11770 + }, + { + "epoch": 2.29, + "learning_rate": 1.3205579923779393e-07, + "logits/chosen": -2.3685431480407715, + "logits/rejected": -2.4402108192443848, + "logps/chosen": -295.20526123046875, + "logps/rejected": -302.5821228027344, + "loss": 0.0937, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.440817832946777, + "rewards/margins": 7.621280670166016, + "rewards/rejected": -16.062097549438477, + "step": 11780 + }, + { + "epoch": 2.29, + "learning_rate": 1.3169626806644135e-07, + "logits/chosen": -2.5984652042388916, + "logits/rejected": -2.599595308303833, + "logps/chosen": -178.05007934570312, + "logps/rejected": -301.00927734375, + "loss": 0.0739, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.144611358642578, + "rewards/margins": 13.900667190551758, + "rewards/rejected": -20.045276641845703, + "step": 11790 + }, + { + "epoch": 2.29, + "learning_rate": 1.3133673689508878e-07, + "logits/chosen": -2.5142526626586914, + "logits/rejected": -2.565838098526001, + "logps/chosen": -313.01629638671875, + "logps/rejected": -364.46160888671875, + "loss": 0.0637, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.528944492340088, + "rewards/margins": 13.737627983093262, + "rewards/rejected": -17.266571044921875, + "step": 11800 + }, + { + "epoch": 2.29, + "eval_logits/chosen": -2.510964870452881, + "eval_logits/rejected": -2.4900963306427, + "eval_logps/chosen": -298.47015380859375, + "eval_logps/rejected": -352.2785949707031, + "eval_loss": 0.6183001399040222, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -10.423210144042969, + "eval_rewards/margins": 7.6130571365356445, + "eval_rewards/rejected": -18.03626823425293, + "eval_runtime": 139.7402, + "eval_samples_per_second": 22.585, + "eval_steps_per_second": 0.358, + "step": 11800 + }, + { + "epoch": 2.29, + "learning_rate": 1.3097720572373624e-07, + "logits/chosen": -2.4644076824188232, + "logits/rejected": -2.524941921234131, + "logps/chosen": -190.19384765625, + "logps/rejected": -309.899658203125, + "loss": 0.0632, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.109036922454834, + "rewards/margins": 9.344032287597656, + "rewards/rejected": -12.453069686889648, + "step": 11810 + }, + { + "epoch": 2.29, + "learning_rate": 1.306176745523837e-07, + "logits/chosen": -2.4824531078338623, + "logits/rejected": -2.489126682281494, + "logps/chosen": -302.51727294921875, + "logps/rejected": -450.05755615234375, + "loss": 0.0803, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.096802711486816, + "rewards/margins": 17.167470932006836, + "rewards/rejected": -22.264272689819336, + "step": 11820 + }, + { + "epoch": 2.3, + "learning_rate": 1.3025814338103114e-07, + "logits/chosen": -2.45460844039917, + "logits/rejected": -2.477003335952759, + "logps/chosen": -248.20748901367188, + "logps/rejected": -355.87957763671875, + "loss": 0.0654, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.903677463531494, + "rewards/margins": 14.553686141967773, + "rewards/rejected": -21.45736312866211, + "step": 11830 + }, + { + "epoch": 2.3, + "learning_rate": 1.2989861220967857e-07, + "logits/chosen": -2.7035422325134277, + "logits/rejected": -2.580699920654297, + "logps/chosen": -373.3416748046875, + "logps/rejected": -450.2897033691406, + "loss": 0.1008, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.678826808929443, + "rewards/margins": 14.488286018371582, + "rewards/rejected": -22.167110443115234, + "step": 11840 + }, + { + "epoch": 2.3, + "learning_rate": 1.29539081038326e-07, + "logits/chosen": -2.6361374855041504, + "logits/rejected": -2.563781261444092, + "logps/chosen": -265.8594665527344, + "logps/rejected": -346.826171875, + "loss": 0.0863, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8175954818725586, + "rewards/margins": 15.04686450958252, + "rewards/rejected": -16.864459991455078, + "step": 11850 + }, + { + "epoch": 2.3, + "learning_rate": 1.2917954986697345e-07, + "logits/chosen": -2.619905948638916, + "logits/rejected": -2.5312862396240234, + "logps/chosen": -266.5345764160156, + "logps/rejected": -360.4387512207031, + "loss": 0.0878, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5418922901153564, + "rewards/margins": 13.449142456054688, + "rewards/rejected": -15.991033554077148, + "step": 11860 + }, + { + "epoch": 2.3, + "learning_rate": 1.288200186956209e-07, + "logits/chosen": -2.5582549571990967, + "logits/rejected": -2.496058225631714, + "logps/chosen": -207.1652069091797, + "logps/rejected": -320.2586975097656, + "loss": 0.0832, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.37259578704834, + "rewards/margins": 14.15211009979248, + "rewards/rejected": -22.52470588684082, + "step": 11870 + }, + { + "epoch": 2.31, + "learning_rate": 1.2846048752426836e-07, + "logits/chosen": -2.6128625869750977, + "logits/rejected": -2.5494701862335205, + "logps/chosen": -305.84063720703125, + "logps/rejected": -368.3662109375, + "loss": 0.0913, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -11.47107982635498, + "rewards/margins": 10.026995658874512, + "rewards/rejected": -21.498075485229492, + "step": 11880 + }, + { + "epoch": 2.31, + "learning_rate": 1.281009563529158e-07, + "logits/chosen": -2.585437297821045, + "logits/rejected": -2.546278953552246, + "logps/chosen": -284.7617492675781, + "logps/rejected": -361.58734130859375, + "loss": 0.0763, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.904560089111328, + "rewards/margins": 10.845001220703125, + "rewards/rejected": -19.749561309814453, + "step": 11890 + }, + { + "epoch": 2.31, + "learning_rate": 1.2774142518156322e-07, + "logits/chosen": -2.6387503147125244, + "logits/rejected": -2.6979053020477295, + "logps/chosen": -246.8169403076172, + "logps/rejected": -374.6175842285156, + "loss": 0.0578, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.128369331359863, + "rewards/margins": 16.08565330505371, + "rewards/rejected": -20.21402359008789, + "step": 11900 + }, + { + "epoch": 2.31, + "eval_logits/chosen": -2.5245871543884277, + "eval_logits/rejected": -2.504455804824829, + "eval_logps/chosen": -298.15789794921875, + "eval_logps/rejected": -353.755615234375, + "eval_loss": 0.6302103400230408, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -10.391982078552246, + "eval_rewards/margins": 7.791986465454102, + "eval_rewards/rejected": -18.18396759033203, + "eval_runtime": 139.7884, + "eval_samples_per_second": 22.577, + "eval_steps_per_second": 0.358, + "step": 11900 + }, + { + "epoch": 2.31, + "learning_rate": 1.2738189401021067e-07, + "logits/chosen": -2.545112371444702, + "logits/rejected": -2.4530835151672363, + "logps/chosen": -244.24270629882812, + "logps/rejected": -312.3651123046875, + "loss": 0.0595, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.5539422035217285, + "rewards/margins": 10.47828483581543, + "rewards/rejected": -15.0322265625, + "step": 11910 + }, + { + "epoch": 2.31, + "learning_rate": 1.2702236283885813e-07, + "logits/chosen": -2.718308210372925, + "logits/rejected": -2.6510627269744873, + "logps/chosen": -262.9393005371094, + "logps/rejected": -426.9742736816406, + "loss": 0.0596, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8453878164291382, + "rewards/margins": 14.58856201171875, + "rewards/rejected": -16.433950424194336, + "step": 11920 + }, + { + "epoch": 2.32, + "learning_rate": 1.2666283166750558e-07, + "logits/chosen": -2.606099843978882, + "logits/rejected": -2.5635433197021484, + "logps/chosen": -268.33612060546875, + "logps/rejected": -318.01702880859375, + "loss": 0.0758, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.4051783084869385, + "rewards/margins": 10.541773796081543, + "rewards/rejected": -12.946952819824219, + "step": 11930 + }, + { + "epoch": 2.32, + "learning_rate": 1.26303300496153e-07, + "logits/chosen": -2.5500588417053223, + "logits/rejected": -2.6418328285217285, + "logps/chosen": -286.411376953125, + "logps/rejected": -434.02850341796875, + "loss": 0.0762, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.638510704040527, + "rewards/margins": 16.73740577697754, + "rewards/rejected": -21.37591552734375, + "step": 11940 + }, + { + "epoch": 2.32, + "learning_rate": 1.2594376932480044e-07, + "logits/chosen": -2.5598676204681396, + "logits/rejected": -2.633626699447632, + "logps/chosen": -231.1297607421875, + "logps/rejected": -374.0964660644531, + "loss": 0.0924, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.510305881500244, + "rewards/margins": 13.23644733428955, + "rewards/rejected": -19.746753692626953, + "step": 11950 + }, + { + "epoch": 2.32, + "learning_rate": 1.255842381534479e-07, + "logits/chosen": -2.5221829414367676, + "logits/rejected": -2.5997650623321533, + "logps/chosen": -315.4999694824219, + "logps/rejected": -463.0917053222656, + "loss": 0.0806, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.256501197814941, + "rewards/margins": 15.427995681762695, + "rewards/rejected": -20.684497833251953, + "step": 11960 + }, + { + "epoch": 2.32, + "learning_rate": 1.2522470698209535e-07, + "logits/chosen": -2.6015467643737793, + "logits/rejected": -2.5260097980499268, + "logps/chosen": -329.55712890625, + "logps/rejected": -531.4019775390625, + "loss": 0.0481, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.006913661956787, + "rewards/margins": 19.019302368164062, + "rewards/rejected": -25.026212692260742, + "step": 11970 + }, + { + "epoch": 2.33, + "learning_rate": 1.2486517581074277e-07, + "logits/chosen": -2.532564163208008, + "logits/rejected": -2.626721143722534, + "logps/chosen": -209.87423706054688, + "logps/rejected": -342.61004638671875, + "loss": 0.0791, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.9006447792053223, + "rewards/margins": 14.513799667358398, + "rewards/rejected": -18.414443969726562, + "step": 11980 + }, + { + "epoch": 2.33, + "learning_rate": 1.2450564463939023e-07, + "logits/chosen": -2.7013509273529053, + "logits/rejected": -2.5653529167175293, + "logps/chosen": -230.11972045898438, + "logps/rejected": -313.40081787109375, + "loss": 0.0856, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.105692386627197, + "rewards/margins": 12.026542663574219, + "rewards/rejected": -17.13223648071289, + "step": 11990 + }, + { + "epoch": 2.33, + "learning_rate": 1.2414611346803768e-07, + "logits/chosen": -2.6725096702575684, + "logits/rejected": -2.6505398750305176, + "logps/chosen": -208.33596801757812, + "logps/rejected": -361.4418029785156, + "loss": 0.0665, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.173867225646973, + "rewards/margins": 13.721814155578613, + "rewards/rejected": -17.895679473876953, + "step": 12000 + }, + { + "epoch": 2.33, + "eval_logits/chosen": -2.540191888809204, + "eval_logits/rejected": -2.520397424697876, + "eval_logps/chosen": -297.1541442871094, + "eval_logps/rejected": -353.8656311035156, + "eval_loss": 0.6309294104576111, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": -10.291607856750488, + "eval_rewards/margins": 7.9033589363098145, + "eval_rewards/rejected": -18.194965362548828, + "eval_runtime": 141.0134, + "eval_samples_per_second": 22.381, + "eval_steps_per_second": 0.355, + "step": 12000 + }, + { + "epoch": 2.33, + "learning_rate": 1.237865822966851e-07, + "logits/chosen": -2.6309330463409424, + "logits/rejected": -2.6086974143981934, + "logps/chosen": -336.7874450683594, + "logps/rejected": -483.9122619628906, + "loss": 0.0937, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.86191177368164, + "rewards/margins": 12.709223747253418, + "rewards/rejected": -22.571136474609375, + "step": 12010 + }, + { + "epoch": 2.33, + "learning_rate": 1.2342705112533256e-07, + "logits/chosen": -2.5626704692840576, + "logits/rejected": -2.547719955444336, + "logps/chosen": -251.743408203125, + "logps/rejected": -344.1765441894531, + "loss": 0.0793, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.74990177154541, + "rewards/margins": 8.43620491027832, + "rewards/rejected": -18.18610382080078, + "step": 12020 + }, + { + "epoch": 2.34, + "learning_rate": 1.2306751995398e-07, + "logits/chosen": -2.5711910724639893, + "logits/rejected": -2.595757007598877, + "logps/chosen": -288.83160400390625, + "logps/rejected": -402.626220703125, + "loss": 0.0771, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.019674301147461, + "rewards/margins": 12.471722602844238, + "rewards/rejected": -17.491395950317383, + "step": 12030 + }, + { + "epoch": 2.34, + "learning_rate": 1.2270798878262745e-07, + "logits/chosen": -2.5989129543304443, + "logits/rejected": -2.6641898155212402, + "logps/chosen": -221.2817840576172, + "logps/rejected": -544.4244384765625, + "loss": 0.0519, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.894484043121338, + "rewards/margins": 14.513275146484375, + "rewards/rejected": -17.407756805419922, + "step": 12040 + }, + { + "epoch": 2.34, + "learning_rate": 1.223484576112749e-07, + "logits/chosen": -2.598823070526123, + "logits/rejected": -2.582695722579956, + "logps/chosen": -285.917724609375, + "logps/rejected": -420.414794921875, + "loss": 0.0775, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.9591498374938965, + "rewards/margins": 15.590853691101074, + "rewards/rejected": -22.550003051757812, + "step": 12050 + }, + { + "epoch": 2.34, + "learning_rate": 1.2198892643992233e-07, + "logits/chosen": -2.6646790504455566, + "logits/rejected": -2.7158217430114746, + "logps/chosen": -190.109375, + "logps/rejected": -303.2792663574219, + "loss": 0.0702, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.631850719451904, + "rewards/margins": 10.676166534423828, + "rewards/rejected": -15.308015823364258, + "step": 12060 + }, + { + "epoch": 2.34, + "learning_rate": 1.2162939526856978e-07, + "logits/chosen": -2.636368989944458, + "logits/rejected": -2.6788716316223145, + "logps/chosen": -238.0105743408203, + "logps/rejected": -376.60614013671875, + "loss": 0.0641, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.4170451164245605, + "rewards/margins": 14.714938163757324, + "rewards/rejected": -19.13198471069336, + "step": 12070 + }, + { + "epoch": 2.35, + "learning_rate": 1.212698640972172e-07, + "logits/chosen": -2.6684865951538086, + "logits/rejected": -2.6374154090881348, + "logps/chosen": -287.66229248046875, + "logps/rejected": -482.650390625, + "loss": 0.0668, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.367466926574707, + "rewards/margins": 22.329139709472656, + "rewards/rejected": -26.696605682373047, + "step": 12080 + }, + { + "epoch": 2.35, + "learning_rate": 1.2091033292586466e-07, + "logits/chosen": -2.632810592651367, + "logits/rejected": -2.5606772899627686, + "logps/chosen": -298.189453125, + "logps/rejected": -457.1927795410156, + "loss": 0.0705, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8998987674713135, + "rewards/margins": 13.952972412109375, + "rewards/rejected": -16.85287094116211, + "step": 12090 + }, + { + "epoch": 2.35, + "learning_rate": 1.2055080175451212e-07, + "logits/chosen": -2.4314777851104736, + "logits/rejected": -2.450103521347046, + "logps/chosen": -237.0631103515625, + "logps/rejected": -388.85223388671875, + "loss": 0.0854, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.274641036987305, + "rewards/margins": 10.727151870727539, + "rewards/rejected": -19.001792907714844, + "step": 12100 + }, + { + "epoch": 2.35, + "eval_logits/chosen": -2.5343832969665527, + "eval_logits/rejected": -2.5141849517822266, + "eval_logps/chosen": -299.864990234375, + "eval_logps/rejected": -356.9398193359375, + "eval_loss": 0.6348021030426025, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -10.562690734863281, + "eval_rewards/margins": 7.939695358276367, + "eval_rewards/rejected": -18.50238609313965, + "eval_runtime": 140.3482, + "eval_samples_per_second": 22.487, + "eval_steps_per_second": 0.356, + "step": 12100 + }, + { + "epoch": 2.35, + "learning_rate": 1.2019127058315955e-07, + "logits/chosen": -2.4986445903778076, + "logits/rejected": -2.5519309043884277, + "logps/chosen": -175.40908813476562, + "logps/rejected": -474.209228515625, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2327470779418945, + "rewards/margins": 24.410297393798828, + "rewards/rejected": -28.64304542541504, + "step": 12110 + }, + { + "epoch": 2.35, + "learning_rate": 1.19831739411807e-07, + "logits/chosen": -2.821687698364258, + "logits/rejected": -2.6823883056640625, + "logps/chosen": -260.00091552734375, + "logps/rejected": -360.76104736328125, + "loss": 0.0844, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.708765506744385, + "rewards/margins": 8.458456039428711, + "rewards/rejected": -14.167219161987305, + "step": 12120 + }, + { + "epoch": 2.35, + "learning_rate": 1.1947220824045443e-07, + "logits/chosen": -2.5909080505371094, + "logits/rejected": -2.663158893585205, + "logps/chosen": -298.30401611328125, + "logps/rejected": -355.90716552734375, + "loss": 0.0616, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.249277114868164, + "rewards/margins": 12.088628768920898, + "rewards/rejected": -14.337905883789062, + "step": 12130 + }, + { + "epoch": 2.36, + "learning_rate": 1.1911267706910188e-07, + "logits/chosen": -2.6094861030578613, + "logits/rejected": -2.5060129165649414, + "logps/chosen": -200.81222534179688, + "logps/rejected": -345.2282409667969, + "loss": 0.0578, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.438138008117676, + "rewards/margins": 9.182662010192871, + "rewards/rejected": -17.620800018310547, + "step": 12140 + }, + { + "epoch": 2.36, + "learning_rate": 1.1875314589774934e-07, + "logits/chosen": -2.7512528896331787, + "logits/rejected": -2.6919193267822266, + "logps/chosen": -224.1593017578125, + "logps/rejected": -379.17034912109375, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.352349042892456, + "rewards/margins": 16.138296127319336, + "rewards/rejected": -19.490646362304688, + "step": 12150 + }, + { + "epoch": 2.36, + "learning_rate": 1.1839361472639678e-07, + "logits/chosen": -2.757967472076416, + "logits/rejected": -2.635768413543701, + "logps/chosen": -303.3056335449219, + "logps/rejected": -403.9469909667969, + "loss": 0.0661, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0109000205993652, + "rewards/margins": 12.864117622375488, + "rewards/rejected": -15.875018119812012, + "step": 12160 + }, + { + "epoch": 2.36, + "learning_rate": 1.180340835550442e-07, + "logits/chosen": -2.558183193206787, + "logits/rejected": -2.480443000793457, + "logps/chosen": -262.8445129394531, + "logps/rejected": -395.8421325683594, + "loss": 0.0367, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.135694980621338, + "rewards/margins": 19.493581771850586, + "rewards/rejected": -23.629276275634766, + "step": 12170 + }, + { + "epoch": 2.36, + "learning_rate": 1.1767455238369166e-07, + "logits/chosen": -2.7435686588287354, + "logits/rejected": -2.6610381603240967, + "logps/chosen": -273.6927795410156, + "logps/rejected": -358.9440612792969, + "loss": 0.0663, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3958563804626465, + "rewards/margins": 12.258552551269531, + "rewards/rejected": -15.654406547546387, + "step": 12180 + }, + { + "epoch": 2.37, + "learning_rate": 1.173150212123391e-07, + "logits/chosen": -2.722874879837036, + "logits/rejected": -2.6344385147094727, + "logps/chosen": -342.13018798828125, + "logps/rejected": -456.98095703125, + "loss": 0.0898, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.861723899841309, + "rewards/margins": 18.311250686645508, + "rewards/rejected": -24.172977447509766, + "step": 12190 + }, + { + "epoch": 2.37, + "learning_rate": 1.1695549004098656e-07, + "logits/chosen": -2.6227803230285645, + "logits/rejected": -2.5830583572387695, + "logps/chosen": -237.94540405273438, + "logps/rejected": -373.0111389160156, + "loss": 0.0663, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.682139873504639, + "rewards/margins": 13.407246589660645, + "rewards/rejected": -18.089385986328125, + "step": 12200 + }, + { + "epoch": 2.37, + "eval_logits/chosen": -2.536633253097534, + "eval_logits/rejected": -2.516324996948242, + "eval_logps/chosen": -297.79998779296875, + "eval_logps/rejected": -354.1291809082031, + "eval_loss": 0.6440024375915527, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -10.356196403503418, + "eval_rewards/margins": 7.865126132965088, + "eval_rewards/rejected": -18.221323013305664, + "eval_runtime": 142.2384, + "eval_samples_per_second": 22.188, + "eval_steps_per_second": 0.352, + "step": 12200 + }, + { + "epoch": 2.37, + "learning_rate": 1.16595958869634e-07, + "logits/chosen": -2.6502878665924072, + "logits/rejected": -2.475832462310791, + "logps/chosen": -243.1304168701172, + "logps/rejected": -377.45263671875, + "loss": 0.073, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.234163761138916, + "rewards/margins": 15.981356620788574, + "rewards/rejected": -22.21552085876465, + "step": 12210 + }, + { + "epoch": 2.37, + "learning_rate": 1.1623642769828142e-07, + "logits/chosen": -2.5887460708618164, + "logits/rejected": -2.56135630607605, + "logps/chosen": -266.46588134765625, + "logps/rejected": -392.6053161621094, + "loss": 0.1023, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.193370819091797, + "rewards/margins": 15.058713912963867, + "rewards/rejected": -20.252084732055664, + "step": 12220 + }, + { + "epoch": 2.37, + "learning_rate": 1.1587689652692888e-07, + "logits/chosen": -2.6022226810455322, + "logits/rejected": -2.5226919651031494, + "logps/chosen": -323.2103271484375, + "logps/rejected": -482.9850158691406, + "loss": 0.0753, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8766988515853882, + "rewards/margins": 12.250858306884766, + "rewards/rejected": -14.127557754516602, + "step": 12230 + }, + { + "epoch": 2.38, + "learning_rate": 1.1551736535557632e-07, + "logits/chosen": -2.6652512550354004, + "logits/rejected": -2.617802619934082, + "logps/chosen": -241.03564453125, + "logps/rejected": -381.97454833984375, + "loss": 0.0949, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.917860507965088, + "rewards/margins": 11.29955005645752, + "rewards/rejected": -18.217411041259766, + "step": 12240 + }, + { + "epoch": 2.38, + "learning_rate": 1.1515783418422377e-07, + "logits/chosen": -2.6905980110168457, + "logits/rejected": -2.621811628341675, + "logps/chosen": -239.3612060546875, + "logps/rejected": -287.470703125, + "loss": 0.0667, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.432193756103516, + "rewards/margins": 7.715624809265137, + "rewards/rejected": -12.147819519042969, + "step": 12250 + }, + { + "epoch": 2.38, + "learning_rate": 1.1479830301287122e-07, + "logits/chosen": -2.4074113368988037, + "logits/rejected": -2.4639010429382324, + "logps/chosen": -304.75128173828125, + "logps/rejected": -421.524169921875, + "loss": 0.0798, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.061871528625488, + "rewards/margins": 11.476676940917969, + "rewards/rejected": -19.53854751586914, + "step": 12260 + }, + { + "epoch": 2.38, + "learning_rate": 1.1443877184151864e-07, + "logits/chosen": -2.6760289669036865, + "logits/rejected": -2.6463124752044678, + "logps/chosen": -226.9069061279297, + "logps/rejected": -440.8207092285156, + "loss": 0.0551, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.603955268859863, + "rewards/margins": 19.35154914855957, + "rewards/rejected": -23.955503463745117, + "step": 12270 + }, + { + "epoch": 2.38, + "learning_rate": 1.140792406701661e-07, + "logits/chosen": -2.7669763565063477, + "logits/rejected": -2.7230679988861084, + "logps/chosen": -290.60406494140625, + "logps/rejected": -373.95989990234375, + "loss": 0.0666, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.881918430328369, + "rewards/margins": 12.098763465881348, + "rewards/rejected": -16.980682373046875, + "step": 12280 + }, + { + "epoch": 2.39, + "learning_rate": 1.1371970949881354e-07, + "logits/chosen": -2.5898349285125732, + "logits/rejected": -2.603485107421875, + "logps/chosen": -267.1484680175781, + "logps/rejected": -360.93963623046875, + "loss": 0.0825, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.417284965515137, + "rewards/margins": 15.212320327758789, + "rewards/rejected": -20.62960433959961, + "step": 12290 + }, + { + "epoch": 2.39, + "learning_rate": 1.1336017832746099e-07, + "logits/chosen": -2.433037757873535, + "logits/rejected": -2.422377109527588, + "logps/chosen": -172.99049377441406, + "logps/rejected": -338.6845703125, + "loss": 0.0926, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.795866012573242, + "rewards/margins": 14.42186164855957, + "rewards/rejected": -19.217727661132812, + "step": 12300 + }, + { + "epoch": 2.39, + "eval_logits/chosen": -2.5607492923736572, + "eval_logits/rejected": -2.5421431064605713, + "eval_logps/chosen": -293.64227294921875, + "eval_logps/rejected": -347.0633544921875, + "eval_loss": 0.6197232007980347, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -9.940421104431152, + "eval_rewards/margins": 7.574316501617432, + "eval_rewards/rejected": -17.514738082885742, + "eval_runtime": 140.9172, + "eval_samples_per_second": 22.396, + "eval_steps_per_second": 0.355, + "step": 12300 + }, + { + "epoch": 2.39, + "learning_rate": 1.1300064715610843e-07, + "logits/chosen": -2.7044150829315186, + "logits/rejected": -2.5824317932128906, + "logps/chosen": -355.00396728515625, + "logps/rejected": -378.8909912109375, + "loss": 0.0632, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4438202381134033, + "rewards/margins": 12.555355072021484, + "rewards/rejected": -15.999174118041992, + "step": 12310 + }, + { + "epoch": 2.39, + "learning_rate": 1.1264111598475588e-07, + "logits/chosen": -2.4234459400177, + "logits/rejected": -2.404367446899414, + "logps/chosen": -347.3136901855469, + "logps/rejected": -428.6026916503906, + "loss": 0.066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22336021065711975, + "rewards/margins": 15.050806045532227, + "rewards/rejected": -15.274165153503418, + "step": 12320 + }, + { + "epoch": 2.39, + "learning_rate": 1.1228158481340332e-07, + "logits/chosen": -2.611323833465576, + "logits/rejected": -2.692596673965454, + "logps/chosen": -208.95602416992188, + "logps/rejected": -438.8912048339844, + "loss": 0.0586, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.6306769847869873, + "rewards/margins": 15.02960205078125, + "rewards/rejected": -18.660280227661133, + "step": 12330 + }, + { + "epoch": 2.4, + "learning_rate": 1.1192205364205076e-07, + "logits/chosen": -2.738924026489258, + "logits/rejected": -2.7660722732543945, + "logps/chosen": -268.96685791015625, + "logps/rejected": -421.6089782714844, + "loss": 0.087, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.598333835601807, + "rewards/margins": 10.654764175415039, + "rewards/rejected": -16.253095626831055, + "step": 12340 + }, + { + "epoch": 2.4, + "learning_rate": 1.1156252247069821e-07, + "logits/chosen": -2.6517112255096436, + "logits/rejected": -2.6612112522125244, + "logps/chosen": -201.0699462890625, + "logps/rejected": -378.2642517089844, + "loss": 0.0511, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.2537899017333984, + "rewards/margins": 12.25096607208252, + "rewards/rejected": -15.504755973815918, + "step": 12350 + }, + { + "epoch": 2.4, + "learning_rate": 1.1120299129934564e-07, + "logits/chosen": -2.626768112182617, + "logits/rejected": -2.4936606884002686, + "logps/chosen": -287.0443420410156, + "logps/rejected": -372.3125, + "loss": 0.081, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4231988191604614, + "rewards/margins": 12.1668119430542, + "rewards/rejected": -13.590011596679688, + "step": 12360 + }, + { + "epoch": 2.4, + "learning_rate": 1.108434601279931e-07, + "logits/chosen": -2.707183361053467, + "logits/rejected": -2.6809635162353516, + "logps/chosen": -214.8535919189453, + "logps/rejected": -313.95745849609375, + "loss": 0.1113, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.256438255310059, + "rewards/margins": 12.393168449401855, + "rewards/rejected": -16.649608612060547, + "step": 12370 + }, + { + "epoch": 2.4, + "learning_rate": 1.1048392895664053e-07, + "logits/chosen": -2.639615058898926, + "logits/rejected": -2.698387384414673, + "logps/chosen": -267.8791198730469, + "logps/rejected": -396.8616027832031, + "loss": 0.0945, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.8844404220581055, + "rewards/margins": 13.174560546875, + "rewards/rejected": -20.059001922607422, + "step": 12380 + }, + { + "epoch": 2.41, + "learning_rate": 1.1012439778528798e-07, + "logits/chosen": -2.7608120441436768, + "logits/rejected": -2.577022075653076, + "logps/chosen": -232.0834197998047, + "logps/rejected": -307.9110107421875, + "loss": 0.0692, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9954754114151, + "rewards/margins": 10.54742431640625, + "rewards/rejected": -12.542899131774902, + "step": 12390 + }, + { + "epoch": 2.41, + "learning_rate": 1.0976486661393543e-07, + "logits/chosen": -2.7052533626556396, + "logits/rejected": -2.6483771800994873, + "logps/chosen": -286.0614013671875, + "logps/rejected": -414.126220703125, + "loss": 0.0846, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.2018063068389893, + "rewards/margins": 14.756329536437988, + "rewards/rejected": -16.958133697509766, + "step": 12400 + }, + { + "epoch": 2.41, + "eval_logits/chosen": -2.5453546047210693, + "eval_logits/rejected": -2.529228687286377, + "eval_logps/chosen": -281.3964538574219, + "eval_logps/rejected": -323.9549865722656, + "eval_loss": 0.6193054914474487, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -8.715839385986328, + "eval_rewards/margins": 6.488065242767334, + "eval_rewards/rejected": -15.203904151916504, + "eval_runtime": 140.7253, + "eval_samples_per_second": 22.427, + "eval_steps_per_second": 0.355, + "step": 12400 + }, + { + "epoch": 2.41, + "learning_rate": 1.0940533544258286e-07, + "logits/chosen": -2.471107244491577, + "logits/rejected": -2.567960739135742, + "logps/chosen": -316.57073974609375, + "logps/rejected": -463.3955078125, + "loss": 0.0606, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0290095806121826, + "rewards/margins": 17.88158416748047, + "rewards/rejected": -18.910593032836914, + "step": 12410 + }, + { + "epoch": 2.41, + "learning_rate": 1.0904580427123031e-07, + "logits/chosen": -2.6712772846221924, + "logits/rejected": -2.6760573387145996, + "logps/chosen": -302.7055969238281, + "logps/rejected": -340.89556884765625, + "loss": 0.0592, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.306831359863281, + "rewards/margins": 10.018339157104492, + "rewards/rejected": -16.32516860961914, + "step": 12420 + }, + { + "epoch": 2.41, + "learning_rate": 1.0868627309987775e-07, + "logits/chosen": -2.5626769065856934, + "logits/rejected": -2.4481892585754395, + "logps/chosen": -251.5598907470703, + "logps/rejected": -308.00799560546875, + "loss": 0.0834, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.11841344833374, + "rewards/margins": 10.771479606628418, + "rewards/rejected": -16.889894485473633, + "step": 12430 + }, + { + "epoch": 2.42, + "learning_rate": 1.083267419285252e-07, + "logits/chosen": -2.5774381160736084, + "logits/rejected": -2.586848020553589, + "logps/chosen": -221.2284698486328, + "logps/rejected": -383.3273010253906, + "loss": 0.0726, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.779860019683838, + "rewards/margins": 12.585046768188477, + "rewards/rejected": -19.364904403686523, + "step": 12440 + }, + { + "epoch": 2.42, + "learning_rate": 1.0796721075717265e-07, + "logits/chosen": -2.656850814819336, + "logits/rejected": -2.5535809993743896, + "logps/chosen": -266.3935852050781, + "logps/rejected": -386.8028259277344, + "loss": 0.0981, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.9002602100372314, + "rewards/margins": 12.64848804473877, + "rewards/rejected": -15.548748970031738, + "step": 12450 + }, + { + "epoch": 2.42, + "learning_rate": 1.0760767958582008e-07, + "logits/chosen": -2.324298143386841, + "logits/rejected": -2.3992927074432373, + "logps/chosen": -171.7988739013672, + "logps/rejected": -357.5594482421875, + "loss": 0.1181, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.034149169921875, + "rewards/margins": 10.491291046142578, + "rewards/rejected": -16.52543830871582, + "step": 12460 + }, + { + "epoch": 2.42, + "learning_rate": 1.0724814841446753e-07, + "logits/chosen": -2.413907766342163, + "logits/rejected": -2.435821056365967, + "logps/chosen": -185.29042053222656, + "logps/rejected": -318.6675720214844, + "loss": 0.0706, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.206128120422363, + "rewards/margins": 10.577844619750977, + "rewards/rejected": -18.783971786499023, + "step": 12470 + }, + { + "epoch": 2.42, + "learning_rate": 1.0688861724311497e-07, + "logits/chosen": -2.6212925910949707, + "logits/rejected": -2.6373047828674316, + "logps/chosen": -195.32415771484375, + "logps/rejected": -452.72344970703125, + "loss": 0.0658, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.244529724121094, + "rewards/margins": 14.38170337677002, + "rewards/rejected": -19.626232147216797, + "step": 12480 + }, + { + "epoch": 2.42, + "learning_rate": 1.0652908607176243e-07, + "logits/chosen": -2.6920530796051025, + "logits/rejected": -2.716745138168335, + "logps/chosen": -307.9977111816406, + "logps/rejected": -397.4832458496094, + "loss": 0.0755, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8004132509231567, + "rewards/margins": 15.388232231140137, + "rewards/rejected": -16.18864631652832, + "step": 12490 + }, + { + "epoch": 2.43, + "learning_rate": 1.0616955490040987e-07, + "logits/chosen": -2.552727222442627, + "logits/rejected": -2.7345712184906006, + "logps/chosen": -291.3739929199219, + "logps/rejected": -428.37286376953125, + "loss": 0.0552, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3418686389923096, + "rewards/margins": 13.780197143554688, + "rewards/rejected": -15.12206745147705, + "step": 12500 + }, + { + "epoch": 2.43, + "eval_logits/chosen": -2.576340913772583, + "eval_logits/rejected": -2.5609962940216064, + "eval_logps/chosen": -285.8228759765625, + "eval_logps/rejected": -330.5560607910156, + "eval_loss": 0.6213422417640686, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -9.158485412597656, + "eval_rewards/margins": 6.705523490905762, + "eval_rewards/rejected": -15.864008903503418, + "eval_runtime": 155.7999, + "eval_samples_per_second": 20.257, + "eval_steps_per_second": 0.321, + "step": 12500 + }, + { + "epoch": 2.43, + "learning_rate": 1.058100237290573e-07, + "logits/chosen": -2.412020206451416, + "logits/rejected": -2.4402787685394287, + "logps/chosen": -280.79742431640625, + "logps/rejected": -391.39483642578125, + "loss": 0.0629, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.541536331176758, + "rewards/margins": 14.372830390930176, + "rewards/rejected": -16.914363861083984, + "step": 12510 + }, + { + "epoch": 2.43, + "learning_rate": 1.0545049255770475e-07, + "logits/chosen": -2.697357416152954, + "logits/rejected": -2.5625782012939453, + "logps/chosen": -303.3705139160156, + "logps/rejected": -471.81488037109375, + "loss": 0.0568, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.599686622619629, + "rewards/margins": 17.236553192138672, + "rewards/rejected": -23.836238861083984, + "step": 12520 + }, + { + "epoch": 2.43, + "learning_rate": 1.0509096138635219e-07, + "logits/chosen": -2.3948986530303955, + "logits/rejected": -2.5268566608428955, + "logps/chosen": -246.1656036376953, + "logps/rejected": -372.9352722167969, + "loss": 0.0878, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.8637285232543945, + "rewards/margins": 11.812223434448242, + "rewards/rejected": -17.67595100402832, + "step": 12530 + }, + { + "epoch": 2.43, + "learning_rate": 1.0473143021499964e-07, + "logits/chosen": -2.5686511993408203, + "logits/rejected": -2.471906900405884, + "logps/chosen": -394.0550231933594, + "logps/rejected": -486.830322265625, + "loss": 0.088, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.492640495300293, + "rewards/margins": 16.098495483398438, + "rewards/rejected": -22.591136932373047, + "step": 12540 + }, + { + "epoch": 2.44, + "learning_rate": 1.0437189904364709e-07, + "logits/chosen": -2.745089530944824, + "logits/rejected": -2.6197681427001953, + "logps/chosen": -293.5634460449219, + "logps/rejected": -372.40191650390625, + "loss": 0.0635, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1913766860961914, + "rewards/margins": 14.297021865844727, + "rewards/rejected": -17.4883975982666, + "step": 12550 + }, + { + "epoch": 2.44, + "learning_rate": 1.0401236787229451e-07, + "logits/chosen": -2.6241023540496826, + "logits/rejected": -2.6891422271728516, + "logps/chosen": -207.7499237060547, + "logps/rejected": -394.9376525878906, + "loss": 0.0679, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.059928894042969, + "rewards/margins": 14.671516418457031, + "rewards/rejected": -18.7314453125, + "step": 12560 + }, + { + "epoch": 2.44, + "learning_rate": 1.0365283670094197e-07, + "logits/chosen": -2.658356189727783, + "logits/rejected": -2.581386089324951, + "logps/chosen": -255.8565216064453, + "logps/rejected": -458.71539306640625, + "loss": 0.1218, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.501692771911621, + "rewards/margins": 19.707727432250977, + "rewards/rejected": -28.20941734313965, + "step": 12570 + }, + { + "epoch": 2.44, + "learning_rate": 1.0329330552958941e-07, + "logits/chosen": -2.6098690032958984, + "logits/rejected": -2.50624942779541, + "logps/chosen": -219.2275848388672, + "logps/rejected": -326.49737548828125, + "loss": 0.076, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.19970703125, + "rewards/margins": 11.127429008483887, + "rewards/rejected": -16.327136993408203, + "step": 12580 + }, + { + "epoch": 2.44, + "learning_rate": 1.0293377435823686e-07, + "logits/chosen": -2.5886106491088867, + "logits/rejected": -2.484173536300659, + "logps/chosen": -311.59967041015625, + "logps/rejected": -574.6200561523438, + "loss": 0.0717, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.177712440490723, + "rewards/margins": 16.55941390991211, + "rewards/rejected": -23.737125396728516, + "step": 12590 + }, + { + "epoch": 2.45, + "learning_rate": 1.025742431868843e-07, + "logits/chosen": -2.6926848888397217, + "logits/rejected": -2.5466065406799316, + "logps/chosen": -281.74493408203125, + "logps/rejected": -421.4120178222656, + "loss": 0.0667, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.5114688873291016, + "rewards/margins": 15.035595893859863, + "rewards/rejected": -18.54706573486328, + "step": 12600 + }, + { + "epoch": 2.45, + "eval_logits/chosen": -2.5719873905181885, + "eval_logits/rejected": -2.5532681941986084, + "eval_logps/chosen": -297.1966552734375, + "eval_logps/rejected": -346.5535888671875, + "eval_loss": 0.6204590201377869, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -10.295859336853027, + "eval_rewards/margins": 7.167905330657959, + "eval_rewards/rejected": -17.463764190673828, + "eval_runtime": 140.7626, + "eval_samples_per_second": 22.421, + "eval_steps_per_second": 0.355, + "step": 12600 + }, + { + "epoch": 2.45, + "learning_rate": 1.0221471201553173e-07, + "logits/chosen": -2.464975595474243, + "logits/rejected": -2.448719024658203, + "logps/chosen": -246.8695831298828, + "logps/rejected": -325.03033447265625, + "loss": 0.0668, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.008930206298828, + "rewards/margins": 14.002431869506836, + "rewards/rejected": -19.011362075805664, + "step": 12610 + }, + { + "epoch": 2.45, + "learning_rate": 1.0185518084417919e-07, + "logits/chosen": -2.6147327423095703, + "logits/rejected": -2.604248046875, + "logps/chosen": -263.7695007324219, + "logps/rejected": -364.6864013671875, + "loss": 0.0757, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.259385585784912, + "rewards/margins": 14.476222038269043, + "rewards/rejected": -19.735607147216797, + "step": 12620 + }, + { + "epoch": 2.45, + "learning_rate": 1.0149564967282663e-07, + "logits/chosen": -2.736812114715576, + "logits/rejected": -2.634316921234131, + "logps/chosen": -254.13229370117188, + "logps/rejected": -322.1239013671875, + "loss": 0.0669, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1754138469696045, + "rewards/margins": 13.549273490905762, + "rewards/rejected": -15.724688529968262, + "step": 12630 + }, + { + "epoch": 2.45, + "learning_rate": 1.0113611850147408e-07, + "logits/chosen": -2.5996899604797363, + "logits/rejected": -2.5931382179260254, + "logps/chosen": -314.7234802246094, + "logps/rejected": -403.55718994140625, + "loss": 0.0726, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.7101809978485107, + "rewards/margins": 16.912425994873047, + "rewards/rejected": -20.622608184814453, + "step": 12640 + }, + { + "epoch": 2.46, + "learning_rate": 1.0077658733012152e-07, + "logits/chosen": -2.7735037803649902, + "logits/rejected": -2.748175859451294, + "logps/chosen": -344.12493896484375, + "logps/rejected": -494.8115234375, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.863116264343262, + "rewards/margins": 16.04551124572754, + "rewards/rejected": -20.908626556396484, + "step": 12650 + }, + { + "epoch": 2.46, + "learning_rate": 1.0041705615876895e-07, + "logits/chosen": -2.819232940673828, + "logits/rejected": -2.6816084384918213, + "logps/chosen": -316.5780334472656, + "logps/rejected": -399.15771484375, + "loss": 0.0587, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7577444314956665, + "rewards/margins": 11.514211654663086, + "rewards/rejected": -13.271957397460938, + "step": 12660 + }, + { + "epoch": 2.46, + "learning_rate": 1.000575249874164e-07, + "logits/chosen": -2.6154356002807617, + "logits/rejected": -2.689516067504883, + "logps/chosen": -277.93463134765625, + "logps/rejected": -424.29412841796875, + "loss": 0.0787, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.221366882324219, + "rewards/margins": 13.102206230163574, + "rewards/rejected": -19.32357406616211, + "step": 12670 + }, + { + "epoch": 2.46, + "learning_rate": 9.969799381606385e-08, + "logits/chosen": -2.7781119346618652, + "logits/rejected": -2.730433940887451, + "logps/chosen": -297.75384521484375, + "logps/rejected": -434.0203552246094, + "loss": 0.0704, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.925167441368103, + "rewards/margins": 14.537073135375977, + "rewards/rejected": -16.46224021911621, + "step": 12680 + }, + { + "epoch": 2.46, + "learning_rate": 9.93384626447113e-08, + "logits/chosen": -2.698772430419922, + "logits/rejected": -2.6802220344543457, + "logps/chosen": -261.1056823730469, + "logps/rejected": -355.3989562988281, + "loss": 0.0806, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.682939052581787, + "rewards/margins": 14.250040054321289, + "rewards/rejected": -18.932979583740234, + "step": 12690 + }, + { + "epoch": 2.47, + "learning_rate": 9.897893147335873e-08, + "logits/chosen": -2.669194459915161, + "logits/rejected": -2.6278138160705566, + "logps/chosen": -202.12786865234375, + "logps/rejected": -358.1590881347656, + "loss": 0.0529, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0838189125061035, + "rewards/margins": 14.480325698852539, + "rewards/rejected": -17.56414222717285, + "step": 12700 + }, + { + "epoch": 2.47, + "eval_logits/chosen": -2.5524840354919434, + "eval_logits/rejected": -2.534165143966675, + "eval_logps/chosen": -298.2552795410156, + "eval_logps/rejected": -347.70635986328125, + "eval_loss": 0.6299814581871033, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -10.401721000671387, + "eval_rewards/margins": 7.1773200035095215, + "eval_rewards/rejected": -17.579038619995117, + "eval_runtime": 139.9454, + "eval_samples_per_second": 22.552, + "eval_steps_per_second": 0.357, + "step": 12700 + }, + { + "epoch": 2.47, + "learning_rate": 9.861940030200617e-08, + "logits/chosen": -2.6520049571990967, + "logits/rejected": -2.6566200256347656, + "logps/chosen": -202.8054962158203, + "logps/rejected": -443.3741760253906, + "loss": 0.0363, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.764678001403809, + "rewards/margins": 12.155046463012695, + "rewards/rejected": -16.919723510742188, + "step": 12710 + }, + { + "epoch": 2.47, + "learning_rate": 9.825986913065362e-08, + "logits/chosen": -2.608522891998291, + "logits/rejected": -2.6032421588897705, + "logps/chosen": -255.10604858398438, + "logps/rejected": -498.72900390625, + "loss": 0.0493, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.724484443664551, + "rewards/margins": 14.063451766967773, + "rewards/rejected": -18.787935256958008, + "step": 12720 + }, + { + "epoch": 2.47, + "learning_rate": 9.790033795930106e-08, + "logits/chosen": -2.6986923217773438, + "logits/rejected": -2.6604561805725098, + "logps/chosen": -287.8281555175781, + "logps/rejected": -354.50067138671875, + "loss": 0.0692, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.757866859436035, + "rewards/margins": 11.266092300415039, + "rewards/rejected": -20.02396011352539, + "step": 12730 + }, + { + "epoch": 2.47, + "learning_rate": 9.754080678794852e-08, + "logits/chosen": -2.6669459342956543, + "logits/rejected": -2.582268238067627, + "logps/chosen": -348.4651184082031, + "logps/rejected": -378.26739501953125, + "loss": 0.085, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.3547120094299316, + "rewards/margins": 17.456912994384766, + "rewards/rejected": -20.811622619628906, + "step": 12740 + }, + { + "epoch": 2.48, + "learning_rate": 9.718127561659595e-08, + "logits/chosen": -2.5141801834106445, + "logits/rejected": -2.51961088180542, + "logps/chosen": -189.30706787109375, + "logps/rejected": -281.38299560546875, + "loss": 0.0467, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.091251373291016, + "rewards/margins": 10.726353645324707, + "rewards/rejected": -14.817604064941406, + "step": 12750 + }, + { + "epoch": 2.48, + "learning_rate": 9.68217444452434e-08, + "logits/chosen": -2.643195867538452, + "logits/rejected": -2.528999090194702, + "logps/chosen": -267.8007507324219, + "logps/rejected": -391.2079162597656, + "loss": 0.0754, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.385805130004883, + "rewards/margins": 11.680742263793945, + "rewards/rejected": -20.066547393798828, + "step": 12760 + }, + { + "epoch": 2.48, + "learning_rate": 9.646221327389084e-08, + "logits/chosen": -2.6890969276428223, + "logits/rejected": -2.6115336418151855, + "logps/chosen": -359.04071044921875, + "logps/rejected": -377.3211975097656, + "loss": 0.0734, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.392149448394775, + "rewards/margins": 13.718755722045898, + "rewards/rejected": -18.11090660095215, + "step": 12770 + }, + { + "epoch": 2.48, + "learning_rate": 9.610268210253828e-08, + "logits/chosen": -2.5152180194854736, + "logits/rejected": -2.525686025619507, + "logps/chosen": -254.0458984375, + "logps/rejected": -403.14715576171875, + "loss": 0.0733, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.80006217956543, + "rewards/margins": 15.435331344604492, + "rewards/rejected": -20.235393524169922, + "step": 12780 + }, + { + "epoch": 2.48, + "learning_rate": 9.574315093118574e-08, + "logits/chosen": -2.4921417236328125, + "logits/rejected": -2.444669485092163, + "logps/chosen": -243.29800415039062, + "logps/rejected": -392.8536376953125, + "loss": 0.0492, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.819563388824463, + "rewards/margins": 15.767827033996582, + "rewards/rejected": -21.587390899658203, + "step": 12790 + }, + { + "epoch": 2.48, + "learning_rate": 9.538361975983317e-08, + "logits/chosen": -2.5865397453308105, + "logits/rejected": -2.547572612762451, + "logps/chosen": -284.09197998046875, + "logps/rejected": -376.93170166015625, + "loss": 0.0572, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.4949049949646, + "rewards/margins": 12.917729377746582, + "rewards/rejected": -19.412633895874023, + "step": 12800 + }, + { + "epoch": 2.48, + "eval_logits/chosen": -2.5215024948120117, + "eval_logits/rejected": -2.4994101524353027, + "eval_logps/chosen": -304.1522521972656, + "eval_logps/rejected": -359.07647705078125, + "eval_loss": 0.6498669385910034, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -10.99142074584961, + "eval_rewards/margins": 7.7246317863464355, + "eval_rewards/rejected": -18.716053009033203, + "eval_runtime": 140.6153, + "eval_samples_per_second": 22.444, + "eval_steps_per_second": 0.356, + "step": 12800 + }, + { + "epoch": 2.49, + "learning_rate": 9.502408858848062e-08, + "logits/chosen": -2.5164453983306885, + "logits/rejected": -2.4940741062164307, + "logps/chosen": -320.9963684082031, + "logps/rejected": -344.89447021484375, + "loss": 0.1136, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.1120405197143555, + "rewards/margins": 10.176172256469727, + "rewards/rejected": -17.288211822509766, + "step": 12810 + }, + { + "epoch": 2.49, + "learning_rate": 9.466455741712806e-08, + "logits/chosen": -2.4596664905548096, + "logits/rejected": -2.4942336082458496, + "logps/chosen": -321.5664367675781, + "logps/rejected": -388.92218017578125, + "loss": 0.0672, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.90731143951416, + "rewards/margins": 15.95301628112793, + "rewards/rejected": -25.86033058166504, + "step": 12820 + }, + { + "epoch": 2.49, + "learning_rate": 9.43050262457755e-08, + "logits/chosen": -2.4090611934661865, + "logits/rejected": -2.249752998352051, + "logps/chosen": -293.23095703125, + "logps/rejected": -350.8958435058594, + "loss": 0.0557, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.9957728385925293, + "rewards/margins": 9.820385932922363, + "rewards/rejected": -13.81615924835205, + "step": 12830 + }, + { + "epoch": 2.49, + "learning_rate": 9.394549507442296e-08, + "logits/chosen": -2.5324692726135254, + "logits/rejected": -2.4817562103271484, + "logps/chosen": -232.61306762695312, + "logps/rejected": -351.1329650878906, + "loss": 0.0676, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.994973182678223, + "rewards/margins": 10.949104309082031, + "rewards/rejected": -20.944076538085938, + "step": 12840 + }, + { + "epoch": 2.49, + "learning_rate": 9.358596390307038e-08, + "logits/chosen": -2.4570517539978027, + "logits/rejected": -2.505894184112549, + "logps/chosen": -272.30963134765625, + "logps/rejected": -398.0025939941406, + "loss": 0.0682, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.065876007080078, + "rewards/margins": 13.8529691696167, + "rewards/rejected": -18.918846130371094, + "step": 12850 + }, + { + "epoch": 2.5, + "learning_rate": 9.322643273171784e-08, + "logits/chosen": -2.6570115089416504, + "logits/rejected": -2.6433606147766113, + "logps/chosen": -308.03582763671875, + "logps/rejected": -451.9251403808594, + "loss": 0.064, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.715272426605225, + "rewards/margins": 20.070213317871094, + "rewards/rejected": -27.785486221313477, + "step": 12860 + }, + { + "epoch": 2.5, + "learning_rate": 9.286690156036528e-08, + "logits/chosen": -2.61348819732666, + "logits/rejected": -2.483337879180908, + "logps/chosen": -306.2225036621094, + "logps/rejected": -383.4023132324219, + "loss": 0.0485, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.665471076965332, + "rewards/margins": 10.886301040649414, + "rewards/rejected": -16.55177116394043, + "step": 12870 + }, + { + "epoch": 2.5, + "learning_rate": 9.250737038901272e-08, + "logits/chosen": -2.4846606254577637, + "logits/rejected": -2.443726062774658, + "logps/chosen": -246.4105987548828, + "logps/rejected": -365.9189453125, + "loss": 0.1106, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.970712661743164, + "rewards/margins": 10.023126602172852, + "rewards/rejected": -22.993837356567383, + "step": 12880 + }, + { + "epoch": 2.5, + "learning_rate": 9.214783921766017e-08, + "logits/chosen": -2.5224173069000244, + "logits/rejected": -2.454105854034424, + "logps/chosen": -251.5784454345703, + "logps/rejected": -300.30377197265625, + "loss": 0.0882, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.659109592437744, + "rewards/margins": 11.266281127929688, + "rewards/rejected": -15.925392150878906, + "step": 12890 + }, + { + "epoch": 2.5, + "learning_rate": 9.17883080463076e-08, + "logits/chosen": -2.5811517238616943, + "logits/rejected": -2.615131378173828, + "logps/chosen": -298.760009765625, + "logps/rejected": -392.4461975097656, + "loss": 0.0687, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.927229881286621, + "rewards/margins": 12.201677322387695, + "rewards/rejected": -21.128908157348633, + "step": 12900 + }, + { + "epoch": 2.5, + "eval_logits/chosen": -2.570328950881958, + "eval_logits/rejected": -2.5499255657196045, + "eval_logps/chosen": -313.0833740234375, + "eval_logps/rejected": -369.8017578125, + "eval_loss": 0.6572585701942444, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -11.884530067443848, + "eval_rewards/margins": 7.904053211212158, + "eval_rewards/rejected": -19.788583755493164, + "eval_runtime": 159.0856, + "eval_samples_per_second": 19.838, + "eval_steps_per_second": 0.314, + "step": 12900 + }, + { + "epoch": 2.51, + "learning_rate": 9.142877687495506e-08, + "logits/chosen": -2.7034802436828613, + "logits/rejected": -2.7853951454162598, + "logps/chosen": -244.06210327148438, + "logps/rejected": -480.805908203125, + "loss": 0.0519, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.491147041320801, + "rewards/margins": 17.780879974365234, + "rewards/rejected": -25.272029876708984, + "step": 12910 + }, + { + "epoch": 2.51, + "learning_rate": 9.10692457036025e-08, + "logits/chosen": -2.6614511013031006, + "logits/rejected": -2.635369062423706, + "logps/chosen": -266.12908935546875, + "logps/rejected": -330.9181213378906, + "loss": 0.0683, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.025728702545166, + "rewards/margins": 9.081961631774902, + "rewards/rejected": -15.107688903808594, + "step": 12920 + }, + { + "epoch": 2.51, + "learning_rate": 9.070971453224994e-08, + "logits/chosen": -2.6953396797180176, + "logits/rejected": -2.7117714881896973, + "logps/chosen": -310.4691467285156, + "logps/rejected": -535.332275390625, + "loss": 0.0785, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.80519962310791, + "rewards/margins": 18.081783294677734, + "rewards/rejected": -24.886981964111328, + "step": 12930 + }, + { + "epoch": 2.51, + "learning_rate": 9.035018336089739e-08, + "logits/chosen": -2.518805742263794, + "logits/rejected": -2.5691771507263184, + "logps/chosen": -257.78912353515625, + "logps/rejected": -489.98846435546875, + "loss": 0.076, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.732778072357178, + "rewards/margins": 14.248738288879395, + "rewards/rejected": -20.981517791748047, + "step": 12940 + }, + { + "epoch": 2.51, + "learning_rate": 8.999065218954482e-08, + "logits/chosen": -2.514657974243164, + "logits/rejected": -2.5844292640686035, + "logps/chosen": -247.2261962890625, + "logps/rejected": -398.01141357421875, + "loss": 0.0624, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -11.016149520874023, + "rewards/margins": 15.479927062988281, + "rewards/rejected": -26.496074676513672, + "step": 12950 + }, + { + "epoch": 2.52, + "learning_rate": 8.963112101819228e-08, + "logits/chosen": -2.5862746238708496, + "logits/rejected": -2.543480396270752, + "logps/chosen": -269.873291015625, + "logps/rejected": -459.63165283203125, + "loss": 0.0779, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.907736778259277, + "rewards/margins": 17.983074188232422, + "rewards/rejected": -27.89080810546875, + "step": 12960 + }, + { + "epoch": 2.52, + "learning_rate": 8.927158984683972e-08, + "logits/chosen": -2.6230921745300293, + "logits/rejected": -2.642923593521118, + "logps/chosen": -324.45709228515625, + "logps/rejected": -424.33721923828125, + "loss": 0.0787, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.874556064605713, + "rewards/margins": 16.19103240966797, + "rewards/rejected": -24.065587997436523, + "step": 12970 + }, + { + "epoch": 2.52, + "learning_rate": 8.891205867548717e-08, + "logits/chosen": -2.6603245735168457, + "logits/rejected": -2.6209988594055176, + "logps/chosen": -240.4772491455078, + "logps/rejected": -339.351318359375, + "loss": 0.0835, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.2165093421936035, + "rewards/margins": 11.686843872070312, + "rewards/rejected": -17.90335464477539, + "step": 12980 + }, + { + "epoch": 2.52, + "learning_rate": 8.855252750413461e-08, + "logits/chosen": -2.600198984146118, + "logits/rejected": -2.5400357246398926, + "logps/chosen": -340.0777282714844, + "logps/rejected": -481.73626708984375, + "loss": 0.0715, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.748044967651367, + "rewards/margins": 16.56332015991211, + "rewards/rejected": -25.31136703491211, + "step": 12990 + }, + { + "epoch": 2.52, + "learning_rate": 8.819299633278204e-08, + "logits/chosen": -2.6538376808166504, + "logits/rejected": -2.6430704593658447, + "logps/chosen": -210.0105743408203, + "logps/rejected": -370.7178039550781, + "loss": 0.0658, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.693873405456543, + "rewards/margins": 14.175498962402344, + "rewards/rejected": -19.86937141418457, + "step": 13000 + }, + { + "epoch": 2.52, + "eval_logits/chosen": -2.5584936141967773, + "eval_logits/rejected": -2.5373728275299072, + "eval_logps/chosen": -317.293212890625, + "eval_logps/rejected": -375.7680358886719, + "eval_loss": 0.645956814289093, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -12.305511474609375, + "eval_rewards/margins": 8.07969856262207, + "eval_rewards/rejected": -20.385211944580078, + "eval_runtime": 155.6589, + "eval_samples_per_second": 20.275, + "eval_steps_per_second": 0.321, + "step": 13000 + }, + { + "epoch": 2.53, + "learning_rate": 8.78334651614295e-08, + "logits/chosen": -2.518589496612549, + "logits/rejected": -2.54185152053833, + "logps/chosen": -265.8616027832031, + "logps/rejected": -421.4515075683594, + "loss": 0.08, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -10.167800903320312, + "rewards/margins": 12.785776138305664, + "rewards/rejected": -22.953577041625977, + "step": 13010 + }, + { + "epoch": 2.53, + "learning_rate": 8.747393399007693e-08, + "logits/chosen": -2.6595230102539062, + "logits/rejected": -2.5554375648498535, + "logps/chosen": -358.6347351074219, + "logps/rejected": -357.8252258300781, + "loss": 0.1226, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -12.585082054138184, + "rewards/margins": 11.05892276763916, + "rewards/rejected": -23.644004821777344, + "step": 13020 + }, + { + "epoch": 2.53, + "learning_rate": 8.711440281872439e-08, + "logits/chosen": -2.597569704055786, + "logits/rejected": -2.589627742767334, + "logps/chosen": -251.9309539794922, + "logps/rejected": -354.78912353515625, + "loss": 0.0741, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.414864540100098, + "rewards/margins": 13.376960754394531, + "rewards/rejected": -20.791826248168945, + "step": 13030 + }, + { + "epoch": 2.53, + "learning_rate": 8.675487164737183e-08, + "logits/chosen": -2.6389718055725098, + "logits/rejected": -2.5409035682678223, + "logps/chosen": -452.167236328125, + "logps/rejected": -451.02911376953125, + "loss": 0.0818, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.244949340820312, + "rewards/margins": 12.40312385559082, + "rewards/rejected": -21.648075103759766, + "step": 13040 + }, + { + "epoch": 2.53, + "learning_rate": 8.639534047601926e-08, + "logits/chosen": -2.500101327896118, + "logits/rejected": -2.466165542602539, + "logps/chosen": -218.2380828857422, + "logps/rejected": -395.40924072265625, + "loss": 0.0902, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.961050987243652, + "rewards/margins": 13.048199653625488, + "rewards/rejected": -22.009252548217773, + "step": 13050 + }, + { + "epoch": 2.54, + "learning_rate": 8.603580930466671e-08, + "logits/chosen": -2.6560113430023193, + "logits/rejected": -2.672569990158081, + "logps/chosen": -347.00213623046875, + "logps/rejected": -363.33843994140625, + "loss": 0.0754, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.370176792144775, + "rewards/margins": 12.299577713012695, + "rewards/rejected": -17.669755935668945, + "step": 13060 + }, + { + "epoch": 2.54, + "learning_rate": 8.567627813331415e-08, + "logits/chosen": -2.5172853469848633, + "logits/rejected": -2.3834493160247803, + "logps/chosen": -192.8883819580078, + "logps/rejected": -286.356689453125, + "loss": 0.0378, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.385251998901367, + "rewards/margins": 10.082830429077148, + "rewards/rejected": -14.4680814743042, + "step": 13070 + }, + { + "epoch": 2.54, + "learning_rate": 8.531674696196161e-08, + "logits/chosen": -2.557122230529785, + "logits/rejected": -2.6049318313598633, + "logps/chosen": -273.2023010253906, + "logps/rejected": -496.791259765625, + "loss": 0.0696, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -10.941291809082031, + "rewards/margins": 16.8946590423584, + "rewards/rejected": -27.835948944091797, + "step": 13080 + }, + { + "epoch": 2.54, + "learning_rate": 8.495721579060904e-08, + "logits/chosen": -2.5869221687316895, + "logits/rejected": -2.59763240814209, + "logps/chosen": -298.122802734375, + "logps/rejected": -464.675537109375, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.380122184753418, + "rewards/margins": 18.6658878326416, + "rewards/rejected": -26.046010971069336, + "step": 13090 + }, + { + "epoch": 2.54, + "learning_rate": 8.459768461925648e-08, + "logits/chosen": -2.384657144546509, + "logits/rejected": -2.4255499839782715, + "logps/chosen": -280.59210205078125, + "logps/rejected": -405.8802795410156, + "loss": 0.0897, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.882360458374023, + "rewards/margins": 11.485540390014648, + "rewards/rejected": -20.367902755737305, + "step": 13100 + }, + { + "epoch": 2.54, + "eval_logits/chosen": -2.4809982776641846, + "eval_logits/rejected": -2.4576659202575684, + "eval_logps/chosen": -320.8460388183594, + "eval_logps/rejected": -381.0459289550781, + "eval_loss": 0.6673251986503601, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -12.660794258117676, + "eval_rewards/margins": 8.252202987670898, + "eval_rewards/rejected": -20.91299819946289, + "eval_runtime": 140.2174, + "eval_samples_per_second": 22.508, + "eval_steps_per_second": 0.357, + "step": 13100 + }, + { + "epoch": 2.55, + "learning_rate": 8.423815344790393e-08, + "logits/chosen": -2.4887547492980957, + "logits/rejected": -2.394315719604492, + "logps/chosen": -198.9996795654297, + "logps/rejected": -376.09381103515625, + "loss": 0.0873, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.836404800415039, + "rewards/margins": 17.285276412963867, + "rewards/rejected": -26.121679306030273, + "step": 13110 + }, + { + "epoch": 2.55, + "learning_rate": 8.387862227655137e-08, + "logits/chosen": -2.5716552734375, + "logits/rejected": -2.5738325119018555, + "logps/chosen": -342.38140869140625, + "logps/rejected": -508.90667724609375, + "loss": 0.0731, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.472661018371582, + "rewards/margins": 11.9817533493042, + "rewards/rejected": -21.454416275024414, + "step": 13120 + }, + { + "epoch": 2.55, + "learning_rate": 8.351909110519883e-08, + "logits/chosen": -2.459246873855591, + "logits/rejected": -2.545761823654175, + "logps/chosen": -275.0579528808594, + "logps/rejected": -410.394287109375, + "loss": 0.0794, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.266641139984131, + "rewards/margins": 11.687387466430664, + "rewards/rejected": -18.954029083251953, + "step": 13130 + }, + { + "epoch": 2.55, + "learning_rate": 8.315955993384625e-08, + "logits/chosen": -2.6211397647857666, + "logits/rejected": -2.671525239944458, + "logps/chosen": -287.2232360839844, + "logps/rejected": -538.5015258789062, + "loss": 0.0672, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.823758602142334, + "rewards/margins": 21.800235748291016, + "rewards/rejected": -26.62399673461914, + "step": 13140 + }, + { + "epoch": 2.55, + "learning_rate": 8.28000287624937e-08, + "logits/chosen": -2.5801539421081543, + "logits/rejected": -2.5598695278167725, + "logps/chosen": -299.2322692871094, + "logps/rejected": -449.35943603515625, + "loss": 0.0739, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.7073540687561035, + "rewards/margins": 17.938566207885742, + "rewards/rejected": -24.645917892456055, + "step": 13150 + }, + { + "epoch": 2.55, + "learning_rate": 8.244049759114115e-08, + "logits/chosen": -2.568143129348755, + "logits/rejected": -2.414315700531006, + "logps/chosen": -256.2361145019531, + "logps/rejected": -355.2630310058594, + "loss": 0.0799, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -12.5073823928833, + "rewards/margins": 10.324131965637207, + "rewards/rejected": -22.831512451171875, + "step": 13160 + }, + { + "epoch": 2.56, + "learning_rate": 8.208096641978859e-08, + "logits/chosen": -2.345545530319214, + "logits/rejected": -2.265782117843628, + "logps/chosen": -305.25872802734375, + "logps/rejected": -455.2618103027344, + "loss": 0.0611, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.404328346252441, + "rewards/margins": 18.420879364013672, + "rewards/rejected": -28.825210571289062, + "step": 13170 + }, + { + "epoch": 2.56, + "learning_rate": 8.172143524843604e-08, + "logits/chosen": -2.5380609035491943, + "logits/rejected": -2.4967401027679443, + "logps/chosen": -229.2069091796875, + "logps/rejected": -428.5634765625, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.023338317871094, + "rewards/margins": 16.673412322998047, + "rewards/rejected": -22.69675064086914, + "step": 13180 + }, + { + "epoch": 2.56, + "learning_rate": 8.136190407708347e-08, + "logits/chosen": -2.669823169708252, + "logits/rejected": -2.5576939582824707, + "logps/chosen": -292.00836181640625, + "logps/rejected": -311.6917419433594, + "loss": 0.0681, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.127823829650879, + "rewards/margins": 11.13559341430664, + "rewards/rejected": -16.263416290283203, + "step": 13190 + }, + { + "epoch": 2.56, + "learning_rate": 8.100237290573093e-08, + "logits/chosen": -2.6349949836730957, + "logits/rejected": -2.5826575756073, + "logps/chosen": -317.48944091796875, + "logps/rejected": -490.8722229003906, + "loss": 0.0386, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.946131706237793, + "rewards/margins": 19.052885055541992, + "rewards/rejected": -27.9990177154541, + "step": 13200 + }, + { + "epoch": 2.56, + "eval_logits/chosen": -2.470322370529175, + "eval_logits/rejected": -2.445310592651367, + "eval_logps/chosen": -323.9682312011719, + "eval_logps/rejected": -386.3536071777344, + "eval_loss": 0.6574805378913879, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -12.973016738891602, + "eval_rewards/margins": 8.470744132995605, + "eval_rewards/rejected": -21.443761825561523, + "eval_runtime": 141.3765, + "eval_samples_per_second": 22.323, + "eval_steps_per_second": 0.354, + "step": 13200 + }, + { + "epoch": 2.56, + "learning_rate": 8.064284173437837e-08, + "logits/chosen": -2.6297945976257324, + "logits/rejected": -2.523813486099243, + "logps/chosen": -303.2640686035156, + "logps/rejected": -431.673583984375, + "loss": 0.0838, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -10.033949851989746, + "rewards/margins": 14.66093635559082, + "rewards/rejected": -24.694883346557617, + "step": 13210 + }, + { + "epoch": 2.57, + "learning_rate": 8.028331056302581e-08, + "logits/chosen": -2.520181179046631, + "logits/rejected": -2.526456117630005, + "logps/chosen": -241.6414794921875, + "logps/rejected": -357.3619079589844, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.619406223297119, + "rewards/margins": 12.885050773620605, + "rewards/rejected": -19.504459381103516, + "step": 13220 + }, + { + "epoch": 2.57, + "learning_rate": 7.992377939167326e-08, + "logits/chosen": -2.6000938415527344, + "logits/rejected": -2.4963479042053223, + "logps/chosen": -323.5462341308594, + "logps/rejected": -438.39453125, + "loss": 0.0593, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.876238822937012, + "rewards/margins": 16.660070419311523, + "rewards/rejected": -25.53631019592285, + "step": 13230 + }, + { + "epoch": 2.57, + "learning_rate": 7.956424822032069e-08, + "logits/chosen": -2.799954652786255, + "logits/rejected": -2.719837188720703, + "logps/chosen": -478.8736267089844, + "logps/rejected": -531.1661987304688, + "loss": 0.1776, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -12.289868354797363, + "rewards/margins": 9.699061393737793, + "rewards/rejected": -21.988927841186523, + "step": 13240 + }, + { + "epoch": 2.57, + "learning_rate": 7.920471704896815e-08, + "logits/chosen": -2.611156463623047, + "logits/rejected": -2.640255928039551, + "logps/chosen": -304.0635986328125, + "logps/rejected": -457.8282775878906, + "loss": 0.0534, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.5346367359161377, + "rewards/margins": 18.964651107788086, + "rewards/rejected": -22.49928855895996, + "step": 13250 + }, + { + "epoch": 2.57, + "learning_rate": 7.884518587761559e-08, + "logits/chosen": -2.641752243041992, + "logits/rejected": -2.5792839527130127, + "logps/chosen": -261.02496337890625, + "logps/rejected": -397.3820495605469, + "loss": 0.1144, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.743465423583984, + "rewards/margins": 14.1588716506958, + "rewards/rejected": -21.902339935302734, + "step": 13260 + }, + { + "epoch": 2.58, + "learning_rate": 7.848565470626303e-08, + "logits/chosen": -2.6197493076324463, + "logits/rejected": -2.5807297229766846, + "logps/chosen": -263.82196044921875, + "logps/rejected": -404.60382080078125, + "loss": 0.0876, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.385954856872559, + "rewards/margins": 12.897705078125, + "rewards/rejected": -20.283660888671875, + "step": 13270 + }, + { + "epoch": 2.58, + "learning_rate": 7.812612353491048e-08, + "logits/chosen": -2.526801347732544, + "logits/rejected": -2.5287177562713623, + "logps/chosen": -255.1396484375, + "logps/rejected": -448.39630126953125, + "loss": 0.0839, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.068594932556152, + "rewards/margins": 10.024816513061523, + "rewards/rejected": -17.093412399291992, + "step": 13280 + }, + { + "epoch": 2.58, + "learning_rate": 7.776659236355791e-08, + "logits/chosen": -2.612931728363037, + "logits/rejected": -2.547118663787842, + "logps/chosen": -309.223876953125, + "logps/rejected": -439.5494079589844, + "loss": 0.0703, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.652108669281006, + "rewards/margins": 12.59142017364502, + "rewards/rejected": -19.243526458740234, + "step": 13290 + }, + { + "epoch": 2.58, + "learning_rate": 7.740706119220536e-08, + "logits/chosen": -2.6614480018615723, + "logits/rejected": -2.6221108436584473, + "logps/chosen": -284.7066345214844, + "logps/rejected": -397.18798828125, + "loss": 0.0771, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -10.011357307434082, + "rewards/margins": 12.294212341308594, + "rewards/rejected": -22.30556869506836, + "step": 13300 + }, + { + "epoch": 2.58, + "eval_logits/chosen": -2.5589723587036133, + "eval_logits/rejected": -2.540689468383789, + "eval_logps/chosen": -304.8467102050781, + "eval_logps/rejected": -356.00262451171875, + "eval_loss": 0.6375167369842529, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -11.060863494873047, + "eval_rewards/margins": 7.347804069519043, + "eval_rewards/rejected": -18.408666610717773, + "eval_runtime": 139.7109, + "eval_samples_per_second": 22.59, + "eval_steps_per_second": 0.358, + "step": 13300 + }, + { + "epoch": 2.58, + "learning_rate": 7.70475300208528e-08, + "logits/chosen": -2.614485263824463, + "logits/rejected": -2.592682123184204, + "logps/chosen": -290.1607360839844, + "logps/rejected": -344.34149169921875, + "loss": 0.0756, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.696990013122559, + "rewards/margins": 10.190881729125977, + "rewards/rejected": -15.887868881225586, + "step": 13310 + }, + { + "epoch": 2.59, + "learning_rate": 7.668799884950025e-08, + "logits/chosen": -2.518864154815674, + "logits/rejected": -2.5086426734924316, + "logps/chosen": -285.72802734375, + "logps/rejected": -376.6285095214844, + "loss": 0.0696, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.568521499633789, + "rewards/margins": 9.653501510620117, + "rewards/rejected": -18.222023010253906, + "step": 13320 + }, + { + "epoch": 2.59, + "learning_rate": 7.63284676781477e-08, + "logits/chosen": -2.6888203620910645, + "logits/rejected": -2.5889365673065186, + "logps/chosen": -235.26669311523438, + "logps/rejected": -366.56146240234375, + "loss": 0.0722, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.766876220703125, + "rewards/margins": 12.524709701538086, + "rewards/rejected": -20.291584014892578, + "step": 13330 + }, + { + "epoch": 2.59, + "learning_rate": 7.596893650679513e-08, + "logits/chosen": -2.509105682373047, + "logits/rejected": -2.510641574859619, + "logps/chosen": -292.5914001464844, + "logps/rejected": -431.8297424316406, + "loss": 0.0786, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.046849250793457, + "rewards/margins": 18.554676055908203, + "rewards/rejected": -27.60152244567871, + "step": 13340 + }, + { + "epoch": 2.59, + "learning_rate": 7.560940533544258e-08, + "logits/chosen": -2.62268328666687, + "logits/rejected": -2.5355706214904785, + "logps/chosen": -301.7018127441406, + "logps/rejected": -379.0313415527344, + "loss": 0.0631, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.858834743499756, + "rewards/margins": 13.080639839172363, + "rewards/rejected": -16.93947410583496, + "step": 13350 + }, + { + "epoch": 2.59, + "learning_rate": 7.524987416409002e-08, + "logits/chosen": -2.5447497367858887, + "logits/rejected": -2.449542999267578, + "logps/chosen": -262.575439453125, + "logps/rejected": -409.4619140625, + "loss": 0.0631, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.333880424499512, + "rewards/margins": 11.08108139038086, + "rewards/rejected": -16.414960861206055, + "step": 13360 + }, + { + "epoch": 2.6, + "learning_rate": 7.489034299273746e-08, + "logits/chosen": -2.615506649017334, + "logits/rejected": -2.6858839988708496, + "logps/chosen": -284.05352783203125, + "logps/rejected": -346.6708679199219, + "loss": 0.0818, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.476611137390137, + "rewards/margins": 9.028188705444336, + "rewards/rejected": -16.504802703857422, + "step": 13370 + }, + { + "epoch": 2.6, + "learning_rate": 7.453081182138492e-08, + "logits/chosen": -2.7071032524108887, + "logits/rejected": -2.63547682762146, + "logps/chosen": -226.57119750976562, + "logps/rejected": -363.83148193359375, + "loss": 0.0918, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6655004024505615, + "rewards/margins": 12.037089347839355, + "rewards/rejected": -14.70258903503418, + "step": 13380 + }, + { + "epoch": 2.6, + "learning_rate": 7.417128065003235e-08, + "logits/chosen": -2.551342487335205, + "logits/rejected": -2.5826187133789062, + "logps/chosen": -190.64488220214844, + "logps/rejected": -350.8616638183594, + "loss": 0.0469, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.572944641113281, + "rewards/margins": 12.592473983764648, + "rewards/rejected": -20.16541862487793, + "step": 13390 + }, + { + "epoch": 2.6, + "learning_rate": 7.38117494786798e-08, + "logits/chosen": -2.6725316047668457, + "logits/rejected": -2.7998526096343994, + "logps/chosen": -226.8726348876953, + "logps/rejected": -448.68634033203125, + "loss": 0.0704, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.834217071533203, + "rewards/margins": 15.59800910949707, + "rewards/rejected": -20.43222427368164, + "step": 13400 + }, + { + "epoch": 2.6, + "eval_logits/chosen": -2.550319194793701, + "eval_logits/rejected": -2.5312576293945312, + "eval_logps/chosen": -308.41473388671875, + "eval_logps/rejected": -361.51446533203125, + "eval_loss": 0.6408479809761047, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -11.417664527893066, + "eval_rewards/margins": 7.542184352874756, + "eval_rewards/rejected": -18.959850311279297, + "eval_runtime": 141.1534, + "eval_samples_per_second": 22.359, + "eval_steps_per_second": 0.354, + "step": 13400 + }, + { + "epoch": 2.6, + "learning_rate": 7.345221830732724e-08, + "logits/chosen": -2.673508405685425, + "logits/rejected": -2.6295700073242188, + "logps/chosen": -351.0205383300781, + "logps/rejected": -398.322998046875, + "loss": 0.0923, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.051070213317871, + "rewards/margins": 11.78144359588623, + "rewards/rejected": -18.832515716552734, + "step": 13410 + }, + { + "epoch": 2.61, + "learning_rate": 7.30926871359747e-08, + "logits/chosen": -2.6739401817321777, + "logits/rejected": -2.531371593475342, + "logps/chosen": -261.10638427734375, + "logps/rejected": -434.35626220703125, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.755013942718506, + "rewards/margins": 17.52224349975586, + "rewards/rejected": -21.277257919311523, + "step": 13420 + }, + { + "epoch": 2.61, + "learning_rate": 7.273315596462212e-08, + "logits/chosen": -2.8355870246887207, + "logits/rejected": -2.8189854621887207, + "logps/chosen": -345.1497802734375, + "logps/rejected": -516.4817504882812, + "loss": 0.0501, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5474488735198975, + "rewards/margins": 16.476818084716797, + "rewards/rejected": -19.024269104003906, + "step": 13430 + }, + { + "epoch": 2.61, + "learning_rate": 7.237362479326957e-08, + "logits/chosen": -2.684986114501953, + "logits/rejected": -2.6205391883850098, + "logps/chosen": -304.0296936035156, + "logps/rejected": -425.7872009277344, + "loss": 0.0619, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.477696418762207, + "rewards/margins": 13.657798767089844, + "rewards/rejected": -19.135498046875, + "step": 13440 + }, + { + "epoch": 2.61, + "learning_rate": 7.201409362191702e-08, + "logits/chosen": -2.6951632499694824, + "logits/rejected": -2.6230578422546387, + "logps/chosen": -275.9407958984375, + "logps/rejected": -411.18756103515625, + "loss": 0.0796, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.448228359222412, + "rewards/margins": 13.241569519042969, + "rewards/rejected": -16.689800262451172, + "step": 13450 + }, + { + "epoch": 2.61, + "learning_rate": 7.165456245056446e-08, + "logits/chosen": -2.5738027095794678, + "logits/rejected": -2.509709119796753, + "logps/chosen": -227.03536987304688, + "logps/rejected": -427.52587890625, + "loss": 0.064, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.788276672363281, + "rewards/margins": 14.447412490844727, + "rewards/rejected": -23.235685348510742, + "step": 13460 + }, + { + "epoch": 2.62, + "learning_rate": 7.129503127921191e-08, + "logits/chosen": -2.4864540100097656, + "logits/rejected": -2.5512588024139404, + "logps/chosen": -316.167724609375, + "logps/rejected": -443.88153076171875, + "loss": 0.055, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.14301586151123, + "rewards/margins": 13.683004379272461, + "rewards/rejected": -23.826019287109375, + "step": 13470 + }, + { + "epoch": 2.62, + "learning_rate": 7.093550010785934e-08, + "logits/chosen": -2.6127116680145264, + "logits/rejected": -2.618218183517456, + "logps/chosen": -238.98239135742188, + "logps/rejected": -391.04779052734375, + "loss": 0.0559, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.6085662841796875, + "rewards/margins": 13.831143379211426, + "rewards/rejected": -20.43971061706543, + "step": 13480 + }, + { + "epoch": 2.62, + "learning_rate": 7.057596893650678e-08, + "logits/chosen": -2.6301097869873047, + "logits/rejected": -2.6515915393829346, + "logps/chosen": -257.0636291503906, + "logps/rejected": -378.5958557128906, + "loss": 0.0631, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.423306941986084, + "rewards/margins": 18.068883895874023, + "rewards/rejected": -24.492191314697266, + "step": 13490 + }, + { + "epoch": 2.62, + "learning_rate": 7.021643776515424e-08, + "logits/chosen": -2.4764249324798584, + "logits/rejected": -2.516697406768799, + "logps/chosen": -222.3642578125, + "logps/rejected": -371.0328063964844, + "loss": 0.0715, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.297858238220215, + "rewards/margins": 12.84132194519043, + "rewards/rejected": -19.139179229736328, + "step": 13500 + }, + { + "epoch": 2.62, + "eval_logits/chosen": -2.5267419815063477, + "eval_logits/rejected": -2.50559401512146, + "eval_logps/chosen": -312.5887145996094, + "eval_logps/rejected": -369.9871826171875, + "eval_loss": 0.6432679891586304, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -11.835062026977539, + "eval_rewards/margins": 7.972060680389404, + "eval_rewards/rejected": -19.8071231842041, + "eval_runtime": 151.5765, + "eval_samples_per_second": 20.821, + "eval_steps_per_second": 0.33, + "step": 13500 + }, + { + "epoch": 2.62, + "learning_rate": 6.985690659380168e-08, + "logits/chosen": -2.46028733253479, + "logits/rejected": -2.543519973754883, + "logps/chosen": -241.09628295898438, + "logps/rejected": -493.2625427246094, + "loss": 0.0579, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.681536674499512, + "rewards/margins": 15.452682495117188, + "rewards/rejected": -21.134220123291016, + "step": 13510 + }, + { + "epoch": 2.62, + "learning_rate": 6.949737542244913e-08, + "logits/chosen": -2.661557912826538, + "logits/rejected": -2.6307854652404785, + "logps/chosen": -260.44586181640625, + "logps/rejected": -436.988525390625, + "loss": 0.0835, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.225275993347168, + "rewards/margins": 18.49778175354004, + "rewards/rejected": -25.723058700561523, + "step": 13520 + }, + { + "epoch": 2.63, + "learning_rate": 6.913784425109656e-08, + "logits/chosen": -2.634443521499634, + "logits/rejected": -2.6196234226226807, + "logps/chosen": -303.8482971191406, + "logps/rejected": -458.89794921875, + "loss": 0.0869, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.48668909072876, + "rewards/margins": 16.242252349853516, + "rewards/rejected": -22.728944778442383, + "step": 13530 + }, + { + "epoch": 2.63, + "learning_rate": 6.8778313079744e-08, + "logits/chosen": -2.604661703109741, + "logits/rejected": -2.570814371109009, + "logps/chosen": -251.318359375, + "logps/rejected": -328.5297546386719, + "loss": 0.0828, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.066451072692871, + "rewards/margins": 12.24148178100586, + "rewards/rejected": -16.307931900024414, + "step": 13540 + }, + { + "epoch": 2.63, + "learning_rate": 6.841878190839146e-08, + "logits/chosen": -2.697322368621826, + "logits/rejected": -2.6674821376800537, + "logps/chosen": -264.43255615234375, + "logps/rejected": -474.1181640625, + "loss": 0.0411, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.035953044891357, + "rewards/margins": 17.23788833618164, + "rewards/rejected": -22.273841857910156, + "step": 13550 + }, + { + "epoch": 2.63, + "learning_rate": 6.80592507370389e-08, + "logits/chosen": -2.5578739643096924, + "logits/rejected": -2.70164155960083, + "logps/chosen": -273.7012023925781, + "logps/rejected": -475.08807373046875, + "loss": 0.0833, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.721813201904297, + "rewards/margins": 20.668071746826172, + "rewards/rejected": -26.389883041381836, + "step": 13560 + }, + { + "epoch": 2.63, + "learning_rate": 6.769971956568635e-08, + "logits/chosen": -2.6877002716064453, + "logits/rejected": -2.658869743347168, + "logps/chosen": -234.70315551757812, + "logps/rejected": -336.563720703125, + "loss": 0.0717, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.7852420806884766, + "rewards/margins": 15.942594528198242, + "rewards/rejected": -19.72783660888672, + "step": 13570 + }, + { + "epoch": 2.64, + "learning_rate": 6.734018839433378e-08, + "logits/chosen": -2.714672803878784, + "logits/rejected": -2.6009838581085205, + "logps/chosen": -319.7707824707031, + "logps/rejected": -418.991455078125, + "loss": 0.0457, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.361305236816406, + "rewards/margins": 15.218538284301758, + "rewards/rejected": -22.579843521118164, + "step": 13580 + }, + { + "epoch": 2.64, + "learning_rate": 6.698065722298122e-08, + "logits/chosen": -2.722248077392578, + "logits/rejected": -2.6592392921447754, + "logps/chosen": -306.11944580078125, + "logps/rejected": -349.6539001464844, + "loss": 0.0796, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.100323677062988, + "rewards/margins": 8.627074241638184, + "rewards/rejected": -17.727397918701172, + "step": 13590 + }, + { + "epoch": 2.64, + "learning_rate": 6.662112605162868e-08, + "logits/chosen": -2.7155263423919678, + "logits/rejected": -2.6460728645324707, + "logps/chosen": -300.47674560546875, + "logps/rejected": -378.77203369140625, + "loss": 0.0511, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3318657875061035, + "rewards/margins": 12.285037994384766, + "rewards/rejected": -15.616902351379395, + "step": 13600 + }, + { + "epoch": 2.64, + "eval_logits/chosen": -2.5038247108459473, + "eval_logits/rejected": -2.4817864894866943, + "eval_logps/chosen": -306.9222412109375, + "eval_logps/rejected": -363.99371337890625, + "eval_loss": 0.6403080224990845, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -11.268416404724121, + "eval_rewards/margins": 7.939359664916992, + "eval_rewards/rejected": -19.207775115966797, + "eval_runtime": 157.5492, + "eval_samples_per_second": 20.032, + "eval_steps_per_second": 0.317, + "step": 13600 + }, + { + "epoch": 2.64, + "learning_rate": 6.626159488027612e-08, + "logits/chosen": -2.6395246982574463, + "logits/rejected": -2.4943339824676514, + "logps/chosen": -322.060302734375, + "logps/rejected": -463.0009765625, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8631839752197266, + "rewards/margins": 22.114269256591797, + "rewards/rejected": -24.97745132446289, + "step": 13610 + }, + { + "epoch": 2.64, + "learning_rate": 6.590206370892357e-08, + "logits/chosen": -2.681689500808716, + "logits/rejected": -2.5957589149475098, + "logps/chosen": -309.920166015625, + "logps/rejected": -358.9371337890625, + "loss": 0.1, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.827698707580566, + "rewards/margins": 11.025238037109375, + "rewards/rejected": -15.852938652038574, + "step": 13620 + }, + { + "epoch": 2.65, + "learning_rate": 6.5542532537571e-08, + "logits/chosen": -2.60517954826355, + "logits/rejected": -2.547213077545166, + "logps/chosen": -252.23745727539062, + "logps/rejected": -326.57940673828125, + "loss": 0.0758, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.169960021972656, + "rewards/margins": 9.891514778137207, + "rewards/rejected": -17.061473846435547, + "step": 13630 + }, + { + "epoch": 2.65, + "learning_rate": 6.518300136621844e-08, + "logits/chosen": -2.5564725399017334, + "logits/rejected": -2.5501961708068848, + "logps/chosen": -318.10589599609375, + "logps/rejected": -495.20672607421875, + "loss": 0.073, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.947672367095947, + "rewards/margins": 12.922449111938477, + "rewards/rejected": -19.8701229095459, + "step": 13640 + }, + { + "epoch": 2.65, + "learning_rate": 6.48234701948659e-08, + "logits/chosen": -2.810136079788208, + "logits/rejected": -2.693380832672119, + "logps/chosen": -486.0743103027344, + "logps/rejected": -463.4774475097656, + "loss": 0.0847, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.33536434173584, + "rewards/margins": 9.718295097351074, + "rewards/rejected": -16.053659439086914, + "step": 13650 + }, + { + "epoch": 2.65, + "learning_rate": 6.446393902351333e-08, + "logits/chosen": -2.5283124446868896, + "logits/rejected": -2.4674930572509766, + "logps/chosen": -228.8594970703125, + "logps/rejected": -392.8613586425781, + "loss": 0.0877, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.650363922119141, + "rewards/margins": 11.017718315124512, + "rewards/rejected": -16.668081283569336, + "step": 13660 + }, + { + "epoch": 2.65, + "learning_rate": 6.410440785216079e-08, + "logits/chosen": -2.524414539337158, + "logits/rejected": -2.442152500152588, + "logps/chosen": -270.5942077636719, + "logps/rejected": -389.4601135253906, + "loss": 0.0662, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.495218276977539, + "rewards/margins": 13.683825492858887, + "rewards/rejected": -24.17904281616211, + "step": 13670 + }, + { + "epoch": 2.66, + "learning_rate": 6.374487668080822e-08, + "logits/chosen": -2.6003825664520264, + "logits/rejected": -2.6286122798919678, + "logps/chosen": -243.40072631835938, + "logps/rejected": -433.07403564453125, + "loss": 0.0848, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.930796146392822, + "rewards/margins": 18.22975730895996, + "rewards/rejected": -25.16055679321289, + "step": 13680 + }, + { + "epoch": 2.66, + "learning_rate": 6.338534550945567e-08, + "logits/chosen": -2.459526300430298, + "logits/rejected": -2.2914116382598877, + "logps/chosen": -256.80401611328125, + "logps/rejected": -368.97979736328125, + "loss": 0.0496, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.247989654541016, + "rewards/margins": 16.199419021606445, + "rewards/rejected": -21.447406768798828, + "step": 13690 + }, + { + "epoch": 2.66, + "learning_rate": 6.302581433810311e-08, + "logits/chosen": -2.480177640914917, + "logits/rejected": -2.5111031532287598, + "logps/chosen": -261.7010498046875, + "logps/rejected": -458.08294677734375, + "loss": 0.0848, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8917877674102783, + "rewards/margins": 16.93520736694336, + "rewards/rejected": -20.826993942260742, + "step": 13700 + }, + { + "epoch": 2.66, + "eval_logits/chosen": -2.495516777038574, + "eval_logits/rejected": -2.471787214279175, + "eval_logps/chosen": -315.3421936035156, + "eval_logps/rejected": -375.2400817871094, + "eval_loss": 0.6501221656799316, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -12.110413551330566, + "eval_rewards/margins": 8.221999168395996, + "eval_rewards/rejected": -20.332412719726562, + "eval_runtime": 141.4151, + "eval_samples_per_second": 22.317, + "eval_steps_per_second": 0.354, + "step": 13700 + }, + { + "epoch": 2.66, + "learning_rate": 6.266628316675055e-08, + "logits/chosen": -2.5079479217529297, + "logits/rejected": -2.4605298042297363, + "logps/chosen": -314.8881530761719, + "logps/rejected": -456.40277099609375, + "loss": 0.0828, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.168451309204102, + "rewards/margins": 13.903965950012207, + "rewards/rejected": -22.07241439819336, + "step": 13710 + }, + { + "epoch": 2.66, + "learning_rate": 6.2306751995398e-08, + "logits/chosen": -2.495927095413208, + "logits/rejected": -2.5274269580841064, + "logps/chosen": -259.0688171386719, + "logps/rejected": -360.857666015625, + "loss": 0.062, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.026102066040039, + "rewards/margins": 13.894006729125977, + "rewards/rejected": -19.920108795166016, + "step": 13720 + }, + { + "epoch": 2.67, + "learning_rate": 6.194722082404545e-08, + "logits/chosen": -2.5711963176727295, + "logits/rejected": -2.526247024536133, + "logps/chosen": -256.63238525390625, + "logps/rejected": -403.71551513671875, + "loss": 0.0447, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.342735290527344, + "rewards/margins": 12.615774154663086, + "rewards/rejected": -18.958511352539062, + "step": 13730 + }, + { + "epoch": 2.67, + "learning_rate": 6.158768965269289e-08, + "logits/chosen": -2.7091312408447266, + "logits/rejected": -2.7500460147857666, + "logps/chosen": -395.78179931640625, + "logps/rejected": -448.81500244140625, + "loss": 0.0805, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.350695610046387, + "rewards/margins": 12.892354965209961, + "rewards/rejected": -19.243051528930664, + "step": 13740 + }, + { + "epoch": 2.67, + "learning_rate": 6.122815848134033e-08, + "logits/chosen": -2.5087428092956543, + "logits/rejected": -2.406790256500244, + "logps/chosen": -275.4613342285156, + "logps/rejected": -355.447509765625, + "loss": 0.101, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.58680248260498, + "rewards/margins": 11.131608963012695, + "rewards/rejected": -20.71841049194336, + "step": 13750 + }, + { + "epoch": 2.67, + "learning_rate": 6.086862730998777e-08, + "logits/chosen": -2.542536497116089, + "logits/rejected": -2.377361536026001, + "logps/chosen": -266.2630310058594, + "logps/rejected": -404.77392578125, + "loss": 0.0893, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.279408931732178, + "rewards/margins": 12.215632438659668, + "rewards/rejected": -19.495040893554688, + "step": 13760 + }, + { + "epoch": 2.67, + "learning_rate": 6.050909613863521e-08, + "logits/chosen": -2.49064564704895, + "logits/rejected": -2.488398551940918, + "logps/chosen": -245.0917510986328, + "logps/rejected": -366.75933837890625, + "loss": 0.0649, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.33775806427002, + "rewards/margins": 10.682060241699219, + "rewards/rejected": -21.019817352294922, + "step": 13770 + }, + { + "epoch": 2.68, + "learning_rate": 6.014956496728267e-08, + "logits/chosen": -2.468036413192749, + "logits/rejected": -2.4206693172454834, + "logps/chosen": -279.8471984863281, + "logps/rejected": -357.43023681640625, + "loss": 0.064, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.859151840209961, + "rewards/margins": 8.90800666809082, + "rewards/rejected": -17.76715850830078, + "step": 13780 + }, + { + "epoch": 2.68, + "learning_rate": 5.979003379593011e-08, + "logits/chosen": -2.41125226020813, + "logits/rejected": -2.4057607650756836, + "logps/chosen": -302.2420959472656, + "logps/rejected": -527.14208984375, + "loss": 0.0629, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.266284942626953, + "rewards/margins": 16.92730140686035, + "rewards/rejected": -22.193584442138672, + "step": 13790 + }, + { + "epoch": 2.68, + "learning_rate": 5.943050262457755e-08, + "logits/chosen": -2.621866464614868, + "logits/rejected": -2.6111340522766113, + "logps/chosen": -347.26025390625, + "logps/rejected": -494.755615234375, + "loss": 0.0724, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.864444732666016, + "rewards/margins": 14.596409797668457, + "rewards/rejected": -22.46085548400879, + "step": 13800 + }, + { + "epoch": 2.68, + "eval_logits/chosen": -2.5076606273651123, + "eval_logits/rejected": -2.4851980209350586, + "eval_logps/chosen": -317.7358093261719, + "eval_logps/rejected": -375.9327697753906, + "eval_loss": 0.6394022703170776, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -12.349776268005371, + "eval_rewards/margins": 8.051904678344727, + "eval_rewards/rejected": -20.40167999267578, + "eval_runtime": 140.5761, + "eval_samples_per_second": 22.45, + "eval_steps_per_second": 0.356, + "step": 13800 + }, + { + "epoch": 2.68, + "learning_rate": 5.907097145322499e-08, + "logits/chosen": -2.4033799171447754, + "logits/rejected": -2.4252820014953613, + "logps/chosen": -291.6982116699219, + "logps/rejected": -412.61224365234375, + "loss": 0.0885, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -10.278039932250977, + "rewards/margins": 11.062031745910645, + "rewards/rejected": -21.340072631835938, + "step": 13810 + }, + { + "epoch": 2.68, + "learning_rate": 5.871144028187244e-08, + "logits/chosen": -2.5429272651672363, + "logits/rejected": -2.42922043800354, + "logps/chosen": -362.37518310546875, + "logps/rejected": -408.1888732910156, + "loss": 0.0805, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.56817626953125, + "rewards/margins": 13.611679077148438, + "rewards/rejected": -23.179853439331055, + "step": 13820 + }, + { + "epoch": 2.68, + "learning_rate": 5.8351909110519886e-08, + "logits/chosen": -2.604635238647461, + "logits/rejected": -2.6315605640411377, + "logps/chosen": -279.68341064453125, + "logps/rejected": -385.7822570800781, + "loss": 0.064, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.096555709838867, + "rewards/margins": 13.717987060546875, + "rewards/rejected": -21.81454086303711, + "step": 13830 + }, + { + "epoch": 2.69, + "learning_rate": 5.799237793916732e-08, + "logits/chosen": -2.633336305618286, + "logits/rejected": -2.543452739715576, + "logps/chosen": -286.7044677734375, + "logps/rejected": -379.36456298828125, + "loss": 0.0672, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.849825382232666, + "rewards/margins": 18.00542449951172, + "rewards/rejected": -23.855249404907227, + "step": 13840 + }, + { + "epoch": 2.69, + "learning_rate": 5.763284676781477e-08, + "logits/chosen": -2.496095657348633, + "logits/rejected": -2.6102306842803955, + "logps/chosen": -261.90948486328125, + "logps/rejected": -398.29937744140625, + "loss": 0.0534, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.025846481323242, + "rewards/margins": 9.977482795715332, + "rewards/rejected": -19.00333023071289, + "step": 13850 + }, + { + "epoch": 2.69, + "learning_rate": 5.727331559646221e-08, + "logits/chosen": -2.4396555423736572, + "logits/rejected": -2.5189690589904785, + "logps/chosen": -268.1959228515625, + "logps/rejected": -459.5834045410156, + "loss": 0.0711, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.469121932983398, + "rewards/margins": 18.137006759643555, + "rewards/rejected": -26.606128692626953, + "step": 13860 + }, + { + "epoch": 2.69, + "learning_rate": 5.6913784425109657e-08, + "logits/chosen": -2.403203010559082, + "logits/rejected": -2.50626802444458, + "logps/chosen": -234.3402862548828, + "logps/rejected": -384.98468017578125, + "loss": 0.0632, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.300580978393555, + "rewards/margins": 11.250377655029297, + "rewards/rejected": -20.55095863342285, + "step": 13870 + }, + { + "epoch": 2.69, + "learning_rate": 5.65542532537571e-08, + "logits/chosen": -2.5631232261657715, + "logits/rejected": -2.540283679962158, + "logps/chosen": -224.9944610595703, + "logps/rejected": -361.48199462890625, + "loss": 0.0817, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.571410179138184, + "rewards/margins": 16.1390438079834, + "rewards/rejected": -21.710453033447266, + "step": 13880 + }, + { + "epoch": 2.7, + "learning_rate": 5.619472208240454e-08, + "logits/chosen": -2.57092022895813, + "logits/rejected": -2.511808156967163, + "logps/chosen": -273.9009094238281, + "logps/rejected": -377.3023986816406, + "loss": 0.0749, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.2095308303833, + "rewards/margins": 13.39879035949707, + "rewards/rejected": -21.608320236206055, + "step": 13890 + }, + { + "epoch": 2.7, + "learning_rate": 5.5835190911051986e-08, + "logits/chosen": -2.5521140098571777, + "logits/rejected": -2.5139479637145996, + "logps/chosen": -292.22210693359375, + "logps/rejected": -453.43310546875, + "loss": 0.0735, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -12.644539833068848, + "rewards/margins": 15.819013595581055, + "rewards/rejected": -28.46355628967285, + "step": 13900 + }, + { + "epoch": 2.7, + "eval_logits/chosen": -2.4578866958618164, + "eval_logits/rejected": -2.4329802989959717, + "eval_logps/chosen": -324.872802734375, + "eval_logps/rejected": -386.6412353515625, + "eval_loss": 0.6576498746871948, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -13.063472747802734, + "eval_rewards/margins": 8.409056663513184, + "eval_rewards/rejected": -21.4725284576416, + "eval_runtime": 140.9434, + "eval_samples_per_second": 22.392, + "eval_steps_per_second": 0.355, + "step": 13900 + }, + { + "epoch": 2.7, + "learning_rate": 5.547565973969943e-08, + "logits/chosen": -2.6002488136291504, + "logits/rejected": -2.5537025928497314, + "logps/chosen": -273.44207763671875, + "logps/rejected": -367.67156982421875, + "loss": 0.1011, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.493548393249512, + "rewards/margins": 9.718470573425293, + "rewards/rejected": -19.212020874023438, + "step": 13910 + }, + { + "epoch": 2.7, + "learning_rate": 5.5116128568346875e-08, + "logits/chosen": -2.3440332412719727, + "logits/rejected": -2.3300106525421143, + "logps/chosen": -209.345947265625, + "logps/rejected": -338.5674743652344, + "loss": 0.081, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -13.825662612915039, + "rewards/margins": 10.73725414276123, + "rewards/rejected": -24.562911987304688, + "step": 13920 + }, + { + "epoch": 2.7, + "learning_rate": 5.4756597396994316e-08, + "logits/chosen": -2.546124219894409, + "logits/rejected": -2.4810428619384766, + "logps/chosen": -317.8944396972656, + "logps/rejected": -444.1865234375, + "loss": 0.0534, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.366611957550049, + "rewards/margins": 13.541949272155762, + "rewards/rejected": -16.90856170654297, + "step": 13930 + }, + { + "epoch": 2.71, + "learning_rate": 5.4397066225641764e-08, + "logits/chosen": -2.450314521789551, + "logits/rejected": -2.4597702026367188, + "logps/chosen": -285.3365173339844, + "logps/rejected": -410.66046142578125, + "loss": 0.073, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.780367374420166, + "rewards/margins": 17.328641891479492, + "rewards/rejected": -23.109010696411133, + "step": 13940 + }, + { + "epoch": 2.71, + "learning_rate": 5.4037535054289205e-08, + "logits/chosen": -2.4047322273254395, + "logits/rejected": -2.4492080211639404, + "logps/chosen": -326.6347351074219, + "logps/rejected": -538.2609252929688, + "loss": 0.0784, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.95956039428711, + "rewards/margins": 20.275020599365234, + "rewards/rejected": -30.234582901000977, + "step": 13950 + }, + { + "epoch": 2.71, + "learning_rate": 5.3678003882936646e-08, + "logits/chosen": -2.460026502609253, + "logits/rejected": -2.4137377738952637, + "logps/chosen": -303.47589111328125, + "logps/rejected": -330.0078125, + "loss": 0.1096, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.023459434509277, + "rewards/margins": 10.610163688659668, + "rewards/rejected": -17.633625030517578, + "step": 13960 + }, + { + "epoch": 2.71, + "learning_rate": 5.3318472711584094e-08, + "logits/chosen": -2.5243473052978516, + "logits/rejected": -2.5143184661865234, + "logps/chosen": -266.847412109375, + "logps/rejected": -472.657470703125, + "loss": 0.0632, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.299185752868652, + "rewards/margins": 18.07523536682129, + "rewards/rejected": -25.374420166015625, + "step": 13970 + }, + { + "epoch": 2.71, + "learning_rate": 5.2958941540231535e-08, + "logits/chosen": -2.412656307220459, + "logits/rejected": -2.2765166759490967, + "logps/chosen": -369.3609313964844, + "logps/rejected": -480.7267150878906, + "loss": 0.0753, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.565628051757812, + "rewards/margins": 18.540231704711914, + "rewards/rejected": -27.105859756469727, + "step": 13980 + }, + { + "epoch": 2.72, + "learning_rate": 5.259941036887898e-08, + "logits/chosen": -2.423884630203247, + "logits/rejected": -2.42649245262146, + "logps/chosen": -296.9546203613281, + "logps/rejected": -418.84246826171875, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.2961201667785645, + "rewards/margins": 15.501123428344727, + "rewards/rejected": -20.7972412109375, + "step": 13990 + }, + { + "epoch": 2.72, + "learning_rate": 5.2239879197526423e-08, + "logits/chosen": -2.2800049781799316, + "logits/rejected": -2.2325406074523926, + "logps/chosen": -305.71868896484375, + "logps/rejected": -476.806884765625, + "loss": 0.0836, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.169608116149902, + "rewards/margins": 18.147136688232422, + "rewards/rejected": -24.316747665405273, + "step": 14000 + }, + { + "epoch": 2.72, + "eval_logits/chosen": -2.4533159732818604, + "eval_logits/rejected": -2.4284005165100098, + "eval_logps/chosen": -321.3069152832031, + "eval_logps/rejected": -380.9180603027344, + "eval_loss": 0.6427257061004639, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -12.70688247680664, + "eval_rewards/margins": 8.193329811096191, + "eval_rewards/rejected": -20.90021324157715, + "eval_runtime": 140.6044, + "eval_samples_per_second": 22.446, + "eval_steps_per_second": 0.356, + "step": 14000 + }, + { + "epoch": 2.72, + "learning_rate": 5.1880348026173864e-08, + "logits/chosen": -2.2966625690460205, + "logits/rejected": -2.3146255016326904, + "logps/chosen": -294.59881591796875, + "logps/rejected": -519.9945678710938, + "loss": 0.0586, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.670251846313477, + "rewards/margins": 20.10184669494629, + "rewards/rejected": -30.7720947265625, + "step": 14010 + }, + { + "epoch": 2.72, + "learning_rate": 5.152081685482131e-08, + "logits/chosen": -2.43430233001709, + "logits/rejected": -2.4862494468688965, + "logps/chosen": -319.15325927734375, + "logps/rejected": -460.1707458496094, + "loss": 0.0484, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3942723274230957, + "rewards/margins": 16.40558433532715, + "rewards/rejected": -18.799856185913086, + "step": 14020 + }, + { + "epoch": 2.72, + "learning_rate": 5.116128568346875e-08, + "logits/chosen": -2.4217216968536377, + "logits/rejected": -2.3965320587158203, + "logps/chosen": -292.21380615234375, + "logps/rejected": -457.8392028808594, + "loss": 0.0571, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.500149726867676, + "rewards/margins": 12.409701347351074, + "rewards/rejected": -19.90985107421875, + "step": 14030 + }, + { + "epoch": 2.73, + "learning_rate": 5.08017545121162e-08, + "logits/chosen": -2.5222690105438232, + "logits/rejected": -2.5466718673706055, + "logps/chosen": -343.06988525390625, + "logps/rejected": -414.82318115234375, + "loss": 0.0769, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.995353698730469, + "rewards/margins": 13.598800659179688, + "rewards/rejected": -21.594152450561523, + "step": 14040 + }, + { + "epoch": 2.73, + "learning_rate": 5.0442223340763635e-08, + "logits/chosen": -2.6274986267089844, + "logits/rejected": -2.628922700881958, + "logps/chosen": -313.6103210449219, + "logps/rejected": -421.7408142089844, + "loss": 0.0995, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.671439170837402, + "rewards/margins": 9.996234893798828, + "rewards/rejected": -16.667675018310547, + "step": 14050 + }, + { + "epoch": 2.73, + "learning_rate": 5.008269216941108e-08, + "logits/chosen": -2.5362377166748047, + "logits/rejected": -2.470487117767334, + "logps/chosen": -264.26251220703125, + "logps/rejected": -457.0308532714844, + "loss": 0.0884, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.933724880218506, + "rewards/margins": 14.920491218566895, + "rewards/rejected": -22.854215621948242, + "step": 14060 + }, + { + "epoch": 2.73, + "learning_rate": 4.972316099805853e-08, + "logits/chosen": -2.618799924850464, + "logits/rejected": -2.589552402496338, + "logps/chosen": -285.24468994140625, + "logps/rejected": -407.5867614746094, + "loss": 0.0707, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.739290714263916, + "rewards/margins": 12.876055717468262, + "rewards/rejected": -19.615345001220703, + "step": 14070 + }, + { + "epoch": 2.73, + "learning_rate": 4.936362982670597e-08, + "logits/chosen": -2.5255298614501953, + "logits/rejected": -2.544981002807617, + "logps/chosen": -282.092041015625, + "logps/rejected": -420.1019592285156, + "loss": 0.0859, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.3901495933532715, + "rewards/margins": 14.408068656921387, + "rewards/rejected": -21.798221588134766, + "step": 14080 + }, + { + "epoch": 2.74, + "learning_rate": 4.900409865535342e-08, + "logits/chosen": -2.2034499645233154, + "logits/rejected": -2.360098361968994, + "logps/chosen": -320.4710388183594, + "logps/rejected": -554.2467651367188, + "loss": 0.0663, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.402893543243408, + "rewards/margins": 20.466861724853516, + "rewards/rejected": -24.869754791259766, + "step": 14090 + }, + { + "epoch": 2.74, + "learning_rate": 4.864456748400086e-08, + "logits/chosen": -2.5468287467956543, + "logits/rejected": -2.4685747623443604, + "logps/chosen": -282.1744079589844, + "logps/rejected": -353.25091552734375, + "loss": 0.0647, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.291839599609375, + "rewards/margins": 14.212237358093262, + "rewards/rejected": -17.504077911376953, + "step": 14100 + }, + { + "epoch": 2.74, + "eval_logits/chosen": -2.4540998935699463, + "eval_logits/rejected": -2.4287192821502686, + "eval_logps/chosen": -318.9844055175781, + "eval_logps/rejected": -378.7882080078125, + "eval_loss": 0.6444785594940186, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -12.474638938903809, + "eval_rewards/margins": 8.212586402893066, + "eval_rewards/rejected": -20.68722152709961, + "eval_runtime": 139.219, + "eval_samples_per_second": 22.669, + "eval_steps_per_second": 0.359, + "step": 14100 + }, + { + "epoch": 2.74, + "learning_rate": 4.82850363126483e-08, + "logits/chosen": -2.660529613494873, + "logits/rejected": -2.6550545692443848, + "logps/chosen": -290.40789794921875, + "logps/rejected": -378.42462158203125, + "loss": 0.0318, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.830068588256836, + "rewards/margins": 11.361506462097168, + "rewards/rejected": -16.19157600402832, + "step": 14110 + }, + { + "epoch": 2.74, + "learning_rate": 4.792550514129575e-08, + "logits/chosen": -2.360069513320923, + "logits/rejected": -2.2975263595581055, + "logps/chosen": -245.1154022216797, + "logps/rejected": -372.20867919921875, + "loss": 0.0574, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.051399230957031, + "rewards/margins": 13.881940841674805, + "rewards/rejected": -22.933340072631836, + "step": 14120 + }, + { + "epoch": 2.74, + "learning_rate": 4.756597396994319e-08, + "logits/chosen": -2.5174691677093506, + "logits/rejected": -2.560690402984619, + "logps/chosen": -306.4330749511719, + "logps/rejected": -424.43280029296875, + "loss": 0.0575, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -11.856558799743652, + "rewards/margins": 12.972930908203125, + "rewards/rejected": -24.829492568969727, + "step": 14130 + }, + { + "epoch": 2.75, + "learning_rate": 4.720644279859064e-08, + "logits/chosen": -2.5840866565704346, + "logits/rejected": -2.532588481903076, + "logps/chosen": -235.2403106689453, + "logps/rejected": -357.3543701171875, + "loss": 0.0707, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.2822160720825195, + "rewards/margins": 13.625099182128906, + "rewards/rejected": -20.90731430053711, + "step": 14140 + }, + { + "epoch": 2.75, + "learning_rate": 4.684691162723808e-08, + "logits/chosen": -2.5340611934661865, + "logits/rejected": -2.538301944732666, + "logps/chosen": -262.1207275390625, + "logps/rejected": -438.6947326660156, + "loss": 0.1069, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.790421485900879, + "rewards/margins": 15.174471855163574, + "rewards/rejected": -20.964893341064453, + "step": 14150 + }, + { + "epoch": 2.75, + "learning_rate": 4.6487380455885527e-08, + "logits/chosen": -2.373089551925659, + "logits/rejected": -2.279886484146118, + "logps/chosen": -240.9802703857422, + "logps/rejected": -349.52069091796875, + "loss": 0.0919, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.635541915893555, + "rewards/margins": 9.87258529663086, + "rewards/rejected": -18.508129119873047, + "step": 14160 + }, + { + "epoch": 2.75, + "learning_rate": 4.612784928453297e-08, + "logits/chosen": -2.5288782119750977, + "logits/rejected": -2.490088939666748, + "logps/chosen": -276.7637939453125, + "logps/rejected": -387.97308349609375, + "loss": 0.0731, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.757145881652832, + "rewards/margins": 15.035905838012695, + "rewards/rejected": -20.79305076599121, + "step": 14170 + }, + { + "epoch": 2.75, + "learning_rate": 4.576831811318041e-08, + "logits/chosen": -2.5628132820129395, + "logits/rejected": -2.627485990524292, + "logps/chosen": -257.33306884765625, + "logps/rejected": -374.7657165527344, + "loss": 0.0621, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.0018954277038574, + "rewards/margins": 12.33110237121582, + "rewards/rejected": -15.33299732208252, + "step": 14180 + }, + { + "epoch": 2.75, + "learning_rate": 4.5408786941827856e-08, + "logits/chosen": -2.4532923698425293, + "logits/rejected": -2.4614763259887695, + "logps/chosen": -241.8795166015625, + "logps/rejected": -403.23516845703125, + "loss": 0.0661, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.907286643981934, + "rewards/margins": 15.930780410766602, + "rewards/rejected": -20.83806610107422, + "step": 14190 + }, + { + "epoch": 2.76, + "learning_rate": 4.50492557704753e-08, + "logits/chosen": -2.714437961578369, + "logits/rejected": -2.67118501663208, + "logps/chosen": -394.26824951171875, + "logps/rejected": -484.1063537597656, + "loss": 0.0732, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.206911563873291, + "rewards/margins": 17.77530860900879, + "rewards/rejected": -19.982221603393555, + "step": 14200 + }, + { + "epoch": 2.76, + "eval_logits/chosen": -2.451594591140747, + "eval_logits/rejected": -2.427086114883423, + "eval_logps/chosen": -310.6434326171875, + "eval_logps/rejected": -369.8207092285156, + "eval_loss": 0.651365339756012, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -11.640534400939941, + "eval_rewards/margins": 8.149942398071289, + "eval_rewards/rejected": -19.790475845336914, + "eval_runtime": 140.0649, + "eval_samples_per_second": 22.532, + "eval_steps_per_second": 0.357, + "step": 14200 + }, + { + "epoch": 2.76, + "learning_rate": 4.4689724599122745e-08, + "logits/chosen": -2.5125339031219482, + "logits/rejected": -2.4772400856018066, + "logps/chosen": -290.7689208984375, + "logps/rejected": -453.24267578125, + "loss": 0.0621, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.635432243347168, + "rewards/margins": 16.643768310546875, + "rewards/rejected": -25.27920150756836, + "step": 14210 + }, + { + "epoch": 2.76, + "learning_rate": 4.433019342777018e-08, + "logits/chosen": -2.4146082401275635, + "logits/rejected": -2.426396608352661, + "logps/chosen": -310.6662292480469, + "logps/rejected": -390.3843994140625, + "loss": 0.0801, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -10.701229095458984, + "rewards/margins": 11.596210479736328, + "rewards/rejected": -22.297443389892578, + "step": 14220 + }, + { + "epoch": 2.76, + "learning_rate": 4.397066225641763e-08, + "logits/chosen": -2.708631992340088, + "logits/rejected": -2.660353183746338, + "logps/chosen": -370.0521240234375, + "logps/rejected": -392.21044921875, + "loss": 0.0561, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.522287368774414, + "rewards/margins": 8.776540756225586, + "rewards/rejected": -13.298826217651367, + "step": 14230 + }, + { + "epoch": 2.76, + "learning_rate": 4.3611131085065075e-08, + "logits/chosen": -2.3451640605926514, + "logits/rejected": -2.228459119796753, + "logps/chosen": -216.63671875, + "logps/rejected": -359.85760498046875, + "loss": 0.0713, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.85043716430664, + "rewards/margins": 12.831878662109375, + "rewards/rejected": -22.68231773376465, + "step": 14240 + }, + { + "epoch": 2.77, + "learning_rate": 4.3251599913712516e-08, + "logits/chosen": -2.4758808612823486, + "logits/rejected": -2.4504141807556152, + "logps/chosen": -301.84912109375, + "logps/rejected": -440.341064453125, + "loss": 0.0753, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.745691776275635, + "rewards/margins": 19.071266174316406, + "rewards/rejected": -24.816957473754883, + "step": 14250 + }, + { + "epoch": 2.77, + "learning_rate": 4.2892068742359964e-08, + "logits/chosen": -2.447396993637085, + "logits/rejected": -2.5197434425354004, + "logps/chosen": -286.86614990234375, + "logps/rejected": -557.3042602539062, + "loss": 0.0847, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.774168968200684, + "rewards/margins": 26.642169952392578, + "rewards/rejected": -34.416343688964844, + "step": 14260 + }, + { + "epoch": 2.77, + "learning_rate": 4.25325375710074e-08, + "logits/chosen": -2.426670789718628, + "logits/rejected": -2.3398423194885254, + "logps/chosen": -275.36669921875, + "logps/rejected": -398.4294128417969, + "loss": 0.0647, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.7752203941345215, + "rewards/margins": 14.710901260375977, + "rewards/rejected": -20.486120223999023, + "step": 14270 + }, + { + "epoch": 2.77, + "learning_rate": 4.2173006399654846e-08, + "logits/chosen": -2.654876470565796, + "logits/rejected": -2.5388033390045166, + "logps/chosen": -307.33367919921875, + "logps/rejected": -427.08544921875, + "loss": 0.0744, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.097901821136475, + "rewards/margins": 14.917327880859375, + "rewards/rejected": -21.01523208618164, + "step": 14280 + }, + { + "epoch": 2.77, + "learning_rate": 4.1813475228302294e-08, + "logits/chosen": -2.476609468460083, + "logits/rejected": -2.4240591526031494, + "logps/chosen": -317.2851867675781, + "logps/rejected": -396.5350341796875, + "loss": 0.0685, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9136959314346313, + "rewards/margins": 15.125018119812012, + "rewards/rejected": -17.038715362548828, + "step": 14290 + }, + { + "epoch": 2.78, + "learning_rate": 4.1453944056949735e-08, + "logits/chosen": -2.445927381515503, + "logits/rejected": -2.392014265060425, + "logps/chosen": -263.6961364746094, + "logps/rejected": -442.0687561035156, + "loss": 0.05, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.59628963470459, + "rewards/margins": 16.359914779663086, + "rewards/rejected": -21.95620346069336, + "step": 14300 + }, + { + "epoch": 2.78, + "eval_logits/chosen": -2.4207875728607178, + "eval_logits/rejected": -2.3947505950927734, + "eval_logps/chosen": -315.3155517578125, + "eval_logps/rejected": -376.6313781738281, + "eval_loss": 0.6599265933036804, + "eval_rewards/accuracies": 0.7124999761581421, + "eval_rewards/chosen": -12.107748985290527, + "eval_rewards/margins": 8.363792419433594, + "eval_rewards/rejected": -20.471540451049805, + "eval_runtime": 139.4649, + "eval_samples_per_second": 22.629, + "eval_steps_per_second": 0.359, + "step": 14300 + }, + { + "epoch": 2.78, + "learning_rate": 4.109441288559718e-08, + "logits/chosen": -2.540937900543213, + "logits/rejected": -2.498613119125366, + "logps/chosen": -379.290771484375, + "logps/rejected": -407.0986022949219, + "loss": 0.0842, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.659222602844238, + "rewards/margins": 10.250799179077148, + "rewards/rejected": -20.910022735595703, + "step": 14310 + }, + { + "epoch": 2.78, + "learning_rate": 4.0734881714244623e-08, + "logits/chosen": -2.5176444053649902, + "logits/rejected": -2.5706937313079834, + "logps/chosen": -297.27545166015625, + "logps/rejected": -507.0318908691406, + "loss": 0.0887, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.921635627746582, + "rewards/margins": 15.953900337219238, + "rewards/rejected": -25.875537872314453, + "step": 14320 + }, + { + "epoch": 2.78, + "learning_rate": 4.0375350542892064e-08, + "logits/chosen": -2.4668803215026855, + "logits/rejected": -2.3793420791625977, + "logps/chosen": -253.43984985351562, + "logps/rejected": -444.0877990722656, + "loss": 0.0847, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.362646102905273, + "rewards/margins": 16.675519943237305, + "rewards/rejected": -25.038164138793945, + "step": 14330 + }, + { + "epoch": 2.78, + "learning_rate": 4.001581937153951e-08, + "logits/chosen": -2.4649133682250977, + "logits/rejected": -2.458911418914795, + "logps/chosen": -280.91998291015625, + "logps/rejected": -424.6297302246094, + "loss": 0.0844, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.609391689300537, + "rewards/margins": 15.765634536743164, + "rewards/rejected": -19.37502670288086, + "step": 14340 + }, + { + "epoch": 2.79, + "learning_rate": 3.965628820018695e-08, + "logits/chosen": -2.443978786468506, + "logits/rejected": -2.4482474327087402, + "logps/chosen": -300.916748046875, + "logps/rejected": -433.1073303222656, + "loss": 0.0512, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.690673828125, + "rewards/margins": 14.432500839233398, + "rewards/rejected": -23.1231746673584, + "step": 14350 + }, + { + "epoch": 2.79, + "learning_rate": 3.92967570288344e-08, + "logits/chosen": -2.597811698913574, + "logits/rejected": -2.5616354942321777, + "logps/chosen": -341.90643310546875, + "logps/rejected": -461.83367919921875, + "loss": 0.0363, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.019021511077881, + "rewards/margins": 12.094686508178711, + "rewards/rejected": -19.113710403442383, + "step": 14360 + }, + { + "epoch": 2.79, + "learning_rate": 3.893722585748184e-08, + "logits/chosen": -2.4906532764434814, + "logits/rejected": -2.547138214111328, + "logps/chosen": -245.5009002685547, + "logps/rejected": -529.5654296875, + "loss": 0.0681, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.885739326477051, + "rewards/margins": 19.002941131591797, + "rewards/rejected": -26.888687133789062, + "step": 14370 + }, + { + "epoch": 2.79, + "learning_rate": 3.857769468612928e-08, + "logits/chosen": -2.509801149368286, + "logits/rejected": -2.487269639968872, + "logps/chosen": -312.53515625, + "logps/rejected": -466.1974182128906, + "loss": 0.0575, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.757012367248535, + "rewards/margins": 15.759994506835938, + "rewards/rejected": -25.517005920410156, + "step": 14380 + }, + { + "epoch": 2.79, + "learning_rate": 3.821816351477673e-08, + "logits/chosen": -2.565678358078003, + "logits/rejected": -2.5031778812408447, + "logps/chosen": -271.2328796386719, + "logps/rejected": -356.1014709472656, + "loss": 0.0878, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.773752212524414, + "rewards/margins": 11.247282028198242, + "rewards/rejected": -23.021032333374023, + "step": 14390 + }, + { + "epoch": 2.8, + "learning_rate": 3.785863234342417e-08, + "logits/chosen": -2.51530122756958, + "logits/rejected": -2.491840124130249, + "logps/chosen": -230.9707489013672, + "logps/rejected": -337.8825378417969, + "loss": 0.0881, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.354460716247559, + "rewards/margins": 12.154481887817383, + "rewards/rejected": -21.50894546508789, + "step": 14400 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -2.439577102661133, + "eval_logits/rejected": -2.4138731956481934, + "eval_logps/chosen": -310.17938232421875, + "eval_logps/rejected": -367.84320068359375, + "eval_loss": 0.6584843397140503, + "eval_rewards/accuracies": 0.7149999737739563, + "eval_rewards/chosen": -11.594132423400879, + "eval_rewards/margins": 7.998593330383301, + "eval_rewards/rejected": -19.59272575378418, + "eval_runtime": 141.5963, + "eval_samples_per_second": 22.289, + "eval_steps_per_second": 0.353, + "step": 14400 + }, + { + "epoch": 2.8, + "learning_rate": 3.749910117207162e-08, + "logits/chosen": -2.5827152729034424, + "logits/rejected": -2.5437204837799072, + "logps/chosen": -325.27362060546875, + "logps/rejected": -435.84576416015625, + "loss": 0.0454, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.840313673019409, + "rewards/margins": 11.336217880249023, + "rewards/rejected": -14.176533699035645, + "step": 14410 + }, + { + "epoch": 2.8, + "learning_rate": 3.713957000071906e-08, + "logits/chosen": -2.6532137393951416, + "logits/rejected": -2.5596745014190674, + "logps/chosen": -307.2431640625, + "logps/rejected": -452.42010498046875, + "loss": 0.0455, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3371644020080566, + "rewards/margins": 17.32927703857422, + "rewards/rejected": -20.666439056396484, + "step": 14420 + }, + { + "epoch": 2.8, + "learning_rate": 3.678003882936651e-08, + "logits/chosen": -2.470896005630493, + "logits/rejected": -2.4679017066955566, + "logps/chosen": -242.60079956054688, + "logps/rejected": -369.19329833984375, + "loss": 0.0766, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.54631233215332, + "rewards/margins": 14.59411907196045, + "rewards/rejected": -25.140432357788086, + "step": 14430 + }, + { + "epoch": 2.8, + "learning_rate": 3.642050765801394e-08, + "logits/chosen": -2.413461446762085, + "logits/rejected": -2.347374677658081, + "logps/chosen": -180.55453491210938, + "logps/rejected": -343.64813232421875, + "loss": 0.0664, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.007493495941162, + "rewards/margins": 12.735737800598145, + "rewards/rejected": -18.74323081970215, + "step": 14440 + }, + { + "epoch": 2.81, + "learning_rate": 3.606097648666139e-08, + "logits/chosen": -2.5840563774108887, + "logits/rejected": -2.5611732006073, + "logps/chosen": -302.6882019042969, + "logps/rejected": -461.46966552734375, + "loss": 0.0433, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.802069664001465, + "rewards/margins": 14.225900650024414, + "rewards/rejected": -19.027971267700195, + "step": 14450 + }, + { + "epoch": 2.81, + "learning_rate": 3.570144531530884e-08, + "logits/chosen": -2.698560953140259, + "logits/rejected": -2.543132781982422, + "logps/chosen": -253.9922637939453, + "logps/rejected": -451.609130859375, + "loss": 0.0713, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.911755561828613, + "rewards/margins": 14.107965469360352, + "rewards/rejected": -21.019718170166016, + "step": 14460 + }, + { + "epoch": 2.81, + "learning_rate": 3.534191414395628e-08, + "logits/chosen": -2.545153856277466, + "logits/rejected": -2.606476068496704, + "logps/chosen": -266.3321228027344, + "logps/rejected": -408.425048828125, + "loss": 0.053, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.250962257385254, + "rewards/margins": 17.035131454467773, + "rewards/rejected": -21.28609275817871, + "step": 14470 + }, + { + "epoch": 2.81, + "learning_rate": 3.4982382972603727e-08, + "logits/chosen": -2.0988423824310303, + "logits/rejected": -2.0718460083007812, + "logps/chosen": -222.11117553710938, + "logps/rejected": -371.1849670410156, + "loss": 0.0765, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.412771224975586, + "rewards/margins": 14.51530933380127, + "rewards/rejected": -23.928081512451172, + "step": 14480 + }, + { + "epoch": 2.81, + "learning_rate": 3.462285180125116e-08, + "logits/chosen": -2.4497971534729004, + "logits/rejected": -2.420116662979126, + "logps/chosen": -317.0908203125, + "logps/rejected": -419.55206298828125, + "loss": 0.0707, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.512151718139648, + "rewards/margins": 14.144830703735352, + "rewards/rejected": -22.656982421875, + "step": 14490 + }, + { + "epoch": 2.81, + "learning_rate": 3.426332062989861e-08, + "logits/chosen": -2.499769926071167, + "logits/rejected": -2.5119802951812744, + "logps/chosen": -295.19549560546875, + "logps/rejected": -394.10443115234375, + "loss": 0.0992, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -11.128557205200195, + "rewards/margins": 13.180562973022461, + "rewards/rejected": -24.309118270874023, + "step": 14500 + }, + { + "epoch": 2.81, + "eval_logits/chosen": -2.4285218715667725, + "eval_logits/rejected": -2.4018208980560303, + "eval_logps/chosen": -313.8076477050781, + "eval_logps/rejected": -373.9247741699219, + "eval_loss": 0.6616764068603516, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -11.956953048706055, + "eval_rewards/margins": 8.243927955627441, + "eval_rewards/rejected": -20.200881958007812, + "eval_runtime": 139.1952, + "eval_samples_per_second": 22.673, + "eval_steps_per_second": 0.359, + "step": 14500 + }, + { + "epoch": 2.82, + "learning_rate": 3.3903789458546056e-08, + "logits/chosen": -2.5374388694763184, + "logits/rejected": -2.3807623386383057, + "logps/chosen": -289.66064453125, + "logps/rejected": -416.8006286621094, + "loss": 0.085, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.805605411529541, + "rewards/margins": 16.766782760620117, + "rewards/rejected": -21.572389602661133, + "step": 14510 + }, + { + "epoch": 2.82, + "learning_rate": 3.35442582871935e-08, + "logits/chosen": -2.7303237915039062, + "logits/rejected": -2.5013201236724854, + "logps/chosen": -336.2086486816406, + "logps/rejected": -450.80828857421875, + "loss": 0.0808, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.56033992767334, + "rewards/margins": 16.93686866760254, + "rewards/rejected": -24.497209548950195, + "step": 14520 + }, + { + "epoch": 2.82, + "learning_rate": 3.3184727115840945e-08, + "logits/chosen": -2.517124891281128, + "logits/rejected": -2.518655776977539, + "logps/chosen": -305.09783935546875, + "logps/rejected": -508.779052734375, + "loss": 0.0651, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -10.57075309753418, + "rewards/margins": 16.389467239379883, + "rewards/rejected": -26.960220336914062, + "step": 14530 + }, + { + "epoch": 2.82, + "learning_rate": 3.2825195944488386e-08, + "logits/chosen": -2.4062769412994385, + "logits/rejected": -2.377316951751709, + "logps/chosen": -275.3218994140625, + "logps/rejected": -393.46405029296875, + "loss": 0.0653, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.1832356452941895, + "rewards/margins": 13.256990432739258, + "rewards/rejected": -19.440227508544922, + "step": 14540 + }, + { + "epoch": 2.82, + "learning_rate": 3.246566477313583e-08, + "logits/chosen": -2.5116472244262695, + "logits/rejected": -2.500929355621338, + "logps/chosen": -341.635498046875, + "logps/rejected": -520.7948608398438, + "loss": 0.0544, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.789725303649902, + "rewards/margins": 19.73894500732422, + "rewards/rejected": -25.528671264648438, + "step": 14550 + }, + { + "epoch": 2.83, + "learning_rate": 3.2106133601783275e-08, + "logits/chosen": -2.4713847637176514, + "logits/rejected": -2.4499380588531494, + "logps/chosen": -265.95623779296875, + "logps/rejected": -478.7012634277344, + "loss": 0.114, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -10.011308670043945, + "rewards/margins": 19.960142135620117, + "rewards/rejected": -29.971450805664062, + "step": 14560 + }, + { + "epoch": 2.83, + "learning_rate": 3.1746602430430716e-08, + "logits/chosen": -2.483971118927002, + "logits/rejected": -2.351804733276367, + "logps/chosen": -212.7343292236328, + "logps/rejected": -286.5941467285156, + "loss": 0.0718, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.807860374450684, + "rewards/margins": 11.958721160888672, + "rewards/rejected": -17.76658058166504, + "step": 14570 + }, + { + "epoch": 2.83, + "learning_rate": 3.1387071259078164e-08, + "logits/chosen": -2.4391541481018066, + "logits/rejected": -2.3376450538635254, + "logps/chosen": -275.1197509765625, + "logps/rejected": -363.12860107421875, + "loss": 0.0626, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.140986442565918, + "rewards/margins": 11.326373100280762, + "rewards/rejected": -15.46735954284668, + "step": 14580 + }, + { + "epoch": 2.83, + "learning_rate": 3.1027540087725605e-08, + "logits/chosen": -2.3122918605804443, + "logits/rejected": -2.3308868408203125, + "logps/chosen": -328.8714294433594, + "logps/rejected": -428.346435546875, + "loss": 0.0823, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -13.11317253112793, + "rewards/margins": 11.919618606567383, + "rewards/rejected": -25.03278923034668, + "step": 14590 + }, + { + "epoch": 2.83, + "learning_rate": 3.0668008916373046e-08, + "logits/chosen": -2.377321720123291, + "logits/rejected": -2.3733105659484863, + "logps/chosen": -343.38348388671875, + "logps/rejected": -471.52703857421875, + "loss": 0.0582, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.647674560546875, + "rewards/margins": 15.033966064453125, + "rewards/rejected": -19.681642532348633, + "step": 14600 + }, + { + "epoch": 2.83, + "eval_logits/chosen": -2.42386794090271, + "eval_logits/rejected": -2.396596670150757, + "eval_logps/chosen": -318.4825134277344, + "eval_logps/rejected": -381.66265869140625, + "eval_loss": 0.6692880988121033, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -12.424447059631348, + "eval_rewards/margins": 8.550222396850586, + "eval_rewards/rejected": -20.97467041015625, + "eval_runtime": 139.7094, + "eval_samples_per_second": 22.59, + "eval_steps_per_second": 0.358, + "step": 14600 + }, + { + "epoch": 2.84, + "learning_rate": 3.0308477745020494e-08, + "logits/chosen": -2.5730843544006348, + "logits/rejected": -2.4685885906219482, + "logps/chosen": -201.51568603515625, + "logps/rejected": -315.1352844238281, + "loss": 0.0707, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.461834907531738, + "rewards/margins": 10.430822372436523, + "rewards/rejected": -15.892657279968262, + "step": 14610 + }, + { + "epoch": 2.84, + "learning_rate": 2.9948946573667935e-08, + "logits/chosen": -2.516209840774536, + "logits/rejected": -2.4631283283233643, + "logps/chosen": -323.6180725097656, + "logps/rejected": -402.645263671875, + "loss": 0.0629, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.655837535858154, + "rewards/margins": 10.929776191711426, + "rewards/rejected": -16.585613250732422, + "step": 14620 + }, + { + "epoch": 2.84, + "learning_rate": 2.958941540231538e-08, + "logits/chosen": -2.319056987762451, + "logits/rejected": -2.2761974334716797, + "logps/chosen": -247.5626678466797, + "logps/rejected": -394.25830078125, + "loss": 0.0491, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.647191524505615, + "rewards/margins": 16.012495040893555, + "rewards/rejected": -23.659687042236328, + "step": 14630 + }, + { + "epoch": 2.84, + "learning_rate": 2.922988423096282e-08, + "logits/chosen": -2.4223062992095947, + "logits/rejected": -2.3641715049743652, + "logps/chosen": -291.48016357421875, + "logps/rejected": -394.8805847167969, + "loss": 0.0702, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.563333988189697, + "rewards/margins": 13.369488716125488, + "rewards/rejected": -20.932823181152344, + "step": 14640 + }, + { + "epoch": 2.84, + "learning_rate": 2.8870353059610268e-08, + "logits/chosen": -2.4811291694641113, + "logits/rejected": -2.3399858474731445, + "logps/chosen": -217.7340850830078, + "logps/rejected": -352.4710388183594, + "loss": 0.0988, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.096497535705566, + "rewards/margins": 14.886590957641602, + "rewards/rejected": -22.983089447021484, + "step": 14650 + }, + { + "epoch": 2.85, + "learning_rate": 2.8510821888257712e-08, + "logits/chosen": -2.580836057662964, + "logits/rejected": -2.4918313026428223, + "logps/chosen": -317.5665588378906, + "logps/rejected": -344.79632568359375, + "loss": 0.0719, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.9686279296875, + "rewards/margins": 13.31214714050293, + "rewards/rejected": -21.280776977539062, + "step": 14660 + }, + { + "epoch": 2.85, + "learning_rate": 2.8151290716905153e-08, + "logits/chosen": -2.408627986907959, + "logits/rejected": -2.405900716781616, + "logps/chosen": -267.0467529296875, + "logps/rejected": -408.45208740234375, + "loss": 0.0795, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.903483390808105, + "rewards/margins": 18.04708480834961, + "rewards/rejected": -28.950571060180664, + "step": 14670 + }, + { + "epoch": 2.85, + "learning_rate": 2.7791759545552598e-08, + "logits/chosen": -2.472996234893799, + "logits/rejected": -2.3849222660064697, + "logps/chosen": -350.3518981933594, + "logps/rejected": -338.9212646484375, + "loss": 0.0838, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.101327896118164, + "rewards/margins": 11.6890287399292, + "rewards/rejected": -17.790355682373047, + "step": 14680 + }, + { + "epoch": 2.85, + "learning_rate": 2.7432228374200042e-08, + "logits/chosen": -2.744760513305664, + "logits/rejected": -2.6889748573303223, + "logps/chosen": -425.40118408203125, + "logps/rejected": -422.3814392089844, + "loss": 0.0791, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.307501792907715, + "rewards/margins": 12.430013656616211, + "rewards/rejected": -17.73751449584961, + "step": 14690 + }, + { + "epoch": 2.85, + "learning_rate": 2.7072697202847486e-08, + "logits/chosen": -2.524183750152588, + "logits/rejected": -2.4891419410705566, + "logps/chosen": -275.86749267578125, + "logps/rejected": -376.0390930175781, + "loss": 0.0536, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.14548921585083, + "rewards/margins": 11.856428146362305, + "rewards/rejected": -15.001917839050293, + "step": 14700 + }, + { + "epoch": 2.85, + "eval_logits/chosen": -2.4053714275360107, + "eval_logits/rejected": -2.3763649463653564, + "eval_logps/chosen": -319.3457946777344, + "eval_logps/rejected": -383.76605224609375, + "eval_loss": 0.6742444634437561, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -12.51076889038086, + "eval_rewards/margins": 8.674240112304688, + "eval_rewards/rejected": -21.185009002685547, + "eval_runtime": 142.1209, + "eval_samples_per_second": 22.206, + "eval_steps_per_second": 0.352, + "step": 14700 + }, + { + "epoch": 2.86, + "learning_rate": 2.671316603149493e-08, + "logits/chosen": -2.3960258960723877, + "logits/rejected": -2.2449076175689697, + "logps/chosen": -212.5515899658203, + "logps/rejected": -299.7181091308594, + "loss": 0.0874, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.225173950195312, + "rewards/margins": 10.981122970581055, + "rewards/rejected": -19.206296920776367, + "step": 14710 + }, + { + "epoch": 2.86, + "learning_rate": 2.6353634860142375e-08, + "logits/chosen": -2.500899076461792, + "logits/rejected": -2.508479118347168, + "logps/chosen": -271.1492919921875, + "logps/rejected": -394.25018310546875, + "loss": 0.0627, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.097317695617676, + "rewards/margins": 12.225263595581055, + "rewards/rejected": -16.322580337524414, + "step": 14720 + }, + { + "epoch": 2.86, + "learning_rate": 2.5994103688789816e-08, + "logits/chosen": -2.5822582244873047, + "logits/rejected": -2.525808095932007, + "logps/chosen": -334.08197021484375, + "logps/rejected": -461.976318359375, + "loss": 0.0515, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.583234786987305, + "rewards/margins": 19.960254669189453, + "rewards/rejected": -25.54349136352539, + "step": 14730 + }, + { + "epoch": 2.86, + "learning_rate": 2.563457251743726e-08, + "logits/chosen": -2.625471830368042, + "logits/rejected": -2.450768232345581, + "logps/chosen": -317.8865051269531, + "logps/rejected": -381.4195861816406, + "loss": 0.0533, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.570221424102783, + "rewards/margins": 12.633634567260742, + "rewards/rejected": -15.203857421875, + "step": 14740 + }, + { + "epoch": 2.86, + "learning_rate": 2.52750413460847e-08, + "logits/chosen": -2.3523027896881104, + "logits/rejected": -2.3602569103240967, + "logps/chosen": -268.3021240234375, + "logps/rejected": -550.2041625976562, + "loss": 0.0771, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.813868522644043, + "rewards/margins": 16.246450424194336, + "rewards/rejected": -25.06032371520996, + "step": 14750 + }, + { + "epoch": 2.87, + "learning_rate": 2.491551017473215e-08, + "logits/chosen": -2.4621942043304443, + "logits/rejected": -2.552905797958374, + "logps/chosen": -332.30352783203125, + "logps/rejected": -443.5380859375, + "loss": 0.0986, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.553974628448486, + "rewards/margins": 16.756128311157227, + "rewards/rejected": -21.310102462768555, + "step": 14760 + }, + { + "epoch": 2.87, + "learning_rate": 2.4555979003379594e-08, + "logits/chosen": -2.5724241733551025, + "logits/rejected": -2.546224594116211, + "logps/chosen": -335.9853820800781, + "logps/rejected": -461.4374084472656, + "loss": 0.0539, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.356352806091309, + "rewards/margins": 13.373123168945312, + "rewards/rejected": -21.729475021362305, + "step": 14770 + }, + { + "epoch": 2.87, + "learning_rate": 2.4196447832027035e-08, + "logits/chosen": -2.541684150695801, + "logits/rejected": -2.440366744995117, + "logps/chosen": -338.27825927734375, + "logps/rejected": -430.75390625, + "loss": 0.0773, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.635833740234375, + "rewards/margins": 15.04316520690918, + "rewards/rejected": -23.678998947143555, + "step": 14780 + }, + { + "epoch": 2.87, + "learning_rate": 2.383691666067448e-08, + "logits/chosen": -2.4136452674865723, + "logits/rejected": -2.4098598957061768, + "logps/chosen": -364.2322082519531, + "logps/rejected": -468.85614013671875, + "loss": 0.0526, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.864326477050781, + "rewards/margins": 16.317676544189453, + "rewards/rejected": -25.1820011138916, + "step": 14790 + }, + { + "epoch": 2.87, + "learning_rate": 2.3477385489321923e-08, + "logits/chosen": -2.3216519355773926, + "logits/rejected": -2.349653720855713, + "logps/chosen": -245.6371612548828, + "logps/rejected": -387.172119140625, + "loss": 0.0615, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.777538299560547, + "rewards/margins": 14.355929374694824, + "rewards/rejected": -22.133466720581055, + "step": 14800 + }, + { + "epoch": 2.87, + "eval_logits/chosen": -2.411236047744751, + "eval_logits/rejected": -2.3824241161346436, + "eval_logps/chosen": -322.263671875, + "eval_logps/rejected": -387.9561767578125, + "eval_loss": 0.677561342716217, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -12.802559852600098, + "eval_rewards/margins": 8.801458358764648, + "eval_rewards/rejected": -21.604019165039062, + "eval_runtime": 138.7975, + "eval_samples_per_second": 22.738, + "eval_steps_per_second": 0.36, + "step": 14800 + }, + { + "epoch": 2.88, + "learning_rate": 2.3117854317969364e-08, + "logits/chosen": -2.461608409881592, + "logits/rejected": -2.433798313140869, + "logps/chosen": -277.7593688964844, + "logps/rejected": -433.9398498535156, + "loss": 0.0881, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -11.632577896118164, + "rewards/margins": 16.77115249633789, + "rewards/rejected": -28.403732299804688, + "step": 14810 + }, + { + "epoch": 2.88, + "learning_rate": 2.2758323146616812e-08, + "logits/chosen": -2.4626240730285645, + "logits/rejected": -2.34547758102417, + "logps/chosen": -213.6193084716797, + "logps/rejected": -473.6609802246094, + "loss": 0.0653, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.824320316314697, + "rewards/margins": 19.59589195251465, + "rewards/rejected": -26.420211791992188, + "step": 14820 + }, + { + "epoch": 2.88, + "learning_rate": 2.2398791975264256e-08, + "logits/chosen": -2.5313308238983154, + "logits/rejected": -2.52459716796875, + "logps/chosen": -347.76080322265625, + "logps/rejected": -450.2403869628906, + "loss": 0.0625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.659457206726074, + "rewards/margins": 13.534477233886719, + "rewards/rejected": -19.193933486938477, + "step": 14830 + }, + { + "epoch": 2.88, + "learning_rate": 2.2039260803911698e-08, + "logits/chosen": -2.394092082977295, + "logits/rejected": -2.3857369422912598, + "logps/chosen": -261.8539123535156, + "logps/rejected": -476.70391845703125, + "loss": 0.0487, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.905121803283691, + "rewards/margins": 19.592878341674805, + "rewards/rejected": -29.498001098632812, + "step": 14840 + }, + { + "epoch": 2.88, + "learning_rate": 2.1679729632559142e-08, + "logits/chosen": -2.481632947921753, + "logits/rejected": -2.4351515769958496, + "logps/chosen": -244.8082733154297, + "logps/rejected": -385.21600341796875, + "loss": 0.0747, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.0515594482421875, + "rewards/margins": 17.415645599365234, + "rewards/rejected": -23.467206954956055, + "step": 14850 + }, + { + "epoch": 2.88, + "learning_rate": 2.1320198461206583e-08, + "logits/chosen": -2.3605756759643555, + "logits/rejected": -2.2788777351379395, + "logps/chosen": -301.1482849121094, + "logps/rejected": -394.51361083984375, + "loss": 0.07, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.805695533752441, + "rewards/margins": 15.642021179199219, + "rewards/rejected": -21.447715759277344, + "step": 14860 + }, + { + "epoch": 2.89, + "learning_rate": 2.096066728985403e-08, + "logits/chosen": -2.451914072036743, + "logits/rejected": -2.4734387397766113, + "logps/chosen": -223.73916625976562, + "logps/rejected": -371.39453125, + "loss": 0.0802, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.518346786499023, + "rewards/margins": 13.512170791625977, + "rewards/rejected": -22.030515670776367, + "step": 14870 + }, + { + "epoch": 2.89, + "learning_rate": 2.0601136118501475e-08, + "logits/chosen": -2.565833806991577, + "logits/rejected": -2.570472240447998, + "logps/chosen": -260.41375732421875, + "logps/rejected": -439.21343994140625, + "loss": 0.0969, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.041805267333984, + "rewards/margins": 16.768009185791016, + "rewards/rejected": -26.809818267822266, + "step": 14880 + }, + { + "epoch": 2.89, + "learning_rate": 2.0241604947148916e-08, + "logits/chosen": -2.5179436206817627, + "logits/rejected": -2.4361252784729004, + "logps/chosen": -224.28073120117188, + "logps/rejected": -414.78326416015625, + "loss": 0.0742, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.672027111053467, + "rewards/margins": 15.319323539733887, + "rewards/rejected": -19.991352081298828, + "step": 14890 + }, + { + "epoch": 2.89, + "learning_rate": 1.988207377579636e-08, + "logits/chosen": -2.623081684112549, + "logits/rejected": -2.6000635623931885, + "logps/chosen": -258.5315246582031, + "logps/rejected": -449.21881103515625, + "loss": 0.0532, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.813483238220215, + "rewards/margins": 13.49165153503418, + "rewards/rejected": -21.305133819580078, + "step": 14900 + }, + { + "epoch": 2.89, + "eval_logits/chosen": -2.413811206817627, + "eval_logits/rejected": -2.3852202892303467, + "eval_logps/chosen": -324.2155456542969, + "eval_logps/rejected": -390.41668701171875, + "eval_loss": 0.6768919229507446, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -12.997749328613281, + "eval_rewards/margins": 8.852324485778809, + "eval_rewards/rejected": -21.850074768066406, + "eval_runtime": 138.8929, + "eval_samples_per_second": 22.723, + "eval_steps_per_second": 0.36, + "step": 14900 + }, + { + "epoch": 2.89, + "learning_rate": 1.9522542604443805e-08, + "logits/chosen": -2.52410626411438, + "logits/rejected": -2.432678699493408, + "logps/chosen": -259.6901550292969, + "logps/rejected": -376.5718078613281, + "loss": 0.0363, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.789159774780273, + "rewards/margins": 13.336712837219238, + "rewards/rejected": -22.125871658325195, + "step": 14910 + }, + { + "epoch": 2.9, + "learning_rate": 1.9163011433091246e-08, + "logits/chosen": -2.556885242462158, + "logits/rejected": -2.496314525604248, + "logps/chosen": -292.65008544921875, + "logps/rejected": -459.6458435058594, + "loss": 0.0792, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.866762161254883, + "rewards/margins": 16.4494686126709, + "rewards/rejected": -19.31622886657715, + "step": 14920 + }, + { + "epoch": 2.9, + "learning_rate": 1.8803480261738694e-08, + "logits/chosen": -2.4584484100341797, + "logits/rejected": -2.449239730834961, + "logps/chosen": -236.3944854736328, + "logps/rejected": -328.36199951171875, + "loss": 0.0751, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.435039043426514, + "rewards/margins": 11.45154857635498, + "rewards/rejected": -16.8865909576416, + "step": 14930 + }, + { + "epoch": 2.9, + "learning_rate": 1.8443949090386138e-08, + "logits/chosen": -2.2923905849456787, + "logits/rejected": -2.401318073272705, + "logps/chosen": -293.0608215332031, + "logps/rejected": -418.92205810546875, + "loss": 0.0745, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -11.43484878540039, + "rewards/margins": 13.684107780456543, + "rewards/rejected": -25.118955612182617, + "step": 14940 + }, + { + "epoch": 2.9, + "learning_rate": 1.808441791903358e-08, + "logits/chosen": -2.504848003387451, + "logits/rejected": -2.554367780685425, + "logps/chosen": -334.94842529296875, + "logps/rejected": -376.92767333984375, + "loss": 0.0723, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.64829158782959, + "rewards/margins": 11.120731353759766, + "rewards/rejected": -18.769023895263672, + "step": 14950 + }, + { + "epoch": 2.9, + "learning_rate": 1.7724886747681023e-08, + "logits/chosen": -2.5695011615753174, + "logits/rejected": -2.4521443843841553, + "logps/chosen": -296.80694580078125, + "logps/rejected": -420.30145263671875, + "loss": 0.0621, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.00919246673584, + "rewards/margins": 18.143131256103516, + "rewards/rejected": -25.152324676513672, + "step": 14960 + }, + { + "epoch": 2.91, + "learning_rate": 1.7365355576328464e-08, + "logits/chosen": -2.5077388286590576, + "logits/rejected": -2.5159683227539062, + "logps/chosen": -313.39990234375, + "logps/rejected": -466.9717712402344, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.881387233734131, + "rewards/margins": 19.802248001098633, + "rewards/rejected": -26.68363380432129, + "step": 14970 + }, + { + "epoch": 2.91, + "learning_rate": 1.700582440497591e-08, + "logits/chosen": -2.465104103088379, + "logits/rejected": -2.402278184890747, + "logps/chosen": -335.5147705078125, + "logps/rejected": -430.0679626464844, + "loss": 0.078, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.292757987976074, + "rewards/margins": 12.162251472473145, + "rewards/rejected": -16.45500946044922, + "step": 14980 + }, + { + "epoch": 2.91, + "learning_rate": 1.6646293233623356e-08, + "logits/chosen": -2.5866243839263916, + "logits/rejected": -2.520380973815918, + "logps/chosen": -319.72161865234375, + "logps/rejected": -426.16278076171875, + "loss": 0.0895, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.24250602722168, + "rewards/margins": 11.0344820022583, + "rewards/rejected": -22.276988983154297, + "step": 14990 + }, + { + "epoch": 2.91, + "learning_rate": 1.6286762062270798e-08, + "logits/chosen": -2.519317150115967, + "logits/rejected": -2.5784876346588135, + "logps/chosen": -334.0810852050781, + "logps/rejected": -473.9246520996094, + "loss": 0.0742, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.33923053741455, + "rewards/margins": 15.856913566589355, + "rewards/rejected": -25.196144104003906, + "step": 15000 + }, + { + "epoch": 2.91, + "eval_logits/chosen": -2.4096627235412598, + "eval_logits/rejected": -2.3806543350219727, + "eval_logps/chosen": -327.2182312011719, + "eval_logps/rejected": -394.3062744140625, + "eval_loss": 0.6786009073257446, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": -13.29802131652832, + "eval_rewards/margins": 8.941009521484375, + "eval_rewards/rejected": -22.239030838012695, + "eval_runtime": 141.0306, + "eval_samples_per_second": 22.378, + "eval_steps_per_second": 0.355, + "step": 15000 + }, + { + "epoch": 2.91, + "learning_rate": 1.5927230890918242e-08, + "logits/chosen": -2.4977755546569824, + "logits/rejected": -2.545881748199463, + "logps/chosen": -376.63629150390625, + "logps/rejected": -472.7349548339844, + "loss": 0.087, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.123446464538574, + "rewards/margins": 14.990835189819336, + "rewards/rejected": -21.11428451538086, + "step": 15010 + }, + { + "epoch": 2.92, + "learning_rate": 1.5567699719565686e-08, + "logits/chosen": -2.560659885406494, + "logits/rejected": -2.509192943572998, + "logps/chosen": -285.62042236328125, + "logps/rejected": -486.3465881347656, + "loss": 0.049, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.968367576599121, + "rewards/margins": 15.235417366027832, + "rewards/rejected": -20.20378303527832, + "step": 15020 + }, + { + "epoch": 2.92, + "learning_rate": 1.520816854821313e-08, + "logits/chosen": -2.3575987815856934, + "logits/rejected": -2.4175238609313965, + "logps/chosen": -276.5771789550781, + "logps/rejected": -396.5736083984375, + "loss": 0.0673, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -11.302205085754395, + "rewards/margins": 14.545817375183105, + "rewards/rejected": -25.848018646240234, + "step": 15030 + }, + { + "epoch": 2.92, + "learning_rate": 1.4848637376860573e-08, + "logits/chosen": -2.5779805183410645, + "logits/rejected": -2.4924569129943848, + "logps/chosen": -334.1487121582031, + "logps/rejected": -340.0343322753906, + "loss": 0.0497, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.865668296813965, + "rewards/margins": 10.10538101196289, + "rewards/rejected": -18.971050262451172, + "step": 15040 + }, + { + "epoch": 2.92, + "learning_rate": 1.4489106205508016e-08, + "logits/chosen": -2.425614833831787, + "logits/rejected": -2.4983208179473877, + "logps/chosen": -319.1209411621094, + "logps/rejected": -559.3380126953125, + "loss": 0.0617, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6970150470733643, + "rewards/margins": 18.550779342651367, + "rewards/rejected": -22.247793197631836, + "step": 15050 + }, + { + "epoch": 2.92, + "learning_rate": 1.412957503415546e-08, + "logits/chosen": -2.4435060024261475, + "logits/rejected": -2.3748388290405273, + "logps/chosen": -237.87460327148438, + "logps/rejected": -297.3375244140625, + "loss": 0.0539, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.932347297668457, + "rewards/margins": 9.860334396362305, + "rewards/rejected": -16.792682647705078, + "step": 15060 + }, + { + "epoch": 2.93, + "learning_rate": 1.3770043862802905e-08, + "logits/chosen": -2.5922610759735107, + "logits/rejected": -2.5537643432617188, + "logps/chosen": -318.402587890625, + "logps/rejected": -365.552490234375, + "loss": 0.0919, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.109453201293945, + "rewards/margins": 14.492500305175781, + "rewards/rejected": -22.601953506469727, + "step": 15070 + }, + { + "epoch": 2.93, + "learning_rate": 1.3410512691450349e-08, + "logits/chosen": -2.5073835849761963, + "logits/rejected": -2.505967140197754, + "logps/chosen": -260.0834045410156, + "logps/rejected": -330.8136901855469, + "loss": 0.063, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -11.07208251953125, + "rewards/margins": 10.903297424316406, + "rewards/rejected": -21.975379943847656, + "step": 15080 + }, + { + "epoch": 2.93, + "learning_rate": 1.3050981520097792e-08, + "logits/chosen": -2.3870558738708496, + "logits/rejected": -2.444859743118286, + "logps/chosen": -276.3438415527344, + "logps/rejected": -446.5189514160156, + "loss": 0.0645, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.5569586753845215, + "rewards/margins": 17.061595916748047, + "rewards/rejected": -23.61855697631836, + "step": 15090 + }, + { + "epoch": 2.93, + "learning_rate": 1.2691450348745235e-08, + "logits/chosen": -2.709730863571167, + "logits/rejected": -2.6142489910125732, + "logps/chosen": -305.55706787109375, + "logps/rejected": -433.7855529785156, + "loss": 0.0626, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.537403583526611, + "rewards/margins": 13.999174118041992, + "rewards/rejected": -19.536579132080078, + "step": 15100 + }, + { + "epoch": 2.93, + "eval_logits/chosen": -2.425344228744507, + "eval_logits/rejected": -2.397414207458496, + "eval_logps/chosen": -326.3965759277344, + "eval_logps/rejected": -392.78887939453125, + "eval_loss": 0.6751859188079834, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -13.215849876403809, + "eval_rewards/margins": 8.871443748474121, + "eval_rewards/rejected": -22.087291717529297, + "eval_runtime": 140.7296, + "eval_samples_per_second": 22.426, + "eval_steps_per_second": 0.355, + "step": 15100 + }, + { + "epoch": 2.93, + "learning_rate": 1.233191917739268e-08, + "logits/chosen": -2.536986827850342, + "logits/rejected": -2.5120372772216797, + "logps/chosen": -309.6207580566406, + "logps/rejected": -427.33935546875, + "loss": 0.07, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.251399040222168, + "rewards/margins": 14.412198066711426, + "rewards/rejected": -19.663597106933594, + "step": 15110 + }, + { + "epoch": 2.94, + "learning_rate": 1.1972388006040123e-08, + "logits/chosen": -2.5813136100769043, + "logits/rejected": -2.582587957382202, + "logps/chosen": -335.59967041015625, + "logps/rejected": -392.62261962890625, + "loss": 0.0527, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.910311222076416, + "rewards/margins": 14.15099811553955, + "rewards/rejected": -20.061307907104492, + "step": 15120 + }, + { + "epoch": 2.94, + "learning_rate": 1.1612856834687566e-08, + "logits/chosen": -2.494337558746338, + "logits/rejected": -2.4271411895751953, + "logps/chosen": -238.4246063232422, + "logps/rejected": -349.49774169921875, + "loss": 0.0888, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.892738342285156, + "rewards/margins": 8.284916877746582, + "rewards/rejected": -21.177656173706055, + "step": 15130 + }, + { + "epoch": 2.94, + "learning_rate": 1.125332566333501e-08, + "logits/chosen": -2.5586869716644287, + "logits/rejected": -2.590772867202759, + "logps/chosen": -358.2134094238281, + "logps/rejected": -464.673828125, + "loss": 0.0882, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -10.563177108764648, + "rewards/margins": 15.573925971984863, + "rewards/rejected": -26.137104034423828, + "step": 15140 + }, + { + "epoch": 2.94, + "learning_rate": 1.0893794491982455e-08, + "logits/chosen": -2.4009156227111816, + "logits/rejected": -2.3815758228302, + "logps/chosen": -277.3514709472656, + "logps/rejected": -384.10076904296875, + "loss": 0.1009, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -12.652212142944336, + "rewards/margins": 14.973703384399414, + "rewards/rejected": -27.62591552734375, + "step": 15150 + }, + { + "epoch": 2.94, + "learning_rate": 1.0534263320629897e-08, + "logits/chosen": -2.365360736846924, + "logits/rejected": -2.4083216190338135, + "logps/chosen": -265.6725158691406, + "logps/rejected": -445.07708740234375, + "loss": 0.1004, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -12.468011856079102, + "rewards/margins": 13.022611618041992, + "rewards/rejected": -25.49062728881836, + "step": 15160 + }, + { + "epoch": 2.95, + "learning_rate": 1.0174732149277342e-08, + "logits/chosen": -2.5130014419555664, + "logits/rejected": -2.389761447906494, + "logps/chosen": -270.8379821777344, + "logps/rejected": -416.90155029296875, + "loss": 0.097, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.039586067199707, + "rewards/margins": 15.094144821166992, + "rewards/rejected": -24.13373374938965, + "step": 15170 + }, + { + "epoch": 2.95, + "learning_rate": 9.815200977924786e-09, + "logits/chosen": -2.552694320678711, + "logits/rejected": -2.5263783931732178, + "logps/chosen": -306.68438720703125, + "logps/rejected": -456.9385681152344, + "loss": 0.0537, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.045750617980957, + "rewards/margins": 18.316104888916016, + "rewards/rejected": -26.36185646057129, + "step": 15180 + }, + { + "epoch": 2.95, + "learning_rate": 9.455669806572229e-09, + "logits/chosen": -2.729008197784424, + "logits/rejected": -2.628079652786255, + "logps/chosen": -268.72808837890625, + "logps/rejected": -435.31927490234375, + "loss": 0.0667, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.929935932159424, + "rewards/margins": 13.383522033691406, + "rewards/rejected": -17.313457489013672, + "step": 15190 + }, + { + "epoch": 2.95, + "learning_rate": 9.096138635219673e-09, + "logits/chosen": -2.6436684131622314, + "logits/rejected": -2.4826548099517822, + "logps/chosen": -322.7117004394531, + "logps/rejected": -473.43695068359375, + "loss": 0.046, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.110642910003662, + "rewards/margins": 16.755704879760742, + "rewards/rejected": -23.86634635925293, + "step": 15200 + }, + { + "epoch": 2.95, + "eval_logits/chosen": -2.4254937171936035, + "eval_logits/rejected": -2.397653818130493, + "eval_logps/chosen": -326.61761474609375, + "eval_logps/rejected": -393.1146240234375, + "eval_loss": 0.673405110836029, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -13.237957954406738, + "eval_rewards/margins": 8.881909370422363, + "eval_rewards/rejected": -22.11986541748047, + "eval_runtime": 141.0271, + "eval_samples_per_second": 22.379, + "eval_steps_per_second": 0.355, + "step": 15200 + }, + { + "epoch": 2.95, + "learning_rate": 8.736607463867116e-09, + "logits/chosen": -2.497650623321533, + "logits/rejected": -2.518714427947998, + "logps/chosen": -283.69696044921875, + "logps/rejected": -366.68072509765625, + "loss": 0.0663, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.063966751098633, + "rewards/margins": 13.147346496582031, + "rewards/rejected": -22.2113094329834, + "step": 15210 + }, + { + "epoch": 2.95, + "learning_rate": 8.377076292514562e-09, + "logits/chosen": -2.4690277576446533, + "logits/rejected": -2.4953746795654297, + "logps/chosen": -242.02328491210938, + "logps/rejected": -444.21026611328125, + "loss": 0.0614, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.401689529418945, + "rewards/margins": 15.326090812683105, + "rewards/rejected": -23.727779388427734, + "step": 15220 + }, + { + "epoch": 2.96, + "learning_rate": 8.017545121162005e-09, + "logits/chosen": -2.475654363632202, + "logits/rejected": -2.307687282562256, + "logps/chosen": -294.1593322753906, + "logps/rejected": -385.40960693359375, + "loss": 0.065, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.7741804122924805, + "rewards/margins": 10.907903671264648, + "rewards/rejected": -17.682085037231445, + "step": 15230 + }, + { + "epoch": 2.96, + "learning_rate": 7.658013949809447e-09, + "logits/chosen": -2.499718427658081, + "logits/rejected": -2.417612314224243, + "logps/chosen": -297.8713073730469, + "logps/rejected": -427.8961486816406, + "loss": 0.0839, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -11.2389554977417, + "rewards/margins": 18.335186004638672, + "rewards/rejected": -29.574138641357422, + "step": 15240 + }, + { + "epoch": 2.96, + "learning_rate": 7.298482778456892e-09, + "logits/chosen": -2.4662883281707764, + "logits/rejected": -2.3811111450195312, + "logps/chosen": -278.16046142578125, + "logps/rejected": -444.9239196777344, + "loss": 0.0541, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.652475357055664, + "rewards/margins": 16.56214141845703, + "rewards/rejected": -25.214616775512695, + "step": 15250 + }, + { + "epoch": 2.96, + "learning_rate": 6.938951607104335e-09, + "logits/chosen": -2.5906896591186523, + "logits/rejected": -2.440596342086792, + "logps/chosen": -280.88531494140625, + "logps/rejected": -372.7237548828125, + "loss": 0.0434, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.08198070526123, + "rewards/margins": 14.472452163696289, + "rewards/rejected": -22.554431915283203, + "step": 15260 + }, + { + "epoch": 2.96, + "learning_rate": 6.57942043575178e-09, + "logits/chosen": -2.5039706230163574, + "logits/rejected": -2.443559408187866, + "logps/chosen": -280.50653076171875, + "logps/rejected": -429.90460205078125, + "loss": 0.0848, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.04439640045166, + "rewards/margins": 13.79267406463623, + "rewards/rejected": -20.83707046508789, + "step": 15270 + }, + { + "epoch": 2.97, + "learning_rate": 6.2198892643992225e-09, + "logits/chosen": -2.618950128555298, + "logits/rejected": -2.5126147270202637, + "logps/chosen": -292.8873291015625, + "logps/rejected": -413.92669677734375, + "loss": 0.0663, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.59955358505249, + "rewards/margins": 13.895268440246582, + "rewards/rejected": -21.494823455810547, + "step": 15280 + }, + { + "epoch": 2.97, + "learning_rate": 5.860358093046667e-09, + "logits/chosen": -2.558330535888672, + "logits/rejected": -2.397799491882324, + "logps/chosen": -303.582763671875, + "logps/rejected": -461.54522705078125, + "loss": 0.0643, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.999855995178223, + "rewards/margins": 15.0127534866333, + "rewards/rejected": -23.01260757446289, + "step": 15290 + }, + { + "epoch": 2.97, + "learning_rate": 5.500826921694111e-09, + "logits/chosen": -2.5134618282318115, + "logits/rejected": -2.5630767345428467, + "logps/chosen": -297.6739196777344, + "logps/rejected": -353.0406799316406, + "loss": 0.0464, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.0575456619262695, + "rewards/margins": 11.345844268798828, + "rewards/rejected": -16.403390884399414, + "step": 15300 + }, + { + "epoch": 2.97, + "eval_logits/chosen": -2.429832935333252, + "eval_logits/rejected": -2.4019625186920166, + "eval_logps/chosen": -326.5859375, + "eval_logps/rejected": -393.0613708496094, + "eval_loss": 0.6733829975128174, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -13.234786033630371, + "eval_rewards/margins": 8.879753112792969, + "eval_rewards/rejected": -22.114540100097656, + "eval_runtime": 142.4888, + "eval_samples_per_second": 22.149, + "eval_steps_per_second": 0.351, + "step": 15300 + }, + { + "epoch": 2.97, + "learning_rate": 5.141295750341554e-09, + "logits/chosen": -2.551086902618408, + "logits/rejected": -2.5036799907684326, + "logps/chosen": -296.6831970214844, + "logps/rejected": -526.3468017578125, + "loss": 0.0762, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.280969142913818, + "rewards/margins": 23.061071395874023, + "rewards/rejected": -30.342041015625, + "step": 15310 + }, + { + "epoch": 2.97, + "learning_rate": 4.781764578988998e-09, + "logits/chosen": -2.374816417694092, + "logits/rejected": -2.4634015560150146, + "logps/chosen": -311.0668640136719, + "logps/rejected": -408.4220886230469, + "loss": 0.0696, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.171885013580322, + "rewards/margins": 14.707513809204102, + "rewards/rejected": -21.8794002532959, + "step": 15320 + }, + { + "epoch": 2.98, + "learning_rate": 4.422233407636442e-09, + "logits/chosen": -2.568384885787964, + "logits/rejected": -2.5400028228759766, + "logps/chosen": -287.30303955078125, + "logps/rejected": -478.36456298828125, + "loss": 0.078, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.014510154724121, + "rewards/margins": 16.825496673583984, + "rewards/rejected": -21.840007781982422, + "step": 15330 + }, + { + "epoch": 2.98, + "learning_rate": 4.062702236283886e-09, + "logits/chosen": -2.4930148124694824, + "logits/rejected": -2.4020228385925293, + "logps/chosen": -299.522216796875, + "logps/rejected": -595.8134765625, + "loss": 0.045, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.225922584533691, + "rewards/margins": 22.615100860595703, + "rewards/rejected": -29.841022491455078, + "step": 15340 + }, + { + "epoch": 2.98, + "learning_rate": 3.7031710649313298e-09, + "logits/chosen": -2.5365307331085205, + "logits/rejected": -2.519602060317993, + "logps/chosen": -301.25238037109375, + "logps/rejected": -390.9139709472656, + "loss": 0.0628, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.842535495758057, + "rewards/margins": 15.489392280578613, + "rewards/rejected": -22.33193016052246, + "step": 15350 + }, + { + "epoch": 2.98, + "learning_rate": 3.3436398935787733e-09, + "logits/chosen": -2.5512332916259766, + "logits/rejected": -2.3916690349578857, + "logps/chosen": -360.29815673828125, + "logps/rejected": -470.9840393066406, + "loss": 0.0661, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.730353355407715, + "rewards/margins": 18.736759185791016, + "rewards/rejected": -27.467113494873047, + "step": 15360 + }, + { + "epoch": 2.98, + "learning_rate": 2.984108722226217e-09, + "logits/chosen": -2.6082935333251953, + "logits/rejected": -2.5024819374084473, + "logps/chosen": -270.26788330078125, + "logps/rejected": -463.6595764160156, + "loss": 0.0785, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.266807556152344, + "rewards/margins": 11.69709300994873, + "rewards/rejected": -20.96390151977539, + "step": 15370 + }, + { + "epoch": 2.99, + "learning_rate": 2.624577550873661e-09, + "logits/chosen": -2.6313486099243164, + "logits/rejected": -2.6200969219207764, + "logps/chosen": -316.62152099609375, + "logps/rejected": -471.7225646972656, + "loss": 0.0747, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.830097675323486, + "rewards/margins": 19.329891204833984, + "rewards/rejected": -25.159992218017578, + "step": 15380 + }, + { + "epoch": 2.99, + "learning_rate": 2.2650463795211044e-09, + "logits/chosen": -2.4944376945495605, + "logits/rejected": -2.4808757305145264, + "logps/chosen": -314.99005126953125, + "logps/rejected": -491.7989196777344, + "loss": 0.0821, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.987020492553711, + "rewards/margins": 13.945060729980469, + "rewards/rejected": -23.932083129882812, + "step": 15390 + }, + { + "epoch": 2.99, + "learning_rate": 1.905515208168548e-09, + "logits/chosen": -2.416228771209717, + "logits/rejected": -2.2908473014831543, + "logps/chosen": -287.78607177734375, + "logps/rejected": -383.16790771484375, + "loss": 0.0599, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.41087532043457, + "rewards/margins": 18.279054641723633, + "rewards/rejected": -26.689929962158203, + "step": 15400 + }, + { + "epoch": 2.99, + "eval_logits/chosen": -2.4312970638275146, + "eval_logits/rejected": -2.403452157974243, + "eval_logps/chosen": -326.8237609863281, + "eval_logps/rejected": -393.3601989746094, + "eval_loss": 0.6729053258895874, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -13.25857162475586, + "eval_rewards/margins": 8.885856628417969, + "eval_rewards/rejected": -22.144426345825195, + "eval_runtime": 141.2842, + "eval_samples_per_second": 22.338, + "eval_steps_per_second": 0.354, + "step": 15400 + }, + { + "epoch": 2.99, + "learning_rate": 1.5459840368159919e-09, + "logits/chosen": -2.6447572708129883, + "logits/rejected": -2.5998551845550537, + "logps/chosen": -347.8985290527344, + "logps/rejected": -591.987548828125, + "loss": 0.0466, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.259090423583984, + "rewards/margins": 16.267677307128906, + "rewards/rejected": -23.52676773071289, + "step": 15410 + }, + { + "epoch": 2.99, + "learning_rate": 1.1864528654634356e-09, + "logits/chosen": -2.485642433166504, + "logits/rejected": -2.6031100749969482, + "logps/chosen": -523.435302734375, + "logps/rejected": -507.1851501464844, + "loss": 0.1018, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.69085693359375, + "rewards/margins": 16.88809585571289, + "rewards/rejected": -25.578950881958008, + "step": 15420 + }, + { + "epoch": 3.0, + "learning_rate": 8.269216941108794e-10, + "logits/chosen": -2.476501703262329, + "logits/rejected": -2.467801570892334, + "logps/chosen": -260.9589538574219, + "logps/rejected": -526.767822265625, + "loss": 0.0479, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.30578899383545, + "rewards/margins": 20.849079132080078, + "rewards/rejected": -29.154870986938477, + "step": 15430 + }, + { + "epoch": 3.0, + "learning_rate": 4.673905227583231e-10, + "logits/chosen": -2.3333487510681152, + "logits/rejected": -2.4137845039367676, + "logps/chosen": -223.9978485107422, + "logps/rejected": -391.9574279785156, + "loss": 0.0962, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.648858547210693, + "rewards/margins": 13.01892375946045, + "rewards/rejected": -17.667781829833984, + "step": 15440 + }, + { + "epoch": 3.0, + "learning_rate": 1.0785935140576688e-10, + "logits/chosen": -2.3760483264923096, + "logits/rejected": -2.3039135932922363, + "logps/chosen": -270.649169921875, + "logps/rejected": -341.77001953125, + "loss": 0.0706, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.1886796951293945, + "rewards/margins": 13.63255500793457, + "rewards/rejected": -15.821233749389648, + "step": 15450 + }, + { + "epoch": 3.0, + "step": 15453, + "total_flos": 0.0, + "train_loss": 0.2455477410652717, + "train_runtime": 50581.1093, + "train_samples_per_second": 4.887, + "train_steps_per_second": 0.306 + } + ], + "logging_steps": 10, + "max_steps": 15453, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}