{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 400, "global_step": 577, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008665511265164644, "grad_norm": 144.30636115944256, "learning_rate": 8.620689655172414e-09, "logps/chosen": -2.4541897773742676, "logps/rejected": -0.7712920904159546, "loss": 19.0474, "rewards/accuracies": 0.0, "rewards/chosen": -24.541898727416992, "rewards/margins": -16.828975677490234, "rewards/rejected": -7.712920188903809, "step": 5 }, { "epoch": 0.01733102253032929, "grad_norm": 161.5257295296766, "learning_rate": 1.724137931034483e-08, "logps/chosen": -1.8905527591705322, "logps/rejected": -0.8088749647140503, "loss": 17.1265, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.90552520751953, "rewards/margins": -10.81677532196045, "rewards/rejected": -8.088749885559082, "step": 10 }, { "epoch": 0.025996533795493933, "grad_norm": 137.887178762291, "learning_rate": 2.586206896551724e-08, "logps/chosen": -2.4456095695495605, "logps/rejected": -0.7901965379714966, "loss": 18.428, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -24.456096649169922, "rewards/margins": -16.55413246154785, "rewards/rejected": -7.901966094970703, "step": 15 }, { "epoch": 0.03466204506065858, "grad_norm": 126.89326071068598, "learning_rate": 3.448275862068966e-08, "logps/chosen": -2.5534451007843018, "logps/rejected": -1.1100285053253174, "loss": 14.051, "rewards/accuracies": 0.0, "rewards/chosen": -25.53445053100586, "rewards/margins": -14.434164047241211, "rewards/rejected": -11.100285530090332, "step": 20 }, { "epoch": 0.043327556325823226, "grad_norm": 192.70418536899953, "learning_rate": 4.3103448275862064e-08, "logps/chosen": -1.6850401163101196, "logps/rejected": -0.7781764268875122, "loss": 14.7994, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -16.850399017333984, "rewards/margins": -9.068635940551758, "rewards/rejected": -7.781764030456543, "step": 25 }, { "epoch": 0.05199306759098787, "grad_norm": 150.3771160776878, "learning_rate": 5.172413793103448e-08, "logps/chosen": -2.7311158180236816, "logps/rejected": -1.1124465465545654, "loss": 17.0918, "rewards/accuracies": 0.0, "rewards/chosen": -27.3111572265625, "rewards/margins": -16.186691284179688, "rewards/rejected": -11.12446403503418, "step": 30 }, { "epoch": 0.060658578856152515, "grad_norm": 119.59215071363475, "learning_rate": 6.03448275862069e-08, "logps/chosen": -2.034471273422241, "logps/rejected": -0.7978214025497437, "loss": 17.5941, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.344711303710938, "rewards/margins": -12.366498947143555, "rewards/rejected": -7.978213310241699, "step": 35 }, { "epoch": 0.06932409012131716, "grad_norm": 156.50159769490176, "learning_rate": 6.896551724137931e-08, "logps/chosen": -2.3780651092529297, "logps/rejected": -1.1195534467697144, "loss": 16.6879, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -23.780651092529297, "rewards/margins": -12.585118293762207, "rewards/rejected": -11.195535659790039, "step": 40 }, { "epoch": 0.0779896013864818, "grad_norm": 120.36498362349127, "learning_rate": 7.758620689655172e-08, "logps/chosen": -2.3653724193573, "logps/rejected": -0.8212674260139465, "loss": 16.0315, "rewards/accuracies": 0.0, "rewards/chosen": -23.65372085571289, "rewards/margins": -15.441045761108398, "rewards/rejected": -8.212674140930176, "step": 45 }, { "epoch": 0.08665511265164645, "grad_norm": 140.48384963924636, "learning_rate": 8.620689655172413e-08, "logps/chosen": -2.0239596366882324, "logps/rejected": -0.8580087423324585, "loss": 17.0992, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -20.23959732055664, "rewards/margins": -11.65950870513916, "rewards/rejected": -8.580087661743164, "step": 50 }, { "epoch": 0.09532062391681109, "grad_norm": 128.56326610568328, "learning_rate": 9.482758620689655e-08, "logps/chosen": -2.002194881439209, "logps/rejected": -0.8230875134468079, "loss": 15.9385, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.021947860717773, "rewards/margins": -11.791071891784668, "rewards/rejected": -8.230875015258789, "step": 55 }, { "epoch": 0.10398613518197573, "grad_norm": 120.25035045719453, "learning_rate": 9.999633596405632e-08, "logps/chosen": -2.8458476066589355, "logps/rejected": -0.928970992565155, "loss": 17.6529, "rewards/accuracies": 0.0, "rewards/chosen": -28.458477020263672, "rewards/margins": -19.168766021728516, "rewards/rejected": -9.289709091186523, "step": 60 }, { "epoch": 0.11265164644714037, "grad_norm": 110.05644440005858, "learning_rate": 9.995512172662394e-08, "logps/chosen": -2.2458930015563965, "logps/rejected": -0.8278621435165405, "loss": 15.5726, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.458932876586914, "rewards/margins": -14.180310249328613, "rewards/rejected": -8.278621673583984, "step": 65 }, { "epoch": 0.12131715771230503, "grad_norm": 121.60204255752639, "learning_rate": 9.986815108288271e-08, "logps/chosen": -1.1772969961166382, "logps/rejected": -0.8211766481399536, "loss": 14.691, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -11.772969245910645, "rewards/margins": -3.561204195022583, "rewards/rejected": -8.211766242980957, "step": 70 }, { "epoch": 0.12998266897746968, "grad_norm": 101.42661844850082, "learning_rate": 9.973550369361562e-08, "logps/chosen": -1.9986432790756226, "logps/rejected": -1.0442060232162476, "loss": 15.0518, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.986433029174805, "rewards/margins": -9.54437255859375, "rewards/rejected": -10.442060470581055, "step": 75 }, { "epoch": 0.1386481802426343, "grad_norm": 152.87624500994318, "learning_rate": 9.955730105723222e-08, "logps/chosen": -1.9634087085723877, "logps/rejected": -0.7768585085868835, "loss": 17.3279, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.63408851623535, "rewards/margins": -11.865501403808594, "rewards/rejected": -7.768586158752441, "step": 80 }, { "epoch": 0.14731369150779897, "grad_norm": 103.93615787493052, "learning_rate": 9.93337063984821e-08, "logps/chosen": -2.262453079223633, "logps/rejected": -0.8849401473999023, "loss": 15.6727, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.624530792236328, "rewards/margins": -13.775128364562988, "rewards/rejected": -8.849401473999023, "step": 85 }, { "epoch": 0.1559792027729636, "grad_norm": 106.77837368888825, "learning_rate": 9.906492451894921e-08, "logps/chosen": -1.9127286672592163, "logps/rejected": -1.1486315727233887, "loss": 14.4561, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.127286911010742, "rewards/margins": -7.640970706939697, "rewards/rejected": -11.48631477355957, "step": 90 }, { "epoch": 0.16464471403812825, "grad_norm": 115.47717214236427, "learning_rate": 9.875120160946371e-08, "logps/chosen": -1.957664132118225, "logps/rejected": -0.7855945825576782, "loss": 17.5329, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.576641082763672, "rewards/margins": -11.720695495605469, "rewards/rejected": -7.8559465408325195, "step": 95 }, { "epoch": 0.1733102253032929, "grad_norm": 142.79264155311262, "learning_rate": 9.83928250246034e-08, "logps/chosen": -1.542560338973999, "logps/rejected": -0.8671437501907349, "loss": 11.9286, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.425603866577148, "rewards/margins": -6.754166603088379, "rewards/rejected": -8.67143726348877, "step": 100 }, { "epoch": 0.18197573656845753, "grad_norm": 129.71279838398823, "learning_rate": 9.799012301949136e-08, "logps/chosen": -2.3877921104431152, "logps/rejected": -0.9329522252082825, "loss": 17.6579, "rewards/accuracies": 0.0, "rewards/chosen": -23.8779239654541, "rewards/margins": -14.54840087890625, "rewards/rejected": -9.329523086547852, "step": 105 }, { "epoch": 0.19064124783362218, "grad_norm": 148.06599803902049, "learning_rate": 9.754346444913042e-08, "logps/chosen": -2.4009385108947754, "logps/rejected": -1.1456081867218018, "loss": 16.4529, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -24.009387969970703, "rewards/margins": -12.553304672241211, "rewards/rejected": -11.456080436706543, "step": 110 }, { "epoch": 0.19930675909878684, "grad_norm": 160.51208384133946, "learning_rate": 9.705325843055043e-08, "logps/chosen": -1.5254523754119873, "logps/rejected": -0.8646356463432312, "loss": 12.0618, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.254524230957031, "rewards/margins": -6.608166694641113, "rewards/rejected": -8.646357536315918, "step": 115 }, { "epoch": 0.20797227036395147, "grad_norm": 122.45823808036546, "learning_rate": 9.651995396807742e-08, "logps/chosen": -2.2533581256866455, "logps/rejected": -0.908549964427948, "loss": 15.2713, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.53358268737793, "rewards/margins": -13.448083877563477, "rewards/rejected": -9.08549976348877, "step": 120 }, { "epoch": 0.21663778162911612, "grad_norm": 92.45529246500953, "learning_rate": 9.594403954206789e-08, "logps/chosen": -1.814674973487854, "logps/rejected": -0.8079738616943359, "loss": 14.6802, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.14674949645996, "rewards/margins": -10.067010879516602, "rewards/rejected": -8.07973861694336, "step": 125 }, { "epoch": 0.22530329289428075, "grad_norm": 94.30521246591931, "learning_rate": 9.53260426614852e-08, "logps/chosen": -2.3024892807006836, "logps/rejected": -1.008353590965271, "loss": 17.7655, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -23.024892807006836, "rewards/margins": -12.941357612609863, "rewards/rejected": -10.083536148071289, "step": 130 }, { "epoch": 0.2339688041594454, "grad_norm": 112.29975161277991, "learning_rate": 9.466652938072753e-08, "logps/chosen": -2.0203070640563965, "logps/rejected": -1.113219141960144, "loss": 13.6402, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.203073501586914, "rewards/margins": -9.070881843566895, "rewards/rejected": -11.13219165802002, "step": 135 }, { "epoch": 0.24263431542461006, "grad_norm": 120.79443565778939, "learning_rate": 9.396610378115025e-08, "logps/chosen": -2.2065517902374268, "logps/rejected": -0.9879514575004578, "loss": 16.0808, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.06551742553711, "rewards/margins": -12.186001777648926, "rewards/rejected": -9.879514694213867, "step": 140 }, { "epoch": 0.2512998266897747, "grad_norm": 114.12979842546811, "learning_rate": 9.322540741775743e-08, "logps/chosen": -1.9439926147460938, "logps/rejected": -0.9498629570007324, "loss": 14.4467, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.439926147460938, "rewards/margins": -9.94129753112793, "rewards/rejected": -9.498628616333008, "step": 145 }, { "epoch": 0.25996533795493937, "grad_norm": 125.45737006408432, "learning_rate": 9.244511873156944e-08, "logps/chosen": -1.816112756729126, "logps/rejected": -1.040378451347351, "loss": 13.5576, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.1611270904541, "rewards/margins": -7.7573418617248535, "rewards/rejected": -10.403783798217773, "step": 150 }, { "epoch": 0.268630849220104, "grad_norm": 112.60264240438396, "learning_rate": 9.16259524282046e-08, "logps/chosen": -1.8842222690582275, "logps/rejected": -1.0307762622833252, "loss": 13.5764, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.842222213745117, "rewards/margins": -8.534459114074707, "rewards/rejected": -10.307764053344727, "step": 155 }, { "epoch": 0.2772963604852686, "grad_norm": 122.00400288293022, "learning_rate": 9.076865882324452e-08, "logps/chosen": -1.9621737003326416, "logps/rejected": -1.0562703609466553, "loss": 17.3966, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.621734619140625, "rewards/margins": -9.05903434753418, "rewards/rejected": -10.562703132629395, "step": 160 }, { "epoch": 0.28596187175043325, "grad_norm": 124.16081774837384, "learning_rate": 8.987402315498223e-08, "logps/chosen": -1.1465253829956055, "logps/rejected": -1.0642750263214111, "loss": 12.7749, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -11.465253829956055, "rewards/margins": -0.8225023150444031, "rewards/rejected": -10.64275074005127, "step": 165 }, { "epoch": 0.29462738301559793, "grad_norm": 124.62572211999075, "learning_rate": 8.89428648651831e-08, "logps/chosen": -1.7204177379608154, "logps/rejected": -1.1967977285385132, "loss": 14.6418, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.204177856445312, "rewards/margins": -5.23620080947876, "rewards/rejected": -11.967977523803711, "step": 170 }, { "epoch": 0.30329289428076256, "grad_norm": 112.56300891139004, "learning_rate": 8.797603684851683e-08, "logps/chosen": -2.6767685413360596, "logps/rejected": -1.069800853729248, "loss": 16.2711, "rewards/accuracies": 0.0, "rewards/chosen": -26.767688751220703, "rewards/margins": -16.069677352905273, "rewards/rejected": -10.69800853729248, "step": 175 }, { "epoch": 0.3119584055459272, "grad_norm": 120.78807431486482, "learning_rate": 8.697442467134845e-08, "logps/chosen": -1.9326350688934326, "logps/rejected": -1.0922300815582275, "loss": 15.0838, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.326351165771484, "rewards/margins": -8.40404987335205, "rewards/rejected": -10.922300338745117, "step": 180 }, { "epoch": 0.32062391681109187, "grad_norm": 110.86338987148085, "learning_rate": 8.593894576060354e-08, "logps/chosen": -1.953160285949707, "logps/rejected": -1.1958823204040527, "loss": 10.7819, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.53160285949707, "rewards/margins": -7.572779178619385, "rewards/rejected": -11.958823204040527, "step": 185 }, { "epoch": 0.3292894280762565, "grad_norm": 123.90742849743228, "learning_rate": 8.487054856345081e-08, "logps/chosen": -1.568197250366211, "logps/rejected": -1.1650375127792358, "loss": 12.3005, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.681971549987793, "rewards/margins": -4.03159761428833, "rewards/rejected": -11.650375366210938, "step": 190 }, { "epoch": 0.3379549393414211, "grad_norm": 126.6347080831252, "learning_rate": 8.377021167857166e-08, "logps/chosen": -1.8314793109893799, "logps/rejected": -1.2466024160385132, "loss": 14.669, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.31479263305664, "rewards/margins": -5.848769187927246, "rewards/rejected": -12.466025352478027, "step": 195 }, { "epoch": 0.3466204506065858, "grad_norm": 121.64468314179587, "learning_rate": 8.263894295981257e-08, "logps/chosen": -2.2646422386169434, "logps/rejected": -1.226609706878662, "loss": 12.8032, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.64642333984375, "rewards/margins": -10.380326271057129, "rewards/rejected": -12.266097068786621, "step": 200 }, { "epoch": 0.35528596187175043, "grad_norm": 126.85588315541666, "learning_rate": 8.147777859304095e-08, "logps/chosen": -1.198981523513794, "logps/rejected": -1.345165491104126, "loss": 12.7799, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -11.989816665649414, "rewards/margins": 1.461838960647583, "rewards/rejected": -13.451654434204102, "step": 205 }, { "epoch": 0.36395147313691506, "grad_norm": 120.43560857917016, "learning_rate": 8.028778214705058e-08, "logps/chosen": -2.1712536811828613, "logps/rejected": -1.199004888534546, "loss": 11.8499, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -21.712539672851562, "rewards/margins": -9.722488403320312, "rewards/rejected": -11.990049362182617, "step": 210 }, { "epoch": 0.37261698440207974, "grad_norm": 626.8379552865316, "learning_rate": 7.907004359938548e-08, "logps/chosen": -2.164957046508789, "logps/rejected": -1.0314610004425049, "loss": 14.2149, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -21.649572372436523, "rewards/margins": -11.3349609375, "rewards/rejected": -10.314610481262207, "step": 215 }, { "epoch": 0.38128249566724437, "grad_norm": 161.48734947105442, "learning_rate": 7.782567833797457e-08, "logps/chosen": -2.232151508331299, "logps/rejected": -1.2837927341461182, "loss": 13.421, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.321514129638672, "rewards/margins": -9.483587265014648, "rewards/rejected": -12.837926864624023, "step": 220 }, { "epoch": 0.389948006932409, "grad_norm": 151.19422846807146, "learning_rate": 7.655582613949202e-08, "logps/chosen": -2.078399658203125, "logps/rejected": -1.2598081827163696, "loss": 14.0057, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.783998489379883, "rewards/margins": -8.185917854309082, "rewards/rejected": -12.5980806350708, "step": 225 }, { "epoch": 0.3986135181975737, "grad_norm": 114.95939011147192, "learning_rate": 7.526165012537843e-08, "logps/chosen": -2.0014336109161377, "logps/rejected": -1.2770476341247559, "loss": 12.8558, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.014333724975586, "rewards/margins": -7.24385929107666, "rewards/rejected": -12.770475387573242, "step": 230 }, { "epoch": 0.4072790294627383, "grad_norm": 611.9655768877072, "learning_rate": 7.394433569647934e-08, "logps/chosen": -1.4717886447906494, "logps/rejected": -1.286112666130066, "loss": 11.6772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.717885971069336, "rewards/margins": -1.8567602634429932, "rewards/rejected": -12.861126899719238, "step": 235 }, { "epoch": 0.41594454072790293, "grad_norm": 122.39432963173844, "learning_rate": 7.260508944727723e-08, "logps/chosen": -2.4743523597717285, "logps/rejected": -1.451537013053894, "loss": 13.4604, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -24.74352264404297, "rewards/margins": -10.22815227508545, "rewards/rejected": -14.515371322631836, "step": 240 }, { "epoch": 0.4246100519930676, "grad_norm": 178.54805378295796, "learning_rate": 7.124513806071086e-08, "logps/chosen": -1.7837231159210205, "logps/rejected": -1.259641170501709, "loss": 13.0229, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.837230682373047, "rewards/margins": -5.240820407867432, "rewards/rejected": -12.596410751342773, "step": 245 }, { "epoch": 0.43327556325823224, "grad_norm": 301.11675694113546, "learning_rate": 6.986572718459478e-08, "logps/chosen": -2.424816131591797, "logps/rejected": -1.6280561685562134, "loss": 12.263, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -24.2481632232666, "rewards/margins": -7.9676008224487305, "rewards/rejected": -16.280563354492188, "step": 250 }, { "epoch": 0.44194107452339687, "grad_norm": 151.1924896359916, "learning_rate": 6.846812029066787e-08, "logps/chosen": -2.284111499786377, "logps/rejected": -1.4265038967132568, "loss": 10.2145, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.841115951538086, "rewards/margins": -8.576075553894043, "rewards/rejected": -14.265039443969727, "step": 255 }, { "epoch": 0.4506065857885615, "grad_norm": 117.0245050950957, "learning_rate": 6.705359751731611e-08, "logps/chosen": -2.581634998321533, "logps/rejected": -1.299729585647583, "loss": 11.8198, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -25.81635093688965, "rewards/margins": -12.819055557250977, "rewards/rejected": -12.997296333312988, "step": 260 }, { "epoch": 0.4592720970537262, "grad_norm": 164.40913549983566, "learning_rate": 6.562345449702951e-08, "logps/chosen": -1.6220916509628296, "logps/rejected": -1.5810987949371338, "loss": 10.2127, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.220916748046875, "rewards/margins": -0.40992775559425354, "rewards/rejected": -15.81098747253418, "step": 265 }, { "epoch": 0.4679376083188908, "grad_norm": 197.25533329293654, "learning_rate": 6.417900116966714e-08, "logps/chosen": -2.8826935291290283, "logps/rejected": -1.366994857788086, "loss": 11.5423, "rewards/accuracies": 0.0, "rewards/chosen": -28.826934814453125, "rewards/margins": -15.156987190246582, "rewards/rejected": -13.669949531555176, "step": 270 }, { "epoch": 0.47660311958405543, "grad_norm": 152.00371643796973, "learning_rate": 6.272156058261753e-08, "logps/chosen": -1.8090347051620483, "logps/rejected": -1.4397003650665283, "loss": 9.7595, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.090347290039062, "rewards/margins": -3.6933434009552, "rewards/rejected": -14.397003173828125, "step": 275 }, { "epoch": 0.4852686308492201, "grad_norm": 394.38126102668485, "learning_rate": 6.125246767895286e-08, "logps/chosen": -1.9543708562850952, "logps/rejected": -1.5106321573257446, "loss": 11.2695, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.54370880126953, "rewards/margins": -4.437387943267822, "rewards/rejected": -15.10632038116455, "step": 280 }, { "epoch": 0.49393414211438474, "grad_norm": 142.92864697276755, "learning_rate": 5.977306807468774e-08, "logps/chosen": -2.7028214931488037, "logps/rejected": -1.3628876209259033, "loss": 11.5499, "rewards/accuracies": 0.0, "rewards/chosen": -27.028217315673828, "rewards/margins": -13.399337768554688, "rewards/rejected": -13.628877639770508, "step": 285 }, { "epoch": 0.5025996533795494, "grad_norm": 178.01151675646489, "learning_rate": 5.828471682626175e-08, "logps/chosen": -2.410367012023926, "logps/rejected": -1.658764123916626, "loss": 10.0203, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -24.103670120239258, "rewards/margins": -7.516029357910156, "rewards/rejected": -16.5876407623291, "step": 290 }, { "epoch": 0.511265164644714, "grad_norm": 441.7833165957171, "learning_rate": 5.678877718937519e-08, "logps/chosen": -1.7007253170013428, "logps/rejected": -1.7013359069824219, "loss": 10.809, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.00725555419922, "rewards/margins": 0.006104087922722101, "rewards/rejected": -17.01336097717285, "step": 295 }, { "epoch": 0.5199306759098787, "grad_norm": 117.48808947391147, "learning_rate": 5.52866193703147e-08, "logps/chosen": -2.6699347496032715, "logps/rejected": -1.7835134267807007, "loss": 9.6309, "rewards/accuracies": 0.0, "rewards/chosen": -26.699350357055664, "rewards/margins": -8.864214897155762, "rewards/rejected": -17.835134506225586, "step": 300 }, { "epoch": 0.5285961871750433, "grad_norm": 217.9261859040597, "learning_rate": 5.3779619270912414e-08, "logps/chosen": -1.99038827419281, "logps/rejected": -1.9274762868881226, "loss": 10.2615, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.90388298034668, "rewards/margins": -0.6291202306747437, "rewards/rejected": -19.274763107299805, "step": 305 }, { "epoch": 0.537261698440208, "grad_norm": 205.3616408894159, "learning_rate": 5.22691572282884e-08, "logps/chosen": -2.596215009689331, "logps/rejected": -2.011929750442505, "loss": 9.5973, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.962148666381836, "rewards/margins": -5.842852592468262, "rewards/rejected": -20.11929702758789, "step": 310 }, { "epoch": 0.5459272097053726, "grad_norm": 190.6871604383273, "learning_rate": 5.0756616750530436e-08, "logps/chosen": -1.3547618389129639, "logps/rejected": -1.6575043201446533, "loss": 8.561, "rewards/accuracies": 1.0, "rewards/chosen": -13.54761791229248, "rewards/margins": 3.027425527572632, "rewards/rejected": -16.575044631958008, "step": 315 }, { "epoch": 0.5545927209705372, "grad_norm": 233.80398397633627, "learning_rate": 4.924338324946955e-08, "logps/chosen": -2.2690348625183105, "logps/rejected": -1.7823927402496338, "loss": 10.8419, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.690349578857422, "rewards/margins": -4.8664231300354, "rewards/rejected": -17.823925018310547, "step": 320 }, { "epoch": 0.5632582322357019, "grad_norm": 271.18165460140943, "learning_rate": 4.77308427717116e-08, "logps/chosen": -2.08347749710083, "logps/rejected": -1.5966533422470093, "loss": 9.6404, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.83477783203125, "rewards/margins": -4.868241786956787, "rewards/rejected": -15.966534614562988, "step": 325 }, { "epoch": 0.5719237435008665, "grad_norm": 242.574459310927, "learning_rate": 4.622038072908758e-08, "logps/chosen": -1.9583733081817627, "logps/rejected": -1.7426979541778564, "loss": 8.1746, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.58373260498047, "rewards/margins": -2.156752824783325, "rewards/rejected": -17.42698097229004, "step": 330 }, { "epoch": 0.5805892547660312, "grad_norm": 253.3241055482191, "learning_rate": 4.4713380629685306e-08, "logps/chosen": -2.1484360694885254, "logps/rejected": -2.0616002082824707, "loss": 7.292, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.48436164855957, "rewards/margins": -0.8683557510375977, "rewards/rejected": -20.616004943847656, "step": 335 }, { "epoch": 0.5892547660311959, "grad_norm": 252.56727877933076, "learning_rate": 4.321122281062481e-08, "logps/chosen": -2.473069906234741, "logps/rejected": -1.9099452495574951, "loss": 8.4532, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -24.73069953918457, "rewards/margins": -5.631247043609619, "rewards/rejected": -19.09945297241211, "step": 340 }, { "epoch": 0.5979202772963604, "grad_norm": 193.98886710844522, "learning_rate": 4.1715283173738244e-08, "logps/chosen": -1.969444990158081, "logps/rejected": -1.8921003341674805, "loss": 7.5218, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.6944522857666, "rewards/margins": -0.7734478116035461, "rewards/rejected": -18.921003341674805, "step": 345 }, { "epoch": 0.6065857885615251, "grad_norm": 237.02702622060875, "learning_rate": 4.022693192531226e-08, "logps/chosen": -2.34706974029541, "logps/rejected": -1.886749029159546, "loss": 6.8617, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.4706974029541, "rewards/margins": -4.603204727172852, "rewards/rejected": -18.86749267578125, "step": 350 }, { "epoch": 0.6152512998266898, "grad_norm": 291.8573957521699, "learning_rate": 3.874753232104714e-08, "logps/chosen": -2.6580417156219482, "logps/rejected": -1.7777878046035767, "loss": 9.7241, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -26.580419540405273, "rewards/margins": -8.802539825439453, "rewards/rejected": -17.777877807617188, "step": 355 }, { "epoch": 0.6239168110918544, "grad_norm": 189.51798627174236, "learning_rate": 3.727843941738248e-08, "logps/chosen": -2.312593936920166, "logps/rejected": -2.0596203804016113, "loss": 6.3336, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.125940322875977, "rewards/margins": -2.529735565185547, "rewards/rejected": -20.596202850341797, "step": 360 }, { "epoch": 0.6325823223570191, "grad_norm": 286.87663861473686, "learning_rate": 3.582099883033285e-08, "logps/chosen": -2.689950704574585, "logps/rejected": -2.024871826171875, "loss": 9.0664, "rewards/accuracies": 0.0, "rewards/chosen": -26.89950942993164, "rewards/margins": -6.650788307189941, "rewards/rejected": -20.248720169067383, "step": 365 }, { "epoch": 0.6412478336221837, "grad_norm": 206.15397080557344, "learning_rate": 3.437654550297049e-08, "logps/chosen": -1.957179069519043, "logps/rejected": -2.0333800315856934, "loss": 5.0065, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.57179069519043, "rewards/margins": 0.7620084881782532, "rewards/rejected": -20.333799362182617, "step": 370 }, { "epoch": 0.6499133448873483, "grad_norm": 265.4555794317019, "learning_rate": 3.294640248268389e-08, "logps/chosen": -2.076646566390991, "logps/rejected": -2.0849030017852783, "loss": 6.0048, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.766464233398438, "rewards/margins": 0.08256340026855469, "rewards/rejected": -20.849027633666992, "step": 375 }, { "epoch": 0.658578856152513, "grad_norm": 244.98209798309657, "learning_rate": 3.153187970933213e-08, "logps/chosen": -2.385413646697998, "logps/rejected": -2.327801465988159, "loss": 5.5495, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.854137420654297, "rewards/margins": -0.576123058795929, "rewards/rejected": -23.27801513671875, "step": 380 }, { "epoch": 0.6672443674176777, "grad_norm": 243.82242299590067, "learning_rate": 3.0134272815405224e-08, "logps/chosen": -2.1494534015655518, "logps/rejected": -2.3645691871643066, "loss": 6.1118, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.49453353881836, "rewards/margins": 2.1511597633361816, "rewards/rejected": -23.645694732666016, "step": 385 }, { "epoch": 0.6759098786828422, "grad_norm": 306.8365924711833, "learning_rate": 2.8754861939289133e-08, "logps/chosen": -2.2285687923431396, "logps/rejected": -2.242079257965088, "loss": 5.9324, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.285686492919922, "rewards/margins": 0.1351058930158615, "rewards/rejected": -22.420795440673828, "step": 390 }, { "epoch": 0.6845753899480069, "grad_norm": 292.0875551239737, "learning_rate": 2.7394910552722773e-08, "logps/chosen": -2.4236674308776855, "logps/rejected": -2.342806339263916, "loss": 5.984, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.236675262451172, "rewards/margins": -0.8086126446723938, "rewards/rejected": -23.428064346313477, "step": 395 }, { "epoch": 0.6932409012131716, "grad_norm": 330.0702338474899, "learning_rate": 2.6055664303520652e-08, "logps/chosen": -2.262308359146118, "logps/rejected": -2.4734067916870117, "loss": 5.5632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.623083114624023, "rewards/margins": 2.1109836101531982, "rewards/rejected": -24.734067916870117, "step": 400 }, { "epoch": 0.6932409012131716, "eval_logps/chosen": -2.345215082168579, "eval_logps/rejected": -2.3579459190368652, "eval_loss": 6.394889831542969, "eval_rewards/accuracies": 0.5, "eval_rewards/chosen": -23.452152252197266, "eval_rewards/margins": 0.12730884552001953, "eval_rewards/rejected": -23.57946014404297, "eval_runtime": 4.1708, "eval_samples_per_second": 2.398, "eval_steps_per_second": 0.48, "step": 400 }, { "epoch": 0.7019064124783362, "grad_norm": 432.18725823455117, "learning_rate": 2.4738349874621583e-08, "logps/chosen": -2.2886712551116943, "logps/rejected": -2.3248345851898193, "loss": 5.6469, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.8867130279541, "rewards/margins": 0.36163291335105896, "rewards/rejected": -23.248348236083984, "step": 405 }, { "epoch": 0.7105719237435009, "grad_norm": 289.0106970167949, "learning_rate": 2.3444173860507965e-08, "logps/chosen": -2.2404701709747314, "logps/rejected": -2.36967134475708, "loss": 4.7682, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.40470314025879, "rewards/margins": 1.2920114994049072, "rewards/rejected": -23.696712493896484, "step": 410 }, { "epoch": 0.7192374350086655, "grad_norm": 336.7530764224563, "learning_rate": 2.2174321662025424e-08, "logps/chosen": -2.565544843673706, "logps/rejected": -2.5797317028045654, "loss": 5.0399, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.65545082092285, "rewards/margins": 0.1418682038784027, "rewards/rejected": -25.797317504882812, "step": 415 }, { "epoch": 0.7279029462738301, "grad_norm": 236.60136142580333, "learning_rate": 2.092995640061454e-08, "logps/chosen": -2.1976778507232666, "logps/rejected": -2.399247169494629, "loss": 3.4771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.976778030395508, "rewards/margins": 2.015695095062256, "rewards/rejected": -23.99247169494629, "step": 420 }, { "epoch": 0.7365684575389948, "grad_norm": 285.99685463404285, "learning_rate": 1.9712217852949407e-08, "logps/chosen": -1.974400281906128, "logps/rejected": -2.2366092205047607, "loss": 3.8414, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.744003295898438, "rewards/margins": 2.6220881938934326, "rewards/rejected": -22.366092681884766, "step": 425 }, { "epoch": 0.7452339688041595, "grad_norm": 386.7211621912297, "learning_rate": 1.852222140695906e-08, "logps/chosen": -2.8100500106811523, "logps/rejected": -2.9486374855041504, "loss": 3.7648, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.100500106811523, "rewards/margins": 1.3858760595321655, "rewards/rejected": -29.486377716064453, "step": 430 }, { "epoch": 0.7538994800693241, "grad_norm": 435.7919496767513, "learning_rate": 1.736105704018744e-08, "logps/chosen": -2.645963668823242, "logps/rejected": -2.6360223293304443, "loss": 3.8238, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.45963478088379, "rewards/margins": -0.09941063076257706, "rewards/rejected": -26.3602237701416, "step": 435 }, { "epoch": 0.7625649913344887, "grad_norm": 300.3158959034186, "learning_rate": 1.622978832142833e-08, "logps/chosen": -2.481766700744629, "logps/rejected": -2.7485013008117676, "loss": 3.9495, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.817668914794922, "rewards/margins": 2.6673457622528076, "rewards/rejected": -27.48501205444336, "step": 440 }, { "epoch": 0.7712305025996534, "grad_norm": 267.00928393815593, "learning_rate": 1.51294514365492e-08, "logps/chosen": -2.0402495861053467, "logps/rejected": -2.3271498680114746, "loss": 3.2962, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.402498245239258, "rewards/margins": 2.8689987659454346, "rewards/rejected": -23.271495819091797, "step": 445 }, { "epoch": 0.779896013864818, "grad_norm": 440.98075927307, "learning_rate": 1.4061054239396452e-08, "logps/chosen": -2.774311065673828, "logps/rejected": -2.5872533321380615, "loss": 4.3524, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.743112564086914, "rewards/margins": -1.8705768585205078, "rewards/rejected": -25.872533798217773, "step": 450 }, { "epoch": 0.7885615251299827, "grad_norm": 249.3269527978748, "learning_rate": 1.302557532865155e-08, "logps/chosen": -2.2388012409210205, "logps/rejected": -2.5778894424438477, "loss": 3.1082, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.388011932373047, "rewards/margins": 3.390883207321167, "rewards/rejected": -25.778894424438477, "step": 455 }, { "epoch": 0.7972270363951474, "grad_norm": 266.5651240717741, "learning_rate": 1.2023963151483163e-08, "logps/chosen": -2.2709708213806152, "logps/rejected": -2.752668857574463, "loss": 2.9738, "rewards/accuracies": 1.0, "rewards/chosen": -22.7097110748291, "rewards/margins": 4.816981792449951, "rewards/rejected": -27.52669334411621, "step": 460 }, { "epoch": 0.8058925476603119, "grad_norm": 242.9598196737481, "learning_rate": 1.1057135134816897e-08, "logps/chosen": -2.198323965072632, "logps/rejected": -2.5944952964782715, "loss": 3.1251, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.98324203491211, "rewards/margins": 3.96171236038208, "rewards/rejected": -25.9449520111084, "step": 465 }, { "epoch": 0.8145580589254766, "grad_norm": 352.7538054645251, "learning_rate": 1.012597684501777e-08, "logps/chosen": -2.405252456665039, "logps/rejected": -2.7467832565307617, "loss": 2.9631, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.052526473999023, "rewards/margins": 3.415306806564331, "rewards/rejected": -27.467830657958984, "step": 470 }, { "epoch": 0.8232235701906413, "grad_norm": 419.88510678787657, "learning_rate": 9.231341176755487e-09, "logps/chosen": -2.7878851890563965, "logps/rejected": -2.999274730682373, "loss": 3.0581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.87885093688965, "rewards/margins": 2.113895893096924, "rewards/rejected": -29.992748260498047, "step": 475 }, { "epoch": 0.8318890814558059, "grad_norm": 374.57650865628955, "learning_rate": 8.3740475717954e-09, "logps/chosen": -2.368149518966675, "logps/rejected": -2.6514530181884766, "loss": 2.767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.681495666503906, "rewards/margins": 2.8330326080322266, "rewards/rejected": -26.5145263671875, "step": 480 }, { "epoch": 0.8405545927209706, "grad_norm": 298.2316161579765, "learning_rate": 7.554881268430563e-09, "logps/chosen": -1.9999809265136719, "logps/rejected": -2.5488007068634033, "loss": 2.6938, "rewards/accuracies": 1.0, "rewards/chosen": -19.99980926513672, "rewards/margins": 5.488197326660156, "rewards/rejected": -25.488004684448242, "step": 485 }, { "epoch": 0.8492201039861352, "grad_norm": 289.378607710532, "learning_rate": 6.774592582242567e-09, "logps/chosen": -2.1875576972961426, "logps/rejected": -2.7966582775115967, "loss": 2.2383, "rewards/accuracies": 1.0, "rewards/chosen": -21.87557601928711, "rewards/margins": 6.091004371643066, "rewards/rejected": -27.966583251953125, "step": 490 }, { "epoch": 0.8578856152512998, "grad_norm": 284.6211316368398, "learning_rate": 6.033896218849766e-09, "logps/chosen": -2.5260894298553467, "logps/rejected": -2.4975428581237793, "loss": 2.7359, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.260896682739258, "rewards/margins": -0.2854675352573395, "rewards/rejected": -24.97542953491211, "step": 495 }, { "epoch": 0.8665511265164645, "grad_norm": 450.49404572669596, "learning_rate": 5.3334706192724786e-09, "logps/chosen": -2.397015333175659, "logps/rejected": -2.851841449737549, "loss": 3.193, "rewards/accuracies": 1.0, "rewards/chosen": -23.97015380859375, "rewards/margins": 4.548262596130371, "rewards/rejected": -28.518413543701172, "step": 500 }, { "epoch": 0.8752166377816292, "grad_norm": 382.86348627674346, "learning_rate": 4.673957338514811e-09, "logps/chosen": -2.497405529022217, "logps/rejected": -2.8319692611694336, "loss": 2.438, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.974056243896484, "rewards/margins": 3.3456368446350098, "rewards/rejected": -28.319692611694336, "step": 505 }, { "epoch": 0.8838821490467937, "grad_norm": 169.16569455598628, "learning_rate": 4.055960457932106e-09, "logps/chosen": -2.318751811981201, "logps/rejected": -2.7315309047698975, "loss": 2.5218, "rewards/accuracies": 1.0, "rewards/chosen": -23.187519073486328, "rewards/margins": 4.127790451049805, "rewards/rejected": -27.315311431884766, "step": 510 }, { "epoch": 0.8925476603119584, "grad_norm": 290.65862840125214, "learning_rate": 3.4800460319225734e-09, "logps/chosen": -2.376023054122925, "logps/rejected": -2.822336196899414, "loss": 3.0831, "rewards/accuracies": 1.0, "rewards/chosen": -23.760231018066406, "rewards/margins": 4.463130950927734, "rewards/rejected": -28.22336196899414, "step": 515 }, { "epoch": 0.901213171577123, "grad_norm": 473.51636465627206, "learning_rate": 2.9467415694495624e-09, "logps/chosen": -2.086275100708008, "logps/rejected": -2.509833574295044, "loss": 2.7674, "rewards/accuracies": 1.0, "rewards/chosen": -20.86275291442871, "rewards/margins": 4.235583782196045, "rewards/rejected": -25.098339080810547, "step": 520 }, { "epoch": 0.9098786828422877, "grad_norm": 195.68912447867717, "learning_rate": 2.4565355508695816e-09, "logps/chosen": -2.3413867950439453, "logps/rejected": -3.0255393981933594, "loss": 2.3523, "rewards/accuracies": 1.0, "rewards/chosen": -23.413867950439453, "rewards/margins": 6.841525077819824, "rewards/rejected": -30.255395889282227, "step": 525 }, { "epoch": 0.9185441941074524, "grad_norm": 165.55622486042316, "learning_rate": 2.0098769805086456e-09, "logps/chosen": -2.413383722305298, "logps/rejected": -2.902367115020752, "loss": 2.0526, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.133834838867188, "rewards/margins": 4.889834880828857, "rewards/rejected": -29.023672103881836, "step": 530 }, { "epoch": 0.9272097053726169, "grad_norm": 271.2253606637343, "learning_rate": 1.6071749753965912e-09, "logps/chosen": -2.4116291999816895, "logps/rejected": -3.0955259799957275, "loss": 2.1614, "rewards/accuracies": 1.0, "rewards/chosen": -24.116291046142578, "rewards/margins": 6.838971138000488, "rewards/rejected": -30.95526123046875, "step": 535 }, { "epoch": 0.9358752166377816, "grad_norm": 222.81738320230136, "learning_rate": 1.2487983905362932e-09, "logps/chosen": -2.4959144592285156, "logps/rejected": -2.7256417274475098, "loss": 2.1184, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.959142684936523, "rewards/margins": 2.297271251678467, "rewards/rejected": -27.25641441345215, "step": 540 }, { "epoch": 0.9445407279029463, "grad_norm": 214.85286607093462, "learning_rate": 9.350754810507845e-10, "logps/chosen": -2.820622682571411, "logps/rejected": -3.021366596221924, "loss": 2.2908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.206228256225586, "rewards/margins": 2.007439136505127, "rewards/rejected": -30.213665008544922, "step": 545 }, { "epoch": 0.9532062391681109, "grad_norm": 211.8074649095587, "learning_rate": 6.662936015178977e-10, "logps/chosen": -2.4099273681640625, "logps/rejected": -2.8419957160949707, "loss": 3.6358, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.099275588989258, "rewards/margins": 4.3206787109375, "rewards/rejected": -28.419952392578125, "step": 550 }, { "epoch": 0.9618717504332756, "grad_norm": 378.71477984576666, "learning_rate": 4.4269894276779406e-10, "logps/chosen": -2.685300588607788, "logps/rejected": -3.123471736907959, "loss": 2.9005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.85300636291504, "rewards/margins": 4.381711006164551, "rewards/rejected": -31.234716415405273, "step": 555 }, { "epoch": 0.9705372616984402, "grad_norm": 288.4807564220741, "learning_rate": 2.6449630638438193e-10, "logps/chosen": -2.6586508750915527, "logps/rejected": -2.9211390018463135, "loss": 2.7155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.58650779724121, "rewards/margins": 2.624882221221924, "rewards/rejected": -29.211389541625977, "step": 560 }, { "epoch": 0.9792027729636048, "grad_norm": 271.5646477670488, "learning_rate": 1.3184891711727765e-10, "logps/chosen": -2.1007838249206543, "logps/rejected": -2.534923553466797, "loss": 2.8816, "rewards/accuracies": 1.0, "rewards/chosen": -21.007837295532227, "rewards/margins": 4.341399192810059, "rewards/rejected": -25.3492374420166, "step": 565 }, { "epoch": 0.9878682842287695, "grad_norm": 354.88743197085506, "learning_rate": 4.487827337604222e-11, "logps/chosen": -2.3186147212982178, "logps/rejected": -3.0023269653320312, "loss": 2.7364, "rewards/accuracies": 1.0, "rewards/chosen": -23.186147689819336, "rewards/margins": 6.837121486663818, "rewards/rejected": -30.023269653320312, "step": 570 }, { "epoch": 0.9965337954939342, "grad_norm": 184.69149045968697, "learning_rate": 3.664035943679033e-12, "logps/chosen": -2.4100213050842285, "logps/rejected": -2.7757315635681152, "loss": 3.0952, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.100210189819336, "rewards/margins": 3.6571030616760254, "rewards/rejected": -27.757314682006836, "step": 575 }, { "epoch": 1.0, "step": 577, "total_flos": 0.0, "train_loss": 9.667842355096774, "train_runtime": 2221.9837, "train_samples_per_second": 2.075, "train_steps_per_second": 0.26 } ], "logging_steps": 5, "max_steps": 577, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }