{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.7, "eval_steps": 500, "global_step": 1700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 1.0308921337127686, "learning_rate": 5e-09, "logits/chosen": 0.4729853868484497, "logits/rejected": 0.370886892080307, "logps/chosen": -173.5879364013672, "logps/rejected": -178.2545166015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.002, "grad_norm": 0.8464347720146179, "learning_rate": 1e-08, "logits/chosen": 0.9691154360771179, "logits/rejected": 0.2301868051290512, "logps/chosen": -312.7537841796875, "logps/rejected": -147.8146209716797, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.003, "grad_norm": 0.9007073044776917, "learning_rate": 1.5e-08, "logits/chosen": 0.9382452368736267, "logits/rejected": 0.09409143030643463, "logps/chosen": -246.36904907226562, "logps/rejected": -146.04879760742188, "loss": 0.6953, "rewards/accuracies": 0.125, "rewards/chosen": -0.004052639007568359, "rewards/margins": -0.0041182516142725945, "rewards/rejected": 6.561279587913305e-05, "step": 3 }, { "epoch": 0.004, "grad_norm": 0.9822142720222473, "learning_rate": 2e-08, "logits/chosen": 0.6596120595932007, "logits/rejected": -0.047199320048093796, "logps/chosen": -321.41180419921875, "logps/rejected": -165.54098510742188, "loss": 0.7123, "rewards/accuracies": 0.25, "rewards/chosen": -0.025841616094112396, "rewards/margins": -0.035054586827754974, "rewards/rejected": 0.009212970733642578, "step": 4 }, { "epoch": 0.005, "grad_norm": 1.3622119426727295, "learning_rate": 2.5e-08, "logits/chosen": 0.3201568126678467, "logits/rejected": 1.2954438924789429, "logps/chosen": -164.15830993652344, "logps/rejected": -314.04644775390625, "loss": 0.6754, "rewards/accuracies": 0.625, "rewards/chosen": -0.014291668310761452, "rewards/margins": 0.03856554254889488, "rewards/rejected": -0.052857208997011185, "step": 5 }, { "epoch": 0.006, "grad_norm": 0.7548319101333618, "learning_rate": 3e-08, "logits/chosen": 0.3662494122982025, "logits/rejected": 0.41041114926338196, "logps/chosen": -163.99026489257812, "logps/rejected": -200.36395263671875, "loss": 0.6679, "rewards/accuracies": 0.625, "rewards/chosen": 0.018863296136260033, "rewards/margins": 0.05273933336138725, "rewards/rejected": -0.03387603908777237, "step": 6 }, { "epoch": 0.007, "grad_norm": 1.2791181802749634, "learning_rate": 3.5e-08, "logits/chosen": 0.05459073930978775, "logits/rejected": 0.4691392183303833, "logps/chosen": -127.85208129882812, "logps/rejected": -278.8637390136719, "loss": 0.6941, "rewards/accuracies": 0.5, "rewards/chosen": -0.007275009527802467, "rewards/margins": -0.0008567832410335541, "rewards/rejected": -0.006418227683752775, "step": 7 }, { "epoch": 0.008, "grad_norm": 1.0980570316314697, "learning_rate": 4e-08, "logits/chosen": 0.5381655693054199, "logits/rejected": 0.1410129815340042, "logps/chosen": -255.92562866210938, "logps/rejected": -242.81265258789062, "loss": 0.7107, "rewards/accuracies": 0.5, "rewards/chosen": -0.040052104741334915, "rewards/margins": -0.03230130299925804, "rewards/rejected": -0.007750797085464001, "step": 8 }, { "epoch": 0.009, "grad_norm": 0.9415262341499329, "learning_rate": 4.5e-08, "logits/chosen": 0.5085548162460327, "logits/rejected": 1.014196753501892, "logps/chosen": -176.69046020507812, "logps/rejected": -265.6269226074219, "loss": 0.7337, "rewards/accuracies": 0.125, "rewards/chosen": -0.004787970334291458, "rewards/margins": -0.0773511454463005, "rewards/rejected": 0.07256316393613815, "step": 9 }, { "epoch": 0.01, "grad_norm": 0.9219750165939331, "learning_rate": 5e-08, "logits/chosen": 0.2244638204574585, "logits/rejected": 0.8504382371902466, "logps/chosen": -143.77114868164062, "logps/rejected": -268.7762145996094, "loss": 0.694, "rewards/accuracies": 0.5, "rewards/chosen": -0.002990530803799629, "rewards/margins": 0.0001066187396645546, "rewards/rejected": -0.0030971523374319077, "step": 10 }, { "epoch": 0.011, "grad_norm": 1.1202610731124878, "learning_rate": 5.4999999999999996e-08, "logits/chosen": 0.12036789953708649, "logits/rejected": 1.04570734500885, "logps/chosen": -141.6674041748047, "logps/rejected": -206.2694091796875, "loss": 0.7065, "rewards/accuracies": 0.25, "rewards/chosen": -0.01575503498315811, "rewards/margins": -0.02557559125125408, "rewards/rejected": 0.009820557199418545, "step": 11 }, { "epoch": 0.012, "grad_norm": 0.9519157409667969, "learning_rate": 6e-08, "logits/chosen": 0.9627233743667603, "logits/rejected": -0.004189133644104004, "logps/chosen": -282.91436767578125, "logps/rejected": -167.30120849609375, "loss": 0.6995, "rewards/accuracies": 0.625, "rewards/chosen": -0.0028006555512547493, "rewards/margins": -0.011238767765462399, "rewards/rejected": 0.008438109420239925, "step": 12 }, { "epoch": 0.013, "grad_norm": 0.7452525496482849, "learning_rate": 6.5e-08, "logits/chosen": 0.569416880607605, "logits/rejected": 0.20242811739444733, "logps/chosen": -182.47616577148438, "logps/rejected": -156.08114624023438, "loss": 0.662, "rewards/accuracies": 0.75, "rewards/chosen": 0.02531280741095543, "rewards/margins": 0.06497974693775177, "rewards/rejected": -0.03966693952679634, "step": 13 }, { "epoch": 0.014, "grad_norm": 0.9929474592208862, "learning_rate": 7e-08, "logits/chosen": 0.7005753517150879, "logits/rejected": 0.4341432750225067, "logps/chosen": -324.36175537109375, "logps/rejected": -211.28091430664062, "loss": 0.7112, "rewards/accuracies": 0.5, "rewards/chosen": -0.01895613968372345, "rewards/margins": -0.03357195854187012, "rewards/rejected": 0.014615822583436966, "step": 14 }, { "epoch": 0.015, "grad_norm": 0.9993263483047485, "learning_rate": 7.5e-08, "logits/chosen": 0.4990122616291046, "logits/rejected": 0.613381028175354, "logps/chosen": -216.69097900390625, "logps/rejected": -211.88699340820312, "loss": 0.6707, "rewards/accuracies": 0.625, "rewards/chosen": 0.02364644967019558, "rewards/margins": 0.04736337438225746, "rewards/rejected": -0.02371692657470703, "step": 15 }, { "epoch": 0.016, "grad_norm": 0.785152792930603, "learning_rate": 8e-08, "logits/chosen": 0.7535920143127441, "logits/rejected": 0.4144750237464905, "logps/chosen": -254.79269409179688, "logps/rejected": -170.35623168945312, "loss": 0.6702, "rewards/accuracies": 0.75, "rewards/chosen": 0.01926097646355629, "rewards/margins": 0.04847526550292969, "rewards/rejected": -0.029214289039373398, "step": 16 }, { "epoch": 0.017, "grad_norm": 1.0578926801681519, "learning_rate": 8.500000000000001e-08, "logits/chosen": 0.7777413129806519, "logits/rejected": 0.6671375036239624, "logps/chosen": -284.0263977050781, "logps/rejected": -189.15484619140625, "loss": 0.7129, "rewards/accuracies": 0.375, "rewards/chosen": -0.03240576013922691, "rewards/margins": -0.03487062081694603, "rewards/rejected": 0.0024648671969771385, "step": 17 }, { "epoch": 0.018, "grad_norm": 0.9720740914344788, "learning_rate": 9e-08, "logits/chosen": 0.3121270537376404, "logits/rejected": 0.6747240424156189, "logps/chosen": -147.3482666015625, "logps/rejected": -288.2359924316406, "loss": 0.6751, "rewards/accuracies": 0.625, "rewards/chosen": 0.0034391884692013264, "rewards/margins": 0.040085840970277786, "rewards/rejected": -0.03664665296673775, "step": 18 }, { "epoch": 0.019, "grad_norm": 0.919995129108429, "learning_rate": 9.499999999999999e-08, "logits/chosen": 0.6503831148147583, "logits/rejected": 0.3564409017562866, "logps/chosen": -274.7367248535156, "logps/rejected": -163.66087341308594, "loss": 0.7232, "rewards/accuracies": 0.125, "rewards/chosen": -0.041147902607917786, "rewards/margins": -0.05722065269947052, "rewards/rejected": 0.016072750091552734, "step": 19 }, { "epoch": 0.02, "grad_norm": 0.9424862861633301, "learning_rate": 1e-07, "logits/chosen": 1.0529050827026367, "logits/rejected": 0.21441280841827393, "logps/chosen": -238.2061309814453, "logps/rejected": -200.26480102539062, "loss": 0.6727, "rewards/accuracies": 0.625, "rewards/chosen": 0.027144338935613632, "rewards/margins": 0.04264220967888832, "rewards/rejected": -0.015497874468564987, "step": 20 }, { "epoch": 0.021, "grad_norm": 1.3195918798446655, "learning_rate": 1.0499999999999999e-07, "logits/chosen": 0.11141542345285416, "logits/rejected": 0.4812345504760742, "logps/chosen": -118.89556884765625, "logps/rejected": -200.41943359375, "loss": 0.6712, "rewards/accuracies": 0.625, "rewards/chosen": 0.028679752722382545, "rewards/margins": 0.045675091445446014, "rewards/rejected": -0.01699533686041832, "step": 21 }, { "epoch": 0.022, "grad_norm": 0.9004831314086914, "learning_rate": 1.0999999999999999e-07, "logits/chosen": 0.27431464195251465, "logits/rejected": 0.500339150428772, "logps/chosen": -211.56443786621094, "logps/rejected": -165.65713500976562, "loss": 0.6749, "rewards/accuracies": 0.75, "rewards/chosen": 0.023714400827884674, "rewards/margins": 0.03718409687280655, "rewards/rejected": -0.0134696951135993, "step": 22 }, { "epoch": 0.023, "grad_norm": 0.9678566455841064, "learning_rate": 1.15e-07, "logits/chosen": 0.8343542814254761, "logits/rejected": 0.7284224033355713, "logps/chosen": -250.7786407470703, "logps/rejected": -222.6984405517578, "loss": 0.6999, "rewards/accuracies": 0.375, "rewards/chosen": -0.0062184324488043785, "rewards/margins": -0.012155819684267044, "rewards/rejected": 0.005937385372817516, "step": 23 }, { "epoch": 0.024, "grad_norm": 0.9012037515640259, "learning_rate": 1.2e-07, "logits/chosen": 0.3248209059238434, "logits/rejected": 0.5673588514328003, "logps/chosen": -257.42401123046875, "logps/rejected": -195.74240112304688, "loss": 0.7025, "rewards/accuracies": 0.25, "rewards/chosen": 0.0016888617537915707, "rewards/margins": -0.017475413158535957, "rewards/rejected": 0.019164277240633965, "step": 24 }, { "epoch": 0.025, "grad_norm": 1.2061468362808228, "learning_rate": 1.25e-07, "logits/chosen": 0.6874338984489441, "logits/rejected": 0.5379788279533386, "logps/chosen": -222.4801483154297, "logps/rejected": -189.00457763671875, "loss": 0.7092, "rewards/accuracies": 0.375, "rewards/chosen": 0.04668769985437393, "rewards/margins": -0.025967881083488464, "rewards/rejected": 0.0726555809378624, "step": 25 }, { "epoch": 0.026, "grad_norm": 1.0132747888565063, "learning_rate": 1.3e-07, "logits/chosen": 0.5130616426467896, "logits/rejected": 0.738013744354248, "logps/chosen": -170.4252471923828, "logps/rejected": -201.4698028564453, "loss": 0.6971, "rewards/accuracies": 0.5, "rewards/chosen": -0.03638296201825142, "rewards/margins": -0.005205247551202774, "rewards/rejected": -0.031177710741758347, "step": 26 }, { "epoch": 0.027, "grad_norm": 1.0084081888198853, "learning_rate": 1.35e-07, "logits/chosen": 0.6908137798309326, "logits/rejected": 0.004685647785663605, "logps/chosen": -279.24517822265625, "logps/rejected": -159.4570770263672, "loss": 0.7097, "rewards/accuracies": 0.375, "rewards/chosen": -0.03437165915966034, "rewards/margins": -0.030690863728523254, "rewards/rejected": -0.0036808010190725327, "step": 27 }, { "epoch": 0.028, "grad_norm": 0.9576644897460938, "learning_rate": 1.4e-07, "logits/chosen": 0.35767388343811035, "logits/rejected": 0.4049478769302368, "logps/chosen": -154.50901794433594, "logps/rejected": -170.79129028320312, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": -0.007301711477339268, "rewards/margins": 0.0017983447760343552, "rewards/rejected": -0.009100056253373623, "step": 28 }, { "epoch": 0.029, "grad_norm": 1.1507296562194824, "learning_rate": 1.45e-07, "logits/chosen": 0.0699136033654213, "logits/rejected": 0.7621070146560669, "logps/chosen": -156.8745880126953, "logps/rejected": -223.19482421875, "loss": 0.7243, "rewards/accuracies": 0.125, "rewards/chosen": 0.004231643863022327, "rewards/margins": -0.05945014953613281, "rewards/rejected": 0.06368179619312286, "step": 29 }, { "epoch": 0.03, "grad_norm": 1.0412936210632324, "learning_rate": 1.5e-07, "logits/chosen": 0.32147902250289917, "logits/rejected": 0.5553557872772217, "logps/chosen": -171.48687744140625, "logps/rejected": -188.0211181640625, "loss": 0.7366, "rewards/accuracies": 0.25, "rewards/chosen": -0.0524907112121582, "rewards/margins": -0.08379249274730682, "rewards/rejected": 0.03130178526043892, "step": 30 }, { "epoch": 0.031, "grad_norm": 1.047019362449646, "learning_rate": 1.55e-07, "logits/chosen": 0.4191957712173462, "logits/rejected": 0.7505618929862976, "logps/chosen": -228.24305725097656, "logps/rejected": -239.7966766357422, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": -0.03303880617022514, "rewards/margins": -0.003995990380644798, "rewards/rejected": -0.029042817652225494, "step": 31 }, { "epoch": 0.032, "grad_norm": 1.0373077392578125, "learning_rate": 1.6e-07, "logits/chosen": 0.8680647611618042, "logits/rejected": 0.46923577785491943, "logps/chosen": -301.78631591796875, "logps/rejected": -166.162353515625, "loss": 0.6816, "rewards/accuracies": 0.5, "rewards/chosen": 0.017630958929657936, "rewards/margins": 0.025303076952695847, "rewards/rejected": -0.007672118954360485, "step": 32 }, { "epoch": 0.033, "grad_norm": 1.1108970642089844, "learning_rate": 1.65e-07, "logits/chosen": 0.07721806317567825, "logits/rejected": 0.5161712169647217, "logps/chosen": -185.68460083007812, "logps/rejected": -246.47381591796875, "loss": 0.6994, "rewards/accuracies": 0.5, "rewards/chosen": -0.010094165802001953, "rewards/margins": -0.01102600246667862, "rewards/rejected": 0.0009318357333540916, "step": 33 }, { "epoch": 0.034, "grad_norm": 0.9075908064842224, "learning_rate": 1.7000000000000001e-07, "logits/chosen": 0.8515866994857788, "logits/rejected": 0.5081568360328674, "logps/chosen": -266.22296142578125, "logps/rejected": -195.5361328125, "loss": 0.714, "rewards/accuracies": 0.25, "rewards/chosen": 0.0040839193388819695, "rewards/margins": -0.04021826013922691, "rewards/rejected": 0.04430217668414116, "step": 34 }, { "epoch": 0.035, "grad_norm": 0.8546167612075806, "learning_rate": 1.75e-07, "logits/chosen": 0.4797782301902771, "logits/rejected": 0.5928432941436768, "logps/chosen": -192.8991241455078, "logps/rejected": -211.36068725585938, "loss": 0.6961, "rewards/accuracies": 0.5, "rewards/chosen": -0.006035994738340378, "rewards/margins": -0.0052916519343853, "rewards/rejected": -0.0007443428039550781, "step": 35 }, { "epoch": 0.036, "grad_norm": 0.895753026008606, "learning_rate": 1.8e-07, "logits/chosen": 0.6829379200935364, "logits/rejected": 0.6323165893554688, "logps/chosen": -240.95333862304688, "logps/rejected": -139.59239196777344, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.034822940826416016, "rewards/margins": 0.001462672371417284, "rewards/rejected": 0.03336026892066002, "step": 36 }, { "epoch": 0.037, "grad_norm": 0.936718761920929, "learning_rate": 1.85e-07, "logits/chosen": 0.3450223505496979, "logits/rejected": 0.3304767608642578, "logps/chosen": -203.22412109375, "logps/rejected": -190.06491088867188, "loss": 0.7088, "rewards/accuracies": 0.125, "rewards/chosen": -0.0356050506234169, "rewards/margins": -0.02983880043029785, "rewards/rejected": -0.0057662492617964745, "step": 37 }, { "epoch": 0.038, "grad_norm": 0.9401094913482666, "learning_rate": 1.8999999999999998e-07, "logits/chosen": 0.21520234644412994, "logits/rejected": 0.573419988155365, "logps/chosen": -189.58175659179688, "logps/rejected": -241.5142059326172, "loss": 0.7096, "rewards/accuracies": 0.625, "rewards/chosen": 0.039638713002204895, "rewards/margins": -0.02919158898293972, "rewards/rejected": 0.06883029639720917, "step": 38 }, { "epoch": 0.039, "grad_norm": 0.9499339461326599, "learning_rate": 1.9499999999999999e-07, "logits/chosen": 0.7199922800064087, "logits/rejected": 0.11692295968532562, "logps/chosen": -194.4860076904297, "logps/rejected": -167.94607543945312, "loss": 0.669, "rewards/accuracies": 0.5, "rewards/chosen": 0.047289468348026276, "rewards/margins": 0.05173635110259056, "rewards/rejected": -0.004446889273822308, "step": 39 }, { "epoch": 0.04, "grad_norm": 1.0241577625274658, "learning_rate": 2e-07, "logits/chosen": 0.6722542643547058, "logits/rejected": 0.823941707611084, "logps/chosen": -263.2477111816406, "logps/rejected": -185.41275024414062, "loss": 0.7191, "rewards/accuracies": 0.375, "rewards/chosen": -0.027885816991329193, "rewards/margins": -0.048616502434015274, "rewards/rejected": 0.02073068544268608, "step": 40 }, { "epoch": 0.041, "grad_norm": 0.8532500267028809, "learning_rate": 2.0499999999999997e-07, "logits/chosen": 0.8187202215194702, "logits/rejected": 0.3506469428539276, "logps/chosen": -273.02984619140625, "logps/rejected": -154.79466247558594, "loss": 0.7019, "rewards/accuracies": 0.5, "rewards/chosen": -0.02523527294397354, "rewards/margins": -0.016391132026910782, "rewards/rejected": -0.008844136260449886, "step": 41 }, { "epoch": 0.042, "grad_norm": 0.8844724297523499, "learning_rate": 2.0999999999999997e-07, "logits/chosen": 0.6985961198806763, "logits/rejected": 0.9443420767784119, "logps/chosen": -194.65476989746094, "logps/rejected": -179.43881225585938, "loss": 0.7186, "rewards/accuracies": 0.125, "rewards/chosen": -0.020079804584383965, "rewards/margins": -0.050048258155584335, "rewards/rejected": 0.02996845170855522, "step": 42 }, { "epoch": 0.043, "grad_norm": 0.8850707411766052, "learning_rate": 2.1499999999999998e-07, "logits/chosen": 0.7509806156158447, "logits/rejected": 0.20345763862133026, "logps/chosen": -212.66683959960938, "logps/rejected": -176.9527587890625, "loss": 0.718, "rewards/accuracies": 0.125, "rewards/chosen": -0.004851436708122492, "rewards/margins": -0.048969365656375885, "rewards/rejected": 0.04411792755126953, "step": 43 }, { "epoch": 0.044, "grad_norm": 1.093645691871643, "learning_rate": 2.1999999999999998e-07, "logits/chosen": 0.36265093088150024, "logits/rejected": -0.1168452650308609, "logps/chosen": -194.133544921875, "logps/rejected": -145.99517822265625, "loss": 0.6907, "rewards/accuracies": 0.625, "rewards/chosen": 0.0056983474642038345, "rewards/margins": 0.007354403845965862, "rewards/rejected": -0.0016560549847781658, "step": 44 }, { "epoch": 0.045, "grad_norm": 0.9933081269264221, "learning_rate": 2.25e-07, "logits/chosen": 0.16107098758220673, "logits/rejected": 0.8569546341896057, "logps/chosen": -193.62020874023438, "logps/rejected": -187.57081604003906, "loss": 0.7139, "rewards/accuracies": 0.125, "rewards/chosen": -0.01730518229305744, "rewards/margins": -0.04016360640525818, "rewards/rejected": 0.022858429700136185, "step": 45 }, { "epoch": 0.046, "grad_norm": 1.0277868509292603, "learning_rate": 2.3e-07, "logits/chosen": 0.7980355024337769, "logits/rejected": 0.40779587626457214, "logps/chosen": -261.631591796875, "logps/rejected": -205.68359375, "loss": 0.6854, "rewards/accuracies": 0.625, "rewards/chosen": 0.04246931150555611, "rewards/margins": 0.017904091626405716, "rewards/rejected": 0.02456521987915039, "step": 46 }, { "epoch": 0.047, "grad_norm": 0.916858434677124, "learning_rate": 2.3499999999999997e-07, "logits/chosen": 0.16089436411857605, "logits/rejected": 0.8079207539558411, "logps/chosen": -178.353271484375, "logps/rejected": -250.08343505859375, "loss": 0.6981, "rewards/accuracies": 0.5, "rewards/chosen": -0.0217850673943758, "rewards/margins": -0.006749441847205162, "rewards/rejected": -0.015035629272460938, "step": 47 }, { "epoch": 0.048, "grad_norm": 0.9144095182418823, "learning_rate": 2.4e-07, "logits/chosen": 0.19950537383556366, "logits/rejected": 0.04468570277094841, "logps/chosen": -173.4908447265625, "logps/rejected": -157.77500915527344, "loss": 0.7058, "rewards/accuracies": 0.375, "rewards/chosen": -0.00909871980547905, "rewards/margins": -0.023751353845000267, "rewards/rejected": 0.014652634039521217, "step": 48 }, { "epoch": 0.049, "grad_norm": 1.0937427282333374, "learning_rate": 2.45e-07, "logits/chosen": 0.48092934489250183, "logits/rejected": -0.12963053584098816, "logps/chosen": -394.088623046875, "logps/rejected": -224.62535095214844, "loss": 0.6998, "rewards/accuracies": 0.375, "rewards/chosen": 0.014795971103012562, "rewards/margins": -0.011305142194032669, "rewards/rejected": 0.026101112365722656, "step": 49 }, { "epoch": 0.05, "grad_norm": 0.8507908582687378, "learning_rate": 2.5e-07, "logits/chosen": 0.6645485758781433, "logits/rejected": 0.6458091735839844, "logps/chosen": -299.6480712890625, "logps/rejected": -164.16807556152344, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": -0.012771034613251686, "rewards/margins": -0.005171201191842556, "rewards/rejected": -0.007599831558763981, "step": 50 }, { "epoch": 0.051, "grad_norm": 1.1392483711242676, "learning_rate": 2.55e-07, "logits/chosen": 0.7092135548591614, "logits/rejected": 0.17535598576068878, "logps/chosen": -304.66546630859375, "logps/rejected": -171.13278198242188, "loss": 0.7102, "rewards/accuracies": 0.5, "rewards/chosen": -0.03840198367834091, "rewards/margins": -0.03232994303107262, "rewards/rejected": -0.006072043441236019, "step": 51 }, { "epoch": 0.052, "grad_norm": 1.0500819683074951, "learning_rate": 2.6e-07, "logits/chosen": 1.018998384475708, "logits/rejected": 0.6269283294677734, "logps/chosen": -266.6947937011719, "logps/rejected": -210.14138793945312, "loss": 0.6982, "rewards/accuracies": 0.625, "rewards/chosen": -0.012327386066317558, "rewards/margins": -0.008932494558393955, "rewards/rejected": -0.003394890110939741, "step": 52 }, { "epoch": 0.053, "grad_norm": 1.1637508869171143, "learning_rate": 2.65e-07, "logits/chosen": 0.5379363894462585, "logits/rejected": 0.5696307420730591, "logps/chosen": -189.79730224609375, "logps/rejected": -309.75140380859375, "loss": 0.6855, "rewards/accuracies": 0.5, "rewards/chosen": -0.008027076721191406, "rewards/margins": 0.018260952085256577, "rewards/rejected": -0.02628803253173828, "step": 53 }, { "epoch": 0.054, "grad_norm": 1.079467535018921, "learning_rate": 2.7e-07, "logits/chosen": 0.6560742259025574, "logits/rejected": 0.7789033651351929, "logps/chosen": -217.6083526611328, "logps/rejected": -209.48594665527344, "loss": 0.6836, "rewards/accuracies": 0.5, "rewards/chosen": -0.0029646879993379116, "rewards/margins": 0.022043894976377487, "rewards/rejected": -0.025008583441376686, "step": 54 }, { "epoch": 0.055, "grad_norm": 0.9360901117324829, "learning_rate": 2.75e-07, "logits/chosen": 0.6357616186141968, "logits/rejected": 0.45001035928726196, "logps/chosen": -187.12559509277344, "logps/rejected": -138.97802734375, "loss": 0.7221, "rewards/accuracies": 0.25, "rewards/chosen": -0.04264955222606659, "rewards/margins": -0.05616731196641922, "rewards/rejected": 0.01351776160299778, "step": 55 }, { "epoch": 0.056, "grad_norm": 0.8942244648933411, "learning_rate": 2.8e-07, "logits/chosen": 0.11582612246274948, "logits/rejected": 0.5354433655738831, "logps/chosen": -185.9103546142578, "logps/rejected": -178.12387084960938, "loss": 0.7109, "rewards/accuracies": 0.25, "rewards/chosen": -0.004523754585534334, "rewards/margins": -0.03415841981768608, "rewards/rejected": 0.029634665697813034, "step": 56 }, { "epoch": 0.057, "grad_norm": 0.942110002040863, "learning_rate": 2.8499999999999997e-07, "logits/chosen": 0.46307361125946045, "logits/rejected": 0.45508962869644165, "logps/chosen": -161.59872436523438, "logps/rejected": -207.33566284179688, "loss": 0.6949, "rewards/accuracies": 0.5, "rewards/chosen": -0.022051239386200905, "rewards/margins": 0.00028734467923641205, "rewards/rejected": -0.022338582202792168, "step": 57 }, { "epoch": 0.058, "grad_norm": 1.0877649784088135, "learning_rate": 2.9e-07, "logits/chosen": 0.3243567943572998, "logits/rejected": -0.017475523054599762, "logps/chosen": -267.06414794921875, "logps/rejected": -170.3492431640625, "loss": 0.7205, "rewards/accuracies": 0.125, "rewards/chosen": -0.0020663272589445114, "rewards/margins": -0.05323515087366104, "rewards/rejected": 0.05116882920265198, "step": 58 }, { "epoch": 0.059, "grad_norm": 0.8543508052825928, "learning_rate": 2.95e-07, "logits/chosen": 0.6892141103744507, "logits/rejected": 0.8993517160415649, "logps/chosen": -211.1507110595703, "logps/rejected": -195.83462524414062, "loss": 0.6939, "rewards/accuracies": 0.5, "rewards/chosen": -0.005979823414236307, "rewards/margins": 0.0005870824679732323, "rewards/rejected": -0.006566905882209539, "step": 59 }, { "epoch": 0.06, "grad_norm": 1.254030704498291, "learning_rate": 3e-07, "logits/chosen": 0.3917055130004883, "logits/rejected": -0.0881551057100296, "logps/chosen": -341.82550048828125, "logps/rejected": -173.90548706054688, "loss": 0.714, "rewards/accuracies": 0.25, "rewards/chosen": 0.006498241797089577, "rewards/margins": -0.04009418934583664, "rewards/rejected": 0.04659242555499077, "step": 60 }, { "epoch": 0.061, "grad_norm": 1.046561598777771, "learning_rate": 3.05e-07, "logits/chosen": 1.2933306694030762, "logits/rejected": 0.21294373273849487, "logps/chosen": -335.2054748535156, "logps/rejected": -169.60699462890625, "loss": 0.6679, "rewards/accuracies": 0.5, "rewards/chosen": 0.024694254621863365, "rewards/margins": 0.05475463718175888, "rewards/rejected": -0.030060388147830963, "step": 61 }, { "epoch": 0.062, "grad_norm": 0.8746111989021301, "learning_rate": 3.1e-07, "logits/chosen": 0.6644487380981445, "logits/rejected": 0.8520562648773193, "logps/chosen": -235.44107055664062, "logps/rejected": -186.63900756835938, "loss": 0.6818, "rewards/accuracies": 0.625, "rewards/chosen": 0.020098019391298294, "rewards/margins": 0.02356548421084881, "rewards/rejected": -0.003467465750873089, "step": 62 }, { "epoch": 0.063, "grad_norm": 1.202088713645935, "learning_rate": 3.15e-07, "logits/chosen": 0.1677904725074768, "logits/rejected": 0.6827667951583862, "logps/chosen": -115.25525665283203, "logps/rejected": -197.7379150390625, "loss": 0.6611, "rewards/accuracies": 0.75, "rewards/chosen": 0.0640714168548584, "rewards/margins": 0.0667632520198822, "rewards/rejected": -0.002691841684281826, "step": 63 }, { "epoch": 0.064, "grad_norm": 0.9602220058441162, "learning_rate": 3.2e-07, "logits/chosen": -0.11732780933380127, "logits/rejected": 0.9438371658325195, "logps/chosen": -143.0791778564453, "logps/rejected": -251.417236328125, "loss": 0.6849, "rewards/accuracies": 0.375, "rewards/chosen": 0.02147979661822319, "rewards/margins": 0.017765996977686882, "rewards/rejected": 0.0037137987092137337, "step": 64 }, { "epoch": 0.065, "grad_norm": 0.9338053464889526, "learning_rate": 3.25e-07, "logits/chosen": 0.1599784642457962, "logits/rejected": 0.3146834373474121, "logps/chosen": -152.7606964111328, "logps/rejected": -173.37094116210938, "loss": 0.7405, "rewards/accuracies": 0.25, "rewards/chosen": -0.030649565160274506, "rewards/margins": -0.08942098915576935, "rewards/rejected": 0.058771420270204544, "step": 65 }, { "epoch": 0.066, "grad_norm": 0.9572187066078186, "learning_rate": 3.3e-07, "logits/chosen": 0.47518032789230347, "logits/rejected": 0.9148991107940674, "logps/chosen": -239.2025604248047, "logps/rejected": -209.71896362304688, "loss": 0.6915, "rewards/accuracies": 0.25, "rewards/chosen": 0.009760047309100628, "rewards/margins": 0.005901100113987923, "rewards/rejected": 0.0038589476607739925, "step": 66 }, { "epoch": 0.067, "grad_norm": 0.987013041973114, "learning_rate": 3.35e-07, "logits/chosen": 0.5210095047950745, "logits/rejected": 0.5648468732833862, "logps/chosen": -232.1898956298828, "logps/rejected": -171.57342529296875, "loss": 0.6861, "rewards/accuracies": 0.625, "rewards/chosen": 0.027416419237852097, "rewards/margins": 0.015152743086218834, "rewards/rejected": 0.012263678014278412, "step": 67 }, { "epoch": 0.068, "grad_norm": 0.895359218120575, "learning_rate": 3.4000000000000003e-07, "logits/chosen": 0.918536365032196, "logits/rejected": 0.36781296133995056, "logps/chosen": -286.7901306152344, "logps/rejected": -158.02426147460938, "loss": 0.7226, "rewards/accuracies": 0.375, "rewards/chosen": -0.025365255773067474, "rewards/margins": -0.05500602722167969, "rewards/rejected": 0.029640767723321915, "step": 68 }, { "epoch": 0.069, "grad_norm": 1.1199206113815308, "learning_rate": 3.45e-07, "logits/chosen": 0.6423901319503784, "logits/rejected": 1.3478939533233643, "logps/chosen": -207.56170654296875, "logps/rejected": -256.2862854003906, "loss": 0.7156, "rewards/accuracies": 0.375, "rewards/chosen": -0.031380653381347656, "rewards/margins": -0.04156980663537979, "rewards/rejected": 0.010189153254032135, "step": 69 }, { "epoch": 0.07, "grad_norm": 0.7943235635757446, "learning_rate": 3.5e-07, "logits/chosen": 0.5906227827072144, "logits/rejected": 0.7410835027694702, "logps/chosen": -209.1223602294922, "logps/rejected": -180.31021118164062, "loss": 0.6749, "rewards/accuracies": 0.625, "rewards/chosen": 0.019616983830928802, "rewards/margins": 0.03939247131347656, "rewards/rejected": -0.01977548561990261, "step": 70 }, { "epoch": 0.071, "grad_norm": 1.1208845376968384, "learning_rate": 3.55e-07, "logits/chosen": 0.2600058317184448, "logits/rejected": 0.9456261396408081, "logps/chosen": -191.2197265625, "logps/rejected": -233.64004516601562, "loss": 0.6851, "rewards/accuracies": 0.5, "rewards/chosen": 0.026430128142237663, "rewards/margins": 0.020038411021232605, "rewards/rejected": 0.006391718052327633, "step": 71 }, { "epoch": 0.072, "grad_norm": 0.9633866548538208, "learning_rate": 3.6e-07, "logits/chosen": 0.5714065432548523, "logits/rejected": 0.6857787370681763, "logps/chosen": -205.814453125, "logps/rejected": -217.43885803222656, "loss": 0.7031, "rewards/accuracies": 0.25, "rewards/chosen": -0.003326701931655407, "rewards/margins": -0.016319608315825462, "rewards/rejected": 0.01299290731549263, "step": 72 }, { "epoch": 0.073, "grad_norm": 1.1186124086380005, "learning_rate": 3.65e-07, "logits/chosen": 0.33976659178733826, "logits/rejected": 0.2543525993824005, "logps/chosen": -234.9783935546875, "logps/rejected": -191.3319091796875, "loss": 0.7164, "rewards/accuracies": 0.25, "rewards/chosen": -0.03504962846636772, "rewards/margins": -0.04400758817791939, "rewards/rejected": 0.008957957848906517, "step": 73 }, { "epoch": 0.074, "grad_norm": 0.9047127366065979, "learning_rate": 3.7e-07, "logits/chosen": 0.1509988158941269, "logits/rejected": 0.7285292148590088, "logps/chosen": -163.8496856689453, "logps/rejected": -223.2261505126953, "loss": 0.6925, "rewards/accuracies": 0.625, "rewards/chosen": -0.004609203897416592, "rewards/margins": 0.0031506530940532684, "rewards/rejected": -0.007759857922792435, "step": 74 }, { "epoch": 0.075, "grad_norm": 1.0231198072433472, "learning_rate": 3.75e-07, "logits/chosen": 0.7632989287376404, "logits/rejected": 0.5625308752059937, "logps/chosen": -285.361328125, "logps/rejected": -156.09629821777344, "loss": 0.6972, "rewards/accuracies": 0.25, "rewards/chosen": -0.01373529527336359, "rewards/margins": -0.006705046631395817, "rewards/rejected": -0.0070302472449839115, "step": 75 }, { "epoch": 0.076, "grad_norm": 0.923767626285553, "learning_rate": 3.7999999999999996e-07, "logits/chosen": 0.5444830060005188, "logits/rejected": 0.1947866529226303, "logps/chosen": -172.4898681640625, "logps/rejected": -165.986572265625, "loss": 0.7036, "rewards/accuracies": 0.5, "rewards/chosen": -0.013919448480010033, "rewards/margins": -0.01862926408648491, "rewards/rejected": 0.004709814675152302, "step": 76 }, { "epoch": 0.077, "grad_norm": 0.8898613452911377, "learning_rate": 3.8499999999999997e-07, "logits/chosen": 0.5218280553817749, "logits/rejected": 0.41404852271080017, "logps/chosen": -239.25608825683594, "logps/rejected": -152.665771484375, "loss": 0.7031, "rewards/accuracies": 0.375, "rewards/chosen": -0.007730007171630859, "rewards/margins": -0.01844329945743084, "rewards/rejected": 0.01071329228579998, "step": 77 }, { "epoch": 0.078, "grad_norm": 0.962451696395874, "learning_rate": 3.8999999999999997e-07, "logits/chosen": 0.41357988119125366, "logits/rejected": 0.6733251810073853, "logps/chosen": -188.4990692138672, "logps/rejected": -215.06260681152344, "loss": 0.6839, "rewards/accuracies": 0.5, "rewards/chosen": -0.03272666782140732, "rewards/margins": 0.019771957769989967, "rewards/rejected": -0.052498627454042435, "step": 78 }, { "epoch": 0.079, "grad_norm": 0.9946324825286865, "learning_rate": 3.95e-07, "logits/chosen": 0.20090560615062714, "logits/rejected": 0.626895546913147, "logps/chosen": -177.5116729736328, "logps/rejected": -177.72470092773438, "loss": 0.6951, "rewards/accuracies": 0.375, "rewards/chosen": 0.0069602965377271175, "rewards/margins": -0.00246419757604599, "rewards/rejected": 0.00942449551075697, "step": 79 }, { "epoch": 0.08, "grad_norm": 1.1888736486434937, "learning_rate": 4e-07, "logits/chosen": 0.22232143580913544, "logits/rejected": 0.708989679813385, "logps/chosen": -129.25509643554688, "logps/rejected": -201.85366821289062, "loss": 0.6938, "rewards/accuracies": 0.5, "rewards/chosen": -0.008580397814512253, "rewards/margins": 0.00012455042451620102, "rewards/rejected": -0.008704949170351028, "step": 80 }, { "epoch": 0.081, "grad_norm": 1.2536020278930664, "learning_rate": 4.05e-07, "logits/chosen": 0.39407873153686523, "logits/rejected": 0.9598340392112732, "logps/chosen": -135.3942413330078, "logps/rejected": -221.0856475830078, "loss": 0.6888, "rewards/accuracies": 0.625, "rewards/chosen": -0.011825226247310638, "rewards/margins": 0.011064101941883564, "rewards/rejected": -0.022889329120516777, "step": 81 }, { "epoch": 0.082, "grad_norm": 0.9947065711021423, "learning_rate": 4.0999999999999994e-07, "logits/chosen": 0.1303069293498993, "logits/rejected": -0.05107775703072548, "logps/chosen": -203.45480346679688, "logps/rejected": -153.215576171875, "loss": 0.6999, "rewards/accuracies": 0.375, "rewards/chosen": 0.001056002452969551, "rewards/margins": -0.010976506397128105, "rewards/rejected": 0.012032508850097656, "step": 82 }, { "epoch": 0.083, "grad_norm": 1.3932172060012817, "learning_rate": 4.1499999999999994e-07, "logits/chosen": 0.45824819803237915, "logits/rejected": 0.2638169527053833, "logps/chosen": -218.52696228027344, "logps/rejected": -170.40899658203125, "loss": 0.6749, "rewards/accuracies": 0.625, "rewards/chosen": 0.032052040100097656, "rewards/margins": 0.03870754316449165, "rewards/rejected": -0.006655503064393997, "step": 83 }, { "epoch": 0.084, "grad_norm": 0.9318606853485107, "learning_rate": 4.1999999999999995e-07, "logits/chosen": 1.0916774272918701, "logits/rejected": 0.4020550549030304, "logps/chosen": -320.07562255859375, "logps/rejected": -173.65342712402344, "loss": 0.6661, "rewards/accuracies": 0.625, "rewards/chosen": 0.005606460850685835, "rewards/margins": 0.05568142235279083, "rewards/rejected": -0.05007496103644371, "step": 84 }, { "epoch": 0.085, "grad_norm": 0.9118911623954773, "learning_rate": 4.2499999999999995e-07, "logits/chosen": 0.5922536849975586, "logits/rejected": 0.3963426649570465, "logps/chosen": -305.4964599609375, "logps/rejected": -185.83351135253906, "loss": 0.6789, "rewards/accuracies": 0.625, "rewards/chosen": 0.034504130482673645, "rewards/margins": 0.030281497165560722, "rewards/rejected": 0.004222630988806486, "step": 85 }, { "epoch": 0.086, "grad_norm": 0.9579813480377197, "learning_rate": 4.2999999999999996e-07, "logits/chosen": 0.16672705113887787, "logits/rejected": 0.2483358085155487, "logps/chosen": -195.25241088867188, "logps/rejected": -182.67071533203125, "loss": 0.674, "rewards/accuracies": 0.625, "rewards/chosen": 0.02576274797320366, "rewards/margins": 0.04013547673821449, "rewards/rejected": -0.014372730627655983, "step": 86 }, { "epoch": 0.087, "grad_norm": 0.8937185406684875, "learning_rate": 4.3499999999999996e-07, "logits/chosen": 0.7490791082382202, "logits/rejected": 0.7007721066474915, "logps/chosen": -261.4656982421875, "logps/rejected": -178.0865936279297, "loss": 0.6822, "rewards/accuracies": 0.75, "rewards/chosen": 0.03612957149744034, "rewards/margins": 0.02289094775915146, "rewards/rejected": 0.013238620944321156, "step": 87 }, { "epoch": 0.088, "grad_norm": 0.9669702053070068, "learning_rate": 4.3999999999999997e-07, "logits/chosen": 0.6714414358139038, "logits/rejected": 0.4412405788898468, "logps/chosen": -201.61395263671875, "logps/rejected": -227.375732421875, "loss": 0.7093, "rewards/accuracies": 0.375, "rewards/chosen": 0.00041951984167099, "rewards/margins": -0.029848389327526093, "rewards/rejected": 0.030267905443906784, "step": 88 }, { "epoch": 0.089, "grad_norm": 0.8729268312454224, "learning_rate": 4.45e-07, "logits/chosen": 0.5733513832092285, "logits/rejected": 1.0163509845733643, "logps/chosen": -209.19134521484375, "logps/rejected": -219.1939239501953, "loss": 0.6978, "rewards/accuracies": 0.375, "rewards/chosen": 0.02093029022216797, "rewards/margins": -0.00837860070168972, "rewards/rejected": 0.029308892786502838, "step": 89 }, { "epoch": 0.09, "grad_norm": 0.9547777771949768, "learning_rate": 4.5e-07, "logits/chosen": 0.21368414163589478, "logits/rejected": -0.06810185313224792, "logps/chosen": -194.08543395996094, "logps/rejected": -202.89324951171875, "loss": 0.7041, "rewards/accuracies": 0.375, "rewards/chosen": -0.012234211899340153, "rewards/margins": -0.018740464001893997, "rewards/rejected": 0.0065062521025538445, "step": 90 }, { "epoch": 0.091, "grad_norm": 1.2431501150131226, "learning_rate": 4.55e-07, "logits/chosen": 0.15157419443130493, "logits/rejected": 0.3486846089363098, "logps/chosen": -140.67019653320312, "logps/rejected": -216.38217163085938, "loss": 0.6836, "rewards/accuracies": 0.875, "rewards/chosen": -0.02292623557150364, "rewards/margins": 0.021136470139026642, "rewards/rejected": -0.04406271129846573, "step": 91 }, { "epoch": 0.092, "grad_norm": 0.9262710213661194, "learning_rate": 4.6e-07, "logits/chosen": 0.016119547188282013, "logits/rejected": 0.6565425395965576, "logps/chosen": -118.88551330566406, "logps/rejected": -204.05825805664062, "loss": 0.7091, "rewards/accuracies": 0.375, "rewards/chosen": -0.006563140079379082, "rewards/margins": -0.029090553522109985, "rewards/rejected": 0.022527409717440605, "step": 92 }, { "epoch": 0.093, "grad_norm": 0.8907932639122009, "learning_rate": 4.65e-07, "logits/chosen": 0.5321849584579468, "logits/rejected": 0.48178738355636597, "logps/chosen": -230.42710876464844, "logps/rejected": -156.0656280517578, "loss": 0.7091, "rewards/accuracies": 0.375, "rewards/chosen": 0.0109879020601511, "rewards/margins": -0.03046889416873455, "rewards/rejected": 0.04145679622888565, "step": 93 }, { "epoch": 0.094, "grad_norm": 1.0807218551635742, "learning_rate": 4.6999999999999995e-07, "logits/chosen": 0.40838587284088135, "logits/rejected": 0.6667180061340332, "logps/chosen": -188.59750366210938, "logps/rejected": -241.0636749267578, "loss": 0.6972, "rewards/accuracies": 0.625, "rewards/chosen": -0.015648746863007545, "rewards/margins": -0.004806614480912685, "rewards/rejected": -0.010842131450772285, "step": 94 }, { "epoch": 0.095, "grad_norm": 1.445617437362671, "learning_rate": 4.7499999999999995e-07, "logits/chosen": 0.11437070369720459, "logits/rejected": 1.2057455778121948, "logps/chosen": -113.12872314453125, "logps/rejected": -259.61212158203125, "loss": 0.6837, "rewards/accuracies": 0.5, "rewards/chosen": -0.02416076697409153, "rewards/margins": 0.021562766283750534, "rewards/rejected": -0.04572353512048721, "step": 95 }, { "epoch": 0.096, "grad_norm": 0.8657000660896301, "learning_rate": 4.8e-07, "logits/chosen": 0.1491718590259552, "logits/rejected": 0.2450392097234726, "logps/chosen": -166.14593505859375, "logps/rejected": -239.87307739257812, "loss": 0.7031, "rewards/accuracies": 0.375, "rewards/chosen": -0.013789178803563118, "rewards/margins": -0.01709900051355362, "rewards/rejected": 0.0033098217099905014, "step": 96 }, { "epoch": 0.097, "grad_norm": 1.1755341291427612, "learning_rate": 4.85e-07, "logits/chosen": 0.3385622501373291, "logits/rejected": 0.8846427798271179, "logps/chosen": -204.02464294433594, "logps/rejected": -222.078125, "loss": 0.6945, "rewards/accuracies": 0.625, "rewards/chosen": 0.04088611900806427, "rewards/margins": -0.0004428839311003685, "rewards/rejected": 0.04132900387048721, "step": 97 }, { "epoch": 0.098, "grad_norm": 0.8654389977455139, "learning_rate": 4.9e-07, "logits/chosen": 0.4623313546180725, "logits/rejected": 0.6918434500694275, "logps/chosen": -253.07766723632812, "logps/rejected": -212.91836547851562, "loss": 0.7284, "rewards/accuracies": 0.375, "rewards/chosen": -0.050318051129579544, "rewards/margins": -0.06607776135206223, "rewards/rejected": 0.015759708359837532, "step": 98 }, { "epoch": 0.099, "grad_norm": 1.021981120109558, "learning_rate": 4.95e-07, "logits/chosen": 0.8151800632476807, "logits/rejected": 0.8510885238647461, "logps/chosen": -240.58255004882812, "logps/rejected": -277.4241027832031, "loss": 0.7416, "rewards/accuracies": 0.25, "rewards/chosen": -0.05477771908044815, "rewards/margins": -0.09312572330236435, "rewards/rejected": 0.0383480079472065, "step": 99 }, { "epoch": 0.1, "grad_norm": 1.017086386680603, "learning_rate": 5e-07, "logits/chosen": 1.1118158102035522, "logits/rejected": 0.4660806655883789, "logps/chosen": -264.56939697265625, "logps/rejected": -165.7677001953125, "loss": 0.687, "rewards/accuracies": 0.5, "rewards/chosen": 0.029677625745534897, "rewards/margins": 0.017164945602416992, "rewards/rejected": 0.012512682937085629, "step": 100 }, { "epoch": 0.101, "grad_norm": 1.0333914756774902, "learning_rate": 5.049999999999999e-07, "logits/chosen": 0.08982948213815689, "logits/rejected": 1.3492913246154785, "logps/chosen": -139.569091796875, "logps/rejected": -342.845947265625, "loss": 0.6782, "rewards/accuracies": 0.875, "rewards/chosen": 0.025769757106900215, "rewards/margins": 0.031024884432554245, "rewards/rejected": -0.0052551282569766045, "step": 101 }, { "epoch": 0.102, "grad_norm": 0.8848878741264343, "learning_rate": 5.1e-07, "logits/chosen": 0.3853943645954132, "logits/rejected": 0.6572705507278442, "logps/chosen": -285.064697265625, "logps/rejected": -185.22250366210938, "loss": 0.6815, "rewards/accuracies": 0.625, "rewards/chosen": 0.034216880798339844, "rewards/margins": 0.025764085352420807, "rewards/rejected": 0.008452797308564186, "step": 102 }, { "epoch": 0.103, "grad_norm": 1.33487868309021, "learning_rate": 5.149999999999999e-07, "logits/chosen": -0.2976032793521881, "logits/rejected": 0.6363657116889954, "logps/chosen": -128.2157745361328, "logps/rejected": -222.9949188232422, "loss": 0.6873, "rewards/accuracies": 0.375, "rewards/chosen": -0.0072755818255245686, "rewards/margins": 0.012992476113140583, "rewards/rejected": -0.020268060266971588, "step": 103 }, { "epoch": 0.104, "grad_norm": 0.998635470867157, "learning_rate": 5.2e-07, "logits/chosen": 0.5922977924346924, "logits/rejected": 0.11661890149116516, "logps/chosen": -178.92227172851562, "logps/rejected": -151.0570068359375, "loss": 0.6784, "rewards/accuracies": 0.75, "rewards/chosen": 0.019972514361143112, "rewards/margins": 0.0316309928894043, "rewards/rejected": -0.011658478528261185, "step": 104 }, { "epoch": 0.105, "grad_norm": 0.8992610573768616, "learning_rate": 5.25e-07, "logits/chosen": 0.29861554503440857, "logits/rejected": 0.37202876806259155, "logps/chosen": -196.61843872070312, "logps/rejected": -182.96153259277344, "loss": 0.7324, "rewards/accuracies": 0.25, "rewards/chosen": -0.022155094891786575, "rewards/margins": -0.07474155724048615, "rewards/rejected": 0.05258645862340927, "step": 105 }, { "epoch": 0.106, "grad_norm": 0.809232771396637, "learning_rate": 5.3e-07, "logits/chosen": 0.7163326740264893, "logits/rejected": 0.3950262665748596, "logps/chosen": -290.20672607421875, "logps/rejected": -151.91714477539062, "loss": 0.6897, "rewards/accuracies": 0.375, "rewards/chosen": -0.010413741692900658, "rewards/margins": 0.007892276160418987, "rewards/rejected": -0.01830601692199707, "step": 106 }, { "epoch": 0.107, "grad_norm": 1.0943409204483032, "learning_rate": 5.35e-07, "logits/chosen": 0.423749715089798, "logits/rejected": 0.33748292922973633, "logps/chosen": -200.27076721191406, "logps/rejected": -244.25540161132812, "loss": 0.6817, "rewards/accuracies": 0.625, "rewards/chosen": 0.0017691589891910553, "rewards/margins": 0.03150835633277893, "rewards/rejected": -0.02973918803036213, "step": 107 }, { "epoch": 0.108, "grad_norm": 0.8696032762527466, "learning_rate": 5.4e-07, "logits/chosen": 0.6948135495185852, "logits/rejected": 1.0234471559524536, "logps/chosen": -207.39718627929688, "logps/rejected": -176.31105041503906, "loss": 0.6996, "rewards/accuracies": 0.375, "rewards/chosen": -0.020975209772586823, "rewards/margins": -0.011292743496596813, "rewards/rejected": -0.00968246627599001, "step": 108 }, { "epoch": 0.109, "grad_norm": 1.0800658464431763, "learning_rate": 5.45e-07, "logits/chosen": 0.5257311463356018, "logits/rejected": 0.4685211181640625, "logps/chosen": -178.61416625976562, "logps/rejected": -238.67237854003906, "loss": 0.7038, "rewards/accuracies": 0.375, "rewards/chosen": -0.02256030961871147, "rewards/margins": -0.01892423816025257, "rewards/rejected": -0.0036360733211040497, "step": 109 }, { "epoch": 0.11, "grad_norm": 0.9364849925041199, "learning_rate": 5.5e-07, "logits/chosen": 0.4155673086643219, "logits/rejected": 0.34755435585975647, "logps/chosen": -259.1890869140625, "logps/rejected": -173.95712280273438, "loss": 0.6956, "rewards/accuracies": 0.625, "rewards/chosen": -0.04266338422894478, "rewards/margins": -0.0020116791129112244, "rewards/rejected": -0.040651701390743256, "step": 110 }, { "epoch": 0.111, "grad_norm": 1.2157273292541504, "learning_rate": 5.55e-07, "logits/chosen": 0.24761734902858734, "logits/rejected": 0.47557708621025085, "logps/chosen": -274.2572021484375, "logps/rejected": -242.95870971679688, "loss": 0.6745, "rewards/accuracies": 0.75, "rewards/chosen": 0.012380219995975494, "rewards/margins": 0.040896326303482056, "rewards/rejected": -0.028516102582216263, "step": 111 }, { "epoch": 0.112, "grad_norm": 0.9171537160873413, "learning_rate": 5.6e-07, "logits/chosen": 1.3327269554138184, "logits/rejected": 0.4503490924835205, "logps/chosen": -344.3856201171875, "logps/rejected": -228.86322021484375, "loss": 0.71, "rewards/accuracies": 0.375, "rewards/chosen": -0.019307708367705345, "rewards/margins": -0.0310837272554636, "rewards/rejected": 0.011776017025113106, "step": 112 }, { "epoch": 0.113, "grad_norm": 0.9926645159721375, "learning_rate": 5.649999999999999e-07, "logits/chosen": 0.5794891119003296, "logits/rejected": 0.7736021876335144, "logps/chosen": -303.1412353515625, "logps/rejected": -174.86236572265625, "loss": 0.7089, "rewards/accuracies": 0.375, "rewards/chosen": 0.011075115762650967, "rewards/margins": -0.030097579583525658, "rewards/rejected": 0.04117269814014435, "step": 113 }, { "epoch": 0.114, "grad_norm": 1.450002908706665, "learning_rate": 5.699999999999999e-07, "logits/chosen": 0.9836143255233765, "logits/rejected": -0.16348722577095032, "logps/chosen": -319.823486328125, "logps/rejected": -135.35154724121094, "loss": 0.7191, "rewards/accuracies": 0.25, "rewards/chosen": -0.04064255207777023, "rewards/margins": -0.048138052225112915, "rewards/rejected": 0.007495499216020107, "step": 114 }, { "epoch": 0.115, "grad_norm": 1.058058500289917, "learning_rate": 5.749999999999999e-07, "logits/chosen": 0.6679388284683228, "logits/rejected": 0.32989150285720825, "logps/chosen": -354.42041015625, "logps/rejected": -176.5383758544922, "loss": 0.6966, "rewards/accuracies": 0.25, "rewards/chosen": 0.008972642943263054, "rewards/margins": -0.003351498395204544, "rewards/rejected": 0.012324145063757896, "step": 115 }, { "epoch": 0.116, "grad_norm": 1.0850225687026978, "learning_rate": 5.8e-07, "logits/chosen": 0.8339398503303528, "logits/rejected": 0.4970490634441376, "logps/chosen": -245.4641571044922, "logps/rejected": -191.6376190185547, "loss": 0.7083, "rewards/accuracies": 0.25, "rewards/chosen": 0.018229199573397636, "rewards/margins": -0.027737285941839218, "rewards/rejected": 0.045966483652591705, "step": 116 }, { "epoch": 0.117, "grad_norm": 0.9728962779045105, "learning_rate": 5.849999999999999e-07, "logits/chosen": -0.2677839398384094, "logits/rejected": 1.057948112487793, "logps/chosen": -104.23141479492188, "logps/rejected": -224.95864868164062, "loss": 0.7061, "rewards/accuracies": 0.375, "rewards/chosen": -0.02010932005941868, "rewards/margins": -0.025182582437992096, "rewards/rejected": 0.00507326191291213, "step": 117 }, { "epoch": 0.118, "grad_norm": 1.0837849378585815, "learning_rate": 5.9e-07, "logits/chosen": 0.9560957551002502, "logits/rejected": 0.32683080434799194, "logps/chosen": -309.875732421875, "logps/rejected": -167.523193359375, "loss": 0.6726, "rewards/accuracies": 0.75, "rewards/chosen": 0.024574756622314453, "rewards/margins": 0.04339180141687393, "rewards/rejected": -0.01881704479455948, "step": 118 }, { "epoch": 0.119, "grad_norm": 1.011488914489746, "learning_rate": 5.949999999999999e-07, "logits/chosen": 0.7357783913612366, "logits/rejected": 0.6012024879455566, "logps/chosen": -191.5654296875, "logps/rejected": -203.54830932617188, "loss": 0.6813, "rewards/accuracies": 0.625, "rewards/chosen": 0.015390872955322266, "rewards/margins": 0.025005195289850235, "rewards/rejected": -0.009614325128495693, "step": 119 }, { "epoch": 0.12, "grad_norm": 0.9445905685424805, "learning_rate": 6e-07, "logits/chosen": 0.2877943515777588, "logits/rejected": 0.9384462833404541, "logps/chosen": -167.73745727539062, "logps/rejected": -224.75303649902344, "loss": 0.6878, "rewards/accuracies": 0.625, "rewards/chosen": 0.021228980273008347, "rewards/margins": 0.012193869799375534, "rewards/rejected": 0.009035110473632812, "step": 120 }, { "epoch": 0.121, "grad_norm": 1.1918119192123413, "learning_rate": 6.049999999999999e-07, "logits/chosen": 0.4725179970264435, "logits/rejected": 0.7480962872505188, "logps/chosen": -160.33602905273438, "logps/rejected": -231.2279510498047, "loss": 0.703, "rewards/accuracies": 0.5, "rewards/chosen": 0.004519651643931866, "rewards/margins": -0.017488859593868256, "rewards/rejected": 0.022008515894412994, "step": 121 }, { "epoch": 0.122, "grad_norm": 1.0572704076766968, "learning_rate": 6.1e-07, "logits/chosen": 0.32970476150512695, "logits/rejected": 0.5079172849655151, "logps/chosen": -172.5562744140625, "logps/rejected": -213.30758666992188, "loss": 0.6783, "rewards/accuracies": 0.625, "rewards/chosen": 0.022168539464473724, "rewards/margins": 0.0332636833190918, "rewards/rejected": -0.011095141991972923, "step": 122 }, { "epoch": 0.123, "grad_norm": 1.4392750263214111, "learning_rate": 6.149999999999999e-07, "logits/chosen": -0.13772737979888916, "logits/rejected": 0.530848503112793, "logps/chosen": -143.97140502929688, "logps/rejected": -234.4232177734375, "loss": 0.7261, "rewards/accuracies": 0.125, "rewards/chosen": -0.04729442670941353, "rewards/margins": -0.06337080150842667, "rewards/rejected": 0.016076374799013138, "step": 123 }, { "epoch": 0.124, "grad_norm": 1.010684847831726, "learning_rate": 6.2e-07, "logits/chosen": 0.382506400346756, "logits/rejected": 0.4541802406311035, "logps/chosen": -129.94924926757812, "logps/rejected": -205.95388793945312, "loss": 0.6995, "rewards/accuracies": 0.375, "rewards/chosen": -0.020525455474853516, "rewards/margins": -0.010506153106689453, "rewards/rejected": -0.010019302368164062, "step": 124 }, { "epoch": 0.125, "grad_norm": 1.0286598205566406, "learning_rate": 6.249999999999999e-07, "logits/chosen": 0.5236959457397461, "logits/rejected": 0.4517212212085724, "logps/chosen": -221.98275756835938, "logps/rejected": -177.97232055664062, "loss": 0.7148, "rewards/accuracies": 0.125, "rewards/chosen": -0.05525369942188263, "rewards/margins": -0.04194803535938263, "rewards/rejected": -0.0133056640625, "step": 125 }, { "epoch": 0.126, "grad_norm": 0.9430485367774963, "learning_rate": 6.3e-07, "logits/chosen": 1.1879680156707764, "logits/rejected": 1.1707652807235718, "logps/chosen": -220.36453247070312, "logps/rejected": -216.7008819580078, "loss": 0.6794, "rewards/accuracies": 0.75, "rewards/chosen": 0.003734302707016468, "rewards/margins": 0.0288148894906044, "rewards/rejected": -0.025080587714910507, "step": 126 }, { "epoch": 0.127, "grad_norm": 1.053552508354187, "learning_rate": 6.35e-07, "logits/chosen": 0.7519561648368835, "logits/rejected": 0.8199808597564697, "logps/chosen": -264.63739013671875, "logps/rejected": -267.1257019042969, "loss": 0.6655, "rewards/accuracies": 0.75, "rewards/chosen": 0.03707084432244301, "rewards/margins": 0.057557202875614166, "rewards/rejected": -0.02048635669052601, "step": 127 }, { "epoch": 0.128, "grad_norm": 0.971837043762207, "learning_rate": 6.4e-07, "logits/chosen": 0.43763959407806396, "logits/rejected": 0.5910612344741821, "logps/chosen": -198.32760620117188, "logps/rejected": -189.07745361328125, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": 0.018829919397830963, "rewards/margins": 0.001199435442686081, "rewards/rejected": 0.017630483955144882, "step": 128 }, { "epoch": 0.129, "grad_norm": 1.2713701725006104, "learning_rate": 6.45e-07, "logits/chosen": -0.48176246881484985, "logits/rejected": 0.8667222261428833, "logps/chosen": -129.19482421875, "logps/rejected": -203.53680419921875, "loss": 0.7024, "rewards/accuracies": 0.375, "rewards/chosen": 0.023796558380126953, "rewards/margins": -0.016008852049708366, "rewards/rejected": 0.03980541229248047, "step": 129 }, { "epoch": 0.13, "grad_norm": 1.109737515449524, "learning_rate": 6.5e-07, "logits/chosen": 0.20734266936779022, "logits/rejected": 0.30975568294525146, "logps/chosen": -230.73191833496094, "logps/rejected": -161.6073760986328, "loss": 0.6874, "rewards/accuracies": 0.75, "rewards/chosen": -0.007894659414887428, "rewards/margins": 0.012976123951375484, "rewards/rejected": -0.020870782434940338, "step": 130 }, { "epoch": 0.131, "grad_norm": 0.8145519495010376, "learning_rate": 6.55e-07, "logits/chosen": 0.6864017844200134, "logits/rejected": 0.4194210469722748, "logps/chosen": -242.90528869628906, "logps/rejected": -156.98963928222656, "loss": 0.6745, "rewards/accuracies": 0.875, "rewards/chosen": 0.046404264867305756, "rewards/margins": 0.03961954265832901, "rewards/rejected": 0.006784726399928331, "step": 131 }, { "epoch": 0.132, "grad_norm": 0.8700035214424133, "learning_rate": 6.6e-07, "logits/chosen": 0.3489328622817993, "logits/rejected": 0.23755095899105072, "logps/chosen": -231.43399047851562, "logps/rejected": -170.736328125, "loss": 0.6836, "rewards/accuracies": 0.25, "rewards/chosen": -0.00555572472512722, "rewards/margins": 0.02246132120490074, "rewards/rejected": -0.02801704593002796, "step": 132 }, { "epoch": 0.133, "grad_norm": 1.2265843152999878, "learning_rate": 6.65e-07, "logits/chosen": 0.7562121748924255, "logits/rejected": 0.11694112420082092, "logps/chosen": -213.9826202392578, "logps/rejected": -197.51837158203125, "loss": 0.6654, "rewards/accuracies": 0.75, "rewards/chosen": 0.04471459612250328, "rewards/margins": 0.05792651325464249, "rewards/rejected": -0.013211918994784355, "step": 133 }, { "epoch": 0.134, "grad_norm": 1.1396231651306152, "learning_rate": 6.7e-07, "logits/chosen": 0.5555574893951416, "logits/rejected": 0.7132057547569275, "logps/chosen": -147.33001708984375, "logps/rejected": -196.97348022460938, "loss": 0.6518, "rewards/accuracies": 0.875, "rewards/chosen": 0.04772786796092987, "rewards/margins": 0.08730573952198029, "rewards/rejected": -0.03957786783576012, "step": 134 }, { "epoch": 0.135, "grad_norm": 1.009803056716919, "learning_rate": 6.75e-07, "logits/chosen": 0.5590571165084839, "logits/rejected": 0.5139557123184204, "logps/chosen": -273.2331848144531, "logps/rejected": -171.616943359375, "loss": 0.6637, "rewards/accuracies": 0.625, "rewards/chosen": 0.04860858619213104, "rewards/margins": 0.06315740942955017, "rewards/rejected": -0.014548826962709427, "step": 135 }, { "epoch": 0.136, "grad_norm": 0.8746850490570068, "learning_rate": 6.800000000000001e-07, "logits/chosen": 0.9793382287025452, "logits/rejected": 0.5371599197387695, "logps/chosen": -323.06622314453125, "logps/rejected": -183.49713134765625, "loss": 0.6905, "rewards/accuracies": 0.375, "rewards/chosen": 0.04355049133300781, "rewards/margins": 0.006455517373979092, "rewards/rejected": 0.037094973027706146, "step": 136 }, { "epoch": 0.137, "grad_norm": 1.1849921941757202, "learning_rate": 6.85e-07, "logits/chosen": 0.40213772654533386, "logits/rejected": 0.33298811316490173, "logps/chosen": -195.01812744140625, "logps/rejected": -240.88584899902344, "loss": 0.7003, "rewards/accuracies": 0.375, "rewards/chosen": 0.021933652460575104, "rewards/margins": -0.011707019992172718, "rewards/rejected": 0.03364067152142525, "step": 137 }, { "epoch": 0.138, "grad_norm": 0.9966764450073242, "learning_rate": 6.9e-07, "logits/chosen": 0.24319538474082947, "logits/rejected": 0.9118220210075378, "logps/chosen": -172.19744873046875, "logps/rejected": -224.35337829589844, "loss": 0.7045, "rewards/accuracies": 0.375, "rewards/chosen": -0.021261025220155716, "rewards/margins": -0.019939517602324486, "rewards/rejected": -0.0013215085491538048, "step": 138 }, { "epoch": 0.139, "grad_norm": 0.8755557537078857, "learning_rate": 6.949999999999999e-07, "logits/chosen": 0.8747591972351074, "logits/rejected": 0.5649628043174744, "logps/chosen": -247.29843139648438, "logps/rejected": -151.66734313964844, "loss": 0.7056, "rewards/accuracies": 0.375, "rewards/chosen": -0.0188764575868845, "rewards/margins": -0.02184486575424671, "rewards/rejected": 0.0029684072360396385, "step": 139 }, { "epoch": 0.14, "grad_norm": 0.9589688777923584, "learning_rate": 7e-07, "logits/chosen": 0.6634988188743591, "logits/rejected": 0.7778658270835876, "logps/chosen": -226.35986328125, "logps/rejected": -165.52333068847656, "loss": 0.6968, "rewards/accuracies": 0.625, "rewards/chosen": -0.0073858266696333885, "rewards/margins": -0.004305646754801273, "rewards/rejected": -0.0030801785178482533, "step": 140 }, { "epoch": 0.141, "grad_norm": 0.9646795988082886, "learning_rate": 7.049999999999999e-07, "logits/chosen": 0.3188600242137909, "logits/rejected": 0.861352264881134, "logps/chosen": -149.6117401123047, "logps/rejected": -213.9823455810547, "loss": 0.7025, "rewards/accuracies": 0.375, "rewards/chosen": 0.003949498757719994, "rewards/margins": -0.014610150828957558, "rewards/rejected": 0.01855964958667755, "step": 141 }, { "epoch": 0.142, "grad_norm": 0.9634246826171875, "learning_rate": 7.1e-07, "logits/chosen": 0.955387532711029, "logits/rejected": 0.836426317691803, "logps/chosen": -212.35690307617188, "logps/rejected": -177.43043518066406, "loss": 0.6901, "rewards/accuracies": 0.5, "rewards/chosen": 0.006721876561641693, "rewards/margins": 0.013531493954360485, "rewards/rejected": -0.006809615064412355, "step": 142 }, { "epoch": 0.143, "grad_norm": 0.8865317106246948, "learning_rate": 7.149999999999999e-07, "logits/chosen": 0.543188214302063, "logits/rejected": 0.24458560347557068, "logps/chosen": -194.4044647216797, "logps/rejected": -211.82830810546875, "loss": 0.6982, "rewards/accuracies": 0.5, "rewards/chosen": -0.013395881280303001, "rewards/margins": -0.007937143556773663, "rewards/rejected": -0.005458737723529339, "step": 143 }, { "epoch": 0.144, "grad_norm": 0.8306776881217957, "learning_rate": 7.2e-07, "logits/chosen": 0.12755748629570007, "logits/rejected": 0.8491436243057251, "logps/chosen": -184.24923706054688, "logps/rejected": -239.02871704101562, "loss": 0.6612, "rewards/accuracies": 0.75, "rewards/chosen": 0.03551516681909561, "rewards/margins": 0.06688075512647629, "rewards/rejected": -0.03136558458209038, "step": 144 }, { "epoch": 0.145, "grad_norm": 1.0331743955612183, "learning_rate": 7.249999999999999e-07, "logits/chosen": 0.3051682114601135, "logits/rejected": 0.46885356307029724, "logps/chosen": -262.9817810058594, "logps/rejected": -239.87974548339844, "loss": 0.6986, "rewards/accuracies": 0.25, "rewards/chosen": 0.003522586077451706, "rewards/margins": -0.009735395200550556, "rewards/rejected": 0.013257981278002262, "step": 145 }, { "epoch": 0.146, "grad_norm": 1.5334570407867432, "learning_rate": 7.3e-07, "logits/chosen": 0.3211784362792969, "logits/rejected": 0.3193144202232361, "logps/chosen": -285.0498962402344, "logps/rejected": -142.99365234375, "loss": 0.7261, "rewards/accuracies": 0.25, "rewards/chosen": -0.03148975223302841, "rewards/margins": -0.06328096985816956, "rewards/rejected": 0.03179121017456055, "step": 146 }, { "epoch": 0.147, "grad_norm": 1.461521029472351, "learning_rate": 7.35e-07, "logits/chosen": 0.18960724771022797, "logits/rejected": 0.5652821660041809, "logps/chosen": -160.46441650390625, "logps/rejected": -286.67755126953125, "loss": 0.6934, "rewards/accuracies": 0.5, "rewards/chosen": 0.029915999621152878, "rewards/margins": 0.0011822674423456192, "rewards/rejected": 0.02873373031616211, "step": 147 }, { "epoch": 0.148, "grad_norm": 1.1123545169830322, "learning_rate": 7.4e-07, "logits/chosen": 0.7872523069381714, "logits/rejected": 0.15430155396461487, "logps/chosen": -346.59375, "logps/rejected": -149.9037628173828, "loss": 0.6786, "rewards/accuracies": 0.75, "rewards/chosen": 0.013402367010712624, "rewards/margins": 0.03125810623168945, "rewards/rejected": -0.01785573922097683, "step": 148 }, { "epoch": 0.149, "grad_norm": 0.9192137122154236, "learning_rate": 7.45e-07, "logits/chosen": 0.6728577017784119, "logits/rejected": -0.03180414438247681, "logps/chosen": -224.44851684570312, "logps/rejected": -143.09951782226562, "loss": 0.7006, "rewards/accuracies": 0.5, "rewards/chosen": -0.01320800930261612, "rewards/margins": -0.013856791891157627, "rewards/rejected": 0.0006487850332632661, "step": 149 }, { "epoch": 0.15, "grad_norm": 1.3383042812347412, "learning_rate": 7.5e-07, "logits/chosen": 0.20958293974399567, "logits/rejected": 0.46691370010375977, "logps/chosen": -106.15850830078125, "logps/rejected": -174.1712188720703, "loss": 0.6597, "rewards/accuracies": 0.75, "rewards/chosen": 0.04640550911426544, "rewards/margins": 0.06993703544139862, "rewards/rejected": -0.023531531915068626, "step": 150 }, { "epoch": 0.151, "grad_norm": 0.8555638790130615, "learning_rate": 7.55e-07, "logits/chosen": 0.35093656182289124, "logits/rejected": 0.483124703168869, "logps/chosen": -189.566650390625, "logps/rejected": -181.3053436279297, "loss": 0.6581, "rewards/accuracies": 0.875, "rewards/chosen": 0.05794115364551544, "rewards/margins": 0.07316217571496964, "rewards/rejected": -0.015221023932099342, "step": 151 }, { "epoch": 0.152, "grad_norm": 1.0777040719985962, "learning_rate": 7.599999999999999e-07, "logits/chosen": -0.0464349165558815, "logits/rejected": 0.4547436535358429, "logps/chosen": -197.500244140625, "logps/rejected": -197.3854522705078, "loss": 0.7138, "rewards/accuracies": 0.25, "rewards/chosen": -0.06224489584565163, "rewards/margins": -0.040166761726140976, "rewards/rejected": -0.02207813411951065, "step": 152 }, { "epoch": 0.153, "grad_norm": 0.9965372681617737, "learning_rate": 7.65e-07, "logits/chosen": 0.39982521533966064, "logits/rejected": 0.7609515190124512, "logps/chosen": -193.66229248046875, "logps/rejected": -244.82290649414062, "loss": 0.7225, "rewards/accuracies": 0.25, "rewards/chosen": -0.019975949078798294, "rewards/margins": -0.0549190528690815, "rewards/rejected": 0.0349431037902832, "step": 153 }, { "epoch": 0.154, "grad_norm": 0.936324417591095, "learning_rate": 7.699999999999999e-07, "logits/chosen": 0.3867191672325134, "logits/rejected": 0.07888787984848022, "logps/chosen": -220.97369384765625, "logps/rejected": -155.41775512695312, "loss": 0.6874, "rewards/accuracies": 0.5, "rewards/chosen": -0.0066855428740382195, "rewards/margins": 0.013088367879390717, "rewards/rejected": -0.01977391354739666, "step": 154 }, { "epoch": 0.155, "grad_norm": 0.8299586772918701, "learning_rate": 7.75e-07, "logits/chosen": 0.717190146446228, "logits/rejected": 0.4002940058708191, "logps/chosen": -217.5019989013672, "logps/rejected": -206.2070770263672, "loss": 0.6691, "rewards/accuracies": 0.625, "rewards/chosen": 0.025737475603818893, "rewards/margins": 0.05082841217517853, "rewards/rejected": -0.025090932846069336, "step": 155 }, { "epoch": 0.156, "grad_norm": 1.0134903192520142, "learning_rate": 7.799999999999999e-07, "logits/chosen": 0.127466082572937, "logits/rejected": 0.6504237055778503, "logps/chosen": -150.47634887695312, "logps/rejected": -190.0186767578125, "loss": 0.6837, "rewards/accuracies": 0.625, "rewards/chosen": 0.016997909173369408, "rewards/margins": 0.020297527313232422, "rewards/rejected": -0.0032996172085404396, "step": 156 }, { "epoch": 0.157, "grad_norm": 0.8928995132446289, "learning_rate": 7.85e-07, "logits/chosen": 0.9949202537536621, "logits/rejected": 0.8309842348098755, "logps/chosen": -268.4368591308594, "logps/rejected": -176.90997314453125, "loss": 0.6916, "rewards/accuracies": 0.25, "rewards/chosen": 0.03707695007324219, "rewards/margins": 0.006309223361313343, "rewards/rejected": 0.03076772764325142, "step": 157 }, { "epoch": 0.158, "grad_norm": 1.531850814819336, "learning_rate": 7.9e-07, "logits/chosen": 0.8315258622169495, "logits/rejected": 0.4454377591609955, "logps/chosen": -318.953857421875, "logps/rejected": -154.406494140625, "loss": 0.7193, "rewards/accuracies": 0.375, "rewards/chosen": -0.0018948556389659643, "rewards/margins": -0.049708180129528046, "rewards/rejected": 0.04781332239508629, "step": 158 }, { "epoch": 0.159, "grad_norm": 1.0606074333190918, "learning_rate": 7.95e-07, "logits/chosen": 0.7620002031326294, "logits/rejected": 0.17521271109580994, "logps/chosen": -293.7979431152344, "logps/rejected": -155.35354614257812, "loss": 0.668, "rewards/accuracies": 0.75, "rewards/chosen": 0.015316581353545189, "rewards/margins": 0.05418586730957031, "rewards/rejected": -0.03886928781867027, "step": 159 }, { "epoch": 0.16, "grad_norm": 1.2387757301330566, "learning_rate": 8e-07, "logits/chosen": 0.2601490914821625, "logits/rejected": 0.6928614974021912, "logps/chosen": -194.49179077148438, "logps/rejected": -227.52413940429688, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": -0.005357552319765091, "rewards/margins": 0.0037741661071777344, "rewards/rejected": -0.009131718426942825, "step": 160 }, { "epoch": 0.161, "grad_norm": 1.0659303665161133, "learning_rate": 8.05e-07, "logits/chosen": 1.1569201946258545, "logits/rejected": 0.5065603256225586, "logps/chosen": -348.809326171875, "logps/rejected": -196.37237548828125, "loss": 0.6811, "rewards/accuracies": 0.625, "rewards/chosen": -0.018807413056492805, "rewards/margins": 0.028777126222848892, "rewards/rejected": -0.04758453741669655, "step": 161 }, { "epoch": 0.162, "grad_norm": 1.0257278680801392, "learning_rate": 8.1e-07, "logits/chosen": 0.2728986144065857, "logits/rejected": 0.7364625334739685, "logps/chosen": -182.3512420654297, "logps/rejected": -254.52159118652344, "loss": 0.6836, "rewards/accuracies": 0.75, "rewards/chosen": -0.017744731158018112, "rewards/margins": 0.022591017186641693, "rewards/rejected": -0.040335752069950104, "step": 162 }, { "epoch": 0.163, "grad_norm": 1.0713056325912476, "learning_rate": 8.149999999999999e-07, "logits/chosen": 0.514846682548523, "logits/rejected": 0.35681211948394775, "logps/chosen": -222.89126586914062, "logps/rejected": -175.93251037597656, "loss": 0.6937, "rewards/accuracies": 0.625, "rewards/chosen": -0.012969971634447575, "rewards/margins": 0.00042438507080078125, "rewards/rejected": -0.013394355773925781, "step": 163 }, { "epoch": 0.164, "grad_norm": 1.3558681011199951, "learning_rate": 8.199999999999999e-07, "logits/chosen": 0.8242709040641785, "logits/rejected": 0.2723900079727173, "logps/chosen": -286.2349853515625, "logps/rejected": -139.5950164794922, "loss": 0.6997, "rewards/accuracies": 0.375, "rewards/chosen": -0.005196094512939453, "rewards/margins": -0.010555364191532135, "rewards/rejected": 0.005359269678592682, "step": 164 }, { "epoch": 0.165, "grad_norm": 0.9459619522094727, "learning_rate": 8.249999999999999e-07, "logits/chosen": 0.5785090923309326, "logits/rejected": 0.32478243112564087, "logps/chosen": -169.0185546875, "logps/rejected": -180.64381408691406, "loss": 0.7141, "rewards/accuracies": 0.25, "rewards/chosen": -0.05807190015912056, "rewards/margins": -0.03881492465734482, "rewards/rejected": -0.01925697550177574, "step": 165 }, { "epoch": 0.166, "grad_norm": 1.0612988471984863, "learning_rate": 8.299999999999999e-07, "logits/chosen": 0.3929353356361389, "logits/rejected": 0.05629369616508484, "logps/chosen": -199.333740234375, "logps/rejected": -258.7743225097656, "loss": 0.6835, "rewards/accuracies": 0.625, "rewards/chosen": -0.010769844055175781, "rewards/margins": 0.02159137651324272, "rewards/rejected": -0.0323612205684185, "step": 166 }, { "epoch": 0.167, "grad_norm": 0.9420310258865356, "learning_rate": 8.349999999999999e-07, "logits/chosen": 0.7154072523117065, "logits/rejected": 0.2673560380935669, "logps/chosen": -259.135498046875, "logps/rejected": -138.47569274902344, "loss": 0.6768, "rewards/accuracies": 0.75, "rewards/chosen": 0.008481788448989391, "rewards/margins": 0.03437738120555878, "rewards/rejected": -0.025895599275827408, "step": 167 }, { "epoch": 0.168, "grad_norm": 1.0821714401245117, "learning_rate": 8.399999999999999e-07, "logits/chosen": 0.5827202200889587, "logits/rejected": 0.5746755599975586, "logps/chosen": -216.3154754638672, "logps/rejected": -225.87599182128906, "loss": 0.6761, "rewards/accuracies": 0.5, "rewards/chosen": 0.034232139587402344, "rewards/margins": 0.03624601289629936, "rewards/rejected": -0.002013874240219593, "step": 168 }, { "epoch": 0.169, "grad_norm": 1.6189515590667725, "learning_rate": 8.45e-07, "logits/chosen": -0.06927088648080826, "logits/rejected": 0.6882977485656738, "logps/chosen": -124.29713439941406, "logps/rejected": -314.3223876953125, "loss": 0.6702, "rewards/accuracies": 0.625, "rewards/chosen": 0.045432187616825104, "rewards/margins": 0.048607539385557175, "rewards/rejected": -0.003175351768732071, "step": 169 }, { "epoch": 0.17, "grad_norm": 1.045383095741272, "learning_rate": 8.499999999999999e-07, "logits/chosen": 0.28434887528419495, "logits/rejected": 0.48736685514450073, "logps/chosen": -148.57244873046875, "logps/rejected": -166.12948608398438, "loss": 0.684, "rewards/accuracies": 0.75, "rewards/chosen": 0.01562042348086834, "rewards/margins": 0.01897134818136692, "rewards/rejected": -0.003350926097482443, "step": 170 }, { "epoch": 0.171, "grad_norm": 0.8353698253631592, "learning_rate": 8.55e-07, "logits/chosen": 0.08720003068447113, "logits/rejected": 0.6061348915100098, "logps/chosen": -166.6918487548828, "logps/rejected": -200.7294464111328, "loss": 0.6878, "rewards/accuracies": 0.5, "rewards/chosen": 0.023891925811767578, "rewards/margins": 0.011293984949588776, "rewards/rejected": 0.012597941793501377, "step": 171 }, { "epoch": 0.172, "grad_norm": 1.3302123546600342, "learning_rate": 8.599999999999999e-07, "logits/chosen": 0.253623366355896, "logits/rejected": 0.7165024280548096, "logps/chosen": -149.29356384277344, "logps/rejected": -172.12490844726562, "loss": 0.7023, "rewards/accuracies": 0.375, "rewards/chosen": -0.011370660737156868, "rewards/margins": -0.017051314935088158, "rewards/rejected": 0.005680656060576439, "step": 172 }, { "epoch": 0.173, "grad_norm": 1.3374916315078735, "learning_rate": 8.65e-07, "logits/chosen": 1.2052711248397827, "logits/rejected": 0.298039972782135, "logps/chosen": -354.31573486328125, "logps/rejected": -169.74932861328125, "loss": 0.7054, "rewards/accuracies": 0.375, "rewards/chosen": -0.027636529877781868, "rewards/margins": -0.02176361158490181, "rewards/rejected": -0.005872918292880058, "step": 173 }, { "epoch": 0.174, "grad_norm": 0.9911280274391174, "learning_rate": 8.699999999999999e-07, "logits/chosen": 0.5849171280860901, "logits/rejected": 0.9172890782356262, "logps/chosen": -144.2360076904297, "logps/rejected": -188.0074005126953, "loss": 0.6853, "rewards/accuracies": 0.5, "rewards/chosen": 0.037108518183231354, "rewards/margins": 0.01706714555621147, "rewards/rejected": 0.020041370764374733, "step": 174 }, { "epoch": 0.175, "grad_norm": 0.8901740312576294, "learning_rate": 8.75e-07, "logits/chosen": 0.8949350714683533, "logits/rejected": 0.003621704876422882, "logps/chosen": -236.73794555664062, "logps/rejected": -109.05380249023438, "loss": 0.6742, "rewards/accuracies": 0.625, "rewards/chosen": 0.06252841651439667, "rewards/margins": 0.04044008255004883, "rewards/rejected": 0.022088337689638138, "step": 175 }, { "epoch": 0.176, "grad_norm": 0.8345538377761841, "learning_rate": 8.799999999999999e-07, "logits/chosen": 0.7453010678291321, "logits/rejected": 0.5548909306526184, "logps/chosen": -165.50222778320312, "logps/rejected": -253.7607879638672, "loss": 0.6798, "rewards/accuracies": 0.75, "rewards/chosen": -0.023292064666748047, "rewards/margins": 0.029611684381961823, "rewards/rejected": -0.05290374904870987, "step": 176 }, { "epoch": 0.177, "grad_norm": 1.3513246774673462, "learning_rate": 8.85e-07, "logits/chosen": 0.8163166642189026, "logits/rejected": -0.08251897990703583, "logps/chosen": -304.60174560546875, "logps/rejected": -145.28599548339844, "loss": 0.6956, "rewards/accuracies": 0.5, "rewards/chosen": 0.017812158912420273, "rewards/margins": -0.0035993559285998344, "rewards/rejected": 0.021411515772342682, "step": 177 }, { "epoch": 0.178, "grad_norm": 0.9090297222137451, "learning_rate": 8.9e-07, "logits/chosen": 0.3133397102355957, "logits/rejected": 0.24339300394058228, "logps/chosen": -169.86593627929688, "logps/rejected": -194.11746215820312, "loss": 0.6967, "rewards/accuracies": 0.625, "rewards/chosen": -0.005936385132372379, "rewards/margins": -0.0059586986899375916, "rewards/rejected": 2.2314488887786865e-05, "step": 178 }, { "epoch": 0.179, "grad_norm": 0.9260727763175964, "learning_rate": 8.95e-07, "logits/chosen": 0.9457390904426575, "logits/rejected": 0.6680195331573486, "logps/chosen": -342.3825988769531, "logps/rejected": -159.33193969726562, "loss": 0.6911, "rewards/accuracies": 0.625, "rewards/chosen": 0.028945161029696465, "rewards/margins": 0.004976797848939896, "rewards/rejected": 0.023968365043401718, "step": 179 }, { "epoch": 0.18, "grad_norm": 0.9848647713661194, "learning_rate": 9e-07, "logits/chosen": 0.36396482586860657, "logits/rejected": 0.5037656426429749, "logps/chosen": -197.41015625, "logps/rejected": -197.99839782714844, "loss": 0.6848, "rewards/accuracies": 0.5, "rewards/chosen": -0.002166557125747204, "rewards/margins": 0.018871307373046875, "rewards/rejected": -0.021037865430116653, "step": 180 }, { "epoch": 0.181, "grad_norm": 1.175249457359314, "learning_rate": 9.05e-07, "logits/chosen": 1.0047409534454346, "logits/rejected": 0.16827890276908875, "logps/chosen": -287.83148193359375, "logps/rejected": -187.6125946044922, "loss": 0.6915, "rewards/accuracies": 0.625, "rewards/chosen": 0.03237028047442436, "rewards/margins": 0.008857536129653454, "rewards/rejected": 0.023512747138738632, "step": 181 }, { "epoch": 0.182, "grad_norm": 1.0260051488876343, "learning_rate": 9.1e-07, "logits/chosen": 0.7021408677101135, "logits/rejected": 0.4536955952644348, "logps/chosen": -239.39678955078125, "logps/rejected": -180.57321166992188, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.015623379498720169, "rewards/margins": 0.003603646531701088, "rewards/rejected": 0.012019731104373932, "step": 182 }, { "epoch": 0.183, "grad_norm": 1.0881257057189941, "learning_rate": 9.15e-07, "logits/chosen": 0.15348845720291138, "logits/rejected": 0.36633655428886414, "logps/chosen": -151.44102478027344, "logps/rejected": -240.52354431152344, "loss": 0.7044, "rewards/accuracies": 0.5, "rewards/chosen": -0.013879682868719101, "rewards/margins": -0.020476438105106354, "rewards/rejected": 0.006596754305064678, "step": 183 }, { "epoch": 0.184, "grad_norm": 0.9365029335021973, "learning_rate": 9.2e-07, "logits/chosen": 0.16685308516025543, "logits/rejected": 0.6557785272598267, "logps/chosen": -186.08901977539062, "logps/rejected": -182.65711975097656, "loss": 0.6983, "rewards/accuracies": 0.5, "rewards/chosen": -0.022886421531438828, "rewards/margins": -0.008577490225434303, "rewards/rejected": -0.01430893037468195, "step": 184 }, { "epoch": 0.185, "grad_norm": 1.039925217628479, "learning_rate": 9.25e-07, "logits/chosen": 0.8909739255905151, "logits/rejected": 0.49718064069747925, "logps/chosen": -274.49908447265625, "logps/rejected": -186.63800048828125, "loss": 0.7102, "rewards/accuracies": 0.375, "rewards/chosen": -0.035229019820690155, "rewards/margins": -0.03264370188117027, "rewards/rejected": -0.0025853165425360203, "step": 185 }, { "epoch": 0.186, "grad_norm": 1.0284082889556885, "learning_rate": 9.3e-07, "logits/chosen": 0.9793567657470703, "logits/rejected": 0.7020518183708191, "logps/chosen": -286.49798583984375, "logps/rejected": -193.66941833496094, "loss": 0.6833, "rewards/accuracies": 0.625, "rewards/chosen": 0.017500493675470352, "rewards/margins": 0.02211027219891548, "rewards/rejected": -0.0046097757294774055, "step": 186 }, { "epoch": 0.187, "grad_norm": 1.1153188943862915, "learning_rate": 9.35e-07, "logits/chosen": 0.048031389713287354, "logits/rejected": 0.6719032526016235, "logps/chosen": -168.3256378173828, "logps/rejected": -224.977783203125, "loss": 0.7168, "rewards/accuracies": 0.25, "rewards/chosen": -0.02319316938519478, "rewards/margins": -0.0443146750330925, "rewards/rejected": 0.021121501922607422, "step": 187 }, { "epoch": 0.188, "grad_norm": 1.3024826049804688, "learning_rate": 9.399999999999999e-07, "logits/chosen": 0.43251264095306396, "logits/rejected": 0.7845343351364136, "logps/chosen": -160.12867736816406, "logps/rejected": -230.43130493164062, "loss": 0.6886, "rewards/accuracies": 0.75, "rewards/chosen": 0.004238605499267578, "rewards/margins": 0.013691708445549011, "rewards/rejected": -0.009453106671571732, "step": 188 }, { "epoch": 0.189, "grad_norm": 0.9745520949363708, "learning_rate": 9.45e-07, "logits/chosen": -0.1358582079410553, "logits/rejected": 0.6348815560340881, "logps/chosen": -110.15872192382812, "logps/rejected": -212.3846893310547, "loss": 0.6899, "rewards/accuracies": 0.5, "rewards/chosen": 0.0155480382964015, "rewards/margins": 0.008341120555996895, "rewards/rejected": 0.0072069186717271805, "step": 189 }, { "epoch": 0.19, "grad_norm": 0.8874483704566956, "learning_rate": 9.499999999999999e-07, "logits/chosen": 0.6723577976226807, "logits/rejected": 0.6026561260223389, "logps/chosen": -201.51266479492188, "logps/rejected": -193.02740478515625, "loss": 0.6871, "rewards/accuracies": 0.625, "rewards/chosen": 0.01966390572488308, "rewards/margins": 0.015453432686626911, "rewards/rejected": 0.004210471175611019, "step": 190 }, { "epoch": 0.191, "grad_norm": 1.2122604846954346, "learning_rate": 9.55e-07, "logits/chosen": 0.750452995300293, "logits/rejected": 0.42456087470054626, "logps/chosen": -227.32530212402344, "logps/rejected": -208.4167938232422, "loss": 0.6946, "rewards/accuracies": 0.625, "rewards/chosen": -0.027904320508241653, "rewards/margins": -0.000382901169359684, "rewards/rejected": -0.027521418407559395, "step": 191 }, { "epoch": 0.192, "grad_norm": 0.9997500777244568, "learning_rate": 9.6e-07, "logits/chosen": 0.2053053081035614, "logits/rejected": 0.18494416773319244, "logps/chosen": -199.85748291015625, "logps/rejected": -201.58763122558594, "loss": 0.7018, "rewards/accuracies": 0.5, "rewards/chosen": 0.015450002625584602, "rewards/margins": -0.014702319167554379, "rewards/rejected": 0.030152320861816406, "step": 192 }, { "epoch": 0.193, "grad_norm": 1.0208231210708618, "learning_rate": 9.649999999999999e-07, "logits/chosen": 0.7647454142570496, "logits/rejected": 0.4526996612548828, "logps/chosen": -228.3838653564453, "logps/rejected": -221.18441772460938, "loss": 0.7045, "rewards/accuracies": 0.25, "rewards/chosen": -0.01768026314675808, "rewards/margins": -0.020010851323604584, "rewards/rejected": 0.0023305879440158606, "step": 193 }, { "epoch": 0.194, "grad_norm": 0.9470121264457703, "learning_rate": 9.7e-07, "logits/chosen": 0.1879546344280243, "logits/rejected": 0.3220532536506653, "logps/chosen": -191.69386291503906, "logps/rejected": -216.99566650390625, "loss": 0.7088, "rewards/accuracies": 0.5, "rewards/chosen": -0.007712649181485176, "rewards/margins": -0.026423165574669838, "rewards/rejected": 0.01871052011847496, "step": 194 }, { "epoch": 0.195, "grad_norm": 0.9428632259368896, "learning_rate": 9.75e-07, "logits/chosen": 0.7700178623199463, "logits/rejected": 0.16120345890522003, "logps/chosen": -261.6532897949219, "logps/rejected": -182.89932250976562, "loss": 0.6539, "rewards/accuracies": 0.625, "rewards/chosen": 0.0686354711651802, "rewards/margins": 0.08433666080236435, "rewards/rejected": -0.01570119895040989, "step": 195 }, { "epoch": 0.196, "grad_norm": 1.0294092893600464, "learning_rate": 9.8e-07, "logits/chosen": 0.9205968379974365, "logits/rejected": 0.3811769485473633, "logps/chosen": -302.5023193359375, "logps/rejected": -184.22532653808594, "loss": 0.6835, "rewards/accuracies": 0.5, "rewards/chosen": -0.005384732503443956, "rewards/margins": 0.020464610308408737, "rewards/rejected": -0.025849342346191406, "step": 196 }, { "epoch": 0.197, "grad_norm": 0.9761362075805664, "learning_rate": 9.849999999999999e-07, "logits/chosen": 0.43990767002105713, "logits/rejected": 0.5650414824485779, "logps/chosen": -189.61451721191406, "logps/rejected": -161.92764282226562, "loss": 0.7034, "rewards/accuracies": 0.5, "rewards/chosen": -0.01175546646118164, "rewards/margins": -0.019323254004120827, "rewards/rejected": 0.007567785680294037, "step": 197 }, { "epoch": 0.198, "grad_norm": 0.7941681742668152, "learning_rate": 9.9e-07, "logits/chosen": 0.8007881045341492, "logits/rejected": -0.14505337178707123, "logps/chosen": -220.75921630859375, "logps/rejected": -134.06614685058594, "loss": 0.6779, "rewards/accuracies": 0.75, "rewards/chosen": 0.029970644041895866, "rewards/margins": 0.03197288513183594, "rewards/rejected": -0.0020022401586174965, "step": 198 }, { "epoch": 0.199, "grad_norm": 1.0160140991210938, "learning_rate": 9.95e-07, "logits/chosen": 0.622139036655426, "logits/rejected": 0.4136618375778198, "logps/chosen": -233.63136291503906, "logps/rejected": -196.47933959960938, "loss": 0.6903, "rewards/accuracies": 0.5, "rewards/chosen": -0.01917295530438423, "rewards/margins": 0.010200023651123047, "rewards/rejected": -0.02937297895550728, "step": 199 }, { "epoch": 0.2, "grad_norm": 1.0923044681549072, "learning_rate": 1e-06, "logits/chosen": 0.6668936014175415, "logits/rejected": 0.32691580057144165, "logps/chosen": -245.63449096679688, "logps/rejected": -165.91786193847656, "loss": 0.6858, "rewards/accuracies": 0.625, "rewards/chosen": -0.001226138323545456, "rewards/margins": 0.01692371256649494, "rewards/rejected": -0.018149852752685547, "step": 200 }, { "epoch": 0.201, "grad_norm": 1.3662002086639404, "learning_rate": 1.005e-06, "logits/chosen": 0.16802039742469788, "logits/rejected": 0.5549113750457764, "logps/chosen": -137.806884765625, "logps/rejected": -167.52847290039062, "loss": 0.6712, "rewards/accuracies": 0.625, "rewards/chosen": 0.060404397547245026, "rewards/margins": 0.047501660883426666, "rewards/rejected": 0.01290273666381836, "step": 201 }, { "epoch": 0.202, "grad_norm": 0.9281997680664062, "learning_rate": 1.0099999999999999e-06, "logits/chosen": 0.37509095668792725, "logits/rejected": 0.6319289207458496, "logps/chosen": -167.23583984375, "logps/rejected": -225.6464385986328, "loss": 0.7018, "rewards/accuracies": 0.375, "rewards/chosen": -0.00023994408547878265, "rewards/margins": -0.01694660261273384, "rewards/rejected": 0.01670665852725506, "step": 202 }, { "epoch": 0.203, "grad_norm": 1.1020464897155762, "learning_rate": 1.0149999999999998e-06, "logits/chosen": 0.36056020855903625, "logits/rejected": 0.6125026941299438, "logps/chosen": -193.30084228515625, "logps/rejected": -204.798095703125, "loss": 0.6676, "rewards/accuracies": 0.625, "rewards/chosen": 0.029367923736572266, "rewards/margins": 0.055663298815488815, "rewards/rejected": -0.02629537507891655, "step": 203 }, { "epoch": 0.204, "grad_norm": 1.2723053693771362, "learning_rate": 1.02e-06, "logits/chosen": 0.6459702253341675, "logits/rejected": 0.4811781644821167, "logps/chosen": -219.1228790283203, "logps/rejected": -142.2452850341797, "loss": 0.6976, "rewards/accuracies": 0.5, "rewards/chosen": -0.00907297246158123, "rewards/margins": -0.0069632502272725105, "rewards/rejected": -0.002109719440340996, "step": 204 }, { "epoch": 0.205, "grad_norm": 1.1933237314224243, "learning_rate": 1.025e-06, "logits/chosen": 0.4243503510951996, "logits/rejected": 0.027152959257364273, "logps/chosen": -233.4176788330078, "logps/rejected": -158.333251953125, "loss": 0.702, "rewards/accuracies": 0.375, "rewards/chosen": -0.005216693971306086, "rewards/margins": -0.017008066177368164, "rewards/rejected": 0.011791372671723366, "step": 205 }, { "epoch": 0.206, "grad_norm": 0.8023396134376526, "learning_rate": 1.0299999999999999e-06, "logits/chosen": 0.5142326354980469, "logits/rejected": 0.07853871583938599, "logps/chosen": -274.01141357421875, "logps/rejected": -134.2783203125, "loss": 0.6741, "rewards/accuracies": 0.75, "rewards/chosen": 0.031002426519989967, "rewards/margins": 0.0412176139652729, "rewards/rejected": -0.010215186513960361, "step": 206 }, { "epoch": 0.207, "grad_norm": 0.8965845704078674, "learning_rate": 1.0349999999999998e-06, "logits/chosen": 0.633490800857544, "logits/rejected": 0.5871403813362122, "logps/chosen": -292.2937927246094, "logps/rejected": -175.78436279296875, "loss": 0.6925, "rewards/accuracies": 0.375, "rewards/chosen": -0.007471560966223478, "rewards/margins": 0.002490903250873089, "rewards/rejected": -0.00996246375143528, "step": 207 }, { "epoch": 0.208, "grad_norm": 1.0301687717437744, "learning_rate": 1.04e-06, "logits/chosen": 0.5863943696022034, "logits/rejected": 0.11886841803789139, "logps/chosen": -219.891845703125, "logps/rejected": -157.77317810058594, "loss": 0.6775, "rewards/accuracies": 0.5, "rewards/chosen": 0.0323365218937397, "rewards/margins": 0.03505973890423775, "rewards/rejected": -0.002723217010498047, "step": 208 }, { "epoch": 0.209, "grad_norm": 0.8676708936691284, "learning_rate": 1.045e-06, "logits/chosen": 0.4543479084968567, "logits/rejected": 0.26804134249687195, "logps/chosen": -155.9835968017578, "logps/rejected": -166.5315704345703, "loss": 0.6764, "rewards/accuracies": 0.75, "rewards/chosen": 0.01345362700521946, "rewards/margins": 0.03463177755475044, "rewards/rejected": -0.021178152412176132, "step": 209 }, { "epoch": 0.21, "grad_norm": 1.033105492591858, "learning_rate": 1.05e-06, "logits/chosen": 0.7584234476089478, "logits/rejected": 0.7504396438598633, "logps/chosen": -275.835693359375, "logps/rejected": -216.526611328125, "loss": 0.7069, "rewards/accuracies": 0.375, "rewards/chosen": -0.028854846954345703, "rewards/margins": -0.025314712896943092, "rewards/rejected": -0.0035401349887251854, "step": 210 }, { "epoch": 0.211, "grad_norm": 1.025468111038208, "learning_rate": 1.0549999999999999e-06, "logits/chosen": 0.49675309658050537, "logits/rejected": 0.3557896614074707, "logps/chosen": -272.4500427246094, "logps/rejected": -204.13558959960938, "loss": 0.7078, "rewards/accuracies": 0.375, "rewards/chosen": -0.020186472684144974, "rewards/margins": -0.027920201420783997, "rewards/rejected": 0.007733725011348724, "step": 211 }, { "epoch": 0.212, "grad_norm": 0.9611704349517822, "learning_rate": 1.06e-06, "logits/chosen": 1.0949403047561646, "logits/rejected": 1.0453746318817139, "logps/chosen": -318.9901123046875, "logps/rejected": -312.2640075683594, "loss": 0.6984, "rewards/accuracies": 0.375, "rewards/chosen": -0.003883647732436657, "rewards/margins": -0.008245562203228474, "rewards/rejected": 0.004361914470791817, "step": 212 }, { "epoch": 0.213, "grad_norm": 1.0473815202713013, "learning_rate": 1.065e-06, "logits/chosen": 0.417339026927948, "logits/rejected": 0.649168074131012, "logps/chosen": -158.218505859375, "logps/rejected": -201.0109100341797, "loss": 0.6896, "rewards/accuracies": 0.625, "rewards/chosen": -0.0074952132999897, "rewards/margins": 0.009994888678193092, "rewards/rejected": -0.017490103840827942, "step": 213 }, { "epoch": 0.214, "grad_norm": 0.994734525680542, "learning_rate": 1.07e-06, "logits/chosen": 0.19550496339797974, "logits/rejected": 0.3977714776992798, "logps/chosen": -152.03826904296875, "logps/rejected": -214.7374267578125, "loss": 0.6875, "rewards/accuracies": 0.625, "rewards/chosen": -0.012615585699677467, "rewards/margins": 0.013880539685487747, "rewards/rejected": -0.026496123522520065, "step": 214 }, { "epoch": 0.215, "grad_norm": 1.0423827171325684, "learning_rate": 1.0749999999999999e-06, "logits/chosen": 0.5889912247657776, "logits/rejected": 0.37784343957901, "logps/chosen": -254.80868530273438, "logps/rejected": -178.99151611328125, "loss": 0.6698, "rewards/accuracies": 0.625, "rewards/chosen": 0.03173799812793732, "rewards/margins": 0.0491521842777729, "rewards/rejected": -0.017414189875125885, "step": 215 }, { "epoch": 0.216, "grad_norm": 1.1154192686080933, "learning_rate": 1.08e-06, "logits/chosen": 0.3470994234085083, "logits/rejected": 0.7319322824478149, "logps/chosen": -282.77227783203125, "logps/rejected": -237.03175354003906, "loss": 0.7186, "rewards/accuracies": 0.375, "rewards/chosen": -0.0230928435921669, "rewards/margins": -0.04832811653614044, "rewards/rejected": 0.025235271081328392, "step": 216 }, { "epoch": 0.217, "grad_norm": 0.9417163133621216, "learning_rate": 1.085e-06, "logits/chosen": 0.5659552216529846, "logits/rejected": 0.4735337793827057, "logps/chosen": -233.456298828125, "logps/rejected": -162.9910888671875, "loss": 0.711, "rewards/accuracies": 0.25, "rewards/chosen": 0.002686501946300268, "rewards/margins": -0.03443107753992081, "rewards/rejected": 0.03711757808923721, "step": 217 }, { "epoch": 0.218, "grad_norm": 0.9817081689834595, "learning_rate": 1.09e-06, "logits/chosen": 0.6738709807395935, "logits/rejected": 0.23898851871490479, "logps/chosen": -236.48480224609375, "logps/rejected": -137.30372619628906, "loss": 0.7069, "rewards/accuracies": 0.25, "rewards/chosen": 0.014798354357481003, "rewards/margins": -0.023100471124053, "rewards/rejected": 0.03789882734417915, "step": 218 }, { "epoch": 0.219, "grad_norm": 1.3535319566726685, "learning_rate": 1.0949999999999999e-06, "logits/chosen": 0.7047688961029053, "logits/rejected": 0.17510001361370087, "logps/chosen": -235.05984497070312, "logps/rejected": -142.72828674316406, "loss": 0.6978, "rewards/accuracies": 0.375, "rewards/chosen": 0.022247983142733574, "rewards/margins": -0.00670471228659153, "rewards/rejected": 0.028952695429325104, "step": 219 }, { "epoch": 0.22, "grad_norm": 1.4029165506362915, "learning_rate": 1.1e-06, "logits/chosen": 0.026009537279605865, "logits/rejected": 1.2012250423431396, "logps/chosen": -159.6278839111328, "logps/rejected": -247.75607299804688, "loss": 0.6725, "rewards/accuracies": 0.75, "rewards/chosen": 0.02441844716668129, "rewards/margins": 0.04555254057049751, "rewards/rejected": -0.021134091541171074, "step": 220 }, { "epoch": 0.221, "grad_norm": 1.2930941581726074, "learning_rate": 1.105e-06, "logits/chosen": 0.8590155839920044, "logits/rejected": 1.0406197309494019, "logps/chosen": -223.3317413330078, "logps/rejected": -239.08737182617188, "loss": 0.6952, "rewards/accuracies": 0.625, "rewards/chosen": 0.0029329778626561165, "rewards/margins": -0.0019862670451402664, "rewards/rejected": 0.004919243510812521, "step": 221 }, { "epoch": 0.222, "grad_norm": 1.2898215055465698, "learning_rate": 1.11e-06, "logits/chosen": -0.16380517184734344, "logits/rejected": 0.9819362163543701, "logps/chosen": -80.77042388916016, "logps/rejected": -220.115966796875, "loss": 0.6609, "rewards/accuracies": 0.75, "rewards/chosen": 0.03983726724982262, "rewards/margins": 0.06667843461036682, "rewards/rejected": -0.026841163635253906, "step": 222 }, { "epoch": 0.223, "grad_norm": 0.9380588531494141, "learning_rate": 1.115e-06, "logits/chosen": 0.9598632454872131, "logits/rejected": 0.7423070669174194, "logps/chosen": -218.65234375, "logps/rejected": -191.31365966796875, "loss": 0.6725, "rewards/accuracies": 0.625, "rewards/chosen": 0.005873680114746094, "rewards/margins": 0.04810181260108948, "rewards/rejected": -0.042228132486343384, "step": 223 }, { "epoch": 0.224, "grad_norm": 0.8731001615524292, "learning_rate": 1.12e-06, "logits/chosen": 0.7347698211669922, "logits/rejected": 0.4002431631088257, "logps/chosen": -254.91307067871094, "logps/rejected": -163.71188354492188, "loss": 0.6831, "rewards/accuracies": 0.875, "rewards/chosen": 0.01957721821963787, "rewards/margins": 0.022014137357473373, "rewards/rejected": -0.0024369237944483757, "step": 224 }, { "epoch": 0.225, "grad_norm": 1.2058078050613403, "learning_rate": 1.125e-06, "logits/chosen": -0.04991581290960312, "logits/rejected": 1.4159826040267944, "logps/chosen": -143.93084716796875, "logps/rejected": -299.35302734375, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.009054755792021751, "rewards/margins": 0.002250481629744172, "rewards/rejected": 0.00680427486076951, "step": 225 }, { "epoch": 0.226, "grad_norm": 0.9291170239448547, "learning_rate": 1.1299999999999998e-06, "logits/chosen": 0.5577459335327148, "logits/rejected": 0.6230242848396301, "logps/chosen": -188.72665405273438, "logps/rejected": -160.44406127929688, "loss": 0.6852, "rewards/accuracies": 0.625, "rewards/chosen": 0.03172335773706436, "rewards/margins": 0.017669152468442917, "rewards/rejected": 0.014054203405976295, "step": 226 }, { "epoch": 0.227, "grad_norm": 1.0671693086624146, "learning_rate": 1.135e-06, "logits/chosen": 0.25954145193099976, "logits/rejected": 0.8623673319816589, "logps/chosen": -127.24774169921875, "logps/rejected": -206.2274932861328, "loss": 0.7112, "rewards/accuracies": 0.625, "rewards/chosen": 0.019540881738066673, "rewards/margins": -0.03233280032873154, "rewards/rejected": 0.05187368392944336, "step": 227 }, { "epoch": 0.228, "grad_norm": 1.0142183303833008, "learning_rate": 1.1399999999999999e-06, "logits/chosen": 0.7570252418518066, "logits/rejected": 0.5237069129943848, "logps/chosen": -315.77130126953125, "logps/rejected": -188.66146850585938, "loss": 0.6854, "rewards/accuracies": 0.375, "rewards/chosen": -0.0038170821499079466, "rewards/margins": 0.018517781049013138, "rewards/rejected": -0.022334862500429153, "step": 228 }, { "epoch": 0.229, "grad_norm": 1.1599078178405762, "learning_rate": 1.145e-06, "logits/chosen": 0.42911285161972046, "logits/rejected": 0.5244413018226624, "logps/chosen": -276.21392822265625, "logps/rejected": -212.75729370117188, "loss": 0.7131, "rewards/accuracies": 0.375, "rewards/chosen": -0.017008017748594284, "rewards/margins": -0.03684301674365997, "rewards/rejected": 0.01983499526977539, "step": 229 }, { "epoch": 0.23, "grad_norm": 1.0550422668457031, "learning_rate": 1.1499999999999998e-06, "logits/chosen": 0.14167088270187378, "logits/rejected": 0.14782150089740753, "logps/chosen": -178.90139770507812, "logps/rejected": -193.51400756835938, "loss": 0.6698, "rewards/accuracies": 0.75, "rewards/chosen": 0.0370451919734478, "rewards/margins": 0.04819011688232422, "rewards/rejected": -0.011144926771521568, "step": 230 }, { "epoch": 0.231, "grad_norm": 0.9601698517799377, "learning_rate": 1.155e-06, "logits/chosen": 0.3925574719905853, "logits/rejected": -0.07344101369380951, "logps/chosen": -174.36093139648438, "logps/rejected": -134.32225036621094, "loss": 0.6974, "rewards/accuracies": 0.375, "rewards/chosen": -0.015920449048280716, "rewards/margins": -0.007454013451933861, "rewards/rejected": -0.008466435596346855, "step": 231 }, { "epoch": 0.232, "grad_norm": 1.2822437286376953, "learning_rate": 1.16e-06, "logits/chosen": 0.43384799361228943, "logits/rejected": -0.23258158564567566, "logps/chosen": -211.48863220214844, "logps/rejected": -152.5162353515625, "loss": 0.7016, "rewards/accuracies": 0.5, "rewards/chosen": -0.019832514226436615, "rewards/margins": -0.012744517996907234, "rewards/rejected": -0.007087994366884232, "step": 232 }, { "epoch": 0.233, "grad_norm": 1.1268254518508911, "learning_rate": 1.165e-06, "logits/chosen": 0.1507655531167984, "logits/rejected": 0.7390242218971252, "logps/chosen": -186.33319091796875, "logps/rejected": -213.1161346435547, "loss": 0.6529, "rewards/accuracies": 0.875, "rewards/chosen": 0.039742518216371536, "rewards/margins": 0.08386807888746262, "rewards/rejected": -0.04412555694580078, "step": 233 }, { "epoch": 0.234, "grad_norm": 1.151625394821167, "learning_rate": 1.1699999999999998e-06, "logits/chosen": 0.7470299601554871, "logits/rejected": 0.68381267786026, "logps/chosen": -218.18408203125, "logps/rejected": -151.82781982421875, "loss": 0.7107, "rewards/accuracies": 0.5, "rewards/chosen": -0.054589271545410156, "rewards/margins": -0.03116130270063877, "rewards/rejected": -0.023427966982126236, "step": 234 }, { "epoch": 0.235, "grad_norm": 0.9744617938995361, "learning_rate": 1.175e-06, "logits/chosen": 0.5122992396354675, "logits/rejected": 0.0005918033421039581, "logps/chosen": -234.93206787109375, "logps/rejected": -139.8157958984375, "loss": 0.7022, "rewards/accuracies": 0.25, "rewards/chosen": -0.013765526935458183, "rewards/margins": -0.014237597584724426, "rewards/rejected": 0.000472070649266243, "step": 235 }, { "epoch": 0.236, "grad_norm": 1.453041434288025, "learning_rate": 1.18e-06, "logits/chosen": 0.33123505115509033, "logits/rejected": 0.9805466532707214, "logps/chosen": -119.31253051757812, "logps/rejected": -316.982177734375, "loss": 0.6815, "rewards/accuracies": 0.75, "rewards/chosen": 0.031664323061704636, "rewards/margins": 0.024255797266960144, "rewards/rejected": 0.007408524863421917, "step": 236 }, { "epoch": 0.237, "grad_norm": 1.0757273435592651, "learning_rate": 1.185e-06, "logits/chosen": 0.5614053010940552, "logits/rejected": 0.9187469482421875, "logps/chosen": -232.79087829589844, "logps/rejected": -300.9568786621094, "loss": 0.6817, "rewards/accuracies": 0.5, "rewards/chosen": 0.0011318204924464226, "rewards/margins": 0.024785613641142845, "rewards/rejected": -0.023653794080018997, "step": 237 }, { "epoch": 0.238, "grad_norm": 0.9605821371078491, "learning_rate": 1.1899999999999998e-06, "logits/chosen": 0.9101227521896362, "logits/rejected": 0.7197586297988892, "logps/chosen": -184.09756469726562, "logps/rejected": -162.18165588378906, "loss": 0.7123, "rewards/accuracies": 0.25, "rewards/chosen": -0.03847751393914223, "rewards/margins": -0.03663825988769531, "rewards/rejected": -0.0018392566125839949, "step": 238 }, { "epoch": 0.239, "grad_norm": 1.0338730812072754, "learning_rate": 1.195e-06, "logits/chosen": 0.11375229805707932, "logits/rejected": 0.8196192979812622, "logps/chosen": -195.7779083251953, "logps/rejected": -222.14991760253906, "loss": 0.6581, "rewards/accuracies": 0.875, "rewards/chosen": 0.030839061364531517, "rewards/margins": 0.07247690856456757, "rewards/rejected": -0.0416378527879715, "step": 239 }, { "epoch": 0.24, "grad_norm": 1.3301876783370972, "learning_rate": 1.2e-06, "logits/chosen": 0.30875691771507263, "logits/rejected": 1.1047029495239258, "logps/chosen": -179.04466247558594, "logps/rejected": -262.42144775390625, "loss": 0.6893, "rewards/accuracies": 0.5, "rewards/chosen": 0.006791114807128906, "rewards/margins": 0.009503462351858616, "rewards/rejected": -0.0027123470790684223, "step": 240 }, { "epoch": 0.241, "grad_norm": 0.8561538457870483, "learning_rate": 1.2050000000000001e-06, "logits/chosen": 0.29375073313713074, "logits/rejected": 0.8493534326553345, "logps/chosen": -162.91632080078125, "logps/rejected": -192.84698486328125, "loss": 0.6617, "rewards/accuracies": 0.875, "rewards/chosen": 0.0299687422811985, "rewards/margins": 0.06603317707777023, "rewards/rejected": -0.03606443479657173, "step": 241 }, { "epoch": 0.242, "grad_norm": 0.9859458208084106, "learning_rate": 1.2099999999999998e-06, "logits/chosen": 0.3072091042995453, "logits/rejected": 0.199928879737854, "logps/chosen": -210.32720947265625, "logps/rejected": -185.28997802734375, "loss": 0.69, "rewards/accuracies": 0.625, "rewards/chosen": 0.018975306302309036, "rewards/margins": 0.010487412102520466, "rewards/rejected": 0.008487893268465996, "step": 242 }, { "epoch": 0.243, "grad_norm": 1.053462028503418, "learning_rate": 1.215e-06, "logits/chosen": 0.6505094766616821, "logits/rejected": 0.22370877861976624, "logps/chosen": -281.7289733886719, "logps/rejected": -234.56509399414062, "loss": 0.695, "rewards/accuracies": 0.625, "rewards/chosen": 5.741044878959656e-05, "rewards/margins": 0.006135933101177216, "rewards/rejected": -0.006078529637306929, "step": 243 }, { "epoch": 0.244, "grad_norm": 0.9720907211303711, "learning_rate": 1.22e-06, "logits/chosen": 0.33044305443763733, "logits/rejected": 0.6324706673622131, "logps/chosen": -243.8623809814453, "logps/rejected": -178.7584228515625, "loss": 0.6718, "rewards/accuracies": 0.75, "rewards/chosen": 0.041851140558719635, "rewards/margins": 0.0472412109375, "rewards/rejected": -0.005390070378780365, "step": 244 }, { "epoch": 0.245, "grad_norm": 1.1169217824935913, "learning_rate": 1.2250000000000001e-06, "logits/chosen": 0.6289919018745422, "logits/rejected": 0.7100332975387573, "logps/chosen": -203.35214233398438, "logps/rejected": -268.24365234375, "loss": 0.6765, "rewards/accuracies": 0.625, "rewards/chosen": 0.005020712967962027, "rewards/margins": 0.04044341668486595, "rewards/rejected": -0.035422708839178085, "step": 245 }, { "epoch": 0.246, "grad_norm": 1.037975549697876, "learning_rate": 1.2299999999999999e-06, "logits/chosen": -0.36083054542541504, "logits/rejected": 0.7738804817199707, "logps/chosen": -94.34832000732422, "logps/rejected": -254.9934844970703, "loss": 0.6655, "rewards/accuracies": 0.75, "rewards/chosen": 0.04026695340871811, "rewards/margins": 0.05974222347140312, "rewards/rejected": -0.019475270062685013, "step": 246 }, { "epoch": 0.247, "grad_norm": 1.1274217367172241, "learning_rate": 1.235e-06, "logits/chosen": 0.7561760544776917, "logits/rejected": 0.05882197618484497, "logps/chosen": -389.09002685546875, "logps/rejected": -181.12782287597656, "loss": 0.6926, "rewards/accuracies": 0.5, "rewards/chosen": -0.03654804080724716, "rewards/margins": 0.005356781184673309, "rewards/rejected": -0.04190482944250107, "step": 247 }, { "epoch": 0.248, "grad_norm": 1.0334943532943726, "learning_rate": 1.24e-06, "logits/chosen": 0.7918481826782227, "logits/rejected": 0.38922035694122314, "logps/chosen": -220.61294555664062, "logps/rejected": -117.56398010253906, "loss": 0.6878, "rewards/accuracies": 0.5, "rewards/chosen": 0.01936369016766548, "rewards/margins": 0.0129791758954525, "rewards/rejected": 0.006384516134858131, "step": 248 }, { "epoch": 0.249, "grad_norm": 1.0406079292297363, "learning_rate": 1.2450000000000002e-06, "logits/chosen": 0.15759213268756866, "logits/rejected": 0.9595627784729004, "logps/chosen": -150.45587158203125, "logps/rejected": -204.47686767578125, "loss": 0.6627, "rewards/accuracies": 0.875, "rewards/chosen": 0.021280763670802116, "rewards/margins": 0.06249905005097389, "rewards/rejected": -0.04121828079223633, "step": 249 }, { "epoch": 0.25, "grad_norm": 0.816681981086731, "learning_rate": 1.2499999999999999e-06, "logits/chosen": -0.0002520531415939331, "logits/rejected": 0.5922094583511353, "logps/chosen": -130.68069458007812, "logps/rejected": -184.4814910888672, "loss": 0.6667, "rewards/accuracies": 0.875, "rewards/chosen": 0.028873300179839134, "rewards/margins": 0.055608704686164856, "rewards/rejected": -0.026735402643680573, "step": 250 }, { "epoch": 0.251, "grad_norm": 1.0321588516235352, "learning_rate": 1.2549999999999998e-06, "logits/chosen": 0.3808414340019226, "logits/rejected": 0.6866657733917236, "logps/chosen": -179.98806762695312, "logps/rejected": -244.78228759765625, "loss": 0.6875, "rewards/accuracies": 0.375, "rewards/chosen": -0.024824000895023346, "rewards/margins": 0.014104320667684078, "rewards/rejected": -0.0389283187687397, "step": 251 }, { "epoch": 0.252, "grad_norm": 0.9056465029716492, "learning_rate": 1.26e-06, "logits/chosen": 0.5619550943374634, "logits/rejected": 0.6100658774375916, "logps/chosen": -191.25759887695312, "logps/rejected": -194.38296508789062, "loss": 0.6748, "rewards/accuracies": 0.625, "rewards/chosen": 0.03409580886363983, "rewards/margins": 0.03895192593336105, "rewards/rejected": -0.004856109619140625, "step": 252 }, { "epoch": 0.253, "grad_norm": 1.178154468536377, "learning_rate": 1.2649999999999997e-06, "logits/chosen": 0.2623811364173889, "logits/rejected": 0.6419914364814758, "logps/chosen": -156.36483764648438, "logps/rejected": -208.58937072753906, "loss": 0.6585, "rewards/accuracies": 0.625, "rewards/chosen": 0.04258785396814346, "rewards/margins": 0.07212629169225693, "rewards/rejected": -0.029538441449403763, "step": 253 }, { "epoch": 0.254, "grad_norm": 0.9678664803504944, "learning_rate": 1.27e-06, "logits/chosen": 0.8385523557662964, "logits/rejected": 0.269305020570755, "logps/chosen": -267.43072509765625, "logps/rejected": -192.57327270507812, "loss": 0.7056, "rewards/accuracies": 0.375, "rewards/chosen": -0.018472956493496895, "rewards/margins": -0.019736196845769882, "rewards/rejected": 0.0012632384896278381, "step": 254 }, { "epoch": 0.255, "grad_norm": 1.3888988494873047, "learning_rate": 1.2749999999999999e-06, "logits/chosen": 0.5060139894485474, "logits/rejected": 1.0692815780639648, "logps/chosen": -163.1456298828125, "logps/rejected": -320.25970458984375, "loss": 0.6358, "rewards/accuracies": 0.875, "rewards/chosen": 0.0474369078874588, "rewards/margins": 0.12008476257324219, "rewards/rejected": -0.07264785468578339, "step": 255 }, { "epoch": 0.256, "grad_norm": 1.0056661367416382, "learning_rate": 1.28e-06, "logits/chosen": 0.4457331895828247, "logits/rejected": 0.5648226141929626, "logps/chosen": -167.50186157226562, "logps/rejected": -170.70628356933594, "loss": 0.6905, "rewards/accuracies": 0.625, "rewards/chosen": 0.0006772037595510483, "rewards/margins": 0.006837702356278896, "rewards/rejected": -0.006160498131066561, "step": 256 }, { "epoch": 0.257, "grad_norm": 0.9882188439369202, "learning_rate": 1.2849999999999998e-06, "logits/chosen": 0.7015238404273987, "logits/rejected": 0.613044261932373, "logps/chosen": -193.90866088867188, "logps/rejected": -192.89224243164062, "loss": 0.6959, "rewards/accuracies": 0.5, "rewards/chosen": 0.034049175679683685, "rewards/margins": -0.002407122403383255, "rewards/rejected": 0.03645630180835724, "step": 257 }, { "epoch": 0.258, "grad_norm": 1.4017807245254517, "learning_rate": 1.29e-06, "logits/chosen": 0.4758968651294708, "logits/rejected": 0.5275716781616211, "logps/chosen": -250.48095703125, "logps/rejected": -187.55422973632812, "loss": 0.7435, "rewards/accuracies": 0.125, "rewards/chosen": -0.0260786060243845, "rewards/margins": -0.09537658095359802, "rewards/rejected": 0.06929797679185867, "step": 258 }, { "epoch": 0.259, "grad_norm": 1.0112618207931519, "learning_rate": 1.2949999999999999e-06, "logits/chosen": 0.2985512614250183, "logits/rejected": 0.07122158259153366, "logps/chosen": -163.86087036132812, "logps/rejected": -145.13104248046875, "loss": 0.6779, "rewards/accuracies": 0.5, "rewards/chosen": 0.04700083285570145, "rewards/margins": 0.03425155207514763, "rewards/rejected": 0.012749290093779564, "step": 259 }, { "epoch": 0.26, "grad_norm": 1.0614863634109497, "learning_rate": 1.3e-06, "logits/chosen": 0.606985330581665, "logits/rejected": 0.49054625630378723, "logps/chosen": -203.87210083007812, "logps/rejected": -199.43646240234375, "loss": 0.6618, "rewards/accuracies": 0.75, "rewards/chosen": 0.02793855592608452, "rewards/margins": 0.06660003960132599, "rewards/rejected": -0.03866147994995117, "step": 260 }, { "epoch": 0.261, "grad_norm": 1.4057782888412476, "learning_rate": 1.3049999999999998e-06, "logits/chosen": 0.3616446852684021, "logits/rejected": 0.5133142471313477, "logps/chosen": -233.7707061767578, "logps/rejected": -283.2337341308594, "loss": 0.7042, "rewards/accuracies": 0.5, "rewards/chosen": 0.014948271214962006, "rewards/margins": -0.008520899340510368, "rewards/rejected": 0.023469163104891777, "step": 261 }, { "epoch": 0.262, "grad_norm": 1.118856430053711, "learning_rate": 1.31e-06, "logits/chosen": 0.07856161892414093, "logits/rejected": 0.49817702174186707, "logps/chosen": -157.9601287841797, "logps/rejected": -209.18885803222656, "loss": 0.6849, "rewards/accuracies": 0.625, "rewards/chosen": -0.007783079985529184, "rewards/margins": 0.01976027525961399, "rewards/rejected": -0.027543358504772186, "step": 262 }, { "epoch": 0.263, "grad_norm": 1.2943832874298096, "learning_rate": 1.315e-06, "logits/chosen": 1.0386327505111694, "logits/rejected": 0.135043665766716, "logps/chosen": -316.94183349609375, "logps/rejected": -182.78884887695312, "loss": 0.7102, "rewards/accuracies": 0.375, "rewards/chosen": -0.03491248935461044, "rewards/margins": -0.03238806873559952, "rewards/rejected": -0.002524423412978649, "step": 263 }, { "epoch": 0.264, "grad_norm": 1.0862984657287598, "learning_rate": 1.32e-06, "logits/chosen": 0.5695623159408569, "logits/rejected": 0.49722468852996826, "logps/chosen": -230.70803833007812, "logps/rejected": -170.88314819335938, "loss": 0.6981, "rewards/accuracies": 0.625, "rewards/chosen": -0.007489398121833801, "rewards/margins": -0.003032691776752472, "rewards/rejected": -0.004456710070371628, "step": 264 }, { "epoch": 0.265, "grad_norm": 1.0834001302719116, "learning_rate": 1.3249999999999998e-06, "logits/chosen": 0.718224823474884, "logits/rejected": 0.7044109106063843, "logps/chosen": -263.6732177734375, "logps/rejected": -213.95469665527344, "loss": 0.6376, "rewards/accuracies": 0.75, "rewards/chosen": 0.10388221591711044, "rewards/margins": 0.12207815051078796, "rewards/rejected": -0.018195917829871178, "step": 265 }, { "epoch": 0.266, "grad_norm": 1.086617112159729, "learning_rate": 1.33e-06, "logits/chosen": 0.5830378532409668, "logits/rejected": 0.40679794549942017, "logps/chosen": -270.5284729003906, "logps/rejected": -153.62144470214844, "loss": 0.6871, "rewards/accuracies": 0.5, "rewards/chosen": 0.0008919714018702507, "rewards/margins": 0.014483547769486904, "rewards/rejected": -0.013591578230261803, "step": 266 }, { "epoch": 0.267, "grad_norm": 0.8864206075668335, "learning_rate": 1.335e-06, "logits/chosen": 0.29727035760879517, "logits/rejected": 0.13846389949321747, "logps/chosen": -173.50189208984375, "logps/rejected": -174.86468505859375, "loss": 0.6721, "rewards/accuracies": 0.75, "rewards/chosen": 0.032854367047548294, "rewards/margins": 0.044970035552978516, "rewards/rejected": -0.012115669436752796, "step": 267 }, { "epoch": 0.268, "grad_norm": 1.2493860721588135, "learning_rate": 1.34e-06, "logits/chosen": 0.8189170956611633, "logits/rejected": -0.042558491230010986, "logps/chosen": -282.247314453125, "logps/rejected": -165.76670837402344, "loss": 0.712, "rewards/accuracies": 0.5, "rewards/chosen": -0.0184299498796463, "rewards/margins": -0.025797706097364426, "rewards/rejected": 0.007367757149040699, "step": 268 }, { "epoch": 0.269, "grad_norm": 1.4965965747833252, "learning_rate": 1.3449999999999998e-06, "logits/chosen": 0.5348610877990723, "logits/rejected": 0.40121662616729736, "logps/chosen": -250.91021728515625, "logps/rejected": -188.557373046875, "loss": 0.7451, "rewards/accuracies": 0.375, "rewards/chosen": -0.012908173725008965, "rewards/margins": -0.08940229564905167, "rewards/rejected": 0.07649411261081696, "step": 269 }, { "epoch": 0.27, "grad_norm": 1.2582502365112305, "learning_rate": 1.35e-06, "logits/chosen": 0.48767518997192383, "logits/rejected": 0.544613778591156, "logps/chosen": -164.61578369140625, "logps/rejected": -178.47984313964844, "loss": 0.652, "rewards/accuracies": 0.875, "rewards/chosen": 0.024373626336455345, "rewards/margins": 0.08833789825439453, "rewards/rejected": -0.06396427750587463, "step": 270 }, { "epoch": 0.271, "grad_norm": 1.1193361282348633, "learning_rate": 1.355e-06, "logits/chosen": -0.06395981460809708, "logits/rejected": 0.04221387580037117, "logps/chosen": -143.20152282714844, "logps/rejected": -173.08810424804688, "loss": 0.6615, "rewards/accuracies": 0.75, "rewards/chosen": 0.05210190266370773, "rewards/margins": 0.06637449562549591, "rewards/rejected": -0.014272594824433327, "step": 271 }, { "epoch": 0.272, "grad_norm": 1.4507025480270386, "learning_rate": 1.3600000000000001e-06, "logits/chosen": 0.3162364959716797, "logits/rejected": 0.9647402167320251, "logps/chosen": -174.81834411621094, "logps/rejected": -288.8807373046875, "loss": 0.6626, "rewards/accuracies": 0.625, "rewards/chosen": 0.012935353443026543, "rewards/margins": 0.06649522483348846, "rewards/rejected": -0.05355988070368767, "step": 272 }, { "epoch": 0.273, "grad_norm": 1.0432432889938354, "learning_rate": 1.3649999999999998e-06, "logits/chosen": 0.06650448590517044, "logits/rejected": 0.731299877166748, "logps/chosen": -126.62890625, "logps/rejected": -246.07095336914062, "loss": 0.6701, "rewards/accuracies": 0.5, "rewards/chosen": 0.023127270862460136, "rewards/margins": 0.05239543318748474, "rewards/rejected": -0.02926817163825035, "step": 273 }, { "epoch": 0.274, "grad_norm": 0.9217251539230347, "learning_rate": 1.37e-06, "logits/chosen": -0.19956381618976593, "logits/rejected": 0.868782639503479, "logps/chosen": -156.48728942871094, "logps/rejected": -212.50234985351562, "loss": 0.7042, "rewards/accuracies": 0.5, "rewards/chosen": -0.006143760867416859, "rewards/margins": -0.020163297653198242, "rewards/rejected": 0.014019537717103958, "step": 274 }, { "epoch": 0.275, "grad_norm": 1.2100605964660645, "learning_rate": 1.375e-06, "logits/chosen": 0.9690887928009033, "logits/rejected": 0.32842355966567993, "logps/chosen": -275.6150817871094, "logps/rejected": -144.61233520507812, "loss": 0.7073, "rewards/accuracies": 0.5, "rewards/chosen": -0.020016241818666458, "rewards/margins": -0.026662779971957207, "rewards/rejected": 0.006646538153290749, "step": 275 }, { "epoch": 0.276, "grad_norm": 1.1912081241607666, "learning_rate": 1.38e-06, "logits/chosen": 0.25868722796440125, "logits/rejected": 0.5688268542289734, "logps/chosen": -302.63653564453125, "logps/rejected": -286.3971252441406, "loss": 0.6688, "rewards/accuracies": 0.75, "rewards/chosen": 0.00042886845767498016, "rewards/margins": 0.057300373911857605, "rewards/rejected": -0.056871507316827774, "step": 276 }, { "epoch": 0.277, "grad_norm": 1.1540113687515259, "learning_rate": 1.3849999999999999e-06, "logits/chosen": 0.25070834159851074, "logits/rejected": 0.7768968343734741, "logps/chosen": -267.166748046875, "logps/rejected": -301.2960205078125, "loss": 0.6846, "rewards/accuracies": 0.625, "rewards/chosen": -0.01862325705587864, "rewards/margins": 0.022533509880304337, "rewards/rejected": -0.041156768798828125, "step": 277 }, { "epoch": 0.278, "grad_norm": 1.1594853401184082, "learning_rate": 1.3899999999999998e-06, "logits/chosen": 0.8049395084381104, "logits/rejected": 0.6259706020355225, "logps/chosen": -304.3548278808594, "logps/rejected": -201.62094116210938, "loss": 0.7169, "rewards/accuracies": 0.25, "rewards/chosen": -0.06666126847267151, "rewards/margins": -0.04408235475420952, "rewards/rejected": -0.022578906267881393, "step": 278 }, { "epoch": 0.279, "grad_norm": 0.9725602269172668, "learning_rate": 1.395e-06, "logits/chosen": 0.8681111931800842, "logits/rejected": 0.6754910945892334, "logps/chosen": -211.06529235839844, "logps/rejected": -195.4900360107422, "loss": 0.6936, "rewards/accuracies": 0.375, "rewards/chosen": -0.007837202399969101, "rewards/margins": 0.002758217975497246, "rewards/rejected": -0.010595417581498623, "step": 279 }, { "epoch": 0.28, "grad_norm": 1.0501736402511597, "learning_rate": 1.4e-06, "logits/chosen": 0.9491932988166809, "logits/rejected": 0.24285762012004852, "logps/chosen": -264.52923583984375, "logps/rejected": -194.5080108642578, "loss": 0.6907, "rewards/accuracies": 0.375, "rewards/chosen": 0.0011305809020996094, "rewards/margins": 0.009171007201075554, "rewards/rejected": -0.008040428161621094, "step": 280 }, { "epoch": 0.281, "grad_norm": 0.918915331363678, "learning_rate": 1.4049999999999999e-06, "logits/chosen": 0.33974218368530273, "logits/rejected": -0.17169322073459625, "logps/chosen": -188.0531005859375, "logps/rejected": -173.41542053222656, "loss": 0.6948, "rewards/accuracies": 0.625, "rewards/chosen": 0.01578063890337944, "rewards/margins": 0.010243415832519531, "rewards/rejected": 0.005537224933505058, "step": 281 }, { "epoch": 0.282, "grad_norm": 1.027968406677246, "learning_rate": 1.4099999999999998e-06, "logits/chosen": 0.44348403811454773, "logits/rejected": 0.07764196395874023, "logps/chosen": -287.67254638671875, "logps/rejected": -169.75991821289062, "loss": 0.6535, "rewards/accuracies": 0.75, "rewards/chosen": 0.02203545719385147, "rewards/margins": 0.08566318452358246, "rewards/rejected": -0.06362771987915039, "step": 282 }, { "epoch": 0.283, "grad_norm": 1.1000747680664062, "learning_rate": 1.415e-06, "logits/chosen": 0.635586142539978, "logits/rejected": 1.1161057949066162, "logps/chosen": -282.08111572265625, "logps/rejected": -249.9522247314453, "loss": 0.6905, "rewards/accuracies": 0.625, "rewards/chosen": 0.007414292544126511, "rewards/margins": 0.007960652932524681, "rewards/rejected": -0.0005463603883981705, "step": 283 }, { "epoch": 0.284, "grad_norm": 1.0493136644363403, "learning_rate": 1.42e-06, "logits/chosen": 0.746127724647522, "logits/rejected": 0.6457306742668152, "logps/chosen": -240.19064331054688, "logps/rejected": -189.3340606689453, "loss": 0.6888, "rewards/accuracies": 0.625, "rewards/chosen": 0.012227535247802734, "rewards/margins": 0.013124370947480202, "rewards/rejected": -0.000896836631000042, "step": 284 }, { "epoch": 0.285, "grad_norm": 1.015268325805664, "learning_rate": 1.425e-06, "logits/chosen": 1.037064790725708, "logits/rejected": 0.5407778024673462, "logps/chosen": -236.62799072265625, "logps/rejected": -142.71878051757812, "loss": 0.7106, "rewards/accuracies": 0.25, "rewards/chosen": -0.04743695259094238, "rewards/margins": -0.03324465453624725, "rewards/rejected": -0.014192295260727406, "step": 285 }, { "epoch": 0.286, "grad_norm": 1.561449408531189, "learning_rate": 1.4299999999999999e-06, "logits/chosen": 0.17505532503128052, "logits/rejected": 0.49383777379989624, "logps/chosen": -134.72837829589844, "logps/rejected": -271.65264892578125, "loss": 0.6298, "rewards/accuracies": 1.0, "rewards/chosen": 0.07894019782543182, "rewards/margins": 0.1316825896501541, "rewards/rejected": -0.05274238809943199, "step": 286 }, { "epoch": 0.287, "grad_norm": 1.1106423139572144, "learning_rate": 1.435e-06, "logits/chosen": 0.36813172698020935, "logits/rejected": 0.8072736263275146, "logps/chosen": -218.35089111328125, "logps/rejected": -212.86129760742188, "loss": 0.6878, "rewards/accuracies": 0.375, "rewards/chosen": -0.02539386972784996, "rewards/margins": 0.023108482360839844, "rewards/rejected": -0.048502348363399506, "step": 287 }, { "epoch": 0.288, "grad_norm": 1.0584012269973755, "learning_rate": 1.44e-06, "logits/chosen": 0.21495947241783142, "logits/rejected": 0.303507924079895, "logps/chosen": -330.5262145996094, "logps/rejected": -171.592529296875, "loss": 0.6702, "rewards/accuracies": 0.625, "rewards/chosen": -0.0188961960375309, "rewards/margins": 0.050443124026060104, "rewards/rejected": -0.069339320063591, "step": 288 }, { "epoch": 0.289, "grad_norm": 0.9443803429603577, "learning_rate": 1.445e-06, "logits/chosen": 0.35456910729408264, "logits/rejected": 0.463100790977478, "logps/chosen": -127.24048614501953, "logps/rejected": -169.12554931640625, "loss": 0.6682, "rewards/accuracies": 0.75, "rewards/chosen": 0.06379289925098419, "rewards/margins": 0.05394010990858078, "rewards/rejected": 0.009852791205048561, "step": 289 }, { "epoch": 0.29, "grad_norm": 2.5071990489959717, "learning_rate": 1.4499999999999999e-06, "logits/chosen": 1.1762628555297852, "logits/rejected": 0.5432217717170715, "logps/chosen": -523.2445678710938, "logps/rejected": -180.208251953125, "loss": 0.8089, "rewards/accuracies": 0.0, "rewards/chosen": -0.08670520782470703, "rewards/margins": -0.2141346037387848, "rewards/rejected": 0.12742939591407776, "step": 290 }, { "epoch": 0.291, "grad_norm": 1.068060278892517, "learning_rate": 1.455e-06, "logits/chosen": 1.0754815340042114, "logits/rejected": 0.7018258571624756, "logps/chosen": -362.4765930175781, "logps/rejected": -194.3949432373047, "loss": 0.6825, "rewards/accuracies": 0.625, "rewards/chosen": 0.027841567993164062, "rewards/margins": 0.03645477071404457, "rewards/rejected": -0.008613203652203083, "step": 291 }, { "epoch": 0.292, "grad_norm": 0.9703415632247925, "learning_rate": 1.46e-06, "logits/chosen": 0.1759399026632309, "logits/rejected": 0.4473460614681244, "logps/chosen": -185.4920196533203, "logps/rejected": -233.07672119140625, "loss": 0.7142, "rewards/accuracies": 0.5, "rewards/chosen": -0.03617153316736221, "rewards/margins": -0.03606872633099556, "rewards/rejected": -0.0001028040423989296, "step": 292 }, { "epoch": 0.293, "grad_norm": 1.0347139835357666, "learning_rate": 1.465e-06, "logits/chosen": 0.4482729434967041, "logits/rejected": 0.5728879570960999, "logps/chosen": -190.30056762695312, "logps/rejected": -214.3016357421875, "loss": 0.6413, "rewards/accuracies": 0.875, "rewards/chosen": 0.06084737926721573, "rewards/margins": 0.11217251420021057, "rewards/rejected": -0.051325127482414246, "step": 293 }, { "epoch": 0.294, "grad_norm": 1.2208014726638794, "learning_rate": 1.47e-06, "logits/chosen": 0.6151356101036072, "logits/rejected": 0.11997794359922409, "logps/chosen": -313.12701416015625, "logps/rejected": -181.6331787109375, "loss": 0.7412, "rewards/accuracies": 0.375, "rewards/chosen": -0.060726262629032135, "rewards/margins": -0.08206043392419815, "rewards/rejected": 0.021334175020456314, "step": 294 }, { "epoch": 0.295, "grad_norm": 1.0304771661758423, "learning_rate": 1.475e-06, "logits/chosen": 0.8789685964584351, "logits/rejected": -0.01672588661313057, "logps/chosen": -296.50860595703125, "logps/rejected": -118.10025024414062, "loss": 0.6774, "rewards/accuracies": 0.625, "rewards/chosen": 0.005430031567811966, "rewards/margins": 0.03565573692321777, "rewards/rejected": -0.030225707218050957, "step": 295 }, { "epoch": 0.296, "grad_norm": 1.1805697679519653, "learning_rate": 1.48e-06, "logits/chosen": 0.8477104306221008, "logits/rejected": 0.3310486674308777, "logps/chosen": -317.5341796875, "logps/rejected": -142.20254516601562, "loss": 0.6793, "rewards/accuracies": 0.625, "rewards/chosen": 0.01711854711174965, "rewards/margins": 0.03527851402759552, "rewards/rejected": -0.018159961327910423, "step": 296 }, { "epoch": 0.297, "grad_norm": 0.9169073700904846, "learning_rate": 1.485e-06, "logits/chosen": 0.5317859649658203, "logits/rejected": 0.5789294242858887, "logps/chosen": -222.00927734375, "logps/rejected": -195.04171752929688, "loss": 0.6829, "rewards/accuracies": 0.625, "rewards/chosen": -0.04116630554199219, "rewards/margins": 0.026781463995575905, "rewards/rejected": -0.06794776767492294, "step": 297 }, { "epoch": 0.298, "grad_norm": 1.113012671470642, "learning_rate": 1.49e-06, "logits/chosen": 0.3417741060256958, "logits/rejected": 0.8881428837776184, "logps/chosen": -172.91262817382812, "logps/rejected": -193.1946258544922, "loss": 0.6428, "rewards/accuracies": 0.75, "rewards/chosen": 0.05278053134679794, "rewards/margins": 0.10687465965747833, "rewards/rejected": -0.05409412458539009, "step": 298 }, { "epoch": 0.299, "grad_norm": 1.2036292552947998, "learning_rate": 1.495e-06, "logits/chosen": 0.434637188911438, "logits/rejected": 1.0091832876205444, "logps/chosen": -254.85855102539062, "logps/rejected": -242.49050903320312, "loss": 0.6556, "rewards/accuracies": 0.75, "rewards/chosen": 0.021394923329353333, "rewards/margins": 0.08333244174718857, "rewards/rejected": -0.061937522143125534, "step": 299 }, { "epoch": 0.3, "grad_norm": 0.9382742643356323, "learning_rate": 1.5e-06, "logits/chosen": 0.5893725156784058, "logits/rejected": 0.9517191052436829, "logps/chosen": -178.38601684570312, "logps/rejected": -251.58740234375, "loss": 0.686, "rewards/accuracies": 0.375, "rewards/chosen": 0.013275576755404472, "rewards/margins": 0.01713385619223118, "rewards/rejected": -0.0038582789711654186, "step": 300 }, { "epoch": 0.301, "grad_norm": 1.0452605485916138, "learning_rate": 1.5049999999999998e-06, "logits/chosen": 0.6536056399345398, "logits/rejected": 0.6545695662498474, "logps/chosen": -240.60223388671875, "logps/rejected": -235.60650634765625, "loss": 0.6954, "rewards/accuracies": 0.75, "rewards/chosen": -0.02026348188519478, "rewards/margins": 0.0010046940296888351, "rewards/rejected": -0.021268177777528763, "step": 301 }, { "epoch": 0.302, "grad_norm": 1.028985857963562, "learning_rate": 1.51e-06, "logits/chosen": 0.48929062485694885, "logits/rejected": 1.0992149114608765, "logps/chosen": -149.0586395263672, "logps/rejected": -201.9005126953125, "loss": 0.7071, "rewards/accuracies": 0.625, "rewards/chosen": 0.007470512762665749, "rewards/margins": -0.019823454320430756, "rewards/rejected": 0.027293965220451355, "step": 302 }, { "epoch": 0.303, "grad_norm": 1.0268110036849976, "learning_rate": 1.515e-06, "logits/chosen": 0.29446977376937866, "logits/rejected": 0.8761690855026245, "logps/chosen": -214.56069946289062, "logps/rejected": -183.79681396484375, "loss": 0.658, "rewards/accuracies": 0.75, "rewards/chosen": 0.06213526800274849, "rewards/margins": 0.076821468770504, "rewards/rejected": -0.014686202630400658, "step": 303 }, { "epoch": 0.304, "grad_norm": 1.1473933458328247, "learning_rate": 1.5199999999999998e-06, "logits/chosen": 0.7357966303825378, "logits/rejected": -0.0742356926202774, "logps/chosen": -283.18310546875, "logps/rejected": -164.08151245117188, "loss": 0.7022, "rewards/accuracies": 0.5, "rewards/chosen": 0.013760757632553577, "rewards/margins": -0.013923263177275658, "rewards/rejected": 0.02768402174115181, "step": 304 }, { "epoch": 0.305, "grad_norm": 1.1077204942703247, "learning_rate": 1.5249999999999998e-06, "logits/chosen": 1.3128401041030884, "logits/rejected": 0.7016350030899048, "logps/chosen": -342.1813049316406, "logps/rejected": -180.080322265625, "loss": 0.7035, "rewards/accuracies": 0.5, "rewards/chosen": 0.004582691937685013, "rewards/margins": -0.006383609026670456, "rewards/rejected": 0.010966300964355469, "step": 305 }, { "epoch": 0.306, "grad_norm": 1.0359216928482056, "learning_rate": 1.53e-06, "logits/chosen": 0.3099416494369507, "logits/rejected": 0.7022977471351624, "logps/chosen": -204.47872924804688, "logps/rejected": -211.00784301757812, "loss": 0.6662, "rewards/accuracies": 0.625, "rewards/chosen": -0.002562666777521372, "rewards/margins": 0.05779814347624779, "rewards/rejected": -0.06036081537604332, "step": 306 }, { "epoch": 0.307, "grad_norm": 1.0655264854431152, "learning_rate": 1.535e-06, "logits/chosen": 0.5924462676048279, "logits/rejected": 0.5625861883163452, "logps/chosen": -184.190673828125, "logps/rejected": -167.32684326171875, "loss": 0.6896, "rewards/accuracies": 0.375, "rewards/chosen": 0.016949083656072617, "rewards/margins": 0.013155462220311165, "rewards/rejected": 0.0037936228327453136, "step": 307 }, { "epoch": 0.308, "grad_norm": 1.0876227617263794, "learning_rate": 1.5399999999999999e-06, "logits/chosen": 0.6967666149139404, "logits/rejected": 0.5874997973442078, "logps/chosen": -249.103515625, "logps/rejected": -220.0326385498047, "loss": 0.6555, "rewards/accuracies": 0.625, "rewards/chosen": 0.020321086049079895, "rewards/margins": 0.09787407517433167, "rewards/rejected": -0.07755298912525177, "step": 308 }, { "epoch": 0.309, "grad_norm": 0.9268637299537659, "learning_rate": 1.5449999999999998e-06, "logits/chosen": 0.4288312792778015, "logits/rejected": 0.1676562875509262, "logps/chosen": -244.74879455566406, "logps/rejected": -167.04766845703125, "loss": 0.674, "rewards/accuracies": 0.5, "rewards/chosen": 0.0532621368765831, "rewards/margins": 0.050454430282115936, "rewards/rejected": 0.0028077103197574615, "step": 309 }, { "epoch": 0.31, "grad_norm": 1.4421571493148804, "learning_rate": 1.55e-06, "logits/chosen": 0.9205632209777832, "logits/rejected": 0.18402212858200073, "logps/chosen": -326.34576416015625, "logps/rejected": -134.63525390625, "loss": 0.7016, "rewards/accuracies": 0.375, "rewards/chosen": 0.04689826816320419, "rewards/margins": -0.007701396942138672, "rewards/rejected": 0.054599665105342865, "step": 310 }, { "epoch": 0.311, "grad_norm": 1.117472529411316, "learning_rate": 1.555e-06, "logits/chosen": 0.1322879046201706, "logits/rejected": 0.5627867579460144, "logps/chosen": -148.8985595703125, "logps/rejected": -236.89637756347656, "loss": 0.6632, "rewards/accuracies": 0.625, "rewards/chosen": 0.02755889855325222, "rewards/margins": 0.06830864399671555, "rewards/rejected": -0.040749743580818176, "step": 311 }, { "epoch": 0.312, "grad_norm": 1.198238492012024, "learning_rate": 1.5599999999999999e-06, "logits/chosen": 0.42923468351364136, "logits/rejected": 0.770487904548645, "logps/chosen": -160.53367614746094, "logps/rejected": -221.19876098632812, "loss": 0.6513, "rewards/accuracies": 0.75, "rewards/chosen": 0.02120685577392578, "rewards/margins": 0.09523897618055344, "rewards/rejected": -0.07403211295604706, "step": 312 }, { "epoch": 0.313, "grad_norm": 1.0827109813690186, "learning_rate": 1.5649999999999998e-06, "logits/chosen": 0.4293614625930786, "logits/rejected": 0.44144195318222046, "logps/chosen": -321.51898193359375, "logps/rejected": -238.4175567626953, "loss": 0.7131, "rewards/accuracies": 0.625, "rewards/chosen": -0.003181740641593933, "rewards/margins": -0.018090154975652695, "rewards/rejected": 0.01490840781480074, "step": 313 }, { "epoch": 0.314, "grad_norm": 1.076608657836914, "learning_rate": 1.57e-06, "logits/chosen": 0.425843209028244, "logits/rejected": 0.17109350860118866, "logps/chosen": -210.23536682128906, "logps/rejected": -173.02716064453125, "loss": 0.6608, "rewards/accuracies": 0.5, "rewards/chosen": 0.05963964760303497, "rewards/margins": 0.07618893682956696, "rewards/rejected": -0.016549302265048027, "step": 314 }, { "epoch": 0.315, "grad_norm": 1.1424622535705566, "learning_rate": 1.575e-06, "logits/chosen": 0.5989822149276733, "logits/rejected": 1.5404406785964966, "logps/chosen": -221.219970703125, "logps/rejected": -334.51519775390625, "loss": 0.6593, "rewards/accuracies": 0.625, "rewards/chosen": 0.03496389463543892, "rewards/margins": 0.07532644271850586, "rewards/rejected": -0.04036254808306694, "step": 315 }, { "epoch": 0.316, "grad_norm": 1.0565152168273926, "learning_rate": 1.58e-06, "logits/chosen": 0.3847041726112366, "logits/rejected": 0.09393486380577087, "logps/chosen": -203.85780334472656, "logps/rejected": -254.39466857910156, "loss": 0.6417, "rewards/accuracies": 0.75, "rewards/chosen": 0.07946843653917313, "rewards/margins": 0.11198586970567703, "rewards/rejected": -0.032517436891794205, "step": 316 }, { "epoch": 0.317, "grad_norm": 0.9865637421607971, "learning_rate": 1.5849999999999999e-06, "logits/chosen": 0.6704980134963989, "logits/rejected": 0.7274569272994995, "logps/chosen": -213.19589233398438, "logps/rejected": -233.0350341796875, "loss": 0.6638, "rewards/accuracies": 0.75, "rewards/chosen": 0.06391648948192596, "rewards/margins": 0.06661825627088547, "rewards/rejected": -0.0027017593383789062, "step": 317 }, { "epoch": 0.318, "grad_norm": 1.0053743124008179, "learning_rate": 1.59e-06, "logits/chosen": 0.8863073587417603, "logits/rejected": 0.18743349611759186, "logps/chosen": -217.66404724121094, "logps/rejected": -139.21401977539062, "loss": 0.7098, "rewards/accuracies": 0.5, "rewards/chosen": -0.04395762085914612, "rewards/margins": -0.020697731524705887, "rewards/rejected": -0.023259880021214485, "step": 318 }, { "epoch": 0.319, "grad_norm": 1.0260891914367676, "learning_rate": 1.595e-06, "logits/chosen": 0.9168418645858765, "logits/rejected": 0.061137855052948, "logps/chosen": -253.96664428710938, "logps/rejected": -170.05398559570312, "loss": 0.6868, "rewards/accuracies": 0.5, "rewards/chosen": -0.035225871950387955, "rewards/margins": 0.026967521756887436, "rewards/rejected": -0.06219339370727539, "step": 319 }, { "epoch": 0.32, "grad_norm": 0.9961371421813965, "learning_rate": 1.6e-06, "logits/chosen": 0.17218074202537537, "logits/rejected": 0.1811085194349289, "logps/chosen": -205.7753448486328, "logps/rejected": -166.93438720703125, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": -0.019242288544774055, "rewards/margins": 0.01084565557539463, "rewards/rejected": -0.030087949708104134, "step": 320 }, { "epoch": 0.321, "grad_norm": 1.146938681602478, "learning_rate": 1.6049999999999999e-06, "logits/chosen": 0.4852319657802582, "logits/rejected": 1.4176218509674072, "logps/chosen": -132.52890014648438, "logps/rejected": -238.03746032714844, "loss": 0.6517, "rewards/accuracies": 0.75, "rewards/chosen": 0.05086975544691086, "rewards/margins": 0.09571409225463867, "rewards/rejected": -0.04484434053301811, "step": 321 }, { "epoch": 0.322, "grad_norm": 1.0485297441482544, "learning_rate": 1.61e-06, "logits/chosen": 0.47505107522010803, "logits/rejected": 0.5672650337219238, "logps/chosen": -198.9989013671875, "logps/rejected": -196.12936401367188, "loss": 0.64, "rewards/accuracies": 0.625, "rewards/chosen": 0.07375888526439667, "rewards/margins": 0.11489515006542206, "rewards/rejected": -0.04113626480102539, "step": 322 }, { "epoch": 0.323, "grad_norm": 1.099208950996399, "learning_rate": 1.615e-06, "logits/chosen": 0.845172643661499, "logits/rejected": 0.48590847849845886, "logps/chosen": -252.90802001953125, "logps/rejected": -166.3927001953125, "loss": 0.6704, "rewards/accuracies": 0.625, "rewards/chosen": 0.038692474365234375, "rewards/margins": 0.06367626786231995, "rewards/rejected": -0.024983789771795273, "step": 323 }, { "epoch": 0.324, "grad_norm": 1.221148133277893, "learning_rate": 1.62e-06, "logits/chosen": 0.4804660379886627, "logits/rejected": 0.3341968357563019, "logps/chosen": -201.97373962402344, "logps/rejected": -163.26107788085938, "loss": 0.6125, "rewards/accuracies": 0.75, "rewards/chosen": 0.06624545902013779, "rewards/margins": 0.18734923005104065, "rewards/rejected": -0.12110376358032227, "step": 324 }, { "epoch": 0.325, "grad_norm": 0.9235427379608154, "learning_rate": 1.625e-06, "logits/chosen": 0.17175054550170898, "logits/rejected": 0.37292978167533875, "logps/chosen": -165.38819885253906, "logps/rejected": -209.30421447753906, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": 0.03732623904943466, "rewards/margins": 0.031703852117061615, "rewards/rejected": 0.005622386932373047, "step": 325 }, { "epoch": 0.326, "grad_norm": 1.1531405448913574, "learning_rate": 1.6299999999999999e-06, "logits/chosen": 0.9758821129798889, "logits/rejected": 0.526422917842865, "logps/chosen": -258.7580261230469, "logps/rejected": -171.63897705078125, "loss": 0.673, "rewards/accuracies": 0.625, "rewards/chosen": 0.03754949942231178, "rewards/margins": 0.05191412195563316, "rewards/rejected": -0.014364626258611679, "step": 326 }, { "epoch": 0.327, "grad_norm": 1.1303048133850098, "learning_rate": 1.635e-06, "logits/chosen": 1.1181402206420898, "logits/rejected": 0.23228907585144043, "logps/chosen": -269.217529296875, "logps/rejected": -164.08901977539062, "loss": 0.6879, "rewards/accuracies": 0.5, "rewards/chosen": -0.009985161013901234, "rewards/margins": 0.02018599584698677, "rewards/rejected": -0.03017115592956543, "step": 327 }, { "epoch": 0.328, "grad_norm": 0.9502154588699341, "learning_rate": 1.6399999999999998e-06, "logits/chosen": 0.38525187969207764, "logits/rejected": 0.8924083113670349, "logps/chosen": -190.75067138671875, "logps/rejected": -206.844482421875, "loss": 0.6741, "rewards/accuracies": 0.625, "rewards/chosen": -0.00014138314872980118, "rewards/margins": 0.04391618072986603, "rewards/rejected": -0.044057559221982956, "step": 328 }, { "epoch": 0.329, "grad_norm": 1.0155192613601685, "learning_rate": 1.645e-06, "logits/chosen": 0.6357846856117249, "logits/rejected": 0.6459890604019165, "logps/chosen": -242.8671875, "logps/rejected": -210.54302978515625, "loss": 0.7063, "rewards/accuracies": 0.5, "rewards/chosen": -0.009164334274828434, "rewards/margins": -0.014743617735803127, "rewards/rejected": 0.0055792806670069695, "step": 329 }, { "epoch": 0.33, "grad_norm": 1.026426911354065, "learning_rate": 1.6499999999999999e-06, "logits/chosen": 0.0021919989958405495, "logits/rejected": 0.6450769901275635, "logps/chosen": -168.81983947753906, "logps/rejected": -184.35768127441406, "loss": 0.6763, "rewards/accuracies": 0.625, "rewards/chosen": -0.00016145594418048859, "rewards/margins": 0.04996347799897194, "rewards/rejected": -0.05012493208050728, "step": 330 }, { "epoch": 0.331, "grad_norm": 1.1783004999160767, "learning_rate": 1.655e-06, "logits/chosen": 0.38051077723503113, "logits/rejected": 0.2449723184108734, "logps/chosen": -221.82730102539062, "logps/rejected": -179.30372619628906, "loss": 0.6777, "rewards/accuracies": 0.625, "rewards/chosen": 0.022039033472537994, "rewards/margins": 0.05075044184923172, "rewards/rejected": -0.028711412101984024, "step": 331 }, { "epoch": 0.332, "grad_norm": 1.0960571765899658, "learning_rate": 1.6599999999999998e-06, "logits/chosen": 0.4085034132003784, "logits/rejected": 0.5785081386566162, "logps/chosen": -177.27357482910156, "logps/rejected": -171.75564575195312, "loss": 0.7488, "rewards/accuracies": 0.5, "rewards/chosen": -0.030577659606933594, "rewards/margins": -0.08175162971019745, "rewards/rejected": 0.051173970103263855, "step": 332 }, { "epoch": 0.333, "grad_norm": 1.2170546054840088, "learning_rate": 1.665e-06, "logits/chosen": 0.42106789350509644, "logits/rejected": -0.012144291773438454, "logps/chosen": -329.51092529296875, "logps/rejected": -136.89956665039062, "loss": 0.6973, "rewards/accuracies": 0.5, "rewards/chosen": -0.028850745409727097, "rewards/margins": 0.0051878951489925385, "rewards/rejected": -0.034038640558719635, "step": 333 }, { "epoch": 0.334, "grad_norm": 1.085311770439148, "learning_rate": 1.6699999999999999e-06, "logits/chosen": 0.7664310932159424, "logits/rejected": 0.5170066952705383, "logps/chosen": -253.5804901123047, "logps/rejected": -236.6798095703125, "loss": 0.7101, "rewards/accuracies": 0.5, "rewards/chosen": -0.060125403106212616, "rewards/margins": -0.026356080546975136, "rewards/rejected": -0.03376932442188263, "step": 334 }, { "epoch": 0.335, "grad_norm": 1.0194603204727173, "learning_rate": 1.675e-06, "logits/chosen": 0.5912635922431946, "logits/rejected": 0.48315349221229553, "logps/chosen": -212.75173950195312, "logps/rejected": -167.36553955078125, "loss": 0.7217, "rewards/accuracies": 0.5, "rewards/chosen": -0.056719403713941574, "rewards/margins": -0.03942042589187622, "rewards/rejected": -0.0172989871352911, "step": 335 }, { "epoch": 0.336, "grad_norm": 1.0562477111816406, "learning_rate": 1.6799999999999998e-06, "logits/chosen": 0.11773402243852615, "logits/rejected": 0.5993293523788452, "logps/chosen": -133.7306365966797, "logps/rejected": -205.90017700195312, "loss": 0.6461, "rewards/accuracies": 0.75, "rewards/chosen": 0.015471458435058594, "rewards/margins": 0.1026005819439888, "rewards/rejected": -0.08712911605834961, "step": 336 }, { "epoch": 0.337, "grad_norm": 1.140097737312317, "learning_rate": 1.685e-06, "logits/chosen": 0.5680500864982605, "logits/rejected": 0.3810732960700989, "logps/chosen": -217.39202880859375, "logps/rejected": -189.2959747314453, "loss": 0.7208, "rewards/accuracies": 0.375, "rewards/chosen": -0.05563192069530487, "rewards/margins": -0.03594189137220383, "rewards/rejected": -0.01969003677368164, "step": 337 }, { "epoch": 0.338, "grad_norm": 1.2594763040542603, "learning_rate": 1.69e-06, "logits/chosen": 0.7694472074508667, "logits/rejected": 0.2543339431285858, "logps/chosen": -297.2930603027344, "logps/rejected": -187.77825927734375, "loss": 0.7079, "rewards/accuracies": 0.375, "rewards/chosen": -0.04203047603368759, "rewards/margins": -0.018899770453572273, "rewards/rejected": -0.02313070371747017, "step": 338 }, { "epoch": 0.339, "grad_norm": 1.1627064943313599, "learning_rate": 1.695e-06, "logits/chosen": 0.36570417881011963, "logits/rejected": 0.44544678926467896, "logps/chosen": -208.65878295898438, "logps/rejected": -193.80059814453125, "loss": 0.6429, "rewards/accuracies": 0.75, "rewards/chosen": 0.03881072998046875, "rewards/margins": 0.12133999168872833, "rewards/rejected": -0.08252926170825958, "step": 339 }, { "epoch": 0.34, "grad_norm": 1.5284695625305176, "learning_rate": 1.6999999999999998e-06, "logits/chosen": -0.2521318197250366, "logits/rejected": 0.2988232672214508, "logps/chosen": -205.24282836914062, "logps/rejected": -247.8821258544922, "loss": 0.6209, "rewards/accuracies": 0.75, "rewards/chosen": 0.07833190262317657, "rewards/margins": 0.1617879420518875, "rewards/rejected": -0.08345603942871094, "step": 340 }, { "epoch": 0.341, "grad_norm": 1.253596305847168, "learning_rate": 1.705e-06, "logits/chosen": 0.6743031740188599, "logits/rejected": 0.22530610859394073, "logps/chosen": -224.7559814453125, "logps/rejected": -172.9122314453125, "loss": 0.7212, "rewards/accuracies": 0.25, "rewards/chosen": -0.058115389198064804, "rewards/margins": -0.04140176251530647, "rewards/rejected": -0.016713619232177734, "step": 341 }, { "epoch": 0.342, "grad_norm": 1.0365386009216309, "learning_rate": 1.71e-06, "logits/chosen": 0.5540598630905151, "logits/rejected": 0.5535869002342224, "logps/chosen": -183.95245361328125, "logps/rejected": -191.15719604492188, "loss": 0.6423, "rewards/accuracies": 0.625, "rewards/chosen": 0.027001576498150826, "rewards/margins": 0.11466295272111893, "rewards/rejected": -0.08766137063503265, "step": 342 }, { "epoch": 0.343, "grad_norm": 0.9374682307243347, "learning_rate": 1.715e-06, "logits/chosen": 0.7207291126251221, "logits/rejected": 0.6586239337921143, "logps/chosen": -201.79159545898438, "logps/rejected": -192.34234619140625, "loss": 0.6362, "rewards/accuracies": 0.75, "rewards/chosen": 0.05649833753705025, "rewards/margins": 0.13698500394821167, "rewards/rejected": -0.08048668503761292, "step": 343 }, { "epoch": 0.344, "grad_norm": 1.1141316890716553, "learning_rate": 1.7199999999999998e-06, "logits/chosen": 0.49208950996398926, "logits/rejected": 1.02610182762146, "logps/chosen": -185.59762573242188, "logps/rejected": -281.50567626953125, "loss": 0.628, "rewards/accuracies": 0.875, "rewards/chosen": 0.014632701873779297, "rewards/margins": 0.14573369920253754, "rewards/rejected": -0.13110099732875824, "step": 344 }, { "epoch": 0.345, "grad_norm": 1.1193255186080933, "learning_rate": 1.725e-06, "logits/chosen": 0.14097139239311218, "logits/rejected": 0.5931141972541809, "logps/chosen": -122.50676727294922, "logps/rejected": -177.38262939453125, "loss": 0.6139, "rewards/accuracies": 0.75, "rewards/chosen": 0.07495031505823135, "rewards/margins": 0.17980679869651794, "rewards/rejected": -0.10485649108886719, "step": 345 }, { "epoch": 0.346, "grad_norm": 1.0847055912017822, "learning_rate": 1.73e-06, "logits/chosen": 0.8074724674224854, "logits/rejected": 0.16611096262931824, "logps/chosen": -173.27951049804688, "logps/rejected": -188.0372772216797, "loss": 0.6369, "rewards/accuracies": 0.75, "rewards/chosen": 0.0384400337934494, "rewards/margins": 0.12681493163108826, "rewards/rejected": -0.08837490528821945, "step": 346 }, { "epoch": 0.347, "grad_norm": 1.111163854598999, "learning_rate": 1.7350000000000001e-06, "logits/chosen": 0.7189805507659912, "logits/rejected": 0.8971098065376282, "logps/chosen": -248.99008178710938, "logps/rejected": -205.12966918945312, "loss": 0.6922, "rewards/accuracies": 0.5, "rewards/chosen": 0.007892992347478867, "rewards/margins": 0.022763065993785858, "rewards/rejected": -0.014870071783661842, "step": 347 }, { "epoch": 0.348, "grad_norm": 1.20064115524292, "learning_rate": 1.7399999999999999e-06, "logits/chosen": 0.5369797945022583, "logits/rejected": 0.29909250140190125, "logps/chosen": -222.20730590820312, "logps/rejected": -225.87294006347656, "loss": 0.6798, "rewards/accuracies": 0.625, "rewards/chosen": 0.005690194666385651, "rewards/margins": 0.04995299130678177, "rewards/rejected": -0.04426279291510582, "step": 348 }, { "epoch": 0.349, "grad_norm": 0.9469853043556213, "learning_rate": 1.745e-06, "logits/chosen": 0.07744185626506805, "logits/rejected": 0.39168381690979004, "logps/chosen": -222.48388671875, "logps/rejected": -202.2346954345703, "loss": 0.671, "rewards/accuracies": 0.5, "rewards/chosen": 0.024666981771588326, "rewards/margins": 0.058170899748802185, "rewards/rejected": -0.03350391983985901, "step": 349 }, { "epoch": 0.35, "grad_norm": 0.9405419826507568, "learning_rate": 1.75e-06, "logits/chosen": 0.6552055478096008, "logits/rejected": 0.5759083032608032, "logps/chosen": -238.2633819580078, "logps/rejected": -298.6871643066406, "loss": 0.6463, "rewards/accuracies": 0.75, "rewards/chosen": 0.060591891407966614, "rewards/margins": 0.11268138885498047, "rewards/rejected": -0.052089497447013855, "step": 350 }, { "epoch": 0.351, "grad_norm": 1.1366326808929443, "learning_rate": 1.7549999999999997e-06, "logits/chosen": 0.813056468963623, "logits/rejected": 0.9049899578094482, "logps/chosen": -180.49822998046875, "logps/rejected": -253.1209716796875, "loss": 0.6686, "rewards/accuracies": 0.625, "rewards/chosen": 0.03423319011926651, "rewards/margins": 0.07767954468727112, "rewards/rejected": -0.04344635456800461, "step": 351 }, { "epoch": 0.352, "grad_norm": 0.9385539889335632, "learning_rate": 1.7599999999999999e-06, "logits/chosen": 0.25222545862197876, "logits/rejected": 0.6778271794319153, "logps/chosen": -176.11146545410156, "logps/rejected": -242.2910614013672, "loss": 0.7079, "rewards/accuracies": 0.5, "rewards/chosen": -0.04485177993774414, "rewards/margins": -0.015909960493445396, "rewards/rejected": -0.028941821306943893, "step": 352 }, { "epoch": 0.353, "grad_norm": 0.9959011673927307, "learning_rate": 1.7649999999999998e-06, "logits/chosen": 0.5924135446548462, "logits/rejected": 0.43987536430358887, "logps/chosen": -181.09783935546875, "logps/rejected": -170.28414916992188, "loss": 0.6673, "rewards/accuracies": 0.5, "rewards/chosen": -0.02520780637860298, "rewards/margins": 0.07571744173765182, "rewards/rejected": -0.1009252518415451, "step": 353 }, { "epoch": 0.354, "grad_norm": 1.0973167419433594, "learning_rate": 1.77e-06, "logits/chosen": 0.3523298501968384, "logits/rejected": 0.733353316783905, "logps/chosen": -203.89202880859375, "logps/rejected": -218.2725372314453, "loss": 0.6105, "rewards/accuracies": 0.75, "rewards/chosen": 0.08044219017028809, "rewards/margins": 0.20520740747451782, "rewards/rejected": -0.12476520240306854, "step": 354 }, { "epoch": 0.355, "grad_norm": 1.2543106079101562, "learning_rate": 1.7749999999999997e-06, "logits/chosen": 0.4557206332683563, "logits/rejected": 0.43095046281814575, "logps/chosen": -110.67022705078125, "logps/rejected": -174.1546630859375, "loss": 0.6244, "rewards/accuracies": 0.75, "rewards/chosen": 0.04143434017896652, "rewards/margins": 0.15760235488414764, "rewards/rejected": -0.11616802215576172, "step": 355 }, { "epoch": 0.356, "grad_norm": 1.0665454864501953, "learning_rate": 1.78e-06, "logits/chosen": 0.08100622892379761, "logits/rejected": 0.24057011306285858, "logps/chosen": -171.95474243164062, "logps/rejected": -180.51165771484375, "loss": 0.6108, "rewards/accuracies": 0.75, "rewards/chosen": 0.11953344941139221, "rewards/margins": 0.1880839467048645, "rewards/rejected": -0.0685504898428917, "step": 356 }, { "epoch": 0.357, "grad_norm": 0.9638935923576355, "learning_rate": 1.7849999999999999e-06, "logits/chosen": 0.3798410892486572, "logits/rejected": 0.6465194225311279, "logps/chosen": -232.45407104492188, "logps/rejected": -155.09115600585938, "loss": 0.6867, "rewards/accuracies": 0.5, "rewards/chosen": -0.023302890360355377, "rewards/margins": 0.032225221395492554, "rewards/rejected": -0.05552811548113823, "step": 357 }, { "epoch": 0.358, "grad_norm": 0.9997841119766235, "learning_rate": 1.79e-06, "logits/chosen": 0.6874808073043823, "logits/rejected": 0.5797542929649353, "logps/chosen": -271.99462890625, "logps/rejected": -195.04489135742188, "loss": 0.6732, "rewards/accuracies": 0.5, "rewards/chosen": 0.029434731230139732, "rewards/margins": 0.057974766939878464, "rewards/rejected": -0.02854003943502903, "step": 358 }, { "epoch": 0.359, "grad_norm": 1.0035969018936157, "learning_rate": 1.7949999999999998e-06, "logits/chosen": 0.23375451564788818, "logits/rejected": 1.0757113695144653, "logps/chosen": -216.1209716796875, "logps/rejected": -252.90463256835938, "loss": 0.6817, "rewards/accuracies": 0.75, "rewards/chosen": 0.007815837860107422, "rewards/margins": 0.04465150833129883, "rewards/rejected": -0.036835670471191406, "step": 359 }, { "epoch": 0.36, "grad_norm": 1.1821963787078857, "learning_rate": 1.8e-06, "logits/chosen": 0.9887760877609253, "logits/rejected": 0.14929494261741638, "logps/chosen": -263.2257080078125, "logps/rejected": -156.228759765625, "loss": 0.6404, "rewards/accuracies": 0.625, "rewards/chosen": 0.029242422431707382, "rewards/margins": 0.12214647233486176, "rewards/rejected": -0.09290404617786407, "step": 360 }, { "epoch": 0.361, "grad_norm": 1.1154003143310547, "learning_rate": 1.8049999999999999e-06, "logits/chosen": 0.38093888759613037, "logits/rejected": 0.5234131813049316, "logps/chosen": -232.05548095703125, "logps/rejected": -184.94259643554688, "loss": 0.6272, "rewards/accuracies": 0.75, "rewards/chosen": 0.05377740412950516, "rewards/margins": 0.16291846334934235, "rewards/rejected": -0.10914106667041779, "step": 361 }, { "epoch": 0.362, "grad_norm": 1.2865588665008545, "learning_rate": 1.81e-06, "logits/chosen": 0.39121973514556885, "logits/rejected": 0.4549907445907593, "logps/chosen": -238.6729278564453, "logps/rejected": -196.27691650390625, "loss": 0.623, "rewards/accuracies": 0.625, "rewards/chosen": 0.09602928161621094, "rewards/margins": 0.1941043734550476, "rewards/rejected": -0.09807510673999786, "step": 362 }, { "epoch": 0.363, "grad_norm": 1.1206899881362915, "learning_rate": 1.8149999999999998e-06, "logits/chosen": 0.4177142381668091, "logits/rejected": 0.7719815373420715, "logps/chosen": -162.75262451171875, "logps/rejected": -193.34457397460938, "loss": 0.6025, "rewards/accuracies": 0.75, "rewards/chosen": 0.08137169480323792, "rewards/margins": 0.22357121109962463, "rewards/rejected": -0.14219951629638672, "step": 363 }, { "epoch": 0.364, "grad_norm": 1.5774331092834473, "learning_rate": 1.82e-06, "logits/chosen": 0.8220181465148926, "logits/rejected": 0.3892381191253662, "logps/chosen": -235.69236755371094, "logps/rejected": -156.97067260742188, "loss": 0.8189, "rewards/accuracies": 0.375, "rewards/chosen": -0.1655128449201584, "rewards/margins": -0.19413280487060547, "rewards/rejected": 0.028619956225156784, "step": 364 }, { "epoch": 0.365, "grad_norm": 1.1063847541809082, "learning_rate": 1.8249999999999999e-06, "logits/chosen": 0.31826043128967285, "logits/rejected": 0.8144761919975281, "logps/chosen": -153.62744140625, "logps/rejected": -195.26889038085938, "loss": 0.6566, "rewards/accuracies": 0.75, "rewards/chosen": -0.007386971265077591, "rewards/margins": 0.10953215509653091, "rewards/rejected": -0.1169191375374794, "step": 365 }, { "epoch": 0.366, "grad_norm": 1.434327483177185, "learning_rate": 1.83e-06, "logits/chosen": 0.7723796367645264, "logits/rejected": 0.07937260717153549, "logps/chosen": -338.230712890625, "logps/rejected": -184.17835998535156, "loss": 0.7537, "rewards/accuracies": 0.375, "rewards/chosen": -0.053044892847537994, "rewards/margins": -0.09581717848777771, "rewards/rejected": 0.04277229309082031, "step": 366 }, { "epoch": 0.367, "grad_norm": 1.0995802879333496, "learning_rate": 1.8349999999999998e-06, "logits/chosen": 0.4798252582550049, "logits/rejected": 0.9080204367637634, "logps/chosen": -189.12966918945312, "logps/rejected": -308.43450927734375, "loss": 0.6305, "rewards/accuracies": 0.875, "rewards/chosen": 0.08427543193101883, "rewards/margins": 0.15142764151096344, "rewards/rejected": -0.06715221703052521, "step": 367 }, { "epoch": 0.368, "grad_norm": 1.17204749584198, "learning_rate": 1.84e-06, "logits/chosen": 0.9024513363838196, "logits/rejected": 0.2519771158695221, "logps/chosen": -290.2710876464844, "logps/rejected": -192.02239990234375, "loss": 0.6509, "rewards/accuracies": 0.625, "rewards/chosen": 0.033781714737415314, "rewards/margins": 0.1282917857170105, "rewards/rejected": -0.09451007843017578, "step": 368 }, { "epoch": 0.369, "grad_norm": 1.328178882598877, "learning_rate": 1.845e-06, "logits/chosen": 0.8226251602172852, "logits/rejected": 0.5407423973083496, "logps/chosen": -331.56591796875, "logps/rejected": -231.70693969726562, "loss": 0.7678, "rewards/accuracies": 0.375, "rewards/chosen": -0.04546623304486275, "rewards/margins": -0.11672792583703995, "rewards/rejected": 0.0712616965174675, "step": 369 }, { "epoch": 0.37, "grad_norm": 1.2209572792053223, "learning_rate": 1.85e-06, "logits/chosen": 0.16111956536769867, "logits/rejected": 0.30801767110824585, "logps/chosen": -171.3136444091797, "logps/rejected": -210.41696166992188, "loss": 0.5925, "rewards/accuracies": 0.75, "rewards/chosen": 0.06762085109949112, "rewards/margins": 0.24422970414161682, "rewards/rejected": -0.1766088604927063, "step": 370 }, { "epoch": 0.371, "grad_norm": 1.026171326637268, "learning_rate": 1.8549999999999998e-06, "logits/chosen": 0.49748143553733826, "logits/rejected": 0.7995032072067261, "logps/chosen": -186.72967529296875, "logps/rejected": -200.089111328125, "loss": 0.6704, "rewards/accuracies": 0.5, "rewards/chosen": -0.03203611448407173, "rewards/margins": 0.06454076617956161, "rewards/rejected": -0.09657688438892365, "step": 371 }, { "epoch": 0.372, "grad_norm": 1.7915866374969482, "learning_rate": 1.86e-06, "logits/chosen": 0.9682018756866455, "logits/rejected": -0.31689149141311646, "logps/chosen": -329.78546142578125, "logps/rejected": -127.74301147460938, "loss": 0.8134, "rewards/accuracies": 0.125, "rewards/chosen": -0.13027323782444, "rewards/margins": -0.20672598481178284, "rewards/rejected": 0.07645273208618164, "step": 372 }, { "epoch": 0.373, "grad_norm": 1.115010380744934, "learning_rate": 1.865e-06, "logits/chosen": 0.7287737131118774, "logits/rejected": 0.528397262096405, "logps/chosen": -285.0152893066406, "logps/rejected": -188.54612731933594, "loss": 0.6163, "rewards/accuracies": 0.625, "rewards/chosen": -0.0433938093483448, "rewards/margins": 0.19576311111450195, "rewards/rejected": -0.23915691673755646, "step": 373 }, { "epoch": 0.374, "grad_norm": 1.0501158237457275, "learning_rate": 1.87e-06, "logits/chosen": 0.4656597971916199, "logits/rejected": 0.21670733392238617, "logps/chosen": -184.55636596679688, "logps/rejected": -157.8406524658203, "loss": 0.582, "rewards/accuracies": 0.875, "rewards/chosen": 0.06386899948120117, "rewards/margins": 0.2452603429555893, "rewards/rejected": -0.18139134347438812, "step": 374 }, { "epoch": 0.375, "grad_norm": 0.9824644327163696, "learning_rate": 1.8749999999999998e-06, "logits/chosen": 0.523141086101532, "logits/rejected": 0.164381206035614, "logps/chosen": -206.1132354736328, "logps/rejected": -156.33087158203125, "loss": 0.6219, "rewards/accuracies": 0.625, "rewards/chosen": 0.0729791671037674, "rewards/margins": 0.2226632982492447, "rewards/rejected": -0.1496841311454773, "step": 375 }, { "epoch": 0.376, "grad_norm": 0.9806884527206421, "learning_rate": 1.8799999999999998e-06, "logits/chosen": 0.612923264503479, "logits/rejected": 0.4929297864437103, "logps/chosen": -263.17767333984375, "logps/rejected": -153.2121124267578, "loss": 0.6458, "rewards/accuracies": 0.625, "rewards/chosen": 0.04519853740930557, "rewards/margins": 0.1465948075056076, "rewards/rejected": -0.10139627754688263, "step": 376 }, { "epoch": 0.377, "grad_norm": 1.0399932861328125, "learning_rate": 1.885e-06, "logits/chosen": 0.40542861819267273, "logits/rejected": 0.7496431469917297, "logps/chosen": -152.58714294433594, "logps/rejected": -227.69369506835938, "loss": 0.6955, "rewards/accuracies": 0.625, "rewards/chosen": -0.06269612163305283, "rewards/margins": 0.021128028631210327, "rewards/rejected": -0.08382415771484375, "step": 377 }, { "epoch": 0.378, "grad_norm": 1.1881886720657349, "learning_rate": 1.89e-06, "logits/chosen": 0.4318368434906006, "logits/rejected": 0.5828996896743774, "logps/chosen": -150.50234985351562, "logps/rejected": -221.1134490966797, "loss": 0.5627, "rewards/accuracies": 0.875, "rewards/chosen": 0.11353693157434464, "rewards/margins": 0.3141591548919678, "rewards/rejected": -0.20062223076820374, "step": 378 }, { "epoch": 0.379, "grad_norm": 0.893639087677002, "learning_rate": 1.8949999999999999e-06, "logits/chosen": 0.41236087679862976, "logits/rejected": 0.3922349512577057, "logps/chosen": -177.31289672851562, "logps/rejected": -176.8455810546875, "loss": 0.6577, "rewards/accuracies": 0.625, "rewards/chosen": -0.023639250546693802, "rewards/margins": 0.1293785572052002, "rewards/rejected": -0.1530178040266037, "step": 379 }, { "epoch": 0.38, "grad_norm": 2.0527753829956055, "learning_rate": 1.8999999999999998e-06, "logits/chosen": 0.8260540962219238, "logits/rejected": 0.1837531477212906, "logps/chosen": -244.26766967773438, "logps/rejected": -135.44151306152344, "loss": 0.8804, "rewards/accuracies": 0.125, "rewards/chosen": -0.18592509627342224, "rewards/margins": -0.3259087800979614, "rewards/rejected": 0.1399836540222168, "step": 380 }, { "epoch": 0.381, "grad_norm": 1.113620638847351, "learning_rate": 1.905e-06, "logits/chosen": 0.5669646859169006, "logits/rejected": 0.5862668752670288, "logps/chosen": -374.7446594238281, "logps/rejected": -209.33200073242188, "loss": 0.5983, "rewards/accuracies": 0.625, "rewards/chosen": 0.07986536622047424, "rewards/margins": 0.24227765202522278, "rewards/rejected": -0.16241227090358734, "step": 381 }, { "epoch": 0.382, "grad_norm": 1.1709551811218262, "learning_rate": 1.91e-06, "logits/chosen": 0.06162326782941818, "logits/rejected": 1.6016793251037598, "logps/chosen": -188.77923583984375, "logps/rejected": -394.769287109375, "loss": 0.5488, "rewards/accuracies": 0.75, "rewards/chosen": 0.10402165353298187, "rewards/margins": 0.3618011474609375, "rewards/rejected": -0.2577795088291168, "step": 382 }, { "epoch": 0.383, "grad_norm": 1.045265793800354, "learning_rate": 1.915e-06, "logits/chosen": 0.25743886828422546, "logits/rejected": 0.5375794172286987, "logps/chosen": -246.01602172851562, "logps/rejected": -185.86328125, "loss": 0.6276, "rewards/accuracies": 0.625, "rewards/chosen": 0.017625709995627403, "rewards/margins": 0.15993423759937286, "rewards/rejected": -0.14230851829051971, "step": 383 }, { "epoch": 0.384, "grad_norm": 1.0174232721328735, "learning_rate": 1.92e-06, "logits/chosen": 0.2467745691537857, "logits/rejected": 0.17974737286567688, "logps/chosen": -178.4981689453125, "logps/rejected": -207.07867431640625, "loss": 0.7034, "rewards/accuracies": 0.625, "rewards/chosen": -0.0669788345694542, "rewards/margins": 0.033615678548812866, "rewards/rejected": -0.10059452056884766, "step": 384 }, { "epoch": 0.385, "grad_norm": 1.2078837156295776, "learning_rate": 1.9249999999999998e-06, "logits/chosen": 0.9496592283248901, "logits/rejected": 0.15576156973838806, "logps/chosen": -229.36383056640625, "logps/rejected": -139.03005981445312, "loss": 0.6937, "rewards/accuracies": 0.5, "rewards/chosen": -0.06448955833911896, "rewards/margins": 0.026032302528619766, "rewards/rejected": -0.09052185714244843, "step": 385 }, { "epoch": 0.386, "grad_norm": 1.6804425716400146, "learning_rate": 1.9299999999999997e-06, "logits/chosen": 1.6014424562454224, "logits/rejected": -0.07677893340587616, "logps/chosen": -340.1585693359375, "logps/rejected": -138.10276794433594, "loss": 0.7526, "rewards/accuracies": 0.25, "rewards/chosen": -0.11472588032484055, "rewards/margins": -0.08270196616649628, "rewards/rejected": -0.03202391043305397, "step": 386 }, { "epoch": 0.387, "grad_norm": 1.063323974609375, "learning_rate": 1.935e-06, "logits/chosen": 0.8787519931793213, "logits/rejected": 0.6571893095970154, "logps/chosen": -289.99285888671875, "logps/rejected": -262.1700134277344, "loss": 0.622, "rewards/accuracies": 0.75, "rewards/chosen": 0.03479376435279846, "rewards/margins": 0.18349963426589966, "rewards/rejected": -0.1487058699131012, "step": 387 }, { "epoch": 0.388, "grad_norm": 1.1336675882339478, "learning_rate": 1.94e-06, "logits/chosen": 0.5838436484336853, "logits/rejected": 1.0092312097549438, "logps/chosen": -253.77133178710938, "logps/rejected": -263.13104248046875, "loss": 0.6423, "rewards/accuracies": 0.625, "rewards/chosen": 0.014143895357847214, "rewards/margins": 0.13118425011634827, "rewards/rejected": -0.11704035103321075, "step": 388 }, { "epoch": 0.389, "grad_norm": 1.2252693176269531, "learning_rate": 1.945e-06, "logits/chosen": 0.466339111328125, "logits/rejected": 0.24174702167510986, "logps/chosen": -249.57388305664062, "logps/rejected": -190.28564453125, "loss": 0.678, "rewards/accuracies": 0.5, "rewards/chosen": 0.005223274230957031, "rewards/margins": 0.08015822619199753, "rewards/rejected": -0.0749349594116211, "step": 389 }, { "epoch": 0.39, "grad_norm": 0.9876509308815002, "learning_rate": 1.95e-06, "logits/chosen": 0.4855291545391083, "logits/rejected": 0.5199207663536072, "logps/chosen": -162.26995849609375, "logps/rejected": -196.49359130859375, "loss": 0.6039, "rewards/accuracies": 0.625, "rewards/chosen": 0.029283523559570312, "rewards/margins": 0.21015316247940063, "rewards/rejected": -0.18086963891983032, "step": 390 }, { "epoch": 0.391, "grad_norm": 1.212896466255188, "learning_rate": 1.955e-06, "logits/chosen": 0.5562991499900818, "logits/rejected": 0.5435662269592285, "logps/chosen": -160.68783569335938, "logps/rejected": -176.9147186279297, "loss": 0.5542, "rewards/accuracies": 0.75, "rewards/chosen": 0.14972619712352753, "rewards/margins": 0.3282322883605957, "rewards/rejected": -0.17850609123706818, "step": 391 }, { "epoch": 0.392, "grad_norm": 1.1381555795669556, "learning_rate": 1.96e-06, "logits/chosen": 0.520726203918457, "logits/rejected": 0.8218934535980225, "logps/chosen": -171.1871337890625, "logps/rejected": -162.58145141601562, "loss": 0.6587, "rewards/accuracies": 0.625, "rewards/chosen": -0.06641139835119247, "rewards/margins": 0.1083737313747406, "rewards/rejected": -0.17478513717651367, "step": 392 }, { "epoch": 0.393, "grad_norm": 1.2814563512802124, "learning_rate": 1.965e-06, "logits/chosen": 0.2725251317024231, "logits/rejected": 0.5111180543899536, "logps/chosen": -205.677001953125, "logps/rejected": -198.1390380859375, "loss": 0.7135, "rewards/accuracies": 0.5, "rewards/chosen": -0.0984770879149437, "rewards/margins": 0.003230839967727661, "rewards/rejected": -0.10170794278383255, "step": 393 }, { "epoch": 0.394, "grad_norm": 1.4476624727249146, "learning_rate": 1.9699999999999998e-06, "logits/chosen": 0.4868222177028656, "logits/rejected": 0.4672034978866577, "logps/chosen": -219.81918334960938, "logps/rejected": -222.48495483398438, "loss": 0.7808, "rewards/accuracies": 0.375, "rewards/chosen": -0.15476356446743011, "rewards/margins": -0.1360560953617096, "rewards/rejected": -0.01870746538043022, "step": 394 }, { "epoch": 0.395, "grad_norm": 1.1169873476028442, "learning_rate": 1.975e-06, "logits/chosen": 0.6602992415428162, "logits/rejected": 0.5117785930633545, "logps/chosen": -222.24276733398438, "logps/rejected": -168.8640899658203, "loss": 0.6134, "rewards/accuracies": 0.625, "rewards/chosen": 0.061863623559474945, "rewards/margins": 0.2475699633359909, "rewards/rejected": -0.18570633232593536, "step": 395 }, { "epoch": 0.396, "grad_norm": 1.4847232103347778, "learning_rate": 1.98e-06, "logits/chosen": 1.139582872390747, "logits/rejected": 0.6668686270713806, "logps/chosen": -363.3453063964844, "logps/rejected": -184.29251098632812, "loss": 0.7093, "rewards/accuracies": 0.375, "rewards/chosen": -0.03947105258703232, "rewards/margins": 0.03041306883096695, "rewards/rejected": -0.06988410651683807, "step": 396 }, { "epoch": 0.397, "grad_norm": 0.9787327647209167, "learning_rate": 1.985e-06, "logits/chosen": 0.2199222594499588, "logits/rejected": 1.0808141231536865, "logps/chosen": -145.6376953125, "logps/rejected": -290.9228515625, "loss": 0.663, "rewards/accuracies": 0.625, "rewards/chosen": 0.0008927332237362862, "rewards/margins": 0.11820049583911896, "rewards/rejected": -0.11730776727199554, "step": 397 }, { "epoch": 0.398, "grad_norm": 1.1977660655975342, "learning_rate": 1.99e-06, "logits/chosen": 0.7593017220497131, "logits/rejected": 0.5149338841438293, "logps/chosen": -286.8496398925781, "logps/rejected": -247.10458374023438, "loss": 0.6175, "rewards/accuracies": 0.625, "rewards/chosen": 0.04838556796312332, "rewards/margins": 0.2055913507938385, "rewards/rejected": -0.15720577538013458, "step": 398 }, { "epoch": 0.399, "grad_norm": 1.05784273147583, "learning_rate": 1.995e-06, "logits/chosen": 0.5849027633666992, "logits/rejected": 0.2850176990032196, "logps/chosen": -308.51812744140625, "logps/rejected": -190.8610076904297, "loss": 0.6825, "rewards/accuracies": 0.5, "rewards/chosen": -0.0904783234000206, "rewards/margins": 0.03853845223784447, "rewards/rejected": -0.12901678681373596, "step": 399 }, { "epoch": 0.4, "grad_norm": 1.4056379795074463, "learning_rate": 2e-06, "logits/chosen": 0.8333343267440796, "logits/rejected": 0.24120986461639404, "logps/chosen": -276.91729736328125, "logps/rejected": -157.8484649658203, "loss": 0.7643, "rewards/accuracies": 0.5, "rewards/chosen": -0.10887308418750763, "rewards/margins": -0.08287554979324341, "rewards/rejected": -0.025997541844844818, "step": 400 }, { "epoch": 0.401, "grad_norm": 1.058074712753296, "learning_rate": 1.99875e-06, "logits/chosen": 0.745337963104248, "logits/rejected": 0.9038192629814148, "logps/chosen": -198.78665161132812, "logps/rejected": -210.73773193359375, "loss": 0.5347, "rewards/accuracies": 0.875, "rewards/chosen": 0.16671746969223022, "rewards/margins": 0.3826809525489807, "rewards/rejected": -0.2159634530544281, "step": 401 }, { "epoch": 0.402, "grad_norm": 1.0148754119873047, "learning_rate": 1.9975e-06, "logits/chosen": 0.27993243932724, "logits/rejected": 0.7770500183105469, "logps/chosen": -159.31744384765625, "logps/rejected": -227.3844451904297, "loss": 0.613, "rewards/accuracies": 0.625, "rewards/chosen": 0.06421393901109695, "rewards/margins": 0.2101423293352127, "rewards/rejected": -0.14592838287353516, "step": 402 }, { "epoch": 0.403, "grad_norm": 1.1069914102554321, "learning_rate": 1.99625e-06, "logits/chosen": 1.0368890762329102, "logits/rejected": 0.4458419680595398, "logps/chosen": -279.73443603515625, "logps/rejected": -141.08389282226562, "loss": 0.5752, "rewards/accuracies": 0.75, "rewards/chosen": 0.14309358596801758, "rewards/margins": 0.2802150845527649, "rewards/rejected": -0.13712148368358612, "step": 403 }, { "epoch": 0.404, "grad_norm": 1.0783796310424805, "learning_rate": 1.995e-06, "logits/chosen": 0.7878132462501526, "logits/rejected": 0.5543602108955383, "logps/chosen": -236.41664123535156, "logps/rejected": -243.24803161621094, "loss": 0.6121, "rewards/accuracies": 0.625, "rewards/chosen": 0.044278234243392944, "rewards/margins": 0.21100035309791565, "rewards/rejected": -0.1667221188545227, "step": 404 }, { "epoch": 0.405, "grad_norm": 1.1506671905517578, "learning_rate": 1.9937499999999998e-06, "logits/chosen": 0.9204286336898804, "logits/rejected": 0.6508611440658569, "logps/chosen": -268.1604919433594, "logps/rejected": -247.93202209472656, "loss": 0.6787, "rewards/accuracies": 0.5, "rewards/chosen": 0.0012538507580757141, "rewards/margins": 0.06798988580703735, "rewards/rejected": -0.06673603504896164, "step": 405 }, { "epoch": 0.406, "grad_norm": 1.5170561075210571, "learning_rate": 1.9925e-06, "logits/chosen": 0.6571277379989624, "logits/rejected": 0.06704630702733994, "logps/chosen": -243.7225341796875, "logps/rejected": -118.2477798461914, "loss": 0.7905, "rewards/accuracies": 0.25, "rewards/chosen": -0.16247950494289398, "rewards/margins": -0.13436412811279297, "rewards/rejected": -0.02811536379158497, "step": 406 }, { "epoch": 0.407, "grad_norm": 1.0446224212646484, "learning_rate": 1.9912499999999998e-06, "logits/chosen": -0.16019965708255768, "logits/rejected": 0.395446240901947, "logps/chosen": -140.05331420898438, "logps/rejected": -197.49374389648438, "loss": 0.6473, "rewards/accuracies": 0.75, "rewards/chosen": 0.039157018065452576, "rewards/margins": 0.16833259165287018, "rewards/rejected": -0.1291755735874176, "step": 407 }, { "epoch": 0.408, "grad_norm": 1.1918494701385498, "learning_rate": 1.99e-06, "logits/chosen": 0.5423214435577393, "logits/rejected": 0.12050648778676987, "logps/chosen": -201.4560089111328, "logps/rejected": -147.77891540527344, "loss": 0.6036, "rewards/accuracies": 0.75, "rewards/chosen": 0.08225822448730469, "rewards/margins": 0.24400024116039276, "rewards/rejected": -0.16174201667308807, "step": 408 }, { "epoch": 0.409, "grad_norm": 1.1646705865859985, "learning_rate": 1.98875e-06, "logits/chosen": 0.44588738679885864, "logits/rejected": 0.22098171710968018, "logps/chosen": -310.50115966796875, "logps/rejected": -183.228271484375, "loss": 0.6435, "rewards/accuracies": 0.625, "rewards/chosen": -0.009434650652110577, "rewards/margins": 0.13224005699157715, "rewards/rejected": -0.1416746973991394, "step": 409 }, { "epoch": 0.41, "grad_norm": 1.178562045097351, "learning_rate": 1.9875e-06, "logits/chosen": 0.11433494091033936, "logits/rejected": 0.6693436503410339, "logps/chosen": -131.41616821289062, "logps/rejected": -279.724609375, "loss": 0.5484, "rewards/accuracies": 0.75, "rewards/chosen": 0.10032706707715988, "rewards/margins": 0.3696604371070862, "rewards/rejected": -0.2693333625793457, "step": 410 }, { "epoch": 0.411, "grad_norm": 1.684470772743225, "learning_rate": 1.98625e-06, "logits/chosen": 0.2520783245563507, "logits/rejected": 0.36906924843788147, "logps/chosen": -216.26577758789062, "logps/rejected": -177.0050506591797, "loss": 0.8077, "rewards/accuracies": 0.5, "rewards/chosen": -0.10514307022094727, "rewards/margins": -0.1378948986530304, "rewards/rejected": 0.03275184705853462, "step": 411 }, { "epoch": 0.412, "grad_norm": 1.962244987487793, "learning_rate": 1.985e-06, "logits/chosen": 0.829700231552124, "logits/rejected": 0.055433239787817, "logps/chosen": -228.18087768554688, "logps/rejected": -153.25375366210938, "loss": 0.8978, "rewards/accuracies": 0.25, "rewards/chosen": -0.2511364817619324, "rewards/margins": -0.3371148705482483, "rewards/rejected": 0.08597835898399353, "step": 412 }, { "epoch": 0.413, "grad_norm": 1.199547290802002, "learning_rate": 1.98375e-06, "logits/chosen": 0.4809487462043762, "logits/rejected": 0.7451724410057068, "logps/chosen": -160.64657592773438, "logps/rejected": -183.30006408691406, "loss": 0.5495, "rewards/accuracies": 0.75, "rewards/chosen": 0.18490135669708252, "rewards/margins": 0.3916758894920349, "rewards/rejected": -0.2067745178937912, "step": 413 }, { "epoch": 0.414, "grad_norm": 1.0465309619903564, "learning_rate": 1.9824999999999997e-06, "logits/chosen": -0.35035115480422974, "logits/rejected": 0.6600967645645142, "logps/chosen": -107.80024719238281, "logps/rejected": -205.5438995361328, "loss": 0.4778, "rewards/accuracies": 0.875, "rewards/chosen": 0.2552957534790039, "rewards/margins": 0.5967411994934082, "rewards/rejected": -0.3414454460144043, "step": 414 }, { "epoch": 0.415, "grad_norm": 1.207563877105713, "learning_rate": 1.98125e-06, "logits/chosen": 0.8470306396484375, "logits/rejected": 0.03437821567058563, "logps/chosen": -410.2377624511719, "logps/rejected": -150.81243896484375, "loss": 0.6197, "rewards/accuracies": 0.625, "rewards/chosen": 0.040316008031368256, "rewards/margins": 0.17623481154441833, "rewards/rejected": -0.13591881096363068, "step": 415 }, { "epoch": 0.416, "grad_norm": 1.1942849159240723, "learning_rate": 1.98e-06, "logits/chosen": 0.7661194205284119, "logits/rejected": 0.6272225379943848, "logps/chosen": -275.400146484375, "logps/rejected": -184.88926696777344, "loss": 0.7097, "rewards/accuracies": 0.5, "rewards/chosen": -0.12190103530883789, "rewards/margins": 0.008094221353530884, "rewards/rejected": -0.12999525666236877, "step": 416 }, { "epoch": 0.417, "grad_norm": 1.2512850761413574, "learning_rate": 1.97875e-06, "logits/chosen": 0.8595912456512451, "logits/rejected": 0.4103412926197052, "logps/chosen": -235.99900817871094, "logps/rejected": -205.57943725585938, "loss": 0.7112, "rewards/accuracies": 0.5, "rewards/chosen": -0.04766054451465607, "rewards/margins": 0.02199648879468441, "rewards/rejected": -0.06965704262256622, "step": 417 }, { "epoch": 0.418, "grad_norm": 0.9562399387359619, "learning_rate": 1.9775e-06, "logits/chosen": 0.5054230690002441, "logits/rejected": 0.2441631555557251, "logps/chosen": -164.34976196289062, "logps/rejected": -178.19403076171875, "loss": 0.6125, "rewards/accuracies": 0.75, "rewards/chosen": 0.11109648644924164, "rewards/margins": 0.24358302354812622, "rewards/rejected": -0.13248653709888458, "step": 418 }, { "epoch": 0.419, "grad_norm": 1.2135347127914429, "learning_rate": 1.97625e-06, "logits/chosen": 0.17169198393821716, "logits/rejected": 0.7990589141845703, "logps/chosen": -143.40328979492188, "logps/rejected": -170.59228515625, "loss": 0.5275, "rewards/accuracies": 0.875, "rewards/chosen": 0.1370701789855957, "rewards/margins": 0.41155850887298584, "rewards/rejected": -0.2744883596897125, "step": 419 }, { "epoch": 0.42, "grad_norm": 1.2914634943008423, "learning_rate": 1.975e-06, "logits/chosen": 0.6467694044113159, "logits/rejected": -0.10626905411481857, "logps/chosen": -264.020751953125, "logps/rejected": -167.28314208984375, "loss": 0.6889, "rewards/accuracies": 0.5, "rewards/chosen": -0.07698650658130646, "rewards/margins": 0.07052135467529297, "rewards/rejected": -0.14750786125659943, "step": 420 }, { "epoch": 0.421, "grad_norm": 2.179330825805664, "learning_rate": 1.97375e-06, "logits/chosen": 0.33948877453804016, "logits/rejected": 0.18884307146072388, "logps/chosen": -217.57252502441406, "logps/rejected": -167.43701171875, "loss": 0.8529, "rewards/accuracies": 0.375, "rewards/chosen": -0.2112560272216797, "rewards/margins": -0.2511075735092163, "rewards/rejected": 0.03985157236456871, "step": 421 }, { "epoch": 0.422, "grad_norm": 1.122196912765503, "learning_rate": 1.9724999999999997e-06, "logits/chosen": 0.3913411498069763, "logits/rejected": 0.7531154751777649, "logps/chosen": -185.70420837402344, "logps/rejected": -304.6329345703125, "loss": 0.5489, "rewards/accuracies": 0.75, "rewards/chosen": 0.1914404034614563, "rewards/margins": 0.345049113035202, "rewards/rejected": -0.15360870957374573, "step": 422 }, { "epoch": 0.423, "grad_norm": 1.6864991188049316, "learning_rate": 1.97125e-06, "logits/chosen": 0.8400959968566895, "logits/rejected": 0.03465903550386429, "logps/chosen": -339.238037109375, "logps/rejected": -151.68211364746094, "loss": 0.7747, "rewards/accuracies": 0.25, "rewards/chosen": -0.09773501753807068, "rewards/margins": -0.1192445233464241, "rewards/rejected": 0.021509502083063126, "step": 423 }, { "epoch": 0.424, "grad_norm": 0.9395923614501953, "learning_rate": 1.9699999999999998e-06, "logits/chosen": 0.31084173917770386, "logits/rejected": 0.5422607660293579, "logps/chosen": -148.90768432617188, "logps/rejected": -218.08428955078125, "loss": 0.6579, "rewards/accuracies": 0.5, "rewards/chosen": 0.05205363780260086, "rewards/margins": 0.13246507942676544, "rewards/rejected": -0.08041143417358398, "step": 424 }, { "epoch": 0.425, "grad_norm": 0.9137839674949646, "learning_rate": 1.96875e-06, "logits/chosen": 0.1279730349779129, "logits/rejected": 0.2262856662273407, "logps/chosen": -180.16375732421875, "logps/rejected": -167.8311767578125, "loss": 0.6122, "rewards/accuracies": 0.75, "rewards/chosen": 0.03951987996697426, "rewards/margins": 0.21017208695411682, "rewards/rejected": -0.17065219581127167, "step": 425 }, { "epoch": 0.426, "grad_norm": 1.362886905670166, "learning_rate": 1.9675e-06, "logits/chosen": 0.2645191550254822, "logits/rejected": 0.14890418946743011, "logps/chosen": -232.0857696533203, "logps/rejected": -182.93154907226562, "loss": 0.6967, "rewards/accuracies": 0.375, "rewards/chosen": -0.010779190808534622, "rewards/margins": 0.06629772484302521, "rewards/rejected": -0.07707691937685013, "step": 426 }, { "epoch": 0.427, "grad_norm": 1.1976165771484375, "learning_rate": 1.96625e-06, "logits/chosen": 0.5195376873016357, "logits/rejected": 0.9304274916648865, "logps/chosen": -224.10626220703125, "logps/rejected": -194.5261993408203, "loss": 0.71, "rewards/accuracies": 0.625, "rewards/chosen": -0.06169300526380539, "rewards/margins": 0.0011362060904502869, "rewards/rejected": -0.06282921135425568, "step": 427 }, { "epoch": 0.428, "grad_norm": 1.233264684677124, "learning_rate": 1.965e-06, "logits/chosen": 0.5121873617172241, "logits/rejected": 0.6244115829467773, "logps/chosen": -384.067138671875, "logps/rejected": -190.97743225097656, "loss": 0.7291, "rewards/accuracies": 0.625, "rewards/chosen": -0.0977991595864296, "rewards/margins": -0.012029886245727539, "rewards/rejected": -0.08576927334070206, "step": 428 }, { "epoch": 0.429, "grad_norm": 0.9452357888221741, "learning_rate": 1.96375e-06, "logits/chosen": 0.4838348627090454, "logits/rejected": 0.34936580061912537, "logps/chosen": -199.69332885742188, "logps/rejected": -176.5416717529297, "loss": 0.6009, "rewards/accuracies": 0.625, "rewards/chosen": 0.123742014169693, "rewards/margins": 0.27094680070877075, "rewards/rejected": -0.14720478653907776, "step": 429 }, { "epoch": 0.43, "grad_norm": 1.2023991346359253, "learning_rate": 1.9625e-06, "logits/chosen": -0.1679268777370453, "logits/rejected": 0.47761285305023193, "logps/chosen": -119.9551773071289, "logps/rejected": -269.0670471191406, "loss": 0.5721, "rewards/accuracies": 0.875, "rewards/chosen": 0.025558283552527428, "rewards/margins": 0.2775990664958954, "rewards/rejected": -0.2520407736301422, "step": 430 }, { "epoch": 0.431, "grad_norm": 0.9554707407951355, "learning_rate": 1.9612499999999996e-06, "logits/chosen": 0.2833382487297058, "logits/rejected": 0.5828163623809814, "logps/chosen": -135.5400848388672, "logps/rejected": -204.48446655273438, "loss": 0.6109, "rewards/accuracies": 0.75, "rewards/chosen": 0.0951632559299469, "rewards/margins": 0.2626613676548004, "rewards/rejected": -0.16749811172485352, "step": 431 }, { "epoch": 0.432, "grad_norm": 1.3427138328552246, "learning_rate": 1.96e-06, "logits/chosen": 0.9716482162475586, "logits/rejected": 0.40519577264785767, "logps/chosen": -325.68927001953125, "logps/rejected": -175.98382568359375, "loss": 0.6949, "rewards/accuracies": 0.625, "rewards/chosen": 0.017689693719148636, "rewards/margins": 0.05028828978538513, "rewards/rejected": -0.0325985886156559, "step": 432 }, { "epoch": 0.433, "grad_norm": 1.1497275829315186, "learning_rate": 1.95875e-06, "logits/chosen": 0.6060264706611633, "logits/rejected": 0.3616088330745697, "logps/chosen": -211.53529357910156, "logps/rejected": -163.748291015625, "loss": 0.6209, "rewards/accuracies": 0.625, "rewards/chosen": 0.007984161376953125, "rewards/margins": 0.21692219376564026, "rewards/rejected": -0.20893803238868713, "step": 433 }, { "epoch": 0.434, "grad_norm": 1.2516378164291382, "learning_rate": 1.9575e-06, "logits/chosen": 0.1565462201833725, "logits/rejected": 0.8954795598983765, "logps/chosen": -151.75921630859375, "logps/rejected": -251.033203125, "loss": 0.5306, "rewards/accuracies": 0.875, "rewards/chosen": 0.16697168350219727, "rewards/margins": 0.38187581300735474, "rewards/rejected": -0.21490412950515747, "step": 434 }, { "epoch": 0.435, "grad_norm": 1.1987541913986206, "learning_rate": 1.95625e-06, "logits/chosen": 0.7574813365936279, "logits/rejected": 0.5925643444061279, "logps/chosen": -185.587158203125, "logps/rejected": -277.0776062011719, "loss": 0.5348, "rewards/accuracies": 0.875, "rewards/chosen": 0.15233716368675232, "rewards/margins": 0.4069911241531372, "rewards/rejected": -0.2546539306640625, "step": 435 }, { "epoch": 0.436, "grad_norm": 1.1621133089065552, "learning_rate": 1.955e-06, "logits/chosen": -0.08262443542480469, "logits/rejected": 0.7891265153884888, "logps/chosen": -209.66116333007812, "logps/rejected": -211.2588653564453, "loss": 0.5091, "rewards/accuracies": 0.75, "rewards/chosen": 0.19979266822338104, "rewards/margins": 0.48153895139694214, "rewards/rejected": -0.2817462980747223, "step": 436 }, { "epoch": 0.437, "grad_norm": 1.2934396266937256, "learning_rate": 1.95375e-06, "logits/chosen": 0.5435717105865479, "logits/rejected": -0.015168935060501099, "logps/chosen": -204.67340087890625, "logps/rejected": -141.23562622070312, "loss": 0.7743, "rewards/accuracies": 0.375, "rewards/chosen": -0.2071756273508072, "rewards/margins": -0.09152554720640182, "rewards/rejected": -0.11565008759498596, "step": 437 }, { "epoch": 0.438, "grad_norm": 1.338179349899292, "learning_rate": 1.9525e-06, "logits/chosen": -0.007975280284881592, "logits/rejected": 0.31014975905418396, "logps/chosen": -163.28506469726562, "logps/rejected": -190.3165283203125, "loss": 0.5142, "rewards/accuracies": 0.875, "rewards/chosen": 0.19623500108718872, "rewards/margins": 0.4759407937526703, "rewards/rejected": -0.27970582246780396, "step": 438 }, { "epoch": 0.439, "grad_norm": 1.0589388608932495, "learning_rate": 1.9512499999999997e-06, "logits/chosen": 0.4901193380355835, "logits/rejected": 0.06517826020717621, "logps/chosen": -275.965576171875, "logps/rejected": -178.02093505859375, "loss": 0.6222, "rewards/accuracies": 0.625, "rewards/chosen": 0.06258288025856018, "rewards/margins": 0.19301491975784302, "rewards/rejected": -0.13043203949928284, "step": 439 }, { "epoch": 0.44, "grad_norm": 1.4461954832077026, "learning_rate": 1.95e-06, "logits/chosen": 0.7628079652786255, "logits/rejected": 0.640371561050415, "logps/chosen": -232.11495971679688, "logps/rejected": -195.09701538085938, "loss": 0.707, "rewards/accuracies": 0.375, "rewards/chosen": -0.09009446948766708, "rewards/margins": 0.02615470439195633, "rewards/rejected": -0.11624918133020401, "step": 440 }, { "epoch": 0.441, "grad_norm": 1.156647801399231, "learning_rate": 1.9487499999999998e-06, "logits/chosen": 0.2938793897628784, "logits/rejected": 0.6240886449813843, "logps/chosen": -165.98635864257812, "logps/rejected": -225.8262939453125, "loss": 0.6958, "rewards/accuracies": 0.625, "rewards/chosen": -0.12277040630578995, "rewards/margins": 0.05569592863321304, "rewards/rejected": -0.178466334939003, "step": 441 }, { "epoch": 0.442, "grad_norm": 1.633872389793396, "learning_rate": 1.9475e-06, "logits/chosen": 0.19287508726119995, "logits/rejected": 0.7572751641273499, "logps/chosen": -190.30807495117188, "logps/rejected": -230.30296325683594, "loss": 0.7536, "rewards/accuracies": 0.375, "rewards/chosen": -0.07230167835950851, "rewards/margins": -0.019221171736717224, "rewards/rejected": -0.053080469369888306, "step": 442 }, { "epoch": 0.443, "grad_norm": 1.3378924131393433, "learning_rate": 1.94625e-06, "logits/chosen": 0.122111476957798, "logits/rejected": 0.7749381065368652, "logps/chosen": -131.42495727539062, "logps/rejected": -297.4617919921875, "loss": 0.5588, "rewards/accuracies": 0.75, "rewards/chosen": 0.04395942762494087, "rewards/margins": 0.36911141872406006, "rewards/rejected": -0.32515203952789307, "step": 443 }, { "epoch": 0.444, "grad_norm": 1.1784088611602783, "learning_rate": 1.945e-06, "logits/chosen": 0.5778785943984985, "logits/rejected": 0.19170507788658142, "logps/chosen": -298.5111083984375, "logps/rejected": -160.38113403320312, "loss": 0.6539, "rewards/accuracies": 0.625, "rewards/chosen": 0.014932731166481972, "rewards/margins": 0.15474414825439453, "rewards/rejected": -0.1398114264011383, "step": 444 }, { "epoch": 0.445, "grad_norm": 1.3394466638565063, "learning_rate": 1.94375e-06, "logits/chosen": 0.7125555872917175, "logits/rejected": 0.46465757489204407, "logps/chosen": -269.07373046875, "logps/rejected": -194.47787475585938, "loss": 0.7222, "rewards/accuracies": 0.5, "rewards/chosen": -0.04709835350513458, "rewards/margins": 0.023411475121974945, "rewards/rejected": -0.07050982117652893, "step": 445 }, { "epoch": 0.446, "grad_norm": 1.4066178798675537, "learning_rate": 1.9424999999999996e-06, "logits/chosen": 0.2778037488460541, "logits/rejected": -0.007744520902633667, "logps/chosen": -191.966796875, "logps/rejected": -151.77584838867188, "loss": 0.7332, "rewards/accuracies": 0.5, "rewards/chosen": -0.09379561245441437, "rewards/margins": 0.0063710808753967285, "rewards/rejected": -0.1001667007803917, "step": 446 }, { "epoch": 0.447, "grad_norm": 0.9503964185714722, "learning_rate": 1.94125e-06, "logits/chosen": 0.47201260924339294, "logits/rejected": 1.032735824584961, "logps/chosen": -228.49078369140625, "logps/rejected": -218.77481079101562, "loss": 0.6283, "rewards/accuracies": 0.625, "rewards/chosen": 0.017954636365175247, "rewards/margins": 0.17502528429031372, "rewards/rejected": -0.15707063674926758, "step": 447 }, { "epoch": 0.448, "grad_norm": 1.1944303512573242, "learning_rate": 1.94e-06, "logits/chosen": 0.26164841651916504, "logits/rejected": 0.3884517550468445, "logps/chosen": -146.15554809570312, "logps/rejected": -250.00460815429688, "loss": 0.5406, "rewards/accuracies": 0.875, "rewards/chosen": 0.11036920547485352, "rewards/margins": 0.3677005171775818, "rewards/rejected": -0.2573312520980835, "step": 448 }, { "epoch": 0.449, "grad_norm": 0.9389795660972595, "learning_rate": 1.93875e-06, "logits/chosen": 0.4253494143486023, "logits/rejected": 0.6872414946556091, "logps/chosen": -171.59103393554688, "logps/rejected": -203.06004333496094, "loss": 0.5753, "rewards/accuracies": 0.625, "rewards/chosen": 0.11103382706642151, "rewards/margins": 0.3560481071472168, "rewards/rejected": -0.2450142800807953, "step": 449 }, { "epoch": 0.45, "grad_norm": 1.1959264278411865, "learning_rate": 1.9375e-06, "logits/chosen": 0.5702050924301147, "logits/rejected": 0.6234212517738342, "logps/chosen": -225.08294677734375, "logps/rejected": -221.16648864746094, "loss": 0.6295, "rewards/accuracies": 0.625, "rewards/chosen": -0.018271636217832565, "rewards/margins": 0.2651980221271515, "rewards/rejected": -0.28346967697143555, "step": 450 }, { "epoch": 0.451, "grad_norm": 1.4812098741531372, "learning_rate": 1.93625e-06, "logits/chosen": 0.6562855839729309, "logits/rejected": 0.5354712605476379, "logps/chosen": -316.9854431152344, "logps/rejected": -207.08157348632812, "loss": 0.6838, "rewards/accuracies": 0.375, "rewards/chosen": 0.010418606922030449, "rewards/margins": 0.09389640390872955, "rewards/rejected": -0.08347779512405396, "step": 451 }, { "epoch": 0.452, "grad_norm": 1.557835578918457, "learning_rate": 1.935e-06, "logits/chosen": 0.695262610912323, "logits/rejected": 0.5033326148986816, "logps/chosen": -240.22225952148438, "logps/rejected": -181.49749755859375, "loss": 0.7928, "rewards/accuracies": 0.5, "rewards/chosen": -0.12249907106161118, "rewards/margins": -0.05931682884693146, "rewards/rejected": -0.06318226456642151, "step": 452 }, { "epoch": 0.453, "grad_norm": 0.9852244853973389, "learning_rate": 1.93375e-06, "logits/chosen": 0.4107428193092346, "logits/rejected": 1.1233607530593872, "logps/chosen": -155.2694854736328, "logps/rejected": -216.71629333496094, "loss": 0.6208, "rewards/accuracies": 0.75, "rewards/chosen": -0.0033666566014289856, "rewards/margins": 0.23343631625175476, "rewards/rejected": -0.23680296540260315, "step": 453 }, { "epoch": 0.454, "grad_norm": 1.253177285194397, "learning_rate": 1.9325e-06, "logits/chosen": 0.10686859488487244, "logits/rejected": 0.8794882297515869, "logps/chosen": -103.87701416015625, "logps/rejected": -230.19065856933594, "loss": 0.5829, "rewards/accuracies": 0.875, "rewards/chosen": 0.05282247066497803, "rewards/margins": 0.286652147769928, "rewards/rejected": -0.23382969200611115, "step": 454 }, { "epoch": 0.455, "grad_norm": 1.6531702280044556, "learning_rate": 1.93125e-06, "logits/chosen": 0.48516881465911865, "logits/rejected": 0.0908808708190918, "logps/chosen": -185.89175415039062, "logps/rejected": -174.0558319091797, "loss": 0.7771, "rewards/accuracies": 0.5, "rewards/chosen": -0.14866457879543304, "rewards/margins": -0.0613689124584198, "rewards/rejected": -0.08729562908411026, "step": 455 }, { "epoch": 0.456, "grad_norm": 1.6269440650939941, "learning_rate": 1.9299999999999997e-06, "logits/chosen": 0.9587519764900208, "logits/rejected": 0.45789143443107605, "logps/chosen": -254.3141632080078, "logps/rejected": -198.8308563232422, "loss": 0.8184, "rewards/accuracies": 0.5, "rewards/chosen": -0.16438837349414825, "rewards/margins": -0.1523512899875641, "rewards/rejected": -0.012037087231874466, "step": 456 }, { "epoch": 0.457, "grad_norm": 1.216989278793335, "learning_rate": 1.92875e-06, "logits/chosen": 0.3424152433872223, "logits/rejected": 0.7179023027420044, "logps/chosen": -207.4228973388672, "logps/rejected": -228.86578369140625, "loss": 0.6888, "rewards/accuracies": 0.375, "rewards/chosen": -0.08338947594165802, "rewards/margins": 0.0740334540605545, "rewards/rejected": -0.15742293000221252, "step": 457 }, { "epoch": 0.458, "grad_norm": 1.3746393918991089, "learning_rate": 1.9274999999999998e-06, "logits/chosen": 0.7842135429382324, "logits/rejected": 0.42857158184051514, "logps/chosen": -383.43634033203125, "logps/rejected": -174.63406372070312, "loss": 0.7192, "rewards/accuracies": 0.5, "rewards/chosen": -0.09329294413328171, "rewards/margins": 0.04202904924750328, "rewards/rejected": -0.13532200455665588, "step": 458 }, { "epoch": 0.459, "grad_norm": 1.3285026550292969, "learning_rate": 1.92625e-06, "logits/chosen": 0.02074110507965088, "logits/rejected": 0.562690794467926, "logps/chosen": -185.11819458007812, "logps/rejected": -189.71865844726562, "loss": 0.7543, "rewards/accuracies": 0.625, "rewards/chosen": -0.11898480355739594, "rewards/margins": -0.03247024118900299, "rewards/rejected": -0.08651456236839294, "step": 459 }, { "epoch": 0.46, "grad_norm": 1.1041377782821655, "learning_rate": 1.9249999999999998e-06, "logits/chosen": -0.11277240514755249, "logits/rejected": 0.5977302193641663, "logps/chosen": -138.55189514160156, "logps/rejected": -181.87637329101562, "loss": 0.6527, "rewards/accuracies": 0.75, "rewards/chosen": -0.013396687805652618, "rewards/margins": 0.16523928940296173, "rewards/rejected": -0.17863598465919495, "step": 460 }, { "epoch": 0.461, "grad_norm": 1.2421455383300781, "learning_rate": 1.92375e-06, "logits/chosen": 0.799392819404602, "logits/rejected": 0.12411405146121979, "logps/chosen": -195.35546875, "logps/rejected": -239.77920532226562, "loss": 0.6079, "rewards/accuracies": 0.625, "rewards/chosen": -0.07609449326992035, "rewards/margins": 0.21718832850456238, "rewards/rejected": -0.29328280687332153, "step": 461 }, { "epoch": 0.462, "grad_norm": 1.1447502374649048, "learning_rate": 1.9225e-06, "logits/chosen": 0.8584494590759277, "logits/rejected": 0.37123048305511475, "logps/chosen": -249.76475524902344, "logps/rejected": -163.42343139648438, "loss": 0.6144, "rewards/accuracies": 0.625, "rewards/chosen": 0.06720095127820969, "rewards/margins": 0.2281181514263153, "rewards/rejected": -0.16091719269752502, "step": 462 }, { "epoch": 0.463, "grad_norm": 1.1531199216842651, "learning_rate": 1.9212499999999996e-06, "logits/chosen": 0.17782120406627655, "logits/rejected": 0.11537297815084457, "logps/chosen": -180.9814453125, "logps/rejected": -204.7666015625, "loss": 0.6708, "rewards/accuracies": 0.625, "rewards/chosen": -0.04308290779590607, "rewards/margins": 0.13436788320541382, "rewards/rejected": -0.17745080590248108, "step": 463 }, { "epoch": 0.464, "grad_norm": 1.559523105621338, "learning_rate": 1.92e-06, "logits/chosen": 0.6779786348342896, "logits/rejected": -0.2097073197364807, "logps/chosen": -226.29354858398438, "logps/rejected": -133.04421997070312, "loss": 0.7666, "rewards/accuracies": 0.375, "rewards/chosen": -0.12543049454689026, "rewards/margins": -0.07144831866025925, "rewards/rejected": -0.05398216098546982, "step": 464 }, { "epoch": 0.465, "grad_norm": 1.115533709526062, "learning_rate": 1.91875e-06, "logits/chosen": 0.32810667157173157, "logits/rejected": 0.5604045391082764, "logps/chosen": -129.10574340820312, "logps/rejected": -185.96652221679688, "loss": 0.5069, "rewards/accuracies": 0.875, "rewards/chosen": 0.18170495331287384, "rewards/margins": 0.4635438323020935, "rewards/rejected": -0.28183889389038086, "step": 465 }, { "epoch": 0.466, "grad_norm": 1.4263274669647217, "learning_rate": 1.9175e-06, "logits/chosen": 0.6004995107650757, "logits/rejected": 0.4814796447753906, "logps/chosen": -156.8258819580078, "logps/rejected": -269.515380859375, "loss": 0.4592, "rewards/accuracies": 0.875, "rewards/chosen": 0.16238708794116974, "rewards/margins": 0.5800882577896118, "rewards/rejected": -0.4177011549472809, "step": 466 }, { "epoch": 0.467, "grad_norm": 1.2034857273101807, "learning_rate": 1.91625e-06, "logits/chosen": 0.436048686504364, "logits/rejected": 0.47867876291275024, "logps/chosen": -218.17333984375, "logps/rejected": -190.01736450195312, "loss": 0.539, "rewards/accuracies": 0.75, "rewards/chosen": 0.1400846540927887, "rewards/margins": 0.4107806980609894, "rewards/rejected": -0.27069607377052307, "step": 467 }, { "epoch": 0.468, "grad_norm": 0.898371160030365, "learning_rate": 1.915e-06, "logits/chosen": -0.14569199085235596, "logits/rejected": 0.1382240653038025, "logps/chosen": -149.48362731933594, "logps/rejected": -155.10946655273438, "loss": 0.5997, "rewards/accuracies": 0.75, "rewards/chosen": 0.0010832250118255615, "rewards/margins": 0.2592179775238037, "rewards/rejected": -0.25813475251197815, "step": 468 }, { "epoch": 0.469, "grad_norm": 1.8510180711746216, "learning_rate": 1.91375e-06, "logits/chosen": 0.711759626865387, "logits/rejected": 0.6012658476829529, "logps/chosen": -331.48724365234375, "logps/rejected": -177.74681091308594, "loss": 0.7497, "rewards/accuracies": 0.5, "rewards/chosen": -0.08408565074205399, "rewards/margins": -0.05051411688327789, "rewards/rejected": -0.033571530133485794, "step": 469 }, { "epoch": 0.47, "grad_norm": 1.0414369106292725, "learning_rate": 1.9125e-06, "logits/chosen": 0.15935510396957397, "logits/rejected": 0.15937119722366333, "logps/chosen": -192.14463806152344, "logps/rejected": -240.49154663085938, "loss": 0.6533, "rewards/accuracies": 0.625, "rewards/chosen": -0.09523268043994904, "rewards/margins": 0.13693256676197052, "rewards/rejected": -0.23216524720191956, "step": 470 }, { "epoch": 0.471, "grad_norm": 0.9036175608634949, "learning_rate": 1.9112499999999997e-06, "logits/chosen": 0.772010862827301, "logits/rejected": 0.6488708257675171, "logps/chosen": -161.945068359375, "logps/rejected": -169.8529815673828, "loss": 0.5918, "rewards/accuracies": 0.625, "rewards/chosen": 0.038114361464977264, "rewards/margins": 0.266729474067688, "rewards/rejected": -0.22861510515213013, "step": 471 }, { "epoch": 0.472, "grad_norm": 0.9476335048675537, "learning_rate": 1.91e-06, "logits/chosen": 0.6271799802780151, "logits/rejected": 0.5369877815246582, "logps/chosen": -172.10755920410156, "logps/rejected": -173.46873474121094, "loss": 0.5826, "rewards/accuracies": 0.75, "rewards/chosen": 0.0642915815114975, "rewards/margins": 0.33353930711746216, "rewards/rejected": -0.26924771070480347, "step": 472 }, { "epoch": 0.473, "grad_norm": 1.2180567979812622, "learning_rate": 1.9087499999999997e-06, "logits/chosen": 0.5357088446617126, "logits/rejected": 0.6544821262359619, "logps/chosen": -192.4263458251953, "logps/rejected": -224.67185974121094, "loss": 0.6472, "rewards/accuracies": 0.625, "rewards/chosen": -0.014055922627449036, "rewards/margins": 0.13717547059059143, "rewards/rejected": -0.15123139321804047, "step": 473 }, { "epoch": 0.474, "grad_norm": 1.2359743118286133, "learning_rate": 1.9075e-06, "logits/chosen": 0.2239057570695877, "logits/rejected": 0.953005313873291, "logps/chosen": -140.44219970703125, "logps/rejected": -232.9300994873047, "loss": 0.5583, "rewards/accuracies": 0.75, "rewards/chosen": 0.08091612160205841, "rewards/margins": 0.3488151431083679, "rewards/rejected": -0.2678990364074707, "step": 474 }, { "epoch": 0.475, "grad_norm": 0.918680727481842, "learning_rate": 1.90625e-06, "logits/chosen": 0.5716888308525085, "logits/rejected": -0.012839406728744507, "logps/chosen": -153.54135131835938, "logps/rejected": -142.2911834716797, "loss": 0.5341, "rewards/accuracies": 0.75, "rewards/chosen": 0.09775634109973907, "rewards/margins": 0.40121686458587646, "rewards/rejected": -0.3034605085849762, "step": 475 }, { "epoch": 0.476, "grad_norm": 0.9220218658447266, "learning_rate": 1.905e-06, "logits/chosen": 0.13741040229797363, "logits/rejected": 0.7050686478614807, "logps/chosen": -139.59695434570312, "logps/rejected": -192.0801544189453, "loss": 0.5531, "rewards/accuracies": 0.75, "rewards/chosen": 0.07695475220680237, "rewards/margins": 0.4172833561897278, "rewards/rejected": -0.3403286039829254, "step": 476 }, { "epoch": 0.477, "grad_norm": 1.1996937990188599, "learning_rate": 1.90375e-06, "logits/chosen": 0.5282697081565857, "logits/rejected": 0.6983755230903625, "logps/chosen": -189.42916870117188, "logps/rejected": -199.88157653808594, "loss": 0.667, "rewards/accuracies": 0.625, "rewards/chosen": 0.020326383411884308, "rewards/margins": 0.139961376786232, "rewards/rejected": -0.11963501572608948, "step": 477 }, { "epoch": 0.478, "grad_norm": 1.1779203414916992, "learning_rate": 1.9025e-06, "logits/chosen": 0.29214227199554443, "logits/rejected": 0.3285360336303711, "logps/chosen": -208.2008056640625, "logps/rejected": -176.19715881347656, "loss": 0.6783, "rewards/accuracies": 0.5, "rewards/chosen": -0.027807921171188354, "rewards/margins": 0.1267564743757248, "rewards/rejected": -0.15456438064575195, "step": 478 }, { "epoch": 0.479, "grad_norm": 1.281305193901062, "learning_rate": 1.90125e-06, "logits/chosen": 0.46839556097984314, "logits/rejected": 0.9562997817993164, "logps/chosen": -154.07754516601562, "logps/rejected": -228.52352905273438, "loss": 0.6388, "rewards/accuracies": 0.75, "rewards/chosen": -0.06292057037353516, "rewards/margins": 0.2139364331960678, "rewards/rejected": -0.2768569886684418, "step": 479 }, { "epoch": 0.48, "grad_norm": 1.4618314504623413, "learning_rate": 1.8999999999999998e-06, "logits/chosen": 0.7511192560195923, "logits/rejected": 0.5301550626754761, "logps/chosen": -255.28915405273438, "logps/rejected": -204.92996215820312, "loss": 0.6757, "rewards/accuracies": 0.625, "rewards/chosen": -0.03775624930858612, "rewards/margins": 0.1463807076215744, "rewards/rejected": -0.18413697183132172, "step": 480 }, { "epoch": 0.481, "grad_norm": 1.0453449487686157, "learning_rate": 1.8987499999999998e-06, "logits/chosen": 0.19023901224136353, "logits/rejected": 0.8934377431869507, "logps/chosen": -226.07513427734375, "logps/rejected": -258.0047912597656, "loss": 0.5362, "rewards/accuracies": 0.875, "rewards/chosen": 0.013428021222352982, "rewards/margins": 0.3958398699760437, "rewards/rejected": -0.3824118971824646, "step": 481 }, { "epoch": 0.482, "grad_norm": 0.9548736810684204, "learning_rate": 1.8974999999999998e-06, "logits/chosen": -0.03834528848528862, "logits/rejected": 0.9685264825820923, "logps/chosen": -127.01415252685547, "logps/rejected": -210.27923583984375, "loss": 0.6869, "rewards/accuracies": 0.75, "rewards/chosen": -0.09576378017663956, "rewards/margins": 0.0828714445233345, "rewards/rejected": -0.17863520979881287, "step": 482 }, { "epoch": 0.483, "grad_norm": 1.2513056993484497, "learning_rate": 1.8962499999999998e-06, "logits/chosen": 0.6714546084403992, "logits/rejected": 0.5687328577041626, "logps/chosen": -322.3977355957031, "logps/rejected": -193.21665954589844, "loss": 0.5415, "rewards/accuracies": 0.75, "rewards/chosen": 0.04327450320124626, "rewards/margins": 0.39403313398361206, "rewards/rejected": -0.3507586419582367, "step": 483 }, { "epoch": 0.484, "grad_norm": 1.08930242061615, "learning_rate": 1.8949999999999999e-06, "logits/chosen": 0.5590872168540955, "logits/rejected": 0.8537343740463257, "logps/chosen": -139.78353881835938, "logps/rejected": -236.70130920410156, "loss": 0.5765, "rewards/accuracies": 0.75, "rewards/chosen": 0.07470199465751648, "rewards/margins": 0.35937997698783875, "rewards/rejected": -0.28467798233032227, "step": 484 }, { "epoch": 0.485, "grad_norm": 2.216805934906006, "learning_rate": 1.8937499999999999e-06, "logits/chosen": 1.3378500938415527, "logits/rejected": 0.028005048632621765, "logps/chosen": -337.7785339355469, "logps/rejected": -125.79883575439453, "loss": 0.8414, "rewards/accuracies": 0.375, "rewards/chosen": -0.16951867938041687, "rewards/margins": -0.18795976042747498, "rewards/rejected": 0.018441105261445045, "step": 485 }, { "epoch": 0.486, "grad_norm": 1.2276042699813843, "learning_rate": 1.8924999999999999e-06, "logits/chosen": 0.56645268201828, "logits/rejected": 0.6518830060958862, "logps/chosen": -161.1385040283203, "logps/rejected": -150.42056274414062, "loss": 0.5955, "rewards/accuracies": 0.75, "rewards/chosen": 0.04654281586408615, "rewards/margins": 0.3365105390548706, "rewards/rejected": -0.28996771574020386, "step": 486 }, { "epoch": 0.487, "grad_norm": 1.712276577949524, "learning_rate": 1.89125e-06, "logits/chosen": 1.0559580326080322, "logits/rejected": 0.22229784727096558, "logps/chosen": -388.03192138671875, "logps/rejected": -152.2833709716797, "loss": 0.7543, "rewards/accuracies": 0.375, "rewards/chosen": -0.13390502333641052, "rewards/margins": -0.04344131052494049, "rewards/rejected": -0.09046373516321182, "step": 487 }, { "epoch": 0.488, "grad_norm": 1.3947985172271729, "learning_rate": 1.89e-06, "logits/chosen": 0.9182350635528564, "logits/rejected": 0.10631431639194489, "logps/chosen": -277.0918273925781, "logps/rejected": -148.05642700195312, "loss": 0.7311, "rewards/accuracies": 0.375, "rewards/chosen": -0.11813335865736008, "rewards/margins": -0.015022855252027512, "rewards/rejected": -0.10311050713062286, "step": 488 }, { "epoch": 0.489, "grad_norm": 1.1899596452713013, "learning_rate": 1.88875e-06, "logits/chosen": 0.1924266368150711, "logits/rejected": 0.41280779242515564, "logps/chosen": -205.58847045898438, "logps/rejected": -299.70379638671875, "loss": 0.5614, "rewards/accuracies": 0.75, "rewards/chosen": 0.08262848854064941, "rewards/margins": 0.31667086482048035, "rewards/rejected": -0.23404237627983093, "step": 489 }, { "epoch": 0.49, "grad_norm": 1.7743061780929565, "learning_rate": 1.8875e-06, "logits/chosen": 0.967599630355835, "logits/rejected": 0.1353110522031784, "logps/chosen": -323.97381591796875, "logps/rejected": -158.96881103515625, "loss": 0.8104, "rewards/accuracies": 0.375, "rewards/chosen": -0.19697466492652893, "rewards/margins": -0.09642171859741211, "rewards/rejected": -0.10055293887853622, "step": 490 }, { "epoch": 0.491, "grad_norm": 1.6252981424331665, "learning_rate": 1.88625e-06, "logits/chosen": 0.7797929048538208, "logits/rejected": 0.16514848172664642, "logps/chosen": -262.45245361328125, "logps/rejected": -244.43734741210938, "loss": 0.7624, "rewards/accuracies": 0.5, "rewards/chosen": -0.27865391969680786, "rewards/margins": -0.019016355276107788, "rewards/rejected": -0.2596375644207001, "step": 491 }, { "epoch": 0.492, "grad_norm": 1.0141000747680664, "learning_rate": 1.885e-06, "logits/chosen": 0.6501842737197876, "logits/rejected": 0.2869603931903839, "logps/chosen": -185.82949829101562, "logps/rejected": -153.54937744140625, "loss": 0.7132, "rewards/accuracies": 0.5, "rewards/chosen": -0.2295394241809845, "rewards/margins": 0.03337029367685318, "rewards/rejected": -0.2629097104072571, "step": 492 }, { "epoch": 0.493, "grad_norm": 1.0653809309005737, "learning_rate": 1.88375e-06, "logits/chosen": 0.9598445892333984, "logits/rejected": 0.47362029552459717, "logps/chosen": -293.39910888671875, "logps/rejected": -199.69473266601562, "loss": 0.5463, "rewards/accuracies": 0.75, "rewards/chosen": 0.08639582991600037, "rewards/margins": 0.4267027676105499, "rewards/rejected": -0.34030693769454956, "step": 493 }, { "epoch": 0.494, "grad_norm": 0.9720371961593628, "learning_rate": 1.8825e-06, "logits/chosen": 0.12117283046245575, "logits/rejected": 0.566558301448822, "logps/chosen": -146.87356567382812, "logps/rejected": -190.7934112548828, "loss": 0.5242, "rewards/accuracies": 0.875, "rewards/chosen": 0.015417001210153103, "rewards/margins": 0.410744845867157, "rewards/rejected": -0.3953278660774231, "step": 494 }, { "epoch": 0.495, "grad_norm": 0.8474186658859253, "learning_rate": 1.88125e-06, "logits/chosen": 0.0410698801279068, "logits/rejected": 0.655257523059845, "logps/chosen": -140.15260314941406, "logps/rejected": -179.47216796875, "loss": 0.5447, "rewards/accuracies": 0.75, "rewards/chosen": 0.05433226004242897, "rewards/margins": 0.3816531300544739, "rewards/rejected": -0.3273208737373352, "step": 495 }, { "epoch": 0.496, "grad_norm": 1.3320292234420776, "learning_rate": 1.8799999999999998e-06, "logits/chosen": 0.6763129234313965, "logits/rejected": 0.17245441675186157, "logps/chosen": -294.5776672363281, "logps/rejected": -161.91741943359375, "loss": 0.7205, "rewards/accuracies": 0.5, "rewards/chosen": -0.14089298248291016, "rewards/margins": 0.060952845960855484, "rewards/rejected": -0.20184583961963654, "step": 496 }, { "epoch": 0.497, "grad_norm": 1.5632883310317993, "learning_rate": 1.8787499999999998e-06, "logits/chosen": 0.4433920383453369, "logits/rejected": 0.7838399410247803, "logps/chosen": -232.89715576171875, "logps/rejected": -284.16705322265625, "loss": 0.7019, "rewards/accuracies": 0.625, "rewards/chosen": -0.07815399765968323, "rewards/margins": 0.14915335178375244, "rewards/rejected": -0.22730731964111328, "step": 497 }, { "epoch": 0.498, "grad_norm": 1.0206900835037231, "learning_rate": 1.8774999999999998e-06, "logits/chosen": 0.3443008363246918, "logits/rejected": 0.9402576684951782, "logps/chosen": -215.92276000976562, "logps/rejected": -256.0380859375, "loss": 0.5722, "rewards/accuracies": 0.75, "rewards/chosen": 0.013476468622684479, "rewards/margins": 0.352166086435318, "rewards/rejected": -0.3386896252632141, "step": 498 }, { "epoch": 0.499, "grad_norm": 1.6114665269851685, "learning_rate": 1.8762499999999998e-06, "logits/chosen": 0.5443048477172852, "logits/rejected": 0.48271268606185913, "logps/chosen": -267.4411315917969, "logps/rejected": -140.02615356445312, "loss": 0.7739, "rewards/accuracies": 0.5, "rewards/chosen": -0.19848670065402985, "rewards/margins": -0.024513721466064453, "rewards/rejected": -0.1739729791879654, "step": 499 }, { "epoch": 0.5, "grad_norm": 1.5740680694580078, "learning_rate": 1.8749999999999998e-06, "logits/chosen": 0.748949408531189, "logits/rejected": 0.3754490315914154, "logps/chosen": -337.61724853515625, "logps/rejected": -157.1270751953125, "loss": 0.7438, "rewards/accuracies": 0.375, "rewards/chosen": -0.17923402786254883, "rewards/margins": -0.006320677697658539, "rewards/rejected": -0.17291337251663208, "step": 500 }, { "epoch": 0.501, "grad_norm": 1.099474310874939, "learning_rate": 1.8737499999999998e-06, "logits/chosen": 0.2750489115715027, "logits/rejected": -0.15976279973983765, "logps/chosen": -244.89019775390625, "logps/rejected": -168.3121795654297, "loss": 0.5577, "rewards/accuracies": 0.625, "rewards/chosen": 0.04462823644280434, "rewards/margins": 0.37483111023902893, "rewards/rejected": -0.3302028775215149, "step": 501 }, { "epoch": 0.502, "grad_norm": 1.2955206632614136, "learning_rate": 1.8725e-06, "logits/chosen": 1.0099387168884277, "logits/rejected": 0.5496715903282166, "logps/chosen": -234.2972412109375, "logps/rejected": -200.86553955078125, "loss": 0.6826, "rewards/accuracies": 0.5, "rewards/chosen": -0.03916855528950691, "rewards/margins": 0.14007815718650818, "rewards/rejected": -0.1792467087507248, "step": 502 }, { "epoch": 0.503, "grad_norm": 1.181132197380066, "learning_rate": 1.87125e-06, "logits/chosen": 0.6334929466247559, "logits/rejected": 0.578885555267334, "logps/chosen": -249.97982788085938, "logps/rejected": -200.81080627441406, "loss": 0.5878, "rewards/accuracies": 0.625, "rewards/chosen": 0.027814671397209167, "rewards/margins": 0.25744009017944336, "rewards/rejected": -0.2296254187822342, "step": 503 }, { "epoch": 0.504, "grad_norm": 1.1634799242019653, "learning_rate": 1.87e-06, "logits/chosen": 0.05017165094614029, "logits/rejected": 1.0151281356811523, "logps/chosen": -165.66012573242188, "logps/rejected": -252.61911010742188, "loss": 0.5037, "rewards/accuracies": 0.875, "rewards/chosen": 0.11813096702098846, "rewards/margins": 0.48568281531333923, "rewards/rejected": -0.3675518333911896, "step": 504 }, { "epoch": 0.505, "grad_norm": 1.2199615240097046, "learning_rate": 1.86875e-06, "logits/chosen": 0.141788512468338, "logits/rejected": 0.7088327407836914, "logps/chosen": -169.067138671875, "logps/rejected": -206.10000610351562, "loss": 0.7059, "rewards/accuracies": 0.625, "rewards/chosen": -0.0904262512922287, "rewards/margins": 0.08799783140420914, "rewards/rejected": -0.17842409014701843, "step": 505 }, { "epoch": 0.506, "grad_norm": 1.0423557758331299, "learning_rate": 1.8675e-06, "logits/chosen": 0.5238258838653564, "logits/rejected": 0.5962169766426086, "logps/chosen": -228.37820434570312, "logps/rejected": -187.35154724121094, "loss": 0.5643, "rewards/accuracies": 0.75, "rewards/chosen": -0.0042761266231536865, "rewards/margins": 0.34793370962142944, "rewards/rejected": -0.3522098660469055, "step": 506 }, { "epoch": 0.507, "grad_norm": 1.0101890563964844, "learning_rate": 1.86625e-06, "logits/chosen": 0.6190729141235352, "logits/rejected": 0.37238889932632446, "logps/chosen": -224.9355010986328, "logps/rejected": -177.9835968017578, "loss": 0.5919, "rewards/accuracies": 0.625, "rewards/chosen": 0.048798371106386185, "rewards/margins": 0.30845698714256287, "rewards/rejected": -0.2596586346626282, "step": 507 }, { "epoch": 0.508, "grad_norm": 2.0746734142303467, "learning_rate": 1.865e-06, "logits/chosen": 1.0190448760986328, "logits/rejected": 0.5759732723236084, "logps/chosen": -368.63604736328125, "logps/rejected": -173.12643432617188, "loss": 0.8353, "rewards/accuracies": 0.375, "rewards/chosen": -0.2698609530925751, "rewards/margins": -0.19787760078907013, "rewards/rejected": -0.07198333740234375, "step": 508 }, { "epoch": 0.509, "grad_norm": 1.3663837909698486, "learning_rate": 1.86375e-06, "logits/chosen": 0.6849511861801147, "logits/rejected": 0.21333293616771698, "logps/chosen": -180.558837890625, "logps/rejected": -162.29942321777344, "loss": 0.7564, "rewards/accuracies": 0.5, "rewards/chosen": -0.21332120895385742, "rewards/margins": -0.005486913025379181, "rewards/rejected": -0.20783430337905884, "step": 509 }, { "epoch": 0.51, "grad_norm": 1.071324110031128, "learning_rate": 1.8625e-06, "logits/chosen": 0.12415628135204315, "logits/rejected": 0.31762126088142395, "logps/chosen": -171.14100646972656, "logps/rejected": -180.7943115234375, "loss": 0.6496, "rewards/accuracies": 0.625, "rewards/chosen": -0.05057287961244583, "rewards/margins": 0.22835609316825867, "rewards/rejected": -0.2789289355278015, "step": 510 }, { "epoch": 0.511, "grad_norm": 1.0516231060028076, "learning_rate": 1.86125e-06, "logits/chosen": 0.5579717755317688, "logits/rejected": 0.9182597994804382, "logps/chosen": -176.69041442871094, "logps/rejected": -209.15689086914062, "loss": 0.5359, "rewards/accuracies": 0.75, "rewards/chosen": 0.08257189393043518, "rewards/margins": 0.4110858142375946, "rewards/rejected": -0.3285139203071594, "step": 511 }, { "epoch": 0.512, "grad_norm": 1.2335838079452515, "learning_rate": 1.86e-06, "logits/chosen": 0.5914740562438965, "logits/rejected": 0.21816937625408173, "logps/chosen": -269.8058166503906, "logps/rejected": -167.12930297851562, "loss": 0.6455, "rewards/accuracies": 0.5, "rewards/chosen": -0.014656925573945045, "rewards/margins": 0.17862138152122498, "rewards/rejected": -0.19327831268310547, "step": 512 }, { "epoch": 0.513, "grad_norm": 1.2048710584640503, "learning_rate": 1.8587499999999998e-06, "logits/chosen": 0.15371708571910858, "logits/rejected": 0.9000301957130432, "logps/chosen": -207.2163848876953, "logps/rejected": -237.83346557617188, "loss": 0.5505, "rewards/accuracies": 0.75, "rewards/chosen": 0.015834420919418335, "rewards/margins": 0.3684879541397095, "rewards/rejected": -0.35265350341796875, "step": 513 }, { "epoch": 0.514, "grad_norm": 1.5621068477630615, "learning_rate": 1.8574999999999998e-06, "logits/chosen": 1.1562763452529907, "logits/rejected": 0.33417218923568726, "logps/chosen": -344.55029296875, "logps/rejected": -199.93258666992188, "loss": 0.6608, "rewards/accuracies": 0.625, "rewards/chosen": -0.013615038245916367, "rewards/margins": 0.16363783180713654, "rewards/rejected": -0.177252858877182, "step": 514 }, { "epoch": 0.515, "grad_norm": 1.0789251327514648, "learning_rate": 1.8562499999999998e-06, "logits/chosen": 0.259402871131897, "logits/rejected": 0.7208682298660278, "logps/chosen": -266.779052734375, "logps/rejected": -165.15122985839844, "loss": 0.652, "rewards/accuracies": 0.5, "rewards/chosen": -0.03625965490937233, "rewards/margins": 0.15968942642211914, "rewards/rejected": -0.19594909250736237, "step": 515 }, { "epoch": 0.516, "grad_norm": 1.248518705368042, "learning_rate": 1.8549999999999998e-06, "logits/chosen": 0.26568281650543213, "logits/rejected": 0.4886091947555542, "logps/chosen": -201.1352996826172, "logps/rejected": -218.29330444335938, "loss": 0.4846, "rewards/accuracies": 1.0, "rewards/chosen": 0.17910295724868774, "rewards/margins": 0.4910540282726288, "rewards/rejected": -0.31195107102394104, "step": 516 }, { "epoch": 0.517, "grad_norm": 1.0984160900115967, "learning_rate": 1.8537499999999998e-06, "logits/chosen": 0.36544689536094666, "logits/rejected": 0.34085187315940857, "logps/chosen": -161.94332885742188, "logps/rejected": -186.73289489746094, "loss": 0.6214, "rewards/accuracies": 0.75, "rewards/chosen": -0.046038247644901276, "rewards/margins": 0.2628755569458008, "rewards/rejected": -0.30891379714012146, "step": 517 }, { "epoch": 0.518, "grad_norm": 1.0669794082641602, "learning_rate": 1.8525e-06, "logits/chosen": 0.6804245114326477, "logits/rejected": 1.1867188215255737, "logps/chosen": -304.23626708984375, "logps/rejected": -258.03857421875, "loss": 0.5472, "rewards/accuracies": 0.625, "rewards/chosen": 0.051842838525772095, "rewards/margins": 0.43522924184799194, "rewards/rejected": -0.38338643312454224, "step": 518 }, { "epoch": 0.519, "grad_norm": 1.0831319093704224, "learning_rate": 1.85125e-06, "logits/chosen": 0.5448541641235352, "logits/rejected": 0.47368955612182617, "logps/chosen": -176.95468139648438, "logps/rejected": -211.65463256835938, "loss": 0.5609, "rewards/accuracies": 0.75, "rewards/chosen": -0.039034321904182434, "rewards/margins": 0.37084829807281494, "rewards/rejected": -0.40988263487815857, "step": 519 }, { "epoch": 0.52, "grad_norm": 1.6819666624069214, "learning_rate": 1.85e-06, "logits/chosen": 1.0376904010772705, "logits/rejected": 0.13019517064094543, "logps/chosen": -362.4861755371094, "logps/rejected": -189.81280517578125, "loss": 0.6381, "rewards/accuracies": 0.5, "rewards/chosen": -0.08508177101612091, "rewards/margins": 0.17988377809524536, "rewards/rejected": -0.2649655342102051, "step": 520 }, { "epoch": 0.521, "grad_norm": 1.1335711479187012, "learning_rate": 1.8487499999999999e-06, "logits/chosen": 0.13412940502166748, "logits/rejected": 0.7923240661621094, "logps/chosen": -155.23110961914062, "logps/rejected": -206.27792358398438, "loss": 0.4688, "rewards/accuracies": 0.875, "rewards/chosen": 0.12022562325000763, "rewards/margins": 0.6079647541046143, "rewards/rejected": -0.48773908615112305, "step": 521 }, { "epoch": 0.522, "grad_norm": 1.2035735845565796, "learning_rate": 1.8474999999999999e-06, "logits/chosen": 0.5239711999893188, "logits/rejected": 0.7718482613563538, "logps/chosen": -188.38082885742188, "logps/rejected": -220.96633911132812, "loss": 0.5541, "rewards/accuracies": 0.75, "rewards/chosen": 0.10582685470581055, "rewards/margins": 0.3957505226135254, "rewards/rejected": -0.28992366790771484, "step": 522 }, { "epoch": 0.523, "grad_norm": 1.2098888158798218, "learning_rate": 1.84625e-06, "logits/chosen": 0.8611014485359192, "logits/rejected": 0.47080299258232117, "logps/chosen": -275.80255126953125, "logps/rejected": -186.08242797851562, "loss": 0.61, "rewards/accuracies": 0.5, "rewards/chosen": -0.04251380264759064, "rewards/margins": 0.24999254941940308, "rewards/rejected": -0.2925063669681549, "step": 523 }, { "epoch": 0.524, "grad_norm": 1.3406740427017212, "learning_rate": 1.845e-06, "logits/chosen": 1.0546916723251343, "logits/rejected": 0.5116968154907227, "logps/chosen": -302.77410888671875, "logps/rejected": -176.1361083984375, "loss": 0.6848, "rewards/accuracies": 0.625, "rewards/chosen": -0.15597963333129883, "rewards/margins": 0.12475509941577911, "rewards/rejected": -0.28073474764823914, "step": 524 }, { "epoch": 0.525, "grad_norm": 1.2580718994140625, "learning_rate": 1.84375e-06, "logits/chosen": 0.7728922963142395, "logits/rejected": 0.31162071228027344, "logps/chosen": -319.2456970214844, "logps/rejected": -184.10537719726562, "loss": 0.7138, "rewards/accuracies": 0.375, "rewards/chosen": -0.06068410724401474, "rewards/margins": 0.007483094930648804, "rewards/rejected": -0.06816720962524414, "step": 525 }, { "epoch": 0.526, "grad_norm": 1.7104111909866333, "learning_rate": 1.8425e-06, "logits/chosen": 0.4671601355075836, "logits/rejected": 0.8375613689422607, "logps/chosen": -216.33999633789062, "logps/rejected": -196.01368713378906, "loss": 0.8169, "rewards/accuracies": 0.375, "rewards/chosen": -0.20116719603538513, "rewards/margins": -0.11862479150295258, "rewards/rejected": -0.08254241943359375, "step": 526 }, { "epoch": 0.527, "grad_norm": 1.6949775218963623, "learning_rate": 1.84125e-06, "logits/chosen": 0.9625173211097717, "logits/rejected": 0.6072355508804321, "logps/chosen": -295.7554016113281, "logps/rejected": -182.9761962890625, "loss": 0.7899, "rewards/accuracies": 0.375, "rewards/chosen": -0.22402802109718323, "rewards/margins": -0.09283638000488281, "rewards/rejected": -0.13119164109230042, "step": 527 }, { "epoch": 0.528, "grad_norm": 0.9955521821975708, "learning_rate": 1.84e-06, "logits/chosen": 0.4452539086341858, "logits/rejected": 0.6899340152740479, "logps/chosen": -143.21932983398438, "logps/rejected": -192.8555908203125, "loss": 0.4532, "rewards/accuracies": 0.875, "rewards/chosen": 0.20504388213157654, "rewards/margins": 0.5861846208572388, "rewards/rejected": -0.38114070892333984, "step": 528 }, { "epoch": 0.529, "grad_norm": 1.321012020111084, "learning_rate": 1.83875e-06, "logits/chosen": 0.7106202840805054, "logits/rejected": 0.6134408712387085, "logps/chosen": -248.95606994628906, "logps/rejected": -151.026123046875, "loss": 0.6855, "rewards/accuracies": 0.625, "rewards/chosen": -0.05561751872301102, "rewards/margins": 0.08664074540138245, "rewards/rejected": -0.14225825667381287, "step": 529 }, { "epoch": 0.53, "grad_norm": 1.0926682949066162, "learning_rate": 1.8374999999999998e-06, "logits/chosen": 0.5248074531555176, "logits/rejected": 0.48727673292160034, "logps/chosen": -164.6722412109375, "logps/rejected": -183.1012725830078, "loss": 0.6168, "rewards/accuracies": 0.625, "rewards/chosen": -0.023662563413381577, "rewards/margins": 0.20655089616775513, "rewards/rejected": -0.230213463306427, "step": 530 }, { "epoch": 0.531, "grad_norm": 1.6005582809448242, "learning_rate": 1.8362499999999998e-06, "logits/chosen": 0.863574743270874, "logits/rejected": 0.7439917325973511, "logps/chosen": -200.7871856689453, "logps/rejected": -233.28067016601562, "loss": 0.7997, "rewards/accuracies": 0.5, "rewards/chosen": -0.1481301337480545, "rewards/margins": -0.06837711483240128, "rewards/rejected": -0.07975301146507263, "step": 531 }, { "epoch": 0.532, "grad_norm": 1.3001655340194702, "learning_rate": 1.8349999999999998e-06, "logits/chosen": 0.31683677434921265, "logits/rejected": 0.3533654808998108, "logps/chosen": -170.08982849121094, "logps/rejected": -147.50660705566406, "loss": 0.574, "rewards/accuracies": 0.75, "rewards/chosen": 0.013477511703968048, "rewards/margins": 0.327903151512146, "rewards/rejected": -0.31442564725875854, "step": 532 }, { "epoch": 0.533, "grad_norm": 1.3846936225891113, "learning_rate": 1.8337499999999998e-06, "logits/chosen": 1.2979599237442017, "logits/rejected": 0.15003564953804016, "logps/chosen": -363.21405029296875, "logps/rejected": -223.14288330078125, "loss": 0.7159, "rewards/accuracies": 0.5, "rewards/chosen": -0.14364595711231232, "rewards/margins": 0.026147939264774323, "rewards/rejected": -0.16979388892650604, "step": 533 }, { "epoch": 0.534, "grad_norm": 2.0537471771240234, "learning_rate": 1.8325e-06, "logits/chosen": 0.3426993489265442, "logits/rejected": 0.1594812124967575, "logps/chosen": -204.63775634765625, "logps/rejected": -182.89598083496094, "loss": 0.8338, "rewards/accuracies": 0.5, "rewards/chosen": -0.1485128402709961, "rewards/margins": -0.11131897568702698, "rewards/rejected": -0.03719387203454971, "step": 534 }, { "epoch": 0.535, "grad_norm": 1.091325283050537, "learning_rate": 1.83125e-06, "logits/chosen": 0.5720323324203491, "logits/rejected": 0.6338455080986023, "logps/chosen": -141.3292694091797, "logps/rejected": -197.17474365234375, "loss": 0.5955, "rewards/accuracies": 0.75, "rewards/chosen": 0.01657586172223091, "rewards/margins": 0.28462642431259155, "rewards/rejected": -0.26805055141448975, "step": 535 }, { "epoch": 0.536, "grad_norm": 1.0759015083312988, "learning_rate": 1.83e-06, "logits/chosen": 0.029831260442733765, "logits/rejected": 0.6252671480178833, "logps/chosen": -187.81373596191406, "logps/rejected": -272.73162841796875, "loss": 0.5655, "rewards/accuracies": 0.75, "rewards/chosen": -0.03486655652523041, "rewards/margins": 0.3643239438533783, "rewards/rejected": -0.3991905450820923, "step": 536 }, { "epoch": 0.537, "grad_norm": 1.9082720279693604, "learning_rate": 1.82875e-06, "logits/chosen": 0.6516749858856201, "logits/rejected": 0.29025572538375854, "logps/chosen": -310.3393859863281, "logps/rejected": -168.1263427734375, "loss": 0.8618, "rewards/accuracies": 0.375, "rewards/chosen": -0.2450736165046692, "rewards/margins": -0.2470172792673111, "rewards/rejected": 0.001943681389093399, "step": 537 }, { "epoch": 0.538, "grad_norm": 1.387744665145874, "learning_rate": 1.8274999999999999e-06, "logits/chosen": 1.0544770956039429, "logits/rejected": 0.015745345503091812, "logps/chosen": -319.9405517578125, "logps/rejected": -131.5377655029297, "loss": 0.6899, "rewards/accuracies": 0.5, "rewards/chosen": -0.07284346222877502, "rewards/margins": 0.18071967363357544, "rewards/rejected": -0.25356313586235046, "step": 538 }, { "epoch": 0.539, "grad_norm": 1.1182641983032227, "learning_rate": 1.8262499999999999e-06, "logits/chosen": 0.7182667851448059, "logits/rejected": 0.6490883827209473, "logps/chosen": -264.5526428222656, "logps/rejected": -185.50042724609375, "loss": 0.5992, "rewards/accuracies": 0.625, "rewards/chosen": 0.06941352039575577, "rewards/margins": 0.27646327018737793, "rewards/rejected": -0.20704975724220276, "step": 539 }, { "epoch": 0.54, "grad_norm": 1.3177313804626465, "learning_rate": 1.8249999999999999e-06, "logits/chosen": 0.5860405564308167, "logits/rejected": 0.2692539095878601, "logps/chosen": -311.8475036621094, "logps/rejected": -175.1718292236328, "loss": 0.6649, "rewards/accuracies": 0.625, "rewards/chosen": -0.0768270492553711, "rewards/margins": 0.12607936561107635, "rewards/rejected": -0.20290642976760864, "step": 540 }, { "epoch": 0.541, "grad_norm": 0.9908604621887207, "learning_rate": 1.82375e-06, "logits/chosen": 0.3484402298927307, "logits/rejected": 0.4749228060245514, "logps/chosen": -169.6615447998047, "logps/rejected": -228.04727172851562, "loss": 0.5248, "rewards/accuracies": 0.75, "rewards/chosen": 0.08033046126365662, "rewards/margins": 0.46774521470069885, "rewards/rejected": -0.38741475343704224, "step": 541 }, { "epoch": 0.542, "grad_norm": 1.497728943824768, "learning_rate": 1.8225e-06, "logits/chosen": 0.9254794716835022, "logits/rejected": 0.6451727151870728, "logps/chosen": -238.83212280273438, "logps/rejected": -193.00274658203125, "loss": 0.686, "rewards/accuracies": 0.625, "rewards/chosen": -0.10098705440759659, "rewards/margins": 0.09565077722072601, "rewards/rejected": -0.196637824177742, "step": 542 }, { "epoch": 0.543, "grad_norm": 0.9914747476577759, "learning_rate": 1.82125e-06, "logits/chosen": -0.1440151184797287, "logits/rejected": 0.5665435791015625, "logps/chosen": -98.92823791503906, "logps/rejected": -171.18753051757812, "loss": 0.5612, "rewards/accuracies": 0.875, "rewards/chosen": 0.07301750034093857, "rewards/margins": 0.3788784146308899, "rewards/rejected": -0.3058609068393707, "step": 543 }, { "epoch": 0.544, "grad_norm": 1.0557138919830322, "learning_rate": 1.82e-06, "logits/chosen": 0.8153963088989258, "logits/rejected": 0.5848681330680847, "logps/chosen": -243.7738800048828, "logps/rejected": -201.95436096191406, "loss": 0.6851, "rewards/accuracies": 0.5, "rewards/chosen": -0.08339925110340118, "rewards/margins": 0.06175171211361885, "rewards/rejected": -0.14515095949172974, "step": 544 }, { "epoch": 0.545, "grad_norm": 1.5948783159255981, "learning_rate": 1.81875e-06, "logits/chosen": 0.30357658863067627, "logits/rejected": 0.6649646759033203, "logps/chosen": -191.64352416992188, "logps/rejected": -204.9613037109375, "loss": 0.6673, "rewards/accuracies": 0.75, "rewards/chosen": -0.10481086373329163, "rewards/margins": 0.31007593870162964, "rewards/rejected": -0.41488680243492126, "step": 545 }, { "epoch": 0.546, "grad_norm": 0.8284395933151245, "learning_rate": 1.8174999999999998e-06, "logits/chosen": 0.2558647394180298, "logits/rejected": 0.6950669288635254, "logps/chosen": -110.38037872314453, "logps/rejected": -178.61508178710938, "loss": 0.5489, "rewards/accuracies": 0.75, "rewards/chosen": -0.016033653169870377, "rewards/margins": 0.4334052801132202, "rewards/rejected": -0.4494389295578003, "step": 546 }, { "epoch": 0.547, "grad_norm": 1.411900281906128, "learning_rate": 1.8162499999999998e-06, "logits/chosen": 0.7889536619186401, "logits/rejected": 0.8252739906311035, "logps/chosen": -284.90191650390625, "logps/rejected": -224.26632690429688, "loss": 0.6358, "rewards/accuracies": 0.375, "rewards/chosen": -0.06312590092420578, "rewards/margins": 0.20631131529808044, "rewards/rejected": -0.2694372236728668, "step": 547 }, { "epoch": 0.548, "grad_norm": 1.129799723625183, "learning_rate": 1.8149999999999998e-06, "logits/chosen": 0.6498042345046997, "logits/rejected": 0.4900035560131073, "logps/chosen": -204.78561401367188, "logps/rejected": -212.3568878173828, "loss": 0.5564, "rewards/accuracies": 0.75, "rewards/chosen": -0.0015934929251670837, "rewards/margins": 0.3299192488193512, "rewards/rejected": -0.3315127491950989, "step": 548 }, { "epoch": 0.549, "grad_norm": 1.0771491527557373, "learning_rate": 1.8137499999999998e-06, "logits/chosen": 0.3771512806415558, "logits/rejected": 1.3818809986114502, "logps/chosen": -148.3845977783203, "logps/rejected": -280.12652587890625, "loss": 0.5942, "rewards/accuracies": 0.75, "rewards/chosen": -0.02579483389854431, "rewards/margins": 0.3220781087875366, "rewards/rejected": -0.34787291288375854, "step": 549 }, { "epoch": 0.55, "grad_norm": 1.1413328647613525, "learning_rate": 1.8125e-06, "logits/chosen": 0.8517430424690247, "logits/rejected": 0.16039901971817017, "logps/chosen": -245.96434020996094, "logps/rejected": -180.6445770263672, "loss": 0.6904, "rewards/accuracies": 0.5, "rewards/chosen": -0.027102459222078323, "rewards/margins": 0.05177823454141617, "rewards/rejected": -0.0788806900382042, "step": 550 }, { "epoch": 0.551, "grad_norm": 0.8640431761741638, "learning_rate": 1.81125e-06, "logits/chosen": 0.42744120955467224, "logits/rejected": 0.5773760080337524, "logps/chosen": -127.28846740722656, "logps/rejected": -170.771728515625, "loss": 0.5787, "rewards/accuracies": 0.75, "rewards/chosen": -0.010936826467514038, "rewards/margins": 0.35207968950271606, "rewards/rejected": -0.3630165159702301, "step": 551 }, { "epoch": 0.552, "grad_norm": 1.434426188468933, "learning_rate": 1.81e-06, "logits/chosen": 0.9572994112968445, "logits/rejected": 0.5482254028320312, "logps/chosen": -279.372314453125, "logps/rejected": -195.20574951171875, "loss": 0.6171, "rewards/accuracies": 0.5, "rewards/chosen": -0.10219898074865341, "rewards/margins": 0.29473403096199036, "rewards/rejected": -0.396932989358902, "step": 552 }, { "epoch": 0.553, "grad_norm": 1.3059550523757935, "learning_rate": 1.80875e-06, "logits/chosen": 0.7046111822128296, "logits/rejected": 0.2625904977321625, "logps/chosen": -238.3018798828125, "logps/rejected": -211.44790649414062, "loss": 0.7215, "rewards/accuracies": 0.375, "rewards/chosen": -0.121790312230587, "rewards/margins": 0.025155816227197647, "rewards/rejected": -0.14694613218307495, "step": 553 }, { "epoch": 0.554, "grad_norm": 1.2356590032577515, "learning_rate": 1.8075e-06, "logits/chosen": 0.23424969613552094, "logits/rejected": 0.7624509334564209, "logps/chosen": -142.7352294921875, "logps/rejected": -296.685302734375, "loss": 0.649, "rewards/accuracies": 0.625, "rewards/chosen": -0.08041225373744965, "rewards/margins": 0.1642603576183319, "rewards/rejected": -0.24467259645462036, "step": 554 }, { "epoch": 0.555, "grad_norm": 1.3723255395889282, "learning_rate": 1.8062499999999999e-06, "logits/chosen": 0.44012123346328735, "logits/rejected": 1.2605739831924438, "logps/chosen": -126.12421417236328, "logps/rejected": -264.95416259765625, "loss": 0.4001, "rewards/accuracies": 1.0, "rewards/chosen": 0.1092560812830925, "rewards/margins": 0.7291345596313477, "rewards/rejected": -0.6198784708976746, "step": 555 }, { "epoch": 0.556, "grad_norm": 1.3445881605148315, "learning_rate": 1.8049999999999999e-06, "logits/chosen": 0.7125198841094971, "logits/rejected": 0.10429316014051437, "logps/chosen": -217.21141052246094, "logps/rejected": -177.35507202148438, "loss": 0.7178, "rewards/accuracies": 0.375, "rewards/chosen": -0.10482501983642578, "rewards/margins": 0.03260165452957153, "rewards/rejected": -0.13742665946483612, "step": 556 }, { "epoch": 0.557, "grad_norm": 1.3422718048095703, "learning_rate": 1.8037499999999999e-06, "logits/chosen": 0.9891902208328247, "logits/rejected": 0.4813511073589325, "logps/chosen": -328.4732971191406, "logps/rejected": -161.51417541503906, "loss": 0.7464, "rewards/accuracies": 0.375, "rewards/chosen": -0.23631659150123596, "rewards/margins": -0.05098825320601463, "rewards/rejected": -0.18532833456993103, "step": 557 }, { "epoch": 0.558, "grad_norm": 0.9069197773933411, "learning_rate": 1.8025e-06, "logits/chosen": 0.6576359272003174, "logits/rejected": 0.454481840133667, "logps/chosen": -226.1372833251953, "logps/rejected": -178.0142364501953, "loss": 0.563, "rewards/accuracies": 0.875, "rewards/chosen": 0.022047758102416992, "rewards/margins": 0.33500808477401733, "rewards/rejected": -0.31296035647392273, "step": 558 }, { "epoch": 0.559, "grad_norm": 1.2954083681106567, "learning_rate": 1.80125e-06, "logits/chosen": 0.9119006395339966, "logits/rejected": 0.35427534580230713, "logps/chosen": -306.46148681640625, "logps/rejected": -203.54385375976562, "loss": 0.703, "rewards/accuracies": 0.5, "rewards/chosen": -0.15547439455986023, "rewards/margins": 0.0405709333717823, "rewards/rejected": -0.19604529440402985, "step": 559 }, { "epoch": 0.56, "grad_norm": 1.1135534048080444, "learning_rate": 1.8e-06, "logits/chosen": 0.7153990268707275, "logits/rejected": 0.8222575187683105, "logps/chosen": -331.7166442871094, "logps/rejected": -271.278076171875, "loss": 0.6539, "rewards/accuracies": 0.5, "rewards/chosen": -0.04584836587309837, "rewards/margins": 0.18088941276073456, "rewards/rejected": -0.22673779726028442, "step": 560 }, { "epoch": 0.561, "grad_norm": 1.4387671947479248, "learning_rate": 1.79875e-06, "logits/chosen": 0.28902754187583923, "logits/rejected": 1.0729830265045166, "logps/chosen": -241.58700561523438, "logps/rejected": -250.73817443847656, "loss": 0.6739, "rewards/accuracies": 0.625, "rewards/chosen": -0.10121440887451172, "rewards/margins": 0.1332038938999176, "rewards/rejected": -0.23441830277442932, "step": 561 }, { "epoch": 0.562, "grad_norm": 1.4097422361373901, "learning_rate": 1.7975e-06, "logits/chosen": -0.060612961649894714, "logits/rejected": 0.3227797746658325, "logps/chosen": -116.93154907226562, "logps/rejected": -246.2366485595703, "loss": 0.4213, "rewards/accuracies": 1.0, "rewards/chosen": 0.19990025460720062, "rewards/margins": 0.6596012711524963, "rewards/rejected": -0.45970097184181213, "step": 562 }, { "epoch": 0.563, "grad_norm": 2.090855121612549, "learning_rate": 1.7962499999999997e-06, "logits/chosen": 1.4063630104064941, "logits/rejected": -0.010628756135702133, "logps/chosen": -414.1719970703125, "logps/rejected": -142.15127563476562, "loss": 0.8642, "rewards/accuracies": 0.25, "rewards/chosen": -0.2601586580276489, "rewards/margins": -0.2893584668636322, "rewards/rejected": 0.029199793934822083, "step": 563 }, { "epoch": 0.564, "grad_norm": 1.0946136713027954, "learning_rate": 1.7949999999999998e-06, "logits/chosen": 0.1255052536725998, "logits/rejected": 0.5536356568336487, "logps/chosen": -217.56765747070312, "logps/rejected": -252.7147979736328, "loss": 0.6404, "rewards/accuracies": 0.75, "rewards/chosen": -0.05322352051734924, "rewards/margins": 0.15950268507003784, "rewards/rejected": -0.21272622048854828, "step": 564 }, { "epoch": 0.565, "grad_norm": 1.0863999128341675, "learning_rate": 1.79375e-06, "logits/chosen": 0.5333768129348755, "logits/rejected": 0.9561200141906738, "logps/chosen": -197.46011352539062, "logps/rejected": -241.4066619873047, "loss": 0.6006, "rewards/accuracies": 0.625, "rewards/chosen": -0.013852410018444061, "rewards/margins": 0.2590827941894531, "rewards/rejected": -0.2729352116584778, "step": 565 }, { "epoch": 0.566, "grad_norm": 1.1066704988479614, "learning_rate": 1.7925e-06, "logits/chosen": 0.5507768392562866, "logits/rejected": 0.892998456954956, "logps/chosen": -156.7679901123047, "logps/rejected": -212.05410766601562, "loss": 0.5749, "rewards/accuracies": 0.75, "rewards/chosen": -0.015497205778956413, "rewards/margins": 0.33309805393218994, "rewards/rejected": -0.3485952913761139, "step": 566 }, { "epoch": 0.567, "grad_norm": 1.292858362197876, "learning_rate": 1.79125e-06, "logits/chosen": 0.2372628003358841, "logits/rejected": 1.0619182586669922, "logps/chosen": -157.68617248535156, "logps/rejected": -235.21697998046875, "loss": 0.5058, "rewards/accuracies": 0.875, "rewards/chosen": 0.02759700082242489, "rewards/margins": 0.5169894695281982, "rewards/rejected": -0.4893924593925476, "step": 567 }, { "epoch": 0.568, "grad_norm": 1.5520377159118652, "learning_rate": 1.79e-06, "logits/chosen": 0.9531646966934204, "logits/rejected": 0.5000211000442505, "logps/chosen": -242.98263549804688, "logps/rejected": -137.86105346679688, "loss": 0.7081, "rewards/accuracies": 0.375, "rewards/chosen": -0.18997907638549805, "rewards/margins": 0.06493272632360458, "rewards/rejected": -0.2549118101596832, "step": 568 }, { "epoch": 0.569, "grad_norm": 1.0514682531356812, "learning_rate": 1.78875e-06, "logits/chosen": 0.1998925805091858, "logits/rejected": -0.24016037583351135, "logps/chosen": -214.3524169921875, "logps/rejected": -180.87118530273438, "loss": 0.5751, "rewards/accuracies": 0.625, "rewards/chosen": -0.005154421553015709, "rewards/margins": 0.3426356613636017, "rewards/rejected": -0.34779009222984314, "step": 569 }, { "epoch": 0.57, "grad_norm": 1.0700112581253052, "learning_rate": 1.7875e-06, "logits/chosen": 0.4002823829650879, "logits/rejected": 0.9566532969474792, "logps/chosen": -326.58782958984375, "logps/rejected": -282.12188720703125, "loss": 0.4859, "rewards/accuracies": 1.0, "rewards/chosen": 0.05100717395544052, "rewards/margins": 0.48729220032691956, "rewards/rejected": -0.43628501892089844, "step": 570 }, { "epoch": 0.571, "grad_norm": 1.7370915412902832, "learning_rate": 1.7862499999999998e-06, "logits/chosen": 0.6602214574813843, "logits/rejected": 0.6510977149009705, "logps/chosen": -292.46771240234375, "logps/rejected": -162.83302307128906, "loss": 0.7398, "rewards/accuracies": 0.5, "rewards/chosen": -0.1318548321723938, "rewards/margins": -0.004116356372833252, "rewards/rejected": -0.12773847579956055, "step": 571 }, { "epoch": 0.572, "grad_norm": 1.5614980459213257, "learning_rate": 1.7849999999999999e-06, "logits/chosen": 0.5366610884666443, "logits/rejected": 0.6566224694252014, "logps/chosen": -145.83969116210938, "logps/rejected": -192.08656311035156, "loss": 0.6811, "rewards/accuracies": 0.625, "rewards/chosen": -0.12417258322238922, "rewards/margins": 0.1966477334499359, "rewards/rejected": -0.32082033157348633, "step": 572 }, { "epoch": 0.573, "grad_norm": 1.188032627105713, "learning_rate": 1.7837499999999999e-06, "logits/chosen": 0.7881054878234863, "logits/rejected": 0.3460056781768799, "logps/chosen": -278.6236572265625, "logps/rejected": -259.07244873046875, "loss": 0.6504, "rewards/accuracies": 0.5, "rewards/chosen": -0.059804823249578476, "rewards/margins": 0.18967333436012268, "rewards/rejected": -0.24947816133499146, "step": 573 }, { "epoch": 0.574, "grad_norm": 1.2385958433151245, "learning_rate": 1.7824999999999999e-06, "logits/chosen": 0.8162249326705933, "logits/rejected": 0.35961973667144775, "logps/chosen": -221.7299346923828, "logps/rejected": -246.23013305664062, "loss": 0.6772, "rewards/accuracies": 0.5, "rewards/chosen": -0.1582798957824707, "rewards/margins": 0.10614529252052307, "rewards/rejected": -0.2644251883029938, "step": 574 }, { "epoch": 0.575, "grad_norm": 1.5978442430496216, "learning_rate": 1.7812499999999999e-06, "logits/chosen": 0.40262699127197266, "logits/rejected": 0.5906732082366943, "logps/chosen": -215.39198303222656, "logps/rejected": -192.03221130371094, "loss": 0.6288, "rewards/accuracies": 0.75, "rewards/chosen": -0.014652866870164871, "rewards/margins": 0.25632816553115845, "rewards/rejected": -0.2709810137748718, "step": 575 }, { "epoch": 0.576, "grad_norm": 1.1202038526535034, "learning_rate": 1.78e-06, "logits/chosen": 1.168519139289856, "logits/rejected": 0.38152748346328735, "logps/chosen": -305.39312744140625, "logps/rejected": -194.37046813964844, "loss": 0.5918, "rewards/accuracies": 0.625, "rewards/chosen": -0.035419657826423645, "rewards/margins": 0.26972752809524536, "rewards/rejected": -0.3051471710205078, "step": 576 }, { "epoch": 0.577, "grad_norm": 1.1842827796936035, "learning_rate": 1.77875e-06, "logits/chosen": 0.6792956590652466, "logits/rejected": 0.18541982769966125, "logps/chosen": -238.6282958984375, "logps/rejected": -153.26373291015625, "loss": 0.5632, "rewards/accuracies": 0.75, "rewards/chosen": -0.049092672765254974, "rewards/margins": 0.37242239713668823, "rewards/rejected": -0.4215150773525238, "step": 577 }, { "epoch": 0.578, "grad_norm": 1.3471111059188843, "learning_rate": 1.7775e-06, "logits/chosen": 0.7395996451377869, "logits/rejected": 0.2852098345756531, "logps/chosen": -215.50856018066406, "logps/rejected": -165.15328979492188, "loss": 0.6168, "rewards/accuracies": 0.625, "rewards/chosen": -0.10922060161828995, "rewards/margins": 0.318654328584671, "rewards/rejected": -0.42787495255470276, "step": 578 }, { "epoch": 0.579, "grad_norm": 1.0548595190048218, "learning_rate": 1.77625e-06, "logits/chosen": 0.9245792031288147, "logits/rejected": 0.4588315486907959, "logps/chosen": -215.12338256835938, "logps/rejected": -181.52085876464844, "loss": 0.5316, "rewards/accuracies": 0.75, "rewards/chosen": 0.011512475088238716, "rewards/margins": 0.4363487958908081, "rewards/rejected": -0.42483633756637573, "step": 579 }, { "epoch": 0.58, "grad_norm": 1.2464340925216675, "learning_rate": 1.7749999999999997e-06, "logits/chosen": 0.7250610589981079, "logits/rejected": 1.0927269458770752, "logps/chosen": -197.63230895996094, "logps/rejected": -193.14881896972656, "loss": 0.372, "rewards/accuracies": 1.0, "rewards/chosen": 0.1304703652858734, "rewards/margins": 0.8135562539100647, "rewards/rejected": -0.6830858588218689, "step": 580 }, { "epoch": 0.581, "grad_norm": 1.1414377689361572, "learning_rate": 1.77375e-06, "logits/chosen": 0.8031628727912903, "logits/rejected": 0.14372047781944275, "logps/chosen": -195.65219116210938, "logps/rejected": -196.10702514648438, "loss": 0.5795, "rewards/accuracies": 0.75, "rewards/chosen": 0.03685598075389862, "rewards/margins": 0.30817222595214844, "rewards/rejected": -0.2713162302970886, "step": 581 }, { "epoch": 0.582, "grad_norm": 1.6928431987762451, "learning_rate": 1.7725e-06, "logits/chosen": 1.2033690214157104, "logits/rejected": 0.14623993635177612, "logps/chosen": -359.58721923828125, "logps/rejected": -139.76263427734375, "loss": 0.6901, "rewards/accuracies": 0.375, "rewards/chosen": -0.15004128217697144, "rewards/margins": 0.11378832161426544, "rewards/rejected": -0.2638295888900757, "step": 582 }, { "epoch": 0.583, "grad_norm": 0.9475902915000916, "learning_rate": 1.77125e-06, "logits/chosen": 0.31530264019966125, "logits/rejected": 0.23473942279815674, "logps/chosen": -145.2839813232422, "logps/rejected": -174.67770385742188, "loss": 0.4926, "rewards/accuracies": 0.75, "rewards/chosen": 0.03225899115204811, "rewards/margins": 0.5501573085784912, "rewards/rejected": -0.5178983211517334, "step": 583 }, { "epoch": 0.584, "grad_norm": 1.8497509956359863, "learning_rate": 1.77e-06, "logits/chosen": 0.8487683534622192, "logits/rejected": 0.373588502407074, "logps/chosen": -261.9595947265625, "logps/rejected": -147.77491760253906, "loss": 0.8354, "rewards/accuracies": 0.375, "rewards/chosen": -0.33281242847442627, "rewards/margins": -0.1883590966463089, "rewards/rejected": -0.14445333182811737, "step": 584 }, { "epoch": 0.585, "grad_norm": 1.0800780057907104, "learning_rate": 1.76875e-06, "logits/chosen": 0.5821106433868408, "logits/rejected": 0.38228940963745117, "logps/chosen": -198.5299072265625, "logps/rejected": -162.51675415039062, "loss": 0.5815, "rewards/accuracies": 0.625, "rewards/chosen": 0.023809147998690605, "rewards/margins": 0.3751732110977173, "rewards/rejected": -0.35136404633522034, "step": 585 }, { "epoch": 0.586, "grad_norm": 1.0433357954025269, "learning_rate": 1.7675e-06, "logits/chosen": 0.8974719643592834, "logits/rejected": 0.13674476742744446, "logps/chosen": -264.5386962890625, "logps/rejected": -164.85433959960938, "loss": 0.6392, "rewards/accuracies": 0.875, "rewards/chosen": -0.04186725616455078, "rewards/margins": 0.1696593165397644, "rewards/rejected": -0.21152658760547638, "step": 586 }, { "epoch": 0.587, "grad_norm": 1.016843557357788, "learning_rate": 1.76625e-06, "logits/chosen": 0.33615612983703613, "logits/rejected": 0.6893444657325745, "logps/chosen": -161.19007873535156, "logps/rejected": -238.76727294921875, "loss": 0.5521, "rewards/accuracies": 0.75, "rewards/chosen": -0.06931904703378677, "rewards/margins": 0.378324031829834, "rewards/rejected": -0.44764310121536255, "step": 587 }, { "epoch": 0.588, "grad_norm": 1.1982815265655518, "learning_rate": 1.7649999999999998e-06, "logits/chosen": 0.26253584027290344, "logits/rejected": 0.5734891295433044, "logps/chosen": -167.55462646484375, "logps/rejected": -223.73056030273438, "loss": 0.5007, "rewards/accuracies": 0.875, "rewards/chosen": 0.022367190569639206, "rewards/margins": 0.4699689447879791, "rewards/rejected": -0.4476017653942108, "step": 588 }, { "epoch": 0.589, "grad_norm": 1.3746821880340576, "learning_rate": 1.7637499999999998e-06, "logits/chosen": 0.820080578327179, "logits/rejected": 0.2938690185546875, "logps/chosen": -216.08250427246094, "logps/rejected": -292.96331787109375, "loss": 0.6766, "rewards/accuracies": 0.5, "rewards/chosen": -0.18301138281822205, "rewards/margins": 0.10399451851844788, "rewards/rejected": -0.2870059013366699, "step": 589 }, { "epoch": 0.59, "grad_norm": 1.0343652963638306, "learning_rate": 1.7624999999999999e-06, "logits/chosen": 0.2782900035381317, "logits/rejected": 0.17362025380134583, "logps/chosen": -158.14056396484375, "logps/rejected": -167.83021545410156, "loss": 0.5116, "rewards/accuracies": 0.75, "rewards/chosen": 0.013735488057136536, "rewards/margins": 0.5540079474449158, "rewards/rejected": -0.5402724742889404, "step": 590 }, { "epoch": 0.591, "grad_norm": 0.9770963191986084, "learning_rate": 1.7612499999999999e-06, "logits/chosen": 0.9279900789260864, "logits/rejected": 0.663773775100708, "logps/chosen": -265.84228515625, "logps/rejected": -249.68212890625, "loss": 0.5287, "rewards/accuracies": 0.625, "rewards/chosen": 0.0519678071141243, "rewards/margins": 0.45721542835235596, "rewards/rejected": -0.40524759888648987, "step": 591 }, { "epoch": 0.592, "grad_norm": 1.0160202980041504, "learning_rate": 1.7599999999999999e-06, "logits/chosen": 0.5355463027954102, "logits/rejected": 0.7247662544250488, "logps/chosen": -178.4764862060547, "logps/rejected": -228.3853759765625, "loss": 0.4977, "rewards/accuracies": 0.75, "rewards/chosen": -0.010017205029726028, "rewards/margins": 0.5549812316894531, "rewards/rejected": -0.5649985074996948, "step": 592 }, { "epoch": 0.593, "grad_norm": 1.0084673166275024, "learning_rate": 1.7587499999999999e-06, "logits/chosen": 0.19829675555229187, "logits/rejected": 0.6237442493438721, "logps/chosen": -118.66205596923828, "logps/rejected": -209.81973266601562, "loss": 0.5118, "rewards/accuracies": 0.875, "rewards/chosen": 0.0008130613714456558, "rewards/margins": 0.5294625759124756, "rewards/rejected": -0.5286495685577393, "step": 593 }, { "epoch": 0.594, "grad_norm": 2.162724494934082, "learning_rate": 1.7575e-06, "logits/chosen": 0.679324746131897, "logits/rejected": 0.3963544964790344, "logps/chosen": -280.79736328125, "logps/rejected": -163.91119384765625, "loss": 0.8555, "rewards/accuracies": 0.375, "rewards/chosen": -0.27787894010543823, "rewards/margins": -0.20374032855033875, "rewards/rejected": -0.07413863390684128, "step": 594 }, { "epoch": 0.595, "grad_norm": 1.89451265335083, "learning_rate": 1.75625e-06, "logits/chosen": 0.8960809707641602, "logits/rejected": 0.096053346991539, "logps/chosen": -451.2500305175781, "logps/rejected": -159.0491943359375, "loss": 0.623, "rewards/accuracies": 0.75, "rewards/chosen": -0.07317838817834854, "rewards/margins": 0.26463574171066284, "rewards/rejected": -0.3378141522407532, "step": 595 }, { "epoch": 0.596, "grad_norm": 1.3325767517089844, "learning_rate": 1.7549999999999997e-06, "logits/chosen": 0.4457875192165375, "logits/rejected": 0.4439559876918793, "logps/chosen": -191.5576171875, "logps/rejected": -293.19317626953125, "loss": 0.7083, "rewards/accuracies": 0.375, "rewards/chosen": -0.2010696530342102, "rewards/margins": 0.046387284994125366, "rewards/rejected": -0.24745693802833557, "step": 596 }, { "epoch": 0.597, "grad_norm": 1.6821943521499634, "learning_rate": 1.75375e-06, "logits/chosen": 0.8496333360671997, "logits/rejected": 0.13965512812137604, "logps/chosen": -266.72357177734375, "logps/rejected": -204.3828125, "loss": 0.7049, "rewards/accuracies": 0.375, "rewards/chosen": -0.16451989114284515, "rewards/margins": 0.08404932171106339, "rewards/rejected": -0.24856922030448914, "step": 597 }, { "epoch": 0.598, "grad_norm": 1.19459867477417, "learning_rate": 1.7525e-06, "logits/chosen": 0.6873576641082764, "logits/rejected": 0.8930942416191101, "logps/chosen": -322.9263610839844, "logps/rejected": -234.54583740234375, "loss": 0.4938, "rewards/accuracies": 0.875, "rewards/chosen": 0.03450965881347656, "rewards/margins": 0.5170238018035889, "rewards/rejected": -0.4825142025947571, "step": 598 }, { "epoch": 0.599, "grad_norm": 2.6131484508514404, "learning_rate": 1.75125e-06, "logits/chosen": 0.9985935688018799, "logits/rejected": 0.5168092846870422, "logps/chosen": -364.424560546875, "logps/rejected": -173.25729370117188, "loss": 0.9044, "rewards/accuracies": 0.5, "rewards/chosen": -0.3848533630371094, "rewards/margins": -0.278416246175766, "rewards/rejected": -0.10643711686134338, "step": 599 }, { "epoch": 0.6, "grad_norm": 1.4539074897766113, "learning_rate": 1.75e-06, "logits/chosen": 0.16952653229236603, "logits/rejected": 1.291298270225525, "logps/chosen": -105.81463623046875, "logps/rejected": -259.5905456542969, "loss": 0.3639, "rewards/accuracies": 1.0, "rewards/chosen": 0.10786780714988708, "rewards/margins": 0.8588839769363403, "rewards/rejected": -0.7510161399841309, "step": 600 }, { "epoch": 0.601, "grad_norm": 1.885623574256897, "learning_rate": 1.74875e-06, "logits/chosen": 0.5044308304786682, "logits/rejected": 0.3128920793533325, "logps/chosen": -209.3037109375, "logps/rejected": -166.61024475097656, "loss": 0.8715, "rewards/accuracies": 0.375, "rewards/chosen": -0.33468878269195557, "rewards/margins": -0.22849781811237335, "rewards/rejected": -0.10619096457958221, "step": 601 }, { "epoch": 0.602, "grad_norm": 1.2224737405776978, "learning_rate": 1.7475e-06, "logits/chosen": 0.30355337262153625, "logits/rejected": 0.21402737498283386, "logps/chosen": -148.68576049804688, "logps/rejected": -160.3529510498047, "loss": 0.4151, "rewards/accuracies": 0.875, "rewards/chosen": 0.11291952431201935, "rewards/margins": 0.7339346408843994, "rewards/rejected": -0.621015191078186, "step": 602 }, { "epoch": 0.603, "grad_norm": 1.0036722421646118, "learning_rate": 1.74625e-06, "logits/chosen": 0.32198774814605713, "logits/rejected": 0.5120143890380859, "logps/chosen": -179.5652618408203, "logps/rejected": -189.06134033203125, "loss": 0.4465, "rewards/accuracies": 0.875, "rewards/chosen": 0.10909032821655273, "rewards/margins": 0.646635890007019, "rewards/rejected": -0.5375456213951111, "step": 603 }, { "epoch": 0.604, "grad_norm": 1.5626026391983032, "learning_rate": 1.745e-06, "logits/chosen": 0.9306298494338989, "logits/rejected": 0.43437185883522034, "logps/chosen": -268.08258056640625, "logps/rejected": -182.66387939453125, "loss": 0.7422, "rewards/accuracies": 0.625, "rewards/chosen": -0.3010294735431671, "rewards/margins": -0.02045994997024536, "rewards/rejected": -0.28056955337524414, "step": 604 }, { "epoch": 0.605, "grad_norm": 2.2170307636260986, "learning_rate": 1.7437499999999998e-06, "logits/chosen": 0.7651162147521973, "logits/rejected": 0.7146144509315491, "logps/chosen": -269.05035400390625, "logps/rejected": -164.1693115234375, "loss": 0.8131, "rewards/accuracies": 0.5, "rewards/chosen": -0.3456314504146576, "rewards/margins": -0.1187538430094719, "rewards/rejected": -0.2268775999546051, "step": 605 }, { "epoch": 0.606, "grad_norm": 0.971372663974762, "learning_rate": 1.7424999999999998e-06, "logits/chosen": 0.2980734407901764, "logits/rejected": 0.3017364740371704, "logps/chosen": -176.53036499023438, "logps/rejected": -174.14280700683594, "loss": 0.4905, "rewards/accuracies": 0.75, "rewards/chosen": 0.10030432045459747, "rewards/margins": 0.5756558179855347, "rewards/rejected": -0.4753515124320984, "step": 606 }, { "epoch": 0.607, "grad_norm": 0.9801563620567322, "learning_rate": 1.7412499999999998e-06, "logits/chosen": 0.5832249522209167, "logits/rejected": -0.20390565693378448, "logps/chosen": -198.7496337890625, "logps/rejected": -130.44271850585938, "loss": 0.4897, "rewards/accuracies": 0.75, "rewards/chosen": 0.04495672881603241, "rewards/margins": 0.5753152370452881, "rewards/rejected": -0.5303584933280945, "step": 607 }, { "epoch": 0.608, "grad_norm": 1.136842131614685, "learning_rate": 1.7399999999999999e-06, "logits/chosen": 0.5849374532699585, "logits/rejected": 0.6898928880691528, "logps/chosen": -228.94129943847656, "logps/rejected": -210.88711547851562, "loss": 0.5665, "rewards/accuracies": 0.875, "rewards/chosen": -0.07966957986354828, "rewards/margins": 0.406995952129364, "rewards/rejected": -0.4866655170917511, "step": 608 }, { "epoch": 0.609, "grad_norm": 1.2684650421142578, "learning_rate": 1.7387499999999999e-06, "logits/chosen": 0.8838132619857788, "logits/rejected": 0.3385126292705536, "logps/chosen": -241.66107177734375, "logps/rejected": -126.99005889892578, "loss": 0.6904, "rewards/accuracies": 0.5, "rewards/chosen": -0.1654399037361145, "rewards/margins": 0.0781511664390564, "rewards/rejected": -0.2435910701751709, "step": 609 }, { "epoch": 0.61, "grad_norm": 0.9916340112686157, "learning_rate": 1.7374999999999999e-06, "logits/chosen": 0.2551823556423187, "logits/rejected": 1.158161997795105, "logps/chosen": -113.4399642944336, "logps/rejected": -230.26956176757812, "loss": 0.3773, "rewards/accuracies": 0.875, "rewards/chosen": 0.09239298105239868, "rewards/margins": 0.8609386682510376, "rewards/rejected": -0.7685456275939941, "step": 610 }, { "epoch": 0.611, "grad_norm": 1.0652607679367065, "learning_rate": 1.7362499999999999e-06, "logits/chosen": 0.25727158784866333, "logits/rejected": 0.6850873827934265, "logps/chosen": -175.3056640625, "logps/rejected": -156.10009765625, "loss": 0.4792, "rewards/accuracies": 0.75, "rewards/chosen": 0.045632652938365936, "rewards/margins": 0.5945248603820801, "rewards/rejected": -0.548892080783844, "step": 611 }, { "epoch": 0.612, "grad_norm": 1.1469817161560059, "learning_rate": 1.7350000000000001e-06, "logits/chosen": 0.42900997400283813, "logits/rejected": 0.6431488394737244, "logps/chosen": -168.66763305664062, "logps/rejected": -159.66952514648438, "loss": 0.5481, "rewards/accuracies": 0.625, "rewards/chosen": -0.14041224122047424, "rewards/margins": 0.4041622281074524, "rewards/rejected": -0.5445744395256042, "step": 612 }, { "epoch": 0.613, "grad_norm": 1.6536628007888794, "learning_rate": 1.73375e-06, "logits/chosen": 0.642346978187561, "logits/rejected": 0.2078804224729538, "logps/chosen": -252.5835723876953, "logps/rejected": -182.35064697265625, "loss": 0.7299, "rewards/accuracies": 0.375, "rewards/chosen": -0.23638497292995453, "rewards/margins": 0.04131147265434265, "rewards/rejected": -0.2776964008808136, "step": 613 }, { "epoch": 0.614, "grad_norm": 1.3757033348083496, "learning_rate": 1.7325e-06, "logits/chosen": 0.335348904132843, "logits/rejected": 0.5889415740966797, "logps/chosen": -127.64784240722656, "logps/rejected": -226.4728546142578, "loss": 0.4448, "rewards/accuracies": 0.75, "rewards/chosen": 0.10321540385484695, "rewards/margins": 0.6473432779312134, "rewards/rejected": -0.544127881526947, "step": 614 }, { "epoch": 0.615, "grad_norm": 1.6554300785064697, "learning_rate": 1.73125e-06, "logits/chosen": 0.8262001872062683, "logits/rejected": 0.14905568957328796, "logps/chosen": -208.46145629882812, "logps/rejected": -142.73406982421875, "loss": 0.6325, "rewards/accuracies": 0.625, "rewards/chosen": -0.07002735137939453, "rewards/margins": 0.317892462015152, "rewards/rejected": -0.3879197835922241, "step": 615 }, { "epoch": 0.616, "grad_norm": 1.6477717161178589, "learning_rate": 1.73e-06, "logits/chosen": 1.2537070512771606, "logits/rejected": 0.5625426769256592, "logps/chosen": -362.11029052734375, "logps/rejected": -234.2504425048828, "loss": 0.5949, "rewards/accuracies": 0.75, "rewards/chosen": -0.15337982773780823, "rewards/margins": 0.2455664575099945, "rewards/rejected": -0.39894628524780273, "step": 616 }, { "epoch": 0.617, "grad_norm": 1.005473017692566, "learning_rate": 1.72875e-06, "logits/chosen": 0.6523083448410034, "logits/rejected": 0.9884077906608582, "logps/chosen": -196.84671020507812, "logps/rejected": -215.27700805664062, "loss": 0.4788, "rewards/accuracies": 0.875, "rewards/chosen": 0.08751630783081055, "rewards/margins": 0.5606390833854675, "rewards/rejected": -0.47312280535697937, "step": 617 }, { "epoch": 0.618, "grad_norm": 1.467173457145691, "learning_rate": 1.7275e-06, "logits/chosen": 1.0574836730957031, "logits/rejected": 0.8057023286819458, "logps/chosen": -386.4236145019531, "logps/rejected": -239.62603759765625, "loss": 0.5157, "rewards/accuracies": 0.875, "rewards/chosen": -0.03792600333690643, "rewards/margins": 0.496099591255188, "rewards/rejected": -0.5340256094932556, "step": 618 }, { "epoch": 0.619, "grad_norm": 1.0582294464111328, "learning_rate": 1.72625e-06, "logits/chosen": 0.757910966873169, "logits/rejected": 0.21127353608608246, "logps/chosen": -217.70440673828125, "logps/rejected": -171.44850158691406, "loss": 0.6835, "rewards/accuracies": 0.625, "rewards/chosen": -0.07733402401208878, "rewards/margins": 0.1341821849346161, "rewards/rejected": -0.21151620149612427, "step": 619 }, { "epoch": 0.62, "grad_norm": 1.9178158044815063, "learning_rate": 1.725e-06, "logits/chosen": 0.827692985534668, "logits/rejected": 0.7501679062843323, "logps/chosen": -309.13580322265625, "logps/rejected": -190.93533325195312, "loss": 0.7491, "rewards/accuracies": 0.5, "rewards/chosen": -0.2893714904785156, "rewards/margins": 0.0033950060606002808, "rewards/rejected": -0.2927664816379547, "step": 620 }, { "epoch": 0.621, "grad_norm": 1.273333191871643, "learning_rate": 1.7237499999999998e-06, "logits/chosen": 0.32327187061309814, "logits/rejected": 1.2360018491744995, "logps/chosen": -253.3721466064453, "logps/rejected": -284.21319580078125, "loss": 0.6539, "rewards/accuracies": 0.5, "rewards/chosen": -0.08622069656848907, "rewards/margins": 0.20021483302116394, "rewards/rejected": -0.2864355146884918, "step": 621 }, { "epoch": 0.622, "grad_norm": 1.2744060754776, "learning_rate": 1.7224999999999998e-06, "logits/chosen": 0.30605819821357727, "logits/rejected": 0.9060853719711304, "logps/chosen": -234.25038146972656, "logps/rejected": -251.2718505859375, "loss": 0.5553, "rewards/accuracies": 0.75, "rewards/chosen": -0.0979493111371994, "rewards/margins": 0.3851323127746582, "rewards/rejected": -0.4830816388130188, "step": 622 }, { "epoch": 0.623, "grad_norm": 1.3908876180648804, "learning_rate": 1.7212499999999998e-06, "logits/chosen": 0.7340800166130066, "logits/rejected": 0.5360531806945801, "logps/chosen": -316.18670654296875, "logps/rejected": -201.8384552001953, "loss": 0.4811, "rewards/accuracies": 0.875, "rewards/chosen": -0.009313344955444336, "rewards/margins": 0.5249558687210083, "rewards/rejected": -0.5342692136764526, "step": 623 }, { "epoch": 0.624, "grad_norm": 1.075116753578186, "learning_rate": 1.7199999999999998e-06, "logits/chosen": 0.7536613345146179, "logits/rejected": 0.8307297825813293, "logps/chosen": -223.4132537841797, "logps/rejected": -186.45729064941406, "loss": 0.603, "rewards/accuracies": 0.625, "rewards/chosen": -0.0496399886906147, "rewards/margins": 0.3330395221710205, "rewards/rejected": -0.3826794922351837, "step": 624 }, { "epoch": 0.625, "grad_norm": 1.235116958618164, "learning_rate": 1.7187499999999998e-06, "logits/chosen": 0.8816282749176025, "logits/rejected": 0.19548992812633514, "logps/chosen": -228.17083740234375, "logps/rejected": -170.63067626953125, "loss": 0.6092, "rewards/accuracies": 0.625, "rewards/chosen": -0.052015211433172226, "rewards/margins": 0.2787264883518219, "rewards/rejected": -0.3307417035102844, "step": 625 }, { "epoch": 0.626, "grad_norm": 1.111493706703186, "learning_rate": 1.7174999999999999e-06, "logits/chosen": 0.3053998053073883, "logits/rejected": 0.7288224697113037, "logps/chosen": -198.54689025878906, "logps/rejected": -212.58863830566406, "loss": 0.6074, "rewards/accuracies": 0.75, "rewards/chosen": -0.15514709055423737, "rewards/margins": 0.2959044277667999, "rewards/rejected": -0.4510515034198761, "step": 626 }, { "epoch": 0.627, "grad_norm": 1.0642855167388916, "learning_rate": 1.7162499999999999e-06, "logits/chosen": 1.0717599391937256, "logits/rejected": 0.2096109390258789, "logps/chosen": -294.2655334472656, "logps/rejected": -170.93124389648438, "loss": 0.4903, "rewards/accuracies": 0.875, "rewards/chosen": 0.010083001106977463, "rewards/margins": 0.5183560848236084, "rewards/rejected": -0.5082730054855347, "step": 627 }, { "epoch": 0.628, "grad_norm": 1.4286178350448608, "learning_rate": 1.715e-06, "logits/chosen": 0.7422631978988647, "logits/rejected": 0.7425699830055237, "logps/chosen": -282.5257568359375, "logps/rejected": -218.55853271484375, "loss": 0.6475, "rewards/accuracies": 0.625, "rewards/chosen": -0.17031317949295044, "rewards/margins": 0.31653738021850586, "rewards/rejected": -0.4868505001068115, "step": 628 }, { "epoch": 0.629, "grad_norm": 1.589859127998352, "learning_rate": 1.7137500000000001e-06, "logits/chosen": 0.8022851943969727, "logits/rejected": 0.42277607321739197, "logps/chosen": -276.56536865234375, "logps/rejected": -184.9110565185547, "loss": 0.8027, "rewards/accuracies": 0.5, "rewards/chosen": -0.2344508171081543, "rewards/margins": -0.12481320649385452, "rewards/rejected": -0.10963759571313858, "step": 629 }, { "epoch": 0.63, "grad_norm": 1.435430645942688, "learning_rate": 1.7125e-06, "logits/chosen": 0.346598744392395, "logits/rejected": 0.7477303743362427, "logps/chosen": -231.181640625, "logps/rejected": -172.57504272460938, "loss": 0.6655, "rewards/accuracies": 0.625, "rewards/chosen": -0.19188815355300903, "rewards/margins": 0.16814343631267548, "rewards/rejected": -0.3600316047668457, "step": 630 }, { "epoch": 0.631, "grad_norm": 1.081853985786438, "learning_rate": 1.71125e-06, "logits/chosen": 0.5427528023719788, "logits/rejected": 0.6587169766426086, "logps/chosen": -230.0791015625, "logps/rejected": -186.410400390625, "loss": 0.6163, "rewards/accuracies": 0.75, "rewards/chosen": -0.15851926803588867, "rewards/margins": 0.3090033233165741, "rewards/rejected": -0.46752262115478516, "step": 631 }, { "epoch": 0.632, "grad_norm": 1.6335241794586182, "learning_rate": 1.71e-06, "logits/chosen": 0.726620078086853, "logits/rejected": 0.16273924708366394, "logps/chosen": -251.16473388671875, "logps/rejected": -161.16702270507812, "loss": 0.8401, "rewards/accuracies": 0.5, "rewards/chosen": -0.31066980957984924, "rewards/margins": -0.1301659494638443, "rewards/rejected": -0.18050384521484375, "step": 632 }, { "epoch": 0.633, "grad_norm": 1.1919530630111694, "learning_rate": 1.70875e-06, "logits/chosen": 0.36495041847229004, "logits/rejected": 0.7347344160079956, "logps/chosen": -164.83203125, "logps/rejected": -201.21932983398438, "loss": 0.5216, "rewards/accuracies": 0.75, "rewards/chosen": -0.07905589044094086, "rewards/margins": 0.6074168086051941, "rewards/rejected": -0.6864727139472961, "step": 633 }, { "epoch": 0.634, "grad_norm": 1.492375135421753, "learning_rate": 1.7075e-06, "logits/chosen": 0.4727865755558014, "logits/rejected": 0.6263105273246765, "logps/chosen": -125.82455444335938, "logps/rejected": -238.4691925048828, "loss": 0.3629, "rewards/accuracies": 1.0, "rewards/chosen": 0.2343900501728058, "rewards/margins": 0.8947431445121765, "rewards/rejected": -0.6603531241416931, "step": 634 }, { "epoch": 0.635, "grad_norm": 2.0820248126983643, "learning_rate": 1.70625e-06, "logits/chosen": 1.3536158800125122, "logits/rejected": 0.3191388249397278, "logps/chosen": -377.27734375, "logps/rejected": -175.6242218017578, "loss": 0.7502, "rewards/accuracies": 0.5, "rewards/chosen": -0.33179694414138794, "rewards/margins": -0.06204243376851082, "rewards/rejected": -0.2697544991970062, "step": 635 }, { "epoch": 0.636, "grad_norm": 1.9976575374603271, "learning_rate": 1.705e-06, "logits/chosen": 0.7412731647491455, "logits/rejected": 0.5924468636512756, "logps/chosen": -220.5921173095703, "logps/rejected": -154.74290466308594, "loss": 0.728, "rewards/accuracies": 0.375, "rewards/chosen": -0.3491371273994446, "rewards/margins": 0.04693584144115448, "rewards/rejected": -0.39607295393943787, "step": 636 }, { "epoch": 0.637, "grad_norm": 1.2444469928741455, "learning_rate": 1.70375e-06, "logits/chosen": 0.9843742847442627, "logits/rejected": 0.4352917969226837, "logps/chosen": -439.1971435546875, "logps/rejected": -214.67771911621094, "loss": 0.5793, "rewards/accuracies": 0.75, "rewards/chosen": -0.18828339874744415, "rewards/margins": 0.27801546454429626, "rewards/rejected": -0.4662988781929016, "step": 637 }, { "epoch": 0.638, "grad_norm": 1.484726071357727, "learning_rate": 1.7024999999999998e-06, "logits/chosen": 0.3476608991622925, "logits/rejected": 0.8909132480621338, "logps/chosen": -119.98409271240234, "logps/rejected": -257.69317626953125, "loss": 0.5806, "rewards/accuracies": 0.625, "rewards/chosen": -0.15939921140670776, "rewards/margins": 0.3423263430595398, "rewards/rejected": -0.5017255544662476, "step": 638 }, { "epoch": 0.639, "grad_norm": 1.1205824613571167, "learning_rate": 1.7012499999999998e-06, "logits/chosen": 0.7164477109909058, "logits/rejected": 0.20562660694122314, "logps/chosen": -219.53433227539062, "logps/rejected": -173.891357421875, "loss": 0.5713, "rewards/accuracies": 0.875, "rewards/chosen": -0.06757602095603943, "rewards/margins": 0.3239170014858246, "rewards/rejected": -0.391493022441864, "step": 639 }, { "epoch": 0.64, "grad_norm": 1.2922612428665161, "learning_rate": 1.6999999999999998e-06, "logits/chosen": 0.750927209854126, "logits/rejected": 0.7281703948974609, "logps/chosen": -331.2215576171875, "logps/rejected": -231.9676513671875, "loss": 0.631, "rewards/accuracies": 0.625, "rewards/chosen": -0.09288549423217773, "rewards/margins": 0.2849258780479431, "rewards/rejected": -0.37781137228012085, "step": 640 }, { "epoch": 0.641, "grad_norm": 1.405611515045166, "learning_rate": 1.6987499999999998e-06, "logits/chosen": 0.49989062547683716, "logits/rejected": 0.4761597812175751, "logps/chosen": -189.60986328125, "logps/rejected": -203.07791137695312, "loss": 0.7525, "rewards/accuracies": 0.5, "rewards/chosen": -0.2740132212638855, "rewards/margins": 0.028111658990383148, "rewards/rejected": -0.30212488770484924, "step": 641 }, { "epoch": 0.642, "grad_norm": 1.8327155113220215, "learning_rate": 1.6974999999999998e-06, "logits/chosen": 1.0974537134170532, "logits/rejected": 0.06364043056964874, "logps/chosen": -290.9371337890625, "logps/rejected": -145.08200073242188, "loss": 0.8203, "rewards/accuracies": 0.375, "rewards/chosen": -0.32486993074417114, "rewards/margins": -0.11023159325122833, "rewards/rejected": -0.21463832259178162, "step": 642 }, { "epoch": 0.643, "grad_norm": 1.4330918788909912, "learning_rate": 1.6962499999999999e-06, "logits/chosen": 1.3110928535461426, "logits/rejected": 0.5567178130149841, "logps/chosen": -374.49822998046875, "logps/rejected": -184.16253662109375, "loss": 0.6351, "rewards/accuracies": 0.625, "rewards/chosen": -0.1879827380180359, "rewards/margins": 0.1980431079864502, "rewards/rejected": -0.3860258460044861, "step": 643 }, { "epoch": 0.644, "grad_norm": 1.0639867782592773, "learning_rate": 1.695e-06, "logits/chosen": 0.19518430531024933, "logits/rejected": 0.5510204434394836, "logps/chosen": -153.8926544189453, "logps/rejected": -219.3041534423828, "loss": 0.6125, "rewards/accuracies": 0.625, "rewards/chosen": -0.06127748638391495, "rewards/margins": 0.383848637342453, "rewards/rejected": -0.44512614607810974, "step": 644 }, { "epoch": 0.645, "grad_norm": 1.2394709587097168, "learning_rate": 1.69375e-06, "logits/chosen": -0.09841229766607285, "logits/rejected": 0.30882707238197327, "logps/chosen": -272.10821533203125, "logps/rejected": -220.61566162109375, "loss": 0.5633, "rewards/accuracies": 0.75, "rewards/chosen": -0.14183320105075836, "rewards/margins": 0.424462229013443, "rewards/rejected": -0.5662954449653625, "step": 645 }, { "epoch": 0.646, "grad_norm": 1.5904271602630615, "learning_rate": 1.6924999999999999e-06, "logits/chosen": 0.5364912748336792, "logits/rejected": 0.3358636498451233, "logps/chosen": -298.0599060058594, "logps/rejected": -170.140380859375, "loss": 0.629, "rewards/accuracies": 0.625, "rewards/chosen": -0.1996936947107315, "rewards/margins": 0.21096451580524445, "rewards/rejected": -0.41065821051597595, "step": 646 }, { "epoch": 0.647, "grad_norm": 1.0511391162872314, "learning_rate": 1.69125e-06, "logits/chosen": 1.0233464241027832, "logits/rejected": 0.1961265653371811, "logps/chosen": -213.86428833007812, "logps/rejected": -158.924072265625, "loss": 0.6073, "rewards/accuracies": 0.875, "rewards/chosen": -0.18453273177146912, "rewards/margins": 0.2567834258079529, "rewards/rejected": -0.4413161277770996, "step": 647 }, { "epoch": 0.648, "grad_norm": 1.3003513813018799, "learning_rate": 1.69e-06, "logits/chosen": 0.2752476930618286, "logits/rejected": 0.41610610485076904, "logps/chosen": -120.14323425292969, "logps/rejected": -192.0526580810547, "loss": 0.4721, "rewards/accuracies": 0.875, "rewards/chosen": -0.10771246254444122, "rewards/margins": 0.5565762519836426, "rewards/rejected": -0.6642887592315674, "step": 648 }, { "epoch": 0.649, "grad_norm": 1.1762104034423828, "learning_rate": 1.68875e-06, "logits/chosen": 0.227717787027359, "logits/rejected": 0.40605229139328003, "logps/chosen": -128.981689453125, "logps/rejected": -202.0441436767578, "loss": 0.5723, "rewards/accuracies": 0.875, "rewards/chosen": -0.1766986846923828, "rewards/margins": 0.3939991295337677, "rewards/rejected": -0.5706977844238281, "step": 649 }, { "epoch": 0.65, "grad_norm": 1.456400752067566, "learning_rate": 1.6875e-06, "logits/chosen": 1.0110818147659302, "logits/rejected": 0.33169570565223694, "logps/chosen": -221.42767333984375, "logps/rejected": -174.84878540039062, "loss": 0.707, "rewards/accuracies": 0.5, "rewards/chosen": -0.23527786135673523, "rewards/margins": 0.10497531294822693, "rewards/rejected": -0.34025317430496216, "step": 650 }, { "epoch": 0.651, "grad_norm": 1.4026892185211182, "learning_rate": 1.68625e-06, "logits/chosen": 0.21105089783668518, "logits/rejected": 0.6472386121749878, "logps/chosen": -170.49888610839844, "logps/rejected": -224.18768310546875, "loss": 0.4326, "rewards/accuracies": 1.0, "rewards/chosen": 0.04858851432800293, "rewards/margins": 0.6627246737480164, "rewards/rejected": -0.6141360998153687, "step": 651 }, { "epoch": 0.652, "grad_norm": 1.0231022834777832, "learning_rate": 1.685e-06, "logits/chosen": 0.20704969763755798, "logits/rejected": 0.5889328718185425, "logps/chosen": -126.26939392089844, "logps/rejected": -183.8645782470703, "loss": 0.5874, "rewards/accuracies": 0.75, "rewards/chosen": -0.2049291431903839, "rewards/margins": 0.366242378950119, "rewards/rejected": -0.5711715221405029, "step": 652 }, { "epoch": 0.653, "grad_norm": 1.1778959035873413, "learning_rate": 1.68375e-06, "logits/chosen": 0.25057709217071533, "logits/rejected": 0.29926231503486633, "logps/chosen": -175.58587646484375, "logps/rejected": -218.1710662841797, "loss": 0.674, "rewards/accuracies": 0.5, "rewards/chosen": -0.23140601813793182, "rewards/margins": 0.10640545189380646, "rewards/rejected": -0.3378114700317383, "step": 653 }, { "epoch": 0.654, "grad_norm": 1.1309490203857422, "learning_rate": 1.6825e-06, "logits/chosen": 0.20329983532428741, "logits/rejected": 0.406637966632843, "logps/chosen": -150.11134338378906, "logps/rejected": -185.14862060546875, "loss": 0.4743, "rewards/accuracies": 1.0, "rewards/chosen": -0.1364750862121582, "rewards/margins": 0.517103910446167, "rewards/rejected": -0.6535789966583252, "step": 654 }, { "epoch": 0.655, "grad_norm": 1.1019415855407715, "learning_rate": 1.6812499999999998e-06, "logits/chosen": 0.756277322769165, "logits/rejected": 0.45203810930252075, "logps/chosen": -238.2572021484375, "logps/rejected": -176.67327880859375, "loss": 0.5584, "rewards/accuracies": 0.75, "rewards/chosen": -0.09822273999452591, "rewards/margins": 0.40381234884262085, "rewards/rejected": -0.5020350217819214, "step": 655 }, { "epoch": 0.656, "grad_norm": 1.3509379625320435, "learning_rate": 1.6799999999999998e-06, "logits/chosen": 0.6171531677246094, "logits/rejected": 0.3355521261692047, "logps/chosen": -175.38551330566406, "logps/rejected": -149.13351440429688, "loss": 0.682, "rewards/accuracies": 0.625, "rewards/chosen": -0.3465457558631897, "rewards/margins": 0.20109869539737701, "rewards/rejected": -0.5476444363594055, "step": 656 }, { "epoch": 0.657, "grad_norm": 1.0173254013061523, "learning_rate": 1.6787499999999998e-06, "logits/chosen": 0.31808388233184814, "logits/rejected": 0.4687625765800476, "logps/chosen": -172.28939819335938, "logps/rejected": -198.8114013671875, "loss": 0.6048, "rewards/accuracies": 0.75, "rewards/chosen": -0.06249713525176048, "rewards/margins": 0.2954968214035034, "rewards/rejected": -0.3579939603805542, "step": 657 }, { "epoch": 0.658, "grad_norm": 1.3430063724517822, "learning_rate": 1.6774999999999998e-06, "logits/chosen": 0.9033331871032715, "logits/rejected": 0.9642009139060974, "logps/chosen": -262.8448791503906, "logps/rejected": -213.96243286132812, "loss": 0.5757, "rewards/accuracies": 0.75, "rewards/chosen": -0.1913803219795227, "rewards/margins": 0.3490581512451172, "rewards/rejected": -0.5404384732246399, "step": 658 }, { "epoch": 0.659, "grad_norm": 1.4827162027359009, "learning_rate": 1.67625e-06, "logits/chosen": 0.2258588671684265, "logits/rejected": 0.28439444303512573, "logps/chosen": -157.74252319335938, "logps/rejected": -243.00384521484375, "loss": 0.8488, "rewards/accuracies": 0.375, "rewards/chosen": -0.36561354994773865, "rewards/margins": -0.11705123633146286, "rewards/rejected": -0.24856233596801758, "step": 659 }, { "epoch": 0.66, "grad_norm": 1.0030157566070557, "learning_rate": 1.675e-06, "logits/chosen": 1.0012707710266113, "logits/rejected": 0.5914648175239563, "logps/chosen": -197.07308959960938, "logps/rejected": -155.1943359375, "loss": 0.5276, "rewards/accuracies": 0.75, "rewards/chosen": -0.155442476272583, "rewards/margins": 0.4217466711997986, "rewards/rejected": -0.5771891474723816, "step": 660 }, { "epoch": 0.661, "grad_norm": 1.2637308835983276, "learning_rate": 1.67375e-06, "logits/chosen": 0.5169379115104675, "logits/rejected": 0.32676976919174194, "logps/chosen": -219.05032348632812, "logps/rejected": -210.27896118164062, "loss": 0.6235, "rewards/accuracies": 0.75, "rewards/chosen": -0.20122934877872467, "rewards/margins": 0.2219400852918625, "rewards/rejected": -0.42316943407058716, "step": 661 }, { "epoch": 0.662, "grad_norm": 1.480782389640808, "learning_rate": 1.6725e-06, "logits/chosen": 0.3970191478729248, "logits/rejected": 0.41910237073898315, "logps/chosen": -204.122314453125, "logps/rejected": -169.9698944091797, "loss": 0.785, "rewards/accuracies": 0.5, "rewards/chosen": -0.37371349334716797, "rewards/margins": -0.06244978308677673, "rewards/rejected": -0.3112637400627136, "step": 662 }, { "epoch": 0.663, "grad_norm": 1.3299139738082886, "learning_rate": 1.6712499999999999e-06, "logits/chosen": 0.19821426272392273, "logits/rejected": 1.1655802726745605, "logps/chosen": -135.2071990966797, "logps/rejected": -277.46746826171875, "loss": 0.5974, "rewards/accuracies": 0.625, "rewards/chosen": -0.2013198882341385, "rewards/margins": 0.25126391649246216, "rewards/rejected": -0.45258381962776184, "step": 663 }, { "epoch": 0.664, "grad_norm": 1.2967517375946045, "learning_rate": 1.6699999999999999e-06, "logits/chosen": 0.35758286714553833, "logits/rejected": 0.29943594336509705, "logps/chosen": -166.15335083007812, "logps/rejected": -156.671142578125, "loss": 0.5886, "rewards/accuracies": 0.75, "rewards/chosen": -0.09950952976942062, "rewards/margins": 0.42086881399154663, "rewards/rejected": -0.5203782916069031, "step": 664 }, { "epoch": 0.665, "grad_norm": 2.5349767208099365, "learning_rate": 1.66875e-06, "logits/chosen": 0.8490903377532959, "logits/rejected": 0.23771581053733826, "logps/chosen": -256.9043884277344, "logps/rejected": -132.1465606689453, "loss": 0.8869, "rewards/accuracies": 0.25, "rewards/chosen": -0.5084307193756104, "rewards/margins": -0.27304530143737793, "rewards/rejected": -0.23538541793823242, "step": 665 }, { "epoch": 0.666, "grad_norm": 1.1046247482299805, "learning_rate": 1.6675e-06, "logits/chosen": 0.5887060761451721, "logits/rejected": 0.3953006863594055, "logps/chosen": -279.0029296875, "logps/rejected": -152.6133575439453, "loss": 0.5301, "rewards/accuracies": 0.75, "rewards/chosen": -0.10241289436817169, "rewards/margins": 0.4343247711658478, "rewards/rejected": -0.5367376208305359, "step": 666 }, { "epoch": 0.667, "grad_norm": 1.7602676153182983, "learning_rate": 1.66625e-06, "logits/chosen": 0.7691593170166016, "logits/rejected": -0.04371662437915802, "logps/chosen": -288.6562194824219, "logps/rejected": -144.22116088867188, "loss": 0.7313, "rewards/accuracies": 0.625, "rewards/chosen": -0.3343995213508606, "rewards/margins": 0.09895430505275726, "rewards/rejected": -0.43335381150245667, "step": 667 }, { "epoch": 0.668, "grad_norm": 1.381635308265686, "learning_rate": 1.665e-06, "logits/chosen": 0.33619624376296997, "logits/rejected": 0.9137500524520874, "logps/chosen": -119.33733367919922, "logps/rejected": -188.05104064941406, "loss": 0.5684, "rewards/accuracies": 0.875, "rewards/chosen": -0.11756926029920578, "rewards/margins": 0.4187098741531372, "rewards/rejected": -0.5362791419029236, "step": 668 }, { "epoch": 0.669, "grad_norm": 1.5818932056427002, "learning_rate": 1.66375e-06, "logits/chosen": 0.9079129695892334, "logits/rejected": 0.46325549483299255, "logps/chosen": -259.3852844238281, "logps/rejected": -155.25462341308594, "loss": 0.7207, "rewards/accuracies": 0.5, "rewards/chosen": -0.3111768960952759, "rewards/margins": 0.11190588772296906, "rewards/rejected": -0.42308273911476135, "step": 669 }, { "epoch": 0.67, "grad_norm": 1.0987766981124878, "learning_rate": 1.6625e-06, "logits/chosen": 0.2783443331718445, "logits/rejected": 0.8250811100006104, "logps/chosen": -114.21633911132812, "logps/rejected": -204.59033203125, "loss": 0.4264, "rewards/accuracies": 0.875, "rewards/chosen": 0.08936844021081924, "rewards/margins": 0.7504366040229797, "rewards/rejected": -0.6610682010650635, "step": 670 }, { "epoch": 0.671, "grad_norm": 1.6819673776626587, "learning_rate": 1.6612499999999998e-06, "logits/chosen": 1.1332391500473022, "logits/rejected": 0.7257215976715088, "logps/chosen": -228.47463989257812, "logps/rejected": -156.68824768066406, "loss": 0.6966, "rewards/accuracies": 0.625, "rewards/chosen": -0.21503467857837677, "rewards/margins": 0.11245635896921158, "rewards/rejected": -0.32749104499816895, "step": 671 }, { "epoch": 0.672, "grad_norm": 1.246893048286438, "learning_rate": 1.6599999999999998e-06, "logits/chosen": 0.3481070101261139, "logits/rejected": 0.11708682030439377, "logps/chosen": -141.36170959472656, "logps/rejected": -215.15496826171875, "loss": 0.3955, "rewards/accuracies": 1.0, "rewards/chosen": 0.04553885757923126, "rewards/margins": 0.7698690295219421, "rewards/rejected": -0.7243301868438721, "step": 672 }, { "epoch": 0.673, "grad_norm": 1.0800520181655884, "learning_rate": 1.6587499999999998e-06, "logits/chosen": 0.1573810875415802, "logits/rejected": 0.8857356309890747, "logps/chosen": -141.53065490722656, "logps/rejected": -215.57785034179688, "loss": 0.4945, "rewards/accuracies": 0.875, "rewards/chosen": -0.13353194296360016, "rewards/margins": 0.5929074287414551, "rewards/rejected": -0.7264394164085388, "step": 673 }, { "epoch": 0.674, "grad_norm": 1.7849617004394531, "learning_rate": 1.6574999999999998e-06, "logits/chosen": 0.8099430203437805, "logits/rejected": 0.31594768166542053, "logps/chosen": -225.56556701660156, "logps/rejected": -179.7908477783203, "loss": 0.6956, "rewards/accuracies": 0.625, "rewards/chosen": -0.3615303039550781, "rewards/margins": 0.08514024317264557, "rewards/rejected": -0.4466705322265625, "step": 674 }, { "epoch": 0.675, "grad_norm": 1.0620108842849731, "learning_rate": 1.65625e-06, "logits/chosen": 0.8655974268913269, "logits/rejected": 0.49854791164398193, "logps/chosen": -259.5467529296875, "logps/rejected": -206.93479919433594, "loss": 0.5171, "rewards/accuracies": 0.75, "rewards/chosen": 0.0392819382250309, "rewards/margins": 0.5234990119934082, "rewards/rejected": -0.48421710729599, "step": 675 }, { "epoch": 0.676, "grad_norm": 1.3412753343582153, "learning_rate": 1.655e-06, "logits/chosen": 0.5691099166870117, "logits/rejected": 0.656730055809021, "logps/chosen": -220.90463256835938, "logps/rejected": -197.491943359375, "loss": 0.6085, "rewards/accuracies": 0.875, "rewards/chosen": -0.1878703087568283, "rewards/margins": 0.2871994972229004, "rewards/rejected": -0.4750698208808899, "step": 676 }, { "epoch": 0.677, "grad_norm": 1.9118224382400513, "learning_rate": 1.65375e-06, "logits/chosen": 0.9177846908569336, "logits/rejected": 0.6129404902458191, "logps/chosen": -304.5629577636719, "logps/rejected": -167.52755737304688, "loss": 0.6848, "rewards/accuracies": 0.625, "rewards/chosen": -0.31904709339141846, "rewards/margins": 0.09379369765520096, "rewards/rejected": -0.41284075379371643, "step": 677 }, { "epoch": 0.678, "grad_norm": 1.1294021606445312, "learning_rate": 1.6525e-06, "logits/chosen": 0.06339101493358612, "logits/rejected": 0.96987384557724, "logps/chosen": -182.3640594482422, "logps/rejected": -223.15174865722656, "loss": 0.4196, "rewards/accuracies": 1.0, "rewards/chosen": -0.0031331516802310944, "rewards/margins": 0.6678333282470703, "rewards/rejected": -0.6709665656089783, "step": 678 }, { "epoch": 0.679, "grad_norm": 1.6889824867248535, "learning_rate": 1.65125e-06, "logits/chosen": 0.891345739364624, "logits/rejected": 0.5585298538208008, "logps/chosen": -284.7972106933594, "logps/rejected": -192.03720092773438, "loss": 0.5537, "rewards/accuracies": 0.625, "rewards/chosen": -0.21672001481056213, "rewards/margins": 0.4019472002983093, "rewards/rejected": -0.6186672449111938, "step": 679 }, { "epoch": 0.68, "grad_norm": 1.3776360750198364, "learning_rate": 1.6499999999999999e-06, "logits/chosen": 0.7073293924331665, "logits/rejected": 0.8526109457015991, "logps/chosen": -260.99676513671875, "logps/rejected": -269.4983825683594, "loss": 0.7104, "rewards/accuracies": 0.625, "rewards/chosen": -0.2825283706188202, "rewards/margins": 0.03692641109228134, "rewards/rejected": -0.31945478916168213, "step": 680 }, { "epoch": 0.681, "grad_norm": 1.3183895349502563, "learning_rate": 1.6487499999999999e-06, "logits/chosen": 0.9450984001159668, "logits/rejected": 0.5871213674545288, "logps/chosen": -267.7979736328125, "logps/rejected": -179.2218017578125, "loss": 0.5873, "rewards/accuracies": 0.625, "rewards/chosen": -0.08197888731956482, "rewards/margins": 0.32510167360305786, "rewards/rejected": -0.4070805609226227, "step": 681 }, { "epoch": 0.682, "grad_norm": 1.2523612976074219, "learning_rate": 1.6475e-06, "logits/chosen": 0.7995237112045288, "logits/rejected": 0.29285162687301636, "logps/chosen": -206.14053344726562, "logps/rejected": -160.63839721679688, "loss": 0.5983, "rewards/accuracies": 0.5, "rewards/chosen": -0.112261101603508, "rewards/margins": 0.25657498836517334, "rewards/rejected": -0.3688361644744873, "step": 682 }, { "epoch": 0.683, "grad_norm": 1.0458556413650513, "learning_rate": 1.64625e-06, "logits/chosen": 0.3298438787460327, "logits/rejected": 0.8406169414520264, "logps/chosen": -188.3623504638672, "logps/rejected": -201.03948974609375, "loss": 0.5768, "rewards/accuracies": 0.75, "rewards/chosen": -0.22613415122032166, "rewards/margins": 0.41153085231781006, "rewards/rejected": -0.6376650333404541, "step": 683 }, { "epoch": 0.684, "grad_norm": 1.0287476778030396, "learning_rate": 1.645e-06, "logits/chosen": 0.8059996366500854, "logits/rejected": 0.35404035449028015, "logps/chosen": -219.35113525390625, "logps/rejected": -158.27334594726562, "loss": 0.4532, "rewards/accuracies": 0.875, "rewards/chosen": -0.019248005002737045, "rewards/margins": 0.6500066518783569, "rewards/rejected": -0.669254720211029, "step": 684 }, { "epoch": 0.685, "grad_norm": 1.1872116327285767, "learning_rate": 1.64375e-06, "logits/chosen": 0.4238273501396179, "logits/rejected": 0.6339011788368225, "logps/chosen": -153.7528076171875, "logps/rejected": -171.43214416503906, "loss": 0.6038, "rewards/accuracies": 0.625, "rewards/chosen": -0.158203125, "rewards/margins": 0.4184987246990204, "rewards/rejected": -0.5767018795013428, "step": 685 }, { "epoch": 0.686, "grad_norm": 1.0507184267044067, "learning_rate": 1.6425e-06, "logits/chosen": 0.15332046151161194, "logits/rejected": 0.969159722328186, "logps/chosen": -195.8888397216797, "logps/rejected": -203.1723175048828, "loss": 0.5906, "rewards/accuracies": 0.625, "rewards/chosen": -0.12686528265476227, "rewards/margins": 0.3242022693157196, "rewards/rejected": -0.4510675370693207, "step": 686 }, { "epoch": 0.687, "grad_norm": 1.4690614938735962, "learning_rate": 1.64125e-06, "logits/chosen": 0.6585624814033508, "logits/rejected": 0.3101648688316345, "logps/chosen": -309.09381103515625, "logps/rejected": -172.74444580078125, "loss": 0.5284, "rewards/accuracies": 0.75, "rewards/chosen": -0.07665872573852539, "rewards/margins": 0.4444850981235504, "rewards/rejected": -0.5211437940597534, "step": 687 }, { "epoch": 0.688, "grad_norm": 1.2337238788604736, "learning_rate": 1.6399999999999998e-06, "logits/chosen": 0.8273634314537048, "logits/rejected": 0.9420344233512878, "logps/chosen": -284.606201171875, "logps/rejected": -218.57559204101562, "loss": 0.6257, "rewards/accuracies": 0.625, "rewards/chosen": -0.15150070190429688, "rewards/margins": 0.3127359449863434, "rewards/rejected": -0.46423664689064026, "step": 688 }, { "epoch": 0.689, "grad_norm": 1.5067169666290283, "learning_rate": 1.6387499999999998e-06, "logits/chosen": 0.6511004567146301, "logits/rejected": 0.7164186835289001, "logps/chosen": -251.79986572265625, "logps/rejected": -271.4736328125, "loss": 0.8148, "rewards/accuracies": 0.5, "rewards/chosen": -0.4418000280857086, "rewards/margins": -0.14902114868164062, "rewards/rejected": -0.292778879404068, "step": 689 }, { "epoch": 0.69, "grad_norm": 1.8110663890838623, "learning_rate": 1.6374999999999998e-06, "logits/chosen": 0.8625117540359497, "logits/rejected": 0.664855420589447, "logps/chosen": -286.6794738769531, "logps/rejected": -300.2720642089844, "loss": 0.7682, "rewards/accuracies": 0.5, "rewards/chosen": -0.2603228688240051, "rewards/margins": -0.053256407380104065, "rewards/rejected": -0.20706646144390106, "step": 690 }, { "epoch": 0.691, "grad_norm": 1.608376383781433, "learning_rate": 1.63625e-06, "logits/chosen": 0.8313698768615723, "logits/rejected": 0.48316776752471924, "logps/chosen": -240.27105712890625, "logps/rejected": -232.1470184326172, "loss": 0.5071, "rewards/accuracies": 0.625, "rewards/chosen": -0.0899609625339508, "rewards/margins": 0.5320311784744263, "rewards/rejected": -0.6219921112060547, "step": 691 }, { "epoch": 0.692, "grad_norm": 0.9304317831993103, "learning_rate": 1.635e-06, "logits/chosen": 0.41197317838668823, "logits/rejected": 1.2215124368667603, "logps/chosen": -172.07302856445312, "logps/rejected": -269.4045104980469, "loss": 0.4764, "rewards/accuracies": 0.875, "rewards/chosen": -0.10532475262880325, "rewards/margins": 0.6005846261978149, "rewards/rejected": -0.7059093713760376, "step": 692 }, { "epoch": 0.693, "grad_norm": 1.993904709815979, "learning_rate": 1.63375e-06, "logits/chosen": 1.534001350402832, "logits/rejected": 0.30615901947021484, "logps/chosen": -437.77825927734375, "logps/rejected": -142.26258850097656, "loss": 0.6452, "rewards/accuracies": 0.625, "rewards/chosen": -0.27252331376075745, "rewards/margins": 0.21717195212841034, "rewards/rejected": -0.489695280790329, "step": 693 }, { "epoch": 0.694, "grad_norm": 1.8383904695510864, "learning_rate": 1.6325e-06, "logits/chosen": 0.9901219606399536, "logits/rejected": 0.42730391025543213, "logps/chosen": -282.8812561035156, "logps/rejected": -159.18429565429688, "loss": 0.653, "rewards/accuracies": 0.375, "rewards/chosen": -0.30386829376220703, "rewards/margins": 0.18611013889312744, "rewards/rejected": -0.4899784326553345, "step": 694 }, { "epoch": 0.695, "grad_norm": 1.309606909751892, "learning_rate": 1.63125e-06, "logits/chosen": 0.7051821947097778, "logits/rejected": 0.5149757266044617, "logps/chosen": -177.39585876464844, "logps/rejected": -207.51416015625, "loss": 0.5147, "rewards/accuracies": 0.75, "rewards/chosen": -0.0031028762459754944, "rewards/margins": 0.6370488405227661, "rewards/rejected": -0.6401517391204834, "step": 695 }, { "epoch": 0.696, "grad_norm": 1.201055884361267, "learning_rate": 1.6299999999999999e-06, "logits/chosen": 0.6134026050567627, "logits/rejected": 0.6371772289276123, "logps/chosen": -138.85903930664062, "logps/rejected": -175.21902465820312, "loss": 0.6549, "rewards/accuracies": 0.625, "rewards/chosen": -0.23592042922973633, "rewards/margins": 0.28280314803123474, "rewards/rejected": -0.5187236070632935, "step": 696 }, { "epoch": 0.697, "grad_norm": 1.3550466299057007, "learning_rate": 1.6287499999999999e-06, "logits/chosen": -0.08731433749198914, "logits/rejected": 0.4341522753238678, "logps/chosen": -120.07252502441406, "logps/rejected": -167.62319946289062, "loss": 0.7661, "rewards/accuracies": 0.375, "rewards/chosen": -0.36529985070228577, "rewards/margins": -0.07893520593643188, "rewards/rejected": -0.2863646447658539, "step": 697 }, { "epoch": 0.698, "grad_norm": 1.203334093093872, "learning_rate": 1.6274999999999999e-06, "logits/chosen": 1.0631163120269775, "logits/rejected": 0.39084216952323914, "logps/chosen": -274.81134033203125, "logps/rejected": -140.9290771484375, "loss": 0.5271, "rewards/accuracies": 0.75, "rewards/chosen": 0.044146448373794556, "rewards/margins": 0.50274258852005, "rewards/rejected": -0.4585961699485779, "step": 698 }, { "epoch": 0.699, "grad_norm": 1.537676453590393, "learning_rate": 1.6262499999999999e-06, "logits/chosen": 0.4939998686313629, "logits/rejected": 0.7995157837867737, "logps/chosen": -187.9720458984375, "logps/rejected": -188.88467407226562, "loss": 0.7086, "rewards/accuracies": 0.5, "rewards/chosen": -0.2977104187011719, "rewards/margins": 0.17525023221969604, "rewards/rejected": -0.4729606807231903, "step": 699 }, { "epoch": 0.7, "grad_norm": 1.2132574319839478, "learning_rate": 1.625e-06, "logits/chosen": 0.8809593319892883, "logits/rejected": 0.5064460039138794, "logps/chosen": -331.3251953125, "logps/rejected": -163.70928955078125, "loss": 0.5443, "rewards/accuracies": 0.75, "rewards/chosen": -0.11700677126646042, "rewards/margins": 0.42207908630371094, "rewards/rejected": -0.539085865020752, "step": 700 }, { "epoch": 0.701, "grad_norm": 1.1717944145202637, "learning_rate": 1.62375e-06, "logits/chosen": 0.7950129508972168, "logits/rejected": 0.6558113098144531, "logps/chosen": -267.0146484375, "logps/rejected": -189.55416870117188, "loss": 0.4399, "rewards/accuracies": 0.875, "rewards/chosen": -0.06380004435777664, "rewards/margins": 0.7404083609580994, "rewards/rejected": -0.8042083978652954, "step": 701 }, { "epoch": 0.702, "grad_norm": 1.3148914575576782, "learning_rate": 1.6225e-06, "logits/chosen": 1.220686674118042, "logits/rejected": 0.23817673325538635, "logps/chosen": -268.51470947265625, "logps/rejected": -152.30325317382812, "loss": 0.6874, "rewards/accuracies": 0.625, "rewards/chosen": -0.2700698971748352, "rewards/margins": 0.06811632961034775, "rewards/rejected": -0.33818623423576355, "step": 702 }, { "epoch": 0.703, "grad_norm": 1.2506014108657837, "learning_rate": 1.62125e-06, "logits/chosen": 0.9136346578598022, "logits/rejected": 0.3556394577026367, "logps/chosen": -277.16693115234375, "logps/rejected": -220.16929626464844, "loss": 0.6317, "rewards/accuracies": 0.625, "rewards/chosen": -0.21284905076026917, "rewards/margins": 0.2918160557746887, "rewards/rejected": -0.5046650767326355, "step": 703 }, { "epoch": 0.704, "grad_norm": 1.3226679563522339, "learning_rate": 1.62e-06, "logits/chosen": 1.0169626474380493, "logits/rejected": 0.4405118525028229, "logps/chosen": -276.83074951171875, "logps/rejected": -195.19903564453125, "loss": 0.5055, "rewards/accuracies": 0.875, "rewards/chosen": -0.13671541213989258, "rewards/margins": 0.4993707835674286, "rewards/rejected": -0.6360862255096436, "step": 704 }, { "epoch": 0.705, "grad_norm": 1.1912633180618286, "learning_rate": 1.6187499999999997e-06, "logits/chosen": 0.6026633977890015, "logits/rejected": 0.32677167654037476, "logps/chosen": -228.48316955566406, "logps/rejected": -160.0124969482422, "loss": 0.5648, "rewards/accuracies": 0.625, "rewards/chosen": -0.11356773972511292, "rewards/margins": 0.3393697738647461, "rewards/rejected": -0.452937513589859, "step": 705 }, { "epoch": 0.706, "grad_norm": 2.2307181358337402, "learning_rate": 1.6174999999999998e-06, "logits/chosen": 0.9230671525001526, "logits/rejected": 0.08098691701889038, "logps/chosen": -372.4360046386719, "logps/rejected": -140.78509521484375, "loss": 0.8198, "rewards/accuracies": 0.5, "rewards/chosen": -0.3438417315483093, "rewards/margins": -0.12725067138671875, "rewards/rejected": -0.21659107506275177, "step": 706 }, { "epoch": 0.707, "grad_norm": 1.4363453388214111, "learning_rate": 1.61625e-06, "logits/chosen": 0.9583258032798767, "logits/rejected": 0.7215851545333862, "logps/chosen": -246.3148193359375, "logps/rejected": -183.2366943359375, "loss": 0.6582, "rewards/accuracies": 0.625, "rewards/chosen": -0.140709787607193, "rewards/margins": 0.3017476201057434, "rewards/rejected": -0.442457377910614, "step": 707 }, { "epoch": 0.708, "grad_norm": 1.5791869163513184, "learning_rate": 1.615e-06, "logits/chosen": 0.4940716028213501, "logits/rejected": 1.4002926349639893, "logps/chosen": -256.02471923828125, "logps/rejected": -352.0762023925781, "loss": 0.6297, "rewards/accuracies": 0.5, "rewards/chosen": -0.30098867416381836, "rewards/margins": 0.30588656663894653, "rewards/rejected": -0.6068752408027649, "step": 708 }, { "epoch": 0.709, "grad_norm": 1.3272684812545776, "learning_rate": 1.61375e-06, "logits/chosen": 0.3630864918231964, "logits/rejected": 0.7362226843833923, "logps/chosen": -230.68356323242188, "logps/rejected": -204.7578887939453, "loss": 0.6221, "rewards/accuracies": 0.75, "rewards/chosen": -0.21094553172588348, "rewards/margins": 0.22994878888130188, "rewards/rejected": -0.44089430570602417, "step": 709 }, { "epoch": 0.71, "grad_norm": 1.4980697631835938, "learning_rate": 1.6125e-06, "logits/chosen": 0.765902042388916, "logits/rejected": 0.19500786066055298, "logps/chosen": -222.49517822265625, "logps/rejected": -190.416015625, "loss": 0.7063, "rewards/accuracies": 0.5, "rewards/chosen": -0.21314488351345062, "rewards/margins": 0.13360795378684998, "rewards/rejected": -0.3467528223991394, "step": 710 }, { "epoch": 0.711, "grad_norm": 1.5122487545013428, "learning_rate": 1.61125e-06, "logits/chosen": 0.46109968423843384, "logits/rejected": 0.8814452886581421, "logps/chosen": -187.27151489257812, "logps/rejected": -263.0892639160156, "loss": 0.6594, "rewards/accuracies": 0.625, "rewards/chosen": -0.15717488527297974, "rewards/margins": 0.14426252245903015, "rewards/rejected": -0.3014374077320099, "step": 711 }, { "epoch": 0.712, "grad_norm": 1.4221237897872925, "learning_rate": 1.61e-06, "logits/chosen": 0.5202518701553345, "logits/rejected": 0.845240592956543, "logps/chosen": -221.52345275878906, "logps/rejected": -229.47987365722656, "loss": 0.3817, "rewards/accuracies": 1.0, "rewards/chosen": 0.06952285766601562, "rewards/margins": 0.8134939670562744, "rewards/rejected": -0.7439712285995483, "step": 712 }, { "epoch": 0.713, "grad_norm": 1.2307822704315186, "learning_rate": 1.6087499999999998e-06, "logits/chosen": 0.3362438976764679, "logits/rejected": -0.0592617467045784, "logps/chosen": -238.94036865234375, "logps/rejected": -175.55465698242188, "loss": 0.4769, "rewards/accuracies": 1.0, "rewards/chosen": -0.020775794982910156, "rewards/margins": 0.5266586542129517, "rewards/rejected": -0.5474344491958618, "step": 713 }, { "epoch": 0.714, "grad_norm": 1.0698472261428833, "learning_rate": 1.6074999999999999e-06, "logits/chosen": 0.1874169409275055, "logits/rejected": 0.3976481854915619, "logps/chosen": -256.67559814453125, "logps/rejected": -223.70716857910156, "loss": 0.5183, "rewards/accuracies": 0.75, "rewards/chosen": -0.014676950871944427, "rewards/margins": 0.46430426836013794, "rewards/rejected": -0.47898122668266296, "step": 714 }, { "epoch": 0.715, "grad_norm": 1.8316980600357056, "learning_rate": 1.6062499999999999e-06, "logits/chosen": 0.8762261867523193, "logits/rejected": 0.6006833910942078, "logps/chosen": -258.51312255859375, "logps/rejected": -169.32582092285156, "loss": 0.6927, "rewards/accuracies": 0.625, "rewards/chosen": -0.371234655380249, "rewards/margins": 0.1397799551486969, "rewards/rejected": -0.5110145807266235, "step": 715 }, { "epoch": 0.716, "grad_norm": 1.2710658311843872, "learning_rate": 1.6049999999999999e-06, "logits/chosen": 0.5825903415679932, "logits/rejected": 0.9319605827331543, "logps/chosen": -200.15115356445312, "logps/rejected": -248.99319458007812, "loss": 0.4724, "rewards/accuracies": 0.875, "rewards/chosen": 0.05699019134044647, "rewards/margins": 0.683768630027771, "rewards/rejected": -0.6267783641815186, "step": 716 }, { "epoch": 0.717, "grad_norm": 1.4750862121582031, "learning_rate": 1.6037499999999999e-06, "logits/chosen": -0.33946165442466736, "logits/rejected": 0.502934455871582, "logps/chosen": -95.21710205078125, "logps/rejected": -197.10195922851562, "loss": 0.6616, "rewards/accuracies": 0.625, "rewards/chosen": -0.19740810990333557, "rewards/margins": 0.16967254877090454, "rewards/rejected": -0.3670806586742401, "step": 717 }, { "epoch": 0.718, "grad_norm": 1.1522222757339478, "learning_rate": 1.6025e-06, "logits/chosen": 0.4699358344078064, "logits/rejected": 0.2317737340927124, "logps/chosen": -269.74578857421875, "logps/rejected": -194.86178588867188, "loss": 0.4558, "rewards/accuracies": 0.875, "rewards/chosen": 0.07950381934642792, "rewards/margins": 0.6185877323150635, "rewards/rejected": -0.5390839576721191, "step": 718 }, { "epoch": 0.719, "grad_norm": 1.0902351140975952, "learning_rate": 1.60125e-06, "logits/chosen": 0.5780194401741028, "logits/rejected": 0.6795755624771118, "logps/chosen": -199.0326690673828, "logps/rejected": -177.42425537109375, "loss": 0.4548, "rewards/accuracies": 0.875, "rewards/chosen": 0.13697710633277893, "rewards/margins": 0.6811473369598389, "rewards/rejected": -0.5441702604293823, "step": 719 }, { "epoch": 0.72, "grad_norm": 1.2701936960220337, "learning_rate": 1.6e-06, "logits/chosen": 0.7551618814468384, "logits/rejected": 0.06238964945077896, "logps/chosen": -241.27723693847656, "logps/rejected": -164.26812744140625, "loss": 0.6573, "rewards/accuracies": 0.625, "rewards/chosen": -0.24328957498073578, "rewards/margins": 0.23480339348316193, "rewards/rejected": -0.4780929684638977, "step": 720 }, { "epoch": 0.721, "grad_norm": 1.4496409893035889, "learning_rate": 1.5987499999999997e-06, "logits/chosen": 0.3191937804222107, "logits/rejected": 0.7821966409683228, "logps/chosen": -160.13323974609375, "logps/rejected": -215.38046264648438, "loss": 0.6671, "rewards/accuracies": 0.75, "rewards/chosen": -0.244561567902565, "rewards/margins": 0.14729043841362, "rewards/rejected": -0.3918519914150238, "step": 721 }, { "epoch": 0.722, "grad_norm": 1.3719006776809692, "learning_rate": 1.5975e-06, "logits/chosen": 0.7663177251815796, "logits/rejected": 0.8585834503173828, "logps/chosen": -224.27273559570312, "logps/rejected": -237.51620483398438, "loss": 0.5348, "rewards/accuracies": 0.75, "rewards/chosen": -0.24553456902503967, "rewards/margins": 0.40572264790534973, "rewards/rejected": -0.6512572765350342, "step": 722 }, { "epoch": 0.723, "grad_norm": 1.5354456901550293, "learning_rate": 1.59625e-06, "logits/chosen": 1.167351245880127, "logits/rejected": 0.12067823857069016, "logps/chosen": -277.2524719238281, "logps/rejected": -318.8912353515625, "loss": 0.6397, "rewards/accuracies": 0.75, "rewards/chosen": -0.12320729345083237, "rewards/margins": 0.21159051358699799, "rewards/rejected": -0.33479779958724976, "step": 723 }, { "epoch": 0.724, "grad_norm": 1.1989909410476685, "learning_rate": 1.595e-06, "logits/chosen": 0.6808390021324158, "logits/rejected": 0.346516489982605, "logps/chosen": -209.73373413085938, "logps/rejected": -225.31777954101562, "loss": 0.4944, "rewards/accuracies": 0.875, "rewards/chosen": 0.0051816366612911224, "rewards/margins": 0.5775045156478882, "rewards/rejected": -0.5723228454589844, "step": 724 }, { "epoch": 0.725, "grad_norm": 1.5931808948516846, "learning_rate": 1.59375e-06, "logits/chosen": 0.8242297768592834, "logits/rejected": 0.6452058553695679, "logps/chosen": -207.25494384765625, "logps/rejected": -171.25811767578125, "loss": 0.532, "rewards/accuracies": 0.75, "rewards/chosen": -0.2720054090023041, "rewards/margins": 0.4440545439720154, "rewards/rejected": -0.7160600423812866, "step": 725 }, { "epoch": 0.726, "grad_norm": 1.446157693862915, "learning_rate": 1.5925e-06, "logits/chosen": 0.6427602767944336, "logits/rejected": 0.6267442107200623, "logps/chosen": -251.2432861328125, "logps/rejected": -188.19363403320312, "loss": 0.3884, "rewards/accuracies": 1.0, "rewards/chosen": 0.1512136459350586, "rewards/margins": 0.8804153800010681, "rewards/rejected": -0.7292017340660095, "step": 726 }, { "epoch": 0.727, "grad_norm": 1.6469485759735107, "learning_rate": 1.59125e-06, "logits/chosen": 0.17459037899971008, "logits/rejected": 1.0114368200302124, "logps/chosen": -125.93876647949219, "logps/rejected": -252.32850646972656, "loss": 0.7415, "rewards/accuracies": 0.375, "rewards/chosen": -0.24813929200172424, "rewards/margins": 0.05888262391090393, "rewards/rejected": -0.3070219159126282, "step": 727 }, { "epoch": 0.728, "grad_norm": 1.3332149982452393, "learning_rate": 1.59e-06, "logits/chosen": 0.48904991149902344, "logits/rejected": 0.7166186571121216, "logps/chosen": -278.22607421875, "logps/rejected": -233.47781372070312, "loss": 0.65, "rewards/accuracies": 0.625, "rewards/chosen": -0.3131164312362671, "rewards/margins": 0.14230450987815857, "rewards/rejected": -0.45542091131210327, "step": 728 }, { "epoch": 0.729, "grad_norm": 1.3860689401626587, "learning_rate": 1.58875e-06, "logits/chosen": 0.6981331706047058, "logits/rejected": 0.8177593350410461, "logps/chosen": -225.95144653320312, "logps/rejected": -267.68927001953125, "loss": 0.5762, "rewards/accuracies": 0.75, "rewards/chosen": -0.2547157108783722, "rewards/margins": 0.41567373275756836, "rewards/rejected": -0.6703894734382629, "step": 729 }, { "epoch": 0.73, "grad_norm": 1.7886799573898315, "learning_rate": 1.5874999999999998e-06, "logits/chosen": 0.5842785835266113, "logits/rejected": 0.5664003491401672, "logps/chosen": -153.9149932861328, "logps/rejected": -185.8612823486328, "loss": 0.7982, "rewards/accuracies": 0.375, "rewards/chosen": -0.4230860769748688, "rewards/margins": -0.13101176917552948, "rewards/rejected": -0.2920742928981781, "step": 730 }, { "epoch": 0.731, "grad_norm": 1.435346007347107, "learning_rate": 1.5862499999999998e-06, "logits/chosen": 0.8845757842063904, "logits/rejected": 0.8787198662757874, "logps/chosen": -220.83377075195312, "logps/rejected": -166.7554168701172, "loss": 0.7776, "rewards/accuracies": 0.625, "rewards/chosen": -0.4503912031650543, "rewards/margins": -0.04115933179855347, "rewards/rejected": -0.40923187136650085, "step": 731 }, { "epoch": 0.732, "grad_norm": 1.318712592124939, "learning_rate": 1.5849999999999999e-06, "logits/chosen": 0.3981918692588806, "logits/rejected": 1.0134177207946777, "logps/chosen": -135.9412841796875, "logps/rejected": -281.4140625, "loss": 0.5244, "rewards/accuracies": 0.875, "rewards/chosen": -0.03888101875782013, "rewards/margins": 0.42422735691070557, "rewards/rejected": -0.4631083607673645, "step": 732 }, { "epoch": 0.733, "grad_norm": 1.4163874387741089, "learning_rate": 1.5837499999999999e-06, "logits/chosen": 0.4027024209499359, "logits/rejected": 0.24069547653198242, "logps/chosen": -209.9232635498047, "logps/rejected": -168.62368774414062, "loss": 0.6478, "rewards/accuracies": 0.75, "rewards/chosen": -0.23345975577831268, "rewards/margins": 0.2419845461845398, "rewards/rejected": -0.47544431686401367, "step": 733 }, { "epoch": 0.734, "grad_norm": 1.4599624872207642, "learning_rate": 1.5824999999999999e-06, "logits/chosen": 0.7611532807350159, "logits/rejected": 0.2095411717891693, "logps/chosen": -258.32965087890625, "logps/rejected": -157.4111328125, "loss": 0.5952, "rewards/accuracies": 0.75, "rewards/chosen": -0.04864911735057831, "rewards/margins": 0.3061278462409973, "rewards/rejected": -0.3547769784927368, "step": 734 }, { "epoch": 0.735, "grad_norm": 1.5260270833969116, "learning_rate": 1.58125e-06, "logits/chosen": 1.4799976348876953, "logits/rejected": 0.24358968436717987, "logps/chosen": -489.13140869140625, "logps/rejected": -187.91275024414062, "loss": 0.5736, "rewards/accuracies": 0.625, "rewards/chosen": -0.1393706351518631, "rewards/margins": 0.3368644118309021, "rewards/rejected": -0.476235032081604, "step": 735 }, { "epoch": 0.736, "grad_norm": 1.4688773155212402, "learning_rate": 1.58e-06, "logits/chosen": 0.2972239553928375, "logits/rejected": 0.6412472724914551, "logps/chosen": -193.93333435058594, "logps/rejected": -164.6117401123047, "loss": 0.6589, "rewards/accuracies": 0.75, "rewards/chosen": -0.23341065645217896, "rewards/margins": 0.1259712278842926, "rewards/rejected": -0.35938188433647156, "step": 736 }, { "epoch": 0.737, "grad_norm": 1.2038322687149048, "learning_rate": 1.5787500000000001e-06, "logits/chosen": 0.45705559849739075, "logits/rejected": 0.6780625581741333, "logps/chosen": -193.65118408203125, "logps/rejected": -235.9606475830078, "loss": 0.4892, "rewards/accuracies": 0.75, "rewards/chosen": -0.06509742140769958, "rewards/margins": 0.6348360776901245, "rewards/rejected": -0.6999334692955017, "step": 737 }, { "epoch": 0.738, "grad_norm": 1.0801982879638672, "learning_rate": 1.5775e-06, "logits/chosen": 1.1465768814086914, "logits/rejected": 0.3702002167701721, "logps/chosen": -225.86294555664062, "logps/rejected": -199.65176391601562, "loss": 0.502, "rewards/accuracies": 0.875, "rewards/chosen": -0.004537008702754974, "rewards/margins": 0.47587263584136963, "rewards/rejected": -0.4804096221923828, "step": 738 }, { "epoch": 0.739, "grad_norm": 1.4404325485229492, "learning_rate": 1.57625e-06, "logits/chosen": 1.202430248260498, "logits/rejected": 0.8964845538139343, "logps/chosen": -208.91192626953125, "logps/rejected": -216.65194702148438, "loss": 0.7177, "rewards/accuracies": 0.625, "rewards/chosen": -0.2939854860305786, "rewards/margins": 0.06405087560415268, "rewards/rejected": -0.3580363392829895, "step": 739 }, { "epoch": 0.74, "grad_norm": 1.075452446937561, "learning_rate": 1.575e-06, "logits/chosen": 1.270569086074829, "logits/rejected": 0.7679373621940613, "logps/chosen": -399.7657165527344, "logps/rejected": -180.37237548828125, "loss": 0.4317, "rewards/accuracies": 0.75, "rewards/chosen": 0.06345997005701065, "rewards/margins": 0.8272677063941956, "rewards/rejected": -0.7638076543807983, "step": 740 }, { "epoch": 0.741, "grad_norm": 1.2958706617355347, "learning_rate": 1.57375e-06, "logits/chosen": 0.36644506454467773, "logits/rejected": 0.281933069229126, "logps/chosen": -186.335693359375, "logps/rejected": -207.37704467773438, "loss": 0.5114, "rewards/accuracies": 0.875, "rewards/chosen": -0.05128125846385956, "rewards/margins": 0.5847166180610657, "rewards/rejected": -0.6359978914260864, "step": 741 }, { "epoch": 0.742, "grad_norm": 1.321826457977295, "learning_rate": 1.5725e-06, "logits/chosen": 0.24357381463050842, "logits/rejected": 0.8854681253433228, "logps/chosen": -188.79092407226562, "logps/rejected": -269.482177734375, "loss": 0.5704, "rewards/accuracies": 0.75, "rewards/chosen": -0.26717090606689453, "rewards/margins": 0.30404412746429443, "rewards/rejected": -0.571215033531189, "step": 742 }, { "epoch": 0.743, "grad_norm": 1.236730694770813, "learning_rate": 1.57125e-06, "logits/chosen": 0.6246983408927917, "logits/rejected": 0.7233045101165771, "logps/chosen": -252.91734313964844, "logps/rejected": -209.5896453857422, "loss": 0.6166, "rewards/accuracies": 0.625, "rewards/chosen": -0.236596018075943, "rewards/margins": 0.2653410732746124, "rewards/rejected": -0.5019370913505554, "step": 743 }, { "epoch": 0.744, "grad_norm": 1.4020497798919678, "learning_rate": 1.57e-06, "logits/chosen": 1.0730022192001343, "logits/rejected": -0.009103767573833466, "logps/chosen": -335.08514404296875, "logps/rejected": -152.7361602783203, "loss": 0.5644, "rewards/accuracies": 0.625, "rewards/chosen": -0.015366651117801666, "rewards/margins": 0.35276147723197937, "rewards/rejected": -0.36812809109687805, "step": 744 }, { "epoch": 0.745, "grad_norm": 1.5694549083709717, "learning_rate": 1.56875e-06, "logits/chosen": 0.42050880193710327, "logits/rejected": 0.6780085563659668, "logps/chosen": -198.69149780273438, "logps/rejected": -224.4080352783203, "loss": 0.3567, "rewards/accuracies": 0.875, "rewards/chosen": 0.10588884353637695, "rewards/margins": 0.9256786108016968, "rewards/rejected": -0.819789707660675, "step": 745 }, { "epoch": 0.746, "grad_norm": 1.3810499906539917, "learning_rate": 1.5674999999999998e-06, "logits/chosen": 0.443477988243103, "logits/rejected": 0.7418208122253418, "logps/chosen": -175.4984588623047, "logps/rejected": -199.54318237304688, "loss": 0.6484, "rewards/accuracies": 0.625, "rewards/chosen": -0.27443087100982666, "rewards/margins": 0.26153334975242615, "rewards/rejected": -0.5359642505645752, "step": 746 }, { "epoch": 0.747, "grad_norm": 2.3283159732818604, "learning_rate": 1.5662499999999998e-06, "logits/chosen": -0.004448685795068741, "logits/rejected": 0.72901451587677, "logps/chosen": -221.5937042236328, "logps/rejected": -209.0286102294922, "loss": 0.8112, "rewards/accuracies": 0.375, "rewards/chosen": -0.4475892186164856, "rewards/margins": -0.16064806282520294, "rewards/rejected": -0.28694117069244385, "step": 747 }, { "epoch": 0.748, "grad_norm": 1.2371102571487427, "learning_rate": 1.5649999999999998e-06, "logits/chosen": 0.8797957897186279, "logits/rejected": 0.41744178533554077, "logps/chosen": -307.27545166015625, "logps/rejected": -195.3773193359375, "loss": 0.5097, "rewards/accuracies": 0.875, "rewards/chosen": -0.14138752222061157, "rewards/margins": 0.4933328926563263, "rewards/rejected": -0.6347204446792603, "step": 748 }, { "epoch": 0.749, "grad_norm": 1.3252248764038086, "learning_rate": 1.5637499999999999e-06, "logits/chosen": 0.8736483454704285, "logits/rejected": 0.7261696457862854, "logps/chosen": -371.2701721191406, "logps/rejected": -230.5, "loss": 0.5699, "rewards/accuracies": 0.875, "rewards/chosen": -0.12127400934696198, "rewards/margins": 0.3992803692817688, "rewards/rejected": -0.5205543637275696, "step": 749 }, { "epoch": 0.75, "grad_norm": 1.5043272972106934, "learning_rate": 1.5624999999999999e-06, "logits/chosen": 0.5604843497276306, "logits/rejected": 0.5896613597869873, "logps/chosen": -239.43460083007812, "logps/rejected": -165.2947540283203, "loss": 0.6674, "rewards/accuracies": 0.625, "rewards/chosen": -0.30483803153038025, "rewards/margins": 0.17462575435638428, "rewards/rejected": -0.47946375608444214, "step": 750 }, { "epoch": 0.751, "grad_norm": 1.325305700302124, "learning_rate": 1.5612499999999999e-06, "logits/chosen": 0.7745131254196167, "logits/rejected": 0.5418330430984497, "logps/chosen": -259.0572814941406, "logps/rejected": -212.76007080078125, "loss": 0.5761, "rewards/accuracies": 0.75, "rewards/chosen": -0.11845226585865021, "rewards/margins": 0.37342962622642517, "rewards/rejected": -0.4918818473815918, "step": 751 }, { "epoch": 0.752, "grad_norm": 1.140007734298706, "learning_rate": 1.5599999999999999e-06, "logits/chosen": 0.8020329475402832, "logits/rejected": 0.6756629943847656, "logps/chosen": -217.8274688720703, "logps/rejected": -157.33319091796875, "loss": 0.4597, "rewards/accuracies": 0.875, "rewards/chosen": -0.0007083974778652191, "rewards/margins": 0.6528151631355286, "rewards/rejected": -0.6535235643386841, "step": 752 }, { "epoch": 0.753, "grad_norm": 1.2172952890396118, "learning_rate": 1.5587500000000001e-06, "logits/chosen": 0.8520956039428711, "logits/rejected": 0.9496738314628601, "logps/chosen": -263.5419006347656, "logps/rejected": -247.14849853515625, "loss": 0.595, "rewards/accuracies": 0.625, "rewards/chosen": -0.03291292488574982, "rewards/margins": 0.26747849583625793, "rewards/rejected": -0.30039146542549133, "step": 753 }, { "epoch": 0.754, "grad_norm": 2.8444268703460693, "learning_rate": 1.5575000000000001e-06, "logits/chosen": 0.882819652557373, "logits/rejected": 0.33799073100090027, "logps/chosen": -334.3796691894531, "logps/rejected": -144.58663940429688, "loss": 0.99, "rewards/accuracies": 0.25, "rewards/chosen": -0.4954530596733093, "rewards/margins": -0.42435914278030396, "rewards/rejected": -0.07109394669532776, "step": 754 }, { "epoch": 0.755, "grad_norm": 1.3540642261505127, "learning_rate": 1.55625e-06, "logits/chosen": 0.7659162878990173, "logits/rejected": 0.5379577279090881, "logps/chosen": -252.84140014648438, "logps/rejected": -259.7785949707031, "loss": 0.6636, "rewards/accuracies": 0.625, "rewards/chosen": -0.12812824547290802, "rewards/margins": 0.26035624742507935, "rewards/rejected": -0.38848447799682617, "step": 755 }, { "epoch": 0.756, "grad_norm": 1.1994717121124268, "learning_rate": 1.555e-06, "logits/chosen": 0.7927484512329102, "logits/rejected": 0.5752880573272705, "logps/chosen": -164.84201049804688, "logps/rejected": -182.4931182861328, "loss": 0.6155, "rewards/accuracies": 0.625, "rewards/chosen": -0.27252763509750366, "rewards/margins": 0.37136149406433105, "rewards/rejected": -0.6438891887664795, "step": 756 }, { "epoch": 0.757, "grad_norm": 1.4153848886489868, "learning_rate": 1.55375e-06, "logits/chosen": 0.5892941355705261, "logits/rejected": 0.6727914810180664, "logps/chosen": -199.3570098876953, "logps/rejected": -218.2960662841797, "loss": 0.662, "rewards/accuracies": 0.625, "rewards/chosen": -0.2605397701263428, "rewards/margins": 0.1716597080230713, "rewards/rejected": -0.43219947814941406, "step": 757 }, { "epoch": 0.758, "grad_norm": 1.3538659811019897, "learning_rate": 1.5525e-06, "logits/chosen": 0.6541364192962646, "logits/rejected": 1.1165982484817505, "logps/chosen": -236.3895721435547, "logps/rejected": -255.580322265625, "loss": 0.52, "rewards/accuracies": 0.625, "rewards/chosen": -0.17224597930908203, "rewards/margins": 0.5660436153411865, "rewards/rejected": -0.7382897138595581, "step": 758 }, { "epoch": 0.759, "grad_norm": 1.4816436767578125, "learning_rate": 1.55125e-06, "logits/chosen": 0.6137555837631226, "logits/rejected": 0.81046462059021, "logps/chosen": -220.4472198486328, "logps/rejected": -220.1424560546875, "loss": 0.5421, "rewards/accuracies": 0.75, "rewards/chosen": -0.2290482521057129, "rewards/margins": 0.40336018800735474, "rewards/rejected": -0.6324084401130676, "step": 759 }, { "epoch": 0.76, "grad_norm": 1.6415830850601196, "learning_rate": 1.55e-06, "logits/chosen": 0.5375827550888062, "logits/rejected": 0.8706777095794678, "logps/chosen": -206.4009552001953, "logps/rejected": -273.5037536621094, "loss": 0.6327, "rewards/accuracies": 0.75, "rewards/chosen": -0.23001977801322937, "rewards/margins": 0.25016871094703674, "rewards/rejected": -0.4801884889602661, "step": 760 }, { "epoch": 0.761, "grad_norm": 1.4463003873825073, "learning_rate": 1.54875e-06, "logits/chosen": 0.8639179468154907, "logits/rejected": 1.2593750953674316, "logps/chosen": -217.44747924804688, "logps/rejected": -216.78294372558594, "loss": 0.8248, "rewards/accuracies": 0.375, "rewards/chosen": -0.5589588284492493, "rewards/margins": -0.11741162091493607, "rewards/rejected": -0.4415472149848938, "step": 761 }, { "epoch": 0.762, "grad_norm": 1.3738993406295776, "learning_rate": 1.5475e-06, "logits/chosen": 0.09218085557222366, "logits/rejected": 0.552115797996521, "logps/chosen": -157.78750610351562, "logps/rejected": -224.25909423828125, "loss": 0.5889, "rewards/accuracies": 0.625, "rewards/chosen": -0.289254754781723, "rewards/margins": 0.32429301738739014, "rewards/rejected": -0.6135478019714355, "step": 762 }, { "epoch": 0.763, "grad_norm": 1.541743278503418, "learning_rate": 1.5462499999999998e-06, "logits/chosen": 0.9419839382171631, "logits/rejected": 0.9134670495986938, "logps/chosen": -286.8064270019531, "logps/rejected": -215.81890869140625, "loss": 0.6727, "rewards/accuracies": 0.75, "rewards/chosen": -0.1976357400417328, "rewards/margins": 0.1999179869890213, "rewards/rejected": -0.3975537419319153, "step": 763 }, { "epoch": 0.764, "grad_norm": 1.3499782085418701, "learning_rate": 1.5449999999999998e-06, "logits/chosen": 0.8977514505386353, "logits/rejected": 0.41746655106544495, "logps/chosen": -298.4237060546875, "logps/rejected": -182.33340454101562, "loss": 0.5108, "rewards/accuracies": 0.75, "rewards/chosen": -0.09747782349586487, "rewards/margins": 0.5057858824729919, "rewards/rejected": -0.6032636761665344, "step": 764 }, { "epoch": 0.765, "grad_norm": 1.4757037162780762, "learning_rate": 1.5437499999999998e-06, "logits/chosen": 0.5261470079421997, "logits/rejected": 0.708360493183136, "logps/chosen": -183.702392578125, "logps/rejected": -178.88955688476562, "loss": 0.7331, "rewards/accuracies": 0.625, "rewards/chosen": -0.3412853479385376, "rewards/margins": 0.004340723156929016, "rewards/rejected": -0.3456260561943054, "step": 765 }, { "epoch": 0.766, "grad_norm": 1.2997450828552246, "learning_rate": 1.5424999999999998e-06, "logits/chosen": 0.9780928492546082, "logits/rejected": 0.2453433871269226, "logps/chosen": -287.72711181640625, "logps/rejected": -173.9216766357422, "loss": 0.5007, "rewards/accuracies": 0.875, "rewards/chosen": 0.026446733623743057, "rewards/margins": 0.5206562280654907, "rewards/rejected": -0.4942094683647156, "step": 766 }, { "epoch": 0.767, "grad_norm": 1.1930928230285645, "learning_rate": 1.5412499999999999e-06, "logits/chosen": 0.8154456615447998, "logits/rejected": 0.4985177516937256, "logps/chosen": -265.0450744628906, "logps/rejected": -138.09011840820312, "loss": 0.5023, "rewards/accuracies": 0.75, "rewards/chosen": -0.06663933396339417, "rewards/margins": 0.5110476613044739, "rewards/rejected": -0.5776870250701904, "step": 767 }, { "epoch": 0.768, "grad_norm": 1.585509181022644, "learning_rate": 1.5399999999999999e-06, "logits/chosen": 0.4154246151447296, "logits/rejected": 0.9698058366775513, "logps/chosen": -157.31268310546875, "logps/rejected": -219.0492706298828, "loss": 0.5912, "rewards/accuracies": 0.75, "rewards/chosen": -0.22724609076976776, "rewards/margins": 0.4033297598361969, "rewards/rejected": -0.6305758357048035, "step": 768 }, { "epoch": 0.769, "grad_norm": 2.0741958618164062, "learning_rate": 1.53875e-06, "logits/chosen": 0.7868121862411499, "logits/rejected": 0.42737218737602234, "logps/chosen": -259.45819091796875, "logps/rejected": -144.81512451171875, "loss": 0.8374, "rewards/accuracies": 0.5, "rewards/chosen": -0.33347997069358826, "rewards/margins": -0.03941202163696289, "rewards/rejected": -0.29406797885894775, "step": 769 }, { "epoch": 0.77, "grad_norm": 1.6663457155227661, "learning_rate": 1.5375e-06, "logits/chosen": 0.4609401822090149, "logits/rejected": 0.9765943884849548, "logps/chosen": -133.75662231445312, "logps/rejected": -204.86988830566406, "loss": 0.5605, "rewards/accuracies": 0.75, "rewards/chosen": -0.18968161940574646, "rewards/margins": 0.412254273891449, "rewards/rejected": -0.601935863494873, "step": 770 }, { "epoch": 0.771, "grad_norm": 1.2057814598083496, "learning_rate": 1.53625e-06, "logits/chosen": 0.5932042598724365, "logits/rejected": 0.5166403651237488, "logps/chosen": -209.23568725585938, "logps/rejected": -170.67779541015625, "loss": 0.6045, "rewards/accuracies": 0.75, "rewards/chosen": -0.12625141441822052, "rewards/margins": 0.32395339012145996, "rewards/rejected": -0.4502047598361969, "step": 771 }, { "epoch": 0.772, "grad_norm": 1.8328189849853516, "learning_rate": 1.535e-06, "logits/chosen": 0.6493757963180542, "logits/rejected": 0.2763243615627289, "logps/chosen": -163.8371124267578, "logps/rejected": -223.5072021484375, "loss": 0.5172, "rewards/accuracies": 0.75, "rewards/chosen": -0.07180938869714737, "rewards/margins": 0.4770936369895935, "rewards/rejected": -0.5489029884338379, "step": 772 }, { "epoch": 0.773, "grad_norm": 1.3096734285354614, "learning_rate": 1.53375e-06, "logits/chosen": 0.36874744296073914, "logits/rejected": 0.7621723413467407, "logps/chosen": -164.24822998046875, "logps/rejected": -187.83749389648438, "loss": 0.5317, "rewards/accuracies": 0.875, "rewards/chosen": -0.19455794990062714, "rewards/margins": 0.38444215059280396, "rewards/rejected": -0.5790001153945923, "step": 773 }, { "epoch": 0.774, "grad_norm": 1.4323984384536743, "learning_rate": 1.5325e-06, "logits/chosen": 0.11071226000785828, "logits/rejected": 0.7504755258560181, "logps/chosen": -206.2627716064453, "logps/rejected": -205.74600219726562, "loss": 0.6053, "rewards/accuracies": 0.75, "rewards/chosen": -0.14244261384010315, "rewards/margins": 0.36120253801345825, "rewards/rejected": -0.5036451816558838, "step": 774 }, { "epoch": 0.775, "grad_norm": 1.827346920967102, "learning_rate": 1.53125e-06, "logits/chosen": 0.8071094751358032, "logits/rejected": 0.5418567657470703, "logps/chosen": -195.22732543945312, "logps/rejected": -273.976318359375, "loss": 0.7662, "rewards/accuracies": 0.625, "rewards/chosen": -0.3980606198310852, "rewards/margins": 0.0016422197222709656, "rewards/rejected": -0.3997028172016144, "step": 775 }, { "epoch": 0.776, "grad_norm": 1.2556040287017822, "learning_rate": 1.53e-06, "logits/chosen": 0.7525615692138672, "logits/rejected": 1.056390404701233, "logps/chosen": -160.5291748046875, "logps/rejected": -231.32662963867188, "loss": 0.5667, "rewards/accuracies": 0.625, "rewards/chosen": -0.13983382284641266, "rewards/margins": 0.42714083194732666, "rewards/rejected": -0.5669746994972229, "step": 776 }, { "epoch": 0.777, "grad_norm": 1.4989293813705444, "learning_rate": 1.52875e-06, "logits/chosen": 1.1420445442199707, "logits/rejected": 0.44776397943496704, "logps/chosen": -301.7631530761719, "logps/rejected": -141.0514373779297, "loss": 0.6233, "rewards/accuracies": 0.75, "rewards/chosen": -0.2332446128129959, "rewards/margins": 0.3122140169143677, "rewards/rejected": -0.5454585552215576, "step": 777 }, { "epoch": 0.778, "grad_norm": 1.3262202739715576, "learning_rate": 1.5275e-06, "logits/chosen": 0.38140425086021423, "logits/rejected": 1.0907573699951172, "logps/chosen": -150.4269561767578, "logps/rejected": -195.86746215820312, "loss": 0.4896, "rewards/accuracies": 0.875, "rewards/chosen": -0.18025383353233337, "rewards/margins": 0.5489279627799988, "rewards/rejected": -0.7291817665100098, "step": 778 }, { "epoch": 0.779, "grad_norm": 1.294095516204834, "learning_rate": 1.52625e-06, "logits/chosen": 0.5574647188186646, "logits/rejected": 0.3184836208820343, "logps/chosen": -152.60263061523438, "logps/rejected": -178.30853271484375, "loss": 0.7155, "rewards/accuracies": 0.625, "rewards/chosen": -0.16072475910186768, "rewards/margins": 0.13800381124019623, "rewards/rejected": -0.2987285554409027, "step": 779 }, { "epoch": 0.78, "grad_norm": 1.2552924156188965, "learning_rate": 1.5249999999999998e-06, "logits/chosen": 0.6440221667289734, "logits/rejected": 0.5480654835700989, "logps/chosen": -180.00950622558594, "logps/rejected": -191.9711151123047, "loss": 0.4841, "rewards/accuracies": 0.875, "rewards/chosen": -0.028660684823989868, "rewards/margins": 0.5596901774406433, "rewards/rejected": -0.5883508324623108, "step": 780 }, { "epoch": 0.781, "grad_norm": 1.3583394289016724, "learning_rate": 1.5237499999999998e-06, "logits/chosen": 0.5067780017852783, "logits/rejected": 0.1420191526412964, "logps/chosen": -234.71475219726562, "logps/rejected": -185.09686279296875, "loss": 0.5103, "rewards/accuracies": 1.0, "rewards/chosen": -0.07258254289627075, "rewards/margins": 0.4608457088470459, "rewards/rejected": -0.5334281921386719, "step": 781 }, { "epoch": 0.782, "grad_norm": 1.2055169343948364, "learning_rate": 1.5224999999999998e-06, "logits/chosen": 0.14802466332912445, "logits/rejected": 1.1153855323791504, "logps/chosen": -145.94302368164062, "logps/rejected": -240.89967346191406, "loss": 0.6029, "rewards/accuracies": 0.625, "rewards/chosen": -0.13801996409893036, "rewards/margins": 0.434284508228302, "rewards/rejected": -0.5723044872283936, "step": 782 }, { "epoch": 0.783, "grad_norm": 1.301399827003479, "learning_rate": 1.5212499999999998e-06, "logits/chosen": 0.6443755626678467, "logits/rejected": 0.3890560567378998, "logps/chosen": -196.5700225830078, "logps/rejected": -164.60533142089844, "loss": 0.6278, "rewards/accuracies": 0.75, "rewards/chosen": -0.20959226787090302, "rewards/margins": 0.3137778341770172, "rewards/rejected": -0.523370087146759, "step": 783 }, { "epoch": 0.784, "grad_norm": 1.1585192680358887, "learning_rate": 1.5199999999999998e-06, "logits/chosen": 0.9598143100738525, "logits/rejected": 0.3689217269420624, "logps/chosen": -326.3083190917969, "logps/rejected": -154.89930725097656, "loss": 0.5183, "rewards/accuracies": 0.875, "rewards/chosen": 0.13334302604198456, "rewards/margins": 0.47690349817276, "rewards/rejected": -0.34356048703193665, "step": 784 }, { "epoch": 0.785, "grad_norm": 1.2322452068328857, "learning_rate": 1.51875e-06, "logits/chosen": 0.3838818669319153, "logits/rejected": 0.5828351378440857, "logps/chosen": -245.32821655273438, "logps/rejected": -184.29234313964844, "loss": 0.6984, "rewards/accuracies": 0.75, "rewards/chosen": -0.21832275390625, "rewards/margins": 0.13440591096878052, "rewards/rejected": -0.3527286648750305, "step": 785 }, { "epoch": 0.786, "grad_norm": 1.3386081457138062, "learning_rate": 1.5175e-06, "logits/chosen": 0.9431249499320984, "logits/rejected": 1.0793761014938354, "logps/chosen": -258.467041015625, "logps/rejected": -211.68832397460938, "loss": 0.5933, "rewards/accuracies": 0.625, "rewards/chosen": -0.08865384757518768, "rewards/margins": 0.30847465991973877, "rewards/rejected": -0.39712849259376526, "step": 786 }, { "epoch": 0.787, "grad_norm": 1.5502439737319946, "learning_rate": 1.51625e-06, "logits/chosen": 1.0413932800292969, "logits/rejected": 0.6569304466247559, "logps/chosen": -288.59832763671875, "logps/rejected": -166.20762634277344, "loss": 0.6471, "rewards/accuracies": 0.75, "rewards/chosen": -0.26588231325149536, "rewards/margins": 0.3138413727283478, "rewards/rejected": -0.5797236561775208, "step": 787 }, { "epoch": 0.788, "grad_norm": 1.3097540140151978, "learning_rate": 1.515e-06, "logits/chosen": 0.8081635236740112, "logits/rejected": 0.33847174048423767, "logps/chosen": -283.52056884765625, "logps/rejected": -200.31930541992188, "loss": 0.4973, "rewards/accuracies": 0.875, "rewards/chosen": 0.038835424929857254, "rewards/margins": 0.6905443668365479, "rewards/rejected": -0.6517089605331421, "step": 788 }, { "epoch": 0.789, "grad_norm": 1.3069400787353516, "learning_rate": 1.51375e-06, "logits/chosen": 0.9385854601860046, "logits/rejected": 0.4137600064277649, "logps/chosen": -231.11245727539062, "logps/rejected": -184.8876495361328, "loss": 0.5913, "rewards/accuracies": 0.75, "rewards/chosen": -0.0799955278635025, "rewards/margins": 0.4010087251663208, "rewards/rejected": -0.4810042977333069, "step": 789 }, { "epoch": 0.79, "grad_norm": 1.4182368516921997, "learning_rate": 1.5125e-06, "logits/chosen": 0.27134865522384644, "logits/rejected": 0.28528356552124023, "logps/chosen": -199.92013549804688, "logps/rejected": -163.1998748779297, "loss": 0.7179, "rewards/accuracies": 0.625, "rewards/chosen": -0.18156327307224274, "rewards/margins": 0.1135752722620964, "rewards/rejected": -0.29513853788375854, "step": 790 }, { "epoch": 0.791, "grad_norm": 1.5899220705032349, "learning_rate": 1.51125e-06, "logits/chosen": 1.3573362827301025, "logits/rejected": 0.4730851948261261, "logps/chosen": -297.67620849609375, "logps/rejected": -163.25726318359375, "loss": 0.587, "rewards/accuracies": 0.75, "rewards/chosen": -0.10136166214942932, "rewards/margins": 0.3061084747314453, "rewards/rejected": -0.407470166683197, "step": 791 }, { "epoch": 0.792, "grad_norm": 1.4931384325027466, "learning_rate": 1.51e-06, "logits/chosen": 0.48285701870918274, "logits/rejected": 0.8901668190956116, "logps/chosen": -173.68115234375, "logps/rejected": -253.2416534423828, "loss": 0.5757, "rewards/accuracies": 0.75, "rewards/chosen": -0.1510397046804428, "rewards/margins": 0.34809166193008423, "rewards/rejected": -0.49913138151168823, "step": 792 }, { "epoch": 0.793, "grad_norm": 1.5643867254257202, "learning_rate": 1.50875e-06, "logits/chosen": 0.4564848840236664, "logits/rejected": 1.0659544467926025, "logps/chosen": -166.96835327148438, "logps/rejected": -307.1507873535156, "loss": 0.48, "rewards/accuracies": 0.75, "rewards/chosen": -0.06265564262866974, "rewards/margins": 0.6285252571105957, "rewards/rejected": -0.691180944442749, "step": 793 }, { "epoch": 0.794, "grad_norm": 1.1750324964523315, "learning_rate": 1.5075e-06, "logits/chosen": 1.1247142553329468, "logits/rejected": 0.875421404838562, "logps/chosen": -265.78680419921875, "logps/rejected": -218.62191772460938, "loss": 0.418, "rewards/accuracies": 0.75, "rewards/chosen": 0.14720305800437927, "rewards/margins": 0.9132810831069946, "rewards/rejected": -0.766077995300293, "step": 794 }, { "epoch": 0.795, "grad_norm": 1.2340149879455566, "learning_rate": 1.50625e-06, "logits/chosen": -0.06575103104114532, "logits/rejected": 1.0309563875198364, "logps/chosen": -119.19514465332031, "logps/rejected": -250.01699829101562, "loss": 0.5153, "rewards/accuracies": 0.75, "rewards/chosen": -0.12527455389499664, "rewards/margins": 0.6726338863372803, "rewards/rejected": -0.7979084253311157, "step": 795 }, { "epoch": 0.796, "grad_norm": 2.161205291748047, "learning_rate": 1.5049999999999998e-06, "logits/chosen": 0.3608647584915161, "logits/rejected": 0.3977285623550415, "logps/chosen": -168.7228240966797, "logps/rejected": -359.3806457519531, "loss": 0.7967, "rewards/accuracies": 0.375, "rewards/chosen": -0.2686005234718323, "rewards/margins": -0.08790544420480728, "rewards/rejected": -0.1806950569152832, "step": 796 }, { "epoch": 0.797, "grad_norm": 1.4021546840667725, "learning_rate": 1.5037499999999998e-06, "logits/chosen": 0.3168800473213196, "logits/rejected": 0.3818569779396057, "logps/chosen": -188.533447265625, "logps/rejected": -209.96743774414062, "loss": 0.7391, "rewards/accuracies": 0.625, "rewards/chosen": -0.34003743529319763, "rewards/margins": 0.009614188224077225, "rewards/rejected": -0.34965163469314575, "step": 797 }, { "epoch": 0.798, "grad_norm": 1.115319013595581, "learning_rate": 1.5024999999999998e-06, "logits/chosen": 0.9038507342338562, "logits/rejected": 0.508152961730957, "logps/chosen": -259.6842041015625, "logps/rejected": -189.18408203125, "loss": 0.4002, "rewards/accuracies": 1.0, "rewards/chosen": 0.13372211158275604, "rewards/margins": 0.7513911128044128, "rewards/rejected": -0.6176689863204956, "step": 798 }, { "epoch": 0.799, "grad_norm": 1.4196957349777222, "learning_rate": 1.5012499999999998e-06, "logits/chosen": 0.3321461081504822, "logits/rejected": 0.1719559133052826, "logps/chosen": -237.83969116210938, "logps/rejected": -164.5838623046875, "loss": 0.645, "rewards/accuracies": 0.5, "rewards/chosen": -0.12003432214260101, "rewards/margins": 0.16770172119140625, "rewards/rejected": -0.28773602843284607, "step": 799 }, { "epoch": 0.8, "grad_norm": 2.061581611633301, "learning_rate": 1.5e-06, "logits/chosen": 0.6113697290420532, "logits/rejected": 0.23700463771820068, "logps/chosen": -242.11514282226562, "logps/rejected": -165.82196044921875, "loss": 0.7929, "rewards/accuracies": 0.625, "rewards/chosen": -0.35094720125198364, "rewards/margins": -0.0595981702208519, "rewards/rejected": -0.29134902358055115, "step": 800 }, { "epoch": 0.801, "grad_norm": 1.5273189544677734, "learning_rate": 1.49875e-06, "logits/chosen": 0.6000630855560303, "logits/rejected": 1.1508779525756836, "logps/chosen": -176.05833435058594, "logps/rejected": -269.7518005371094, "loss": 0.7585, "rewards/accuracies": 0.5, "rewards/chosen": -0.3507137596607208, "rewards/margins": 0.007139682769775391, "rewards/rejected": -0.35785341262817383, "step": 801 }, { "epoch": 0.802, "grad_norm": 1.4658358097076416, "learning_rate": 1.4975e-06, "logits/chosen": 0.3409305810928345, "logits/rejected": 0.9276995658874512, "logps/chosen": -193.1905517578125, "logps/rejected": -234.0181121826172, "loss": 0.618, "rewards/accuracies": 0.75, "rewards/chosen": -0.16789327561855316, "rewards/margins": 0.3055591881275177, "rewards/rejected": -0.47345247864723206, "step": 802 }, { "epoch": 0.803, "grad_norm": 1.4592721462249756, "learning_rate": 1.49625e-06, "logits/chosen": 0.7946934103965759, "logits/rejected": 0.5559270977973938, "logps/chosen": -258.1374206542969, "logps/rejected": -178.01087951660156, "loss": 0.4738, "rewards/accuracies": 0.875, "rewards/chosen": 0.04305625706911087, "rewards/margins": 0.7293940782546997, "rewards/rejected": -0.6863378882408142, "step": 803 }, { "epoch": 0.804, "grad_norm": 1.2911036014556885, "learning_rate": 1.495e-06, "logits/chosen": 0.48940813541412354, "logits/rejected": 0.8227964639663696, "logps/chosen": -294.52850341796875, "logps/rejected": -296.8548583984375, "loss": 0.4836, "rewards/accuracies": 0.875, "rewards/chosen": -0.07588032633066177, "rewards/margins": 0.7776464223861694, "rewards/rejected": -0.8535268306732178, "step": 804 }, { "epoch": 0.805, "grad_norm": 1.5979398488998413, "learning_rate": 1.4937499999999999e-06, "logits/chosen": 0.5802615284919739, "logits/rejected": 0.009022045880556107, "logps/chosen": -293.708740234375, "logps/rejected": -117.73948669433594, "loss": 0.6515, "rewards/accuracies": 0.5, "rewards/chosen": -0.3708326518535614, "rewards/margins": 0.1854618340730667, "rewards/rejected": -0.5562945008277893, "step": 805 }, { "epoch": 0.806, "grad_norm": 1.8174247741699219, "learning_rate": 1.4925e-06, "logits/chosen": 0.13999603688716888, "logits/rejected": 1.4212559461593628, "logps/chosen": -181.98574829101562, "logps/rejected": -282.38043212890625, "loss": 0.8134, "rewards/accuracies": 0.5, "rewards/chosen": -0.3101896047592163, "rewards/margins": -0.02394762635231018, "rewards/rejected": -0.2862420082092285, "step": 806 }, { "epoch": 0.807, "grad_norm": 1.099652886390686, "learning_rate": 1.49125e-06, "logits/chosen": 0.8213542103767395, "logits/rejected": 0.8631301522254944, "logps/chosen": -152.04611206054688, "logps/rejected": -181.54319763183594, "loss": 0.4693, "rewards/accuracies": 0.875, "rewards/chosen": -0.002526763826608658, "rewards/margins": 0.6299232244491577, "rewards/rejected": -0.6324500441551208, "step": 807 }, { "epoch": 0.808, "grad_norm": 1.3440742492675781, "learning_rate": 1.49e-06, "logits/chosen": 0.46009665727615356, "logits/rejected": 0.9061279296875, "logps/chosen": -115.65908813476562, "logps/rejected": -173.36114501953125, "loss": 0.2962, "rewards/accuracies": 1.0, "rewards/chosen": 0.35245752334594727, "rewards/margins": 1.1185604333877563, "rewards/rejected": -0.7661029100418091, "step": 808 }, { "epoch": 0.809, "grad_norm": 2.05415678024292, "learning_rate": 1.48875e-06, "logits/chosen": 0.35436806082725525, "logits/rejected": 1.2460463047027588, "logps/chosen": -153.7179412841797, "logps/rejected": -221.1810302734375, "loss": 0.8112, "rewards/accuracies": 0.375, "rewards/chosen": -0.308025062084198, "rewards/margins": 0.021788500249385834, "rewards/rejected": -0.3298135995864868, "step": 809 }, { "epoch": 0.81, "grad_norm": 1.5354843139648438, "learning_rate": 1.4875e-06, "logits/chosen": 0.7726863622665405, "logits/rejected": 0.7758393287658691, "logps/chosen": -221.41824340820312, "logps/rejected": -224.82354736328125, "loss": 0.7097, "rewards/accuracies": 0.625, "rewards/chosen": -0.2210342437028885, "rewards/margins": 0.1001177653670311, "rewards/rejected": -0.3211520314216614, "step": 810 }, { "epoch": 0.811, "grad_norm": 0.8342384696006775, "learning_rate": 1.48625e-06, "logits/chosen": 0.47428980469703674, "logits/rejected": 0.8310196399688721, "logps/chosen": -121.64692687988281, "logps/rejected": -225.59616088867188, "loss": 0.3467, "rewards/accuracies": 0.875, "rewards/chosen": 0.21538972854614258, "rewards/margins": 0.9947484731674194, "rewards/rejected": -0.7793587446212769, "step": 811 }, { "epoch": 0.812, "grad_norm": 1.6445648670196533, "learning_rate": 1.485e-06, "logits/chosen": 0.5310044288635254, "logits/rejected": 0.24833601713180542, "logps/chosen": -184.9776611328125, "logps/rejected": -201.85928344726562, "loss": 0.4277, "rewards/accuracies": 0.75, "rewards/chosen": 0.18687641620635986, "rewards/margins": 0.7736696004867554, "rewards/rejected": -0.5867931842803955, "step": 812 }, { "epoch": 0.813, "grad_norm": 1.2347183227539062, "learning_rate": 1.4837499999999998e-06, "logits/chosen": 0.41815054416656494, "logits/rejected": 0.49847185611724854, "logps/chosen": -189.28749084472656, "logps/rejected": -216.09718322753906, "loss": 0.5185, "rewards/accuracies": 0.875, "rewards/chosen": -0.12084513157606125, "rewards/margins": 0.5375738143920898, "rewards/rejected": -0.6584189534187317, "step": 813 }, { "epoch": 0.814, "grad_norm": 1.1495170593261719, "learning_rate": 1.4824999999999998e-06, "logits/chosen": 0.6478469967842102, "logits/rejected": 0.9508465528488159, "logps/chosen": -324.0361022949219, "logps/rejected": -239.50985717773438, "loss": 0.4488, "rewards/accuracies": 0.875, "rewards/chosen": 0.041604045778512955, "rewards/margins": 0.6327150464057922, "rewards/rejected": -0.5911110043525696, "step": 814 }, { "epoch": 0.815, "grad_norm": 2.088806390762329, "learning_rate": 1.4812499999999998e-06, "logits/chosen": 1.0184882879257202, "logits/rejected": 0.5636916160583496, "logps/chosen": -291.3311767578125, "logps/rejected": -200.804443359375, "loss": 0.723, "rewards/accuracies": 0.625, "rewards/chosen": -0.1852259784936905, "rewards/margins": 0.10099717974662781, "rewards/rejected": -0.2862231433391571, "step": 815 }, { "epoch": 0.816, "grad_norm": 1.1604162454605103, "learning_rate": 1.48e-06, "logits/chosen": 0.9162092208862305, "logits/rejected": 0.49806350469589233, "logps/chosen": -252.3245086669922, "logps/rejected": -201.28118896484375, "loss": 0.5715, "rewards/accuracies": 0.75, "rewards/chosen": -0.0004030168056488037, "rewards/margins": 0.34054815769195557, "rewards/rejected": -0.340951144695282, "step": 816 }, { "epoch": 0.817, "grad_norm": 1.1061822175979614, "learning_rate": 1.47875e-06, "logits/chosen": 0.6185919046401978, "logits/rejected": 0.3234015107154846, "logps/chosen": -149.65696716308594, "logps/rejected": -174.33901977539062, "loss": 0.5215, "rewards/accuracies": 0.75, "rewards/chosen": 0.049570366740226746, "rewards/margins": 0.5401611924171448, "rewards/rejected": -0.49059075117111206, "step": 817 }, { "epoch": 0.818, "grad_norm": 2.410219669342041, "learning_rate": 1.4775e-06, "logits/chosen": 0.7890682220458984, "logits/rejected": 0.5442745685577393, "logps/chosen": -266.68939208984375, "logps/rejected": -148.311767578125, "loss": 0.6776, "rewards/accuracies": 0.5, "rewards/chosen": -0.2269268035888672, "rewards/margins": 0.2534627318382263, "rewards/rejected": -0.4803895056247711, "step": 818 }, { "epoch": 0.819, "grad_norm": 1.1865323781967163, "learning_rate": 1.47625e-06, "logits/chosen": 0.5894671678543091, "logits/rejected": 0.3459717631340027, "logps/chosen": -194.57275390625, "logps/rejected": -170.3459930419922, "loss": 0.3971, "rewards/accuracies": 0.875, "rewards/chosen": 0.14286909997463226, "rewards/margins": 0.9336158037185669, "rewards/rejected": -0.7907466888427734, "step": 819 }, { "epoch": 0.82, "grad_norm": 2.330641746520996, "learning_rate": 1.475e-06, "logits/chosen": 1.2246785163879395, "logits/rejected": 0.5430814027786255, "logps/chosen": -346.7149658203125, "logps/rejected": -150.56851196289062, "loss": 0.7573, "rewards/accuracies": 0.5, "rewards/chosen": -0.31609097123146057, "rewards/margins": -0.024228662252426147, "rewards/rejected": -0.2918623089790344, "step": 820 }, { "epoch": 0.821, "grad_norm": 1.4661563634872437, "learning_rate": 1.4737499999999999e-06, "logits/chosen": 0.5427192449569702, "logits/rejected": 0.7114030122756958, "logps/chosen": -174.8108673095703, "logps/rejected": -199.66326904296875, "loss": 0.5931, "rewards/accuracies": 0.5, "rewards/chosen": -0.061776310205459595, "rewards/margins": 0.5205284953117371, "rewards/rejected": -0.5823047757148743, "step": 821 }, { "epoch": 0.822, "grad_norm": 1.4120709896087646, "learning_rate": 1.4724999999999999e-06, "logits/chosen": 0.1661166399717331, "logits/rejected": 0.3047623038291931, "logps/chosen": -207.91275024414062, "logps/rejected": -193.60130310058594, "loss": 0.6854, "rewards/accuracies": 0.75, "rewards/chosen": -0.17364710569381714, "rewards/margins": 0.24227342009544373, "rewards/rejected": -0.41592055559158325, "step": 822 }, { "epoch": 0.823, "grad_norm": 1.7753530740737915, "learning_rate": 1.4712499999999999e-06, "logits/chosen": 0.02858623117208481, "logits/rejected": 1.2466577291488647, "logps/chosen": -167.55223083496094, "logps/rejected": -222.17388916015625, "loss": 0.7109, "rewards/accuracies": 0.25, "rewards/chosen": -0.3002348840236664, "rewards/margins": 0.08659552037715912, "rewards/rejected": -0.3868304491043091, "step": 823 }, { "epoch": 0.824, "grad_norm": 1.2115516662597656, "learning_rate": 1.47e-06, "logits/chosen": 1.2945767641067505, "logits/rejected": 0.5178027153015137, "logps/chosen": -219.18258666992188, "logps/rejected": -161.58486938476562, "loss": 0.4324, "rewards/accuracies": 0.75, "rewards/chosen": -0.12030278146266937, "rewards/margins": 0.7596643567085266, "rewards/rejected": -0.8799671530723572, "step": 824 }, { "epoch": 0.825, "grad_norm": 1.248079776763916, "learning_rate": 1.46875e-06, "logits/chosen": 1.005236029624939, "logits/rejected": 0.8677165508270264, "logps/chosen": -219.2430419921875, "logps/rejected": -203.5360870361328, "loss": 0.5931, "rewards/accuracies": 0.625, "rewards/chosen": -0.2089425027370453, "rewards/margins": 0.41777515411376953, "rewards/rejected": -0.6267176270484924, "step": 825 }, { "epoch": 0.826, "grad_norm": 1.149048924446106, "learning_rate": 1.4675e-06, "logits/chosen": 0.7814576625823975, "logits/rejected": 1.283768892288208, "logps/chosen": -172.81549072265625, "logps/rejected": -250.43455505371094, "loss": 0.5015, "rewards/accuracies": 0.75, "rewards/chosen": -0.08408622443675995, "rewards/margins": 0.608651340007782, "rewards/rejected": -0.6927375793457031, "step": 826 }, { "epoch": 0.827, "grad_norm": 1.49221670627594, "learning_rate": 1.46625e-06, "logits/chosen": 0.8542211055755615, "logits/rejected": 0.5495758056640625, "logps/chosen": -224.4303741455078, "logps/rejected": -262.3560791015625, "loss": 0.5778, "rewards/accuracies": 0.75, "rewards/chosen": -0.124696746468544, "rewards/margins": 0.4417930841445923, "rewards/rejected": -0.5664898157119751, "step": 827 }, { "epoch": 0.828, "grad_norm": 1.0497483015060425, "learning_rate": 1.465e-06, "logits/chosen": 1.3076245784759521, "logits/rejected": 0.7535489201545715, "logps/chosen": -349.5000305175781, "logps/rejected": -211.078125, "loss": 0.5287, "rewards/accuracies": 0.625, "rewards/chosen": 0.1900438368320465, "rewards/margins": 0.5045112371444702, "rewards/rejected": -0.31446734070777893, "step": 828 }, { "epoch": 0.829, "grad_norm": 1.7689208984375, "learning_rate": 1.46375e-06, "logits/chosen": 0.8072570562362671, "logits/rejected": 0.3915274739265442, "logps/chosen": -225.2689208984375, "logps/rejected": -168.09439086914062, "loss": 0.7548, "rewards/accuracies": 0.625, "rewards/chosen": -0.22721654176712036, "rewards/margins": 0.10455496609210968, "rewards/rejected": -0.33177149295806885, "step": 829 }, { "epoch": 0.83, "grad_norm": 0.9378156065940857, "learning_rate": 1.4624999999999998e-06, "logits/chosen": 0.8885684013366699, "logits/rejected": 0.455442875623703, "logps/chosen": -225.94369506835938, "logps/rejected": -190.0890350341797, "loss": 0.4744, "rewards/accuracies": 0.875, "rewards/chosen": 0.07008133083581924, "rewards/margins": 0.6040204167366028, "rewards/rejected": -0.5339391231536865, "step": 830 }, { "epoch": 0.831, "grad_norm": 1.7333014011383057, "learning_rate": 1.4612499999999998e-06, "logits/chosen": 0.3127462863922119, "logits/rejected": 0.796444296836853, "logps/chosen": -209.7428741455078, "logps/rejected": -260.23992919921875, "loss": 0.7078, "rewards/accuracies": 0.625, "rewards/chosen": -0.284018337726593, "rewards/margins": 0.11959724873304367, "rewards/rejected": -0.4036155641078949, "step": 831 }, { "epoch": 0.832, "grad_norm": 1.2493869066238403, "learning_rate": 1.46e-06, "logits/chosen": 0.6942001581192017, "logits/rejected": 1.0000519752502441, "logps/chosen": -185.5996551513672, "logps/rejected": -237.98223876953125, "loss": 0.6013, "rewards/accuracies": 0.75, "rewards/chosen": -0.19065725803375244, "rewards/margins": 0.33741649985313416, "rewards/rejected": -0.528073787689209, "step": 832 }, { "epoch": 0.833, "grad_norm": 2.082021474838257, "learning_rate": 1.45875e-06, "logits/chosen": 0.97160804271698, "logits/rejected": 0.6944774389266968, "logps/chosen": -284.5702209472656, "logps/rejected": -204.0965576171875, "loss": 0.749, "rewards/accuracies": 0.625, "rewards/chosen": -0.3182583749294281, "rewards/margins": 0.04116714745759964, "rewards/rejected": -0.35942554473876953, "step": 833 }, { "epoch": 0.834, "grad_norm": 1.570696234703064, "learning_rate": 1.4575e-06, "logits/chosen": 0.9496181011199951, "logits/rejected": 0.24534550309181213, "logps/chosen": -273.1126708984375, "logps/rejected": -182.30996704101562, "loss": 0.5556, "rewards/accuracies": 0.75, "rewards/chosen": -0.15760716795921326, "rewards/margins": 0.41534656286239624, "rewards/rejected": -0.5729537606239319, "step": 834 }, { "epoch": 0.835, "grad_norm": 1.6825851202011108, "learning_rate": 1.45625e-06, "logits/chosen": 0.6259968876838684, "logits/rejected": 0.743320882320404, "logps/chosen": -248.32733154296875, "logps/rejected": -185.61843872070312, "loss": 0.8422, "rewards/accuracies": 0.375, "rewards/chosen": -0.530404269695282, "rewards/margins": -0.20825353264808655, "rewards/rejected": -0.3221507668495178, "step": 835 }, { "epoch": 0.836, "grad_norm": 1.117641806602478, "learning_rate": 1.455e-06, "logits/chosen": 0.6960142254829407, "logits/rejected": 0.7176426649093628, "logps/chosen": -242.34295654296875, "logps/rejected": -196.70372009277344, "loss": 0.5293, "rewards/accuracies": 0.875, "rewards/chosen": -0.002633199095726013, "rewards/margins": 0.5106139779090881, "rewards/rejected": -0.5132472515106201, "step": 836 }, { "epoch": 0.837, "grad_norm": 2.0998990535736084, "learning_rate": 1.45375e-06, "logits/chosen": 0.4678783714771271, "logits/rejected": 0.4066188931465149, "logps/chosen": -166.09713745117188, "logps/rejected": -248.61978149414062, "loss": 0.6455, "rewards/accuracies": 0.625, "rewards/chosen": -0.028799917548894882, "rewards/margins": 0.3343544900417328, "rewards/rejected": -0.36315441131591797, "step": 837 }, { "epoch": 0.838, "grad_norm": 1.197037935256958, "learning_rate": 1.4524999999999999e-06, "logits/chosen": 0.7816302180290222, "logits/rejected": -0.04748930037021637, "logps/chosen": -235.880126953125, "logps/rejected": -143.69241333007812, "loss": 0.5212, "rewards/accuracies": 0.75, "rewards/chosen": -0.14050187170505524, "rewards/margins": 0.5552880764007568, "rewards/rejected": -0.6957899332046509, "step": 838 }, { "epoch": 0.839, "grad_norm": 1.8854578733444214, "learning_rate": 1.4512499999999999e-06, "logits/chosen": 0.2560434341430664, "logits/rejected": 0.8038116693496704, "logps/chosen": -235.70919799804688, "logps/rejected": -228.223876953125, "loss": 0.7369, "rewards/accuracies": 0.5, "rewards/chosen": -0.3974069654941559, "rewards/margins": 0.14316654205322266, "rewards/rejected": -0.5405734777450562, "step": 839 }, { "epoch": 0.84, "grad_norm": 1.4685466289520264, "learning_rate": 1.4499999999999999e-06, "logits/chosen": 0.9274765253067017, "logits/rejected": -0.05479664355516434, "logps/chosen": -306.3356628417969, "logps/rejected": -136.99057006835938, "loss": 0.4986, "rewards/accuracies": 0.75, "rewards/chosen": -0.12457820028066635, "rewards/margins": 0.5413271188735962, "rewards/rejected": -0.6659053564071655, "step": 840 }, { "epoch": 0.841, "grad_norm": 1.0918644666671753, "learning_rate": 1.4487499999999999e-06, "logits/chosen": 1.1453410387039185, "logits/rejected": 0.41478997468948364, "logps/chosen": -250.73544311523438, "logps/rejected": -168.36795043945312, "loss": 0.525, "rewards/accuracies": 0.75, "rewards/chosen": -0.07194480299949646, "rewards/margins": 0.633941650390625, "rewards/rejected": -0.7058865427970886, "step": 841 }, { "epoch": 0.842, "grad_norm": 1.3372031450271606, "learning_rate": 1.4475e-06, "logits/chosen": 0.448350191116333, "logits/rejected": 0.5272084474563599, "logps/chosen": -165.34832763671875, "logps/rejected": -204.76231384277344, "loss": 0.6739, "rewards/accuracies": 0.625, "rewards/chosen": -0.19336986541748047, "rewards/margins": 0.08974085748195648, "rewards/rejected": -0.28311073780059814, "step": 842 }, { "epoch": 0.843, "grad_norm": 1.3078843355178833, "learning_rate": 1.44625e-06, "logits/chosen": 0.30791085958480835, "logits/rejected": 0.9456870555877686, "logps/chosen": -156.27682495117188, "logps/rejected": -215.331787109375, "loss": 0.6048, "rewards/accuracies": 0.75, "rewards/chosen": -0.28642740845680237, "rewards/margins": 0.3014953136444092, "rewards/rejected": -0.5879226922988892, "step": 843 }, { "epoch": 0.844, "grad_norm": 1.3378649950027466, "learning_rate": 1.445e-06, "logits/chosen": 0.6953549385070801, "logits/rejected": 1.0749958753585815, "logps/chosen": -250.79470825195312, "logps/rejected": -253.4893798828125, "loss": 0.5924, "rewards/accuracies": 0.75, "rewards/chosen": -0.260633647441864, "rewards/margins": 0.266015887260437, "rewards/rejected": -0.5266495943069458, "step": 844 }, { "epoch": 0.845, "grad_norm": 1.3889352083206177, "learning_rate": 1.44375e-06, "logits/chosen": 0.38474607467651367, "logits/rejected": 1.0512117147445679, "logps/chosen": -143.43157958984375, "logps/rejected": -237.48158264160156, "loss": 0.4833, "rewards/accuracies": 0.75, "rewards/chosen": -0.0317256934940815, "rewards/margins": 0.5951000452041626, "rewards/rejected": -0.6268256902694702, "step": 845 }, { "epoch": 0.846, "grad_norm": 1.483250379562378, "learning_rate": 1.4424999999999997e-06, "logits/chosen": 0.7645325660705566, "logits/rejected": 0.8830052018165588, "logps/chosen": -182.09158325195312, "logps/rejected": -197.81637573242188, "loss": 0.7806, "rewards/accuracies": 0.5, "rewards/chosen": -0.3636135160923004, "rewards/margins": -0.012301355600357056, "rewards/rejected": -0.35131216049194336, "step": 846 }, { "epoch": 0.847, "grad_norm": 1.6061326265335083, "learning_rate": 1.4412499999999998e-06, "logits/chosen": 1.0892888307571411, "logits/rejected": 0.7253185510635376, "logps/chosen": -283.70111083984375, "logps/rejected": -184.4206085205078, "loss": 0.6174, "rewards/accuracies": 0.625, "rewards/chosen": -0.268643856048584, "rewards/margins": 0.294492244720459, "rewards/rejected": -0.563136100769043, "step": 847 }, { "epoch": 0.848, "grad_norm": 1.0704879760742188, "learning_rate": 1.44e-06, "logits/chosen": 1.4329355955123901, "logits/rejected": 0.48844069242477417, "logps/chosen": -311.377685546875, "logps/rejected": -152.85206604003906, "loss": 0.4301, "rewards/accuracies": 0.875, "rewards/chosen": 0.10988025367259979, "rewards/margins": 0.6669503450393677, "rewards/rejected": -0.5570700168609619, "step": 848 }, { "epoch": 0.849, "grad_norm": 1.3765523433685303, "learning_rate": 1.43875e-06, "logits/chosen": 1.626275658607483, "logits/rejected": 0.8906983137130737, "logps/chosen": -436.637451171875, "logps/rejected": -191.9149169921875, "loss": 0.5212, "rewards/accuracies": 0.75, "rewards/chosen": 0.058373644948005676, "rewards/margins": 0.6237785220146179, "rewards/rejected": -0.5654048919677734, "step": 849 }, { "epoch": 0.85, "grad_norm": 1.3269729614257812, "learning_rate": 1.4375e-06, "logits/chosen": 0.23609599471092224, "logits/rejected": 1.0051785707473755, "logps/chosen": -131.31912231445312, "logps/rejected": -216.55718994140625, "loss": 0.5326, "rewards/accuracies": 0.75, "rewards/chosen": -0.0060788169503211975, "rewards/margins": 0.48769286274909973, "rewards/rejected": -0.49377164244651794, "step": 850 }, { "epoch": 0.851, "grad_norm": 1.1873856782913208, "learning_rate": 1.43625e-06, "logits/chosen": 1.1208226680755615, "logits/rejected": 0.5165396928787231, "logps/chosen": -233.917724609375, "logps/rejected": -148.64279174804688, "loss": 0.4164, "rewards/accuracies": 0.875, "rewards/chosen": 0.08753661066293716, "rewards/margins": 0.7497595548629761, "rewards/rejected": -0.6622229218482971, "step": 851 }, { "epoch": 0.852, "grad_norm": 1.4624890089035034, "learning_rate": 1.435e-06, "logits/chosen": 0.8920823335647583, "logits/rejected": 0.7986461520195007, "logps/chosen": -180.9380340576172, "logps/rejected": -181.044677734375, "loss": 0.5897, "rewards/accuracies": 0.625, "rewards/chosen": -0.20917168259620667, "rewards/margins": 0.4007396697998047, "rewards/rejected": -0.609911322593689, "step": 852 }, { "epoch": 0.853, "grad_norm": 1.8548634052276611, "learning_rate": 1.43375e-06, "logits/chosen": 0.7728656530380249, "logits/rejected": 1.0907254219055176, "logps/chosen": -273.18023681640625, "logps/rejected": -206.45668029785156, "loss": 0.8194, "rewards/accuracies": 0.625, "rewards/chosen": -0.4955300986766815, "rewards/margins": 0.03325909376144409, "rewards/rejected": -0.528789222240448, "step": 853 }, { "epoch": 0.854, "grad_norm": 1.0852863788604736, "learning_rate": 1.4325e-06, "logits/chosen": 1.3994724750518799, "logits/rejected": 0.8464314341545105, "logps/chosen": -290.48162841796875, "logps/rejected": -194.08380126953125, "loss": 0.411, "rewards/accuracies": 1.0, "rewards/chosen": -0.011805962771177292, "rewards/margins": 0.7240908741950989, "rewards/rejected": -0.73589688539505, "step": 854 }, { "epoch": 0.855, "grad_norm": 1.6539078950881958, "learning_rate": 1.4312499999999998e-06, "logits/chosen": 0.9229463934898376, "logits/rejected": 0.6731488108634949, "logps/chosen": -276.556884765625, "logps/rejected": -160.12852478027344, "loss": 0.5668, "rewards/accuracies": 0.625, "rewards/chosen": -0.3024282455444336, "rewards/margins": 0.4227105379104614, "rewards/rejected": -0.7251387238502502, "step": 855 }, { "epoch": 0.856, "grad_norm": 1.1247491836547852, "learning_rate": 1.4299999999999999e-06, "logits/chosen": 0.3128207325935364, "logits/rejected": 0.4865463674068451, "logps/chosen": -178.31170654296875, "logps/rejected": -181.40621948242188, "loss": 0.5899, "rewards/accuracies": 0.625, "rewards/chosen": -0.18839694559574127, "rewards/margins": 0.37713462114334106, "rewards/rejected": -0.5655316114425659, "step": 856 }, { "epoch": 0.857, "grad_norm": 1.3092116117477417, "learning_rate": 1.4287499999999999e-06, "logits/chosen": 0.729078471660614, "logits/rejected": 0.5379890203475952, "logps/chosen": -314.27099609375, "logps/rejected": -174.89772033691406, "loss": 0.5676, "rewards/accuracies": 0.75, "rewards/chosen": -0.09368456155061722, "rewards/margins": 0.4694609045982361, "rewards/rejected": -0.5631454586982727, "step": 857 }, { "epoch": 0.858, "grad_norm": 1.5807218551635742, "learning_rate": 1.4274999999999999e-06, "logits/chosen": 1.1919059753417969, "logits/rejected": 0.5795677900314331, "logps/chosen": -331.64630126953125, "logps/rejected": -186.7829132080078, "loss": 0.5546, "rewards/accuracies": 0.75, "rewards/chosen": -0.11883344501256943, "rewards/margins": 0.5190702676773071, "rewards/rejected": -0.6379037499427795, "step": 858 }, { "epoch": 0.859, "grad_norm": 1.0147168636322021, "learning_rate": 1.42625e-06, "logits/chosen": 0.5151049494743347, "logits/rejected": 0.783844530582428, "logps/chosen": -219.9441375732422, "logps/rejected": -214.03805541992188, "loss": 0.5014, "rewards/accuracies": 0.75, "rewards/chosen": -0.11330008506774902, "rewards/margins": 0.5585408210754395, "rewards/rejected": -0.6718408465385437, "step": 859 }, { "epoch": 0.86, "grad_norm": 1.2886697053909302, "learning_rate": 1.425e-06, "logits/chosen": 0.9792620539665222, "logits/rejected": 0.5151446461677551, "logps/chosen": -276.7094421386719, "logps/rejected": -161.65524291992188, "loss": 0.5386, "rewards/accuracies": 0.875, "rewards/chosen": -0.01977100968360901, "rewards/margins": 0.4068881869316101, "rewards/rejected": -0.4266592264175415, "step": 860 }, { "epoch": 0.861, "grad_norm": 1.274018406867981, "learning_rate": 1.42375e-06, "logits/chosen": 0.6334936618804932, "logits/rejected": 0.24859677255153656, "logps/chosen": -262.00341796875, "logps/rejected": -161.40676879882812, "loss": 0.5951, "rewards/accuracies": 0.75, "rewards/chosen": -0.1386736035346985, "rewards/margins": 0.3004884123802185, "rewards/rejected": -0.439162015914917, "step": 861 }, { "epoch": 0.862, "grad_norm": 1.2399545907974243, "learning_rate": 1.4225e-06, "logits/chosen": 0.8403391242027283, "logits/rejected": 0.519575297832489, "logps/chosen": -218.12872314453125, "logps/rejected": -205.54788208007812, "loss": 0.522, "rewards/accuracies": 0.875, "rewards/chosen": -0.19036130607128143, "rewards/margins": 0.49108895659446716, "rewards/rejected": -0.6814502477645874, "step": 862 }, { "epoch": 0.863, "grad_norm": 1.0551269054412842, "learning_rate": 1.4212499999999997e-06, "logits/chosen": 1.0143283605575562, "logits/rejected": 0.6086602210998535, "logps/chosen": -198.24649047851562, "logps/rejected": -200.88377380371094, "loss": 0.4952, "rewards/accuracies": 0.875, "rewards/chosen": -0.16085729002952576, "rewards/margins": 0.48785659670829773, "rewards/rejected": -0.6487138867378235, "step": 863 }, { "epoch": 0.864, "grad_norm": 1.9159413576126099, "learning_rate": 1.42e-06, "logits/chosen": 0.6848264932632446, "logits/rejected": 0.8757666945457458, "logps/chosen": -262.3509521484375, "logps/rejected": -258.1694641113281, "loss": 0.706, "rewards/accuracies": 0.625, "rewards/chosen": -0.24590462446212769, "rewards/margins": 0.1507417857646942, "rewards/rejected": -0.3966464400291443, "step": 864 }, { "epoch": 0.865, "grad_norm": 1.24154794216156, "learning_rate": 1.41875e-06, "logits/chosen": 0.6882704496383667, "logits/rejected": 0.9581655263900757, "logps/chosen": -252.5535430908203, "logps/rejected": -211.16969299316406, "loss": 0.5257, "rewards/accuracies": 0.75, "rewards/chosen": -0.19099941849708557, "rewards/margins": 0.4437776505947113, "rewards/rejected": -0.6347770690917969, "step": 865 }, { "epoch": 0.866, "grad_norm": 1.4620466232299805, "learning_rate": 1.4175e-06, "logits/chosen": 0.687907338142395, "logits/rejected": 1.3246145248413086, "logps/chosen": -273.80682373046875, "logps/rejected": -248.23631286621094, "loss": 0.6189, "rewards/accuracies": 0.75, "rewards/chosen": -0.2861749827861786, "rewards/margins": 0.3814810514450073, "rewards/rejected": -0.6676560640335083, "step": 866 }, { "epoch": 0.867, "grad_norm": 1.6389131546020508, "learning_rate": 1.41625e-06, "logits/chosen": 1.0448640584945679, "logits/rejected": 0.6267768144607544, "logps/chosen": -234.91427612304688, "logps/rejected": -130.81512451171875, "loss": 0.6809, "rewards/accuracies": 0.75, "rewards/chosen": -0.5424530506134033, "rewards/margins": 0.07294635474681854, "rewards/rejected": -0.6153994202613831, "step": 867 }, { "epoch": 0.868, "grad_norm": 1.3483761548995972, "learning_rate": 1.415e-06, "logits/chosen": 0.5321905612945557, "logits/rejected": 0.5273289680480957, "logps/chosen": -191.9685516357422, "logps/rejected": -175.6036376953125, "loss": 0.6628, "rewards/accuracies": 0.5, "rewards/chosen": -0.19385044276714325, "rewards/margins": 0.3270173966884613, "rewards/rejected": -0.5208678245544434, "step": 868 }, { "epoch": 0.869, "grad_norm": 1.2943872213363647, "learning_rate": 1.41375e-06, "logits/chosen": 0.435405969619751, "logits/rejected": 0.44467270374298096, "logps/chosen": -165.13453674316406, "logps/rejected": -221.27462768554688, "loss": 0.7212, "rewards/accuracies": 0.625, "rewards/chosen": -0.39716261625289917, "rewards/margins": 0.10220541059970856, "rewards/rejected": -0.49936801195144653, "step": 869 }, { "epoch": 0.87, "grad_norm": 1.3657855987548828, "learning_rate": 1.4125e-06, "logits/chosen": 0.707287609577179, "logits/rejected": 0.854562520980835, "logps/chosen": -159.66989135742188, "logps/rejected": -267.57794189453125, "loss": 0.6663, "rewards/accuracies": 0.5, "rewards/chosen": -0.2589988112449646, "rewards/margins": 0.1777014136314392, "rewards/rejected": -0.4367002546787262, "step": 870 }, { "epoch": 0.871, "grad_norm": 1.4618343114852905, "learning_rate": 1.4112499999999998e-06, "logits/chosen": 0.9022541046142578, "logits/rejected": 0.9535800218582153, "logps/chosen": -304.98614501953125, "logps/rejected": -172.05633544921875, "loss": 0.7069, "rewards/accuracies": 0.625, "rewards/chosen": -0.29177483916282654, "rewards/margins": 0.09995345771312714, "rewards/rejected": -0.39172831177711487, "step": 871 }, { "epoch": 0.872, "grad_norm": 1.5499664545059204, "learning_rate": 1.4099999999999998e-06, "logits/chosen": 0.5474669337272644, "logits/rejected": 0.6617109775543213, "logps/chosen": -185.95323181152344, "logps/rejected": -196.3461151123047, "loss": 0.739, "rewards/accuracies": 0.5, "rewards/chosen": -0.3240274488925934, "rewards/margins": -0.05370005965232849, "rewards/rejected": -0.2703273892402649, "step": 872 }, { "epoch": 0.873, "grad_norm": 1.5139293670654297, "learning_rate": 1.4087499999999999e-06, "logits/chosen": 0.30601510405540466, "logits/rejected": 1.0925383567810059, "logps/chosen": -149.37669372558594, "logps/rejected": -266.8717956542969, "loss": 0.8572, "rewards/accuracies": 0.375, "rewards/chosen": -0.37894001603126526, "rewards/margins": -0.2019810676574707, "rewards/rejected": -0.17695894837379456, "step": 873 }, { "epoch": 0.874, "grad_norm": 1.2711925506591797, "learning_rate": 1.4074999999999999e-06, "logits/chosen": 0.6186120510101318, "logits/rejected": 1.1118088960647583, "logps/chosen": -186.1488037109375, "logps/rejected": -212.18585205078125, "loss": 0.6518, "rewards/accuracies": 0.5, "rewards/chosen": -0.2729954719543457, "rewards/margins": 0.1464681774377823, "rewards/rejected": -0.4194636642932892, "step": 874 }, { "epoch": 0.875, "grad_norm": 0.9572591781616211, "learning_rate": 1.4062499999999999e-06, "logits/chosen": 0.6900919675827026, "logits/rejected": 0.960578203201294, "logps/chosen": -261.71295166015625, "logps/rejected": -198.54058837890625, "loss": 0.3399, "rewards/accuracies": 1.0, "rewards/chosen": 0.10114632546901703, "rewards/margins": 1.0099724531173706, "rewards/rejected": -0.9088261127471924, "step": 875 }, { "epoch": 0.876, "grad_norm": 1.2018089294433594, "learning_rate": 1.4049999999999999e-06, "logits/chosen": 0.7576265931129456, "logits/rejected": 0.6717976331710815, "logps/chosen": -228.4851531982422, "logps/rejected": -198.01971435546875, "loss": 0.5539, "rewards/accuracies": 0.625, "rewards/chosen": -0.1622961163520813, "rewards/margins": 0.42172279953956604, "rewards/rejected": -0.584018886089325, "step": 876 }, { "epoch": 0.877, "grad_norm": 1.32090163230896, "learning_rate": 1.40375e-06, "logits/chosen": 0.7783986330032349, "logits/rejected": 0.34317895770072937, "logps/chosen": -310.6788024902344, "logps/rejected": -172.40823364257812, "loss": 0.5527, "rewards/accuracies": 0.75, "rewards/chosen": -0.20469006896018982, "rewards/margins": 0.5509166717529297, "rewards/rejected": -0.7556067705154419, "step": 877 }, { "epoch": 0.878, "grad_norm": 1.1003425121307373, "learning_rate": 1.4025e-06, "logits/chosen": 0.5861908197402954, "logits/rejected": 0.6367526054382324, "logps/chosen": -198.855712890625, "logps/rejected": -192.96466064453125, "loss": 0.4284, "rewards/accuracies": 1.0, "rewards/chosen": -0.055344775319099426, "rewards/margins": 0.6527049541473389, "rewards/rejected": -0.7080497741699219, "step": 878 }, { "epoch": 0.879, "grad_norm": 1.2360206842422485, "learning_rate": 1.4012500000000001e-06, "logits/chosen": 1.2197543382644653, "logits/rejected": 0.5054636001586914, "logps/chosen": -292.0068664550781, "logps/rejected": -186.48684692382812, "loss": 0.498, "rewards/accuracies": 0.75, "rewards/chosen": -0.21480655670166016, "rewards/margins": 0.5140448212623596, "rewards/rejected": -0.7288513779640198, "step": 879 }, { "epoch": 0.88, "grad_norm": 1.577164888381958, "learning_rate": 1.4e-06, "logits/chosen": 0.36563441157341003, "logits/rejected": 0.8147153258323669, "logps/chosen": -132.58413696289062, "logps/rejected": -258.2640380859375, "loss": 0.6754, "rewards/accuracies": 0.75, "rewards/chosen": -0.4633481502532959, "rewards/margins": 0.2965816855430603, "rewards/rejected": -0.759929895401001, "step": 880 }, { "epoch": 0.881, "grad_norm": 1.4152069091796875, "learning_rate": 1.39875e-06, "logits/chosen": 0.9666420817375183, "logits/rejected": 1.0528497695922852, "logps/chosen": -228.48934936523438, "logps/rejected": -216.28160095214844, "loss": 0.5135, "rewards/accuracies": 0.875, "rewards/chosen": -0.18215225636959076, "rewards/margins": 0.46076107025146484, "rewards/rejected": -0.6429133415222168, "step": 881 }, { "epoch": 0.882, "grad_norm": 1.5886119604110718, "learning_rate": 1.3975e-06, "logits/chosen": 0.8223452568054199, "logits/rejected": 0.5195742249488831, "logps/chosen": -206.46725463867188, "logps/rejected": -213.30178833007812, "loss": 0.7793, "rewards/accuracies": 0.5, "rewards/chosen": -0.35676634311676025, "rewards/margins": -0.008713528513908386, "rewards/rejected": -0.3480527997016907, "step": 882 }, { "epoch": 0.883, "grad_norm": 1.7326768636703491, "learning_rate": 1.39625e-06, "logits/chosen": 0.6919585466384888, "logits/rejected": 1.027918815612793, "logps/chosen": -164.87149047851562, "logps/rejected": -276.07635498046875, "loss": 0.7404, "rewards/accuracies": 0.625, "rewards/chosen": -0.3548433780670166, "rewards/margins": 0.07728704065084457, "rewards/rejected": -0.4321304261684418, "step": 883 }, { "epoch": 0.884, "grad_norm": 1.449398398399353, "learning_rate": 1.395e-06, "logits/chosen": 0.8659530282020569, "logits/rejected": 1.0399484634399414, "logps/chosen": -210.6104278564453, "logps/rejected": -275.9422607421875, "loss": 0.5192, "rewards/accuracies": 0.875, "rewards/chosen": -0.0787663459777832, "rewards/margins": 0.6560595035552979, "rewards/rejected": -0.734825849533081, "step": 884 }, { "epoch": 0.885, "grad_norm": 1.4697751998901367, "learning_rate": 1.39375e-06, "logits/chosen": 0.6156061887741089, "logits/rejected": 1.0359694957733154, "logps/chosen": -183.43701171875, "logps/rejected": -272.8504333496094, "loss": 0.6379, "rewards/accuracies": 0.625, "rewards/chosen": -0.21910402178764343, "rewards/margins": 0.2168729603290558, "rewards/rejected": -0.4359769821166992, "step": 885 }, { "epoch": 0.886, "grad_norm": 1.3513144254684448, "learning_rate": 1.3925e-06, "logits/chosen": 0.6627638339996338, "logits/rejected": 1.0169336795806885, "logps/chosen": -249.88461303710938, "logps/rejected": -222.68551635742188, "loss": 0.4866, "rewards/accuracies": 0.625, "rewards/chosen": 0.028345927596092224, "rewards/margins": 0.6422004103660583, "rewards/rejected": -0.6138545274734497, "step": 886 }, { "epoch": 0.887, "grad_norm": 1.587241291999817, "learning_rate": 1.39125e-06, "logits/chosen": 0.3786071538925171, "logits/rejected": 0.4906216263771057, "logps/chosen": -140.41990661621094, "logps/rejected": -197.34231567382812, "loss": 0.5181, "rewards/accuracies": 0.875, "rewards/chosen": -0.10159683972597122, "rewards/margins": 0.5183708667755127, "rewards/rejected": -0.6199676990509033, "step": 887 }, { "epoch": 0.888, "grad_norm": 1.4969161748886108, "learning_rate": 1.3899999999999998e-06, "logits/chosen": 0.3989590108394623, "logits/rejected": 1.1450800895690918, "logps/chosen": -167.45469665527344, "logps/rejected": -272.6234130859375, "loss": 0.5575, "rewards/accuracies": 0.75, "rewards/chosen": -0.1623590588569641, "rewards/margins": 0.34551581740379333, "rewards/rejected": -0.5078749060630798, "step": 888 }, { "epoch": 0.889, "grad_norm": 0.9845202565193176, "learning_rate": 1.3887499999999998e-06, "logits/chosen": 0.5842801332473755, "logits/rejected": 0.2424633651971817, "logps/chosen": -232.50582885742188, "logps/rejected": -150.3061065673828, "loss": 0.5225, "rewards/accuracies": 0.875, "rewards/chosen": -0.1735023707151413, "rewards/margins": 0.5076296329498291, "rewards/rejected": -0.6811319589614868, "step": 889 }, { "epoch": 0.89, "grad_norm": 2.1856513023376465, "learning_rate": 1.3874999999999998e-06, "logits/chosen": 0.34827011823654175, "logits/rejected": 1.0639123916625977, "logps/chosen": -156.72344970703125, "logps/rejected": -305.1180114746094, "loss": 0.6229, "rewards/accuracies": 0.625, "rewards/chosen": -0.26232948899269104, "rewards/margins": 0.30728885531425476, "rewards/rejected": -0.5696183443069458, "step": 890 }, { "epoch": 0.891, "grad_norm": 1.7646009922027588, "learning_rate": 1.3862499999999999e-06, "logits/chosen": 0.8881881237030029, "logits/rejected": 0.8115545511245728, "logps/chosen": -185.96817016601562, "logps/rejected": -181.93405151367188, "loss": 0.9037, "rewards/accuracies": 0.625, "rewards/chosen": -0.4557831883430481, "rewards/margins": -0.13432812690734863, "rewards/rejected": -0.3214550018310547, "step": 891 }, { "epoch": 0.892, "grad_norm": 0.9623580574989319, "learning_rate": 1.3849999999999999e-06, "logits/chosen": 0.9362077713012695, "logits/rejected": 0.6925711631774902, "logps/chosen": -179.64524841308594, "logps/rejected": -146.7115478515625, "loss": 0.5022, "rewards/accuracies": 0.875, "rewards/chosen": -0.1194731742143631, "rewards/margins": 0.48460546135902405, "rewards/rejected": -0.6040786504745483, "step": 892 }, { "epoch": 0.893, "grad_norm": 2.692838430404663, "learning_rate": 1.3837499999999999e-06, "logits/chosen": 0.8923050165176392, "logits/rejected": 0.8493921160697937, "logps/chosen": -299.2377014160156, "logps/rejected": -187.02798461914062, "loss": 0.6961, "rewards/accuracies": 0.5, "rewards/chosen": -0.6756857633590698, "rewards/margins": 0.3514941334724426, "rewards/rejected": -1.0271798372268677, "step": 893 }, { "epoch": 0.894, "grad_norm": 1.6222935914993286, "learning_rate": 1.3825e-06, "logits/chosen": 0.41446012258529663, "logits/rejected": 0.5218898057937622, "logps/chosen": -280.0311279296875, "logps/rejected": -169.93492126464844, "loss": 0.5857, "rewards/accuracies": 0.625, "rewards/chosen": -0.24418945610523224, "rewards/margins": 0.4694889485836029, "rewards/rejected": -0.7136783599853516, "step": 894 }, { "epoch": 0.895, "grad_norm": 1.8652900457382202, "learning_rate": 1.3812500000000001e-06, "logits/chosen": 0.9157848358154297, "logits/rejected": 0.27757346630096436, "logps/chosen": -261.9938049316406, "logps/rejected": -138.52337646484375, "loss": 0.6468, "rewards/accuracies": 0.875, "rewards/chosen": -0.33124861121177673, "rewards/margins": 0.1712644100189209, "rewards/rejected": -0.5025129914283752, "step": 895 }, { "epoch": 0.896, "grad_norm": 1.3947120904922485, "learning_rate": 1.38e-06, "logits/chosen": 0.6116048097610474, "logits/rejected": 0.6595140695571899, "logps/chosen": -204.127685546875, "logps/rejected": -246.4355926513672, "loss": 0.7101, "rewards/accuracies": 0.625, "rewards/chosen": -0.1466989517211914, "rewards/margins": 0.2631716728210449, "rewards/rejected": -0.40987062454223633, "step": 896 }, { "epoch": 0.897, "grad_norm": 0.9696836471557617, "learning_rate": 1.37875e-06, "logits/chosen": 1.0685018301010132, "logits/rejected": 0.7120904922485352, "logps/chosen": -227.0920867919922, "logps/rejected": -182.04095458984375, "loss": 0.2943, "rewards/accuracies": 1.0, "rewards/chosen": 0.2024410218000412, "rewards/margins": 1.1480021476745605, "rewards/rejected": -0.9455611705780029, "step": 897 }, { "epoch": 0.898, "grad_norm": 1.3274248838424683, "learning_rate": 1.3775e-06, "logits/chosen": 0.6967720985412598, "logits/rejected": 0.6899027824401855, "logps/chosen": -135.08506774902344, "logps/rejected": -191.26251220703125, "loss": 0.5067, "rewards/accuracies": 0.75, "rewards/chosen": -0.0881163626909256, "rewards/margins": 0.7419899702072144, "rewards/rejected": -0.8301063776016235, "step": 898 }, { "epoch": 0.899, "grad_norm": 1.266689658164978, "learning_rate": 1.37625e-06, "logits/chosen": 1.1968469619750977, "logits/rejected": 0.2735234498977661, "logps/chosen": -314.3270568847656, "logps/rejected": -169.16793823242188, "loss": 0.6721, "rewards/accuracies": 0.75, "rewards/chosen": -0.17940235137939453, "rewards/margins": 0.2920053005218506, "rewards/rejected": -0.4714076519012451, "step": 899 }, { "epoch": 0.9, "grad_norm": 1.7552146911621094, "learning_rate": 1.375e-06, "logits/chosen": 0.5339395999908447, "logits/rejected": 0.4500995874404907, "logps/chosen": -219.88827514648438, "logps/rejected": -182.46572875976562, "loss": 0.7254, "rewards/accuracies": 0.625, "rewards/chosen": -0.324928879737854, "rewards/margins": 0.12806949019432068, "rewards/rejected": -0.4529983401298523, "step": 900 }, { "epoch": 0.901, "grad_norm": 2.3341825008392334, "learning_rate": 1.37375e-06, "logits/chosen": 0.9263631701469421, "logits/rejected": 0.21183495223522186, "logps/chosen": -308.7745361328125, "logps/rejected": -139.90660095214844, "loss": 0.8616, "rewards/accuracies": 0.375, "rewards/chosen": -0.31959351897239685, "rewards/margins": -0.1444246917963028, "rewards/rejected": -0.17516885697841644, "step": 901 }, { "epoch": 0.902, "grad_norm": 1.8473480939865112, "learning_rate": 1.3725e-06, "logits/chosen": 1.14193594455719, "logits/rejected": 0.6333068013191223, "logps/chosen": -264.5732116699219, "logps/rejected": -188.22607421875, "loss": 0.7151, "rewards/accuracies": 0.5, "rewards/chosen": -0.3999454379081726, "rewards/margins": 0.05723799765110016, "rewards/rejected": -0.45718348026275635, "step": 902 }, { "epoch": 0.903, "grad_norm": 1.3335527181625366, "learning_rate": 1.37125e-06, "logits/chosen": 0.586179256439209, "logits/rejected": 1.3522412776947021, "logps/chosen": -237.1947479248047, "logps/rejected": -302.75604248046875, "loss": 0.6022, "rewards/accuracies": 0.625, "rewards/chosen": -0.19195112586021423, "rewards/margins": 0.28813156485557556, "rewards/rejected": -0.4800827205181122, "step": 903 }, { "epoch": 0.904, "grad_norm": 1.2022805213928223, "learning_rate": 1.37e-06, "logits/chosen": 0.03906996548175812, "logits/rejected": 0.7381466627120972, "logps/chosen": -207.433837890625, "logps/rejected": -283.2435607910156, "loss": 0.5664, "rewards/accuracies": 0.875, "rewards/chosen": -0.1970464289188385, "rewards/margins": 0.46744686365127563, "rewards/rejected": -0.6644932627677917, "step": 904 }, { "epoch": 0.905, "grad_norm": 1.1080031394958496, "learning_rate": 1.3687499999999998e-06, "logits/chosen": 0.9117807149887085, "logits/rejected": 0.3888566792011261, "logps/chosen": -348.93756103515625, "logps/rejected": -213.84109497070312, "loss": 0.3697, "rewards/accuracies": 1.0, "rewards/chosen": 0.053052376955747604, "rewards/margins": 0.9517583847045898, "rewards/rejected": -0.8987059593200684, "step": 905 }, { "epoch": 0.906, "grad_norm": 1.8287303447723389, "learning_rate": 1.3674999999999998e-06, "logits/chosen": 0.736534595489502, "logits/rejected": 0.6487816572189331, "logps/chosen": -205.96372985839844, "logps/rejected": -374.47210693359375, "loss": 0.5745, "rewards/accuracies": 0.75, "rewards/chosen": -0.11114998161792755, "rewards/margins": 0.5051987171173096, "rewards/rejected": -0.6163486242294312, "step": 906 }, { "epoch": 0.907, "grad_norm": 1.623054027557373, "learning_rate": 1.3662499999999998e-06, "logits/chosen": 1.0298954248428345, "logits/rejected": 1.0118335485458374, "logps/chosen": -238.9768829345703, "logps/rejected": -254.80697631835938, "loss": 0.7526, "rewards/accuracies": 0.5, "rewards/chosen": -0.16678589582443237, "rewards/margins": 0.038836970925331116, "rewards/rejected": -0.2056228667497635, "step": 907 }, { "epoch": 0.908, "grad_norm": 1.1800541877746582, "learning_rate": 1.3649999999999998e-06, "logits/chosen": 1.038724422454834, "logits/rejected": 0.35047709941864014, "logps/chosen": -228.93112182617188, "logps/rejected": -136.34796142578125, "loss": 0.4825, "rewards/accuracies": 0.75, "rewards/chosen": -0.07789191603660583, "rewards/margins": 0.5576158165931702, "rewards/rejected": -0.6355077028274536, "step": 908 }, { "epoch": 0.909, "grad_norm": 1.6368927955627441, "learning_rate": 1.3637499999999999e-06, "logits/chosen": 0.3780650198459625, "logits/rejected": 0.8004998564720154, "logps/chosen": -189.5909423828125, "logps/rejected": -191.01486206054688, "loss": 0.8279, "rewards/accuracies": 0.375, "rewards/chosen": -0.4095698595046997, "rewards/margins": -0.12713585793972015, "rewards/rejected": -0.28243398666381836, "step": 909 }, { "epoch": 0.91, "grad_norm": 1.1898800134658813, "learning_rate": 1.3625e-06, "logits/chosen": 0.6283532381057739, "logits/rejected": 0.6250406503677368, "logps/chosen": -204.31077575683594, "logps/rejected": -210.953125, "loss": 0.4365, "rewards/accuracies": 0.875, "rewards/chosen": -0.22492389380931854, "rewards/margins": 0.6843199133872986, "rewards/rejected": -0.9092438220977783, "step": 910 }, { "epoch": 0.911, "grad_norm": 1.1199418306350708, "learning_rate": 1.36125e-06, "logits/chosen": 0.623859703540802, "logits/rejected": 0.9252781867980957, "logps/chosen": -170.57666015625, "logps/rejected": -187.1365966796875, "loss": 0.6132, "rewards/accuracies": 0.625, "rewards/chosen": -0.22421100735664368, "rewards/margins": 0.2570618689060211, "rewards/rejected": -0.4812728762626648, "step": 911 }, { "epoch": 0.912, "grad_norm": 1.068314552307129, "learning_rate": 1.3600000000000001e-06, "logits/chosen": 0.9012981653213501, "logits/rejected": 0.8960477113723755, "logps/chosen": -239.7100830078125, "logps/rejected": -169.37982177734375, "loss": 0.5149, "rewards/accuracies": 0.75, "rewards/chosen": -0.10248098522424698, "rewards/margins": 0.5338137149810791, "rewards/rejected": -0.6362946629524231, "step": 912 }, { "epoch": 0.913, "grad_norm": 1.0728094577789307, "learning_rate": 1.35875e-06, "logits/chosen": 0.7974905967712402, "logits/rejected": 0.717726469039917, "logps/chosen": -174.2677764892578, "logps/rejected": -201.69935607910156, "loss": 0.4875, "rewards/accuracies": 0.875, "rewards/chosen": -0.055164724588394165, "rewards/margins": 0.5959109663963318, "rewards/rejected": -0.6510757207870483, "step": 913 }, { "epoch": 0.914, "grad_norm": 1.2522761821746826, "learning_rate": 1.3575e-06, "logits/chosen": 0.9022465348243713, "logits/rejected": 0.25865882635116577, "logps/chosen": -265.7127685546875, "logps/rejected": -230.65994262695312, "loss": 0.5614, "rewards/accuracies": 0.75, "rewards/chosen": -0.19514504075050354, "rewards/margins": 0.47143393754959106, "rewards/rejected": -0.6665789484977722, "step": 914 }, { "epoch": 0.915, "grad_norm": 1.5141209363937378, "learning_rate": 1.35625e-06, "logits/chosen": 0.7422874569892883, "logits/rejected": 1.2547049522399902, "logps/chosen": -342.4610290527344, "logps/rejected": -240.1263885498047, "loss": 0.8281, "rewards/accuracies": 0.375, "rewards/chosen": -0.47067710757255554, "rewards/margins": -0.11523637920618057, "rewards/rejected": -0.35544073581695557, "step": 915 }, { "epoch": 0.916, "grad_norm": 1.3749991655349731, "learning_rate": 1.355e-06, "logits/chosen": 1.385894775390625, "logits/rejected": 0.29677149653434753, "logps/chosen": -303.8800048828125, "logps/rejected": -185.16995239257812, "loss": 0.4679, "rewards/accuracies": 0.875, "rewards/chosen": -0.060796163976192474, "rewards/margins": 0.572822630405426, "rewards/rejected": -0.6336187124252319, "step": 916 }, { "epoch": 0.917, "grad_norm": 1.1065936088562012, "learning_rate": 1.35375e-06, "logits/chosen": 0.6208105683326721, "logits/rejected": 0.8618709444999695, "logps/chosen": -190.59088134765625, "logps/rejected": -177.4392547607422, "loss": 0.4389, "rewards/accuracies": 0.75, "rewards/chosen": 0.06480292975902557, "rewards/margins": 0.8526935577392578, "rewards/rejected": -0.7878906726837158, "step": 917 }, { "epoch": 0.918, "grad_norm": 1.836554765701294, "learning_rate": 1.3525e-06, "logits/chosen": 0.15675407648086548, "logits/rejected": 1.1701598167419434, "logps/chosen": -130.3609619140625, "logps/rejected": -261.1822814941406, "loss": 0.7505, "rewards/accuracies": 0.5, "rewards/chosen": -0.36210551857948303, "rewards/margins": 0.06285979598760605, "rewards/rejected": -0.4249653220176697, "step": 918 }, { "epoch": 0.919, "grad_norm": 1.339767336845398, "learning_rate": 1.35125e-06, "logits/chosen": 1.0790523290634155, "logits/rejected": 0.3420099914073944, "logps/chosen": -219.88265991210938, "logps/rejected": -161.18569946289062, "loss": 0.6778, "rewards/accuracies": 0.75, "rewards/chosen": -0.14657525718212128, "rewards/margins": 0.22498264908790588, "rewards/rejected": -0.37155792117118835, "step": 919 }, { "epoch": 0.92, "grad_norm": 1.3931046724319458, "learning_rate": 1.35e-06, "logits/chosen": 0.859144389629364, "logits/rejected": 0.4116951525211334, "logps/chosen": -228.42808532714844, "logps/rejected": -180.4910888671875, "loss": 0.8318, "rewards/accuracies": 0.5, "rewards/chosen": -0.47572964429855347, "rewards/margins": -0.052348583936691284, "rewards/rejected": -0.4233810603618622, "step": 920 }, { "epoch": 0.921, "grad_norm": 1.2275761365890503, "learning_rate": 1.3487499999999998e-06, "logits/chosen": 0.7638009786605835, "logits/rejected": 0.5836972594261169, "logps/chosen": -171.27444458007812, "logps/rejected": -179.96554565429688, "loss": 0.4125, "rewards/accuracies": 0.875, "rewards/chosen": 0.15400820970535278, "rewards/margins": 0.8002764582633972, "rewards/rejected": -0.6462683081626892, "step": 921 }, { "epoch": 0.922, "grad_norm": 1.098158597946167, "learning_rate": 1.3474999999999998e-06, "logits/chosen": 0.5425792932510376, "logits/rejected": 1.1507470607757568, "logps/chosen": -136.45651245117188, "logps/rejected": -211.21429443359375, "loss": 0.6181, "rewards/accuracies": 0.75, "rewards/chosen": -0.22960419952869415, "rewards/margins": 0.3622058629989624, "rewards/rejected": -0.5918100476264954, "step": 922 }, { "epoch": 0.923, "grad_norm": 1.109308123588562, "learning_rate": 1.3462499999999998e-06, "logits/chosen": 0.6540423035621643, "logits/rejected": 0.7633308172225952, "logps/chosen": -220.23477172851562, "logps/rejected": -223.5801239013672, "loss": 0.6216, "rewards/accuracies": 0.75, "rewards/chosen": -0.3179049491882324, "rewards/margins": 0.32499074935913086, "rewards/rejected": -0.6428956985473633, "step": 923 }, { "epoch": 0.924, "grad_norm": 1.483243703842163, "learning_rate": 1.3449999999999998e-06, "logits/chosen": 0.9031802415847778, "logits/rejected": 1.0917285680770874, "logps/chosen": -268.55072021484375, "logps/rejected": -229.06253051757812, "loss": 0.4762, "rewards/accuracies": 0.875, "rewards/chosen": -0.18110352754592896, "rewards/margins": 0.5557022094726562, "rewards/rejected": -0.7368057370185852, "step": 924 }, { "epoch": 0.925, "grad_norm": 1.6551954746246338, "learning_rate": 1.3437499999999998e-06, "logits/chosen": 0.682947039604187, "logits/rejected": 0.9235650300979614, "logps/chosen": -115.48561096191406, "logps/rejected": -243.1217803955078, "loss": 0.3785, "rewards/accuracies": 0.875, "rewards/chosen": 0.09667368233203888, "rewards/margins": 0.885861873626709, "rewards/rejected": -0.7891882658004761, "step": 925 }, { "epoch": 0.926, "grad_norm": 1.3020192384719849, "learning_rate": 1.3425e-06, "logits/chosen": 0.9035972356796265, "logits/rejected": 0.7149280309677124, "logps/chosen": -196.93597412109375, "logps/rejected": -194.93331909179688, "loss": 0.6059, "rewards/accuracies": 0.875, "rewards/chosen": -0.20437517762184143, "rewards/margins": 0.33728331327438354, "rewards/rejected": -0.5416585206985474, "step": 926 }, { "epoch": 0.927, "grad_norm": 1.4011098146438599, "learning_rate": 1.34125e-06, "logits/chosen": 0.7820214033126831, "logits/rejected": 0.4923742711544037, "logps/chosen": -300.0621337890625, "logps/rejected": -204.26022338867188, "loss": 0.7422, "rewards/accuracies": 0.625, "rewards/chosen": -0.21863774955272675, "rewards/margins": 0.18411895632743835, "rewards/rejected": -0.4027567207813263, "step": 927 }, { "epoch": 0.928, "grad_norm": 1.930621862411499, "learning_rate": 1.34e-06, "logits/chosen": 1.2357721328735352, "logits/rejected": 0.5641683340072632, "logps/chosen": -258.9219055175781, "logps/rejected": -169.5609130859375, "loss": 0.5739, "rewards/accuracies": 0.75, "rewards/chosen": -0.2478817105293274, "rewards/margins": 0.4517066180706024, "rewards/rejected": -0.6995882987976074, "step": 928 }, { "epoch": 0.929, "grad_norm": 1.391797661781311, "learning_rate": 1.33875e-06, "logits/chosen": 0.5414198637008667, "logits/rejected": 0.9986668825149536, "logps/chosen": -192.91903686523438, "logps/rejected": -198.4952850341797, "loss": 0.4031, "rewards/accuracies": 0.875, "rewards/chosen": 0.0656590461730957, "rewards/margins": 0.87157142162323, "rewards/rejected": -0.8059123754501343, "step": 929 }, { "epoch": 0.93, "grad_norm": 2.0220463275909424, "learning_rate": 1.3375e-06, "logits/chosen": 1.1652487516403198, "logits/rejected": 0.8807476162910461, "logps/chosen": -257.56011962890625, "logps/rejected": -227.36468505859375, "loss": 0.7919, "rewards/accuracies": 0.375, "rewards/chosen": -0.37428322434425354, "rewards/margins": -0.03506365418434143, "rewards/rejected": -0.3392195701599121, "step": 930 }, { "epoch": 0.931, "grad_norm": 1.1457816362380981, "learning_rate": 1.33625e-06, "logits/chosen": 1.2271898984909058, "logits/rejected": 0.5254790782928467, "logps/chosen": -345.3774719238281, "logps/rejected": -196.4671630859375, "loss": 0.5001, "rewards/accuracies": 0.875, "rewards/chosen": -0.047016441822052, "rewards/margins": 0.616150438785553, "rewards/rejected": -0.663166880607605, "step": 931 }, { "epoch": 0.932, "grad_norm": 1.2237110137939453, "learning_rate": 1.335e-06, "logits/chosen": 0.5235464572906494, "logits/rejected": 1.0156126022338867, "logps/chosen": -213.64646911621094, "logps/rejected": -161.82261657714844, "loss": 0.3886, "rewards/accuracies": 1.0, "rewards/chosen": 0.09097342193126678, "rewards/margins": 0.77101069688797, "rewards/rejected": -0.680037260055542, "step": 932 }, { "epoch": 0.933, "grad_norm": 1.5802826881408691, "learning_rate": 1.33375e-06, "logits/chosen": 0.5266709327697754, "logits/rejected": 0.9887780547142029, "logps/chosen": -185.06324768066406, "logps/rejected": -287.95025634765625, "loss": 0.8035, "rewards/accuracies": 0.5, "rewards/chosen": -0.3431036174297333, "rewards/margins": -0.08114776015281677, "rewards/rejected": -0.2619558572769165, "step": 933 }, { "epoch": 0.934, "grad_norm": 1.3771024942398071, "learning_rate": 1.3325e-06, "logits/chosen": 0.4837348461151123, "logits/rejected": 0.537998378276825, "logps/chosen": -159.216552734375, "logps/rejected": -273.6780090332031, "loss": 0.4974, "rewards/accuracies": 0.75, "rewards/chosen": -0.06714344769716263, "rewards/margins": 0.5640031099319458, "rewards/rejected": -0.6311465501785278, "step": 934 }, { "epoch": 0.935, "grad_norm": 1.0508297681808472, "learning_rate": 1.33125e-06, "logits/chosen": 0.2530682384967804, "logits/rejected": 0.8246477246284485, "logps/chosen": -256.2955322265625, "logps/rejected": -251.36614990234375, "loss": 0.3876, "rewards/accuracies": 0.875, "rewards/chosen": 0.05865888297557831, "rewards/margins": 0.9223098158836365, "rewards/rejected": -0.8636508584022522, "step": 935 }, { "epoch": 0.936, "grad_norm": 1.3079086542129517, "learning_rate": 1.33e-06, "logits/chosen": 0.9559457898139954, "logits/rejected": 0.9482189416885376, "logps/chosen": -224.17288208007812, "logps/rejected": -210.96127319335938, "loss": 0.3248, "rewards/accuracies": 1.0, "rewards/chosen": -0.0020192191004753113, "rewards/margins": 1.1273577213287354, "rewards/rejected": -1.1293768882751465, "step": 936 }, { "epoch": 0.937, "grad_norm": 1.2997103929519653, "learning_rate": 1.32875e-06, "logits/chosen": 0.9620323181152344, "logits/rejected": 0.9755074977874756, "logps/chosen": -406.945556640625, "logps/rejected": -232.39141845703125, "loss": 0.5425, "rewards/accuracies": 0.75, "rewards/chosen": -0.06354779750108719, "rewards/margins": 0.5799987316131592, "rewards/rejected": -0.6435465812683105, "step": 937 }, { "epoch": 0.938, "grad_norm": 2.0173113346099854, "learning_rate": 1.3274999999999998e-06, "logits/chosen": 1.118606448173523, "logits/rejected": 0.43781477212905884, "logps/chosen": -286.9566650390625, "logps/rejected": -173.2505340576172, "loss": 0.7538, "rewards/accuracies": 0.5, "rewards/chosen": -0.4380507469177246, "rewards/margins": 0.05723462998867035, "rewards/rejected": -0.49528539180755615, "step": 938 }, { "epoch": 0.939, "grad_norm": 1.7452341318130493, "learning_rate": 1.3262499999999998e-06, "logits/chosen": 1.073616623878479, "logits/rejected": 0.49481409788131714, "logps/chosen": -389.4587707519531, "logps/rejected": -171.05523681640625, "loss": 0.6588, "rewards/accuracies": 0.625, "rewards/chosen": -0.34119489789009094, "rewards/margins": 0.12555742263793945, "rewards/rejected": -0.4667523503303528, "step": 939 }, { "epoch": 0.94, "grad_norm": 2.09262752532959, "learning_rate": 1.3249999999999998e-06, "logits/chosen": 0.7642920017242432, "logits/rejected": 0.783728301525116, "logps/chosen": -267.7671203613281, "logps/rejected": -178.0279083251953, "loss": 0.7316, "rewards/accuracies": 0.625, "rewards/chosen": -0.31400203704833984, "rewards/margins": 0.0516047328710556, "rewards/rejected": -0.36560678482055664, "step": 940 }, { "epoch": 0.941, "grad_norm": 1.612867832183838, "learning_rate": 1.3237499999999998e-06, "logits/chosen": 1.2292985916137695, "logits/rejected": 0.9695636034011841, "logps/chosen": -251.375244140625, "logps/rejected": -208.08447265625, "loss": 0.5179, "rewards/accuracies": 0.75, "rewards/chosen": -0.20037461817264557, "rewards/margins": 0.5748899579048157, "rewards/rejected": -0.7752645611763, "step": 941 }, { "epoch": 0.942, "grad_norm": 1.0817149877548218, "learning_rate": 1.3225e-06, "logits/chosen": 0.32559967041015625, "logits/rejected": 0.47439631819725037, "logps/chosen": -172.37071228027344, "logps/rejected": -174.02145385742188, "loss": 0.4606, "rewards/accuracies": 0.875, "rewards/chosen": 0.06005163490772247, "rewards/margins": 0.6180134415626526, "rewards/rejected": -0.5579618215560913, "step": 942 }, { "epoch": 0.943, "grad_norm": 2.089449167251587, "learning_rate": 1.32125e-06, "logits/chosen": 0.6505305767059326, "logits/rejected": 0.6257936954498291, "logps/chosen": -287.56982421875, "logps/rejected": -175.2873992919922, "loss": 0.689, "rewards/accuracies": 0.625, "rewards/chosen": -0.25166815519332886, "rewards/margins": 0.16631948947906494, "rewards/rejected": -0.4179876446723938, "step": 943 }, { "epoch": 0.944, "grad_norm": 1.447571873664856, "learning_rate": 1.32e-06, "logits/chosen": 0.6929805278778076, "logits/rejected": 0.5807334780693054, "logps/chosen": -174.66015625, "logps/rejected": -197.47767639160156, "loss": 0.6081, "rewards/accuracies": 0.75, "rewards/chosen": -0.14290191233158112, "rewards/margins": 0.4897392988204956, "rewards/rejected": -0.6326411962509155, "step": 944 }, { "epoch": 0.945, "grad_norm": 1.3829938173294067, "learning_rate": 1.31875e-06, "logits/chosen": 0.7812260985374451, "logits/rejected": 0.008342694491147995, "logps/chosen": -306.72503662109375, "logps/rejected": -161.5153045654297, "loss": 0.5839, "rewards/accuracies": 0.625, "rewards/chosen": -0.20261459052562714, "rewards/margins": 0.41600438952445984, "rewards/rejected": -0.6186190247535706, "step": 945 }, { "epoch": 0.946, "grad_norm": 1.1762398481369019, "learning_rate": 1.3174999999999999e-06, "logits/chosen": 0.6790724396705627, "logits/rejected": 0.8757444620132446, "logps/chosen": -131.71817016601562, "logps/rejected": -205.23934936523438, "loss": 0.5621, "rewards/accuracies": 0.75, "rewards/chosen": -0.1597513109445572, "rewards/margins": 0.33993908762931824, "rewards/rejected": -0.49969038367271423, "step": 946 }, { "epoch": 0.947, "grad_norm": 1.0822423696517944, "learning_rate": 1.3162499999999999e-06, "logits/chosen": 0.23632687330245972, "logits/rejected": 0.8986390233039856, "logps/chosen": -131.2552032470703, "logps/rejected": -240.42587280273438, "loss": 0.501, "rewards/accuracies": 0.75, "rewards/chosen": -0.1510668694972992, "rewards/margins": 0.5896661877632141, "rewards/rejected": -0.7407330274581909, "step": 947 }, { "epoch": 0.948, "grad_norm": 1.2444632053375244, "learning_rate": 1.315e-06, "logits/chosen": 0.9307480454444885, "logits/rejected": 0.8913495540618896, "logps/chosen": -265.2528991699219, "logps/rejected": -238.42532348632812, "loss": 0.5739, "rewards/accuracies": 0.75, "rewards/chosen": -0.10869413614273071, "rewards/margins": 0.4451776146888733, "rewards/rejected": -0.5538716912269592, "step": 948 }, { "epoch": 0.949, "grad_norm": 1.2203505039215088, "learning_rate": 1.31375e-06, "logits/chosen": 0.9628933072090149, "logits/rejected": 0.7978300452232361, "logps/chosen": -303.4044189453125, "logps/rejected": -185.03843688964844, "loss": 0.5492, "rewards/accuracies": 0.75, "rewards/chosen": -0.015598684549331665, "rewards/margins": 0.4752826690673828, "rewards/rejected": -0.49088138341903687, "step": 949 }, { "epoch": 0.95, "grad_norm": 1.4484578371047974, "learning_rate": 1.3125e-06, "logits/chosen": 0.5937182307243347, "logits/rejected": 1.2547211647033691, "logps/chosen": -186.9585723876953, "logps/rejected": -293.1280822753906, "loss": 0.3438, "rewards/accuracies": 1.0, "rewards/chosen": 0.12847137451171875, "rewards/margins": 1.0557363033294678, "rewards/rejected": -0.9272648096084595, "step": 950 }, { "epoch": 0.951, "grad_norm": 1.7344406843185425, "learning_rate": 1.31125e-06, "logits/chosen": 0.9878774881362915, "logits/rejected": 0.3352017402648926, "logps/chosen": -325.87823486328125, "logps/rejected": -141.93478393554688, "loss": 0.6344, "rewards/accuracies": 0.625, "rewards/chosen": -0.2289734035730362, "rewards/margins": 0.21871903538703918, "rewards/rejected": -0.4476923942565918, "step": 951 }, { "epoch": 0.952, "grad_norm": 0.9391379952430725, "learning_rate": 1.31e-06, "logits/chosen": 0.8595563769340515, "logits/rejected": 0.5143384337425232, "logps/chosen": -218.9112091064453, "logps/rejected": -164.01593017578125, "loss": 0.4522, "rewards/accuracies": 0.875, "rewards/chosen": -0.04428168013691902, "rewards/margins": 0.7051953673362732, "rewards/rejected": -0.7494770288467407, "step": 952 }, { "epoch": 0.953, "grad_norm": 1.3193058967590332, "learning_rate": 1.30875e-06, "logits/chosen": 0.32365766167640686, "logits/rejected": 0.23626014590263367, "logps/chosen": -132.18798828125, "logps/rejected": -150.67733764648438, "loss": 0.3937, "rewards/accuracies": 0.875, "rewards/chosen": -0.02596311643719673, "rewards/margins": 0.8065215945243835, "rewards/rejected": -0.8324846029281616, "step": 953 }, { "epoch": 0.954, "grad_norm": 1.2000073194503784, "learning_rate": 1.3075e-06, "logits/chosen": 0.3963034152984619, "logits/rejected": 0.7422875165939331, "logps/chosen": -154.4480743408203, "logps/rejected": -257.890380859375, "loss": 0.6057, "rewards/accuracies": 0.75, "rewards/chosen": -0.04206562042236328, "rewards/margins": 0.4293833076953888, "rewards/rejected": -0.4714488983154297, "step": 954 }, { "epoch": 0.955, "grad_norm": 1.050464391708374, "learning_rate": 1.3062499999999998e-06, "logits/chosen": 0.8174654245376587, "logits/rejected": 0.9979656338691711, "logps/chosen": -208.31094360351562, "logps/rejected": -177.45298767089844, "loss": 0.567, "rewards/accuracies": 0.625, "rewards/chosen": -0.1691681444644928, "rewards/margins": 0.4167383015155792, "rewards/rejected": -0.585906445980072, "step": 955 }, { "epoch": 0.956, "grad_norm": 1.3199574947357178, "learning_rate": 1.3049999999999998e-06, "logits/chosen": 0.6583003997802734, "logits/rejected": 0.16492050886154175, "logps/chosen": -271.4279479980469, "logps/rejected": -171.2433319091797, "loss": 0.3304, "rewards/accuracies": 0.875, "rewards/chosen": 0.18869467079639435, "rewards/margins": 1.2859807014465332, "rewards/rejected": -1.0972861051559448, "step": 956 }, { "epoch": 0.957, "grad_norm": 1.3830201625823975, "learning_rate": 1.30375e-06, "logits/chosen": 0.574012279510498, "logits/rejected": 0.8154116272926331, "logps/chosen": -219.5290069580078, "logps/rejected": -199.80577087402344, "loss": 0.6062, "rewards/accuracies": 0.75, "rewards/chosen": -0.3550899624824524, "rewards/margins": 0.353854775428772, "rewards/rejected": -0.7089447379112244, "step": 957 }, { "epoch": 0.958, "grad_norm": 1.4791102409362793, "learning_rate": 1.3025e-06, "logits/chosen": 0.8795971870422363, "logits/rejected": 0.9048504829406738, "logps/chosen": -221.6392059326172, "logps/rejected": -184.78736877441406, "loss": 0.5911, "rewards/accuracies": 0.625, "rewards/chosen": -0.3276076316833496, "rewards/margins": 0.5505509972572327, "rewards/rejected": -0.8781586289405823, "step": 958 }, { "epoch": 0.959, "grad_norm": 1.7578521966934204, "learning_rate": 1.30125e-06, "logits/chosen": 0.5724989771842957, "logits/rejected": 0.8486925959587097, "logps/chosen": -105.54882049560547, "logps/rejected": -248.742431640625, "loss": 0.5229, "rewards/accuracies": 0.75, "rewards/chosen": -0.22237235307693481, "rewards/margins": 0.5650813579559326, "rewards/rejected": -0.7874537110328674, "step": 959 }, { "epoch": 0.96, "grad_norm": 1.1585209369659424, "learning_rate": 1.3e-06, "logits/chosen": 1.1549530029296875, "logits/rejected": 0.7316799163818359, "logps/chosen": -266.1779479980469, "logps/rejected": -161.86737060546875, "loss": 0.4803, "rewards/accuracies": 0.75, "rewards/chosen": -0.008078535087406635, "rewards/margins": 0.6567882895469666, "rewards/rejected": -0.6648668646812439, "step": 960 }, { "epoch": 0.961, "grad_norm": 1.1378726959228516, "learning_rate": 1.29875e-06, "logits/chosen": 0.8001881241798401, "logits/rejected": 0.6186779737472534, "logps/chosen": -181.7181396484375, "logps/rejected": -167.06149291992188, "loss": 0.5347, "rewards/accuracies": 0.75, "rewards/chosen": -0.11072878539562225, "rewards/margins": 0.5285705327987671, "rewards/rejected": -0.6392993330955505, "step": 961 }, { "epoch": 0.962, "grad_norm": 0.9434884786605835, "learning_rate": 1.2975e-06, "logits/chosen": 0.798587441444397, "logits/rejected": 0.3150412440299988, "logps/chosen": -224.7817840576172, "logps/rejected": -169.4275665283203, "loss": 0.3863, "rewards/accuracies": 0.875, "rewards/chosen": -0.06295862793922424, "rewards/margins": 0.8533462882041931, "rewards/rejected": -0.9163048267364502, "step": 962 }, { "epoch": 0.963, "grad_norm": 1.3226816654205322, "learning_rate": 1.2962499999999999e-06, "logits/chosen": 0.5526736974716187, "logits/rejected": 0.6086435317993164, "logps/chosen": -153.17825317382812, "logps/rejected": -190.28677368164062, "loss": 0.5178, "rewards/accuracies": 0.75, "rewards/chosen": -0.22586068511009216, "rewards/margins": 0.530813992023468, "rewards/rejected": -0.7566746473312378, "step": 963 }, { "epoch": 0.964, "grad_norm": 1.0540539026260376, "learning_rate": 1.2949999999999999e-06, "logits/chosen": 0.5685205459594727, "logits/rejected": 0.38433289527893066, "logps/chosen": -265.8465881347656, "logps/rejected": -155.21209716796875, "loss": 0.3903, "rewards/accuracies": 0.875, "rewards/chosen": -0.0092073455452919, "rewards/margins": 0.8466264009475708, "rewards/rejected": -0.8558337092399597, "step": 964 }, { "epoch": 0.965, "grad_norm": 1.1678149700164795, "learning_rate": 1.29375e-06, "logits/chosen": 0.6819874048233032, "logits/rejected": 0.38984039425849915, "logps/chosen": -186.31884765625, "logps/rejected": -151.095703125, "loss": 0.4805, "rewards/accuracies": 0.75, "rewards/chosen": 0.009210862219333649, "rewards/margins": 0.82137131690979, "rewards/rejected": -0.8121603727340698, "step": 965 }, { "epoch": 0.966, "grad_norm": 1.2832304239273071, "learning_rate": 1.2925e-06, "logits/chosen": 0.7682188153266907, "logits/rejected": 0.7555311322212219, "logps/chosen": -180.80572509765625, "logps/rejected": -226.88739013671875, "loss": 0.3222, "rewards/accuracies": 0.875, "rewards/chosen": 0.12306413054466248, "rewards/margins": 1.0533047914505005, "rewards/rejected": -0.9302407503128052, "step": 966 }, { "epoch": 0.967, "grad_norm": 1.4763944149017334, "learning_rate": 1.29125e-06, "logits/chosen": 0.517465353012085, "logits/rejected": 0.8785140514373779, "logps/chosen": -184.78189086914062, "logps/rejected": -215.08949279785156, "loss": 0.6727, "rewards/accuracies": 0.625, "rewards/chosen": -0.39556026458740234, "rewards/margins": 0.38111960887908936, "rewards/rejected": -0.7766798734664917, "step": 967 }, { "epoch": 0.968, "grad_norm": 1.2085455656051636, "learning_rate": 1.29e-06, "logits/chosen": 0.4657043218612671, "logits/rejected": 0.655015766620636, "logps/chosen": -146.36036682128906, "logps/rejected": -192.96115112304688, "loss": 0.2797, "rewards/accuracies": 1.0, "rewards/chosen": 0.11272755265235901, "rewards/margins": 1.2004377841949463, "rewards/rejected": -1.0877103805541992, "step": 968 }, { "epoch": 0.969, "grad_norm": 1.6787225008010864, "learning_rate": 1.28875e-06, "logits/chosen": 1.3346463441848755, "logits/rejected": 0.3851624131202698, "logps/chosen": -309.69049072265625, "logps/rejected": -165.044189453125, "loss": 0.5573, "rewards/accuracies": 0.75, "rewards/chosen": -0.34313517808914185, "rewards/margins": 0.5031852722167969, "rewards/rejected": -0.8463204503059387, "step": 969 }, { "epoch": 0.97, "grad_norm": 1.4653741121292114, "learning_rate": 1.2875e-06, "logits/chosen": 0.26829713582992554, "logits/rejected": 0.682020902633667, "logps/chosen": -145.6267547607422, "logps/rejected": -226.6839141845703, "loss": 0.7031, "rewards/accuracies": 0.625, "rewards/chosen": -0.35143595933914185, "rewards/margins": 0.2132338583469391, "rewards/rejected": -0.5646697878837585, "step": 970 }, { "epoch": 0.971, "grad_norm": 1.1016570329666138, "learning_rate": 1.2862499999999998e-06, "logits/chosen": 0.4484003782272339, "logits/rejected": 0.40839338302612305, "logps/chosen": -159.00242614746094, "logps/rejected": -171.42864990234375, "loss": 0.4891, "rewards/accuracies": 0.75, "rewards/chosen": -0.06217843294143677, "rewards/margins": 0.7482416033744812, "rewards/rejected": -0.810420036315918, "step": 971 }, { "epoch": 0.972, "grad_norm": 1.2122769355773926, "learning_rate": 1.2849999999999998e-06, "logits/chosen": 0.925641655921936, "logits/rejected": 0.625074028968811, "logps/chosen": -319.7994079589844, "logps/rejected": -226.09027099609375, "loss": 0.47, "rewards/accuracies": 0.625, "rewards/chosen": -0.12105546146631241, "rewards/margins": 0.7821508646011353, "rewards/rejected": -0.9032062888145447, "step": 972 }, { "epoch": 0.973, "grad_norm": 0.9143964648246765, "learning_rate": 1.28375e-06, "logits/chosen": 0.5914730429649353, "logits/rejected": 0.7155131101608276, "logps/chosen": -134.60707092285156, "logps/rejected": -202.66061401367188, "loss": 0.378, "rewards/accuracies": 0.875, "rewards/chosen": 0.052900850772857666, "rewards/margins": 1.0230423212051392, "rewards/rejected": -0.9701415300369263, "step": 973 }, { "epoch": 0.974, "grad_norm": 1.314026951789856, "learning_rate": 1.2825e-06, "logits/chosen": 0.8632372617721558, "logits/rejected": 0.6434943675994873, "logps/chosen": -217.57394409179688, "logps/rejected": -167.55355834960938, "loss": 0.5618, "rewards/accuracies": 0.625, "rewards/chosen": -0.19007253646850586, "rewards/margins": 0.43356427550315857, "rewards/rejected": -0.6236368417739868, "step": 974 }, { "epoch": 0.975, "grad_norm": 1.6871527433395386, "learning_rate": 1.28125e-06, "logits/chosen": 0.8545501232147217, "logits/rejected": 1.232175350189209, "logps/chosen": -271.9912414550781, "logps/rejected": -292.3136291503906, "loss": 0.7634, "rewards/accuracies": 0.5, "rewards/chosen": -0.3806224763393402, "rewards/margins": 0.08042007684707642, "rewards/rejected": -0.461042582988739, "step": 975 }, { "epoch": 0.976, "grad_norm": 1.0830068588256836, "learning_rate": 1.28e-06, "logits/chosen": 1.0021286010742188, "logits/rejected": 0.6217373609542847, "logps/chosen": -276.50115966796875, "logps/rejected": -223.31747436523438, "loss": 0.4113, "rewards/accuracies": 0.875, "rewards/chosen": -0.06892509013414383, "rewards/margins": 0.9043062925338745, "rewards/rejected": -0.9732313752174377, "step": 976 }, { "epoch": 0.977, "grad_norm": 1.5650935173034668, "learning_rate": 1.27875e-06, "logits/chosen": 0.9344983100891113, "logits/rejected": 0.538056492805481, "logps/chosen": -312.04461669921875, "logps/rejected": -152.2235107421875, "loss": 0.5113, "rewards/accuracies": 0.875, "rewards/chosen": -0.25104576349258423, "rewards/margins": 0.5250248908996582, "rewards/rejected": -0.7760707139968872, "step": 977 }, { "epoch": 0.978, "grad_norm": 1.4167083501815796, "learning_rate": 1.2775e-06, "logits/chosen": 0.8860515356063843, "logits/rejected": 0.990925669670105, "logps/chosen": -216.3712615966797, "logps/rejected": -214.8328094482422, "loss": 0.4444, "rewards/accuracies": 0.875, "rewards/chosen": -0.1477947235107422, "rewards/margins": 0.7815337181091309, "rewards/rejected": -0.929328441619873, "step": 978 }, { "epoch": 0.979, "grad_norm": 1.5092636346817017, "learning_rate": 1.27625e-06, "logits/chosen": 0.7861217856407166, "logits/rejected": 0.8387948274612427, "logps/chosen": -293.1144714355469, "logps/rejected": -198.1114501953125, "loss": 0.5288, "rewards/accuracies": 0.75, "rewards/chosen": -0.2550410032272339, "rewards/margins": 0.548112154006958, "rewards/rejected": -0.8031532764434814, "step": 979 }, { "epoch": 0.98, "grad_norm": 1.2366300821304321, "learning_rate": 1.2749999999999999e-06, "logits/chosen": 1.3689906597137451, "logits/rejected": 0.5796546936035156, "logps/chosen": -280.5626220703125, "logps/rejected": -161.8746795654297, "loss": 0.5451, "rewards/accuracies": 0.75, "rewards/chosen": -0.08403263241052628, "rewards/margins": 0.4205564558506012, "rewards/rejected": -0.5045890808105469, "step": 980 }, { "epoch": 0.981, "grad_norm": 1.5886600017547607, "learning_rate": 1.2737499999999999e-06, "logits/chosen": 0.46968838572502136, "logits/rejected": 0.7385321855545044, "logps/chosen": -178.19143676757812, "logps/rejected": -272.2085876464844, "loss": 0.6004, "rewards/accuracies": 0.625, "rewards/chosen": -0.10958176106214523, "rewards/margins": 0.40984344482421875, "rewards/rejected": -0.5194252729415894, "step": 981 }, { "epoch": 0.982, "grad_norm": 1.7049529552459717, "learning_rate": 1.2724999999999999e-06, "logits/chosen": 1.1308308839797974, "logits/rejected": 0.23960626125335693, "logps/chosen": -213.993896484375, "logps/rejected": -152.05812072753906, "loss": 0.7527, "rewards/accuracies": 0.5, "rewards/chosen": -0.3493622839450836, "rewards/margins": 0.12196006625890732, "rewards/rejected": -0.4713223874568939, "step": 982 }, { "epoch": 0.983, "grad_norm": 1.206504225730896, "learning_rate": 1.27125e-06, "logits/chosen": 0.9539329409599304, "logits/rejected": 0.6792821288108826, "logps/chosen": -244.84934997558594, "logps/rejected": -206.2161102294922, "loss": 0.5062, "rewards/accuracies": 0.75, "rewards/chosen": -0.07423362880945206, "rewards/margins": 0.7262243032455444, "rewards/rejected": -0.8004578948020935, "step": 983 }, { "epoch": 0.984, "grad_norm": 1.693018913269043, "learning_rate": 1.27e-06, "logits/chosen": 0.7829250693321228, "logits/rejected": 0.7372003793716431, "logps/chosen": -282.9153747558594, "logps/rejected": -206.5380859375, "loss": 0.6951, "rewards/accuracies": 0.75, "rewards/chosen": -0.4814613461494446, "rewards/margins": 0.22213315963745117, "rewards/rejected": -0.7035945653915405, "step": 984 }, { "epoch": 0.985, "grad_norm": 1.0727367401123047, "learning_rate": 1.26875e-06, "logits/chosen": 0.8320564031600952, "logits/rejected": 0.5135716199874878, "logps/chosen": -380.9358825683594, "logps/rejected": -193.13369750976562, "loss": 0.3845, "rewards/accuracies": 0.875, "rewards/chosen": -0.1044306755065918, "rewards/margins": 0.8177229762077332, "rewards/rejected": -0.9221535921096802, "step": 985 }, { "epoch": 0.986, "grad_norm": 1.3088600635528564, "learning_rate": 1.2675e-06, "logits/chosen": 1.2297210693359375, "logits/rejected": 0.8255873918533325, "logps/chosen": -266.1742248535156, "logps/rejected": -206.56687927246094, "loss": 0.5301, "rewards/accuracies": 0.75, "rewards/chosen": -0.2707939147949219, "rewards/margins": 0.6389590501785278, "rewards/rejected": -0.9097529649734497, "step": 986 }, { "epoch": 0.987, "grad_norm": 1.3582837581634521, "learning_rate": 1.26625e-06, "logits/chosen": 0.5884078741073608, "logits/rejected": 0.8429640531539917, "logps/chosen": -167.2076416015625, "logps/rejected": -297.1697998046875, "loss": 0.6072, "rewards/accuracies": 0.5, "rewards/chosen": -0.30706721544265747, "rewards/margins": 0.2608567178249359, "rewards/rejected": -0.567923903465271, "step": 987 }, { "epoch": 0.988, "grad_norm": 1.458215355873108, "learning_rate": 1.2649999999999997e-06, "logits/chosen": 0.5832163095474243, "logits/rejected": 1.0063236951828003, "logps/chosen": -131.51254272460938, "logps/rejected": -201.13714599609375, "loss": 0.7045, "rewards/accuracies": 0.625, "rewards/chosen": -0.41814756393432617, "rewards/margins": 0.22599995136260986, "rewards/rejected": -0.644147515296936, "step": 988 }, { "epoch": 0.989, "grad_norm": 1.4954781532287598, "learning_rate": 1.26375e-06, "logits/chosen": 0.40369558334350586, "logits/rejected": 0.8426293730735779, "logps/chosen": -161.79759216308594, "logps/rejected": -315.1351318359375, "loss": 0.6098, "rewards/accuracies": 0.75, "rewards/chosen": -0.13164129853248596, "rewards/margins": 0.30968207120895386, "rewards/rejected": -0.4413233697414398, "step": 989 }, { "epoch": 0.99, "grad_norm": 1.260823369026184, "learning_rate": 1.2625e-06, "logits/chosen": 1.3028727769851685, "logits/rejected": 0.25664788484573364, "logps/chosen": -246.27423095703125, "logps/rejected": -150.49664306640625, "loss": 0.5807, "rewards/accuracies": 0.75, "rewards/chosen": -0.2055654227733612, "rewards/margins": 0.44802290201187134, "rewards/rejected": -0.6535883545875549, "step": 990 }, { "epoch": 0.991, "grad_norm": 1.1941568851470947, "learning_rate": 1.26125e-06, "logits/chosen": 0.6910281181335449, "logits/rejected": 0.6736725568771362, "logps/chosen": -215.34898376464844, "logps/rejected": -141.96571350097656, "loss": 0.4438, "rewards/accuracies": 0.875, "rewards/chosen": 0.0016737356781959534, "rewards/margins": 0.759501576423645, "rewards/rejected": -0.757827877998352, "step": 991 }, { "epoch": 0.992, "grad_norm": 2.2484681606292725, "learning_rate": 1.26e-06, "logits/chosen": 1.0433590412139893, "logits/rejected": 0.28205862641334534, "logps/chosen": -257.97784423828125, "logps/rejected": -220.51979064941406, "loss": 0.9569, "rewards/accuracies": 0.375, "rewards/chosen": -0.6742290258407593, "rewards/margins": -0.23217278718948364, "rewards/rejected": -0.442056268453598, "step": 992 }, { "epoch": 0.993, "grad_norm": 1.894972801208496, "learning_rate": 1.25875e-06, "logits/chosen": 1.3871018886566162, "logits/rejected": 0.7401602864265442, "logps/chosen": -383.9561767578125, "logps/rejected": -177.70858764648438, "loss": 0.5677, "rewards/accuracies": 0.625, "rewards/chosen": -0.20023614168167114, "rewards/margins": 0.6850894689559937, "rewards/rejected": -0.88532555103302, "step": 993 }, { "epoch": 0.994, "grad_norm": 1.3810898065567017, "learning_rate": 1.2575e-06, "logits/chosen": 0.9467871189117432, "logits/rejected": 0.6147024035453796, "logps/chosen": -164.43524169921875, "logps/rejected": -176.4134063720703, "loss": 0.6535, "rewards/accuracies": 0.5, "rewards/chosen": -0.13851794600486755, "rewards/margins": 0.3758319616317749, "rewards/rejected": -0.5143499374389648, "step": 994 }, { "epoch": 0.995, "grad_norm": 1.3240684270858765, "learning_rate": 1.25625e-06, "logits/chosen": 1.0316517353057861, "logits/rejected": 0.510753870010376, "logps/chosen": -264.9785461425781, "logps/rejected": -185.44155883789062, "loss": 0.4645, "rewards/accuracies": 0.75, "rewards/chosen": -0.1825242042541504, "rewards/margins": 0.6678446531295776, "rewards/rejected": -0.850368857383728, "step": 995 }, { "epoch": 0.996, "grad_norm": 1.2064716815948486, "learning_rate": 1.2549999999999998e-06, "logits/chosen": 1.19627046585083, "logits/rejected": 0.8020365238189697, "logps/chosen": -262.3677978515625, "logps/rejected": -218.63536071777344, "loss": 0.553, "rewards/accuracies": 0.75, "rewards/chosen": -0.25341796875, "rewards/margins": 0.4614294171333313, "rewards/rejected": -0.7148473858833313, "step": 996 }, { "epoch": 0.997, "grad_norm": 0.8806898593902588, "learning_rate": 1.2537499999999999e-06, "logits/chosen": 0.6168298125267029, "logits/rejected": 1.0492502450942993, "logps/chosen": -227.1119384765625, "logps/rejected": -219.40582275390625, "loss": 0.2805, "rewards/accuracies": 1.0, "rewards/chosen": 0.04786257818341255, "rewards/margins": 1.2160085439682007, "rewards/rejected": -1.168146014213562, "step": 997 }, { "epoch": 0.998, "grad_norm": 1.2710604667663574, "learning_rate": 1.2524999999999999e-06, "logits/chosen": 0.8720124363899231, "logits/rejected": 0.5956549644470215, "logps/chosen": -255.34185791015625, "logps/rejected": -164.58755493164062, "loss": 0.5878, "rewards/accuracies": 0.625, "rewards/chosen": -0.20646563172340393, "rewards/margins": 0.540126621723175, "rewards/rejected": -0.7465922832489014, "step": 998 }, { "epoch": 0.999, "grad_norm": 2.3672854900360107, "learning_rate": 1.2512499999999999e-06, "logits/chosen": 0.787426233291626, "logits/rejected": 1.2462575435638428, "logps/chosen": -213.3551788330078, "logps/rejected": -237.8119354248047, "loss": 0.7875, "rewards/accuracies": 0.5, "rewards/chosen": -0.15249386429786682, "rewards/margins": 0.16598045825958252, "rewards/rejected": -0.31847429275512695, "step": 999 }, { "epoch": 1.0, "grad_norm": 1.1537644863128662, "learning_rate": 1.2499999999999999e-06, "logits/chosen": 0.95160311460495, "logits/rejected": 0.9931919574737549, "logps/chosen": -216.23135375976562, "logps/rejected": -168.84793090820312, "loss": 0.436, "rewards/accuracies": 0.875, "rewards/chosen": -0.10827923566102982, "rewards/margins": 0.8745056390762329, "rewards/rejected": -0.9827848672866821, "step": 1000 }, { "epoch": 1.001, "grad_norm": 1.108749508857727, "learning_rate": 1.24875e-06, "logits/chosen": 0.9294902682304382, "logits/rejected": 0.5632860660552979, "logps/chosen": -263.7054138183594, "logps/rejected": -170.55636596679688, "loss": 0.4616, "rewards/accuracies": 0.75, "rewards/chosen": 0.09740610420703888, "rewards/margins": 0.7421407103538513, "rewards/rejected": -0.6447345614433289, "step": 1001 }, { "epoch": 1.002, "grad_norm": 2.4063897132873535, "learning_rate": 1.2475e-06, "logits/chosen": 1.2583866119384766, "logits/rejected": 0.43130987882614136, "logps/chosen": -416.81024169921875, "logps/rejected": -130.24842834472656, "loss": 0.7362, "rewards/accuracies": 0.625, "rewards/chosen": -0.4298596978187561, "rewards/margins": 0.16133907437324524, "rewards/rejected": -0.591198742389679, "step": 1002 }, { "epoch": 1.003, "grad_norm": 1.5643421411514282, "learning_rate": 1.24625e-06, "logits/chosen": 0.20974299311637878, "logits/rejected": 1.066982626914978, "logps/chosen": -169.49815368652344, "logps/rejected": -269.33251953125, "loss": 0.7995, "rewards/accuracies": 0.625, "rewards/chosen": -0.5531753301620483, "rewards/margins": 0.029687363654375076, "rewards/rejected": -0.5828626751899719, "step": 1003 }, { "epoch": 1.004, "grad_norm": 1.01950204372406, "learning_rate": 1.2450000000000002e-06, "logits/chosen": 0.8373785018920898, "logits/rejected": 1.1091270446777344, "logps/chosen": -220.5594482421875, "logps/rejected": -249.27169799804688, "loss": 0.5274, "rewards/accuracies": 0.5, "rewards/chosen": -0.16163063049316406, "rewards/margins": 0.5728954076766968, "rewards/rejected": -0.7345260381698608, "step": 1004 }, { "epoch": 1.005, "grad_norm": 1.2958334684371948, "learning_rate": 1.24375e-06, "logits/chosen": 0.850496768951416, "logits/rejected": 0.9690978527069092, "logps/chosen": -130.93951416015625, "logps/rejected": -229.52615356445312, "loss": 0.4143, "rewards/accuracies": 1.0, "rewards/chosen": -0.1071997582912445, "rewards/margins": 0.7887861728668213, "rewards/rejected": -0.8959859609603882, "step": 1005 }, { "epoch": 1.006, "grad_norm": 1.4926834106445312, "learning_rate": 1.2425e-06, "logits/chosen": 0.8774628639221191, "logits/rejected": 0.5075086951255798, "logps/chosen": -210.734619140625, "logps/rejected": -176.00027465820312, "loss": 0.6279, "rewards/accuracies": 0.5, "rewards/chosen": -0.2644529342651367, "rewards/margins": 0.3871576189994812, "rewards/rejected": -0.6516105532646179, "step": 1006 }, { "epoch": 1.007, "grad_norm": 1.1877944469451904, "learning_rate": 1.24125e-06, "logits/chosen": 0.9319489598274231, "logits/rejected": 0.5672241449356079, "logps/chosen": -236.7650146484375, "logps/rejected": -249.08453369140625, "loss": 0.5346, "rewards/accuracies": 0.75, "rewards/chosen": -0.0019295737147331238, "rewards/margins": 0.5538799166679382, "rewards/rejected": -0.555809497833252, "step": 1007 }, { "epoch": 1.008, "grad_norm": 2.3298182487487793, "learning_rate": 1.24e-06, "logits/chosen": 0.988804817199707, "logits/rejected": 0.256790429353714, "logps/chosen": -394.99224853515625, "logps/rejected": -155.62132263183594, "loss": 0.8601, "rewards/accuracies": 0.625, "rewards/chosen": -0.5257980823516846, "rewards/margins": -0.221825510263443, "rewards/rejected": -0.3039725422859192, "step": 1008 }, { "epoch": 1.009, "grad_norm": 0.9852937459945679, "learning_rate": 1.23875e-06, "logits/chosen": 0.36246344447135925, "logits/rejected": 1.1050901412963867, "logps/chosen": -144.0105438232422, "logps/rejected": -208.290283203125, "loss": 0.3101, "rewards/accuracies": 1.0, "rewards/chosen": 0.05950794368982315, "rewards/margins": 1.1074018478393555, "rewards/rejected": -1.04789400100708, "step": 1009 }, { "epoch": 1.01, "grad_norm": 1.4431747198104858, "learning_rate": 1.2375e-06, "logits/chosen": 1.2092070579528809, "logits/rejected": 0.7301546335220337, "logps/chosen": -339.46856689453125, "logps/rejected": -184.56509399414062, "loss": 0.4274, "rewards/accuracies": 0.875, "rewards/chosen": -0.038115307688713074, "rewards/margins": 0.8480875492095947, "rewards/rejected": -0.8862028121948242, "step": 1010 }, { "epoch": 1.011, "grad_norm": 0.8606985807418823, "learning_rate": 1.23625e-06, "logits/chosen": 1.1336840391159058, "logits/rejected": 1.127260684967041, "logps/chosen": -223.29522705078125, "logps/rejected": -195.83969116210938, "loss": 0.3086, "rewards/accuracies": 1.0, "rewards/chosen": -0.09247326850891113, "rewards/margins": 1.0510461330413818, "rewards/rejected": -1.1435192823410034, "step": 1011 }, { "epoch": 1.012, "grad_norm": 1.2686916589736938, "learning_rate": 1.235e-06, "logits/chosen": 0.9561412930488586, "logits/rejected": 1.0730876922607422, "logps/chosen": -260.9944763183594, "logps/rejected": -309.11346435546875, "loss": 0.5471, "rewards/accuracies": 0.625, "rewards/chosen": -0.1698663830757141, "rewards/margins": 0.42312091588974, "rewards/rejected": -0.5929872989654541, "step": 1012 }, { "epoch": 1.013, "grad_norm": 1.4326456785202026, "learning_rate": 1.2337499999999998e-06, "logits/chosen": 1.3015788793563843, "logits/rejected": 0.8866193294525146, "logps/chosen": -375.75634765625, "logps/rejected": -174.13742065429688, "loss": 0.4885, "rewards/accuracies": 0.875, "rewards/chosen": -0.043810565024614334, "rewards/margins": 0.6309666037559509, "rewards/rejected": -0.6747771501541138, "step": 1013 }, { "epoch": 1.014, "grad_norm": 1.6068143844604492, "learning_rate": 1.2324999999999998e-06, "logits/chosen": 0.8934760689735413, "logits/rejected": 0.5236175060272217, "logps/chosen": -282.1280212402344, "logps/rejected": -153.14898681640625, "loss": 0.6331, "rewards/accuracies": 0.5, "rewards/chosen": -0.2706336975097656, "rewards/margins": 0.29185250401496887, "rewards/rejected": -0.5624861717224121, "step": 1014 }, { "epoch": 1.015, "grad_norm": 1.0821971893310547, "learning_rate": 1.2312499999999999e-06, "logits/chosen": 0.6443102359771729, "logits/rejected": 0.7390841245651245, "logps/chosen": -139.44293212890625, "logps/rejected": -197.09970092773438, "loss": 0.452, "rewards/accuracies": 0.75, "rewards/chosen": -0.1810988336801529, "rewards/margins": 0.7010268568992615, "rewards/rejected": -0.882125735282898, "step": 1015 }, { "epoch": 1.016, "grad_norm": 1.0709749460220337, "learning_rate": 1.2299999999999999e-06, "logits/chosen": 0.7744061946868896, "logits/rejected": 0.9022359848022461, "logps/chosen": -297.6886291503906, "logps/rejected": -185.7029266357422, "loss": 0.4763, "rewards/accuracies": 0.75, "rewards/chosen": -0.044371891766786575, "rewards/margins": 0.9763849973678589, "rewards/rejected": -1.0207568407058716, "step": 1016 }, { "epoch": 1.017, "grad_norm": 1.2688885927200317, "learning_rate": 1.2287499999999999e-06, "logits/chosen": 0.30310097336769104, "logits/rejected": 0.9831249713897705, "logps/chosen": -186.366455078125, "logps/rejected": -229.43304443359375, "loss": 0.5921, "rewards/accuracies": 0.75, "rewards/chosen": -0.25425228476524353, "rewards/margins": 0.41243094205856323, "rewards/rejected": -0.6666832566261292, "step": 1017 }, { "epoch": 1.018, "grad_norm": 1.6052571535110474, "learning_rate": 1.2275e-06, "logits/chosen": 0.9359822273254395, "logits/rejected": 0.7193039655685425, "logps/chosen": -288.30322265625, "logps/rejected": -219.5087127685547, "loss": 0.4741, "rewards/accuracies": 0.875, "rewards/chosen": -0.17703379690647125, "rewards/margins": 0.698630690574646, "rewards/rejected": -0.8756645917892456, "step": 1018 }, { "epoch": 1.019, "grad_norm": 1.4249718189239502, "learning_rate": 1.22625e-06, "logits/chosen": 0.44276726245880127, "logits/rejected": 0.7089403867721558, "logps/chosen": -184.72039794921875, "logps/rejected": -236.81649780273438, "loss": 0.5779, "rewards/accuracies": 0.75, "rewards/chosen": -0.10464948415756226, "rewards/margins": 0.47615668177604675, "rewards/rejected": -0.5808061361312866, "step": 1019 }, { "epoch": 1.02, "grad_norm": 0.9898892641067505, "learning_rate": 1.2250000000000001e-06, "logits/chosen": 1.074373483657837, "logits/rejected": 1.1765326261520386, "logps/chosen": -191.0301055908203, "logps/rejected": -258.2561950683594, "loss": 0.4461, "rewards/accuracies": 0.875, "rewards/chosen": 0.012541104108095169, "rewards/margins": 0.6954814791679382, "rewards/rejected": -0.6829403638839722, "step": 1020 }, { "epoch": 1.021, "grad_norm": 1.1500457525253296, "learning_rate": 1.22375e-06, "logits/chosen": 0.5531064867973328, "logits/rejected": 0.5930328369140625, "logps/chosen": -262.8097229003906, "logps/rejected": -185.23367309570312, "loss": 0.4918, "rewards/accuracies": 0.75, "rewards/chosen": -0.046300020068883896, "rewards/margins": 0.790861964225769, "rewards/rejected": -0.8371620178222656, "step": 1021 }, { "epoch": 1.022, "grad_norm": 1.2306188344955444, "learning_rate": 1.2225e-06, "logits/chosen": 0.3813488185405731, "logits/rejected": 1.0869981050491333, "logps/chosen": -152.94239807128906, "logps/rejected": -262.48675537109375, "loss": 0.3741, "rewards/accuracies": 0.875, "rewards/chosen": -0.02008362114429474, "rewards/margins": 1.0062625408172607, "rewards/rejected": -1.026346206665039, "step": 1022 }, { "epoch": 1.023, "grad_norm": 1.5712593793869019, "learning_rate": 1.22125e-06, "logits/chosen": -0.20038634538650513, "logits/rejected": 0.38598933815956116, "logps/chosen": -140.86083984375, "logps/rejected": -226.7571563720703, "loss": 0.6233, "rewards/accuracies": 0.625, "rewards/chosen": -0.3247276544570923, "rewards/margins": 0.5089502334594727, "rewards/rejected": -0.8336778879165649, "step": 1023 }, { "epoch": 1.024, "grad_norm": 1.121610403060913, "learning_rate": 1.22e-06, "logits/chosen": 0.9464428424835205, "logits/rejected": 0.9137997627258301, "logps/chosen": -417.5994873046875, "logps/rejected": -204.03636169433594, "loss": 0.307, "rewards/accuracies": 0.875, "rewards/chosen": 0.29073333740234375, "rewards/margins": 1.2137489318847656, "rewards/rejected": -0.9230156540870667, "step": 1024 }, { "epoch": 1.025, "grad_norm": 1.5920467376708984, "learning_rate": 1.21875e-06, "logits/chosen": 0.49962323904037476, "logits/rejected": 0.7981727123260498, "logps/chosen": -234.12655639648438, "logps/rejected": -162.52157592773438, "loss": 0.5877, "rewards/accuracies": 0.75, "rewards/chosen": -0.12817354500293732, "rewards/margins": 0.37604275345802307, "rewards/rejected": -0.5042163133621216, "step": 1025 }, { "epoch": 1.026, "grad_norm": 1.674794316291809, "learning_rate": 1.2175e-06, "logits/chosen": 0.48400264978408813, "logits/rejected": 0.6010373830795288, "logps/chosen": -224.8567657470703, "logps/rejected": -219.0921173095703, "loss": 0.6513, "rewards/accuracies": 0.75, "rewards/chosen": -0.2069208174943924, "rewards/margins": 0.3207903802394867, "rewards/rejected": -0.5277112126350403, "step": 1026 }, { "epoch": 1.027, "grad_norm": 1.9772342443466187, "learning_rate": 1.21625e-06, "logits/chosen": 0.7715440988540649, "logits/rejected": 0.859468936920166, "logps/chosen": -266.03350830078125, "logps/rejected": -251.89498901367188, "loss": 0.5551, "rewards/accuracies": 0.75, "rewards/chosen": -0.17975303530693054, "rewards/margins": 0.6425652503967285, "rewards/rejected": -0.8223183155059814, "step": 1027 }, { "epoch": 1.028, "grad_norm": 1.3080830574035645, "learning_rate": 1.215e-06, "logits/chosen": 0.9456852078437805, "logits/rejected": 1.1051018238067627, "logps/chosen": -215.61070251464844, "logps/rejected": -232.41842651367188, "loss": 0.5195, "rewards/accuracies": 0.875, "rewards/chosen": -0.28099575638771057, "rewards/margins": 0.5053824186325073, "rewards/rejected": -0.7863782048225403, "step": 1028 }, { "epoch": 1.029, "grad_norm": 1.291466236114502, "learning_rate": 1.21375e-06, "logits/chosen": 0.6467471718788147, "logits/rejected": 0.6808027029037476, "logps/chosen": -166.5665740966797, "logps/rejected": -168.3721466064453, "loss": 0.5375, "rewards/accuracies": 0.625, "rewards/chosen": -0.275396466255188, "rewards/margins": 0.5660582184791565, "rewards/rejected": -0.8414546251296997, "step": 1029 }, { "epoch": 1.03, "grad_norm": 1.1935462951660156, "learning_rate": 1.2124999999999998e-06, "logits/chosen": 1.1221253871917725, "logits/rejected": 1.10881507396698, "logps/chosen": -229.43966674804688, "logps/rejected": -219.1280517578125, "loss": 0.3191, "rewards/accuracies": 1.0, "rewards/chosen": 0.10928144305944443, "rewards/margins": 1.05849027633667, "rewards/rejected": -0.9492088556289673, "step": 1030 }, { "epoch": 1.031, "grad_norm": 1.3884462118148804, "learning_rate": 1.2112499999999998e-06, "logits/chosen": 0.8629527688026428, "logits/rejected": 0.5470641255378723, "logps/chosen": -261.2013854980469, "logps/rejected": -189.04351806640625, "loss": 0.5471, "rewards/accuracies": 0.5, "rewards/chosen": -0.3480491638183594, "rewards/margins": 0.6049649119377136, "rewards/rejected": -0.9530141353607178, "step": 1031 }, { "epoch": 1.032, "grad_norm": 1.2852572202682495, "learning_rate": 1.2099999999999998e-06, "logits/chosen": 0.2972247004508972, "logits/rejected": 0.8584858775138855, "logps/chosen": -169.68971252441406, "logps/rejected": -251.492431640625, "loss": 0.3365, "rewards/accuracies": 1.0, "rewards/chosen": 0.18015289306640625, "rewards/margins": 1.1124401092529297, "rewards/rejected": -0.9322872161865234, "step": 1032 }, { "epoch": 1.033, "grad_norm": 1.5334596633911133, "learning_rate": 1.2087499999999999e-06, "logits/chosen": 0.5789185762405396, "logits/rejected": 1.1119245290756226, "logps/chosen": -153.96839904785156, "logps/rejected": -216.0934295654297, "loss": 0.6734, "rewards/accuracies": 0.625, "rewards/chosen": -0.5394551157951355, "rewards/margins": 0.33356374502182007, "rewards/rejected": -0.8730188608169556, "step": 1033 }, { "epoch": 1.034, "grad_norm": 1.3440935611724854, "learning_rate": 1.2074999999999999e-06, "logits/chosen": 0.8939976692199707, "logits/rejected": 0.04872910678386688, "logps/chosen": -263.998046875, "logps/rejected": -155.1846160888672, "loss": 0.4987, "rewards/accuracies": 0.75, "rewards/chosen": -0.26099300384521484, "rewards/margins": 0.7460137605667114, "rewards/rejected": -1.0070067644119263, "step": 1034 }, { "epoch": 1.035, "grad_norm": 1.33452308177948, "learning_rate": 1.2062499999999999e-06, "logits/chosen": 0.20393891632556915, "logits/rejected": 0.7178255319595337, "logps/chosen": -145.98939514160156, "logps/rejected": -244.4403076171875, "loss": 0.6585, "rewards/accuracies": 0.75, "rewards/chosen": -0.5479766130447388, "rewards/margins": 0.2862977981567383, "rewards/rejected": -0.8342743515968323, "step": 1035 }, { "epoch": 1.036, "grad_norm": 1.3413779735565186, "learning_rate": 1.2050000000000001e-06, "logits/chosen": 0.9137049913406372, "logits/rejected": 0.6589609980583191, "logps/chosen": -199.02227783203125, "logps/rejected": -254.9770965576172, "loss": 0.3693, "rewards/accuracies": 0.875, "rewards/chosen": 0.05199404060840607, "rewards/margins": 1.0230010747909546, "rewards/rejected": -0.9710071086883545, "step": 1036 }, { "epoch": 1.037, "grad_norm": 1.7255675792694092, "learning_rate": 1.2037500000000001e-06, "logits/chosen": 0.5892699956893921, "logits/rejected": 0.9785710573196411, "logps/chosen": -159.28353881835938, "logps/rejected": -252.113525390625, "loss": 0.6402, "rewards/accuracies": 0.625, "rewards/chosen": -0.1049380749464035, "rewards/margins": 0.4371607005596161, "rewards/rejected": -0.5420988202095032, "step": 1037 }, { "epoch": 1.038, "grad_norm": 1.0218048095703125, "learning_rate": 1.2025e-06, "logits/chosen": 0.7351108193397522, "logits/rejected": 0.6420285105705261, "logps/chosen": -217.7957763671875, "logps/rejected": -248.04916381835938, "loss": 0.3704, "rewards/accuracies": 0.875, "rewards/chosen": -0.0008803308010101318, "rewards/margins": 1.0840322971343994, "rewards/rejected": -1.0849125385284424, "step": 1038 }, { "epoch": 1.039, "grad_norm": 1.489680290222168, "learning_rate": 1.20125e-06, "logits/chosen": 0.6974009275436401, "logits/rejected": 1.2487118244171143, "logps/chosen": -140.03518676757812, "logps/rejected": -222.8593292236328, "loss": 0.5541, "rewards/accuracies": 0.625, "rewards/chosen": -0.1678999811410904, "rewards/margins": 0.6254744529724121, "rewards/rejected": -0.7933744192123413, "step": 1039 }, { "epoch": 1.04, "grad_norm": 1.246228575706482, "learning_rate": 1.2e-06, "logits/chosen": 1.1675537824630737, "logits/rejected": 0.8723880052566528, "logps/chosen": -471.5497741699219, "logps/rejected": -222.30630493164062, "loss": 0.5364, "rewards/accuracies": 0.875, "rewards/chosen": -0.056923285126686096, "rewards/margins": 0.564597487449646, "rewards/rejected": -0.6215207576751709, "step": 1040 }, { "epoch": 1.041, "grad_norm": 1.764837622642517, "learning_rate": 1.19875e-06, "logits/chosen": 1.096501111984253, "logits/rejected": 0.5684105157852173, "logps/chosen": -317.93994140625, "logps/rejected": -163.37350463867188, "loss": 0.4939, "rewards/accuracies": 0.625, "rewards/chosen": -0.06205329671502113, "rewards/margins": 0.7630100250244141, "rewards/rejected": -0.8250633478164673, "step": 1041 }, { "epoch": 1.042, "grad_norm": 1.2062294483184814, "learning_rate": 1.1975e-06, "logits/chosen": 0.9559551477432251, "logits/rejected": 0.27995961904525757, "logps/chosen": -258.7989196777344, "logps/rejected": -161.69374084472656, "loss": 0.3987, "rewards/accuracies": 0.875, "rewards/chosen": -0.01801776885986328, "rewards/margins": 0.8994569778442383, "rewards/rejected": -0.9174748063087463, "step": 1042 }, { "epoch": 1.043, "grad_norm": 2.2232484817504883, "learning_rate": 1.19625e-06, "logits/chosen": 0.4100378155708313, "logits/rejected": 0.6422800421714783, "logps/chosen": -210.22723388671875, "logps/rejected": -214.36492919921875, "loss": 0.8846, "rewards/accuracies": 0.375, "rewards/chosen": -0.567203164100647, "rewards/margins": -0.16810572147369385, "rewards/rejected": -0.3990974426269531, "step": 1043 }, { "epoch": 1.044, "grad_norm": 1.4242802858352661, "learning_rate": 1.195e-06, "logits/chosen": 0.42782023549079895, "logits/rejected": 1.4739673137664795, "logps/chosen": -215.66741943359375, "logps/rejected": -254.76882934570312, "loss": 0.5758, "rewards/accuracies": 0.75, "rewards/chosen": -0.2799696922302246, "rewards/margins": 0.4758256673812866, "rewards/rejected": -0.7557953596115112, "step": 1044 }, { "epoch": 1.045, "grad_norm": 1.4046063423156738, "learning_rate": 1.19375e-06, "logits/chosen": 0.8449044823646545, "logits/rejected": 1.3001391887664795, "logps/chosen": -330.3206481933594, "logps/rejected": -260.4854736328125, "loss": 0.5574, "rewards/accuracies": 0.625, "rewards/chosen": -0.2523442506790161, "rewards/margins": 0.591307520866394, "rewards/rejected": -0.8436517715454102, "step": 1045 }, { "epoch": 1.046, "grad_norm": 1.3741497993469238, "learning_rate": 1.1924999999999998e-06, "logits/chosen": 0.43720078468322754, "logits/rejected": 0.9628201127052307, "logps/chosen": -158.84580993652344, "logps/rejected": -284.7048034667969, "loss": 0.4078, "rewards/accuracies": 0.875, "rewards/chosen": -0.02874155342578888, "rewards/margins": 0.7973895072937012, "rewards/rejected": -0.8261311054229736, "step": 1046 }, { "epoch": 1.047, "grad_norm": 1.871039867401123, "learning_rate": 1.1912499999999998e-06, "logits/chosen": 0.9322543144226074, "logits/rejected": 0.1503409892320633, "logps/chosen": -257.8082275390625, "logps/rejected": -150.54640197753906, "loss": 0.695, "rewards/accuracies": 0.5, "rewards/chosen": -0.40364035964012146, "rewards/margins": 0.16741791367530823, "rewards/rejected": -0.5710582733154297, "step": 1047 }, { "epoch": 1.048, "grad_norm": 2.0114176273345947, "learning_rate": 1.1899999999999998e-06, "logits/chosen": 0.8303149938583374, "logits/rejected": 0.7304349541664124, "logps/chosen": -198.070556640625, "logps/rejected": -268.3736572265625, "loss": 1.0894, "rewards/accuracies": 0.375, "rewards/chosen": -0.7167526483535767, "rewards/margins": -0.29435622692108154, "rewards/rejected": -0.42239639163017273, "step": 1048 }, { "epoch": 1.049, "grad_norm": 1.6145424842834473, "learning_rate": 1.1887499999999998e-06, "logits/chosen": 0.9632750749588013, "logits/rejected": 0.9119467735290527, "logps/chosen": -263.5657043457031, "logps/rejected": -192.61900329589844, "loss": 0.5892, "rewards/accuracies": 0.625, "rewards/chosen": -0.305825799703598, "rewards/margins": 0.39754652976989746, "rewards/rejected": -0.7033722400665283, "step": 1049 }, { "epoch": 1.05, "grad_norm": 1.0652143955230713, "learning_rate": 1.1874999999999999e-06, "logits/chosen": 0.5500315427780151, "logits/rejected": 0.7710057497024536, "logps/chosen": -258.5954284667969, "logps/rejected": -197.56365966796875, "loss": 0.4191, "rewards/accuracies": 1.0, "rewards/chosen": -0.10547023266553879, "rewards/margins": 0.7147519588470459, "rewards/rejected": -0.8202222585678101, "step": 1050 }, { "epoch": 1.051, "grad_norm": 1.6211962699890137, "learning_rate": 1.18625e-06, "logits/chosen": 1.243082880973816, "logits/rejected": 1.1378010511398315, "logps/chosen": -220.72708129882812, "logps/rejected": -238.22103881835938, "loss": 0.7088, "rewards/accuracies": 0.625, "rewards/chosen": -0.43719664216041565, "rewards/margins": 0.15515337884426117, "rewards/rejected": -0.5923500061035156, "step": 1051 }, { "epoch": 1.052, "grad_norm": 1.3929615020751953, "learning_rate": 1.185e-06, "logits/chosen": 1.0455257892608643, "logits/rejected": 0.6312541961669922, "logps/chosen": -317.8899230957031, "logps/rejected": -199.69534301757812, "loss": 0.6585, "rewards/accuracies": 0.625, "rewards/chosen": -0.14557552337646484, "rewards/margins": 0.3713876008987427, "rewards/rejected": -0.5169631242752075, "step": 1052 }, { "epoch": 1.053, "grad_norm": 1.966210961341858, "learning_rate": 1.18375e-06, "logits/chosen": 0.39819881319999695, "logits/rejected": 0.7219215631484985, "logps/chosen": -198.62940979003906, "logps/rejected": -234.83514404296875, "loss": 0.6004, "rewards/accuracies": 0.625, "rewards/chosen": -0.2598787248134613, "rewards/margins": 0.5776638984680176, "rewards/rejected": -0.8375426530838013, "step": 1053 }, { "epoch": 1.054, "grad_norm": 1.586686611175537, "learning_rate": 1.1825000000000001e-06, "logits/chosen": 0.8924707174301147, "logits/rejected": 0.553135097026825, "logps/chosen": -223.2870635986328, "logps/rejected": -185.1249237060547, "loss": 0.6815, "rewards/accuracies": 0.75, "rewards/chosen": -0.3099657893180847, "rewards/margins": 0.639003336429596, "rewards/rejected": -0.9489691257476807, "step": 1054 }, { "epoch": 1.055, "grad_norm": 1.2241368293762207, "learning_rate": 1.18125e-06, "logits/chosen": 0.8566513657569885, "logits/rejected": 0.6361480951309204, "logps/chosen": -286.13104248046875, "logps/rejected": -238.57376098632812, "loss": 0.3425, "rewards/accuracies": 0.875, "rewards/chosen": 0.13264991343021393, "rewards/margins": 1.065665602684021, "rewards/rejected": -0.9330156445503235, "step": 1055 }, { "epoch": 1.056, "grad_norm": 2.150078296661377, "learning_rate": 1.18e-06, "logits/chosen": 0.5131460428237915, "logits/rejected": 0.9936041235923767, "logps/chosen": -146.08587646484375, "logps/rejected": -300.8660583496094, "loss": 0.8547, "rewards/accuracies": 0.5, "rewards/chosen": -0.208792582154274, "rewards/margins": -0.18616315722465515, "rewards/rejected": -0.02262945845723152, "step": 1056 }, { "epoch": 1.057, "grad_norm": 0.9462231397628784, "learning_rate": 1.17875e-06, "logits/chosen": 0.66650390625, "logits/rejected": 0.7077014446258545, "logps/chosen": -224.23007202148438, "logps/rejected": -168.31619262695312, "loss": 0.3669, "rewards/accuracies": 0.875, "rewards/chosen": 0.1922370046377182, "rewards/margins": 0.981205940246582, "rewards/rejected": -0.7889689803123474, "step": 1057 }, { "epoch": 1.058, "grad_norm": 1.5849369764328003, "learning_rate": 1.1775e-06, "logits/chosen": 1.1604421138763428, "logits/rejected": 0.7599620819091797, "logps/chosen": -250.34072875976562, "logps/rejected": -190.21791076660156, "loss": 0.7569, "rewards/accuracies": 0.625, "rewards/chosen": -0.26249369978904724, "rewards/margins": 0.19449618458747864, "rewards/rejected": -0.4569898843765259, "step": 1058 }, { "epoch": 1.059, "grad_norm": 1.1644287109375, "learning_rate": 1.17625e-06, "logits/chosen": 0.18749046325683594, "logits/rejected": 0.6283838152885437, "logps/chosen": -110.58757019042969, "logps/rejected": -182.1555938720703, "loss": 0.476, "rewards/accuracies": 0.75, "rewards/chosen": 0.004129219800233841, "rewards/margins": 0.8577073812484741, "rewards/rejected": -0.8535781502723694, "step": 1059 }, { "epoch": 1.06, "grad_norm": 1.9628432989120483, "learning_rate": 1.175e-06, "logits/chosen": 1.6400701999664307, "logits/rejected": 0.34803879261016846, "logps/chosen": -427.09716796875, "logps/rejected": -188.1736297607422, "loss": 0.6767, "rewards/accuracies": 0.75, "rewards/chosen": -0.46947401762008667, "rewards/margins": 0.26506349444389343, "rewards/rejected": -0.7345375418663025, "step": 1060 }, { "epoch": 1.061, "grad_norm": 1.4998210668563843, "learning_rate": 1.17375e-06, "logits/chosen": 0.7550729513168335, "logits/rejected": 0.5409754514694214, "logps/chosen": -212.89865112304688, "logps/rejected": -157.24993896484375, "loss": 0.5112, "rewards/accuracies": 0.75, "rewards/chosen": -0.10782729834318161, "rewards/margins": 0.7822786569595337, "rewards/rejected": -0.8901059627532959, "step": 1061 }, { "epoch": 1.062, "grad_norm": 1.7916253805160522, "learning_rate": 1.1725e-06, "logits/chosen": 0.7735161185264587, "logits/rejected": 0.33337345719337463, "logps/chosen": -263.3514709472656, "logps/rejected": -173.93405151367188, "loss": 0.5718, "rewards/accuracies": 0.75, "rewards/chosen": -0.25982651114463806, "rewards/margins": 0.5810016989707947, "rewards/rejected": -0.8408282399177551, "step": 1062 }, { "epoch": 1.063, "grad_norm": 1.2148547172546387, "learning_rate": 1.1712499999999998e-06, "logits/chosen": 0.7255082130432129, "logits/rejected": 0.401000440120697, "logps/chosen": -203.92019653320312, "logps/rejected": -142.7261505126953, "loss": 0.4872, "rewards/accuracies": 0.75, "rewards/chosen": -0.027907192707061768, "rewards/margins": 0.7898129224777222, "rewards/rejected": -0.8177201747894287, "step": 1063 }, { "epoch": 1.064, "grad_norm": 1.463847279548645, "learning_rate": 1.1699999999999998e-06, "logits/chosen": 0.6128353476524353, "logits/rejected": 0.8738833665847778, "logps/chosen": -134.44540405273438, "logps/rejected": -213.28439331054688, "loss": 0.4535, "rewards/accuracies": 0.625, "rewards/chosen": -0.10045662522315979, "rewards/margins": 0.8023175597190857, "rewards/rejected": -0.9027742147445679, "step": 1064 }, { "epoch": 1.065, "grad_norm": 1.3696305751800537, "learning_rate": 1.1687499999999998e-06, "logits/chosen": 0.8294633626937866, "logits/rejected": 0.47317013144493103, "logps/chosen": -208.48532104492188, "logps/rejected": -173.96127319335938, "loss": 0.698, "rewards/accuracies": 0.75, "rewards/chosen": -0.30830568075180054, "rewards/margins": 0.3084789514541626, "rewards/rejected": -0.6167846918106079, "step": 1065 }, { "epoch": 1.066, "grad_norm": 1.2028043270111084, "learning_rate": 1.1674999999999998e-06, "logits/chosen": 0.6466650366783142, "logits/rejected": 0.9048054814338684, "logps/chosen": -222.79556274414062, "logps/rejected": -262.6460266113281, "loss": 0.427, "rewards/accuracies": 0.75, "rewards/chosen": 0.1923464834690094, "rewards/margins": 0.8879067897796631, "rewards/rejected": -0.6955603361129761, "step": 1066 }, { "epoch": 1.067, "grad_norm": 1.2323628664016724, "learning_rate": 1.16625e-06, "logits/chosen": 0.5930283069610596, "logits/rejected": 0.968631386756897, "logps/chosen": -125.14126586914062, "logps/rejected": -215.1036376953125, "loss": 0.5002, "rewards/accuracies": 0.75, "rewards/chosen": -0.10139305889606476, "rewards/margins": 0.8169814348220825, "rewards/rejected": -0.9183744192123413, "step": 1067 }, { "epoch": 1.068, "grad_norm": 1.341876745223999, "learning_rate": 1.165e-06, "logits/chosen": 0.7137465476989746, "logits/rejected": 0.789882242679596, "logps/chosen": -262.8941345214844, "logps/rejected": -230.26596069335938, "loss": 0.5218, "rewards/accuracies": 0.625, "rewards/chosen": -0.34485089778900146, "rewards/margins": 0.6294686794281006, "rewards/rejected": -0.9743196964263916, "step": 1068 }, { "epoch": 1.069, "grad_norm": 2.060394048690796, "learning_rate": 1.16375e-06, "logits/chosen": 0.8392796516418457, "logits/rejected": 0.24773874878883362, "logps/chosen": -434.873291015625, "logps/rejected": -146.82415771484375, "loss": 0.5647, "rewards/accuracies": 0.5, "rewards/chosen": -0.3918878436088562, "rewards/margins": 0.4205797016620636, "rewards/rejected": -0.8124675750732422, "step": 1069 }, { "epoch": 1.07, "grad_norm": 1.1268436908721924, "learning_rate": 1.1625e-06, "logits/chosen": 0.8824154138565063, "logits/rejected": 0.9947584867477417, "logps/chosen": -356.79437255859375, "logps/rejected": -227.9666748046875, "loss": 0.3799, "rewards/accuracies": 0.875, "rewards/chosen": -0.11792895942926407, "rewards/margins": 0.9433798789978027, "rewards/rejected": -1.0613088607788086, "step": 1070 }, { "epoch": 1.071, "grad_norm": 1.4960728883743286, "learning_rate": 1.1612499999999999e-06, "logits/chosen": 0.5874254107475281, "logits/rejected": 0.7951304316520691, "logps/chosen": -169.6992950439453, "logps/rejected": -218.56349182128906, "loss": 0.696, "rewards/accuracies": 0.625, "rewards/chosen": -0.3156511187553406, "rewards/margins": 0.25845301151275635, "rewards/rejected": -0.5741041302680969, "step": 1071 }, { "epoch": 1.072, "grad_norm": 1.6319154500961304, "learning_rate": 1.16e-06, "logits/chosen": 0.3545195758342743, "logits/rejected": 0.6300568580627441, "logps/chosen": -197.50457763671875, "logps/rejected": -193.251220703125, "loss": 0.7066, "rewards/accuracies": 0.625, "rewards/chosen": -0.40317127108573914, "rewards/margins": 0.2002461701631546, "rewards/rejected": -0.6034173965454102, "step": 1072 }, { "epoch": 1.073, "grad_norm": 1.018089771270752, "learning_rate": 1.15875e-06, "logits/chosen": 0.5919352769851685, "logits/rejected": 0.15588748455047607, "logps/chosen": -200.77789306640625, "logps/rejected": -217.8507080078125, "loss": 0.4253, "rewards/accuracies": 0.625, "rewards/chosen": 0.08213987201452255, "rewards/margins": 0.8214713335037231, "rewards/rejected": -0.7393314242362976, "step": 1073 }, { "epoch": 1.074, "grad_norm": 1.0822713375091553, "learning_rate": 1.1575e-06, "logits/chosen": 0.8367973566055298, "logits/rejected": 0.6041265726089478, "logps/chosen": -172.1947021484375, "logps/rejected": -178.72265625, "loss": 0.2722, "rewards/accuracies": 1.0, "rewards/chosen": 0.25145214796066284, "rewards/margins": 1.2138046026229858, "rewards/rejected": -0.9623525142669678, "step": 1074 }, { "epoch": 1.075, "grad_norm": 1.590942621231079, "learning_rate": 1.15625e-06, "logits/chosen": 0.43090513348579407, "logits/rejected": 1.0441017150878906, "logps/chosen": -172.2742919921875, "logps/rejected": -195.00570678710938, "loss": 0.7905, "rewards/accuracies": 0.625, "rewards/chosen": -0.5376498103141785, "rewards/margins": 0.13950759172439575, "rewards/rejected": -0.6771574020385742, "step": 1075 }, { "epoch": 1.076, "grad_norm": 0.7435200810432434, "learning_rate": 1.155e-06, "logits/chosen": 1.086047887802124, "logits/rejected": 0.5352864265441895, "logps/chosen": -194.33627319335938, "logps/rejected": -203.8048095703125, "loss": 0.2789, "rewards/accuracies": 1.0, "rewards/chosen": 0.11628953367471695, "rewards/margins": 1.233609676361084, "rewards/rejected": -1.11732017993927, "step": 1076 }, { "epoch": 1.077, "grad_norm": 1.100590467453003, "learning_rate": 1.15375e-06, "logits/chosen": 0.8932472467422485, "logits/rejected": 0.7338306903839111, "logps/chosen": -184.5355987548828, "logps/rejected": -181.34515380859375, "loss": 0.4482, "rewards/accuracies": 0.875, "rewards/chosen": -0.07399202883243561, "rewards/margins": 0.8858978152275085, "rewards/rejected": -0.9598897695541382, "step": 1077 }, { "epoch": 1.078, "grad_norm": 3.771437168121338, "learning_rate": 1.1525e-06, "logits/chosen": 1.2361574172973633, "logits/rejected": 0.5542974472045898, "logps/chosen": -390.75775146484375, "logps/rejected": -157.173583984375, "loss": 0.9079, "rewards/accuracies": 0.25, "rewards/chosen": -0.8103100061416626, "rewards/margins": -0.1111581027507782, "rewards/rejected": -0.6991519331932068, "step": 1078 }, { "epoch": 1.079, "grad_norm": 1.325682520866394, "learning_rate": 1.15125e-06, "logits/chosen": 0.9627928733825684, "logits/rejected": 1.2728445529937744, "logps/chosen": -174.35635375976562, "logps/rejected": -212.415771484375, "loss": 0.5682, "rewards/accuracies": 0.625, "rewards/chosen": -0.27015143632888794, "rewards/margins": 0.44442448019981384, "rewards/rejected": -0.7145759463310242, "step": 1079 }, { "epoch": 1.08, "grad_norm": 0.9819089770317078, "learning_rate": 1.1499999999999998e-06, "logits/chosen": 0.39152103662490845, "logits/rejected": 0.7019654512405396, "logps/chosen": -167.7852020263672, "logps/rejected": -192.74472045898438, "loss": 0.3164, "rewards/accuracies": 1.0, "rewards/chosen": 0.09183908253908157, "rewards/margins": 1.0447165966033936, "rewards/rejected": -0.9528775215148926, "step": 1080 }, { "epoch": 1.081, "grad_norm": 2.271350145339966, "learning_rate": 1.1487499999999998e-06, "logits/chosen": 0.7774705290794373, "logits/rejected": 0.6392337083816528, "logps/chosen": -230.27069091796875, "logps/rejected": -214.4527130126953, "loss": 0.7344, "rewards/accuracies": 0.5, "rewards/chosen": -0.5656520128250122, "rewards/margins": 0.20788070559501648, "rewards/rejected": -0.7735326886177063, "step": 1081 }, { "epoch": 1.082, "grad_norm": 1.324945092201233, "learning_rate": 1.1474999999999998e-06, "logits/chosen": 0.494367778301239, "logits/rejected": 0.8607401251792908, "logps/chosen": -179.21656799316406, "logps/rejected": -217.66754150390625, "loss": 0.5764, "rewards/accuracies": 0.75, "rewards/chosen": -0.2803231179714203, "rewards/margins": 0.44169628620147705, "rewards/rejected": -0.7220194339752197, "step": 1082 }, { "epoch": 1.083, "grad_norm": 1.5852605104446411, "learning_rate": 1.14625e-06, "logits/chosen": 1.2392520904541016, "logits/rejected": 0.5193478465080261, "logps/chosen": -257.0646057128906, "logps/rejected": -208.13778686523438, "loss": 0.6282, "rewards/accuracies": 0.75, "rewards/chosen": -0.2590755522251129, "rewards/margins": 0.37974613904953003, "rewards/rejected": -0.6388217210769653, "step": 1083 }, { "epoch": 1.084, "grad_norm": 2.4663286209106445, "learning_rate": 1.145e-06, "logits/chosen": 1.191383957862854, "logits/rejected": 0.3916516900062561, "logps/chosen": -261.8016357421875, "logps/rejected": -118.86978149414062, "loss": 0.7767, "rewards/accuracies": 0.5, "rewards/chosen": -0.4422111511230469, "rewards/margins": 0.1301049143075943, "rewards/rejected": -0.5723161101341248, "step": 1084 }, { "epoch": 1.085, "grad_norm": 1.735673427581787, "learning_rate": 1.14375e-06, "logits/chosen": 1.1631953716278076, "logits/rejected": 0.3769620358943939, "logps/chosen": -295.7972412109375, "logps/rejected": -172.2859344482422, "loss": 0.6263, "rewards/accuracies": 0.625, "rewards/chosen": -0.29886573553085327, "rewards/margins": 0.3692222833633423, "rewards/rejected": -0.6680880188941956, "step": 1085 }, { "epoch": 1.086, "grad_norm": 2.9825551509857178, "learning_rate": 1.1425e-06, "logits/chosen": 0.9798634648323059, "logits/rejected": 0.7519315481185913, "logps/chosen": -310.903564453125, "logps/rejected": -187.55319213867188, "loss": 0.8269, "rewards/accuracies": 0.625, "rewards/chosen": -0.7593337297439575, "rewards/margins": -0.05762898921966553, "rewards/rejected": -0.701704740524292, "step": 1086 }, { "epoch": 1.087, "grad_norm": 1.5980007648468018, "learning_rate": 1.14125e-06, "logits/chosen": 0.977048397064209, "logits/rejected": 0.26338568329811096, "logps/chosen": -197.33143615722656, "logps/rejected": -173.58216857910156, "loss": 0.6263, "rewards/accuracies": 0.625, "rewards/chosen": -0.3314272463321686, "rewards/margins": 0.35472723841667175, "rewards/rejected": -0.6861544847488403, "step": 1087 }, { "epoch": 1.088, "grad_norm": 2.690153121948242, "learning_rate": 1.1399999999999999e-06, "logits/chosen": 0.5918645262718201, "logits/rejected": 0.864393413066864, "logps/chosen": -187.012939453125, "logps/rejected": -227.45333862304688, "loss": 0.875, "rewards/accuracies": 0.5, "rewards/chosen": -0.658757209777832, "rewards/margins": -0.001976191997528076, "rewards/rejected": -0.6567809581756592, "step": 1088 }, { "epoch": 1.089, "grad_norm": 1.3746803998947144, "learning_rate": 1.13875e-06, "logits/chosen": 0.6988212466239929, "logits/rejected": 0.70427405834198, "logps/chosen": -176.59405517578125, "logps/rejected": -192.03668212890625, "loss": 0.663, "rewards/accuracies": 0.625, "rewards/chosen": -0.3181685507297516, "rewards/margins": 0.21887600421905518, "rewards/rejected": -0.5370445251464844, "step": 1089 }, { "epoch": 1.09, "grad_norm": 1.479427695274353, "learning_rate": 1.1375e-06, "logits/chosen": 0.4700494706630707, "logits/rejected": 0.46111008524894714, "logps/chosen": -135.201171875, "logps/rejected": -193.24462890625, "loss": 0.7778, "rewards/accuracies": 0.75, "rewards/chosen": -0.3239273428916931, "rewards/margins": 0.07141757011413574, "rewards/rejected": -0.39534494280815125, "step": 1090 }, { "epoch": 1.091, "grad_norm": 1.225520372390747, "learning_rate": 1.13625e-06, "logits/chosen": 0.8019444346427917, "logits/rejected": 0.29261285066604614, "logps/chosen": -234.15554809570312, "logps/rejected": -168.52403259277344, "loss": 0.4059, "rewards/accuracies": 0.75, "rewards/chosen": -0.004856396466493607, "rewards/margins": 0.9099271297454834, "rewards/rejected": -0.9147835373878479, "step": 1091 }, { "epoch": 1.092, "grad_norm": 1.199183702468872, "learning_rate": 1.135e-06, "logits/chosen": 0.8154212832450867, "logits/rejected": 0.9412962198257446, "logps/chosen": -223.2179412841797, "logps/rejected": -271.39642333984375, "loss": 0.6425, "rewards/accuracies": 0.625, "rewards/chosen": -0.2595347464084625, "rewards/margins": 0.48310184478759766, "rewards/rejected": -0.7426365613937378, "step": 1092 }, { "epoch": 1.093, "grad_norm": 1.7391233444213867, "learning_rate": 1.13375e-06, "logits/chosen": 0.7234534621238708, "logits/rejected": 1.0009876489639282, "logps/chosen": -185.6785125732422, "logps/rejected": -269.36590576171875, "loss": 0.5546, "rewards/accuracies": 0.75, "rewards/chosen": -0.3853538930416107, "rewards/margins": 0.4349015951156616, "rewards/rejected": -0.8202555179595947, "step": 1093 }, { "epoch": 1.094, "grad_norm": 1.5625464916229248, "learning_rate": 1.1325e-06, "logits/chosen": 0.35373741388320923, "logits/rejected": 0.9151265621185303, "logps/chosen": -179.1125030517578, "logps/rejected": -230.00961303710938, "loss": 0.6464, "rewards/accuracies": 0.625, "rewards/chosen": -0.33600887656211853, "rewards/margins": 0.27071917057037354, "rewards/rejected": -0.6067280769348145, "step": 1094 }, { "epoch": 1.095, "grad_norm": 1.0353262424468994, "learning_rate": 1.13125e-06, "logits/chosen": 0.35901015996932983, "logits/rejected": 0.4311184287071228, "logps/chosen": -191.37640380859375, "logps/rejected": -216.61740112304688, "loss": 0.3896, "rewards/accuracies": 1.0, "rewards/chosen": -0.04644102603197098, "rewards/margins": 0.8078058958053589, "rewards/rejected": -0.8542469143867493, "step": 1095 }, { "epoch": 1.096, "grad_norm": 1.727476716041565, "learning_rate": 1.1299999999999998e-06, "logits/chosen": 0.41564828157424927, "logits/rejected": 0.3368123769760132, "logps/chosen": -128.01998901367188, "logps/rejected": -164.85491943359375, "loss": 0.6551, "rewards/accuracies": 0.625, "rewards/chosen": -0.12670336663722992, "rewards/margins": 0.30986541509628296, "rewards/rejected": -0.4365687668323517, "step": 1096 }, { "epoch": 1.097, "grad_norm": 1.2551544904708862, "learning_rate": 1.1287499999999998e-06, "logits/chosen": 1.0221658945083618, "logits/rejected": 0.5163205862045288, "logps/chosen": -349.26800537109375, "logps/rejected": -217.63885498046875, "loss": 0.6088, "rewards/accuracies": 0.75, "rewards/chosen": 0.05102883279323578, "rewards/margins": 0.6222374439239502, "rewards/rejected": -0.5712085962295532, "step": 1097 }, { "epoch": 1.098, "grad_norm": 1.4929406642913818, "learning_rate": 1.1274999999999998e-06, "logits/chosen": 0.8447607755661011, "logits/rejected": 0.7050151228904724, "logps/chosen": -185.02883911132812, "logps/rejected": -228.28939819335938, "loss": 0.3284, "rewards/accuracies": 0.875, "rewards/chosen": 0.1388016641139984, "rewards/margins": 1.1171083450317383, "rewards/rejected": -0.9783066511154175, "step": 1098 }, { "epoch": 1.099, "grad_norm": 1.2072798013687134, "learning_rate": 1.12625e-06, "logits/chosen": 1.1278948783874512, "logits/rejected": 0.6549420356750488, "logps/chosen": -203.3969268798828, "logps/rejected": -184.9246063232422, "loss": 0.5409, "rewards/accuracies": 0.75, "rewards/chosen": -0.09071627259254456, "rewards/margins": 0.5488370060920715, "rewards/rejected": -0.6395532488822937, "step": 1099 }, { "epoch": 1.1, "grad_norm": 1.4835679531097412, "learning_rate": 1.125e-06, "logits/chosen": -0.10714483261108398, "logits/rejected": 0.9745811820030212, "logps/chosen": -124.56803131103516, "logps/rejected": -227.90977478027344, "loss": 0.6344, "rewards/accuracies": 0.625, "rewards/chosen": -0.1956585943698883, "rewards/margins": 0.28199756145477295, "rewards/rejected": -0.47765618562698364, "step": 1100 }, { "epoch": 1.101, "grad_norm": 2.0096657276153564, "learning_rate": 1.12375e-06, "logits/chosen": 0.7115126252174377, "logits/rejected": 0.8377418518066406, "logps/chosen": -159.54989624023438, "logps/rejected": -170.7284698486328, "loss": 0.716, "rewards/accuracies": 0.625, "rewards/chosen": -0.41199570894241333, "rewards/margins": 0.150462806224823, "rewards/rejected": -0.5624585151672363, "step": 1101 }, { "epoch": 1.102, "grad_norm": 1.5518004894256592, "learning_rate": 1.1225e-06, "logits/chosen": 0.5234984755516052, "logits/rejected": 0.638088047504425, "logps/chosen": -224.12742614746094, "logps/rejected": -164.10934448242188, "loss": 0.5857, "rewards/accuracies": 0.625, "rewards/chosen": -0.13990774750709534, "rewards/margins": 0.5265593528747559, "rewards/rejected": -0.6664670705795288, "step": 1102 }, { "epoch": 1.103, "grad_norm": 1.1263617277145386, "learning_rate": 1.12125e-06, "logits/chosen": 1.148582100868225, "logits/rejected": 0.35597658157348633, "logps/chosen": -260.7701110839844, "logps/rejected": -185.2922821044922, "loss": 0.5429, "rewards/accuracies": 0.75, "rewards/chosen": -0.1855553686618805, "rewards/margins": 0.41370218992233276, "rewards/rejected": -0.5992575883865356, "step": 1103 }, { "epoch": 1.104, "grad_norm": 1.5447711944580078, "learning_rate": 1.12e-06, "logits/chosen": 0.22127845883369446, "logits/rejected": 1.2678537368774414, "logps/chosen": -149.12054443359375, "logps/rejected": -241.48670959472656, "loss": 0.6515, "rewards/accuracies": 0.625, "rewards/chosen": -0.4592551290988922, "rewards/margins": 0.2188105583190918, "rewards/rejected": -0.6780656576156616, "step": 1104 }, { "epoch": 1.105, "grad_norm": 1.4629106521606445, "learning_rate": 1.1187499999999999e-06, "logits/chosen": 0.5687536001205444, "logits/rejected": 0.235971599817276, "logps/chosen": -239.05348205566406, "logps/rejected": -165.42666625976562, "loss": 0.6278, "rewards/accuracies": 0.75, "rewards/chosen": -0.11567628383636475, "rewards/margins": 0.3427093029022217, "rewards/rejected": -0.4583855867385864, "step": 1105 }, { "epoch": 1.106, "grad_norm": 1.341322660446167, "learning_rate": 1.1174999999999999e-06, "logits/chosen": 0.618337094783783, "logits/rejected": 0.48896151781082153, "logps/chosen": -153.13243103027344, "logps/rejected": -178.7378387451172, "loss": 0.6743, "rewards/accuracies": 0.625, "rewards/chosen": -0.15306808054447174, "rewards/margins": 0.24787549674510956, "rewards/rejected": -0.4009435772895813, "step": 1106 }, { "epoch": 1.107, "grad_norm": 1.1444560289382935, "learning_rate": 1.11625e-06, "logits/chosen": 0.7115829586982727, "logits/rejected": 0.5208548903465271, "logps/chosen": -214.01553344726562, "logps/rejected": -223.1718292236328, "loss": 0.3516, "rewards/accuracies": 0.875, "rewards/chosen": 0.11078520119190216, "rewards/margins": 1.0063159465789795, "rewards/rejected": -0.8955308198928833, "step": 1107 }, { "epoch": 1.108, "grad_norm": 1.5661931037902832, "learning_rate": 1.115e-06, "logits/chosen": 0.5675338506698608, "logits/rejected": 0.7351273894309998, "logps/chosen": -220.79354858398438, "logps/rejected": -158.24087524414062, "loss": 0.6127, "rewards/accuracies": 0.75, "rewards/chosen": -0.11815454065799713, "rewards/margins": 0.4393523335456848, "rewards/rejected": -0.5575068593025208, "step": 1108 }, { "epoch": 1.109, "grad_norm": 2.3050854206085205, "learning_rate": 1.11375e-06, "logits/chosen": 1.0060597658157349, "logits/rejected": 0.7991210222244263, "logps/chosen": -295.4192199707031, "logps/rejected": -294.910400390625, "loss": 0.9472, "rewards/accuracies": 0.375, "rewards/chosen": -0.7045313119888306, "rewards/margins": -0.2583794593811035, "rewards/rejected": -0.44615182280540466, "step": 1109 }, { "epoch": 1.11, "grad_norm": 1.2981491088867188, "learning_rate": 1.1125e-06, "logits/chosen": 0.22103485465049744, "logits/rejected": 0.618222177028656, "logps/chosen": -127.69096374511719, "logps/rejected": -215.05865478515625, "loss": 0.6837, "rewards/accuracies": 0.5, "rewards/chosen": -0.41827690601348877, "rewards/margins": 0.20225057005882263, "rewards/rejected": -0.6205275058746338, "step": 1110 }, { "epoch": 1.111, "grad_norm": 1.163520097732544, "learning_rate": 1.11125e-06, "logits/chosen": 0.42174768447875977, "logits/rejected": 0.858606219291687, "logps/chosen": -182.12950134277344, "logps/rejected": -221.12380981445312, "loss": 0.4787, "rewards/accuracies": 0.625, "rewards/chosen": 0.03671126812696457, "rewards/margins": 0.6112688779830933, "rewards/rejected": -0.5745576024055481, "step": 1111 }, { "epoch": 1.112, "grad_norm": 1.4893856048583984, "learning_rate": 1.11e-06, "logits/chosen": 0.8875036239624023, "logits/rejected": 0.7506153583526611, "logps/chosen": -185.1221160888672, "logps/rejected": -191.3973388671875, "loss": 0.4039, "rewards/accuracies": 0.875, "rewards/chosen": -0.09487714618444443, "rewards/margins": 0.8592801094055176, "rewards/rejected": -0.954157292842865, "step": 1112 }, { "epoch": 1.113, "grad_norm": 0.9886296391487122, "learning_rate": 1.1087499999999998e-06, "logits/chosen": 1.1094164848327637, "logits/rejected": 0.6682988405227661, "logps/chosen": -232.96942138671875, "logps/rejected": -195.09255981445312, "loss": 0.3868, "rewards/accuracies": 0.75, "rewards/chosen": 0.12191762775182724, "rewards/margins": 0.9581345915794373, "rewards/rejected": -0.836216926574707, "step": 1113 }, { "epoch": 1.114, "grad_norm": 1.5551568269729614, "learning_rate": 1.1075e-06, "logits/chosen": 0.7852246761322021, "logits/rejected": 0.5349957942962646, "logps/chosen": -238.10464477539062, "logps/rejected": -252.51805114746094, "loss": 0.6126, "rewards/accuracies": 0.75, "rewards/chosen": -0.29928135871887207, "rewards/margins": 0.5044091939926147, "rewards/rejected": -0.8036905527114868, "step": 1114 }, { "epoch": 1.115, "grad_norm": 1.8610605001449585, "learning_rate": 1.10625e-06, "logits/chosen": 0.8833299279212952, "logits/rejected": 0.5454740524291992, "logps/chosen": -281.01007080078125, "logps/rejected": -139.16592407226562, "loss": 0.645, "rewards/accuracies": 0.5, "rewards/chosen": -0.29925811290740967, "rewards/margins": 0.4037206768989563, "rewards/rejected": -0.7029788494110107, "step": 1115 }, { "epoch": 1.116, "grad_norm": 1.763486623764038, "learning_rate": 1.105e-06, "logits/chosen": 0.93599534034729, "logits/rejected": 0.7766948938369751, "logps/chosen": -246.43161010742188, "logps/rejected": -262.88763427734375, "loss": 0.5509, "rewards/accuracies": 0.75, "rewards/chosen": -0.28139907121658325, "rewards/margins": 0.3984231948852539, "rewards/rejected": -0.6798223257064819, "step": 1116 }, { "epoch": 1.117, "grad_norm": 1.3927795886993408, "learning_rate": 1.10375e-06, "logits/chosen": 0.7535165548324585, "logits/rejected": 0.5671959519386292, "logps/chosen": -283.9959411621094, "logps/rejected": -182.97695922851562, "loss": 0.5306, "rewards/accuracies": 0.75, "rewards/chosen": -0.32453978061676025, "rewards/margins": 0.4993846118450165, "rewards/rejected": -0.8239243626594543, "step": 1117 }, { "epoch": 1.1179999999999999, "grad_norm": 1.3210333585739136, "learning_rate": 1.1025e-06, "logits/chosen": 0.514342188835144, "logits/rejected": 0.7284033298492432, "logps/chosen": -168.3839111328125, "logps/rejected": -188.10177612304688, "loss": 0.6187, "rewards/accuracies": 0.625, "rewards/chosen": -0.23431181907653809, "rewards/margins": 0.4731159508228302, "rewards/rejected": -0.7074277997016907, "step": 1118 }, { "epoch": 1.119, "grad_norm": 1.2651634216308594, "learning_rate": 1.10125e-06, "logits/chosen": 0.42431801557540894, "logits/rejected": 0.31233519315719604, "logps/chosen": -171.66156005859375, "logps/rejected": -207.9723663330078, "loss": 0.5621, "rewards/accuracies": 0.75, "rewards/chosen": -0.09707039594650269, "rewards/margins": 0.6579117178916931, "rewards/rejected": -0.754982054233551, "step": 1119 }, { "epoch": 1.12, "grad_norm": 1.4831985235214233, "learning_rate": 1.1e-06, "logits/chosen": 0.40256235003471375, "logits/rejected": 0.9454125165939331, "logps/chosen": -102.18125915527344, "logps/rejected": -267.7298583984375, "loss": 0.4282, "rewards/accuracies": 0.75, "rewards/chosen": 0.1422681361436844, "rewards/margins": 0.9402196407318115, "rewards/rejected": -0.7979515194892883, "step": 1120 }, { "epoch": 1.121, "grad_norm": 1.2646148204803467, "learning_rate": 1.0987499999999999e-06, "logits/chosen": 0.4384085536003113, "logits/rejected": 0.7730313539505005, "logps/chosen": -154.10055541992188, "logps/rejected": -218.0858612060547, "loss": 0.4575, "rewards/accuracies": 0.875, "rewards/chosen": -0.1470511257648468, "rewards/margins": 0.8387222290039062, "rewards/rejected": -0.9857733845710754, "step": 1121 }, { "epoch": 1.1219999999999999, "grad_norm": 1.1085528135299683, "learning_rate": 1.0974999999999999e-06, "logits/chosen": 0.7424660325050354, "logits/rejected": 0.9213983416557312, "logps/chosen": -230.85760498046875, "logps/rejected": -266.3504638671875, "loss": 0.3238, "rewards/accuracies": 1.0, "rewards/chosen": 0.24002161622047424, "rewards/margins": 1.1131277084350586, "rewards/rejected": -0.8731060028076172, "step": 1122 }, { "epoch": 1.123, "grad_norm": 1.284653663635254, "learning_rate": 1.0962499999999999e-06, "logits/chosen": 0.4064185321331024, "logits/rejected": 0.527804970741272, "logps/chosen": -176.25340270996094, "logps/rejected": -176.28919982910156, "loss": 0.5617, "rewards/accuracies": 0.625, "rewards/chosen": -0.11230802536010742, "rewards/margins": 0.3761066794395447, "rewards/rejected": -0.4884147346019745, "step": 1123 }, { "epoch": 1.124, "grad_norm": 1.8667951822280884, "learning_rate": 1.0949999999999999e-06, "logits/chosen": 1.1931791305541992, "logits/rejected": 0.7023943066596985, "logps/chosen": -267.77386474609375, "logps/rejected": -159.3695831298828, "loss": 0.6962, "rewards/accuracies": 0.75, "rewards/chosen": -0.37358465790748596, "rewards/margins": 0.23653200268745422, "rewards/rejected": -0.610116720199585, "step": 1124 }, { "epoch": 1.125, "grad_norm": 1.6428231000900269, "learning_rate": 1.09375e-06, "logits/chosen": 0.44443392753601074, "logits/rejected": 0.2993094325065613, "logps/chosen": -190.07691955566406, "logps/rejected": -200.14414978027344, "loss": 0.5317, "rewards/accuracies": 0.75, "rewards/chosen": -0.020447062328457832, "rewards/margins": 0.5216141939163208, "rewards/rejected": -0.5420612692832947, "step": 1125 }, { "epoch": 1.126, "grad_norm": 1.5495860576629639, "learning_rate": 1.0925e-06, "logits/chosen": 0.8275769948959351, "logits/rejected": 0.8804055452346802, "logps/chosen": -219.03811645507812, "logps/rejected": -237.89089965820312, "loss": 0.4026, "rewards/accuracies": 0.75, "rewards/chosen": -0.21027278900146484, "rewards/margins": 0.8964025974273682, "rewards/rejected": -1.106675386428833, "step": 1126 }, { "epoch": 1.127, "grad_norm": 1.4341678619384766, "learning_rate": 1.09125e-06, "logits/chosen": 1.4578930139541626, "logits/rejected": 0.487335205078125, "logps/chosen": -337.37518310546875, "logps/rejected": -168.90213012695312, "loss": 0.4571, "rewards/accuracies": 0.875, "rewards/chosen": -0.19702216982841492, "rewards/margins": 0.8492749929428101, "rewards/rejected": -1.0462970733642578, "step": 1127 }, { "epoch": 1.1280000000000001, "grad_norm": 1.1413400173187256, "learning_rate": 1.09e-06, "logits/chosen": 0.2835526168346405, "logits/rejected": 1.0140962600708008, "logps/chosen": -166.29547119140625, "logps/rejected": -242.3469696044922, "loss": 0.422, "rewards/accuracies": 0.875, "rewards/chosen": 0.04046468436717987, "rewards/margins": 0.9182666540145874, "rewards/rejected": -0.8778018951416016, "step": 1128 }, { "epoch": 1.129, "grad_norm": 1.5551999807357788, "learning_rate": 1.08875e-06, "logits/chosen": 1.0000406503677368, "logits/rejected": 1.007237434387207, "logps/chosen": -221.89163208007812, "logps/rejected": -219.50054931640625, "loss": 0.6327, "rewards/accuracies": 0.75, "rewards/chosen": -0.12419615685939789, "rewards/margins": 0.4586321711540222, "rewards/rejected": -0.5828283429145813, "step": 1129 }, { "epoch": 1.13, "grad_norm": 1.2091360092163086, "learning_rate": 1.0875e-06, "logits/chosen": 0.8071736097335815, "logits/rejected": 0.8455997705459595, "logps/chosen": -122.88209533691406, "logps/rejected": -225.67881774902344, "loss": 0.315, "rewards/accuracies": 1.0, "rewards/chosen": 0.22261705994606018, "rewards/margins": 1.0537244081497192, "rewards/rejected": -0.8311073780059814, "step": 1130 }, { "epoch": 1.131, "grad_norm": 1.3431636095046997, "learning_rate": 1.08625e-06, "logits/chosen": 0.7044259309768677, "logits/rejected": 0.6024453043937683, "logps/chosen": -257.3581237792969, "logps/rejected": -273.5203857421875, "loss": 0.4392, "rewards/accuracies": 0.75, "rewards/chosen": -0.1612163484096527, "rewards/margins": 0.8644458055496216, "rewards/rejected": -1.0256621837615967, "step": 1131 }, { "epoch": 1.1320000000000001, "grad_norm": 1.7809752225875854, "learning_rate": 1.085e-06, "logits/chosen": 0.42164531350135803, "logits/rejected": 1.154435157775879, "logps/chosen": -160.61476135253906, "logps/rejected": -280.0984802246094, "loss": 0.6612, "rewards/accuracies": 0.75, "rewards/chosen": -0.37860292196273804, "rewards/margins": 0.3130011260509491, "rewards/rejected": -0.6916040778160095, "step": 1132 }, { "epoch": 1.133, "grad_norm": 1.2640042304992676, "learning_rate": 1.08375e-06, "logits/chosen": 0.9973465204238892, "logits/rejected": 0.7600120902061462, "logps/chosen": -236.735107421875, "logps/rejected": -193.92129516601562, "loss": 0.5534, "rewards/accuracies": 0.625, "rewards/chosen": -0.15723763406276703, "rewards/margins": 0.5066515207290649, "rewards/rejected": -0.6638891696929932, "step": 1133 }, { "epoch": 1.134, "grad_norm": 1.324262022972107, "learning_rate": 1.0825e-06, "logits/chosen": 0.7558152675628662, "logits/rejected": 0.8655913472175598, "logps/chosen": -178.86846923828125, "logps/rejected": -143.08432006835938, "loss": 0.5114, "rewards/accuracies": 0.75, "rewards/chosen": -0.1015019416809082, "rewards/margins": 0.7023622393608093, "rewards/rejected": -0.8038641810417175, "step": 1134 }, { "epoch": 1.135, "grad_norm": 1.0073816776275635, "learning_rate": 1.08125e-06, "logits/chosen": 1.0083874464035034, "logits/rejected": 0.6675175428390503, "logps/chosen": -230.5754852294922, "logps/rejected": -223.7482452392578, "loss": 0.4435, "rewards/accuracies": 0.875, "rewards/chosen": -0.017008405178785324, "rewards/margins": 0.7013007998466492, "rewards/rejected": -0.718309223651886, "step": 1135 }, { "epoch": 1.1360000000000001, "grad_norm": 1.9863452911376953, "learning_rate": 1.08e-06, "logits/chosen": 0.9270622730255127, "logits/rejected": 1.0952696800231934, "logps/chosen": -205.10081481933594, "logps/rejected": -289.9578857421875, "loss": 0.7208, "rewards/accuracies": 0.5, "rewards/chosen": -0.3858628571033478, "rewards/margins": 0.22208552062511444, "rewards/rejected": -0.607948362827301, "step": 1136 }, { "epoch": 1.137, "grad_norm": 1.215493083000183, "learning_rate": 1.07875e-06, "logits/chosen": 0.25300538539886475, "logits/rejected": 0.6744279265403748, "logps/chosen": -159.5237579345703, "logps/rejected": -250.98721313476562, "loss": 0.3962, "rewards/accuracies": 0.875, "rewards/chosen": -0.006610248237848282, "rewards/margins": 1.0376938581466675, "rewards/rejected": -1.0443041324615479, "step": 1137 }, { "epoch": 1.138, "grad_norm": 1.8875874280929565, "learning_rate": 1.0774999999999998e-06, "logits/chosen": 0.8769053220748901, "logits/rejected": 0.7231817245483398, "logps/chosen": -247.68984985351562, "logps/rejected": -158.5634002685547, "loss": 0.6725, "rewards/accuracies": 0.625, "rewards/chosen": -0.41713085770606995, "rewards/margins": 0.40915602445602417, "rewards/rejected": -0.8262869119644165, "step": 1138 }, { "epoch": 1.139, "grad_norm": 1.289406180381775, "learning_rate": 1.0762499999999999e-06, "logits/chosen": 0.4951213300228119, "logits/rejected": 0.5389550924301147, "logps/chosen": -217.8084716796875, "logps/rejected": -241.41317749023438, "loss": 0.6108, "rewards/accuracies": 0.625, "rewards/chosen": -0.24264822900295258, "rewards/margins": 0.4619300663471222, "rewards/rejected": -0.7045782208442688, "step": 1139 }, { "epoch": 1.1400000000000001, "grad_norm": 2.75770902633667, "learning_rate": 1.0749999999999999e-06, "logits/chosen": 1.1077344417572021, "logits/rejected": 1.3466459512710571, "logps/chosen": -304.216796875, "logps/rejected": -221.19691467285156, "loss": 0.5739, "rewards/accuracies": 0.75, "rewards/chosen": -0.33973315358161926, "rewards/margins": 0.641077995300293, "rewards/rejected": -0.9808111190795898, "step": 1140 }, { "epoch": 1.141, "grad_norm": 0.8530765771865845, "learning_rate": 1.0737499999999999e-06, "logits/chosen": 1.1449856758117676, "logits/rejected": 0.5673150420188904, "logps/chosen": -294.487060546875, "logps/rejected": -184.28884887695312, "loss": 0.3698, "rewards/accuracies": 1.0, "rewards/chosen": 0.045958615839481354, "rewards/margins": 0.8562301397323608, "rewards/rejected": -0.8102715015411377, "step": 1141 }, { "epoch": 1.142, "grad_norm": 1.0950390100479126, "learning_rate": 1.0725e-06, "logits/chosen": 1.0039973258972168, "logits/rejected": 0.38287609815597534, "logps/chosen": -149.79168701171875, "logps/rejected": -164.07171630859375, "loss": 0.4243, "rewards/accuracies": 0.75, "rewards/chosen": 0.18197199702262878, "rewards/margins": 1.0249426364898682, "rewards/rejected": -0.842970609664917, "step": 1142 }, { "epoch": 1.143, "grad_norm": 1.4553399085998535, "learning_rate": 1.07125e-06, "logits/chosen": 1.0770231485366821, "logits/rejected": 0.6667523384094238, "logps/chosen": -234.0405731201172, "logps/rejected": -195.873291015625, "loss": 0.6918, "rewards/accuracies": 0.625, "rewards/chosen": -0.3471773862838745, "rewards/margins": 0.20750290155410767, "rewards/rejected": -0.5546802282333374, "step": 1143 }, { "epoch": 1.144, "grad_norm": 1.643660545349121, "learning_rate": 1.07e-06, "logits/chosen": 0.6054476499557495, "logits/rejected": 0.5758451223373413, "logps/chosen": -186.32533264160156, "logps/rejected": -162.19418334960938, "loss": 0.6622, "rewards/accuracies": 0.75, "rewards/chosen": -0.28317558765411377, "rewards/margins": 0.34133827686309814, "rewards/rejected": -0.6245138645172119, "step": 1144 }, { "epoch": 1.145, "grad_norm": 1.4434503316879272, "learning_rate": 1.0687500000000001e-06, "logits/chosen": 0.2504841685295105, "logits/rejected": 0.5982304215431213, "logps/chosen": -142.77316284179688, "logps/rejected": -212.06692504882812, "loss": 0.678, "rewards/accuracies": 0.625, "rewards/chosen": -0.20052136480808258, "rewards/margins": 0.36792004108428955, "rewards/rejected": -0.5684413909912109, "step": 1145 }, { "epoch": 1.146, "grad_norm": 1.3294378519058228, "learning_rate": 1.0675e-06, "logits/chosen": 0.9003331661224365, "logits/rejected": 0.7982346415519714, "logps/chosen": -251.24169921875, "logps/rejected": -188.4387664794922, "loss": 0.4431, "rewards/accuracies": 0.75, "rewards/chosen": -0.04092513397336006, "rewards/margins": 0.7389991283416748, "rewards/rejected": -0.779924213886261, "step": 1146 }, { "epoch": 1.147, "grad_norm": 1.3655503988265991, "learning_rate": 1.06625e-06, "logits/chosen": 0.8023422956466675, "logits/rejected": 1.3240721225738525, "logps/chosen": -188.79273986816406, "logps/rejected": -234.933837890625, "loss": 0.5943, "rewards/accuracies": 0.625, "rewards/chosen": -0.30089807510375977, "rewards/margins": 0.37259215116500854, "rewards/rejected": -0.6734902858734131, "step": 1147 }, { "epoch": 1.148, "grad_norm": 1.3100886344909668, "learning_rate": 1.065e-06, "logits/chosen": 0.45686590671539307, "logits/rejected": 1.1409708261489868, "logps/chosen": -163.35476684570312, "logps/rejected": -239.01516723632812, "loss": 0.4584, "rewards/accuracies": 0.75, "rewards/chosen": -0.07092829048633575, "rewards/margins": 0.7235363721847534, "rewards/rejected": -0.7944647073745728, "step": 1148 }, { "epoch": 1.149, "grad_norm": 1.1839828491210938, "learning_rate": 1.06375e-06, "logits/chosen": 0.9866127967834473, "logits/rejected": 0.6660155057907104, "logps/chosen": -210.5399169921875, "logps/rejected": -195.32972717285156, "loss": 0.4138, "rewards/accuracies": 0.875, "rewards/chosen": -0.17411766946315765, "rewards/margins": 0.889102578163147, "rewards/rejected": -1.0632202625274658, "step": 1149 }, { "epoch": 1.15, "grad_norm": 1.0202140808105469, "learning_rate": 1.0625e-06, "logits/chosen": 0.7545133829116821, "logits/rejected": 0.40587449073791504, "logps/chosen": -287.3480224609375, "logps/rejected": -180.18930053710938, "loss": 0.3897, "rewards/accuracies": 0.875, "rewards/chosen": 0.008127346634864807, "rewards/margins": 1.0078316926956177, "rewards/rejected": -0.9997043013572693, "step": 1150 }, { "epoch": 1.151, "grad_norm": 1.188273549079895, "learning_rate": 1.06125e-06, "logits/chosen": 1.2751834392547607, "logits/rejected": 0.9948415756225586, "logps/chosen": -265.95147705078125, "logps/rejected": -242.52545166015625, "loss": 0.5196, "rewards/accuracies": 0.625, "rewards/chosen": -0.047799959778785706, "rewards/margins": 0.6039276719093323, "rewards/rejected": -0.6517277359962463, "step": 1151 }, { "epoch": 1.152, "grad_norm": 1.2335623502731323, "learning_rate": 1.06e-06, "logits/chosen": 0.6281924843788147, "logits/rejected": 0.7082436680793762, "logps/chosen": -236.93942260742188, "logps/rejected": -239.96026611328125, "loss": 0.4163, "rewards/accuracies": 0.875, "rewards/chosen": -0.02787552773952484, "rewards/margins": 0.8390821218490601, "rewards/rejected": -0.8669576644897461, "step": 1152 }, { "epoch": 1.153, "grad_norm": 1.1434324979782104, "learning_rate": 1.05875e-06, "logits/chosen": 0.9767642021179199, "logits/rejected": 0.5403496623039246, "logps/chosen": -221.17828369140625, "logps/rejected": -204.60768127441406, "loss": 0.4202, "rewards/accuracies": 0.75, "rewards/chosen": -0.07962523400783539, "rewards/margins": 0.8683220744132996, "rewards/rejected": -0.9479472637176514, "step": 1153 }, { "epoch": 1.154, "grad_norm": 1.234824299812317, "learning_rate": 1.0575e-06, "logits/chosen": 0.9127506613731384, "logits/rejected": 1.1006507873535156, "logps/chosen": -227.61935424804688, "logps/rejected": -272.5013427734375, "loss": 0.5221, "rewards/accuracies": 0.625, "rewards/chosen": -0.2852003574371338, "rewards/margins": 0.6385257244110107, "rewards/rejected": -0.9237260818481445, "step": 1154 }, { "epoch": 1.155, "grad_norm": 2.0343289375305176, "learning_rate": 1.0562499999999998e-06, "logits/chosen": 1.404999017715454, "logits/rejected": 0.8342368602752686, "logps/chosen": -320.5321350097656, "logps/rejected": -160.1365966796875, "loss": 0.627, "rewards/accuracies": 0.75, "rewards/chosen": -0.22657348215579987, "rewards/margins": 0.3965683579444885, "rewards/rejected": -0.6231418251991272, "step": 1155 }, { "epoch": 1.156, "grad_norm": 0.9947513937950134, "learning_rate": 1.0549999999999999e-06, "logits/chosen": 0.8027662038803101, "logits/rejected": 0.45095378160476685, "logps/chosen": -308.793701171875, "logps/rejected": -206.70150756835938, "loss": 0.3212, "rewards/accuracies": 1.0, "rewards/chosen": 0.08616113662719727, "rewards/margins": 1.0181328058242798, "rewards/rejected": -0.931971549987793, "step": 1156 }, { "epoch": 1.157, "grad_norm": 2.060321569442749, "learning_rate": 1.0537499999999999e-06, "logits/chosen": 0.6565268039703369, "logits/rejected": 0.6101848483085632, "logps/chosen": -206.91131591796875, "logps/rejected": -139.04151916503906, "loss": 0.7323, "rewards/accuracies": 0.5, "rewards/chosen": -0.45198535919189453, "rewards/margins": 0.15138067305088043, "rewards/rejected": -0.6033660173416138, "step": 1157 }, { "epoch": 1.158, "grad_norm": 1.8651186227798462, "learning_rate": 1.0524999999999999e-06, "logits/chosen": 0.7912957072257996, "logits/rejected": 0.6883158683776855, "logps/chosen": -302.88702392578125, "logps/rejected": -193.8897705078125, "loss": 0.4662, "rewards/accuracies": 0.875, "rewards/chosen": -0.1516781896352768, "rewards/margins": 0.7755095958709717, "rewards/rejected": -0.9271878004074097, "step": 1158 }, { "epoch": 1.159, "grad_norm": 1.5042940378189087, "learning_rate": 1.0512499999999999e-06, "logits/chosen": 0.9807133078575134, "logits/rejected": 0.6423943042755127, "logps/chosen": -305.0409240722656, "logps/rejected": -161.7105712890625, "loss": 0.5749, "rewards/accuracies": 0.75, "rewards/chosen": -0.3781503736972809, "rewards/margins": 0.42172521352767944, "rewards/rejected": -0.7998756170272827, "step": 1159 }, { "epoch": 1.16, "grad_norm": 1.5822570323944092, "learning_rate": 1.05e-06, "logits/chosen": 1.0229029655456543, "logits/rejected": 0.6854151487350464, "logps/chosen": -246.36904907226562, "logps/rejected": -137.72158813476562, "loss": 0.7154, "rewards/accuracies": 0.375, "rewards/chosen": -0.08810693770647049, "rewards/margins": 0.43885499238967896, "rewards/rejected": -0.5269619226455688, "step": 1160 }, { "epoch": 1.161, "grad_norm": 0.9286596775054932, "learning_rate": 1.0487500000000001e-06, "logits/chosen": 1.3069225549697876, "logits/rejected": 1.1389260292053223, "logps/chosen": -297.9068298339844, "logps/rejected": -207.97451782226562, "loss": 0.27, "rewards/accuracies": 1.0, "rewards/chosen": 0.29923009872436523, "rewards/margins": 1.4978423118591309, "rewards/rejected": -1.1986122131347656, "step": 1161 }, { "epoch": 1.162, "grad_norm": 1.0836671590805054, "learning_rate": 1.0475000000000001e-06, "logits/chosen": 0.7947351336479187, "logits/rejected": 0.811655580997467, "logps/chosen": -255.64785766601562, "logps/rejected": -182.96673583984375, "loss": 0.4332, "rewards/accuracies": 0.75, "rewards/chosen": 0.08425416052341461, "rewards/margins": 0.940658688545227, "rewards/rejected": -0.8564045429229736, "step": 1162 }, { "epoch": 1.163, "grad_norm": 2.005244016647339, "learning_rate": 1.04625e-06, "logits/chosen": 0.6140627264976501, "logits/rejected": 0.6664201617240906, "logps/chosen": -154.6314697265625, "logps/rejected": -216.05569458007812, "loss": 0.7098, "rewards/accuracies": 0.375, "rewards/chosen": -0.3247707486152649, "rewards/margins": 0.27791374921798706, "rewards/rejected": -0.602684497833252, "step": 1163 }, { "epoch": 1.164, "grad_norm": 2.0373826026916504, "learning_rate": 1.045e-06, "logits/chosen": 1.1194541454315186, "logits/rejected": 0.2916899025440216, "logps/chosen": -359.7054443359375, "logps/rejected": -163.29586791992188, "loss": 0.5224, "rewards/accuracies": 0.75, "rewards/chosen": -0.17050857841968536, "rewards/margins": 0.5186102390289307, "rewards/rejected": -0.6891187429428101, "step": 1164 }, { "epoch": 1.165, "grad_norm": 1.6860296726226807, "learning_rate": 1.04375e-06, "logits/chosen": 0.16223379969596863, "logits/rejected": 0.5560694336891174, "logps/chosen": -101.7349853515625, "logps/rejected": -199.94268798828125, "loss": 0.3648, "rewards/accuracies": 0.875, "rewards/chosen": 0.12553918361663818, "rewards/margins": 0.9266396760940552, "rewards/rejected": -0.8011004328727722, "step": 1165 }, { "epoch": 1.166, "grad_norm": 1.1729803085327148, "learning_rate": 1.0425e-06, "logits/chosen": 0.8902087211608887, "logits/rejected": 0.000414825975894928, "logps/chosen": -267.0210876464844, "logps/rejected": -123.92733764648438, "loss": 0.4621, "rewards/accuracies": 0.875, "rewards/chosen": 0.2953527569770813, "rewards/margins": 0.6720112562179565, "rewards/rejected": -0.37665852904319763, "step": 1166 }, { "epoch": 1.167, "grad_norm": 1.2531827688217163, "learning_rate": 1.04125e-06, "logits/chosen": 0.7013033628463745, "logits/rejected": 0.9960473775863647, "logps/chosen": -194.54234313964844, "logps/rejected": -178.23130798339844, "loss": 0.5007, "rewards/accuracies": 0.875, "rewards/chosen": -0.13045473396778107, "rewards/margins": 0.5203843712806702, "rewards/rejected": -0.65083909034729, "step": 1167 }, { "epoch": 1.168, "grad_norm": 1.6608086824417114, "learning_rate": 1.04e-06, "logits/chosen": 1.1659033298492432, "logits/rejected": 0.9570848941802979, "logps/chosen": -222.94998168945312, "logps/rejected": -221.95413208007812, "loss": 0.5537, "rewards/accuracies": 0.75, "rewards/chosen": -0.12357570230960846, "rewards/margins": 0.5973485112190247, "rewards/rejected": -0.7209241986274719, "step": 1168 }, { "epoch": 1.169, "grad_norm": 1.5570523738861084, "learning_rate": 1.03875e-06, "logits/chosen": 1.589505910873413, "logits/rejected": 0.8543832302093506, "logps/chosen": -382.6640930175781, "logps/rejected": -196.36044311523438, "loss": 0.5757, "rewards/accuracies": 0.625, "rewards/chosen": -0.2789110541343689, "rewards/margins": 0.6254437565803528, "rewards/rejected": -0.9043547511100769, "step": 1169 }, { "epoch": 1.17, "grad_norm": 1.425499677658081, "learning_rate": 1.0375e-06, "logits/chosen": 1.122374176979065, "logits/rejected": 0.989254355430603, "logps/chosen": -230.42550659179688, "logps/rejected": -202.89378356933594, "loss": 0.4929, "rewards/accuracies": 0.75, "rewards/chosen": -0.1086820736527443, "rewards/margins": 0.6625434160232544, "rewards/rejected": -0.7712254524230957, "step": 1170 }, { "epoch": 1.171, "grad_norm": 1.8963812589645386, "learning_rate": 1.0362499999999998e-06, "logits/chosen": 0.9620141983032227, "logits/rejected": 0.25603175163269043, "logps/chosen": -233.86549377441406, "logps/rejected": -144.8094482421875, "loss": 0.7683, "rewards/accuracies": 0.625, "rewards/chosen": -0.21391089260578156, "rewards/margins": 0.19711951911449432, "rewards/rejected": -0.4110303819179535, "step": 1171 }, { "epoch": 1.172, "grad_norm": 1.2732094526290894, "learning_rate": 1.0349999999999998e-06, "logits/chosen": 1.0603452920913696, "logits/rejected": 0.9873194098472595, "logps/chosen": -314.7107238769531, "logps/rejected": -289.2650146484375, "loss": 0.4815, "rewards/accuracies": 0.75, "rewards/chosen": -0.009261511266231537, "rewards/margins": 0.5946117043495178, "rewards/rejected": -0.6038731932640076, "step": 1172 }, { "epoch": 1.173, "grad_norm": 1.491674542427063, "learning_rate": 1.0337499999999998e-06, "logits/chosen": 0.964127779006958, "logits/rejected": 0.664932131767273, "logps/chosen": -225.7668914794922, "logps/rejected": -228.61892700195312, "loss": 0.5916, "rewards/accuracies": 0.75, "rewards/chosen": -0.1820804625749588, "rewards/margins": 0.32144588232040405, "rewards/rejected": -0.5035263299942017, "step": 1173 }, { "epoch": 1.174, "grad_norm": 1.3125019073486328, "learning_rate": 1.0324999999999999e-06, "logits/chosen": 0.734715461730957, "logits/rejected": 0.9549443125724792, "logps/chosen": -160.43760681152344, "logps/rejected": -234.5572967529297, "loss": 0.4274, "rewards/accuracies": 0.875, "rewards/chosen": -0.07523611932992935, "rewards/margins": 0.7759490013122559, "rewards/rejected": -0.8511850833892822, "step": 1174 }, { "epoch": 1.175, "grad_norm": 1.2757242918014526, "learning_rate": 1.0312499999999999e-06, "logits/chosen": 0.8392452597618103, "logits/rejected": 1.0772207975387573, "logps/chosen": -160.40745544433594, "logps/rejected": -230.681640625, "loss": 0.5237, "rewards/accuracies": 0.75, "rewards/chosen": -0.16282907128334045, "rewards/margins": 0.6041034460067749, "rewards/rejected": -0.7669325470924377, "step": 1175 }, { "epoch": 1.176, "grad_norm": 1.2906203269958496, "learning_rate": 1.0299999999999999e-06, "logits/chosen": 1.226552963256836, "logits/rejected": 0.8040697574615479, "logps/chosen": -226.04043579101562, "logps/rejected": -181.1104736328125, "loss": 0.4425, "rewards/accuracies": 0.875, "rewards/chosen": -0.010020643472671509, "rewards/margins": 0.7293627262115479, "rewards/rejected": -0.739383339881897, "step": 1176 }, { "epoch": 1.177, "grad_norm": 1.5895758867263794, "learning_rate": 1.02875e-06, "logits/chosen": 1.1581590175628662, "logits/rejected": 0.790153980255127, "logps/chosen": -380.99920654296875, "logps/rejected": -274.23443603515625, "loss": 0.5578, "rewards/accuracies": 0.75, "rewards/chosen": 0.0030307769775390625, "rewards/margins": 0.46537867188453674, "rewards/rejected": -0.4623478949069977, "step": 1177 }, { "epoch": 1.178, "grad_norm": 1.2278035879135132, "learning_rate": 1.0275000000000001e-06, "logits/chosen": 0.43033576011657715, "logits/rejected": 1.11920964717865, "logps/chosen": -145.932373046875, "logps/rejected": -205.968994140625, "loss": 0.5015, "rewards/accuracies": 0.875, "rewards/chosen": -0.15798798203468323, "rewards/margins": 0.6174898147583008, "rewards/rejected": -0.7754777669906616, "step": 1178 }, { "epoch": 1.179, "grad_norm": 1.05939519405365, "learning_rate": 1.0262500000000001e-06, "logits/chosen": 0.8143284916877747, "logits/rejected": 0.8199576139450073, "logps/chosen": -127.67581176757812, "logps/rejected": -222.5196533203125, "loss": 0.2963, "rewards/accuracies": 1.0, "rewards/chosen": 0.026296336203813553, "rewards/margins": 1.161435842514038, "rewards/rejected": -1.1351394653320312, "step": 1179 }, { "epoch": 1.18, "grad_norm": 1.738823652267456, "learning_rate": 1.025e-06, "logits/chosen": 0.7957693338394165, "logits/rejected": 0.8499516248703003, "logps/chosen": -197.2432861328125, "logps/rejected": -225.9811553955078, "loss": 0.5932, "rewards/accuracies": 0.75, "rewards/chosen": -0.07345428317785263, "rewards/margins": 0.41778868436813354, "rewards/rejected": -0.4912429749965668, "step": 1180 }, { "epoch": 1.181, "grad_norm": 2.1681578159332275, "learning_rate": 1.02375e-06, "logits/chosen": 0.9317888021469116, "logits/rejected": 0.6251713633537292, "logps/chosen": -262.5891418457031, "logps/rejected": -186.7195587158203, "loss": 0.599, "rewards/accuracies": 0.625, "rewards/chosen": -0.5357331037521362, "rewards/margins": 0.4602227210998535, "rewards/rejected": -0.9959558248519897, "step": 1181 }, { "epoch": 1.182, "grad_norm": 0.9874314665794373, "learning_rate": 1.0225e-06, "logits/chosen": 1.4018480777740479, "logits/rejected": 0.5329911112785339, "logps/chosen": -425.8797912597656, "logps/rejected": -158.02056884765625, "loss": 0.3775, "rewards/accuracies": 0.875, "rewards/chosen": 0.16538183391094208, "rewards/margins": 0.9293777942657471, "rewards/rejected": -0.7639960050582886, "step": 1182 }, { "epoch": 1.183, "grad_norm": 1.1293842792510986, "learning_rate": 1.02125e-06, "logits/chosen": 0.6274093389511108, "logits/rejected": 0.613770067691803, "logps/chosen": -175.77407836914062, "logps/rejected": -184.60134887695312, "loss": 0.416, "rewards/accuracies": 0.75, "rewards/chosen": 0.07706300169229507, "rewards/margins": 0.9053223133087158, "rewards/rejected": -0.828259289264679, "step": 1183 }, { "epoch": 1.184, "grad_norm": 2.036309003829956, "learning_rate": 1.02e-06, "logits/chosen": 0.4967538118362427, "logits/rejected": 0.8081119656562805, "logps/chosen": -137.39015197753906, "logps/rejected": -249.45530700683594, "loss": 0.692, "rewards/accuracies": 0.75, "rewards/chosen": -0.28377455472946167, "rewards/margins": 0.413066565990448, "rewards/rejected": -0.6968411803245544, "step": 1184 }, { "epoch": 1.185, "grad_norm": 1.0967764854431152, "learning_rate": 1.01875e-06, "logits/chosen": 0.2457328885793686, "logits/rejected": 1.0627975463867188, "logps/chosen": -118.62890625, "logps/rejected": -221.88003540039062, "loss": 0.3563, "rewards/accuracies": 0.875, "rewards/chosen": 0.16465544700622559, "rewards/margins": 1.2420231103897095, "rewards/rejected": -1.0773677825927734, "step": 1185 }, { "epoch": 1.186, "grad_norm": 2.048794984817505, "learning_rate": 1.0175e-06, "logits/chosen": 1.4890222549438477, "logits/rejected": 0.8138489723205566, "logps/chosen": -254.92132568359375, "logps/rejected": -195.11007690429688, "loss": 0.6432, "rewards/accuracies": 0.5, "rewards/chosen": -0.3802872896194458, "rewards/margins": 0.41926896572113037, "rewards/rejected": -0.7995562553405762, "step": 1186 }, { "epoch": 1.187, "grad_norm": 1.2772876024246216, "learning_rate": 1.01625e-06, "logits/chosen": 0.5815724730491638, "logits/rejected": 0.6845219135284424, "logps/chosen": -143.76535034179688, "logps/rejected": -166.42788696289062, "loss": 0.4783, "rewards/accuracies": 0.75, "rewards/chosen": -0.057054728269577026, "rewards/margins": 1.063072919845581, "rewards/rejected": -1.1201276779174805, "step": 1187 }, { "epoch": 1.188, "grad_norm": 1.173514485359192, "learning_rate": 1.0149999999999998e-06, "logits/chosen": 0.9689463376998901, "logits/rejected": 1.0427247285842896, "logps/chosen": -221.29376220703125, "logps/rejected": -152.63583374023438, "loss": 0.3857, "rewards/accuracies": 0.75, "rewards/chosen": 0.03638792783021927, "rewards/margins": 1.0643048286437988, "rewards/rejected": -1.0279169082641602, "step": 1188 }, { "epoch": 1.189, "grad_norm": 1.6055854558944702, "learning_rate": 1.0137499999999998e-06, "logits/chosen": 1.1884719133377075, "logits/rejected": 0.2690780460834503, "logps/chosen": -293.3643798828125, "logps/rejected": -159.9482879638672, "loss": 0.5329, "rewards/accuracies": 0.75, "rewards/chosen": -0.006382763385772705, "rewards/margins": 0.6404843926429749, "rewards/rejected": -0.6468670964241028, "step": 1189 }, { "epoch": 1.19, "grad_norm": 1.3221632242202759, "learning_rate": 1.0124999999999998e-06, "logits/chosen": 0.5562900304794312, "logits/rejected": 0.8126837015151978, "logps/chosen": -160.3160400390625, "logps/rejected": -226.5155029296875, "loss": 0.5142, "rewards/accuracies": 0.75, "rewards/chosen": -0.15858487784862518, "rewards/margins": 0.6052203178405762, "rewards/rejected": -0.7638052701950073, "step": 1190 }, { "epoch": 1.191, "grad_norm": 1.007865309715271, "learning_rate": 1.0112499999999998e-06, "logits/chosen": 0.8803719282150269, "logits/rejected": 0.5865211486816406, "logps/chosen": -193.1351318359375, "logps/rejected": -197.9898223876953, "loss": 0.3249, "rewards/accuracies": 1.0, "rewards/chosen": 0.05376044288277626, "rewards/margins": 1.0130635499954224, "rewards/rejected": -0.9593031406402588, "step": 1191 }, { "epoch": 1.192, "grad_norm": 1.0260674953460693, "learning_rate": 1.0099999999999999e-06, "logits/chosen": 0.7416608929634094, "logits/rejected": 0.6037184000015259, "logps/chosen": -208.27198791503906, "logps/rejected": -189.2329864501953, "loss": 0.4077, "rewards/accuracies": 0.75, "rewards/chosen": -0.027952291071414948, "rewards/margins": 0.9075918197631836, "rewards/rejected": -0.9355441331863403, "step": 1192 }, { "epoch": 1.193, "grad_norm": 1.4397697448730469, "learning_rate": 1.00875e-06, "logits/chosen": 1.0746575593948364, "logits/rejected": 0.47515708208084106, "logps/chosen": -322.722900390625, "logps/rejected": -181.20164489746094, "loss": 0.5279, "rewards/accuracies": 0.875, "rewards/chosen": 0.0010580532252788544, "rewards/margins": 0.5281505584716797, "rewards/rejected": -0.527092456817627, "step": 1193 }, { "epoch": 1.194, "grad_norm": 1.2657520771026611, "learning_rate": 1.0075e-06, "logits/chosen": 0.5214769840240479, "logits/rejected": 0.6625247001647949, "logps/chosen": -179.82606506347656, "logps/rejected": -248.94232177734375, "loss": 0.376, "rewards/accuracies": 0.875, "rewards/chosen": 0.022913552820682526, "rewards/margins": 0.9611155986785889, "rewards/rejected": -0.9382021427154541, "step": 1194 }, { "epoch": 1.195, "grad_norm": 0.9836166501045227, "learning_rate": 1.0062500000000001e-06, "logits/chosen": 0.4938434064388275, "logits/rejected": 0.44414713978767395, "logps/chosen": -187.0266571044922, "logps/rejected": -168.18887329101562, "loss": 0.3086, "rewards/accuracies": 0.875, "rewards/chosen": 0.13175459206104279, "rewards/margins": 1.2099300622940063, "rewards/rejected": -1.0781755447387695, "step": 1195 }, { "epoch": 1.196, "grad_norm": 1.5507038831710815, "learning_rate": 1.005e-06, "logits/chosen": 0.5987940430641174, "logits/rejected": 0.6904351711273193, "logps/chosen": -152.32302856445312, "logps/rejected": -263.55694580078125, "loss": 0.3797, "rewards/accuracies": 0.875, "rewards/chosen": -0.1237354427576065, "rewards/margins": 1.0327434539794922, "rewards/rejected": -1.156479001045227, "step": 1196 }, { "epoch": 1.197, "grad_norm": 1.5685291290283203, "learning_rate": 1.00375e-06, "logits/chosen": 1.0634021759033203, "logits/rejected": 0.746648907661438, "logps/chosen": -223.52320861816406, "logps/rejected": -243.82379150390625, "loss": 0.6266, "rewards/accuracies": 0.75, "rewards/chosen": -0.24239349365234375, "rewards/margins": 0.4327338635921478, "rewards/rejected": -0.6751273274421692, "step": 1197 }, { "epoch": 1.198, "grad_norm": 1.7624574899673462, "learning_rate": 1.0025e-06, "logits/chosen": 0.6292460560798645, "logits/rejected": 1.4541797637939453, "logps/chosen": -124.99452209472656, "logps/rejected": -229.556884765625, "loss": 0.5621, "rewards/accuracies": 0.625, "rewards/chosen": -0.14243096113204956, "rewards/margins": 0.5683976411819458, "rewards/rejected": -0.7108286023139954, "step": 1198 }, { "epoch": 1.199, "grad_norm": 2.081012010574341, "learning_rate": 1.00125e-06, "logits/chosen": 0.3318791687488556, "logits/rejected": 0.5415562391281128, "logps/chosen": -125.69586181640625, "logps/rejected": -228.36752319335938, "loss": 0.4331, "rewards/accuracies": 0.875, "rewards/chosen": -0.08066000044345856, "rewards/margins": 0.9209991693496704, "rewards/rejected": -1.0016591548919678, "step": 1199 }, { "epoch": 1.2, "grad_norm": 3.5410614013671875, "learning_rate": 1e-06, "logits/chosen": 1.2347830533981323, "logits/rejected": 0.4495829939842224, "logps/chosen": -280.1577453613281, "logps/rejected": -140.7763671875, "loss": 1.2394, "rewards/accuracies": 0.375, "rewards/chosen": -0.9676413536071777, "rewards/margins": -0.6875264048576355, "rewards/rejected": -0.2801150381565094, "step": 1200 }, { "epoch": 1.201, "grad_norm": 1.7286938428878784, "learning_rate": 9.9875e-07, "logits/chosen": 1.2028430700302124, "logits/rejected": 0.6071735620498657, "logps/chosen": -307.1026916503906, "logps/rejected": -153.29443359375, "loss": 0.6203, "rewards/accuracies": 0.625, "rewards/chosen": -0.21288147568702698, "rewards/margins": 0.421753466129303, "rewards/rejected": -0.6346349120140076, "step": 1201 }, { "epoch": 1.202, "grad_norm": 1.4500534534454346, "learning_rate": 9.975e-07, "logits/chosen": 0.5586124658584595, "logits/rejected": 1.2272998094558716, "logps/chosen": -123.49586486816406, "logps/rejected": -192.39727783203125, "loss": 0.6055, "rewards/accuracies": 0.75, "rewards/chosen": -0.22512690722942352, "rewards/margins": 0.6751748323440552, "rewards/rejected": -0.9003017544746399, "step": 1202 }, { "epoch": 1.203, "grad_norm": 2.262558937072754, "learning_rate": 9.9625e-07, "logits/chosen": 1.01169753074646, "logits/rejected": 1.1704771518707275, "logps/chosen": -247.9832000732422, "logps/rejected": -282.60736083984375, "loss": 0.8558, "rewards/accuracies": 0.5, "rewards/chosen": -0.43090248107910156, "rewards/margins": -0.17337413132190704, "rewards/rejected": -0.2575283646583557, "step": 1203 }, { "epoch": 1.204, "grad_norm": 1.0475949048995972, "learning_rate": 9.95e-07, "logits/chosen": 1.105096459388733, "logits/rejected": 0.5639420747756958, "logps/chosen": -202.48330688476562, "logps/rejected": -171.9492950439453, "loss": 0.4782, "rewards/accuracies": 0.875, "rewards/chosen": 0.021127760410308838, "rewards/margins": 0.869050145149231, "rewards/rejected": -0.8479223847389221, "step": 1204 }, { "epoch": 1.205, "grad_norm": 1.6595971584320068, "learning_rate": 9.9375e-07, "logits/chosen": 1.031968593597412, "logits/rejected": 0.256939172744751, "logps/chosen": -242.24169921875, "logps/rejected": -151.8815155029297, "loss": 0.6395, "rewards/accuracies": 0.5, "rewards/chosen": -0.35344985127449036, "rewards/margins": 0.3108649253845215, "rewards/rejected": -0.6643148064613342, "step": 1205 }, { "epoch": 1.206, "grad_norm": 0.9957694411277771, "learning_rate": 9.925e-07, "logits/chosen": 0.24291999638080597, "logits/rejected": 0.4730932414531708, "logps/chosen": -162.31471252441406, "logps/rejected": -175.14923095703125, "loss": 0.3409, "rewards/accuracies": 0.875, "rewards/chosen": 0.07029061019420624, "rewards/margins": 1.1390265226364136, "rewards/rejected": -1.068735957145691, "step": 1206 }, { "epoch": 1.207, "grad_norm": 1.3117166757583618, "learning_rate": 9.912499999999998e-07, "logits/chosen": 0.434802770614624, "logits/rejected": 0.718829870223999, "logps/chosen": -166.0129852294922, "logps/rejected": -216.33731079101562, "loss": 0.5049, "rewards/accuracies": 0.75, "rewards/chosen": -0.016818813979625702, "rewards/margins": 0.7669503688812256, "rewards/rejected": -0.7837692499160767, "step": 1207 }, { "epoch": 1.208, "grad_norm": 1.32023024559021, "learning_rate": 9.9e-07, "logits/chosen": 0.5778423547744751, "logits/rejected": 0.5342999696731567, "logps/chosen": -138.04808044433594, "logps/rejected": -194.90386962890625, "loss": 0.3706, "rewards/accuracies": 0.875, "rewards/chosen": 0.05855179205536842, "rewards/margins": 1.0019433498382568, "rewards/rejected": -0.9433916211128235, "step": 1208 }, { "epoch": 1.209, "grad_norm": 1.131312608718872, "learning_rate": 9.8875e-07, "logits/chosen": 1.1040480136871338, "logits/rejected": 1.1748543977737427, "logps/chosen": -261.07684326171875, "logps/rejected": -243.9725341796875, "loss": 0.5053, "rewards/accuracies": 0.75, "rewards/chosen": 0.0030137058347463608, "rewards/margins": 0.6442444920539856, "rewards/rejected": -0.6412308216094971, "step": 1209 }, { "epoch": 1.21, "grad_norm": 1.4056700468063354, "learning_rate": 9.875e-07, "logits/chosen": 0.842929482460022, "logits/rejected": 0.8786438703536987, "logps/chosen": -215.935546875, "logps/rejected": -261.4675598144531, "loss": 0.6374, "rewards/accuracies": 0.5, "rewards/chosen": -0.3176743686199188, "rewards/margins": 0.37527352571487427, "rewards/rejected": -0.6929478645324707, "step": 1210 }, { "epoch": 1.211, "grad_norm": 1.4373376369476318, "learning_rate": 9.862499999999999e-07, "logits/chosen": 0.496817946434021, "logits/rejected": 1.0272283554077148, "logps/chosen": -169.0338592529297, "logps/rejected": -215.54385375976562, "loss": 0.5902, "rewards/accuracies": 0.625, "rewards/chosen": -0.1738411784172058, "rewards/margins": 0.4872089624404907, "rewards/rejected": -0.6610502004623413, "step": 1211 }, { "epoch": 1.212, "grad_norm": 1.3265007734298706, "learning_rate": 9.849999999999999e-07, "logits/chosen": 1.3629294633865356, "logits/rejected": 0.7871792316436768, "logps/chosen": -337.7012939453125, "logps/rejected": -182.8880615234375, "loss": 0.3362, "rewards/accuracies": 0.875, "rewards/chosen": 0.078595831990242, "rewards/margins": 1.1767185926437378, "rewards/rejected": -1.0981227159500122, "step": 1212 }, { "epoch": 1.213, "grad_norm": 2.3166663646698, "learning_rate": 9.8375e-07, "logits/chosen": 0.846839189529419, "logits/rejected": 0.6693770289421082, "logps/chosen": -183.64808654785156, "logps/rejected": -311.951904296875, "loss": 0.6564, "rewards/accuracies": 0.5, "rewards/chosen": -0.1811104118824005, "rewards/margins": 0.4834439754486084, "rewards/rejected": -0.6645544171333313, "step": 1213 }, { "epoch": 1.214, "grad_norm": 1.3473474979400635, "learning_rate": 9.825e-07, "logits/chosen": 0.7777671813964844, "logits/rejected": 0.5970234274864197, "logps/chosen": -216.5134735107422, "logps/rejected": -215.01834106445312, "loss": 0.4761, "rewards/accuracies": 0.75, "rewards/chosen": -0.09230347722768784, "rewards/margins": 0.7960019707679749, "rewards/rejected": -0.8883054852485657, "step": 1214 }, { "epoch": 1.215, "grad_norm": 1.5944819450378418, "learning_rate": 9.8125e-07, "logits/chosen": 1.047531247138977, "logits/rejected": 0.8005573749542236, "logps/chosen": -274.5458068847656, "logps/rejected": -213.5723876953125, "loss": 0.4047, "rewards/accuracies": 0.875, "rewards/chosen": -0.110809326171875, "rewards/margins": 0.8621519804000854, "rewards/rejected": -0.9729613065719604, "step": 1215 }, { "epoch": 1.216, "grad_norm": 1.6096186637878418, "learning_rate": 9.8e-07, "logits/chosen": 0.5848865509033203, "logits/rejected": 1.1818217039108276, "logps/chosen": -103.17569732666016, "logps/rejected": -261.5119934082031, "loss": 0.4821, "rewards/accuracies": 0.875, "rewards/chosen": -0.019134338945150375, "rewards/margins": 0.7117371559143066, "rewards/rejected": -0.7308714985847473, "step": 1216 }, { "epoch": 1.217, "grad_norm": 1.188140869140625, "learning_rate": 9.7875e-07, "logits/chosen": 0.7264192700386047, "logits/rejected": 0.9756501913070679, "logps/chosen": -189.57656860351562, "logps/rejected": -231.72618103027344, "loss": 0.4976, "rewards/accuracies": 0.75, "rewards/chosen": -0.17394718527793884, "rewards/margins": 0.7108909487724304, "rewards/rejected": -0.8848381638526917, "step": 1217 }, { "epoch": 1.218, "grad_norm": 1.663684368133545, "learning_rate": 9.775e-07, "logits/chosen": 0.23277540504932404, "logits/rejected": 1.3722538948059082, "logps/chosen": -186.17059326171875, "logps/rejected": -255.70733642578125, "loss": 0.7073, "rewards/accuracies": 0.75, "rewards/chosen": -0.25520458817481995, "rewards/margins": 0.12830761075019836, "rewards/rejected": -0.3835121989250183, "step": 1218 }, { "epoch": 1.219, "grad_norm": 1.4065848588943481, "learning_rate": 9.7625e-07, "logits/chosen": 0.8490874171257019, "logits/rejected": 1.0050830841064453, "logps/chosen": -248.59861755371094, "logps/rejected": -271.289306640625, "loss": 0.5947, "rewards/accuracies": 0.75, "rewards/chosen": -0.10009646415710449, "rewards/margins": 0.5512074828147888, "rewards/rejected": -0.6513039469718933, "step": 1219 }, { "epoch": 1.22, "grad_norm": 1.7898527383804321, "learning_rate": 9.75e-07, "logits/chosen": 1.2834380865097046, "logits/rejected": 0.21790321171283722, "logps/chosen": -263.8951721191406, "logps/rejected": -133.24472045898438, "loss": 0.6368, "rewards/accuracies": 0.75, "rewards/chosen": -0.1837257444858551, "rewards/margins": 0.4522421956062317, "rewards/rejected": -0.6359679102897644, "step": 1220 }, { "epoch": 1.221, "grad_norm": 1.7850316762924194, "learning_rate": 9.7375e-07, "logits/chosen": 0.6355536580085754, "logits/rejected": 0.5071321725845337, "logps/chosen": -207.936767578125, "logps/rejected": -174.81463623046875, "loss": 0.8202, "rewards/accuracies": 0.5, "rewards/chosen": -0.3509925901889801, "rewards/margins": 0.08816316723823547, "rewards/rejected": -0.4391557574272156, "step": 1221 }, { "epoch": 1.222, "grad_norm": 2.241255521774292, "learning_rate": 9.725e-07, "logits/chosen": 1.0116685628890991, "logits/rejected": 0.9556651711463928, "logps/chosen": -360.414306640625, "logps/rejected": -253.746337890625, "loss": 0.7681, "rewards/accuracies": 0.5, "rewards/chosen": -0.2673821449279785, "rewards/margins": 0.08080962300300598, "rewards/rejected": -0.3481917381286621, "step": 1222 }, { "epoch": 1.223, "grad_norm": 2.090730905532837, "learning_rate": 9.712499999999998e-07, "logits/chosen": 1.0351582765579224, "logits/rejected": 1.2573606967926025, "logps/chosen": -361.90008544921875, "logps/rejected": -328.12335205078125, "loss": 0.7013, "rewards/accuracies": 0.75, "rewards/chosen": -0.20518046617507935, "rewards/margins": 0.6465373039245605, "rewards/rejected": -0.8517177104949951, "step": 1223 }, { "epoch": 1.224, "grad_norm": 1.8276963233947754, "learning_rate": 9.7e-07, "logits/chosen": 0.727600634098053, "logits/rejected": 0.9025369882583618, "logps/chosen": -186.8101348876953, "logps/rejected": -239.95265197753906, "loss": 0.6013, "rewards/accuracies": 0.625, "rewards/chosen": -0.33889615535736084, "rewards/margins": 0.4004040062427521, "rewards/rejected": -0.7393001317977905, "step": 1224 }, { "epoch": 1.225, "grad_norm": 0.9760556817054749, "learning_rate": 9.6875e-07, "logits/chosen": 0.7768080830574036, "logits/rejected": 0.8795886039733887, "logps/chosen": -256.3324890136719, "logps/rejected": -162.42929077148438, "loss": 0.4081, "rewards/accuracies": 0.875, "rewards/chosen": -0.11777310073375702, "rewards/margins": 0.7960665225982666, "rewards/rejected": -0.9138396978378296, "step": 1225 }, { "epoch": 1.226, "grad_norm": 2.3913230895996094, "learning_rate": 9.675e-07, "logits/chosen": 1.0297497510910034, "logits/rejected": 0.8719997406005859, "logps/chosen": -195.89404296875, "logps/rejected": -210.68829345703125, "loss": 0.8307, "rewards/accuracies": 0.5, "rewards/chosen": -0.5557735562324524, "rewards/margins": -0.035172272473573685, "rewards/rejected": -0.5206012725830078, "step": 1226 }, { "epoch": 1.227, "grad_norm": 1.1218256950378418, "learning_rate": 9.6625e-07, "logits/chosen": 0.4878440499305725, "logits/rejected": 0.7771992683410645, "logps/chosen": -159.87611389160156, "logps/rejected": -258.782470703125, "loss": 0.4799, "rewards/accuracies": 0.75, "rewards/chosen": -0.10563401877880096, "rewards/margins": 0.8407614231109619, "rewards/rejected": -0.9463953971862793, "step": 1227 }, { "epoch": 1.228, "grad_norm": 1.1263353824615479, "learning_rate": 9.649999999999999e-07, "logits/chosen": 0.9302968978881836, "logits/rejected": 0.978205144405365, "logps/chosen": -209.238525390625, "logps/rejected": -182.76065063476562, "loss": 0.4182, "rewards/accuracies": 0.875, "rewards/chosen": -0.0633082389831543, "rewards/margins": 0.8153718709945679, "rewards/rejected": -0.8786801695823669, "step": 1228 }, { "epoch": 1.229, "grad_norm": 1.7365015745162964, "learning_rate": 9.637499999999999e-07, "logits/chosen": 0.9216824173927307, "logits/rejected": 0.8003802299499512, "logps/chosen": -266.00433349609375, "logps/rejected": -247.21505737304688, "loss": 0.7312, "rewards/accuracies": 0.625, "rewards/chosen": -0.4022713899612427, "rewards/margins": 0.14525634050369263, "rewards/rejected": -0.5475277304649353, "step": 1229 }, { "epoch": 1.23, "grad_norm": 1.088574767112732, "learning_rate": 9.624999999999999e-07, "logits/chosen": 0.6299886703491211, "logits/rejected": 0.3075682520866394, "logps/chosen": -194.25979614257812, "logps/rejected": -154.39111328125, "loss": 0.4304, "rewards/accuracies": 0.75, "rewards/chosen": 0.09338504076004028, "rewards/margins": 0.8595805168151855, "rewards/rejected": -0.76619553565979, "step": 1230 }, { "epoch": 1.231, "grad_norm": 0.8994821310043335, "learning_rate": 9.6125e-07, "logits/chosen": 0.6126967072486877, "logits/rejected": 0.49705085158348083, "logps/chosen": -267.0284729003906, "logps/rejected": -212.5930633544922, "loss": 0.3478, "rewards/accuracies": 0.875, "rewards/chosen": 0.09175348281860352, "rewards/margins": 1.1073837280273438, "rewards/rejected": -1.0156301259994507, "step": 1231 }, { "epoch": 1.232, "grad_norm": 1.5261142253875732, "learning_rate": 9.6e-07, "logits/chosen": 0.9822372198104858, "logits/rejected": 0.5220682621002197, "logps/chosen": -246.79965209960938, "logps/rejected": -253.4149932861328, "loss": 0.4306, "rewards/accuracies": 0.75, "rewards/chosen": -0.15570050477981567, "rewards/margins": 1.009648323059082, "rewards/rejected": -1.165348768234253, "step": 1232 }, { "epoch": 1.233, "grad_norm": 1.3911175727844238, "learning_rate": 9.5875e-07, "logits/chosen": 1.0450928211212158, "logits/rejected": 0.6367064118385315, "logps/chosen": -245.06625366210938, "logps/rejected": -158.78387451171875, "loss": 0.5069, "rewards/accuracies": 0.625, "rewards/chosen": -0.06644430011510849, "rewards/margins": 0.7416574954986572, "rewards/rejected": -0.8081017732620239, "step": 1233 }, { "epoch": 1.234, "grad_norm": 1.546083927154541, "learning_rate": 9.575e-07, "logits/chosen": 0.49674350023269653, "logits/rejected": 0.40666624903678894, "logps/chosen": -180.15420532226562, "logps/rejected": -139.7668914794922, "loss": 0.5846, "rewards/accuracies": 0.75, "rewards/chosen": -0.29668760299682617, "rewards/margins": 0.5276574492454529, "rewards/rejected": -0.8243451118469238, "step": 1234 }, { "epoch": 1.2349999999999999, "grad_norm": 1.626781940460205, "learning_rate": 9.5625e-07, "logits/chosen": 0.5059763193130493, "logits/rejected": 0.12179061770439148, "logps/chosen": -231.34564208984375, "logps/rejected": -177.5782928466797, "loss": 0.5439, "rewards/accuracies": 0.75, "rewards/chosen": -0.2909153997898102, "rewards/margins": 0.5952122211456299, "rewards/rejected": -0.8861275911331177, "step": 1235 }, { "epoch": 1.236, "grad_norm": 1.6688631772994995, "learning_rate": 9.55e-07, "logits/chosen": 0.572638988494873, "logits/rejected": 0.7234945297241211, "logps/chosen": -171.57830810546875, "logps/rejected": -183.9564208984375, "loss": 0.725, "rewards/accuracies": 0.75, "rewards/chosen": -0.306282639503479, "rewards/margins": 0.48455700278282166, "rewards/rejected": -0.7908396124839783, "step": 1236 }, { "epoch": 1.237, "grad_norm": 1.2403597831726074, "learning_rate": 9.5375e-07, "logits/chosen": 0.2198057919740677, "logits/rejected": 0.5981228947639465, "logps/chosen": -162.70169067382812, "logps/rejected": -248.33126831054688, "loss": 0.486, "rewards/accuracies": 0.625, "rewards/chosen": -0.010256096720695496, "rewards/margins": 0.8187582492828369, "rewards/rejected": -0.8290144205093384, "step": 1237 }, { "epoch": 1.238, "grad_norm": 1.387811541557312, "learning_rate": 9.525e-07, "logits/chosen": 0.6919543743133545, "logits/rejected": 0.8841191530227661, "logps/chosen": -184.99166870117188, "logps/rejected": -220.46188354492188, "loss": 0.2978, "rewards/accuracies": 1.0, "rewards/chosen": 0.10257302969694138, "rewards/margins": 1.174391746520996, "rewards/rejected": -1.0718187093734741, "step": 1238 }, { "epoch": 1.2389999999999999, "grad_norm": 0.9290878772735596, "learning_rate": 9.5125e-07, "logits/chosen": 0.5681501626968384, "logits/rejected": 0.5004057884216309, "logps/chosen": -243.00833129882812, "logps/rejected": -192.50918579101562, "loss": 0.2425, "rewards/accuracies": 1.0, "rewards/chosen": 0.36853188276290894, "rewards/margins": 1.4234192371368408, "rewards/rejected": -1.054887294769287, "step": 1239 }, { "epoch": 1.24, "grad_norm": 4.30327033996582, "learning_rate": 9.499999999999999e-07, "logits/chosen": 1.4431638717651367, "logits/rejected": 0.1636154055595398, "logps/chosen": -402.919921875, "logps/rejected": -142.02041625976562, "loss": 1.0007, "rewards/accuracies": 0.5, "rewards/chosen": -0.6328968405723572, "rewards/margins": -0.3619573712348938, "rewards/rejected": -0.270939439535141, "step": 1240 }, { "epoch": 1.241, "grad_norm": 1.3862537145614624, "learning_rate": 9.487499999999999e-07, "logits/chosen": 0.5898460745811462, "logits/rejected": 0.5381274223327637, "logps/chosen": -162.2577362060547, "logps/rejected": -249.39930725097656, "loss": 0.5601, "rewards/accuracies": 0.75, "rewards/chosen": -0.139988511800766, "rewards/margins": 0.41378888487815857, "rewards/rejected": -0.5537773966789246, "step": 1241 }, { "epoch": 1.242, "grad_norm": 1.5735241174697876, "learning_rate": 9.474999999999999e-07, "logits/chosen": 1.113017201423645, "logits/rejected": 0.765633761882782, "logps/chosen": -321.22271728515625, "logps/rejected": -268.65850830078125, "loss": 0.7099, "rewards/accuracies": 0.5, "rewards/chosen": -0.28950756788253784, "rewards/margins": 0.5895587205886841, "rewards/rejected": -0.8790662884712219, "step": 1242 }, { "epoch": 1.2429999999999999, "grad_norm": 0.9794692397117615, "learning_rate": 9.462499999999999e-07, "logits/chosen": 0.691249430179596, "logits/rejected": 0.7686201930046082, "logps/chosen": -232.84361267089844, "logps/rejected": -200.92758178710938, "loss": 0.3793, "rewards/accuracies": 0.875, "rewards/chosen": 0.042276009917259216, "rewards/margins": 0.9520549774169922, "rewards/rejected": -0.9097790122032166, "step": 1243 }, { "epoch": 1.244, "grad_norm": 1.2603451013565063, "learning_rate": 9.45e-07, "logits/chosen": 0.988508939743042, "logits/rejected": 0.34489867091178894, "logps/chosen": -209.24374389648438, "logps/rejected": -173.81166076660156, "loss": 0.4675, "rewards/accuracies": 0.75, "rewards/chosen": 0.22730466723442078, "rewards/margins": 1.154464840888977, "rewards/rejected": -0.9271601438522339, "step": 1244 }, { "epoch": 1.245, "grad_norm": 1.7391643524169922, "learning_rate": 9.4375e-07, "logits/chosen": 0.3562854528427124, "logits/rejected": 0.9162298440933228, "logps/chosen": -150.17478942871094, "logps/rejected": -223.18438720703125, "loss": 0.6606, "rewards/accuracies": 0.625, "rewards/chosen": -0.3123243451118469, "rewards/margins": 0.40205004811286926, "rewards/rejected": -0.714374303817749, "step": 1245 }, { "epoch": 1.246, "grad_norm": 2.1693267822265625, "learning_rate": 9.425e-07, "logits/chosen": 0.5654512643814087, "logits/rejected": 0.6584863662719727, "logps/chosen": -190.0970001220703, "logps/rejected": -188.67835998535156, "loss": 0.6019, "rewards/accuracies": 0.75, "rewards/chosen": -0.34308120608329773, "rewards/margins": 0.5238328576087952, "rewards/rejected": -0.8669140338897705, "step": 1246 }, { "epoch": 1.2469999999999999, "grad_norm": 2.346191644668579, "learning_rate": 9.4125e-07, "logits/chosen": 0.6340808272361755, "logits/rejected": 0.7701869010925293, "logps/chosen": -252.0585174560547, "logps/rejected": -229.88748168945312, "loss": 0.9125, "rewards/accuracies": 0.375, "rewards/chosen": -0.7651640772819519, "rewards/margins": -0.2838420271873474, "rewards/rejected": -0.4813220798969269, "step": 1247 }, { "epoch": 1.248, "grad_norm": 1.3834763765335083, "learning_rate": 9.399999999999999e-07, "logits/chosen": 0.5413788557052612, "logits/rejected": 0.042011938989162445, "logps/chosen": -215.6510467529297, "logps/rejected": -141.158447265625, "loss": 0.5624, "rewards/accuracies": 0.75, "rewards/chosen": -0.03778572380542755, "rewards/margins": 0.554914116859436, "rewards/rejected": -0.5926998257637024, "step": 1248 }, { "epoch": 1.249, "grad_norm": 1.6446195840835571, "learning_rate": 9.387499999999999e-07, "logits/chosen": 0.9480259418487549, "logits/rejected": 0.4540032744407654, "logps/chosen": -174.8953857421875, "logps/rejected": -228.46127319335938, "loss": 0.8472, "rewards/accuracies": 0.5, "rewards/chosen": -0.6586581468582153, "rewards/margins": 0.12069255113601685, "rewards/rejected": -0.7793506979942322, "step": 1249 }, { "epoch": 1.25, "grad_norm": 1.1787525415420532, "learning_rate": 9.374999999999999e-07, "logits/chosen": 0.24040138721466064, "logits/rejected": 1.1968753337860107, "logps/chosen": -111.950439453125, "logps/rejected": -256.82171630859375, "loss": 0.3399, "rewards/accuracies": 1.0, "rewards/chosen": 0.15780305862426758, "rewards/margins": 1.1065837144851685, "rewards/rejected": -0.9487806558609009, "step": 1250 }, { "epoch": 1.251, "grad_norm": 0.9216907024383545, "learning_rate": 9.3625e-07, "logits/chosen": 0.8631693124771118, "logits/rejected": 0.9114048480987549, "logps/chosen": -226.53314208984375, "logps/rejected": -231.5448760986328, "loss": 0.2987, "rewards/accuracies": 0.875, "rewards/chosen": 0.30499088764190674, "rewards/margins": 1.2669382095336914, "rewards/rejected": -0.9619472026824951, "step": 1251 }, { "epoch": 1.252, "grad_norm": 2.1358859539031982, "learning_rate": 9.35e-07, "logits/chosen": 0.49934226274490356, "logits/rejected": 1.1550345420837402, "logps/chosen": -183.1085205078125, "logps/rejected": -272.2427062988281, "loss": 0.5413, "rewards/accuracies": 0.75, "rewards/chosen": -0.20853224396705627, "rewards/margins": 0.4907176196575165, "rewards/rejected": -0.6992498636245728, "step": 1252 }, { "epoch": 1.2530000000000001, "grad_norm": 2.068361520767212, "learning_rate": 9.3375e-07, "logits/chosen": 0.9362561106681824, "logits/rejected": 0.7396535277366638, "logps/chosen": -228.37147521972656, "logps/rejected": -295.00103759765625, "loss": 0.7476, "rewards/accuracies": 0.375, "rewards/chosen": -0.5440347790718079, "rewards/margins": 0.03646620362997055, "rewards/rejected": -0.5805010199546814, "step": 1253 }, { "epoch": 1.254, "grad_norm": 1.5750243663787842, "learning_rate": 9.325e-07, "logits/chosen": 0.36201032996177673, "logits/rejected": 0.5563344955444336, "logps/chosen": -118.72964477539062, "logps/rejected": -178.14007568359375, "loss": 0.3325, "rewards/accuracies": 0.875, "rewards/chosen": 0.173640638589859, "rewards/margins": 1.206504225730896, "rewards/rejected": -1.0328636169433594, "step": 1254 }, { "epoch": 1.255, "grad_norm": 1.7998045682907104, "learning_rate": 9.3125e-07, "logits/chosen": 0.8068802356719971, "logits/rejected": 0.29445335268974304, "logps/chosen": -293.6274719238281, "logps/rejected": -207.05999755859375, "loss": 0.6287, "rewards/accuracies": 0.625, "rewards/chosen": -0.21635659039020538, "rewards/margins": 0.554617702960968, "rewards/rejected": -0.7709742784500122, "step": 1255 }, { "epoch": 1.256, "grad_norm": 2.021437406539917, "learning_rate": 9.3e-07, "logits/chosen": 1.5707025527954102, "logits/rejected": 0.7650539875030518, "logps/chosen": -287.0270080566406, "logps/rejected": -193.79977416992188, "loss": 0.7817, "rewards/accuracies": 0.5, "rewards/chosen": -0.40513932704925537, "rewards/margins": 0.013212442398071289, "rewards/rejected": -0.41835176944732666, "step": 1256 }, { "epoch": 1.2570000000000001, "grad_norm": 1.3417688608169556, "learning_rate": 9.287499999999999e-07, "logits/chosen": 0.9080255031585693, "logits/rejected": 0.6699914336204529, "logps/chosen": -265.16552734375, "logps/rejected": -231.9658966064453, "loss": 0.4961, "rewards/accuracies": 0.875, "rewards/chosen": -0.2317645102739334, "rewards/margins": 0.517615020275116, "rewards/rejected": -0.7493795156478882, "step": 1257 }, { "epoch": 1.258, "grad_norm": 1.1175082921981812, "learning_rate": 9.274999999999999e-07, "logits/chosen": 1.009948968887329, "logits/rejected": 1.085049033164978, "logps/chosen": -283.09942626953125, "logps/rejected": -215.92471313476562, "loss": 0.3138, "rewards/accuracies": 0.875, "rewards/chosen": -0.028244204819202423, "rewards/margins": 1.2994745969772339, "rewards/rejected": -1.3277188539505005, "step": 1258 }, { "epoch": 1.259, "grad_norm": 1.7260518074035645, "learning_rate": 9.2625e-07, "logits/chosen": 0.6160776615142822, "logits/rejected": 1.5273164510726929, "logps/chosen": -161.50259399414062, "logps/rejected": -396.385498046875, "loss": 0.5591, "rewards/accuracies": 0.625, "rewards/chosen": 0.006674773991107941, "rewards/margins": 0.45960086584091187, "rewards/rejected": -0.45292606949806213, "step": 1259 }, { "epoch": 1.26, "grad_norm": 1.5215699672698975, "learning_rate": 9.25e-07, "logits/chosen": 0.46381986141204834, "logits/rejected": 0.7297818660736084, "logps/chosen": -147.38143920898438, "logps/rejected": -241.5770263671875, "loss": 0.5598, "rewards/accuracies": 0.75, "rewards/chosen": -0.14948970079421997, "rewards/margins": 0.7032841444015503, "rewards/rejected": -0.852773904800415, "step": 1260 }, { "epoch": 1.2610000000000001, "grad_norm": 1.2712041139602661, "learning_rate": 9.237499999999999e-07, "logits/chosen": 0.8568912148475647, "logits/rejected": 0.46304386854171753, "logps/chosen": -211.05950927734375, "logps/rejected": -211.7920379638672, "loss": 0.4198, "rewards/accuracies": 0.875, "rewards/chosen": 0.14413443207740784, "rewards/margins": 1.006489872932434, "rewards/rejected": -0.8623554110527039, "step": 1261 }, { "epoch": 1.262, "grad_norm": 1.16160249710083, "learning_rate": 9.225e-07, "logits/chosen": 0.5607972741127014, "logits/rejected": 0.5445629954338074, "logps/chosen": -219.78494262695312, "logps/rejected": -263.4852600097656, "loss": 0.3994, "rewards/accuracies": 0.875, "rewards/chosen": 0.16808146238327026, "rewards/margins": 1.040792465209961, "rewards/rejected": -0.8727110028266907, "step": 1262 }, { "epoch": 1.263, "grad_norm": 1.1656805276870728, "learning_rate": 9.2125e-07, "logits/chosen": 0.5902819633483887, "logits/rejected": 0.9804059267044067, "logps/chosen": -255.66900634765625, "logps/rejected": -215.69387817382812, "loss": 0.5391, "rewards/accuracies": 0.75, "rewards/chosen": -0.059659577906131744, "rewards/margins": 0.7690480947494507, "rewards/rejected": -0.8287075757980347, "step": 1263 }, { "epoch": 1.264, "grad_norm": 1.6315882205963135, "learning_rate": 9.2e-07, "logits/chosen": 0.7998386025428772, "logits/rejected": 0.5962492227554321, "logps/chosen": -235.70327758789062, "logps/rejected": -151.130615234375, "loss": 0.6287, "rewards/accuracies": 0.625, "rewards/chosen": -0.2240365892648697, "rewards/margins": 0.37889939546585083, "rewards/rejected": -0.6029359698295593, "step": 1264 }, { "epoch": 1.2650000000000001, "grad_norm": 1.8641374111175537, "learning_rate": 9.187499999999999e-07, "logits/chosen": 0.677179217338562, "logits/rejected": 1.2858288288116455, "logps/chosen": -94.98684692382812, "logps/rejected": -264.4476623535156, "loss": 0.6525, "rewards/accuracies": 0.75, "rewards/chosen": -0.12175427377223969, "rewards/margins": 0.46567410230636597, "rewards/rejected": -0.5874283909797668, "step": 1265 }, { "epoch": 1.266, "grad_norm": 2.2227859497070312, "learning_rate": 9.174999999999999e-07, "logits/chosen": 0.5455325841903687, "logits/rejected": 1.0957154035568237, "logps/chosen": -163.5259246826172, "logps/rejected": -244.29098510742188, "loss": 0.7364, "rewards/accuracies": 0.5, "rewards/chosen": -0.33313456177711487, "rewards/margins": 0.13694286346435547, "rewards/rejected": -0.47007742524147034, "step": 1266 }, { "epoch": 1.267, "grad_norm": 1.747718334197998, "learning_rate": 9.1625e-07, "logits/chosen": 1.0351859331130981, "logits/rejected": 0.9138863682746887, "logps/chosen": -176.3755645751953, "logps/rejected": -159.32186889648438, "loss": 0.6718, "rewards/accuracies": 0.625, "rewards/chosen": -0.25800108909606934, "rewards/margins": 0.633395791053772, "rewards/rejected": -0.8913969397544861, "step": 1267 }, { "epoch": 1.268, "grad_norm": 2.837785243988037, "learning_rate": 9.15e-07, "logits/chosen": 0.615704357624054, "logits/rejected": 0.4891183376312256, "logps/chosen": -181.77392578125, "logps/rejected": -171.5309295654297, "loss": 1.1581, "rewards/accuracies": 0.375, "rewards/chosen": -0.8319633603096008, "rewards/margins": -0.41230618953704834, "rewards/rejected": -0.4196571707725525, "step": 1268 }, { "epoch": 1.2690000000000001, "grad_norm": 1.5422759056091309, "learning_rate": 9.137499999999999e-07, "logits/chosen": 0.39608824253082275, "logits/rejected": 1.1030930280685425, "logps/chosen": -121.88031005859375, "logps/rejected": -276.36273193359375, "loss": 0.3128, "rewards/accuracies": 1.0, "rewards/chosen": 0.22108250856399536, "rewards/margins": 1.1349709033966064, "rewards/rejected": -0.9138883352279663, "step": 1269 }, { "epoch": 1.27, "grad_norm": 1.9641218185424805, "learning_rate": 9.124999999999999e-07, "logits/chosen": 1.568311095237732, "logits/rejected": 0.05508464574813843, "logps/chosen": -305.29864501953125, "logps/rejected": -169.3494415283203, "loss": 0.5797, "rewards/accuracies": 0.75, "rewards/chosen": -0.28515636920928955, "rewards/margins": 0.5808374285697937, "rewards/rejected": -0.8659937977790833, "step": 1270 }, { "epoch": 1.271, "grad_norm": 1.083357810974121, "learning_rate": 9.1125e-07, "logits/chosen": 0.6949782371520996, "logits/rejected": 0.9079910516738892, "logps/chosen": -120.4149169921875, "logps/rejected": -207.85360717773438, "loss": 0.3417, "rewards/accuracies": 0.875, "rewards/chosen": 0.1953480839729309, "rewards/margins": 1.1268284320831299, "rewards/rejected": -0.9314804077148438, "step": 1271 }, { "epoch": 1.272, "grad_norm": 1.0308130979537964, "learning_rate": 9.1e-07, "logits/chosen": 0.3259807825088501, "logits/rejected": 0.1306534707546234, "logps/chosen": -154.2001953125, "logps/rejected": -161.63304138183594, "loss": 0.2946, "rewards/accuracies": 0.875, "rewards/chosen": 0.2619275152683258, "rewards/margins": 1.4536540508270264, "rewards/rejected": -1.1917264461517334, "step": 1272 }, { "epoch": 1.2730000000000001, "grad_norm": 1.2297865152359009, "learning_rate": 9.087499999999999e-07, "logits/chosen": 1.15565025806427, "logits/rejected": 0.9670915603637695, "logps/chosen": -259.4508056640625, "logps/rejected": -240.8125762939453, "loss": 0.5048, "rewards/accuracies": 0.875, "rewards/chosen": -0.10847682505846024, "rewards/margins": 0.8912837505340576, "rewards/rejected": -0.9997605681419373, "step": 1273 }, { "epoch": 1.274, "grad_norm": 1.3985612392425537, "learning_rate": 9.074999999999999e-07, "logits/chosen": 0.3047868609428406, "logits/rejected": 0.8959250450134277, "logps/chosen": -128.84335327148438, "logps/rejected": -269.8446044921875, "loss": 0.5443, "rewards/accuracies": 0.625, "rewards/chosen": -0.24178777635097504, "rewards/margins": 0.4846455156803131, "rewards/rejected": -0.726433277130127, "step": 1274 }, { "epoch": 1.275, "grad_norm": 1.1925045251846313, "learning_rate": 9.0625e-07, "logits/chosen": 0.7957760095596313, "logits/rejected": 0.6546478271484375, "logps/chosen": -174.49252319335938, "logps/rejected": -209.94102478027344, "loss": 0.3135, "rewards/accuracies": 0.875, "rewards/chosen": 0.31289738416671753, "rewards/margins": 1.3453617095947266, "rewards/rejected": -1.0324642658233643, "step": 1275 }, { "epoch": 1.276, "grad_norm": 1.625672698020935, "learning_rate": 9.05e-07, "logits/chosen": 0.792607843875885, "logits/rejected": 1.059097409248352, "logps/chosen": -220.04461669921875, "logps/rejected": -229.04452514648438, "loss": 0.707, "rewards/accuracies": 0.5, "rewards/chosen": -0.2556207776069641, "rewards/margins": 0.3067537546157837, "rewards/rejected": -0.5623745322227478, "step": 1276 }, { "epoch": 1.2770000000000001, "grad_norm": 1.2779176235198975, "learning_rate": 9.0375e-07, "logits/chosen": 1.007444143295288, "logits/rejected": 1.0771980285644531, "logps/chosen": -245.84646606445312, "logps/rejected": -177.94677734375, "loss": 0.5158, "rewards/accuracies": 0.875, "rewards/chosen": -0.18626898527145386, "rewards/margins": 0.5038755536079407, "rewards/rejected": -0.6901445388793945, "step": 1277 }, { "epoch": 1.278, "grad_norm": 1.7222808599472046, "learning_rate": 9.024999999999999e-07, "logits/chosen": 0.8556268215179443, "logits/rejected": 0.4015171527862549, "logps/chosen": -262.3511962890625, "logps/rejected": -164.2280731201172, "loss": 0.6523, "rewards/accuracies": 0.625, "rewards/chosen": -0.23225784301757812, "rewards/margins": 0.4560941457748413, "rewards/rejected": -0.6883519887924194, "step": 1278 }, { "epoch": 1.279, "grad_norm": 1.4982578754425049, "learning_rate": 9.0125e-07, "logits/chosen": 0.7191729545593262, "logits/rejected": 0.5611151456832886, "logps/chosen": -184.10641479492188, "logps/rejected": -263.1464538574219, "loss": 0.627, "rewards/accuracies": 0.625, "rewards/chosen": -0.13137808442115784, "rewards/margins": 0.7105056643486023, "rewards/rejected": -0.8418837189674377, "step": 1279 }, { "epoch": 1.28, "grad_norm": 1.701052188873291, "learning_rate": 9e-07, "logits/chosen": 0.8780432939529419, "logits/rejected": 0.820953369140625, "logps/chosen": -208.4167022705078, "logps/rejected": -173.42869567871094, "loss": 0.6327, "rewards/accuracies": 0.625, "rewards/chosen": -0.1317327618598938, "rewards/margins": 0.4579417407512665, "rewards/rejected": -0.5896744728088379, "step": 1280 }, { "epoch": 1.2810000000000001, "grad_norm": 2.27254319190979, "learning_rate": 8.9875e-07, "logits/chosen": 0.7231910228729248, "logits/rejected": 0.33543941378593445, "logps/chosen": -213.48785400390625, "logps/rejected": -184.5814208984375, "loss": 0.8228, "rewards/accuracies": 0.5, "rewards/chosen": -0.5143212676048279, "rewards/margins": 0.08709108829498291, "rewards/rejected": -0.6014123558998108, "step": 1281 }, { "epoch": 1.282, "grad_norm": 1.5704562664031982, "learning_rate": 8.974999999999999e-07, "logits/chosen": 0.9815012216567993, "logits/rejected": 0.35468071699142456, "logps/chosen": -210.79959106445312, "logps/rejected": -165.90374755859375, "loss": 0.4733, "rewards/accuracies": 0.75, "rewards/chosen": -0.04026278853416443, "rewards/margins": 0.7919244766235352, "rewards/rejected": -0.832187294960022, "step": 1282 }, { "epoch": 1.283, "grad_norm": 1.4532066583633423, "learning_rate": 8.9625e-07, "logits/chosen": 1.182507872581482, "logits/rejected": 0.7181922197341919, "logps/chosen": -301.03302001953125, "logps/rejected": -175.91326904296875, "loss": 0.47, "rewards/accuracies": 0.875, "rewards/chosen": 0.1362217217683792, "rewards/margins": 1.0627992153167725, "rewards/rejected": -0.9265775680541992, "step": 1283 }, { "epoch": 1.284, "grad_norm": 1.0176613330841064, "learning_rate": 8.95e-07, "logits/chosen": 1.0431139469146729, "logits/rejected": 0.7857840061187744, "logps/chosen": -238.40853881835938, "logps/rejected": -208.607666015625, "loss": 0.2942, "rewards/accuracies": 0.875, "rewards/chosen": 0.20975057780742645, "rewards/margins": 1.2091467380523682, "rewards/rejected": -0.9993961453437805, "step": 1284 }, { "epoch": 1.285, "grad_norm": 0.8527458310127258, "learning_rate": 8.9375e-07, "logits/chosen": 1.253794550895691, "logits/rejected": 0.214610755443573, "logps/chosen": -340.14166259765625, "logps/rejected": -146.6614990234375, "loss": 0.3386, "rewards/accuracies": 1.0, "rewards/chosen": 0.20224428176879883, "rewards/margins": 1.098584532737732, "rewards/rejected": -0.8963402509689331, "step": 1285 }, { "epoch": 1.286, "grad_norm": 1.2199009656906128, "learning_rate": 8.924999999999999e-07, "logits/chosen": 0.4480971097946167, "logits/rejected": 0.3296063244342804, "logps/chosen": -193.48300170898438, "logps/rejected": -198.3564453125, "loss": 0.543, "rewards/accuracies": 0.75, "rewards/chosen": -0.08782054483890533, "rewards/margins": 0.8807709813117981, "rewards/rejected": -0.968591570854187, "step": 1286 }, { "epoch": 1.287, "grad_norm": 1.4549808502197266, "learning_rate": 8.912499999999999e-07, "logits/chosen": 1.1251550912857056, "logits/rejected": 0.8955844640731812, "logps/chosen": -256.9498291015625, "logps/rejected": -160.00640869140625, "loss": 0.6558, "rewards/accuracies": 0.625, "rewards/chosen": -0.10055667161941528, "rewards/margins": 0.36721181869506836, "rewards/rejected": -0.46776849031448364, "step": 1287 }, { "epoch": 1.288, "grad_norm": 1.5021144151687622, "learning_rate": 8.9e-07, "logits/chosen": 1.0555065870285034, "logits/rejected": 0.6205139756202698, "logps/chosen": -243.76187133789062, "logps/rejected": -234.77105712890625, "loss": 0.5057, "rewards/accuracies": 0.625, "rewards/chosen": -0.1225338876247406, "rewards/margins": 0.8222696781158447, "rewards/rejected": -0.9448036551475525, "step": 1288 }, { "epoch": 1.289, "grad_norm": 1.8721277713775635, "learning_rate": 8.8875e-07, "logits/chosen": 0.8523857593536377, "logits/rejected": 1.0287914276123047, "logps/chosen": -243.48741149902344, "logps/rejected": -225.49375915527344, "loss": 0.6888, "rewards/accuracies": 0.625, "rewards/chosen": -0.43573904037475586, "rewards/margins": 0.2395942509174347, "rewards/rejected": -0.6753333806991577, "step": 1289 }, { "epoch": 1.29, "grad_norm": 1.066711664199829, "learning_rate": 8.874999999999999e-07, "logits/chosen": 0.4821949899196625, "logits/rejected": 0.6085804104804993, "logps/chosen": -153.74363708496094, "logps/rejected": -213.4766387939453, "loss": 0.4236, "rewards/accuracies": 0.75, "rewards/chosen": 0.0624026358127594, "rewards/margins": 1.0476317405700684, "rewards/rejected": -0.9852291345596313, "step": 1290 }, { "epoch": 1.291, "grad_norm": 1.6478922367095947, "learning_rate": 8.8625e-07, "logits/chosen": 0.355948805809021, "logits/rejected": 0.7575950622558594, "logps/chosen": -135.8829345703125, "logps/rejected": -205.49668884277344, "loss": 0.3814, "rewards/accuracies": 0.875, "rewards/chosen": 0.20407655835151672, "rewards/margins": 0.9469687342643738, "rewards/rejected": -0.7428921461105347, "step": 1291 }, { "epoch": 1.292, "grad_norm": 2.1345536708831787, "learning_rate": 8.85e-07, "logits/chosen": 0.9619739055633545, "logits/rejected": 0.380983829498291, "logps/chosen": -352.5763244628906, "logps/rejected": -219.29833984375, "loss": 0.6001, "rewards/accuracies": 0.75, "rewards/chosen": -0.15587511658668518, "rewards/margins": 0.6695168614387512, "rewards/rejected": -0.8253920078277588, "step": 1292 }, { "epoch": 1.293, "grad_norm": 1.3513872623443604, "learning_rate": 8.8375e-07, "logits/chosen": 0.6772041320800781, "logits/rejected": 0.70635586977005, "logps/chosen": -240.5176239013672, "logps/rejected": -238.47882080078125, "loss": 0.5334, "rewards/accuracies": 0.75, "rewards/chosen": -0.030015558004379272, "rewards/margins": 0.7283435463905334, "rewards/rejected": -0.7583591341972351, "step": 1293 }, { "epoch": 1.294, "grad_norm": 1.7185916900634766, "learning_rate": 8.824999999999999e-07, "logits/chosen": 1.0982818603515625, "logits/rejected": 1.2669733762741089, "logps/chosen": -234.1272430419922, "logps/rejected": -175.87188720703125, "loss": 0.4528, "rewards/accuracies": 0.875, "rewards/chosen": -0.08911390602588654, "rewards/margins": 0.7836543321609497, "rewards/rejected": -0.8727682828903198, "step": 1294 }, { "epoch": 1.295, "grad_norm": 1.1919124126434326, "learning_rate": 8.812499999999999e-07, "logits/chosen": 0.3599320650100708, "logits/rejected": 0.9283674955368042, "logps/chosen": -118.24847412109375, "logps/rejected": -173.2344970703125, "loss": 0.5233, "rewards/accuracies": 0.625, "rewards/chosen": -0.03866367042064667, "rewards/margins": 0.7402774095535278, "rewards/rejected": -0.7789410948753357, "step": 1295 }, { "epoch": 1.296, "grad_norm": 2.221818208694458, "learning_rate": 8.799999999999999e-07, "logits/chosen": 0.8836160898208618, "logits/rejected": 0.9896606802940369, "logps/chosen": -220.54641723632812, "logps/rejected": -185.36181640625, "loss": 0.6308, "rewards/accuracies": 0.625, "rewards/chosen": -0.33872079849243164, "rewards/margins": 0.6642179489135742, "rewards/rejected": -1.0029387474060059, "step": 1296 }, { "epoch": 1.297, "grad_norm": 1.2465291023254395, "learning_rate": 8.7875e-07, "logits/chosen": 0.6670724153518677, "logits/rejected": 0.8511427640914917, "logps/chosen": -134.50656127929688, "logps/rejected": -179.61705017089844, "loss": 0.5585, "rewards/accuracies": 0.75, "rewards/chosen": 0.10705994814634323, "rewards/margins": 0.7410832047462463, "rewards/rejected": -0.6340231895446777, "step": 1297 }, { "epoch": 1.298, "grad_norm": 1.6158528327941895, "learning_rate": 8.774999999999999e-07, "logits/chosen": 0.5948008298873901, "logits/rejected": 0.2972056269645691, "logps/chosen": -234.14703369140625, "logps/rejected": -203.94076538085938, "loss": 0.713, "rewards/accuracies": 0.625, "rewards/chosen": -0.26919910311698914, "rewards/margins": 0.29465144872665405, "rewards/rejected": -0.5638506412506104, "step": 1298 }, { "epoch": 1.299, "grad_norm": 1.2533174753189087, "learning_rate": 8.7625e-07, "logits/chosen": 0.9845945835113525, "logits/rejected": 0.9430001974105835, "logps/chosen": -167.29092407226562, "logps/rejected": -187.1877899169922, "loss": 0.565, "rewards/accuracies": 0.625, "rewards/chosen": 0.027132989838719368, "rewards/margins": 0.5377622246742249, "rewards/rejected": -0.5106292963027954, "step": 1299 }, { "epoch": 1.3, "grad_norm": 1.938321828842163, "learning_rate": 8.75e-07, "logits/chosen": 1.3359713554382324, "logits/rejected": 0.562684953212738, "logps/chosen": -370.62255859375, "logps/rejected": -163.74240112304688, "loss": 0.6511, "rewards/accuracies": 0.625, "rewards/chosen": -0.34352433681488037, "rewards/margins": 0.3636408746242523, "rewards/rejected": -0.7071652412414551, "step": 1300 }, { "epoch": 1.301, "grad_norm": 1.3988417387008667, "learning_rate": 8.7375e-07, "logits/chosen": 0.8185974955558777, "logits/rejected": 1.0408029556274414, "logps/chosen": -272.2936706542969, "logps/rejected": -191.9078369140625, "loss": 0.5424, "rewards/accuracies": 0.625, "rewards/chosen": -0.11696949601173401, "rewards/margins": 0.6712332963943481, "rewards/rejected": -0.7882027626037598, "step": 1301 }, { "epoch": 1.302, "grad_norm": 1.5886136293411255, "learning_rate": 8.725e-07, "logits/chosen": 1.1187081336975098, "logits/rejected": 0.4850267767906189, "logps/chosen": -290.1072998046875, "logps/rejected": -204.52845764160156, "loss": 0.6138, "rewards/accuracies": 0.625, "rewards/chosen": 0.042394548654556274, "rewards/margins": 0.7397377490997314, "rewards/rejected": -0.6973431706428528, "step": 1302 }, { "epoch": 1.303, "grad_norm": 1.9220103025436401, "learning_rate": 8.712499999999999e-07, "logits/chosen": 1.045251488685608, "logits/rejected": 0.8591528534889221, "logps/chosen": -316.4497985839844, "logps/rejected": -162.51011657714844, "loss": 0.6301, "rewards/accuracies": 0.625, "rewards/chosen": -0.2761216163635254, "rewards/margins": 0.23484745621681213, "rewards/rejected": -0.5109691023826599, "step": 1303 }, { "epoch": 1.304, "grad_norm": 2.070148229598999, "learning_rate": 8.699999999999999e-07, "logits/chosen": 0.21958975493907928, "logits/rejected": 0.6183717250823975, "logps/chosen": -295.78912353515625, "logps/rejected": -212.49549865722656, "loss": 0.9941, "rewards/accuracies": 0.25, "rewards/chosen": -0.7895805835723877, "rewards/margins": -0.2863632142543793, "rewards/rejected": -0.503217339515686, "step": 1304 }, { "epoch": 1.305, "grad_norm": 0.8665822148323059, "learning_rate": 8.687499999999999e-07, "logits/chosen": 1.050012469291687, "logits/rejected": 1.0860037803649902, "logps/chosen": -296.41961669921875, "logps/rejected": -201.27407836914062, "loss": 0.2706, "rewards/accuracies": 1.0, "rewards/chosen": 0.21365481615066528, "rewards/margins": 1.3414061069488525, "rewards/rejected": -1.127751350402832, "step": 1305 }, { "epoch": 1.306, "grad_norm": 1.2943687438964844, "learning_rate": 8.675000000000001e-07, "logits/chosen": 1.0448757410049438, "logits/rejected": 0.9810686707496643, "logps/chosen": -252.63284301757812, "logps/rejected": -235.59173583984375, "loss": 0.5778, "rewards/accuracies": 0.625, "rewards/chosen": -0.08780726045370102, "rewards/margins": 0.6546199917793274, "rewards/rejected": -0.7424272894859314, "step": 1306 }, { "epoch": 1.307, "grad_norm": 0.9656104445457458, "learning_rate": 8.6625e-07, "logits/chosen": 0.536704421043396, "logits/rejected": 0.9849444627761841, "logps/chosen": -269.37335205078125, "logps/rejected": -187.354736328125, "loss": 0.3384, "rewards/accuracies": 0.875, "rewards/chosen": 0.08918021619319916, "rewards/margins": 1.1860804557800293, "rewards/rejected": -1.096900224685669, "step": 1307 }, { "epoch": 1.308, "grad_norm": 1.2844964265823364, "learning_rate": 8.65e-07, "logits/chosen": 0.7159487009048462, "logits/rejected": 0.19436895847320557, "logps/chosen": -240.14601135253906, "logps/rejected": -173.42181396484375, "loss": 0.459, "rewards/accuracies": 0.75, "rewards/chosen": 0.19450236856937408, "rewards/margins": 0.7807922959327698, "rewards/rejected": -0.5862898826599121, "step": 1308 }, { "epoch": 1.309, "grad_norm": 1.943441390991211, "learning_rate": 8.6375e-07, "logits/chosen": 0.604119598865509, "logits/rejected": 0.987236738204956, "logps/chosen": -169.8560791015625, "logps/rejected": -413.69744873046875, "loss": 0.6863, "rewards/accuracies": 0.625, "rewards/chosen": -0.18736255168914795, "rewards/margins": 0.4061235189437866, "rewards/rejected": -0.5934860706329346, "step": 1309 }, { "epoch": 1.31, "grad_norm": 2.778831720352173, "learning_rate": 8.625e-07, "logits/chosen": 0.7729562520980835, "logits/rejected": 0.5035397410392761, "logps/chosen": -211.5457763671875, "logps/rejected": -174.49874877929688, "loss": 1.0461, "rewards/accuracies": 0.375, "rewards/chosen": -0.550618588924408, "rewards/margins": -0.2634700536727905, "rewards/rejected": -0.28714847564697266, "step": 1310 }, { "epoch": 1.311, "grad_norm": 1.5537607669830322, "learning_rate": 8.612499999999999e-07, "logits/chosen": 0.6288065314292908, "logits/rejected": 0.6490699052810669, "logps/chosen": -203.39669799804688, "logps/rejected": -192.27615356445312, "loss": 0.5759, "rewards/accuracies": 0.5, "rewards/chosen": 0.0003503747284412384, "rewards/margins": 0.5687527656555176, "rewards/rejected": -0.5684024095535278, "step": 1311 }, { "epoch": 1.312, "grad_norm": 0.9765475988388062, "learning_rate": 8.599999999999999e-07, "logits/chosen": 0.8520948886871338, "logits/rejected": 0.775364875793457, "logps/chosen": -175.28680419921875, "logps/rejected": -218.49783325195312, "loss": 0.3915, "rewards/accuracies": 0.875, "rewards/chosen": 0.08216295391321182, "rewards/margins": 0.8915662169456482, "rewards/rejected": -0.8094033002853394, "step": 1312 }, { "epoch": 1.313, "grad_norm": 2.055342435836792, "learning_rate": 8.587499999999999e-07, "logits/chosen": 1.0758342742919922, "logits/rejected": 0.6296137571334839, "logps/chosen": -315.6943054199219, "logps/rejected": -183.39697265625, "loss": 0.563, "rewards/accuracies": 0.75, "rewards/chosen": -0.05296248197555542, "rewards/margins": 0.6413634419441223, "rewards/rejected": -0.6943259239196777, "step": 1313 }, { "epoch": 1.314, "grad_norm": 1.6456315517425537, "learning_rate": 8.575e-07, "logits/chosen": 0.580416202545166, "logits/rejected": 0.7460057735443115, "logps/chosen": -209.0205078125, "logps/rejected": -196.45614624023438, "loss": 0.6679, "rewards/accuracies": 0.75, "rewards/chosen": -0.3564635217189789, "rewards/margins": 0.2949848175048828, "rewards/rejected": -0.6514483094215393, "step": 1314 }, { "epoch": 1.315, "grad_norm": 1.1447842121124268, "learning_rate": 8.5625e-07, "logits/chosen": 1.0893465280532837, "logits/rejected": 0.6952319145202637, "logps/chosen": -198.38494873046875, "logps/rejected": -166.0326690673828, "loss": 0.4791, "rewards/accuracies": 0.75, "rewards/chosen": 0.08157192170619965, "rewards/margins": 0.8429378271102905, "rewards/rejected": -0.7613658905029297, "step": 1315 }, { "epoch": 1.316, "grad_norm": 1.0874677896499634, "learning_rate": 8.55e-07, "logits/chosen": 0.903035044670105, "logits/rejected": 0.28250616788864136, "logps/chosen": -195.8601837158203, "logps/rejected": -195.3267364501953, "loss": 0.4875, "rewards/accuracies": 0.875, "rewards/chosen": -0.02293882705271244, "rewards/margins": 0.7549116611480713, "rewards/rejected": -0.7778504490852356, "step": 1316 }, { "epoch": 1.317, "grad_norm": 1.1997225284576416, "learning_rate": 8.5375e-07, "logits/chosen": 1.0893490314483643, "logits/rejected": 0.9711129665374756, "logps/chosen": -245.02366638183594, "logps/rejected": -187.39608764648438, "loss": 0.4023, "rewards/accuracies": 0.75, "rewards/chosen": 0.02841825783252716, "rewards/margins": 1.0093162059783936, "rewards/rejected": -0.9808980226516724, "step": 1317 }, { "epoch": 1.318, "grad_norm": 0.9020401239395142, "learning_rate": 8.525e-07, "logits/chosen": 0.7157158851623535, "logits/rejected": 0.06295809149742126, "logps/chosen": -226.0538330078125, "logps/rejected": -181.40670776367188, "loss": 0.3292, "rewards/accuracies": 1.0, "rewards/chosen": 0.23029851913452148, "rewards/margins": 1.0040916204452515, "rewards/rejected": -0.7737930417060852, "step": 1318 }, { "epoch": 1.319, "grad_norm": 1.711280107498169, "learning_rate": 8.512499999999999e-07, "logits/chosen": 0.9822552800178528, "logits/rejected": 1.1653162240982056, "logps/chosen": -192.66534423828125, "logps/rejected": -193.90542602539062, "loss": 0.6048, "rewards/accuracies": 0.625, "rewards/chosen": -0.09805136173963547, "rewards/margins": 0.6199140548706055, "rewards/rejected": -0.7179654836654663, "step": 1319 }, { "epoch": 1.32, "grad_norm": 1.0666472911834717, "learning_rate": 8.499999999999999e-07, "logits/chosen": 0.6422545313835144, "logits/rejected": 0.5056374073028564, "logps/chosen": -179.15469360351562, "logps/rejected": -191.08755493164062, "loss": 0.4212, "rewards/accuracies": 0.75, "rewards/chosen": -0.04775085300207138, "rewards/margins": 0.9401448965072632, "rewards/rejected": -0.9878957271575928, "step": 1320 }, { "epoch": 1.321, "grad_norm": 1.348238468170166, "learning_rate": 8.487499999999999e-07, "logits/chosen": 0.7388296127319336, "logits/rejected": 0.966191291809082, "logps/chosen": -217.1342010498047, "logps/rejected": -195.63461303710938, "loss": 0.4878, "rewards/accuracies": 0.625, "rewards/chosen": -0.21022582054138184, "rewards/margins": 0.6614423394203186, "rewards/rejected": -0.8716681003570557, "step": 1321 }, { "epoch": 1.322, "grad_norm": 2.59639310836792, "learning_rate": 8.475e-07, "logits/chosen": 0.7145159244537354, "logits/rejected": 0.7734929323196411, "logps/chosen": -156.4267578125, "logps/rejected": -175.27664184570312, "loss": 0.9438, "rewards/accuracies": 0.375, "rewards/chosen": -0.3126049041748047, "rewards/margins": 0.027595236897468567, "rewards/rejected": -0.34020012617111206, "step": 1322 }, { "epoch": 1.323, "grad_norm": 1.2384774684906006, "learning_rate": 8.462499999999999e-07, "logits/chosen": 0.9099799394607544, "logits/rejected": 0.709993839263916, "logps/chosen": -193.52365112304688, "logps/rejected": -198.1246795654297, "loss": 0.4191, "rewards/accuracies": 0.75, "rewards/chosen": -0.08716589957475662, "rewards/margins": 0.9527381658554077, "rewards/rejected": -1.039903998374939, "step": 1323 }, { "epoch": 1.324, "grad_norm": 1.3101136684417725, "learning_rate": 8.45e-07, "logits/chosen": 0.4267379641532898, "logits/rejected": 0.4449787139892578, "logps/chosen": -247.74441528320312, "logps/rejected": -172.50155639648438, "loss": 0.3705, "rewards/accuracies": 0.875, "rewards/chosen": 0.23110811412334442, "rewards/margins": 1.1878489255905151, "rewards/rejected": -0.9567407965660095, "step": 1324 }, { "epoch": 1.325, "grad_norm": 1.7178881168365479, "learning_rate": 8.4375e-07, "logits/chosen": 0.4914216697216034, "logits/rejected": 0.548219621181488, "logps/chosen": -133.91668701171875, "logps/rejected": -201.10345458984375, "loss": 0.5967, "rewards/accuracies": 0.75, "rewards/chosen": 0.08008230477571487, "rewards/margins": 0.8089432716369629, "rewards/rejected": -0.7288609743118286, "step": 1325 }, { "epoch": 1.326, "grad_norm": 1.6618740558624268, "learning_rate": 8.425e-07, "logits/chosen": 0.44081825017929077, "logits/rejected": 0.8067243695259094, "logps/chosen": -122.83506774902344, "logps/rejected": -269.94866943359375, "loss": 0.5814, "rewards/accuracies": 0.625, "rewards/chosen": -0.04170350730419159, "rewards/margins": 0.6261223554611206, "rewards/rejected": -0.6678259372711182, "step": 1326 }, { "epoch": 1.327, "grad_norm": 1.5112054347991943, "learning_rate": 8.4125e-07, "logits/chosen": 0.7763016819953918, "logits/rejected": 0.9893720149993896, "logps/chosen": -210.26295471191406, "logps/rejected": -244.09674072265625, "loss": 0.4897, "rewards/accuracies": 0.75, "rewards/chosen": -0.08962956070899963, "rewards/margins": 0.5981401801109314, "rewards/rejected": -0.6877697706222534, "step": 1327 }, { "epoch": 1.328, "grad_norm": 1.5356272459030151, "learning_rate": 8.399999999999999e-07, "logits/chosen": 1.3698948621749878, "logits/rejected": 0.6476418972015381, "logps/chosen": -303.0278015136719, "logps/rejected": -199.29824829101562, "loss": 0.6469, "rewards/accuracies": 0.625, "rewards/chosen": -0.09862727671861649, "rewards/margins": 0.36775919795036316, "rewards/rejected": -0.46638646721839905, "step": 1328 }, { "epoch": 1.329, "grad_norm": 1.0104210376739502, "learning_rate": 8.387499999999999e-07, "logits/chosen": 0.9309543967247009, "logits/rejected": 1.237991213798523, "logps/chosen": -143.35784912109375, "logps/rejected": -228.14581298828125, "loss": 0.3848, "rewards/accuracies": 0.75, "rewards/chosen": 0.1209208071231842, "rewards/margins": 1.0759565830230713, "rewards/rejected": -0.9550358057022095, "step": 1329 }, { "epoch": 1.33, "grad_norm": 1.634462833404541, "learning_rate": 8.375e-07, "logits/chosen": 0.6919509172439575, "logits/rejected": 0.3884083926677704, "logps/chosen": -143.1808319091797, "logps/rejected": -209.6642608642578, "loss": 0.3685, "rewards/accuracies": 0.875, "rewards/chosen": 0.017844878137111664, "rewards/margins": 1.0813432931900024, "rewards/rejected": -1.0634984970092773, "step": 1330 }, { "epoch": 1.331, "grad_norm": 1.8501453399658203, "learning_rate": 8.3625e-07, "logits/chosen": 0.8204688429832458, "logits/rejected": 0.8554626703262329, "logps/chosen": -154.56976318359375, "logps/rejected": -207.59942626953125, "loss": 0.4772, "rewards/accuracies": 0.875, "rewards/chosen": -0.1330859363079071, "rewards/margins": 0.8923271894454956, "rewards/rejected": -1.0254130363464355, "step": 1331 }, { "epoch": 1.332, "grad_norm": 1.4343740940093994, "learning_rate": 8.349999999999999e-07, "logits/chosen": 0.7034198045730591, "logits/rejected": 0.8070913553237915, "logps/chosen": -188.5478057861328, "logps/rejected": -226.0336456298828, "loss": 0.697, "rewards/accuracies": 0.375, "rewards/chosen": -0.17452400922775269, "rewards/margins": 0.12220192700624466, "rewards/rejected": -0.29672595858573914, "step": 1332 }, { "epoch": 1.333, "grad_norm": 1.3038638830184937, "learning_rate": 8.3375e-07, "logits/chosen": 0.6013425588607788, "logits/rejected": 1.1287832260131836, "logps/chosen": -174.6452178955078, "logps/rejected": -219.93605041503906, "loss": 0.5685, "rewards/accuracies": 0.75, "rewards/chosen": -0.25046366453170776, "rewards/margins": 0.532450795173645, "rewards/rejected": -0.7829144597053528, "step": 1333 }, { "epoch": 1.334, "grad_norm": 1.6033098697662354, "learning_rate": 8.325e-07, "logits/chosen": 0.5979781150817871, "logits/rejected": 0.8274508118629456, "logps/chosen": -229.64886474609375, "logps/rejected": -267.85687255859375, "loss": 0.6531, "rewards/accuracies": 0.5, "rewards/chosen": -0.17559584975242615, "rewards/margins": 0.43503984808921814, "rewards/rejected": -0.6106356382369995, "step": 1334 }, { "epoch": 1.335, "grad_norm": 1.8346500396728516, "learning_rate": 8.3125e-07, "logits/chosen": 0.952346682548523, "logits/rejected": 0.7723774909973145, "logps/chosen": -242.2229461669922, "logps/rejected": -238.00308227539062, "loss": 0.8524, "rewards/accuracies": 0.25, "rewards/chosen": -0.4422215521335602, "rewards/margins": 0.0351860374212265, "rewards/rejected": -0.4774075746536255, "step": 1335 }, { "epoch": 1.336, "grad_norm": 0.7662882208824158, "learning_rate": 8.299999999999999e-07, "logits/chosen": 0.9514011144638062, "logits/rejected": 1.0031702518463135, "logps/chosen": -215.25035095214844, "logps/rejected": -194.536865234375, "loss": 0.4056, "rewards/accuracies": 0.875, "rewards/chosen": 0.10214120894670486, "rewards/margins": 0.9270979762077332, "rewards/rejected": -0.8249568343162537, "step": 1336 }, { "epoch": 1.337, "grad_norm": 1.7191413640975952, "learning_rate": 8.287499999999999e-07, "logits/chosen": 0.5725105404853821, "logits/rejected": 0.8570709824562073, "logps/chosen": -121.45439147949219, "logps/rejected": -285.0115661621094, "loss": 0.6035, "rewards/accuracies": 0.625, "rewards/chosen": -0.15390777587890625, "rewards/margins": 0.46272924542427063, "rewards/rejected": -0.6166370511054993, "step": 1337 }, { "epoch": 1.338, "grad_norm": 1.4503811597824097, "learning_rate": 8.275e-07, "logits/chosen": 1.1516993045806885, "logits/rejected": 0.6720497608184814, "logps/chosen": -276.44525146484375, "logps/rejected": -197.00070190429688, "loss": 0.7628, "rewards/accuracies": 0.25, "rewards/chosen": -0.34911319613456726, "rewards/margins": 0.02185129001736641, "rewards/rejected": -0.37096449732780457, "step": 1338 }, { "epoch": 1.339, "grad_norm": 1.7026859521865845, "learning_rate": 8.2625e-07, "logits/chosen": 1.282652497291565, "logits/rejected": 0.8077288866043091, "logps/chosen": -278.9132995605469, "logps/rejected": -182.6810302734375, "loss": 0.6838, "rewards/accuracies": 0.5, "rewards/chosen": -0.254397988319397, "rewards/margins": 0.2920173406600952, "rewards/rejected": -0.5464153289794922, "step": 1339 }, { "epoch": 1.34, "grad_norm": 1.3980417251586914, "learning_rate": 8.249999999999999e-07, "logits/chosen": 1.395113468170166, "logits/rejected": 0.7647114992141724, "logps/chosen": -301.899169921875, "logps/rejected": -185.59542846679688, "loss": 0.5549, "rewards/accuracies": 0.625, "rewards/chosen": -0.061626240611076355, "rewards/margins": 0.3776232898235321, "rewards/rejected": -0.43924951553344727, "step": 1340 }, { "epoch": 1.341, "grad_norm": 1.5233649015426636, "learning_rate": 8.2375e-07, "logits/chosen": 0.9750848412513733, "logits/rejected": 0.5026398301124573, "logps/chosen": -252.5115203857422, "logps/rejected": -169.9789276123047, "loss": 0.617, "rewards/accuracies": 0.625, "rewards/chosen": -0.26436901092529297, "rewards/margins": 0.31039002537727356, "rewards/rejected": -0.5747590065002441, "step": 1341 }, { "epoch": 1.342, "grad_norm": 1.0955712795257568, "learning_rate": 8.225e-07, "logits/chosen": 0.4838569164276123, "logits/rejected": 0.2774118185043335, "logps/chosen": -210.02099609375, "logps/rejected": -186.7760467529297, "loss": 0.5112, "rewards/accuracies": 0.75, "rewards/chosen": 0.11221819370985031, "rewards/margins": 0.748864471912384, "rewards/rejected": -0.6366463303565979, "step": 1342 }, { "epoch": 1.343, "grad_norm": 1.319805383682251, "learning_rate": 8.2125e-07, "logits/chosen": 1.05986487865448, "logits/rejected": 0.6953441500663757, "logps/chosen": -289.7618103027344, "logps/rejected": -148.94247436523438, "loss": 0.5481, "rewards/accuracies": 0.625, "rewards/chosen": 0.24558162689208984, "rewards/margins": 0.6561340093612671, "rewards/rejected": -0.41055241227149963, "step": 1343 }, { "epoch": 1.3439999999999999, "grad_norm": 1.5792450904846191, "learning_rate": 8.199999999999999e-07, "logits/chosen": 0.6659982204437256, "logits/rejected": 0.033706024289131165, "logps/chosen": -131.14578247070312, "logps/rejected": -198.7508544921875, "loss": 0.7492, "rewards/accuracies": 0.25, "rewards/chosen": -0.21056902408599854, "rewards/margins": 0.07672213762998581, "rewards/rejected": -0.28729116916656494, "step": 1344 }, { "epoch": 1.345, "grad_norm": 1.497306227684021, "learning_rate": 8.187499999999999e-07, "logits/chosen": 0.7602400779724121, "logits/rejected": 0.9827349185943604, "logps/chosen": -230.16519165039062, "logps/rejected": -227.28158569335938, "loss": 0.5269, "rewards/accuracies": 0.875, "rewards/chosen": -0.14670944213867188, "rewards/margins": 0.5203040838241577, "rewards/rejected": -0.6670135259628296, "step": 1345 }, { "epoch": 1.346, "grad_norm": 1.8454816341400146, "learning_rate": 8.175e-07, "logits/chosen": 0.3031233847141266, "logits/rejected": 0.6674733757972717, "logps/chosen": -160.187744140625, "logps/rejected": -287.5416259765625, "loss": 0.7094, "rewards/accuracies": 0.625, "rewards/chosen": -0.3636096119880676, "rewards/margins": 0.4229111671447754, "rewards/rejected": -0.7865207195281982, "step": 1346 }, { "epoch": 1.347, "grad_norm": 1.212100625038147, "learning_rate": 8.1625e-07, "logits/chosen": 1.001625418663025, "logits/rejected": 1.1101171970367432, "logps/chosen": -197.09140014648438, "logps/rejected": -221.58473205566406, "loss": 0.4884, "rewards/accuracies": 0.75, "rewards/chosen": -0.09293918311595917, "rewards/margins": 0.6397424936294556, "rewards/rejected": -0.7326816916465759, "step": 1347 }, { "epoch": 1.3479999999999999, "grad_norm": 1.7766462564468384, "learning_rate": 8.149999999999999e-07, "logits/chosen": -0.01729283481836319, "logits/rejected": 0.9428423047065735, "logps/chosen": -138.75125122070312, "logps/rejected": -234.66465759277344, "loss": 0.7693, "rewards/accuracies": 0.5, "rewards/chosen": -0.29017123579978943, "rewards/margins": 0.0944371223449707, "rewards/rejected": -0.3846083879470825, "step": 1348 }, { "epoch": 1.349, "grad_norm": 1.2569695711135864, "learning_rate": 8.137499999999999e-07, "logits/chosen": 0.392655611038208, "logits/rejected": 0.7259657382965088, "logps/chosen": -200.249755859375, "logps/rejected": -199.510498046875, "loss": 0.6845, "rewards/accuracies": 0.75, "rewards/chosen": -0.2222004532814026, "rewards/margins": 0.38350361585617065, "rewards/rejected": -0.6057040095329285, "step": 1349 }, { "epoch": 1.35, "grad_norm": 1.4468775987625122, "learning_rate": 8.125e-07, "logits/chosen": 0.5608208179473877, "logits/rejected": 0.43187910318374634, "logps/chosen": -341.71356201171875, "logps/rejected": -244.74172973632812, "loss": 0.5341, "rewards/accuracies": 0.625, "rewards/chosen": -0.23829135298728943, "rewards/margins": 0.7010910511016846, "rewards/rejected": -0.9393824338912964, "step": 1350 }, { "epoch": 1.351, "grad_norm": 1.8897285461425781, "learning_rate": 8.1125e-07, "logits/chosen": 0.5415535569190979, "logits/rejected": 0.7641249895095825, "logps/chosen": -162.58631896972656, "logps/rejected": -230.44619750976562, "loss": 0.7463, "rewards/accuracies": 0.5, "rewards/chosen": -0.3507262170314789, "rewards/margins": 0.11199356615543365, "rewards/rejected": -0.4627198576927185, "step": 1351 }, { "epoch": 1.3519999999999999, "grad_norm": 1.7278635501861572, "learning_rate": 8.1e-07, "logits/chosen": 0.5899893045425415, "logits/rejected": 0.20062987506389618, "logps/chosen": -196.19041442871094, "logps/rejected": -133.85791015625, "loss": 0.7878, "rewards/accuracies": 0.5, "rewards/chosen": -0.30470794439315796, "rewards/margins": 0.04716091603040695, "rewards/rejected": -0.35186880826950073, "step": 1352 }, { "epoch": 1.353, "grad_norm": 1.5635266304016113, "learning_rate": 8.087499999999999e-07, "logits/chosen": 0.8996132612228394, "logits/rejected": 0.5964779257774353, "logps/chosen": -308.7386779785156, "logps/rejected": -219.159912109375, "loss": 0.4494, "rewards/accuracies": 0.75, "rewards/chosen": -0.043245598673820496, "rewards/margins": 0.7619830369949341, "rewards/rejected": -0.805228590965271, "step": 1353 }, { "epoch": 1.354, "grad_norm": 2.0038259029388428, "learning_rate": 8.075e-07, "logits/chosen": 0.6985829472541809, "logits/rejected": 0.39061498641967773, "logps/chosen": -296.48516845703125, "logps/rejected": -187.60711669921875, "loss": 0.5446, "rewards/accuracies": 0.625, "rewards/chosen": -0.049213506281375885, "rewards/margins": 0.6509785056114197, "rewards/rejected": -0.7001920938491821, "step": 1354 }, { "epoch": 1.355, "grad_norm": 1.2398160696029663, "learning_rate": 8.0625e-07, "logits/chosen": 0.7048007845878601, "logits/rejected": 0.5289508700370789, "logps/chosen": -229.1233367919922, "logps/rejected": -234.3249053955078, "loss": 0.4977, "rewards/accuracies": 0.75, "rewards/chosen": 0.10200224071741104, "rewards/margins": 0.9398449659347534, "rewards/rejected": -0.8378427624702454, "step": 1355 }, { "epoch": 1.3559999999999999, "grad_norm": 1.3967435359954834, "learning_rate": 8.05e-07, "logits/chosen": 1.1256859302520752, "logits/rejected": 0.786101222038269, "logps/chosen": -289.6701965332031, "logps/rejected": -221.65567016601562, "loss": 0.5295, "rewards/accuracies": 0.75, "rewards/chosen": -0.21423578262329102, "rewards/margins": 0.47651398181915283, "rewards/rejected": -0.6907497644424438, "step": 1356 }, { "epoch": 1.357, "grad_norm": 1.534828543663025, "learning_rate": 8.037499999999999e-07, "logits/chosen": 0.37779927253723145, "logits/rejected": 0.7633060216903687, "logps/chosen": -174.6029510498047, "logps/rejected": -223.23187255859375, "loss": 0.3595, "rewards/accuracies": 1.0, "rewards/chosen": 0.2139551341533661, "rewards/margins": 0.9811078906059265, "rewards/rejected": -0.767152726650238, "step": 1357 }, { "epoch": 1.358, "grad_norm": 1.400700330734253, "learning_rate": 8.024999999999999e-07, "logits/chosen": 0.6238504648208618, "logits/rejected": 0.8073710203170776, "logps/chosen": -190.92868041992188, "logps/rejected": -188.38914489746094, "loss": 0.5733, "rewards/accuracies": 0.625, "rewards/chosen": -0.32912710309028625, "rewards/margins": 0.3959890604019165, "rewards/rejected": -0.7251161932945251, "step": 1358 }, { "epoch": 1.359, "grad_norm": 1.219592809677124, "learning_rate": 8.0125e-07, "logits/chosen": 0.647642970085144, "logits/rejected": 0.6943406462669373, "logps/chosen": -147.75506591796875, "logps/rejected": -225.43972778320312, "loss": 0.5522, "rewards/accuracies": 0.75, "rewards/chosen": -0.18014365434646606, "rewards/margins": 0.48086076974868774, "rewards/rejected": -0.661004364490509, "step": 1359 }, { "epoch": 1.3599999999999999, "grad_norm": 2.182954788208008, "learning_rate": 8e-07, "logits/chosen": 1.298270344734192, "logits/rejected": 0.9516549110412598, "logps/chosen": -336.4623107910156, "logps/rejected": -189.19219970703125, "loss": 0.5549, "rewards/accuracies": 0.75, "rewards/chosen": -0.1145620197057724, "rewards/margins": 0.5067890286445618, "rewards/rejected": -0.6213510036468506, "step": 1360 }, { "epoch": 1.361, "grad_norm": 1.6476542949676514, "learning_rate": 7.9875e-07, "logits/chosen": 0.9761913418769836, "logits/rejected": 0.6161493062973022, "logps/chosen": -298.1476745605469, "logps/rejected": -192.81912231445312, "loss": 0.6075, "rewards/accuracies": 0.5, "rewards/chosen": -0.21450786292552948, "rewards/margins": 0.3007577061653137, "rewards/rejected": -0.5152655839920044, "step": 1361 }, { "epoch": 1.362, "grad_norm": 1.2080568075180054, "learning_rate": 7.975e-07, "logits/chosen": 0.8230854868888855, "logits/rejected": 0.7306537628173828, "logps/chosen": -230.95396423339844, "logps/rejected": -187.69435119628906, "loss": 0.3892, "rewards/accuracies": 0.875, "rewards/chosen": -0.09079714119434357, "rewards/margins": 0.8447355031967163, "rewards/rejected": -0.9355325102806091, "step": 1362 }, { "epoch": 1.363, "grad_norm": 0.8695333003997803, "learning_rate": 7.9625e-07, "logits/chosen": 0.6156946420669556, "logits/rejected": 1.01701021194458, "logps/chosen": -198.68223571777344, "logps/rejected": -187.77197265625, "loss": 0.4268, "rewards/accuracies": 0.75, "rewards/chosen": -0.015088982880115509, "rewards/margins": 0.763136088848114, "rewards/rejected": -0.7782250642776489, "step": 1363 }, { "epoch": 1.3639999999999999, "grad_norm": 1.316707730293274, "learning_rate": 7.95e-07, "logits/chosen": 0.5367580056190491, "logits/rejected": 0.6987018585205078, "logps/chosen": -219.66583251953125, "logps/rejected": -168.3768310546875, "loss": 0.5652, "rewards/accuracies": 0.625, "rewards/chosen": -0.1672050505876541, "rewards/margins": 0.4556790590286255, "rewards/rejected": -0.6228840947151184, "step": 1364 }, { "epoch": 1.365, "grad_norm": 1.6309884786605835, "learning_rate": 7.937499999999999e-07, "logits/chosen": 0.6241253018379211, "logits/rejected": 1.0998711585998535, "logps/chosen": -160.53665161132812, "logps/rejected": -223.9813690185547, "loss": 0.5345, "rewards/accuracies": 0.75, "rewards/chosen": -0.003143221139907837, "rewards/margins": 0.6402206420898438, "rewards/rejected": -0.6433638334274292, "step": 1365 }, { "epoch": 1.366, "grad_norm": 1.6192463636398315, "learning_rate": 7.924999999999999e-07, "logits/chosen": 0.6561671495437622, "logits/rejected": 0.508443295955658, "logps/chosen": -214.64996337890625, "logps/rejected": -181.5585479736328, "loss": 0.6425, "rewards/accuracies": 0.625, "rewards/chosen": -0.09629088640213013, "rewards/margins": 0.5160404443740845, "rewards/rejected": -0.6123313307762146, "step": 1366 }, { "epoch": 1.367, "grad_norm": 1.394002914428711, "learning_rate": 7.912499999999999e-07, "logits/chosen": 0.44554418325424194, "logits/rejected": 1.0327019691467285, "logps/chosen": -196.39895629882812, "logps/rejected": -233.3854522705078, "loss": 0.4192, "rewards/accuracies": 0.875, "rewards/chosen": 0.03572368621826172, "rewards/margins": 0.8392866253852844, "rewards/rejected": -0.8035629391670227, "step": 1367 }, { "epoch": 1.3679999999999999, "grad_norm": 1.03119695186615, "learning_rate": 7.9e-07, "logits/chosen": 0.6921532154083252, "logits/rejected": 0.5824877619743347, "logps/chosen": -177.20135498046875, "logps/rejected": -193.69749450683594, "loss": 0.4124, "rewards/accuracies": 0.75, "rewards/chosen": 0.1920413076877594, "rewards/margins": 0.8534635901451111, "rewards/rejected": -0.6614222526550293, "step": 1368 }, { "epoch": 1.369, "grad_norm": 1.0018354654312134, "learning_rate": 7.8875e-07, "logits/chosen": 1.0744796991348267, "logits/rejected": 0.6247693300247192, "logps/chosen": -386.8643798828125, "logps/rejected": -197.2201690673828, "loss": 0.2863, "rewards/accuracies": 1.0, "rewards/chosen": 0.296421617269516, "rewards/margins": 1.1765024662017822, "rewards/rejected": -0.8800806999206543, "step": 1369 }, { "epoch": 1.37, "grad_norm": 1.5442349910736084, "learning_rate": 7.875e-07, "logits/chosen": 0.7483223676681519, "logits/rejected": 1.111497163772583, "logps/chosen": -308.624267578125, "logps/rejected": -212.2720184326172, "loss": 0.6387, "rewards/accuracies": 0.625, "rewards/chosen": -0.22905075550079346, "rewards/margins": 0.33306795358657837, "rewards/rejected": -0.5621187090873718, "step": 1370 }, { "epoch": 1.371, "grad_norm": 1.2356669902801514, "learning_rate": 7.8625e-07, "logits/chosen": 0.49361005425453186, "logits/rejected": 0.23010239005088806, "logps/chosen": -215.67019653320312, "logps/rejected": -201.1309356689453, "loss": 0.607, "rewards/accuracies": 0.75, "rewards/chosen": -0.16174308955669403, "rewards/margins": 0.5663349628448486, "rewards/rejected": -0.7280780673027039, "step": 1371 }, { "epoch": 1.3719999999999999, "grad_norm": 1.7808854579925537, "learning_rate": 7.85e-07, "logits/chosen": 0.8365046977996826, "logits/rejected": 0.7616521120071411, "logps/chosen": -307.1062927246094, "logps/rejected": -226.54803466796875, "loss": 0.7182, "rewards/accuracies": 0.625, "rewards/chosen": -0.39482393860816956, "rewards/margins": 0.047784894704818726, "rewards/rejected": -0.4426088333129883, "step": 1372 }, { "epoch": 1.373, "grad_norm": 2.5497772693634033, "learning_rate": 7.837499999999999e-07, "logits/chosen": 0.45334741473197937, "logits/rejected": 0.9788945913314819, "logps/chosen": -186.12271118164062, "logps/rejected": -199.4229736328125, "loss": 0.9296, "rewards/accuracies": 0.375, "rewards/chosen": -0.6977206468582153, "rewards/margins": -0.08192476630210876, "rewards/rejected": -0.615795910358429, "step": 1373 }, { "epoch": 1.374, "grad_norm": 2.183464527130127, "learning_rate": 7.824999999999999e-07, "logits/chosen": 0.6412173509597778, "logits/rejected": 0.7051180005073547, "logps/chosen": -312.2007751464844, "logps/rejected": -190.55551147460938, "loss": 0.6825, "rewards/accuracies": 0.5, "rewards/chosen": -0.48950260877609253, "rewards/margins": 0.40083402395248413, "rewards/rejected": -0.8903366327285767, "step": 1374 }, { "epoch": 1.375, "grad_norm": 1.4333127737045288, "learning_rate": 7.812499999999999e-07, "logits/chosen": 1.3825665712356567, "logits/rejected": 0.2595500648021698, "logps/chosen": -279.52105712890625, "logps/rejected": -140.9004364013672, "loss": 0.448, "rewards/accuracies": 0.75, "rewards/chosen": -0.16545476019382477, "rewards/margins": 0.6725793480873108, "rewards/rejected": -0.8380340933799744, "step": 1375 }, { "epoch": 1.376, "grad_norm": 1.710211157798767, "learning_rate": 7.799999999999999e-07, "logits/chosen": 0.5553854703903198, "logits/rejected": 1.1284680366516113, "logps/chosen": -167.45278930664062, "logps/rejected": -270.153564453125, "loss": 0.5538, "rewards/accuracies": 0.75, "rewards/chosen": -0.11039314419031143, "rewards/margins": 0.5341784954071045, "rewards/rejected": -0.6445716023445129, "step": 1376 }, { "epoch": 1.377, "grad_norm": 1.3208072185516357, "learning_rate": 7.787500000000001e-07, "logits/chosen": 0.47394126653671265, "logits/rejected": 0.9614529013633728, "logps/chosen": -107.46669006347656, "logps/rejected": -214.45867919921875, "loss": 0.271, "rewards/accuracies": 1.0, "rewards/chosen": 0.1798509657382965, "rewards/margins": 1.436913013458252, "rewards/rejected": -1.2570621967315674, "step": 1377 }, { "epoch": 1.3780000000000001, "grad_norm": 0.9108434319496155, "learning_rate": 7.775e-07, "logits/chosen": 0.9464594721794128, "logits/rejected": 0.553077757358551, "logps/chosen": -208.8799591064453, "logps/rejected": -145.98675537109375, "loss": 0.3062, "rewards/accuracies": 1.0, "rewards/chosen": 0.27303487062454224, "rewards/margins": 1.1025669574737549, "rewards/rejected": -0.8295320272445679, "step": 1378 }, { "epoch": 1.379, "grad_norm": 1.2860084772109985, "learning_rate": 7.7625e-07, "logits/chosen": 1.0765351057052612, "logits/rejected": 0.9869227409362793, "logps/chosen": -224.7505340576172, "logps/rejected": -222.91693115234375, "loss": 0.5802, "rewards/accuracies": 0.625, "rewards/chosen": -0.1549540013074875, "rewards/margins": 0.6134374141693115, "rewards/rejected": -0.7683914303779602, "step": 1379 }, { "epoch": 1.38, "grad_norm": 1.845897912979126, "learning_rate": 7.75e-07, "logits/chosen": 0.7511076927185059, "logits/rejected": 0.5535818934440613, "logps/chosen": -223.82321166992188, "logps/rejected": -164.63839721679688, "loss": 0.6097, "rewards/accuracies": 0.625, "rewards/chosen": -0.1802753508090973, "rewards/margins": 0.45278334617614746, "rewards/rejected": -0.6330586671829224, "step": 1380 }, { "epoch": 1.381, "grad_norm": 1.270997166633606, "learning_rate": 7.7375e-07, "logits/chosen": 0.4976740777492523, "logits/rejected": 0.768723726272583, "logps/chosen": -195.19239807128906, "logps/rejected": -210.84327697753906, "loss": 0.5418, "rewards/accuracies": 0.75, "rewards/chosen": -0.2894253730773926, "rewards/margins": 0.5287814140319824, "rewards/rejected": -0.818206787109375, "step": 1381 }, { "epoch": 1.3820000000000001, "grad_norm": 1.5032719373703003, "learning_rate": 7.724999999999999e-07, "logits/chosen": 1.3168892860412598, "logits/rejected": 1.1380293369293213, "logps/chosen": -272.36138916015625, "logps/rejected": -208.5391387939453, "loss": 0.4226, "rewards/accuracies": 0.75, "rewards/chosen": 0.1553483009338379, "rewards/margins": 0.8150612115859985, "rewards/rejected": -0.6597128510475159, "step": 1382 }, { "epoch": 1.383, "grad_norm": 0.9504706263542175, "learning_rate": 7.712499999999999e-07, "logits/chosen": 0.47478464245796204, "logits/rejected": 0.11149322241544724, "logps/chosen": -189.77108764648438, "logps/rejected": -151.3388671875, "loss": 0.4164, "rewards/accuracies": 0.875, "rewards/chosen": 0.15089941024780273, "rewards/margins": 0.7624630928039551, "rewards/rejected": -0.6115636825561523, "step": 1383 }, { "epoch": 1.384, "grad_norm": 2.295851230621338, "learning_rate": 7.699999999999999e-07, "logits/chosen": 1.0393025875091553, "logits/rejected": 0.7358381152153015, "logps/chosen": -264.025634765625, "logps/rejected": -180.53724670410156, "loss": 0.7727, "rewards/accuracies": 0.375, "rewards/chosen": -0.37720704078674316, "rewards/margins": 0.0443938672542572, "rewards/rejected": -0.42160093784332275, "step": 1384 }, { "epoch": 1.385, "grad_norm": 1.8763492107391357, "learning_rate": 7.6875e-07, "logits/chosen": 0.6901434659957886, "logits/rejected": 0.8806326389312744, "logps/chosen": -199.63824462890625, "logps/rejected": -313.93328857421875, "loss": 0.6129, "rewards/accuracies": 0.75, "rewards/chosen": -0.21558204293251038, "rewards/margins": 0.5071415305137634, "rewards/rejected": -0.7227236032485962, "step": 1385 }, { "epoch": 1.3860000000000001, "grad_norm": 2.2392470836639404, "learning_rate": 7.675e-07, "logits/chosen": 0.7429041266441345, "logits/rejected": 0.5982365608215332, "logps/chosen": -219.5093536376953, "logps/rejected": -150.82186889648438, "loss": 0.6839, "rewards/accuracies": 0.625, "rewards/chosen": -0.3833544850349426, "rewards/margins": 0.42490828037261963, "rewards/rejected": -0.8082627058029175, "step": 1386 }, { "epoch": 1.387, "grad_norm": 1.5488852262496948, "learning_rate": 7.6625e-07, "logits/chosen": 0.679686427116394, "logits/rejected": 0.4807572364807129, "logps/chosen": -130.64617919921875, "logps/rejected": -209.34347534179688, "loss": 0.5303, "rewards/accuracies": 0.625, "rewards/chosen": -0.0012809373438358307, "rewards/margins": 0.6119035482406616, "rewards/rejected": -0.6131844520568848, "step": 1387 }, { "epoch": 1.388, "grad_norm": 1.221376895904541, "learning_rate": 7.65e-07, "logits/chosen": 0.8510133028030396, "logits/rejected": 1.1334041357040405, "logps/chosen": -199.92800903320312, "logps/rejected": -234.76904296875, "loss": 0.4376, "rewards/accuracies": 0.875, "rewards/chosen": -0.09413862973451614, "rewards/margins": 0.7349680066108704, "rewards/rejected": -0.8291065692901611, "step": 1388 }, { "epoch": 1.389, "grad_norm": 1.2776883840560913, "learning_rate": 7.6375e-07, "logits/chosen": 0.6906741857528687, "logits/rejected": 0.2928491234779358, "logps/chosen": -316.7529296875, "logps/rejected": -198.38885498046875, "loss": 0.5221, "rewards/accuracies": 0.75, "rewards/chosen": 0.07698598504066467, "rewards/margins": 0.7403848171234131, "rewards/rejected": -0.6633988618850708, "step": 1389 }, { "epoch": 1.3900000000000001, "grad_norm": 1.994518518447876, "learning_rate": 7.624999999999999e-07, "logits/chosen": 0.39662599563598633, "logits/rejected": 0.6227397322654724, "logps/chosen": -172.55982971191406, "logps/rejected": -275.0782470703125, "loss": 0.81, "rewards/accuracies": 0.375, "rewards/chosen": -0.4076424837112427, "rewards/margins": -0.056571587920188904, "rewards/rejected": -0.3510708510875702, "step": 1390 }, { "epoch": 1.391, "grad_norm": 1.5229498147964478, "learning_rate": 7.612499999999999e-07, "logits/chosen": 0.7223613858222961, "logits/rejected": 1.3627982139587402, "logps/chosen": -199.59307861328125, "logps/rejected": -237.6206512451172, "loss": 0.7815, "rewards/accuracies": 0.5, "rewards/chosen": -0.5117303133010864, "rewards/margins": -0.075690358877182, "rewards/rejected": -0.43603992462158203, "step": 1391 }, { "epoch": 1.392, "grad_norm": 1.8658658266067505, "learning_rate": 7.599999999999999e-07, "logits/chosen": 0.7643138766288757, "logits/rejected": 0.9544691443443298, "logps/chosen": -374.7898254394531, "logps/rejected": -277.478515625, "loss": 0.7355, "rewards/accuracies": 0.625, "rewards/chosen": -0.20294800400733948, "rewards/margins": 0.5145626068115234, "rewards/rejected": -0.7175105810165405, "step": 1392 }, { "epoch": 1.393, "grad_norm": 2.434133291244507, "learning_rate": 7.5875e-07, "logits/chosen": 1.114821434020996, "logits/rejected": 0.537956953048706, "logps/chosen": -317.114013671875, "logps/rejected": -157.4687042236328, "loss": 0.7633, "rewards/accuracies": 0.625, "rewards/chosen": -0.527774453163147, "rewards/margins": 0.0402037650346756, "rewards/rejected": -0.5679781436920166, "step": 1393 }, { "epoch": 1.3940000000000001, "grad_norm": 1.291877269744873, "learning_rate": 7.575e-07, "logits/chosen": 0.562152087688446, "logits/rejected": 1.0509257316589355, "logps/chosen": -168.78138732910156, "logps/rejected": -224.80374145507812, "loss": 0.4631, "rewards/accuracies": 0.75, "rewards/chosen": 0.16016796231269836, "rewards/margins": 0.7471588253974915, "rewards/rejected": -0.5869908332824707, "step": 1394 }, { "epoch": 1.395, "grad_norm": 1.3282362222671509, "learning_rate": 7.5625e-07, "logits/chosen": 0.34030085802078247, "logits/rejected": 0.3928487300872803, "logps/chosen": -150.33346557617188, "logps/rejected": -227.9147186279297, "loss": 0.6012, "rewards/accuracies": 0.75, "rewards/chosen": -0.204417884349823, "rewards/margins": 0.30836403369903564, "rewards/rejected": -0.5127819180488586, "step": 1395 }, { "epoch": 1.396, "grad_norm": 1.2391135692596436, "learning_rate": 7.55e-07, "logits/chosen": 0.7309718132019043, "logits/rejected": 0.4304504692554474, "logps/chosen": -231.33938598632812, "logps/rejected": -196.75924682617188, "loss": 0.5933, "rewards/accuracies": 0.625, "rewards/chosen": -0.11379566788673401, "rewards/margins": 0.32909417152404785, "rewards/rejected": -0.4428897798061371, "step": 1396 }, { "epoch": 1.397, "grad_norm": 1.219835877418518, "learning_rate": 7.5375e-07, "logits/chosen": 0.637157678604126, "logits/rejected": 1.0956404209136963, "logps/chosen": -162.1149444580078, "logps/rejected": -232.3452911376953, "loss": 0.3099, "rewards/accuracies": 1.0, "rewards/chosen": 0.28683578968048096, "rewards/margins": 1.107222318649292, "rewards/rejected": -0.8203864097595215, "step": 1397 }, { "epoch": 1.3980000000000001, "grad_norm": 1.9998489618301392, "learning_rate": 7.524999999999999e-07, "logits/chosen": 0.7321929931640625, "logits/rejected": 0.8739153146743774, "logps/chosen": -168.46640014648438, "logps/rejected": -262.2493591308594, "loss": 0.6724, "rewards/accuracies": 0.625, "rewards/chosen": -0.321503221988678, "rewards/margins": 0.3701930046081543, "rewards/rejected": -0.691696286201477, "step": 1398 }, { "epoch": 1.399, "grad_norm": 1.461844563484192, "learning_rate": 7.512499999999999e-07, "logits/chosen": 0.2708473205566406, "logits/rejected": 0.9439447522163391, "logps/chosen": -125.7916259765625, "logps/rejected": -224.82684326171875, "loss": 0.3248, "rewards/accuracies": 0.875, "rewards/chosen": 0.18914584815502167, "rewards/margins": 1.260875940322876, "rewards/rejected": -1.071730136871338, "step": 1399 }, { "epoch": 1.4, "grad_norm": 1.328641414642334, "learning_rate": 7.5e-07, "logits/chosen": 0.8731410503387451, "logits/rejected": 0.9308778047561646, "logps/chosen": -295.6904296875, "logps/rejected": -241.84664916992188, "loss": 0.5397, "rewards/accuracies": 0.75, "rewards/chosen": -0.14889907836914062, "rewards/margins": 0.6562164425849915, "rewards/rejected": -0.8051155805587769, "step": 1400 }, { "epoch": 1.401, "grad_norm": 1.223624348640442, "learning_rate": 7.4875e-07, "logits/chosen": 0.25500431656837463, "logits/rejected": 0.9772498607635498, "logps/chosen": -133.09544372558594, "logps/rejected": -257.734130859375, "loss": 0.6162, "rewards/accuracies": 0.75, "rewards/chosen": -0.2539989948272705, "rewards/margins": 0.6360911130905151, "rewards/rejected": -0.8900901079177856, "step": 1401 }, { "epoch": 1.4020000000000001, "grad_norm": 1.2678073644638062, "learning_rate": 7.475e-07, "logits/chosen": 0.6168767809867859, "logits/rejected": 1.068081259727478, "logps/chosen": -211.21435546875, "logps/rejected": -260.0406494140625, "loss": 0.3929, "rewards/accuracies": 1.0, "rewards/chosen": 0.030079178512096405, "rewards/margins": 0.7837750911712646, "rewards/rejected": -0.7536959052085876, "step": 1402 }, { "epoch": 1.403, "grad_norm": 1.5766667127609253, "learning_rate": 7.4625e-07, "logits/chosen": 0.6586151719093323, "logits/rejected": 1.4053239822387695, "logps/chosen": -164.77859497070312, "logps/rejected": -284.7578430175781, "loss": 0.6632, "rewards/accuracies": 0.625, "rewards/chosen": -0.08108215779066086, "rewards/margins": 0.3910752832889557, "rewards/rejected": -0.47215747833251953, "step": 1403 }, { "epoch": 1.404, "grad_norm": 1.2210263013839722, "learning_rate": 7.45e-07, "logits/chosen": 0.27139633893966675, "logits/rejected": 0.3547808527946472, "logps/chosen": -265.7499694824219, "logps/rejected": -250.6005859375, "loss": 0.4313, "rewards/accuracies": 0.875, "rewards/chosen": 0.08719334751367569, "rewards/margins": 0.82997727394104, "rewards/rejected": -0.7427839040756226, "step": 1404 }, { "epoch": 1.405, "grad_norm": 1.5013155937194824, "learning_rate": 7.4375e-07, "logits/chosen": 0.9417780637741089, "logits/rejected": 0.4989338517189026, "logps/chosen": -302.4927978515625, "logps/rejected": -187.48394775390625, "loss": 0.7086, "rewards/accuracies": 0.5, "rewards/chosen": -0.0708124041557312, "rewards/margins": 0.39275723695755005, "rewards/rejected": -0.46356964111328125, "step": 1405 }, { "epoch": 1.4060000000000001, "grad_norm": 1.4762015342712402, "learning_rate": 7.425e-07, "logits/chosen": 1.0573558807373047, "logits/rejected": 0.607580304145813, "logps/chosen": -234.17269897460938, "logps/rejected": -164.1981201171875, "loss": 0.5933, "rewards/accuracies": 0.625, "rewards/chosen": -0.13220062851905823, "rewards/margins": 0.4745478630065918, "rewards/rejected": -0.6067484617233276, "step": 1406 }, { "epoch": 1.407, "grad_norm": 2.2362823486328125, "learning_rate": 7.412499999999999e-07, "logits/chosen": 0.8967036604881287, "logits/rejected": 0.8534939289093018, "logps/chosen": -254.18362426757812, "logps/rejected": -294.7814636230469, "loss": 0.707, "rewards/accuracies": 0.75, "rewards/chosen": -0.11644573509693146, "rewards/margins": 0.4081370532512665, "rewards/rejected": -0.5245827436447144, "step": 1407 }, { "epoch": 1.408, "grad_norm": 1.372908353805542, "learning_rate": 7.4e-07, "logits/chosen": 1.0152853727340698, "logits/rejected": 0.5470159649848938, "logps/chosen": -315.5711669921875, "logps/rejected": -182.0076904296875, "loss": 0.5012, "rewards/accuracies": 0.625, "rewards/chosen": 0.06033597141504288, "rewards/margins": 0.675502359867096, "rewards/rejected": -0.615166425704956, "step": 1408 }, { "epoch": 1.409, "grad_norm": 1.5658762454986572, "learning_rate": 7.3875e-07, "logits/chosen": 0.5052169561386108, "logits/rejected": 0.705912709236145, "logps/chosen": -154.08856201171875, "logps/rejected": -181.224365234375, "loss": 0.6793, "rewards/accuracies": 0.625, "rewards/chosen": -0.3040027320384979, "rewards/margins": 0.21654126048088074, "rewards/rejected": -0.5205439329147339, "step": 1409 }, { "epoch": 1.41, "grad_norm": 1.083383560180664, "learning_rate": 7.375e-07, "logits/chosen": 0.8179569244384766, "logits/rejected": 0.48161089420318604, "logps/chosen": -304.67730712890625, "logps/rejected": -173.3621826171875, "loss": 0.4408, "rewards/accuracies": 0.75, "rewards/chosen": 0.18488521873950958, "rewards/margins": 0.9911090135574341, "rewards/rejected": -0.8062237501144409, "step": 1410 }, { "epoch": 1.411, "grad_norm": 1.649474024772644, "learning_rate": 7.362499999999999e-07, "logits/chosen": 0.533514678478241, "logits/rejected": -0.06878691911697388, "logps/chosen": -173.39334106445312, "logps/rejected": -161.37606811523438, "loss": 0.6753, "rewards/accuracies": 0.75, "rewards/chosen": -0.05329132825136185, "rewards/margins": 0.21786488592624664, "rewards/rejected": -0.2711562216281891, "step": 1411 }, { "epoch": 1.412, "grad_norm": 1.7019360065460205, "learning_rate": 7.35e-07, "logits/chosen": 0.7987525463104248, "logits/rejected": 1.1034303903579712, "logps/chosen": -141.36224365234375, "logps/rejected": -246.95947265625, "loss": 0.6916, "rewards/accuracies": 0.5, "rewards/chosen": -0.21422341465950012, "rewards/margins": 0.3406141400337219, "rewards/rejected": -0.5548375248908997, "step": 1412 }, { "epoch": 1.413, "grad_norm": 1.3478633165359497, "learning_rate": 7.3375e-07, "logits/chosen": 0.8403586149215698, "logits/rejected": 0.8935227394104004, "logps/chosen": -287.2445373535156, "logps/rejected": -197.22528076171875, "loss": 0.5567, "rewards/accuracies": 0.75, "rewards/chosen": 0.054839715361595154, "rewards/margins": 0.6823397278785706, "rewards/rejected": -0.627500057220459, "step": 1413 }, { "epoch": 1.414, "grad_norm": 1.4009876251220703, "learning_rate": 7.325e-07, "logits/chosen": 0.257554292678833, "logits/rejected": 0.32896482944488525, "logps/chosen": -189.98907470703125, "logps/rejected": -187.22900390625, "loss": 0.6315, "rewards/accuracies": 0.5, "rewards/chosen": -0.2260374128818512, "rewards/margins": 0.4146610498428345, "rewards/rejected": -0.6406984329223633, "step": 1414 }, { "epoch": 1.415, "grad_norm": 1.6161690950393677, "learning_rate": 7.312499999999999e-07, "logits/chosen": 0.8672292232513428, "logits/rejected": 0.8717777729034424, "logps/chosen": -165.91534423828125, "logps/rejected": -185.7310791015625, "loss": 0.8807, "rewards/accuracies": 0.5, "rewards/chosen": -0.5090636014938354, "rewards/margins": 0.0018796175718307495, "rewards/rejected": -0.5109432935714722, "step": 1415 }, { "epoch": 1.416, "grad_norm": 1.1429851055145264, "learning_rate": 7.3e-07, "logits/chosen": 0.15995100140571594, "logits/rejected": -0.07094007730484009, "logps/chosen": -135.62527465820312, "logps/rejected": -182.45616149902344, "loss": 0.3872, "rewards/accuracies": 0.875, "rewards/chosen": 0.00539093092083931, "rewards/margins": 1.0748697519302368, "rewards/rejected": -1.0694787502288818, "step": 1416 }, { "epoch": 1.417, "grad_norm": 1.1922175884246826, "learning_rate": 7.2875e-07, "logits/chosen": 0.7650423049926758, "logits/rejected": 0.9426575899124146, "logps/chosen": -210.9346160888672, "logps/rejected": -167.53036499023438, "loss": 0.5033, "rewards/accuracies": 0.75, "rewards/chosen": -0.10627242177724838, "rewards/margins": 0.5541090965270996, "rewards/rejected": -0.660381555557251, "step": 1417 }, { "epoch": 1.418, "grad_norm": 1.4000941514968872, "learning_rate": 7.275e-07, "logits/chosen": 1.0942161083221436, "logits/rejected": 0.7403175830841064, "logps/chosen": -291.5762023925781, "logps/rejected": -284.77020263671875, "loss": 0.4735, "rewards/accuracies": 0.875, "rewards/chosen": -0.035132601857185364, "rewards/margins": 0.756583571434021, "rewards/rejected": -0.7917162179946899, "step": 1418 }, { "epoch": 1.419, "grad_norm": 1.3530477285385132, "learning_rate": 7.262499999999999e-07, "logits/chosen": 0.7611401081085205, "logits/rejected": 0.773658812046051, "logps/chosen": -205.24588012695312, "logps/rejected": -167.114013671875, "loss": 0.4899, "rewards/accuracies": 0.75, "rewards/chosen": -0.016302447766065598, "rewards/margins": 0.6504226326942444, "rewards/rejected": -0.6667250990867615, "step": 1419 }, { "epoch": 1.42, "grad_norm": 1.5973883867263794, "learning_rate": 7.249999999999999e-07, "logits/chosen": 0.7045927047729492, "logits/rejected": 0.5750696659088135, "logps/chosen": -211.900390625, "logps/rejected": -187.98780822753906, "loss": 0.5939, "rewards/accuracies": 0.75, "rewards/chosen": -0.15169081091880798, "rewards/margins": 0.48105037212371826, "rewards/rejected": -0.6327411532402039, "step": 1420 }, { "epoch": 1.421, "grad_norm": 1.7141752243041992, "learning_rate": 7.2375e-07, "logits/chosen": 0.7620884776115417, "logits/rejected": 0.8648840188980103, "logps/chosen": -211.6280517578125, "logps/rejected": -237.90106201171875, "loss": 0.5657, "rewards/accuracies": 0.625, "rewards/chosen": -0.1102534830570221, "rewards/margins": 0.5513396263122559, "rewards/rejected": -0.6615930795669556, "step": 1421 }, { "epoch": 1.422, "grad_norm": 1.3582226037979126, "learning_rate": 7.225e-07, "logits/chosen": 0.8996158838272095, "logits/rejected": 0.7785694003105164, "logps/chosen": -242.1129150390625, "logps/rejected": -161.49319458007812, "loss": 0.5025, "rewards/accuracies": 0.75, "rewards/chosen": 0.037681203335523605, "rewards/margins": 0.6685771346092224, "rewards/rejected": -0.6308959722518921, "step": 1422 }, { "epoch": 1.423, "grad_norm": 1.9561457633972168, "learning_rate": 7.212499999999999e-07, "logits/chosen": 0.5794469118118286, "logits/rejected": 1.1632484197616577, "logps/chosen": -234.44180297851562, "logps/rejected": -247.45294189453125, "loss": 0.7959, "rewards/accuracies": 0.5, "rewards/chosen": -0.2810233235359192, "rewards/margins": 0.04567959904670715, "rewards/rejected": -0.32670289278030396, "step": 1423 }, { "epoch": 1.424, "grad_norm": 1.2967866659164429, "learning_rate": 7.2e-07, "logits/chosen": 0.7740830183029175, "logits/rejected": 0.93252032995224, "logps/chosen": -238.14828491210938, "logps/rejected": -226.32696533203125, "loss": 0.484, "rewards/accuracies": 0.875, "rewards/chosen": -0.17235945165157318, "rewards/margins": 0.6964046955108643, "rewards/rejected": -0.8687642812728882, "step": 1424 }, { "epoch": 1.425, "grad_norm": 2.431821584701538, "learning_rate": 7.1875e-07, "logits/chosen": 0.9082874655723572, "logits/rejected": 0.7277279496192932, "logps/chosen": -171.77137756347656, "logps/rejected": -182.29278564453125, "loss": 0.8326, "rewards/accuracies": 0.375, "rewards/chosen": -0.5059413313865662, "rewards/margins": 0.13365359604358673, "rewards/rejected": -0.6395949125289917, "step": 1425 }, { "epoch": 1.426, "grad_norm": 1.2793430089950562, "learning_rate": 7.175e-07, "logits/chosen": 0.41090312600135803, "logits/rejected": 0.8430221676826477, "logps/chosen": -185.7472381591797, "logps/rejected": -236.87890625, "loss": 0.4585, "rewards/accuracies": 0.75, "rewards/chosen": 0.0504300631582737, "rewards/margins": 0.7412524819374084, "rewards/rejected": -0.6908223628997803, "step": 1426 }, { "epoch": 1.427, "grad_norm": 1.2724788188934326, "learning_rate": 7.1625e-07, "logits/chosen": 0.7445758581161499, "logits/rejected": 0.6530389785766602, "logps/chosen": -193.94927978515625, "logps/rejected": -171.20262145996094, "loss": 0.4507, "rewards/accuracies": 0.875, "rewards/chosen": 0.04989691078662872, "rewards/margins": 0.9979909658432007, "rewards/rejected": -0.9480940103530884, "step": 1427 }, { "epoch": 1.428, "grad_norm": 1.304681420326233, "learning_rate": 7.149999999999999e-07, "logits/chosen": 0.5345464944839478, "logits/rejected": 0.7006722092628479, "logps/chosen": -187.42037963867188, "logps/rejected": -171.8540802001953, "loss": 0.5591, "rewards/accuracies": 0.75, "rewards/chosen": -0.13455864787101746, "rewards/margins": 0.4030146300792694, "rewards/rejected": -0.5375732779502869, "step": 1428 }, { "epoch": 1.429, "grad_norm": 1.0338855981826782, "learning_rate": 7.137499999999999e-07, "logits/chosen": 1.0212894678115845, "logits/rejected": 0.8975992798805237, "logps/chosen": -266.3175048828125, "logps/rejected": -226.05133056640625, "loss": 0.4254, "rewards/accuracies": 0.875, "rewards/chosen": 0.09490566700696945, "rewards/margins": 0.7751341462135315, "rewards/rejected": -0.6802284717559814, "step": 1429 }, { "epoch": 1.43, "grad_norm": 1.2751314640045166, "learning_rate": 7.125e-07, "logits/chosen": 0.6760354042053223, "logits/rejected": 0.42570334672927856, "logps/chosen": -210.9595489501953, "logps/rejected": -166.7029571533203, "loss": 0.4881, "rewards/accuracies": 0.875, "rewards/chosen": 0.013029668480157852, "rewards/margins": 0.6929757595062256, "rewards/rejected": -0.6799460649490356, "step": 1430 }, { "epoch": 1.431, "grad_norm": 1.6882976293563843, "learning_rate": 7.1125e-07, "logits/chosen": 1.4261668920516968, "logits/rejected": 0.45127278566360474, "logps/chosen": -389.3536071777344, "logps/rejected": -136.0374298095703, "loss": 0.7082, "rewards/accuracies": 0.5, "rewards/chosen": -0.10869140923023224, "rewards/margins": 0.39578789472579956, "rewards/rejected": -0.5044792890548706, "step": 1431 }, { "epoch": 1.432, "grad_norm": 1.74241042137146, "learning_rate": 7.1e-07, "logits/chosen": 0.8051356673240662, "logits/rejected": 0.7251309752464294, "logps/chosen": -176.10760498046875, "logps/rejected": -211.44403076171875, "loss": 0.7026, "rewards/accuracies": 0.625, "rewards/chosen": -0.40025970339775085, "rewards/margins": 0.24774324893951416, "rewards/rejected": -0.6480029821395874, "step": 1432 }, { "epoch": 1.433, "grad_norm": 1.575380563735962, "learning_rate": 7.0875e-07, "logits/chosen": 0.35619986057281494, "logits/rejected": 0.9306442737579346, "logps/chosen": -124.6120376586914, "logps/rejected": -234.51870727539062, "loss": 0.6214, "rewards/accuracies": 0.75, "rewards/chosen": -0.15568771958351135, "rewards/margins": 0.3934541940689087, "rewards/rejected": -0.5491418838500977, "step": 1433 }, { "epoch": 1.434, "grad_norm": 1.6803604364395142, "learning_rate": 7.075e-07, "logits/chosen": 0.4213780164718628, "logits/rejected": 1.0878856182098389, "logps/chosen": -266.5368347167969, "logps/rejected": -263.7500915527344, "loss": 0.7627, "rewards/accuracies": 0.625, "rewards/chosen": -0.14111490547657013, "rewards/margins": 0.5789899230003357, "rewards/rejected": -0.7201048135757446, "step": 1434 }, { "epoch": 1.435, "grad_norm": 1.2393348217010498, "learning_rate": 7.0625e-07, "logits/chosen": 1.1419739723205566, "logits/rejected": 0.8850384950637817, "logps/chosen": -314.6064758300781, "logps/rejected": -299.02227783203125, "loss": 0.4922, "rewards/accuracies": 0.75, "rewards/chosen": 0.04389561340212822, "rewards/margins": 0.7153801321983337, "rewards/rejected": -0.6714845299720764, "step": 1435 }, { "epoch": 1.436, "grad_norm": 1.9916133880615234, "learning_rate": 7.049999999999999e-07, "logits/chosen": 0.6780106425285339, "logits/rejected": 0.7773717641830444, "logps/chosen": -177.83203125, "logps/rejected": -194.92735290527344, "loss": 0.7192, "rewards/accuracies": 0.625, "rewards/chosen": -0.32116422057151794, "rewards/margins": 0.28955399990081787, "rewards/rejected": -0.6107182502746582, "step": 1436 }, { "epoch": 1.437, "grad_norm": 1.7486172914505005, "learning_rate": 7.037499999999999e-07, "logits/chosen": 0.9467741250991821, "logits/rejected": 0.7960407137870789, "logps/chosen": -211.60560607910156, "logps/rejected": -218.4339599609375, "loss": 0.7924, "rewards/accuracies": 0.5, "rewards/chosen": -0.38673314452171326, "rewards/margins": 0.036527737975120544, "rewards/rejected": -0.423260897397995, "step": 1437 }, { "epoch": 1.438, "grad_norm": 0.7742115259170532, "learning_rate": 7.024999999999999e-07, "logits/chosen": 0.6640701293945312, "logits/rejected": 0.5077575445175171, "logps/chosen": -202.8569793701172, "logps/rejected": -168.3050079345703, "loss": 0.2209, "rewards/accuracies": 1.0, "rewards/chosen": 0.2783316671848297, "rewards/margins": 1.5125659704208374, "rewards/rejected": -1.23423433303833, "step": 1438 }, { "epoch": 1.439, "grad_norm": 1.1101621389389038, "learning_rate": 7.0125e-07, "logits/chosen": 0.7419872283935547, "logits/rejected": 0.7366975545883179, "logps/chosen": -231.17823791503906, "logps/rejected": -185.92494201660156, "loss": 0.4348, "rewards/accuracies": 0.875, "rewards/chosen": 0.15001635253429413, "rewards/margins": 0.7284699082374573, "rewards/rejected": -0.578453540802002, "step": 1439 }, { "epoch": 1.44, "grad_norm": 2.0994536876678467, "learning_rate": 7e-07, "logits/chosen": 0.720568060874939, "logits/rejected": 1.049893856048584, "logps/chosen": -115.95785522460938, "logps/rejected": -288.3072509765625, "loss": 0.6757, "rewards/accuracies": 0.625, "rewards/chosen": -0.24440965056419373, "rewards/margins": 0.388322651386261, "rewards/rejected": -0.6327322721481323, "step": 1440 }, { "epoch": 1.441, "grad_norm": 2.9688236713409424, "learning_rate": 6.9875e-07, "logits/chosen": 0.6967078447341919, "logits/rejected": 0.8071848750114441, "logps/chosen": -195.09481811523438, "logps/rejected": -166.61044311523438, "loss": 0.9296, "rewards/accuracies": 0.375, "rewards/chosen": -0.5701450109481812, "rewards/margins": -0.3194277584552765, "rewards/rejected": -0.2507171630859375, "step": 1441 }, { "epoch": 1.442, "grad_norm": 1.1883295774459839, "learning_rate": 6.975e-07, "logits/chosen": 0.7344188690185547, "logits/rejected": 0.5654569864273071, "logps/chosen": -211.82415771484375, "logps/rejected": -168.52471923828125, "loss": 0.5472, "rewards/accuracies": 0.75, "rewards/chosen": 0.024799436330795288, "rewards/margins": 0.49955663084983826, "rewards/rejected": -0.47475719451904297, "step": 1442 }, { "epoch": 1.443, "grad_norm": 1.5849921703338623, "learning_rate": 6.9625e-07, "logits/chosen": 1.0504956245422363, "logits/rejected": 0.7425302267074585, "logps/chosen": -196.46014404296875, "logps/rejected": -154.46438598632812, "loss": 0.5588, "rewards/accuracies": 0.75, "rewards/chosen": -0.17111138999462128, "rewards/margins": 0.6042836904525757, "rewards/rejected": -0.7753950357437134, "step": 1443 }, { "epoch": 1.444, "grad_norm": 1.6555954217910767, "learning_rate": 6.949999999999999e-07, "logits/chosen": 0.7781299948692322, "logits/rejected": 0.9917671084403992, "logps/chosen": -162.24252319335938, "logps/rejected": -207.94338989257812, "loss": 0.5703, "rewards/accuracies": 0.625, "rewards/chosen": -0.05369415134191513, "rewards/margins": 0.5169521570205688, "rewards/rejected": -0.5706462264060974, "step": 1444 }, { "epoch": 1.445, "grad_norm": 1.5228626728057861, "learning_rate": 6.937499999999999e-07, "logits/chosen": 0.9347870945930481, "logits/rejected": 0.7009289264678955, "logps/chosen": -205.48570251464844, "logps/rejected": -198.13821411132812, "loss": 0.4517, "rewards/accuracies": 0.875, "rewards/chosen": 0.1265133023262024, "rewards/margins": 0.6440770626068115, "rewards/rejected": -0.5175637602806091, "step": 1445 }, { "epoch": 1.446, "grad_norm": 1.4212968349456787, "learning_rate": 6.924999999999999e-07, "logits/chosen": 0.7647461891174316, "logits/rejected": 0.9961062073707581, "logps/chosen": -179.66026306152344, "logps/rejected": -207.14625549316406, "loss": 0.4226, "rewards/accuracies": 0.75, "rewards/chosen": -0.13829270005226135, "rewards/margins": 0.7892873287200928, "rewards/rejected": -0.9275799989700317, "step": 1446 }, { "epoch": 1.447, "grad_norm": 1.1725337505340576, "learning_rate": 6.9125e-07, "logits/chosen": 1.3410311937332153, "logits/rejected": 1.1812472343444824, "logps/chosen": -344.71490478515625, "logps/rejected": -262.98907470703125, "loss": 0.4514, "rewards/accuracies": 0.625, "rewards/chosen": -0.011127077043056488, "rewards/margins": 0.8459898233413696, "rewards/rejected": -0.8571169376373291, "step": 1447 }, { "epoch": 1.448, "grad_norm": 0.8611771464347839, "learning_rate": 6.9e-07, "logits/chosen": 1.122956395149231, "logits/rejected": 0.9556081891059875, "logps/chosen": -227.0072479248047, "logps/rejected": -166.28289794921875, "loss": 0.2847, "rewards/accuracies": 0.875, "rewards/chosen": 0.38207170367240906, "rewards/margins": 1.2635236978530884, "rewards/rejected": -0.8814519643783569, "step": 1448 }, { "epoch": 1.449, "grad_norm": 1.20086669921875, "learning_rate": 6.8875e-07, "logits/chosen": 0.9106875658035278, "logits/rejected": 1.025189995765686, "logps/chosen": -189.6164093017578, "logps/rejected": -205.27313232421875, "loss": 0.6157, "rewards/accuracies": 0.75, "rewards/chosen": -0.15067565441131592, "rewards/margins": 0.28722280263900757, "rewards/rejected": -0.4378984570503235, "step": 1449 }, { "epoch": 1.45, "grad_norm": 1.5616222620010376, "learning_rate": 6.875e-07, "logits/chosen": 0.9253209233283997, "logits/rejected": 1.1307332515716553, "logps/chosen": -182.41949462890625, "logps/rejected": -180.45301818847656, "loss": 0.5878, "rewards/accuracies": 0.75, "rewards/chosen": -0.3170318901538849, "rewards/margins": 0.4602786898612976, "rewards/rejected": -0.7773105502128601, "step": 1450 }, { "epoch": 1.451, "grad_norm": 1.151713490486145, "learning_rate": 6.8625e-07, "logits/chosen": 0.5056749582290649, "logits/rejected": 0.9499728083610535, "logps/chosen": -142.99386596679688, "logps/rejected": -257.1495666503906, "loss": 0.3445, "rewards/accuracies": 0.875, "rewards/chosen": 0.09265942871570587, "rewards/margins": 1.0181041955947876, "rewards/rejected": -0.9254448413848877, "step": 1451 }, { "epoch": 1.452, "grad_norm": 2.4736876487731934, "learning_rate": 6.85e-07, "logits/chosen": 0.7839338183403015, "logits/rejected": 0.31147387623786926, "logps/chosen": -213.23898315429688, "logps/rejected": -134.06500244140625, "loss": 0.8312, "rewards/accuracies": 0.5, "rewards/chosen": -0.34118595719337463, "rewards/margins": 0.23424232006072998, "rewards/rejected": -0.575428307056427, "step": 1452 }, { "epoch": 1.453, "grad_norm": 1.42962646484375, "learning_rate": 6.837499999999999e-07, "logits/chosen": 0.834670901298523, "logits/rejected": 1.1614545583724976, "logps/chosen": -166.29147338867188, "logps/rejected": -195.11923217773438, "loss": 0.5048, "rewards/accuracies": 0.75, "rewards/chosen": -0.2019079327583313, "rewards/margins": 0.6984113454818726, "rewards/rejected": -0.9003193378448486, "step": 1453 }, { "epoch": 1.454, "grad_norm": 1.5712600946426392, "learning_rate": 6.824999999999999e-07, "logits/chosen": 0.20263659954071045, "logits/rejected": 0.9740326404571533, "logps/chosen": -141.47415161132812, "logps/rejected": -252.68893432617188, "loss": 0.7404, "rewards/accuracies": 0.5, "rewards/chosen": -0.44581595063209534, "rewards/margins": 0.28469690680503845, "rewards/rejected": -0.7305128574371338, "step": 1454 }, { "epoch": 1.455, "grad_norm": 1.3319958448410034, "learning_rate": 6.8125e-07, "logits/chosen": 1.0543173551559448, "logits/rejected": 0.7847309112548828, "logps/chosen": -272.3893127441406, "logps/rejected": -145.83102416992188, "loss": 0.414, "rewards/accuracies": 0.75, "rewards/chosen": -0.06360979378223419, "rewards/margins": 0.9110444784164429, "rewards/rejected": -0.9746542572975159, "step": 1455 }, { "epoch": 1.456, "grad_norm": 1.4196829795837402, "learning_rate": 6.800000000000001e-07, "logits/chosen": 0.7482665777206421, "logits/rejected": 0.560417115688324, "logps/chosen": -233.08827209472656, "logps/rejected": -164.45018005371094, "loss": 0.6051, "rewards/accuracies": 0.75, "rewards/chosen": -0.14340630173683167, "rewards/margins": 0.343129962682724, "rewards/rejected": -0.48653626441955566, "step": 1456 }, { "epoch": 1.457, "grad_norm": 1.204067587852478, "learning_rate": 6.7875e-07, "logits/chosen": 0.7932897806167603, "logits/rejected": 0.7067117691040039, "logps/chosen": -252.31668090820312, "logps/rejected": -183.81285095214844, "loss": 0.4557, "rewards/accuracies": 0.875, "rewards/chosen": -0.10537318140268326, "rewards/margins": 0.721760630607605, "rewards/rejected": -0.8271337747573853, "step": 1457 }, { "epoch": 1.458, "grad_norm": 1.3096369504928589, "learning_rate": 6.775e-07, "logits/chosen": 0.9658982157707214, "logits/rejected": 0.666537880897522, "logps/chosen": -243.49888610839844, "logps/rejected": -172.58688354492188, "loss": 0.4904, "rewards/accuracies": 0.875, "rewards/chosen": 0.021201465278863907, "rewards/margins": 0.5037802457809448, "rewards/rejected": -0.4825788140296936, "step": 1458 }, { "epoch": 1.459, "grad_norm": 1.4229408502578735, "learning_rate": 6.7625e-07, "logits/chosen": 0.9671297073364258, "logits/rejected": 0.7404968738555908, "logps/chosen": -212.83670043945312, "logps/rejected": -177.31674194335938, "loss": 0.4171, "rewards/accuracies": 0.875, "rewards/chosen": -0.03943967819213867, "rewards/margins": 0.7451460361480713, "rewards/rejected": -0.78458571434021, "step": 1459 }, { "epoch": 1.46, "grad_norm": 1.5163036584854126, "learning_rate": 6.75e-07, "logits/chosen": 0.8838427066802979, "logits/rejected": 0.681904673576355, "logps/chosen": -188.0188751220703, "logps/rejected": -163.18917846679688, "loss": 0.7185, "rewards/accuracies": 0.5, "rewards/chosen": -0.2722342610359192, "rewards/margins": 0.27259618043899536, "rewards/rejected": -0.5448304414749146, "step": 1460 }, { "epoch": 1.461, "grad_norm": 1.4308044910430908, "learning_rate": 6.737499999999999e-07, "logits/chosen": 0.506589949131012, "logits/rejected": 1.1372307538986206, "logps/chosen": -132.99940490722656, "logps/rejected": -210.92922973632812, "loss": 0.4758, "rewards/accuracies": 0.75, "rewards/chosen": -0.1934184432029724, "rewards/margins": 0.8243257403373718, "rewards/rejected": -1.0177441835403442, "step": 1461 }, { "epoch": 1.462, "grad_norm": 1.3724899291992188, "learning_rate": 6.724999999999999e-07, "logits/chosen": 0.9121749401092529, "logits/rejected": 1.2733010053634644, "logps/chosen": -230.3199462890625, "logps/rejected": -266.65380859375, "loss": 0.4831, "rewards/accuracies": 0.75, "rewards/chosen": -0.04198971390724182, "rewards/margins": 0.6885999441146851, "rewards/rejected": -0.7305896878242493, "step": 1462 }, { "epoch": 1.463, "grad_norm": 1.9055688381195068, "learning_rate": 6.7125e-07, "logits/chosen": 0.8018637895584106, "logits/rejected": 0.29593074321746826, "logps/chosen": -251.5190887451172, "logps/rejected": -129.25997924804688, "loss": 0.5831, "rewards/accuracies": 0.75, "rewards/chosen": -0.0720134824514389, "rewards/margins": 0.5046040415763855, "rewards/rejected": -0.5766175389289856, "step": 1463 }, { "epoch": 1.464, "grad_norm": 1.0295711755752563, "learning_rate": 6.7e-07, "logits/chosen": 1.0136332511901855, "logits/rejected": 0.7964615225791931, "logps/chosen": -202.5402374267578, "logps/rejected": -192.8975372314453, "loss": 0.4813, "rewards/accuracies": 0.625, "rewards/chosen": -0.061362359672784805, "rewards/margins": 0.7300816774368286, "rewards/rejected": -0.7914440631866455, "step": 1464 }, { "epoch": 1.465, "grad_norm": 1.2315843105316162, "learning_rate": 6.6875e-07, "logits/chosen": 1.2214723825454712, "logits/rejected": 0.33961451053619385, "logps/chosen": -280.6473083496094, "logps/rejected": -151.38571166992188, "loss": 0.5003, "rewards/accuracies": 0.75, "rewards/chosen": 0.09637241065502167, "rewards/margins": 0.6020631790161133, "rewards/rejected": -0.5056907534599304, "step": 1465 }, { "epoch": 1.466, "grad_norm": 1.3159099817276, "learning_rate": 6.675e-07, "logits/chosen": 0.7103286981582642, "logits/rejected": 0.6774654984474182, "logps/chosen": -282.203857421875, "logps/rejected": -232.33229064941406, "loss": 0.7413, "rewards/accuracies": 0.75, "rewards/chosen": -0.18974581360816956, "rewards/margins": 0.14543506503105164, "rewards/rejected": -0.3351808488368988, "step": 1466 }, { "epoch": 1.467, "grad_norm": 1.7757288217544556, "learning_rate": 6.6625e-07, "logits/chosen": 0.7445204257965088, "logits/rejected": 0.424896776676178, "logps/chosen": -264.28021240234375, "logps/rejected": -239.48358154296875, "loss": 0.5319, "rewards/accuracies": 0.625, "rewards/chosen": -0.22161483764648438, "rewards/margins": 0.7072646021842957, "rewards/rejected": -0.92887943983078, "step": 1467 }, { "epoch": 1.468, "grad_norm": 1.9038575887680054, "learning_rate": 6.65e-07, "logits/chosen": 0.38225409388542175, "logits/rejected": 0.7541413903236389, "logps/chosen": -155.0933837890625, "logps/rejected": -210.73797607421875, "loss": 0.5066, "rewards/accuracies": 0.875, "rewards/chosen": -0.12025079876184464, "rewards/margins": 0.6535221934318542, "rewards/rejected": -0.7737729549407959, "step": 1468 }, { "epoch": 1.4689999999999999, "grad_norm": 1.5914493799209595, "learning_rate": 6.637499999999999e-07, "logits/chosen": 0.9989169836044312, "logits/rejected": 0.7477903962135315, "logps/chosen": -271.7366027832031, "logps/rejected": -161.43585205078125, "loss": 0.5743, "rewards/accuracies": 0.75, "rewards/chosen": -0.30872786045074463, "rewards/margins": 0.3685743510723114, "rewards/rejected": -0.6773021817207336, "step": 1469 }, { "epoch": 1.47, "grad_norm": 1.2384151220321655, "learning_rate": 6.624999999999999e-07, "logits/chosen": 0.8467905521392822, "logits/rejected": 0.6902221441268921, "logps/chosen": -268.0375671386719, "logps/rejected": -206.26528930664062, "loss": 0.4632, "rewards/accuracies": 0.875, "rewards/chosen": -0.07873191684484482, "rewards/margins": 0.7163950800895691, "rewards/rejected": -0.7951270341873169, "step": 1470 }, { "epoch": 1.471, "grad_norm": 1.2378009557724, "learning_rate": 6.6125e-07, "logits/chosen": 0.40375930070877075, "logits/rejected": 0.9655121564865112, "logps/chosen": -154.49551391601562, "logps/rejected": -259.539306640625, "loss": 0.4418, "rewards/accuracies": 0.75, "rewards/chosen": 0.0539851188659668, "rewards/margins": 0.8111722469329834, "rewards/rejected": -0.7571870684623718, "step": 1471 }, { "epoch": 1.472, "grad_norm": 1.6271111965179443, "learning_rate": 6.6e-07, "logits/chosen": 0.5653853416442871, "logits/rejected": 0.8210399150848389, "logps/chosen": -190.77845764160156, "logps/rejected": -252.9866943359375, "loss": 0.6228, "rewards/accuracies": 0.625, "rewards/chosen": -0.06112454831600189, "rewards/margins": 0.3701580762863159, "rewards/rejected": -0.4312826097011566, "step": 1472 }, { "epoch": 1.4729999999999999, "grad_norm": 2.0671300888061523, "learning_rate": 6.587499999999999e-07, "logits/chosen": 0.5788148045539856, "logits/rejected": 1.0366566181182861, "logps/chosen": -174.803955078125, "logps/rejected": -236.3282928466797, "loss": 0.6568, "rewards/accuracies": 0.5, "rewards/chosen": -0.20636263489723206, "rewards/margins": 0.41988858580589294, "rewards/rejected": -0.626251220703125, "step": 1473 }, { "epoch": 1.474, "grad_norm": 1.8815178871154785, "learning_rate": 6.575e-07, "logits/chosen": 0.7585075497627258, "logits/rejected": 1.2068917751312256, "logps/chosen": -178.4964141845703, "logps/rejected": -297.2519226074219, "loss": 0.6807, "rewards/accuracies": 0.5, "rewards/chosen": -0.0995492935180664, "rewards/margins": 0.2567828297615051, "rewards/rejected": -0.35633212327957153, "step": 1474 }, { "epoch": 1.475, "grad_norm": 1.7065531015396118, "learning_rate": 6.5625e-07, "logits/chosen": 1.0664937496185303, "logits/rejected": 0.2823287844657898, "logps/chosen": -243.4758758544922, "logps/rejected": -151.94998168945312, "loss": 0.7611, "rewards/accuracies": 0.5, "rewards/chosen": -0.20960722863674164, "rewards/margins": 0.02006593346595764, "rewards/rejected": -0.22967320680618286, "step": 1475 }, { "epoch": 1.476, "grad_norm": 1.9476170539855957, "learning_rate": 6.55e-07, "logits/chosen": 0.604293942451477, "logits/rejected": 1.1393486261367798, "logps/chosen": -109.37255096435547, "logps/rejected": -303.2665710449219, "loss": 0.4563, "rewards/accuracies": 0.875, "rewards/chosen": -0.06997533142566681, "rewards/margins": 0.8195042610168457, "rewards/rejected": -0.8894795775413513, "step": 1476 }, { "epoch": 1.4769999999999999, "grad_norm": 1.5433918237686157, "learning_rate": 6.5375e-07, "logits/chosen": 0.837272047996521, "logits/rejected": 0.37823137640953064, "logps/chosen": -215.20599365234375, "logps/rejected": -231.70333862304688, "loss": 0.6368, "rewards/accuracies": 0.625, "rewards/chosen": -0.2846061885356903, "rewards/margins": 0.31794148683547974, "rewards/rejected": -0.6025476455688477, "step": 1477 }, { "epoch": 1.478, "grad_norm": 1.1955583095550537, "learning_rate": 6.524999999999999e-07, "logits/chosen": 0.6927171945571899, "logits/rejected": 0.7853608727455139, "logps/chosen": -202.99258422851562, "logps/rejected": -266.62078857421875, "loss": 0.4972, "rewards/accuracies": 0.875, "rewards/chosen": 0.014613717794418335, "rewards/margins": 0.6080847978591919, "rewards/rejected": -0.5934710502624512, "step": 1478 }, { "epoch": 1.479, "grad_norm": 1.3121070861816406, "learning_rate": 6.5125e-07, "logits/chosen": 1.1188040971755981, "logits/rejected": 0.561796247959137, "logps/chosen": -300.73504638671875, "logps/rejected": -147.6702880859375, "loss": 0.4347, "rewards/accuracies": 0.75, "rewards/chosen": -0.0005743503570556641, "rewards/margins": 0.7230260372161865, "rewards/rejected": -0.7236003875732422, "step": 1479 }, { "epoch": 1.48, "grad_norm": 2.3257932662963867, "learning_rate": 6.5e-07, "logits/chosen": 1.5137556791305542, "logits/rejected": 0.374220073223114, "logps/chosen": -333.7154235839844, "logps/rejected": -138.26715087890625, "loss": 0.6171, "rewards/accuracies": 0.75, "rewards/chosen": -0.25011998414993286, "rewards/margins": 0.38098955154418945, "rewards/rejected": -0.6311095356941223, "step": 1480 }, { "epoch": 1.4809999999999999, "grad_norm": 1.090006709098816, "learning_rate": 6.4875e-07, "logits/chosen": 0.749820351600647, "logits/rejected": 0.41891616582870483, "logps/chosen": -271.512939453125, "logps/rejected": -186.31430053710938, "loss": 0.433, "rewards/accuracies": 0.75, "rewards/chosen": 0.10330791771411896, "rewards/margins": 1.052950382232666, "rewards/rejected": -0.949642539024353, "step": 1481 }, { "epoch": 1.482, "grad_norm": 1.4467768669128418, "learning_rate": 6.474999999999999e-07, "logits/chosen": 0.46321508288383484, "logits/rejected": 0.7797105312347412, "logps/chosen": -214.86219787597656, "logps/rejected": -173.2172393798828, "loss": 0.5023, "rewards/accuracies": 0.75, "rewards/chosen": -0.2634585499763489, "rewards/margins": 0.6661888360977173, "rewards/rejected": -0.9296473860740662, "step": 1482 }, { "epoch": 1.483, "grad_norm": 1.701973795890808, "learning_rate": 6.4625e-07, "logits/chosen": 0.6382265090942383, "logits/rejected": 0.3444274961948395, "logps/chosen": -203.87608337402344, "logps/rejected": -171.88072204589844, "loss": 0.5303, "rewards/accuracies": 0.75, "rewards/chosen": -0.08475608378648758, "rewards/margins": 0.5479320287704468, "rewards/rejected": -0.6326881051063538, "step": 1483 }, { "epoch": 1.484, "grad_norm": 1.203799843788147, "learning_rate": 6.45e-07, "logits/chosen": 0.9994441270828247, "logits/rejected": 1.5645548105239868, "logps/chosen": -259.1150207519531, "logps/rejected": -272.2398681640625, "loss": 0.4327, "rewards/accuracies": 0.75, "rewards/chosen": -0.15744847059249878, "rewards/margins": 0.9827592372894287, "rewards/rejected": -1.1402077674865723, "step": 1484 }, { "epoch": 1.4849999999999999, "grad_norm": 1.1698634624481201, "learning_rate": 6.4375e-07, "logits/chosen": 0.8117804527282715, "logits/rejected": 0.349663645029068, "logps/chosen": -189.9619140625, "logps/rejected": -148.5346221923828, "loss": 0.4636, "rewards/accuracies": 0.875, "rewards/chosen": 0.09065033495426178, "rewards/margins": 0.6728788018226624, "rewards/rejected": -0.5822284817695618, "step": 1485 }, { "epoch": 1.486, "grad_norm": 1.4716683626174927, "learning_rate": 6.424999999999999e-07, "logits/chosen": 0.5038296580314636, "logits/rejected": 0.8876982927322388, "logps/chosen": -137.45530700683594, "logps/rejected": -244.7040557861328, "loss": 0.7114, "rewards/accuracies": 0.625, "rewards/chosen": -0.18312925100326538, "rewards/margins": 0.2428026795387268, "rewards/rejected": -0.4259319603443146, "step": 1486 }, { "epoch": 1.487, "grad_norm": 1.1363153457641602, "learning_rate": 6.4125e-07, "logits/chosen": 1.1456722021102905, "logits/rejected": 1.141929030418396, "logps/chosen": -245.2411346435547, "logps/rejected": -208.52078247070312, "loss": 0.4737, "rewards/accuracies": 0.875, "rewards/chosen": 0.12229013442993164, "rewards/margins": 0.6869491934776306, "rewards/rejected": -0.5646589994430542, "step": 1487 }, { "epoch": 1.488, "grad_norm": 0.9496299624443054, "learning_rate": 6.4e-07, "logits/chosen": 0.44194138050079346, "logits/rejected": 0.6694850921630859, "logps/chosen": -138.69923400878906, "logps/rejected": -170.47862243652344, "loss": 0.414, "rewards/accuracies": 0.75, "rewards/chosen": 0.1631060093641281, "rewards/margins": 0.9128634333610535, "rewards/rejected": -0.7497574090957642, "step": 1488 }, { "epoch": 1.4889999999999999, "grad_norm": 1.499947190284729, "learning_rate": 6.3875e-07, "logits/chosen": 1.3450286388397217, "logits/rejected": 0.4527179002761841, "logps/chosen": -363.56585693359375, "logps/rejected": -170.68353271484375, "loss": 0.53, "rewards/accuracies": 0.875, "rewards/chosen": -0.22925148904323578, "rewards/margins": 0.603415310382843, "rewards/rejected": -0.8326667547225952, "step": 1489 }, { "epoch": 1.49, "grad_norm": 1.5999199151992798, "learning_rate": 6.374999999999999e-07, "logits/chosen": 1.2547085285186768, "logits/rejected": 1.1207058429718018, "logps/chosen": -209.89093017578125, "logps/rejected": -199.00442504882812, "loss": 0.7421, "rewards/accuracies": 0.625, "rewards/chosen": -0.2828953266143799, "rewards/margins": 0.08847232908010483, "rewards/rejected": -0.3713676333427429, "step": 1490 }, { "epoch": 1.491, "grad_norm": 1.7588125467300415, "learning_rate": 6.362499999999999e-07, "logits/chosen": 1.1455681324005127, "logits/rejected": 0.4491672217845917, "logps/chosen": -256.74700927734375, "logps/rejected": -219.329833984375, "loss": 0.7405, "rewards/accuracies": 0.75, "rewards/chosen": -0.17136022448539734, "rewards/margins": 0.12246552109718323, "rewards/rejected": -0.2938257157802582, "step": 1491 }, { "epoch": 1.492, "grad_norm": 1.4290732145309448, "learning_rate": 6.35e-07, "logits/chosen": 0.4649093747138977, "logits/rejected": 0.3806522786617279, "logps/chosen": -268.4090576171875, "logps/rejected": -191.51388549804688, "loss": 0.5713, "rewards/accuracies": 0.625, "rewards/chosen": -0.2636198103427887, "rewards/margins": 0.3906336724758148, "rewards/rejected": -0.6542534828186035, "step": 1492 }, { "epoch": 1.4929999999999999, "grad_norm": 1.6453056335449219, "learning_rate": 6.3375e-07, "logits/chosen": 0.4317954480648041, "logits/rejected": 0.6746066808700562, "logps/chosen": -173.40377807617188, "logps/rejected": -263.4965515136719, "loss": 0.5063, "rewards/accuracies": 0.75, "rewards/chosen": -0.06070557236671448, "rewards/margins": 0.5276964902877808, "rewards/rejected": -0.5884020924568176, "step": 1493 }, { "epoch": 1.494, "grad_norm": 2.3953285217285156, "learning_rate": 6.324999999999999e-07, "logits/chosen": 0.9702523946762085, "logits/rejected": 0.5139336585998535, "logps/chosen": -304.14312744140625, "logps/rejected": -169.43463134765625, "loss": 0.9073, "rewards/accuracies": 0.375, "rewards/chosen": -0.560818612575531, "rewards/margins": -0.260921835899353, "rewards/rejected": -0.299896776676178, "step": 1494 }, { "epoch": 1.495, "grad_norm": 1.3614697456359863, "learning_rate": 6.3125e-07, "logits/chosen": 0.8442736864089966, "logits/rejected": 0.7049640417098999, "logps/chosen": -231.95114135742188, "logps/rejected": -213.84735107421875, "loss": 0.4921, "rewards/accuracies": 0.75, "rewards/chosen": -0.06669159978628159, "rewards/margins": 0.6495099067687988, "rewards/rejected": -0.7162014245986938, "step": 1495 }, { "epoch": 1.496, "grad_norm": 2.2820682525634766, "learning_rate": 6.3e-07, "logits/chosen": 1.4019136428833008, "logits/rejected": 1.003382921218872, "logps/chosen": -346.2218322753906, "logps/rejected": -235.7071533203125, "loss": 0.8721, "rewards/accuracies": 0.5, "rewards/chosen": -0.5135564804077148, "rewards/margins": -0.18508349359035492, "rewards/rejected": -0.3284730017185211, "step": 1496 }, { "epoch": 1.4969999999999999, "grad_norm": 1.4622989892959595, "learning_rate": 6.2875e-07, "logits/chosen": 0.8325498104095459, "logits/rejected": -0.07432513684034348, "logps/chosen": -284.1763916015625, "logps/rejected": -145.277587890625, "loss": 0.5158, "rewards/accuracies": 0.75, "rewards/chosen": 0.016040321439504623, "rewards/margins": 0.6205407381057739, "rewards/rejected": -0.604500412940979, "step": 1497 }, { "epoch": 1.498, "grad_norm": 1.3528673648834229, "learning_rate": 6.274999999999999e-07, "logits/chosen": 0.4774852693080902, "logits/rejected": 0.635087788105011, "logps/chosen": -248.7316131591797, "logps/rejected": -197.333740234375, "loss": 0.4972, "rewards/accuracies": 0.875, "rewards/chosen": -0.06092529743909836, "rewards/margins": 0.5868107080459595, "rewards/rejected": -0.6477360129356384, "step": 1498 }, { "epoch": 1.499, "grad_norm": 1.410686731338501, "learning_rate": 6.262499999999999e-07, "logits/chosen": 0.99399733543396, "logits/rejected": 0.6192254424095154, "logps/chosen": -353.2482604980469, "logps/rejected": -223.1816864013672, "loss": 0.6039, "rewards/accuracies": 0.75, "rewards/chosen": -0.09533844888210297, "rewards/margins": 0.509575605392456, "rewards/rejected": -0.6049140095710754, "step": 1499 }, { "epoch": 1.5, "grad_norm": 1.3914258480072021, "learning_rate": 6.249999999999999e-07, "logits/chosen": 0.14824306964874268, "logits/rejected": 0.6548575162887573, "logps/chosen": -123.12017822265625, "logps/rejected": -208.9913330078125, "loss": 0.4019, "rewards/accuracies": 0.875, "rewards/chosen": -0.013256929814815521, "rewards/margins": 1.0257655382156372, "rewards/rejected": -1.039022445678711, "step": 1500 }, { "epoch": 1.501, "grad_norm": 1.3796641826629639, "learning_rate": 6.2375e-07, "logits/chosen": 1.031219244003296, "logits/rejected": 0.7073427438735962, "logps/chosen": -257.38970947265625, "logps/rejected": -210.82066345214844, "loss": 0.8068, "rewards/accuracies": 0.75, "rewards/chosen": -0.29204607009887695, "rewards/margins": 0.03305082023143768, "rewards/rejected": -0.32509690523147583, "step": 1501 }, { "epoch": 1.502, "grad_norm": 1.2692792415618896, "learning_rate": 6.225000000000001e-07, "logits/chosen": 1.3294206857681274, "logits/rejected": 0.742118239402771, "logps/chosen": -318.27734375, "logps/rejected": -223.26901245117188, "loss": 0.6427, "rewards/accuracies": 0.625, "rewards/chosen": -0.06611737608909607, "rewards/margins": 0.3707004189491272, "rewards/rejected": -0.43681782484054565, "step": 1502 }, { "epoch": 1.5030000000000001, "grad_norm": 1.506757378578186, "learning_rate": 6.2125e-07, "logits/chosen": 0.39477914571762085, "logits/rejected": 1.069583773612976, "logps/chosen": -128.76199340820312, "logps/rejected": -250.67193603515625, "loss": 0.4489, "rewards/accuracies": 0.875, "rewards/chosen": -0.04841495305299759, "rewards/margins": 0.7347847819328308, "rewards/rejected": -0.783199667930603, "step": 1503 }, { "epoch": 1.504, "grad_norm": 1.9314478635787964, "learning_rate": 6.2e-07, "logits/chosen": 0.27674800157546997, "logits/rejected": 1.1015427112579346, "logps/chosen": -124.10054016113281, "logps/rejected": -250.12094116210938, "loss": 0.6583, "rewards/accuracies": 0.625, "rewards/chosen": -0.03134870156645775, "rewards/margins": 0.37553760409355164, "rewards/rejected": -0.4068863093852997, "step": 1504 }, { "epoch": 1.505, "grad_norm": 1.4423409700393677, "learning_rate": 6.1875e-07, "logits/chosen": 0.8938694596290588, "logits/rejected": 0.7052759528160095, "logps/chosen": -177.67593383789062, "logps/rejected": -212.77455139160156, "loss": 0.544, "rewards/accuracies": 0.625, "rewards/chosen": -0.036061760038137436, "rewards/margins": 0.4855886995792389, "rewards/rejected": -0.5216505527496338, "step": 1505 }, { "epoch": 1.506, "grad_norm": 1.746295690536499, "learning_rate": 6.175e-07, "logits/chosen": 1.0169897079467773, "logits/rejected": 0.7620422840118408, "logps/chosen": -288.2772521972656, "logps/rejected": -233.67816162109375, "loss": 0.5914, "rewards/accuracies": 0.75, "rewards/chosen": -0.1329292207956314, "rewards/margins": 0.49496978521347046, "rewards/rejected": -0.6278989911079407, "step": 1506 }, { "epoch": 1.5070000000000001, "grad_norm": 1.1418023109436035, "learning_rate": 6.162499999999999e-07, "logits/chosen": 1.2568731307983398, "logits/rejected": 0.5119950175285339, "logps/chosen": -345.37957763671875, "logps/rejected": -170.21804809570312, "loss": 0.4679, "rewards/accuracies": 0.875, "rewards/chosen": 0.10996340960264206, "rewards/margins": 0.7511594295501709, "rewards/rejected": -0.641196072101593, "step": 1507 }, { "epoch": 1.508, "grad_norm": 2.300931453704834, "learning_rate": 6.149999999999999e-07, "logits/chosen": 0.8181798458099365, "logits/rejected": 0.8677399754524231, "logps/chosen": -168.07766723632812, "logps/rejected": -266.03533935546875, "loss": 0.8899, "rewards/accuracies": 0.5, "rewards/chosen": -0.5113115310668945, "rewards/margins": -0.17382003366947174, "rewards/rejected": -0.337491512298584, "step": 1508 }, { "epoch": 1.509, "grad_norm": 1.2229373455047607, "learning_rate": 6.1375e-07, "logits/chosen": -0.018903698772192, "logits/rejected": 0.3250964283943176, "logps/chosen": -130.8203125, "logps/rejected": -219.1761474609375, "loss": 0.2858, "rewards/accuracies": 1.0, "rewards/chosen": 0.060613300651311874, "rewards/margins": 1.390319585800171, "rewards/rejected": -1.3297061920166016, "step": 1509 }, { "epoch": 1.51, "grad_norm": 1.8668290376663208, "learning_rate": 6.125000000000001e-07, "logits/chosen": 0.4123390316963196, "logits/rejected": 0.8226284384727478, "logps/chosen": -181.42543029785156, "logps/rejected": -232.95777893066406, "loss": 0.5778, "rewards/accuracies": 0.75, "rewards/chosen": -0.34549108147621155, "rewards/margins": 0.48934561014175415, "rewards/rejected": -0.8348366618156433, "step": 1510 }, { "epoch": 1.5110000000000001, "grad_norm": 1.6811962127685547, "learning_rate": 6.1125e-07, "logits/chosen": 0.8946871161460876, "logits/rejected": 0.9352653622627258, "logps/chosen": -214.47207641601562, "logps/rejected": -186.8939208984375, "loss": 0.4195, "rewards/accuracies": 0.75, "rewards/chosen": 0.1457434892654419, "rewards/margins": 0.927345335483551, "rewards/rejected": -0.7816017866134644, "step": 1511 }, { "epoch": 1.512, "grad_norm": 1.7837153673171997, "learning_rate": 6.1e-07, "logits/chosen": 0.3459591865539551, "logits/rejected": 0.8644090890884399, "logps/chosen": -122.98484802246094, "logps/rejected": -198.58065795898438, "loss": 0.6232, "rewards/accuracies": 0.5, "rewards/chosen": -0.05530919134616852, "rewards/margins": 0.3204357326030731, "rewards/rejected": -0.37574493885040283, "step": 1512 }, { "epoch": 1.513, "grad_norm": 1.5190001726150513, "learning_rate": 6.0875e-07, "logits/chosen": 1.12245774269104, "logits/rejected": 1.2658491134643555, "logps/chosen": -193.00555419921875, "logps/rejected": -241.32601928710938, "loss": 0.6608, "rewards/accuracies": 0.5, "rewards/chosen": -0.24091334640979767, "rewards/margins": 0.3834274411201477, "rewards/rejected": -0.624340832233429, "step": 1513 }, { "epoch": 1.514, "grad_norm": 1.8010472059249878, "learning_rate": 6.075e-07, "logits/chosen": 0.7173105478286743, "logits/rejected": 1.0950584411621094, "logps/chosen": -262.22528076171875, "logps/rejected": -196.88714599609375, "loss": 0.6027, "rewards/accuracies": 0.5, "rewards/chosen": -0.1736244112253189, "rewards/margins": 0.36923858523368835, "rewards/rejected": -0.5428630113601685, "step": 1514 }, { "epoch": 1.5150000000000001, "grad_norm": 1.0484064817428589, "learning_rate": 6.062499999999999e-07, "logits/chosen": 0.7615284323692322, "logits/rejected": 0.6490079164505005, "logps/chosen": -176.1219482421875, "logps/rejected": -146.51348876953125, "loss": 0.4196, "rewards/accuracies": 0.75, "rewards/chosen": 0.03872547671198845, "rewards/margins": 0.9682254791259766, "rewards/rejected": -0.9294999837875366, "step": 1515 }, { "epoch": 1.516, "grad_norm": 1.6219706535339355, "learning_rate": 6.049999999999999e-07, "logits/chosen": 0.8652312755584717, "logits/rejected": 0.9189947247505188, "logps/chosen": -207.19073486328125, "logps/rejected": -188.97372436523438, "loss": 0.706, "rewards/accuracies": 0.75, "rewards/chosen": -0.25589582324028015, "rewards/margins": 0.19271203875541687, "rewards/rejected": -0.44860783219337463, "step": 1516 }, { "epoch": 1.517, "grad_norm": 1.6765015125274658, "learning_rate": 6.037499999999999e-07, "logits/chosen": 0.6026560664176941, "logits/rejected": 0.8057491779327393, "logps/chosen": -168.02845764160156, "logps/rejected": -275.5740966796875, "loss": 0.5167, "rewards/accuracies": 0.75, "rewards/chosen": 0.052440449595451355, "rewards/margins": 0.7130851745605469, "rewards/rejected": -0.6606447696685791, "step": 1517 }, { "epoch": 1.518, "grad_norm": 1.4464367628097534, "learning_rate": 6.025000000000001e-07, "logits/chosen": 0.9766566753387451, "logits/rejected": 0.5751059055328369, "logps/chosen": -156.76834106445312, "logps/rejected": -184.56658935546875, "loss": 0.4676, "rewards/accuracies": 0.75, "rewards/chosen": 0.21850767731666565, "rewards/margins": 0.9295706152915955, "rewards/rejected": -0.7110629081726074, "step": 1518 }, { "epoch": 1.5190000000000001, "grad_norm": 2.2237467765808105, "learning_rate": 6.0125e-07, "logits/chosen": 0.8447433710098267, "logits/rejected": 1.103177547454834, "logps/chosen": -290.0350341796875, "logps/rejected": -179.2801513671875, "loss": 0.7891, "rewards/accuracies": 0.625, "rewards/chosen": -0.2586851119995117, "rewards/margins": 0.11049758642911911, "rewards/rejected": -0.36918267607688904, "step": 1519 }, { "epoch": 1.52, "grad_norm": 1.4430090188980103, "learning_rate": 6e-07, "logits/chosen": 0.8829358220100403, "logits/rejected": 0.736518919467926, "logps/chosen": -163.46682739257812, "logps/rejected": -206.30709838867188, "loss": 0.6875, "rewards/accuracies": 0.625, "rewards/chosen": -0.22559499740600586, "rewards/margins": 0.19575384259223938, "rewards/rejected": -0.4213488698005676, "step": 1520 }, { "epoch": 1.521, "grad_norm": 1.9262323379516602, "learning_rate": 5.9875e-07, "logits/chosen": 0.7541528344154358, "logits/rejected": 1.0589041709899902, "logps/chosen": -203.08282470703125, "logps/rejected": -205.45999145507812, "loss": 0.6552, "rewards/accuracies": 0.625, "rewards/chosen": -0.1266166865825653, "rewards/margins": 0.3522354066371918, "rewards/rejected": -0.4788520932197571, "step": 1521 }, { "epoch": 1.522, "grad_norm": 2.022139310836792, "learning_rate": 5.975e-07, "logits/chosen": 1.1696909666061401, "logits/rejected": 0.4034879207611084, "logps/chosen": -313.8883056640625, "logps/rejected": -180.97250366210938, "loss": 0.5978, "rewards/accuracies": 0.625, "rewards/chosen": -0.22573718428611755, "rewards/margins": 0.3985455632209778, "rewards/rejected": -0.624282717704773, "step": 1522 }, { "epoch": 1.5230000000000001, "grad_norm": 0.8844925761222839, "learning_rate": 5.962499999999999e-07, "logits/chosen": 1.4226963520050049, "logits/rejected": 0.4830358326435089, "logps/chosen": -369.96270751953125, "logps/rejected": -200.74667358398438, "loss": 0.2868, "rewards/accuracies": 1.0, "rewards/chosen": 0.3696562647819519, "rewards/margins": 1.2449696063995361, "rewards/rejected": -0.8753133416175842, "step": 1523 }, { "epoch": 1.524, "grad_norm": 1.3801857233047485, "learning_rate": 5.949999999999999e-07, "logits/chosen": 0.9701583981513977, "logits/rejected": 0.6230897903442383, "logps/chosen": -225.12318420410156, "logps/rejected": -176.73175048828125, "loss": 0.3964, "rewards/accuracies": 0.875, "rewards/chosen": 0.029484834522008896, "rewards/margins": 0.9760314226150513, "rewards/rejected": -0.9465466141700745, "step": 1524 }, { "epoch": 1.525, "grad_norm": 1.2480857372283936, "learning_rate": 5.937499999999999e-07, "logits/chosen": 0.785354733467102, "logits/rejected": 0.7417744994163513, "logps/chosen": -189.6148223876953, "logps/rejected": -212.47727966308594, "loss": 0.6103, "rewards/accuracies": 0.75, "rewards/chosen": 0.07182417809963226, "rewards/margins": 0.6770042181015015, "rewards/rejected": -0.605180025100708, "step": 1525 }, { "epoch": 1.526, "grad_norm": 2.0929813385009766, "learning_rate": 5.925e-07, "logits/chosen": 0.6784870624542236, "logits/rejected": 0.7626144289970398, "logps/chosen": -237.5394744873047, "logps/rejected": -214.80819702148438, "loss": 0.9394, "rewards/accuracies": 0.375, "rewards/chosen": -0.4385497272014618, "rewards/margins": -0.20707982778549194, "rewards/rejected": -0.23146985471248627, "step": 1526 }, { "epoch": 1.5270000000000001, "grad_norm": 2.273080825805664, "learning_rate": 5.912500000000001e-07, "logits/chosen": 0.19543258845806122, "logits/rejected": 0.2602660655975342, "logps/chosen": -205.4873504638672, "logps/rejected": -174.84747314453125, "loss": 0.8369, "rewards/accuracies": 0.5, "rewards/chosen": -0.39122897386550903, "rewards/margins": 0.020003825426101685, "rewards/rejected": -0.41123273968696594, "step": 1527 }, { "epoch": 1.528, "grad_norm": 1.4023948907852173, "learning_rate": 5.9e-07, "logits/chosen": 0.3942679166793823, "logits/rejected": 0.6288066506385803, "logps/chosen": -158.60560607910156, "logps/rejected": -255.75991821289062, "loss": 0.6342, "rewards/accuracies": 0.625, "rewards/chosen": -0.09262790530920029, "rewards/margins": 0.2174554318189621, "rewards/rejected": -0.3100833296775818, "step": 1528 }, { "epoch": 1.529, "grad_norm": 1.1168861389160156, "learning_rate": 5.8875e-07, "logits/chosen": 0.744176983833313, "logits/rejected": 0.3803388476371765, "logps/chosen": -150.03778076171875, "logps/rejected": -164.8263702392578, "loss": 0.5346, "rewards/accuracies": 0.75, "rewards/chosen": 0.1007695347070694, "rewards/margins": 0.654672384262085, "rewards/rejected": -0.553902804851532, "step": 1529 }, { "epoch": 1.53, "grad_norm": 1.3859765529632568, "learning_rate": 5.875e-07, "logits/chosen": 0.56316077709198, "logits/rejected": 0.9909462928771973, "logps/chosen": -231.7544403076172, "logps/rejected": -196.60714721679688, "loss": 0.4506, "rewards/accuracies": 0.75, "rewards/chosen": 0.1015707403421402, "rewards/margins": 0.8938732147216797, "rewards/rejected": -0.7923025488853455, "step": 1530 }, { "epoch": 1.5310000000000001, "grad_norm": 1.9787440299987793, "learning_rate": 5.8625e-07, "logits/chosen": 0.8815635442733765, "logits/rejected": 0.310629665851593, "logps/chosen": -217.56375122070312, "logps/rejected": -185.17620849609375, "loss": 0.6744, "rewards/accuracies": 0.625, "rewards/chosen": -0.2207084745168686, "rewards/margins": 0.27466848492622375, "rewards/rejected": -0.4953770041465759, "step": 1531 }, { "epoch": 1.532, "grad_norm": 1.7833763360977173, "learning_rate": 5.849999999999999e-07, "logits/chosen": 0.40367698669433594, "logits/rejected": 1.252711534500122, "logps/chosen": -128.5850830078125, "logps/rejected": -316.5958251953125, "loss": 0.6288, "rewards/accuracies": 0.75, "rewards/chosen": -0.2346567064523697, "rewards/margins": 0.3169891834259033, "rewards/rejected": -0.5516458749771118, "step": 1532 }, { "epoch": 1.533, "grad_norm": 1.085974097251892, "learning_rate": 5.837499999999999e-07, "logits/chosen": 0.48182347416877747, "logits/rejected": 0.4512050151824951, "logps/chosen": -250.10531616210938, "logps/rejected": -215.7789764404297, "loss": 0.3745, "rewards/accuracies": 0.875, "rewards/chosen": 0.1848759651184082, "rewards/margins": 0.9715173244476318, "rewards/rejected": -0.7866412997245789, "step": 1533 }, { "epoch": 1.534, "grad_norm": 1.110121488571167, "learning_rate": 5.825e-07, "logits/chosen": 0.3840411901473999, "logits/rejected": 0.753925085067749, "logps/chosen": -142.81141662597656, "logps/rejected": -211.96253967285156, "loss": 0.5157, "rewards/accuracies": 0.875, "rewards/chosen": 0.030934907495975494, "rewards/margins": 0.899672269821167, "rewards/rejected": -0.8687372803688049, "step": 1534 }, { "epoch": 1.5350000000000001, "grad_norm": 1.6671924591064453, "learning_rate": 5.8125e-07, "logits/chosen": 0.8426387310028076, "logits/rejected": 0.9984213709831238, "logps/chosen": -344.2657165527344, "logps/rejected": -217.68702697753906, "loss": 0.5004, "rewards/accuracies": 0.75, "rewards/chosen": -0.23666277527809143, "rewards/margins": 0.537987232208252, "rewards/rejected": -0.7746500372886658, "step": 1535 }, { "epoch": 1.536, "grad_norm": 2.5559535026550293, "learning_rate": 5.8e-07, "logits/chosen": 0.5494533181190491, "logits/rejected": 0.4985315799713135, "logps/chosen": -255.65240478515625, "logps/rejected": -188.99169921875, "loss": 0.7433, "rewards/accuracies": 0.5, "rewards/chosen": -0.4085816740989685, "rewards/margins": 0.1575041115283966, "rewards/rejected": -0.566085696220398, "step": 1536 }, { "epoch": 1.537, "grad_norm": 1.797321081161499, "learning_rate": 5.7875e-07, "logits/chosen": 0.5260075330734253, "logits/rejected": 0.8243385553359985, "logps/chosen": -179.82814025878906, "logps/rejected": -178.41229248046875, "loss": 0.9055, "rewards/accuracies": 0.375, "rewards/chosen": -0.5226670503616333, "rewards/margins": -0.16180089116096497, "rewards/rejected": -0.3608661890029907, "step": 1537 }, { "epoch": 1.538, "grad_norm": 1.312278389930725, "learning_rate": 5.775e-07, "logits/chosen": 1.1059679985046387, "logits/rejected": 1.1791969537734985, "logps/chosen": -255.10934448242188, "logps/rejected": -244.41452026367188, "loss": 0.5018, "rewards/accuracies": 0.875, "rewards/chosen": -0.20950479805469513, "rewards/margins": 0.5254867672920227, "rewards/rejected": -0.7349915504455566, "step": 1538 }, { "epoch": 1.5390000000000001, "grad_norm": 1.3411939144134521, "learning_rate": 5.7625e-07, "logits/chosen": 0.9910305738449097, "logits/rejected": 0.39663147926330566, "logps/chosen": -192.22555541992188, "logps/rejected": -167.2235870361328, "loss": 0.5882, "rewards/accuracies": 0.625, "rewards/chosen": 0.021416861563920975, "rewards/margins": 0.530985951423645, "rewards/rejected": -0.5095690488815308, "step": 1539 }, { "epoch": 1.54, "grad_norm": 1.5956101417541504, "learning_rate": 5.749999999999999e-07, "logits/chosen": 1.0921813249588013, "logits/rejected": 0.9503743052482605, "logps/chosen": -263.7242736816406, "logps/rejected": -246.37060546875, "loss": 0.6902, "rewards/accuracies": 0.5, "rewards/chosen": -0.3594663739204407, "rewards/margins": 0.22653527557849884, "rewards/rejected": -0.5860016345977783, "step": 1540 }, { "epoch": 1.541, "grad_norm": 1.5958130359649658, "learning_rate": 5.737499999999999e-07, "logits/chosen": 0.7761889100074768, "logits/rejected": 0.4000634551048279, "logps/chosen": -184.584228515625, "logps/rejected": -176.7926025390625, "loss": 0.7114, "rewards/accuracies": 0.625, "rewards/chosen": -0.2770897150039673, "rewards/margins": 0.2861531972885132, "rewards/rejected": -0.5632429122924805, "step": 1541 }, { "epoch": 1.542, "grad_norm": 2.1948509216308594, "learning_rate": 5.725e-07, "logits/chosen": 0.5319662094116211, "logits/rejected": 0.8207015991210938, "logps/chosen": -203.17947387695312, "logps/rejected": -232.83538818359375, "loss": 0.8239, "rewards/accuracies": 0.375, "rewards/chosen": -0.3322344124317169, "rewards/margins": -0.055127352476119995, "rewards/rejected": -0.2771070599555969, "step": 1542 }, { "epoch": 1.5430000000000001, "grad_norm": 1.241180419921875, "learning_rate": 5.7125e-07, "logits/chosen": 0.9023712873458862, "logits/rejected": 0.9627875089645386, "logps/chosen": -277.3453369140625, "logps/rejected": -232.06297302246094, "loss": 0.4183, "rewards/accuracies": 0.75, "rewards/chosen": 0.07646191120147705, "rewards/margins": 0.8805078864097595, "rewards/rejected": -0.8040460348129272, "step": 1543 }, { "epoch": 1.544, "grad_norm": 1.9143502712249756, "learning_rate": 5.699999999999999e-07, "logits/chosen": 0.8032559156417847, "logits/rejected": 0.6509256362915039, "logps/chosen": -345.915771484375, "logps/rejected": -171.17153930664062, "loss": 0.6536, "rewards/accuracies": 0.625, "rewards/chosen": -0.16680940985679626, "rewards/margins": 0.44276130199432373, "rewards/rejected": -0.6095706820487976, "step": 1544 }, { "epoch": 1.545, "grad_norm": 1.7477978467941284, "learning_rate": 5.6875e-07, "logits/chosen": 1.0230066776275635, "logits/rejected": 0.4323039650917053, "logps/chosen": -281.9122314453125, "logps/rejected": -199.1446533203125, "loss": 0.7194, "rewards/accuracies": 0.625, "rewards/chosen": -0.18229436874389648, "rewards/margins": 0.2143116146326065, "rewards/rejected": -0.3966059684753418, "step": 1545 }, { "epoch": 1.546, "grad_norm": 1.335503101348877, "learning_rate": 5.675e-07, "logits/chosen": 0.38469910621643066, "logits/rejected": 0.7293872833251953, "logps/chosen": -134.6124725341797, "logps/rejected": -210.26002502441406, "loss": 0.3934, "rewards/accuracies": 0.75, "rewards/chosen": 0.2569518983364105, "rewards/margins": 0.9172809720039368, "rewards/rejected": -0.6603291034698486, "step": 1546 }, { "epoch": 1.5470000000000002, "grad_norm": 1.6019331216812134, "learning_rate": 5.6625e-07, "logits/chosen": 1.1823545694351196, "logits/rejected": 0.5297802090644836, "logps/chosen": -382.4652099609375, "logps/rejected": -234.85519409179688, "loss": 0.6442, "rewards/accuracies": 0.75, "rewards/chosen": 0.1188322976231575, "rewards/margins": 0.6120856404304504, "rewards/rejected": -0.49325335025787354, "step": 1547 }, { "epoch": 1.548, "grad_norm": 2.321380376815796, "learning_rate": 5.649999999999999e-07, "logits/chosen": 0.8482381701469421, "logits/rejected": 0.15385104715824127, "logps/chosen": -197.32554626464844, "logps/rejected": -144.26161193847656, "loss": 0.7298, "rewards/accuracies": 0.375, "rewards/chosen": -0.33904534578323364, "rewards/margins": 0.04555961489677429, "rewards/rejected": -0.38460493087768555, "step": 1548 }, { "epoch": 1.549, "grad_norm": 1.6703211069107056, "learning_rate": 5.637499999999999e-07, "logits/chosen": 0.8052096366882324, "logits/rejected": 0.34839683771133423, "logps/chosen": -305.0067443847656, "logps/rejected": -174.86744689941406, "loss": 0.8295, "rewards/accuracies": 0.5, "rewards/chosen": -0.3475554585456848, "rewards/margins": -0.08039581775665283, "rewards/rejected": -0.267159640789032, "step": 1549 }, { "epoch": 1.55, "grad_norm": 1.274648666381836, "learning_rate": 5.625e-07, "logits/chosen": 0.8610935211181641, "logits/rejected": 0.5620766878128052, "logps/chosen": -180.0436553955078, "logps/rejected": -230.1593017578125, "loss": 0.542, "rewards/accuracies": 0.75, "rewards/chosen": -0.04884509742259979, "rewards/margins": 0.6952531933784485, "rewards/rejected": -0.7440983057022095, "step": 1550 }, { "epoch": 1.5510000000000002, "grad_norm": 1.5112076997756958, "learning_rate": 5.6125e-07, "logits/chosen": 0.746974527835846, "logits/rejected": 0.36909061670303345, "logps/chosen": -301.1734924316406, "logps/rejected": -151.80557250976562, "loss": 0.5177, "rewards/accuracies": 0.625, "rewards/chosen": -0.1399211436510086, "rewards/margins": 0.727028489112854, "rewards/rejected": -0.8669496774673462, "step": 1551 }, { "epoch": 1.552, "grad_norm": 1.093327522277832, "learning_rate": 5.6e-07, "logits/chosen": 1.075130820274353, "logits/rejected": 1.1048517227172852, "logps/chosen": -360.7198791503906, "logps/rejected": -209.15296936035156, "loss": 0.3143, "rewards/accuracies": 1.0, "rewards/chosen": 0.19846487045288086, "rewards/margins": 1.100075602531433, "rewards/rejected": -0.901610791683197, "step": 1552 }, { "epoch": 1.553, "grad_norm": 1.0936734676361084, "learning_rate": 5.587499999999999e-07, "logits/chosen": 0.5738826990127563, "logits/rejected": 0.3327639698982239, "logps/chosen": -402.3553161621094, "logps/rejected": -198.22422790527344, "loss": 0.375, "rewards/accuracies": 0.875, "rewards/chosen": 0.12460555881261826, "rewards/margins": 0.9814390540122986, "rewards/rejected": -0.8568334579467773, "step": 1553 }, { "epoch": 1.554, "grad_norm": 1.574804663658142, "learning_rate": 5.575e-07, "logits/chosen": 0.8231016397476196, "logits/rejected": 0.5855071544647217, "logps/chosen": -188.6795654296875, "logps/rejected": -213.07427978515625, "loss": 0.613, "rewards/accuracies": 0.75, "rewards/chosen": -0.07653865963220596, "rewards/margins": 0.42213374376296997, "rewards/rejected": -0.4986724257469177, "step": 1554 }, { "epoch": 1.5550000000000002, "grad_norm": 2.4065825939178467, "learning_rate": 5.5625e-07, "logits/chosen": 0.5874152779579163, "logits/rejected": 1.0069419145584106, "logps/chosen": -145.2845458984375, "logps/rejected": -250.33355712890625, "loss": 0.4546, "rewards/accuracies": 0.75, "rewards/chosen": -0.05384978652000427, "rewards/margins": 0.7543731331825256, "rewards/rejected": -0.8082230091094971, "step": 1555 }, { "epoch": 1.556, "grad_norm": 1.8787857294082642, "learning_rate": 5.55e-07, "logits/chosen": 0.7316951155662537, "logits/rejected": 0.8617544770240784, "logps/chosen": -172.18246459960938, "logps/rejected": -197.39407348632812, "loss": 0.7554, "rewards/accuracies": 0.5, "rewards/chosen": -0.18086014688014984, "rewards/margins": 0.16317148506641388, "rewards/rejected": -0.3440316617488861, "step": 1556 }, { "epoch": 1.557, "grad_norm": 1.5060125589370728, "learning_rate": 5.5375e-07, "logits/chosen": 0.7973343729972839, "logits/rejected": 1.1583144664764404, "logps/chosen": -209.24017333984375, "logps/rejected": -234.40817260742188, "loss": 0.702, "rewards/accuracies": 0.625, "rewards/chosen": -0.2957184910774231, "rewards/margins": 0.09993526339530945, "rewards/rejected": -0.39565372467041016, "step": 1557 }, { "epoch": 1.558, "grad_norm": 1.1346383094787598, "learning_rate": 5.525e-07, "logits/chosen": 1.099133014678955, "logits/rejected": 1.1934620141983032, "logps/chosen": -233.17922973632812, "logps/rejected": -226.27252197265625, "loss": 0.3634, "rewards/accuracies": 0.875, "rewards/chosen": 0.1679423451423645, "rewards/margins": 0.9928398728370667, "rewards/rejected": -0.8248975872993469, "step": 1558 }, { "epoch": 1.5590000000000002, "grad_norm": 1.1083683967590332, "learning_rate": 5.5125e-07, "logits/chosen": 0.9116238355636597, "logits/rejected": 0.31816065311431885, "logps/chosen": -229.93255615234375, "logps/rejected": -223.5911407470703, "loss": 0.5099, "rewards/accuracies": 0.75, "rewards/chosen": -0.01577044278383255, "rewards/margins": 0.7262900471687317, "rewards/rejected": -0.7420604228973389, "step": 1559 }, { "epoch": 1.56, "grad_norm": 2.213789939880371, "learning_rate": 5.5e-07, "logits/chosen": 1.1508445739746094, "logits/rejected": 0.1620720624923706, "logps/chosen": -231.75526428222656, "logps/rejected": -127.57050323486328, "loss": 0.7509, "rewards/accuracies": 0.625, "rewards/chosen": -0.15809279680252075, "rewards/margins": 0.13566748797893524, "rewards/rejected": -0.2937602996826172, "step": 1560 }, { "epoch": 1.561, "grad_norm": 1.225342035293579, "learning_rate": 5.487499999999999e-07, "logits/chosen": 1.0134916305541992, "logits/rejected": 0.7709360122680664, "logps/chosen": -300.97967529296875, "logps/rejected": -243.98614501953125, "loss": 0.3932, "rewards/accuracies": 0.875, "rewards/chosen": 0.3669919967651367, "rewards/margins": 1.0974063873291016, "rewards/rejected": -0.7304142713546753, "step": 1561 }, { "epoch": 1.562, "grad_norm": 1.069129228591919, "learning_rate": 5.474999999999999e-07, "logits/chosen": 1.0481982231140137, "logits/rejected": 0.5206572413444519, "logps/chosen": -228.0587158203125, "logps/rejected": -194.5350341796875, "loss": 0.4033, "rewards/accuracies": 0.75, "rewards/chosen": -0.05710944905877113, "rewards/margins": 0.8551846742630005, "rewards/rejected": -0.9122941493988037, "step": 1562 }, { "epoch": 1.563, "grad_norm": 1.034360647201538, "learning_rate": 5.4625e-07, "logits/chosen": 0.99156254529953, "logits/rejected": 0.8357526659965515, "logps/chosen": -292.2571716308594, "logps/rejected": -219.71041870117188, "loss": 0.4634, "rewards/accuracies": 0.75, "rewards/chosen": -0.038962654769420624, "rewards/margins": 1.039175033569336, "rewards/rejected": -1.0781376361846924, "step": 1563 }, { "epoch": 1.564, "grad_norm": 1.4404217004776, "learning_rate": 5.45e-07, "logits/chosen": 0.49347078800201416, "logits/rejected": 0.4688453674316406, "logps/chosen": -209.17971801757812, "logps/rejected": -165.48635864257812, "loss": 0.564, "rewards/accuracies": 0.625, "rewards/chosen": -0.16393297910690308, "rewards/margins": 0.38678228855133057, "rewards/rejected": -0.5507152080535889, "step": 1564 }, { "epoch": 1.565, "grad_norm": 1.6884608268737793, "learning_rate": 5.4375e-07, "logits/chosen": 0.5985822081565857, "logits/rejected": 0.20447513461112976, "logps/chosen": -210.04266357421875, "logps/rejected": -179.55499267578125, "loss": 0.623, "rewards/accuracies": 0.75, "rewards/chosen": -0.0811111330986023, "rewards/margins": 0.47195369005203247, "rewards/rejected": -0.5530648231506348, "step": 1565 }, { "epoch": 1.5659999999999998, "grad_norm": 2.637011766433716, "learning_rate": 5.425e-07, "logits/chosen": 0.35036274790763855, "logits/rejected": 0.5469360947608948, "logps/chosen": -117.81365203857422, "logps/rejected": -205.39163208007812, "loss": 0.6153, "rewards/accuracies": 0.75, "rewards/chosen": -0.11777451634407043, "rewards/margins": 0.5604389309883118, "rewards/rejected": -0.6782134175300598, "step": 1566 }, { "epoch": 1.567, "grad_norm": 1.1011465787887573, "learning_rate": 5.4125e-07, "logits/chosen": 0.9711023569107056, "logits/rejected": 1.2157255411148071, "logps/chosen": -268.8255920410156, "logps/rejected": -250.1524658203125, "loss": 0.4515, "rewards/accuracies": 0.75, "rewards/chosen": 0.07460766285657883, "rewards/margins": 0.8992657661437988, "rewards/rejected": -0.8246581554412842, "step": 1567 }, { "epoch": 1.568, "grad_norm": 1.3121711015701294, "learning_rate": 5.4e-07, "logits/chosen": 0.5051148533821106, "logits/rejected": 0.6882556676864624, "logps/chosen": -220.90521240234375, "logps/rejected": -215.9794921875, "loss": 0.4557, "rewards/accuracies": 0.875, "rewards/chosen": 0.07858877629041672, "rewards/margins": 0.6446248292922974, "rewards/rejected": -0.5660360455513, "step": 1568 }, { "epoch": 1.569, "grad_norm": 2.0485453605651855, "learning_rate": 5.387499999999999e-07, "logits/chosen": 0.567615807056427, "logits/rejected": 0.5642049312591553, "logps/chosen": -308.47705078125, "logps/rejected": -228.63931274414062, "loss": 0.3813, "rewards/accuracies": 0.875, "rewards/chosen": 0.11120186001062393, "rewards/margins": 1.050244688987732, "rewards/rejected": -0.939042866230011, "step": 1569 }, { "epoch": 1.5699999999999998, "grad_norm": 1.4678573608398438, "learning_rate": 5.374999999999999e-07, "logits/chosen": 1.0344345569610596, "logits/rejected": 1.1406208276748657, "logps/chosen": -190.5001220703125, "logps/rejected": -241.62498474121094, "loss": 0.5741, "rewards/accuracies": 0.625, "rewards/chosen": -0.04546976089477539, "rewards/margins": 0.5997985005378723, "rewards/rejected": -0.6452682614326477, "step": 1570 }, { "epoch": 1.571, "grad_norm": 1.4154664278030396, "learning_rate": 5.3625e-07, "logits/chosen": 1.1666128635406494, "logits/rejected": 0.890588641166687, "logps/chosen": -263.13641357421875, "logps/rejected": -196.8474578857422, "loss": 0.4925, "rewards/accuracies": 0.75, "rewards/chosen": -0.17337504029273987, "rewards/margins": 0.7944075465202332, "rewards/rejected": -0.9677826166152954, "step": 1571 }, { "epoch": 1.572, "grad_norm": 1.5513412952423096, "learning_rate": 5.35e-07, "logits/chosen": 0.9272554516792297, "logits/rejected": 0.8022012710571289, "logps/chosen": -202.7754364013672, "logps/rejected": -189.21148681640625, "loss": 0.5377, "rewards/accuracies": 0.75, "rewards/chosen": -0.10610420256853104, "rewards/margins": 0.42254772782325745, "rewards/rejected": -0.5286518931388855, "step": 1572 }, { "epoch": 1.573, "grad_norm": 1.083047866821289, "learning_rate": 5.3375e-07, "logits/chosen": 0.43895962834358215, "logits/rejected": 0.5331488847732544, "logps/chosen": -186.4896697998047, "logps/rejected": -184.50421142578125, "loss": 0.3492, "rewards/accuracies": 1.0, "rewards/chosen": 0.08301784098148346, "rewards/margins": 1.037026047706604, "rewards/rejected": -0.954008162021637, "step": 1573 }, { "epoch": 1.5739999999999998, "grad_norm": 1.087281584739685, "learning_rate": 5.325e-07, "logits/chosen": 0.5978398323059082, "logits/rejected": 1.292940378189087, "logps/chosen": -120.74457550048828, "logps/rejected": -228.20262145996094, "loss": 0.4328, "rewards/accuracies": 0.75, "rewards/chosen": 0.1162453219294548, "rewards/margins": 0.7199656963348389, "rewards/rejected": -0.6037204265594482, "step": 1574 }, { "epoch": 1.575, "grad_norm": 0.8384570479393005, "learning_rate": 5.3125e-07, "logits/chosen": 0.8547197580337524, "logits/rejected": 0.4029054045677185, "logps/chosen": -245.30429077148438, "logps/rejected": -147.22091674804688, "loss": 0.291, "rewards/accuracies": 1.0, "rewards/chosen": 0.2074592560529709, "rewards/margins": 1.245100498199463, "rewards/rejected": -1.0376410484313965, "step": 1575 }, { "epoch": 1.576, "grad_norm": 1.211897373199463, "learning_rate": 5.3e-07, "logits/chosen": 0.9181674718856812, "logits/rejected": 0.4586753845214844, "logps/chosen": -210.09799194335938, "logps/rejected": -174.5371551513672, "loss": 0.4482, "rewards/accuracies": 0.875, "rewards/chosen": 0.1731867790222168, "rewards/margins": 0.7960940599441528, "rewards/rejected": -0.622907280921936, "step": 1576 }, { "epoch": 1.577, "grad_norm": 1.4548513889312744, "learning_rate": 5.2875e-07, "logits/chosen": 0.5691368579864502, "logits/rejected": 0.5155277848243713, "logps/chosen": -293.88250732421875, "logps/rejected": -162.6005859375, "loss": 0.6368, "rewards/accuracies": 0.625, "rewards/chosen": -0.019720062613487244, "rewards/margins": 0.5390408635139465, "rewards/rejected": -0.558760941028595, "step": 1577 }, { "epoch": 1.5779999999999998, "grad_norm": 1.7298657894134521, "learning_rate": 5.274999999999999e-07, "logits/chosen": 0.6559510827064514, "logits/rejected": 0.9455639123916626, "logps/chosen": -175.23939514160156, "logps/rejected": -203.05599975585938, "loss": 0.5512, "rewards/accuracies": 0.75, "rewards/chosen": -0.1574355959892273, "rewards/margins": 0.6527374982833862, "rewards/rejected": -0.810172975063324, "step": 1578 }, { "epoch": 1.579, "grad_norm": 1.7442562580108643, "learning_rate": 5.262499999999999e-07, "logits/chosen": 0.5379511117935181, "logits/rejected": 0.36697402596473694, "logps/chosen": -161.92823791503906, "logps/rejected": -236.4819793701172, "loss": 0.6716, "rewards/accuracies": 0.5, "rewards/chosen": -0.22057273983955383, "rewards/margins": 0.3160741925239563, "rewards/rejected": -0.5366469621658325, "step": 1579 }, { "epoch": 1.58, "grad_norm": 1.1986433267593384, "learning_rate": 5.25e-07, "logits/chosen": 0.37178003787994385, "logits/rejected": 0.7433163523674011, "logps/chosen": -137.4705047607422, "logps/rejected": -213.9710235595703, "loss": 0.3759, "rewards/accuracies": 0.875, "rewards/chosen": 0.1713801473379135, "rewards/margins": 0.9814348220825195, "rewards/rejected": -0.8100547194480896, "step": 1580 }, { "epoch": 1.581, "grad_norm": 2.2632229328155518, "learning_rate": 5.237500000000001e-07, "logits/chosen": 0.7908750772476196, "logits/rejected": 0.043988801538944244, "logps/chosen": -265.2799072265625, "logps/rejected": -169.09417724609375, "loss": 0.7188, "rewards/accuracies": 0.625, "rewards/chosen": -0.40192341804504395, "rewards/margins": 0.06900300830602646, "rewards/rejected": -0.4709264039993286, "step": 1581 }, { "epoch": 1.5819999999999999, "grad_norm": 1.538622260093689, "learning_rate": 5.225e-07, "logits/chosen": 0.8150578141212463, "logits/rejected": 0.8553760051727295, "logps/chosen": -169.4733428955078, "logps/rejected": -173.09117126464844, "loss": 0.6289, "rewards/accuracies": 0.75, "rewards/chosen": -0.13619089126586914, "rewards/margins": 0.3628886342048645, "rewards/rejected": -0.49907955527305603, "step": 1582 }, { "epoch": 1.583, "grad_norm": 1.2707957029342651, "learning_rate": 5.2125e-07, "logits/chosen": 0.5382089614868164, "logits/rejected": 0.4305916726589203, "logps/chosen": -268.30157470703125, "logps/rejected": -183.53636169433594, "loss": 0.3336, "rewards/accuracies": 0.875, "rewards/chosen": 0.13418009877204895, "rewards/margins": 1.1276748180389404, "rewards/rejected": -0.9934946894645691, "step": 1583 }, { "epoch": 1.584, "grad_norm": 1.1534332036972046, "learning_rate": 5.2e-07, "logits/chosen": 1.2665220499038696, "logits/rejected": 1.1388620138168335, "logps/chosen": -267.3429870605469, "logps/rejected": -213.93081665039062, "loss": 0.399, "rewards/accuracies": 0.875, "rewards/chosen": 0.11149073392152786, "rewards/margins": 1.1631323099136353, "rewards/rejected": -1.051641583442688, "step": 1584 }, { "epoch": 1.585, "grad_norm": 1.1626896858215332, "learning_rate": 5.1875e-07, "logits/chosen": 0.601845383644104, "logits/rejected": 0.6363790035247803, "logps/chosen": -208.21981811523438, "logps/rejected": -190.766357421875, "loss": 0.5524, "rewards/accuracies": 0.875, "rewards/chosen": -0.12116508930921555, "rewards/margins": 0.5466978549957275, "rewards/rejected": -0.6678628921508789, "step": 1585 }, { "epoch": 1.5859999999999999, "grad_norm": 1.2727373838424683, "learning_rate": 5.174999999999999e-07, "logits/chosen": 0.758233904838562, "logits/rejected": -0.08061446994543076, "logps/chosen": -315.635498046875, "logps/rejected": -147.48367309570312, "loss": 0.3861, "rewards/accuracies": 0.875, "rewards/chosen": 0.09265965968370438, "rewards/margins": 0.9226577877998352, "rewards/rejected": -0.8299981355667114, "step": 1586 }, { "epoch": 1.587, "grad_norm": 1.0563807487487793, "learning_rate": 5.162499999999999e-07, "logits/chosen": 0.8499429225921631, "logits/rejected": 1.2508190870285034, "logps/chosen": -300.05615234375, "logps/rejected": -184.81973266601562, "loss": 0.3125, "rewards/accuracies": 0.875, "rewards/chosen": 0.2031828910112381, "rewards/margins": 1.2249810695648193, "rewards/rejected": -1.0217981338500977, "step": 1587 }, { "epoch": 1.588, "grad_norm": 1.396654486656189, "learning_rate": 5.149999999999999e-07, "logits/chosen": 0.3441028594970703, "logits/rejected": 0.4078998863697052, "logps/chosen": -186.55856323242188, "logps/rejected": -216.26063537597656, "loss": 0.5914, "rewards/accuracies": 0.75, "rewards/chosen": 0.016343124210834503, "rewards/margins": 0.37875375151634216, "rewards/rejected": -0.36241063475608826, "step": 1588 }, { "epoch": 1.589, "grad_norm": 1.3374398946762085, "learning_rate": 5.137500000000001e-07, "logits/chosen": 1.3899586200714111, "logits/rejected": 0.895106852054596, "logps/chosen": -202.24575805664062, "logps/rejected": -171.4456024169922, "loss": 0.4861, "rewards/accuracies": 0.75, "rewards/chosen": 0.10181998461484909, "rewards/margins": 0.9152944684028625, "rewards/rejected": -0.8134744763374329, "step": 1589 }, { "epoch": 1.5899999999999999, "grad_norm": 1.4414806365966797, "learning_rate": 5.125e-07, "logits/chosen": 0.8830114006996155, "logits/rejected": 0.8184061050415039, "logps/chosen": -352.78045654296875, "logps/rejected": -183.06790161132812, "loss": 0.6235, "rewards/accuracies": 0.75, "rewards/chosen": -0.2281421720981598, "rewards/margins": 0.23221272230148315, "rewards/rejected": -0.46035489439964294, "step": 1590 }, { "epoch": 1.591, "grad_norm": 1.5171213150024414, "learning_rate": 5.1125e-07, "logits/chosen": 0.3751501441001892, "logits/rejected": 1.2645732164382935, "logps/chosen": -140.6444854736328, "logps/rejected": -257.652099609375, "loss": 0.6372, "rewards/accuracies": 0.5, "rewards/chosen": -0.08150997757911682, "rewards/margins": 0.4137965440750122, "rewards/rejected": -0.49530652165412903, "step": 1591 }, { "epoch": 1.592, "grad_norm": 1.06696355342865, "learning_rate": 5.1e-07, "logits/chosen": 0.8484474420547485, "logits/rejected": 0.38932129740715027, "logps/chosen": -218.4700927734375, "logps/rejected": -161.32369995117188, "loss": 0.4892, "rewards/accuracies": 0.875, "rewards/chosen": -0.001978576183319092, "rewards/margins": 0.6818371415138245, "rewards/rejected": -0.6838157176971436, "step": 1592 }, { "epoch": 1.593, "grad_norm": 1.1709678173065186, "learning_rate": 5.0875e-07, "logits/chosen": 0.6556435823440552, "logits/rejected": 0.7382711172103882, "logps/chosen": -176.73170471191406, "logps/rejected": -240.3316650390625, "loss": 0.4614, "rewards/accuracies": 0.875, "rewards/chosen": 0.052446894347667694, "rewards/margins": 0.759585440158844, "rewards/rejected": -0.7071385383605957, "step": 1593 }, { "epoch": 1.5939999999999999, "grad_norm": 1.6860692501068115, "learning_rate": 5.074999999999999e-07, "logits/chosen": 0.949813961982727, "logits/rejected": 0.8615245819091797, "logps/chosen": -228.02828979492188, "logps/rejected": -182.806396484375, "loss": 0.6071, "rewards/accuracies": 0.75, "rewards/chosen": -0.1355191171169281, "rewards/margins": 0.4285391867160797, "rewards/rejected": -0.5640583038330078, "step": 1594 }, { "epoch": 1.595, "grad_norm": 0.9991126656532288, "learning_rate": 5.062499999999999e-07, "logits/chosen": 0.7975804805755615, "logits/rejected": 0.3545049726963043, "logps/chosen": -250.19931030273438, "logps/rejected": -185.13070678710938, "loss": 0.3124, "rewards/accuracies": 0.875, "rewards/chosen": 0.3448874354362488, "rewards/margins": 1.192323088645935, "rewards/rejected": -0.8474355936050415, "step": 1595 }, { "epoch": 1.596, "grad_norm": 1.786399006843567, "learning_rate": 5.049999999999999e-07, "logits/chosen": 0.5919554829597473, "logits/rejected": 0.6296700239181519, "logps/chosen": -275.12091064453125, "logps/rejected": -164.44659423828125, "loss": 0.5134, "rewards/accuracies": 0.75, "rewards/chosen": 0.09955157339572906, "rewards/margins": 0.9735186100006104, "rewards/rejected": -0.8739669322967529, "step": 1596 }, { "epoch": 1.597, "grad_norm": 1.2289695739746094, "learning_rate": 5.0375e-07, "logits/chosen": 0.7780284881591797, "logits/rejected": 0.4338618516921997, "logps/chosen": -315.5618896484375, "logps/rejected": -171.98155212402344, "loss": 0.4785, "rewards/accuracies": 0.75, "rewards/chosen": 0.15717563033103943, "rewards/margins": 0.6488312482833862, "rewards/rejected": -0.4916556477546692, "step": 1597 }, { "epoch": 1.5979999999999999, "grad_norm": 1.2616627216339111, "learning_rate": 5.025e-07, "logits/chosen": 0.9205986857414246, "logits/rejected": 1.131025791168213, "logps/chosen": -202.36105346679688, "logps/rejected": -253.451416015625, "loss": 0.5378, "rewards/accuracies": 0.75, "rewards/chosen": -0.3060951232910156, "rewards/margins": 0.42891788482666016, "rewards/rejected": -0.7350130081176758, "step": 1598 }, { "epoch": 1.599, "grad_norm": 1.5550618171691895, "learning_rate": 5.0125e-07, "logits/chosen": 0.5263956785202026, "logits/rejected": 0.6067065000534058, "logps/chosen": -148.2102813720703, "logps/rejected": -203.91807556152344, "loss": 0.5728, "rewards/accuracies": 0.75, "rewards/chosen": -0.3153356611728668, "rewards/margins": 0.5015134811401367, "rewards/rejected": -0.8168491125106812, "step": 1599 }, { "epoch": 1.6, "grad_norm": 1.4654008150100708, "learning_rate": 5e-07, "logits/chosen": 1.0366092920303345, "logits/rejected": 0.5249112844467163, "logps/chosen": -213.44151306152344, "logps/rejected": -174.10861206054688, "loss": 0.5063, "rewards/accuracies": 0.75, "rewards/chosen": -0.19147033989429474, "rewards/margins": 0.5994195938110352, "rewards/rejected": -0.7908899188041687, "step": 1600 }, { "epoch": 1.601, "grad_norm": 1.4705549478530884, "learning_rate": 4.9875e-07, "logits/chosen": 0.7677404880523682, "logits/rejected": 0.32563316822052, "logps/chosen": -246.22227478027344, "logps/rejected": -256.803466796875, "loss": 0.6923, "rewards/accuracies": 0.625, "rewards/chosen": -0.14426183700561523, "rewards/margins": 0.3270477056503296, "rewards/rejected": -0.4713096022605896, "step": 1601 }, { "epoch": 1.6019999999999999, "grad_norm": 1.037534475326538, "learning_rate": 4.975e-07, "logits/chosen": 0.8838670253753662, "logits/rejected": 0.22082939743995667, "logps/chosen": -227.13119506835938, "logps/rejected": -129.62335205078125, "loss": 0.417, "rewards/accuracies": 0.875, "rewards/chosen": 0.28071328997612, "rewards/margins": 0.9469521045684814, "rewards/rejected": -0.6662387847900391, "step": 1602 }, { "epoch": 1.603, "grad_norm": 1.3960016965866089, "learning_rate": 4.9625e-07, "logits/chosen": 0.645460307598114, "logits/rejected": 1.1543270349502563, "logps/chosen": -195.65133666992188, "logps/rejected": -265.63427734375, "loss": 0.5368, "rewards/accuracies": 0.875, "rewards/chosen": 0.015338331460952759, "rewards/margins": 0.5968034863471985, "rewards/rejected": -0.5814651250839233, "step": 1603 }, { "epoch": 1.604, "grad_norm": 1.7500711679458618, "learning_rate": 4.95e-07, "logits/chosen": 0.5808650851249695, "logits/rejected": 0.8409769535064697, "logps/chosen": -149.1014862060547, "logps/rejected": -220.53944396972656, "loss": 0.8155, "rewards/accuracies": 0.625, "rewards/chosen": -0.35838353633880615, "rewards/margins": 0.014583125710487366, "rewards/rejected": -0.3729666769504547, "step": 1604 }, { "epoch": 1.605, "grad_norm": 2.295604705810547, "learning_rate": 4.9375e-07, "logits/chosen": 0.5245009660720825, "logits/rejected": 0.9075722694396973, "logps/chosen": -132.2194061279297, "logps/rejected": -232.82333374023438, "loss": 0.7022, "rewards/accuracies": 0.625, "rewards/chosen": -0.06955939531326294, "rewards/margins": 0.23730257153511047, "rewards/rejected": -0.3068619668483734, "step": 1605 }, { "epoch": 1.6059999999999999, "grad_norm": 1.1052982807159424, "learning_rate": 4.924999999999999e-07, "logits/chosen": 0.9891985058784485, "logits/rejected": 0.5147380828857422, "logps/chosen": -289.8399963378906, "logps/rejected": -140.60107421875, "loss": 0.3929, "rewards/accuracies": 1.0, "rewards/chosen": 0.2069161981344223, "rewards/margins": 0.9412480592727661, "rewards/rejected": -0.7343319654464722, "step": 1606 }, { "epoch": 1.607, "grad_norm": 2.078439474105835, "learning_rate": 4.9125e-07, "logits/chosen": 0.9135196805000305, "logits/rejected": 0.6719605922698975, "logps/chosen": -276.75469970703125, "logps/rejected": -172.3477783203125, "loss": 0.9365, "rewards/accuracies": 0.5, "rewards/chosen": -0.23660364747047424, "rewards/margins": -0.11574910581111908, "rewards/rejected": -0.12085457146167755, "step": 1607 }, { "epoch": 1.608, "grad_norm": 1.72258722782135, "learning_rate": 4.9e-07, "logits/chosen": 0.3604295551776886, "logits/rejected": 0.8368610739707947, "logps/chosen": -147.57586669921875, "logps/rejected": -226.47325134277344, "loss": 0.7048, "rewards/accuracies": 0.625, "rewards/chosen": -0.1298293173313141, "rewards/margins": 0.48783183097839355, "rewards/rejected": -0.61766117811203, "step": 1608 }, { "epoch": 1.609, "grad_norm": 1.4858754873275757, "learning_rate": 4.8875e-07, "logits/chosen": 0.6577932834625244, "logits/rejected": 0.556419849395752, "logps/chosen": -147.25355529785156, "logps/rejected": -195.92608642578125, "loss": 0.6297, "rewards/accuracies": 0.625, "rewards/chosen": -0.16554279625415802, "rewards/margins": 0.3563847541809082, "rewards/rejected": -0.521927535533905, "step": 1609 }, { "epoch": 1.6099999999999999, "grad_norm": 1.1138249635696411, "learning_rate": 4.875e-07, "logits/chosen": 0.23302552103996277, "logits/rejected": 0.7756725549697876, "logps/chosen": -237.97906494140625, "logps/rejected": -192.75515747070312, "loss": 0.4105, "rewards/accuracies": 0.875, "rewards/chosen": 0.14424151182174683, "rewards/margins": 1.0390293598175049, "rewards/rejected": -0.8947877883911133, "step": 1610 }, { "epoch": 1.611, "grad_norm": 1.2576884031295776, "learning_rate": 4.8625e-07, "logits/chosen": 0.8890618681907654, "logits/rejected": 0.9496195316314697, "logps/chosen": -218.58541870117188, "logps/rejected": -262.32025146484375, "loss": 0.4867, "rewards/accuracies": 0.875, "rewards/chosen": -0.16732291877269745, "rewards/margins": 0.7368068695068359, "rewards/rejected": -0.9041297435760498, "step": 1611 }, { "epoch": 1.612, "grad_norm": 0.9606653451919556, "learning_rate": 4.85e-07, "logits/chosen": 0.13640877604484558, "logits/rejected": 0.45915305614471436, "logps/chosen": -273.1182861328125, "logps/rejected": -180.4254150390625, "loss": 0.3316, "rewards/accuracies": 1.0, "rewards/chosen": 0.20565477013587952, "rewards/margins": 1.0637176036834717, "rewards/rejected": -0.8580628037452698, "step": 1612 }, { "epoch": 1.613, "grad_norm": 1.126865267753601, "learning_rate": 4.8375e-07, "logits/chosen": 0.7830723524093628, "logits/rejected": 0.7482032775878906, "logps/chosen": -298.69256591796875, "logps/rejected": -219.29547119140625, "loss": 0.5034, "rewards/accuracies": 0.75, "rewards/chosen": 0.19811183214187622, "rewards/margins": 0.8410624265670776, "rewards/rejected": -0.6429506540298462, "step": 1613 }, { "epoch": 1.6139999999999999, "grad_norm": 2.0176942348480225, "learning_rate": 4.824999999999999e-07, "logits/chosen": 0.8216218948364258, "logits/rejected": 0.7437022924423218, "logps/chosen": -171.60781860351562, "logps/rejected": -177.19149780273438, "loss": 0.7608, "rewards/accuracies": 0.5, "rewards/chosen": -0.23228301107883453, "rewards/margins": 0.4085353910923004, "rewards/rejected": -0.6408184170722961, "step": 1614 }, { "epoch": 1.615, "grad_norm": 1.2809174060821533, "learning_rate": 4.812499999999999e-07, "logits/chosen": 1.361405849456787, "logits/rejected": 0.8864692449569702, "logps/chosen": -306.4669494628906, "logps/rejected": -190.31765747070312, "loss": 0.4843, "rewards/accuracies": 0.75, "rewards/chosen": 0.19521674513816833, "rewards/margins": 0.6441802978515625, "rewards/rejected": -0.44896355271339417, "step": 1615 }, { "epoch": 1.616, "grad_norm": 2.0274646282196045, "learning_rate": 4.8e-07, "logits/chosen": 0.840550422668457, "logits/rejected": 0.8063107132911682, "logps/chosen": -298.9013977050781, "logps/rejected": -172.83123779296875, "loss": 0.6737, "rewards/accuracies": 0.625, "rewards/chosen": -0.28245171904563904, "rewards/margins": 0.6181302070617676, "rewards/rejected": -0.9005818963050842, "step": 1616 }, { "epoch": 1.617, "grad_norm": 1.6367508172988892, "learning_rate": 4.7875e-07, "logits/chosen": 1.2933648824691772, "logits/rejected": 0.6262969374656677, "logps/chosen": -300.4711608886719, "logps/rejected": -136.60948181152344, "loss": 0.4977, "rewards/accuracies": 0.75, "rewards/chosen": -0.05483999103307724, "rewards/margins": 0.573809027671814, "rewards/rejected": -0.6286490559577942, "step": 1617 }, { "epoch": 1.6179999999999999, "grad_norm": 1.2169796228408813, "learning_rate": 4.775e-07, "logits/chosen": 0.9956185817718506, "logits/rejected": 0.8867989778518677, "logps/chosen": -269.9835510253906, "logps/rejected": -225.90689086914062, "loss": 0.4887, "rewards/accuracies": 0.75, "rewards/chosen": 0.20416831970214844, "rewards/margins": 0.6712733507156372, "rewards/rejected": -0.46710512042045593, "step": 1618 }, { "epoch": 1.619, "grad_norm": 1.7588660717010498, "learning_rate": 4.7625e-07, "logits/chosen": 0.4973166882991791, "logits/rejected": 0.8058808445930481, "logps/chosen": -259.187255859375, "logps/rejected": -242.4773406982422, "loss": 0.7404, "rewards/accuracies": 0.625, "rewards/chosen": -0.3381613790988922, "rewards/margins": 0.17080259323120117, "rewards/rejected": -0.508963942527771, "step": 1619 }, { "epoch": 1.62, "grad_norm": 2.109159469604492, "learning_rate": 4.7499999999999995e-07, "logits/chosen": 1.0073459148406982, "logits/rejected": 0.9578939080238342, "logps/chosen": -340.98992919921875, "logps/rejected": -260.7259521484375, "loss": 0.547, "rewards/accuracies": 0.625, "rewards/chosen": -0.1070978194475174, "rewards/margins": 0.6082642078399658, "rewards/rejected": -0.715362012386322, "step": 1620 }, { "epoch": 1.621, "grad_norm": 1.76665198802948, "learning_rate": 4.7374999999999996e-07, "logits/chosen": 1.0750975608825684, "logits/rejected": 0.4953741431236267, "logps/chosen": -219.9033203125, "logps/rejected": -141.3267364501953, "loss": 0.5989, "rewards/accuracies": 0.625, "rewards/chosen": -0.15364569425582886, "rewards/margins": 0.5576308369636536, "rewards/rejected": -0.7112765312194824, "step": 1621 }, { "epoch": 1.6219999999999999, "grad_norm": 1.5146902799606323, "learning_rate": 4.725e-07, "logits/chosen": 0.7096942663192749, "logits/rejected": 1.255096435546875, "logps/chosen": -283.9905700683594, "logps/rejected": -211.17665100097656, "loss": 0.7276, "rewards/accuracies": 0.75, "rewards/chosen": -0.3457258343696594, "rewards/margins": 0.16428892314434052, "rewards/rejected": -0.5100147128105164, "step": 1622 }, { "epoch": 1.623, "grad_norm": 1.9152100086212158, "learning_rate": 4.7125e-07, "logits/chosen": 0.7711881995201111, "logits/rejected": 0.6579387784004211, "logps/chosen": -252.95816040039062, "logps/rejected": -177.72837829589844, "loss": 0.6502, "rewards/accuracies": 0.5, "rewards/chosen": -0.030137818306684494, "rewards/margins": 0.37537339329719543, "rewards/rejected": -0.40551120042800903, "step": 1623 }, { "epoch": 1.624, "grad_norm": 1.1963212490081787, "learning_rate": 4.6999999999999995e-07, "logits/chosen": 1.0084387063980103, "logits/rejected": 0.61052405834198, "logps/chosen": -203.5374755859375, "logps/rejected": -155.5756072998047, "loss": 0.4943, "rewards/accuracies": 0.75, "rewards/chosen": -0.029750823974609375, "rewards/margins": 0.6814650893211365, "rewards/rejected": -0.7112159132957458, "step": 1624 }, { "epoch": 1.625, "grad_norm": 1.4434278011322021, "learning_rate": 4.6874999999999996e-07, "logits/chosen": 0.5604917407035828, "logits/rejected": 0.5134961009025574, "logps/chosen": -225.07229614257812, "logps/rejected": -189.08990478515625, "loss": 0.6867, "rewards/accuracies": 0.5, "rewards/chosen": -0.2092108577489853, "rewards/margins": 0.2816039025783539, "rewards/rejected": -0.49081480503082275, "step": 1625 }, { "epoch": 1.626, "grad_norm": 1.3671512603759766, "learning_rate": 4.675e-07, "logits/chosen": 1.0442485809326172, "logits/rejected": 0.8724249601364136, "logps/chosen": -252.03952026367188, "logps/rejected": -203.857666015625, "loss": 0.5481, "rewards/accuracies": 0.625, "rewards/chosen": 0.2081564962863922, "rewards/margins": 0.6808065176010132, "rewards/rejected": -0.47265005111694336, "step": 1626 }, { "epoch": 1.627, "grad_norm": 2.7353861331939697, "learning_rate": 4.6625e-07, "logits/chosen": 1.0996782779693604, "logits/rejected": 0.8827647566795349, "logps/chosen": -302.73626708984375, "logps/rejected": -178.0133819580078, "loss": 0.7674, "rewards/accuracies": 0.5, "rewards/chosen": -0.30376508831977844, "rewards/margins": 0.13323792815208435, "rewards/rejected": -0.4370030164718628, "step": 1627 }, { "epoch": 1.6280000000000001, "grad_norm": 2.461371421813965, "learning_rate": 4.65e-07, "logits/chosen": 0.4530395269393921, "logits/rejected": 0.9596747159957886, "logps/chosen": -123.5177230834961, "logps/rejected": -313.40948486328125, "loss": 0.5637, "rewards/accuracies": 0.75, "rewards/chosen": -0.2578675150871277, "rewards/margins": 0.4201083183288574, "rewards/rejected": -0.6779758334159851, "step": 1628 }, { "epoch": 1.629, "grad_norm": 1.656783938407898, "learning_rate": 4.6374999999999995e-07, "logits/chosen": 1.0241479873657227, "logits/rejected": 0.4582086205482483, "logps/chosen": -227.60806274414062, "logps/rejected": -169.15834045410156, "loss": 0.764, "rewards/accuracies": 0.5, "rewards/chosen": -0.24388645589351654, "rewards/margins": 0.25525617599487305, "rewards/rejected": -0.4991426467895508, "step": 1629 }, { "epoch": 1.63, "grad_norm": 1.3750486373901367, "learning_rate": 4.625e-07, "logits/chosen": 0.9955852031707764, "logits/rejected": 0.5510284900665283, "logps/chosen": -314.79022216796875, "logps/rejected": -172.2689208984375, "loss": 0.4983, "rewards/accuracies": 0.875, "rewards/chosen": 0.02841387689113617, "rewards/margins": 0.6186654567718506, "rewards/rejected": -0.5902515649795532, "step": 1630 }, { "epoch": 1.631, "grad_norm": 0.7745797038078308, "learning_rate": 4.6125e-07, "logits/chosen": 0.4532454013824463, "logits/rejected": 0.7991157174110413, "logps/chosen": -260.2367858886719, "logps/rejected": -218.84039306640625, "loss": 0.2462, "rewards/accuracies": 1.0, "rewards/chosen": 0.3235228359699249, "rewards/margins": 1.3269107341766357, "rewards/rejected": -1.0033879280090332, "step": 1631 }, { "epoch": 1.6320000000000001, "grad_norm": 1.157860279083252, "learning_rate": 4.6e-07, "logits/chosen": 0.49039584398269653, "logits/rejected": 0.8860206604003906, "logps/chosen": -161.86439514160156, "logps/rejected": -283.6057434082031, "loss": 0.5401, "rewards/accuracies": 0.625, "rewards/chosen": -0.09466767311096191, "rewards/margins": 0.4966088831424713, "rewards/rejected": -0.5912765860557556, "step": 1632 }, { "epoch": 1.633, "grad_norm": 1.5260530710220337, "learning_rate": 4.5874999999999995e-07, "logits/chosen": 0.7979570627212524, "logits/rejected": 1.0511752367019653, "logps/chosen": -179.06956481933594, "logps/rejected": -181.89080810546875, "loss": 0.5753, "rewards/accuracies": 0.75, "rewards/chosen": -0.19253036379814148, "rewards/margins": 0.5094989538192749, "rewards/rejected": -0.702029287815094, "step": 1633 }, { "epoch": 1.634, "grad_norm": 2.0528316497802734, "learning_rate": 4.575e-07, "logits/chosen": 0.823519766330719, "logits/rejected": 0.4067522883415222, "logps/chosen": -288.43023681640625, "logps/rejected": -143.80516052246094, "loss": 0.7474, "rewards/accuracies": 0.625, "rewards/chosen": -0.40121424198150635, "rewards/margins": 0.17030596733093262, "rewards/rejected": -0.5715201497077942, "step": 1634 }, { "epoch": 1.635, "grad_norm": 2.006333351135254, "learning_rate": 4.5624999999999997e-07, "logits/chosen": 0.6561278104782104, "logits/rejected": 0.19381779432296753, "logps/chosen": -163.72801208496094, "logps/rejected": -151.32383728027344, "loss": 0.924, "rewards/accuracies": 0.375, "rewards/chosen": -0.4205602705478668, "rewards/margins": -0.0713697299361229, "rewards/rejected": -0.34919053316116333, "step": 1635 }, { "epoch": 1.6360000000000001, "grad_norm": 2.32741117477417, "learning_rate": 4.55e-07, "logits/chosen": 0.851702094078064, "logits/rejected": 1.050606369972229, "logps/chosen": -237.04226684570312, "logps/rejected": -260.5228576660156, "loss": 0.6193, "rewards/accuracies": 0.75, "rewards/chosen": -0.04506370425224304, "rewards/margins": 0.9072364568710327, "rewards/rejected": -0.9523001909255981, "step": 1636 }, { "epoch": 1.637, "grad_norm": 1.1861673593521118, "learning_rate": 4.5374999999999994e-07, "logits/chosen": 1.013533115386963, "logits/rejected": 0.9000863432884216, "logps/chosen": -158.9329071044922, "logps/rejected": -221.87347412109375, "loss": 0.3009, "rewards/accuracies": 1.0, "rewards/chosen": 0.08343782275915146, "rewards/margins": 1.1718487739562988, "rewards/rejected": -1.0884109735488892, "step": 1637 }, { "epoch": 1.638, "grad_norm": 1.4360641241073608, "learning_rate": 4.525e-07, "logits/chosen": 0.6766047477722168, "logits/rejected": 0.2629750370979309, "logps/chosen": -177.73318481445312, "logps/rejected": -142.18409729003906, "loss": 0.3526, "rewards/accuracies": 0.875, "rewards/chosen": 0.20827904343605042, "rewards/margins": 0.9844516515731812, "rewards/rejected": -0.7761726379394531, "step": 1638 }, { "epoch": 1.639, "grad_norm": 2.0444724559783936, "learning_rate": 4.5124999999999997e-07, "logits/chosen": 1.0212130546569824, "logits/rejected": 0.5477228164672852, "logps/chosen": -255.89938354492188, "logps/rejected": -166.73226928710938, "loss": 0.5756, "rewards/accuracies": 0.75, "rewards/chosen": -0.1621417999267578, "rewards/margins": 0.40938520431518555, "rewards/rejected": -0.5715270042419434, "step": 1639 }, { "epoch": 1.6400000000000001, "grad_norm": 1.2649662494659424, "learning_rate": 4.5e-07, "logits/chosen": 0.7790277004241943, "logits/rejected": 0.5656301975250244, "logps/chosen": -263.32427978515625, "logps/rejected": -194.38232421875, "loss": 0.4975, "rewards/accuracies": 0.75, "rewards/chosen": 0.04667072743177414, "rewards/margins": 0.8066459894180298, "rewards/rejected": -0.7599751949310303, "step": 1640 }, { "epoch": 1.641, "grad_norm": 1.3168308734893799, "learning_rate": 4.4874999999999994e-07, "logits/chosen": 0.8197029232978821, "logits/rejected": 0.7687908411026001, "logps/chosen": -190.005615234375, "logps/rejected": -198.33230590820312, "loss": 0.5846, "rewards/accuracies": 0.75, "rewards/chosen": -0.07957343757152557, "rewards/margins": 0.38093510270118713, "rewards/rejected": -0.4605085551738739, "step": 1641 }, { "epoch": 1.642, "grad_norm": 1.2489596605300903, "learning_rate": 4.475e-07, "logits/chosen": 0.48373541235923767, "logits/rejected": 0.7899067401885986, "logps/chosen": -122.06062316894531, "logps/rejected": -244.148681640625, "loss": 0.359, "rewards/accuracies": 0.875, "rewards/chosen": 0.2277633547782898, "rewards/margins": 0.9990706443786621, "rewards/rejected": -0.7713072896003723, "step": 1642 }, { "epoch": 1.643, "grad_norm": 1.7161469459533691, "learning_rate": 4.4624999999999996e-07, "logits/chosen": 0.5783166289329529, "logits/rejected": 0.5695788860321045, "logps/chosen": -159.33401489257812, "logps/rejected": -207.67572021484375, "loss": 0.495, "rewards/accuracies": 0.75, "rewards/chosen": 0.12270736694335938, "rewards/margins": 0.7153163552284241, "rewards/rejected": -0.5926088690757751, "step": 1643 }, { "epoch": 1.6440000000000001, "grad_norm": 1.3393902778625488, "learning_rate": 4.45e-07, "logits/chosen": 0.9493013620376587, "logits/rejected": 0.6800234913825989, "logps/chosen": -303.1028747558594, "logps/rejected": -209.06910705566406, "loss": 0.4704, "rewards/accuracies": 0.75, "rewards/chosen": -0.04181376099586487, "rewards/margins": 0.6975270509719849, "rewards/rejected": -0.7393408417701721, "step": 1644 }, { "epoch": 1.645, "grad_norm": 1.8447871208190918, "learning_rate": 4.4374999999999993e-07, "logits/chosen": 0.8421307802200317, "logits/rejected": 0.6921848058700562, "logps/chosen": -261.468017578125, "logps/rejected": -208.04664611816406, "loss": 0.5608, "rewards/accuracies": 0.75, "rewards/chosen": -0.4583624005317688, "rewards/margins": 0.44801169633865356, "rewards/rejected": -0.9063740968704224, "step": 1645 }, { "epoch": 1.646, "grad_norm": 1.1605573892593384, "learning_rate": 4.425e-07, "logits/chosen": 0.9044342041015625, "logits/rejected": 0.5411347150802612, "logps/chosen": -305.38873291015625, "logps/rejected": -164.5653076171875, "loss": 0.5138, "rewards/accuracies": 0.875, "rewards/chosen": -0.08038312196731567, "rewards/margins": 0.5251595377922058, "rewards/rejected": -0.6055426597595215, "step": 1646 }, { "epoch": 1.647, "grad_norm": 1.3087291717529297, "learning_rate": 4.4124999999999996e-07, "logits/chosen": 1.4295490980148315, "logits/rejected": 1.2442978620529175, "logps/chosen": -403.8655700683594, "logps/rejected": -201.22396850585938, "loss": 0.4197, "rewards/accuracies": 0.75, "rewards/chosen": 0.10071258246898651, "rewards/margins": 0.800848126411438, "rewards/rejected": -0.7001355290412903, "step": 1647 }, { "epoch": 1.6480000000000001, "grad_norm": 1.1950931549072266, "learning_rate": 4.3999999999999997e-07, "logits/chosen": 0.5873208045959473, "logits/rejected": 0.588564395904541, "logps/chosen": -212.61666870117188, "logps/rejected": -134.5131072998047, "loss": 0.4508, "rewards/accuracies": 0.75, "rewards/chosen": -0.021532105281949043, "rewards/margins": 0.7840496301651001, "rewards/rejected": -0.8055816888809204, "step": 1648 }, { "epoch": 1.649, "grad_norm": 1.8499772548675537, "learning_rate": 4.3874999999999993e-07, "logits/chosen": 0.9328703880310059, "logits/rejected": 0.8208816647529602, "logps/chosen": -250.10064697265625, "logps/rejected": -188.6854705810547, "loss": 0.6519, "rewards/accuracies": 0.75, "rewards/chosen": -0.32321396470069885, "rewards/margins": 0.3507993519306183, "rewards/rejected": -0.6740133166313171, "step": 1649 }, { "epoch": 1.65, "grad_norm": 1.3320708274841309, "learning_rate": 4.375e-07, "logits/chosen": 0.3937493562698364, "logits/rejected": 0.766486644744873, "logps/chosen": -156.6799774169922, "logps/rejected": -228.52122497558594, "loss": 0.4789, "rewards/accuracies": 0.875, "rewards/chosen": 0.066524937748909, "rewards/margins": 0.71001136302948, "rewards/rejected": -0.6434864401817322, "step": 1650 }, { "epoch": 1.651, "grad_norm": 1.0601259469985962, "learning_rate": 4.3625e-07, "logits/chosen": 0.6864509582519531, "logits/rejected": 0.3048419654369354, "logps/chosen": -146.97744750976562, "logps/rejected": -163.4197540283203, "loss": 0.5378, "rewards/accuracies": 0.75, "rewards/chosen": 0.12133455276489258, "rewards/margins": 0.6152979731559753, "rewards/rejected": -0.49396342039108276, "step": 1651 }, { "epoch": 1.6520000000000001, "grad_norm": 1.1687301397323608, "learning_rate": 4.3499999999999996e-07, "logits/chosen": 0.7606545686721802, "logits/rejected": 0.9803179502487183, "logps/chosen": -148.0195770263672, "logps/rejected": -222.30838012695312, "loss": 0.5194, "rewards/accuracies": 0.875, "rewards/chosen": 0.20065990090370178, "rewards/margins": 0.7118262052536011, "rewards/rejected": -0.5111663341522217, "step": 1652 }, { "epoch": 1.653, "grad_norm": 1.9299230575561523, "learning_rate": 4.3375000000000003e-07, "logits/chosen": 0.9831434488296509, "logits/rejected": 0.45488241314888, "logps/chosen": -207.42344665527344, "logps/rejected": -187.19837951660156, "loss": 0.384, "rewards/accuracies": 0.875, "rewards/chosen": 0.06307916343212128, "rewards/margins": 0.9583989381790161, "rewards/rejected": -0.8953198790550232, "step": 1653 }, { "epoch": 1.654, "grad_norm": 1.3589991331100464, "learning_rate": 4.325e-07, "logits/chosen": 1.056096076965332, "logits/rejected": 0.9492619037628174, "logps/chosen": -267.4261779785156, "logps/rejected": -197.76272583007812, "loss": 0.4737, "rewards/accuracies": 0.75, "rewards/chosen": 0.007935620844364166, "rewards/margins": 0.7136976718902588, "rewards/rejected": -0.7057620286941528, "step": 1654 }, { "epoch": 1.655, "grad_norm": 1.7445697784423828, "learning_rate": 4.3125e-07, "logits/chosen": 0.6317890882492065, "logits/rejected": 1.1832399368286133, "logps/chosen": -235.27378845214844, "logps/rejected": -285.3525695800781, "loss": 0.7251, "rewards/accuracies": 0.75, "rewards/chosen": -0.4244826138019562, "rewards/margins": 0.32235217094421387, "rewards/rejected": -0.7468347549438477, "step": 1655 }, { "epoch": 1.6560000000000001, "grad_norm": 1.0989062786102295, "learning_rate": 4.2999999999999996e-07, "logits/chosen": 0.9841744899749756, "logits/rejected": 0.8871806263923645, "logps/chosen": -281.80548095703125, "logps/rejected": -189.31002807617188, "loss": 0.4669, "rewards/accuracies": 0.875, "rewards/chosen": -0.01700611412525177, "rewards/margins": 0.8139510750770569, "rewards/rejected": -0.830957293510437, "step": 1656 }, { "epoch": 1.657, "grad_norm": 1.2428587675094604, "learning_rate": 4.2875e-07, "logits/chosen": 1.1138262748718262, "logits/rejected": 0.6856958270072937, "logps/chosen": -219.501220703125, "logps/rejected": -178.85757446289062, "loss": 0.4831, "rewards/accuracies": 0.75, "rewards/chosen": -0.04680357128381729, "rewards/margins": 0.6182606816291809, "rewards/rejected": -0.66506427526474, "step": 1657 }, { "epoch": 1.658, "grad_norm": 1.1280720233917236, "learning_rate": 4.275e-07, "logits/chosen": 0.6996747851371765, "logits/rejected": 0.9227247834205627, "logps/chosen": -218.45889282226562, "logps/rejected": -211.621826171875, "loss": 0.5247, "rewards/accuracies": 0.75, "rewards/chosen": 0.08389297872781754, "rewards/margins": 0.49712443351745605, "rewards/rejected": -0.4132314622402191, "step": 1658 }, { "epoch": 1.659, "grad_norm": 2.0392532348632812, "learning_rate": 4.2625e-07, "logits/chosen": 0.6503754258155823, "logits/rejected": 0.9761238694190979, "logps/chosen": -237.68362426757812, "logps/rejected": -238.43521118164062, "loss": 0.6941, "rewards/accuracies": 0.75, "rewards/chosen": -0.427395224571228, "rewards/margins": 0.15744096040725708, "rewards/rejected": -0.5848361849784851, "step": 1659 }, { "epoch": 1.6600000000000001, "grad_norm": 1.7125709056854248, "learning_rate": 4.2499999999999995e-07, "logits/chosen": 0.8568577766418457, "logits/rejected": 0.356116384267807, "logps/chosen": -164.62435913085938, "logps/rejected": -186.14920043945312, "loss": 0.6098, "rewards/accuracies": 0.75, "rewards/chosen": -0.25101834535598755, "rewards/margins": 0.3036290109157562, "rewards/rejected": -0.5546473860740662, "step": 1660 }, { "epoch": 1.661, "grad_norm": 1.4712263345718384, "learning_rate": 4.2375e-07, "logits/chosen": 0.5724210739135742, "logits/rejected": 0.6083923578262329, "logps/chosen": -231.9429931640625, "logps/rejected": -207.40875244140625, "loss": 0.6098, "rewards/accuracies": 0.5, "rewards/chosen": -0.11178522557020187, "rewards/margins": 0.479367733001709, "rewards/rejected": -0.5911529064178467, "step": 1661 }, { "epoch": 1.662, "grad_norm": 1.0099951028823853, "learning_rate": 4.225e-07, "logits/chosen": 0.4937620460987091, "logits/rejected": 0.1680685579776764, "logps/chosen": -171.4246368408203, "logps/rejected": -171.45892333984375, "loss": 0.4405, "rewards/accuracies": 0.75, "rewards/chosen": 0.1405496597290039, "rewards/margins": 1.0097655057907104, "rewards/rejected": -0.8692158460617065, "step": 1662 }, { "epoch": 1.663, "grad_norm": 1.1495476961135864, "learning_rate": 4.2125e-07, "logits/chosen": 0.45867255330085754, "logits/rejected": 0.99334317445755, "logps/chosen": -164.68850708007812, "logps/rejected": -180.50299072265625, "loss": 0.5806, "rewards/accuracies": 0.75, "rewards/chosen": -0.1436791568994522, "rewards/margins": 0.6324994564056396, "rewards/rejected": -0.7761785984039307, "step": 1663 }, { "epoch": 1.6640000000000001, "grad_norm": 1.4524860382080078, "learning_rate": 4.1999999999999995e-07, "logits/chosen": 0.8077362775802612, "logits/rejected": 0.7629373073577881, "logps/chosen": -293.991455078125, "logps/rejected": -198.83078002929688, "loss": 0.6742, "rewards/accuracies": 0.75, "rewards/chosen": -0.2117842733860016, "rewards/margins": 0.3494192361831665, "rewards/rejected": -0.5612034797668457, "step": 1664 }, { "epoch": 1.665, "grad_norm": 1.9324712753295898, "learning_rate": 4.1875e-07, "logits/chosen": 0.9710912704467773, "logits/rejected": 0.9419949650764465, "logps/chosen": -169.6720733642578, "logps/rejected": -229.64044189453125, "loss": 0.5016, "rewards/accuracies": 0.75, "rewards/chosen": -0.10352640599012375, "rewards/margins": 0.7428768873214722, "rewards/rejected": -0.846403181552887, "step": 1665 }, { "epoch": 1.666, "grad_norm": 1.0021588802337646, "learning_rate": 4.1749999999999997e-07, "logits/chosen": 0.3094959557056427, "logits/rejected": 0.4255434274673462, "logps/chosen": -183.9716339111328, "logps/rejected": -170.57432556152344, "loss": 0.388, "rewards/accuracies": 0.75, "rewards/chosen": 0.08064199239015579, "rewards/margins": 1.0027191638946533, "rewards/rejected": -0.9220771193504333, "step": 1666 }, { "epoch": 1.667, "grad_norm": 0.9781518578529358, "learning_rate": 4.1625e-07, "logits/chosen": 0.8788111209869385, "logits/rejected": 0.3578367829322815, "logps/chosen": -274.7979736328125, "logps/rejected": -152.32421875, "loss": 0.376, "rewards/accuracies": 0.875, "rewards/chosen": 0.10117741674184799, "rewards/margins": 1.0034568309783936, "rewards/rejected": -0.9022793173789978, "step": 1667 }, { "epoch": 1.6680000000000001, "grad_norm": 1.151915431022644, "learning_rate": 4.1499999999999994e-07, "logits/chosen": 1.2184032201766968, "logits/rejected": 0.7953318357467651, "logps/chosen": -170.36953735351562, "logps/rejected": -172.6405487060547, "loss": 0.3837, "rewards/accuracies": 0.875, "rewards/chosen": 0.2013271450996399, "rewards/margins": 1.0196254253387451, "rewards/rejected": -0.8182982206344604, "step": 1668 }, { "epoch": 1.669, "grad_norm": 1.7745460271835327, "learning_rate": 4.1375e-07, "logits/chosen": 0.77907395362854, "logits/rejected": 1.0846363306045532, "logps/chosen": -199.947509765625, "logps/rejected": -301.0570373535156, "loss": 0.61, "rewards/accuracies": 0.625, "rewards/chosen": -0.10531911998987198, "rewards/margins": 0.3090701103210449, "rewards/rejected": -0.4143892526626587, "step": 1669 }, { "epoch": 1.67, "grad_norm": 1.7963709831237793, "learning_rate": 4.1249999999999997e-07, "logits/chosen": 0.757892370223999, "logits/rejected": 0.8420758843421936, "logps/chosen": -238.47830200195312, "logps/rejected": -283.1058654785156, "loss": 0.5186, "rewards/accuracies": 0.625, "rewards/chosen": 0.05630388855934143, "rewards/margins": 0.5616725087165833, "rewards/rejected": -0.5053686499595642, "step": 1670 }, { "epoch": 1.671, "grad_norm": 0.9479244947433472, "learning_rate": 4.1125e-07, "logits/chosen": 0.7470322847366333, "logits/rejected": 0.7452429533004761, "logps/chosen": -200.50146484375, "logps/rejected": -149.92861938476562, "loss": 0.3836, "rewards/accuracies": 0.75, "rewards/chosen": 0.12554660439491272, "rewards/margins": 1.0244686603546143, "rewards/rejected": -0.8989219665527344, "step": 1671 }, { "epoch": 1.6720000000000002, "grad_norm": 1.854182481765747, "learning_rate": 4.0999999999999994e-07, "logits/chosen": 1.1024364233016968, "logits/rejected": 0.9712563753128052, "logps/chosen": -272.97149658203125, "logps/rejected": -226.52281188964844, "loss": 0.5343, "rewards/accuracies": 0.625, "rewards/chosen": -0.13329839706420898, "rewards/margins": 0.5182825326919556, "rewards/rejected": -0.6515809893608093, "step": 1672 }, { "epoch": 1.673, "grad_norm": 1.0025814771652222, "learning_rate": 4.0875e-07, "logits/chosen": 0.39122098684310913, "logits/rejected": 0.629190981388092, "logps/chosen": -156.86134338378906, "logps/rejected": -201.9403076171875, "loss": 0.3901, "rewards/accuracies": 0.875, "rewards/chosen": 0.15825635194778442, "rewards/margins": 0.9938713312149048, "rewards/rejected": -0.8356149792671204, "step": 1673 }, { "epoch": 1.674, "grad_norm": 1.3319761753082275, "learning_rate": 4.0749999999999996e-07, "logits/chosen": 0.8466730117797852, "logits/rejected": 1.0240906476974487, "logps/chosen": -322.2559814453125, "logps/rejected": -226.97731018066406, "loss": 0.4932, "rewards/accuracies": 0.625, "rewards/chosen": -0.03144679218530655, "rewards/margins": 0.8844863772392273, "rewards/rejected": -0.9159331917762756, "step": 1674 }, { "epoch": 1.675, "grad_norm": 1.0653841495513916, "learning_rate": 4.0625e-07, "logits/chosen": 0.8189990520477295, "logits/rejected": 0.7449501752853394, "logps/chosen": -183.85165405273438, "logps/rejected": -177.78787231445312, "loss": 0.5489, "rewards/accuracies": 0.875, "rewards/chosen": 0.05078916624188423, "rewards/margins": 0.8949483036994934, "rewards/rejected": -0.8441591262817383, "step": 1675 }, { "epoch": 1.6760000000000002, "grad_norm": 1.1139754056930542, "learning_rate": 4.05e-07, "logits/chosen": 0.7537757754325867, "logits/rejected": 1.0303642749786377, "logps/chosen": -198.18234252929688, "logps/rejected": -209.33917236328125, "loss": 0.359, "rewards/accuracies": 0.875, "rewards/chosen": 0.15983347594738007, "rewards/margins": 0.9852489233016968, "rewards/rejected": -0.8254154324531555, "step": 1676 }, { "epoch": 1.677, "grad_norm": 1.2182427644729614, "learning_rate": 4.0375e-07, "logits/chosen": 0.9576379656791687, "logits/rejected": 0.6905419230461121, "logps/chosen": -225.7008819580078, "logps/rejected": -234.14053344726562, "loss": 0.5355, "rewards/accuracies": 0.875, "rewards/chosen": -0.25692635774612427, "rewards/margins": 0.5612471699714661, "rewards/rejected": -0.8181735277175903, "step": 1677 }, { "epoch": 1.678, "grad_norm": 1.2758959531784058, "learning_rate": 4.025e-07, "logits/chosen": 0.5494418144226074, "logits/rejected": 0.7232640981674194, "logps/chosen": -139.14727783203125, "logps/rejected": -220.76841735839844, "loss": 0.3764, "rewards/accuracies": 1.0, "rewards/chosen": 0.25917690992355347, "rewards/margins": 0.9195187091827393, "rewards/rejected": -0.660341739654541, "step": 1678 }, { "epoch": 1.679, "grad_norm": 1.220479130744934, "learning_rate": 4.0124999999999997e-07, "logits/chosen": 0.0955885499715805, "logits/rejected": 0.5538020730018616, "logps/chosen": -134.57827758789062, "logps/rejected": -212.29351806640625, "loss": 0.3237, "rewards/accuracies": 1.0, "rewards/chosen": 0.09304419159889221, "rewards/margins": 1.0990958213806152, "rewards/rejected": -1.0060516595840454, "step": 1679 }, { "epoch": 1.6800000000000002, "grad_norm": 1.294001579284668, "learning_rate": 4e-07, "logits/chosen": 1.2192330360412598, "logits/rejected": 0.6481128931045532, "logps/chosen": -280.51031494140625, "logps/rejected": -178.91268920898438, "loss": 0.415, "rewards/accuracies": 0.875, "rewards/chosen": 0.19814881682395935, "rewards/margins": 0.932976245880127, "rewards/rejected": -0.73482745885849, "step": 1680 }, { "epoch": 1.681, "grad_norm": 0.9806723594665527, "learning_rate": 3.9875e-07, "logits/chosen": 0.8427867889404297, "logits/rejected": 0.33292847871780396, "logps/chosen": -314.6043701171875, "logps/rejected": -178.4646453857422, "loss": 0.308, "rewards/accuracies": 1.0, "rewards/chosen": 0.5374531745910645, "rewards/margins": 1.2838019132614136, "rewards/rejected": -0.7463487982749939, "step": 1681 }, { "epoch": 1.682, "grad_norm": 1.8924973011016846, "learning_rate": 3.975e-07, "logits/chosen": 0.9844673871994019, "logits/rejected": 0.8796500563621521, "logps/chosen": -165.292724609375, "logps/rejected": -232.52938842773438, "loss": 0.5333, "rewards/accuracies": 0.75, "rewards/chosen": -0.30443620681762695, "rewards/margins": 0.4445212483406067, "rewards/rejected": -0.7489575147628784, "step": 1682 }, { "epoch": 1.683, "grad_norm": 1.345392107963562, "learning_rate": 3.9624999999999996e-07, "logits/chosen": 0.9662304520606995, "logits/rejected": 0.4241752326488495, "logps/chosen": -226.189697265625, "logps/rejected": -231.40524291992188, "loss": 0.6167, "rewards/accuracies": 0.625, "rewards/chosen": -0.1200842633843422, "rewards/margins": 0.4882065951824188, "rewards/rejected": -0.6082908511161804, "step": 1683 }, { "epoch": 1.6840000000000002, "grad_norm": 2.017773389816284, "learning_rate": 3.95e-07, "logits/chosen": 0.7922036647796631, "logits/rejected": 0.5287953019142151, "logps/chosen": -318.71917724609375, "logps/rejected": -149.3468017578125, "loss": 0.6277, "rewards/accuracies": 0.625, "rewards/chosen": -0.2190781533718109, "rewards/margins": 0.33657169342041016, "rewards/rejected": -0.5556498765945435, "step": 1684 }, { "epoch": 1.685, "grad_norm": 1.412570595741272, "learning_rate": 3.9375e-07, "logits/chosen": 0.8510602712631226, "logits/rejected": 0.020348701626062393, "logps/chosen": -191.531494140625, "logps/rejected": -138.7805633544922, "loss": 0.4769, "rewards/accuracies": 0.875, "rewards/chosen": 0.005906492471694946, "rewards/margins": 0.7090493440628052, "rewards/rejected": -0.7031428217887878, "step": 1685 }, { "epoch": 1.686, "grad_norm": 1.5249406099319458, "learning_rate": 3.925e-07, "logits/chosen": 0.7419472932815552, "logits/rejected": 0.8710364103317261, "logps/chosen": -206.21041870117188, "logps/rejected": -185.63189697265625, "loss": 0.5341, "rewards/accuracies": 0.75, "rewards/chosen": -0.07585106045007706, "rewards/margins": 0.6531033515930176, "rewards/rejected": -0.7289543151855469, "step": 1686 }, { "epoch": 1.687, "grad_norm": 1.4992777109146118, "learning_rate": 3.9124999999999996e-07, "logits/chosen": 0.3029516935348511, "logits/rejected": 0.546722412109375, "logps/chosen": -143.6208038330078, "logps/rejected": -207.1804962158203, "loss": 0.3684, "rewards/accuracies": 0.875, "rewards/chosen": 0.08576764166355133, "rewards/margins": 1.0735793113708496, "rewards/rejected": -0.9878116846084595, "step": 1687 }, { "epoch": 1.688, "grad_norm": 1.947607398033142, "learning_rate": 3.8999999999999997e-07, "logits/chosen": 0.24697618186473846, "logits/rejected": 0.36180880665779114, "logps/chosen": -170.7027130126953, "logps/rejected": -174.38873291015625, "loss": 0.6727, "rewards/accuracies": 0.625, "rewards/chosen": -0.10307589173316956, "rewards/margins": 0.5534098744392395, "rewards/rejected": -0.6564857959747314, "step": 1688 }, { "epoch": 1.689, "grad_norm": 1.3847576379776, "learning_rate": 3.8875e-07, "logits/chosen": 1.7600088119506836, "logits/rejected": 0.6772890686988831, "logps/chosen": -449.9745178222656, "logps/rejected": -164.20574951171875, "loss": 0.5315, "rewards/accuracies": 0.625, "rewards/chosen": -0.16990183293819427, "rewards/margins": 0.7060989141464233, "rewards/rejected": -0.8760007619857788, "step": 1689 }, { "epoch": 1.69, "grad_norm": 1.129701018333435, "learning_rate": 3.875e-07, "logits/chosen": 0.7193349599838257, "logits/rejected": 0.7296316623687744, "logps/chosen": -167.56375122070312, "logps/rejected": -229.1716766357422, "loss": 0.3693, "rewards/accuracies": 0.875, "rewards/chosen": 0.34946319460868835, "rewards/margins": 1.171429991722107, "rewards/rejected": -0.821966826915741, "step": 1690 }, { "epoch": 1.6909999999999998, "grad_norm": 1.617957353591919, "learning_rate": 3.8624999999999995e-07, "logits/chosen": 1.298269510269165, "logits/rejected": 0.20620205998420715, "logps/chosen": -236.8658447265625, "logps/rejected": -147.23043823242188, "loss": 0.7145, "rewards/accuracies": 0.625, "rewards/chosen": -0.46638089418411255, "rewards/margins": 0.1665128767490387, "rewards/rejected": -0.6328937411308289, "step": 1691 }, { "epoch": 1.692, "grad_norm": 1.107123851776123, "learning_rate": 3.8499999999999997e-07, "logits/chosen": 0.7847678065299988, "logits/rejected": 1.1141512393951416, "logps/chosen": -212.5850830078125, "logps/rejected": -192.005859375, "loss": 0.3591, "rewards/accuracies": 0.75, "rewards/chosen": 0.1531694233417511, "rewards/margins": 1.1824750900268555, "rewards/rejected": -1.0293056964874268, "step": 1692 }, { "epoch": 1.693, "grad_norm": 1.0590574741363525, "learning_rate": 3.8375e-07, "logits/chosen": 0.951667308807373, "logits/rejected": 1.1530307531356812, "logps/chosen": -144.77500915527344, "logps/rejected": -204.32403564453125, "loss": 0.2809, "rewards/accuracies": 1.0, "rewards/chosen": 0.34081774950027466, "rewards/margins": 1.3000632524490356, "rewards/rejected": -0.959245502948761, "step": 1693 }, { "epoch": 1.694, "grad_norm": 1.714247226715088, "learning_rate": 3.825e-07, "logits/chosen": 0.8067673444747925, "logits/rejected": 0.593841552734375, "logps/chosen": -135.34286499023438, "logps/rejected": -198.080322265625, "loss": 0.5364, "rewards/accuracies": 0.875, "rewards/chosen": 0.09613720327615738, "rewards/margins": 0.5929511785507202, "rewards/rejected": -0.49681398272514343, "step": 1694 }, { "epoch": 1.6949999999999998, "grad_norm": 1.2780756950378418, "learning_rate": 3.8124999999999995e-07, "logits/chosen": 0.9568260908126831, "logits/rejected": 0.6795281171798706, "logps/chosen": -211.24588012695312, "logps/rejected": -201.41668701171875, "loss": 0.4374, "rewards/accuracies": 0.75, "rewards/chosen": 0.05704956501722336, "rewards/margins": 0.7971489429473877, "rewards/rejected": -0.7400993704795837, "step": 1695 }, { "epoch": 1.696, "grad_norm": 1.43048095703125, "learning_rate": 3.7999999999999996e-07, "logits/chosen": 1.2274667024612427, "logits/rejected": 0.38497984409332275, "logps/chosen": -249.34519958496094, "logps/rejected": -165.11419677734375, "loss": 0.5334, "rewards/accuracies": 0.75, "rewards/chosen": -0.023597098886966705, "rewards/margins": 0.7140700221061707, "rewards/rejected": -0.7376671433448792, "step": 1696 }, { "epoch": 1.697, "grad_norm": 1.0205078125, "learning_rate": 3.7875e-07, "logits/chosen": 1.1059752702713013, "logits/rejected": 0.5063579082489014, "logps/chosen": -306.63946533203125, "logps/rejected": -157.5673828125, "loss": 0.3893, "rewards/accuracies": 0.75, "rewards/chosen": 0.1423918604850769, "rewards/margins": 0.9580212235450745, "rewards/rejected": -0.8156293034553528, "step": 1697 }, { "epoch": 1.698, "grad_norm": 2.122652769088745, "learning_rate": 3.775e-07, "logits/chosen": 0.4771159291267395, "logits/rejected": 0.9287596344947815, "logps/chosen": -113.29005432128906, "logps/rejected": -298.435302734375, "loss": 0.5806, "rewards/accuracies": 0.625, "rewards/chosen": 0.018424514681100845, "rewards/margins": 0.5975054502487183, "rewards/rejected": -0.5790809988975525, "step": 1698 }, { "epoch": 1.6989999999999998, "grad_norm": 1.971734881401062, "learning_rate": 3.7624999999999994e-07, "logits/chosen": 1.0358221530914307, "logits/rejected": 0.6723889112472534, "logps/chosen": -264.9248046875, "logps/rejected": -205.4897918701172, "loss": 0.9402, "rewards/accuracies": 0.5, "rewards/chosen": -0.4621693789958954, "rewards/margins": -0.15293759107589722, "rewards/rejected": -0.3092317581176758, "step": 1699 }, { "epoch": 1.7, "grad_norm": 2.362253427505493, "learning_rate": 3.75e-07, "logits/chosen": 1.0892763137817383, "logits/rejected": 0.5914822816848755, "logps/chosen": -237.27032470703125, "logps/rejected": -166.46578979492188, "loss": 0.9542, "rewards/accuracies": 0.375, "rewards/chosen": -0.6117686033248901, "rewards/margins": -0.26094603538513184, "rewards/rejected": -0.3508225679397583, "step": 1700 } ], "logging_steps": 1, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }