{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00026171159382360636, "grad_norm": 1.421875, "learning_rate": 1.3054830287206266e-09, "logits/chosen": -2.9892377853393555, "logits/rejected": -2.938478946685791, "logps/chosen": -307.68707275390625, "logps/rejected": -392.1196594238281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0026171159382360636, "grad_norm": 1.359375, "learning_rate": 1.3054830287206264e-08, "logits/chosen": -2.846788167953491, "logits/rejected": -2.834296941757202, "logps/chosen": -299.1590881347656, "logps/rejected": -260.9870300292969, "loss": 0.6927, "rewards/accuracies": 0.4791666567325592, "rewards/chosen": 0.00017009497969411314, "rewards/margins": 0.0008415079792030156, "rewards/rejected": -0.0006714130286127329, "step": 10 }, { "epoch": 0.005234231876472127, "grad_norm": 1.609375, "learning_rate": 2.610966057441253e-08, "logits/chosen": -2.8615875244140625, "logits/rejected": -2.8269271850585938, "logps/chosen": -325.3974609375, "logps/rejected": -252.712158203125, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.0003612989676184952, "rewards/margins": 2.1457055481732823e-05, "rewards/rejected": 0.00033984187757596374, "step": 20 }, { "epoch": 0.007851347814708191, "grad_norm": 1.796875, "learning_rate": 3.91644908616188e-08, "logits/chosen": -2.8635482788085938, "logits/rejected": -2.83804988861084, "logps/chosen": -269.81329345703125, "logps/rejected": -268.55670166015625, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.000123085526865907, "rewards/margins": 6.434940587496385e-05, "rewards/rejected": -0.0001874349982244894, "step": 30 }, { "epoch": 0.010468463752944255, "grad_norm": 1.1640625, "learning_rate": 5.221932114882506e-08, "logits/chosen": -2.8312931060791016, "logits/rejected": -2.821013927459717, "logps/chosen": -233.34909057617188, "logps/rejected": -238.37490844726562, "loss": 0.6934, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0005193200195208192, "rewards/margins": -0.0004341518506407738, "rewards/rejected": -8.516813250025734e-05, "step": 40 }, { "epoch": 0.01308557969118032, "grad_norm": 1.15625, "learning_rate": 6.527415143603133e-08, "logits/chosen": -2.866091251373291, "logits/rejected": -2.85339093208313, "logps/chosen": -290.05963134765625, "logps/rejected": -253.92349243164062, "loss": 0.693, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00046620480134151876, "rewards/margins": 0.0002595257537905127, "rewards/rejected": 0.0002066790621029213, "step": 50 }, { "epoch": 0.015702695629416383, "grad_norm": 1.25, "learning_rate": 7.83289817232376e-08, "logits/chosen": -2.825549364089966, "logits/rejected": -2.8121423721313477, "logps/chosen": -273.64691162109375, "logps/rejected": -246.85317993164062, "loss": 0.693, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.00024823236162774265, "rewards/margins": 0.00039805466076359153, "rewards/rejected": -0.0001498223573435098, "step": 60 }, { "epoch": 0.018319811567652448, "grad_norm": 1.234375, "learning_rate": 9.138381201044386e-08, "logits/chosen": -2.8805994987487793, "logits/rejected": -2.8450770378112793, "logps/chosen": -293.1197814941406, "logps/rejected": -266.08135986328125, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0004158564261160791, "rewards/margins": 7.372137770289555e-05, "rewards/rejected": 0.0003421350847929716, "step": 70 }, { "epoch": 0.02093692750588851, "grad_norm": 1.6328125, "learning_rate": 1.0443864229765012e-07, "logits/chosen": -2.820730209350586, "logits/rejected": -2.7984094619750977, "logps/chosen": -279.29498291015625, "logps/rejected": -266.357666015625, "loss": 0.6933, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 9.654175664763898e-05, "rewards/margins": -0.00033630471443757415, "rewards/rejected": 0.0004328465147409588, "step": 80 }, { "epoch": 0.023554043444124574, "grad_norm": 1.28125, "learning_rate": 1.174934725848564e-07, "logits/chosen": -2.8342747688293457, "logits/rejected": -2.8211700916290283, "logps/chosen": -270.66888427734375, "logps/rejected": -251.8229522705078, "loss": 0.6929, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0002475693472661078, "rewards/margins": 0.0005019751843065023, "rewards/rejected": -0.000254405866144225, "step": 90 }, { "epoch": 0.02617115938236064, "grad_norm": 1.296875, "learning_rate": 1.3054830287206266e-07, "logits/chosen": -2.849017381668091, "logits/rejected": -2.842028856277466, "logps/chosen": -267.05035400390625, "logps/rejected": -248.63992309570312, "loss": 0.6929, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00027873244835063815, "rewards/margins": 0.0004233802610542625, "rewards/rejected": -0.00014464779815170914, "step": 100 }, { "epoch": 0.02617115938236064, "eval_logits/chosen": -2.865492343902588, "eval_logits/rejected": -2.838137626647949, "eval_logps/chosen": -282.7629699707031, "eval_logps/rejected": -261.4512023925781, "eval_loss": 0.6930338740348816, "eval_rewards/accuracies": 0.5134999752044678, "eval_rewards/chosen": 0.00010537073103478178, "eval_rewards/margins": 0.00023393578885588795, "eval_rewards/rejected": -0.00012856510875280946, "eval_runtime": 623.7252, "eval_samples_per_second": 3.207, "eval_steps_per_second": 0.401, "step": 100 }, { "epoch": 0.028788275320596704, "grad_norm": 1.4375, "learning_rate": 1.4360313315926893e-07, "logits/chosen": -2.855942964553833, "logits/rejected": -2.822741985321045, "logps/chosen": -307.44110107421875, "logps/rejected": -257.2309265136719, "loss": 0.6929, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00018880394054576755, "rewards/margins": 0.0004303649184294045, "rewards/rejected": -0.0002415610069874674, "step": 110 }, { "epoch": 0.031405391258832765, "grad_norm": 1.21875, "learning_rate": 1.566579634464752e-07, "logits/chosen": -2.86763334274292, "logits/rejected": -2.844106435775757, "logps/chosen": -310.5987854003906, "logps/rejected": -287.745361328125, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00025308955810032785, "rewards/margins": 0.00023145000159274787, "rewards/rejected": 2.1639541955664754e-05, "step": 120 }, { "epoch": 0.03402250719706883, "grad_norm": 1.4921875, "learning_rate": 1.6971279373368143e-07, "logits/chosen": -2.847980499267578, "logits/rejected": -2.8163723945617676, "logps/chosen": -271.6886291503906, "logps/rejected": -269.58660888671875, "loss": 0.6928, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 1.363331080028729e-06, "rewards/margins": 0.000652765913400799, "rewards/rejected": -0.0006514025735668838, "step": 130 }, { "epoch": 0.036639623135304895, "grad_norm": 1.3125, "learning_rate": 1.8276762402088773e-07, "logits/chosen": -2.8673386573791504, "logits/rejected": -2.8119819164276123, "logps/chosen": -291.5235290527344, "logps/rejected": -247.7689971923828, "loss": 0.6928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0003679326910059899, "rewards/margins": 0.000704582198522985, "rewards/rejected": -0.00033664953662082553, "step": 140 }, { "epoch": 0.03925673907354096, "grad_norm": 1.3828125, "learning_rate": 1.95822454308094e-07, "logits/chosen": -2.8565478324890137, "logits/rejected": -2.8365466594696045, "logps/chosen": -299.02996826171875, "logps/rejected": -255.97604370117188, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.0001471690193284303, "rewards/margins": 0.0002785170800052583, "rewards/rejected": -0.00013134813343640417, "step": 150 }, { "epoch": 0.04187385501177702, "grad_norm": 1.359375, "learning_rate": 2.0887728459530023e-07, "logits/chosen": -2.8643641471862793, "logits/rejected": -2.8453617095947266, "logps/chosen": -275.17669677734375, "logps/rejected": -274.9828186035156, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -8.313215948874131e-05, "rewards/margins": 0.0004607086593750864, "rewards/rejected": -0.0005438407533802092, "step": 160 }, { "epoch": 0.04449097095001309, "grad_norm": 1.546875, "learning_rate": 2.2193211488250652e-07, "logits/chosen": -2.8222973346710205, "logits/rejected": -2.803818941116333, "logps/chosen": -236.69189453125, "logps/rejected": -238.2162628173828, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00023162660363595933, "rewards/margins": 0.00025372960953973234, "rewards/rejected": -2.2102987713878974e-05, "step": 170 }, { "epoch": 0.04710808688824915, "grad_norm": 1.125, "learning_rate": 2.349869451697128e-07, "logits/chosen": -2.850526809692383, "logits/rejected": -2.8234286308288574, "logps/chosen": -276.2384338378906, "logps/rejected": -259.85089111328125, "loss": 0.6928, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0005887501174584031, "rewards/margins": 0.000796462525613606, "rewards/rejected": -0.000207712480914779, "step": 180 }, { "epoch": 0.04972520282648522, "grad_norm": 2.234375, "learning_rate": 2.4804177545691903e-07, "logits/chosen": -2.887956380844116, "logits/rejected": -2.8700356483459473, "logps/chosen": -291.0037841796875, "logps/rejected": -257.3691711425781, "loss": 0.6929, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 8.575538959121332e-05, "rewards/margins": 0.00044340407475829124, "rewards/rejected": -0.00035764873609878123, "step": 190 }, { "epoch": 0.05234231876472128, "grad_norm": 1.3125, "learning_rate": 2.610966057441253e-07, "logits/chosen": -2.838761806488037, "logits/rejected": -2.828749179840088, "logps/chosen": -268.03924560546875, "logps/rejected": -225.5205078125, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -7.57282687118277e-05, "rewards/margins": 0.0003965885262005031, "rewards/rejected": -0.0004723168385680765, "step": 200 }, { "epoch": 0.05234231876472128, "eval_logits/chosen": -2.8625597953796387, "eval_logits/rejected": -2.8349130153656006, "eval_logps/chosen": -282.7611389160156, "eval_logps/rejected": -261.49249267578125, "eval_loss": 0.6928190588951111, "eval_rewards/accuracies": 0.546999990940094, "eval_rewards/chosen": 0.00012387627793941647, "eval_rewards/margins": 0.0006650119903497398, "eval_rewards/rejected": -0.0005411357851698995, "eval_runtime": 622.9697, "eval_samples_per_second": 3.21, "eval_steps_per_second": 0.401, "step": 200 }, { "epoch": 0.05495943470295734, "grad_norm": 1.25, "learning_rate": 2.7415143603133156e-07, "logits/chosen": -2.875335931777954, "logits/rejected": -2.841496229171753, "logps/chosen": -276.1015930175781, "logps/rejected": -245.19223022460938, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0002998802810907364, "rewards/margins": 5.954801963525824e-05, "rewards/rejected": -0.00035942820250056684, "step": 210 }, { "epoch": 0.05757655064119341, "grad_norm": 1.1953125, "learning_rate": 2.8720626631853785e-07, "logits/chosen": -2.8162028789520264, "logits/rejected": -2.810290575027466, "logps/chosen": -274.1748962402344, "logps/rejected": -242.88381958007812, "loss": 0.6926, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00047311707749031484, "rewards/margins": 0.0010585600975900888, "rewards/rejected": -0.0005854429909959435, "step": 220 }, { "epoch": 0.06019366657942947, "grad_norm": 1.4140625, "learning_rate": 3.002610966057441e-07, "logits/chosen": -2.886976957321167, "logits/rejected": -2.862199544906616, "logps/chosen": -322.8957824707031, "logps/rejected": -285.7581787109375, "loss": 0.6925, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0009428686462342739, "rewards/margins": 0.0013226759620010853, "rewards/rejected": -0.0003798073739744723, "step": 230 }, { "epoch": 0.06281078251766553, "grad_norm": 1.2890625, "learning_rate": 3.133159268929504e-07, "logits/chosen": -2.8522121906280518, "logits/rejected": -2.838016986846924, "logps/chosen": -312.5648498535156, "logps/rejected": -297.47650146484375, "loss": 0.6924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0015294912736862898, "rewards/margins": 0.0015876994002610445, "rewards/rejected": -5.820817386847921e-05, "step": 240 }, { "epoch": 0.06542789845590159, "grad_norm": 1.1640625, "learning_rate": 3.263707571801567e-07, "logits/chosen": -2.815152883529663, "logits/rejected": -2.8188998699188232, "logps/chosen": -277.23309326171875, "logps/rejected": -249.0277862548828, "loss": 0.6924, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0006144286599010229, "rewards/margins": 0.0015933450777083635, "rewards/rejected": -0.0009789163013920188, "step": 250 }, { "epoch": 0.06804501439413765, "grad_norm": 1.1171875, "learning_rate": 3.3942558746736286e-07, "logits/chosen": -2.8725204467773438, "logits/rejected": -2.8254947662353516, "logps/chosen": -297.4732971191406, "logps/rejected": -277.87225341796875, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": 0.000995874172076583, "rewards/margins": 0.0009019881254062057, "rewards/rejected": 9.38860684982501e-05, "step": 260 }, { "epoch": 0.07066213033237373, "grad_norm": 0.99609375, "learning_rate": 3.5248041775456916e-07, "logits/chosen": -2.8370730876922607, "logits/rejected": -2.825009346008301, "logps/chosen": -281.54547119140625, "logps/rejected": -245.32528686523438, "loss": 0.692, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0012482403544709086, "rewards/margins": 0.002398666925728321, "rewards/rejected": -0.0011504264548420906, "step": 270 }, { "epoch": 0.07327924627060979, "grad_norm": 1.203125, "learning_rate": 3.6553524804177545e-07, "logits/chosen": -2.8796634674072266, "logits/rejected": -2.836472272872925, "logps/chosen": -276.66485595703125, "logps/rejected": -253.39230346679688, "loss": 0.6922, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0009085072088055313, "rewards/margins": 0.0018395546358078718, "rewards/rejected": -0.0009310474852100015, "step": 280 }, { "epoch": 0.07589636220884585, "grad_norm": 1.359375, "learning_rate": 3.785900783289817e-07, "logits/chosen": -2.8511414527893066, "logits/rejected": -2.840785264968872, "logps/chosen": -304.3522033691406, "logps/rejected": -279.17950439453125, "loss": 0.6922, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0009422661969438195, "rewards/margins": 0.0019500417402014136, "rewards/rejected": -0.0010077755432575941, "step": 290 }, { "epoch": 0.07851347814708191, "grad_norm": 1.4453125, "learning_rate": 3.91644908616188e-07, "logits/chosen": -2.8077988624572754, "logits/rejected": -2.763946294784546, "logps/chosen": -266.3786315917969, "logps/rejected": -248.56350708007812, "loss": 0.692, "rewards/accuracies": 0.625, "rewards/chosen": 0.0008328545955009758, "rewards/margins": 0.0023509101010859013, "rewards/rejected": -0.0015180552145466208, "step": 300 }, { "epoch": 0.07851347814708191, "eval_logits/chosen": -2.8650434017181396, "eval_logits/rejected": -2.8377885818481445, "eval_logps/chosen": -282.674560546875, "eval_logps/rejected": -261.546142578125, "eval_loss": 0.692122220993042, "eval_rewards/accuracies": 0.6050000190734863, "eval_rewards/chosen": 0.000989287393167615, "eval_rewards/margins": 0.0020671640522778034, "eval_rewards/rejected": -0.0010778764262795448, "eval_runtime": 622.8014, "eval_samples_per_second": 3.211, "eval_steps_per_second": 0.401, "step": 300 }, { "epoch": 0.08113059408531798, "grad_norm": 1.4921875, "learning_rate": 4.046997389033943e-07, "logits/chosen": -2.895244598388672, "logits/rejected": -2.8767800331115723, "logps/chosen": -306.62994384765625, "logps/rejected": -250.0150909423828, "loss": 0.6919, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0013422651682049036, "rewards/margins": 0.0025762903969734907, "rewards/rejected": -0.001234025345183909, "step": 310 }, { "epoch": 0.08374771002355404, "grad_norm": 1.2265625, "learning_rate": 4.1775456919060046e-07, "logits/chosen": -2.8745856285095215, "logits/rejected": -2.8429815769195557, "logps/chosen": -273.4037170410156, "logps/rejected": -255.09585571289062, "loss": 0.6922, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0011208092328161001, "rewards/margins": 0.001973007107153535, "rewards/rejected": -0.0008521980489604175, "step": 320 }, { "epoch": 0.08636482596179011, "grad_norm": 1.234375, "learning_rate": 4.3080939947780675e-07, "logits/chosen": -2.8409087657928467, "logits/rejected": -2.8410866260528564, "logps/chosen": -277.77545166015625, "logps/rejected": -250.94821166992188, "loss": 0.6917, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.002120083197951317, "rewards/margins": 0.0029364789370447397, "rewards/rejected": -0.0008163956226781011, "step": 330 }, { "epoch": 0.08898194190002617, "grad_norm": 1.3984375, "learning_rate": 4.4386422976501305e-07, "logits/chosen": -2.8706066608428955, "logits/rejected": -2.857938766479492, "logps/chosen": -307.44732666015625, "logps/rejected": -284.9738464355469, "loss": 0.6912, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0030747547280043364, "rewards/margins": 0.00399099662899971, "rewards/rejected": -0.0009162420174106956, "step": 340 }, { "epoch": 0.09159905783826224, "grad_norm": 1.234375, "learning_rate": 4.569190600522193e-07, "logits/chosen": -2.8302149772644043, "logits/rejected": -2.803089141845703, "logps/chosen": -309.71893310546875, "logps/rejected": -296.48101806640625, "loss": 0.6917, "rewards/accuracies": 0.625, "rewards/chosen": 0.002736264606937766, "rewards/margins": 0.002880766289308667, "rewards/rejected": -0.00014450155140366405, "step": 350 }, { "epoch": 0.0942161737764983, "grad_norm": 0.890625, "learning_rate": 4.699738903394256e-07, "logits/chosen": -2.8377342224121094, "logits/rejected": -2.8193764686584473, "logps/chosen": -256.7732238769531, "logps/rejected": -236.75698852539062, "loss": 0.6914, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0019551387522369623, "rewards/margins": 0.003464858280494809, "rewards/rejected": -0.0015097195282578468, "step": 360 }, { "epoch": 0.09683328971473436, "grad_norm": 1.4609375, "learning_rate": 4.830287206266319e-07, "logits/chosen": -2.8519506454467773, "logits/rejected": -2.822915554046631, "logps/chosen": -295.97418212890625, "logps/rejected": -251.2534637451172, "loss": 0.6908, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.002289209049195051, "rewards/margins": 0.004647364374250174, "rewards/rejected": -0.002358155557885766, "step": 370 }, { "epoch": 0.09945040565297043, "grad_norm": 1.3046875, "learning_rate": 4.960835509138381e-07, "logits/chosen": -2.8553032875061035, "logits/rejected": -2.8058464527130127, "logps/chosen": -316.52178955078125, "logps/rejected": -279.59136962890625, "loss": 0.6912, "rewards/accuracies": 0.625, "rewards/chosen": 0.0028482459019869566, "rewards/margins": 0.0039718905463814735, "rewards/rejected": -0.0011236447608098388, "step": 380 }, { "epoch": 0.1020675215912065, "grad_norm": 1.6328125, "learning_rate": 4.999948856244767e-07, "logits/chosen": -2.8345859050750732, "logits/rejected": -2.829207420349121, "logps/chosen": -298.51348876953125, "logps/rejected": -278.06976318359375, "loss": 0.6904, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.004357966594398022, "rewards/margins": 0.00553758442401886, "rewards/rejected": -0.00117961794603616, "step": 390 }, { "epoch": 0.10468463752944256, "grad_norm": 1.25, "learning_rate": 4.999698361256577e-07, "logits/chosen": -2.8570194244384766, "logits/rejected": -2.820826768875122, "logps/chosen": -280.4349670410156, "logps/rejected": -238.0439453125, "loss": 0.6913, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.003575119422748685, "rewards/margins": 0.0037900402676314116, "rewards/rejected": -0.00021492131054401398, "step": 400 }, { "epoch": 0.10468463752944256, "eval_logits/chosen": -2.862203598022461, "eval_logits/rejected": -2.8348591327667236, "eval_logps/chosen": -282.4126892089844, "eval_logps/rejected": -261.5210876464844, "eval_loss": 0.6909525394439697, "eval_rewards/accuracies": 0.6395000219345093, "eval_rewards/chosen": 0.003608107101172209, "eval_rewards/margins": 0.004435193259268999, "eval_rewards/rejected": -0.0008270857506431639, "eval_runtime": 623.9261, "eval_samples_per_second": 3.206, "eval_steps_per_second": 0.401, "step": 400 }, { "epoch": 0.10730175346767862, "grad_norm": 1.34375, "learning_rate": 4.99923914217458e-07, "logits/chosen": -2.8254337310791016, "logits/rejected": -2.810080051422119, "logps/chosen": -257.35760498046875, "logps/rejected": -256.613525390625, "loss": 0.6921, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0025605126284062862, "rewards/margins": 0.0021402649581432343, "rewards/rejected": 0.000420247990405187, "step": 410 }, { "epoch": 0.10991886940591468, "grad_norm": 2.890625, "learning_rate": 4.99857123734344e-07, "logits/chosen": -2.823087215423584, "logits/rejected": -2.776906967163086, "logps/chosen": -245.77804565429688, "logps/rejected": -238.0629119873047, "loss": 0.6908, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.0032352313864976168, "rewards/margins": 0.004795724991708994, "rewards/rejected": -0.0015604936052113771, "step": 420 }, { "epoch": 0.11253598534415074, "grad_norm": 1.5703125, "learning_rate": 4.997694702533016e-07, "logits/chosen": -2.8463032245635986, "logits/rejected": -2.815331220626831, "logps/chosen": -295.5052490234375, "logps/rejected": -272.6297912597656, "loss": 0.6903, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.004903838969767094, "rewards/margins": 0.005789603106677532, "rewards/rejected": -0.0008857650682330132, "step": 430 }, { "epoch": 0.11515310128238682, "grad_norm": 1.2578125, "learning_rate": 4.996609610933712e-07, "logits/chosen": -2.88322114944458, "logits/rejected": -2.861856460571289, "logps/chosen": -286.9052734375, "logps/rejected": -257.013427734375, "loss": 0.6898, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.005403473507612944, "rewards/margins": 0.0067200749181210995, "rewards/rejected": -0.0013166010612621903, "step": 440 }, { "epoch": 0.11777021722062288, "grad_norm": 1.21875, "learning_rate": 4.995316053150366e-07, "logits/chosen": -2.814988374710083, "logits/rejected": -2.8178412914276123, "logps/chosen": -290.4037170410156, "logps/rejected": -260.14459228515625, "loss": 0.6903, "rewards/accuracies": 0.65625, "rewards/chosen": 0.006219993345439434, "rewards/margins": 0.005712195299565792, "rewards/rejected": 0.0005077989189885557, "step": 450 }, { "epoch": 0.12038733315885894, "grad_norm": 2.140625, "learning_rate": 4.99381413719468e-07, "logits/chosen": -2.8341031074523926, "logits/rejected": -2.8203091621398926, "logps/chosen": -282.1357421875, "logps/rejected": -269.10565185546875, "loss": 0.6887, "rewards/accuracies": 0.6875, "rewards/chosen": 0.006647266447544098, "rewards/margins": 0.00890478678047657, "rewards/rejected": -0.0022575196344405413, "step": 460 }, { "epoch": 0.123004449097095, "grad_norm": 1.3046875, "learning_rate": 4.992103988476205e-07, "logits/chosen": -2.846776247024536, "logits/rejected": -2.8195786476135254, "logps/chosen": -259.63775634765625, "logps/rejected": -245.67117309570312, "loss": 0.6905, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.005352762993425131, "rewards/margins": 0.0052968584932386875, "rewards/rejected": 5.59051513846498e-05, "step": 470 }, { "epoch": 0.12562156503533106, "grad_norm": 1.359375, "learning_rate": 4.990185749791864e-07, "logits/chosen": -2.8792474269866943, "logits/rejected": -2.8467297554016113, "logps/chosen": -273.77880859375, "logps/rejected": -274.2110595703125, "loss": 0.6895, "rewards/accuracies": 0.625, "rewards/chosen": 0.005960241891443729, "rewards/margins": 0.007407790515571833, "rewards/rejected": -0.001447548856958747, "step": 480 }, { "epoch": 0.12823868097356714, "grad_norm": 1.3828125, "learning_rate": 4.988059581314039e-07, "logits/chosen": -2.858649730682373, "logits/rejected": -2.8390355110168457, "logps/chosen": -307.80267333984375, "logps/rejected": -269.5003662109375, "loss": 0.6891, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.007140771951526403, "rewards/margins": 0.008223852142691612, "rewards/rejected": -0.0010830799583345652, "step": 490 }, { "epoch": 0.13085579691180318, "grad_norm": 1.34375, "learning_rate": 4.985725660577184e-07, "logits/chosen": -2.8739092350006104, "logits/rejected": -2.8554844856262207, "logps/chosen": -290.1658935546875, "logps/rejected": -249.3054656982422, "loss": 0.689, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.005505991168320179, "rewards/margins": 0.008494934067130089, "rewards/rejected": -0.0029889424331486225, "step": 500 }, { "epoch": 0.13085579691180318, "eval_logits/chosen": -2.8655941486358643, "eval_logits/rejected": -2.838855028152466, "eval_logps/chosen": -282.2831115722656, "eval_logps/rejected": -261.68048095703125, "eval_loss": 0.6895392537117004, "eval_rewards/accuracies": 0.6700000166893005, "eval_rewards/chosen": 0.004903781693428755, "eval_rewards/margins": 0.007324740756303072, "eval_rewards/rejected": -0.00242095859721303, "eval_runtime": 622.8706, "eval_samples_per_second": 3.211, "eval_steps_per_second": 0.401, "step": 500 }, { "epoch": 0.13347291285003926, "grad_norm": 1.546875, "learning_rate": 4.983184182463008e-07, "logits/chosen": -2.8507940769195557, "logits/rejected": -2.828244686126709, "logps/chosen": -294.1042785644531, "logps/rejected": -255.8568572998047, "loss": 0.6886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0062326351180672646, "rewards/margins": 0.009330684319138527, "rewards/rejected": -0.003098049433901906, "step": 510 }, { "epoch": 0.1360900287882753, "grad_norm": 1.546875, "learning_rate": 4.980435359184203e-07, "logits/chosen": -2.8747315406799316, "logits/rejected": -2.8765642642974854, "logps/chosen": -287.198486328125, "logps/rejected": -270.84130859375, "loss": 0.6888, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.006150488276034594, "rewards/margins": 0.008788048289716244, "rewards/rejected": -0.002637560246512294, "step": 520 }, { "epoch": 0.13870714472651138, "grad_norm": 1.6015625, "learning_rate": 4.977479420266723e-07, "logits/chosen": -2.8206260204315186, "logits/rejected": -2.8258962631225586, "logps/chosen": -280.0619201660156, "logps/rejected": -288.264892578125, "loss": 0.6891, "rewards/accuracies": 0.65625, "rewards/chosen": 0.005794334691017866, "rewards/margins": 0.008307529613375664, "rewards/rejected": -0.0025131958536803722, "step": 530 }, { "epoch": 0.14132426066474746, "grad_norm": 1.1953125, "learning_rate": 4.974316612530614e-07, "logits/chosen": -2.813945770263672, "logits/rejected": -2.796184778213501, "logps/chosen": -298.8610534667969, "logps/rejected": -258.8526611328125, "loss": 0.6865, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.0080220652744174, "rewards/margins": 0.01355043239891529, "rewards/rejected": -0.00552836898714304, "step": 540 }, { "epoch": 0.1439413766029835, "grad_norm": 1.3359375, "learning_rate": 4.970947200069415e-07, "logits/chosen": -2.829272747039795, "logits/rejected": -2.816063404083252, "logps/chosen": -298.9976806640625, "logps/rejected": -276.99444580078125, "loss": 0.6894, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005816199816763401, "rewards/margins": 0.007670801132917404, "rewards/rejected": -0.0018546013161540031, "step": 550 }, { "epoch": 0.14655849254121958, "grad_norm": 1.1875, "learning_rate": 4.967371464228095e-07, "logits/chosen": -2.890547513961792, "logits/rejected": -2.869276762008667, "logps/chosen": -271.36053466796875, "logps/rejected": -272.12469482421875, "loss": 0.689, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.005742850713431835, "rewards/margins": 0.00846049003303051, "rewards/rejected": -0.0027176393195986748, "step": 560 }, { "epoch": 0.14917560847945563, "grad_norm": 1.3984375, "learning_rate": 4.963589703579569e-07, "logits/chosen": -2.9156060218811035, "logits/rejected": -2.888892412185669, "logps/chosen": -315.22613525390625, "logps/rejected": -279.62847900390625, "loss": 0.6881, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.007057487033307552, "rewards/margins": 0.010187765583395958, "rewards/rejected": -0.0031302771531045437, "step": 570 }, { "epoch": 0.1517927244176917, "grad_norm": 1.15625, "learning_rate": 4.959602233899761e-07, "logits/chosen": -2.9088664054870605, "logits/rejected": -2.8700101375579834, "logps/chosen": -314.38787841796875, "logps/rejected": -272.05511474609375, "loss": 0.6876, "rewards/accuracies": 0.65625, "rewards/chosen": 0.009098478592932224, "rewards/margins": 0.011330665089190006, "rewards/rejected": -0.002232185797765851, "step": 580 }, { "epoch": 0.15440984035592778, "grad_norm": 1.4609375, "learning_rate": 4.955409388141243e-07, "logits/chosen": -2.843714475631714, "logits/rejected": -2.8304061889648438, "logps/chosen": -275.0677490234375, "logps/rejected": -249.8322296142578, "loss": 0.6884, "rewards/accuracies": 0.625, "rewards/chosen": 0.005214322358369827, "rewards/margins": 0.009766822680830956, "rewards/rejected": -0.004552501253783703, "step": 590 }, { "epoch": 0.15702695629416383, "grad_norm": 1.2109375, "learning_rate": 4.951011516405429e-07, "logits/chosen": -2.858010768890381, "logits/rejected": -2.856701374053955, "logps/chosen": -266.87335205078125, "logps/rejected": -251.0322265625, "loss": 0.6875, "rewards/accuracies": 0.6875, "rewards/chosen": 0.007665629498660564, "rewards/margins": 0.011526472866535187, "rewards/rejected": -0.0038608419708907604, "step": 600 }, { "epoch": 0.15702695629416383, "eval_logits/chosen": -2.860320568084717, "eval_logits/rejected": -2.8332278728485107, "eval_logps/chosen": -282.1841125488281, "eval_logps/rejected": -261.906005859375, "eval_loss": 0.6879660487174988, "eval_rewards/accuracies": 0.6690000295639038, "eval_rewards/chosen": 0.005893694702535868, "eval_rewards/margins": 0.010570226237177849, "eval_rewards/rejected": -0.004676531068980694, "eval_runtime": 622.7218, "eval_samples_per_second": 3.212, "eval_steps_per_second": 0.401, "step": 600 }, { "epoch": 0.1596440722323999, "grad_norm": 1.421875, "learning_rate": 4.946408985913344e-07, "logits/chosen": -2.852583169937134, "logits/rejected": -2.8311781883239746, "logps/chosen": -263.86956787109375, "logps/rejected": -244.29763793945312, "loss": 0.6885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.007280237972736359, "rewards/margins": 0.009582052007317543, "rewards/rejected": -0.002301814965903759, "step": 610 }, { "epoch": 0.16226118817063595, "grad_norm": 1.40625, "learning_rate": 4.941602180974958e-07, "logits/chosen": -2.8539230823516846, "logits/rejected": -2.8148884773254395, "logps/chosen": -304.7937316894531, "logps/rejected": -242.7307891845703, "loss": 0.6874, "rewards/accuracies": 0.6875, "rewards/chosen": 0.007980446331202984, "rewards/margins": 0.011698475107550621, "rewards/rejected": -0.003718029009178281, "step": 620 }, { "epoch": 0.16487830410887203, "grad_norm": 1.296875, "learning_rate": 4.936591502957101e-07, "logits/chosen": -2.857060194015503, "logits/rejected": -2.83305025100708, "logps/chosen": -262.7440490722656, "logps/rejected": -254.7294464111328, "loss": 0.6863, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.00918588787317276, "rewards/margins": 0.014022831805050373, "rewards/rejected": -0.004836943931877613, "step": 630 }, { "epoch": 0.16749542004710807, "grad_norm": 1.2734375, "learning_rate": 4.931377370249945e-07, "logits/chosen": -2.8656134605407715, "logits/rejected": -2.8077850341796875, "logps/chosen": -280.6070251464844, "logps/rejected": -258.4964599609375, "loss": 0.6866, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.005175912287086248, "rewards/margins": 0.013334142975509167, "rewards/rejected": -0.008158231154084206, "step": 640 }, { "epoch": 0.17011253598534415, "grad_norm": 1.3125, "learning_rate": 4.925960218232072e-07, "logits/chosen": -2.84588885307312, "logits/rejected": -2.82362699508667, "logps/chosen": -269.46539306640625, "logps/rejected": -259.7959899902344, "loss": 0.6864, "rewards/accuracies": 0.6875, "rewards/chosen": 0.006764856167137623, "rewards/margins": 0.013819174841046333, "rewards/rejected": -0.007054319139569998, "step": 650 }, { "epoch": 0.17272965192358022, "grad_norm": 2.046875, "learning_rate": 4.920340499234116e-07, "logits/chosen": -2.81691312789917, "logits/rejected": -2.7776267528533936, "logps/chosen": -285.55377197265625, "logps/rejected": -248.0377960205078, "loss": 0.6868, "rewards/accuracies": 0.6875, "rewards/chosen": 0.008050017058849335, "rewards/margins": 0.012922885827720165, "rewards/rejected": -0.0048728687688708305, "step": 660 }, { "epoch": 0.17534676786181627, "grad_norm": 1.1875, "learning_rate": 4.914518682500995e-07, "logits/chosen": -2.8940651416778564, "logits/rejected": -2.864454507827759, "logps/chosen": -299.08636474609375, "logps/rejected": -257.27215576171875, "loss": 0.6848, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.011382282711565495, "rewards/margins": 0.017072781920433044, "rewards/rejected": -0.005690500605851412, "step": 670 }, { "epoch": 0.17796388380005235, "grad_norm": 1.8515625, "learning_rate": 4.90849525415273e-07, "logits/chosen": -2.8536484241485596, "logits/rejected": -2.832000255584717, "logps/chosen": -289.3675231933594, "logps/rejected": -240.21029663085938, "loss": 0.6854, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.010842313058674335, "rewards/margins": 0.01590149477124214, "rewards/rejected": -0.005059181712567806, "step": 680 }, { "epoch": 0.1805809997382884, "grad_norm": 1.3515625, "learning_rate": 4.902270717143858e-07, "logits/chosen": -2.862431049346924, "logits/rejected": -2.8454391956329346, "logps/chosen": -255.1441650390625, "logps/rejected": -264.6862487792969, "loss": 0.685, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.008327770046889782, "rewards/margins": 0.016681838780641556, "rewards/rejected": -0.0083540678024292, "step": 690 }, { "epoch": 0.18319811567652447, "grad_norm": 1.65625, "learning_rate": 4.895845591221426e-07, "logits/chosen": -2.85901141166687, "logits/rejected": -2.8616912364959717, "logps/chosen": -268.5135803222656, "logps/rejected": -264.209716796875, "loss": 0.6874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.00641227001324296, "rewards/margins": 0.011734376661479473, "rewards/rejected": -0.005322106648236513, "step": 700 }, { "epoch": 0.18319811567652447, "eval_logits/chosen": -2.861030340194702, "eval_logits/rejected": -2.8341639041900635, "eval_logps/chosen": -281.93695068359375, "eval_logps/rejected": -261.9841613769531, "eval_loss": 0.686406135559082, "eval_rewards/accuracies": 0.6784999966621399, "eval_rewards/chosen": 0.008365440182387829, "eval_rewards/margins": 0.013823293149471283, "eval_rewards/rejected": -0.005457851104438305, "eval_runtime": 622.4947, "eval_samples_per_second": 3.213, "eval_steps_per_second": 0.402, "step": 700 }, { "epoch": 0.18581523161476055, "grad_norm": 1.4921875, "learning_rate": 4.8892204128816e-07, "logits/chosen": -2.8912875652313232, "logits/rejected": -2.8667664527893066, "logps/chosen": -280.8445739746094, "logps/rejected": -267.3077392578125, "loss": 0.6874, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.006677532102912664, "rewards/margins": 0.011775776743888855, "rewards/rejected": -0.005098243709653616, "step": 710 }, { "epoch": 0.1884323475529966, "grad_norm": 1.1953125, "learning_rate": 4.882395735324863e-07, "logits/chosen": -2.8655221462249756, "logits/rejected": -2.8227005004882812, "logps/chosen": -280.4420166015625, "logps/rejected": -267.8427429199219, "loss": 0.6841, "rewards/accuracies": 0.71875, "rewards/chosen": 0.00945108663290739, "rewards/margins": 0.01858513429760933, "rewards/rejected": -0.009134046733379364, "step": 720 }, { "epoch": 0.19104946349123267, "grad_norm": 1.2578125, "learning_rate": 4.875372128409829e-07, "logits/chosen": -2.8432908058166504, "logits/rejected": -2.813136577606201, "logps/chosen": -282.8307800292969, "logps/rejected": -251.0844268798828, "loss": 0.6851, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.009715097956359386, "rewards/margins": 0.01650545559823513, "rewards/rejected": -0.006790356244891882, "step": 730 }, { "epoch": 0.19366657942946872, "grad_norm": 1.0390625, "learning_rate": 4.868150178605653e-07, "logits/chosen": -2.8426120281219482, "logits/rejected": -2.8164243698120117, "logps/chosen": -242.0150146484375, "logps/rejected": -210.2700653076172, "loss": 0.6844, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0059540546499192715, "rewards/margins": 0.017786705866456032, "rewards/rejected": -0.011832650750875473, "step": 740 }, { "epoch": 0.1962836953677048, "grad_norm": 1.5078125, "learning_rate": 4.860730488943068e-07, "logits/chosen": -2.8057663440704346, "logits/rejected": -2.794985294342041, "logps/chosen": -250.8291778564453, "logps/rejected": -247.78134155273438, "loss": 0.6854, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.010436545126140118, "rewards/margins": 0.0157928504049778, "rewards/rejected": -0.005356303416192532, "step": 750 }, { "epoch": 0.19890081130594087, "grad_norm": 1.578125, "learning_rate": 4.853113678964021e-07, "logits/chosen": -2.8220317363739014, "logits/rejected": -2.8117756843566895, "logps/chosen": -293.8874206542969, "logps/rejected": -279.42962646484375, "loss": 0.6839, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.012539887800812721, "rewards/margins": 0.019003767520189285, "rewards/rejected": -0.006463879253715277, "step": 760 }, { "epoch": 0.20151792724417691, "grad_norm": 1.1484375, "learning_rate": 4.845300384669957e-07, "logits/chosen": -2.839818239212036, "logits/rejected": -2.8078839778900146, "logps/chosen": -269.07232666015625, "logps/rejected": -247.03567504882812, "loss": 0.6861, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.009587548673152924, "rewards/margins": 0.014613436535000801, "rewards/rejected": -0.005025886930525303, "step": 770 }, { "epoch": 0.204135043182413, "grad_norm": 1.3671875, "learning_rate": 4.8372912584687e-07, "logits/chosen": -2.8615410327911377, "logits/rejected": -2.827815055847168, "logps/chosen": -300.0118103027344, "logps/rejected": -276.13140869140625, "loss": 0.6856, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.01048839557915926, "rewards/margins": 0.015544673427939415, "rewards/rejected": -0.005056279711425304, "step": 780 }, { "epoch": 0.20675215912064904, "grad_norm": 1.9140625, "learning_rate": 4.829086969119983e-07, "logits/chosen": -2.827129602432251, "logits/rejected": -2.8344006538391113, "logps/chosen": -273.4031066894531, "logps/rejected": -268.6723937988281, "loss": 0.6875, "rewards/accuracies": 0.625, "rewards/chosen": 0.0065932185389101505, "rewards/margins": 0.011614800430834293, "rewards/rejected": -0.00502158235758543, "step": 790 }, { "epoch": 0.2093692750588851, "grad_norm": 1.265625, "learning_rate": 4.820688201679605e-07, "logits/chosen": -2.8842103481292725, "logits/rejected": -2.836937189102173, "logps/chosen": -276.0343933105469, "logps/rejected": -212.7479705810547, "loss": 0.682, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.015654196962714195, "rewards/margins": 0.022758139297366142, "rewards/rejected": -0.007103943265974522, "step": 800 }, { "epoch": 0.2093692750588851, "eval_logits/chosen": -2.857757568359375, "eval_logits/rejected": -2.8307132720947266, "eval_logps/chosen": -281.70330810546875, "eval_logps/rejected": -262.04193115234375, "eval_loss": 0.6850252151489258, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": 0.010701690800487995, "eval_rewards/margins": 0.016737323254346848, "eval_rewards/rejected": -0.006035633385181427, "eval_runtime": 622.1863, "eval_samples_per_second": 3.214, "eval_steps_per_second": 0.402, "step": 800 }, { "epoch": 0.21198639099712116, "grad_norm": 1.2734375, "learning_rate": 4.812095657442231e-07, "logits/chosen": -2.8657875061035156, "logits/rejected": -2.8741297721862793, "logps/chosen": -288.73614501953125, "logps/rejected": -281.89031982421875, "loss": 0.6884, "rewards/accuracies": 0.59375, "rewards/chosen": 0.008292586542665958, "rewards/margins": 0.010020687244832516, "rewards/rejected": -0.001728100934997201, "step": 810 }, { "epoch": 0.21460350693535724, "grad_norm": 1.34375, "learning_rate": 4.803310053882831e-07, "logits/chosen": -2.851111650466919, "logits/rejected": -2.863678455352783, "logps/chosen": -248.24319458007812, "logps/rejected": -259.7985534667969, "loss": 0.6865, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.009849630296230316, "rewards/margins": 0.013707734644412994, "rewards/rejected": -0.003858105046674609, "step": 820 }, { "epoch": 0.2172206228735933, "grad_norm": 1.484375, "learning_rate": 4.794332124596775e-07, "logits/chosen": -2.881307363510132, "logits/rejected": -2.8686277866363525, "logps/chosen": -284.14605712890625, "logps/rejected": -279.83477783203125, "loss": 0.6849, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.014169754460453987, "rewards/margins": 0.017167720943689346, "rewards/rejected": -0.00299796718172729, "step": 830 }, { "epoch": 0.21983773881182936, "grad_norm": 1.5859375, "learning_rate": 4.785162619238574e-07, "logits/chosen": -2.824626922607422, "logits/rejected": -2.7853519916534424, "logps/chosen": -269.13934326171875, "logps/rejected": -243.8739471435547, "loss": 0.6831, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.011793789453804493, "rewards/margins": 0.020573901012539864, "rewards/rejected": -0.008780110627412796, "step": 840 }, { "epoch": 0.22245485475006543, "grad_norm": 1.625, "learning_rate": 4.775802303459287e-07, "logits/chosen": -2.8298261165618896, "logits/rejected": -2.8166985511779785, "logps/chosen": -262.9281311035156, "logps/rejected": -260.26129150390625, "loss": 0.686, "rewards/accuracies": 0.65625, "rewards/chosen": 0.010038264095783234, "rewards/margins": 0.014955776743590832, "rewards/rejected": -0.004917514510452747, "step": 850 }, { "epoch": 0.22507197068830148, "grad_norm": 1.8125, "learning_rate": 4.766251958842589e-07, "logits/chosen": -2.800764799118042, "logits/rejected": -2.7910823822021484, "logps/chosen": -290.7630615234375, "logps/rejected": -278.87103271484375, "loss": 0.6842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.012506058439612389, "rewards/margins": 0.018407398834824562, "rewards/rejected": -0.005901341326534748, "step": 860 }, { "epoch": 0.22768908662653756, "grad_norm": 1.21875, "learning_rate": 4.756512382839506e-07, "logits/chosen": -2.82393217086792, "logits/rejected": -2.7991511821746826, "logps/chosen": -269.1101989746094, "logps/rejected": -271.6678161621094, "loss": 0.6838, "rewards/accuracies": 0.71875, "rewards/chosen": 0.01023150235414505, "rewards/margins": 0.01938125677406788, "rewards/rejected": -0.009149751625955105, "step": 870 }, { "epoch": 0.23030620256477363, "grad_norm": 1.2109375, "learning_rate": 4.746584388701831e-07, "logits/chosen": -2.840731620788574, "logits/rejected": -2.840230941772461, "logps/chosen": -278.5453186035156, "logps/rejected": -264.83380126953125, "loss": 0.6835, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.014049595221877098, "rewards/margins": 0.019881747663021088, "rewards/rejected": -0.005832154303789139, "step": 880 }, { "epoch": 0.23292331850300968, "grad_norm": 1.75, "learning_rate": 4.736468805414218e-07, "logits/chosen": -2.811013698577881, "logits/rejected": -2.8108229637145996, "logps/chosen": -266.8439636230469, "logps/rejected": -278.64306640625, "loss": 0.6825, "rewards/accuracies": 0.6875, "rewards/chosen": 0.013364280574023724, "rewards/margins": 0.021861828863620758, "rewards/rejected": -0.008497546426951885, "step": 890 }, { "epoch": 0.23554043444124576, "grad_norm": 1.3359375, "learning_rate": 4.7261664776249595e-07, "logits/chosen": -2.783407211303711, "logits/rejected": -2.7699193954467773, "logps/chosen": -245.31137084960938, "logps/rejected": -236.346923828125, "loss": 0.6837, "rewards/accuracies": 0.6875, "rewards/chosen": 0.008871756494045258, "rewards/margins": 0.01949758641421795, "rewards/rejected": -0.010625829920172691, "step": 900 }, { "epoch": 0.23554043444124576, "eval_logits/chosen": -2.8573083877563477, "eval_logits/rejected": -2.8303797245025635, "eval_logps/chosen": -281.4179992675781, "eval_logps/rejected": -261.9797058105469, "eval_loss": 0.6839740872383118, "eval_rewards/accuracies": 0.6840000152587891, "eval_rewards/chosen": 0.013555029407143593, "eval_rewards/margins": 0.01896839775145054, "eval_rewards/rejected": -0.005413366016000509, "eval_runtime": 621.6447, "eval_samples_per_second": 3.217, "eval_steps_per_second": 0.402, "step": 900 }, { "epoch": 0.2381575503794818, "grad_norm": 1.203125, "learning_rate": 4.7156782655754624e-07, "logits/chosen": -2.847557544708252, "logits/rejected": -2.807833194732666, "logps/chosen": -298.7544250488281, "logps/rejected": -243.92807006835938, "loss": 0.6823, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.01836327835917473, "rewards/margins": 0.0223236046731472, "rewards/rejected": -0.003960323985666037, "step": 910 }, { "epoch": 0.24077466631771788, "grad_norm": 1.25, "learning_rate": 4.705005045028414e-07, "logits/chosen": -2.8043971061706543, "logits/rejected": -2.775317430496216, "logps/chosen": -279.80584716796875, "logps/rejected": -261.52227783203125, "loss": 0.6838, "rewards/accuracies": 0.6875, "rewards/chosen": 0.014174017123878002, "rewards/margins": 0.01934727467596531, "rewards/rejected": -0.005173257552087307, "step": 920 }, { "epoch": 0.24339178225595393, "grad_norm": 1.3984375, "learning_rate": 4.694147707194659e-07, "logits/chosen": -2.871277332305908, "logits/rejected": -2.861692428588867, "logps/chosen": -286.360107421875, "logps/rejected": -268.2128601074219, "loss": 0.6821, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.016537340357899666, "rewards/margins": 0.022989634424448013, "rewards/rejected": -0.00645229360088706, "step": 930 }, { "epoch": 0.24600889819419, "grad_norm": 2.578125, "learning_rate": 4.683107158658781e-07, "logits/chosen": -2.818206548690796, "logits/rejected": -2.799956798553467, "logps/chosen": -306.58203125, "logps/rejected": -278.14752197265625, "loss": 0.678, "rewards/accuracies": 0.71875, "rewards/chosen": 0.021548133343458176, "rewards/margins": 0.03120315447449684, "rewards/rejected": -0.009655019268393517, "step": 940 }, { "epoch": 0.24862601413242608, "grad_norm": 1.4609375, "learning_rate": 4.6718843213034066e-07, "logits/chosen": -2.831376314163208, "logits/rejected": -2.815030097961426, "logps/chosen": -261.57659912109375, "logps/rejected": -250.5101318359375, "loss": 0.6823, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.014541561715304852, "rewards/margins": 0.022364726290106773, "rewards/rejected": -0.007823166437447071, "step": 950 }, { "epoch": 0.2512431300706621, "grad_norm": 1.3984375, "learning_rate": 4.660480132232224e-07, "logits/chosen": -2.8427586555480957, "logits/rejected": -2.8429322242736816, "logps/chosen": -285.56378173828125, "logps/rejected": -263.73260498046875, "loss": 0.6849, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.014430510811507702, "rewards/margins": 0.01709624193608761, "rewards/rejected": -0.002665730658918619, "step": 960 }, { "epoch": 0.25386024600889817, "grad_norm": 1.1484375, "learning_rate": 4.64889554369174e-07, "logits/chosen": -2.8453879356384277, "logits/rejected": -2.8087105751037598, "logps/chosen": -296.9171142578125, "logps/rejected": -250.25, "loss": 0.6784, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.022362882271409035, "rewards/margins": 0.030544385313987732, "rewards/rejected": -0.008181498385965824, "step": 970 }, { "epoch": 0.2564773619471343, "grad_norm": 1.328125, "learning_rate": 4.637131522991764e-07, "logits/chosen": -2.8417975902557373, "logits/rejected": -2.8379454612731934, "logps/chosen": -304.47784423828125, "logps/rejected": -279.02911376953125, "loss": 0.6831, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01869601011276245, "rewards/margins": 0.02095440961420536, "rewards/rejected": -0.002258400898426771, "step": 980 }, { "epoch": 0.2590944778853703, "grad_norm": 1.171875, "learning_rate": 4.6251890524246375e-07, "logits/chosen": -2.8454818725585938, "logits/rejected": -2.8245933055877686, "logps/chosen": -253.8804931640625, "logps/rejected": -232.57418823242188, "loss": 0.6802, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.015828203409910202, "rewards/margins": 0.02681964635848999, "rewards/rejected": -0.010991444811224937, "step": 990 }, { "epoch": 0.26171159382360637, "grad_norm": 1.21875, "learning_rate": 4.613069129183218e-07, "logits/chosen": -2.8802895545959473, "logits/rejected": -2.840637683868408, "logps/chosen": -319.56268310546875, "logps/rejected": -281.52264404296875, "loss": 0.6819, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.018990501761436462, "rewards/margins": 0.023325005546212196, "rewards/rejected": -0.004334500525146723, "step": 1000 }, { "epoch": 0.26171159382360637, "eval_logits/chosen": -2.853975534439087, "eval_logits/rejected": -2.8269295692443848, "eval_logps/chosen": -281.1678466796875, "eval_logps/rejected": -261.9830017089844, "eval_loss": 0.6827893257141113, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": 0.016056543216109276, "eval_rewards/margins": 0.021502956748008728, "eval_rewards/rejected": -0.005446411669254303, "eval_runtime": 621.9705, "eval_samples_per_second": 3.216, "eval_steps_per_second": 0.402, "step": 1000 }, { "epoch": 0.2643287097618425, "grad_norm": 1.265625, "learning_rate": 4.6007727652776065e-07, "logits/chosen": -2.8141977787017822, "logits/rejected": -2.7999210357666016, "logps/chosen": -249.3959197998047, "logps/rejected": -245.47830200195312, "loss": 0.6824, "rewards/accuracies": 0.6875, "rewards/chosen": 0.016399413347244263, "rewards/margins": 0.02237236499786377, "rewards/rejected": -0.0059729525819420815, "step": 1010 }, { "epoch": 0.2669458257000785, "grad_norm": 1.578125, "learning_rate": 4.588300987450652e-07, "logits/chosen": -2.86116099357605, "logits/rejected": -2.8352913856506348, "logps/chosen": -268.90374755859375, "logps/rejected": -237.9334259033203, "loss": 0.682, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.016041552647948265, "rewards/margins": 0.02327124960720539, "rewards/rejected": -0.007229696027934551, "step": 1020 }, { "epoch": 0.26956294163831457, "grad_norm": 1.0859375, "learning_rate": 4.5756548370922134e-07, "logits/chosen": -2.8210678100585938, "logits/rejected": -2.8014864921569824, "logps/chosen": -254.616455078125, "logps/rejected": -245.8887939453125, "loss": 0.6857, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.01615295186638832, "rewards/margins": 0.015799041837453842, "rewards/rejected": 0.0003539065073709935, "step": 1030 }, { "epoch": 0.2721800575765506, "grad_norm": 1.1328125, "learning_rate": 4.5628353701522047e-07, "logits/chosen": -2.8566808700561523, "logits/rejected": -2.823983669281006, "logps/chosen": -317.4922180175781, "logps/rejected": -287.9608154296875, "loss": 0.6771, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.024755671620368958, "rewards/margins": 0.03346611559391022, "rewards/rejected": -0.00871044397354126, "step": 1040 }, { "epoch": 0.2747971735147867, "grad_norm": 1.6875, "learning_rate": 4.549843657052429e-07, "logits/chosen": -2.8731515407562256, "logits/rejected": -2.845651388168335, "logps/chosen": -282.2266540527344, "logps/rejected": -279.3399658203125, "loss": 0.677, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02262326516211033, "rewards/margins": 0.03333864361047745, "rewards/rejected": -0.010715381242334843, "step": 1050 }, { "epoch": 0.27741428945302277, "grad_norm": 1.140625, "learning_rate": 4.5366807825971907e-07, "logits/chosen": -2.8223326206207275, "logits/rejected": -2.813047409057617, "logps/chosen": -252.23336791992188, "logps/rejected": -246.99490356445312, "loss": 0.6827, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.015223952010273933, "rewards/margins": 0.02188166417181492, "rewards/rejected": -0.006657709833234549, "step": 1060 }, { "epoch": 0.2800314053912588, "grad_norm": 1.2734375, "learning_rate": 4.5233478458827176e-07, "logits/chosen": -2.856292247772217, "logits/rejected": -2.8288464546203613, "logps/chosen": -306.0169372558594, "logps/rejected": -254.22042846679688, "loss": 0.6782, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.0232028029859066, "rewards/margins": 0.03090915083885193, "rewards/rejected": -0.007706350646913052, "step": 1070 }, { "epoch": 0.2826485213294949, "grad_norm": 1.328125, "learning_rate": 4.509845960205389e-07, "logits/chosen": -2.791149854660034, "logits/rejected": -2.7939980030059814, "logps/chosen": -294.93231201171875, "logps/rejected": -263.44830322265625, "loss": 0.6818, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.020023692399263382, "rewards/margins": 0.023503463715314865, "rewards/rejected": -0.0034797731786966324, "step": 1080 }, { "epoch": 0.28526563726773096, "grad_norm": 1.765625, "learning_rate": 4.4961762529687736e-07, "logits/chosen": -2.8478240966796875, "logits/rejected": -2.8274638652801514, "logps/chosen": -277.98541259765625, "logps/rejected": -260.0039978027344, "loss": 0.6833, "rewards/accuracies": 0.65625, "rewards/chosen": 0.01708926074206829, "rewards/margins": 0.020394863560795784, "rewards/rejected": -0.0033055986277759075, "step": 1090 }, { "epoch": 0.287882753205967, "grad_norm": 1.2890625, "learning_rate": 4.482339865589492e-07, "logits/chosen": -2.8545358180999756, "logits/rejected": -2.8111648559570312, "logps/chosen": -281.62652587890625, "logps/rejected": -238.7964324951172, "loss": 0.6836, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.013560217805206776, "rewards/margins": 0.019967440515756607, "rewards/rejected": -0.006407222244888544, "step": 1100 }, { "epoch": 0.287882753205967, "eval_logits/chosen": -2.852851152420044, "eval_logits/rejected": -2.825786590576172, "eval_logps/chosen": -280.98529052734375, "eval_logps/rejected": -262.00518798828125, "eval_loss": 0.6818436980247498, "eval_rewards/accuracies": 0.6784999966621399, "eval_rewards/chosen": 0.01788218505680561, "eval_rewards/margins": 0.023550525307655334, "eval_rewards/rejected": -0.00566834257915616, "eval_runtime": 621.9928, "eval_samples_per_second": 3.215, "eval_steps_per_second": 0.402, "step": 1100 }, { "epoch": 0.2904998691442031, "grad_norm": 1.2890625, "learning_rate": 4.4683379534019076e-07, "logits/chosen": -2.8489432334899902, "logits/rejected": -2.8446106910705566, "logps/chosen": -284.64337158203125, "logps/rejected": -280.3296203613281, "loss": 0.6832, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.01724550686776638, "rewards/margins": 0.020626548677682877, "rewards/rejected": -0.0033810404129326344, "step": 1110 }, { "epoch": 0.29311698508243916, "grad_norm": 1.28125, "learning_rate": 4.4541716855616593e-07, "logits/chosen": -2.822422742843628, "logits/rejected": -2.8002758026123047, "logps/chosen": -256.53265380859375, "logps/rejected": -259.35723876953125, "loss": 0.684, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.01423877477645874, "rewards/margins": 0.019155513495206833, "rewards/rejected": -0.004916741047054529, "step": 1120 }, { "epoch": 0.2957341010206752, "grad_norm": 1.203125, "learning_rate": 4.4398422449480357e-07, "logits/chosen": -2.8172736167907715, "logits/rejected": -2.7678089141845703, "logps/chosen": -278.84429931640625, "logps/rejected": -282.52862548828125, "loss": 0.6835, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.014500883407890797, "rewards/margins": 0.020229389891028404, "rewards/rejected": -0.0057285078801214695, "step": 1130 }, { "epoch": 0.29835121695891126, "grad_norm": 1.1953125, "learning_rate": 4.4253508280652036e-07, "logits/chosen": -2.838125705718994, "logits/rejected": -2.791625738143921, "logps/chosen": -301.6747741699219, "logps/rejected": -253.36123657226562, "loss": 0.6789, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.02180757373571396, "rewards/margins": 0.029516074806451797, "rewards/rejected": -0.007708498742431402, "step": 1140 }, { "epoch": 0.30096833289714736, "grad_norm": 1.1640625, "learning_rate": 4.410698644942302e-07, "logits/chosen": -2.879281997680664, "logits/rejected": -2.851644277572632, "logps/chosen": -285.1121520996094, "logps/rejected": -263.08416748046875, "loss": 0.6796, "rewards/accuracies": 0.71875, "rewards/chosen": 0.022570660337805748, "rewards/margins": 0.02802709862589836, "rewards/rejected": -0.005456441547721624, "step": 1150 }, { "epoch": 0.3035854488353834, "grad_norm": 1.546875, "learning_rate": 4.3958869190324057e-07, "logits/chosen": -2.8084969520568848, "logits/rejected": -2.7690627574920654, "logps/chosen": -277.8807067871094, "logps/rejected": -252.42825317382812, "loss": 0.6801, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.018787220120429993, "rewards/margins": 0.027080217376351357, "rewards/rejected": -0.008292997255921364, "step": 1160 }, { "epoch": 0.30620256477361946, "grad_norm": 1.4140625, "learning_rate": 4.380916887110365e-07, "logits/chosen": -2.869062900543213, "logits/rejected": -2.8376870155334473, "logps/chosen": -273.46063232421875, "logps/rejected": -233.0785675048828, "loss": 0.6807, "rewards/accuracies": 0.71875, "rewards/chosen": 0.01758132129907608, "rewards/margins": 0.026008691638708115, "rewards/rejected": -0.008427368476986885, "step": 1170 }, { "epoch": 0.30881968071185556, "grad_norm": 1.3046875, "learning_rate": 4.3657897991695394e-07, "logits/chosen": -2.7800498008728027, "logits/rejected": -2.8185646533966064, "logps/chosen": -268.2494201660156, "logps/rejected": -269.7010498046875, "loss": 0.6818, "rewards/accuracies": 0.6875, "rewards/chosen": 0.019908545538783073, "rewards/margins": 0.023755352944135666, "rewards/rejected": -0.0038468041457235813, "step": 1180 }, { "epoch": 0.3114367966500916, "grad_norm": 1.2265625, "learning_rate": 4.350506918317416e-07, "logits/chosen": -2.856717824935913, "logits/rejected": -2.824436664581299, "logps/chosen": -260.1253967285156, "logps/rejected": -256.15118408203125, "loss": 0.6829, "rewards/accuracies": 0.625, "rewards/chosen": 0.017839016392827034, "rewards/margins": 0.021472811698913574, "rewards/rejected": -0.003633796004578471, "step": 1190 }, { "epoch": 0.31405391258832765, "grad_norm": 1.515625, "learning_rate": 4.335069520670149e-07, "logits/chosen": -2.8279824256896973, "logits/rejected": -2.802241325378418, "logps/chosen": -241.59786987304688, "logps/rejected": -255.2316131591797, "loss": 0.685, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.017605960369110107, "rewards/margins": 0.01734979636967182, "rewards/rejected": 0.0002561651053838432, "step": 1200 }, { "epoch": 0.31405391258832765, "eval_logits/chosen": -2.8509681224823, "eval_logits/rejected": -2.8237783908843994, "eval_logps/chosen": -280.56793212890625, "eval_logps/rejected": -261.7609558105469, "eval_loss": 0.6810342073440552, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": 0.022055484354496002, "eval_rewards/margins": 0.02528143860399723, "eval_rewards/rejected": -0.003225954482331872, "eval_runtime": 623.5714, "eval_samples_per_second": 3.207, "eval_steps_per_second": 0.401, "step": 1200 }, { "epoch": 0.3166710285265637, "grad_norm": 1.2578125, "learning_rate": 4.319478895245999e-07, "logits/chosen": -2.846019744873047, "logits/rejected": -2.8138914108276367, "logps/chosen": -262.89471435546875, "logps/rejected": -238.1546173095703, "loss": 0.6785, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.02249886468052864, "rewards/margins": 0.030421704053878784, "rewards/rejected": -0.007922842167317867, "step": 1210 }, { "epoch": 0.3192881444647998, "grad_norm": 1.359375, "learning_rate": 4.3037363438577036e-07, "logits/chosen": -2.8656246662139893, "logits/rejected": -2.828981399536133, "logps/chosen": -269.7254943847656, "logps/rejected": -284.9521789550781, "loss": 0.6799, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.025678550824522972, "rewards/margins": 0.027557630091905594, "rewards/rejected": -0.001879077055491507, "step": 1220 }, { "epoch": 0.32190526040303585, "grad_norm": 1.3671875, "learning_rate": 4.2878431810037716e-07, "logits/chosen": -2.8651747703552246, "logits/rejected": -2.851069927215576, "logps/chosen": -309.12872314453125, "logps/rejected": -264.8939514160156, "loss": 0.6775, "rewards/accuracies": 0.6875, "rewards/chosen": 0.025112558156251907, "rewards/margins": 0.03247564285993576, "rewards/rejected": -0.007363081909716129, "step": 1230 }, { "epoch": 0.3245223763412719, "grad_norm": 1.34375, "learning_rate": 4.271800733758729e-07, "logits/chosen": -2.838114023208618, "logits/rejected": -2.8366000652313232, "logps/chosen": -301.5270690917969, "logps/rejected": -268.2883605957031, "loss": 0.6769, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.027721276506781578, "rewards/margins": 0.03389520198106766, "rewards/rejected": -0.006173927802592516, "step": 1240 }, { "epoch": 0.327139492279508, "grad_norm": 1.3359375, "learning_rate": 4.255610341662304e-07, "logits/chosen": -2.863908529281616, "logits/rejected": -2.806837797164917, "logps/chosen": -273.0283203125, "logps/rejected": -253.21798706054688, "loss": 0.6806, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.023084726184606552, "rewards/margins": 0.026232142001390457, "rewards/rejected": -0.003147417213767767, "step": 1250 }, { "epoch": 0.32975660821774405, "grad_norm": 1.4453125, "learning_rate": 4.2392733566075757e-07, "logits/chosen": -2.8411412239074707, "logits/rejected": -2.812453508377075, "logps/chosen": -271.3999938964844, "logps/rejected": -254.3863983154297, "loss": 0.6838, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.021707288920879364, "rewards/margins": 0.019517619162797928, "rewards/rejected": 0.002189669292420149, "step": 1260 }, { "epoch": 0.3323737241559801, "grad_norm": 1.2265625, "learning_rate": 4.2227911427280973e-07, "logits/chosen": -2.8064496517181396, "logits/rejected": -2.777616500854492, "logps/chosen": -263.7330627441406, "logps/rejected": -234.1592559814453, "loss": 0.6807, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0242301132529974, "rewards/margins": 0.026022180914878845, "rewards/rejected": -0.0017920676618814468, "step": 1270 }, { "epoch": 0.33499084009421615, "grad_norm": 1.359375, "learning_rate": 4.206165076283982e-07, "logits/chosen": -2.8404831886291504, "logits/rejected": -2.81711483001709, "logps/chosen": -259.4391174316406, "logps/rejected": -243.4575653076172, "loss": 0.6796, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.02131321281194687, "rewards/margins": 0.028222400695085526, "rewards/rejected": -0.00690918555483222, "step": 1280 }, { "epoch": 0.33760795603245225, "grad_norm": 1.4296875, "learning_rate": 4.1893965455469946e-07, "logits/chosen": -2.855498790740967, "logits/rejected": -2.832735538482666, "logps/chosen": -263.16021728515625, "logps/rejected": -243.6949462890625, "loss": 0.6818, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.01883005164563656, "rewards/margins": 0.02386125549674034, "rewards/rejected": -0.005031202454119921, "step": 1290 }, { "epoch": 0.3402250719706883, "grad_norm": 1.1328125, "learning_rate": 4.172486950684626e-07, "logits/chosen": -2.851036548614502, "logits/rejected": -2.842118501663208, "logps/chosen": -266.0923767089844, "logps/rejected": -266.4703369140625, "loss": 0.6785, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.02591409906744957, "rewards/margins": 0.030536871403455734, "rewards/rejected": -0.004622773267328739, "step": 1300 }, { "epoch": 0.3402250719706883, "eval_logits/chosen": -2.85286283493042, "eval_logits/rejected": -2.825887441635132, "eval_logps/chosen": -280.6852111816406, "eval_logps/rejected": -262.0453186035156, "eval_loss": 0.6802608966827393, "eval_rewards/accuracies": 0.6840000152587891, "eval_rewards/chosen": 0.02088269591331482, "eval_rewards/margins": 0.026952272281050682, "eval_rewards/rejected": -0.006069576367735863, "eval_runtime": 623.414, "eval_samples_per_second": 3.208, "eval_steps_per_second": 0.401, "step": 1300 }, { "epoch": 0.34284218790892435, "grad_norm": 1.265625, "learning_rate": 4.155437703643181e-07, "logits/chosen": -2.877864360809326, "logits/rejected": -2.8368418216705322, "logps/chosen": -258.5326843261719, "logps/rejected": -233.0809326171875, "loss": 0.6764, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.025921067222952843, "rewards/margins": 0.034750454127788544, "rewards/rejected": -0.008829386904835701, "step": 1310 }, { "epoch": 0.34545930384716045, "grad_norm": 1.21875, "learning_rate": 4.138250228029881e-07, "logits/chosen": -2.8507418632507324, "logits/rejected": -2.8341784477233887, "logps/chosen": -270.1946716308594, "logps/rejected": -279.32489013671875, "loss": 0.6829, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.018970279023051262, "rewards/margins": 0.021766219288110733, "rewards/rejected": -0.002795940963551402, "step": 1320 }, { "epoch": 0.3480764197853965, "grad_norm": 1.1796875, "learning_rate": 4.1209259589939935e-07, "logits/chosen": -2.8348724842071533, "logits/rejected": -2.8295791149139404, "logps/chosen": -247.55770874023438, "logps/rejected": -242.4851837158203, "loss": 0.6819, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.022017816081643105, "rewards/margins": 0.02364877425134182, "rewards/rejected": -0.00163096038158983, "step": 1330 }, { "epoch": 0.35069353572363254, "grad_norm": 1.2578125, "learning_rate": 4.103466343106998e-07, "logits/chosen": -2.868483543395996, "logits/rejected": -2.855750560760498, "logps/chosen": -287.6753845214844, "logps/rejected": -257.07257080078125, "loss": 0.6827, "rewards/accuracies": 0.65625, "rewards/chosen": 0.01955850049853325, "rewards/margins": 0.02184746228158474, "rewards/rejected": -0.002288959687575698, "step": 1340 }, { "epoch": 0.35331065166186865, "grad_norm": 1.1328125, "learning_rate": 4.085872838241796e-07, "logits/chosen": -2.8042919635772705, "logits/rejected": -2.765010356903076, "logps/chosen": -293.25677490234375, "logps/rejected": -261.722412109375, "loss": 0.6818, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.015371786430478096, "rewards/margins": 0.023923274129629135, "rewards/rejected": -0.008551487699151039, "step": 1350 }, { "epoch": 0.3559277676001047, "grad_norm": 1.390625, "learning_rate": 4.06814691345098e-07, "logits/chosen": -2.7857346534729004, "logits/rejected": -2.756810426712036, "logps/chosen": -272.96185302734375, "logps/rejected": -253.2128448486328, "loss": 0.6801, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.02000296302139759, "rewards/margins": 0.027280131354928017, "rewards/rejected": -0.00727717112749815, "step": 1360 }, { "epoch": 0.35854488353834074, "grad_norm": 1.6328125, "learning_rate": 4.0502900488441707e-07, "logits/chosen": -2.8389689922332764, "logits/rejected": -2.8258564472198486, "logps/chosen": -283.4964294433594, "logps/rejected": -280.92559814453125, "loss": 0.6809, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0220597293227911, "rewards/margins": 0.025511642917990685, "rewards/rejected": -0.003451913595199585, "step": 1370 }, { "epoch": 0.3611619994765768, "grad_norm": 1.3046875, "learning_rate": 4.032303735464422e-07, "logits/chosen": -2.9172284603118896, "logits/rejected": -2.868638753890991, "logps/chosen": -287.5843811035156, "logps/rejected": -264.72808837890625, "loss": 0.6775, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.02606380544602871, "rewards/margins": 0.032886773347854614, "rewards/rejected": -0.006822962313890457, "step": 1380 }, { "epoch": 0.3637791154148129, "grad_norm": 1.4296875, "learning_rate": 4.014189475163726e-07, "logits/chosen": -2.8349239826202393, "logits/rejected": -2.8192358016967773, "logps/chosen": -270.93731689453125, "logps/rejected": -261.95611572265625, "loss": 0.6779, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.019413406029343605, "rewards/margins": 0.0317561998963356, "rewards/rejected": -0.012342792935669422, "step": 1390 }, { "epoch": 0.36639623135304894, "grad_norm": 1.59375, "learning_rate": 3.995948780477605e-07, "logits/chosen": -2.8566317558288574, "logits/rejected": -2.821171998977661, "logps/chosen": -283.40374755859375, "logps/rejected": -261.3340148925781, "loss": 0.6828, "rewards/accuracies": 0.65625, "rewards/chosen": 0.015268507413566113, "rewards/margins": 0.022029511630535126, "rewards/rejected": -0.0067610046826303005, "step": 1400 }, { "epoch": 0.36639623135304894, "eval_logits/chosen": -2.850475549697876, "eval_logits/rejected": -2.8233399391174316, "eval_logps/chosen": -280.606201171875, "eval_logps/rejected": -262.1006774902344, "eval_loss": 0.6796398758888245, "eval_rewards/accuracies": 0.6865000128746033, "eval_rewards/chosen": 0.02167338877916336, "eval_rewards/margins": 0.028296444565057755, "eval_rewards/rejected": -0.006623056251555681, "eval_runtime": 622.3734, "eval_samples_per_second": 3.214, "eval_steps_per_second": 0.402, "step": 1400 }, { "epoch": 0.369013347291285, "grad_norm": 1.3515625, "learning_rate": 3.977583174498816e-07, "logits/chosen": -2.856717824935913, "logits/rejected": -2.8365931510925293, "logps/chosen": -283.3439025878906, "logps/rejected": -262.3489990234375, "loss": 0.676, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.025825675576925278, "rewards/margins": 0.03570712357759476, "rewards/rejected": -0.009881444275379181, "step": 1410 }, { "epoch": 0.3716304632295211, "grad_norm": 1.3984375, "learning_rate": 3.9590941907501717e-07, "logits/chosen": -2.86562180519104, "logits/rejected": -2.84371280670166, "logps/chosen": -298.9716796875, "logps/rejected": -272.981689453125, "loss": 0.676, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.03262133151292801, "rewards/margins": 0.03562031686306, "rewards/rejected": -0.002998985815793276, "step": 1420 }, { "epoch": 0.37424757916775714, "grad_norm": 2.0625, "learning_rate": 3.9404833730564974e-07, "logits/chosen": -2.766550302505493, "logits/rejected": -2.7500851154327393, "logps/chosen": -270.97052001953125, "logps/rejected": -261.961181640625, "loss": 0.6788, "rewards/accuracies": 0.71875, "rewards/chosen": 0.02307405136525631, "rewards/margins": 0.0301786120980978, "rewards/rejected": -0.007104557007551193, "step": 1430 }, { "epoch": 0.3768646951059932, "grad_norm": 1.296875, "learning_rate": 3.9217522754157117e-07, "logits/chosen": -2.84255051612854, "logits/rejected": -2.838588237762451, "logps/chosen": -266.6410827636719, "logps/rejected": -246.63916015625, "loss": 0.6756, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.02127646468579769, "rewards/margins": 0.036509204655885696, "rewards/rejected": -0.015232739970088005, "step": 1440 }, { "epoch": 0.37948181104422923, "grad_norm": 1.296875, "learning_rate": 3.9029024618690785e-07, "logits/chosen": -2.854072093963623, "logits/rejected": -2.8262181282043457, "logps/chosen": -253.68600463867188, "logps/rejected": -238.5004425048828, "loss": 0.6816, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.01315716840326786, "rewards/margins": 0.024385813623666763, "rewards/rejected": -0.011228645220398903, "step": 1450 }, { "epoch": 0.38209892698246534, "grad_norm": 1.4609375, "learning_rate": 3.883935506370605e-07, "logits/chosen": -2.816702365875244, "logits/rejected": -2.8033461570739746, "logps/chosen": -267.4139404296875, "logps/rejected": -239.7705535888672, "loss": 0.6758, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.027285242453217506, "rewards/margins": 0.036063503473997116, "rewards/rejected": -0.008778261952102184, "step": 1460 }, { "epoch": 0.3847160429207014, "grad_norm": 1.734375, "learning_rate": 3.864852992655616e-07, "logits/chosen": -2.8303184509277344, "logits/rejected": -2.813981533050537, "logps/chosen": -266.30218505859375, "logps/rejected": -254.7268524169922, "loss": 0.6749, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.02994285151362419, "rewards/margins": 0.03793361037969589, "rewards/rejected": -0.007990758866071701, "step": 1470 }, { "epoch": 0.38733315885893743, "grad_norm": 1.140625, "learning_rate": 3.845656514108515e-07, "logits/chosen": -2.8377737998962402, "logits/rejected": -2.815836191177368, "logps/chosen": -279.17205810546875, "logps/rejected": -219.8304901123047, "loss": 0.6792, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02100013568997383, "rewards/margins": 0.029268179088830948, "rewards/rejected": -0.008268042467534542, "step": 1480 }, { "epoch": 0.38995027479717354, "grad_norm": 1.4609375, "learning_rate": 3.8263476736297375e-07, "logits/chosen": -2.8322536945343018, "logits/rejected": -2.7860350608825684, "logps/chosen": -266.8233947753906, "logps/rejected": -243.4827880859375, "loss": 0.6787, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.018916089087724686, "rewards/margins": 0.03032476268708706, "rewards/rejected": -0.011408672668039799, "step": 1490 }, { "epoch": 0.3925673907354096, "grad_norm": 1.9453125, "learning_rate": 3.8069280835019055e-07, "logits/chosen": -2.822990894317627, "logits/rejected": -2.789095878601074, "logps/chosen": -282.0010070800781, "logps/rejected": -262.74383544921875, "loss": 0.6795, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.024773618206381798, "rewards/margins": 0.028730124235153198, "rewards/rejected": -0.0039565060287714005, "step": 1500 }, { "epoch": 0.3925673907354096, "eval_logits/chosen": -2.851984977722168, "eval_logits/rejected": -2.825019121170044, "eval_logps/chosen": -280.51751708984375, "eval_logps/rejected": -262.1142883300781, "eval_loss": 0.6791806221008301, "eval_rewards/accuracies": 0.6830000281333923, "eval_rewards/chosen": 0.022559717297554016, "eval_rewards/margins": 0.029319126158952713, "eval_rewards/rejected": -0.0067594097927212715, "eval_runtime": 621.2203, "eval_samples_per_second": 3.219, "eval_steps_per_second": 0.402, "step": 1500 }, { "epoch": 0.39518450667364563, "grad_norm": 1.3125, "learning_rate": 3.7873993652552073e-07, "logits/chosen": -2.8283066749572754, "logits/rejected": -2.811255693435669, "logps/chosen": -247.559814453125, "logps/rejected": -242.0292205810547, "loss": 0.6857, "rewards/accuracies": 0.53125, "rewards/chosen": 0.015011796727776527, "rewards/margins": 0.016142752021551132, "rewards/rejected": -0.0011309570400044322, "step": 1510 }, { "epoch": 0.39780162261188173, "grad_norm": 1.34375, "learning_rate": 3.767763149531995e-07, "logits/chosen": -2.8359994888305664, "logits/rejected": -2.819225311279297, "logps/chosen": -277.1551208496094, "logps/rejected": -260.8001708984375, "loss": 0.6772, "rewards/accuracies": 0.75, "rewards/chosen": 0.02346893586218357, "rewards/margins": 0.03324516490101814, "rewards/rejected": -0.009776233695447445, "step": 1520 }, { "epoch": 0.4004187385501178, "grad_norm": 1.5546875, "learning_rate": 3.7480210759506326e-07, "logits/chosen": -2.808189868927002, "logits/rejected": -2.8010077476501465, "logps/chosen": -292.74847412109375, "logps/rejected": -281.989013671875, "loss": 0.6814, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.02831420861184597, "rewards/margins": 0.025223467499017715, "rewards/rejected": 0.003090745070949197, "step": 1530 }, { "epoch": 0.40303585448835383, "grad_norm": 1.234375, "learning_rate": 3.728174792968582e-07, "logits/chosen": -2.8126258850097656, "logits/rejected": -2.782135248184204, "logps/chosen": -253.31851196289062, "logps/rejected": -239.48095703125, "loss": 0.6811, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.016257187351584435, "rewards/margins": 0.02541721798479557, "rewards/rejected": -0.00916003156453371, "step": 1540 }, { "epoch": 0.4056529704265899, "grad_norm": 1.3828125, "learning_rate": 3.70822595774476e-07, "logits/chosen": -2.8408150672912598, "logits/rejected": -2.8108041286468506, "logps/chosen": -285.0970153808594, "logps/rejected": -271.54425048828125, "loss": 0.6743, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.029356488958001137, "rewards/margins": 0.03932540863752365, "rewards/rejected": -0.009968922473490238, "step": 1550 }, { "epoch": 0.408270086364826, "grad_norm": 1.4609375, "learning_rate": 3.688176236001168e-07, "logits/chosen": -2.837639331817627, "logits/rejected": -2.7985403537750244, "logps/chosen": -294.97491455078125, "logps/rejected": -259.5853271484375, "loss": 0.6779, "rewards/accuracies": 0.6875, "rewards/chosen": 0.028945360332727432, "rewards/margins": 0.03233477473258972, "rewards/rejected": -0.0033894157968461514, "step": 1560 }, { "epoch": 0.410887202303062, "grad_norm": 1.34375, "learning_rate": 3.6680273018838016e-07, "logits/chosen": -2.847684144973755, "logits/rejected": -2.8297903537750244, "logps/chosen": -267.1182556152344, "logps/rejected": -251.4414825439453, "loss": 0.6751, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.0260789655148983, "rewards/margins": 0.03772038221359253, "rewards/rejected": -0.011641415767371655, "step": 1570 }, { "epoch": 0.4135043182412981, "grad_norm": 1.265625, "learning_rate": 3.6477808378228596e-07, "logits/chosen": -2.8226513862609863, "logits/rejected": -2.8190042972564697, "logps/chosen": -268.88055419921875, "logps/rejected": -301.400146484375, "loss": 0.678, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.022145092487335205, "rewards/margins": 0.03182849660515785, "rewards/rejected": -0.009683402255177498, "step": 1580 }, { "epoch": 0.4161214341795342, "grad_norm": 1.140625, "learning_rate": 3.6274385343922674e-07, "logits/chosen": -2.8877930641174316, "logits/rejected": -2.8846001625061035, "logps/chosen": -250.56411743164062, "logps/rejected": -259.74505615234375, "loss": 0.6826, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.015636231750249863, "rewards/margins": 0.0221557579934597, "rewards/rejected": -0.006519525311887264, "step": 1590 }, { "epoch": 0.4187385501177702, "grad_norm": 1.140625, "learning_rate": 3.6070020901685057e-07, "logits/chosen": -2.8079447746276855, "logits/rejected": -2.812084197998047, "logps/chosen": -280.13079833984375, "logps/rejected": -259.9898681640625, "loss": 0.6801, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.02125183865427971, "rewards/margins": 0.027416234835982323, "rewards/rejected": -0.00616439338773489, "step": 1600 }, { "epoch": 0.4187385501177702, "eval_logits/chosen": -2.8515522480010986, "eval_logits/rejected": -2.824509382247925, "eval_logps/chosen": -280.82861328125, "eval_logps/rejected": -262.506591796875, "eval_loss": 0.6788100600242615, "eval_rewards/accuracies": 0.684499979019165, "eval_rewards/chosen": 0.01944848708808422, "eval_rewards/margins": 0.030131228268146515, "eval_rewards/rejected": -0.010682739317417145, "eval_runtime": 623.5252, "eval_samples_per_second": 3.208, "eval_steps_per_second": 0.401, "step": 1600 }, { "epoch": 0.4213556660560063, "grad_norm": 1.3046875, "learning_rate": 3.5864732115887863e-07, "logits/chosen": -2.8428633213043213, "logits/rejected": -2.8304831981658936, "logps/chosen": -258.84423828125, "logps/rejected": -267.82415771484375, "loss": 0.6772, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.021647287532687187, "rewards/margins": 0.03337367996573448, "rewards/rejected": -0.011726390570402145, "step": 1610 }, { "epoch": 0.4239727819942423, "grad_norm": 1.515625, "learning_rate": 3.565853612808562e-07, "logits/chosen": -2.8622894287109375, "logits/rejected": -2.8286705017089844, "logps/chosen": -278.792724609375, "logps/rejected": -251.3448028564453, "loss": 0.6813, "rewards/accuracies": 0.625, "rewards/chosen": 0.011435525491833687, "rewards/margins": 0.02519642934203148, "rewards/rejected": -0.013760904781520367, "step": 1620 }, { "epoch": 0.4265898979324784, "grad_norm": 1.234375, "learning_rate": 3.5451450155583984e-07, "logits/chosen": -2.776291608810425, "logits/rejected": -2.8099074363708496, "logps/chosen": -247.71432495117188, "logps/rejected": -233.94900512695312, "loss": 0.6799, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.01177397184073925, "rewards/margins": 0.027832742780447006, "rewards/rejected": -0.016058770939707756, "step": 1630 }, { "epoch": 0.42920701387071447, "grad_norm": 1.6171875, "learning_rate": 3.5243491490002055e-07, "logits/chosen": -2.8553478717803955, "logits/rejected": -2.8476006984710693, "logps/chosen": -271.2582702636719, "logps/rejected": -265.4175720214844, "loss": 0.681, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.014447445049881935, "rewards/margins": 0.026055917143821716, "rewards/rejected": -0.011608473025262356, "step": 1640 }, { "epoch": 0.4318241298089505, "grad_norm": 1.3515625, "learning_rate": 3.503467749582857e-07, "logits/chosen": -2.834573984146118, "logits/rejected": -2.7920243740081787, "logps/chosen": -269.9908142089844, "logps/rejected": -235.1788787841797, "loss": 0.6829, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.01244247704744339, "rewards/margins": 0.021913422271609306, "rewards/rejected": -0.009470945224165916, "step": 1650 }, { "epoch": 0.4344412457471866, "grad_norm": 1.15625, "learning_rate": 3.482502560897194e-07, "logits/chosen": -2.8074707984924316, "logits/rejected": -2.7939584255218506, "logps/chosen": -236.347412109375, "logps/rejected": -241.1566925048828, "loss": 0.684, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.011268834583461285, "rewards/margins": 0.019097527489066124, "rewards/rejected": -0.007828695699572563, "step": 1660 }, { "epoch": 0.43705836168542267, "grad_norm": 1.5546875, "learning_rate": 3.4614553335304403e-07, "logits/chosen": -2.8423566818237305, "logits/rejected": -2.7879586219787598, "logps/chosen": -288.4642639160156, "logps/rejected": -253.7875213623047, "loss": 0.6767, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.02120782807469368, "rewards/margins": 0.034634821116924286, "rewards/rejected": -0.013426998630166054, "step": 1670 }, { "epoch": 0.4396754776236587, "grad_norm": 1.75, "learning_rate": 3.440327824920022e-07, "logits/chosen": -2.831993579864502, "logits/rejected": -2.8056693077087402, "logps/chosen": -299.0148010253906, "logps/rejected": -260.6773681640625, "loss": 0.6741, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.026383286342024803, "rewards/margins": 0.03987019881606102, "rewards/rejected": -0.013486906886100769, "step": 1680 }, { "epoch": 0.44229259356189476, "grad_norm": 1.671875, "learning_rate": 3.4191217992068287e-07, "logits/chosen": -2.870518207550049, "logits/rejected": -2.841670513153076, "logps/chosen": -292.0727844238281, "logps/rejected": -247.9720916748047, "loss": 0.678, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.026544999331235886, "rewards/margins": 0.03184535354375839, "rewards/rejected": -0.005300348624587059, "step": 1690 }, { "epoch": 0.44490970950013087, "grad_norm": 1.3828125, "learning_rate": 3.3978390270879056e-07, "logits/chosen": -2.8237035274505615, "logits/rejected": -2.810272455215454, "logps/chosen": -227.2367706298828, "logps/rejected": -232.7034454345703, "loss": 0.6839, "rewards/accuracies": 0.625, "rewards/chosen": 0.008710166439414024, "rewards/margins": 0.01971607096493244, "rewards/rejected": -0.011005903594195843, "step": 1700 }, { "epoch": 0.44490970950013087, "eval_logits/chosen": -2.853027105331421, "eval_logits/rejected": -2.8261446952819824, "eval_logps/chosen": -280.7289123535156, "eval_logps/rejected": -262.4769592285156, "eval_loss": 0.6784868240356445, "eval_rewards/accuracies": 0.6855000257492065, "eval_rewards/chosen": 0.02044598199427128, "eval_rewards/margins": 0.030832206830382347, "eval_rewards/rejected": -0.010386227630078793, "eval_runtime": 622.0841, "eval_samples_per_second": 3.215, "eval_steps_per_second": 0.402, "step": 1700 }, { "epoch": 0.4475268254383669, "grad_norm": 1.2734375, "learning_rate": 3.376481285668599e-07, "logits/chosen": -2.8446857929229736, "logits/rejected": -2.8485488891601562, "logps/chosen": -237.1801300048828, "logps/rejected": -253.5928192138672, "loss": 0.6814, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.013800742104649544, "rewards/margins": 0.024945253506302834, "rewards/rejected": -0.01114450953900814, "step": 1710 }, { "epoch": 0.45014394137660296, "grad_norm": 1.3046875, "learning_rate": 3.355050358314172e-07, "logits/chosen": -2.874572515487671, "logits/rejected": -2.8544344902038574, "logps/chosen": -282.2732238769531, "logps/rejected": -267.3336181640625, "loss": 0.6765, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.026168251410126686, "rewards/margins": 0.03479185700416565, "rewards/rejected": -0.008623604662716389, "step": 1720 }, { "epoch": 0.45276105731483907, "grad_norm": 1.3515625, "learning_rate": 3.33354803450089e-07, "logits/chosen": -2.7801265716552734, "logits/rejected": -2.7793445587158203, "logps/chosen": -282.25518798828125, "logps/rejected": -262.77978515625, "loss": 0.6811, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.016791898757219315, "rewards/margins": 0.025678789243102074, "rewards/rejected": -0.008886890485882759, "step": 1730 }, { "epoch": 0.4553781732530751, "grad_norm": 1.1953125, "learning_rate": 3.311976109666605e-07, "logits/chosen": -2.8067824840545654, "logits/rejected": -2.7824745178222656, "logps/chosen": -292.15130615234375, "logps/rejected": -263.1966247558594, "loss": 0.6782, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.027290191501379013, "rewards/margins": 0.03162240982055664, "rewards/rejected": -0.004332221113145351, "step": 1740 }, { "epoch": 0.45799528919131116, "grad_norm": 1.3125, "learning_rate": 3.2903363850608317e-07, "logits/chosen": -2.899350166320801, "logits/rejected": -2.855714797973633, "logps/chosen": -263.13433837890625, "logps/rejected": -244.21298217773438, "loss": 0.6777, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.012938638217747211, "rewards/margins": 0.03214184567332268, "rewards/rejected": -0.01920320652425289, "step": 1750 }, { "epoch": 0.46061240512954726, "grad_norm": 1.25, "learning_rate": 3.2686306675943477e-07, "logits/chosen": -2.8296382427215576, "logits/rejected": -2.8444466590881348, "logps/chosen": -271.3722839355469, "logps/rejected": -247.5081787109375, "loss": 0.6782, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.02265056222677231, "rewards/margins": 0.031140562146902084, "rewards/rejected": -0.008490001782774925, "step": 1760 }, { "epoch": 0.4632295210677833, "grad_norm": 1.5, "learning_rate": 3.2468607696883145e-07, "logits/chosen": -2.8013827800750732, "logits/rejected": -2.793203592300415, "logps/chosen": -266.6105651855469, "logps/rejected": -276.02911376953125, "loss": 0.6764, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.01664569415152073, "rewards/margins": 0.034995269030332565, "rewards/rejected": -0.018349576741456985, "step": 1770 }, { "epoch": 0.46584663700601936, "grad_norm": 1.2421875, "learning_rate": 3.2250285091229435e-07, "logits/chosen": -2.863778591156006, "logits/rejected": -2.8398165702819824, "logps/chosen": -248.42300415039062, "logps/rejected": -239.9966583251953, "loss": 0.682, "rewards/accuracies": 0.65625, "rewards/chosen": 0.011377891525626183, "rewards/margins": 0.02343577891588211, "rewards/rejected": -0.012057888321578503, "step": 1780 }, { "epoch": 0.4684637529442554, "grad_norm": 7.71875, "learning_rate": 3.2031357088857083e-07, "logits/chosen": -2.851457118988037, "logits/rejected": -2.8426060676574707, "logps/chosen": -291.23236083984375, "logps/rejected": -300.08843994140625, "loss": 0.6808, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.015203478746116161, "rewards/margins": 0.02637268602848053, "rewards/rejected": -0.011169209145009518, "step": 1790 }, { "epoch": 0.4710808688824915, "grad_norm": 1.4375, "learning_rate": 3.1811841970191267e-07, "logits/chosen": -2.7736434936523438, "logits/rejected": -2.7497920989990234, "logps/chosen": -245.38912963867188, "logps/rejected": -276.73297119140625, "loss": 0.6793, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.009697729721665382, "rewards/margins": 0.029323875904083252, "rewards/rejected": -0.01962614618241787, "step": 1800 }, { "epoch": 0.4710808688824915, "eval_logits/chosen": -2.851853132247925, "eval_logits/rejected": -2.8248438835144043, "eval_logps/chosen": -280.89361572265625, "eval_logps/rejected": -262.69610595703125, "eval_loss": 0.6782403588294983, "eval_rewards/accuracies": 0.6869999766349792, "eval_rewards/chosen": 0.018798967823386192, "eval_rewards/margins": 0.03137620911002159, "eval_rewards/rejected": -0.012577244080603123, "eval_runtime": 622.7177, "eval_samples_per_second": 3.212, "eval_steps_per_second": 0.401, "step": 1800 }, { "epoch": 0.47369798482072756, "grad_norm": 1.28125, "learning_rate": 3.1591758064681257e-07, "logits/chosen": -2.779759407043457, "logits/rejected": -2.7464611530303955, "logps/chosen": -269.7041015625, "logps/rejected": -234.5542755126953, "loss": 0.6773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.017099203541874886, "rewards/margins": 0.03334728628396988, "rewards/rejected": -0.01624808833003044, "step": 1810 }, { "epoch": 0.4763151007589636, "grad_norm": 1.484375, "learning_rate": 3.13711237492698e-07, "logits/chosen": -2.8304784297943115, "logits/rejected": -2.8185439109802246, "logps/chosen": -296.6722717285156, "logps/rejected": -284.0693664550781, "loss": 0.6829, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.016499513760209084, "rewards/margins": 0.02195136621594429, "rewards/rejected": -0.005451851524412632, "step": 1820 }, { "epoch": 0.4789322166971997, "grad_norm": 1.046875, "learning_rate": 3.1149957446858767e-07, "logits/chosen": -2.82464599609375, "logits/rejected": -2.8389458656311035, "logps/chosen": -263.2919006347656, "logps/rejected": -250.74813842773438, "loss": 0.6841, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.0153631791472435, "rewards/margins": 0.019609825685620308, "rewards/rejected": -0.004246644675731659, "step": 1830 }, { "epoch": 0.48154933263543576, "grad_norm": 1.4609375, "learning_rate": 3.0928277624770736e-07, "logits/chosen": -2.8787219524383545, "logits/rejected": -2.8530170917510986, "logps/chosen": -300.767333984375, "logps/rejected": -275.71856689453125, "loss": 0.6736, "rewards/accuracies": 0.75, "rewards/chosen": 0.02286524511873722, "rewards/margins": 0.041333895176649094, "rewards/rejected": -0.018468648195266724, "step": 1840 }, { "epoch": 0.4841664485736718, "grad_norm": 1.3203125, "learning_rate": 3.0706102793207073e-07, "logits/chosen": -2.8641512393951416, "logits/rejected": -2.832724094390869, "logps/chosen": -301.1673278808594, "logps/rejected": -282.6499938964844, "loss": 0.6723, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.030217334628105164, "rewards/margins": 0.043759047985076904, "rewards/rejected": -0.01354171335697174, "step": 1850 }, { "epoch": 0.48678356451190785, "grad_norm": 1.28125, "learning_rate": 3.048345150370226e-07, "logits/chosen": -2.8586244583129883, "logits/rejected": -2.8518083095550537, "logps/chosen": -300.2522888183594, "logps/rejected": -283.9441223144531, "loss": 0.6775, "rewards/accuracies": 0.71875, "rewards/chosen": 0.020278874784708023, "rewards/margins": 0.03307543322443962, "rewards/rejected": -0.012796561233699322, "step": 1860 }, { "epoch": 0.48940068045014395, "grad_norm": 1.1171875, "learning_rate": 3.0260342347574913e-07, "logits/chosen": -2.8430685997009277, "logits/rejected": -2.8185229301452637, "logps/chosen": -285.382568359375, "logps/rejected": -269.14886474609375, "loss": 0.6767, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.026956235989928246, "rewards/margins": 0.03441258519887924, "rewards/rejected": -0.007456351071596146, "step": 1870 }, { "epoch": 0.49201779638838, "grad_norm": 1.21875, "learning_rate": 3.0036793954375357e-07, "logits/chosen": -2.8681893348693848, "logits/rejected": -2.8438541889190674, "logps/chosen": -283.40582275390625, "logps/rejected": -243.862060546875, "loss": 0.6752, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.026450032368302345, "rewards/margins": 0.037637047469615936, "rewards/rejected": -0.01118701882660389, "step": 1880 }, { "epoch": 0.49463491232661605, "grad_norm": 1.375, "learning_rate": 2.9812824990330085e-07, "logits/chosen": -2.837024211883545, "logits/rejected": -2.825876235961914, "logps/chosen": -290.41644287109375, "logps/rejected": -267.0772705078125, "loss": 0.6775, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.017299989238381386, "rewards/margins": 0.03299538046121597, "rewards/rejected": -0.015695389360189438, "step": 1890 }, { "epoch": 0.49725202826485215, "grad_norm": 1.421875, "learning_rate": 2.958845415678316e-07, "logits/chosen": -2.8465914726257324, "logits/rejected": -2.8124914169311523, "logps/chosen": -293.70098876953125, "logps/rejected": -275.6326904296875, "loss": 0.6766, "rewards/accuracies": 0.6875, "rewards/chosen": 0.020625559613108635, "rewards/margins": 0.034684114158153534, "rewards/rejected": -0.01405855268239975, "step": 1900 }, { "epoch": 0.49725202826485215, "eval_logits/chosen": -2.854793071746826, "eval_logits/rejected": -2.8281238079071045, "eval_logps/chosen": -280.89208984375, "eval_logps/rejected": -262.73114013671875, "eval_loss": 0.6780784726142883, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": 0.01881374977529049, "eval_rewards/margins": 0.03174133226275444, "eval_rewards/rejected": -0.012927580624818802, "eval_runtime": 623.0648, "eval_samples_per_second": 3.21, "eval_steps_per_second": 0.401, "step": 1900 }, { "epoch": 0.4998691442030882, "grad_norm": 1.0625, "learning_rate": 2.936370018863459e-07, "logits/chosen": -2.86594295501709, "logits/rejected": -2.852074384689331, "logps/chosen": -278.3857727050781, "logps/rejected": -242.82290649414062, "loss": 0.6795, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.016295427456498146, "rewards/margins": 0.028570901602506638, "rewards/rejected": -0.012275472283363342, "step": 1910 }, { "epoch": 0.5024862601413242, "grad_norm": 1.3125, "learning_rate": 2.913858185277605e-07, "logits/chosen": -2.8365845680236816, "logits/rejected": -2.8241991996765137, "logps/chosen": -274.61334228515625, "logps/rejected": -262.88934326171875, "loss": 0.6764, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.025087693706154823, "rewards/margins": 0.03509819880127907, "rewards/rejected": -0.010010505095124245, "step": 1920 }, { "epoch": 0.5051033760795604, "grad_norm": 1.3828125, "learning_rate": 2.89131179465238e-07, "logits/chosen": -2.802734375, "logits/rejected": -2.7592692375183105, "logps/chosen": -286.10986328125, "logps/rejected": -250.1353759765625, "loss": 0.6755, "rewards/accuracies": 0.6875, "rewards/chosen": 0.019024692475795746, "rewards/margins": 0.03733197599649429, "rewards/rejected": -0.018307287245988846, "step": 1930 }, { "epoch": 0.5077204920177963, "grad_norm": 1.2265625, "learning_rate": 2.8687327296049125e-07, "logits/chosen": -2.841648578643799, "logits/rejected": -2.817791700363159, "logps/chosen": -272.06378173828125, "logps/rejected": -273.47930908203125, "loss": 0.6797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0162358395755291, "rewards/margins": 0.02848033234477043, "rewards/rejected": -0.012244494631886482, "step": 1940 }, { "epoch": 0.5103376079560324, "grad_norm": 1.21875, "learning_rate": 2.846122875480637e-07, "logits/chosen": -2.8606886863708496, "logits/rejected": -2.822801113128662, "logps/chosen": -288.4855651855469, "logps/rejected": -264.97247314453125, "loss": 0.6769, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.024403361603617668, "rewards/margins": 0.03406291827559471, "rewards/rejected": -0.009659556671977043, "step": 1950 }, { "epoch": 0.5129547238942685, "grad_norm": 1.1484375, "learning_rate": 2.8234841201958647e-07, "logits/chosen": -2.8555562496185303, "logits/rejected": -2.819885730743408, "logps/chosen": -297.9819641113281, "logps/rejected": -261.5909423828125, "loss": 0.6754, "rewards/accuracies": 0.71875, "rewards/chosen": 0.02403775416314602, "rewards/margins": 0.03708943352103233, "rewards/rejected": -0.013051679357886314, "step": 1960 }, { "epoch": 0.5155718398325045, "grad_norm": 1.4375, "learning_rate": 2.800818354080148e-07, "logits/chosen": -2.83642840385437, "logits/rejected": -2.805063486099243, "logps/chosen": -287.24420166015625, "logps/rejected": -243.9567413330078, "loss": 0.6775, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.021363835781812668, "rewards/margins": 0.032995063811540604, "rewards/rejected": -0.011631224304437637, "step": 1970 }, { "epoch": 0.5181889557707406, "grad_norm": 1.265625, "learning_rate": 2.778127469718435e-07, "logits/chosen": -2.7859811782836914, "logits/rejected": -2.7979307174682617, "logps/chosen": -245.3804168701172, "logps/rejected": -266.1874694824219, "loss": 0.6803, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.015351531095802784, "rewards/margins": 0.02699609100818634, "rewards/rejected": -0.011644558981060982, "step": 1980 }, { "epoch": 0.5208060717089767, "grad_norm": 1.3515625, "learning_rate": 2.755413361793039e-07, "logits/chosen": -2.8089661598205566, "logits/rejected": -2.779783010482788, "logps/chosen": -262.85589599609375, "logps/rejected": -253.60653686523438, "loss": 0.6757, "rewards/accuracies": 0.6875, "rewards/chosen": 0.026462215930223465, "rewards/margins": 0.03666644170880318, "rewards/rejected": -0.010204223915934563, "step": 1990 }, { "epoch": 0.5234231876472127, "grad_norm": 1.3125, "learning_rate": 2.7326779269254356e-07, "logits/chosen": -2.8754069805145264, "logits/rejected": -2.8527140617370605, "logps/chosen": -303.8473205566406, "logps/rejected": -247.6015625, "loss": 0.6762, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.026804108172655106, "rewards/margins": 0.03586486726999283, "rewards/rejected": -0.009060760028660297, "step": 2000 }, { "epoch": 0.5234231876472127, "eval_logits/chosen": -2.853806972503662, "eval_logits/rejected": -2.8270227909088135, "eval_logps/chosen": -280.87493896484375, "eval_logps/rejected": -262.7651062011719, "eval_loss": 0.6778436303138733, "eval_rewards/accuracies": 0.6840000152587891, "eval_rewards/chosen": 0.01898558810353279, "eval_rewards/margins": 0.03225287050008774, "eval_rewards/rejected": -0.013267277739942074, "eval_runtime": 622.6334, "eval_samples_per_second": 3.212, "eval_steps_per_second": 0.402, "step": 2000 }, { "epoch": 0.5260403035854488, "grad_norm": 1.4296875, "learning_rate": 2.709923063517895e-07, "logits/chosen": -2.8158721923828125, "logits/rejected": -2.8292183876037598, "logps/chosen": -277.1296691894531, "logps/rejected": -276.7500915527344, "loss": 0.6757, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.020963847637176514, "rewards/margins": 0.03652495518326759, "rewards/rejected": -0.015561106614768505, "step": 2010 }, { "epoch": 0.528657419523685, "grad_norm": 1.5390625, "learning_rate": 2.68715067159496e-07, "logits/chosen": -2.8547072410583496, "logits/rejected": -2.8296151161193848, "logps/chosen": -266.697265625, "logps/rejected": -248.7971649169922, "loss": 0.6773, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.023486215621232986, "rewards/margins": 0.0331868901848793, "rewards/rejected": -0.009700671769678593, "step": 2020 }, { "epoch": 0.5312745354619209, "grad_norm": 1.5625, "learning_rate": 2.664362652644806e-07, "logits/chosen": -2.871127128601074, "logits/rejected": -2.859767198562622, "logps/chosen": -309.3524475097656, "logps/rejected": -267.5863342285156, "loss": 0.6745, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.03075565956532955, "rewards/margins": 0.039490751922130585, "rewards/rejected": -0.008735088631510735, "step": 2030 }, { "epoch": 0.533891651400157, "grad_norm": 1.2421875, "learning_rate": 2.6415609094604555e-07, "logits/chosen": -2.8490989208221436, "logits/rejected": -2.849595308303833, "logps/chosen": -284.77679443359375, "logps/rejected": -266.975830078125, "loss": 0.6771, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.029801562428474426, "rewards/margins": 0.034142639487981796, "rewards/rejected": -0.004341077525168657, "step": 2040 }, { "epoch": 0.5365087673383931, "grad_norm": 1.1875, "learning_rate": 2.618747345980904e-07, "logits/chosen": -2.85640287399292, "logits/rejected": -2.8106446266174316, "logps/chosen": -262.911376953125, "logps/rejected": -212.4907989501953, "loss": 0.6744, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0176672525703907, "rewards/margins": 0.039339274168014526, "rewards/rejected": -0.021672027185559273, "step": 2050 }, { "epoch": 0.5391258832766291, "grad_norm": 1.421875, "learning_rate": 2.595923867132136e-07, "logits/chosen": -2.8832926750183105, "logits/rejected": -2.872882604598999, "logps/chosen": -296.1038818359375, "logps/rejected": -274.6143493652344, "loss": 0.6772, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.01775936782360077, "rewards/margins": 0.03422468900680542, "rewards/rejected": -0.0164653230458498, "step": 2060 }, { "epoch": 0.5417429992148652, "grad_norm": 1.3359375, "learning_rate": 2.5730923786680667e-07, "logits/chosen": -2.860802173614502, "logits/rejected": -2.862802028656006, "logps/chosen": -264.1892395019531, "logps/rejected": -275.02532958984375, "loss": 0.6787, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.016188761219382286, "rewards/margins": 0.030575359240174294, "rewards/rejected": -0.014386599883437157, "step": 2070 }, { "epoch": 0.5443601151531012, "grad_norm": 1.2890625, "learning_rate": 2.5502547870114135e-07, "logits/chosen": -2.8470053672790527, "logits/rejected": -2.8130552768707275, "logps/chosen": -269.6037902832031, "logps/rejected": -240.1824493408203, "loss": 0.6779, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01747250370681286, "rewards/margins": 0.03254149109125137, "rewards/rejected": -0.015068987384438515, "step": 2080 }, { "epoch": 0.5469772310913373, "grad_norm": 3.09375, "learning_rate": 2.527412999094506e-07, "logits/chosen": -2.8123087882995605, "logits/rejected": -2.7862212657928467, "logps/chosen": -315.90447998046875, "logps/rejected": -302.16552734375, "loss": 0.6781, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.022080183029174805, "rewards/margins": 0.031787317246198654, "rewards/rejected": -0.009707136079668999, "step": 2090 }, { "epoch": 0.5495943470295734, "grad_norm": 1.828125, "learning_rate": 2.5045689222000636e-07, "logits/chosen": -2.798381805419922, "logits/rejected": -2.7811503410339355, "logps/chosen": -256.8103332519531, "logps/rejected": -242.29971313476562, "loss": 0.6796, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.01742711290717125, "rewards/margins": 0.02845141850411892, "rewards/rejected": -0.011024304665625095, "step": 2100 }, { "epoch": 0.5495943470295734, "eval_logits/chosen": -2.856419324874878, "eval_logits/rejected": -2.8298940658569336, "eval_logps/chosen": -280.9320983886719, "eval_logps/rejected": -262.8512878417969, "eval_loss": 0.6777089834213257, "eval_rewards/accuracies": 0.6794999837875366, "eval_rewards/chosen": 0.018413949757814407, "eval_rewards/margins": 0.03254299610853195, "eval_rewards/rejected": -0.01412904355674982, "eval_runtime": 623.6848, "eval_samples_per_second": 3.207, "eval_steps_per_second": 0.401, "step": 2100 }, { "epoch": 0.5522114629678094, "grad_norm": 1.2734375, "learning_rate": 2.481724463801933e-07, "logits/chosen": -2.837977170944214, "logits/rejected": -2.8143982887268066, "logps/chosen": -293.23687744140625, "logps/rejected": -254.9249725341797, "loss": 0.6749, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.01989181898534298, "rewards/margins": 0.03839176893234253, "rewards/rejected": -0.01849994622170925, "step": 2110 }, { "epoch": 0.5548285789060455, "grad_norm": 1.3984375, "learning_rate": 2.4588815314058154e-07, "logits/chosen": -2.828207492828369, "logits/rejected": -2.825892448425293, "logps/chosen": -257.57012939453125, "logps/rejected": -226.6698455810547, "loss": 0.677, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.021512161940336227, "rewards/margins": 0.03372306749224663, "rewards/rejected": -0.01221090741455555, "step": 2120 }, { "epoch": 0.5574456948442816, "grad_norm": 1.2265625, "learning_rate": 2.4360420323899917e-07, "logits/chosen": -2.8333821296691895, "logits/rejected": -2.8204522132873535, "logps/chosen": -294.45135498046875, "logps/rejected": -261.3866271972656, "loss": 0.6785, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.021041734144091606, "rewards/margins": 0.031500063836574554, "rewards/rejected": -0.01045832596719265, "step": 2130 }, { "epoch": 0.5600628107825176, "grad_norm": 1.2578125, "learning_rate": 2.4132078738460583e-07, "logits/chosen": -2.8641979694366455, "logits/rejected": -2.8372373580932617, "logps/chosen": -277.0169677734375, "logps/rejected": -240.1663055419922, "loss": 0.6765, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.020255176350474358, "rewards/margins": 0.03485842049121857, "rewards/rejected": -0.014603245072066784, "step": 2140 }, { "epoch": 0.5626799267207537, "grad_norm": 1.3984375, "learning_rate": 2.390380962419682e-07, "logits/chosen": -2.8402717113494873, "logits/rejected": -2.82948899269104, "logps/chosen": -248.21908569335938, "logps/rejected": -215.9970703125, "loss": 0.6813, "rewards/accuracies": 0.625, "rewards/chosen": 0.013856288976967335, "rewards/margins": 0.02503989078104496, "rewards/rejected": -0.011183603666722775, "step": 2150 }, { "epoch": 0.5652970426589898, "grad_norm": 1.203125, "learning_rate": 2.3675632041513977e-07, "logits/chosen": -2.8817086219787598, "logits/rejected": -2.830371856689453, "logps/chosen": -299.80419921875, "logps/rejected": -237.6551055908203, "loss": 0.6704, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.029662128537893295, "rewards/margins": 0.0477459654211998, "rewards/rejected": -0.018083838745951653, "step": 2160 }, { "epoch": 0.5679141585972258, "grad_norm": 1.1953125, "learning_rate": 2.344756504317453e-07, "logits/chosen": -2.8310768604278564, "logits/rejected": -2.7922732830047607, "logps/chosen": -273.2166442871094, "logps/rejected": -238.2533721923828, "loss": 0.6782, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.013544699177145958, "rewards/margins": 0.031235402449965477, "rewards/rejected": -0.01769069954752922, "step": 2170 }, { "epoch": 0.5705312745354619, "grad_norm": 1.3203125, "learning_rate": 2.3219627672707237e-07, "logits/chosen": -2.814478874206543, "logits/rejected": -2.807798147201538, "logps/chosen": -271.345703125, "logps/rejected": -229.5119171142578, "loss": 0.6808, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.008285674266517162, "rewards/margins": 0.025873666629195213, "rewards/rejected": -0.017587993294000626, "step": 2180 }, { "epoch": 0.573148390473698, "grad_norm": 1.359375, "learning_rate": 2.2991838962816918e-07, "logits/chosen": -2.812224864959717, "logits/rejected": -2.787701368331909, "logps/chosen": -268.98724365234375, "logps/rejected": -268.3625793457031, "loss": 0.681, "rewards/accuracies": 0.65625, "rewards/chosen": 0.01591324433684349, "rewards/margins": 0.025850754231214523, "rewards/rejected": -0.009937510825693607, "step": 2190 }, { "epoch": 0.575765506411934, "grad_norm": 1.4453125, "learning_rate": 2.2764217933795297e-07, "logits/chosen": -2.8286900520324707, "logits/rejected": -2.8099260330200195, "logps/chosen": -274.784912109375, "logps/rejected": -258.30084228515625, "loss": 0.6736, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.02573958970606327, "rewards/margins": 0.04115080088376999, "rewards/rejected": -0.015411211177706718, "step": 2200 }, { "epoch": 0.575765506411934, "eval_logits/chosen": -2.857100486755371, "eval_logits/rejected": -2.8306167125701904, "eval_logps/chosen": -280.9635314941406, "eval_logps/rejected": -262.8892822265625, "eval_loss": 0.6776819825172424, "eval_rewards/accuracies": 0.6825000047683716, "eval_rewards/chosen": 0.018099820241332054, "eval_rewards/margins": 0.03260912373661995, "eval_rewards/rejected": -0.014509301632642746, "eval_runtime": 623.717, "eval_samples_per_second": 3.207, "eval_steps_per_second": 0.401, "step": 2200 }, { "epoch": 0.5783826223501701, "grad_norm": 1.6171875, "learning_rate": 2.253678359193278e-07, "logits/chosen": -2.901681423187256, "logits/rejected": -2.858135223388672, "logps/chosen": -292.3751525878906, "logps/rejected": -273.216064453125, "loss": 0.6782, "rewards/accuracies": 0.65625, "rewards/chosen": 0.017452511936426163, "rewards/margins": 0.0317564532160759, "rewards/rejected": -0.014303937554359436, "step": 2210 }, { "epoch": 0.5809997382884062, "grad_norm": 1.2421875, "learning_rate": 2.230955492793149e-07, "logits/chosen": -2.785632371902466, "logits/rejected": -2.791489362716675, "logps/chosen": -290.7291259765625, "logps/rejected": -277.0824890136719, "loss": 0.6816, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.015981845557689667, "rewards/margins": 0.02465016394853592, "rewards/rejected": -0.008668316528201103, "step": 2220 }, { "epoch": 0.5836168542266422, "grad_norm": 1.5, "learning_rate": 2.2082550915319468e-07, "logits/chosen": -2.797928810119629, "logits/rejected": -2.795860767364502, "logps/chosen": -292.92291259765625, "logps/rejected": -257.6455078125, "loss": 0.6766, "rewards/accuracies": 0.71875, "rewards/chosen": 0.02159210667014122, "rewards/margins": 0.03476356342434883, "rewards/rejected": -0.013171456754207611, "step": 2230 }, { "epoch": 0.5862339701648783, "grad_norm": 1.3203125, "learning_rate": 2.1855790508866433e-07, "logits/chosen": -2.8182015419006348, "logits/rejected": -2.815925121307373, "logps/chosen": -324.13592529296875, "logps/rejected": -299.08135986328125, "loss": 0.6768, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02183537557721138, "rewards/margins": 0.034798912703990936, "rewards/rejected": -0.012963538058102131, "step": 2240 }, { "epoch": 0.5888510861031143, "grad_norm": 1.2421875, "learning_rate": 2.162929264300107e-07, "logits/chosen": -2.79923939704895, "logits/rejected": -2.7914280891418457, "logps/chosen": -282.1067810058594, "logps/rejected": -265.30364990234375, "loss": 0.6733, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.02532428503036499, "rewards/margins": 0.04165149852633476, "rewards/rejected": -0.016327213495969772, "step": 2250 }, { "epoch": 0.5914682020413504, "grad_norm": 1.390625, "learning_rate": 2.1403076230230005e-07, "logits/chosen": -2.816969394683838, "logits/rejected": -2.7890102863311768, "logps/chosen": -290.90618896484375, "logps/rejected": -261.3204345703125, "loss": 0.6801, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.02144182100892067, "rewards/margins": 0.028353065252304077, "rewards/rejected": -0.006911243312060833, "step": 2260 }, { "epoch": 0.5940853179795865, "grad_norm": 1.703125, "learning_rate": 2.1177160159558596e-07, "logits/chosen": -2.8060302734375, "logits/rejected": -2.788020610809326, "logps/chosen": -297.41741943359375, "logps/rejected": -247.78640747070312, "loss": 0.6745, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.02653617225587368, "rewards/margins": 0.03934457153081894, "rewards/rejected": -0.01280839741230011, "step": 2270 }, { "epoch": 0.5967024339178225, "grad_norm": 1.3203125, "learning_rate": 2.0951563294913734e-07, "logits/chosen": -2.8132269382476807, "logits/rejected": -2.7866339683532715, "logps/chosen": -277.17449951171875, "logps/rejected": -250.9734344482422, "loss": 0.6754, "rewards/accuracies": 0.71875, "rewards/chosen": 0.01868070289492607, "rewards/margins": 0.03689347952604294, "rewards/rejected": -0.018212776631116867, "step": 2280 }, { "epoch": 0.5993195498560586, "grad_norm": 1.1875, "learning_rate": 2.072630447356869e-07, "logits/chosen": -2.8403782844543457, "logits/rejected": -2.8337533473968506, "logps/chosen": -274.53546142578125, "logps/rejected": -242.531494140625, "loss": 0.6767, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.01636110618710518, "rewards/margins": 0.03431684896349907, "rewards/rejected": -0.01795574650168419, "step": 2290 }, { "epoch": 0.6019366657942947, "grad_norm": 1.5625, "learning_rate": 2.0501402504570232e-07, "logits/chosen": -2.87614107131958, "logits/rejected": -2.8186376094818115, "logps/chosen": -293.62347412109375, "logps/rejected": -262.20794677734375, "loss": 0.6779, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.020630482584238052, "rewards/margins": 0.032361775636672974, "rewards/rejected": -0.011731292121112347, "step": 2300 }, { "epoch": 0.6019366657942947, "eval_logits/chosen": -2.8548264503479004, "eval_logits/rejected": -2.8281185626983643, "eval_logps/chosen": -281.0184326171875, "eval_logps/rejected": -262.955810546875, "eval_loss": 0.6776320934295654, "eval_rewards/accuracies": 0.6875, "eval_rewards/chosen": 0.017550628632307053, "eval_rewards/margins": 0.032725006341934204, "eval_rewards/rejected": -0.015174377709627151, "eval_runtime": 623.5172, "eval_samples_per_second": 3.208, "eval_steps_per_second": 0.401, "step": 2300 }, { "epoch": 0.6045537817325307, "grad_norm": 1.3125, "learning_rate": 2.027687616716804e-07, "logits/chosen": -2.776857614517212, "logits/rejected": -2.766300916671753, "logps/chosen": -245.21249389648438, "logps/rejected": -210.9021453857422, "loss": 0.6796, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.012185106053948402, "rewards/margins": 0.028403136879205704, "rewards/rejected": -0.016218028962612152, "step": 2310 }, { "epoch": 0.6071708976707668, "grad_norm": 1.5859375, "learning_rate": 2.005274420924668e-07, "logits/chosen": -2.842299699783325, "logits/rejected": -2.8256301879882812, "logps/chosen": -268.92669677734375, "logps/rejected": -236.4410858154297, "loss": 0.6773, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.01880219765007496, "rewards/margins": 0.03351093456149101, "rewards/rejected": -0.014708739705383778, "step": 2320 }, { "epoch": 0.6097880136090029, "grad_norm": 1.8203125, "learning_rate": 1.9829025345760121e-07, "logits/chosen": -2.8297770023345947, "logits/rejected": -2.830937147140503, "logps/chosen": -295.0563049316406, "logps/rejected": -287.86016845703125, "loss": 0.6799, "rewards/accuracies": 0.625, "rewards/chosen": 0.02504206821322441, "rewards/margins": 0.02800445258617401, "rewards/rejected": -0.002962383907288313, "step": 2330 }, { "epoch": 0.6124051295472389, "grad_norm": 1.4453125, "learning_rate": 1.960573825716911e-07, "logits/chosen": -2.8119211196899414, "logits/rejected": -2.7910213470458984, "logps/chosen": -250.70425415039062, "logps/rejected": -246.34921264648438, "loss": 0.6817, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.005780251231044531, "rewards/margins": 0.024395998567342758, "rewards/rejected": -0.01861574873328209, "step": 2340 }, { "epoch": 0.615022245485475, "grad_norm": 1.4453125, "learning_rate": 1.9382901587881273e-07, "logits/chosen": -2.8759961128234863, "logits/rejected": -2.864570379257202, "logps/chosen": -273.0665588378906, "logps/rejected": -240.42294311523438, "loss": 0.6727, "rewards/accuracies": 0.6875, "rewards/chosen": 0.027859041467308998, "rewards/margins": 0.04289738088846207, "rewards/rejected": -0.015038339421153069, "step": 2350 }, { "epoch": 0.6176393614237111, "grad_norm": 1.53125, "learning_rate": 1.9160533944694364e-07, "logits/chosen": -2.870702028274536, "logits/rejected": -2.8234288692474365, "logps/chosen": -276.2862854003906, "logps/rejected": -267.0870666503906, "loss": 0.6738, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.025392061099410057, "rewards/margins": 0.04046647623181343, "rewards/rejected": -0.015074415132403374, "step": 2360 }, { "epoch": 0.6202564773619471, "grad_norm": 1.3671875, "learning_rate": 1.8938653895242602e-07, "logits/chosen": -2.861572027206421, "logits/rejected": -2.8287158012390137, "logps/chosen": -277.0164489746094, "logps/rejected": -251.5226593017578, "loss": 0.6715, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.02161785028874874, "rewards/margins": 0.045254360884428024, "rewards/rejected": -0.02363651618361473, "step": 2370 }, { "epoch": 0.6228735933001832, "grad_norm": 1.3515625, "learning_rate": 1.8717279966446264e-07, "logits/chosen": -2.7649269104003906, "logits/rejected": -2.748934268951416, "logps/chosen": -266.9613037109375, "logps/rejected": -256.705810546875, "loss": 0.6789, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.01746024750173092, "rewards/margins": 0.030255427584052086, "rewards/rejected": -0.012795181944966316, "step": 2380 }, { "epoch": 0.6254907092384192, "grad_norm": 1.3046875, "learning_rate": 1.8496430642964694e-07, "logits/chosen": -2.8276329040527344, "logits/rejected": -2.8031527996063232, "logps/chosen": -289.2098083496094, "logps/rejected": -266.8305969238281, "loss": 0.6785, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.020906824618577957, "rewards/margins": 0.031347136944532394, "rewards/rejected": -0.010440316051244736, "step": 2390 }, { "epoch": 0.6281078251766553, "grad_norm": 1.375, "learning_rate": 1.8276124365652855e-07, "logits/chosen": -2.8458991050720215, "logits/rejected": -2.7970054149627686, "logps/chosen": -278.1546325683594, "logps/rejected": -264.06427001953125, "loss": 0.6782, "rewards/accuracies": 0.65625, "rewards/chosen": 0.01725105755031109, "rewards/margins": 0.031516797840595245, "rewards/rejected": -0.014265733771026134, "step": 2400 }, { "epoch": 0.6281078251766553, "eval_logits/chosen": -2.8540356159210205, "eval_logits/rejected": -2.8272616863250732, "eval_logps/chosen": -280.9810485839844, "eval_logps/rejected": -262.9154968261719, "eval_loss": 0.6776500344276428, "eval_rewards/accuracies": 0.6834999918937683, "eval_rewards/chosen": 0.01792425848543644, "eval_rewards/margins": 0.032695669680833817, "eval_rewards/rejected": -0.014771413058042526, "eval_runtime": 623.7982, "eval_samples_per_second": 3.206, "eval_steps_per_second": 0.401, "step": 2400 }, { "epoch": 0.6307249411148914, "grad_norm": 1.328125, "learning_rate": 1.805637953002149e-07, "logits/chosen": -2.8647313117980957, "logits/rejected": -2.8551206588745117, "logps/chosen": -258.0870666503906, "logps/rejected": -236.39266967773438, "loss": 0.6787, "rewards/accuracies": 0.6875, "rewards/chosen": 0.016582269221544266, "rewards/margins": 0.030466347932815552, "rewards/rejected": -0.013884077779948711, "step": 2410 }, { "epoch": 0.6333420570531274, "grad_norm": 1.1484375, "learning_rate": 1.7837214484701153e-07, "logits/chosen": -2.8555073738098145, "logits/rejected": -2.837476968765259, "logps/chosen": -266.9813537597656, "logps/rejected": -244.21142578125, "loss": 0.673, "rewards/accuracies": 0.71875, "rewards/chosen": 0.023186923936009407, "rewards/margins": 0.04209943115711212, "rewards/rejected": -0.018912509083747864, "step": 2420 }, { "epoch": 0.6359591729913635, "grad_norm": 1.28125, "learning_rate": 1.761864752991004e-07, "logits/chosen": -2.8415451049804688, "logits/rejected": -2.8176722526550293, "logps/chosen": -272.5819396972656, "logps/rejected": -260.46868896484375, "loss": 0.6765, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.019747793674468994, "rewards/margins": 0.034796275198459625, "rewards/rejected": -0.015048478730022907, "step": 2430 }, { "epoch": 0.6385762889295996, "grad_norm": 1.3515625, "learning_rate": 1.7400696915925995e-07, "logits/chosen": -2.8523507118225098, "logits/rejected": -2.8236594200134277, "logps/chosen": -287.25445556640625, "logps/rejected": -227.38986206054688, "loss": 0.6748, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.017432263121008873, "rewards/margins": 0.03874523937702179, "rewards/rejected": -0.021312978118658066, "step": 2440 }, { "epoch": 0.6411934048678356, "grad_norm": 1.6875, "learning_rate": 1.718338084156254e-07, "logits/chosen": -2.797588586807251, "logits/rejected": -2.780844211578369, "logps/chosen": -304.15277099609375, "logps/rejected": -267.3579406738281, "loss": 0.6756, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.023083947598934174, "rewards/margins": 0.036666251718997955, "rewards/rejected": -0.013582308776676655, "step": 2450 }, { "epoch": 0.6438105208060717, "grad_norm": 1.09375, "learning_rate": 1.696671745264937e-07, "logits/chosen": -2.860663652420044, "logits/rejected": -2.863227128982544, "logps/chosen": -295.4175720214844, "logps/rejected": -240.9844207763672, "loss": 0.6724, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.025130432099103928, "rewards/margins": 0.04342951625585556, "rewards/rejected": -0.018299078568816185, "step": 2460 }, { "epoch": 0.6464276367443078, "grad_norm": 1.3359375, "learning_rate": 1.67507248405171e-07, "logits/chosen": -2.846250534057617, "logits/rejected": -2.82800030708313, "logps/chosen": -270.4811706542969, "logps/rejected": -273.17401123046875, "loss": 0.6794, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.020759141072630882, "rewards/margins": 0.029174262657761574, "rewards/rejected": -0.00841512344777584, "step": 2470 }, { "epoch": 0.6490447526825438, "grad_norm": 1.2890625, "learning_rate": 1.6535421040486683e-07, "logits/chosen": -2.760650396347046, "logits/rejected": -2.743494987487793, "logps/chosen": -270.25933837890625, "logps/rejected": -240.8209228515625, "loss": 0.6748, "rewards/accuracies": 0.71875, "rewards/chosen": 0.016713624820113182, "rewards/margins": 0.03836838901042938, "rewards/rejected": -0.02165476605296135, "step": 2480 }, { "epoch": 0.6516618686207799, "grad_norm": 1.234375, "learning_rate": 1.6320824030363456e-07, "logits/chosen": -2.8263466358184814, "logits/rejected": -2.828117609024048, "logps/chosen": -248.79129028320312, "logps/rejected": -235.21774291992188, "loss": 0.6765, "rewards/accuracies": 0.6875, "rewards/chosen": 0.017327528446912766, "rewards/margins": 0.03489188104867935, "rewards/rejected": -0.01756434701383114, "step": 2490 }, { "epoch": 0.654278984559016, "grad_norm": 1.359375, "learning_rate": 1.6106951728936024e-07, "logits/chosen": -2.8797359466552734, "logits/rejected": -2.8334743976593018, "logps/chosen": -271.38677978515625, "logps/rejected": -267.6949768066406, "loss": 0.6753, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.020725153386592865, "rewards/margins": 0.03783208504319191, "rewards/rejected": -0.017106933519244194, "step": 2500 }, { "epoch": 0.654278984559016, "eval_logits/chosen": -2.852537155151367, "eval_logits/rejected": -2.8256473541259766, "eval_logps/chosen": -280.9631042480469, "eval_logps/rejected": -262.90740966796875, "eval_loss": 0.6776077151298523, "eval_rewards/accuracies": 0.6804999709129333, "eval_rewards/chosen": 0.01810392364859581, "eval_rewards/margins": 0.0327942781150341, "eval_rewards/rejected": -0.014690355397760868, "eval_runtime": 623.8344, "eval_samples_per_second": 3.206, "eval_steps_per_second": 0.401, "step": 2500 }, { "epoch": 0.656896100497252, "grad_norm": 1.3125, "learning_rate": 1.5893821994479994e-07, "logits/chosen": -2.860830307006836, "logits/rejected": -2.8481099605560303, "logps/chosen": -290.0372009277344, "logps/rejected": -253.4038848876953, "loss": 0.6757, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.024327334016561508, "rewards/margins": 0.03673567250370979, "rewards/rejected": -0.012408342212438583, "step": 2510 }, { "epoch": 0.6595132164354881, "grad_norm": 1.34375, "learning_rate": 1.5681452623266867e-07, "logits/chosen": -2.8527517318725586, "logits/rejected": -2.8060853481292725, "logps/chosen": -301.49542236328125, "logps/rejected": -247.3957977294922, "loss": 0.6683, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.03281102329492569, "rewards/margins": 0.05217736214399338, "rewards/rejected": -0.019366348162293434, "step": 2520 }, { "epoch": 0.6621303323737242, "grad_norm": 2.484375, "learning_rate": 1.546986134807801e-07, "logits/chosen": -2.8651843070983887, "logits/rejected": -2.8340165615081787, "logps/chosen": -263.162109375, "logps/rejected": -252.37625122070312, "loss": 0.6783, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.015812261030077934, "rewards/margins": 0.0313270129263401, "rewards/rejected": -0.015514750964939594, "step": 2530 }, { "epoch": 0.6647474483119602, "grad_norm": 1.296875, "learning_rate": 1.5259065836724034e-07, "logits/chosen": -2.7946388721466064, "logits/rejected": -2.7776710987091064, "logps/chosen": -262.2623291015625, "logps/rejected": -254.7265167236328, "loss": 0.6787, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.01517055369913578, "rewards/margins": 0.030337844043970108, "rewards/rejected": -0.015167290344834328, "step": 2540 }, { "epoch": 0.6673645642501963, "grad_norm": 1.25, "learning_rate": 1.5049083690569454e-07, "logits/chosen": -2.809542179107666, "logits/rejected": -2.793139934539795, "logps/chosen": -251.2034454345703, "logps/rejected": -249.8028106689453, "loss": 0.6761, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.018413711339235306, "rewards/margins": 0.03606470674276352, "rewards/rejected": -0.017650997266173363, "step": 2550 }, { "epoch": 0.6699816801884323, "grad_norm": 1.2421875, "learning_rate": 1.4839932443063056e-07, "logits/chosen": -2.837368965148926, "logits/rejected": -2.810147523880005, "logps/chosen": -305.4731750488281, "logps/rejected": -251.56161499023438, "loss": 0.6732, "rewards/accuracies": 0.71875, "rewards/chosen": 0.024250676855444908, "rewards/margins": 0.041700925678014755, "rewards/rejected": -0.017450252547860146, "step": 2560 }, { "epoch": 0.6725987961266684, "grad_norm": 1.1171875, "learning_rate": 1.46316295582738e-07, "logits/chosen": -2.8204097747802734, "logits/rejected": -2.8028788566589355, "logps/chosen": -257.53228759765625, "logps/rejected": -245.14895629882812, "loss": 0.6809, "rewards/accuracies": 0.59375, "rewards/chosen": 0.006483917590230703, "rewards/margins": 0.025950897485017776, "rewards/rejected": -0.019466979429125786, "step": 2570 }, { "epoch": 0.6752159120649045, "grad_norm": 1.6171875, "learning_rate": 1.4424192429432655e-07, "logits/chosen": -2.848489999771118, "logits/rejected": -2.8286855220794678, "logps/chosen": -270.4859619140625, "logps/rejected": -277.18206787109375, "loss": 0.6742, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.024375300854444504, "rewards/margins": 0.03957264870405197, "rewards/rejected": -0.015197351574897766, "step": 2580 }, { "epoch": 0.6778330280031405, "grad_norm": 2.0625, "learning_rate": 1.4217638377480158e-07, "logits/chosen": -2.829794406890869, "logits/rejected": -2.8167781829833984, "logps/chosen": -274.6668395996094, "logps/rejected": -262.61700439453125, "loss": 0.6789, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.014721376821398735, "rewards/margins": 0.02995181456208229, "rewards/rejected": -0.015230434946715832, "step": 2590 }, { "epoch": 0.6804501439413766, "grad_norm": 1.3203125, "learning_rate": 1.401198464962021e-07, "logits/chosen": -2.8342947959899902, "logits/rejected": -2.8125643730163574, "logps/chosen": -284.03253173828125, "logps/rejected": -243.7402801513672, "loss": 0.6776, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.015603385865688324, "rewards/margins": 0.03253168612718582, "rewards/rejected": -0.016928300261497498, "step": 2600 }, { "epoch": 0.6804501439413766, "eval_logits/chosen": -2.849823474884033, "eval_logits/rejected": -2.822629690170288, "eval_logps/chosen": -280.9640808105469, "eval_logps/rejected": -262.91668701171875, "eval_loss": 0.6775689721107483, "eval_rewards/accuracies": 0.6775000095367432, "eval_rewards/chosen": 0.018093857914209366, "eval_rewards/margins": 0.0328776054084301, "eval_rewards/rejected": -0.014783743768930435, "eval_runtime": 624.015, "eval_samples_per_second": 3.205, "eval_steps_per_second": 0.401, "step": 2600 }, { "epoch": 0.6830672598796127, "grad_norm": 1.15625, "learning_rate": 1.3807248417879894e-07, "logits/chosen": -2.866183280944824, "logits/rejected": -2.85908842086792, "logps/chosen": -286.482666015625, "logps/rejected": -269.18145751953125, "loss": 0.6742, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.024856090545654297, "rewards/margins": 0.03993413224816322, "rewards/rejected": -0.015078043565154076, "step": 2610 }, { "epoch": 0.6856843758178487, "grad_norm": 1.40625, "learning_rate": 1.3603446777675665e-07, "logits/chosen": -2.7812695503234863, "logits/rejected": -2.7601253986358643, "logps/chosen": -280.71826171875, "logps/rejected": -258.6421813964844, "loss": 0.6757, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.018343228846788406, "rewards/margins": 0.036758117377758026, "rewards/rejected": -0.01841488853096962, "step": 2620 }, { "epoch": 0.6883014917560848, "grad_norm": 1.3359375, "learning_rate": 1.3400596746385814e-07, "logits/chosen": -2.844383716583252, "logits/rejected": -2.8064093589782715, "logps/chosen": -286.34820556640625, "logps/rejected": -258.3263244628906, "loss": 0.6781, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.018320731818675995, "rewards/margins": 0.03194695711135864, "rewards/rejected": -0.013626225292682648, "step": 2630 }, { "epoch": 0.6909186076943209, "grad_norm": 1.1484375, "learning_rate": 1.3198715261929586e-07, "logits/chosen": -2.8701038360595703, "logits/rejected": -2.835305690765381, "logps/chosen": -248.2649688720703, "logps/rejected": -243.41397094726562, "loss": 0.6755, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.015776509419083595, "rewards/margins": 0.03662073612213135, "rewards/rejected": -0.02084423042833805, "step": 2640 }, { "epoch": 0.6935357236325569, "grad_norm": 1.3359375, "learning_rate": 1.299781918135282e-07, "logits/chosen": -2.85074782371521, "logits/rejected": -2.8091163635253906, "logps/chosen": -315.61590576171875, "logps/rejected": -293.9820251464844, "loss": 0.6697, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.035952307283878326, "rewards/margins": 0.04937596619129181, "rewards/rejected": -0.013423657044768333, "step": 2650 }, { "epoch": 0.696152839570793, "grad_norm": 1.3359375, "learning_rate": 1.279792527942045e-07, "logits/chosen": -2.8586459159851074, "logits/rejected": -2.8158531188964844, "logps/chosen": -284.60882568359375, "logps/rejected": -277.387451171875, "loss": 0.6764, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.0246993750333786, "rewards/margins": 0.035598695278167725, "rewards/rejected": -0.010899320244789124, "step": 2660 }, { "epoch": 0.6987699555090291, "grad_norm": 1.6796875, "learning_rate": 1.259905024721576e-07, "logits/chosen": -2.8398678302764893, "logits/rejected": -2.823967695236206, "logps/chosen": -273.98944091796875, "logps/rejected": -254.6985321044922, "loss": 0.6742, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.019164234399795532, "rewards/margins": 0.039530009031295776, "rewards/rejected": -0.020365772768855095, "step": 2670 }, { "epoch": 0.7013870714472651, "grad_norm": 1.3984375, "learning_rate": 1.2401210690746703e-07, "logits/chosen": -2.8383288383483887, "logits/rejected": -2.8142457008361816, "logps/chosen": -283.2543029785156, "logps/rejected": -252.313720703125, "loss": 0.6763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02543952502310276, "rewards/margins": 0.03542017191648483, "rewards/rejected": -0.009980651549994946, "step": 2680 }, { "epoch": 0.7040041873855012, "grad_norm": 1.5625, "learning_rate": 1.2204423129559305e-07, "logits/chosen": -2.861647844314575, "logits/rejected": -2.8627748489379883, "logps/chosen": -281.1255798339844, "logps/rejected": -280.34027099609375, "loss": 0.6749, "rewards/accuracies": 0.71875, "rewards/chosen": 0.021706473082304, "rewards/margins": 0.03822758048772812, "rewards/rejected": -0.016521107405424118, "step": 2690 }, { "epoch": 0.7066213033237373, "grad_norm": 2.171875, "learning_rate": 1.2008703995358299e-07, "logits/chosen": -2.837878704071045, "logits/rejected": -2.8239123821258545, "logps/chosen": -279.67852783203125, "logps/rejected": -253.0082550048828, "loss": 0.6774, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.020584728568792343, "rewards/margins": 0.033195436000823975, "rewards/rejected": -0.012610706500709057, "step": 2700 }, { "epoch": 0.7066213033237373, "eval_logits/chosen": -2.85300874710083, "eval_logits/rejected": -2.82612943649292, "eval_logps/chosen": -280.95526123046875, "eval_logps/rejected": -262.92626953125, "eval_loss": 0.6774773001670837, "eval_rewards/accuracies": 0.6859999895095825, "eval_rewards/chosen": 0.018182458356022835, "eval_rewards/margins": 0.03306160494685173, "eval_rewards/rejected": -0.014879145659506321, "eval_runtime": 623.2959, "eval_samples_per_second": 3.209, "eval_steps_per_second": 0.401, "step": 2700 }, { "epoch": 0.7092384192619733, "grad_norm": 1.6171875, "learning_rate": 1.1814069630635068e-07, "logits/chosen": -2.8202016353607178, "logits/rejected": -2.8188061714172363, "logps/chosen": -286.5379333496094, "logps/rejected": -281.89678955078125, "loss": 0.6787, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.02239087037742138, "rewards/margins": 0.030664747580885887, "rewards/rejected": -0.008273878134787083, "step": 2710 }, { "epoch": 0.7118555352002094, "grad_norm": 1.7109375, "learning_rate": 1.1620536287303051e-07, "logits/chosen": -2.8520309925079346, "logits/rejected": -2.8322553634643555, "logps/chosen": -306.9613342285156, "logps/rejected": -276.6466979980469, "loss": 0.6798, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.024515343829989433, "rewards/margins": 0.02836497500538826, "rewards/rejected": -0.0038496279157698154, "step": 2720 }, { "epoch": 0.7144726511384454, "grad_norm": 1.2265625, "learning_rate": 1.1428120125340716e-07, "logits/chosen": -2.8408806324005127, "logits/rejected": -2.8193554878234863, "logps/chosen": -278.6138000488281, "logps/rejected": -233.0282440185547, "loss": 0.6728, "rewards/accuracies": 0.78125, "rewards/chosen": 0.020976107567548752, "rewards/margins": 0.042338818311691284, "rewards/rejected": -0.021362707018852234, "step": 2730 }, { "epoch": 0.7170897670766815, "grad_norm": 1.3515625, "learning_rate": 1.123683721144223e-07, "logits/chosen": -2.8390605449676514, "logits/rejected": -2.817472457885742, "logps/chosen": -299.55767822265625, "logps/rejected": -270.6512145996094, "loss": 0.6779, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.022130262106657028, "rewards/margins": 0.03234560787677765, "rewards/rejected": -0.010215344838798046, "step": 2740 }, { "epoch": 0.7197068830149176, "grad_norm": 1.4296875, "learning_rate": 1.1046703517675845e-07, "logits/chosen": -2.8513295650482178, "logits/rejected": -2.834134578704834, "logps/chosen": -269.61492919921875, "logps/rejected": -279.116455078125, "loss": 0.6781, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.020882591605186462, "rewards/margins": 0.03162946552038193, "rewards/rejected": -0.010746878571808338, "step": 2750 }, { "epoch": 0.7223239989531536, "grad_norm": 1.25, "learning_rate": 1.085773492015028e-07, "logits/chosen": -2.837613821029663, "logits/rejected": -2.811807155609131, "logps/chosen": -262.3995361328125, "logps/rejected": -229.64999389648438, "loss": 0.671, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02249622717499733, "rewards/margins": 0.04641376808285713, "rewards/rejected": -0.023917539045214653, "step": 2760 }, { "epoch": 0.7249411148913897, "grad_norm": 1.421875, "learning_rate": 1.0669947197689033e-07, "logits/chosen": -2.829591989517212, "logits/rejected": -2.78979754447937, "logps/chosen": -289.36322021484375, "logps/rejected": -266.07537841796875, "loss": 0.6777, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.016385111957788467, "rewards/margins": 0.0324719175696373, "rewards/rejected": -0.01608681119978428, "step": 2770 }, { "epoch": 0.7275582308296258, "grad_norm": 1.53125, "learning_rate": 1.048335603051291e-07, "logits/chosen": -2.811000347137451, "logits/rejected": -2.7975831031799316, "logps/chosen": -303.7996520996094, "logps/rejected": -278.1279296875, "loss": 0.6693, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.022211695089936256, "rewards/margins": 0.05009465292096138, "rewards/rejected": -0.027882959693670273, "step": 2780 }, { "epoch": 0.7301753467678618, "grad_norm": 2.515625, "learning_rate": 1.0297976998930663e-07, "logits/chosen": -2.853620767593384, "logits/rejected": -2.842625379562378, "logps/chosen": -290.62567138671875, "logps/rejected": -259.65020751953125, "loss": 0.6734, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.018552960827946663, "rewards/margins": 0.04151231050491333, "rewards/rejected": -0.022959351539611816, "step": 2790 }, { "epoch": 0.7327924627060979, "grad_norm": 1.421875, "learning_rate": 1.0113825582038077e-07, "logits/chosen": -2.854717493057251, "logits/rejected": -2.836151123046875, "logps/chosen": -279.61346435546875, "logps/rejected": -265.38531494140625, "loss": 0.679, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.020294269546866417, "rewards/margins": 0.02987518534064293, "rewards/rejected": -0.009580916725099087, "step": 2800 }, { "epoch": 0.7327924627060979, "eval_logits/chosen": -2.853910446166992, "eval_logits/rejected": -2.8271100521087646, "eval_logps/chosen": -280.93585205078125, "eval_logps/rejected": -262.9162292480469, "eval_loss": 0.6774327754974365, "eval_rewards/accuracies": 0.6850000023841858, "eval_rewards/chosen": 0.018376635387539864, "eval_rewards/margins": 0.0331551730632782, "eval_rewards/rejected": -0.014778541401028633, "eval_runtime": 623.7205, "eval_samples_per_second": 3.207, "eval_steps_per_second": 0.401, "step": 2800 }, { "epoch": 0.735409578644334, "grad_norm": 1.453125, "learning_rate": 9.930917156425475e-08, "logits/chosen": -2.86027193069458, "logits/rejected": -2.840989828109741, "logps/chosen": -278.8993225097656, "logps/rejected": -277.65216064453125, "loss": 0.6774, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.01882421225309372, "rewards/margins": 0.033416565507650375, "rewards/rejected": -0.01459235418587923, "step": 2810 }, { "epoch": 0.73802669458257, "grad_norm": 1.484375, "learning_rate": 9.749266994893754e-08, "logits/chosen": -2.8003342151641846, "logits/rejected": -2.7634270191192627, "logps/chosen": -253.88516235351562, "logps/rejected": -244.5758056640625, "loss": 0.6833, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.008167347870767117, "rewards/margins": 0.02078983187675476, "rewards/rejected": -0.012622484937310219, "step": 2820 }, { "epoch": 0.7406438105208061, "grad_norm": 1.265625, "learning_rate": 9.568890265179128e-08, "logits/chosen": -2.8219611644744873, "logits/rejected": -2.818441867828369, "logps/chosen": -277.968994140625, "logps/rejected": -250.047119140625, "loss": 0.6772, "rewards/accuracies": 0.65625, "rewards/chosen": 0.020925423130393028, "rewards/margins": 0.033755991607904434, "rewards/rejected": -0.012830562889575958, "step": 2830 }, { "epoch": 0.7432609264590422, "grad_norm": 1.265625, "learning_rate": 9.389802028686616e-08, "logits/chosen": -2.8413894176483154, "logits/rejected": -2.823989152908325, "logps/chosen": -277.80926513671875, "logps/rejected": -246.23593139648438, "loss": 0.6805, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.011136185377836227, "rewards/margins": 0.02661561407148838, "rewards/rejected": -0.015479430556297302, "step": 2840 }, { "epoch": 0.7458780423972782, "grad_norm": 1.2734375, "learning_rate": 9.212017239232426e-08, "logits/chosen": -2.831408977508545, "logits/rejected": -2.8223624229431152, "logps/chosen": -287.7478942871094, "logps/rejected": -267.5721435546875, "loss": 0.673, "rewards/accuracies": 0.71875, "rewards/chosen": 0.021497588604688644, "rewards/margins": 0.04216768592596054, "rewards/rejected": -0.020670095458626747, "step": 2850 }, { "epoch": 0.7484951583355143, "grad_norm": 1.609375, "learning_rate": 9.035550741795328e-08, "logits/chosen": -2.814898729324341, "logits/rejected": -2.8209142684936523, "logps/chosen": -271.9150085449219, "logps/rejected": -279.5975646972656, "loss": 0.6727, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.02707098424434662, "rewards/margins": 0.04284884035587311, "rewards/rejected": -0.01577785238623619, "step": 2860 }, { "epoch": 0.7511122742737504, "grad_norm": 1.2109375, "learning_rate": 8.860417271277065e-08, "logits/chosen": -2.8868813514709473, "logits/rejected": -2.88276743888855, "logps/chosen": -283.67864990234375, "logps/rejected": -275.3718566894531, "loss": 0.6819, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.018196921795606613, "rewards/margins": 0.02416098490357399, "rewards/rejected": -0.005964064504951239, "step": 2870 }, { "epoch": 0.7537293902119864, "grad_norm": 1.3359375, "learning_rate": 8.686631451272029e-08, "logits/chosen": -2.869019031524658, "logits/rejected": -2.84165096282959, "logps/chosen": -268.4243469238281, "logps/rejected": -247.5635986328125, "loss": 0.6799, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0169739481061697, "rewards/margins": 0.028122667223215103, "rewards/rejected": -0.011148716323077679, "step": 2880 }, { "epoch": 0.7563465061502225, "grad_norm": 1.4453125, "learning_rate": 8.514207792846168e-08, "logits/chosen": -2.8492226600646973, "logits/rejected": -2.844165086746216, "logps/chosen": -265.9365539550781, "logps/rejected": -237.9735107421875, "loss": 0.6782, "rewards/accuracies": 0.71875, "rewards/chosen": 0.012092510238289833, "rewards/margins": 0.031133780255913734, "rewards/rejected": -0.0190412737429142, "step": 2890 }, { "epoch": 0.7589636220884585, "grad_norm": 1.34375, "learning_rate": 8.343160693325355e-08, "logits/chosen": -2.815880060195923, "logits/rejected": -2.804381847381592, "logps/chosen": -269.6540832519531, "logps/rejected": -268.2055358886719, "loss": 0.6782, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.017512865364551544, "rewards/margins": 0.03190717473626137, "rewards/rejected": -0.014394307509064674, "step": 2900 }, { "epoch": 0.7589636220884585, "eval_logits/chosen": -2.852909803390503, "eval_logits/rejected": -2.826012134552002, "eval_logps/chosen": -280.9681396484375, "eval_logps/rejected": -262.93359375, "eval_loss": 0.6775044202804565, "eval_rewards/accuracies": 0.684499979019165, "eval_rewards/chosen": 0.01805364154279232, "eval_rewards/margins": 0.033006127923727036, "eval_rewards/rejected": -0.014952489174902439, "eval_runtime": 622.9143, "eval_samples_per_second": 3.211, "eval_steps_per_second": 0.401, "step": 2900 }, { "epoch": 0.7615807380266946, "grad_norm": 1.09375, "learning_rate": 8.173504435093173e-08, "logits/chosen": -2.832644462585449, "logits/rejected": -2.7984976768493652, "logps/chosen": -263.4750061035156, "logps/rejected": -228.9986114501953, "loss": 0.674, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.018485140055418015, "rewards/margins": 0.040302351117134094, "rewards/rejected": -0.02181720733642578, "step": 2910 }, { "epoch": 0.7641978539649307, "grad_norm": 1.2890625, "learning_rate": 8.005253184398359e-08, "logits/chosen": -2.8306822776794434, "logits/rejected": -2.8159360885620117, "logps/chosen": -292.702880859375, "logps/rejected": -288.18695068359375, "loss": 0.6762, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.02734486199915409, "rewards/margins": 0.03592860698699951, "rewards/rejected": -0.008583742193877697, "step": 2920 }, { "epoch": 0.7668149699031667, "grad_norm": 1.21875, "learning_rate": 7.838420990171926e-08, "logits/chosen": -2.8607592582702637, "logits/rejected": -2.825814962387085, "logps/chosen": -286.3602600097656, "logps/rejected": -260.51641845703125, "loss": 0.6777, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01990603655576706, "rewards/margins": 0.03237393498420715, "rewards/rejected": -0.012467900291085243, "step": 2930 }, { "epoch": 0.7694320858414028, "grad_norm": 1.2578125, "learning_rate": 7.673021782854083e-08, "logits/chosen": -2.780974864959717, "logits/rejected": -2.763579845428467, "logps/chosen": -284.2770690917969, "logps/rejected": -233.38668823242188, "loss": 0.6742, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.02143119089305401, "rewards/margins": 0.04003281518816948, "rewards/rejected": -0.01860162802040577, "step": 2940 }, { "epoch": 0.7720492017796389, "grad_norm": 1.234375, "learning_rate": 7.509069373231039e-08, "logits/chosen": -2.8137221336364746, "logits/rejected": -2.788684368133545, "logps/chosen": -266.8929138183594, "logps/rejected": -250.69326782226562, "loss": 0.6767, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.018960343673825264, "rewards/margins": 0.034537579864263535, "rewards/rejected": -0.015577234327793121, "step": 2950 }, { "epoch": 0.7746663177178749, "grad_norm": 1.3203125, "learning_rate": 7.346577451281821e-08, "logits/chosen": -2.823444366455078, "logits/rejected": -2.826974391937256, "logps/chosen": -279.7121276855469, "logps/rejected": -261.50518798828125, "loss": 0.6768, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.018806222826242447, "rewards/margins": 0.03476772829890251, "rewards/rejected": -0.015961505472660065, "step": 2960 }, { "epoch": 0.777283433656111, "grad_norm": 1.59375, "learning_rate": 7.185559585035136e-08, "logits/chosen": -2.84146785736084, "logits/rejected": -2.8069043159484863, "logps/chosen": -296.74932861328125, "logps/rejected": -284.635009765625, "loss": 0.6736, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.02409261278808117, "rewards/margins": 0.041189759969711304, "rewards/rejected": -0.017097145318984985, "step": 2970 }, { "epoch": 0.7799005495943471, "grad_norm": 1.5703125, "learning_rate": 7.026029219436502e-08, "logits/chosen": -2.816230535507202, "logits/rejected": -2.792483329772949, "logps/chosen": -270.294189453125, "logps/rejected": -262.2767333984375, "loss": 0.6756, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.01601272262632847, "rewards/margins": 0.03693497180938721, "rewards/rejected": -0.02092224732041359, "step": 2980 }, { "epoch": 0.7825176655325831, "grad_norm": 1.1875, "learning_rate": 6.867999675225522e-08, "logits/chosen": -2.864154815673828, "logits/rejected": -2.833688259124756, "logps/chosen": -245.67288208007812, "logps/rejected": -232.77743530273438, "loss": 0.6775, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.017845796421170235, "rewards/margins": 0.03295399993658066, "rewards/rejected": -0.015108207240700722, "step": 2990 }, { "epoch": 0.7851347814708192, "grad_norm": 1.203125, "learning_rate": 6.711484147823662e-08, "logits/chosen": -2.8105015754699707, "logits/rejected": -2.8049824237823486, "logps/chosen": -248.8085479736328, "logps/rejected": -257.3614807128906, "loss": 0.6784, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.017226997762918472, "rewards/margins": 0.03089422546327114, "rewards/rejected": -0.01366722583770752, "step": 3000 }, { "epoch": 0.7851347814708192, "eval_logits/chosen": -2.8550119400024414, "eval_logits/rejected": -2.8283438682556152, "eval_logps/chosen": -280.97314453125, "eval_logps/rejected": -262.9586486816406, "eval_loss": 0.67740797996521, "eval_rewards/accuracies": 0.6890000104904175, "eval_rewards/chosen": 0.018003566190600395, "eval_rewards/margins": 0.03320648893713951, "eval_rewards/rejected": -0.015202920883893967, "eval_runtime": 623.2623, "eval_samples_per_second": 3.209, "eval_steps_per_second": 0.401, "step": 3000 }, { "epoch": 0.7877518974090553, "grad_norm": 1.4609375, "learning_rate": 6.556495706232412e-08, "logits/chosen": -2.820091724395752, "logits/rejected": -2.819214105606079, "logps/chosen": -285.6153869628906, "logps/rejected": -269.1045227050781, "loss": 0.6751, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.017025303095579147, "rewards/margins": 0.03793289139866829, "rewards/rejected": -0.020907586440443993, "step": 3010 }, { "epoch": 0.7903690133472913, "grad_norm": 1.4375, "learning_rate": 6.403047291942057e-08, "logits/chosen": -2.7955071926116943, "logits/rejected": -2.757645606994629, "logps/chosen": -243.58535766601562, "logps/rejected": -218.78341674804688, "loss": 0.6802, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.010361788794398308, "rewards/margins": 0.02738323248922825, "rewards/rejected": -0.017021439969539642, "step": 3020 }, { "epoch": 0.7929861292855274, "grad_norm": 1.6328125, "learning_rate": 6.251151717851021e-08, "logits/chosen": -2.819065570831299, "logits/rejected": -2.8068315982818604, "logps/chosen": -249.41567993164062, "logps/rejected": -235.5353546142578, "loss": 0.681, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.016130346804857254, "rewards/margins": 0.025897834450006485, "rewards/rejected": -0.009767485782504082, "step": 3030 }, { "epoch": 0.7956032452237635, "grad_norm": 1.1328125, "learning_rate": 6.100821667196041e-08, "logits/chosen": -2.893716812133789, "logits/rejected": -2.835082530975342, "logps/chosen": -288.95574951171875, "logps/rejected": -224.43881225585938, "loss": 0.6735, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.01886625960469246, "rewards/margins": 0.04120669886469841, "rewards/rejected": -0.0223404411226511, "step": 3040 }, { "epoch": 0.7982203611619995, "grad_norm": 1.4140625, "learning_rate": 5.952069692493061e-08, "logits/chosen": -2.790799617767334, "logits/rejected": -2.7859864234924316, "logps/chosen": -243.0890350341797, "logps/rejected": -251.2998504638672, "loss": 0.6737, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.025469347834587097, "rewards/margins": 0.041153885424137115, "rewards/rejected": -0.01568453758955002, "step": 3050 }, { "epoch": 0.8008374771002356, "grad_norm": 1.578125, "learning_rate": 5.8049082144891794e-08, "logits/chosen": -2.7803027629852295, "logits/rejected": -2.7686164379119873, "logps/chosen": -278.40985107421875, "logps/rejected": -323.7679443359375, "loss": 0.6815, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.01774556189775467, "rewards/margins": 0.024822643026709557, "rewards/rejected": -0.007077082060277462, "step": 3060 }, { "epoch": 0.8034545930384716, "grad_norm": 1.1953125, "learning_rate": 5.659349521125459e-08, "logits/chosen": -2.9022018909454346, "logits/rejected": -2.901132822036743, "logps/chosen": -296.6135559082031, "logps/rejected": -276.8060607910156, "loss": 0.6787, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.020619522780179977, "rewards/margins": 0.030655449256300926, "rewards/rejected": -0.010035926476120949, "step": 3070 }, { "epoch": 0.8060717089767077, "grad_norm": 1.2578125, "learning_rate": 5.5154057665109e-08, "logits/chosen": -2.8536829948425293, "logits/rejected": -2.8322811126708984, "logps/chosen": -274.37506103515625, "logps/rejected": -250.5011749267578, "loss": 0.6774, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.017293009907007217, "rewards/margins": 0.03290316089987755, "rewards/rejected": -0.015610149130225182, "step": 3080 }, { "epoch": 0.8086888249149438, "grad_norm": 1.34375, "learning_rate": 5.3730889699075853e-08, "logits/chosen": -2.864671230316162, "logits/rejected": -2.833627700805664, "logps/chosen": -294.9132995605469, "logps/rejected": -240.95458984375, "loss": 0.6754, "rewards/accuracies": 0.6875, "rewards/chosen": 0.020536890253424644, "rewards/margins": 0.037192363291978836, "rewards/rejected": -0.016655471175909042, "step": 3090 }, { "epoch": 0.8113059408531798, "grad_norm": 1.34375, "learning_rate": 5.2324110147270893e-08, "logits/chosen": -2.837832450866699, "logits/rejected": -2.827129602432251, "logps/chosen": -296.73492431640625, "logps/rejected": -290.35296630859375, "loss": 0.6713, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.03090183436870575, "rewards/margins": 0.04617828503251076, "rewards/rejected": -0.015276448801159859, "step": 3100 }, { "epoch": 0.8113059408531798, "eval_logits/chosen": -2.8546648025512695, "eval_logits/rejected": -2.827969789505005, "eval_logps/chosen": -280.9596252441406, "eval_logps/rejected": -262.92376708984375, "eval_loss": 0.6775153279304504, "eval_rewards/accuracies": 0.6825000047683716, "eval_rewards/chosen": 0.01813914068043232, "eval_rewards/margins": 0.03299335017800331, "eval_rewards/rejected": -0.014854210428893566, "eval_runtime": 623.3449, "eval_samples_per_second": 3.208, "eval_steps_per_second": 0.401, "step": 3100 }, { "epoch": 0.8139230567914159, "grad_norm": 1.2578125, "learning_rate": 5.0933836475381795e-08, "logits/chosen": -2.8541502952575684, "logits/rejected": -2.819565534591675, "logps/chosen": -299.7816467285156, "logps/rejected": -286.51507568359375, "loss": 0.6744, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.02967965044081211, "rewards/margins": 0.039316095411777496, "rewards/rejected": -0.00963644403964281, "step": 3110 }, { "epoch": 0.816540172729652, "grad_norm": 1.8203125, "learning_rate": 4.956018477086005e-08, "logits/chosen": -2.8387677669525146, "logits/rejected": -2.8084654808044434, "logps/chosen": -289.3456115722656, "logps/rejected": -263.594970703125, "loss": 0.6773, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.019268421456217766, "rewards/margins": 0.03356018662452698, "rewards/rejected": -0.014291766099631786, "step": 3120 }, { "epoch": 0.819157288667888, "grad_norm": 1.203125, "learning_rate": 4.820326973322763e-08, "logits/chosen": -2.839634656906128, "logits/rejected": -2.8185315132141113, "logps/chosen": -266.79034423828125, "logps/rejected": -267.4413146972656, "loss": 0.6779, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.010893099941313267, "rewards/margins": 0.03200749307870865, "rewards/rejected": -0.021114394068717957, "step": 3130 }, { "epoch": 0.821774404606124, "grad_norm": 1.2578125, "learning_rate": 4.686320466449981e-08, "logits/chosen": -2.8295464515686035, "logits/rejected": -2.778109312057495, "logps/chosen": -256.5823059082031, "logps/rejected": -256.2238464355469, "loss": 0.6793, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.012616870924830437, "rewards/margins": 0.029145419597625732, "rewards/rejected": -0.016528548672795296, "step": 3140 }, { "epoch": 0.8243915205443602, "grad_norm": 1.3359375, "learning_rate": 4.554010145972417e-08, "logits/chosen": -2.8855738639831543, "logits/rejected": -2.836945056915283, "logps/chosen": -278.258544921875, "logps/rejected": -268.62969970703125, "loss": 0.6786, "rewards/accuracies": 0.65625, "rewards/chosen": 0.016889285296201706, "rewards/margins": 0.030887436121702194, "rewards/rejected": -0.013998152688145638, "step": 3150 }, { "epoch": 0.8270086364825961, "grad_norm": 1.25, "learning_rate": 4.423407059763745e-08, "logits/chosen": -2.8406424522399902, "logits/rejected": -2.825413227081299, "logps/chosen": -289.0135192871094, "logps/rejected": -282.2171630859375, "loss": 0.6774, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.020682137459516525, "rewards/margins": 0.0332643985748291, "rewards/rejected": -0.012582260183990002, "step": 3160 }, { "epoch": 0.8296257524208323, "grad_norm": 1.15625, "learning_rate": 4.294522113144078e-08, "logits/chosen": -2.791628360748291, "logits/rejected": -2.753540515899658, "logps/chosen": -284.7926025390625, "logps/rejected": -252.89364624023438, "loss": 0.6749, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.02161264792084694, "rewards/margins": 0.03834725171327591, "rewards/rejected": -0.01673460379242897, "step": 3170 }, { "epoch": 0.8322428683590684, "grad_norm": 1.140625, "learning_rate": 4.1673660679693804e-08, "logits/chosen": -2.8362486362457275, "logits/rejected": -2.8258681297302246, "logps/chosen": -235.7146759033203, "logps/rejected": -263.33929443359375, "loss": 0.6785, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.01654769480228424, "rewards/margins": 0.03069342114031315, "rewards/rejected": -0.014145726338028908, "step": 3180 }, { "epoch": 0.8348599842973043, "grad_norm": 1.4609375, "learning_rate": 4.041949541732825e-08, "logits/chosen": -2.846217632293701, "logits/rejected": -2.845409870147705, "logps/chosen": -278.4856872558594, "logps/rejected": -266.3340759277344, "loss": 0.678, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.018012622371315956, "rewards/margins": 0.03179007023572922, "rewards/rejected": -0.01377745158970356, "step": 3190 }, { "epoch": 0.8374771002355405, "grad_norm": 2.15625, "learning_rate": 3.9182830066782605e-08, "logits/chosen": -2.809504747390747, "logits/rejected": -2.8133652210235596, "logps/chosen": -273.1795349121094, "logps/rejected": -288.18035888671875, "loss": 0.6774, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.016945619136095047, "rewards/margins": 0.03313397616147995, "rewards/rejected": -0.016188358888030052, "step": 3200 }, { "epoch": 0.8374771002355405, "eval_logits/chosen": -2.854266405105591, "eval_logits/rejected": -2.8275365829467773, "eval_logps/chosen": -280.958251953125, "eval_logps/rejected": -262.9411315917969, "eval_loss": 0.6774209141731262, "eval_rewards/accuracies": 0.6830000281333923, "eval_rewards/chosen": 0.018152602016925812, "eval_rewards/margins": 0.03318041190505028, "eval_rewards/rejected": -0.015027807094156742, "eval_runtime": 624.1731, "eval_samples_per_second": 3.204, "eval_steps_per_second": 0.401, "step": 3200 }, { "epoch": 0.8400942161737766, "grad_norm": 1.484375, "learning_rate": 3.79637678892577e-08, "logits/chosen": -2.8115928173065186, "logits/rejected": -2.8144474029541016, "logps/chosen": -290.44683837890625, "logps/rejected": -275.8350830078125, "loss": 0.6819, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.01646428182721138, "rewards/margins": 0.0238783098757267, "rewards/rejected": -0.007414024323225021, "step": 3210 }, { "epoch": 0.8427113321120125, "grad_norm": 1.203125, "learning_rate": 3.6762410676094645e-08, "logits/chosen": -2.822303533554077, "logits/rejected": -2.8179895877838135, "logps/chosen": -316.88623046875, "logps/rejected": -273.47967529296875, "loss": 0.6733, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.02506067231297493, "rewards/margins": 0.04178461804986, "rewards/rejected": -0.01672394946217537, "step": 3220 }, { "epoch": 0.8453284480502486, "grad_norm": 1.21875, "learning_rate": 3.557885874027497e-08, "logits/chosen": -2.8261001110076904, "logits/rejected": -2.814418077468872, "logps/chosen": -276.53558349609375, "logps/rejected": -266.6348571777344, "loss": 0.6794, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.018592124804854393, "rewards/margins": 0.029115628451108932, "rewards/rejected": -0.010523504577577114, "step": 3230 }, { "epoch": 0.8479455639884846, "grad_norm": 1.359375, "learning_rate": 3.441321090804469e-08, "logits/chosen": -2.8752944469451904, "logits/rejected": -2.841357469558716, "logps/chosen": -281.1075744628906, "logps/rejected": -243.8683319091797, "loss": 0.6785, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01994512416422367, "rewards/margins": 0.030809426680207253, "rewards/rejected": -0.010864300653338432, "step": 3240 }, { "epoch": 0.8505626799267207, "grad_norm": 1.25, "learning_rate": 3.326556451066234e-08, "logits/chosen": -2.877654552459717, "logits/rejected": -2.847358465194702, "logps/chosen": -308.5765380859375, "logps/rejected": -283.89654541015625, "loss": 0.6741, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02807055041193962, "rewards/margins": 0.04025264084339142, "rewards/rejected": -0.012182091362774372, "step": 3250 }, { "epoch": 0.8531797958649568, "grad_norm": 1.2578125, "learning_rate": 3.2136015376271946e-08, "logits/chosen": -2.8326189517974854, "logits/rejected": -2.801771640777588, "logps/chosen": -274.5975646972656, "logps/rejected": -257.2004699707031, "loss": 0.6812, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.010432440787553787, "rewards/margins": 0.02520870603621006, "rewards/rejected": -0.014776261523365974, "step": 3260 }, { "epoch": 0.8557969118031928, "grad_norm": 1.6640625, "learning_rate": 3.102465782190106e-08, "logits/chosen": -2.84391713142395, "logits/rejected": -2.8385825157165527, "logps/chosen": -264.6709899902344, "logps/rejected": -251.43215942382812, "loss": 0.6785, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.017527025192975998, "rewards/margins": 0.0312645398080349, "rewards/rejected": -0.01373751275241375, "step": 3270 }, { "epoch": 0.8584140277414289, "grad_norm": 1.28125, "learning_rate": 2.993158464558565e-08, "logits/chosen": -2.8273541927337646, "logits/rejected": -2.8198060989379883, "logps/chosen": -289.29791259765625, "logps/rejected": -293.8225402832031, "loss": 0.6809, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.02297678217291832, "rewards/margins": 0.026020046323537827, "rewards/rejected": -0.0030432622879743576, "step": 3280 }, { "epoch": 0.861031143679665, "grad_norm": 1.359375, "learning_rate": 2.8856887118621358e-08, "logits/chosen": -2.870941638946533, "logits/rejected": -2.8777921199798584, "logps/chosen": -274.54119873046875, "logps/rejected": -275.3123474121094, "loss": 0.6788, "rewards/accuracies": 0.625, "rewards/chosen": 0.013969512656331062, "rewards/margins": 0.030474882572889328, "rewards/rejected": -0.016505368053913116, "step": 3290 }, { "epoch": 0.863648259617901, "grad_norm": 1.296875, "learning_rate": 2.7800654977942482e-08, "logits/chosen": -2.828477382659912, "logits/rejected": -2.7952821254730225, "logps/chosen": -273.37109375, "logps/rejected": -293.2996520996094, "loss": 0.6781, "rewards/accuracies": 0.6875, "rewards/chosen": 0.017902836203575134, "rewards/margins": 0.03182779997587204, "rewards/rejected": -0.013924960978329182, "step": 3300 }, { "epoch": 0.863648259617901, "eval_logits/chosen": -2.8559255599975586, "eval_logits/rejected": -2.8293371200561523, "eval_logps/chosen": -280.9559326171875, "eval_logps/rejected": -262.91455078125, "eval_loss": 0.6775384545326233, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": 0.018175845965743065, "eval_rewards/margins": 0.03293789178133011, "eval_rewards/rejected": -0.014762048609554768, "eval_runtime": 623.0311, "eval_samples_per_second": 3.21, "eval_steps_per_second": 0.401, "step": 3300 }, { "epoch": 0.8662653755561371, "grad_norm": 1.0625, "learning_rate": 2.676297641862879e-08, "logits/chosen": -2.8453030586242676, "logits/rejected": -2.831481456756592, "logps/chosen": -240.8975830078125, "logps/rejected": -200.9988250732422, "loss": 0.6755, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.016030047088861465, "rewards/margins": 0.03695772588253021, "rewards/rejected": -0.020927678793668747, "step": 3310 }, { "epoch": 0.8688824914943732, "grad_norm": 1.3125, "learning_rate": 2.5743938086541352e-08, "logits/chosen": -2.8288021087646484, "logits/rejected": -2.802476167678833, "logps/chosen": -278.8105773925781, "logps/rejected": -255.1897430419922, "loss": 0.6772, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.013786676339805126, "rewards/margins": 0.03420311585068703, "rewards/rejected": -0.020416438579559326, "step": 3320 }, { "epoch": 0.8714996074326092, "grad_norm": 1.40625, "learning_rate": 2.474362507108757e-08, "logits/chosen": -2.8980019092559814, "logits/rejected": -2.8595707416534424, "logps/chosen": -289.9441833496094, "logps/rejected": -271.2764587402344, "loss": 0.6707, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.026317168027162552, "rewards/margins": 0.047435760498046875, "rewards/rejected": -0.02111859992146492, "step": 3330 }, { "epoch": 0.8741167233708453, "grad_norm": 1.40625, "learning_rate": 2.3762120898116495e-08, "logits/chosen": -2.849844455718994, "logits/rejected": -2.834979772567749, "logps/chosen": -287.1720886230469, "logps/rejected": -279.83941650390625, "loss": 0.6816, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.00895099900662899, "rewards/margins": 0.0245995931327343, "rewards/rejected": -0.01564859412610531, "step": 3340 }, { "epoch": 0.8767338393090814, "grad_norm": 1.546875, "learning_rate": 2.2799507522944044e-08, "logits/chosen": -2.7699649333953857, "logits/rejected": -2.7505178451538086, "logps/chosen": -284.6213073730469, "logps/rejected": -281.0083923339844, "loss": 0.6758, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.02020053006708622, "rewards/margins": 0.036288876086473465, "rewards/rejected": -0.016088349744677544, "step": 3350 }, { "epoch": 0.8793509552473174, "grad_norm": 1.4296875, "learning_rate": 2.1855865323510054e-08, "logits/chosen": -2.8084092140197754, "logits/rejected": -2.7662644386291504, "logps/chosen": -292.76629638671875, "logps/rejected": -290.1708679199219, "loss": 0.6738, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.022515032440423965, "rewards/margins": 0.04099477082490921, "rewards/rejected": -0.018479738384485245, "step": 3360 }, { "epoch": 0.8819680711855535, "grad_norm": 1.3125, "learning_rate": 2.0931273093666573e-08, "logits/chosen": -2.817469358444214, "logits/rejected": -2.792358875274658, "logps/chosen": -256.8840637207031, "logps/rejected": -239.3094024658203, "loss": 0.6749, "rewards/accuracies": 0.71875, "rewards/chosen": 0.013278042897582054, "rewards/margins": 0.03780357167124748, "rewards/rejected": -0.02452552691102028, "step": 3370 }, { "epoch": 0.8845851871237895, "grad_norm": 1.3671875, "learning_rate": 2.002580803659873e-08, "logits/chosen": -2.821927070617676, "logits/rejected": -2.779125213623047, "logps/chosen": -268.94305419921875, "logps/rejected": -259.77130126953125, "loss": 0.6793, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.011967052705585957, "rewards/margins": 0.02943551540374756, "rewards/rejected": -0.017468463629484177, "step": 3380 }, { "epoch": 0.8872023030620256, "grad_norm": 1.171875, "learning_rate": 1.9139545758378256e-08, "logits/chosen": -2.8498919010162354, "logits/rejected": -2.797402858734131, "logps/chosen": -284.4458923339844, "logps/rejected": -238.79989624023438, "loss": 0.6721, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.028188347816467285, "rewards/margins": 0.04387947544455528, "rewards/rejected": -0.015691127628087997, "step": 3390 }, { "epoch": 0.8898194190002617, "grad_norm": 1.3671875, "learning_rate": 1.8272560261650277e-08, "logits/chosen": -2.8581271171569824, "logits/rejected": -2.828350305557251, "logps/chosen": -329.14361572265625, "logps/rejected": -273.26776123046875, "loss": 0.6733, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02928735874593258, "rewards/margins": 0.04170341044664383, "rewards/rejected": -0.012416050769388676, "step": 3400 }, { "epoch": 0.8898194190002617, "eval_logits/chosen": -2.850832462310791, "eval_logits/rejected": -2.823747396469116, "eval_logps/chosen": -280.9770202636719, "eval_logps/rejected": -262.9403381347656, "eval_loss": 0.6775196194648743, "eval_rewards/accuracies": 0.6825000047683716, "eval_rewards/chosen": 0.017964746803045273, "eval_rewards/margins": 0.03298423811793327, "eval_rewards/rejected": -0.01501949317753315, "eval_runtime": 623.1437, "eval_samples_per_second": 3.21, "eval_steps_per_second": 0.401, "step": 3400 }, { "epoch": 0.8924365349384977, "grad_norm": 1.40625, "learning_rate": 1.742492393945427e-08, "logits/chosen": -2.830068826675415, "logits/rejected": -2.787055015563965, "logps/chosen": -295.30035400390625, "logps/rejected": -255.3282470703125, "loss": 0.6776, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.01680312678217888, "rewards/margins": 0.032698854804039, "rewards/rejected": -0.015895728021860123, "step": 3410 }, { "epoch": 0.8950536508767338, "grad_norm": 1.46875, "learning_rate": 1.6596707569179302e-08, "logits/chosen": -2.8651199340820312, "logits/rejected": -2.8412108421325684, "logps/chosen": -294.6651916503906, "logps/rejected": -264.3734130859375, "loss": 0.6762, "rewards/accuracies": 0.6875, "rewards/chosen": 0.022434063255786896, "rewards/margins": 0.035665739327669144, "rewards/rejected": -0.013231677003204823, "step": 3420 }, { "epoch": 0.8976707668149699, "grad_norm": 1.265625, "learning_rate": 1.5787980306653848e-08, "logits/chosen": -2.8364651203155518, "logits/rejected": -2.7913882732391357, "logps/chosen": -288.8866271972656, "logps/rejected": -276.93865966796875, "loss": 0.6742, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.024098176509141922, "rewards/margins": 0.03978481888771057, "rewards/rejected": -0.01568664237856865, "step": 3430 }, { "epoch": 0.9002878827532059, "grad_norm": 1.203125, "learning_rate": 1.499880968037165e-08, "logits/chosen": -2.8334012031555176, "logits/rejected": -2.8106682300567627, "logps/chosen": -267.6552734375, "logps/rejected": -231.9657745361328, "loss": 0.6767, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.02195141650736332, "rewards/margins": 0.03444907069206238, "rewards/rejected": -0.012497651390731335, "step": 3440 }, { "epoch": 0.902904998691442, "grad_norm": 1.2109375, "learning_rate": 1.4229261585852803e-08, "logits/chosen": -2.8546454906463623, "logits/rejected": -2.8432843685150146, "logps/chosen": -280.5713806152344, "logps/rejected": -257.9490966796875, "loss": 0.6755, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.022586923092603683, "rewards/margins": 0.037085022777318954, "rewards/rejected": -0.01449810154736042, "step": 3450 }, { "epoch": 0.9055221146296781, "grad_norm": 1.328125, "learning_rate": 1.3479400280141883e-08, "logits/chosen": -2.823387384414673, "logits/rejected": -2.8125298023223877, "logps/chosen": -262.6939697265625, "logps/rejected": -266.7622985839844, "loss": 0.6774, "rewards/accuracies": 0.6875, "rewards/chosen": 0.018851932138204575, "rewards/margins": 0.03330928832292557, "rewards/rejected": -0.014457357116043568, "step": 3460 }, { "epoch": 0.9081392305679141, "grad_norm": 1.203125, "learning_rate": 1.2749288376442042e-08, "logits/chosen": -2.839914083480835, "logits/rejected": -2.8061881065368652, "logps/chosen": -314.83599853515625, "logps/rejected": -256.56256103515625, "loss": 0.6731, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.027168557047843933, "rewards/margins": 0.041992831975221634, "rewards/rejected": -0.01482427679002285, "step": 3470 }, { "epoch": 0.9107563465061502, "grad_norm": 1.3828125, "learning_rate": 1.2038986838887127e-08, "logits/chosen": -2.8704135417938232, "logits/rejected": -2.8492181301116943, "logps/chosen": -257.75701904296875, "logps/rejected": -257.2181091308594, "loss": 0.6845, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.006689242087304592, "rewards/margins": 0.019121108576655388, "rewards/rejected": -0.01243186742067337, "step": 3480 }, { "epoch": 0.9133734624443863, "grad_norm": 1.25, "learning_rate": 1.1348554977451131e-08, "logits/chosen": -2.879945993423462, "logits/rejected": -2.854792356491089, "logps/chosen": -299.6977233886719, "logps/rejected": -266.33148193359375, "loss": 0.6776, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.01960437372326851, "rewards/margins": 0.03308872506022453, "rewards/rejected": -0.013484349474310875, "step": 3490 }, { "epoch": 0.9159905783826223, "grad_norm": 1.2890625, "learning_rate": 1.06780504429958e-08, "logits/chosen": -2.860222578048706, "logits/rejected": -2.8316166400909424, "logps/chosen": -295.9383850097656, "logps/rejected": -253.4501495361328, "loss": 0.6739, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.025537196546792984, "rewards/margins": 0.040846120566129684, "rewards/rejected": -0.015308921225368977, "step": 3500 }, { "epoch": 0.9159905783826223, "eval_logits/chosen": -2.8575375080108643, "eval_logits/rejected": -2.8311452865600586, "eval_logps/chosen": -280.9686279296875, "eval_logps/rejected": -262.9413146972656, "eval_loss": 0.677466869354248, "eval_rewards/accuracies": 0.6850000023841858, "eval_rewards/chosen": 0.01804887317121029, "eval_rewards/margins": 0.033078454434871674, "eval_rewards/rejected": -0.015029575675725937, "eval_runtime": 623.0737, "eval_samples_per_second": 3.21, "eval_steps_per_second": 0.401, "step": 3500 }, { "epoch": 0.9186076943208584, "grad_norm": 2.078125, "learning_rate": 1.0027529222456754e-08, "logits/chosen": -2.80438232421875, "logits/rejected": -2.773073673248291, "logps/chosen": -268.601318359375, "logps/rejected": -252.92831420898438, "loss": 0.6728, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.02063935436308384, "rewards/margins": 0.0424073152244091, "rewards/rejected": -0.021767962723970413, "step": 3510 }, { "epoch": 0.9212248102590945, "grad_norm": 1.2734375, "learning_rate": 9.397045634168766e-09, "logits/chosen": -2.8710060119628906, "logits/rejected": -2.8575310707092285, "logps/chosen": -283.4559020996094, "logps/rejected": -289.69219970703125, "loss": 0.6729, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.031010348349809647, "rewards/margins": 0.04308422654867172, "rewards/rejected": -0.012073880061507225, "step": 3520 }, { "epoch": 0.9238419261973305, "grad_norm": 1.3125, "learning_rate": 8.78665232332998e-09, "logits/chosen": -2.8076834678649902, "logits/rejected": -2.7891430854797363, "logps/chosen": -245.83383178710938, "logps/rejected": -245.38418579101562, "loss": 0.6793, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.013311171904206276, "rewards/margins": 0.028882578015327454, "rewards/rejected": -0.015571406111121178, "step": 3530 }, { "epoch": 0.9264590421355666, "grad_norm": 1.21875, "learning_rate": 8.196400257606206e-09, "logits/chosen": -2.8521595001220703, "logits/rejected": -2.810176134109497, "logps/chosen": -298.8065490722656, "logps/rejected": -297.49627685546875, "loss": 0.6753, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.018893834203481674, "rewards/margins": 0.037972934544086456, "rewards/rejected": -0.019079100340604782, "step": 3540 }, { "epoch": 0.9290761580738026, "grad_norm": 1.421875, "learning_rate": 7.626338722875075e-09, "logits/chosen": -2.8442888259887695, "logits/rejected": -2.8555784225463867, "logps/chosen": -271.51593017578125, "logps/rejected": -271.29620361328125, "loss": 0.679, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.014854473061859608, "rewards/margins": 0.02983902022242546, "rewards/rejected": -0.014984548091888428, "step": 3550 }, { "epoch": 0.9316932740120387, "grad_norm": 1.3203125, "learning_rate": 7.0765153191106875e-09, "logits/chosen": -2.8553731441497803, "logits/rejected": -2.8395111560821533, "logps/chosen": -269.2403564453125, "logps/rejected": -229.14599609375, "loss": 0.6757, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.017869068309664726, "rewards/margins": 0.03671371936798096, "rewards/rejected": -0.01884464919567108, "step": 3560 }, { "epoch": 0.9343103899502748, "grad_norm": 1.0, "learning_rate": 6.54697595640899e-09, "logits/chosen": -2.8470935821533203, "logits/rejected": -2.827101945877075, "logps/chosen": -307.4560241699219, "logps/rejected": -287.3361511230469, "loss": 0.6755, "rewards/accuracies": 0.71875, "rewards/chosen": 0.024625394493341446, "rewards/margins": 0.037567656487226486, "rewards/rejected": -0.01294226385653019, "step": 3570 }, { "epoch": 0.9369275058885108, "grad_norm": 1.2578125, "learning_rate": 6.037764851154425e-09, "logits/chosen": -2.817866802215576, "logits/rejected": -2.8075547218322754, "logps/chosen": -280.808837890625, "logps/rejected": -287.5721740722656, "loss": 0.6757, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.019267046824097633, "rewards/margins": 0.03673623502254486, "rewards/rejected": -0.017469191923737526, "step": 3580 }, { "epoch": 0.9395446218267469, "grad_norm": 1.171875, "learning_rate": 5.548924522327747e-09, "logits/chosen": -2.839773178100586, "logits/rejected": -2.8247604370117188, "logps/chosen": -277.45343017578125, "logps/rejected": -264.47119140625, "loss": 0.6772, "rewards/accuracies": 0.6875, "rewards/chosen": 0.015755945816636086, "rewards/margins": 0.03340662270784378, "rewards/rejected": -0.017650676891207695, "step": 3590 }, { "epoch": 0.942161737764983, "grad_norm": 1.34375, "learning_rate": 5.080495787955691e-09, "logits/chosen": -2.8103365898132324, "logits/rejected": -2.794466257095337, "logps/chosen": -242.8784637451172, "logps/rejected": -246.01123046875, "loss": 0.6807, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.014323743060231209, "rewards/margins": 0.02595413103699684, "rewards/rejected": -0.011630385182797909, "step": 3600 }, { "epoch": 0.942161737764983, "eval_logits/chosen": -2.8526551723480225, "eval_logits/rejected": -2.825742483139038, "eval_logps/chosen": -280.9523620605469, "eval_logps/rejected": -262.9205017089844, "eval_loss": 0.6774939298629761, "eval_rewards/accuracies": 0.6855000257492065, "eval_rewards/chosen": 0.01821131445467472, "eval_rewards/margins": 0.03303277865052223, "eval_rewards/rejected": -0.014821460470557213, "eval_runtime": 622.8509, "eval_samples_per_second": 3.211, "eval_steps_per_second": 0.401, "step": 3600 }, { "epoch": 0.944778853703219, "grad_norm": 2.9375, "learning_rate": 4.632517761702814e-09, "logits/chosen": -2.7846920490264893, "logits/rejected": -2.756865978240967, "logps/chosen": -257.6873779296875, "logps/rejected": -247.35317993164062, "loss": 0.6763, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.009808182716369629, "rewards/margins": 0.03539792820811272, "rewards/rejected": -0.025589745491743088, "step": 3610 }, { "epoch": 0.9473959696414551, "grad_norm": 1.3515625, "learning_rate": 4.205027849605358e-09, "logits/chosen": -2.8126158714294434, "logits/rejected": -2.7998046875, "logps/chosen": -264.48272705078125, "logps/rejected": -232.7639617919922, "loss": 0.6783, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.010889967903494835, "rewards/margins": 0.031071290373802185, "rewards/rejected": -0.020181316882371902, "step": 3620 }, { "epoch": 0.9500130855796912, "grad_norm": 1.34375, "learning_rate": 3.798061746947995e-09, "logits/chosen": -2.8637187480926514, "logits/rejected": -2.8383963108062744, "logps/chosen": -279.2807922363281, "logps/rejected": -244.65072631835938, "loss": 0.6745, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01557912863790989, "rewards/margins": 0.03933199122548103, "rewards/rejected": -0.023752864450216293, "step": 3630 }, { "epoch": 0.9526302015179272, "grad_norm": 1.296875, "learning_rate": 3.411653435283157e-09, "logits/chosen": -2.8357815742492676, "logits/rejected": -2.8046395778656006, "logps/chosen": -287.30950927734375, "logps/rejected": -232.4551239013672, "loss": 0.6757, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.02017480693757534, "rewards/margins": 0.03672494366765022, "rewards/rejected": -0.016550134867429733, "step": 3640 }, { "epoch": 0.9552473174561633, "grad_norm": 1.390625, "learning_rate": 3.0458351795936698e-09, "logits/chosen": -2.8704733848571777, "logits/rejected": -2.8465213775634766, "logps/chosen": -264.4095764160156, "logps/rejected": -236.84811401367188, "loss": 0.6731, "rewards/accuracies": 0.71875, "rewards/chosen": 0.023878643289208412, "rewards/margins": 0.04207443445920944, "rewards/rejected": -0.01819578930735588, "step": 3650 }, { "epoch": 0.9578644333943994, "grad_norm": 1.5078125, "learning_rate": 2.700637525598598e-09, "logits/chosen": -2.8182005882263184, "logits/rejected": -2.823500156402588, "logps/chosen": -287.9983825683594, "logps/rejected": -288.73779296875, "loss": 0.681, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.018164271488785744, "rewards/margins": 0.02582050859928131, "rewards/rejected": -0.007656236179172993, "step": 3660 }, { "epoch": 0.9604815493326354, "grad_norm": 1.3515625, "learning_rate": 2.3760892972027324e-09, "logits/chosen": -2.886582612991333, "logits/rejected": -2.8624680042266846, "logps/chosen": -286.31439208984375, "logps/rejected": -253.89181518554688, "loss": 0.6793, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.013353118672966957, "rewards/margins": 0.029065540060400963, "rewards/rejected": -0.015712425112724304, "step": 3670 }, { "epoch": 0.9630986652708715, "grad_norm": 1.4765625, "learning_rate": 2.0722175940897645e-09, "logits/chosen": -2.8091981410980225, "logits/rejected": -2.827847719192505, "logps/chosen": -275.06439208984375, "logps/rejected": -267.3816833496094, "loss": 0.6769, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.014090280048549175, "rewards/margins": 0.03396814316511154, "rewards/rejected": -0.01987786404788494, "step": 3680 }, { "epoch": 0.9657157812091076, "grad_norm": 2.0, "learning_rate": 1.7890477894593748e-09, "logits/chosen": -2.8381717205047607, "logits/rejected": -2.8093409538269043, "logps/chosen": -335.9226379394531, "logps/rejected": -286.562744140625, "loss": 0.6684, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.035428646951913834, "rewards/margins": 0.05196043848991394, "rewards/rejected": -0.016531798988580704, "step": 3690 }, { "epoch": 0.9683328971473436, "grad_norm": 1.2890625, "learning_rate": 1.5266035279088708e-09, "logits/chosen": -2.7718958854675293, "logits/rejected": -2.762516498565674, "logps/chosen": -317.29156494140625, "logps/rejected": -293.4043884277344, "loss": 0.6731, "rewards/accuracies": 0.75, "rewards/chosen": 0.02600114978849888, "rewards/margins": 0.042024485766887665, "rewards/rejected": -0.016023332253098488, "step": 3700 }, { "epoch": 0.9683328971473436, "eval_logits/chosen": -2.850998640060425, "eval_logits/rejected": -2.8239121437072754, "eval_logps/chosen": -280.9513854980469, "eval_logps/rejected": -262.9112854003906, "eval_loss": 0.6775330901145935, "eval_rewards/accuracies": 0.6834999918937683, "eval_rewards/chosen": 0.018221192061901093, "eval_rewards/margins": 0.03295028209686279, "eval_rewards/rejected": -0.014729092828929424, "eval_runtime": 623.4667, "eval_samples_per_second": 3.208, "eval_steps_per_second": 0.401, "step": 3700 }, { "epoch": 0.9709500130855797, "grad_norm": 1.5390625, "learning_rate": 1.2849067234584621e-09, "logits/chosen": -2.7960381507873535, "logits/rejected": -2.789794921875, "logps/chosen": -251.1592559814453, "logps/rejected": -244.37112426757812, "loss": 0.6784, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.014368300326168537, "rewards/margins": 0.031433962285518646, "rewards/rejected": -0.017065661028027534, "step": 3710 }, { "epoch": 0.9735671290238157, "grad_norm": 1.65625, "learning_rate": 1.0639775577218625e-09, "logits/chosen": -2.7958970069885254, "logits/rejected": -2.7415878772735596, "logps/chosen": -266.13140869140625, "logps/rejected": -232.8394775390625, "loss": 0.677, "rewards/accuracies": 0.65625, "rewards/chosen": 0.015364277176558971, "rewards/margins": 0.03415878862142563, "rewards/rejected": -0.018794508650898933, "step": 3720 }, { "epoch": 0.9761842449620518, "grad_norm": 1.578125, "learning_rate": 8.638344782207485e-10, "logits/chosen": -2.802359104156494, "logits/rejected": -2.7990007400512695, "logps/chosen": -271.896240234375, "logps/rejected": -248.9223175048828, "loss": 0.6758, "rewards/accuracies": 0.71875, "rewards/chosen": 0.018408995121717453, "rewards/margins": 0.036677196621894836, "rewards/rejected": -0.018268201500177383, "step": 3730 }, { "epoch": 0.9788013609002879, "grad_norm": 1.5859375, "learning_rate": 6.844941968447149e-10, "logits/chosen": -2.8409347534179688, "logits/rejected": -2.8161139488220215, "logps/chosen": -288.1478576660156, "logps/rejected": -280.65985107421875, "loss": 0.6694, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.027417827397584915, "rewards/margins": 0.049938250333070755, "rewards/rejected": -0.02252042479813099, "step": 3740 }, { "epoch": 0.9814184768385239, "grad_norm": 1.25, "learning_rate": 5.25971688455612e-10, "logits/chosen": -2.8641200065612793, "logits/rejected": -2.8417608737945557, "logps/chosen": -288.54437255859375, "logps/rejected": -287.37908935546875, "loss": 0.6733, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.0248104277998209, "rewards/margins": 0.04141296073794365, "rewards/rejected": -0.016602538526058197, "step": 3750 }, { "epoch": 0.98403559277676, "grad_norm": 1.3515625, "learning_rate": 3.882801896372967e-10, "logits/chosen": -2.8650496006011963, "logits/rejected": -2.8601956367492676, "logps/chosen": -280.47113037109375, "logps/rejected": -251.7021942138672, "loss": 0.6775, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.017177898436784744, "rewards/margins": 0.03319885581731796, "rewards/rejected": -0.016020962968468666, "step": 3760 }, { "epoch": 0.9866527087149961, "grad_norm": 1.2578125, "learning_rate": 2.714311975902661e-10, "logits/chosen": -2.8119492530822754, "logits/rejected": -2.7749814987182617, "logps/chosen": -303.2042541503906, "logps/rejected": -277.75457763671875, "loss": 0.6761, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.020443648099899292, "rewards/margins": 0.03567901626229286, "rewards/rejected": -0.015235371887683868, "step": 3770 }, { "epoch": 0.9892698246532321, "grad_norm": 1.4453125, "learning_rate": 1.754344691717591e-10, "logits/chosen": -2.8344626426696777, "logits/rejected": -2.811007022857666, "logps/chosen": -266.02685546875, "logps/rejected": -288.6179504394531, "loss": 0.6835, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.019811708480119705, "rewards/margins": 0.020612578839063644, "rewards/rejected": -0.0008008688455447555, "step": 3780 }, { "epoch": 0.9918869405914682, "grad_norm": 1.4609375, "learning_rate": 1.0029802008096333e-10, "logits/chosen": -2.843888521194458, "logits/rejected": -2.801081418991089, "logps/chosen": -288.82562255859375, "logps/rejected": -270.85028076171875, "loss": 0.6735, "rewards/accuracies": 0.71875, "rewards/chosen": 0.020014481619000435, "rewards/margins": 0.04117124527692795, "rewards/rejected": -0.021156763657927513, "step": 3790 }, { "epoch": 0.9945040565297043, "grad_norm": 1.296875, "learning_rate": 4.602812418974533e-11, "logits/chosen": -2.866516351699829, "logits/rejected": -2.8406145572662354, "logps/chosen": -301.0005798339844, "logps/rejected": -279.62908935546875, "loss": 0.675, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.02747102454304695, "rewards/margins": 0.03847536817193031, "rewards/rejected": -0.011004343628883362, "step": 3800 }, { "epoch": 0.9945040565297043, "eval_logits/chosen": -2.850416660308838, "eval_logits/rejected": -2.8232762813568115, "eval_logps/chosen": -280.95458984375, "eval_logps/rejected": -262.90020751953125, "eval_loss": 0.6776041388511658, "eval_rewards/accuracies": 0.6855000257492065, "eval_rewards/chosen": 0.018189024180173874, "eval_rewards/margins": 0.03280767798423767, "eval_rewards/rejected": -0.014618655666708946, "eval_runtime": 622.8726, "eval_samples_per_second": 3.211, "eval_steps_per_second": 0.401, "step": 3800 }, { "epoch": 0.9971211724679403, "grad_norm": 1.2890625, "learning_rate": 1.2629313018819309e-11, "logits/chosen": -2.8219265937805176, "logits/rejected": -2.7998881340026855, "logps/chosen": -272.62823486328125, "logps/rejected": -255.8968048095703, "loss": 0.676, "rewards/accuracies": 0.65625, "rewards/chosen": 0.015002429485321045, "rewards/margins": 0.03602247312664986, "rewards/rejected": -0.021020041778683662, "step": 3810 }, { "epoch": 0.9997382884061764, "grad_norm": 3.515625, "learning_rate": 1.0437535929996855e-13, "logits/chosen": -2.8465044498443604, "logits/rejected": -2.825206995010376, "logps/chosen": -305.00775146484375, "logps/rejected": -262.2654724121094, "loss": 0.6739, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.018057797104120255, "rewards/margins": 0.04028897359967232, "rewards/rejected": -0.02223118022084236, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.680465580483626, "train_runtime": 64957.9706, "train_samples_per_second": 0.941, "train_steps_per_second": 0.059 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }